Diffstat (limited to 'arch/x86/kvm')
-rw-r--r--  arch/x86/kvm/Kconfig           |    3
-rw-r--r--  arch/x86/kvm/Makefile          |    3
-rw-r--r--  arch/x86/kvm/emulate.c         |  589
-rw-r--r--  arch/x86/kvm/i8254.c           |   41
-rw-r--r--  arch/x86/kvm/i8254.h           |    2
-rw-r--r--  arch/x86/kvm/i8259.c           |   87
-rw-r--r--  arch/x86/kvm/irq.h             |   10
-rw-r--r--  arch/x86/kvm/kvm_cache_regs.h  |   31
-rw-r--r--  arch/x86/kvm/lapic.c           |   52
-rw-r--r--  arch/x86/kvm/lapic.h           |    8
-rw-r--r--  arch/x86/kvm/mmu.c             |  152
-rw-r--r--  arch/x86/kvm/mmu.h             |   35
-rw-r--r--  arch/x86/kvm/paging_tmpl.h     |   36
-rw-r--r--  arch/x86/kvm/svm.c             |  660
-rw-r--r--  arch/x86/kvm/trace.h           |  224
-rw-r--r--  arch/x86/kvm/vmx.c             |  862
-rw-r--r--  arch/x86/kvm/x86.c             | 1680
-rw-r--r--  arch/x86/kvm/x86.h             |   30
18 files changed, 3159 insertions, 1346 deletions
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index b84e571f4175..970bbd479516 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -28,6 +28,8 @@ config KVM
28 select HAVE_KVM_IRQCHIP 28 select HAVE_KVM_IRQCHIP
29 select HAVE_KVM_EVENTFD 29 select HAVE_KVM_EVENTFD
30 select KVM_APIC_ARCHITECTURE 30 select KVM_APIC_ARCHITECTURE
31 select USER_RETURN_NOTIFIER
32 select KVM_MMIO
31 ---help--- 33 ---help---
32 Support hosting fully virtualized guest machines using hardware 34 Support hosting fully virtualized guest machines using hardware
33 virtualization extensions. You will need a fairly recent 35 virtualization extensions. You will need a fairly recent
@@ -64,6 +66,7 @@ config KVM_AMD
64 66
65# OK, it's a little counter-intuitive to do this, but it puts it neatly under 67# OK, it's a little counter-intuitive to do this, but it puts it neatly under
66# the virtualization menu. 68# the virtualization menu.
69source drivers/vhost/Kconfig
67source drivers/lguest/Kconfig 70source drivers/lguest/Kconfig
68source drivers/virtio/Kconfig 71source drivers/virtio/Kconfig
69 72
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 0e7fe78d0f74..31a7035c4bd9 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -6,7 +6,8 @@ CFLAGS_svm.o := -I.
6CFLAGS_vmx.o := -I. 6CFLAGS_vmx.o := -I.
7 7
8kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ 8kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
9 coalesced_mmio.o irq_comm.o eventfd.o) 9 coalesced_mmio.o irq_comm.o eventfd.o \
10 assigned-dev.o)
10kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o) 11kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o)
11 12
12kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ 13kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 1be5cd640e93..4dade6ac0827 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -32,7 +32,7 @@
32#include <linux/module.h> 32#include <linux/module.h>
33#include <asm/kvm_emulate.h> 33#include <asm/kvm_emulate.h>
34 34
35#include "mmu.h" /* for is_long_mode() */ 35#include "x86.h"
36 36
37/* 37/*
38 * Opcode effective-address decode tables. 38 * Opcode effective-address decode tables.
@@ -75,6 +75,10 @@
75#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ 75#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */
76#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ 76#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */
77#define GroupMask 0xff /* Group number stored in bits 0:7 */ 77#define GroupMask 0xff /* Group number stored in bits 0:7 */
78/* Misc flags */
79#define Lock (1<<26) /* lock prefix is allowed for the instruction */
80#define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */
81#define No64 (1<<28)
78/* Source 2 operand type */ 82/* Source 2 operand type */
79#define Src2None (0<<29) 83#define Src2None (0<<29)
80#define Src2CL (1<<29) 84#define Src2CL (1<<29)
@@ -86,35 +90,40 @@
86enum { 90enum {
87 Group1_80, Group1_81, Group1_82, Group1_83, 91 Group1_80, Group1_81, Group1_82, Group1_83,
88 Group1A, Group3_Byte, Group3, Group4, Group5, Group7, 92 Group1A, Group3_Byte, Group3, Group4, Group5, Group7,
93 Group8, Group9,
89}; 94};
90 95
91static u32 opcode_table[256] = { 96static u32 opcode_table[256] = {
92 /* 0x00 - 0x07 */ 97 /* 0x00 - 0x07 */
93 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 98 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
94 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 99 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
95 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0, 100 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
101 ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
96 /* 0x08 - 0x0F */ 102 /* 0x08 - 0x0F */
97 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 103 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
98 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 104 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
99 0, 0, 0, 0, 105 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
106 ImplicitOps | Stack | No64, 0,
100 /* 0x10 - 0x17 */ 107 /* 0x10 - 0x17 */
101 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 108 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
102 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 109 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
103 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0, 110 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
111 ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
104 /* 0x18 - 0x1F */ 112 /* 0x18 - 0x1F */
105 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 113 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
106 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 114 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
107 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0, 115 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
116 ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
108 /* 0x20 - 0x27 */ 117 /* 0x20 - 0x27 */
109 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 118 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
110 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 119 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
111 DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0, 120 DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
112 /* 0x28 - 0x2F */ 121 /* 0x28 - 0x2F */
113 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 122 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
114 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 123 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
115 0, 0, 0, 0, 124 0, 0, 0, 0,
116 /* 0x30 - 0x37 */ 125 /* 0x30 - 0x37 */
117 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 126 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
118 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 127 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
119 0, 0, 0, 0, 128 0, 0, 0, 0,
120 /* 0x38 - 0x3F */ 129 /* 0x38 - 0x3F */
@@ -133,7 +142,8 @@ static u32 opcode_table[256] = {
133 DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack, 142 DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
134 DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack, 143 DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
135 /* 0x60 - 0x67 */ 144 /* 0x60 - 0x67 */
136 0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ , 145 ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
146 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ ,
137 0, 0, 0, 0, 147 0, 0, 0, 0,
138 /* 0x68 - 0x6F */ 148 /* 0x68 - 0x6F */
139 SrcImm | Mov | Stack, 0, SrcImmByte | Mov | Stack, 0, 149 SrcImm | Mov | Stack, 0, SrcImmByte | Mov | Stack, 0,
@@ -149,7 +159,7 @@ static u32 opcode_table[256] = {
149 Group | Group1_80, Group | Group1_81, 159 Group | Group1_80, Group | Group1_81,
150 Group | Group1_82, Group | Group1_83, 160 Group | Group1_82, Group | Group1_83,
151 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 161 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
152 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 162 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
153 /* 0x88 - 0x8F */ 163 /* 0x88 - 0x8F */
154 ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, 164 ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
155 ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, 165 ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
@@ -158,7 +168,7 @@ static u32 opcode_table[256] = {
158 /* 0x90 - 0x97 */ 168 /* 0x90 - 0x97 */
159 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, 169 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
160 /* 0x98 - 0x9F */ 170 /* 0x98 - 0x9F */
161 0, 0, SrcImm | Src2Imm16, 0, 171 0, 0, SrcImm | Src2Imm16 | No64, 0,
162 ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, 172 ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
163 /* 0xA0 - 0xA7 */ 173 /* 0xA0 - 0xA7 */
164 ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs, 174 ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs,
@@ -185,7 +195,7 @@ static u32 opcode_table[256] = {
185 ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov, 195 ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov,
186 /* 0xC8 - 0xCF */ 196 /* 0xC8 - 0xCF */
187 0, 0, 0, ImplicitOps | Stack, 197 0, 0, 0, ImplicitOps | Stack,
188 ImplicitOps, SrcImmByte, ImplicitOps, ImplicitOps, 198 ImplicitOps, SrcImmByte, ImplicitOps | No64, ImplicitOps,
189 /* 0xD0 - 0xD7 */ 199 /* 0xD0 - 0xD7 */
190 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, 200 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
191 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, 201 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
@@ -198,12 +208,12 @@ static u32 opcode_table[256] = {
198 ByteOp | SrcImmUByte, SrcImmUByte, 208 ByteOp | SrcImmUByte, SrcImmUByte,
199 /* 0xE8 - 0xEF */ 209 /* 0xE8 - 0xEF */
200 SrcImm | Stack, SrcImm | ImplicitOps, 210 SrcImm | Stack, SrcImm | ImplicitOps,
201 SrcImmU | Src2Imm16, SrcImmByte | ImplicitOps, 211 SrcImmU | Src2Imm16 | No64, SrcImmByte | ImplicitOps,
202 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, 212 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
203 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, 213 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
204 /* 0xF0 - 0xF7 */ 214 /* 0xF0 - 0xF7 */
205 0, 0, 0, 0, 215 0, 0, 0, 0,
206 ImplicitOps, ImplicitOps, Group | Group3_Byte, Group | Group3, 216 ImplicitOps | Priv, ImplicitOps, Group | Group3_Byte, Group | Group3,
207 /* 0xF8 - 0xFF */ 217 /* 0xF8 - 0xFF */
208 ImplicitOps, 0, ImplicitOps, ImplicitOps, 218 ImplicitOps, 0, ImplicitOps, ImplicitOps,
209 ImplicitOps, ImplicitOps, Group | Group4, Group | Group5, 219 ImplicitOps, ImplicitOps, Group | Group4, Group | Group5,
@@ -211,16 +221,20 @@ static u32 opcode_table[256] = {
211 221
212static u32 twobyte_table[256] = { 222static u32 twobyte_table[256] = {
213 /* 0x00 - 0x0F */ 223 /* 0x00 - 0x0F */
214 0, Group | GroupDual | Group7, 0, 0, 0, ImplicitOps, ImplicitOps, 0, 224 0, Group | GroupDual | Group7, 0, 0,
215 ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 225 0, ImplicitOps, ImplicitOps | Priv, 0,
226 ImplicitOps | Priv, ImplicitOps | Priv, 0, 0,
227 0, ImplicitOps | ModRM, 0, 0,
216 /* 0x10 - 0x1F */ 228 /* 0x10 - 0x1F */
217 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0, 229 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0,
218 /* 0x20 - 0x2F */ 230 /* 0x20 - 0x2F */
219 ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0, 231 ModRM | ImplicitOps | Priv, ModRM | Priv,
232 ModRM | ImplicitOps | Priv, ModRM | Priv,
233 0, 0, 0, 0,
220 0, 0, 0, 0, 0, 0, 0, 0, 234 0, 0, 0, 0, 0, 0, 0, 0,
221 /* 0x30 - 0x3F */ 235 /* 0x30 - 0x3F */
222 ImplicitOps, 0, ImplicitOps, 0, 236 ImplicitOps | Priv, 0, ImplicitOps | Priv, 0,
223 ImplicitOps, ImplicitOps, 0, 0, 237 ImplicitOps, ImplicitOps | Priv, 0, 0,
224 0, 0, 0, 0, 0, 0, 0, 0, 238 0, 0, 0, 0, 0, 0, 0, 0,
225 /* 0x40 - 0x47 */ 239 /* 0x40 - 0x47 */
226 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, 240 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
@@ -244,25 +258,29 @@ static u32 twobyte_table[256] = {
244 /* 0x90 - 0x9F */ 258 /* 0x90 - 0x9F */
245 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 259 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
246 /* 0xA0 - 0xA7 */ 260 /* 0xA0 - 0xA7 */
247 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 261 ImplicitOps | Stack, ImplicitOps | Stack,
262 0, DstMem | SrcReg | ModRM | BitOp,
248 DstMem | SrcReg | Src2ImmByte | ModRM, 263 DstMem | SrcReg | Src2ImmByte | ModRM,
249 DstMem | SrcReg | Src2CL | ModRM, 0, 0, 264 DstMem | SrcReg | Src2CL | ModRM, 0, 0,
250 /* 0xA8 - 0xAF */ 265 /* 0xA8 - 0xAF */
251 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 266 ImplicitOps | Stack, ImplicitOps | Stack,
267 0, DstMem | SrcReg | ModRM | BitOp | Lock,
252 DstMem | SrcReg | Src2ImmByte | ModRM, 268 DstMem | SrcReg | Src2ImmByte | ModRM,
253 DstMem | SrcReg | Src2CL | ModRM, 269 DstMem | SrcReg | Src2CL | ModRM,
254 ModRM, 0, 270 ModRM, 0,
255 /* 0xB0 - 0xB7 */ 271 /* 0xB0 - 0xB7 */
256 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0, 272 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
257 DstMem | SrcReg | ModRM | BitOp, 273 0, DstMem | SrcReg | ModRM | BitOp | Lock,
258 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, 274 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
259 DstReg | SrcMem16 | ModRM | Mov, 275 DstReg | SrcMem16 | ModRM | Mov,
260 /* 0xB8 - 0xBF */ 276 /* 0xB8 - 0xBF */
261 0, 0, DstMem | SrcImmByte | ModRM, DstMem | SrcReg | ModRM | BitOp, 277 0, 0,
278 Group | Group8, DstMem | SrcReg | ModRM | BitOp | Lock,
262 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, 279 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
263 DstReg | SrcMem16 | ModRM | Mov, 280 DstReg | SrcMem16 | ModRM | Mov,
264 /* 0xC0 - 0xCF */ 281 /* 0xC0 - 0xCF */
265 0, 0, 0, DstMem | SrcReg | ModRM | Mov, 0, 0, 0, ImplicitOps | ModRM, 282 0, 0, 0, DstMem | SrcReg | ModRM | Mov,
283 0, 0, 0, Group | GroupDual | Group9,
266 0, 0, 0, 0, 0, 0, 0, 0, 284 0, 0, 0, 0, 0, 0, 0, 0,
267 /* 0xD0 - 0xDF */ 285 /* 0xD0 - 0xDF */
268 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 286 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@@ -274,25 +292,41 @@ static u32 twobyte_table[256] = {
274 292
275static u32 group_table[] = { 293static u32 group_table[] = {
276 [Group1_80*8] = 294 [Group1_80*8] =
277 ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, 295 ByteOp | DstMem | SrcImm | ModRM | Lock,
278 ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, 296 ByteOp | DstMem | SrcImm | ModRM | Lock,
279 ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, 297 ByteOp | DstMem | SrcImm | ModRM | Lock,
280 ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, 298 ByteOp | DstMem | SrcImm | ModRM | Lock,
299 ByteOp | DstMem | SrcImm | ModRM | Lock,
300 ByteOp | DstMem | SrcImm | ModRM | Lock,
301 ByteOp | DstMem | SrcImm | ModRM | Lock,
302 ByteOp | DstMem | SrcImm | ModRM,
281 [Group1_81*8] = 303 [Group1_81*8] =
282 DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, 304 DstMem | SrcImm | ModRM | Lock,
283 DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, 305 DstMem | SrcImm | ModRM | Lock,
284 DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, 306 DstMem | SrcImm | ModRM | Lock,
285 DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, 307 DstMem | SrcImm | ModRM | Lock,
308 DstMem | SrcImm | ModRM | Lock,
309 DstMem | SrcImm | ModRM | Lock,
310 DstMem | SrcImm | ModRM | Lock,
311 DstMem | SrcImm | ModRM,
286 [Group1_82*8] = 312 [Group1_82*8] =
287 ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, 313 ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
288 ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, 314 ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
289 ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, 315 ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
290 ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, 316 ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
317 ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
318 ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
319 ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
320 ByteOp | DstMem | SrcImm | ModRM | No64,
291 [Group1_83*8] = 321 [Group1_83*8] =
292 DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, 322 DstMem | SrcImmByte | ModRM | Lock,
293 DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, 323 DstMem | SrcImmByte | ModRM | Lock,
294 DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, 324 DstMem | SrcImmByte | ModRM | Lock,
295 DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, 325 DstMem | SrcImmByte | ModRM | Lock,
326 DstMem | SrcImmByte | ModRM | Lock,
327 DstMem | SrcImmByte | ModRM | Lock,
328 DstMem | SrcImmByte | ModRM | Lock,
329 DstMem | SrcImmByte | ModRM,
296 [Group1A*8] = 330 [Group1A*8] =
297 DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0, 331 DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0,
298 [Group3_Byte*8] = 332 [Group3_Byte*8] =
@@ -311,24 +345,39 @@ static u32 group_table[] = {
311 SrcMem | ModRM | Stack, 0, 345 SrcMem | ModRM | Stack, 0,
312 SrcMem | ModRM | Stack, 0, SrcMem | ModRM | Stack, 0, 346 SrcMem | ModRM | Stack, 0, SrcMem | ModRM | Stack, 0,
313 [Group7*8] = 347 [Group7*8] =
314 0, 0, ModRM | SrcMem, ModRM | SrcMem, 348 0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv,
315 SrcNone | ModRM | DstMem | Mov, 0, 349 SrcNone | ModRM | DstMem | Mov, 0,
316 SrcMem16 | ModRM | Mov, SrcMem | ModRM | ByteOp, 350 SrcMem16 | ModRM | Mov | Priv, SrcMem | ModRM | ByteOp | Priv,
351 [Group8*8] =
352 0, 0, 0, 0,
353 DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM | Lock,
354 DstMem | SrcImmByte | ModRM | Lock, DstMem | SrcImmByte | ModRM | Lock,
355 [Group9*8] =
356 0, ImplicitOps | ModRM | Lock, 0, 0, 0, 0, 0, 0,
317}; 357};
318 358
319static u32 group2_table[] = { 359static u32 group2_table[] = {
320 [Group7*8] = 360 [Group7*8] =
321 SrcNone | ModRM, 0, 0, SrcNone | ModRM, 361 SrcNone | ModRM | Priv, 0, 0, SrcNone | ModRM,
322 SrcNone | ModRM | DstMem | Mov, 0, 362 SrcNone | ModRM | DstMem | Mov, 0,
323 SrcMem16 | ModRM | Mov, 0, 363 SrcMem16 | ModRM | Mov, 0,
364 [Group9*8] =
365 0, 0, 0, 0, 0, 0, 0, 0,
324}; 366};
325 367
326/* EFLAGS bit definitions. */ 368/* EFLAGS bit definitions. */
369#define EFLG_ID (1<<21)
370#define EFLG_VIP (1<<20)
371#define EFLG_VIF (1<<19)
372#define EFLG_AC (1<<18)
327#define EFLG_VM (1<<17) 373#define EFLG_VM (1<<17)
328#define EFLG_RF (1<<16) 374#define EFLG_RF (1<<16)
375#define EFLG_IOPL (3<<12)
376#define EFLG_NT (1<<14)
329#define EFLG_OF (1<<11) 377#define EFLG_OF (1<<11)
330#define EFLG_DF (1<<10) 378#define EFLG_DF (1<<10)
331#define EFLG_IF (1<<9) 379#define EFLG_IF (1<<9)
380#define EFLG_TF (1<<8)
332#define EFLG_SF (1<<7) 381#define EFLG_SF (1<<7)
333#define EFLG_ZF (1<<6) 382#define EFLG_ZF (1<<6)
334#define EFLG_AF (1<<4) 383#define EFLG_AF (1<<4)
@@ -597,7 +646,7 @@ static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
597 646
598 if (linear < fc->start || linear >= fc->end) { 647 if (linear < fc->start || linear >= fc->end) {
599 size = min(15UL, PAGE_SIZE - offset_in_page(linear)); 648 size = min(15UL, PAGE_SIZE - offset_in_page(linear));
600 rc = ops->read_std(linear, fc->data, size, ctxt->vcpu); 649 rc = ops->fetch(linear, fc->data, size, ctxt->vcpu, NULL);
601 if (rc) 650 if (rc)
602 return rc; 651 return rc;
603 fc->start = linear; 652 fc->start = linear;
@@ -613,6 +662,9 @@ static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
613{ 662{
614 int rc = 0; 663 int rc = 0;
615 664
665 /* x86 instructions are limited to 15 bytes. */
666 if (eip + size - ctxt->decode.eip_orig > 15)
667 return X86EMUL_UNHANDLEABLE;
616 eip += ctxt->cs_base; 668 eip += ctxt->cs_base;
617 while (size--) { 669 while (size--) {
618 rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++); 670 rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++);
@@ -649,11 +701,11 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt,
649 op_bytes = 3; 701 op_bytes = 3;
650 *address = 0; 702 *address = 0;
651 rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2, 703 rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2,
652 ctxt->vcpu); 704 ctxt->vcpu, NULL);
653 if (rc) 705 if (rc)
654 return rc; 706 return rc;
655 rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes, 707 rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes,
656 ctxt->vcpu); 708 ctxt->vcpu, NULL);
657 return rc; 709 return rc;
658} 710}
659 711
@@ -871,12 +923,13 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
871 /* Shadow copy of register state. Committed on successful emulation. */ 923 /* Shadow copy of register state. Committed on successful emulation. */
872 924
873 memset(c, 0, sizeof(struct decode_cache)); 925 memset(c, 0, sizeof(struct decode_cache));
874 c->eip = kvm_rip_read(ctxt->vcpu); 926 c->eip = c->eip_orig = kvm_rip_read(ctxt->vcpu);
875 ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS); 927 ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS);
876 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); 928 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
877 929
878 switch (mode) { 930 switch (mode) {
879 case X86EMUL_MODE_REAL: 931 case X86EMUL_MODE_REAL:
932 case X86EMUL_MODE_VM86:
880 case X86EMUL_MODE_PROT16: 933 case X86EMUL_MODE_PROT16:
881 def_op_bytes = def_ad_bytes = 2; 934 def_op_bytes = def_ad_bytes = 2;
882 break; 935 break;
@@ -962,6 +1015,11 @@ done_prefixes:
962 } 1015 }
963 } 1016 }
964 1017
1018 if (mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
1019 kvm_report_emulation_failure(ctxt->vcpu, "invalid x86/64 instruction");
1020 return -1;
1021 }
1022
965 if (c->d & Group) { 1023 if (c->d & Group) {
966 group = c->d & GroupMask; 1024 group = c->d & GroupMask;
967 c->modrm = insn_fetch(u8, 1, c->eip); 1025 c->modrm = insn_fetch(u8, 1, c->eip);
@@ -1179,13 +1237,119 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt,
1179 rc = ops->read_emulated(register_address(c, ss_base(ctxt), 1237 rc = ops->read_emulated(register_address(c, ss_base(ctxt),
1180 c->regs[VCPU_REGS_RSP]), 1238 c->regs[VCPU_REGS_RSP]),
1181 dest, len, ctxt->vcpu); 1239 dest, len, ctxt->vcpu);
1182 if (rc != 0) 1240 if (rc != X86EMUL_CONTINUE)
1183 return rc; 1241 return rc;
1184 1242
1185 register_address_increment(c, &c->regs[VCPU_REGS_RSP], len); 1243 register_address_increment(c, &c->regs[VCPU_REGS_RSP], len);
1186 return rc; 1244 return rc;
1187} 1245}
1188 1246
1247static int emulate_popf(struct x86_emulate_ctxt *ctxt,
1248 struct x86_emulate_ops *ops,
1249 void *dest, int len)
1250{
1251 int rc;
1252 unsigned long val, change_mask;
1253 int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
1254 int cpl = kvm_x86_ops->get_cpl(ctxt->vcpu);
1255
1256 rc = emulate_pop(ctxt, ops, &val, len);
1257 if (rc != X86EMUL_CONTINUE)
1258 return rc;
1259
1260 change_mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_OF
1261 | EFLG_TF | EFLG_DF | EFLG_NT | EFLG_RF | EFLG_AC | EFLG_ID;
1262
1263 switch(ctxt->mode) {
1264 case X86EMUL_MODE_PROT64:
1265 case X86EMUL_MODE_PROT32:
1266 case X86EMUL_MODE_PROT16:
1267 if (cpl == 0)
1268 change_mask |= EFLG_IOPL;
1269 if (cpl <= iopl)
1270 change_mask |= EFLG_IF;
1271 break;
1272 case X86EMUL_MODE_VM86:
1273 if (iopl < 3) {
1274 kvm_inject_gp(ctxt->vcpu, 0);
1275 return X86EMUL_PROPAGATE_FAULT;
1276 }
1277 change_mask |= EFLG_IF;
1278 break;
1279 default: /* real mode */
1280 change_mask |= (EFLG_IOPL | EFLG_IF);
1281 break;
1282 }
1283
1284 *(unsigned long *)dest =
1285 (ctxt->eflags & ~change_mask) | (val & change_mask);
1286
1287 return rc;
1288}
1289
1290static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt, int seg)
1291{
1292 struct decode_cache *c = &ctxt->decode;
1293 struct kvm_segment segment;
1294
1295 kvm_x86_ops->get_segment(ctxt->vcpu, &segment, seg);
1296
1297 c->src.val = segment.selector;
1298 emulate_push(ctxt);
1299}
1300
1301static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
1302 struct x86_emulate_ops *ops, int seg)
1303{
1304 struct decode_cache *c = &ctxt->decode;
1305 unsigned long selector;
1306 int rc;
1307
1308 rc = emulate_pop(ctxt, ops, &selector, c->op_bytes);
1309 if (rc != 0)
1310 return rc;
1311
1312 rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)selector, seg);
1313 return rc;
1314}
1315
1316static void emulate_pusha(struct x86_emulate_ctxt *ctxt)
1317{
1318 struct decode_cache *c = &ctxt->decode;
1319 unsigned long old_esp = c->regs[VCPU_REGS_RSP];
1320 int reg = VCPU_REGS_RAX;
1321
1322 while (reg <= VCPU_REGS_RDI) {
1323 (reg == VCPU_REGS_RSP) ?
1324 (c->src.val = old_esp) : (c->src.val = c->regs[reg]);
1325
1326 emulate_push(ctxt);
1327 ++reg;
1328 }
1329}
1330
1331static int emulate_popa(struct x86_emulate_ctxt *ctxt,
1332 struct x86_emulate_ops *ops)
1333{
1334 struct decode_cache *c = &ctxt->decode;
1335 int rc = 0;
1336 int reg = VCPU_REGS_RDI;
1337
1338 while (reg >= VCPU_REGS_RAX) {
1339 if (reg == VCPU_REGS_RSP) {
1340 register_address_increment(c, &c->regs[VCPU_REGS_RSP],
1341 c->op_bytes);
1342 --reg;
1343 }
1344
1345 rc = emulate_pop(ctxt, ops, &c->regs[reg], c->op_bytes);
1346 if (rc != 0)
1347 break;
1348 --reg;
1349 }
1350 return rc;
1351}
1352
1189static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt, 1353static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
1190 struct x86_emulate_ops *ops) 1354 struct x86_emulate_ops *ops)
1191{ 1355{
@@ -1290,7 +1454,7 @@ static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,
1290 int rc; 1454 int rc;
1291 1455
1292 rc = ops->read_emulated(memop, &old, 8, ctxt->vcpu); 1456 rc = ops->read_emulated(memop, &old, 8, ctxt->vcpu);
1293 if (rc != 0) 1457 if (rc != X86EMUL_CONTINUE)
1294 return rc; 1458 return rc;
1295 1459
1296 if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) || 1460 if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) ||
@@ -1305,7 +1469,7 @@ static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,
1305 (u32) c->regs[VCPU_REGS_RBX]; 1469 (u32) c->regs[VCPU_REGS_RBX];
1306 1470
1307 rc = ops->cmpxchg_emulated(memop, &old, &new, 8, ctxt->vcpu); 1471 rc = ops->cmpxchg_emulated(memop, &old, &new, 8, ctxt->vcpu);
1308 if (rc != 0) 1472 if (rc != X86EMUL_CONTINUE)
1309 return rc; 1473 return rc;
1310 ctxt->eflags |= EFLG_ZF; 1474 ctxt->eflags |= EFLG_ZF;
1311 } 1475 }
@@ -1327,7 +1491,7 @@ static int emulate_ret_far(struct x86_emulate_ctxt *ctxt,
1327 rc = emulate_pop(ctxt, ops, &cs, c->op_bytes); 1491 rc = emulate_pop(ctxt, ops, &cs, c->op_bytes);
1328 if (rc) 1492 if (rc)
1329 return rc; 1493 return rc;
1330 rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)cs, 1, VCPU_SREG_CS); 1494 rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)cs, VCPU_SREG_CS);
1331 return rc; 1495 return rc;
1332} 1496}
1333 1497
@@ -1371,7 +1535,7 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
1371 &c->dst.val, 1535 &c->dst.val,
1372 c->dst.bytes, 1536 c->dst.bytes,
1373 ctxt->vcpu); 1537 ctxt->vcpu);
1374 if (rc != 0) 1538 if (rc != X86EMUL_CONTINUE)
1375 return rc; 1539 return rc;
1376 break; 1540 break;
1377 case OP_NONE: 1541 case OP_NONE:
@@ -1434,9 +1598,8 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt)
1434 u64 msr_data; 1598 u64 msr_data;
1435 1599
1436 /* syscall is not available in real mode */ 1600 /* syscall is not available in real mode */
1437 if (c->lock_prefix || ctxt->mode == X86EMUL_MODE_REAL 1601 if (ctxt->mode == X86EMUL_MODE_REAL || ctxt->mode == X86EMUL_MODE_VM86)
1438 || !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) 1602 return X86EMUL_UNHANDLEABLE;
1439 return -1;
1440 1603
1441 setup_syscalls_segments(ctxt, &cs, &ss); 1604 setup_syscalls_segments(ctxt, &cs, &ss);
1442 1605
@@ -1473,7 +1636,7 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt)
1473 ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); 1636 ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
1474 } 1637 }
1475 1638
1476 return 0; 1639 return X86EMUL_CONTINUE;
1477} 1640}
1478 1641
1479static int 1642static int
@@ -1483,22 +1646,17 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt)
1483 struct kvm_segment cs, ss; 1646 struct kvm_segment cs, ss;
1484 u64 msr_data; 1647 u64 msr_data;
1485 1648
1486 /* inject #UD if LOCK prefix is used */ 1649 /* inject #GP if in real mode */
1487 if (c->lock_prefix) 1650 if (ctxt->mode == X86EMUL_MODE_REAL) {
1488 return -1;
1489
1490 /* inject #GP if in real mode or paging is disabled */
1491 if (ctxt->mode == X86EMUL_MODE_REAL ||
1492 !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) {
1493 kvm_inject_gp(ctxt->vcpu, 0); 1651 kvm_inject_gp(ctxt->vcpu, 0);
1494 return -1; 1652 return X86EMUL_UNHANDLEABLE;
1495 } 1653 }
1496 1654
1497 /* XXX sysenter/sysexit have not been tested in 64bit mode. 1655 /* XXX sysenter/sysexit have not been tested in 64bit mode.
1498 * Therefore, we inject an #UD. 1656 * Therefore, we inject an #UD.
1499 */ 1657 */
1500 if (ctxt->mode == X86EMUL_MODE_PROT64) 1658 if (ctxt->mode == X86EMUL_MODE_PROT64)
1501 return -1; 1659 return X86EMUL_UNHANDLEABLE;
1502 1660
1503 setup_syscalls_segments(ctxt, &cs, &ss); 1661 setup_syscalls_segments(ctxt, &cs, &ss);
1504 1662
@@ -1507,13 +1665,13 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt)
1507 case X86EMUL_MODE_PROT32: 1665 case X86EMUL_MODE_PROT32:
1508 if ((msr_data & 0xfffc) == 0x0) { 1666 if ((msr_data & 0xfffc) == 0x0) {
1509 kvm_inject_gp(ctxt->vcpu, 0); 1667 kvm_inject_gp(ctxt->vcpu, 0);
1510 return -1; 1668 return X86EMUL_PROPAGATE_FAULT;
1511 } 1669 }
1512 break; 1670 break;
1513 case X86EMUL_MODE_PROT64: 1671 case X86EMUL_MODE_PROT64:
1514 if (msr_data == 0x0) { 1672 if (msr_data == 0x0) {
1515 kvm_inject_gp(ctxt->vcpu, 0); 1673 kvm_inject_gp(ctxt->vcpu, 0);
1516 return -1; 1674 return X86EMUL_PROPAGATE_FAULT;
1517 } 1675 }
1518 break; 1676 break;
1519 } 1677 }
@@ -1538,7 +1696,7 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt)
1538 kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data); 1696 kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data);
1539 c->regs[VCPU_REGS_RSP] = msr_data; 1697 c->regs[VCPU_REGS_RSP] = msr_data;
1540 1698
1541 return 0; 1699 return X86EMUL_CONTINUE;
1542} 1700}
1543 1701
1544static int 1702static int
@@ -1549,21 +1707,11 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt)
1549 u64 msr_data; 1707 u64 msr_data;
1550 int usermode; 1708 int usermode;
1551 1709
1552 /* inject #UD if LOCK prefix is used */ 1710 /* inject #GP if in real mode or Virtual 8086 mode */
1553 if (c->lock_prefix) 1711 if (ctxt->mode == X86EMUL_MODE_REAL ||
1554 return -1; 1712 ctxt->mode == X86EMUL_MODE_VM86) {
1555
1556 /* inject #GP if in real mode or paging is disabled */
1557 if (ctxt->mode == X86EMUL_MODE_REAL
1558 || !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) {
1559 kvm_inject_gp(ctxt->vcpu, 0);
1560 return -1;
1561 }
1562
1563 /* sysexit must be called from CPL 0 */
1564 if (kvm_x86_ops->get_cpl(ctxt->vcpu) != 0) {
1565 kvm_inject_gp(ctxt->vcpu, 0); 1713 kvm_inject_gp(ctxt->vcpu, 0);
1566 return -1; 1714 return X86EMUL_UNHANDLEABLE;
1567 } 1715 }
1568 1716
1569 setup_syscalls_segments(ctxt, &cs, &ss); 1717 setup_syscalls_segments(ctxt, &cs, &ss);
@@ -1581,7 +1729,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt)
1581 cs.selector = (u16)(msr_data + 16); 1729 cs.selector = (u16)(msr_data + 16);
1582 if ((msr_data & 0xfffc) == 0x0) { 1730 if ((msr_data & 0xfffc) == 0x0) {
1583 kvm_inject_gp(ctxt->vcpu, 0); 1731 kvm_inject_gp(ctxt->vcpu, 0);
1584 return -1; 1732 return X86EMUL_PROPAGATE_FAULT;
1585 } 1733 }
1586 ss.selector = (u16)(msr_data + 24); 1734 ss.selector = (u16)(msr_data + 24);
1587 break; 1735 break;
@@ -1589,7 +1737,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt)
1589 cs.selector = (u16)(msr_data + 32); 1737 cs.selector = (u16)(msr_data + 32);
1590 if (msr_data == 0x0) { 1738 if (msr_data == 0x0) {
1591 kvm_inject_gp(ctxt->vcpu, 0); 1739 kvm_inject_gp(ctxt->vcpu, 0);
1592 return -1; 1740 return X86EMUL_PROPAGATE_FAULT;
1593 } 1741 }
1594 ss.selector = cs.selector + 8; 1742 ss.selector = cs.selector + 8;
1595 cs.db = 0; 1743 cs.db = 0;
@@ -1605,7 +1753,58 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt)
1605 c->eip = ctxt->vcpu->arch.regs[VCPU_REGS_RDX]; 1753 c->eip = ctxt->vcpu->arch.regs[VCPU_REGS_RDX];
1606 c->regs[VCPU_REGS_RSP] = ctxt->vcpu->arch.regs[VCPU_REGS_RCX]; 1754 c->regs[VCPU_REGS_RSP] = ctxt->vcpu->arch.regs[VCPU_REGS_RCX];
1607 1755
1608 return 0; 1756 return X86EMUL_CONTINUE;
1757}
1758
1759static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt)
1760{
1761 int iopl;
1762 if (ctxt->mode == X86EMUL_MODE_REAL)
1763 return false;
1764 if (ctxt->mode == X86EMUL_MODE_VM86)
1765 return true;
1766 iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
1767 return kvm_x86_ops->get_cpl(ctxt->vcpu) > iopl;
1768}
1769
1770static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
1771 struct x86_emulate_ops *ops,
1772 u16 port, u16 len)
1773{
1774 struct kvm_segment tr_seg;
1775 int r;
1776 u16 io_bitmap_ptr;
1777 u8 perm, bit_idx = port & 0x7;
1778 unsigned mask = (1 << len) - 1;
1779
1780 kvm_get_segment(ctxt->vcpu, &tr_seg, VCPU_SREG_TR);
1781 if (tr_seg.unusable)
1782 return false;
1783 if (tr_seg.limit < 103)
1784 return false;
1785 r = ops->read_std(tr_seg.base + 102, &io_bitmap_ptr, 2, ctxt->vcpu,
1786 NULL);
1787 if (r != X86EMUL_CONTINUE)
1788 return false;
1789 if (io_bitmap_ptr + port/8 > tr_seg.limit)
1790 return false;
1791 r = ops->read_std(tr_seg.base + io_bitmap_ptr + port/8, &perm, 1,
1792 ctxt->vcpu, NULL);
1793 if (r != X86EMUL_CONTINUE)
1794 return false;
1795 if ((perm >> bit_idx) & mask)
1796 return false;
1797 return true;
1798}
1799
1800static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt,
1801 struct x86_emulate_ops *ops,
1802 u16 port, u16 len)
1803{
1804 if (emulator_bad_iopl(ctxt))
1805 if (!emulator_io_port_access_allowed(ctxt, ops, port, len))
1806 return false;
1807 return true;
1609} 1808}
1610 1809
1611int 1810int
@@ -1629,6 +1828,18 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1629 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); 1828 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
1630 saved_eip = c->eip; 1829 saved_eip = c->eip;
1631 1830
1831 /* LOCK prefix is allowed only with some instructions */
1832 if (c->lock_prefix && !(c->d & Lock)) {
1833 kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
1834 goto done;
1835 }
1836
1837 /* Privileged instruction can be executed only in CPL=0 */
1838 if ((c->d & Priv) && kvm_x86_ops->get_cpl(ctxt->vcpu)) {
1839 kvm_inject_gp(ctxt->vcpu, 0);
1840 goto done;
1841 }
1842
1632 if (((c->d & ModRM) && (c->modrm_mod != 3)) || (c->d & MemAbs)) 1843 if (((c->d & ModRM) && (c->modrm_mod != 3)) || (c->d & MemAbs))
1633 memop = c->modrm_ea; 1844 memop = c->modrm_ea;
1634 1845
@@ -1669,7 +1880,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1669 &c->src.val, 1880 &c->src.val,
1670 c->src.bytes, 1881 c->src.bytes,
1671 ctxt->vcpu); 1882 ctxt->vcpu);
1672 if (rc != 0) 1883 if (rc != X86EMUL_CONTINUE)
1673 goto done; 1884 goto done;
1674 c->src.orig_val = c->src.val; 1885 c->src.orig_val = c->src.val;
1675 } 1886 }
@@ -1688,12 +1899,15 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1688 c->dst.ptr = (void *)c->dst.ptr + 1899 c->dst.ptr = (void *)c->dst.ptr +
1689 (c->src.val & mask) / 8; 1900 (c->src.val & mask) / 8;
1690 } 1901 }
1691 if (!(c->d & Mov) && 1902 if (!(c->d & Mov)) {
1692 /* optimisation - avoid slow emulated read */ 1903 /* optimisation - avoid slow emulated read */
1693 ((rc = ops->read_emulated((unsigned long)c->dst.ptr, 1904 rc = ops->read_emulated((unsigned long)c->dst.ptr,
1694 &c->dst.val, 1905 &c->dst.val,
1695 c->dst.bytes, ctxt->vcpu)) != 0)) 1906 c->dst.bytes,
1696 goto done; 1907 ctxt->vcpu);
1908 if (rc != X86EMUL_CONTINUE)
1909 goto done;
1910 }
1697 } 1911 }
1698 c->dst.orig_val = c->dst.val; 1912 c->dst.orig_val = c->dst.val;
1699 1913
@@ -1707,18 +1921,45 @@ special_insn:
1707 add: /* add */ 1921 add: /* add */
1708 emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags); 1922 emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
1709 break; 1923 break;
1924 case 0x06: /* push es */
1925 emulate_push_sreg(ctxt, VCPU_SREG_ES);
1926 break;
1927 case 0x07: /* pop es */
1928 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES);
1929 if (rc != 0)
1930 goto done;
1931 break;
1710 case 0x08 ... 0x0d: 1932 case 0x08 ... 0x0d:
1711 or: /* or */ 1933 or: /* or */
1712 emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags); 1934 emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
1713 break; 1935 break;
1936 case 0x0e: /* push cs */
1937 emulate_push_sreg(ctxt, VCPU_SREG_CS);
1938 break;
1714 case 0x10 ... 0x15: 1939 case 0x10 ... 0x15:
1715 adc: /* adc */ 1940 adc: /* adc */
1716 emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags); 1941 emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags);
1717 break; 1942 break;
1943 case 0x16: /* push ss */
1944 emulate_push_sreg(ctxt, VCPU_SREG_SS);
1945 break;
1946 case 0x17: /* pop ss */
1947 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS);
1948 if (rc != 0)
1949 goto done;
1950 break;
1718 case 0x18 ... 0x1d: 1951 case 0x18 ... 0x1d:
1719 sbb: /* sbb */ 1952 sbb: /* sbb */
1720 emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags); 1953 emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags);
1721 break; 1954 break;
1955 case 0x1e: /* push ds */
1956 emulate_push_sreg(ctxt, VCPU_SREG_DS);
1957 break;
1958 case 0x1f: /* pop ds */
1959 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS);
1960 if (rc != 0)
1961 goto done;
1962 break;
1722 case 0x20 ... 0x25: 1963 case 0x20 ... 0x25:
1723 and: /* and */ 1964 and: /* and */
1724 emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags); 1965 emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags);
@@ -1750,6 +1991,14 @@ special_insn:
1750 if (rc != 0) 1991 if (rc != 0)
1751 goto done; 1992 goto done;
1752 break; 1993 break;
1994 case 0x60: /* pusha */
1995 emulate_pusha(ctxt);
1996 break;
1997 case 0x61: /* popa */
1998 rc = emulate_popa(ctxt, ops);
1999 if (rc != 0)
2000 goto done;
2001 break;
1753 case 0x63: /* movsxd */ 2002 case 0x63: /* movsxd */
1754 if (ctxt->mode != X86EMUL_MODE_PROT64) 2003 if (ctxt->mode != X86EMUL_MODE_PROT64)
1755 goto cannot_emulate; 2004 goto cannot_emulate;
@@ -1761,7 +2010,12 @@ special_insn:
1761 break; 2010 break;
1762 case 0x6c: /* insb */ 2011 case 0x6c: /* insb */
1763 case 0x6d: /* insw/insd */ 2012 case 0x6d: /* insw/insd */
1764 if (kvm_emulate_pio_string(ctxt->vcpu, NULL, 2013 if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX],
2014 (c->d & ByteOp) ? 1 : c->op_bytes)) {
2015 kvm_inject_gp(ctxt->vcpu, 0);
2016 goto done;
2017 }
2018 if (kvm_emulate_pio_string(ctxt->vcpu,
1765 1, 2019 1,
1766 (c->d & ByteOp) ? 1 : c->op_bytes, 2020 (c->d & ByteOp) ? 1 : c->op_bytes,
1767 c->rep_prefix ? 2021 c->rep_prefix ?
@@ -1777,7 +2031,12 @@ special_insn:
1777 return 0; 2031 return 0;
1778 case 0x6e: /* outsb */ 2032 case 0x6e: /* outsb */
1779 case 0x6f: /* outsw/outsd */ 2033 case 0x6f: /* outsw/outsd */
1780 if (kvm_emulate_pio_string(ctxt->vcpu, NULL, 2034 if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX],
2035 (c->d & ByteOp) ? 1 : c->op_bytes)) {
2036 kvm_inject_gp(ctxt->vcpu, 0);
2037 goto done;
2038 }
2039 if (kvm_emulate_pio_string(ctxt->vcpu,
1781 0, 2040 0,
1782 (c->d & ByteOp) ? 1 : c->op_bytes, 2041 (c->d & ByteOp) ? 1 : c->op_bytes,
1783 c->rep_prefix ? 2042 c->rep_prefix ?
@@ -1863,25 +2122,19 @@ special_insn:
1863 break; 2122 break;
1864 case 0x8e: { /* mov seg, r/m16 */ 2123 case 0x8e: { /* mov seg, r/m16 */
1865 uint16_t sel; 2124 uint16_t sel;
1866 int type_bits;
1867 int err;
1868 2125
1869 sel = c->src.val; 2126 sel = c->src.val;
1870 if (c->modrm_reg == VCPU_SREG_SS)
1871 toggle_interruptibility(ctxt, X86_SHADOW_INT_MOV_SS);
1872 2127
1873 if (c->modrm_reg <= 5) { 2128 if (c->modrm_reg == VCPU_SREG_CS ||
1874 type_bits = (c->modrm_reg == 1) ? 9 : 1; 2129 c->modrm_reg > VCPU_SREG_GS) {
1875 err = kvm_load_segment_descriptor(ctxt->vcpu, sel, 2130 kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
1876 type_bits, c->modrm_reg); 2131 goto done;
1877 } else {
1878 printk(KERN_INFO "Invalid segreg in modrm byte 0x%02x\n",
1879 c->modrm);
1880 goto cannot_emulate;
1881 } 2132 }
1882 2133
1883 if (err < 0) 2134 if (c->modrm_reg == VCPU_SREG_SS)
1884 goto cannot_emulate; 2135 toggle_interruptibility(ctxt, X86_SHADOW_INT_MOV_SS);
2136
2137 rc = kvm_load_segment_descriptor(ctxt->vcpu, sel, c->modrm_reg);
1885 2138
1886 c->dst.type = OP_NONE; /* Disable writeback. */ 2139 c->dst.type = OP_NONE; /* Disable writeback. */
1887 break; 2140 break;
@@ -1910,7 +2163,10 @@ special_insn:
1910 c->dst.type = OP_REG; 2163 c->dst.type = OP_REG;
1911 c->dst.ptr = (unsigned long *) &ctxt->eflags; 2164 c->dst.ptr = (unsigned long *) &ctxt->eflags;
1912 c->dst.bytes = c->op_bytes; 2165 c->dst.bytes = c->op_bytes;
1913 goto pop_instruction; 2166 rc = emulate_popf(ctxt, ops, &c->dst.val, c->op_bytes);
2167 if (rc != X86EMUL_CONTINUE)
2168 goto done;
2169 break;
1914 case 0xa0 ... 0xa1: /* mov */ 2170 case 0xa0 ... 0xa1: /* mov */
1915 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; 2171 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
1916 c->dst.val = c->src.val; 2172 c->dst.val = c->src.val;
@@ -1924,11 +2180,12 @@ special_insn:
1924 c->dst.ptr = (unsigned long *)register_address(c, 2180 c->dst.ptr = (unsigned long *)register_address(c,
1925 es_base(ctxt), 2181 es_base(ctxt),
1926 c->regs[VCPU_REGS_RDI]); 2182 c->regs[VCPU_REGS_RDI]);
1927 if ((rc = ops->read_emulated(register_address(c, 2183 rc = ops->read_emulated(register_address(c,
1928 seg_override_base(ctxt, c), 2184 seg_override_base(ctxt, c),
1929 c->regs[VCPU_REGS_RSI]), 2185 c->regs[VCPU_REGS_RSI]),
1930 &c->dst.val, 2186 &c->dst.val,
1931 c->dst.bytes, ctxt->vcpu)) != 0) 2187 c->dst.bytes, ctxt->vcpu);
2188 if (rc != X86EMUL_CONTINUE)
1932 goto done; 2189 goto done;
1933 register_address_increment(c, &c->regs[VCPU_REGS_RSI], 2190 register_address_increment(c, &c->regs[VCPU_REGS_RSI],
1934 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes 2191 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
@@ -1943,10 +2200,11 @@ special_insn:
1943 c->src.ptr = (unsigned long *)register_address(c, 2200 c->src.ptr = (unsigned long *)register_address(c,
1944 seg_override_base(ctxt, c), 2201 seg_override_base(ctxt, c),
1945 c->regs[VCPU_REGS_RSI]); 2202 c->regs[VCPU_REGS_RSI]);
1946 if ((rc = ops->read_emulated((unsigned long)c->src.ptr, 2203 rc = ops->read_emulated((unsigned long)c->src.ptr,
1947 &c->src.val, 2204 &c->src.val,
1948 c->src.bytes, 2205 c->src.bytes,
1949 ctxt->vcpu)) != 0) 2206 ctxt->vcpu);
2207 if (rc != X86EMUL_CONTINUE)
1950 goto done; 2208 goto done;
1951 2209
1952 c->dst.type = OP_NONE; /* Disable writeback. */ 2210 c->dst.type = OP_NONE; /* Disable writeback. */
@@ -1954,10 +2212,11 @@ special_insn:
1954 c->dst.ptr = (unsigned long *)register_address(c, 2212 c->dst.ptr = (unsigned long *)register_address(c,
1955 es_base(ctxt), 2213 es_base(ctxt),
1956 c->regs[VCPU_REGS_RDI]); 2214 c->regs[VCPU_REGS_RDI]);
1957 if ((rc = ops->read_emulated((unsigned long)c->dst.ptr, 2215 rc = ops->read_emulated((unsigned long)c->dst.ptr,
1958 &c->dst.val, 2216 &c->dst.val,
1959 c->dst.bytes, 2217 c->dst.bytes,
1960 ctxt->vcpu)) != 0) 2218 ctxt->vcpu);
2219 if (rc != X86EMUL_CONTINUE)
1961 goto done; 2220 goto done;
1962 2221
1963 DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr); 2222 DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr);
@@ -1987,12 +2246,13 @@ special_insn:
1987 c->dst.type = OP_REG; 2246 c->dst.type = OP_REG;
1988 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 2247 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1989 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; 2248 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
1990 if ((rc = ops->read_emulated(register_address(c, 2249 rc = ops->read_emulated(register_address(c,
1991 seg_override_base(ctxt, c), 2250 seg_override_base(ctxt, c),
1992 c->regs[VCPU_REGS_RSI]), 2251 c->regs[VCPU_REGS_RSI]),
1993 &c->dst.val, 2252 &c->dst.val,
1994 c->dst.bytes, 2253 c->dst.bytes,
1995 ctxt->vcpu)) != 0) 2254 ctxt->vcpu);
2255 if (rc != X86EMUL_CONTINUE)
1996 goto done; 2256 goto done;
1997 register_address_increment(c, &c->regs[VCPU_REGS_RSI], 2257 register_address_increment(c, &c->regs[VCPU_REGS_RSI],
1998 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes 2258 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
@@ -2048,11 +2308,9 @@ special_insn:
2048 case 0xe9: /* jmp rel */ 2308 case 0xe9: /* jmp rel */
2049 goto jmp; 2309 goto jmp;
2050 case 0xea: /* jmp far */ 2310 case 0xea: /* jmp far */
2051 if (kvm_load_segment_descriptor(ctxt->vcpu, c->src2.val, 9, 2311 if (kvm_load_segment_descriptor(ctxt->vcpu, c->src2.val,
2052 VCPU_SREG_CS) < 0) { 2312 VCPU_SREG_CS))
2053 DPRINTF("jmp far: Failed to load CS descriptor\n"); 2313 goto done;
2054 goto cannot_emulate;
2055 }
2056 2314
2057 c->eip = c->src.val; 2315 c->eip = c->src.val;
2058 break; 2316 break;
@@ -2070,7 +2328,13 @@ special_insn:
2070 case 0xef: /* out (e/r)ax,dx */ 2328 case 0xef: /* out (e/r)ax,dx */
2071 port = c->regs[VCPU_REGS_RDX]; 2329 port = c->regs[VCPU_REGS_RDX];
2072 io_dir_in = 0; 2330 io_dir_in = 0;
2073 do_io: if (kvm_emulate_pio(ctxt->vcpu, NULL, io_dir_in, 2331 do_io:
2332 if (!emulator_io_permited(ctxt, ops, port,
2333 (c->d & ByteOp) ? 1 : c->op_bytes)) {
2334 kvm_inject_gp(ctxt->vcpu, 0);
2335 goto done;
2336 }
2337 if (kvm_emulate_pio(ctxt->vcpu, io_dir_in,
2074 (c->d & ByteOp) ? 1 : c->op_bytes, 2338 (c->d & ByteOp) ? 1 : c->op_bytes,
2075 port) != 0) { 2339 port) != 0) {
2076 c->eip = saved_eip; 2340 c->eip = saved_eip;
@@ -2095,13 +2359,21 @@ special_insn:
2095 c->dst.type = OP_NONE; /* Disable writeback. */ 2359 c->dst.type = OP_NONE; /* Disable writeback. */
2096 break; 2360 break;
2097 case 0xfa: /* cli */ 2361 case 0xfa: /* cli */
2098 ctxt->eflags &= ~X86_EFLAGS_IF; 2362 if (emulator_bad_iopl(ctxt))
2099 c->dst.type = OP_NONE; /* Disable writeback. */ 2363 kvm_inject_gp(ctxt->vcpu, 0);
2364 else {
2365 ctxt->eflags &= ~X86_EFLAGS_IF;
2366 c->dst.type = OP_NONE; /* Disable writeback. */
2367 }
2100 break; 2368 break;
2101 case 0xfb: /* sti */ 2369 case 0xfb: /* sti */
2102 toggle_interruptibility(ctxt, X86_SHADOW_INT_STI); 2370 if (emulator_bad_iopl(ctxt))
2103 ctxt->eflags |= X86_EFLAGS_IF; 2371 kvm_inject_gp(ctxt->vcpu, 0);
2104 c->dst.type = OP_NONE; /* Disable writeback. */ 2372 else {
2373 toggle_interruptibility(ctxt, X86_SHADOW_INT_STI);
2374 ctxt->eflags |= X86_EFLAGS_IF;
2375 c->dst.type = OP_NONE; /* Disable writeback. */
2376 }
2105 break; 2377 break;
2106 case 0xfc: /* cld */ 2378 case 0xfc: /* cld */
2107 ctxt->eflags &= ~EFLG_DF; 2379 ctxt->eflags &= ~EFLG_DF;
@@ -2204,8 +2476,9 @@ twobyte_insn:
2204 } 2476 }
2205 break; 2477 break;
2206 case 0x05: /* syscall */ 2478 case 0x05: /* syscall */
2207 if (emulate_syscall(ctxt) == -1) 2479 rc = emulate_syscall(ctxt);
2208 goto cannot_emulate; 2480 if (rc != X86EMUL_CONTINUE)
2481 goto done;
2209 else 2482 else
2210 goto writeback; 2483 goto writeback;
2211 break; 2484 break;
@@ -2276,14 +2549,16 @@ twobyte_insn:
2276 c->dst.type = OP_NONE; 2549 c->dst.type = OP_NONE;
2277 break; 2550 break;
2278 case 0x34: /* sysenter */ 2551 case 0x34: /* sysenter */
2279 if (emulate_sysenter(ctxt) == -1) 2552 rc = emulate_sysenter(ctxt);
2280 goto cannot_emulate; 2553 if (rc != X86EMUL_CONTINUE)
2554 goto done;
2281 else 2555 else
2282 goto writeback; 2556 goto writeback;
2283 break; 2557 break;
2284 case 0x35: /* sysexit */ 2558 case 0x35: /* sysexit */
2285 if (emulate_sysexit(ctxt) == -1) 2559 rc = emulate_sysexit(ctxt);
2286 goto cannot_emulate; 2560 if (rc != X86EMUL_CONTINUE)
2561 goto done;
2287 else 2562 else
2288 goto writeback; 2563 goto writeback;
2289 break; 2564 break;
@@ -2297,6 +2572,14 @@ twobyte_insn:
2297 jmp_rel(c, c->src.val); 2572 jmp_rel(c, c->src.val);
2298 c->dst.type = OP_NONE; 2573 c->dst.type = OP_NONE;
2299 break; 2574 break;
2575 case 0xa0: /* push fs */
2576 emulate_push_sreg(ctxt, VCPU_SREG_FS);
2577 break;
2578 case 0xa1: /* pop fs */
2579 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS);
2580 if (rc != 0)
2581 goto done;
2582 break;
2300 case 0xa3: 2583 case 0xa3:
2301 bt: /* bt */ 2584 bt: /* bt */
2302 c->dst.type = OP_NONE; 2585 c->dst.type = OP_NONE;
@@ -2308,6 +2591,14 @@ twobyte_insn:
2308 case 0xa5: /* shld cl, r, r/m */ 2591 case 0xa5: /* shld cl, r, r/m */
2309 emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags); 2592 emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags);
2310 break; 2593 break;
2594 case 0xa8: /* push gs */
2595 emulate_push_sreg(ctxt, VCPU_SREG_GS);
2596 break;
2597 case 0xa9: /* pop gs */
2598 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS);
2599 if (rc != 0)
2600 goto done;
2601 break;
2311 case 0xab: 2602 case 0xab:
2312 bts: /* bts */ 2603 bts: /* bts */
2313 /* only subword offset */ 2604 /* only subword offset */
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 144e7f60b5e2..0150affad25d 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -29,7 +29,10 @@
29 * Based on QEMU and Xen. 29 * Based on QEMU and Xen.
30 */ 30 */
31 31
32#define pr_fmt(fmt) "pit: " fmt
33
32#include <linux/kvm_host.h> 34#include <linux/kvm_host.h>
35#include <linux/slab.h>
33 36
34#include "irq.h" 37#include "irq.h"
35#include "i8254.h" 38#include "i8254.h"
@@ -240,11 +243,11 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
240{ 243{
241 struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state, 244 struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state,
242 irq_ack_notifier); 245 irq_ack_notifier);
243 spin_lock(&ps->inject_lock); 246 raw_spin_lock(&ps->inject_lock);
244 if (atomic_dec_return(&ps->pit_timer.pending) < 0) 247 if (atomic_dec_return(&ps->pit_timer.pending) < 0)
245 atomic_inc(&ps->pit_timer.pending); 248 atomic_inc(&ps->pit_timer.pending);
246 ps->irq_ack = 1; 249 ps->irq_ack = 1;
247 spin_unlock(&ps->inject_lock); 250 raw_spin_unlock(&ps->inject_lock);
248} 251}
249 252
250void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) 253void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
@@ -262,7 +265,7 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
262 265
263static void destroy_pit_timer(struct kvm_timer *pt) 266static void destroy_pit_timer(struct kvm_timer *pt)
264{ 267{
265 pr_debug("pit: execute del timer!\n"); 268 pr_debug("execute del timer!\n");
266 hrtimer_cancel(&pt->timer); 269 hrtimer_cancel(&pt->timer);
267} 270}
268 271
@@ -284,7 +287,7 @@ static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
284 287
285 interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ); 288 interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ);
286 289
287 pr_debug("pit: create pit timer, interval is %llu nsec\n", interval); 290 pr_debug("create pit timer, interval is %llu nsec\n", interval);
288 291
289 /* TODO The new value only affected after the retriggered */ 292 /* TODO The new value only affected after the retriggered */
290 hrtimer_cancel(&pt->timer); 293 hrtimer_cancel(&pt->timer);
@@ -309,7 +312,7 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)
309 312
310 WARN_ON(!mutex_is_locked(&ps->lock)); 313 WARN_ON(!mutex_is_locked(&ps->lock));
311 314
312 pr_debug("pit: load_count val is %d, channel is %d\n", val, channel); 315 pr_debug("load_count val is %d, channel is %d\n", val, channel);
313 316
314 /* 317 /*
315 * The largest possible initial count is 0; this is equivalent 318 * The largest possible initial count is 0; this is equivalent
@@ -395,8 +398,8 @@ static int pit_ioport_write(struct kvm_io_device *this,
395 mutex_lock(&pit_state->lock); 398 mutex_lock(&pit_state->lock);
396 399
397 if (val != 0) 400 if (val != 0)
398 pr_debug("pit: write addr is 0x%x, len is %d, val is 0x%x\n", 401 pr_debug("write addr is 0x%x, len is %d, val is 0x%x\n",
399 (unsigned int)addr, len, val); 402 (unsigned int)addr, len, val);
400 403
401 if (addr == 3) { 404 if (addr == 3) {
402 channel = val >> 6; 405 channel = val >> 6;
@@ -465,6 +468,9 @@ static int pit_ioport_read(struct kvm_io_device *this,
465 return -EOPNOTSUPP; 468 return -EOPNOTSUPP;
466 469
467 addr &= KVM_PIT_CHANNEL_MASK; 470 addr &= KVM_PIT_CHANNEL_MASK;
471 if (addr == 3)
472 return 0;
473
468 s = &pit_state->channels[addr]; 474 s = &pit_state->channels[addr];
469 475
470 mutex_lock(&pit_state->lock); 476 mutex_lock(&pit_state->lock);
@@ -600,7 +606,7 @@ static const struct kvm_io_device_ops speaker_dev_ops = {
600 .write = speaker_ioport_write, 606 .write = speaker_ioport_write,
601}; 607};
602 608
603/* Caller must have writers lock on slots_lock */ 609/* Caller must hold slots_lock */
604struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) 610struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
605{ 611{
606 struct kvm_pit *pit; 612 struct kvm_pit *pit;
@@ -619,7 +625,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
619 625
620 mutex_init(&pit->pit_state.lock); 626 mutex_init(&pit->pit_state.lock);
621 mutex_lock(&pit->pit_state.lock); 627 mutex_lock(&pit->pit_state.lock);
622 spin_lock_init(&pit->pit_state.inject_lock); 628 raw_spin_lock_init(&pit->pit_state.inject_lock);
623 629
624 kvm->arch.vpit = pit; 630 kvm->arch.vpit = pit;
625 pit->kvm = kvm; 631 pit->kvm = kvm;
@@ -640,13 +646,13 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
640 kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier); 646 kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
641 647
642 kvm_iodevice_init(&pit->dev, &pit_dev_ops); 648 kvm_iodevice_init(&pit->dev, &pit_dev_ops);
643 ret = __kvm_io_bus_register_dev(&kvm->pio_bus, &pit->dev); 649 ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &pit->dev);
644 if (ret < 0) 650 if (ret < 0)
645 goto fail; 651 goto fail;
646 652
647 if (flags & KVM_PIT_SPEAKER_DUMMY) { 653 if (flags & KVM_PIT_SPEAKER_DUMMY) {
648 kvm_iodevice_init(&pit->speaker_dev, &speaker_dev_ops); 654 kvm_iodevice_init(&pit->speaker_dev, &speaker_dev_ops);
649 ret = __kvm_io_bus_register_dev(&kvm->pio_bus, 655 ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS,
650 &pit->speaker_dev); 656 &pit->speaker_dev);
651 if (ret < 0) 657 if (ret < 0)
652 goto fail_unregister; 658 goto fail_unregister;
@@ -655,11 +661,12 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
655 return pit; 661 return pit;
656 662
657fail_unregister: 663fail_unregister:
658 __kvm_io_bus_unregister_dev(&kvm->pio_bus, &pit->dev); 664 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->dev);
659 665
660fail: 666fail:
661 if (pit->irq_source_id >= 0) 667 kvm_unregister_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
662 kvm_free_irq_source_id(kvm, pit->irq_source_id); 668 kvm_unregister_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier);
669 kvm_free_irq_source_id(kvm, pit->irq_source_id);
663 670
664 kfree(pit); 671 kfree(pit);
665 return NULL; 672 return NULL;
@@ -688,10 +695,8 @@ static void __inject_pit_timer_intr(struct kvm *kvm)
688 struct kvm_vcpu *vcpu; 695 struct kvm_vcpu *vcpu;
689 int i; 696 int i;
690 697
691 mutex_lock(&kvm->irq_lock);
692 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1); 698 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1);
693 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0); 699 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0);
694 mutex_unlock(&kvm->irq_lock);
695 700
696 /* 701 /*
697 * Provides NMI watchdog support via Virtual Wire mode. 702 * Provides NMI watchdog support via Virtual Wire mode.
@@ -720,12 +725,12 @@ void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu)
720 /* Try to inject pending interrupts when 725 /* Try to inject pending interrupts when
721 * last one has been acked. 726 * last one has been acked.
722 */ 727 */
723 spin_lock(&ps->inject_lock); 728 raw_spin_lock(&ps->inject_lock);
724 if (atomic_read(&ps->pit_timer.pending) && ps->irq_ack) { 729 if (atomic_read(&ps->pit_timer.pending) && ps->irq_ack) {
725 ps->irq_ack = 0; 730 ps->irq_ack = 0;
726 inject = 1; 731 inject = 1;
727 } 732 }
728 spin_unlock(&ps->inject_lock); 733 raw_spin_unlock(&ps->inject_lock);
729 if (inject) 734 if (inject)
730 __inject_pit_timer_intr(kvm); 735 __inject_pit_timer_intr(kvm);
731 } 736 }
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index d4c1c7ffdc09..900d6b0ba7c2 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -27,7 +27,7 @@ struct kvm_kpit_state {
27 u32 speaker_data_on; 27 u32 speaker_data_on;
28 struct mutex lock; 28 struct mutex lock;
29 struct kvm_pit *pit; 29 struct kvm_pit *pit;
30 spinlock_t inject_lock; 30 raw_spinlock_t inject_lock;
31 unsigned long irq_ack; 31 unsigned long irq_ack;
32 struct kvm_irq_ack_notifier irq_ack_notifier; 32 struct kvm_irq_ack_notifier irq_ack_notifier;
33}; 33};
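A brief sketch of the lock-type change above: raw_spinlock_t never becomes a sleeping lock, which matters once interrupt injection can run in atomic context. The struct and function names below are illustrative; only the raw_spin_* API is real:

struct example_inject_state {
	raw_spinlock_t inject_lock;
	int pending;
};

static void example_init(struct example_inject_state *st)
{
	raw_spin_lock_init(&st->inject_lock);
}

static void example_mark_pending(struct example_inject_state *st)
{
	raw_spin_lock(&st->inject_lock);	/* always spins, never sleeps */
	st->pending = 1;
	raw_spin_unlock(&st->inject_lock);
}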
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 01f151682802..a790fa128a9f 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -26,6 +26,7 @@
26 * Port from Qemu. 26 * Port from Qemu.
27 */ 27 */
28#include <linux/mm.h> 28#include <linux/mm.h>
29#include <linux/slab.h>
29#include <linux/bitops.h> 30#include <linux/bitops.h>
30#include "irq.h" 31#include "irq.h"
31 32
@@ -38,16 +39,25 @@ static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
38 s->isr_ack |= (1 << irq); 39 s->isr_ack |= (1 << irq);
39 if (s != &s->pics_state->pics[0]) 40 if (s != &s->pics_state->pics[0])
40 irq += 8; 41 irq += 8;
42 /*
 43 * We are dropping the lock while calling the ack notifiers because the
 44 * notifier callbacks for assigned devices call back into the PIC.
 45 * Other interrupts may be delivered to the PIC while the lock is dropped,
 46 * but that is safe since the PIC state is already updated at this stage.
47 */
48 raw_spin_unlock(&s->pics_state->lock);
41 kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq); 49 kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq);
50 raw_spin_lock(&s->pics_state->lock);
42} 51}
43 52
44void kvm_pic_clear_isr_ack(struct kvm *kvm) 53void kvm_pic_clear_isr_ack(struct kvm *kvm)
45{ 54{
46 struct kvm_pic *s = pic_irqchip(kvm); 55 struct kvm_pic *s = pic_irqchip(kvm);
47 spin_lock(&s->lock); 56
57 raw_spin_lock(&s->lock);
48 s->pics[0].isr_ack = 0xff; 58 s->pics[0].isr_ack = 0xff;
49 s->pics[1].isr_ack = 0xff; 59 s->pics[1].isr_ack = 0xff;
50 spin_unlock(&s->lock); 60 raw_spin_unlock(&s->lock);
51} 61}
52 62
53/* 63/*
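The comment added to pic_clear_isr() describes a drop-and-retake pattern around the ack notifiers. A condensed sketch of that pattern, using the post-patch raw_spinlock_t PIC lock; kvm_notify_acked_irq() and SELECT_PIC() are the real helpers, the wrapper itself is illustrative:

static void example_ack_unlocked(struct kvm_pic *s, int irq)
{
	/* PIC state must already be consistent before the lock is dropped. */
	raw_spin_unlock(&s->lock);
	kvm_notify_acked_irq(s->kvm, SELECT_PIC(irq), irq);
	raw_spin_lock(&s->lock);
}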
@@ -148,9 +158,9 @@ static void pic_update_irq(struct kvm_pic *s)
148 158
149void kvm_pic_update_irq(struct kvm_pic *s) 159void kvm_pic_update_irq(struct kvm_pic *s)
150{ 160{
151 spin_lock(&s->lock); 161 raw_spin_lock(&s->lock);
152 pic_update_irq(s); 162 pic_update_irq(s);
153 spin_unlock(&s->lock); 163 raw_spin_unlock(&s->lock);
154} 164}
155 165
156int kvm_pic_set_irq(void *opaque, int irq, int level) 166int kvm_pic_set_irq(void *opaque, int irq, int level)
@@ -158,14 +168,14 @@ int kvm_pic_set_irq(void *opaque, int irq, int level)
158 struct kvm_pic *s = opaque; 168 struct kvm_pic *s = opaque;
159 int ret = -1; 169 int ret = -1;
160 170
161 spin_lock(&s->lock); 171 raw_spin_lock(&s->lock);
162 if (irq >= 0 && irq < PIC_NUM_PINS) { 172 if (irq >= 0 && irq < PIC_NUM_PINS) {
163 ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); 173 ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level);
164 pic_update_irq(s); 174 pic_update_irq(s);
165 trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr, 175 trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr,
166 s->pics[irq >> 3].imr, ret == 0); 176 s->pics[irq >> 3].imr, ret == 0);
167 } 177 }
168 spin_unlock(&s->lock); 178 raw_spin_unlock(&s->lock);
169 179
170 return ret; 180 return ret;
171} 181}
@@ -176,16 +186,18 @@ int kvm_pic_set_irq(void *opaque, int irq, int level)
176static inline void pic_intack(struct kvm_kpic_state *s, int irq) 186static inline void pic_intack(struct kvm_kpic_state *s, int irq)
177{ 187{
178 s->isr |= 1 << irq; 188 s->isr |= 1 << irq;
179 if (s->auto_eoi) {
180 if (s->rotate_on_auto_eoi)
181 s->priority_add = (irq + 1) & 7;
182 pic_clear_isr(s, irq);
183 }
184 /* 189 /*
185 * We don't clear a level sensitive interrupt here 190 * We don't clear a level sensitive interrupt here
186 */ 191 */
187 if (!(s->elcr & (1 << irq))) 192 if (!(s->elcr & (1 << irq)))
188 s->irr &= ~(1 << irq); 193 s->irr &= ~(1 << irq);
194
195 if (s->auto_eoi) {
196 if (s->rotate_on_auto_eoi)
197 s->priority_add = (irq + 1) & 7;
198 pic_clear_isr(s, irq);
199 }
200
189} 201}
190 202
191int kvm_pic_read_irq(struct kvm *kvm) 203int kvm_pic_read_irq(struct kvm *kvm)
@@ -193,7 +205,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
193 int irq, irq2, intno; 205 int irq, irq2, intno;
194 struct kvm_pic *s = pic_irqchip(kvm); 206 struct kvm_pic *s = pic_irqchip(kvm);
195 207
196 spin_lock(&s->lock); 208 raw_spin_lock(&s->lock);
197 irq = pic_get_irq(&s->pics[0]); 209 irq = pic_get_irq(&s->pics[0]);
198 if (irq >= 0) { 210 if (irq >= 0) {
199 pic_intack(&s->pics[0], irq); 211 pic_intack(&s->pics[0], irq);
@@ -218,29 +230,18 @@ int kvm_pic_read_irq(struct kvm *kvm)
218 intno = s->pics[0].irq_base + irq; 230 intno = s->pics[0].irq_base + irq;
219 } 231 }
220 pic_update_irq(s); 232 pic_update_irq(s);
221 spin_unlock(&s->lock); 233 raw_spin_unlock(&s->lock);
222 234
223 return intno; 235 return intno;
224} 236}
225 237
226void kvm_pic_reset(struct kvm_kpic_state *s) 238void kvm_pic_reset(struct kvm_kpic_state *s)
227{ 239{
228 int irq, irqbase, n; 240 int irq;
229 struct kvm *kvm = s->pics_state->irq_request_opaque; 241 struct kvm *kvm = s->pics_state->irq_request_opaque;
230 struct kvm_vcpu *vcpu0 = kvm->bsp_vcpu; 242 struct kvm_vcpu *vcpu0 = kvm->bsp_vcpu;
243 u8 irr = s->irr, isr = s->imr;
231 244
232 if (s == &s->pics_state->pics[0])
233 irqbase = 0;
234 else
235 irqbase = 8;
236
237 for (irq = 0; irq < PIC_NUM_PINS/2; irq++) {
238 if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0))
239 if (s->irr & (1 << irq) || s->isr & (1 << irq)) {
240 n = irq + irqbase;
241 kvm_notify_acked_irq(kvm, SELECT_PIC(n), n);
242 }
243 }
244 s->last_irr = 0; 245 s->last_irr = 0;
245 s->irr = 0; 246 s->irr = 0;
246 s->imr = 0; 247 s->imr = 0;
@@ -256,6 +257,13 @@ void kvm_pic_reset(struct kvm_kpic_state *s)
256 s->rotate_on_auto_eoi = 0; 257 s->rotate_on_auto_eoi = 0;
257 s->special_fully_nested_mode = 0; 258 s->special_fully_nested_mode = 0;
258 s->init4 = 0; 259 s->init4 = 0;
260
261 for (irq = 0; irq < PIC_NUM_PINS/2; irq++) {
262 if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0))
263 if (irr & (1 << irq) || isr & (1 << irq)) {
264 pic_clear_isr(s, irq);
265 }
266 }
259} 267}
260 268
261static void pic_ioport_write(void *opaque, u32 addr, u32 val) 269static void pic_ioport_write(void *opaque, u32 addr, u32 val)
@@ -298,9 +306,9 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
298 priority = get_priority(s, s->isr); 306 priority = get_priority(s, s->isr);
299 if (priority != 8) { 307 if (priority != 8) {
300 irq = (priority + s->priority_add) & 7; 308 irq = (priority + s->priority_add) & 7;
301 pic_clear_isr(s, irq);
302 if (cmd == 5) 309 if (cmd == 5)
303 s->priority_add = (irq + 1) & 7; 310 s->priority_add = (irq + 1) & 7;
311 pic_clear_isr(s, irq);
304 pic_update_irq(s->pics_state); 312 pic_update_irq(s->pics_state);
305 } 313 }
306 break; 314 break;
@@ -436,7 +444,7 @@ static int picdev_write(struct kvm_io_device *this,
436 printk(KERN_ERR "PIC: non byte write\n"); 444 printk(KERN_ERR "PIC: non byte write\n");
437 return 0; 445 return 0;
438 } 446 }
439 spin_lock(&s->lock); 447 raw_spin_lock(&s->lock);
440 switch (addr) { 448 switch (addr) {
441 case 0x20: 449 case 0x20:
442 case 0x21: 450 case 0x21:
@@ -449,7 +457,7 @@ static int picdev_write(struct kvm_io_device *this,
449 elcr_ioport_write(&s->pics[addr & 1], addr, data); 457 elcr_ioport_write(&s->pics[addr & 1], addr, data);
450 break; 458 break;
451 } 459 }
452 spin_unlock(&s->lock); 460 raw_spin_unlock(&s->lock);
453 return 0; 461 return 0;
454} 462}
455 463
@@ -466,7 +474,7 @@ static int picdev_read(struct kvm_io_device *this,
466 printk(KERN_ERR "PIC: non byte read\n"); 474 printk(KERN_ERR "PIC: non byte read\n");
467 return 0; 475 return 0;
468 } 476 }
469 spin_lock(&s->lock); 477 raw_spin_lock(&s->lock);
470 switch (addr) { 478 switch (addr) {
471 case 0x20: 479 case 0x20:
472 case 0x21: 480 case 0x21:
@@ -480,7 +488,7 @@ static int picdev_read(struct kvm_io_device *this,
480 break; 488 break;
481 } 489 }
482 *(unsigned char *)val = data; 490 *(unsigned char *)val = data;
483 spin_unlock(&s->lock); 491 raw_spin_unlock(&s->lock);
484 return 0; 492 return 0;
485} 493}
486 494
@@ -514,7 +522,7 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
514 s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL); 522 s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL);
515 if (!s) 523 if (!s)
516 return NULL; 524 return NULL;
517 spin_lock_init(&s->lock); 525 raw_spin_lock_init(&s->lock);
518 s->kvm = kvm; 526 s->kvm = kvm;
519 s->pics[0].elcr_mask = 0xf8; 527 s->pics[0].elcr_mask = 0xf8;
520 s->pics[1].elcr_mask = 0xde; 528 s->pics[1].elcr_mask = 0xde;
@@ -527,7 +535,9 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
527 * Initialize PIO device 535 * Initialize PIO device
528 */ 536 */
529 kvm_iodevice_init(&s->dev, &picdev_ops); 537 kvm_iodevice_init(&s->dev, &picdev_ops);
530 ret = kvm_io_bus_register_dev(kvm, &kvm->pio_bus, &s->dev); 538 mutex_lock(&kvm->slots_lock);
539 ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &s->dev);
540 mutex_unlock(&kvm->slots_lock);
531 if (ret < 0) { 541 if (ret < 0) {
532 kfree(s); 542 kfree(s);
533 return NULL; 543 return NULL;
@@ -535,3 +545,14 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
535 545
536 return s; 546 return s;
537} 547}
548
549void kvm_destroy_pic(struct kvm *kvm)
550{
551 struct kvm_pic *vpic = kvm->arch.vpic;
552
553 if (vpic) {
554 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev);
555 kvm->arch.vpic = NULL;
556 kfree(vpic);
557 }
558}
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 7d6058a2fd38..34b15915754d 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -62,7 +62,7 @@ struct kvm_kpic_state {
62}; 62};
63 63
64struct kvm_pic { 64struct kvm_pic {
65 spinlock_t lock; 65 raw_spinlock_t lock;
66 unsigned pending_acks; 66 unsigned pending_acks;
67 struct kvm *kvm; 67 struct kvm *kvm;
68 struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ 68 struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
@@ -71,9 +71,11 @@ struct kvm_pic {
71 int output; /* intr from master PIC */ 71 int output; /* intr from master PIC */
72 struct kvm_io_device dev; 72 struct kvm_io_device dev;
73 void (*ack_notifier)(void *opaque, int irq); 73 void (*ack_notifier)(void *opaque, int irq);
74 unsigned long irq_states[16];
74}; 75};
75 76
76struct kvm_pic *kvm_create_pic(struct kvm *kvm); 77struct kvm_pic *kvm_create_pic(struct kvm *kvm);
78void kvm_destroy_pic(struct kvm *kvm);
77int kvm_pic_read_irq(struct kvm *kvm); 79int kvm_pic_read_irq(struct kvm *kvm);
78void kvm_pic_update_irq(struct kvm_pic *s); 80void kvm_pic_update_irq(struct kvm_pic *s);
79void kvm_pic_clear_isr_ack(struct kvm *kvm); 81void kvm_pic_clear_isr_ack(struct kvm *kvm);
@@ -85,7 +87,11 @@ static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
85 87
86static inline int irqchip_in_kernel(struct kvm *kvm) 88static inline int irqchip_in_kernel(struct kvm *kvm)
87{ 89{
88 return pic_irqchip(kvm) != NULL; 90 int ret;
91
92 ret = (pic_irqchip(kvm) != NULL);
93 smp_rmb();
94 return ret;
89} 95}
90 96
91void kvm_pic_reset(struct kvm_kpic_state *s); 97void kvm_pic_reset(struct kvm_kpic_state *s);
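The smp_rmb() added to irqchip_in_kernel() only pays off if the writer orders its stores. A hedged sketch of the matching write side, assuming an smp_wmb() between initialising the PIC and publishing kvm->arch.vpic (the actual ioctl code is not part of this hunk):

static int example_publish_irqchip(struct kvm *kvm)
{
	struct kvm_pic *vpic = kvm_create_pic(kvm);

	if (!vpic)
		return -ENOMEM;
	/*
	 * Make the fully initialised PIC visible before readers of
	 * irqchip_in_kernel() can observe a non-NULL pointer.
	 */
	smp_wmb();
	kvm->arch.vpic = vpic;
	return 0;
}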
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index 7bcc5b6a4403..cff851cf5322 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -1,6 +1,11 @@
1#ifndef ASM_KVM_CACHE_REGS_H 1#ifndef ASM_KVM_CACHE_REGS_H
2#define ASM_KVM_CACHE_REGS_H 2#define ASM_KVM_CACHE_REGS_H
3 3
4#define KVM_POSSIBLE_CR0_GUEST_BITS X86_CR0_TS
5#define KVM_POSSIBLE_CR4_GUEST_BITS \
6 (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \
7 | X86_CR4_OSXMMEXCPT | X86_CR4_PGE)
8
4static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu, 9static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu,
5 enum kvm_reg reg) 10 enum kvm_reg reg)
6{ 11{
@@ -38,4 +43,30 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)
38 return vcpu->arch.pdptrs[index]; 43 return vcpu->arch.pdptrs[index];
39} 44}
40 45
46static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask)
47{
48 ulong tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS;
49 if (tmask & vcpu->arch.cr0_guest_owned_bits)
50 kvm_x86_ops->decache_cr0_guest_bits(vcpu);
51 return vcpu->arch.cr0 & mask;
52}
53
54static inline ulong kvm_read_cr0(struct kvm_vcpu *vcpu)
55{
56 return kvm_read_cr0_bits(vcpu, ~0UL);
57}
58
59static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask)
60{
61 ulong tmask = mask & KVM_POSSIBLE_CR4_GUEST_BITS;
62 if (tmask & vcpu->arch.cr4_guest_owned_bits)
63 kvm_x86_ops->decache_cr4_guest_bits(vcpu);
64 return vcpu->arch.cr4 & mask;
65}
66
67static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu)
68{
69 return kvm_read_cr4_bits(vcpu, ~0UL);
70}
71
41#endif 72#endif
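A small usage sketch for the new accessors, under the assumption that only bits in KVM_POSSIBLE_CR0_GUEST_BITS / KVM_POSSIBLE_CR4_GUEST_BITS can be guest-owned and therefore stale in vcpu->arch.cr0/cr4; the helper name is illustrative and the snippet assumes the KVM-internal headers of this tree:

static bool example_guest_ts_set(struct kvm_vcpu *vcpu)
{
	/* CR0.TS may be guest-owned, so go through the masked accessor ... */
	bool ts = kvm_read_cr0_bits(vcpu, X86_CR0_TS) != 0;
	/* ... while a full read is still available when every bit matters. */
	unsigned long cr0 = kvm_read_cr0(vcpu);

	return ts && (cr0 & X86_CR0_PE);
}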
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 23c217692ea9..1eb7a4ae0c9c 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -26,13 +26,13 @@
26#include <linux/io.h> 26#include <linux/io.h>
27#include <linux/module.h> 27#include <linux/module.h>
28#include <linux/math64.h> 28#include <linux/math64.h>
29#include <linux/slab.h>
29#include <asm/processor.h> 30#include <asm/processor.h>
30#include <asm/msr.h> 31#include <asm/msr.h>
31#include <asm/page.h> 32#include <asm/page.h>
32#include <asm/current.h> 33#include <asm/current.h>
33#include <asm/apicdef.h> 34#include <asm/apicdef.h>
34#include <asm/atomic.h> 35#include <asm/atomic.h>
35#include <asm/apicdef.h>
36#include "kvm_cache_regs.h" 36#include "kvm_cache_regs.h"
37#include "irq.h" 37#include "irq.h"
38#include "trace.h" 38#include "trace.h"
@@ -374,6 +374,12 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
374 if (unlikely(!apic_enabled(apic))) 374 if (unlikely(!apic_enabled(apic)))
375 break; 375 break;
376 376
377 if (trig_mode) {
378 apic_debug("level trig mode for vector %d", vector);
379 apic_set_vector(vector, apic->regs + APIC_TMR);
380 } else
381 apic_clear_vector(vector, apic->regs + APIC_TMR);
382
377 result = !apic_test_and_set_irr(vector, apic); 383 result = !apic_test_and_set_irr(vector, apic);
378 trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode, 384 trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode,
379 trig_mode, vector, !result); 385 trig_mode, vector, !result);
@@ -384,11 +390,6 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
384 break; 390 break;
385 } 391 }
386 392
387 if (trig_mode) {
388 apic_debug("level trig mode for vector %d", vector);
389 apic_set_vector(vector, apic->regs + APIC_TMR);
390 } else
391 apic_clear_vector(vector, apic->regs + APIC_TMR);
392 kvm_vcpu_kick(vcpu); 393 kvm_vcpu_kick(vcpu);
393 break; 394 break;
394 395
@@ -471,11 +472,8 @@ static void apic_set_eoi(struct kvm_lapic *apic)
471 trigger_mode = IOAPIC_LEVEL_TRIG; 472 trigger_mode = IOAPIC_LEVEL_TRIG;
472 else 473 else
473 trigger_mode = IOAPIC_EDGE_TRIG; 474 trigger_mode = IOAPIC_EDGE_TRIG;
474 if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI)) { 475 if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI))
475 mutex_lock(&apic->vcpu->kvm->irq_lock);
476 kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); 476 kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
477 mutex_unlock(&apic->vcpu->kvm->irq_lock);
478 }
479} 477}
480 478
481static void apic_send_ipi(struct kvm_lapic *apic) 479static void apic_send_ipi(struct kvm_lapic *apic)
@@ -504,9 +502,7 @@ static void apic_send_ipi(struct kvm_lapic *apic)
504 irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode, 502 irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode,
505 irq.vector); 503 irq.vector);
506 504
507 mutex_lock(&apic->vcpu->kvm->irq_lock);
508 kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq); 505 kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq);
509 mutex_unlock(&apic->vcpu->kvm->irq_lock);
510} 506}
511 507
512static u32 apic_get_tmcct(struct kvm_lapic *apic) 508static u32 apic_get_tmcct(struct kvm_lapic *apic)
@@ -1156,6 +1152,7 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
1156 hrtimer_cancel(&apic->lapic_timer.timer); 1152 hrtimer_cancel(&apic->lapic_timer.timer);
1157 update_divide_count(apic); 1153 update_divide_count(apic);
1158 start_apic_timer(apic); 1154 start_apic_timer(apic);
1155 apic->irr_pending = true;
1159} 1156}
1160 1157
1161void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) 1158void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
@@ -1250,3 +1247,34 @@ int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
1250 1247
1251 return 0; 1248 return 0;
1252} 1249}
1250
1251int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 reg, u64 data)
1252{
1253 struct kvm_lapic *apic = vcpu->arch.apic;
1254
1255 if (!irqchip_in_kernel(vcpu->kvm))
1256 return 1;
1257
1258 /* if this is ICR write vector before command */
1259 if (reg == APIC_ICR)
1260 apic_reg_write(apic, APIC_ICR2, (u32)(data >> 32));
1261 return apic_reg_write(apic, reg, (u32)data);
1262}
1263
1264int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data)
1265{
1266 struct kvm_lapic *apic = vcpu->arch.apic;
1267 u32 low, high = 0;
1268
1269 if (!irqchip_in_kernel(vcpu->kvm))
1270 return 1;
1271
1272 if (apic_reg_read(apic, reg, 4, &low))
1273 return 1;
1274 if (reg == APIC_ICR)
1275 apic_reg_read(apic, APIC_ICR2, 4, &high);
1276
1277 *data = (((u64)high) << 32) | low;
1278
1279 return 0;
1280}
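A hedged usage sketch for the new Hyper-V vAPIC MSR helpers above, showing the ICR ordering they encode (high dword to APIC_ICR2 first, then the low dword triggers the command); the caller is illustrative:

static int example_hv_icr_write(struct kvm_vcpu *vcpu, u32 icr_high, u32 icr_low)
{
	/*
	 * kvm_hv_vapic_msr_write() writes the high dword to APIC_ICR2
	 * before the low dword reaches APIC_ICR, so the IPI only fires
	 * once the destination is in place.
	 */
	u64 data = ((u64)icr_high << 32) | icr_low;

	return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data);
}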
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 40010b09c4aa..f5fe32c5edad 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -48,4 +48,12 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu);
48 48
49int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data); 49int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data);
50int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data); 50int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
51
52int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data);
53int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
54
55static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu)
56{
57 return vcpu->arch.hv_vapic & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE;
58}
51#endif 59#endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 818b92ad82cf..19a8906bcaa2 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -18,6 +18,7 @@
18 */ 18 */
19 19
20#include "mmu.h" 20#include "mmu.h"
21#include "x86.h"
21#include "kvm_cache_regs.h" 22#include "kvm_cache_regs.h"
22 23
23#include <linux/kvm_host.h> 24#include <linux/kvm_host.h>
@@ -29,6 +30,8 @@
29#include <linux/swap.h> 30#include <linux/swap.h>
30#include <linux/hugetlb.h> 31#include <linux/hugetlb.h>
31#include <linux/compiler.h> 32#include <linux/compiler.h>
33#include <linux/srcu.h>
34#include <linux/slab.h>
32 35
33#include <asm/page.h> 36#include <asm/page.h>
34#include <asm/cmpxchg.h> 37#include <asm/cmpxchg.h>
@@ -136,16 +139,6 @@ module_param(oos_shadow, bool, 0644);
136#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \ 139#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
137 | PT64_NX_MASK) 140 | PT64_NX_MASK)
138 141
139#define PFERR_PRESENT_MASK (1U << 0)
140#define PFERR_WRITE_MASK (1U << 1)
141#define PFERR_USER_MASK (1U << 2)
142#define PFERR_RSVD_MASK (1U << 3)
143#define PFERR_FETCH_MASK (1U << 4)
144
145#define PT_PDPE_LEVEL 3
146#define PT_DIRECTORY_LEVEL 2
147#define PT_PAGE_TABLE_LEVEL 1
148
149#define RMAP_EXT 4 142#define RMAP_EXT 4
150 143
151#define ACC_EXEC_MASK 1 144#define ACC_EXEC_MASK 1
@@ -153,6 +146,9 @@ module_param(oos_shadow, bool, 0644);
153#define ACC_USER_MASK PT_USER_MASK 146#define ACC_USER_MASK PT_USER_MASK
154#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) 147#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
155 148
149#include <trace/events/kvm.h>
150
151#undef TRACE_INCLUDE_FILE
156#define CREATE_TRACE_POINTS 152#define CREATE_TRACE_POINTS
157#include "mmutrace.h" 153#include "mmutrace.h"
158 154
@@ -229,7 +225,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
229 225
230static int is_write_protection(struct kvm_vcpu *vcpu) 226static int is_write_protection(struct kvm_vcpu *vcpu)
231{ 227{
232 return vcpu->arch.cr0 & X86_CR0_WP; 228 return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
233} 229}
234 230
235static int is_cpuid_PSE36(void) 231static int is_cpuid_PSE36(void)
@@ -239,7 +235,7 @@ static int is_cpuid_PSE36(void)
239 235
240static int is_nx(struct kvm_vcpu *vcpu) 236static int is_nx(struct kvm_vcpu *vcpu)
241{ 237{
242 return vcpu->arch.shadow_efer & EFER_NX; 238 return vcpu->arch.efer & EFER_NX;
243} 239}
244 240
245static int is_shadow_present_pte(u64 pte) 241static int is_shadow_present_pte(u64 pte)
@@ -253,7 +249,7 @@ static int is_large_pte(u64 pte)
253 return pte & PT_PAGE_SIZE_MASK; 249 return pte & PT_PAGE_SIZE_MASK;
254} 250}
255 251
256static int is_writeble_pte(unsigned long pte) 252static int is_writable_pte(unsigned long pte)
257{ 253{
258 return pte & PT_WRITABLE_MASK; 254 return pte & PT_WRITABLE_MASK;
259} 255}
@@ -470,24 +466,10 @@ static int has_wrprotected_page(struct kvm *kvm,
470 466
471static int host_mapping_level(struct kvm *kvm, gfn_t gfn) 467static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
472{ 468{
473 unsigned long page_size = PAGE_SIZE; 469 unsigned long page_size;
474 struct vm_area_struct *vma;
475 unsigned long addr;
476 int i, ret = 0; 470 int i, ret = 0;
477 471
478 addr = gfn_to_hva(kvm, gfn); 472 page_size = kvm_host_page_size(kvm, gfn);
479 if (kvm_is_error_hva(addr))
480 return page_size;
481
482 down_read(&current->mm->mmap_sem);
483 vma = find_vma(current->mm, addr);
484 if (!vma)
485 goto out;
486
487 page_size = vma_kernel_pagesize(vma);
488
489out:
490 up_read(&current->mm->mmap_sem);
491 473
492 for (i = PT_PAGE_TABLE_LEVEL; 474 for (i = PT_PAGE_TABLE_LEVEL;
493 i < (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES); ++i) { 475 i < (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES); ++i) {
@@ -503,8 +485,7 @@ out:
503static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) 485static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
504{ 486{
505 struct kvm_memory_slot *slot; 487 struct kvm_memory_slot *slot;
506 int host_level; 488 int host_level, level, max_level;
507 int level = PT_PAGE_TABLE_LEVEL;
508 489
509 slot = gfn_to_memslot(vcpu->kvm, large_gfn); 490 slot = gfn_to_memslot(vcpu->kvm, large_gfn);
510 if (slot && slot->dirty_bitmap) 491 if (slot && slot->dirty_bitmap)
@@ -515,11 +496,12 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
515 if (host_level == PT_PAGE_TABLE_LEVEL) 496 if (host_level == PT_PAGE_TABLE_LEVEL)
516 return host_level; 497 return host_level;
517 498
518 for (level = PT_DIRECTORY_LEVEL; level <= host_level; ++level) { 499 max_level = kvm_x86_ops->get_lpage_level() < host_level ?
500 kvm_x86_ops->get_lpage_level() : host_level;
519 501
502 for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
520 if (has_wrprotected_page(vcpu->kvm, large_gfn, level)) 503 if (has_wrprotected_page(vcpu->kvm, large_gfn, level))
521 break; 504 break;
522 }
523 505
524 return level - 1; 506 return level - 1;
525} 507}
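A standalone model of the clamping the rewritten mapping_level() performs, purely to make the numbers concrete; the helper is illustrative and not part of the patch:

/* PT_PAGE_TABLE_LEVEL == 1 (4K), PT_DIRECTORY_LEVEL == 2 (2M),
 * PT_PDPE_LEVEL == 3 (1G), per mmu.h after this series. */
static int example_clamp_level(int vendor_lpage_level, int host_level)
{
	return vendor_lpage_level < host_level ? vendor_lpage_level : host_level;
}

/* e.g. the host backs the gfn with a 2M page (host_level == 2) while the
 * vendor module reports 1G support (3): the guest mapping is capped at 2M. */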
@@ -635,7 +617,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
635 pfn = spte_to_pfn(*spte); 617 pfn = spte_to_pfn(*spte);
636 if (*spte & shadow_accessed_mask) 618 if (*spte & shadow_accessed_mask)
637 kvm_set_pfn_accessed(pfn); 619 kvm_set_pfn_accessed(pfn);
638 if (is_writeble_pte(*spte)) 620 if (is_writable_pte(*spte))
639 kvm_set_pfn_dirty(pfn); 621 kvm_set_pfn_dirty(pfn);
640 rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], sp->role.level); 622 rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], sp->role.level);
641 if (!*rmapp) { 623 if (!*rmapp) {
@@ -664,6 +646,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
664 prev_desc = desc; 646 prev_desc = desc;
665 desc = desc->more; 647 desc = desc->more;
666 } 648 }
649 pr_err("rmap_remove: %p %llx many->many\n", spte, *spte);
667 BUG(); 650 BUG();
668 } 651 }
669} 652}
@@ -710,7 +693,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
710 BUG_ON(!spte); 693 BUG_ON(!spte);
711 BUG_ON(!(*spte & PT_PRESENT_MASK)); 694 BUG_ON(!(*spte & PT_PRESENT_MASK));
712 rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); 695 rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
713 if (is_writeble_pte(*spte)) { 696 if (is_writable_pte(*spte)) {
714 __set_spte(spte, *spte & ~PT_WRITABLE_MASK); 697 __set_spte(spte, *spte & ~PT_WRITABLE_MASK);
715 write_protected = 1; 698 write_protected = 1;
716 } 699 }
@@ -734,7 +717,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
734 BUG_ON(!(*spte & PT_PRESENT_MASK)); 717 BUG_ON(!(*spte & PT_PRESENT_MASK));
735 BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)); 718 BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
736 pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn); 719 pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
737 if (is_writeble_pte(*spte)) { 720 if (is_writable_pte(*spte)) {
738 rmap_remove(kvm, spte); 721 rmap_remove(kvm, spte);
739 --kvm->stat.lpages; 722 --kvm->stat.lpages;
740 __set_spte(spte, shadow_trap_nonpresent_pte); 723 __set_spte(spte, shadow_trap_nonpresent_pte);
@@ -789,7 +772,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
789 772
790 new_spte &= ~PT_WRITABLE_MASK; 773 new_spte &= ~PT_WRITABLE_MASK;
791 new_spte &= ~SPTE_HOST_WRITEABLE; 774 new_spte &= ~SPTE_HOST_WRITEABLE;
792 if (is_writeble_pte(*spte)) 775 if (is_writable_pte(*spte))
793 kvm_set_pfn_dirty(spte_to_pfn(*spte)); 776 kvm_set_pfn_dirty(spte_to_pfn(*spte));
794 __set_spte(spte, new_spte); 777 __set_spte(spte, new_spte);
795 spte = rmap_next(kvm, rmapp, spte); 778 spte = rmap_next(kvm, rmapp, spte);
@@ -807,35 +790,32 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
807 unsigned long data)) 790 unsigned long data))
808{ 791{
809 int i, j; 792 int i, j;
793 int ret;
810 int retval = 0; 794 int retval = 0;
795 struct kvm_memslots *slots;
811 796
812 /* 797 slots = rcu_dereference(kvm->memslots);
813 * If mmap_sem isn't taken, we can look the memslots with only 798
814 * the mmu_lock by skipping over the slots with userspace_addr == 0. 799 for (i = 0; i < slots->nmemslots; i++) {
815 */ 800 struct kvm_memory_slot *memslot = &slots->memslots[i];
816 for (i = 0; i < kvm->nmemslots; i++) {
817 struct kvm_memory_slot *memslot = &kvm->memslots[i];
818 unsigned long start = memslot->userspace_addr; 801 unsigned long start = memslot->userspace_addr;
819 unsigned long end; 802 unsigned long end;
820 803
821 /* mmu_lock protects userspace_addr */
822 if (!start)
823 continue;
824
825 end = start + (memslot->npages << PAGE_SHIFT); 804 end = start + (memslot->npages << PAGE_SHIFT);
826 if (hva >= start && hva < end) { 805 if (hva >= start && hva < end) {
827 gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; 806 gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
828 807
829 retval |= handler(kvm, &memslot->rmap[gfn_offset], 808 ret = handler(kvm, &memslot->rmap[gfn_offset], data);
830 data);
831 809
832 for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) { 810 for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
833 int idx = gfn_offset; 811 int idx = gfn_offset;
834 idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j); 812 idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j);
835 retval |= handler(kvm, 813 ret |= handler(kvm,
836 &memslot->lpage_info[j][idx].rmap_pde, 814 &memslot->lpage_info[j][idx].rmap_pde,
837 data); 815 data);
838 } 816 }
817 trace_kvm_age_page(hva, memslot, ret);
818 retval |= ret;
839 } 819 }
840 } 820 }
841 821
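The loop above now walks an RCU-published kvm_memslots instead of the old fixed array. A reduced sketch of the hva-to-slot lookup it performs, assuming the post-patch struct kvm_memslots layout (nmemslots/memslots[] as used in this hunk) and a caller that already holds the SRCU read side for kvm->srcu:

static struct kvm_memory_slot *example_hva_to_slot(struct kvm *kvm,
						   unsigned long hva)
{
	struct kvm_memslots *slots = rcu_dereference(kvm->memslots);
	int i;

	for (i = 0; i < slots->nmemslots; i++) {
		struct kvm_memory_slot *memslot = &slots->memslots[i];
		unsigned long start = memslot->userspace_addr;
		unsigned long end = start + (memslot->npages << PAGE_SHIFT);

		if (hva >= start && hva < end)
			return memslot;
	}
	return NULL;
}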
@@ -858,9 +838,15 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
858 u64 *spte; 838 u64 *spte;
859 int young = 0; 839 int young = 0;
860 840
861 /* always return old for EPT */ 841 /*
 842 * Emulate the accessed bit for EPT by checking whether this page has
 843 * an EPT mapping and clearing it if it does. On the next access,
844 * a new EPT mapping will be established.
845 * This has some overhead, but not as much as the cost of swapping
846 * out actively used pages or breaking up actively used hugepages.
847 */
862 if (!shadow_accessed_mask) 848 if (!shadow_accessed_mask)
863 return 0; 849 return kvm_unmap_rmapp(kvm, rmapp, data);
864 850
865 spte = rmap_next(kvm, rmapp, NULL); 851 spte = rmap_next(kvm, rmapp, NULL);
866 while (spte) { 852 while (spte) {
@@ -1504,8 +1490,8 @@ static int mmu_zap_unsync_children(struct kvm *kvm,
1504 for_each_sp(pages, sp, parents, i) { 1490 for_each_sp(pages, sp, parents, i) {
1505 kvm_mmu_zap_page(kvm, sp); 1491 kvm_mmu_zap_page(kvm, sp);
1506 mmu_pages_clear_parents(&parents); 1492 mmu_pages_clear_parents(&parents);
1493 zapped++;
1507 } 1494 }
1508 zapped += pages.nr;
1509 kvm_mmu_pages_init(parent, &parents, &pages); 1495 kvm_mmu_pages_init(parent, &parents, &pages);
1510 } 1496 }
1511 1497
@@ -1556,14 +1542,16 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
1556 */ 1542 */
1557 1543
1558 if (used_pages > kvm_nr_mmu_pages) { 1544 if (used_pages > kvm_nr_mmu_pages) {
1559 while (used_pages > kvm_nr_mmu_pages) { 1545 while (used_pages > kvm_nr_mmu_pages &&
1546 !list_empty(&kvm->arch.active_mmu_pages)) {
1560 struct kvm_mmu_page *page; 1547 struct kvm_mmu_page *page;
1561 1548
1562 page = container_of(kvm->arch.active_mmu_pages.prev, 1549 page = container_of(kvm->arch.active_mmu_pages.prev,
1563 struct kvm_mmu_page, link); 1550 struct kvm_mmu_page, link);
1564 kvm_mmu_zap_page(kvm, page); 1551 used_pages -= kvm_mmu_zap_page(kvm, page);
1565 used_pages--; 1552 used_pages--;
1566 } 1553 }
1554 kvm_nr_mmu_pages = used_pages;
1567 kvm->arch.n_free_mmu_pages = 0; 1555 kvm->arch.n_free_mmu_pages = 0;
1568 } 1556 }
1569 else 1557 else
@@ -1610,14 +1598,15 @@ static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
1610 && !sp->role.invalid) { 1598 && !sp->role.invalid) {
1611 pgprintk("%s: zap %lx %x\n", 1599 pgprintk("%s: zap %lx %x\n",
1612 __func__, gfn, sp->role.word); 1600 __func__, gfn, sp->role.word);
1613 kvm_mmu_zap_page(kvm, sp); 1601 if (kvm_mmu_zap_page(kvm, sp))
1602 nn = bucket->first;
1614 } 1603 }
1615 } 1604 }
1616} 1605}
1617 1606
1618static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn) 1607static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
1619{ 1608{
1620 int slot = memslot_id(kvm, gfn_to_memslot(kvm, gfn)); 1609 int slot = memslot_id(kvm, gfn);
1621 struct kvm_mmu_page *sp = page_header(__pa(pte)); 1610 struct kvm_mmu_page *sp = page_header(__pa(pte));
1622 1611
1623 __set_bit(slot, sp->slot_bitmap); 1612 __set_bit(slot, sp->slot_bitmap);
@@ -1641,7 +1630,7 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
1641{ 1630{
1642 struct page *page; 1631 struct page *page;
1643 1632
1644 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); 1633 gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
1645 1634
1646 if (gpa == UNMAPPED_GVA) 1635 if (gpa == UNMAPPED_GVA)
1647 return NULL; 1636 return NULL;
@@ -1854,7 +1843,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1854 * is responsibility of mmu_get_page / kvm_sync_page. 1843 * is responsibility of mmu_get_page / kvm_sync_page.
1855 * Same reasoning can be applied to dirty page accounting. 1844 * Same reasoning can be applied to dirty page accounting.
1856 */ 1845 */
1857 if (!can_unsync && is_writeble_pte(*sptep)) 1846 if (!can_unsync && is_writable_pte(*sptep))
1858 goto set_pte; 1847 goto set_pte;
1859 1848
1860 if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { 1849 if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
@@ -1862,7 +1851,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1862 __func__, gfn); 1851 __func__, gfn);
1863 ret = 1; 1852 ret = 1;
1864 pte_access &= ~ACC_WRITE_MASK; 1853 pte_access &= ~ACC_WRITE_MASK;
1865 if (is_writeble_pte(spte)) 1854 if (is_writable_pte(spte))
1866 spte &= ~PT_WRITABLE_MASK; 1855 spte &= ~PT_WRITABLE_MASK;
1867 } 1856 }
1868 } 1857 }
@@ -1883,7 +1872,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1883 bool reset_host_protection) 1872 bool reset_host_protection)
1884{ 1873{
1885 int was_rmapped = 0; 1874 int was_rmapped = 0;
1886 int was_writeble = is_writeble_pte(*sptep); 1875 int was_writable = is_writable_pte(*sptep);
1887 int rmap_count; 1876 int rmap_count;
1888 1877
1889 pgprintk("%s: spte %llx access %x write_fault %d" 1878 pgprintk("%s: spte %llx access %x write_fault %d"
@@ -1934,7 +1923,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1934 if (rmap_count > RMAP_RECYCLE_THRESHOLD) 1923 if (rmap_count > RMAP_RECYCLE_THRESHOLD)
1935 rmap_recycle(vcpu, sptep, gfn); 1924 rmap_recycle(vcpu, sptep, gfn);
1936 } else { 1925 } else {
1937 if (was_writeble) 1926 if (was_writable)
1938 kvm_release_pfn_dirty(pfn); 1927 kvm_release_pfn_dirty(pfn);
1939 else 1928 else
1940 kvm_release_pfn_clean(pfn); 1929 kvm_release_pfn_clean(pfn);
@@ -2164,8 +2153,11 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
2164 spin_unlock(&vcpu->kvm->mmu_lock); 2153 spin_unlock(&vcpu->kvm->mmu_lock);
2165} 2154}
2166 2155
2167static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) 2156static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
2157 u32 access, u32 *error)
2168{ 2158{
2159 if (error)
2160 *error = 0;
2169 return vaddr; 2161 return vaddr;
2170} 2162}
2171 2163
@@ -2749,7 +2741,7 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
2749 if (tdp_enabled) 2741 if (tdp_enabled)
2750 return 0; 2742 return 0;
2751 2743
2752 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); 2744 gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
2753 2745
2754 spin_lock(&vcpu->kvm->mmu_lock); 2746 spin_lock(&vcpu->kvm->mmu_lock);
2755 r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); 2747 r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
@@ -2789,7 +2781,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
2789 if (r) 2781 if (r)
2790 goto out; 2782 goto out;
2791 2783
2792 er = emulate_instruction(vcpu, vcpu->run, cr2, error_code, 0); 2784 er = emulate_instruction(vcpu, cr2, error_code, 0);
2793 2785
2794 switch (er) { 2786 switch (er) {
2795 case EMULATE_DONE: 2787 case EMULATE_DONE:
@@ -2800,6 +2792,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
2800 case EMULATE_FAIL: 2792 case EMULATE_FAIL:
2801 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 2793 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
2802 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; 2794 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
2795 vcpu->run->internal.ndata = 0;
2803 return 0; 2796 return 0;
2804 default: 2797 default:
2805 BUG(); 2798 BUG();
@@ -2848,16 +2841,13 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
2848 */ 2841 */
2849 page = alloc_page(GFP_KERNEL | __GFP_DMA32); 2842 page = alloc_page(GFP_KERNEL | __GFP_DMA32);
2850 if (!page) 2843 if (!page)
2851 goto error_1; 2844 return -ENOMEM;
2845
2852 vcpu->arch.mmu.pae_root = page_address(page); 2846 vcpu->arch.mmu.pae_root = page_address(page);
2853 for (i = 0; i < 4; ++i) 2847 for (i = 0; i < 4; ++i)
2854 vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; 2848 vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
2855 2849
2856 return 0; 2850 return 0;
2857
2858error_1:
2859 free_mmu_pages(vcpu);
2860 return -ENOMEM;
2861} 2851}
2862 2852
2863int kvm_mmu_create(struct kvm_vcpu *vcpu) 2853int kvm_mmu_create(struct kvm_vcpu *vcpu)
@@ -2937,10 +2927,9 @@ static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask)
2937 spin_lock(&kvm_lock); 2927 spin_lock(&kvm_lock);
2938 2928
2939 list_for_each_entry(kvm, &vm_list, vm_list) { 2929 list_for_each_entry(kvm, &vm_list, vm_list) {
2940 int npages; 2930 int npages, idx;
2941 2931
2942 if (!down_read_trylock(&kvm->slots_lock)) 2932 idx = srcu_read_lock(&kvm->srcu);
2943 continue;
2944 spin_lock(&kvm->mmu_lock); 2933 spin_lock(&kvm->mmu_lock);
2945 npages = kvm->arch.n_alloc_mmu_pages - 2934 npages = kvm->arch.n_alloc_mmu_pages -
2946 kvm->arch.n_free_mmu_pages; 2935 kvm->arch.n_free_mmu_pages;
@@ -2953,7 +2942,7 @@ static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask)
2953 nr_to_scan--; 2942 nr_to_scan--;
2954 2943
2955 spin_unlock(&kvm->mmu_lock); 2944 spin_unlock(&kvm->mmu_lock);
2956 up_read(&kvm->slots_lock); 2945 srcu_read_unlock(&kvm->srcu, idx);
2957 } 2946 }
2958 if (kvm_freed) 2947 if (kvm_freed)
2959 list_move_tail(&kvm_freed->vm_list, &vm_list); 2948 list_move_tail(&kvm_freed->vm_list, &vm_list);
@@ -3020,9 +3009,11 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
3020 int i; 3009 int i;
3021 unsigned int nr_mmu_pages; 3010 unsigned int nr_mmu_pages;
3022 unsigned int nr_pages = 0; 3011 unsigned int nr_pages = 0;
3012 struct kvm_memslots *slots;
3023 3013
3024 for (i = 0; i < kvm->nmemslots; i++) 3014 slots = rcu_dereference(kvm->memslots);
3025 nr_pages += kvm->memslots[i].npages; 3015 for (i = 0; i < slots->nmemslots; i++)
3016 nr_pages += slots->memslots[i].npages;
3026 3017
3027 nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000; 3018 nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
3028 nr_mmu_pages = max(nr_mmu_pages, 3019 nr_mmu_pages = max(nr_mmu_pages,
@@ -3247,7 +3238,7 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
3247 if (is_shadow_present_pte(ent) && !is_last_spte(ent, level)) 3238 if (is_shadow_present_pte(ent) && !is_last_spte(ent, level))
3248 audit_mappings_page(vcpu, ent, va, level - 1); 3239 audit_mappings_page(vcpu, ent, va, level - 1);
3249 else { 3240 else {
3250 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va); 3241 gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, va, NULL);
3251 gfn_t gfn = gpa >> PAGE_SHIFT; 3242 gfn_t gfn = gpa >> PAGE_SHIFT;
3252 pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn); 3243 pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn);
3253 hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT; 3244 hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT;
@@ -3292,10 +3283,12 @@ static void audit_mappings(struct kvm_vcpu *vcpu)
3292static int count_rmaps(struct kvm_vcpu *vcpu) 3283static int count_rmaps(struct kvm_vcpu *vcpu)
3293{ 3284{
3294 int nmaps = 0; 3285 int nmaps = 0;
3295 int i, j, k; 3286 int i, j, k, idx;
3296 3287
3288 idx = srcu_read_lock(&kvm->srcu);
3289 slots = rcu_dereference(kvm->memslots);
3297 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { 3290 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
3298 struct kvm_memory_slot *m = &vcpu->kvm->memslots[i]; 3291 struct kvm_memory_slot *m = &slots->memslots[i];
3299 struct kvm_rmap_desc *d; 3292 struct kvm_rmap_desc *d;
3300 3293
3301 for (j = 0; j < m->npages; ++j) { 3294 for (j = 0; j < m->npages; ++j) {
@@ -3318,6 +3311,7 @@ static int count_rmaps(struct kvm_vcpu *vcpu)
3318 } 3311 }
3319 } 3312 }
3320 } 3313 }
3314 srcu_read_unlock(&kvm->srcu, idx);
3321 return nmaps; 3315 return nmaps;
3322} 3316}
3323 3317
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 61a1b3884b49..be66759321a5 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -2,6 +2,7 @@
2#define __KVM_X86_MMU_H 2#define __KVM_X86_MMU_H
3 3
4#include <linux/kvm_host.h> 4#include <linux/kvm_host.h>
5#include "kvm_cache_regs.h"
5 6
6#define PT64_PT_BITS 9 7#define PT64_PT_BITS 9
7#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS) 8#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS)
@@ -37,6 +38,16 @@
37#define PT32_ROOT_LEVEL 2 38#define PT32_ROOT_LEVEL 2
38#define PT32E_ROOT_LEVEL 3 39#define PT32E_ROOT_LEVEL 3
39 40
41#define PT_PDPE_LEVEL 3
42#define PT_DIRECTORY_LEVEL 2
43#define PT_PAGE_TABLE_LEVEL 1
44
45#define PFERR_PRESENT_MASK (1U << 0)
46#define PFERR_WRITE_MASK (1U << 1)
47#define PFERR_USER_MASK (1U << 2)
48#define PFERR_RSVD_MASK (1U << 3)
49#define PFERR_FETCH_MASK (1U << 4)
50
40int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]); 51int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]);
41 52
42static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) 53static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
@@ -53,30 +64,6 @@ static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
53 return kvm_mmu_load(vcpu); 64 return kvm_mmu_load(vcpu);
54} 65}
55 66
56static inline int is_long_mode(struct kvm_vcpu *vcpu)
57{
58#ifdef CONFIG_X86_64
59 return vcpu->arch.shadow_efer & EFER_LMA;
60#else
61 return 0;
62#endif
63}
64
65static inline int is_pae(struct kvm_vcpu *vcpu)
66{
67 return vcpu->arch.cr4 & X86_CR4_PAE;
68}
69
70static inline int is_pse(struct kvm_vcpu *vcpu)
71{
72 return vcpu->arch.cr4 & X86_CR4_PSE;
73}
74
75static inline int is_paging(struct kvm_vcpu *vcpu)
76{
77 return vcpu->arch.cr0 & X86_CR0_PG;
78}
79
80static inline int is_present_gpte(unsigned long pte) 67static inline int is_present_gpte(unsigned long pte)
81{ 68{
82 return pte & PT_PRESENT_MASK; 69 return pte & PT_PRESENT_MASK;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 72558f8ff3f5..81eab9a50e6a 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -150,7 +150,9 @@ walk:
150 walker->table_gfn[walker->level - 1] = table_gfn; 150 walker->table_gfn[walker->level - 1] = table_gfn;
151 walker->pte_gpa[walker->level - 1] = pte_gpa; 151 walker->pte_gpa[walker->level - 1] = pte_gpa;
152 152
153 kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte)); 153 if (kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte)))
154 goto not_present;
155
154 trace_kvm_mmu_paging_element(pte, walker->level); 156 trace_kvm_mmu_paging_element(pte, walker->level);
155 157
156 if (!is_present_gpte(pte)) 158 if (!is_present_gpte(pte))
@@ -160,7 +162,7 @@ walk:
160 if (rsvd_fault) 162 if (rsvd_fault)
161 goto access_error; 163 goto access_error;
162 164
163 if (write_fault && !is_writeble_pte(pte)) 165 if (write_fault && !is_writable_pte(pte))
164 if (user_fault || is_write_protection(vcpu)) 166 if (user_fault || is_write_protection(vcpu))
165 goto access_error; 167 goto access_error;
166 168
@@ -455,8 +457,6 @@ out_unlock:
455static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) 457static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
456{ 458{
457 struct kvm_shadow_walk_iterator iterator; 459 struct kvm_shadow_walk_iterator iterator;
458 pt_element_t gpte;
459 gpa_t pte_gpa = -1;
460 int level; 460 int level;
461 u64 *sptep; 461 u64 *sptep;
462 int need_flush = 0; 462 int need_flush = 0;
@@ -467,14 +467,9 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
467 level = iterator.level; 467 level = iterator.level;
468 sptep = iterator.sptep; 468 sptep = iterator.sptep;
469 469
470 /* FIXME: properly handle invlpg on large guest pages */
471 if (level == PT_PAGE_TABLE_LEVEL || 470 if (level == PT_PAGE_TABLE_LEVEL ||
472 ((level == PT_DIRECTORY_LEVEL && is_large_pte(*sptep))) || 471 ((level == PT_DIRECTORY_LEVEL && is_large_pte(*sptep))) ||
473 ((level == PT_PDPE_LEVEL && is_large_pte(*sptep)))) { 472 ((level == PT_PDPE_LEVEL && is_large_pte(*sptep)))) {
474 struct kvm_mmu_page *sp = page_header(__pa(sptep));
475
476 pte_gpa = (sp->gfn << PAGE_SHIFT);
477 pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
478 473
479 if (is_shadow_present_pte(*sptep)) { 474 if (is_shadow_present_pte(*sptep)) {
480 rmap_remove(vcpu->kvm, sptep); 475 rmap_remove(vcpu->kvm, sptep);
@@ -493,32 +488,25 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
493 if (need_flush) 488 if (need_flush)
494 kvm_flush_remote_tlbs(vcpu->kvm); 489 kvm_flush_remote_tlbs(vcpu->kvm);
495 spin_unlock(&vcpu->kvm->mmu_lock); 490 spin_unlock(&vcpu->kvm->mmu_lock);
496
497 if (pte_gpa == -1)
498 return;
499 if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
500 sizeof(pt_element_t)))
501 return;
502 if (is_present_gpte(gpte) && (gpte & PT_ACCESSED_MASK)) {
503 if (mmu_topup_memory_caches(vcpu))
504 return;
505 kvm_mmu_pte_write(vcpu, pte_gpa, (const u8 *)&gpte,
506 sizeof(pt_element_t), 0);
507 }
508} 491}
509 492
510static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) 493static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
494 u32 *error)
511{ 495{
512 struct guest_walker walker; 496 struct guest_walker walker;
513 gpa_t gpa = UNMAPPED_GVA; 497 gpa_t gpa = UNMAPPED_GVA;
514 int r; 498 int r;
515 499
516 r = FNAME(walk_addr)(&walker, vcpu, vaddr, 0, 0, 0); 500 r = FNAME(walk_addr)(&walker, vcpu, vaddr,
501 !!(access & PFERR_WRITE_MASK),
502 !!(access & PFERR_USER_MASK),
503 !!(access & PFERR_FETCH_MASK));
517 504
518 if (r) { 505 if (r) {
519 gpa = gfn_to_gpa(walker.gfn); 506 gpa = gfn_to_gpa(walker.gfn);
520 gpa |= vaddr & ~PAGE_MASK; 507 gpa |= vaddr & ~PAGE_MASK;
521 } 508 } else if (error)
509 *error = walker.error_code;
522 510
523 return gpa; 511 return gpa;
524} 512}
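The gva_to_gpa() callback now takes an access mask built from the PFERR_* bits. A hedged sketch of how a write-side wrapper might call it, roughly in the spirit of the kvm_mmu_gva_to_gpa_* helpers referenced elsewhere in this series; the exact wrapper body is an assumption:

static gpa_t example_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
				      u32 *error)
{
	/* Fault as a user-mode write so the walker checks the W and U bits. */
	u32 access = PFERR_WRITE_MASK | PFERR_USER_MASK;

	return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error);
}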
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index c17404add91f..737361fcd503 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -26,6 +26,7 @@
26#include <linux/highmem.h> 26#include <linux/highmem.h>
27#include <linux/sched.h> 27#include <linux/sched.h>
28#include <linux/ftrace_event.h> 28#include <linux/ftrace_event.h>
29#include <linux/slab.h>
29 30
30#include <asm/desc.h> 31#include <asm/desc.h>
31 32
@@ -46,6 +47,7 @@ MODULE_LICENSE("GPL");
46#define SVM_FEATURE_NPT (1 << 0) 47#define SVM_FEATURE_NPT (1 << 0)
47#define SVM_FEATURE_LBRV (1 << 1) 48#define SVM_FEATURE_LBRV (1 << 1)
48#define SVM_FEATURE_SVML (1 << 2) 49#define SVM_FEATURE_SVML (1 << 2)
50#define SVM_FEATURE_PAUSE_FILTER (1 << 10)
49 51
50#define NESTED_EXIT_HOST 0 /* Exit handled on host level */ 52#define NESTED_EXIT_HOST 0 /* Exit handled on host level */
51#define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */ 53#define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */
@@ -53,15 +55,6 @@ MODULE_LICENSE("GPL");
53 55
54#define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) 56#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
55 57
56/* Turn on to get debugging output*/
57/* #define NESTED_DEBUG */
58
59#ifdef NESTED_DEBUG
60#define nsvm_printk(fmt, args...) printk(KERN_INFO fmt, ## args)
61#else
62#define nsvm_printk(fmt, args...) do {} while(0)
63#endif
64
65static const u32 host_save_user_msrs[] = { 58static const u32 host_save_user_msrs[] = {
66#ifdef CONFIG_X86_64 59#ifdef CONFIG_X86_64
67 MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE, 60 MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
@@ -85,6 +78,9 @@ struct nested_state {
85 /* gpa pointers to the real vectors */ 78 /* gpa pointers to the real vectors */
86 u64 vmcb_msrpm; 79 u64 vmcb_msrpm;
87 80
81 /* A VMEXIT is required but not yet emulated */
82 bool exit_required;
83
88 /* cache for intercepts of the guest */ 84 /* cache for intercepts of the guest */
89 u16 intercept_cr_read; 85 u16 intercept_cr_read;
90 u16 intercept_cr_write; 86 u16 intercept_cr_write;
@@ -112,6 +108,8 @@ struct vcpu_svm {
112 u32 *msrpm; 108 u32 *msrpm;
113 109
114 struct nested_state nested; 110 struct nested_state nested;
111
112 bool nmi_singlestep;
115}; 113};
116 114
117/* enable NPT for AMD64 and X86 with PAE */ 115/* enable NPT for AMD64 and X86 with PAE */
@@ -234,7 +232,7 @@ static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
234 efer &= ~EFER_LME; 232 efer &= ~EFER_LME;
235 233
236 to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME; 234 to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME;
237 vcpu->arch.shadow_efer = efer; 235 vcpu->arch.efer = efer;
238} 236}
239 237
240static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, 238static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
@@ -286,7 +284,7 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
286 struct vcpu_svm *svm = to_svm(vcpu); 284 struct vcpu_svm *svm = to_svm(vcpu);
287 285
288 if (!svm->next_rip) { 286 if (!svm->next_rip) {
289 if (emulate_instruction(vcpu, vcpu->run, 0, 0, EMULTYPE_SKIP) != 287 if (emulate_instruction(vcpu, 0, 0, EMULTYPE_SKIP) !=
290 EMULATE_DONE) 288 EMULATE_DONE)
291 printk(KERN_DEBUG "%s: NOP\n", __func__); 289 printk(KERN_DEBUG "%s: NOP\n", __func__);
292 return; 290 return;
@@ -316,75 +314,79 @@ static void svm_hardware_disable(void *garbage)
316 cpu_svm_disable(); 314 cpu_svm_disable();
317} 315}
318 316
319static void svm_hardware_enable(void *garbage) 317static int svm_hardware_enable(void *garbage)
320{ 318{
321 319
322 struct svm_cpu_data *svm_data; 320 struct svm_cpu_data *sd;
323 uint64_t efer; 321 uint64_t efer;
324 struct descriptor_table gdt_descr; 322 struct descriptor_table gdt_descr;
325 struct desc_struct *gdt; 323 struct desc_struct *gdt;
326 int me = raw_smp_processor_id(); 324 int me = raw_smp_processor_id();
327 325
326 rdmsrl(MSR_EFER, efer);
327 if (efer & EFER_SVME)
328 return -EBUSY;
329
328 if (!has_svm()) { 330 if (!has_svm()) {
329 printk(KERN_ERR "svm_cpu_init: err EOPNOTSUPP on %d\n", me); 331 printk(KERN_ERR "svm_hardware_enable: err EOPNOTSUPP on %d\n",
330 return; 332 me);
333 return -EINVAL;
331 } 334 }
332 svm_data = per_cpu(svm_data, me); 335 sd = per_cpu(svm_data, me);
333 336
334 if (!svm_data) { 337 if (!sd) {
335 printk(KERN_ERR "svm_cpu_init: svm_data is NULL on %d\n", 338 printk(KERN_ERR "svm_hardware_enable: svm_data is NULL on %d\n",
336 me); 339 me);
337 return; 340 return -EINVAL;
338 } 341 }
339 342
340 svm_data->asid_generation = 1; 343 sd->asid_generation = 1;
341 svm_data->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1; 344 sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
342 svm_data->next_asid = svm_data->max_asid + 1; 345 sd->next_asid = sd->max_asid + 1;
343 346
344 kvm_get_gdt(&gdt_descr); 347 kvm_get_gdt(&gdt_descr);
345 gdt = (struct desc_struct *)gdt_descr.base; 348 gdt = (struct desc_struct *)gdt_descr.base;
346 svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS); 349 sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
347 350
348 rdmsrl(MSR_EFER, efer);
349 wrmsrl(MSR_EFER, efer | EFER_SVME); 351 wrmsrl(MSR_EFER, efer | EFER_SVME);
350 352
351 wrmsrl(MSR_VM_HSAVE_PA, 353 wrmsrl(MSR_VM_HSAVE_PA, page_to_pfn(sd->save_area) << PAGE_SHIFT);
352 page_to_pfn(svm_data->save_area) << PAGE_SHIFT); 354
355 return 0;
353} 356}
354 357
355static void svm_cpu_uninit(int cpu) 358static void svm_cpu_uninit(int cpu)
356{ 359{
357 struct svm_cpu_data *svm_data 360 struct svm_cpu_data *sd = per_cpu(svm_data, raw_smp_processor_id());
358 = per_cpu(svm_data, raw_smp_processor_id());
359 361
360 if (!svm_data) 362 if (!sd)
361 return; 363 return;
362 364
363 per_cpu(svm_data, raw_smp_processor_id()) = NULL; 365 per_cpu(svm_data, raw_smp_processor_id()) = NULL;
364 __free_page(svm_data->save_area); 366 __free_page(sd->save_area);
365 kfree(svm_data); 367 kfree(sd);
366} 368}
367 369
368static int svm_cpu_init(int cpu) 370static int svm_cpu_init(int cpu)
369{ 371{
370 struct svm_cpu_data *svm_data; 372 struct svm_cpu_data *sd;
371 int r; 373 int r;
372 374
373 svm_data = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL); 375 sd = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL);
374 if (!svm_data) 376 if (!sd)
375 return -ENOMEM; 377 return -ENOMEM;
376 svm_data->cpu = cpu; 378 sd->cpu = cpu;
377 svm_data->save_area = alloc_page(GFP_KERNEL); 379 sd->save_area = alloc_page(GFP_KERNEL);
378 r = -ENOMEM; 380 r = -ENOMEM;
379 if (!svm_data->save_area) 381 if (!sd->save_area)
380 goto err_1; 382 goto err_1;
381 383
382 per_cpu(svm_data, cpu) = svm_data; 384 per_cpu(svm_data, cpu) = sd;
383 385
384 return 0; 386 return 0;
385 387
386err_1: 388err_1:
387 kfree(svm_data); 389 kfree(sd);
388 return r; 390 return r;
389 391
390} 392}
@@ -476,7 +478,7 @@ static __init int svm_hardware_setup(void)
476 kvm_enable_efer_bits(EFER_SVME); 478 kvm_enable_efer_bits(EFER_SVME);
477 } 479 }
478 480
479 for_each_online_cpu(cpu) { 481 for_each_possible_cpu(cpu) {
480 r = svm_cpu_init(cpu); 482 r = svm_cpu_init(cpu);
481 if (r) 483 if (r)
482 goto err; 484 goto err;
@@ -510,7 +512,7 @@ static __exit void svm_hardware_unsetup(void)
510{ 512{
511 int cpu; 513 int cpu;
512 514
513 for_each_online_cpu(cpu) 515 for_each_possible_cpu(cpu)
514 svm_cpu_uninit(cpu); 516 svm_cpu_uninit(cpu);
515 517
516 __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER); 518 __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER);
@@ -539,6 +541,8 @@ static void init_vmcb(struct vcpu_svm *svm)
539 struct vmcb_control_area *control = &svm->vmcb->control; 541 struct vmcb_control_area *control = &svm->vmcb->control;
540 struct vmcb_save_area *save = &svm->vmcb->save; 542 struct vmcb_save_area *save = &svm->vmcb->save;
541 543
544 svm->vcpu.fpu_active = 1;
545
542 control->intercept_cr_read = INTERCEPT_CR0_MASK | 546 control->intercept_cr_read = INTERCEPT_CR0_MASK |
543 INTERCEPT_CR3_MASK | 547 INTERCEPT_CR3_MASK |
544 INTERCEPT_CR4_MASK; 548 INTERCEPT_CR4_MASK;
@@ -551,13 +555,19 @@ static void init_vmcb(struct vcpu_svm *svm)
551 control->intercept_dr_read = INTERCEPT_DR0_MASK | 555 control->intercept_dr_read = INTERCEPT_DR0_MASK |
552 INTERCEPT_DR1_MASK | 556 INTERCEPT_DR1_MASK |
553 INTERCEPT_DR2_MASK | 557 INTERCEPT_DR2_MASK |
554 INTERCEPT_DR3_MASK; 558 INTERCEPT_DR3_MASK |
559 INTERCEPT_DR4_MASK |
560 INTERCEPT_DR5_MASK |
561 INTERCEPT_DR6_MASK |
562 INTERCEPT_DR7_MASK;
555 563
556 control->intercept_dr_write = INTERCEPT_DR0_MASK | 564 control->intercept_dr_write = INTERCEPT_DR0_MASK |
557 INTERCEPT_DR1_MASK | 565 INTERCEPT_DR1_MASK |
558 INTERCEPT_DR2_MASK | 566 INTERCEPT_DR2_MASK |
559 INTERCEPT_DR3_MASK | 567 INTERCEPT_DR3_MASK |
568 INTERCEPT_DR4_MASK |
560 INTERCEPT_DR5_MASK | 569 INTERCEPT_DR5_MASK |
570 INTERCEPT_DR6_MASK |
561 INTERCEPT_DR7_MASK; 571 INTERCEPT_DR7_MASK;
562 572
563 control->intercept_exceptions = (1 << PF_VECTOR) | 573 control->intercept_exceptions = (1 << PF_VECTOR) |
@@ -568,6 +578,7 @@ static void init_vmcb(struct vcpu_svm *svm)
568 control->intercept = (1ULL << INTERCEPT_INTR) | 578 control->intercept = (1ULL << INTERCEPT_INTR) |
569 (1ULL << INTERCEPT_NMI) | 579 (1ULL << INTERCEPT_NMI) |
570 (1ULL << INTERCEPT_SMI) | 580 (1ULL << INTERCEPT_SMI) |
581 (1ULL << INTERCEPT_SELECTIVE_CR0) |
571 (1ULL << INTERCEPT_CPUID) | 582 (1ULL << INTERCEPT_CPUID) |
572 (1ULL << INTERCEPT_INVD) | 583 (1ULL << INTERCEPT_INVD) |
573 (1ULL << INTERCEPT_HLT) | 584 (1ULL << INTERCEPT_HLT) |
@@ -625,11 +636,12 @@ static void init_vmcb(struct vcpu_svm *svm)
625 save->rip = 0x0000fff0; 636 save->rip = 0x0000fff0;
626 svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip; 637 svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
627 638
628 /* 639 /* This is the guest-visible cr0 value.
629 * cr0 val on cpu init should be 0x60000010, we enable cpu 640 * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
630 * cache by default. the orderly way is to enable cache in bios.
631 */ 641 */
632 save->cr0 = 0x00000010 | X86_CR0_PG | X86_CR0_WP; 642 svm->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
643 kvm_set_cr0(&svm->vcpu, svm->vcpu.arch.cr0);
644
633 save->cr4 = X86_CR4_PAE; 645 save->cr4 = X86_CR4_PAE;
634 /* rdx = ?? */ 646 /* rdx = ?? */
635 647
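The rewritten init above replaces the hard-coded 0x60000010 comment with symbolic CR0 bits; a quick worked check that the two agree (bit positions restated here only for the arithmetic):

/* X86_CR0_ET = 1 << 4  = 0x00000010
 * X86_CR0_NW = 1 << 29 = 0x20000000
 * X86_CR0_CD = 1 << 30 = 0x40000000
 * NW | CD | ET         = 0x60000010, the architectural CR0 reset value.
 */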
@@ -639,13 +651,9 @@ static void init_vmcb(struct vcpu_svm *svm)
639 control->intercept &= ~((1ULL << INTERCEPT_TASK_SWITCH) | 651 control->intercept &= ~((1ULL << INTERCEPT_TASK_SWITCH) |
640 (1ULL << INTERCEPT_INVLPG)); 652 (1ULL << INTERCEPT_INVLPG));
641 control->intercept_exceptions &= ~(1 << PF_VECTOR); 653 control->intercept_exceptions &= ~(1 << PF_VECTOR);
642 control->intercept_cr_read &= ~(INTERCEPT_CR0_MASK| 654 control->intercept_cr_read &= ~INTERCEPT_CR3_MASK;
643 INTERCEPT_CR3_MASK); 655 control->intercept_cr_write &= ~INTERCEPT_CR3_MASK;
644 control->intercept_cr_write &= ~(INTERCEPT_CR0_MASK|
645 INTERCEPT_CR3_MASK);
646 save->g_pat = 0x0007040600070406ULL; 656 save->g_pat = 0x0007040600070406ULL;
647 /* enable caching because the QEMU Bios doesn't enable it */
648 save->cr0 = X86_CR0_ET;
649 save->cr3 = 0; 657 save->cr3 = 0;
650 save->cr4 = 0; 658 save->cr4 = 0;
651 } 659 }
@@ -654,6 +662,11 @@ static void init_vmcb(struct vcpu_svm *svm)
654 svm->nested.vmcb = 0; 662 svm->nested.vmcb = 0;
655 svm->vcpu.arch.hflags = 0; 663 svm->vcpu.arch.hflags = 0;
656 664
665 if (svm_has(SVM_FEATURE_PAUSE_FILTER)) {
666 control->pause_filter_count = 3000;
667 control->intercept |= (1ULL << INTERCEPT_PAUSE);
668 }
669
657 enable_gif(svm); 670 enable_gif(svm);
658} 671}
659 672
@@ -693,29 +706,28 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
693 if (err) 706 if (err)
694 goto free_svm; 707 goto free_svm;
695 708
709 err = -ENOMEM;
696 page = alloc_page(GFP_KERNEL); 710 page = alloc_page(GFP_KERNEL);
697 if (!page) { 711 if (!page)
698 err = -ENOMEM;
699 goto uninit; 712 goto uninit;
700 }
701 713
702 err = -ENOMEM;
703 msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER); 714 msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
704 if (!msrpm_pages) 715 if (!msrpm_pages)
705 goto uninit; 716 goto free_page1;
706 717
707 nested_msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER); 718 nested_msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
708 if (!nested_msrpm_pages) 719 if (!nested_msrpm_pages)
709 goto uninit; 720 goto free_page2;
710
711 svm->msrpm = page_address(msrpm_pages);
712 svm_vcpu_init_msrpm(svm->msrpm);
713 721
714 hsave_page = alloc_page(GFP_KERNEL); 722 hsave_page = alloc_page(GFP_KERNEL);
715 if (!hsave_page) 723 if (!hsave_page)
716 goto uninit; 724 goto free_page3;
725
717 svm->nested.hsave = page_address(hsave_page); 726 svm->nested.hsave = page_address(hsave_page);
718 727
728 svm->msrpm = page_address(msrpm_pages);
729 svm_vcpu_init_msrpm(svm->msrpm);
730
719 svm->nested.msrpm = page_address(nested_msrpm_pages); 731 svm->nested.msrpm = page_address(nested_msrpm_pages);
720 732
721 svm->vmcb = page_address(page); 733 svm->vmcb = page_address(page);
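The hunk above reorders svm_create_vcpu() so that each allocation failure unwinds only what has already been allocated, instead of leaking the earlier pages on the shared uninit path. A minimal, self-contained sketch of that goto-unwind pattern; the function and variable names here are illustrative and not taken from the patch:

#include <linux/gfp.h>
#include <linux/mm.h>

/* Sketch of the goto-unwind error handling used above; names are illustrative. */
static int alloc_three_pages(struct page **a, struct page **b, struct page **c)
{
	*a = alloc_page(GFP_KERNEL);
	if (!*a)
		return -ENOMEM;

	*b = alloc_page(GFP_KERNEL);
	if (!*b)
		goto free_a;

	*c = alloc_page(GFP_KERNEL);
	if (!*c)
		goto free_b;

	return 0;

free_b:
	__free_page(*b);
free_a:
	__free_page(*a);
	return -ENOMEM;
}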
@@ -725,13 +737,18 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
725 init_vmcb(svm); 737 init_vmcb(svm);
726 738
727 fx_init(&svm->vcpu); 739 fx_init(&svm->vcpu);
728 svm->vcpu.fpu_active = 1;
729 svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; 740 svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
730 if (kvm_vcpu_is_bsp(&svm->vcpu)) 741 if (kvm_vcpu_is_bsp(&svm->vcpu))
731 svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; 742 svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
732 743
733 return &svm->vcpu; 744 return &svm->vcpu;
734 745
746free_page3:
747 __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER);
748free_page2:
749 __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER);
750free_page1:
751 __free_page(page);
735uninit: 752uninit:
736 kvm_vcpu_uninit(&svm->vcpu); 753 kvm_vcpu_uninit(&svm->vcpu);
737free_svm: 754free_svm:
@@ -758,17 +775,18 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
758 int i; 775 int i;
759 776
760 if (unlikely(cpu != vcpu->cpu)) { 777 if (unlikely(cpu != vcpu->cpu)) {
761 u64 tsc_this, delta; 778 u64 delta;
762 779
763 /* 780 if (check_tsc_unstable()) {
764 * Make sure that the guest sees a monotonically 781 /*
765 * increasing TSC. 782 * Make sure that the guest sees a monotonically
766 */ 783 * increasing TSC.
767 rdtscll(tsc_this); 784 */
768 delta = vcpu->arch.host_tsc - tsc_this; 785 delta = vcpu->arch.host_tsc - native_read_tsc();
769 svm->vmcb->control.tsc_offset += delta; 786 svm->vmcb->control.tsc_offset += delta;
770 if (is_nested(svm)) 787 if (is_nested(svm))
771 svm->nested.hsave->control.tsc_offset += delta; 788 svm->nested.hsave->control.tsc_offset += delta;
789 }
772 vcpu->cpu = cpu; 790 vcpu->cpu = cpu;
773 kvm_migrate_timers(vcpu); 791 kvm_migrate_timers(vcpu);
774 svm->asid_generation = 0; 792 svm->asid_generation = 0;
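The guest TSC is computed as host TSC plus the VMCB tsc_offset, so migrating a vcpu to a CPU whose TSC lags the previous one would make the guest-visible counter jump backwards; the hunk above folds the difference into tsc_offset instead, and only when the host TSC is known to be unstable. A hedged sketch of the arithmetic, using an illustrative helper that is not part of the patch:

#include <linux/types.h>

/*
 * Sketch: guest_tsc = host_tsc + tsc_offset. If the new CPU's TSC is
 * behind the old CPU's, grow the offset by the difference so the guest
 * still sees a monotonically increasing TSC.
 */
static void compensate_tsc_offset(u64 old_host_tsc, u64 new_host_tsc,
				  u64 *tsc_offset)
{
	if (new_host_tsc < old_host_tsc)
		*tsc_offset += old_host_tsc - new_host_tsc;
}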
@@ -787,7 +805,7 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
787 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) 805 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
788 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); 806 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
789 807
790 rdtscll(vcpu->arch.host_tsc); 808 vcpu->arch.host_tsc = native_read_tsc();
791} 809}
792 810
793static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) 811static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
@@ -950,42 +968,59 @@ static void svm_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
950 svm->vmcb->save.gdtr.base = dt->base ; 968 svm->vmcb->save.gdtr.base = dt->base ;
951} 969}
952 970
971static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
972{
973}
974
953static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) 975static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
954{ 976{
955} 977}
956 978
979static void update_cr0_intercept(struct vcpu_svm *svm)
980{
981 ulong gcr0 = svm->vcpu.arch.cr0;
982 u64 *hcr0 = &svm->vmcb->save.cr0;
983
984 if (!svm->vcpu.fpu_active)
985 *hcr0 |= SVM_CR0_SELECTIVE_MASK;
986 else
987 *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK)
988 | (gcr0 & SVM_CR0_SELECTIVE_MASK);
989
990
991 if (gcr0 == *hcr0 && svm->vcpu.fpu_active) {
992 svm->vmcb->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK;
993 svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK;
994 } else {
995 svm->vmcb->control.intercept_cr_read |= INTERCEPT_CR0_MASK;
996 svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR0_MASK;
997 }
998}
999
957static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 1000static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
958{ 1001{
959 struct vcpu_svm *svm = to_svm(vcpu); 1002 struct vcpu_svm *svm = to_svm(vcpu);
960 1003
961#ifdef CONFIG_X86_64 1004#ifdef CONFIG_X86_64
962 if (vcpu->arch.shadow_efer & EFER_LME) { 1005 if (vcpu->arch.efer & EFER_LME) {
963 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { 1006 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
964 vcpu->arch.shadow_efer |= EFER_LMA; 1007 vcpu->arch.efer |= EFER_LMA;
965 svm->vmcb->save.efer |= EFER_LMA | EFER_LME; 1008 svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
966 } 1009 }
967 1010
968 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) { 1011 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
969 vcpu->arch.shadow_efer &= ~EFER_LMA; 1012 vcpu->arch.efer &= ~EFER_LMA;
970 svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME); 1013 svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
971 } 1014 }
972 } 1015 }
973#endif 1016#endif
974 if (npt_enabled) 1017 vcpu->arch.cr0 = cr0;
975 goto set;
976 1018
977 if ((vcpu->arch.cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) { 1019 if (!npt_enabled)
978 svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); 1020 cr0 |= X86_CR0_PG | X86_CR0_WP;
979 vcpu->fpu_active = 1;
980 }
981 1021
982 vcpu->arch.cr0 = cr0; 1022 if (!vcpu->fpu_active)
983 cr0 |= X86_CR0_PG | X86_CR0_WP;
984 if (!vcpu->fpu_active) {
985 svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR);
986 cr0 |= X86_CR0_TS; 1023 cr0 |= X86_CR0_TS;
987 }
988set:
989 /* 1024 /*
990 * re-enable caching here because the QEMU bios 1025 * re-enable caching here because the QEMU bios
991 * does not do it - this results in some delay at 1026 * does not do it - this results in some delay at
@@ -993,6 +1028,7 @@ set:
993 */ 1028 */
994 cr0 &= ~(X86_CR0_CD | X86_CR0_NW); 1029 cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
995 svm->vmcb->save.cr0 = cr0; 1030 svm->vmcb->save.cr0 = cr0;
1031 update_cr0_intercept(svm);
996} 1032}
997 1033
998static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 1034static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
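update_cr0_intercept() above compares the CR0 value the guest expects to see with the value actually loaded into the VMCB; the CR0 read/write intercepts are only kept when the two differ (for example TS forced set while the FPU is lazily switched out) or the FPU is inactive. A condensed sketch of that decision, with illustrative names:

#include <linux/types.h>

/* Sketch of the CR0-intercept decision made by update_cr0_intercept(). */
static bool cr0_intercept_needed(unsigned long guest_cr0, u64 host_cr0,
				 bool fpu_active)
{
	/*
	 * If the hardware CR0 matches what the guest believes and the FPU
	 * is live, nothing has to be hidden and the intercept can be
	 * dropped; otherwise CR0 accesses must trap.
	 */
	return guest_cr0 != host_cr0 || !fpu_active;
}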
@@ -1045,7 +1081,7 @@ static void update_db_intercept(struct kvm_vcpu *vcpu)
1045 svm->vmcb->control.intercept_exceptions &= 1081 svm->vmcb->control.intercept_exceptions &=
1046 ~((1 << DB_VECTOR) | (1 << BP_VECTOR)); 1082 ~((1 << DB_VECTOR) | (1 << BP_VECTOR));
1047 1083
1048 if (vcpu->arch.singlestep) 1084 if (svm->nmi_singlestep)
1049 svm->vmcb->control.intercept_exceptions |= (1 << DB_VECTOR); 1085 svm->vmcb->control.intercept_exceptions |= (1 << DB_VECTOR);
1050 1086
1051 if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { 1087 if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
@@ -1060,26 +1096,16 @@ static void update_db_intercept(struct kvm_vcpu *vcpu)
1060 vcpu->guest_debug = 0; 1096 vcpu->guest_debug = 0;
1061} 1097}
1062 1098
1063static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) 1099static void svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
1064{ 1100{
1065 int old_debug = vcpu->guest_debug;
1066 struct vcpu_svm *svm = to_svm(vcpu); 1101 struct vcpu_svm *svm = to_svm(vcpu);
1067 1102
1068 vcpu->guest_debug = dbg->control;
1069
1070 update_db_intercept(vcpu);
1071
1072 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) 1103 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
1073 svm->vmcb->save.dr7 = dbg->arch.debugreg[7]; 1104 svm->vmcb->save.dr7 = dbg->arch.debugreg[7];
1074 else 1105 else
1075 svm->vmcb->save.dr7 = vcpu->arch.dr7; 1106 svm->vmcb->save.dr7 = vcpu->arch.dr7;
1076 1107
1077 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) 1108 update_db_intercept(vcpu);
1078 svm->vmcb->save.rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
1079 else if (old_debug & KVM_GUESTDBG_SINGLESTEP)
1080 svm->vmcb->save.rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
1081
1082 return 0;
1083} 1109}
1084 1110
1085static void load_host_msrs(struct kvm_vcpu *vcpu) 1111static void load_host_msrs(struct kvm_vcpu *vcpu)
@@ -1096,91 +1122,85 @@ static void save_host_msrs(struct kvm_vcpu *vcpu)
1096#endif 1122#endif
1097} 1123}
1098 1124
1099static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *svm_data) 1125static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
1100{ 1126{
1101 if (svm_data->next_asid > svm_data->max_asid) { 1127 if (sd->next_asid > sd->max_asid) {
1102 ++svm_data->asid_generation; 1128 ++sd->asid_generation;
1103 svm_data->next_asid = 1; 1129 sd->next_asid = 1;
1104 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID; 1130 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
1105 } 1131 }
1106 1132
1107 svm->asid_generation = svm_data->asid_generation; 1133 svm->asid_generation = sd->asid_generation;
1108 svm->vmcb->control.asid = svm_data->next_asid++; 1134 svm->vmcb->control.asid = sd->next_asid++;
1109} 1135}
1110 1136
1111static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr) 1137static int svm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *dest)
1112{ 1138{
1113 struct vcpu_svm *svm = to_svm(vcpu); 1139 struct vcpu_svm *svm = to_svm(vcpu);
1114 unsigned long val;
1115 1140
1116 switch (dr) { 1141 switch (dr) {
1117 case 0 ... 3: 1142 case 0 ... 3:
1118 val = vcpu->arch.db[dr]; 1143 *dest = vcpu->arch.db[dr];
1119 break; 1144 break;
1145 case 4:
1146 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
1147 return EMULATE_FAIL; /* will re-inject UD */
1148 /* fall through */
1120 case 6: 1149 case 6:
1121 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) 1150 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
1122 val = vcpu->arch.dr6; 1151 *dest = vcpu->arch.dr6;
1123 else 1152 else
1124 val = svm->vmcb->save.dr6; 1153 *dest = svm->vmcb->save.dr6;
1125 break; 1154 break;
1155 case 5:
1156 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
1157 return EMULATE_FAIL; /* will re-inject UD */
1158 /* fall through */
1126 case 7: 1159 case 7:
1127 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) 1160 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
1128 val = vcpu->arch.dr7; 1161 *dest = vcpu->arch.dr7;
1129 else 1162 else
1130 val = svm->vmcb->save.dr7; 1163 *dest = svm->vmcb->save.dr7;
1131 break; 1164 break;
1132 default:
1133 val = 0;
1134 } 1165 }
1135 1166
1136 return val; 1167 return EMULATE_DONE;
1137} 1168}
1138 1169
1139static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value, 1170static int svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value)
1140 int *exception)
1141{ 1171{
1142 struct vcpu_svm *svm = to_svm(vcpu); 1172 struct vcpu_svm *svm = to_svm(vcpu);
1143 1173
1144 *exception = 0;
1145
1146 switch (dr) { 1174 switch (dr) {
1147 case 0 ... 3: 1175 case 0 ... 3:
1148 vcpu->arch.db[dr] = value; 1176 vcpu->arch.db[dr] = value;
1149 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) 1177 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
1150 vcpu->arch.eff_db[dr] = value; 1178 vcpu->arch.eff_db[dr] = value;
1151 return; 1179 break;
1152 case 4 ... 5: 1180 case 4:
1153 if (vcpu->arch.cr4 & X86_CR4_DE) 1181 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
1154 *exception = UD_VECTOR; 1182 return EMULATE_FAIL; /* will re-inject UD */
1155 return; 1183 /* fall through */
1156 case 6: 1184 case 6:
1157 if (value & 0xffffffff00000000ULL) {
1158 *exception = GP_VECTOR;
1159 return;
1160 }
1161 vcpu->arch.dr6 = (value & DR6_VOLATILE) | DR6_FIXED_1; 1185 vcpu->arch.dr6 = (value & DR6_VOLATILE) | DR6_FIXED_1;
1162 return; 1186 break;
1187 case 5:
1188 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
1189 return EMULATE_FAIL; /* will re-inject UD */
1190 /* fall through */
1163 case 7: 1191 case 7:
1164 if (value & 0xffffffff00000000ULL) {
1165 *exception = GP_VECTOR;
1166 return;
1167 }
1168 vcpu->arch.dr7 = (value & DR7_VOLATILE) | DR7_FIXED_1; 1192 vcpu->arch.dr7 = (value & DR7_VOLATILE) | DR7_FIXED_1;
1169 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { 1193 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
1170 svm->vmcb->save.dr7 = vcpu->arch.dr7; 1194 svm->vmcb->save.dr7 = vcpu->arch.dr7;
1171 vcpu->arch.switch_db_regs = (value & DR7_BP_EN_MASK); 1195 vcpu->arch.switch_db_regs = (value & DR7_BP_EN_MASK);
1172 } 1196 }
1173 return; 1197 break;
1174 default:
1175 /* FIXME: Possible case? */
1176 printk(KERN_DEBUG "%s: unexpected dr %u\n",
1177 __func__, dr);
1178 *exception = UD_VECTOR;
1179 return;
1180 } 1198 }
1199
1200 return EMULATE_DONE;
1181} 1201}
1182 1202
1183static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1203static int pf_interception(struct vcpu_svm *svm)
1184{ 1204{
1185 u64 fault_address; 1205 u64 fault_address;
1186 u32 error_code; 1206 u32 error_code;
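The reworked svm_get_dr()/svm_set_dr() above return EMULATE_FAIL for DR4/DR5 whenever CR4.DE is set, letting the caller re-inject #UD; with CR4.DE clear they fall through and behave as aliases of DR6/DR7, matching the architectural debug-register aliasing. A hedged sketch of that rule, as an illustrative helper only:

/*
 * Sketch: DR4/DR5 alias DR6/DR7 unless CR4.DE is set, in which case the
 * access is undefined and the caller should inject #UD. Illustrative only.
 */
static int canonical_debug_reg(int dr, bool cr4_de, int *canonical)
{
	if (dr == 4 || dr == 5) {
		if (cr4_de)
			return -1;		/* caller injects #UD */
		*canonical = dr + 2;		/* DR4 -> DR6, DR5 -> DR7 */
	} else {
		*canonical = dr;
	}
	return 0;
}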
@@ -1194,17 +1214,19 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1194 return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); 1214 return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
1195} 1215}
1196 1216
1197static int db_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1217static int db_interception(struct vcpu_svm *svm)
1198{ 1218{
1219 struct kvm_run *kvm_run = svm->vcpu.run;
1220
1199 if (!(svm->vcpu.guest_debug & 1221 if (!(svm->vcpu.guest_debug &
1200 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) && 1222 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
1201 !svm->vcpu.arch.singlestep) { 1223 !svm->nmi_singlestep) {
1202 kvm_queue_exception(&svm->vcpu, DB_VECTOR); 1224 kvm_queue_exception(&svm->vcpu, DB_VECTOR);
1203 return 1; 1225 return 1;
1204 } 1226 }
1205 1227
1206 if (svm->vcpu.arch.singlestep) { 1228 if (svm->nmi_singlestep) {
1207 svm->vcpu.arch.singlestep = false; 1229 svm->nmi_singlestep = false;
1208 if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) 1230 if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP))
1209 svm->vmcb->save.rflags &= 1231 svm->vmcb->save.rflags &=
1210 ~(X86_EFLAGS_TF | X86_EFLAGS_RF); 1232 ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
@@ -1223,35 +1245,41 @@ static int db_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1223 return 1; 1245 return 1;
1224} 1246}
1225 1247
1226static int bp_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1248static int bp_interception(struct vcpu_svm *svm)
1227{ 1249{
1250 struct kvm_run *kvm_run = svm->vcpu.run;
1251
1228 kvm_run->exit_reason = KVM_EXIT_DEBUG; 1252 kvm_run->exit_reason = KVM_EXIT_DEBUG;
1229 kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip; 1253 kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
1230 kvm_run->debug.arch.exception = BP_VECTOR; 1254 kvm_run->debug.arch.exception = BP_VECTOR;
1231 return 0; 1255 return 0;
1232} 1256}
1233 1257
1234static int ud_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1258static int ud_interception(struct vcpu_svm *svm)
1235{ 1259{
1236 int er; 1260 int er;
1237 1261
1238 er = emulate_instruction(&svm->vcpu, kvm_run, 0, 0, EMULTYPE_TRAP_UD); 1262 er = emulate_instruction(&svm->vcpu, 0, 0, EMULTYPE_TRAP_UD);
1239 if (er != EMULATE_DONE) 1263 if (er != EMULATE_DONE)
1240 kvm_queue_exception(&svm->vcpu, UD_VECTOR); 1264 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
1241 return 1; 1265 return 1;
1242} 1266}
1243 1267
1244static int nm_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1268static void svm_fpu_activate(struct kvm_vcpu *vcpu)
1245{ 1269{
1270 struct vcpu_svm *svm = to_svm(vcpu);
1246 svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); 1271 svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
1247 if (!(svm->vcpu.arch.cr0 & X86_CR0_TS))
1248 svm->vmcb->save.cr0 &= ~X86_CR0_TS;
1249 svm->vcpu.fpu_active = 1; 1272 svm->vcpu.fpu_active = 1;
1273 update_cr0_intercept(svm);
1274}
1250 1275
1276static int nm_interception(struct vcpu_svm *svm)
1277{
1278 svm_fpu_activate(&svm->vcpu);
1251 return 1; 1279 return 1;
1252} 1280}
1253 1281
1254static int mc_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1282static int mc_interception(struct vcpu_svm *svm)
1255{ 1283{
1256 /* 1284 /*
1257 * On an #MC intercept the MCE handler is not called automatically in 1285 * On an #MC intercept the MCE handler is not called automatically in
@@ -1264,8 +1292,10 @@ static int mc_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1264 return 1; 1292 return 1;
1265} 1293}
1266 1294
1267static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1295static int shutdown_interception(struct vcpu_svm *svm)
1268{ 1296{
1297 struct kvm_run *kvm_run = svm->vcpu.run;
1298
1269 /* 1299 /*
1270 * VMCB is undefined after a SHUTDOWN intercept 1300 * VMCB is undefined after a SHUTDOWN intercept
1271 * so reinitialize it. 1301 * so reinitialize it.
@@ -1277,7 +1307,7 @@ static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1277 return 0; 1307 return 0;
1278} 1308}
1279 1309
1280static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1310static int io_interception(struct vcpu_svm *svm)
1281{ 1311{
1282 u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */ 1312 u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
1283 int size, in, string; 1313 int size, in, string;
@@ -1291,7 +1321,7 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1291 1321
1292 if (string) { 1322 if (string) {
1293 if (emulate_instruction(&svm->vcpu, 1323 if (emulate_instruction(&svm->vcpu,
1294 kvm_run, 0, 0, 0) == EMULATE_DO_MMIO) 1324 0, 0, 0) == EMULATE_DO_MMIO)
1295 return 0; 1325 return 0;
1296 return 1; 1326 return 1;
1297 } 1327 }
@@ -1301,33 +1331,33 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1301 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; 1331 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
1302 1332
1303 skip_emulated_instruction(&svm->vcpu); 1333 skip_emulated_instruction(&svm->vcpu);
1304 return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port); 1334 return kvm_emulate_pio(&svm->vcpu, in, size, port);
1305} 1335}
1306 1336
1307static int nmi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1337static int nmi_interception(struct vcpu_svm *svm)
1308{ 1338{
1309 return 1; 1339 return 1;
1310} 1340}
1311 1341
1312static int intr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1342static int intr_interception(struct vcpu_svm *svm)
1313{ 1343{
1314 ++svm->vcpu.stat.irq_exits; 1344 ++svm->vcpu.stat.irq_exits;
1315 return 1; 1345 return 1;
1316} 1346}
1317 1347
1318static int nop_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1348static int nop_on_interception(struct vcpu_svm *svm)
1319{ 1349{
1320 return 1; 1350 return 1;
1321} 1351}
1322 1352
1323static int halt_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1353static int halt_interception(struct vcpu_svm *svm)
1324{ 1354{
1325 svm->next_rip = kvm_rip_read(&svm->vcpu) + 1; 1355 svm->next_rip = kvm_rip_read(&svm->vcpu) + 1;
1326 skip_emulated_instruction(&svm->vcpu); 1356 skip_emulated_instruction(&svm->vcpu);
1327 return kvm_emulate_halt(&svm->vcpu); 1357 return kvm_emulate_halt(&svm->vcpu);
1328} 1358}
1329 1359
1330static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1360static int vmmcall_interception(struct vcpu_svm *svm)
1331{ 1361{
1332 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 1362 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
1333 skip_emulated_instruction(&svm->vcpu); 1363 skip_emulated_instruction(&svm->vcpu);
@@ -1337,7 +1367,7 @@ static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1337 1367
1338static int nested_svm_check_permissions(struct vcpu_svm *svm) 1368static int nested_svm_check_permissions(struct vcpu_svm *svm)
1339{ 1369{
1340 if (!(svm->vcpu.arch.shadow_efer & EFER_SVME) 1370 if (!(svm->vcpu.arch.efer & EFER_SVME)
1341 || !is_paging(&svm->vcpu)) { 1371 || !is_paging(&svm->vcpu)) {
1342 kvm_queue_exception(&svm->vcpu, UD_VECTOR); 1372 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
1343 return 1; 1373 return 1;
@@ -1378,8 +1408,15 @@ static inline int nested_svm_intr(struct vcpu_svm *svm)
1378 1408
1379 svm->vmcb->control.exit_code = SVM_EXIT_INTR; 1409 svm->vmcb->control.exit_code = SVM_EXIT_INTR;
1380 1410
1381 if (nested_svm_exit_handled(svm)) { 1411 if (svm->nested.intercept & 1ULL) {
1382 nsvm_printk("VMexit -> INTR\n"); 1412 /*
1413 * The #vmexit can't be emulated here directly because this
 1414 * code path runs with irqs and preemption disabled. A
1415 * #vmexit emulation might sleep. Only signal request for
1416 * the #vmexit here.
1417 */
1418 svm->nested.exit_required = true;
1419 trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
1383 return 1; 1420 return 1;
1384 } 1421 }
1385 1422
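Because nested_svm_intr() runs with interrupts and preemption disabled, the #vmexit cannot be emulated on the spot; the hunk only sets nested.exit_required, and the flag is consumed later from a context that may sleep (see handle_exit() and svm_vcpu_run() further down). A minimal sketch of that defer-and-consume pattern, reusing the field and function names from this patch but wrapped in illustrative helpers:

/* Sketch: defer the nested #vmexit while in atomic context... */
static int defer_nested_vmexit(struct vcpu_svm *svm)
{
	svm->nested.exit_required = true;	/* emulate the #vmexit later */
	return 1;				/* keep the vcpu running for now */
}

/* ...and emulate it once we are back in a sleepable context. */
static bool complete_deferred_vmexit(struct vcpu_svm *svm)
{
	if (!svm->nested.exit_required)
		return false;

	nested_svm_vmexit(svm);			/* may sleep */
	svm->nested.exit_required = false;
	return true;
}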
@@ -1390,10 +1427,7 @@ static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, enum km_type idx)
1390{ 1427{
1391 struct page *page; 1428 struct page *page;
1392 1429
1393 down_read(&current->mm->mmap_sem);
1394 page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT); 1430 page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT);
1395 up_read(&current->mm->mmap_sem);
1396
1397 if (is_error_page(page)) 1431 if (is_error_page(page))
1398 goto error; 1432 goto error;
1399 1433
@@ -1532,14 +1566,12 @@ static int nested_svm_exit_handled(struct vcpu_svm *svm)
1532 } 1566 }
1533 default: { 1567 default: {
1534 u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR); 1568 u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR);
1535 nsvm_printk("exit code: 0x%x\n", exit_code);
1536 if (svm->nested.intercept & exit_bits) 1569 if (svm->nested.intercept & exit_bits)
1537 vmexit = NESTED_EXIT_DONE; 1570 vmexit = NESTED_EXIT_DONE;
1538 } 1571 }
1539 } 1572 }
1540 1573
1541 if (vmexit == NESTED_EXIT_DONE) { 1574 if (vmexit == NESTED_EXIT_DONE) {
1542 nsvm_printk("#VMEXIT reason=%04x\n", exit_code);
1543 nested_svm_vmexit(svm); 1575 nested_svm_vmexit(svm);
1544 } 1576 }
1545 1577
@@ -1584,6 +1616,12 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1584 struct vmcb *hsave = svm->nested.hsave; 1616 struct vmcb *hsave = svm->nested.hsave;
1585 struct vmcb *vmcb = svm->vmcb; 1617 struct vmcb *vmcb = svm->vmcb;
1586 1618
1619 trace_kvm_nested_vmexit_inject(vmcb->control.exit_code,
1620 vmcb->control.exit_info_1,
1621 vmcb->control.exit_info_2,
1622 vmcb->control.exit_int_info,
1623 vmcb->control.exit_int_info_err);
1624
1587 nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, KM_USER0); 1625 nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, KM_USER0);
1588 if (!nested_vmcb) 1626 if (!nested_vmcb)
1589 return 1; 1627 return 1;
@@ -1617,6 +1655,22 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1617 nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2; 1655 nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2;
1618 nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info; 1656 nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info;
1619 nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err; 1657 nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err;
1658
1659 /*
1660 * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have
1661 * to make sure that we do not lose injected events. So check event_inj
1662 * here and copy it to exit_int_info if it is valid.
 1663 * Exit_int_info and event_inj can't both be valid because the case
1664 * below only happens on a VMRUN instruction intercept which has
1665 * no valid exit_int_info set.
1666 */
1667 if (vmcb->control.event_inj & SVM_EVTINJ_VALID) {
1668 struct vmcb_control_area *nc = &nested_vmcb->control;
1669
1670 nc->exit_int_info = vmcb->control.event_inj;
1671 nc->exit_int_info_err = vmcb->control.event_inj_err;
1672 }
1673
1620 nested_vmcb->control.tlb_ctl = 0; 1674 nested_vmcb->control.tlb_ctl = 0;
1621 nested_vmcb->control.event_inj = 0; 1675 nested_vmcb->control.event_inj = 0;
1622 nested_vmcb->control.event_inj_err = 0; 1676 nested_vmcb->control.event_inj_err = 0;
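The comment block above explains why a pending event_inj has to be forwarded: when a VMRUN is intercepted and turned into a nested #VMEXIT within the same host exit cycle, exit_int_info is still empty, so the not-yet-injected event would otherwise be lost. A compact sketch of that propagation, mirroring the hunk but wrapped in an illustrative helper:

/* Sketch of the event_inj -> exit_int_info propagation shown above. */
static void forward_pending_event(struct vmcb *vmcb, struct vmcb *nested_vmcb)
{
	if (!(vmcb->control.event_inj & SVM_EVTINJ_VALID))
		return;

	nested_vmcb->control.exit_int_info     = vmcb->control.event_inj;
	nested_vmcb->control.exit_int_info_err = vmcb->control.event_inj_err;
}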
@@ -1628,10 +1682,6 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1628 /* Restore the original control entries */ 1682 /* Restore the original control entries */
1629 copy_vmcb_control_area(vmcb, hsave); 1683 copy_vmcb_control_area(vmcb, hsave);
1630 1684
1631 /* Kill any pending exceptions */
1632 if (svm->vcpu.arch.exception.pending == true)
1633 nsvm_printk("WARNING: Pending Exception\n");
1634
1635 kvm_clear_exception_queue(&svm->vcpu); 1685 kvm_clear_exception_queue(&svm->vcpu);
1636 kvm_clear_interrupt_queue(&svm->vcpu); 1686 kvm_clear_interrupt_queue(&svm->vcpu);
1637 1687
@@ -1702,6 +1752,12 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
1702 /* nested_vmcb is our indicator if nested SVM is activated */ 1752 /* nested_vmcb is our indicator if nested SVM is activated */
1703 svm->nested.vmcb = svm->vmcb->save.rax; 1753 svm->nested.vmcb = svm->vmcb->save.rax;
1704 1754
1755 trace_kvm_nested_vmrun(svm->vmcb->save.rip - 3, svm->nested.vmcb,
1756 nested_vmcb->save.rip,
1757 nested_vmcb->control.int_ctl,
1758 nested_vmcb->control.event_inj,
1759 nested_vmcb->control.nested_ctl);
1760
1705 /* Clear internal status */ 1761 /* Clear internal status */
1706 kvm_clear_exception_queue(&svm->vcpu); 1762 kvm_clear_exception_queue(&svm->vcpu);
1707 kvm_clear_interrupt_queue(&svm->vcpu); 1763 kvm_clear_interrupt_queue(&svm->vcpu);
@@ -1714,8 +1770,8 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
1714 hsave->save.ds = vmcb->save.ds; 1770 hsave->save.ds = vmcb->save.ds;
1715 hsave->save.gdtr = vmcb->save.gdtr; 1771 hsave->save.gdtr = vmcb->save.gdtr;
1716 hsave->save.idtr = vmcb->save.idtr; 1772 hsave->save.idtr = vmcb->save.idtr;
1717 hsave->save.efer = svm->vcpu.arch.shadow_efer; 1773 hsave->save.efer = svm->vcpu.arch.efer;
1718 hsave->save.cr0 = svm->vcpu.arch.cr0; 1774 hsave->save.cr0 = kvm_read_cr0(&svm->vcpu);
1719 hsave->save.cr4 = svm->vcpu.arch.cr4; 1775 hsave->save.cr4 = svm->vcpu.arch.cr4;
1720 hsave->save.rflags = vmcb->save.rflags; 1776 hsave->save.rflags = vmcb->save.rflags;
1721 hsave->save.rip = svm->next_rip; 1777 hsave->save.rip = svm->next_rip;
@@ -1789,28 +1845,15 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
1789 svm->nested.intercept = nested_vmcb->control.intercept; 1845 svm->nested.intercept = nested_vmcb->control.intercept;
1790 1846
1791 force_new_asid(&svm->vcpu); 1847 force_new_asid(&svm->vcpu);
1792 svm->vmcb->control.exit_int_info = nested_vmcb->control.exit_int_info;
1793 svm->vmcb->control.exit_int_info_err = nested_vmcb->control.exit_int_info_err;
1794 svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK; 1848 svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK;
1795 if (nested_vmcb->control.int_ctl & V_IRQ_MASK) {
1796 nsvm_printk("nSVM Injecting Interrupt: 0x%x\n",
1797 nested_vmcb->control.int_ctl);
1798 }
1799 if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK) 1849 if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK)
1800 svm->vcpu.arch.hflags |= HF_VINTR_MASK; 1850 svm->vcpu.arch.hflags |= HF_VINTR_MASK;
1801 else 1851 else
1802 svm->vcpu.arch.hflags &= ~HF_VINTR_MASK; 1852 svm->vcpu.arch.hflags &= ~HF_VINTR_MASK;
1803 1853
1804 nsvm_printk("nSVM exit_int_info: 0x%x | int_state: 0x%x\n",
1805 nested_vmcb->control.exit_int_info,
1806 nested_vmcb->control.int_state);
1807
1808 svm->vmcb->control.int_vector = nested_vmcb->control.int_vector; 1854 svm->vmcb->control.int_vector = nested_vmcb->control.int_vector;
1809 svm->vmcb->control.int_state = nested_vmcb->control.int_state; 1855 svm->vmcb->control.int_state = nested_vmcb->control.int_state;
1810 svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset; 1856 svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset;
1811 if (nested_vmcb->control.event_inj & SVM_EVTINJ_VALID)
1812 nsvm_printk("Injecting Event: 0x%x\n",
1813 nested_vmcb->control.event_inj);
1814 svm->vmcb->control.event_inj = nested_vmcb->control.event_inj; 1857 svm->vmcb->control.event_inj = nested_vmcb->control.event_inj;
1815 svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err; 1858 svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err;
1816 1859
@@ -1837,7 +1880,7 @@ static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
1837 to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip; 1880 to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip;
1838} 1881}
1839 1882
1840static int vmload_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1883static int vmload_interception(struct vcpu_svm *svm)
1841{ 1884{
1842 struct vmcb *nested_vmcb; 1885 struct vmcb *nested_vmcb;
1843 1886
@@ -1857,7 +1900,7 @@ static int vmload_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1857 return 1; 1900 return 1;
1858} 1901}
1859 1902
1860static int vmsave_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1903static int vmsave_interception(struct vcpu_svm *svm)
1861{ 1904{
1862 struct vmcb *nested_vmcb; 1905 struct vmcb *nested_vmcb;
1863 1906
@@ -1877,10 +1920,8 @@ static int vmsave_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1877 return 1; 1920 return 1;
1878} 1921}
1879 1922
1880static int vmrun_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1923static int vmrun_interception(struct vcpu_svm *svm)
1881{ 1924{
1882 nsvm_printk("VMrun\n");
1883
1884 if (nested_svm_check_permissions(svm)) 1925 if (nested_svm_check_permissions(svm))
1885 return 1; 1926 return 1;
1886 1927
@@ -1907,7 +1948,7 @@ failed:
1907 return 1; 1948 return 1;
1908} 1949}
1909 1950
1910static int stgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1951static int stgi_interception(struct vcpu_svm *svm)
1911{ 1952{
1912 if (nested_svm_check_permissions(svm)) 1953 if (nested_svm_check_permissions(svm))
1913 return 1; 1954 return 1;
@@ -1920,7 +1961,7 @@ static int stgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1920 return 1; 1961 return 1;
1921} 1962}
1922 1963
1923static int clgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1964static int clgi_interception(struct vcpu_svm *svm)
1924{ 1965{
1925 if (nested_svm_check_permissions(svm)) 1966 if (nested_svm_check_permissions(svm))
1926 return 1; 1967 return 1;
@@ -1937,10 +1978,12 @@ static int clgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1937 return 1; 1978 return 1;
1938} 1979}
1939 1980
1940static int invlpga_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1981static int invlpga_interception(struct vcpu_svm *svm)
1941{ 1982{
1942 struct kvm_vcpu *vcpu = &svm->vcpu; 1983 struct kvm_vcpu *vcpu = &svm->vcpu;
1943 nsvm_printk("INVLPGA\n"); 1984
1985 trace_kvm_invlpga(svm->vmcb->save.rip, vcpu->arch.regs[VCPU_REGS_RCX],
1986 vcpu->arch.regs[VCPU_REGS_RAX]);
1944 1987
1945 /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */ 1988 /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
1946 kvm_mmu_invlpg(vcpu, vcpu->arch.regs[VCPU_REGS_RAX]); 1989 kvm_mmu_invlpg(vcpu, vcpu->arch.regs[VCPU_REGS_RAX]);
@@ -1950,15 +1993,21 @@ static int invlpga_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1950 return 1; 1993 return 1;
1951} 1994}
1952 1995
1953static int invalid_op_interception(struct vcpu_svm *svm, 1996static int skinit_interception(struct vcpu_svm *svm)
1954 struct kvm_run *kvm_run) 1997{
1998 trace_kvm_skinit(svm->vmcb->save.rip, svm->vcpu.arch.regs[VCPU_REGS_RAX]);
1999
2000 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
2001 return 1;
2002}
2003
2004static int invalid_op_interception(struct vcpu_svm *svm)
1955{ 2005{
1956 kvm_queue_exception(&svm->vcpu, UD_VECTOR); 2006 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
1957 return 1; 2007 return 1;
1958} 2008}
1959 2009
1960static int task_switch_interception(struct vcpu_svm *svm, 2010static int task_switch_interception(struct vcpu_svm *svm)
1961 struct kvm_run *kvm_run)
1962{ 2011{
1963 u16 tss_selector; 2012 u16 tss_selector;
1964 int reason; 2013 int reason;
@@ -2008,41 +2057,42 @@ static int task_switch_interception(struct vcpu_svm *svm,
2008 return kvm_task_switch(&svm->vcpu, tss_selector, reason); 2057 return kvm_task_switch(&svm->vcpu, tss_selector, reason);
2009} 2058}
2010 2059
2011static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 2060static int cpuid_interception(struct vcpu_svm *svm)
2012{ 2061{
2013 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; 2062 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
2014 kvm_emulate_cpuid(&svm->vcpu); 2063 kvm_emulate_cpuid(&svm->vcpu);
2015 return 1; 2064 return 1;
2016} 2065}
2017 2066
2018static int iret_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 2067static int iret_interception(struct vcpu_svm *svm)
2019{ 2068{
2020 ++svm->vcpu.stat.nmi_window_exits; 2069 ++svm->vcpu.stat.nmi_window_exits;
2021 svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET); 2070 svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_IRET);
2022 svm->vcpu.arch.hflags |= HF_IRET_MASK; 2071 svm->vcpu.arch.hflags |= HF_IRET_MASK;
2023 return 1; 2072 return 1;
2024} 2073}
2025 2074
2026static int invlpg_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 2075static int invlpg_interception(struct vcpu_svm *svm)
2027{ 2076{
2028 if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0, 0) != EMULATE_DONE) 2077 if (emulate_instruction(&svm->vcpu, 0, 0, 0) != EMULATE_DONE)
2029 pr_unimpl(&svm->vcpu, "%s: failed\n", __func__); 2078 pr_unimpl(&svm->vcpu, "%s: failed\n", __func__);
2030 return 1; 2079 return 1;
2031} 2080}
2032 2081
2033static int emulate_on_interception(struct vcpu_svm *svm, 2082static int emulate_on_interception(struct vcpu_svm *svm)
2034 struct kvm_run *kvm_run)
2035{ 2083{
2036 if (emulate_instruction(&svm->vcpu, NULL, 0, 0, 0) != EMULATE_DONE) 2084 if (emulate_instruction(&svm->vcpu, 0, 0, 0) != EMULATE_DONE)
2037 pr_unimpl(&svm->vcpu, "%s: failed\n", __func__); 2085 pr_unimpl(&svm->vcpu, "%s: failed\n", __func__);
2038 return 1; 2086 return 1;
2039} 2087}
2040 2088
2041static int cr8_write_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 2089static int cr8_write_interception(struct vcpu_svm *svm)
2042{ 2090{
2091 struct kvm_run *kvm_run = svm->vcpu.run;
2092
2043 u8 cr8_prev = kvm_get_cr8(&svm->vcpu); 2093 u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
2044 /* instruction emulation calls kvm_set_cr8() */ 2094 /* instruction emulation calls kvm_set_cr8() */
2045 emulate_instruction(&svm->vcpu, NULL, 0, 0, 0); 2095 emulate_instruction(&svm->vcpu, 0, 0, 0);
2046 if (irqchip_in_kernel(svm->vcpu.kvm)) { 2096 if (irqchip_in_kernel(svm->vcpu.kvm)) {
2047 svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK; 2097 svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK;
2048 return 1; 2098 return 1;
@@ -2128,14 +2178,15 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
2128 return 0; 2178 return 0;
2129} 2179}
2130 2180
2131static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 2181static int rdmsr_interception(struct vcpu_svm *svm)
2132{ 2182{
2133 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; 2183 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
2134 u64 data; 2184 u64 data;
2135 2185
2136 if (svm_get_msr(&svm->vcpu, ecx, &data)) 2186 if (svm_get_msr(&svm->vcpu, ecx, &data)) {
2187 trace_kvm_msr_read_ex(ecx);
2137 kvm_inject_gp(&svm->vcpu, 0); 2188 kvm_inject_gp(&svm->vcpu, 0);
2138 else { 2189 } else {
2139 trace_kvm_msr_read(ecx, data); 2190 trace_kvm_msr_read(ecx, data);
2140 2191
2141 svm->vcpu.arch.regs[VCPU_REGS_RAX] = data & 0xffffffff; 2192 svm->vcpu.arch.regs[VCPU_REGS_RAX] = data & 0xffffffff;
@@ -2221,33 +2272,36 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
2221 return 0; 2272 return 0;
2222} 2273}
2223 2274
2224static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 2275static int wrmsr_interception(struct vcpu_svm *svm)
2225{ 2276{
2226 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; 2277 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
2227 u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u) 2278 u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u)
2228 | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32); 2279 | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32);
2229 2280
2230 trace_kvm_msr_write(ecx, data);
2231 2281
2232 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; 2282 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
2233 if (svm_set_msr(&svm->vcpu, ecx, data)) 2283 if (svm_set_msr(&svm->vcpu, ecx, data)) {
2284 trace_kvm_msr_write_ex(ecx, data);
2234 kvm_inject_gp(&svm->vcpu, 0); 2285 kvm_inject_gp(&svm->vcpu, 0);
2235 else 2286 } else {
2287 trace_kvm_msr_write(ecx, data);
2236 skip_emulated_instruction(&svm->vcpu); 2288 skip_emulated_instruction(&svm->vcpu);
2289 }
2237 return 1; 2290 return 1;
2238} 2291}
2239 2292
2240static int msr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 2293static int msr_interception(struct vcpu_svm *svm)
2241{ 2294{
2242 if (svm->vmcb->control.exit_info_1) 2295 if (svm->vmcb->control.exit_info_1)
2243 return wrmsr_interception(svm, kvm_run); 2296 return wrmsr_interception(svm);
2244 else 2297 else
2245 return rdmsr_interception(svm, kvm_run); 2298 return rdmsr_interception(svm);
2246} 2299}
2247 2300
2248static int interrupt_window_interception(struct vcpu_svm *svm, 2301static int interrupt_window_interception(struct vcpu_svm *svm)
2249 struct kvm_run *kvm_run)
2250{ 2302{
2303 struct kvm_run *kvm_run = svm->vcpu.run;
2304
2251 svm_clear_vintr(svm); 2305 svm_clear_vintr(svm);
2252 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; 2306 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
2253 /* 2307 /*
@@ -2265,13 +2319,18 @@ static int interrupt_window_interception(struct vcpu_svm *svm,
2265 return 1; 2319 return 1;
2266} 2320}
2267 2321
2268static int (*svm_exit_handlers[])(struct vcpu_svm *svm, 2322static int pause_interception(struct vcpu_svm *svm)
2269 struct kvm_run *kvm_run) = { 2323{
2324 kvm_vcpu_on_spin(&(svm->vcpu));
2325 return 1;
2326}
2327
2328static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
2270 [SVM_EXIT_READ_CR0] = emulate_on_interception, 2329 [SVM_EXIT_READ_CR0] = emulate_on_interception,
2271 [SVM_EXIT_READ_CR3] = emulate_on_interception, 2330 [SVM_EXIT_READ_CR3] = emulate_on_interception,
2272 [SVM_EXIT_READ_CR4] = emulate_on_interception, 2331 [SVM_EXIT_READ_CR4] = emulate_on_interception,
2273 [SVM_EXIT_READ_CR8] = emulate_on_interception, 2332 [SVM_EXIT_READ_CR8] = emulate_on_interception,
2274 /* for now: */ 2333 [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception,
2275 [SVM_EXIT_WRITE_CR0] = emulate_on_interception, 2334 [SVM_EXIT_WRITE_CR0] = emulate_on_interception,
2276 [SVM_EXIT_WRITE_CR3] = emulate_on_interception, 2335 [SVM_EXIT_WRITE_CR3] = emulate_on_interception,
2277 [SVM_EXIT_WRITE_CR4] = emulate_on_interception, 2336 [SVM_EXIT_WRITE_CR4] = emulate_on_interception,
@@ -2280,11 +2339,17 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
2280 [SVM_EXIT_READ_DR1] = emulate_on_interception, 2339 [SVM_EXIT_READ_DR1] = emulate_on_interception,
2281 [SVM_EXIT_READ_DR2] = emulate_on_interception, 2340 [SVM_EXIT_READ_DR2] = emulate_on_interception,
2282 [SVM_EXIT_READ_DR3] = emulate_on_interception, 2341 [SVM_EXIT_READ_DR3] = emulate_on_interception,
2342 [SVM_EXIT_READ_DR4] = emulate_on_interception,
2343 [SVM_EXIT_READ_DR5] = emulate_on_interception,
2344 [SVM_EXIT_READ_DR6] = emulate_on_interception,
2345 [SVM_EXIT_READ_DR7] = emulate_on_interception,
2283 [SVM_EXIT_WRITE_DR0] = emulate_on_interception, 2346 [SVM_EXIT_WRITE_DR0] = emulate_on_interception,
2284 [SVM_EXIT_WRITE_DR1] = emulate_on_interception, 2347 [SVM_EXIT_WRITE_DR1] = emulate_on_interception,
2285 [SVM_EXIT_WRITE_DR2] = emulate_on_interception, 2348 [SVM_EXIT_WRITE_DR2] = emulate_on_interception,
2286 [SVM_EXIT_WRITE_DR3] = emulate_on_interception, 2349 [SVM_EXIT_WRITE_DR3] = emulate_on_interception,
2350 [SVM_EXIT_WRITE_DR4] = emulate_on_interception,
2287 [SVM_EXIT_WRITE_DR5] = emulate_on_interception, 2351 [SVM_EXIT_WRITE_DR5] = emulate_on_interception,
2352 [SVM_EXIT_WRITE_DR6] = emulate_on_interception,
2288 [SVM_EXIT_WRITE_DR7] = emulate_on_interception, 2353 [SVM_EXIT_WRITE_DR7] = emulate_on_interception,
2289 [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception, 2354 [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception,
2290 [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception, 2355 [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception,
@@ -2301,6 +2366,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
2301 [SVM_EXIT_CPUID] = cpuid_interception, 2366 [SVM_EXIT_CPUID] = cpuid_interception,
2302 [SVM_EXIT_IRET] = iret_interception, 2367 [SVM_EXIT_IRET] = iret_interception,
2303 [SVM_EXIT_INVD] = emulate_on_interception, 2368 [SVM_EXIT_INVD] = emulate_on_interception,
2369 [SVM_EXIT_PAUSE] = pause_interception,
2304 [SVM_EXIT_HLT] = halt_interception, 2370 [SVM_EXIT_HLT] = halt_interception,
2305 [SVM_EXIT_INVLPG] = invlpg_interception, 2371 [SVM_EXIT_INVLPG] = invlpg_interception,
2306 [SVM_EXIT_INVLPGA] = invlpga_interception, 2372 [SVM_EXIT_INVLPGA] = invlpga_interception,
@@ -2314,26 +2380,36 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
2314 [SVM_EXIT_VMSAVE] = vmsave_interception, 2380 [SVM_EXIT_VMSAVE] = vmsave_interception,
2315 [SVM_EXIT_STGI] = stgi_interception, 2381 [SVM_EXIT_STGI] = stgi_interception,
2316 [SVM_EXIT_CLGI] = clgi_interception, 2382 [SVM_EXIT_CLGI] = clgi_interception,
2317 [SVM_EXIT_SKINIT] = invalid_op_interception, 2383 [SVM_EXIT_SKINIT] = skinit_interception,
2318 [SVM_EXIT_WBINVD] = emulate_on_interception, 2384 [SVM_EXIT_WBINVD] = emulate_on_interception,
2319 [SVM_EXIT_MONITOR] = invalid_op_interception, 2385 [SVM_EXIT_MONITOR] = invalid_op_interception,
2320 [SVM_EXIT_MWAIT] = invalid_op_interception, 2386 [SVM_EXIT_MWAIT] = invalid_op_interception,
2321 [SVM_EXIT_NPF] = pf_interception, 2387 [SVM_EXIT_NPF] = pf_interception,
2322}; 2388};
2323 2389
2324static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) 2390static int handle_exit(struct kvm_vcpu *vcpu)
2325{ 2391{
2326 struct vcpu_svm *svm = to_svm(vcpu); 2392 struct vcpu_svm *svm = to_svm(vcpu);
2393 struct kvm_run *kvm_run = vcpu->run;
2327 u32 exit_code = svm->vmcb->control.exit_code; 2394 u32 exit_code = svm->vmcb->control.exit_code;
2328 2395
2329 trace_kvm_exit(exit_code, svm->vmcb->save.rip); 2396 trace_kvm_exit(exit_code, svm->vmcb->save.rip);
2330 2397
2398 if (unlikely(svm->nested.exit_required)) {
2399 nested_svm_vmexit(svm);
2400 svm->nested.exit_required = false;
2401
2402 return 1;
2403 }
2404
2331 if (is_nested(svm)) { 2405 if (is_nested(svm)) {
2332 int vmexit; 2406 int vmexit;
2333 2407
2334 nsvm_printk("nested handle_exit: 0x%x | 0x%lx | 0x%lx | 0x%lx\n", 2408 trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code,
2335 exit_code, svm->vmcb->control.exit_info_1, 2409 svm->vmcb->control.exit_info_1,
2336 svm->vmcb->control.exit_info_2, svm->vmcb->save.rip); 2410 svm->vmcb->control.exit_info_2,
2411 svm->vmcb->control.exit_int_info,
2412 svm->vmcb->control.exit_int_info_err);
2337 2413
2338 vmexit = nested_svm_exit_special(svm); 2414 vmexit = nested_svm_exit_special(svm);
2339 2415
@@ -2346,20 +2422,10 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
2346 2422
2347 svm_complete_interrupts(svm); 2423 svm_complete_interrupts(svm);
2348 2424
2349 if (npt_enabled) { 2425 if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR0_MASK))
2350 int mmu_reload = 0;
2351 if ((vcpu->arch.cr0 ^ svm->vmcb->save.cr0) & X86_CR0_PG) {
2352 svm_set_cr0(vcpu, svm->vmcb->save.cr0);
2353 mmu_reload = 1;
2354 }
2355 vcpu->arch.cr0 = svm->vmcb->save.cr0; 2426 vcpu->arch.cr0 = svm->vmcb->save.cr0;
2427 if (npt_enabled)
2356 vcpu->arch.cr3 = svm->vmcb->save.cr3; 2428 vcpu->arch.cr3 = svm->vmcb->save.cr3;
2357 if (mmu_reload) {
2358 kvm_mmu_reset_context(vcpu);
2359 kvm_mmu_load(vcpu);
2360 }
2361 }
2362
2363 2429
2364 if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) { 2430 if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
2365 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; 2431 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
@@ -2383,15 +2449,15 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
2383 return 0; 2449 return 0;
2384 } 2450 }
2385 2451
2386 return svm_exit_handlers[exit_code](svm, kvm_run); 2452 return svm_exit_handlers[exit_code](svm);
2387} 2453}
2388 2454
2389static void reload_tss(struct kvm_vcpu *vcpu) 2455static void reload_tss(struct kvm_vcpu *vcpu)
2390{ 2456{
2391 int cpu = raw_smp_processor_id(); 2457 int cpu = raw_smp_processor_id();
2392 2458
2393 struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu); 2459 struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
2394 svm_data->tss_desc->type = 9; /* available 32/64-bit TSS */ 2460 sd->tss_desc->type = 9; /* available 32/64-bit TSS */
2395 load_TR_desc(); 2461 load_TR_desc();
2396} 2462}
2397 2463
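handle_exit() above finishes by indexing svm_exit_handlers[] with the raw SVM exit code and calling the handler with just the vcpu_svm pointer, now that kvm_run is reachable through vcpu->run. A condensed sketch of that dispatch; the bounds/NULL check paraphrases the surrounding handle_exit() code that is not visible in this hunk:

/* Sketch of the exit-code table dispatch at the end of handle_exit(). */
static int dispatch_svm_exit(struct vcpu_svm *svm, u32 exit_code)
{
	if (exit_code >= ARRAY_SIZE(svm_exit_handlers) ||
	    !svm_exit_handlers[exit_code])
		return 0;	/* unknown exit: let userspace see it */

	return svm_exit_handlers[exit_code](svm);
}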
@@ -2399,12 +2465,12 @@ static void pre_svm_run(struct vcpu_svm *svm)
2399{ 2465{
2400 int cpu = raw_smp_processor_id(); 2466 int cpu = raw_smp_processor_id();
2401 2467
2402 struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu); 2468 struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
2403 2469
2404 svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING; 2470 svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
2405 /* FIXME: handle wraparound of asid_generation */ 2471 /* FIXME: handle wraparound of asid_generation */
2406 if (svm->asid_generation != svm_data->asid_generation) 2472 if (svm->asid_generation != sd->asid_generation)
2407 new_asid(svm, svm_data); 2473 new_asid(svm, sd);
2408} 2474}
2409 2475
2410static void svm_inject_nmi(struct kvm_vcpu *vcpu) 2476static void svm_inject_nmi(struct kvm_vcpu *vcpu)
@@ -2413,7 +2479,7 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu)
2413 2479
2414 svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI; 2480 svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
2415 vcpu->arch.hflags |= HF_NMI_MASK; 2481 vcpu->arch.hflags |= HF_NMI_MASK;
2416 svm->vmcb->control.intercept |= (1UL << INTERCEPT_IRET); 2482 svm->vmcb->control.intercept |= (1ULL << INTERCEPT_IRET);
2417 ++vcpu->stat.nmi_injections; 2483 ++vcpu->stat.nmi_injections;
2418} 2484}
2419 2485
@@ -2460,20 +2526,47 @@ static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
2460 !(svm->vcpu.arch.hflags & HF_NMI_MASK); 2526 !(svm->vcpu.arch.hflags & HF_NMI_MASK);
2461} 2527}
2462 2528
2529static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
2530{
2531 struct vcpu_svm *svm = to_svm(vcpu);
2532
2533 return !!(svm->vcpu.arch.hflags & HF_NMI_MASK);
2534}
2535
2536static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
2537{
2538 struct vcpu_svm *svm = to_svm(vcpu);
2539
2540 if (masked) {
2541 svm->vcpu.arch.hflags |= HF_NMI_MASK;
2542 svm->vmcb->control.intercept |= (1ULL << INTERCEPT_IRET);
2543 } else {
2544 svm->vcpu.arch.hflags &= ~HF_NMI_MASK;
2545 svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_IRET);
2546 }
2547}
2548
2463static int svm_interrupt_allowed(struct kvm_vcpu *vcpu) 2549static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
2464{ 2550{
2465 struct vcpu_svm *svm = to_svm(vcpu); 2551 struct vcpu_svm *svm = to_svm(vcpu);
2466 struct vmcb *vmcb = svm->vmcb; 2552 struct vmcb *vmcb = svm->vmcb;
2467 return (vmcb->save.rflags & X86_EFLAGS_IF) && 2553 int ret;
2468 !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) && 2554
2469 gif_set(svm) && 2555 if (!gif_set(svm) ||
2470 !(is_nested(svm) && (svm->vcpu.arch.hflags & HF_VINTR_MASK)); 2556 (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK))
2557 return 0;
2558
2559 ret = !!(vmcb->save.rflags & X86_EFLAGS_IF);
2560
2561 if (is_nested(svm))
2562 return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK);
2563
2564 return ret;
2471} 2565}
2472 2566
2473static void enable_irq_window(struct kvm_vcpu *vcpu) 2567static void enable_irq_window(struct kvm_vcpu *vcpu)
2474{ 2568{
2475 struct vcpu_svm *svm = to_svm(vcpu); 2569 struct vcpu_svm *svm = to_svm(vcpu);
2476 nsvm_printk("Trying to open IRQ window\n");
2477 2570
2478 nested_svm_intr(svm); 2571 nested_svm_intr(svm);
2479 2572
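The new svm_get_nmi_mask()/svm_set_nmi_mask() callbacks expose the NMI-blocked state (HF_NMI_MASK plus the IRET intercept) through kvm_x86_ops, and svm_interrupt_allowed() is rewritten so the GIF and interrupt-shadow checks gate everything before the nested V_INTR mask is consulted. A hedged sketch of how generic code might round-trip the mask through the new hooks; only the two ops come from this patch, the caller shown is illustrative:

/* Sketch: saving and restoring the NMI-masked state via the new hooks. */
static void roundtrip_nmi_mask(struct kvm_vcpu *vcpu)
{
	bool masked = kvm_x86_ops->get_nmi_mask(vcpu);

	/* ... save, migrate or reset other vcpu state here ... */

	kvm_x86_ops->set_nmi_mask(vcpu, masked);
}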
@@ -2498,7 +2591,7 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
 2498 /* Something prevents NMI from being injected. Single step over 2591 /* Something prevents NMI from being injected. Single step over
2499 possible problem (IRET or exception injection or interrupt 2592 possible problem (IRET or exception injection or interrupt
2500 shadow) */ 2593 shadow) */
2501 vcpu->arch.singlestep = true; 2594 svm->nmi_singlestep = true;
2502 svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); 2595 svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
2503 update_db_intercept(vcpu); 2596 update_db_intercept(vcpu);
2504} 2597}
@@ -2588,13 +2681,20 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
2588#define R "e" 2681#define R "e"
2589#endif 2682#endif
2590 2683
2591static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2684static void svm_vcpu_run(struct kvm_vcpu *vcpu)
2592{ 2685{
2593 struct vcpu_svm *svm = to_svm(vcpu); 2686 struct vcpu_svm *svm = to_svm(vcpu);
2594 u16 fs_selector; 2687 u16 fs_selector;
2595 u16 gs_selector; 2688 u16 gs_selector;
2596 u16 ldt_selector; 2689 u16 ldt_selector;
2597 2690
2691 /*
2692 * A vmexit emulation is required before the vcpu can be executed
2693 * again.
2694 */
2695 if (unlikely(svm->nested.exit_required))
2696 return;
2697
2598 svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; 2698 svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
2599 svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; 2699 svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
2600 svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP]; 2700 svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
@@ -2727,12 +2827,6 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
2727 2827
2728 svm->vmcb->save.cr3 = root; 2828 svm->vmcb->save.cr3 = root;
2729 force_new_asid(vcpu); 2829 force_new_asid(vcpu);
2730
2731 if (vcpu->fpu_active) {
2732 svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR);
2733 svm->vmcb->save.cr0 |= X86_CR0_TS;
2734 vcpu->fpu_active = 0;
2735 }
2736} 2830}
2737 2831
2738static int is_disabled(void) 2832static int is_disabled(void)
@@ -2781,6 +2875,10 @@ static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
2781 return 0; 2875 return 0;
2782} 2876}
2783 2877
2878static void svm_cpuid_update(struct kvm_vcpu *vcpu)
2879{
2880}
2881
2784static const struct trace_print_flags svm_exit_reasons_str[] = { 2882static const struct trace_print_flags svm_exit_reasons_str[] = {
2785 { SVM_EXIT_READ_CR0, "read_cr0" }, 2883 { SVM_EXIT_READ_CR0, "read_cr0" },
2786 { SVM_EXIT_READ_CR3, "read_cr3" }, 2884 { SVM_EXIT_READ_CR3, "read_cr3" },
@@ -2834,9 +2932,22 @@ static const struct trace_print_flags svm_exit_reasons_str[] = {
2834 { -1, NULL } 2932 { -1, NULL }
2835}; 2933};
2836 2934
2837static bool svm_gb_page_enable(void) 2935static int svm_get_lpage_level(void)
2838{ 2936{
2839 return true; 2937 return PT_PDPE_LEVEL;
2938}
2939
2940static bool svm_rdtscp_supported(void)
2941{
2942 return false;
2943}
2944
2945static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
2946{
2947 struct vcpu_svm *svm = to_svm(vcpu);
2948
2949 update_cr0_intercept(svm);
2950 svm->vmcb->control.intercept_exceptions |= 1 << NM_VECTOR;
2840} 2951}
2841 2952
2842static struct kvm_x86_ops svm_x86_ops = { 2953static struct kvm_x86_ops svm_x86_ops = {
@@ -2865,6 +2976,7 @@ static struct kvm_x86_ops svm_x86_ops = {
2865 .set_segment = svm_set_segment, 2976 .set_segment = svm_set_segment,
2866 .get_cpl = svm_get_cpl, 2977 .get_cpl = svm_get_cpl,
2867 .get_cs_db_l_bits = kvm_get_cs_db_l_bits, 2978 .get_cs_db_l_bits = kvm_get_cs_db_l_bits,
2979 .decache_cr0_guest_bits = svm_decache_cr0_guest_bits,
2868 .decache_cr4_guest_bits = svm_decache_cr4_guest_bits, 2980 .decache_cr4_guest_bits = svm_decache_cr4_guest_bits,
2869 .set_cr0 = svm_set_cr0, 2981 .set_cr0 = svm_set_cr0,
2870 .set_cr3 = svm_set_cr3, 2982 .set_cr3 = svm_set_cr3,
@@ -2879,6 +2991,8 @@ static struct kvm_x86_ops svm_x86_ops = {
2879 .cache_reg = svm_cache_reg, 2991 .cache_reg = svm_cache_reg,
2880 .get_rflags = svm_get_rflags, 2992 .get_rflags = svm_get_rflags,
2881 .set_rflags = svm_set_rflags, 2993 .set_rflags = svm_set_rflags,
2994 .fpu_activate = svm_fpu_activate,
2995 .fpu_deactivate = svm_fpu_deactivate,
2882 2996
2883 .tlb_flush = svm_flush_tlb, 2997 .tlb_flush = svm_flush_tlb,
2884 2998
@@ -2893,6 +3007,8 @@ static struct kvm_x86_ops svm_x86_ops = {
2893 .queue_exception = svm_queue_exception, 3007 .queue_exception = svm_queue_exception,
2894 .interrupt_allowed = svm_interrupt_allowed, 3008 .interrupt_allowed = svm_interrupt_allowed,
2895 .nmi_allowed = svm_nmi_allowed, 3009 .nmi_allowed = svm_nmi_allowed,
3010 .get_nmi_mask = svm_get_nmi_mask,
3011 .set_nmi_mask = svm_set_nmi_mask,
2896 .enable_nmi_window = enable_nmi_window, 3012 .enable_nmi_window = enable_nmi_window,
2897 .enable_irq_window = enable_irq_window, 3013 .enable_irq_window = enable_irq_window,
2898 .update_cr8_intercept = update_cr8_intercept, 3014 .update_cr8_intercept = update_cr8_intercept,
@@ -2902,7 +3018,11 @@ static struct kvm_x86_ops svm_x86_ops = {
2902 .get_mt_mask = svm_get_mt_mask, 3018 .get_mt_mask = svm_get_mt_mask,
2903 3019
2904 .exit_reasons_str = svm_exit_reasons_str, 3020 .exit_reasons_str = svm_exit_reasons_str,
2905 .gb_page_enable = svm_gb_page_enable, 3021 .get_lpage_level = svm_get_lpage_level,
3022
3023 .cpuid_update = svm_cpuid_update,
3024
3025 .rdtscp_supported = svm_rdtscp_supported,
2906}; 3026};
2907 3027
2908static int __init svm_init(void) 3028static int __init svm_init(void)
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 0d480e77eacf..6ad30a29f044 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -56,6 +56,38 @@ TRACE_EVENT(kvm_hypercall,
56); 56);
57 57
58/* 58/*
59 * Tracepoint for hypercall.
60 */
61TRACE_EVENT(kvm_hv_hypercall,
62 TP_PROTO(__u16 code, bool fast, __u16 rep_cnt, __u16 rep_idx,
63 __u64 ingpa, __u64 outgpa),
64 TP_ARGS(code, fast, rep_cnt, rep_idx, ingpa, outgpa),
65
66 TP_STRUCT__entry(
67 __field( __u16, code )
68 __field( bool, fast )
69 __field( __u16, rep_cnt )
70 __field( __u16, rep_idx )
71 __field( __u64, ingpa )
72 __field( __u64, outgpa )
73 ),
74
75 TP_fast_assign(
76 __entry->code = code;
77 __entry->fast = fast;
78 __entry->rep_cnt = rep_cnt;
79 __entry->rep_idx = rep_idx;
80 __entry->ingpa = ingpa;
81 __entry->outgpa = outgpa;
82 ),
83
84 TP_printk("code 0x%x %s cnt 0x%x idx 0x%x in 0x%llx out 0x%llx",
85 __entry->code, __entry->fast ? "fast" : "slow",
86 __entry->rep_cnt, __entry->rep_idx, __entry->ingpa,
87 __entry->outgpa)
88);
89
90/*
59 * Tracepoint for PIO. 91 * Tracepoint for PIO.
60 */ 92 */
61TRACE_EVENT(kvm_pio, 93TRACE_EVENT(kvm_pio,
@@ -214,28 +246,33 @@ TRACE_EVENT(kvm_page_fault,
214 * Tracepoint for guest MSR access. 246 * Tracepoint for guest MSR access.
215 */ 247 */
216TRACE_EVENT(kvm_msr, 248TRACE_EVENT(kvm_msr,
217 TP_PROTO(unsigned int rw, unsigned int ecx, unsigned long data), 249 TP_PROTO(unsigned write, u32 ecx, u64 data, bool exception),
218 TP_ARGS(rw, ecx, data), 250 TP_ARGS(write, ecx, data, exception),
219 251
220 TP_STRUCT__entry( 252 TP_STRUCT__entry(
221 __field( unsigned int, rw ) 253 __field( unsigned, write )
222 __field( unsigned int, ecx ) 254 __field( u32, ecx )
223 __field( unsigned long, data ) 255 __field( u64, data )
256 __field( u8, exception )
224 ), 257 ),
225 258
226 TP_fast_assign( 259 TP_fast_assign(
227 __entry->rw = rw; 260 __entry->write = write;
228 __entry->ecx = ecx; 261 __entry->ecx = ecx;
229 __entry->data = data; 262 __entry->data = data;
263 __entry->exception = exception;
230 ), 264 ),
231 265
232 TP_printk("msr_%s %x = 0x%lx", 266 TP_printk("msr_%s %x = 0x%llx%s",
233 __entry->rw ? "write" : "read", 267 __entry->write ? "write" : "read",
234 __entry->ecx, __entry->data) 268 __entry->ecx, __entry->data,
269 __entry->exception ? " (#GP)" : "")
235); 270);
236 271
237#define trace_kvm_msr_read(ecx, data) trace_kvm_msr(0, ecx, data) 272#define trace_kvm_msr_read(ecx, data) trace_kvm_msr(0, ecx, data, false)
238#define trace_kvm_msr_write(ecx, data) trace_kvm_msr(1, ecx, data) 273#define trace_kvm_msr_write(ecx, data) trace_kvm_msr(1, ecx, data, false)
274#define trace_kvm_msr_read_ex(ecx) trace_kvm_msr(0, ecx, 0, true)
275#define trace_kvm_msr_write_ex(ecx, data) trace_kvm_msr(1, ecx, data, true)
239 276
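/*
 * Hedged illustration (not part of this hunk): the new *_ex macros are meant
 * for the faulting path of an MSR access, roughly as the VMX rdmsr exit
 * handler is expected to use them. The function name and register accesses
 * below are illustrative assumptions, not taken from this patch.
 */
static int example_handle_rdmsr(struct kvm_vcpu *vcpu)
{
	u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
	u64 data;

	if (vmx_get_msr(vcpu, ecx, &data)) {
		trace_kvm_msr_read_ex(ecx);	/* access faulted: logged with "(#GP)" */
		kvm_inject_gp(vcpu, 0);
		return 1;
	}
	trace_kvm_msr_read(ecx, data);		/* successful read: value is logged */

	vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u;
	vcpu->arch.regs[VCPU_REGS_RDX] = (data >> 32) & -1u;
	skip_emulated_instruction(vcpu);
	return 1;
}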
240/* 277/*
241 * Tracepoint for guest CR access. 278 * Tracepoint for guest CR access.
@@ -349,6 +386,171 @@ TRACE_EVENT(kvm_apic_accept_irq,
349 __entry->coalesced ? " (coalesced)" : "") 386 __entry->coalesced ? " (coalesced)" : "")
350); 387);
351 388
389/*
390 * Tracepoint for nested VMRUN
391 */
392TRACE_EVENT(kvm_nested_vmrun,
393 TP_PROTO(__u64 rip, __u64 vmcb, __u64 nested_rip, __u32 int_ctl,
394 __u32 event_inj, bool npt),
395 TP_ARGS(rip, vmcb, nested_rip, int_ctl, event_inj, npt),
396
397 TP_STRUCT__entry(
398 __field( __u64, rip )
399 __field( __u64, vmcb )
400 __field( __u64, nested_rip )
401 __field( __u32, int_ctl )
402 __field( __u32, event_inj )
403 __field( bool, npt )
404 ),
405
406 TP_fast_assign(
407 __entry->rip = rip;
408 __entry->vmcb = vmcb;
409 __entry->nested_rip = nested_rip;
410 __entry->int_ctl = int_ctl;
411 __entry->event_inj = event_inj;
412 __entry->npt = npt;
413 ),
414
415 TP_printk("rip: 0x%016llx vmcb: 0x%016llx nrip: 0x%016llx int_ctl: 0x%08x "
416 "event_inj: 0x%08x npt: %s\n",
417 __entry->rip, __entry->vmcb, __entry->nested_rip,
418 __entry->int_ctl, __entry->event_inj,
419 __entry->npt ? "on" : "off")
420);
421
422/*
423 * Tracepoint for #VMEXIT while nested
424 */
425TRACE_EVENT(kvm_nested_vmexit,
426 TP_PROTO(__u64 rip, __u32 exit_code,
427 __u64 exit_info1, __u64 exit_info2,
428 __u32 exit_int_info, __u32 exit_int_info_err),
429 TP_ARGS(rip, exit_code, exit_info1, exit_info2,
430 exit_int_info, exit_int_info_err),
431
432 TP_STRUCT__entry(
433 __field( __u64, rip )
434 __field( __u32, exit_code )
435 __field( __u64, exit_info1 )
436 __field( __u64, exit_info2 )
437 __field( __u32, exit_int_info )
438 __field( __u32, exit_int_info_err )
439 ),
440
441 TP_fast_assign(
442 __entry->rip = rip;
443 __entry->exit_code = exit_code;
444 __entry->exit_info1 = exit_info1;
445 __entry->exit_info2 = exit_info2;
446 __entry->exit_int_info = exit_int_info;
447 __entry->exit_int_info_err = exit_int_info_err;
448 ),
449 TP_printk("rip: 0x%016llx reason: %s ext_inf1: 0x%016llx "
450 "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x\n",
451 __entry->rip,
452 ftrace_print_symbols_seq(p, __entry->exit_code,
453 kvm_x86_ops->exit_reasons_str),
454 __entry->exit_info1, __entry->exit_info2,
455 __entry->exit_int_info, __entry->exit_int_info_err)
456);
457
458/*
459 * Tracepoint for #VMEXIT reinjected to the guest
460 */
461TRACE_EVENT(kvm_nested_vmexit_inject,
462 TP_PROTO(__u32 exit_code,
463 __u64 exit_info1, __u64 exit_info2,
464 __u32 exit_int_info, __u32 exit_int_info_err),
465 TP_ARGS(exit_code, exit_info1, exit_info2,
466 exit_int_info, exit_int_info_err),
467
468 TP_STRUCT__entry(
469 __field( __u32, exit_code )
470 __field( __u64, exit_info1 )
471 __field( __u64, exit_info2 )
472 __field( __u32, exit_int_info )
473 __field( __u32, exit_int_info_err )
474 ),
475
476 TP_fast_assign(
477 __entry->exit_code = exit_code;
478 __entry->exit_info1 = exit_info1;
479 __entry->exit_info2 = exit_info2;
480 __entry->exit_int_info = exit_int_info;
481 __entry->exit_int_info_err = exit_int_info_err;
482 ),
483
484 TP_printk("reason: %s ext_inf1: 0x%016llx "
485 "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x\n",
486 ftrace_print_symbols_seq(p, __entry->exit_code,
487 kvm_x86_ops->exit_reasons_str),
488 __entry->exit_info1, __entry->exit_info2,
489 __entry->exit_int_info, __entry->exit_int_info_err)
490);
491
492/*
493 * Tracepoint for nested #vmexit because of interrupt pending
494 */
495TRACE_EVENT(kvm_nested_intr_vmexit,
496 TP_PROTO(__u64 rip),
497 TP_ARGS(rip),
498
499 TP_STRUCT__entry(
500 __field( __u64, rip )
501 ),
502
503 TP_fast_assign(
504 __entry->rip = rip
505 ),
506
507 TP_printk("rip: 0x%016llx\n", __entry->rip)
508);
509
510/*
511 * Tracepoint for INVLPGA
512 */
513TRACE_EVENT(kvm_invlpga,
514 TP_PROTO(__u64 rip, int asid, u64 address),
515 TP_ARGS(rip, asid, address),
516
517 TP_STRUCT__entry(
518 __field( __u64, rip )
519 __field( int, asid )
520 __field( __u64, address )
521 ),
522
523 TP_fast_assign(
524 __entry->rip = rip;
525 __entry->asid = asid;
526 __entry->address = address;
527 ),
528
529 TP_printk("rip: 0x%016llx asid: %d address: 0x%016llx\n",
530 __entry->rip, __entry->asid, __entry->address)
531);
532
533/*
534 * Tracepoint for SKINIT
535 */
536TRACE_EVENT(kvm_skinit,
537 TP_PROTO(__u64 rip, __u32 slb),
538 TP_ARGS(rip, slb),
539
540 TP_STRUCT__entry(
541 __field( __u64, rip )
542 __field( __u32, slb )
543 ),
544
545 TP_fast_assign(
546 __entry->rip = rip;
547 __entry->slb = slb;
548 ),
549
550 TP_printk("rip: 0x%016llx slb: 0x%08x\n",
551 __entry->rip, __entry->slb)
552);
553
352#endif /* _TRACE_KVM_H */ 554#endif /* _TRACE_KVM_H */
353 555
354/* This part must be outside protection */ 556/* This part must be outside protection */
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ed53b42caba1..2f8db0ec8ae4 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -26,6 +26,7 @@
26#include <linux/sched.h> 26#include <linux/sched.h>
27#include <linux/moduleparam.h> 27#include <linux/moduleparam.h>
28#include <linux/ftrace_event.h> 28#include <linux/ftrace_event.h>
29#include <linux/slab.h>
29#include "kvm_cache_regs.h" 30#include "kvm_cache_regs.h"
30#include "x86.h" 31#include "x86.h"
31 32
@@ -61,12 +62,54 @@ module_param_named(unrestricted_guest,
61static int __read_mostly emulate_invalid_guest_state = 0; 62static int __read_mostly emulate_invalid_guest_state = 0;
62module_param(emulate_invalid_guest_state, bool, S_IRUGO); 63module_param(emulate_invalid_guest_state, bool, S_IRUGO);
63 64
65#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \
66 (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
67#define KVM_GUEST_CR0_MASK \
68 (KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
69#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST \
70 (X86_CR0_WP | X86_CR0_NE)
71#define KVM_VM_CR0_ALWAYS_ON \
72 (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
73#define KVM_CR4_GUEST_OWNED_BITS \
74 (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \
75 | X86_CR4_OSXMMEXCPT)
76
77#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
78#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
79
80#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))
81
82/*
83 * These 2 parameters are used to config the controls for Pause-Loop Exiting:
84 * ple_gap: upper bound on the amount of time between two successive
85 * executions of PAUSE in a loop. Also indicate if ple enabled.
86 * According to test, this time is usually small than 41 cycles.
87 * ple_window: upper bound on the amount of time a guest is allowed to execute
88 * in a PAUSE loop. Tests indicate that most spinlocks are held for
89 * less than 2^12 cycles
90 * Time is measured based on a counter that runs at the same rate as the TSC,
91 * refer SDM volume 3b section 21.6.13 & 22.1.3.
92 */
93#define KVM_VMX_DEFAULT_PLE_GAP 41
94#define KVM_VMX_DEFAULT_PLE_WINDOW 4096
95static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP;
96module_param(ple_gap, int, S_IRUGO);
97
98static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
99module_param(ple_window, int, S_IRUGO);
100
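/*
 * Hedged usage note (not part of the patch text): since both knobs are plain
 * module parameters, they can be tuned when the module is loaded, e.g.
 *
 *	modprobe kvm-intel ple_gap=128 ple_window=8192
 *
 * and ple_gap=0 is assumed to disable Pause-Loop Exiting altogether, which
 * matches the way later hunks clear SECONDARY_EXEC_PAUSE_LOOP_EXITING and
 * only program the VMCS when ple_gap is non-zero:
 *
 *	if (ple_gap) {
 *		vmcs_write32(PLE_GAP, ple_gap);
 *		vmcs_write32(PLE_WINDOW, ple_window);
 *	}
 */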
64struct vmcs { 101struct vmcs {
65 u32 revision_id; 102 u32 revision_id;
66 u32 abort; 103 u32 abort;
67 char data[0]; 104 char data[0];
68}; 105};
69 106
107struct shared_msr_entry {
108 unsigned index;
109 u64 data;
110 u64 mask;
111};
112
70struct vcpu_vmx { 113struct vcpu_vmx {
71 struct kvm_vcpu vcpu; 114 struct kvm_vcpu vcpu;
72 struct list_head local_vcpus_link; 115 struct list_head local_vcpus_link;
@@ -74,13 +117,12 @@ struct vcpu_vmx {
74 int launched; 117 int launched;
75 u8 fail; 118 u8 fail;
76 u32 idt_vectoring_info; 119 u32 idt_vectoring_info;
77 struct kvm_msr_entry *guest_msrs; 120 struct shared_msr_entry *guest_msrs;
78 struct kvm_msr_entry *host_msrs;
79 int nmsrs; 121 int nmsrs;
80 int save_nmsrs; 122 int save_nmsrs;
81 int msr_offset_efer;
82#ifdef CONFIG_X86_64 123#ifdef CONFIG_X86_64
83 int msr_offset_kernel_gs_base; 124 u64 msr_host_kernel_gs_base;
125 u64 msr_guest_kernel_gs_base;
84#endif 126#endif
85 struct vmcs *vmcs; 127 struct vmcs *vmcs;
86 struct { 128 struct {
@@ -88,11 +130,10 @@ struct vcpu_vmx {
88 u16 fs_sel, gs_sel, ldt_sel; 130 u16 fs_sel, gs_sel, ldt_sel;
89 int gs_ldt_reload_needed; 131 int gs_ldt_reload_needed;
90 int fs_reload_needed; 132 int fs_reload_needed;
91 int guest_efer_loaded;
92 } host_state; 133 } host_state;
93 struct { 134 struct {
94 int vm86_active; 135 int vm86_active;
95 u8 save_iopl; 136 ulong save_rflags;
96 struct kvm_save_segment { 137 struct kvm_save_segment {
97 u16 selector; 138 u16 selector;
98 unsigned long base; 139 unsigned long base;
@@ -107,13 +148,14 @@ struct vcpu_vmx {
107 } rmode; 148 } rmode;
108 int vpid; 149 int vpid;
109 bool emulation_required; 150 bool emulation_required;
110 enum emulation_result invalid_state_emulation_result;
111 151
112 /* Support for vnmi-less CPUs */ 152 /* Support for vnmi-less CPUs */
113 int soft_vnmi_blocked; 153 int soft_vnmi_blocked;
114 ktime_t entry_time; 154 ktime_t entry_time;
115 s64 vnmi_blocked_time; 155 s64 vnmi_blocked_time;
116 u32 exit_reason; 156 u32 exit_reason;
157
158 bool rdtscp_enabled;
117}; 159};
118 160
119static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) 161static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
@@ -176,6 +218,8 @@ static struct kvm_vmx_segment_field {
176 VMX_SEGMENT_FIELD(LDTR), 218 VMX_SEGMENT_FIELD(LDTR),
177}; 219};
178 220
221static u64 host_efer;
222
179static void ept_save_pdptrs(struct kvm_vcpu *vcpu); 223static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
180 224
181/* 225/*
@@ -184,28 +228,12 @@ static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
184 */ 228 */
185static const u32 vmx_msr_index[] = { 229static const u32 vmx_msr_index[] = {
186#ifdef CONFIG_X86_64 230#ifdef CONFIG_X86_64
187 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, MSR_KERNEL_GS_BASE, 231 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
188#endif 232#endif
189 MSR_EFER, MSR_K6_STAR, 233 MSR_EFER, MSR_TSC_AUX, MSR_K6_STAR,
190}; 234};
191#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index) 235#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
192 236
193static void load_msrs(struct kvm_msr_entry *e, int n)
194{
195 int i;
196
197 for (i = 0; i < n; ++i)
198 wrmsrl(e[i].index, e[i].data);
199}
200
201static void save_msrs(struct kvm_msr_entry *e, int n)
202{
203 int i;
204
205 for (i = 0; i < n; ++i)
206 rdmsrl(e[i].index, e[i].data);
207}
208
209static inline int is_page_fault(u32 intr_info) 237static inline int is_page_fault(u32 intr_info)
210{ 238{
211 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | 239 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
@@ -293,6 +321,11 @@ static inline bool cpu_has_vmx_ept_2m_page(void)
293 return !!(vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT); 321 return !!(vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT);
294} 322}
295 323
324static inline bool cpu_has_vmx_ept_1g_page(void)
325{
326 return !!(vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT);
327}
328
296static inline int cpu_has_vmx_invept_individual_addr(void) 329static inline int cpu_has_vmx_invept_individual_addr(void)
297{ 330{
298 return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT); 331 return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT);
@@ -320,11 +353,15 @@ static inline int cpu_has_vmx_unrestricted_guest(void)
320 SECONDARY_EXEC_UNRESTRICTED_GUEST; 353 SECONDARY_EXEC_UNRESTRICTED_GUEST;
321} 354}
322 355
356static inline int cpu_has_vmx_ple(void)
357{
358 return vmcs_config.cpu_based_2nd_exec_ctrl &
359 SECONDARY_EXEC_PAUSE_LOOP_EXITING;
360}
361
323static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm) 362static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
324{ 363{
325 return flexpriority_enabled && 364 return flexpriority_enabled && irqchip_in_kernel(kvm);
326 (cpu_has_vmx_virtualize_apic_accesses()) &&
327 (irqchip_in_kernel(kvm));
328} 365}
329 366
330static inline int cpu_has_vmx_vpid(void) 367static inline int cpu_has_vmx_vpid(void)
@@ -333,6 +370,12 @@ static inline int cpu_has_vmx_vpid(void)
333 SECONDARY_EXEC_ENABLE_VPID; 370 SECONDARY_EXEC_ENABLE_VPID;
334} 371}
335 372
373static inline int cpu_has_vmx_rdtscp(void)
374{
375 return vmcs_config.cpu_based_2nd_exec_ctrl &
376 SECONDARY_EXEC_RDTSCP;
377}
378
336static inline int cpu_has_virtual_nmis(void) 379static inline int cpu_has_virtual_nmis(void)
337{ 380{
338 return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; 381 return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
@@ -348,7 +391,7 @@ static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
348 int i; 391 int i;
349 392
350 for (i = 0; i < vmx->nmsrs; ++i) 393 for (i = 0; i < vmx->nmsrs; ++i)
351 if (vmx->guest_msrs[i].index == msr) 394 if (vmx_msr_index[vmx->guest_msrs[i].index] == msr)
352 return i; 395 return i;
353 return -1; 396 return -1;
354} 397}
@@ -379,7 +422,7 @@ static inline void __invept(int ext, u64 eptp, gpa_t gpa)
379 : : "a" (&operand), "c" (ext) : "cc", "memory"); 422 : : "a" (&operand), "c" (ext) : "cc", "memory");
380} 423}
381 424
382static struct kvm_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) 425static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
383{ 426{
384 int i; 427 int i;
385 428
@@ -537,22 +580,18 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
537{ 580{
538 u32 eb; 581 u32 eb;
539 582
540 eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR); 583 eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
541 if (!vcpu->fpu_active) 584 (1u << NM_VECTOR) | (1u << DB_VECTOR);
542 eb |= 1u << NM_VECTOR; 585 if ((vcpu->guest_debug &
543 /* 586 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
544 * Unconditionally intercept #DB so we can maintain dr6 without 587 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
545 * reading it every exit. 588 eb |= 1u << BP_VECTOR;
546 */
547 eb |= 1u << DB_VECTOR;
548 if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
549 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
550 eb |= 1u << BP_VECTOR;
551 }
552 if (to_vmx(vcpu)->rmode.vm86_active) 589 if (to_vmx(vcpu)->rmode.vm86_active)
553 eb = ~0; 590 eb = ~0;
554 if (enable_ept) 591 if (enable_ept)
555 eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */ 592 eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
593 if (vcpu->fpu_active)
594 eb &= ~(1u << NM_VECTOR);
556 vmcs_write32(EXCEPTION_BITMAP, eb); 595 vmcs_write32(EXCEPTION_BITMAP, eb);
557} 596}
558 597
@@ -570,17 +609,12 @@ static void reload_tss(void)
570 load_TR_desc(); 609 load_TR_desc();
571} 610}
572 611
573static void load_transition_efer(struct vcpu_vmx *vmx) 612static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
574{ 613{
575 int efer_offset = vmx->msr_offset_efer;
576 u64 host_efer;
577 u64 guest_efer; 614 u64 guest_efer;
578 u64 ignore_bits; 615 u64 ignore_bits;
579 616
580 if (efer_offset < 0) 617 guest_efer = vmx->vcpu.arch.efer;
581 return;
582 host_efer = vmx->host_msrs[efer_offset].data;
583 guest_efer = vmx->guest_msrs[efer_offset].data;
584 618
585 /* 619 /*
586 * NX is emulated; LMA and LME handled by hardware; SCE meaningless 620 * NX is emulated; LMA and LME handled by hardware; SCE meaningless
@@ -593,27 +627,17 @@ static void load_transition_efer(struct vcpu_vmx *vmx)
593 if (guest_efer & EFER_LMA) 627 if (guest_efer & EFER_LMA)
594 ignore_bits &= ~(u64)EFER_SCE; 628 ignore_bits &= ~(u64)EFER_SCE;
595#endif 629#endif
596 if ((guest_efer & ~ignore_bits) == (host_efer & ~ignore_bits))
597 return;
598
599 vmx->host_state.guest_efer_loaded = 1;
600 guest_efer &= ~ignore_bits; 630 guest_efer &= ~ignore_bits;
601 guest_efer |= host_efer & ignore_bits; 631 guest_efer |= host_efer & ignore_bits;
602 wrmsrl(MSR_EFER, guest_efer); 632 vmx->guest_msrs[efer_offset].data = guest_efer;
603 vmx->vcpu.stat.efer_reload++; 633 vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
604} 634 return true;
605
606static void reload_host_efer(struct vcpu_vmx *vmx)
607{
608 if (vmx->host_state.guest_efer_loaded) {
609 vmx->host_state.guest_efer_loaded = 0;
610 load_msrs(vmx->host_msrs + vmx->msr_offset_efer, 1);
611 }
612} 635}
613 636
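/*
 * Worked example (illustrative, not part of the patch): for a 32-bit guest
 * with NX emulated and SCE irrelevant, ignore_bits = EFER_NX | EFER_SCE, so
 * update_transition_efer() records
 *
 *	data = (guest_efer & ~ignore_bits) | (host_efer & ignore_bits)
 *	mask = ~ignore_bits
 *
 * The shared-MSR machinery is then assumed to combine this mask with the
 * host value and skip the WRMSR entirely when nothing outside ignore_bits
 * differs, which is what makes keeping EFER in the guest_msrs array cheaper
 * than switching it unconditionally on every vcpu load.
 */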
614static void vmx_save_host_state(struct kvm_vcpu *vcpu) 637static void vmx_save_host_state(struct kvm_vcpu *vcpu)
615{ 638{
616 struct vcpu_vmx *vmx = to_vmx(vcpu); 639 struct vcpu_vmx *vmx = to_vmx(vcpu);
640 int i;
617 641
618 if (vmx->host_state.loaded) 642 if (vmx->host_state.loaded)
619 return; 643 return;
@@ -650,13 +674,15 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
650#endif 674#endif
651 675
652#ifdef CONFIG_X86_64 676#ifdef CONFIG_X86_64
653 if (is_long_mode(&vmx->vcpu)) 677 if (is_long_mode(&vmx->vcpu)) {
654 save_msrs(vmx->host_msrs + 678 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
655 vmx->msr_offset_kernel_gs_base, 1); 679 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
656 680 }
657#endif 681#endif
658 load_msrs(vmx->guest_msrs, vmx->save_nmsrs); 682 for (i = 0; i < vmx->save_nmsrs; ++i)
659 load_transition_efer(vmx); 683 kvm_set_shared_msr(vmx->guest_msrs[i].index,
684 vmx->guest_msrs[i].data,
685 vmx->guest_msrs[i].mask);
660} 686}
661 687
662static void __vmx_load_host_state(struct vcpu_vmx *vmx) 688static void __vmx_load_host_state(struct vcpu_vmx *vmx)
@@ -684,9 +710,12 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
684 local_irq_restore(flags); 710 local_irq_restore(flags);
685 } 711 }
686 reload_tss(); 712 reload_tss();
687 save_msrs(vmx->guest_msrs, vmx->save_nmsrs); 713#ifdef CONFIG_X86_64
688 load_msrs(vmx->host_msrs, vmx->save_nmsrs); 714 if (is_long_mode(&vmx->vcpu)) {
689 reload_host_efer(vmx); 715 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
716 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
717 }
718#endif
690} 719}
691 720
692static void vmx_load_host_state(struct vcpu_vmx *vmx) 721static void vmx_load_host_state(struct vcpu_vmx *vmx)
@@ -763,38 +792,51 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
763 792
764static void vmx_fpu_activate(struct kvm_vcpu *vcpu) 793static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
765{ 794{
795 ulong cr0;
796
766 if (vcpu->fpu_active) 797 if (vcpu->fpu_active)
767 return; 798 return;
768 vcpu->fpu_active = 1; 799 vcpu->fpu_active = 1;
769 vmcs_clear_bits(GUEST_CR0, X86_CR0_TS); 800 cr0 = vmcs_readl(GUEST_CR0);
770 if (vcpu->arch.cr0 & X86_CR0_TS) 801 cr0 &= ~(X86_CR0_TS | X86_CR0_MP);
771 vmcs_set_bits(GUEST_CR0, X86_CR0_TS); 802 cr0 |= kvm_read_cr0_bits(vcpu, X86_CR0_TS | X86_CR0_MP);
803 vmcs_writel(GUEST_CR0, cr0);
772 update_exception_bitmap(vcpu); 804 update_exception_bitmap(vcpu);
805 vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
806 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
773} 807}
774 808
809static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);
810
775static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu) 811static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
776{ 812{
777 if (!vcpu->fpu_active) 813 vmx_decache_cr0_guest_bits(vcpu);
778 return; 814 vmcs_set_bits(GUEST_CR0, X86_CR0_TS | X86_CR0_MP);
779 vcpu->fpu_active = 0;
780 vmcs_set_bits(GUEST_CR0, X86_CR0_TS);
781 update_exception_bitmap(vcpu); 815 update_exception_bitmap(vcpu);
816 vcpu->arch.cr0_guest_owned_bits = 0;
817 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
818 vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
782} 819}
783 820
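/*
 * Hedged sketch (assumption; the helper itself is not shown in this hunk):
 * kvm_read_cr0_bits(), used in vmx_fpu_activate() above, presumably lives in
 * kvm_cache_regs.h and refreshes any guest-owned CR0 bits from the VMCS
 * before handing them back, along these lines:
 */
static inline unsigned long example_read_cr0_bits(struct kvm_vcpu *vcpu,
						  unsigned long mask)
{
	if (mask & vcpu->arch.cr0_guest_owned_bits)
		/* pull the live bits back out of GUEST_CR0, cf. the decache hook */
		kvm_x86_ops->decache_cr0_guest_bits(vcpu);
	return vcpu->arch.cr0 & mask;
}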
784static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) 821static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
785{ 822{
786 unsigned long rflags; 823 unsigned long rflags, save_rflags;
787 824
788 rflags = vmcs_readl(GUEST_RFLAGS); 825 rflags = vmcs_readl(GUEST_RFLAGS);
789 if (to_vmx(vcpu)->rmode.vm86_active) 826 if (to_vmx(vcpu)->rmode.vm86_active) {
790 rflags &= ~(unsigned long)(X86_EFLAGS_IOPL | X86_EFLAGS_VM); 827 rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
828 save_rflags = to_vmx(vcpu)->rmode.save_rflags;
829 rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
830 }
791 return rflags; 831 return rflags;
792} 832}
793 833
794static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) 834static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
795{ 835{
796 if (to_vmx(vcpu)->rmode.vm86_active) 836 if (to_vmx(vcpu)->rmode.vm86_active) {
837 to_vmx(vcpu)->rmode.save_rflags = rflags;
797 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; 838 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
839 }
798 vmcs_writel(GUEST_RFLAGS, rflags); 840 vmcs_writel(GUEST_RFLAGS, rflags);
799} 841}
800 842
@@ -874,22 +916,22 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
874 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); 916 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
875} 917}
876 918
919static bool vmx_rdtscp_supported(void)
920{
921 return cpu_has_vmx_rdtscp();
922}
923
877/* 924/*
878 * Swap MSR entry in host/guest MSR entry array. 925 * Swap MSR entry in host/guest MSR entry array.
879 */ 926 */
880#ifdef CONFIG_X86_64
881static void move_msr_up(struct vcpu_vmx *vmx, int from, int to) 927static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
882{ 928{
883 struct kvm_msr_entry tmp; 929 struct shared_msr_entry tmp;
884 930
885 tmp = vmx->guest_msrs[to]; 931 tmp = vmx->guest_msrs[to];
886 vmx->guest_msrs[to] = vmx->guest_msrs[from]; 932 vmx->guest_msrs[to] = vmx->guest_msrs[from];
887 vmx->guest_msrs[from] = tmp; 933 vmx->guest_msrs[from] = tmp;
888 tmp = vmx->host_msrs[to];
889 vmx->host_msrs[to] = vmx->host_msrs[from];
890 vmx->host_msrs[from] = tmp;
891} 934}
892#endif
893 935
894/* 936/*
895 * Set up the vmcs to automatically save and restore system 937 * Set up the vmcs to automatically save and restore system
@@ -898,15 +940,13 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
898 */ 940 */
899static void setup_msrs(struct vcpu_vmx *vmx) 941static void setup_msrs(struct vcpu_vmx *vmx)
900{ 942{
901 int save_nmsrs; 943 int save_nmsrs, index;
902 unsigned long *msr_bitmap; 944 unsigned long *msr_bitmap;
903 945
904 vmx_load_host_state(vmx); 946 vmx_load_host_state(vmx);
905 save_nmsrs = 0; 947 save_nmsrs = 0;
906#ifdef CONFIG_X86_64 948#ifdef CONFIG_X86_64
907 if (is_long_mode(&vmx->vcpu)) { 949 if (is_long_mode(&vmx->vcpu)) {
908 int index;
909
910 index = __find_msr_index(vmx, MSR_SYSCALL_MASK); 950 index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
911 if (index >= 0) 951 if (index >= 0)
912 move_msr_up(vmx, index, save_nmsrs++); 952 move_msr_up(vmx, index, save_nmsrs++);
@@ -916,25 +956,23 @@ static void setup_msrs(struct vcpu_vmx *vmx)
916 index = __find_msr_index(vmx, MSR_CSTAR); 956 index = __find_msr_index(vmx, MSR_CSTAR);
917 if (index >= 0) 957 if (index >= 0)
918 move_msr_up(vmx, index, save_nmsrs++); 958 move_msr_up(vmx, index, save_nmsrs++);
919 index = __find_msr_index(vmx, MSR_KERNEL_GS_BASE); 959 index = __find_msr_index(vmx, MSR_TSC_AUX);
920 if (index >= 0) 960 if (index >= 0 && vmx->rdtscp_enabled)
921 move_msr_up(vmx, index, save_nmsrs++); 961 move_msr_up(vmx, index, save_nmsrs++);
922 /* 962 /*
923 * MSR_K6_STAR is only needed on long mode guests, and only 963 * MSR_K6_STAR is only needed on long mode guests, and only
924 * if efer.sce is enabled. 964 * if efer.sce is enabled.
925 */ 965 */
926 index = __find_msr_index(vmx, MSR_K6_STAR); 966 index = __find_msr_index(vmx, MSR_K6_STAR);
927 if ((index >= 0) && (vmx->vcpu.arch.shadow_efer & EFER_SCE)) 967 if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE))
928 move_msr_up(vmx, index, save_nmsrs++); 968 move_msr_up(vmx, index, save_nmsrs++);
929 } 969 }
930#endif 970#endif
931 vmx->save_nmsrs = save_nmsrs; 971 index = __find_msr_index(vmx, MSR_EFER);
972 if (index >= 0 && update_transition_efer(vmx, index))
973 move_msr_up(vmx, index, save_nmsrs++);
932 974
933#ifdef CONFIG_X86_64 975 vmx->save_nmsrs = save_nmsrs;
934 vmx->msr_offset_kernel_gs_base =
935 __find_msr_index(vmx, MSR_KERNEL_GS_BASE);
936#endif
937 vmx->msr_offset_efer = __find_msr_index(vmx, MSR_EFER);
938 976
939 if (cpu_has_vmx_msr_bitmap()) { 977 if (cpu_has_vmx_msr_bitmap()) {
940 if (is_long_mode(&vmx->vcpu)) 978 if (is_long_mode(&vmx->vcpu))
@@ -976,7 +1014,7 @@ static void guest_write_tsc(u64 guest_tsc, u64 host_tsc)
976static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) 1014static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
977{ 1015{
978 u64 data; 1016 u64 data;
979 struct kvm_msr_entry *msr; 1017 struct shared_msr_entry *msr;
980 1018
981 if (!pdata) { 1019 if (!pdata) {
982 printk(KERN_ERR "BUG: get_msr called with NULL pdata\n"); 1020 printk(KERN_ERR "BUG: get_msr called with NULL pdata\n");
@@ -991,9 +1029,13 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
991 case MSR_GS_BASE: 1029 case MSR_GS_BASE:
992 data = vmcs_readl(GUEST_GS_BASE); 1030 data = vmcs_readl(GUEST_GS_BASE);
993 break; 1031 break;
1032 case MSR_KERNEL_GS_BASE:
1033 vmx_load_host_state(to_vmx(vcpu));
1034 data = to_vmx(vcpu)->msr_guest_kernel_gs_base;
1035 break;
1036#endif
994 case MSR_EFER: 1037 case MSR_EFER:
995 return kvm_get_msr_common(vcpu, msr_index, pdata); 1038 return kvm_get_msr_common(vcpu, msr_index, pdata);
996#endif
997 case MSR_IA32_TSC: 1039 case MSR_IA32_TSC:
998 data = guest_read_tsc(); 1040 data = guest_read_tsc();
999 break; 1041 break;
@@ -1006,7 +1048,12 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
1006 case MSR_IA32_SYSENTER_ESP: 1048 case MSR_IA32_SYSENTER_ESP:
1007 data = vmcs_readl(GUEST_SYSENTER_ESP); 1049 data = vmcs_readl(GUEST_SYSENTER_ESP);
1008 break; 1050 break;
1051 case MSR_TSC_AUX:
1052 if (!to_vmx(vcpu)->rdtscp_enabled)
1053 return 1;
1054 /* Otherwise falls through */
1009 default: 1055 default:
1056 vmx_load_host_state(to_vmx(vcpu));
1010 msr = find_msr_entry(to_vmx(vcpu), msr_index); 1057 msr = find_msr_entry(to_vmx(vcpu), msr_index);
1011 if (msr) { 1058 if (msr) {
1012 vmx_load_host_state(to_vmx(vcpu)); 1059 vmx_load_host_state(to_vmx(vcpu));
@@ -1028,7 +1075,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
1028static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) 1075static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1029{ 1076{
1030 struct vcpu_vmx *vmx = to_vmx(vcpu); 1077 struct vcpu_vmx *vmx = to_vmx(vcpu);
1031 struct kvm_msr_entry *msr; 1078 struct shared_msr_entry *msr;
1032 u64 host_tsc; 1079 u64 host_tsc;
1033 int ret = 0; 1080 int ret = 0;
1034 1081
@@ -1044,6 +1091,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1044 case MSR_GS_BASE: 1091 case MSR_GS_BASE:
1045 vmcs_writel(GUEST_GS_BASE, data); 1092 vmcs_writel(GUEST_GS_BASE, data);
1046 break; 1093 break;
1094 case MSR_KERNEL_GS_BASE:
1095 vmx_load_host_state(vmx);
1096 vmx->msr_guest_kernel_gs_base = data;
1097 break;
1047#endif 1098#endif
1048 case MSR_IA32_SYSENTER_CS: 1099 case MSR_IA32_SYSENTER_CS:
1049 vmcs_write32(GUEST_SYSENTER_CS, data); 1100 vmcs_write32(GUEST_SYSENTER_CS, data);
@@ -1064,7 +1115,15 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1064 vcpu->arch.pat = data; 1115 vcpu->arch.pat = data;
1065 break; 1116 break;
1066 } 1117 }
1067 /* Otherwise falls through to kvm_set_msr_common */ 1118 ret = kvm_set_msr_common(vcpu, msr_index, data);
1119 break;
1120 case MSR_TSC_AUX:
1121 if (!vmx->rdtscp_enabled)
1122 return 1;
1123 /* Check reserved bit, higher 32 bits should be zero */
1124 if ((data >> 32) != 0)
1125 return 1;
1126 /* Otherwise falls through */
1068 default: 1127 default:
1069 msr = find_msr_entry(vmx, msr_index); 1128 msr = find_msr_entry(vmx, msr_index);
1070 if (msr) { 1129 if (msr) {
@@ -1097,30 +1156,14 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
1097 } 1156 }
1098} 1157}
1099 1158
1100static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) 1159static void set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
1101{ 1160{
1102 int old_debug = vcpu->guest_debug;
1103 unsigned long flags;
1104
1105 vcpu->guest_debug = dbg->control;
1106 if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE))
1107 vcpu->guest_debug = 0;
1108
1109 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) 1161 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
1110 vmcs_writel(GUEST_DR7, dbg->arch.debugreg[7]); 1162 vmcs_writel(GUEST_DR7, dbg->arch.debugreg[7]);
1111 else 1163 else
1112 vmcs_writel(GUEST_DR7, vcpu->arch.dr7); 1164 vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
1113 1165
1114 flags = vmcs_readl(GUEST_RFLAGS);
1115 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
1116 flags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
1117 else if (old_debug & KVM_GUESTDBG_SINGLESTEP)
1118 flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
1119 vmcs_writel(GUEST_RFLAGS, flags);
1120
1121 update_exception_bitmap(vcpu); 1166 update_exception_bitmap(vcpu);
1122
1123 return 0;
1124} 1167}
1125 1168
1126static __init int cpu_has_kvm_support(void) 1169static __init int cpu_has_kvm_support(void)
@@ -1139,12 +1182,15 @@ static __init int vmx_disabled_by_bios(void)
1139 /* locked but not enabled */ 1182 /* locked but not enabled */
1140} 1183}
1141 1184
1142static void hardware_enable(void *garbage) 1185static int hardware_enable(void *garbage)
1143{ 1186{
1144 int cpu = raw_smp_processor_id(); 1187 int cpu = raw_smp_processor_id();
1145 u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); 1188 u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
1146 u64 old; 1189 u64 old;
1147 1190
1191 if (read_cr4() & X86_CR4_VMXE)
1192 return -EBUSY;
1193
1148 INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu)); 1194 INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu));
1149 rdmsrl(MSR_IA32_FEATURE_CONTROL, old); 1195 rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
1150 if ((old & (FEATURE_CONTROL_LOCKED | 1196 if ((old & (FEATURE_CONTROL_LOCKED |
@@ -1159,6 +1205,10 @@ static void hardware_enable(void *garbage)
1159 asm volatile (ASM_VMX_VMXON_RAX 1205 asm volatile (ASM_VMX_VMXON_RAX
1160 : : "a"(&phys_addr), "m"(phys_addr) 1206 : : "a"(&phys_addr), "m"(phys_addr)
1161 : "memory", "cc"); 1207 : "memory", "cc");
1208
1209 ept_sync_global();
1210
1211 return 0;
1162} 1212}
1163 1213
1164static void vmclear_local_vcpus(void) 1214static void vmclear_local_vcpus(void)
@@ -1232,6 +1282,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
1232 CPU_BASED_USE_IO_BITMAPS | 1282 CPU_BASED_USE_IO_BITMAPS |
1233 CPU_BASED_MOV_DR_EXITING | 1283 CPU_BASED_MOV_DR_EXITING |
1234 CPU_BASED_USE_TSC_OFFSETING | 1284 CPU_BASED_USE_TSC_OFFSETING |
1285 CPU_BASED_MWAIT_EXITING |
1286 CPU_BASED_MONITOR_EXITING |
1235 CPU_BASED_INVLPG_EXITING; 1287 CPU_BASED_INVLPG_EXITING;
1236 opt = CPU_BASED_TPR_SHADOW | 1288 opt = CPU_BASED_TPR_SHADOW |
1237 CPU_BASED_USE_MSR_BITMAPS | 1289 CPU_BASED_USE_MSR_BITMAPS |
@@ -1250,7 +1302,9 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
1250 SECONDARY_EXEC_WBINVD_EXITING | 1302 SECONDARY_EXEC_WBINVD_EXITING |
1251 SECONDARY_EXEC_ENABLE_VPID | 1303 SECONDARY_EXEC_ENABLE_VPID |
1252 SECONDARY_EXEC_ENABLE_EPT | 1304 SECONDARY_EXEC_ENABLE_EPT |
1253 SECONDARY_EXEC_UNRESTRICTED_GUEST; 1305 SECONDARY_EXEC_UNRESTRICTED_GUEST |
1306 SECONDARY_EXEC_PAUSE_LOOP_EXITING |
1307 SECONDARY_EXEC_RDTSCP;
1254 if (adjust_vmx_controls(min2, opt2, 1308 if (adjust_vmx_controls(min2, opt2,
1255 MSR_IA32_VMX_PROCBASED_CTLS2, 1309 MSR_IA32_VMX_PROCBASED_CTLS2,
1256 &_cpu_based_2nd_exec_control) < 0) 1310 &_cpu_based_2nd_exec_control) < 0)
@@ -1344,15 +1398,17 @@ static void free_kvm_area(void)
1344{ 1398{
1345 int cpu; 1399 int cpu;
1346 1400
1347 for_each_online_cpu(cpu) 1401 for_each_possible_cpu(cpu) {
1348 free_vmcs(per_cpu(vmxarea, cpu)); 1402 free_vmcs(per_cpu(vmxarea, cpu));
1403 per_cpu(vmxarea, cpu) = NULL;
1404 }
1349} 1405}
1350 1406
1351static __init int alloc_kvm_area(void) 1407static __init int alloc_kvm_area(void)
1352{ 1408{
1353 int cpu; 1409 int cpu;
1354 1410
1355 for_each_online_cpu(cpu) { 1411 for_each_possible_cpu(cpu) {
1356 struct vmcs *vmcs; 1412 struct vmcs *vmcs;
1357 1413
1358 vmcs = alloc_vmcs_cpu(cpu); 1414 vmcs = alloc_vmcs_cpu(cpu);
@@ -1394,6 +1450,9 @@ static __init int hardware_setup(void)
1394 if (enable_ept && !cpu_has_vmx_ept_2m_page()) 1450 if (enable_ept && !cpu_has_vmx_ept_2m_page())
1395 kvm_disable_largepages(); 1451 kvm_disable_largepages();
1396 1452
1453 if (!cpu_has_vmx_ple())
1454 ple_gap = 0;
1455
1397 return alloc_kvm_area(); 1456 return alloc_kvm_area();
1398} 1457}
1399 1458
@@ -1431,8 +1490,8 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
1431 vmcs_write32(GUEST_TR_AR_BYTES, vmx->rmode.tr.ar); 1490 vmcs_write32(GUEST_TR_AR_BYTES, vmx->rmode.tr.ar);
1432 1491
1433 flags = vmcs_readl(GUEST_RFLAGS); 1492 flags = vmcs_readl(GUEST_RFLAGS);
1434 flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM); 1493 flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
1435 flags |= (vmx->rmode.save_iopl << IOPL_SHIFT); 1494 flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
1436 vmcs_writel(GUEST_RFLAGS, flags); 1495 vmcs_writel(GUEST_RFLAGS, flags);
1437 1496
1438 vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) | 1497 vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
@@ -1459,8 +1518,12 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
1459static gva_t rmode_tss_base(struct kvm *kvm) 1518static gva_t rmode_tss_base(struct kvm *kvm)
1460{ 1519{
1461 if (!kvm->arch.tss_addr) { 1520 if (!kvm->arch.tss_addr) {
1462 gfn_t base_gfn = kvm->memslots[0].base_gfn + 1521 struct kvm_memslots *slots;
1463 kvm->memslots[0].npages - 3; 1522 gfn_t base_gfn;
1523
1524 slots = rcu_dereference(kvm->memslots);
1525 base_gfn = kvm->memslots->memslots[0].base_gfn +
1526 kvm->memslots->memslots[0].npages - 3;
1464 return base_gfn << PAGE_SHIFT; 1527 return base_gfn << PAGE_SHIFT;
1465 } 1528 }
1466 return kvm->arch.tss_addr; 1529 return kvm->arch.tss_addr;
@@ -1501,8 +1564,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
1501 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); 1564 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
1502 1565
1503 flags = vmcs_readl(GUEST_RFLAGS); 1566 flags = vmcs_readl(GUEST_RFLAGS);
1504 vmx->rmode.save_iopl 1567 vmx->rmode.save_rflags = flags;
1505 = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
1506 1568
1507 flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; 1569 flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
1508 1570
@@ -1536,11 +1598,17 @@ continue_rmode:
1536static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) 1598static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
1537{ 1599{
1538 struct vcpu_vmx *vmx = to_vmx(vcpu); 1600 struct vcpu_vmx *vmx = to_vmx(vcpu);
1539 struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER); 1601 struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
1540 1602
1541 vcpu->arch.shadow_efer = efer;
1542 if (!msr) 1603 if (!msr)
1543 return; 1604 return;
1605
1606 /*
1607 * Force kernel_gs_base reloading before EFER changes, as control
1608 * of this msr depends on is_long_mode().
1609 */
1610 vmx_load_host_state(to_vmx(vcpu));
1611 vcpu->arch.efer = efer;
1544 if (efer & EFER_LMA) { 1612 if (efer & EFER_LMA) {
1545 vmcs_write32(VM_ENTRY_CONTROLS, 1613 vmcs_write32(VM_ENTRY_CONTROLS,
1546 vmcs_read32(VM_ENTRY_CONTROLS) | 1614 vmcs_read32(VM_ENTRY_CONTROLS) |
@@ -1570,13 +1638,13 @@ static void enter_lmode(struct kvm_vcpu *vcpu)
1570 (guest_tr_ar & ~AR_TYPE_MASK) 1638 (guest_tr_ar & ~AR_TYPE_MASK)
1571 | AR_TYPE_BUSY_64_TSS); 1639 | AR_TYPE_BUSY_64_TSS);
1572 } 1640 }
1573 vcpu->arch.shadow_efer |= EFER_LMA; 1641 vcpu->arch.efer |= EFER_LMA;
1574 vmx_set_efer(vcpu, vcpu->arch.shadow_efer); 1642 vmx_set_efer(vcpu, vcpu->arch.efer);
1575} 1643}
1576 1644
1577static void exit_lmode(struct kvm_vcpu *vcpu) 1645static void exit_lmode(struct kvm_vcpu *vcpu)
1578{ 1646{
1579 vcpu->arch.shadow_efer &= ~EFER_LMA; 1647 vcpu->arch.efer &= ~EFER_LMA;
1580 1648
1581 vmcs_write32(VM_ENTRY_CONTROLS, 1649 vmcs_write32(VM_ENTRY_CONTROLS,
1582 vmcs_read32(VM_ENTRY_CONTROLS) 1650 vmcs_read32(VM_ENTRY_CONTROLS)
@@ -1592,10 +1660,20 @@ static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
1592 ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa)); 1660 ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa));
1593} 1661}
1594 1662
1663static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
1664{
1665 ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
1666
1667 vcpu->arch.cr0 &= ~cr0_guest_owned_bits;
1668 vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits;
1669}
1670
1595static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) 1671static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
1596{ 1672{
1597 vcpu->arch.cr4 &= KVM_GUEST_CR4_MASK; 1673 ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
1598 vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK; 1674
1675 vcpu->arch.cr4 &= ~cr4_guest_owned_bits;
1676 vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits;
1599} 1677}
1600 1678
1601static void ept_load_pdptrs(struct kvm_vcpu *vcpu) 1679static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
@@ -1640,7 +1718,7 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
1640 (CPU_BASED_CR3_LOAD_EXITING | 1718 (CPU_BASED_CR3_LOAD_EXITING |
1641 CPU_BASED_CR3_STORE_EXITING)); 1719 CPU_BASED_CR3_STORE_EXITING));
1642 vcpu->arch.cr0 = cr0; 1720 vcpu->arch.cr0 = cr0;
1643 vmx_set_cr4(vcpu, vcpu->arch.cr4); 1721 vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
1644 } else if (!is_paging(vcpu)) { 1722 } else if (!is_paging(vcpu)) {
1645 /* From nonpaging to paging */ 1723 /* From nonpaging to paging */
1646 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, 1724 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
@@ -1648,23 +1726,13 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
1648 ~(CPU_BASED_CR3_LOAD_EXITING | 1726 ~(CPU_BASED_CR3_LOAD_EXITING |
1649 CPU_BASED_CR3_STORE_EXITING)); 1727 CPU_BASED_CR3_STORE_EXITING));
1650 vcpu->arch.cr0 = cr0; 1728 vcpu->arch.cr0 = cr0;
1651 vmx_set_cr4(vcpu, vcpu->arch.cr4); 1729 vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
1652 } 1730 }
1653 1731
1654 if (!(cr0 & X86_CR0_WP)) 1732 if (!(cr0 & X86_CR0_WP))
1655 *hw_cr0 &= ~X86_CR0_WP; 1733 *hw_cr0 &= ~X86_CR0_WP;
1656} 1734}
1657 1735
1658static void ept_update_paging_mode_cr4(unsigned long *hw_cr4,
1659 struct kvm_vcpu *vcpu)
1660{
1661 if (!is_paging(vcpu)) {
1662 *hw_cr4 &= ~X86_CR4_PAE;
1663 *hw_cr4 |= X86_CR4_PSE;
1664 } else if (!(vcpu->arch.cr4 & X86_CR4_PAE))
1665 *hw_cr4 &= ~X86_CR4_PAE;
1666}
1667
1668static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 1736static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1669{ 1737{
1670 struct vcpu_vmx *vmx = to_vmx(vcpu); 1738 struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -1676,8 +1744,6 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1676 else 1744 else
1677 hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON; 1745 hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON;
1678 1746
1679 vmx_fpu_deactivate(vcpu);
1680
1681 if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE)) 1747 if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
1682 enter_pmode(vcpu); 1748 enter_pmode(vcpu);
1683 1749
@@ -1685,7 +1751,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1685 enter_rmode(vcpu); 1751 enter_rmode(vcpu);
1686 1752
1687#ifdef CONFIG_X86_64 1753#ifdef CONFIG_X86_64
1688 if (vcpu->arch.shadow_efer & EFER_LME) { 1754 if (vcpu->arch.efer & EFER_LME) {
1689 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) 1755 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
1690 enter_lmode(vcpu); 1756 enter_lmode(vcpu);
1691 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) 1757 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
@@ -1696,12 +1762,12 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1696 if (enable_ept) 1762 if (enable_ept)
1697 ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu); 1763 ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
1698 1764
1765 if (!vcpu->fpu_active)
1766 hw_cr0 |= X86_CR0_TS | X86_CR0_MP;
1767
1699 vmcs_writel(CR0_READ_SHADOW, cr0); 1768 vmcs_writel(CR0_READ_SHADOW, cr0);
1700 vmcs_writel(GUEST_CR0, hw_cr0); 1769 vmcs_writel(GUEST_CR0, hw_cr0);
1701 vcpu->arch.cr0 = cr0; 1770 vcpu->arch.cr0 = cr0;
1702
1703 if (!(cr0 & X86_CR0_TS) || !(cr0 & X86_CR0_PE))
1704 vmx_fpu_activate(vcpu);
1705} 1771}
1706 1772
1707static u64 construct_eptp(unsigned long root_hpa) 1773static u64 construct_eptp(unsigned long root_hpa)
@@ -1727,12 +1793,11 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
1727 vmcs_write64(EPT_POINTER, eptp); 1793 vmcs_write64(EPT_POINTER, eptp);
1728 guest_cr3 = is_paging(vcpu) ? vcpu->arch.cr3 : 1794 guest_cr3 = is_paging(vcpu) ? vcpu->arch.cr3 :
1729 vcpu->kvm->arch.ept_identity_map_addr; 1795 vcpu->kvm->arch.ept_identity_map_addr;
1796 ept_load_pdptrs(vcpu);
1730 } 1797 }
1731 1798
1732 vmx_flush_tlb(vcpu); 1799 vmx_flush_tlb(vcpu);
1733 vmcs_writel(GUEST_CR3, guest_cr3); 1800 vmcs_writel(GUEST_CR3, guest_cr3);
1734 if (vcpu->arch.cr0 & X86_CR0_PE)
1735 vmx_fpu_deactivate(vcpu);
1736} 1801}
1737 1802
1738static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 1803static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
@@ -1741,8 +1806,14 @@ static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1741 KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); 1806 KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
1742 1807
1743 vcpu->arch.cr4 = cr4; 1808 vcpu->arch.cr4 = cr4;
1744 if (enable_ept) 1809 if (enable_ept) {
1745 ept_update_paging_mode_cr4(&hw_cr4, vcpu); 1810 if (!is_paging(vcpu)) {
1811 hw_cr4 &= ~X86_CR4_PAE;
1812 hw_cr4 |= X86_CR4_PSE;
1813 } else if (!(cr4 & X86_CR4_PAE)) {
1814 hw_cr4 &= ~X86_CR4_PAE;
1815 }
1816 }
1746 1817
1747 vmcs_writel(CR4_READ_SHADOW, cr4); 1818 vmcs_writel(CR4_READ_SHADOW, cr4);
1748 vmcs_writel(GUEST_CR4, hw_cr4); 1819 vmcs_writel(GUEST_CR4, hw_cr4);
@@ -1780,7 +1851,7 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
1780 1851
1781static int vmx_get_cpl(struct kvm_vcpu *vcpu) 1852static int vmx_get_cpl(struct kvm_vcpu *vcpu)
1782{ 1853{
1783 if (!(vcpu->arch.cr0 & X86_CR0_PE)) /* if real mode */ 1854 if (!is_protmode(vcpu))
1784 return 0; 1855 return 0;
1785 1856
1786 if (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) /* if virtual 8086 */ 1857 if (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) /* if virtual 8086 */
@@ -2035,7 +2106,7 @@ static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
2035static bool guest_state_valid(struct kvm_vcpu *vcpu) 2106static bool guest_state_valid(struct kvm_vcpu *vcpu)
2036{ 2107{
2037 /* real mode guest state checks */ 2108 /* real mode guest state checks */
2038 if (!(vcpu->arch.cr0 & X86_CR0_PE)) { 2109 if (!is_protmode(vcpu)) {
2039 if (!rmode_segment_valid(vcpu, VCPU_SREG_CS)) 2110 if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
2040 return false; 2111 return false;
2041 if (!rmode_segment_valid(vcpu, VCPU_SREG_SS)) 2112 if (!rmode_segment_valid(vcpu, VCPU_SREG_SS))
@@ -2168,7 +2239,7 @@ static int alloc_apic_access_page(struct kvm *kvm)
2168 struct kvm_userspace_memory_region kvm_userspace_mem; 2239 struct kvm_userspace_memory_region kvm_userspace_mem;
2169 int r = 0; 2240 int r = 0;
2170 2241
2171 down_write(&kvm->slots_lock); 2242 mutex_lock(&kvm->slots_lock);
2172 if (kvm->arch.apic_access_page) 2243 if (kvm->arch.apic_access_page)
2173 goto out; 2244 goto out;
2174 kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT; 2245 kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT;
@@ -2181,7 +2252,7 @@ static int alloc_apic_access_page(struct kvm *kvm)
2181 2252
2182 kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00); 2253 kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00);
2183out: 2254out:
2184 up_write(&kvm->slots_lock); 2255 mutex_unlock(&kvm->slots_lock);
2185 return r; 2256 return r;
2186} 2257}
2187 2258
@@ -2190,7 +2261,7 @@ static int alloc_identity_pagetable(struct kvm *kvm)
2190 struct kvm_userspace_memory_region kvm_userspace_mem; 2261 struct kvm_userspace_memory_region kvm_userspace_mem;
2191 int r = 0; 2262 int r = 0;
2192 2263
2193 down_write(&kvm->slots_lock); 2264 mutex_lock(&kvm->slots_lock);
2194 if (kvm->arch.ept_identity_pagetable) 2265 if (kvm->arch.ept_identity_pagetable)
2195 goto out; 2266 goto out;
2196 kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT; 2267 kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT;
@@ -2205,7 +2276,7 @@ static int alloc_identity_pagetable(struct kvm *kvm)
2205 kvm->arch.ept_identity_pagetable = gfn_to_page(kvm, 2276 kvm->arch.ept_identity_pagetable = gfn_to_page(kvm,
2206 kvm->arch.ept_identity_map_addr >> PAGE_SHIFT); 2277 kvm->arch.ept_identity_map_addr >> PAGE_SHIFT);
2207out: 2278out:
2208 up_write(&kvm->slots_lock); 2279 mutex_unlock(&kvm->slots_lock);
2209 return r; 2280 return r;
2210} 2281}
2211 2282
@@ -2302,13 +2373,22 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2302 ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 2373 ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
2303 if (vmx->vpid == 0) 2374 if (vmx->vpid == 0)
2304 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; 2375 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
2305 if (!enable_ept) 2376 if (!enable_ept) {
2306 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; 2377 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
2378 enable_unrestricted_guest = 0;
2379 }
2307 if (!enable_unrestricted_guest) 2380 if (!enable_unrestricted_guest)
2308 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; 2381 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
2382 if (!ple_gap)
2383 exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
2309 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); 2384 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
2310 } 2385 }
2311 2386
2387 if (ple_gap) {
2388 vmcs_write32(PLE_GAP, ple_gap);
2389 vmcs_write32(PLE_WINDOW, ple_window);
2390 }
2391
2312 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf); 2392 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf);
2313 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf); 2393 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf);
2314 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ 2394 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
@@ -2368,18 +2448,15 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2368 for (i = 0; i < NR_VMX_MSR; ++i) { 2448 for (i = 0; i < NR_VMX_MSR; ++i) {
2369 u32 index = vmx_msr_index[i]; 2449 u32 index = vmx_msr_index[i];
2370 u32 data_low, data_high; 2450 u32 data_low, data_high;
2371 u64 data;
2372 int j = vmx->nmsrs; 2451 int j = vmx->nmsrs;
2373 2452
2374 if (rdmsr_safe(index, &data_low, &data_high) < 0) 2453 if (rdmsr_safe(index, &data_low, &data_high) < 0)
2375 continue; 2454 continue;
2376 if (wrmsr_safe(index, data_low, data_high) < 0) 2455 if (wrmsr_safe(index, data_low, data_high) < 0)
2377 continue; 2456 continue;
2378 data = data_low | ((u64)data_high << 32); 2457 vmx->guest_msrs[j].index = i;
2379 vmx->host_msrs[j].index = index; 2458 vmx->guest_msrs[j].data = 0;
2380 vmx->host_msrs[j].reserved = 0; 2459 vmx->guest_msrs[j].mask = -1ull;
2381 vmx->host_msrs[j].data = data;
2382 vmx->guest_msrs[j] = vmx->host_msrs[j];
2383 ++vmx->nmsrs; 2460 ++vmx->nmsrs;
2384 } 2461 }
2385 2462
@@ -2389,7 +2466,10 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2389 vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl); 2466 vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl);
2390 2467
2391 vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); 2468 vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
2392 vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK); 2469 vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS;
2470 if (enable_ept)
2471 vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE;
2472 vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
2393 2473
2394 tsc_base = vmx->vcpu.kvm->arch.vm_init_tsc; 2474 tsc_base = vmx->vcpu.kvm->arch.vm_init_tsc;
2395 rdtscll(tsc_this); 2475 rdtscll(tsc_this);
@@ -2414,10 +2494,10 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2414{ 2494{
2415 struct vcpu_vmx *vmx = to_vmx(vcpu); 2495 struct vcpu_vmx *vmx = to_vmx(vcpu);
2416 u64 msr; 2496 u64 msr;
2417 int ret; 2497 int ret, idx;
2418 2498
2419 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); 2499 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
2420 down_read(&vcpu->kvm->slots_lock); 2500 idx = srcu_read_lock(&vcpu->kvm->srcu);
2421 if (!init_rmode(vmx->vcpu.kvm)) { 2501 if (!init_rmode(vmx->vcpu.kvm)) {
2422 ret = -ENOMEM; 2502 ret = -ENOMEM;
2423 goto out; 2503 goto out;
@@ -2510,8 +2590,8 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2510 if (vmx->vpid != 0) 2590 if (vmx->vpid != 0)
2511 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); 2591 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
2512 2592
2513 vmx->vcpu.arch.cr0 = 0x60000010; 2593 vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
2514 vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */ 2594 vmx_set_cr0(&vmx->vcpu, kvm_read_cr0(vcpu)); /* enter rmode */
2515 vmx_set_cr4(&vmx->vcpu, 0); 2595 vmx_set_cr4(&vmx->vcpu, 0);
2516 vmx_set_efer(&vmx->vcpu, 0); 2596 vmx_set_efer(&vmx->vcpu, 0);
2517 vmx_fpu_activate(&vmx->vcpu); 2597 vmx_fpu_activate(&vmx->vcpu);
@@ -2525,7 +2605,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2525 vmx->emulation_required = 0; 2605 vmx->emulation_required = 0;
2526 2606
2527out: 2607out:
2528 up_read(&vcpu->kvm->slots_lock); 2608 srcu_read_unlock(&vcpu->kvm->srcu, idx);
2529 return ret; 2609 return ret;
2530} 2610}
2531 2611
@@ -2623,8 +2703,35 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
2623 return 0; 2703 return 0;
2624 2704
2625 return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 2705 return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
2626 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS | 2706 (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_NMI));
2627 GUEST_INTR_STATE_NMI)); 2707}
2708
2709static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
2710{
2711 if (!cpu_has_virtual_nmis())
2712 return to_vmx(vcpu)->soft_vnmi_blocked;
2713 else
2714 return !!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
2715 GUEST_INTR_STATE_NMI);
2716}
2717
2718static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
2719{
2720 struct vcpu_vmx *vmx = to_vmx(vcpu);
2721
2722 if (!cpu_has_virtual_nmis()) {
2723 if (vmx->soft_vnmi_blocked != masked) {
2724 vmx->soft_vnmi_blocked = masked;
2725 vmx->vnmi_blocked_time = 0;
2726 }
2727 } else {
2728 if (masked)
2729 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
2730 GUEST_INTR_STATE_NMI);
2731 else
2732 vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
2733 GUEST_INTR_STATE_NMI);
2734 }
2628} 2735}
2629 2736
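/*
 * Hedged usage sketch (assumption about generic x86 code outside this file):
 * the new get/set_nmi_mask callbacks presumably back the vcpu-events
 * interface, so userspace can save and restore NMI blocking across
 * migration, roughly:
 */
static void example_get_nmi_state(struct kvm_vcpu *vcpu,
				  struct kvm_vcpu_events *events)
{
	events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu);
}

static void example_set_nmi_state(struct kvm_vcpu *vcpu,
				  struct kvm_vcpu_events *events)
{
	kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked);
}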
2630static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) 2737static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
@@ -2659,7 +2766,7 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
2659 * Cause the #SS fault with 0 error code in VM86 mode. 2766 * Cause the #SS fault with 0 error code in VM86 mode.
2660 */ 2767 */
2661 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) 2768 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0)
2662 if (emulate_instruction(vcpu, NULL, 0, 0, 0) == EMULATE_DONE) 2769 if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE)
2663 return 1; 2770 return 1;
2664 /* 2771 /*
2665 * Forward all other exceptions that are valid in real mode. 2772 * Forward all other exceptions that are valid in real mode.
@@ -2674,6 +2781,12 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
2674 kvm_queue_exception(vcpu, vec); 2781 kvm_queue_exception(vcpu, vec);
2675 return 1; 2782 return 1;
2676 case BP_VECTOR: 2783 case BP_VECTOR:
2784 /*
2785 * Update instruction length as we may reinject the exception
2786 * from user space while in guest debugging mode.
2787 */
2788 to_vmx(vcpu)->vcpu.arch.event_exit_inst_len =
2789 vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
2677 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) 2790 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
2678 return 0; 2791 return 0;
2679 /* fall through */ 2792 /* fall through */
@@ -2710,15 +2823,16 @@ static void kvm_machine_check(void)
2710#endif 2823#endif
2711} 2824}
2712 2825
2713static int handle_machine_check(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2826static int handle_machine_check(struct kvm_vcpu *vcpu)
2714{ 2827{
2715 /* already handled by vcpu_run */ 2828 /* already handled by vcpu_run */
2716 return 1; 2829 return 1;
2717} 2830}
2718 2831
2719static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2832static int handle_exception(struct kvm_vcpu *vcpu)
2720{ 2833{
2721 struct vcpu_vmx *vmx = to_vmx(vcpu); 2834 struct vcpu_vmx *vmx = to_vmx(vcpu);
2835 struct kvm_run *kvm_run = vcpu->run;
2722 u32 intr_info, ex_no, error_code; 2836 u32 intr_info, ex_no, error_code;
2723 unsigned long cr2, rip, dr6; 2837 unsigned long cr2, rip, dr6;
2724 u32 vect_info; 2838 u32 vect_info;
@@ -2728,12 +2842,17 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2728 intr_info = vmcs_read32(VM_EXIT_INTR_INFO); 2842 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
2729 2843
2730 if (is_machine_check(intr_info)) 2844 if (is_machine_check(intr_info))
2731 return handle_machine_check(vcpu, kvm_run); 2845 return handle_machine_check(vcpu);
2732 2846
2733 if ((vect_info & VECTORING_INFO_VALID_MASK) && 2847 if ((vect_info & VECTORING_INFO_VALID_MASK) &&
2734 !is_page_fault(intr_info)) 2848 !is_page_fault(intr_info)) {
2735 printk(KERN_ERR "%s: unexpected, vectoring info 0x%x " 2849 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
2736 "intr info 0x%x\n", __func__, vect_info, intr_info); 2850 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
2851 vcpu->run->internal.ndata = 2;
2852 vcpu->run->internal.data[0] = vect_info;
2853 vcpu->run->internal.data[1] = intr_info;
2854 return 0;
2855 }
2737 2856
2738 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR) 2857 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR)
2739 return 1; /* already handled by vmx_vcpu_run() */ 2858 return 1; /* already handled by vmx_vcpu_run() */
@@ -2744,7 +2863,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2744 } 2863 }
2745 2864
2746 if (is_invalid_opcode(intr_info)) { 2865 if (is_invalid_opcode(intr_info)) {
2747 er = emulate_instruction(vcpu, kvm_run, 0, 0, EMULTYPE_TRAP_UD); 2866 er = emulate_instruction(vcpu, 0, 0, EMULTYPE_TRAP_UD);
2748 if (er != EMULATE_DONE) 2867 if (er != EMULATE_DONE)
2749 kvm_queue_exception(vcpu, UD_VECTOR); 2868 kvm_queue_exception(vcpu, UD_VECTOR);
2750 return 1; 2869 return 1;
@@ -2790,6 +2909,13 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2790 kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7); 2909 kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7);
2791 /* fall through */ 2910 /* fall through */
2792 case BP_VECTOR: 2911 case BP_VECTOR:
2912 /*
2913 * Update instruction length as we may reinject #BP from
2914 * user space while in guest debugging mode. Reading it for
2915 * #DB as well causes no harm; it is not used in that case.
2916 */
2917 vmx->vcpu.arch.event_exit_inst_len =
2918 vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
2793 kvm_run->exit_reason = KVM_EXIT_DEBUG; 2919 kvm_run->exit_reason = KVM_EXIT_DEBUG;
2794 kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip; 2920 kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip;
2795 kvm_run->debug.arch.exception = ex_no; 2921 kvm_run->debug.arch.exception = ex_no;
@@ -2803,20 +2929,19 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2803 return 0; 2929 return 0;
2804} 2930}
2805 2931
2806static int handle_external_interrupt(struct kvm_vcpu *vcpu, 2932static int handle_external_interrupt(struct kvm_vcpu *vcpu)
2807 struct kvm_run *kvm_run)
2808{ 2933{
2809 ++vcpu->stat.irq_exits; 2934 ++vcpu->stat.irq_exits;
2810 return 1; 2935 return 1;
2811} 2936}
2812 2937
2813static int handle_triple_fault(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2938static int handle_triple_fault(struct kvm_vcpu *vcpu)
2814{ 2939{
2815 kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; 2940 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
2816 return 0; 2941 return 0;
2817} 2942}
2818 2943
2819static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2944static int handle_io(struct kvm_vcpu *vcpu)
2820{ 2945{
2821 unsigned long exit_qualification; 2946 unsigned long exit_qualification;
2822 int size, in, string; 2947 int size, in, string;
@@ -2827,8 +2952,7 @@ static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2827 string = (exit_qualification & 16) != 0; 2952 string = (exit_qualification & 16) != 0;
2828 2953
2829 if (string) { 2954 if (string) {
2830 if (emulate_instruction(vcpu, 2955 if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO)
2831 kvm_run, 0, 0, 0) == EMULATE_DO_MMIO)
2832 return 0; 2956 return 0;
2833 return 1; 2957 return 1;
2834 } 2958 }
@@ -2838,7 +2962,7 @@ static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2838 port = exit_qualification >> 16; 2962 port = exit_qualification >> 16;
2839 2963
2840 skip_emulated_instruction(vcpu); 2964 skip_emulated_instruction(vcpu);
2841 return kvm_emulate_pio(vcpu, kvm_run, in, size, port); 2965 return kvm_emulate_pio(vcpu, in, size, port);
2842} 2966}
2843 2967
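The string and port fields above come straight out of the VMX I/O exit qualification. A standalone sketch of that decode, assuming the usual SDM field layout (bits 0-2 access size minus one, bit 3 direction, bit 4 string, bits 16-31 port); only the string bit and the port shift appear verbatim in the hunk, the rest is an assumption here.

/* Standalone sketch, not part of the patch: decode an I/O-instruction
 * exit qualification the way handle_io() does. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t exit_qualification = 0x00210059;  /* example value */

	int size      = (exit_qualification & 7) + 1;    /* bytes (assumed layout) */
	int in        = (exit_qualification & 8) != 0;   /* direction (assumed layout) */
	int string    = (exit_qualification & 16) != 0;  /* string op, as in the hunk */
	unsigned port = exit_qualification >> 16;        /* port, as in the hunk */

	printf("size=%d in=%d string=%d port=0x%x\n", size, in, string, port);
	return 0;
}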
2844static void 2968static void
@@ -2852,7 +2976,7 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
2852 hypercall[2] = 0xc1; 2976 hypercall[2] = 0xc1;
2853} 2977}
2854 2978
2855static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2979static int handle_cr(struct kvm_vcpu *vcpu)
2856{ 2980{
2857 unsigned long exit_qualification, val; 2981 unsigned long exit_qualification, val;
2858 int cr; 2982 int cr;
@@ -2887,17 +3011,16 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2887 return 1; 3011 return 1;
2888 if (cr8_prev <= cr8) 3012 if (cr8_prev <= cr8)
2889 return 1; 3013 return 1;
2890 kvm_run->exit_reason = KVM_EXIT_SET_TPR; 3014 vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
2891 return 0; 3015 return 0;
2892 } 3016 }
2893 }; 3017 };
2894 break; 3018 break;
2895 case 2: /* clts */ 3019 case 2: /* clts */
2896 vmx_fpu_deactivate(vcpu); 3020 vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
2897 vcpu->arch.cr0 &= ~X86_CR0_TS; 3021 trace_kvm_cr_write(0, kvm_read_cr0(vcpu));
2898 vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
2899 vmx_fpu_activate(vcpu);
2900 skip_emulated_instruction(vcpu); 3022 skip_emulated_instruction(vcpu);
3023 vmx_fpu_activate(vcpu);
2901 return 1; 3024 return 1;
2902 case 1: /*mov from cr*/ 3025 case 1: /*mov from cr*/
2903 switch (cr) { 3026 switch (cr) {
@@ -2915,25 +3038,37 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2915 } 3038 }
2916 break; 3039 break;
2917 case 3: /* lmsw */ 3040 case 3: /* lmsw */
2918 kvm_lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f); 3041 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
3042 trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val);
3043 kvm_lmsw(vcpu, val);
2919 3044
2920 skip_emulated_instruction(vcpu); 3045 skip_emulated_instruction(vcpu);
2921 return 1; 3046 return 1;
2922 default: 3047 default:
2923 break; 3048 break;
2924 } 3049 }
2925 kvm_run->exit_reason = 0; 3050 vcpu->run->exit_reason = 0;
2926 pr_unimpl(vcpu, "unhandled control register: op %d cr %d\n", 3051 pr_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
2927 (int)(exit_qualification >> 4) & 3, cr); 3052 (int)(exit_qualification >> 4) & 3, cr);
2928 return 0; 3053 return 0;
2929} 3054}
2930 3055
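lmsw can only replace the low four CR0 bits, which is why both the trace expression above and kvm_lmsw() later in this diff mask with 0xf. A minimal standalone sketch of that update:

/* Standalone sketch, not part of the patch: the CR0 update that lmsw
 * performs: only bits 0-3 of CR0 are replaced by the source operand. */
#include <stdio.h>

int main(void)
{
	unsigned long cr0 = 0x80000011UL;  /* example CR0: PG | ET | PE */
	unsigned long msw = 0x000d;        /* example lmsw source operand */

	unsigned long val     = msw & 0x0ful;          /* lmsw honours bits 0-3 only */
	unsigned long new_cr0 = (cr0 & ~0xful) | val;  /* same expression as the trace */

	printf("old cr0=%#lx new cr0=%#lx\n", cr0, new_cr0);
	return 0;
}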
2931static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3056static int check_dr_alias(struct kvm_vcpu *vcpu)
3057{
3058 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
3059 kvm_queue_exception(vcpu, UD_VECTOR);
3060 return -1;
3061 }
3062 return 0;
3063}
3064
3065static int handle_dr(struct kvm_vcpu *vcpu)
2932{ 3066{
2933 unsigned long exit_qualification; 3067 unsigned long exit_qualification;
2934 unsigned long val; 3068 unsigned long val;
2935 int dr, reg; 3069 int dr, reg;
2936 3070
3071 /* Do not handle if CPL > 0; a #GP will be triggered on re-entry */
2937 if (!kvm_require_cpl(vcpu, 0)) 3072 if (!kvm_require_cpl(vcpu, 0))
2938 return 1; 3073 return 1;
2939 dr = vmcs_readl(GUEST_DR7); 3074 dr = vmcs_readl(GUEST_DR7);
@@ -2944,13 +3079,13 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2944 * guest debugging itself. 3079 * guest debugging itself.
2945 */ 3080 */
2946 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) { 3081 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
2947 kvm_run->debug.arch.dr6 = vcpu->arch.dr6; 3082 vcpu->run->debug.arch.dr6 = vcpu->arch.dr6;
2948 kvm_run->debug.arch.dr7 = dr; 3083 vcpu->run->debug.arch.dr7 = dr;
2949 kvm_run->debug.arch.pc = 3084 vcpu->run->debug.arch.pc =
2950 vmcs_readl(GUEST_CS_BASE) + 3085 vmcs_readl(GUEST_CS_BASE) +
2951 vmcs_readl(GUEST_RIP); 3086 vmcs_readl(GUEST_RIP);
2952 kvm_run->debug.arch.exception = DB_VECTOR; 3087 vcpu->run->debug.arch.exception = DB_VECTOR;
2953 kvm_run->exit_reason = KVM_EXIT_DEBUG; 3088 vcpu->run->exit_reason = KVM_EXIT_DEBUG;
2954 return 0; 3089 return 0;
2955 } else { 3090 } else {
2956 vcpu->arch.dr7 &= ~DR7_GD; 3091 vcpu->arch.dr7 &= ~DR7_GD;
@@ -2969,14 +3104,20 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2969 case 0 ... 3: 3104 case 0 ... 3:
2970 val = vcpu->arch.db[dr]; 3105 val = vcpu->arch.db[dr];
2971 break; 3106 break;
3107 case 4:
3108 if (check_dr_alias(vcpu) < 0)
3109 return 1;
3110 /* fall through */
2972 case 6: 3111 case 6:
2973 val = vcpu->arch.dr6; 3112 val = vcpu->arch.dr6;
2974 break; 3113 break;
2975 case 7: 3114 case 5:
3115 if (check_dr_alias(vcpu) < 0)
3116 return 1;
3117 /* fall through */
3118 default: /* 7 */
2976 val = vcpu->arch.dr7; 3119 val = vcpu->arch.dr7;
2977 break; 3120 break;
2978 default:
2979 val = 0;
2980 } 3121 }
2981 kvm_register_write(vcpu, reg, val); 3122 kvm_register_write(vcpu, reg, val);
2982 } else { 3123 } else {
@@ -2987,21 +3128,25 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2987 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) 3128 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
2988 vcpu->arch.eff_db[dr] = val; 3129 vcpu->arch.eff_db[dr] = val;
2989 break; 3130 break;
2990 case 4 ... 5: 3131 case 4:
2991 if (vcpu->arch.cr4 & X86_CR4_DE) 3132 if (check_dr_alias(vcpu) < 0)
2992 kvm_queue_exception(vcpu, UD_VECTOR); 3133 return 1;
2993 break; 3134 /* fall through */
2994 case 6: 3135 case 6:
2995 if (val & 0xffffffff00000000ULL) { 3136 if (val & 0xffffffff00000000ULL) {
2996 kvm_queue_exception(vcpu, GP_VECTOR); 3137 kvm_inject_gp(vcpu, 0);
2997 break; 3138 return 1;
2998 } 3139 }
2999 vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1; 3140 vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
3000 break; 3141 break;
3001 case 7: 3142 case 5:
3143 if (check_dr_alias(vcpu) < 0)
3144 return 1;
3145 /* fall through */
3146 default: /* 7 */
3002 if (val & 0xffffffff00000000ULL) { 3147 if (val & 0xffffffff00000000ULL) {
3003 kvm_queue_exception(vcpu, GP_VECTOR); 3148 kvm_inject_gp(vcpu, 0);
3004 break; 3149 return 1;
3005 } 3150 }
3006 vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1; 3151 vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
3007 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { 3152 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
@@ -3016,18 +3161,19 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3016 return 1; 3161 return 1;
3017} 3162}
3018 3163
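check_dr_alias() encodes the rule that DR4/DR5 alias DR6/DR7 unless CR4.DE is set, in which case the access raises #UD. A standalone sketch of the resulting decode; resolve_dr() is an invented helper for illustration:

/* Standalone sketch, not part of the patch: how the mov-dr handler above
 * resolves DR4/DR5 after check_dr_alias(). */
#include <stdio.h>

/* returns the effective debug register index, or -1 when #UD is raised */
static int resolve_dr(int dr, int cr4_de)
{
	if (dr == 4 || dr == 5) {
		if (cr4_de)
			return -1;   /* CR4.DE set: DR4/DR5 access is #UD */
		dr += 2;             /* CR4.DE clear: DR4/DR5 alias DR6/DR7 */
	}
	return dr;
}

int main(void)
{
	printf("dr4, CR4.DE=0 -> dr%d\n", resolve_dr(4, 0));
	printf("dr5, CR4.DE=0 -> dr%d\n", resolve_dr(5, 0));
	printf("dr4, CR4.DE=1 -> %d (#UD)\n", resolve_dr(4, 1));
	return 0;
}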
3019static int handle_cpuid(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3164static int handle_cpuid(struct kvm_vcpu *vcpu)
3020{ 3165{
3021 kvm_emulate_cpuid(vcpu); 3166 kvm_emulate_cpuid(vcpu);
3022 return 1; 3167 return 1;
3023} 3168}
3024 3169
3025static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3170static int handle_rdmsr(struct kvm_vcpu *vcpu)
3026{ 3171{
3027 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; 3172 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
3028 u64 data; 3173 u64 data;
3029 3174
3030 if (vmx_get_msr(vcpu, ecx, &data)) { 3175 if (vmx_get_msr(vcpu, ecx, &data)) {
3176 trace_kvm_msr_read_ex(ecx);
3031 kvm_inject_gp(vcpu, 0); 3177 kvm_inject_gp(vcpu, 0);
3032 return 1; 3178 return 1;
3033 } 3179 }
@@ -3041,31 +3187,29 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3041 return 1; 3187 return 1;
3042} 3188}
3043 3189
3044static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3190static int handle_wrmsr(struct kvm_vcpu *vcpu)
3045{ 3191{
3046 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; 3192 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
3047 u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) 3193 u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
3048 | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32); 3194 | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32);
3049 3195
3050 trace_kvm_msr_write(ecx, data);
3051
3052 if (vmx_set_msr(vcpu, ecx, data) != 0) { 3196 if (vmx_set_msr(vcpu, ecx, data) != 0) {
3197 trace_kvm_msr_write_ex(ecx, data);
3053 kvm_inject_gp(vcpu, 0); 3198 kvm_inject_gp(vcpu, 0);
3054 return 1; 3199 return 1;
3055 } 3200 }
3056 3201
3202 trace_kvm_msr_write(ecx, data);
3057 skip_emulated_instruction(vcpu); 3203 skip_emulated_instruction(vcpu);
3058 return 1; 3204 return 1;
3059} 3205}
3060 3206
3061static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu, 3207static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
3062 struct kvm_run *kvm_run)
3063{ 3208{
3064 return 1; 3209 return 1;
3065} 3210}
3066 3211
3067static int handle_interrupt_window(struct kvm_vcpu *vcpu, 3212static int handle_interrupt_window(struct kvm_vcpu *vcpu)
3068 struct kvm_run *kvm_run)
3069{ 3213{
3070 u32 cpu_based_vm_exec_control; 3214 u32 cpu_based_vm_exec_control;
3071 3215
@@ -3081,34 +3225,34 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu,
3081 * possible 3225 * possible
3082 */ 3226 */
3083 if (!irqchip_in_kernel(vcpu->kvm) && 3227 if (!irqchip_in_kernel(vcpu->kvm) &&
3084 kvm_run->request_interrupt_window && 3228 vcpu->run->request_interrupt_window &&
3085 !kvm_cpu_has_interrupt(vcpu)) { 3229 !kvm_cpu_has_interrupt(vcpu)) {
3086 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; 3230 vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
3087 return 0; 3231 return 0;
3088 } 3232 }
3089 return 1; 3233 return 1;
3090} 3234}
3091 3235
3092static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3236static int handle_halt(struct kvm_vcpu *vcpu)
3093{ 3237{
3094 skip_emulated_instruction(vcpu); 3238 skip_emulated_instruction(vcpu);
3095 return kvm_emulate_halt(vcpu); 3239 return kvm_emulate_halt(vcpu);
3096} 3240}
3097 3241
3098static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3242static int handle_vmcall(struct kvm_vcpu *vcpu)
3099{ 3243{
3100 skip_emulated_instruction(vcpu); 3244 skip_emulated_instruction(vcpu);
3101 kvm_emulate_hypercall(vcpu); 3245 kvm_emulate_hypercall(vcpu);
3102 return 1; 3246 return 1;
3103} 3247}
3104 3248
3105static int handle_vmx_insn(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3249static int handle_vmx_insn(struct kvm_vcpu *vcpu)
3106{ 3250{
3107 kvm_queue_exception(vcpu, UD_VECTOR); 3251 kvm_queue_exception(vcpu, UD_VECTOR);
3108 return 1; 3252 return 1;
3109} 3253}
3110 3254
3111static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3255static int handle_invlpg(struct kvm_vcpu *vcpu)
3112{ 3256{
3113 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 3257 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
3114 3258
@@ -3117,14 +3261,14 @@ static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3117 return 1; 3261 return 1;
3118} 3262}
3119 3263
3120static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3264static int handle_wbinvd(struct kvm_vcpu *vcpu)
3121{ 3265{
3122 skip_emulated_instruction(vcpu); 3266 skip_emulated_instruction(vcpu);
3123 /* TODO: Add support for VT-d/pass-through device */ 3267 /* TODO: Add support for VT-d/pass-through device */
3124 return 1; 3268 return 1;
3125} 3269}
3126 3270
3127static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3271static int handle_apic_access(struct kvm_vcpu *vcpu)
3128{ 3272{
3129 unsigned long exit_qualification; 3273 unsigned long exit_qualification;
3130 enum emulation_result er; 3274 enum emulation_result er;
@@ -3133,7 +3277,7 @@ static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3133 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 3277 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
3134 offset = exit_qualification & 0xffful; 3278 offset = exit_qualification & 0xffful;
3135 3279
3136 er = emulate_instruction(vcpu, kvm_run, 0, 0, 0); 3280 er = emulate_instruction(vcpu, 0, 0, 0);
3137 3281
3138 if (er != EMULATE_DONE) { 3282 if (er != EMULATE_DONE) {
3139 printk(KERN_ERR 3283 printk(KERN_ERR
@@ -3144,7 +3288,7 @@ static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3144 return 1; 3288 return 1;
3145} 3289}
3146 3290
3147static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3291static int handle_task_switch(struct kvm_vcpu *vcpu)
3148{ 3292{
3149 struct vcpu_vmx *vmx = to_vmx(vcpu); 3293 struct vcpu_vmx *vmx = to_vmx(vcpu);
3150 unsigned long exit_qualification; 3294 unsigned long exit_qualification;
@@ -3198,7 +3342,7 @@ static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3198 return 1; 3342 return 1;
3199} 3343}
3200 3344
3201static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3345static int handle_ept_violation(struct kvm_vcpu *vcpu)
3202{ 3346{
3203 unsigned long exit_qualification; 3347 unsigned long exit_qualification;
3204 gpa_t gpa; 3348 gpa_t gpa;
@@ -3219,8 +3363,8 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3219 vmcs_readl(GUEST_LINEAR_ADDRESS)); 3363 vmcs_readl(GUEST_LINEAR_ADDRESS));
3220 printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n", 3364 printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n",
3221 (long unsigned int)exit_qualification); 3365 (long unsigned int)exit_qualification);
3222 kvm_run->exit_reason = KVM_EXIT_UNKNOWN; 3366 vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
3223 kvm_run->hw.hardware_exit_reason = EXIT_REASON_EPT_VIOLATION; 3367 vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_VIOLATION;
3224 return 0; 3368 return 0;
3225 } 3369 }
3226 3370
@@ -3290,7 +3434,7 @@ static void ept_misconfig_inspect_spte(struct kvm_vcpu *vcpu, u64 spte,
3290 } 3434 }
3291} 3435}
3292 3436
3293static int handle_ept_misconfig(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3437static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
3294{ 3438{
3295 u64 sptes[4]; 3439 u64 sptes[4];
3296 int nr_sptes, i; 3440 int nr_sptes, i;
@@ -3306,13 +3450,13 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3306 for (i = PT64_ROOT_LEVEL; i > PT64_ROOT_LEVEL - nr_sptes; --i) 3450 for (i = PT64_ROOT_LEVEL; i > PT64_ROOT_LEVEL - nr_sptes; --i)
3307 ept_misconfig_inspect_spte(vcpu, sptes[i-1], i); 3451 ept_misconfig_inspect_spte(vcpu, sptes[i-1], i);
3308 3452
3309 kvm_run->exit_reason = KVM_EXIT_UNKNOWN; 3453 vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
3310 kvm_run->hw.hardware_exit_reason = EXIT_REASON_EPT_MISCONFIG; 3454 vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_MISCONFIG;
3311 3455
3312 return 0; 3456 return 0;
3313} 3457}
3314 3458
3315static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3459static int handle_nmi_window(struct kvm_vcpu *vcpu)
3316{ 3460{
3317 u32 cpu_based_vm_exec_control; 3461 u32 cpu_based_vm_exec_control;
3318 3462
@@ -3325,36 +3469,55 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3325 return 1; 3469 return 1;
3326} 3470}
3327 3471
3328static void handle_invalid_guest_state(struct kvm_vcpu *vcpu, 3472static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
3329 struct kvm_run *kvm_run)
3330{ 3473{
3331 struct vcpu_vmx *vmx = to_vmx(vcpu); 3474 struct vcpu_vmx *vmx = to_vmx(vcpu);
3332 enum emulation_result err = EMULATE_DONE; 3475 enum emulation_result err = EMULATE_DONE;
3333 3476 int ret = 1;
3334 local_irq_enable();
3335 preempt_enable();
3336 3477
3337 while (!guest_state_valid(vcpu)) { 3478 while (!guest_state_valid(vcpu)) {
3338 err = emulate_instruction(vcpu, kvm_run, 0, 0, 0); 3479 err = emulate_instruction(vcpu, 0, 0, 0);
3339 3480
3340 if (err == EMULATE_DO_MMIO) 3481 if (err == EMULATE_DO_MMIO) {
3341 break; 3482 ret = 0;
3483 goto out;
3484 }
3342 3485
3343 if (err != EMULATE_DONE) { 3486 if (err != EMULATE_DONE) {
3344 kvm_report_emulation_failure(vcpu, "emulation failure"); 3487 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
3345 break; 3488 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
3489 vcpu->run->internal.ndata = 0;
3490 ret = 0;
3491 goto out;
3346 } 3492 }
3347 3493
3348 if (signal_pending(current)) 3494 if (signal_pending(current))
3349 break; 3495 goto out;
3350 if (need_resched()) 3496 if (need_resched())
3351 schedule(); 3497 schedule();
3352 } 3498 }
3353 3499
3354 preempt_disable(); 3500 vmx->emulation_required = 0;
3355 local_irq_disable(); 3501out:
3502 return ret;
3503}
3504
3505/*
3506 * Indicates that a vcpu is busy-waiting on a spinlock. We do not enable plain
3507 * PAUSE exiting, so we only get here on CPUs with PAUSE-loop exiting.
3508 */
3509static int handle_pause(struct kvm_vcpu *vcpu)
3510{
3511 skip_emulated_instruction(vcpu);
3512 kvm_vcpu_on_spin(vcpu);
3513
3514 return 1;
3515}
3356 3516
3357 vmx->invalid_state_emulation_result = err; 3517static int handle_invalid_op(struct kvm_vcpu *vcpu)
3518{
3519 kvm_queue_exception(vcpu, UD_VECTOR);
3520 return 1;
3358} 3521}
3359 3522
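handle_invalid_guest_state() now loops inside the exit handler: emulate one instruction at a time, return to userspace on MMIO or emulation failure, and resume the guest once the state is valid again. A standalone sketch of that control flow with the emulator stubbed out (the enum and stub names are invented):

/* Standalone sketch, not part of the patch: the control flow of the
 * emulation loop above, with the emulator and state check stubbed out. */
#include <stdio.h>
#include <stdbool.h>

enum emu_result { EMU_DONE, EMU_DO_MMIO, EMU_FAILED };

static int steps_left = 3;   /* pretend the state becomes valid after 3 steps */

static bool guest_state_ok(void)         { return steps_left == 0; }
static enum emu_result emulate_one(void) { steps_left--; return EMU_DONE; }

/* returns 1 to re-enter the guest, 0 to exit to userspace */
static int handle_invalid_state(void)
{
	while (!guest_state_ok()) {
		enum emu_result err = emulate_one();

		if (err == EMU_DO_MMIO)
			return 0;    /* userspace completes the MMIO */
		if (err != EMU_DONE)
			return 0;    /* report an emulation error and stop */
	}
	return 1;                    /* state is valid again, resume the guest */
}

int main(void)
{
	printf("re-enter guest: %d\n", handle_invalid_state());
	return 0;
}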
3360/* 3523/*
@@ -3362,8 +3525,7 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu,
3362 * may resume. Otherwise they set the kvm_run parameter to indicate what needs 3525 * may resume. Otherwise they set the kvm_run parameter to indicate what needs
3363 * to be done to userspace and return 0. 3526 * to be done to userspace and return 0.
3364 */ 3527 */
3365static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu, 3528static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
3366 struct kvm_run *kvm_run) = {
3367 [EXIT_REASON_EXCEPTION_NMI] = handle_exception, 3529 [EXIT_REASON_EXCEPTION_NMI] = handle_exception,
3368 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, 3530 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt,
3369 [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault, 3531 [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault,
@@ -3394,6 +3556,9 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
3394 [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, 3556 [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check,
3395 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, 3557 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
3396 [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, 3558 [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig,
3559 [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause,
3560 [EXIT_REASON_MWAIT_INSTRUCTION] = handle_invalid_op,
3561 [EXIT_REASON_MONITOR_INSTRUCTION] = handle_invalid_op,
3397}; 3562};
3398 3563
3399static const int kvm_vmx_max_exit_handlers = 3564static const int kvm_vmx_max_exit_handlers =
@@ -3403,7 +3568,7 @@ static const int kvm_vmx_max_exit_handlers =
3403 * The guest has exited. See if we can fix it or if we need userspace 3568 * The guest has exited. See if we can fix it or if we need userspace
3404 * assistance. 3569 * assistance.
3405 */ 3570 */
3406static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) 3571static int vmx_handle_exit(struct kvm_vcpu *vcpu)
3407{ 3572{
3408 struct vcpu_vmx *vmx = to_vmx(vcpu); 3573 struct vcpu_vmx *vmx = to_vmx(vcpu);
3409 u32 exit_reason = vmx->exit_reason; 3574 u32 exit_reason = vmx->exit_reason;
@@ -3411,13 +3576,9 @@ static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
3411 3576
3412 trace_kvm_exit(exit_reason, kvm_rip_read(vcpu)); 3577 trace_kvm_exit(exit_reason, kvm_rip_read(vcpu));
3413 3578
3414 /* If we need to emulate an MMIO from handle_invalid_guest_state 3579 /* If guest state is invalid, start emulating */
3415 * we just return 0 */ 3580 if (vmx->emulation_required && emulate_invalid_guest_state)
3416 if (vmx->emulation_required && emulate_invalid_guest_state) { 3581 return handle_invalid_guest_state(vcpu);
3417 if (guest_state_valid(vcpu))
3418 vmx->emulation_required = 0;
3419 return vmx->invalid_state_emulation_result != EMULATE_DO_MMIO;
3420 }
3421 3582
3422 /* CR3 accesses don't cause a VM exit in paging mode, so we need 3583 /* CR3 accesses don't cause a VM exit in paging mode, so we need
3423 * to sync with the guest's real CR3. */ 3584 * to sync with the guest's real CR3. */
@@ -3425,8 +3586,8 @@ static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
3425 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); 3586 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
3426 3587
3427 if (unlikely(vmx->fail)) { 3588 if (unlikely(vmx->fail)) {
3428 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; 3589 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
3429 kvm_run->fail_entry.hardware_entry_failure_reason 3590 vcpu->run->fail_entry.hardware_entry_failure_reason
3430 = vmcs_read32(VM_INSTRUCTION_ERROR); 3591 = vmcs_read32(VM_INSTRUCTION_ERROR);
3431 return 0; 3592 return 0;
3432 } 3593 }
@@ -3459,10 +3620,10 @@ static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
3459 3620
3460 if (exit_reason < kvm_vmx_max_exit_handlers 3621 if (exit_reason < kvm_vmx_max_exit_handlers
3461 && kvm_vmx_exit_handlers[exit_reason]) 3622 && kvm_vmx_exit_handlers[exit_reason])
3462 return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run); 3623 return kvm_vmx_exit_handlers[exit_reason](vcpu);
3463 else { 3624 else {
3464 kvm_run->exit_reason = KVM_EXIT_UNKNOWN; 3625 vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
3465 kvm_run->hw.hardware_exit_reason = exit_reason; 3626 vcpu->run->hw.hardware_exit_reason = exit_reason;
3466 } 3627 }
3467 return 0; 3628 return 0;
3468} 3629}
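vmx_handle_exit() above dispatches through kvm_vmx_exit_handlers[], an array indexed by exit reason with a bounds-and-NULL check before the call. A standalone sketch of the same table-dispatch pattern with invented reason numbers:

/* Standalone sketch, not part of the patch: dispatch through an array of
 * handlers indexed by exit reason, with a fallback for unknown reasons. */
#include <stdio.h>

#define REASON_HLT   1
#define REASON_CPUID 2

static int handle_hlt(void)   { puts("hlt");   return 1; }
static int handle_cpuid(void) { puts("cpuid"); return 1; }

static int (*handlers[])(void) = {
	[REASON_HLT]   = handle_hlt,
	[REASON_CPUID] = handle_cpuid,
};

static const unsigned max_handlers = sizeof(handlers) / sizeof(*handlers);

static int dispatch(unsigned reason)
{
	if (reason < max_handlers && handlers[reason])
		return handlers[reason]();
	puts("unknown exit reason -> report to userspace");
	return 0;
}

int main(void)
{
	dispatch(REASON_CPUID);
	dispatch(7);             /* no handler registered */
	return 0;
}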
@@ -3600,23 +3761,18 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx)
3600#define Q "l" 3761#define Q "l"
3601#endif 3762#endif
3602 3763
3603static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3764static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
3604{ 3765{
3605 struct vcpu_vmx *vmx = to_vmx(vcpu); 3766 struct vcpu_vmx *vmx = to_vmx(vcpu);
3606 3767
3607 if (enable_ept && is_paging(vcpu)) {
3608 vmcs_writel(GUEST_CR3, vcpu->arch.cr3);
3609 ept_load_pdptrs(vcpu);
3610 }
3611 /* Record the guest's net vcpu time for enforced NMI injections. */ 3768 /* Record the guest's net vcpu time for enforced NMI injections. */
3612 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) 3769 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
3613 vmx->entry_time = ktime_get(); 3770 vmx->entry_time = ktime_get();
3614 3771
3615 /* Handle invalid guest state instead of entering VMX */ 3772 /* Don't enter VMX if guest state is invalid; let the exit handler
3616 if (vmx->emulation_required && emulate_invalid_guest_state) { 3773 keep emulating until we arrive back at a valid state */
3617 handle_invalid_guest_state(vcpu, kvm_run); 3774 if (vmx->emulation_required && emulate_invalid_guest_state)
3618 return; 3775 return;
3619 }
3620 3776
3621 if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty)) 3777 if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
3622 vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); 3778 vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
@@ -3636,9 +3792,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3636 */ 3792 */
3637 vmcs_writel(HOST_CR0, read_cr0()); 3793 vmcs_writel(HOST_CR0, read_cr0());
3638 3794
3639 if (vcpu->arch.switch_db_regs)
3640 set_debugreg(vcpu->arch.dr6, 6);
3641
3642 asm( 3795 asm(
3643 /* Store host registers */ 3796 /* Store host registers */
3644 "push %%"R"dx; push %%"R"bp;" 3797 "push %%"R"dx; push %%"R"bp;"
@@ -3739,9 +3892,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3739 | (1 << VCPU_EXREG_PDPTR)); 3892 | (1 << VCPU_EXREG_PDPTR));
3740 vcpu->arch.regs_dirty = 0; 3893 vcpu->arch.regs_dirty = 0;
3741 3894
3742 if (vcpu->arch.switch_db_regs)
3743 get_debugreg(vcpu->arch.dr6, 6);
3744
3745 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); 3895 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
3746 if (vmx->rmode.irq.pending) 3896 if (vmx->rmode.irq.pending)
3747 fixup_rmode_irq(vmx); 3897 fixup_rmode_irq(vmx);
@@ -3775,7 +3925,6 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
3775 __clear_bit(vmx->vpid, vmx_vpid_bitmap); 3925 __clear_bit(vmx->vpid, vmx_vpid_bitmap);
3776 spin_unlock(&vmx_vpid_lock); 3926 spin_unlock(&vmx_vpid_lock);
3777 vmx_free_vmcs(vcpu); 3927 vmx_free_vmcs(vcpu);
3778 kfree(vmx->host_msrs);
3779 kfree(vmx->guest_msrs); 3928 kfree(vmx->guest_msrs);
3780 kvm_vcpu_uninit(vcpu); 3929 kvm_vcpu_uninit(vcpu);
3781 kmem_cache_free(kvm_vcpu_cache, vmx); 3930 kmem_cache_free(kvm_vcpu_cache, vmx);
@@ -3802,10 +3951,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
3802 goto uninit_vcpu; 3951 goto uninit_vcpu;
3803 } 3952 }
3804 3953
3805 vmx->host_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
3806 if (!vmx->host_msrs)
3807 goto free_guest_msrs;
3808
3809 vmx->vmcs = alloc_vmcs(); 3954 vmx->vmcs = alloc_vmcs();
3810 if (!vmx->vmcs) 3955 if (!vmx->vmcs)
3811 goto free_msrs; 3956 goto free_msrs;
@@ -3836,8 +3981,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
3836free_vmcs: 3981free_vmcs:
3837 free_vmcs(vmx->vmcs); 3982 free_vmcs(vmx->vmcs);
3838free_msrs: 3983free_msrs:
3839 kfree(vmx->host_msrs);
3840free_guest_msrs:
3841 kfree(vmx->guest_msrs); 3984 kfree(vmx->guest_msrs);
3842uninit_vcpu: 3985uninit_vcpu:
3843 kvm_vcpu_uninit(&vmx->vcpu); 3986 kvm_vcpu_uninit(&vmx->vcpu);
@@ -3877,7 +4020,7 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
3877 * b. VT-d with snooping control feature: snooping control feature of 4020 * b. VT-d with snooping control feature: snooping control feature of
3878 * VT-d engine can guarantee the cache correctness. Just set it 4021 * VT-d engine can guarantee the cache correctness. Just set it
3879 * to WB to keep consistent with host. So the same as item 3. 4022 * to WB to keep consistent with host. So the same as item 3.
3880 * 3. EPT without VT-d: always map as WB and set IGMT=1 to keep 4023 * 3. EPT without VT-d: always map as WB and set IPAT=1 to keep
3881 * consistent with host MTRR 4024 * consistent with host MTRR
3882 */ 4025 */
3883 if (is_mmio) 4026 if (is_mmio)
@@ -3888,37 +4031,88 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
3888 VMX_EPT_MT_EPTE_SHIFT; 4031 VMX_EPT_MT_EPTE_SHIFT;
3889 else 4032 else
3890 ret = (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) 4033 ret = (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT)
3891 | VMX_EPT_IGMT_BIT; 4034 | VMX_EPT_IPAT_BIT;
3892 4035
3893 return ret; 4036 return ret;
3894} 4037}
3895 4038
4039#define _ER(x) { EXIT_REASON_##x, #x }
4040
3896static const struct trace_print_flags vmx_exit_reasons_str[] = { 4041static const struct trace_print_flags vmx_exit_reasons_str[] = {
3897 { EXIT_REASON_EXCEPTION_NMI, "exception" }, 4042 _ER(EXCEPTION_NMI),
3898 { EXIT_REASON_EXTERNAL_INTERRUPT, "ext_irq" }, 4043 _ER(EXTERNAL_INTERRUPT),
3899 { EXIT_REASON_TRIPLE_FAULT, "triple_fault" }, 4044 _ER(TRIPLE_FAULT),
3900 { EXIT_REASON_NMI_WINDOW, "nmi_window" }, 4045 _ER(PENDING_INTERRUPT),
3901 { EXIT_REASON_IO_INSTRUCTION, "io_instruction" }, 4046 _ER(NMI_WINDOW),
3902 { EXIT_REASON_CR_ACCESS, "cr_access" }, 4047 _ER(TASK_SWITCH),
3903 { EXIT_REASON_DR_ACCESS, "dr_access" }, 4048 _ER(CPUID),
3904 { EXIT_REASON_CPUID, "cpuid" }, 4049 _ER(HLT),
3905 { EXIT_REASON_MSR_READ, "rdmsr" }, 4050 _ER(INVLPG),
3906 { EXIT_REASON_MSR_WRITE, "wrmsr" }, 4051 _ER(RDPMC),
3907 { EXIT_REASON_PENDING_INTERRUPT, "interrupt_window" }, 4052 _ER(RDTSC),
3908 { EXIT_REASON_HLT, "halt" }, 4053 _ER(VMCALL),
3909 { EXIT_REASON_INVLPG, "invlpg" }, 4054 _ER(VMCLEAR),
3910 { EXIT_REASON_VMCALL, "hypercall" }, 4055 _ER(VMLAUNCH),
3911 { EXIT_REASON_TPR_BELOW_THRESHOLD, "tpr_below_thres" }, 4056 _ER(VMPTRLD),
3912 { EXIT_REASON_APIC_ACCESS, "apic_access" }, 4057 _ER(VMPTRST),
3913 { EXIT_REASON_WBINVD, "wbinvd" }, 4058 _ER(VMREAD),
3914 { EXIT_REASON_TASK_SWITCH, "task_switch" }, 4059 _ER(VMRESUME),
3915 { EXIT_REASON_EPT_VIOLATION, "ept_violation" }, 4060 _ER(VMWRITE),
4061 _ER(VMOFF),
4062 _ER(VMON),
4063 _ER(CR_ACCESS),
4064 _ER(DR_ACCESS),
4065 _ER(IO_INSTRUCTION),
4066 _ER(MSR_READ),
4067 _ER(MSR_WRITE),
4068 _ER(MWAIT_INSTRUCTION),
4069 _ER(MONITOR_INSTRUCTION),
4070 _ER(PAUSE_INSTRUCTION),
4071 _ER(MCE_DURING_VMENTRY),
4072 _ER(TPR_BELOW_THRESHOLD),
4073 _ER(APIC_ACCESS),
4074 _ER(EPT_VIOLATION),
4075 _ER(EPT_MISCONFIG),
4076 _ER(WBINVD),
3916 { -1, NULL } 4077 { -1, NULL }
3917}; 4078};
3918 4079
3919static bool vmx_gb_page_enable(void) 4080#undef _ER
4081
4082static int vmx_get_lpage_level(void)
3920{ 4083{
3921 return false; 4084 if (enable_ept && !cpu_has_vmx_ept_1g_page())
4085 return PT_DIRECTORY_LEVEL;
4086 else
4087 /* For shadow and EPT supported 1GB page */
4088 return PT_PDPE_LEVEL;
4089}
4090
4091static inline u32 bit(int bitno)
4092{
4093 return 1 << (bitno & 31);
4094}
4095
4096static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
4097{
4098 struct kvm_cpuid_entry2 *best;
4099 struct vcpu_vmx *vmx = to_vmx(vcpu);
4100 u32 exec_control;
4101
4102 vmx->rdtscp_enabled = false;
4103 if (vmx_rdtscp_supported()) {
4104 exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
4105 if (exec_control & SECONDARY_EXEC_RDTSCP) {
4106 best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
4107 if (best && (best->edx & bit(X86_FEATURE_RDTSCP)))
4108 vmx->rdtscp_enabled = true;
4109 else {
4110 exec_control &= ~SECONDARY_EXEC_RDTSCP;
4111 vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
4112 exec_control);
4113 }
4114 }
4115 }
3922} 4116}
3923 4117
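vmx_cpuid_update() keeps SECONDARY_EXEC_RDTSCP set only if guest CPUID leaf 0x80000001 advertises RDTSCP. A standalone sketch of that feature-bit test using the same bit() helper; treating EDX bit 27 as the RDTSCP bit is an assumption of this sketch:

/* Standalone sketch, not part of the patch: gate a feature on a guest
 * CPUID bit the way vmx_cpuid_update() gates RDTSCP. */
#include <stdio.h>
#include <stdint.h>

static uint32_t bit(int bitno)
{
	return 1u << (bitno & 31);   /* same helper as in the patch */
}

int main(void)
{
	uint32_t guest_80000001_edx = 0x28100800;  /* example CPUID value */

	if (guest_80000001_edx & bit(27))          /* assumed RDTSCP position */
		puts("guest advertises RDTSCP: leave SECONDARY_EXEC_RDTSCP set");
	else
		puts("guest lacks RDTSCP: clear SECONDARY_EXEC_RDTSCP");
	return 0;
}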
3924static struct kvm_x86_ops vmx_x86_ops = { 4118static struct kvm_x86_ops vmx_x86_ops = {
@@ -3947,6 +4141,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
3947 .set_segment = vmx_set_segment, 4141 .set_segment = vmx_set_segment,
3948 .get_cpl = vmx_get_cpl, 4142 .get_cpl = vmx_get_cpl,
3949 .get_cs_db_l_bits = vmx_get_cs_db_l_bits, 4143 .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
4144 .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits,
3950 .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits, 4145 .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
3951 .set_cr0 = vmx_set_cr0, 4146 .set_cr0 = vmx_set_cr0,
3952 .set_cr3 = vmx_set_cr3, 4147 .set_cr3 = vmx_set_cr3,
@@ -3959,6 +4154,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
3959 .cache_reg = vmx_cache_reg, 4154 .cache_reg = vmx_cache_reg,
3960 .get_rflags = vmx_get_rflags, 4155 .get_rflags = vmx_get_rflags,
3961 .set_rflags = vmx_set_rflags, 4156 .set_rflags = vmx_set_rflags,
4157 .fpu_activate = vmx_fpu_activate,
4158 .fpu_deactivate = vmx_fpu_deactivate,
3962 4159
3963 .tlb_flush = vmx_flush_tlb, 4160 .tlb_flush = vmx_flush_tlb,
3964 4161
@@ -3973,6 +4170,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
3973 .queue_exception = vmx_queue_exception, 4170 .queue_exception = vmx_queue_exception,
3974 .interrupt_allowed = vmx_interrupt_allowed, 4171 .interrupt_allowed = vmx_interrupt_allowed,
3975 .nmi_allowed = vmx_nmi_allowed, 4172 .nmi_allowed = vmx_nmi_allowed,
4173 .get_nmi_mask = vmx_get_nmi_mask,
4174 .set_nmi_mask = vmx_set_nmi_mask,
3976 .enable_nmi_window = enable_nmi_window, 4175 .enable_nmi_window = enable_nmi_window,
3977 .enable_irq_window = enable_irq_window, 4176 .enable_irq_window = enable_irq_window,
3978 .update_cr8_intercept = update_cr8_intercept, 4177 .update_cr8_intercept = update_cr8_intercept,
@@ -3982,12 +4181,21 @@ static struct kvm_x86_ops vmx_x86_ops = {
3982 .get_mt_mask = vmx_get_mt_mask, 4181 .get_mt_mask = vmx_get_mt_mask,
3983 4182
3984 .exit_reasons_str = vmx_exit_reasons_str, 4183 .exit_reasons_str = vmx_exit_reasons_str,
3985 .gb_page_enable = vmx_gb_page_enable, 4184 .get_lpage_level = vmx_get_lpage_level,
4185
4186 .cpuid_update = vmx_cpuid_update,
4187
4188 .rdtscp_supported = vmx_rdtscp_supported,
3986}; 4189};
3987 4190
3988static int __init vmx_init(void) 4191static int __init vmx_init(void)
3989{ 4192{
3990 int r; 4193 int r, i;
4194
4195 rdmsrl_safe(MSR_EFER, &host_efer);
4196
4197 for (i = 0; i < NR_VMX_MSR; ++i)
4198 kvm_define_shared_msr(i, vmx_msr_index[i]);
3991 4199
3992 vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL); 4200 vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL);
3993 if (!vmx_io_bitmap_a) 4201 if (!vmx_io_bitmap_a)
@@ -4049,8 +4257,6 @@ static int __init vmx_init(void)
4049 if (bypass_guest_pf) 4257 if (bypass_guest_pf)
4050 kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull); 4258 kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull);
4051 4259
4052 ept_sync_global();
4053
4054 return 0; 4260 return 0;
4055 4261
4056out3: 4262out3:
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ae07d261527c..c4f35b545c1d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -37,11 +37,15 @@
37#include <linux/iommu.h> 37#include <linux/iommu.h>
38#include <linux/intel-iommu.h> 38#include <linux/intel-iommu.h>
39#include <linux/cpufreq.h> 39#include <linux/cpufreq.h>
40#include <linux/user-return-notifier.h>
41#include <linux/srcu.h>
42#include <linux/slab.h>
40#include <trace/events/kvm.h> 43#include <trace/events/kvm.h>
41#undef TRACE_INCLUDE_FILE 44#undef TRACE_INCLUDE_FILE
42#define CREATE_TRACE_POINTS 45#define CREATE_TRACE_POINTS
43#include "trace.h" 46#include "trace.h"
44 47
48#include <asm/debugreg.h>
45#include <asm/uaccess.h> 49#include <asm/uaccess.h>
46#include <asm/msr.h> 50#include <asm/msr.h>
47#include <asm/desc.h> 51#include <asm/desc.h>
@@ -87,6 +91,25 @@ EXPORT_SYMBOL_GPL(kvm_x86_ops);
87int ignore_msrs = 0; 91int ignore_msrs = 0;
88module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR); 92module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR);
89 93
94#define KVM_NR_SHARED_MSRS 16
95
96struct kvm_shared_msrs_global {
97 int nr;
98 u32 msrs[KVM_NR_SHARED_MSRS];
99};
100
101struct kvm_shared_msrs {
102 struct user_return_notifier urn;
103 bool registered;
104 struct kvm_shared_msr_values {
105 u64 host;
106 u64 curr;
107 } values[KVM_NR_SHARED_MSRS];
108};
109
110static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
111static DEFINE_PER_CPU(struct kvm_shared_msrs, shared_msrs);
112
90struct kvm_stats_debugfs_item debugfs_entries[] = { 113struct kvm_stats_debugfs_item debugfs_entries[] = {
91 { "pf_fixed", VCPU_STAT(pf_fixed) }, 114 { "pf_fixed", VCPU_STAT(pf_fixed) },
92 { "pf_guest", VCPU_STAT(pf_guest) }, 115 { "pf_guest", VCPU_STAT(pf_guest) },
@@ -123,6 +146,83 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
123 { NULL } 146 { NULL }
124}; 147};
125 148
149static void kvm_on_user_return(struct user_return_notifier *urn)
150{
151 unsigned slot;
152 struct kvm_shared_msrs *locals
153 = container_of(urn, struct kvm_shared_msrs, urn);
154 struct kvm_shared_msr_values *values;
155
156 for (slot = 0; slot < shared_msrs_global.nr; ++slot) {
157 values = &locals->values[slot];
158 if (values->host != values->curr) {
159 wrmsrl(shared_msrs_global.msrs[slot], values->host);
160 values->curr = values->host;
161 }
162 }
163 locals->registered = false;
164 user_return_notifier_unregister(urn);
165}
166
167static void shared_msr_update(unsigned slot, u32 msr)
168{
169 struct kvm_shared_msrs *smsr;
170 u64 value;
171
172 smsr = &__get_cpu_var(shared_msrs);
173 /* only read; nobody should modify it at this time,
174 * so no lock is needed */
175 if (slot >= shared_msrs_global.nr) {
176 printk(KERN_ERR "kvm: invalid MSR slot!");
177 return;
178 }
179 rdmsrl_safe(msr, &value);
180 smsr->values[slot].host = value;
181 smsr->values[slot].curr = value;
182}
183
184void kvm_define_shared_msr(unsigned slot, u32 msr)
185{
186 if (slot >= shared_msrs_global.nr)
187 shared_msrs_global.nr = slot + 1;
188 shared_msrs_global.msrs[slot] = msr;
189 /* ensure shared_msrs_global has been updated before it is used */
190 smp_wmb();
191}
192EXPORT_SYMBOL_GPL(kvm_define_shared_msr);
193
194static void kvm_shared_msr_cpu_online(void)
195{
196 unsigned i;
197
198 for (i = 0; i < shared_msrs_global.nr; ++i)
199 shared_msr_update(i, shared_msrs_global.msrs[i]);
200}
201
202void kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
203{
204 struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs);
205
206 if (((value ^ smsr->values[slot].curr) & mask) == 0)
207 return;
208 smsr->values[slot].curr = value;
209 wrmsrl(shared_msrs_global.msrs[slot], value);
210 if (!smsr->registered) {
211 smsr->urn.on_user_return = kvm_on_user_return;
212 user_return_notifier_register(&smsr->urn);
213 smsr->registered = true;
214 }
215}
216EXPORT_SYMBOL_GPL(kvm_set_shared_msr);
217
218static void drop_user_return_notifiers(void *ignore)
219{
220 struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs);
221
222 if (smsr->registered)
223 kvm_on_user_return(&smsr->urn);
224}
225
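The shared-MSR code above caches the host value of each registered MSR, performs the wrmsr only when the wanted value differs from the current one, and restores host values from a user-return notifier. A standalone sketch of that write-if-changed / restore-on-return pattern with wrmsr stubbed out (fake_wrmsr() and the slot contents are invented):

/* Standalone sketch, not part of the patch: per-slot host/current values,
 * lazy MSR writes, and a deferred restore on return to userspace. */
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define NR_SLOTS 2

struct slot { uint32_t msr; uint64_t host, curr; };

static struct slot slots[NR_SLOTS] = {
	{ 0xc0000080u, 0xd01, 0xd01 },
	{ 0xc0000081u, 0x123, 0x123 },
};
static bool restore_pending;

static void fake_wrmsr(uint32_t msr, uint64_t val)
{
	printf("wrmsr(%#x, %#llx)\n", msr, (unsigned long long)val);
}

static void set_shared_msr(int slot, uint64_t value)
{
	if (slots[slot].curr == value)
		return;                      /* unchanged: skip the expensive wrmsr */
	slots[slot].curr = value;
	fake_wrmsr(slots[slot].msr, value);
	restore_pending = true;              /* arm the restore for user return */
}

static void on_user_return(void)
{
	for (int i = 0; i < NR_SLOTS; i++)
		if (slots[i].curr != slots[i].host) {
			fake_wrmsr(slots[i].msr, slots[i].host);
			slots[i].curr = slots[i].host;
		}
	restore_pending = false;
}

int main(void)
{
	set_shared_msr(0, 0xd01);    /* same as current value: no wrmsr */
	set_shared_msr(1, 0x456);    /* differs: written, restore armed */
	if (restore_pending)
		on_user_return();        /* host value written back once */
	return 0;
}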
126unsigned long segment_base(u16 selector) 226unsigned long segment_base(u16 selector)
127{ 227{
128 struct descriptor_table gdt; 228 struct descriptor_table gdt;
@@ -170,12 +270,68 @@ void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
170} 270}
171EXPORT_SYMBOL_GPL(kvm_set_apic_base); 271EXPORT_SYMBOL_GPL(kvm_set_apic_base);
172 272
273#define EXCPT_BENIGN 0
274#define EXCPT_CONTRIBUTORY 1
275#define EXCPT_PF 2
276
277static int exception_class(int vector)
278{
279 switch (vector) {
280 case PF_VECTOR:
281 return EXCPT_PF;
282 case DE_VECTOR:
283 case TS_VECTOR:
284 case NP_VECTOR:
285 case SS_VECTOR:
286 case GP_VECTOR:
287 return EXCPT_CONTRIBUTORY;
288 default:
289 break;
290 }
291 return EXCPT_BENIGN;
292}
293
294static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
295 unsigned nr, bool has_error, u32 error_code)
296{
297 u32 prev_nr;
298 int class1, class2;
299
300 if (!vcpu->arch.exception.pending) {
301 queue:
302 vcpu->arch.exception.pending = true;
303 vcpu->arch.exception.has_error_code = has_error;
304 vcpu->arch.exception.nr = nr;
305 vcpu->arch.exception.error_code = error_code;
306 return;
307 }
308
309 /* an exception is already pending; see how the two combine */
310 prev_nr = vcpu->arch.exception.nr;
311 if (prev_nr == DF_VECTOR) {
312 /* triple fault -> shutdown */
313 set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
314 return;
315 }
316 class1 = exception_class(prev_nr);
317 class2 = exception_class(nr);
318 if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY)
319 || (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
320 /* generate double fault per SDM Table 5-5 */
321 vcpu->arch.exception.pending = true;
322 vcpu->arch.exception.has_error_code = true;
323 vcpu->arch.exception.nr = DF_VECTOR;
324 vcpu->arch.exception.error_code = 0;
325 } else
326 /* replace the previous exception with the new one in the hope
327 that re-executing the instruction will regenerate the lost
328 exception */
329 goto queue;
330}
331
173void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr) 332void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
174{ 333{
175 WARN_ON(vcpu->arch.exception.pending); 334 kvm_multiple_exception(vcpu, nr, false, 0);
176 vcpu->arch.exception.pending = true;
177 vcpu->arch.exception.has_error_code = false;
178 vcpu->arch.exception.nr = nr;
179} 335}
180EXPORT_SYMBOL_GPL(kvm_queue_exception); 336EXPORT_SYMBOL_GPL(kvm_queue_exception);
181 337
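kvm_multiple_exception() combines a newly raised exception with one already pending using the benign/contributory/page-fault classes: contributory on contributory, or anything but benign on a page fault, is promoted to #DF, and a pending #DF escalates to a triple fault. A standalone sketch of just the class-combination decision:

/* Standalone sketch, not part of the patch: the class-combination rule
 * used by kvm_multiple_exception(). Vector numbers are the usual x86
 * ones; a pending #DF (which escalates to a triple fault) and the
 * error-code bookkeeping are not modelled. */
#include <stdio.h>

enum { BENIGN, CONTRIBUTORY, PF_CLASS };

static int exception_class(int vector)
{
	switch (vector) {
	case 14:                                     /* #PF */
		return PF_CLASS;
	case 0: case 10: case 11: case 12: case 13:  /* #DE #TS #NP #SS #GP */
		return CONTRIBUTORY;
	default:
		return BENIGN;
	}
}

/* vector left pending when 'nr' is raised while 'prev' is pending */
static int combine(int prev, int nr)
{
	int c1 = exception_class(prev), c2 = exception_class(nr);

	if ((c1 == CONTRIBUTORY && c2 == CONTRIBUTORY) ||
	    (c1 == PF_CLASS && c2 != BENIGN))
		return 8;    /* promote to #DF, per SDM Table 5-5 */
	return nr;           /* otherwise replace and let re-execution retry */
}

int main(void)
{
	printf("#GP then #GP -> %d\n", combine(13, 13));  /* 8 (#DF) */
	printf("#PF then #GP -> %d\n", combine(14, 13));  /* 8 (#DF) */
	printf("#DB then #GP -> %d\n", combine(1, 13));   /* 13 */
	return 0;
}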
@@ -183,25 +339,6 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
183 u32 error_code) 339 u32 error_code)
184{ 340{
185 ++vcpu->stat.pf_guest; 341 ++vcpu->stat.pf_guest;
186
187 if (vcpu->arch.exception.pending) {
188 switch(vcpu->arch.exception.nr) {
189 case DF_VECTOR:
190 /* triple fault -> shutdown */
191 set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
192 return;
193 case PF_VECTOR:
194 vcpu->arch.exception.nr = DF_VECTOR;
195 vcpu->arch.exception.error_code = 0;
196 return;
197 default:
198 /* replace previous exception with a new one in a hope
199 that instruction re-execution will regenerate lost
200 exception */
201 vcpu->arch.exception.pending = false;
202 break;
203 }
204 }
205 vcpu->arch.cr2 = addr; 342 vcpu->arch.cr2 = addr;
206 kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); 343 kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
207} 344}
@@ -214,11 +351,7 @@ EXPORT_SYMBOL_GPL(kvm_inject_nmi);
214 351
215void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) 352void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
216{ 353{
217 WARN_ON(vcpu->arch.exception.pending); 354 kvm_multiple_exception(vcpu, nr, true, error_code);
218 vcpu->arch.exception.pending = true;
219 vcpu->arch.exception.has_error_code = true;
220 vcpu->arch.exception.nr = nr;
221 vcpu->arch.exception.error_code = error_code;
222} 355}
223EXPORT_SYMBOL_GPL(kvm_queue_exception_e); 356EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
224 357
@@ -296,41 +429,38 @@ out:
296 429
297void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 430void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
298{ 431{
299 if (cr0 & CR0_RESERVED_BITS) { 432 cr0 |= X86_CR0_ET;
300 printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n", 433
301 cr0, vcpu->arch.cr0); 434#ifdef CONFIG_X86_64
435 if (cr0 & 0xffffffff00000000UL) {
302 kvm_inject_gp(vcpu, 0); 436 kvm_inject_gp(vcpu, 0);
303 return; 437 return;
304 } 438 }
439#endif
440
441 cr0 &= ~CR0_RESERVED_BITS;
305 442
306 if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) { 443 if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
307 printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
308 kvm_inject_gp(vcpu, 0); 444 kvm_inject_gp(vcpu, 0);
309 return; 445 return;
310 } 446 }
311 447
312 if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) { 448 if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
313 printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
314 "and a clear PE flag\n");
315 kvm_inject_gp(vcpu, 0); 449 kvm_inject_gp(vcpu, 0);
316 return; 450 return;
317 } 451 }
318 452
319 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { 453 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
320#ifdef CONFIG_X86_64 454#ifdef CONFIG_X86_64
321 if ((vcpu->arch.shadow_efer & EFER_LME)) { 455 if ((vcpu->arch.efer & EFER_LME)) {
322 int cs_db, cs_l; 456 int cs_db, cs_l;
323 457
324 if (!is_pae(vcpu)) { 458 if (!is_pae(vcpu)) {
325 printk(KERN_DEBUG "set_cr0: #GP, start paging "
326 "in long mode while PAE is disabled\n");
327 kvm_inject_gp(vcpu, 0); 459 kvm_inject_gp(vcpu, 0);
328 return; 460 return;
329 } 461 }
330 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 462 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
331 if (cs_l) { 463 if (cs_l) {
332 printk(KERN_DEBUG "set_cr0: #GP, start paging "
333 "in long mode while CS.L == 1\n");
334 kvm_inject_gp(vcpu, 0); 464 kvm_inject_gp(vcpu, 0);
335 return; 465 return;
336 466
@@ -338,8 +468,6 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
338 } else 468 } else
339#endif 469#endif
340 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) { 470 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
341 printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
342 "reserved bits\n");
343 kvm_inject_gp(vcpu, 0); 471 kvm_inject_gp(vcpu, 0);
344 return; 472 return;
345 } 473 }
@@ -356,38 +484,33 @@ EXPORT_SYMBOL_GPL(kvm_set_cr0);
356 484
357void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) 485void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
358{ 486{
359 kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)); 487 kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0ful) | (msw & 0x0f));
360} 488}
361EXPORT_SYMBOL_GPL(kvm_lmsw); 489EXPORT_SYMBOL_GPL(kvm_lmsw);
362 490
363void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 491void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
364{ 492{
365 unsigned long old_cr4 = vcpu->arch.cr4; 493 unsigned long old_cr4 = kvm_read_cr4(vcpu);
366 unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE; 494 unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
367 495
368 if (cr4 & CR4_RESERVED_BITS) { 496 if (cr4 & CR4_RESERVED_BITS) {
369 printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
370 kvm_inject_gp(vcpu, 0); 497 kvm_inject_gp(vcpu, 0);
371 return; 498 return;
372 } 499 }
373 500
374 if (is_long_mode(vcpu)) { 501 if (is_long_mode(vcpu)) {
375 if (!(cr4 & X86_CR4_PAE)) { 502 if (!(cr4 & X86_CR4_PAE)) {
376 printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
377 "in long mode\n");
378 kvm_inject_gp(vcpu, 0); 503 kvm_inject_gp(vcpu, 0);
379 return; 504 return;
380 } 505 }
381 } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) 506 } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
382 && ((cr4 ^ old_cr4) & pdptr_bits) 507 && ((cr4 ^ old_cr4) & pdptr_bits)
383 && !load_pdptrs(vcpu, vcpu->arch.cr3)) { 508 && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
384 printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
385 kvm_inject_gp(vcpu, 0); 509 kvm_inject_gp(vcpu, 0);
386 return; 510 return;
387 } 511 }
388 512
389 if (cr4 & X86_CR4_VMXE) { 513 if (cr4 & X86_CR4_VMXE) {
390 printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
391 kvm_inject_gp(vcpu, 0); 514 kvm_inject_gp(vcpu, 0);
392 return; 515 return;
393 } 516 }
@@ -408,21 +531,16 @@ void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
408 531
409 if (is_long_mode(vcpu)) { 532 if (is_long_mode(vcpu)) {
410 if (cr3 & CR3_L_MODE_RESERVED_BITS) { 533 if (cr3 & CR3_L_MODE_RESERVED_BITS) {
411 printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
412 kvm_inject_gp(vcpu, 0); 534 kvm_inject_gp(vcpu, 0);
413 return; 535 return;
414 } 536 }
415 } else { 537 } else {
416 if (is_pae(vcpu)) { 538 if (is_pae(vcpu)) {
417 if (cr3 & CR3_PAE_RESERVED_BITS) { 539 if (cr3 & CR3_PAE_RESERVED_BITS) {
418 printk(KERN_DEBUG
419 "set_cr3: #GP, reserved bits\n");
420 kvm_inject_gp(vcpu, 0); 540 kvm_inject_gp(vcpu, 0);
421 return; 541 return;
422 } 542 }
423 if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) { 543 if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
424 printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
425 "reserved bits\n");
426 kvm_inject_gp(vcpu, 0); 544 kvm_inject_gp(vcpu, 0);
427 return; 545 return;
428 } 546 }
@@ -454,7 +572,6 @@ EXPORT_SYMBOL_GPL(kvm_set_cr3);
454void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) 572void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
455{ 573{
456 if (cr8 & CR8_RESERVED_BITS) { 574 if (cr8 & CR8_RESERVED_BITS) {
457 printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
458 kvm_inject_gp(vcpu, 0); 575 kvm_inject_gp(vcpu, 0);
459 return; 576 return;
460 } 577 }
@@ -484,16 +601,21 @@ static inline u32 bit(int bitno)
484 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. 601 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
485 * 602 *
486 * This list is modified at module load time to reflect the 603 * This list is modified at module load time to reflect the
487 * capabilities of the host cpu. 604 * capabilities of the host cpu. This capabilities test skips MSRs that are
605 * kvm-specific. Those are put at the beginning of the list.
488 */ 606 */
607
608#define KVM_SAVE_MSRS_BEGIN 5
489static u32 msrs_to_save[] = { 609static u32 msrs_to_save[] = {
610 MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
611 HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
612 HV_X64_MSR_APIC_ASSIST_PAGE,
490 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, 613 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
491 MSR_K6_STAR, 614 MSR_K6_STAR,
492#ifdef CONFIG_X86_64 615#ifdef CONFIG_X86_64
493 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, 616 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
494#endif 617#endif
495 MSR_IA32_TSC, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, 618 MSR_IA32_TSC, MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
496 MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
497}; 619};
498 620
499static unsigned num_msrs_to_save; 621static unsigned num_msrs_to_save;
@@ -505,15 +627,12 @@ static u32 emulated_msrs[] = {
505static void set_efer(struct kvm_vcpu *vcpu, u64 efer) 627static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
506{ 628{
507 if (efer & efer_reserved_bits) { 629 if (efer & efer_reserved_bits) {
508 printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
509 efer);
510 kvm_inject_gp(vcpu, 0); 630 kvm_inject_gp(vcpu, 0);
511 return; 631 return;
512 } 632 }
513 633
514 if (is_paging(vcpu) 634 if (is_paging(vcpu)
515 && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) { 635 && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) {
516 printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
517 kvm_inject_gp(vcpu, 0); 636 kvm_inject_gp(vcpu, 0);
518 return; 637 return;
519 } 638 }
@@ -523,7 +642,6 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
523 642
524 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); 643 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
525 if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) { 644 if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) {
526 printk(KERN_DEBUG "set_efer: #GP, enable FFXSR w/o CPUID capability\n");
527 kvm_inject_gp(vcpu, 0); 645 kvm_inject_gp(vcpu, 0);
528 return; 646 return;
529 } 647 }
@@ -534,7 +652,6 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
534 652
535 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); 653 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
536 if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) { 654 if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) {
537 printk(KERN_DEBUG "set_efer: #GP, enable SVM w/o SVM\n");
538 kvm_inject_gp(vcpu, 0); 655 kvm_inject_gp(vcpu, 0);
539 return; 656 return;
540 } 657 }
@@ -543,9 +660,9 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
543 kvm_x86_ops->set_efer(vcpu, efer); 660 kvm_x86_ops->set_efer(vcpu, efer);
544 661
545 efer &= ~EFER_LMA; 662 efer &= ~EFER_LMA;
546 efer |= vcpu->arch.shadow_efer & EFER_LMA; 663 efer |= vcpu->arch.efer & EFER_LMA;
547 664
548 vcpu->arch.shadow_efer = efer; 665 vcpu->arch.efer = efer;
549 666
550 vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled; 667 vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
551 kvm_mmu_reset_context(vcpu); 668 kvm_mmu_reset_context(vcpu);
@@ -580,7 +697,7 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
580{ 697{
581 static int version; 698 static int version;
582 struct pvclock_wall_clock wc; 699 struct pvclock_wall_clock wc;
583 struct timespec now, sys, boot; 700 struct timespec boot;
584 701
585 if (!wall_clock) 702 if (!wall_clock)
586 return; 703 return;
@@ -595,9 +712,7 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
595 * wall clock specified here. guest system time equals host 712 * wall clock specified here. guest system time equals host
596 * system time for us, thus we must fill in host boot time here. 713 * system time for us, thus we must fill in host boot time here.
597 */ 714 */
598 now = current_kernel_time(); 715 getboottime(&boot);
599 ktime_get_ts(&sys);
600 boot = ns_to_timespec(timespec_to_ns(&now) - timespec_to_ns(&sys));
601 716
602 wc.sec = boot.tv_sec; 717 wc.sec = boot.tv_sec;
603 wc.nsec = boot.tv_nsec; 718 wc.nsec = boot.tv_nsec;
@@ -672,12 +787,14 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
672 local_irq_save(flags); 787 local_irq_save(flags);
673 kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp); 788 kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp);
674 ktime_get_ts(&ts); 789 ktime_get_ts(&ts);
790 monotonic_to_bootbased(&ts);
675 local_irq_restore(flags); 791 local_irq_restore(flags);
676 792
677 /* With all the info we got, fill in the values */ 793 /* With all the info we got, fill in the values */
678 794
679 vcpu->hv_clock.system_time = ts.tv_nsec + 795 vcpu->hv_clock.system_time = ts.tv_nsec +
680 (NSEC_PER_SEC * (u64)ts.tv_sec); 796 (NSEC_PER_SEC * (u64)ts.tv_sec) + v->kvm->arch.kvmclock_offset;
797
681 /* 798 /*
682 * The interface expects us to write an even number signaling that the 799 * The interface expects us to write an even number signaling that the
683 * update is finished. Since the guest won't see the intermediate 800 * update is finished. Since the guest won't see the intermediate
@@ -823,9 +940,13 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
823 if (msr >= MSR_IA32_MC0_CTL && 940 if (msr >= MSR_IA32_MC0_CTL &&
824 msr < MSR_IA32_MC0_CTL + 4 * bank_num) { 941 msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
825 u32 offset = msr - MSR_IA32_MC0_CTL; 942 u32 offset = msr - MSR_IA32_MC0_CTL;
826 /* only 0 or all 1s can be written to IA32_MCi_CTL */ 943 /* only 0 or all 1s can be written to IA32_MCi_CTL
944 * some Linux kernels, though, clear bit 10 in bank 4 to
945 * work around a BIOS/GART TBL issue on AMD K8s; ignore
946 * this to avoid an uncaught #GP in the guest
947 */
827 if ((offset & 0x3) == 0 && 948 if ((offset & 0x3) == 0 &&
828 data != 0 && data != ~(u64)0) 949 data != 0 && (data | (1 << 10)) != ~(u64)0)
829 return -1; 950 return -1;
830 vcpu->arch.mce_banks[offset] = data; 951 vcpu->arch.mce_banks[offset] = data;
831 break; 952 break;
@@ -835,6 +956,132 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
835 return 0; 956 return 0;
836} 957}
837 958
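The relaxed IA32_MCi_CTL check accepts 0, all ones, or all ones with bit 10 clear, the pattern some Linux guests write on AMD K8. A standalone sketch of that acceptance test:

/* Standalone sketch, not part of the patch: which values the patched
 * check accepts for an IA32_MCi_CTL write. */
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

static bool mci_ctl_write_ok(uint64_t data)
{
	/* 0 and all ones are the architectural values; additionally accept
	 * all ones with bit 10 clear (AMD K8 GART erratum workaround) */
	return data == 0 || (data | (1ull << 10)) == ~(uint64_t)0;
}

int main(void)
{
	printf("%d\n", mci_ctl_write_ok(0));                             /* 1 */
	printf("%d\n", mci_ctl_write_ok(~(uint64_t)0));                  /* 1 */
	printf("%d\n", mci_ctl_write_ok(~(uint64_t)0 & ~(1ull << 10)));  /* 1 */
	printf("%d\n", mci_ctl_write_ok(0x123));                         /* 0 */
	return 0;
}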
959static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data)
960{
961 struct kvm *kvm = vcpu->kvm;
962 int lm = is_long_mode(vcpu);
963 u8 *blob_addr = lm ? (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_64
964 : (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_32;
965 u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64
966 : kvm->arch.xen_hvm_config.blob_size_32;
967 u32 page_num = data & ~PAGE_MASK;
968 u64 page_addr = data & PAGE_MASK;
969 u8 *page;
970 int r;
971
972 r = -E2BIG;
973 if (page_num >= blob_size)
974 goto out;
975 r = -ENOMEM;
976 page = kzalloc(PAGE_SIZE, GFP_KERNEL);
977 if (!page)
978 goto out;
979 r = -EFAULT;
980 if (copy_from_user(page, blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE))
981 goto out_free;
982 if (kvm_write_guest(kvm, page_addr, page, PAGE_SIZE))
983 goto out_free;
984 r = 0;
985out_free:
986 kfree(page);
987out:
988 return r;
989}
990
991static bool kvm_hv_hypercall_enabled(struct kvm *kvm)
992{
993 return kvm->arch.hv_hypercall & HV_X64_MSR_HYPERCALL_ENABLE;
994}
995
996static bool kvm_hv_msr_partition_wide(u32 msr)
997{
998 bool r = false;
999 switch (msr) {
1000 case HV_X64_MSR_GUEST_OS_ID:
1001 case HV_X64_MSR_HYPERCALL:
1002 r = true;
1003 break;
1004 }
1005
1006 return r;
1007}
1008
1009static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1010{
1011 struct kvm *kvm = vcpu->kvm;
1012
1013 switch (msr) {
1014 case HV_X64_MSR_GUEST_OS_ID:
1015 kvm->arch.hv_guest_os_id = data;
1016 /* setting guest os id to zero disables hypercall page */
1017 if (!kvm->arch.hv_guest_os_id)
1018 kvm->arch.hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE;
1019 break;
1020 case HV_X64_MSR_HYPERCALL: {
1021 u64 gfn;
1022 unsigned long addr;
1023 u8 instructions[4];
1024
1025 /* if the guest os id is not set, the hypercall page should remain disabled */
1026 if (!kvm->arch.hv_guest_os_id)
1027 break;
1028 if (!(data & HV_X64_MSR_HYPERCALL_ENABLE)) {
1029 kvm->arch.hv_hypercall = data;
1030 break;
1031 }
1032 gfn = data >> HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT;
1033 addr = gfn_to_hva(kvm, gfn);
1034 if (kvm_is_error_hva(addr))
1035 return 1;
1036 kvm_x86_ops->patch_hypercall(vcpu, instructions);
1037 ((unsigned char *)instructions)[3] = 0xc3; /* ret */
1038 if (copy_to_user((void __user *)addr, instructions, 4))
1039 return 1;
1040 kvm->arch.hv_hypercall = data;
1041 break;
1042 }
1043 default:
1044 pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x "
1045 "data 0x%llx\n", msr, data);
1046 return 1;
1047 }
1048 return 0;
1049}
1050
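The HV_X64_MSR_HYPERCALL handler builds the hypercall page from the vendor hypercall instruction plus a trailing ret. A standalone sketch of the resulting 4-byte stub, assuming the vmcall encoding that vmx_patch_hypercall() emits (0f 01 c1):

/* Standalone sketch, not part of the patch: the 4-byte stub placed at the
 * start of the Hyper-V hypercall page on a VMX host. */
#include <stdio.h>

int main(void)
{
	unsigned char instructions[4];

	/* vmx_patch_hypercall() emits vmcall (assumed encoding 0f 01 c1)... */
	instructions[0] = 0x0f;
	instructions[1] = 0x01;
	instructions[2] = 0xc1;
	/* ...and set_msr_hyperv_pw() appends ret so the guest can call the page */
	instructions[3] = 0xc3;

	for (int i = 0; i < 4; i++)
		printf("%02x ", instructions[i]);
	printf("\n");
	return 0;
}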
1051static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1052{
1053 switch (msr) {
1054 case HV_X64_MSR_APIC_ASSIST_PAGE: {
1055 unsigned long addr;
1056
1057 if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) {
1058 vcpu->arch.hv_vapic = data;
1059 break;
1060 }
1061 addr = gfn_to_hva(vcpu->kvm, data >>
1062 HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT);
1063 if (kvm_is_error_hva(addr))
1064 return 1;
1065 if (clear_user((void __user *)addr, PAGE_SIZE))
1066 return 1;
1067 vcpu->arch.hv_vapic = data;
1068 break;
1069 }
1070 case HV_X64_MSR_EOI:
1071 return kvm_hv_vapic_msr_write(vcpu, APIC_EOI, data);
1072 case HV_X64_MSR_ICR:
1073 return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data);
1074 case HV_X64_MSR_TPR:
1075 return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data);
1076 default:
1077 pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x "
1078 "data 0x%llx\n", msr, data);
1079 return 1;
1080 }
1081
1082 return 0;
1083}
1084
838int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) 1085int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
839{ 1086{
840 switch (msr) { 1087 switch (msr) {
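The xen_hvm_config() helper added above copies one PAGE_SIZE chunk of the userspace-registered hypercall blob into the guest each time the guest writes the MSR configured through KVM_XEN_HVM_CONFIG: the low 12 bits of the written value select the blob page, the remaining bits give the page-aligned destination address. A minimal guest-side sketch of composing that value, not taken from this commit; the MSR number and the wrmsr64() wrapper are assumptions:

#include <stdint.h>

#define XEN_HVM_CFG_MSR   0x40000200u    /* assumed: whatever userspace registered */
#define XEN_PAGE_SIZE     4096ULL

/* ring-0 guest helper: plain WRMSR */
static inline void wrmsr64(uint32_t msr, uint64_t value)
{
        __asm__ volatile("wrmsr" : :
                         "c"(msr), "a"((uint32_t)value), "d"((uint32_t)(value >> 32)));
}

/* ask the hypervisor to copy blob page 'idx' to guest-physical 'dest_gpa' */
static void xen_hvm_load_blob_page(uint64_t dest_gpa, unsigned int idx)
{
        wrmsr64(XEN_HVM_CFG_MSR,
                (dest_gpa & ~(XEN_PAGE_SIZE - 1)) | (idx & (XEN_PAGE_SIZE - 1)));
}
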
@@ -949,7 +1196,19 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
949 pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " 1196 pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
950 "0x%x data 0x%llx\n", msr, data); 1197 "0x%x data 0x%llx\n", msr, data);
951 break; 1198 break;
1199 case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
1200 if (kvm_hv_msr_partition_wide(msr)) {
1201 int r;
1202 mutex_lock(&vcpu->kvm->lock);
1203 r = set_msr_hyperv_pw(vcpu, msr, data);
1204 mutex_unlock(&vcpu->kvm->lock);
1205 return r;
1206 } else
1207 return set_msr_hyperv(vcpu, msr, data);
1208 break;
952 default: 1209 default:
1210 if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
1211 return xen_hvm_config(vcpu, data);
953 if (!ignore_msrs) { 1212 if (!ignore_msrs) {
954 pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", 1213 pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n",
955 msr, data); 1214 msr, data);
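Following the dispatch added above, a Hyper-V-aware guest enables the hypercall page with two MSR writes: it publishes a guest OS identity first (a zero id keeps the page disabled), then writes the page's guest-physical address with the enable bit set, at which point set_msr_hyperv_pw() patches a vmcall/ret stub into that page. A guest-side sketch under the same assumptions as the previous example; the MSR numbers follow the Hyper-V spec and wrmsr64() is the assumed helper from above:

#define HV_X64_MSR_GUEST_OS_ID      0x40000000u
#define HV_X64_MSR_HYPERCALL        0x40000001u
#define HV_X64_MSR_HYPERCALL_ENABLE 0x0001ULL

static void hv_enable_hypercall_page(uint64_t page_gpa, uint64_t os_id)
{
        /* a zero OS id keeps the hypercall page disabled, so set it first */
        wrmsr64(HV_X64_MSR_GUEST_OS_ID, os_id);
        /* bits 12+ carry the page frame, bit 0 enables the page */
        wrmsr64(HV_X64_MSR_HYPERCALL,
                (page_gpa & ~0xfffULL) | HV_X64_MSR_HYPERCALL_ENABLE);
}
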
@@ -1046,6 +1305,54 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1046 return 0; 1305 return 0;
1047} 1306}
1048 1307
1308static int get_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1309{
1310 u64 data = 0;
1311 struct kvm *kvm = vcpu->kvm;
1312
1313 switch (msr) {
1314 case HV_X64_MSR_GUEST_OS_ID:
1315 data = kvm->arch.hv_guest_os_id;
1316 break;
1317 case HV_X64_MSR_HYPERCALL:
1318 data = kvm->arch.hv_hypercall;
1319 break;
1320 default:
1321 pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
1322 return 1;
1323 }
1324
1325 *pdata = data;
1326 return 0;
1327}
1328
1329static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1330{
1331 u64 data = 0;
1332
1333 switch (msr) {
1334 case HV_X64_MSR_VP_INDEX: {
1335 int r;
1336 struct kvm_vcpu *v;
1337 kvm_for_each_vcpu(r, v, vcpu->kvm)
1338 if (v == vcpu)
1339 data = r;
1340 break;
1341 }
1342 case HV_X64_MSR_EOI:
1343 return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata);
1344 case HV_X64_MSR_ICR:
1345 return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata);
1346 case HV_X64_MSR_TPR:
1347 return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata);
1348 default:
1349 pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
1350 return 1;
1351 }
1352 *pdata = data;
1353 return 0;
1354}
1355
1049int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) 1356int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1050{ 1357{
1051 u64 data; 1358 u64 data;
@@ -1097,7 +1404,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1097 data |= (((uint64_t)4ULL) << 40); 1404 data |= (((uint64_t)4ULL) << 40);
1098 break; 1405 break;
1099 case MSR_EFER: 1406 case MSR_EFER:
1100 data = vcpu->arch.shadow_efer; 1407 data = vcpu->arch.efer;
1101 break; 1408 break;
1102 case MSR_KVM_WALL_CLOCK: 1409 case MSR_KVM_WALL_CLOCK:
1103 data = vcpu->kvm->arch.wall_clock; 1410 data = vcpu->kvm->arch.wall_clock;
@@ -1112,6 +1419,16 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1112 case MSR_IA32_MCG_STATUS: 1419 case MSR_IA32_MCG_STATUS:
1113 case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: 1420 case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
1114 return get_msr_mce(vcpu, msr, pdata); 1421 return get_msr_mce(vcpu, msr, pdata);
1422 case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
1423 if (kvm_hv_msr_partition_wide(msr)) {
1424 int r;
1425 mutex_lock(&vcpu->kvm->lock);
1426 r = get_msr_hyperv_pw(vcpu, msr, pdata);
1427 mutex_unlock(&vcpu->kvm->lock);
1428 return r;
1429 } else
1430 return get_msr_hyperv(vcpu, msr, pdata);
1431 break;
1115 default: 1432 default:
1116 if (!ignore_msrs) { 1433 if (!ignore_msrs) {
1117 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); 1434 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
@@ -1137,15 +1454,15 @@ static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
1137 int (*do_msr)(struct kvm_vcpu *vcpu, 1454 int (*do_msr)(struct kvm_vcpu *vcpu,
1138 unsigned index, u64 *data)) 1455 unsigned index, u64 *data))
1139{ 1456{
1140 int i; 1457 int i, idx;
1141 1458
1142 vcpu_load(vcpu); 1459 vcpu_load(vcpu);
1143 1460
1144 down_read(&vcpu->kvm->slots_lock); 1461 idx = srcu_read_lock(&vcpu->kvm->srcu);
1145 for (i = 0; i < msrs->nmsrs; ++i) 1462 for (i = 0; i < msrs->nmsrs; ++i)
1146 if (do_msr(vcpu, entries[i].index, &entries[i].data)) 1463 if (do_msr(vcpu, entries[i].index, &entries[i].data))
1147 break; 1464 break;
1148 up_read(&vcpu->kvm->slots_lock); 1465 srcu_read_unlock(&vcpu->kvm->srcu, idx);
1149 1466
1150 vcpu_put(vcpu); 1467 vcpu_put(vcpu);
1151 1468
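The __msr_io() hunk above shows the locking conversion that runs through the whole patch: code that only reads memslot-derived state drops the slots_lock rwsem in favour of an SRCU read-side critical section against kvm->srcu. A stand-alone sketch of that read-side pattern; the function and its body are illustrative, only the srcu_read_lock()/srcu_read_unlock() pairing mirrors the patch:

#include <linux/kvm_host.h>
#include <linux/srcu.h>

static void example_slots_reader(struct kvm *kvm)
{
        int idx;

        idx = srcu_read_lock(&kvm->srcu);       /* was: down_read(&kvm->slots_lock) */
        /* ... dereference kvm->memslots or kvm->arch.aliases here ... */
        srcu_read_unlock(&kvm->srcu, idx);      /* was: up_read(&kvm->slots_lock) */
}
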
@@ -1224,6 +1541,14 @@ int kvm_dev_ioctl_check_extension(long ext)
1224 case KVM_CAP_PIT2: 1541 case KVM_CAP_PIT2:
1225 case KVM_CAP_PIT_STATE2: 1542 case KVM_CAP_PIT_STATE2:
1226 case KVM_CAP_SET_IDENTITY_MAP_ADDR: 1543 case KVM_CAP_SET_IDENTITY_MAP_ADDR:
1544 case KVM_CAP_XEN_HVM:
1545 case KVM_CAP_ADJUST_CLOCK:
1546 case KVM_CAP_VCPU_EVENTS:
1547 case KVM_CAP_HYPERV:
1548 case KVM_CAP_HYPERV_VAPIC:
1549 case KVM_CAP_HYPERV_SPIN:
1550 case KVM_CAP_PCI_SEGMENT:
1551 case KVM_CAP_X86_ROBUST_SINGLESTEP:
1227 r = 1; 1552 r = 1;
1228 break; 1553 break;
1229 case KVM_CAP_COALESCED_MMIO: 1554 case KVM_CAP_COALESCED_MMIO:
@@ -1238,8 +1563,8 @@ int kvm_dev_ioctl_check_extension(long ext)
1238 case KVM_CAP_NR_MEMSLOTS: 1563 case KVM_CAP_NR_MEMSLOTS:
1239 r = KVM_MEMORY_SLOTS; 1564 r = KVM_MEMORY_SLOTS;
1240 break; 1565 break;
1241 case KVM_CAP_PV_MMU: 1566 case KVM_CAP_PV_MMU: /* obsolete */
1242 r = !tdp_enabled; 1567 r = 0;
1243 break; 1568 break;
1244 case KVM_CAP_IOMMU: 1569 case KVM_CAP_IOMMU:
1245 r = iommu_found(); 1570 r = iommu_found();
@@ -1326,13 +1651,19 @@ out:
1326void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 1651void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1327{ 1652{
1328 kvm_x86_ops->vcpu_load(vcpu, cpu); 1653 kvm_x86_ops->vcpu_load(vcpu, cpu);
1654 if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) {
1655 unsigned long khz = cpufreq_quick_get(cpu);
1656 if (!khz)
1657 khz = tsc_khz;
1658 per_cpu(cpu_tsc_khz, cpu) = khz;
1659 }
1329 kvm_request_guest_time_update(vcpu); 1660 kvm_request_guest_time_update(vcpu);
1330} 1661}
1331 1662
1332void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) 1663void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
1333{ 1664{
1334 kvm_x86_ops->vcpu_put(vcpu);
1335 kvm_put_guest_fpu(vcpu); 1665 kvm_put_guest_fpu(vcpu);
1666 kvm_x86_ops->vcpu_put(vcpu);
1336} 1667}
1337 1668
1338static int is_efer_nx(void) 1669static int is_efer_nx(void)
@@ -1381,6 +1712,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
1381 if (copy_from_user(cpuid_entries, entries, 1712 if (copy_from_user(cpuid_entries, entries,
1382 cpuid->nent * sizeof(struct kvm_cpuid_entry))) 1713 cpuid->nent * sizeof(struct kvm_cpuid_entry)))
1383 goto out_free; 1714 goto out_free;
1715 vcpu_load(vcpu);
1384 for (i = 0; i < cpuid->nent; i++) { 1716 for (i = 0; i < cpuid->nent; i++) {
1385 vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function; 1717 vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
1386 vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax; 1718 vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
@@ -1397,6 +1729,8 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
1397 cpuid_fix_nx_cap(vcpu); 1729 cpuid_fix_nx_cap(vcpu);
1398 r = 0; 1730 r = 0;
1399 kvm_apic_set_version(vcpu); 1731 kvm_apic_set_version(vcpu);
1732 kvm_x86_ops->cpuid_update(vcpu);
1733 vcpu_put(vcpu);
1400 1734
1401out_free: 1735out_free:
1402 vfree(cpuid_entries); 1736 vfree(cpuid_entries);
@@ -1417,8 +1751,11 @@ static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
1417 if (copy_from_user(&vcpu->arch.cpuid_entries, entries, 1751 if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
1418 cpuid->nent * sizeof(struct kvm_cpuid_entry2))) 1752 cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
1419 goto out; 1753 goto out;
1754 vcpu_load(vcpu);
1420 vcpu->arch.cpuid_nent = cpuid->nent; 1755 vcpu->arch.cpuid_nent = cpuid->nent;
1421 kvm_apic_set_version(vcpu); 1756 kvm_apic_set_version(vcpu);
1757 kvm_x86_ops->cpuid_update(vcpu);
1758 vcpu_put(vcpu);
1422 return 0; 1759 return 0;
1423 1760
1424out: 1761out:
@@ -1461,12 +1798,15 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1461 u32 index, int *nent, int maxnent) 1798 u32 index, int *nent, int maxnent)
1462{ 1799{
1463 unsigned f_nx = is_efer_nx() ? F(NX) : 0; 1800 unsigned f_nx = is_efer_nx() ? F(NX) : 0;
1464 unsigned f_gbpages = kvm_x86_ops->gb_page_enable() ? F(GBPAGES) : 0;
1465#ifdef CONFIG_X86_64 1801#ifdef CONFIG_X86_64
1802 unsigned f_gbpages = (kvm_x86_ops->get_lpage_level() == PT_PDPE_LEVEL)
1803 ? F(GBPAGES) : 0;
1466 unsigned f_lm = F(LM); 1804 unsigned f_lm = F(LM);
1467#else 1805#else
1806 unsigned f_gbpages = 0;
1468 unsigned f_lm = 0; 1807 unsigned f_lm = 0;
1469#endif 1808#endif
1809 unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0;
1470 1810
1471 /* cpuid 1.edx */ 1811 /* cpuid 1.edx */
1472 const u32 kvm_supported_word0_x86_features = 1812 const u32 kvm_supported_word0_x86_features =
@@ -1486,7 +1826,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1486 F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | 1826 F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
1487 F(PAT) | F(PSE36) | 0 /* Reserved */ | 1827 F(PAT) | F(PSE36) | 0 /* Reserved */ |
1488 f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) | 1828 f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) |
1489 F(FXSR) | F(FXSR_OPT) | f_gbpages | 0 /* RDTSCP */ | 1829 F(FXSR) | F(FXSR_OPT) | f_gbpages | f_rdtscp |
1490 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); 1830 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
1491 /* cpuid 1.ecx */ 1831 /* cpuid 1.ecx */
1492 const u32 kvm_supported_word4_x86_features = 1832 const u32 kvm_supported_word4_x86_features =
@@ -1733,7 +2073,7 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
1733 return 0; 2073 return 0;
1734 if (mce->status & MCI_STATUS_UC) { 2074 if (mce->status & MCI_STATUS_UC) {
1735 if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) || 2075 if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
1736 !(vcpu->arch.cr4 & X86_CR4_MCE)) { 2076 !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) {
1737 printk(KERN_DEBUG "kvm: set_mce: " 2077 printk(KERN_DEBUG "kvm: set_mce: "
1738 "injects mce exception while " 2078 "injects mce exception while "
1739 "previous one is in progress!\n"); 2079 "previous one is in progress!\n");
@@ -1759,6 +2099,65 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
1759 return 0; 2099 return 0;
1760} 2100}
1761 2101
2102static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
2103 struct kvm_vcpu_events *events)
2104{
2105 vcpu_load(vcpu);
2106
2107 events->exception.injected = vcpu->arch.exception.pending;
2108 events->exception.nr = vcpu->arch.exception.nr;
2109 events->exception.has_error_code = vcpu->arch.exception.has_error_code;
2110 events->exception.error_code = vcpu->arch.exception.error_code;
2111
2112 events->interrupt.injected = vcpu->arch.interrupt.pending;
2113 events->interrupt.nr = vcpu->arch.interrupt.nr;
2114 events->interrupt.soft = vcpu->arch.interrupt.soft;
2115
2116 events->nmi.injected = vcpu->arch.nmi_injected;
2117 events->nmi.pending = vcpu->arch.nmi_pending;
2118 events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu);
2119
2120 events->sipi_vector = vcpu->arch.sipi_vector;
2121
2122 events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
2123 | KVM_VCPUEVENT_VALID_SIPI_VECTOR);
2124
2125 vcpu_put(vcpu);
2126}
2127
2128static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
2129 struct kvm_vcpu_events *events)
2130{
2131 if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
2132 | KVM_VCPUEVENT_VALID_SIPI_VECTOR))
2133 return -EINVAL;
2134
2135 vcpu_load(vcpu);
2136
2137 vcpu->arch.exception.pending = events->exception.injected;
2138 vcpu->arch.exception.nr = events->exception.nr;
2139 vcpu->arch.exception.has_error_code = events->exception.has_error_code;
2140 vcpu->arch.exception.error_code = events->exception.error_code;
2141
2142 vcpu->arch.interrupt.pending = events->interrupt.injected;
2143 vcpu->arch.interrupt.nr = events->interrupt.nr;
2144 vcpu->arch.interrupt.soft = events->interrupt.soft;
2145 if (vcpu->arch.interrupt.pending && irqchip_in_kernel(vcpu->kvm))
2146 kvm_pic_clear_isr_ack(vcpu->kvm);
2147
2148 vcpu->arch.nmi_injected = events->nmi.injected;
2149 if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING)
2150 vcpu->arch.nmi_pending = events->nmi.pending;
2151 kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked);
2152
2153 if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR)
2154 vcpu->arch.sipi_vector = events->sipi_vector;
2155
2156 vcpu_put(vcpu);
2157
2158 return 0;
2159}
2160
1762long kvm_arch_vcpu_ioctl(struct file *filp, 2161long kvm_arch_vcpu_ioctl(struct file *filp,
1763 unsigned int ioctl, unsigned long arg) 2162 unsigned int ioctl, unsigned long arg)
1764{ 2163{
@@ -1769,6 +2168,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
1769 2168
1770 switch (ioctl) { 2169 switch (ioctl) {
1771 case KVM_GET_LAPIC: { 2170 case KVM_GET_LAPIC: {
2171 r = -EINVAL;
2172 if (!vcpu->arch.apic)
2173 goto out;
1772 lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); 2174 lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
1773 2175
1774 r = -ENOMEM; 2176 r = -ENOMEM;
@@ -1784,6 +2186,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
1784 break; 2186 break;
1785 } 2187 }
1786 case KVM_SET_LAPIC: { 2188 case KVM_SET_LAPIC: {
2189 r = -EINVAL;
2190 if (!vcpu->arch.apic)
2191 goto out;
1787 lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); 2192 lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
1788 r = -ENOMEM; 2193 r = -ENOMEM;
1789 if (!lapic) 2194 if (!lapic)
@@ -1910,6 +2315,27 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
1910 r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce); 2315 r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
1911 break; 2316 break;
1912 } 2317 }
2318 case KVM_GET_VCPU_EVENTS: {
2319 struct kvm_vcpu_events events;
2320
2321 kvm_vcpu_ioctl_x86_get_vcpu_events(vcpu, &events);
2322
2323 r = -EFAULT;
2324 if (copy_to_user(argp, &events, sizeof(struct kvm_vcpu_events)))
2325 break;
2326 r = 0;
2327 break;
2328 }
2329 case KVM_SET_VCPU_EVENTS: {
2330 struct kvm_vcpu_events events;
2331
2332 r = -EFAULT;
2333 if (copy_from_user(&events, argp, sizeof(struct kvm_vcpu_events)))
2334 break;
2335
2336 r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events);
2337 break;
2338 }
1913 default: 2339 default:
1914 r = -EINVAL; 2340 r = -EINVAL;
1915 } 2341 }
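KVM_GET_VCPU_EVENTS and KVM_SET_VCPU_EVENTS, wired up above, let userspace carry pending exception, interrupt and NMI state across save/restore. A hypothetical userspace sketch, error handling trimmed; the two vcpu file descriptors are assumed to come from KVM_CREATE_VCPU on the source and destination VMs:

#include <linux/kvm.h>
#include <string.h>
#include <sys/ioctl.h>

static int migrate_vcpu_events(int src_vcpu_fd, int dst_vcpu_fd)
{
        struct kvm_vcpu_events events;

        memset(&events, 0, sizeof(events));
        if (ioctl(src_vcpu_fd, KVM_GET_VCPU_EVENTS, &events) < 0)
                return -1;

        /* keep only the two flags the setter accepts; others yield -EINVAL */
        events.flags &= KVM_VCPUEVENT_VALID_NMI_PENDING |
                        KVM_VCPUEVENT_VALID_SIPI_VECTOR;
        return ioctl(dst_vcpu_fd, KVM_SET_VCPU_EVENTS, &events);
}
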
@@ -1941,14 +2367,14 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
1941 if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES) 2367 if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
1942 return -EINVAL; 2368 return -EINVAL;
1943 2369
1944 down_write(&kvm->slots_lock); 2370 mutex_lock(&kvm->slots_lock);
1945 spin_lock(&kvm->mmu_lock); 2371 spin_lock(&kvm->mmu_lock);
1946 2372
1947 kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages); 2373 kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
1948 kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages; 2374 kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
1949 2375
1950 spin_unlock(&kvm->mmu_lock); 2376 spin_unlock(&kvm->mmu_lock);
1951 up_write(&kvm->slots_lock); 2377 mutex_unlock(&kvm->slots_lock);
1952 return 0; 2378 return 0;
1953} 2379}
1954 2380
@@ -1957,13 +2383,35 @@ static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
1957 return kvm->arch.n_alloc_mmu_pages; 2383 return kvm->arch.n_alloc_mmu_pages;
1958} 2384}
1959 2385
2386gfn_t unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn)
2387{
2388 int i;
2389 struct kvm_mem_alias *alias;
2390 struct kvm_mem_aliases *aliases;
2391
2392 aliases = rcu_dereference(kvm->arch.aliases);
2393
2394 for (i = 0; i < aliases->naliases; ++i) {
2395 alias = &aliases->aliases[i];
2396 if (alias->flags & KVM_ALIAS_INVALID)
2397 continue;
2398 if (gfn >= alias->base_gfn
2399 && gfn < alias->base_gfn + alias->npages)
2400 return alias->target_gfn + gfn - alias->base_gfn;
2401 }
2402 return gfn;
2403}
2404
1960gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) 2405gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
1961{ 2406{
1962 int i; 2407 int i;
1963 struct kvm_mem_alias *alias; 2408 struct kvm_mem_alias *alias;
2409 struct kvm_mem_aliases *aliases;
1964 2410
1965 for (i = 0; i < kvm->arch.naliases; ++i) { 2411 aliases = rcu_dereference(kvm->arch.aliases);
1966 alias = &kvm->arch.aliases[i]; 2412
2413 for (i = 0; i < aliases->naliases; ++i) {
2414 alias = &aliases->aliases[i];
1967 if (gfn >= alias->base_gfn 2415 if (gfn >= alias->base_gfn
1968 && gfn < alias->base_gfn + alias->npages) 2416 && gfn < alias->base_gfn + alias->npages)
1969 return alias->target_gfn + gfn - alias->base_gfn; 2417 return alias->target_gfn + gfn - alias->base_gfn;
@@ -1981,6 +2429,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
1981{ 2429{
1982 int r, n; 2430 int r, n;
1983 struct kvm_mem_alias *p; 2431 struct kvm_mem_alias *p;
2432 struct kvm_mem_aliases *aliases, *old_aliases;
1984 2433
1985 r = -EINVAL; 2434 r = -EINVAL;
1986 /* General sanity checks */ 2435 /* General sanity checks */
@@ -1997,26 +2446,48 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
1997 < alias->target_phys_addr) 2446 < alias->target_phys_addr)
1998 goto out; 2447 goto out;
1999 2448
2000 down_write(&kvm->slots_lock); 2449 r = -ENOMEM;
2001 spin_lock(&kvm->mmu_lock); 2450 aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL);
2451 if (!aliases)
2452 goto out;
2002 2453
2003 p = &kvm->arch.aliases[alias->slot]; 2454 mutex_lock(&kvm->slots_lock);
2455
2456 /* invalidate any gfn reference in case of deletion/shrinking */
2457 memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases));
2458 aliases->aliases[alias->slot].flags |= KVM_ALIAS_INVALID;
2459 old_aliases = kvm->arch.aliases;
2460 rcu_assign_pointer(kvm->arch.aliases, aliases);
2461 synchronize_srcu_expedited(&kvm->srcu);
2462 kvm_mmu_zap_all(kvm);
2463 kfree(old_aliases);
2464
2465 r = -ENOMEM;
2466 aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL);
2467 if (!aliases)
2468 goto out_unlock;
2469
2470 memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases));
2471
2472 p = &aliases->aliases[alias->slot];
2004 p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT; 2473 p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
2005 p->npages = alias->memory_size >> PAGE_SHIFT; 2474 p->npages = alias->memory_size >> PAGE_SHIFT;
2006 p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT; 2475 p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
2476 p->flags &= ~(KVM_ALIAS_INVALID);
2007 2477
2008 for (n = KVM_ALIAS_SLOTS; n > 0; --n) 2478 for (n = KVM_ALIAS_SLOTS; n > 0; --n)
2009 if (kvm->arch.aliases[n - 1].npages) 2479 if (aliases->aliases[n - 1].npages)
2010 break; 2480 break;
2011 kvm->arch.naliases = n; 2481 aliases->naliases = n;
2012
2013 spin_unlock(&kvm->mmu_lock);
2014 kvm_mmu_zap_all(kvm);
2015
2016 up_write(&kvm->slots_lock);
2017 2482
2018 return 0; 2483 old_aliases = kvm->arch.aliases;
2484 rcu_assign_pointer(kvm->arch.aliases, aliases);
2485 synchronize_srcu_expedited(&kvm->srcu);
2486 kfree(old_aliases);
2487 r = 0;
2019 2488
2489out_unlock:
2490 mutex_unlock(&kvm->slots_lock);
2020out: 2491out:
2021 return r; 2492 return r;
2022} 2493}
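The rewritten alias ioctl above never edits the live table: it builds a private copy, publishes it with rcu_assign_pointer(), waits out SRCU readers with synchronize_srcu_expedited(), and only then frees the old copy (twice over, so stale gfns are invalidated before the MMU is zapped). A minimal sketch of that update-side pattern with an assumed config structure; the caller is assumed to hold whatever lock serializes updaters, and shared_cfg is assumed to have been set up beforehand:

#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/srcu.h>

struct example_cfg {
        int naliases;
};

static struct example_cfg *shared_cfg;          /* readers use rcu_dereference() */

static int publish_new_cfg(struct srcu_struct *sp, int naliases)
{
        struct example_cfg *new, *old;

        new = kzalloc(sizeof(*new), GFP_KERNEL);
        if (!new)
                return -ENOMEM;

        old = shared_cfg;
        *new = *old;                            /* start from the visible state */
        new->naliases = naliases;               /* edit the private copy */

        rcu_assign_pointer(shared_cfg, new);    /* publish */
        synchronize_srcu_expedited(sp);         /* wait for readers of 'old' */
        kfree(old);                             /* now safe to free */
        return 0;
}
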
@@ -2038,9 +2509,7 @@ static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
2038 sizeof(struct kvm_pic_state)); 2509 sizeof(struct kvm_pic_state));
2039 break; 2510 break;
2040 case KVM_IRQCHIP_IOAPIC: 2511 case KVM_IRQCHIP_IOAPIC:
2041 memcpy(&chip->chip.ioapic, 2512 r = kvm_get_ioapic(kvm, &chip->chip.ioapic);
2042 ioapic_irqchip(kvm),
2043 sizeof(struct kvm_ioapic_state));
2044 break; 2513 break;
2045 default: 2514 default:
2046 r = -EINVAL; 2515 r = -EINVAL;
@@ -2056,25 +2525,21 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
2056 r = 0; 2525 r = 0;
2057 switch (chip->chip_id) { 2526 switch (chip->chip_id) {
2058 case KVM_IRQCHIP_PIC_MASTER: 2527 case KVM_IRQCHIP_PIC_MASTER:
2059 spin_lock(&pic_irqchip(kvm)->lock); 2528 raw_spin_lock(&pic_irqchip(kvm)->lock);
2060 memcpy(&pic_irqchip(kvm)->pics[0], 2529 memcpy(&pic_irqchip(kvm)->pics[0],
2061 &chip->chip.pic, 2530 &chip->chip.pic,
2062 sizeof(struct kvm_pic_state)); 2531 sizeof(struct kvm_pic_state));
2063 spin_unlock(&pic_irqchip(kvm)->lock); 2532 raw_spin_unlock(&pic_irqchip(kvm)->lock);
2064 break; 2533 break;
2065 case KVM_IRQCHIP_PIC_SLAVE: 2534 case KVM_IRQCHIP_PIC_SLAVE:
2066 spin_lock(&pic_irqchip(kvm)->lock); 2535 raw_spin_lock(&pic_irqchip(kvm)->lock);
2067 memcpy(&pic_irqchip(kvm)->pics[1], 2536 memcpy(&pic_irqchip(kvm)->pics[1],
2068 &chip->chip.pic, 2537 &chip->chip.pic,
2069 sizeof(struct kvm_pic_state)); 2538 sizeof(struct kvm_pic_state));
2070 spin_unlock(&pic_irqchip(kvm)->lock); 2539 raw_spin_unlock(&pic_irqchip(kvm)->lock);
2071 break; 2540 break;
2072 case KVM_IRQCHIP_IOAPIC: 2541 case KVM_IRQCHIP_IOAPIC:
2073 mutex_lock(&kvm->irq_lock); 2542 r = kvm_set_ioapic(kvm, &chip->chip.ioapic);
2074 memcpy(ioapic_irqchip(kvm),
2075 &chip->chip.ioapic,
2076 sizeof(struct kvm_ioapic_state));
2077 mutex_unlock(&kvm->irq_lock);
2078 break; 2543 break;
2079 default: 2544 default:
2080 r = -EINVAL; 2545 r = -EINVAL;
@@ -2151,29 +2616,63 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm,
2151int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, 2616int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
2152 struct kvm_dirty_log *log) 2617 struct kvm_dirty_log *log)
2153{ 2618{
2154 int r; 2619 int r, i;
2155 int n;
2156 struct kvm_memory_slot *memslot; 2620 struct kvm_memory_slot *memslot;
2157 int is_dirty = 0; 2621 unsigned long n;
2622 unsigned long is_dirty = 0;
2623 unsigned long *dirty_bitmap = NULL;
2158 2624
2159 down_write(&kvm->slots_lock); 2625 mutex_lock(&kvm->slots_lock);
2160 2626
2161 r = kvm_get_dirty_log(kvm, log, &is_dirty); 2627 r = -EINVAL;
2162 if (r) 2628 if (log->slot >= KVM_MEMORY_SLOTS)
2629 goto out;
2630
2631 memslot = &kvm->memslots->memslots[log->slot];
2632 r = -ENOENT;
2633 if (!memslot->dirty_bitmap)
2634 goto out;
2635
2636 n = kvm_dirty_bitmap_bytes(memslot);
2637
2638 r = -ENOMEM;
2639 dirty_bitmap = vmalloc(n);
2640 if (!dirty_bitmap)
2163 goto out; 2641 goto out;
2642 memset(dirty_bitmap, 0, n);
2643
2644 for (i = 0; !is_dirty && i < n/sizeof(long); i++)
2645 is_dirty = memslot->dirty_bitmap[i];
2164 2646
2165 /* If nothing is dirty, don't bother messing with page tables. */ 2647 /* If nothing is dirty, don't bother messing with page tables. */
2166 if (is_dirty) { 2648 if (is_dirty) {
2649 struct kvm_memslots *slots, *old_slots;
2650
2167 spin_lock(&kvm->mmu_lock); 2651 spin_lock(&kvm->mmu_lock);
2168 kvm_mmu_slot_remove_write_access(kvm, log->slot); 2652 kvm_mmu_slot_remove_write_access(kvm, log->slot);
2169 spin_unlock(&kvm->mmu_lock); 2653 spin_unlock(&kvm->mmu_lock);
2170 memslot = &kvm->memslots[log->slot]; 2654
2171 n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; 2655 slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
2172 memset(memslot->dirty_bitmap, 0, n); 2656 if (!slots)
2657 goto out_free;
2658
2659 memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
2660 slots->memslots[log->slot].dirty_bitmap = dirty_bitmap;
2661
2662 old_slots = kvm->memslots;
2663 rcu_assign_pointer(kvm->memslots, slots);
2664 synchronize_srcu_expedited(&kvm->srcu);
2665 dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap;
2666 kfree(old_slots);
2173 } 2667 }
2668
2174 r = 0; 2669 r = 0;
2670 if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n))
2671 r = -EFAULT;
2672out_free:
2673 vfree(dirty_bitmap);
2175out: 2674out:
2176 up_write(&kvm->slots_lock); 2675 mutex_unlock(&kvm->slots_lock);
2177 return r; 2676 return r;
2178} 2677}
2179 2678
@@ -2182,7 +2681,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
2182{ 2681{
2183 struct kvm *kvm = filp->private_data; 2682 struct kvm *kvm = filp->private_data;
2184 void __user *argp = (void __user *)arg; 2683 void __user *argp = (void __user *)arg;
2185 int r = -EINVAL; 2684 int r = -ENOTTY;
2186 /* 2685 /*
2187 * This union makes it completely explicit to gcc-3.x 2686 * This union makes it completely explicit to gcc-3.x
2188 * that these two variables' stack usage should be 2687 * that these two variables' stack usage should be
@@ -2244,25 +2743,39 @@ long kvm_arch_vm_ioctl(struct file *filp,
2244 if (r) 2743 if (r)
2245 goto out; 2744 goto out;
2246 break; 2745 break;
2247 case KVM_CREATE_IRQCHIP: 2746 case KVM_CREATE_IRQCHIP: {
2747 struct kvm_pic *vpic;
2748
2749 mutex_lock(&kvm->lock);
2750 r = -EEXIST;
2751 if (kvm->arch.vpic)
2752 goto create_irqchip_unlock;
2248 r = -ENOMEM; 2753 r = -ENOMEM;
2249 kvm->arch.vpic = kvm_create_pic(kvm); 2754 vpic = kvm_create_pic(kvm);
2250 if (kvm->arch.vpic) { 2755 if (vpic) {
2251 r = kvm_ioapic_init(kvm); 2756 r = kvm_ioapic_init(kvm);
2252 if (r) { 2757 if (r) {
2253 kfree(kvm->arch.vpic); 2758 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
2254 kvm->arch.vpic = NULL; 2759 &vpic->dev);
2255 goto out; 2760 kfree(vpic);
2761 goto create_irqchip_unlock;
2256 } 2762 }
2257 } else 2763 } else
2258 goto out; 2764 goto create_irqchip_unlock;
2765 smp_wmb();
2766 kvm->arch.vpic = vpic;
2767 smp_wmb();
2259 r = kvm_setup_default_irq_routing(kvm); 2768 r = kvm_setup_default_irq_routing(kvm);
2260 if (r) { 2769 if (r) {
2261 kfree(kvm->arch.vpic); 2770 mutex_lock(&kvm->irq_lock);
2262 kfree(kvm->arch.vioapic); 2771 kvm_ioapic_destroy(kvm);
2263 goto out; 2772 kvm_destroy_pic(kvm);
2773 mutex_unlock(&kvm->irq_lock);
2264 } 2774 }
2775 create_irqchip_unlock:
2776 mutex_unlock(&kvm->lock);
2265 break; 2777 break;
2778 }
2266 case KVM_CREATE_PIT: 2779 case KVM_CREATE_PIT:
2267 u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY; 2780 u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY;
2268 goto create_pit; 2781 goto create_pit;
@@ -2272,7 +2785,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
2272 sizeof(struct kvm_pit_config))) 2785 sizeof(struct kvm_pit_config)))
2273 goto out; 2786 goto out;
2274 create_pit: 2787 create_pit:
2275 down_write(&kvm->slots_lock); 2788 mutex_lock(&kvm->slots_lock);
2276 r = -EEXIST; 2789 r = -EEXIST;
2277 if (kvm->arch.vpit) 2790 if (kvm->arch.vpit)
2278 goto create_pit_unlock; 2791 goto create_pit_unlock;
@@ -2281,7 +2794,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
2281 if (kvm->arch.vpit) 2794 if (kvm->arch.vpit)
2282 r = 0; 2795 r = 0;
2283 create_pit_unlock: 2796 create_pit_unlock:
2284 up_write(&kvm->slots_lock); 2797 mutex_unlock(&kvm->slots_lock);
2285 break; 2798 break;
2286 case KVM_IRQ_LINE_STATUS: 2799 case KVM_IRQ_LINE_STATUS:
2287 case KVM_IRQ_LINE: { 2800 case KVM_IRQ_LINE: {
@@ -2292,10 +2805,8 @@ long kvm_arch_vm_ioctl(struct file *filp,
2292 goto out; 2805 goto out;
2293 if (irqchip_in_kernel(kvm)) { 2806 if (irqchip_in_kernel(kvm)) {
2294 __s32 status; 2807 __s32 status;
2295 mutex_lock(&kvm->irq_lock);
2296 status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 2808 status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
2297 irq_event.irq, irq_event.level); 2809 irq_event.irq, irq_event.level);
2298 mutex_unlock(&kvm->irq_lock);
2299 if (ioctl == KVM_IRQ_LINE_STATUS) { 2810 if (ioctl == KVM_IRQ_LINE_STATUS) {
2300 irq_event.status = status; 2811 irq_event.status = status;
2301 if (copy_to_user(argp, &irq_event, 2812 if (copy_to_user(argp, &irq_event,
@@ -2421,6 +2932,55 @@ long kvm_arch_vm_ioctl(struct file *filp,
2421 r = 0; 2932 r = 0;
2422 break; 2933 break;
2423 } 2934 }
2935 case KVM_XEN_HVM_CONFIG: {
2936 r = -EFAULT;
2937 if (copy_from_user(&kvm->arch.xen_hvm_config, argp,
2938 sizeof(struct kvm_xen_hvm_config)))
2939 goto out;
2940 r = -EINVAL;
2941 if (kvm->arch.xen_hvm_config.flags)
2942 goto out;
2943 r = 0;
2944 break;
2945 }
2946 case KVM_SET_CLOCK: {
2947 struct timespec now;
2948 struct kvm_clock_data user_ns;
2949 u64 now_ns;
2950 s64 delta;
2951
2952 r = -EFAULT;
2953 if (copy_from_user(&user_ns, argp, sizeof(user_ns)))
2954 goto out;
2955
2956 r = -EINVAL;
2957 if (user_ns.flags)
2958 goto out;
2959
2960 r = 0;
2961 ktime_get_ts(&now);
2962 now_ns = timespec_to_ns(&now);
2963 delta = user_ns.clock - now_ns;
2964 kvm->arch.kvmclock_offset = delta;
2965 break;
2966 }
2967 case KVM_GET_CLOCK: {
2968 struct timespec now;
2969 struct kvm_clock_data user_ns;
2970 u64 now_ns;
2971
2972 ktime_get_ts(&now);
2973 now_ns = timespec_to_ns(&now);
2974 user_ns.clock = kvm->arch.kvmclock_offset + now_ns;
2975 user_ns.flags = 0;
2976
2977 r = -EFAULT;
2978 if (copy_to_user(argp, &user_ns, sizeof(user_ns)))
2979 goto out;
2980 r = 0;
2981 break;
2982 }
2983
2424 default: 2984 default:
2425 ; 2985 ;
2426 } 2986 }
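KVM_GET_CLOCK and KVM_SET_CLOCK, added above, expose the per-VM kvmclock offset so a VMM can keep guest time consistent across save/restore or migration. A hypothetical userspace sketch; the vm_fd descriptor is assumed to come from KVM_CREATE_VM:

#include <linux/kvm.h>
#include <sys/ioctl.h>

static int save_guest_clock(int vm_fd, struct kvm_clock_data *data)
{
        return ioctl(vm_fd, KVM_GET_CLOCK, data);
}

static int restore_guest_clock(int vm_fd, struct kvm_clock_data *data)
{
        data->flags = 0;        /* non-zero flags are rejected with -EINVAL */
        return ioctl(vm_fd, KVM_SET_CLOCK, data);
}
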
@@ -2433,7 +2993,8 @@ static void kvm_init_msr_list(void)
2433 u32 dummy[2]; 2993 u32 dummy[2];
2434 unsigned i, j; 2994 unsigned i, j;
2435 2995
2436 for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) { 2996 /* skip the first msrs in the list. KVM-specific */
2997 for (i = j = KVM_SAVE_MSRS_BEGIN; i < ARRAY_SIZE(msrs_to_save); i++) {
2437 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) 2998 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
2438 continue; 2999 continue;
2439 if (j < i) 3000 if (j < i)
@@ -2450,7 +3011,7 @@ static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
2450 !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v)) 3011 !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v))
2451 return 0; 3012 return 0;
2452 3013
2453 return kvm_io_bus_write(&vcpu->kvm->mmio_bus, addr, len, v); 3014 return kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, len, v);
2454} 3015}
2455 3016
2456static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) 3017static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
@@ -2459,17 +3020,44 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
2459 !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, len, v)) 3020 !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, len, v))
2460 return 0; 3021 return 0;
2461 3022
2462 return kvm_io_bus_read(&vcpu->kvm->mmio_bus, addr, len, v); 3023 return kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, len, v);
2463} 3024}
2464 3025
2465static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, 3026gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
2466 struct kvm_vcpu *vcpu) 3027{
3028 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3029 return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error);
3030}
3031
3032 gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
3033{
3034 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3035 access |= PFERR_FETCH_MASK;
3036 return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error);
3037}
3038
3039gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
3040{
3041 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3042 access |= PFERR_WRITE_MASK;
3043 return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error);
3044}
3045
3046/* uses this to access any guest's mapped memory without checking CPL */
3047gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
3048{
3049 return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, 0, error);
3050}
3051
3052static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
3053 struct kvm_vcpu *vcpu, u32 access,
3054 u32 *error)
2467{ 3055{
2468 void *data = val; 3056 void *data = val;
2469 int r = X86EMUL_CONTINUE; 3057 int r = X86EMUL_CONTINUE;
2470 3058
2471 while (bytes) { 3059 while (bytes) {
2472 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 3060 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr, access, error);
2473 unsigned offset = addr & (PAGE_SIZE-1); 3061 unsigned offset = addr & (PAGE_SIZE-1);
2474 unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset); 3062 unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
2475 int ret; 3063 int ret;
@@ -2492,14 +3080,37 @@ out:
2492 return r; 3080 return r;
2493} 3081}
2494 3082
3083/* used for instruction fetching */
3084static int kvm_fetch_guest_virt(gva_t addr, void *val, unsigned int bytes,
3085 struct kvm_vcpu *vcpu, u32 *error)
3086{
3087 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3088 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu,
3089 access | PFERR_FETCH_MASK, error);
3090}
3091
3092static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
3093 struct kvm_vcpu *vcpu, u32 *error)
3094{
3095 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3096 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
3097 error);
3098}
3099
3100static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes,
3101 struct kvm_vcpu *vcpu, u32 *error)
3102{
3103 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, error);
3104}
3105
2495static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes, 3106static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes,
2496 struct kvm_vcpu *vcpu) 3107 struct kvm_vcpu *vcpu, u32 *error)
2497{ 3108{
2498 void *data = val; 3109 void *data = val;
2499 int r = X86EMUL_CONTINUE; 3110 int r = X86EMUL_CONTINUE;
2500 3111
2501 while (bytes) { 3112 while (bytes) {
2502 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 3113 gpa_t gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error);
2503 unsigned offset = addr & (PAGE_SIZE-1); 3114 unsigned offset = addr & (PAGE_SIZE-1);
2504 unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); 3115 unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
2505 int ret; 3116 int ret;
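Every translation path above composes its access mask the same way: PFERR_USER_MASK when the guest is at CPL 3, plus PFERR_FETCH_MASK for instruction fetches or PFERR_WRITE_MASK for writes, and no bits at all for the privileged "system" reads used by the emulator's read_std. An illustrative helper that mirrors that composition; the EX_PFERR_* values copy the standard x86 page-fault error-code bits rather than pulling in KVM's private mmu header:

#include <linux/types.h>

#define EX_PFERR_WRITE_MASK  (1U << 1)  /* mirrors PFERR_WRITE_MASK */
#define EX_PFERR_USER_MASK   (1U << 2)  /* mirrors PFERR_USER_MASK  */
#define EX_PFERR_FETCH_MASK  (1U << 4)  /* mirrors PFERR_FETCH_MASK */

static u32 example_access_mask(int cpl, bool write, bool fetch)
{
        u32 access = (cpl == 3) ? EX_PFERR_USER_MASK : 0;

        if (write)
                access |= EX_PFERR_WRITE_MASK;
        if (fetch)
                access |= EX_PFERR_FETCH_MASK;
        return access;
}
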
@@ -2529,6 +3140,7 @@ static int emulator_read_emulated(unsigned long addr,
2529 struct kvm_vcpu *vcpu) 3140 struct kvm_vcpu *vcpu)
2530{ 3141{
2531 gpa_t gpa; 3142 gpa_t gpa;
3143 u32 error_code;
2532 3144
2533 if (vcpu->mmio_read_completed) { 3145 if (vcpu->mmio_read_completed) {
2534 memcpy(val, vcpu->mmio_data, bytes); 3146 memcpy(val, vcpu->mmio_data, bytes);
@@ -2538,17 +3150,20 @@ static int emulator_read_emulated(unsigned long addr,
2538 return X86EMUL_CONTINUE; 3150 return X86EMUL_CONTINUE;
2539 } 3151 }
2540 3152
2541 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 3153 gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, &error_code);
3154
3155 if (gpa == UNMAPPED_GVA) {
3156 kvm_inject_page_fault(vcpu, addr, error_code);
3157 return X86EMUL_PROPAGATE_FAULT;
3158 }
2542 3159
2543 /* For APIC access vmexit */ 3160 /* For APIC access vmexit */
2544 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 3161 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
2545 goto mmio; 3162 goto mmio;
2546 3163
2547 if (kvm_read_guest_virt(addr, val, bytes, vcpu) 3164 if (kvm_read_guest_virt(addr, val, bytes, vcpu, NULL)
2548 == X86EMUL_CONTINUE) 3165 == X86EMUL_CONTINUE)
2549 return X86EMUL_CONTINUE; 3166 return X86EMUL_CONTINUE;
2550 if (gpa == UNMAPPED_GVA)
2551 return X86EMUL_PROPAGATE_FAULT;
2552 3167
2553mmio: 3168mmio:
2554 /* 3169 /*
@@ -2587,11 +3202,12 @@ static int emulator_write_emulated_onepage(unsigned long addr,
2587 struct kvm_vcpu *vcpu) 3202 struct kvm_vcpu *vcpu)
2588{ 3203{
2589 gpa_t gpa; 3204 gpa_t gpa;
3205 u32 error_code;
2590 3206
2591 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 3207 gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, &error_code);
2592 3208
2593 if (gpa == UNMAPPED_GVA) { 3209 if (gpa == UNMAPPED_GVA) {
2594 kvm_inject_page_fault(vcpu, addr, 2); 3210 kvm_inject_page_fault(vcpu, addr, error_code);
2595 return X86EMUL_PROPAGATE_FAULT; 3211 return X86EMUL_PROPAGATE_FAULT;
2596 } 3212 }
2597 3213
@@ -2655,7 +3271,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
2655 char *kaddr; 3271 char *kaddr;
2656 u64 val; 3272 u64 val;
2657 3273
2658 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 3274 gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL);
2659 3275
2660 if (gpa == UNMAPPED_GVA || 3276 if (gpa == UNMAPPED_GVA ||
2661 (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 3277 (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
@@ -2692,35 +3308,21 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
2692 3308
2693int emulate_clts(struct kvm_vcpu *vcpu) 3309int emulate_clts(struct kvm_vcpu *vcpu)
2694{ 3310{
2695 kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS); 3311 kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
3312 kvm_x86_ops->fpu_activate(vcpu);
2696 return X86EMUL_CONTINUE; 3313 return X86EMUL_CONTINUE;
2697} 3314}
2698 3315
2699int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest) 3316int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
2700{ 3317{
2701 struct kvm_vcpu *vcpu = ctxt->vcpu; 3318 return kvm_x86_ops->get_dr(ctxt->vcpu, dr, dest);
2702
2703 switch (dr) {
2704 case 0 ... 3:
2705 *dest = kvm_x86_ops->get_dr(vcpu, dr);
2706 return X86EMUL_CONTINUE;
2707 default:
2708 pr_unimpl(vcpu, "%s: unexpected dr %u\n", __func__, dr);
2709 return X86EMUL_UNHANDLEABLE;
2710 }
2711} 3319}
2712 3320
2713int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) 3321int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
2714{ 3322{
2715 unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U; 3323 unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
2716 int exception;
2717 3324
2718 kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception); 3325 return kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask);
2719 if (exception) {
2720 /* FIXME: better handling */
2721 return X86EMUL_UNHANDLEABLE;
2722 }
2723 return X86EMUL_CONTINUE;
2724} 3326}
2725 3327
2726void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) 3328void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
@@ -2734,7 +3336,7 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
2734 3336
2735 rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS); 3337 rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
2736 3338
2737 kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu); 3339 kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu, NULL);
2738 3340
2739 printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n", 3341 printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
2740 context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]); 3342 context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
@@ -2742,7 +3344,8 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
2742EXPORT_SYMBOL_GPL(kvm_report_emulation_failure); 3344EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
2743 3345
2744static struct x86_emulate_ops emulate_ops = { 3346static struct x86_emulate_ops emulate_ops = {
2745 .read_std = kvm_read_guest_virt, 3347 .read_std = kvm_read_guest_virt_system,
3348 .fetch = kvm_fetch_guest_virt,
2746 .read_emulated = emulator_read_emulated, 3349 .read_emulated = emulator_read_emulated,
2747 .write_emulated = emulator_write_emulated, 3350 .write_emulated = emulator_write_emulated,
2748 .cmpxchg_emulated = emulator_cmpxchg_emulated, 3351 .cmpxchg_emulated = emulator_cmpxchg_emulated,
@@ -2757,13 +3360,13 @@ static void cache_all_regs(struct kvm_vcpu *vcpu)
2757} 3360}
2758 3361
2759int emulate_instruction(struct kvm_vcpu *vcpu, 3362int emulate_instruction(struct kvm_vcpu *vcpu,
2760 struct kvm_run *run,
2761 unsigned long cr2, 3363 unsigned long cr2,
2762 u16 error_code, 3364 u16 error_code,
2763 int emulation_type) 3365 int emulation_type)
2764{ 3366{
2765 int r, shadow_mask; 3367 int r, shadow_mask;
2766 struct decode_cache *c; 3368 struct decode_cache *c;
3369 struct kvm_run *run = vcpu->run;
2767 3370
2768 kvm_clear_exception_queue(vcpu); 3371 kvm_clear_exception_queue(vcpu);
2769 vcpu->arch.mmio_fault_cr2 = cr2; 3372 vcpu->arch.mmio_fault_cr2 = cr2;
@@ -2783,10 +3386,11 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
2783 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 3386 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
2784 3387
2785 vcpu->arch.emulate_ctxt.vcpu = vcpu; 3388 vcpu->arch.emulate_ctxt.vcpu = vcpu;
2786 vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu); 3389 vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu);
2787 vcpu->arch.emulate_ctxt.mode = 3390 vcpu->arch.emulate_ctxt.mode =
3391 (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
2788 (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) 3392 (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
2789 ? X86EMUL_MODE_REAL : cs_l 3393 ? X86EMUL_MODE_VM86 : cs_l
2790 ? X86EMUL_MODE_PROT64 : cs_db 3394 ? X86EMUL_MODE_PROT64 : cs_db
2791 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; 3395 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
2792 3396
@@ -2861,7 +3465,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
2861 return EMULATE_DO_MMIO; 3465 return EMULATE_DO_MMIO;
2862 } 3466 }
2863 3467
2864 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); 3468 kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
2865 3469
2866 if (vcpu->mmio_is_write) { 3470 if (vcpu->mmio_is_write) {
2867 vcpu->mmio_needed = 0; 3471 vcpu->mmio_needed = 0;
@@ -2878,12 +3482,17 @@ static int pio_copy_data(struct kvm_vcpu *vcpu)
2878 gva_t q = vcpu->arch.pio.guest_gva; 3482 gva_t q = vcpu->arch.pio.guest_gva;
2879 unsigned bytes; 3483 unsigned bytes;
2880 int ret; 3484 int ret;
3485 u32 error_code;
2881 3486
2882 bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count; 3487 bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count;
2883 if (vcpu->arch.pio.in) 3488 if (vcpu->arch.pio.in)
2884 ret = kvm_write_guest_virt(q, p, bytes, vcpu); 3489 ret = kvm_write_guest_virt(q, p, bytes, vcpu, &error_code);
2885 else 3490 else
2886 ret = kvm_read_guest_virt(q, p, bytes, vcpu); 3491 ret = kvm_read_guest_virt(q, p, bytes, vcpu, &error_code);
3492
3493 if (ret == X86EMUL_PROPAGATE_FAULT)
3494 kvm_inject_page_fault(vcpu, q, error_code);
3495
2887 return ret; 3496 return ret;
2888} 3497}
2889 3498
@@ -2904,7 +3513,7 @@ int complete_pio(struct kvm_vcpu *vcpu)
2904 if (io->in) { 3513 if (io->in) {
2905 r = pio_copy_data(vcpu); 3514 r = pio_copy_data(vcpu);
2906 if (r) 3515 if (r)
2907 return r; 3516 goto out;
2908 } 3517 }
2909 3518
2910 delta = 1; 3519 delta = 1;
@@ -2931,7 +3540,7 @@ int complete_pio(struct kvm_vcpu *vcpu)
2931 kvm_register_write(vcpu, VCPU_REGS_RSI, val); 3540 kvm_register_write(vcpu, VCPU_REGS_RSI, val);
2932 } 3541 }
2933 } 3542 }
2934 3543out:
2935 io->count -= io->cur_count; 3544 io->count -= io->cur_count;
2936 io->cur_count = 0; 3545 io->cur_count = 0;
2937 3546
@@ -2944,11 +3553,12 @@ static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
2944 int r; 3553 int r;
2945 3554
2946 if (vcpu->arch.pio.in) 3555 if (vcpu->arch.pio.in)
2947 r = kvm_io_bus_read(&vcpu->kvm->pio_bus, vcpu->arch.pio.port, 3556 r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port,
2948 vcpu->arch.pio.size, pd); 3557 vcpu->arch.pio.size, pd);
2949 else 3558 else
2950 r = kvm_io_bus_write(&vcpu->kvm->pio_bus, vcpu->arch.pio.port, 3559 r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
2951 vcpu->arch.pio.size, pd); 3560 vcpu->arch.pio.port, vcpu->arch.pio.size,
3561 pd);
2952 return r; 3562 return r;
2953} 3563}
2954 3564
@@ -2959,7 +3569,7 @@ static int pio_string_write(struct kvm_vcpu *vcpu)
2959 int i, r = 0; 3569 int i, r = 0;
2960 3570
2961 for (i = 0; i < io->cur_count; i++) { 3571 for (i = 0; i < io->cur_count; i++) {
2962 if (kvm_io_bus_write(&vcpu->kvm->pio_bus, 3572 if (kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
2963 io->port, io->size, pd)) { 3573 io->port, io->size, pd)) {
2964 r = -EOPNOTSUPP; 3574 r = -EOPNOTSUPP;
2965 break; 3575 break;
@@ -2969,11 +3579,12 @@ static int pio_string_write(struct kvm_vcpu *vcpu)
2969 return r; 3579 return r;
2970} 3580}
2971 3581
2972int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, 3582int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port)
2973 int size, unsigned port)
2974{ 3583{
2975 unsigned long val; 3584 unsigned long val;
2976 3585
3586 trace_kvm_pio(!in, port, size, 1);
3587
2977 vcpu->run->exit_reason = KVM_EXIT_IO; 3588 vcpu->run->exit_reason = KVM_EXIT_IO;
2978 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; 3589 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
2979 vcpu->run->io.size = vcpu->arch.pio.size = size; 3590 vcpu->run->io.size = vcpu->arch.pio.size = size;
@@ -2985,11 +3596,10 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2985 vcpu->arch.pio.down = 0; 3596 vcpu->arch.pio.down = 0;
2986 vcpu->arch.pio.rep = 0; 3597 vcpu->arch.pio.rep = 0;
2987 3598
2988 trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port, 3599 if (!vcpu->arch.pio.in) {
2989 size, 1); 3600 val = kvm_register_read(vcpu, VCPU_REGS_RAX);
2990 3601 memcpy(vcpu->arch.pio_data, &val, 4);
2991 val = kvm_register_read(vcpu, VCPU_REGS_RAX); 3602 }
2992 memcpy(vcpu->arch.pio_data, &val, 4);
2993 3603
2994 if (!kernel_pio(vcpu, vcpu->arch.pio_data)) { 3604 if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
2995 complete_pio(vcpu); 3605 complete_pio(vcpu);
@@ -2999,13 +3609,15 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2999} 3609}
3000EXPORT_SYMBOL_GPL(kvm_emulate_pio); 3610EXPORT_SYMBOL_GPL(kvm_emulate_pio);
3001 3611
3002int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, 3612int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
3003 int size, unsigned long count, int down, 3613 int size, unsigned long count, int down,
3004 gva_t address, int rep, unsigned port) 3614 gva_t address, int rep, unsigned port)
3005{ 3615{
3006 unsigned now, in_page; 3616 unsigned now, in_page;
3007 int ret = 0; 3617 int ret = 0;
3008 3618
3619 trace_kvm_pio(!in, port, size, count);
3620
3009 vcpu->run->exit_reason = KVM_EXIT_IO; 3621 vcpu->run->exit_reason = KVM_EXIT_IO;
3010 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; 3622 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
3011 vcpu->run->io.size = vcpu->arch.pio.size = size; 3623 vcpu->run->io.size = vcpu->arch.pio.size = size;
@@ -3017,9 +3629,6 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
3017 vcpu->arch.pio.down = down; 3629 vcpu->arch.pio.down = down;
3018 vcpu->arch.pio.rep = rep; 3630 vcpu->arch.pio.rep = rep;
3019 3631
3020 trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port,
3021 size, count);
3022
3023 if (!count) { 3632 if (!count) {
3024 kvm_x86_ops->skip_emulated_instruction(vcpu); 3633 kvm_x86_ops->skip_emulated_instruction(vcpu);
3025 return 1; 3634 return 1;
@@ -3051,10 +3660,8 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
3051 if (!vcpu->arch.pio.in) { 3660 if (!vcpu->arch.pio.in) {
3052 /* string PIO write */ 3661 /* string PIO write */
3053 ret = pio_copy_data(vcpu); 3662 ret = pio_copy_data(vcpu);
3054 if (ret == X86EMUL_PROPAGATE_FAULT) { 3663 if (ret == X86EMUL_PROPAGATE_FAULT)
3055 kvm_inject_gp(vcpu, 0);
3056 return 1; 3664 return 1;
3057 }
3058 if (ret == 0 && !pio_string_write(vcpu)) { 3665 if (ret == 0 && !pio_string_write(vcpu)) {
3059 complete_pio(vcpu); 3666 complete_pio(vcpu);
3060 if (vcpu->arch.pio.count == 0) 3667 if (vcpu->arch.pio.count == 0)
@@ -3072,9 +3679,6 @@ static void bounce_off(void *info)
3072 /* nothing */ 3679 /* nothing */
3073} 3680}
3074 3681
3075static unsigned int ref_freq;
3076static unsigned long tsc_khz_ref;
3077
3078static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val, 3682static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
3079 void *data) 3683 void *data)
3080{ 3684{
@@ -3083,14 +3687,11 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
3083 struct kvm_vcpu *vcpu; 3687 struct kvm_vcpu *vcpu;
3084 int i, send_ipi = 0; 3688 int i, send_ipi = 0;
3085 3689
3086 if (!ref_freq)
3087 ref_freq = freq->old;
3088
3089 if (val == CPUFREQ_PRECHANGE && freq->old > freq->new) 3690 if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
3090 return 0; 3691 return 0;
3091 if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new) 3692 if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
3092 return 0; 3693 return 0;
3093 per_cpu(cpu_tsc_khz, freq->cpu) = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new); 3694 per_cpu(cpu_tsc_khz, freq->cpu) = freq->new;
3094 3695
3095 spin_lock(&kvm_lock); 3696 spin_lock(&kvm_lock);
3096 list_for_each_entry(kvm, &vm_list, vm_list) { 3697 list_for_each_entry(kvm, &vm_list, vm_list) {
@@ -3127,9 +3728,28 @@ static struct notifier_block kvmclock_cpufreq_notifier_block = {
3127 .notifier_call = kvmclock_cpufreq_notifier 3728 .notifier_call = kvmclock_cpufreq_notifier
3128}; 3729};
3129 3730
3731static void kvm_timer_init(void)
3732{
3733 int cpu;
3734
3735 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
3736 cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
3737 CPUFREQ_TRANSITION_NOTIFIER);
3738 for_each_online_cpu(cpu) {
3739 unsigned long khz = cpufreq_get(cpu);
3740 if (!khz)
3741 khz = tsc_khz;
3742 per_cpu(cpu_tsc_khz, cpu) = khz;
3743 }
3744 } else {
3745 for_each_possible_cpu(cpu)
3746 per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
3747 }
3748}
3749
3130int kvm_arch_init(void *opaque) 3750int kvm_arch_init(void *opaque)
3131{ 3751{
3132 int r, cpu; 3752 int r;
3133 struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque; 3753 struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque;
3134 3754
3135 if (kvm_x86_ops) { 3755 if (kvm_x86_ops) {
@@ -3161,13 +3781,7 @@ int kvm_arch_init(void *opaque)
3161 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, 3781 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
3162 PT_DIRTY_MASK, PT64_NX_MASK, 0); 3782 PT_DIRTY_MASK, PT64_NX_MASK, 0);
3163 3783
3164 for_each_possible_cpu(cpu) 3784 kvm_timer_init();
3165 per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
3166 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
3167 tsc_khz_ref = tsc_khz;
3168 cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
3169 CPUFREQ_TRANSITION_NOTIFIER);
3170 }
3171 3785
3172 return 0; 3786 return 0;
3173 3787
@@ -3206,11 +3820,76 @@ static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0,
3206 return a0 | ((gpa_t)a1 << 32); 3820 return a0 | ((gpa_t)a1 << 32);
3207} 3821}
3208 3822
3823int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
3824{
3825 u64 param, ingpa, outgpa, ret;
3826 uint16_t code, rep_idx, rep_cnt, res = HV_STATUS_SUCCESS, rep_done = 0;
3827 bool fast, longmode;
3828 int cs_db, cs_l;
3829
3830 /*
3831 * hypercall generates UD from non zero cpl and real mode
3832 * per HYPER-V spec
3833 */
3834 if (kvm_x86_ops->get_cpl(vcpu) != 0 || !is_protmode(vcpu)) {
3835 kvm_queue_exception(vcpu, UD_VECTOR);
3836 return 0;
3837 }
3838
3839 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
3840 longmode = is_long_mode(vcpu) && cs_l == 1;
3841
3842 if (!longmode) {
3843 param = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDX) << 32) |
3844 (kvm_register_read(vcpu, VCPU_REGS_RAX) & 0xffffffff);
3845 ingpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RBX) << 32) |
3846 (kvm_register_read(vcpu, VCPU_REGS_RCX) & 0xffffffff);
3847 outgpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDI) << 32) |
3848 (kvm_register_read(vcpu, VCPU_REGS_RSI) & 0xffffffff);
3849 }
3850#ifdef CONFIG_X86_64
3851 else {
3852 param = kvm_register_read(vcpu, VCPU_REGS_RCX);
3853 ingpa = kvm_register_read(vcpu, VCPU_REGS_RDX);
3854 outgpa = kvm_register_read(vcpu, VCPU_REGS_R8);
3855 }
3856#endif
3857
3858 code = param & 0xffff;
3859 fast = (param >> 16) & 0x1;
3860 rep_cnt = (param >> 32) & 0xfff;
3861 rep_idx = (param >> 48) & 0xfff;
3862
3863 trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa);
3864
3865 switch (code) {
3866 case HV_X64_HV_NOTIFY_LONG_SPIN_WAIT:
3867 kvm_vcpu_on_spin(vcpu);
3868 break;
3869 default:
3870 res = HV_STATUS_INVALID_HYPERCALL_CODE;
3871 break;
3872 }
3873
3874 ret = res | (((u64)rep_done & 0xfff) << 32);
3875 if (longmode) {
3876 kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
3877 } else {
3878 kvm_register_write(vcpu, VCPU_REGS_RDX, ret >> 32);
3879 kvm_register_write(vcpu, VCPU_REGS_RAX, ret & 0xffffffff);
3880 }
3881
3882 return 1;
3883}
3884
3209int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) 3885int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
3210{ 3886{
3211 unsigned long nr, a0, a1, a2, a3, ret; 3887 unsigned long nr, a0, a1, a2, a3, ret;
3212 int r = 1; 3888 int r = 1;
3213 3889
3890 if (kvm_hv_hypercall_enabled(vcpu->kvm))
3891 return kvm_hv_hypercall(vcpu);
3892
3214 nr = kvm_register_read(vcpu, VCPU_REGS_RAX); 3893 nr = kvm_register_read(vcpu, VCPU_REGS_RAX);
3215 a0 = kvm_register_read(vcpu, VCPU_REGS_RBX); 3894 a0 = kvm_register_read(vcpu, VCPU_REGS_RBX);
3216 a1 = kvm_register_read(vcpu, VCPU_REGS_RCX); 3895 a1 = kvm_register_read(vcpu, VCPU_REGS_RCX);
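kvm_hv_hypercall() above unpacks the hypercall input from RCX (or from the EDX:EAX, EBX:ECX and EDI:ESI pairs outside long mode): the call code sits in bits 0-15, the fast-call flag in bit 16, the rep count in bits 32-43 and the rep start index in bits 48-59, and the completion status is returned with the same split. A stand-alone sketch of that decoding step; the struct and function names are placeholders:

#include <stdbool.h>
#include <stdint.h>

struct hv_call_input {
        uint16_t code;      /* bits  0-15 */
        bool     fast;      /* bit     16 */
        uint16_t rep_cnt;   /* bits 32-43 */
        uint16_t rep_idx;   /* bits 48-59 */
};

static struct hv_call_input hv_decode_input(uint64_t param)
{
        struct hv_call_input in = {
                .code    = (uint16_t)(param & 0xffff),
                .fast    = (param >> 16) & 0x1,
                .rep_cnt = (uint16_t)((param >> 32) & 0xfff),
                .rep_idx = (uint16_t)((param >> 48) & 0xfff),
        };
        return in;
}
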
@@ -3253,10 +3932,8 @@ EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
3253int kvm_fix_hypercall(struct kvm_vcpu *vcpu) 3932int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
3254{ 3933{
3255 char instruction[3]; 3934 char instruction[3];
3256 int ret = 0;
3257 unsigned long rip = kvm_rip_read(vcpu); 3935 unsigned long rip = kvm_rip_read(vcpu);
3258 3936
3259
3260 /* 3937 /*
3261 * Blow out the MMU to ensure that no other VCPU has an active mapping 3938 * Blow out the MMU to ensure that no other VCPU has an active mapping
3262 * to ensure that the updated hypercall appears atomically across all 3939 * to ensure that the updated hypercall appears atomically across all
@@ -3265,11 +3942,8 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
3265 kvm_mmu_zap_all(vcpu->kvm); 3942 kvm_mmu_zap_all(vcpu->kvm);
3266 3943
3267 kvm_x86_ops->patch_hypercall(vcpu, instruction); 3944 kvm_x86_ops->patch_hypercall(vcpu, instruction);
3268 if (emulator_write_emulated(rip, instruction, 3, vcpu)
3269 != X86EMUL_CONTINUE)
3270 ret = -EFAULT;
3271 3945
3272 return ret; 3946 return emulator_write_emulated(rip, instruction, 3, vcpu);
3273} 3947}
3274 3948
3275static u64 mk_cr_64(u64 curr_cr, u32 new_val) 3949static u64 mk_cr_64(u64 curr_cr, u32 new_val)
@@ -3295,17 +3969,16 @@ void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
3295 unsigned long *rflags) 3969 unsigned long *rflags)
3296{ 3970{
3297 kvm_lmsw(vcpu, msw); 3971 kvm_lmsw(vcpu, msw);
3298 *rflags = kvm_x86_ops->get_rflags(vcpu); 3972 *rflags = kvm_get_rflags(vcpu);
3299} 3973}
3300 3974
3301unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) 3975unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
3302{ 3976{
3303 unsigned long value; 3977 unsigned long value;
3304 3978
3305 kvm_x86_ops->decache_cr4_guest_bits(vcpu);
3306 switch (cr) { 3979 switch (cr) {
3307 case 0: 3980 case 0:
3308 value = vcpu->arch.cr0; 3981 value = kvm_read_cr0(vcpu);
3309 break; 3982 break;
3310 case 2: 3983 case 2:
3311 value = vcpu->arch.cr2; 3984 value = vcpu->arch.cr2;
@@ -3314,7 +3987,7 @@ unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
3314 value = vcpu->arch.cr3; 3987 value = vcpu->arch.cr3;
3315 break; 3988 break;
3316 case 4: 3989 case 4:
3317 value = vcpu->arch.cr4; 3990 value = kvm_read_cr4(vcpu);
3318 break; 3991 break;
3319 case 8: 3992 case 8:
3320 value = kvm_get_cr8(vcpu); 3993 value = kvm_get_cr8(vcpu);
@@ -3332,8 +4005,8 @@ void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
3332{ 4005{
3333 switch (cr) { 4006 switch (cr) {
3334 case 0: 4007 case 0:
3335 kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val)); 4008 kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
3336 *rflags = kvm_x86_ops->get_rflags(vcpu); 4009 *rflags = kvm_get_rflags(vcpu);
3337 break; 4010 break;
3338 case 2: 4011 case 2:
3339 vcpu->arch.cr2 = val; 4012 vcpu->arch.cr2 = val;
@@ -3342,7 +4015,7 @@ void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
3342 kvm_set_cr3(vcpu, val); 4015 kvm_set_cr3(vcpu, val);
3343 break; 4016 break;
3344 case 4: 4017 case 4:
3345 kvm_set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val)); 4018 kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
3346 break; 4019 break;
3347 case 8: 4020 case 8:
3348 kvm_set_cr8(vcpu, val & 0xfUL); 4021 kvm_set_cr8(vcpu, val & 0xfUL);
@@ -3409,6 +4082,7 @@ struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
3409 } 4082 }
3410 return best; 4083 return best;
3411} 4084}
4085EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry);
3412 4086
3413int cpuid_maxphyaddr(struct kvm_vcpu *vcpu) 4087int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
3414{ 4088{
@@ -3453,18 +4127,18 @@ EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
3453 * 4127 *
3454 * No need to exit to userspace if we already have an interrupt queued. 4128 * No need to exit to userspace if we already have an interrupt queued.
3455 */ 4129 */
3456static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu, 4130static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu)
3457 struct kvm_run *kvm_run)
3458{ 4131{
3459 return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) && 4132 return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) &&
3460 kvm_run->request_interrupt_window && 4133 vcpu->run->request_interrupt_window &&
3461 kvm_arch_interrupt_allowed(vcpu)); 4134 kvm_arch_interrupt_allowed(vcpu));
3462} 4135}
3463 4136
3464static void post_kvm_run_save(struct kvm_vcpu *vcpu, 4137static void post_kvm_run_save(struct kvm_vcpu *vcpu)
3465 struct kvm_run *kvm_run)
3466{ 4138{
3467 kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0; 4139 struct kvm_run *kvm_run = vcpu->run;
4140
4141 kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
3468 kvm_run->cr8 = kvm_get_cr8(vcpu); 4142 kvm_run->cr8 = kvm_get_cr8(vcpu);
3469 kvm_run->apic_base = kvm_get_apic_base(vcpu); 4143 kvm_run->apic_base = kvm_get_apic_base(vcpu);
3470 if (irqchip_in_kernel(vcpu->kvm)) 4144 if (irqchip_in_kernel(vcpu->kvm))
@@ -3492,14 +4166,15 @@ static void vapic_enter(struct kvm_vcpu *vcpu)
3492static void vapic_exit(struct kvm_vcpu *vcpu) 4166static void vapic_exit(struct kvm_vcpu *vcpu)
3493{ 4167{
3494 struct kvm_lapic *apic = vcpu->arch.apic; 4168 struct kvm_lapic *apic = vcpu->arch.apic;
4169 int idx;
3495 4170
3496 if (!apic || !apic->vapic_addr) 4171 if (!apic || !apic->vapic_addr)
3497 return; 4172 return;
3498 4173
3499 down_read(&vcpu->kvm->slots_lock); 4174 idx = srcu_read_lock(&vcpu->kvm->srcu);
3500 kvm_release_page_dirty(apic->vapic_page); 4175 kvm_release_page_dirty(apic->vapic_page);
3501 mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); 4176 mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
3502 up_read(&vcpu->kvm->slots_lock); 4177 srcu_read_unlock(&vcpu->kvm->srcu, idx);
3503} 4178}
3504 4179
3505static void update_cr8_intercept(struct kvm_vcpu *vcpu) 4180static void update_cr8_intercept(struct kvm_vcpu *vcpu)
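The down_read(&vcpu->kvm->slots_lock)/up_read() pairs in this hunk and the ones below are all converted to the same SRCU read-side pattern. A minimal sketch of that pattern, assuming only the kvm->srcu field this series introduces (the function name is illustrative):

	#include <linux/srcu.h>

	static void example_memslot_reader(struct kvm *kvm)
	{
		int idx;

		/* Enter the SRCU read-side critical section; the returned
		 * index must be handed back on unlock. The patch keeps it
		 * in vcpu->srcu_idx when the section spans guest entry,
		 * or in a local variable (as here) otherwise. */
		idx = srcu_read_lock(&kvm->srcu);
		/* ... kvm->memslots may be dereferenced safely here ... */
		srcu_read_unlock(&kvm->srcu, idx);
	}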
@@ -3525,7 +4200,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
3525 kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr); 4200 kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
3526} 4201}
3527 4202
3528static void inject_pending_event(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 4203static void inject_pending_event(struct kvm_vcpu *vcpu)
3529{ 4204{
3530 /* try to reinject previous events if any */ 4205 /* try to reinject previous events if any */
3531 if (vcpu->arch.exception.pending) { 4206 if (vcpu->arch.exception.pending) {
@@ -3561,11 +4236,11 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3561 } 4236 }
3562} 4237}
3563 4238
3564static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 4239static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
3565{ 4240{
3566 int r; 4241 int r;
3567 bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && 4242 bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
3568 kvm_run->request_interrupt_window; 4243 vcpu->run->request_interrupt_window;
3569 4244
3570 if (vcpu->requests) 4245 if (vcpu->requests)
3571 if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) 4246 if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
@@ -3586,21 +4261,26 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3586 kvm_x86_ops->tlb_flush(vcpu); 4261 kvm_x86_ops->tlb_flush(vcpu);
3587 if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, 4262 if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
3588 &vcpu->requests)) { 4263 &vcpu->requests)) {
3589 kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS; 4264 vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
3590 r = 0; 4265 r = 0;
3591 goto out; 4266 goto out;
3592 } 4267 }
3593 if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) { 4268 if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) {
3594 kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; 4269 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
3595 r = 0; 4270 r = 0;
3596 goto out; 4271 goto out;
3597 } 4272 }
4273 if (test_and_clear_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests)) {
4274 vcpu->fpu_active = 0;
4275 kvm_x86_ops->fpu_deactivate(vcpu);
4276 }
3598 } 4277 }
3599 4278
3600 preempt_disable(); 4279 preempt_disable();
3601 4280
3602 kvm_x86_ops->prepare_guest_switch(vcpu); 4281 kvm_x86_ops->prepare_guest_switch(vcpu);
3603 kvm_load_guest_fpu(vcpu); 4282 if (vcpu->fpu_active)
4283 kvm_load_guest_fpu(vcpu);
3604 4284
3605 local_irq_disable(); 4285 local_irq_disable();
3606 4286
@@ -3615,7 +4295,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3615 goto out; 4295 goto out;
3616 } 4296 }
3617 4297
3618 inject_pending_event(vcpu, kvm_run); 4298 inject_pending_event(vcpu);
3619 4299
3620 /* enable NMI/IRQ window open exits if needed */ 4300 /* enable NMI/IRQ window open exits if needed */
3621 if (vcpu->arch.nmi_pending) 4301 if (vcpu->arch.nmi_pending)
@@ -3628,7 +4308,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3628 kvm_lapic_sync_to_vapic(vcpu); 4308 kvm_lapic_sync_to_vapic(vcpu);
3629 } 4309 }
3630 4310
3631 up_read(&vcpu->kvm->slots_lock); 4311 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
3632 4312
3633 kvm_guest_enter(); 4313 kvm_guest_enter();
3634 4314
@@ -3641,16 +4321,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3641 } 4321 }
3642 4322
3643 trace_kvm_entry(vcpu->vcpu_id); 4323 trace_kvm_entry(vcpu->vcpu_id);
3644 kvm_x86_ops->run(vcpu, kvm_run); 4324 kvm_x86_ops->run(vcpu);
3645 4325
3646 if (unlikely(vcpu->arch.switch_db_regs || test_thread_flag(TIF_DEBUG))) { 4326 /*
3647 set_debugreg(current->thread.debugreg0, 0); 4327 * If the guest has used debug registers, at least dr7
3648 set_debugreg(current->thread.debugreg1, 1); 4328 * will be disabled while returning to the host.
3649 set_debugreg(current->thread.debugreg2, 2); 4329 * If we don't have active breakpoints in the host, we don't
3650 set_debugreg(current->thread.debugreg3, 3); 4330 * care about the messed up debug address registers. But if
3651 set_debugreg(current->thread.debugreg6, 6); 4331 * we have some of them active, restore the old state.
3652 set_debugreg(current->thread.debugreg7, 7); 4332 */
3653 } 4333 if (hw_breakpoint_active())
4334 hw_breakpoint_restore();
3654 4335
3655 set_bit(KVM_REQ_KICK, &vcpu->requests); 4336 set_bit(KVM_REQ_KICK, &vcpu->requests);
3656 local_irq_enable(); 4337 local_irq_enable();
@@ -3669,7 +4350,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3669 4350
3670 preempt_enable(); 4351 preempt_enable();
3671 4352
3672 down_read(&vcpu->kvm->slots_lock); 4353 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
3673 4354
3674 /* 4355 /*
3675 * Profile KVM exit RIPs: 4356 * Profile KVM exit RIPs:
@@ -3682,15 +4363,16 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3682 4363
3683 kvm_lapic_sync_from_vapic(vcpu); 4364 kvm_lapic_sync_from_vapic(vcpu);
3684 4365
3685 r = kvm_x86_ops->handle_exit(kvm_run, vcpu); 4366 r = kvm_x86_ops->handle_exit(vcpu);
3686out: 4367out:
3687 return r; 4368 return r;
3688} 4369}
3689 4370
3690 4371
3691static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 4372static int __vcpu_run(struct kvm_vcpu *vcpu)
3692{ 4373{
3693 int r; 4374 int r;
4375 struct kvm *kvm = vcpu->kvm;
3694 4376
3695 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) { 4377 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
3696 pr_debug("vcpu %d received sipi with vector # %x\n", 4378 pr_debug("vcpu %d received sipi with vector # %x\n",
@@ -3702,17 +4384,17 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3702 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 4384 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
3703 } 4385 }
3704 4386
3705 down_read(&vcpu->kvm->slots_lock); 4387 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
3706 vapic_enter(vcpu); 4388 vapic_enter(vcpu);
3707 4389
3708 r = 1; 4390 r = 1;
3709 while (r > 0) { 4391 while (r > 0) {
3710 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) 4392 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
3711 r = vcpu_enter_guest(vcpu, kvm_run); 4393 r = vcpu_enter_guest(vcpu);
3712 else { 4394 else {
3713 up_read(&vcpu->kvm->slots_lock); 4395 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
3714 kvm_vcpu_block(vcpu); 4396 kvm_vcpu_block(vcpu);
3715 down_read(&vcpu->kvm->slots_lock); 4397 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
3716 if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests)) 4398 if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests))
3717 { 4399 {
3718 switch(vcpu->arch.mp_state) { 4400 switch(vcpu->arch.mp_state) {
@@ -3736,25 +4418,25 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3736 if (kvm_cpu_has_pending_timer(vcpu)) 4418 if (kvm_cpu_has_pending_timer(vcpu))
3737 kvm_inject_pending_timer_irqs(vcpu); 4419 kvm_inject_pending_timer_irqs(vcpu);
3738 4420
3739 if (dm_request_for_irq_injection(vcpu, kvm_run)) { 4421 if (dm_request_for_irq_injection(vcpu)) {
3740 r = -EINTR; 4422 r = -EINTR;
3741 kvm_run->exit_reason = KVM_EXIT_INTR; 4423 vcpu->run->exit_reason = KVM_EXIT_INTR;
3742 ++vcpu->stat.request_irq_exits; 4424 ++vcpu->stat.request_irq_exits;
3743 } 4425 }
3744 if (signal_pending(current)) { 4426 if (signal_pending(current)) {
3745 r = -EINTR; 4427 r = -EINTR;
3746 kvm_run->exit_reason = KVM_EXIT_INTR; 4428 vcpu->run->exit_reason = KVM_EXIT_INTR;
3747 ++vcpu->stat.signal_exits; 4429 ++vcpu->stat.signal_exits;
3748 } 4430 }
3749 if (need_resched()) { 4431 if (need_resched()) {
3750 up_read(&vcpu->kvm->slots_lock); 4432 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
3751 kvm_resched(vcpu); 4433 kvm_resched(vcpu);
3752 down_read(&vcpu->kvm->slots_lock); 4434 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
3753 } 4435 }
3754 } 4436 }
3755 4437
3756 up_read(&vcpu->kvm->slots_lock); 4438 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
3757 post_kvm_run_save(vcpu, kvm_run); 4439 post_kvm_run_save(vcpu);
3758 4440
3759 vapic_exit(vcpu); 4441 vapic_exit(vcpu);
3760 4442
@@ -3783,21 +4465,21 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3783 kvm_set_cr8(vcpu, kvm_run->cr8); 4465 kvm_set_cr8(vcpu, kvm_run->cr8);
3784 4466
3785 if (vcpu->arch.pio.cur_count) { 4467 if (vcpu->arch.pio.cur_count) {
4468 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
3786 r = complete_pio(vcpu); 4469 r = complete_pio(vcpu);
4470 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
3787 if (r) 4471 if (r)
3788 goto out; 4472 goto out;
3789 } 4473 }
3790#if CONFIG_HAS_IOMEM
3791 if (vcpu->mmio_needed) { 4474 if (vcpu->mmio_needed) {
3792 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); 4475 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
3793 vcpu->mmio_read_completed = 1; 4476 vcpu->mmio_read_completed = 1;
3794 vcpu->mmio_needed = 0; 4477 vcpu->mmio_needed = 0;
3795 4478
3796 down_read(&vcpu->kvm->slots_lock); 4479 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
3797 r = emulate_instruction(vcpu, kvm_run, 4480 r = emulate_instruction(vcpu, vcpu->arch.mmio_fault_cr2, 0,
3798 vcpu->arch.mmio_fault_cr2, 0,
3799 EMULTYPE_NO_DECODE); 4481 EMULTYPE_NO_DECODE);
3800 up_read(&vcpu->kvm->slots_lock); 4482 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
3801 if (r == EMULATE_DO_MMIO) { 4483 if (r == EMULATE_DO_MMIO) {
3802 /* 4484 /*
3803 * Read-modify-write. Back to userspace. 4485 * Read-modify-write. Back to userspace.
@@ -3806,12 +4488,11 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3806 goto out; 4488 goto out;
3807 } 4489 }
3808 } 4490 }
3809#endif
3810 if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) 4491 if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL)
3811 kvm_register_write(vcpu, VCPU_REGS_RAX, 4492 kvm_register_write(vcpu, VCPU_REGS_RAX,
3812 kvm_run->hypercall.ret); 4493 kvm_run->hypercall.ret);
3813 4494
3814 r = __vcpu_run(vcpu, kvm_run); 4495 r = __vcpu_run(vcpu);
3815 4496
3816out: 4497out:
3817 if (vcpu->sigset_active) 4498 if (vcpu->sigset_active)
@@ -3845,13 +4526,7 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
3845#endif 4526#endif
3846 4527
3847 regs->rip = kvm_rip_read(vcpu); 4528 regs->rip = kvm_rip_read(vcpu);
3848 regs->rflags = kvm_x86_ops->get_rflags(vcpu); 4529 regs->rflags = kvm_get_rflags(vcpu);
3849
3850 /*
3851 * Don't leak debug flags in case they were set for guest debugging
3852 */
3853 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
3854 regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
3855 4530
3856 vcpu_put(vcpu); 4531 vcpu_put(vcpu);
3857 4532
@@ -3879,12 +4554,10 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
3879 kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13); 4554 kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13);
3880 kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14); 4555 kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14);
3881 kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15); 4556 kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15);
3882
3883#endif 4557#endif
3884 4558
3885 kvm_rip_write(vcpu, regs->rip); 4559 kvm_rip_write(vcpu, regs->rip);
3886 kvm_x86_ops->set_rflags(vcpu, regs->rflags); 4560 kvm_set_rflags(vcpu, regs->rflags);
3887
3888 4561
3889 vcpu->arch.exception.pending = false; 4562 vcpu->arch.exception.pending = false;
3890 4563
@@ -3933,13 +4606,12 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
3933 sregs->gdt.limit = dt.limit; 4606 sregs->gdt.limit = dt.limit;
3934 sregs->gdt.base = dt.base; 4607 sregs->gdt.base = dt.base;
3935 4608
3936 kvm_x86_ops->decache_cr4_guest_bits(vcpu); 4609 sregs->cr0 = kvm_read_cr0(vcpu);
3937 sregs->cr0 = vcpu->arch.cr0;
3938 sregs->cr2 = vcpu->arch.cr2; 4610 sregs->cr2 = vcpu->arch.cr2;
3939 sregs->cr3 = vcpu->arch.cr3; 4611 sregs->cr3 = vcpu->arch.cr3;
3940 sregs->cr4 = vcpu->arch.cr4; 4612 sregs->cr4 = kvm_read_cr4(vcpu);
3941 sregs->cr8 = kvm_get_cr8(vcpu); 4613 sregs->cr8 = kvm_get_cr8(vcpu);
3942 sregs->efer = vcpu->arch.shadow_efer; 4614 sregs->efer = vcpu->arch.efer;
3943 sregs->apic_base = kvm_get_apic_base(vcpu); 4615 sregs->apic_base = kvm_get_apic_base(vcpu);
3944 4616
3945 memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap); 4617 memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap);
@@ -4027,14 +4699,23 @@ static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
4027{ 4699{
4028 struct descriptor_table dtable; 4700 struct descriptor_table dtable;
4029 u16 index = selector >> 3; 4701 u16 index = selector >> 3;
4702 int ret;
4703 u32 err;
4704 gva_t addr;
4030 4705
4031 get_segment_descriptor_dtable(vcpu, selector, &dtable); 4706 get_segment_descriptor_dtable(vcpu, selector, &dtable);
4032 4707
4033 if (dtable.limit < index * 8 + 7) { 4708 if (dtable.limit < index * 8 + 7) {
4034 kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc); 4709 kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc);
4035 return 1; 4710 return X86EMUL_PROPAGATE_FAULT;
4036 } 4711 }
4037 return kvm_read_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu); 4712 addr = dtable.base + index * 8;
4713 ret = kvm_read_guest_virt_system(addr, seg_desc, sizeof(*seg_desc),
4714 vcpu, &err);
4715 if (ret == X86EMUL_PROPAGATE_FAULT)
4716 kvm_inject_page_fault(vcpu, addr, err);
4717
4718 return ret;
4038} 4719}
4039 4720
4040/* allowed just for 8 bytes segments */ 4721/* allowed just for 8 bytes segments */
@@ -4048,15 +4729,23 @@ static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
4048 4729
4049 if (dtable.limit < index * 8 + 7) 4730 if (dtable.limit < index * 8 + 7)
4050 return 1; 4731 return 1;
4051 return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu); 4732 return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu, NULL);
4733}
4734
4735static gpa_t get_tss_base_addr_write(struct kvm_vcpu *vcpu,
4736 struct desc_struct *seg_desc)
4737{
4738 u32 base_addr = get_desc_base(seg_desc);
4739
4740 return kvm_mmu_gva_to_gpa_write(vcpu, base_addr, NULL);
4052} 4741}
4053 4742
4054static gpa_t get_tss_base_addr(struct kvm_vcpu *vcpu, 4743static gpa_t get_tss_base_addr_read(struct kvm_vcpu *vcpu,
4055 struct desc_struct *seg_desc) 4744 struct desc_struct *seg_desc)
4056{ 4745{
4057 u32 base_addr = get_desc_base(seg_desc); 4746 u32 base_addr = get_desc_base(seg_desc);
4058 4747
4059 return vcpu->arch.mmu.gva_to_gpa(vcpu, base_addr); 4748 return kvm_mmu_gva_to_gpa_read(vcpu, base_addr, NULL);
4060} 4749}
4061 4750
4062static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg) 4751static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg)
@@ -4067,18 +4756,6 @@ static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg)
4067 return kvm_seg.selector; 4756 return kvm_seg.selector;
4068} 4757}
4069 4758
4070static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu,
4071 u16 selector,
4072 struct kvm_segment *kvm_seg)
4073{
4074 struct desc_struct seg_desc;
4075
4076 if (load_guest_segment_descriptor(vcpu, selector, &seg_desc))
4077 return 1;
4078 seg_desct_to_kvm_desct(&seg_desc, selector, kvm_seg);
4079 return 0;
4080}
4081
4082static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg) 4759static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg)
4083{ 4760{
4084 struct kvm_segment segvar = { 4761 struct kvm_segment segvar = {
@@ -4096,34 +4773,122 @@ static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int se
4096 .unusable = 0, 4773 .unusable = 0,
4097 }; 4774 };
4098 kvm_x86_ops->set_segment(vcpu, &segvar, seg); 4775 kvm_x86_ops->set_segment(vcpu, &segvar, seg);
4099 return 0; 4776 return X86EMUL_CONTINUE;
4100} 4777}
4101 4778
4102static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg) 4779static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg)
4103{ 4780{
4104 return (seg != VCPU_SREG_LDTR) && 4781 return (seg != VCPU_SREG_LDTR) &&
4105 (seg != VCPU_SREG_TR) && 4782 (seg != VCPU_SREG_TR) &&
4106 (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_VM); 4783 (kvm_get_rflags(vcpu) & X86_EFLAGS_VM);
4107} 4784}
4108 4785
4109int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 4786int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg)
4110 int type_bits, int seg)
4111{ 4787{
4112 struct kvm_segment kvm_seg; 4788 struct kvm_segment kvm_seg;
4789 struct desc_struct seg_desc;
4790 u8 dpl, rpl, cpl;
4791 unsigned err_vec = GP_VECTOR;
4792 u32 err_code = 0;
4793 bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */
4794 int ret;
4113 4795
4114 if (is_vm86_segment(vcpu, seg) || !(vcpu->arch.cr0 & X86_CR0_PE)) 4796 if (is_vm86_segment(vcpu, seg) || !is_protmode(vcpu))
4115 return kvm_load_realmode_segment(vcpu, selector, seg); 4797 return kvm_load_realmode_segment(vcpu, selector, seg);
4116 if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg))
4117 return 1;
4118 kvm_seg.type |= type_bits;
4119 4798
4120 if (seg != VCPU_SREG_SS && seg != VCPU_SREG_CS && 4799 /* NULL selector is not valid for TR, CS and SS */
4121 seg != VCPU_SREG_LDTR) 4800 if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR)
4122 if (!kvm_seg.s) 4801 && null_selector)
4123 kvm_seg.unusable = 1; 4802 goto exception;
4803
4804 /* TR should be in GDT only */
4805 if (seg == VCPU_SREG_TR && (selector & (1 << 2)))
4806 goto exception;
4807
4808 ret = load_guest_segment_descriptor(vcpu, selector, &seg_desc);
4809 if (ret)
4810 return ret;
4811
4812 seg_desct_to_kvm_desct(&seg_desc, selector, &kvm_seg);
4813
4814 if (null_selector) { /* for NULL selector skip all following checks */
4815 kvm_seg.unusable = 1;
4816 goto load;
4817 }
4818
4819 err_code = selector & 0xfffc;
4820 err_vec = GP_VECTOR;
4124 4821
4822 /* can't load system descriptor into segment selector */
4823 if (seg <= VCPU_SREG_GS && !kvm_seg.s)
4824 goto exception;
4825
4826 if (!kvm_seg.present) {
4827 err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR;
4828 goto exception;
4829 }
4830
4831 rpl = selector & 3;
4832 dpl = kvm_seg.dpl;
4833 cpl = kvm_x86_ops->get_cpl(vcpu);
4834
4835 switch (seg) {
4836 case VCPU_SREG_SS:
4837 /*
4838 * segment is not a writable data segment, or the segment
4839 * selector's RPL != CPL, or the segment descriptor's DPL != CPL
4840 */
4841 if (rpl != cpl || (kvm_seg.type & 0xa) != 0x2 || dpl != cpl)
4842 goto exception;
4843 break;
4844 case VCPU_SREG_CS:
4845 if (!(kvm_seg.type & 8))
4846 goto exception;
4847
4848 if (kvm_seg.type & 4) {
4849 /* conforming */
4850 if (dpl > cpl)
4851 goto exception;
4852 } else {
4853 /* nonconforming */
4854 if (rpl > cpl || dpl != cpl)
4855 goto exception;
4856 }
4857 /* CS(RPL) <- CPL */
4858 selector = (selector & 0xfffc) | cpl;
4859 break;
4860 case VCPU_SREG_TR:
4861 if (kvm_seg.s || (kvm_seg.type != 1 && kvm_seg.type != 9))
4862 goto exception;
4863 break;
4864 case VCPU_SREG_LDTR:
4865 if (kvm_seg.s || kvm_seg.type != 2)
4866 goto exception;
4867 break;
4868 default: /* DS, ES, FS, or GS */
4869 /*
4870 * segment is not a data or readable code segment or
4871 * ((segment is a data or nonconforming code segment)
4872 * and (both RPL and CPL > DPL))
4873 */
4874 if ((kvm_seg.type & 0xa) == 0x8 ||
4875 (((kvm_seg.type & 0xc) != 0xc) && (rpl > dpl && cpl > dpl)))
4876 goto exception;
4877 break;
4878 }
4879
4880 if (!kvm_seg.unusable && kvm_seg.s) {
4881 /* mark segment as accessed */
4882 kvm_seg.type |= 1;
4883 seg_desc.type |= 1;
4884 save_guest_segment_descriptor(vcpu, selector, &seg_desc);
4885 }
4886load:
4125 kvm_set_segment(vcpu, &kvm_seg, seg); 4887 kvm_set_segment(vcpu, &kvm_seg, seg);
4126 return 0; 4888 return X86EMUL_CONTINUE;
4889exception:
4890 kvm_queue_exception_e(vcpu, err_vec, err_code);
4891 return X86EMUL_PROPAGATE_FAULT;
4127} 4892}
4128 4893
4129static void save_state_to_tss32(struct kvm_vcpu *vcpu, 4894static void save_state_to_tss32(struct kvm_vcpu *vcpu,
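With the old type_bits argument gone, kvm_load_segment_descriptor() now performs the privilege checks itself and queues #GP/#SS/#NP (or a page fault) with the selector as error code before returning X86EMUL_PROPAGATE_FAULT. A hedged sketch of the resulting calling convention (the wrapper name is hypothetical; the return codes are the ones used above):

	static int example_reload_ds(struct kvm_vcpu *vcpu, u16 sel)
	{
		/* The helper has already queued the exception on failure,
		 * so the caller only needs to abort the current operation. */
		if (kvm_load_segment_descriptor(vcpu, sel, VCPU_SREG_DS)
		    != X86EMUL_CONTINUE)
			return 1;
		return 0;
	}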
@@ -4131,7 +4896,7 @@ static void save_state_to_tss32(struct kvm_vcpu *vcpu,
4131{ 4896{
4132 tss->cr3 = vcpu->arch.cr3; 4897 tss->cr3 = vcpu->arch.cr3;
4133 tss->eip = kvm_rip_read(vcpu); 4898 tss->eip = kvm_rip_read(vcpu);
4134 tss->eflags = kvm_x86_ops->get_rflags(vcpu); 4899 tss->eflags = kvm_get_rflags(vcpu);
4135 tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX); 4900 tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
4136 tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX); 4901 tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
4137 tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX); 4902 tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX);
@@ -4149,13 +4914,21 @@ static void save_state_to_tss32(struct kvm_vcpu *vcpu,
4149 tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR); 4914 tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR);
4150} 4915}
4151 4916
4917static void kvm_load_segment_selector(struct kvm_vcpu *vcpu, u16 sel, int seg)
4918{
4919 struct kvm_segment kvm_seg;
4920 kvm_get_segment(vcpu, &kvm_seg, seg);
4921 kvm_seg.selector = sel;
4922 kvm_set_segment(vcpu, &kvm_seg, seg);
4923}
4924
4152static int load_state_from_tss32(struct kvm_vcpu *vcpu, 4925static int load_state_from_tss32(struct kvm_vcpu *vcpu,
4153 struct tss_segment_32 *tss) 4926 struct tss_segment_32 *tss)
4154{ 4927{
4155 kvm_set_cr3(vcpu, tss->cr3); 4928 kvm_set_cr3(vcpu, tss->cr3);
4156 4929
4157 kvm_rip_write(vcpu, tss->eip); 4930 kvm_rip_write(vcpu, tss->eip);
4158 kvm_x86_ops->set_rflags(vcpu, tss->eflags | 2); 4931 kvm_set_rflags(vcpu, tss->eflags | 2);
4159 4932
4160 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax); 4933 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax);
4161 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx); 4934 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx);
@@ -4166,25 +4939,41 @@ static int load_state_from_tss32(struct kvm_vcpu *vcpu,
4166 kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi); 4939 kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi);
4167 kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi); 4940 kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi);
4168 4941
4169 if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR)) 4942 /*
4943 * SDM says that segment selectors are loaded before segment
4944 * descriptors
4945 */
4946 kvm_load_segment_selector(vcpu, tss->ldt_selector, VCPU_SREG_LDTR);
4947 kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES);
4948 kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS);
4949 kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS);
4950 kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS);
4951 kvm_load_segment_selector(vcpu, tss->fs, VCPU_SREG_FS);
4952 kvm_load_segment_selector(vcpu, tss->gs, VCPU_SREG_GS);
4953
4954 /*
4955 * Now load segment descriptors. If a fault happens at this stage
4956 * it is handled in the context of the new task
4957 */
4958 if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, VCPU_SREG_LDTR))
4170 return 1; 4959 return 1;
4171 4960
4172 if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) 4961 if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES))
4173 return 1; 4962 return 1;
4174 4963
4175 if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) 4964 if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS))
4176 return 1; 4965 return 1;
4177 4966
4178 if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) 4967 if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS))
4179 return 1; 4968 return 1;
4180 4969
4181 if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) 4970 if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS))
4182 return 1; 4971 return 1;
4183 4972
4184 if (kvm_load_segment_descriptor(vcpu, tss->fs, 1, VCPU_SREG_FS)) 4973 if (kvm_load_segment_descriptor(vcpu, tss->fs, VCPU_SREG_FS))
4185 return 1; 4974 return 1;
4186 4975
4187 if (kvm_load_segment_descriptor(vcpu, tss->gs, 1, VCPU_SREG_GS)) 4976 if (kvm_load_segment_descriptor(vcpu, tss->gs, VCPU_SREG_GS))
4188 return 1; 4977 return 1;
4189 return 0; 4978 return 0;
4190} 4979}
@@ -4193,7 +4982,7 @@ static void save_state_to_tss16(struct kvm_vcpu *vcpu,
4193 struct tss_segment_16 *tss) 4982 struct tss_segment_16 *tss)
4194{ 4983{
4195 tss->ip = kvm_rip_read(vcpu); 4984 tss->ip = kvm_rip_read(vcpu);
4196 tss->flag = kvm_x86_ops->get_rflags(vcpu); 4985 tss->flag = kvm_get_rflags(vcpu);
4197 tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX); 4986 tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX);
4198 tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX); 4987 tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX);
4199 tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX); 4988 tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX);
@@ -4208,14 +4997,13 @@ static void save_state_to_tss16(struct kvm_vcpu *vcpu,
4208 tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS); 4997 tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
4209 tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS); 4998 tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
4210 tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR); 4999 tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR);
4211 tss->prev_task_link = get_segment_selector(vcpu, VCPU_SREG_TR);
4212} 5000}
4213 5001
4214static int load_state_from_tss16(struct kvm_vcpu *vcpu, 5002static int load_state_from_tss16(struct kvm_vcpu *vcpu,
4215 struct tss_segment_16 *tss) 5003 struct tss_segment_16 *tss)
4216{ 5004{
4217 kvm_rip_write(vcpu, tss->ip); 5005 kvm_rip_write(vcpu, tss->ip);
4218 kvm_x86_ops->set_rflags(vcpu, tss->flag | 2); 5006 kvm_set_rflags(vcpu, tss->flag | 2);
4219 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax); 5007 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax);
4220 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx); 5008 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx);
4221 kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx); 5009 kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx);
@@ -4225,19 +5013,33 @@ static int load_state_from_tss16(struct kvm_vcpu *vcpu,
4225 kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si); 5013 kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si);
4226 kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di); 5014 kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di);
4227 5015
4228 if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR)) 5016 /*
5017 * SDM says that segment selectors are loaded before segment
5018 * descriptors
5019 */
5020 kvm_load_segment_selector(vcpu, tss->ldt, VCPU_SREG_LDTR);
5021 kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES);
5022 kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS);
5023 kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS);
5024 kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS);
5025
5026 /*
5027 * Now load segment descriptors. If a fault happens at this stage
5028 * it is handled in the context of the new task
5029 */
5030 if (kvm_load_segment_descriptor(vcpu, tss->ldt, VCPU_SREG_LDTR))
4229 return 1; 5031 return 1;
4230 5032
4231 if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) 5033 if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES))
4232 return 1; 5034 return 1;
4233 5035
4234 if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) 5036 if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS))
4235 return 1; 5037 return 1;
4236 5038
4237 if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) 5039 if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS))
4238 return 1; 5040 return 1;
4239 5041
4240 if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) 5042 if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS))
4241 return 1; 5043 return 1;
4242 return 0; 5044 return 0;
4243} 5045}
@@ -4259,7 +5061,7 @@ static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
4259 sizeof tss_segment_16)) 5061 sizeof tss_segment_16))
4260 goto out; 5062 goto out;
4261 5063
4262 if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc), 5064 if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc),
4263 &tss_segment_16, sizeof tss_segment_16)) 5065 &tss_segment_16, sizeof tss_segment_16))
4264 goto out; 5066 goto out;
4265 5067
@@ -4267,7 +5069,7 @@ static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
4267 tss_segment_16.prev_task_link = old_tss_sel; 5069 tss_segment_16.prev_task_link = old_tss_sel;
4268 5070
4269 if (kvm_write_guest(vcpu->kvm, 5071 if (kvm_write_guest(vcpu->kvm,
4270 get_tss_base_addr(vcpu, nseg_desc), 5072 get_tss_base_addr_write(vcpu, nseg_desc),
4271 &tss_segment_16.prev_task_link, 5073 &tss_segment_16.prev_task_link,
4272 sizeof tss_segment_16.prev_task_link)) 5074 sizeof tss_segment_16.prev_task_link))
4273 goto out; 5075 goto out;
@@ -4298,7 +5100,7 @@ static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
4298 sizeof tss_segment_32)) 5100 sizeof tss_segment_32))
4299 goto out; 5101 goto out;
4300 5102
4301 if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc), 5103 if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc),
4302 &tss_segment_32, sizeof tss_segment_32)) 5104 &tss_segment_32, sizeof tss_segment_32))
4303 goto out; 5105 goto out;
4304 5106
@@ -4306,7 +5108,7 @@ static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
4306 tss_segment_32.prev_task_link = old_tss_sel; 5108 tss_segment_32.prev_task_link = old_tss_sel;
4307 5109
4308 if (kvm_write_guest(vcpu->kvm, 5110 if (kvm_write_guest(vcpu->kvm,
4309 get_tss_base_addr(vcpu, nseg_desc), 5111 get_tss_base_addr_write(vcpu, nseg_desc),
4310 &tss_segment_32.prev_task_link, 5112 &tss_segment_32.prev_task_link,
4311 sizeof tss_segment_32.prev_task_link)) 5113 sizeof tss_segment_32.prev_task_link))
4312 goto out; 5114 goto out;
@@ -4328,8 +5130,9 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
4328 int ret = 0; 5130 int ret = 0;
4329 u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR); 5131 u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR);
4330 u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR); 5132 u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR);
5133 u32 desc_limit;
4331 5134
4332 old_tss_base = vcpu->arch.mmu.gva_to_gpa(vcpu, old_tss_base); 5135 old_tss_base = kvm_mmu_gva_to_gpa_write(vcpu, old_tss_base, NULL);
4333 5136
4334 /* FIXME: Handle errors. Failure to read either TSS or their 5137 /* FIXME: Handle errors. Failure to read either TSS or their
4335 * descriptors should generate a pagefault. 5138 * descriptors should generate a pagefault.
@@ -4350,7 +5153,10 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
4350 } 5153 }
4351 } 5154 }
4352 5155
4353 if (!nseg_desc.p || get_desc_limit(&nseg_desc) < 0x67) { 5156 desc_limit = get_desc_limit(&nseg_desc);
5157 if (!nseg_desc.p ||
5158 ((desc_limit < 0x67 && (nseg_desc.type & 8)) ||
5159 desc_limit < 0x2b)) {
4354 kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc); 5160 kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc);
4355 return 1; 5161 return 1;
4356 } 5162 }
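The reworked limit test encodes the architectural minimum TSS sizes rather than assuming a 32-bit TSS everywhere. An illustrative restatement (helper name hypothetical, constants per the SDM: a 32-bit TSS needs a limit of at least 0x67, i.e. 104 bytes minus one; a 16-bit TSS at least 0x2b, i.e. 44 bytes minus one):

	#include <asm/desc_defs.h>

	static inline bool example_tss_limit_ok(const struct desc_struct *desc,
						u32 limit)
	{
		/* Type bit 3 distinguishes 32-bit from 16-bit TSS descriptors;
		 * anything below the minimum raises #TS(selector) above. */
		return (desc->type & 8) ? limit >= 0x67 : limit >= 0x2b;
	}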
@@ -4361,8 +5167,8 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
4361 } 5167 }
4362 5168
4363 if (reason == TASK_SWITCH_IRET) { 5169 if (reason == TASK_SWITCH_IRET) {
4364 u32 eflags = kvm_x86_ops->get_rflags(vcpu); 5170 u32 eflags = kvm_get_rflags(vcpu);
4365 kvm_x86_ops->set_rflags(vcpu, eflags & ~X86_EFLAGS_NT); 5171 kvm_set_rflags(vcpu, eflags & ~X86_EFLAGS_NT);
4366 } 5172 }
4367 5173
4368 /* set back link to prev task only if NT bit is set in eflags 5174 /* set back link to prev task only if NT bit is set in eflags
@@ -4370,11 +5176,6 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
4370 if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE) 5176 if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
4371 old_tss_sel = 0xffff; 5177 old_tss_sel = 0xffff;
4372 5178
4373 /* set back link to prev task only if NT bit is set in eflags
4374 note that old_tss_sel is not used afetr this point */
4375 if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
4376 old_tss_sel = 0xffff;
4377
4378 if (nseg_desc.type & 8) 5179 if (nseg_desc.type & 8)
4379 ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel, 5180 ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel,
4380 old_tss_base, &nseg_desc); 5181 old_tss_base, &nseg_desc);
@@ -4383,8 +5184,8 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
4383 old_tss_base, &nseg_desc); 5184 old_tss_base, &nseg_desc);
4384 5185
4385 if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) { 5186 if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) {
4386 u32 eflags = kvm_x86_ops->get_rflags(vcpu); 5187 u32 eflags = kvm_get_rflags(vcpu);
4387 kvm_x86_ops->set_rflags(vcpu, eflags | X86_EFLAGS_NT); 5188 kvm_set_rflags(vcpu, eflags | X86_EFLAGS_NT);
4388 } 5189 }
4389 5190
4390 if (reason != TASK_SWITCH_IRET) { 5191 if (reason != TASK_SWITCH_IRET) {
@@ -4393,7 +5194,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
4393 &nseg_desc); 5194 &nseg_desc);
4394 } 5195 }
4395 5196
4396 kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 | X86_CR0_TS); 5197 kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0(vcpu) | X86_CR0_TS);
4397 seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg); 5198 seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg);
4398 tr_seg.type = 11; 5199 tr_seg.type = 11;
4399 kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR); 5200 kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR);
@@ -4424,20 +5225,20 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
4424 5225
4425 kvm_set_cr8(vcpu, sregs->cr8); 5226 kvm_set_cr8(vcpu, sregs->cr8);
4426 5227
4427 mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer; 5228 mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
4428 kvm_x86_ops->set_efer(vcpu, sregs->efer); 5229 kvm_x86_ops->set_efer(vcpu, sregs->efer);
4429 kvm_set_apic_base(vcpu, sregs->apic_base); 5230 kvm_set_apic_base(vcpu, sregs->apic_base);
4430 5231
4431 kvm_x86_ops->decache_cr4_guest_bits(vcpu); 5232 mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
4432
4433 mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0;
4434 kvm_x86_ops->set_cr0(vcpu, sregs->cr0); 5233 kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
4435 vcpu->arch.cr0 = sregs->cr0; 5234 vcpu->arch.cr0 = sregs->cr0;
4436 5235
4437 mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4; 5236 mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
4438 kvm_x86_ops->set_cr4(vcpu, sregs->cr4); 5237 kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
4439 if (!is_long_mode(vcpu) && is_pae(vcpu)) 5238 if (!is_long_mode(vcpu) && is_pae(vcpu)) {
4440 load_pdptrs(vcpu, vcpu->arch.cr3); 5239 load_pdptrs(vcpu, vcpu->arch.cr3);
5240 mmu_reset_needed = 1;
5241 }
4441 5242
4442 if (mmu_reset_needed) 5243 if (mmu_reset_needed)
4443 kvm_mmu_reset_context(vcpu); 5244 kvm_mmu_reset_context(vcpu);
@@ -4467,7 +5268,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
4467 /* Older userspace won't unhalt the vcpu on reset. */ 5268 /* Older userspace won't unhalt the vcpu on reset. */
4468 if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 && 5269 if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 &&
4469 sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 && 5270 sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
4470 !(vcpu->arch.cr0 & X86_CR0_PE)) 5271 !is_protmode(vcpu))
4471 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 5272 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
4472 5273
4473 vcpu_put(vcpu); 5274 vcpu_put(vcpu);
@@ -4478,12 +5279,32 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
4478int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, 5279int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
4479 struct kvm_guest_debug *dbg) 5280 struct kvm_guest_debug *dbg)
4480{ 5281{
5282 unsigned long rflags;
4481 int i, r; 5283 int i, r;
4482 5284
4483 vcpu_load(vcpu); 5285 vcpu_load(vcpu);
4484 5286
4485 if ((dbg->control & (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP)) == 5287 if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) {
4486 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP)) { 5288 r = -EBUSY;
5289 if (vcpu->arch.exception.pending)
5290 goto unlock_out;
5291 if (dbg->control & KVM_GUESTDBG_INJECT_DB)
5292 kvm_queue_exception(vcpu, DB_VECTOR);
5293 else
5294 kvm_queue_exception(vcpu, BP_VECTOR);
5295 }
5296
5297 /*
5298 * Read rflags as long as potentially injected trace flags are still
5299 * filtered out.
5300 */
5301 rflags = kvm_get_rflags(vcpu);
5302
5303 vcpu->guest_debug = dbg->control;
5304 if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE))
5305 vcpu->guest_debug = 0;
5306
5307 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
4487 for (i = 0; i < KVM_NR_DB_REGS; ++i) 5308 for (i = 0; i < KVM_NR_DB_REGS; ++i)
4488 vcpu->arch.eff_db[i] = dbg->arch.debugreg[i]; 5309 vcpu->arch.eff_db[i] = dbg->arch.debugreg[i];
4489 vcpu->arch.switch_db_regs = 5310 vcpu->arch.switch_db_regs =
@@ -4494,13 +5315,23 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
4494 vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK); 5315 vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK);
4495 } 5316 }
4496 5317
4497 r = kvm_x86_ops->set_guest_debug(vcpu, dbg); 5318 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
5319 vcpu->arch.singlestep_cs =
5320 get_segment_selector(vcpu, VCPU_SREG_CS);
5321 vcpu->arch.singlestep_rip = kvm_rip_read(vcpu);
5322 }
5323
5324 /*
5325 * Trigger an rflags update that will inject or remove the trace
5326 * flags.
5327 */
5328 kvm_set_rflags(vcpu, rflags);
4498 5329
4499 if (dbg->control & KVM_GUESTDBG_INJECT_DB) 5330 kvm_x86_ops->set_guest_debug(vcpu, dbg);
4500 kvm_queue_exception(vcpu, DB_VECTOR);
4501 else if (dbg->control & KVM_GUESTDBG_INJECT_BP)
4502 kvm_queue_exception(vcpu, BP_VECTOR);
4503 5331
5332 r = 0;
5333
5334unlock_out:
4504 vcpu_put(vcpu); 5335 vcpu_put(vcpu);
4505 5336
4506 return r; 5337 return r;
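For context on what this handler now accepts from userspace, a hedged sketch of requesting single-step plus one hardware breakpoint through the ioctl (the struct, flags and ioctl are the existing KVM UAPI referenced in the hunk; the helper and the DR7 value are illustrative, and VCPU fd setup is omitted):

	#include <linux/kvm.h>
	#include <string.h>
	#include <sys/ioctl.h>

	static int example_enable_guest_debug(int vcpu_fd, __u64 bp_addr)
	{
		struct kvm_guest_debug dbg;

		memset(&dbg, 0, sizeof(dbg));
		dbg.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP |
			      KVM_GUESTDBG_USE_HW_BP;
		dbg.arch.debugreg[0] = bp_addr;	/* DR0: address to trap */
		dbg.arch.debugreg[7] = 0x1;	/* DR7: local execute breakpoint 0 */

		return ioctl(vcpu_fd, KVM_SET_GUEST_DEBUG, &dbg);
	}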
@@ -4535,11 +5366,12 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
4535{ 5366{
4536 unsigned long vaddr = tr->linear_address; 5367 unsigned long vaddr = tr->linear_address;
4537 gpa_t gpa; 5368 gpa_t gpa;
5369 int idx;
4538 5370
4539 vcpu_load(vcpu); 5371 vcpu_load(vcpu);
4540 down_read(&vcpu->kvm->slots_lock); 5372 idx = srcu_read_lock(&vcpu->kvm->srcu);
4541 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr); 5373 gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL);
4542 up_read(&vcpu->kvm->slots_lock); 5374 srcu_read_unlock(&vcpu->kvm->srcu, idx);
4543 tr->physical_address = gpa; 5375 tr->physical_address = gpa;
4544 tr->valid = gpa != UNMAPPED_GVA; 5376 tr->valid = gpa != UNMAPPED_GVA;
4545 tr->writeable = 1; 5377 tr->writeable = 1;
@@ -4620,14 +5452,14 @@ EXPORT_SYMBOL_GPL(fx_init);
4620 5452
4621void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) 5453void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
4622{ 5454{
4623 if (!vcpu->fpu_active || vcpu->guest_fpu_loaded) 5455 if (vcpu->guest_fpu_loaded)
4624 return; 5456 return;
4625 5457
4626 vcpu->guest_fpu_loaded = 1; 5458 vcpu->guest_fpu_loaded = 1;
4627 kvm_fx_save(&vcpu->arch.host_fx_image); 5459 kvm_fx_save(&vcpu->arch.host_fx_image);
4628 kvm_fx_restore(&vcpu->arch.guest_fx_image); 5460 kvm_fx_restore(&vcpu->arch.guest_fx_image);
5461 trace_kvm_fpu(1);
4629} 5462}
4630EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
4631 5463
4632void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) 5464void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
4633{ 5465{
@@ -4638,8 +5470,9 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
4638 kvm_fx_save(&vcpu->arch.guest_fx_image); 5470 kvm_fx_save(&vcpu->arch.guest_fx_image);
4639 kvm_fx_restore(&vcpu->arch.host_fx_image); 5471 kvm_fx_restore(&vcpu->arch.host_fx_image);
4640 ++vcpu->stat.fpu_reload; 5472 ++vcpu->stat.fpu_reload;
5473 set_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests);
5474 trace_kvm_fpu(0);
4641} 5475}
4642EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
4643 5476
4644void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) 5477void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
4645{ 5478{
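The two halves of the lazy-FPU change live in different functions, so the pairing is easy to miss: kvm_put_guest_fpu() raises KVM_REQ_DEACTIVATE_FPU, and the request loop in vcpu_enter_guest() consumes it before the next entry. A sketch of that producer/consumer pattern (function names illustrative; the bit name and helpers are the ones used above):

	static void example_defer_fpu_deactivate(struct kvm_vcpu *vcpu)
	{
		/* Producer: may run from any context that holds the vcpu. */
		set_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests);
	}

	static void example_process_requests(struct kvm_vcpu *vcpu)
	{
		/* Consumer: runs once on the entry path, then the bit is clear. */
		if (test_and_clear_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests)) {
			vcpu->fpu_active = 0;
			kvm_x86_ops->fpu_deactivate(vcpu);
		}
	}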
@@ -4701,14 +5534,26 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
4701 return kvm_x86_ops->vcpu_reset(vcpu); 5534 return kvm_x86_ops->vcpu_reset(vcpu);
4702} 5535}
4703 5536
4704void kvm_arch_hardware_enable(void *garbage) 5537int kvm_arch_hardware_enable(void *garbage)
4705{ 5538{
4706 kvm_x86_ops->hardware_enable(garbage); 5539 /*
5540 * Since this may be called from a hotplug notification,
5541 * we can't get the CPU frequency directly.
5542 */
5543 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
5544 int cpu = raw_smp_processor_id();
5545 per_cpu(cpu_tsc_khz, cpu) = 0;
5546 }
5547
5548 kvm_shared_msr_cpu_online();
5549
5550 return kvm_x86_ops->hardware_enable(garbage);
4707} 5551}
4708 5552
4709void kvm_arch_hardware_disable(void *garbage) 5553void kvm_arch_hardware_disable(void *garbage)
4710{ 5554{
4711 kvm_x86_ops->hardware_disable(garbage); 5555 kvm_x86_ops->hardware_disable(garbage);
5556 drop_user_return_notifiers(garbage);
4712} 5557}
4713 5558
4714int kvm_arch_hardware_setup(void) 5559int kvm_arch_hardware_setup(void)
@@ -4762,12 +5607,13 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
4762 GFP_KERNEL); 5607 GFP_KERNEL);
4763 if (!vcpu->arch.mce_banks) { 5608 if (!vcpu->arch.mce_banks) {
4764 r = -ENOMEM; 5609 r = -ENOMEM;
4765 goto fail_mmu_destroy; 5610 goto fail_free_lapic;
4766 } 5611 }
4767 vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; 5612 vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
4768 5613
4769 return 0; 5614 return 0;
4770 5615fail_free_lapic:
5616 kvm_free_lapic(vcpu);
4771fail_mmu_destroy: 5617fail_mmu_destroy:
4772 kvm_mmu_destroy(vcpu); 5618 kvm_mmu_destroy(vcpu);
4773fail_free_pio_data: 5619fail_free_pio_data:
@@ -4778,10 +5624,13 @@ fail:
4778 5624
4779void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) 5625void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
4780{ 5626{
5627 int idx;
5628
5629 kfree(vcpu->arch.mce_banks);
4781 kvm_free_lapic(vcpu); 5630 kvm_free_lapic(vcpu);
4782 down_read(&vcpu->kvm->slots_lock); 5631 idx = srcu_read_lock(&vcpu->kvm->srcu);
4783 kvm_mmu_destroy(vcpu); 5632 kvm_mmu_destroy(vcpu);
4784 up_read(&vcpu->kvm->slots_lock); 5633 srcu_read_unlock(&vcpu->kvm->srcu, idx);
4785 free_page((unsigned long)vcpu->arch.pio_data); 5634 free_page((unsigned long)vcpu->arch.pio_data);
4786} 5635}
4787 5636
@@ -4792,6 +5641,12 @@ struct kvm *kvm_arch_create_vm(void)
4792 if (!kvm) 5641 if (!kvm)
4793 return ERR_PTR(-ENOMEM); 5642 return ERR_PTR(-ENOMEM);
4794 5643
5644 kvm->arch.aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL);
5645 if (!kvm->arch.aliases) {
5646 kfree(kvm);
5647 return ERR_PTR(-ENOMEM);
5648 }
5649
4795 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); 5650 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
4796 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); 5651 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
4797 5652
@@ -4848,16 +5703,18 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
4848 put_page(kvm->arch.apic_access_page); 5703 put_page(kvm->arch.apic_access_page);
4849 if (kvm->arch.ept_identity_pagetable) 5704 if (kvm->arch.ept_identity_pagetable)
4850 put_page(kvm->arch.ept_identity_pagetable); 5705 put_page(kvm->arch.ept_identity_pagetable);
5706 cleanup_srcu_struct(&kvm->srcu);
5707 kfree(kvm->arch.aliases);
4851 kfree(kvm); 5708 kfree(kvm);
4852} 5709}
4853 5710
4854int kvm_arch_set_memory_region(struct kvm *kvm, 5711int kvm_arch_prepare_memory_region(struct kvm *kvm,
4855 struct kvm_userspace_memory_region *mem, 5712 struct kvm_memory_slot *memslot,
4856 struct kvm_memory_slot old, 5713 struct kvm_memory_slot old,
5714 struct kvm_userspace_memory_region *mem,
4857 int user_alloc) 5715 int user_alloc)
4858{ 5716{
4859 int npages = mem->memory_size >> PAGE_SHIFT; 5717 int npages = memslot->npages;
4860 struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot];
4861 5718
4862 /* To keep backward compatibility with older userspace, 5719
4863 * x86 needs to handle the !user_alloc case. 5720
@@ -4877,26 +5734,35 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
4877 if (IS_ERR((void *)userspace_addr)) 5734 if (IS_ERR((void *)userspace_addr))
4878 return PTR_ERR((void *)userspace_addr); 5735 return PTR_ERR((void *)userspace_addr);
4879 5736
4880 /* set userspace_addr atomically for kvm_hva_to_rmapp */
4881 spin_lock(&kvm->mmu_lock);
4882 memslot->userspace_addr = userspace_addr; 5737 memslot->userspace_addr = userspace_addr;
4883 spin_unlock(&kvm->mmu_lock);
4884 } else {
4885 if (!old.user_alloc && old.rmap) {
4886 int ret;
4887
4888 down_write(&current->mm->mmap_sem);
4889 ret = do_munmap(current->mm, old.userspace_addr,
4890 old.npages * PAGE_SIZE);
4891 up_write(&current->mm->mmap_sem);
4892 if (ret < 0)
4893 printk(KERN_WARNING
4894 "kvm_vm_ioctl_set_memory_region: "
4895 "failed to munmap memory\n");
4896 }
4897 } 5738 }
4898 } 5739 }
4899 5740
5741
5742 return 0;
5743}
5744
5745void kvm_arch_commit_memory_region(struct kvm *kvm,
5746 struct kvm_userspace_memory_region *mem,
5747 struct kvm_memory_slot old,
5748 int user_alloc)
5749{
5750
5751 int npages = mem->memory_size >> PAGE_SHIFT;
5752
5753 if (!user_alloc && !old.user_alloc && old.rmap && !npages) {
5754 int ret;
5755
5756 down_write(&current->mm->mmap_sem);
5757 ret = do_munmap(current->mm, old.userspace_addr,
5758 old.npages * PAGE_SIZE);
5759 up_write(&current->mm->mmap_sem);
5760 if (ret < 0)
5761 printk(KERN_WARNING
5762 "kvm_vm_ioctl_set_memory_region: "
5763 "failed to munmap memory\n");
5764 }
5765
4900 spin_lock(&kvm->mmu_lock); 5766 spin_lock(&kvm->mmu_lock);
4901 if (!kvm->arch.n_requested_mmu_pages) { 5767 if (!kvm->arch.n_requested_mmu_pages) {
4902 unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm); 5768 unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
@@ -4905,8 +5771,6 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
4905 5771
4906 kvm_mmu_slot_remove_write_access(kvm, mem->slot); 5772 kvm_mmu_slot_remove_write_access(kvm, mem->slot);
4907 spin_unlock(&kvm->mmu_lock); 5773 spin_unlock(&kvm->mmu_lock);
4908
4909 return 0;
4910} 5774}
4911 5775
4912void kvm_arch_flush_shadow(struct kvm *kvm) 5776void kvm_arch_flush_shadow(struct kvm *kvm)
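The single kvm_arch_set_memory_region() hook is split into a fallible prepare step and an infallible commit step. A simplified sketch of the call order the generic code is expected to follow (not the actual kvm_main.c; signatures as introduced above, error handling trimmed):

	static int example_set_memory_region(struct kvm *kvm,
					     struct kvm_userspace_memory_region *mem,
					     struct kvm_memory_slot *new,
					     struct kvm_memory_slot old,
					     int user_alloc)
	{
		int r;

		/* Phase 1: may fail; runs before the new slot is published. */
		r = kvm_arch_prepare_memory_region(kvm, new, old, mem, user_alloc);
		if (r)
			return r;

		/* ... publish the updated memslots (readers hold kvm->srcu) ... */

		/* Phase 2: must not fail; runs after readers can see the slot. */
		kvm_arch_commit_memory_region(kvm, mem, old, user_alloc);
		return 0;
	}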
@@ -4946,8 +5810,36 @@ int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
4946 return kvm_x86_ops->interrupt_allowed(vcpu); 5810 return kvm_x86_ops->interrupt_allowed(vcpu);
4947} 5811}
4948 5812
5813unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
5814{
5815 unsigned long rflags;
5816
5817 rflags = kvm_x86_ops->get_rflags(vcpu);
5818 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
5819 rflags &= ~(unsigned long)(X86_EFLAGS_TF | X86_EFLAGS_RF);
5820 return rflags;
5821}
5822EXPORT_SYMBOL_GPL(kvm_get_rflags);
5823
5824void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
5825{
5826 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
5827 vcpu->arch.singlestep_cs ==
5828 get_segment_selector(vcpu, VCPU_SREG_CS) &&
5829 vcpu->arch.singlestep_rip == kvm_rip_read(vcpu))
5830 rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
5831 kvm_x86_ops->set_rflags(vcpu, rflags);
5832}
5833EXPORT_SYMBOL_GPL(kvm_set_rflags);
5834
4949EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); 5835EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
4950EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); 5836EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
4951EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault); 5837EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
4952EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr); 5838EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr);
4953EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr); 5839EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr);
5840EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmrun);
5841EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit);
5842EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject);
5843EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
5844EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);
5845EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
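Every direct kvm_x86_ops->get_rflags()/set_rflags() call in this file now goes through the wrappers above, so callers never observe (or clobber) the TF/RF bits injected for guest single-stepping. A minimal usage sketch (illustrative helper; it mirrors the TASK_SWITCH_IRET handling earlier in the patch):

	static void example_clear_nt(struct kvm_vcpu *vcpu)
	{
		/* TF/RF are filtered out here if they were injected... */
		unsigned long rflags = kvm_get_rflags(vcpu);

		/* ...and re-applied here if single-stepping is still active. */
		kvm_set_rflags(vcpu, rflags & ~X86_EFLAGS_NT);
	}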
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 5eadea585d2a..2d101639bd8d 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -2,6 +2,7 @@
2#define ARCH_X86_KVM_X86_H 2#define ARCH_X86_KVM_X86_H
3 3
4#include <linux/kvm_host.h> 4#include <linux/kvm_host.h>
5#include "kvm_cache_regs.h"
5 6
6static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu) 7static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
7{ 8{
@@ -35,4 +36,33 @@ static inline bool kvm_exception_is_soft(unsigned int nr)
35struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, 36struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
36 u32 function, u32 index); 37 u32 function, u32 index);
37 38
39static inline bool is_protmode(struct kvm_vcpu *vcpu)
40{
41 return kvm_read_cr0_bits(vcpu, X86_CR0_PE);
42}
43
44static inline int is_long_mode(struct kvm_vcpu *vcpu)
45{
46#ifdef CONFIG_X86_64
47 return vcpu->arch.efer & EFER_LMA;
48#else
49 return 0;
50#endif
51}
52
53static inline int is_pae(struct kvm_vcpu *vcpu)
54{
55 return kvm_read_cr4_bits(vcpu, X86_CR4_PAE);
56}
57
58static inline int is_pse(struct kvm_vcpu *vcpu)
59{
60 return kvm_read_cr4_bits(vcpu, X86_CR4_PSE);
61}
62
63static inline int is_paging(struct kvm_vcpu *vcpu)
64{
65 return kvm_read_cr0_bits(vcpu, X86_CR0_PG);
66}
67
38#endif 68#endif
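These helpers read CR0/CR4 through the register cache (kvm_read_cr0_bits()/kvm_read_cr4_bits()) instead of poking vcpu->arch fields directly. A hedged example of how they compose (the helper is illustrative, not part of the patch; the PAE/long-mode combination matches the load_pdptrs() gate earlier in the diff):

	static inline bool example_guest_uses_pae_paging(struct kvm_vcpu *vcpu)
	{
		/* PAE page tables are in use when paging is enabled, CR4.PAE
		 * is set and the guest is not in long mode. */
		return is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu);
	}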