author     Ingo Molnar <mingo@elte.hu>  2010-10-30 04:43:08 -0400
committer  Ingo Molnar <mingo@elte.hu>  2010-10-30 04:43:08 -0400
commit     169ed55bd30305b933f52bfab32a58671d44ab68 (patch)
tree       32e280957474f458901abfce16fa2a1687ef7497 /arch/x86/kvm
parent     3d7851b3cdd43a734e5cc4c643fd886ab28ad4d5 (diff)
parent     45f81b1c96d9793e47ce925d257ea693ce0b193e (diff)
Merge branch 'tip/perf/jump-label-2' of git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux-2.6-trace into perf/urgent
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r--  arch/x86/kvm/Kconfig           |    7
-rw-r--r--  arch/x86/kvm/emulate.c         | 2262
-rw-r--r--  arch/x86/kvm/i8254.c           |   11
-rw-r--r--  arch/x86/kvm/i8259.c           |   25
-rw-r--r--  arch/x86/kvm/irq.c             |    9
-rw-r--r--  arch/x86/kvm/irq.h             |    2
-rw-r--r--  arch/x86/kvm/kvm_cache_regs.h  |    9
-rw-r--r--  arch/x86/kvm/lapic.c           |   15
-rw-r--r--  arch/x86/kvm/mmu.c             |  918
-rw-r--r--  arch/x86/kvm/mmu.h             |    9
-rw-r--r--  arch/x86/kvm/mmu_audit.c       |  299
-rw-r--r--  arch/x86/kvm/mmutrace.h        |   19
-rw-r--r--  arch/x86/kvm/paging_tmpl.h     |  202
-rw-r--r--  arch/x86/kvm/svm.c             |  283
-rw-r--r--  arch/x86/kvm/timer.c           |    2
-rw-r--r--  arch/x86/kvm/vmx.c             |  219
-rw-r--r--  arch/x86/kvm/x86.c             |  780
-rw-r--r--  arch/x86/kvm/x86.h             |    8
18 files changed, 3296 insertions(+), 1783 deletions(-)
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 970bbd479516..ddc131ff438f 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -64,6 +64,13 @@ config KVM_AMD
64 To compile this as a module, choose M here: the module 64 To compile this as a module, choose M here: the module
65 will be called kvm-amd. 65 will be called kvm-amd.
66 66
67config KVM_MMU_AUDIT
68 bool "Audit KVM MMU"
69 depends on KVM && TRACEPOINTS
70 ---help---
71 This option adds a R/W KVM module parameter 'mmu_audit', which allows
72 auditing of the KVM MMU at runtime.
73
67# OK, it's a little counter-intuitive to do this, but it puts it neatly under 74# OK, it's a little counter-intuitive to do this, but it puts it neatly under
68# the virtualization menu. 75# the virtualization menu.
69source drivers/vhost/Kconfig 76source drivers/vhost/Kconfig
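
The help text above describes a read/write module parameter that toggles MMU auditing at runtime. As a minimal sketch only (assumed wiring, not the actual mmu.c/mmu_audit.c implementation), a writable bool parameter of this kind is typically declared like so and then shows up under /sys/module/<module>/parameters/:

  #include <linux/module.h>
  #include <linux/moduleparam.h>

  /* Sketch: the real KVM code hooks the write side up to enabling or
   * disabling the MMU audit checks; this only shows the parameter shape. */
  static bool mmu_audit;
  module_param(mmu_audit, bool, 0644);
  MODULE_PARM_DESC(mmu_audit, "audit KVM MMU at runtime");
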
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 66ca98aafdd6..38b6e8dafaff 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -9,7 +9,7 @@
9 * privileged instructions: 9 * privileged instructions:
10 * 10 *
11 * Copyright (C) 2006 Qumranet 11 * Copyright (C) 2006 Qumranet
12 * Copyright 2010 Red Hat, Inc. and/or its affilates. 12 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
13 * 13 *
14 * Avi Kivity <avi@qumranet.com> 14 * Avi Kivity <avi@qumranet.com>
15 * Yaniv Kamay <yaniv@qumranet.com> 15 * Yaniv Kamay <yaniv@qumranet.com>
@@ -51,13 +51,13 @@
51#define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */ 51#define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */
52#define DstReg (2<<1) /* Register operand. */ 52#define DstReg (2<<1) /* Register operand. */
53#define DstMem (3<<1) /* Memory operand. */ 53#define DstMem (3<<1) /* Memory operand. */
54#define DstAcc (4<<1) /* Destination Accumulator */ 54#define DstAcc (4<<1) /* Destination Accumulator */
55#define DstDI (5<<1) /* Destination is in ES:(E)DI */ 55#define DstDI (5<<1) /* Destination is in ES:(E)DI */
56#define DstMem64 (6<<1) /* 64bit memory operand */ 56#define DstMem64 (6<<1) /* 64bit memory operand */
57#define DstImmUByte (7<<1) /* 8-bit unsigned immediate operand */
57#define DstMask (7<<1) 58#define DstMask (7<<1)
58/* Source operand type. */ 59/* Source operand type. */
59#define SrcNone (0<<4) /* No source operand. */ 60#define SrcNone (0<<4) /* No source operand. */
60#define SrcImplicit (0<<4) /* Source operand is implicit in the opcode. */
61#define SrcReg (1<<4) /* Register operand. */ 61#define SrcReg (1<<4) /* Register operand. */
62#define SrcMem (2<<4) /* Memory operand. */ 62#define SrcMem (2<<4) /* Memory operand. */
63#define SrcMem16 (3<<4) /* Memory operand (16-bit). */ 63#define SrcMem16 (3<<4) /* Memory operand (16-bit). */
@@ -71,6 +71,7 @@
71#define SrcImmFAddr (0xb<<4) /* Source is immediate far address */ 71#define SrcImmFAddr (0xb<<4) /* Source is immediate far address */
72#define SrcMemFAddr (0xc<<4) /* Source is far address in memory */ 72#define SrcMemFAddr (0xc<<4) /* Source is far address in memory */
73#define SrcAcc (0xd<<4) /* Source Accumulator */ 73#define SrcAcc (0xd<<4) /* Source Accumulator */
74#define SrcImmU16 (0xe<<4) /* Immediate operand, unsigned, 16 bits */
74#define SrcMask (0xf<<4) 75#define SrcMask (0xf<<4)
75/* Generic ModRM decode. */ 76/* Generic ModRM decode. */
76#define ModRM (1<<8) 77#define ModRM (1<<8)
@@ -82,8 +83,10 @@
82#define Stack (1<<13) /* Stack instruction (push/pop) */ 83#define Stack (1<<13) /* Stack instruction (push/pop) */
83#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ 84#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */
84#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ 85#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */
85#define GroupMask 0xff /* Group number stored in bits 0:7 */
86/* Misc flags */ 86/* Misc flags */
87#define NoAccess (1<<23) /* Don't access memory (lea/invlpg/verr etc) */
88#define Op3264 (1<<24) /* Operand is 64b in long mode, 32b otherwise */
89#define Undefined (1<<25) /* No Such Instruction */
87#define Lock (1<<26) /* lock prefix is allowed for the instruction */ 90#define Lock (1<<26) /* lock prefix is allowed for the instruction */
88#define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */ 91#define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */
89#define No64 (1<<28) 92#define No64 (1<<28)
@@ -92,285 +95,30 @@
92#define Src2CL (1<<29) 95#define Src2CL (1<<29)
93#define Src2ImmByte (2<<29) 96#define Src2ImmByte (2<<29)
94#define Src2One (3<<29) 97#define Src2One (3<<29)
98#define Src2Imm (4<<29)
95#define Src2Mask (7<<29) 99#define Src2Mask (7<<29)
96 100
97enum { 101#define X2(x...) x, x
98 Group1_80, Group1_81, Group1_82, Group1_83, 102#define X3(x...) X2(x), x
99 Group1A, Group3_Byte, Group3, Group4, Group5, Group7, 103#define X4(x...) X2(x), X2(x)
100 Group8, Group9, 104#define X5(x...) X4(x), x
105#define X6(x...) X4(x), X2(x)
106#define X7(x...) X4(x), X3(x)
107#define X8(x...) X4(x), X4(x)
108#define X16(x...) X8(x), X8(x)
109
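
The X2..X16 repetition macros introduced just above let a row of identical table entries (for example the sixteen Jcc opcodes) be written as a single line. A tiny standalone expansion check, not emulator code:

  #define X2(x...) x, x
  #define X4(x...) X2(x), X2(x)

  static const int four_zeros[] = { X4(0) };   /* expands to { 0, 0, 0, 0 } */
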
110struct opcode {
111 u32 flags;
112 union {
113 int (*execute)(struct x86_emulate_ctxt *ctxt);
114 struct opcode *group;
115 struct group_dual *gdual;
116 } u;
101}; 117};
102 118
103static u32 opcode_table[256] = { 119struct group_dual {
104 /* 0x00 - 0x07 */ 120 struct opcode mod012[8];
105 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, 121 struct opcode mod3[8];
106 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
107 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
108 ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
109 /* 0x08 - 0x0F */
110 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
111 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
112 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
113 ImplicitOps | Stack | No64, 0,
114 /* 0x10 - 0x17 */
115 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
116 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
117 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
118 ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
119 /* 0x18 - 0x1F */
120 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
121 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
122 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
123 ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
124 /* 0x20 - 0x27 */
125 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
126 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
127 ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
128 /* 0x28 - 0x2F */
129 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
130 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
131 ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
132 /* 0x30 - 0x37 */
133 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
134 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
135 ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
136 /* 0x38 - 0x3F */
137 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
138 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
139 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
140 0, 0,
141 /* 0x40 - 0x47 */
142 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
143 /* 0x48 - 0x4F */
144 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
145 /* 0x50 - 0x57 */
146 SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack,
147 SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack,
148 /* 0x58 - 0x5F */
149 DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
150 DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
151 /* 0x60 - 0x67 */
152 ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
153 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ ,
154 0, 0, 0, 0,
155 /* 0x68 - 0x6F */
156 SrcImm | Mov | Stack, 0, SrcImmByte | Mov | Stack, 0,
157 DstDI | ByteOp | Mov | String, DstDI | Mov | String, /* insb, insw/insd */
158 SrcSI | ByteOp | ImplicitOps | String, SrcSI | ImplicitOps | String, /* outsb, outsw/outsd */
159 /* 0x70 - 0x77 */
160 SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
161 SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
162 /* 0x78 - 0x7F */
163 SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
164 SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
165 /* 0x80 - 0x87 */
166 Group | Group1_80, Group | Group1_81,
167 Group | Group1_82, Group | Group1_83,
168 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
169 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
170 /* 0x88 - 0x8F */
171 ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
172 ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
173 DstMem | SrcNone | ModRM | Mov, ModRM | DstReg,
174 ImplicitOps | SrcMem16 | ModRM, Group | Group1A,
175 /* 0x90 - 0x97 */
176 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
177 /* 0x98 - 0x9F */
178 0, 0, SrcImmFAddr | No64, 0,
179 ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
180 /* 0xA0 - 0xA7 */
181 ByteOp | DstAcc | SrcMem | Mov | MemAbs, DstAcc | SrcMem | Mov | MemAbs,
182 ByteOp | DstMem | SrcAcc | Mov | MemAbs, DstMem | SrcAcc | Mov | MemAbs,
183 ByteOp | SrcSI | DstDI | Mov | String, SrcSI | DstDI | Mov | String,
184 ByteOp | SrcSI | DstDI | String, SrcSI | DstDI | String,
185 /* 0xA8 - 0xAF */
186 DstAcc | SrcImmByte | ByteOp, DstAcc | SrcImm, ByteOp | DstDI | Mov | String, DstDI | Mov | String,
187 ByteOp | SrcSI | DstAcc | Mov | String, SrcSI | DstAcc | Mov | String,
188 ByteOp | DstDI | String, DstDI | String,
189 /* 0xB0 - 0xB7 */
190 ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
191 ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
192 ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
193 ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
194 /* 0xB8 - 0xBF */
195 DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
196 DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
197 DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
198 DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
199 /* 0xC0 - 0xC7 */
200 ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
201 0, ImplicitOps | Stack, 0, 0,
202 ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov,
203 /* 0xC8 - 0xCF */
204 0, 0, 0, ImplicitOps | Stack,
205 ImplicitOps, SrcImmByte, ImplicitOps | No64, ImplicitOps,
206 /* 0xD0 - 0xD7 */
207 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
208 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
209 0, 0, 0, 0,
210 /* 0xD8 - 0xDF */
211 0, 0, 0, 0, 0, 0, 0, 0,
212 /* 0xE0 - 0xE7 */
213 0, 0, 0, 0,
214 ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc,
215 ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc,
216 /* 0xE8 - 0xEF */
217 SrcImm | Stack, SrcImm | ImplicitOps,
218 SrcImmFAddr | No64, SrcImmByte | ImplicitOps,
219 SrcNone | ByteOp | DstAcc, SrcNone | DstAcc,
220 SrcNone | ByteOp | DstAcc, SrcNone | DstAcc,
221 /* 0xF0 - 0xF7 */
222 0, 0, 0, 0,
223 ImplicitOps | Priv, ImplicitOps, Group | Group3_Byte, Group | Group3,
224 /* 0xF8 - 0xFF */
225 ImplicitOps, 0, ImplicitOps, ImplicitOps,
226 ImplicitOps, ImplicitOps, Group | Group4, Group | Group5,
227};
228
229static u32 twobyte_table[256] = {
230 /* 0x00 - 0x0F */
231 0, Group | GroupDual | Group7, 0, 0,
232 0, ImplicitOps, ImplicitOps | Priv, 0,
233 ImplicitOps | Priv, ImplicitOps | Priv, 0, 0,
234 0, ImplicitOps | ModRM, 0, 0,
235 /* 0x10 - 0x1F */
236 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0,
237 /* 0x20 - 0x2F */
238 ModRM | ImplicitOps | Priv, ModRM | Priv,
239 ModRM | ImplicitOps | Priv, ModRM | Priv,
240 0, 0, 0, 0,
241 0, 0, 0, 0, 0, 0, 0, 0,
242 /* 0x30 - 0x3F */
243 ImplicitOps | Priv, 0, ImplicitOps | Priv, 0,
244 ImplicitOps, ImplicitOps | Priv, 0, 0,
245 0, 0, 0, 0, 0, 0, 0, 0,
246 /* 0x40 - 0x47 */
247 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
248 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
249 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
250 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
251 /* 0x48 - 0x4F */
252 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
253 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
254 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
255 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
256 /* 0x50 - 0x5F */
257 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
258 /* 0x60 - 0x6F */
259 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
260 /* 0x70 - 0x7F */
261 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
262 /* 0x80 - 0x8F */
263 SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm,
264 SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm,
265 /* 0x90 - 0x9F */
266 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
267 /* 0xA0 - 0xA7 */
268 ImplicitOps | Stack, ImplicitOps | Stack,
269 0, DstMem | SrcReg | ModRM | BitOp,
270 DstMem | SrcReg | Src2ImmByte | ModRM,
271 DstMem | SrcReg | Src2CL | ModRM, 0, 0,
272 /* 0xA8 - 0xAF */
273 ImplicitOps | Stack, ImplicitOps | Stack,
274 0, DstMem | SrcReg | ModRM | BitOp | Lock,
275 DstMem | SrcReg | Src2ImmByte | ModRM,
276 DstMem | SrcReg | Src2CL | ModRM,
277 ModRM, 0,
278 /* 0xB0 - 0xB7 */
279 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
280 0, DstMem | SrcReg | ModRM | BitOp | Lock,
281 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
282 DstReg | SrcMem16 | ModRM | Mov,
283 /* 0xB8 - 0xBF */
284 0, 0,
285 Group | Group8, DstMem | SrcReg | ModRM | BitOp | Lock,
286 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
287 DstReg | SrcMem16 | ModRM | Mov,
288 /* 0xC0 - 0xCF */
289 0, 0, 0, DstMem | SrcReg | ModRM | Mov,
290 0, 0, 0, Group | GroupDual | Group9,
291 0, 0, 0, 0, 0, 0, 0, 0,
292 /* 0xD0 - 0xDF */
293 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
294 /* 0xE0 - 0xEF */
295 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
296 /* 0xF0 - 0xFF */
297 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
298};
299
300static u32 group_table[] = {
301 [Group1_80*8] =
302 ByteOp | DstMem | SrcImm | ModRM | Lock,
303 ByteOp | DstMem | SrcImm | ModRM | Lock,
304 ByteOp | DstMem | SrcImm | ModRM | Lock,
305 ByteOp | DstMem | SrcImm | ModRM | Lock,
306 ByteOp | DstMem | SrcImm | ModRM | Lock,
307 ByteOp | DstMem | SrcImm | ModRM | Lock,
308 ByteOp | DstMem | SrcImm | ModRM | Lock,
309 ByteOp | DstMem | SrcImm | ModRM,
310 [Group1_81*8] =
311 DstMem | SrcImm | ModRM | Lock,
312 DstMem | SrcImm | ModRM | Lock,
313 DstMem | SrcImm | ModRM | Lock,
314 DstMem | SrcImm | ModRM | Lock,
315 DstMem | SrcImm | ModRM | Lock,
316 DstMem | SrcImm | ModRM | Lock,
317 DstMem | SrcImm | ModRM | Lock,
318 DstMem | SrcImm | ModRM,
319 [Group1_82*8] =
320 ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
321 ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
322 ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
323 ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
324 ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
325 ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
326 ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
327 ByteOp | DstMem | SrcImm | ModRM | No64,
328 [Group1_83*8] =
329 DstMem | SrcImmByte | ModRM | Lock,
330 DstMem | SrcImmByte | ModRM | Lock,
331 DstMem | SrcImmByte | ModRM | Lock,
332 DstMem | SrcImmByte | ModRM | Lock,
333 DstMem | SrcImmByte | ModRM | Lock,
334 DstMem | SrcImmByte | ModRM | Lock,
335 DstMem | SrcImmByte | ModRM | Lock,
336 DstMem | SrcImmByte | ModRM,
337 [Group1A*8] =
338 DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0,
339 [Group3_Byte*8] =
340 ByteOp | SrcImm | DstMem | ModRM, ByteOp | SrcImm | DstMem | ModRM,
341 ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
342 0, 0, 0, 0,
343 [Group3*8] =
344 DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
345 DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
346 0, 0, 0, 0,
347 [Group4*8] =
348 ByteOp | DstMem | SrcNone | ModRM | Lock, ByteOp | DstMem | SrcNone | ModRM | Lock,
349 0, 0, 0, 0, 0, 0,
350 [Group5*8] =
351 DstMem | SrcNone | ModRM | Lock, DstMem | SrcNone | ModRM | Lock,
352 SrcMem | ModRM | Stack, 0,
353 SrcMem | ModRM | Stack, SrcMemFAddr | ModRM | ImplicitOps,
354 SrcMem | ModRM | Stack, 0,
355 [Group7*8] =
356 0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv,
357 SrcNone | ModRM | DstMem | Mov, 0,
358 SrcMem16 | ModRM | Mov | Priv, SrcMem | ModRM | ByteOp | Priv,
359 [Group8*8] =
360 0, 0, 0, 0,
361 DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM | Lock,
362 DstMem | SrcImmByte | ModRM | Lock, DstMem | SrcImmByte | ModRM | Lock,
363 [Group9*8] =
364 0, DstMem64 | ModRM | Lock, 0, 0, 0, 0, 0, 0,
365};
366
367static u32 group2_table[] = {
368 [Group7*8] =
369 SrcNone | ModRM | Priv, 0, 0, SrcNone | ModRM | Priv,
370 SrcNone | ModRM | DstMem | Mov, 0,
371 SrcMem16 | ModRM | Mov | Priv, 0,
372 [Group9*8] =
373 0, 0, 0, 0, 0, 0, 0, 0,
374}; 122};
375 123
376/* EFLAGS bit definitions. */ 124/* EFLAGS bit definitions. */
@@ -392,6 +140,9 @@ static u32 group2_table[] = {
392#define EFLG_PF (1<<2) 140#define EFLG_PF (1<<2)
393#define EFLG_CF (1<<0) 141#define EFLG_CF (1<<0)
394 142
143#define EFLG_RESERVED_ZEROS_MASK 0xffc0802a
144#define EFLG_RESERVED_ONE_MASK 2
145
395/* 146/*
396 * Instruction emulation: 147 * Instruction emulation:
397 * Most instructions are emulated directly via a fragment of inline assembly 148 * Most instructions are emulated directly via a fragment of inline assembly
@@ -444,13 +195,13 @@ static u32 group2_table[] = {
444#define ON64(x) 195#define ON64(x)
445#endif 196#endif
446 197
447#define ____emulate_2op(_op, _src, _dst, _eflags, _x, _y, _suffix) \ 198#define ____emulate_2op(_op, _src, _dst, _eflags, _x, _y, _suffix, _dsttype) \
448 do { \ 199 do { \
449 __asm__ __volatile__ ( \ 200 __asm__ __volatile__ ( \
450 _PRE_EFLAGS("0", "4", "2") \ 201 _PRE_EFLAGS("0", "4", "2") \
451 _op _suffix " %"_x"3,%1; " \ 202 _op _suffix " %"_x"3,%1; " \
452 _POST_EFLAGS("0", "4", "2") \ 203 _POST_EFLAGS("0", "4", "2") \
453 : "=m" (_eflags), "=m" ((_dst).val), \ 204 : "=m" (_eflags), "+q" (*(_dsttype*)&(_dst).val),\
454 "=&r" (_tmp) \ 205 "=&r" (_tmp) \
455 : _y ((_src).val), "i" (EFLAGS_MASK)); \ 206 : _y ((_src).val), "i" (EFLAGS_MASK)); \
456 } while (0) 207 } while (0)
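
The constraint change in this hunk ("=m" on (_dst).val becomes "+q" on a width-cast lvalue) turns the destination into a typed read-modify-write register operand. A minimal standalone sketch of the same idiom, not the emulator macro itself:

  /* Byte-wide read-modify-write operand, mirroring "+q" (*(u8 *)&(_dst).val). */
  static inline void add8(unsigned char *dst, unsigned char src)
  {
          asm("addb %1, %0"
              : "+q" (*dst)     /* read and written, byte register */
              : "iq" (src)
              : "cc");
  }
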
@@ -463,13 +214,13 @@ static u32 group2_table[] = {
463 \ 214 \
464 switch ((_dst).bytes) { \ 215 switch ((_dst).bytes) { \
465 case 2: \ 216 case 2: \
466 ____emulate_2op(_op,_src,_dst,_eflags,_wx,_wy,"w"); \ 217 ____emulate_2op(_op,_src,_dst,_eflags,_wx,_wy,"w",u16);\
467 break; \ 218 break; \
468 case 4: \ 219 case 4: \
469 ____emulate_2op(_op,_src,_dst,_eflags,_lx,_ly,"l"); \ 220 ____emulate_2op(_op,_src,_dst,_eflags,_lx,_ly,"l",u32);\
470 break; \ 221 break; \
471 case 8: \ 222 case 8: \
472 ON64(____emulate_2op(_op,_src,_dst,_eflags,_qx,_qy,"q")); \ 223 ON64(____emulate_2op(_op,_src,_dst,_eflags,_qx,_qy,"q",u64)); \
473 break; \ 224 break; \
474 } \ 225 } \
475 } while (0) 226 } while (0)
@@ -479,7 +230,7 @@ static u32 group2_table[] = {
479 unsigned long _tmp; \ 230 unsigned long _tmp; \
480 switch ((_dst).bytes) { \ 231 switch ((_dst).bytes) { \
481 case 1: \ 232 case 1: \
482 ____emulate_2op(_op,_src,_dst,_eflags,_bx,_by,"b"); \ 233 ____emulate_2op(_op,_src,_dst,_eflags,_bx,_by,"b",u8); \
483 break; \ 234 break; \
484 default: \ 235 default: \
485 __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ 236 __emulate_2op_nobyte(_op, _src, _dst, _eflags, \
@@ -566,6 +317,74 @@ static u32 group2_table[] = {
566 } \ 317 } \
567 } while (0) 318 } while (0)
568 319
320#define __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, _suffix) \
321 do { \
322 unsigned long _tmp; \
323 \
324 __asm__ __volatile__ ( \
325 _PRE_EFLAGS("0", "4", "1") \
326 _op _suffix " %5; " \
327 _POST_EFLAGS("0", "4", "1") \
328 : "=m" (_eflags), "=&r" (_tmp), \
329 "+a" (_rax), "+d" (_rdx) \
330 : "i" (EFLAGS_MASK), "m" ((_src).val), \
331 "a" (_rax), "d" (_rdx)); \
332 } while (0)
333
334#define __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, _eflags, _suffix, _ex) \
335 do { \
336 unsigned long _tmp; \
337 \
338 __asm__ __volatile__ ( \
339 _PRE_EFLAGS("0", "5", "1") \
340 "1: \n\t" \
341 _op _suffix " %6; " \
342 "2: \n\t" \
343 _POST_EFLAGS("0", "5", "1") \
344 ".pushsection .fixup,\"ax\" \n\t" \
345 "3: movb $1, %4 \n\t" \
346 "jmp 2b \n\t" \
347 ".popsection \n\t" \
348 _ASM_EXTABLE(1b, 3b) \
349 : "=m" (_eflags), "=&r" (_tmp), \
350 "+a" (_rax), "+d" (_rdx), "+qm"(_ex) \
351 : "i" (EFLAGS_MASK), "m" ((_src).val), \
352 "a" (_rax), "d" (_rdx)); \
353 } while (0)
354
355/* instruction has only one source operand, destination is implicit (e.g. mul, div, imul, idiv) */
356#define emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags) \
357 do { \
358 switch((_src).bytes) { \
359 case 1: __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "b"); break; \
360 case 2: __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "w"); break; \
361 case 4: __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "l"); break; \
362 case 8: ON64(__emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "q")); break; \
363 } \
364 } while (0)
365
366#define emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, _eflags, _ex) \
367 do { \
368 switch((_src).bytes) { \
369 case 1: \
370 __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \
371 _eflags, "b", _ex); \
372 break; \
373 case 2: \
374 __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \
375 _eflags, "w", _ex); \
376 break; \
377 case 4: \
378 __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \
379 _eflags, "l", _ex); \
380 break; \
381 case 8: ON64( \
382 __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \
383 _eflags, "q", _ex)); \
384 break; \
385 } \
386 } while (0)
387
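
For reference, what the new rax/rdx helpers have to model for a 32-bit DIV: the dividend is EDX:EAX, the quotient lands in EAX, the remainder in EDX, and divide-by-zero or quotient overflow raises #DE, which the _ex variant reports through its _ex flag. A standalone sketch of that behaviour, not emulator code:

  #include <stdbool.h>
  #include <stdint.h>

  static bool div32(uint32_t *eax, uint32_t *edx, uint32_t src)
  {
          uint64_t dividend = ((uint64_t)*edx << 32) | *eax;

          if (src == 0 || dividend / src > 0xffffffffULL)
                  return false;                   /* hardware would raise #DE */
          *eax = (uint32_t)(dividend / src);      /* quotient  */
          *edx = (uint32_t)(dividend % src);      /* remainder */
          return true;
  }
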
569/* Fetch next part of the instruction being emulated. */ 388/* Fetch next part of the instruction being emulated. */
570#define insn_fetch(_type, _size, _eip) \ 389#define insn_fetch(_type, _size, _eip) \
571({ unsigned long _x; \ 390({ unsigned long _x; \
@@ -661,7 +480,6 @@ static void emulate_exception(struct x86_emulate_ctxt *ctxt, int vec,
661 ctxt->exception = vec; 480 ctxt->exception = vec;
662 ctxt->error_code = error; 481 ctxt->error_code = error;
663 ctxt->error_code_valid = valid; 482 ctxt->error_code_valid = valid;
664 ctxt->restart = false;
665} 483}
666 484
667static void emulate_gp(struct x86_emulate_ctxt *ctxt, int err) 485static void emulate_gp(struct x86_emulate_ctxt *ctxt, int err)
@@ -669,11 +487,9 @@ static void emulate_gp(struct x86_emulate_ctxt *ctxt, int err)
669 emulate_exception(ctxt, GP_VECTOR, err, true); 487 emulate_exception(ctxt, GP_VECTOR, err, true);
670} 488}
671 489
672static void emulate_pf(struct x86_emulate_ctxt *ctxt, unsigned long addr, 490static void emulate_pf(struct x86_emulate_ctxt *ctxt)
673 int err)
674{ 491{
675 ctxt->cr2 = addr; 492 emulate_exception(ctxt, PF_VECTOR, 0, true);
676 emulate_exception(ctxt, PF_VECTOR, err, true);
677} 493}
678 494
679static void emulate_ud(struct x86_emulate_ctxt *ctxt) 495static void emulate_ud(struct x86_emulate_ctxt *ctxt)
@@ -686,6 +502,12 @@ static void emulate_ts(struct x86_emulate_ctxt *ctxt, int err)
686 emulate_exception(ctxt, TS_VECTOR, err, true); 502 emulate_exception(ctxt, TS_VECTOR, err, true);
687} 503}
688 504
505static int emulate_de(struct x86_emulate_ctxt *ctxt)
506{
507 emulate_exception(ctxt, DE_VECTOR, 0, false);
508 return X86EMUL_PROPAGATE_FAULT;
509}
510
689static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, 511static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
690 struct x86_emulate_ops *ops, 512 struct x86_emulate_ops *ops,
691 unsigned long eip, u8 *dest) 513 unsigned long eip, u8 *dest)
@@ -742,7 +564,7 @@ static void *decode_register(u8 modrm_reg, unsigned long *regs,
742 564
743static int read_descriptor(struct x86_emulate_ctxt *ctxt, 565static int read_descriptor(struct x86_emulate_ctxt *ctxt,
744 struct x86_emulate_ops *ops, 566 struct x86_emulate_ops *ops,
745 void *ptr, 567 ulong addr,
746 u16 *size, unsigned long *address, int op_bytes) 568 u16 *size, unsigned long *address, int op_bytes)
747{ 569{
748 int rc; 570 int rc;
@@ -750,12 +572,10 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt,
750 if (op_bytes == 2) 572 if (op_bytes == 2)
751 op_bytes = 3; 573 op_bytes = 3;
752 *address = 0; 574 *address = 0;
753 rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2, 575 rc = ops->read_std(addr, (unsigned long *)size, 2, ctxt->vcpu, NULL);
754 ctxt->vcpu, NULL);
755 if (rc != X86EMUL_CONTINUE) 576 if (rc != X86EMUL_CONTINUE)
756 return rc; 577 return rc;
757 rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes, 578 rc = ops->read_std(addr + 2, address, op_bytes, ctxt->vcpu, NULL);
758 ctxt->vcpu, NULL);
759 return rc; 579 return rc;
760} 580}
761 581
@@ -794,6 +614,24 @@ static int test_cc(unsigned int condition, unsigned int flags)
794 return (!!rc ^ (condition & 1)); 614 return (!!rc ^ (condition & 1));
795} 615}
796 616
617static void fetch_register_operand(struct operand *op)
618{
619 switch (op->bytes) {
620 case 1:
621 op->val = *(u8 *)op->addr.reg;
622 break;
623 case 2:
624 op->val = *(u16 *)op->addr.reg;
625 break;
626 case 4:
627 op->val = *(u32 *)op->addr.reg;
628 break;
629 case 8:
630 op->val = *(u64 *)op->addr.reg;
631 break;
632 }
633}
634
797static void decode_register_operand(struct operand *op, 635static void decode_register_operand(struct operand *op,
798 struct decode_cache *c, 636 struct decode_cache *c,
799 int inhibit_bytereg) 637 int inhibit_bytereg)
@@ -805,34 +643,25 @@ static void decode_register_operand(struct operand *op,
805 reg = (c->b & 7) | ((c->rex_prefix & 1) << 3); 643 reg = (c->b & 7) | ((c->rex_prefix & 1) << 3);
806 op->type = OP_REG; 644 op->type = OP_REG;
807 if ((c->d & ByteOp) && !inhibit_bytereg) { 645 if ((c->d & ByteOp) && !inhibit_bytereg) {
808 op->ptr = decode_register(reg, c->regs, highbyte_regs); 646 op->addr.reg = decode_register(reg, c->regs, highbyte_regs);
809 op->val = *(u8 *)op->ptr;
810 op->bytes = 1; 647 op->bytes = 1;
811 } else { 648 } else {
812 op->ptr = decode_register(reg, c->regs, 0); 649 op->addr.reg = decode_register(reg, c->regs, 0);
813 op->bytes = c->op_bytes; 650 op->bytes = c->op_bytes;
814 switch (op->bytes) {
815 case 2:
816 op->val = *(u16 *)op->ptr;
817 break;
818 case 4:
819 op->val = *(u32 *)op->ptr;
820 break;
821 case 8:
822 op->val = *(u64 *) op->ptr;
823 break;
824 }
825 } 651 }
652 fetch_register_operand(op);
826 op->orig_val = op->val; 653 op->orig_val = op->val;
827} 654}
828 655
829static int decode_modrm(struct x86_emulate_ctxt *ctxt, 656static int decode_modrm(struct x86_emulate_ctxt *ctxt,
830 struct x86_emulate_ops *ops) 657 struct x86_emulate_ops *ops,
658 struct operand *op)
831{ 659{
832 struct decode_cache *c = &ctxt->decode; 660 struct decode_cache *c = &ctxt->decode;
833 u8 sib; 661 u8 sib;
834 int index_reg = 0, base_reg = 0, scale; 662 int index_reg = 0, base_reg = 0, scale;
835 int rc = X86EMUL_CONTINUE; 663 int rc = X86EMUL_CONTINUE;
664 ulong modrm_ea = 0;
836 665
837 if (c->rex_prefix) { 666 if (c->rex_prefix) {
838 c->modrm_reg = (c->rex_prefix & 4) << 1; /* REX.R */ 667 c->modrm_reg = (c->rex_prefix & 4) << 1; /* REX.R */
@@ -844,16 +673,19 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
844 c->modrm_mod |= (c->modrm & 0xc0) >> 6; 673 c->modrm_mod |= (c->modrm & 0xc0) >> 6;
845 c->modrm_reg |= (c->modrm & 0x38) >> 3; 674 c->modrm_reg |= (c->modrm & 0x38) >> 3;
846 c->modrm_rm |= (c->modrm & 0x07); 675 c->modrm_rm |= (c->modrm & 0x07);
847 c->modrm_ea = 0; 676 c->modrm_seg = VCPU_SREG_DS;
848 c->use_modrm_ea = 1;
849 677
850 if (c->modrm_mod == 3) { 678 if (c->modrm_mod == 3) {
851 c->modrm_ptr = decode_register(c->modrm_rm, 679 op->type = OP_REG;
680 op->bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
681 op->addr.reg = decode_register(c->modrm_rm,
852 c->regs, c->d & ByteOp); 682 c->regs, c->d & ByteOp);
853 c->modrm_val = *(unsigned long *)c->modrm_ptr; 683 fetch_register_operand(op);
854 return rc; 684 return rc;
855 } 685 }
856 686
687 op->type = OP_MEM;
688
857 if (c->ad_bytes == 2) { 689 if (c->ad_bytes == 2) {
858 unsigned bx = c->regs[VCPU_REGS_RBX]; 690 unsigned bx = c->regs[VCPU_REGS_RBX];
859 unsigned bp = c->regs[VCPU_REGS_RBP]; 691 unsigned bp = c->regs[VCPU_REGS_RBP];
@@ -864,47 +696,46 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
864 switch (c->modrm_mod) { 696 switch (c->modrm_mod) {
865 case 0: 697 case 0:
866 if (c->modrm_rm == 6) 698 if (c->modrm_rm == 6)
867 c->modrm_ea += insn_fetch(u16, 2, c->eip); 699 modrm_ea += insn_fetch(u16, 2, c->eip);
868 break; 700 break;
869 case 1: 701 case 1:
870 c->modrm_ea += insn_fetch(s8, 1, c->eip); 702 modrm_ea += insn_fetch(s8, 1, c->eip);
871 break; 703 break;
872 case 2: 704 case 2:
873 c->modrm_ea += insn_fetch(u16, 2, c->eip); 705 modrm_ea += insn_fetch(u16, 2, c->eip);
874 break; 706 break;
875 } 707 }
876 switch (c->modrm_rm) { 708 switch (c->modrm_rm) {
877 case 0: 709 case 0:
878 c->modrm_ea += bx + si; 710 modrm_ea += bx + si;
879 break; 711 break;
880 case 1: 712 case 1:
881 c->modrm_ea += bx + di; 713 modrm_ea += bx + di;
882 break; 714 break;
883 case 2: 715 case 2:
884 c->modrm_ea += bp + si; 716 modrm_ea += bp + si;
885 break; 717 break;
886 case 3: 718 case 3:
887 c->modrm_ea += bp + di; 719 modrm_ea += bp + di;
888 break; 720 break;
889 case 4: 721 case 4:
890 c->modrm_ea += si; 722 modrm_ea += si;
891 break; 723 break;
892 case 5: 724 case 5:
893 c->modrm_ea += di; 725 modrm_ea += di;
894 break; 726 break;
895 case 6: 727 case 6:
896 if (c->modrm_mod != 0) 728 if (c->modrm_mod != 0)
897 c->modrm_ea += bp; 729 modrm_ea += bp;
898 break; 730 break;
899 case 7: 731 case 7:
900 c->modrm_ea += bx; 732 modrm_ea += bx;
901 break; 733 break;
902 } 734 }
903 if (c->modrm_rm == 2 || c->modrm_rm == 3 || 735 if (c->modrm_rm == 2 || c->modrm_rm == 3 ||
904 (c->modrm_rm == 6 && c->modrm_mod != 0)) 736 (c->modrm_rm == 6 && c->modrm_mod != 0))
905 if (!c->has_seg_override) 737 c->modrm_seg = VCPU_SREG_SS;
906 set_seg_override(c, VCPU_SREG_SS); 738 modrm_ea = (u16)modrm_ea;
907 c->modrm_ea = (u16)c->modrm_ea;
908 } else { 739 } else {
909 /* 32/64-bit ModR/M decode. */ 740 /* 32/64-bit ModR/M decode. */
910 if ((c->modrm_rm & 7) == 4) { 741 if ((c->modrm_rm & 7) == 4) {
@@ -914,410 +745,74 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
914 scale = sib >> 6; 745 scale = sib >> 6;
915 746
916 if ((base_reg & 7) == 5 && c->modrm_mod == 0) 747 if ((base_reg & 7) == 5 && c->modrm_mod == 0)
917 c->modrm_ea += insn_fetch(s32, 4, c->eip); 748 modrm_ea += insn_fetch(s32, 4, c->eip);
918 else 749 else
919 c->modrm_ea += c->regs[base_reg]; 750 modrm_ea += c->regs[base_reg];
920 if (index_reg != 4) 751 if (index_reg != 4)
921 c->modrm_ea += c->regs[index_reg] << scale; 752 modrm_ea += c->regs[index_reg] << scale;
922 } else if ((c->modrm_rm & 7) == 5 && c->modrm_mod == 0) { 753 } else if ((c->modrm_rm & 7) == 5 && c->modrm_mod == 0) {
923 if (ctxt->mode == X86EMUL_MODE_PROT64) 754 if (ctxt->mode == X86EMUL_MODE_PROT64)
924 c->rip_relative = 1; 755 c->rip_relative = 1;
925 } else 756 } else
926 c->modrm_ea += c->regs[c->modrm_rm]; 757 modrm_ea += c->regs[c->modrm_rm];
927 switch (c->modrm_mod) { 758 switch (c->modrm_mod) {
928 case 0: 759 case 0:
929 if (c->modrm_rm == 5) 760 if (c->modrm_rm == 5)
930 c->modrm_ea += insn_fetch(s32, 4, c->eip); 761 modrm_ea += insn_fetch(s32, 4, c->eip);
931 break; 762 break;
932 case 1: 763 case 1:
933 c->modrm_ea += insn_fetch(s8, 1, c->eip); 764 modrm_ea += insn_fetch(s8, 1, c->eip);
934 break; 765 break;
935 case 2: 766 case 2:
936 c->modrm_ea += insn_fetch(s32, 4, c->eip); 767 modrm_ea += insn_fetch(s32, 4, c->eip);
937 break; 768 break;
938 } 769 }
939 } 770 }
771 op->addr.mem = modrm_ea;
940done: 772done:
941 return rc; 773 return rc;
942} 774}
943 775
944static int decode_abs(struct x86_emulate_ctxt *ctxt, 776static int decode_abs(struct x86_emulate_ctxt *ctxt,
945 struct x86_emulate_ops *ops) 777 struct x86_emulate_ops *ops,
778 struct operand *op)
946{ 779{
947 struct decode_cache *c = &ctxt->decode; 780 struct decode_cache *c = &ctxt->decode;
948 int rc = X86EMUL_CONTINUE; 781 int rc = X86EMUL_CONTINUE;
949 782
783 op->type = OP_MEM;
950 switch (c->ad_bytes) { 784 switch (c->ad_bytes) {
951 case 2: 785 case 2:
952 c->modrm_ea = insn_fetch(u16, 2, c->eip); 786 op->addr.mem = insn_fetch(u16, 2, c->eip);
953 break; 787 break;
954 case 4: 788 case 4:
955 c->modrm_ea = insn_fetch(u32, 4, c->eip); 789 op->addr.mem = insn_fetch(u32, 4, c->eip);
956 break; 790 break;
957 case 8: 791 case 8:
958 c->modrm_ea = insn_fetch(u64, 8, c->eip); 792 op->addr.mem = insn_fetch(u64, 8, c->eip);
959 break; 793 break;
960 } 794 }
961done: 795done:
962 return rc; 796 return rc;
963} 797}
964 798
965int 799static void fetch_bit_operand(struct decode_cache *c)
966x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
967{ 800{
968 struct decode_cache *c = &ctxt->decode; 801 long sv = 0, mask;
969 int rc = X86EMUL_CONTINUE;
970 int mode = ctxt->mode;
971 int def_op_bytes, def_ad_bytes, group;
972
973
974 /* we cannot decode insn before we complete previous rep insn */
975 WARN_ON(ctxt->restart);
976
977 c->eip = ctxt->eip;
978 c->fetch.start = c->fetch.end = c->eip;
979 ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS);
980
981 switch (mode) {
982 case X86EMUL_MODE_REAL:
983 case X86EMUL_MODE_VM86:
984 case X86EMUL_MODE_PROT16:
985 def_op_bytes = def_ad_bytes = 2;
986 break;
987 case X86EMUL_MODE_PROT32:
988 def_op_bytes = def_ad_bytes = 4;
989 break;
990#ifdef CONFIG_X86_64
991 case X86EMUL_MODE_PROT64:
992 def_op_bytes = 4;
993 def_ad_bytes = 8;
994 break;
995#endif
996 default:
997 return -1;
998 }
999
1000 c->op_bytes = def_op_bytes;
1001 c->ad_bytes = def_ad_bytes;
1002
1003 /* Legacy prefixes. */
1004 for (;;) {
1005 switch (c->b = insn_fetch(u8, 1, c->eip)) {
1006 case 0x66: /* operand-size override */
1007 /* switch between 2/4 bytes */
1008 c->op_bytes = def_op_bytes ^ 6;
1009 break;
1010 case 0x67: /* address-size override */
1011 if (mode == X86EMUL_MODE_PROT64)
1012 /* switch between 4/8 bytes */
1013 c->ad_bytes = def_ad_bytes ^ 12;
1014 else
1015 /* switch between 2/4 bytes */
1016 c->ad_bytes = def_ad_bytes ^ 6;
1017 break;
1018 case 0x26: /* ES override */
1019 case 0x2e: /* CS override */
1020 case 0x36: /* SS override */
1021 case 0x3e: /* DS override */
1022 set_seg_override(c, (c->b >> 3) & 3);
1023 break;
1024 case 0x64: /* FS override */
1025 case 0x65: /* GS override */
1026 set_seg_override(c, c->b & 7);
1027 break;
1028 case 0x40 ... 0x4f: /* REX */
1029 if (mode != X86EMUL_MODE_PROT64)
1030 goto done_prefixes;
1031 c->rex_prefix = c->b;
1032 continue;
1033 case 0xf0: /* LOCK */
1034 c->lock_prefix = 1;
1035 break;
1036 case 0xf2: /* REPNE/REPNZ */
1037 c->rep_prefix = REPNE_PREFIX;
1038 break;
1039 case 0xf3: /* REP/REPE/REPZ */
1040 c->rep_prefix = REPE_PREFIX;
1041 break;
1042 default:
1043 goto done_prefixes;
1044 }
1045
1046 /* Any legacy prefix after a REX prefix nullifies its effect. */
1047
1048 c->rex_prefix = 0;
1049 }
1050
1051done_prefixes:
1052
1053 /* REX prefix. */
1054 if (c->rex_prefix)
1055 if (c->rex_prefix & 8)
1056 c->op_bytes = 8; /* REX.W */
1057
1058 /* Opcode byte(s). */
1059 c->d = opcode_table[c->b];
1060 if (c->d == 0) {
1061 /* Two-byte opcode? */
1062 if (c->b == 0x0f) {
1063 c->twobyte = 1;
1064 c->b = insn_fetch(u8, 1, c->eip);
1065 c->d = twobyte_table[c->b];
1066 }
1067 }
1068
1069 if (c->d & Group) {
1070 group = c->d & GroupMask;
1071 c->modrm = insn_fetch(u8, 1, c->eip);
1072 --c->eip;
1073
1074 group = (group << 3) + ((c->modrm >> 3) & 7);
1075 if ((c->d & GroupDual) && (c->modrm >> 6) == 3)
1076 c->d = group2_table[group];
1077 else
1078 c->d = group_table[group];
1079 }
1080
1081 /* Unrecognised? */
1082 if (c->d == 0) {
1083 DPRINTF("Cannot emulate %02x\n", c->b);
1084 return -1;
1085 }
1086
1087 if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack))
1088 c->op_bytes = 8;
1089
1090 /* ModRM and SIB bytes. */
1091 if (c->d & ModRM)
1092 rc = decode_modrm(ctxt, ops);
1093 else if (c->d & MemAbs)
1094 rc = decode_abs(ctxt, ops);
1095 if (rc != X86EMUL_CONTINUE)
1096 goto done;
1097
1098 if (!c->has_seg_override)
1099 set_seg_override(c, VCPU_SREG_DS);
1100
1101 if (!(!c->twobyte && c->b == 0x8d))
1102 c->modrm_ea += seg_override_base(ctxt, ops, c);
1103
1104 if (c->ad_bytes != 8)
1105 c->modrm_ea = (u32)c->modrm_ea;
1106
1107 if (c->rip_relative)
1108 c->modrm_ea += c->eip;
1109
1110 /*
1111 * Decode and fetch the source operand: register, memory
1112 * or immediate.
1113 */
1114 switch (c->d & SrcMask) {
1115 case SrcNone:
1116 break;
1117 case SrcReg:
1118 decode_register_operand(&c->src, c, 0);
1119 break;
1120 case SrcMem16:
1121 c->src.bytes = 2;
1122 goto srcmem_common;
1123 case SrcMem32:
1124 c->src.bytes = 4;
1125 goto srcmem_common;
1126 case SrcMem:
1127 c->src.bytes = (c->d & ByteOp) ? 1 :
1128 c->op_bytes;
1129 /* Don't fetch the address for invlpg: it could be unmapped. */
1130 if (c->twobyte && c->b == 0x01 && c->modrm_reg == 7)
1131 break;
1132 srcmem_common:
1133 /*
1134 * For instructions with a ModR/M byte, switch to register
1135 * access if Mod = 3.
1136 */
1137 if ((c->d & ModRM) && c->modrm_mod == 3) {
1138 c->src.type = OP_REG;
1139 c->src.val = c->modrm_val;
1140 c->src.ptr = c->modrm_ptr;
1141 break;
1142 }
1143 c->src.type = OP_MEM;
1144 c->src.ptr = (unsigned long *)c->modrm_ea;
1145 c->src.val = 0;
1146 break;
1147 case SrcImm:
1148 case SrcImmU:
1149 c->src.type = OP_IMM;
1150 c->src.ptr = (unsigned long *)c->eip;
1151 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1152 if (c->src.bytes == 8)
1153 c->src.bytes = 4;
1154 /* NB. Immediates are sign-extended as necessary. */
1155 switch (c->src.bytes) {
1156 case 1:
1157 c->src.val = insn_fetch(s8, 1, c->eip);
1158 break;
1159 case 2:
1160 c->src.val = insn_fetch(s16, 2, c->eip);
1161 break;
1162 case 4:
1163 c->src.val = insn_fetch(s32, 4, c->eip);
1164 break;
1165 }
1166 if ((c->d & SrcMask) == SrcImmU) {
1167 switch (c->src.bytes) {
1168 case 1:
1169 c->src.val &= 0xff;
1170 break;
1171 case 2:
1172 c->src.val &= 0xffff;
1173 break;
1174 case 4:
1175 c->src.val &= 0xffffffff;
1176 break;
1177 }
1178 }
1179 break;
1180 case SrcImmByte:
1181 case SrcImmUByte:
1182 c->src.type = OP_IMM;
1183 c->src.ptr = (unsigned long *)c->eip;
1184 c->src.bytes = 1;
1185 if ((c->d & SrcMask) == SrcImmByte)
1186 c->src.val = insn_fetch(s8, 1, c->eip);
1187 else
1188 c->src.val = insn_fetch(u8, 1, c->eip);
1189 break;
1190 case SrcAcc:
1191 c->src.type = OP_REG;
1192 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1193 c->src.ptr = &c->regs[VCPU_REGS_RAX];
1194 switch (c->src.bytes) {
1195 case 1:
1196 c->src.val = *(u8 *)c->src.ptr;
1197 break;
1198 case 2:
1199 c->src.val = *(u16 *)c->src.ptr;
1200 break;
1201 case 4:
1202 c->src.val = *(u32 *)c->src.ptr;
1203 break;
1204 case 8:
1205 c->src.val = *(u64 *)c->src.ptr;
1206 break;
1207 }
1208 break;
1209 case SrcOne:
1210 c->src.bytes = 1;
1211 c->src.val = 1;
1212 break;
1213 case SrcSI:
1214 c->src.type = OP_MEM;
1215 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1216 c->src.ptr = (unsigned long *)
1217 register_address(c, seg_override_base(ctxt, ops, c),
1218 c->regs[VCPU_REGS_RSI]);
1219 c->src.val = 0;
1220 break;
1221 case SrcImmFAddr:
1222 c->src.type = OP_IMM;
1223 c->src.ptr = (unsigned long *)c->eip;
1224 c->src.bytes = c->op_bytes + 2;
1225 insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip);
1226 break;
1227 case SrcMemFAddr:
1228 c->src.type = OP_MEM;
1229 c->src.ptr = (unsigned long *)c->modrm_ea;
1230 c->src.bytes = c->op_bytes + 2;
1231 break;
1232 }
1233 802
1234 /* 803 if (c->dst.type == OP_MEM && c->src.type == OP_REG) {
1235 * Decode and fetch the second source operand: register, memory 804 mask = ~(c->dst.bytes * 8 - 1);
1236 * or immediate.
1237 */
1238 switch (c->d & Src2Mask) {
1239 case Src2None:
1240 break;
1241 case Src2CL:
1242 c->src2.bytes = 1;
1243 c->src2.val = c->regs[VCPU_REGS_RCX] & 0x8;
1244 break;
1245 case Src2ImmByte:
1246 c->src2.type = OP_IMM;
1247 c->src2.ptr = (unsigned long *)c->eip;
1248 c->src2.bytes = 1;
1249 c->src2.val = insn_fetch(u8, 1, c->eip);
1250 break;
1251 case Src2One:
1252 c->src2.bytes = 1;
1253 c->src2.val = 1;
1254 break;
1255 }
1256 805
1257 /* Decode and fetch the destination operand: register or memory. */ 806 if (c->src.bytes == 2)
1258 switch (c->d & DstMask) { 807 sv = (s16)c->src.val & (s16)mask;
1259 case ImplicitOps: 808 else if (c->src.bytes == 4)
1260 /* Special instructions do their own operand decoding. */ 809 sv = (s32)c->src.val & (s32)mask;
1261 return 0;
1262 case DstReg:
1263 decode_register_operand(&c->dst, c,
1264 c->twobyte && (c->b == 0xb6 || c->b == 0xb7));
1265 break;
1266 case DstMem:
1267 case DstMem64:
1268 if ((c->d & ModRM) && c->modrm_mod == 3) {
1269 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1270 c->dst.type = OP_REG;
1271 c->dst.val = c->dst.orig_val = c->modrm_val;
1272 c->dst.ptr = c->modrm_ptr;
1273 break;
1274 }
1275 c->dst.type = OP_MEM;
1276 c->dst.ptr = (unsigned long *)c->modrm_ea;
1277 if ((c->d & DstMask) == DstMem64)
1278 c->dst.bytes = 8;
1279 else
1280 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1281 c->dst.val = 0;
1282 if (c->d & BitOp) {
1283 unsigned long mask = ~(c->dst.bytes * 8 - 1);
1284 810
1285 c->dst.ptr = (void *)c->dst.ptr + 811 c->dst.addr.mem += (sv >> 3);
1286 (c->src.val & mask) / 8;
1287 }
1288 break;
1289 case DstAcc:
1290 c->dst.type = OP_REG;
1291 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1292 c->dst.ptr = &c->regs[VCPU_REGS_RAX];
1293 switch (c->dst.bytes) {
1294 case 1:
1295 c->dst.val = *(u8 *)c->dst.ptr;
1296 break;
1297 case 2:
1298 c->dst.val = *(u16 *)c->dst.ptr;
1299 break;
1300 case 4:
1301 c->dst.val = *(u32 *)c->dst.ptr;
1302 break;
1303 case 8:
1304 c->dst.val = *(u64 *)c->dst.ptr;
1305 break;
1306 }
1307 c->dst.orig_val = c->dst.val;
1308 break;
1309 case DstDI:
1310 c->dst.type = OP_MEM;
1311 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1312 c->dst.ptr = (unsigned long *)
1313 register_address(c, es_base(ctxt, ops),
1314 c->regs[VCPU_REGS_RDI]);
1315 c->dst.val = 0;
1316 break;
1317 } 812 }
1318 813
1319done: 814 /* only subword offset */
1320 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; 815 c->src.val &= (c->dst.bytes << 3) - 1;
1321} 816}
1322 817
1323static int read_emulated(struct x86_emulate_ctxt *ctxt, 818static int read_emulated(struct x86_emulate_ctxt *ctxt,
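
A worked example (hypothetical operands) of the offset split fetch_bit_operand() performs above for a memory-destination bit op such as "bt dword [mem], 100": the whole-dword part of the signed bit offset moves the effective address (negative offsets move it down), and the remainder becomes a subword bit index. Standalone sketch, not emulator code:

  #include <stdio.h>

  int main(void)
  {
          int bytes = 4;                          /* c->dst.bytes              */
          long bit = 100;                         /* c->src.val (register)     */
          long mask = ~(long)(bytes * 8 - 1);     /* ~31                       */
          long sv = bit & mask;                   /* 96: bits beyond this dword */

          /* prints "advance 12 bytes, test bit 4" */
          printf("advance %ld bytes, test bit %ld\n",
                 sv >> 3, bit & (bytes * 8 - 1));
          return 0;
  }
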
@@ -1337,7 +832,7 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt,
1337 rc = ops->read_emulated(addr, mc->data + mc->end, n, &err, 832 rc = ops->read_emulated(addr, mc->data + mc->end, n, &err,
1338 ctxt->vcpu); 833 ctxt->vcpu);
1339 if (rc == X86EMUL_PROPAGATE_FAULT) 834 if (rc == X86EMUL_PROPAGATE_FAULT)
1340 emulate_pf(ctxt, addr, err); 835 emulate_pf(ctxt);
1341 if (rc != X86EMUL_CONTINUE) 836 if (rc != X86EMUL_CONTINUE)
1342 return rc; 837 return rc;
1343 mc->end += n; 838 mc->end += n;
@@ -1424,7 +919,7 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1424 addr = dt.address + index * 8; 919 addr = dt.address + index * 8;
1425 ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu, &err); 920 ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu, &err);
1426 if (ret == X86EMUL_PROPAGATE_FAULT) 921 if (ret == X86EMUL_PROPAGATE_FAULT)
1427 emulate_pf(ctxt, addr, err); 922 emulate_pf(ctxt);
1428 923
1429 return ret; 924 return ret;
1430} 925}
@@ -1450,7 +945,7 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1450 addr = dt.address + index * 8; 945 addr = dt.address + index * 8;
1451 ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err); 946 ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err);
1452 if (ret == X86EMUL_PROPAGATE_FAULT) 947 if (ret == X86EMUL_PROPAGATE_FAULT)
1453 emulate_pf(ctxt, addr, err); 948 emulate_pf(ctxt);
1454 949
1455 return ret; 950 return ret;
1456} 951}
@@ -1573,6 +1068,25 @@ exception:
1573 return X86EMUL_PROPAGATE_FAULT; 1068 return X86EMUL_PROPAGATE_FAULT;
1574} 1069}
1575 1070
1071static void write_register_operand(struct operand *op)
1072{
1073 /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */
1074 switch (op->bytes) {
1075 case 1:
1076 *(u8 *)op->addr.reg = (u8)op->val;
1077 break;
1078 case 2:
1079 *(u16 *)op->addr.reg = (u16)op->val;
1080 break;
1081 case 4:
1082 *op->addr.reg = (u32)op->val;
1083 break; /* 64b: zero-extend */
1084 case 8:
1085 *op->addr.reg = op->val;
1086 break;
1087 }
1088}
1089
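
The width rules write_register_operand() encodes match x86-64 register writes: 1- and 2-byte stores leave the upper register bits alone, while a 4-byte store zero-extends to 64 bits. A standalone illustration with made-up values, using the same cast idiom as the emulator (little-endian host assumed, as on x86):

  #include <stdint.h>
  #include <stdio.h>

  int main(void)
  {
          uint64_t reg = 0x1122334455667788ULL;

          *(uint16_t *)&reg = 0xabcd;             /* 2-byte write: low word only */
          printf("%016llx\n", (unsigned long long)reg);   /* 112233445566abcd   */

          reg = (uint32_t)0xdeadbeef;             /* 4-byte write: zero-extends  */
          printf("%016llx\n", (unsigned long long)reg);   /* 00000000deadbeef   */
          return 0;
  }
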
1576static inline int writeback(struct x86_emulate_ctxt *ctxt, 1090static inline int writeback(struct x86_emulate_ctxt *ctxt,
1577 struct x86_emulate_ops *ops) 1091 struct x86_emulate_ops *ops)
1578{ 1092{
@@ -1582,28 +1096,12 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
1582 1096
1583 switch (c->dst.type) { 1097 switch (c->dst.type) {
1584 case OP_REG: 1098 case OP_REG:
1585 /* The 4-byte case *is* correct: 1099 write_register_operand(&c->dst);
1586 * in 64-bit mode we zero-extend.
1587 */
1588 switch (c->dst.bytes) {
1589 case 1:
1590 *(u8 *)c->dst.ptr = (u8)c->dst.val;
1591 break;
1592 case 2:
1593 *(u16 *)c->dst.ptr = (u16)c->dst.val;
1594 break;
1595 case 4:
1596 *c->dst.ptr = (u32)c->dst.val;
1597 break; /* 64b: zero-ext */
1598 case 8:
1599 *c->dst.ptr = c->dst.val;
1600 break;
1601 }
1602 break; 1100 break;
1603 case OP_MEM: 1101 case OP_MEM:
1604 if (c->lock_prefix) 1102 if (c->lock_prefix)
1605 rc = ops->cmpxchg_emulated( 1103 rc = ops->cmpxchg_emulated(
1606 (unsigned long)c->dst.ptr, 1104 c->dst.addr.mem,
1607 &c->dst.orig_val, 1105 &c->dst.orig_val,
1608 &c->dst.val, 1106 &c->dst.val,
1609 c->dst.bytes, 1107 c->dst.bytes,
@@ -1611,14 +1109,13 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
1611 ctxt->vcpu); 1109 ctxt->vcpu);
1612 else 1110 else
1613 rc = ops->write_emulated( 1111 rc = ops->write_emulated(
1614 (unsigned long)c->dst.ptr, 1112 c->dst.addr.mem,
1615 &c->dst.val, 1113 &c->dst.val,
1616 c->dst.bytes, 1114 c->dst.bytes,
1617 &err, 1115 &err,
1618 ctxt->vcpu); 1116 ctxt->vcpu);
1619 if (rc == X86EMUL_PROPAGATE_FAULT) 1117 if (rc == X86EMUL_PROPAGATE_FAULT)
1620 emulate_pf(ctxt, 1118 emulate_pf(ctxt);
1621 (unsigned long)c->dst.ptr, err);
1622 if (rc != X86EMUL_CONTINUE) 1119 if (rc != X86EMUL_CONTINUE)
1623 return rc; 1120 return rc;
1624 break; 1121 break;
@@ -1640,8 +1137,8 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt,
1640 c->dst.bytes = c->op_bytes; 1137 c->dst.bytes = c->op_bytes;
1641 c->dst.val = c->src.val; 1138 c->dst.val = c->src.val;
1642 register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes); 1139 register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes);
1643 c->dst.ptr = (void *) register_address(c, ss_base(ctxt, ops), 1140 c->dst.addr.mem = register_address(c, ss_base(ctxt, ops),
1644 c->regs[VCPU_REGS_RSP]); 1141 c->regs[VCPU_REGS_RSP]);
1645} 1142}
1646 1143
1647static int emulate_pop(struct x86_emulate_ctxt *ctxt, 1144static int emulate_pop(struct x86_emulate_ctxt *ctxt,
@@ -1701,6 +1198,9 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
1701 *(unsigned long *)dest = 1198 *(unsigned long *)dest =
1702 (ctxt->eflags & ~change_mask) | (val & change_mask); 1199 (ctxt->eflags & ~change_mask) | (val & change_mask);
1703 1200
1201 if (rc == X86EMUL_PROPAGATE_FAULT)
1202 emulate_pf(ctxt);
1203
1704 return rc; 1204 return rc;
1705} 1205}
1706 1206
@@ -1778,6 +1278,150 @@ static int emulate_popa(struct x86_emulate_ctxt *ctxt,
1778 return rc; 1278 return rc;
1779} 1279}
1780 1280
1281int emulate_int_real(struct x86_emulate_ctxt *ctxt,
1282 struct x86_emulate_ops *ops, int irq)
1283{
1284 struct decode_cache *c = &ctxt->decode;
1285 int rc;
1286 struct desc_ptr dt;
1287 gva_t cs_addr;
1288 gva_t eip_addr;
1289 u16 cs, eip;
1290 u32 err;
1291
1292 /* TODO: Add limit checks */
1293 c->src.val = ctxt->eflags;
1294 emulate_push(ctxt, ops);
1295 rc = writeback(ctxt, ops);
1296 if (rc != X86EMUL_CONTINUE)
1297 return rc;
1298
1299 ctxt->eflags &= ~(EFLG_IF | EFLG_TF | EFLG_AC);
1300
1301 c->src.val = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu);
1302 emulate_push(ctxt, ops);
1303 rc = writeback(ctxt, ops);
1304 if (rc != X86EMUL_CONTINUE)
1305 return rc;
1306
1307 c->src.val = c->eip;
1308 emulate_push(ctxt, ops);
1309 rc = writeback(ctxt, ops);
1310 if (rc != X86EMUL_CONTINUE)
1311 return rc;
1312
1313 c->dst.type = OP_NONE;
1314
1315 ops->get_idt(&dt, ctxt->vcpu);
1316
1317 eip_addr = dt.address + (irq << 2);
1318 cs_addr = dt.address + (irq << 2) + 2;
1319
1320 rc = ops->read_std(cs_addr, &cs, 2, ctxt->vcpu, &err);
1321 if (rc != X86EMUL_CONTINUE)
1322 return rc;
1323
1324 rc = ops->read_std(eip_addr, &eip, 2, ctxt->vcpu, &err);
1325 if (rc != X86EMUL_CONTINUE)
1326 return rc;
1327
1328 rc = load_segment_descriptor(ctxt, ops, cs, VCPU_SREG_CS);
1329 if (rc != X86EMUL_CONTINUE)
1330 return rc;
1331
1332 c->eip = eip;
1333
1334 return rc;
1335}
1336
1337static int emulate_int(struct x86_emulate_ctxt *ctxt,
1338 struct x86_emulate_ops *ops, int irq)
1339{
1340 switch(ctxt->mode) {
1341 case X86EMUL_MODE_REAL:
1342 return emulate_int_real(ctxt, ops, irq);
1343 case X86EMUL_MODE_VM86:
1344 case X86EMUL_MODE_PROT16:
1345 case X86EMUL_MODE_PROT32:
1346 case X86EMUL_MODE_PROT64:
1347 default:
1348 /* Protected mode interrupts unimplemented yet */
1349 return X86EMUL_UNHANDLEABLE;
1350 }
1351}
1352
1353static int emulate_iret_real(struct x86_emulate_ctxt *ctxt,
1354 struct x86_emulate_ops *ops)
1355{
1356 struct decode_cache *c = &ctxt->decode;
1357 int rc = X86EMUL_CONTINUE;
1358 unsigned long temp_eip = 0;
1359 unsigned long temp_eflags = 0;
1360 unsigned long cs = 0;
1361 unsigned long mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_TF |
1362 EFLG_IF | EFLG_DF | EFLG_OF | EFLG_IOPL | EFLG_NT | EFLG_RF |
1363 EFLG_AC | EFLG_ID | (1 << 1); /* Last one is the reserved bit */
1364 unsigned long vm86_mask = EFLG_VM | EFLG_VIF | EFLG_VIP;
1365
1366 /* TODO: Add stack limit check */
1367
1368 rc = emulate_pop(ctxt, ops, &temp_eip, c->op_bytes);
1369
1370 if (rc != X86EMUL_CONTINUE)
1371 return rc;
1372
1373 if (temp_eip & ~0xffff) {
1374 emulate_gp(ctxt, 0);
1375 return X86EMUL_PROPAGATE_FAULT;
1376 }
1377
1378 rc = emulate_pop(ctxt, ops, &cs, c->op_bytes);
1379
1380 if (rc != X86EMUL_CONTINUE)
1381 return rc;
1382
1383 rc = emulate_pop(ctxt, ops, &temp_eflags, c->op_bytes);
1384
1385 if (rc != X86EMUL_CONTINUE)
1386 return rc;
1387
1388 rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS);
1389
1390 if (rc != X86EMUL_CONTINUE)
1391 return rc;
1392
1393 c->eip = temp_eip;
1394
1395
1396 if (c->op_bytes == 4)
1397 ctxt->eflags = ((temp_eflags & mask) | (ctxt->eflags & vm86_mask));
1398 else if (c->op_bytes == 2) {
1399 ctxt->eflags &= ~0xffff;
1400 ctxt->eflags |= temp_eflags;
1401 }
1402
1403 ctxt->eflags &= ~EFLG_RESERVED_ZEROS_MASK; /* Clear reserved zeros */
1404 ctxt->eflags |= EFLG_RESERVED_ONE_MASK;
1405
1406 return rc;
1407}
1408
1409static inline int emulate_iret(struct x86_emulate_ctxt *ctxt,
1410 struct x86_emulate_ops* ops)
1411{
1412 switch(ctxt->mode) {
1413 case X86EMUL_MODE_REAL:
1414 return emulate_iret_real(ctxt, ops);
1415 case X86EMUL_MODE_VM86:
1416 case X86EMUL_MODE_PROT16:
1417 case X86EMUL_MODE_PROT32:
1418 case X86EMUL_MODE_PROT64:
1419 default:
1420 /* iret from protected mode unimplemented yet */
1421 return X86EMUL_UNHANDLEABLE;
1422 }
1423}
1424
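
For context on the irq << 2 arithmetic in emulate_int_real() above: a real-mode IVT entry is four bytes, offset (IP) first and segment (CS) second, so vector n lives at idt.address + n*4. A sketch of that layout (illustrative type, not kernel code):

  #include <stdint.h>

  struct ivt_entry {
          uint16_t ip;    /* read from dt.address + (irq << 2)     */
          uint16_t cs;    /* read from dt.address + (irq << 2) + 2 */
  };
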
1781static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt, 1425static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
1782 struct x86_emulate_ops *ops) 1426 struct x86_emulate_ops *ops)
1783{ 1427{
@@ -1819,6 +1463,9 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
1819 struct x86_emulate_ops *ops) 1463 struct x86_emulate_ops *ops)
1820{ 1464{
1821 struct decode_cache *c = &ctxt->decode; 1465 struct decode_cache *c = &ctxt->decode;
1466 unsigned long *rax = &c->regs[VCPU_REGS_RAX];
1467 unsigned long *rdx = &c->regs[VCPU_REGS_RDX];
1468 u8 de = 0;
1822 1469
1823 switch (c->modrm_reg) { 1470 switch (c->modrm_reg) {
1824 case 0 ... 1: /* test */ 1471 case 0 ... 1: /* test */
@@ -1830,10 +1477,26 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
1830 case 3: /* neg */ 1477 case 3: /* neg */
1831 emulate_1op("neg", c->dst, ctxt->eflags); 1478 emulate_1op("neg", c->dst, ctxt->eflags);
1832 break; 1479 break;
1480 case 4: /* mul */
1481 emulate_1op_rax_rdx("mul", c->src, *rax, *rdx, ctxt->eflags);
1482 break;
1483 case 5: /* imul */
1484 emulate_1op_rax_rdx("imul", c->src, *rax, *rdx, ctxt->eflags);
1485 break;
1486 case 6: /* div */
1487 emulate_1op_rax_rdx_ex("div", c->src, *rax, *rdx,
1488 ctxt->eflags, de);
1489 break;
1490 case 7: /* idiv */
1491 emulate_1op_rax_rdx_ex("idiv", c->src, *rax, *rdx,
1492 ctxt->eflags, de);
1493 break;
1833 default: 1494 default:
1834 return 0; 1495 return X86EMUL_UNHANDLEABLE;
1835 } 1496 }
1836 return 1; 1497 if (de)
1498 return emulate_de(ctxt);
1499 return X86EMUL_CONTINUE;
1837} 1500}
1838 1501
1839static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt, 1502static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
@@ -1905,6 +1568,23 @@ static int emulate_ret_far(struct x86_emulate_ctxt *ctxt,
1905 return rc; 1568 return rc;
1906} 1569}
1907 1570
1571static int emulate_load_segment(struct x86_emulate_ctxt *ctxt,
1572 struct x86_emulate_ops *ops, int seg)
1573{
1574 struct decode_cache *c = &ctxt->decode;
1575 unsigned short sel;
1576 int rc;
1577
1578 memcpy(&sel, c->src.valptr + c->op_bytes, 2);
1579
1580 rc = load_segment_descriptor(ctxt, ops, sel, seg);
1581 if (rc != X86EMUL_CONTINUE)
1582 return rc;
1583
1584 c->dst.val = c->src.val;
1585 return rc;
1586}
1587
1908static inline void 1588static inline void
1909setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, 1589setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
1910 struct x86_emulate_ops *ops, struct desc_struct *cs, 1590 struct x86_emulate_ops *ops, struct desc_struct *cs,
@@ -2160,9 +1840,15 @@ static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt,
2160 struct x86_emulate_ops *ops, 1840 struct x86_emulate_ops *ops,
2161 u16 port, u16 len) 1841 u16 port, u16 len)
2162{ 1842{
1843 if (ctxt->perm_ok)
1844 return true;
1845
2163 if (emulator_bad_iopl(ctxt, ops)) 1846 if (emulator_bad_iopl(ctxt, ops))
2164 if (!emulator_io_port_access_allowed(ctxt, ops, port, len)) 1847 if (!emulator_io_port_access_allowed(ctxt, ops, port, len))
2165 return false; 1848 return false;
1849
1850 ctxt->perm_ok = true;
1851
2166 return true; 1852 return true;
2167} 1853}
2168 1854
@@ -2254,7 +1940,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
2254 &err); 1940 &err);
2255 if (ret == X86EMUL_PROPAGATE_FAULT) { 1941 if (ret == X86EMUL_PROPAGATE_FAULT) {
2256 /* FIXME: need to provide precise fault address */ 1942 /* FIXME: need to provide precise fault address */
2257 emulate_pf(ctxt, old_tss_base, err); 1943 emulate_pf(ctxt);
2258 return ret; 1944 return ret;
2259 } 1945 }
2260 1946
@@ -2264,7 +1950,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
2264 &err); 1950 &err);
2265 if (ret == X86EMUL_PROPAGATE_FAULT) { 1951 if (ret == X86EMUL_PROPAGATE_FAULT) {
2266 /* FIXME: need to provide precise fault address */ 1952 /* FIXME: need to provide precise fault address */
2267 emulate_pf(ctxt, old_tss_base, err); 1953 emulate_pf(ctxt);
2268 return ret; 1954 return ret;
2269 } 1955 }
2270 1956
@@ -2272,7 +1958,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
2272 &err); 1958 &err);
2273 if (ret == X86EMUL_PROPAGATE_FAULT) { 1959 if (ret == X86EMUL_PROPAGATE_FAULT) {
2274 /* FIXME: need to provide precise fault address */ 1960 /* FIXME: need to provide precise fault address */
2275 emulate_pf(ctxt, new_tss_base, err); 1961 emulate_pf(ctxt);
2276 return ret; 1962 return ret;
2277 } 1963 }
2278 1964
@@ -2285,7 +1971,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
2285 ctxt->vcpu, &err); 1971 ctxt->vcpu, &err);
2286 if (ret == X86EMUL_PROPAGATE_FAULT) { 1972 if (ret == X86EMUL_PROPAGATE_FAULT) {
2287 /* FIXME: need to provide precise fault address */ 1973 /* FIXME: need to provide precise fault address */
2288 emulate_pf(ctxt, new_tss_base, err); 1974 emulate_pf(ctxt);
2289 return ret; 1975 return ret;
2290 } 1976 }
2291 } 1977 }
@@ -2396,7 +2082,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
2396 &err); 2082 &err);
2397 if (ret == X86EMUL_PROPAGATE_FAULT) { 2083 if (ret == X86EMUL_PROPAGATE_FAULT) {
2398 /* FIXME: need to provide precise fault address */ 2084 /* FIXME: need to provide precise fault address */
2399 emulate_pf(ctxt, old_tss_base, err); 2085 emulate_pf(ctxt);
2400 return ret; 2086 return ret;
2401 } 2087 }
2402 2088
@@ -2406,7 +2092,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
2406 &err); 2092 &err);
2407 if (ret == X86EMUL_PROPAGATE_FAULT) { 2093 if (ret == X86EMUL_PROPAGATE_FAULT) {
2408 /* FIXME: need to provide precise fault address */ 2094 /* FIXME: need to provide precise fault address */
2409 emulate_pf(ctxt, old_tss_base, err); 2095 emulate_pf(ctxt);
2410 return ret; 2096 return ret;
2411 } 2097 }
2412 2098
@@ -2414,7 +2100,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
2414 &err); 2100 &err);
2415 if (ret == X86EMUL_PROPAGATE_FAULT) { 2101 if (ret == X86EMUL_PROPAGATE_FAULT) {
2416 /* FIXME: need to provide precise fault address */ 2102 /* FIXME: need to provide precise fault address */
2417 emulate_pf(ctxt, new_tss_base, err); 2103 emulate_pf(ctxt);
2418 return ret; 2104 return ret;
2419 } 2105 }
2420 2106
@@ -2427,7 +2113,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
2427 ctxt->vcpu, &err); 2113 ctxt->vcpu, &err);
2428 if (ret == X86EMUL_PROPAGATE_FAULT) { 2114 if (ret == X86EMUL_PROPAGATE_FAULT) {
2429 /* FIXME: need to provide precise fault address */ 2115 /* FIXME: need to provide precise fault address */
2430 emulate_pf(ctxt, new_tss_base, err); 2116 emulate_pf(ctxt);
2431 return ret; 2117 return ret;
2432 } 2118 }
2433 } 2119 }
@@ -2523,10 +2209,10 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2523} 2209}
2524 2210
2525int emulator_task_switch(struct x86_emulate_ctxt *ctxt, 2211int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
2526 struct x86_emulate_ops *ops,
2527 u16 tss_selector, int reason, 2212 u16 tss_selector, int reason,
2528 bool has_error_code, u32 error_code) 2213 bool has_error_code, u32 error_code)
2529{ 2214{
2215 struct x86_emulate_ops *ops = ctxt->ops;
2530 struct decode_cache *c = &ctxt->decode; 2216 struct decode_cache *c = &ctxt->decode;
2531 int rc; 2217 int rc;
2532 2218
@@ -2552,16 +2238,784 @@ static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned long base,
2552 int df = (ctxt->eflags & EFLG_DF) ? -1 : 1; 2238 int df = (ctxt->eflags & EFLG_DF) ? -1 : 1;
2553 2239
2554 register_address_increment(c, &c->regs[reg], df * op->bytes); 2240 register_address_increment(c, &c->regs[reg], df * op->bytes);
2555 op->ptr = (unsigned long *)register_address(c, base, c->regs[reg]); 2241 op->addr.mem = register_address(c, base, c->regs[reg]);
2242}
2243
2244static int em_push(struct x86_emulate_ctxt *ctxt)
2245{
2246 emulate_push(ctxt, ctxt->ops);
2247 return X86EMUL_CONTINUE;
2248}
2249
2250static int em_das(struct x86_emulate_ctxt *ctxt)
2251{
2252 struct decode_cache *c = &ctxt->decode;
2253 u8 al, old_al;
2254 bool af, cf, old_cf;
2255
2256 cf = ctxt->eflags & X86_EFLAGS_CF;
2257 al = c->dst.val;
2258
2259 old_al = al;
2260 old_cf = cf;
2261 cf = false;
2262 af = ctxt->eflags & X86_EFLAGS_AF;
2263 if ((al & 0x0f) > 9 || af) {
2264 al -= 6;
2265 cf = old_cf | (al >= 250);
2266 af = true;
2267 } else {
2268 af = false;
2269 }
2270 if (old_al > 0x99 || old_cf) {
2271 al -= 0x60;
2272 cf = true;
2273 }
2274
2275 c->dst.val = al;
2276 /* Set PF, ZF, SF */
2277 c->src.type = OP_IMM;
2278 c->src.val = 0;
2279 c->src.bytes = 1;
2280 emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
2281 ctxt->eflags &= ~(X86_EFLAGS_AF | X86_EFLAGS_CF);
2282 if (cf)
2283 ctxt->eflags |= X86_EFLAGS_CF;
2284 if (af)
2285 ctxt->eflags |= X86_EFLAGS_AF;
2286 return X86EMUL_CONTINUE;
2287}
2288
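em_das() above is the DAS (decimal adjust AL after subtraction) fixup for packed BCD. The same adjustment as a standalone helper over explicit flags; this mirrors the logic shown above rather than quoting the SDM pseudocode:

#include <stdint.h>
#include <stdbool.h>

/* Packed-BCD adjust of AL after a subtraction, mirroring em_das() above. */
static uint8_t das_adjust(uint8_t al, bool *cf, bool *af)
{
        uint8_t old_al = al;
        bool old_cf = *cf;

        *cf = false;
        if ((al & 0x0f) > 9 || *af) {
                al -= 6;
                *cf = old_cf || (al >= 250);    /* borrow out of the low nibble */
                *af = true;
        } else {
                *af = false;
        }
        if (old_al > 0x99 || old_cf) {
                al -= 0x60;
                *cf = true;
        }
        return al;
}

PF, ZF and SF are then recomputed from the adjusted AL, which is what the dummy "or" against an immediate zero does in the emulator version.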
2289static int em_call_far(struct x86_emulate_ctxt *ctxt)
2290{
2291 struct decode_cache *c = &ctxt->decode;
2292 u16 sel, old_cs;
2293 ulong old_eip;
2294 int rc;
2295
2296 old_cs = ctxt->ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu);
2297 old_eip = c->eip;
2298
2299 memcpy(&sel, c->src.valptr + c->op_bytes, 2);
2300 if (load_segment_descriptor(ctxt, ctxt->ops, sel, VCPU_SREG_CS))
2301 return X86EMUL_CONTINUE;
2302
2303 c->eip = 0;
2304 memcpy(&c->eip, c->src.valptr, c->op_bytes);
2305
2306 c->src.val = old_cs;
2307 emulate_push(ctxt, ctxt->ops);
2308 rc = writeback(ctxt, ctxt->ops);
2309 if (rc != X86EMUL_CONTINUE)
2310 return rc;
2311
2312 c->src.val = old_eip;
2313 emulate_push(ctxt, ctxt->ops);
2314 rc = writeback(ctxt, ctxt->ops);
2315 if (rc != X86EMUL_CONTINUE)
2316 return rc;
2317
2318 c->dst.type = OP_NONE;
2319
2320 return X86EMUL_CONTINUE;
2321}
2322
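em_call_far() above loads the new CS:EIP from the far-pointer operand and then pushes the old CS followed by the old EIP, producing the frame a far return expects: EIP at the top of the stack, CS just above it. A toy model of that sequence (the stack type and helpers here are hypothetical):

#include <stdint.h>

/* Toy downward-growing stack; one 32-bit slot per push. */
struct toy_stack {
        uint32_t slot[64];
        unsigned top;           /* index of the current top-of-stack slot */
};

static void push32(struct toy_stack *s, uint32_t v)
{
        s->slot[--s->top] = v;
}

static void toy_call_far(struct toy_stack *s, uint16_t *cs, uint32_t *eip,
                         uint16_t new_cs, uint32_t new_eip)
{
        uint16_t old_cs = *cs;
        uint32_t old_eip = *eip;

        *cs = new_cs;           /* load the new code segment first, */
        *eip = new_eip;         /* as the emulator does above       */
        push32(s, old_cs);      /* then push old CS ...             */
        push32(s, old_eip);     /* ... and old EIP ends up on top   */
}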
2323static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt)
2324{
2325 struct decode_cache *c = &ctxt->decode;
2326 int rc;
2327
2328 c->dst.type = OP_REG;
2329 c->dst.addr.reg = &c->eip;
2330 c->dst.bytes = c->op_bytes;
2331 rc = emulate_pop(ctxt, ctxt->ops, &c->dst.val, c->op_bytes);
2332 if (rc != X86EMUL_CONTINUE)
2333 return rc;
2334 register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->src.val);
2335 return X86EMUL_CONTINUE;
2336}
2337
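em_ret_near_imm() above pops the return address into c->eip and then advances RSP by the immediate, which is the callee-cleanup "ret imm16" form. The stack effect in isolation, as a sketch over a flat little-endian stack model rather than the emulator's own helpers:

#include <stdint.h>
#include <string.h>

/* "ret imm16": pop op_bytes of return address, then discard imm argument bytes. */
static uint64_t ret_near_imm(const uint8_t *stack, uint64_t *rsp,
                             unsigned op_bytes, uint16_t imm)
{
        uint64_t new_ip = 0;

        memcpy(&new_ip, stack + *rsp, op_bytes);  /* pop the return address      */
        *rsp += op_bytes;
        *rsp += imm;                              /* callee cleans its arguments */
        return new_ip;
}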
2338static int em_imul(struct x86_emulate_ctxt *ctxt)
2339{
2340 struct decode_cache *c = &ctxt->decode;
2341
2342 emulate_2op_SrcV_nobyte("imul", c->src, c->dst, ctxt->eflags);
2343 return X86EMUL_CONTINUE;
2344}
2345
2346static int em_imul_3op(struct x86_emulate_ctxt *ctxt)
2347{
2348 struct decode_cache *c = &ctxt->decode;
2349
2350 c->dst.val = c->src2.val;
2351 return em_imul(ctxt);
2352}
2353
2354static int em_cwd(struct x86_emulate_ctxt *ctxt)
2355{
2356 struct decode_cache *c = &ctxt->decode;
2357
2358 c->dst.type = OP_REG;
2359 c->dst.bytes = c->src.bytes;
2360 c->dst.addr.reg = &c->regs[VCPU_REGS_RDX];
2361 c->dst.val = ~((c->src.val >> (c->src.bytes * 8 - 1)) - 1);
2362
2363 return X86EMUL_CONTINUE;
2364}
2365
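em_cwd() above broadcasts the sign bit of the accumulator into RDX with ~((src >> (bytes*8 - 1)) - 1): the shift leaves 0 or 1, subtracting 1 gives all-ones or zero, and the complement flips that to zero or all-ones. A quick check of the identity:

#include <stdint.h>
#include <assert.h>

/* Broadcast the sign bit of a 'bytes'-wide value, as em_cwd() does above. */
static uint64_t sign_fill(uint64_t val, unsigned bytes)
{
        return ~((val >> (bytes * 8 - 1)) - 1);
}

int main(void)
{
        assert((uint16_t)sign_fill(0x7fff, 2) == 0x0000);            /* cwd, positive AX  */
        assert((uint16_t)sign_fill(0x8000, 2) == 0xffff);            /* cwd, negative AX  */
        assert((uint32_t)sign_fill(0x80000000u, 4) == 0xffffffffu);  /* cdq, negative EAX */
        return 0;
}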
2366static int em_rdtsc(struct x86_emulate_ctxt *ctxt)
2367{
2368 unsigned cpl = ctxt->ops->cpl(ctxt->vcpu);
2369 struct decode_cache *c = &ctxt->decode;
2370 u64 tsc = 0;
2371
2372 if (cpl > 0 && (ctxt->ops->get_cr(4, ctxt->vcpu) & X86_CR4_TSD)) {
2373 emulate_gp(ctxt, 0);
2374 return X86EMUL_PROPAGATE_FAULT;
2375 }
2376 ctxt->ops->get_msr(ctxt->vcpu, MSR_IA32_TSC, &tsc);
2377 c->regs[VCPU_REGS_RAX] = (u32)tsc;
2378 c->regs[VCPU_REGS_RDX] = tsc >> 32;
2379 return X86EMUL_CONTINUE;
2380}
2381
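em_rdtsc() above refuses the instruction at CPL > 0 when CR4.TSD is set, and otherwise returns the 64-bit counter split across EAX (low half) and EDX (high half), as the hardware does. The split spelled out:

#include <stdint.h>

/* RDTSC result delivery: low 32 bits to EAX, high 32 bits to EDX. */
static void deliver_tsc(uint64_t tsc, uint32_t *eax, uint32_t *edx)
{
        *eax = (uint32_t)tsc;
        *edx = (uint32_t)(tsc >> 32);
}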
2382static int em_mov(struct x86_emulate_ctxt *ctxt)
2383{
2384 struct decode_cache *c = &ctxt->decode;
2385 c->dst.val = c->src.val;
2386 return X86EMUL_CONTINUE;
2387}
2388
2389#define D(_y) { .flags = (_y) }
2390#define N D(0)
2391#define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) }
2392#define GD(_f, _g) { .flags = ((_f) | Group | GroupDual), .u.gdual = (_g) }
2393#define I(_f, _e) { .flags = (_f), .u.execute = (_e) }
2394
2395#define D2bv(_f) D((_f) | ByteOp), D(_f)
2396#define I2bv(_f, _e) I((_f) | ByteOp, _e), I(_f, _e)
2397
2398#define D6ALU(_f) D2bv((_f) | DstMem | SrcReg | ModRM), \
2399 D2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock), \
2400 D2bv(((_f) & ~Lock) | DstAcc | SrcImm)
2401
2402
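The constructors above build struct opcode initializers: D2bv()/I2bv() emit a ByteOp variant followed by the wide variant, and D6ALU() stamps out the six canonical ALU encodings (r/m,reg and reg,r/m and accumulator,imm, each in byte and wide form), stripping Lock from the forms that cannot take it. A self-contained sketch of the expansion for the 0x00 - 0x05 add row; the flag bit values below are simplified stand-ins, not the real encodings from this file:

#include <stdio.h>

#define ByteOp  (1u << 0)       /* hypothetical bit assignments */
#define DstMem  (1u << 1)
#define DstReg  (1u << 2)
#define DstAcc  (1u << 3)
#define SrcReg  (1u << 4)
#define SrcMem  (1u << 5)
#define SrcImm  (1u << 6)
#define ModRM   (1u << 7)
#define Lock    (1u << 8)

struct opcode { unsigned flags; };

#define D(_y)      { .flags = (_y) }
#define D2bv(_f)   D((_f) | ByteOp), D(_f)
#define D6ALU(_f)  D2bv((_f) | DstMem | SrcReg | ModRM), \
                   D2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock), \
                   D2bv(((_f) & ~Lock) | DstAcc | SrcImm)

/* 0x00..0x05: add r/m8,r8; add r/m,r; add r8,r/m8; add r,r/m; add al,imm8; add eax,imm */
static const struct opcode add_row[] = { D6ALU(Lock) };

int main(void)
{
        for (unsigned i = 0; i < sizeof(add_row) / sizeof(add_row[0]); i++)
                printf("entry %u: flags=%#x\n", i, add_row[i].flags);
        return 0;
}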
2403static struct opcode group1[] = {
2404 X7(D(Lock)), N
2405};
2406
2407static struct opcode group1A[] = {
2408 D(DstMem | SrcNone | ModRM | Mov | Stack), N, N, N, N, N, N, N,
2409};
2410
2411static struct opcode group3[] = {
2412 D(DstMem | SrcImm | ModRM), D(DstMem | SrcImm | ModRM),
2413 D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock),
2414 X4(D(SrcMem | ModRM)),
2415};
2416
2417static struct opcode group4[] = {
2418 D(ByteOp | DstMem | SrcNone | ModRM | Lock), D(ByteOp | DstMem | SrcNone | ModRM | Lock),
2419 N, N, N, N, N, N,
2420};
2421
2422static struct opcode group5[] = {
2423 D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock),
2424 D(SrcMem | ModRM | Stack),
2425 I(SrcMemFAddr | ModRM | ImplicitOps | Stack, em_call_far),
2426 D(SrcMem | ModRM | Stack), D(SrcMemFAddr | ModRM | ImplicitOps),
2427 D(SrcMem | ModRM | Stack), N,
2428};
2429
2430static struct group_dual group7 = { {
2431 N, N, D(ModRM | SrcMem | Priv), D(ModRM | SrcMem | Priv),
2432 D(SrcNone | ModRM | DstMem | Mov), N,
2433 D(SrcMem16 | ModRM | Mov | Priv),
2434 D(SrcMem | ModRM | ByteOp | Priv | NoAccess),
2435}, {
2436 D(SrcNone | ModRM | Priv), N, N, D(SrcNone | ModRM | Priv),
2437 D(SrcNone | ModRM | DstMem | Mov), N,
2438 D(SrcMem16 | ModRM | Mov | Priv), N,
2439} };
2440
2441static struct opcode group8[] = {
2442 N, N, N, N,
2443 D(DstMem | SrcImmByte | ModRM), D(DstMem | SrcImmByte | ModRM | Lock),
2444 D(DstMem | SrcImmByte | ModRM | Lock), D(DstMem | SrcImmByte | ModRM | Lock),
2445};
2446
2447static struct group_dual group9 = { {
2448 N, D(DstMem64 | ModRM | Lock), N, N, N, N, N, N,
2449}, {
2450 N, N, N, N, N, N, N, N,
2451} };
2452
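group7 and group9 above are declared as struct group_dual: one eight-entry row for memory forms (ModRM.mod != 3) and one for register forms (mod == 3). The decoder picks the row first and then indexes it by the ModRM reg field; see the GroupDual handling in x86_decode_insn() further down. A simplified sketch of that lookup (types reduced to the essentials):

/* Pick the mod==3 row for register forms, the other row for memory forms,
 * then index by the ModRM reg field, as the decoder below does. */
struct opc { unsigned flags; };
struct grp_dual { struct opc mod012[8]; struct opc mod3[8]; };

static struct opc group_dual_lookup(const struct grp_dual *g, unsigned char modrm)
{
        unsigned reg = (modrm >> 3) & 7;

        return ((modrm >> 6) == 3) ? g->mod3[reg] : g->mod012[reg];
}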
2453static struct opcode group11[] = {
2454 I(DstMem | SrcImm | ModRM | Mov, em_mov), X7(D(Undefined)),
2455};
2456
2457static struct opcode opcode_table[256] = {
2458 /* 0x00 - 0x07 */
2459 D6ALU(Lock),
2460 D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64),
2461 /* 0x08 - 0x0F */
2462 D6ALU(Lock),
2463 D(ImplicitOps | Stack | No64), N,
2464 /* 0x10 - 0x17 */
2465 D6ALU(Lock),
2466 D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64),
2467 /* 0x18 - 0x1F */
2468 D6ALU(Lock),
2469 D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64),
2470 /* 0x20 - 0x27 */
2471 D6ALU(Lock), N, N,
2472 /* 0x28 - 0x2F */
2473 D6ALU(Lock), N, I(ByteOp | DstAcc | No64, em_das),
2474 /* 0x30 - 0x37 */
2475 D6ALU(Lock), N, N,
2476 /* 0x38 - 0x3F */
2477 D6ALU(0), N, N,
2478 /* 0x40 - 0x4F */
2479 X16(D(DstReg)),
2480 /* 0x50 - 0x57 */
2481 X8(I(SrcReg | Stack, em_push)),
2482 /* 0x58 - 0x5F */
2483 X8(D(DstReg | Stack)),
2484 /* 0x60 - 0x67 */
2485 D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64),
2486 N, D(DstReg | SrcMem32 | ModRM | Mov) /* movsxd (x86/64) */ ,
2487 N, N, N, N,
2488 /* 0x68 - 0x6F */
2489 I(SrcImm | Mov | Stack, em_push),
2490 I(DstReg | SrcMem | ModRM | Src2Imm, em_imul_3op),
2491 I(SrcImmByte | Mov | Stack, em_push),
2492 I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op),
2493 D2bv(DstDI | Mov | String), /* insb, insw/insd */
2494 D2bv(SrcSI | ImplicitOps | String), /* outsb, outsw/outsd */
2495 /* 0x70 - 0x7F */
2496 X16(D(SrcImmByte)),
2497 /* 0x80 - 0x87 */
2498 G(ByteOp | DstMem | SrcImm | ModRM | Group, group1),
2499 G(DstMem | SrcImm | ModRM | Group, group1),
2500 G(ByteOp | DstMem | SrcImm | ModRM | No64 | Group, group1),
2501 G(DstMem | SrcImmByte | ModRM | Group, group1),
2502 D2bv(DstMem | SrcReg | ModRM), D2bv(DstMem | SrcReg | ModRM | Lock),
2503 /* 0x88 - 0x8F */
2504 I2bv(DstMem | SrcReg | ModRM | Mov, em_mov),
2505 I2bv(DstReg | SrcMem | ModRM | Mov, em_mov),
2506 D(DstMem | SrcNone | ModRM | Mov), D(ModRM | SrcMem | NoAccess | DstReg),
2507 D(ImplicitOps | SrcMem16 | ModRM), G(0, group1A),
2508 /* 0x90 - 0x97 */
2509 X8(D(SrcAcc | DstReg)),
2510 /* 0x98 - 0x9F */
2511 D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd),
2512 I(SrcImmFAddr | No64, em_call_far), N,
2513 D(ImplicitOps | Stack), D(ImplicitOps | Stack), N, N,
2514 /* 0xA0 - 0xA7 */
2515 I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov),
2516 I2bv(DstMem | SrcAcc | Mov | MemAbs, em_mov),
2517 I2bv(SrcSI | DstDI | Mov | String, em_mov),
2518 D2bv(SrcSI | DstDI | String),
2519 /* 0xA8 - 0xAF */
2520 D2bv(DstAcc | SrcImm),
2521 I2bv(SrcAcc | DstDI | Mov | String, em_mov),
2522 I2bv(SrcSI | DstAcc | Mov | String, em_mov),
2523 D2bv(SrcAcc | DstDI | String),
2524 /* 0xB0 - 0xB7 */
2525 X8(I(ByteOp | DstReg | SrcImm | Mov, em_mov)),
2526 /* 0xB8 - 0xBF */
2527 X8(I(DstReg | SrcImm | Mov, em_mov)),
2528 /* 0xC0 - 0xC7 */
2529 D2bv(DstMem | SrcImmByte | ModRM),
2530 I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm),
2531 D(ImplicitOps | Stack),
2532 D(DstReg | SrcMemFAddr | ModRM | No64), D(DstReg | SrcMemFAddr | ModRM | No64),
2533 G(ByteOp, group11), G(0, group11),
2534 /* 0xC8 - 0xCF */
2535 N, N, N, D(ImplicitOps | Stack),
2536 D(ImplicitOps), D(SrcImmByte), D(ImplicitOps | No64), D(ImplicitOps),
2537 /* 0xD0 - 0xD7 */
2538 D2bv(DstMem | SrcOne | ModRM), D2bv(DstMem | ModRM),
2539 N, N, N, N,
2540 /* 0xD8 - 0xDF */
2541 N, N, N, N, N, N, N, N,
2542 /* 0xE0 - 0xE7 */
2543 X4(D(SrcImmByte)),
2544 D2bv(SrcImmUByte | DstAcc), D2bv(SrcAcc | DstImmUByte),
2545 /* 0xE8 - 0xEF */
2546 D(SrcImm | Stack), D(SrcImm | ImplicitOps),
2547 D(SrcImmFAddr | No64), D(SrcImmByte | ImplicitOps),
2548 D2bv(SrcNone | DstAcc), D2bv(SrcAcc | ImplicitOps),
2549 /* 0xF0 - 0xF7 */
2550 N, N, N, N,
2551 D(ImplicitOps | Priv), D(ImplicitOps), G(ByteOp, group3), G(0, group3),
2552 /* 0xF8 - 0xFF */
2553 D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), D(ImplicitOps),
2554 D(ImplicitOps), D(ImplicitOps), G(0, group4), G(0, group5),
2555};
2556
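The long identical runs above (X4, X7, X8, X16) come from repetition helpers defined earlier in emulate.c; their exact definitions are outside this hunk, so the following is an assumption about their shape rather than a copy. They simply double up an argument list (GNU-style variadic macros, as the kernel uses):

/* Assumed shape of the repetition helpers used by the tables above. */
#define X2(x...)  x, x
#define X4(x...)  X2(x), X2(x)
#define X8(x...)  X4(x), X4(x)
#define X16(x...) X8(x), X8(x)

/* Example: sixteen identical slots, as in X16(D(DstReg)) for opcodes 0x40..0x4f. */
static const int sixteen_slots[16] = { X16(7) };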
2557static struct opcode twobyte_table[256] = {
2558 /* 0x00 - 0x0F */
2559 N, GD(0, &group7), N, N,
2560 N, D(ImplicitOps), D(ImplicitOps | Priv), N,
2561 D(ImplicitOps | Priv), D(ImplicitOps | Priv), N, N,
2562 N, D(ImplicitOps | ModRM), N, N,
2563 /* 0x10 - 0x1F */
2564 N, N, N, N, N, N, N, N, D(ImplicitOps | ModRM), N, N, N, N, N, N, N,
2565 /* 0x20 - 0x2F */
2566 D(ModRM | DstMem | Priv | Op3264), D(ModRM | DstMem | Priv | Op3264),
2567 D(ModRM | SrcMem | Priv | Op3264), D(ModRM | SrcMem | Priv | Op3264),
2568 N, N, N, N,
2569 N, N, N, N, N, N, N, N,
2570 /* 0x30 - 0x3F */
2571 D(ImplicitOps | Priv), I(ImplicitOps, em_rdtsc),
2572 D(ImplicitOps | Priv), N,
2573 D(ImplicitOps), D(ImplicitOps | Priv), N, N,
2574 N, N, N, N, N, N, N, N,
2575 /* 0x40 - 0x4F */
2576 X16(D(DstReg | SrcMem | ModRM | Mov)),
2577 /* 0x50 - 0x5F */
2578 N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
2579 /* 0x60 - 0x6F */
2580 N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
2581 /* 0x70 - 0x7F */
2582 N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
2583 /* 0x80 - 0x8F */
2584 X16(D(SrcImm)),
2585 /* 0x90 - 0x9F */
2586 X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)),
2587 /* 0xA0 - 0xA7 */
2588 D(ImplicitOps | Stack), D(ImplicitOps | Stack),
2589 N, D(DstMem | SrcReg | ModRM | BitOp),
2590 D(DstMem | SrcReg | Src2ImmByte | ModRM),
2591 D(DstMem | SrcReg | Src2CL | ModRM), N, N,
2592 /* 0xA8 - 0xAF */
2593 D(ImplicitOps | Stack), D(ImplicitOps | Stack),
2594 N, D(DstMem | SrcReg | ModRM | BitOp | Lock),
2595 D(DstMem | SrcReg | Src2ImmByte | ModRM),
2596 D(DstMem | SrcReg | Src2CL | ModRM),
2597 D(ModRM), I(DstReg | SrcMem | ModRM, em_imul),
2598 /* 0xB0 - 0xB7 */
2599 D2bv(DstMem | SrcReg | ModRM | Lock),
2600 D(DstReg | SrcMemFAddr | ModRM), D(DstMem | SrcReg | ModRM | BitOp | Lock),
2601 D(DstReg | SrcMemFAddr | ModRM), D(DstReg | SrcMemFAddr | ModRM),
2602 D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
2603 /* 0xB8 - 0xBF */
2604 N, N,
2605 G(BitOp, group8), D(DstMem | SrcReg | ModRM | BitOp | Lock),
2606 D(DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM),
2607 D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
2608 /* 0xC0 - 0xCF */
2609 D2bv(DstMem | SrcReg | ModRM | Lock),
2610 N, D(DstMem | SrcReg | ModRM | Mov),
2611 N, N, N, GD(0, &group9),
2612 N, N, N, N, N, N, N, N,
2613 /* 0xD0 - 0xDF */
2614 N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
2615 /* 0xE0 - 0xEF */
2616 N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
2617 /* 0xF0 - 0xFF */
2618 N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N
2619};
2620
2621#undef D
2622#undef N
2623#undef G
2624#undef GD
2625#undef I
2626
2627#undef D2bv
2628#undef I2bv
2629#undef D6ALU
2630
2631static unsigned imm_size(struct decode_cache *c)
2632{
2633 unsigned size;
2634
2635 size = (c->d & ByteOp) ? 1 : c->op_bytes;
2636 if (size == 8)
2637 size = 4;
2638 return size;
2639}
2640
2641static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op,
2642 unsigned size, bool sign_extension)
2643{
2644 struct decode_cache *c = &ctxt->decode;
2645 struct x86_emulate_ops *ops = ctxt->ops;
2646 int rc = X86EMUL_CONTINUE;
2647
2648 op->type = OP_IMM;
2649 op->bytes = size;
2650 op->addr.mem = c->eip;
2651 /* NB. Immediates are sign-extended as necessary. */
2652 switch (op->bytes) {
2653 case 1:
2654 op->val = insn_fetch(s8, 1, c->eip);
2655 break;
2656 case 2:
2657 op->val = insn_fetch(s16, 2, c->eip);
2658 break;
2659 case 4:
2660 op->val = insn_fetch(s32, 4, c->eip);
2661 break;
2662 }
2663 if (!sign_extension) {
2664 switch (op->bytes) {
2665 case 1:
2666 op->val &= 0xff;
2667 break;
2668 case 2:
2669 op->val &= 0xffff;
2670 break;
2671 case 4:
2672 op->val &= 0xffffffff;
2673 break;
2674 }
2675 }
2676done:
2677 return rc;
2556} 2678}
2557 2679
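decode_imm() above always fetches the immediate through a signed type and then masks the value back down when sign_extension is false, which lets SrcImm and SrcImmU share one path. A sketch of the same fetch/extend/mask sequence over a plain byte buffer, little-endian assumed:

#include <stdint.h>

/* Fetch a 1-, 2- or 4-byte immediate, sign- or zero-extending as requested. */
static uint64_t fetch_imm(const uint8_t *p, unsigned size, int sign_extend)
{
        int64_t val = 0;

        switch (size) {
        case 1:
                val = (int8_t)p[0];
                break;
        case 2:
                val = (int16_t)(p[0] | (p[1] << 8));
                break;
        case 4:
                val = (int32_t)((uint32_t)p[0] | ((uint32_t)p[1] << 8) |
                                ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24));
                break;
        }
        if (!sign_extend) {                     /* SrcImmU / SrcImmUByte style */
                switch (size) {
                case 1: val &= 0xff;       break;
                case 2: val &= 0xffff;     break;
                case 4: val &= 0xffffffff; break;
                }
        }
        return (uint64_t)val;
}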
2558int 2680int
2559x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) 2681x86_decode_insn(struct x86_emulate_ctxt *ctxt)
2560{ 2682{
2683 struct x86_emulate_ops *ops = ctxt->ops;
2684 struct decode_cache *c = &ctxt->decode;
2685 int rc = X86EMUL_CONTINUE;
2686 int mode = ctxt->mode;
2687 int def_op_bytes, def_ad_bytes, dual, goffset;
2688 struct opcode opcode, *g_mod012, *g_mod3;
2689 struct operand memop = { .type = OP_NONE };
2690
2691 c->eip = ctxt->eip;
2692 c->fetch.start = c->fetch.end = c->eip;
2693 ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS);
2694
2695 switch (mode) {
2696 case X86EMUL_MODE_REAL:
2697 case X86EMUL_MODE_VM86:
2698 case X86EMUL_MODE_PROT16:
2699 def_op_bytes = def_ad_bytes = 2;
2700 break;
2701 case X86EMUL_MODE_PROT32:
2702 def_op_bytes = def_ad_bytes = 4;
2703 break;
2704#ifdef CONFIG_X86_64
2705 case X86EMUL_MODE_PROT64:
2706 def_op_bytes = 4;
2707 def_ad_bytes = 8;
2708 break;
2709#endif
2710 default:
2711 return -1;
2712 }
2713
2714 c->op_bytes = def_op_bytes;
2715 c->ad_bytes = def_ad_bytes;
2716
2717 /* Legacy prefixes. */
2718 for (;;) {
2719 switch (c->b = insn_fetch(u8, 1, c->eip)) {
2720 case 0x66: /* operand-size override */
2721 /* switch between 2/4 bytes */
2722 c->op_bytes = def_op_bytes ^ 6;
2723 break;
2724 case 0x67: /* address-size override */
2725 if (mode == X86EMUL_MODE_PROT64)
2726 /* switch between 4/8 bytes */
2727 c->ad_bytes = def_ad_bytes ^ 12;
2728 else
2729 /* switch between 2/4 bytes */
2730 c->ad_bytes = def_ad_bytes ^ 6;
2731 break;
2732 case 0x26: /* ES override */
2733 case 0x2e: /* CS override */
2734 case 0x36: /* SS override */
2735 case 0x3e: /* DS override */
2736 set_seg_override(c, (c->b >> 3) & 3);
2737 break;
2738 case 0x64: /* FS override */
2739 case 0x65: /* GS override */
2740 set_seg_override(c, c->b & 7);
2741 break;
2742 case 0x40 ... 0x4f: /* REX */
2743 if (mode != X86EMUL_MODE_PROT64)
2744 goto done_prefixes;
2745 c->rex_prefix = c->b;
2746 continue;
2747 case 0xf0: /* LOCK */
2748 c->lock_prefix = 1;
2749 break;
2750 case 0xf2: /* REPNE/REPNZ */
2751 c->rep_prefix = REPNE_PREFIX;
2752 break;
2753 case 0xf3: /* REP/REPE/REPZ */
2754 c->rep_prefix = REPE_PREFIX;
2755 break;
2756 default:
2757 goto done_prefixes;
2758 }
2759
2760 /* Any legacy prefix after a REX prefix nullifies its effect. */
2761
2762 c->rex_prefix = 0;
2763 }
2764
2765done_prefixes:
2766
2767 /* REX prefix. */
2768 if (c->rex_prefix & 8)
2769 c->op_bytes = 8; /* REX.W */
2770
2771 /* Opcode byte(s). */
2772 opcode = opcode_table[c->b];
2773 /* Two-byte opcode? */
2774 if (c->b == 0x0f) {
2775 c->twobyte = 1;
2776 c->b = insn_fetch(u8, 1, c->eip);
2777 opcode = twobyte_table[c->b];
2778 }
2779 c->d = opcode.flags;
2780
2781 if (c->d & Group) {
2782 dual = c->d & GroupDual;
2783 c->modrm = insn_fetch(u8, 1, c->eip);
2784 --c->eip;
2785
2786 if (c->d & GroupDual) {
2787 g_mod012 = opcode.u.gdual->mod012;
2788 g_mod3 = opcode.u.gdual->mod3;
2789 } else
2790 g_mod012 = g_mod3 = opcode.u.group;
2791
2792 c->d &= ~(Group | GroupDual);
2793
2794 goffset = (c->modrm >> 3) & 7;
2795
2796 if ((c->modrm >> 6) == 3)
2797 opcode = g_mod3[goffset];
2798 else
2799 opcode = g_mod012[goffset];
2800 c->d |= opcode.flags;
2801 }
2802
2803 c->execute = opcode.u.execute;
2804
2805 /* Unrecognised? */
2806 if (c->d == 0 || (c->d & Undefined)) {
2807 DPRINTF("Cannot emulate %02x\n", c->b);
2808 return -1;
2809 }
2810
2811 if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack))
2812 c->op_bytes = 8;
2813
2814 if (c->d & Op3264) {
2815 if (mode == X86EMUL_MODE_PROT64)
2816 c->op_bytes = 8;
2817 else
2818 c->op_bytes = 4;
2819 }
2820
2821 /* ModRM and SIB bytes. */
2822 if (c->d & ModRM) {
2823 rc = decode_modrm(ctxt, ops, &memop);
2824 if (!c->has_seg_override)
2825 set_seg_override(c, c->modrm_seg);
2826 } else if (c->d & MemAbs)
2827 rc = decode_abs(ctxt, ops, &memop);
2828 if (rc != X86EMUL_CONTINUE)
2829 goto done;
2830
2831 if (!c->has_seg_override)
2832 set_seg_override(c, VCPU_SREG_DS);
2833
2834 if (memop.type == OP_MEM && !(!c->twobyte && c->b == 0x8d))
2835 memop.addr.mem += seg_override_base(ctxt, ops, c);
2836
2837 if (memop.type == OP_MEM && c->ad_bytes != 8)
2838 memop.addr.mem = (u32)memop.addr.mem;
2839
2840 if (memop.type == OP_MEM && c->rip_relative)
2841 memop.addr.mem += c->eip;
2842
2843 /*
2844 * Decode and fetch the source operand: register, memory
2845 * or immediate.
2846 */
2847 switch (c->d & SrcMask) {
2848 case SrcNone:
2849 break;
2850 case SrcReg:
2851 decode_register_operand(&c->src, c, 0);
2852 break;
2853 case SrcMem16:
2854 memop.bytes = 2;
2855 goto srcmem_common;
2856 case SrcMem32:
2857 memop.bytes = 4;
2858 goto srcmem_common;
2859 case SrcMem:
2860 memop.bytes = (c->d & ByteOp) ? 1 :
2861 c->op_bytes;
2862 srcmem_common:
2863 c->src = memop;
2864 break;
2865 case SrcImmU16:
2866 rc = decode_imm(ctxt, &c->src, 2, false);
2867 break;
2868 case SrcImm:
2869 rc = decode_imm(ctxt, &c->src, imm_size(c), true);
2870 break;
2871 case SrcImmU:
2872 rc = decode_imm(ctxt, &c->src, imm_size(c), false);
2873 break;
2874 case SrcImmByte:
2875 rc = decode_imm(ctxt, &c->src, 1, true);
2876 break;
2877 case SrcImmUByte:
2878 rc = decode_imm(ctxt, &c->src, 1, false);
2879 break;
2880 case SrcAcc:
2881 c->src.type = OP_REG;
2882 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
2883 c->src.addr.reg = &c->regs[VCPU_REGS_RAX];
2884 fetch_register_operand(&c->src);
2885 break;
2886 case SrcOne:
2887 c->src.bytes = 1;
2888 c->src.val = 1;
2889 break;
2890 case SrcSI:
2891 c->src.type = OP_MEM;
2892 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
2893 c->src.addr.mem =
2894 register_address(c, seg_override_base(ctxt, ops, c),
2895 c->regs[VCPU_REGS_RSI]);
2896 c->src.val = 0;
2897 break;
2898 case SrcImmFAddr:
2899 c->src.type = OP_IMM;
2900 c->src.addr.mem = c->eip;
2901 c->src.bytes = c->op_bytes + 2;
2902 insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip);
2903 break;
2904 case SrcMemFAddr:
2905 memop.bytes = c->op_bytes + 2;
2906 goto srcmem_common;
2907 break;
2908 }
2909
2910 if (rc != X86EMUL_CONTINUE)
2911 goto done;
2912
2913 /*
2914 * Decode and fetch the second source operand: register, memory
2915 * or immediate.
2916 */
2917 switch (c->d & Src2Mask) {
2918 case Src2None:
2919 break;
2920 case Src2CL:
2921 c->src2.bytes = 1;
2922 c->src2.val = c->regs[VCPU_REGS_RCX] & 0x8;
2923 break;
2924 case Src2ImmByte:
2925 rc = decode_imm(ctxt, &c->src2, 1, true);
2926 break;
2927 case Src2One:
2928 c->src2.bytes = 1;
2929 c->src2.val = 1;
2930 break;
2931 case Src2Imm:
2932 rc = decode_imm(ctxt, &c->src2, imm_size(c), true);
2933 break;
2934 }
2935
2936 if (rc != X86EMUL_CONTINUE)
2937 goto done;
2938
2939 /* Decode and fetch the destination operand: register or memory. */
2940 switch (c->d & DstMask) {
2941 case DstReg:
2942 decode_register_operand(&c->dst, c,
2943 c->twobyte && (c->b == 0xb6 || c->b == 0xb7));
2944 break;
2945 case DstImmUByte:
2946 c->dst.type = OP_IMM;
2947 c->dst.addr.mem = c->eip;
2948 c->dst.bytes = 1;
2949 c->dst.val = insn_fetch(u8, 1, c->eip);
2950 break;
2951 case DstMem:
2952 case DstMem64:
2953 c->dst = memop;
2954 if ((c->d & DstMask) == DstMem64)
2955 c->dst.bytes = 8;
2956 else
2957 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
2958 if (c->d & BitOp)
2959 fetch_bit_operand(c);
2960 c->dst.orig_val = c->dst.val;
2961 break;
2962 case DstAcc:
2963 c->dst.type = OP_REG;
2964 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
2965 c->dst.addr.reg = &c->regs[VCPU_REGS_RAX];
2966 fetch_register_operand(&c->dst);
2967 c->dst.orig_val = c->dst.val;
2968 break;
2969 case DstDI:
2970 c->dst.type = OP_MEM;
2971 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
2972 c->dst.addr.mem =
2973 register_address(c, es_base(ctxt, ops),
2974 c->regs[VCPU_REGS_RDI]);
2975 c->dst.val = 0;
2976 break;
2977 case ImplicitOps:
2978 /* Special instructions do their own operand decoding. */
2979 default:
2980 c->dst.type = OP_NONE; /* Disable writeback. */
2981 return 0;
2982 }
2983
2984done:
2985 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
2986}
2987
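One detail of the prefix handling in x86_decode_insn() above: the 0x66 and 0x67 overrides toggle the effective sizes with an XOR, since x ^ 6 swaps 2 and 4 and x ^ 12 swaps 4 and 8. A two-line check of the identity:

#include <assert.h>

int main(void)
{
        /* operand-size override: 2 <-> 4 */
        assert((2 ^ 6) == 4 && (4 ^ 6) == 2);
        /* address-size override in long mode: 4 <-> 8 */
        assert((8 ^ 12) == 4 && (4 ^ 12) == 8);
        return 0;
}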
2988static bool string_insn_completed(struct x86_emulate_ctxt *ctxt)
2989{
2990 struct decode_cache *c = &ctxt->decode;
2991
2992 /* The second termination condition only applies for REPE
2993 * and REPNE. Test if the repeat string operation prefix is
2994 * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the
2995 * corresponding termination condition according to:
2996 * - if REPE/REPZ and ZF = 0 then done
2997 * - if REPNE/REPNZ and ZF = 1 then done
2998 */
2999 if (((c->b == 0xa6) || (c->b == 0xa7) ||
3000 (c->b == 0xae) || (c->b == 0xaf))
3001 && (((c->rep_prefix == REPE_PREFIX) &&
3002 ((ctxt->eflags & EFLG_ZF) == 0))
3003 || ((c->rep_prefix == REPNE_PREFIX) &&
3004 ((ctxt->eflags & EFLG_ZF) == EFLG_ZF))))
3005 return true;
3006
3007 return false;
3008}
3009
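string_insn_completed() above carries the second REP termination rule out of the main loop: only cmps (0xa6/0xa7) and scas (0xae/0xaf) consult ZF, with REPE/REPZ stopping on ZF == 0 and REPNE/REPNZ stopping on ZF == 1; the RCX == 0 test stays in x86_emulate_insn(). The predicate restated on its own:

#include <stdbool.h>

enum rep_kind { REP_NONE, REP_E, REP_NE };

/* Second REP termination condition: only CMPS/SCAS look at ZF. */
static bool rep_zf_terminates(unsigned char opcode, enum rep_kind rep, bool zf)
{
        bool is_cmps_or_scas = (opcode == 0xa6 || opcode == 0xa7 ||
                                opcode == 0xae || opcode == 0xaf);

        if (!is_cmps_or_scas)
                return false;
        if (rep == REP_E && !zf)
                return true;    /* repe/repz stops when ZF == 0   */
        if (rep == REP_NE && zf)
                return true;    /* repne/repnz stops when ZF == 1 */
        return false;
}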
3010int
3011x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
3012{
3013 struct x86_emulate_ops *ops = ctxt->ops;
2561 u64 msr_data; 3014 u64 msr_data;
2562 struct decode_cache *c = &ctxt->decode; 3015 struct decode_cache *c = &ctxt->decode;
2563 int rc = X86EMUL_CONTINUE; 3016 int rc = X86EMUL_CONTINUE;
2564 int saved_dst_type = c->dst.type; 3017 int saved_dst_type = c->dst.type;
3018 int irq; /* Used for int 3, int, and into */
2565 3019
2566 ctxt->decode.mem_read.pos = 0; 3020 ctxt->decode.mem_read.pos = 0;
2567 3021
@@ -2576,6 +3030,11 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
2576 goto done; 3030 goto done;
2577 } 3031 }
2578 3032
3033 if ((c->d & SrcMask) == SrcMemFAddr && c->src.type != OP_MEM) {
3034 emulate_ud(ctxt);
3035 goto done;
3036 }
3037
2579 /* Privileged instruction can be executed only in CPL=0 */ 3038 /* Privileged instruction can be executed only in CPL=0 */
2580 if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) { 3039 if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) {
2581 emulate_gp(ctxt, 0); 3040 emulate_gp(ctxt, 0);
@@ -2583,35 +3042,15 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
2583 } 3042 }
2584 3043
2585 if (c->rep_prefix && (c->d & String)) { 3044 if (c->rep_prefix && (c->d & String)) {
2586 ctxt->restart = true;
2587 /* All REP prefixes have the same first termination condition */ 3045 /* All REP prefixes have the same first termination condition */
2588 if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) { 3046 if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) {
2589 string_done:
2590 ctxt->restart = false;
2591 ctxt->eip = c->eip; 3047 ctxt->eip = c->eip;
2592 goto done; 3048 goto done;
2593 } 3049 }
2594 /* The second termination condition only applies for REPE
2595 * and REPNE. Test if the repeat string operation prefix is
2596 * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the
2597 * corresponding termination condition according to:
2598 * - if REPE/REPZ and ZF = 0 then done
2599 * - if REPNE/REPNZ and ZF = 1 then done
2600 */
2601 if ((c->b == 0xa6) || (c->b == 0xa7) ||
2602 (c->b == 0xae) || (c->b == 0xaf)) {
2603 if ((c->rep_prefix == REPE_PREFIX) &&
2604 ((ctxt->eflags & EFLG_ZF) == 0))
2605 goto string_done;
2606 if ((c->rep_prefix == REPNE_PREFIX) &&
2607 ((ctxt->eflags & EFLG_ZF) == EFLG_ZF))
2608 goto string_done;
2609 }
2610 c->eip = ctxt->eip;
2611 } 3050 }
2612 3051
2613 if (c->src.type == OP_MEM) { 3052 if ((c->src.type == OP_MEM) && !(c->d & NoAccess)) {
2614 rc = read_emulated(ctxt, ops, (unsigned long)c->src.ptr, 3053 rc = read_emulated(ctxt, ops, c->src.addr.mem,
2615 c->src.valptr, c->src.bytes); 3054 c->src.valptr, c->src.bytes);
2616 if (rc != X86EMUL_CONTINUE) 3055 if (rc != X86EMUL_CONTINUE)
2617 goto done; 3056 goto done;
@@ -2619,7 +3058,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
2619 } 3058 }
2620 3059
2621 if (c->src2.type == OP_MEM) { 3060 if (c->src2.type == OP_MEM) {
2622 rc = read_emulated(ctxt, ops, (unsigned long)c->src2.ptr, 3061 rc = read_emulated(ctxt, ops, c->src2.addr.mem,
2623 &c->src2.val, c->src2.bytes); 3062 &c->src2.val, c->src2.bytes);
2624 if (rc != X86EMUL_CONTINUE) 3063 if (rc != X86EMUL_CONTINUE)
2625 goto done; 3064 goto done;
@@ -2631,7 +3070,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
2631 3070
2632 if ((c->dst.type == OP_MEM) && !(c->d & Mov)) { 3071 if ((c->dst.type == OP_MEM) && !(c->d & Mov)) {
2633 /* optimisation - avoid slow emulated read if Mov */ 3072 /* optimisation - avoid slow emulated read if Mov */
2634 rc = read_emulated(ctxt, ops, (unsigned long)c->dst.ptr, 3073 rc = read_emulated(ctxt, ops, c->dst.addr.mem,
2635 &c->dst.val, c->dst.bytes); 3074 &c->dst.val, c->dst.bytes);
2636 if (rc != X86EMUL_CONTINUE) 3075 if (rc != X86EMUL_CONTINUE)
2637 goto done; 3076 goto done;
@@ -2640,6 +3079,13 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
2640 3079
2641special_insn: 3080special_insn:
2642 3081
3082 if (c->execute) {
3083 rc = c->execute(ctxt);
3084 if (rc != X86EMUL_CONTINUE)
3085 goto done;
3086 goto writeback;
3087 }
3088
2643 if (c->twobyte) 3089 if (c->twobyte)
2644 goto twobyte_insn; 3090 goto twobyte_insn;
2645 3091
@@ -2653,8 +3099,6 @@ special_insn:
2653 break; 3099 break;
2654 case 0x07: /* pop es */ 3100 case 0x07: /* pop es */
2655 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES); 3101 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES);
2656 if (rc != X86EMUL_CONTINUE)
2657 goto done;
2658 break; 3102 break;
2659 case 0x08 ... 0x0d: 3103 case 0x08 ... 0x0d:
2660 or: /* or */ 3104 or: /* or */
@@ -2672,8 +3116,6 @@ special_insn:
2672 break; 3116 break;
2673 case 0x17: /* pop ss */ 3117 case 0x17: /* pop ss */
2674 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS); 3118 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS);
2675 if (rc != X86EMUL_CONTINUE)
2676 goto done;
2677 break; 3119 break;
2678 case 0x18 ... 0x1d: 3120 case 0x18 ... 0x1d:
2679 sbb: /* sbb */ 3121 sbb: /* sbb */
@@ -2684,8 +3126,6 @@ special_insn:
2684 break; 3126 break;
2685 case 0x1f: /* pop ds */ 3127 case 0x1f: /* pop ds */
2686 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS); 3128 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS);
2687 if (rc != X86EMUL_CONTINUE)
2688 goto done;
2689 break; 3129 break;
2690 case 0x20 ... 0x25: 3130 case 0x20 ... 0x25:
2691 and: /* and */ 3131 and: /* and */
@@ -2709,58 +3149,29 @@ special_insn:
2709 case 0x48 ... 0x4f: /* dec r16/r32 */ 3149 case 0x48 ... 0x4f: /* dec r16/r32 */
2710 emulate_1op("dec", c->dst, ctxt->eflags); 3150 emulate_1op("dec", c->dst, ctxt->eflags);
2711 break; 3151 break;
2712 case 0x50 ... 0x57: /* push reg */
2713 emulate_push(ctxt, ops);
2714 break;
2715 case 0x58 ... 0x5f: /* pop reg */ 3152 case 0x58 ... 0x5f: /* pop reg */
2716 pop_instruction: 3153 pop_instruction:
2717 rc = emulate_pop(ctxt, ops, &c->dst.val, c->op_bytes); 3154 rc = emulate_pop(ctxt, ops, &c->dst.val, c->op_bytes);
2718 if (rc != X86EMUL_CONTINUE)
2719 goto done;
2720 break; 3155 break;
2721 case 0x60: /* pusha */ 3156 case 0x60: /* pusha */
2722 rc = emulate_pusha(ctxt, ops); 3157 rc = emulate_pusha(ctxt, ops);
2723 if (rc != X86EMUL_CONTINUE)
2724 goto done;
2725 break; 3158 break;
2726 case 0x61: /* popa */ 3159 case 0x61: /* popa */
2727 rc = emulate_popa(ctxt, ops); 3160 rc = emulate_popa(ctxt, ops);
2728 if (rc != X86EMUL_CONTINUE)
2729 goto done;
2730 break; 3161 break;
2731 case 0x63: /* movsxd */ 3162 case 0x63: /* movsxd */
2732 if (ctxt->mode != X86EMUL_MODE_PROT64) 3163 if (ctxt->mode != X86EMUL_MODE_PROT64)
2733 goto cannot_emulate; 3164 goto cannot_emulate;
2734 c->dst.val = (s32) c->src.val; 3165 c->dst.val = (s32) c->src.val;
2735 break; 3166 break;
2736 case 0x68: /* push imm */
2737 case 0x6a: /* push imm8 */
2738 emulate_push(ctxt, ops);
2739 break;
2740 case 0x6c: /* insb */ 3167 case 0x6c: /* insb */
2741 case 0x6d: /* insw/insd */ 3168 case 0x6d: /* insw/insd */
2742 c->dst.bytes = min(c->dst.bytes, 4u); 3169 c->src.val = c->regs[VCPU_REGS_RDX];
2743 if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], 3170 goto do_io_in;
2744 c->dst.bytes)) {
2745 emulate_gp(ctxt, 0);
2746 goto done;
2747 }
2748 if (!pio_in_emulated(ctxt, ops, c->dst.bytes,
2749 c->regs[VCPU_REGS_RDX], &c->dst.val))
2750 goto done; /* IO is needed, skip writeback */
2751 break;
2752 case 0x6e: /* outsb */ 3171 case 0x6e: /* outsb */
2753 case 0x6f: /* outsw/outsd */ 3172 case 0x6f: /* outsw/outsd */
2754 c->src.bytes = min(c->src.bytes, 4u); 3173 c->dst.val = c->regs[VCPU_REGS_RDX];
2755 if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], 3174 goto do_io_out;
2756 c->src.bytes)) {
2757 emulate_gp(ctxt, 0);
2758 goto done;
2759 }
2760 ops->pio_out_emulated(c->src.bytes, c->regs[VCPU_REGS_RDX],
2761 &c->src.val, 1, ctxt->vcpu);
2762
2763 c->dst.type = OP_NONE; /* nothing to writeback */
2764 break; 3175 break;
2765 case 0x70 ... 0x7f: /* jcc (short) */ 3176 case 0x70 ... 0x7f: /* jcc (short) */
2766 if (test_cc(c->b, ctxt->eflags)) 3177 if (test_cc(c->b, ctxt->eflags))
@@ -2793,29 +3204,15 @@ special_insn:
2793 case 0x86 ... 0x87: /* xchg */ 3204 case 0x86 ... 0x87: /* xchg */
2794 xchg: 3205 xchg:
2795 /* Write back the register source. */ 3206 /* Write back the register source. */
2796 switch (c->dst.bytes) { 3207 c->src.val = c->dst.val;
2797 case 1: 3208 write_register_operand(&c->src);
2798 *(u8 *) c->src.ptr = (u8) c->dst.val;
2799 break;
2800 case 2:
2801 *(u16 *) c->src.ptr = (u16) c->dst.val;
2802 break;
2803 case 4:
2804 *c->src.ptr = (u32) c->dst.val;
2805 break; /* 64b reg: zero-extend */
2806 case 8:
2807 *c->src.ptr = c->dst.val;
2808 break;
2809 }
2810 /* 3209 /*
2811 * Write back the memory destination with implicit LOCK 3210 * Write back the memory destination with implicit LOCK
2812 * prefix. 3211 * prefix.
2813 */ 3212 */
2814 c->dst.val = c->src.val; 3213 c->dst.val = c->src.orig_val;
2815 c->lock_prefix = 1; 3214 c->lock_prefix = 1;
2816 break; 3215 break;
2817 case 0x88 ... 0x8b: /* mov */
2818 goto mov;
2819 case 0x8c: /* mov r/m, sreg */ 3216 case 0x8c: /* mov r/m, sreg */
2820 if (c->modrm_reg > VCPU_SREG_GS) { 3217 if (c->modrm_reg > VCPU_SREG_GS) {
2821 emulate_ud(ctxt); 3218 emulate_ud(ctxt);
@@ -2824,7 +3221,7 @@ special_insn:
2824 c->dst.val = ops->get_segment_selector(c->modrm_reg, ctxt->vcpu); 3221 c->dst.val = ops->get_segment_selector(c->modrm_reg, ctxt->vcpu);
2825 break; 3222 break;
2826 case 0x8d: /* lea r16/r32, m */ 3223 case 0x8d: /* lea r16/r32, m */
2827 c->dst.val = c->modrm_ea; 3224 c->dst.val = c->src.addr.mem;
2828 break; 3225 break;
2829 case 0x8e: { /* mov seg, r/m16 */ 3226 case 0x8e: { /* mov seg, r/m16 */
2830 uint16_t sel; 3227 uint16_t sel;
@@ -2847,76 +3244,87 @@ special_insn:
2847 } 3244 }
2848 case 0x8f: /* pop (sole member of Grp1a) */ 3245 case 0x8f: /* pop (sole member of Grp1a) */
2849 rc = emulate_grp1a(ctxt, ops); 3246 rc = emulate_grp1a(ctxt, ops);
2850 if (rc != X86EMUL_CONTINUE)
2851 goto done;
2852 break; 3247 break;
2853 case 0x90: /* nop / xchg r8,rax */ 3248 case 0x90 ... 0x97: /* nop / xchg reg, rax */
2854 if (c->dst.ptr == (unsigned long *)&c->regs[VCPU_REGS_RAX]) { 3249 if (c->dst.addr.reg == &c->regs[VCPU_REGS_RAX])
2855 c->dst.type = OP_NONE; /* nop */
2856 break; 3250 break;
2857 }
2858 case 0x91 ... 0x97: /* xchg reg,rax */
2859 c->src.type = OP_REG;
2860 c->src.bytes = c->op_bytes;
2861 c->src.ptr = (unsigned long *) &c->regs[VCPU_REGS_RAX];
2862 c->src.val = *(c->src.ptr);
2863 goto xchg; 3251 goto xchg;
3252 case 0x98: /* cbw/cwde/cdqe */
3253 switch (c->op_bytes) {
3254 case 2: c->dst.val = (s8)c->dst.val; break;
3255 case 4: c->dst.val = (s16)c->dst.val; break;
3256 case 8: c->dst.val = (s32)c->dst.val; break;
3257 }
3258 break;
2864 case 0x9c: /* pushf */ 3259 case 0x9c: /* pushf */
2865 c->src.val = (unsigned long) ctxt->eflags; 3260 c->src.val = (unsigned long) ctxt->eflags;
2866 emulate_push(ctxt, ops); 3261 emulate_push(ctxt, ops);
2867 break; 3262 break;
2868 case 0x9d: /* popf */ 3263 case 0x9d: /* popf */
2869 c->dst.type = OP_REG; 3264 c->dst.type = OP_REG;
2870 c->dst.ptr = (unsigned long *) &ctxt->eflags; 3265 c->dst.addr.reg = &ctxt->eflags;
2871 c->dst.bytes = c->op_bytes; 3266 c->dst.bytes = c->op_bytes;
2872 rc = emulate_popf(ctxt, ops, &c->dst.val, c->op_bytes); 3267 rc = emulate_popf(ctxt, ops, &c->dst.val, c->op_bytes);
2873 if (rc != X86EMUL_CONTINUE)
2874 goto done;
2875 break; 3268 break;
2876 case 0xa0 ... 0xa3: /* mov */
2877 case 0xa4 ... 0xa5: /* movs */
2878 goto mov;
2879 case 0xa6 ... 0xa7: /* cmps */ 3269 case 0xa6 ... 0xa7: /* cmps */
2880 c->dst.type = OP_NONE; /* Disable writeback. */ 3270 c->dst.type = OP_NONE; /* Disable writeback. */
2881 DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr); 3271 DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.addr.mem, c->dst.addr.mem);
2882 goto cmp; 3272 goto cmp;
2883 case 0xa8 ... 0xa9: /* test ax, imm */ 3273 case 0xa8 ... 0xa9: /* test ax, imm */
2884 goto test; 3274 goto test;
2885 case 0xaa ... 0xab: /* stos */
2886 c->dst.val = c->regs[VCPU_REGS_RAX];
2887 break;
2888 case 0xac ... 0xad: /* lods */
2889 goto mov;
2890 case 0xae ... 0xaf: /* scas */ 3275 case 0xae ... 0xaf: /* scas */
2891 DPRINTF("Urk! I don't handle SCAS.\n"); 3276 goto cmp;
2892 goto cannot_emulate;
2893 case 0xb0 ... 0xbf: /* mov r, imm */
2894 goto mov;
2895 case 0xc0 ... 0xc1: 3277 case 0xc0 ... 0xc1:
2896 emulate_grp2(ctxt); 3278 emulate_grp2(ctxt);
2897 break; 3279 break;
2898 case 0xc3: /* ret */ 3280 case 0xc3: /* ret */
2899 c->dst.type = OP_REG; 3281 c->dst.type = OP_REG;
2900 c->dst.ptr = &c->eip; 3282 c->dst.addr.reg = &c->eip;
2901 c->dst.bytes = c->op_bytes; 3283 c->dst.bytes = c->op_bytes;
2902 goto pop_instruction; 3284 goto pop_instruction;
2903 case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */ 3285 case 0xc4: /* les */
2904 mov: 3286 rc = emulate_load_segment(ctxt, ops, VCPU_SREG_ES);
2905 c->dst.val = c->src.val; 3287 break;
3288 case 0xc5: /* lds */
3289 rc = emulate_load_segment(ctxt, ops, VCPU_SREG_DS);
2906 break; 3290 break;
2907 case 0xcb: /* ret far */ 3291 case 0xcb: /* ret far */
2908 rc = emulate_ret_far(ctxt, ops); 3292 rc = emulate_ret_far(ctxt, ops);
2909 if (rc != X86EMUL_CONTINUE) 3293 break;
2910 goto done; 3294 case 0xcc: /* int3 */
3295 irq = 3;
3296 goto do_interrupt;
3297 case 0xcd: /* int n */
3298 irq = c->src.val;
3299 do_interrupt:
3300 rc = emulate_int(ctxt, ops, irq);
3301 break;
3302 case 0xce: /* into */
3303 if (ctxt->eflags & EFLG_OF) {
3304 irq = 4;
3305 goto do_interrupt;
3306 }
3307 break;
3308 case 0xcf: /* iret */
3309 rc = emulate_iret(ctxt, ops);
2911 break; 3310 break;
2912 case 0xd0 ... 0xd1: /* Grp2 */ 3311 case 0xd0 ... 0xd1: /* Grp2 */
2913 c->src.val = 1;
2914 emulate_grp2(ctxt); 3312 emulate_grp2(ctxt);
2915 break; 3313 break;
2916 case 0xd2 ... 0xd3: /* Grp2 */ 3314 case 0xd2 ... 0xd3: /* Grp2 */
2917 c->src.val = c->regs[VCPU_REGS_RCX]; 3315 c->src.val = c->regs[VCPU_REGS_RCX];
2918 emulate_grp2(ctxt); 3316 emulate_grp2(ctxt);
2919 break; 3317 break;
3318 case 0xe0 ... 0xe2: /* loop/loopz/loopnz */
3319 register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1);
3320 if (address_mask(c, c->regs[VCPU_REGS_RCX]) != 0 &&
3321 (c->b == 0xe2 || test_cc(c->b ^ 0x5, ctxt->eflags)))
3322 jmp_rel(c, c->src.val);
3323 break;
3324 case 0xe3: /* jcxz/jecxz/jrcxz */
3325 if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0)
3326 jmp_rel(c, c->src.val);
3327 break;
2920 case 0xe4: /* inb */ 3328 case 0xe4: /* inb */
2921 case 0xe5: /* in */ 3329 case 0xe5: /* in */
2922 goto do_io_in; 3330 goto do_io_in;
@@ -2964,15 +3372,16 @@ special_insn:
2964 break; 3372 break;
2965 case 0xee: /* out dx,al */ 3373 case 0xee: /* out dx,al */
2966 case 0xef: /* out dx,(e/r)ax */ 3374 case 0xef: /* out dx,(e/r)ax */
2967 c->src.val = c->regs[VCPU_REGS_RDX]; 3375 c->dst.val = c->regs[VCPU_REGS_RDX];
2968 do_io_out: 3376 do_io_out:
2969 c->dst.bytes = min(c->dst.bytes, 4u); 3377 c->src.bytes = min(c->src.bytes, 4u);
2970 if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) { 3378 if (!emulator_io_permited(ctxt, ops, c->dst.val,
3379 c->src.bytes)) {
2971 emulate_gp(ctxt, 0); 3380 emulate_gp(ctxt, 0);
2972 goto done; 3381 goto done;
2973 } 3382 }
2974 ops->pio_out_emulated(c->dst.bytes, c->src.val, &c->dst.val, 1, 3383 ops->pio_out_emulated(c->src.bytes, c->dst.val,
2975 ctxt->vcpu); 3384 &c->src.val, 1, ctxt->vcpu);
2976 c->dst.type = OP_NONE; /* Disable writeback. */ 3385 c->dst.type = OP_NONE; /* Disable writeback. */
2977 break; 3386 break;
2978 case 0xf4: /* hlt */ 3387 case 0xf4: /* hlt */
@@ -2981,24 +3390,22 @@ special_insn:
2981 case 0xf5: /* cmc */ 3390 case 0xf5: /* cmc */
2982 /* complement carry flag from eflags reg */ 3391 /* complement carry flag from eflags reg */
2983 ctxt->eflags ^= EFLG_CF; 3392 ctxt->eflags ^= EFLG_CF;
2984 c->dst.type = OP_NONE; /* Disable writeback. */
2985 break; 3393 break;
2986 case 0xf6 ... 0xf7: /* Grp3 */ 3394 case 0xf6 ... 0xf7: /* Grp3 */
2987 if (!emulate_grp3(ctxt, ops)) 3395 rc = emulate_grp3(ctxt, ops);
2988 goto cannot_emulate;
2989 break; 3396 break;
2990 case 0xf8: /* clc */ 3397 case 0xf8: /* clc */
2991 ctxt->eflags &= ~EFLG_CF; 3398 ctxt->eflags &= ~EFLG_CF;
2992 c->dst.type = OP_NONE; /* Disable writeback. */ 3399 break;
3400 case 0xf9: /* stc */
3401 ctxt->eflags |= EFLG_CF;
2993 break; 3402 break;
2994 case 0xfa: /* cli */ 3403 case 0xfa: /* cli */
2995 if (emulator_bad_iopl(ctxt, ops)) { 3404 if (emulator_bad_iopl(ctxt, ops)) {
2996 emulate_gp(ctxt, 0); 3405 emulate_gp(ctxt, 0);
2997 goto done; 3406 goto done;
2998 } else { 3407 } else
2999 ctxt->eflags &= ~X86_EFLAGS_IF; 3408 ctxt->eflags &= ~X86_EFLAGS_IF;
3000 c->dst.type = OP_NONE; /* Disable writeback. */
3001 }
3002 break; 3409 break;
3003 case 0xfb: /* sti */ 3410 case 0xfb: /* sti */
3004 if (emulator_bad_iopl(ctxt, ops)) { 3411 if (emulator_bad_iopl(ctxt, ops)) {
@@ -3007,29 +3414,29 @@ special_insn:
3007 } else { 3414 } else {
3008 ctxt->interruptibility = KVM_X86_SHADOW_INT_STI; 3415 ctxt->interruptibility = KVM_X86_SHADOW_INT_STI;
3009 ctxt->eflags |= X86_EFLAGS_IF; 3416 ctxt->eflags |= X86_EFLAGS_IF;
3010 c->dst.type = OP_NONE; /* Disable writeback. */
3011 } 3417 }
3012 break; 3418 break;
3013 case 0xfc: /* cld */ 3419 case 0xfc: /* cld */
3014 ctxt->eflags &= ~EFLG_DF; 3420 ctxt->eflags &= ~EFLG_DF;
3015 c->dst.type = OP_NONE; /* Disable writeback. */
3016 break; 3421 break;
3017 case 0xfd: /* std */ 3422 case 0xfd: /* std */
3018 ctxt->eflags |= EFLG_DF; 3423 ctxt->eflags |= EFLG_DF;
3019 c->dst.type = OP_NONE; /* Disable writeback. */
3020 break; 3424 break;
3021 case 0xfe: /* Grp4 */ 3425 case 0xfe: /* Grp4 */
3022 grp45: 3426 grp45:
3023 rc = emulate_grp45(ctxt, ops); 3427 rc = emulate_grp45(ctxt, ops);
3024 if (rc != X86EMUL_CONTINUE)
3025 goto done;
3026 break; 3428 break;
3027 case 0xff: /* Grp5 */ 3429 case 0xff: /* Grp5 */
3028 if (c->modrm_reg == 5) 3430 if (c->modrm_reg == 5)
3029 goto jump_far; 3431 goto jump_far;
3030 goto grp45; 3432 goto grp45;
3433 default:
3434 goto cannot_emulate;
3031 } 3435 }
3032 3436
3437 if (rc != X86EMUL_CONTINUE)
3438 goto done;
3439
3033writeback: 3440writeback:
3034 rc = writeback(ctxt, ops); 3441 rc = writeback(ctxt, ops);
3035 if (rc != X86EMUL_CONTINUE) 3442 if (rc != X86EMUL_CONTINUE)
@@ -3050,25 +3457,32 @@ writeback:
3050 &c->dst); 3457 &c->dst);
3051 3458
3052 if (c->rep_prefix && (c->d & String)) { 3459 if (c->rep_prefix && (c->d & String)) {
3053 struct read_cache *rc = &ctxt->decode.io_read; 3460 struct read_cache *r = &ctxt->decode.io_read;
3054 register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1); 3461 register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1);
3055 /* 3462
3056 * Re-enter guest when pio read ahead buffer is empty or, 3463 if (!string_insn_completed(ctxt)) {
3057 * if it is not used, after each 1024 iteration. 3464 /*
3058 */ 3465 * Re-enter guest when pio read ahead buffer is empty
3059 if ((rc->end == 0 && !(c->regs[VCPU_REGS_RCX] & 0x3ff)) || 3466 * or, if it is not used, after each 1024 iteration.
3060 (rc->end != 0 && rc->end == rc->pos)) 3467 */
3061 ctxt->restart = false; 3468 if ((r->end != 0 || c->regs[VCPU_REGS_RCX] & 0x3ff) &&
3469 (r->end == 0 || r->end != r->pos)) {
3470 /*
3471 * Reset read cache. Usually happens before
3472 * decode, but since instruction is restarted
3473 * we have to do it here.
3474 */
3475 ctxt->decode.mem_read.end = 0;
3476 return EMULATION_RESTART;
3477 }
3478 goto done; /* skip rip writeback */
3479 }
3062 } 3480 }
3063 /* 3481
3064 * reset read cache here in case string instruction is restared
3065 * without decoding
3066 */
3067 ctxt->decode.mem_read.end = 0;
3068 ctxt->eip = c->eip; 3482 ctxt->eip = c->eip;
3069 3483
3070done: 3484done:
3071 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; 3485 return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
3072 3486
3073twobyte_insn: 3487twobyte_insn:
3074 switch (c->b) { 3488 switch (c->b) {
@@ -3091,7 +3505,7 @@ twobyte_insn:
3091 c->dst.type = OP_NONE; 3505 c->dst.type = OP_NONE;
3092 break; 3506 break;
3093 case 2: /* lgdt */ 3507 case 2: /* lgdt */
3094 rc = read_descriptor(ctxt, ops, c->src.ptr, 3508 rc = read_descriptor(ctxt, ops, c->src.addr.mem,
3095 &size, &address, c->op_bytes); 3509 &size, &address, c->op_bytes);
3096 if (rc != X86EMUL_CONTINUE) 3510 if (rc != X86EMUL_CONTINUE)
3097 goto done; 3511 goto done;
@@ -3104,14 +3518,12 @@ twobyte_insn:
3104 switch (c->modrm_rm) { 3518 switch (c->modrm_rm) {
3105 case 1: 3519 case 1:
3106 rc = kvm_fix_hypercall(ctxt->vcpu); 3520 rc = kvm_fix_hypercall(ctxt->vcpu);
3107 if (rc != X86EMUL_CONTINUE)
3108 goto done;
3109 break; 3521 break;
3110 default: 3522 default:
3111 goto cannot_emulate; 3523 goto cannot_emulate;
3112 } 3524 }
3113 } else { 3525 } else {
3114 rc = read_descriptor(ctxt, ops, c->src.ptr, 3526 rc = read_descriptor(ctxt, ops, c->src.addr.mem,
3115 &size, &address, 3527 &size, &address,
3116 c->op_bytes); 3528 c->op_bytes);
3117 if (rc != X86EMUL_CONTINUE) 3529 if (rc != X86EMUL_CONTINUE)
@@ -3126,7 +3538,7 @@ twobyte_insn:
3126 c->dst.val = ops->get_cr(0, ctxt->vcpu); 3538 c->dst.val = ops->get_cr(0, ctxt->vcpu);
3127 break; 3539 break;
3128 case 6: /* lmsw */ 3540 case 6: /* lmsw */
3129 ops->set_cr(0, (ops->get_cr(0, ctxt->vcpu) & ~0x0ful) | 3541 ops->set_cr(0, (ops->get_cr(0, ctxt->vcpu) & ~0x0eul) |
3130 (c->src.val & 0x0f), ctxt->vcpu); 3542 (c->src.val & 0x0f), ctxt->vcpu);
3131 c->dst.type = OP_NONE; 3543 c->dst.type = OP_NONE;
3132 break; 3544 break;
@@ -3134,7 +3546,7 @@ twobyte_insn:
3134 emulate_ud(ctxt); 3546 emulate_ud(ctxt);
3135 goto done; 3547 goto done;
3136 case 7: /* invlpg*/ 3548 case 7: /* invlpg*/
3137 emulate_invlpg(ctxt->vcpu, c->modrm_ea); 3549 emulate_invlpg(ctxt->vcpu, c->src.addr.mem);
3138 /* Disable writeback. */ 3550 /* Disable writeback. */
3139 c->dst.type = OP_NONE; 3551 c->dst.type = OP_NONE;
3140 break; 3552 break;
@@ -3144,23 +3556,16 @@ twobyte_insn:
3144 break; 3556 break;
3145 case 0x05: /* syscall */ 3557 case 0x05: /* syscall */
3146 rc = emulate_syscall(ctxt, ops); 3558 rc = emulate_syscall(ctxt, ops);
3147 if (rc != X86EMUL_CONTINUE)
3148 goto done;
3149 else
3150 goto writeback;
3151 break; 3559 break;
3152 case 0x06: 3560 case 0x06:
3153 emulate_clts(ctxt->vcpu); 3561 emulate_clts(ctxt->vcpu);
3154 c->dst.type = OP_NONE;
3155 break; 3562 break;
3156 case 0x09: /* wbinvd */ 3563 case 0x09: /* wbinvd */
3157 kvm_emulate_wbinvd(ctxt->vcpu); 3564 kvm_emulate_wbinvd(ctxt->vcpu);
3158 c->dst.type = OP_NONE;
3159 break; 3565 break;
3160 case 0x08: /* invd */ 3566 case 0x08: /* invd */
3161 case 0x0d: /* GrpP (prefetch) */ 3567 case 0x0d: /* GrpP (prefetch) */
3162 case 0x18: /* Grp16 (prefetch/nop) */ 3568 case 0x18: /* Grp16 (prefetch/nop) */
3163 c->dst.type = OP_NONE;
3164 break; 3569 break;
3165 case 0x20: /* mov cr, reg */ 3570 case 0x20: /* mov cr, reg */
3166 switch (c->modrm_reg) { 3571 switch (c->modrm_reg) {
@@ -3170,8 +3575,7 @@ twobyte_insn:
3170 emulate_ud(ctxt); 3575 emulate_ud(ctxt);
3171 goto done; 3576 goto done;
3172 } 3577 }
3173 c->regs[c->modrm_rm] = ops->get_cr(c->modrm_reg, ctxt->vcpu); 3578 c->dst.val = ops->get_cr(c->modrm_reg, ctxt->vcpu);
3174 c->dst.type = OP_NONE; /* no writeback */
3175 break; 3579 break;
3176 case 0x21: /* mov from dr to reg */ 3580 case 0x21: /* mov from dr to reg */
3177 if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && 3581 if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
@@ -3179,11 +3583,10 @@ twobyte_insn:
3179 emulate_ud(ctxt); 3583 emulate_ud(ctxt);
3180 goto done; 3584 goto done;
3181 } 3585 }
3182 ops->get_dr(c->modrm_reg, &c->regs[c->modrm_rm], ctxt->vcpu); 3586 ops->get_dr(c->modrm_reg, &c->dst.val, ctxt->vcpu);
3183 c->dst.type = OP_NONE; /* no writeback */
3184 break; 3587 break;
3185 case 0x22: /* mov reg, cr */ 3588 case 0x22: /* mov reg, cr */
3186 if (ops->set_cr(c->modrm_reg, c->modrm_val, ctxt->vcpu)) { 3589 if (ops->set_cr(c->modrm_reg, c->src.val, ctxt->vcpu)) {
3187 emulate_gp(ctxt, 0); 3590 emulate_gp(ctxt, 0);
3188 goto done; 3591 goto done;
3189 } 3592 }
@@ -3196,7 +3599,7 @@ twobyte_insn:
3196 goto done; 3599 goto done;
3197 } 3600 }
3198 3601
3199 if (ops->set_dr(c->modrm_reg, c->regs[c->modrm_rm] & 3602 if (ops->set_dr(c->modrm_reg, c->src.val &
3200 ((ctxt->mode == X86EMUL_MODE_PROT64) ? 3603 ((ctxt->mode == X86EMUL_MODE_PROT64) ?
3201 ~0ULL : ~0U), ctxt->vcpu) < 0) { 3604 ~0ULL : ~0U), ctxt->vcpu) < 0) {
3202 /* #UD condition is already handled by the code above */ 3605 /* #UD condition is already handled by the code above */
@@ -3215,7 +3618,6 @@ twobyte_insn:
3215 goto done; 3618 goto done;
3216 } 3619 }
3217 rc = X86EMUL_CONTINUE; 3620 rc = X86EMUL_CONTINUE;
3218 c->dst.type = OP_NONE;
3219 break; 3621 break;
3220 case 0x32: 3622 case 0x32:
3221 /* rdmsr */ 3623 /* rdmsr */
@@ -3227,21 +3629,12 @@ twobyte_insn:
3227 c->regs[VCPU_REGS_RDX] = msr_data >> 32; 3629 c->regs[VCPU_REGS_RDX] = msr_data >> 32;
3228 } 3630 }
3229 rc = X86EMUL_CONTINUE; 3631 rc = X86EMUL_CONTINUE;
3230 c->dst.type = OP_NONE;
3231 break; 3632 break;
3232 case 0x34: /* sysenter */ 3633 case 0x34: /* sysenter */
3233 rc = emulate_sysenter(ctxt, ops); 3634 rc = emulate_sysenter(ctxt, ops);
3234 if (rc != X86EMUL_CONTINUE)
3235 goto done;
3236 else
3237 goto writeback;
3238 break; 3635 break;
3239 case 0x35: /* sysexit */ 3636 case 0x35: /* sysexit */
3240 rc = emulate_sysexit(ctxt, ops); 3637 rc = emulate_sysexit(ctxt, ops);
3241 if (rc != X86EMUL_CONTINUE)
3242 goto done;
3243 else
3244 goto writeback;
3245 break; 3638 break;
3246 case 0x40 ... 0x4f: /* cmov */ 3639 case 0x40 ... 0x4f: /* cmov */
3247 c->dst.val = c->dst.orig_val = c->src.val; 3640 c->dst.val = c->dst.orig_val = c->src.val;
@@ -3251,15 +3644,15 @@ twobyte_insn:
3251 case 0x80 ... 0x8f: /* jnz rel, etc*/ 3644 case 0x80 ... 0x8f: /* jnz rel, etc*/
3252 if (test_cc(c->b, ctxt->eflags)) 3645 if (test_cc(c->b, ctxt->eflags))
3253 jmp_rel(c, c->src.val); 3646 jmp_rel(c, c->src.val);
3254 c->dst.type = OP_NONE; 3647 break;
3648 case 0x90 ... 0x9f: /* setcc r/m8 */
3649 c->dst.val = test_cc(c->b, ctxt->eflags);
3255 break; 3650 break;
3256 case 0xa0: /* push fs */ 3651 case 0xa0: /* push fs */
3257 emulate_push_sreg(ctxt, ops, VCPU_SREG_FS); 3652 emulate_push_sreg(ctxt, ops, VCPU_SREG_FS);
3258 break; 3653 break;
3259 case 0xa1: /* pop fs */ 3654 case 0xa1: /* pop fs */
3260 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS); 3655 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS);
3261 if (rc != X86EMUL_CONTINUE)
3262 goto done;
3263 break; 3656 break;
3264 case 0xa3: 3657 case 0xa3:
3265 bt: /* bt */ 3658 bt: /* bt */
@@ -3277,13 +3670,9 @@ twobyte_insn:
3277 break; 3670 break;
3278 case 0xa9: /* pop gs */ 3671 case 0xa9: /* pop gs */
3279 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS); 3672 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS);
3280 if (rc != X86EMUL_CONTINUE)
3281 goto done;
3282 break; 3673 break;
3283 case 0xab: 3674 case 0xab:
3284 bts: /* bts */ 3675 bts: /* bts */
3285 /* only subword offset */
3286 c->src.val &= (c->dst.bytes << 3) - 1;
3287 emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags); 3676 emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags);
3288 break; 3677 break;
3289 case 0xac: /* shrd imm8, r, r/m */ 3678 case 0xac: /* shrd imm8, r, r/m */
@@ -3306,15 +3695,22 @@ twobyte_insn:
3306 } else { 3695 } else {
3307 /* Failure: write the value we saw to EAX. */ 3696 /* Failure: write the value we saw to EAX. */
3308 c->dst.type = OP_REG; 3697 c->dst.type = OP_REG;
3309 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; 3698 c->dst.addr.reg = (unsigned long *)&c->regs[VCPU_REGS_RAX];
3310 } 3699 }
3311 break; 3700 break;
3701 case 0xb2: /* lss */
3702 rc = emulate_load_segment(ctxt, ops, VCPU_SREG_SS);
3703 break;
3312 case 0xb3: 3704 case 0xb3:
3313 btr: /* btr */ 3705 btr: /* btr */
3314 /* only subword offset */
3315 c->src.val &= (c->dst.bytes << 3) - 1;
3316 emulate_2op_SrcV_nobyte("btr", c->src, c->dst, ctxt->eflags); 3706 emulate_2op_SrcV_nobyte("btr", c->src, c->dst, ctxt->eflags);
3317 break; 3707 break;
3708 case 0xb4: /* lfs */
3709 rc = emulate_load_segment(ctxt, ops, VCPU_SREG_FS);
3710 break;
3711 case 0xb5: /* lgs */
3712 rc = emulate_load_segment(ctxt, ops, VCPU_SREG_GS);
3713 break;
3318 case 0xb6 ... 0xb7: /* movzx */ 3714 case 0xb6 ... 0xb7: /* movzx */
3319 c->dst.bytes = c->op_bytes; 3715 c->dst.bytes = c->op_bytes;
3320 c->dst.val = (c->d & ByteOp) ? (u8) c->src.val 3716 c->dst.val = (c->d & ByteOp) ? (u8) c->src.val
@@ -3334,15 +3730,43 @@ twobyte_insn:
3334 break; 3730 break;
3335 case 0xbb: 3731 case 0xbb:
3336 btc: /* btc */ 3732 btc: /* btc */
3337 /* only subword offset */
3338 c->src.val &= (c->dst.bytes << 3) - 1;
3339 emulate_2op_SrcV_nobyte("btc", c->src, c->dst, ctxt->eflags); 3733 emulate_2op_SrcV_nobyte("btc", c->src, c->dst, ctxt->eflags);
3340 break; 3734 break;
3735 case 0xbc: { /* bsf */
3736 u8 zf;
3737 __asm__ ("bsf %2, %0; setz %1"
3738 : "=r"(c->dst.val), "=q"(zf)
3739 : "r"(c->src.val));
3740 ctxt->eflags &= ~X86_EFLAGS_ZF;
3741 if (zf) {
3742 ctxt->eflags |= X86_EFLAGS_ZF;
3743 c->dst.type = OP_NONE; /* Disable writeback. */
3744 }
3745 break;
3746 }
3747 case 0xbd: { /* bsr */
3748 u8 zf;
3749 __asm__ ("bsr %2, %0; setz %1"
3750 : "=r"(c->dst.val), "=q"(zf)
3751 : "r"(c->src.val));
3752 ctxt->eflags &= ~X86_EFLAGS_ZF;
3753 if (zf) {
3754 ctxt->eflags |= X86_EFLAGS_ZF;
3755 c->dst.type = OP_NONE; /* Disable writeback. */
3756 }
3757 break;
3758 }
3341 case 0xbe ... 0xbf: /* movsx */ 3759 case 0xbe ... 0xbf: /* movsx */
3342 c->dst.bytes = c->op_bytes; 3760 c->dst.bytes = c->op_bytes;
3343 c->dst.val = (c->d & ByteOp) ? (s8) c->src.val : 3761 c->dst.val = (c->d & ByteOp) ? (s8) c->src.val :
3344 (s16) c->src.val; 3762 (s16) c->src.val;
3345 break; 3763 break;
3764 case 0xc0 ... 0xc1: /* xadd */
3765 emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
3766 /* Write back the register source. */
3767 c->src.val = c->dst.orig_val;
3768 write_register_operand(&c->src);
3769 break;
3346 case 0xc3: /* movnti */ 3770 case 0xc3: /* movnti */
3347 c->dst.bytes = c->op_bytes; 3771 c->dst.bytes = c->op_bytes;
3348 c->dst.val = (c->op_bytes == 4) ? (u32) c->src.val : 3772 c->dst.val = (c->op_bytes == 4) ? (u32) c->src.val :
@@ -3350,10 +3774,14 @@ twobyte_insn:
3350 break; 3774 break;
3351 case 0xc7: /* Grp9 (cmpxchg8b) */ 3775 case 0xc7: /* Grp9 (cmpxchg8b) */
3352 rc = emulate_grp9(ctxt, ops); 3776 rc = emulate_grp9(ctxt, ops);
3353 if (rc != X86EMUL_CONTINUE)
3354 goto done;
3355 break; 3777 break;
3778 default:
3779 goto cannot_emulate;
3356 } 3780 }
3781
3782 if (rc != X86EMUL_CONTINUE)
3783 goto done;
3784
3357 goto writeback; 3785 goto writeback;
3358 3786
3359cannot_emulate: 3787cannot_emulate:
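
Review note on the emulate.c hunks above: the new 0xbc/0xbd cases pair bsf/bsr with setz so the emulator recovers both the bit index and whether the source was zero; when it was, ZF is set in the guest's eflags and writeback is suppressed, since the hardware leaves the destination undefined. A minimal user-space sketch of the same instruction/flag pairing (illustrative only, x86-64 gcc inline asm, not the emulator's interface):

#include <stdint.h>
#include <stdio.h>

/* Returns the ZF outcome of bsf; *idx is only meaningful when ZF is clear. */
static int bsf_with_zf(uint64_t src, uint64_t *idx)
{
	uint8_t zf;

	__asm__ ("bsf %2, %0; setz %1"
		 : "=r"(*idx), "=q"(zf)
		 : "r"(src));
	return zf;
}

int main(void)
{
	uint64_t idx;

	if (!bsf_with_zf(0x90, &idx))
		printf("lowest set bit of 0x90 is %llu\n",	/* prints 4 */
		       (unsigned long long)idx);
	if (bsf_with_zf(0, &idx))
		printf("source was zero: ZF set, result discarded\n");
	return 0;
}
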
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index ddeb2314b522..efad72385058 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -5,7 +5,7 @@
5 * Copyright (c) 2006 Intel Corporation 5 * Copyright (c) 2006 Intel Corporation
6 * Copyright (c) 2007 Keir Fraser, XenSource Inc 6 * Copyright (c) 2007 Keir Fraser, XenSource Inc
7 * Copyright (c) 2008 Intel Corporation 7 * Copyright (c) 2008 Intel Corporation
8 * Copyright 2009 Red Hat, Inc. and/or its affilates. 8 * Copyright 2009 Red Hat, Inc. and/or its affiliates.
9 * 9 *
10 * Permission is hereby granted, free of charge, to any person obtaining a copy 10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal 11 * of this software and associated documentation files (the "Software"), to deal
@@ -232,15 +232,6 @@ static void pit_latch_status(struct kvm *kvm, int channel)
232 } 232 }
233} 233}
234 234
235int pit_has_pending_timer(struct kvm_vcpu *vcpu)
236{
237 struct kvm_pit *pit = vcpu->kvm->arch.vpit;
238
239 if (pit && kvm_vcpu_is_bsp(vcpu) && pit->pit_state.irq_ack)
240 return atomic_read(&pit->pit_state.pit_timer.pending);
241 return 0;
242}
243
244static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian) 235static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
245{ 236{
246 struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state, 237 struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state,
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 4b7b73ce2098..f628234fbeca 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * Copyright (c) 2003-2004 Fabrice Bellard 4 * Copyright (c) 2003-2004 Fabrice Bellard
5 * Copyright (c) 2007 Intel Corporation 5 * Copyright (c) 2007 Intel Corporation
6 * Copyright 2009 Red Hat, Inc. and/or its affilates. 6 * Copyright 2009 Red Hat, Inc. and/or its affiliates.
7 * 7 *
8 * Permission is hereby granted, free of charge, to any person obtaining a copy 8 * Permission is hereby granted, free of charge, to any person obtaining a copy
9 * of this software and associated documentation files (the "Software"), to deal 9 * of this software and associated documentation files (the "Software"), to deal
@@ -39,7 +39,7 @@ static void pic_irq_request(struct kvm *kvm, int level);
39static void pic_lock(struct kvm_pic *s) 39static void pic_lock(struct kvm_pic *s)
40 __acquires(&s->lock) 40 __acquires(&s->lock)
41{ 41{
42 raw_spin_lock(&s->lock); 42 spin_lock(&s->lock);
43} 43}
44 44
45static void pic_unlock(struct kvm_pic *s) 45static void pic_unlock(struct kvm_pic *s)
@@ -51,7 +51,7 @@ static void pic_unlock(struct kvm_pic *s)
51 51
52 s->wakeup_needed = false; 52 s->wakeup_needed = false;
53 53
54 raw_spin_unlock(&s->lock); 54 spin_unlock(&s->lock);
55 55
56 if (wakeup) { 56 if (wakeup) {
57 kvm_for_each_vcpu(i, vcpu, s->kvm) { 57 kvm_for_each_vcpu(i, vcpu, s->kvm) {
@@ -67,6 +67,7 @@ static void pic_unlock(struct kvm_pic *s)
67 if (!found) 67 if (!found)
68 return; 68 return;
69 69
70 kvm_make_request(KVM_REQ_EVENT, found);
70 kvm_vcpu_kick(found); 71 kvm_vcpu_kick(found);
71 } 72 }
72} 73}
@@ -308,13 +309,17 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
308 addr &= 1; 309 addr &= 1;
309 if (addr == 0) { 310 if (addr == 0) {
310 if (val & 0x10) { 311 if (val & 0x10) {
311 kvm_pic_reset(s); /* init */
312 /*
313 * deassert a pending interrupt
314 */
315 pic_irq_request(s->pics_state->kvm, 0);
316 s->init_state = 1;
317 s->init4 = val & 1; 312 s->init4 = val & 1;
313 s->last_irr = 0;
314 s->imr = 0;
315 s->priority_add = 0;
316 s->special_mask = 0;
317 s->read_reg_select = 0;
318 if (!s->init4) {
319 s->special_fully_nested_mode = 0;
320 s->auto_eoi = 0;
321 }
322 s->init_state = 1;
318 if (val & 0x02) 323 if (val & 0x02)
319 printk(KERN_ERR "single mode not supported"); 324 printk(KERN_ERR "single mode not supported");
320 if (val & 0x08) 325 if (val & 0x08)
@@ -564,7 +569,7 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
564 s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL); 569 s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL);
565 if (!s) 570 if (!s)
566 return NULL; 571 return NULL;
567 raw_spin_lock_init(&s->lock); 572 spin_lock_init(&s->lock);
568 s->kvm = kvm; 573 s->kvm = kvm;
569 s->pics[0].elcr_mask = 0xf8; 574 s->pics[0].elcr_mask = 0xf8;
570 s->pics[1].elcr_mask = 0xde; 575 s->pics[1].elcr_mask = 0xde;
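
Two things happen in this i8259 diff: the PIC lock drops from raw_spinlock_t to spinlock_t (see the matching irq.h hunk below), and pic_unlock() now raises KVM_REQ_EVENT before kicking a vcpu. In addition, an ICW1 write no longer calls kvm_pic_reset(); it clears exactly the state the init sequence requires. A stand-alone sketch of that ICW1 path, using a made-up struct whose fields mirror those touched in the hunk:

#include <stdbool.h>
#include <stdint.h>

/* Hypothetical subset of the PIC state the ICW1 write touches. */
struct pic_model {
	uint8_t last_irr, imr, priority_add;
	uint8_t special_mask, read_reg_select;
	bool special_fully_nested_mode, auto_eoi;
	int init4, init_state;
};

static void pic_write_icw1(struct pic_model *s, uint8_t val)
{
	s->init4 = val & 1;		/* will an ICW4 follow? */
	s->last_irr = 0;
	s->imr = 0;			/* unmask everything */
	s->priority_add = 0;
	s->special_mask = 0;
	s->read_reg_select = 0;
	if (!s->init4) {
		s->special_fully_nested_mode = false;
		s->auto_eoi = false;
	}
	s->init_state = 1;		/* next write is ICW2 */
}
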
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 2095a049835e..7e06ba1618bd 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * irq.c: API for in kernel interrupt controller 2 * irq.c: API for in kernel interrupt controller
3 * Copyright (c) 2007, Intel Corporation. 3 * Copyright (c) 2007, Intel Corporation.
4 * Copyright 2009 Red Hat, Inc. and/or its affilates. 4 * Copyright 2009 Red Hat, Inc. and/or its affiliates.
5 * 5 *
6 * This program is free software; you can redistribute it and/or modify it 6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms and conditions of the GNU General Public License, 7 * under the terms and conditions of the GNU General Public License,
@@ -33,12 +33,7 @@
33 */ 33 */
34int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) 34int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
35{ 35{
36 int ret; 36 return apic_has_pending_timer(vcpu);
37
38 ret = pit_has_pending_timer(vcpu);
39 ret |= apic_has_pending_timer(vcpu);
40
41 return ret;
42} 37}
43EXPORT_SYMBOL(kvm_cpu_has_pending_timer); 38EXPORT_SYMBOL(kvm_cpu_has_pending_timer);
44 39
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 63c314502993..ba910d149410 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -60,7 +60,7 @@ struct kvm_kpic_state {
60}; 60};
61 61
62struct kvm_pic { 62struct kvm_pic {
63 raw_spinlock_t lock; 63 spinlock_t lock;
64 bool wakeup_needed; 64 bool wakeup_needed;
65 unsigned pending_acks; 65 unsigned pending_acks;
66 struct kvm *kvm; 66 struct kvm *kvm;
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index 6491ac8e755b..975bb45329a1 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -42,7 +42,14 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)
42 (unsigned long *)&vcpu->arch.regs_avail)) 42 (unsigned long *)&vcpu->arch.regs_avail))
43 kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR); 43 kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR);
44 44
45 return vcpu->arch.pdptrs[index]; 45 return vcpu->arch.walk_mmu->pdptrs[index];
46}
47
48static inline u64 kvm_pdptr_read_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, int index)
49{
50 load_pdptrs(vcpu, mmu, mmu->get_cr3(vcpu));
51
52 return mmu->pdptrs[index];
46} 53}
47 54
48static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask) 55static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask)
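
The new kvm_pdptr_read_mmu() reloads the PDPTEs through the given mmu's get_cr3() hook before indexing them, so callers operating on a specific kvm_mmu rather than the vcpu's current walk_mmu see fresh values. For reference, the PDPTE covering a PAE linear address is selected by bits 31:30, the same indexing the mmu code applies to pae_root later in this patch; a trivial illustration:

#include <stdint.h>
#include <stdio.h>

/* PAE: four PDPTEs, each covering 1 GiB, selected by linear address bits 31:30. */
static unsigned int pae_pdpte_index(uint32_t vaddr)
{
	return (vaddr >> 30) & 3;
}

int main(void)
{
	printf("0x00400000 -> PDPTE %u\n", pae_pdpte_index(0x00400000u));	/* 0 */
	printf("0xc0100000 -> PDPTE %u\n", pae_pdpte_index(0xc0100000u));	/* 3 */
	return 0;
}
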
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 22b06f7660f4..413f8973a855 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -5,7 +5,7 @@
5 * Copyright (C) 2006 Qumranet, Inc. 5 * Copyright (C) 2006 Qumranet, Inc.
6 * Copyright (C) 2007 Novell 6 * Copyright (C) 2007 Novell
7 * Copyright (C) 2007 Intel 7 * Copyright (C) 2007 Intel
8 * Copyright 2009 Red Hat, Inc. and/or its affilates. 8 * Copyright 2009 Red Hat, Inc. and/or its affiliates.
9 * 9 *
10 * Authors: 10 * Authors:
11 * Dor Laor <dor.laor@qumranet.com> 11 * Dor Laor <dor.laor@qumranet.com>
@@ -259,9 +259,10 @@ static inline int apic_find_highest_isr(struct kvm_lapic *apic)
259 259
260static void apic_update_ppr(struct kvm_lapic *apic) 260static void apic_update_ppr(struct kvm_lapic *apic)
261{ 261{
262 u32 tpr, isrv, ppr; 262 u32 tpr, isrv, ppr, old_ppr;
263 int isr; 263 int isr;
264 264
265 old_ppr = apic_get_reg(apic, APIC_PROCPRI);
265 tpr = apic_get_reg(apic, APIC_TASKPRI); 266 tpr = apic_get_reg(apic, APIC_TASKPRI);
266 isr = apic_find_highest_isr(apic); 267 isr = apic_find_highest_isr(apic);
267 isrv = (isr != -1) ? isr : 0; 268 isrv = (isr != -1) ? isr : 0;
@@ -274,7 +275,10 @@ static void apic_update_ppr(struct kvm_lapic *apic)
274 apic_debug("vlapic %p, ppr 0x%x, isr 0x%x, isrv 0x%x", 275 apic_debug("vlapic %p, ppr 0x%x, isr 0x%x, isrv 0x%x",
275 apic, ppr, isr, isrv); 276 apic, ppr, isr, isrv);
276 277
277 apic_set_reg(apic, APIC_PROCPRI, ppr); 278 if (old_ppr != ppr) {
279 apic_set_reg(apic, APIC_PROCPRI, ppr);
280 kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
281 }
278} 282}
279 283
280static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr) 284static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr)
@@ -391,6 +395,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
391 break; 395 break;
392 } 396 }
393 397
398 kvm_make_request(KVM_REQ_EVENT, vcpu);
394 kvm_vcpu_kick(vcpu); 399 kvm_vcpu_kick(vcpu);
395 break; 400 break;
396 401
@@ -416,6 +421,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
416 "INIT on a runnable vcpu %d\n", 421 "INIT on a runnable vcpu %d\n",
417 vcpu->vcpu_id); 422 vcpu->vcpu_id);
418 vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; 423 vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
424 kvm_make_request(KVM_REQ_EVENT, vcpu);
419 kvm_vcpu_kick(vcpu); 425 kvm_vcpu_kick(vcpu);
420 } else { 426 } else {
421 apic_debug("Ignoring de-assert INIT to vcpu %d\n", 427 apic_debug("Ignoring de-assert INIT to vcpu %d\n",
@@ -430,6 +436,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
430 result = 1; 436 result = 1;
431 vcpu->arch.sipi_vector = vector; 437 vcpu->arch.sipi_vector = vector;
432 vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED; 438 vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED;
439 kvm_make_request(KVM_REQ_EVENT, vcpu);
433 kvm_vcpu_kick(vcpu); 440 kvm_vcpu_kick(vcpu);
434 } 441 }
435 break; 442 break;
@@ -475,6 +482,7 @@ static void apic_set_eoi(struct kvm_lapic *apic)
475 trigger_mode = IOAPIC_EDGE_TRIG; 482 trigger_mode = IOAPIC_EDGE_TRIG;
476 if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI)) 483 if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI))
477 kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); 484 kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
485 kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
478} 486}
479 487
480static void apic_send_ipi(struct kvm_lapic *apic) 488static void apic_send_ipi(struct kvm_lapic *apic)
@@ -1151,6 +1159,7 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
1151 update_divide_count(apic); 1159 update_divide_count(apic);
1152 start_apic_timer(apic); 1160 start_apic_timer(apic);
1153 apic->irr_pending = true; 1161 apic->irr_pending = true;
1162 kvm_make_request(KVM_REQ_EVENT, vcpu);
1154} 1163}
1155 1164
1156void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) 1165void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
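
In this lapic.c diff, apic_update_ppr() snapshots the old PPR and only rewrites APIC_PROCPRI, and raises KVM_REQ_EVENT, when the value actually changes; the same request is raised from IRQ acceptance, EOI and state restore, so interrupt injection is re-evaluated whenever lapic state that affects it moves. The PPR computation itself is outside the visible hunk; the sketch below uses the architectural rule from the SDM (an assumption here, not quoted from the patch) just to show the change-detection pattern:

#include <stdbool.h>
#include <stdint.h>

/* SDM rule: PPR = TPR when TPR[7:4] >= ISRV[7:4], otherwise ISRV & 0xf0. */
static uint32_t compute_ppr(uint32_t tpr, uint32_t isrv)
{
	return (tpr & 0xf0) >= (isrv & 0xf0) ? tpr : (isrv & 0xf0);
}

/* Returns true when the caller should raise KVM_REQ_EVENT. */
static bool apic_update_ppr_model(uint32_t *procpri, uint32_t tpr, uint32_t isrv)
{
	uint32_t ppr = compute_ppr(tpr, isrv);

	if (*procpri == ppr)
		return false;		/* nothing changed, no re-evaluation needed */
	*procpri = ppr;
	return true;
}
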
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 311f6dad8951..908ea5464a51 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -7,7 +7,7 @@
7 * MMU support 7 * MMU support
8 * 8 *
9 * Copyright (C) 2006 Qumranet, Inc. 9 * Copyright (C) 2006 Qumranet, Inc.
10 * Copyright 2010 Red Hat, Inc. and/or its affilates. 10 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
11 * 11 *
12 * Authors: 12 * Authors:
13 * Yaniv Kamay <yaniv@qumranet.com> 13 * Yaniv Kamay <yaniv@qumranet.com>
@@ -49,15 +49,25 @@
49 */ 49 */
50bool tdp_enabled = false; 50bool tdp_enabled = false;
51 51
52#undef MMU_DEBUG 52enum {
53 AUDIT_PRE_PAGE_FAULT,
54 AUDIT_POST_PAGE_FAULT,
55 AUDIT_PRE_PTE_WRITE,
56 AUDIT_POST_PTE_WRITE,
57 AUDIT_PRE_SYNC,
58 AUDIT_POST_SYNC
59};
53 60
54#undef AUDIT 61char *audit_point_name[] = {
62 "pre page fault",
63 "post page fault",
64 "pre pte write",
65 "post pte write",
66 "pre sync",
67 "post sync"
68};
55 69
56#ifdef AUDIT 70#undef MMU_DEBUG
57static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg);
58#else
59static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
60#endif
61 71
62#ifdef MMU_DEBUG 72#ifdef MMU_DEBUG
63 73
@@ -71,7 +81,7 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
71 81
72#endif 82#endif
73 83
74#if defined(MMU_DEBUG) || defined(AUDIT) 84#ifdef MMU_DEBUG
75static int dbg = 0; 85static int dbg = 0;
76module_param(dbg, bool, 0644); 86module_param(dbg, bool, 0644);
77#endif 87#endif
@@ -89,6 +99,8 @@ module_param(oos_shadow, bool, 0644);
89 } 99 }
90#endif 100#endif
91 101
102#define PTE_PREFETCH_NUM 8
103
92#define PT_FIRST_AVAIL_BITS_SHIFT 9 104#define PT_FIRST_AVAIL_BITS_SHIFT 9
93#define PT64_SECOND_AVAIL_BITS_SHIFT 52 105#define PT64_SECOND_AVAIL_BITS_SHIFT 52
94 106
@@ -178,6 +190,7 @@ typedef void (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte);
178static struct kmem_cache *pte_chain_cache; 190static struct kmem_cache *pte_chain_cache;
179static struct kmem_cache *rmap_desc_cache; 191static struct kmem_cache *rmap_desc_cache;
180static struct kmem_cache *mmu_page_header_cache; 192static struct kmem_cache *mmu_page_header_cache;
193static struct percpu_counter kvm_total_used_mmu_pages;
181 194
182static u64 __read_mostly shadow_trap_nonpresent_pte; 195static u64 __read_mostly shadow_trap_nonpresent_pte;
183static u64 __read_mostly shadow_notrap_nonpresent_pte; 196static u64 __read_mostly shadow_notrap_nonpresent_pte;
@@ -299,18 +312,50 @@ static u64 __xchg_spte(u64 *sptep, u64 new_spte)
299#endif 312#endif
300} 313}
301 314
315static bool spte_has_volatile_bits(u64 spte)
316{
317 if (!shadow_accessed_mask)
318 return false;
319
320 if (!is_shadow_present_pte(spte))
321 return false;
322
323 if ((spte & shadow_accessed_mask) &&
324 (!is_writable_pte(spte) || (spte & shadow_dirty_mask)))
325 return false;
326
327 return true;
328}
329
330static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask)
331{
332 return (old_spte & bit_mask) && !(new_spte & bit_mask);
333}
334
302static void update_spte(u64 *sptep, u64 new_spte) 335static void update_spte(u64 *sptep, u64 new_spte)
303{ 336{
304 u64 old_spte; 337 u64 mask, old_spte = *sptep;
338
339 WARN_ON(!is_rmap_spte(new_spte));
340
341 new_spte |= old_spte & shadow_dirty_mask;
305 342
306 if (!shadow_accessed_mask || (new_spte & shadow_accessed_mask) || 343 mask = shadow_accessed_mask;
307 !is_rmap_spte(*sptep)) 344 if (is_writable_pte(old_spte))
345 mask |= shadow_dirty_mask;
346
347 if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask)
308 __set_spte(sptep, new_spte); 348 __set_spte(sptep, new_spte);
309 else { 349 else
310 old_spte = __xchg_spte(sptep, new_spte); 350 old_spte = __xchg_spte(sptep, new_spte);
311 if (old_spte & shadow_accessed_mask) 351
312 mark_page_accessed(pfn_to_page(spte_to_pfn(old_spte))); 352 if (!shadow_accessed_mask)
313 } 353 return;
354
355 if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask))
356 kvm_set_pfn_accessed(spte_to_pfn(old_spte));
357 if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask))
358 kvm_set_pfn_dirty(spte_to_pfn(old_spte));
314} 359}
315 360
316static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, 361static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
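
spte_has_volatile_bits() above captures when hardware may still set accessed/dirty bits underneath KVM: the entry is present and either its accessed bit is clear, or it is writable with the dirty bit still clear. Only then do the writers pay for an atomic xchg, and any bit the exchange shows was set is forwarded to the backing pfn. A stand-alone model of that decision, closest to set_spte_track_bits() (update_spte() adds a further shortcut when the new value already carries the bits); the bit positions and helper names below are invented for the sketch:

#include <stdbool.h>
#include <stdint.h>
#include <stdatomic.h>
#include <stdio.h>

#define SPTE_PRESENT	(1ull << 0)
#define SPTE_WRITABLE	(1ull << 1)
#define SPTE_ACCESSED	(1ull << 5)
#define SPTE_DIRTY	(1ull << 6)

/* May the CPU still flip A/D bits in this entry behind our back? */
static bool spte_has_volatile_bits(uint64_t spte)
{
	if (!(spte & SPTE_PRESENT))
		return false;
	if ((spte & SPTE_ACCESSED) &&
	    (!(spte & SPTE_WRITABLE) || (spte & SPTE_DIRTY)))
		return false;	/* nothing left for hardware to set */
	return true;
}

static void set_spte_model(_Atomic uint64_t *sptep, uint64_t new_spte)
{
	uint64_t old_spte = atomic_load(sptep);

	if (!spte_has_volatile_bits(old_spte))
		atomic_store(sptep, new_spte);			/* plain write suffices */
	else
		old_spte = atomic_exchange(sptep, new_spte);	/* capture A/D bits */

	if (old_spte & SPTE_ACCESSED)
		printf("propagate: kvm_set_pfn_accessed()\n");
	if (old_spte & SPTE_DIRTY)
		printf("propagate: kvm_set_pfn_dirty()\n");
}

int main(void)
{
	_Atomic uint64_t spte = SPTE_PRESENT | SPTE_WRITABLE |
				SPTE_ACCESSED | SPTE_DIRTY;

	set_spte_model(&spte, 0);	/* stable entry: plain store, then propagate */
	return 0;
}
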
@@ -367,7 +412,7 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
367 if (r) 412 if (r)
368 goto out; 413 goto out;
369 r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, 414 r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
370 rmap_desc_cache, 4); 415 rmap_desc_cache, 4 + PTE_PREFETCH_NUM);
371 if (r) 416 if (r)
372 goto out; 417 goto out;
373 r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8); 418 r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
@@ -591,6 +636,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
591 desc->sptes[0] = (u64 *)*rmapp; 636 desc->sptes[0] = (u64 *)*rmapp;
592 desc->sptes[1] = spte; 637 desc->sptes[1] = spte;
593 *rmapp = (unsigned long)desc | 1; 638 *rmapp = (unsigned long)desc | 1;
639 ++count;
594 } else { 640 } else {
595 rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte); 641 rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
596 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); 642 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
@@ -603,7 +649,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
603 desc = desc->more; 649 desc = desc->more;
604 } 650 }
605 for (i = 0; desc->sptes[i]; ++i) 651 for (i = 0; desc->sptes[i]; ++i)
606 ; 652 ++count;
607 desc->sptes[i] = spte; 653 desc->sptes[i] = spte;
608 } 654 }
609 return count; 655 return count;
@@ -645,18 +691,17 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
645 gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); 691 gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
646 rmapp = gfn_to_rmap(kvm, gfn, sp->role.level); 692 rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);
647 if (!*rmapp) { 693 if (!*rmapp) {
648 printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte); 694 printk(KERN_ERR "rmap_remove: %p 0->BUG\n", spte);
649 BUG(); 695 BUG();
650 } else if (!(*rmapp & 1)) { 696 } else if (!(*rmapp & 1)) {
651 rmap_printk("rmap_remove: %p %llx 1->0\n", spte, *spte); 697 rmap_printk("rmap_remove: %p 1->0\n", spte);
652 if ((u64 *)*rmapp != spte) { 698 if ((u64 *)*rmapp != spte) {
653 printk(KERN_ERR "rmap_remove: %p %llx 1->BUG\n", 699 printk(KERN_ERR "rmap_remove: %p 1->BUG\n", spte);
654 spte, *spte);
655 BUG(); 700 BUG();
656 } 701 }
657 *rmapp = 0; 702 *rmapp = 0;
658 } else { 703 } else {
659 rmap_printk("rmap_remove: %p %llx many->many\n", spte, *spte); 704 rmap_printk("rmap_remove: %p many->many\n", spte);
660 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); 705 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
661 prev_desc = NULL; 706 prev_desc = NULL;
662 while (desc) { 707 while (desc) {
@@ -670,7 +715,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
670 prev_desc = desc; 715 prev_desc = desc;
671 desc = desc->more; 716 desc = desc->more;
672 } 717 }
673 pr_err("rmap_remove: %p %llx many->many\n", spte, *spte); 718 pr_err("rmap_remove: %p many->many\n", spte);
674 BUG(); 719 BUG();
675 } 720 }
676} 721}
@@ -680,18 +725,18 @@ static void set_spte_track_bits(u64 *sptep, u64 new_spte)
680 pfn_t pfn; 725 pfn_t pfn;
681 u64 old_spte = *sptep; 726 u64 old_spte = *sptep;
682 727
683 if (!shadow_accessed_mask || !is_shadow_present_pte(old_spte) || 728 if (!spte_has_volatile_bits(old_spte))
684 old_spte & shadow_accessed_mask) {
685 __set_spte(sptep, new_spte); 729 __set_spte(sptep, new_spte);
686 } else 730 else
687 old_spte = __xchg_spte(sptep, new_spte); 731 old_spte = __xchg_spte(sptep, new_spte);
688 732
689 if (!is_rmap_spte(old_spte)) 733 if (!is_rmap_spte(old_spte))
690 return; 734 return;
735
691 pfn = spte_to_pfn(old_spte); 736 pfn = spte_to_pfn(old_spte);
692 if (!shadow_accessed_mask || old_spte & shadow_accessed_mask) 737 if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
693 kvm_set_pfn_accessed(pfn); 738 kvm_set_pfn_accessed(pfn);
694 if (is_writable_pte(old_spte)) 739 if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask))
695 kvm_set_pfn_dirty(pfn); 740 kvm_set_pfn_dirty(pfn);
696} 741}
697 742
@@ -746,13 +791,6 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
746 } 791 }
747 spte = rmap_next(kvm, rmapp, spte); 792 spte = rmap_next(kvm, rmapp, spte);
748 } 793 }
749 if (write_protected) {
750 pfn_t pfn;
751
752 spte = rmap_next(kvm, rmapp, NULL);
753 pfn = spte_to_pfn(*spte);
754 kvm_set_pfn_dirty(pfn);
755 }
756 794
757 /* check for huge page mappings */ 795 /* check for huge page mappings */
758 for (i = PT_DIRECTORY_LEVEL; 796 for (i = PT_DIRECTORY_LEVEL;
@@ -947,6 +985,18 @@ static int is_empty_shadow_page(u64 *spt)
947} 985}
948#endif 986#endif
949 987
988/*
 989 * This value is the sum of all of the kvm instances'
 990 * kvm->arch.n_used_mmu_pages values.  We need a global,
 991 * aggregate version in order to make the slab shrinker
 992 * faster.
993 */
994static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr)
995{
996 kvm->arch.n_used_mmu_pages += nr;
997 percpu_counter_add(&kvm_total_used_mmu_pages, nr);
998}
999
950static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) 1000static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
951{ 1001{
952 ASSERT(is_empty_shadow_page(sp->spt)); 1002 ASSERT(is_empty_shadow_page(sp->spt));
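
Page accounting switches here from a per-VM free-page balance to a per-VM used count (n_used_mmu_pages) mirrored into a global percpu_counter, so the MMU shrinker can read one approximate total instead of iterating over every VM. The same idea in a user-space analogue, with a plain atomic standing in for the kernel's percpu_counter:

#include <stdatomic.h>

struct vm_mmu_accounting {
	long n_used_mmu_pages;		/* protected by this VM's mmu_lock */
};

/* Global, approximate aggregate over all VMs (percpu_counter stand-in). */
static _Atomic long total_used_mmu_pages;

static void mod_used_mmu_pages(struct vm_mmu_accounting *vm, long nr)
{
	vm->n_used_mmu_pages += nr;			/* +1 on alloc, -1 on free */
	atomic_fetch_add(&total_used_mmu_pages, nr);	/* cheap global estimate */
}
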
@@ -956,7 +1006,7 @@ static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
956 if (!sp->role.direct) 1006 if (!sp->role.direct)
957 __free_page(virt_to_page(sp->gfns)); 1007 __free_page(virt_to_page(sp->gfns));
958 kmem_cache_free(mmu_page_header_cache, sp); 1008 kmem_cache_free(mmu_page_header_cache, sp);
959 ++kvm->arch.n_free_mmu_pages; 1009 kvm_mod_used_mmu_pages(kvm, -1);
960} 1010}
961 1011
962static unsigned kvm_page_table_hashfn(gfn_t gfn) 1012static unsigned kvm_page_table_hashfn(gfn_t gfn)
@@ -979,7 +1029,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
979 bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); 1029 bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
980 sp->multimapped = 0; 1030 sp->multimapped = 0;
981 sp->parent_pte = parent_pte; 1031 sp->parent_pte = parent_pte;
982 --vcpu->kvm->arch.n_free_mmu_pages; 1032 kvm_mod_used_mmu_pages(vcpu->kvm, +1);
983 return sp; 1033 return sp;
984} 1034}
985 1035
@@ -1403,7 +1453,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
1403 if (role.direct) 1453 if (role.direct)
1404 role.cr4_pae = 0; 1454 role.cr4_pae = 0;
1405 role.access = access; 1455 role.access = access;
1406 if (!tdp_enabled && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { 1456 if (!vcpu->arch.mmu.direct_map
1457 && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
1407 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); 1458 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
1408 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; 1459 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
1409 role.quadrant = quadrant; 1460 role.quadrant = quadrant;
@@ -1458,6 +1509,12 @@ static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
1458 iterator->addr = addr; 1509 iterator->addr = addr;
1459 iterator->shadow_addr = vcpu->arch.mmu.root_hpa; 1510 iterator->shadow_addr = vcpu->arch.mmu.root_hpa;
1460 iterator->level = vcpu->arch.mmu.shadow_root_level; 1511 iterator->level = vcpu->arch.mmu.shadow_root_level;
1512
1513 if (iterator->level == PT64_ROOT_LEVEL &&
1514 vcpu->arch.mmu.root_level < PT64_ROOT_LEVEL &&
1515 !vcpu->arch.mmu.direct_map)
1516 --iterator->level;
1517
1461 if (iterator->level == PT32E_ROOT_LEVEL) { 1518 if (iterator->level == PT32E_ROOT_LEVEL) {
1462 iterator->shadow_addr 1519 iterator->shadow_addr
1463 = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; 1520 = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
@@ -1665,41 +1722,31 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
1665 1722
1666/* 1723/*
1667 * Changing the number of mmu pages allocated to the vm 1724 * Changing the number of mmu pages allocated to the vm
1668 * Note: if kvm_nr_mmu_pages is too small, you will get a deadlock 1725 * Note: if goal_nr_mmu_pages is too small, you will get a deadlock
1669 */ 1726 */
1670void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) 1727void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
1671{ 1728{
1672 int used_pages;
1673 LIST_HEAD(invalid_list); 1729 LIST_HEAD(invalid_list);
1674
1675 used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages;
1676 used_pages = max(0, used_pages);
1677
1678 /* 1730 /*
1679 * If we set the number of mmu pages to be smaller than the 1731 * If we set the number of mmu pages to be smaller than the
1680 * number of active pages, we must free some mmu pages before we 1732 * number of active pages, we must free some mmu pages before we
1681 * change the value 1733 * change the value
1682 */ 1734 */
1683 1735
1684 if (used_pages > kvm_nr_mmu_pages) { 1736 if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
1685 while (used_pages > kvm_nr_mmu_pages && 1737 while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages &&
1686 !list_empty(&kvm->arch.active_mmu_pages)) { 1738 !list_empty(&kvm->arch.active_mmu_pages)) {
1687 struct kvm_mmu_page *page; 1739 struct kvm_mmu_page *page;
1688 1740
1689 page = container_of(kvm->arch.active_mmu_pages.prev, 1741 page = container_of(kvm->arch.active_mmu_pages.prev,
1690 struct kvm_mmu_page, link); 1742 struct kvm_mmu_page, link);
1691 used_pages -= kvm_mmu_prepare_zap_page(kvm, page, 1743 kvm_mmu_prepare_zap_page(kvm, page, &invalid_list);
1692 &invalid_list); 1744 kvm_mmu_commit_zap_page(kvm, &invalid_list);
1693 } 1745 }
1694 kvm_mmu_commit_zap_page(kvm, &invalid_list); 1746 goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
1695 kvm_nr_mmu_pages = used_pages;
1696 kvm->arch.n_free_mmu_pages = 0;
1697 } 1747 }
1698 else
1699 kvm->arch.n_free_mmu_pages += kvm_nr_mmu_pages
1700 - kvm->arch.n_alloc_mmu_pages;
1701 1748
1702 kvm->arch.n_alloc_mmu_pages = kvm_nr_mmu_pages; 1749 kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
1703} 1750}
1704 1751
1705static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) 1752static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
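
With the used-count in place, kvm_mmu_change_mmu_pages() simply zaps the oldest shadow pages while n_used_mmu_pages exceeds the requested goal and records the goal as n_max_mmu_pages. Elsewhere the remaining headroom is then just max minus used; kvm_mmu_available_pages() itself lives in mmu.h, so its exact definition is an assumption in this sketch:

struct kvm_mmu_pages_model {
	unsigned int n_used_mmu_pages;
	unsigned int n_max_mmu_pages;
};

/* Pages the fault path may still allocate before it has to recycle some. */
static unsigned int mmu_available_pages(const struct kvm_mmu_pages_model *m)
{
	if (m->n_max_mmu_pages > m->n_used_mmu_pages)
		return m->n_max_mmu_pages - m->n_used_mmu_pages;
	return 0;
}
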
@@ -1709,11 +1756,11 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
1709 LIST_HEAD(invalid_list); 1756 LIST_HEAD(invalid_list);
1710 int r; 1757 int r;
1711 1758
1712 pgprintk("%s: looking for gfn %lx\n", __func__, gfn); 1759 pgprintk("%s: looking for gfn %llx\n", __func__, gfn);
1713 r = 0; 1760 r = 0;
1714 1761
1715 for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) { 1762 for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
1716 pgprintk("%s: gfn %lx role %x\n", __func__, gfn, 1763 pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
1717 sp->role.word); 1764 sp->role.word);
1718 r = 1; 1765 r = 1;
1719 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); 1766 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
@@ -1729,7 +1776,7 @@ static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
1729 LIST_HEAD(invalid_list); 1776 LIST_HEAD(invalid_list);
1730 1777
1731 for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) { 1778 for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
1732 pgprintk("%s: zap %lx %x\n", 1779 pgprintk("%s: zap %llx %x\n",
1733 __func__, gfn, sp->role.word); 1780 __func__, gfn, sp->role.word);
1734 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); 1781 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
1735 } 1782 }
@@ -1925,7 +1972,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1925 * whether the guest actually used the pte (in order to detect 1972 * whether the guest actually used the pte (in order to detect
1926 * demand paging). 1973 * demand paging).
1927 */ 1974 */
1928 spte = shadow_base_present_pte | shadow_dirty_mask; 1975 spte = shadow_base_present_pte;
1929 if (!speculative) 1976 if (!speculative)
1930 spte |= shadow_accessed_mask; 1977 spte |= shadow_accessed_mask;
1931 if (!dirty) 1978 if (!dirty)
@@ -1948,8 +1995,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1948 spte |= (u64)pfn << PAGE_SHIFT; 1995 spte |= (u64)pfn << PAGE_SHIFT;
1949 1996
1950 if ((pte_access & ACC_WRITE_MASK) 1997 if ((pte_access & ACC_WRITE_MASK)
1951 || (!tdp_enabled && write_fault && !is_write_protection(vcpu) 1998 || (!vcpu->arch.mmu.direct_map && write_fault
1952 && !user_fault)) { 1999 && !is_write_protection(vcpu) && !user_fault)) {
1953 2000
1954 if (level > PT_PAGE_TABLE_LEVEL && 2001 if (level > PT_PAGE_TABLE_LEVEL &&
1955 has_wrprotected_page(vcpu->kvm, gfn, level)) { 2002 has_wrprotected_page(vcpu->kvm, gfn, level)) {
@@ -1960,7 +2007,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1960 2007
1961 spte |= PT_WRITABLE_MASK; 2008 spte |= PT_WRITABLE_MASK;
1962 2009
1963 if (!tdp_enabled && !(pte_access & ACC_WRITE_MASK)) 2010 if (!vcpu->arch.mmu.direct_map
2011 && !(pte_access & ACC_WRITE_MASK))
1964 spte &= ~PT_USER_MASK; 2012 spte &= ~PT_USER_MASK;
1965 2013
1966 /* 2014 /*
@@ -1973,7 +2021,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1973 goto set_pte; 2021 goto set_pte;
1974 2022
1975 if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { 2023 if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
1976 pgprintk("%s: found shadow page for %lx, marking ro\n", 2024 pgprintk("%s: found shadow page for %llx, marking ro\n",
1977 __func__, gfn); 2025 __func__, gfn);
1978 ret = 1; 2026 ret = 1;
1979 pte_access &= ~ACC_WRITE_MASK; 2027 pte_access &= ~ACC_WRITE_MASK;
@@ -1986,8 +2034,6 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1986 mark_page_dirty(vcpu->kvm, gfn); 2034 mark_page_dirty(vcpu->kvm, gfn);
1987 2035
1988set_pte: 2036set_pte:
1989 if (is_writable_pte(*sptep) && !is_writable_pte(spte))
1990 kvm_set_pfn_dirty(pfn);
1991 update_spte(sptep, spte); 2037 update_spte(sptep, spte);
1992done: 2038done:
1993 return ret; 2039 return ret;
@@ -2004,7 +2050,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2004 int rmap_count; 2050 int rmap_count;
2005 2051
2006 pgprintk("%s: spte %llx access %x write_fault %d" 2052 pgprintk("%s: spte %llx access %x write_fault %d"
2007 " user_fault %d gfn %lx\n", 2053 " user_fault %d gfn %llx\n",
2008 __func__, *sptep, pt_access, 2054 __func__, *sptep, pt_access,
2009 write_fault, user_fault, gfn); 2055 write_fault, user_fault, gfn);
2010 2056
@@ -2023,7 +2069,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2023 __set_spte(sptep, shadow_trap_nonpresent_pte); 2069 __set_spte(sptep, shadow_trap_nonpresent_pte);
2024 kvm_flush_remote_tlbs(vcpu->kvm); 2070 kvm_flush_remote_tlbs(vcpu->kvm);
2025 } else if (pfn != spte_to_pfn(*sptep)) { 2071 } else if (pfn != spte_to_pfn(*sptep)) {
2026 pgprintk("hfn old %lx new %lx\n", 2072 pgprintk("hfn old %llx new %llx\n",
2027 spte_to_pfn(*sptep), pfn); 2073 spte_to_pfn(*sptep), pfn);
2028 drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); 2074 drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
2029 kvm_flush_remote_tlbs(vcpu->kvm); 2075 kvm_flush_remote_tlbs(vcpu->kvm);
@@ -2040,7 +2086,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2040 } 2086 }
2041 2087
2042 pgprintk("%s: setting spte %llx\n", __func__, *sptep); 2088 pgprintk("%s: setting spte %llx\n", __func__, *sptep);
2043 pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n", 2089 pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n",
2044 is_large_pte(*sptep)? "2MB" : "4kB", 2090 is_large_pte(*sptep)? "2MB" : "4kB",
2045 *sptep & PT_PRESENT_MASK ?"RW":"R", gfn, 2091 *sptep & PT_PRESENT_MASK ?"RW":"R", gfn,
2046 *sptep, sptep); 2092 *sptep, sptep);
@@ -2064,6 +2110,105 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
2064{ 2110{
2065} 2111}
2066 2112
2113static struct kvm_memory_slot *
2114pte_prefetch_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log)
2115{
2116 struct kvm_memory_slot *slot;
2117
2118 slot = gfn_to_memslot(vcpu->kvm, gfn);
2119 if (!slot || slot->flags & KVM_MEMSLOT_INVALID ||
2120 (no_dirty_log && slot->dirty_bitmap))
2121 slot = NULL;
2122
2123 return slot;
2124}
2125
2126static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
2127 bool no_dirty_log)
2128{
2129 struct kvm_memory_slot *slot;
2130 unsigned long hva;
2131
2132 slot = pte_prefetch_gfn_to_memslot(vcpu, gfn, no_dirty_log);
2133 if (!slot) {
2134 get_page(bad_page);
2135 return page_to_pfn(bad_page);
2136 }
2137
2138 hva = gfn_to_hva_memslot(slot, gfn);
2139
2140 return hva_to_pfn_atomic(vcpu->kvm, hva);
2141}
2142
2143static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
2144 struct kvm_mmu_page *sp,
2145 u64 *start, u64 *end)
2146{
2147 struct page *pages[PTE_PREFETCH_NUM];
2148 unsigned access = sp->role.access;
2149 int i, ret;
2150 gfn_t gfn;
2151
2152 gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt);
2153 if (!pte_prefetch_gfn_to_memslot(vcpu, gfn, access & ACC_WRITE_MASK))
2154 return -1;
2155
2156 ret = gfn_to_page_many_atomic(vcpu->kvm, gfn, pages, end - start);
2157 if (ret <= 0)
2158 return -1;
2159
2160 for (i = 0; i < ret; i++, gfn++, start++)
2161 mmu_set_spte(vcpu, start, ACC_ALL,
2162 access, 0, 0, 1, NULL,
2163 sp->role.level, gfn,
2164 page_to_pfn(pages[i]), true, true);
2165
2166 return 0;
2167}
2168
2169static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
2170 struct kvm_mmu_page *sp, u64 *sptep)
2171{
2172 u64 *spte, *start = NULL;
2173 int i;
2174
2175 WARN_ON(!sp->role.direct);
2176
2177 i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
2178 spte = sp->spt + i;
2179
2180 for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
2181 if (*spte != shadow_trap_nonpresent_pte || spte == sptep) {
2182 if (!start)
2183 continue;
2184 if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
2185 break;
2186 start = NULL;
2187 } else if (!start)
2188 start = spte;
2189 }
2190}
2191
2192static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
2193{
2194 struct kvm_mmu_page *sp;
2195
2196 /*
2197 * Since there is no accessed bit on EPT, there is no way to
2198 * distinguish between actually accessed translations
2199 * and prefetched ones, so disable pte prefetch if EPT is
2200 * enabled.
2201 */
2202 if (!shadow_accessed_mask)
2203 return;
2204
2205 sp = page_header(__pa(sptep));
2206 if (sp->role.level > PT_PAGE_TABLE_LEVEL)
2207 return;
2208
2209 __direct_pte_prefetch(vcpu, sp, sptep);
2210}
2211
2067static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, 2212static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
2068 int level, gfn_t gfn, pfn_t pfn) 2213 int level, gfn_t gfn, pfn_t pfn)
2069{ 2214{
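
direct_pte_prefetch() above opportunistically maps neighbours of the faulting pte: the window is the naturally aligned block of PTE_PREFETCH_NUM slots containing the fault, and only runs of still-nonpresent entries that are terminated inside the window are batched through gfn_to_page_many_atomic(). The index arithmetic in isolation (plain arrays instead of sptes; note that, like the kernel loop, a run reaching the end of the window is simply dropped):

#include <stdbool.h>
#include <stdio.h>

#define PTE_PREFETCH_NUM 8

/* Start of the aligned window of PTE_PREFETCH_NUM slots containing the fault. */
static int prefetch_window_start(int fault_idx)
{
	return fault_idx & ~(PTE_PREFETCH_NUM - 1);
}

/* Report each run of unmapped slots that is terminated within the window. */
static void for_each_prefetch_run(const bool *mapped, int fault_idx)
{
	int base = prefetch_window_start(fault_idx);
	int i, start = -1;

	for (i = 0; i < PTE_PREFETCH_NUM; i++) {
		int idx = base + i;

		if (mapped[idx] || idx == fault_idx) {
			if (start >= 0)
				printf("prefetch [%d, %d)\n", start, idx);
			start = -1;
		} else if (start < 0) {
			start = idx;
		}
	}
	/* A trailing run that hits the window end is not prefetched. */
}

int main(void)
{
	/* 16 slots of a shadow page; the fault just mapped index 10. */
	bool mapped[16] = { [3] = true, [10] = true, [12] = true };

	for_each_prefetch_run(mapped, 10);	/* window [8, 16): prints [8,10) and [11,12) */
	return 0;
}
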
@@ -2077,6 +2222,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
2077 mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL, 2222 mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL,
2078 0, write, 1, &pt_write, 2223 0, write, 1, &pt_write,
2079 level, gfn, pfn, false, true); 2224 level, gfn, pfn, false, true);
2225 direct_pte_prefetch(vcpu, iterator.sptep);
2080 ++vcpu->stat.pf_fixed; 2226 ++vcpu->stat.pf_fixed;
2081 break; 2227 break;
2082 } 2228 }
@@ -2098,28 +2244,31 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
2098 __set_spte(iterator.sptep, 2244 __set_spte(iterator.sptep,
2099 __pa(sp->spt) 2245 __pa(sp->spt)
2100 | PT_PRESENT_MASK | PT_WRITABLE_MASK 2246 | PT_PRESENT_MASK | PT_WRITABLE_MASK
2101 | shadow_user_mask | shadow_x_mask); 2247 | shadow_user_mask | shadow_x_mask
2248 | shadow_accessed_mask);
2102 } 2249 }
2103 } 2250 }
2104 return pt_write; 2251 return pt_write;
2105} 2252}
2106 2253
2107static void kvm_send_hwpoison_signal(struct kvm *kvm, gfn_t gfn) 2254static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
2108{ 2255{
2109 char buf[1]; 2256 siginfo_t info;
2110 void __user *hva; 2257
2111 int r; 2258 info.si_signo = SIGBUS;
2259 info.si_errno = 0;
2260 info.si_code = BUS_MCEERR_AR;
2261 info.si_addr = (void __user *)address;
2262 info.si_addr_lsb = PAGE_SHIFT;
2112 2263
2113 /* Touch the page, so send SIGBUS */ 2264 send_sig_info(SIGBUS, &info, tsk);
2114 hva = (void __user *)gfn_to_hva(kvm, gfn);
2115 r = copy_from_user(buf, hva, 1);
2116} 2265}
2117 2266
2118static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn) 2267static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
2119{ 2268{
2120 kvm_release_pfn_clean(pfn); 2269 kvm_release_pfn_clean(pfn);
2121 if (is_hwpoison_pfn(pfn)) { 2270 if (is_hwpoison_pfn(pfn)) {
2122 kvm_send_hwpoison_signal(kvm, gfn); 2271 kvm_send_hwpoison_signal(gfn_to_hva(kvm, gfn), current);
2123 return 0; 2272 return 0;
2124 } else if (is_fault_pfn(pfn)) 2273 } else if (is_fault_pfn(pfn))
2125 return -EFAULT; 2274 return -EFAULT;
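
kvm_send_hwpoison_signal() no longer pokes the poisoned page from the kernel to provoke a fault; it builds a BUS_MCEERR_AR siginfo carrying the user address and PAGE_SHIFT granularity and delivers it to the task directly. On the receiving side a VMM would install a SIGBUS handler along these lines (illustrative only; BUS_MCEERR_AR availability in userspace headers is assumed, and the handler does non-async-signal-safe printing purely for demonstration):

#define _GNU_SOURCE
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static void sigbus_handler(int sig, siginfo_t *info, void *ctx)
{
	(void)sig; (void)ctx;
	if (info->si_code == BUS_MCEERR_AR)
		fprintf(stderr, "hwpoison, action required at %p\n", info->si_addr);
	_exit(1);
}

int main(void)
{
	struct sigaction sa;

	memset(&sa, 0, sizeof(sa));
	sa.sa_sigaction = sigbus_handler;
	sa.sa_flags = SA_SIGINFO;
	sigaction(SIGBUS, &sa, NULL);
	/* ... run the guest; a poisoned access now arrives here as SIGBUS ... */
	return 0;
}
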
@@ -2179,7 +2328,9 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
2179 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) 2328 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
2180 return; 2329 return;
2181 spin_lock(&vcpu->kvm->mmu_lock); 2330 spin_lock(&vcpu->kvm->mmu_lock);
2182 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { 2331 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL &&
2332 (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL ||
2333 vcpu->arch.mmu.direct_map)) {
2183 hpa_t root = vcpu->arch.mmu.root_hpa; 2334 hpa_t root = vcpu->arch.mmu.root_hpa;
2184 2335
2185 sp = page_header(root); 2336 sp = page_header(root);
@@ -2222,80 +2373,158 @@ static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
2222 return ret; 2373 return ret;
2223} 2374}
2224 2375
2225static int mmu_alloc_roots(struct kvm_vcpu *vcpu) 2376static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
2226{ 2377{
2227 int i;
2228 gfn_t root_gfn;
2229 struct kvm_mmu_page *sp; 2378 struct kvm_mmu_page *sp;
2230 int direct = 0; 2379 unsigned i;
2231 u64 pdptr;
2232
2233 root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT;
2234 2380
2235 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { 2381 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
2382 spin_lock(&vcpu->kvm->mmu_lock);
2383 kvm_mmu_free_some_pages(vcpu);
2384 sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_LEVEL,
2385 1, ACC_ALL, NULL);
2386 ++sp->root_count;
2387 spin_unlock(&vcpu->kvm->mmu_lock);
2388 vcpu->arch.mmu.root_hpa = __pa(sp->spt);
2389 } else if (vcpu->arch.mmu.shadow_root_level == PT32E_ROOT_LEVEL) {
2390 for (i = 0; i < 4; ++i) {
2391 hpa_t root = vcpu->arch.mmu.pae_root[i];
2392
2393 ASSERT(!VALID_PAGE(root));
2394 spin_lock(&vcpu->kvm->mmu_lock);
2395 kvm_mmu_free_some_pages(vcpu);
2396 sp = kvm_mmu_get_page(vcpu, i << 30, i << 30,
2397 PT32_ROOT_LEVEL, 1, ACC_ALL,
2398 NULL);
2399 root = __pa(sp->spt);
2400 ++sp->root_count;
2401 spin_unlock(&vcpu->kvm->mmu_lock);
2402 vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
2403 }
2404 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
2405 } else
2406 BUG();
2407
2408 return 0;
2409}
2410
2411static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
2412{
2413 struct kvm_mmu_page *sp;
2414 u64 pdptr, pm_mask;
2415 gfn_t root_gfn;
2416 int i;
2417
2418 root_gfn = vcpu->arch.mmu.get_cr3(vcpu) >> PAGE_SHIFT;
2419
2420 if (mmu_check_root(vcpu, root_gfn))
2421 return 1;
2422
2423 /*
2424 * Do we shadow a long mode page table? If so we need to
2425 * write-protect the guests page table root.
2426 */
2427 if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
2236 hpa_t root = vcpu->arch.mmu.root_hpa; 2428 hpa_t root = vcpu->arch.mmu.root_hpa;
2237 2429
2238 ASSERT(!VALID_PAGE(root)); 2430 ASSERT(!VALID_PAGE(root));
2239 if (mmu_check_root(vcpu, root_gfn)) 2431
2240 return 1;
2241 if (tdp_enabled) {
2242 direct = 1;
2243 root_gfn = 0;
2244 }
2245 spin_lock(&vcpu->kvm->mmu_lock); 2432 spin_lock(&vcpu->kvm->mmu_lock);
2246 kvm_mmu_free_some_pages(vcpu); 2433 kvm_mmu_free_some_pages(vcpu);
2247 sp = kvm_mmu_get_page(vcpu, root_gfn, 0, 2434 sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL,
2248 PT64_ROOT_LEVEL, direct, 2435 0, ACC_ALL, NULL);
2249 ACC_ALL, NULL);
2250 root = __pa(sp->spt); 2436 root = __pa(sp->spt);
2251 ++sp->root_count; 2437 ++sp->root_count;
2252 spin_unlock(&vcpu->kvm->mmu_lock); 2438 spin_unlock(&vcpu->kvm->mmu_lock);
2253 vcpu->arch.mmu.root_hpa = root; 2439 vcpu->arch.mmu.root_hpa = root;
2254 return 0; 2440 return 0;
2255 } 2441 }
2256 direct = !is_paging(vcpu); 2442
2443 /*
2444 * We shadow a 32 bit page table. This may be a legacy 2-level
2445 * or a PAE 3-level page table. In either case we need to be aware that
2446 * the shadow page table may be a PAE or a long mode page table.
2447 */
2448 pm_mask = PT_PRESENT_MASK;
2449 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL)
2450 pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
2451
2257 for (i = 0; i < 4; ++i) { 2452 for (i = 0; i < 4; ++i) {
2258 hpa_t root = vcpu->arch.mmu.pae_root[i]; 2453 hpa_t root = vcpu->arch.mmu.pae_root[i];
2259 2454
2260 ASSERT(!VALID_PAGE(root)); 2455 ASSERT(!VALID_PAGE(root));
2261 if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { 2456 if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
2262 pdptr = kvm_pdptr_read(vcpu, i); 2457 pdptr = kvm_pdptr_read_mmu(vcpu, &vcpu->arch.mmu, i);
2263 if (!is_present_gpte(pdptr)) { 2458 if (!is_present_gpte(pdptr)) {
2264 vcpu->arch.mmu.pae_root[i] = 0; 2459 vcpu->arch.mmu.pae_root[i] = 0;
2265 continue; 2460 continue;
2266 } 2461 }
2267 root_gfn = pdptr >> PAGE_SHIFT; 2462 root_gfn = pdptr >> PAGE_SHIFT;
2268 } else if (vcpu->arch.mmu.root_level == 0) 2463 if (mmu_check_root(vcpu, root_gfn))
2269 root_gfn = 0; 2464 return 1;
2270 if (mmu_check_root(vcpu, root_gfn))
2271 return 1;
2272 if (tdp_enabled) {
2273 direct = 1;
2274 root_gfn = i << 30;
2275 } 2465 }
2276 spin_lock(&vcpu->kvm->mmu_lock); 2466 spin_lock(&vcpu->kvm->mmu_lock);
2277 kvm_mmu_free_some_pages(vcpu); 2467 kvm_mmu_free_some_pages(vcpu);
2278 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, 2468 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
2279 PT32_ROOT_LEVEL, direct, 2469 PT32_ROOT_LEVEL, 0,
2280 ACC_ALL, NULL); 2470 ACC_ALL, NULL);
2281 root = __pa(sp->spt); 2471 root = __pa(sp->spt);
2282 ++sp->root_count; 2472 ++sp->root_count;
2283 spin_unlock(&vcpu->kvm->mmu_lock); 2473 spin_unlock(&vcpu->kvm->mmu_lock);
2284 2474
2285 vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK; 2475 vcpu->arch.mmu.pae_root[i] = root | pm_mask;
2286 } 2476 }
2287 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); 2477 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
2478
2479 /*
2480 * If we shadow a 32 bit page table with a long mode page
2481 * table we enter this path.
2482 */
2483 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
2484 if (vcpu->arch.mmu.lm_root == NULL) {
2485 /*
2486 * The additional page necessary for this is only
2487 * allocated on demand.
2488 */
2489
2490 u64 *lm_root;
2491
2492 lm_root = (void*)get_zeroed_page(GFP_KERNEL);
2493 if (lm_root == NULL)
2494 return 1;
2495
2496 lm_root[0] = __pa(vcpu->arch.mmu.pae_root) | pm_mask;
2497
2498 vcpu->arch.mmu.lm_root = lm_root;
2499 }
2500
2501 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.lm_root);
2502 }
2503
2288 return 0; 2504 return 0;
2289} 2505}
2290 2506
2507static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
2508{
2509 if (vcpu->arch.mmu.direct_map)
2510 return mmu_alloc_direct_roots(vcpu);
2511 else
2512 return mmu_alloc_shadow_roots(vcpu);
2513}
2514
2291static void mmu_sync_roots(struct kvm_vcpu *vcpu) 2515static void mmu_sync_roots(struct kvm_vcpu *vcpu)
2292{ 2516{
2293 int i; 2517 int i;
2294 struct kvm_mmu_page *sp; 2518 struct kvm_mmu_page *sp;
2295 2519
2520 if (vcpu->arch.mmu.direct_map)
2521 return;
2522
2296 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) 2523 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
2297 return; 2524 return;
2298 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { 2525
2526 trace_kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
2527 if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
2299 hpa_t root = vcpu->arch.mmu.root_hpa; 2528 hpa_t root = vcpu->arch.mmu.root_hpa;
2300 sp = page_header(root); 2529 sp = page_header(root);
2301 mmu_sync_children(vcpu, sp); 2530 mmu_sync_children(vcpu, sp);
@@ -2310,6 +2539,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
2310 mmu_sync_children(vcpu, sp); 2539 mmu_sync_children(vcpu, sp);
2311 } 2540 }
2312 } 2541 }
2542 trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
2313} 2543}
2314 2544
2315void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) 2545void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
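
The split into mmu_alloc_direct_roots()/mmu_alloc_shadow_roots() above also covers the case where a 32-bit (PAE) guest table is shadowed by a four-level shadow table: the four pae_root entries then act as ordinary upper-level PTEs rather than bare PDPTE pointers, so they additionally need the accessed/writable/user bits, and a single lm_root page is allocated on demand to sit above pae_root. The mask selection in isolation (the mask values below are the architectural PTE bit positions and the root-level constant is the kernel's 4-level value, restated here rather than taken from this hunk):

#include <stdint.h>

#define PT_PRESENT_MASK		(1ull << 0)
#define PT_WRITABLE_MASK	(1ull << 1)
#define PT_USER_MASK		(1ull << 2)
#define PT_ACCESSED_MASK	(1ull << 5)

#define PT64_ROOT_LEVEL		4

/* Bits folded into each pae_root entry, depending on the shadow root level. */
static uint64_t pae_root_entry_mask(int shadow_root_level)
{
	uint64_t pm_mask = PT_PRESENT_MASK;

	if (shadow_root_level == PT64_ROOT_LEVEL)
		pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
	return pm_mask;
}
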
@@ -2327,6 +2557,14 @@ static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
2327 return vaddr; 2557 return vaddr;
2328} 2558}
2329 2559
2560static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
2561 u32 access, u32 *error)
2562{
2563 if (error)
2564 *error = 0;
2565 return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access);
2566}
2567
2330static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, 2568static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
2331 u32 error_code) 2569 u32 error_code)
2332{ 2570{
@@ -2393,10 +2631,9 @@ static void nonpaging_free(struct kvm_vcpu *vcpu)
2393 mmu_free_roots(vcpu); 2631 mmu_free_roots(vcpu);
2394} 2632}
2395 2633
2396static int nonpaging_init_context(struct kvm_vcpu *vcpu) 2634static int nonpaging_init_context(struct kvm_vcpu *vcpu,
2635 struct kvm_mmu *context)
2397{ 2636{
2398 struct kvm_mmu *context = &vcpu->arch.mmu;
2399
2400 context->new_cr3 = nonpaging_new_cr3; 2637 context->new_cr3 = nonpaging_new_cr3;
2401 context->page_fault = nonpaging_page_fault; 2638 context->page_fault = nonpaging_page_fault;
2402 context->gva_to_gpa = nonpaging_gva_to_gpa; 2639 context->gva_to_gpa = nonpaging_gva_to_gpa;
@@ -2407,6 +2644,8 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu)
2407 context->root_level = 0; 2644 context->root_level = 0;
2408 context->shadow_root_level = PT32E_ROOT_LEVEL; 2645 context->shadow_root_level = PT32E_ROOT_LEVEL;
2409 context->root_hpa = INVALID_PAGE; 2646 context->root_hpa = INVALID_PAGE;
2647 context->direct_map = true;
2648 context->nx = false;
2410 return 0; 2649 return 0;
2411} 2650}
2412 2651
@@ -2422,11 +2661,14 @@ static void paging_new_cr3(struct kvm_vcpu *vcpu)
2422 mmu_free_roots(vcpu); 2661 mmu_free_roots(vcpu);
2423} 2662}
2424 2663
2425static void inject_page_fault(struct kvm_vcpu *vcpu, 2664static unsigned long get_cr3(struct kvm_vcpu *vcpu)
2426 u64 addr, 2665{
2427 u32 err_code) 2666 return vcpu->arch.cr3;
2667}
2668
2669static void inject_page_fault(struct kvm_vcpu *vcpu)
2428{ 2670{
2429 kvm_inject_page_fault(vcpu, addr, err_code); 2671 vcpu->arch.mmu.inject_page_fault(vcpu);
2430} 2672}
2431 2673
2432static void paging_free(struct kvm_vcpu *vcpu) 2674static void paging_free(struct kvm_vcpu *vcpu)
@@ -2434,12 +2676,12 @@ static void paging_free(struct kvm_vcpu *vcpu)
2434 nonpaging_free(vcpu); 2676 nonpaging_free(vcpu);
2435} 2677}
2436 2678
2437static bool is_rsvd_bits_set(struct kvm_vcpu *vcpu, u64 gpte, int level) 2679static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
2438{ 2680{
2439 int bit7; 2681 int bit7;
2440 2682
2441 bit7 = (gpte >> 7) & 1; 2683 bit7 = (gpte >> 7) & 1;
2442 return (gpte & vcpu->arch.mmu.rsvd_bits_mask[bit7][level-1]) != 0; 2684 return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
2443} 2685}
2444 2686
2445#define PTTYPE 64 2687#define PTTYPE 64
@@ -2450,13 +2692,14 @@ static bool is_rsvd_bits_set(struct kvm_vcpu *vcpu, u64 gpte, int level)
2450#include "paging_tmpl.h" 2692#include "paging_tmpl.h"
2451#undef PTTYPE 2693#undef PTTYPE
2452 2694
2453static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level) 2695static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
2696 struct kvm_mmu *context,
2697 int level)
2454{ 2698{
2455 struct kvm_mmu *context = &vcpu->arch.mmu;
2456 int maxphyaddr = cpuid_maxphyaddr(vcpu); 2699 int maxphyaddr = cpuid_maxphyaddr(vcpu);
2457 u64 exb_bit_rsvd = 0; 2700 u64 exb_bit_rsvd = 0;
2458 2701
2459 if (!is_nx(vcpu)) 2702 if (!context->nx)
2460 exb_bit_rsvd = rsvd_bits(63, 63); 2703 exb_bit_rsvd = rsvd_bits(63, 63);
2461 switch (level) { 2704 switch (level) {
2462 case PT32_ROOT_LEVEL: 2705 case PT32_ROOT_LEVEL:
@@ -2511,9 +2754,13 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
2511 } 2754 }
2512} 2755}
2513 2756
2514static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) 2757static int paging64_init_context_common(struct kvm_vcpu *vcpu,
2758 struct kvm_mmu *context,
2759 int level)
2515{ 2760{
2516 struct kvm_mmu *context = &vcpu->arch.mmu; 2761 context->nx = is_nx(vcpu);
2762
2763 reset_rsvds_bits_mask(vcpu, context, level);
2517 2764
2518 ASSERT(is_pae(vcpu)); 2765 ASSERT(is_pae(vcpu));
2519 context->new_cr3 = paging_new_cr3; 2766 context->new_cr3 = paging_new_cr3;
@@ -2526,20 +2773,23 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
2526 context->root_level = level; 2773 context->root_level = level;
2527 context->shadow_root_level = level; 2774 context->shadow_root_level = level;
2528 context->root_hpa = INVALID_PAGE; 2775 context->root_hpa = INVALID_PAGE;
2776 context->direct_map = false;
2529 return 0; 2777 return 0;
2530} 2778}
2531 2779
2532static int paging64_init_context(struct kvm_vcpu *vcpu) 2780static int paging64_init_context(struct kvm_vcpu *vcpu,
2781 struct kvm_mmu *context)
2533{ 2782{
2534 reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL); 2783 return paging64_init_context_common(vcpu, context, PT64_ROOT_LEVEL);
2535 return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
2536} 2784}
2537 2785
2538static int paging32_init_context(struct kvm_vcpu *vcpu) 2786static int paging32_init_context(struct kvm_vcpu *vcpu,
2787 struct kvm_mmu *context)
2539{ 2788{
2540 struct kvm_mmu *context = &vcpu->arch.mmu; 2789 context->nx = false;
2790
2791 reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL);
2541 2792
2542 reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL);
2543 context->new_cr3 = paging_new_cr3; 2793 context->new_cr3 = paging_new_cr3;
2544 context->page_fault = paging32_page_fault; 2794 context->page_fault = paging32_page_fault;
2545 context->gva_to_gpa = paging32_gva_to_gpa; 2795 context->gva_to_gpa = paging32_gva_to_gpa;
@@ -2550,18 +2800,19 @@ static int paging32_init_context(struct kvm_vcpu *vcpu)
2550 context->root_level = PT32_ROOT_LEVEL; 2800 context->root_level = PT32_ROOT_LEVEL;
2551 context->shadow_root_level = PT32E_ROOT_LEVEL; 2801 context->shadow_root_level = PT32E_ROOT_LEVEL;
2552 context->root_hpa = INVALID_PAGE; 2802 context->root_hpa = INVALID_PAGE;
2803 context->direct_map = false;
2553 return 0; 2804 return 0;
2554} 2805}
2555 2806
2556static int paging32E_init_context(struct kvm_vcpu *vcpu) 2807static int paging32E_init_context(struct kvm_vcpu *vcpu,
2808 struct kvm_mmu *context)
2557{ 2809{
2558 reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL); 2810 return paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL);
2559 return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
2560} 2811}
2561 2812
2562static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) 2813static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
2563{ 2814{
2564 struct kvm_mmu *context = &vcpu->arch.mmu; 2815 struct kvm_mmu *context = vcpu->arch.walk_mmu;
2565 2816
2566 context->new_cr3 = nonpaging_new_cr3; 2817 context->new_cr3 = nonpaging_new_cr3;
2567 context->page_fault = tdp_page_fault; 2818 context->page_fault = tdp_page_fault;
@@ -2571,20 +2822,29 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
2571 context->invlpg = nonpaging_invlpg; 2822 context->invlpg = nonpaging_invlpg;
2572 context->shadow_root_level = kvm_x86_ops->get_tdp_level(); 2823 context->shadow_root_level = kvm_x86_ops->get_tdp_level();
2573 context->root_hpa = INVALID_PAGE; 2824 context->root_hpa = INVALID_PAGE;
2825 context->direct_map = true;
2826 context->set_cr3 = kvm_x86_ops->set_tdp_cr3;
2827 context->get_cr3 = get_cr3;
2828 context->inject_page_fault = kvm_inject_page_fault;
2829 context->nx = is_nx(vcpu);
2574 2830
2575 if (!is_paging(vcpu)) { 2831 if (!is_paging(vcpu)) {
2832 context->nx = false;
2576 context->gva_to_gpa = nonpaging_gva_to_gpa; 2833 context->gva_to_gpa = nonpaging_gva_to_gpa;
2577 context->root_level = 0; 2834 context->root_level = 0;
2578 } else if (is_long_mode(vcpu)) { 2835 } else if (is_long_mode(vcpu)) {
2579 reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL); 2836 context->nx = is_nx(vcpu);
2837 reset_rsvds_bits_mask(vcpu, context, PT64_ROOT_LEVEL);
2580 context->gva_to_gpa = paging64_gva_to_gpa; 2838 context->gva_to_gpa = paging64_gva_to_gpa;
2581 context->root_level = PT64_ROOT_LEVEL; 2839 context->root_level = PT64_ROOT_LEVEL;
2582 } else if (is_pae(vcpu)) { 2840 } else if (is_pae(vcpu)) {
2583 reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL); 2841 context->nx = is_nx(vcpu);
2842 reset_rsvds_bits_mask(vcpu, context, PT32E_ROOT_LEVEL);
2584 context->gva_to_gpa = paging64_gva_to_gpa; 2843 context->gva_to_gpa = paging64_gva_to_gpa;
2585 context->root_level = PT32E_ROOT_LEVEL; 2844 context->root_level = PT32E_ROOT_LEVEL;
2586 } else { 2845 } else {
2587 reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL); 2846 context->nx = false;
2847 reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL);
2588 context->gva_to_gpa = paging32_gva_to_gpa; 2848 context->gva_to_gpa = paging32_gva_to_gpa;
2589 context->root_level = PT32_ROOT_LEVEL; 2849 context->root_level = PT32_ROOT_LEVEL;
2590 } 2850 }
@@ -2592,33 +2852,83 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
2592 return 0; 2852 return 0;
2593} 2853}
2594 2854
2595static int init_kvm_softmmu(struct kvm_vcpu *vcpu) 2855int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
2596{ 2856{
2597 int r; 2857 int r;
2598
2599 ASSERT(vcpu); 2858 ASSERT(vcpu);
2600 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); 2859 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
2601 2860
2602 if (!is_paging(vcpu)) 2861 if (!is_paging(vcpu))
2603 r = nonpaging_init_context(vcpu); 2862 r = nonpaging_init_context(vcpu, context);
2604 else if (is_long_mode(vcpu)) 2863 else if (is_long_mode(vcpu))
2605 r = paging64_init_context(vcpu); 2864 r = paging64_init_context(vcpu, context);
2606 else if (is_pae(vcpu)) 2865 else if (is_pae(vcpu))
2607 r = paging32E_init_context(vcpu); 2866 r = paging32E_init_context(vcpu, context);
2608 else 2867 else
2609 r = paging32_init_context(vcpu); 2868 r = paging32_init_context(vcpu, context);
2610 2869
2611 vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu); 2870 vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
2612 vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu); 2871 vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu);
2613 2872
2614 return r; 2873 return r;
2615} 2874}
2875EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
2876
2877static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
2878{
2879 int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu);
2880
2881 vcpu->arch.walk_mmu->set_cr3 = kvm_x86_ops->set_cr3;
2882 vcpu->arch.walk_mmu->get_cr3 = get_cr3;
2883 vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
2884
2885 return r;
2886}
2887
2888static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
2889{
2890 struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
2891
2892 g_context->get_cr3 = get_cr3;
2893 g_context->inject_page_fault = kvm_inject_page_fault;
2894
2895 /*
2896 * Note that arch.mmu.gva_to_gpa translates l2_gva to l1_gpa. The
2897 * translation of l2_gpa to l1_gpa addresses is done using the
2898 * arch.nested_mmu.gva_to_gpa function. Basically the gva_to_gpa
2899 * functions between mmu and nested_mmu are swapped.
2900 */
2901 if (!is_paging(vcpu)) {
2902 g_context->nx = false;
2903 g_context->root_level = 0;
2904 g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested;
2905 } else if (is_long_mode(vcpu)) {
2906 g_context->nx = is_nx(vcpu);
2907 reset_rsvds_bits_mask(vcpu, g_context, PT64_ROOT_LEVEL);
2908 g_context->root_level = PT64_ROOT_LEVEL;
2909 g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
2910 } else if (is_pae(vcpu)) {
2911 g_context->nx = is_nx(vcpu);
2912 reset_rsvds_bits_mask(vcpu, g_context, PT32E_ROOT_LEVEL);
2913 g_context->root_level = PT32E_ROOT_LEVEL;
2914 g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
2915 } else {
2916 g_context->nx = false;
2917 reset_rsvds_bits_mask(vcpu, g_context, PT32_ROOT_LEVEL);
2918 g_context->root_level = PT32_ROOT_LEVEL;
2919 g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
2920 }
2921
2922 return 0;
2923}
2616 2924
2617static int init_kvm_mmu(struct kvm_vcpu *vcpu) 2925static int init_kvm_mmu(struct kvm_vcpu *vcpu)
2618{ 2926{
2619 vcpu->arch.update_pte.pfn = bad_pfn; 2927 vcpu->arch.update_pte.pfn = bad_pfn;
2620 2928
2621 if (tdp_enabled) 2929 if (mmu_is_nested(vcpu))
2930 return init_kvm_nested_mmu(vcpu);
2931 else if (tdp_enabled)
2622 return init_kvm_tdp_mmu(vcpu); 2932 return init_kvm_tdp_mmu(vcpu);
2623 else 2933 else
2624 return init_kvm_softmmu(vcpu); 2934 return init_kvm_softmmu(vcpu);
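
init_kvm_nested_mmu() wires up the second dimension of translation used when the guest itself pages under nested paging: one stage walks the L2 guest's own page tables, the other maps the resulting L2-physical addresses to L1-physical ones, and the gva_to_gpa hooks of mmu and nested_mmu are swapped so each kvm_mmu ends up providing the stage its callers expect. The composition in miniature (types and names invented for the sketch; the inner walk would itself use the gpa stage for every table access, which is left abstract here):

#include <stdint.h>

typedef uint64_t gva_t;
typedef uint64_t gpa_t;

/* One translation stage, standing in for a kvm_mmu gva_to_gpa hook. */
typedef gpa_t (*xlate_fn)(uint64_t addr);

/*
 * Resolving an L2 virtual address takes two hops: walk the L2 guest's page
 * tables (l2_gva -> l2_gpa), then translate the result into an address the
 * host can act on (l2_gpa -> l1_gpa).
 */
static gpa_t translate_l2_gva(xlate_fn walk_l2_tables, xlate_fn l2_gpa_to_l1_gpa,
			      gva_t l2_gva)
{
	gpa_t l2_gpa = walk_l2_tables(l2_gva);

	return l2_gpa_to_l1_gpa(l2_gpa);
}
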
@@ -2653,7 +2963,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
2653 if (r) 2963 if (r)
2654 goto out; 2964 goto out;
2655 /* set_cr3() should ensure TLB has been flushed */ 2965 /* set_cr3() should ensure TLB has been flushed */
2656 kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa); 2966 vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
2657out: 2967out:
2658 return r; 2968 return r;
2659} 2969}
@@ -2663,6 +2973,7 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu)
2663{ 2973{
2664 mmu_free_roots(vcpu); 2974 mmu_free_roots(vcpu);
2665} 2975}
2976EXPORT_SYMBOL_GPL(kvm_mmu_unload);
2666 2977
2667static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, 2978static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
2668 struct kvm_mmu_page *sp, 2979 struct kvm_mmu_page *sp,
@@ -2695,7 +3006,7 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
2695 return; 3006 return;
2696 } 3007 }
2697 3008
2698 if (is_rsvd_bits_set(vcpu, *(u64 *)new, PT_PAGE_TABLE_LEVEL)) 3009 if (is_rsvd_bits_set(&vcpu->arch.mmu, *(u64 *)new, PT_PAGE_TABLE_LEVEL))
2699 return; 3010 return;
2700 3011
2701 ++vcpu->kvm->stat.mmu_pte_updated; 3012 ++vcpu->kvm->stat.mmu_pte_updated;
@@ -2837,7 +3148,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2837 kvm_mmu_access_page(vcpu, gfn); 3148 kvm_mmu_access_page(vcpu, gfn);
2838 kvm_mmu_free_some_pages(vcpu); 3149 kvm_mmu_free_some_pages(vcpu);
2839 ++vcpu->kvm->stat.mmu_pte_write; 3150 ++vcpu->kvm->stat.mmu_pte_write;
2840 kvm_mmu_audit(vcpu, "pre pte write"); 3151 trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
2841 if (guest_initiated) { 3152 if (guest_initiated) {
2842 if (gfn == vcpu->arch.last_pt_write_gfn 3153 if (gfn == vcpu->arch.last_pt_write_gfn
2843 && !last_updated_pte_accessed(vcpu)) { 3154 && !last_updated_pte_accessed(vcpu)) {
@@ -2910,7 +3221,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2910 } 3221 }
2911 mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush); 3222 mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush);
2912 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); 3223 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
2913 kvm_mmu_audit(vcpu, "post pte write"); 3224 trace_kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
2914 spin_unlock(&vcpu->kvm->mmu_lock); 3225 spin_unlock(&vcpu->kvm->mmu_lock);
2915 if (!is_error_pfn(vcpu->arch.update_pte.pfn)) { 3226 if (!is_error_pfn(vcpu->arch.update_pte.pfn)) {
2916 kvm_release_pfn_clean(vcpu->arch.update_pte.pfn); 3227 kvm_release_pfn_clean(vcpu->arch.update_pte.pfn);
@@ -2923,7 +3234,7 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
2923 gpa_t gpa; 3234 gpa_t gpa;
2924 int r; 3235 int r;
2925 3236
2926 if (tdp_enabled) 3237 if (vcpu->arch.mmu.direct_map)
2927 return 0; 3238 return 0;
2928 3239
2929 gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); 3240 gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
@@ -2937,21 +3248,18 @@ EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
2937 3248
2938void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) 3249void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
2939{ 3250{
2940 int free_pages;
2941 LIST_HEAD(invalid_list); 3251 LIST_HEAD(invalid_list);
2942 3252
2943 free_pages = vcpu->kvm->arch.n_free_mmu_pages; 3253 while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES &&
2944 while (free_pages < KVM_REFILL_PAGES &&
2945 !list_empty(&vcpu->kvm->arch.active_mmu_pages)) { 3254 !list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
2946 struct kvm_mmu_page *sp; 3255 struct kvm_mmu_page *sp;
2947 3256
2948 sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev, 3257 sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
2949 struct kvm_mmu_page, link); 3258 struct kvm_mmu_page, link);
2950 free_pages += kvm_mmu_prepare_zap_page(vcpu->kvm, sp, 3259 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
2951 &invalid_list); 3260 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
2952 ++vcpu->kvm->stat.mmu_recycled; 3261 ++vcpu->kvm->stat.mmu_recycled;
2953 } 3262 }
2954 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
2955} 3263}
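
The rewritten __kvm_mmu_free_some_pages() above recomputes the number of available pages each iteration (via kvm_mmu_available_pages) and commits each zap immediately, instead of tracking a local free_pages count. A toy model of that "evict the oldest entry until we are back above the refill threshold" loop, using plain integers rather than kvm_mmu_page and with illustrative constants:

#include <stdio.h>

#define MAX_PAGES    8
#define REFILL_PAGES 3

/* Toy cache: pages[0] is the oldest entry, pages[used-1] the newest. */
static int pages[MAX_PAGES];
static int used;

static int available(void) { return MAX_PAGES - used; }

static void zap_oldest(void)
{
	/* Drop pages[0]; shift the rest down (the kernel walks a list instead). */
	for (int i = 1; i < used; i++)
		pages[i - 1] = pages[i];
	used--;
}

static void free_some_pages(void)
{
	while (available() < REFILL_PAGES && used > 0) {
		zap_oldest();
		printf("recycled one page, %d now available\n", available());
	}
}

int main(void)
{
	for (used = 0; used < MAX_PAGES; used++)
		pages[used] = used;	/* cache is completely full */
	free_some_pages();
	return 0;
}
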
2956 3264
2957int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) 3265int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
@@ -3013,6 +3321,8 @@ EXPORT_SYMBOL_GPL(kvm_disable_tdp);
3013static void free_mmu_pages(struct kvm_vcpu *vcpu) 3321static void free_mmu_pages(struct kvm_vcpu *vcpu)
3014{ 3322{
3015 free_page((unsigned long)vcpu->arch.mmu.pae_root); 3323 free_page((unsigned long)vcpu->arch.mmu.pae_root);
3324 if (vcpu->arch.mmu.lm_root != NULL)
3325 free_page((unsigned long)vcpu->arch.mmu.lm_root);
3016} 3326}
3017 3327
3018static int alloc_mmu_pages(struct kvm_vcpu *vcpu) 3328static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
@@ -3054,15 +3364,6 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu)
3054 return init_kvm_mmu(vcpu); 3364 return init_kvm_mmu(vcpu);
3055} 3365}
3056 3366
3057void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
3058{
3059 ASSERT(vcpu);
3060
3061 destroy_kvm_mmu(vcpu);
3062 free_mmu_pages(vcpu);
3063 mmu_free_memory_caches(vcpu);
3064}
3065
3066void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) 3367void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
3067{ 3368{
3068 struct kvm_mmu_page *sp; 3369 struct kvm_mmu_page *sp;
@@ -3112,23 +3413,22 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
3112{ 3413{
3113 struct kvm *kvm; 3414 struct kvm *kvm;
3114 struct kvm *kvm_freed = NULL; 3415 struct kvm *kvm_freed = NULL;
3115 int cache_count = 0; 3416
3417 if (nr_to_scan == 0)
3418 goto out;
3116 3419
3117 spin_lock(&kvm_lock); 3420 spin_lock(&kvm_lock);
3118 3421
3119 list_for_each_entry(kvm, &vm_list, vm_list) { 3422 list_for_each_entry(kvm, &vm_list, vm_list) {
3120 int npages, idx, freed_pages; 3423 int idx, freed_pages;
3121 LIST_HEAD(invalid_list); 3424 LIST_HEAD(invalid_list);
3122 3425
3123 idx = srcu_read_lock(&kvm->srcu); 3426 idx = srcu_read_lock(&kvm->srcu);
3124 spin_lock(&kvm->mmu_lock); 3427 spin_lock(&kvm->mmu_lock);
3125 npages = kvm->arch.n_alloc_mmu_pages - 3428 if (!kvm_freed && nr_to_scan > 0 &&
3126 kvm->arch.n_free_mmu_pages; 3429 kvm->arch.n_used_mmu_pages > 0) {
3127 cache_count += npages;
3128 if (!kvm_freed && nr_to_scan > 0 && npages > 0) {
3129 freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm, 3430 freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm,
3130 &invalid_list); 3431 &invalid_list);
3131 cache_count -= freed_pages;
3132 kvm_freed = kvm; 3432 kvm_freed = kvm;
3133 } 3433 }
3134 nr_to_scan--; 3434 nr_to_scan--;
@@ -3142,7 +3442,8 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
3142 3442
3143 spin_unlock(&kvm_lock); 3443 spin_unlock(&kvm_lock);
3144 3444
3145 return cache_count; 3445out:
3446 return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
3146} 3447}
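
With this change the shrinker no longer walks every VM to add up n_alloc - n_free; it reports a single global, approximately-accurate count (percpu_counter_read_positive of kvm_total_used_mmu_pages). A rough user-space sketch of the idea, assuming simple per-slot counters and no batching or locking (the real percpu_counter does both):

#include <stdio.h>

#define NR_CPUS 4

/* Toy percpu counter: each CPU updates its own slot, readers sum them. */
static long counts[NR_CPUS];

static void counter_add(int cpu, long delta) { counts[cpu] += delta; }

static long counter_read_positive(void)
{
	long sum = 0;

	for (int i = 0; i < NR_CPUS; i++)
		sum += counts[i];
	return sum > 0 ? sum : 0;	/* never report a negative total */
}

/* Shrinker-style callback: with nothing to scan, just report the estimate. */
static long mmu_shrink_count(int nr_to_scan)
{
	if (nr_to_scan == 0)
		return counter_read_positive();
	/* ...the scan/free path would go here... */
	return counter_read_positive();
}

int main(void)
{
	counter_add(0, 10);
	counter_add(1, 7);
	counter_add(2, -3);	/* frees accounted on another CPU */
	printf("estimated used pages: %ld\n", mmu_shrink_count(0));
	return 0;
}

The estimate can lag slightly behind reality, which is acceptable for a shrinker hint and much cheaper than taking kvm_lock and every mmu_lock just to produce a number.
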
3147 3448
3148static struct shrinker mmu_shrinker = { 3449static struct shrinker mmu_shrinker = {
@@ -3163,6 +3464,7 @@ static void mmu_destroy_caches(void)
3163void kvm_mmu_module_exit(void) 3464void kvm_mmu_module_exit(void)
3164{ 3465{
3165 mmu_destroy_caches(); 3466 mmu_destroy_caches();
3467 percpu_counter_destroy(&kvm_total_used_mmu_pages);
3166 unregister_shrinker(&mmu_shrinker); 3468 unregister_shrinker(&mmu_shrinker);
3167} 3469}
3168 3470
@@ -3185,6 +3487,9 @@ int kvm_mmu_module_init(void)
3185 if (!mmu_page_header_cache) 3487 if (!mmu_page_header_cache)
3186 goto nomem; 3488 goto nomem;
3187 3489
3490 if (percpu_counter_init(&kvm_total_used_mmu_pages, 0))
3491 goto nomem;
3492
3188 register_shrinker(&mmu_shrinker); 3493 register_shrinker(&mmu_shrinker);
3189 3494
3190 return 0; 3495 return 0;
@@ -3355,271 +3660,18 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
3355} 3660}
3356EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy); 3661EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);
3357 3662
3358#ifdef AUDIT 3663#ifdef CONFIG_KVM_MMU_AUDIT
3359 3664#include "mmu_audit.c"
3360static const char *audit_msg; 3665#else
3361 3666static void mmu_audit_disable(void) { }
3362static gva_t canonicalize(gva_t gva)
3363{
3364#ifdef CONFIG_X86_64
3365 gva = (long long)(gva << 16) >> 16;
3366#endif 3667#endif
3367 return gva;
3368}
3369
3370
3371typedef void (*inspect_spte_fn) (struct kvm *kvm, u64 *sptep);
3372
3373static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp,
3374 inspect_spte_fn fn)
3375{
3376 int i;
3377
3378 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
3379 u64 ent = sp->spt[i];
3380
3381 if (is_shadow_present_pte(ent)) {
3382 if (!is_last_spte(ent, sp->role.level)) {
3383 struct kvm_mmu_page *child;
3384 child = page_header(ent & PT64_BASE_ADDR_MASK);
3385 __mmu_spte_walk(kvm, child, fn);
3386 } else
3387 fn(kvm, &sp->spt[i]);
3388 }
3389 }
3390}
3391
3392static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
3393{
3394 int i;
3395 struct kvm_mmu_page *sp;
3396
3397 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
3398 return;
3399 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
3400 hpa_t root = vcpu->arch.mmu.root_hpa;
3401 sp = page_header(root);
3402 __mmu_spte_walk(vcpu->kvm, sp, fn);
3403 return;
3404 }
3405 for (i = 0; i < 4; ++i) {
3406 hpa_t root = vcpu->arch.mmu.pae_root[i];
3407
3408 if (root && VALID_PAGE(root)) {
3409 root &= PT64_BASE_ADDR_MASK;
3410 sp = page_header(root);
3411 __mmu_spte_walk(vcpu->kvm, sp, fn);
3412 }
3413 }
3414 return;
3415}
3416
3417static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
3418 gva_t va, int level)
3419{
3420 u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
3421 int i;
3422 gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));
3423
3424 for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
3425 u64 ent = pt[i];
3426
3427 if (ent == shadow_trap_nonpresent_pte)
3428 continue;
3429
3430 va = canonicalize(va);
3431 if (is_shadow_present_pte(ent) && !is_last_spte(ent, level))
3432 audit_mappings_page(vcpu, ent, va, level - 1);
3433 else {
3434 gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, va, NULL);
3435 gfn_t gfn = gpa >> PAGE_SHIFT;
3436 pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn);
3437 hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT;
3438 3668
3439 if (is_error_pfn(pfn)) { 3669void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
3440 kvm_release_pfn_clean(pfn);
3441 continue;
3442 }
3443
3444 if (is_shadow_present_pte(ent)
3445 && (ent & PT64_BASE_ADDR_MASK) != hpa)
3446 printk(KERN_ERR "xx audit error: (%s) levels %d"
3447 " gva %lx gpa %llx hpa %llx ent %llx %d\n",
3448 audit_msg, vcpu->arch.mmu.root_level,
3449 va, gpa, hpa, ent,
3450 is_shadow_present_pte(ent));
3451 else if (ent == shadow_notrap_nonpresent_pte
3452 && !is_error_hpa(hpa))
3453 printk(KERN_ERR "audit: (%s) notrap shadow,"
3454 " valid guest gva %lx\n", audit_msg, va);
3455 kvm_release_pfn_clean(pfn);
3456
3457 }
3458 }
3459}
3460
3461static void audit_mappings(struct kvm_vcpu *vcpu)
3462{
3463 unsigned i;
3464
3465 if (vcpu->arch.mmu.root_level == 4)
3466 audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4);
3467 else
3468 for (i = 0; i < 4; ++i)
3469 if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK)
3470 audit_mappings_page(vcpu,
3471 vcpu->arch.mmu.pae_root[i],
3472 i << 30,
3473 2);
3474}
3475
3476static int count_rmaps(struct kvm_vcpu *vcpu)
3477{
3478 struct kvm *kvm = vcpu->kvm;
3479 struct kvm_memslots *slots;
3480 int nmaps = 0;
3481 int i, j, k, idx;
3482
3483 idx = srcu_read_lock(&kvm->srcu);
3484 slots = kvm_memslots(kvm);
3485 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
3486 struct kvm_memory_slot *m = &slots->memslots[i];
3487 struct kvm_rmap_desc *d;
3488
3489 for (j = 0; j < m->npages; ++j) {
3490 unsigned long *rmapp = &m->rmap[j];
3491
3492 if (!*rmapp)
3493 continue;
3494 if (!(*rmapp & 1)) {
3495 ++nmaps;
3496 continue;
3497 }
3498 d = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
3499 while (d) {
3500 for (k = 0; k < RMAP_EXT; ++k)
3501 if (d->sptes[k])
3502 ++nmaps;
3503 else
3504 break;
3505 d = d->more;
3506 }
3507 }
3508 }
3509 srcu_read_unlock(&kvm->srcu, idx);
3510 return nmaps;
3511}
3512
3513void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
3514{
3515 unsigned long *rmapp;
3516 struct kvm_mmu_page *rev_sp;
3517 gfn_t gfn;
3518
3519 if (is_writable_pte(*sptep)) {
3520 rev_sp = page_header(__pa(sptep));
3521 gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt);
3522
3523 if (!gfn_to_memslot(kvm, gfn)) {
3524 if (!printk_ratelimit())
3525 return;
3526 printk(KERN_ERR "%s: no memslot for gfn %ld\n",
3527 audit_msg, gfn);
3528 printk(KERN_ERR "%s: index %ld of sp (gfn=%lx)\n",
3529 audit_msg, (long int)(sptep - rev_sp->spt),
3530 rev_sp->gfn);
3531 dump_stack();
3532 return;
3533 }
3534
3535 rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level);
3536 if (!*rmapp) {
3537 if (!printk_ratelimit())
3538 return;
3539 printk(KERN_ERR "%s: no rmap for writable spte %llx\n",
3540 audit_msg, *sptep);
3541 dump_stack();
3542 }
3543 }
3544
3545}
3546
3547void audit_writable_sptes_have_rmaps(struct kvm_vcpu *vcpu)
3548{
3549 mmu_spte_walk(vcpu, inspect_spte_has_rmap);
3550}
3551
3552static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu)
3553{
3554 struct kvm_mmu_page *sp;
3555 int i;
3556
3557 list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
3558 u64 *pt = sp->spt;
3559
3560 if (sp->role.level != PT_PAGE_TABLE_LEVEL)
3561 continue;
3562
3563 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
3564 u64 ent = pt[i];
3565
3566 if (!(ent & PT_PRESENT_MASK))
3567 continue;
3568 if (!is_writable_pte(ent))
3569 continue;
3570 inspect_spte_has_rmap(vcpu->kvm, &pt[i]);
3571 }
3572 }
3573 return;
3574}
3575
3576static void audit_rmap(struct kvm_vcpu *vcpu)
3577{
3578 check_writable_mappings_rmap(vcpu);
3579 count_rmaps(vcpu);
3580}
3581
3582static void audit_write_protection(struct kvm_vcpu *vcpu)
3583{
3584 struct kvm_mmu_page *sp;
3585 struct kvm_memory_slot *slot;
3586 unsigned long *rmapp;
3587 u64 *spte;
3588 gfn_t gfn;
3589
3590 list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
3591 if (sp->role.direct)
3592 continue;
3593 if (sp->unsync)
3594 continue;
3595
3596 slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
3597 rmapp = &slot->rmap[gfn - slot->base_gfn];
3598
3599 spte = rmap_next(vcpu->kvm, rmapp, NULL);
3600 while (spte) {
3601 if (is_writable_pte(*spte))
3602 printk(KERN_ERR "%s: (%s) shadow page has "
3603 "writable mappings: gfn %lx role %x\n",
3604 __func__, audit_msg, sp->gfn,
3605 sp->role.word);
3606 spte = rmap_next(vcpu->kvm, rmapp, spte);
3607 }
3608 }
3609}
3610
3611static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg)
3612{ 3670{
3613 int olddbg = dbg; 3671 ASSERT(vcpu);
3614 3672
3615 dbg = 0; 3673 destroy_kvm_mmu(vcpu);
3616 audit_msg = msg; 3674 free_mmu_pages(vcpu);
3617 audit_rmap(vcpu); 3675 mmu_free_memory_caches(vcpu);
3618 audit_write_protection(vcpu); 3676 mmu_audit_disable();
3619 if (strcmp("pre pte write", audit_msg) != 0)
3620 audit_mappings(vcpu);
3621 audit_writable_sptes_have_rmaps(vcpu);
3622 dbg = olddbg;
3623} 3677}
3624
3625#endif
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index be66759321a5..7086ca85d3e7 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -49,10 +49,17 @@
49#define PFERR_FETCH_MASK (1U << 4) 49#define PFERR_FETCH_MASK (1U << 4)
50 50
51int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]); 51int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]);
52int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
53
54static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
55{
56 return kvm->arch.n_max_mmu_pages -
57 kvm->arch.n_used_mmu_pages;
58}
52 59
53static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) 60static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
54{ 61{
55 if (unlikely(vcpu->kvm->arch.n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES)) 62 if (unlikely(kvm_mmu_available_pages(vcpu->kvm) < KVM_MIN_FREE_MMU_PAGES))
56 __kvm_mmu_free_some_pages(vcpu); 63 __kvm_mmu_free_some_pages(vcpu);
57} 64}
58 65
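
The mmu.h hunk above switches the accounting from a free-pages counter to a used-pages counter, deriving "available = max - used" on demand. A tiny sketch of why that is convenient (field names mirror the patch, the rest is illustrative): only the alloc/free paths touch n_used_mmu_pages, so changing the cap needs no counter fixup.

#include <stdio.h>

struct mmu_stats {
	unsigned int n_max_mmu_pages;	/* cap, may be changed at runtime */
	unsigned int n_used_mmu_pages;	/* only alloc/free paths touch this */
};

static unsigned int available_pages(const struct mmu_stats *s)
{
	return s->n_max_mmu_pages - s->n_used_mmu_pages;
}

static int need_to_free_some(const struct mmu_stats *s, unsigned int min_free)
{
	return available_pages(s) < min_free;
}

int main(void)
{
	struct mmu_stats s = { .n_max_mmu_pages = 64, .n_used_mmu_pages = 60 };

	printf("available=%u need_free=%d\n",
	       available_pages(&s), need_to_free_some(&s, 5));
	s.n_max_mmu_pages = 128;	/* raising the cap needs no counter fixup */
	printf("available=%u need_free=%d\n",
	       available_pages(&s), need_to_free_some(&s, 5));
	return 0;
}
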
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
new file mode 100644
index 000000000000..ba2bcdde6221
--- /dev/null
+++ b/arch/x86/kvm/mmu_audit.c
@@ -0,0 +1,299 @@
1/*
2 * mmu_audit.c:
3 *
4 * Audit code for KVM MMU
5 *
6 * Copyright (C) 2006 Qumranet, Inc.
7 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
8 *
9 * Authors:
10 * Yaniv Kamay <yaniv@qumranet.com>
11 * Avi Kivity <avi@qumranet.com>
12 * Marcelo Tosatti <mtosatti@redhat.com>
13 * Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
14 *
15 * This work is licensed under the terms of the GNU GPL, version 2. See
16 * the COPYING file in the top-level directory.
17 *
18 */
19
20#include <linux/ratelimit.h>
21
22static int audit_point;
23
24#define audit_printk(fmt, args...) \
25 printk(KERN_ERR "audit: (%s) error: " \
26 fmt, audit_point_name[audit_point], ##args)
27
28typedef void (*inspect_spte_fn) (struct kvm_vcpu *vcpu, u64 *sptep, int level);
29
30static void __mmu_spte_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
31 inspect_spte_fn fn, int level)
32{
33 int i;
34
35 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
36 u64 *ent = sp->spt;
37
38 fn(vcpu, ent + i, level);
39
40 if (is_shadow_present_pte(ent[i]) &&
41 !is_last_spte(ent[i], level)) {
42 struct kvm_mmu_page *child;
43
44 child = page_header(ent[i] & PT64_BASE_ADDR_MASK);
45 __mmu_spte_walk(vcpu, child, fn, level - 1);
46 }
47 }
48}
49
50static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
51{
52 int i;
53 struct kvm_mmu_page *sp;
54
55 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
56 return;
57
58 if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
59 hpa_t root = vcpu->arch.mmu.root_hpa;
60
61 sp = page_header(root);
62 __mmu_spte_walk(vcpu, sp, fn, PT64_ROOT_LEVEL);
63 return;
64 }
65
66 for (i = 0; i < 4; ++i) {
67 hpa_t root = vcpu->arch.mmu.pae_root[i];
68
69 if (root && VALID_PAGE(root)) {
70 root &= PT64_BASE_ADDR_MASK;
71 sp = page_header(root);
72 __mmu_spte_walk(vcpu, sp, fn, 2);
73 }
74 }
75
76 return;
77}
78
79typedef void (*sp_handler) (struct kvm *kvm, struct kvm_mmu_page *sp);
80
81static void walk_all_active_sps(struct kvm *kvm, sp_handler fn)
82{
83 struct kvm_mmu_page *sp;
84
85 list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link)
86 fn(kvm, sp);
87}
88
89static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
90{
91 struct kvm_mmu_page *sp;
92 gfn_t gfn;
93 pfn_t pfn;
94 hpa_t hpa;
95
96 sp = page_header(__pa(sptep));
97
98 if (sp->unsync) {
99 if (level != PT_PAGE_TABLE_LEVEL) {
100 audit_printk("unsync sp: %p level = %d\n", sp, level);
101 return;
102 }
103
104 if (*sptep == shadow_notrap_nonpresent_pte) {
105 audit_printk("notrap spte in unsync sp: %p\n", sp);
106 return;
107 }
108 }
109
110 if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) {
111 audit_printk("notrap spte in direct sp: %p\n", sp);
112 return;
113 }
114
115 if (!is_shadow_present_pte(*sptep) || !is_last_spte(*sptep, level))
116 return;
117
118 gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
119 pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn);
120
121 if (is_error_pfn(pfn)) {
122 kvm_release_pfn_clean(pfn);
123 return;
124 }
125
126 hpa = pfn << PAGE_SHIFT;
127 if ((*sptep & PT64_BASE_ADDR_MASK) != hpa)
128 audit_printk("levels %d pfn %llx hpa %llx ent %llxn",
129 vcpu->arch.mmu.root_level, pfn, hpa, *sptep);
130}
131
132static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
133{
134 unsigned long *rmapp;
135 struct kvm_mmu_page *rev_sp;
136 gfn_t gfn;
137
138
139 rev_sp = page_header(__pa(sptep));
140 gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt);
141
142 if (!gfn_to_memslot(kvm, gfn)) {
143 if (!printk_ratelimit())
144 return;
145 audit_printk("no memslot for gfn %llx\n", gfn);
146 audit_printk("index %ld of sp (gfn=%llx)\n",
147 (long int)(sptep - rev_sp->spt), rev_sp->gfn);
148 dump_stack();
149 return;
150 }
151
152 rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level);
153 if (!*rmapp) {
154 if (!printk_ratelimit())
155 return;
156 audit_printk("no rmap for writable spte %llx\n", *sptep);
157 dump_stack();
158 }
159}
160
161static void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu, u64 *sptep, int level)
162{
163 if (is_shadow_present_pte(*sptep) && is_last_spte(*sptep, level))
164 inspect_spte_has_rmap(vcpu->kvm, sptep);
165}
166
167static void audit_spte_after_sync(struct kvm_vcpu *vcpu, u64 *sptep, int level)
168{
169 struct kvm_mmu_page *sp = page_header(__pa(sptep));
170
171 if (audit_point == AUDIT_POST_SYNC && sp->unsync)
172 audit_printk("meet unsync sp(%p) after sync root.\n", sp);
173}
174
175static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp)
176{
177 int i;
178
179 if (sp->role.level != PT_PAGE_TABLE_LEVEL)
180 return;
181
182 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
183 if (!is_rmap_spte(sp->spt[i]))
184 continue;
185
186 inspect_spte_has_rmap(kvm, sp->spt + i);
187 }
188}
189
190static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
191{
192 struct kvm_memory_slot *slot;
193 unsigned long *rmapp;
194 u64 *spte;
195
196 if (sp->role.direct || sp->unsync || sp->role.invalid)
197 return;
198
199 slot = gfn_to_memslot(kvm, sp->gfn);
200 rmapp = &slot->rmap[sp->gfn - slot->base_gfn];
201
202 spte = rmap_next(kvm, rmapp, NULL);
203 while (spte) {
204 if (is_writable_pte(*spte))
205 audit_printk("shadow page has writable mappings: gfn "
206 "%llx role %x\n", sp->gfn, sp->role.word);
207 spte = rmap_next(kvm, rmapp, spte);
208 }
209}
210
211static void audit_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
212{
213 check_mappings_rmap(kvm, sp);
214 audit_write_protection(kvm, sp);
215}
216
217static void audit_all_active_sps(struct kvm *kvm)
218{
219 walk_all_active_sps(kvm, audit_sp);
220}
221
222static void audit_spte(struct kvm_vcpu *vcpu, u64 *sptep, int level)
223{
224 audit_sptes_have_rmaps(vcpu, sptep, level);
225 audit_mappings(vcpu, sptep, level);
226 audit_spte_after_sync(vcpu, sptep, level);
227}
228
229static void audit_vcpu_spte(struct kvm_vcpu *vcpu)
230{
231 mmu_spte_walk(vcpu, audit_spte);
232}
233
234static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int point)
235{
236 static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
237
238 if (!__ratelimit(&ratelimit_state))
239 return;
240
241 audit_point = point;
242 audit_all_active_sps(vcpu->kvm);
243 audit_vcpu_spte(vcpu);
244}
245
246static bool mmu_audit;
247
248static void mmu_audit_enable(void)
249{
250 int ret;
251
252 if (mmu_audit)
253 return;
254
255 ret = register_trace_kvm_mmu_audit(kvm_mmu_audit, NULL);
256 WARN_ON(ret);
257
258 mmu_audit = true;
259}
260
261static void mmu_audit_disable(void)
262{
263 if (!mmu_audit)
264 return;
265
266 unregister_trace_kvm_mmu_audit(kvm_mmu_audit, NULL);
267 tracepoint_synchronize_unregister();
268 mmu_audit = false;
269}
270
271static int mmu_audit_set(const char *val, const struct kernel_param *kp)
272{
273 int ret;
274 unsigned long enable;
275
276 ret = strict_strtoul(val, 10, &enable);
277 if (ret < 0)
278 return -EINVAL;
279
280 switch (enable) {
281 case 0:
282 mmu_audit_disable();
283 break;
284 case 1:
285 mmu_audit_enable();
286 break;
287 default:
288 return -EINVAL;
289 }
290
291 return 0;
292}
293
294static struct kernel_param_ops audit_param_ops = {
295 .set = mmu_audit_set,
296 .get = param_get_bool,
297};
298
299module_param_cb(mmu_audit, &audit_param_ops, &mmu_audit, 0644);
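
The mmu_audit_set() handler above accepts only "0" or "1" and keeps enable/disable idempotent, so registering the audit probe twice or unregistering it when it was never attached cannot happen. A stand-alone sketch of that pattern, with strtoul standing in for strict_strtoul and a plain function pointer standing in for register/unregister_trace_kvm_mmu_audit():

#include <stdio.h>
#include <stdlib.h>

static int audit_enabled;

static void audit_hook(const char *what)
{
	printf("audit: %s\n", what);
}

/* Stand-in for the registered tracepoint probe. */
static void (*registered_hook)(const char *);

static void audit_enable(void)
{
	if (audit_enabled)		/* idempotent, like mmu_audit_enable() */
		return;
	registered_hook = audit_hook;
	audit_enabled = 1;
}

static void audit_disable(void)
{
	if (!audit_enabled)
		return;
	registered_hook = NULL;
	audit_enabled = 0;
}

/* Parse a module-parameter style value: only "0" and "1" are accepted. */
static int audit_set(const char *val)
{
	char *end;
	unsigned long enable = strtoul(val, &end, 10);

	if (end == val || *end != '\0')
		return -1;
	switch (enable) {
	case 0: audit_disable(); return 0;
	case 1: audit_enable(); return 0;
	default: return -1;
	}
}

int main(void)
{
	audit_set("1");
	if (registered_hook)
		registered_hook("pte write");
	audit_set("0");
	printf("enabled=%d\n", audit_enabled);
	return 0;
}
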
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 3aab0f0930ef..b60b4fdb3eda 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -195,6 +195,25 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page,
195 195
196 TP_ARGS(sp) 196 TP_ARGS(sp)
197); 197);
198
199TRACE_EVENT(
200 kvm_mmu_audit,
201 TP_PROTO(struct kvm_vcpu *vcpu, int audit_point),
202 TP_ARGS(vcpu, audit_point),
203
204 TP_STRUCT__entry(
205 __field(struct kvm_vcpu *, vcpu)
206 __field(int, audit_point)
207 ),
208
209 TP_fast_assign(
210 __entry->vcpu = vcpu;
211 __entry->audit_point = audit_point;
212 ),
213
214 TP_printk("vcpu:%d %s", __entry->vcpu->cpu,
215 audit_point_name[__entry->audit_point])
216);
198#endif /* _TRACE_KVMMMU_H */ 217#endif /* _TRACE_KVMMMU_H */
199 218
200#undef TRACE_INCLUDE_PATH 219#undef TRACE_INCLUDE_PATH
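
The new kvm_mmu_audit tracepoint is what lets the audit code be compiled in yet cost almost nothing until mmu_audit.c attaches its probe. A heavily simplified sketch of the hook mechanism, not the real tracepoint infrastructure (which supports multiple probes, RCU protection and static keys):

#include <stdio.h>

/* Toy tracepoint: one optional probe plus private data. */
typedef void (*probe_fn)(void *data, int vcpu_id, int audit_point);

struct tracepoint {
	probe_fn probe;
	void *data;
};

static struct tracepoint trace_mmu_audit;

static void trace_fire(struct tracepoint *tp, int vcpu_id, int point)
{
	if (tp->probe)			/* a disabled tracepoint is one cheap branch */
		tp->probe(tp->data, vcpu_id, point);
}

static int register_probe(struct tracepoint *tp, probe_fn fn, void *data)
{
	if (tp->probe)
		return -1;		/* already in use */
	tp->data = data;
	tp->probe = fn;
	return 0;
}

static void unregister_probe(struct tracepoint *tp)
{
	tp->probe = NULL;
	tp->data = NULL;
}

static const char *point_name[] = { "pre pte write", "post pte write" };

static void mmu_audit_probe(void *data, int vcpu_id, int point)
{
	(void)data;
	printf("vcpu:%d %s\n", vcpu_id, point_name[point]);
}

int main(void)
{
	trace_fire(&trace_mmu_audit, 0, 0);	/* no probe: nothing happens */
	register_probe(&trace_mmu_audit, mmu_audit_probe, NULL);
	trace_fire(&trace_mmu_audit, 0, 1);	/* prints "vcpu:0 post pte write" */
	unregister_probe(&trace_mmu_audit);
	return 0;
}
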
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 51ef9097960d..cd7a833a3b52 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -7,7 +7,7 @@
7 * MMU support 7 * MMU support
8 * 8 *
9 * Copyright (C) 2006 Qumranet, Inc. 9 * Copyright (C) 2006 Qumranet, Inc.
10 * Copyright 2010 Red Hat, Inc. and/or its affilates. 10 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
11 * 11 *
12 * Authors: 12 * Authors:
13 * Yaniv Kamay <yaniv@qumranet.com> 13 * Yaniv Kamay <yaniv@qumranet.com>
@@ -67,6 +67,7 @@ struct guest_walker {
67 int level; 67 int level;
68 gfn_t table_gfn[PT_MAX_FULL_LEVELS]; 68 gfn_t table_gfn[PT_MAX_FULL_LEVELS];
69 pt_element_t ptes[PT_MAX_FULL_LEVELS]; 69 pt_element_t ptes[PT_MAX_FULL_LEVELS];
70 pt_element_t prefetch_ptes[PTE_PREFETCH_NUM];
70 gpa_t pte_gpa[PT_MAX_FULL_LEVELS]; 71 gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
71 unsigned pt_access; 72 unsigned pt_access;
72 unsigned pte_access; 73 unsigned pte_access;
@@ -104,7 +105,7 @@ static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte)
104 105
105 access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; 106 access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
106#if PTTYPE == 64 107#if PTTYPE == 64
107 if (is_nx(vcpu)) 108 if (vcpu->arch.mmu.nx)
108 access &= ~(gpte >> PT64_NX_SHIFT); 109 access &= ~(gpte >> PT64_NX_SHIFT);
109#endif 110#endif
110 return access; 111 return access;
@@ -113,26 +114,32 @@ static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte)
113/* 114/*
114 * Fetch a guest pte for a guest virtual address 115 * Fetch a guest pte for a guest virtual address
115 */ 116 */
116static int FNAME(walk_addr)(struct guest_walker *walker, 117static int FNAME(walk_addr_generic)(struct guest_walker *walker,
117 struct kvm_vcpu *vcpu, gva_t addr, 118 struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
118 int write_fault, int user_fault, int fetch_fault) 119 gva_t addr, u32 access)
119{ 120{
120 pt_element_t pte; 121 pt_element_t pte;
121 gfn_t table_gfn; 122 gfn_t table_gfn;
122 unsigned index, pt_access, uninitialized_var(pte_access); 123 unsigned index, pt_access, uninitialized_var(pte_access);
123 gpa_t pte_gpa; 124 gpa_t pte_gpa;
124 bool eperm, present, rsvd_fault; 125 bool eperm, present, rsvd_fault;
126 int offset, write_fault, user_fault, fetch_fault;
127
128 write_fault = access & PFERR_WRITE_MASK;
129 user_fault = access & PFERR_USER_MASK;
130 fetch_fault = access & PFERR_FETCH_MASK;
125 131
126 trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault, 132 trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault,
127 fetch_fault); 133 fetch_fault);
128walk: 134walk:
129 present = true; 135 present = true;
130 eperm = rsvd_fault = false; 136 eperm = rsvd_fault = false;
131 walker->level = vcpu->arch.mmu.root_level; 137 walker->level = mmu->root_level;
132 pte = vcpu->arch.cr3; 138 pte = mmu->get_cr3(vcpu);
139
133#if PTTYPE == 64 140#if PTTYPE == 64
134 if (!is_long_mode(vcpu)) { 141 if (walker->level == PT32E_ROOT_LEVEL) {
135 pte = kvm_pdptr_read(vcpu, (addr >> 30) & 3); 142 pte = kvm_pdptr_read_mmu(vcpu, mmu, (addr >> 30) & 3);
136 trace_kvm_mmu_paging_element(pte, walker->level); 143 trace_kvm_mmu_paging_element(pte, walker->level);
137 if (!is_present_gpte(pte)) { 144 if (!is_present_gpte(pte)) {
138 present = false; 145 present = false;
@@ -142,7 +149,7 @@ walk:
142 } 149 }
143#endif 150#endif
144 ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) || 151 ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
145 (vcpu->arch.cr3 & CR3_NONPAE_RESERVED_BITS) == 0); 152 (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0);
146 153
147 pt_access = ACC_ALL; 154 pt_access = ACC_ALL;
148 155
@@ -150,12 +157,14 @@ walk:
150 index = PT_INDEX(addr, walker->level); 157 index = PT_INDEX(addr, walker->level);
151 158
152 table_gfn = gpte_to_gfn(pte); 159 table_gfn = gpte_to_gfn(pte);
153 pte_gpa = gfn_to_gpa(table_gfn); 160 offset = index * sizeof(pt_element_t);
154 pte_gpa += index * sizeof(pt_element_t); 161 pte_gpa = gfn_to_gpa(table_gfn) + offset;
155 walker->table_gfn[walker->level - 1] = table_gfn; 162 walker->table_gfn[walker->level - 1] = table_gfn;
156 walker->pte_gpa[walker->level - 1] = pte_gpa; 163 walker->pte_gpa[walker->level - 1] = pte_gpa;
157 164
158 if (kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte))) { 165 if (kvm_read_guest_page_mmu(vcpu, mmu, table_gfn, &pte,
166 offset, sizeof(pte),
167 PFERR_USER_MASK|PFERR_WRITE_MASK)) {
159 present = false; 168 present = false;
160 break; 169 break;
161 } 170 }
@@ -167,7 +176,7 @@ walk:
167 break; 176 break;
168 } 177 }
169 178
170 if (is_rsvd_bits_set(vcpu, pte, walker->level)) { 179 if (is_rsvd_bits_set(&vcpu->arch.mmu, pte, walker->level)) {
171 rsvd_fault = true; 180 rsvd_fault = true;
172 break; 181 break;
173 } 182 }
@@ -204,17 +213,28 @@ walk:
204 (PTTYPE == 64 || is_pse(vcpu))) || 213 (PTTYPE == 64 || is_pse(vcpu))) ||
205 ((walker->level == PT_PDPE_LEVEL) && 214 ((walker->level == PT_PDPE_LEVEL) &&
206 is_large_pte(pte) && 215 is_large_pte(pte) &&
207 is_long_mode(vcpu))) { 216 mmu->root_level == PT64_ROOT_LEVEL)) {
208 int lvl = walker->level; 217 int lvl = walker->level;
218 gpa_t real_gpa;
219 gfn_t gfn;
220 u32 ac;
209 221
210 walker->gfn = gpte_to_gfn_lvl(pte, lvl); 222 gfn = gpte_to_gfn_lvl(pte, lvl);
211 walker->gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) 223 gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) >> PAGE_SHIFT;
212 >> PAGE_SHIFT;
213 224
214 if (PTTYPE == 32 && 225 if (PTTYPE == 32 &&
215 walker->level == PT_DIRECTORY_LEVEL && 226 walker->level == PT_DIRECTORY_LEVEL &&
216 is_cpuid_PSE36()) 227 is_cpuid_PSE36())
217 walker->gfn += pse36_gfn_delta(pte); 228 gfn += pse36_gfn_delta(pte);
229
230 ac = write_fault | fetch_fault | user_fault;
231
232 real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn),
233 ac);
234 if (real_gpa == UNMAPPED_GVA)
235 return 0;
236
237 walker->gfn = real_gpa >> PAGE_SHIFT;
218 238
219 break; 239 break;
220 } 240 }
@@ -249,18 +269,36 @@ error:
249 walker->error_code = 0; 269 walker->error_code = 0;
250 if (present) 270 if (present)
251 walker->error_code |= PFERR_PRESENT_MASK; 271 walker->error_code |= PFERR_PRESENT_MASK;
252 if (write_fault) 272
253 walker->error_code |= PFERR_WRITE_MASK; 273 walker->error_code |= write_fault | user_fault;
254 if (user_fault) 274
255 walker->error_code |= PFERR_USER_MASK; 275 if (fetch_fault && mmu->nx)
256 if (fetch_fault && is_nx(vcpu))
257 walker->error_code |= PFERR_FETCH_MASK; 276 walker->error_code |= PFERR_FETCH_MASK;
258 if (rsvd_fault) 277 if (rsvd_fault)
259 walker->error_code |= PFERR_RSVD_MASK; 278 walker->error_code |= PFERR_RSVD_MASK;
279
280 vcpu->arch.fault.address = addr;
281 vcpu->arch.fault.error_code = walker->error_code;
282
260 trace_kvm_mmu_walker_error(walker->error_code); 283 trace_kvm_mmu_walker_error(walker->error_code);
261 return 0; 284 return 0;
262} 285}
263 286
287static int FNAME(walk_addr)(struct guest_walker *walker,
288 struct kvm_vcpu *vcpu, gva_t addr, u32 access)
289{
290 return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.mmu, addr,
291 access);
292}
293
294static int FNAME(walk_addr_nested)(struct guest_walker *walker,
295 struct kvm_vcpu *vcpu, gva_t addr,
296 u32 access)
297{
298 return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu,
299 addr, access);
300}
301
264static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, 302static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
265 u64 *spte, const void *pte) 303 u64 *spte, const void *pte)
266{ 304{
@@ -302,14 +340,87 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
302static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu, 340static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu,
303 struct guest_walker *gw, int level) 341 struct guest_walker *gw, int level)
304{ 342{
305 int r;
306 pt_element_t curr_pte; 343 pt_element_t curr_pte;
307 344 gpa_t base_gpa, pte_gpa = gw->pte_gpa[level - 1];
308 r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 1], 345 u64 mask;
346 int r, index;
347
348 if (level == PT_PAGE_TABLE_LEVEL) {
349 mask = PTE_PREFETCH_NUM * sizeof(pt_element_t) - 1;
350 base_gpa = pte_gpa & ~mask;
351 index = (pte_gpa - base_gpa) / sizeof(pt_element_t);
352
353 r = kvm_read_guest_atomic(vcpu->kvm, base_gpa,
354 gw->prefetch_ptes, sizeof(gw->prefetch_ptes));
355 curr_pte = gw->prefetch_ptes[index];
356 } else
357 r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa,
309 &curr_pte, sizeof(curr_pte)); 358 &curr_pte, sizeof(curr_pte));
359
310 return r || curr_pte != gw->ptes[level - 1]; 360 return r || curr_pte != gw->ptes[level - 1];
311} 361}
312 362
363static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
364 u64 *sptep)
365{
366 struct kvm_mmu_page *sp;
367 struct kvm_mmu *mmu = &vcpu->arch.mmu;
368 pt_element_t *gptep = gw->prefetch_ptes;
369 u64 *spte;
370 int i;
371
372 sp = page_header(__pa(sptep));
373
374 if (sp->role.level > PT_PAGE_TABLE_LEVEL)
375 return;
376
377 if (sp->role.direct)
378 return __direct_pte_prefetch(vcpu, sp, sptep);
379
380 i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
381 spte = sp->spt + i;
382
383 for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
384 pt_element_t gpte;
385 unsigned pte_access;
386 gfn_t gfn;
387 pfn_t pfn;
388 bool dirty;
389
390 if (spte == sptep)
391 continue;
392
393 if (*spte != shadow_trap_nonpresent_pte)
394 continue;
395
396 gpte = gptep[i];
397
398 if (!is_present_gpte(gpte) ||
399 is_rsvd_bits_set(mmu, gpte, PT_PAGE_TABLE_LEVEL)) {
400 if (!sp->unsync)
401 __set_spte(spte, shadow_notrap_nonpresent_pte);
402 continue;
403 }
404
405 if (!(gpte & PT_ACCESSED_MASK))
406 continue;
407
408 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
409 gfn = gpte_to_gfn(gpte);
410 dirty = is_dirty_gpte(gpte);
411 pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
412 (pte_access & ACC_WRITE_MASK) && dirty);
413 if (is_error_pfn(pfn)) {
414 kvm_release_pfn_clean(pfn);
415 break;
416 }
417
418 mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
419 dirty, NULL, PT_PAGE_TABLE_LEVEL, gfn,
420 pfn, true, true);
421 }
422}
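
Both FNAME(gpte_changed) and FNAME(pte_prefetch) above work on a naturally aligned window of PTE_PREFETCH_NUM guest PTEs around the faulting entry. A short sketch of just the alignment arithmetic, assuming PTE_PREFETCH_NUM is a power of two as in the patch:

#include <stdio.h>
#include <stdint.h>

#define PTE_PREFETCH_NUM 8		/* must be a power of two */
typedef uint64_t pt_element_t;

/*
 * Given the guest-physical address of one PTE, compute the naturally
 * aligned window of PTE_PREFETCH_NUM entries containing it, and the
 * index of that PTE inside the window.
 */
static void prefetch_window(uint64_t pte_gpa, uint64_t *base_gpa, int *index)
{
	uint64_t mask = PTE_PREFETCH_NUM * sizeof(pt_element_t) - 1;

	*base_gpa = pte_gpa & ~mask;
	*index = (int)((pte_gpa - *base_gpa) / sizeof(pt_element_t));
}

int main(void)
{
	uint64_t base;
	int idx;

	prefetch_window(0x1038, &base, &idx);	/* entry 7 of the 0x1000 window */
	printf("base=%#llx index=%d\n", (unsigned long long)base, idx);
	return 0;
}

Reading the whole window in one kvm_read_guest_atomic() call is what makes the later prefetch loop cheap: the neighbouring gptes are already in gw->prefetch_ptes.
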
423
313/* 424/*
314 * Fetch a shadow pte for a specific level in the paging hierarchy. 425 * Fetch a shadow pte for a specific level in the paging hierarchy.
315 */ 426 */
@@ -391,6 +502,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
391 mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access, 502 mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access,
392 user_fault, write_fault, dirty, ptwrite, it.level, 503 user_fault, write_fault, dirty, ptwrite, it.level,
393 gw->gfn, pfn, false, true); 504 gw->gfn, pfn, false, true);
505 FNAME(pte_prefetch)(vcpu, gw, it.sptep);
394 506
395 return it.sptep; 507 return it.sptep;
396 508
@@ -420,7 +532,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
420{ 532{
421 int write_fault = error_code & PFERR_WRITE_MASK; 533 int write_fault = error_code & PFERR_WRITE_MASK;
422 int user_fault = error_code & PFERR_USER_MASK; 534 int user_fault = error_code & PFERR_USER_MASK;
423 int fetch_fault = error_code & PFERR_FETCH_MASK;
424 struct guest_walker walker; 535 struct guest_walker walker;
425 u64 *sptep; 536 u64 *sptep;
426 int write_pt = 0; 537 int write_pt = 0;
@@ -430,7 +541,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
430 unsigned long mmu_seq; 541 unsigned long mmu_seq;
431 542
432 pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); 543 pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
433 kvm_mmu_audit(vcpu, "pre page fault");
434 544
435 r = mmu_topup_memory_caches(vcpu); 545 r = mmu_topup_memory_caches(vcpu);
436 if (r) 546 if (r)
@@ -439,15 +549,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
439 /* 549 /*
440 * Look up the guest pte for the faulting address. 550 * Look up the guest pte for the faulting address.
441 */ 551 */
442 r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault, 552 r = FNAME(walk_addr)(&walker, vcpu, addr, error_code);
443 fetch_fault);
444 553
445 /* 554 /*
446 * The page is not mapped by the guest. Let the guest handle it. 555 * The page is not mapped by the guest. Let the guest handle it.
447 */ 556 */
448 if (!r) { 557 if (!r) {
449 pgprintk("%s: guest page fault\n", __func__); 558 pgprintk("%s: guest page fault\n", __func__);
450 inject_page_fault(vcpu, addr, walker.error_code); 559 inject_page_fault(vcpu);
451 vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ 560 vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
452 return 0; 561 return 0;
453 } 562 }
@@ -468,6 +577,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
468 spin_lock(&vcpu->kvm->mmu_lock); 577 spin_lock(&vcpu->kvm->mmu_lock);
469 if (mmu_notifier_retry(vcpu, mmu_seq)) 578 if (mmu_notifier_retry(vcpu, mmu_seq))
470 goto out_unlock; 579 goto out_unlock;
580
581 trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
471 kvm_mmu_free_some_pages(vcpu); 582 kvm_mmu_free_some_pages(vcpu);
472 sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, 583 sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
473 level, &write_pt, pfn); 584 level, &write_pt, pfn);
@@ -479,7 +590,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
479 vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ 590 vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
480 591
481 ++vcpu->stat.pf_fixed; 592 ++vcpu->stat.pf_fixed;
482 kvm_mmu_audit(vcpu, "post page fault (fixed)"); 593 trace_kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
483 spin_unlock(&vcpu->kvm->mmu_lock); 594 spin_unlock(&vcpu->kvm->mmu_lock);
484 595
485 return write_pt; 596 return write_pt;
@@ -556,10 +667,25 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
556 gpa_t gpa = UNMAPPED_GVA; 667 gpa_t gpa = UNMAPPED_GVA;
557 int r; 668 int r;
558 669
559 r = FNAME(walk_addr)(&walker, vcpu, vaddr, 670 r = FNAME(walk_addr)(&walker, vcpu, vaddr, access);
560 !!(access & PFERR_WRITE_MASK), 671
561 !!(access & PFERR_USER_MASK), 672 if (r) {
562 !!(access & PFERR_FETCH_MASK)); 673 gpa = gfn_to_gpa(walker.gfn);
674 gpa |= vaddr & ~PAGE_MASK;
675 } else if (error)
676 *error = walker.error_code;
677
678 return gpa;
679}
680
681static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
682 u32 access, u32 *error)
683{
684 struct guest_walker walker;
685 gpa_t gpa = UNMAPPED_GVA;
686 int r;
687
688 r = FNAME(walk_addr_nested)(&walker, vcpu, vaddr, access);
563 689
564 if (r) { 690 if (r) {
565 gpa = gfn_to_gpa(walker.gfn); 691 gpa = gfn_to_gpa(walker.gfn);
@@ -638,7 +764,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
638 return -EINVAL; 764 return -EINVAL;
639 765
640 gfn = gpte_to_gfn(gpte); 766 gfn = gpte_to_gfn(gpte);
641 if (is_rsvd_bits_set(vcpu, gpte, PT_PAGE_TABLE_LEVEL) 767 if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)
642 || gfn != sp->gfns[i] || !is_present_gpte(gpte) 768 || gfn != sp->gfns[i] || !is_present_gpte(gpte)
643 || !(gpte & PT_ACCESSED_MASK)) { 769 || !(gpte & PT_ACCESSED_MASK)) {
644 u64 nonpresent; 770 u64 nonpresent;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 8a3f9f64f86f..82e144a4e514 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -4,7 +4,7 @@
4 * AMD SVM support 4 * AMD SVM support
5 * 5 *
6 * Copyright (C) 2006 Qumranet, Inc. 6 * Copyright (C) 2006 Qumranet, Inc.
7 * Copyright 2010 Red Hat, Inc. and/or its affilates. 7 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
8 * 8 *
9 * Authors: 9 * Authors:
10 * Yaniv Kamay <yaniv@qumranet.com> 10 * Yaniv Kamay <yaniv@qumranet.com>
@@ -88,6 +88,14 @@ struct nested_state {
88 /* A VMEXIT is required but not yet emulated */ 88 /* A VMEXIT is required but not yet emulated */
89 bool exit_required; 89 bool exit_required;
90 90
91 /*
92 * If we vmexit during instruction emulation we need these fields to
93 * restore the L1 guest rip after the emulation.
94 */
95 unsigned long vmexit_rip;
96 unsigned long vmexit_rsp;
97 unsigned long vmexit_rax;
98
91 /* cache for intercepts of the guest */ 99 /* cache for intercepts of the guest */
92 u16 intercept_cr_read; 100 u16 intercept_cr_read;
93 u16 intercept_cr_write; 101 u16 intercept_cr_write;
@@ -96,6 +104,8 @@ struct nested_state {
96 u32 intercept_exceptions; 104 u32 intercept_exceptions;
97 u64 intercept; 105 u64 intercept;
98 106
107 /* Nested Paging related state */
108 u64 nested_cr3;
99}; 109};
100 110
101#define MSRPM_OFFSETS 16 111#define MSRPM_OFFSETS 16
@@ -284,6 +294,15 @@ static inline void flush_guest_tlb(struct kvm_vcpu *vcpu)
284 force_new_asid(vcpu); 294 force_new_asid(vcpu);
285} 295}
286 296
297static int get_npt_level(void)
298{
299#ifdef CONFIG_X86_64
300 return PT64_ROOT_LEVEL;
301#else
302 return PT32E_ROOT_LEVEL;
303#endif
304}
305
287static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) 306static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
288{ 307{
289 vcpu->arch.efer = efer; 308 vcpu->arch.efer = efer;
@@ -701,6 +720,29 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
701 seg->base = 0; 720 seg->base = 0;
702} 721}
703 722
723static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
724{
725 struct vcpu_svm *svm = to_svm(vcpu);
726 u64 g_tsc_offset = 0;
727
728 if (is_nested(svm)) {
729 g_tsc_offset = svm->vmcb->control.tsc_offset -
730 svm->nested.hsave->control.tsc_offset;
731 svm->nested.hsave->control.tsc_offset = offset;
732 }
733
734 svm->vmcb->control.tsc_offset = offset + g_tsc_offset;
735}
736
737static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
738{
739 struct vcpu_svm *svm = to_svm(vcpu);
740
741 svm->vmcb->control.tsc_offset += adjustment;
742 if (is_nested(svm))
743 svm->nested.hsave->control.tsc_offset += adjustment;
744}
745
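
svm_write_tsc_offset() and svm_adjust_tsc_offset() above keep two offsets consistent: the one active in the current VMCB and L1's saved offset in hsave. A user-space model of that bookkeeping, with plain integers standing in for the VMCB fields (field names are only suggestive of the real structures):

#include <stdio.h>
#include <stdint.h>

struct tsc_state {
	int64_t vmcb_offset;	/* offset applied to the currently running guest */
	int64_t hsave_offset;	/* L1's offset, saved away while L2 runs */
	int     nested;		/* is an L2 guest currently running? */
};

/* Install a new L1 offset without disturbing L2's extra delta. */
static void write_tsc_offset(struct tsc_state *s, int64_t offset)
{
	int64_t g_tsc_offset = 0;

	if (s->nested) {
		g_tsc_offset = s->vmcb_offset - s->hsave_offset;
		s->hsave_offset = offset;
	}
	s->vmcb_offset = offset + g_tsc_offset;
}

/* A host-side adjustment must be visible to both L1 and L2. */
static void adjust_tsc_offset(struct tsc_state *s, int64_t adjustment)
{
	s->vmcb_offset += adjustment;
	if (s->nested)
		s->hsave_offset += adjustment;
}

int main(void)
{
	struct tsc_state s = { .vmcb_offset = -150, .hsave_offset = -100,
			       .nested = 1 };

	write_tsc_offset(&s, -200);	/* L2 keeps its -50 delta: vmcb = -250 */
	adjust_tsc_offset(&s, 10);
	printf("vmcb=%lld hsave=%lld\n",
	       (long long)s.vmcb_offset, (long long)s.hsave_offset);
	return 0;
}

Moving this logic out of the MSR handler lets generic x86 code (kvm_write_tsc) drive it on every vendor backend instead of open-coding the nested delta in svm_set_msr().
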
704static void init_vmcb(struct vcpu_svm *svm) 746static void init_vmcb(struct vcpu_svm *svm)
705{ 747{
706 struct vmcb_control_area *control = &svm->vmcb->control; 748 struct vmcb_control_area *control = &svm->vmcb->control;
@@ -793,7 +835,7 @@ static void init_vmcb(struct vcpu_svm *svm)
793 init_sys_seg(&save->ldtr, SEG_TYPE_LDT); 835 init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
794 init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16); 836 init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
795 837
796 save->efer = EFER_SVME; 838 svm_set_efer(&svm->vcpu, 0);
797 save->dr6 = 0xffff0ff0; 839 save->dr6 = 0xffff0ff0;
798 save->dr7 = 0x400; 840 save->dr7 = 0x400;
799 save->rflags = 2; 841 save->rflags = 2;
@@ -804,8 +846,8 @@ static void init_vmcb(struct vcpu_svm *svm)
804 * This is the guest-visible cr0 value. 846 * This is the guest-visible cr0 value.
805 * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0. 847 * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
806 */ 848 */
807 svm->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; 849 svm->vcpu.arch.cr0 = 0;
808 (void)kvm_set_cr0(&svm->vcpu, svm->vcpu.arch.cr0); 850 (void)kvm_set_cr0(&svm->vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET);
809 851
810 save->cr4 = X86_CR4_PAE; 852 save->cr4 = X86_CR4_PAE;
811 /* rdx = ?? */ 853 /* rdx = ?? */
@@ -901,7 +943,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
901 svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; 943 svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
902 svm->asid_generation = 0; 944 svm->asid_generation = 0;
903 init_vmcb(svm); 945 init_vmcb(svm);
904 svm->vmcb->control.tsc_offset = 0-native_read_tsc(); 946 kvm_write_tsc(&svm->vcpu, 0);
905 947
906 err = fx_init(&svm->vcpu); 948 err = fx_init(&svm->vcpu);
907 if (err) 949 if (err)
@@ -947,20 +989,6 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
947 int i; 989 int i;
948 990
949 if (unlikely(cpu != vcpu->cpu)) { 991 if (unlikely(cpu != vcpu->cpu)) {
950 u64 delta;
951
952 if (check_tsc_unstable()) {
953 /*
954 * Make sure that the guest sees a monotonically
955 * increasing TSC.
956 */
957 delta = vcpu->arch.host_tsc - native_read_tsc();
958 svm->vmcb->control.tsc_offset += delta;
959 if (is_nested(svm))
960 svm->nested.hsave->control.tsc_offset += delta;
961 }
962 vcpu->cpu = cpu;
963 kvm_migrate_timers(vcpu);
964 svm->asid_generation = 0; 992 svm->asid_generation = 0;
965 } 993 }
966 994
@@ -976,8 +1004,6 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
976 ++vcpu->stat.host_state_reload; 1004 ++vcpu->stat.host_state_reload;
977 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) 1005 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
978 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); 1006 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
979
980 vcpu->arch.host_tsc = native_read_tsc();
981} 1007}
982 1008
983static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) 1009static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
@@ -995,7 +1021,7 @@ static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
995 switch (reg) { 1021 switch (reg) {
996 case VCPU_EXREG_PDPTR: 1022 case VCPU_EXREG_PDPTR:
997 BUG_ON(!npt_enabled); 1023 BUG_ON(!npt_enabled);
998 load_pdptrs(vcpu, vcpu->arch.cr3); 1024 load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3);
999 break; 1025 break;
1000 default: 1026 default:
1001 BUG(); 1027 BUG();
@@ -1206,8 +1232,12 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1206 if (old == new) { 1232 if (old == new) {
1207 /* cr0 write with ts and mp unchanged */ 1233 /* cr0 write with ts and mp unchanged */
1208 svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE; 1234 svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
1209 if (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE) 1235 if (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE) {
1236 svm->nested.vmexit_rip = kvm_rip_read(vcpu);
1237 svm->nested.vmexit_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
1238 svm->nested.vmexit_rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
1210 return; 1239 return;
1240 }
1211 } 1241 }
1212 } 1242 }
1213 1243
@@ -1581,6 +1611,54 @@ static int vmmcall_interception(struct vcpu_svm *svm)
1581 return 1; 1611 return 1;
1582} 1612}
1583 1613
1614static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu)
1615{
1616 struct vcpu_svm *svm = to_svm(vcpu);
1617
1618 return svm->nested.nested_cr3;
1619}
1620
1621static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu,
1622 unsigned long root)
1623{
1624 struct vcpu_svm *svm = to_svm(vcpu);
1625
1626 svm->vmcb->control.nested_cr3 = root;
1627 force_new_asid(vcpu);
1628}
1629
1630static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu)
1631{
1632 struct vcpu_svm *svm = to_svm(vcpu);
1633
1634 svm->vmcb->control.exit_code = SVM_EXIT_NPF;
1635 svm->vmcb->control.exit_code_hi = 0;
1636 svm->vmcb->control.exit_info_1 = vcpu->arch.fault.error_code;
1637 svm->vmcb->control.exit_info_2 = vcpu->arch.fault.address;
1638
1639 nested_svm_vmexit(svm);
1640}
1641
1642static int nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
1643{
1644 int r;
1645
1646 r = kvm_init_shadow_mmu(vcpu, &vcpu->arch.mmu);
1647
1648 vcpu->arch.mmu.set_cr3 = nested_svm_set_tdp_cr3;
1649 vcpu->arch.mmu.get_cr3 = nested_svm_get_tdp_cr3;
1650 vcpu->arch.mmu.inject_page_fault = nested_svm_inject_npf_exit;
1651 vcpu->arch.mmu.shadow_root_level = get_npt_level();
1652 vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
1653
1654 return r;
1655}
1656
1657static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
1658{
1659 vcpu->arch.walk_mmu = &vcpu->arch.mmu;
1660}
1661
1584static int nested_svm_check_permissions(struct vcpu_svm *svm) 1662static int nested_svm_check_permissions(struct vcpu_svm *svm)
1585{ 1663{
1586 if (!(svm->vcpu.arch.efer & EFER_SVME) 1664 if (!(svm->vcpu.arch.efer & EFER_SVME)
@@ -1629,6 +1707,14 @@ static inline bool nested_svm_intr(struct vcpu_svm *svm)
1629 if (!(svm->vcpu.arch.hflags & HF_HIF_MASK)) 1707 if (!(svm->vcpu.arch.hflags & HF_HIF_MASK))
1630 return false; 1708 return false;
1631 1709
1710 /*
1711 * if vmexit was already requested (by intercepted exception
1712 * for instance) do not overwrite it with "external interrupt"
1713 * vmexit.
1714 */
1715 if (svm->nested.exit_required)
1716 return false;
1717
1632 svm->vmcb->control.exit_code = SVM_EXIT_INTR; 1718 svm->vmcb->control.exit_code = SVM_EXIT_INTR;
1633 svm->vmcb->control.exit_info_1 = 0; 1719 svm->vmcb->control.exit_info_1 = 0;
1634 svm->vmcb->control.exit_info_2 = 0; 1720 svm->vmcb->control.exit_info_2 = 0;
@@ -1896,6 +1982,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1896 nested_vmcb->save.ds = vmcb->save.ds; 1982 nested_vmcb->save.ds = vmcb->save.ds;
1897 nested_vmcb->save.gdtr = vmcb->save.gdtr; 1983 nested_vmcb->save.gdtr = vmcb->save.gdtr;
1898 nested_vmcb->save.idtr = vmcb->save.idtr; 1984 nested_vmcb->save.idtr = vmcb->save.idtr;
1985 nested_vmcb->save.efer = svm->vcpu.arch.efer;
1899 nested_vmcb->save.cr0 = kvm_read_cr0(&svm->vcpu); 1986 nested_vmcb->save.cr0 = kvm_read_cr0(&svm->vcpu);
1900 nested_vmcb->save.cr3 = svm->vcpu.arch.cr3; 1987 nested_vmcb->save.cr3 = svm->vcpu.arch.cr3;
1901 nested_vmcb->save.cr2 = vmcb->save.cr2; 1988 nested_vmcb->save.cr2 = vmcb->save.cr2;
@@ -1917,6 +2004,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1917 nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2; 2004 nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2;
1918 nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info; 2005 nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info;
1919 nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err; 2006 nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err;
2007 nested_vmcb->control.next_rip = vmcb->control.next_rip;
1920 2008
1921 /* 2009 /*
1922 * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have 2010 * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have
@@ -1947,6 +2035,8 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1947 kvm_clear_exception_queue(&svm->vcpu); 2035 kvm_clear_exception_queue(&svm->vcpu);
1948 kvm_clear_interrupt_queue(&svm->vcpu); 2036 kvm_clear_interrupt_queue(&svm->vcpu);
1949 2037
2038 svm->nested.nested_cr3 = 0;
2039
1950 /* Restore selected save entries */ 2040 /* Restore selected save entries */
1951 svm->vmcb->save.es = hsave->save.es; 2041 svm->vmcb->save.es = hsave->save.es;
1952 svm->vmcb->save.cs = hsave->save.cs; 2042 svm->vmcb->save.cs = hsave->save.cs;
@@ -1973,6 +2063,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1973 2063
1974 nested_svm_unmap(page); 2064 nested_svm_unmap(page);
1975 2065
2066 nested_svm_uninit_mmu_context(&svm->vcpu);
1976 kvm_mmu_reset_context(&svm->vcpu); 2067 kvm_mmu_reset_context(&svm->vcpu);
1977 kvm_mmu_load(&svm->vcpu); 2068 kvm_mmu_load(&svm->vcpu);
1978 2069
@@ -2012,6 +2103,20 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
2012 return true; 2103 return true;
2013} 2104}
2014 2105
2106static bool nested_vmcb_checks(struct vmcb *vmcb)
2107{
2108 if ((vmcb->control.intercept & (1ULL << INTERCEPT_VMRUN)) == 0)
2109 return false;
2110
2111 if (vmcb->control.asid == 0)
2112 return false;
2113
2114 if (vmcb->control.nested_ctl && !npt_enabled)
2115 return false;
2116
2117 return true;
2118}
2119
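
nested_vmcb_checks() rejects obviously malformed guest VMCBs before VMRUN is emulated, so the failure surfaces as SVM_EXIT_ERR rather than corrupting state. A minimal sketch of the same consistency checks, with only the fields involved and an illustrative INTERCEPT_VMRUN bit value:

#include <stdio.h>
#include <stdint.h>

/* Only the fields the consistency checks look at. */
struct vmcb_control {
	uint64_t intercept;
	uint32_t asid;
	uint64_t nested_ctl;
};

#define INTERCEPT_VMRUN 32		/* bit position, value is illustrative */

static int npt_supported = 1;

static int vmcb_is_sane(const struct vmcb_control *c)
{
	if (!(c->intercept & (1ULL << INTERCEPT_VMRUN)))
		return 0;		/* guest must intercept VMRUN itself */
	if (c->asid == 0)
		return 0;		/* ASID 0 is reserved for the host */
	if (c->nested_ctl && !npt_supported)
		return 0;		/* asks for nested paging we do not have */
	return 1;
}

int main(void)
{
	struct vmcb_control good = {
		.intercept = 1ULL << INTERCEPT_VMRUN, .asid = 1, .nested_ctl = 0,
	};
	struct vmcb_control bad = good;

	bad.asid = 0;
	printf("good=%d bad=%d\n", vmcb_is_sane(&good), vmcb_is_sane(&bad));
	return 0;
}
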
2015static bool nested_svm_vmrun(struct vcpu_svm *svm) 2120static bool nested_svm_vmrun(struct vcpu_svm *svm)
2016{ 2121{
2017 struct vmcb *nested_vmcb; 2122 struct vmcb *nested_vmcb;
@@ -2026,7 +2131,18 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
2026 if (!nested_vmcb) 2131 if (!nested_vmcb)
2027 return false; 2132 return false;
2028 2133
2029 trace_kvm_nested_vmrun(svm->vmcb->save.rip - 3, vmcb_gpa, 2134 if (!nested_vmcb_checks(nested_vmcb)) {
2135 nested_vmcb->control.exit_code = SVM_EXIT_ERR;
2136 nested_vmcb->control.exit_code_hi = 0;
2137 nested_vmcb->control.exit_info_1 = 0;
2138 nested_vmcb->control.exit_info_2 = 0;
2139
2140 nested_svm_unmap(page);
2141
2142 return false;
2143 }
2144
2145 trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa,
2030 nested_vmcb->save.rip, 2146 nested_vmcb->save.rip,
2031 nested_vmcb->control.int_ctl, 2147 nested_vmcb->control.int_ctl,
2032 nested_vmcb->control.event_inj, 2148 nested_vmcb->control.event_inj,
@@ -2055,7 +2171,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
2055 hsave->save.cr0 = kvm_read_cr0(&svm->vcpu); 2171 hsave->save.cr0 = kvm_read_cr0(&svm->vcpu);
2056 hsave->save.cr4 = svm->vcpu.arch.cr4; 2172 hsave->save.cr4 = svm->vcpu.arch.cr4;
2057 hsave->save.rflags = vmcb->save.rflags; 2173 hsave->save.rflags = vmcb->save.rflags;
2058 hsave->save.rip = svm->next_rip; 2174 hsave->save.rip = kvm_rip_read(&svm->vcpu);
2059 hsave->save.rsp = vmcb->save.rsp; 2175 hsave->save.rsp = vmcb->save.rsp;
2060 hsave->save.rax = vmcb->save.rax; 2176 hsave->save.rax = vmcb->save.rax;
2061 if (npt_enabled) 2177 if (npt_enabled)
@@ -2070,6 +2186,12 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
2070 else 2186 else
2071 svm->vcpu.arch.hflags &= ~HF_HIF_MASK; 2187 svm->vcpu.arch.hflags &= ~HF_HIF_MASK;
2072 2188
2189 if (nested_vmcb->control.nested_ctl) {
2190 kvm_mmu_unload(&svm->vcpu);
2191 svm->nested.nested_cr3 = nested_vmcb->control.nested_cr3;
2192 nested_svm_init_mmu_context(&svm->vcpu);
2193 }
2194
2073 /* Load the nested guest state */ 2195 /* Load the nested guest state */
2074 svm->vmcb->save.es = nested_vmcb->save.es; 2196 svm->vmcb->save.es = nested_vmcb->save.es;
2075 svm->vmcb->save.cs = nested_vmcb->save.cs; 2197 svm->vmcb->save.cs = nested_vmcb->save.cs;
@@ -2227,8 +2349,8 @@ static int vmrun_interception(struct vcpu_svm *svm)
2227 if (nested_svm_check_permissions(svm)) 2349 if (nested_svm_check_permissions(svm))
2228 return 1; 2350 return 1;
2229 2351
2230 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 2352 /* Save rip after vmrun instruction */
2231 skip_emulated_instruction(&svm->vcpu); 2353 kvm_rip_write(&svm->vcpu, kvm_rip_read(&svm->vcpu) + 3);
2232 2354
2233 if (!nested_svm_vmrun(svm)) 2355 if (!nested_svm_vmrun(svm))
2234 return 1; 2356 return 1;
@@ -2257,6 +2379,7 @@ static int stgi_interception(struct vcpu_svm *svm)
2257 2379
2258 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 2380 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2259 skip_emulated_instruction(&svm->vcpu); 2381 skip_emulated_instruction(&svm->vcpu);
2382 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
2260 2383
2261 enable_gif(svm); 2384 enable_gif(svm);
2262 2385
@@ -2399,6 +2522,23 @@ static int emulate_on_interception(struct vcpu_svm *svm)
2399 return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE; 2522 return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE;
2400} 2523}
2401 2524
2525static int cr0_write_interception(struct vcpu_svm *svm)
2526{
2527 struct kvm_vcpu *vcpu = &svm->vcpu;
2528 int r;
2529
2530 r = emulate_instruction(&svm->vcpu, 0, 0, 0);
2531
2532 if (svm->nested.vmexit_rip) {
2533 kvm_register_write(vcpu, VCPU_REGS_RIP, svm->nested.vmexit_rip);
2534 kvm_register_write(vcpu, VCPU_REGS_RSP, svm->nested.vmexit_rsp);
2535 kvm_register_write(vcpu, VCPU_REGS_RAX, svm->nested.vmexit_rax);
2536 svm->nested.vmexit_rip = 0;
2537 }
2538
2539 return r == EMULATE_DONE;
2540}
2541
2402static int cr8_write_interception(struct vcpu_svm *svm) 2542static int cr8_write_interception(struct vcpu_svm *svm)
2403{ 2543{
2404 struct kvm_run *kvm_run = svm->vcpu.run; 2544 struct kvm_run *kvm_run = svm->vcpu.run;
@@ -2542,20 +2682,9 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
2542 struct vcpu_svm *svm = to_svm(vcpu); 2682 struct vcpu_svm *svm = to_svm(vcpu);
2543 2683
2544 switch (ecx) { 2684 switch (ecx) {
2545 case MSR_IA32_TSC: { 2685 case MSR_IA32_TSC:
2546 u64 tsc_offset = data - native_read_tsc(); 2686 kvm_write_tsc(vcpu, data);
2547 u64 g_tsc_offset = 0;
2548
2549 if (is_nested(svm)) {
2550 g_tsc_offset = svm->vmcb->control.tsc_offset -
2551 svm->nested.hsave->control.tsc_offset;
2552 svm->nested.hsave->control.tsc_offset = tsc_offset;
2553 }
2554
2555 svm->vmcb->control.tsc_offset = tsc_offset + g_tsc_offset;
2556
2557 break; 2687 break;
2558 }
2559 case MSR_STAR: 2688 case MSR_STAR:
2560 svm->vmcb->save.star = data; 2689 svm->vmcb->save.star = data;
2561 break; 2690 break;
@@ -2643,6 +2772,7 @@ static int interrupt_window_interception(struct vcpu_svm *svm)
2643{ 2772{
2644 struct kvm_run *kvm_run = svm->vcpu.run; 2773 struct kvm_run *kvm_run = svm->vcpu.run;
2645 2774
2775 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
2646 svm_clear_vintr(svm); 2776 svm_clear_vintr(svm);
2647 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; 2777 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
2648 /* 2778 /*
@@ -2672,7 +2802,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
2672 [SVM_EXIT_READ_CR4] = emulate_on_interception, 2802 [SVM_EXIT_READ_CR4] = emulate_on_interception,
2673 [SVM_EXIT_READ_CR8] = emulate_on_interception, 2803 [SVM_EXIT_READ_CR8] = emulate_on_interception,
2674 [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, 2804 [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception,
2675 [SVM_EXIT_WRITE_CR0] = emulate_on_interception, 2805 [SVM_EXIT_WRITE_CR0] = cr0_write_interception,
2676 [SVM_EXIT_WRITE_CR3] = emulate_on_interception, 2806 [SVM_EXIT_WRITE_CR3] = emulate_on_interception,
2677 [SVM_EXIT_WRITE_CR4] = emulate_on_interception, 2807 [SVM_EXIT_WRITE_CR4] = emulate_on_interception,
2678 [SVM_EXIT_WRITE_CR8] = cr8_write_interception, 2808 [SVM_EXIT_WRITE_CR8] = cr8_write_interception,
@@ -2871,7 +3001,8 @@ static int handle_exit(struct kvm_vcpu *vcpu)
2871 3001
2872 if (is_external_interrupt(svm->vmcb->control.exit_int_info) && 3002 if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
2873 exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR && 3003 exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR &&
2874 exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH) 3004 exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH &&
3005 exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI)
2875 printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x " 3006 printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x "
2876 "exit_code 0x%x\n", 3007 "exit_code 0x%x\n",
2877 __func__, svm->vmcb->control.exit_int_info, 3008 __func__, svm->vmcb->control.exit_int_info,
@@ -3088,8 +3219,10 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
3088 3219
3089 svm->int3_injected = 0; 3220 svm->int3_injected = 0;
3090 3221
3091 if (svm->vcpu.arch.hflags & HF_IRET_MASK) 3222 if (svm->vcpu.arch.hflags & HF_IRET_MASK) {
3092 svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK); 3223 svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
3224 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
3225 }
3093 3226
3094 svm->vcpu.arch.nmi_injected = false; 3227 svm->vcpu.arch.nmi_injected = false;
3095 kvm_clear_exception_queue(&svm->vcpu); 3228 kvm_clear_exception_queue(&svm->vcpu);
@@ -3098,6 +3231,8 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
3098 if (!(exitintinfo & SVM_EXITINTINFO_VALID)) 3231 if (!(exitintinfo & SVM_EXITINTINFO_VALID))
3099 return; 3232 return;
3100 3233
3234 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
3235
3101 vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK; 3236 vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
3102 type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK; 3237 type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;
3103 3238
@@ -3134,6 +3269,17 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
3134 } 3269 }
3135} 3270}
3136 3271
3272static void svm_cancel_injection(struct kvm_vcpu *vcpu)
3273{
3274 struct vcpu_svm *svm = to_svm(vcpu);
3275 struct vmcb_control_area *control = &svm->vmcb->control;
3276
3277 control->exit_int_info = control->event_inj;
3278 control->exit_int_info_err = control->event_inj_err;
3279 control->event_inj = 0;
3280 svm_complete_interrupts(svm);
3281}
3282
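
svm_cancel_injection above undoes an event that was queued into EVENTINJ but never delivered because the entry was aborted: it copies the pending event into the exit_int_info fields and lets svm_complete_interrupts() requeue it in software. A hedged sketch of the same move on a toy control block:

#include <stdio.h>
#include <stdint.h>

struct toy_control {
	uint32_t event_inj;	/* what we asked hardware to inject on entry */
	uint32_t exit_int_info;	/* what hardware reports as interrupted on exit */
};

/* Cancelling = pretend the undelivered event was interrupted, so the
 * normal completion path picks it up and requeues it in software. */
static void cancel_injection(struct toy_control *c)
{
	c->exit_int_info = c->event_inj;
	c->event_inj = 0;
}

int main(void)
{
	struct toy_control c = { .event_inj = 0x80000030 /* valid, vector 0x30 */ };

	cancel_injection(&c);
	printf("requeue from exit_int_info = %#x\n", c.exit_int_info);
	return 0;
}
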
3137#ifdef CONFIG_X86_64 3283#ifdef CONFIG_X86_64
3138#define R "r" 3284#define R "r"
3139#else 3285#else
@@ -3167,9 +3313,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
3167 savesegment(gs, gs_selector); 3313 savesegment(gs, gs_selector);
3168 ldt_selector = kvm_read_ldt(); 3314 ldt_selector = kvm_read_ldt();
3169 svm->vmcb->save.cr2 = vcpu->arch.cr2; 3315 svm->vmcb->save.cr2 = vcpu->arch.cr2;
3170 /* required for live migration with NPT */
3171 if (npt_enabled)
3172 svm->vmcb->save.cr3 = vcpu->arch.cr3;
3173 3316
3174 clgi(); 3317 clgi();
3175 3318
@@ -3291,16 +3434,22 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
3291{ 3434{
3292 struct vcpu_svm *svm = to_svm(vcpu); 3435 struct vcpu_svm *svm = to_svm(vcpu);
3293 3436
3294 if (npt_enabled) {
3295 svm->vmcb->control.nested_cr3 = root;
3296 force_new_asid(vcpu);
3297 return;
3298 }
3299
3300 svm->vmcb->save.cr3 = root; 3437 svm->vmcb->save.cr3 = root;
3301 force_new_asid(vcpu); 3438 force_new_asid(vcpu);
3302} 3439}
3303 3440
3441static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root)
3442{
3443 struct vcpu_svm *svm = to_svm(vcpu);
3444
3445 svm->vmcb->control.nested_cr3 = root;
3446
3447 /* Also sync guest cr3 here in case we live migrate */
3448 svm->vmcb->save.cr3 = vcpu->arch.cr3;
3449
3450 force_new_asid(vcpu);
3451}
3452
3304static int is_disabled(void) 3453static int is_disabled(void)
3305{ 3454{
3306 u64 vm_cr; 3455 u64 vm_cr;
@@ -3333,15 +3482,6 @@ static bool svm_cpu_has_accelerated_tpr(void)
3333 return false; 3482 return false;
3334} 3483}
3335 3484
3336static int get_npt_level(void)
3337{
3338#ifdef CONFIG_X86_64
3339 return PT64_ROOT_LEVEL;
3340#else
3341 return PT32E_ROOT_LEVEL;
3342#endif
3343}
3344
3345static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) 3485static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
3346{ 3486{
3347 return 0; 3487 return 0;
@@ -3354,12 +3494,25 @@ static void svm_cpuid_update(struct kvm_vcpu *vcpu)
3354static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) 3494static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
3355{ 3495{
3356 switch (func) { 3496 switch (func) {
3497 case 0x80000001:
3498 if (nested)
3499 entry->ecx |= (1 << 2); /* Set SVM bit */
3500 break;
3357 case 0x8000000A: 3501 case 0x8000000A:
3358 entry->eax = 1; /* SVM revision 1 */ 3502 entry->eax = 1; /* SVM revision 1 */
3359 entry->ebx = 8; /* Lets support 8 ASIDs in case we add proper 3503 entry->ebx = 8; /* Lets support 8 ASIDs in case we add proper
3360 ASID emulation to nested SVM */ 3504 ASID emulation to nested SVM */
3361 entry->ecx = 0; /* Reserved */ 3505 entry->ecx = 0; /* Reserved */
3362 entry->edx = 0; /* Do not support any additional features */ 3506 entry->edx = 0; /* Per default do not support any
3507 additional features */
3508
3509 /* Support next_rip if host supports it */
3510 if (svm_has(SVM_FEATURE_NRIP))
3511 entry->edx |= SVM_FEATURE_NRIP;
3512
3513 /* Support NPT for the guest if enabled */
3514 if (npt_enabled)
3515 entry->edx |= SVM_FEATURE_NPT;
3363 3516
3364 break; 3517 break;
3365 } 3518 }
@@ -3497,6 +3650,7 @@ static struct kvm_x86_ops svm_x86_ops = {
3497 .set_irq = svm_set_irq, 3650 .set_irq = svm_set_irq,
3498 .set_nmi = svm_inject_nmi, 3651 .set_nmi = svm_inject_nmi,
3499 .queue_exception = svm_queue_exception, 3652 .queue_exception = svm_queue_exception,
3653 .cancel_injection = svm_cancel_injection,
3500 .interrupt_allowed = svm_interrupt_allowed, 3654 .interrupt_allowed = svm_interrupt_allowed,
3501 .nmi_allowed = svm_nmi_allowed, 3655 .nmi_allowed = svm_nmi_allowed,
3502 .get_nmi_mask = svm_get_nmi_mask, 3656 .get_nmi_mask = svm_get_nmi_mask,
@@ -3519,6 +3673,11 @@ static struct kvm_x86_ops svm_x86_ops = {
3519 .set_supported_cpuid = svm_set_supported_cpuid, 3673 .set_supported_cpuid = svm_set_supported_cpuid,
3520 3674
3521 .has_wbinvd_exit = svm_has_wbinvd_exit, 3675 .has_wbinvd_exit = svm_has_wbinvd_exit,
3676
3677 .write_tsc_offset = svm_write_tsc_offset,
3678 .adjust_tsc_offset = svm_adjust_tsc_offset,
3679
3680 .set_tdp_cr3 = set_tdp_cr3,
3522}; 3681};
3523 3682
3524static int __init svm_init(void) 3683static int __init svm_init(void)
diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c
index e16a0dbe74d8..fc7a101c4a35 100644
--- a/arch/x86/kvm/timer.c
+++ b/arch/x86/kvm/timer.c
@@ -6,7 +6,7 @@
6 * 6 *
7 * timer support 7 * timer support
8 * 8 *
9 * Copyright 2010 Red Hat, Inc. and/or its affilates. 9 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
10 * 10 *
11 * This work is licensed under the terms of the GNU GPL, version 2. See 11 * This work is licensed under the terms of the GNU GPL, version 2. See
12 * the COPYING file in the top-level directory. 12 * the COPYING file in the top-level directory.
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 7bddfab12013..8da0e45ff7c9 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5,7 +5,7 @@
5 * machines without emulation or binary translation. 5 * machines without emulation or binary translation.
6 * 6 *
7 * Copyright (C) 2006 Qumranet, Inc. 7 * Copyright (C) 2006 Qumranet, Inc.
8 * Copyright 2010 Red Hat, Inc. and/or its affilates. 8 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
9 * 9 *
10 * Authors: 10 * Authors:
11 * Avi Kivity <avi@qumranet.com> 11 * Avi Kivity <avi@qumranet.com>
@@ -125,6 +125,7 @@ struct vcpu_vmx {
125 unsigned long host_rsp; 125 unsigned long host_rsp;
126 int launched; 126 int launched;
127 u8 fail; 127 u8 fail;
128 u32 exit_intr_info;
128 u32 idt_vectoring_info; 129 u32 idt_vectoring_info;
129 struct shared_msr_entry *guest_msrs; 130 struct shared_msr_entry *guest_msrs;
130 int nmsrs; 131 int nmsrs;
@@ -154,11 +155,6 @@ struct vcpu_vmx {
154 u32 limit; 155 u32 limit;
155 u32 ar; 156 u32 ar;
156 } tr, es, ds, fs, gs; 157 } tr, es, ds, fs, gs;
157 struct {
158 bool pending;
159 u8 vector;
160 unsigned rip;
161 } irq;
162 } rmode; 158 } rmode;
163 int vpid; 159 int vpid;
164 bool emulation_required; 160 bool emulation_required;
@@ -505,7 +501,6 @@ static void __vcpu_clear(void *arg)
505 vmcs_clear(vmx->vmcs); 501 vmcs_clear(vmx->vmcs);
506 if (per_cpu(current_vmcs, cpu) == vmx->vmcs) 502 if (per_cpu(current_vmcs, cpu) == vmx->vmcs)
507 per_cpu(current_vmcs, cpu) = NULL; 503 per_cpu(current_vmcs, cpu) = NULL;
508 rdtscll(vmx->vcpu.arch.host_tsc);
509 list_del(&vmx->local_vcpus_link); 504 list_del(&vmx->local_vcpus_link);
510 vmx->vcpu.cpu = -1; 505 vmx->vcpu.cpu = -1;
511 vmx->launched = 0; 506 vmx->launched = 0;
@@ -706,11 +701,10 @@ static void reload_tss(void)
706 /* 701 /*
707 * VT restores TR but not its size. Useless. 702 * VT restores TR but not its size. Useless.
708 */ 703 */
709 struct desc_ptr gdt; 704 struct desc_ptr *gdt = &__get_cpu_var(host_gdt);
710 struct desc_struct *descs; 705 struct desc_struct *descs;
711 706
712 native_store_gdt(&gdt); 707 descs = (void *)gdt->address;
713 descs = (void *)gdt.address;
714 descs[GDT_ENTRY_TSS].type = 9; /* available TSS */ 708 descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
715 load_TR_desc(); 709 load_TR_desc();
716} 710}
@@ -753,7 +747,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
753 747
754static unsigned long segment_base(u16 selector) 748static unsigned long segment_base(u16 selector)
755{ 749{
756 struct desc_ptr gdt; 750 struct desc_ptr *gdt = &__get_cpu_var(host_gdt);
757 struct desc_struct *d; 751 struct desc_struct *d;
758 unsigned long table_base; 752 unsigned long table_base;
759 unsigned long v; 753 unsigned long v;
@@ -761,8 +755,7 @@ static unsigned long segment_base(u16 selector)
761 if (!(selector & ~3)) 755 if (!(selector & ~3))
762 return 0; 756 return 0;
763 757
764 native_store_gdt(&gdt); 758 table_base = gdt->address;
765 table_base = gdt.address;
766 759
767 if (selector & 4) { /* from ldt */ 760 if (selector & 4) { /* from ldt */
768 u16 ldt_selector = kvm_read_ldt(); 761 u16 ldt_selector = kvm_read_ldt();
@@ -883,7 +876,6 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx)
883static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 876static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
884{ 877{
885 struct vcpu_vmx *vmx = to_vmx(vcpu); 878 struct vcpu_vmx *vmx = to_vmx(vcpu);
886 u64 tsc_this, delta, new_offset;
887 u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); 879 u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
888 880
889 if (!vmm_exclusive) 881 if (!vmm_exclusive)
@@ -897,37 +889,24 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
897 } 889 }
898 890
899 if (vcpu->cpu != cpu) { 891 if (vcpu->cpu != cpu) {
900 struct desc_ptr dt; 892 struct desc_ptr *gdt = &__get_cpu_var(host_gdt);
901 unsigned long sysenter_esp; 893 unsigned long sysenter_esp;
902 894
903 kvm_migrate_timers(vcpu);
904 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); 895 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
905 local_irq_disable(); 896 local_irq_disable();
906 list_add(&vmx->local_vcpus_link, 897 list_add(&vmx->local_vcpus_link,
907 &per_cpu(vcpus_on_cpu, cpu)); 898 &per_cpu(vcpus_on_cpu, cpu));
908 local_irq_enable(); 899 local_irq_enable();
909 900
910 vcpu->cpu = cpu;
911 /* 901 /*
912 * Linux uses per-cpu TSS and GDT, so set these when switching 902 * Linux uses per-cpu TSS and GDT, so set these when switching
913 * processors. 903 * processors.
914 */ 904 */
915 vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */ 905 vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */
916 native_store_gdt(&dt); 906 vmcs_writel(HOST_GDTR_BASE, gdt->address); /* 22.2.4 */
917 vmcs_writel(HOST_GDTR_BASE, dt.address); /* 22.2.4 */
918 907
919 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); 908 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
920 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ 909 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
921
922 /*
923 * Make sure the time stamp counter is monotonous.
924 */
925 rdtscll(tsc_this);
926 if (tsc_this < vcpu->arch.host_tsc) {
927 delta = vcpu->arch.host_tsc - tsc_this;
928 new_offset = vmcs_read64(TSC_OFFSET) + delta;
929 vmcs_write64(TSC_OFFSET, new_offset);
930 }
931 } 910 }
932} 911}
933 912
@@ -1044,16 +1023,8 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
1044 } 1023 }
1045 1024
1046 if (vmx->rmode.vm86_active) { 1025 if (vmx->rmode.vm86_active) {
1047 vmx->rmode.irq.pending = true; 1026 if (kvm_inject_realmode_interrupt(vcpu, nr) != EMULATE_DONE)
1048 vmx->rmode.irq.vector = nr; 1027 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
1049 vmx->rmode.irq.rip = kvm_rip_read(vcpu);
1050 if (kvm_exception_is_soft(nr))
1051 vmx->rmode.irq.rip +=
1052 vmx->vcpu.arch.event_exit_inst_len;
1053 intr_info |= INTR_TYPE_SOFT_INTR;
1054 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
1055 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
1056 kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
1057 return; 1028 return;
1058 } 1029 }
1059 1030
@@ -1149,12 +1120,17 @@ static u64 guest_read_tsc(void)
1149} 1120}
1150 1121
1151/* 1122/*
1152 * writes 'guest_tsc' into guest's timestamp counter "register" 1123 * writes 'offset' into guest's timestamp counter offset register
1153 * guest_tsc = host_tsc + tsc_offset ==> tsc_offset = guest_tsc - host_tsc
1154 */ 1124 */
1155static void guest_write_tsc(u64 guest_tsc, u64 host_tsc) 1125static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
1126{
1127 vmcs_write64(TSC_OFFSET, offset);
1128}
1129
1130static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
1156{ 1131{
1157 vmcs_write64(TSC_OFFSET, guest_tsc - host_tsc); 1132 u64 offset = vmcs_read64(TSC_OFFSET);
1133 vmcs_write64(TSC_OFFSET, offset + adjustment);
1158} 1134}
1159 1135
1160/* 1136/*
@@ -1227,7 +1203,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1227{ 1203{
1228 struct vcpu_vmx *vmx = to_vmx(vcpu); 1204 struct vcpu_vmx *vmx = to_vmx(vcpu);
1229 struct shared_msr_entry *msr; 1205 struct shared_msr_entry *msr;
1230 u64 host_tsc;
1231 int ret = 0; 1206 int ret = 0;
1232 1207
1233 switch (msr_index) { 1208 switch (msr_index) {
@@ -1257,8 +1232,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1257 vmcs_writel(GUEST_SYSENTER_ESP, data); 1232 vmcs_writel(GUEST_SYSENTER_ESP, data);
1258 break; 1233 break;
1259 case MSR_IA32_TSC: 1234 case MSR_IA32_TSC:
1260 rdtscll(host_tsc); 1235 kvm_write_tsc(vcpu, data);
1261 guest_write_tsc(data, host_tsc);
1262 break; 1236 break;
1263 case MSR_IA32_CR_PAT: 1237 case MSR_IA32_CR_PAT:
1264 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { 1238 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
@@ -1856,20 +1830,20 @@ static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
1856 return; 1830 return;
1857 1831
1858 if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { 1832 if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
1859 vmcs_write64(GUEST_PDPTR0, vcpu->arch.pdptrs[0]); 1833 vmcs_write64(GUEST_PDPTR0, vcpu->arch.mmu.pdptrs[0]);
1860 vmcs_write64(GUEST_PDPTR1, vcpu->arch.pdptrs[1]); 1834 vmcs_write64(GUEST_PDPTR1, vcpu->arch.mmu.pdptrs[1]);
1861 vmcs_write64(GUEST_PDPTR2, vcpu->arch.pdptrs[2]); 1835 vmcs_write64(GUEST_PDPTR2, vcpu->arch.mmu.pdptrs[2]);
1862 vmcs_write64(GUEST_PDPTR3, vcpu->arch.pdptrs[3]); 1836 vmcs_write64(GUEST_PDPTR3, vcpu->arch.mmu.pdptrs[3]);
1863 } 1837 }
1864} 1838}
1865 1839
1866static void ept_save_pdptrs(struct kvm_vcpu *vcpu) 1840static void ept_save_pdptrs(struct kvm_vcpu *vcpu)
1867{ 1841{
1868 if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { 1842 if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
1869 vcpu->arch.pdptrs[0] = vmcs_read64(GUEST_PDPTR0); 1843 vcpu->arch.mmu.pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
1870 vcpu->arch.pdptrs[1] = vmcs_read64(GUEST_PDPTR1); 1844 vcpu->arch.mmu.pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
1871 vcpu->arch.pdptrs[2] = vmcs_read64(GUEST_PDPTR2); 1845 vcpu->arch.mmu.pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
1872 vcpu->arch.pdptrs[3] = vmcs_read64(GUEST_PDPTR3); 1846 vcpu->arch.mmu.pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
1873 } 1847 }
1874 1848
1875 __set_bit(VCPU_EXREG_PDPTR, 1849 __set_bit(VCPU_EXREG_PDPTR,
@@ -2515,7 +2489,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2515{ 2489{
2516 u32 host_sysenter_cs, msr_low, msr_high; 2490 u32 host_sysenter_cs, msr_low, msr_high;
2517 u32 junk; 2491 u32 junk;
2518 u64 host_pat, tsc_this, tsc_base; 2492 u64 host_pat;
2519 unsigned long a; 2493 unsigned long a;
2520 struct desc_ptr dt; 2494 struct desc_ptr dt;
2521 int i; 2495 int i;
@@ -2656,12 +2630,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2656 vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE; 2630 vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE;
2657 vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); 2631 vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
2658 2632
2659 tsc_base = vmx->vcpu.kvm->arch.vm_init_tsc; 2633 kvm_write_tsc(&vmx->vcpu, 0);
2660 rdtscll(tsc_this);
2661 if (tsc_this < vmx->vcpu.kvm->arch.vm_init_tsc)
2662 tsc_base = tsc_this;
2663
2664 guest_write_tsc(0, tsc_base);
2665 2634
2666 return 0; 2635 return 0;
2667} 2636}
@@ -2834,16 +2803,8 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu)
2834 2803
2835 ++vcpu->stat.irq_injections; 2804 ++vcpu->stat.irq_injections;
2836 if (vmx->rmode.vm86_active) { 2805 if (vmx->rmode.vm86_active) {
2837 vmx->rmode.irq.pending = true; 2806 if (kvm_inject_realmode_interrupt(vcpu, irq) != EMULATE_DONE)
2838 vmx->rmode.irq.vector = irq; 2807 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
2839 vmx->rmode.irq.rip = kvm_rip_read(vcpu);
2840 if (vcpu->arch.interrupt.soft)
2841 vmx->rmode.irq.rip +=
2842 vmx->vcpu.arch.event_exit_inst_len;
2843 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2844 irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK);
2845 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
2846 kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
2847 return; 2808 return;
2848 } 2809 }
2849 intr = irq | INTR_INFO_VALID_MASK; 2810 intr = irq | INTR_INFO_VALID_MASK;
@@ -2875,14 +2836,8 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
2875 2836
2876 ++vcpu->stat.nmi_injections; 2837 ++vcpu->stat.nmi_injections;
2877 if (vmx->rmode.vm86_active) { 2838 if (vmx->rmode.vm86_active) {
2878 vmx->rmode.irq.pending = true; 2839 if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR) != EMULATE_DONE)
2879 vmx->rmode.irq.vector = NMI_VECTOR; 2840 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
2880 vmx->rmode.irq.rip = kvm_rip_read(vcpu);
2881 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2882 NMI_VECTOR | INTR_TYPE_SOFT_INTR |
2883 INTR_INFO_VALID_MASK);
2884 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
2885 kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
2886 return; 2841 return;
2887 } 2842 }
2888 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 2843 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
@@ -3346,6 +3301,7 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu)
3346 3301
3347static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) 3302static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
3348{ 3303{
3304 kvm_make_request(KVM_REQ_EVENT, vcpu);
3349 return 1; 3305 return 1;
3350} 3306}
3351 3307
@@ -3358,6 +3314,8 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu)
3358 cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; 3314 cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
3359 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); 3315 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
3360 3316
3317 kvm_make_request(KVM_REQ_EVENT, vcpu);
3318
3361 ++vcpu->stat.irq_window_exits; 3319 ++vcpu->stat.irq_window_exits;
3362 3320
3363 /* 3321 /*
@@ -3614,6 +3572,7 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu)
3614 cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING; 3572 cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
3615 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); 3573 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
3616 ++vcpu->stat.nmi_window_exits; 3574 ++vcpu->stat.nmi_window_exits;
3575 kvm_make_request(KVM_REQ_EVENT, vcpu);
3617 3576
3618 return 1; 3577 return 1;
3619} 3578}
@@ -3623,8 +3582,17 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
3623 struct vcpu_vmx *vmx = to_vmx(vcpu); 3582 struct vcpu_vmx *vmx = to_vmx(vcpu);
3624 enum emulation_result err = EMULATE_DONE; 3583 enum emulation_result err = EMULATE_DONE;
3625 int ret = 1; 3584 int ret = 1;
3585 u32 cpu_exec_ctrl;
3586 bool intr_window_requested;
3587
3588 cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
3589 intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING;
3626 3590
3627 while (!guest_state_valid(vcpu)) { 3591 while (!guest_state_valid(vcpu)) {
3592 if (intr_window_requested
3593 && (kvm_get_rflags(&vmx->vcpu) & X86_EFLAGS_IF))
3594 return handle_interrupt_window(&vmx->vcpu);
3595
3628 err = emulate_instruction(vcpu, 0, 0, 0); 3596 err = emulate_instruction(vcpu, 0, 0, 0);
3629 3597
3630 if (err == EMULATE_DO_MMIO) { 3598 if (err == EMULATE_DO_MMIO) {
@@ -3790,18 +3758,9 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
3790 vmcs_write32(TPR_THRESHOLD, irr); 3758 vmcs_write32(TPR_THRESHOLD, irr);
3791} 3759}
3792 3760
3793static void vmx_complete_interrupts(struct vcpu_vmx *vmx) 3761static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
3794{ 3762{
3795 u32 exit_intr_info; 3763 u32 exit_intr_info = vmx->exit_intr_info;
3796 u32 idt_vectoring_info = vmx->idt_vectoring_info;
3797 bool unblock_nmi;
3798 u8 vector;
3799 int type;
3800 bool idtv_info_valid;
3801
3802 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
3803
3804 vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
3805 3764
3806 /* Handle machine checks before interrupts are enabled */ 3765 /* Handle machine checks before interrupts are enabled */
3807 if ((vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY) 3766 if ((vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY)
@@ -3816,8 +3775,16 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
3816 asm("int $2"); 3775 asm("int $2");
3817 kvm_after_handle_nmi(&vmx->vcpu); 3776 kvm_after_handle_nmi(&vmx->vcpu);
3818 } 3777 }
3778}
3819 3779
3820 idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; 3780static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
3781{
3782 u32 exit_intr_info = vmx->exit_intr_info;
3783 bool unblock_nmi;
3784 u8 vector;
3785 bool idtv_info_valid;
3786
3787 idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK;
3821 3788
3822 if (cpu_has_virtual_nmis()) { 3789 if (cpu_has_virtual_nmis()) {
3823 unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; 3790 unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
@@ -3839,6 +3806,18 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
3839 } else if (unlikely(vmx->soft_vnmi_blocked)) 3806 } else if (unlikely(vmx->soft_vnmi_blocked))
3840 vmx->vnmi_blocked_time += 3807 vmx->vnmi_blocked_time +=
3841 ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time)); 3808 ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time));
3809}
3810
3811static void __vmx_complete_interrupts(struct vcpu_vmx *vmx,
3812 u32 idt_vectoring_info,
3813 int instr_len_field,
3814 int error_code_field)
3815{
3816 u8 vector;
3817 int type;
3818 bool idtv_info_valid;
3819
3820 idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
3842 3821
3843 vmx->vcpu.arch.nmi_injected = false; 3822 vmx->vcpu.arch.nmi_injected = false;
3844 kvm_clear_exception_queue(&vmx->vcpu); 3823 kvm_clear_exception_queue(&vmx->vcpu);
@@ -3847,6 +3826,8 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
3847 if (!idtv_info_valid) 3826 if (!idtv_info_valid)
3848 return; 3827 return;
3849 3828
3829 kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu);
3830
3850 vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK; 3831 vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
3851 type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK; 3832 type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
3852 3833
@@ -3863,18 +3844,18 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
3863 break; 3844 break;
3864 case INTR_TYPE_SOFT_EXCEPTION: 3845 case INTR_TYPE_SOFT_EXCEPTION:
3865 vmx->vcpu.arch.event_exit_inst_len = 3846 vmx->vcpu.arch.event_exit_inst_len =
3866 vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 3847 vmcs_read32(instr_len_field);
3867 /* fall through */ 3848 /* fall through */
3868 case INTR_TYPE_HARD_EXCEPTION: 3849 case INTR_TYPE_HARD_EXCEPTION:
3869 if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { 3850 if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
3870 u32 err = vmcs_read32(IDT_VECTORING_ERROR_CODE); 3851 u32 err = vmcs_read32(error_code_field);
3871 kvm_queue_exception_e(&vmx->vcpu, vector, err); 3852 kvm_queue_exception_e(&vmx->vcpu, vector, err);
3872 } else 3853 } else
3873 kvm_queue_exception(&vmx->vcpu, vector); 3854 kvm_queue_exception(&vmx->vcpu, vector);
3874 break; 3855 break;
3875 case INTR_TYPE_SOFT_INTR: 3856 case INTR_TYPE_SOFT_INTR:
3876 vmx->vcpu.arch.event_exit_inst_len = 3857 vmx->vcpu.arch.event_exit_inst_len =
3877 vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 3858 vmcs_read32(instr_len_field);
3878 /* fall through */ 3859 /* fall through */
3879 case INTR_TYPE_EXT_INTR: 3860 case INTR_TYPE_EXT_INTR:
3880 kvm_queue_interrupt(&vmx->vcpu, vector, 3861 kvm_queue_interrupt(&vmx->vcpu, vector,
@@ -3885,27 +3866,21 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
3885 } 3866 }
3886} 3867}
3887 3868
3888/* 3869static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
3889 * Failure to inject an interrupt should give us the information
3890 * in IDT_VECTORING_INFO_FIELD. However, if the failure occurs
3891 * when fetching the interrupt redirection bitmap in the real-mode
3892 * tss, this doesn't happen. So we do it ourselves.
3893 */
3894static void fixup_rmode_irq(struct vcpu_vmx *vmx)
3895{ 3870{
3896 vmx->rmode.irq.pending = 0; 3871 __vmx_complete_interrupts(vmx, vmx->idt_vectoring_info,
3897 if (kvm_rip_read(&vmx->vcpu) + 1 != vmx->rmode.irq.rip) 3872 VM_EXIT_INSTRUCTION_LEN,
3898 return; 3873 IDT_VECTORING_ERROR_CODE);
3899 kvm_rip_write(&vmx->vcpu, vmx->rmode.irq.rip); 3874}
3900 if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) { 3875
3901 vmx->idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK; 3876static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
3902 vmx->idt_vectoring_info |= INTR_TYPE_EXT_INTR; 3877{
3903 return; 3878 __vmx_complete_interrupts(to_vmx(vcpu),
3904 } 3879 vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
3905 vmx->idt_vectoring_info = 3880 VM_ENTRY_INSTRUCTION_LEN,
3906 VECTORING_INFO_VALID_MASK 3881 VM_ENTRY_EXCEPTION_ERROR_CODE);
3907 | INTR_TYPE_EXT_INTR 3882
3908 | vmx->rmode.irq.vector; 3883 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
3909} 3884}
3910 3885
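
The refactoring above lets one walker serve two situations: after a completed exit it reads the VM_EXIT_* instruction-length and error-code fields, while vmx_cancel_injection feeds it the still-pending VM_ENTRY_* fields so an event that never got delivered is requeued the same way. A small sketch of parameterizing a decoder by field IDs (toy VMCS accessors, not the real vmcs_read32):

#include <stdio.h>
#include <stdint.h>

enum field { EXIT_INSTR_LEN, EXIT_ERR_CODE, ENTRY_INSTR_LEN, ENTRY_ERR_CODE };

static uint32_t toy_vmcs[4] = { 3, 0, 2, 11 };	/* pretend VMCS contents */

static uint32_t toy_read(enum field f) { return toy_vmcs[f]; }

/* One decoder, told which instruction-length/error-code fields to use. */
static void complete(uint32_t info, enum field len_field, enum field err_field)
{
	printf("vector %u, len %u, err %u\n",
	       info & 0xff, toy_read(len_field), toy_read(err_field));
}

int main(void)
{
	uint32_t idt_vectoring = 0x80000030;	/* valid, vector 0x30 */

	/* Normal exit path. */
	complete(idt_vectoring, EXIT_INSTR_LEN, EXIT_ERR_CODE);
	/* Cancel path: re-interpret what was about to be injected. */
	complete(idt_vectoring, ENTRY_INSTR_LEN, ENTRY_ERR_CODE);
	return 0;
}
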
3911#ifdef CONFIG_X86_64 3886#ifdef CONFIG_X86_64
@@ -4032,7 +4007,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
4032#endif 4007#endif
4033 [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)) 4008 [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2))
4034 : "cc", "memory" 4009 : "cc", "memory"
4035 , R"bx", R"di", R"si" 4010 , R"ax", R"bx", R"di", R"si"
4036#ifdef CONFIG_X86_64 4011#ifdef CONFIG_X86_64
4037 , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" 4012 , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
4038#endif 4013#endif
@@ -4043,12 +4018,15 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
4043 vcpu->arch.regs_dirty = 0; 4018 vcpu->arch.regs_dirty = 0;
4044 4019
4045 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); 4020 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
4046 if (vmx->rmode.irq.pending)
4047 fixup_rmode_irq(vmx);
4048 4021
4049 asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); 4022 asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
4050 vmx->launched = 1; 4023 vmx->launched = 1;
4051 4024
4025 vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
4026 vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
4027
4028 vmx_complete_atomic_exit(vmx);
4029 vmx_recover_nmi_blocking(vmx);
4052 vmx_complete_interrupts(vmx); 4030 vmx_complete_interrupts(vmx);
4053} 4031}
4054 4032
@@ -4119,6 +4097,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
4119 4097
4120 cpu = get_cpu(); 4098 cpu = get_cpu();
4121 vmx_vcpu_load(&vmx->vcpu, cpu); 4099 vmx_vcpu_load(&vmx->vcpu, cpu);
4100 vmx->vcpu.cpu = cpu;
4122 err = vmx_vcpu_setup(vmx); 4101 err = vmx_vcpu_setup(vmx);
4123 vmx_vcpu_put(&vmx->vcpu); 4102 vmx_vcpu_put(&vmx->vcpu);
4124 put_cpu(); 4103 put_cpu();
@@ -4334,6 +4313,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
4334 .set_irq = vmx_inject_irq, 4313 .set_irq = vmx_inject_irq,
4335 .set_nmi = vmx_inject_nmi, 4314 .set_nmi = vmx_inject_nmi,
4336 .queue_exception = vmx_queue_exception, 4315 .queue_exception = vmx_queue_exception,
4316 .cancel_injection = vmx_cancel_injection,
4337 .interrupt_allowed = vmx_interrupt_allowed, 4317 .interrupt_allowed = vmx_interrupt_allowed,
4338 .nmi_allowed = vmx_nmi_allowed, 4318 .nmi_allowed = vmx_nmi_allowed,
4339 .get_nmi_mask = vmx_get_nmi_mask, 4319 .get_nmi_mask = vmx_get_nmi_mask,
@@ -4356,6 +4336,11 @@ static struct kvm_x86_ops vmx_x86_ops = {
4356 .set_supported_cpuid = vmx_set_supported_cpuid, 4336 .set_supported_cpuid = vmx_set_supported_cpuid,
4357 4337
4358 .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, 4338 .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
4339
4340 .write_tsc_offset = vmx_write_tsc_offset,
4341 .adjust_tsc_offset = vmx_adjust_tsc_offset,
4342
4343 .set_tdp_cr3 = vmx_set_cr3,
4359}; 4344};
4360 4345
4361static int __init vmx_init(void) 4346static int __init vmx_init(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6c2ecf0a806d..2288ad829b32 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6,7 +6,7 @@
6 * Copyright (C) 2006 Qumranet, Inc. 6 * Copyright (C) 2006 Qumranet, Inc.
7 * Copyright (C) 2008 Qumranet, Inc. 7 * Copyright (C) 2008 Qumranet, Inc.
8 * Copyright IBM Corporation, 2008 8 * Copyright IBM Corporation, 2008
9 * Copyright 2010 Red Hat, Inc. and/or its affilates. 9 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
10 * 10 *
11 * Authors: 11 * Authors:
12 * Avi Kivity <avi@qumranet.com> 12 * Avi Kivity <avi@qumranet.com>
@@ -55,6 +55,8 @@
55#include <asm/mce.h> 55#include <asm/mce.h>
56#include <asm/i387.h> 56#include <asm/i387.h>
57#include <asm/xcr.h> 57#include <asm/xcr.h>
58#include <asm/pvclock.h>
59#include <asm/div64.h>
58 60
59#define MAX_IO_MSRS 256 61#define MAX_IO_MSRS 256
60#define CR0_RESERVED_BITS \ 62#define CR0_RESERVED_BITS \
@@ -71,7 +73,7 @@
71#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) 73#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
72 74
73#define KVM_MAX_MCE_BANKS 32 75#define KVM_MAX_MCE_BANKS 32
74#define KVM_MCE_CAP_SUPPORTED MCG_CTL_P 76#define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P)
75 77
76/* EFER defaults: 78/* EFER defaults:
77 * - enable syscall per default because its emulated by KVM 79 * - enable syscall per default because its emulated by KVM
@@ -282,6 +284,8 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
282 u32 prev_nr; 284 u32 prev_nr;
283 int class1, class2; 285 int class1, class2;
284 286
287 kvm_make_request(KVM_REQ_EVENT, vcpu);
288
285 if (!vcpu->arch.exception.pending) { 289 if (!vcpu->arch.exception.pending) {
286 queue: 290 queue:
287 vcpu->arch.exception.pending = true; 291 vcpu->arch.exception.pending = true;
@@ -327,16 +331,28 @@ void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
327} 331}
328EXPORT_SYMBOL_GPL(kvm_requeue_exception); 332EXPORT_SYMBOL_GPL(kvm_requeue_exception);
329 333
330void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr, 334void kvm_inject_page_fault(struct kvm_vcpu *vcpu)
331 u32 error_code)
332{ 335{
336 unsigned error_code = vcpu->arch.fault.error_code;
337
333 ++vcpu->stat.pf_guest; 338 ++vcpu->stat.pf_guest;
334 vcpu->arch.cr2 = addr; 339 vcpu->arch.cr2 = vcpu->arch.fault.address;
335 kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); 340 kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
336} 341}
337 342
343void kvm_propagate_fault(struct kvm_vcpu *vcpu)
344{
345 if (mmu_is_nested(vcpu) && !vcpu->arch.fault.nested)
346 vcpu->arch.nested_mmu.inject_page_fault(vcpu);
347 else
348 vcpu->arch.mmu.inject_page_fault(vcpu);
349
350 vcpu->arch.fault.nested = false;
351}
352
338void kvm_inject_nmi(struct kvm_vcpu *vcpu) 353void kvm_inject_nmi(struct kvm_vcpu *vcpu)
339{ 354{
355 kvm_make_request(KVM_REQ_EVENT, vcpu);
340 vcpu->arch.nmi_pending = 1; 356 vcpu->arch.nmi_pending = 1;
341} 357}
342EXPORT_SYMBOL_GPL(kvm_inject_nmi); 358EXPORT_SYMBOL_GPL(kvm_inject_nmi);
@@ -367,18 +383,49 @@ bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
367EXPORT_SYMBOL_GPL(kvm_require_cpl); 383EXPORT_SYMBOL_GPL(kvm_require_cpl);
368 384
369/* 385/*
386 * This function will be used to read from the physical memory of the currently
387 * running guest. The difference from kvm_read_guest_page is that this function
388 * can read from guest physical or from the guest's guest physical memory.
389 */
390int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
391 gfn_t ngfn, void *data, int offset, int len,
392 u32 access)
393{
394 gfn_t real_gfn;
395 gpa_t ngpa;
396
397 ngpa = gfn_to_gpa(ngfn);
398 real_gfn = mmu->translate_gpa(vcpu, ngpa, access);
399 if (real_gfn == UNMAPPED_GVA)
400 return -EFAULT;
401
402 real_gfn = gpa_to_gfn(real_gfn);
403
404 return kvm_read_guest_page(vcpu->kvm, real_gfn, data, offset, len);
405}
406EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu);
407
408int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
409 void *data, int offset, int len, u32 access)
410{
411 return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn,
412 data, offset, len, access);
413}
414
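
kvm_read_guest_page_mmu adds one level of indirection: the gfn handed in may itself be a guest-physical frame of the nested (L2) guest, and mmu->translate_gpa() maps it to a real (L1) gfn before the ordinary read. A hedged sketch of that two-step lookup with a toy translation table (illustrative names only):

#include <stdio.h>
#include <stdint.h>

#define UNMAPPED ((uint64_t)-1)

/* Toy nested page table: L2 gfn -> L1 gfn. */
static uint64_t toy_npt_translate(uint64_t l2_gfn)
{
	return l2_gfn == 0x10 ? 0x200 : UNMAPPED;
}

/* Two-level read: translate the nested gfn first, then read the real frame. */
static int read_nested_gfn(uint64_t l2_gfn, uint64_t *real_gfn)
{
	uint64_t g = toy_npt_translate(l2_gfn);

	if (g == UNMAPPED)
		return -1;	/* would become a nested page fault */
	*real_gfn = g;
	return 0;
}

int main(void)
{
	uint64_t real;

	if (!read_nested_gfn(0x10, &real))
		printf("L2 gfn 0x10 -> L1 gfn %#llx\n", (unsigned long long)real);
	return 0;
}
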
415/*
370 * Load the pae pdptrs. Return true is they are all valid. 416 * Load the pae pdptrs. Return true is they are all valid.
371 */ 417 */
372int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) 418int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
373{ 419{
374 gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT; 420 gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
375 unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2; 421 unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
376 int i; 422 int i;
377 int ret; 423 int ret;
378 u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; 424 u64 pdpte[ARRAY_SIZE(mmu->pdptrs)];
379 425
380 ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte, 426 ret = kvm_read_guest_page_mmu(vcpu, mmu, pdpt_gfn, pdpte,
381 offset * sizeof(u64), sizeof(pdpte)); 427 offset * sizeof(u64), sizeof(pdpte),
428 PFERR_USER_MASK|PFERR_WRITE_MASK);
382 if (ret < 0) { 429 if (ret < 0) {
383 ret = 0; 430 ret = 0;
384 goto out; 431 goto out;
@@ -392,7 +439,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
392 } 439 }
393 ret = 1; 440 ret = 1;
394 441
395 memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs)); 442 memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
396 __set_bit(VCPU_EXREG_PDPTR, 443 __set_bit(VCPU_EXREG_PDPTR,
397 (unsigned long *)&vcpu->arch.regs_avail); 444 (unsigned long *)&vcpu->arch.regs_avail);
398 __set_bit(VCPU_EXREG_PDPTR, 445 __set_bit(VCPU_EXREG_PDPTR,
@@ -405,8 +452,10 @@ EXPORT_SYMBOL_GPL(load_pdptrs);
405 452
406static bool pdptrs_changed(struct kvm_vcpu *vcpu) 453static bool pdptrs_changed(struct kvm_vcpu *vcpu)
407{ 454{
408 u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; 455 u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)];
409 bool changed = true; 456 bool changed = true;
457 int offset;
458 gfn_t gfn;
410 int r; 459 int r;
411 460
412 if (is_long_mode(vcpu) || !is_pae(vcpu)) 461 if (is_long_mode(vcpu) || !is_pae(vcpu))
@@ -416,10 +465,13 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu)
416 (unsigned long *)&vcpu->arch.regs_avail)) 465 (unsigned long *)&vcpu->arch.regs_avail))
417 return true; 466 return true;
418 467
419 r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte)); 468 gfn = (vcpu->arch.cr3 & ~31u) >> PAGE_SHIFT;
469 offset = (vcpu->arch.cr3 & ~31u) & (PAGE_SIZE - 1);
470 r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
471 PFERR_USER_MASK | PFERR_WRITE_MASK);
420 if (r < 0) 472 if (r < 0)
421 goto out; 473 goto out;
422 changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0; 474 changed = memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0;
423out: 475out:
424 476
425 return changed; 477 return changed;
@@ -458,7 +510,8 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
458 return 1; 510 return 1;
459 } else 511 } else
460#endif 512#endif
461 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) 513 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
514 vcpu->arch.cr3))
462 return 1; 515 return 1;
463 } 516 }
464 517
@@ -547,7 +600,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
547 return 1; 600 return 1;
548 } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) 601 } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
549 && ((cr4 ^ old_cr4) & pdptr_bits) 602 && ((cr4 ^ old_cr4) & pdptr_bits)
550 && !load_pdptrs(vcpu, vcpu->arch.cr3)) 603 && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3))
551 return 1; 604 return 1;
552 605
553 if (cr4 & X86_CR4_VMXE) 606 if (cr4 & X86_CR4_VMXE)
@@ -580,7 +633,8 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
580 if (is_pae(vcpu)) { 633 if (is_pae(vcpu)) {
581 if (cr3 & CR3_PAE_RESERVED_BITS) 634 if (cr3 & CR3_PAE_RESERVED_BITS)
582 return 1; 635 return 1;
583 if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) 636 if (is_paging(vcpu) &&
637 !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
584 return 1; 638 return 1;
585 } 639 }
586 /* 640 /*
@@ -737,7 +791,7 @@ static u32 msrs_to_save[] = {
737#ifdef CONFIG_X86_64 791#ifdef CONFIG_X86_64
738 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, 792 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
739#endif 793#endif
740 MSR_IA32_TSC, MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA 794 MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
741}; 795};
742 796
743static unsigned num_msrs_to_save; 797static unsigned num_msrs_to_save;
@@ -838,7 +892,7 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
838 892
839 /* 893 /*
840 * The guest calculates current wall clock time by adding 894 * The guest calculates current wall clock time by adding
841 * system time (updated by kvm_write_guest_time below) to the 895 * system time (updated by kvm_guest_time_update below) to the
842 * wall clock specified here. guest system time equals host 896 * wall clock specified here. guest system time equals host
843 * system time for us, thus we must fill in host boot time here. 897 * system time for us, thus we must fill in host boot time here.
844 */ 898 */
@@ -866,65 +920,229 @@ static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
866 return quotient; 920 return quotient;
867} 921}
868 922
869static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock) 923static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz,
924 s8 *pshift, u32 *pmultiplier)
870{ 925{
871 uint64_t nsecs = 1000000000LL; 926 uint64_t scaled64;
872 int32_t shift = 0; 927 int32_t shift = 0;
873 uint64_t tps64; 928 uint64_t tps64;
874 uint32_t tps32; 929 uint32_t tps32;
875 930
876 tps64 = tsc_khz * 1000LL; 931 tps64 = base_khz * 1000LL;
877 while (tps64 > nsecs*2) { 932 scaled64 = scaled_khz * 1000LL;
933 while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) {
878 tps64 >>= 1; 934 tps64 >>= 1;
879 shift--; 935 shift--;
880 } 936 }
881 937
882 tps32 = (uint32_t)tps64; 938 tps32 = (uint32_t)tps64;
883 while (tps32 <= (uint32_t)nsecs) { 939 while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000ULL) {
884 tps32 <<= 1; 940 if (scaled64 & 0xffffffff00000000ULL || tps32 & 0x80000000)
941 scaled64 >>= 1;
942 else
943 tps32 <<= 1;
885 shift++; 944 shift++;
886 } 945 }
887 946
888 hv_clock->tsc_shift = shift; 947 *pshift = shift;
889 hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32); 948 *pmultiplier = div_frac(scaled64, tps32);
890 949
891 pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n", 950 pr_debug("%s: base_khz %u => %u, shift %d, mul %u\n",
892 __func__, tsc_khz, hv_clock->tsc_shift, 951 __func__, base_khz, scaled_khz, shift, *pmultiplier);
893 hv_clock->tsc_to_system_mul); 952}
953
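
kvm_get_time_scale above picks a (shift, multiplier) pair so that a delta of cycles at base_khz can be converted to units at scaled_khz with only a shift and a 32x32->64 multiply, matching the pvclock layout. A self-contained userspace sketch of the same computation and its use (a minimal adaptation, assuming the GCC/Clang __int128 extension for the wide multiply):

#include <stdio.h>
#include <stdint.h>

/* Find shift/mult such that scaled = ((delta shifted by shift) * mult) >> 32,
 * with delta given in base_khz ticks. */
static void get_time_scale(uint32_t scaled_khz, uint32_t base_khz,
			   int8_t *pshift, uint32_t *pmult)
{
	uint64_t scaled64 = (uint64_t)scaled_khz * 1000;
	uint64_t tps64 = (uint64_t)base_khz * 1000;
	uint32_t tps32;
	int8_t shift = 0;

	while (tps64 > scaled64 * 2 || tps64 & 0xffffffff00000000ULL) {
		tps64 >>= 1;
		shift--;
	}
	tps32 = (uint32_t)tps64;
	while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000ULL) {
		if (scaled64 & 0xffffffff00000000ULL || tps32 & 0x80000000)
			scaled64 >>= 1;
		else
			tps32 <<= 1;
		shift++;
	}
	*pshift = shift;
	*pmult = (uint32_t)((scaled64 << 32) / tps32);	/* div_frac() */
}

/* pvclock-style application of the pair. */
static uint64_t scale_delta(uint64_t delta, uint32_t mult, int8_t shift)
{
	if (shift < 0)
		delta >>= -shift;
	else
		delta <<= shift;
	return (uint64_t)(((unsigned __int128)delta * mult) >> 32);
}

int main(void)
{
	int8_t shift;
	uint32_t mult;

	get_time_scale(1000000, 2500000, &shift, &mult);	/* ns from a 2.5 GHz TSC */
	printf("shift=%d mult=%u, one second of cycles -> %llu ns\n",
	       shift, mult,
	       (unsigned long long)scale_delta(2500000000ULL, mult, shift));
	return 0;
}
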
954static inline u64 get_kernel_ns(void)
955{
956 struct timespec ts;
957
958 WARN_ON(preemptible());
959 ktime_get_ts(&ts);
960 monotonic_to_bootbased(&ts);
961 return timespec_to_ns(&ts);
894} 962}
895 963
896static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); 964static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
965unsigned long max_tsc_khz;
897 966
898static void kvm_write_guest_time(struct kvm_vcpu *v) 967static inline int kvm_tsc_changes_freq(void)
968{
969 int cpu = get_cpu();
970 int ret = !boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
971 cpufreq_quick_get(cpu) != 0;
972 put_cpu();
973 return ret;
974}
975
976static inline u64 nsec_to_cycles(u64 nsec)
977{
978 u64 ret;
979
980 WARN_ON(preemptible());
981 if (kvm_tsc_changes_freq())
982 printk_once(KERN_WARNING
983 "kvm: unreliable cycle conversion on adjustable rate TSC\n");
984 ret = nsec * __get_cpu_var(cpu_tsc_khz);
985 do_div(ret, USEC_PER_SEC);
986 return ret;
987}
988
989static void kvm_arch_set_tsc_khz(struct kvm *kvm, u32 this_tsc_khz)
990{
991 /* Compute a scale to convert nanoseconds in TSC cycles */
992 kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000,
993 &kvm->arch.virtual_tsc_shift,
994 &kvm->arch.virtual_tsc_mult);
995 kvm->arch.virtual_tsc_khz = this_tsc_khz;
996}
997
998static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
999{
1000 u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.last_tsc_nsec,
1001 vcpu->kvm->arch.virtual_tsc_mult,
1002 vcpu->kvm->arch.virtual_tsc_shift);
1003 tsc += vcpu->arch.last_tsc_write;
1004 return tsc;
1005}
1006
1007void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
1008{
1009 struct kvm *kvm = vcpu->kvm;
1010 u64 offset, ns, elapsed;
1011 unsigned long flags;
1012 s64 sdiff;
1013
1014 spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
1015 offset = data - native_read_tsc();
1016 ns = get_kernel_ns();
1017 elapsed = ns - kvm->arch.last_tsc_nsec;
1018 sdiff = data - kvm->arch.last_tsc_write;
1019 if (sdiff < 0)
1020 sdiff = -sdiff;
1021
1022 /*
1023 * Special case: a TSC write within 5 seconds of a write on
1024 * another CPU is interpreted as an attempt to synchronize.
1025 * The 5 seconds is to accommodate host load / swapping as
1026 * well as any reset of TSC during the boot process.
1027 *
1028 * In that case, for a reliable TSC, we can match TSC offsets,
1029 * or make a best guess using the elapsed value.
1030 */
1031 if (sdiff < nsec_to_cycles(5ULL * NSEC_PER_SEC) &&
1032 elapsed < 5ULL * NSEC_PER_SEC) {
1033 if (!check_tsc_unstable()) {
1034 offset = kvm->arch.last_tsc_offset;
1035 pr_debug("kvm: matched tsc offset for %llu\n", data);
1036 } else {
1037 u64 delta = nsec_to_cycles(elapsed);
1038 offset += delta;
1039 pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
1040 }
1041 ns = kvm->arch.last_tsc_nsec;
1042 }
1043 kvm->arch.last_tsc_nsec = ns;
1044 kvm->arch.last_tsc_write = data;
1045 kvm->arch.last_tsc_offset = offset;
1046 kvm_x86_ops->write_tsc_offset(vcpu, offset);
1047 spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
1048
1049 /* Reset of TSC must disable overshoot protection below */
1050 vcpu->arch.hv_clock.tsc_timestamp = 0;
1051 vcpu->arch.last_tsc_write = data;
1052 vcpu->arch.last_tsc_nsec = ns;
1053}
1054EXPORT_SYMBOL_GPL(kvm_write_tsc);
1055
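
The heuristic in kvm_write_tsc reads as a small decision procedure: if this write lands within about 5 seconds (in both wall time and cycles) of the previous one, treat it as the guest synchronizing TSCs across VCPUs and either reuse the previous offset (stable TSC) or advance the fresh offset by the elapsed time. A hedged sketch of just that decision, with toy state and an assumed fixed 2 GHz host TSC:

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC 1000000000ULL
#define TSC_KHZ      2000000ULL		/* assume a fixed 2 GHz host TSC */

static uint64_t nsec_to_cycles(uint64_t nsec)
{
	return nsec * TSC_KHZ / 1000000;	/* cycles = ns * kHz / 1e6 */
}

struct tsc_state { uint64_t last_write, last_ns, last_offset; };

/* Decide the new offset for a guest TSC write of 'data' at time 'ns'. */
static uint64_t decide_offset(struct tsc_state *s, uint64_t data, uint64_t ns,
			      uint64_t fresh_offset, int tsc_unstable)
{
	uint64_t elapsed = ns - s->last_ns;
	uint64_t sdiff = data > s->last_write ? data - s->last_write
					      : s->last_write - data;
	uint64_t offset = fresh_offset;

	if (sdiff < nsec_to_cycles(5 * NSEC_PER_SEC) &&
	    elapsed < 5 * NSEC_PER_SEC)
		offset = tsc_unstable ? fresh_offset + nsec_to_cycles(elapsed)
				      : s->last_offset;	/* match the other VCPU */

	s->last_write = data;
	s->last_ns = ns;
	s->last_offset = offset;
	return offset;
}

int main(void)
{
	struct tsc_state s = { .last_write = 0, .last_ns = 0, .last_offset = 100 };

	/* Second VCPU writes almost the same value 1 ms later: offsets match. */
	printf("offset = %llu\n",
	       (unsigned long long)decide_offset(&s, 5000, 1000000, 42, 0));
	return 0;
}
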
1056static int kvm_guest_time_update(struct kvm_vcpu *v)
899{ 1057{
900 struct timespec ts;
901 unsigned long flags; 1058 unsigned long flags;
902 struct kvm_vcpu_arch *vcpu = &v->arch; 1059 struct kvm_vcpu_arch *vcpu = &v->arch;
903 void *shared_kaddr; 1060 void *shared_kaddr;
904 unsigned long this_tsc_khz; 1061 unsigned long this_tsc_khz;
1062 s64 kernel_ns, max_kernel_ns;
1063 u64 tsc_timestamp;
905 1064
906 if ((!vcpu->time_page)) 1065 /* Keep irq disabled to prevent changes to the clock */
907 return; 1066 local_irq_save(flags);
1067 kvm_get_msr(v, MSR_IA32_TSC, &tsc_timestamp);
1068 kernel_ns = get_kernel_ns();
1069 this_tsc_khz = __get_cpu_var(cpu_tsc_khz);
908 1070
909 this_tsc_khz = get_cpu_var(cpu_tsc_khz); 1071 if (unlikely(this_tsc_khz == 0)) {
910 if (unlikely(vcpu->hv_clock_tsc_khz != this_tsc_khz)) { 1072 local_irq_restore(flags);
911 kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock); 1073 kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
912 vcpu->hv_clock_tsc_khz = this_tsc_khz; 1074 return 1;
1075 }
1076
1077 /*
1078 * We may have to catch up the TSC to match elapsed wall clock
1079 * time for two reasons, even if kvmclock is used.
1080 * 1) CPU could have been running below the maximum TSC rate
1081 * 2) Broken TSC compensation resets the base at each VCPU
1082 * entry to avoid unknown leaps of TSC even when running
1083 * again on the same CPU. This may cause apparent elapsed
1084 * time to disappear, and the guest to stand still or run
1085 * very slowly.
1086 */
1087 if (vcpu->tsc_catchup) {
1088 u64 tsc = compute_guest_tsc(v, kernel_ns);
1089 if (tsc > tsc_timestamp) {
1090 kvm_x86_ops->adjust_tsc_offset(v, tsc - tsc_timestamp);
1091 tsc_timestamp = tsc;
1092 }
913 } 1093 }
914 put_cpu_var(cpu_tsc_khz);
915 1094
916 /* Keep irq disabled to prevent changes to the clock */
917 local_irq_save(flags);
918 kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp);
919 ktime_get_ts(&ts);
920 monotonic_to_bootbased(&ts);
921 local_irq_restore(flags); 1095 local_irq_restore(flags);
922 1096
923 /* With all the info we got, fill in the values */ 1097 if (!vcpu->time_page)
1098 return 0;
924 1099
925 vcpu->hv_clock.system_time = ts.tv_nsec + 1100 /*
926 (NSEC_PER_SEC * (u64)ts.tv_sec) + v->kvm->arch.kvmclock_offset; 1101 * Time as measured by the TSC may go backwards when resetting the base
1102 * tsc_timestamp. The reason for this is that the TSC resolution is
1103 * higher than the resolution of the other clock scales. Thus, many
1104 * possible measurements of the TSC correspond to one measurement of any
1105 * other clock, and so a spread of values is possible. This is not a
1106 * problem for the computation of the nanosecond clock; with TSC rates
1107 * around 1GHz, there can only be a few cycles which correspond to one
1108 * nanosecond value, and any path through this code will inevitably
1109 * take longer than that. However, with the kernel_ns value itself,
1110 * the precision may be much lower, down to HZ granularity. If the
1111 * first sampling of TSC against kernel_ns ends in the low part of the
1112 * range, and the second in the high end of the range, we can get:
1113 *
1114 * (TSC - offset_low) * S + kns_old > (TSC - offset_high) * S + kns_new
1115 *
1116 * As the sampling errors potentially range in the thousands of cycles,
1117 * it is possible such a time value has already been observed by the
1118 * guest. To protect against this, we must compute the system time as
1119 * observed by the guest and ensure the new system time is greater.
1120 */
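
A concrete instance of the hazard described above, with illustrative numbers: if the guest last derived its clock from last_kernel_ns = 10,000,000 ns and has since observed 3,000,000 TSC cycles at 3 GHz, it may already have seen 11,000,000 ns; a fresh kernel_ns sample of 10,999,000 ns would therefore step kvmclock backwards unless it is clamped to the guest-derived maximum. A tiny sketch of the clamp:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* Illustrative numbers: 3 GHz TSC, values in ns and cycles. */
	uint64_t last_kernel_ns = 10000000;
	uint64_t guest_tsc_delta = 3000000;		/* cycles since tsc_timestamp */
	uint64_t max_kernel_ns = last_kernel_ns + guest_tsc_delta / 3;
	uint64_t kernel_ns = 10999000;			/* fresh, slightly "early" sample */

	if (max_kernel_ns > kernel_ns)
		kernel_ns = max_kernel_ns;		/* never publish a smaller time */
	printf("published system_time base: %llu ns\n", (unsigned long long)kernel_ns);
	return 0;
}
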
1121 max_kernel_ns = 0;
1122 if (vcpu->hv_clock.tsc_timestamp && vcpu->last_guest_tsc) {
1123 max_kernel_ns = vcpu->last_guest_tsc -
1124 vcpu->hv_clock.tsc_timestamp;
1125 max_kernel_ns = pvclock_scale_delta(max_kernel_ns,
1126 vcpu->hv_clock.tsc_to_system_mul,
1127 vcpu->hv_clock.tsc_shift);
1128 max_kernel_ns += vcpu->last_kernel_ns;
1129 }
927 1130
1131 if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) {
1132 kvm_get_time_scale(NSEC_PER_SEC / 1000, this_tsc_khz,
1133 &vcpu->hv_clock.tsc_shift,
1134 &vcpu->hv_clock.tsc_to_system_mul);
1135 vcpu->hw_tsc_khz = this_tsc_khz;
1136 }
1137
1138 if (max_kernel_ns > kernel_ns)
1139 kernel_ns = max_kernel_ns;
1140
1141 /* With all the info we got, fill in the values */
1142 vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
1143 vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
1144 vcpu->last_kernel_ns = kernel_ns;
1145 vcpu->last_guest_tsc = tsc_timestamp;
928 vcpu->hv_clock.flags = 0; 1146 vcpu->hv_clock.flags = 0;
929 1147
930 /* 1148 /*
@@ -942,16 +1160,7 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
942 kunmap_atomic(shared_kaddr, KM_USER0); 1160 kunmap_atomic(shared_kaddr, KM_USER0);
943 1161
944 mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT); 1162 mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
945} 1163 return 0;
946
947static int kvm_request_guest_time_update(struct kvm_vcpu *v)
948{
949 struct kvm_vcpu_arch *vcpu = &v->arch;
950
951 if (!vcpu->time_page)
952 return 0;
953 kvm_make_request(KVM_REQ_KVMCLOCK_UPDATE, v);
954 return 1;
955} 1164}
956 1165
957static bool msr_mtrr_valid(unsigned msr) 1166static bool msr_mtrr_valid(unsigned msr)
@@ -1277,6 +1486,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1277 } 1486 }
1278 1487
1279 vcpu->arch.time = data; 1488 vcpu->arch.time = data;
1489 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
1280 1490
1281 /* we verify if the enable bit is set... */ 1491 /* we verify if the enable bit is set... */
1282 if (!(data & 1)) 1492 if (!(data & 1))
@@ -1292,8 +1502,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1292 kvm_release_page_clean(vcpu->arch.time_page); 1502 kvm_release_page_clean(vcpu->arch.time_page);
1293 vcpu->arch.time_page = NULL; 1503 vcpu->arch.time_page = NULL;
1294 } 1504 }
1295
1296 kvm_request_guest_time_update(vcpu);
1297 break; 1505 break;
1298 } 1506 }
1299 case MSR_IA32_MCG_CTL: 1507 case MSR_IA32_MCG_CTL:
@@ -1330,6 +1538,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1330 pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " 1538 pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
1331 "0x%x data 0x%llx\n", msr, data); 1539 "0x%x data 0x%llx\n", msr, data);
1332 break; 1540 break;
1541 case MSR_K7_CLK_CTL:
1542 /*
1543 * Ignore all writes to this no longer documented MSR.
1544 * Writes are only relevant for old K7 processors,
1545 * all pre-dating SVM, but are a recommended workaround
1546 * from AMD for these chips. It is possible to specify the
1547 * affected processor models on the command line, hence
1548 * the need to ignore the workaround.
1549 */
1550 break;
1333 case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: 1551 case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
1334 if (kvm_hv_msr_partition_wide(msr)) { 1552 if (kvm_hv_msr_partition_wide(msr)) {
1335 int r; 1553 int r;
@@ -1522,6 +1740,20 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1522 case 0xcd: /* fsb frequency */ 1740 case 0xcd: /* fsb frequency */
1523 data = 3; 1741 data = 3;
1524 break; 1742 break;
1743 /*
1744 * MSR_EBC_FREQUENCY_ID
1745 * Conservative value valid for even the basic CPU models.
1746 * Models 0,1: 000 in bits 23:21 indicating a bus speed of
1747 * 100MHz, model 2 000 in bits 18:16 indicating 100MHz,
1748 * and 266MHz for model 3, or 4. Set Core Clock
1749 * Frequency to System Bus Frequency Ratio to 1 (bits
1750 * 31:24) even though these are only valid for CPU
1751 * models > 2, however guests may end up dividing or
1752 * multiplying by zero otherwise.
1753 */
1754 case MSR_EBC_FREQUENCY_ID:
1755 data = 1 << 24;
1756 break;
1525 case MSR_IA32_APICBASE: 1757 case MSR_IA32_APICBASE:
1526 data = kvm_get_apic_base(vcpu); 1758 data = kvm_get_apic_base(vcpu);
1527 break; 1759 break;
@@ -1555,6 +1787,18 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1555 case MSR_IA32_MCG_STATUS: 1787 case MSR_IA32_MCG_STATUS:
1556 case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: 1788 case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
1557 return get_msr_mce(vcpu, msr, pdata); 1789 return get_msr_mce(vcpu, msr, pdata);
1790 case MSR_K7_CLK_CTL:
1791 /*
1792 * Provide expected ramp-up count for K7. All other
1793 * are set to zero, indicating minimum divisors for
1794 * every field.
1795 *
1796 * This prevents guest kernels on AMD host with CPU
1797 * type 6, model 8 and higher from exploding due to
1798 * the rdmsr failing.
1799 */
1800 data = 0x20000000;
1801 break;
1558 case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: 1802 case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
1559 if (kvm_hv_msr_partition_wide(msr)) { 1803 if (kvm_hv_msr_partition_wide(msr)) {
1560 int r; 1804 int r;
@@ -1808,19 +2052,28 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1808 } 2052 }
1809 2053
1810 kvm_x86_ops->vcpu_load(vcpu, cpu); 2054 kvm_x86_ops->vcpu_load(vcpu, cpu);
1811 if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) { 2055 if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) {
1812 unsigned long khz = cpufreq_quick_get(cpu); 2056 /* Make sure TSC doesn't go backwards */
1813 if (!khz) 2057 s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
1814 khz = tsc_khz; 2058 native_read_tsc() - vcpu->arch.last_host_tsc;
1815 per_cpu(cpu_tsc_khz, cpu) = khz; 2059 if (tsc_delta < 0)
2060 mark_tsc_unstable("KVM discovered backwards TSC");
2061 if (check_tsc_unstable()) {
2062 kvm_x86_ops->adjust_tsc_offset(vcpu, -tsc_delta);
2063 vcpu->arch.tsc_catchup = 1;
2064 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
2065 }
2066 if (vcpu->cpu != cpu)
2067 kvm_migrate_timers(vcpu);
2068 vcpu->cpu = cpu;
1816 } 2069 }
1817 kvm_request_guest_time_update(vcpu);
1818} 2070}
1819 2071
1820void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) 2072void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
1821{ 2073{
1822 kvm_x86_ops->vcpu_put(vcpu); 2074 kvm_x86_ops->vcpu_put(vcpu);
1823 kvm_put_guest_fpu(vcpu); 2075 kvm_put_guest_fpu(vcpu);
2076 vcpu->arch.last_host_tsc = native_read_tsc();
1824} 2077}
1825 2078
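
The vcpu_load/vcpu_put pair above brackets the time a VCPU spends off any CPU: last_host_tsc is sampled on put, and on the next load the delta to the new CPU's TSC shows whether the guest clock would appear to jump backwards, in which case the offset is compensated and the VCPU is switched to catchup mode. A hedged sketch of that check (toy state, illustrative only):

#include <stdio.h>
#include <stdint.h>

struct toy_vcpu {
	uint64_t last_host_tsc;	/* sampled in vcpu_put */
	int64_t  tsc_offset;	/* what the guest sees added to the host TSC */
	int      tsc_catchup;
};

/* On vcpu_load after migrating CPUs: never let the guest TSC go backwards. */
static void on_load(struct toy_vcpu *v, uint64_t new_host_tsc, int tsc_unstable)
{
	int64_t delta = v->last_host_tsc ?
			(int64_t)(new_host_tsc - v->last_host_tsc) : 0;

	if (tsc_unstable) {
		v->tsc_offset -= delta;		/* hide the jump from the guest */
		v->tsc_catchup = 1;		/* let kvmclock catch it up later */
	}
}

int main(void)
{
	struct toy_vcpu v = { .last_host_tsc = 2000, .tsc_offset = 0 };

	on_load(&v, 1500, 1);	/* new CPU's TSC is behind the old one */
	printf("offset %lld, catchup %d\n", (long long)v.tsc_offset, v.tsc_catchup);
	return 0;
}
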
1826static int is_efer_nx(void) 2079static int is_efer_nx(void)
@@ -1995,7 +2248,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1995 F(F16C); 2248 F(F16C);
1996 /* cpuid 0x80000001.ecx */ 2249 /* cpuid 0x80000001.ecx */
1997 const u32 kvm_supported_word6_x86_features = 2250 const u32 kvm_supported_word6_x86_features =
1998 F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ | 2251 F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
1999 F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | 2252 F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
2000 F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) | 2253 F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) |
2001 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM); 2254 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM);
@@ -2204,6 +2457,7 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
2204 return -ENXIO; 2457 return -ENXIO;
2205 2458
2206 kvm_queue_interrupt(vcpu, irq->irq, false); 2459 kvm_queue_interrupt(vcpu, irq->irq, false);
2460 kvm_make_request(KVM_REQ_EVENT, vcpu);
2207 2461
2208 return 0; 2462 return 0;
2209} 2463}
@@ -2357,6 +2611,8 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
2357 if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR) 2611 if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR)
2358 vcpu->arch.sipi_vector = events->sipi_vector; 2612 vcpu->arch.sipi_vector = events->sipi_vector;
2359 2613
2614 kvm_make_request(KVM_REQ_EVENT, vcpu);
2615
2360 return 0; 2616 return 0;
2361} 2617}
2362 2618
@@ -2760,7 +3016,7 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
2760 3016
2761static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) 3017static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
2762{ 3018{
2763 return kvm->arch.n_alloc_mmu_pages; 3019 return kvm->arch.n_max_mmu_pages;
2764} 3020}
2765 3021
2766static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) 3022static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
@@ -2796,18 +3052,18 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
2796 r = 0; 3052 r = 0;
2797 switch (chip->chip_id) { 3053 switch (chip->chip_id) {
2798 case KVM_IRQCHIP_PIC_MASTER: 3054 case KVM_IRQCHIP_PIC_MASTER:
2799 raw_spin_lock(&pic_irqchip(kvm)->lock); 3055 spin_lock(&pic_irqchip(kvm)->lock);
2800 memcpy(&pic_irqchip(kvm)->pics[0], 3056 memcpy(&pic_irqchip(kvm)->pics[0],
2801 &chip->chip.pic, 3057 &chip->chip.pic,
2802 sizeof(struct kvm_pic_state)); 3058 sizeof(struct kvm_pic_state));
2803 raw_spin_unlock(&pic_irqchip(kvm)->lock); 3059 spin_unlock(&pic_irqchip(kvm)->lock);
2804 break; 3060 break;
2805 case KVM_IRQCHIP_PIC_SLAVE: 3061 case KVM_IRQCHIP_PIC_SLAVE:
2806 raw_spin_lock(&pic_irqchip(kvm)->lock); 3062 spin_lock(&pic_irqchip(kvm)->lock);
2807 memcpy(&pic_irqchip(kvm)->pics[1], 3063 memcpy(&pic_irqchip(kvm)->pics[1],
2808 &chip->chip.pic, 3064 &chip->chip.pic,
2809 sizeof(struct kvm_pic_state)); 3065 sizeof(struct kvm_pic_state));
2810 raw_spin_unlock(&pic_irqchip(kvm)->lock); 3066 spin_unlock(&pic_irqchip(kvm)->lock);
2811 break; 3067 break;
2812 case KVM_IRQCHIP_IOAPIC: 3068 case KVM_IRQCHIP_IOAPIC:
2813 r = kvm_set_ioapic(kvm, &chip->chip.ioapic); 3069 r = kvm_set_ioapic(kvm, &chip->chip.ioapic);
@@ -3201,7 +3457,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
3201 break; 3457 break;
3202 } 3458 }
3203 case KVM_SET_CLOCK: { 3459 case KVM_SET_CLOCK: {
3204 struct timespec now;
3205 struct kvm_clock_data user_ns; 3460 struct kvm_clock_data user_ns;
3206 u64 now_ns; 3461 u64 now_ns;
3207 s64 delta; 3462 s64 delta;
@@ -3215,20 +3470,21 @@ long kvm_arch_vm_ioctl(struct file *filp,
3215 goto out; 3470 goto out;
3216 3471
3217 r = 0; 3472 r = 0;
3218 ktime_get_ts(&now); 3473 local_irq_disable();
3219 now_ns = timespec_to_ns(&now); 3474 now_ns = get_kernel_ns();
3220 delta = user_ns.clock - now_ns; 3475 delta = user_ns.clock - now_ns;
3476 local_irq_enable();
3221 kvm->arch.kvmclock_offset = delta; 3477 kvm->arch.kvmclock_offset = delta;
3222 break; 3478 break;
3223 } 3479 }
3224 case KVM_GET_CLOCK: { 3480 case KVM_GET_CLOCK: {
3225 struct timespec now;
3226 struct kvm_clock_data user_ns; 3481 struct kvm_clock_data user_ns;
3227 u64 now_ns; 3482 u64 now_ns;
3228 3483
3229 ktime_get_ts(&now); 3484 local_irq_disable();
3230 now_ns = timespec_to_ns(&now); 3485 now_ns = get_kernel_ns();
3231 user_ns.clock = kvm->arch.kvmclock_offset + now_ns; 3486 user_ns.clock = kvm->arch.kvmclock_offset + now_ns;
3487 local_irq_enable();
3232 user_ns.flags = 0; 3488 user_ns.flags = 0;
3233 3489
3234 r = -EFAULT; 3490 r = -EFAULT;
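KVM_SET_CLOCK and KVM_GET_CLOCK now sample the host clock with get_kernel_ns() under local_irq_disable() and keep only the signed offset between the requested guest clock and the host clock; GET simply re-applies that offset. A tiny sketch of the bookkeeping (hypothetical names, outside the kernel):

#include <stdint.h>
#include <stdio.h>

static int64_t kvmclock_offset;

/* KVM_SET_CLOCK: remember how far the requested clock is from the host's. */
static void set_clock(uint64_t requested_ns, uint64_t host_now_ns)
{
	kvmclock_offset = (int64_t)(requested_ns - host_now_ns);
}

/* KVM_GET_CLOCK: report the host clock shifted by the stored offset. */
static uint64_t get_clock(uint64_t host_now_ns)
{
	return host_now_ns + kvmclock_offset;
}

int main(void)
{
	set_clock(5000, 12000);   /* guest clock placed 7000 ns behind the host */
	printf("%llu\n", (unsigned long long)get_clock(13000));  /* prints 6000 */
	return 0;
}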
@@ -3292,30 +3548,51 @@ void kvm_get_segment(struct kvm_vcpu *vcpu,
3292 kvm_x86_ops->get_segment(vcpu, var, seg); 3548 kvm_x86_ops->get_segment(vcpu, var, seg);
3293} 3549}
3294 3550
3551static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
3552{
3553 return gpa;
3554}
3555
3556static gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
3557{
3558 gpa_t t_gpa;
3559 u32 error;
3560
3561 BUG_ON(!mmu_is_nested(vcpu));
3562
3563 /* NPT walks are always user-walks */
3564 access |= PFERR_USER_MASK;
3565 t_gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, &error);
3566 if (t_gpa == UNMAPPED_GVA)
3567 vcpu->arch.fault.nested = true;
3568
3569 return t_gpa;
3570}
3571
3295gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) 3572gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
3296{ 3573{
3297 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; 3574 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3298 return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); 3575 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error);
3299} 3576}
3300 3577
3301 gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) 3578 gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
3302{ 3579{
3303 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; 3580 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3304 access |= PFERR_FETCH_MASK; 3581 access |= PFERR_FETCH_MASK;
3305 return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); 3582 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error);
3306} 3583}
3307 3584
3308gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) 3585gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
3309{ 3586{
3310 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; 3587 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3311 access |= PFERR_WRITE_MASK; 3588 access |= PFERR_WRITE_MASK;
3312 return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); 3589 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error);
3313} 3590}
3314 3591
3315/* uses this to access any guest's mapped memory without checking CPL */ 3592/* uses this to access any guest's mapped memory without checking CPL */
3316gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) 3593gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
3317{ 3594{
3318 return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, 0, error); 3595 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, error);
3319} 3596}
3320 3597
3321static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, 3598static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
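The translation helpers above now call through vcpu->arch.walk_mmu instead of vcpu->arch.mmu, so the same code path serves both the ordinary case (walk_mmu points at the direct MMU) and nested paging, where translate_nested_gpa() forces PFERR_USER_MASK because NPT walks are always user walks and flags a nested fault on UNMAPPED_GVA. A small sketch of that function-pointer indirection, with made-up types and a pretend second-level translation:

#include <stdint.h>
#include <stdio.h>

typedef uint64_t gpa_t;
typedef uint64_t gva_t;
#define PFERR_USER_MASK (1u << 2)

/* Hypothetical stand-in for the kvm_mmu context selected as walk_mmu. */
struct mmu_model {
	gpa_t (*gva_to_gpa)(gva_t gva, uint32_t access);
};

static gpa_t direct_gva_to_gpa(gva_t gva, uint32_t access)
{
	(void)access;
	return (gpa_t)gva;            /* identity walk, for the sketch only */
}

static gpa_t nested_gva_to_gpa(gva_t gva, uint32_t access)
{
	access |= PFERR_USER_MASK;    /* nested (NPT) walks are user walks */
	(void)access;
	return (gpa_t)gva + 0x1000;   /* pretend L1->L0 adds a fixed offset */
}

/* Mirrors kvm_mmu_gva_to_gpa_read(): CPL 3 adds the user access bit. */
static gpa_t read_translate(struct mmu_model *walk_mmu, gva_t gva, int cpl)
{
	uint32_t access = (cpl == 3) ? PFERR_USER_MASK : 0;
	return walk_mmu->gva_to_gpa(gva, access);
}

int main(void)
{
	struct mmu_model direct = { direct_gva_to_gpa };
	struct mmu_model nested = { nested_gva_to_gpa };

	printf("direct: 0x%llx\n",
	       (unsigned long long)read_translate(&direct, 0x2000, 3));
	printf("nested: 0x%llx\n",
	       (unsigned long long)read_translate(&nested, 0x2000, 0));
	return 0;
}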
@@ -3326,7 +3603,8 @@ static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
3326 int r = X86EMUL_CONTINUE; 3603 int r = X86EMUL_CONTINUE;
3327 3604
3328 while (bytes) { 3605 while (bytes) {
3329 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr, access, error); 3606 gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access,
3607 error);
3330 unsigned offset = addr & (PAGE_SIZE-1); 3608 unsigned offset = addr & (PAGE_SIZE-1);
3331 unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset); 3609 unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
3332 int ret; 3610 int ret;
@@ -3381,8 +3659,9 @@ static int kvm_write_guest_virt_system(gva_t addr, void *val,
3381 int r = X86EMUL_CONTINUE; 3659 int r = X86EMUL_CONTINUE;
3382 3660
3383 while (bytes) { 3661 while (bytes) {
3384 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr, 3662 gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr,
3385 PFERR_WRITE_MASK, error); 3663 PFERR_WRITE_MASK,
3664 error);
3386 unsigned offset = addr & (PAGE_SIZE-1); 3665 unsigned offset = addr & (PAGE_SIZE-1);
3387 unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); 3666 unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
3388 int ret; 3667 int ret;
@@ -3624,7 +3903,7 @@ static int emulator_pio_in_emulated(int size, unsigned short port, void *val,
3624 if (vcpu->arch.pio.count) 3903 if (vcpu->arch.pio.count)
3625 goto data_avail; 3904 goto data_avail;
3626 3905
3627 trace_kvm_pio(1, port, size, 1); 3906 trace_kvm_pio(0, port, size, 1);
3628 3907
3629 vcpu->arch.pio.port = port; 3908 vcpu->arch.pio.port = port;
3630 vcpu->arch.pio.in = 1; 3909 vcpu->arch.pio.in = 1;
@@ -3652,7 +3931,7 @@ static int emulator_pio_out_emulated(int size, unsigned short port,
3652 const void *val, unsigned int count, 3931 const void *val, unsigned int count,
3653 struct kvm_vcpu *vcpu) 3932 struct kvm_vcpu *vcpu)
3654{ 3933{
3655 trace_kvm_pio(0, port, size, 1); 3934 trace_kvm_pio(1, port, size, 1);
3656 3935
3657 vcpu->arch.pio.port = port; 3936 vcpu->arch.pio.port = port;
3658 vcpu->arch.pio.in = 0; 3937 vcpu->arch.pio.in = 0;
@@ -3791,6 +4070,11 @@ static void emulator_get_gdt(struct desc_ptr *dt, struct kvm_vcpu *vcpu)
3791 kvm_x86_ops->get_gdt(vcpu, dt); 4070 kvm_x86_ops->get_gdt(vcpu, dt);
3792} 4071}
3793 4072
4073static void emulator_get_idt(struct desc_ptr *dt, struct kvm_vcpu *vcpu)
4074{
4075 kvm_x86_ops->get_idt(vcpu, dt);
4076}
4077
3794static unsigned long emulator_get_cached_segment_base(int seg, 4078static unsigned long emulator_get_cached_segment_base(int seg,
3795 struct kvm_vcpu *vcpu) 4079 struct kvm_vcpu *vcpu)
3796{ 4080{
@@ -3884,6 +4168,7 @@ static struct x86_emulate_ops emulate_ops = {
3884 .set_segment_selector = emulator_set_segment_selector, 4168 .set_segment_selector = emulator_set_segment_selector,
3885 .get_cached_segment_base = emulator_get_cached_segment_base, 4169 .get_cached_segment_base = emulator_get_cached_segment_base,
3886 .get_gdt = emulator_get_gdt, 4170 .get_gdt = emulator_get_gdt,
4171 .get_idt = emulator_get_idt,
3887 .get_cr = emulator_get_cr, 4172 .get_cr = emulator_get_cr,
3888 .set_cr = emulator_set_cr, 4173 .set_cr = emulator_set_cr,
3889 .cpl = emulator_get_cpl, 4174 .cpl = emulator_get_cpl,
@@ -3919,13 +4204,64 @@ static void inject_emulated_exception(struct kvm_vcpu *vcpu)
3919{ 4204{
3920 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; 4205 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
3921 if (ctxt->exception == PF_VECTOR) 4206 if (ctxt->exception == PF_VECTOR)
3922 kvm_inject_page_fault(vcpu, ctxt->cr2, ctxt->error_code); 4207 kvm_propagate_fault(vcpu);
3923 else if (ctxt->error_code_valid) 4208 else if (ctxt->error_code_valid)
3924 kvm_queue_exception_e(vcpu, ctxt->exception, ctxt->error_code); 4209 kvm_queue_exception_e(vcpu, ctxt->exception, ctxt->error_code);
3925 else 4210 else
3926 kvm_queue_exception(vcpu, ctxt->exception); 4211 kvm_queue_exception(vcpu, ctxt->exception);
3927} 4212}
3928 4213
4214static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
4215{
4216 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
4217 int cs_db, cs_l;
4218
4219 cache_all_regs(vcpu);
4220
4221 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
4222
4223 vcpu->arch.emulate_ctxt.vcpu = vcpu;
4224 vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
4225 vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu);
4226 vcpu->arch.emulate_ctxt.mode =
4227 (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
4228 (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
4229 ? X86EMUL_MODE_VM86 : cs_l
4230 ? X86EMUL_MODE_PROT64 : cs_db
4231 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
4232 memset(c, 0, sizeof(struct decode_cache));
4233 memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
4234}
4235
4236int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq)
4237{
4238 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
4239 int ret;
4240
4241 init_emulate_ctxt(vcpu);
4242
4243 vcpu->arch.emulate_ctxt.decode.op_bytes = 2;
4244 vcpu->arch.emulate_ctxt.decode.ad_bytes = 2;
4245 vcpu->arch.emulate_ctxt.decode.eip = vcpu->arch.emulate_ctxt.eip;
4246 ret = emulate_int_real(&vcpu->arch.emulate_ctxt, &emulate_ops, irq);
4247
4248 if (ret != X86EMUL_CONTINUE)
4249 return EMULATE_FAIL;
4250
4251 vcpu->arch.emulate_ctxt.eip = c->eip;
4252 memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
4253 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
4254 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
4255
4256 if (irq == NMI_VECTOR)
4257 vcpu->arch.nmi_pending = false;
4258 else
4259 vcpu->arch.interrupt.pending = false;
4260
4261 return EMULATE_DONE;
4262}
4263EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt);
4264
3929static int handle_emulation_failure(struct kvm_vcpu *vcpu) 4265static int handle_emulation_failure(struct kvm_vcpu *vcpu)
3930{ 4266{
3931 ++vcpu->stat.insn_emulation_fail; 4267 ++vcpu->stat.insn_emulation_fail;
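init_emulate_ctxt() above centralizes the emulator setup that emulate_instruction() and kvm_task_switch() used to duplicate, and kvm_inject_realmode_interrupt() reuses it to drive emulate_int_real() with 2-byte operand and address sizes. Its nested conditional that selects the emulation mode reads more easily as a chain of tests; a sketch of the equivalent logic (the enum names are illustrative):

#include <stdbool.h>
#include <stdio.h>

enum emul_mode {
	MODE_REAL, MODE_VM86, MODE_PROT16, MODE_PROT32, MODE_PROT64,
};

/* Equivalent to the nested ?: chain in init_emulate_ctxt() above. */
static enum emul_mode pick_mode(bool protmode, bool eflags_vm,
				bool cs_l, bool cs_db)
{
	if (!protmode)
		return MODE_REAL;     /* CR0.PE clear: real mode */
	if (eflags_vm)
		return MODE_VM86;     /* EFLAGS.VM set: virtual-8086 mode */
	if (cs_l)
		return MODE_PROT64;   /* CS.L set: 64-bit code segment */
	if (cs_db)
		return MODE_PROT32;   /* CS.D/B set: 32-bit default size */
	return MODE_PROT16;           /* otherwise 16-bit protected mode */
}

int main(void)
{
	printf("%d\n", pick_mode(true, false, true, false));  /* MODE_PROT64 */
	return 0;
}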
@@ -3982,24 +4318,15 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
3982 cache_all_regs(vcpu); 4318 cache_all_regs(vcpu);
3983 4319
3984 if (!(emulation_type & EMULTYPE_NO_DECODE)) { 4320 if (!(emulation_type & EMULTYPE_NO_DECODE)) {
3985 int cs_db, cs_l; 4321 init_emulate_ctxt(vcpu);
3986 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
3987
3988 vcpu->arch.emulate_ctxt.vcpu = vcpu;
3989 vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
3990 vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu);
3991 vcpu->arch.emulate_ctxt.mode =
3992 (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
3993 (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
3994 ? X86EMUL_MODE_VM86 : cs_l
3995 ? X86EMUL_MODE_PROT64 : cs_db
3996 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
3997 memset(c, 0, sizeof(struct decode_cache));
3998 memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
3999 vcpu->arch.emulate_ctxt.interruptibility = 0; 4322 vcpu->arch.emulate_ctxt.interruptibility = 0;
4000 vcpu->arch.emulate_ctxt.exception = -1; 4323 vcpu->arch.emulate_ctxt.exception = -1;
4324 vcpu->arch.emulate_ctxt.perm_ok = false;
4325
4326 r = x86_decode_insn(&vcpu->arch.emulate_ctxt);
4327 if (r == X86EMUL_PROPAGATE_FAULT)
4328 goto done;
4001 4329
4002 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
4003 trace_kvm_emulate_insn_start(vcpu); 4330 trace_kvm_emulate_insn_start(vcpu);
4004 4331
4005 /* Only allow emulation of specific instructions on #UD 4332 /* Only allow emulation of specific instructions on #UD
@@ -4049,41 +4376,39 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
4049 memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); 4376 memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
4050 4377
4051restart: 4378restart:
4052 r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); 4379 r = x86_emulate_insn(&vcpu->arch.emulate_ctxt);
4053 4380
4054 if (r) { /* emulation failed */ 4381 if (r == EMULATION_FAILED) {
4055 if (reexecute_instruction(vcpu, cr2)) 4382 if (reexecute_instruction(vcpu, cr2))
4056 return EMULATE_DONE; 4383 return EMULATE_DONE;
4057 4384
4058 return handle_emulation_failure(vcpu); 4385 return handle_emulation_failure(vcpu);
4059 } 4386 }
4060 4387
4061 toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility); 4388done:
4062 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
4063 memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
4064 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
4065
4066 if (vcpu->arch.emulate_ctxt.exception >= 0) { 4389 if (vcpu->arch.emulate_ctxt.exception >= 0) {
4067 inject_emulated_exception(vcpu); 4390 inject_emulated_exception(vcpu);
4068 return EMULATE_DONE; 4391 r = EMULATE_DONE;
4069 } 4392 } else if (vcpu->arch.pio.count) {
4070
4071 if (vcpu->arch.pio.count) {
4072 if (!vcpu->arch.pio.in) 4393 if (!vcpu->arch.pio.in)
4073 vcpu->arch.pio.count = 0; 4394 vcpu->arch.pio.count = 0;
4074 return EMULATE_DO_MMIO; 4395 r = EMULATE_DO_MMIO;
4075 } 4396 } else if (vcpu->mmio_needed) {
4076
4077 if (vcpu->mmio_needed) {
4078 if (vcpu->mmio_is_write) 4397 if (vcpu->mmio_is_write)
4079 vcpu->mmio_needed = 0; 4398 vcpu->mmio_needed = 0;
4080 return EMULATE_DO_MMIO; 4399 r = EMULATE_DO_MMIO;
4081 } 4400 } else if (r == EMULATION_RESTART)
4082
4083 if (vcpu->arch.emulate_ctxt.restart)
4084 goto restart; 4401 goto restart;
4402 else
4403 r = EMULATE_DONE;
4085 4404
4086 return EMULATE_DONE; 4405 toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility);
4406 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
4407 kvm_make_request(KVM_REQ_EVENT, vcpu);
4408 memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
4409 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
4410
4411 return r;
4087} 4412}
4088EXPORT_SYMBOL_GPL(emulate_instruction); 4413EXPORT_SYMBOL_GPL(emulate_instruction);
4089 4414
@@ -4097,9 +4422,23 @@ int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port)
4097} 4422}
4098EXPORT_SYMBOL_GPL(kvm_fast_pio_out); 4423EXPORT_SYMBOL_GPL(kvm_fast_pio_out);
4099 4424
4100static void bounce_off(void *info) 4425static void tsc_bad(void *info)
4426{
4427 __get_cpu_var(cpu_tsc_khz) = 0;
4428}
4429
4430static void tsc_khz_changed(void *data)
4101{ 4431{
4102 /* nothing */ 4432 struct cpufreq_freqs *freq = data;
4433 unsigned long khz = 0;
4434
4435 if (data)
4436 khz = freq->new;
4437 else if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
4438 khz = cpufreq_quick_get(raw_smp_processor_id());
4439 if (!khz)
4440 khz = tsc_khz;
4441 __get_cpu_var(cpu_tsc_khz) = khz;
4103} 4442}
4104 4443
4105static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val, 4444static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
@@ -4110,21 +4449,60 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
4110 struct kvm_vcpu *vcpu; 4449 struct kvm_vcpu *vcpu;
4111 int i, send_ipi = 0; 4450 int i, send_ipi = 0;
4112 4451
4452 /*
4453 * We allow guests to temporarily run on slowing clocks,
4454 * provided we notify them after, or to run on accelerating
4455 * clocks, provided we notify them before. Thus time never
4456 * goes backwards.
4457 *
4458 * However, we have a problem. We can't atomically update
4459 * the frequency of a given CPU from this function; it is
4460 * merely a notifier, which can be called from any CPU.
4461 * Changing the TSC frequency at arbitrary points in time
4462 * requires a recomputation of local variables related to
4463 * the TSC for each VCPU. We must flag these local variables
4464 * to be updated and be sure the update takes place with the
4465 * new frequency before any guests proceed.
4466 *
4467 * Unfortunately, the combination of hotplug CPU and frequency
4468 * change creates an intractable locking scenario; the order
4469 * of when these callouts happen is undefined with respect to
4470 * CPU hotplug, and they can race with each other. As such,
4471 * merely setting per_cpu(cpu_tsc_khz) = X during a hotadd is
4472 * undefined; you can actually have a CPU frequency change take
4473 * place in between the computation of X and the setting of the
4474 * variable. To protect against this problem, all updates of
4475 * the per_cpu tsc_khz variable are done in an interrupt
4476 * protected IPI, and all callers wishing to update the value
4477 * must wait for a synchronous IPI to complete (which is trivial
4478 * if the caller is on the CPU already). This establishes the
4479 * necessary total order on variable updates.
4480 *
4481 * Note that because a guest time update may take place
4482 * anytime after the setting of the VCPU's request bit, the
4483 * correct TSC value must be set before the request. However,
4484 * to ensure the update actually makes it to any guest which
4485 * starts running in hardware virtualization between the set
4486 * and the acquisition of the spinlock, we must also ping the
4487 * CPU after setting the request bit.
4488 *
4489 */
4490
4113 if (val == CPUFREQ_PRECHANGE && freq->old > freq->new) 4491 if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
4114 return 0; 4492 return 0;
4115 if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new) 4493 if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
4116 return 0; 4494 return 0;
4117 per_cpu(cpu_tsc_khz, freq->cpu) = freq->new; 4495
4496 smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
4118 4497
4119 spin_lock(&kvm_lock); 4498 spin_lock(&kvm_lock);
4120 list_for_each_entry(kvm, &vm_list, vm_list) { 4499 list_for_each_entry(kvm, &vm_list, vm_list) {
4121 kvm_for_each_vcpu(i, vcpu, kvm) { 4500 kvm_for_each_vcpu(i, vcpu, kvm) {
4122 if (vcpu->cpu != freq->cpu) 4501 if (vcpu->cpu != freq->cpu)
4123 continue; 4502 continue;
4124 if (!kvm_request_guest_time_update(vcpu)) 4503 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
4125 continue;
4126 if (vcpu->cpu != smp_processor_id()) 4504 if (vcpu->cpu != smp_processor_id())
4127 send_ipi++; 4505 send_ipi = 1;
4128 } 4506 }
4129 } 4507 }
4130 spin_unlock(&kvm_lock); 4508 spin_unlock(&kvm_lock);
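The comment added in this hunk states the invariant the notifier preserves: a guest may keep running across a slowdown as long as it is notified afterwards, and must be notified before a speedup, so observed time never moves backwards; the per-CPU cpu_tsc_khz value is only ever written from an IPI on the CPU it belongs to (tsc_khz_changed), which serializes updates against hotplug and concurrent frequency changes. The two early returns encode exactly that ordering; a compact restatement in plain C, separate from the kernel code:

#include <stdbool.h>
#include <stdio.h>

enum phase { PRECHANGE, POSTCHANGE };

/*
 * Return true when the notifier should act for this (phase, old, new)
 * combination, mirroring the two early-return guards in the hunk above:
 * slowdowns are handled after the change, speedups before it.
 */
static bool should_update(enum phase p, unsigned long old_khz,
			  unsigned long new_khz)
{
	if (p == PRECHANGE && old_khz > new_khz)
		return false;   /* slowing down: wait for POSTCHANGE */
	if (p == POSTCHANGE && old_khz < new_khz)
		return false;   /* speeding up: already handled at PRECHANGE */
	return true;
}

int main(void)
{
	printf("%d\n", should_update(PRECHANGE, 3000000, 2000000));   /* 0 */
	printf("%d\n", should_update(POSTCHANGE, 3000000, 2000000));  /* 1 */
	printf("%d\n", should_update(PRECHANGE, 2000000, 3000000));   /* 1 */
	printf("%d\n", should_update(POSTCHANGE, 2000000, 3000000));  /* 0 */
	return 0;
}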
@@ -4142,32 +4520,57 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
4142 * guest context is entered kvmclock will be updated, 4520 * guest context is entered kvmclock will be updated,
4143 * so the guest will not see stale values. 4521 * so the guest will not see stale values.
4144 */ 4522 */
4145 smp_call_function_single(freq->cpu, bounce_off, NULL, 1); 4523 smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
4146 } 4524 }
4147 return 0; 4525 return 0;
4148} 4526}
4149 4527
4150static struct notifier_block kvmclock_cpufreq_notifier_block = { 4528static struct notifier_block kvmclock_cpufreq_notifier_block = {
4151 .notifier_call = kvmclock_cpufreq_notifier 4529 .notifier_call = kvmclock_cpufreq_notifier
4530};
4531
4532static int kvmclock_cpu_notifier(struct notifier_block *nfb,
4533 unsigned long action, void *hcpu)
4534{
4535 unsigned int cpu = (unsigned long)hcpu;
4536
4537 switch (action) {
4538 case CPU_ONLINE:
4539 case CPU_DOWN_FAILED:
4540 smp_call_function_single(cpu, tsc_khz_changed, NULL, 1);
4541 break;
4542 case CPU_DOWN_PREPARE:
4543 smp_call_function_single(cpu, tsc_bad, NULL, 1);
4544 break;
4545 }
4546 return NOTIFY_OK;
4547}
4548
4549static struct notifier_block kvmclock_cpu_notifier_block = {
4550 .notifier_call = kvmclock_cpu_notifier,
4551 .priority = -INT_MAX
4152}; 4552};
4153 4553
4154static void kvm_timer_init(void) 4554static void kvm_timer_init(void)
4155{ 4555{
4156 int cpu; 4556 int cpu;
4157 4557
4558 max_tsc_khz = tsc_khz;
4559 register_hotcpu_notifier(&kvmclock_cpu_notifier_block);
4158 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { 4560 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
4561#ifdef CONFIG_CPU_FREQ
4562 struct cpufreq_policy policy;
4563 memset(&policy, 0, sizeof(policy));
4564 cpufreq_get_policy(&policy, get_cpu());
4565 if (policy.cpuinfo.max_freq)
4566 max_tsc_khz = policy.cpuinfo.max_freq;
4567#endif
4159 cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block, 4568 cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
4160 CPUFREQ_TRANSITION_NOTIFIER); 4569 CPUFREQ_TRANSITION_NOTIFIER);
4161 for_each_online_cpu(cpu) {
4162 unsigned long khz = cpufreq_get(cpu);
4163 if (!khz)
4164 khz = tsc_khz;
4165 per_cpu(cpu_tsc_khz, cpu) = khz;
4166 }
4167 } else {
4168 for_each_possible_cpu(cpu)
4169 per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
4170 } 4570 }
4571 pr_debug("kvm: max_tsc_khz = %ld\n", max_tsc_khz);
4572 for_each_online_cpu(cpu)
4573 smp_call_function_single(cpu, tsc_khz_changed, NULL, 1);
4171} 4574}
4172 4575
4173static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu); 4576static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu);
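kvm_timer_init() now registers kvmclock_cpu_notifier_block alongside the cpufreq transition notifier: CPU_ONLINE and CPU_DOWN_FAILED refresh the target CPU's tsc_khz via an IPI, while CPU_DOWN_PREPARE zeroes it so a departing CPU never contributes a stale frequency, and max_tsc_khz is seeded from tsc_khz or the cpufreq policy's maximum. A small model of that action dispatch (the constants and per-CPU array are illustrative, not the kernel notifier API):

#include <stdio.h>

enum cpu_action { CPU_ONLINE, CPU_DOWN_FAILED, CPU_DOWN_PREPARE, CPU_OTHER };

static unsigned long cpu_tsc_khz[4];          /* per-CPU value, modeled as an array */
static const unsigned long tsc_khz = 2400000; /* stand-in for the boot-time rate */

static void hotplug_event(int cpu, enum cpu_action action)
{
	switch (action) {
	case CPU_ONLINE:
	case CPU_DOWN_FAILED:
		cpu_tsc_khz[cpu] = tsc_khz;   /* refresh: CPU is (still) usable */
		break;
	case CPU_DOWN_PREPARE:
		cpu_tsc_khz[cpu] = 0;         /* invalidate before the CPU goes away */
		break;
	default:
		break;                        /* other notifications are ignored */
	}
}

int main(void)
{
	hotplug_event(1, CPU_ONLINE);
	hotplug_event(1, CPU_DOWN_PREPARE);
	printf("cpu1 tsc_khz=%lu\n", cpu_tsc_khz[1]);   /* prints 0 */
	return 0;
}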
@@ -4269,6 +4672,7 @@ void kvm_arch_exit(void)
4269 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) 4672 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
4270 cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block, 4673 cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
4271 CPUFREQ_TRANSITION_NOTIFIER); 4674 CPUFREQ_TRANSITION_NOTIFIER);
4675 unregister_hotcpu_notifier(&kvmclock_cpu_notifier_block);
4272 kvm_x86_ops = NULL; 4676 kvm_x86_ops = NULL;
4273 kvm_mmu_module_exit(); 4677 kvm_mmu_module_exit();
4274} 4678}
@@ -4684,8 +5088,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
4684 kvm_mmu_unload(vcpu); 5088 kvm_mmu_unload(vcpu);
4685 if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) 5089 if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
4686 __kvm_migrate_timers(vcpu); 5090 __kvm_migrate_timers(vcpu);
4687 if (kvm_check_request(KVM_REQ_KVMCLOCK_UPDATE, vcpu)) 5091 if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) {
4688 kvm_write_guest_time(vcpu); 5092 r = kvm_guest_time_update(vcpu);
5093 if (unlikely(r))
5094 goto out;
5095 }
4689 if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu)) 5096 if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
4690 kvm_mmu_sync_roots(vcpu); 5097 kvm_mmu_sync_roots(vcpu);
4691 if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) 5098 if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
@@ -4710,6 +5117,21 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
4710 if (unlikely(r)) 5117 if (unlikely(r))
4711 goto out; 5118 goto out;
4712 5119
5120 if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
5121 inject_pending_event(vcpu);
5122
5123 /* enable NMI/IRQ window open exits if needed */
5124 if (vcpu->arch.nmi_pending)
5125 kvm_x86_ops->enable_nmi_window(vcpu);
5126 else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
5127 kvm_x86_ops->enable_irq_window(vcpu);
5128
5129 if (kvm_lapic_enabled(vcpu)) {
5130 update_cr8_intercept(vcpu);
5131 kvm_lapic_sync_to_vapic(vcpu);
5132 }
5133 }
5134
4713 preempt_disable(); 5135 preempt_disable();
4714 5136
4715 kvm_x86_ops->prepare_guest_switch(vcpu); 5137 kvm_x86_ops->prepare_guest_switch(vcpu);
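Several hunks in this patch funnel event reinjection through KVM_REQ_EVENT: producers such as kvm_vcpu_ioctl_interrupt(), the vcpu_events, regs, sregs and mp_state setters, and kvm_set_rflags() call kvm_make_request(), and the block added above in vcpu_enter_guest() consumes it with kvm_check_request() before entering the guest, so inject_pending_event() and the NMI/IRQ window setup only run when something actually changed. A user-space sketch of the set / test-and-clear request pattern (simplified and non-atomic, not the kernel helpers themselves):

#include <stdbool.h>
#include <stdio.h>

#define REQ_EVENT         0
#define REQ_CLOCK_UPDATE  1

static unsigned long requests;    /* one bit per outstanding request */

static void make_request(int req)
{
	requests |= 1UL << req;
}

/* Returns true exactly once per make_request(): test and clear the bit. */
static bool check_request(int req)
{
	if (requests & (1UL << req)) {
		requests &= ~(1UL << req);
		return true;
	}
	return false;
}

int main(void)
{
	make_request(REQ_EVENT);

	bool first  = check_request(REQ_EVENT);
	bool second = check_request(REQ_EVENT);

	printf("%d %d\n", first, second);   /* prints "1 0": consumed on first check */
	return 0;
}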
@@ -4728,23 +5150,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
4728 smp_wmb(); 5150 smp_wmb();
4729 local_irq_enable(); 5151 local_irq_enable();
4730 preempt_enable(); 5152 preempt_enable();
5153 kvm_x86_ops->cancel_injection(vcpu);
4731 r = 1; 5154 r = 1;
4732 goto out; 5155 goto out;
4733 } 5156 }
4734 5157
4735 inject_pending_event(vcpu);
4736
4737 /* enable NMI/IRQ window open exits if needed */
4738 if (vcpu->arch.nmi_pending)
4739 kvm_x86_ops->enable_nmi_window(vcpu);
4740 else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
4741 kvm_x86_ops->enable_irq_window(vcpu);
4742
4743 if (kvm_lapic_enabled(vcpu)) {
4744 update_cr8_intercept(vcpu);
4745 kvm_lapic_sync_to_vapic(vcpu);
4746 }
4747
4748 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); 5158 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
4749 5159
4750 kvm_guest_enter(); 5160 kvm_guest_enter();
@@ -4770,6 +5180,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
4770 if (hw_breakpoint_active()) 5180 if (hw_breakpoint_active())
4771 hw_breakpoint_restore(); 5181 hw_breakpoint_restore();
4772 5182
5183 kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc);
5184
4773 atomic_set(&vcpu->guest_mode, 0); 5185 atomic_set(&vcpu->guest_mode, 0);
4774 smp_wmb(); 5186 smp_wmb();
4775 local_irq_enable(); 5187 local_irq_enable();
@@ -4899,8 +5311,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
4899 if (!irqchip_in_kernel(vcpu->kvm)) 5311 if (!irqchip_in_kernel(vcpu->kvm))
4900 kvm_set_cr8(vcpu, kvm_run->cr8); 5312 kvm_set_cr8(vcpu, kvm_run->cr8);
4901 5313
4902 if (vcpu->arch.pio.count || vcpu->mmio_needed || 5314 if (vcpu->arch.pio.count || vcpu->mmio_needed) {
4903 vcpu->arch.emulate_ctxt.restart) {
4904 if (vcpu->mmio_needed) { 5315 if (vcpu->mmio_needed) {
4905 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); 5316 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
4906 vcpu->mmio_read_completed = 1; 5317 vcpu->mmio_read_completed = 1;
@@ -4981,6 +5392,8 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
4981 5392
4982 vcpu->arch.exception.pending = false; 5393 vcpu->arch.exception.pending = false;
4983 5394
5395 kvm_make_request(KVM_REQ_EVENT, vcpu);
5396
4984 return 0; 5397 return 0;
4985} 5398}
4986 5399
@@ -5044,6 +5457,7 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
5044 struct kvm_mp_state *mp_state) 5457 struct kvm_mp_state *mp_state)
5045{ 5458{
5046 vcpu->arch.mp_state = mp_state->mp_state; 5459 vcpu->arch.mp_state = mp_state->mp_state;
5460 kvm_make_request(KVM_REQ_EVENT, vcpu);
5047 return 0; 5461 return 0;
5048} 5462}
5049 5463
@@ -5051,24 +5465,11 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
5051 bool has_error_code, u32 error_code) 5465 bool has_error_code, u32 error_code)
5052{ 5466{
5053 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; 5467 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
5054 int cs_db, cs_l, ret; 5468 int ret;
5055 cache_all_regs(vcpu);
5056
5057 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
5058 5469
5059 vcpu->arch.emulate_ctxt.vcpu = vcpu; 5470 init_emulate_ctxt(vcpu);
5060 vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
5061 vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu);
5062 vcpu->arch.emulate_ctxt.mode =
5063 (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
5064 (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
5065 ? X86EMUL_MODE_VM86 : cs_l
5066 ? X86EMUL_MODE_PROT64 : cs_db
5067 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
5068 memset(c, 0, sizeof(struct decode_cache));
5069 memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
5070 5471
5071 ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, &emulate_ops, 5472 ret = emulator_task_switch(&vcpu->arch.emulate_ctxt,
5072 tss_selector, reason, has_error_code, 5473 tss_selector, reason, has_error_code,
5073 error_code); 5474 error_code);
5074 5475
@@ -5078,6 +5479,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
5078 memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); 5479 memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
5079 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); 5480 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
5080 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); 5481 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
5482 kvm_make_request(KVM_REQ_EVENT, vcpu);
5081 return EMULATE_DONE; 5483 return EMULATE_DONE;
5082} 5484}
5083EXPORT_SYMBOL_GPL(kvm_task_switch); 5485EXPORT_SYMBOL_GPL(kvm_task_switch);
@@ -5113,7 +5515,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
5113 mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; 5515 mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
5114 kvm_x86_ops->set_cr4(vcpu, sregs->cr4); 5516 kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
5115 if (!is_long_mode(vcpu) && is_pae(vcpu)) { 5517 if (!is_long_mode(vcpu) && is_pae(vcpu)) {
5116 load_pdptrs(vcpu, vcpu->arch.cr3); 5518 load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3);
5117 mmu_reset_needed = 1; 5519 mmu_reset_needed = 1;
5118 } 5520 }
5119 5521
@@ -5148,6 +5550,8 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
5148 !is_protmode(vcpu)) 5550 !is_protmode(vcpu))
5149 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 5551 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
5150 5552
5553 kvm_make_request(KVM_REQ_EVENT, vcpu);
5554
5151 return 0; 5555 return 0;
5152} 5556}
5153 5557
@@ -5334,6 +5738,10 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
5334struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, 5738struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
5335 unsigned int id) 5739 unsigned int id)
5336{ 5740{
5741 if (check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0)
5742 printk_once(KERN_WARNING
5743 "kvm: SMP vm created on host with unstable TSC; "
5744 "guest TSC will not be reliable\n");
5337 return kvm_x86_ops->vcpu_create(kvm, id); 5745 return kvm_x86_ops->vcpu_create(kvm, id);
5338} 5746}
5339 5747
@@ -5376,22 +5784,22 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
5376 vcpu->arch.dr6 = DR6_FIXED_1; 5784 vcpu->arch.dr6 = DR6_FIXED_1;
5377 vcpu->arch.dr7 = DR7_FIXED_1; 5785 vcpu->arch.dr7 = DR7_FIXED_1;
5378 5786
5787 kvm_make_request(KVM_REQ_EVENT, vcpu);
5788
5379 return kvm_x86_ops->vcpu_reset(vcpu); 5789 return kvm_x86_ops->vcpu_reset(vcpu);
5380} 5790}
5381 5791
5382int kvm_arch_hardware_enable(void *garbage) 5792int kvm_arch_hardware_enable(void *garbage)
5383{ 5793{
5384 /* 5794 struct kvm *kvm;
5385 * Since this may be called from a hotplug notifcation, 5795 struct kvm_vcpu *vcpu;
5386 * we can't get the CPU frequency directly. 5796 int i;
5387 */
5388 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
5389 int cpu = raw_smp_processor_id();
5390 per_cpu(cpu_tsc_khz, cpu) = 0;
5391 }
5392 5797
5393 kvm_shared_msr_cpu_online(); 5798 kvm_shared_msr_cpu_online();
5394 5799 list_for_each_entry(kvm, &vm_list, vm_list)
5800 kvm_for_each_vcpu(i, vcpu, kvm)
5801 if (vcpu->cpu == smp_processor_id())
5802 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
5395 return kvm_x86_ops->hardware_enable(garbage); 5803 return kvm_x86_ops->hardware_enable(garbage);
5396} 5804}
5397 5805
@@ -5425,7 +5833,11 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
5425 BUG_ON(vcpu->kvm == NULL); 5833 BUG_ON(vcpu->kvm == NULL);
5426 kvm = vcpu->kvm; 5834 kvm = vcpu->kvm;
5427 5835
5836 vcpu->arch.emulate_ctxt.ops = &emulate_ops;
5837 vcpu->arch.walk_mmu = &vcpu->arch.mmu;
5428 vcpu->arch.mmu.root_hpa = INVALID_PAGE; 5838 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
5839 vcpu->arch.mmu.translate_gpa = translate_gpa;
5840 vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
5429 if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu)) 5841 if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu))
5430 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 5842 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
5431 else 5843 else
@@ -5438,6 +5850,9 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
5438 } 5850 }
5439 vcpu->arch.pio_data = page_address(page); 5851 vcpu->arch.pio_data = page_address(page);
5440 5852
5853 if (!kvm->arch.virtual_tsc_khz)
5854 kvm_arch_set_tsc_khz(kvm, max_tsc_khz);
5855
5441 r = kvm_mmu_create(vcpu); 5856 r = kvm_mmu_create(vcpu);
5442 if (r < 0) 5857 if (r < 0)
5443 goto fail_free_pio_data; 5858 goto fail_free_pio_data;
@@ -5497,7 +5912,7 @@ struct kvm *kvm_arch_create_vm(void)
5497 /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ 5912 /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
5498 set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); 5913 set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
5499 5914
5500 rdtscll(kvm->arch.vm_init_tsc); 5915 spin_lock_init(&kvm->arch.tsc_write_lock);
5501 5916
5502 return kvm; 5917 return kvm;
5503} 5918}
@@ -5684,6 +6099,7 @@ void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
5684 kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip)) 6099 kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
5685 rflags |= X86_EFLAGS_TF; 6100 rflags |= X86_EFLAGS_TF;
5686 kvm_x86_ops->set_rflags(vcpu, rflags); 6101 kvm_x86_ops->set_rflags(vcpu, rflags);
6102 kvm_make_request(KVM_REQ_EVENT, vcpu);
5687} 6103}
5688EXPORT_SYMBOL_GPL(kvm_set_rflags); 6104EXPORT_SYMBOL_GPL(kvm_set_rflags);
5689 6105
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index b7a404722d2b..2cea414489f3 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -50,6 +50,11 @@ static inline int is_long_mode(struct kvm_vcpu *vcpu)
50#endif 50#endif
51} 51}
52 52
53static inline bool mmu_is_nested(struct kvm_vcpu *vcpu)
54{
55 return vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu;
56}
57
53static inline int is_pae(struct kvm_vcpu *vcpu) 58static inline int is_pae(struct kvm_vcpu *vcpu)
54{ 59{
55 return kvm_read_cr4_bits(vcpu, X86_CR4_PAE); 60 return kvm_read_cr4_bits(vcpu, X86_CR4_PAE);
@@ -67,5 +72,8 @@ static inline int is_paging(struct kvm_vcpu *vcpu)
67 72
68void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); 73void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
69void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); 74void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
75int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq);
76
77void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data);
70 78
71#endif 79#endif