author     Glenn Elliott <gelliott@cs.unc.edu>    2012-03-04 19:47:13 -0500
committer  Glenn Elliott <gelliott@cs.unc.edu>    2012-03-04 19:47:13 -0500
commit     c71c03bda1e86c9d5198c5d83f712e695c4f2a1e (patch)
tree       ecb166cb3e2b7e2adb3b5e292245fefd23381ac8 /arch/x86/kvm
parent     ea53c912f8a86a8567697115b6a0d8152beee5c8 (diff)
parent     6a00f206debf8a5c8899055726ad127dbeeed098 (diff)
Merge branch 'mpi-master' into wip-k-fmlp
Conflicts: litmus/sched_cedf.c
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r--  arch/x86/kvm/Kconfig           |    8
-rw-r--r--  arch/x86/kvm/Makefile          |    3
-rw-r--r--  arch/x86/kvm/emulate.c         | 3945
-rw-r--r--  arch/x86/kvm/i8254.c           |   11
-rw-r--r--  arch/x86/kvm/i8254.h           |    2
-rw-r--r--  arch/x86/kvm/i8259.c           |   48
-rw-r--r--  arch/x86/kvm/irq.c             |    9
-rw-r--r--  arch/x86/kvm/irq.h             |    4
-rw-r--r--  arch/x86/kvm/kvm_cache_regs.h  |   31
-rw-r--r--  arch/x86/kvm/lapic.c           |   30
-rw-r--r--  arch/x86/kvm/lapic.h           |    1
-rw-r--r--  arch/x86/kvm/mmu.c             | 1371
-rw-r--r--  arch/x86/kvm/mmu.h             |    9
-rw-r--r--  arch/x86/kvm/mmu_audit.c       |  304
-rw-r--r--  arch/x86/kvm/mmutrace.h        |   19
-rw-r--r--  arch/x86/kvm/paging_tmpl.h     |  410
-rw-r--r--  arch/x86/kvm/svm.c             | 1644
-rw-r--r--  arch/x86/kvm/timer.c           |    4
-rw-r--r--  arch/x86/kvm/trace.h           |   25
-rw-r--r--  arch/x86/kvm/vmx.c             |  724
-rw-r--r--  arch/x86/kvm/x86.c             | 1860
-rw-r--r--  arch/x86/kvm/x86.h             |   13
22 files changed, 7051 insertions(+), 3424 deletions(-)
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 970bbd479516..50f63648ce1b 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -28,6 +28,7 @@ config KVM
 	select HAVE_KVM_IRQCHIP
 	select HAVE_KVM_EVENTFD
 	select KVM_APIC_ARCHITECTURE
+	select KVM_ASYNC_PF
 	select USER_RETURN_NOTIFIER
 	select KVM_MMIO
 	---help---
@@ -64,6 +65,13 @@ config KVM_AMD
 	  To compile this as a module, choose M here: the module
 	  will be called kvm-amd.
 
+config KVM_MMU_AUDIT
+	bool "Audit KVM MMU"
+	depends on KVM && TRACEPOINTS
+	---help---
+	 This option adds a R/W kVM module parameter 'mmu_audit', which allows
+	 audit KVM MMU at runtime.
+
 # OK, it's a little counter-intuitive to do this, but it puts it neatly under
 # the virtualization menu.
 source drivers/vhost/Kconfig
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 31a7035c4bd9..f15501f431c8 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -1,5 +1,5 @@
 
-EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm
+ccflags-y += -Ivirt/kvm -Iarch/x86/kvm
 
 CFLAGS_x86.o := -I.
 CFLAGS_svm.o := -I.
@@ -9,6 +9,7 @@ kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
 				coalesced_mmio.o irq_comm.o eventfd.o \
 				assigned-dev.o)
 kvm-$(CONFIG_IOMMU_API)	+= $(addprefix ../../../virt/kvm/, iommu.o)
+kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(addprefix ../../../virt/kvm/, async_pf.o)
 
 kvm-y			+= x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
 			   i8254.o timer.o
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 66ca98aafdd6..adc98675cda0 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -9,7 +9,7 @@
  * privileged instructions:
  *
  * Copyright (C) 2006 Qumranet
- * Copyright 2010 Red Hat, Inc. and/or its affilates.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
  *
  * Avi Kivity <avi@qumranet.com>
  * Yaniv Kamay <yaniv@qumranet.com>
@@ -20,16 +20,8 @@
  * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
  */
 
-#ifndef __KERNEL__
-#include <stdio.h>
-#include <stdint.h>
-#include <public/xen.h>
-#define DPRINTF(_f, _a ...) printf(_f , ## _a)
-#else
 #include <linux/kvm_host.h>
 #include "kvm_cache_regs.h"
-#define DPRINTF(x...) do {} while (0)
-#endif
 #include <linux/module.h>
 #include <asm/kvm_emulate.h>
 
@@ -51,39 +43,50 @@
 #define ImplicitOps (1<<1)	/* Implicit in opcode. No generic decode. */
 #define DstReg      (2<<1)	/* Register operand. */
 #define DstMem      (3<<1)	/* Memory operand. */
 #define DstAcc      (4<<1)	/* Destination Accumulator */
 #define DstDI       (5<<1)	/* Destination is in ES:(E)DI */
 #define DstMem64    (6<<1)	/* 64bit memory operand */
-#define DstMask     (7<<1)
+#define DstImmUByte (7<<1)	/* 8-bit unsigned immediate operand */
+#define DstDX       (8<<1)	/* Destination is in DX register */
+#define DstMask     (0xf<<1)
 /* Source operand type. */
-#define SrcNone     (0<<4)	/* No source operand. */
-#define SrcImplicit (0<<4)	/* Source operand is implicit in the opcode. */
-#define SrcReg      (1<<4)	/* Register operand. */
-#define SrcMem      (2<<4)	/* Memory operand. */
-#define SrcMem16    (3<<4)	/* Memory operand (16-bit). */
-#define SrcMem32    (4<<4)	/* Memory operand (32-bit). */
-#define SrcImm      (5<<4)	/* Immediate operand. */
-#define SrcImmByte  (6<<4)	/* 8-bit sign-extended immediate operand. */
-#define SrcOne      (7<<4)	/* Implied '1' */
-#define SrcImmUByte (8<<4)	/* 8-bit unsigned immediate operand. */
-#define SrcImmU     (9<<4)	/* Immediate operand, unsigned */
-#define SrcSI       (0xa<<4)	/* Source is in the DS:RSI */
-#define SrcImmFAddr (0xb<<4)	/* Source is immediate far address */
-#define SrcMemFAddr (0xc<<4)	/* Source is far address in memory */
-#define SrcAcc      (0xd<<4)	/* Source Accumulator */
-#define SrcMask     (0xf<<4)
+#define SrcNone     (0<<5)	/* No source operand. */
+#define SrcReg      (1<<5)	/* Register operand. */
+#define SrcMem      (2<<5)	/* Memory operand. */
+#define SrcMem16    (3<<5)	/* Memory operand (16-bit). */
+#define SrcMem32    (4<<5)	/* Memory operand (32-bit). */
+#define SrcImm      (5<<5)	/* Immediate operand. */
+#define SrcImmByte  (6<<5)	/* 8-bit sign-extended immediate operand. */
+#define SrcOne      (7<<5)	/* Implied '1' */
+#define SrcImmUByte (8<<5)	/* 8-bit unsigned immediate operand. */
+#define SrcImmU     (9<<5)	/* Immediate operand, unsigned */
+#define SrcSI       (0xa<<5)	/* Source is in the DS:RSI */
+#define SrcImmFAddr (0xb<<5)	/* Source is immediate far address */
+#define SrcMemFAddr (0xc<<5)	/* Source is far address in memory */
+#define SrcAcc      (0xd<<5)	/* Source Accumulator */
+#define SrcImmU16   (0xe<<5)	/* Immediate operand, unsigned, 16 bits */
+#define SrcDX       (0xf<<5)	/* Source is in DX register */
+#define SrcMask     (0xf<<5)
 /* Generic ModRM decode. */
-#define ModRM       (1<<8)
+#define ModRM       (1<<9)
 /* Destination is only written; never read. */
-#define Mov         (1<<9)
-#define BitOp       (1<<10)
-#define MemAbs      (1<<11)	/* Memory operand is absolute displacement */
-#define String      (1<<12)	/* String instruction (rep capable) */
-#define Stack       (1<<13)	/* Stack instruction (push/pop) */
-#define Group       (1<<14)	/* Bits 3:5 of modrm byte extend opcode */
-#define GroupDual   (1<<15)	/* Alternate decoding of mod == 3 */
-#define GroupMask   0xff	/* Group number stored in bits 0:7 */
+#define Mov         (1<<10)
+#define BitOp       (1<<11)
+#define MemAbs      (1<<12)	/* Memory operand is absolute displacement */
+#define String      (1<<13)	/* String instruction (rep capable) */
+#define Stack       (1<<14)	/* Stack instruction (push/pop) */
+#define GroupMask   (7<<15)	/* Opcode uses one of the group mechanisms */
+#define Group       (1<<15)	/* Bits 3:5 of modrm byte extend opcode */
+#define GroupDual   (2<<15)	/* Alternate decoding of mod == 3 */
+#define Prefix      (3<<15)	/* Instruction varies with 66/f2/f3 prefix */
+#define RMExt       (4<<15)	/* Opcode extension in ModRM r/m if mod == 3 */
+#define Sse         (1<<18)	/* SSE Vector instruction */
 /* Misc flags */
+#define Prot        (1<<21)	/* instruction generates #UD if not in prot-mode */
+#define VendorSpecific (1<<22)	/* Vendor specific instruction */
+#define NoAccess    (1<<23)	/* Don't access memory (lea/invlpg/verr etc) */
+#define Op3264      (1<<24)	/* Operand is 64b in long mode, 32b otherwise */
+#define Undefined   (1<<25)	/* No Such Instruction */
 #define Lock        (1<<26)	/* lock prefix is allowed for the instruction */
 #define Priv        (1<<27)	/* instruction generates #GP if current CPL != 0 */
 #define No64        (1<<28)
@@ -92,285 +95,40 @@
92#define Src2CL (1<<29) 95#define Src2CL (1<<29)
93#define Src2ImmByte (2<<29) 96#define Src2ImmByte (2<<29)
94#define Src2One (3<<29) 97#define Src2One (3<<29)
98#define Src2Imm (4<<29)
95#define Src2Mask (7<<29) 99#define Src2Mask (7<<29)
96 100
97enum { 101#define X2(x...) x, x
98 Group1_80, Group1_81, Group1_82, Group1_83, 102#define X3(x...) X2(x), x
99 Group1A, Group3_Byte, Group3, Group4, Group5, Group7, 103#define X4(x...) X2(x), X2(x)
100 Group8, Group9, 104#define X5(x...) X4(x), x
101}; 105#define X6(x...) X4(x), X2(x)
102 106#define X7(x...) X4(x), X3(x)
103static u32 opcode_table[256] = { 107#define X8(x...) X4(x), X4(x)
104 /* 0x00 - 0x07 */ 108#define X16(x...) X8(x), X8(x)
105 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, 109
106 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 110struct opcode {
107 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 111 u32 flags;
108 ImplicitOps | Stack | No64, ImplicitOps | Stack | No64, 112 u8 intercept;
109 /* 0x08 - 0x0F */ 113 union {
110 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, 114 int (*execute)(struct x86_emulate_ctxt *ctxt);
111 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 115 struct opcode *group;
112 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 116 struct group_dual *gdual;
113 ImplicitOps | Stack | No64, 0, 117 struct gprefix *gprefix;
114 /* 0x10 - 0x17 */ 118 } u;
115 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, 119 int (*check_perm)(struct x86_emulate_ctxt *ctxt);
116 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
117 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
118 ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
119 /* 0x18 - 0x1F */
120 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
121 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
122 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
123 ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
124 /* 0x20 - 0x27 */
125 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
126 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
127 ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
128 /* 0x28 - 0x2F */
129 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
130 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
131 ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
132 /* 0x30 - 0x37 */
133 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
134 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
135 ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
136 /* 0x38 - 0x3F */
137 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
138 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
139 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
140 0, 0,
141 /* 0x40 - 0x47 */
142 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
143 /* 0x48 - 0x4F */
144 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
145 /* 0x50 - 0x57 */
146 SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack,
147 SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack,
148 /* 0x58 - 0x5F */
149 DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
150 DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
151 /* 0x60 - 0x67 */
152 ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
153 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ ,
154 0, 0, 0, 0,
155 /* 0x68 - 0x6F */
156 SrcImm | Mov | Stack, 0, SrcImmByte | Mov | Stack, 0,
157 DstDI | ByteOp | Mov | String, DstDI | Mov | String, /* insb, insw/insd */
158 SrcSI | ByteOp | ImplicitOps | String, SrcSI | ImplicitOps | String, /* outsb, outsw/outsd */
159 /* 0x70 - 0x77 */
160 SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
161 SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
162 /* 0x78 - 0x7F */
163 SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
164 SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
165 /* 0x80 - 0x87 */
166 Group | Group1_80, Group | Group1_81,
167 Group | Group1_82, Group | Group1_83,
168 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
169 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
170 /* 0x88 - 0x8F */
171 ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
172 ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
173 DstMem | SrcNone | ModRM | Mov, ModRM | DstReg,
174 ImplicitOps | SrcMem16 | ModRM, Group | Group1A,
175 /* 0x90 - 0x97 */
176 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
177 /* 0x98 - 0x9F */
178 0, 0, SrcImmFAddr | No64, 0,
179 ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
180 /* 0xA0 - 0xA7 */
181 ByteOp | DstAcc | SrcMem | Mov | MemAbs, DstAcc | SrcMem | Mov | MemAbs,
182 ByteOp | DstMem | SrcAcc | Mov | MemAbs, DstMem | SrcAcc | Mov | MemAbs,
183 ByteOp | SrcSI | DstDI | Mov | String, SrcSI | DstDI | Mov | String,
184 ByteOp | SrcSI | DstDI | String, SrcSI | DstDI | String,
185 /* 0xA8 - 0xAF */
186 DstAcc | SrcImmByte | ByteOp, DstAcc | SrcImm, ByteOp | DstDI | Mov | String, DstDI | Mov | String,
187 ByteOp | SrcSI | DstAcc | Mov | String, SrcSI | DstAcc | Mov | String,
188 ByteOp | DstDI | String, DstDI | String,
189 /* 0xB0 - 0xB7 */
190 ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
191 ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
192 ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
193 ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
194 /* 0xB8 - 0xBF */
195 DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
196 DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
197 DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
198 DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
199 /* 0xC0 - 0xC7 */
200 ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
201 0, ImplicitOps | Stack, 0, 0,
202 ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov,
203 /* 0xC8 - 0xCF */
204 0, 0, 0, ImplicitOps | Stack,
205 ImplicitOps, SrcImmByte, ImplicitOps | No64, ImplicitOps,
206 /* 0xD0 - 0xD7 */
207 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
208 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
209 0, 0, 0, 0,
210 /* 0xD8 - 0xDF */
211 0, 0, 0, 0, 0, 0, 0, 0,
212 /* 0xE0 - 0xE7 */
213 0, 0, 0, 0,
214 ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc,
215 ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc,
216 /* 0xE8 - 0xEF */
217 SrcImm | Stack, SrcImm | ImplicitOps,
218 SrcImmFAddr | No64, SrcImmByte | ImplicitOps,
219 SrcNone | ByteOp | DstAcc, SrcNone | DstAcc,
220 SrcNone | ByteOp | DstAcc, SrcNone | DstAcc,
221 /* 0xF0 - 0xF7 */
222 0, 0, 0, 0,
223 ImplicitOps | Priv, ImplicitOps, Group | Group3_Byte, Group | Group3,
224 /* 0xF8 - 0xFF */
225 ImplicitOps, 0, ImplicitOps, ImplicitOps,
226 ImplicitOps, ImplicitOps, Group | Group4, Group | Group5,
227};
228
229static u32 twobyte_table[256] = {
230 /* 0x00 - 0x0F */
231 0, Group | GroupDual | Group7, 0, 0,
232 0, ImplicitOps, ImplicitOps | Priv, 0,
233 ImplicitOps | Priv, ImplicitOps | Priv, 0, 0,
234 0, ImplicitOps | ModRM, 0, 0,
235 /* 0x10 - 0x1F */
236 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0,
237 /* 0x20 - 0x2F */
238 ModRM | ImplicitOps | Priv, ModRM | Priv,
239 ModRM | ImplicitOps | Priv, ModRM | Priv,
240 0, 0, 0, 0,
241 0, 0, 0, 0, 0, 0, 0, 0,
242 /* 0x30 - 0x3F */
243 ImplicitOps | Priv, 0, ImplicitOps | Priv, 0,
244 ImplicitOps, ImplicitOps | Priv, 0, 0,
245 0, 0, 0, 0, 0, 0, 0, 0,
246 /* 0x40 - 0x47 */
247 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
248 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
249 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
250 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
251 /* 0x48 - 0x4F */
252 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
253 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
254 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
255 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
256 /* 0x50 - 0x5F */
257 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
258 /* 0x60 - 0x6F */
259 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
260 /* 0x70 - 0x7F */
261 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
262 /* 0x80 - 0x8F */
263 SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm,
264 SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm,
265 /* 0x90 - 0x9F */
266 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
267 /* 0xA0 - 0xA7 */
268 ImplicitOps | Stack, ImplicitOps | Stack,
269 0, DstMem | SrcReg | ModRM | BitOp,
270 DstMem | SrcReg | Src2ImmByte | ModRM,
271 DstMem | SrcReg | Src2CL | ModRM, 0, 0,
272 /* 0xA8 - 0xAF */
273 ImplicitOps | Stack, ImplicitOps | Stack,
274 0, DstMem | SrcReg | ModRM | BitOp | Lock,
275 DstMem | SrcReg | Src2ImmByte | ModRM,
276 DstMem | SrcReg | Src2CL | ModRM,
277 ModRM, 0,
278 /* 0xB0 - 0xB7 */
279 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
280 0, DstMem | SrcReg | ModRM | BitOp | Lock,
281 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
282 DstReg | SrcMem16 | ModRM | Mov,
283 /* 0xB8 - 0xBF */
284 0, 0,
285 Group | Group8, DstMem | SrcReg | ModRM | BitOp | Lock,
286 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
287 DstReg | SrcMem16 | ModRM | Mov,
288 /* 0xC0 - 0xCF */
289 0, 0, 0, DstMem | SrcReg | ModRM | Mov,
290 0, 0, 0, Group | GroupDual | Group9,
291 0, 0, 0, 0, 0, 0, 0, 0,
292 /* 0xD0 - 0xDF */
293 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
294 /* 0xE0 - 0xEF */
295 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
296 /* 0xF0 - 0xFF */
297 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
298}; 120};
299 121
300static u32 group_table[] = { 122struct group_dual {
301 [Group1_80*8] = 123 struct opcode mod012[8];
302 ByteOp | DstMem | SrcImm | ModRM | Lock, 124 struct opcode mod3[8];
303 ByteOp | DstMem | SrcImm | ModRM | Lock,
304 ByteOp | DstMem | SrcImm | ModRM | Lock,
305 ByteOp | DstMem | SrcImm | ModRM | Lock,
306 ByteOp | DstMem | SrcImm | ModRM | Lock,
307 ByteOp | DstMem | SrcImm | ModRM | Lock,
308 ByteOp | DstMem | SrcImm | ModRM | Lock,
309 ByteOp | DstMem | SrcImm | ModRM,
310 [Group1_81*8] =
311 DstMem | SrcImm | ModRM | Lock,
312 DstMem | SrcImm | ModRM | Lock,
313 DstMem | SrcImm | ModRM | Lock,
314 DstMem | SrcImm | ModRM | Lock,
315 DstMem | SrcImm | ModRM | Lock,
316 DstMem | SrcImm | ModRM | Lock,
317 DstMem | SrcImm | ModRM | Lock,
318 DstMem | SrcImm | ModRM,
319 [Group1_82*8] =
320 ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
321 ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
322 ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
323 ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
324 ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
325 ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
326 ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
327 ByteOp | DstMem | SrcImm | ModRM | No64,
328 [Group1_83*8] =
329 DstMem | SrcImmByte | ModRM | Lock,
330 DstMem | SrcImmByte | ModRM | Lock,
331 DstMem | SrcImmByte | ModRM | Lock,
332 DstMem | SrcImmByte | ModRM | Lock,
333 DstMem | SrcImmByte | ModRM | Lock,
334 DstMem | SrcImmByte | ModRM | Lock,
335 DstMem | SrcImmByte | ModRM | Lock,
336 DstMem | SrcImmByte | ModRM,
337 [Group1A*8] =
338 DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0,
339 [Group3_Byte*8] =
340 ByteOp | SrcImm | DstMem | ModRM, ByteOp | SrcImm | DstMem | ModRM,
341 ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
342 0, 0, 0, 0,
343 [Group3*8] =
344 DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
345 DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
346 0, 0, 0, 0,
347 [Group4*8] =
348 ByteOp | DstMem | SrcNone | ModRM | Lock, ByteOp | DstMem | SrcNone | ModRM | Lock,
349 0, 0, 0, 0, 0, 0,
350 [Group5*8] =
351 DstMem | SrcNone | ModRM | Lock, DstMem | SrcNone | ModRM | Lock,
352 SrcMem | ModRM | Stack, 0,
353 SrcMem | ModRM | Stack, SrcMemFAddr | ModRM | ImplicitOps,
354 SrcMem | ModRM | Stack, 0,
355 [Group7*8] =
356 0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv,
357 SrcNone | ModRM | DstMem | Mov, 0,
358 SrcMem16 | ModRM | Mov | Priv, SrcMem | ModRM | ByteOp | Priv,
359 [Group8*8] =
360 0, 0, 0, 0,
361 DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM | Lock,
362 DstMem | SrcImmByte | ModRM | Lock, DstMem | SrcImmByte | ModRM | Lock,
363 [Group9*8] =
364 0, DstMem64 | ModRM | Lock, 0, 0, 0, 0, 0, 0,
365}; 125};
366 126
367static u32 group2_table[] = { 127struct gprefix {
368 [Group7*8] = 128 struct opcode pfx_no;
369 SrcNone | ModRM | Priv, 0, 0, SrcNone | ModRM | Priv, 129 struct opcode pfx_66;
370 SrcNone | ModRM | DstMem | Mov, 0, 130 struct opcode pfx_f2;
371 SrcMem16 | ModRM | Mov | Priv, 0, 131 struct opcode pfx_f3;
372 [Group9*8] =
373 0, 0, 0, 0, 0, 0, 0, 0,
374}; 132};
375 133
376/* EFLAGS bit definitions. */ 134/* EFLAGS bit definitions. */
@@ -392,6 +150,9 @@ static u32 group2_table[] = {
392#define EFLG_PF (1<<2) 150#define EFLG_PF (1<<2)
393#define EFLG_CF (1<<0) 151#define EFLG_CF (1<<0)
394 152
153#define EFLG_RESERVED_ZEROS_MASK 0xffc0802a
154#define EFLG_RESERVED_ONE_MASK 2
155
395/* 156/*
396 * Instruction emulation: 157 * Instruction emulation:
397 * Most instructions are emulated directly via a fragment of inline assembly 158 * Most instructions are emulated directly via a fragment of inline assembly
@@ -444,13 +205,13 @@ static u32 group2_table[] = {
444#define ON64(x) 205#define ON64(x)
445#endif 206#endif
446 207
447#define ____emulate_2op(_op, _src, _dst, _eflags, _x, _y, _suffix) \ 208#define ____emulate_2op(_op, _src, _dst, _eflags, _x, _y, _suffix, _dsttype) \
448 do { \ 209 do { \
449 __asm__ __volatile__ ( \ 210 __asm__ __volatile__ ( \
450 _PRE_EFLAGS("0", "4", "2") \ 211 _PRE_EFLAGS("0", "4", "2") \
451 _op _suffix " %"_x"3,%1; " \ 212 _op _suffix " %"_x"3,%1; " \
452 _POST_EFLAGS("0", "4", "2") \ 213 _POST_EFLAGS("0", "4", "2") \
453 : "=m" (_eflags), "=m" ((_dst).val), \ 214 : "=m" (_eflags), "+q" (*(_dsttype*)&(_dst).val),\
454 "=&r" (_tmp) \ 215 "=&r" (_tmp) \
455 : _y ((_src).val), "i" (EFLAGS_MASK)); \ 216 : _y ((_src).val), "i" (EFLAGS_MASK)); \
456 } while (0) 217 } while (0)
@@ -463,13 +224,13 @@ static u32 group2_table[] = {
463 \ 224 \
464 switch ((_dst).bytes) { \ 225 switch ((_dst).bytes) { \
465 case 2: \ 226 case 2: \
466 ____emulate_2op(_op,_src,_dst,_eflags,_wx,_wy,"w"); \ 227 ____emulate_2op(_op,_src,_dst,_eflags,_wx,_wy,"w",u16);\
467 break; \ 228 break; \
468 case 4: \ 229 case 4: \
469 ____emulate_2op(_op,_src,_dst,_eflags,_lx,_ly,"l"); \ 230 ____emulate_2op(_op,_src,_dst,_eflags,_lx,_ly,"l",u32);\
470 break; \ 231 break; \
471 case 8: \ 232 case 8: \
472 ON64(____emulate_2op(_op,_src,_dst,_eflags,_qx,_qy,"q")); \ 233 ON64(____emulate_2op(_op,_src,_dst,_eflags,_qx,_qy,"q",u64)); \
473 break; \ 234 break; \
474 } \ 235 } \
475 } while (0) 236 } while (0)
@@ -479,7 +240,7 @@ static u32 group2_table[] = {
479 unsigned long _tmp; \ 240 unsigned long _tmp; \
480 switch ((_dst).bytes) { \ 241 switch ((_dst).bytes) { \
481 case 1: \ 242 case 1: \
482 ____emulate_2op(_op,_src,_dst,_eflags,_bx,_by,"b"); \ 243 ____emulate_2op(_op,_src,_dst,_eflags,_bx,_by,"b",u8); \
483 break; \ 244 break; \
484 default: \ 245 default: \
485 __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ 246 __emulate_2op_nobyte(_op, _src, _dst, _eflags, \
@@ -504,42 +265,42 @@ static u32 group2_table[] = {
504 "w", "r", _LO32, "r", "", "r") 265 "w", "r", _LO32, "r", "", "r")
505 266
506/* Instruction has three operands and one operand is stored in ECX register */ 267/* Instruction has three operands and one operand is stored in ECX register */
507#define __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, _suffix, _type) \ 268#define __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, _suffix, _type) \
508 do { \ 269 do { \
509 unsigned long _tmp; \ 270 unsigned long _tmp; \
510 _type _clv = (_cl).val; \ 271 _type _clv = (_cl).val; \
511 _type _srcv = (_src).val; \ 272 _type _srcv = (_src).val; \
512 _type _dstv = (_dst).val; \ 273 _type _dstv = (_dst).val; \
513 \ 274 \
514 __asm__ __volatile__ ( \ 275 __asm__ __volatile__ ( \
515 _PRE_EFLAGS("0", "5", "2") \ 276 _PRE_EFLAGS("0", "5", "2") \
516 _op _suffix " %4,%1 \n" \ 277 _op _suffix " %4,%1 \n" \
517 _POST_EFLAGS("0", "5", "2") \ 278 _POST_EFLAGS("0", "5", "2") \
518 : "=m" (_eflags), "+r" (_dstv), "=&r" (_tmp) \ 279 : "=m" (_eflags), "+r" (_dstv), "=&r" (_tmp) \
519 : "c" (_clv) , "r" (_srcv), "i" (EFLAGS_MASK) \ 280 : "c" (_clv) , "r" (_srcv), "i" (EFLAGS_MASK) \
520 ); \ 281 ); \
521 \ 282 \
522 (_cl).val = (unsigned long) _clv; \ 283 (_cl).val = (unsigned long) _clv; \
523 (_src).val = (unsigned long) _srcv; \ 284 (_src).val = (unsigned long) _srcv; \
524 (_dst).val = (unsigned long) _dstv; \ 285 (_dst).val = (unsigned long) _dstv; \
525 } while (0) 286 } while (0)
526 287
527#define emulate_2op_cl(_op, _cl, _src, _dst, _eflags) \ 288#define emulate_2op_cl(_op, _cl, _src, _dst, _eflags) \
528 do { \ 289 do { \
529 switch ((_dst).bytes) { \ 290 switch ((_dst).bytes) { \
530 case 2: \ 291 case 2: \
531 __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ 292 __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \
532 "w", unsigned short); \ 293 "w", unsigned short); \
533 break; \ 294 break; \
534 case 4: \ 295 case 4: \
535 __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ 296 __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \
536 "l", unsigned int); \ 297 "l", unsigned int); \
537 break; \ 298 break; \
538 case 8: \ 299 case 8: \
539 ON64(__emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ 300 ON64(__emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \
540 "q", unsigned long)); \ 301 "q", unsigned long)); \
541 break; \ 302 break; \
542 } \ 303 } \
543 } while (0) 304 } while (0)
544 305
545#define __emulate_1op(_op, _dst, _eflags, _suffix) \ 306#define __emulate_1op(_op, _dst, _eflags, _suffix) \
@@ -566,6 +327,86 @@ static u32 group2_table[] = {
566 } \ 327 } \
567 } while (0) 328 } while (0)
568 329
330#define __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, _suffix) \
331 do { \
332 unsigned long _tmp; \
333 \
334 __asm__ __volatile__ ( \
335 _PRE_EFLAGS("0", "4", "1") \
336 _op _suffix " %5; " \
337 _POST_EFLAGS("0", "4", "1") \
338 : "=m" (_eflags), "=&r" (_tmp), \
339 "+a" (_rax), "+d" (_rdx) \
340 : "i" (EFLAGS_MASK), "m" ((_src).val), \
341 "a" (_rax), "d" (_rdx)); \
342 } while (0)
343
344#define __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, _eflags, _suffix, _ex) \
345 do { \
346 unsigned long _tmp; \
347 \
348 __asm__ __volatile__ ( \
349 _PRE_EFLAGS("0", "5", "1") \
350 "1: \n\t" \
351 _op _suffix " %6; " \
352 "2: \n\t" \
353 _POST_EFLAGS("0", "5", "1") \
354 ".pushsection .fixup,\"ax\" \n\t" \
355 "3: movb $1, %4 \n\t" \
356 "jmp 2b \n\t" \
357 ".popsection \n\t" \
358 _ASM_EXTABLE(1b, 3b) \
359 : "=m" (_eflags), "=&r" (_tmp), \
360 "+a" (_rax), "+d" (_rdx), "+qm"(_ex) \
361 : "i" (EFLAGS_MASK), "m" ((_src).val), \
362 "a" (_rax), "d" (_rdx)); \
363 } while (0)
364
365/* instruction has only one source operand, destination is implicit (e.g. mul, div, imul, idiv) */
366#define emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags) \
367 do { \
368 switch((_src).bytes) { \
369 case 1: \
370 __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \
371 _eflags, "b"); \
372 break; \
373 case 2: \
374 __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \
375 _eflags, "w"); \
376 break; \
377 case 4: \
378 __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \
379 _eflags, "l"); \
380 break; \
381 case 8: \
382 ON64(__emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \
383 _eflags, "q")); \
384 break; \
385 } \
386 } while (0)
387
388#define emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, _eflags, _ex) \
389 do { \
390 switch((_src).bytes) { \
391 case 1: \
392 __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \
393 _eflags, "b", _ex); \
394 break; \
395 case 2: \
396 __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \
397 _eflags, "w", _ex); \
398 break; \
399 case 4: \
400 __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \
401 _eflags, "l", _ex); \
402 break; \
403 case 8: ON64( \
404 __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \
405 _eflags, "q", _ex)); \
406 break; \
407 } \
408 } while (0)
409
569/* Fetch next part of the instruction being emulated. */ 410/* Fetch next part of the instruction being emulated. */
570#define insn_fetch(_type, _size, _eip) \ 411#define insn_fetch(_type, _size, _eip) \
571({ unsigned long _x; \ 412({ unsigned long _x; \
@@ -576,13 +417,33 @@ static u32 group2_table[] = {
576 (_type)_x; \ 417 (_type)_x; \
577}) 418})
578 419
579#define insn_fetch_arr(_arr, _size, _eip) \ 420#define insn_fetch_arr(_arr, _size, _eip) \
580({ rc = do_insn_fetch(ctxt, ops, (_eip), _arr, (_size)); \ 421({ rc = do_insn_fetch(ctxt, ops, (_eip), _arr, (_size)); \
581 if (rc != X86EMUL_CONTINUE) \ 422 if (rc != X86EMUL_CONTINUE) \
582 goto done; \ 423 goto done; \
583 (_eip) += (_size); \ 424 (_eip) += (_size); \
584}) 425})
585 426
427static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt,
428 enum x86_intercept intercept,
429 enum x86_intercept_stage stage)
430{
431 struct x86_instruction_info info = {
432 .intercept = intercept,
433 .rep_prefix = ctxt->decode.rep_prefix,
434 .modrm_mod = ctxt->decode.modrm_mod,
435 .modrm_reg = ctxt->decode.modrm_reg,
436 .modrm_rm = ctxt->decode.modrm_rm,
437 .src_val = ctxt->decode.src.val64,
438 .src_bytes = ctxt->decode.src.bytes,
439 .dst_bytes = ctxt->decode.dst.bytes,
440 .ad_bytes = ctxt->decode.ad_bytes,
441 .next_rip = ctxt->eip,
442 };
443
444 return ctxt->ops->intercept(ctxt, &info, stage);
445}
446
586static inline unsigned long ad_mask(struct decode_cache *c) 447static inline unsigned long ad_mask(struct decode_cache *c)
587{ 448{
588 return (1UL << (c->ad_bytes << 3)) - 1; 449 return (1UL << (c->ad_bytes << 3)) - 1;
@@ -599,9 +460,9 @@ address_mask(struct decode_cache *c, unsigned long reg)
599} 460}
600 461
601static inline unsigned long 462static inline unsigned long
602register_address(struct decode_cache *c, unsigned long base, unsigned long reg) 463register_address(struct decode_cache *c, unsigned long reg)
603{ 464{
604 return base + address_mask(c, reg); 465 return address_mask(c, reg);
605} 466}
606 467
607static inline void 468static inline void
@@ -618,6 +479,13 @@ static inline void jmp_rel(struct decode_cache *c, int rel)
618 register_address_increment(c, &c->eip, rel); 479 register_address_increment(c, &c->eip, rel);
619} 480}
620 481
482static u32 desc_limit_scaled(struct desc_struct *desc)
483{
484 u32 limit = get_desc_limit(desc);
485
486 return desc->g ? (limit << 12) | 0xfff : limit;
487}
488
621static void set_seg_override(struct decode_cache *c, int seg) 489static void set_seg_override(struct decode_cache *c, int seg)
622{ 490{
623 c->has_seg_override = true; 491 c->has_seg_override = true;
@@ -630,60 +498,177 @@ static unsigned long seg_base(struct x86_emulate_ctxt *ctxt,
630 if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS) 498 if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS)
631 return 0; 499 return 0;
632 500
633 return ops->get_cached_segment_base(seg, ctxt->vcpu); 501 return ops->get_cached_segment_base(ctxt, seg);
634} 502}
635 503
636static unsigned long seg_override_base(struct x86_emulate_ctxt *ctxt, 504static unsigned seg_override(struct x86_emulate_ctxt *ctxt,
637 struct x86_emulate_ops *ops, 505 struct decode_cache *c)
638 struct decode_cache *c)
639{ 506{
640 if (!c->has_seg_override) 507 if (!c->has_seg_override)
641 return 0; 508 return 0;
642 509
643 return seg_base(ctxt, ops, c->seg_override); 510 return c->seg_override;
644} 511}
645 512
646static unsigned long es_base(struct x86_emulate_ctxt *ctxt, 513static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec,
647 struct x86_emulate_ops *ops) 514 u32 error, bool valid)
648{ 515{
649 return seg_base(ctxt, ops, VCPU_SREG_ES); 516 ctxt->exception.vector = vec;
517 ctxt->exception.error_code = error;
518 ctxt->exception.error_code_valid = valid;
519 return X86EMUL_PROPAGATE_FAULT;
650} 520}
651 521
652static unsigned long ss_base(struct x86_emulate_ctxt *ctxt, 522static int emulate_db(struct x86_emulate_ctxt *ctxt)
653 struct x86_emulate_ops *ops) 523{
524 return emulate_exception(ctxt, DB_VECTOR, 0, false);
525}
526
527static int emulate_gp(struct x86_emulate_ctxt *ctxt, int err)
528{
529 return emulate_exception(ctxt, GP_VECTOR, err, true);
530}
531
532static int emulate_ss(struct x86_emulate_ctxt *ctxt, int err)
533{
534 return emulate_exception(ctxt, SS_VECTOR, err, true);
535}
536
537static int emulate_ud(struct x86_emulate_ctxt *ctxt)
538{
539 return emulate_exception(ctxt, UD_VECTOR, 0, false);
540}
541
542static int emulate_ts(struct x86_emulate_ctxt *ctxt, int err)
543{
544 return emulate_exception(ctxt, TS_VECTOR, err, true);
545}
546
547static int emulate_de(struct x86_emulate_ctxt *ctxt)
654{ 548{
655 return seg_base(ctxt, ops, VCPU_SREG_SS); 549 return emulate_exception(ctxt, DE_VECTOR, 0, false);
656} 550}
657 551
658static void emulate_exception(struct x86_emulate_ctxt *ctxt, int vec, 552static int emulate_nm(struct x86_emulate_ctxt *ctxt)
659 u32 error, bool valid)
660{ 553{
661 ctxt->exception = vec; 554 return emulate_exception(ctxt, NM_VECTOR, 0, false);
662 ctxt->error_code = error;
663 ctxt->error_code_valid = valid;
664 ctxt->restart = false;
665} 555}
666 556
667static void emulate_gp(struct x86_emulate_ctxt *ctxt, int err) 557static u16 get_segment_selector(struct x86_emulate_ctxt *ctxt, unsigned seg)
668{ 558{
669 emulate_exception(ctxt, GP_VECTOR, err, true); 559 u16 selector;
560 struct desc_struct desc;
561
562 ctxt->ops->get_segment(ctxt, &selector, &desc, NULL, seg);
563 return selector;
670} 564}
671 565
672static void emulate_pf(struct x86_emulate_ctxt *ctxt, unsigned long addr, 566static void set_segment_selector(struct x86_emulate_ctxt *ctxt, u16 selector,
673 int err) 567 unsigned seg)
674{ 568{
675 ctxt->cr2 = addr; 569 u16 dummy;
676 emulate_exception(ctxt, PF_VECTOR, err, true); 570 u32 base3;
571 struct desc_struct desc;
572
573 ctxt->ops->get_segment(ctxt, &dummy, &desc, &base3, seg);
574 ctxt->ops->set_segment(ctxt, selector, &desc, base3, seg);
677} 575}
678 576
679static void emulate_ud(struct x86_emulate_ctxt *ctxt) 577static int __linearize(struct x86_emulate_ctxt *ctxt,
578 struct segmented_address addr,
579 unsigned size, bool write, bool fetch,
580 ulong *linear)
680{ 581{
681 emulate_exception(ctxt, UD_VECTOR, 0, false); 582 struct decode_cache *c = &ctxt->decode;
583 struct desc_struct desc;
584 bool usable;
585 ulong la;
586 u32 lim;
587 u16 sel;
588 unsigned cpl, rpl;
589
590 la = seg_base(ctxt, ctxt->ops, addr.seg) + addr.ea;
591 switch (ctxt->mode) {
592 case X86EMUL_MODE_REAL:
593 break;
594 case X86EMUL_MODE_PROT64:
595 if (((signed long)la << 16) >> 16 != la)
596 return emulate_gp(ctxt, 0);
597 break;
598 default:
599 usable = ctxt->ops->get_segment(ctxt, &sel, &desc, NULL,
600 addr.seg);
601 if (!usable)
602 goto bad;
603 /* code segment or read-only data segment */
604 if (((desc.type & 8) || !(desc.type & 2)) && write)
605 goto bad;
606 /* unreadable code segment */
607 if (!fetch && (desc.type & 8) && !(desc.type & 2))
608 goto bad;
609 lim = desc_limit_scaled(&desc);
610 if ((desc.type & 8) || !(desc.type & 4)) {
611 /* expand-up segment */
612 if (addr.ea > lim || (u32)(addr.ea + size - 1) > lim)
613 goto bad;
614 } else {
615 /* exapand-down segment */
616 if (addr.ea <= lim || (u32)(addr.ea + size - 1) <= lim)
617 goto bad;
618 lim = desc.d ? 0xffffffff : 0xffff;
619 if (addr.ea > lim || (u32)(addr.ea + size - 1) > lim)
620 goto bad;
621 }
622 cpl = ctxt->ops->cpl(ctxt);
623 rpl = sel & 3;
624 cpl = max(cpl, rpl);
625 if (!(desc.type & 8)) {
626 /* data segment */
627 if (cpl > desc.dpl)
628 goto bad;
629 } else if ((desc.type & 8) && !(desc.type & 4)) {
630 /* nonconforming code segment */
631 if (cpl != desc.dpl)
632 goto bad;
633 } else if ((desc.type & 8) && (desc.type & 4)) {
634 /* conforming code segment */
635 if (cpl < desc.dpl)
636 goto bad;
637 }
638 break;
639 }
640 if (fetch ? ctxt->mode != X86EMUL_MODE_PROT64 : c->ad_bytes != 8)
641 la &= (u32)-1;
642 *linear = la;
643 return X86EMUL_CONTINUE;
644bad:
645 if (addr.seg == VCPU_SREG_SS)
646 return emulate_ss(ctxt, addr.seg);
647 else
648 return emulate_gp(ctxt, addr.seg);
682} 649}
683 650
684static void emulate_ts(struct x86_emulate_ctxt *ctxt, int err) 651static int linearize(struct x86_emulate_ctxt *ctxt,
652 struct segmented_address addr,
653 unsigned size, bool write,
654 ulong *linear)
685{ 655{
686 emulate_exception(ctxt, TS_VECTOR, err, true); 656 return __linearize(ctxt, addr, size, write, false, linear);
657}
658
659
660static int segmented_read_std(struct x86_emulate_ctxt *ctxt,
661 struct segmented_address addr,
662 void *data,
663 unsigned size)
664{
665 int rc;
666 ulong linear;
667
668 rc = linearize(ctxt, addr, size, false, &linear);
669 if (rc != X86EMUL_CONTINUE)
670 return rc;
671 return ctxt->ops->read_std(ctxt, linear, data, size, &ctxt->exception);
687} 672}
688 673
689static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, 674static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
@@ -695,10 +680,15 @@ static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
695 int size, cur_size; 680 int size, cur_size;
696 681
697 if (eip == fc->end) { 682 if (eip == fc->end) {
683 unsigned long linear;
684 struct segmented_address addr = { .seg=VCPU_SREG_CS, .ea=eip};
698 cur_size = fc->end - fc->start; 685 cur_size = fc->end - fc->start;
699 size = min(15UL - cur_size, PAGE_SIZE - offset_in_page(eip)); 686 size = min(15UL - cur_size, PAGE_SIZE - offset_in_page(eip));
700 rc = ops->fetch(ctxt->cs_base + eip, fc->data + cur_size, 687 rc = __linearize(ctxt, addr, size, false, true, &linear);
701 size, ctxt->vcpu, NULL); 688 if (rc != X86EMUL_CONTINUE)
689 return rc;
690 rc = ops->fetch(ctxt, linear, fc->data + cur_size,
691 size, &ctxt->exception);
702 if (rc != X86EMUL_CONTINUE) 692 if (rc != X86EMUL_CONTINUE)
703 return rc; 693 return rc;
704 fc->end += size; 694 fc->end += size;
@@ -741,8 +731,7 @@ static void *decode_register(u8 modrm_reg, unsigned long *regs,
741} 731}
742 732
743static int read_descriptor(struct x86_emulate_ctxt *ctxt, 733static int read_descriptor(struct x86_emulate_ctxt *ctxt,
744 struct x86_emulate_ops *ops, 734 struct segmented_address addr,
745 void *ptr,
746 u16 *size, unsigned long *address, int op_bytes) 735 u16 *size, unsigned long *address, int op_bytes)
747{ 736{
748 int rc; 737 int rc;
@@ -750,12 +739,11 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt,
750 if (op_bytes == 2) 739 if (op_bytes == 2)
751 op_bytes = 3; 740 op_bytes = 3;
752 *address = 0; 741 *address = 0;
753 rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2, 742 rc = segmented_read_std(ctxt, addr, size, 2);
754 ctxt->vcpu, NULL);
755 if (rc != X86EMUL_CONTINUE) 743 if (rc != X86EMUL_CONTINUE)
756 return rc; 744 return rc;
757 rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes, 745 addr.ea += 2;
758 ctxt->vcpu, NULL); 746 rc = segmented_read_std(ctxt, addr, address, op_bytes);
759 return rc; 747 return rc;
760} 748}
761 749
@@ -794,7 +782,81 @@ static int test_cc(unsigned int condition, unsigned int flags)
794 return (!!rc ^ (condition & 1)); 782 return (!!rc ^ (condition & 1));
795} 783}
796 784
797static void decode_register_operand(struct operand *op, 785static void fetch_register_operand(struct operand *op)
786{
787 switch (op->bytes) {
788 case 1:
789 op->val = *(u8 *)op->addr.reg;
790 break;
791 case 2:
792 op->val = *(u16 *)op->addr.reg;
793 break;
794 case 4:
795 op->val = *(u32 *)op->addr.reg;
796 break;
797 case 8:
798 op->val = *(u64 *)op->addr.reg;
799 break;
800 }
801}
802
803static void read_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, int reg)
804{
805 ctxt->ops->get_fpu(ctxt);
806 switch (reg) {
807 case 0: asm("movdqu %%xmm0, %0" : "=m"(*data)); break;
808 case 1: asm("movdqu %%xmm1, %0" : "=m"(*data)); break;
809 case 2: asm("movdqu %%xmm2, %0" : "=m"(*data)); break;
810 case 3: asm("movdqu %%xmm3, %0" : "=m"(*data)); break;
811 case 4: asm("movdqu %%xmm4, %0" : "=m"(*data)); break;
812 case 5: asm("movdqu %%xmm5, %0" : "=m"(*data)); break;
813 case 6: asm("movdqu %%xmm6, %0" : "=m"(*data)); break;
814 case 7: asm("movdqu %%xmm7, %0" : "=m"(*data)); break;
815#ifdef CONFIG_X86_64
816 case 8: asm("movdqu %%xmm8, %0" : "=m"(*data)); break;
817 case 9: asm("movdqu %%xmm9, %0" : "=m"(*data)); break;
818 case 10: asm("movdqu %%xmm10, %0" : "=m"(*data)); break;
819 case 11: asm("movdqu %%xmm11, %0" : "=m"(*data)); break;
820 case 12: asm("movdqu %%xmm12, %0" : "=m"(*data)); break;
821 case 13: asm("movdqu %%xmm13, %0" : "=m"(*data)); break;
822 case 14: asm("movdqu %%xmm14, %0" : "=m"(*data)); break;
823 case 15: asm("movdqu %%xmm15, %0" : "=m"(*data)); break;
824#endif
825 default: BUG();
826 }
827 ctxt->ops->put_fpu(ctxt);
828}
829
830static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data,
831 int reg)
832{
833 ctxt->ops->get_fpu(ctxt);
834 switch (reg) {
835 case 0: asm("movdqu %0, %%xmm0" : : "m"(*data)); break;
836 case 1: asm("movdqu %0, %%xmm1" : : "m"(*data)); break;
837 case 2: asm("movdqu %0, %%xmm2" : : "m"(*data)); break;
838 case 3: asm("movdqu %0, %%xmm3" : : "m"(*data)); break;
839 case 4: asm("movdqu %0, %%xmm4" : : "m"(*data)); break;
840 case 5: asm("movdqu %0, %%xmm5" : : "m"(*data)); break;
841 case 6: asm("movdqu %0, %%xmm6" : : "m"(*data)); break;
842 case 7: asm("movdqu %0, %%xmm7" : : "m"(*data)); break;
843#ifdef CONFIG_X86_64
844 case 8: asm("movdqu %0, %%xmm8" : : "m"(*data)); break;
845 case 9: asm("movdqu %0, %%xmm9" : : "m"(*data)); break;
846 case 10: asm("movdqu %0, %%xmm10" : : "m"(*data)); break;
847 case 11: asm("movdqu %0, %%xmm11" : : "m"(*data)); break;
848 case 12: asm("movdqu %0, %%xmm12" : : "m"(*data)); break;
849 case 13: asm("movdqu %0, %%xmm13" : : "m"(*data)); break;
850 case 14: asm("movdqu %0, %%xmm14" : : "m"(*data)); break;
851 case 15: asm("movdqu %0, %%xmm15" : : "m"(*data)); break;
852#endif
853 default: BUG();
854 }
855 ctxt->ops->put_fpu(ctxt);
856}
857
858static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
859 struct operand *op,
798 struct decode_cache *c, 860 struct decode_cache *c,
799 int inhibit_bytereg) 861 int inhibit_bytereg)
800{ 862{
@@ -803,36 +865,36 @@ static void decode_register_operand(struct operand *op,
803 865
804 if (!(c->d & ModRM)) 866 if (!(c->d & ModRM))
805 reg = (c->b & 7) | ((c->rex_prefix & 1) << 3); 867 reg = (c->b & 7) | ((c->rex_prefix & 1) << 3);
868
869 if (c->d & Sse) {
870 op->type = OP_XMM;
871 op->bytes = 16;
872 op->addr.xmm = reg;
873 read_sse_reg(ctxt, &op->vec_val, reg);
874 return;
875 }
876
806 op->type = OP_REG; 877 op->type = OP_REG;
807 if ((c->d & ByteOp) && !inhibit_bytereg) { 878 if ((c->d & ByteOp) && !inhibit_bytereg) {
808 op->ptr = decode_register(reg, c->regs, highbyte_regs); 879 op->addr.reg = decode_register(reg, c->regs, highbyte_regs);
809 op->val = *(u8 *)op->ptr;
810 op->bytes = 1; 880 op->bytes = 1;
811 } else { 881 } else {
812 op->ptr = decode_register(reg, c->regs, 0); 882 op->addr.reg = decode_register(reg, c->regs, 0);
813 op->bytes = c->op_bytes; 883 op->bytes = c->op_bytes;
814 switch (op->bytes) {
815 case 2:
816 op->val = *(u16 *)op->ptr;
817 break;
818 case 4:
819 op->val = *(u32 *)op->ptr;
820 break;
821 case 8:
822 op->val = *(u64 *) op->ptr;
823 break;
824 }
825 } 884 }
885 fetch_register_operand(op);
826 op->orig_val = op->val; 886 op->orig_val = op->val;
827} 887}
828 888
829static int decode_modrm(struct x86_emulate_ctxt *ctxt, 889static int decode_modrm(struct x86_emulate_ctxt *ctxt,
830 struct x86_emulate_ops *ops) 890 struct x86_emulate_ops *ops,
891 struct operand *op)
831{ 892{
832 struct decode_cache *c = &ctxt->decode; 893 struct decode_cache *c = &ctxt->decode;
833 u8 sib; 894 u8 sib;
834 int index_reg = 0, base_reg = 0, scale; 895 int index_reg = 0, base_reg = 0, scale;
835 int rc = X86EMUL_CONTINUE; 896 int rc = X86EMUL_CONTINUE;
897 ulong modrm_ea = 0;
836 898
837 if (c->rex_prefix) { 899 if (c->rex_prefix) {
838 c->modrm_reg = (c->rex_prefix & 4) << 1; /* REX.R */ 900 c->modrm_reg = (c->rex_prefix & 4) << 1; /* REX.R */
@@ -844,16 +906,26 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
844 c->modrm_mod |= (c->modrm & 0xc0) >> 6; 906 c->modrm_mod |= (c->modrm & 0xc0) >> 6;
845 c->modrm_reg |= (c->modrm & 0x38) >> 3; 907 c->modrm_reg |= (c->modrm & 0x38) >> 3;
846 c->modrm_rm |= (c->modrm & 0x07); 908 c->modrm_rm |= (c->modrm & 0x07);
847 c->modrm_ea = 0; 909 c->modrm_seg = VCPU_SREG_DS;
848 c->use_modrm_ea = 1;
849 910
850 if (c->modrm_mod == 3) { 911 if (c->modrm_mod == 3) {
851 c->modrm_ptr = decode_register(c->modrm_rm, 912 op->type = OP_REG;
913 op->bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
914 op->addr.reg = decode_register(c->modrm_rm,
852 c->regs, c->d & ByteOp); 915 c->regs, c->d & ByteOp);
853 c->modrm_val = *(unsigned long *)c->modrm_ptr; 916 if (c->d & Sse) {
917 op->type = OP_XMM;
918 op->bytes = 16;
919 op->addr.xmm = c->modrm_rm;
920 read_sse_reg(ctxt, &op->vec_val, c->modrm_rm);
921 return rc;
922 }
923 fetch_register_operand(op);
854 return rc; 924 return rc;
855 } 925 }
856 926
927 op->type = OP_MEM;
928
857 if (c->ad_bytes == 2) { 929 if (c->ad_bytes == 2) {
858 unsigned bx = c->regs[VCPU_REGS_RBX]; 930 unsigned bx = c->regs[VCPU_REGS_RBX];
859 unsigned bp = c->regs[VCPU_REGS_RBP]; 931 unsigned bp = c->regs[VCPU_REGS_RBP];
@@ -864,47 +936,46 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
864 switch (c->modrm_mod) { 936 switch (c->modrm_mod) {
865 case 0: 937 case 0:
866 if (c->modrm_rm == 6) 938 if (c->modrm_rm == 6)
867 c->modrm_ea += insn_fetch(u16, 2, c->eip); 939 modrm_ea += insn_fetch(u16, 2, c->eip);
868 break; 940 break;
869 case 1: 941 case 1:
870 c->modrm_ea += insn_fetch(s8, 1, c->eip); 942 modrm_ea += insn_fetch(s8, 1, c->eip);
871 break; 943 break;
872 case 2: 944 case 2:
873 c->modrm_ea += insn_fetch(u16, 2, c->eip); 945 modrm_ea += insn_fetch(u16, 2, c->eip);
874 break; 946 break;
875 } 947 }
876 switch (c->modrm_rm) { 948 switch (c->modrm_rm) {
877 case 0: 949 case 0:
878 c->modrm_ea += bx + si; 950 modrm_ea += bx + si;
879 break; 951 break;
880 case 1: 952 case 1:
881 c->modrm_ea += bx + di; 953 modrm_ea += bx + di;
882 break; 954 break;
883 case 2: 955 case 2:
884 c->modrm_ea += bp + si; 956 modrm_ea += bp + si;
885 break; 957 break;
886 case 3: 958 case 3:
887 c->modrm_ea += bp + di; 959 modrm_ea += bp + di;
888 break; 960 break;
889 case 4: 961 case 4:
890 c->modrm_ea += si; 962 modrm_ea += si;
891 break; 963 break;
892 case 5: 964 case 5:
893 c->modrm_ea += di; 965 modrm_ea += di;
894 break; 966 break;
895 case 6: 967 case 6:
896 if (c->modrm_mod != 0) 968 if (c->modrm_mod != 0)
897 c->modrm_ea += bp; 969 modrm_ea += bp;
898 break; 970 break;
899 case 7: 971 case 7:
900 c->modrm_ea += bx; 972 modrm_ea += bx;
901 break; 973 break;
902 } 974 }
903 if (c->modrm_rm == 2 || c->modrm_rm == 3 || 975 if (c->modrm_rm == 2 || c->modrm_rm == 3 ||
904 (c->modrm_rm == 6 && c->modrm_mod != 0)) 976 (c->modrm_rm == 6 && c->modrm_mod != 0))
905 if (!c->has_seg_override) 977 c->modrm_seg = VCPU_SREG_SS;
906 set_seg_override(c, VCPU_SREG_SS); 978 modrm_ea = (u16)modrm_ea;
907 c->modrm_ea = (u16)c->modrm_ea;
908 } else { 979 } else {
909 /* 32/64-bit ModR/M decode. */ 980 /* 32/64-bit ModR/M decode. */
910 if ((c->modrm_rm & 7) == 4) { 981 if ((c->modrm_rm & 7) == 4) {
@@ -914,410 +985,74 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
914 scale = sib >> 6; 985 scale = sib >> 6;
915 986
916 if ((base_reg & 7) == 5 && c->modrm_mod == 0) 987 if ((base_reg & 7) == 5 && c->modrm_mod == 0)
917 c->modrm_ea += insn_fetch(s32, 4, c->eip); 988 modrm_ea += insn_fetch(s32, 4, c->eip);
918 else 989 else
919 c->modrm_ea += c->regs[base_reg]; 990 modrm_ea += c->regs[base_reg];
920 if (index_reg != 4) 991 if (index_reg != 4)
921 c->modrm_ea += c->regs[index_reg] << scale; 992 modrm_ea += c->regs[index_reg] << scale;
922 } else if ((c->modrm_rm & 7) == 5 && c->modrm_mod == 0) { 993 } else if ((c->modrm_rm & 7) == 5 && c->modrm_mod == 0) {
923 if (ctxt->mode == X86EMUL_MODE_PROT64) 994 if (ctxt->mode == X86EMUL_MODE_PROT64)
924 c->rip_relative = 1; 995 c->rip_relative = 1;
925 } else 996 } else
926 c->modrm_ea += c->regs[c->modrm_rm]; 997 modrm_ea += c->regs[c->modrm_rm];
927 switch (c->modrm_mod) { 998 switch (c->modrm_mod) {
928 case 0: 999 case 0:
929 if (c->modrm_rm == 5) 1000 if (c->modrm_rm == 5)
930 c->modrm_ea += insn_fetch(s32, 4, c->eip); 1001 modrm_ea += insn_fetch(s32, 4, c->eip);
931 break; 1002 break;
932 case 1: 1003 case 1:
933 c->modrm_ea += insn_fetch(s8, 1, c->eip); 1004 modrm_ea += insn_fetch(s8, 1, c->eip);
934 break; 1005 break;
935 case 2: 1006 case 2:
936 c->modrm_ea += insn_fetch(s32, 4, c->eip); 1007 modrm_ea += insn_fetch(s32, 4, c->eip);
937 break; 1008 break;
938 } 1009 }
939 } 1010 }
1011 op->addr.mem.ea = modrm_ea;
940done: 1012done:
941 return rc; 1013 return rc;
942} 1014}
943 1015
944static int decode_abs(struct x86_emulate_ctxt *ctxt, 1016static int decode_abs(struct x86_emulate_ctxt *ctxt,
945 struct x86_emulate_ops *ops) 1017 struct x86_emulate_ops *ops,
1018 struct operand *op)
946{ 1019{
947 struct decode_cache *c = &ctxt->decode; 1020 struct decode_cache *c = &ctxt->decode;
948 int rc = X86EMUL_CONTINUE; 1021 int rc = X86EMUL_CONTINUE;
949 1022
1023 op->type = OP_MEM;
950 switch (c->ad_bytes) { 1024 switch (c->ad_bytes) {
951 case 2: 1025 case 2:
952 c->modrm_ea = insn_fetch(u16, 2, c->eip); 1026 op->addr.mem.ea = insn_fetch(u16, 2, c->eip);
953 break; 1027 break;
954 case 4: 1028 case 4:
955 c->modrm_ea = insn_fetch(u32, 4, c->eip); 1029 op->addr.mem.ea = insn_fetch(u32, 4, c->eip);
956 break; 1030 break;
957 case 8: 1031 case 8:
958 c->modrm_ea = insn_fetch(u64, 8, c->eip); 1032 op->addr.mem.ea = insn_fetch(u64, 8, c->eip);
959 break; 1033 break;
960 } 1034 }
961done: 1035done:
962 return rc; 1036 return rc;
963} 1037}
964 1038
965int 1039static void fetch_bit_operand(struct decode_cache *c)
966x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
967{ 1040{
968 struct decode_cache *c = &ctxt->decode; 1041 long sv = 0, mask;
969 int rc = X86EMUL_CONTINUE;
970 int mode = ctxt->mode;
971 int def_op_bytes, def_ad_bytes, group;
972
973
974 /* we cannot decode insn before we complete previous rep insn */
975 WARN_ON(ctxt->restart);
976
977 c->eip = ctxt->eip;
978 c->fetch.start = c->fetch.end = c->eip;
979 ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS);
980
981 switch (mode) {
982 case X86EMUL_MODE_REAL:
983 case X86EMUL_MODE_VM86:
984 case X86EMUL_MODE_PROT16:
985 def_op_bytes = def_ad_bytes = 2;
986 break;
987 case X86EMUL_MODE_PROT32:
988 def_op_bytes = def_ad_bytes = 4;
989 break;
990#ifdef CONFIG_X86_64
991 case X86EMUL_MODE_PROT64:
992 def_op_bytes = 4;
993 def_ad_bytes = 8;
994 break;
995#endif
996 default:
997 return -1;
998 }
999
1000 c->op_bytes = def_op_bytes;
1001 c->ad_bytes = def_ad_bytes;
1002
1003 /* Legacy prefixes. */
1004 for (;;) {
1005 switch (c->b = insn_fetch(u8, 1, c->eip)) {
1006 case 0x66: /* operand-size override */
1007 /* switch between 2/4 bytes */
1008 c->op_bytes = def_op_bytes ^ 6;
1009 break;
1010 case 0x67: /* address-size override */
1011 if (mode == X86EMUL_MODE_PROT64)
1012 /* switch between 4/8 bytes */
1013 c->ad_bytes = def_ad_bytes ^ 12;
1014 else
1015 /* switch between 2/4 bytes */
1016 c->ad_bytes = def_ad_bytes ^ 6;
1017 break;
1018 case 0x26: /* ES override */
1019 case 0x2e: /* CS override */
1020 case 0x36: /* SS override */
1021 case 0x3e: /* DS override */
1022 set_seg_override(c, (c->b >> 3) & 3);
1023 break;
1024 case 0x64: /* FS override */
1025 case 0x65: /* GS override */
1026 set_seg_override(c, c->b & 7);
1027 break;
1028 case 0x40 ... 0x4f: /* REX */
1029 if (mode != X86EMUL_MODE_PROT64)
1030 goto done_prefixes;
1031 c->rex_prefix = c->b;
1032 continue;
1033 case 0xf0: /* LOCK */
1034 c->lock_prefix = 1;
1035 break;
1036 case 0xf2: /* REPNE/REPNZ */
1037 c->rep_prefix = REPNE_PREFIX;
1038 break;
1039 case 0xf3: /* REP/REPE/REPZ */
1040 c->rep_prefix = REPE_PREFIX;
1041 break;
1042 default:
1043 goto done_prefixes;
1044 }
1045
1046 /* Any legacy prefix after a REX prefix nullifies its effect. */
1047 1042
1048 c->rex_prefix = 0; 1043 if (c->dst.type == OP_MEM && c->src.type == OP_REG) {
1049 } 1044 mask = ~(c->dst.bytes * 8 - 1);
1050
1051done_prefixes:
1052
1053 /* REX prefix. */
1054 if (c->rex_prefix)
1055 if (c->rex_prefix & 8)
1056 c->op_bytes = 8; /* REX.W */
1057 1045
1058 /* Opcode byte(s). */ 1046 if (c->src.bytes == 2)
1059 c->d = opcode_table[c->b]; 1047 sv = (s16)c->src.val & (s16)mask;
1060 if (c->d == 0) { 1048 else if (c->src.bytes == 4)
1061 /* Two-byte opcode? */ 1049 sv = (s32)c->src.val & (s32)mask;
1062 if (c->b == 0x0f) {
1063 c->twobyte = 1;
1064 c->b = insn_fetch(u8, 1, c->eip);
1065 c->d = twobyte_table[c->b];
1066 }
1067 }
1068
1069 if (c->d & Group) {
1070 group = c->d & GroupMask;
1071 c->modrm = insn_fetch(u8, 1, c->eip);
1072 --c->eip;
1073
1074 group = (group << 3) + ((c->modrm >> 3) & 7);
1075 if ((c->d & GroupDual) && (c->modrm >> 6) == 3)
1076 c->d = group2_table[group];
1077 else
1078 c->d = group_table[group];
1079 }
1080 1050
1081 /* Unrecognised? */ 1051 c->dst.addr.mem.ea += (sv >> 3);
1082 if (c->d == 0) {
1083 DPRINTF("Cannot emulate %02x\n", c->b);
1084 return -1;
1085 } 1052 }
1086 1053
1087 if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack)) 1054 /* only subword offset */
1088 c->op_bytes = 8; 1055 c->src.val &= (c->dst.bytes << 3) - 1;
1089
1090 /* ModRM and SIB bytes. */
1091 if (c->d & ModRM)
1092 rc = decode_modrm(ctxt, ops);
1093 else if (c->d & MemAbs)
1094 rc = decode_abs(ctxt, ops);
1095 if (rc != X86EMUL_CONTINUE)
1096 goto done;
1097
1098 if (!c->has_seg_override)
1099 set_seg_override(c, VCPU_SREG_DS);
1100
1101 if (!(!c->twobyte && c->b == 0x8d))
1102 c->modrm_ea += seg_override_base(ctxt, ops, c);
1103
1104 if (c->ad_bytes != 8)
1105 c->modrm_ea = (u32)c->modrm_ea;
1106
1107 if (c->rip_relative)
1108 c->modrm_ea += c->eip;
1109
1110 /*
1111 * Decode and fetch the source operand: register, memory
1112 * or immediate.
1113 */
1114 switch (c->d & SrcMask) {
1115 case SrcNone:
1116 break;
1117 case SrcReg:
1118 decode_register_operand(&c->src, c, 0);
1119 break;
1120 case SrcMem16:
1121 c->src.bytes = 2;
1122 goto srcmem_common;
1123 case SrcMem32:
1124 c->src.bytes = 4;
1125 goto srcmem_common;
1126 case SrcMem:
1127 c->src.bytes = (c->d & ByteOp) ? 1 :
1128 c->op_bytes;
1129 /* Don't fetch the address for invlpg: it could be unmapped. */
1130 if (c->twobyte && c->b == 0x01 && c->modrm_reg == 7)
1131 break;
1132 srcmem_common:
1133 /*
1134 * For instructions with a ModR/M byte, switch to register
1135 * access if Mod = 3.
1136 */
1137 if ((c->d & ModRM) && c->modrm_mod == 3) {
1138 c->src.type = OP_REG;
1139 c->src.val = c->modrm_val;
1140 c->src.ptr = c->modrm_ptr;
1141 break;
1142 }
1143 c->src.type = OP_MEM;
1144 c->src.ptr = (unsigned long *)c->modrm_ea;
1145 c->src.val = 0;
1146 break;
1147 case SrcImm:
1148 case SrcImmU:
1149 c->src.type = OP_IMM;
1150 c->src.ptr = (unsigned long *)c->eip;
1151 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1152 if (c->src.bytes == 8)
1153 c->src.bytes = 4;
1154 /* NB. Immediates are sign-extended as necessary. */
1155 switch (c->src.bytes) {
1156 case 1:
1157 c->src.val = insn_fetch(s8, 1, c->eip);
1158 break;
1159 case 2:
1160 c->src.val = insn_fetch(s16, 2, c->eip);
1161 break;
1162 case 4:
1163 c->src.val = insn_fetch(s32, 4, c->eip);
1164 break;
1165 }
1166 if ((c->d & SrcMask) == SrcImmU) {
1167 switch (c->src.bytes) {
1168 case 1:
1169 c->src.val &= 0xff;
1170 break;
1171 case 2:
1172 c->src.val &= 0xffff;
1173 break;
1174 case 4:
1175 c->src.val &= 0xffffffff;
1176 break;
1177 }
1178 }
1179 break;
1180 case SrcImmByte:
1181 case SrcImmUByte:
1182 c->src.type = OP_IMM;
1183 c->src.ptr = (unsigned long *)c->eip;
1184 c->src.bytes = 1;
1185 if ((c->d & SrcMask) == SrcImmByte)
1186 c->src.val = insn_fetch(s8, 1, c->eip);
1187 else
1188 c->src.val = insn_fetch(u8, 1, c->eip);
1189 break;
1190 case SrcAcc:
1191 c->src.type = OP_REG;
1192 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1193 c->src.ptr = &c->regs[VCPU_REGS_RAX];
1194 switch (c->src.bytes) {
1195 case 1:
1196 c->src.val = *(u8 *)c->src.ptr;
1197 break;
1198 case 2:
1199 c->src.val = *(u16 *)c->src.ptr;
1200 break;
1201 case 4:
1202 c->src.val = *(u32 *)c->src.ptr;
1203 break;
1204 case 8:
1205 c->src.val = *(u64 *)c->src.ptr;
1206 break;
1207 }
1208 break;
1209 case SrcOne:
1210 c->src.bytes = 1;
1211 c->src.val = 1;
1212 break;
1213 case SrcSI:
1214 c->src.type = OP_MEM;
1215 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1216 c->src.ptr = (unsigned long *)
1217 register_address(c, seg_override_base(ctxt, ops, c),
1218 c->regs[VCPU_REGS_RSI]);
1219 c->src.val = 0;
1220 break;
1221 case SrcImmFAddr:
1222 c->src.type = OP_IMM;
1223 c->src.ptr = (unsigned long *)c->eip;
1224 c->src.bytes = c->op_bytes + 2;
1225 insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip);
1226 break;
1227 case SrcMemFAddr:
1228 c->src.type = OP_MEM;
1229 c->src.ptr = (unsigned long *)c->modrm_ea;
1230 c->src.bytes = c->op_bytes + 2;
1231 break;
1232 }
1233
1234 /*
1235 * Decode and fetch the second source operand: register, memory
1236 * or immediate.
1237 */
1238 switch (c->d & Src2Mask) {
1239 case Src2None:
1240 break;
1241 case Src2CL:
1242 c->src2.bytes = 1;
1243 c->src2.val = c->regs[VCPU_REGS_RCX] & 0x8;
1244 break;
1245 case Src2ImmByte:
1246 c->src2.type = OP_IMM;
1247 c->src2.ptr = (unsigned long *)c->eip;
1248 c->src2.bytes = 1;
1249 c->src2.val = insn_fetch(u8, 1, c->eip);
1250 break;
1251 case Src2One:
1252 c->src2.bytes = 1;
1253 c->src2.val = 1;
1254 break;
1255 }
1256
1257 /* Decode and fetch the destination operand: register or memory. */
1258 switch (c->d & DstMask) {
1259 case ImplicitOps:
1260 /* Special instructions do their own operand decoding. */
1261 return 0;
1262 case DstReg:
1263 decode_register_operand(&c->dst, c,
1264 c->twobyte && (c->b == 0xb6 || c->b == 0xb7));
1265 break;
1266 case DstMem:
1267 case DstMem64:
1268 if ((c->d & ModRM) && c->modrm_mod == 3) {
1269 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1270 c->dst.type = OP_REG;
1271 c->dst.val = c->dst.orig_val = c->modrm_val;
1272 c->dst.ptr = c->modrm_ptr;
1273 break;
1274 }
1275 c->dst.type = OP_MEM;
1276 c->dst.ptr = (unsigned long *)c->modrm_ea;
1277 if ((c->d & DstMask) == DstMem64)
1278 c->dst.bytes = 8;
1279 else
1280 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1281 c->dst.val = 0;
1282 if (c->d & BitOp) {
1283 unsigned long mask = ~(c->dst.bytes * 8 - 1);
1284
1285 c->dst.ptr = (void *)c->dst.ptr +
1286 (c->src.val & mask) / 8;
1287 }
1288 break;
1289 case DstAcc:
1290 c->dst.type = OP_REG;
1291 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1292 c->dst.ptr = &c->regs[VCPU_REGS_RAX];
1293 switch (c->dst.bytes) {
1294 case 1:
1295 c->dst.val = *(u8 *)c->dst.ptr;
1296 break;
1297 case 2:
1298 c->dst.val = *(u16 *)c->dst.ptr;
1299 break;
1300 case 4:
1301 c->dst.val = *(u32 *)c->dst.ptr;
1302 break;
1303 case 8:
1304 c->dst.val = *(u64 *)c->dst.ptr;
1305 break;
1306 }
1307 c->dst.orig_val = c->dst.val;
1308 break;
1309 case DstDI:
1310 c->dst.type = OP_MEM;
1311 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1312 c->dst.ptr = (unsigned long *)
1313 register_address(c, es_base(ctxt, ops),
1314 c->regs[VCPU_REGS_RDI]);
1315 c->dst.val = 0;
1316 break;
1317 }
1318
1319done:
1320 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
1321} 1056}
1322 1057
1323static int read_emulated(struct x86_emulate_ctxt *ctxt, 1058static int read_emulated(struct x86_emulate_ctxt *ctxt,
@@ -1326,7 +1061,6 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt,
1326{ 1061{
1327 int rc; 1062 int rc;
1328 struct read_cache *mc = &ctxt->decode.mem_read; 1063 struct read_cache *mc = &ctxt->decode.mem_read;
1329 u32 err;
1330 1064
1331 while (size) { 1065 while (size) {
1332 int n = min(size, 8u); 1066 int n = min(size, 8u);
@@ -1334,10 +1068,8 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt,
1334 if (mc->pos < mc->end) 1068 if (mc->pos < mc->end)
1335 goto read_cached; 1069 goto read_cached;
1336 1070
1337 rc = ops->read_emulated(addr, mc->data + mc->end, n, &err, 1071 rc = ops->read_emulated(ctxt, addr, mc->data + mc->end, n,
1338 ctxt->vcpu); 1072 &ctxt->exception);
1339 if (rc == X86EMUL_PROPAGATE_FAULT)
1340 emulate_pf(ctxt, addr, err);
1341 if (rc != X86EMUL_CONTINUE) 1073 if (rc != X86EMUL_CONTINUE)
1342 return rc; 1074 return rc;
1343 mc->end += n; 1075 mc->end += n;
@@ -1351,6 +1083,50 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt,
1351 return X86EMUL_CONTINUE; 1083 return X86EMUL_CONTINUE;
1352} 1084}
1353 1085
1086static int segmented_read(struct x86_emulate_ctxt *ctxt,
1087 struct segmented_address addr,
1088 void *data,
1089 unsigned size)
1090{
1091 int rc;
1092 ulong linear;
1093
1094 rc = linearize(ctxt, addr, size, false, &linear);
1095 if (rc != X86EMUL_CONTINUE)
1096 return rc;
1097 return read_emulated(ctxt, ctxt->ops, linear, data, size);
1098}
1099
1100static int segmented_write(struct x86_emulate_ctxt *ctxt,
1101 struct segmented_address addr,
1102 const void *data,
1103 unsigned size)
1104{
1105 int rc;
1106 ulong linear;
1107
1108 rc = linearize(ctxt, addr, size, true, &linear);
1109 if (rc != X86EMUL_CONTINUE)
1110 return rc;
1111 return ctxt->ops->write_emulated(ctxt, linear, data, size,
1112 &ctxt->exception);
1113}
1114
1115static int segmented_cmpxchg(struct x86_emulate_ctxt *ctxt,
1116 struct segmented_address addr,
1117 const void *orig_data, const void *data,
1118 unsigned size)
1119{
1120 int rc;
1121 ulong linear;
1122
1123 rc = linearize(ctxt, addr, size, true, &linear);
1124 if (rc != X86EMUL_CONTINUE)
1125 return rc;
1126 return ctxt->ops->cmpxchg_emulated(ctxt, linear, orig_data, data,
1127 size, &ctxt->exception);
1128}
1129
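The three segmented_* helpers introduced above all follow the same shape: resolve the segment:offset pair to a linear address via linearize(), and only then call the backend memory op, handing it &ctxt->exception so a fault comes back as a return code instead of being injected from inside the helper. A minimal standalone sketch of that shape, with made-up callback types (linearize_cb/read_cb are illustrative, not the emulator's real signatures):

#include <stdint.h>

#define EMUL_CONTINUE 0

struct seg_addr   { unsigned seg; uint64_t ea; };
struct fault_info { int vector; uint32_t error_code; };

/* Hypothetical callbacks standing in for linearize() and ops->read_emulated(). */
typedef int (*linearize_cb)(struct seg_addr addr, unsigned size, int write,
			    uint64_t *linear, struct fault_info *fault);
typedef int (*read_cb)(uint64_t linear, void *data, unsigned size,
		       struct fault_info *fault);

/* Translate seg:offset to a linear address first; only then touch memory.
 * Any fault is reported through the return code and *fault. */
int segmented_read_sketch(linearize_cb linearize, read_cb read,
			  struct seg_addr addr, void *data, unsigned size,
			  struct fault_info *fault)
{
	uint64_t linear;
	int rc = linearize(addr, size, 0, &linear, fault);

	if (rc != EMUL_CONTINUE)
		return rc;	/* e.g. segment limit exceeded */
	return read(linear, data, size, fault);
}

The write and cmpxchg variants in the diff differ only in passing write = true to the linearization step and in which backend callback they invoke afterwards.
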
1354static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, 1130static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
1355 struct x86_emulate_ops *ops, 1131 struct x86_emulate_ops *ops,
1356 unsigned int size, unsigned short port, 1132 unsigned int size, unsigned short port,
@@ -1371,7 +1147,7 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
1371 if (n == 0) 1147 if (n == 0)
1372 n = 1; 1148 n = 1;
1373 rc->pos = rc->end = 0; 1149 rc->pos = rc->end = 0;
1374 if (!ops->pio_in_emulated(size, port, rc->data, n, ctxt->vcpu)) 1150 if (!ops->pio_in_emulated(ctxt, size, port, rc->data, n))
1375 return 0; 1151 return 0;
1376 rc->end = n * size; 1152 rc->end = n * size;
1377 } 1153 }
@@ -1381,27 +1157,22 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
1381 return 1; 1157 return 1;
1382} 1158}
1383 1159
1384static u32 desc_limit_scaled(struct desc_struct *desc)
1385{
1386 u32 limit = get_desc_limit(desc);
1387
1388 return desc->g ? (limit << 12) | 0xfff : limit;
1389}
1390
1391static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt, 1160static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
1392 struct x86_emulate_ops *ops, 1161 struct x86_emulate_ops *ops,
1393 u16 selector, struct desc_ptr *dt) 1162 u16 selector, struct desc_ptr *dt)
1394{ 1163{
1395 if (selector & 1 << 2) { 1164 if (selector & 1 << 2) {
1396 struct desc_struct desc; 1165 struct desc_struct desc;
1166 u16 sel;
1167
1397 memset (dt, 0, sizeof *dt); 1168 memset (dt, 0, sizeof *dt);
1398 if (!ops->get_cached_descriptor(&desc, VCPU_SREG_LDTR, ctxt->vcpu)) 1169 if (!ops->get_segment(ctxt, &sel, &desc, NULL, VCPU_SREG_LDTR))
1399 return; 1170 return;
1400 1171
1401 dt->size = desc_limit_scaled(&desc); /* what if limit > 65535? */ 1172 dt->size = desc_limit_scaled(&desc); /* what if limit > 65535? */
1402 dt->address = get_desc_base(&desc); 1173 dt->address = get_desc_base(&desc);
1403 } else 1174 } else
1404 ops->get_gdt(dt, ctxt->vcpu); 1175 ops->get_gdt(ctxt, dt);
1405} 1176}
1406 1177
1407/* allowed just for 8 bytes segments */ 1178/* allowed just for 8 bytes segments */
@@ -1412,19 +1183,14 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1412 struct desc_ptr dt; 1183 struct desc_ptr dt;
1413 u16 index = selector >> 3; 1184 u16 index = selector >> 3;
1414 int ret; 1185 int ret;
1415 u32 err;
1416 ulong addr; 1186 ulong addr;
1417 1187
1418 get_descriptor_table_ptr(ctxt, ops, selector, &dt); 1188 get_descriptor_table_ptr(ctxt, ops, selector, &dt);
1419 1189
1420 if (dt.size < index * 8 + 7) { 1190 if (dt.size < index * 8 + 7)
1421 emulate_gp(ctxt, selector & 0xfffc); 1191 return emulate_gp(ctxt, selector & 0xfffc);
1422 return X86EMUL_PROPAGATE_FAULT;
1423 }
1424 addr = dt.address + index * 8; 1192 addr = dt.address + index * 8;
1425 ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu, &err); 1193 ret = ops->read_std(ctxt, addr, desc, sizeof *desc, &ctxt->exception);
1426 if (ret == X86EMUL_PROPAGATE_FAULT)
1427 emulate_pf(ctxt, addr, err);
1428 1194
1429 return ret; 1195 return ret;
1430} 1196}
@@ -1436,25 +1202,21 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1436{ 1202{
1437 struct desc_ptr dt; 1203 struct desc_ptr dt;
1438 u16 index = selector >> 3; 1204 u16 index = selector >> 3;
1439 u32 err;
1440 ulong addr; 1205 ulong addr;
1441 int ret; 1206 int ret;
1442 1207
1443 get_descriptor_table_ptr(ctxt, ops, selector, &dt); 1208 get_descriptor_table_ptr(ctxt, ops, selector, &dt);
1444 1209
1445 if (dt.size < index * 8 + 7) { 1210 if (dt.size < index * 8 + 7)
1446 emulate_gp(ctxt, selector & 0xfffc); 1211 return emulate_gp(ctxt, selector & 0xfffc);
1447 return X86EMUL_PROPAGATE_FAULT;
1448 }
1449 1212
1450 addr = dt.address + index * 8; 1213 addr = dt.address + index * 8;
1451 ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err); 1214 ret = ops->write_std(ctxt, addr, desc, sizeof *desc, &ctxt->exception);
1452 if (ret == X86EMUL_PROPAGATE_FAULT)
1453 emulate_pf(ctxt, addr, err);
1454 1215
1455 return ret; 1216 return ret;
1456} 1217}
1457 1218
1219/* Does not support long mode */
1458static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, 1220static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1459 struct x86_emulate_ops *ops, 1221 struct x86_emulate_ops *ops,
1460 u16 selector, int seg) 1222 u16 selector, int seg)
@@ -1509,7 +1271,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1509 1271
1510 rpl = selector & 3; 1272 rpl = selector & 3;
1511 dpl = seg_desc.dpl; 1273 dpl = seg_desc.dpl;
1512 cpl = ops->cpl(ctxt->vcpu); 1274 cpl = ops->cpl(ctxt);
1513 1275
1514 switch (seg) { 1276 switch (seg) {
1515 case VCPU_SREG_SS: 1277 case VCPU_SREG_SS:
@@ -1565,63 +1327,59 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1565 return ret; 1327 return ret;
1566 } 1328 }
1567load: 1329load:
1568 ops->set_segment_selector(selector, seg, ctxt->vcpu); 1330 ops->set_segment(ctxt, selector, &seg_desc, 0, seg);
1569 ops->set_cached_descriptor(&seg_desc, seg, ctxt->vcpu);
1570 return X86EMUL_CONTINUE; 1331 return X86EMUL_CONTINUE;
1571exception: 1332exception:
1572 emulate_exception(ctxt, err_vec, err_code, true); 1333 emulate_exception(ctxt, err_vec, err_code, true);
1573 return X86EMUL_PROPAGATE_FAULT; 1334 return X86EMUL_PROPAGATE_FAULT;
1574} 1335}
1575 1336
1576static inline int writeback(struct x86_emulate_ctxt *ctxt, 1337static void write_register_operand(struct operand *op)
1577 struct x86_emulate_ops *ops) 1338{
1339 /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */
1340 switch (op->bytes) {
1341 case 1:
1342 *(u8 *)op->addr.reg = (u8)op->val;
1343 break;
1344 case 2:
1345 *(u16 *)op->addr.reg = (u16)op->val;
1346 break;
1347 case 4:
1348 *op->addr.reg = (u32)op->val;
1349 break; /* 64b: zero-extend */
1350 case 8:
1351 *op->addr.reg = op->val;
1352 break;
1353 }
1354}
1355
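write_register_operand() above encodes the x86 rule its comment mentions: a 32-bit register write zero-extends into bits 63:32, while 8- and 16-bit writes leave the upper bits untouched. A small self-contained illustration of that rule (not the kernel function; write_reg and the sample values are made up):

#include <stdint.h>
#include <stdio.h>

/* Architectural writeback rule: a 4-byte write clears bits 63:32,
 * 1- and 2-byte writes only replace the low bits. */
static void write_reg(uint64_t *reg, uint64_t val, int bytes)
{
	switch (bytes) {
	case 1: *reg = (*reg & ~0xffull)   | (uint8_t)val;  break;
	case 2: *reg = (*reg & ~0xffffull) | (uint16_t)val; break;
	case 4: *reg = (uint32_t)val; break;	/* zero-extend */
	case 8: *reg = val;           break;
	}
}

int main(void)
{
	uint64_t rax = 0x1122334455667788ull;

	write_reg(&rax, 0xaabb, 2);
	printf("16-bit write: %#llx\n", (unsigned long long)rax); /* upper half kept */
	write_reg(&rax, 0xccdd, 4);
	printf("32-bit write: %#llx\n", (unsigned long long)rax); /* upper half cleared */
	return 0;
}
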
1356static int writeback(struct x86_emulate_ctxt *ctxt)
1578{ 1357{
1579 int rc; 1358 int rc;
1580 struct decode_cache *c = &ctxt->decode; 1359 struct decode_cache *c = &ctxt->decode;
1581 u32 err;
1582 1360
1583 switch (c->dst.type) { 1361 switch (c->dst.type) {
1584 case OP_REG: 1362 case OP_REG:
1585 /* The 4-byte case *is* correct: 1363 write_register_operand(&c->dst);
1586 * in 64-bit mode we zero-extend.
1587 */
1588 switch (c->dst.bytes) {
1589 case 1:
1590 *(u8 *)c->dst.ptr = (u8)c->dst.val;
1591 break;
1592 case 2:
1593 *(u16 *)c->dst.ptr = (u16)c->dst.val;
1594 break;
1595 case 4:
1596 *c->dst.ptr = (u32)c->dst.val;
1597 break; /* 64b: zero-ext */
1598 case 8:
1599 *c->dst.ptr = c->dst.val;
1600 break;
1601 }
1602 break; 1364 break;
1603 case OP_MEM: 1365 case OP_MEM:
1604 if (c->lock_prefix) 1366 if (c->lock_prefix)
1605 rc = ops->cmpxchg_emulated( 1367 rc = segmented_cmpxchg(ctxt,
1606 (unsigned long)c->dst.ptr, 1368 c->dst.addr.mem,
1607 &c->dst.orig_val, 1369 &c->dst.orig_val,
1608 &c->dst.val, 1370 &c->dst.val,
1609 c->dst.bytes, 1371 c->dst.bytes);
1610 &err,
1611 ctxt->vcpu);
1612 else 1372 else
1613 rc = ops->write_emulated( 1373 rc = segmented_write(ctxt,
1614 (unsigned long)c->dst.ptr, 1374 c->dst.addr.mem,
1615 &c->dst.val, 1375 &c->dst.val,
1616 c->dst.bytes, 1376 c->dst.bytes);
1617 &err,
1618 ctxt->vcpu);
1619 if (rc == X86EMUL_PROPAGATE_FAULT)
1620 emulate_pf(ctxt,
1621 (unsigned long)c->dst.ptr, err);
1622 if (rc != X86EMUL_CONTINUE) 1377 if (rc != X86EMUL_CONTINUE)
1623 return rc; 1378 return rc;
1624 break; 1379 break;
1380 case OP_XMM:
1381 write_sse_reg(ctxt, &c->dst.vec_val, c->dst.addr.xmm);
1382 break;
1625 case OP_NONE: 1383 case OP_NONE:
1626 /* no writeback */ 1384 /* no writeback */
1627 break; 1385 break;
@@ -1631,29 +1389,30 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
1631 return X86EMUL_CONTINUE; 1389 return X86EMUL_CONTINUE;
1632} 1390}
1633 1391
1634static inline void emulate_push(struct x86_emulate_ctxt *ctxt, 1392static int em_push(struct x86_emulate_ctxt *ctxt)
1635 struct x86_emulate_ops *ops)
1636{ 1393{
1637 struct decode_cache *c = &ctxt->decode; 1394 struct decode_cache *c = &ctxt->decode;
1395 struct segmented_address addr;
1638 1396
1639 c->dst.type = OP_MEM;
1640 c->dst.bytes = c->op_bytes;
1641 c->dst.val = c->src.val;
1642 register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes); 1397 register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes);
1643 c->dst.ptr = (void *) register_address(c, ss_base(ctxt, ops), 1398 addr.ea = register_address(c, c->regs[VCPU_REGS_RSP]);
1644 c->regs[VCPU_REGS_RSP]); 1399 addr.seg = VCPU_SREG_SS;
1400
1401 /* Disable writeback. */
1402 c->dst.type = OP_NONE;
1403 return segmented_write(ctxt, addr, &c->src.val, c->op_bytes);
1645} 1404}
1646 1405
1647static int emulate_pop(struct x86_emulate_ctxt *ctxt, 1406static int emulate_pop(struct x86_emulate_ctxt *ctxt,
1648 struct x86_emulate_ops *ops,
1649 void *dest, int len) 1407 void *dest, int len)
1650{ 1408{
1651 struct decode_cache *c = &ctxt->decode; 1409 struct decode_cache *c = &ctxt->decode;
1652 int rc; 1410 int rc;
1411 struct segmented_address addr;
1653 1412
1654 rc = read_emulated(ctxt, ops, register_address(c, ss_base(ctxt, ops), 1413 addr.ea = register_address(c, c->regs[VCPU_REGS_RSP]);
1655 c->regs[VCPU_REGS_RSP]), 1414 addr.seg = VCPU_SREG_SS;
1656 dest, len); 1415 rc = segmented_read(ctxt, addr, dest, len);
1657 if (rc != X86EMUL_CONTINUE) 1416 if (rc != X86EMUL_CONTINUE)
1658 return rc; 1417 return rc;
1659 1418
@@ -1661,6 +1420,13 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt,
1661 return rc; 1420 return rc;
1662} 1421}
1663 1422
1423static int em_pop(struct x86_emulate_ctxt *ctxt)
1424{
1425 struct decode_cache *c = &ctxt->decode;
1426
1427 return emulate_pop(ctxt, &c->dst.val, c->op_bytes);
1428}
1429
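em_push() and emulate_pop() above reduce the stack operations to: adjust RSP by the operand size, then go through the SS-relative segmented_write()/segmented_read() path (em_push also sets dst.type = OP_NONE so the generic writeback stage does not run again). A toy model of that push/pop pattern over a flat array, with illustrative names:

#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* Toy stack: decrement the pointer, store through the SS-relative address
 * (here just an array), and the reverse for pop. */
struct toy_cpu { uint64_t rsp; uint8_t stack[64]; };

static void toy_push(struct toy_cpu *c, const void *val, unsigned bytes)
{
	c->rsp -= bytes;                        /* register_address_increment(..., -op_bytes) */
	memcpy(&c->stack[c->rsp], val, bytes);  /* segmented_write(ctxt, SS:rsp, ...) */
}

static void toy_pop(struct toy_cpu *c, void *val, unsigned bytes)
{
	memcpy(val, &c->stack[c->rsp], bytes);  /* segmented_read(ctxt, SS:rsp, ...) */
	c->rsp += bytes;
}

int main(void)
{
	struct toy_cpu c = { .rsp = sizeof(c.stack) };
	uint32_t v = 0xdeadbeef, out = 0;

	toy_push(&c, &v, sizeof(v));
	toy_pop(&c, &out, sizeof(out));
	printf("popped %#x, rsp back to %llu\n", out, (unsigned long long)c.rsp);
	return 0;
}
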
1664static int emulate_popf(struct x86_emulate_ctxt *ctxt, 1430static int emulate_popf(struct x86_emulate_ctxt *ctxt,
1665 struct x86_emulate_ops *ops, 1431 struct x86_emulate_ops *ops,
1666 void *dest, int len) 1432 void *dest, int len)
@@ -1668,9 +1434,9 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
1668 int rc; 1434 int rc;
1669 unsigned long val, change_mask; 1435 unsigned long val, change_mask;
1670 int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; 1436 int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
1671 int cpl = ops->cpl(ctxt->vcpu); 1437 int cpl = ops->cpl(ctxt);
1672 1438
1673 rc = emulate_pop(ctxt, ops, &val, len); 1439 rc = emulate_pop(ctxt, &val, len);
1674 if (rc != X86EMUL_CONTINUE) 1440 if (rc != X86EMUL_CONTINUE)
1675 return rc; 1441 return rc;
1676 1442
@@ -1687,10 +1453,8 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
1687 change_mask |= EFLG_IF; 1453 change_mask |= EFLG_IF;
1688 break; 1454 break;
1689 case X86EMUL_MODE_VM86: 1455 case X86EMUL_MODE_VM86:
1690 if (iopl < 3) { 1456 if (iopl < 3)
1691 emulate_gp(ctxt, 0); 1457 return emulate_gp(ctxt, 0);
1692 return X86EMUL_PROPAGATE_FAULT;
1693 }
1694 change_mask |= EFLG_IF; 1458 change_mask |= EFLG_IF;
1695 break; 1459 break;
1696 default: /* real mode */ 1460 default: /* real mode */
@@ -1704,14 +1468,24 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
1704 return rc; 1468 return rc;
1705} 1469}
1706 1470
1707static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt, 1471static int em_popf(struct x86_emulate_ctxt *ctxt)
1708 struct x86_emulate_ops *ops, int seg) 1472{
1473 struct decode_cache *c = &ctxt->decode;
1474
1475 c->dst.type = OP_REG;
1476 c->dst.addr.reg = &ctxt->eflags;
1477 c->dst.bytes = c->op_bytes;
1478 return emulate_popf(ctxt, ctxt->ops, &c->dst.val, c->op_bytes);
1479}
1480
1481static int emulate_push_sreg(struct x86_emulate_ctxt *ctxt,
1482 struct x86_emulate_ops *ops, int seg)
1709{ 1483{
1710 struct decode_cache *c = &ctxt->decode; 1484 struct decode_cache *c = &ctxt->decode;
1711 1485
1712 c->src.val = ops->get_segment_selector(seg, ctxt->vcpu); 1486 c->src.val = get_segment_selector(ctxt, seg);
1713 1487
1714 emulate_push(ctxt, ops); 1488 return em_push(ctxt);
1715} 1489}
1716 1490
1717static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, 1491static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
@@ -1721,7 +1495,7 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
1721 unsigned long selector; 1495 unsigned long selector;
1722 int rc; 1496 int rc;
1723 1497
1724 rc = emulate_pop(ctxt, ops, &selector, c->op_bytes); 1498 rc = emulate_pop(ctxt, &selector, c->op_bytes);
1725 if (rc != X86EMUL_CONTINUE) 1499 if (rc != X86EMUL_CONTINUE)
1726 return rc; 1500 return rc;
1727 1501
@@ -1729,8 +1503,7 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
1729 return rc; 1503 return rc;
1730} 1504}
1731 1505
1732static int emulate_pusha(struct x86_emulate_ctxt *ctxt, 1506static int em_pusha(struct x86_emulate_ctxt *ctxt)
1733 struct x86_emulate_ops *ops)
1734{ 1507{
1735 struct decode_cache *c = &ctxt->decode; 1508 struct decode_cache *c = &ctxt->decode;
1736 unsigned long old_esp = c->regs[VCPU_REGS_RSP]; 1509 unsigned long old_esp = c->regs[VCPU_REGS_RSP];
@@ -1741,23 +1514,25 @@ static int emulate_pusha(struct x86_emulate_ctxt *ctxt,
1741 (reg == VCPU_REGS_RSP) ? 1514 (reg == VCPU_REGS_RSP) ?
1742 (c->src.val = old_esp) : (c->src.val = c->regs[reg]); 1515 (c->src.val = old_esp) : (c->src.val = c->regs[reg]);
1743 1516
1744 emulate_push(ctxt, ops); 1517 rc = em_push(ctxt);
1745
1746 rc = writeback(ctxt, ops);
1747 if (rc != X86EMUL_CONTINUE) 1518 if (rc != X86EMUL_CONTINUE)
1748 return rc; 1519 return rc;
1749 1520
1750 ++reg; 1521 ++reg;
1751 } 1522 }
1752 1523
1753 /* Disable writeback. */
1754 c->dst.type = OP_NONE;
1755
1756 return rc; 1524 return rc;
1757} 1525}
1758 1526
1759static int emulate_popa(struct x86_emulate_ctxt *ctxt, 1527static int em_pushf(struct x86_emulate_ctxt *ctxt)
1760 struct x86_emulate_ops *ops) 1528{
1529 struct decode_cache *c = &ctxt->decode;
1530
1531 c->src.val = (unsigned long)ctxt->eflags;
1532 return em_push(ctxt);
1533}
1534
1535static int em_popa(struct x86_emulate_ctxt *ctxt)
1761{ 1536{
1762 struct decode_cache *c = &ctxt->decode; 1537 struct decode_cache *c = &ctxt->decode;
1763 int rc = X86EMUL_CONTINUE; 1538 int rc = X86EMUL_CONTINUE;
@@ -1770,7 +1545,7 @@ static int emulate_popa(struct x86_emulate_ctxt *ctxt,
1770 --reg; 1545 --reg;
1771 } 1546 }
1772 1547
1773 rc = emulate_pop(ctxt, ops, &c->regs[reg], c->op_bytes); 1548 rc = emulate_pop(ctxt, &c->regs[reg], c->op_bytes);
1774 if (rc != X86EMUL_CONTINUE) 1549 if (rc != X86EMUL_CONTINUE)
1775 break; 1550 break;
1776 --reg; 1551 --reg;
@@ -1778,15 +1553,167 @@ static int emulate_popa(struct x86_emulate_ctxt *ctxt,
1778 return rc; 1553 return rc;
1779} 1554}
1780 1555
1781static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt, 1556int emulate_int_real(struct x86_emulate_ctxt *ctxt,
1782 struct x86_emulate_ops *ops) 1557 struct x86_emulate_ops *ops, int irq)
1783{ 1558{
1784 struct decode_cache *c = &ctxt->decode; 1559 struct decode_cache *c = &ctxt->decode;
1560 int rc;
1561 struct desc_ptr dt;
1562 gva_t cs_addr;
1563 gva_t eip_addr;
1564 u16 cs, eip;
1785 1565
1786 return emulate_pop(ctxt, ops, &c->dst.val, c->dst.bytes); 1566 /* TODO: Add limit checks */
1567 c->src.val = ctxt->eflags;
1568 rc = em_push(ctxt);
1569 if (rc != X86EMUL_CONTINUE)
1570 return rc;
1571
1572 ctxt->eflags &= ~(EFLG_IF | EFLG_TF | EFLG_AC);
1573
1574 c->src.val = get_segment_selector(ctxt, VCPU_SREG_CS);
1575 rc = em_push(ctxt);
1576 if (rc != X86EMUL_CONTINUE)
1577 return rc;
1578
1579 c->src.val = c->eip;
1580 rc = em_push(ctxt);
1581 if (rc != X86EMUL_CONTINUE)
1582 return rc;
1583
1584 ops->get_idt(ctxt, &dt);
1585
1586 eip_addr = dt.address + (irq << 2);
1587 cs_addr = dt.address + (irq << 2) + 2;
1588
1589 rc = ops->read_std(ctxt, cs_addr, &cs, 2, &ctxt->exception);
1590 if (rc != X86EMUL_CONTINUE)
1591 return rc;
1592
1593 rc = ops->read_std(ctxt, eip_addr, &eip, 2, &ctxt->exception);
1594 if (rc != X86EMUL_CONTINUE)
1595 return rc;
1596
1597 rc = load_segment_descriptor(ctxt, ops, cs, VCPU_SREG_CS);
1598 if (rc != X86EMUL_CONTINUE)
1599 return rc;
1600
1601 c->eip = eip;
1602
1603 return rc;
1787} 1604}
1788 1605
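emulate_int_real() above pushes FLAGS, CS and IP, clears IF/TF/AC, and then fetches the new CS:IP from the real-mode vector table: each entry is four bytes at idt_base + irq*4, low word = offset, high word = segment. A small sketch of just the address computation (the idt_base parameter is only there for illustration; it is normally 0 in real mode):

#include <stdint.h>
#include <stdio.h>

/* Real-mode IVT lookup: entry n is 4 bytes, IP word first, CS word second. */
static void ivt_entry(uint64_t idt_base, unsigned irq,
		      uint64_t *eip_addr, uint64_t *cs_addr)
{
	*eip_addr = idt_base + (irq << 2);      /* offset word  */
	*cs_addr  = idt_base + (irq << 2) + 2;  /* segment word */
}

int main(void)
{
	uint64_t ip, cs;

	ivt_entry(0, 0x10, &ip, &cs);   /* e.g. INT 10h */
	printf("INT 0x10: IP word at %#llx, CS word at %#llx\n",
	       (unsigned long long)ip, (unsigned long long)cs);
	return 0;
}
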
1789static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt) 1606static int emulate_int(struct x86_emulate_ctxt *ctxt,
1607 struct x86_emulate_ops *ops, int irq)
1608{
1609 switch(ctxt->mode) {
1610 case X86EMUL_MODE_REAL:
1611 return emulate_int_real(ctxt, ops, irq);
1612 case X86EMUL_MODE_VM86:
1613 case X86EMUL_MODE_PROT16:
1614 case X86EMUL_MODE_PROT32:
1615 case X86EMUL_MODE_PROT64:
1616 default:
1617 /* Protected mode interrupts unimplemented yet */
1618 return X86EMUL_UNHANDLEABLE;
1619 }
1620}
1621
1622static int emulate_iret_real(struct x86_emulate_ctxt *ctxt,
1623 struct x86_emulate_ops *ops)
1624{
1625 struct decode_cache *c = &ctxt->decode;
1626 int rc = X86EMUL_CONTINUE;
1627 unsigned long temp_eip = 0;
1628 unsigned long temp_eflags = 0;
1629 unsigned long cs = 0;
1630 unsigned long mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_TF |
1631 EFLG_IF | EFLG_DF | EFLG_OF | EFLG_IOPL | EFLG_NT | EFLG_RF |
1632 EFLG_AC | EFLG_ID | (1 << 1); /* Last one is the reserved bit */
1633 unsigned long vm86_mask = EFLG_VM | EFLG_VIF | EFLG_VIP;
1634
1635 /* TODO: Add stack limit check */
1636
1637 rc = emulate_pop(ctxt, &temp_eip, c->op_bytes);
1638
1639 if (rc != X86EMUL_CONTINUE)
1640 return rc;
1641
1642 if (temp_eip & ~0xffff)
1643 return emulate_gp(ctxt, 0);
1644
1645 rc = emulate_pop(ctxt, &cs, c->op_bytes);
1646
1647 if (rc != X86EMUL_CONTINUE)
1648 return rc;
1649
1650 rc = emulate_pop(ctxt, &temp_eflags, c->op_bytes);
1651
1652 if (rc != X86EMUL_CONTINUE)
1653 return rc;
1654
1655 rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS);
1656
1657 if (rc != X86EMUL_CONTINUE)
1658 return rc;
1659
1660 c->eip = temp_eip;
1661
1662
1663 if (c->op_bytes == 4)
1664 ctxt->eflags = ((temp_eflags & mask) | (ctxt->eflags & vm86_mask));
1665 else if (c->op_bytes == 2) {
1666 ctxt->eflags &= ~0xffff;
1667 ctxt->eflags |= temp_eflags;
1668 }
1669
1670 ctxt->eflags &= ~EFLG_RESERVED_ZEROS_MASK; /* Clear reserved zeros */
1671 ctxt->eflags |= EFLG_RESERVED_ONE_MASK;
1672
1673 return rc;
1674}
1675
1676static inline int emulate_iret(struct x86_emulate_ctxt *ctxt,
1677 struct x86_emulate_ops* ops)
1678{
1679 switch(ctxt->mode) {
1680 case X86EMUL_MODE_REAL:
1681 return emulate_iret_real(ctxt, ops);
1682 case X86EMUL_MODE_VM86:
1683 case X86EMUL_MODE_PROT16:
1684 case X86EMUL_MODE_PROT32:
1685 case X86EMUL_MODE_PROT64:
1686 default:
1687 /* iret from protected mode unimplemented yet */
1688 return X86EMUL_UNHANDLEABLE;
1689 }
1690}
1691
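The real-mode IRET path above merges the popped flags image with the current EFLAGS: for a 32-bit operand only the bits in 'mask' are taken from the stack and the VM86 bits are preserved, while a 16-bit operand replaces just the low word. A standalone version of that merge, assuming the standard architectural EFLAGS bit positions for the EFLG_* constants:

#include <stdint.h>
#include <stdio.h>

/* Bits the popped image is allowed to change ('mask' in the diff) and the
 * VM86 bits kept from the current value ('vm86_mask'), using the usual
 * EFLAGS positions. */
#define WRITABLE_MASK 0x00257fd7u   /* CF..ID, IOPL, NT, RF, AC + reserved bit 1 */
#define VM86_MASK     0x001a0000u   /* VM | VIF | VIP */

static uint32_t iret_merge_eflags(uint32_t cur, uint32_t popped, int op_bytes)
{
	if (op_bytes == 4)
		return (popped & WRITABLE_MASK) | (cur & VM86_MASK);
	/* 16-bit operand: only the low word is replaced */
	return (cur & 0xffff0000u) | (uint16_t)popped;
}

int main(void)
{
	printf("%#x\n", iret_merge_eflags(0x00000202u, 0x00000893u, 4));
	return 0;
}
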
1692static int em_jmp_far(struct x86_emulate_ctxt *ctxt)
1693{
1694 struct decode_cache *c = &ctxt->decode;
1695 int rc;
1696 unsigned short sel;
1697
1698 memcpy(&sel, c->src.valptr + c->op_bytes, 2);
1699
1700 rc = load_segment_descriptor(ctxt, ctxt->ops, sel, VCPU_SREG_CS);
1701 if (rc != X86EMUL_CONTINUE)
1702 return rc;
1703
1704 c->eip = 0;
1705 memcpy(&c->eip, c->src.valptr, c->op_bytes);
1706 return X86EMUL_CONTINUE;
1707}
1708
1709static int em_grp1a(struct x86_emulate_ctxt *ctxt)
1710{
1711 struct decode_cache *c = &ctxt->decode;
1712
1713 return emulate_pop(ctxt, &c->dst.val, c->dst.bytes);
1714}
1715
1716static int em_grp2(struct x86_emulate_ctxt *ctxt)
1790{ 1717{
1791 struct decode_cache *c = &ctxt->decode; 1718 struct decode_cache *c = &ctxt->decode;
1792 switch (c->modrm_reg) { 1719 switch (c->modrm_reg) {
@@ -1813,12 +1740,15 @@ static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt)
1813 emulate_2op_SrcB("sar", c->src, c->dst, ctxt->eflags); 1740 emulate_2op_SrcB("sar", c->src, c->dst, ctxt->eflags);
1814 break; 1741 break;
1815 } 1742 }
1743 return X86EMUL_CONTINUE;
1816} 1744}
1817 1745
1818static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt, 1746static int em_grp3(struct x86_emulate_ctxt *ctxt)
1819 struct x86_emulate_ops *ops)
1820{ 1747{
1821 struct decode_cache *c = &ctxt->decode; 1748 struct decode_cache *c = &ctxt->decode;
1749 unsigned long *rax = &c->regs[VCPU_REGS_RAX];
1750 unsigned long *rdx = &c->regs[VCPU_REGS_RDX];
1751 u8 de = 0;
1822 1752
1823 switch (c->modrm_reg) { 1753 switch (c->modrm_reg) {
1824 case 0 ... 1: /* test */ 1754 case 0 ... 1: /* test */
@@ -1830,16 +1760,32 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
1830 case 3: /* neg */ 1760 case 3: /* neg */
1831 emulate_1op("neg", c->dst, ctxt->eflags); 1761 emulate_1op("neg", c->dst, ctxt->eflags);
1832 break; 1762 break;
1763 case 4: /* mul */
1764 emulate_1op_rax_rdx("mul", c->src, *rax, *rdx, ctxt->eflags);
1765 break;
1766 case 5: /* imul */
1767 emulate_1op_rax_rdx("imul", c->src, *rax, *rdx, ctxt->eflags);
1768 break;
1769 case 6: /* div */
1770 emulate_1op_rax_rdx_ex("div", c->src, *rax, *rdx,
1771 ctxt->eflags, de);
1772 break;
1773 case 7: /* idiv */
1774 emulate_1op_rax_rdx_ex("idiv", c->src, *rax, *rdx,
1775 ctxt->eflags, de);
1776 break;
1833 default: 1777 default:
1834 return 0; 1778 return X86EMUL_UNHANDLEABLE;
1835 } 1779 }
1836 return 1; 1780 if (de)
1781 return emulate_de(ctxt);
1782 return X86EMUL_CONTINUE;
1837} 1783}
1838 1784
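The new div/idiv cases in em_grp3() divide the RDX:RAX pair by the source operand and route a divide fault through the 'de' flag into emulate_de(). A sketch of the underlying semantics for the 32-bit unsigned case (a plain C model, not the asm-backed emulate_1op_rax_rdx_ex macro):

#include <stdint.h>
#include <stdio.h>

/* div semantics: divide EDX:EAX by src; a zero divisor or a quotient that
 * does not fit in 32 bits is a divide error (#DE). */
static int div32(uint32_t *eax, uint32_t *edx, uint32_t src)
{
	uint64_t dividend = ((uint64_t)*edx << 32) | *eax;

	if (src == 0 || dividend / src > UINT32_MAX)
		return 1;                       /* -> emulate_de(ctxt) */
	*eax = (uint32_t)(dividend / src);      /* quotient  */
	*edx = (uint32_t)(dividend % src);      /* remainder */
	return 0;
}

int main(void)
{
	uint32_t eax = 100, edx = 0;

	if (!div32(&eax, &edx, 7))
		printf("q=%u r=%u\n", eax, edx);
	return 0;
}
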
1839static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt, 1785static int em_grp45(struct x86_emulate_ctxt *ctxt)
1840 struct x86_emulate_ops *ops)
1841{ 1786{
1842 struct decode_cache *c = &ctxt->decode; 1787 struct decode_cache *c = &ctxt->decode;
1788 int rc = X86EMUL_CONTINUE;
1843 1789
1844 switch (c->modrm_reg) { 1790 switch (c->modrm_reg) {
1845 case 0: /* inc */ 1791 case 0: /* inc */
@@ -1853,21 +1799,23 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
1853 old_eip = c->eip; 1799 old_eip = c->eip;
1854 c->eip = c->src.val; 1800 c->eip = c->src.val;
1855 c->src.val = old_eip; 1801 c->src.val = old_eip;
1856 emulate_push(ctxt, ops); 1802 rc = em_push(ctxt);
1857 break; 1803 break;
1858 } 1804 }
1859 case 4: /* jmp abs */ 1805 case 4: /* jmp abs */
1860 c->eip = c->src.val; 1806 c->eip = c->src.val;
1861 break; 1807 break;
1808 case 5: /* jmp far */
1809 rc = em_jmp_far(ctxt);
1810 break;
1862 case 6: /* push */ 1811 case 6: /* push */
1863 emulate_push(ctxt, ops); 1812 rc = em_push(ctxt);
1864 break; 1813 break;
1865 } 1814 }
1866 return X86EMUL_CONTINUE; 1815 return rc;
1867} 1816}
1868 1817
1869static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt, 1818static int em_grp9(struct x86_emulate_ctxt *ctxt)
1870 struct x86_emulate_ops *ops)
1871{ 1819{
1872 struct decode_cache *c = &ctxt->decode; 1820 struct decode_cache *c = &ctxt->decode;
1873 u64 old = c->dst.orig_val64; 1821 u64 old = c->dst.orig_val64;
@@ -1893,25 +1841,44 @@ static int emulate_ret_far(struct x86_emulate_ctxt *ctxt,
1893 int rc; 1841 int rc;
1894 unsigned long cs; 1842 unsigned long cs;
1895 1843
1896 rc = emulate_pop(ctxt, ops, &c->eip, c->op_bytes); 1844 rc = emulate_pop(ctxt, &c->eip, c->op_bytes);
1897 if (rc != X86EMUL_CONTINUE) 1845 if (rc != X86EMUL_CONTINUE)
1898 return rc; 1846 return rc;
1899 if (c->op_bytes == 4) 1847 if (c->op_bytes == 4)
1900 c->eip = (u32)c->eip; 1848 c->eip = (u32)c->eip;
1901 rc = emulate_pop(ctxt, ops, &cs, c->op_bytes); 1849 rc = emulate_pop(ctxt, &cs, c->op_bytes);
1902 if (rc != X86EMUL_CONTINUE) 1850 if (rc != X86EMUL_CONTINUE)
1903 return rc; 1851 return rc;
1904 rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS); 1852 rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS);
1905 return rc; 1853 return rc;
1906} 1854}
1907 1855
1856static int emulate_load_segment(struct x86_emulate_ctxt *ctxt,
1857 struct x86_emulate_ops *ops, int seg)
1858{
1859 struct decode_cache *c = &ctxt->decode;
1860 unsigned short sel;
1861 int rc;
1862
1863 memcpy(&sel, c->src.valptr + c->op_bytes, 2);
1864
1865 rc = load_segment_descriptor(ctxt, ops, sel, seg);
1866 if (rc != X86EMUL_CONTINUE)
1867 return rc;
1868
1869 c->dst.val = c->src.val;
1870 return rc;
1871}
1872
1908static inline void 1873static inline void
1909setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, 1874setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
1910 struct x86_emulate_ops *ops, struct desc_struct *cs, 1875 struct x86_emulate_ops *ops, struct desc_struct *cs,
1911 struct desc_struct *ss) 1876 struct desc_struct *ss)
1912{ 1877{
1878 u16 selector;
1879
1913 memset(cs, 0, sizeof(struct desc_struct)); 1880 memset(cs, 0, sizeof(struct desc_struct));
1914 ops->get_cached_descriptor(cs, VCPU_SREG_CS, ctxt->vcpu); 1881 ops->get_segment(ctxt, &selector, cs, NULL, VCPU_SREG_CS);
1915 memset(ss, 0, sizeof(struct desc_struct)); 1882 memset(ss, 0, sizeof(struct desc_struct));
1916 1883
1917 cs->l = 0; /* will be adjusted later */ 1884 cs->l = 0; /* will be adjusted later */
@@ -1941,46 +1908,44 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1941 struct desc_struct cs, ss; 1908 struct desc_struct cs, ss;
1942 u64 msr_data; 1909 u64 msr_data;
1943 u16 cs_sel, ss_sel; 1910 u16 cs_sel, ss_sel;
1911 u64 efer = 0;
1944 1912
1945 /* syscall is not available in real mode */ 1913 /* syscall is not available in real mode */
1946 if (ctxt->mode == X86EMUL_MODE_REAL || 1914 if (ctxt->mode == X86EMUL_MODE_REAL ||
1947 ctxt->mode == X86EMUL_MODE_VM86) { 1915 ctxt->mode == X86EMUL_MODE_VM86)
1948 emulate_ud(ctxt); 1916 return emulate_ud(ctxt);
1949 return X86EMUL_PROPAGATE_FAULT;
1950 }
1951 1917
1918 ops->get_msr(ctxt, MSR_EFER, &efer);
1952 setup_syscalls_segments(ctxt, ops, &cs, &ss); 1919 setup_syscalls_segments(ctxt, ops, &cs, &ss);
1953 1920
1954 ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); 1921 ops->get_msr(ctxt, MSR_STAR, &msr_data);
1955 msr_data >>= 32; 1922 msr_data >>= 32;
1956 cs_sel = (u16)(msr_data & 0xfffc); 1923 cs_sel = (u16)(msr_data & 0xfffc);
1957 ss_sel = (u16)(msr_data + 8); 1924 ss_sel = (u16)(msr_data + 8);
1958 1925
1959 if (is_long_mode(ctxt->vcpu)) { 1926 if (efer & EFER_LMA) {
1960 cs.d = 0; 1927 cs.d = 0;
1961 cs.l = 1; 1928 cs.l = 1;
1962 } 1929 }
1963 ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu); 1930 ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS);
1964 ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); 1931 ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
1965 ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu);
1966 ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
1967 1932
1968 c->regs[VCPU_REGS_RCX] = c->eip; 1933 c->regs[VCPU_REGS_RCX] = c->eip;
1969 if (is_long_mode(ctxt->vcpu)) { 1934 if (efer & EFER_LMA) {
1970#ifdef CONFIG_X86_64 1935#ifdef CONFIG_X86_64
1971 c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF; 1936 c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF;
1972 1937
1973 ops->get_msr(ctxt->vcpu, 1938 ops->get_msr(ctxt,
1974 ctxt->mode == X86EMUL_MODE_PROT64 ? 1939 ctxt->mode == X86EMUL_MODE_PROT64 ?
1975 MSR_LSTAR : MSR_CSTAR, &msr_data); 1940 MSR_LSTAR : MSR_CSTAR, &msr_data);
1976 c->eip = msr_data; 1941 c->eip = msr_data;
1977 1942
1978 ops->get_msr(ctxt->vcpu, MSR_SYSCALL_MASK, &msr_data); 1943 ops->get_msr(ctxt, MSR_SYSCALL_MASK, &msr_data);
1979 ctxt->eflags &= ~(msr_data | EFLG_RF); 1944 ctxt->eflags &= ~(msr_data | EFLG_RF);
1980#endif 1945#endif
1981 } else { 1946 } else {
1982 /* legacy mode */ 1947 /* legacy mode */
1983 ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); 1948 ops->get_msr(ctxt, MSR_STAR, &msr_data);
1984 c->eip = (u32)msr_data; 1949 c->eip = (u32)msr_data;
1985 1950
1986 ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); 1951 ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
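The selector arithmetic above is the SYSCALL convention: bits 47:32 of MSR_STAR supply the kernel CS (with the RPL bits masked off) and SS is the next descriptor slot, eight bytes further on. A tiny helper that mirrors that derivation (the STAR value in main() is an arbitrary example):

#include <stdint.h>
#include <stdio.h>

/* SYSCALL target selectors from STAR[47:32]. */
static void star_to_selectors(uint64_t star, uint16_t *cs, uint16_t *ss)
{
	uint32_t hi = star >> 32;

	*cs = (uint16_t)(hi & 0xfffc);  /* strip RPL, as the emulator does */
	*ss = (uint16_t)(hi + 8);       /* following descriptor */
}

int main(void)
{
	uint16_t cs, ss;

	star_to_selectors(0x0023001000000000ull, &cs, &ss);
	printf("cs=%#x ss=%#x\n", cs, ss);
	return 0;
}
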
@@ -1996,36 +1961,30 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1996 struct desc_struct cs, ss; 1961 struct desc_struct cs, ss;
1997 u64 msr_data; 1962 u64 msr_data;
1998 u16 cs_sel, ss_sel; 1963 u16 cs_sel, ss_sel;
1964 u64 efer = 0;
1999 1965
1966 ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
2000 /* inject #GP if in real mode */ 1967 /* inject #GP if in real mode */
2001 if (ctxt->mode == X86EMUL_MODE_REAL) { 1968 if (ctxt->mode == X86EMUL_MODE_REAL)
2002 emulate_gp(ctxt, 0); 1969 return emulate_gp(ctxt, 0);
2003 return X86EMUL_PROPAGATE_FAULT;
2004 }
2005 1970
2006 /* XXX sysenter/sysexit have not been tested in 64bit mode. 1971 /* XXX sysenter/sysexit have not been tested in 64bit mode.
2007 * Therefore, we inject an #UD. 1972 * Therefore, we inject an #UD.
2008 */ 1973 */
2009 if (ctxt->mode == X86EMUL_MODE_PROT64) { 1974 if (ctxt->mode == X86EMUL_MODE_PROT64)
2010 emulate_ud(ctxt); 1975 return emulate_ud(ctxt);
2011 return X86EMUL_PROPAGATE_FAULT;
2012 }
2013 1976
2014 setup_syscalls_segments(ctxt, ops, &cs, &ss); 1977 setup_syscalls_segments(ctxt, ops, &cs, &ss);
2015 1978
2016 ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); 1979 ops->get_msr(ctxt, MSR_IA32_SYSENTER_CS, &msr_data);
2017 switch (ctxt->mode) { 1980 switch (ctxt->mode) {
2018 case X86EMUL_MODE_PROT32: 1981 case X86EMUL_MODE_PROT32:
2019 if ((msr_data & 0xfffc) == 0x0) { 1982 if ((msr_data & 0xfffc) == 0x0)
2020 emulate_gp(ctxt, 0); 1983 return emulate_gp(ctxt, 0);
2021 return X86EMUL_PROPAGATE_FAULT;
2022 }
2023 break; 1984 break;
2024 case X86EMUL_MODE_PROT64: 1985 case X86EMUL_MODE_PROT64:
2025 if (msr_data == 0x0) { 1986 if (msr_data == 0x0)
2026 emulate_gp(ctxt, 0); 1987 return emulate_gp(ctxt, 0);
2027 return X86EMUL_PROPAGATE_FAULT;
2028 }
2029 break; 1988 break;
2030 } 1989 }
2031 1990
@@ -2034,21 +1993,18 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
2034 cs_sel &= ~SELECTOR_RPL_MASK; 1993 cs_sel &= ~SELECTOR_RPL_MASK;
2035 ss_sel = cs_sel + 8; 1994 ss_sel = cs_sel + 8;
2036 ss_sel &= ~SELECTOR_RPL_MASK; 1995 ss_sel &= ~SELECTOR_RPL_MASK;
2037 if (ctxt->mode == X86EMUL_MODE_PROT64 1996 if (ctxt->mode == X86EMUL_MODE_PROT64 || (efer & EFER_LMA)) {
2038 || is_long_mode(ctxt->vcpu)) {
2039 cs.d = 0; 1997 cs.d = 0;
2040 cs.l = 1; 1998 cs.l = 1;
2041 } 1999 }
2042 2000
2043 ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu); 2001 ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS);
2044 ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); 2002 ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
2045 ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu);
2046 ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
2047 2003
2048 ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data); 2004 ops->get_msr(ctxt, MSR_IA32_SYSENTER_EIP, &msr_data);
2049 c->eip = msr_data; 2005 c->eip = msr_data;
2050 2006
2051 ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data); 2007 ops->get_msr(ctxt, MSR_IA32_SYSENTER_ESP, &msr_data);
2052 c->regs[VCPU_REGS_RSP] = msr_data; 2008 c->regs[VCPU_REGS_RSP] = msr_data;
2053 2009
2054 return X86EMUL_CONTINUE; 2010 return X86EMUL_CONTINUE;
@@ -2065,10 +2021,8 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
2065 2021
2066 /* inject #GP if in real mode or Virtual 8086 mode */ 2022 /* inject #GP if in real mode or Virtual 8086 mode */
2067 if (ctxt->mode == X86EMUL_MODE_REAL || 2023 if (ctxt->mode == X86EMUL_MODE_REAL ||
2068 ctxt->mode == X86EMUL_MODE_VM86) { 2024 ctxt->mode == X86EMUL_MODE_VM86)
2069 emulate_gp(ctxt, 0); 2025 return emulate_gp(ctxt, 0);
2070 return X86EMUL_PROPAGATE_FAULT;
2071 }
2072 2026
2073 setup_syscalls_segments(ctxt, ops, &cs, &ss); 2027 setup_syscalls_segments(ctxt, ops, &cs, &ss);
2074 2028
@@ -2079,22 +2033,18 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
2079 2033
2080 cs.dpl = 3; 2034 cs.dpl = 3;
2081 ss.dpl = 3; 2035 ss.dpl = 3;
2082 ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); 2036 ops->get_msr(ctxt, MSR_IA32_SYSENTER_CS, &msr_data);
2083 switch (usermode) { 2037 switch (usermode) {
2084 case X86EMUL_MODE_PROT32: 2038 case X86EMUL_MODE_PROT32:
2085 cs_sel = (u16)(msr_data + 16); 2039 cs_sel = (u16)(msr_data + 16);
2086 if ((msr_data & 0xfffc) == 0x0) { 2040 if ((msr_data & 0xfffc) == 0x0)
2087 emulate_gp(ctxt, 0); 2041 return emulate_gp(ctxt, 0);
2088 return X86EMUL_PROPAGATE_FAULT;
2089 }
2090 ss_sel = (u16)(msr_data + 24); 2042 ss_sel = (u16)(msr_data + 24);
2091 break; 2043 break;
2092 case X86EMUL_MODE_PROT64: 2044 case X86EMUL_MODE_PROT64:
2093 cs_sel = (u16)(msr_data + 32); 2045 cs_sel = (u16)(msr_data + 32);
2094 if (msr_data == 0x0) { 2046 if (msr_data == 0x0)
2095 emulate_gp(ctxt, 0); 2047 return emulate_gp(ctxt, 0);
2096 return X86EMUL_PROPAGATE_FAULT;
2097 }
2098 ss_sel = cs_sel + 8; 2048 ss_sel = cs_sel + 8;
2099 cs.d = 0; 2049 cs.d = 0;
2100 cs.l = 1; 2050 cs.l = 1;
@@ -2103,10 +2053,8 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
2103 cs_sel |= SELECTOR_RPL_MASK; 2053 cs_sel |= SELECTOR_RPL_MASK;
2104 ss_sel |= SELECTOR_RPL_MASK; 2054 ss_sel |= SELECTOR_RPL_MASK;
2105 2055
2106 ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu); 2056 ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS);
2107 ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); 2057 ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
2108 ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu);
2109 ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
2110 2058
2111 c->eip = c->regs[VCPU_REGS_RDX]; 2059 c->eip = c->regs[VCPU_REGS_RDX];
2112 c->regs[VCPU_REGS_RSP] = c->regs[VCPU_REGS_RCX]; 2060 c->regs[VCPU_REGS_RSP] = c->regs[VCPU_REGS_RCX];
@@ -2123,7 +2071,7 @@ static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt,
2123 if (ctxt->mode == X86EMUL_MODE_VM86) 2071 if (ctxt->mode == X86EMUL_MODE_VM86)
2124 return true; 2072 return true;
2125 iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; 2073 iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
2126 return ops->cpl(ctxt->vcpu) > iopl; 2074 return ops->cpl(ctxt) > iopl;
2127} 2075}
2128 2076
2129static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, 2077static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
@@ -2131,24 +2079,27 @@ static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
2131 u16 port, u16 len) 2079 u16 port, u16 len)
2132{ 2080{
2133 struct desc_struct tr_seg; 2081 struct desc_struct tr_seg;
2082 u32 base3;
2134 int r; 2083 int r;
2135 u16 io_bitmap_ptr; 2084 u16 tr, io_bitmap_ptr, perm, bit_idx = port & 0x7;
2136 u8 perm, bit_idx = port & 0x7;
2137 unsigned mask = (1 << len) - 1; 2085 unsigned mask = (1 << len) - 1;
2086 unsigned long base;
2138 2087
2139 ops->get_cached_descriptor(&tr_seg, VCPU_SREG_TR, ctxt->vcpu); 2088 ops->get_segment(ctxt, &tr, &tr_seg, &base3, VCPU_SREG_TR);
2140 if (!tr_seg.p) 2089 if (!tr_seg.p)
2141 return false; 2090 return false;
2142 if (desc_limit_scaled(&tr_seg) < 103) 2091 if (desc_limit_scaled(&tr_seg) < 103)
2143 return false; 2092 return false;
2144 r = ops->read_std(get_desc_base(&tr_seg) + 102, &io_bitmap_ptr, 2, 2093 base = get_desc_base(&tr_seg);
2145 ctxt->vcpu, NULL); 2094#ifdef CONFIG_X86_64
2095 base |= ((u64)base3) << 32;
2096#endif
2097 r = ops->read_std(ctxt, base + 102, &io_bitmap_ptr, 2, NULL);
2146 if (r != X86EMUL_CONTINUE) 2098 if (r != X86EMUL_CONTINUE)
2147 return false; 2099 return false;
2148 if (io_bitmap_ptr + port/8 > desc_limit_scaled(&tr_seg)) 2100 if (io_bitmap_ptr + port/8 > desc_limit_scaled(&tr_seg))
2149 return false; 2101 return false;
2150 r = ops->read_std(get_desc_base(&tr_seg) + io_bitmap_ptr + port/8, 2102 r = ops->read_std(ctxt, base + io_bitmap_ptr + port/8, &perm, 2, NULL);
2151 &perm, 1, ctxt->vcpu, NULL);
2152 if (r != X86EMUL_CONTINUE) 2103 if (r != X86EMUL_CONTINUE)
2153 return false; 2104 return false;
2154 if ((perm >> bit_idx) & mask) 2105 if ((perm >> bit_idx) & mask)
@@ -2160,9 +2111,15 @@ static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt,
2160 struct x86_emulate_ops *ops, 2111 struct x86_emulate_ops *ops,
2161 u16 port, u16 len) 2112 u16 port, u16 len)
2162{ 2113{
2114 if (ctxt->perm_ok)
2115 return true;
2116
2163 if (emulator_bad_iopl(ctxt, ops)) 2117 if (emulator_bad_iopl(ctxt, ops))
2164 if (!emulator_io_port_access_allowed(ctxt, ops, port, len)) 2118 if (!emulator_io_port_access_allowed(ctxt, ops, port, len))
2165 return false; 2119 return false;
2120
2121 ctxt->perm_ok = true;
2122
2166 return true; 2123 return true;
2167} 2124}
2168 2125
@@ -2183,11 +2140,11 @@ static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt,
2183 tss->si = c->regs[VCPU_REGS_RSI]; 2140 tss->si = c->regs[VCPU_REGS_RSI];
2184 tss->di = c->regs[VCPU_REGS_RDI]; 2141 tss->di = c->regs[VCPU_REGS_RDI];
2185 2142
2186 tss->es = ops->get_segment_selector(VCPU_SREG_ES, ctxt->vcpu); 2143 tss->es = get_segment_selector(ctxt, VCPU_SREG_ES);
2187 tss->cs = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu); 2144 tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS);
2188 tss->ss = ops->get_segment_selector(VCPU_SREG_SS, ctxt->vcpu); 2145 tss->ss = get_segment_selector(ctxt, VCPU_SREG_SS);
2189 tss->ds = ops->get_segment_selector(VCPU_SREG_DS, ctxt->vcpu); 2146 tss->ds = get_segment_selector(ctxt, VCPU_SREG_DS);
2190 tss->ldt = ops->get_segment_selector(VCPU_SREG_LDTR, ctxt->vcpu); 2147 tss->ldt = get_segment_selector(ctxt, VCPU_SREG_LDTR);
2191} 2148}
2192 2149
2193static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt, 2150static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
@@ -2212,11 +2169,11 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
2212 * SDM says that segment selectors are loaded before segment 2169 * SDM says that segment selectors are loaded before segment
2213 * descriptors 2170 * descriptors
2214 */ 2171 */
2215 ops->set_segment_selector(tss->ldt, VCPU_SREG_LDTR, ctxt->vcpu); 2172 set_segment_selector(ctxt, tss->ldt, VCPU_SREG_LDTR);
2216 ops->set_segment_selector(tss->es, VCPU_SREG_ES, ctxt->vcpu); 2173 set_segment_selector(ctxt, tss->es, VCPU_SREG_ES);
2217 ops->set_segment_selector(tss->cs, VCPU_SREG_CS, ctxt->vcpu); 2174 set_segment_selector(ctxt, tss->cs, VCPU_SREG_CS);
2218 ops->set_segment_selector(tss->ss, VCPU_SREG_SS, ctxt->vcpu); 2175 set_segment_selector(ctxt, tss->ss, VCPU_SREG_SS);
2219 ops->set_segment_selector(tss->ds, VCPU_SREG_DS, ctxt->vcpu); 2176 set_segment_selector(ctxt, tss->ds, VCPU_SREG_DS);
2220 2177
2221 /* 2178 /*
2222 * Now load segment descriptors. If fault happenes at this stage 2179 * Now load segment descriptors. If fault happenes at this stage
@@ -2248,46 +2205,38 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
2248{ 2205{
2249 struct tss_segment_16 tss_seg; 2206 struct tss_segment_16 tss_seg;
2250 int ret; 2207 int ret;
2251 u32 err, new_tss_base = get_desc_base(new_desc); 2208 u32 new_tss_base = get_desc_base(new_desc);
2252 2209
2253 ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, 2210 ret = ops->read_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg,
2254 &err); 2211 &ctxt->exception);
2255 if (ret == X86EMUL_PROPAGATE_FAULT) { 2212 if (ret != X86EMUL_CONTINUE)
2256 /* FIXME: need to provide precise fault address */ 2213 /* FIXME: need to provide precise fault address */
2257 emulate_pf(ctxt, old_tss_base, err);
2258 return ret; 2214 return ret;
2259 }
2260 2215
2261 save_state_to_tss16(ctxt, ops, &tss_seg); 2216 save_state_to_tss16(ctxt, ops, &tss_seg);
2262 2217
2263 ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, 2218 ret = ops->write_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg,
2264 &err); 2219 &ctxt->exception);
2265 if (ret == X86EMUL_PROPAGATE_FAULT) { 2220 if (ret != X86EMUL_CONTINUE)
2266 /* FIXME: need to provide precise fault address */ 2221 /* FIXME: need to provide precise fault address */
2267 emulate_pf(ctxt, old_tss_base, err);
2268 return ret; 2222 return ret;
2269 }
2270 2223
2271 ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, 2224 ret = ops->read_std(ctxt, new_tss_base, &tss_seg, sizeof tss_seg,
2272 &err); 2225 &ctxt->exception);
2273 if (ret == X86EMUL_PROPAGATE_FAULT) { 2226 if (ret != X86EMUL_CONTINUE)
2274 /* FIXME: need to provide precise fault address */ 2227 /* FIXME: need to provide precise fault address */
2275 emulate_pf(ctxt, new_tss_base, err);
2276 return ret; 2228 return ret;
2277 }
2278 2229
2279 if (old_tss_sel != 0xffff) { 2230 if (old_tss_sel != 0xffff) {
2280 tss_seg.prev_task_link = old_tss_sel; 2231 tss_seg.prev_task_link = old_tss_sel;
2281 2232
2282 ret = ops->write_std(new_tss_base, 2233 ret = ops->write_std(ctxt, new_tss_base,
2283 &tss_seg.prev_task_link, 2234 &tss_seg.prev_task_link,
2284 sizeof tss_seg.prev_task_link, 2235 sizeof tss_seg.prev_task_link,
2285 ctxt->vcpu, &err); 2236 &ctxt->exception);
2286 if (ret == X86EMUL_PROPAGATE_FAULT) { 2237 if (ret != X86EMUL_CONTINUE)
2287 /* FIXME: need to provide precise fault address */ 2238 /* FIXME: need to provide precise fault address */
2288 emulate_pf(ctxt, new_tss_base, err);
2289 return ret; 2239 return ret;
2290 }
2291 } 2240 }
2292 2241
2293 return load_state_from_tss16(ctxt, ops, &tss_seg); 2242 return load_state_from_tss16(ctxt, ops, &tss_seg);
@@ -2299,7 +2248,7 @@ static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt,
2299{ 2248{
2300 struct decode_cache *c = &ctxt->decode; 2249 struct decode_cache *c = &ctxt->decode;
2301 2250
2302 tss->cr3 = ops->get_cr(3, ctxt->vcpu); 2251 tss->cr3 = ops->get_cr(ctxt, 3);
2303 tss->eip = c->eip; 2252 tss->eip = c->eip;
2304 tss->eflags = ctxt->eflags; 2253 tss->eflags = ctxt->eflags;
2305 tss->eax = c->regs[VCPU_REGS_RAX]; 2254 tss->eax = c->regs[VCPU_REGS_RAX];
@@ -2311,13 +2260,13 @@ static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt,
2311 tss->esi = c->regs[VCPU_REGS_RSI]; 2260 tss->esi = c->regs[VCPU_REGS_RSI];
2312 tss->edi = c->regs[VCPU_REGS_RDI]; 2261 tss->edi = c->regs[VCPU_REGS_RDI];
2313 2262
2314 tss->es = ops->get_segment_selector(VCPU_SREG_ES, ctxt->vcpu); 2263 tss->es = get_segment_selector(ctxt, VCPU_SREG_ES);
2315 tss->cs = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu); 2264 tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS);
2316 tss->ss = ops->get_segment_selector(VCPU_SREG_SS, ctxt->vcpu); 2265 tss->ss = get_segment_selector(ctxt, VCPU_SREG_SS);
2317 tss->ds = ops->get_segment_selector(VCPU_SREG_DS, ctxt->vcpu); 2266 tss->ds = get_segment_selector(ctxt, VCPU_SREG_DS);
2318 tss->fs = ops->get_segment_selector(VCPU_SREG_FS, ctxt->vcpu); 2267 tss->fs = get_segment_selector(ctxt, VCPU_SREG_FS);
2319 tss->gs = ops->get_segment_selector(VCPU_SREG_GS, ctxt->vcpu); 2268 tss->gs = get_segment_selector(ctxt, VCPU_SREG_GS);
2320 tss->ldt_selector = ops->get_segment_selector(VCPU_SREG_LDTR, ctxt->vcpu); 2269 tss->ldt_selector = get_segment_selector(ctxt, VCPU_SREG_LDTR);
2321} 2270}
2322 2271
2323static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, 2272static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
@@ -2327,10 +2276,8 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
2327 struct decode_cache *c = &ctxt->decode; 2276 struct decode_cache *c = &ctxt->decode;
2328 int ret; 2277 int ret;
2329 2278
2330 if (ops->set_cr(3, tss->cr3, ctxt->vcpu)) { 2279 if (ops->set_cr(ctxt, 3, tss->cr3))
2331 emulate_gp(ctxt, 0); 2280 return emulate_gp(ctxt, 0);
2332 return X86EMUL_PROPAGATE_FAULT;
2333 }
2334 c->eip = tss->eip; 2281 c->eip = tss->eip;
2335 ctxt->eflags = tss->eflags | 2; 2282 ctxt->eflags = tss->eflags | 2;
2336 c->regs[VCPU_REGS_RAX] = tss->eax; 2283 c->regs[VCPU_REGS_RAX] = tss->eax;
@@ -2346,13 +2293,13 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
2346 * SDM says that segment selectors are loaded before segment 2293 * SDM says that segment selectors are loaded before segment
2347 * descriptors 2294 * descriptors
2348 */ 2295 */
2349 ops->set_segment_selector(tss->ldt_selector, VCPU_SREG_LDTR, ctxt->vcpu); 2296 set_segment_selector(ctxt, tss->ldt_selector, VCPU_SREG_LDTR);
2350 ops->set_segment_selector(tss->es, VCPU_SREG_ES, ctxt->vcpu); 2297 set_segment_selector(ctxt, tss->es, VCPU_SREG_ES);
2351 ops->set_segment_selector(tss->cs, VCPU_SREG_CS, ctxt->vcpu); 2298 set_segment_selector(ctxt, tss->cs, VCPU_SREG_CS);
2352 ops->set_segment_selector(tss->ss, VCPU_SREG_SS, ctxt->vcpu); 2299 set_segment_selector(ctxt, tss->ss, VCPU_SREG_SS);
2353 ops->set_segment_selector(tss->ds, VCPU_SREG_DS, ctxt->vcpu); 2300 set_segment_selector(ctxt, tss->ds, VCPU_SREG_DS);
2354 ops->set_segment_selector(tss->fs, VCPU_SREG_FS, ctxt->vcpu); 2301 set_segment_selector(ctxt, tss->fs, VCPU_SREG_FS);
2355 ops->set_segment_selector(tss->gs, VCPU_SREG_GS, ctxt->vcpu); 2302 set_segment_selector(ctxt, tss->gs, VCPU_SREG_GS);
2356 2303
2357 /* 2304 /*
2358 * Now load segment descriptors. If fault happenes at this stage 2305 * Now load segment descriptors. If fault happenes at this stage
@@ -2390,46 +2337,38 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
2390{ 2337{
2391 struct tss_segment_32 tss_seg; 2338 struct tss_segment_32 tss_seg;
2392 int ret; 2339 int ret;
2393 u32 err, new_tss_base = get_desc_base(new_desc); 2340 u32 new_tss_base = get_desc_base(new_desc);
2394 2341
2395 ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, 2342 ret = ops->read_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg,
2396 &err); 2343 &ctxt->exception);
2397 if (ret == X86EMUL_PROPAGATE_FAULT) { 2344 if (ret != X86EMUL_CONTINUE)
2398 /* FIXME: need to provide precise fault address */ 2345 /* FIXME: need to provide precise fault address */
2399 emulate_pf(ctxt, old_tss_base, err);
2400 return ret; 2346 return ret;
2401 }
2402 2347
2403 save_state_to_tss32(ctxt, ops, &tss_seg); 2348 save_state_to_tss32(ctxt, ops, &tss_seg);
2404 2349
2405 ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, 2350 ret = ops->write_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg,
2406 &err); 2351 &ctxt->exception);
2407 if (ret == X86EMUL_PROPAGATE_FAULT) { 2352 if (ret != X86EMUL_CONTINUE)
2408 /* FIXME: need to provide precise fault address */ 2353 /* FIXME: need to provide precise fault address */
2409 emulate_pf(ctxt, old_tss_base, err);
2410 return ret; 2354 return ret;
2411 }
2412 2355
2413 ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, 2356 ret = ops->read_std(ctxt, new_tss_base, &tss_seg, sizeof tss_seg,
2414 &err); 2357 &ctxt->exception);
2415 if (ret == X86EMUL_PROPAGATE_FAULT) { 2358 if (ret != X86EMUL_CONTINUE)
2416 /* FIXME: need to provide precise fault address */ 2359 /* FIXME: need to provide precise fault address */
2417 emulate_pf(ctxt, new_tss_base, err);
2418 return ret; 2360 return ret;
2419 }
2420 2361
2421 if (old_tss_sel != 0xffff) { 2362 if (old_tss_sel != 0xffff) {
2422 tss_seg.prev_task_link = old_tss_sel; 2363 tss_seg.prev_task_link = old_tss_sel;
2423 2364
2424 ret = ops->write_std(new_tss_base, 2365 ret = ops->write_std(ctxt, new_tss_base,
2425 &tss_seg.prev_task_link, 2366 &tss_seg.prev_task_link,
2426 sizeof tss_seg.prev_task_link, 2367 sizeof tss_seg.prev_task_link,
2427 ctxt->vcpu, &err); 2368 &ctxt->exception);
2428 if (ret == X86EMUL_PROPAGATE_FAULT) { 2369 if (ret != X86EMUL_CONTINUE)
2429 /* FIXME: need to provide precise fault address */ 2370 /* FIXME: need to provide precise fault address */
2430 emulate_pf(ctxt, new_tss_base, err);
2431 return ret; 2371 return ret;
2432 }
2433 } 2372 }
2434 2373
2435 return load_state_from_tss32(ctxt, ops, &tss_seg); 2374 return load_state_from_tss32(ctxt, ops, &tss_seg);
@@ -2442,9 +2381,9 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2442{ 2381{
2443 struct desc_struct curr_tss_desc, next_tss_desc; 2382 struct desc_struct curr_tss_desc, next_tss_desc;
2444 int ret; 2383 int ret;
2445 u16 old_tss_sel = ops->get_segment_selector(VCPU_SREG_TR, ctxt->vcpu); 2384 u16 old_tss_sel = get_segment_selector(ctxt, VCPU_SREG_TR);
2446 ulong old_tss_base = 2385 ulong old_tss_base =
2447 ops->get_cached_segment_base(VCPU_SREG_TR, ctxt->vcpu); 2386 ops->get_cached_segment_base(ctxt, VCPU_SREG_TR);
2448 u32 desc_limit; 2387 u32 desc_limit;
2449 2388
2450 /* FIXME: old_tss_base == ~0 ? */ 2389 /* FIXME: old_tss_base == ~0 ? */
@@ -2460,10 +2399,8 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2460 2399
2461 if (reason != TASK_SWITCH_IRET) { 2400 if (reason != TASK_SWITCH_IRET) {
2462 if ((tss_selector & 3) > next_tss_desc.dpl || 2401 if ((tss_selector & 3) > next_tss_desc.dpl ||
2463 ops->cpl(ctxt->vcpu) > next_tss_desc.dpl) { 2402 ops->cpl(ctxt) > next_tss_desc.dpl)
2464 emulate_gp(ctxt, 0); 2403 return emulate_gp(ctxt, 0);
2465 return X86EMUL_PROPAGATE_FAULT;
2466 }
2467 } 2404 }
2468 2405
2469 desc_limit = desc_limit_scaled(&next_tss_desc); 2406 desc_limit = desc_limit_scaled(&next_tss_desc);
@@ -2506,9 +2443,8 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2506 &next_tss_desc); 2443 &next_tss_desc);
2507 } 2444 }
2508 2445
2509 ops->set_cr(0, ops->get_cr(0, ctxt->vcpu) | X86_CR0_TS, ctxt->vcpu); 2446 ops->set_cr(ctxt, 0, ops->get_cr(ctxt, 0) | X86_CR0_TS);
2510 ops->set_cached_descriptor(&next_tss_desc, VCPU_SREG_TR, ctxt->vcpu); 2447 ops->set_segment(ctxt, tss_selector, &next_tss_desc, 0, VCPU_SREG_TR);
2511 ops->set_segment_selector(tss_selector, VCPU_SREG_TR, ctxt->vcpu);
2512 2448
2513 if (has_error_code) { 2449 if (has_error_code) {
2514 struct decode_cache *c = &ctxt->decode; 2450 struct decode_cache *c = &ctxt->decode;
@@ -2516,17 +2452,17 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2516 c->op_bytes = c->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2; 2452 c->op_bytes = c->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2;
2517 c->lock_prefix = 0; 2453 c->lock_prefix = 0;
2518 c->src.val = (unsigned long) error_code; 2454 c->src.val = (unsigned long) error_code;
2519 emulate_push(ctxt, ops); 2455 ret = em_push(ctxt);
2520 } 2456 }
2521 2457
2522 return ret; 2458 return ret;
2523} 2459}
2524 2460
2525int emulator_task_switch(struct x86_emulate_ctxt *ctxt, 2461int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
2526 struct x86_emulate_ops *ops,
2527 u16 tss_selector, int reason, 2462 u16 tss_selector, int reason,
2528 bool has_error_code, u32 error_code) 2463 bool has_error_code, u32 error_code)
2529{ 2464{
2465 struct x86_emulate_ops *ops = ctxt->ops;
2530 struct decode_cache *c = &ctxt->decode; 2466 struct decode_cache *c = &ctxt->decode;
2531 int rc; 2467 int rc;
2532 2468
@@ -2536,91 +2472,1357 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
2536 rc = emulator_do_task_switch(ctxt, ops, tss_selector, reason, 2472 rc = emulator_do_task_switch(ctxt, ops, tss_selector, reason,
2537 has_error_code, error_code); 2473 has_error_code, error_code);
2538 2474
2539 if (rc == X86EMUL_CONTINUE) { 2475 if (rc == X86EMUL_CONTINUE)
2540 rc = writeback(ctxt, ops); 2476 ctxt->eip = c->eip;
2541 if (rc == X86EMUL_CONTINUE)
2542 ctxt->eip = c->eip;
2543 }
2544 2477
2545 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; 2478 return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
2546} 2479}
2547 2480
2548static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned long base, 2481static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned seg,
2549 int reg, struct operand *op) 2482 int reg, struct operand *op)
2550{ 2483{
2551 struct decode_cache *c = &ctxt->decode; 2484 struct decode_cache *c = &ctxt->decode;
2552 int df = (ctxt->eflags & EFLG_DF) ? -1 : 1; 2485 int df = (ctxt->eflags & EFLG_DF) ? -1 : 1;
2553 2486
2554 register_address_increment(c, &c->regs[reg], df * op->bytes); 2487 register_address_increment(c, &c->regs[reg], df * op->bytes);
2555 op->ptr = (unsigned long *)register_address(c, base, c->regs[reg]); 2488 op->addr.mem.ea = register_address(c, c->regs[reg]);
2489 op->addr.mem.seg = seg;
2490}
2491
2492static int em_das(struct x86_emulate_ctxt *ctxt)
2493{
2494 struct decode_cache *c = &ctxt->decode;
2495 u8 al, old_al;
2496 bool af, cf, old_cf;
2497
2498 cf = ctxt->eflags & X86_EFLAGS_CF;
2499 al = c->dst.val;
2500
2501 old_al = al;
2502 old_cf = cf;
2503 cf = false;
2504 af = ctxt->eflags & X86_EFLAGS_AF;
2505 if ((al & 0x0f) > 9 || af) {
2506 al -= 6;
2507 cf = old_cf | (al >= 250);
2508 af = true;
2509 } else {
2510 af = false;
2511 }
2512 if (old_al > 0x99 || old_cf) {
2513 al -= 0x60;
2514 cf = true;
2515 }
2516
2517 c->dst.val = al;
2518 /* Set PF, ZF, SF */
2519 c->src.type = OP_IMM;
2520 c->src.val = 0;
2521 c->src.bytes = 1;
2522 emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
2523 ctxt->eflags &= ~(X86_EFLAGS_AF | X86_EFLAGS_CF);
2524 if (cf)
2525 ctxt->eflags |= X86_EFLAGS_CF;
2526 if (af)
2527 ctxt->eflags |= X86_EFLAGS_AF;
2528 return X86EMUL_CONTINUE;
2529}
2530
2531static int em_call_far(struct x86_emulate_ctxt *ctxt)
2532{
2533 struct decode_cache *c = &ctxt->decode;
2534 u16 sel, old_cs;
2535 ulong old_eip;
2536 int rc;
2537
2538 old_cs = get_segment_selector(ctxt, VCPU_SREG_CS);
2539 old_eip = c->eip;
2540
2541 memcpy(&sel, c->src.valptr + c->op_bytes, 2);
2542 if (load_segment_descriptor(ctxt, ctxt->ops, sel, VCPU_SREG_CS))
2543 return X86EMUL_CONTINUE;
2544
2545 c->eip = 0;
2546 memcpy(&c->eip, c->src.valptr, c->op_bytes);
2547
2548 c->src.val = old_cs;
2549 rc = em_push(ctxt);
2550 if (rc != X86EMUL_CONTINUE)
2551 return rc;
2552
2553 c->src.val = old_eip;
2554 return em_push(ctxt);
2555}
2556
2557static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt)
2558{
2559 struct decode_cache *c = &ctxt->decode;
2560 int rc;
2561
2562 c->dst.type = OP_REG;
2563 c->dst.addr.reg = &c->eip;
2564 c->dst.bytes = c->op_bytes;
2565 rc = emulate_pop(ctxt, &c->dst.val, c->op_bytes);
2566 if (rc != X86EMUL_CONTINUE)
2567 return rc;
2568 register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->src.val);
2569 return X86EMUL_CONTINUE;
2570}
2571
2572static int em_add(struct x86_emulate_ctxt *ctxt)
2573{
2574 struct decode_cache *c = &ctxt->decode;
2575
2576 emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
2577 return X86EMUL_CONTINUE;
2578}
2579
2580static int em_or(struct x86_emulate_ctxt *ctxt)
2581{
2582 struct decode_cache *c = &ctxt->decode;
2583
2584 emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
2585 return X86EMUL_CONTINUE;
2586}
2587
2588static int em_adc(struct x86_emulate_ctxt *ctxt)
2589{
2590 struct decode_cache *c = &ctxt->decode;
2591
2592 emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags);
2593 return X86EMUL_CONTINUE;
2594}
2595
2596static int em_sbb(struct x86_emulate_ctxt *ctxt)
2597{
2598 struct decode_cache *c = &ctxt->decode;
2599
2600 emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags);
2601 return X86EMUL_CONTINUE;
2602}
2603
2604static int em_and(struct x86_emulate_ctxt *ctxt)
2605{
2606 struct decode_cache *c = &ctxt->decode;
2607
2608 emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags);
2609 return X86EMUL_CONTINUE;
2610}
2611
2612static int em_sub(struct x86_emulate_ctxt *ctxt)
2613{
2614 struct decode_cache *c = &ctxt->decode;
2615
2616 emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags);
2617 return X86EMUL_CONTINUE;
2618}
2619
2620static int em_xor(struct x86_emulate_ctxt *ctxt)
2621{
2622 struct decode_cache *c = &ctxt->decode;
2623
2624 emulate_2op_SrcV("xor", c->src, c->dst, ctxt->eflags);
2625 return X86EMUL_CONTINUE;
2626}
2627
2628static int em_cmp(struct x86_emulate_ctxt *ctxt)
2629{
2630 struct decode_cache *c = &ctxt->decode;
2631
2632 emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
2633 /* Disable writeback. */
2634 c->dst.type = OP_NONE;
2635 return X86EMUL_CONTINUE;
2636}
2637
2638static int em_imul(struct x86_emulate_ctxt *ctxt)
2639{
2640 struct decode_cache *c = &ctxt->decode;
2641
2642 emulate_2op_SrcV_nobyte("imul", c->src, c->dst, ctxt->eflags);
2643 return X86EMUL_CONTINUE;
2644}
2645
2646static int em_imul_3op(struct x86_emulate_ctxt *ctxt)
2647{
2648 struct decode_cache *c = &ctxt->decode;
2649
2650 c->dst.val = c->src2.val;
2651 return em_imul(ctxt);
2652}
2653
2654static int em_cwd(struct x86_emulate_ctxt *ctxt)
2655{
2656 struct decode_cache *c = &ctxt->decode;
2657
2658 c->dst.type = OP_REG;
2659 c->dst.bytes = c->src.bytes;
2660 c->dst.addr.reg = &c->regs[VCPU_REGS_RDX];
2661 c->dst.val = ~((c->src.val >> (c->src.bytes * 8 - 1)) - 1);
2662
2663 return X86EMUL_CONTINUE;
2664}
2665
2666static int em_rdtsc(struct x86_emulate_ctxt *ctxt)
2667{
2668 struct decode_cache *c = &ctxt->decode;
2669 u64 tsc = 0;
2670
2671 ctxt->ops->get_msr(ctxt, MSR_IA32_TSC, &tsc);
2672 c->regs[VCPU_REGS_RAX] = (u32)tsc;
2673 c->regs[VCPU_REGS_RDX] = tsc >> 32;
2674 return X86EMUL_CONTINUE;
2675}
2676
2677static int em_mov(struct x86_emulate_ctxt *ctxt)
2678{
2679 struct decode_cache *c = &ctxt->decode;
2680 c->dst.val = c->src.val;
2681 return X86EMUL_CONTINUE;
2682}
2683
2684static int em_movdqu(struct x86_emulate_ctxt *ctxt)
2685{
2686 struct decode_cache *c = &ctxt->decode;
2687 memcpy(&c->dst.vec_val, &c->src.vec_val, c->op_bytes);
2688 return X86EMUL_CONTINUE;
2689}
2690
2691static int em_invlpg(struct x86_emulate_ctxt *ctxt)
2692{
2693 struct decode_cache *c = &ctxt->decode;
2694 int rc;
2695 ulong linear;
2696
2697 rc = linearize(ctxt, c->src.addr.mem, 1, false, &linear);
2698 if (rc == X86EMUL_CONTINUE)
2699 ctxt->ops->invlpg(ctxt, linear);
2700 /* Disable writeback. */
2701 c->dst.type = OP_NONE;
2702 return X86EMUL_CONTINUE;
2703}
2704
2705static int em_clts(struct x86_emulate_ctxt *ctxt)
2706{
2707 ulong cr0;
2708
2709 cr0 = ctxt->ops->get_cr(ctxt, 0);
2710 cr0 &= ~X86_CR0_TS;
2711 ctxt->ops->set_cr(ctxt, 0, cr0);
2712 return X86EMUL_CONTINUE;
2713}
2714
2715static int em_vmcall(struct x86_emulate_ctxt *ctxt)
2716{
2717 struct decode_cache *c = &ctxt->decode;
2718 int rc;
2719
2720 if (c->modrm_mod != 3 || c->modrm_rm != 1)
2721 return X86EMUL_UNHANDLEABLE;
2722
2723 rc = ctxt->ops->fix_hypercall(ctxt);
2724 if (rc != X86EMUL_CONTINUE)
2725 return rc;
2726
2727 /* Let the processor re-execute the fixed hypercall */
2728 c->eip = ctxt->eip;
2729 /* Disable writeback. */
2730 c->dst.type = OP_NONE;
2731 return X86EMUL_CONTINUE;
2732}
2733
2734static int em_lgdt(struct x86_emulate_ctxt *ctxt)
2735{
2736 struct decode_cache *c = &ctxt->decode;
2737 struct desc_ptr desc_ptr;
2738 int rc;
2739
2740 rc = read_descriptor(ctxt, c->src.addr.mem,
2741 &desc_ptr.size, &desc_ptr.address,
2742 c->op_bytes);
2743 if (rc != X86EMUL_CONTINUE)
2744 return rc;
2745 ctxt->ops->set_gdt(ctxt, &desc_ptr);
2746 /* Disable writeback. */
2747 c->dst.type = OP_NONE;
2748 return X86EMUL_CONTINUE;
2749}
2750
2751static int em_vmmcall(struct x86_emulate_ctxt *ctxt)
2752{
2753 struct decode_cache *c = &ctxt->decode;
2754 int rc;
2755
2756 rc = ctxt->ops->fix_hypercall(ctxt);
2757
2758 /* Disable writeback. */
2759 c->dst.type = OP_NONE;
2760 return rc;
2761}
2762
2763static int em_lidt(struct x86_emulate_ctxt *ctxt)
2764{
2765 struct decode_cache *c = &ctxt->decode;
2766 struct desc_ptr desc_ptr;
2767 int rc;
2768
2769 rc = read_descriptor(ctxt, c->src.addr.mem,
2770 &desc_ptr.size, &desc_ptr.address,
2771 c->op_bytes);
2772 if (rc != X86EMUL_CONTINUE)
2773 return rc;
2774 ctxt->ops->set_idt(ctxt, &desc_ptr);
2775 /* Disable writeback. */
2776 c->dst.type = OP_NONE;
2777 return X86EMUL_CONTINUE;
2778}
2779
2780static int em_smsw(struct x86_emulate_ctxt *ctxt)
2781{
2782 struct decode_cache *c = &ctxt->decode;
2783
2784 c->dst.bytes = 2;
2785 c->dst.val = ctxt->ops->get_cr(ctxt, 0);
2786 return X86EMUL_CONTINUE;
2787}
2788
2789static int em_lmsw(struct x86_emulate_ctxt *ctxt)
2790{
2791 struct decode_cache *c = &ctxt->decode;
2792 ctxt->ops->set_cr(ctxt, 0, (ctxt->ops->get_cr(ctxt, 0) & ~0x0eul)
2793 | (c->src.val & 0x0f));
2794 c->dst.type = OP_NONE;
2795 return X86EMUL_CONTINUE;
2796}
2797
2798static bool valid_cr(int nr)
2799{
2800 switch (nr) {
2801 case 0:
2802 case 2 ... 4:
2803 case 8:
2804 return true;
2805 default:
2806 return false;
2807 }
2808}
2809
2810static int check_cr_read(struct x86_emulate_ctxt *ctxt)
2811{
2812 struct decode_cache *c = &ctxt->decode;
2813
2814 if (!valid_cr(c->modrm_reg))
2815 return emulate_ud(ctxt);
2816
2817 return X86EMUL_CONTINUE;
2818}
2819
2820static int check_cr_write(struct x86_emulate_ctxt *ctxt)
2821{
2822 struct decode_cache *c = &ctxt->decode;
2823 u64 new_val = c->src.val64;
2824 int cr = c->modrm_reg;
2825 u64 efer = 0;
2826
2827 static u64 cr_reserved_bits[] = {
2828 0xffffffff00000000ULL,
2829 0, 0, 0, /* CR3 checked later */
2830 CR4_RESERVED_BITS,
2831 0, 0, 0,
2832 CR8_RESERVED_BITS,
2833 };
2834
2835 if (!valid_cr(cr))
2836 return emulate_ud(ctxt);
2837
2838 if (new_val & cr_reserved_bits[cr])
2839 return emulate_gp(ctxt, 0);
2840
2841 switch (cr) {
2842 case 0: {
2843 u64 cr4;
2844 if (((new_val & X86_CR0_PG) && !(new_val & X86_CR0_PE)) ||
2845 ((new_val & X86_CR0_NW) && !(new_val & X86_CR0_CD)))
2846 return emulate_gp(ctxt, 0);
2847
2848 cr4 = ctxt->ops->get_cr(ctxt, 4);
2849 ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
2850
2851 if ((new_val & X86_CR0_PG) && (efer & EFER_LME) &&
2852 !(cr4 & X86_CR4_PAE))
2853 return emulate_gp(ctxt, 0);
2854
2855 break;
2856 }
2857 case 3: {
2858 u64 rsvd = 0;
2859
2860 ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
2861 if (efer & EFER_LMA)
2862 rsvd = CR3_L_MODE_RESERVED_BITS;
2863 else if (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_PAE)
2864 rsvd = CR3_PAE_RESERVED_BITS;
2865 else if (ctxt->ops->get_cr(ctxt, 0) & X86_CR0_PG)
2866 rsvd = CR3_NONPAE_RESERVED_BITS;
2867
2868 if (new_val & rsvd)
2869 return emulate_gp(ctxt, 0);
2870
2871 break;
2872 }
2873 case 4: {
2874 u64 cr4;
2875
2876 cr4 = ctxt->ops->get_cr(ctxt, 4);
2877 ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
2878
2879 if ((efer & EFER_LMA) && !(new_val & X86_CR4_PAE))
2880 return emulate_gp(ctxt, 0);
2881
2882 break;
2883 }
2884 }
2885
2886 return X86EMUL_CONTINUE;
2887}
2888
2889static int check_dr7_gd(struct x86_emulate_ctxt *ctxt)
2890{
2891 unsigned long dr7;
2892
2893 ctxt->ops->get_dr(ctxt, 7, &dr7);
2894
 2895	/* Check if DR7.GD (general detect) is set */
2896 return dr7 & (1 << 13);
2897}
2898
2899static int check_dr_read(struct x86_emulate_ctxt *ctxt)
2900{
2901 struct decode_cache *c = &ctxt->decode;
2902 int dr = c->modrm_reg;
2903 u64 cr4;
2904
2905 if (dr > 7)
2906 return emulate_ud(ctxt);
2907
2908 cr4 = ctxt->ops->get_cr(ctxt, 4);
2909 if ((cr4 & X86_CR4_DE) && (dr == 4 || dr == 5))
2910 return emulate_ud(ctxt);
2911
2912 if (check_dr7_gd(ctxt))
2913 return emulate_db(ctxt);
2914
2915 return X86EMUL_CONTINUE;
2916}
2917
2918static int check_dr_write(struct x86_emulate_ctxt *ctxt)
2919{
2920 struct decode_cache *c = &ctxt->decode;
2921 u64 new_val = c->src.val64;
2922 int dr = c->modrm_reg;
2923
2924 if ((dr == 6 || dr == 7) && (new_val & 0xffffffff00000000ULL))
2925 return emulate_gp(ctxt, 0);
2926
2927 return check_dr_read(ctxt);
2928}
2929
2930static int check_svme(struct x86_emulate_ctxt *ctxt)
2931{
2932 u64 efer;
2933
2934 ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
2935
2936 if (!(efer & EFER_SVME))
2937 return emulate_ud(ctxt);
2938
2939 return X86EMUL_CONTINUE;
2940}
2941
2942static int check_svme_pa(struct x86_emulate_ctxt *ctxt)
2943{
2944 u64 rax = ctxt->decode.regs[VCPU_REGS_RAX];
2945
2946 /* Valid physical address? */
2947 if (rax & 0xffff000000000000ULL)
2948 return emulate_gp(ctxt, 0);
2949
2950 return check_svme(ctxt);
2951}
2952
2953static int check_rdtsc(struct x86_emulate_ctxt *ctxt)
2954{
2955 u64 cr4 = ctxt->ops->get_cr(ctxt, 4);
2956
2957 if (cr4 & X86_CR4_TSD && ctxt->ops->cpl(ctxt))
2958 return emulate_ud(ctxt);
2959
2960 return X86EMUL_CONTINUE;
2961}
2962
2963static int check_rdpmc(struct x86_emulate_ctxt *ctxt)
2964{
2965 u64 cr4 = ctxt->ops->get_cr(ctxt, 4);
2966 u64 rcx = ctxt->decode.regs[VCPU_REGS_RCX];
2967
2968 if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt)) ||
2969 (rcx > 3))
2970 return emulate_gp(ctxt, 0);
2971
2972 return X86EMUL_CONTINUE;
2973}
2974
2975static int check_perm_in(struct x86_emulate_ctxt *ctxt)
2976{
2977 struct decode_cache *c = &ctxt->decode;
2978
2979 c->dst.bytes = min(c->dst.bytes, 4u);
2980 if (!emulator_io_permited(ctxt, ctxt->ops, c->src.val, c->dst.bytes))
2981 return emulate_gp(ctxt, 0);
2982
2983 return X86EMUL_CONTINUE;
2984}
2985
2986static int check_perm_out(struct x86_emulate_ctxt *ctxt)
2987{
2988 struct decode_cache *c = &ctxt->decode;
2989
2990 c->src.bytes = min(c->src.bytes, 4u);
2991 if (!emulator_io_permited(ctxt, ctxt->ops, c->dst.val, c->src.bytes))
2992 return emulate_gp(ctxt, 0);
2993
2994 return X86EMUL_CONTINUE;
2995}
2996
2997#define D(_y) { .flags = (_y) }
2998#define DI(_y, _i) { .flags = (_y), .intercept = x86_intercept_##_i }
2999#define DIP(_y, _i, _p) { .flags = (_y), .intercept = x86_intercept_##_i, \
3000 .check_perm = (_p) }
3001#define N D(0)
3002#define EXT(_f, _e) { .flags = ((_f) | RMExt), .u.group = (_e) }
3003#define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) }
3004#define GD(_f, _g) { .flags = ((_f) | GroupDual), .u.gdual = (_g) }
3005#define I(_f, _e) { .flags = (_f), .u.execute = (_e) }
3006#define II(_f, _e, _i) \
3007 { .flags = (_f), .u.execute = (_e), .intercept = x86_intercept_##_i }
3008#define IIP(_f, _e, _i, _p) \
3009 { .flags = (_f), .u.execute = (_e), .intercept = x86_intercept_##_i, \
3010 .check_perm = (_p) }
3011#define GP(_f, _g) { .flags = ((_f) | Prefix), .u.gprefix = (_g) }
3012
3013#define D2bv(_f) D((_f) | ByteOp), D(_f)
3014#define D2bvIP(_f, _i, _p) DIP((_f) | ByteOp, _i, _p), DIP(_f, _i, _p)
3015#define I2bv(_f, _e) I((_f) | ByteOp, _e), I(_f, _e)
3016
3017#define I6ALU(_f, _e) I2bv((_f) | DstMem | SrcReg | ModRM, _e), \
3018 I2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock, _e), \
3019 I2bv(((_f) & ~Lock) | DstAcc | SrcImm, _e)
3020
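The macros above pack each table entry into a flags word plus an optional execute callback, sub-group pointer, intercept id, or permission hook, so decoding becomes a table lookup and execution an indirect call. A cut-down sketch of the same table-driven dispatch, with the flag set and handlers reduced to toys (all names here are illustrative):

#include <stdio.h>

#define F_BYTEOP  (1u << 0)
#define F_LOCK    (1u << 1)

struct insn_ctxt { unsigned flags; long src, dst; };

/* One table slot: decode flags plus an execute callback, in the spirit
 * of the D()/I() macros; N becomes a slot with a NULL handler. */
struct opcode_slot {
    unsigned flags;
    int (*execute)(struct insn_ctxt *);
};

static int ex_add(struct insn_ctxt *c) { c->dst += c->src; return 0; }
static int ex_sub(struct insn_ctxt *c) { c->dst -= c->src; return 0; }

#define I(_f, _e) { .flags = (_f), .execute = (_e) }
#define N         { .flags = 0, .execute = NULL }

static const struct opcode_slot table[4] = {
    I(F_LOCK, ex_add), I(F_LOCK | F_BYTEOP, ex_add),
    I(F_LOCK, ex_sub), N,
};

int main(void)
{
    struct insn_ctxt c = { .src = 2, .dst = 5 };
    unsigned char opc = 2;                   /* index into the toy table */

    c.flags = table[opc].flags;              /* decode: copy the flags */
    if (table[opc].execute)
        table[opc].execute(&c);              /* execute via the callback */
    printf("dst=%ld lock=%d\n", c.dst, !!(c.flags & F_LOCK));
    return 0;
}
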
3021static struct opcode group7_rm1[] = {
3022 DI(SrcNone | ModRM | Priv, monitor),
3023 DI(SrcNone | ModRM | Priv, mwait),
3024 N, N, N, N, N, N,
3025};
3026
3027static struct opcode group7_rm3[] = {
3028 DIP(SrcNone | ModRM | Prot | Priv, vmrun, check_svme_pa),
3029 II(SrcNone | ModRM | Prot | VendorSpecific, em_vmmcall, vmmcall),
3030 DIP(SrcNone | ModRM | Prot | Priv, vmload, check_svme_pa),
3031 DIP(SrcNone | ModRM | Prot | Priv, vmsave, check_svme_pa),
3032 DIP(SrcNone | ModRM | Prot | Priv, stgi, check_svme),
3033 DIP(SrcNone | ModRM | Prot | Priv, clgi, check_svme),
3034 DIP(SrcNone | ModRM | Prot | Priv, skinit, check_svme),
3035 DIP(SrcNone | ModRM | Prot | Priv, invlpga, check_svme),
3036};
3037
3038static struct opcode group7_rm7[] = {
3039 N,
3040 DIP(SrcNone | ModRM, rdtscp, check_rdtsc),
3041 N, N, N, N, N, N,
3042};
3043
3044static struct opcode group1[] = {
3045 I(Lock, em_add),
3046 I(Lock, em_or),
3047 I(Lock, em_adc),
3048 I(Lock, em_sbb),
3049 I(Lock, em_and),
3050 I(Lock, em_sub),
3051 I(Lock, em_xor),
3052 I(0, em_cmp),
3053};
3054
3055static struct opcode group1A[] = {
3056 D(DstMem | SrcNone | ModRM | Mov | Stack), N, N, N, N, N, N, N,
3057};
3058
3059static struct opcode group3[] = {
3060 D(DstMem | SrcImm | ModRM), D(DstMem | SrcImm | ModRM),
3061 D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock),
3062 X4(D(SrcMem | ModRM)),
3063};
3064
3065static struct opcode group4[] = {
3066 D(ByteOp | DstMem | SrcNone | ModRM | Lock), D(ByteOp | DstMem | SrcNone | ModRM | Lock),
3067 N, N, N, N, N, N,
3068};
3069
3070static struct opcode group5[] = {
3071 D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock),
3072 D(SrcMem | ModRM | Stack),
3073 I(SrcMemFAddr | ModRM | ImplicitOps | Stack, em_call_far),
3074 D(SrcMem | ModRM | Stack), D(SrcMemFAddr | ModRM | ImplicitOps),
3075 D(SrcMem | ModRM | Stack), N,
3076};
3077
3078static struct opcode group6[] = {
3079 DI(ModRM | Prot, sldt),
3080 DI(ModRM | Prot, str),
3081 DI(ModRM | Prot | Priv, lldt),
3082 DI(ModRM | Prot | Priv, ltr),
3083 N, N, N, N,
3084};
3085
3086static struct group_dual group7 = { {
3087 DI(ModRM | Mov | DstMem | Priv, sgdt),
3088 DI(ModRM | Mov | DstMem | Priv, sidt),
3089 II(ModRM | SrcMem | Priv, em_lgdt, lgdt),
3090 II(ModRM | SrcMem | Priv, em_lidt, lidt),
3091 II(SrcNone | ModRM | DstMem | Mov, em_smsw, smsw), N,
3092 II(SrcMem16 | ModRM | Mov | Priv, em_lmsw, lmsw),
3093 II(SrcMem | ModRM | ByteOp | Priv | NoAccess, em_invlpg, invlpg),
3094}, {
3095 I(SrcNone | ModRM | Priv | VendorSpecific, em_vmcall),
3096 EXT(0, group7_rm1),
3097 N, EXT(0, group7_rm3),
3098 II(SrcNone | ModRM | DstMem | Mov, em_smsw, smsw), N,
3099 II(SrcMem16 | ModRM | Mov | Priv, em_lmsw, lmsw), EXT(0, group7_rm7),
3100} };
3101
3102static struct opcode group8[] = {
3103 N, N, N, N,
3104 D(DstMem | SrcImmByte | ModRM), D(DstMem | SrcImmByte | ModRM | Lock),
3105 D(DstMem | SrcImmByte | ModRM | Lock), D(DstMem | SrcImmByte | ModRM | Lock),
3106};
3107
3108static struct group_dual group9 = { {
3109 N, D(DstMem64 | ModRM | Lock), N, N, N, N, N, N,
3110}, {
3111 N, N, N, N, N, N, N, N,
3112} };
3113
3114static struct opcode group11[] = {
3115 I(DstMem | SrcImm | ModRM | Mov, em_mov), X7(D(Undefined)),
3116};
3117
3118static struct gprefix pfx_0f_6f_0f_7f = {
3119 N, N, N, I(Sse, em_movdqu),
3120};
3121
3122static struct opcode opcode_table[256] = {
3123 /* 0x00 - 0x07 */
3124 I6ALU(Lock, em_add),
3125 D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64),
3126 /* 0x08 - 0x0F */
3127 I6ALU(Lock, em_or),
3128 D(ImplicitOps | Stack | No64), N,
3129 /* 0x10 - 0x17 */
3130 I6ALU(Lock, em_adc),
3131 D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64),
3132 /* 0x18 - 0x1F */
3133 I6ALU(Lock, em_sbb),
3134 D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64),
3135 /* 0x20 - 0x27 */
3136 I6ALU(Lock, em_and), N, N,
3137 /* 0x28 - 0x2F */
3138 I6ALU(Lock, em_sub), N, I(ByteOp | DstAcc | No64, em_das),
3139 /* 0x30 - 0x37 */
3140 I6ALU(Lock, em_xor), N, N,
3141 /* 0x38 - 0x3F */
3142 I6ALU(0, em_cmp), N, N,
3143 /* 0x40 - 0x4F */
3144 X16(D(DstReg)),
3145 /* 0x50 - 0x57 */
3146 X8(I(SrcReg | Stack, em_push)),
3147 /* 0x58 - 0x5F */
3148 X8(I(DstReg | Stack, em_pop)),
3149 /* 0x60 - 0x67 */
3150 I(ImplicitOps | Stack | No64, em_pusha),
3151 I(ImplicitOps | Stack | No64, em_popa),
3152 N, D(DstReg | SrcMem32 | ModRM | Mov) /* movsxd (x86/64) */ ,
3153 N, N, N, N,
3154 /* 0x68 - 0x6F */
3155 I(SrcImm | Mov | Stack, em_push),
3156 I(DstReg | SrcMem | ModRM | Src2Imm, em_imul_3op),
3157 I(SrcImmByte | Mov | Stack, em_push),
3158 I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op),
3159 D2bvIP(DstDI | SrcDX | Mov | String, ins, check_perm_in), /* insb, insw/insd */
3160 D2bvIP(SrcSI | DstDX | String, outs, check_perm_out), /* outsb, outsw/outsd */
3161 /* 0x70 - 0x7F */
3162 X16(D(SrcImmByte)),
3163 /* 0x80 - 0x87 */
3164 G(ByteOp | DstMem | SrcImm | ModRM | Group, group1),
3165 G(DstMem | SrcImm | ModRM | Group, group1),
3166 G(ByteOp | DstMem | SrcImm | ModRM | No64 | Group, group1),
3167 G(DstMem | SrcImmByte | ModRM | Group, group1),
3168 D2bv(DstMem | SrcReg | ModRM), D2bv(DstMem | SrcReg | ModRM | Lock),
3169 /* 0x88 - 0x8F */
3170 I2bv(DstMem | SrcReg | ModRM | Mov, em_mov),
3171 I2bv(DstReg | SrcMem | ModRM | Mov, em_mov),
3172 D(DstMem | SrcNone | ModRM | Mov), D(ModRM | SrcMem | NoAccess | DstReg),
3173 D(ImplicitOps | SrcMem16 | ModRM), G(0, group1A),
3174 /* 0x90 - 0x97 */
3175 DI(SrcAcc | DstReg, pause), X7(D(SrcAcc | DstReg)),
3176 /* 0x98 - 0x9F */
3177 D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd),
3178 I(SrcImmFAddr | No64, em_call_far), N,
3179 II(ImplicitOps | Stack, em_pushf, pushf),
3180 II(ImplicitOps | Stack, em_popf, popf), N, N,
3181 /* 0xA0 - 0xA7 */
3182 I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov),
3183 I2bv(DstMem | SrcAcc | Mov | MemAbs, em_mov),
3184 I2bv(SrcSI | DstDI | Mov | String, em_mov),
3185 I2bv(SrcSI | DstDI | String, em_cmp),
3186 /* 0xA8 - 0xAF */
3187 D2bv(DstAcc | SrcImm),
3188 I2bv(SrcAcc | DstDI | Mov | String, em_mov),
3189 I2bv(SrcSI | DstAcc | Mov | String, em_mov),
3190 I2bv(SrcAcc | DstDI | String, em_cmp),
3191 /* 0xB0 - 0xB7 */
3192 X8(I(ByteOp | DstReg | SrcImm | Mov, em_mov)),
3193 /* 0xB8 - 0xBF */
3194 X8(I(DstReg | SrcImm | Mov, em_mov)),
3195 /* 0xC0 - 0xC7 */
3196 D2bv(DstMem | SrcImmByte | ModRM),
3197 I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm),
3198 D(ImplicitOps | Stack),
3199 D(DstReg | SrcMemFAddr | ModRM | No64), D(DstReg | SrcMemFAddr | ModRM | No64),
3200 G(ByteOp, group11), G(0, group11),
3201 /* 0xC8 - 0xCF */
3202 N, N, N, D(ImplicitOps | Stack),
3203 D(ImplicitOps), DI(SrcImmByte, intn),
3204 D(ImplicitOps | No64), DI(ImplicitOps, iret),
3205 /* 0xD0 - 0xD7 */
3206 D2bv(DstMem | SrcOne | ModRM), D2bv(DstMem | ModRM),
3207 N, N, N, N,
3208 /* 0xD8 - 0xDF */
3209 N, N, N, N, N, N, N, N,
3210 /* 0xE0 - 0xE7 */
3211 X4(D(SrcImmByte)),
3212 D2bvIP(SrcImmUByte | DstAcc, in, check_perm_in),
3213 D2bvIP(SrcAcc | DstImmUByte, out, check_perm_out),
3214 /* 0xE8 - 0xEF */
3215 D(SrcImm | Stack), D(SrcImm | ImplicitOps),
3216 D(SrcImmFAddr | No64), D(SrcImmByte | ImplicitOps),
3217 D2bvIP(SrcDX | DstAcc, in, check_perm_in),
3218 D2bvIP(SrcAcc | DstDX, out, check_perm_out),
3219 /* 0xF0 - 0xF7 */
3220 N, DI(ImplicitOps, icebp), N, N,
3221 DI(ImplicitOps | Priv, hlt), D(ImplicitOps),
3222 G(ByteOp, group3), G(0, group3),
3223 /* 0xF8 - 0xFF */
3224 D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), D(ImplicitOps),
3225 D(ImplicitOps), D(ImplicitOps), G(0, group4), G(0, group5),
3226};
3227
3228static struct opcode twobyte_table[256] = {
3229 /* 0x00 - 0x0F */
3230 G(0, group6), GD(0, &group7), N, N,
3231 N, D(ImplicitOps | VendorSpecific), DI(ImplicitOps | Priv, clts), N,
3232 DI(ImplicitOps | Priv, invd), DI(ImplicitOps | Priv, wbinvd), N, N,
3233 N, D(ImplicitOps | ModRM), N, N,
3234 /* 0x10 - 0x1F */
3235 N, N, N, N, N, N, N, N, D(ImplicitOps | ModRM), N, N, N, N, N, N, N,
3236 /* 0x20 - 0x2F */
3237 DIP(ModRM | DstMem | Priv | Op3264, cr_read, check_cr_read),
3238 DIP(ModRM | DstMem | Priv | Op3264, dr_read, check_dr_read),
3239 DIP(ModRM | SrcMem | Priv | Op3264, cr_write, check_cr_write),
3240 DIP(ModRM | SrcMem | Priv | Op3264, dr_write, check_dr_write),
3241 N, N, N, N,
3242 N, N, N, N, N, N, N, N,
3243 /* 0x30 - 0x3F */
3244 DI(ImplicitOps | Priv, wrmsr),
3245 IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc),
3246 DI(ImplicitOps | Priv, rdmsr),
3247 DIP(ImplicitOps | Priv, rdpmc, check_rdpmc),
3248 D(ImplicitOps | VendorSpecific), D(ImplicitOps | Priv | VendorSpecific),
3249 N, N,
3250 N, N, N, N, N, N, N, N,
3251 /* 0x40 - 0x4F */
3252 X16(D(DstReg | SrcMem | ModRM | Mov)),
3253 /* 0x50 - 0x5F */
3254 N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
3255 /* 0x60 - 0x6F */
3256 N, N, N, N,
3257 N, N, N, N,
3258 N, N, N, N,
3259 N, N, N, GP(SrcMem | DstReg | ModRM | Mov, &pfx_0f_6f_0f_7f),
3260 /* 0x70 - 0x7F */
3261 N, N, N, N,
3262 N, N, N, N,
3263 N, N, N, N,
3264 N, N, N, GP(SrcReg | DstMem | ModRM | Mov, &pfx_0f_6f_0f_7f),
3265 /* 0x80 - 0x8F */
3266 X16(D(SrcImm)),
3267 /* 0x90 - 0x9F */
3268 X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)),
3269 /* 0xA0 - 0xA7 */
3270 D(ImplicitOps | Stack), D(ImplicitOps | Stack),
3271 DI(ImplicitOps, cpuid), D(DstMem | SrcReg | ModRM | BitOp),
3272 D(DstMem | SrcReg | Src2ImmByte | ModRM),
3273 D(DstMem | SrcReg | Src2CL | ModRM), N, N,
3274 /* 0xA8 - 0xAF */
3275 D(ImplicitOps | Stack), D(ImplicitOps | Stack),
3276 DI(ImplicitOps, rsm), D(DstMem | SrcReg | ModRM | BitOp | Lock),
3277 D(DstMem | SrcReg | Src2ImmByte | ModRM),
3278 D(DstMem | SrcReg | Src2CL | ModRM),
3279 D(ModRM), I(DstReg | SrcMem | ModRM, em_imul),
3280 /* 0xB0 - 0xB7 */
3281 D2bv(DstMem | SrcReg | ModRM | Lock),
3282 D(DstReg | SrcMemFAddr | ModRM), D(DstMem | SrcReg | ModRM | BitOp | Lock),
3283 D(DstReg | SrcMemFAddr | ModRM), D(DstReg | SrcMemFAddr | ModRM),
3284 D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
3285 /* 0xB8 - 0xBF */
3286 N, N,
3287 G(BitOp, group8), D(DstMem | SrcReg | ModRM | BitOp | Lock),
3288 D(DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM),
3289 D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
3290 /* 0xC0 - 0xCF */
3291 D2bv(DstMem | SrcReg | ModRM | Lock),
3292 N, D(DstMem | SrcReg | ModRM | Mov),
3293 N, N, N, GD(0, &group9),
3294 N, N, N, N, N, N, N, N,
3295 /* 0xD0 - 0xDF */
3296 N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
3297 /* 0xE0 - 0xEF */
3298 N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
3299 /* 0xF0 - 0xFF */
3300 N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N
3301};
3302
3303#undef D
3304#undef N
3305#undef G
3306#undef GD
3307#undef I
3308#undef GP
3309#undef EXT
3310
3311#undef D2bv
3312#undef D2bvIP
3313#undef I2bv
3314#undef I6ALU
3315
3316static unsigned imm_size(struct decode_cache *c)
3317{
3318 unsigned size;
3319
3320 size = (c->d & ByteOp) ? 1 : c->op_bytes;
3321 if (size == 8)
3322 size = 4;
3323 return size;
3324}
3325
3326static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op,
3327 unsigned size, bool sign_extension)
3328{
3329 struct decode_cache *c = &ctxt->decode;
3330 struct x86_emulate_ops *ops = ctxt->ops;
3331 int rc = X86EMUL_CONTINUE;
3332
3333 op->type = OP_IMM;
3334 op->bytes = size;
3335 op->addr.mem.ea = c->eip;
3336 /* NB. Immediates are sign-extended as necessary. */
3337 switch (op->bytes) {
3338 case 1:
3339 op->val = insn_fetch(s8, 1, c->eip);
3340 break;
3341 case 2:
3342 op->val = insn_fetch(s16, 2, c->eip);
3343 break;
3344 case 4:
3345 op->val = insn_fetch(s32, 4, c->eip);
3346 break;
3347 }
3348 if (!sign_extension) {
3349 switch (op->bytes) {
3350 case 1:
3351 op->val &= 0xff;
3352 break;
3353 case 2:
3354 op->val &= 0xffff;
3355 break;
3356 case 4:
3357 op->val &= 0xffffffff;
3358 break;
3359 }
3360 }
3361done:
3362 return rc;
2556} 3363}
2557 3364
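decode_imm() always fetches the immediate sign-extended and then masks the value back down when the opcode wants it zero-extended (SrcImmU, SrcImmUByte). A standalone sketch of that convention, reading from a flat byte buffer instead of the instruction fetch cache; fetch_imm() is an illustrative name:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Fetch a 1/2/4-byte immediate from 'p', sign-extending as decode_imm()
 * does, then optionally masking back to an unsigned value. */
static long fetch_imm(const uint8_t *p, unsigned size, int sign_extend)
{
    long val = 0;
    int8_t s8; int16_t s16; int32_t s32;

    switch (size) {
    case 1: memcpy(&s8,  p, 1); val = s8;  break;
    case 2: memcpy(&s16, p, 2); val = s16; break;
    case 4: memcpy(&s32, p, 4); val = s32; break;
    }
    if (!sign_extend)
        val &= (size == 4) ? 0xffffffffUL : (1UL << (size * 8)) - 1;
    return val;
}

int main(void)
{
    const uint8_t imm8 = 0xf6;                             /* -10 as a signed byte */
    printf("SrcImmByte : %ld\n", fetch_imm(&imm8, 1, 1));  /* -10 */
    printf("SrcImmUByte: %ld\n", fetch_imm(&imm8, 1, 0));  /* 246 */
    return 0;
}
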
2558int 3365int
2559x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) 3366x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
2560{ 3367{
3368 struct x86_emulate_ops *ops = ctxt->ops;
3369 struct decode_cache *c = &ctxt->decode;
3370 int rc = X86EMUL_CONTINUE;
3371 int mode = ctxt->mode;
3372 int def_op_bytes, def_ad_bytes, goffset, simd_prefix;
3373 bool op_prefix = false;
3374 struct opcode opcode;
3375 struct operand memop = { .type = OP_NONE }, *memopp = NULL;
3376
3377 c->eip = ctxt->eip;
3378 c->fetch.start = c->eip;
3379 c->fetch.end = c->fetch.start + insn_len;
3380 if (insn_len > 0)
3381 memcpy(c->fetch.data, insn, insn_len);
3382
3383 switch (mode) {
3384 case X86EMUL_MODE_REAL:
3385 case X86EMUL_MODE_VM86:
3386 case X86EMUL_MODE_PROT16:
3387 def_op_bytes = def_ad_bytes = 2;
3388 break;
3389 case X86EMUL_MODE_PROT32:
3390 def_op_bytes = def_ad_bytes = 4;
3391 break;
3392#ifdef CONFIG_X86_64
3393 case X86EMUL_MODE_PROT64:
3394 def_op_bytes = 4;
3395 def_ad_bytes = 8;
3396 break;
3397#endif
3398 default:
3399 return -1;
3400 }
3401
3402 c->op_bytes = def_op_bytes;
3403 c->ad_bytes = def_ad_bytes;
3404
3405 /* Legacy prefixes. */
3406 for (;;) {
3407 switch (c->b = insn_fetch(u8, 1, c->eip)) {
3408 case 0x66: /* operand-size override */
3409 op_prefix = true;
3410 /* switch between 2/4 bytes */
3411 c->op_bytes = def_op_bytes ^ 6;
3412 break;
3413 case 0x67: /* address-size override */
3414 if (mode == X86EMUL_MODE_PROT64)
3415 /* switch between 4/8 bytes */
3416 c->ad_bytes = def_ad_bytes ^ 12;
3417 else
3418 /* switch between 2/4 bytes */
3419 c->ad_bytes = def_ad_bytes ^ 6;
3420 break;
3421 case 0x26: /* ES override */
3422 case 0x2e: /* CS override */
3423 case 0x36: /* SS override */
3424 case 0x3e: /* DS override */
3425 set_seg_override(c, (c->b >> 3) & 3);
3426 break;
3427 case 0x64: /* FS override */
3428 case 0x65: /* GS override */
3429 set_seg_override(c, c->b & 7);
3430 break;
3431 case 0x40 ... 0x4f: /* REX */
3432 if (mode != X86EMUL_MODE_PROT64)
3433 goto done_prefixes;
3434 c->rex_prefix = c->b;
3435 continue;
3436 case 0xf0: /* LOCK */
3437 c->lock_prefix = 1;
3438 break;
3439 case 0xf2: /* REPNE/REPNZ */
3440 case 0xf3: /* REP/REPE/REPZ */
3441 c->rep_prefix = c->b;
3442 break;
3443 default:
3444 goto done_prefixes;
3445 }
3446
3447 /* Any legacy prefix after a REX prefix nullifies its effect. */
3448
3449 c->rex_prefix = 0;
3450 }
3451
3452done_prefixes:
3453
3454 /* REX prefix. */
3455 if (c->rex_prefix & 8)
3456 c->op_bytes = 8; /* REX.W */
3457
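The loop above consumes any run of legacy prefixes, keeps only the last REP prefix and the current segment override, and treats a REX byte specially: it is remembered via continue but cancelled if another legacy prefix follows. A compact standalone scanner with the same rules; the prefix set is trimmed and struct prefixes is illustrative:

#include <stdint.h>
#include <stdio.h>

struct prefixes {
    int op_size;     /* 0x66 seen */
    int ad_size;     /* 0x67 seen */
    int lock;        /* 0xf0 seen */
    uint8_t rep;     /* 0xf2/0xf3, last one wins */
    uint8_t rex;     /* 64-bit mode only; cleared by a later legacy prefix */
};

/* Scan legacy/REX prefixes, returning the offset of the opcode byte. */
static size_t scan_prefixes(const uint8_t *insn, size_t len, int mode64,
                            struct prefixes *p)
{
    size_t i;

    for (i = 0; i < len; i++) {
        uint8_t b = insn[i];

        if (mode64 && b >= 0x40 && b <= 0x4f) {
            p->rex = b;                 /* REX: remember it, keep scanning */
            continue;
        }
        switch (b) {
        case 0x66: p->op_size = 1; break;
        case 0x67: p->ad_size = 1; break;
        case 0xf0: p->lock = 1;    break;
        case 0xf2: case 0xf3: p->rep = b; break;
        case 0x26: case 0x2e: case 0x36: case 0x3e:
        case 0x64: case 0x65: /* segment overrides */ break;
        default:
            return i;                   /* first non-prefix byte = opcode */
        }
        p->rex = 0;   /* any legacy prefix after REX cancels it */
    }
    return i;
}

int main(void)
{
    /* f3 48 a5 = rep movsq in 64-bit mode */
    const uint8_t insn[] = { 0xf3, 0x48, 0xa5 };
    struct prefixes p = { 0 };
    size_t off = scan_prefixes(insn, sizeof(insn), 1, &p);

    printf("opcode at %zu, rep=%02x, rex=%02x\n", off, p.rep, p.rex);
    return 0;
}
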
3458 /* Opcode byte(s). */
3459 opcode = opcode_table[c->b];
3460 /* Two-byte opcode? */
3461 if (c->b == 0x0f) {
3462 c->twobyte = 1;
3463 c->b = insn_fetch(u8, 1, c->eip);
3464 opcode = twobyte_table[c->b];
3465 }
3466 c->d = opcode.flags;
3467
3468 while (c->d & GroupMask) {
3469 switch (c->d & GroupMask) {
3470 case Group:
3471 c->modrm = insn_fetch(u8, 1, c->eip);
3472 --c->eip;
3473 goffset = (c->modrm >> 3) & 7;
3474 opcode = opcode.u.group[goffset];
3475 break;
3476 case GroupDual:
3477 c->modrm = insn_fetch(u8, 1, c->eip);
3478 --c->eip;
3479 goffset = (c->modrm >> 3) & 7;
3480 if ((c->modrm >> 6) == 3)
3481 opcode = opcode.u.gdual->mod3[goffset];
3482 else
3483 opcode = opcode.u.gdual->mod012[goffset];
3484 break;
3485 case RMExt:
3486 goffset = c->modrm & 7;
3487 opcode = opcode.u.group[goffset];
3488 break;
3489 case Prefix:
3490 if (c->rep_prefix && op_prefix)
3491 return X86EMUL_UNHANDLEABLE;
3492 simd_prefix = op_prefix ? 0x66 : c->rep_prefix;
3493 switch (simd_prefix) {
3494 case 0x00: opcode = opcode.u.gprefix->pfx_no; break;
3495 case 0x66: opcode = opcode.u.gprefix->pfx_66; break;
3496 case 0xf2: opcode = opcode.u.gprefix->pfx_f2; break;
3497 case 0xf3: opcode = opcode.u.gprefix->pfx_f3; break;
3498 }
3499 break;
3500 default:
3501 return X86EMUL_UNHANDLEABLE;
3502 }
3503
3504 c->d &= ~GroupMask;
3505 c->d |= opcode.flags;
3506 }
3507
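For group-encoded opcodes the loop above re-reads the ModRM byte: bits 5:3 index the 8-entry group, and for GroupDual the mod field selects the register-form table when mod == 3. A small sketch of that field split; the group5 mnemonic strings are an aid for the example, not the emulator's table:

#include <stdio.h>

/* Split a ModRM byte the way the GroupMask loop does: mod picks the
 * register vs. memory table for GroupDual, reg indexes the group. */
static void modrm_fields(unsigned char modrm, int *mod, int *reg, int *rm)
{
    *mod = (modrm >> 6) & 3;
    *reg = (modrm >> 3) & 7;
    *rm  = modrm & 7;
}

int main(void)
{
    /* 0xff with ModRM 0xe0 encodes "jmp rax": mod=3, reg=4 (/4), rm=0 */
    static const char *group5[8] = {
        "inc", "dec", "call", "call far", "jmp", "jmp far", "push", "(bad)",
    };
    int mod, reg, rm;

    modrm_fields(0xe0, &mod, &reg, &rm);
    printf("mod=%d reg=%d rm=%d -> %s\n", mod, reg, rm, group5[reg]);
    return 0;
}
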
3508 c->execute = opcode.u.execute;
3509 c->check_perm = opcode.check_perm;
3510 c->intercept = opcode.intercept;
3511
3512 /* Unrecognised? */
3513 if (c->d == 0 || (c->d & Undefined))
3514 return -1;
3515
3516 if (!(c->d & VendorSpecific) && ctxt->only_vendor_specific_insn)
3517 return -1;
3518
3519 if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack))
3520 c->op_bytes = 8;
3521
3522 if (c->d & Op3264) {
3523 if (mode == X86EMUL_MODE_PROT64)
3524 c->op_bytes = 8;
3525 else
3526 c->op_bytes = 4;
3527 }
3528
3529 if (c->d & Sse)
3530 c->op_bytes = 16;
3531
3532 /* ModRM and SIB bytes. */
3533 if (c->d & ModRM) {
3534 rc = decode_modrm(ctxt, ops, &memop);
3535 if (!c->has_seg_override)
3536 set_seg_override(c, c->modrm_seg);
3537 } else if (c->d & MemAbs)
3538 rc = decode_abs(ctxt, ops, &memop);
3539 if (rc != X86EMUL_CONTINUE)
3540 goto done;
3541
3542 if (!c->has_seg_override)
3543 set_seg_override(c, VCPU_SREG_DS);
3544
3545 memop.addr.mem.seg = seg_override(ctxt, c);
3546
3547 if (memop.type == OP_MEM && c->ad_bytes != 8)
3548 memop.addr.mem.ea = (u32)memop.addr.mem.ea;
3549
3550 /*
3551 * Decode and fetch the source operand: register, memory
3552 * or immediate.
3553 */
3554 switch (c->d & SrcMask) {
3555 case SrcNone:
3556 break;
3557 case SrcReg:
3558 decode_register_operand(ctxt, &c->src, c, 0);
3559 break;
3560 case SrcMem16:
3561 memop.bytes = 2;
3562 goto srcmem_common;
3563 case SrcMem32:
3564 memop.bytes = 4;
3565 goto srcmem_common;
3566 case SrcMem:
3567 memop.bytes = (c->d & ByteOp) ? 1 :
3568 c->op_bytes;
3569 srcmem_common:
3570 c->src = memop;
3571 memopp = &c->src;
3572 break;
3573 case SrcImmU16:
3574 rc = decode_imm(ctxt, &c->src, 2, false);
3575 break;
3576 case SrcImm:
3577 rc = decode_imm(ctxt, &c->src, imm_size(c), true);
3578 break;
3579 case SrcImmU:
3580 rc = decode_imm(ctxt, &c->src, imm_size(c), false);
3581 break;
3582 case SrcImmByte:
3583 rc = decode_imm(ctxt, &c->src, 1, true);
3584 break;
3585 case SrcImmUByte:
3586 rc = decode_imm(ctxt, &c->src, 1, false);
3587 break;
3588 case SrcAcc:
3589 c->src.type = OP_REG;
3590 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
3591 c->src.addr.reg = &c->regs[VCPU_REGS_RAX];
3592 fetch_register_operand(&c->src);
3593 break;
3594 case SrcOne:
3595 c->src.bytes = 1;
3596 c->src.val = 1;
3597 break;
3598 case SrcSI:
3599 c->src.type = OP_MEM;
3600 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
3601 c->src.addr.mem.ea =
3602 register_address(c, c->regs[VCPU_REGS_RSI]);
3603 c->src.addr.mem.seg = seg_override(ctxt, c);
3604 c->src.val = 0;
3605 break;
3606 case SrcImmFAddr:
3607 c->src.type = OP_IMM;
3608 c->src.addr.mem.ea = c->eip;
3609 c->src.bytes = c->op_bytes + 2;
3610 insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip);
3611 break;
3612 case SrcMemFAddr:
3613 memop.bytes = c->op_bytes + 2;
3614 goto srcmem_common;
3615 break;
3616 case SrcDX:
3617 c->src.type = OP_REG;
3618 c->src.bytes = 2;
3619 c->src.addr.reg = &c->regs[VCPU_REGS_RDX];
3620 fetch_register_operand(&c->src);
3621 break;
3622 }
3623
3624 if (rc != X86EMUL_CONTINUE)
3625 goto done;
3626
3627 /*
3628 * Decode and fetch the second source operand: register, memory
3629 * or immediate.
3630 */
3631 switch (c->d & Src2Mask) {
3632 case Src2None:
3633 break;
3634 case Src2CL:
3635 c->src2.bytes = 1;
3636 c->src2.val = c->regs[VCPU_REGS_RCX] & 0x8;
3637 break;
3638 case Src2ImmByte:
3639 rc = decode_imm(ctxt, &c->src2, 1, true);
3640 break;
3641 case Src2One:
3642 c->src2.bytes = 1;
3643 c->src2.val = 1;
3644 break;
3645 case Src2Imm:
3646 rc = decode_imm(ctxt, &c->src2, imm_size(c), true);
3647 break;
3648 }
3649
3650 if (rc != X86EMUL_CONTINUE)
3651 goto done;
3652
3653 /* Decode and fetch the destination operand: register or memory. */
3654 switch (c->d & DstMask) {
3655 case DstReg:
3656 decode_register_operand(ctxt, &c->dst, c,
3657 c->twobyte && (c->b == 0xb6 || c->b == 0xb7));
3658 break;
3659 case DstImmUByte:
3660 c->dst.type = OP_IMM;
3661 c->dst.addr.mem.ea = c->eip;
3662 c->dst.bytes = 1;
3663 c->dst.val = insn_fetch(u8, 1, c->eip);
3664 break;
3665 case DstMem:
3666 case DstMem64:
3667 c->dst = memop;
3668 memopp = &c->dst;
3669 if ((c->d & DstMask) == DstMem64)
3670 c->dst.bytes = 8;
3671 else
3672 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
3673 if (c->d & BitOp)
3674 fetch_bit_operand(c);
3675 c->dst.orig_val = c->dst.val;
3676 break;
3677 case DstAcc:
3678 c->dst.type = OP_REG;
3679 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
3680 c->dst.addr.reg = &c->regs[VCPU_REGS_RAX];
3681 fetch_register_operand(&c->dst);
3682 c->dst.orig_val = c->dst.val;
3683 break;
3684 case DstDI:
3685 c->dst.type = OP_MEM;
3686 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
3687 c->dst.addr.mem.ea =
3688 register_address(c, c->regs[VCPU_REGS_RDI]);
3689 c->dst.addr.mem.seg = VCPU_SREG_ES;
3690 c->dst.val = 0;
3691 break;
3692 case DstDX:
3693 c->dst.type = OP_REG;
3694 c->dst.bytes = 2;
3695 c->dst.addr.reg = &c->regs[VCPU_REGS_RDX];
3696 fetch_register_operand(&c->dst);
3697 break;
3698 case ImplicitOps:
3699 /* Special instructions do their own operand decoding. */
3700 default:
3701 c->dst.type = OP_NONE; /* Disable writeback. */
3702 break;
3703 }
3704
3705done:
3706 if (memopp && memopp->type == OP_MEM && c->rip_relative)
3707 memopp->addr.mem.ea += c->eip;
3708
3709 return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
3710}
3711
3712static bool string_insn_completed(struct x86_emulate_ctxt *ctxt)
3713{
3714 struct decode_cache *c = &ctxt->decode;
3715
3716 /* The second termination condition only applies for REPE
 3717	 * and REPNE. If the repeat string operation prefix is
 3718	 * REPE/REPZ or REPNE/REPNZ, check the corresponding
 3719	 * termination condition:
3720 * - if REPE/REPZ and ZF = 0 then done
3721 * - if REPNE/REPNZ and ZF = 1 then done
3722 */
3723 if (((c->b == 0xa6) || (c->b == 0xa7) ||
3724 (c->b == 0xae) || (c->b == 0xaf))
3725 && (((c->rep_prefix == REPE_PREFIX) &&
3726 ((ctxt->eflags & EFLG_ZF) == 0))
3727 || ((c->rep_prefix == REPNE_PREFIX) &&
3728 ((ctxt->eflags & EFLG_ZF) == EFLG_ZF))))
3729 return true;
3730
3731 return false;
3732}
3733
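string_insn_completed() implements the second REP termination rule: only CMPS (0xa6/0xa7) and SCAS (0xae/0xaf) consult ZF, with REPE stopping when ZF is clear and REPNE when it is set. A standalone sketch of the rule; the prefix and flag constants mirror the emulator's values but are restated here for illustration:

#include <stdbool.h>
#include <stdio.h>

#define REPE_PREFIX   0xf3
#define REPNE_PREFIX  0xf2
#define EFLG_ZF       (1u << 6)

/* Second REP termination condition: applies only to cmps/scas opcodes.
 * REPE stops when ZF is clear, REPNE stops when ZF is set. */
static bool rep_zf_terminates(unsigned char opcode, unsigned char rep_prefix,
                              unsigned long eflags)
{
    bool is_cmps_scas = opcode == 0xa6 || opcode == 0xa7 ||
                        opcode == 0xae || opcode == 0xaf;

    if (!is_cmps_scas)
        return false;
    if (rep_prefix == REPE_PREFIX)
        return !(eflags & EFLG_ZF);
    if (rep_prefix == REPNE_PREFIX)
        return eflags & EFLG_ZF;
    return false;
}

int main(void)
{
    /* repne scasb keeps going while ZF = 0 (byte not found yet) ... */
    printf("%d\n", rep_zf_terminates(0xae, REPNE_PREFIX, 0));        /* 0 */
    /* ... and stops the moment a compare sets ZF. */
    printf("%d\n", rep_zf_terminates(0xae, REPNE_PREFIX, EFLG_ZF));  /* 1 */
    return 0;
}
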
3734int
3735x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
3736{
3737 struct x86_emulate_ops *ops = ctxt->ops;
2561 u64 msr_data; 3738 u64 msr_data;
2562 struct decode_cache *c = &ctxt->decode; 3739 struct decode_cache *c = &ctxt->decode;
2563 int rc = X86EMUL_CONTINUE; 3740 int rc = X86EMUL_CONTINUE;
2564 int saved_dst_type = c->dst.type; 3741 int saved_dst_type = c->dst.type;
3742 int irq; /* Used for int 3, int, and into */
2565 3743
2566 ctxt->decode.mem_read.pos = 0; 3744 ctxt->decode.mem_read.pos = 0;
2567 3745
2568 if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) { 3746 if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
2569 emulate_ud(ctxt); 3747 rc = emulate_ud(ctxt);
2570 goto done; 3748 goto done;
2571 } 3749 }
2572 3750
2573 /* LOCK prefix is allowed only with some instructions */ 3751 /* LOCK prefix is allowed only with some instructions */
2574 if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) { 3752 if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) {
2575 emulate_ud(ctxt); 3753 rc = emulate_ud(ctxt);
3754 goto done;
3755 }
3756
3757 if ((c->d & SrcMask) == SrcMemFAddr && c->src.type != OP_MEM) {
3758 rc = emulate_ud(ctxt);
3759 goto done;
3760 }
3761
3762 if ((c->d & Sse)
3763 && ((ops->get_cr(ctxt, 0) & X86_CR0_EM)
3764 || !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) {
3765 rc = emulate_ud(ctxt);
3766 goto done;
3767 }
3768
3769 if ((c->d & Sse) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) {
3770 rc = emulate_nm(ctxt);
2576 goto done; 3771 goto done;
2577 } 3772 }
2578 3773
3774 if (unlikely(ctxt->guest_mode) && c->intercept) {
3775 rc = emulator_check_intercept(ctxt, c->intercept,
3776 X86_ICPT_PRE_EXCEPT);
3777 if (rc != X86EMUL_CONTINUE)
3778 goto done;
3779 }
3780
2579 /* Privileged instruction can be executed only in CPL=0 */ 3781 /* Privileged instruction can be executed only in CPL=0 */
2580 if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) { 3782 if ((c->d & Priv) && ops->cpl(ctxt)) {
2581 emulate_gp(ctxt, 0); 3783 rc = emulate_gp(ctxt, 0);
2582 goto done; 3784 goto done;
2583 } 3785 }
2584 3786
3787 /* Instruction can only be executed in protected mode */
3788 if ((c->d & Prot) && !(ctxt->mode & X86EMUL_MODE_PROT)) {
3789 rc = emulate_ud(ctxt);
3790 goto done;
3791 }
3792
3793 /* Do instruction specific permission checks */
3794 if (c->check_perm) {
3795 rc = c->check_perm(ctxt);
3796 if (rc != X86EMUL_CONTINUE)
3797 goto done;
3798 }
3799
3800 if (unlikely(ctxt->guest_mode) && c->intercept) {
3801 rc = emulator_check_intercept(ctxt, c->intercept,
3802 X86_ICPT_POST_EXCEPT);
3803 if (rc != X86EMUL_CONTINUE)
3804 goto done;
3805 }
3806
2585 if (c->rep_prefix && (c->d & String)) { 3807 if (c->rep_prefix && (c->d & String)) {
2586 ctxt->restart = true;
2587 /* All REP prefixes have the same first termination condition */ 3808 /* All REP prefixes have the same first termination condition */
2588 if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) { 3809 if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) {
2589 string_done:
2590 ctxt->restart = false;
2591 ctxt->eip = c->eip; 3810 ctxt->eip = c->eip;
2592 goto done; 3811 goto done;
2593 } 3812 }
2594 /* The second termination condition only applies for REPE
2595 * and REPNE. Test if the repeat string operation prefix is
2596 * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the
2597 * corresponding termination condition according to:
2598 * - if REPE/REPZ and ZF = 0 then done
2599 * - if REPNE/REPNZ and ZF = 1 then done
2600 */
2601 if ((c->b == 0xa6) || (c->b == 0xa7) ||
2602 (c->b == 0xae) || (c->b == 0xaf)) {
2603 if ((c->rep_prefix == REPE_PREFIX) &&
2604 ((ctxt->eflags & EFLG_ZF) == 0))
2605 goto string_done;
2606 if ((c->rep_prefix == REPNE_PREFIX) &&
2607 ((ctxt->eflags & EFLG_ZF) == EFLG_ZF))
2608 goto string_done;
2609 }
2610 c->eip = ctxt->eip;
2611 } 3813 }
2612 3814
2613 if (c->src.type == OP_MEM) { 3815 if ((c->src.type == OP_MEM) && !(c->d & NoAccess)) {
2614 rc = read_emulated(ctxt, ops, (unsigned long)c->src.ptr, 3816 rc = segmented_read(ctxt, c->src.addr.mem,
2615 c->src.valptr, c->src.bytes); 3817 c->src.valptr, c->src.bytes);
2616 if (rc != X86EMUL_CONTINUE) 3818 if (rc != X86EMUL_CONTINUE)
2617 goto done; 3819 goto done;
2618 c->src.orig_val64 = c->src.val64; 3820 c->src.orig_val64 = c->src.val64;
2619 } 3821 }
2620 3822
2621 if (c->src2.type == OP_MEM) { 3823 if (c->src2.type == OP_MEM) {
2622 rc = read_emulated(ctxt, ops, (unsigned long)c->src2.ptr, 3824 rc = segmented_read(ctxt, c->src2.addr.mem,
2623 &c->src2.val, c->src2.bytes); 3825 &c->src2.val, c->src2.bytes);
2624 if (rc != X86EMUL_CONTINUE) 3826 if (rc != X86EMUL_CONTINUE)
2625 goto done; 3827 goto done;
2626 } 3828 }
@@ -2631,7 +3833,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
2631 3833
2632 if ((c->dst.type == OP_MEM) && !(c->d & Mov)) { 3834 if ((c->dst.type == OP_MEM) && !(c->d & Mov)) {
2633 /* optimisation - avoid slow emulated read if Mov */ 3835 /* optimisation - avoid slow emulated read if Mov */
2634 rc = read_emulated(ctxt, ops, (unsigned long)c->dst.ptr, 3836 rc = segmented_read(ctxt, c->dst.addr.mem,
2635 &c->dst.val, c->dst.bytes); 3837 &c->dst.val, c->dst.bytes);
2636 if (rc != X86EMUL_CONTINUE) 3838 if (rc != X86EMUL_CONTINUE)
2637 goto done; 3839 goto done;
@@ -2640,68 +3842,44 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
2640 3842
2641special_insn: 3843special_insn:
2642 3844
3845 if (unlikely(ctxt->guest_mode) && c->intercept) {
3846 rc = emulator_check_intercept(ctxt, c->intercept,
3847 X86_ICPT_POST_MEMACCESS);
3848 if (rc != X86EMUL_CONTINUE)
3849 goto done;
3850 }
3851
3852 if (c->execute) {
3853 rc = c->execute(ctxt);
3854 if (rc != X86EMUL_CONTINUE)
3855 goto done;
3856 goto writeback;
3857 }
3858
2643 if (c->twobyte) 3859 if (c->twobyte)
2644 goto twobyte_insn; 3860 goto twobyte_insn;
2645 3861
2646 switch (c->b) { 3862 switch (c->b) {
2647 case 0x00 ... 0x05:
2648 add: /* add */
2649 emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
2650 break;
2651 case 0x06: /* push es */ 3863 case 0x06: /* push es */
2652 emulate_push_sreg(ctxt, ops, VCPU_SREG_ES); 3864 rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_ES);
2653 break; 3865 break;
2654 case 0x07: /* pop es */ 3866 case 0x07: /* pop es */
2655 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES); 3867 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES);
2656 if (rc != X86EMUL_CONTINUE)
2657 goto done;
2658 break;
2659 case 0x08 ... 0x0d:
2660 or: /* or */
2661 emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
2662 break; 3868 break;
2663 case 0x0e: /* push cs */ 3869 case 0x0e: /* push cs */
2664 emulate_push_sreg(ctxt, ops, VCPU_SREG_CS); 3870 rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_CS);
2665 break;
2666 case 0x10 ... 0x15:
2667 adc: /* adc */
2668 emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags);
2669 break; 3871 break;
2670 case 0x16: /* push ss */ 3872 case 0x16: /* push ss */
2671 emulate_push_sreg(ctxt, ops, VCPU_SREG_SS); 3873 rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_SS);
2672 break; 3874 break;
2673 case 0x17: /* pop ss */ 3875 case 0x17: /* pop ss */
2674 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS); 3876 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS);
2675 if (rc != X86EMUL_CONTINUE)
2676 goto done;
2677 break;
2678 case 0x18 ... 0x1d:
2679 sbb: /* sbb */
2680 emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags);
2681 break; 3877 break;
2682 case 0x1e: /* push ds */ 3878 case 0x1e: /* push ds */
2683 emulate_push_sreg(ctxt, ops, VCPU_SREG_DS); 3879 rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_DS);
2684 break; 3880 break;
2685 case 0x1f: /* pop ds */ 3881 case 0x1f: /* pop ds */
2686 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS); 3882 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS);
2687 if (rc != X86EMUL_CONTINUE)
2688 goto done;
2689 break;
2690 case 0x20 ... 0x25:
2691 and: /* and */
2692 emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags);
2693 break;
2694 case 0x28 ... 0x2d:
2695 sub: /* sub */
2696 emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags);
2697 break;
2698 case 0x30 ... 0x35:
2699 xor: /* xor */
2700 emulate_2op_SrcV("xor", c->src, c->dst, ctxt->eflags);
2701 break;
2702 case 0x38 ... 0x3d:
2703 cmp: /* cmp */
2704 emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
2705 break; 3883 break;
2706 case 0x40 ... 0x47: /* inc r16/r32 */ 3884 case 0x40 ... 0x47: /* inc r16/r32 */
2707 emulate_1op("inc", c->dst, ctxt->eflags); 3885 emulate_1op("inc", c->dst, ctxt->eflags);
@@ -2709,83 +3887,24 @@ special_insn:
2709 case 0x48 ... 0x4f: /* dec r16/r32 */ 3887 case 0x48 ... 0x4f: /* dec r16/r32 */
2710 emulate_1op("dec", c->dst, ctxt->eflags); 3888 emulate_1op("dec", c->dst, ctxt->eflags);
2711 break; 3889 break;
2712 case 0x50 ... 0x57: /* push reg */
2713 emulate_push(ctxt, ops);
2714 break;
2715 case 0x58 ... 0x5f: /* pop reg */
2716 pop_instruction:
2717 rc = emulate_pop(ctxt, ops, &c->dst.val, c->op_bytes);
2718 if (rc != X86EMUL_CONTINUE)
2719 goto done;
2720 break;
2721 case 0x60: /* pusha */
2722 rc = emulate_pusha(ctxt, ops);
2723 if (rc != X86EMUL_CONTINUE)
2724 goto done;
2725 break;
2726 case 0x61: /* popa */
2727 rc = emulate_popa(ctxt, ops);
2728 if (rc != X86EMUL_CONTINUE)
2729 goto done;
2730 break;
2731 case 0x63: /* movsxd */ 3890 case 0x63: /* movsxd */
2732 if (ctxt->mode != X86EMUL_MODE_PROT64) 3891 if (ctxt->mode != X86EMUL_MODE_PROT64)
2733 goto cannot_emulate; 3892 goto cannot_emulate;
2734 c->dst.val = (s32) c->src.val; 3893 c->dst.val = (s32) c->src.val;
2735 break; 3894 break;
2736 case 0x68: /* push imm */
2737 case 0x6a: /* push imm8 */
2738 emulate_push(ctxt, ops);
2739 break;
2740 case 0x6c: /* insb */ 3895 case 0x6c: /* insb */
2741 case 0x6d: /* insw/insd */ 3896 case 0x6d: /* insw/insd */
2742 c->dst.bytes = min(c->dst.bytes, 4u); 3897 c->src.val = c->regs[VCPU_REGS_RDX];
2743 if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], 3898 goto do_io_in;
2744 c->dst.bytes)) {
2745 emulate_gp(ctxt, 0);
2746 goto done;
2747 }
2748 if (!pio_in_emulated(ctxt, ops, c->dst.bytes,
2749 c->regs[VCPU_REGS_RDX], &c->dst.val))
2750 goto done; /* IO is needed, skip writeback */
2751 break;
2752 case 0x6e: /* outsb */ 3899 case 0x6e: /* outsb */
2753 case 0x6f: /* outsw/outsd */ 3900 case 0x6f: /* outsw/outsd */
2754 c->src.bytes = min(c->src.bytes, 4u); 3901 c->dst.val = c->regs[VCPU_REGS_RDX];
2755 if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], 3902 goto do_io_out;
2756 c->src.bytes)) {
2757 emulate_gp(ctxt, 0);
2758 goto done;
2759 }
2760 ops->pio_out_emulated(c->src.bytes, c->regs[VCPU_REGS_RDX],
2761 &c->src.val, 1, ctxt->vcpu);
2762
2763 c->dst.type = OP_NONE; /* nothing to writeback */
2764 break; 3903 break;
2765 case 0x70 ... 0x7f: /* jcc (short) */ 3904 case 0x70 ... 0x7f: /* jcc (short) */
2766 if (test_cc(c->b, ctxt->eflags)) 3905 if (test_cc(c->b, ctxt->eflags))
2767 jmp_rel(c, c->src.val); 3906 jmp_rel(c, c->src.val);
2768 break; 3907 break;
2769 case 0x80 ... 0x83: /* Grp1 */
2770 switch (c->modrm_reg) {
2771 case 0:
2772 goto add;
2773 case 1:
2774 goto or;
2775 case 2:
2776 goto adc;
2777 case 3:
2778 goto sbb;
2779 case 4:
2780 goto and;
2781 case 5:
2782 goto sub;
2783 case 6:
2784 goto xor;
2785 case 7:
2786 goto cmp;
2787 }
2788 break;
2789 case 0x84 ... 0x85: 3908 case 0x84 ... 0x85:
2790 test: 3909 test:
2791 emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); 3910 emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
@@ -2793,38 +3912,24 @@ special_insn:
2793 case 0x86 ... 0x87: /* xchg */ 3912 case 0x86 ... 0x87: /* xchg */
2794 xchg: 3913 xchg:
2795 /* Write back the register source. */ 3914 /* Write back the register source. */
2796 switch (c->dst.bytes) { 3915 c->src.val = c->dst.val;
2797 case 1: 3916 write_register_operand(&c->src);
2798 *(u8 *) c->src.ptr = (u8) c->dst.val;
2799 break;
2800 case 2:
2801 *(u16 *) c->src.ptr = (u16) c->dst.val;
2802 break;
2803 case 4:
2804 *c->src.ptr = (u32) c->dst.val;
2805 break; /* 64b reg: zero-extend */
2806 case 8:
2807 *c->src.ptr = c->dst.val;
2808 break;
2809 }
2810 /* 3917 /*
2811 * Write back the memory destination with implicit LOCK 3918 * Write back the memory destination with implicit LOCK
2812 * prefix. 3919 * prefix.
2813 */ 3920 */
2814 c->dst.val = c->src.val; 3921 c->dst.val = c->src.orig_val;
2815 c->lock_prefix = 1; 3922 c->lock_prefix = 1;
2816 break; 3923 break;
2817 case 0x88 ... 0x8b: /* mov */
2818 goto mov;
2819 case 0x8c: /* mov r/m, sreg */ 3924 case 0x8c: /* mov r/m, sreg */
2820 if (c->modrm_reg > VCPU_SREG_GS) { 3925 if (c->modrm_reg > VCPU_SREG_GS) {
2821 emulate_ud(ctxt); 3926 rc = emulate_ud(ctxt);
2822 goto done; 3927 goto done;
2823 } 3928 }
2824 c->dst.val = ops->get_segment_selector(c->modrm_reg, ctxt->vcpu); 3929 c->dst.val = get_segment_selector(ctxt, c->modrm_reg);
2825 break; 3930 break;
2826 case 0x8d: /* lea r16/r32, m */ 3931 case 0x8d: /* lea r16/r32, m */
2827 c->dst.val = c->modrm_ea; 3932 c->dst.val = c->src.addr.mem.ea;
2828 break; 3933 break;
2829 case 0x8e: { /* mov seg, r/m16 */ 3934 case 0x8e: { /* mov seg, r/m16 */
2830 uint16_t sel; 3935 uint16_t sel;
@@ -2833,7 +3938,7 @@ special_insn:
2833 3938
2834 if (c->modrm_reg == VCPU_SREG_CS || 3939 if (c->modrm_reg == VCPU_SREG_CS ||
2835 c->modrm_reg > VCPU_SREG_GS) { 3940 c->modrm_reg > VCPU_SREG_GS) {
2836 emulate_ud(ctxt); 3941 rc = emulate_ud(ctxt);
2837 goto done; 3942 goto done;
2838 } 3943 }
2839 3944
@@ -2846,76 +3951,72 @@ special_insn:
2846 break; 3951 break;
2847 } 3952 }
2848 case 0x8f: /* pop (sole member of Grp1a) */ 3953 case 0x8f: /* pop (sole member of Grp1a) */
2849 rc = emulate_grp1a(ctxt, ops); 3954 rc = em_grp1a(ctxt);
2850 if (rc != X86EMUL_CONTINUE)
2851 goto done;
2852 break; 3955 break;
2853 case 0x90: /* nop / xchg r8,rax */ 3956 case 0x90 ... 0x97: /* nop / xchg reg, rax */
2854 if (c->dst.ptr == (unsigned long *)&c->regs[VCPU_REGS_RAX]) { 3957 if (c->dst.addr.reg == &c->regs[VCPU_REGS_RAX])
2855 c->dst.type = OP_NONE; /* nop */
2856 break; 3958 break;
2857 }
2858 case 0x91 ... 0x97: /* xchg reg,rax */
2859 c->src.type = OP_REG;
2860 c->src.bytes = c->op_bytes;
2861 c->src.ptr = (unsigned long *) &c->regs[VCPU_REGS_RAX];
2862 c->src.val = *(c->src.ptr);
2863 goto xchg; 3959 goto xchg;
2864 case 0x9c: /* pushf */ 3960 case 0x98: /* cbw/cwde/cdqe */
2865 c->src.val = (unsigned long) ctxt->eflags; 3961 switch (c->op_bytes) {
2866 emulate_push(ctxt, ops); 3962 case 2: c->dst.val = (s8)c->dst.val; break;
2867 break; 3963 case 4: c->dst.val = (s16)c->dst.val; break;
2868 case 0x9d: /* popf */ 3964 case 8: c->dst.val = (s32)c->dst.val; break;
2869 c->dst.type = OP_REG; 3965 }
2870 c->dst.ptr = (unsigned long *) &ctxt->eflags;
2871 c->dst.bytes = c->op_bytes;
2872 rc = emulate_popf(ctxt, ops, &c->dst.val, c->op_bytes);
2873 if (rc != X86EMUL_CONTINUE)
2874 goto done;
2875 break; 3966 break;
2876 case 0xa0 ... 0xa3: /* mov */
2877 case 0xa4 ... 0xa5: /* movs */
2878 goto mov;
2879 case 0xa6 ... 0xa7: /* cmps */
2880 c->dst.type = OP_NONE; /* Disable writeback. */
2881 DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr);
2882 goto cmp;
2883 case 0xa8 ... 0xa9: /* test ax, imm */ 3967 case 0xa8 ... 0xa9: /* test ax, imm */
2884 goto test; 3968 goto test;
2885 case 0xaa ... 0xab: /* stos */
2886 c->dst.val = c->regs[VCPU_REGS_RAX];
2887 break;
2888 case 0xac ... 0xad: /* lods */
2889 goto mov;
2890 case 0xae ... 0xaf: /* scas */
2891 DPRINTF("Urk! I don't handle SCAS.\n");
2892 goto cannot_emulate;
2893 case 0xb0 ... 0xbf: /* mov r, imm */
2894 goto mov;
2895 case 0xc0 ... 0xc1: 3969 case 0xc0 ... 0xc1:
2896 emulate_grp2(ctxt); 3970 rc = em_grp2(ctxt);
2897 break; 3971 break;
2898 case 0xc3: /* ret */ 3972 case 0xc3: /* ret */
2899 c->dst.type = OP_REG; 3973 c->dst.type = OP_REG;
2900 c->dst.ptr = &c->eip; 3974 c->dst.addr.reg = &c->eip;
2901 c->dst.bytes = c->op_bytes; 3975 c->dst.bytes = c->op_bytes;
2902 goto pop_instruction; 3976 rc = em_pop(ctxt);
2903 case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */ 3977 break;
2904 mov: 3978 case 0xc4: /* les */
2905 c->dst.val = c->src.val; 3979 rc = emulate_load_segment(ctxt, ops, VCPU_SREG_ES);
3980 break;
3981 case 0xc5: /* lds */
3982 rc = emulate_load_segment(ctxt, ops, VCPU_SREG_DS);
2906 break; 3983 break;
2907 case 0xcb: /* ret far */ 3984 case 0xcb: /* ret far */
2908 rc = emulate_ret_far(ctxt, ops); 3985 rc = emulate_ret_far(ctxt, ops);
2909 if (rc != X86EMUL_CONTINUE) 3986 break;
2910 goto done; 3987 case 0xcc: /* int3 */
3988 irq = 3;
3989 goto do_interrupt;
3990 case 0xcd: /* int n */
3991 irq = c->src.val;
3992 do_interrupt:
3993 rc = emulate_int(ctxt, ops, irq);
3994 break;
3995 case 0xce: /* into */
3996 if (ctxt->eflags & EFLG_OF) {
3997 irq = 4;
3998 goto do_interrupt;
3999 }
4000 break;
4001 case 0xcf: /* iret */
4002 rc = emulate_iret(ctxt, ops);
2911 break; 4003 break;
2912 case 0xd0 ... 0xd1: /* Grp2 */ 4004 case 0xd0 ... 0xd1: /* Grp2 */
2913 c->src.val = 1; 4005 rc = em_grp2(ctxt);
2914 emulate_grp2(ctxt);
2915 break; 4006 break;
2916 case 0xd2 ... 0xd3: /* Grp2 */ 4007 case 0xd2 ... 0xd3: /* Grp2 */
2917 c->src.val = c->regs[VCPU_REGS_RCX]; 4008 c->src.val = c->regs[VCPU_REGS_RCX];
2918 emulate_grp2(ctxt); 4009 rc = em_grp2(ctxt);
4010 break;
4011 case 0xe0 ... 0xe2: /* loop/loopz/loopnz */
4012 register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1);
4013 if (address_mask(c, c->regs[VCPU_REGS_RCX]) != 0 &&
4014 (c->b == 0xe2 || test_cc(c->b ^ 0x5, ctxt->eflags)))
4015 jmp_rel(c, c->src.val);
4016 break;
4017 case 0xe3: /* jcxz/jecxz/jrcxz */
4018 if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0)
4019 jmp_rel(c, c->src.val);
2919 break; 4020 break;
2920 case 0xe4: /* inb */ 4021 case 0xe4: /* inb */
2921 case 0xe5: /* in */ 4022 case 0xe5: /* in */
@@ -2927,23 +4028,14 @@ special_insn:
2927 long int rel = c->src.val; 4028 long int rel = c->src.val;
2928 c->src.val = (unsigned long) c->eip; 4029 c->src.val = (unsigned long) c->eip;
2929 jmp_rel(c, rel); 4030 jmp_rel(c, rel);
2930 emulate_push(ctxt, ops); 4031 rc = em_push(ctxt);
2931 break; 4032 break;
2932 } 4033 }
2933 case 0xe9: /* jmp rel */ 4034 case 0xe9: /* jmp rel */
2934 goto jmp; 4035 goto jmp;
2935 case 0xea: { /* jmp far */ 4036 case 0xea: /* jmp far */
2936 unsigned short sel; 4037 rc = em_jmp_far(ctxt);
2937 jump_far:
2938 memcpy(&sel, c->src.valptr + c->op_bytes, 2);
2939
2940 if (load_segment_descriptor(ctxt, ops, sel, VCPU_SREG_CS))
2941 goto done;
2942
2943 c->eip = 0;
2944 memcpy(&c->eip, c->src.valptr, c->op_bytes);
2945 break; 4038 break;
2946 }
2947 case 0xeb: 4039 case 0xeb:
2948 jmp: /* jmp rel short */ 4040 jmp: /* jmp rel short */
2949 jmp_rel(c, c->src.val); 4041 jmp_rel(c, c->src.val);
@@ -2951,87 +4043,71 @@ special_insn:
2951 break; 4043 break;
2952 case 0xec: /* in al,dx */ 4044 case 0xec: /* in al,dx */
2953 case 0xed: /* in (e/r)ax,dx */ 4045 case 0xed: /* in (e/r)ax,dx */
2954 c->src.val = c->regs[VCPU_REGS_RDX];
2955 do_io_in: 4046 do_io_in:
2956 c->dst.bytes = min(c->dst.bytes, 4u);
2957 if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) {
2958 emulate_gp(ctxt, 0);
2959 goto done;
2960 }
2961 if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val, 4047 if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val,
2962 &c->dst.val)) 4048 &c->dst.val))
2963 goto done; /* IO is needed */ 4049 goto done; /* IO is needed */
2964 break; 4050 break;
2965 case 0xee: /* out dx,al */ 4051 case 0xee: /* out dx,al */
2966 case 0xef: /* out dx,(e/r)ax */ 4052 case 0xef: /* out dx,(e/r)ax */
2967 c->src.val = c->regs[VCPU_REGS_RDX];
2968 do_io_out: 4053 do_io_out:
2969 c->dst.bytes = min(c->dst.bytes, 4u); 4054 ops->pio_out_emulated(ctxt, c->src.bytes, c->dst.val,
2970 if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) { 4055 &c->src.val, 1);
2971 emulate_gp(ctxt, 0);
2972 goto done;
2973 }
2974 ops->pio_out_emulated(c->dst.bytes, c->src.val, &c->dst.val, 1,
2975 ctxt->vcpu);
2976 c->dst.type = OP_NONE; /* Disable writeback. */ 4056 c->dst.type = OP_NONE; /* Disable writeback. */
2977 break; 4057 break;
2978 case 0xf4: /* hlt */ 4058 case 0xf4: /* hlt */
2979 ctxt->vcpu->arch.halt_request = 1; 4059 ctxt->ops->halt(ctxt);
2980 break; 4060 break;
2981 case 0xf5: /* cmc */ 4061 case 0xf5: /* cmc */
2982 /* complement carry flag from eflags reg */ 4062 /* complement carry flag from eflags reg */
2983 ctxt->eflags ^= EFLG_CF; 4063 ctxt->eflags ^= EFLG_CF;
2984 c->dst.type = OP_NONE; /* Disable writeback. */
2985 break; 4064 break;
2986 case 0xf6 ... 0xf7: /* Grp3 */ 4065 case 0xf6 ... 0xf7: /* Grp3 */
2987 if (!emulate_grp3(ctxt, ops)) 4066 rc = em_grp3(ctxt);
2988 goto cannot_emulate;
2989 break; 4067 break;
2990 case 0xf8: /* clc */ 4068 case 0xf8: /* clc */
2991 ctxt->eflags &= ~EFLG_CF; 4069 ctxt->eflags &= ~EFLG_CF;
2992 c->dst.type = OP_NONE; /* Disable writeback. */ 4070 break;
4071 case 0xf9: /* stc */
4072 ctxt->eflags |= EFLG_CF;
2993 break; 4073 break;
2994 case 0xfa: /* cli */ 4074 case 0xfa: /* cli */
2995 if (emulator_bad_iopl(ctxt, ops)) { 4075 if (emulator_bad_iopl(ctxt, ops)) {
2996 emulate_gp(ctxt, 0); 4076 rc = emulate_gp(ctxt, 0);
2997 goto done; 4077 goto done;
2998 } else { 4078 } else
2999 ctxt->eflags &= ~X86_EFLAGS_IF; 4079 ctxt->eflags &= ~X86_EFLAGS_IF;
3000 c->dst.type = OP_NONE; /* Disable writeback. */
3001 }
3002 break; 4080 break;
3003 case 0xfb: /* sti */ 4081 case 0xfb: /* sti */
3004 if (emulator_bad_iopl(ctxt, ops)) { 4082 if (emulator_bad_iopl(ctxt, ops)) {
3005 emulate_gp(ctxt, 0); 4083 rc = emulate_gp(ctxt, 0);
3006 goto done; 4084 goto done;
3007 } else { 4085 } else {
3008 ctxt->interruptibility = KVM_X86_SHADOW_INT_STI; 4086 ctxt->interruptibility = KVM_X86_SHADOW_INT_STI;
3009 ctxt->eflags |= X86_EFLAGS_IF; 4087 ctxt->eflags |= X86_EFLAGS_IF;
3010 c->dst.type = OP_NONE; /* Disable writeback. */
3011 } 4088 }
3012 break; 4089 break;
3013 case 0xfc: /* cld */ 4090 case 0xfc: /* cld */
3014 ctxt->eflags &= ~EFLG_DF; 4091 ctxt->eflags &= ~EFLG_DF;
3015 c->dst.type = OP_NONE; /* Disable writeback. */
3016 break; 4092 break;
3017 case 0xfd: /* std */ 4093 case 0xfd: /* std */
3018 ctxt->eflags |= EFLG_DF; 4094 ctxt->eflags |= EFLG_DF;
3019 c->dst.type = OP_NONE; /* Disable writeback. */
3020 break; 4095 break;
3021 case 0xfe: /* Grp4 */ 4096 case 0xfe: /* Grp4 */
3022 grp45: 4097 rc = em_grp45(ctxt);
3023 rc = emulate_grp45(ctxt, ops);
3024 if (rc != X86EMUL_CONTINUE)
3025 goto done;
3026 break; 4098 break;
3027 case 0xff: /* Grp5 */ 4099 case 0xff: /* Grp5 */
3028 if (c->modrm_reg == 5) 4100 rc = em_grp45(ctxt);
3029 goto jump_far; 4101 break;
3030 goto grp45; 4102 default:
4103 goto cannot_emulate;
3031 } 4104 }
3032 4105
4106 if (rc != X86EMUL_CONTINUE)
4107 goto done;
4108
3033writeback: 4109writeback:
3034 rc = writeback(ctxt, ops); 4110 rc = writeback(ctxt);
3035 if (rc != X86EMUL_CONTINUE) 4111 if (rc != X86EMUL_CONTINUE)
3036 goto done; 4112 goto done;
3037 4113
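The cli/sti cases above gate any change of EFLAGS.IF on the I/O privilege level: in protected mode the instructions take #GP(0) when CPL > IOPL. A minimal standalone restatement of that check (this is not the kernel's emulator_bad_iopl(), which also handles real and virtual-8086 modes; the CPL is assumed to come from the current code segment):

/* Sketch only: CLI/STI may toggle EFLAGS.IF when CPL <= IOPL
 * (IOPL lives in bits 12-13 of EFLAGS; CPL 0 is most privileged). */
static int iopl_permits_if_change(unsigned int cpl, unsigned long eflags)
{
        unsigned int iopl = (eflags >> 12) & 3;

        return cpl <= iopl;
}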
@@ -3042,165 +4118,82 @@ writeback:
3042 c->dst.type = saved_dst_type; 4118 c->dst.type = saved_dst_type;
3043 4119
3044 if ((c->d & SrcMask) == SrcSI) 4120 if ((c->d & SrcMask) == SrcSI)
3045 string_addr_inc(ctxt, seg_override_base(ctxt, ops, c), 4121 string_addr_inc(ctxt, seg_override(ctxt, c),
3046 VCPU_REGS_RSI, &c->src); 4122 VCPU_REGS_RSI, &c->src);
3047 4123
3048 if ((c->d & DstMask) == DstDI) 4124 if ((c->d & DstMask) == DstDI)
3049 string_addr_inc(ctxt, es_base(ctxt, ops), VCPU_REGS_RDI, 4125 string_addr_inc(ctxt, VCPU_SREG_ES, VCPU_REGS_RDI,
3050 &c->dst); 4126 &c->dst);
3051 4127
3052 if (c->rep_prefix && (c->d & String)) { 4128 if (c->rep_prefix && (c->d & String)) {
3053 struct read_cache *rc = &ctxt->decode.io_read; 4129 struct read_cache *r = &ctxt->decode.io_read;
3054 register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1); 4130 register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1);
3055 /* 4131
3056 * Re-enter guest when pio read ahead buffer is empty or, 4132 if (!string_insn_completed(ctxt)) {
3057 * if it is not used, after each 1024 iteration. 4133 /*
3058 */ 4134 * Re-enter guest when pio read ahead buffer is empty
3059 if ((rc->end == 0 && !(c->regs[VCPU_REGS_RCX] & 0x3ff)) || 4135 * or, if it is not used, after each 1024 iteration.
3060 (rc->end != 0 && rc->end == rc->pos)) 4136 */
3061 ctxt->restart = false; 4137 if ((r->end != 0 || c->regs[VCPU_REGS_RCX] & 0x3ff) &&
4138 (r->end == 0 || r->end != r->pos)) {
4139 /*
4140 * Reset read cache. Usually happens before
4141 * decode, but since instruction is restarted
4142 * we have to do it here.
4143 */
4144 ctxt->decode.mem_read.end = 0;
4145 return EMULATION_RESTART;
4146 }
4147 goto done; /* skip rip writeback */
4148 }
3062 } 4149 }
3063 /* 4150
3064 * reset read cache here in case string instruction is restared
3065 * without decoding
3066 */
3067 ctxt->decode.mem_read.end = 0;
3068 ctxt->eip = c->eip; 4151 ctxt->eip = c->eip;
3069 4152
3070done: 4153done:
3071 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; 4154 if (rc == X86EMUL_PROPAGATE_FAULT)
4155 ctxt->have_exception = true;
4156 if (rc == X86EMUL_INTERCEPTED)
4157 return EMULATION_INTERCEPTED;
4158
4159 return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
3072 4160
3073twobyte_insn: 4161twobyte_insn:
3074 switch (c->b) { 4162 switch (c->b) {
3075 case 0x01: /* lgdt, lidt, lmsw */
3076 switch (c->modrm_reg) {
3077 u16 size;
3078 unsigned long address;
3079
3080 case 0: /* vmcall */
3081 if (c->modrm_mod != 3 || c->modrm_rm != 1)
3082 goto cannot_emulate;
3083
3084 rc = kvm_fix_hypercall(ctxt->vcpu);
3085 if (rc != X86EMUL_CONTINUE)
3086 goto done;
3087
3088 /* Let the processor re-execute the fixed hypercall */
3089 c->eip = ctxt->eip;
3090 /* Disable writeback. */
3091 c->dst.type = OP_NONE;
3092 break;
3093 case 2: /* lgdt */
3094 rc = read_descriptor(ctxt, ops, c->src.ptr,
3095 &size, &address, c->op_bytes);
3096 if (rc != X86EMUL_CONTINUE)
3097 goto done;
3098 realmode_lgdt(ctxt->vcpu, size, address);
3099 /* Disable writeback. */
3100 c->dst.type = OP_NONE;
3101 break;
3102 case 3: /* lidt/vmmcall */
3103 if (c->modrm_mod == 3) {
3104 switch (c->modrm_rm) {
3105 case 1:
3106 rc = kvm_fix_hypercall(ctxt->vcpu);
3107 if (rc != X86EMUL_CONTINUE)
3108 goto done;
3109 break;
3110 default:
3111 goto cannot_emulate;
3112 }
3113 } else {
3114 rc = read_descriptor(ctxt, ops, c->src.ptr,
3115 &size, &address,
3116 c->op_bytes);
3117 if (rc != X86EMUL_CONTINUE)
3118 goto done;
3119 realmode_lidt(ctxt->vcpu, size, address);
3120 }
3121 /* Disable writeback. */
3122 c->dst.type = OP_NONE;
3123 break;
3124 case 4: /* smsw */
3125 c->dst.bytes = 2;
3126 c->dst.val = ops->get_cr(0, ctxt->vcpu);
3127 break;
3128 case 6: /* lmsw */
3129 ops->set_cr(0, (ops->get_cr(0, ctxt->vcpu) & ~0x0ful) |
3130 (c->src.val & 0x0f), ctxt->vcpu);
3131 c->dst.type = OP_NONE;
3132 break;
3133 case 5: /* not defined */
3134 emulate_ud(ctxt);
3135 goto done;
3136 case 7: /* invlpg*/
3137 emulate_invlpg(ctxt->vcpu, c->modrm_ea);
3138 /* Disable writeback. */
3139 c->dst.type = OP_NONE;
3140 break;
3141 default:
3142 goto cannot_emulate;
3143 }
3144 break;
3145 case 0x05: /* syscall */ 4163 case 0x05: /* syscall */
3146 rc = emulate_syscall(ctxt, ops); 4164 rc = emulate_syscall(ctxt, ops);
3147 if (rc != X86EMUL_CONTINUE)
3148 goto done;
3149 else
3150 goto writeback;
3151 break; 4165 break;
3152 case 0x06: 4166 case 0x06:
3153 emulate_clts(ctxt->vcpu); 4167 rc = em_clts(ctxt);
3154 c->dst.type = OP_NONE;
3155 break; 4168 break;
3156 case 0x09: /* wbinvd */ 4169 case 0x09: /* wbinvd */
3157 kvm_emulate_wbinvd(ctxt->vcpu); 4170 (ctxt->ops->wbinvd)(ctxt);
3158 c->dst.type = OP_NONE;
3159 break; 4171 break;
3160 case 0x08: /* invd */ 4172 case 0x08: /* invd */
3161 case 0x0d: /* GrpP (prefetch) */ 4173 case 0x0d: /* GrpP (prefetch) */
3162 case 0x18: /* Grp16 (prefetch/nop) */ 4174 case 0x18: /* Grp16 (prefetch/nop) */
3163 c->dst.type = OP_NONE;
3164 break; 4175 break;
3165 case 0x20: /* mov cr, reg */ 4176 case 0x20: /* mov cr, reg */
3166 switch (c->modrm_reg) { 4177 c->dst.val = ops->get_cr(ctxt, c->modrm_reg);
3167 case 1:
3168 case 5 ... 7:
3169 case 9 ... 15:
3170 emulate_ud(ctxt);
3171 goto done;
3172 }
3173 c->regs[c->modrm_rm] = ops->get_cr(c->modrm_reg, ctxt->vcpu);
3174 c->dst.type = OP_NONE; /* no writeback */
3175 break; 4178 break;
3176 case 0x21: /* mov from dr to reg */ 4179 case 0x21: /* mov from dr to reg */
3177 if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && 4180 ops->get_dr(ctxt, c->modrm_reg, &c->dst.val);
3178 (c->modrm_reg == 4 || c->modrm_reg == 5)) {
3179 emulate_ud(ctxt);
3180 goto done;
3181 }
3182 ops->get_dr(c->modrm_reg, &c->regs[c->modrm_rm], ctxt->vcpu);
3183 c->dst.type = OP_NONE; /* no writeback */
3184 break; 4181 break;
3185 case 0x22: /* mov reg, cr */ 4182 case 0x22: /* mov reg, cr */
3186 if (ops->set_cr(c->modrm_reg, c->modrm_val, ctxt->vcpu)) { 4183 if (ops->set_cr(ctxt, c->modrm_reg, c->src.val)) {
3187 emulate_gp(ctxt, 0); 4184 emulate_gp(ctxt, 0);
4185 rc = X86EMUL_PROPAGATE_FAULT;
3188 goto done; 4186 goto done;
3189 } 4187 }
3190 c->dst.type = OP_NONE; 4188 c->dst.type = OP_NONE;
3191 break; 4189 break;
3192 case 0x23: /* mov from reg to dr */ 4190 case 0x23: /* mov from reg to dr */
3193 if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && 4191 if (ops->set_dr(ctxt, c->modrm_reg, c->src.val &
3194 (c->modrm_reg == 4 || c->modrm_reg == 5)) {
3195 emulate_ud(ctxt);
3196 goto done;
3197 }
3198
3199 if (ops->set_dr(c->modrm_reg, c->regs[c->modrm_rm] &
3200 ((ctxt->mode == X86EMUL_MODE_PROT64) ? 4192 ((ctxt->mode == X86EMUL_MODE_PROT64) ?
3201 ~0ULL : ~0U), ctxt->vcpu) < 0) { 4193 ~0ULL : ~0U)) < 0) {
3202 /* #UD condition is already handled by the code above */ 4194 /* #UD condition is already handled by the code above */
3203 emulate_gp(ctxt, 0); 4195 emulate_gp(ctxt, 0);
4196 rc = X86EMUL_PROPAGATE_FAULT;
3204 goto done; 4197 goto done;
3205 } 4198 }
3206 4199
@@ -3210,38 +4203,30 @@ twobyte_insn:
3210 /* wrmsr */ 4203 /* wrmsr */
3211 msr_data = (u32)c->regs[VCPU_REGS_RAX] 4204 msr_data = (u32)c->regs[VCPU_REGS_RAX]
3212 | ((u64)c->regs[VCPU_REGS_RDX] << 32); 4205 | ((u64)c->regs[VCPU_REGS_RDX] << 32);
3213 if (ops->set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) { 4206 if (ops->set_msr(ctxt, c->regs[VCPU_REGS_RCX], msr_data)) {
3214 emulate_gp(ctxt, 0); 4207 emulate_gp(ctxt, 0);
4208 rc = X86EMUL_PROPAGATE_FAULT;
3215 goto done; 4209 goto done;
3216 } 4210 }
3217 rc = X86EMUL_CONTINUE; 4211 rc = X86EMUL_CONTINUE;
3218 c->dst.type = OP_NONE;
3219 break; 4212 break;
3220 case 0x32: 4213 case 0x32:
3221 /* rdmsr */ 4214 /* rdmsr */
3222 if (ops->get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) { 4215 if (ops->get_msr(ctxt, c->regs[VCPU_REGS_RCX], &msr_data)) {
3223 emulate_gp(ctxt, 0); 4216 emulate_gp(ctxt, 0);
4217 rc = X86EMUL_PROPAGATE_FAULT;
3224 goto done; 4218 goto done;
3225 } else { 4219 } else {
3226 c->regs[VCPU_REGS_RAX] = (u32)msr_data; 4220 c->regs[VCPU_REGS_RAX] = (u32)msr_data;
3227 c->regs[VCPU_REGS_RDX] = msr_data >> 32; 4221 c->regs[VCPU_REGS_RDX] = msr_data >> 32;
3228 } 4222 }
3229 rc = X86EMUL_CONTINUE; 4223 rc = X86EMUL_CONTINUE;
3230 c->dst.type = OP_NONE;
3231 break; 4224 break;
3232 case 0x34: /* sysenter */ 4225 case 0x34: /* sysenter */
3233 rc = emulate_sysenter(ctxt, ops); 4226 rc = emulate_sysenter(ctxt, ops);
3234 if (rc != X86EMUL_CONTINUE)
3235 goto done;
3236 else
3237 goto writeback;
3238 break; 4227 break;
3239 case 0x35: /* sysexit */ 4228 case 0x35: /* sysexit */
3240 rc = emulate_sysexit(ctxt, ops); 4229 rc = emulate_sysexit(ctxt, ops);
3241 if (rc != X86EMUL_CONTINUE)
3242 goto done;
3243 else
3244 goto writeback;
3245 break; 4230 break;
3246 case 0x40 ... 0x4f: /* cmov */ 4231 case 0x40 ... 0x4f: /* cmov */
3247 c->dst.val = c->dst.orig_val = c->src.val; 4232 c->dst.val = c->dst.orig_val = c->src.val;
@@ -3251,15 +4236,15 @@ twobyte_insn:
3251 case 0x80 ... 0x8f: /* jnz rel, etc*/ 4236 case 0x80 ... 0x8f: /* jnz rel, etc*/
3252 if (test_cc(c->b, ctxt->eflags)) 4237 if (test_cc(c->b, ctxt->eflags))
3253 jmp_rel(c, c->src.val); 4238 jmp_rel(c, c->src.val);
3254 c->dst.type = OP_NONE; 4239 break;
4240 case 0x90 ... 0x9f: /* setcc r/m8 */
4241 c->dst.val = test_cc(c->b, ctxt->eflags);
3255 break; 4242 break;
3256 case 0xa0: /* push fs */ 4243 case 0xa0: /* push fs */
3257 emulate_push_sreg(ctxt, ops, VCPU_SREG_FS); 4244 rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_FS);
3258 break; 4245 break;
3259 case 0xa1: /* pop fs */ 4246 case 0xa1: /* pop fs */
3260 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS); 4247 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS);
3261 if (rc != X86EMUL_CONTINUE)
3262 goto done;
3263 break; 4248 break;
3264 case 0xa3: 4249 case 0xa3:
3265 bt: /* bt */ 4250 bt: /* bt */
@@ -3273,17 +4258,13 @@ twobyte_insn:
3273 emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags); 4258 emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags);
3274 break; 4259 break;
3275 case 0xa8: /* push gs */ 4260 case 0xa8: /* push gs */
3276 emulate_push_sreg(ctxt, ops, VCPU_SREG_GS); 4261 rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_GS);
3277 break; 4262 break;
3278 case 0xa9: /* pop gs */ 4263 case 0xa9: /* pop gs */
3279 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS); 4264 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS);
3280 if (rc != X86EMUL_CONTINUE)
3281 goto done;
3282 break; 4265 break;
3283 case 0xab: 4266 case 0xab:
3284 bts: /* bts */ 4267 bts: /* bts */
3285 /* only subword offset */
3286 c->src.val &= (c->dst.bytes << 3) - 1;
3287 emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags); 4268 emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags);
3288 break; 4269 break;
3289 case 0xac: /* shrd imm8, r, r/m */ 4270 case 0xac: /* shrd imm8, r, r/m */
@@ -3306,15 +4287,22 @@ twobyte_insn:
3306 } else { 4287 } else {
3307 /* Failure: write the value we saw to EAX. */ 4288 /* Failure: write the value we saw to EAX. */
3308 c->dst.type = OP_REG; 4289 c->dst.type = OP_REG;
3309 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; 4290 c->dst.addr.reg = (unsigned long *)&c->regs[VCPU_REGS_RAX];
3310 } 4291 }
3311 break; 4292 break;
4293 case 0xb2: /* lss */
4294 rc = emulate_load_segment(ctxt, ops, VCPU_SREG_SS);
4295 break;
3312 case 0xb3: 4296 case 0xb3:
3313 btr: /* btr */ 4297 btr: /* btr */
3314 /* only subword offset */
3315 c->src.val &= (c->dst.bytes << 3) - 1;
3316 emulate_2op_SrcV_nobyte("btr", c->src, c->dst, ctxt->eflags); 4298 emulate_2op_SrcV_nobyte("btr", c->src, c->dst, ctxt->eflags);
3317 break; 4299 break;
4300 case 0xb4: /* lfs */
4301 rc = emulate_load_segment(ctxt, ops, VCPU_SREG_FS);
4302 break;
4303 case 0xb5: /* lgs */
4304 rc = emulate_load_segment(ctxt, ops, VCPU_SREG_GS);
4305 break;
3318 case 0xb6 ... 0xb7: /* movzx */ 4306 case 0xb6 ... 0xb7: /* movzx */
3319 c->dst.bytes = c->op_bytes; 4307 c->dst.bytes = c->op_bytes;
3320 c->dst.val = (c->d & ByteOp) ? (u8) c->src.val 4308 c->dst.val = (c->d & ByteOp) ? (u8) c->src.val
@@ -3334,29 +4322,60 @@ twobyte_insn:
3334 break; 4322 break;
3335 case 0xbb: 4323 case 0xbb:
3336 btc: /* btc */ 4324 btc: /* btc */
3337 /* only subword offset */
3338 c->src.val &= (c->dst.bytes << 3) - 1;
3339 emulate_2op_SrcV_nobyte("btc", c->src, c->dst, ctxt->eflags); 4325 emulate_2op_SrcV_nobyte("btc", c->src, c->dst, ctxt->eflags);
3340 break; 4326 break;
4327 case 0xbc: { /* bsf */
4328 u8 zf;
4329 __asm__ ("bsf %2, %0; setz %1"
4330 : "=r"(c->dst.val), "=q"(zf)
4331 : "r"(c->src.val));
4332 ctxt->eflags &= ~X86_EFLAGS_ZF;
4333 if (zf) {
4334 ctxt->eflags |= X86_EFLAGS_ZF;
4335 c->dst.type = OP_NONE; /* Disable writeback. */
4336 }
4337 break;
4338 }
4339 case 0xbd: { /* bsr */
4340 u8 zf;
4341 __asm__ ("bsr %2, %0; setz %1"
4342 : "=r"(c->dst.val), "=q"(zf)
4343 : "r"(c->src.val));
4344 ctxt->eflags &= ~X86_EFLAGS_ZF;
4345 if (zf) {
4346 ctxt->eflags |= X86_EFLAGS_ZF;
4347 c->dst.type = OP_NONE; /* Disable writeback. */
4348 }
4349 break;
4350 }
3341 case 0xbe ... 0xbf: /* movsx */ 4351 case 0xbe ... 0xbf: /* movsx */
3342 c->dst.bytes = c->op_bytes; 4352 c->dst.bytes = c->op_bytes;
3343 c->dst.val = (c->d & ByteOp) ? (s8) c->src.val : 4353 c->dst.val = (c->d & ByteOp) ? (s8) c->src.val :
3344 (s16) c->src.val; 4354 (s16) c->src.val;
3345 break; 4355 break;
4356 case 0xc0 ... 0xc1: /* xadd */
4357 emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
4358 /* Write back the register source. */
4359 c->src.val = c->dst.orig_val;
4360 write_register_operand(&c->src);
4361 break;
3346 case 0xc3: /* movnti */ 4362 case 0xc3: /* movnti */
3347 c->dst.bytes = c->op_bytes; 4363 c->dst.bytes = c->op_bytes;
3348 c->dst.val = (c->op_bytes == 4) ? (u32) c->src.val : 4364 c->dst.val = (c->op_bytes == 4) ? (u32) c->src.val :
3349 (u64) c->src.val; 4365 (u64) c->src.val;
3350 break; 4366 break;
3351 case 0xc7: /* Grp9 (cmpxchg8b) */ 4367 case 0xc7: /* Grp9 (cmpxchg8b) */
3352 rc = emulate_grp9(ctxt, ops); 4368 rc = em_grp9(ctxt);
3353 if (rc != X86EMUL_CONTINUE)
3354 goto done;
3355 break; 4369 break;
4370 default:
4371 goto cannot_emulate;
3356 } 4372 }
4373
4374 if (rc != X86EMUL_CONTINUE)
4375 goto done;
4376
3357 goto writeback; 4377 goto writeback;
3358 4378
3359cannot_emulate: 4379cannot_emulate:
3360 DPRINTF("Cannot emulate %02x\n", c->b); 4380 return EMULATION_FAILED;
3361 return -1;
3362} 4381}
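The rewritten rep-prefix tail in the writeback path above distinguishes three outcomes: the string operation is complete (fall through to normal writeback), it should loop inside the emulator (return EMULATION_RESTART after resetting the read cache), or the guest should be re-entered so the instruction is re-executed on the next entry (jump to done without advancing rip). The re-enter condition, factored out as a standalone predicate with simplified names:

struct read_cache_sketch { unsigned int pos, end; };

/* Re-enter the guest (leaving rip on the string instruction) when the PIO
 * read-ahead buffer has been drained or, if no buffer is in use, after
 * every 1024 iterations. */
static int should_reenter_guest(const struct read_cache_sketch *r,
                                unsigned long rcx)
{
        if (r->end != 0)
                return r->pos == r->end;      /* buffer in use: drained? */
        return (rcx & 0x3ff) == 0;            /* no buffer: every 1024 loops */
}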
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index ddeb2314b522..efad72385058 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -5,7 +5,7 @@
5 * Copyright (c) 2006 Intel Corporation 5 * Copyright (c) 2006 Intel Corporation
6 * Copyright (c) 2007 Keir Fraser, XenSource Inc 6 * Copyright (c) 2007 Keir Fraser, XenSource Inc
7 * Copyright (c) 2008 Intel Corporation 7 * Copyright (c) 2008 Intel Corporation
8 * Copyright 2009 Red Hat, Inc. and/or its affilates. 8 * Copyright 2009 Red Hat, Inc. and/or its affiliates.
9 * 9 *
10 * Permission is hereby granted, free of charge, to any person obtaining a copy 10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal 11 * of this software and associated documentation files (the "Software"), to deal
@@ -232,15 +232,6 @@ static void pit_latch_status(struct kvm *kvm, int channel)
232 } 232 }
233} 233}
234 234
235int pit_has_pending_timer(struct kvm_vcpu *vcpu)
236{
237 struct kvm_pit *pit = vcpu->kvm->arch.vpit;
238
239 if (pit && kvm_vcpu_is_bsp(vcpu) && pit->pit_state.irq_ack)
240 return atomic_read(&pit->pit_state.pit_timer.pending);
241 return 0;
242}
243
244static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian) 235static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
245{ 236{
246 struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state, 237 struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state,
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index 46d08ca0b48f..51a97426e791 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -33,7 +33,6 @@ struct kvm_kpit_state {
33}; 33};
34 34
35struct kvm_pit { 35struct kvm_pit {
36 unsigned long base_addresss;
37 struct kvm_io_device dev; 36 struct kvm_io_device dev;
38 struct kvm_io_device speaker_dev; 37 struct kvm_io_device speaker_dev;
39 struct kvm *kvm; 38 struct kvm *kvm;
@@ -51,7 +50,6 @@ struct kvm_pit {
51#define KVM_MAX_PIT_INTR_INTERVAL HZ / 100 50#define KVM_MAX_PIT_INTR_INTERVAL HZ / 100
52#define KVM_PIT_CHANNEL_MASK 0x3 51#define KVM_PIT_CHANNEL_MASK 0x3
53 52
54void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu);
55void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val, int hpet_legacy_start); 53void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val, int hpet_legacy_start);
56struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags); 54struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags);
57void kvm_free_pit(struct kvm *kvm); 55void kvm_free_pit(struct kvm *kvm);
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 4b7b73ce2098..19fe855e7953 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * Copyright (c) 2003-2004 Fabrice Bellard 4 * Copyright (c) 2003-2004 Fabrice Bellard
5 * Copyright (c) 2007 Intel Corporation 5 * Copyright (c) 2007 Intel Corporation
6 * Copyright 2009 Red Hat, Inc. and/or its affilates. 6 * Copyright 2009 Red Hat, Inc. and/or its affiliates.
7 * 7 *
8 * Permission is hereby granted, free of charge, to any person obtaining a copy 8 * Permission is hereby granted, free of charge, to any person obtaining a copy
9 * of this software and associated documentation files (the "Software"), to deal 9 * of this software and associated documentation files (the "Software"), to deal
@@ -39,7 +39,7 @@ static void pic_irq_request(struct kvm *kvm, int level);
39static void pic_lock(struct kvm_pic *s) 39static void pic_lock(struct kvm_pic *s)
40 __acquires(&s->lock) 40 __acquires(&s->lock)
41{ 41{
42 raw_spin_lock(&s->lock); 42 spin_lock(&s->lock);
43} 43}
44 44
45static void pic_unlock(struct kvm_pic *s) 45static void pic_unlock(struct kvm_pic *s)
@@ -51,7 +51,7 @@ static void pic_unlock(struct kvm_pic *s)
51 51
52 s->wakeup_needed = false; 52 s->wakeup_needed = false;
53 53
54 raw_spin_unlock(&s->lock); 54 spin_unlock(&s->lock);
55 55
56 if (wakeup) { 56 if (wakeup) {
57 kvm_for_each_vcpu(i, vcpu, s->kvm) { 57 kvm_for_each_vcpu(i, vcpu, s->kvm) {
@@ -62,11 +62,9 @@ static void pic_unlock(struct kvm_pic *s)
62 } 62 }
63 63
64 if (!found) 64 if (!found)
65 found = s->kvm->bsp_vcpu;
66
67 if (!found)
68 return; 65 return;
69 66
67 kvm_make_request(KVM_REQ_EVENT, found);
70 kvm_vcpu_kick(found); 68 kvm_vcpu_kick(found);
71 } 69 }
72} 70}
@@ -74,7 +72,6 @@ static void pic_unlock(struct kvm_pic *s)
74static void pic_clear_isr(struct kvm_kpic_state *s, int irq) 72static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
75{ 73{
76 s->isr &= ~(1 << irq); 74 s->isr &= ~(1 << irq);
77 s->isr_ack |= (1 << irq);
78 if (s != &s->pics_state->pics[0]) 75 if (s != &s->pics_state->pics[0])
79 irq += 8; 76 irq += 8;
80 /* 77 /*
@@ -88,16 +85,6 @@ static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
88 pic_lock(s->pics_state); 85 pic_lock(s->pics_state);
89} 86}
90 87
91void kvm_pic_clear_isr_ack(struct kvm *kvm)
92{
93 struct kvm_pic *s = pic_irqchip(kvm);
94
95 pic_lock(s);
96 s->pics[0].isr_ack = 0xff;
97 s->pics[1].isr_ack = 0xff;
98 pic_unlock(s);
99}
100
101/* 88/*
102 * set irq level. If an edge is detected, then the IRR is set to 1 89 * set irq level. If an edge is detected, then the IRR is set to 1
103 */ 90 */
@@ -280,7 +267,6 @@ void kvm_pic_reset(struct kvm_kpic_state *s)
280 s->irr = 0; 267 s->irr = 0;
281 s->imr = 0; 268 s->imr = 0;
282 s->isr = 0; 269 s->isr = 0;
283 s->isr_ack = 0xff;
284 s->priority_add = 0; 270 s->priority_add = 0;
285 s->irq_base = 0; 271 s->irq_base = 0;
286 s->read_reg_select = 0; 272 s->read_reg_select = 0;
@@ -308,13 +294,17 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
308 addr &= 1; 294 addr &= 1;
309 if (addr == 0) { 295 if (addr == 0) {
310 if (val & 0x10) { 296 if (val & 0x10) {
311 kvm_pic_reset(s); /* init */
312 /*
313 * deassert a pending interrupt
314 */
315 pic_irq_request(s->pics_state->kvm, 0);
316 s->init_state = 1;
317 s->init4 = val & 1; 297 s->init4 = val & 1;
298 s->last_irr = 0;
299 s->imr = 0;
300 s->priority_add = 0;
301 s->special_mask = 0;
302 s->read_reg_select = 0;
303 if (!s->init4) {
304 s->special_fully_nested_mode = 0;
305 s->auto_eoi = 0;
306 }
307 s->init_state = 1;
318 if (val & 0x02) 308 if (val & 0x02)
319 printk(KERN_ERR "single mode not supported"); 309 printk(KERN_ERR "single mode not supported");
320 if (val & 0x08) 310 if (val & 0x08)
@@ -540,15 +530,11 @@ static int picdev_read(struct kvm_io_device *this,
540 */ 530 */
541static void pic_irq_request(struct kvm *kvm, int level) 531static void pic_irq_request(struct kvm *kvm, int level)
542{ 532{
543 struct kvm_vcpu *vcpu = kvm->bsp_vcpu;
544 struct kvm_pic *s = pic_irqchip(kvm); 533 struct kvm_pic *s = pic_irqchip(kvm);
545 int irq = pic_get_irq(&s->pics[0]);
546 534
547 s->output = level; 535 if (!s->output)
548 if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) {
549 s->pics[0].isr_ack &= ~(1 << irq);
550 s->wakeup_needed = true; 536 s->wakeup_needed = true;
551 } 537 s->output = level;
552} 538}
553 539
554static const struct kvm_io_device_ops picdev_ops = { 540static const struct kvm_io_device_ops picdev_ops = {
@@ -564,7 +550,7 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
564 s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL); 550 s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL);
565 if (!s) 551 if (!s)
566 return NULL; 552 return NULL;
567 raw_spin_lock_init(&s->lock); 553 spin_lock_init(&s->lock);
568 s->kvm = kvm; 554 s->kvm = kvm;
569 s->pics[0].elcr_mask = 0xf8; 555 s->pics[0].elcr_mask = 0xf8;
570 s->pics[1].elcr_mask = 0xde; 556 s->pics[1].elcr_mask = 0xde;
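The new pic_irq_request() above drops the per-IRQ isr_ack bookkeeping; the state of the PIC output line alone decides whether a vcpu wakeup is needed. In schematic form (the struct name is invented for the sketch):

struct pic_line_sketch {
        int output;              /* current level of the INT output pin */
        int wakeup_needed;
};

static void pic_irq_request_sketch(struct pic_line_sketch *s, int level)
{
        /* A wakeup is queued only when the line was previously deasserted;
         * re-asserting an already-raised line changes nothing. */
        if (!s->output)
                s->wakeup_needed = 1;
        s->output = level;
}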
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 2095a049835e..7e06ba1618bd 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * irq.c: API for in kernel interrupt controller 2 * irq.c: API for in kernel interrupt controller
3 * Copyright (c) 2007, Intel Corporation. 3 * Copyright (c) 2007, Intel Corporation.
4 * Copyright 2009 Red Hat, Inc. and/or its affilates. 4 * Copyright 2009 Red Hat, Inc. and/or its affiliates.
5 * 5 *
6 * This program is free software; you can redistribute it and/or modify it 6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms and conditions of the GNU General Public License, 7 * under the terms and conditions of the GNU General Public License,
@@ -33,12 +33,7 @@
33 */ 33 */
34int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) 34int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
35{ 35{
36 int ret; 36 return apic_has_pending_timer(vcpu);
37
38 ret = pit_has_pending_timer(vcpu);
39 ret |= apic_has_pending_timer(vcpu);
40
41 return ret;
42} 37}
43EXPORT_SYMBOL(kvm_cpu_has_pending_timer); 38EXPORT_SYMBOL(kvm_cpu_has_pending_timer);
44 39
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 63c314502993..53e2d084bffb 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -60,7 +60,7 @@ struct kvm_kpic_state {
60}; 60};
61 61
62struct kvm_pic { 62struct kvm_pic {
63 raw_spinlock_t lock; 63 spinlock_t lock;
64 bool wakeup_needed; 64 bool wakeup_needed;
65 unsigned pending_acks; 65 unsigned pending_acks;
66 struct kvm *kvm; 66 struct kvm *kvm;
@@ -75,7 +75,6 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm);
75void kvm_destroy_pic(struct kvm *kvm); 75void kvm_destroy_pic(struct kvm *kvm);
76int kvm_pic_read_irq(struct kvm *kvm); 76int kvm_pic_read_irq(struct kvm *kvm);
77void kvm_pic_update_irq(struct kvm_pic *s); 77void kvm_pic_update_irq(struct kvm_pic *s);
78void kvm_pic_clear_isr_ack(struct kvm *kvm);
79 78
80static inline struct kvm_pic *pic_irqchip(struct kvm *kvm) 79static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
81{ 80{
@@ -100,7 +99,6 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu);
100void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu); 99void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu);
101void __kvm_migrate_timers(struct kvm_vcpu *vcpu); 100void __kvm_migrate_timers(struct kvm_vcpu *vcpu);
102 101
103int pit_has_pending_timer(struct kvm_vcpu *vcpu);
104int apic_has_pending_timer(struct kvm_vcpu *vcpu); 102int apic_has_pending_timer(struct kvm_vcpu *vcpu);
105 103
106#endif 104#endif
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index 6491ac8e755b..3377d53fcd36 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -42,7 +42,14 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)
42 (unsigned long *)&vcpu->arch.regs_avail)) 42 (unsigned long *)&vcpu->arch.regs_avail))
43 kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR); 43 kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR);
44 44
45 return vcpu->arch.pdptrs[index]; 45 return vcpu->arch.walk_mmu->pdptrs[index];
46}
47
48static inline u64 kvm_pdptr_read_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, int index)
49{
50 load_pdptrs(vcpu, mmu, mmu->get_cr3(vcpu));
51
52 return mmu->pdptrs[index];
46} 53}
47 54
48static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask) 55static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask)
@@ -66,6 +73,13 @@ static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask)
66 return vcpu->arch.cr4 & mask; 73 return vcpu->arch.cr4 & mask;
67} 74}
68 75
76static inline ulong kvm_read_cr3(struct kvm_vcpu *vcpu)
77{
78 if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
79 kvm_x86_ops->decache_cr3(vcpu);
80 return vcpu->arch.cr3;
81}
82
69static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu) 83static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu)
70{ 84{
71 return kvm_read_cr4_bits(vcpu, ~0UL); 85 return kvm_read_cr4_bits(vcpu, ~0UL);
@@ -77,4 +91,19 @@ static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu)
77 | ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32); 91 | ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32);
78} 92}
79 93
94static inline void enter_guest_mode(struct kvm_vcpu *vcpu)
95{
96 vcpu->arch.hflags |= HF_GUEST_MASK;
97}
98
99static inline void leave_guest_mode(struct kvm_vcpu *vcpu)
100{
101 vcpu->arch.hflags &= ~HF_GUEST_MASK;
102}
103
104static inline bool is_guest_mode(struct kvm_vcpu *vcpu)
105{
106 return vcpu->arch.hflags & HF_GUEST_MASK;
107}
108
80#endif 109#endif
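The new kvm_read_cr3() above follows the same lazy-caching pattern as the other register accessors in this header: consult the availability bitmap first and only fall back to the vendor decache callback on a miss. A generic, self-contained sketch of that pattern (types and names are hypothetical, not the kvm_cache_regs.h API):

struct cached_reg_sketch {
        unsigned long value;
        unsigned long avail;                  /* bitmap of valid cached regs */
};

static unsigned long read_cached(struct cached_reg_sketch *c, int bit,
                                 unsigned long (*reload)(void))
{
        if (!(c->avail & (1UL << bit))) {
                c->value = reload();          /* expensive: ask the vendor module */
                c->avail |= 1UL << bit;
        }
        return c->value;                      /* cheap on every later read */
}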
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 77d8c0f4817d..2b2255b1f04b 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -5,7 +5,7 @@
5 * Copyright (C) 2006 Qumranet, Inc. 5 * Copyright (C) 2006 Qumranet, Inc.
6 * Copyright (C) 2007 Novell 6 * Copyright (C) 2007 Novell
7 * Copyright (C) 2007 Intel 7 * Copyright (C) 2007 Intel
8 * Copyright 2009 Red Hat, Inc. and/or its affilates. 8 * Copyright 2009 Red Hat, Inc. and/or its affiliates.
9 * 9 *
10 * Authors: 10 * Authors:
11 * Dor Laor <dor.laor@qumranet.com> 11 * Dor Laor <dor.laor@qumranet.com>
@@ -259,9 +259,10 @@ static inline int apic_find_highest_isr(struct kvm_lapic *apic)
259 259
260static void apic_update_ppr(struct kvm_lapic *apic) 260static void apic_update_ppr(struct kvm_lapic *apic)
261{ 261{
262 u32 tpr, isrv, ppr; 262 u32 tpr, isrv, ppr, old_ppr;
263 int isr; 263 int isr;
264 264
265 old_ppr = apic_get_reg(apic, APIC_PROCPRI);
265 tpr = apic_get_reg(apic, APIC_TASKPRI); 266 tpr = apic_get_reg(apic, APIC_TASKPRI);
266 isr = apic_find_highest_isr(apic); 267 isr = apic_find_highest_isr(apic);
267 isrv = (isr != -1) ? isr : 0; 268 isrv = (isr != -1) ? isr : 0;
@@ -274,7 +275,11 @@ static void apic_update_ppr(struct kvm_lapic *apic)
274 apic_debug("vlapic %p, ppr 0x%x, isr 0x%x, isrv 0x%x", 275 apic_debug("vlapic %p, ppr 0x%x, isr 0x%x, isrv 0x%x",
275 apic, ppr, isr, isrv); 276 apic, ppr, isr, isrv);
276 277
277 apic_set_reg(apic, APIC_PROCPRI, ppr); 278 if (old_ppr != ppr) {
279 apic_set_reg(apic, APIC_PROCPRI, ppr);
280 if (ppr < old_ppr)
281 kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
282 }
278} 283}
279 284
280static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr) 285static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr)
@@ -391,6 +396,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
391 break; 396 break;
392 } 397 }
393 398
399 kvm_make_request(KVM_REQ_EVENT, vcpu);
394 kvm_vcpu_kick(vcpu); 400 kvm_vcpu_kick(vcpu);
395 break; 401 break;
396 402
@@ -411,11 +417,8 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
411 case APIC_DM_INIT: 417 case APIC_DM_INIT:
412 if (level) { 418 if (level) {
413 result = 1; 419 result = 1;
414 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
415 printk(KERN_DEBUG
416 "INIT on a runnable vcpu %d\n",
417 vcpu->vcpu_id);
418 vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; 420 vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
421 kvm_make_request(KVM_REQ_EVENT, vcpu);
419 kvm_vcpu_kick(vcpu); 422 kvm_vcpu_kick(vcpu);
420 } else { 423 } else {
421 apic_debug("Ignoring de-assert INIT to vcpu %d\n", 424 apic_debug("Ignoring de-assert INIT to vcpu %d\n",
@@ -430,6 +433,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
430 result = 1; 433 result = 1;
431 vcpu->arch.sipi_vector = vector; 434 vcpu->arch.sipi_vector = vector;
432 vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED; 435 vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED;
436 kvm_make_request(KVM_REQ_EVENT, vcpu);
433 kvm_vcpu_kick(vcpu); 437 kvm_vcpu_kick(vcpu);
434 } 438 }
435 break; 439 break;
@@ -475,6 +479,7 @@ static void apic_set_eoi(struct kvm_lapic *apic)
475 trigger_mode = IOAPIC_EDGE_TRIG; 479 trigger_mode = IOAPIC_EDGE_TRIG;
476 if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI)) 480 if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI))
477 kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); 481 kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
482 kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
478} 483}
479 484
480static void apic_send_ipi(struct kvm_lapic *apic) 485static void apic_send_ipi(struct kvm_lapic *apic)
@@ -866,8 +871,8 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu)
866 871
867 hrtimer_cancel(&vcpu->arch.apic->lapic_timer.timer); 872 hrtimer_cancel(&vcpu->arch.apic->lapic_timer.timer);
868 873
869 if (vcpu->arch.apic->regs_page) 874 if (vcpu->arch.apic->regs)
870 __free_page(vcpu->arch.apic->regs_page); 875 free_page((unsigned long)vcpu->arch.apic->regs);
871 876
872 kfree(vcpu->arch.apic); 877 kfree(vcpu->arch.apic);
873} 878}
@@ -1056,14 +1061,12 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
1056 1061
1057 vcpu->arch.apic = apic; 1062 vcpu->arch.apic = apic;
1058 1063
1059 apic->regs_page = alloc_page(GFP_KERNEL); 1064 apic->regs = (void *)get_zeroed_page(GFP_KERNEL);
1060 if (apic->regs_page == NULL) { 1065 if (!apic->regs) {
1061 printk(KERN_ERR "malloc apic regs error for vcpu %x\n", 1066 printk(KERN_ERR "malloc apic regs error for vcpu %x\n",
1062 vcpu->vcpu_id); 1067 vcpu->vcpu_id);
1063 goto nomem_free_apic; 1068 goto nomem_free_apic;
1064 } 1069 }
1065 apic->regs = page_address(apic->regs_page);
1066 memset(apic->regs, 0, PAGE_SIZE);
1067 apic->vcpu = vcpu; 1070 apic->vcpu = vcpu;
1068 1071
1069 hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC, 1072 hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
@@ -1152,6 +1155,7 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
1152 update_divide_count(apic); 1155 update_divide_count(apic);
1153 start_apic_timer(apic); 1156 start_apic_timer(apic);
1154 apic->irr_pending = true; 1157 apic->irr_pending = true;
1158 kvm_make_request(KVM_REQ_EVENT, vcpu);
1155} 1159}
1156 1160
1157void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) 1161void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
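Nearly every hunk above pairs a local APIC state change that can unmask or deliver an interrupt (a PPR drop, EOI, IPI delivery, INIT/SIPI, state restore) with a KVM_REQ_EVENT request, so the vcpu loop re-evaluates pending events before its next guest entry. The recurring pattern, with the real KVM primitives and an invented wrapper name:

#include <linux/kvm_host.h>

/* notify_vcpu_event() is a made-up name; the two calls inside it are the
 * actual KVM APIs used throughout the hunks above. */
static void notify_vcpu_event(struct kvm_vcpu *vcpu)
{
        kvm_make_request(KVM_REQ_EVENT, vcpu); /* re-check injectable events */
        kvm_vcpu_kick(vcpu);                   /* pull the vcpu out of guest mode */
}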
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index f5fe32c5edad..52c9e6b9e725 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -13,7 +13,6 @@ struct kvm_lapic {
13 u32 divide_count; 13 u32 divide_count;
14 struct kvm_vcpu *vcpu; 14 struct kvm_vcpu *vcpu;
15 bool irr_pending; 15 bool irr_pending;
16 struct page *regs_page;
17 void *regs; 16 void *regs;
18 gpa_t vapic_addr; 17 gpa_t vapic_addr;
19 struct page *vapic_page; 18 struct page *vapic_page;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 311f6dad8951..aee38623b768 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -7,7 +7,7 @@
7 * MMU support 7 * MMU support
8 * 8 *
9 * Copyright (C) 2006 Qumranet, Inc. 9 * Copyright (C) 2006 Qumranet, Inc.
10 * Copyright 2010 Red Hat, Inc. and/or its affilates. 10 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
11 * 11 *
12 * Authors: 12 * Authors:
13 * Yaniv Kamay <yaniv@qumranet.com> 13 * Yaniv Kamay <yaniv@qumranet.com>
@@ -18,9 +18,11 @@
18 * 18 *
19 */ 19 */
20 20
21#include "irq.h"
21#include "mmu.h" 22#include "mmu.h"
22#include "x86.h" 23#include "x86.h"
23#include "kvm_cache_regs.h" 24#include "kvm_cache_regs.h"
25#include "x86.h"
24 26
25#include <linux/kvm_host.h> 27#include <linux/kvm_host.h>
26#include <linux/types.h> 28#include <linux/types.h>
@@ -49,15 +51,25 @@
49 */ 51 */
50bool tdp_enabled = false; 52bool tdp_enabled = false;
51 53
52#undef MMU_DEBUG 54enum {
55 AUDIT_PRE_PAGE_FAULT,
56 AUDIT_POST_PAGE_FAULT,
57 AUDIT_PRE_PTE_WRITE,
58 AUDIT_POST_PTE_WRITE,
59 AUDIT_PRE_SYNC,
60 AUDIT_POST_SYNC
61};
53 62
54#undef AUDIT 63char *audit_point_name[] = {
64 "pre page fault",
65 "post page fault",
66 "pre pte write",
67 "post pte write",
68 "pre sync",
69 "post sync"
70};
55 71
56#ifdef AUDIT 72#undef MMU_DEBUG
57static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg);
58#else
59static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
60#endif
61 73
62#ifdef MMU_DEBUG 74#ifdef MMU_DEBUG
63 75
@@ -71,7 +83,7 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
71 83
72#endif 84#endif
73 85
74#if defined(MMU_DEBUG) || defined(AUDIT) 86#ifdef MMU_DEBUG
75static int dbg = 0; 87static int dbg = 0;
76module_param(dbg, bool, 0644); 88module_param(dbg, bool, 0644);
77#endif 89#endif
@@ -89,6 +101,8 @@ module_param(oos_shadow, bool, 0644);
89 } 101 }
90#endif 102#endif
91 103
104#define PTE_PREFETCH_NUM 8
105
92#define PT_FIRST_AVAIL_BITS_SHIFT 9 106#define PT_FIRST_AVAIL_BITS_SHIFT 9
93#define PT64_SECOND_AVAIL_BITS_SHIFT 52 107#define PT64_SECOND_AVAIL_BITS_SHIFT 52
94 108
@@ -97,9 +111,6 @@ module_param(oos_shadow, bool, 0644);
97#define PT64_LEVEL_SHIFT(level) \ 111#define PT64_LEVEL_SHIFT(level) \
98 (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS) 112 (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
99 113
100#define PT64_LEVEL_MASK(level) \
101 (((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level))
102
103#define PT64_INDEX(address, level)\ 114#define PT64_INDEX(address, level)\
104 (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1)) 115 (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
105 116
@@ -109,8 +120,6 @@ module_param(oos_shadow, bool, 0644);
109#define PT32_LEVEL_SHIFT(level) \ 120#define PT32_LEVEL_SHIFT(level) \
110 (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS) 121 (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)
111 122
112#define PT32_LEVEL_MASK(level) \
113 (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))
114#define PT32_LVL_OFFSET_MASK(level) \ 123#define PT32_LVL_OFFSET_MASK(level) \
115 (PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \ 124 (PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
116 * PT32_LEVEL_BITS))) - 1)) 125 * PT32_LEVEL_BITS))) - 1))
@@ -178,10 +187,10 @@ typedef void (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte);
178static struct kmem_cache *pte_chain_cache; 187static struct kmem_cache *pte_chain_cache;
179static struct kmem_cache *rmap_desc_cache; 188static struct kmem_cache *rmap_desc_cache;
180static struct kmem_cache *mmu_page_header_cache; 189static struct kmem_cache *mmu_page_header_cache;
190static struct percpu_counter kvm_total_used_mmu_pages;
181 191
182static u64 __read_mostly shadow_trap_nonpresent_pte; 192static u64 __read_mostly shadow_trap_nonpresent_pte;
183static u64 __read_mostly shadow_notrap_nonpresent_pte; 193static u64 __read_mostly shadow_notrap_nonpresent_pte;
184static u64 __read_mostly shadow_base_present_pte;
185static u64 __read_mostly shadow_nx_mask; 194static u64 __read_mostly shadow_nx_mask;
186static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ 195static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
187static u64 __read_mostly shadow_user_mask; 196static u64 __read_mostly shadow_user_mask;
@@ -200,12 +209,6 @@ void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
200} 209}
201EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes); 210EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);
202 211
203void kvm_mmu_set_base_ptes(u64 base_pte)
204{
205 shadow_base_present_pte = base_pte;
206}
207EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes);
208
209void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, 212void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
210 u64 dirty_mask, u64 nx_mask, u64 x_mask) 213 u64 dirty_mask, u64 nx_mask, u64 x_mask)
211{ 214{
@@ -299,18 +302,50 @@ static u64 __xchg_spte(u64 *sptep, u64 new_spte)
299#endif 302#endif
300} 303}
301 304
305static bool spte_has_volatile_bits(u64 spte)
306{
307 if (!shadow_accessed_mask)
308 return false;
309
310 if (!is_shadow_present_pte(spte))
311 return false;
312
313 if ((spte & shadow_accessed_mask) &&
314 (!is_writable_pte(spte) || (spte & shadow_dirty_mask)))
315 return false;
316
317 return true;
318}
319
320static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask)
321{
322 return (old_spte & bit_mask) && !(new_spte & bit_mask);
323}
324
302static void update_spte(u64 *sptep, u64 new_spte) 325static void update_spte(u64 *sptep, u64 new_spte)
303{ 326{
304 u64 old_spte; 327 u64 mask, old_spte = *sptep;
328
329 WARN_ON(!is_rmap_spte(new_spte));
330
331 new_spte |= old_spte & shadow_dirty_mask;
332
333 mask = shadow_accessed_mask;
334 if (is_writable_pte(old_spte))
335 mask |= shadow_dirty_mask;
305 336
306 if (!shadow_accessed_mask || (new_spte & shadow_accessed_mask) || 337 if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask)
307 !is_rmap_spte(*sptep))
308 __set_spte(sptep, new_spte); 338 __set_spte(sptep, new_spte);
309 else { 339 else
310 old_spte = __xchg_spte(sptep, new_spte); 340 old_spte = __xchg_spte(sptep, new_spte);
311 if (old_spte & shadow_accessed_mask) 341
312 mark_page_accessed(pfn_to_page(spte_to_pfn(old_spte))); 342 if (!shadow_accessed_mask)
313 } 343 return;
344
345 if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask))
346 kvm_set_pfn_accessed(spte_to_pfn(old_spte));
347 if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask))
348 kvm_set_pfn_dirty(spte_to_pfn(old_spte));
314} 349}
315 350
316static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, 351static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
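spte_has_volatile_bits() above treats an spte as volatile when the CPU could still set its accessed or dirty bit behind KVM's back, which is exactly when a plain __set_spte() store could lose an update and the atomic __xchg_spte() path is required. A compressed restatement of the rule with the masks passed explicitly (hypothetical helper; the real function also bails out when no hardware accessed bit exists or the pte is not present; u64 as in <linux/types.h>):

static int needs_atomic_update(u64 spte, u64 accessed_mask, u64 dirty_mask,
                               int writable)
{
        if (!(spte & accessed_mask))
                return 1;          /* CPU may still set the accessed bit */
        if (writable && !(spte & dirty_mask))
                return 1;          /* CPU may still set the dirty bit */
        return 0;                  /* both stable: a plain store is safe */
}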
@@ -339,15 +374,15 @@ static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc,
339static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache, 374static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
340 int min) 375 int min)
341{ 376{
342 struct page *page; 377 void *page;
343 378
344 if (cache->nobjs >= min) 379 if (cache->nobjs >= min)
345 return 0; 380 return 0;
346 while (cache->nobjs < ARRAY_SIZE(cache->objects)) { 381 while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
347 page = alloc_page(GFP_KERNEL); 382 page = (void *)__get_free_page(GFP_KERNEL);
348 if (!page) 383 if (!page)
349 return -ENOMEM; 384 return -ENOMEM;
350 cache->objects[cache->nobjs++] = page_address(page); 385 cache->objects[cache->nobjs++] = page;
351 } 386 }
352 return 0; 387 return 0;
353} 388}
@@ -367,7 +402,7 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
367 if (r) 402 if (r)
368 goto out; 403 goto out;
369 r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, 404 r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
370 rmap_desc_cache, 4); 405 rmap_desc_cache, 4 + PTE_PREFETCH_NUM);
371 if (r) 406 if (r)
372 goto out; 407 goto out;
373 r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8); 408 r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
@@ -437,46 +472,46 @@ static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
437} 472}
438 473
439/* 474/*
440 * Return the pointer to the largepage write count for a given 475 * Return the pointer to the large page information for a given gfn,
441 * gfn, handling slots that are not large page aligned. 476 * handling slots that are not large page aligned.
442 */ 477 */
443static int *slot_largepage_idx(gfn_t gfn, 478static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
444 struct kvm_memory_slot *slot, 479 struct kvm_memory_slot *slot,
445 int level) 480 int level)
446{ 481{
447 unsigned long idx; 482 unsigned long idx;
448 483
449 idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - 484 idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
450 (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); 485 (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
451 return &slot->lpage_info[level - 2][idx].write_count; 486 return &slot->lpage_info[level - 2][idx];
452} 487}
453 488
454static void account_shadowed(struct kvm *kvm, gfn_t gfn) 489static void account_shadowed(struct kvm *kvm, gfn_t gfn)
455{ 490{
456 struct kvm_memory_slot *slot; 491 struct kvm_memory_slot *slot;
457 int *write_count; 492 struct kvm_lpage_info *linfo;
458 int i; 493 int i;
459 494
460 slot = gfn_to_memslot(kvm, gfn); 495 slot = gfn_to_memslot(kvm, gfn);
461 for (i = PT_DIRECTORY_LEVEL; 496 for (i = PT_DIRECTORY_LEVEL;
462 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { 497 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
463 write_count = slot_largepage_idx(gfn, slot, i); 498 linfo = lpage_info_slot(gfn, slot, i);
464 *write_count += 1; 499 linfo->write_count += 1;
465 } 500 }
466} 501}
467 502
468static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) 503static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
469{ 504{
470 struct kvm_memory_slot *slot; 505 struct kvm_memory_slot *slot;
471 int *write_count; 506 struct kvm_lpage_info *linfo;
472 int i; 507 int i;
473 508
474 slot = gfn_to_memslot(kvm, gfn); 509 slot = gfn_to_memslot(kvm, gfn);
475 for (i = PT_DIRECTORY_LEVEL; 510 for (i = PT_DIRECTORY_LEVEL;
476 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { 511 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
477 write_count = slot_largepage_idx(gfn, slot, i); 512 linfo = lpage_info_slot(gfn, slot, i);
478 *write_count -= 1; 513 linfo->write_count -= 1;
479 WARN_ON(*write_count < 0); 514 WARN_ON(linfo->write_count < 0);
480 } 515 }
481} 516}
482 517
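lpage_info_slot() above replaces the old write_count-only lookup with one returning the whole kvm_lpage_info entry, so callers reach write_count and rmap_pde through a single index computation. That index math for one level, written out in isolation:

/* Illustrative only: both the gfn and the slot base are shifted down by
 * the per-level huge-page shift before subtracting, yielding an index
 * into the slot's per-large-page array. */
static unsigned long lpage_index(unsigned long gfn, unsigned long base_gfn,
                                 unsigned int hpage_gfn_shift)
{
        return (gfn >> hpage_gfn_shift) - (base_gfn >> hpage_gfn_shift);
}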
@@ -485,12 +520,12 @@ static int has_wrprotected_page(struct kvm *kvm,
485 int level) 520 int level)
486{ 521{
487 struct kvm_memory_slot *slot; 522 struct kvm_memory_slot *slot;
488 int *largepage_idx; 523 struct kvm_lpage_info *linfo;
489 524
490 slot = gfn_to_memslot(kvm, gfn); 525 slot = gfn_to_memslot(kvm, gfn);
491 if (slot) { 526 if (slot) {
492 largepage_idx = slot_largepage_idx(gfn, slot, level); 527 linfo = lpage_info_slot(gfn, slot, level);
493 return *largepage_idx; 528 return linfo->write_count;
494 } 529 }
495 530
496 return 1; 531 return 1;
@@ -514,14 +549,28 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
514 return ret; 549 return ret;
515} 550}
516 551
517static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) 552static struct kvm_memory_slot *
553gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
554 bool no_dirty_log)
518{ 555{
519 struct kvm_memory_slot *slot; 556 struct kvm_memory_slot *slot;
520 int host_level, level, max_level;
521 557
522 slot = gfn_to_memslot(vcpu->kvm, large_gfn); 558 slot = gfn_to_memslot(vcpu->kvm, gfn);
523 if (slot && slot->dirty_bitmap) 559 if (!slot || slot->flags & KVM_MEMSLOT_INVALID ||
524 return PT_PAGE_TABLE_LEVEL; 560 (no_dirty_log && slot->dirty_bitmap))
561 slot = NULL;
562
563 return slot;
564}
565
566static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn)
567{
568 return !gfn_to_memslot_dirty_bitmap(vcpu, large_gfn, true);
569}
570
571static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
572{
573 int host_level, level, max_level;
525 574
526 host_level = host_mapping_level(vcpu->kvm, large_gfn); 575 host_level = host_mapping_level(vcpu->kvm, large_gfn);
527 576
@@ -545,16 +594,15 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
545static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level) 594static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
546{ 595{
547 struct kvm_memory_slot *slot; 596 struct kvm_memory_slot *slot;
548 unsigned long idx; 597 struct kvm_lpage_info *linfo;
549 598
550 slot = gfn_to_memslot(kvm, gfn); 599 slot = gfn_to_memslot(kvm, gfn);
551 if (likely(level == PT_PAGE_TABLE_LEVEL)) 600 if (likely(level == PT_PAGE_TABLE_LEVEL))
552 return &slot->rmap[gfn - slot->base_gfn]; 601 return &slot->rmap[gfn - slot->base_gfn];
553 602
554 idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - 603 linfo = lpage_info_slot(gfn, slot, level);
555 (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
556 604
557 return &slot->lpage_info[level - 2][idx].rmap_pde; 605 return &linfo->rmap_pde;
558} 606}
559 607
560/* 608/*
@@ -591,6 +639,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
591 desc->sptes[0] = (u64 *)*rmapp; 639 desc->sptes[0] = (u64 *)*rmapp;
592 desc->sptes[1] = spte; 640 desc->sptes[1] = spte;
593 *rmapp = (unsigned long)desc | 1; 641 *rmapp = (unsigned long)desc | 1;
642 ++count;
594 } else { 643 } else {
595 rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte); 644 rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
596 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); 645 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
@@ -603,7 +652,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
603 desc = desc->more; 652 desc = desc->more;
604 } 653 }
605 for (i = 0; desc->sptes[i]; ++i) 654 for (i = 0; desc->sptes[i]; ++i)
606 ; 655 ++count;
607 desc->sptes[i] = spte; 656 desc->sptes[i] = spte;
608 } 657 }
609 return count; 658 return count;
@@ -645,18 +694,17 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
645 gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); 694 gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
646 rmapp = gfn_to_rmap(kvm, gfn, sp->role.level); 695 rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);
647 if (!*rmapp) { 696 if (!*rmapp) {
648 printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte); 697 printk(KERN_ERR "rmap_remove: %p 0->BUG\n", spte);
649 BUG(); 698 BUG();
650 } else if (!(*rmapp & 1)) { 699 } else if (!(*rmapp & 1)) {
651 rmap_printk("rmap_remove: %p %llx 1->0\n", spte, *spte); 700 rmap_printk("rmap_remove: %p 1->0\n", spte);
652 if ((u64 *)*rmapp != spte) { 701 if ((u64 *)*rmapp != spte) {
653 printk(KERN_ERR "rmap_remove: %p %llx 1->BUG\n", 702 printk(KERN_ERR "rmap_remove: %p 1->BUG\n", spte);
654 spte, *spte);
655 BUG(); 703 BUG();
656 } 704 }
657 *rmapp = 0; 705 *rmapp = 0;
658 } else { 706 } else {
659 rmap_printk("rmap_remove: %p %llx many->many\n", spte, *spte); 707 rmap_printk("rmap_remove: %p many->many\n", spte);
660 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); 708 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
661 prev_desc = NULL; 709 prev_desc = NULL;
662 while (desc) { 710 while (desc) {
@@ -670,35 +718,36 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
670 prev_desc = desc; 718 prev_desc = desc;
671 desc = desc->more; 719 desc = desc->more;
672 } 720 }
673 pr_err("rmap_remove: %p %llx many->many\n", spte, *spte); 721 pr_err("rmap_remove: %p many->many\n", spte);
674 BUG(); 722 BUG();
675 } 723 }
676} 724}
677 725
678static void set_spte_track_bits(u64 *sptep, u64 new_spte) 726static int set_spte_track_bits(u64 *sptep, u64 new_spte)
679{ 727{
680 pfn_t pfn; 728 pfn_t pfn;
681 u64 old_spte = *sptep; 729 u64 old_spte = *sptep;
682 730
683 if (!shadow_accessed_mask || !is_shadow_present_pte(old_spte) || 731 if (!spte_has_volatile_bits(old_spte))
684 old_spte & shadow_accessed_mask) {
685 __set_spte(sptep, new_spte); 732 __set_spte(sptep, new_spte);
686 } else 733 else
687 old_spte = __xchg_spte(sptep, new_spte); 734 old_spte = __xchg_spte(sptep, new_spte);
688 735
689 if (!is_rmap_spte(old_spte)) 736 if (!is_rmap_spte(old_spte))
690 return; 737 return 0;
738
691 pfn = spte_to_pfn(old_spte); 739 pfn = spte_to_pfn(old_spte);
692 if (!shadow_accessed_mask || old_spte & shadow_accessed_mask) 740 if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
693 kvm_set_pfn_accessed(pfn); 741 kvm_set_pfn_accessed(pfn);
694 if (is_writable_pte(old_spte)) 742 if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask))
695 kvm_set_pfn_dirty(pfn); 743 kvm_set_pfn_dirty(pfn);
744 return 1;
696} 745}
697 746
698static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte) 747static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte)
699{ 748{
700 set_spte_track_bits(sptep, new_spte); 749 if (set_spte_track_bits(sptep, new_spte))
701 rmap_remove(kvm, sptep); 750 rmap_remove(kvm, sptep);
702} 751}
703 752
704static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) 753static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
@@ -746,13 +795,6 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
746 } 795 }
747 spte = rmap_next(kvm, rmapp, spte); 796 spte = rmap_next(kvm, rmapp, spte);
748 } 797 }
749 if (write_protected) {
750 pfn_t pfn;
751
752 spte = rmap_next(kvm, rmapp, NULL);
753 pfn = spte_to_pfn(*spte);
754 kvm_set_pfn_dirty(pfn);
755 }
756 798
757 /* check for huge page mappings */ 799 /* check for huge page mappings */
758 for (i = PT_DIRECTORY_LEVEL; 800 for (i = PT_DIRECTORY_LEVEL;
@@ -848,19 +890,16 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
848 end = start + (memslot->npages << PAGE_SHIFT); 890 end = start + (memslot->npages << PAGE_SHIFT);
849 if (hva >= start && hva < end) { 891 if (hva >= start && hva < end) {
850 gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; 892 gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
893 gfn_t gfn = memslot->base_gfn + gfn_offset;
851 894
852 ret = handler(kvm, &memslot->rmap[gfn_offset], data); 895 ret = handler(kvm, &memslot->rmap[gfn_offset], data);
853 896
854 for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) { 897 for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
855 unsigned long idx; 898 struct kvm_lpage_info *linfo;
856 int sh; 899
857 900 linfo = lpage_info_slot(gfn, memslot,
858 sh = KVM_HPAGE_GFN_SHIFT(PT_DIRECTORY_LEVEL+j); 901 PT_DIRECTORY_LEVEL + j);
859 idx = ((memslot->base_gfn+gfn_offset) >> sh) - 902 ret |= handler(kvm, &linfo->rmap_pde, data);
860 (memslot->base_gfn >> sh);
861 ret |= handler(kvm,
862 &memslot->lpage_info[j][idx].rmap_pde,
863 data);
864 } 903 }
865 trace_kvm_age_page(hva, memslot, ret); 904 trace_kvm_age_page(hva, memslot, ret);
866 retval |= ret; 905 retval |= ret;
@@ -911,6 +950,35 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
911 return young; 950 return young;
912} 951}
913 952
953static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
954 unsigned long data)
955{
956 u64 *spte;
957 int young = 0;
958
959 /*
960 * If there's no access bit in the secondary pte set by the
961 * hardware it's up to gup-fast/gup to set the access bit in
962 * the primary pte or in the page structure.
963 */
964 if (!shadow_accessed_mask)
965 goto out;
966
967 spte = rmap_next(kvm, rmapp, NULL);
968 while (spte) {
969 u64 _spte = *spte;
970 BUG_ON(!(_spte & PT_PRESENT_MASK));
971 young = _spte & PT_ACCESSED_MASK;
972 if (young) {
973 young = 1;
974 break;
975 }
976 spte = rmap_next(kvm, rmapp, spte);
977 }
978out:
979 return young;
980}
981
914#define RMAP_RECYCLE_THRESHOLD 1000 982#define RMAP_RECYCLE_THRESHOLD 1000
915 983
916static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) 984static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
@@ -931,6 +999,11 @@ int kvm_age_hva(struct kvm *kvm, unsigned long hva)
931 return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp); 999 return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp);
932} 1000}
933 1001
1002int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
1003{
1004 return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp);
1005}
1006
934#ifdef MMU_DEBUG 1007#ifdef MMU_DEBUG
935static int is_empty_shadow_page(u64 *spt) 1008static int is_empty_shadow_page(u64 *spt)
936{ 1009{
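kvm_test_age_rmapp() above is the read-only counterpart of kvm_age_rmapp(): it reports whether any shadow mapping of the page has been accessed but leaves the accessed bit in place, whereas the aging path also clears it so the next scan yields fresh information. The distinction in miniature (helper names invented; u64 as in <linux/types.h>; the real aging path clears the bit atomically):

static int spte_test_young(u64 spte, u64 accessed_mask)
{
        return (spte & accessed_mask) != 0;     /* look, don't touch */
}

static int spte_age(u64 *sptep, u64 accessed_mask)
{
        int young = (*sptep & accessed_mask) != 0;

        *sptep &= ~accessed_mask;               /* clear so the next scan is meaningful */
        return young;
}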
@@ -947,16 +1020,28 @@ static int is_empty_shadow_page(u64 *spt)
947} 1020}
948#endif 1021#endif
949 1022
1023/*
1024 * This value is the sum of all of the kvm instances's
1025 * kvm->arch.n_used_mmu_pages values. We need a global,
1026 * aggregate version in order to make the slab shrinker
1027 * faster
1028 */
1029static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr)
1030{
1031 kvm->arch.n_used_mmu_pages += nr;
1032 percpu_counter_add(&kvm_total_used_mmu_pages, nr);
1033}
1034
950static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) 1035static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
951{ 1036{
952 ASSERT(is_empty_shadow_page(sp->spt)); 1037 ASSERT(is_empty_shadow_page(sp->spt));
953 hlist_del(&sp->hash_link); 1038 hlist_del(&sp->hash_link);
954 list_del(&sp->link); 1039 list_del(&sp->link);
955 __free_page(virt_to_page(sp->spt)); 1040 free_page((unsigned long)sp->spt);
956 if (!sp->role.direct) 1041 if (!sp->role.direct)
957 __free_page(virt_to_page(sp->gfns)); 1042 free_page((unsigned long)sp->gfns);
958 kmem_cache_free(mmu_page_header_cache, sp); 1043 kmem_cache_free(mmu_page_header_cache, sp);
959 ++kvm->arch.n_free_mmu_pages; 1044 kvm_mod_used_mmu_pages(kvm, -1);
960} 1045}
961 1046
962static unsigned kvm_page_table_hashfn(gfn_t gfn) 1047static unsigned kvm_page_table_hashfn(gfn_t gfn)
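For illustration, a minimal standalone sketch of the accounting pattern added in the hunk above: each VM keeps its own n_used_mmu_pages while a shared aggregate is bumped in the same step, so a shrinker-style consumer only has to read one number. A plain C11 atomic stands in for the kernel's percpu_counter; all names below are hypothetical and not part of this patch.

#include <stdatomic.h>
#include <stdio.h>

static atomic_long total_used_mmu_pages;             /* global aggregate */

struct vm { long n_used_mmu_pages; };                 /* per-instance count */

static void mod_used_mmu_pages(struct vm *vm, long nr)
{
        vm->n_used_mmu_pages += nr;                   /* per-VM bookkeeping */
        atomic_fetch_add(&total_used_mmu_pages, nr);  /* keep aggregate in sync */
}

int main(void)
{
        struct vm a = { 0 }, b = { 0 };

        mod_used_mmu_pages(&a, +1);                   /* allocate a shadow page */
        mod_used_mmu_pages(&b, +3);
        mod_used_mmu_pages(&a, -1);                   /* free it again */
        printf("a=%ld b=%ld total=%ld\n", a.n_used_mmu_pages,
               b.n_used_mmu_pages, atomic_load(&total_used_mmu_pages));
        return 0;
}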
@@ -979,7 +1064,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
979 bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); 1064 bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
980 sp->multimapped = 0; 1065 sp->multimapped = 0;
981 sp->parent_pte = parent_pte; 1066 sp->parent_pte = parent_pte;
982 --vcpu->kvm->arch.n_free_mmu_pages; 1067 kvm_mod_used_mmu_pages(vcpu->kvm, +1);
983 return sp; 1068 return sp;
984} 1069}
985 1070
@@ -1110,7 +1195,7 @@ static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
1110} 1195}
1111 1196
1112static int nonpaging_sync_page(struct kvm_vcpu *vcpu, 1197static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
1113 struct kvm_mmu_page *sp, bool clear_unsync) 1198 struct kvm_mmu_page *sp)
1114{ 1199{
1115 return 1; 1200 return 1;
1116} 1201}
@@ -1119,6 +1204,13 @@ static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
1119{ 1204{
1120} 1205}
1121 1206
1207static void nonpaging_update_pte(struct kvm_vcpu *vcpu,
1208 struct kvm_mmu_page *sp, u64 *spte,
1209 const void *pte)
1210{
1211 WARN_ON(1);
1212}
1213
1122#define KVM_PAGE_ARRAY_NR 16 1214#define KVM_PAGE_ARRAY_NR 16
1123 1215
1124struct kvm_mmu_pages { 1216struct kvm_mmu_pages {
@@ -1240,7 +1332,7 @@ static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
1240 if (clear_unsync) 1332 if (clear_unsync)
1241 kvm_unlink_unsync_page(vcpu->kvm, sp); 1333 kvm_unlink_unsync_page(vcpu->kvm, sp);
1242 1334
1243 if (vcpu->arch.mmu.sync_page(vcpu, sp, clear_unsync)) { 1335 if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
1244 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); 1336 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
1245 return 1; 1337 return 1;
1246 } 1338 }
@@ -1281,12 +1373,12 @@ static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn)
1281 continue; 1373 continue;
1282 1374
1283 WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); 1375 WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
1376 kvm_unlink_unsync_page(vcpu->kvm, s);
1284 if ((s->role.cr4_pae != !!is_pae(vcpu)) || 1377 if ((s->role.cr4_pae != !!is_pae(vcpu)) ||
1285 (vcpu->arch.mmu.sync_page(vcpu, s, true))) { 1378 (vcpu->arch.mmu.sync_page(vcpu, s))) {
1286 kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list); 1379 kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list);
1287 continue; 1380 continue;
1288 } 1381 }
1289 kvm_unlink_unsync_page(vcpu->kvm, s);
1290 flush = true; 1382 flush = true;
1291 } 1383 }
1292 1384
@@ -1403,7 +1495,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
1403 if (role.direct) 1495 if (role.direct)
1404 role.cr4_pae = 0; 1496 role.cr4_pae = 0;
1405 role.access = access; 1497 role.access = access;
1406 if (!tdp_enabled && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { 1498 if (!vcpu->arch.mmu.direct_map
1499 && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
1407 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); 1500 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
1408 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; 1501 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
1409 role.quadrant = quadrant; 1502 role.quadrant = quadrant;
@@ -1458,6 +1551,12 @@ static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
1458 iterator->addr = addr; 1551 iterator->addr = addr;
1459 iterator->shadow_addr = vcpu->arch.mmu.root_hpa; 1552 iterator->shadow_addr = vcpu->arch.mmu.root_hpa;
1460 iterator->level = vcpu->arch.mmu.shadow_root_level; 1553 iterator->level = vcpu->arch.mmu.shadow_root_level;
1554
1555 if (iterator->level == PT64_ROOT_LEVEL &&
1556 vcpu->arch.mmu.root_level < PT64_ROOT_LEVEL &&
1557 !vcpu->arch.mmu.direct_map)
1558 --iterator->level;
1559
1461 if (iterator->level == PT32E_ROOT_LEVEL) { 1560 if (iterator->level == PT32E_ROOT_LEVEL) {
1462 iterator->shadow_addr 1561 iterator->shadow_addr
1463 = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; 1562 = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
@@ -1665,41 +1764,31 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
1665 1764
1666/* 1765/*
1667 * Changing the number of mmu pages allocated to the vm 1766 * Changing the number of mmu pages allocated to the vm
1668 * Note: if kvm_nr_mmu_pages is too small, you will get a deadlock 1767 * Note: if goal_nr_mmu_pages is too small, you will get a deadlock
1669 */ 1768 */
1670void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) 1769void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
1671{ 1770{
1672 int used_pages;
1673 LIST_HEAD(invalid_list); 1771 LIST_HEAD(invalid_list);
1674
1675 used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages;
1676 used_pages = max(0, used_pages);
1677
1678 /* 1772 /*
1679 * If we set the number of mmu pages to be smaller than the 1773 * If we set the number of mmu pages to be smaller than the
1680 * number of active pages, we must free some mmu pages before we 1774 * number of active pages, we must free some mmu pages before we
1681 * change the value 1775 * change the value
1682 */ 1776 */
1683 1777
1684 if (used_pages > kvm_nr_mmu_pages) { 1778 if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
1685 while (used_pages > kvm_nr_mmu_pages && 1779 while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages &&
1686 !list_empty(&kvm->arch.active_mmu_pages)) { 1780 !list_empty(&kvm->arch.active_mmu_pages)) {
1687 struct kvm_mmu_page *page; 1781 struct kvm_mmu_page *page;
1688 1782
1689 page = container_of(kvm->arch.active_mmu_pages.prev, 1783 page = container_of(kvm->arch.active_mmu_pages.prev,
1690 struct kvm_mmu_page, link); 1784 struct kvm_mmu_page, link);
1691 used_pages -= kvm_mmu_prepare_zap_page(kvm, page, 1785 kvm_mmu_prepare_zap_page(kvm, page, &invalid_list);
1692 &invalid_list); 1786 kvm_mmu_commit_zap_page(kvm, &invalid_list);
1693 } 1787 }
1694 kvm_mmu_commit_zap_page(kvm, &invalid_list); 1788 goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
1695 kvm_nr_mmu_pages = used_pages;
1696 kvm->arch.n_free_mmu_pages = 0;
1697 } 1789 }
1698 else
1699 kvm->arch.n_free_mmu_pages += kvm_nr_mmu_pages
1700 - kvm->arch.n_alloc_mmu_pages;
1701 1790
1702 kvm->arch.n_alloc_mmu_pages = kvm_nr_mmu_pages; 1791 kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
1703} 1792}
1704 1793
1705static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) 1794static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
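As a side note, the rewritten kvm_mmu_change_mmu_pages() above boils down to: zap pages from the tail of the active list until the used count meets the goal (or the list runs out), then clamp the goal to what is actually in use. A reduced sketch of that control flow, with plain counters standing in for the page list; names and values are illustrative only.

#include <stdio.h>

/* returns the value that would become n_max_mmu_pages */
static int change_mmu_pages(int n_used, int n_zappable, int goal)
{
        while (n_used > goal && n_zappable > 0) {   /* active list not yet empty */
                --n_used;                           /* prepare + commit zap of one page */
                --n_zappable;
        }
        if (n_used > goal)                          /* could not shrink far enough */
                goal = n_used;
        return goal;
}

int main(void)
{
        printf("%d\n", change_mmu_pages(7, 7, 4));  /* shrinks to the goal: 4 */
        printf("%d\n", change_mmu_pages(7, 2, 4));  /* list ran out: clamped to 5 */
        printf("%d\n", change_mmu_pages(3, 3, 8));  /* nothing to do: 8 */
        return 0;
}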
@@ -1709,11 +1798,11 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
1709 LIST_HEAD(invalid_list); 1798 LIST_HEAD(invalid_list);
1710 int r; 1799 int r;
1711 1800
1712 pgprintk("%s: looking for gfn %lx\n", __func__, gfn); 1801 pgprintk("%s: looking for gfn %llx\n", __func__, gfn);
1713 r = 0; 1802 r = 0;
1714 1803
1715 for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) { 1804 for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
1716 pgprintk("%s: gfn %lx role %x\n", __func__, gfn, 1805 pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
1717 sp->role.word); 1806 sp->role.word);
1718 r = 1; 1807 r = 1;
1719 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); 1808 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
@@ -1729,7 +1818,7 @@ static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
1729 LIST_HEAD(invalid_list); 1818 LIST_HEAD(invalid_list);
1730 1819
1731 for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) { 1820 for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
1732 pgprintk("%s: zap %lx %x\n", 1821 pgprintk("%s: zap %llx %x\n",
1733 __func__, gfn, sp->role.word); 1822 __func__, gfn, sp->role.word);
1734 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); 1823 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
1735 } 1824 }
@@ -1915,9 +2004,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1915 unsigned pte_access, int user_fault, 2004 unsigned pte_access, int user_fault,
1916 int write_fault, int dirty, int level, 2005 int write_fault, int dirty, int level,
1917 gfn_t gfn, pfn_t pfn, bool speculative, 2006 gfn_t gfn, pfn_t pfn, bool speculative,
1918 bool can_unsync, bool reset_host_protection) 2007 bool can_unsync, bool host_writable)
1919{ 2008{
1920 u64 spte; 2009 u64 spte, entry = *sptep;
1921 int ret = 0; 2010 int ret = 0;
1922 2011
1923 /* 2012 /*
@@ -1925,7 +2014,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1925 * whether the guest actually used the pte (in order to detect 2014 * whether the guest actually used the pte (in order to detect
1926 * demand paging). 2015 * demand paging).
1927 */ 2016 */
1928 spte = shadow_base_present_pte | shadow_dirty_mask; 2017 spte = PT_PRESENT_MASK;
1929 if (!speculative) 2018 if (!speculative)
1930 spte |= shadow_accessed_mask; 2019 spte |= shadow_accessed_mask;
1931 if (!dirty) 2020 if (!dirty)
@@ -1942,14 +2031,16 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1942 spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn, 2031 spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
1943 kvm_is_mmio_pfn(pfn)); 2032 kvm_is_mmio_pfn(pfn));
1944 2033
1945 if (reset_host_protection) 2034 if (host_writable)
1946 spte |= SPTE_HOST_WRITEABLE; 2035 spte |= SPTE_HOST_WRITEABLE;
2036 else
2037 pte_access &= ~ACC_WRITE_MASK;
1947 2038
1948 spte |= (u64)pfn << PAGE_SHIFT; 2039 spte |= (u64)pfn << PAGE_SHIFT;
1949 2040
1950 if ((pte_access & ACC_WRITE_MASK) 2041 if ((pte_access & ACC_WRITE_MASK)
1951 || (!tdp_enabled && write_fault && !is_write_protection(vcpu) 2042 || (!vcpu->arch.mmu.direct_map && write_fault
1952 && !user_fault)) { 2043 && !is_write_protection(vcpu) && !user_fault)) {
1953 2044
1954 if (level > PT_PAGE_TABLE_LEVEL && 2045 if (level > PT_PAGE_TABLE_LEVEL &&
1955 has_wrprotected_page(vcpu->kvm, gfn, level)) { 2046 has_wrprotected_page(vcpu->kvm, gfn, level)) {
@@ -1960,7 +2051,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1960 2051
1961 spte |= PT_WRITABLE_MASK; 2052 spte |= PT_WRITABLE_MASK;
1962 2053
1963 if (!tdp_enabled && !(pte_access & ACC_WRITE_MASK)) 2054 if (!vcpu->arch.mmu.direct_map
2055 && !(pte_access & ACC_WRITE_MASK))
1964 spte &= ~PT_USER_MASK; 2056 spte &= ~PT_USER_MASK;
1965 2057
1966 /* 2058 /*
@@ -1973,7 +2065,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1973 goto set_pte; 2065 goto set_pte;
1974 2066
1975 if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { 2067 if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
1976 pgprintk("%s: found shadow page for %lx, marking ro\n", 2068 pgprintk("%s: found shadow page for %llx, marking ro\n",
1977 __func__, gfn); 2069 __func__, gfn);
1978 ret = 1; 2070 ret = 1;
1979 pte_access &= ~ACC_WRITE_MASK; 2071 pte_access &= ~ACC_WRITE_MASK;
@@ -1986,9 +2078,15 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1986 mark_page_dirty(vcpu->kvm, gfn); 2078 mark_page_dirty(vcpu->kvm, gfn);
1987 2079
1988set_pte: 2080set_pte:
1989 if (is_writable_pte(*sptep) && !is_writable_pte(spte))
1990 kvm_set_pfn_dirty(pfn);
1991 update_spte(sptep, spte); 2081 update_spte(sptep, spte);
2082 /*
2083 * If we overwrite a writable spte with a read-only one we
2084 * should flush remote TLBs. Otherwise rmap_write_protect
2085 * will find a read-only spte, even though the writable spte
2086 * might be cached on a CPU's TLB.
2087 */
2088 if (is_writable_pte(entry) && !is_writable_pte(*sptep))
2089 kvm_flush_remote_tlbs(vcpu->kvm);
1992done: 2090done:
1993 return ret; 2091 return ret;
1994} 2092}
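The new comment in set_spte() above argues that downgrading a writable spte to a read-only one must flush remote TLBs, since another CPU may still hold the old writable translation. A reduced model of that check, taking a snapshot of the old entry before the update exactly as the patch does; the PTE bit positions match x86, everything else is illustrative.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PT_PRESENT_MASK  (1ULL << 0)
#define PT_WRITABLE_MASK (1ULL << 1)

static bool is_writable_pte(uint64_t pte)
{
        return pte & PT_WRITABLE_MASK;
}

int main(void)
{
        uint64_t spte = PT_PRESENT_MASK | PT_WRITABLE_MASK;  /* currently RW */
        uint64_t entry = spte;                 /* snapshot before the update */

        spte = PT_PRESENT_MASK;                /* overwrite with a read-only spte */

        /* another CPU may still have the old writable translation cached */
        if (is_writable_pte(entry) && !is_writable_pte(spte))
                printf("flush remote TLBs\n");
        return 0;
}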
@@ -1998,13 +2096,13 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1998 int user_fault, int write_fault, int dirty, 2096 int user_fault, int write_fault, int dirty,
1999 int *ptwrite, int level, gfn_t gfn, 2097 int *ptwrite, int level, gfn_t gfn,
2000 pfn_t pfn, bool speculative, 2098 pfn_t pfn, bool speculative,
2001 bool reset_host_protection) 2099 bool host_writable)
2002{ 2100{
2003 int was_rmapped = 0; 2101 int was_rmapped = 0;
2004 int rmap_count; 2102 int rmap_count;
2005 2103
2006 pgprintk("%s: spte %llx access %x write_fault %d" 2104 pgprintk("%s: spte %llx access %x write_fault %d"
2007 " user_fault %d gfn %lx\n", 2105 " user_fault %d gfn %llx\n",
2008 __func__, *sptep, pt_access, 2106 __func__, *sptep, pt_access,
2009 write_fault, user_fault, gfn); 2107 write_fault, user_fault, gfn);
2010 2108
@@ -2023,7 +2121,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2023 __set_spte(sptep, shadow_trap_nonpresent_pte); 2121 __set_spte(sptep, shadow_trap_nonpresent_pte);
2024 kvm_flush_remote_tlbs(vcpu->kvm); 2122 kvm_flush_remote_tlbs(vcpu->kvm);
2025 } else if (pfn != spte_to_pfn(*sptep)) { 2123 } else if (pfn != spte_to_pfn(*sptep)) {
2026 pgprintk("hfn old %lx new %lx\n", 2124 pgprintk("hfn old %llx new %llx\n",
2027 spte_to_pfn(*sptep), pfn); 2125 spte_to_pfn(*sptep), pfn);
2028 drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); 2126 drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
2029 kvm_flush_remote_tlbs(vcpu->kvm); 2127 kvm_flush_remote_tlbs(vcpu->kvm);
@@ -2033,14 +2131,14 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2033 2131
2034 if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault, 2132 if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
2035 dirty, level, gfn, pfn, speculative, true, 2133 dirty, level, gfn, pfn, speculative, true,
2036 reset_host_protection)) { 2134 host_writable)) {
2037 if (write_fault) 2135 if (write_fault)
2038 *ptwrite = 1; 2136 *ptwrite = 1;
2039 kvm_mmu_flush_tlb(vcpu); 2137 kvm_mmu_flush_tlb(vcpu);
2040 } 2138 }
2041 2139
2042 pgprintk("%s: setting spte %llx\n", __func__, *sptep); 2140 pgprintk("%s: setting spte %llx\n", __func__, *sptep);
2043 pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n", 2141 pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n",
2044 is_large_pte(*sptep)? "2MB" : "4kB", 2142 is_large_pte(*sptep)? "2MB" : "4kB",
2045 *sptep & PT_PRESENT_MASK ?"RW":"R", gfn, 2143 *sptep & PT_PRESENT_MASK ?"RW":"R", gfn,
2046 *sptep, sptep); 2144 *sptep, sptep);
@@ -2064,8 +2162,95 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
2064{ 2162{
2065} 2163}
2066 2164
2165static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
2166 bool no_dirty_log)
2167{
2168 struct kvm_memory_slot *slot;
2169 unsigned long hva;
2170
2171 slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log);
2172 if (!slot) {
2173 get_page(bad_page);
2174 return page_to_pfn(bad_page);
2175 }
2176
2177 hva = gfn_to_hva_memslot(slot, gfn);
2178
2179 return hva_to_pfn_atomic(vcpu->kvm, hva);
2180}
2181
2182static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
2183 struct kvm_mmu_page *sp,
2184 u64 *start, u64 *end)
2185{
2186 struct page *pages[PTE_PREFETCH_NUM];
2187 unsigned access = sp->role.access;
2188 int i, ret;
2189 gfn_t gfn;
2190
2191 gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt);
2192 if (!gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK))
2193 return -1;
2194
2195 ret = gfn_to_page_many_atomic(vcpu->kvm, gfn, pages, end - start);
2196 if (ret <= 0)
2197 return -1;
2198
2199 for (i = 0; i < ret; i++, gfn++, start++)
2200 mmu_set_spte(vcpu, start, ACC_ALL,
2201 access, 0, 0, 1, NULL,
2202 sp->role.level, gfn,
2203 page_to_pfn(pages[i]), true, true);
2204
2205 return 0;
2206}
2207
2208static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
2209 struct kvm_mmu_page *sp, u64 *sptep)
2210{
2211 u64 *spte, *start = NULL;
2212 int i;
2213
2214 WARN_ON(!sp->role.direct);
2215
2216 i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
2217 spte = sp->spt + i;
2218
2219 for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
2220 if (*spte != shadow_trap_nonpresent_pte || spte == sptep) {
2221 if (!start)
2222 continue;
2223 if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
2224 break;
2225 start = NULL;
2226 } else if (!start)
2227 start = spte;
2228 }
2229}
2230
2231static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
2232{
2233 struct kvm_mmu_page *sp;
2234
2235 /*
2236 * Since there is no accessed bit on EPT, there is no way to
2237 * distinguish between actually accessed translations
2238 * and prefetched ones, so disable pte prefetch if EPT is
2239 * enabled.
2240 */
2241 if (!shadow_accessed_mask)
2242 return;
2243
2244 sp = page_header(__pa(sptep));
2245 if (sp->role.level > PT_PAGE_TABLE_LEVEL)
2246 return;
2247
2248 __direct_pte_prefetch(vcpu, sp, sptep);
2249}
2250
2067static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, 2251static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
2068 int level, gfn_t gfn, pfn_t pfn) 2252 int map_writable, int level, gfn_t gfn, pfn_t pfn,
2253 bool prefault)
2069{ 2254{
2070 struct kvm_shadow_walk_iterator iterator; 2255 struct kvm_shadow_walk_iterator iterator;
2071 struct kvm_mmu_page *sp; 2256 struct kvm_mmu_page *sp;
@@ -2074,9 +2259,12 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
2074 2259
2075 for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { 2260 for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
2076 if (iterator.level == level) { 2261 if (iterator.level == level) {
2077 mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL, 2262 unsigned pte_access = ACC_ALL;
2263
2264 mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access,
2078 0, write, 1, &pt_write, 2265 0, write, 1, &pt_write,
2079 level, gfn, pfn, false, true); 2266 level, gfn, pfn, prefault, map_writable);
2267 direct_pte_prefetch(vcpu, iterator.sptep);
2080 ++vcpu->stat.pf_fixed; 2268 ++vcpu->stat.pf_fixed;
2081 break; 2269 break;
2082 } 2270 }
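The prefetch walk added above scans a 16-entry aligned window around the faulting spte and batches consecutive empty slots into ranges. A standalone sketch of that grouping, reduced to an array of present flags; the real code maps each printed range with one gfn_to_page_many_atomic() call, and the names here are illustrative only.

#include <stdio.h>

#define PTE_PREFETCH_NUM 16

static void prefetch_range(int start, int end)
{
        printf("prefetch slots [%d, %d)\n", start, end);
}

/* present[]: 1 if the slot already has a mapping, 0 if it is empty */
static void pte_prefetch(const int *present, int fault_idx)
{
        int base = fault_idx & ~(PTE_PREFETCH_NUM - 1);   /* aligned window */
        int i, start = -1;

        for (i = 0; i < PTE_PREFETCH_NUM; i++) {
                int idx = base + i;

                if (present[idx] || idx == fault_idx) {
                        if (start >= 0) {                 /* close the open range */
                                prefetch_range(start, idx);
                                start = -1;
                        }
                } else if (start < 0)
                        start = idx;                      /* open a new range */
        }
        /* as in the kernel code, a range still open here is simply dropped */
}

int main(void)
{
        int present[PTE_PREFETCH_NUM] = { [3] = 1, [9] = 1 };

        pte_prefetch(present, 5);   /* fault landed on slot 5 */
        return 0;
}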
@@ -2098,28 +2286,31 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
2098 __set_spte(iterator.sptep, 2286 __set_spte(iterator.sptep,
2099 __pa(sp->spt) 2287 __pa(sp->spt)
2100 | PT_PRESENT_MASK | PT_WRITABLE_MASK 2288 | PT_PRESENT_MASK | PT_WRITABLE_MASK
2101 | shadow_user_mask | shadow_x_mask); 2289 | shadow_user_mask | shadow_x_mask
2290 | shadow_accessed_mask);
2102 } 2291 }
2103 } 2292 }
2104 return pt_write; 2293 return pt_write;
2105} 2294}
2106 2295
2107static void kvm_send_hwpoison_signal(struct kvm *kvm, gfn_t gfn) 2296static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
2108{ 2297{
2109 char buf[1]; 2298 siginfo_t info;
2110 void __user *hva;
2111 int r;
2112 2299
2113 /* Touch the page, so send SIGBUS */ 2300 info.si_signo = SIGBUS;
2114 hva = (void __user *)gfn_to_hva(kvm, gfn); 2301 info.si_errno = 0;
2115 r = copy_from_user(buf, hva, 1); 2302 info.si_code = BUS_MCEERR_AR;
2303 info.si_addr = (void __user *)address;
2304 info.si_addr_lsb = PAGE_SHIFT;
2305
2306 send_sig_info(SIGBUS, &info, tsk);
2116} 2307}
2117 2308
2118static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn) 2309static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
2119{ 2310{
2120 kvm_release_pfn_clean(pfn); 2311 kvm_release_pfn_clean(pfn);
2121 if (is_hwpoison_pfn(pfn)) { 2312 if (is_hwpoison_pfn(pfn)) {
2122 kvm_send_hwpoison_signal(kvm, gfn); 2313 kvm_send_hwpoison_signal(gfn_to_hva(kvm, gfn), current);
2123 return 0; 2314 return 0;
2124 } else if (is_fault_pfn(pfn)) 2315 } else if (is_fault_pfn(pfn))
2125 return -EFAULT; 2316 return -EFAULT;
@@ -2127,27 +2318,81 @@ static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
2127 return 1; 2318 return 1;
2128} 2319}
2129 2320
2130static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) 2321static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
2322 gfn_t *gfnp, pfn_t *pfnp, int *levelp)
2323{
2324 pfn_t pfn = *pfnp;
2325 gfn_t gfn = *gfnp;
2326 int level = *levelp;
2327
2328 /*
2329 * Check if it's a transparent hugepage. If this were a
2330 * hugetlbfs page, level wouldn't be set to
2331 * PT_PAGE_TABLE_LEVEL and there would be no adjustment done
2332 * here.
2333 */
2334 if (!is_error_pfn(pfn) && !kvm_is_mmio_pfn(pfn) &&
2335 level == PT_PAGE_TABLE_LEVEL &&
2336 PageTransCompound(pfn_to_page(pfn)) &&
2337 !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) {
2338 unsigned long mask;
2339 /*
2340 * mmu_notifier_retry was successful and we hold the
2341 * mmu_lock here, so the pmd can't start splitting
2342 * from under us, and in turn
2343 * __split_huge_page_refcount() can't run from under
2344 * us and we can safely transfer the refcount from
2345 * PG_tail to PG_head as we switch the pfn from the
2346 * tail page to the head page.
2347 */
2348 *levelp = level = PT_DIRECTORY_LEVEL;
2349 mask = KVM_PAGES_PER_HPAGE(level) - 1;
2350 VM_BUG_ON((gfn & mask) != (pfn & mask));
2351 if (pfn & mask) {
2352 gfn &= ~mask;
2353 *gfnp = gfn;
2354 kvm_release_pfn_clean(pfn);
2355 pfn &= ~mask;
2356 if (!get_page_unless_zero(pfn_to_page(pfn)))
2357 BUG();
2358 *pfnp = pfn;
2359 }
2360 }
2361}
2362
2363static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
2364 gva_t gva, pfn_t *pfn, bool write, bool *writable);
2365
2366static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
2367 bool prefault)
2131{ 2368{
2132 int r; 2369 int r;
2133 int level; 2370 int level;
2371 int force_pt_level;
2134 pfn_t pfn; 2372 pfn_t pfn;
2135 unsigned long mmu_seq; 2373 unsigned long mmu_seq;
2374 bool map_writable;
2136 2375
2137 level = mapping_level(vcpu, gfn); 2376 force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
2138 2377 if (likely(!force_pt_level)) {
2139 /* 2378 level = mapping_level(vcpu, gfn);
2140 * This path builds a PAE pagetable - so we can map 2mb pages at 2379 /*
2141 * maximum. Therefore check if the level is larger than that. 2380 * This path builds a PAE pagetable - so we can map
2142 */ 2381 * 2mb pages at maximum. Therefore check if the level
2143 if (level > PT_DIRECTORY_LEVEL) 2382 * is larger than that.
2144 level = PT_DIRECTORY_LEVEL; 2383 */
2384 if (level > PT_DIRECTORY_LEVEL)
2385 level = PT_DIRECTORY_LEVEL;
2145 2386
2146 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); 2387 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
2388 } else
2389 level = PT_PAGE_TABLE_LEVEL;
2147 2390
2148 mmu_seq = vcpu->kvm->mmu_notifier_seq; 2391 mmu_seq = vcpu->kvm->mmu_notifier_seq;
2149 smp_rmb(); 2392 smp_rmb();
2150 pfn = gfn_to_pfn(vcpu->kvm, gfn); 2393
2394 if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))
2395 return 0;
2151 2396
2152 /* mmio */ 2397 /* mmio */
2153 if (is_error_pfn(pfn)) 2398 if (is_error_pfn(pfn))
@@ -2157,7 +2402,10 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
2157 if (mmu_notifier_retry(vcpu, mmu_seq)) 2402 if (mmu_notifier_retry(vcpu, mmu_seq))
2158 goto out_unlock; 2403 goto out_unlock;
2159 kvm_mmu_free_some_pages(vcpu); 2404 kvm_mmu_free_some_pages(vcpu);
2160 r = __direct_map(vcpu, v, write, level, gfn, pfn); 2405 if (likely(!force_pt_level))
2406 transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
2407 r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn,
2408 prefault);
2161 spin_unlock(&vcpu->kvm->mmu_lock); 2409 spin_unlock(&vcpu->kvm->mmu_lock);
2162 2410
2163 2411
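The alignment logic in transparent_hugepage_adjust() above requires gfn and pfn to share the same offset inside the 2MB region and then rounds both down to the head of that region. A worked example of the mask arithmetic, with made-up frame numbers.

#include <assert.h>
#include <stdio.h>

#define PAGES_PER_HPAGE 512UL     /* 2MB / 4KB */

int main(void)
{
        unsigned long mask = PAGES_PER_HPAGE - 1;   /* 0x1ff */
        unsigned long gfn = 0x1234, pfn = 0x5634;   /* same low 9 bits: 0x034 */

        assert((gfn & mask) == (pfn & mask));       /* the VM_BUG_ON() condition */
        if (pfn & mask) {
                gfn &= ~mask;                       /* 0x1200: head of the guest 2MB region */
                pfn &= ~mask;                       /* 0x5600: head page of the compound page */
        }
        printf("gfn=0x%lx pfn=0x%lx\n", gfn, pfn);
        return 0;
}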
@@ -2179,7 +2427,9 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
2179 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) 2427 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
2180 return; 2428 return;
2181 spin_lock(&vcpu->kvm->mmu_lock); 2429 spin_lock(&vcpu->kvm->mmu_lock);
2182 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { 2430 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL &&
2431 (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL ||
2432 vcpu->arch.mmu.direct_map)) {
2183 hpa_t root = vcpu->arch.mmu.root_hpa; 2433 hpa_t root = vcpu->arch.mmu.root_hpa;
2184 2434
2185 sp = page_header(root); 2435 sp = page_header(root);
@@ -2222,83 +2472,163 @@ static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
2222 return ret; 2472 return ret;
2223} 2473}
2224 2474
2225static int mmu_alloc_roots(struct kvm_vcpu *vcpu) 2475static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
2226{ 2476{
2227 int i;
2228 gfn_t root_gfn;
2229 struct kvm_mmu_page *sp; 2477 struct kvm_mmu_page *sp;
2230 int direct = 0; 2478 unsigned i;
2231 u64 pdptr;
2232
2233 root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT;
2234 2479
2235 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { 2480 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
2481 spin_lock(&vcpu->kvm->mmu_lock);
2482 kvm_mmu_free_some_pages(vcpu);
2483 sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_LEVEL,
2484 1, ACC_ALL, NULL);
2485 ++sp->root_count;
2486 spin_unlock(&vcpu->kvm->mmu_lock);
2487 vcpu->arch.mmu.root_hpa = __pa(sp->spt);
2488 } else if (vcpu->arch.mmu.shadow_root_level == PT32E_ROOT_LEVEL) {
2489 for (i = 0; i < 4; ++i) {
2490 hpa_t root = vcpu->arch.mmu.pae_root[i];
2491
2492 ASSERT(!VALID_PAGE(root));
2493 spin_lock(&vcpu->kvm->mmu_lock);
2494 kvm_mmu_free_some_pages(vcpu);
2495 sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT),
2496 i << 30,
2497 PT32_ROOT_LEVEL, 1, ACC_ALL,
2498 NULL);
2499 root = __pa(sp->spt);
2500 ++sp->root_count;
2501 spin_unlock(&vcpu->kvm->mmu_lock);
2502 vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
2503 }
2504 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
2505 } else
2506 BUG();
2507
2508 return 0;
2509}
2510
2511static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
2512{
2513 struct kvm_mmu_page *sp;
2514 u64 pdptr, pm_mask;
2515 gfn_t root_gfn;
2516 int i;
2517
2518 root_gfn = vcpu->arch.mmu.get_cr3(vcpu) >> PAGE_SHIFT;
2519
2520 if (mmu_check_root(vcpu, root_gfn))
2521 return 1;
2522
2523 /*
2524 * Do we shadow a long mode page table? If so we need to
2525 * write-protect the guest's page table root.
2526 */
2527 if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
2236 hpa_t root = vcpu->arch.mmu.root_hpa; 2528 hpa_t root = vcpu->arch.mmu.root_hpa;
2237 2529
2238 ASSERT(!VALID_PAGE(root)); 2530 ASSERT(!VALID_PAGE(root));
2239 if (mmu_check_root(vcpu, root_gfn)) 2531
2240 return 1;
2241 if (tdp_enabled) {
2242 direct = 1;
2243 root_gfn = 0;
2244 }
2245 spin_lock(&vcpu->kvm->mmu_lock); 2532 spin_lock(&vcpu->kvm->mmu_lock);
2246 kvm_mmu_free_some_pages(vcpu); 2533 kvm_mmu_free_some_pages(vcpu);
2247 sp = kvm_mmu_get_page(vcpu, root_gfn, 0, 2534 sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL,
2248 PT64_ROOT_LEVEL, direct, 2535 0, ACC_ALL, NULL);
2249 ACC_ALL, NULL);
2250 root = __pa(sp->spt); 2536 root = __pa(sp->spt);
2251 ++sp->root_count; 2537 ++sp->root_count;
2252 spin_unlock(&vcpu->kvm->mmu_lock); 2538 spin_unlock(&vcpu->kvm->mmu_lock);
2253 vcpu->arch.mmu.root_hpa = root; 2539 vcpu->arch.mmu.root_hpa = root;
2254 return 0; 2540 return 0;
2255 } 2541 }
2256 direct = !is_paging(vcpu); 2542
2543 /*
2544 * We shadow a 32 bit page table. This may be a legacy 2-level
2545 * or a PAE 3-level page table. In either case we need to be aware that
2546 * the shadow page table may be a PAE or a long mode page table.
2547 */
2548 pm_mask = PT_PRESENT_MASK;
2549 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL)
2550 pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
2551
2257 for (i = 0; i < 4; ++i) { 2552 for (i = 0; i < 4; ++i) {
2258 hpa_t root = vcpu->arch.mmu.pae_root[i]; 2553 hpa_t root = vcpu->arch.mmu.pae_root[i];
2259 2554
2260 ASSERT(!VALID_PAGE(root)); 2555 ASSERT(!VALID_PAGE(root));
2261 if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { 2556 if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
2262 pdptr = kvm_pdptr_read(vcpu, i); 2557 pdptr = kvm_pdptr_read_mmu(vcpu, &vcpu->arch.mmu, i);
2263 if (!is_present_gpte(pdptr)) { 2558 if (!is_present_gpte(pdptr)) {
2264 vcpu->arch.mmu.pae_root[i] = 0; 2559 vcpu->arch.mmu.pae_root[i] = 0;
2265 continue; 2560 continue;
2266 } 2561 }
2267 root_gfn = pdptr >> PAGE_SHIFT; 2562 root_gfn = pdptr >> PAGE_SHIFT;
2268 } else if (vcpu->arch.mmu.root_level == 0) 2563 if (mmu_check_root(vcpu, root_gfn))
2269 root_gfn = 0; 2564 return 1;
2270 if (mmu_check_root(vcpu, root_gfn))
2271 return 1;
2272 if (tdp_enabled) {
2273 direct = 1;
2274 root_gfn = i << 30;
2275 } 2565 }
2276 spin_lock(&vcpu->kvm->mmu_lock); 2566 spin_lock(&vcpu->kvm->mmu_lock);
2277 kvm_mmu_free_some_pages(vcpu); 2567 kvm_mmu_free_some_pages(vcpu);
2278 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, 2568 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
2279 PT32_ROOT_LEVEL, direct, 2569 PT32_ROOT_LEVEL, 0,
2280 ACC_ALL, NULL); 2570 ACC_ALL, NULL);
2281 root = __pa(sp->spt); 2571 root = __pa(sp->spt);
2282 ++sp->root_count; 2572 ++sp->root_count;
2283 spin_unlock(&vcpu->kvm->mmu_lock); 2573 spin_unlock(&vcpu->kvm->mmu_lock);
2284 2574
2285 vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK; 2575 vcpu->arch.mmu.pae_root[i] = root | pm_mask;
2286 } 2576 }
2287 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); 2577 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
2578
2579 /*
2580 * If we shadow a 32 bit page table with a long mode page
2581 * table we enter this path.
2582 */
2583 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
2584 if (vcpu->arch.mmu.lm_root == NULL) {
2585 /*
2586 * The additional page necessary for this is only
2587 * allocated on demand.
2588 */
2589
2590 u64 *lm_root;
2591
2592 lm_root = (void*)get_zeroed_page(GFP_KERNEL);
2593 if (lm_root == NULL)
2594 return 1;
2595
2596 lm_root[0] = __pa(vcpu->arch.mmu.pae_root) | pm_mask;
2597
2598 vcpu->arch.mmu.lm_root = lm_root;
2599 }
2600
2601 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.lm_root);
2602 }
2603
2288 return 0; 2604 return 0;
2289} 2605}
2290 2606
2607static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
2608{
2609 if (vcpu->arch.mmu.direct_map)
2610 return mmu_alloc_direct_roots(vcpu);
2611 else
2612 return mmu_alloc_shadow_roots(vcpu);
2613}
2614
2291static void mmu_sync_roots(struct kvm_vcpu *vcpu) 2615static void mmu_sync_roots(struct kvm_vcpu *vcpu)
2292{ 2616{
2293 int i; 2617 int i;
2294 struct kvm_mmu_page *sp; 2618 struct kvm_mmu_page *sp;
2295 2619
2620 if (vcpu->arch.mmu.direct_map)
2621 return;
2622
2296 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) 2623 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
2297 return; 2624 return;
2298 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { 2625
2626 trace_kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
2627 if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
2299 hpa_t root = vcpu->arch.mmu.root_hpa; 2628 hpa_t root = vcpu->arch.mmu.root_hpa;
2300 sp = page_header(root); 2629 sp = page_header(root);
2301 mmu_sync_children(vcpu, sp); 2630 mmu_sync_children(vcpu, sp);
2631 trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
2302 return; 2632 return;
2303 } 2633 }
2304 for (i = 0; i < 4; ++i) { 2634 for (i = 0; i < 4; ++i) {
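The lm_root handling above wraps the four PAE root entries in a single on-demand long-mode root whose entry 0 points at the PAE root page, tagged with pm_mask. A rough sketch of that shape, using a faked physical address and illustrative flag values; nothing here is part of the patch.

#include <stdint.h>
#include <stdio.h>

#define PT_PRESENT_MASK   (1ULL << 0)
#define PT_WRITABLE_MASK  (1ULL << 1)
#define PT_USER_MASK      (1ULL << 2)
#define PT_ACCESSED_MASK  (1ULL << 5)

int main(void)
{
        uint64_t pae_root_pa = 0x5000;          /* pretend physical address of the PAE root page */
        uint64_t lm_root[512] = { 0 };          /* the extra page, allocated on demand */
        uint64_t pm_mask = PT_PRESENT_MASK | PT_ACCESSED_MASK |
                           PT_WRITABLE_MASK | PT_USER_MASK;

        /* the single top-level entry points at the PAE root page */
        lm_root[0] = pae_root_pa | pm_mask;

        printf("lm_root[0] = %#llx\n", (unsigned long long)lm_root[0]);
        return 0;
}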
@@ -2310,6 +2640,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
2310 mmu_sync_children(vcpu, sp); 2640 mmu_sync_children(vcpu, sp);
2311 } 2641 }
2312 } 2642 }
2643 trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
2313} 2644}
2314 2645
2315void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) 2646void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
@@ -2320,15 +2651,24 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
2320} 2651}
2321 2652
2322static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr, 2653static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
2323 u32 access, u32 *error) 2654 u32 access, struct x86_exception *exception)
2324{ 2655{
2325 if (error) 2656 if (exception)
2326 *error = 0; 2657 exception->error_code = 0;
2327 return vaddr; 2658 return vaddr;
2328} 2659}
2329 2660
2661static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
2662 u32 access,
2663 struct x86_exception *exception)
2664{
2665 if (exception)
2666 exception->error_code = 0;
2667 return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access);
2668}
2669
2330static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, 2670static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
2331 u32 error_code) 2671 u32 error_code, bool prefault)
2332{ 2672{
2333 gfn_t gfn; 2673 gfn_t gfn;
2334 int r; 2674 int r;
@@ -2344,17 +2684,68 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
2344 gfn = gva >> PAGE_SHIFT; 2684 gfn = gva >> PAGE_SHIFT;
2345 2685
2346 return nonpaging_map(vcpu, gva & PAGE_MASK, 2686 return nonpaging_map(vcpu, gva & PAGE_MASK,
2347 error_code & PFERR_WRITE_MASK, gfn); 2687 error_code & PFERR_WRITE_MASK, gfn, prefault);
2688}
2689
2690static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
2691{
2692 struct kvm_arch_async_pf arch;
2693
2694 arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
2695 arch.gfn = gfn;
2696 arch.direct_map = vcpu->arch.mmu.direct_map;
2697 arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu);
2698
2699 return kvm_setup_async_pf(vcpu, gva, gfn, &arch);
2700}
2701
2702static bool can_do_async_pf(struct kvm_vcpu *vcpu)
2703{
2704 if (unlikely(!irqchip_in_kernel(vcpu->kvm) ||
2705 kvm_event_needs_reinjection(vcpu)))
2706 return false;
2707
2708 return kvm_x86_ops->interrupt_allowed(vcpu);
2709}
2710
2711static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
2712 gva_t gva, pfn_t *pfn, bool write, bool *writable)
2713{
2714 bool async;
2715
2716 *pfn = gfn_to_pfn_async(vcpu->kvm, gfn, &async, write, writable);
2717
2718 if (!async)
2719 return false; /* *pfn has correct page already */
2720
2721 put_page(pfn_to_page(*pfn));
2722
2723 if (!prefault && can_do_async_pf(vcpu)) {
2724 trace_kvm_try_async_get_page(gva, gfn);
2725 if (kvm_find_async_pf_gfn(vcpu, gfn)) {
2726 trace_kvm_async_pf_doublefault(gva, gfn);
2727 kvm_make_request(KVM_REQ_APF_HALT, vcpu);
2728 return true;
2729 } else if (kvm_arch_setup_async_pf(vcpu, gva, gfn))
2730 return true;
2731 }
2732
2733 *pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write, writable);
2734
2735 return false;
2348} 2736}
2349 2737
2350static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, 2738static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
2351 u32 error_code) 2739 bool prefault)
2352{ 2740{
2353 pfn_t pfn; 2741 pfn_t pfn;
2354 int r; 2742 int r;
2355 int level; 2743 int level;
2744 int force_pt_level;
2356 gfn_t gfn = gpa >> PAGE_SHIFT; 2745 gfn_t gfn = gpa >> PAGE_SHIFT;
2357 unsigned long mmu_seq; 2746 unsigned long mmu_seq;
2747 int write = error_code & PFERR_WRITE_MASK;
2748 bool map_writable;
2358 2749
2359 ASSERT(vcpu); 2750 ASSERT(vcpu);
2360 ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); 2751 ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
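The async page fault token built in kvm_arch_setup_async_pf() above packs a per-vcpu sequence number above the vcpu id. A small sketch of that packing and how the two fields read back out, assuming vcpu ids fit in the low 12 bits; the helper name is hypothetical.

#include <stdint.h>
#include <stdio.h>

/* sequence number in the high bits, vcpu id in the low 12 bits */
static uint32_t pack_apf_token(uint32_t *apf_id, uint32_t vcpu_id)
{
        return ((*apf_id)++ << 12) | vcpu_id;
}

int main(void)
{
        uint32_t apf_id = 7;                        /* stands in for vcpu->arch.apf.id */
        uint32_t token = pack_apf_token(&apf_id, 3);

        printf("token=%#x seq=%u vcpu=%u\n",
               token, token >> 12, token & 0xfff);
        return 0;
}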
@@ -2363,21 +2754,30 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
2363 if (r) 2754 if (r)
2364 return r; 2755 return r;
2365 2756
2366 level = mapping_level(vcpu, gfn); 2757 force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
2367 2758 if (likely(!force_pt_level)) {
2368 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); 2759 level = mapping_level(vcpu, gfn);
2760 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
2761 } else
2762 level = PT_PAGE_TABLE_LEVEL;
2369 2763
2370 mmu_seq = vcpu->kvm->mmu_notifier_seq; 2764 mmu_seq = vcpu->kvm->mmu_notifier_seq;
2371 smp_rmb(); 2765 smp_rmb();
2372 pfn = gfn_to_pfn(vcpu->kvm, gfn); 2766
2767 if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
2768 return 0;
2769
2770 /* mmio */
2373 if (is_error_pfn(pfn)) 2771 if (is_error_pfn(pfn))
2374 return kvm_handle_bad_page(vcpu->kvm, gfn, pfn); 2772 return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
2375 spin_lock(&vcpu->kvm->mmu_lock); 2773 spin_lock(&vcpu->kvm->mmu_lock);
2376 if (mmu_notifier_retry(vcpu, mmu_seq)) 2774 if (mmu_notifier_retry(vcpu, mmu_seq))
2377 goto out_unlock; 2775 goto out_unlock;
2378 kvm_mmu_free_some_pages(vcpu); 2776 kvm_mmu_free_some_pages(vcpu);
2379 r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK, 2777 if (likely(!force_pt_level))
2380 level, gfn, pfn); 2778 transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
2779 r = __direct_map(vcpu, gpa, write, map_writable,
2780 level, gfn, pfn, prefault);
2381 spin_unlock(&vcpu->kvm->mmu_lock); 2781 spin_unlock(&vcpu->kvm->mmu_lock);
2382 2782
2383 return r; 2783 return r;
@@ -2393,10 +2793,9 @@ static void nonpaging_free(struct kvm_vcpu *vcpu)
2393 mmu_free_roots(vcpu); 2793 mmu_free_roots(vcpu);
2394} 2794}
2395 2795
2396static int nonpaging_init_context(struct kvm_vcpu *vcpu) 2796static int nonpaging_init_context(struct kvm_vcpu *vcpu,
2797 struct kvm_mmu *context)
2397{ 2798{
2398 struct kvm_mmu *context = &vcpu->arch.mmu;
2399
2400 context->new_cr3 = nonpaging_new_cr3; 2799 context->new_cr3 = nonpaging_new_cr3;
2401 context->page_fault = nonpaging_page_fault; 2800 context->page_fault = nonpaging_page_fault;
2402 context->gva_to_gpa = nonpaging_gva_to_gpa; 2801 context->gva_to_gpa = nonpaging_gva_to_gpa;
@@ -2404,9 +2803,12 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu)
2404 context->prefetch_page = nonpaging_prefetch_page; 2803 context->prefetch_page = nonpaging_prefetch_page;
2405 context->sync_page = nonpaging_sync_page; 2804 context->sync_page = nonpaging_sync_page;
2406 context->invlpg = nonpaging_invlpg; 2805 context->invlpg = nonpaging_invlpg;
2806 context->update_pte = nonpaging_update_pte;
2407 context->root_level = 0; 2807 context->root_level = 0;
2408 context->shadow_root_level = PT32E_ROOT_LEVEL; 2808 context->shadow_root_level = PT32E_ROOT_LEVEL;
2409 context->root_hpa = INVALID_PAGE; 2809 context->root_hpa = INVALID_PAGE;
2810 context->direct_map = true;
2811 context->nx = false;
2410 return 0; 2812 return 0;
2411} 2813}
2412 2814
@@ -2418,15 +2820,19 @@ void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
2418 2820
2419static void paging_new_cr3(struct kvm_vcpu *vcpu) 2821static void paging_new_cr3(struct kvm_vcpu *vcpu)
2420{ 2822{
2421 pgprintk("%s: cr3 %lx\n", __func__, vcpu->arch.cr3); 2823 pgprintk("%s: cr3 %lx\n", __func__, kvm_read_cr3(vcpu));
2422 mmu_free_roots(vcpu); 2824 mmu_free_roots(vcpu);
2423} 2825}
2424 2826
2827static unsigned long get_cr3(struct kvm_vcpu *vcpu)
2828{
2829 return kvm_read_cr3(vcpu);
2830}
2831
2425static void inject_page_fault(struct kvm_vcpu *vcpu, 2832static void inject_page_fault(struct kvm_vcpu *vcpu,
2426 u64 addr, 2833 struct x86_exception *fault)
2427 u32 err_code)
2428{ 2834{
2429 kvm_inject_page_fault(vcpu, addr, err_code); 2835 vcpu->arch.mmu.inject_page_fault(vcpu, fault);
2430} 2836}
2431 2837
2432static void paging_free(struct kvm_vcpu *vcpu) 2838static void paging_free(struct kvm_vcpu *vcpu)
@@ -2434,12 +2840,12 @@ static void paging_free(struct kvm_vcpu *vcpu)
2434 nonpaging_free(vcpu); 2840 nonpaging_free(vcpu);
2435} 2841}
2436 2842
2437static bool is_rsvd_bits_set(struct kvm_vcpu *vcpu, u64 gpte, int level) 2843static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
2438{ 2844{
2439 int bit7; 2845 int bit7;
2440 2846
2441 bit7 = (gpte >> 7) & 1; 2847 bit7 = (gpte >> 7) & 1;
2442 return (gpte & vcpu->arch.mmu.rsvd_bits_mask[bit7][level-1]) != 0; 2848 return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
2443} 2849}
2444 2850
2445#define PTTYPE 64 2851#define PTTYPE 64
@@ -2450,13 +2856,14 @@ static bool is_rsvd_bits_set(struct kvm_vcpu *vcpu, u64 gpte, int level)
2450#include "paging_tmpl.h" 2856#include "paging_tmpl.h"
2451#undef PTTYPE 2857#undef PTTYPE
2452 2858
2453static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level) 2859static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
2860 struct kvm_mmu *context,
2861 int level)
2454{ 2862{
2455 struct kvm_mmu *context = &vcpu->arch.mmu;
2456 int maxphyaddr = cpuid_maxphyaddr(vcpu); 2863 int maxphyaddr = cpuid_maxphyaddr(vcpu);
2457 u64 exb_bit_rsvd = 0; 2864 u64 exb_bit_rsvd = 0;
2458 2865
2459 if (!is_nx(vcpu)) 2866 if (!context->nx)
2460 exb_bit_rsvd = rsvd_bits(63, 63); 2867 exb_bit_rsvd = rsvd_bits(63, 63);
2461 switch (level) { 2868 switch (level) {
2462 case PT32_ROOT_LEVEL: 2869 case PT32_ROOT_LEVEL:
@@ -2511,9 +2918,13 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
2511 } 2918 }
2512} 2919}
2513 2920
2514static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) 2921static int paging64_init_context_common(struct kvm_vcpu *vcpu,
2922 struct kvm_mmu *context,
2923 int level)
2515{ 2924{
2516 struct kvm_mmu *context = &vcpu->arch.mmu; 2925 context->nx = is_nx(vcpu);
2926
2927 reset_rsvds_bits_mask(vcpu, context, level);
2517 2928
2518 ASSERT(is_pae(vcpu)); 2929 ASSERT(is_pae(vcpu));
2519 context->new_cr3 = paging_new_cr3; 2930 context->new_cr3 = paging_new_cr3;
@@ -2522,24 +2933,28 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
2522 context->prefetch_page = paging64_prefetch_page; 2933 context->prefetch_page = paging64_prefetch_page;
2523 context->sync_page = paging64_sync_page; 2934 context->sync_page = paging64_sync_page;
2524 context->invlpg = paging64_invlpg; 2935 context->invlpg = paging64_invlpg;
2936 context->update_pte = paging64_update_pte;
2525 context->free = paging_free; 2937 context->free = paging_free;
2526 context->root_level = level; 2938 context->root_level = level;
2527 context->shadow_root_level = level; 2939 context->shadow_root_level = level;
2528 context->root_hpa = INVALID_PAGE; 2940 context->root_hpa = INVALID_PAGE;
2941 context->direct_map = false;
2529 return 0; 2942 return 0;
2530} 2943}
2531 2944
2532static int paging64_init_context(struct kvm_vcpu *vcpu) 2945static int paging64_init_context(struct kvm_vcpu *vcpu,
2946 struct kvm_mmu *context)
2533{ 2947{
2534 reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL); 2948 return paging64_init_context_common(vcpu, context, PT64_ROOT_LEVEL);
2535 return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
2536} 2949}
2537 2950
2538static int paging32_init_context(struct kvm_vcpu *vcpu) 2951static int paging32_init_context(struct kvm_vcpu *vcpu,
2952 struct kvm_mmu *context)
2539{ 2953{
2540 struct kvm_mmu *context = &vcpu->arch.mmu; 2954 context->nx = false;
2955
2956 reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL);
2541 2957
2542 reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL);
2543 context->new_cr3 = paging_new_cr3; 2958 context->new_cr3 = paging_new_cr3;
2544 context->page_fault = paging32_page_fault; 2959 context->page_fault = paging32_page_fault;
2545 context->gva_to_gpa = paging32_gva_to_gpa; 2960 context->gva_to_gpa = paging32_gva_to_gpa;
@@ -2547,44 +2962,57 @@ static int paging32_init_context(struct kvm_vcpu *vcpu)
2547 context->prefetch_page = paging32_prefetch_page; 2962 context->prefetch_page = paging32_prefetch_page;
2548 context->sync_page = paging32_sync_page; 2963 context->sync_page = paging32_sync_page;
2549 context->invlpg = paging32_invlpg; 2964 context->invlpg = paging32_invlpg;
2965 context->update_pte = paging32_update_pte;
2550 context->root_level = PT32_ROOT_LEVEL; 2966 context->root_level = PT32_ROOT_LEVEL;
2551 context->shadow_root_level = PT32E_ROOT_LEVEL; 2967 context->shadow_root_level = PT32E_ROOT_LEVEL;
2552 context->root_hpa = INVALID_PAGE; 2968 context->root_hpa = INVALID_PAGE;
2969 context->direct_map = false;
2553 return 0; 2970 return 0;
2554} 2971}
2555 2972
2556static int paging32E_init_context(struct kvm_vcpu *vcpu) 2973static int paging32E_init_context(struct kvm_vcpu *vcpu,
2974 struct kvm_mmu *context)
2557{ 2975{
2558 reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL); 2976 return paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL);
2559 return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
2560} 2977}
2561 2978
2562static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) 2979static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
2563{ 2980{
2564 struct kvm_mmu *context = &vcpu->arch.mmu; 2981 struct kvm_mmu *context = vcpu->arch.walk_mmu;
2565 2982
2983 context->base_role.word = 0;
2566 context->new_cr3 = nonpaging_new_cr3; 2984 context->new_cr3 = nonpaging_new_cr3;
2567 context->page_fault = tdp_page_fault; 2985 context->page_fault = tdp_page_fault;
2568 context->free = nonpaging_free; 2986 context->free = nonpaging_free;
2569 context->prefetch_page = nonpaging_prefetch_page; 2987 context->prefetch_page = nonpaging_prefetch_page;
2570 context->sync_page = nonpaging_sync_page; 2988 context->sync_page = nonpaging_sync_page;
2571 context->invlpg = nonpaging_invlpg; 2989 context->invlpg = nonpaging_invlpg;
2990 context->update_pte = nonpaging_update_pte;
2572 context->shadow_root_level = kvm_x86_ops->get_tdp_level(); 2991 context->shadow_root_level = kvm_x86_ops->get_tdp_level();
2573 context->root_hpa = INVALID_PAGE; 2992 context->root_hpa = INVALID_PAGE;
2993 context->direct_map = true;
2994 context->set_cr3 = kvm_x86_ops->set_tdp_cr3;
2995 context->get_cr3 = get_cr3;
2996 context->inject_page_fault = kvm_inject_page_fault;
2997 context->nx = is_nx(vcpu);
2574 2998
2575 if (!is_paging(vcpu)) { 2999 if (!is_paging(vcpu)) {
3000 context->nx = false;
2576 context->gva_to_gpa = nonpaging_gva_to_gpa; 3001 context->gva_to_gpa = nonpaging_gva_to_gpa;
2577 context->root_level = 0; 3002 context->root_level = 0;
2578 } else if (is_long_mode(vcpu)) { 3003 } else if (is_long_mode(vcpu)) {
2579 reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL); 3004 context->nx = is_nx(vcpu);
3005 reset_rsvds_bits_mask(vcpu, context, PT64_ROOT_LEVEL);
2580 context->gva_to_gpa = paging64_gva_to_gpa; 3006 context->gva_to_gpa = paging64_gva_to_gpa;
2581 context->root_level = PT64_ROOT_LEVEL; 3007 context->root_level = PT64_ROOT_LEVEL;
2582 } else if (is_pae(vcpu)) { 3008 } else if (is_pae(vcpu)) {
2583 reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL); 3009 context->nx = is_nx(vcpu);
3010 reset_rsvds_bits_mask(vcpu, context, PT32E_ROOT_LEVEL);
2584 context->gva_to_gpa = paging64_gva_to_gpa; 3011 context->gva_to_gpa = paging64_gva_to_gpa;
2585 context->root_level = PT32E_ROOT_LEVEL; 3012 context->root_level = PT32E_ROOT_LEVEL;
2586 } else { 3013 } else {
2587 reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL); 3014 context->nx = false;
3015 reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL);
2588 context->gva_to_gpa = paging32_gva_to_gpa; 3016 context->gva_to_gpa = paging32_gva_to_gpa;
2589 context->root_level = PT32_ROOT_LEVEL; 3017 context->root_level = PT32_ROOT_LEVEL;
2590 } 3018 }
@@ -2592,33 +3020,81 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
2592 return 0; 3020 return 0;
2593} 3021}
2594 3022
2595static int init_kvm_softmmu(struct kvm_vcpu *vcpu) 3023int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
2596{ 3024{
2597 int r; 3025 int r;
2598
2599 ASSERT(vcpu); 3026 ASSERT(vcpu);
2600 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); 3027 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
2601 3028
2602 if (!is_paging(vcpu)) 3029 if (!is_paging(vcpu))
2603 r = nonpaging_init_context(vcpu); 3030 r = nonpaging_init_context(vcpu, context);
2604 else if (is_long_mode(vcpu)) 3031 else if (is_long_mode(vcpu))
2605 r = paging64_init_context(vcpu); 3032 r = paging64_init_context(vcpu, context);
2606 else if (is_pae(vcpu)) 3033 else if (is_pae(vcpu))
2607 r = paging32E_init_context(vcpu); 3034 r = paging32E_init_context(vcpu, context);
2608 else 3035 else
2609 r = paging32_init_context(vcpu); 3036 r = paging32_init_context(vcpu, context);
2610 3037
2611 vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu); 3038 vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
2612 vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu); 3039 vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu);
2613 3040
2614 return r; 3041 return r;
2615} 3042}
3043EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
2616 3044
2617static int init_kvm_mmu(struct kvm_vcpu *vcpu) 3045static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
2618{ 3046{
2619 vcpu->arch.update_pte.pfn = bad_pfn; 3047 int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu);
2620 3048
2621 if (tdp_enabled) 3049 vcpu->arch.walk_mmu->set_cr3 = kvm_x86_ops->set_cr3;
3050 vcpu->arch.walk_mmu->get_cr3 = get_cr3;
3051 vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
3052
3053 return r;
3054}
3055
3056static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
3057{
3058 struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
3059
3060 g_context->get_cr3 = get_cr3;
3061 g_context->inject_page_fault = kvm_inject_page_fault;
3062
3063 /*
3064 * Note that arch.mmu.gva_to_gpa translates l2_gva to l1_gpa. The
3065 * translation of l2_gpa to l1_gpa addresses is done using the
3066 * arch.nested_mmu.gva_to_gpa function. Basically the gva_to_gpa
3067 * functions between mmu and nested_mmu are swapped.
3068 */
3069 if (!is_paging(vcpu)) {
3070 g_context->nx = false;
3071 g_context->root_level = 0;
3072 g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested;
3073 } else if (is_long_mode(vcpu)) {
3074 g_context->nx = is_nx(vcpu);
3075 reset_rsvds_bits_mask(vcpu, g_context, PT64_ROOT_LEVEL);
3076 g_context->root_level = PT64_ROOT_LEVEL;
3077 g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
3078 } else if (is_pae(vcpu)) {
3079 g_context->nx = is_nx(vcpu);
3080 reset_rsvds_bits_mask(vcpu, g_context, PT32E_ROOT_LEVEL);
3081 g_context->root_level = PT32E_ROOT_LEVEL;
3082 g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
3083 } else {
3084 g_context->nx = false;
3085 reset_rsvds_bits_mask(vcpu, g_context, PT32_ROOT_LEVEL);
3086 g_context->root_level = PT32_ROOT_LEVEL;
3087 g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
3088 }
3089
3090 return 0;
3091}
3092
3093static int init_kvm_mmu(struct kvm_vcpu *vcpu)
3094{
3095 if (mmu_is_nested(vcpu))
3096 return init_kvm_nested_mmu(vcpu);
3097 else if (tdp_enabled)
2622 return init_kvm_tdp_mmu(vcpu); 3098 return init_kvm_tdp_mmu(vcpu);
2623 else 3099 else
2624 return init_kvm_softmmu(vcpu); 3100 return init_kvm_softmmu(vcpu);
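The nested-MMU comment above describes composing two translations: the ordinary walk turns an L2 gva into an L2 gpa, and each L2 gpa is then translated to an L1 gpa. A toy sketch of that composition with made-up mappings, purely to show how the two functions chain; none of these helpers exist in the kernel.

#include <stdint.h>
#include <stdio.h>

typedef uint64_t gva_t;
typedef uint64_t gpa_t;

/* L2 gpa -> L1 gpa: what the nested_mmu translate step provides */
static gpa_t translate_l2_gpa(gpa_t l2_gpa)
{
        return l2_gpa + 0x40000000ULL;   /* pretend L1 placed L2 memory at 1GB */
}

/* L2 gva -> L2 gpa: the guest's own page table walk, stubbed out */
static gpa_t walk_l2_tables(gva_t l2_gva)
{
        return l2_gva & ~0xfffULL;       /* pretend identity-mapped pages */
}

/* what mmu.gva_to_gpa has to deliver for a nested guest: L2 gva -> L1 gpa */
static gpa_t l2_gva_to_l1_gpa(gva_t l2_gva)
{
        gpa_t l2_gpa = walk_l2_tables(l2_gva);

        return translate_l2_gpa(l2_gpa) | (l2_gva & 0xfff);
}

int main(void)
{
        printf("%#llx\n", (unsigned long long)l2_gva_to_l1_gpa(0x1234));
        return 0;
}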
@@ -2653,7 +3129,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
2653 if (r) 3129 if (r)
2654 goto out; 3130 goto out;
2655 /* set_cr3() should ensure TLB has been flushed */ 3131 /* set_cr3() should ensure TLB has been flushed */
2656 kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa); 3132 vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
2657out: 3133out:
2658 return r; 3134 return r;
2659} 3135}
@@ -2663,6 +3139,7 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu)
2663{ 3139{
2664 mmu_free_roots(vcpu); 3140 mmu_free_roots(vcpu);
2665} 3141}
3142EXPORT_SYMBOL_GPL(kvm_mmu_unload);
2666 3143
2667static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, 3144static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
2668 struct kvm_mmu_page *sp, 3145 struct kvm_mmu_page *sp,
@@ -2686,8 +3163,7 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
2686} 3163}
2687 3164
2688static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, 3165static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
2689 struct kvm_mmu_page *sp, 3166 struct kvm_mmu_page *sp, u64 *spte,
2690 u64 *spte,
2691 const void *new) 3167 const void *new)
2692{ 3168{
2693 if (sp->role.level != PT_PAGE_TABLE_LEVEL) { 3169 if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
@@ -2695,14 +3171,8 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
2695 return; 3171 return;
2696 } 3172 }
2697 3173
2698 if (is_rsvd_bits_set(vcpu, *(u64 *)new, PT_PAGE_TABLE_LEVEL))
2699 return;
2700
2701 ++vcpu->kvm->stat.mmu_pte_updated; 3174 ++vcpu->kvm->stat.mmu_pte_updated;
2702 if (!sp->role.cr4_pae) 3175 vcpu->arch.mmu.update_pte(vcpu, sp, spte, new);
2703 paging32_update_pte(vcpu, sp, spte, new);
2704 else
2705 paging64_update_pte(vcpu, sp, spte, new);
2706} 3176}
2707 3177
2708static bool need_remote_flush(u64 old, u64 new) 3178static bool need_remote_flush(u64 old, u64 new)
@@ -2737,28 +3207,6 @@ static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
2737 return !!(spte && (*spte & shadow_accessed_mask)); 3207 return !!(spte && (*spte & shadow_accessed_mask));
2738} 3208}
2739 3209
2740static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2741 u64 gpte)
2742{
2743 gfn_t gfn;
2744 pfn_t pfn;
2745
2746 if (!is_present_gpte(gpte))
2747 return;
2748 gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
2749
2750 vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq;
2751 smp_rmb();
2752 pfn = gfn_to_pfn(vcpu->kvm, gfn);
2753
2754 if (is_error_pfn(pfn)) {
2755 kvm_release_pfn_clean(pfn);
2756 return;
2757 }
2758 vcpu->arch.update_pte.gfn = gfn;
2759 vcpu->arch.update_pte.pfn = pfn;
2760}
2761
2762static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn) 3210static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn)
2763{ 3211{
2764 u64 *spte = vcpu->arch.last_pte_updated; 3212 u64 *spte = vcpu->arch.last_pte_updated;
@@ -2780,21 +3228,13 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2780 struct kvm_mmu_page *sp; 3228 struct kvm_mmu_page *sp;
2781 struct hlist_node *node; 3229 struct hlist_node *node;
2782 LIST_HEAD(invalid_list); 3230 LIST_HEAD(invalid_list);
2783 u64 entry, gentry; 3231 u64 entry, gentry, *spte;
2784 u64 *spte; 3232 unsigned pte_size, page_offset, misaligned, quadrant, offset;
2785 unsigned offset = offset_in_page(gpa); 3233 int level, npte, invlpg_counter, r, flooded = 0;
2786 unsigned pte_size;
2787 unsigned page_offset;
2788 unsigned misaligned;
2789 unsigned quadrant;
2790 int level;
2791 int flooded = 0;
2792 int npte;
2793 int r;
2794 int invlpg_counter;
2795 bool remote_flush, local_flush, zap_page; 3234 bool remote_flush, local_flush, zap_page;
2796 3235
2797 zap_page = remote_flush = local_flush = false; 3236 zap_page = remote_flush = local_flush = false;
3237 offset = offset_in_page(gpa);
2798 3238
2799 pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); 3239 pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
2800 3240
@@ -2802,9 +3242,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2802 3242
2803 /* 3243 /*
2804 * Assume that the pte write on a page table of the same type 3244 * Assume that the pte write on a page table of the same type
2805 * as the current vcpu paging mode. This is nearly always true 3245 * as the current vcpu paging mode since we update the sptes only
2806 * (might be false while changing modes). Note it is verified later 3246 * when they have the same mode.
2807 * by update_pte().
2808 */ 3247 */
2809 if ((is_pae(vcpu) && bytes == 4) || !new) { 3248 if ((is_pae(vcpu) && bytes == 4) || !new) {
2810 /* Handle a 32-bit guest writing two halves of a 64-bit gpte */ 3249 /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
@@ -2830,15 +3269,14 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2830 break; 3269 break;
2831 } 3270 }
2832 3271
2833 mmu_guess_page_from_pte_write(vcpu, gpa, gentry);
2834 spin_lock(&vcpu->kvm->mmu_lock); 3272 spin_lock(&vcpu->kvm->mmu_lock);
2835 if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter) 3273 if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter)
2836 gentry = 0; 3274 gentry = 0;
2837 kvm_mmu_access_page(vcpu, gfn);
2838 kvm_mmu_free_some_pages(vcpu); 3275 kvm_mmu_free_some_pages(vcpu);
2839 ++vcpu->kvm->stat.mmu_pte_write; 3276 ++vcpu->kvm->stat.mmu_pte_write;
2840 kvm_mmu_audit(vcpu, "pre pte write"); 3277 trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
2841 if (guest_initiated) { 3278 if (guest_initiated) {
3279 kvm_mmu_access_page(vcpu, gfn);
2842 if (gfn == vcpu->arch.last_pt_write_gfn 3280 if (gfn == vcpu->arch.last_pt_write_gfn
2843 && !last_updated_pte_accessed(vcpu)) { 3281 && !last_updated_pte_accessed(vcpu)) {
2844 ++vcpu->arch.last_pt_write_count; 3282 ++vcpu->arch.last_pt_write_count;
@@ -2910,12 +3348,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2910 } 3348 }
2911 mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush); 3349 mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush);
2912 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); 3350 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
2913 kvm_mmu_audit(vcpu, "post pte write"); 3351 trace_kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
2914 spin_unlock(&vcpu->kvm->mmu_lock); 3352 spin_unlock(&vcpu->kvm->mmu_lock);
2915 if (!is_error_pfn(vcpu->arch.update_pte.pfn)) {
2916 kvm_release_pfn_clean(vcpu->arch.update_pte.pfn);
2917 vcpu->arch.update_pte.pfn = bad_pfn;
2918 }
2919} 3353}
2920 3354
2921int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) 3355int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
@@ -2923,7 +3357,7 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
2923 gpa_t gpa; 3357 gpa_t gpa;
2924 int r; 3358 int r;
2925 3359
2926 if (tdp_enabled) 3360 if (vcpu->arch.mmu.direct_map)
2927 return 0; 3361 return 0;
2928 3362
2929 gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); 3363 gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
@@ -2937,29 +3371,27 @@ EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
2937 3371
2938void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) 3372void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
2939{ 3373{
2940 int free_pages;
2941 LIST_HEAD(invalid_list); 3374 LIST_HEAD(invalid_list);
2942 3375
2943 free_pages = vcpu->kvm->arch.n_free_mmu_pages; 3376 while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES &&
2944 while (free_pages < KVM_REFILL_PAGES &&
2945 !list_empty(&vcpu->kvm->arch.active_mmu_pages)) { 3377 !list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
2946 struct kvm_mmu_page *sp; 3378 struct kvm_mmu_page *sp;
2947 3379
2948 sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev, 3380 sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
2949 struct kvm_mmu_page, link); 3381 struct kvm_mmu_page, link);
2950 free_pages += kvm_mmu_prepare_zap_page(vcpu->kvm, sp, 3382 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
2951 &invalid_list); 3383 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
2952 ++vcpu->kvm->stat.mmu_recycled; 3384 ++vcpu->kvm->stat.mmu_recycled;
2953 } 3385 }
2954 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
2955} 3386}
2956 3387
2957int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) 3388int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
3389 void *insn, int insn_len)
2958{ 3390{
2959 int r; 3391 int r;
2960 enum emulation_result er; 3392 enum emulation_result er;
2961 3393
2962 r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code); 3394 r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false);
2963 if (r < 0) 3395 if (r < 0)
2964 goto out; 3396 goto out;
2965 3397
@@ -2972,7 +3404,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
2972 if (r) 3404 if (r)
2973 goto out; 3405 goto out;
2974 3406
2975 er = emulate_instruction(vcpu, cr2, error_code, 0); 3407 er = x86_emulate_instruction(vcpu, cr2, 0, insn, insn_len);
2976 3408
2977 switch (er) { 3409 switch (er) {
2978 case EMULATE_DONE: 3410 case EMULATE_DONE:
@@ -3013,6 +3445,8 @@ EXPORT_SYMBOL_GPL(kvm_disable_tdp);
3013static void free_mmu_pages(struct kvm_vcpu *vcpu) 3445static void free_mmu_pages(struct kvm_vcpu *vcpu)
3014{ 3446{
3015 free_page((unsigned long)vcpu->arch.mmu.pae_root); 3447 free_page((unsigned long)vcpu->arch.mmu.pae_root);
3448 if (vcpu->arch.mmu.lm_root != NULL)
3449 free_page((unsigned long)vcpu->arch.mmu.lm_root);
3016} 3450}
3017 3451
3018static int alloc_mmu_pages(struct kvm_vcpu *vcpu) 3452static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
@@ -3054,15 +3488,6 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu)
3054 return init_kvm_mmu(vcpu); 3488 return init_kvm_mmu(vcpu);
3055} 3489}
3056 3490
3057void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
3058{
3059 ASSERT(vcpu);
3060
3061 destroy_kvm_mmu(vcpu);
3062 free_mmu_pages(vcpu);
3063 mmu_free_memory_caches(vcpu);
3064}
3065
3066void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) 3491void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
3067{ 3492{
3068 struct kvm_mmu_page *sp; 3493 struct kvm_mmu_page *sp;
@@ -3075,10 +3500,22 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
3075 continue; 3500 continue;
3076 3501
3077 pt = sp->spt; 3502 pt = sp->spt;
3078 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) 3503 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
3504 if (!is_shadow_present_pte(pt[i]) ||
3505 !is_last_spte(pt[i], sp->role.level))
3506 continue;
3507
3508 if (is_large_pte(pt[i])) {
3509 drop_spte(kvm, &pt[i],
3510 shadow_trap_nonpresent_pte);
3511 --kvm->stat.lpages;
3512 continue;
3513 }
3514
3079 /* avoid RMW */ 3515 /* avoid RMW */
3080 if (is_writable_pte(pt[i])) 3516 if (is_writable_pte(pt[i]))
3081 pt[i] &= ~PT_WRITABLE_MASK; 3517 update_spte(&pt[i], pt[i] & ~PT_WRITABLE_MASK);
3518 }
3082 } 3519 }
3083 kvm_flush_remote_tlbs(kvm); 3520 kvm_flush_remote_tlbs(kvm);
3084} 3521}
@@ -3108,27 +3545,27 @@ static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm,
3108 return kvm_mmu_prepare_zap_page(kvm, page, invalid_list); 3545 return kvm_mmu_prepare_zap_page(kvm, page, invalid_list);
3109} 3546}
3110 3547
3111static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) 3548static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
3112{ 3549{
3113 struct kvm *kvm; 3550 struct kvm *kvm;
3114 struct kvm *kvm_freed = NULL; 3551 struct kvm *kvm_freed = NULL;
3115 int cache_count = 0; 3552 int nr_to_scan = sc->nr_to_scan;
3553
3554 if (nr_to_scan == 0)
3555 goto out;
3116 3556
3117 spin_lock(&kvm_lock); 3557 raw_spin_lock(&kvm_lock);
3118 3558
3119 list_for_each_entry(kvm, &vm_list, vm_list) { 3559 list_for_each_entry(kvm, &vm_list, vm_list) {
3120 int npages, idx, freed_pages; 3560 int idx, freed_pages;
3121 LIST_HEAD(invalid_list); 3561 LIST_HEAD(invalid_list);
3122 3562
3123 idx = srcu_read_lock(&kvm->srcu); 3563 idx = srcu_read_lock(&kvm->srcu);
3124 spin_lock(&kvm->mmu_lock); 3564 spin_lock(&kvm->mmu_lock);
3125 npages = kvm->arch.n_alloc_mmu_pages - 3565 if (!kvm_freed && nr_to_scan > 0 &&
3126 kvm->arch.n_free_mmu_pages; 3566 kvm->arch.n_used_mmu_pages > 0) {
3127 cache_count += npages;
3128 if (!kvm_freed && nr_to_scan > 0 && npages > 0) {
3129 freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm, 3567 freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm,
3130 &invalid_list); 3568 &invalid_list);
3131 cache_count -= freed_pages;
3132 kvm_freed = kvm; 3569 kvm_freed = kvm;
3133 } 3570 }
3134 nr_to_scan--; 3571 nr_to_scan--;
@@ -3140,9 +3577,10 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
3140 if (kvm_freed) 3577 if (kvm_freed)
3141 list_move_tail(&kvm_freed->vm_list, &vm_list); 3578 list_move_tail(&kvm_freed->vm_list, &vm_list);
3142 3579
3143 spin_unlock(&kvm_lock); 3580 raw_spin_unlock(&kvm_lock);
3144 3581
3145 return cache_count; 3582out:
3583 return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
3146} 3584}
3147 3585
3148static struct shrinker mmu_shrinker = { 3586static struct shrinker mmu_shrinker = {
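
For orientation only (not part of the patch): the hunk above moves mmu_shrink() from the older three-argument shrinker callback to the shrink_control-based one. A minimal sketch of that callback shape, with hypothetical example_* helpers standing in for the cache-specific work, could look like this:

/* Sketch of a shrink_control-era callback; example_scan_objects() and
 * example_count_objects() are made-up stand-ins for real cache work. */
static int example_shrink(struct shrinker *shrink, struct shrink_control *sc)
{
	if (sc->nr_to_scan)
		example_scan_objects(sc->nr_to_scan, sc->gfp_mask);

	/* In either case, report how many reclaimable objects remain. */
	return example_count_objects();
}

static struct shrinker example_shrinker = {
	.shrink = example_shrink,
	.seeks  = DEFAULT_SEEKS,
};
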
@@ -3160,12 +3598,6 @@ static void mmu_destroy_caches(void)
3160 kmem_cache_destroy(mmu_page_header_cache); 3598 kmem_cache_destroy(mmu_page_header_cache);
3161} 3599}
3162 3600
3163void kvm_mmu_module_exit(void)
3164{
3165 mmu_destroy_caches();
3166 unregister_shrinker(&mmu_shrinker);
3167}
3168
3169int kvm_mmu_module_init(void) 3601int kvm_mmu_module_init(void)
3170{ 3602{
3171 pte_chain_cache = kmem_cache_create("kvm_pte_chain", 3603 pte_chain_cache = kmem_cache_create("kvm_pte_chain",
@@ -3185,6 +3617,9 @@ int kvm_mmu_module_init(void)
3185 if (!mmu_page_header_cache) 3617 if (!mmu_page_header_cache)
3186 goto nomem; 3618 goto nomem;
3187 3619
3620 if (percpu_counter_init(&kvm_total_used_mmu_pages, 0))
3621 goto nomem;
3622
3188 register_shrinker(&mmu_shrinker); 3623 register_shrinker(&mmu_shrinker);
3189 3624
3190 return 0; 3625 return 0;
@@ -3259,7 +3694,7 @@ static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu,
3259 3694
3260static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu) 3695static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu)
3261{ 3696{
3262 (void)kvm_set_cr3(vcpu, vcpu->arch.cr3); 3697 (void)kvm_set_cr3(vcpu, kvm_read_cr3(vcpu));
3263 return 1; 3698 return 1;
3264} 3699}
3265 3700
@@ -3355,271 +3790,25 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
3355} 3790}
3356EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy); 3791EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);
3357 3792
3358#ifdef AUDIT 3793void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
3359
3360static const char *audit_msg;
3361
3362static gva_t canonicalize(gva_t gva)
3363{
3364#ifdef CONFIG_X86_64
3365 gva = (long long)(gva << 16) >> 16;
3366#endif
3367 return gva;
3368}
3369
3370
3371typedef void (*inspect_spte_fn) (struct kvm *kvm, u64 *sptep);
3372
3373static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp,
3374 inspect_spte_fn fn)
3375{
3376 int i;
3377
3378 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
3379 u64 ent = sp->spt[i];
3380
3381 if (is_shadow_present_pte(ent)) {
3382 if (!is_last_spte(ent, sp->role.level)) {
3383 struct kvm_mmu_page *child;
3384 child = page_header(ent & PT64_BASE_ADDR_MASK);
3385 __mmu_spte_walk(kvm, child, fn);
3386 } else
3387 fn(kvm, &sp->spt[i]);
3388 }
3389 }
3390}
3391
3392static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
3393{
3394 int i;
3395 struct kvm_mmu_page *sp;
3396
3397 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
3398 return;
3399 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
3400 hpa_t root = vcpu->arch.mmu.root_hpa;
3401 sp = page_header(root);
3402 __mmu_spte_walk(vcpu->kvm, sp, fn);
3403 return;
3404 }
3405 for (i = 0; i < 4; ++i) {
3406 hpa_t root = vcpu->arch.mmu.pae_root[i];
3407
3408 if (root && VALID_PAGE(root)) {
3409 root &= PT64_BASE_ADDR_MASK;
3410 sp = page_header(root);
3411 __mmu_spte_walk(vcpu->kvm, sp, fn);
3412 }
3413 }
3414 return;
3415}
3416
3417static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
3418 gva_t va, int level)
3419{
3420 u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
3421 int i;
3422 gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));
3423
3424 for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
3425 u64 ent = pt[i];
3426
3427 if (ent == shadow_trap_nonpresent_pte)
3428 continue;
3429
3430 va = canonicalize(va);
3431 if (is_shadow_present_pte(ent) && !is_last_spte(ent, level))
3432 audit_mappings_page(vcpu, ent, va, level - 1);
3433 else {
3434 gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, va, NULL);
3435 gfn_t gfn = gpa >> PAGE_SHIFT;
3436 pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn);
3437 hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT;
3438
3439 if (is_error_pfn(pfn)) {
3440 kvm_release_pfn_clean(pfn);
3441 continue;
3442 }
3443
3444 if (is_shadow_present_pte(ent)
3445 && (ent & PT64_BASE_ADDR_MASK) != hpa)
3446 printk(KERN_ERR "xx audit error: (%s) levels %d"
3447 " gva %lx gpa %llx hpa %llx ent %llx %d\n",
3448 audit_msg, vcpu->arch.mmu.root_level,
3449 va, gpa, hpa, ent,
3450 is_shadow_present_pte(ent));
3451 else if (ent == shadow_notrap_nonpresent_pte
3452 && !is_error_hpa(hpa))
3453 printk(KERN_ERR "audit: (%s) notrap shadow,"
3454 " valid guest gva %lx\n", audit_msg, va);
3455 kvm_release_pfn_clean(pfn);
3456
3457 }
3458 }
3459}
3460
3461static void audit_mappings(struct kvm_vcpu *vcpu)
3462{
3463 unsigned i;
3464
3465 if (vcpu->arch.mmu.root_level == 4)
3466 audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4);
3467 else
3468 for (i = 0; i < 4; ++i)
3469 if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK)
3470 audit_mappings_page(vcpu,
3471 vcpu->arch.mmu.pae_root[i],
3472 i << 30,
3473 2);
3474}
3475
3476static int count_rmaps(struct kvm_vcpu *vcpu)
3477{
3478 struct kvm *kvm = vcpu->kvm;
3479 struct kvm_memslots *slots;
3480 int nmaps = 0;
3481 int i, j, k, idx;
3482
3483 idx = srcu_read_lock(&kvm->srcu);
3484 slots = kvm_memslots(kvm);
3485 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
3486 struct kvm_memory_slot *m = &slots->memslots[i];
3487 struct kvm_rmap_desc *d;
3488
3489 for (j = 0; j < m->npages; ++j) {
3490 unsigned long *rmapp = &m->rmap[j];
3491
3492 if (!*rmapp)
3493 continue;
3494 if (!(*rmapp & 1)) {
3495 ++nmaps;
3496 continue;
3497 }
3498 d = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
3499 while (d) {
3500 for (k = 0; k < RMAP_EXT; ++k)
3501 if (d->sptes[k])
3502 ++nmaps;
3503 else
3504 break;
3505 d = d->more;
3506 }
3507 }
3508 }
3509 srcu_read_unlock(&kvm->srcu, idx);
3510 return nmaps;
3511}
3512
3513void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
3514{
3515 unsigned long *rmapp;
3516 struct kvm_mmu_page *rev_sp;
3517 gfn_t gfn;
3518
3519 if (is_writable_pte(*sptep)) {
3520 rev_sp = page_header(__pa(sptep));
3521 gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt);
3522
3523 if (!gfn_to_memslot(kvm, gfn)) {
3524 if (!printk_ratelimit())
3525 return;
3526 printk(KERN_ERR "%s: no memslot for gfn %ld\n",
3527 audit_msg, gfn);
3528 printk(KERN_ERR "%s: index %ld of sp (gfn=%lx)\n",
3529 audit_msg, (long int)(sptep - rev_sp->spt),
3530 rev_sp->gfn);
3531 dump_stack();
3532 return;
3533 }
3534
3535 rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level);
3536 if (!*rmapp) {
3537 if (!printk_ratelimit())
3538 return;
3539 printk(KERN_ERR "%s: no rmap for writable spte %llx\n",
3540 audit_msg, *sptep);
3541 dump_stack();
3542 }
3543 }
3544
3545}
3546
3547void audit_writable_sptes_have_rmaps(struct kvm_vcpu *vcpu)
3548{
3549 mmu_spte_walk(vcpu, inspect_spte_has_rmap);
3550}
3551
3552static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu)
3553{ 3794{
3554 struct kvm_mmu_page *sp; 3795 ASSERT(vcpu);
3555 int i;
3556
3557 list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
3558 u64 *pt = sp->spt;
3559
3560 if (sp->role.level != PT_PAGE_TABLE_LEVEL)
3561 continue;
3562
3563 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
3564 u64 ent = pt[i];
3565
3566 if (!(ent & PT_PRESENT_MASK))
3567 continue;
3568 if (!is_writable_pte(ent))
3569 continue;
3570 inspect_spte_has_rmap(vcpu->kvm, &pt[i]);
3571 }
3572 }
3573 return;
3574}
3575 3796
3576static void audit_rmap(struct kvm_vcpu *vcpu) 3797 destroy_kvm_mmu(vcpu);
3577{ 3798 free_mmu_pages(vcpu);
3578 check_writable_mappings_rmap(vcpu); 3799 mmu_free_memory_caches(vcpu);
3579 count_rmaps(vcpu);
3580} 3800}
3581 3801
3582static void audit_write_protection(struct kvm_vcpu *vcpu) 3802#ifdef CONFIG_KVM_MMU_AUDIT
3583{ 3803#include "mmu_audit.c"
3584 struct kvm_mmu_page *sp; 3804#else
3585 struct kvm_memory_slot *slot; 3805static void mmu_audit_disable(void) { }
3586 unsigned long *rmapp; 3806#endif
3587 u64 *spte;
3588 gfn_t gfn;
3589
3590 list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
3591 if (sp->role.direct)
3592 continue;
3593 if (sp->unsync)
3594 continue;
3595
3596 slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
3597 rmapp = &slot->rmap[gfn - slot->base_gfn];
3598
3599 spte = rmap_next(vcpu->kvm, rmapp, NULL);
3600 while (spte) {
3601 if (is_writable_pte(*spte))
3602 printk(KERN_ERR "%s: (%s) shadow page has "
3603 "writable mappings: gfn %lx role %x\n",
3604 __func__, audit_msg, sp->gfn,
3605 sp->role.word);
3606 spte = rmap_next(vcpu->kvm, rmapp, spte);
3607 }
3608 }
3609}
3610 3807
3611static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) 3808void kvm_mmu_module_exit(void)
3612{ 3809{
3613 int olddbg = dbg; 3810 mmu_destroy_caches();
3614 3811 percpu_counter_destroy(&kvm_total_used_mmu_pages);
3615 dbg = 0; 3812 unregister_shrinker(&mmu_shrinker);
3616 audit_msg = msg; 3813 mmu_audit_disable();
3617 audit_rmap(vcpu);
3618 audit_write_protection(vcpu);
3619 if (strcmp("pre pte write", audit_msg) != 0)
3620 audit_mappings(vcpu);
3621 audit_writable_sptes_have_rmaps(vcpu);
3622 dbg = olddbg;
3623} 3814}
3624
3625#endif
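
Aside (illustrative, not text from the patch): the mmu.c changes above replace the per-VM free-page counter with used-page accounting plus a global percpu_counter, kvm_total_used_mmu_pages, which is what the shrinker now reports. A hedged sketch of how the per-VM and global figures would be kept in step; the helper name is hypothetical:

/* Hypothetical helper: whenever shadow pages are (de)allocated, the per-VM
 * count and the global percpu_counter move together by the same amount. */
static void example_mod_used_mmu_pages(struct kvm *kvm, int nr)
{
	kvm->arch.n_used_mmu_pages += nr;
	percpu_counter_add(&kvm_total_used_mmu_pages, nr);
}
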
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index be66759321a5..7086ca85d3e7 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -49,10 +49,17 @@
49#define PFERR_FETCH_MASK (1U << 4) 49#define PFERR_FETCH_MASK (1U << 4)
50 50
51int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]); 51int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]);
52int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
53
54static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
55{
56 return kvm->arch.n_max_mmu_pages -
57 kvm->arch.n_used_mmu_pages;
58}
52 59
53static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) 60static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
54{ 61{
55 if (unlikely(vcpu->kvm->arch.n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES)) 62 if (unlikely(kvm_mmu_available_pages(vcpu->kvm)< KVM_MIN_FREE_MMU_PAGES))
56 __kvm_mmu_free_some_pages(vcpu); 63 __kvm_mmu_free_some_pages(vcpu);
57} 64}
58 65
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
new file mode 100644
index 000000000000..5f6223b8bcf7
--- /dev/null
+++ b/arch/x86/kvm/mmu_audit.c
@@ -0,0 +1,304 @@
1/*
2 * mmu_audit.c:
3 *
4 * Audit code for KVM MMU
5 *
6 * Copyright (C) 2006 Qumranet, Inc.
7 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
8 *
9 * Authors:
10 * Yaniv Kamay <yaniv@qumranet.com>
11 * Avi Kivity <avi@qumranet.com>
12 * Marcelo Tosatti <mtosatti@redhat.com>
13 * Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
14 *
15 * This work is licensed under the terms of the GNU GPL, version 2. See
16 * the COPYING file in the top-level directory.
17 *
18 */
19
20#include <linux/ratelimit.h>
21
22#define audit_printk(kvm, fmt, args...) \
23 printk(KERN_ERR "audit: (%s) error: " \
24 fmt, audit_point_name[kvm->arch.audit_point], ##args)
25
26typedef void (*inspect_spte_fn) (struct kvm_vcpu *vcpu, u64 *sptep, int level);
27
28static void __mmu_spte_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
29 inspect_spte_fn fn, int level)
30{
31 int i;
32
33 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
34 u64 *ent = sp->spt;
35
36 fn(vcpu, ent + i, level);
37
38 if (is_shadow_present_pte(ent[i]) &&
39 !is_last_spte(ent[i], level)) {
40 struct kvm_mmu_page *child;
41
42 child = page_header(ent[i] & PT64_BASE_ADDR_MASK);
43 __mmu_spte_walk(vcpu, child, fn, level - 1);
44 }
45 }
46}
47
48static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
49{
50 int i;
51 struct kvm_mmu_page *sp;
52
53 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
54 return;
55
56 if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
57 hpa_t root = vcpu->arch.mmu.root_hpa;
58
59 sp = page_header(root);
60 __mmu_spte_walk(vcpu, sp, fn, PT64_ROOT_LEVEL);
61 return;
62 }
63
64 for (i = 0; i < 4; ++i) {
65 hpa_t root = vcpu->arch.mmu.pae_root[i];
66
67 if (root && VALID_PAGE(root)) {
68 root &= PT64_BASE_ADDR_MASK;
69 sp = page_header(root);
70 __mmu_spte_walk(vcpu, sp, fn, 2);
71 }
72 }
73
74 return;
75}
76
77typedef void (*sp_handler) (struct kvm *kvm, struct kvm_mmu_page *sp);
78
79static void walk_all_active_sps(struct kvm *kvm, sp_handler fn)
80{
81 struct kvm_mmu_page *sp;
82
83 list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link)
84 fn(kvm, sp);
85}
86
87static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
88{
89 struct kvm_mmu_page *sp;
90 gfn_t gfn;
91 pfn_t pfn;
92 hpa_t hpa;
93
94 sp = page_header(__pa(sptep));
95
96 if (sp->unsync) {
97 if (level != PT_PAGE_TABLE_LEVEL) {
98 audit_printk(vcpu->kvm, "unsync sp: %p "
99 "level = %d\n", sp, level);
100 return;
101 }
102
103 if (*sptep == shadow_notrap_nonpresent_pte) {
104 audit_printk(vcpu->kvm, "notrap spte in unsync "
105 "sp: %p\n", sp);
106 return;
107 }
108 }
109
110 if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) {
111 audit_printk(vcpu->kvm, "notrap spte in direct sp: %p\n",
112 sp);
113 return;
114 }
115
116 if (!is_shadow_present_pte(*sptep) || !is_last_spte(*sptep, level))
117 return;
118
119 gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
120 pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn);
121
122 if (is_error_pfn(pfn)) {
123 kvm_release_pfn_clean(pfn);
124 return;
125 }
126
127 hpa = pfn << PAGE_SHIFT;
128 if ((*sptep & PT64_BASE_ADDR_MASK) != hpa)
129 audit_printk(vcpu->kvm, "levels %d pfn %llx hpa %llx "
130 "ent %llxn", vcpu->arch.mmu.root_level, pfn,
131 hpa, *sptep);
132}
133
134static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
135{
136 unsigned long *rmapp;
137 struct kvm_mmu_page *rev_sp;
138 gfn_t gfn;
139
140
141 rev_sp = page_header(__pa(sptep));
142 gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt);
143
144 if (!gfn_to_memslot(kvm, gfn)) {
145 if (!printk_ratelimit())
146 return;
147 audit_printk(kvm, "no memslot for gfn %llx\n", gfn);
148 audit_printk(kvm, "index %ld of sp (gfn=%llx)\n",
149 (long int)(sptep - rev_sp->spt), rev_sp->gfn);
150 dump_stack();
151 return;
152 }
153
154 rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level);
155 if (!*rmapp) {
156 if (!printk_ratelimit())
157 return;
158 audit_printk(kvm, "no rmap for writable spte %llx\n",
159 *sptep);
160 dump_stack();
161 }
162}
163
164static void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu, u64 *sptep, int level)
165{
166 if (is_shadow_present_pte(*sptep) && is_last_spte(*sptep, level))
167 inspect_spte_has_rmap(vcpu->kvm, sptep);
168}
169
170static void audit_spte_after_sync(struct kvm_vcpu *vcpu, u64 *sptep, int level)
171{
172 struct kvm_mmu_page *sp = page_header(__pa(sptep));
173
174 if (vcpu->kvm->arch.audit_point == AUDIT_POST_SYNC && sp->unsync)
175 audit_printk(vcpu->kvm, "meet unsync sp(%p) after sync "
176 "root.\n", sp);
177}
178
179static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp)
180{
181 int i;
182
183 if (sp->role.level != PT_PAGE_TABLE_LEVEL)
184 return;
185
186 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
187 if (!is_rmap_spte(sp->spt[i]))
188 continue;
189
190 inspect_spte_has_rmap(kvm, sp->spt + i);
191 }
192}
193
194static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
195{
196 struct kvm_memory_slot *slot;
197 unsigned long *rmapp;
198 u64 *spte;
199
200 if (sp->role.direct || sp->unsync || sp->role.invalid)
201 return;
202
203 slot = gfn_to_memslot(kvm, sp->gfn);
204 rmapp = &slot->rmap[sp->gfn - slot->base_gfn];
205
206 spte = rmap_next(kvm, rmapp, NULL);
207 while (spte) {
208 if (is_writable_pte(*spte))
209 audit_printk(kvm, "shadow page has writable "
210 "mappings: gfn %llx role %x\n",
211 sp->gfn, sp->role.word);
212 spte = rmap_next(kvm, rmapp, spte);
213 }
214}
215
216static void audit_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
217{
218 check_mappings_rmap(kvm, sp);
219 audit_write_protection(kvm, sp);
220}
221
222static void audit_all_active_sps(struct kvm *kvm)
223{
224 walk_all_active_sps(kvm, audit_sp);
225}
226
227static void audit_spte(struct kvm_vcpu *vcpu, u64 *sptep, int level)
228{
229 audit_sptes_have_rmaps(vcpu, sptep, level);
230 audit_mappings(vcpu, sptep, level);
231 audit_spte_after_sync(vcpu, sptep, level);
232}
233
234static void audit_vcpu_spte(struct kvm_vcpu *vcpu)
235{
236 mmu_spte_walk(vcpu, audit_spte);
237}
238
239static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int point)
240{
241 static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
242
243 if (!__ratelimit(&ratelimit_state))
244 return;
245
246 vcpu->kvm->arch.audit_point = point;
247 audit_all_active_sps(vcpu->kvm);
248 audit_vcpu_spte(vcpu);
249}
250
251static bool mmu_audit;
252
253static void mmu_audit_enable(void)
254{
255 int ret;
256
257 if (mmu_audit)
258 return;
259
260 ret = register_trace_kvm_mmu_audit(kvm_mmu_audit, NULL);
261 WARN_ON(ret);
262
263 mmu_audit = true;
264}
265
266static void mmu_audit_disable(void)
267{
268 if (!mmu_audit)
269 return;
270
271 unregister_trace_kvm_mmu_audit(kvm_mmu_audit, NULL);
272 tracepoint_synchronize_unregister();
273 mmu_audit = false;
274}
275
276static int mmu_audit_set(const char *val, const struct kernel_param *kp)
277{
278 int ret;
279 unsigned long enable;
280
281 ret = strict_strtoul(val, 10, &enable);
282 if (ret < 0)
283 return -EINVAL;
284
285 switch (enable) {
286 case 0:
287 mmu_audit_disable();
288 break;
289 case 1:
290 mmu_audit_enable();
291 break;
292 default:
293 return -EINVAL;
294 }
295
296 return 0;
297}
298
299static struct kernel_param_ops audit_param_ops = {
300 .set = mmu_audit_set,
301 .get = param_get_bool,
302};
303
304module_param_cb(mmu_audit, &audit_param_ops, &mmu_audit, 0644);
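
A short usage note (assumptions flagged): with CONFIG_KVM_MMU_AUDIT enabled, auditing is toggled at runtime through the 'mmu_audit' module parameter, presumably exposed as /sys/module/kvm/parameters/mmu_audit once kvm.ko is loaded. On the consumer side the audit code is just a tracepoint probe, so the call sites in mmu.c stay cheap while auditing is off. A sketch of such a call site; the surrounding function is illustrative, the AUDIT_* points and trace_kvm_mmu_audit() are the ones introduced by this series:

static void example_write_path(struct kvm_vcpu *vcpu)
{
	trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
	/* ... update shadow ptes under mmu_lock ... */
	trace_kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
}
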
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 3aab0f0930ef..b60b4fdb3eda 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -195,6 +195,25 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page,
195 195
196 TP_ARGS(sp) 196 TP_ARGS(sp)
197); 197);
198
199TRACE_EVENT(
200 kvm_mmu_audit,
201 TP_PROTO(struct kvm_vcpu *vcpu, int audit_point),
202 TP_ARGS(vcpu, audit_point),
203
204 TP_STRUCT__entry(
205 __field(struct kvm_vcpu *, vcpu)
206 __field(int, audit_point)
207 ),
208
209 TP_fast_assign(
210 __entry->vcpu = vcpu;
211 __entry->audit_point = audit_point;
212 ),
213
214 TP_printk("vcpu:%d %s", __entry->vcpu->cpu,
215 audit_point_name[__entry->audit_point])
216);
198#endif /* _TRACE_KVMMMU_H */ 217#endif /* _TRACE_KVMMMU_H */
199 218
200#undef TRACE_INCLUDE_PATH 219#undef TRACE_INCLUDE_PATH
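
The TP_printk above resolves the audit point through audit_point_name[]; that table is defined outside this excerpt. Based on the AUDIT_* constants used elsewhere in the series it is presumably an enum-indexed string array along these lines (an assumed shape, not the patch's own definition):

/* Assumed shape only; the real definitions live outside this excerpt. */
enum {
	AUDIT_PRE_PAGE_FAULT,
	AUDIT_POST_PAGE_FAULT,
	AUDIT_PRE_PTE_WRITE,
	AUDIT_POST_PTE_WRITE,
	AUDIT_PRE_SYNC,
	AUDIT_POST_SYNC
};

static const char *audit_point_name[] = {
	"pre page fault",
	"post page fault",
	"pre pte write",
	"post pte write",
	"pre sync",
	"post sync"
};
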
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 51ef9097960d..9d03ad4dd5ec 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -7,7 +7,7 @@
7 * MMU support 7 * MMU support
8 * 8 *
9 * Copyright (C) 2006 Qumranet, Inc. 9 * Copyright (C) 2006 Qumranet, Inc.
10 * Copyright 2010 Red Hat, Inc. and/or its affilates. 10 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
11 * 11 *
12 * Authors: 12 * Authors:
13 * Yaniv Kamay <yaniv@qumranet.com> 13 * Yaniv Kamay <yaniv@qumranet.com>
@@ -31,7 +31,6 @@
31 #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl) 31 #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
32 #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) 32 #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
33 #define PT_INDEX(addr, level) PT64_INDEX(addr, level) 33 #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
34 #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
35 #define PT_LEVEL_BITS PT64_LEVEL_BITS 34 #define PT_LEVEL_BITS PT64_LEVEL_BITS
36 #ifdef CONFIG_X86_64 35 #ifdef CONFIG_X86_64
37 #define PT_MAX_FULL_LEVELS 4 36 #define PT_MAX_FULL_LEVELS 4
@@ -48,7 +47,6 @@
48 #define PT_LVL_ADDR_MASK(lvl) PT32_LVL_ADDR_MASK(lvl) 47 #define PT_LVL_ADDR_MASK(lvl) PT32_LVL_ADDR_MASK(lvl)
49 #define PT_LVL_OFFSET_MASK(lvl) PT32_LVL_OFFSET_MASK(lvl) 48 #define PT_LVL_OFFSET_MASK(lvl) PT32_LVL_OFFSET_MASK(lvl)
50 #define PT_INDEX(addr, level) PT32_INDEX(addr, level) 49 #define PT_INDEX(addr, level) PT32_INDEX(addr, level)
51 #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
52 #define PT_LEVEL_BITS PT32_LEVEL_BITS 50 #define PT_LEVEL_BITS PT32_LEVEL_BITS
53 #define PT_MAX_FULL_LEVELS 2 51 #define PT_MAX_FULL_LEVELS 2
54 #define CMPXCHG cmpxchg 52 #define CMPXCHG cmpxchg
@@ -67,11 +65,12 @@ struct guest_walker {
67 int level; 65 int level;
68 gfn_t table_gfn[PT_MAX_FULL_LEVELS]; 66 gfn_t table_gfn[PT_MAX_FULL_LEVELS];
69 pt_element_t ptes[PT_MAX_FULL_LEVELS]; 67 pt_element_t ptes[PT_MAX_FULL_LEVELS];
68 pt_element_t prefetch_ptes[PTE_PREFETCH_NUM];
70 gpa_t pte_gpa[PT_MAX_FULL_LEVELS]; 69 gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
71 unsigned pt_access; 70 unsigned pt_access;
72 unsigned pte_access; 71 unsigned pte_access;
73 gfn_t gfn; 72 gfn_t gfn;
74 u32 error_code; 73 struct x86_exception fault;
75}; 74};
76 75
77static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl) 76static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
@@ -79,15 +78,19 @@ static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
79 return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT; 78 return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT;
80} 79}
81 80
82static bool FNAME(cmpxchg_gpte)(struct kvm *kvm, 81static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
83 gfn_t table_gfn, unsigned index, 82 pt_element_t __user *ptep_user, unsigned index,
84 pt_element_t orig_pte, pt_element_t new_pte) 83 pt_element_t orig_pte, pt_element_t new_pte)
85{ 84{
85 int npages;
86 pt_element_t ret; 86 pt_element_t ret;
87 pt_element_t *table; 87 pt_element_t *table;
88 struct page *page; 88 struct page *page;
89 89
90 page = gfn_to_page(kvm, table_gfn); 90 npages = get_user_pages_fast((unsigned long)ptep_user, 1, 1, &page);
91 /* Check if the user is doing something meaningless. */
92 if (unlikely(npages != 1))
93 return -EFAULT;
91 94
92 table = kmap_atomic(page, KM_USER0); 95 table = kmap_atomic(page, KM_USER0);
93 ret = CMPXCHG(&table[index], orig_pte, new_pte); 96 ret = CMPXCHG(&table[index], orig_pte, new_pte);
@@ -104,7 +107,7 @@ static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte)
104 107
105 access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; 108 access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
106#if PTTYPE == 64 109#if PTTYPE == 64
107 if (is_nx(vcpu)) 110 if (vcpu->arch.mmu.nx)
108 access &= ~(gpte >> PT64_NX_SHIFT); 111 access &= ~(gpte >> PT64_NX_SHIFT);
109#endif 112#endif
110 return access; 113 return access;
@@ -113,26 +116,33 @@ static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte)
113/* 116/*
114 * Fetch a guest pte for a guest virtual address 117 * Fetch a guest pte for a guest virtual address
115 */ 118 */
116static int FNAME(walk_addr)(struct guest_walker *walker, 119static int FNAME(walk_addr_generic)(struct guest_walker *walker,
117 struct kvm_vcpu *vcpu, gva_t addr, 120 struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
118 int write_fault, int user_fault, int fetch_fault) 121 gva_t addr, u32 access)
119{ 122{
120 pt_element_t pte; 123 pt_element_t pte;
124 pt_element_t __user *uninitialized_var(ptep_user);
121 gfn_t table_gfn; 125 gfn_t table_gfn;
122 unsigned index, pt_access, uninitialized_var(pte_access); 126 unsigned index, pt_access, uninitialized_var(pte_access);
123 gpa_t pte_gpa; 127 gpa_t pte_gpa;
124 bool eperm, present, rsvd_fault; 128 bool eperm, present, rsvd_fault;
129 int offset, write_fault, user_fault, fetch_fault;
130
131 write_fault = access & PFERR_WRITE_MASK;
132 user_fault = access & PFERR_USER_MASK;
133 fetch_fault = access & PFERR_FETCH_MASK;
125 134
126 trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault, 135 trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault,
127 fetch_fault); 136 fetch_fault);
128walk: 137walk:
129 present = true; 138 present = true;
130 eperm = rsvd_fault = false; 139 eperm = rsvd_fault = false;
131 walker->level = vcpu->arch.mmu.root_level; 140 walker->level = mmu->root_level;
132 pte = vcpu->arch.cr3; 141 pte = mmu->get_cr3(vcpu);
142
133#if PTTYPE == 64 143#if PTTYPE == 64
134 if (!is_long_mode(vcpu)) { 144 if (walker->level == PT32E_ROOT_LEVEL) {
135 pte = kvm_pdptr_read(vcpu, (addr >> 30) & 3); 145 pte = kvm_pdptr_read_mmu(vcpu, mmu, (addr >> 30) & 3);
136 trace_kvm_mmu_paging_element(pte, walker->level); 146 trace_kvm_mmu_paging_element(pte, walker->level);
137 if (!is_present_gpte(pte)) { 147 if (!is_present_gpte(pte)) {
138 present = false; 148 present = false;
@@ -142,54 +152,80 @@ walk:
142 } 152 }
143#endif 153#endif
144 ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) || 154 ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
145 (vcpu->arch.cr3 & CR3_NONPAE_RESERVED_BITS) == 0); 155 (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0);
146 156
147 pt_access = ACC_ALL; 157 pt_access = ACC_ALL;
148 158
149 for (;;) { 159 for (;;) {
160 gfn_t real_gfn;
161 unsigned long host_addr;
162
150 index = PT_INDEX(addr, walker->level); 163 index = PT_INDEX(addr, walker->level);
151 164
152 table_gfn = gpte_to_gfn(pte); 165 table_gfn = gpte_to_gfn(pte);
153 pte_gpa = gfn_to_gpa(table_gfn); 166 offset = index * sizeof(pt_element_t);
154 pte_gpa += index * sizeof(pt_element_t); 167 pte_gpa = gfn_to_gpa(table_gfn) + offset;
155 walker->table_gfn[walker->level - 1] = table_gfn; 168 walker->table_gfn[walker->level - 1] = table_gfn;
156 walker->pte_gpa[walker->level - 1] = pte_gpa; 169 walker->pte_gpa[walker->level - 1] = pte_gpa;
157 170
158 if (kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte))) { 171 real_gfn = mmu->translate_gpa(vcpu, gfn_to_gpa(table_gfn),
172 PFERR_USER_MASK|PFERR_WRITE_MASK);
173 if (unlikely(real_gfn == UNMAPPED_GVA)) {
174 present = false;
175 break;
176 }
177 real_gfn = gpa_to_gfn(real_gfn);
178
179 host_addr = gfn_to_hva(vcpu->kvm, real_gfn);
180 if (unlikely(kvm_is_error_hva(host_addr))) {
181 present = false;
182 break;
183 }
184
185 ptep_user = (pt_element_t __user *)((void *)host_addr + offset);
186 if (unlikely(__copy_from_user(&pte, ptep_user, sizeof(pte)))) {
159 present = false; 187 present = false;
160 break; 188 break;
161 } 189 }
162 190
163 trace_kvm_mmu_paging_element(pte, walker->level); 191 trace_kvm_mmu_paging_element(pte, walker->level);
164 192
165 if (!is_present_gpte(pte)) { 193 if (unlikely(!is_present_gpte(pte))) {
166 present = false; 194 present = false;
167 break; 195 break;
168 } 196 }
169 197
170 if (is_rsvd_bits_set(vcpu, pte, walker->level)) { 198 if (unlikely(is_rsvd_bits_set(&vcpu->arch.mmu, pte,
199 walker->level))) {
171 rsvd_fault = true; 200 rsvd_fault = true;
172 break; 201 break;
173 } 202 }
174 203
175 if (write_fault && !is_writable_pte(pte)) 204 if (unlikely(write_fault && !is_writable_pte(pte)
176 if (user_fault || is_write_protection(vcpu)) 205 && (user_fault || is_write_protection(vcpu))))
177 eperm = true; 206 eperm = true;
178 207
179 if (user_fault && !(pte & PT_USER_MASK)) 208 if (unlikely(user_fault && !(pte & PT_USER_MASK)))
180 eperm = true; 209 eperm = true;
181 210
182#if PTTYPE == 64 211#if PTTYPE == 64
183 if (fetch_fault && (pte & PT64_NX_MASK)) 212 if (unlikely(fetch_fault && (pte & PT64_NX_MASK)))
184 eperm = true; 213 eperm = true;
185#endif 214#endif
186 215
187 if (!eperm && !rsvd_fault && !(pte & PT_ACCESSED_MASK)) { 216 if (!eperm && !rsvd_fault
217 && unlikely(!(pte & PT_ACCESSED_MASK))) {
218 int ret;
188 trace_kvm_mmu_set_accessed_bit(table_gfn, index, 219 trace_kvm_mmu_set_accessed_bit(table_gfn, index,
189 sizeof(pte)); 220 sizeof(pte));
190 if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, 221 ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index,
191 index, pte, pte|PT_ACCESSED_MASK)) 222 pte, pte|PT_ACCESSED_MASK);
223 if (unlikely(ret < 0)) {
224 present = false;
225 break;
226 } else if (ret)
192 goto walk; 227 goto walk;
228
193 mark_page_dirty(vcpu->kvm, table_gfn); 229 mark_page_dirty(vcpu->kvm, table_gfn);
194 pte |= PT_ACCESSED_MASK; 230 pte |= PT_ACCESSED_MASK;
195 } 231 }
@@ -204,17 +240,28 @@ walk:
204 (PTTYPE == 64 || is_pse(vcpu))) || 240 (PTTYPE == 64 || is_pse(vcpu))) ||
205 ((walker->level == PT_PDPE_LEVEL) && 241 ((walker->level == PT_PDPE_LEVEL) &&
206 is_large_pte(pte) && 242 is_large_pte(pte) &&
207 is_long_mode(vcpu))) { 243 mmu->root_level == PT64_ROOT_LEVEL)) {
208 int lvl = walker->level; 244 int lvl = walker->level;
245 gpa_t real_gpa;
246 gfn_t gfn;
247 u32 ac;
209 248
210 walker->gfn = gpte_to_gfn_lvl(pte, lvl); 249 gfn = gpte_to_gfn_lvl(pte, lvl);
211 walker->gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) 250 gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) >> PAGE_SHIFT;
212 >> PAGE_SHIFT;
213 251
214 if (PTTYPE == 32 && 252 if (PTTYPE == 32 &&
215 walker->level == PT_DIRECTORY_LEVEL && 253 walker->level == PT_DIRECTORY_LEVEL &&
216 is_cpuid_PSE36()) 254 is_cpuid_PSE36())
217 walker->gfn += pse36_gfn_delta(pte); 255 gfn += pse36_gfn_delta(pte);
256
257 ac = write_fault | fetch_fault | user_fault;
258
259 real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn),
260 ac);
261 if (real_gpa == UNMAPPED_GVA)
262 return 0;
263
264 walker->gfn = real_gpa >> PAGE_SHIFT;
218 265
219 break; 266 break;
220 } 267 }
@@ -223,17 +270,21 @@ walk:
223 --walker->level; 270 --walker->level;
224 } 271 }
225 272
226 if (!present || eperm || rsvd_fault) 273 if (unlikely(!present || eperm || rsvd_fault))
227 goto error; 274 goto error;
228 275
229 if (write_fault && !is_dirty_gpte(pte)) { 276 if (write_fault && unlikely(!is_dirty_gpte(pte))) {
230 bool ret; 277 int ret;
231 278
232 trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); 279 trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
233 ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte, 280 ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index,
234 pte|PT_DIRTY_MASK); 281 pte, pte|PT_DIRTY_MASK);
235 if (ret) 282 if (unlikely(ret < 0)) {
283 present = false;
284 goto error;
285 } else if (ret)
236 goto walk; 286 goto walk;
287
237 mark_page_dirty(vcpu->kvm, table_gfn); 288 mark_page_dirty(vcpu->kvm, table_gfn);
238 pte |= PT_DIRTY_MASK; 289 pte |= PT_DIRTY_MASK;
239 walker->ptes[walker->level - 1] = pte; 290 walker->ptes[walker->level - 1] = pte;
@@ -246,52 +297,87 @@ walk:
246 return 1; 297 return 1;
247 298
248error: 299error:
249 walker->error_code = 0; 300 walker->fault.vector = PF_VECTOR;
301 walker->fault.error_code_valid = true;
302 walker->fault.error_code = 0;
250 if (present) 303 if (present)
251 walker->error_code |= PFERR_PRESENT_MASK; 304 walker->fault.error_code |= PFERR_PRESENT_MASK;
252 if (write_fault) 305
253 walker->error_code |= PFERR_WRITE_MASK; 306 walker->fault.error_code |= write_fault | user_fault;
254 if (user_fault) 307
255 walker->error_code |= PFERR_USER_MASK; 308 if (fetch_fault && mmu->nx)
256 if (fetch_fault && is_nx(vcpu)) 309 walker->fault.error_code |= PFERR_FETCH_MASK;
257 walker->error_code |= PFERR_FETCH_MASK;
258 if (rsvd_fault) 310 if (rsvd_fault)
259 walker->error_code |= PFERR_RSVD_MASK; 311 walker->fault.error_code |= PFERR_RSVD_MASK;
260 trace_kvm_mmu_walker_error(walker->error_code); 312
313 walker->fault.address = addr;
314 walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu;
315
316 trace_kvm_mmu_walker_error(walker->fault.error_code);
261 return 0; 317 return 0;
262} 318}
263 319
320static int FNAME(walk_addr)(struct guest_walker *walker,
321 struct kvm_vcpu *vcpu, gva_t addr, u32 access)
322{
323 return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.mmu, addr,
324 access);
325}
326
327static int FNAME(walk_addr_nested)(struct guest_walker *walker,
328 struct kvm_vcpu *vcpu, gva_t addr,
329 u32 access)
330{
331 return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu,
332 addr, access);
333}
334
335static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
336 struct kvm_mmu_page *sp, u64 *spte,
337 pt_element_t gpte)
338{
339 u64 nonpresent = shadow_trap_nonpresent_pte;
340
341 if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
342 goto no_present;
343
344 if (!is_present_gpte(gpte)) {
345 if (!sp->unsync)
346 nonpresent = shadow_notrap_nonpresent_pte;
347 goto no_present;
348 }
349
350 if (!(gpte & PT_ACCESSED_MASK))
351 goto no_present;
352
353 return false;
354
355no_present:
356 drop_spte(vcpu->kvm, spte, nonpresent);
357 return true;
358}
359
264static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, 360static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
265 u64 *spte, const void *pte) 361 u64 *spte, const void *pte)
266{ 362{
267 pt_element_t gpte; 363 pt_element_t gpte;
268 unsigned pte_access; 364 unsigned pte_access;
269 pfn_t pfn; 365 pfn_t pfn;
270 u64 new_spte;
271 366
272 gpte = *(const pt_element_t *)pte; 367 gpte = *(const pt_element_t *)pte;
273 if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) { 368 if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
274 if (!is_present_gpte(gpte)) {
275 if (sp->unsync)
276 new_spte = shadow_trap_nonpresent_pte;
277 else
278 new_spte = shadow_notrap_nonpresent_pte;
279 __set_spte(spte, new_spte);
280 }
281 return; 369 return;
282 } 370
283 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); 371 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
284 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); 372 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
285 if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn) 373 pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte));
374 if (is_error_pfn(pfn)) {
375 kvm_release_pfn_clean(pfn);
286 return; 376 return;
287 pfn = vcpu->arch.update_pte.pfn; 377 }
288 if (is_error_pfn(pfn)) 378
289 return;
290 if (mmu_notifier_retry(vcpu, vcpu->arch.update_pte.mmu_seq))
291 return;
292 kvm_get_pfn(pfn);
293 /* 379 /*
294 * we call mmu_set_spte() with reset_host_protection = true beacuse that 380 * we call mmu_set_spte() with host_writable = true because that
295 * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1). 381 * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1).
296 */ 382 */
297 mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, 383 mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
@@ -302,21 +388,87 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
302static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu, 388static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu,
303 struct guest_walker *gw, int level) 389 struct guest_walker *gw, int level)
304{ 390{
305 int r;
306 pt_element_t curr_pte; 391 pt_element_t curr_pte;
307 392 gpa_t base_gpa, pte_gpa = gw->pte_gpa[level - 1];
308 r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 1], 393 u64 mask;
394 int r, index;
395
396 if (level == PT_PAGE_TABLE_LEVEL) {
397 mask = PTE_PREFETCH_NUM * sizeof(pt_element_t) - 1;
398 base_gpa = pte_gpa & ~mask;
399 index = (pte_gpa - base_gpa) / sizeof(pt_element_t);
400
401 r = kvm_read_guest_atomic(vcpu->kvm, base_gpa,
402 gw->prefetch_ptes, sizeof(gw->prefetch_ptes));
403 curr_pte = gw->prefetch_ptes[index];
404 } else
405 r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa,
309 &curr_pte, sizeof(curr_pte)); 406 &curr_pte, sizeof(curr_pte));
407
310 return r || curr_pte != gw->ptes[level - 1]; 408 return r || curr_pte != gw->ptes[level - 1];
311} 409}
312 410
411static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
412 u64 *sptep)
413{
414 struct kvm_mmu_page *sp;
415 pt_element_t *gptep = gw->prefetch_ptes;
416 u64 *spte;
417 int i;
418
419 sp = page_header(__pa(sptep));
420
421 if (sp->role.level > PT_PAGE_TABLE_LEVEL)
422 return;
423
424 if (sp->role.direct)
425 return __direct_pte_prefetch(vcpu, sp, sptep);
426
427 i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
428 spte = sp->spt + i;
429
430 for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
431 pt_element_t gpte;
432 unsigned pte_access;
433 gfn_t gfn;
434 pfn_t pfn;
435 bool dirty;
436
437 if (spte == sptep)
438 continue;
439
440 if (*spte != shadow_trap_nonpresent_pte)
441 continue;
442
443 gpte = gptep[i];
444
445 if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
446 continue;
447
448 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
449 gfn = gpte_to_gfn(gpte);
450 dirty = is_dirty_gpte(gpte);
451 pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
452 (pte_access & ACC_WRITE_MASK) && dirty);
453 if (is_error_pfn(pfn)) {
454 kvm_release_pfn_clean(pfn);
455 break;
456 }
457
458 mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
459 dirty, NULL, PT_PAGE_TABLE_LEVEL, gfn,
460 pfn, true, true);
461 }
462}
463
313/* 464/*
314 * Fetch a shadow pte for a specific level in the paging hierarchy. 465 * Fetch a shadow pte for a specific level in the paging hierarchy.
315 */ 466 */
316static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, 467static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
317 struct guest_walker *gw, 468 struct guest_walker *gw,
318 int user_fault, int write_fault, int hlevel, 469 int user_fault, int write_fault, int hlevel,
319 int *ptwrite, pfn_t pfn) 470 int *ptwrite, pfn_t pfn, bool map_writable,
471 bool prefault)
320{ 472{
321 unsigned access = gw->pt_access; 473 unsigned access = gw->pt_access;
322 struct kvm_mmu_page *sp = NULL; 474 struct kvm_mmu_page *sp = NULL;
@@ -390,7 +542,8 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
390 542
391 mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access, 543 mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access,
392 user_fault, write_fault, dirty, ptwrite, it.level, 544 user_fault, write_fault, dirty, ptwrite, it.level,
393 gw->gfn, pfn, false, true); 545 gw->gfn, pfn, prefault, map_writable);
546 FNAME(pte_prefetch)(vcpu, gw, it.sptep);
394 547
395 return it.sptep; 548 return it.sptep;
396 549
@@ -415,22 +568,22 @@ out_gpte_changed:
415 * Returns: 1 if we need to emulate the instruction, 0 otherwise, or 568 * Returns: 1 if we need to emulate the instruction, 0 otherwise, or
416 * a negative value on error. 569 * a negative value on error.
417 */ 570 */
418static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, 571static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
419 u32 error_code) 572 bool prefault)
420{ 573{
421 int write_fault = error_code & PFERR_WRITE_MASK; 574 int write_fault = error_code & PFERR_WRITE_MASK;
422 int user_fault = error_code & PFERR_USER_MASK; 575 int user_fault = error_code & PFERR_USER_MASK;
423 int fetch_fault = error_code & PFERR_FETCH_MASK;
424 struct guest_walker walker; 576 struct guest_walker walker;
425 u64 *sptep; 577 u64 *sptep;
426 int write_pt = 0; 578 int write_pt = 0;
427 int r; 579 int r;
428 pfn_t pfn; 580 pfn_t pfn;
429 int level = PT_PAGE_TABLE_LEVEL; 581 int level = PT_PAGE_TABLE_LEVEL;
582 int force_pt_level;
430 unsigned long mmu_seq; 583 unsigned long mmu_seq;
584 bool map_writable;
431 585
432 pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); 586 pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
433 kvm_mmu_audit(vcpu, "pre page fault");
434 587
435 r = mmu_topup_memory_caches(vcpu); 588 r = mmu_topup_memory_caches(vcpu);
436 if (r) 589 if (r)
@@ -439,27 +592,36 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
439 /* 592 /*
440 * Look up the guest pte for the faulting address. 593 * Look up the guest pte for the faulting address.
441 */ 594 */
442 r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault, 595 r = FNAME(walk_addr)(&walker, vcpu, addr, error_code);
443 fetch_fault);
444 596
445 /* 597 /*
446 * The page is not mapped by the guest. Let the guest handle it. 598 * The page is not mapped by the guest. Let the guest handle it.
447 */ 599 */
448 if (!r) { 600 if (!r) {
449 pgprintk("%s: guest page fault\n", __func__); 601 pgprintk("%s: guest page fault\n", __func__);
450 inject_page_fault(vcpu, addr, walker.error_code); 602 if (!prefault) {
451 vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ 603 inject_page_fault(vcpu, &walker.fault);
604 /* reset fork detector */
605 vcpu->arch.last_pt_write_count = 0;
606 }
452 return 0; 607 return 0;
453 } 608 }
454 609
455 if (walker.level >= PT_DIRECTORY_LEVEL) { 610 if (walker.level >= PT_DIRECTORY_LEVEL)
611 force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn);
612 else
613 force_pt_level = 1;
614 if (!force_pt_level) {
456 level = min(walker.level, mapping_level(vcpu, walker.gfn)); 615 level = min(walker.level, mapping_level(vcpu, walker.gfn));
457 walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); 616 walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1);
458 } 617 }
459 618
460 mmu_seq = vcpu->kvm->mmu_notifier_seq; 619 mmu_seq = vcpu->kvm->mmu_notifier_seq;
461 smp_rmb(); 620 smp_rmb();
462 pfn = gfn_to_pfn(vcpu->kvm, walker.gfn); 621
622 if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault,
623 &map_writable))
624 return 0;
463 625
464 /* mmio */ 626 /* mmio */
465 if (is_error_pfn(pfn)) 627 if (is_error_pfn(pfn))
@@ -468,9 +630,13 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
468 spin_lock(&vcpu->kvm->mmu_lock); 630 spin_lock(&vcpu->kvm->mmu_lock);
469 if (mmu_notifier_retry(vcpu, mmu_seq)) 631 if (mmu_notifier_retry(vcpu, mmu_seq))
470 goto out_unlock; 632 goto out_unlock;
633
634 trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
471 kvm_mmu_free_some_pages(vcpu); 635 kvm_mmu_free_some_pages(vcpu);
636 if (!force_pt_level)
637 transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
472 sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, 638 sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
473 level, &write_pt, pfn); 639 level, &write_pt, pfn, map_writable, prefault);
474 (void)sptep; 640 (void)sptep;
475 pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__, 641 pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__,
476 sptep, *sptep, write_pt); 642 sptep, *sptep, write_pt);
@@ -479,7 +645,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
479 vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ 645 vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
480 646
481 ++vcpu->stat.pf_fixed; 647 ++vcpu->stat.pf_fixed;
482 kvm_mmu_audit(vcpu, "post page fault (fixed)"); 648 trace_kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
483 spin_unlock(&vcpu->kvm->mmu_lock); 649 spin_unlock(&vcpu->kvm->mmu_lock);
484 650
485 return write_pt; 651 return write_pt;
@@ -550,22 +716,38 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
550} 716}
551 717
552static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, 718static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
553 u32 *error) 719 struct x86_exception *exception)
720{
721 struct guest_walker walker;
722 gpa_t gpa = UNMAPPED_GVA;
723 int r;
724
725 r = FNAME(walk_addr)(&walker, vcpu, vaddr, access);
726
727 if (r) {
728 gpa = gfn_to_gpa(walker.gfn);
729 gpa |= vaddr & ~PAGE_MASK;
730 } else if (exception)
731 *exception = walker.fault;
732
733 return gpa;
734}
735
736static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
737 u32 access,
738 struct x86_exception *exception)
554{ 739{
555 struct guest_walker walker; 740 struct guest_walker walker;
556 gpa_t gpa = UNMAPPED_GVA; 741 gpa_t gpa = UNMAPPED_GVA;
557 int r; 742 int r;
558 743
559 r = FNAME(walk_addr)(&walker, vcpu, vaddr, 744 r = FNAME(walk_addr_nested)(&walker, vcpu, vaddr, access);
560 !!(access & PFERR_WRITE_MASK),
561 !!(access & PFERR_USER_MASK),
562 !!(access & PFERR_FETCH_MASK));
563 745
564 if (r) { 746 if (r) {
565 gpa = gfn_to_gpa(walker.gfn); 747 gpa = gfn_to_gpa(walker.gfn);
566 gpa |= vaddr & ~PAGE_MASK; 748 gpa |= vaddr & ~PAGE_MASK;
567 } else if (error) 749 } else if (exception)
568 *error = walker.error_code; 750 *exception = walker.fault;
569 751
570 return gpa; 752 return gpa;
571} 753}
@@ -604,12 +786,19 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
604 * Using the cached information from sp->gfns is safe because: 786 * Using the cached information from sp->gfns is safe because:
605 * - The spte has a reference to the struct page, so the pfn for a given gfn 787 * - The spte has a reference to the struct page, so the pfn for a given gfn
606 * can't change unless all sptes pointing to it are nuked first. 788 * can't change unless all sptes pointing to it are nuked first.
789 *
790 * Note:
791 * We should flush all tlbs if spte is dropped even though guest is
792 * responsible for it. Since if we don't, kvm_mmu_notifier_invalidate_page
793 * and kvm_mmu_notifier_invalidate_range_start detect the mapping page isn't
794 * used by guest then tlbs are not flushed, so guest is allowed to access the
795 * freed pages.
796 * And we increase kvm->tlbs_dirty to delay tlbs flush in this case.
607 */ 797 */
608static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, 798static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
609 bool clear_unsync)
610{ 799{
611 int i, offset, nr_present; 800 int i, offset, nr_present;
612 bool reset_host_protection; 801 bool host_writable;
613 gpa_t first_pte_gpa; 802 gpa_t first_pte_gpa;
614 803
615 offset = nr_present = 0; 804 offset = nr_present = 0;
@@ -638,31 +827,27 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
638 return -EINVAL; 827 return -EINVAL;
639 828
640 gfn = gpte_to_gfn(gpte); 829 gfn = gpte_to_gfn(gpte);
641 if (is_rsvd_bits_set(vcpu, gpte, PT_PAGE_TABLE_LEVEL)
642 || gfn != sp->gfns[i] || !is_present_gpte(gpte)
643 || !(gpte & PT_ACCESSED_MASK)) {
644 u64 nonpresent;
645 830
646 if (is_present_gpte(gpte) || !clear_unsync) 831 if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
647 nonpresent = shadow_trap_nonpresent_pte; 832 vcpu->kvm->tlbs_dirty++;
648 else 833 continue;
649 nonpresent = shadow_notrap_nonpresent_pte; 834 }
650 drop_spte(vcpu->kvm, &sp->spt[i], nonpresent); 835
836 if (gfn != sp->gfns[i]) {
837 drop_spte(vcpu->kvm, &sp->spt[i],
838 shadow_trap_nonpresent_pte);
839 vcpu->kvm->tlbs_dirty++;
651 continue; 840 continue;
652 } 841 }
653 842
654 nr_present++; 843 nr_present++;
655 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); 844 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
656 if (!(sp->spt[i] & SPTE_HOST_WRITEABLE)) { 845 host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE;
657 pte_access &= ~ACC_WRITE_MASK; 846
658 reset_host_protection = 0;
659 } else {
660 reset_host_protection = 1;
661 }
662 set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, 847 set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
663 is_dirty_gpte(gpte), PT_PAGE_TABLE_LEVEL, gfn, 848 is_dirty_gpte(gpte), PT_PAGE_TABLE_LEVEL, gfn,
664 spte_to_pfn(sp->spt[i]), true, false, 849 spte_to_pfn(sp->spt[i]), true, false,
665 reset_host_protection); 850 host_writable);
666 } 851 }
667 852
668 return !nr_present; 853 return !nr_present;
@@ -673,7 +858,6 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
673#undef FNAME 858#undef FNAME
674#undef PT_BASE_ADDR_MASK 859#undef PT_BASE_ADDR_MASK
675#undef PT_INDEX 860#undef PT_INDEX
676#undef PT_LEVEL_MASK
677#undef PT_LVL_ADDR_MASK 861#undef PT_LVL_ADDR_MASK
678#undef PT_LVL_OFFSET_MASK 862#undef PT_LVL_OFFSET_MASK
679#undef PT_LEVEL_BITS 863#undef PT_LEVEL_BITS
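
One detail worth spelling out (a sketch using the standard x86 page-fault error-code encoding): walk_addr_generic() now takes the access as a PFERR_* mask and can OR the write/user components straight into walker->fault.error_code, because those masks mirror the hardware #PF error-code bits. PFERR_FETCH_MASK is visible in the mmu.h hunk above; the rest follow the architectural layout, shown here with EXAMPLE_ names to mark them as illustration:

#define EXAMPLE_PF_PRESENT	(1U << 0)	/* page was present */
#define EXAMPLE_PF_WRITE	(1U << 1)	/* write access */
#define EXAMPLE_PF_USER		(1U << 2)	/* user-mode access */
#define EXAMPLE_PF_RSVD		(1U << 3)	/* reserved bit set */
#define EXAMPLE_PF_FETCH	(1U << 4)	/* instruction fetch */
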
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 8a3f9f64f86f..506e4fe23adc 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -4,7 +4,7 @@
4 * AMD SVM support 4 * AMD SVM support
5 * 5 *
6 * Copyright (C) 2006 Qumranet, Inc. 6 * Copyright (C) 2006 Qumranet, Inc.
7 * Copyright 2010 Red Hat, Inc. and/or its affilates. 7 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
8 * 8 *
9 * Authors: 9 * Authors:
10 * Yaniv Kamay <yaniv@qumranet.com> 10 * Yaniv Kamay <yaniv@qumranet.com>
@@ -31,6 +31,7 @@
31 31
32#include <asm/tlbflush.h> 32#include <asm/tlbflush.h>
33#include <asm/desc.h> 33#include <asm/desc.h>
34#include <asm/kvm_para.h>
34 35
35#include <asm/virtext.h> 36#include <asm/virtext.h>
36#include "trace.h" 37#include "trace.h"
@@ -50,6 +51,10 @@ MODULE_LICENSE("GPL");
50#define SVM_FEATURE_LBRV (1 << 1) 51#define SVM_FEATURE_LBRV (1 << 1)
51#define SVM_FEATURE_SVML (1 << 2) 52#define SVM_FEATURE_SVML (1 << 2)
52#define SVM_FEATURE_NRIP (1 << 3) 53#define SVM_FEATURE_NRIP (1 << 3)
54#define SVM_FEATURE_TSC_RATE (1 << 4)
55#define SVM_FEATURE_VMCB_CLEAN (1 << 5)
56#define SVM_FEATURE_FLUSH_ASID (1 << 6)
57#define SVM_FEATURE_DECODE_ASSIST (1 << 7)
53#define SVM_FEATURE_PAUSE_FILTER (1 << 10) 58#define SVM_FEATURE_PAUSE_FILTER (1 << 10)
54 59
55#define NESTED_EXIT_HOST 0 /* Exit handled on host level */ 60#define NESTED_EXIT_HOST 0 /* Exit handled on host level */
@@ -58,6 +63,10 @@ MODULE_LICENSE("GPL");
58 63
59#define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) 64#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
60 65
66#define TSC_RATIO_RSVD 0xffffff0000000000ULL
67#define TSC_RATIO_MIN 0x0000000000000001ULL
68#define TSC_RATIO_MAX 0x000000ffffffffffULL
69
61static bool erratum_383_found __read_mostly; 70static bool erratum_383_found __read_mostly;
62 71
63static const u32 host_save_user_msrs[] = { 72static const u32 host_save_user_msrs[] = {
@@ -89,13 +98,13 @@ struct nested_state {
89 bool exit_required; 98 bool exit_required;
90 99
91 /* cache for intercepts of the guest */ 100 /* cache for intercepts of the guest */
92 u16 intercept_cr_read; 101 u32 intercept_cr;
93 u16 intercept_cr_write; 102 u32 intercept_dr;
94 u16 intercept_dr_read;
95 u16 intercept_dr_write;
96 u32 intercept_exceptions; 103 u32 intercept_exceptions;
97 u64 intercept; 104 u64 intercept;
98 105
106 /* Nested Paging related state */
107 u64 nested_cr3;
99}; 108};
100 109
101#define MSRPM_OFFSETS 16 110#define MSRPM_OFFSETS 16
@@ -113,18 +122,31 @@ struct vcpu_svm {
113 u64 next_rip; 122 u64 next_rip;
114 123
115 u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS]; 124 u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
116 u64 host_gs_base; 125 struct {
126 u16 fs;
127 u16 gs;
128 u16 ldt;
129 u64 gs_base;
130 } host;
117 131
118 u32 *msrpm; 132 u32 *msrpm;
119 133
134 ulong nmi_iret_rip;
135
120 struct nested_state nested; 136 struct nested_state nested;
121 137
122 bool nmi_singlestep; 138 bool nmi_singlestep;
123 139
124 unsigned int3_injected; 140 unsigned int3_injected;
125 unsigned long int3_rip; 141 unsigned long int3_rip;
142 u32 apf_reason;
143
144 u64 tsc_ratio;
126}; 145};
127 146
147static DEFINE_PER_CPU(u64, current_tsc_ratio);
148#define TSC_RATIO_DEFAULT 0x0100000000ULL
149
128#define MSR_INVALID 0xffffffffU 150#define MSR_INVALID 0xffffffffU
129 151
130static struct svm_direct_access_msrs { 152static struct svm_direct_access_msrs {
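
Aside on the new TSC fields (an interpretation, not text from the patch): TSC_RATIO_DEFAULT of 0x0100000000ULL reads as 1.0 if the AMD TSC ratio is an 8.32 fixed-point multiplier, which also matches TSC_RATIO_MAX covering 40 bits. A scaling helper under that assumption could look like the sketch below; it is not the patch's __scale_tsc() implementation, and it leans on a 128-bit intermediate so the multiply cannot overflow:

static inline u64 example_scale_tsc(u64 ratio, u64 tsc)
{
	/* guest_tsc = host_tsc * ratio, with ratio in 8.32 fixed point */
	return (u64)(((unsigned __int128)tsc * ratio) >> 32);
}
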
@@ -169,15 +191,153 @@ static int nested_svm_intercept(struct vcpu_svm *svm);
169static int nested_svm_vmexit(struct vcpu_svm *svm); 191static int nested_svm_vmexit(struct vcpu_svm *svm);
170static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, 192static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
171 bool has_error_code, u32 error_code); 193 bool has_error_code, u32 error_code);
194static u64 __scale_tsc(u64 ratio, u64 tsc);
195
196enum {
197 VMCB_INTERCEPTS, /* Intercept vectors, TSC offset,
198 pause filter count */
199 VMCB_PERM_MAP, /* IOPM Base and MSRPM Base */
200 VMCB_ASID, /* ASID */
201 VMCB_INTR, /* int_ctl, int_vector */
202 VMCB_NPT, /* npt_en, nCR3, gPAT */
203 VMCB_CR, /* CR0, CR3, CR4, EFER */
204 VMCB_DR, /* DR6, DR7 */
205 VMCB_DT, /* GDT, IDT */
206 VMCB_SEG, /* CS, DS, SS, ES, CPL */
207 VMCB_CR2, /* CR2 only */
208 VMCB_LBR, /* DBGCTL, BR_FROM, BR_TO, LAST_EX_FROM, LAST_EX_TO */
209 VMCB_DIRTY_MAX,
210};
211
212/* TPR and CR2 are always written before VMRUN */
213#define VMCB_ALWAYS_DIRTY_MASK ((1U << VMCB_INTR) | (1U << VMCB_CR2))
214
215static inline void mark_all_dirty(struct vmcb *vmcb)
216{
217 vmcb->control.clean = 0;
218}
219
220static inline void mark_all_clean(struct vmcb *vmcb)
221{
222 vmcb->control.clean = ((1 << VMCB_DIRTY_MAX) - 1)
223 & ~VMCB_ALWAYS_DIRTY_MASK;
224}
225
226static inline void mark_dirty(struct vmcb *vmcb, int bit)
227{
228 vmcb->control.clean &= ~(1 << bit);
229}
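The clean-bits mask lets VMCB_CLEAN-capable hardware skip reloading guest state that the hypervisor has not touched since the last VMRUN: every path that edits a state group calls mark_dirty(), mark_all_clean() runs after a successful VMRUN, and mark_all_dirty() is used whenever the cached copy can no longer be trusted (vcpu migration, nested transitions). A minimal standalone sketch of the same bookkeeping, with toy names (toy_vmcb etc. are illustrative, not the kernel structures):

	#include <stdint.h>
	#include <stdio.h>

	/* Illustrative mirror of the clean-bits bookkeeping. */
	#define DIRTY_MAX        11                      /* mirrors VMCB_DIRTY_MAX */
	#define ALWAYS_DIRTY     ((1u << 3) | (1u << 9)) /* INTR and CR2 groups */

	struct toy_vmcb { uint32_t clean; };

	static void toy_mark_dirty(struct toy_vmcb *v, int bit) { v->clean &= ~(1u << bit); }
	static void toy_mark_all_clean(struct toy_vmcb *v)
	{
		v->clean = ((1u << DIRTY_MAX) - 1) & ~ALWAYS_DIRTY;
	}

	int main(void)
	{
		struct toy_vmcb v = { 0 };     /* everything dirty before the first VMRUN */

		toy_mark_all_clean(&v);        /* after VMRUN: 0x7ff & ~0x208 = 0x5f7 */
		toy_mark_dirty(&v, 5);         /* e.g. a CR write invalidates the CR group */
		printf("clean = 0x%03x\n", v.clean);   /* prints 0x5d7 */
		return 0;
	}

With the enum above, VMCB_INTR is bit 3 and VMCB_CR2 is bit 9, so mark_all_clean() yields 0x7ff & ~0x208 = 0x5f7; those two groups are rewritten before every VMRUN and therefore never marked clean.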
172 230
173static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) 231static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
174{ 232{
175 return container_of(vcpu, struct vcpu_svm, vcpu); 233 return container_of(vcpu, struct vcpu_svm, vcpu);
176} 234}
177 235
178static inline bool is_nested(struct vcpu_svm *svm) 236static void recalc_intercepts(struct vcpu_svm *svm)
237{
238 struct vmcb_control_area *c, *h;
239 struct nested_state *g;
240
241 mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
242
243 if (!is_guest_mode(&svm->vcpu))
244 return;
245
246 c = &svm->vmcb->control;
247 h = &svm->nested.hsave->control;
248 g = &svm->nested;
249
250 c->intercept_cr = h->intercept_cr | g->intercept_cr;
251 c->intercept_dr = h->intercept_dr | g->intercept_dr;
252 c->intercept_exceptions = h->intercept_exceptions | g->intercept_exceptions;
253 c->intercept = h->intercept | g->intercept;
254}
255
256static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm)
257{
258 if (is_guest_mode(&svm->vcpu))
259 return svm->nested.hsave;
260 else
261 return svm->vmcb;
262}
263
264static inline void set_cr_intercept(struct vcpu_svm *svm, int bit)
265{
266 struct vmcb *vmcb = get_host_vmcb(svm);
267
268 vmcb->control.intercept_cr |= (1U << bit);
269
270 recalc_intercepts(svm);
271}
272
273static inline void clr_cr_intercept(struct vcpu_svm *svm, int bit)
274{
275 struct vmcb *vmcb = get_host_vmcb(svm);
276
277 vmcb->control.intercept_cr &= ~(1U << bit);
278
279 recalc_intercepts(svm);
280}
281
282static inline bool is_cr_intercept(struct vcpu_svm *svm, int bit)
179{ 283{
180 return svm->nested.vmcb; 284 struct vmcb *vmcb = get_host_vmcb(svm);
285
286 return vmcb->control.intercept_cr & (1U << bit);
287}
288
289static inline void set_dr_intercept(struct vcpu_svm *svm, int bit)
290{
291 struct vmcb *vmcb = get_host_vmcb(svm);
292
293 vmcb->control.intercept_dr |= (1U << bit);
294
295 recalc_intercepts(svm);
296}
297
298static inline void clr_dr_intercept(struct vcpu_svm *svm, int bit)
299{
300 struct vmcb *vmcb = get_host_vmcb(svm);
301
302 vmcb->control.intercept_dr &= ~(1U << bit);
303
304 recalc_intercepts(svm);
305}
306
307static inline void set_exception_intercept(struct vcpu_svm *svm, int bit)
308{
309 struct vmcb *vmcb = get_host_vmcb(svm);
310
311 vmcb->control.intercept_exceptions |= (1U << bit);
312
313 recalc_intercepts(svm);
314}
315
316static inline void clr_exception_intercept(struct vcpu_svm *svm, int bit)
317{
318 struct vmcb *vmcb = get_host_vmcb(svm);
319
320 vmcb->control.intercept_exceptions &= ~(1U << bit);
321
322 recalc_intercepts(svm);
323}
324
325static inline void set_intercept(struct vcpu_svm *svm, int bit)
326{
327 struct vmcb *vmcb = get_host_vmcb(svm);
328
329 vmcb->control.intercept |= (1ULL << bit);
330
331 recalc_intercepts(svm);
332}
333
334static inline void clr_intercept(struct vcpu_svm *svm, int bit)
335{
336 struct vmcb *vmcb = get_host_vmcb(svm);
337
338 vmcb->control.intercept &= ~(1ULL << bit);
339
340 recalc_intercepts(svm);
181} 341}
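With nesting, the VMCB that actually runs must intercept everything host KVM needs plus everything the L1 hypervisor asked for, so the set_/clr_ helpers always edit the host view returned by get_host_vmcb() and recalc_intercepts() rebuilds the executed masks as host OR guest. A tiny standalone illustration of that merge rule (the mask values are made up for the example):

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		/* Hypothetical CR-intercept masks, one bit per CR access event. */
		uint32_t host_intercepts  = (1u << 0) | (1u << 3);  /* host traps CR0 and CR3 reads */
		uint32_t guest_intercepts = (1u << 8);              /* L1 additionally traps CR8 reads */

		/* The executed VMCB carries the union, as recalc_intercepts()
		 * computes it; clearing a guest bit can never drop a host bit. */
		uint32_t merged = host_intercepts | guest_intercepts;

		printf("merged = 0x%03x\n", merged);   /* 0x109 */
		return 0;
	}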
182 342
183static inline void enable_gif(struct vcpu_svm *svm) 343static inline void enable_gif(struct vcpu_svm *svm)
@@ -218,7 +378,6 @@ struct svm_cpu_data {
218}; 378};
219 379
220static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data); 380static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
221static uint32_t svm_features;
222 381
223struct svm_init_data { 382struct svm_init_data {
224 int cpu; 383 int cpu;
@@ -254,11 +413,6 @@ static u32 svm_msrpm_offset(u32 msr)
254 413
255#define MAX_INST_SIZE 15 414#define MAX_INST_SIZE 15
256 415
257static inline u32 svm_has(u32 feat)
258{
259 return svm_features & feat;
260}
261
262static inline void clgi(void) 416static inline void clgi(void)
263{ 417{
264 asm volatile (__ex(SVM_CLGI)); 418 asm volatile (__ex(SVM_CLGI));
@@ -274,14 +428,13 @@ static inline void invlpga(unsigned long addr, u32 asid)
274 asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid)); 428 asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid));
275} 429}
276 430
277static inline void force_new_asid(struct kvm_vcpu *vcpu) 431static int get_npt_level(void)
278{
279 to_svm(vcpu)->asid_generation--;
280}
281
282static inline void flush_guest_tlb(struct kvm_vcpu *vcpu)
283{ 432{
284 force_new_asid(vcpu); 433#ifdef CONFIG_X86_64
434 return PT64_ROOT_LEVEL;
435#else
436 return PT32E_ROOT_LEVEL;
437#endif
285} 438}
286 439
287static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) 440static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
@@ -291,6 +444,7 @@ static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
291 efer &= ~EFER_LME; 444 efer &= ~EFER_LME;
292 445
293 to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME; 446 to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME;
447 mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
294} 448}
295 449
296static int is_external_interrupt(u32 info) 450static int is_external_interrupt(u32 info)
@@ -328,7 +482,7 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
328 svm->next_rip = svm->vmcb->control.next_rip; 482 svm->next_rip = svm->vmcb->control.next_rip;
329 483
330 if (!svm->next_rip) { 484 if (!svm->next_rip) {
331 if (emulate_instruction(vcpu, 0, 0, EMULTYPE_SKIP) != 485 if (emulate_instruction(vcpu, EMULTYPE_SKIP) !=
332 EMULATE_DONE) 486 EMULATE_DONE)
333 printk(KERN_DEBUG "%s: NOP\n", __func__); 487 printk(KERN_DEBUG "%s: NOP\n", __func__);
334 return; 488 return;
@@ -355,7 +509,7 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
355 nested_svm_check_exception(svm, nr, has_error_code, error_code)) 509 nested_svm_check_exception(svm, nr, has_error_code, error_code))
356 return; 510 return;
357 511
358 if (nr == BP_VECTOR && !svm_has(SVM_FEATURE_NRIP)) { 512 if (nr == BP_VECTOR && !static_cpu_has(X86_FEATURE_NRIPS)) {
359 unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu); 513 unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu);
360 514
361 /* 515 /*
@@ -416,6 +570,10 @@ static int has_svm(void)
416 570
417static void svm_hardware_disable(void *garbage) 571static void svm_hardware_disable(void *garbage)
418{ 572{
573 /* Make sure we clean up behind us */
574 if (static_cpu_has(X86_FEATURE_TSCRATEMSR))
575 wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);
576
419 cpu_svm_disable(); 577 cpu_svm_disable();
420} 578}
421 579
@@ -457,6 +615,11 @@ static int svm_hardware_enable(void *garbage)
457 615
458 wrmsrl(MSR_VM_HSAVE_PA, page_to_pfn(sd->save_area) << PAGE_SHIFT); 616 wrmsrl(MSR_VM_HSAVE_PA, page_to_pfn(sd->save_area) << PAGE_SHIFT);
459 617
618 if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
619 wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);
620 __get_cpu_var(current_tsc_ratio) = TSC_RATIO_DEFAULT;
621 }
622
460 svm_init_erratum_383(); 623 svm_init_erratum_383();
461 624
462 return 0; 625 return 0;
@@ -638,6 +801,23 @@ static __init int svm_hardware_setup(void)
638 if (boot_cpu_has(X86_FEATURE_FXSR_OPT)) 801 if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
639 kvm_enable_efer_bits(EFER_FFXSR); 802 kvm_enable_efer_bits(EFER_FFXSR);
640 803
804 if (boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
805 u64 max;
806
807 kvm_has_tsc_control = true;
808
809 /*
810 * Make sure the user can only configure tsc_khz values that
811 * fit into a signed integer.
 812 * A min value is not calculated because it will always
813 * be 1 on all machines and a value of 0 is used to disable
814 * tsc-scaling for the vcpu.
815 */
816 max = min(0x7fffffffULL, __scale_tsc(tsc_khz, TSC_RATIO_MAX));
817
818 kvm_max_guest_tsc_khz = max;
819 }
820
641 if (nested) { 821 if (nested) {
642 printk(KERN_INFO "kvm: Nested Virtualization enabled\n"); 822 printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
643 kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE); 823 kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
@@ -649,9 +829,7 @@ static __init int svm_hardware_setup(void)
649 goto err; 829 goto err;
650 } 830 }
651 831
652 svm_features = cpuid_edx(SVM_CPUID_FUNC); 832 if (!boot_cpu_has(X86_FEATURE_NPT))
653
654 if (!svm_has(SVM_FEATURE_NPT))
655 npt_enabled = false; 833 npt_enabled = false;
656 834
657 if (npt_enabled && !npt) { 835 if (npt_enabled && !npt) {
@@ -701,68 +879,161 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
701 seg->base = 0; 879 seg->base = 0;
702} 880}
703 881
882static u64 __scale_tsc(u64 ratio, u64 tsc)
883{
884 u64 mult, frac, _tsc;
885
886 mult = ratio >> 32;
887 frac = ratio & ((1ULL << 32) - 1);
888
889 _tsc = tsc;
890 _tsc *= mult;
891 _tsc += (tsc >> 32) * frac;
892 _tsc += ((tsc & ((1ULL << 32) - 1)) * frac) >> 32;
893
894 return _tsc;
895}
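The ratio in MSR_AMD64_TSC_RATIO is an 8.32 fixed-point multiplier (integer part in bits 39:32, fraction in bits 31:0), so the scaled value is tsc * ratio / 2^32; __scale_tsc() splits the 64-bit tsc into halves so the fractional product cannot overflow. A minimal standalone check of the same arithmetic (scale() is an illustrative re-statement, not the kernel helper):

	#include <stdint.h>
	#include <inttypes.h>
	#include <stdio.h>

	/* Same 8.32 fixed-point multiply as __scale_tsc(), written standalone. */
	static uint64_t scale(uint64_t ratio, uint64_t tsc)
	{
		uint64_t mult = ratio >> 32;            /* integer part (bits 39:32)  */
		uint64_t frac = ratio & 0xffffffffULL;  /* fractional part (bits 31:0) */
		uint64_t r;

		r  = tsc * mult;
		r += (tsc >> 32) * frac;
		r += ((tsc & 0xffffffffULL) * frac) >> 32;
		return r;
	}

	int main(void)
	{
		uint64_t tsc = 1000000000ULL;                         /* some raw TSC value   */
		printf("%" PRIu64 "\n", scale(0x0100000000ULL, tsc)); /* ratio 1.0 -> 1000000000 */
		printf("%" PRIu64 "\n", scale(0x0180000000ULL, tsc)); /* ratio 1.5 -> 1500000000 */
		return 0;
	}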
896
897static u64 svm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc)
898{
899 struct vcpu_svm *svm = to_svm(vcpu);
900 u64 _tsc = tsc;
901
902 if (svm->tsc_ratio != TSC_RATIO_DEFAULT)
903 _tsc = __scale_tsc(svm->tsc_ratio, tsc);
904
905 return _tsc;
906}
907
908static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
909{
910 struct vcpu_svm *svm = to_svm(vcpu);
911 u64 ratio;
912 u64 khz;
913
914 /* TSC scaling supported? */
915 if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR))
916 return;
917
918 /* TSC-Scaling disabled or guest TSC same frequency as host TSC? */
919 if (user_tsc_khz == 0) {
920 vcpu->arch.virtual_tsc_khz = 0;
921 svm->tsc_ratio = TSC_RATIO_DEFAULT;
922 return;
923 }
924
925 khz = user_tsc_khz;
926
927 /* TSC scaling required - calculate ratio */
928 ratio = khz << 32;
929 do_div(ratio, tsc_khz);
930
931 if (ratio == 0 || ratio & TSC_RATIO_RSVD) {
932 WARN_ONCE(1, "Invalid TSC ratio - virtual-tsc-khz=%u\n",
933 user_tsc_khz);
934 return;
935 }
936 vcpu->arch.virtual_tsc_khz = user_tsc_khz;
937 svm->tsc_ratio = ratio;
938}
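The programmed ratio is simply guest_khz / host_khz in the same 8.32 format, computed as (user_tsc_khz << 32) / tsc_khz; the reserved-bit check rejects ratios whose integer part would not fit in bits 39:32. For example, a 3.6 GHz guest on a 2.4 GHz host gives 3600000 * 2^32 / 2400000 = 0x180000000, i.e. 1.5. A short sketch of the same computation (tsc_ratio() is illustrative, not a kernel function):

	#include <stdint.h>
	#include <inttypes.h>
	#include <stdio.h>

	#define RATIO_RSVD_BITS 0xffffff0000000000ULL   /* bits 63:40, as in TSC_RATIO_RSVD */

	static uint64_t tsc_ratio(uint32_t guest_khz, uint32_t host_khz)
	{
		uint64_t ratio = ((uint64_t)guest_khz << 32) / host_khz;

		if (ratio == 0 || (ratio & RATIO_RSVD_BITS))
			return 0;   /* reject: integer part would not fit bits 39:32 */
		return ratio;
	}

	int main(void)
	{
		printf("0x%" PRIx64 "\n", tsc_ratio(3600000, 2400000)); /* 0x180000000 = 1.5 */
		printf("0x%" PRIx64 "\n", tsc_ratio(1200000, 2400000)); /* 0x80000000  = 0.5 */
		return 0;
	}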
939
940static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
941{
942 struct vcpu_svm *svm = to_svm(vcpu);
943 u64 g_tsc_offset = 0;
944
945 if (is_guest_mode(vcpu)) {
946 g_tsc_offset = svm->vmcb->control.tsc_offset -
947 svm->nested.hsave->control.tsc_offset;
948 svm->nested.hsave->control.tsc_offset = offset;
949 }
950
951 svm->vmcb->control.tsc_offset = offset + g_tsc_offset;
952
953 mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
954}
955
956static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
957{
958 struct vcpu_svm *svm = to_svm(vcpu);
959
960 svm->vmcb->control.tsc_offset += adjustment;
961 if (is_guest_mode(vcpu))
962 svm->nested.hsave->control.tsc_offset += adjustment;
963 mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
964}
965
966static u64 svm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
967{
968 u64 tsc;
969
970 tsc = svm_scale_tsc(vcpu, native_read_tsc());
971
972 return target_tsc - tsc;
973}
974
704static void init_vmcb(struct vcpu_svm *svm) 975static void init_vmcb(struct vcpu_svm *svm)
705{ 976{
706 struct vmcb_control_area *control = &svm->vmcb->control; 977 struct vmcb_control_area *control = &svm->vmcb->control;
707 struct vmcb_save_area *save = &svm->vmcb->save; 978 struct vmcb_save_area *save = &svm->vmcb->save;
708 979
709 svm->vcpu.fpu_active = 1; 980 svm->vcpu.fpu_active = 1;
981 svm->vcpu.arch.hflags = 0;
710 982
711 control->intercept_cr_read = INTERCEPT_CR0_MASK | 983 set_cr_intercept(svm, INTERCEPT_CR0_READ);
712 INTERCEPT_CR3_MASK | 984 set_cr_intercept(svm, INTERCEPT_CR3_READ);
713 INTERCEPT_CR4_MASK; 985 set_cr_intercept(svm, INTERCEPT_CR4_READ);
714 986 set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
715 control->intercept_cr_write = INTERCEPT_CR0_MASK | 987 set_cr_intercept(svm, INTERCEPT_CR3_WRITE);
716 INTERCEPT_CR3_MASK | 988 set_cr_intercept(svm, INTERCEPT_CR4_WRITE);
717 INTERCEPT_CR4_MASK | 989 set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
718 INTERCEPT_CR8_MASK; 990
719 991 set_dr_intercept(svm, INTERCEPT_DR0_READ);
720 control->intercept_dr_read = INTERCEPT_DR0_MASK | 992 set_dr_intercept(svm, INTERCEPT_DR1_READ);
721 INTERCEPT_DR1_MASK | 993 set_dr_intercept(svm, INTERCEPT_DR2_READ);
722 INTERCEPT_DR2_MASK | 994 set_dr_intercept(svm, INTERCEPT_DR3_READ);
723 INTERCEPT_DR3_MASK | 995 set_dr_intercept(svm, INTERCEPT_DR4_READ);
724 INTERCEPT_DR4_MASK | 996 set_dr_intercept(svm, INTERCEPT_DR5_READ);
725 INTERCEPT_DR5_MASK | 997 set_dr_intercept(svm, INTERCEPT_DR6_READ);
726 INTERCEPT_DR6_MASK | 998 set_dr_intercept(svm, INTERCEPT_DR7_READ);
727 INTERCEPT_DR7_MASK; 999
728 1000 set_dr_intercept(svm, INTERCEPT_DR0_WRITE);
729 control->intercept_dr_write = INTERCEPT_DR0_MASK | 1001 set_dr_intercept(svm, INTERCEPT_DR1_WRITE);
730 INTERCEPT_DR1_MASK | 1002 set_dr_intercept(svm, INTERCEPT_DR2_WRITE);
731 INTERCEPT_DR2_MASK | 1003 set_dr_intercept(svm, INTERCEPT_DR3_WRITE);
732 INTERCEPT_DR3_MASK | 1004 set_dr_intercept(svm, INTERCEPT_DR4_WRITE);
733 INTERCEPT_DR4_MASK | 1005 set_dr_intercept(svm, INTERCEPT_DR5_WRITE);
734 INTERCEPT_DR5_MASK | 1006 set_dr_intercept(svm, INTERCEPT_DR6_WRITE);
735 INTERCEPT_DR6_MASK | 1007 set_dr_intercept(svm, INTERCEPT_DR7_WRITE);
736 INTERCEPT_DR7_MASK; 1008
737 1009 set_exception_intercept(svm, PF_VECTOR);
738 control->intercept_exceptions = (1 << PF_VECTOR) | 1010 set_exception_intercept(svm, UD_VECTOR);
739 (1 << UD_VECTOR) | 1011 set_exception_intercept(svm, MC_VECTOR);
740 (1 << MC_VECTOR); 1012
741 1013 set_intercept(svm, INTERCEPT_INTR);
742 1014 set_intercept(svm, INTERCEPT_NMI);
743 control->intercept = (1ULL << INTERCEPT_INTR) | 1015 set_intercept(svm, INTERCEPT_SMI);
744 (1ULL << INTERCEPT_NMI) | 1016 set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
745 (1ULL << INTERCEPT_SMI) | 1017 set_intercept(svm, INTERCEPT_CPUID);
746 (1ULL << INTERCEPT_SELECTIVE_CR0) | 1018 set_intercept(svm, INTERCEPT_INVD);
747 (1ULL << INTERCEPT_CPUID) | 1019 set_intercept(svm, INTERCEPT_HLT);
748 (1ULL << INTERCEPT_INVD) | 1020 set_intercept(svm, INTERCEPT_INVLPG);
749 (1ULL << INTERCEPT_HLT) | 1021 set_intercept(svm, INTERCEPT_INVLPGA);
750 (1ULL << INTERCEPT_INVLPG) | 1022 set_intercept(svm, INTERCEPT_IOIO_PROT);
751 (1ULL << INTERCEPT_INVLPGA) | 1023 set_intercept(svm, INTERCEPT_MSR_PROT);
752 (1ULL << INTERCEPT_IOIO_PROT) | 1024 set_intercept(svm, INTERCEPT_TASK_SWITCH);
753 (1ULL << INTERCEPT_MSR_PROT) | 1025 set_intercept(svm, INTERCEPT_SHUTDOWN);
754 (1ULL << INTERCEPT_TASK_SWITCH) | 1026 set_intercept(svm, INTERCEPT_VMRUN);
755 (1ULL << INTERCEPT_SHUTDOWN) | 1027 set_intercept(svm, INTERCEPT_VMMCALL);
756 (1ULL << INTERCEPT_VMRUN) | 1028 set_intercept(svm, INTERCEPT_VMLOAD);
757 (1ULL << INTERCEPT_VMMCALL) | 1029 set_intercept(svm, INTERCEPT_VMSAVE);
758 (1ULL << INTERCEPT_VMLOAD) | 1030 set_intercept(svm, INTERCEPT_STGI);
759 (1ULL << INTERCEPT_VMSAVE) | 1031 set_intercept(svm, INTERCEPT_CLGI);
760 (1ULL << INTERCEPT_STGI) | 1032 set_intercept(svm, INTERCEPT_SKINIT);
761 (1ULL << INTERCEPT_CLGI) | 1033 set_intercept(svm, INTERCEPT_WBINVD);
762 (1ULL << INTERCEPT_SKINIT) | 1034 set_intercept(svm, INTERCEPT_MONITOR);
763 (1ULL << INTERCEPT_WBINVD) | 1035 set_intercept(svm, INTERCEPT_MWAIT);
764 (1ULL << INTERCEPT_MONITOR) | 1036 set_intercept(svm, INTERCEPT_XSETBV);
765 (1ULL << INTERCEPT_MWAIT);
766 1037
767 control->iopm_base_pa = iopm_base; 1038 control->iopm_base_pa = iopm_base;
768 control->msrpm_base_pa = __pa(svm->msrpm); 1039 control->msrpm_base_pa = __pa(svm->msrpm);
@@ -793,10 +1064,10 @@ static void init_vmcb(struct vcpu_svm *svm)
793 init_sys_seg(&save->ldtr, SEG_TYPE_LDT); 1064 init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
794 init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16); 1065 init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
795 1066
796 save->efer = EFER_SVME; 1067 svm_set_efer(&svm->vcpu, 0);
797 save->dr6 = 0xffff0ff0; 1068 save->dr6 = 0xffff0ff0;
798 save->dr7 = 0x400; 1069 save->dr7 = 0x400;
799 save->rflags = 2; 1070 kvm_set_rflags(&svm->vcpu, 2);
800 save->rip = 0x0000fff0; 1071 save->rip = 0x0000fff0;
801 svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip; 1072 svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
802 1073
@@ -804,8 +1075,8 @@ static void init_vmcb(struct vcpu_svm *svm)
804 * This is the guest-visible cr0 value. 1075 * This is the guest-visible cr0 value.
805 * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0. 1076 * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
806 */ 1077 */
807 svm->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; 1078 svm->vcpu.arch.cr0 = 0;
808 (void)kvm_set_cr0(&svm->vcpu, svm->vcpu.arch.cr0); 1079 (void)kvm_set_cr0(&svm->vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET);
809 1080
810 save->cr4 = X86_CR4_PAE; 1081 save->cr4 = X86_CR4_PAE;
811 /* rdx = ?? */ 1082 /* rdx = ?? */
@@ -813,25 +1084,27 @@ static void init_vmcb(struct vcpu_svm *svm)
813 if (npt_enabled) { 1084 if (npt_enabled) {
814 /* Setup VMCB for Nested Paging */ 1085 /* Setup VMCB for Nested Paging */
815 control->nested_ctl = 1; 1086 control->nested_ctl = 1;
816 control->intercept &= ~((1ULL << INTERCEPT_TASK_SWITCH) | 1087 clr_intercept(svm, INTERCEPT_TASK_SWITCH);
817 (1ULL << INTERCEPT_INVLPG)); 1088 clr_intercept(svm, INTERCEPT_INVLPG);
818 control->intercept_exceptions &= ~(1 << PF_VECTOR); 1089 clr_exception_intercept(svm, PF_VECTOR);
819 control->intercept_cr_read &= ~INTERCEPT_CR3_MASK; 1090 clr_cr_intercept(svm, INTERCEPT_CR3_READ);
820 control->intercept_cr_write &= ~INTERCEPT_CR3_MASK; 1091 clr_cr_intercept(svm, INTERCEPT_CR3_WRITE);
821 save->g_pat = 0x0007040600070406ULL; 1092 save->g_pat = 0x0007040600070406ULL;
822 save->cr3 = 0; 1093 save->cr3 = 0;
823 save->cr4 = 0; 1094 save->cr4 = 0;
824 } 1095 }
825 force_new_asid(&svm->vcpu); 1096 svm->asid_generation = 0;
826 1097
827 svm->nested.vmcb = 0; 1098 svm->nested.vmcb = 0;
828 svm->vcpu.arch.hflags = 0; 1099 svm->vcpu.arch.hflags = 0;
829 1100
830 if (svm_has(SVM_FEATURE_PAUSE_FILTER)) { 1101 if (boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
831 control->pause_filter_count = 3000; 1102 control->pause_filter_count = 3000;
832 control->intercept |= (1ULL << INTERCEPT_PAUSE); 1103 set_intercept(svm, INTERCEPT_PAUSE);
833 } 1104 }
834 1105
1106 mark_all_dirty(svm->vmcb);
1107
835 enable_gif(svm); 1108 enable_gif(svm);
836} 1109}
837 1110
@@ -867,6 +1140,8 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
867 goto out; 1140 goto out;
868 } 1141 }
869 1142
1143 svm->tsc_ratio = TSC_RATIO_DEFAULT;
1144
870 err = kvm_vcpu_init(&svm->vcpu, kvm, id); 1145 err = kvm_vcpu_init(&svm->vcpu, kvm, id);
871 if (err) 1146 if (err)
872 goto free_svm; 1147 goto free_svm;
@@ -901,7 +1176,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
901 svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; 1176 svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
902 svm->asid_generation = 0; 1177 svm->asid_generation = 0;
903 init_vmcb(svm); 1178 init_vmcb(svm);
904 svm->vmcb->control.tsc_offset = 0-native_read_tsc(); 1179 kvm_write_tsc(&svm->vcpu, 0);
905 1180
906 err = fx_init(&svm->vcpu); 1181 err = fx_init(&svm->vcpu);
907 if (err) 1182 if (err)
@@ -947,25 +1222,25 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
947 int i; 1222 int i;
948 1223
949 if (unlikely(cpu != vcpu->cpu)) { 1224 if (unlikely(cpu != vcpu->cpu)) {
950 u64 delta;
951
952 if (check_tsc_unstable()) {
953 /*
954 * Make sure that the guest sees a monotonically
955 * increasing TSC.
956 */
957 delta = vcpu->arch.host_tsc - native_read_tsc();
958 svm->vmcb->control.tsc_offset += delta;
959 if (is_nested(svm))
960 svm->nested.hsave->control.tsc_offset += delta;
961 }
962 vcpu->cpu = cpu;
963 kvm_migrate_timers(vcpu);
964 svm->asid_generation = 0; 1225 svm->asid_generation = 0;
1226 mark_all_dirty(svm->vmcb);
965 } 1227 }
966 1228
1229#ifdef CONFIG_X86_64
1230 rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host.gs_base);
1231#endif
1232 savesegment(fs, svm->host.fs);
1233 savesegment(gs, svm->host.gs);
1234 svm->host.ldt = kvm_read_ldt();
1235
967 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) 1236 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
968 rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); 1237 rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
1238
1239 if (static_cpu_has(X86_FEATURE_TSCRATEMSR) &&
1240 svm->tsc_ratio != __get_cpu_var(current_tsc_ratio)) {
1241 __get_cpu_var(current_tsc_ratio) = svm->tsc_ratio;
1242 wrmsrl(MSR_AMD64_TSC_RATIO, svm->tsc_ratio);
1243 }
969} 1244}
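MSR writes are expensive, so svm_vcpu_load() keeps a per-CPU copy of the last TSC ratio written and only touches MSR_AMD64_TSC_RATIO when the incoming vcpu needs a different value. The same write-avoidance pattern in isolation (a sketch with stand-in names, not the kernel per-cpu API):

	#include <stdint.h>
	#include <stdio.h>

	static uint64_t ratio_on_this_cpu = 0x0100000000ULL;   /* stand-in for current_tsc_ratio */

	static void write_ratio_msr(uint64_t v)   /* stand-in for wrmsrl(MSR_AMD64_TSC_RATIO, v) */
	{
		printf("wrmsr 0x%llx\n", (unsigned long long)v);
	}

	static void load_vcpu(uint64_t vcpu_ratio)
	{
		if (vcpu_ratio == ratio_on_this_cpu)
			return;                        /* cached value is already correct */
		ratio_on_this_cpu = vcpu_ratio;
		write_ratio_msr(vcpu_ratio);
	}

	int main(void)
	{
		load_vcpu(0x0100000000ULL);   /* no MSR write */
		load_vcpu(0x0180000000ULL);   /* one MSR write */
		load_vcpu(0x0180000000ULL);   /* no MSR write */
		return 0;
	}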
970 1245
971static void svm_vcpu_put(struct kvm_vcpu *vcpu) 1246static void svm_vcpu_put(struct kvm_vcpu *vcpu)
@@ -974,10 +1249,18 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
974 int i; 1249 int i;
975 1250
976 ++vcpu->stat.host_state_reload; 1251 ++vcpu->stat.host_state_reload;
1252 kvm_load_ldt(svm->host.ldt);
1253#ifdef CONFIG_X86_64
1254 loadsegment(fs, svm->host.fs);
1255 wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs);
1256 load_gs_index(svm->host.gs);
1257#else
1258#ifdef CONFIG_X86_32_LAZY_GS
1259 loadsegment(gs, svm->host.gs);
1260#endif
1261#endif
977 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) 1262 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
978 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); 1263 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
979
980 vcpu->arch.host_tsc = native_read_tsc();
981} 1264}
982 1265
983static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) 1266static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
@@ -995,7 +1278,7 @@ static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
995 switch (reg) { 1278 switch (reg) {
996 case VCPU_EXREG_PDPTR: 1279 case VCPU_EXREG_PDPTR:
997 BUG_ON(!npt_enabled); 1280 BUG_ON(!npt_enabled);
998 load_pdptrs(vcpu, vcpu->arch.cr3); 1281 load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
999 break; 1282 break;
1000 default: 1283 default:
1001 BUG(); 1284 BUG();
@@ -1004,12 +1287,12 @@ static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
1004 1287
1005static void svm_set_vintr(struct vcpu_svm *svm) 1288static void svm_set_vintr(struct vcpu_svm *svm)
1006{ 1289{
1007 svm->vmcb->control.intercept |= 1ULL << INTERCEPT_VINTR; 1290 set_intercept(svm, INTERCEPT_VINTR);
1008} 1291}
1009 1292
1010static void svm_clear_vintr(struct vcpu_svm *svm) 1293static void svm_clear_vintr(struct vcpu_svm *svm)
1011{ 1294{
1012 svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VINTR); 1295 clr_intercept(svm, INTERCEPT_VINTR);
1013} 1296}
1014 1297
1015static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg) 1298static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
@@ -1124,6 +1407,7 @@ static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1124 1407
1125 svm->vmcb->save.idtr.limit = dt->size; 1408 svm->vmcb->save.idtr.limit = dt->size;
1126 svm->vmcb->save.idtr.base = dt->address ; 1409 svm->vmcb->save.idtr.base = dt->address ;
1410 mark_dirty(svm->vmcb, VMCB_DT);
1127} 1411}
1128 1412
1129static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 1413static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
@@ -1140,19 +1424,23 @@ static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1140 1424
1141 svm->vmcb->save.gdtr.limit = dt->size; 1425 svm->vmcb->save.gdtr.limit = dt->size;
1142 svm->vmcb->save.gdtr.base = dt->address ; 1426 svm->vmcb->save.gdtr.base = dt->address ;
1427 mark_dirty(svm->vmcb, VMCB_DT);
1143} 1428}
1144 1429
1145static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) 1430static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
1146{ 1431{
1147} 1432}
1148 1433
1434static void svm_decache_cr3(struct kvm_vcpu *vcpu)
1435{
1436}
1437
1149static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) 1438static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
1150{ 1439{
1151} 1440}
1152 1441
1153static void update_cr0_intercept(struct vcpu_svm *svm) 1442static void update_cr0_intercept(struct vcpu_svm *svm)
1154{ 1443{
1155 struct vmcb *vmcb = svm->vmcb;
1156 ulong gcr0 = svm->vcpu.arch.cr0; 1444 ulong gcr0 = svm->vcpu.arch.cr0;
1157 u64 *hcr0 = &svm->vmcb->save.cr0; 1445 u64 *hcr0 = &svm->vmcb->save.cr0;
1158 1446
@@ -1162,27 +1450,14 @@ static void update_cr0_intercept(struct vcpu_svm *svm)
1162 *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK) 1450 *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK)
1163 | (gcr0 & SVM_CR0_SELECTIVE_MASK); 1451 | (gcr0 & SVM_CR0_SELECTIVE_MASK);
1164 1452
1453 mark_dirty(svm->vmcb, VMCB_CR);
1165 1454
1166 if (gcr0 == *hcr0 && svm->vcpu.fpu_active) { 1455 if (gcr0 == *hcr0 && svm->vcpu.fpu_active) {
1167 vmcb->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK; 1456 clr_cr_intercept(svm, INTERCEPT_CR0_READ);
1168 vmcb->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK; 1457 clr_cr_intercept(svm, INTERCEPT_CR0_WRITE);
1169 if (is_nested(svm)) {
1170 struct vmcb *hsave = svm->nested.hsave;
1171
1172 hsave->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK;
1173 hsave->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK;
1174 vmcb->control.intercept_cr_read |= svm->nested.intercept_cr_read;
1175 vmcb->control.intercept_cr_write |= svm->nested.intercept_cr_write;
1176 }
1177 } else { 1458 } else {
1178 svm->vmcb->control.intercept_cr_read |= INTERCEPT_CR0_MASK; 1459 set_cr_intercept(svm, INTERCEPT_CR0_READ);
1179 svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR0_MASK; 1460 set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
1180 if (is_nested(svm)) {
1181 struct vmcb *hsave = svm->nested.hsave;
1182
1183 hsave->control.intercept_cr_read |= INTERCEPT_CR0_MASK;
1184 hsave->control.intercept_cr_write |= INTERCEPT_CR0_MASK;
1185 }
1186 } 1461 }
1187} 1462}
1188 1463
@@ -1190,27 +1465,6 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1190{ 1465{
1191 struct vcpu_svm *svm = to_svm(vcpu); 1466 struct vcpu_svm *svm = to_svm(vcpu);
1192 1467
1193 if (is_nested(svm)) {
1194 /*
1195 * We are here because we run in nested mode, the host kvm
1196 * intercepts cr0 writes but the l1 hypervisor does not.
1197 * But the L1 hypervisor may intercept selective cr0 writes.
1198 * This needs to be checked here.
1199 */
1200 unsigned long old, new;
1201
1202 /* Remove bits that would trigger a real cr0 write intercept */
1203 old = vcpu->arch.cr0 & SVM_CR0_SELECTIVE_MASK;
1204 new = cr0 & SVM_CR0_SELECTIVE_MASK;
1205
1206 if (old == new) {
1207 /* cr0 write with ts and mp unchanged */
1208 svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
1209 if (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE)
1210 return;
1211 }
1212 }
1213
1214#ifdef CONFIG_X86_64 1468#ifdef CONFIG_X86_64
1215 if (vcpu->arch.efer & EFER_LME) { 1469 if (vcpu->arch.efer & EFER_LME) {
1216 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { 1470 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
@@ -1238,6 +1492,7 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1238 */ 1492 */
1239 cr0 &= ~(X86_CR0_CD | X86_CR0_NW); 1493 cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
1240 svm->vmcb->save.cr0 = cr0; 1494 svm->vmcb->save.cr0 = cr0;
1495 mark_dirty(svm->vmcb, VMCB_CR);
1241 update_cr0_intercept(svm); 1496 update_cr0_intercept(svm);
1242} 1497}
1243 1498
@@ -1247,13 +1502,14 @@ static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1247 unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4; 1502 unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4;
1248 1503
1249 if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE)) 1504 if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
1250 force_new_asid(vcpu); 1505 svm_flush_tlb(vcpu);
1251 1506
1252 vcpu->arch.cr4 = cr4; 1507 vcpu->arch.cr4 = cr4;
1253 if (!npt_enabled) 1508 if (!npt_enabled)
1254 cr4 |= X86_CR4_PAE; 1509 cr4 |= X86_CR4_PAE;
1255 cr4 |= host_cr4_mce; 1510 cr4 |= host_cr4_mce;
1256 to_svm(vcpu)->vmcb->save.cr4 = cr4; 1511 to_svm(vcpu)->vmcb->save.cr4 = cr4;
1512 mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
1257} 1513}
1258 1514
1259static void svm_set_segment(struct kvm_vcpu *vcpu, 1515static void svm_set_segment(struct kvm_vcpu *vcpu,
@@ -1282,26 +1538,25 @@ static void svm_set_segment(struct kvm_vcpu *vcpu,
1282 = (svm->vmcb->save.cs.attrib 1538 = (svm->vmcb->save.cs.attrib
1283 >> SVM_SELECTOR_DPL_SHIFT) & 3; 1539 >> SVM_SELECTOR_DPL_SHIFT) & 3;
1284 1540
1541 mark_dirty(svm->vmcb, VMCB_SEG);
1285} 1542}
1286 1543
1287static void update_db_intercept(struct kvm_vcpu *vcpu) 1544static void update_db_intercept(struct kvm_vcpu *vcpu)
1288{ 1545{
1289 struct vcpu_svm *svm = to_svm(vcpu); 1546 struct vcpu_svm *svm = to_svm(vcpu);
1290 1547
1291 svm->vmcb->control.intercept_exceptions &= 1548 clr_exception_intercept(svm, DB_VECTOR);
1292 ~((1 << DB_VECTOR) | (1 << BP_VECTOR)); 1549 clr_exception_intercept(svm, BP_VECTOR);
1293 1550
1294 if (svm->nmi_singlestep) 1551 if (svm->nmi_singlestep)
1295 svm->vmcb->control.intercept_exceptions |= (1 << DB_VECTOR); 1552 set_exception_intercept(svm, DB_VECTOR);
1296 1553
1297 if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { 1554 if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
1298 if (vcpu->guest_debug & 1555 if (vcpu->guest_debug &
1299 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) 1556 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
1300 svm->vmcb->control.intercept_exceptions |= 1557 set_exception_intercept(svm, DB_VECTOR);
1301 1 << DB_VECTOR;
1302 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) 1558 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
1303 svm->vmcb->control.intercept_exceptions |= 1559 set_exception_intercept(svm, BP_VECTOR);
1304 1 << BP_VECTOR;
1305 } else 1560 } else
1306 vcpu->guest_debug = 0; 1561 vcpu->guest_debug = 0;
1307} 1562}
@@ -1315,21 +1570,9 @@ static void svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
1315 else 1570 else
1316 svm->vmcb->save.dr7 = vcpu->arch.dr7; 1571 svm->vmcb->save.dr7 = vcpu->arch.dr7;
1317 1572
1318 update_db_intercept(vcpu); 1573 mark_dirty(svm->vmcb, VMCB_DR);
1319}
1320
1321static void load_host_msrs(struct kvm_vcpu *vcpu)
1322{
1323#ifdef CONFIG_X86_64
1324 wrmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base);
1325#endif
1326}
1327 1574
1328static void save_host_msrs(struct kvm_vcpu *vcpu) 1575 update_db_intercept(vcpu);
1329{
1330#ifdef CONFIG_X86_64
1331 rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base);
1332#endif
1333} 1576}
1334 1577
1335static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) 1578static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
@@ -1342,6 +1585,8 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
1342 1585
1343 svm->asid_generation = sd->asid_generation; 1586 svm->asid_generation = sd->asid_generation;
1344 svm->vmcb->control.asid = sd->next_asid++; 1587 svm->vmcb->control.asid = sd->next_asid++;
1588
1589 mark_dirty(svm->vmcb, VMCB_ASID);
1345} 1590}
1346 1591
1347static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value) 1592static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
@@ -1349,20 +1594,40 @@ static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
1349 struct vcpu_svm *svm = to_svm(vcpu); 1594 struct vcpu_svm *svm = to_svm(vcpu);
1350 1595
1351 svm->vmcb->save.dr7 = value; 1596 svm->vmcb->save.dr7 = value;
1597 mark_dirty(svm->vmcb, VMCB_DR);
1352} 1598}
1353 1599
1354static int pf_interception(struct vcpu_svm *svm) 1600static int pf_interception(struct vcpu_svm *svm)
1355{ 1601{
1356 u64 fault_address; 1602 u64 fault_address = svm->vmcb->control.exit_info_2;
1357 u32 error_code; 1603 u32 error_code;
1604 int r = 1;
1358 1605
1359 fault_address = svm->vmcb->control.exit_info_2; 1606 switch (svm->apf_reason) {
1360 error_code = svm->vmcb->control.exit_info_1; 1607 default:
1361 1608 error_code = svm->vmcb->control.exit_info_1;
1362 trace_kvm_page_fault(fault_address, error_code); 1609
1363 if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu)) 1610 trace_kvm_page_fault(fault_address, error_code);
1364 kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); 1611 if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu))
1365 return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); 1612 kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
1613 r = kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code,
1614 svm->vmcb->control.insn_bytes,
1615 svm->vmcb->control.insn_len);
1616 break;
1617 case KVM_PV_REASON_PAGE_NOT_PRESENT:
1618 svm->apf_reason = 0;
1619 local_irq_disable();
1620 kvm_async_pf_task_wait(fault_address);
1621 local_irq_enable();
1622 break;
1623 case KVM_PV_REASON_PAGE_READY:
1624 svm->apf_reason = 0;
1625 local_irq_disable();
1626 kvm_async_pf_task_wake(fault_address);
1627 local_irq_enable();
1628 break;
1629 }
1630 return r;
1366} 1631}
1367 1632
1368static int db_interception(struct vcpu_svm *svm) 1633static int db_interception(struct vcpu_svm *svm)
@@ -1410,7 +1675,7 @@ static int ud_interception(struct vcpu_svm *svm)
1410{ 1675{
1411 int er; 1676 int er;
1412 1677
1413 er = emulate_instruction(&svm->vcpu, 0, 0, EMULTYPE_TRAP_UD); 1678 er = emulate_instruction(&svm->vcpu, EMULTYPE_TRAP_UD);
1414 if (er != EMULATE_DONE) 1679 if (er != EMULATE_DONE)
1415 kvm_queue_exception(&svm->vcpu, UD_VECTOR); 1680 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
1416 return 1; 1681 return 1;
@@ -1419,21 +1684,8 @@ static int ud_interception(struct vcpu_svm *svm)
1419static void svm_fpu_activate(struct kvm_vcpu *vcpu) 1684static void svm_fpu_activate(struct kvm_vcpu *vcpu)
1420{ 1685{
1421 struct vcpu_svm *svm = to_svm(vcpu); 1686 struct vcpu_svm *svm = to_svm(vcpu);
1422 u32 excp;
1423
1424 if (is_nested(svm)) {
1425 u32 h_excp, n_excp;
1426
1427 h_excp = svm->nested.hsave->control.intercept_exceptions;
1428 n_excp = svm->nested.intercept_exceptions;
1429 h_excp &= ~(1 << NM_VECTOR);
1430 excp = h_excp | n_excp;
1431 } else {
1432 excp = svm->vmcb->control.intercept_exceptions;
1433 excp &= ~(1 << NM_VECTOR);
1434 }
1435 1687
1436 svm->vmcb->control.intercept_exceptions = excp; 1688 clr_exception_intercept(svm, NM_VECTOR);
1437 1689
1438 svm->vcpu.fpu_active = 1; 1690 svm->vcpu.fpu_active = 1;
1439 update_cr0_intercept(svm); 1691 update_cr0_intercept(svm);
@@ -1540,7 +1792,7 @@ static int io_interception(struct vcpu_svm *svm)
1540 string = (io_info & SVM_IOIO_STR_MASK) != 0; 1792 string = (io_info & SVM_IOIO_STR_MASK) != 0;
1541 in = (io_info & SVM_IOIO_TYPE_MASK) != 0; 1793 in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
1542 if (string || in) 1794 if (string || in)
1543 return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE; 1795 return emulate_instruction(vcpu, 0) == EMULATE_DONE;
1544 1796
1545 port = io_info >> 16; 1797 port = io_info >> 16;
1546 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; 1798 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
@@ -1581,6 +1833,56 @@ static int vmmcall_interception(struct vcpu_svm *svm)
1581 return 1; 1833 return 1;
1582} 1834}
1583 1835
1836static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu)
1837{
1838 struct vcpu_svm *svm = to_svm(vcpu);
1839
1840 return svm->nested.nested_cr3;
1841}
1842
1843static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu,
1844 unsigned long root)
1845{
1846 struct vcpu_svm *svm = to_svm(vcpu);
1847
1848 svm->vmcb->control.nested_cr3 = root;
1849 mark_dirty(svm->vmcb, VMCB_NPT);
1850 svm_flush_tlb(vcpu);
1851}
1852
1853static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
1854 struct x86_exception *fault)
1855{
1856 struct vcpu_svm *svm = to_svm(vcpu);
1857
1858 svm->vmcb->control.exit_code = SVM_EXIT_NPF;
1859 svm->vmcb->control.exit_code_hi = 0;
1860 svm->vmcb->control.exit_info_1 = fault->error_code;
1861 svm->vmcb->control.exit_info_2 = fault->address;
1862
1863 nested_svm_vmexit(svm);
1864}
1865
1866static int nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
1867{
1868 int r;
1869
1870 r = kvm_init_shadow_mmu(vcpu, &vcpu->arch.mmu);
1871
1872 vcpu->arch.mmu.set_cr3 = nested_svm_set_tdp_cr3;
1873 vcpu->arch.mmu.get_cr3 = nested_svm_get_tdp_cr3;
1874 vcpu->arch.mmu.inject_page_fault = nested_svm_inject_npf_exit;
1875 vcpu->arch.mmu.shadow_root_level = get_npt_level();
1876 vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
1877
1878 return r;
1879}
1880
1881static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
1882{
1883 vcpu->arch.walk_mmu = &vcpu->arch.mmu;
1884}
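When the nested guest itself uses nested paging, each L2 access is translated twice: the L2 guest's own page table (walked through vcpu->arch.nested_mmu via walk_mmu) turns an L2 virtual address into an L2 guest-physical address, and the L1 hypervisor's table rooted at nested_cr3 (handled by the shadow MMU installed above) turns that into an address KVM can map; faults in the second stage are reflected back to L1 as SVM_EXIT_NPF by nested_svm_inject_npf_exit(). A purely conceptual sketch of the composed walk with toy translators (the real walkers live in mmu.c and paging_tmpl.h):

	#include <stdint.h>
	#include <stdbool.h>
	#include <stdio.h>

	typedef uint64_t gva_t, gpa_t;

	static bool l2_walk(gva_t va, gpa_t *l2_gpa)         /* stage 1: L2 page table */
	{
		*l2_gpa = va & ~0xfffULL;                    /* toy identity mapping */
		return true;
	}

	static bool l1_npt_walk(gpa_t l2_gpa, gpa_t *l1_gpa) /* stage 2: table at nested_cr3 */
	{
		*l1_gpa = l2_gpa + 0x00100000;               /* toy offset mapping */
		return true;
	}

	static bool translate(gva_t va, gpa_t *l1_gpa)
	{
		gpa_t l2_gpa;

		if (!l2_walk(va, &l2_gpa))
			return false;        /* #PF is delivered to the L2 guest */
		if (!l1_npt_walk(l2_gpa, l1_gpa))
			return false;        /* reflected to L1 as SVM_EXIT_NPF */
		return true;
	}

	int main(void)
	{
		gpa_t out;

		if (translate(0x1234ULL, &out))
			printf("L1-physical 0x%llx\n", (unsigned long long)out);
		return 0;
	}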
1885
1584static int nested_svm_check_permissions(struct vcpu_svm *svm) 1886static int nested_svm_check_permissions(struct vcpu_svm *svm)
1585{ 1887{
1586 if (!(svm->vcpu.arch.efer & EFER_SVME) 1888 if (!(svm->vcpu.arch.efer & EFER_SVME)
@@ -1602,7 +1904,7 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
1602{ 1904{
1603 int vmexit; 1905 int vmexit;
1604 1906
1605 if (!is_nested(svm)) 1907 if (!is_guest_mode(&svm->vcpu))
1606 return 0; 1908 return 0;
1607 1909
1608 svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr; 1910 svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
@@ -1620,7 +1922,7 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
1620/* This function returns true if it is safe to enable the irq window */ 1922/* This function returns true if it is safe to enable the irq window */
1621static inline bool nested_svm_intr(struct vcpu_svm *svm) 1923static inline bool nested_svm_intr(struct vcpu_svm *svm)
1622{ 1924{
1623 if (!is_nested(svm)) 1925 if (!is_guest_mode(&svm->vcpu))
1624 return true; 1926 return true;
1625 1927
1626 if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK)) 1928 if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
@@ -1629,6 +1931,14 @@ static inline bool nested_svm_intr(struct vcpu_svm *svm)
1629 if (!(svm->vcpu.arch.hflags & HF_HIF_MASK)) 1931 if (!(svm->vcpu.arch.hflags & HF_HIF_MASK))
1630 return false; 1932 return false;
1631 1933
1934 /*
 1935 * If a vmexit was already requested (by an intercepted exception,
 1936 * for instance), do not overwrite it with an "external interrupt"
 1937 * vmexit.
1938 */
1939 if (svm->nested.exit_required)
1940 return false;
1941
1632 svm->vmcb->control.exit_code = SVM_EXIT_INTR; 1942 svm->vmcb->control.exit_code = SVM_EXIT_INTR;
1633 svm->vmcb->control.exit_info_1 = 0; 1943 svm->vmcb->control.exit_info_1 = 0;
1634 svm->vmcb->control.exit_info_2 = 0; 1944 svm->vmcb->control.exit_info_2 = 0;
@@ -1651,7 +1961,7 @@ static inline bool nested_svm_intr(struct vcpu_svm *svm)
1651/* This function returns true if it is safe to enable the nmi window */ 1961/* This function returns true if it is safe to enable the nmi window */
1652static inline bool nested_svm_nmi(struct vcpu_svm *svm) 1962static inline bool nested_svm_nmi(struct vcpu_svm *svm)
1653{ 1963{
1654 if (!is_nested(svm)) 1964 if (!is_guest_mode(&svm->vcpu))
1655 return true; 1965 return true;
1656 1966
1657 if (!(svm->nested.intercept & (1ULL << INTERCEPT_NMI))) 1967 if (!(svm->nested.intercept & (1ULL << INTERCEPT_NMI)))
@@ -1750,8 +2060,8 @@ static int nested_svm_exit_special(struct vcpu_svm *svm)
1750 return NESTED_EXIT_HOST; 2060 return NESTED_EXIT_HOST;
1751 break; 2061 break;
1752 case SVM_EXIT_EXCP_BASE + PF_VECTOR: 2062 case SVM_EXIT_EXCP_BASE + PF_VECTOR:
1753 /* When we're shadowing, trap PFs */ 2063 /* When we're shadowing, trap PFs, but not async PF */
1754 if (!npt_enabled) 2064 if (!npt_enabled && svm->apf_reason == 0)
1755 return NESTED_EXIT_HOST; 2065 return NESTED_EXIT_HOST;
1756 break; 2066 break;
1757 case SVM_EXIT_EXCP_BASE + NM_VECTOR: 2067 case SVM_EXIT_EXCP_BASE + NM_VECTOR:
@@ -1779,27 +2089,15 @@ static int nested_svm_intercept(struct vcpu_svm *svm)
1779 case SVM_EXIT_IOIO: 2089 case SVM_EXIT_IOIO:
1780 vmexit = nested_svm_intercept_ioio(svm); 2090 vmexit = nested_svm_intercept_ioio(svm);
1781 break; 2091 break;
1782 case SVM_EXIT_READ_CR0 ... SVM_EXIT_READ_CR8: { 2092 case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: {
1783 u32 cr_bits = 1 << (exit_code - SVM_EXIT_READ_CR0); 2093 u32 bit = 1U << (exit_code - SVM_EXIT_READ_CR0);
1784 if (svm->nested.intercept_cr_read & cr_bits) 2094 if (svm->nested.intercept_cr & bit)
1785 vmexit = NESTED_EXIT_DONE;
1786 break;
1787 }
1788 case SVM_EXIT_WRITE_CR0 ... SVM_EXIT_WRITE_CR8: {
1789 u32 cr_bits = 1 << (exit_code - SVM_EXIT_WRITE_CR0);
1790 if (svm->nested.intercept_cr_write & cr_bits)
1791 vmexit = NESTED_EXIT_DONE;
1792 break;
1793 }
1794 case SVM_EXIT_READ_DR0 ... SVM_EXIT_READ_DR7: {
1795 u32 dr_bits = 1 << (exit_code - SVM_EXIT_READ_DR0);
1796 if (svm->nested.intercept_dr_read & dr_bits)
1797 vmexit = NESTED_EXIT_DONE; 2095 vmexit = NESTED_EXIT_DONE;
1798 break; 2096 break;
1799 } 2097 }
1800 case SVM_EXIT_WRITE_DR0 ... SVM_EXIT_WRITE_DR7: { 2098 case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: {
1801 u32 dr_bits = 1 << (exit_code - SVM_EXIT_WRITE_DR0); 2099 u32 bit = 1U << (exit_code - SVM_EXIT_READ_DR0);
1802 if (svm->nested.intercept_dr_write & dr_bits) 2100 if (svm->nested.intercept_dr & bit)
1803 vmexit = NESTED_EXIT_DONE; 2101 vmexit = NESTED_EXIT_DONE;
1804 break; 2102 break;
1805 } 2103 }
@@ -1807,6 +2105,10 @@ static int nested_svm_intercept(struct vcpu_svm *svm)
1807 u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE); 2105 u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
1808 if (svm->nested.intercept_exceptions & excp_bits) 2106 if (svm->nested.intercept_exceptions & excp_bits)
1809 vmexit = NESTED_EXIT_DONE; 2107 vmexit = NESTED_EXIT_DONE;
 2108 /* async page fault always causes a vmexit */
2109 else if ((exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) &&
2110 svm->apf_reason != 0)
2111 vmexit = NESTED_EXIT_DONE;
1810 break; 2112 break;
1811 } 2113 }
1812 case SVM_EXIT_ERR: { 2114 case SVM_EXIT_ERR: {
@@ -1840,10 +2142,8 @@ static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *fr
1840 struct vmcb_control_area *dst = &dst_vmcb->control; 2142 struct vmcb_control_area *dst = &dst_vmcb->control;
1841 struct vmcb_control_area *from = &from_vmcb->control; 2143 struct vmcb_control_area *from = &from_vmcb->control;
1842 2144
1843 dst->intercept_cr_read = from->intercept_cr_read; 2145 dst->intercept_cr = from->intercept_cr;
1844 dst->intercept_cr_write = from->intercept_cr_write; 2146 dst->intercept_dr = from->intercept_dr;
1845 dst->intercept_dr_read = from->intercept_dr_read;
1846 dst->intercept_dr_write = from->intercept_dr_write;
1847 dst->intercept_exceptions = from->intercept_exceptions; 2147 dst->intercept_exceptions = from->intercept_exceptions;
1848 dst->intercept = from->intercept; 2148 dst->intercept = from->intercept;
1849 dst->iopm_base_pa = from->iopm_base_pa; 2149 dst->iopm_base_pa = from->iopm_base_pa;
@@ -1884,7 +2184,8 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1884 if (!nested_vmcb) 2184 if (!nested_vmcb)
1885 return 1; 2185 return 1;
1886 2186
1887 /* Exit nested SVM mode */ 2187 /* Exit Guest-Mode */
2188 leave_guest_mode(&svm->vcpu);
1888 svm->nested.vmcb = 0; 2189 svm->nested.vmcb = 0;
1889 2190
1890 /* Give the current vmcb to the guest */ 2191 /* Give the current vmcb to the guest */
@@ -1896,11 +2197,12 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1896 nested_vmcb->save.ds = vmcb->save.ds; 2197 nested_vmcb->save.ds = vmcb->save.ds;
1897 nested_vmcb->save.gdtr = vmcb->save.gdtr; 2198 nested_vmcb->save.gdtr = vmcb->save.gdtr;
1898 nested_vmcb->save.idtr = vmcb->save.idtr; 2199 nested_vmcb->save.idtr = vmcb->save.idtr;
2200 nested_vmcb->save.efer = svm->vcpu.arch.efer;
1899 nested_vmcb->save.cr0 = kvm_read_cr0(&svm->vcpu); 2201 nested_vmcb->save.cr0 = kvm_read_cr0(&svm->vcpu);
1900 nested_vmcb->save.cr3 = svm->vcpu.arch.cr3; 2202 nested_vmcb->save.cr3 = kvm_read_cr3(&svm->vcpu);
1901 nested_vmcb->save.cr2 = vmcb->save.cr2; 2203 nested_vmcb->save.cr2 = vmcb->save.cr2;
1902 nested_vmcb->save.cr4 = svm->vcpu.arch.cr4; 2204 nested_vmcb->save.cr4 = svm->vcpu.arch.cr4;
1903 nested_vmcb->save.rflags = vmcb->save.rflags; 2205 nested_vmcb->save.rflags = kvm_get_rflags(&svm->vcpu);
1904 nested_vmcb->save.rip = vmcb->save.rip; 2206 nested_vmcb->save.rip = vmcb->save.rip;
1905 nested_vmcb->save.rsp = vmcb->save.rsp; 2207 nested_vmcb->save.rsp = vmcb->save.rsp;
1906 nested_vmcb->save.rax = vmcb->save.rax; 2208 nested_vmcb->save.rax = vmcb->save.rax;
@@ -1917,6 +2219,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1917 nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2; 2219 nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2;
1918 nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info; 2220 nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info;
1919 nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err; 2221 nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err;
2222 nested_vmcb->control.next_rip = vmcb->control.next_rip;
1920 2223
1921 /* 2224 /*
1922 * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have 2225 * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have
@@ -1947,6 +2250,8 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1947 kvm_clear_exception_queue(&svm->vcpu); 2250 kvm_clear_exception_queue(&svm->vcpu);
1948 kvm_clear_interrupt_queue(&svm->vcpu); 2251 kvm_clear_interrupt_queue(&svm->vcpu);
1949 2252
2253 svm->nested.nested_cr3 = 0;
2254
1950 /* Restore selected save entries */ 2255 /* Restore selected save entries */
1951 svm->vmcb->save.es = hsave->save.es; 2256 svm->vmcb->save.es = hsave->save.es;
1952 svm->vmcb->save.cs = hsave->save.cs; 2257 svm->vmcb->save.cs = hsave->save.cs;
@@ -1954,7 +2259,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1954 svm->vmcb->save.ds = hsave->save.ds; 2259 svm->vmcb->save.ds = hsave->save.ds;
1955 svm->vmcb->save.gdtr = hsave->save.gdtr; 2260 svm->vmcb->save.gdtr = hsave->save.gdtr;
1956 svm->vmcb->save.idtr = hsave->save.idtr; 2261 svm->vmcb->save.idtr = hsave->save.idtr;
1957 svm->vmcb->save.rflags = hsave->save.rflags; 2262 kvm_set_rflags(&svm->vcpu, hsave->save.rflags);
1958 svm_set_efer(&svm->vcpu, hsave->save.efer); 2263 svm_set_efer(&svm->vcpu, hsave->save.efer);
1959 svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE); 2264 svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE);
1960 svm_set_cr4(&svm->vcpu, hsave->save.cr4); 2265 svm_set_cr4(&svm->vcpu, hsave->save.cr4);
@@ -1971,8 +2276,11 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1971 svm->vmcb->save.cpl = 0; 2276 svm->vmcb->save.cpl = 0;
1972 svm->vmcb->control.exit_int_info = 0; 2277 svm->vmcb->control.exit_int_info = 0;
1973 2278
2279 mark_all_dirty(svm->vmcb);
2280
1974 nested_svm_unmap(page); 2281 nested_svm_unmap(page);
1975 2282
2283 nested_svm_uninit_mmu_context(&svm->vcpu);
1976 kvm_mmu_reset_context(&svm->vcpu); 2284 kvm_mmu_reset_context(&svm->vcpu);
1977 kvm_mmu_load(&svm->vcpu); 2285 kvm_mmu_load(&svm->vcpu);
1978 2286
@@ -2012,6 +2320,20 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
2012 return true; 2320 return true;
2013} 2321}
2014 2322
2323static bool nested_vmcb_checks(struct vmcb *vmcb)
2324{
2325 if ((vmcb->control.intercept & (1ULL << INTERCEPT_VMRUN)) == 0)
2326 return false;
2327
2328 if (vmcb->control.asid == 0)
2329 return false;
2330
2331 if (vmcb->control.nested_ctl && !npt_enabled)
2332 return false;
2333
2334 return true;
2335}
2336
2015static bool nested_svm_vmrun(struct vcpu_svm *svm) 2337static bool nested_svm_vmrun(struct vcpu_svm *svm)
2016{ 2338{
2017 struct vmcb *nested_vmcb; 2339 struct vmcb *nested_vmcb;
@@ -2026,14 +2348,25 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
2026 if (!nested_vmcb) 2348 if (!nested_vmcb)
2027 return false; 2349 return false;
2028 2350
2029 trace_kvm_nested_vmrun(svm->vmcb->save.rip - 3, vmcb_gpa, 2351 if (!nested_vmcb_checks(nested_vmcb)) {
2352 nested_vmcb->control.exit_code = SVM_EXIT_ERR;
2353 nested_vmcb->control.exit_code_hi = 0;
2354 nested_vmcb->control.exit_info_1 = 0;
2355 nested_vmcb->control.exit_info_2 = 0;
2356
2357 nested_svm_unmap(page);
2358
2359 return false;
2360 }
2361
2362 trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa,
2030 nested_vmcb->save.rip, 2363 nested_vmcb->save.rip,
2031 nested_vmcb->control.int_ctl, 2364 nested_vmcb->control.int_ctl,
2032 nested_vmcb->control.event_inj, 2365 nested_vmcb->control.event_inj,
2033 nested_vmcb->control.nested_ctl); 2366 nested_vmcb->control.nested_ctl);
2034 2367
2035 trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr_read, 2368 trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr & 0xffff,
2036 nested_vmcb->control.intercept_cr_write, 2369 nested_vmcb->control.intercept_cr >> 16,
2037 nested_vmcb->control.intercept_exceptions, 2370 nested_vmcb->control.intercept_exceptions,
2038 nested_vmcb->control.intercept); 2371 nested_vmcb->control.intercept);
2039 2372
@@ -2054,22 +2387,28 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
2054 hsave->save.efer = svm->vcpu.arch.efer; 2387 hsave->save.efer = svm->vcpu.arch.efer;
2055 hsave->save.cr0 = kvm_read_cr0(&svm->vcpu); 2388 hsave->save.cr0 = kvm_read_cr0(&svm->vcpu);
2056 hsave->save.cr4 = svm->vcpu.arch.cr4; 2389 hsave->save.cr4 = svm->vcpu.arch.cr4;
2057 hsave->save.rflags = vmcb->save.rflags; 2390 hsave->save.rflags = kvm_get_rflags(&svm->vcpu);
2058 hsave->save.rip = svm->next_rip; 2391 hsave->save.rip = kvm_rip_read(&svm->vcpu);
2059 hsave->save.rsp = vmcb->save.rsp; 2392 hsave->save.rsp = vmcb->save.rsp;
2060 hsave->save.rax = vmcb->save.rax; 2393 hsave->save.rax = vmcb->save.rax;
2061 if (npt_enabled) 2394 if (npt_enabled)
2062 hsave->save.cr3 = vmcb->save.cr3; 2395 hsave->save.cr3 = vmcb->save.cr3;
2063 else 2396 else
2064 hsave->save.cr3 = svm->vcpu.arch.cr3; 2397 hsave->save.cr3 = kvm_read_cr3(&svm->vcpu);
2065 2398
2066 copy_vmcb_control_area(hsave, vmcb); 2399 copy_vmcb_control_area(hsave, vmcb);
2067 2400
2068 if (svm->vmcb->save.rflags & X86_EFLAGS_IF) 2401 if (kvm_get_rflags(&svm->vcpu) & X86_EFLAGS_IF)
2069 svm->vcpu.arch.hflags |= HF_HIF_MASK; 2402 svm->vcpu.arch.hflags |= HF_HIF_MASK;
2070 else 2403 else
2071 svm->vcpu.arch.hflags &= ~HF_HIF_MASK; 2404 svm->vcpu.arch.hflags &= ~HF_HIF_MASK;
2072 2405
2406 if (nested_vmcb->control.nested_ctl) {
2407 kvm_mmu_unload(&svm->vcpu);
2408 svm->nested.nested_cr3 = nested_vmcb->control.nested_cr3;
2409 nested_svm_init_mmu_context(&svm->vcpu);
2410 }
2411
2073 /* Load the nested guest state */ 2412 /* Load the nested guest state */
2074 svm->vmcb->save.es = nested_vmcb->save.es; 2413 svm->vmcb->save.es = nested_vmcb->save.es;
2075 svm->vmcb->save.cs = nested_vmcb->save.cs; 2414 svm->vmcb->save.cs = nested_vmcb->save.cs;
@@ -2077,7 +2416,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
2077 svm->vmcb->save.ds = nested_vmcb->save.ds; 2416 svm->vmcb->save.ds = nested_vmcb->save.ds;
2078 svm->vmcb->save.gdtr = nested_vmcb->save.gdtr; 2417 svm->vmcb->save.gdtr = nested_vmcb->save.gdtr;
2079 svm->vmcb->save.idtr = nested_vmcb->save.idtr; 2418 svm->vmcb->save.idtr = nested_vmcb->save.idtr;
2080 svm->vmcb->save.rflags = nested_vmcb->save.rflags; 2419 kvm_set_rflags(&svm->vcpu, nested_vmcb->save.rflags);
2081 svm_set_efer(&svm->vcpu, nested_vmcb->save.efer); 2420 svm_set_efer(&svm->vcpu, nested_vmcb->save.efer);
2082 svm_set_cr0(&svm->vcpu, nested_vmcb->save.cr0); 2421 svm_set_cr0(&svm->vcpu, nested_vmcb->save.cr0);
2083 svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4); 2422 svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4);
@@ -2107,14 +2446,12 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
2107 svm->nested.vmcb_iopm = nested_vmcb->control.iopm_base_pa & ~0x0fffULL; 2446 svm->nested.vmcb_iopm = nested_vmcb->control.iopm_base_pa & ~0x0fffULL;
2108 2447
2109 /* cache intercepts */ 2448 /* cache intercepts */
2110 svm->nested.intercept_cr_read = nested_vmcb->control.intercept_cr_read; 2449 svm->nested.intercept_cr = nested_vmcb->control.intercept_cr;
2111 svm->nested.intercept_cr_write = nested_vmcb->control.intercept_cr_write; 2450 svm->nested.intercept_dr = nested_vmcb->control.intercept_dr;
2112 svm->nested.intercept_dr_read = nested_vmcb->control.intercept_dr_read;
2113 svm->nested.intercept_dr_write = nested_vmcb->control.intercept_dr_write;
2114 svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions; 2451 svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions;
2115 svm->nested.intercept = nested_vmcb->control.intercept; 2452 svm->nested.intercept = nested_vmcb->control.intercept;
2116 2453
2117 force_new_asid(&svm->vcpu); 2454 svm_flush_tlb(&svm->vcpu);
2118 svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK; 2455 svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK;
2119 if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK) 2456 if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK)
2120 svm->vcpu.arch.hflags |= HF_VINTR_MASK; 2457 svm->vcpu.arch.hflags |= HF_VINTR_MASK;
@@ -2123,29 +2460,12 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
2123 2460
2124 if (svm->vcpu.arch.hflags & HF_VINTR_MASK) { 2461 if (svm->vcpu.arch.hflags & HF_VINTR_MASK) {
2125 /* We only want the cr8 intercept bits of the guest */ 2462 /* We only want the cr8 intercept bits of the guest */
2126 svm->vmcb->control.intercept_cr_read &= ~INTERCEPT_CR8_MASK; 2463 clr_cr_intercept(svm, INTERCEPT_CR8_READ);
2127 svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK; 2464 clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
2128 } 2465 }
2129 2466
2130 /* We don't want to see VMMCALLs from a nested guest */ 2467 /* We don't want to see VMMCALLs from a nested guest */
2131 svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VMMCALL); 2468 clr_intercept(svm, INTERCEPT_VMMCALL);
2132
2133 /*
2134 * We don't want a nested guest to be more powerful than the guest, so
2135 * all intercepts are ORed
2136 */
2137 svm->vmcb->control.intercept_cr_read |=
2138 nested_vmcb->control.intercept_cr_read;
2139 svm->vmcb->control.intercept_cr_write |=
2140 nested_vmcb->control.intercept_cr_write;
2141 svm->vmcb->control.intercept_dr_read |=
2142 nested_vmcb->control.intercept_dr_read;
2143 svm->vmcb->control.intercept_dr_write |=
2144 nested_vmcb->control.intercept_dr_write;
2145 svm->vmcb->control.intercept_exceptions |=
2146 nested_vmcb->control.intercept_exceptions;
2147
2148 svm->vmcb->control.intercept |= nested_vmcb->control.intercept;
2149 2469
2150 svm->vmcb->control.lbr_ctl = nested_vmcb->control.lbr_ctl; 2470 svm->vmcb->control.lbr_ctl = nested_vmcb->control.lbr_ctl;
2151 svm->vmcb->control.int_vector = nested_vmcb->control.int_vector; 2471 svm->vmcb->control.int_vector = nested_vmcb->control.int_vector;
@@ -2156,11 +2476,21 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
2156 2476
2157 nested_svm_unmap(page); 2477 nested_svm_unmap(page);
2158 2478
2159 /* nested_vmcb is our indicator if nested SVM is activated */ 2479 /* Enter Guest-Mode */
2480 enter_guest_mode(&svm->vcpu);
2481
2482 /*
2483 * Merge guest and host intercepts - must be called with vcpu in
 2484 * guest-mode to take effect here
2485 */
2486 recalc_intercepts(svm);
2487
2160 svm->nested.vmcb = vmcb_gpa; 2488 svm->nested.vmcb = vmcb_gpa;
2161 2489
2162 enable_gif(svm); 2490 enable_gif(svm);
2163 2491
2492 mark_all_dirty(svm->vmcb);
2493
2164 return true; 2494 return true;
2165} 2495}
2166 2496
@@ -2188,13 +2518,13 @@ static int vmload_interception(struct vcpu_svm *svm)
2188 if (nested_svm_check_permissions(svm)) 2518 if (nested_svm_check_permissions(svm))
2189 return 1; 2519 return 1;
2190 2520
2191 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2192 skip_emulated_instruction(&svm->vcpu);
2193
2194 nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page); 2521 nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
2195 if (!nested_vmcb) 2522 if (!nested_vmcb)
2196 return 1; 2523 return 1;
2197 2524
2525 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2526 skip_emulated_instruction(&svm->vcpu);
2527
2198 nested_svm_vmloadsave(nested_vmcb, svm->vmcb); 2528 nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
2199 nested_svm_unmap(page); 2529 nested_svm_unmap(page);
2200 2530
@@ -2209,13 +2539,13 @@ static int vmsave_interception(struct vcpu_svm *svm)
2209 if (nested_svm_check_permissions(svm)) 2539 if (nested_svm_check_permissions(svm))
2210 return 1; 2540 return 1;
2211 2541
2212 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2213 skip_emulated_instruction(&svm->vcpu);
2214
2215 nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page); 2542 nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
2216 if (!nested_vmcb) 2543 if (!nested_vmcb)
2217 return 1; 2544 return 1;
2218 2545
2546 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2547 skip_emulated_instruction(&svm->vcpu);
2548
2219 nested_svm_vmloadsave(svm->vmcb, nested_vmcb); 2549 nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
2220 nested_svm_unmap(page); 2550 nested_svm_unmap(page);
2221 2551
@@ -2227,8 +2557,8 @@ static int vmrun_interception(struct vcpu_svm *svm)
2227 if (nested_svm_check_permissions(svm)) 2557 if (nested_svm_check_permissions(svm))
2228 return 1; 2558 return 1;
2229 2559
2230 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 2560 /* Save rip after vmrun instruction */
2231 skip_emulated_instruction(&svm->vcpu); 2561 kvm_rip_write(&svm->vcpu, kvm_rip_read(&svm->vcpu) + 3);
2232 2562
2233 if (!nested_svm_vmrun(svm)) 2563 if (!nested_svm_vmrun(svm))
2234 return 1; 2564 return 1;
@@ -2257,6 +2587,7 @@ static int stgi_interception(struct vcpu_svm *svm)
2257 2587
2258 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 2588 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2259 skip_emulated_instruction(&svm->vcpu); 2589 skip_emulated_instruction(&svm->vcpu);
2590 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
2260 2591
2261 enable_gif(svm); 2592 enable_gif(svm);
2262 2593
@@ -2277,6 +2608,8 @@ static int clgi_interception(struct vcpu_svm *svm)
2277 svm_clear_vintr(svm); 2608 svm_clear_vintr(svm);
2278 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; 2609 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
2279 2610
2611 mark_dirty(svm->vmcb, VMCB_INTR);
2612
2280 return 1; 2613 return 1;
2281} 2614}
2282 2615
@@ -2303,6 +2636,19 @@ static int skinit_interception(struct vcpu_svm *svm)
2303 return 1; 2636 return 1;
2304} 2637}
2305 2638
2639static int xsetbv_interception(struct vcpu_svm *svm)
2640{
2641 u64 new_bv = kvm_read_edx_eax(&svm->vcpu);
2642 u32 index = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX);
2643
2644 if (kvm_set_xcr(&svm->vcpu, index, new_bv) == 0) {
2645 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2646 skip_emulated_instruction(&svm->vcpu);
2647 }
2648
2649 return 1;
2650}
2651
2306static int invalid_op_interception(struct vcpu_svm *svm) 2652static int invalid_op_interception(struct vcpu_svm *svm)
2307{ 2653{
2308 kvm_queue_exception(&svm->vcpu, UD_VECTOR); 2654 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
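The xsetbv_interception() added above relies on kvm_read_edx_eax() to rebuild the 64-bit XCR value the guest passed in EDX:EAX, with the XCR index taken from ECX. A minimal sketch of that composition, with plain integer parameters standing in for vcpu register state:

#include <stdint.h>

/* EDX holds the high half, EAX the low half of the 64-bit operand. */
uint64_t compose_edx_eax(uint32_t eax, uint32_t edx)
{
	return ((uint64_t)edx << 32) | eax;
}

For example, a guest enabling x87/SSE/AVX state issues XSETBV with ECX=0 and EDX:EAX=0x7, so compose_edx_eax(0x7, 0x0) yields the value handed to kvm_set_xcr().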
@@ -2384,34 +2730,162 @@ static int cpuid_interception(struct vcpu_svm *svm)
2384static int iret_interception(struct vcpu_svm *svm) 2730static int iret_interception(struct vcpu_svm *svm)
2385{ 2731{
2386 ++svm->vcpu.stat.nmi_window_exits; 2732 ++svm->vcpu.stat.nmi_window_exits;
2387 svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_IRET); 2733 clr_intercept(svm, INTERCEPT_IRET);
2388 svm->vcpu.arch.hflags |= HF_IRET_MASK; 2734 svm->vcpu.arch.hflags |= HF_IRET_MASK;
2735 svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu);
2389 return 1; 2736 return 1;
2390} 2737}
2391 2738
2392static int invlpg_interception(struct vcpu_svm *svm) 2739static int invlpg_interception(struct vcpu_svm *svm)
2393{ 2740{
2394 return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE; 2741 if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
2742 return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
2743
2744 kvm_mmu_invlpg(&svm->vcpu, svm->vmcb->control.exit_info_1);
2745 skip_emulated_instruction(&svm->vcpu);
2746 return 1;
2395} 2747}
2396 2748
2397static int emulate_on_interception(struct vcpu_svm *svm) 2749static int emulate_on_interception(struct vcpu_svm *svm)
2398{ 2750{
2399 return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE; 2751 return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
2752}
2753
2754bool check_selective_cr0_intercepted(struct vcpu_svm *svm, unsigned long val)
2755{
2756 unsigned long cr0 = svm->vcpu.arch.cr0;
2757 bool ret = false;
2758 u64 intercept;
2759
2760 intercept = svm->nested.intercept;
2761
2762 if (!is_guest_mode(&svm->vcpu) ||
2763 (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0))))
2764 return false;
2765
2766 cr0 &= ~SVM_CR0_SELECTIVE_MASK;
2767 val &= ~SVM_CR0_SELECTIVE_MASK;
2768
2769 if (cr0 ^ val) {
2770 svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
2771 ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE);
2772 }
2773
2774 return ret;
2775}
2776
2777#define CR_VALID (1ULL << 63)
2778
2779static int cr_interception(struct vcpu_svm *svm)
2780{
2781 int reg, cr;
2782 unsigned long val;
2783 int err;
2784
2785 if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
2786 return emulate_on_interception(svm);
2787
2788 if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0))
2789 return emulate_on_interception(svm);
2790
2791 reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
2792 cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0;
2793
2794 err = 0;
2795 if (cr >= 16) { /* mov to cr */
2796 cr -= 16;
2797 val = kvm_register_read(&svm->vcpu, reg);
2798 switch (cr) {
2799 case 0:
2800 if (!check_selective_cr0_intercepted(svm, val))
2801 err = kvm_set_cr0(&svm->vcpu, val);
2802 else
2803 return 1;
2804
2805 break;
2806 case 3:
2807 err = kvm_set_cr3(&svm->vcpu, val);
2808 break;
2809 case 4:
2810 err = kvm_set_cr4(&svm->vcpu, val);
2811 break;
2812 case 8:
2813 err = kvm_set_cr8(&svm->vcpu, val);
2814 break;
2815 default:
2816 WARN(1, "unhandled write to CR%d", cr);
2817 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
2818 return 1;
2819 }
2820 } else { /* mov from cr */
2821 switch (cr) {
2822 case 0:
2823 val = kvm_read_cr0(&svm->vcpu);
2824 break;
2825 case 2:
2826 val = svm->vcpu.arch.cr2;
2827 break;
2828 case 3:
2829 val = kvm_read_cr3(&svm->vcpu);
2830 break;
2831 case 4:
2832 val = kvm_read_cr4(&svm->vcpu);
2833 break;
2834 case 8:
2835 val = kvm_get_cr8(&svm->vcpu);
2836 break;
2837 default:
2838 WARN(1, "unhandled read from CR%d", cr);
2839 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
2840 return 1;
2841 }
2842 kvm_register_write(&svm->vcpu, reg, val);
2843 }
2844 kvm_complete_insn_gp(&svm->vcpu, err);
2845
2846 return 1;
2847}
2848
2849static int dr_interception(struct vcpu_svm *svm)
2850{
2851 int reg, dr;
2852 unsigned long val;
2853 int err;
2854
2855 if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS))
2856 return emulate_on_interception(svm);
2857
2858 reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
2859 dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;
2860
2861 if (dr >= 16) { /* mov to DRn */
2862 val = kvm_register_read(&svm->vcpu, reg);
2863 kvm_set_dr(&svm->vcpu, dr - 16, val);
2864 } else {
2865 err = kvm_get_dr(&svm->vcpu, dr, &val);
2866 if (!err)
2867 kvm_register_write(&svm->vcpu, reg, val);
2868 }
2869
2870 skip_emulated_instruction(&svm->vcpu);
2871
2872 return 1;
2400} 2873}
2401 2874
2402static int cr8_write_interception(struct vcpu_svm *svm) 2875static int cr8_write_interception(struct vcpu_svm *svm)
2403{ 2876{
2404 struct kvm_run *kvm_run = svm->vcpu.run; 2877 struct kvm_run *kvm_run = svm->vcpu.run;
2878 int r;
2405 2879
2406 u8 cr8_prev = kvm_get_cr8(&svm->vcpu); 2880 u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
2407 /* instruction emulation calls kvm_set_cr8() */ 2881 /* instruction emulation calls kvm_set_cr8() */
2408 emulate_instruction(&svm->vcpu, 0, 0, 0); 2882 r = cr_interception(svm);
2409 if (irqchip_in_kernel(svm->vcpu.kvm)) { 2883 if (irqchip_in_kernel(svm->vcpu.kvm)) {
2410 svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK; 2884 clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
2411 return 1; 2885 return r;
2412 } 2886 }
2413 if (cr8_prev <= kvm_get_cr8(&svm->vcpu)) 2887 if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
2414 return 1; 2888 return r;
2415 kvm_run->exit_reason = KVM_EXIT_SET_TPR; 2889 kvm_run->exit_reason = KVM_EXIT_SET_TPR;
2416 return 0; 2890 return 0;
2417} 2891}
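With decode assists, cr_interception() gets everything it needs from the VMCB: the exit code encodes which control register was touched and whether it was a read or a write (writes start 16 codes above reads), while exit_info_1 carries the GPR number, with bit 63 marking the information as valid. A stand-alone sketch of that decoding; the constants are illustrative values patterned after the SVM definitions, not copied from <asm/svm.h>:

#include <stdint.h>
#include <stdio.h>

#define TOY_EXIT_READ_CR0	0x000		/* reads occupy 0x00..0x0f  */
#define TOY_EXIT_WRITE_CR0	0x010		/* writes occupy 0x10..0x1f */
#define TOY_REG_MASK		0x0f
#define TOY_CR_VALID		(1ULL << 63)

static void decode_cr_exit(uint32_t exit_code, uint64_t exit_info_1)
{
	int cr = exit_code - TOY_EXIT_READ_CR0;
	int is_write = cr >= 16;
	int gpr = exit_info_1 & TOY_REG_MASK;

	if (!(exit_info_1 & TOY_CR_VALID)) {
		printf("no decode assist data, fall back to the emulator\n");
		return;
	}
	printf("%s CR%d via GPR%d\n", is_write ? "write to" : "read of",
	       is_write ? cr - 16 : cr, gpr);
}

int main(void)
{
	decode_cr_exit(TOY_EXIT_WRITE_CR0 + 3, TOY_CR_VALID | 0x1);	/* e.g. mov %rcx, %cr3 */
	return 0;
}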
@@ -2422,14 +2896,11 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
2422 2896
2423 switch (ecx) { 2897 switch (ecx) {
2424 case MSR_IA32_TSC: { 2898 case MSR_IA32_TSC: {
2425 u64 tsc_offset; 2899 struct vmcb *vmcb = get_host_vmcb(svm);
2426 2900
2427 if (is_nested(svm)) 2901 *data = vmcb->control.tsc_offset +
2428 tsc_offset = svm->nested.hsave->control.tsc_offset; 2902 svm_scale_tsc(vcpu, native_read_tsc());
2429 else
2430 tsc_offset = svm->vmcb->control.tsc_offset;
2431 2903
2432 *data = tsc_offset + native_read_tsc();
2433 break; 2904 break;
2434 } 2905 }
2435 case MSR_STAR: 2906 case MSR_STAR:
@@ -2542,20 +3013,9 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
2542 struct vcpu_svm *svm = to_svm(vcpu); 3013 struct vcpu_svm *svm = to_svm(vcpu);
2543 3014
2544 switch (ecx) { 3015 switch (ecx) {
2545 case MSR_IA32_TSC: { 3016 case MSR_IA32_TSC:
2546 u64 tsc_offset = data - native_read_tsc(); 3017 kvm_write_tsc(vcpu, data);
2547 u64 g_tsc_offset = 0;
2548
2549 if (is_nested(svm)) {
2550 g_tsc_offset = svm->vmcb->control.tsc_offset -
2551 svm->nested.hsave->control.tsc_offset;
2552 svm->nested.hsave->control.tsc_offset = tsc_offset;
2553 }
2554
2555 svm->vmcb->control.tsc_offset = tsc_offset + g_tsc_offset;
2556
2557 break; 3018 break;
2558 }
2559 case MSR_STAR: 3019 case MSR_STAR:
2560 svm->vmcb->save.star = data; 3020 svm->vmcb->save.star = data;
2561 break; 3021 break;
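Both MSR paths lean on the fact that the guest-visible TSC on SVM is the (optionally scaled) host TSC plus the VMCB tsc_offset: reads become offset + svm_scale_tsc(host TSC), and writes collapse into kvm_write_tsc(), which just picks a new offset. A toy version of that arithmetic, ignoring the fixed-point scaling ratio for brevity:

#include <stdint.h>

/* guest_tsc = scale(host_tsc) + tsc_offset; assume a 1.0 ratio here. */
uint64_t guest_tsc(uint64_t host_tsc, uint64_t tsc_offset)
{
	return host_tsc + tsc_offset;
}

/* Choose the offset that makes the guest read 'wanted' right now. */
uint64_t offset_for(uint64_t wanted, uint64_t host_tsc)
{
	return wanted - host_tsc;
}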
@@ -2585,7 +3045,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
2585 svm->vmcb->save.sysenter_esp = data; 3045 svm->vmcb->save.sysenter_esp = data;
2586 break; 3046 break;
2587 case MSR_IA32_DEBUGCTLMSR: 3047 case MSR_IA32_DEBUGCTLMSR:
2588 if (!svm_has(SVM_FEATURE_LBRV)) { 3048 if (!boot_cpu_has(X86_FEATURE_LBRV)) {
2589 pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n", 3049 pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
2590 __func__, data); 3050 __func__, data);
2591 break; 3051 break;
@@ -2594,6 +3054,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
2594 return 1; 3054 return 1;
2595 3055
2596 svm->vmcb->save.dbgctl = data; 3056 svm->vmcb->save.dbgctl = data;
3057 mark_dirty(svm->vmcb, VMCB_LBR);
2597 if (data & (1ULL<<0)) 3058 if (data & (1ULL<<0))
2598 svm_enable_lbrv(svm); 3059 svm_enable_lbrv(svm);
2599 else 3060 else
@@ -2643,8 +3104,10 @@ static int interrupt_window_interception(struct vcpu_svm *svm)
2643{ 3104{
2644 struct kvm_run *kvm_run = svm->vcpu.run; 3105 struct kvm_run *kvm_run = svm->vcpu.run;
2645 3106
3107 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
2646 svm_clear_vintr(svm); 3108 svm_clear_vintr(svm);
2647 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; 3109 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
3110 mark_dirty(svm->vmcb, VMCB_INTR);
2648 /* 3111 /*
2649 * If the user space waits to inject interrupts, exit as soon as 3112 * If the user space waits to inject interrupts, exit as soon as
2650 * possible 3113 * possible
@@ -2667,31 +3130,31 @@ static int pause_interception(struct vcpu_svm *svm)
2667} 3130}
2668 3131
2669static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { 3132static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
2670 [SVM_EXIT_READ_CR0] = emulate_on_interception, 3133 [SVM_EXIT_READ_CR0] = cr_interception,
2671 [SVM_EXIT_READ_CR3] = emulate_on_interception, 3134 [SVM_EXIT_READ_CR3] = cr_interception,
2672 [SVM_EXIT_READ_CR4] = emulate_on_interception, 3135 [SVM_EXIT_READ_CR4] = cr_interception,
2673 [SVM_EXIT_READ_CR8] = emulate_on_interception, 3136 [SVM_EXIT_READ_CR8] = cr_interception,
2674 [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, 3137 [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception,
2675 [SVM_EXIT_WRITE_CR0] = emulate_on_interception, 3138 [SVM_EXIT_WRITE_CR0] = cr_interception,
2676 [SVM_EXIT_WRITE_CR3] = emulate_on_interception, 3139 [SVM_EXIT_WRITE_CR3] = cr_interception,
2677 [SVM_EXIT_WRITE_CR4] = emulate_on_interception, 3140 [SVM_EXIT_WRITE_CR4] = cr_interception,
2678 [SVM_EXIT_WRITE_CR8] = cr8_write_interception, 3141 [SVM_EXIT_WRITE_CR8] = cr8_write_interception,
2679 [SVM_EXIT_READ_DR0] = emulate_on_interception, 3142 [SVM_EXIT_READ_DR0] = dr_interception,
2680 [SVM_EXIT_READ_DR1] = emulate_on_interception, 3143 [SVM_EXIT_READ_DR1] = dr_interception,
2681 [SVM_EXIT_READ_DR2] = emulate_on_interception, 3144 [SVM_EXIT_READ_DR2] = dr_interception,
2682 [SVM_EXIT_READ_DR3] = emulate_on_interception, 3145 [SVM_EXIT_READ_DR3] = dr_interception,
2683 [SVM_EXIT_READ_DR4] = emulate_on_interception, 3146 [SVM_EXIT_READ_DR4] = dr_interception,
2684 [SVM_EXIT_READ_DR5] = emulate_on_interception, 3147 [SVM_EXIT_READ_DR5] = dr_interception,
2685 [SVM_EXIT_READ_DR6] = emulate_on_interception, 3148 [SVM_EXIT_READ_DR6] = dr_interception,
2686 [SVM_EXIT_READ_DR7] = emulate_on_interception, 3149 [SVM_EXIT_READ_DR7] = dr_interception,
2687 [SVM_EXIT_WRITE_DR0] = emulate_on_interception, 3150 [SVM_EXIT_WRITE_DR0] = dr_interception,
2688 [SVM_EXIT_WRITE_DR1] = emulate_on_interception, 3151 [SVM_EXIT_WRITE_DR1] = dr_interception,
2689 [SVM_EXIT_WRITE_DR2] = emulate_on_interception, 3152 [SVM_EXIT_WRITE_DR2] = dr_interception,
2690 [SVM_EXIT_WRITE_DR3] = emulate_on_interception, 3153 [SVM_EXIT_WRITE_DR3] = dr_interception,
2691 [SVM_EXIT_WRITE_DR4] = emulate_on_interception, 3154 [SVM_EXIT_WRITE_DR4] = dr_interception,
2692 [SVM_EXIT_WRITE_DR5] = emulate_on_interception, 3155 [SVM_EXIT_WRITE_DR5] = dr_interception,
2693 [SVM_EXIT_WRITE_DR6] = emulate_on_interception, 3156 [SVM_EXIT_WRITE_DR6] = dr_interception,
2694 [SVM_EXIT_WRITE_DR7] = emulate_on_interception, 3157 [SVM_EXIT_WRITE_DR7] = dr_interception,
2695 [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception, 3158 [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception,
2696 [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception, 3159 [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception,
2697 [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception, 3160 [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception,
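These table entries feed the dispatch in handle_exit(), which indexes svm_exit_handlers[] by exit code after a bounds and NULL check and reports anything unknown back to userspace. A compact stand-alone model of that dispatch pattern (the exit codes and handler names here are made up for illustration):

#include <stdio.h>

struct toy_vcpu { int dummy; };

static int handle_cr(struct toy_vcpu *v)  { (void)v; puts("cr access");  return 1; }
static int handle_npf(struct toy_vcpu *v) { (void)v; puts("nested #PF"); return 1; }

static int (*const handlers[])(struct toy_vcpu *) = {
	[0x00] = handle_cr,		/* illustrative exit codes only */
	[0x40] = handle_npf,
};

static int dispatch(struct toy_vcpu *v, unsigned int exit_code)
{
	if (exit_code >= sizeof(handlers) / sizeof(handlers[0]) ||
	    !handlers[exit_code]) {
		fprintf(stderr, "unknown exit 0x%x\n", exit_code);
		return 0;		/* hand the problem to userspace */
	}
	return handlers[exit_code](v);	/* 1 = keep running the guest */
}

int main(void)
{
	struct toy_vcpu v = { 0 };
	return !dispatch(&v, 0x40);
}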
@@ -2724,100 +3187,121 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
2724 [SVM_EXIT_WBINVD] = emulate_on_interception, 3187 [SVM_EXIT_WBINVD] = emulate_on_interception,
2725 [SVM_EXIT_MONITOR] = invalid_op_interception, 3188 [SVM_EXIT_MONITOR] = invalid_op_interception,
2726 [SVM_EXIT_MWAIT] = invalid_op_interception, 3189 [SVM_EXIT_MWAIT] = invalid_op_interception,
3190 [SVM_EXIT_XSETBV] = xsetbv_interception,
2727 [SVM_EXIT_NPF] = pf_interception, 3191 [SVM_EXIT_NPF] = pf_interception,
2728}; 3192};
2729 3193
2730void dump_vmcb(struct kvm_vcpu *vcpu) 3194static void dump_vmcb(struct kvm_vcpu *vcpu)
2731{ 3195{
2732 struct vcpu_svm *svm = to_svm(vcpu); 3196 struct vcpu_svm *svm = to_svm(vcpu);
2733 struct vmcb_control_area *control = &svm->vmcb->control; 3197 struct vmcb_control_area *control = &svm->vmcb->control;
2734 struct vmcb_save_area *save = &svm->vmcb->save; 3198 struct vmcb_save_area *save = &svm->vmcb->save;
2735 3199
2736 pr_err("VMCB Control Area:\n"); 3200 pr_err("VMCB Control Area:\n");
2737 pr_err("cr_read: %04x\n", control->intercept_cr_read); 3201 pr_err("%-20s%04x\n", "cr_read:", control->intercept_cr & 0xffff);
2738 pr_err("cr_write: %04x\n", control->intercept_cr_write); 3202 pr_err("%-20s%04x\n", "cr_write:", control->intercept_cr >> 16);
2739 pr_err("dr_read: %04x\n", control->intercept_dr_read); 3203 pr_err("%-20s%04x\n", "dr_read:", control->intercept_dr & 0xffff);
2740 pr_err("dr_write: %04x\n", control->intercept_dr_write); 3204 pr_err("%-20s%04x\n", "dr_write:", control->intercept_dr >> 16);
2741 pr_err("exceptions: %08x\n", control->intercept_exceptions); 3205 pr_err("%-20s%08x\n", "exceptions:", control->intercept_exceptions);
2742 pr_err("intercepts: %016llx\n", control->intercept); 3206 pr_err("%-20s%016llx\n", "intercepts:", control->intercept);
2743 pr_err("pause filter count: %d\n", control->pause_filter_count); 3207 pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count);
2744 pr_err("iopm_base_pa: %016llx\n", control->iopm_base_pa); 3208 pr_err("%-20s%016llx\n", "iopm_base_pa:", control->iopm_base_pa);
2745 pr_err("msrpm_base_pa: %016llx\n", control->msrpm_base_pa); 3209 pr_err("%-20s%016llx\n", "msrpm_base_pa:", control->msrpm_base_pa);
2746 pr_err("tsc_offset: %016llx\n", control->tsc_offset); 3210 pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset);
2747 pr_err("asid: %d\n", control->asid); 3211 pr_err("%-20s%d\n", "asid:", control->asid);
2748 pr_err("tlb_ctl: %d\n", control->tlb_ctl); 3212 pr_err("%-20s%d\n", "tlb_ctl:", control->tlb_ctl);
2749 pr_err("int_ctl: %08x\n", control->int_ctl); 3213 pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl);
2750 pr_err("int_vector: %08x\n", control->int_vector); 3214 pr_err("%-20s%08x\n", "int_vector:", control->int_vector);
2751 pr_err("int_state: %08x\n", control->int_state); 3215 pr_err("%-20s%08x\n", "int_state:", control->int_state);
2752 pr_err("exit_code: %08x\n", control->exit_code); 3216 pr_err("%-20s%08x\n", "exit_code:", control->exit_code);
2753 pr_err("exit_info1: %016llx\n", control->exit_info_1); 3217 pr_err("%-20s%016llx\n", "exit_info1:", control->exit_info_1);
2754 pr_err("exit_info2: %016llx\n", control->exit_info_2); 3218 pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2);
2755 pr_err("exit_int_info: %08x\n", control->exit_int_info); 3219 pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info);
2756 pr_err("exit_int_info_err: %08x\n", control->exit_int_info_err); 3220 pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err);
2757 pr_err("nested_ctl: %lld\n", control->nested_ctl); 3221 pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl);
2758 pr_err("nested_cr3: %016llx\n", control->nested_cr3); 3222 pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3);
2759 pr_err("event_inj: %08x\n", control->event_inj); 3223 pr_err("%-20s%08x\n", "event_inj:", control->event_inj);
2760 pr_err("event_inj_err: %08x\n", control->event_inj_err); 3224 pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err);
2761 pr_err("lbr_ctl: %lld\n", control->lbr_ctl); 3225 pr_err("%-20s%lld\n", "lbr_ctl:", control->lbr_ctl);
2762 pr_err("next_rip: %016llx\n", control->next_rip); 3226 pr_err("%-20s%016llx\n", "next_rip:", control->next_rip);
2763 pr_err("VMCB State Save Area:\n"); 3227 pr_err("VMCB State Save Area:\n");
2764 pr_err("es: s: %04x a: %04x l: %08x b: %016llx\n", 3228 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
2765 save->es.selector, save->es.attrib, 3229 "es:",
2766 save->es.limit, save->es.base); 3230 save->es.selector, save->es.attrib,
2767 pr_err("cs: s: %04x a: %04x l: %08x b: %016llx\n", 3231 save->es.limit, save->es.base);
2768 save->cs.selector, save->cs.attrib, 3232 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
2769 save->cs.limit, save->cs.base); 3233 "cs:",
2770 pr_err("ss: s: %04x a: %04x l: %08x b: %016llx\n", 3234 save->cs.selector, save->cs.attrib,
2771 save->ss.selector, save->ss.attrib, 3235 save->cs.limit, save->cs.base);
2772 save->ss.limit, save->ss.base); 3236 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
2773 pr_err("ds: s: %04x a: %04x l: %08x b: %016llx\n", 3237 "ss:",
2774 save->ds.selector, save->ds.attrib, 3238 save->ss.selector, save->ss.attrib,
2775 save->ds.limit, save->ds.base); 3239 save->ss.limit, save->ss.base);
2776 pr_err("fs: s: %04x a: %04x l: %08x b: %016llx\n", 3240 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
2777 save->fs.selector, save->fs.attrib, 3241 "ds:",
2778 save->fs.limit, save->fs.base); 3242 save->ds.selector, save->ds.attrib,
2779 pr_err("gs: s: %04x a: %04x l: %08x b: %016llx\n", 3243 save->ds.limit, save->ds.base);
2780 save->gs.selector, save->gs.attrib, 3244 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
2781 save->gs.limit, save->gs.base); 3245 "fs:",
2782 pr_err("gdtr: s: %04x a: %04x l: %08x b: %016llx\n", 3246 save->fs.selector, save->fs.attrib,
2783 save->gdtr.selector, save->gdtr.attrib, 3247 save->fs.limit, save->fs.base);
2784 save->gdtr.limit, save->gdtr.base); 3248 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
2785 pr_err("ldtr: s: %04x a: %04x l: %08x b: %016llx\n", 3249 "gs:",
2786 save->ldtr.selector, save->ldtr.attrib, 3250 save->gs.selector, save->gs.attrib,
2787 save->ldtr.limit, save->ldtr.base); 3251 save->gs.limit, save->gs.base);
2788 pr_err("idtr: s: %04x a: %04x l: %08x b: %016llx\n", 3252 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
2789 save->idtr.selector, save->idtr.attrib, 3253 "gdtr:",
2790 save->idtr.limit, save->idtr.base); 3254 save->gdtr.selector, save->gdtr.attrib,
2791 pr_err("tr: s: %04x a: %04x l: %08x b: %016llx\n", 3255 save->gdtr.limit, save->gdtr.base);
2792 save->tr.selector, save->tr.attrib, 3256 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
2793 save->tr.limit, save->tr.base); 3257 "ldtr:",
3258 save->ldtr.selector, save->ldtr.attrib,
3259 save->ldtr.limit, save->ldtr.base);
3260 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3261 "idtr:",
3262 save->idtr.selector, save->idtr.attrib,
3263 save->idtr.limit, save->idtr.base);
3264 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3265 "tr:",
3266 save->tr.selector, save->tr.attrib,
3267 save->tr.limit, save->tr.base);
2794 pr_err("cpl: %d efer: %016llx\n", 3268 pr_err("cpl: %d efer: %016llx\n",
2795 save->cpl, save->efer); 3269 save->cpl, save->efer);
2796 pr_err("cr0: %016llx cr2: %016llx\n", 3270 pr_err("%-15s %016llx %-13s %016llx\n",
2797 save->cr0, save->cr2); 3271 "cr0:", save->cr0, "cr2:", save->cr2);
2798 pr_err("cr3: %016llx cr4: %016llx\n", 3272 pr_err("%-15s %016llx %-13s %016llx\n",
2799 save->cr3, save->cr4); 3273 "cr3:", save->cr3, "cr4:", save->cr4);
2800 pr_err("dr6: %016llx dr7: %016llx\n", 3274 pr_err("%-15s %016llx %-13s %016llx\n",
2801 save->dr6, save->dr7); 3275 "dr6:", save->dr6, "dr7:", save->dr7);
2802 pr_err("rip: %016llx rflags: %016llx\n", 3276 pr_err("%-15s %016llx %-13s %016llx\n",
2803 save->rip, save->rflags); 3277 "rip:", save->rip, "rflags:", save->rflags);
2804 pr_err("rsp: %016llx rax: %016llx\n", 3278 pr_err("%-15s %016llx %-13s %016llx\n",
2805 save->rsp, save->rax); 3279 "rsp:", save->rsp, "rax:", save->rax);
2806 pr_err("star: %016llx lstar: %016llx\n", 3280 pr_err("%-15s %016llx %-13s %016llx\n",
2807 save->star, save->lstar); 3281 "star:", save->star, "lstar:", save->lstar);
2808 pr_err("cstar: %016llx sfmask: %016llx\n", 3282 pr_err("%-15s %016llx %-13s %016llx\n",
2809 save->cstar, save->sfmask); 3283 "cstar:", save->cstar, "sfmask:", save->sfmask);
2810 pr_err("kernel_gs_base: %016llx sysenter_cs: %016llx\n", 3284 pr_err("%-15s %016llx %-13s %016llx\n",
2811 save->kernel_gs_base, save->sysenter_cs); 3285 "kernel_gs_base:", save->kernel_gs_base,
2812 pr_err("sysenter_esp: %016llx sysenter_eip: %016llx\n", 3286 "sysenter_cs:", save->sysenter_cs);
2813 save->sysenter_esp, save->sysenter_eip); 3287 pr_err("%-15s %016llx %-13s %016llx\n",
2814 pr_err("gpat: %016llx dbgctl: %016llx\n", 3288 "sysenter_esp:", save->sysenter_esp,
2815 save->g_pat, save->dbgctl); 3289 "sysenter_eip:", save->sysenter_eip);
2816 pr_err("br_from: %016llx br_to: %016llx\n", 3290 pr_err("%-15s %016llx %-13s %016llx\n",
2817 save->br_from, save->br_to); 3291 "gpat:", save->g_pat, "dbgctl:", save->dbgctl);
2818 pr_err("excp_from: %016llx excp_to: %016llx\n", 3292 pr_err("%-15s %016llx %-13s %016llx\n",
2819 save->last_excp_from, save->last_excp_to); 3293 "br_from:", save->br_from, "br_to:", save->br_to);
2820 3294 pr_err("%-15s %016llx %-13s %016llx\n",
3295 "excp_from:", save->last_excp_from,
3296 "excp_to:", save->last_excp_to);
3297}
3298
3299static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
3300{
3301 struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
3302
3303 *info1 = control->exit_info_1;
3304 *info2 = control->exit_info_2;
2821} 3305}
2822 3306
2823static int handle_exit(struct kvm_vcpu *vcpu) 3307static int handle_exit(struct kvm_vcpu *vcpu)
@@ -2826,9 +3310,9 @@ static int handle_exit(struct kvm_vcpu *vcpu)
2826 struct kvm_run *kvm_run = vcpu->run; 3310 struct kvm_run *kvm_run = vcpu->run;
2827 u32 exit_code = svm->vmcb->control.exit_code; 3311 u32 exit_code = svm->vmcb->control.exit_code;
2828 3312
2829 trace_kvm_exit(exit_code, vcpu); 3313 trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM);
2830 3314
2831 if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR0_MASK)) 3315 if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE))
2832 vcpu->arch.cr0 = svm->vmcb->save.cr0; 3316 vcpu->arch.cr0 = svm->vmcb->save.cr0;
2833 if (npt_enabled) 3317 if (npt_enabled)
2834 vcpu->arch.cr3 = svm->vmcb->save.cr3; 3318 vcpu->arch.cr3 = svm->vmcb->save.cr3;
@@ -2840,7 +3324,7 @@ static int handle_exit(struct kvm_vcpu *vcpu)
2840 return 1; 3324 return 1;
2841 } 3325 }
2842 3326
2843 if (is_nested(svm)) { 3327 if (is_guest_mode(vcpu)) {
2844 int vmexit; 3328 int vmexit;
2845 3329
2846 trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code, 3330 trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code,
@@ -2871,7 +3355,8 @@ static int handle_exit(struct kvm_vcpu *vcpu)
2871 3355
2872 if (is_external_interrupt(svm->vmcb->control.exit_int_info) && 3356 if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
2873 exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR && 3357 exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR &&
2874 exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH) 3358 exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH &&
3359 exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI)
 2875 printk(KERN_ERR "%s: unexpected exit_int_info 0x%x " 3360 printk(KERN_ERR "%s: unexpected exit_int_info 0x%x "
2876 "exit_code 0x%x\n", 3361 "exit_code 0x%x\n",
2877 __func__, svm->vmcb->control.exit_int_info, 3362 __func__, svm->vmcb->control.exit_int_info,
@@ -2902,7 +3387,6 @@ static void pre_svm_run(struct vcpu_svm *svm)
2902 3387
2903 struct svm_cpu_data *sd = per_cpu(svm_data, cpu); 3388 struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
2904 3389
2905 svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
2906 /* FIXME: handle wraparound of asid_generation */ 3390 /* FIXME: handle wraparound of asid_generation */
2907 if (svm->asid_generation != sd->asid_generation) 3391 if (svm->asid_generation != sd->asid_generation)
2908 new_asid(svm, sd); 3392 new_asid(svm, sd);
@@ -2914,7 +3398,7 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu)
2914 3398
2915 svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI; 3399 svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
2916 vcpu->arch.hflags |= HF_NMI_MASK; 3400 vcpu->arch.hflags |= HF_NMI_MASK;
2917 svm->vmcb->control.intercept |= (1ULL << INTERCEPT_IRET); 3401 set_intercept(svm, INTERCEPT_IRET);
2918 ++vcpu->stat.nmi_injections; 3402 ++vcpu->stat.nmi_injections;
2919} 3403}
2920 3404
@@ -2927,6 +3411,7 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
2927 control->int_ctl &= ~V_INTR_PRIO_MASK; 3411 control->int_ctl &= ~V_INTR_PRIO_MASK;
2928 control->int_ctl |= V_IRQ_MASK | 3412 control->int_ctl |= V_IRQ_MASK |
2929 ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT); 3413 ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
3414 mark_dirty(svm->vmcb, VMCB_INTR);
2930} 3415}
2931 3416
2932static void svm_set_irq(struct kvm_vcpu *vcpu) 3417static void svm_set_irq(struct kvm_vcpu *vcpu)
@@ -2946,14 +3431,14 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
2946{ 3431{
2947 struct vcpu_svm *svm = to_svm(vcpu); 3432 struct vcpu_svm *svm = to_svm(vcpu);
2948 3433
2949 if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK)) 3434 if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))
2950 return; 3435 return;
2951 3436
2952 if (irr == -1) 3437 if (irr == -1)
2953 return; 3438 return;
2954 3439
2955 if (tpr >= irr) 3440 if (tpr >= irr)
2956 svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR8_MASK; 3441 set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
2957} 3442}
2958 3443
2959static int svm_nmi_allowed(struct kvm_vcpu *vcpu) 3444static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
@@ -2981,10 +3466,10 @@ static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
2981 3466
2982 if (masked) { 3467 if (masked) {
2983 svm->vcpu.arch.hflags |= HF_NMI_MASK; 3468 svm->vcpu.arch.hflags |= HF_NMI_MASK;
2984 svm->vmcb->control.intercept |= (1ULL << INTERCEPT_IRET); 3469 set_intercept(svm, INTERCEPT_IRET);
2985 } else { 3470 } else {
2986 svm->vcpu.arch.hflags &= ~HF_NMI_MASK; 3471 svm->vcpu.arch.hflags &= ~HF_NMI_MASK;
2987 svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_IRET); 3472 clr_intercept(svm, INTERCEPT_IRET);
2988 } 3473 }
2989} 3474}
2990 3475
@@ -2998,9 +3483,9 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
2998 (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)) 3483 (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK))
2999 return 0; 3484 return 0;
3000 3485
3001 ret = !!(vmcb->save.rflags & X86_EFLAGS_IF); 3486 ret = !!(kvm_get_rflags(vcpu) & X86_EFLAGS_IF);
3002 3487
3003 if (is_nested(svm)) 3488 if (is_guest_mode(vcpu))
3004 return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK); 3489 return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK);
3005 3490
3006 return ret; 3491 return ret;
@@ -3046,7 +3531,12 @@ static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
3046 3531
3047static void svm_flush_tlb(struct kvm_vcpu *vcpu) 3532static void svm_flush_tlb(struct kvm_vcpu *vcpu)
3048{ 3533{
3049 force_new_asid(vcpu); 3534 struct vcpu_svm *svm = to_svm(vcpu);
3535
3536 if (static_cpu_has(X86_FEATURE_FLUSHBYASID))
3537 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
3538 else
3539 svm->asid_generation--;
3050} 3540}
3051 3541
3052static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu) 3542static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
@@ -3057,10 +3547,10 @@ static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
3057{ 3547{
3058 struct vcpu_svm *svm = to_svm(vcpu); 3548 struct vcpu_svm *svm = to_svm(vcpu);
3059 3549
3060 if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK)) 3550 if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))
3061 return; 3551 return;
3062 3552
3063 if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR8_MASK)) { 3553 if (!is_cr_intercept(svm, INTERCEPT_CR8_WRITE)) {
3064 int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK; 3554 int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
3065 kvm_set_cr8(vcpu, cr8); 3555 kvm_set_cr8(vcpu, cr8);
3066 } 3556 }
@@ -3071,7 +3561,7 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
3071 struct vcpu_svm *svm = to_svm(vcpu); 3561 struct vcpu_svm *svm = to_svm(vcpu);
3072 u64 cr8; 3562 u64 cr8;
3073 3563
3074 if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK)) 3564 if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))
3075 return; 3565 return;
3076 3566
3077 cr8 = kvm_get_cr8(vcpu); 3567 cr8 = kvm_get_cr8(vcpu);
@@ -3088,8 +3578,15 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
3088 3578
3089 svm->int3_injected = 0; 3579 svm->int3_injected = 0;
3090 3580
3091 if (svm->vcpu.arch.hflags & HF_IRET_MASK) 3581 /*
3582 * If we've made progress since setting HF_IRET_MASK, we've
3583 * executed an IRET and can allow NMI injection.
3584 */
3585 if ((svm->vcpu.arch.hflags & HF_IRET_MASK)
3586 && kvm_rip_read(&svm->vcpu) != svm->nmi_iret_rip) {
3092 svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK); 3587 svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
3588 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
3589 }
3093 3590
3094 svm->vcpu.arch.nmi_injected = false; 3591 svm->vcpu.arch.nmi_injected = false;
3095 kvm_clear_exception_queue(&svm->vcpu); 3592 kvm_clear_exception_queue(&svm->vcpu);
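The new nmi_iret_rip bookkeeping closes a gap: the IRET intercept fires before the IRET actually executes, so HF_IRET_MASK alone does not prove the guest has left its NMI handler. The patch records RIP at the intercept and unmasks NMIs only once RIP has moved past that point. A simplified sketch of the two halves of that handshake (types and flag names are stand-ins for the hflags machinery):

#include <stdint.h>
#include <stdbool.h>

struct toy_vcpu {
	bool nmi_masked, iret_seen;
	uint64_t rip, nmi_iret_rip;
};

/* IRET intercept handler: remember where the guest was stopped. */
void on_iret_intercept(struct toy_vcpu *v)
{
	v->iret_seen = true;
	v->nmi_iret_rip = v->rip;
}

/* After the next exit: only unmask NMIs if the guest made real progress. */
void complete_interrupts(struct toy_vcpu *v)
{
	if (v->iret_seen && v->rip != v->nmi_iret_rip) {
		v->nmi_masked = false;
		v->iret_seen = false;
	}
}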
@@ -3098,6 +3595,8 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
3098 if (!(exitintinfo & SVM_EXITINTINFO_VALID)) 3595 if (!(exitintinfo & SVM_EXITINTINFO_VALID))
3099 return; 3596 return;
3100 3597
3598 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
3599
3101 vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK; 3600 vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
3102 type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK; 3601 type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;
3103 3602
@@ -3134,6 +3633,17 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
3134 } 3633 }
3135} 3634}
3136 3635
3636static void svm_cancel_injection(struct kvm_vcpu *vcpu)
3637{
3638 struct vcpu_svm *svm = to_svm(vcpu);
3639 struct vmcb_control_area *control = &svm->vmcb->control;
3640
3641 control->exit_int_info = control->event_inj;
3642 control->exit_int_info_err = control->event_inj_err;
3643 control->event_inj = 0;
3644 svm_complete_interrupts(svm);
3645}
3646
3137#ifdef CONFIG_X86_64 3647#ifdef CONFIG_X86_64
3138#define R "r" 3648#define R "r"
3139#else 3649#else
@@ -3143,9 +3653,6 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
3143static void svm_vcpu_run(struct kvm_vcpu *vcpu) 3653static void svm_vcpu_run(struct kvm_vcpu *vcpu)
3144{ 3654{
3145 struct vcpu_svm *svm = to_svm(vcpu); 3655 struct vcpu_svm *svm = to_svm(vcpu);
3146 u16 fs_selector;
3147 u16 gs_selector;
3148 u16 ldt_selector;
3149 3656
3150 svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; 3657 svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
3151 svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; 3658 svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
@@ -3162,14 +3669,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
3162 3669
3163 sync_lapic_to_cr8(vcpu); 3670 sync_lapic_to_cr8(vcpu);
3164 3671
3165 save_host_msrs(vcpu);
3166 savesegment(fs, fs_selector);
3167 savesegment(gs, gs_selector);
3168 ldt_selector = kvm_read_ldt();
3169 svm->vmcb->save.cr2 = vcpu->arch.cr2; 3672 svm->vmcb->save.cr2 = vcpu->arch.cr2;
3170 /* required for live migration with NPT */
3171 if (npt_enabled)
3172 svm->vmcb->save.cr3 = vcpu->arch.cr3;
3173 3673
3174 clgi(); 3674 clgi();
3175 3675
@@ -3246,31 +3746,44 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
3246#endif 3746#endif
3247 ); 3747 );
3248 3748
3249 vcpu->arch.cr2 = svm->vmcb->save.cr2;
3250 vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
3251 vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
3252 vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
3253
3254 load_host_msrs(vcpu);
3255 loadsegment(fs, fs_selector);
3256#ifdef CONFIG_X86_64 3749#ifdef CONFIG_X86_64
3257 load_gs_index(gs_selector); 3750 wrmsrl(MSR_GS_BASE, svm->host.gs_base);
3258 wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs);
3259#else 3751#else
3260 loadsegment(gs, gs_selector); 3752 loadsegment(fs, svm->host.fs);
3753#ifndef CONFIG_X86_32_LAZY_GS
3754 loadsegment(gs, svm->host.gs);
3755#endif
3261#endif 3756#endif
3262 kvm_load_ldt(ldt_selector);
3263 3757
3264 reload_tss(vcpu); 3758 reload_tss(vcpu);
3265 3759
3266 local_irq_disable(); 3760 local_irq_disable();
3267 3761
3762 vcpu->arch.cr2 = svm->vmcb->save.cr2;
3763 vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
3764 vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
3765 vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
3766
3767 if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
3768 kvm_before_handle_nmi(&svm->vcpu);
3769
3268 stgi(); 3770 stgi();
3269 3771
3772 /* Any pending NMI will happen here */
3773
3774 if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
3775 kvm_after_handle_nmi(&svm->vcpu);
3776
3270 sync_cr8_to_lapic(vcpu); 3777 sync_cr8_to_lapic(vcpu);
3271 3778
3272 svm->next_rip = 0; 3779 svm->next_rip = 0;
3273 3780
3781 svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
3782
3783 /* if exit due to PF check for async PF */
3784 if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
3785 svm->apf_reason = kvm_read_and_reset_pf_reason();
3786
3274 if (npt_enabled) { 3787 if (npt_enabled) {
3275 vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR); 3788 vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR);
3276 vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR); 3789 vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR);
@@ -3283,6 +3796,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
3283 if (unlikely(svm->vmcb->control.exit_code == 3796 if (unlikely(svm->vmcb->control.exit_code ==
3284 SVM_EXIT_EXCP_BASE + MC_VECTOR)) 3797 SVM_EXIT_EXCP_BASE + MC_VECTOR))
3285 svm_handle_mce(svm); 3798 svm_handle_mce(svm);
3799
3800 mark_all_clean(svm->vmcb);
3286} 3801}
3287 3802
3288#undef R 3803#undef R
@@ -3291,14 +3806,23 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
3291{ 3806{
3292 struct vcpu_svm *svm = to_svm(vcpu); 3807 struct vcpu_svm *svm = to_svm(vcpu);
3293 3808
3294 if (npt_enabled) {
3295 svm->vmcb->control.nested_cr3 = root;
3296 force_new_asid(vcpu);
3297 return;
3298 }
3299
3300 svm->vmcb->save.cr3 = root; 3809 svm->vmcb->save.cr3 = root;
3301 force_new_asid(vcpu); 3810 mark_dirty(svm->vmcb, VMCB_CR);
3811 svm_flush_tlb(vcpu);
3812}
3813
3814static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root)
3815{
3816 struct vcpu_svm *svm = to_svm(vcpu);
3817
3818 svm->vmcb->control.nested_cr3 = root;
3819 mark_dirty(svm->vmcb, VMCB_NPT);
3820
3821 /* Also sync guest cr3 here in case we live migrate */
3822 svm->vmcb->save.cr3 = kvm_read_cr3(vcpu);
3823 mark_dirty(svm->vmcb, VMCB_CR);
3824
3825 svm_flush_tlb(vcpu);
3302} 3826}
3303 3827
3304static int is_disabled(void) 3828static int is_disabled(void)
@@ -3333,15 +3857,6 @@ static bool svm_cpu_has_accelerated_tpr(void)
3333 return false; 3857 return false;
3334} 3858}
3335 3859
3336static int get_npt_level(void)
3337{
3338#ifdef CONFIG_X86_64
3339 return PT64_ROOT_LEVEL;
3340#else
3341 return PT32E_ROOT_LEVEL;
3342#endif
3343}
3344
3345static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) 3860static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
3346{ 3861{
3347 return 0; 3862 return 0;
@@ -3354,12 +3869,25 @@ static void svm_cpuid_update(struct kvm_vcpu *vcpu)
3354static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) 3869static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
3355{ 3870{
3356 switch (func) { 3871 switch (func) {
3872 case 0x80000001:
3873 if (nested)
3874 entry->ecx |= (1 << 2); /* Set SVM bit */
3875 break;
3357 case 0x8000000A: 3876 case 0x8000000A:
3358 entry->eax = 1; /* SVM revision 1 */ 3877 entry->eax = 1; /* SVM revision 1 */
 3359 entry->ebx = 8; /* Let's support 8 ASIDs in case we add proper 3878 entry->ebx = 8; /* Let's support 8 ASIDs in case we add proper
3360 ASID emulation to nested SVM */ 3879 ASID emulation to nested SVM */
3361 entry->ecx = 0; /* Reserved */ 3880 entry->ecx = 0; /* Reserved */
 3362 entry->edx = 0; /* Do not support any additional features */ 3881 entry->edx = 0; /* By default do not support any
3882 additional features */
3883
3884 /* Support next_rip if host supports it */
3885 if (boot_cpu_has(X86_FEATURE_NRIPS))
3886 entry->edx |= SVM_FEATURE_NRIP;
3887
3888 /* Support NPT for the guest if enabled */
3889 if (npt_enabled)
3890 entry->edx |= SVM_FEATURE_NPT;
3363 3891
3364 break; 3892 break;
3365 } 3893 }
@@ -3414,6 +3942,7 @@ static const struct trace_print_flags svm_exit_reasons_str[] = {
3414 { SVM_EXIT_WBINVD, "wbinvd" }, 3942 { SVM_EXIT_WBINVD, "wbinvd" },
3415 { SVM_EXIT_MONITOR, "monitor" }, 3943 { SVM_EXIT_MONITOR, "monitor" },
3416 { SVM_EXIT_MWAIT, "mwait" }, 3944 { SVM_EXIT_MWAIT, "mwait" },
3945 { SVM_EXIT_XSETBV, "xsetbv" },
3417 { SVM_EXIT_NPF, "npf" }, 3946 { SVM_EXIT_NPF, "npf" },
3418 { -1, NULL } 3947 { -1, NULL }
3419}; 3948};
@@ -3437,12 +3966,190 @@ static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
3437{ 3966{
3438 struct vcpu_svm *svm = to_svm(vcpu); 3967 struct vcpu_svm *svm = to_svm(vcpu);
3439 3968
3440 svm->vmcb->control.intercept_exceptions |= 1 << NM_VECTOR; 3969 set_exception_intercept(svm, NM_VECTOR);
3441 if (is_nested(svm))
3442 svm->nested.hsave->control.intercept_exceptions |= 1 << NM_VECTOR;
3443 update_cr0_intercept(svm); 3970 update_cr0_intercept(svm);
3444} 3971}
3445 3972
3973#define PRE_EX(exit) { .exit_code = (exit), \
3974 .stage = X86_ICPT_PRE_EXCEPT, }
3975#define POST_EX(exit) { .exit_code = (exit), \
3976 .stage = X86_ICPT_POST_EXCEPT, }
3977#define POST_MEM(exit) { .exit_code = (exit), \
3978 .stage = X86_ICPT_POST_MEMACCESS, }
3979
3980static struct __x86_intercept {
3981 u32 exit_code;
3982 enum x86_intercept_stage stage;
3983} x86_intercept_map[] = {
3984 [x86_intercept_cr_read] = POST_EX(SVM_EXIT_READ_CR0),
3985 [x86_intercept_cr_write] = POST_EX(SVM_EXIT_WRITE_CR0),
3986 [x86_intercept_clts] = POST_EX(SVM_EXIT_WRITE_CR0),
3987 [x86_intercept_lmsw] = POST_EX(SVM_EXIT_WRITE_CR0),
3988 [x86_intercept_smsw] = POST_EX(SVM_EXIT_READ_CR0),
3989 [x86_intercept_dr_read] = POST_EX(SVM_EXIT_READ_DR0),
3990 [x86_intercept_dr_write] = POST_EX(SVM_EXIT_WRITE_DR0),
3991 [x86_intercept_sldt] = POST_EX(SVM_EXIT_LDTR_READ),
3992 [x86_intercept_str] = POST_EX(SVM_EXIT_TR_READ),
3993 [x86_intercept_lldt] = POST_EX(SVM_EXIT_LDTR_WRITE),
3994 [x86_intercept_ltr] = POST_EX(SVM_EXIT_TR_WRITE),
3995 [x86_intercept_sgdt] = POST_EX(SVM_EXIT_GDTR_READ),
3996 [x86_intercept_sidt] = POST_EX(SVM_EXIT_IDTR_READ),
3997 [x86_intercept_lgdt] = POST_EX(SVM_EXIT_GDTR_WRITE),
3998 [x86_intercept_lidt] = POST_EX(SVM_EXIT_IDTR_WRITE),
3999 [x86_intercept_vmrun] = POST_EX(SVM_EXIT_VMRUN),
4000 [x86_intercept_vmmcall] = POST_EX(SVM_EXIT_VMMCALL),
4001 [x86_intercept_vmload] = POST_EX(SVM_EXIT_VMLOAD),
4002 [x86_intercept_vmsave] = POST_EX(SVM_EXIT_VMSAVE),
4003 [x86_intercept_stgi] = POST_EX(SVM_EXIT_STGI),
4004 [x86_intercept_clgi] = POST_EX(SVM_EXIT_CLGI),
4005 [x86_intercept_skinit] = POST_EX(SVM_EXIT_SKINIT),
4006 [x86_intercept_invlpga] = POST_EX(SVM_EXIT_INVLPGA),
4007 [x86_intercept_rdtscp] = POST_EX(SVM_EXIT_RDTSCP),
4008 [x86_intercept_monitor] = POST_MEM(SVM_EXIT_MONITOR),
4009 [x86_intercept_mwait] = POST_EX(SVM_EXIT_MWAIT),
4010 [x86_intercept_invlpg] = POST_EX(SVM_EXIT_INVLPG),
4011 [x86_intercept_invd] = POST_EX(SVM_EXIT_INVD),
4012 [x86_intercept_wbinvd] = POST_EX(SVM_EXIT_WBINVD),
4013 [x86_intercept_wrmsr] = POST_EX(SVM_EXIT_MSR),
4014 [x86_intercept_rdtsc] = POST_EX(SVM_EXIT_RDTSC),
4015 [x86_intercept_rdmsr] = POST_EX(SVM_EXIT_MSR),
4016 [x86_intercept_rdpmc] = POST_EX(SVM_EXIT_RDPMC),
4017 [x86_intercept_cpuid] = PRE_EX(SVM_EXIT_CPUID),
4018 [x86_intercept_rsm] = PRE_EX(SVM_EXIT_RSM),
4019 [x86_intercept_pause] = PRE_EX(SVM_EXIT_PAUSE),
4020 [x86_intercept_pushf] = PRE_EX(SVM_EXIT_PUSHF),
4021 [x86_intercept_popf] = PRE_EX(SVM_EXIT_POPF),
4022 [x86_intercept_intn] = PRE_EX(SVM_EXIT_SWINT),
4023 [x86_intercept_iret] = PRE_EX(SVM_EXIT_IRET),
4024 [x86_intercept_icebp] = PRE_EX(SVM_EXIT_ICEBP),
4025 [x86_intercept_hlt] = POST_EX(SVM_EXIT_HLT),
4026 [x86_intercept_in] = POST_EX(SVM_EXIT_IOIO),
4027 [x86_intercept_ins] = POST_EX(SVM_EXIT_IOIO),
4028 [x86_intercept_out] = POST_EX(SVM_EXIT_IOIO),
4029 [x86_intercept_outs] = POST_EX(SVM_EXIT_IOIO),
4030};
4031
4032#undef PRE_EX
4033#undef POST_EX
4034#undef POST_MEM
4035
4036static int svm_check_intercept(struct kvm_vcpu *vcpu,
4037 struct x86_instruction_info *info,
4038 enum x86_intercept_stage stage)
4039{
4040 struct vcpu_svm *svm = to_svm(vcpu);
4041 int vmexit, ret = X86EMUL_CONTINUE;
4042 struct __x86_intercept icpt_info;
4043 struct vmcb *vmcb = svm->vmcb;
4044
4045 if (info->intercept >= ARRAY_SIZE(x86_intercept_map))
4046 goto out;
4047
4048 icpt_info = x86_intercept_map[info->intercept];
4049
4050 if (stage != icpt_info.stage)
4051 goto out;
4052
4053 switch (icpt_info.exit_code) {
4054 case SVM_EXIT_READ_CR0:
4055 if (info->intercept == x86_intercept_cr_read)
4056 icpt_info.exit_code += info->modrm_reg;
4057 break;
4058 case SVM_EXIT_WRITE_CR0: {
4059 unsigned long cr0, val;
4060 u64 intercept;
4061
4062 if (info->intercept == x86_intercept_cr_write)
4063 icpt_info.exit_code += info->modrm_reg;
4064
4065 if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0)
4066 break;
4067
4068 intercept = svm->nested.intercept;
4069
4070 if (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0)))
4071 break;
4072
4073 cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK;
4074 val = info->src_val & ~SVM_CR0_SELECTIVE_MASK;
4075
4076 if (info->intercept == x86_intercept_lmsw) {
4077 cr0 &= 0xfUL;
4078 val &= 0xfUL;
4079 /* lmsw can't clear PE - catch this here */
4080 if (cr0 & X86_CR0_PE)
4081 val |= X86_CR0_PE;
4082 }
4083
4084 if (cr0 ^ val)
4085 icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE;
4086
4087 break;
4088 }
4089 case SVM_EXIT_READ_DR0:
4090 case SVM_EXIT_WRITE_DR0:
4091 icpt_info.exit_code += info->modrm_reg;
4092 break;
4093 case SVM_EXIT_MSR:
4094 if (info->intercept == x86_intercept_wrmsr)
4095 vmcb->control.exit_info_1 = 1;
4096 else
4097 vmcb->control.exit_info_1 = 0;
4098 break;
4099 case SVM_EXIT_PAUSE:
4100 /*
4101 * We get this for NOP only, but pause
 4102 * is rep nop, so check the rep prefix here
4103 */
4104 if (info->rep_prefix != REPE_PREFIX)
4105 goto out;
4106 case SVM_EXIT_IOIO: {
4107 u64 exit_info;
4108 u32 bytes;
4109
4110 exit_info = (vcpu->arch.regs[VCPU_REGS_RDX] & 0xffff) << 16;
4111
4112 if (info->intercept == x86_intercept_in ||
4113 info->intercept == x86_intercept_ins) {
4114 exit_info |= SVM_IOIO_TYPE_MASK;
4115 bytes = info->src_bytes;
4116 } else {
4117 bytes = info->dst_bytes;
4118 }
4119
4120 if (info->intercept == x86_intercept_outs ||
4121 info->intercept == x86_intercept_ins)
4122 exit_info |= SVM_IOIO_STR_MASK;
4123
4124 if (info->rep_prefix)
4125 exit_info |= SVM_IOIO_REP_MASK;
4126
4127 bytes = min(bytes, 4u);
4128
4129 exit_info |= bytes << SVM_IOIO_SIZE_SHIFT;
4130
4131 exit_info |= (u32)info->ad_bytes << (SVM_IOIO_ASIZE_SHIFT - 1);
4132
4133 vmcb->control.exit_info_1 = exit_info;
4134 vmcb->control.exit_info_2 = info->next_rip;
4135
4136 break;
4137 }
4138 default:
4139 break;
4140 }
4141
4142 vmcb->control.next_rip = info->next_rip;
4143 vmcb->control.exit_code = icpt_info.exit_code;
4144 vmexit = nested_svm_exit_handled(svm);
4145
4146 ret = (vmexit == NESTED_EXIT_DONE) ? X86EMUL_INTERCEPTED
4147 : X86EMUL_CONTINUE;
4148
4149out:
4150 return ret;
4151}
4152
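For an intercepted IN/OUT, svm_check_intercept() has to hand the nested hypervisor an exit_info_1 word in the hardware IOIO format: the port number in bits 16-31, plus type/string/rep flags and one-hot operand- and address-size fields in the low bits. A self-contained encoder following that layout; the shift values mirror the AMD manual and <asm/svm.h>, but verify them against your tree before relying on them:

#include <stdint.h>
#include <stdbool.h>

#define IOIO_TYPE_IN	 (1ull << 0)	/* 1 = IN, 0 = OUT */
#define IOIO_STR	 (1ull << 2)	/* string instruction (INS/OUTS) */
#define IOIO_REP	 (1ull << 3)	/* REP prefix present */
#define IOIO_SIZE_SHIFT	 4		/* bytes 1/2/4 -> one-hot bits 4-6 */
#define IOIO_ASIZE_SHIFT 7		/* ad_bytes 2/4/8 -> one-hot bits 7-9 */

uint64_t encode_ioio(uint16_t port, bool in, bool string, bool rep,
		     uint32_t bytes /* 1, 2 or 4 */,
		     uint32_t ad_bytes /* 2, 4 or 8 */)
{
	uint64_t info = (uint64_t)port << 16;

	if (in)
		info |= IOIO_TYPE_IN;
	if (string)
		info |= IOIO_STR;
	if (rep)
		info |= IOIO_REP;
	info |= (uint64_t)bytes << IOIO_SIZE_SHIFT;		/* same one-hot trick as the patch */
	info |= (uint64_t)ad_bytes << (IOIO_ASIZE_SHIFT - 1);

	return info;
}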
3446static struct kvm_x86_ops svm_x86_ops = { 4153static struct kvm_x86_ops svm_x86_ops = {
3447 .cpu_has_kvm_support = has_svm, 4154 .cpu_has_kvm_support = has_svm,
3448 .disabled_by_bios = is_disabled, 4155 .disabled_by_bios = is_disabled,
@@ -3470,6 +4177,7 @@ static struct kvm_x86_ops svm_x86_ops = {
3470 .get_cpl = svm_get_cpl, 4177 .get_cpl = svm_get_cpl,
3471 .get_cs_db_l_bits = kvm_get_cs_db_l_bits, 4178 .get_cs_db_l_bits = kvm_get_cs_db_l_bits,
3472 .decache_cr0_guest_bits = svm_decache_cr0_guest_bits, 4179 .decache_cr0_guest_bits = svm_decache_cr0_guest_bits,
4180 .decache_cr3 = svm_decache_cr3,
3473 .decache_cr4_guest_bits = svm_decache_cr4_guest_bits, 4181 .decache_cr4_guest_bits = svm_decache_cr4_guest_bits,
3474 .set_cr0 = svm_set_cr0, 4182 .set_cr0 = svm_set_cr0,
3475 .set_cr3 = svm_set_cr3, 4183 .set_cr3 = svm_set_cr3,
@@ -3497,6 +4205,7 @@ static struct kvm_x86_ops svm_x86_ops = {
3497 .set_irq = svm_set_irq, 4205 .set_irq = svm_set_irq,
3498 .set_nmi = svm_inject_nmi, 4206 .set_nmi = svm_inject_nmi,
3499 .queue_exception = svm_queue_exception, 4207 .queue_exception = svm_queue_exception,
4208 .cancel_injection = svm_cancel_injection,
3500 .interrupt_allowed = svm_interrupt_allowed, 4209 .interrupt_allowed = svm_interrupt_allowed,
3501 .nmi_allowed = svm_nmi_allowed, 4210 .nmi_allowed = svm_nmi_allowed,
3502 .get_nmi_mask = svm_get_nmi_mask, 4211 .get_nmi_mask = svm_get_nmi_mask,
@@ -3509,7 +4218,9 @@ static struct kvm_x86_ops svm_x86_ops = {
3509 .get_tdp_level = get_npt_level, 4218 .get_tdp_level = get_npt_level,
3510 .get_mt_mask = svm_get_mt_mask, 4219 .get_mt_mask = svm_get_mt_mask,
3511 4220
4221 .get_exit_info = svm_get_exit_info,
3512 .exit_reasons_str = svm_exit_reasons_str, 4222 .exit_reasons_str = svm_exit_reasons_str,
4223
3513 .get_lpage_level = svm_get_lpage_level, 4224 .get_lpage_level = svm_get_lpage_level,
3514 4225
3515 .cpuid_update = svm_cpuid_update, 4226 .cpuid_update = svm_cpuid_update,
@@ -3519,6 +4230,15 @@ static struct kvm_x86_ops svm_x86_ops = {
3519 .set_supported_cpuid = svm_set_supported_cpuid, 4230 .set_supported_cpuid = svm_set_supported_cpuid,
3520 4231
3521 .has_wbinvd_exit = svm_has_wbinvd_exit, 4232 .has_wbinvd_exit = svm_has_wbinvd_exit,
4233
4234 .set_tsc_khz = svm_set_tsc_khz,
4235 .write_tsc_offset = svm_write_tsc_offset,
4236 .adjust_tsc_offset = svm_adjust_tsc_offset,
4237 .compute_tsc_offset = svm_compute_tsc_offset,
4238
4239 .set_tdp_cr3 = set_tdp_cr3,
4240
4241 .check_intercept = svm_check_intercept,
3522}; 4242};
3523 4243
3524static int __init svm_init(void) 4244static int __init svm_init(void)
diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c
index e16a0dbe74d8..abd86e865be3 100644
--- a/arch/x86/kvm/timer.c
+++ b/arch/x86/kvm/timer.c
@@ -6,7 +6,7 @@
6 * 6 *
7 * timer support 7 * timer support
8 * 8 *
9 * Copyright 2010 Red Hat, Inc. and/or its affilates. 9 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
10 * 10 *
11 * This work is licensed under the terms of the GNU GPL, version 2. See 11 * This work is licensed under the terms of the GNU GPL, version 2. See
12 * the COPYING file in the top-level directory. 12 * the COPYING file in the top-level directory.
@@ -25,7 +25,7 @@ static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer)
25 25
26 /* 26 /*
27 * There is a race window between reading and incrementing, but we do 27 * There is a race window between reading and incrementing, but we do
28 * not care about potentially loosing timer events in the !reinject 28 * not care about potentially losing timer events in the !reinject
29 * case anyway. Note: KVM_REQ_PENDING_TIMER is implicitly checked 29 * case anyway. Note: KVM_REQ_PENDING_TIMER is implicitly checked
30 * in vcpu_enter_guest. 30 * in vcpu_enter_guest.
31 */ 31 */
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index a6544b8e7c0f..db932760ea82 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -62,21 +62,21 @@ TRACE_EVENT(kvm_hv_hypercall,
62 TP_ARGS(code, fast, rep_cnt, rep_idx, ingpa, outgpa), 62 TP_ARGS(code, fast, rep_cnt, rep_idx, ingpa, outgpa),
63 63
64 TP_STRUCT__entry( 64 TP_STRUCT__entry(
65 __field( __u16, code )
66 __field( bool, fast )
67 __field( __u16, rep_cnt ) 65 __field( __u16, rep_cnt )
68 __field( __u16, rep_idx ) 66 __field( __u16, rep_idx )
69 __field( __u64, ingpa ) 67 __field( __u64, ingpa )
70 __field( __u64, outgpa ) 68 __field( __u64, outgpa )
69 __field( __u16, code )
70 __field( bool, fast )
71 ), 71 ),
72 72
73 TP_fast_assign( 73 TP_fast_assign(
74 __entry->code = code;
75 __entry->fast = fast;
76 __entry->rep_cnt = rep_cnt; 74 __entry->rep_cnt = rep_cnt;
77 __entry->rep_idx = rep_idx; 75 __entry->rep_idx = rep_idx;
78 __entry->ingpa = ingpa; 76 __entry->ingpa = ingpa;
79 __entry->outgpa = outgpa; 77 __entry->outgpa = outgpa;
78 __entry->code = code;
79 __entry->fast = fast;
80 ), 80 ),
81 81
82 TP_printk("code 0x%x %s cnt 0x%x idx 0x%x in 0x%llx out 0x%llx", 82 TP_printk("code 0x%x %s cnt 0x%x idx 0x%x in 0x%llx out 0x%llx",
@@ -178,27 +178,36 @@ TRACE_EVENT(kvm_apic,
178#define trace_kvm_apic_read(reg, val) trace_kvm_apic(0, reg, val) 178#define trace_kvm_apic_read(reg, val) trace_kvm_apic(0, reg, val)
179#define trace_kvm_apic_write(reg, val) trace_kvm_apic(1, reg, val) 179#define trace_kvm_apic_write(reg, val) trace_kvm_apic(1, reg, val)
180 180
181#define KVM_ISA_VMX 1
182#define KVM_ISA_SVM 2
183
181/* 184/*
182 * Tracepoint for kvm guest exit: 185 * Tracepoint for kvm guest exit:
183 */ 186 */
184TRACE_EVENT(kvm_exit, 187TRACE_EVENT(kvm_exit,
185 TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu), 188 TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu, u32 isa),
186 TP_ARGS(exit_reason, vcpu), 189 TP_ARGS(exit_reason, vcpu, isa),
187 190
188 TP_STRUCT__entry( 191 TP_STRUCT__entry(
189 __field( unsigned int, exit_reason ) 192 __field( unsigned int, exit_reason )
190 __field( unsigned long, guest_rip ) 193 __field( unsigned long, guest_rip )
194 __field( u32, isa )
195 __field( u64, info1 )
196 __field( u64, info2 )
191 ), 197 ),
192 198
193 TP_fast_assign( 199 TP_fast_assign(
194 __entry->exit_reason = exit_reason; 200 __entry->exit_reason = exit_reason;
195 __entry->guest_rip = kvm_rip_read(vcpu); 201 __entry->guest_rip = kvm_rip_read(vcpu);
202 __entry->isa = isa;
203 kvm_x86_ops->get_exit_info(vcpu, &__entry->info1,
204 &__entry->info2);
196 ), 205 ),
197 206
198 TP_printk("reason %s rip 0x%lx", 207 TP_printk("reason %s rip 0x%lx info %llx %llx",
199 ftrace_print_symbols_seq(p, __entry->exit_reason, 208 ftrace_print_symbols_seq(p, __entry->exit_reason,
200 kvm_x86_ops->exit_reasons_str), 209 kvm_x86_ops->exit_reasons_str),
201 __entry->guest_rip) 210 __entry->guest_rip, __entry->info1, __entry->info2)
202); 211);
203 212
204/* 213/*
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 7bddfab12013..d48ec60ea421 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5,7 +5,7 @@
5 * machines without emulation or binary translation. 5 * machines without emulation or binary translation.
6 * 6 *
7 * Copyright (C) 2006 Qumranet, Inc. 7 * Copyright (C) 2006 Qumranet, Inc.
8 * Copyright 2010 Red Hat, Inc. and/or its affilates. 8 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
9 * 9 *
10 * Authors: 10 * Authors:
11 * Avi Kivity <avi@qumranet.com> 11 * Avi Kivity <avi@qumranet.com>
@@ -69,6 +69,9 @@ module_param(emulate_invalid_guest_state, bool, S_IRUGO);
69static int __read_mostly vmm_exclusive = 1; 69static int __read_mostly vmm_exclusive = 1;
70module_param(vmm_exclusive, bool, S_IRUGO); 70module_param(vmm_exclusive, bool, S_IRUGO);
71 71
72static int __read_mostly yield_on_hlt = 1;
73module_param(yield_on_hlt, bool, S_IRUGO);
74
72#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \ 75#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \
73 (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD) 76 (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
74#define KVM_GUEST_CR0_MASK \ 77#define KVM_GUEST_CR0_MASK \
@@ -90,14 +93,14 @@ module_param(vmm_exclusive, bool, S_IRUGO);
90 * These 2 parameters are used to config the controls for Pause-Loop Exiting: 93 * These 2 parameters are used to config the controls for Pause-Loop Exiting:
91 * ple_gap: upper bound on the amount of time between two successive 94 * ple_gap: upper bound on the amount of time between two successive
 92 * executions of PAUSE in a loop. Also indicates whether ple is enabled. 95 * executions of PAUSE in a loop. Also indicates whether ple is enabled.
93 * According to test, this time is usually small than 41 cycles. 96 * According to test, this time is usually smaller than 128 cycles.
94 * ple_window: upper bound on the amount of time a guest is allowed to execute 97 * ple_window: upper bound on the amount of time a guest is allowed to execute
95 * in a PAUSE loop. Tests indicate that most spinlocks are held for 98 * in a PAUSE loop. Tests indicate that most spinlocks are held for
96 * less than 2^12 cycles 99 * less than 2^12 cycles
97 * Time is measured based on a counter that runs at the same rate as the TSC, 100 * Time is measured based on a counter that runs at the same rate as the TSC,
 98 * refer to SDM volume 3b, sections 21.6.13 & 22.1.3. 101 * refer to SDM volume 3b, sections 21.6.13 & 22.1.3.
99 */ 102 */
100#define KVM_VMX_DEFAULT_PLE_GAP 41 103#define KVM_VMX_DEFAULT_PLE_GAP 128
101#define KVM_VMX_DEFAULT_PLE_WINDOW 4096 104#define KVM_VMX_DEFAULT_PLE_WINDOW 4096
102static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP; 105static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP;
103module_param(ple_gap, int, S_IRUGO); 106module_param(ple_gap, int, S_IRUGO);
@@ -125,7 +128,11 @@ struct vcpu_vmx {
125 unsigned long host_rsp; 128 unsigned long host_rsp;
126 int launched; 129 int launched;
127 u8 fail; 130 u8 fail;
131 u8 cpl;
132 bool nmi_known_unmasked;
133 u32 exit_intr_info;
128 u32 idt_vectoring_info; 134 u32 idt_vectoring_info;
135 ulong rflags;
129 struct shared_msr_entry *guest_msrs; 136 struct shared_msr_entry *guest_msrs;
130 int nmsrs; 137 int nmsrs;
131 int save_nmsrs; 138 int save_nmsrs;
@@ -154,12 +161,11 @@ struct vcpu_vmx {
154 u32 limit; 161 u32 limit;
155 u32 ar; 162 u32 ar;
156 } tr, es, ds, fs, gs; 163 } tr, es, ds, fs, gs;
157 struct {
158 bool pending;
159 u8 vector;
160 unsigned rip;
161 } irq;
162 } rmode; 164 } rmode;
165 struct {
166 u32 bitmask; /* 4 bits per segment (1 bit per field) */
167 struct kvm_save_segment seg[8];
168 } segment_cache;
163 int vpid; 169 int vpid;
164 bool emulation_required; 170 bool emulation_required;
165 171
@@ -172,15 +178,25 @@ struct vcpu_vmx {
172 bool rdtscp_enabled; 178 bool rdtscp_enabled;
173}; 179};
174 180
181enum segment_cache_field {
182 SEG_FIELD_SEL = 0,
183 SEG_FIELD_BASE = 1,
184 SEG_FIELD_LIMIT = 2,
185 SEG_FIELD_AR = 3,
186
187 SEG_FIELD_NR = 4
188};
189
175static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) 190static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
176{ 191{
177 return container_of(vcpu, struct vcpu_vmx, vcpu); 192 return container_of(vcpu, struct vcpu_vmx, vcpu);
178} 193}
179 194
180static int init_rmode(struct kvm *kvm);
181static u64 construct_eptp(unsigned long root_hpa); 195static u64 construct_eptp(unsigned long root_hpa);
182static void kvm_cpu_vmxon(u64 addr); 196static void kvm_cpu_vmxon(u64 addr);
183static void kvm_cpu_vmxoff(void); 197static void kvm_cpu_vmxoff(void);
198static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
199static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
184 200
185static DEFINE_PER_CPU(struct vmcs *, vmxarea); 201static DEFINE_PER_CPU(struct vmcs *, vmxarea);
186static DEFINE_PER_CPU(struct vmcs *, current_vmcs); 202static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -192,6 +208,8 @@ static unsigned long *vmx_io_bitmap_b;
192static unsigned long *vmx_msr_bitmap_legacy; 208static unsigned long *vmx_msr_bitmap_legacy;
193static unsigned long *vmx_msr_bitmap_longmode; 209static unsigned long *vmx_msr_bitmap_longmode;
194 210
211static bool cpu_has_load_ia32_efer;
212
195static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS); 213static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
196static DEFINE_SPINLOCK(vmx_vpid_lock); 214static DEFINE_SPINLOCK(vmx_vpid_lock);
197 215
@@ -476,7 +494,7 @@ static void vmcs_clear(struct vmcs *vmcs)
476 u8 error; 494 u8 error;
477 495
478 asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0" 496 asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0"
479 : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) 497 : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
480 : "cc", "memory"); 498 : "cc", "memory");
481 if (error) 499 if (error)
482 printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n", 500 printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
@@ -489,7 +507,7 @@ static void vmcs_load(struct vmcs *vmcs)
489 u8 error; 507 u8 error;
490 508
491 asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0" 509 asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
492 : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) 510 : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
493 : "cc", "memory"); 511 : "cc", "memory");
494 if (error) 512 if (error)
495 printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n", 513 printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
@@ -505,7 +523,6 @@ static void __vcpu_clear(void *arg)
505 vmcs_clear(vmx->vmcs); 523 vmcs_clear(vmx->vmcs);
506 if (per_cpu(current_vmcs, cpu) == vmx->vmcs) 524 if (per_cpu(current_vmcs, cpu) == vmx->vmcs)
507 per_cpu(current_vmcs, cpu) = NULL; 525 per_cpu(current_vmcs, cpu) = NULL;
508 rdtscll(vmx->vcpu.arch.host_tsc);
509 list_del(&vmx->local_vcpus_link); 526 list_del(&vmx->local_vcpus_link);
510 vmx->vcpu.cpu = -1; 527 vmx->vcpu.cpu = -1;
511 vmx->launched = 0; 528 vmx->launched = 0;
@@ -570,10 +587,10 @@ static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa)
570 587
571static unsigned long vmcs_readl(unsigned long field) 588static unsigned long vmcs_readl(unsigned long field)
572{ 589{
573 unsigned long value; 590 unsigned long value = 0;
574 591
575 asm volatile (__ex(ASM_VMX_VMREAD_RDX_RAX) 592 asm volatile (__ex(ASM_VMX_VMREAD_RDX_RAX)
576 : "=a"(value) : "d"(field) : "cc"); 593 : "+a"(value) : "d"(field) : "cc");
577 return value; 594 return value;
578} 595}
579 596
@@ -642,6 +659,62 @@ static void vmcs_set_bits(unsigned long field, u32 mask)
642 vmcs_writel(field, vmcs_readl(field) | mask); 659 vmcs_writel(field, vmcs_readl(field) | mask);
643} 660}
644 661
662static void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
663{
664 vmx->segment_cache.bitmask = 0;
665}
666
667static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg,
668 unsigned field)
669{
670 bool ret;
671 u32 mask = 1 << (seg * SEG_FIELD_NR + field);
672
673 if (!(vmx->vcpu.arch.regs_avail & (1 << VCPU_EXREG_SEGMENTS))) {
674 vmx->vcpu.arch.regs_avail |= (1 << VCPU_EXREG_SEGMENTS);
675 vmx->segment_cache.bitmask = 0;
676 }
677 ret = vmx->segment_cache.bitmask & mask;
678 vmx->segment_cache.bitmask |= mask;
679 return ret;
680}
681
682static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg)
683{
684 u16 *p = &vmx->segment_cache.seg[seg].selector;
685
686 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL))
687 *p = vmcs_read16(kvm_vmx_segment_fields[seg].selector);
688 return *p;
689}
690
691static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg)
692{
693 ulong *p = &vmx->segment_cache.seg[seg].base;
694
695 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE))
696 *p = vmcs_readl(kvm_vmx_segment_fields[seg].base);
697 return *p;
698}
699
700static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg)
701{
702 u32 *p = &vmx->segment_cache.seg[seg].limit;
703
704 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT))
705 *p = vmcs_read32(kvm_vmx_segment_fields[seg].limit);
706 return *p;
707}
708
709static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg)
710{
711 u32 *p = &vmx->segment_cache.seg[seg].ar;
712
713 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR))
714 *p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes);
715 return *p;
716}
717
645static void update_exception_bitmap(struct kvm_vcpu *vcpu) 718static void update_exception_bitmap(struct kvm_vcpu *vcpu)
646{ 719{
647 u32 eb; 720 u32 eb;
@@ -666,6 +739,12 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
666 unsigned i; 739 unsigned i;
667 struct msr_autoload *m = &vmx->msr_autoload; 740 struct msr_autoload *m = &vmx->msr_autoload;
668 741
742 if (msr == MSR_EFER && cpu_has_load_ia32_efer) {
743 vmcs_clear_bits(VM_ENTRY_CONTROLS, VM_ENTRY_LOAD_IA32_EFER);
744 vmcs_clear_bits(VM_EXIT_CONTROLS, VM_EXIT_LOAD_IA32_EFER);
745 return;
746 }
747
669 for (i = 0; i < m->nr; ++i) 748 for (i = 0; i < m->nr; ++i)
670 if (m->guest[i].index == msr) 749 if (m->guest[i].index == msr)
671 break; 750 break;
@@ -685,6 +764,14 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
685 unsigned i; 764 unsigned i;
686 struct msr_autoload *m = &vmx->msr_autoload; 765 struct msr_autoload *m = &vmx->msr_autoload;
687 766
767 if (msr == MSR_EFER && cpu_has_load_ia32_efer) {
768 vmcs_write64(GUEST_IA32_EFER, guest_val);
769 vmcs_write64(HOST_IA32_EFER, host_val);
770 vmcs_set_bits(VM_ENTRY_CONTROLS, VM_ENTRY_LOAD_IA32_EFER);
771 vmcs_set_bits(VM_EXIT_CONTROLS, VM_EXIT_LOAD_IA32_EFER);
772 return;
773 }
774
688 for (i = 0; i < m->nr; ++i) 775 for (i = 0; i < m->nr; ++i)
689 if (m->guest[i].index == msr) 776 if (m->guest[i].index == msr)
690 break; 777 break;
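These two hunks give MSR_EFER a fast path: when the probed VM-entry/VM-exit controls permit loading IA32_EFER directly, the guest and host values go into dedicated VMCS fields and the controls are toggled; otherwise the MSR falls back to the generic autoload array. A minimal userspace sketch of that "dedicated control if available, shared list otherwise" shape follows; every identifier in it is illustrative rather than the kernel's.

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define MAX_AUTOLOAD 8
    #define MSR_EFER     0xc0000080u

    struct msr_slot { uint32_t index; uint64_t guest, host; };
    struct autoload { unsigned nr; struct msr_slot slot[MAX_AUTOLOAD]; };

    static bool have_efer_controls = true;  /* pretend the capability was probed */

    static void add_switch_msr(struct autoload *m, uint32_t msr,
                               uint64_t guest_val, uint64_t host_val)
    {
        unsigned i;

        if (msr == MSR_EFER && have_efer_controls) {
            /* dedicated fields plus entry/exit controls, no list slot used */
            printf("LOAD_IA32_EFER path: guest=%#llx host=%#llx\n",
                   (unsigned long long)guest_val, (unsigned long long)host_val);
            return;
        }

        for (i = 0; i < m->nr; ++i)          /* reuse an existing slot if present */
            if (m->slot[i].index == msr)
                break;
        if (i == m->nr) {
            if (m->nr == MAX_AUTOLOAD)       /* list full: give up in this sketch */
                return;
            m->nr++;
        }
        m->slot[i] = (struct msr_slot){ .index = msr,
                                        .guest = guest_val, .host = host_val };
    }

    int main(void)
    {
        struct autoload m = { 0 };

        add_switch_msr(&m, MSR_EFER, 0xd01, 0x500);  /* fast path */
        add_switch_msr(&m, 0x174, 0, 0);             /* some other MSR -> list */
        printf("autoload entries used: %u\n", m.nr);
        return 0;
    }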
@@ -706,11 +793,10 @@ static void reload_tss(void)
706 /* 793 /*
707 * VT restores TR but not its size. Useless. 794 * VT restores TR but not its size. Useless.
708 */ 795 */
709 struct desc_ptr gdt; 796 struct desc_ptr *gdt = &__get_cpu_var(host_gdt);
710 struct desc_struct *descs; 797 struct desc_struct *descs;
711 798
712 native_store_gdt(&gdt); 799 descs = (void *)gdt->address;
713 descs = (void *)gdt.address;
714 descs[GDT_ENTRY_TSS].type = 9; /* available TSS */ 800 descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
715 load_TR_desc(); 801 load_TR_desc();
716} 802}
@@ -753,7 +839,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
753 839
754static unsigned long segment_base(u16 selector) 840static unsigned long segment_base(u16 selector)
755{ 841{
756 struct desc_ptr gdt; 842 struct desc_ptr *gdt = &__get_cpu_var(host_gdt);
757 struct desc_struct *d; 843 struct desc_struct *d;
758 unsigned long table_base; 844 unsigned long table_base;
759 unsigned long v; 845 unsigned long v;
@@ -761,8 +847,7 @@ static unsigned long segment_base(u16 selector)
761 if (!(selector & ~3)) 847 if (!(selector & ~3))
762 return 0; 848 return 0;
763 849
764 native_store_gdt(&gdt); 850 table_base = gdt->address;
765 table_base = gdt.address;
766 851
767 if (selector & 4) { /* from ldt */ 852 if (selector & 4) { /* from ldt */
768 u16 ldt_selector = kvm_read_ldt(); 853 u16 ldt_selector = kvm_read_ldt();
@@ -828,10 +913,9 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
828#endif 913#endif
829 914
830#ifdef CONFIG_X86_64 915#ifdef CONFIG_X86_64
831 if (is_long_mode(&vmx->vcpu)) { 916 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
832 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); 917 if (is_long_mode(&vmx->vcpu))
833 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); 918 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
834 }
835#endif 919#endif
836 for (i = 0; i < vmx->save_nmsrs; ++i) 920 for (i = 0; i < vmx->save_nmsrs; ++i)
837 kvm_set_shared_msr(vmx->guest_msrs[i].index, 921 kvm_set_shared_msr(vmx->guest_msrs[i].index,
@@ -846,23 +930,23 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
846 930
847 ++vmx->vcpu.stat.host_state_reload; 931 ++vmx->vcpu.stat.host_state_reload;
848 vmx->host_state.loaded = 0; 932 vmx->host_state.loaded = 0;
849 if (vmx->host_state.fs_reload_needed) 933#ifdef CONFIG_X86_64
850 loadsegment(fs, vmx->host_state.fs_sel); 934 if (is_long_mode(&vmx->vcpu))
935 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
936#endif
851 if (vmx->host_state.gs_ldt_reload_needed) { 937 if (vmx->host_state.gs_ldt_reload_needed) {
852 kvm_load_ldt(vmx->host_state.ldt_sel); 938 kvm_load_ldt(vmx->host_state.ldt_sel);
853#ifdef CONFIG_X86_64 939#ifdef CONFIG_X86_64
854 load_gs_index(vmx->host_state.gs_sel); 940 load_gs_index(vmx->host_state.gs_sel);
855 wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs);
856#else 941#else
857 loadsegment(gs, vmx->host_state.gs_sel); 942 loadsegment(gs, vmx->host_state.gs_sel);
858#endif 943#endif
859 } 944 }
945 if (vmx->host_state.fs_reload_needed)
946 loadsegment(fs, vmx->host_state.fs_sel);
860 reload_tss(); 947 reload_tss();
861#ifdef CONFIG_X86_64 948#ifdef CONFIG_X86_64
862 if (is_long_mode(&vmx->vcpu)) { 949 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
863 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
864 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
865 }
866#endif 950#endif
867 if (current_thread_info()->status & TS_USEDFPU) 951 if (current_thread_info()->status & TS_USEDFPU)
868 clts(); 952 clts();
@@ -883,7 +967,6 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx)
883static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 967static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
884{ 968{
885 struct vcpu_vmx *vmx = to_vmx(vcpu); 969 struct vcpu_vmx *vmx = to_vmx(vcpu);
886 u64 tsc_this, delta, new_offset;
887 u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); 970 u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
888 971
889 if (!vmm_exclusive) 972 if (!vmm_exclusive)
@@ -897,37 +980,24 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
897 } 980 }
898 981
899 if (vcpu->cpu != cpu) { 982 if (vcpu->cpu != cpu) {
900 struct desc_ptr dt; 983 struct desc_ptr *gdt = &__get_cpu_var(host_gdt);
901 unsigned long sysenter_esp; 984 unsigned long sysenter_esp;
902 985
903 kvm_migrate_timers(vcpu);
904 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); 986 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
905 local_irq_disable(); 987 local_irq_disable();
906 list_add(&vmx->local_vcpus_link, 988 list_add(&vmx->local_vcpus_link,
907 &per_cpu(vcpus_on_cpu, cpu)); 989 &per_cpu(vcpus_on_cpu, cpu));
908 local_irq_enable(); 990 local_irq_enable();
909 991
910 vcpu->cpu = cpu;
911 /* 992 /*
912 * Linux uses per-cpu TSS and GDT, so set these when switching 993 * Linux uses per-cpu TSS and GDT, so set these when switching
913 * processors. 994 * processors.
914 */ 995 */
915 vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */ 996 vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */
916 native_store_gdt(&dt); 997 vmcs_writel(HOST_GDTR_BASE, gdt->address); /* 22.2.4 */
917 vmcs_writel(HOST_GDTR_BASE, dt.address); /* 22.2.4 */
918 998
919 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); 999 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
920 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ 1000 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
921
922 /*
923 * Make sure the time stamp counter is monotonous.
924 */
925 rdtscll(tsc_this);
926 if (tsc_this < vcpu->arch.host_tsc) {
927 delta = vcpu->arch.host_tsc - tsc_this;
928 new_offset = vmcs_read64(TSC_OFFSET) + delta;
929 vmcs_write64(TSC_OFFSET, new_offset);
930 }
931 } 1001 }
932} 1002}
933 1003
@@ -972,17 +1042,24 @@ static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
972{ 1042{
973 unsigned long rflags, save_rflags; 1043 unsigned long rflags, save_rflags;
974 1044
975 rflags = vmcs_readl(GUEST_RFLAGS); 1045 if (!test_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail)) {
976 if (to_vmx(vcpu)->rmode.vm86_active) { 1046 __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
977 rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS; 1047 rflags = vmcs_readl(GUEST_RFLAGS);
978 save_rflags = to_vmx(vcpu)->rmode.save_rflags; 1048 if (to_vmx(vcpu)->rmode.vm86_active) {
979 rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; 1049 rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
1050 save_rflags = to_vmx(vcpu)->rmode.save_rflags;
1051 rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
1052 }
1053 to_vmx(vcpu)->rflags = rflags;
980 } 1054 }
981 return rflags; 1055 return to_vmx(vcpu)->rflags;
982} 1056}
983 1057
984static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) 1058static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
985{ 1059{
1060 __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
1061 __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
1062 to_vmx(vcpu)->rflags = rflags;
986 if (to_vmx(vcpu)->rmode.vm86_active) { 1063 if (to_vmx(vcpu)->rmode.vm86_active) {
987 to_vmx(vcpu)->rmode.save_rflags = rflags; 1064 to_vmx(vcpu)->rmode.save_rflags = rflags;
988 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; 1065 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
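This hunk stops re-reading GUEST_RFLAGS on every call: the value is cached until its "register available" bit is cleared, and in vm86 mode the bits the guest is allowed to own are merged with the saved real-mode flags. The merge itself is plain masking, sketched below with made-up mask values (the real masks live in the kernel headers).

    #include <stdint.h>
    #include <stdio.h>

    /* illustrative values only; the kernel defines the real masks */
    #define OWNED_BITS 0x000000ffUL   /* flag bits the vm86 guest may control */
    #define IOPL_VM    0x00023000UL   /* IOPL=3 and the VM flag forced on     */

    static unsigned long merge_rflags(unsigned long hw, unsigned long saved)
    {
        /* guest-owned bits come from hardware, everything else from the save */
        return (hw & OWNED_BITS) | (saved & ~OWNED_BITS);
    }

    int main(void)
    {
        unsigned long hw = 0x00000046UL | IOPL_VM;   /* what the VMCS reports  */
        unsigned long saved = 0x00000202UL;          /* what the guest last set */

        printf("guest-visible rflags: %#lx\n", merge_rflags(hw, saved));
        return 0;
    }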
@@ -1031,6 +1108,17 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
1031 vmx_set_interrupt_shadow(vcpu, 0); 1108 vmx_set_interrupt_shadow(vcpu, 0);
1032} 1109}
1033 1110
1111static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
1112{
1113 /* Ensure that we clear the HLT state in the VMCS. We don't need to
1114 * explicitly skip the instruction because if the HLT state is set, then
1115 * the instruction is already executing and RIP has already been
1116 * advanced. */
1117 if (!yield_on_hlt &&
1118 vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
1119 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
1120}
1121
1034static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, 1122static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
1035 bool has_error_code, u32 error_code, 1123 bool has_error_code, u32 error_code,
1036 bool reinject) 1124 bool reinject)
@@ -1044,16 +1132,11 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
1044 } 1132 }
1045 1133
1046 if (vmx->rmode.vm86_active) { 1134 if (vmx->rmode.vm86_active) {
1047 vmx->rmode.irq.pending = true; 1135 int inc_eip = 0;
1048 vmx->rmode.irq.vector = nr;
1049 vmx->rmode.irq.rip = kvm_rip_read(vcpu);
1050 if (kvm_exception_is_soft(nr)) 1136 if (kvm_exception_is_soft(nr))
1051 vmx->rmode.irq.rip += 1137 inc_eip = vcpu->arch.event_exit_inst_len;
1052 vmx->vcpu.arch.event_exit_inst_len; 1138 if (kvm_inject_realmode_interrupt(vcpu, nr, inc_eip) != EMULATE_DONE)
1053 intr_info |= INTR_TYPE_SOFT_INTR; 1139 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
1054 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
1055 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
1056 kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
1057 return; 1140 return;
1058 } 1141 }
1059 1142
@@ -1065,6 +1148,7 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
1065 intr_info |= INTR_TYPE_HARD_EXCEPTION; 1148 intr_info |= INTR_TYPE_HARD_EXCEPTION;
1066 1149
1067 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); 1150 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
1151 vmx_clear_hlt(vcpu);
1068} 1152}
1069 1153
1070static bool vmx_rdtscp_supported(void) 1154static bool vmx_rdtscp_supported(void)
@@ -1149,12 +1233,32 @@ static u64 guest_read_tsc(void)
1149} 1233}
1150 1234
1151/* 1235/*
1152 * writes 'guest_tsc' into guest's timestamp counter "register" 1236 * Empty call-back. Needs to be implemented when VMX enables the SET_TSC_KHZ
1153 * guest_tsc = host_tsc + tsc_offset ==> tsc_offset = guest_tsc - host_tsc 1237 * ioctl. In this case the call-back should update internal vmx state to make
1238 * the changes effective.
1239 */
1240static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
1241{
1242 /* Nothing to do here */
1243}
1244
1245/*
1246 * writes 'offset' into guest's timestamp counter offset register
1154 */ 1247 */
1155static void guest_write_tsc(u64 guest_tsc, u64 host_tsc) 1248static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
1156{ 1249{
1157 vmcs_write64(TSC_OFFSET, guest_tsc - host_tsc); 1250 vmcs_write64(TSC_OFFSET, offset);
1251}
1252
1253static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
1254{
1255 u64 offset = vmcs_read64(TSC_OFFSET);
1256 vmcs_write64(TSC_OFFSET, offset + adjustment);
1257}
1258
1259static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
1260{
1261 return target_tsc - native_read_tsc();
1158} 1262}
1159 1263
1160/* 1264/*
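The replacement above moves VMX from "write guest_tsc - host_tsc yourself" to a small offset API: write an absolute offset, adjust it by a delta, or compute the offset needed to hit a target guest TSC. The arithmetic is the invariant guest_tsc = host_tsc + offset, shown below with a fake host counter (read_host_tsc() is a stub here, not the real instruction wrapper).

    #include <stdint.h>
    #include <stdio.h>

    static uint64_t fake_host_tsc = 1000000;   /* stand-in for rdtsc */
    static uint64_t tsc_offset;                /* stand-in for the VMCS field */

    static uint64_t read_host_tsc(void) { return fake_host_tsc; }

    static void write_tsc_offset(uint64_t offset)     { tsc_offset = offset; }
    static void adjust_tsc_offset(int64_t adjustment) { tsc_offset += adjustment; }

    /* offset that makes the guest see target_tsc right now */
    static uint64_t compute_tsc_offset(uint64_t target_tsc)
    {
        return target_tsc - read_host_tsc();
    }

    static uint64_t guest_tsc(void)
    {
        return read_host_tsc() + tsc_offset;   /* guest_tsc = host_tsc + offset */
    }

    int main(void)
    {
        write_tsc_offset(compute_tsc_offset(0));   /* guest starts at 0 */
        printf("guest tsc now: %llu\n", (unsigned long long)guest_tsc());

        fake_host_tsc += 500;                      /* time passes on the host */
        adjust_tsc_offset(-100);                   /* e.g. compensate after migration */
        printf("guest tsc now: %llu\n", (unsigned long long)guest_tsc());
        return 0;
    }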
@@ -1227,7 +1331,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1227{ 1331{
1228 struct vcpu_vmx *vmx = to_vmx(vcpu); 1332 struct vcpu_vmx *vmx = to_vmx(vcpu);
1229 struct shared_msr_entry *msr; 1333 struct shared_msr_entry *msr;
1230 u64 host_tsc;
1231 int ret = 0; 1334 int ret = 0;
1232 1335
1233 switch (msr_index) { 1336 switch (msr_index) {
@@ -1237,9 +1340,11 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1237 break; 1340 break;
1238#ifdef CONFIG_X86_64 1341#ifdef CONFIG_X86_64
1239 case MSR_FS_BASE: 1342 case MSR_FS_BASE:
1343 vmx_segment_cache_clear(vmx);
1240 vmcs_writel(GUEST_FS_BASE, data); 1344 vmcs_writel(GUEST_FS_BASE, data);
1241 break; 1345 break;
1242 case MSR_GS_BASE: 1346 case MSR_GS_BASE:
1347 vmx_segment_cache_clear(vmx);
1243 vmcs_writel(GUEST_GS_BASE, data); 1348 vmcs_writel(GUEST_GS_BASE, data);
1244 break; 1349 break;
1245 case MSR_KERNEL_GS_BASE: 1350 case MSR_KERNEL_GS_BASE:
@@ -1257,8 +1362,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1257 vmcs_writel(GUEST_SYSENTER_ESP, data); 1362 vmcs_writel(GUEST_SYSENTER_ESP, data);
1258 break; 1363 break;
1259 case MSR_IA32_TSC: 1364 case MSR_IA32_TSC:
1260 rdtscll(host_tsc); 1365 kvm_write_tsc(vcpu, data);
1261 guest_write_tsc(data, host_tsc);
1262 break; 1366 break;
1263 case MSR_IA32_CR_PAT: 1367 case MSR_IA32_CR_PAT:
1264 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { 1368 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
@@ -1328,16 +1432,25 @@ static __init int vmx_disabled_by_bios(void)
1328 1432
1329 rdmsrl(MSR_IA32_FEATURE_CONTROL, msr); 1433 rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
1330 if (msr & FEATURE_CONTROL_LOCKED) { 1434 if (msr & FEATURE_CONTROL_LOCKED) {
1435 /* launched w/ TXT and VMX disabled */
1331 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX) 1436 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
1332 && tboot_enabled()) 1437 && tboot_enabled())
1333 return 1; 1438 return 1;
1439 /* launched w/o TXT and VMX only enabled w/ TXT */
1440 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
1441 && (msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
1442 && !tboot_enabled()) {
1443 printk(KERN_WARNING "kvm: disable TXT in the BIOS or "
1444 "activate TXT before enabling KVM\n");
1445 return 1;
1446 }
1447 /* launched w/o TXT and VMX disabled */
1334 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX) 1448 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
1335 && !tboot_enabled()) 1449 && !tboot_enabled())
1336 return 1; 1450 return 1;
1337 } 1451 }
1338 1452
1339 return 0; 1453 return 0;
1340 /* locked but not enabled */
1341} 1454}
1342 1455
1343static void kvm_cpu_vmxon(u64 addr) 1456static void kvm_cpu_vmxon(u64 addr)
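The vmx_disabled_by_bios() hunk above distinguishes three locked configurations of MSR_IA32_FEATURE_CONTROL. Per the SDM, bit 0 is the lock bit, bit 1 allows VMXON inside SMX (TXT) and bit 2 allows it outside SMX; the decision table can be modelled as below. This is a simplified stand-alone sketch, not the kernel function.

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define FC_LOCKED       (1ull << 0)
    #define FC_VMX_IN_SMX   (1ull << 1)   /* VMXON allowed inside SMX (TXT) */
    #define FC_VMX_OUT_SMX  (1ull << 2)   /* VMXON allowed outside SMX      */

    /* true when the BIOS configuration prevents using VMX as booted */
    static bool vmx_disabled_by_bios(uint64_t fc, bool tboot_active)
    {
        if (!(fc & FC_LOCKED))
            return false;                    /* unlocked: kernel may enable it */
        if (tboot_active)
            return !(fc & FC_VMX_IN_SMX);    /* TXT launch needs the SMX bit */
        if ((fc & FC_VMX_IN_SMX) && !(fc & FC_VMX_OUT_SMX))
            return true;                     /* only usable under TXT */
        return !(fc & FC_VMX_OUT_SMX);       /* plain boot needs the non-SMX bit */
    }

    int main(void)
    {
        printf("%d\n", vmx_disabled_by_bios(FC_LOCKED | FC_VMX_OUT_SMX, false)); /* 0 */
        printf("%d\n", vmx_disabled_by_bios(FC_LOCKED | FC_VMX_IN_SMX, false));  /* 1 */
        printf("%d\n", vmx_disabled_by_bios(FC_LOCKED, true));                   /* 1 */
        return 0;
    }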
@@ -1427,6 +1540,14 @@ static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
1427 return 0; 1540 return 0;
1428} 1541}
1429 1542
1543static __init bool allow_1_setting(u32 msr, u32 ctl)
1544{
1545 u32 vmx_msr_low, vmx_msr_high;
1546
1547 rdmsr(msr, vmx_msr_low, vmx_msr_high);
1548 return vmx_msr_high & ctl;
1549}
1550
1430static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) 1551static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
1431{ 1552{
1432 u32 vmx_msr_low, vmx_msr_high; 1553 u32 vmx_msr_low, vmx_msr_high;
@@ -1443,7 +1564,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
1443 &_pin_based_exec_control) < 0) 1564 &_pin_based_exec_control) < 0)
1444 return -EIO; 1565 return -EIO;
1445 1566
1446 min = CPU_BASED_HLT_EXITING | 1567 min =
1447#ifdef CONFIG_X86_64 1568#ifdef CONFIG_X86_64
1448 CPU_BASED_CR8_LOAD_EXITING | 1569 CPU_BASED_CR8_LOAD_EXITING |
1449 CPU_BASED_CR8_STORE_EXITING | 1570 CPU_BASED_CR8_STORE_EXITING |
@@ -1456,6 +1577,10 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
1456 CPU_BASED_MWAIT_EXITING | 1577 CPU_BASED_MWAIT_EXITING |
1457 CPU_BASED_MONITOR_EXITING | 1578 CPU_BASED_MONITOR_EXITING |
1458 CPU_BASED_INVLPG_EXITING; 1579 CPU_BASED_INVLPG_EXITING;
1580
1581 if (yield_on_hlt)
1582 min |= CPU_BASED_HLT_EXITING;
1583
1459 opt = CPU_BASED_TPR_SHADOW | 1584 opt = CPU_BASED_TPR_SHADOW |
1460 CPU_BASED_USE_MSR_BITMAPS | 1585 CPU_BASED_USE_MSR_BITMAPS |
1461 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; 1586 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
@@ -1537,6 +1662,12 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
1537 vmcs_conf->vmexit_ctrl = _vmexit_control; 1662 vmcs_conf->vmexit_ctrl = _vmexit_control;
1538 vmcs_conf->vmentry_ctrl = _vmentry_control; 1663 vmcs_conf->vmentry_ctrl = _vmentry_control;
1539 1664
1665 cpu_has_load_ia32_efer =
1666 allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS,
1667 VM_ENTRY_LOAD_IA32_EFER)
1668 && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS,
1669 VM_EXIT_LOAD_IA32_EFER);
1670
1540 return 0; 1671 return 0;
1541} 1672}
1542 1673
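allow_1_setting() above reads a VMX capability MSR and tests its high 32 bits, which report which control bits may be set to 1; cpu_has_load_ia32_efer is then true only if both the entry and the exit control allow loading IA32_EFER. The probe reduces to a couple of mask tests, sketched with canned capability values below (the MSR contents here are made up).

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define VM_ENTRY_LOAD_IA32_EFER  0x00008000u   /* bit 15 of the entry controls */
    #define VM_EXIT_LOAD_IA32_EFER   0x00200000u   /* bit 21 of the exit controls  */

    /* pretend capability MSR values; high dword = allowed-1 settings */
    static uint64_t fake_entry_caps = 0xffffffff000011ffull;
    static uint64_t fake_exit_caps  = 0xffffffff000036dfull;

    static bool allow_1_setting(uint64_t cap_msr, uint32_t ctl)
    {
        return (uint32_t)(cap_msr >> 32) & ctl;    /* may the bit be set to 1? */
    }

    int main(void)
    {
        bool has_load_efer =
            allow_1_setting(fake_entry_caps, VM_ENTRY_LOAD_IA32_EFER) &&
            allow_1_setting(fake_exit_caps,  VM_EXIT_LOAD_IA32_EFER);

        printf("load IA32_EFER supported: %s\n", has_load_efer ? "yes" : "no");
        return 0;
    }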
@@ -1657,6 +1788,9 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
1657 vmx->emulation_required = 1; 1788 vmx->emulation_required = 1;
1658 vmx->rmode.vm86_active = 0; 1789 vmx->rmode.vm86_active = 0;
1659 1790
1791 vmx_segment_cache_clear(vmx);
1792
1793 vmcs_write16(GUEST_TR_SELECTOR, vmx->rmode.tr.selector);
1660 vmcs_writel(GUEST_TR_BASE, vmx->rmode.tr.base); 1794 vmcs_writel(GUEST_TR_BASE, vmx->rmode.tr.base);
1661 vmcs_write32(GUEST_TR_LIMIT, vmx->rmode.tr.limit); 1795 vmcs_write32(GUEST_TR_LIMIT, vmx->rmode.tr.limit);
1662 vmcs_write32(GUEST_TR_AR_BYTES, vmx->rmode.tr.ar); 1796 vmcs_write32(GUEST_TR_AR_BYTES, vmx->rmode.tr.ar);
@@ -1679,6 +1813,8 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
1679 fix_pmode_dataseg(VCPU_SREG_GS, &vmx->rmode.gs); 1813 fix_pmode_dataseg(VCPU_SREG_GS, &vmx->rmode.gs);
1680 fix_pmode_dataseg(VCPU_SREG_FS, &vmx->rmode.fs); 1814 fix_pmode_dataseg(VCPU_SREG_FS, &vmx->rmode.fs);
1681 1815
1816 vmx_segment_cache_clear(vmx);
1817
1682 vmcs_write16(GUEST_SS_SELECTOR, 0); 1818 vmcs_write16(GUEST_SS_SELECTOR, 0);
1683 vmcs_write32(GUEST_SS_AR_BYTES, 0x93); 1819 vmcs_write32(GUEST_SS_AR_BYTES, 0x93);
1684 1820
@@ -1710,9 +1846,13 @@ static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
1710 save->limit = vmcs_read32(sf->limit); 1846 save->limit = vmcs_read32(sf->limit);
1711 save->ar = vmcs_read32(sf->ar_bytes); 1847 save->ar = vmcs_read32(sf->ar_bytes);
1712 vmcs_write16(sf->selector, save->base >> 4); 1848 vmcs_write16(sf->selector, save->base >> 4);
1713 vmcs_write32(sf->base, save->base & 0xfffff); 1849 vmcs_write32(sf->base, save->base & 0xffff0);
1714 vmcs_write32(sf->limit, 0xffff); 1850 vmcs_write32(sf->limit, 0xffff);
1715 vmcs_write32(sf->ar_bytes, 0xf3); 1851 vmcs_write32(sf->ar_bytes, 0xf3);
1852 if (save->base & 0xf)
1853 printk_once(KERN_WARNING "kvm: segment base is not paragraph"
1854 " aligned when entering protected mode (seg=%d)",
1855 seg);
1716} 1856}
1717 1857
1718static void enter_rmode(struct kvm_vcpu *vcpu) 1858static void enter_rmode(struct kvm_vcpu *vcpu)
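fix_rmode_seg() now masks the saved base to 0xffff0 and warns when it is not paragraph aligned: a real-mode selector addresses memory as selector * 16, so only bases that are multiples of 16 survive the round trip through the selector. A tiny demonstration of that constraint (plain arithmetic, nothing kernel-specific):

    #include <stdint.h>
    #include <stdio.h>

    /* selector whose base, selector * 16, best approximates 'base' */
    static uint16_t base_to_selector(uint32_t base)
    {
        return (uint16_t)(base >> 4);
    }

    int main(void)
    {
        uint32_t bases[] = { 0x00020000, 0x000b8000, 0x00012345 };

        for (unsigned i = 0; i < sizeof(bases) / sizeof(bases[0]); i++) {
            uint32_t base = bases[i];
            uint16_t sel = base_to_selector(base);
            uint32_t back = (uint32_t)sel << 4;

            printf("base %#07x -> selector %#06x -> base %#07x%s\n",
                   base, sel, back,
                   (base & 0xf) ? "  (low 4 bits lost: not paragraph aligned)" : "");
        }
        return 0;
    }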
@@ -1726,6 +1866,21 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
1726 vmx->emulation_required = 1; 1866 vmx->emulation_required = 1;
1727 vmx->rmode.vm86_active = 1; 1867 vmx->rmode.vm86_active = 1;
1728 1868
1869 /*
1870 * Very old userspace does not call KVM_SET_TSS_ADDR before entering
1871 * vcpu. Call it here with phys address pointing 16M below 4G.
1872 */
1873 if (!vcpu->kvm->arch.tss_addr) {
1874 printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be "
1875 "called before entering vcpu\n");
1876 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
1877 vmx_set_tss_addr(vcpu->kvm, 0xfeffd000);
1878 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
1879 }
1880
1881 vmx_segment_cache_clear(vmx);
1882
1883 vmx->rmode.tr.selector = vmcs_read16(GUEST_TR_SELECTOR);
1729 vmx->rmode.tr.base = vmcs_readl(GUEST_TR_BASE); 1884 vmx->rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
1730 vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm)); 1885 vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
1731 1886
@@ -1764,7 +1919,6 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
1764 1919
1765continue_rmode: 1920continue_rmode:
1766 kvm_mmu_reset_context(vcpu); 1921 kvm_mmu_reset_context(vcpu);
1767 init_rmode(vcpu->kvm);
1768} 1922}
1769 1923
1770static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) 1924static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
@@ -1802,6 +1956,8 @@ static void enter_lmode(struct kvm_vcpu *vcpu)
1802{ 1956{
1803 u32 guest_tr_ar; 1957 u32 guest_tr_ar;
1804 1958
1959 vmx_segment_cache_clear(to_vmx(vcpu));
1960
1805 guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES); 1961 guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
1806 if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) { 1962 if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) {
1807 printk(KERN_DEBUG "%s: tss fixup for long mode. \n", 1963 printk(KERN_DEBUG "%s: tss fixup for long mode. \n",
@@ -1841,6 +1997,13 @@ static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
1841 vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits; 1997 vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits;
1842} 1998}
1843 1999
2000static void vmx_decache_cr3(struct kvm_vcpu *vcpu)
2001{
2002 if (enable_ept && is_paging(vcpu))
2003 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
2004 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
2005}
2006
1844static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) 2007static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
1845{ 2008{
1846 ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits; 2009 ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
@@ -1856,20 +2019,20 @@ static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
1856 return; 2019 return;
1857 2020
1858 if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { 2021 if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
1859 vmcs_write64(GUEST_PDPTR0, vcpu->arch.pdptrs[0]); 2022 vmcs_write64(GUEST_PDPTR0, vcpu->arch.mmu.pdptrs[0]);
1860 vmcs_write64(GUEST_PDPTR1, vcpu->arch.pdptrs[1]); 2023 vmcs_write64(GUEST_PDPTR1, vcpu->arch.mmu.pdptrs[1]);
1861 vmcs_write64(GUEST_PDPTR2, vcpu->arch.pdptrs[2]); 2024 vmcs_write64(GUEST_PDPTR2, vcpu->arch.mmu.pdptrs[2]);
1862 vmcs_write64(GUEST_PDPTR3, vcpu->arch.pdptrs[3]); 2025 vmcs_write64(GUEST_PDPTR3, vcpu->arch.mmu.pdptrs[3]);
1863 } 2026 }
1864} 2027}
1865 2028
1866static void ept_save_pdptrs(struct kvm_vcpu *vcpu) 2029static void ept_save_pdptrs(struct kvm_vcpu *vcpu)
1867{ 2030{
1868 if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { 2031 if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
1869 vcpu->arch.pdptrs[0] = vmcs_read64(GUEST_PDPTR0); 2032 vcpu->arch.mmu.pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
1870 vcpu->arch.pdptrs[1] = vmcs_read64(GUEST_PDPTR1); 2033 vcpu->arch.mmu.pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
1871 vcpu->arch.pdptrs[2] = vmcs_read64(GUEST_PDPTR2); 2034 vcpu->arch.mmu.pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
1872 vcpu->arch.pdptrs[3] = vmcs_read64(GUEST_PDPTR3); 2035 vcpu->arch.mmu.pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
1873 } 2036 }
1874 2037
1875 __set_bit(VCPU_EXREG_PDPTR, 2038 __set_bit(VCPU_EXREG_PDPTR,
@@ -1884,6 +2047,8 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
1884 unsigned long cr0, 2047 unsigned long cr0,
1885 struct kvm_vcpu *vcpu) 2048 struct kvm_vcpu *vcpu)
1886{ 2049{
2050 if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
2051 vmx_decache_cr3(vcpu);
1887 if (!(cr0 & X86_CR0_PG)) { 2052 if (!(cr0 & X86_CR0_PG)) {
1888 /* From paging/starting to nonpaging */ 2053 /* From paging/starting to nonpaging */
1889 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, 2054 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
@@ -1941,6 +2106,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1941 vmcs_writel(CR0_READ_SHADOW, cr0); 2106 vmcs_writel(CR0_READ_SHADOW, cr0);
1942 vmcs_writel(GUEST_CR0, hw_cr0); 2107 vmcs_writel(GUEST_CR0, hw_cr0);
1943 vcpu->arch.cr0 = cr0; 2108 vcpu->arch.cr0 = cr0;
2109 __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
1944} 2110}
1945 2111
1946static u64 construct_eptp(unsigned long root_hpa) 2112static u64 construct_eptp(unsigned long root_hpa)
@@ -1964,7 +2130,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
1964 if (enable_ept) { 2130 if (enable_ept) {
1965 eptp = construct_eptp(cr3); 2131 eptp = construct_eptp(cr3);
1966 vmcs_write64(EPT_POINTER, eptp); 2132 vmcs_write64(EPT_POINTER, eptp);
1967 guest_cr3 = is_paging(vcpu) ? vcpu->arch.cr3 : 2133 guest_cr3 = is_paging(vcpu) ? kvm_read_cr3(vcpu) :
1968 vcpu->kvm->arch.ept_identity_map_addr; 2134 vcpu->kvm->arch.ept_identity_map_addr;
1969 ept_load_pdptrs(vcpu); 2135 ept_load_pdptrs(vcpu);
1970 } 2136 }
@@ -1992,23 +2158,39 @@ static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1992 vmcs_writel(GUEST_CR4, hw_cr4); 2158 vmcs_writel(GUEST_CR4, hw_cr4);
1993} 2159}
1994 2160
1995static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
1996{
1997 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
1998
1999 return vmcs_readl(sf->base);
2000}
2001
2002static void vmx_get_segment(struct kvm_vcpu *vcpu, 2161static void vmx_get_segment(struct kvm_vcpu *vcpu,
2003 struct kvm_segment *var, int seg) 2162 struct kvm_segment *var, int seg)
2004{ 2163{
2005 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 2164 struct vcpu_vmx *vmx = to_vmx(vcpu);
2165 struct kvm_save_segment *save;
2006 u32 ar; 2166 u32 ar;
2007 2167
2008 var->base = vmcs_readl(sf->base); 2168 if (vmx->rmode.vm86_active
2009 var->limit = vmcs_read32(sf->limit); 2169 && (seg == VCPU_SREG_TR || seg == VCPU_SREG_ES
2010 var->selector = vmcs_read16(sf->selector); 2170 || seg == VCPU_SREG_DS || seg == VCPU_SREG_FS
2011 ar = vmcs_read32(sf->ar_bytes); 2171 || seg == VCPU_SREG_GS)
2172 && !emulate_invalid_guest_state) {
2173 switch (seg) {
2174 case VCPU_SREG_TR: save = &vmx->rmode.tr; break;
2175 case VCPU_SREG_ES: save = &vmx->rmode.es; break;
2176 case VCPU_SREG_DS: save = &vmx->rmode.ds; break;
2177 case VCPU_SREG_FS: save = &vmx->rmode.fs; break;
2178 case VCPU_SREG_GS: save = &vmx->rmode.gs; break;
2179 default: BUG();
2180 }
2181 var->selector = save->selector;
2182 var->base = save->base;
2183 var->limit = save->limit;
2184 ar = save->ar;
2185 if (seg == VCPU_SREG_TR
2186 || var->selector == vmx_read_guest_seg_selector(vmx, seg))
2187 goto use_saved_rmode_seg;
2188 }
2189 var->base = vmx_read_guest_seg_base(vmx, seg);
2190 var->limit = vmx_read_guest_seg_limit(vmx, seg);
2191 var->selector = vmx_read_guest_seg_selector(vmx, seg);
2192 ar = vmx_read_guest_seg_ar(vmx, seg);
2193use_saved_rmode_seg:
2012 if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state) 2194 if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state)
2013 ar = 0; 2195 ar = 0;
2014 var->type = ar & 15; 2196 var->type = ar & 15;
@@ -2022,17 +2204,39 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
2022 var->unusable = (ar >> 16) & 1; 2204 var->unusable = (ar >> 16) & 1;
2023} 2205}
2024 2206
2025static int vmx_get_cpl(struct kvm_vcpu *vcpu) 2207static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
2208{
2209 struct kvm_segment s;
2210
2211 if (to_vmx(vcpu)->rmode.vm86_active) {
2212 vmx_get_segment(vcpu, &s, seg);
2213 return s.base;
2214 }
2215 return vmx_read_guest_seg_base(to_vmx(vcpu), seg);
2216}
2217
2218static int __vmx_get_cpl(struct kvm_vcpu *vcpu)
2026{ 2219{
2027 if (!is_protmode(vcpu)) 2220 if (!is_protmode(vcpu))
2028 return 0; 2221 return 0;
2029 2222
2030 if (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) /* if virtual 8086 */ 2223 if (!is_long_mode(vcpu)
2224 && (kvm_get_rflags(vcpu) & X86_EFLAGS_VM)) /* if virtual 8086 */
2031 return 3; 2225 return 3;
2032 2226
2033 return vmcs_read16(GUEST_CS_SELECTOR) & 3; 2227 return vmx_read_guest_seg_selector(to_vmx(vcpu), VCPU_SREG_CS) & 3;
2034} 2228}
2035 2229
2230static int vmx_get_cpl(struct kvm_vcpu *vcpu)
2231{
2232 if (!test_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail)) {
2233 __set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
2234 to_vmx(vcpu)->cpl = __vmx_get_cpl(vcpu);
2235 }
2236 return to_vmx(vcpu)->cpl;
2237}
2238
2239
2036static u32 vmx_segment_access_rights(struct kvm_segment *var) 2240static u32 vmx_segment_access_rights(struct kvm_segment *var)
2037{ 2241{
2038 u32 ar; 2242 u32 ar;
@@ -2062,7 +2266,10 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
2062 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 2266 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
2063 u32 ar; 2267 u32 ar;
2064 2268
2269 vmx_segment_cache_clear(vmx);
2270
2065 if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) { 2271 if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) {
2272 vmcs_write16(sf->selector, var->selector);
2066 vmx->rmode.tr.selector = var->selector; 2273 vmx->rmode.tr.selector = var->selector;
2067 vmx->rmode.tr.base = var->base; 2274 vmx->rmode.tr.base = var->base;
2068 vmx->rmode.tr.limit = var->limit; 2275 vmx->rmode.tr.limit = var->limit;
@@ -2097,11 +2304,12 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
2097 ar |= 0x1; /* Accessed */ 2304 ar |= 0x1; /* Accessed */
2098 2305
2099 vmcs_write32(sf->ar_bytes, ar); 2306 vmcs_write32(sf->ar_bytes, ar);
2307 __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
2100} 2308}
2101 2309
2102static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) 2310static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
2103{ 2311{
2104 u32 ar = vmcs_read32(GUEST_CS_AR_BYTES); 2312 u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS);
2105 2313
2106 *db = (ar >> 14) & 1; 2314 *db = (ar >> 14) & 1;
2107 *l = (ar >> 13) & 1; 2315 *l = (ar >> 13) & 1;
@@ -2323,11 +2531,12 @@ static bool guest_state_valid(struct kvm_vcpu *vcpu)
2323 2531
2324static int init_rmode_tss(struct kvm *kvm) 2532static int init_rmode_tss(struct kvm *kvm)
2325{ 2533{
2326 gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT; 2534 gfn_t fn;
2327 u16 data = 0; 2535 u16 data = 0;
2328 int ret = 0; 2536 int r, idx, ret = 0;
2329 int r;
2330 2537
2538 idx = srcu_read_lock(&kvm->srcu);
2539 fn = rmode_tss_base(kvm) >> PAGE_SHIFT;
2331 r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); 2540 r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
2332 if (r < 0) 2541 if (r < 0)
2333 goto out; 2542 goto out;
@@ -2351,12 +2560,13 @@ static int init_rmode_tss(struct kvm *kvm)
2351 2560
2352 ret = 1; 2561 ret = 1;
2353out: 2562out:
2563 srcu_read_unlock(&kvm->srcu, idx);
2354 return ret; 2564 return ret;
2355} 2565}
2356 2566
2357static int init_rmode_identity_map(struct kvm *kvm) 2567static int init_rmode_identity_map(struct kvm *kvm)
2358{ 2568{
2359 int i, r, ret; 2569 int i, idx, r, ret;
2360 pfn_t identity_map_pfn; 2570 pfn_t identity_map_pfn;
2361 u32 tmp; 2571 u32 tmp;
2362 2572
@@ -2371,6 +2581,7 @@ static int init_rmode_identity_map(struct kvm *kvm)
2371 return 1; 2581 return 1;
2372 ret = 0; 2582 ret = 0;
2373 identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT; 2583 identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT;
2584 idx = srcu_read_lock(&kvm->srcu);
2374 r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE); 2585 r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE);
2375 if (r < 0) 2586 if (r < 0)
2376 goto out; 2587 goto out;
@@ -2386,6 +2597,7 @@ static int init_rmode_identity_map(struct kvm *kvm)
2386 kvm->arch.ept_identity_pagetable_done = true; 2597 kvm->arch.ept_identity_pagetable_done = true;
2387 ret = 1; 2598 ret = 1;
2388out: 2599out:
2600 srcu_read_unlock(&kvm->srcu, idx);
2389 return ret; 2601 return ret;
2390} 2602}
2391 2603
@@ -2515,7 +2727,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2515{ 2727{
2516 u32 host_sysenter_cs, msr_low, msr_high; 2728 u32 host_sysenter_cs, msr_low, msr_high;
2517 u32 junk; 2729 u32 junk;
2518 u64 host_pat, tsc_this, tsc_base; 2730 u64 host_pat;
2519 unsigned long a; 2731 unsigned long a;
2520 struct desc_ptr dt; 2732 struct desc_ptr dt;
2521 int i; 2733 int i;
@@ -2656,32 +2868,11 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2656 vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE; 2868 vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE;
2657 vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); 2869 vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
2658 2870
2659 tsc_base = vmx->vcpu.kvm->arch.vm_init_tsc; 2871 kvm_write_tsc(&vmx->vcpu, 0);
2660 rdtscll(tsc_this);
2661 if (tsc_this < vmx->vcpu.kvm->arch.vm_init_tsc)
2662 tsc_base = tsc_this;
2663
2664 guest_write_tsc(0, tsc_base);
2665 2872
2666 return 0; 2873 return 0;
2667} 2874}
2668 2875
2669static int init_rmode(struct kvm *kvm)
2670{
2671 int idx, ret = 0;
2672
2673 idx = srcu_read_lock(&kvm->srcu);
2674 if (!init_rmode_tss(kvm))
2675 goto exit;
2676 if (!init_rmode_identity_map(kvm))
2677 goto exit;
2678
2679 ret = 1;
2680exit:
2681 srcu_read_unlock(&kvm->srcu, idx);
2682 return ret;
2683}
2684
2685static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) 2876static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2686{ 2877{
2687 struct vcpu_vmx *vmx = to_vmx(vcpu); 2878 struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -2689,10 +2880,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2689 int ret; 2880 int ret;
2690 2881
2691 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); 2882 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
2692 if (!init_rmode(vmx->vcpu.kvm)) {
2693 ret = -ENOMEM;
2694 goto out;
2695 }
2696 2883
2697 vmx->rmode.vm86_active = 0; 2884 vmx->rmode.vm86_active = 0;
2698 2885
@@ -2709,6 +2896,8 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2709 if (ret != 0) 2896 if (ret != 0)
2710 goto out; 2897 goto out;
2711 2898
2899 vmx_segment_cache_clear(vmx);
2900
2712 seg_setup(VCPU_SREG_CS); 2901 seg_setup(VCPU_SREG_CS);
2713 /* 2902 /*
2714 * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode 2903 * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
@@ -2757,7 +2946,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2757 vmcs_writel(GUEST_IDTR_BASE, 0); 2946 vmcs_writel(GUEST_IDTR_BASE, 0);
2758 vmcs_write32(GUEST_IDTR_LIMIT, 0xffff); 2947 vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
2759 2948
2760 vmcs_write32(GUEST_ACTIVITY_STATE, 0); 2949 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
2761 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); 2950 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
2762 vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0); 2951 vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
2763 2952
@@ -2772,7 +2961,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2772 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); 2961 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
2773 if (vm_need_tpr_shadow(vmx->vcpu.kvm)) 2962 if (vm_need_tpr_shadow(vmx->vcpu.kvm))
2774 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 2963 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
2775 page_to_phys(vmx->vcpu.arch.apic->regs_page)); 2964 __pa(vmx->vcpu.arch.apic->regs));
2776 vmcs_write32(TPR_THRESHOLD, 0); 2965 vmcs_write32(TPR_THRESHOLD, 0);
2777 } 2966 }
2778 2967
@@ -2819,6 +3008,10 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
2819 return; 3008 return;
2820 } 3009 }
2821 3010
3011 if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
3012 enable_irq_window(vcpu);
3013 return;
3014 }
2822 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 3015 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
2823 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING; 3016 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING;
2824 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); 3017 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
@@ -2834,16 +3027,11 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu)
2834 3027
2835 ++vcpu->stat.irq_injections; 3028 ++vcpu->stat.irq_injections;
2836 if (vmx->rmode.vm86_active) { 3029 if (vmx->rmode.vm86_active) {
2837 vmx->rmode.irq.pending = true; 3030 int inc_eip = 0;
2838 vmx->rmode.irq.vector = irq;
2839 vmx->rmode.irq.rip = kvm_rip_read(vcpu);
2840 if (vcpu->arch.interrupt.soft) 3031 if (vcpu->arch.interrupt.soft)
2841 vmx->rmode.irq.rip += 3032 inc_eip = vcpu->arch.event_exit_inst_len;
2842 vmx->vcpu.arch.event_exit_inst_len; 3033 if (kvm_inject_realmode_interrupt(vcpu, irq, inc_eip) != EMULATE_DONE)
2843 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 3034 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
2844 irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK);
2845 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
2846 kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
2847 return; 3035 return;
2848 } 3036 }
2849 intr = irq | INTR_INFO_VALID_MASK; 3037 intr = irq | INTR_INFO_VALID_MASK;
@@ -2854,6 +3042,7 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu)
2854 } else 3042 } else
2855 intr |= INTR_TYPE_EXT_INTR; 3043 intr |= INTR_TYPE_EXT_INTR;
2856 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr); 3044 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
3045 vmx_clear_hlt(vcpu);
2857} 3046}
2858 3047
2859static void vmx_inject_nmi(struct kvm_vcpu *vcpu) 3048static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
@@ -2874,19 +3063,15 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
2874 } 3063 }
2875 3064
2876 ++vcpu->stat.nmi_injections; 3065 ++vcpu->stat.nmi_injections;
3066 vmx->nmi_known_unmasked = false;
2877 if (vmx->rmode.vm86_active) { 3067 if (vmx->rmode.vm86_active) {
2878 vmx->rmode.irq.pending = true; 3068 if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0) != EMULATE_DONE)
2879 vmx->rmode.irq.vector = NMI_VECTOR; 3069 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
2880 vmx->rmode.irq.rip = kvm_rip_read(vcpu);
2881 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2882 NMI_VECTOR | INTR_TYPE_SOFT_INTR |
2883 INTR_INFO_VALID_MASK);
2884 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
2885 kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
2886 return; 3070 return;
2887 } 3071 }
2888 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 3072 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2889 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); 3073 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
3074 vmx_clear_hlt(vcpu);
2890} 3075}
2891 3076
2892static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) 3077static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
@@ -2895,13 +3080,16 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
2895 return 0; 3080 return 0;
2896 3081
2897 return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3082 return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
2898 (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_NMI)); 3083 (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI
3084 | GUEST_INTR_STATE_NMI));
2899} 3085}
2900 3086
2901static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) 3087static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
2902{ 3088{
2903 if (!cpu_has_virtual_nmis()) 3089 if (!cpu_has_virtual_nmis())
2904 return to_vmx(vcpu)->soft_vnmi_blocked; 3090 return to_vmx(vcpu)->soft_vnmi_blocked;
3091 if (to_vmx(vcpu)->nmi_known_unmasked)
3092 return false;
2905 return vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI; 3093 return vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI;
2906} 3094}
2907 3095
@@ -2915,6 +3103,7 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
2915 vmx->vnmi_blocked_time = 0; 3103 vmx->vnmi_blocked_time = 0;
2916 } 3104 }
2917 } else { 3105 } else {
3106 vmx->nmi_known_unmasked = !masked;
2918 if (masked) 3107 if (masked)
2919 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 3108 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
2920 GUEST_INTR_STATE_NMI); 3109 GUEST_INTR_STATE_NMI);
@@ -2945,6 +3134,9 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
2945 if (ret) 3134 if (ret)
2946 return ret; 3135 return ret;
2947 kvm->arch.tss_addr = addr; 3136 kvm->arch.tss_addr = addr;
3137 if (!init_rmode_tss(kvm))
3138 return -ENOMEM;
3139
2948 return 0; 3140 return 0;
2949} 3141}
2950 3142
@@ -2956,7 +3148,7 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
2956 * Cause the #SS fault with 0 error code in VM86 mode. 3148 * Cause the #SS fault with 0 error code in VM86 mode.
2957 */ 3149 */
2958 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) 3150 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0)
2959 if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE) 3151 if (emulate_instruction(vcpu, 0) == EMULATE_DONE)
2960 return 1; 3152 return 1;
2961 /* 3153 /*
2962 * Forward all other exceptions that are valid in real mode. 3154 * Forward all other exceptions that are valid in real mode.
@@ -3029,7 +3221,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
3029 enum emulation_result er; 3221 enum emulation_result er;
3030 3222
3031 vect_info = vmx->idt_vectoring_info; 3223 vect_info = vmx->idt_vectoring_info;
3032 intr_info = vmcs_read32(VM_EXIT_INTR_INFO); 3224 intr_info = vmx->exit_intr_info;
3033 3225
3034 if (is_machine_check(intr_info)) 3226 if (is_machine_check(intr_info))
3035 return handle_machine_check(vcpu); 3227 return handle_machine_check(vcpu);
@@ -3053,14 +3245,13 @@ static int handle_exception(struct kvm_vcpu *vcpu)
3053 } 3245 }
3054 3246
3055 if (is_invalid_opcode(intr_info)) { 3247 if (is_invalid_opcode(intr_info)) {
3056 er = emulate_instruction(vcpu, 0, 0, EMULTYPE_TRAP_UD); 3248 er = emulate_instruction(vcpu, EMULTYPE_TRAP_UD);
3057 if (er != EMULATE_DONE) 3249 if (er != EMULATE_DONE)
3058 kvm_queue_exception(vcpu, UD_VECTOR); 3250 kvm_queue_exception(vcpu, UD_VECTOR);
3059 return 1; 3251 return 1;
3060 } 3252 }
3061 3253
3062 error_code = 0; 3254 error_code = 0;
3063 rip = kvm_rip_read(vcpu);
3064 if (intr_info & INTR_INFO_DELIVER_CODE_MASK) 3255 if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
3065 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 3256 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
3066 if (is_page_fault(intr_info)) { 3257 if (is_page_fault(intr_info)) {
@@ -3072,7 +3263,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
3072 3263
3073 if (kvm_event_needs_reinjection(vcpu)) 3264 if (kvm_event_needs_reinjection(vcpu))
3074 kvm_mmu_unprotect_page_virt(vcpu, cr2); 3265 kvm_mmu_unprotect_page_virt(vcpu, cr2);
3075 return kvm_mmu_page_fault(vcpu, cr2, error_code); 3266 return kvm_mmu_page_fault(vcpu, cr2, error_code, NULL, 0);
3076 } 3267 }
3077 3268
3078 if (vmx->rmode.vm86_active && 3269 if (vmx->rmode.vm86_active &&
@@ -3107,6 +3298,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
3107 vmx->vcpu.arch.event_exit_inst_len = 3298 vmx->vcpu.arch.event_exit_inst_len =
3108 vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 3299 vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
3109 kvm_run->exit_reason = KVM_EXIT_DEBUG; 3300 kvm_run->exit_reason = KVM_EXIT_DEBUG;
3301 rip = kvm_rip_read(vcpu);
3110 kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip; 3302 kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip;
3111 kvm_run->debug.arch.exception = ex_no; 3303 kvm_run->debug.arch.exception = ex_no;
3112 break; 3304 break;
@@ -3144,7 +3336,7 @@ static int handle_io(struct kvm_vcpu *vcpu)
3144 ++vcpu->stat.io_exits; 3336 ++vcpu->stat.io_exits;
3145 3337
3146 if (string || in) 3338 if (string || in)
3147 return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE; 3339 return emulate_instruction(vcpu, 0) == EMULATE_DONE;
3148 3340
3149 port = exit_qualification >> 16; 3341 port = exit_qualification >> 16;
3150 size = (exit_qualification & 7) + 1; 3342 size = (exit_qualification & 7) + 1;
@@ -3164,14 +3356,6 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
3164 hypercall[2] = 0xc1; 3356 hypercall[2] = 0xc1;
3165} 3357}
3166 3358
3167static void complete_insn_gp(struct kvm_vcpu *vcpu, int err)
3168{
3169 if (err)
3170 kvm_inject_gp(vcpu, 0);
3171 else
3172 skip_emulated_instruction(vcpu);
3173}
3174
3175static int handle_cr(struct kvm_vcpu *vcpu) 3359static int handle_cr(struct kvm_vcpu *vcpu)
3176{ 3360{
3177 unsigned long exit_qualification, val; 3361 unsigned long exit_qualification, val;
@@ -3189,21 +3373,21 @@ static int handle_cr(struct kvm_vcpu *vcpu)
3189 switch (cr) { 3373 switch (cr) {
3190 case 0: 3374 case 0:
3191 err = kvm_set_cr0(vcpu, val); 3375 err = kvm_set_cr0(vcpu, val);
3192 complete_insn_gp(vcpu, err); 3376 kvm_complete_insn_gp(vcpu, err);
3193 return 1; 3377 return 1;
3194 case 3: 3378 case 3:
3195 err = kvm_set_cr3(vcpu, val); 3379 err = kvm_set_cr3(vcpu, val);
3196 complete_insn_gp(vcpu, err); 3380 kvm_complete_insn_gp(vcpu, err);
3197 return 1; 3381 return 1;
3198 case 4: 3382 case 4:
3199 err = kvm_set_cr4(vcpu, val); 3383 err = kvm_set_cr4(vcpu, val);
3200 complete_insn_gp(vcpu, err); 3384 kvm_complete_insn_gp(vcpu, err);
3201 return 1; 3385 return 1;
3202 case 8: { 3386 case 8: {
3203 u8 cr8_prev = kvm_get_cr8(vcpu); 3387 u8 cr8_prev = kvm_get_cr8(vcpu);
3204 u8 cr8 = kvm_register_read(vcpu, reg); 3388 u8 cr8 = kvm_register_read(vcpu, reg);
3205 kvm_set_cr8(vcpu, cr8); 3389 err = kvm_set_cr8(vcpu, cr8);
3206 skip_emulated_instruction(vcpu); 3390 kvm_complete_insn_gp(vcpu, err);
3207 if (irqchip_in_kernel(vcpu->kvm)) 3391 if (irqchip_in_kernel(vcpu->kvm))
3208 return 1; 3392 return 1;
3209 if (cr8_prev <= cr8) 3393 if (cr8_prev <= cr8)
@@ -3222,8 +3406,9 @@ static int handle_cr(struct kvm_vcpu *vcpu)
3222 case 1: /*mov from cr*/ 3406 case 1: /*mov from cr*/
3223 switch (cr) { 3407 switch (cr) {
3224 case 3: 3408 case 3:
3225 kvm_register_write(vcpu, reg, vcpu->arch.cr3); 3409 val = kvm_read_cr3(vcpu);
3226 trace_kvm_cr_read(cr, vcpu->arch.cr3); 3410 kvm_register_write(vcpu, reg, val);
3411 trace_kvm_cr_read(cr, val);
3227 skip_emulated_instruction(vcpu); 3412 skip_emulated_instruction(vcpu);
3228 return 1; 3413 return 1;
3229 case 8: 3414 case 8:
@@ -3346,6 +3531,7 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu)
3346 3531
3347static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) 3532static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
3348{ 3533{
3534 kvm_make_request(KVM_REQ_EVENT, vcpu);
3349 return 1; 3535 return 1;
3350} 3536}
3351 3537
@@ -3358,6 +3544,8 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu)
3358 cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; 3544 cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
3359 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); 3545 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
3360 3546
3547 kvm_make_request(KVM_REQ_EVENT, vcpu);
3548
3361 ++vcpu->stat.irq_window_exits; 3549 ++vcpu->stat.irq_window_exits;
3362 3550
3363 /* 3551 /*
@@ -3392,6 +3580,11 @@ static int handle_vmx_insn(struct kvm_vcpu *vcpu)
3392 return 1; 3580 return 1;
3393} 3581}
3394 3582
3583static int handle_invd(struct kvm_vcpu *vcpu)
3584{
3585 return emulate_instruction(vcpu, 0) == EMULATE_DONE;
3586}
3587
3395static int handle_invlpg(struct kvm_vcpu *vcpu) 3588static int handle_invlpg(struct kvm_vcpu *vcpu)
3396{ 3589{
3397 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 3590 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
@@ -3420,7 +3613,7 @@ static int handle_xsetbv(struct kvm_vcpu *vcpu)
3420 3613
3421static int handle_apic_access(struct kvm_vcpu *vcpu) 3614static int handle_apic_access(struct kvm_vcpu *vcpu)
3422{ 3615{
3423 return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE; 3616 return emulate_instruction(vcpu, 0) == EMULATE_DONE;
3424} 3617}
3425 3618
3426static int handle_task_switch(struct kvm_vcpu *vcpu) 3619static int handle_task_switch(struct kvm_vcpu *vcpu)
@@ -3442,9 +3635,7 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
3442 switch (type) { 3635 switch (type) {
3443 case INTR_TYPE_NMI_INTR: 3636 case INTR_TYPE_NMI_INTR:
3444 vcpu->arch.nmi_injected = false; 3637 vcpu->arch.nmi_injected = false;
3445 if (cpu_has_virtual_nmis()) 3638 vmx_set_nmi_mask(vcpu, true);
3446 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
3447 GUEST_INTR_STATE_NMI);
3448 break; 3639 break;
3449 case INTR_TYPE_EXT_INTR: 3640 case INTR_TYPE_EXT_INTR:
3450 case INTR_TYPE_SOFT_INTR: 3641 case INTR_TYPE_SOFT_INTR:
@@ -3519,7 +3710,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
3519 3710
3520 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); 3711 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
3521 trace_kvm_page_fault(gpa, exit_qualification); 3712 trace_kvm_page_fault(gpa, exit_qualification);
3522 return kvm_mmu_page_fault(vcpu, gpa & PAGE_MASK, 0); 3713 return kvm_mmu_page_fault(vcpu, gpa, exit_qualification & 0x3, NULL, 0);
3523} 3714}
3524 3715
3525static u64 ept_rsvd_mask(u64 spte, int level) 3716static u64 ept_rsvd_mask(u64 spte, int level)
@@ -3614,6 +3805,7 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu)
3614 cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING; 3805 cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
3615 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); 3806 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
3616 ++vcpu->stat.nmi_window_exits; 3807 ++vcpu->stat.nmi_window_exits;
3808 kvm_make_request(KVM_REQ_EVENT, vcpu);
3617 3809
3618 return 1; 3810 return 1;
3619} 3811}
@@ -3623,9 +3815,18 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
3623 struct vcpu_vmx *vmx = to_vmx(vcpu); 3815 struct vcpu_vmx *vmx = to_vmx(vcpu);
3624 enum emulation_result err = EMULATE_DONE; 3816 enum emulation_result err = EMULATE_DONE;
3625 int ret = 1; 3817 int ret = 1;
3818 u32 cpu_exec_ctrl;
3819 bool intr_window_requested;
3820
3821 cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
3822 intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING;
3626 3823
3627 while (!guest_state_valid(vcpu)) { 3824 while (!guest_state_valid(vcpu)) {
3628 err = emulate_instruction(vcpu, 0, 0, 0); 3825 if (intr_window_requested
3826 && (kvm_get_rflags(&vmx->vcpu) & X86_EFLAGS_IF))
3827 return handle_interrupt_window(&vmx->vcpu);
3828
3829 err = emulate_instruction(vcpu, 0);
3629 3830
3630 if (err == EMULATE_DO_MMIO) { 3831 if (err == EMULATE_DO_MMIO) {
3631 ret = 0; 3832 ret = 0;
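The reworked loop above checks, before each emulated instruction, whether an interrupt window was requested and the guest currently has IF set; if so it bails out so the pending interrupt can be injected instead of being starved by a long stretch of invalid-guest-state emulation. A toy version of that control flow with stubbed-out emulator and state checks (all stubs are invented here):

    #include <stdbool.h>
    #include <stdio.h>

    enum emu_result { EMU_DONE, EMU_MMIO, EMU_FAIL };

    static int steps_remaining = 5;         /* state becomes valid after 5 steps */
    static int steps_done;
    static bool intr_window_requested = true;

    static bool guest_state_valid(void) { return steps_remaining == 0; }
    static bool guest_irq_enabled(void) { return steps_done >= 3; } /* IF set later */

    static enum emu_result emulate_one(void)
    {
        steps_remaining--;
        steps_done++;
        return EMU_DONE;
    }

    static int handle_interrupt_window(void)
    {
        printf("bailing out after %d steps to deliver an interrupt\n", steps_done);
        return 1;
    }

    static int emulate_until_valid(void)
    {
        while (!guest_state_valid()) {
            if (intr_window_requested && guest_irq_enabled())
                return handle_interrupt_window();

            switch (emulate_one()) {
            case EMU_MMIO:
                return 0;               /* let userspace complete the access */
            case EMU_FAIL:
                printf("emulation failed\n");
                return 0;
            case EMU_DONE:
                break;
            }
        }
        return 1;
    }

    int main(void)
    {
        return !emulate_until_valid();
    }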
@@ -3682,6 +3883,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
3682 [EXIT_REASON_MSR_WRITE] = handle_wrmsr, 3883 [EXIT_REASON_MSR_WRITE] = handle_wrmsr,
3683 [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, 3884 [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window,
3684 [EXIT_REASON_HLT] = handle_halt, 3885 [EXIT_REASON_HLT] = handle_halt,
3886 [EXIT_REASON_INVD] = handle_invd,
3685 [EXIT_REASON_INVLPG] = handle_invlpg, 3887 [EXIT_REASON_INVLPG] = handle_invlpg,
3686 [EXIT_REASON_VMCALL] = handle_vmcall, 3888 [EXIT_REASON_VMCALL] = handle_vmcall,
3687 [EXIT_REASON_VMCLEAR] = handle_vmx_insn, 3889 [EXIT_REASON_VMCLEAR] = handle_vmx_insn,
@@ -3709,6 +3911,12 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
3709static const int kvm_vmx_max_exit_handlers = 3911static const int kvm_vmx_max_exit_handlers =
3710 ARRAY_SIZE(kvm_vmx_exit_handlers); 3912 ARRAY_SIZE(kvm_vmx_exit_handlers);
3711 3913
3914static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
3915{
3916 *info1 = vmcs_readl(EXIT_QUALIFICATION);
3917 *info2 = vmcs_read32(VM_EXIT_INTR_INFO);
3918}
3919
3712/* 3920/*
3713 * The guest has exited. See if we can fix it or if we need userspace 3921 * The guest has exited. See if we can fix it or if we need userspace
3714 * assistance. 3922 * assistance.
@@ -3719,17 +3927,12 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
3719 u32 exit_reason = vmx->exit_reason; 3927 u32 exit_reason = vmx->exit_reason;
3720 u32 vectoring_info = vmx->idt_vectoring_info; 3928 u32 vectoring_info = vmx->idt_vectoring_info;
3721 3929
3722 trace_kvm_exit(exit_reason, vcpu); 3930 trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX);
3723 3931
3724 /* If guest state is invalid, start emulating */ 3932 /* If guest state is invalid, start emulating */
3725 if (vmx->emulation_required && emulate_invalid_guest_state) 3933 if (vmx->emulation_required && emulate_invalid_guest_state)
3726 return handle_invalid_guest_state(vcpu); 3934 return handle_invalid_guest_state(vcpu);
3727 3935
3728 /* Access CR3 don't cause VMExit in paging mode, so we need
3729 * to sync with guest real CR3. */
3730 if (enable_ept && is_paging(vcpu))
3731 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
3732
3733 if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) { 3936 if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
3734 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; 3937 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
3735 vcpu->run->fail_entry.hardware_entry_failure_reason 3938 vcpu->run->fail_entry.hardware_entry_failure_reason
@@ -3790,23 +3993,19 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
3790 vmcs_write32(TPR_THRESHOLD, irr); 3993 vmcs_write32(TPR_THRESHOLD, irr);
3791} 3994}
3792 3995
3793static void vmx_complete_interrupts(struct vcpu_vmx *vmx) 3996static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
3794{ 3997{
3795 u32 exit_intr_info; 3998 u32 exit_intr_info;
3796 u32 idt_vectoring_info = vmx->idt_vectoring_info;
3797 bool unblock_nmi;
3798 u8 vector;
3799 int type;
3800 bool idtv_info_valid;
3801 3999
3802 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); 4000 if (!(vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY
4001 || vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI))
4002 return;
3803 4003
3804 vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); 4004 vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
4005 exit_intr_info = vmx->exit_intr_info;
3805 4006
3806 /* Handle machine checks before interrupts are enabled */ 4007 /* Handle machine checks before interrupts are enabled */
3807 if ((vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY) 4008 if (is_machine_check(exit_intr_info))
3808 || (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI
3809 && is_machine_check(exit_intr_info)))
3810 kvm_machine_check(); 4009 kvm_machine_check();
3811 4010
3812 /* We need to handle NMIs before interrupts are enabled */ 4011 /* We need to handle NMIs before interrupts are enabled */
@@ -3816,10 +4015,25 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
3816 asm("int $2"); 4015 asm("int $2");
3817 kvm_after_handle_nmi(&vmx->vcpu); 4016 kvm_after_handle_nmi(&vmx->vcpu);
3818 } 4017 }
4018}
3819 4019
3820 idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; 4020static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
4021{
4022 u32 exit_intr_info;
4023 bool unblock_nmi;
4024 u8 vector;
4025 bool idtv_info_valid;
4026
4027 idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK;
3821 4028
3822 if (cpu_has_virtual_nmis()) { 4029 if (cpu_has_virtual_nmis()) {
4030 if (vmx->nmi_known_unmasked)
4031 return;
4032 /*
4033 * Can't use vmx->exit_intr_info since we're not sure what
4034 * the exit reason is.
4035 */
4036 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
3823 unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; 4037 unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
3824 vector = exit_intr_info & INTR_INFO_VECTOR_MASK; 4038 vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
3825 /* 4039 /*
@@ -3836,9 +4050,25 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
3836 vector != DF_VECTOR && !idtv_info_valid) 4050 vector != DF_VECTOR && !idtv_info_valid)
3837 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 4051 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
3838 GUEST_INTR_STATE_NMI); 4052 GUEST_INTR_STATE_NMI);
4053 else
4054 vmx->nmi_known_unmasked =
4055 !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)
4056 & GUEST_INTR_STATE_NMI);
3839 } else if (unlikely(vmx->soft_vnmi_blocked)) 4057 } else if (unlikely(vmx->soft_vnmi_blocked))
3840 vmx->vnmi_blocked_time += 4058 vmx->vnmi_blocked_time +=
3841 ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time)); 4059 ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time));
4060}
4061
4062static void __vmx_complete_interrupts(struct vcpu_vmx *vmx,
4063 u32 idt_vectoring_info,
4064 int instr_len_field,
4065 int error_code_field)
4066{
4067 u8 vector;
4068 int type;
4069 bool idtv_info_valid;
4070
4071 idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
3842 4072
3843 vmx->vcpu.arch.nmi_injected = false; 4073 vmx->vcpu.arch.nmi_injected = false;
3844 kvm_clear_exception_queue(&vmx->vcpu); 4074 kvm_clear_exception_queue(&vmx->vcpu);
@@ -3847,6 +4077,8 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
3847 if (!idtv_info_valid) 4077 if (!idtv_info_valid)
3848 return; 4078 return;
3849 4079
4080 kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu);
4081
3850 vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK; 4082 vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
3851 type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK; 4083 type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
3852 4084
@@ -3858,23 +4090,22 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 3858 * Clear bit "block by NMI" before VM entry if an NMI 4090
3859 * delivery faulted. 4091 * delivery faulted.
3860 */ 4092 */
3861 vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, 4093 vmx_set_nmi_mask(&vmx->vcpu, false);
3862 GUEST_INTR_STATE_NMI);
3863 break; 4094 break;
3864 case INTR_TYPE_SOFT_EXCEPTION: 4095 case INTR_TYPE_SOFT_EXCEPTION:
3865 vmx->vcpu.arch.event_exit_inst_len = 4096 vmx->vcpu.arch.event_exit_inst_len =
3866 vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 4097 vmcs_read32(instr_len_field);
3867 /* fall through */ 4098 /* fall through */
3868 case INTR_TYPE_HARD_EXCEPTION: 4099 case INTR_TYPE_HARD_EXCEPTION:
3869 if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { 4100 if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
3870 u32 err = vmcs_read32(IDT_VECTORING_ERROR_CODE); 4101 u32 err = vmcs_read32(error_code_field);
3871 kvm_queue_exception_e(&vmx->vcpu, vector, err); 4102 kvm_queue_exception_e(&vmx->vcpu, vector, err);
3872 } else 4103 } else
3873 kvm_queue_exception(&vmx->vcpu, vector); 4104 kvm_queue_exception(&vmx->vcpu, vector);
3874 break; 4105 break;
3875 case INTR_TYPE_SOFT_INTR: 4106 case INTR_TYPE_SOFT_INTR:
3876 vmx->vcpu.arch.event_exit_inst_len = 4107 vmx->vcpu.arch.event_exit_inst_len =
3877 vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 4108 vmcs_read32(instr_len_field);
3878 /* fall through */ 4109 /* fall through */
3879 case INTR_TYPE_EXT_INTR: 4110 case INTR_TYPE_EXT_INTR:
3880 kvm_queue_interrupt(&vmx->vcpu, vector, 4111 kvm_queue_interrupt(&vmx->vcpu, vector,
@@ -3885,27 +4116,21 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
3885 } 4116 }
3886} 4117}
3887 4118
3888/* 4119static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
3889 * Failure to inject an interrupt should give us the information
3890 * in IDT_VECTORING_INFO_FIELD. However, if the failure occurs
3891 * when fetching the interrupt redirection bitmap in the real-mode
3892 * tss, this doesn't happen. So we do it ourselves.
3893 */
3894static void fixup_rmode_irq(struct vcpu_vmx *vmx)
3895{ 4120{
3896 vmx->rmode.irq.pending = 0; 4121 __vmx_complete_interrupts(vmx, vmx->idt_vectoring_info,
3897 if (kvm_rip_read(&vmx->vcpu) + 1 != vmx->rmode.irq.rip) 4122 VM_EXIT_INSTRUCTION_LEN,
3898 return; 4123 IDT_VECTORING_ERROR_CODE);
3899 kvm_rip_write(&vmx->vcpu, vmx->rmode.irq.rip); 4124}
3900 if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) { 4125
3901 vmx->idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK; 4126static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
3902 vmx->idt_vectoring_info |= INTR_TYPE_EXT_INTR; 4127{
3903 return; 4128 __vmx_complete_interrupts(to_vmx(vcpu),
3904 } 4129 vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
3905 vmx->idt_vectoring_info = 4130 VM_ENTRY_INSTRUCTION_LEN,
3906 VECTORING_INFO_VALID_MASK 4131 VM_ENTRY_EXCEPTION_ERROR_CODE);
3907 | INTR_TYPE_EXT_INTR 4132
3908 | vmx->rmode.irq.vector; 4133 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
3909} 4134}
3910 4135
3911#ifdef CONFIG_X86_64 4136#ifdef CONFIG_X86_64
@@ -3916,7 +4141,7 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx)
3916#define Q "l" 4141#define Q "l"
3917#endif 4142#endif
3918 4143
3919static void vmx_vcpu_run(struct kvm_vcpu *vcpu) 4144static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
3920{ 4145{
3921 struct vcpu_vmx *vmx = to_vmx(vcpu); 4146 struct vcpu_vmx *vmx = to_vmx(vcpu);
3922 4147
@@ -3945,6 +4170,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
3945 asm( 4170 asm(
3946 /* Store host registers */ 4171 /* Store host registers */
3947 "push %%"R"dx; push %%"R"bp;" 4172 "push %%"R"dx; push %%"R"bp;"
4173 "push %%"R"cx \n\t" /* placeholder for guest rcx */
3948 "push %%"R"cx \n\t" 4174 "push %%"R"cx \n\t"
3949 "cmp %%"R"sp, %c[host_rsp](%0) \n\t" 4175 "cmp %%"R"sp, %c[host_rsp](%0) \n\t"
3950 "je 1f \n\t" 4176 "je 1f \n\t"
@@ -3986,10 +4212,11 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
3986 ".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t" 4212 ".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t"
3987 ".Lkvm_vmx_return: " 4213 ".Lkvm_vmx_return: "
3988 /* Save guest registers, load host registers, keep flags */ 4214 /* Save guest registers, load host registers, keep flags */
3989 "xchg %0, (%%"R"sp) \n\t" 4215 "mov %0, %c[wordsize](%%"R"sp) \n\t"
4216 "pop %0 \n\t"
3990 "mov %%"R"ax, %c[rax](%0) \n\t" 4217 "mov %%"R"ax, %c[rax](%0) \n\t"
3991 "mov %%"R"bx, %c[rbx](%0) \n\t" 4218 "mov %%"R"bx, %c[rbx](%0) \n\t"
3992 "push"Q" (%%"R"sp); pop"Q" %c[rcx](%0) \n\t" 4219 "pop"Q" %c[rcx](%0) \n\t"
3993 "mov %%"R"dx, %c[rdx](%0) \n\t" 4220 "mov %%"R"dx, %c[rdx](%0) \n\t"
3994 "mov %%"R"si, %c[rsi](%0) \n\t" 4221 "mov %%"R"si, %c[rsi](%0) \n\t"
3995 "mov %%"R"di, %c[rdi](%0) \n\t" 4222 "mov %%"R"di, %c[rdi](%0) \n\t"
@@ -4007,7 +4234,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
4007 "mov %%cr2, %%"R"ax \n\t" 4234 "mov %%cr2, %%"R"ax \n\t"
4008 "mov %%"R"ax, %c[cr2](%0) \n\t" 4235 "mov %%"R"ax, %c[cr2](%0) \n\t"
4009 4236
4010 "pop %%"R"bp; pop %%"R"bp; pop %%"R"dx \n\t" 4237 "pop %%"R"bp; pop %%"R"dx \n\t"
4011 "setbe %c[fail](%0) \n\t" 4238 "setbe %c[fail](%0) \n\t"
4012 : : "c"(vmx), "d"((unsigned long)HOST_RSP), 4239 : : "c"(vmx), "d"((unsigned long)HOST_RSP),
4013 [launched]"i"(offsetof(struct vcpu_vmx, launched)), 4240 [launched]"i"(offsetof(struct vcpu_vmx, launched)),
@@ -4030,25 +4257,32 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
4030 [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])), 4257 [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])),
4031 [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])), 4258 [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])),
4032#endif 4259#endif
4033 [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)) 4260 [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)),
4261 [wordsize]"i"(sizeof(ulong))
4034 : "cc", "memory" 4262 : "cc", "memory"
4035 , R"bx", R"di", R"si" 4263 , R"ax", R"bx", R"di", R"si"
4036#ifdef CONFIG_X86_64 4264#ifdef CONFIG_X86_64
4037 , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" 4265 , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
4038#endif 4266#endif
4039 ); 4267 );
4040 4268
4041 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP) 4269 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
4042 | (1 << VCPU_EXREG_PDPTR)); 4270 | (1 << VCPU_EXREG_RFLAGS)
4271 | (1 << VCPU_EXREG_CPL)
4272 | (1 << VCPU_EXREG_PDPTR)
4273 | (1 << VCPU_EXREG_SEGMENTS)
4274 | (1 << VCPU_EXREG_CR3));
4043 vcpu->arch.regs_dirty = 0; 4275 vcpu->arch.regs_dirty = 0;
4044 4276
4045 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); 4277 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
4046 if (vmx->rmode.irq.pending)
4047 fixup_rmode_irq(vmx);
4048 4278
4049 asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); 4279 asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
4050 vmx->launched = 1; 4280 vmx->launched = 1;
4051 4281
4282 vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
4283
4284 vmx_complete_atomic_exit(vmx);
4285 vmx_recover_nmi_blocking(vmx);
4052 vmx_complete_interrupts(vmx); 4286 vmx_complete_interrupts(vmx);
4053} 4287}
4054 4288
@@ -4106,8 +4340,8 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
4106 goto free_vcpu; 4340 goto free_vcpu;
4107 4341
4108 vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); 4342 vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
4343 err = -ENOMEM;
4109 if (!vmx->guest_msrs) { 4344 if (!vmx->guest_msrs) {
4110 err = -ENOMEM;
4111 goto uninit_vcpu; 4345 goto uninit_vcpu;
4112 } 4346 }
4113 4347
@@ -4119,21 +4353,26 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
4119 4353
4120 cpu = get_cpu(); 4354 cpu = get_cpu();
4121 vmx_vcpu_load(&vmx->vcpu, cpu); 4355 vmx_vcpu_load(&vmx->vcpu, cpu);
4356 vmx->vcpu.cpu = cpu;
4122 err = vmx_vcpu_setup(vmx); 4357 err = vmx_vcpu_setup(vmx);
4123 vmx_vcpu_put(&vmx->vcpu); 4358 vmx_vcpu_put(&vmx->vcpu);
4124 put_cpu(); 4359 put_cpu();
4125 if (err) 4360 if (err)
4126 goto free_vmcs; 4361 goto free_vmcs;
4127 if (vm_need_virtualize_apic_accesses(kvm)) 4362 if (vm_need_virtualize_apic_accesses(kvm))
4128 if (alloc_apic_access_page(kvm) != 0) 4363 err = alloc_apic_access_page(kvm);
4364 if (err)
4129 goto free_vmcs; 4365 goto free_vmcs;
4130 4366
4131 if (enable_ept) { 4367 if (enable_ept) {
4132 if (!kvm->arch.ept_identity_map_addr) 4368 if (!kvm->arch.ept_identity_map_addr)
4133 kvm->arch.ept_identity_map_addr = 4369 kvm->arch.ept_identity_map_addr =
4134 VMX_EPT_IDENTITY_PAGETABLE_ADDR; 4370 VMX_EPT_IDENTITY_PAGETABLE_ADDR;
4371 err = -ENOMEM;
4135 if (alloc_identity_pagetable(kvm) != 0) 4372 if (alloc_identity_pagetable(kvm) != 0)
4136 goto free_vmcs; 4373 goto free_vmcs;
4374 if (!init_rmode_identity_map(kvm))
4375 goto free_vmcs;
4137 } 4376 }
4138 4377
4139 return &vmx->vcpu; 4378 return &vmx->vcpu;
@@ -4249,11 +4488,6 @@ static int vmx_get_lpage_level(void)
4249 return PT_PDPE_LEVEL; 4488 return PT_PDPE_LEVEL;
4250} 4489}
4251 4490
4252static inline u32 bit(int bitno)
4253{
4254 return 1 << (bitno & 31);
4255}
4256
4257static void vmx_cpuid_update(struct kvm_vcpu *vcpu) 4491static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
4258{ 4492{
4259 struct kvm_cpuid_entry2 *best; 4493 struct kvm_cpuid_entry2 *best;
@@ -4280,6 +4514,13 @@ static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
4280{ 4514{
4281} 4515}
4282 4516
4517static int vmx_check_intercept(struct kvm_vcpu *vcpu,
4518 struct x86_instruction_info *info,
4519 enum x86_intercept_stage stage)
4520{
4521 return X86EMUL_CONTINUE;
4522}
4523
4283static struct kvm_x86_ops vmx_x86_ops = { 4524static struct kvm_x86_ops vmx_x86_ops = {
4284 .cpu_has_kvm_support = cpu_has_kvm_support, 4525 .cpu_has_kvm_support = cpu_has_kvm_support,
4285 .disabled_by_bios = vmx_disabled_by_bios, 4526 .disabled_by_bios = vmx_disabled_by_bios,
@@ -4307,6 +4548,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
4307 .get_cpl = vmx_get_cpl, 4548 .get_cpl = vmx_get_cpl,
4308 .get_cs_db_l_bits = vmx_get_cs_db_l_bits, 4549 .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
4309 .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits, 4550 .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits,
4551 .decache_cr3 = vmx_decache_cr3,
4310 .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits, 4552 .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
4311 .set_cr0 = vmx_set_cr0, 4553 .set_cr0 = vmx_set_cr0,
4312 .set_cr3 = vmx_set_cr3, 4554 .set_cr3 = vmx_set_cr3,
@@ -4334,6 +4576,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
4334 .set_irq = vmx_inject_irq, 4576 .set_irq = vmx_inject_irq,
4335 .set_nmi = vmx_inject_nmi, 4577 .set_nmi = vmx_inject_nmi,
4336 .queue_exception = vmx_queue_exception, 4578 .queue_exception = vmx_queue_exception,
4579 .cancel_injection = vmx_cancel_injection,
4337 .interrupt_allowed = vmx_interrupt_allowed, 4580 .interrupt_allowed = vmx_interrupt_allowed,
4338 .nmi_allowed = vmx_nmi_allowed, 4581 .nmi_allowed = vmx_nmi_allowed,
4339 .get_nmi_mask = vmx_get_nmi_mask, 4582 .get_nmi_mask = vmx_get_nmi_mask,
@@ -4346,7 +4589,9 @@ static struct kvm_x86_ops vmx_x86_ops = {
4346 .get_tdp_level = get_ept_level, 4589 .get_tdp_level = get_ept_level,
4347 .get_mt_mask = vmx_get_mt_mask, 4590 .get_mt_mask = vmx_get_mt_mask,
4348 4591
4592 .get_exit_info = vmx_get_exit_info,
4349 .exit_reasons_str = vmx_exit_reasons_str, 4593 .exit_reasons_str = vmx_exit_reasons_str,
4594
4350 .get_lpage_level = vmx_get_lpage_level, 4595 .get_lpage_level = vmx_get_lpage_level,
4351 4596
4352 .cpuid_update = vmx_cpuid_update, 4597 .cpuid_update = vmx_cpuid_update,
@@ -4356,6 +4601,15 @@ static struct kvm_x86_ops vmx_x86_ops = {
4356 .set_supported_cpuid = vmx_set_supported_cpuid, 4601 .set_supported_cpuid = vmx_set_supported_cpuid,
4357 4602
4358 .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, 4603 .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
4604
4605 .set_tsc_khz = vmx_set_tsc_khz,
4606 .write_tsc_offset = vmx_write_tsc_offset,
4607 .adjust_tsc_offset = vmx_adjust_tsc_offset,
4608 .compute_tsc_offset = vmx_compute_tsc_offset,
4609
4610 .set_tdp_cr3 = vmx_set_cr3,
4611
4612 .check_intercept = vmx_check_intercept,
4359}; 4613};
4360 4614
4361static int __init vmx_init(void) 4615static int __init vmx_init(void)
@@ -4417,8 +4671,6 @@ static int __init vmx_init(void)
4417 4671
4418 if (enable_ept) { 4672 if (enable_ept) {
4419 bypass_guest_pf = 0; 4673 bypass_guest_pf = 0;
4420 kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK |
4421 VMX_EPT_WRITABLE_MASK);
4422 kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull, 4674 kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
4423 VMX_EPT_EXECUTABLE_MASK); 4675 VMX_EPT_EXECUTABLE_MASK);
4424 kvm_enable_tdp(); 4676 kvm_enable_tdp();
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3a09c625d526..77c9d8673dc4 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6,7 +6,7 @@
6 * Copyright (C) 2006 Qumranet, Inc. 6 * Copyright (C) 2006 Qumranet, Inc.
7 * Copyright (C) 2008 Qumranet, Inc. 7 * Copyright (C) 2008 Qumranet, Inc.
8 * Copyright IBM Corporation, 2008 8 * Copyright IBM Corporation, 2008
9 * Copyright 2010 Red Hat, Inc. and/or its affilates. 9 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
10 * 10 *
11 * Authors: 11 * Authors:
12 * Avi Kivity <avi@qumranet.com> 12 * Avi Kivity <avi@qumranet.com>
@@ -43,6 +43,7 @@
43#include <linux/slab.h> 43#include <linux/slab.h>
44#include <linux/perf_event.h> 44#include <linux/perf_event.h>
45#include <linux/uaccess.h> 45#include <linux/uaccess.h>
46#include <linux/hash.h>
46#include <trace/events/kvm.h> 47#include <trace/events/kvm.h>
47 48
48#define CREATE_TRACE_POINTS 49#define CREATE_TRACE_POINTS
@@ -55,32 +56,25 @@
55#include <asm/mce.h> 56#include <asm/mce.h>
56#include <asm/i387.h> 57#include <asm/i387.h>
57#include <asm/xcr.h> 58#include <asm/xcr.h>
59#include <asm/pvclock.h>
60#include <asm/div64.h>
58 61
59#define MAX_IO_MSRS 256 62#define MAX_IO_MSRS 256
60#define CR0_RESERVED_BITS \
61 (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
62 | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
63 | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
64#define CR4_RESERVED_BITS \
65 (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
66 | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
67 | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \
68 | X86_CR4_OSXSAVE \
69 | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
70
71#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
72
73#define KVM_MAX_MCE_BANKS 32 63#define KVM_MAX_MCE_BANKS 32
74#define KVM_MCE_CAP_SUPPORTED MCG_CTL_P 64#define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P)
65
66#define emul_to_vcpu(ctxt) \
67 container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt)
75 68
76/* EFER defaults: 69/* EFER defaults:
77 * - enable syscall per default because its emulated by KVM 70 * - enable syscall per default because its emulated by KVM
78 * - enable LME and LMA per default on 64 bit KVM 71 * - enable LME and LMA per default on 64 bit KVM
79 */ 72 */
80#ifdef CONFIG_X86_64 73#ifdef CONFIG_X86_64
81static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffafeULL; 74static
75u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA));
82#else 76#else
83static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL; 77static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
84#endif 78#endif
85 79
86#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM 80#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
@@ -96,6 +90,11 @@ EXPORT_SYMBOL_GPL(kvm_x86_ops);
96int ignore_msrs = 0; 90int ignore_msrs = 0;
97module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR); 91module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR);
98 92
93bool kvm_has_tsc_control;
94EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
95u32 kvm_max_guest_tsc_khz;
96EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);
97
99#define KVM_NR_SHARED_MSRS 16 98#define KVM_NR_SHARED_MSRS 16
100 99
101struct kvm_shared_msrs_global { 100struct kvm_shared_msrs_global {
@@ -153,9 +152,13 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
153 152
154u64 __read_mostly host_xcr0; 153u64 __read_mostly host_xcr0;
155 154
156static inline u32 bit(int bitno) 155int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
156
157static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
157{ 158{
158 return 1 << (bitno & 31); 159 int i;
160 for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU); i++)
161 vcpu->arch.apf.gfns[i] = ~0;
159} 162}
160 163
161static void kvm_on_user_return(struct user_return_notifier *urn) 164static void kvm_on_user_return(struct user_return_notifier *urn)
@@ -282,6 +285,8 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
282 u32 prev_nr; 285 u32 prev_nr;
283 int class1, class2; 286 int class1, class2;
284 287
288 kvm_make_request(KVM_REQ_EVENT, vcpu);
289
285 if (!vcpu->arch.exception.pending) { 290 if (!vcpu->arch.exception.pending) {
286 queue: 291 queue:
287 vcpu->arch.exception.pending = true; 292 vcpu->arch.exception.pending = true;
@@ -327,16 +332,33 @@ void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
327} 332}
328EXPORT_SYMBOL_GPL(kvm_requeue_exception); 333EXPORT_SYMBOL_GPL(kvm_requeue_exception);
329 334
330void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr, 335void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
331 u32 error_code) 336{
337 if (err)
338 kvm_inject_gp(vcpu, 0);
339 else
340 kvm_x86_ops->skip_emulated_instruction(vcpu);
341}
342EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
343
344void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
332{ 345{
333 ++vcpu->stat.pf_guest; 346 ++vcpu->stat.pf_guest;
334 vcpu->arch.cr2 = addr; 347 vcpu->arch.cr2 = fault->address;
335 kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); 348 kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
349}
350
351void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
352{
353 if (mmu_is_nested(vcpu) && !fault->nested_page_fault)
354 vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault);
355 else
356 vcpu->arch.mmu.inject_page_fault(vcpu, fault);
336} 357}
337 358
338void kvm_inject_nmi(struct kvm_vcpu *vcpu) 359void kvm_inject_nmi(struct kvm_vcpu *vcpu)
339{ 360{
361 kvm_make_request(KVM_REQ_EVENT, vcpu);
340 vcpu->arch.nmi_pending = 1; 362 vcpu->arch.nmi_pending = 1;
341} 363}
342EXPORT_SYMBOL_GPL(kvm_inject_nmi); 364EXPORT_SYMBOL_GPL(kvm_inject_nmi);
@@ -367,18 +389,49 @@ bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
367EXPORT_SYMBOL_GPL(kvm_require_cpl); 389EXPORT_SYMBOL_GPL(kvm_require_cpl);
368 390
369/* 391/*
 392 * This function is used to read from the physical memory of the currently
 393 * running guest. Unlike kvm_read_guest_page, this function can read from
 394 * the guest's physical memory or from its nested guest's physical memory.
395 */
396int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
397 gfn_t ngfn, void *data, int offset, int len,
398 u32 access)
399{
400 gfn_t real_gfn;
401 gpa_t ngpa;
402
403 ngpa = gfn_to_gpa(ngfn);
404 real_gfn = mmu->translate_gpa(vcpu, ngpa, access);
405 if (real_gfn == UNMAPPED_GVA)
406 return -EFAULT;
407
408 real_gfn = gpa_to_gfn(real_gfn);
409
410 return kvm_read_guest_page(vcpu->kvm, real_gfn, data, offset, len);
411}
412EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu);
413
414int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
415 void *data, int offset, int len, u32 access)
416{
417 return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn,
418 data, offset, len, access);
419}
420
421/*
 370 * Load the pae pdptrs. Return true if they are all valid. 422
371 */ 423 */
372int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) 424int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
373{ 425{
374 gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT; 426 gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
375 unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2; 427 unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
376 int i; 428 int i;
377 int ret; 429 int ret;
378 u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; 430 u64 pdpte[ARRAY_SIZE(mmu->pdptrs)];
379 431
380 ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte, 432 ret = kvm_read_guest_page_mmu(vcpu, mmu, pdpt_gfn, pdpte,
381 offset * sizeof(u64), sizeof(pdpte)); 433 offset * sizeof(u64), sizeof(pdpte),
434 PFERR_USER_MASK|PFERR_WRITE_MASK);
382 if (ret < 0) { 435 if (ret < 0) {
383 ret = 0; 436 ret = 0;
384 goto out; 437 goto out;
@@ -392,7 +445,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
392 } 445 }
393 ret = 1; 446 ret = 1;
394 447
395 memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs)); 448 memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
396 __set_bit(VCPU_EXREG_PDPTR, 449 __set_bit(VCPU_EXREG_PDPTR,
397 (unsigned long *)&vcpu->arch.regs_avail); 450 (unsigned long *)&vcpu->arch.regs_avail);
398 __set_bit(VCPU_EXREG_PDPTR, 451 __set_bit(VCPU_EXREG_PDPTR,
@@ -405,8 +458,10 @@ EXPORT_SYMBOL_GPL(load_pdptrs);
405 458
406static bool pdptrs_changed(struct kvm_vcpu *vcpu) 459static bool pdptrs_changed(struct kvm_vcpu *vcpu)
407{ 460{
408 u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; 461 u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)];
409 bool changed = true; 462 bool changed = true;
463 int offset;
464 gfn_t gfn;
410 int r; 465 int r;
411 466
412 if (is_long_mode(vcpu) || !is_pae(vcpu)) 467 if (is_long_mode(vcpu) || !is_pae(vcpu))
@@ -416,10 +471,13 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu)
416 (unsigned long *)&vcpu->arch.regs_avail)) 471 (unsigned long *)&vcpu->arch.regs_avail))
417 return true; 472 return true;
418 473
419 r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte)); 474 gfn = (kvm_read_cr3(vcpu) & ~31u) >> PAGE_SHIFT;
475 offset = (kvm_read_cr3(vcpu) & ~31u) & (PAGE_SIZE - 1);
476 r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
477 PFERR_USER_MASK | PFERR_WRITE_MASK);
420 if (r < 0) 478 if (r < 0)
421 goto out; 479 goto out;
422 changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0; 480 changed = memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0;
423out: 481out:
424 482
425 return changed; 483 return changed;
@@ -458,12 +516,18 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
458 return 1; 516 return 1;
459 } else 517 } else
460#endif 518#endif
461 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) 519 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
520 kvm_read_cr3(vcpu)))
462 return 1; 521 return 1;
463 } 522 }
464 523
465 kvm_x86_ops->set_cr0(vcpu, cr0); 524 kvm_x86_ops->set_cr0(vcpu, cr0);
466 525
526 if ((cr0 ^ old_cr0) & X86_CR0_PG) {
527 kvm_clear_async_pf_completion_queue(vcpu);
528 kvm_async_pf_hash_reset(vcpu);
529 }
530
467 if ((cr0 ^ old_cr0) & update_bits) 531 if ((cr0 ^ old_cr0) & update_bits)
468 kvm_mmu_reset_context(vcpu); 532 kvm_mmu_reset_context(vcpu);
469 return 0; 533 return 0;
@@ -547,7 +611,8 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
547 return 1; 611 return 1;
548 } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) 612 } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
549 && ((cr4 ^ old_cr4) & pdptr_bits) 613 && ((cr4 ^ old_cr4) & pdptr_bits)
550 && !load_pdptrs(vcpu, vcpu->arch.cr3)) 614 && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
615 kvm_read_cr3(vcpu)))
551 return 1; 616 return 1;
552 617
553 if (cr4 & X86_CR4_VMXE) 618 if (cr4 & X86_CR4_VMXE)
@@ -567,7 +632,7 @@ EXPORT_SYMBOL_GPL(kvm_set_cr4);
567 632
568int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) 633int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
569{ 634{
570 if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) { 635 if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
571 kvm_mmu_sync_roots(vcpu); 636 kvm_mmu_sync_roots(vcpu);
572 kvm_mmu_flush_tlb(vcpu); 637 kvm_mmu_flush_tlb(vcpu);
573 return 0; 638 return 0;
@@ -580,7 +645,8 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
580 if (is_pae(vcpu)) { 645 if (is_pae(vcpu)) {
581 if (cr3 & CR3_PAE_RESERVED_BITS) 646 if (cr3 & CR3_PAE_RESERVED_BITS)
582 return 1; 647 return 1;
583 if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) 648 if (is_paging(vcpu) &&
649 !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
584 return 1; 650 return 1;
585 } 651 }
586 /* 652 /*
@@ -601,12 +667,13 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
601 if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT))) 667 if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
602 return 1; 668 return 1;
603 vcpu->arch.cr3 = cr3; 669 vcpu->arch.cr3 = cr3;
670 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
604 vcpu->arch.mmu.new_cr3(vcpu); 671 vcpu->arch.mmu.new_cr3(vcpu);
605 return 0; 672 return 0;
606} 673}
607EXPORT_SYMBOL_GPL(kvm_set_cr3); 674EXPORT_SYMBOL_GPL(kvm_set_cr3);
608 675
609int __kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) 676int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
610{ 677{
611 if (cr8 & CR8_RESERVED_BITS) 678 if (cr8 & CR8_RESERVED_BITS)
612 return 1; 679 return 1;
@@ -616,12 +683,6 @@ int __kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
616 vcpu->arch.cr8 = cr8; 683 vcpu->arch.cr8 = cr8;
617 return 0; 684 return 0;
618} 685}
619
620void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
621{
622 if (__kvm_set_cr8(vcpu, cr8))
623 kvm_inject_gp(vcpu, 0);
624}
625EXPORT_SYMBOL_GPL(kvm_set_cr8); 686EXPORT_SYMBOL_GPL(kvm_set_cr8);
626 687
627unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) 688unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
@@ -726,18 +787,18 @@ EXPORT_SYMBOL_GPL(kvm_get_dr);
726 * kvm-specific. Those are put in the beginning of the list. 787 * kvm-specific. Those are put in the beginning of the list.
727 */ 788 */
728 789
729#define KVM_SAVE_MSRS_BEGIN 7 790#define KVM_SAVE_MSRS_BEGIN 8
730static u32 msrs_to_save[] = { 791static u32 msrs_to_save[] = {
731 MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, 792 MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
732 MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, 793 MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
733 HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, 794 HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
734 HV_X64_MSR_APIC_ASSIST_PAGE, 795 HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN,
735 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, 796 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
736 MSR_STAR, 797 MSR_STAR,
737#ifdef CONFIG_X86_64 798#ifdef CONFIG_X86_64
738 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, 799 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
739#endif 800#endif
740 MSR_IA32_TSC, MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA 801 MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
741}; 802};
742 803
743static unsigned num_msrs_to_save; 804static unsigned num_msrs_to_save;
@@ -781,7 +842,6 @@ static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
781 kvm_x86_ops->set_efer(vcpu, efer); 842 kvm_x86_ops->set_efer(vcpu, efer);
782 843
783 vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled; 844 vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
784 kvm_mmu_reset_context(vcpu);
785 845
786 /* Update reserved bits */ 846 /* Update reserved bits */
787 if ((efer ^ old_efer) & EFER_NX) 847 if ((efer ^ old_efer) & EFER_NX)
@@ -838,7 +898,7 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
838 898
839 /* 899 /*
840 * The guest calculates current wall clock time by adding 900 * The guest calculates current wall clock time by adding
841 * system time (updated by kvm_write_guest_time below) to the 901 * system time (updated by kvm_guest_time_update below) to the
842 * wall clock specified here. guest system time equals host 902 * wall clock specified here. guest system time equals host
843 * system time for us, thus we must fill in host boot time here. 903 * system time for us, thus we must fill in host boot time here.
844 */ 904 */
@@ -866,65 +926,235 @@ static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
866 return quotient; 926 return quotient;
867} 927}
868 928
869static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock) 929static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz,
930 s8 *pshift, u32 *pmultiplier)
870{ 931{
871 uint64_t nsecs = 1000000000LL; 932 uint64_t scaled64;
872 int32_t shift = 0; 933 int32_t shift = 0;
873 uint64_t tps64; 934 uint64_t tps64;
874 uint32_t tps32; 935 uint32_t tps32;
875 936
876 tps64 = tsc_khz * 1000LL; 937 tps64 = base_khz * 1000LL;
877 while (tps64 > nsecs*2) { 938 scaled64 = scaled_khz * 1000LL;
939 while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) {
878 tps64 >>= 1; 940 tps64 >>= 1;
879 shift--; 941 shift--;
880 } 942 }
881 943
882 tps32 = (uint32_t)tps64; 944 tps32 = (uint32_t)tps64;
883 while (tps32 <= (uint32_t)nsecs) { 945 while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000ULL) {
884 tps32 <<= 1; 946 if (scaled64 & 0xffffffff00000000ULL || tps32 & 0x80000000)
947 scaled64 >>= 1;
948 else
949 tps32 <<= 1;
885 shift++; 950 shift++;
886 } 951 }
887 952
888 hv_clock->tsc_shift = shift; 953 *pshift = shift;
889 hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32); 954 *pmultiplier = div_frac(scaled64, tps32);
890 955
891 pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n", 956 pr_debug("%s: base_khz %u => %u, shift %d, mul %u\n",
892 __func__, tsc_khz, hv_clock->tsc_shift, 957 __func__, base_khz, scaled_khz, shift, *pmultiplier);
893 hv_clock->tsc_to_system_mul); 958}
959
960static inline u64 get_kernel_ns(void)
961{
962 struct timespec ts;
963
964 WARN_ON(preemptible());
965 ktime_get_ts(&ts);
966 monotonic_to_bootbased(&ts);
967 return timespec_to_ns(&ts);
894} 968}
895 969
896static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); 970static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
971unsigned long max_tsc_khz;
897 972
898static void kvm_write_guest_time(struct kvm_vcpu *v) 973static inline int kvm_tsc_changes_freq(void)
974{
975 int cpu = get_cpu();
976 int ret = !boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
977 cpufreq_quick_get(cpu) != 0;
978 put_cpu();
979 return ret;
980}
981
982static u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu)
983{
984 if (vcpu->arch.virtual_tsc_khz)
985 return vcpu->arch.virtual_tsc_khz;
986 else
987 return __this_cpu_read(cpu_tsc_khz);
988}
989
990static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
991{
992 u64 ret;
993
994 WARN_ON(preemptible());
995 if (kvm_tsc_changes_freq())
996 printk_once(KERN_WARNING
997 "kvm: unreliable cycle conversion on adjustable rate TSC\n");
998 ret = nsec * vcpu_tsc_khz(vcpu);
999 do_div(ret, USEC_PER_SEC);
1000 return ret;
1001}
1002
1003static void kvm_init_tsc_catchup(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
1004{
1005 /* Compute a scale to convert nanoseconds in TSC cycles */
1006 kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000,
1007 &vcpu->arch.tsc_catchup_shift,
1008 &vcpu->arch.tsc_catchup_mult);
1009}
1010
1011static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
1012{
1013 u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.last_tsc_nsec,
1014 vcpu->arch.tsc_catchup_mult,
1015 vcpu->arch.tsc_catchup_shift);
1016 tsc += vcpu->arch.last_tsc_write;
1017 return tsc;
1018}
1019
1020void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
1021{
1022 struct kvm *kvm = vcpu->kvm;
1023 u64 offset, ns, elapsed;
1024 unsigned long flags;
1025 s64 sdiff;
1026
1027 raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
1028 offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
1029 ns = get_kernel_ns();
1030 elapsed = ns - kvm->arch.last_tsc_nsec;
1031 sdiff = data - kvm->arch.last_tsc_write;
1032 if (sdiff < 0)
1033 sdiff = -sdiff;
1034
1035 /*
 1036 * Special case: a TSC write within 5 seconds of a write by
 1037 * another CPU is interpreted as an attempt to synchronize.
1038 * The 5 seconds is to accommodate host load / swapping as
1039 * well as any reset of TSC during the boot process.
1040 *
1041 * In that case, for a reliable TSC, we can match TSC offsets,
 1042 * or make a best guess using the elapsed value.
1043 */
1044 if (sdiff < nsec_to_cycles(vcpu, 5ULL * NSEC_PER_SEC) &&
1045 elapsed < 5ULL * NSEC_PER_SEC) {
1046 if (!check_tsc_unstable()) {
1047 offset = kvm->arch.last_tsc_offset;
1048 pr_debug("kvm: matched tsc offset for %llu\n", data);
1049 } else {
1050 u64 delta = nsec_to_cycles(vcpu, elapsed);
1051 offset += delta;
1052 pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
1053 }
1054 ns = kvm->arch.last_tsc_nsec;
1055 }
1056 kvm->arch.last_tsc_nsec = ns;
1057 kvm->arch.last_tsc_write = data;
1058 kvm->arch.last_tsc_offset = offset;
1059 kvm_x86_ops->write_tsc_offset(vcpu, offset);
1060 raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
1061
1062 /* Reset of TSC must disable overshoot protection below */
1063 vcpu->arch.hv_clock.tsc_timestamp = 0;
1064 vcpu->arch.last_tsc_write = data;
1065 vcpu->arch.last_tsc_nsec = ns;
1066}
1067EXPORT_SYMBOL_GPL(kvm_write_tsc);
1068
1069static int kvm_guest_time_update(struct kvm_vcpu *v)
899{ 1070{
900 struct timespec ts;
901 unsigned long flags; 1071 unsigned long flags;
902 struct kvm_vcpu_arch *vcpu = &v->arch; 1072 struct kvm_vcpu_arch *vcpu = &v->arch;
903 void *shared_kaddr; 1073 void *shared_kaddr;
904 unsigned long this_tsc_khz; 1074 unsigned long this_tsc_khz;
1075 s64 kernel_ns, max_kernel_ns;
1076 u64 tsc_timestamp;
905 1077
906 if ((!vcpu->time_page)) 1078 /* Keep irq disabled to prevent changes to the clock */
907 return; 1079 local_irq_save(flags);
1080 kvm_get_msr(v, MSR_IA32_TSC, &tsc_timestamp);
1081 kernel_ns = get_kernel_ns();
1082 this_tsc_khz = vcpu_tsc_khz(v);
1083 if (unlikely(this_tsc_khz == 0)) {
1084 local_irq_restore(flags);
1085 kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
1086 return 1;
1087 }
908 1088
909 this_tsc_khz = get_cpu_var(cpu_tsc_khz); 1089 /*
910 if (unlikely(vcpu->hv_clock_tsc_khz != this_tsc_khz)) { 1090 * We may have to catch up the TSC to match elapsed wall clock
911 kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock); 1091 * time for two reasons, even if kvmclock is used.
912 vcpu->hv_clock_tsc_khz = this_tsc_khz; 1092 * 1) CPU could have been running below the maximum TSC rate
1093 * 2) Broken TSC compensation resets the base at each VCPU
1094 * entry to avoid unknown leaps of TSC even when running
1095 * again on the same CPU. This may cause apparent elapsed
1096 * time to disappear, and the guest to stand still or run
1097 * very slowly.
1098 */
1099 if (vcpu->tsc_catchup) {
1100 u64 tsc = compute_guest_tsc(v, kernel_ns);
1101 if (tsc > tsc_timestamp) {
1102 kvm_x86_ops->adjust_tsc_offset(v, tsc - tsc_timestamp);
1103 tsc_timestamp = tsc;
1104 }
913 } 1105 }
914 put_cpu_var(cpu_tsc_khz);
915 1106
916 /* Keep irq disabled to prevent changes to the clock */
917 local_irq_save(flags);
918 kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp);
919 ktime_get_ts(&ts);
920 monotonic_to_bootbased(&ts);
921 local_irq_restore(flags); 1107 local_irq_restore(flags);
922 1108
923 /* With all the info we got, fill in the values */ 1109 if (!vcpu->time_page)
1110 return 0;
924 1111
925 vcpu->hv_clock.system_time = ts.tv_nsec + 1112 /*
926 (NSEC_PER_SEC * (u64)ts.tv_sec) + v->kvm->arch.kvmclock_offset; 1113 * Time as measured by the TSC may go backwards when resetting the base
1114 * tsc_timestamp. The reason for this is that the TSC resolution is
1115 * higher than the resolution of the other clock scales. Thus, many
 1116 * possible measurements of the TSC correspond to one measurement of any
1117 * other clock, and so a spread of values is possible. This is not a
1118 * problem for the computation of the nanosecond clock; with TSC rates
 1119 * around 1 GHz, there can only be a few cycles which correspond to one
1120 * nanosecond value, and any path through this code will inevitably
1121 * take longer than that. However, with the kernel_ns value itself,
1122 * the precision may be much lower, down to HZ granularity. If the
1123 * first sampling of TSC against kernel_ns ends in the low part of the
1124 * range, and the second in the high end of the range, we can get:
1125 *
1126 * (TSC - offset_low) * S + kns_old > (TSC - offset_high) * S + kns_new
1127 *
1128 * As the sampling errors potentially range in the thousands of cycles,
1129 * it is possible such a time value has already been observed by the
1130 * guest. To protect against this, we must compute the system time as
1131 * observed by the guest and ensure the new system time is greater.
1132 */
1133 max_kernel_ns = 0;
1134 if (vcpu->hv_clock.tsc_timestamp && vcpu->last_guest_tsc) {
1135 max_kernel_ns = vcpu->last_guest_tsc -
1136 vcpu->hv_clock.tsc_timestamp;
1137 max_kernel_ns = pvclock_scale_delta(max_kernel_ns,
1138 vcpu->hv_clock.tsc_to_system_mul,
1139 vcpu->hv_clock.tsc_shift);
1140 max_kernel_ns += vcpu->last_kernel_ns;
1141 }
927 1142
1143 if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) {
1144 kvm_get_time_scale(NSEC_PER_SEC / 1000, this_tsc_khz,
1145 &vcpu->hv_clock.tsc_shift,
1146 &vcpu->hv_clock.tsc_to_system_mul);
1147 vcpu->hw_tsc_khz = this_tsc_khz;
1148 }
1149
1150 if (max_kernel_ns > kernel_ns)
1151 kernel_ns = max_kernel_ns;
1152
1153 /* With all the info we got, fill in the values */
1154 vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
1155 vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
1156 vcpu->last_kernel_ns = kernel_ns;
1157 vcpu->last_guest_tsc = tsc_timestamp;
928 vcpu->hv_clock.flags = 0; 1158 vcpu->hv_clock.flags = 0;
929 1159
930 /* 1160 /*
@@ -942,16 +1172,7 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
942 kunmap_atomic(shared_kaddr, KM_USER0); 1172 kunmap_atomic(shared_kaddr, KM_USER0);
943 1173
944 mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT); 1174 mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
945} 1175 return 0;
946
947static int kvm_request_guest_time_update(struct kvm_vcpu *v)
948{
949 struct kvm_vcpu_arch *vcpu = &v->arch;
950
951 if (!vcpu->time_page)
952 return 0;
953 kvm_make_request(KVM_REQ_KVMCLOCK_UPDATE, v);
954 return 1;
955} 1176}
956 1177
957static bool msr_mtrr_valid(unsigned msr) 1178static bool msr_mtrr_valid(unsigned msr)
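Editor's note: the (tsc_shift, tsc_to_system_mul) pair that kvm_get_time_scale() derives in the hunks above is consumed as a binary shift followed by a 32.32 fixed-point multiply when a TSC delta is turned into nanoseconds. The sketch below is not part of the patch: scale_delta() is a hypothetical, user-space stand-in for the kernel's pvclock scaling helper, written under that fixed-point assumption. For example, scaling a 2,000,000 kHz TSC to the 1,000,000 kHz nanosecond base should leave the shift at 0 and the multiplier near 0x80000000, i.e. roughly delta/2.

#include <stdint.h>

/* Convert a TSC delta to nanoseconds using a pvclock-style (shift, mul) pair. */
static uint64_t scale_delta(uint64_t delta, uint32_t mul, int8_t shift)
{
	/* Apply the binary shift first ... */
	if (shift < 0)
		delta >>= -shift;
	else
		delta <<= shift;
	/* ... then the 32.32 fixed-point multiplier. */
	return (uint64_t)(((__uint128_t)delta * mul) >> 32);
}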
@@ -1214,6 +1435,38 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1214 return 0; 1435 return 0;
1215} 1436}
1216 1437
1438static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
1439{
1440 gpa_t gpa = data & ~0x3f;
1441
 1442 /* Bits 2:5 are reserved, should be zero */
1443 if (data & 0x3c)
1444 return 1;
1445
1446 vcpu->arch.apf.msr_val = data;
1447
1448 if (!(data & KVM_ASYNC_PF_ENABLED)) {
1449 kvm_clear_async_pf_completion_queue(vcpu);
1450 kvm_async_pf_hash_reset(vcpu);
1451 return 0;
1452 }
1453
1454 if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa))
1455 return 1;
1456
1457 vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS);
1458 kvm_async_pf_wakeup_all(vcpu);
1459 return 0;
1460}
1461
1462static void kvmclock_reset(struct kvm_vcpu *vcpu)
1463{
1464 if (vcpu->arch.time_page) {
1465 kvm_release_page_dirty(vcpu->arch.time_page);
1466 vcpu->arch.time_page = NULL;
1467 }
1468}
1469
1217int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) 1470int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1218{ 1471{
1219 switch (msr) { 1472 switch (msr) {
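Editor's note: a minimal user-space sketch of the MSR layout that kvm_pv_enable_async_pf() above appears to assume, inferred only from the masks in this hunk. decode_apf_msr() is a hypothetical helper, not kernel code, and the exact bit positions of KVM_ASYNC_PF_ENABLED and KVM_ASYNC_PF_SEND_ALWAYS (taken here as bits 0 and 1) are not shown by the diff.

#include <stdbool.h>
#include <stdint.h>

struct apf_msr {
	uint64_t gpa;		/* 64-byte aligned address of the shared data area */
	bool enabled;		/* assumed bit 0 (KVM_ASYNC_PF_ENABLED) */
	bool send_always;	/* assumed bit 1 (KVM_ASYNC_PF_SEND_ALWAYS) */
};

static int decode_apf_msr(uint64_t data, struct apf_msr *out)
{
	if (data & 0x3c)		/* bits 2:5 are reserved, must be zero */
		return -1;
	out->gpa = data & ~0x3fULL;	/* low six bits carry flags, not address */
	out->enabled = !!(data & (1ULL << 0));
	out->send_always = !!(data & (1ULL << 1));
	return 0;
}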
@@ -1271,12 +1524,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1271 break; 1524 break;
1272 case MSR_KVM_SYSTEM_TIME_NEW: 1525 case MSR_KVM_SYSTEM_TIME_NEW:
1273 case MSR_KVM_SYSTEM_TIME: { 1526 case MSR_KVM_SYSTEM_TIME: {
1274 if (vcpu->arch.time_page) { 1527 kvmclock_reset(vcpu);
1275 kvm_release_page_dirty(vcpu->arch.time_page);
1276 vcpu->arch.time_page = NULL;
1277 }
1278 1528
1279 vcpu->arch.time = data; 1529 vcpu->arch.time = data;
1530 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
1280 1531
1281 /* we verify if the enable bit is set... */ 1532 /* we verify if the enable bit is set... */
1282 if (!(data & 1)) 1533 if (!(data & 1))
@@ -1292,10 +1543,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1292 kvm_release_page_clean(vcpu->arch.time_page); 1543 kvm_release_page_clean(vcpu->arch.time_page);
1293 vcpu->arch.time_page = NULL; 1544 vcpu->arch.time_page = NULL;
1294 } 1545 }
1295
1296 kvm_request_guest_time_update(vcpu);
1297 break; 1546 break;
1298 } 1547 }
1548 case MSR_KVM_ASYNC_PF_EN:
1549 if (kvm_pv_enable_async_pf(vcpu, data))
1550 return 1;
1551 break;
1299 case MSR_IA32_MCG_CTL: 1552 case MSR_IA32_MCG_CTL:
1300 case MSR_IA32_MCG_STATUS: 1553 case MSR_IA32_MCG_STATUS:
1301 case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: 1554 case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
@@ -1330,6 +1583,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1330 pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " 1583 pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
1331 "0x%x data 0x%llx\n", msr, data); 1584 "0x%x data 0x%llx\n", msr, data);
1332 break; 1585 break;
1586 case MSR_K7_CLK_CTL:
1587 /*
1588 * Ignore all writes to this no longer documented MSR.
1589 * Writes are only relevant for old K7 processors,
 1590 * all pre-dating SVM, but it is a recommended workaround from
 1591 * AMD for these chips. It is possible to specify the
1592 * affected processor models on the command line, hence
1593 * the need to ignore the workaround.
1594 */
1595 break;
1333 case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: 1596 case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
1334 if (kvm_hv_msr_partition_wide(msr)) { 1597 if (kvm_hv_msr_partition_wide(msr)) {
1335 int r; 1598 int r;
@@ -1340,6 +1603,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1340 } else 1603 } else
1341 return set_msr_hyperv(vcpu, msr, data); 1604 return set_msr_hyperv(vcpu, msr, data);
1342 break; 1605 break;
1606 case MSR_IA32_BBL_CR_CTL3:
1607 /* Drop writes to this legacy MSR -- see rdmsr
1608 * counterpart for further detail.
1609 */
1610 pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data);
1611 break;
1343 default: 1612 default:
1344 if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) 1613 if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
1345 return xen_hvm_config(vcpu, data); 1614 return xen_hvm_config(vcpu, data);
@@ -1522,6 +1791,20 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1522 case 0xcd: /* fsb frequency */ 1791 case 0xcd: /* fsb frequency */
1523 data = 3; 1792 data = 3;
1524 break; 1793 break;
1794 /*
1795 * MSR_EBC_FREQUENCY_ID
1796 * Conservative value valid for even the basic CPU models.
1797 * Models 0,1: 000 in bits 23:21 indicating a bus speed of
 1798 * 100MHz; model 2: 000 in bits 18:16 indicating 100MHz;
 1799 * and 266MHz for models 3 and 4. Set Core Clock
1800 * Frequency to System Bus Frequency Ratio to 1 (bits
1801 * 31:24) even though these are only valid for CPU
 1802 * models > 2; however, guests may end up dividing or
1803 * multiplying by zero otherwise.
1804 */
1805 case MSR_EBC_FREQUENCY_ID:
1806 data = 1 << 24;
1807 break;
1525 case MSR_IA32_APICBASE: 1808 case MSR_IA32_APICBASE:
1526 data = kvm_get_apic_base(vcpu); 1809 data = kvm_get_apic_base(vcpu);
1527 break; 1810 break;
@@ -1548,6 +1831,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1548 case MSR_KVM_SYSTEM_TIME_NEW: 1831 case MSR_KVM_SYSTEM_TIME_NEW:
1549 data = vcpu->arch.time; 1832 data = vcpu->arch.time;
1550 break; 1833 break;
1834 case MSR_KVM_ASYNC_PF_EN:
1835 data = vcpu->arch.apf.msr_val;
1836 break;
1551 case MSR_IA32_P5_MC_ADDR: 1837 case MSR_IA32_P5_MC_ADDR:
1552 case MSR_IA32_P5_MC_TYPE: 1838 case MSR_IA32_P5_MC_TYPE:
1553 case MSR_IA32_MCG_CAP: 1839 case MSR_IA32_MCG_CAP:
@@ -1555,6 +1841,18 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1555 case MSR_IA32_MCG_STATUS: 1841 case MSR_IA32_MCG_STATUS:
1556 case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: 1842 case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
1557 return get_msr_mce(vcpu, msr, pdata); 1843 return get_msr_mce(vcpu, msr, pdata);
1844 case MSR_K7_CLK_CTL:
1845 /*
 1846 * Provide the expected ramp-up count for K7. All other
 1847 * fields are set to zero, indicating minimum divisors for
1848 * every field.
1849 *
1850 * This prevents guest kernels on AMD host with CPU
1851 * type 6, model 8 and higher from exploding due to
1852 * the rdmsr failing.
1853 */
1854 data = 0x20000000;
1855 break;
1558 case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: 1856 case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
1559 if (kvm_hv_msr_partition_wide(msr)) { 1857 if (kvm_hv_msr_partition_wide(msr)) {
1560 int r; 1858 int r;
@@ -1565,6 +1863,19 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1565 } else 1863 } else
1566 return get_msr_hyperv(vcpu, msr, pdata); 1864 return get_msr_hyperv(vcpu, msr, pdata);
1567 break; 1865 break;
1866 case MSR_IA32_BBL_CR_CTL3:
1867 /* This legacy MSR exists but isn't fully documented in current
1868 * silicon. It is however accessed by winxp in very narrow
1869 * scenarios where it sets bit #19, itself documented as
1870 * a "reserved" bit. Best effort attempt to source coherent
1871 * read data here should the balance of the register be
1872 * interpreted by the guest:
1873 *
1874 * L2 cache control register 3: 64GB range, 256KB size,
1875 * enabled, latency 0x1, configured
1876 */
1877 data = 0xbe702111;
1878 break;
1568 default: 1879 default:
1569 if (!ignore_msrs) { 1880 if (!ignore_msrs) {
1570 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); 1881 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
@@ -1665,6 +1976,7 @@ int kvm_dev_ioctl_check_extension(long ext)
1665 case KVM_CAP_NOP_IO_DELAY: 1976 case KVM_CAP_NOP_IO_DELAY:
1666 case KVM_CAP_MP_STATE: 1977 case KVM_CAP_MP_STATE:
1667 case KVM_CAP_SYNC_MMU: 1978 case KVM_CAP_SYNC_MMU:
1979 case KVM_CAP_USER_NMI:
1668 case KVM_CAP_REINJECT_CONTROL: 1980 case KVM_CAP_REINJECT_CONTROL:
1669 case KVM_CAP_IRQ_INJECT_STATUS: 1981 case KVM_CAP_IRQ_INJECT_STATUS:
1670 case KVM_CAP_ASSIGN_DEV_IRQ: 1982 case KVM_CAP_ASSIGN_DEV_IRQ:
@@ -1683,6 +1995,8 @@ int kvm_dev_ioctl_check_extension(long ext)
1683 case KVM_CAP_DEBUGREGS: 1995 case KVM_CAP_DEBUGREGS:
1684 case KVM_CAP_X86_ROBUST_SINGLESTEP: 1996 case KVM_CAP_X86_ROBUST_SINGLESTEP:
1685 case KVM_CAP_XSAVE: 1997 case KVM_CAP_XSAVE:
1998 case KVM_CAP_ASYNC_PF:
1999 case KVM_CAP_GET_TSC_KHZ:
1686 r = 1; 2000 r = 1;
1687 break; 2001 break;
1688 case KVM_CAP_COALESCED_MMIO: 2002 case KVM_CAP_COALESCED_MMIO:
@@ -1709,6 +2023,9 @@ int kvm_dev_ioctl_check_extension(long ext)
1709 case KVM_CAP_XCRS: 2023 case KVM_CAP_XCRS:
1710 r = cpu_has_xsave; 2024 r = cpu_has_xsave;
1711 break; 2025 break;
2026 case KVM_CAP_TSC_CONTROL:
2027 r = kvm_has_tsc_control;
2028 break;
1712 default: 2029 default:
1713 r = 0; 2030 r = 0;
1714 break; 2031 break;
@@ -1808,19 +2125,33 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1808 } 2125 }
1809 2126
1810 kvm_x86_ops->vcpu_load(vcpu, cpu); 2127 kvm_x86_ops->vcpu_load(vcpu, cpu);
1811 if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) { 2128 if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) {
1812 unsigned long khz = cpufreq_quick_get(cpu); 2129 /* Make sure TSC doesn't go backwards */
1813 if (!khz) 2130 s64 tsc_delta;
1814 khz = tsc_khz; 2131 u64 tsc;
1815 per_cpu(cpu_tsc_khz, cpu) = khz; 2132
2133 kvm_get_msr(vcpu, MSR_IA32_TSC, &tsc);
2134 tsc_delta = !vcpu->arch.last_guest_tsc ? 0 :
2135 tsc - vcpu->arch.last_guest_tsc;
2136
2137 if (tsc_delta < 0)
2138 mark_tsc_unstable("KVM discovered backwards TSC");
2139 if (check_tsc_unstable()) {
2140 kvm_x86_ops->adjust_tsc_offset(vcpu, -tsc_delta);
2141 vcpu->arch.tsc_catchup = 1;
2142 }
2143 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
2144 if (vcpu->cpu != cpu)
2145 kvm_migrate_timers(vcpu);
2146 vcpu->cpu = cpu;
1816 } 2147 }
1817 kvm_request_guest_time_update(vcpu);
1818} 2148}
1819 2149
1820void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) 2150void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
1821{ 2151{
1822 kvm_x86_ops->vcpu_put(vcpu); 2152 kvm_x86_ops->vcpu_put(vcpu);
1823 kvm_put_guest_fpu(vcpu); 2153 kvm_put_guest_fpu(vcpu);
2154 kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc);
1824} 2155}
1825 2156
1826static int is_efer_nx(void) 2157static int is_efer_nx(void)
@@ -1937,6 +2268,11 @@ out:
1937 return r; 2268 return r;
1938} 2269}
1939 2270
2271static void cpuid_mask(u32 *word, int wordnum)
2272{
2273 *word &= boot_cpu_data.x86_capability[wordnum];
2274}
2275
1940static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, 2276static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1941 u32 index) 2277 u32 index)
1942{ 2278{
@@ -1991,13 +2327,20 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1991 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ | 2327 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ |
1992 0 /* Reserved, DCA */ | F(XMM4_1) | 2328 0 /* Reserved, DCA */ | F(XMM4_1) |
1993 F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | 2329 F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
1994 0 /* Reserved, AES */ | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX); 2330 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) |
2331 F(F16C);
1995 /* cpuid 0x80000001.ecx */ 2332 /* cpuid 0x80000001.ecx */
1996 const u32 kvm_supported_word6_x86_features = 2333 const u32 kvm_supported_word6_x86_features =
1997 F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ | 2334 F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
1998 F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | 2335 F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
1999 F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(SSE5) | 2336 F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) |
2000 0 /* SKINIT */ | 0 /* WDT */; 2337 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM);
2338
2339 /* cpuid 0xC0000001.edx */
2340 const u32 kvm_supported_word5_x86_features =
2341 F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) |
2342 F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) |
2343 F(PMM) | F(PMM_EN);
2001 2344
2002 /* all calls to cpuid_count() should be made on the same cpu */ 2345 /* all calls to cpuid_count() should be made on the same cpu */
2003 get_cpu(); 2346 get_cpu();
@@ -2010,7 +2353,9 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
2010 break; 2353 break;
2011 case 1: 2354 case 1:
2012 entry->edx &= kvm_supported_word0_x86_features; 2355 entry->edx &= kvm_supported_word0_x86_features;
2356 cpuid_mask(&entry->edx, 0);
2013 entry->ecx &= kvm_supported_word4_x86_features; 2357 entry->ecx &= kvm_supported_word4_x86_features;
2358 cpuid_mask(&entry->ecx, 4);
2014 /* we support x2apic emulation even if host does not support 2359 /* we support x2apic emulation even if host does not support
2015 * it since we emulate x2apic in software */ 2360 * it since we emulate x2apic in software */
2016 entry->ecx |= F(X2APIC); 2361 entry->ecx |= F(X2APIC);
@@ -2068,9 +2413,9 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
2068 int i; 2413 int i;
2069 2414
2070 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 2415 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
2071 for (i = 1; *nent < maxnent; ++i) { 2416 for (i = 1; *nent < maxnent && i < 64; ++i) {
2072 if (entry[i - 1].eax == 0 && i != 2) 2417 if (entry[i].eax == 0)
2073 break; 2418 continue;
2074 do_cpuid_1_ent(&entry[i], function, i); 2419 do_cpuid_1_ent(&entry[i], function, i);
2075 entry[i].flags |= 2420 entry[i].flags |=
2076 KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 2421 KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
@@ -2091,6 +2436,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
2091 entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) | 2436 entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) |
2092 (1 << KVM_FEATURE_NOP_IO_DELAY) | 2437 (1 << KVM_FEATURE_NOP_IO_DELAY) |
2093 (1 << KVM_FEATURE_CLOCKSOURCE2) | 2438 (1 << KVM_FEATURE_CLOCKSOURCE2) |
2439 (1 << KVM_FEATURE_ASYNC_PF) |
2094 (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT); 2440 (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);
2095 entry->ebx = 0; 2441 entry->ebx = 0;
2096 entry->ecx = 0; 2442 entry->ecx = 0;
@@ -2101,7 +2447,23 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
2101 break; 2447 break;
2102 case 0x80000001: 2448 case 0x80000001:
2103 entry->edx &= kvm_supported_word1_x86_features; 2449 entry->edx &= kvm_supported_word1_x86_features;
2450 cpuid_mask(&entry->edx, 1);
2104 entry->ecx &= kvm_supported_word6_x86_features; 2451 entry->ecx &= kvm_supported_word6_x86_features;
2452 cpuid_mask(&entry->ecx, 6);
2453 break;
2454 /*Add support for Centaur's CPUID instruction*/
2455 case 0xC0000000:
2456 /*Just support up to 0xC0000004 now*/
2457 entry->eax = min(entry->eax, 0xC0000004);
2458 break;
2459 case 0xC0000001:
2460 entry->edx &= kvm_supported_word5_x86_features;
2461 cpuid_mask(&entry->edx, 5);
2462 break;
2463 case 0xC0000002:
2464 case 0xC0000003:
2465 case 0xC0000004:
2466 /*Now nothing to do, reserved for the future*/
2105 break; 2467 break;
2106 } 2468 }
2107 2469
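
The cpuid_mask() helper introduced above, together with the wider feature words (AES, XOP, FMA4, the Centaur 0xC0000001 word), follows one simple rule: every feature word KVM is prepared to emulate is ANDed with the corresponding word the boot CPU actually reports, so a guest is never offered a capability the host cannot back. A minimal standalone illustration of that masking idea, written as ordinary userspace C rather than kernel code (demo_mask_features() and the chosen bits are made up for the example):

#include <stdio.h>
#include <stdint.h>
#include <cpuid.h>                      /* GCC/Clang wrapper for the CPUID instruction */

static uint32_t demo_mask_features(uint32_t kvm_supported_ecx)
{
    unsigned int eax, ebx, ecx, edx;

    if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
        return 0;                       /* leaf 1 unavailable: advertise nothing */

    /* Keep only the bits that both KVM emulates and the host CPU reports. */
    return kvm_supported_ecx & ecx;
}

int main(void)
{
    /* Pretend KVM would emulate AES (bit 25) and XSAVE (bit 26) of CPUID.1:ECX;
     * the host mask decides what actually survives. */
    uint32_t wanted = (1u << 25) | (1u << 26);

    printf("advertised CPUID.1:ECX bits: %#x\n",
           (unsigned int)demo_mask_features(wanted));
    return 0;
}
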
@@ -2149,6 +2511,26 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
2149 if (nent >= cpuid->nent) 2511 if (nent >= cpuid->nent)
2150 goto out_free; 2512 goto out_free;
2151 2513
2514 /* Add support for Centaur's CPUID instruction. */
2515 if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR) {
2516 do_cpuid_ent(&cpuid_entries[nent], 0xC0000000, 0,
2517 &nent, cpuid->nent);
2518
2519 r = -E2BIG;
2520 if (nent >= cpuid->nent)
2521 goto out_free;
2522
2523 limit = cpuid_entries[nent - 1].eax;
2524 for (func = 0xC0000001;
2525 func <= limit && nent < cpuid->nent; ++func)
2526 do_cpuid_ent(&cpuid_entries[nent], func, 0,
2527 &nent, cpuid->nent);
2528
2529 r = -E2BIG;
2530 if (nent >= cpuid->nent)
2531 goto out_free;
2532 }
2533
2152 do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_SIGNATURE, 0, &nent, 2534 do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_SIGNATURE, 0, &nent,
2153 cpuid->nent); 2535 cpuid->nent);
2154 2536
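
The new 0xC0000000-0xC0000004 range is reported to userspace through the existing KVM_GET_SUPPORTED_CPUID system ioctl, exactly like the other leaves. A rough sketch of the caller's side, with error handling trimmed and the entry count of 128 picked arbitrarily (E2BIG from the ioctl means the array was too small):

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
    int kvm = open("/dev/kvm", O_RDWR);
    if (kvm < 0) {
        perror("open /dev/kvm");
        return 1;
    }

    int nent = 128;                      /* arbitrary starting size */
    struct kvm_cpuid2 *cpuid =
        calloc(1, sizeof(*cpuid) + nent * sizeof(struct kvm_cpuid_entry2));
    if (!cpuid)
        return 1;
    cpuid->nent = nent;

    if (ioctl(kvm, KVM_GET_SUPPORTED_CPUID, cpuid) < 0) {
        perror("KVM_GET_SUPPORTED_CPUID");
        return 1;
    }

    /* On success the kernel rewrites nent to the number of entries filled in. */
    for (unsigned int i = 0; i < cpuid->nent; i++)
        printf("leaf %#010x index %u -> eax=%#x ecx=%#x\n",
               cpuid->entries[i].function, cpuid->entries[i].index,
               cpuid->entries[i].eax, cpuid->entries[i].ecx);
    return 0;
}
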
@@ -2203,6 +2585,7 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
2203 return -ENXIO; 2585 return -ENXIO;
2204 2586
2205 kvm_queue_interrupt(vcpu, irq->irq, false); 2587 kvm_queue_interrupt(vcpu, irq->irq, false);
2588 kvm_make_request(KVM_REQ_EVENT, vcpu);
2206 2589
2207 return 0; 2590 return 0;
2208} 2591}
@@ -2272,9 +2655,6 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
2272 if (mce->status & MCI_STATUS_UC) { 2655 if (mce->status & MCI_STATUS_UC) {
2273 if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) || 2656 if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
2274 !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) { 2657 !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) {
2275 printk(KERN_DEBUG "kvm: set_mce: "
2276 "injects mce exception while "
2277 "previous one is in progress!\n");
2278 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 2658 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
2279 return 0; 2659 return 0;
2280 } 2660 }
@@ -2305,6 +2685,7 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
2305 !kvm_exception_is_soft(vcpu->arch.exception.nr); 2685 !kvm_exception_is_soft(vcpu->arch.exception.nr);
2306 events->exception.nr = vcpu->arch.exception.nr; 2686 events->exception.nr = vcpu->arch.exception.nr;
2307 events->exception.has_error_code = vcpu->arch.exception.has_error_code; 2687 events->exception.has_error_code = vcpu->arch.exception.has_error_code;
2688 events->exception.pad = 0;
2308 events->exception.error_code = vcpu->arch.exception.error_code; 2689 events->exception.error_code = vcpu->arch.exception.error_code;
2309 2690
2310 events->interrupt.injected = 2691 events->interrupt.injected =
@@ -2318,12 +2699,14 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
2318 events->nmi.injected = vcpu->arch.nmi_injected; 2699 events->nmi.injected = vcpu->arch.nmi_injected;
2319 events->nmi.pending = vcpu->arch.nmi_pending; 2700 events->nmi.pending = vcpu->arch.nmi_pending;
2320 events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu); 2701 events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu);
2702 events->nmi.pad = 0;
2321 2703
2322 events->sipi_vector = vcpu->arch.sipi_vector; 2704 events->sipi_vector = vcpu->arch.sipi_vector;
2323 2705
2324 events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING 2706 events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
2325 | KVM_VCPUEVENT_VALID_SIPI_VECTOR 2707 | KVM_VCPUEVENT_VALID_SIPI_VECTOR
2326 | KVM_VCPUEVENT_VALID_SHADOW); 2708 | KVM_VCPUEVENT_VALID_SHADOW);
2709 memset(&events->reserved, 0, sizeof(events->reserved));
2327} 2710}
2328 2711
2329static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, 2712static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
@@ -2342,8 +2725,6 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
2342 vcpu->arch.interrupt.pending = events->interrupt.injected; 2725 vcpu->arch.interrupt.pending = events->interrupt.injected;
2343 vcpu->arch.interrupt.nr = events->interrupt.nr; 2726 vcpu->arch.interrupt.nr = events->interrupt.nr;
2344 vcpu->arch.interrupt.soft = events->interrupt.soft; 2727 vcpu->arch.interrupt.soft = events->interrupt.soft;
2345 if (vcpu->arch.interrupt.pending && irqchip_in_kernel(vcpu->kvm))
2346 kvm_pic_clear_isr_ack(vcpu->kvm);
2347 if (events->flags & KVM_VCPUEVENT_VALID_SHADOW) 2728 if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
2348 kvm_x86_ops->set_interrupt_shadow(vcpu, 2729 kvm_x86_ops->set_interrupt_shadow(vcpu,
2349 events->interrupt.shadow); 2730 events->interrupt.shadow);
@@ -2356,6 +2737,8 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
2356 if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR) 2737 if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR)
2357 vcpu->arch.sipi_vector = events->sipi_vector; 2738 vcpu->arch.sipi_vector = events->sipi_vector;
2358 2739
2740 kvm_make_request(KVM_REQ_EVENT, vcpu);
2741
2359 return 0; 2742 return 0;
2360} 2743}
2361 2744
@@ -2366,6 +2749,7 @@ static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
2366 dbgregs->dr6 = vcpu->arch.dr6; 2749 dbgregs->dr6 = vcpu->arch.dr6;
2367 dbgregs->dr7 = vcpu->arch.dr7; 2750 dbgregs->dr7 = vcpu->arch.dr7;
2368 dbgregs->flags = 0; 2751 dbgregs->flags = 0;
2752 memset(&dbgregs->reserved, 0, sizeof(dbgregs->reserved));
2369} 2753}
2370 2754
2371static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, 2755static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
@@ -2715,6 +3099,32 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
2715 r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs); 3099 r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs);
2716 break; 3100 break;
2717 } 3101 }
3102 case KVM_SET_TSC_KHZ: {
3103 u32 user_tsc_khz;
3104
3105 r = -EINVAL;
3106 if (!kvm_has_tsc_control)
3107 break;
3108
3109 user_tsc_khz = (u32)arg;
3110
3111 if (user_tsc_khz >= kvm_max_guest_tsc_khz)
3112 goto out;
3113
3114 kvm_x86_ops->set_tsc_khz(vcpu, user_tsc_khz);
3115
3116 r = 0;
3117 goto out;
3118 }
3119 case KVM_GET_TSC_KHZ: {
3120 r = -EIO;
3121 if (check_tsc_unstable())
3122 goto out;
3123
3124 r = vcpu_tsc_khz(vcpu);
3125
3126 goto out;
3127 }
2718 default: 3128 default:
2719 r = -EINVAL; 3129 r = -EINVAL;
2720 } 3130 }
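
KVM_SET_TSC_KHZ and KVM_GET_TSC_KHZ are per-vCPU ioctls: the setter only succeeds when hardware TSC scaling exists and the requested rate is below the hardware maximum, and the getter fails with -EIO on an unstable TSC. A hedged userspace sketch, assuming kvm_fd is the open /dev/kvm descriptor and vcpu_fd an already-created vCPU:

#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int demo_tune_tsc(int kvm_fd, int vcpu_fd, unsigned int want_khz)
{
    int cur = ioctl(vcpu_fd, KVM_GET_TSC_KHZ);   /* negative on an unstable TSC */
    if (cur > 0)
        printf("current guest TSC rate: %d kHz\n", cur);

    /* KVM_SET_TSC_KHZ is only honoured when hardware TSC scaling is available. */
    if (!ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_TSC_CONTROL))
        return -1;

    return ioctl(vcpu_fd, KVM_SET_TSC_KHZ, want_khz);
}
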
@@ -2759,7 +3169,7 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
2759 3169
2760static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) 3170static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
2761{ 3171{
2762 return kvm->arch.n_alloc_mmu_pages; 3172 return kvm->arch.n_max_mmu_pages;
2763} 3173}
2764 3174
2765static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) 3175static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
@@ -2795,18 +3205,18 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
2795 r = 0; 3205 r = 0;
2796 switch (chip->chip_id) { 3206 switch (chip->chip_id) {
2797 case KVM_IRQCHIP_PIC_MASTER: 3207 case KVM_IRQCHIP_PIC_MASTER:
2798 raw_spin_lock(&pic_irqchip(kvm)->lock); 3208 spin_lock(&pic_irqchip(kvm)->lock);
2799 memcpy(&pic_irqchip(kvm)->pics[0], 3209 memcpy(&pic_irqchip(kvm)->pics[0],
2800 &chip->chip.pic, 3210 &chip->chip.pic,
2801 sizeof(struct kvm_pic_state)); 3211 sizeof(struct kvm_pic_state));
2802 raw_spin_unlock(&pic_irqchip(kvm)->lock); 3212 spin_unlock(&pic_irqchip(kvm)->lock);
2803 break; 3213 break;
2804 case KVM_IRQCHIP_PIC_SLAVE: 3214 case KVM_IRQCHIP_PIC_SLAVE:
2805 raw_spin_lock(&pic_irqchip(kvm)->lock); 3215 spin_lock(&pic_irqchip(kvm)->lock);
2806 memcpy(&pic_irqchip(kvm)->pics[1], 3216 memcpy(&pic_irqchip(kvm)->pics[1],
2807 &chip->chip.pic, 3217 &chip->chip.pic,
2808 sizeof(struct kvm_pic_state)); 3218 sizeof(struct kvm_pic_state));
2809 raw_spin_unlock(&pic_irqchip(kvm)->lock); 3219 spin_unlock(&pic_irqchip(kvm)->lock);
2810 break; 3220 break;
2811 case KVM_IRQCHIP_IOAPIC: 3221 case KVM_IRQCHIP_IOAPIC:
2812 r = kvm_set_ioapic(kvm, &chip->chip.ioapic); 3222 r = kvm_set_ioapic(kvm, &chip->chip.ioapic);
@@ -2849,6 +3259,7 @@ static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
2849 sizeof(ps->channels)); 3259 sizeof(ps->channels));
2850 ps->flags = kvm->arch.vpit->pit_state.flags; 3260 ps->flags = kvm->arch.vpit->pit_state.flags;
2851 mutex_unlock(&kvm->arch.vpit->pit_state.lock); 3261 mutex_unlock(&kvm->arch.vpit->pit_state.lock);
3262 memset(&ps->reserved, 0, sizeof(ps->reserved));
2852 return r; 3263 return r;
2853} 3264}
2854 3265
@@ -2912,24 +3323,18 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
2912 struct kvm_memslots *slots, *old_slots; 3323 struct kvm_memslots *slots, *old_slots;
2913 unsigned long *dirty_bitmap; 3324 unsigned long *dirty_bitmap;
2914 3325
2915 spin_lock(&kvm->mmu_lock); 3326 dirty_bitmap = memslot->dirty_bitmap_head;
2916 kvm_mmu_slot_remove_write_access(kvm, log->slot); 3327 if (memslot->dirty_bitmap == dirty_bitmap)
2917 spin_unlock(&kvm->mmu_lock); 3328 dirty_bitmap += n / sizeof(long);
2918
2919 r = -ENOMEM;
2920 dirty_bitmap = vmalloc(n);
2921 if (!dirty_bitmap)
2922 goto out;
2923 memset(dirty_bitmap, 0, n); 3329 memset(dirty_bitmap, 0, n);
2924 3330
2925 r = -ENOMEM; 3331 r = -ENOMEM;
2926 slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); 3332 slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
2927 if (!slots) { 3333 if (!slots)
2928 vfree(dirty_bitmap);
2929 goto out; 3334 goto out;
2930 }
2931 memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); 3335 memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
2932 slots->memslots[log->slot].dirty_bitmap = dirty_bitmap; 3336 slots->memslots[log->slot].dirty_bitmap = dirty_bitmap;
3337 slots->generation++;
2933 3338
2934 old_slots = kvm->memslots; 3339 old_slots = kvm->memslots;
2935 rcu_assign_pointer(kvm->memslots, slots); 3340 rcu_assign_pointer(kvm->memslots, slots);
@@ -2937,12 +3342,13 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
2937 dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap; 3342 dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap;
2938 kfree(old_slots); 3343 kfree(old_slots);
2939 3344
3345 spin_lock(&kvm->mmu_lock);
3346 kvm_mmu_slot_remove_write_access(kvm, log->slot);
3347 spin_unlock(&kvm->mmu_lock);
3348
2940 r = -EFAULT; 3349 r = -EFAULT;
2941 if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) { 3350 if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n))
2942 vfree(dirty_bitmap);
2943 goto out; 3351 goto out;
2944 }
2945 vfree(dirty_bitmap);
2946 } else { 3352 } else {
2947 r = -EFAULT; 3353 r = -EFAULT;
2948 if (clear_user(log->dirty_bitmap, n)) 3354 if (clear_user(log->dirty_bitmap, n))
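
The rework above stops vmalloc'ing a scratch bitmap on every call and instead flips between the two halves of the pre-allocated dirty_bitmap_head buffer, bumping the memslot generation so the swap is noticed. The userspace side of KVM_GET_DIRTY_LOG is unchanged; for orientation it looks roughly like this (slot numbering and sizes are assumptions of the sketch):

#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* One bit per guest page; the kernel copies a bitmap padded to a multiple of
 * sizeof(long), so allocate with the same rounding.  Caller frees the result. */
void *demo_fetch_dirty_bitmap(int vm_fd, unsigned int slot,
                              unsigned long memory_bytes, unsigned long page_size)
{
    unsigned long pages = memory_bytes / page_size;
    unsigned long bitmap_bytes = ((pages + 63) / 64) * 8;
    void *bitmap = calloc(1, bitmap_bytes);
    struct kvm_dirty_log log;

    if (!bitmap)
        return NULL;

    memset(&log, 0, sizeof(log));
    log.slot = slot;
    log.dirty_bitmap = bitmap;

    if (ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log) < 0) {
        free(bitmap);
        return NULL;
    }
    return bitmap;            /* set bits mark pages written since the previous call */
}
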
@@ -3009,8 +3415,10 @@ long kvm_arch_vm_ioctl(struct file *filp,
3009 if (vpic) { 3415 if (vpic) {
3010 r = kvm_ioapic_init(kvm); 3416 r = kvm_ioapic_init(kvm);
3011 if (r) { 3417 if (r) {
3418 mutex_lock(&kvm->slots_lock);
3012 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, 3419 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
3013 &vpic->dev); 3420 &vpic->dev);
3421 mutex_unlock(&kvm->slots_lock);
3014 kfree(vpic); 3422 kfree(vpic);
3015 goto create_irqchip_unlock; 3423 goto create_irqchip_unlock;
3016 } 3424 }
@@ -3021,10 +3429,12 @@ long kvm_arch_vm_ioctl(struct file *filp,
3021 smp_wmb(); 3429 smp_wmb();
3022 r = kvm_setup_default_irq_routing(kvm); 3430 r = kvm_setup_default_irq_routing(kvm);
3023 if (r) { 3431 if (r) {
3432 mutex_lock(&kvm->slots_lock);
3024 mutex_lock(&kvm->irq_lock); 3433 mutex_lock(&kvm->irq_lock);
3025 kvm_ioapic_destroy(kvm); 3434 kvm_ioapic_destroy(kvm);
3026 kvm_destroy_pic(kvm); 3435 kvm_destroy_pic(kvm);
3027 mutex_unlock(&kvm->irq_lock); 3436 mutex_unlock(&kvm->irq_lock);
3437 mutex_unlock(&kvm->slots_lock);
3028 } 3438 }
3029 create_irqchip_unlock: 3439 create_irqchip_unlock:
3030 mutex_unlock(&kvm->lock); 3440 mutex_unlock(&kvm->lock);
@@ -3200,7 +3610,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
3200 break; 3610 break;
3201 } 3611 }
3202 case KVM_SET_CLOCK: { 3612 case KVM_SET_CLOCK: {
3203 struct timespec now;
3204 struct kvm_clock_data user_ns; 3613 struct kvm_clock_data user_ns;
3205 u64 now_ns; 3614 u64 now_ns;
3206 s64 delta; 3615 s64 delta;
@@ -3214,21 +3623,23 @@ long kvm_arch_vm_ioctl(struct file *filp,
3214 goto out; 3623 goto out;
3215 3624
3216 r = 0; 3625 r = 0;
3217 ktime_get_ts(&now); 3626 local_irq_disable();
3218 now_ns = timespec_to_ns(&now); 3627 now_ns = get_kernel_ns();
3219 delta = user_ns.clock - now_ns; 3628 delta = user_ns.clock - now_ns;
3629 local_irq_enable();
3220 kvm->arch.kvmclock_offset = delta; 3630 kvm->arch.kvmclock_offset = delta;
3221 break; 3631 break;
3222 } 3632 }
3223 case KVM_GET_CLOCK: { 3633 case KVM_GET_CLOCK: {
3224 struct timespec now;
3225 struct kvm_clock_data user_ns; 3634 struct kvm_clock_data user_ns;
3226 u64 now_ns; 3635 u64 now_ns;
3227 3636
3228 ktime_get_ts(&now); 3637 local_irq_disable();
3229 now_ns = timespec_to_ns(&now); 3638 now_ns = get_kernel_ns();
3230 user_ns.clock = kvm->arch.kvmclock_offset + now_ns; 3639 user_ns.clock = kvm->arch.kvmclock_offset + now_ns;
3640 local_irq_enable();
3231 user_ns.flags = 0; 3641 user_ns.flags = 0;
3642 memset(&user_ns.pad, 0, sizeof(user_ns.pad));
3232 3643
3233 r = -EFAULT; 3644 r = -EFAULT;
3234 if (copy_to_user(argp, &user_ns, sizeof(user_ns))) 3645 if (copy_to_user(argp, &user_ns, sizeof(user_ns)))
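
KVM_GET_CLOCK and KVM_SET_CLOCK now sample get_kernel_ns() with interrupts disabled and zero the padding before copying the structure out. They are VM-level ioctls, typically used to save and restore the kvmclock offset across migration so guest time does not jump; a minimal sketch of that usage (error handling trimmed):

#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int demo_save_restore_clock(int vm_fd)
{
    struct kvm_clock_data data;

    memset(&data, 0, sizeof(data));
    if (ioctl(vm_fd, KVM_GET_CLOCK, &data) < 0)
        return -1;
    printf("kvmclock: %llu ns\n", (unsigned long long)data.clock);

    /* ... later, e.g. after the VM state has been restored elsewhere ... */
    data.flags = 0;
    return ioctl(vm_fd, KVM_SET_CLOCK, &data);
}
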
@@ -3263,20 +3674,43 @@ static void kvm_init_msr_list(void)
3263static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len, 3674static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
3264 const void *v) 3675 const void *v)
3265{ 3676{
3266 if (vcpu->arch.apic && 3677 int handled = 0;
3267 !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v)) 3678 int n;
3268 return 0; 3679
3680 do {
3681 n = min(len, 8);
3682 if (!(vcpu->arch.apic &&
3683 !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, n, v))
3684 && kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, n, v))
3685 break;
3686 handled += n;
3687 addr += n;
3688 len -= n;
3689 v += n;
3690 } while (len);
3269 3691
3270 return kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, len, v); 3692 return handled;
3271} 3693}
3272 3694
3273static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) 3695static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
3274{ 3696{
3275 if (vcpu->arch.apic && 3697 int handled = 0;
3276 !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, len, v)) 3698 int n;
3277 return 0; 3699
3700 do {
3701 n = min(len, 8);
3702 if (!(vcpu->arch.apic &&
3703 !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, n, v))
3704 && kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, n, v))
3705 break;
3706 trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, *(u64 *)v);
3707 handled += n;
3708 addr += n;
3709 len -= n;
3710 v += n;
3711 } while (len);
3278 3712
3279 return kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, len, v); 3713 return handled;
3280} 3714}
3281 3715
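
vcpu_mmio_read() and vcpu_mmio_write() now walk the access in pieces of at most eight bytes and return how many bytes a device actually claimed, which lets the emulator paths below hand only the unhandled tail to userspace. A standalone sketch of that chunking pattern (demo_dispatch() is a made-up stand-in for the APIC/io-bus lookup):

#include <stdio.h>

/* Made-up stand-in for the APIC/io-bus lookup; it "handles" the first 16 bytes. */
static int demo_dispatch(unsigned long addr, int len, const void *val)
{
    (void)val;
    return (addr + len <= 16) ? 0 : -1;     /* 0 = handled, non-zero = unclaimed */
}

static int demo_mmio_write(unsigned long addr, int len, const void *val)
{
    int handled = 0;

    while (len) {
        int n = len < 8 ? len : 8;

        if (demo_dispatch(addr, n, val))    /* stop at the first unclaimed chunk */
            break;
        handled += n;
        addr += n;
        len -= n;
        val = (const char *)val + n;
    }
    return handled;                         /* may be less than the original len */
}

int main(void)
{
    char buf[24] = { 0 };

    printf("handled %d of %zu bytes\n",
           demo_mmio_write(0, (int)sizeof(buf), buf), sizeof(buf));
    return 0;
}
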
3282static void kvm_set_segment(struct kvm_vcpu *vcpu, 3716static void kvm_set_segment(struct kvm_vcpu *vcpu,
@@ -3291,49 +3725,71 @@ void kvm_get_segment(struct kvm_vcpu *vcpu,
3291 kvm_x86_ops->get_segment(vcpu, var, seg); 3725 kvm_x86_ops->get_segment(vcpu, var, seg);
3292} 3726}
3293 3727
3294gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) 3728static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
3729{
3730 return gpa;
3731}
3732
3733static gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
3734{
3735 gpa_t t_gpa;
3736 struct x86_exception exception;
3737
3738 BUG_ON(!mmu_is_nested(vcpu));
3739
3740 /* NPT walks are always user-walks */
3741 access |= PFERR_USER_MASK;
3742 t_gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, &exception);
3743
3744 return t_gpa;
3745}
3746
3747gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
3748 struct x86_exception *exception)
3295{ 3749{
3296 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; 3750 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3297 return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); 3751 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
3298} 3752}
3299 3753
3300 gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) 3754 gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
3755 struct x86_exception *exception)
3301{ 3756{
3302 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; 3757 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3303 access |= PFERR_FETCH_MASK; 3758 access |= PFERR_FETCH_MASK;
3304 return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); 3759 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
3305} 3760}
3306 3761
3307gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) 3762gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
3763 struct x86_exception *exception)
3308{ 3764{
3309 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; 3765 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3310 access |= PFERR_WRITE_MASK; 3766 access |= PFERR_WRITE_MASK;
3311 return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); 3767 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
3312} 3768}
3313 3769
3314/* uses this to access any guest's mapped memory without checking CPL */ 3770/* uses this to access any guest's mapped memory without checking CPL */
3315gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) 3771gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
3772 struct x86_exception *exception)
3316{ 3773{
3317 return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, 0, error); 3774 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, exception);
3318} 3775}
3319 3776
3320static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, 3777static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
3321 struct kvm_vcpu *vcpu, u32 access, 3778 struct kvm_vcpu *vcpu, u32 access,
3322 u32 *error) 3779 struct x86_exception *exception)
3323{ 3780{
3324 void *data = val; 3781 void *data = val;
3325 int r = X86EMUL_CONTINUE; 3782 int r = X86EMUL_CONTINUE;
3326 3783
3327 while (bytes) { 3784 while (bytes) {
3328 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr, access, error); 3785 gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access,
3786 exception);
3329 unsigned offset = addr & (PAGE_SIZE-1); 3787 unsigned offset = addr & (PAGE_SIZE-1);
3330 unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset); 3788 unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
3331 int ret; 3789 int ret;
3332 3790
3333 if (gpa == UNMAPPED_GVA) { 3791 if (gpa == UNMAPPED_GVA)
3334 r = X86EMUL_PROPAGATE_FAULT; 3792 return X86EMUL_PROPAGATE_FAULT;
3335 goto out;
3336 }
3337 ret = kvm_read_guest(vcpu->kvm, gpa, data, toread); 3793 ret = kvm_read_guest(vcpu->kvm, gpa, data, toread);
3338 if (ret < 0) { 3794 if (ret < 0) {
3339 r = X86EMUL_IO_NEEDED; 3795 r = X86EMUL_IO_NEEDED;
@@ -3349,47 +3805,56 @@ out:
3349} 3805}
3350 3806
3351/* used for instruction fetching */ 3807/* used for instruction fetching */
3352static int kvm_fetch_guest_virt(gva_t addr, void *val, unsigned int bytes, 3808static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt,
3353 struct kvm_vcpu *vcpu, u32 *error) 3809 gva_t addr, void *val, unsigned int bytes,
3810 struct x86_exception *exception)
3354{ 3811{
3812 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
3355 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; 3813 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3814
3356 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 3815 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu,
3357 access | PFERR_FETCH_MASK, error); 3816 access | PFERR_FETCH_MASK,
3817 exception);
3358} 3818}
3359 3819
3360static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, 3820static int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt,
3361 struct kvm_vcpu *vcpu, u32 *error) 3821 gva_t addr, void *val, unsigned int bytes,
3822 struct x86_exception *exception)
3362{ 3823{
3824 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
3363 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; 3825 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3826
3364 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, 3827 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
3365 error); 3828 exception);
3366} 3829}
3367 3830
3368static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes, 3831static int kvm_read_guest_virt_system(struct x86_emulate_ctxt *ctxt,
3369 struct kvm_vcpu *vcpu, u32 *error) 3832 gva_t addr, void *val, unsigned int bytes,
3833 struct x86_exception *exception)
3370{ 3834{
3371 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, error); 3835 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
3836 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception);
3372} 3837}
3373 3838
3374static int kvm_write_guest_virt_system(gva_t addr, void *val, 3839static int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
3840 gva_t addr, void *val,
3375 unsigned int bytes, 3841 unsigned int bytes,
3376 struct kvm_vcpu *vcpu, 3842 struct x86_exception *exception)
3377 u32 *error)
3378{ 3843{
3844 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
3379 void *data = val; 3845 void *data = val;
3380 int r = X86EMUL_CONTINUE; 3846 int r = X86EMUL_CONTINUE;
3381 3847
3382 while (bytes) { 3848 while (bytes) {
3383 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr, 3849 gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr,
3384 PFERR_WRITE_MASK, error); 3850 PFERR_WRITE_MASK,
3851 exception);
3385 unsigned offset = addr & (PAGE_SIZE-1); 3852 unsigned offset = addr & (PAGE_SIZE-1);
3386 unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); 3853 unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
3387 int ret; 3854 int ret;
3388 3855
3389 if (gpa == UNMAPPED_GVA) { 3856 if (gpa == UNMAPPED_GVA)
3390 r = X86EMUL_PROPAGATE_FAULT; 3857 return X86EMUL_PROPAGATE_FAULT;
3391 goto out;
3392 }
3393 ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite); 3858 ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite);
3394 if (ret < 0) { 3859 if (ret < 0) {
3395 r = X86EMUL_IO_NEEDED; 3860 r = X86EMUL_IO_NEEDED;
@@ -3404,13 +3869,15 @@ out:
3404 return r; 3869 return r;
3405} 3870}
3406 3871
3407static int emulator_read_emulated(unsigned long addr, 3872static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
3873 unsigned long addr,
3408 void *val, 3874 void *val,
3409 unsigned int bytes, 3875 unsigned int bytes,
3410 unsigned int *error_code, 3876 struct x86_exception *exception)
3411 struct kvm_vcpu *vcpu)
3412{ 3877{
3878 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
3413 gpa_t gpa; 3879 gpa_t gpa;
3880 int handled;
3414 3881
3415 if (vcpu->mmio_read_completed) { 3882 if (vcpu->mmio_read_completed) {
3416 memcpy(val, vcpu->mmio_data, bytes); 3883 memcpy(val, vcpu->mmio_data, bytes);
@@ -3420,7 +3887,7 @@ static int emulator_read_emulated(unsigned long addr,
3420 return X86EMUL_CONTINUE; 3887 return X86EMUL_CONTINUE;
3421 } 3888 }
3422 3889
3423 gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, error_code); 3890 gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, exception);
3424 3891
3425 if (gpa == UNMAPPED_GVA) 3892 if (gpa == UNMAPPED_GVA)
3426 return X86EMUL_PROPAGATE_FAULT; 3893 return X86EMUL_PROPAGATE_FAULT;
@@ -3429,32 +3896,38 @@ static int emulator_read_emulated(unsigned long addr,
3429 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 3896 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
3430 goto mmio; 3897 goto mmio;
3431 3898
3432 if (kvm_read_guest_virt(addr, val, bytes, vcpu, NULL) 3899 if (kvm_read_guest_virt(ctxt, addr, val, bytes, exception)
3433 == X86EMUL_CONTINUE) 3900 == X86EMUL_CONTINUE)
3434 return X86EMUL_CONTINUE; 3901 return X86EMUL_CONTINUE;
3435 3902
3436mmio: 3903mmio:
3437 /* 3904 /*
3438 * Is this MMIO handled locally? 3905 * Is this MMIO handled locally?
3439 */ 3906 */
3440 if (!vcpu_mmio_read(vcpu, gpa, bytes, val)) { 3907 handled = vcpu_mmio_read(vcpu, gpa, bytes, val);
3441 trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, gpa, *(u64 *)val); 3908
3909 if (handled == bytes)
3442 return X86EMUL_CONTINUE; 3910 return X86EMUL_CONTINUE;
3443 } 3911
3912 gpa += handled;
3913 bytes -= handled;
3914 val += handled;
3444 3915
3445 trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0); 3916 trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0);
3446 3917
3447 vcpu->mmio_needed = 1; 3918 vcpu->mmio_needed = 1;
3448 vcpu->run->exit_reason = KVM_EXIT_MMIO; 3919 vcpu->run->exit_reason = KVM_EXIT_MMIO;
3449 vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; 3920 vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
3450 vcpu->run->mmio.len = vcpu->mmio_size = bytes; 3921 vcpu->mmio_size = bytes;
3922 vcpu->run->mmio.len = min(vcpu->mmio_size, 8);
3451 vcpu->run->mmio.is_write = vcpu->mmio_is_write = 0; 3923 vcpu->run->mmio.is_write = vcpu->mmio_is_write = 0;
3924 vcpu->mmio_index = 0;
3452 3925
3453 return X86EMUL_IO_NEEDED; 3926 return X86EMUL_IO_NEEDED;
3454} 3927}
3455 3928
3456int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, 3929int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
3457 const void *val, int bytes) 3930 const void *val, int bytes)
3458{ 3931{
3459 int ret; 3932 int ret;
3460 3933
@@ -3468,12 +3941,13 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
3468static int emulator_write_emulated_onepage(unsigned long addr, 3941static int emulator_write_emulated_onepage(unsigned long addr,
3469 const void *val, 3942 const void *val,
3470 unsigned int bytes, 3943 unsigned int bytes,
3471 unsigned int *error_code, 3944 struct x86_exception *exception,
3472 struct kvm_vcpu *vcpu) 3945 struct kvm_vcpu *vcpu)
3473{ 3946{
3474 gpa_t gpa; 3947 gpa_t gpa;
3948 int handled;
3475 3949
3476 gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error_code); 3950 gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);
3477 3951
3478 if (gpa == UNMAPPED_GVA) 3952 if (gpa == UNMAPPED_GVA)
3479 return X86EMUL_PROPAGATE_FAULT; 3953 return X86EMUL_PROPAGATE_FAULT;
@@ -3490,31 +3964,41 @@ mmio:
3490 /* 3964 /*
3491 * Is this MMIO handled locally? 3965 * Is this MMIO handled locally?
3492 */ 3966 */
3493 if (!vcpu_mmio_write(vcpu, gpa, bytes, val)) 3967 handled = vcpu_mmio_write(vcpu, gpa, bytes, val);
3968 if (handled == bytes)
3494 return X86EMUL_CONTINUE; 3969 return X86EMUL_CONTINUE;
3495 3970
3971 gpa += handled;
3972 bytes -= handled;
3973 val += handled;
3974
3496 vcpu->mmio_needed = 1; 3975 vcpu->mmio_needed = 1;
3976 memcpy(vcpu->mmio_data, val, bytes);
3497 vcpu->run->exit_reason = KVM_EXIT_MMIO; 3977 vcpu->run->exit_reason = KVM_EXIT_MMIO;
3498 vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; 3978 vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
3499 vcpu->run->mmio.len = vcpu->mmio_size = bytes; 3979 vcpu->mmio_size = bytes;
3980 vcpu->run->mmio.len = min(vcpu->mmio_size, 8);
3500 vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1; 3981 vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1;
3501 memcpy(vcpu->run->mmio.data, val, bytes); 3982 memcpy(vcpu->run->mmio.data, vcpu->mmio_data, 8);
3983 vcpu->mmio_index = 0;
3502 3984
3503 return X86EMUL_CONTINUE; 3985 return X86EMUL_CONTINUE;
3504} 3986}
3505 3987
3506int emulator_write_emulated(unsigned long addr, 3988int emulator_write_emulated(struct x86_emulate_ctxt *ctxt,
3989 unsigned long addr,
3507 const void *val, 3990 const void *val,
3508 unsigned int bytes, 3991 unsigned int bytes,
3509 unsigned int *error_code, 3992 struct x86_exception *exception)
3510 struct kvm_vcpu *vcpu)
3511{ 3993{
3994 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
3995
3512 /* Crossing a page boundary? */ 3996 /* Crossing a page boundary? */
3513 if (((addr + bytes - 1) ^ addr) & PAGE_MASK) { 3997 if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
3514 int rc, now; 3998 int rc, now;
3515 3999
3516 now = -addr & ~PAGE_MASK; 4000 now = -addr & ~PAGE_MASK;
3517 rc = emulator_write_emulated_onepage(addr, val, now, error_code, 4001 rc = emulator_write_emulated_onepage(addr, val, now, exception,
3518 vcpu); 4002 vcpu);
3519 if (rc != X86EMUL_CONTINUE) 4003 if (rc != X86EMUL_CONTINUE)
3520 return rc; 4004 return rc;
@@ -3522,7 +4006,7 @@ int emulator_write_emulated(unsigned long addr,
3522 val += now; 4006 val += now;
3523 bytes -= now; 4007 bytes -= now;
3524 } 4008 }
3525 return emulator_write_emulated_onepage(addr, val, bytes, error_code, 4009 return emulator_write_emulated_onepage(addr, val, bytes, exception,
3526 vcpu); 4010 vcpu);
3527} 4011}
3528 4012
@@ -3536,13 +4020,14 @@ int emulator_write_emulated(unsigned long addr,
3536 (cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u64 *)(new)) == *(u64 *)(old)) 4020 (cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u64 *)(new)) == *(u64 *)(old))
3537#endif 4021#endif
3538 4022
3539static int emulator_cmpxchg_emulated(unsigned long addr, 4023static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
4024 unsigned long addr,
3540 const void *old, 4025 const void *old,
3541 const void *new, 4026 const void *new,
3542 unsigned int bytes, 4027 unsigned int bytes,
3543 unsigned int *error_code, 4028 struct x86_exception *exception)
3544 struct kvm_vcpu *vcpu)
3545{ 4029{
4030 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
3546 gpa_t gpa; 4031 gpa_t gpa;
3547 struct page *page; 4032 struct page *page;
3548 char *kaddr; 4033 char *kaddr;
@@ -3598,7 +4083,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
3598emul_write: 4083emul_write:
3599 printk_once(KERN_WARNING "kvm: emulating exchange as write\n"); 4084 printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
3600 4085
3601 return emulator_write_emulated(addr, new, bytes, error_code, vcpu); 4086 return emulator_write_emulated(ctxt, addr, new, bytes, exception);
3602} 4087}
3603 4088
3604static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) 4089static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
@@ -3617,13 +4102,16 @@ static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
3617} 4102}
3618 4103
3619 4104
3620static int emulator_pio_in_emulated(int size, unsigned short port, void *val, 4105static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
3621 unsigned int count, struct kvm_vcpu *vcpu) 4106 int size, unsigned short port, void *val,
4107 unsigned int count)
3622{ 4108{
4109 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4110
3623 if (vcpu->arch.pio.count) 4111 if (vcpu->arch.pio.count)
3624 goto data_avail; 4112 goto data_avail;
3625 4113
3626 trace_kvm_pio(1, port, size, 1); 4114 trace_kvm_pio(0, port, size, count);
3627 4115
3628 vcpu->arch.pio.port = port; 4116 vcpu->arch.pio.port = port;
3629 vcpu->arch.pio.in = 1; 4117 vcpu->arch.pio.in = 1;
@@ -3647,11 +4135,13 @@ static int emulator_pio_in_emulated(int size, unsigned short port, void *val,
3647 return 0; 4135 return 0;
3648} 4136}
3649 4137
3650static int emulator_pio_out_emulated(int size, unsigned short port, 4138static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
3651 const void *val, unsigned int count, 4139 int size, unsigned short port,
3652 struct kvm_vcpu *vcpu) 4140 const void *val, unsigned int count)
3653{ 4141{
3654 trace_kvm_pio(0, port, size, 1); 4142 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4143
4144 trace_kvm_pio(1, port, size, count);
3655 4145
3656 vcpu->arch.pio.port = port; 4146 vcpu->arch.pio.port = port;
3657 vcpu->arch.pio.in = 0; 4147 vcpu->arch.pio.in = 0;
@@ -3680,10 +4170,9 @@ static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
3680 return kvm_x86_ops->get_segment_base(vcpu, seg); 4170 return kvm_x86_ops->get_segment_base(vcpu, seg);
3681} 4171}
3682 4172
3683int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) 4173static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address)
3684{ 4174{
3685 kvm_mmu_invlpg(vcpu, address); 4175 kvm_mmu_invlpg(emul_to_vcpu(ctxt), address);
3686 return X86EMUL_CONTINUE;
3687} 4176}
3688 4177
3689int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu) 4178int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
@@ -3692,31 +4181,33 @@ int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
3692 return X86EMUL_CONTINUE; 4181 return X86EMUL_CONTINUE;
3693 4182
3694 if (kvm_x86_ops->has_wbinvd_exit()) { 4183 if (kvm_x86_ops->has_wbinvd_exit()) {
4184 int cpu = get_cpu();
4185
4186 cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
3695 smp_call_function_many(vcpu->arch.wbinvd_dirty_mask, 4187 smp_call_function_many(vcpu->arch.wbinvd_dirty_mask,
3696 wbinvd_ipi, NULL, 1); 4188 wbinvd_ipi, NULL, 1);
4189 put_cpu();
3697 cpumask_clear(vcpu->arch.wbinvd_dirty_mask); 4190 cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
3698 } 4191 } else
3699 wbinvd(); 4192 wbinvd();
3700 return X86EMUL_CONTINUE; 4193 return X86EMUL_CONTINUE;
3701} 4194}
3702EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd); 4195EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd);
3703 4196
3704int emulate_clts(struct kvm_vcpu *vcpu) 4197static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt)
3705{ 4198{
3706 kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); 4199 kvm_emulate_wbinvd(emul_to_vcpu(ctxt));
3707 kvm_x86_ops->fpu_activate(vcpu);
3708 return X86EMUL_CONTINUE;
3709} 4200}
3710 4201
3711int emulator_get_dr(int dr, unsigned long *dest, struct kvm_vcpu *vcpu) 4202int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
3712{ 4203{
3713 return _kvm_get_dr(vcpu, dr, dest); 4204 return _kvm_get_dr(emul_to_vcpu(ctxt), dr, dest);
3714} 4205}
3715 4206
3716int emulator_set_dr(int dr, unsigned long value, struct kvm_vcpu *vcpu) 4207int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
3717{ 4208{
3718 4209
3719 return __kvm_set_dr(vcpu, dr, value); 4210 return __kvm_set_dr(emul_to_vcpu(ctxt), dr, value);
3720} 4211}
3721 4212
3722static u64 mk_cr_64(u64 curr_cr, u32 new_val) 4213static u64 mk_cr_64(u64 curr_cr, u32 new_val)
@@ -3724,8 +4215,9 @@ static u64 mk_cr_64(u64 curr_cr, u32 new_val)
3724 return (curr_cr & ~((1ULL << 32) - 1)) | new_val; 4215 return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
3725} 4216}
3726 4217
3727static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu) 4218static unsigned long emulator_get_cr(struct x86_emulate_ctxt *ctxt, int cr)
3728{ 4219{
4220 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
3729 unsigned long value; 4221 unsigned long value;
3730 4222
3731 switch (cr) { 4223 switch (cr) {
@@ -3736,7 +4228,7 @@ static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu)
3736 value = vcpu->arch.cr2; 4228 value = vcpu->arch.cr2;
3737 break; 4229 break;
3738 case 3: 4230 case 3:
3739 value = vcpu->arch.cr3; 4231 value = kvm_read_cr3(vcpu);
3740 break; 4232 break;
3741 case 4: 4233 case 4:
3742 value = kvm_read_cr4(vcpu); 4234 value = kvm_read_cr4(vcpu);
@@ -3752,8 +4244,9 @@ static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu)
3752 return value; 4244 return value;
3753} 4245}
3754 4246
3755static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu) 4247static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val)
3756{ 4248{
4249 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
3757 int res = 0; 4250 int res = 0;
3758 4251
3759 switch (cr) { 4252 switch (cr) {
@@ -3770,7 +4263,7 @@ static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu)
3770 res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val)); 4263 res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
3771 break; 4264 break;
3772 case 8: 4265 case 8:
3773 res = __kvm_set_cr8(vcpu, val & 0xfUL); 4266 res = kvm_set_cr8(vcpu, val);
3774 break; 4267 break;
3775 default: 4268 default:
3776 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); 4269 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
@@ -3780,28 +4273,45 @@ static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu)
3780 return res; 4273 return res;
3781} 4274}
3782 4275
3783static int emulator_get_cpl(struct kvm_vcpu *vcpu) 4276static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt)
3784{ 4277{
3785 return kvm_x86_ops->get_cpl(vcpu); 4278 return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt));
3786} 4279}
3787 4280
3788static void emulator_get_gdt(struct desc_ptr *dt, struct kvm_vcpu *vcpu) 4281static void emulator_get_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
3789{ 4282{
3790 kvm_x86_ops->get_gdt(vcpu, dt); 4283 kvm_x86_ops->get_gdt(emul_to_vcpu(ctxt), dt);
3791} 4284}
3792 4285
3793static unsigned long emulator_get_cached_segment_base(int seg, 4286static void emulator_get_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
3794 struct kvm_vcpu *vcpu)
3795{ 4287{
3796 return get_segment_base(vcpu, seg); 4288 kvm_x86_ops->get_idt(emul_to_vcpu(ctxt), dt);
3797} 4289}
3798 4290
3799static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg, 4291static void emulator_set_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
3800 struct kvm_vcpu *vcpu) 4292{
4293 kvm_x86_ops->set_gdt(emul_to_vcpu(ctxt), dt);
4294}
4295
4296static void emulator_set_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
4297{
4298 kvm_x86_ops->set_idt(emul_to_vcpu(ctxt), dt);
4299}
4300
4301static unsigned long emulator_get_cached_segment_base(
4302 struct x86_emulate_ctxt *ctxt, int seg)
4303{
4304 return get_segment_base(emul_to_vcpu(ctxt), seg);
4305}
4306
4307static bool emulator_get_segment(struct x86_emulate_ctxt *ctxt, u16 *selector,
4308 struct desc_struct *desc, u32 *base3,
4309 int seg)
3801{ 4310{
3802 struct kvm_segment var; 4311 struct kvm_segment var;
3803 4312
3804 kvm_get_segment(vcpu, &var, seg); 4313 kvm_get_segment(emul_to_vcpu(ctxt), &var, seg);
4314 *selector = var.selector;
3805 4315
3806 if (var.unusable) 4316 if (var.unusable)
3807 return false; 4317 return false;
@@ -3810,6 +4320,10 @@ static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg,
3810 var.limit >>= 12; 4320 var.limit >>= 12;
3811 set_desc_limit(desc, var.limit); 4321 set_desc_limit(desc, var.limit);
3812 set_desc_base(desc, (unsigned long)var.base); 4322 set_desc_base(desc, (unsigned long)var.base);
4323#ifdef CONFIG_X86_64
4324 if (base3)
4325 *base3 = var.base >> 32;
4326#endif
3813 desc->type = var.type; 4327 desc->type = var.type;
3814 desc->s = var.s; 4328 desc->s = var.s;
3815 desc->dpl = var.dpl; 4329 desc->dpl = var.dpl;
@@ -3822,15 +4336,18 @@ static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg,
3822 return true; 4336 return true;
3823} 4337}
3824 4338
3825static void emulator_set_cached_descriptor(struct desc_struct *desc, int seg, 4339static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector,
3826 struct kvm_vcpu *vcpu) 4340 struct desc_struct *desc, u32 base3,
4341 int seg)
3827{ 4342{
4343 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
3828 struct kvm_segment var; 4344 struct kvm_segment var;
3829 4345
3830 /* needed to preserve selector */ 4346 var.selector = selector;
3831 kvm_get_segment(vcpu, &var, seg);
3832
3833 var.base = get_desc_base(desc); 4347 var.base = get_desc_base(desc);
4348#ifdef CONFIG_X86_64
4349 var.base |= ((u64)base3) << 32;
4350#endif
3834 var.limit = get_desc_limit(desc); 4351 var.limit = get_desc_limit(desc);
3835 if (desc->g) 4352 if (desc->g)
3836 var.limit = (var.limit << 12) | 0xfff; 4353 var.limit = (var.limit << 12) | 0xfff;
@@ -3850,22 +4367,44 @@ static void emulator_set_cached_descriptor(struct desc_struct *desc, int seg,
3850 return; 4367 return;
3851} 4368}
3852 4369
3853static u16 emulator_get_segment_selector(int seg, struct kvm_vcpu *vcpu) 4370static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
4371 u32 msr_index, u64 *pdata)
3854{ 4372{
3855 struct kvm_segment kvm_seg; 4373 return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata);
4374}
3856 4375
3857 kvm_get_segment(vcpu, &kvm_seg, seg); 4376static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
3858 return kvm_seg.selector; 4377 u32 msr_index, u64 data)
4378{
4379 return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data);
4380}
4381
4382static void emulator_halt(struct x86_emulate_ctxt *ctxt)
4383{
4384 emul_to_vcpu(ctxt)->arch.halt_request = 1;
4385}
4386
4387static void emulator_get_fpu(struct x86_emulate_ctxt *ctxt)
4388{
4389 preempt_disable();
4390 kvm_load_guest_fpu(emul_to_vcpu(ctxt));
4391 /*
4392 * CR0.TS may reference the host fpu state, not the guest fpu state,
4393 * so it may be clear at this point.
4394 */
4395 clts();
3859} 4396}
3860 4397
3861static void emulator_set_segment_selector(u16 sel, int seg, 4398static void emulator_put_fpu(struct x86_emulate_ctxt *ctxt)
3862 struct kvm_vcpu *vcpu)
3863{ 4399{
3864 struct kvm_segment kvm_seg; 4400 preempt_enable();
4401}
3865 4402
3866 kvm_get_segment(vcpu, &kvm_seg, seg); 4403static int emulator_intercept(struct x86_emulate_ctxt *ctxt,
3867 kvm_seg.selector = sel; 4404 struct x86_instruction_info *info,
3868 kvm_set_segment(vcpu, &kvm_seg, seg); 4405 enum x86_intercept_stage stage)
4406{
4407 return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage);
3869} 4408}
3870 4409
3871static struct x86_emulate_ops emulate_ops = { 4410static struct x86_emulate_ops emulate_ops = {
@@ -3875,21 +4414,29 @@ static struct x86_emulate_ops emulate_ops = {
3875 .read_emulated = emulator_read_emulated, 4414 .read_emulated = emulator_read_emulated,
3876 .write_emulated = emulator_write_emulated, 4415 .write_emulated = emulator_write_emulated,
3877 .cmpxchg_emulated = emulator_cmpxchg_emulated, 4416 .cmpxchg_emulated = emulator_cmpxchg_emulated,
4417 .invlpg = emulator_invlpg,
3878 .pio_in_emulated = emulator_pio_in_emulated, 4418 .pio_in_emulated = emulator_pio_in_emulated,
3879 .pio_out_emulated = emulator_pio_out_emulated, 4419 .pio_out_emulated = emulator_pio_out_emulated,
3880 .get_cached_descriptor = emulator_get_cached_descriptor, 4420 .get_segment = emulator_get_segment,
3881 .set_cached_descriptor = emulator_set_cached_descriptor, 4421 .set_segment = emulator_set_segment,
3882 .get_segment_selector = emulator_get_segment_selector,
3883 .set_segment_selector = emulator_set_segment_selector,
3884 .get_cached_segment_base = emulator_get_cached_segment_base, 4422 .get_cached_segment_base = emulator_get_cached_segment_base,
3885 .get_gdt = emulator_get_gdt, 4423 .get_gdt = emulator_get_gdt,
4424 .get_idt = emulator_get_idt,
4425 .set_gdt = emulator_set_gdt,
4426 .set_idt = emulator_set_idt,
3886 .get_cr = emulator_get_cr, 4427 .get_cr = emulator_get_cr,
3887 .set_cr = emulator_set_cr, 4428 .set_cr = emulator_set_cr,
3888 .cpl = emulator_get_cpl, 4429 .cpl = emulator_get_cpl,
3889 .get_dr = emulator_get_dr, 4430 .get_dr = emulator_get_dr,
3890 .set_dr = emulator_set_dr, 4431 .set_dr = emulator_set_dr,
3891 .set_msr = kvm_set_msr, 4432 .set_msr = emulator_set_msr,
3892 .get_msr = kvm_get_msr, 4433 .get_msr = emulator_get_msr,
4434 .halt = emulator_halt,
4435 .wbinvd = emulator_wbinvd,
4436 .fix_hypercall = emulator_fix_hypercall,
4437 .get_fpu = emulator_get_fpu,
4438 .put_fpu = emulator_put_fpu,
4439 .intercept = emulator_intercept,
3893}; 4440};
3894 4441
3895static void cache_all_regs(struct kvm_vcpu *vcpu) 4442static void cache_all_regs(struct kvm_vcpu *vcpu)
@@ -3917,23 +4464,89 @@ static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
3917static void inject_emulated_exception(struct kvm_vcpu *vcpu) 4464static void inject_emulated_exception(struct kvm_vcpu *vcpu)
3918{ 4465{
3919 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; 4466 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
3920 if (ctxt->exception == PF_VECTOR) 4467 if (ctxt->exception.vector == PF_VECTOR)
3921 kvm_inject_page_fault(vcpu, ctxt->cr2, ctxt->error_code); 4468 kvm_propagate_fault(vcpu, &ctxt->exception);
3922 else if (ctxt->error_code_valid) 4469 else if (ctxt->exception.error_code_valid)
3923 kvm_queue_exception_e(vcpu, ctxt->exception, ctxt->error_code); 4470 kvm_queue_exception_e(vcpu, ctxt->exception.vector,
4471 ctxt->exception.error_code);
4472 else
4473 kvm_queue_exception(vcpu, ctxt->exception.vector);
4474}
4475
4476static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
4477{
4478 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
4479 int cs_db, cs_l;
4480
4481 /*
4482 * TODO: fix emulate.c to use guest_read/write_register
4483 * instead of direct ->regs accesses, can save hundred cycles
4484 * on Intel for instructions that don't read/change RSP, for
4485 * for example.
4486 */
4487 cache_all_regs(vcpu);
4488
4489 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
4490
4491 vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu);
4492 vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu);
4493 vcpu->arch.emulate_ctxt.mode =
4494 (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
4495 (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
4496 ? X86EMUL_MODE_VM86 : cs_l
4497 ? X86EMUL_MODE_PROT64 : cs_db
4498 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
4499 vcpu->arch.emulate_ctxt.guest_mode = is_guest_mode(vcpu);
4500 memset(c, 0, sizeof(struct decode_cache));
4501 memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
4502 vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
4503}
4504
4505int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
4506{
4507 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
4508 int ret;
4509
4510 init_emulate_ctxt(vcpu);
4511
4512 vcpu->arch.emulate_ctxt.decode.op_bytes = 2;
4513 vcpu->arch.emulate_ctxt.decode.ad_bytes = 2;
4514 vcpu->arch.emulate_ctxt.decode.eip = vcpu->arch.emulate_ctxt.eip +
4515 inc_eip;
4516 ret = emulate_int_real(&vcpu->arch.emulate_ctxt, &emulate_ops, irq);
4517
4518 if (ret != X86EMUL_CONTINUE)
4519 return EMULATE_FAIL;
4520
4521 vcpu->arch.emulate_ctxt.eip = c->eip;
4522 memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
4523 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
4524 kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
4525
4526 if (irq == NMI_VECTOR)
4527 vcpu->arch.nmi_pending = false;
3924 else 4528 else
3925 kvm_queue_exception(vcpu, ctxt->exception); 4529 vcpu->arch.interrupt.pending = false;
4530
4531 return EMULATE_DONE;
3926} 4532}
4533EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt);
3927 4534
3928static int handle_emulation_failure(struct kvm_vcpu *vcpu) 4535static int handle_emulation_failure(struct kvm_vcpu *vcpu)
3929{ 4536{
4537 int r = EMULATE_DONE;
4538
3930 ++vcpu->stat.insn_emulation_fail; 4539 ++vcpu->stat.insn_emulation_fail;
3931 trace_kvm_emulate_insn_failed(vcpu); 4540 trace_kvm_emulate_insn_failed(vcpu);
3932 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 4541 if (!is_guest_mode(vcpu)) {
3933 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; 4542 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
3934 vcpu->run->internal.ndata = 0; 4543 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
4544 vcpu->run->internal.ndata = 0;
4545 r = EMULATE_FAIL;
4546 }
3935 kvm_queue_exception(vcpu, UD_VECTOR); 4547 kvm_queue_exception(vcpu, UD_VECTOR);
3936 return EMULATE_FAIL; 4548
4549 return r;
3937} 4550}
3938 4551
3939static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva) 4552static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
@@ -3962,74 +4575,34 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
3962 return false; 4575 return false;
3963} 4576}
3964 4577
3965int emulate_instruction(struct kvm_vcpu *vcpu, 4578int x86_emulate_instruction(struct kvm_vcpu *vcpu,
3966 unsigned long cr2, 4579 unsigned long cr2,
3967 u16 error_code, 4580 int emulation_type,
3968 int emulation_type) 4581 void *insn,
4582 int insn_len)
3969{ 4583{
3970 int r; 4584 int r;
3971 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; 4585 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
4586 bool writeback = true;
3972 4587
3973 kvm_clear_exception_queue(vcpu); 4588 kvm_clear_exception_queue(vcpu);
3974 vcpu->arch.mmio_fault_cr2 = cr2;
3975 /*
3976 * TODO: fix emulate.c to use guest_read/write_register
3977 * instead of direct ->regs accesses, can save hundred cycles
3978 * on Intel for instructions that don't read/change RSP, for
3979 * for example.
3980 */
3981 cache_all_regs(vcpu);
3982 4589
3983 if (!(emulation_type & EMULTYPE_NO_DECODE)) { 4590 if (!(emulation_type & EMULTYPE_NO_DECODE)) {
3984 int cs_db, cs_l; 4591 init_emulate_ctxt(vcpu);
3985 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
3986
3987 vcpu->arch.emulate_ctxt.vcpu = vcpu;
3988 vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
3989 vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu);
3990 vcpu->arch.emulate_ctxt.mode =
3991 (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
3992 (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
3993 ? X86EMUL_MODE_VM86 : cs_l
3994 ? X86EMUL_MODE_PROT64 : cs_db
3995 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
3996 memset(c, 0, sizeof(struct decode_cache));
3997 memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
3998 vcpu->arch.emulate_ctxt.interruptibility = 0; 4592 vcpu->arch.emulate_ctxt.interruptibility = 0;
3999 vcpu->arch.emulate_ctxt.exception = -1; 4593 vcpu->arch.emulate_ctxt.have_exception = false;
4000 4594 vcpu->arch.emulate_ctxt.perm_ok = false;
4001 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
4002 trace_kvm_emulate_insn_start(vcpu);
4003 4595
4004 /* Only allow emulation of specific instructions on #UD 4596 vcpu->arch.emulate_ctxt.only_vendor_specific_insn
4005 * (namely VMMCALL, sysenter, sysexit, syscall)*/ 4597 = emulation_type & EMULTYPE_TRAP_UD;
4006 if (emulation_type & EMULTYPE_TRAP_UD) {
4007 if (!c->twobyte)
4008 return EMULATE_FAIL;
4009 switch (c->b) {
4010 case 0x01: /* VMMCALL */
4011 if (c->modrm_mod != 3 || c->modrm_rm != 1)
4012 return EMULATE_FAIL;
4013 break;
4014 case 0x34: /* sysenter */
4015 case 0x35: /* sysexit */
4016 if (c->modrm_mod != 0 || c->modrm_rm != 0)
4017 return EMULATE_FAIL;
4018 break;
4019 case 0x05: /* syscall */
4020 if (c->modrm_mod != 0 || c->modrm_rm != 0)
4021 return EMULATE_FAIL;
4022 break;
4023 default:
4024 return EMULATE_FAIL;
4025 }
4026 4598
4027 if (!(c->modrm_reg == 0 || c->modrm_reg == 3)) 4599 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, insn, insn_len);
4028 return EMULATE_FAIL;
4029 }
4030 4600
4601 trace_kvm_emulate_insn_start(vcpu);
4031 ++vcpu->stat.insn_emulation; 4602 ++vcpu->stat.insn_emulation;
4032 if (r) { 4603 if (r) {
4604 if (emulation_type & EMULTYPE_TRAP_UD)
4605 return EMULATE_FAIL;
4033 if (reexecute_instruction(vcpu, cr2)) 4606 if (reexecute_instruction(vcpu, cr2))
4034 return EMULATE_DONE; 4607 return EMULATE_DONE;
4035 if (emulation_type & EMULTYPE_SKIP) 4608 if (emulation_type & EMULTYPE_SKIP)
@@ -4043,62 +4616,87 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
4043 return EMULATE_DONE; 4616 return EMULATE_DONE;
4044 } 4617 }
4045 4618
4046 /* this is needed for vmware backdor interface to work since it 4619 /* this is needed for vmware backdoor interface to work since it
4047 changes registers values during IO operation */ 4620 changes registers values during IO operation */
4048 memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); 4621 if (vcpu->arch.emulate_regs_need_sync_from_vcpu) {
4622 vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
4623 memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
4624 }
4049 4625
4050restart: 4626restart:
4051 r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); 4627 r = x86_emulate_insn(&vcpu->arch.emulate_ctxt);
4628
4629 if (r == EMULATION_INTERCEPTED)
4630 return EMULATE_DONE;
4052 4631
4053 if (r) { /* emulation failed */ 4632 if (r == EMULATION_FAILED) {
4054 if (reexecute_instruction(vcpu, cr2)) 4633 if (reexecute_instruction(vcpu, cr2))
4055 return EMULATE_DONE; 4634 return EMULATE_DONE;
4056 4635
4057 return handle_emulation_failure(vcpu); 4636 return handle_emulation_failure(vcpu);
4058 } 4637 }
4059 4638
4060 toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility); 4639 if (vcpu->arch.emulate_ctxt.have_exception) {
4061 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
4062 memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
4063 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
4064
4065 if (vcpu->arch.emulate_ctxt.exception >= 0) {
4066 inject_emulated_exception(vcpu); 4640 inject_emulated_exception(vcpu);
4067 return EMULATE_DONE; 4641 r = EMULATE_DONE;
4068 } 4642 } else if (vcpu->arch.pio.count) {
4069
4070 if (vcpu->arch.pio.count) {
4071 if (!vcpu->arch.pio.in) 4643 if (!vcpu->arch.pio.in)
4072 vcpu->arch.pio.count = 0; 4644 vcpu->arch.pio.count = 0;
4073 return EMULATE_DO_MMIO; 4645 else
4074 } 4646 writeback = false;
4075 4647 r = EMULATE_DO_MMIO;
4076 if (vcpu->mmio_needed) { 4648 } else if (vcpu->mmio_needed) {
4077 if (vcpu->mmio_is_write) 4649 if (!vcpu->mmio_is_write)
4078 vcpu->mmio_needed = 0; 4650 writeback = false;
4079 return EMULATE_DO_MMIO; 4651 r = EMULATE_DO_MMIO;
4080 } 4652 } else if (r == EMULATION_RESTART)
4081
4082 if (vcpu->arch.emulate_ctxt.restart)
4083 goto restart; 4653 goto restart;
4654 else
4655 r = EMULATE_DONE;
4656
4657 if (writeback) {
4658 toggle_interruptibility(vcpu,
4659 vcpu->arch.emulate_ctxt.interruptibility);
4660 kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
4661 kvm_make_request(KVM_REQ_EVENT, vcpu);
4662 memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
4663 vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
4664 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
4665 } else
4666 vcpu->arch.emulate_regs_need_sync_to_vcpu = true;
4084 4667
4085 return EMULATE_DONE; 4668 return r;
4086} 4669}
4087EXPORT_SYMBOL_GPL(emulate_instruction); 4670EXPORT_SYMBOL_GPL(x86_emulate_instruction);
4088 4671
4089int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port) 4672int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port)
4090{ 4673{
4091 unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX); 4674 unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX);
4092 int ret = emulator_pio_out_emulated(size, port, &val, 1, vcpu); 4675 int ret = emulator_pio_out_emulated(&vcpu->arch.emulate_ctxt,
4676 size, port, &val, 1);
4093 /* do not return to emulator after return from userspace */ 4677 /* do not return to emulator after return from userspace */
4094 vcpu->arch.pio.count = 0; 4678 vcpu->arch.pio.count = 0;
4095 return ret; 4679 return ret;
4096} 4680}
4097EXPORT_SYMBOL_GPL(kvm_fast_pio_out); 4681EXPORT_SYMBOL_GPL(kvm_fast_pio_out);
4098 4682
4099static void bounce_off(void *info) 4683static void tsc_bad(void *info)
4100{ 4684{
4101 /* nothing */ 4685 __this_cpu_write(cpu_tsc_khz, 0);
4686}
4687
4688static void tsc_khz_changed(void *data)
4689{
4690 struct cpufreq_freqs *freq = data;
4691 unsigned long khz = 0;
4692
4693 if (data)
4694 khz = freq->new;
4695 else if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
4696 khz = cpufreq_quick_get(raw_smp_processor_id());
4697 if (!khz)
4698 khz = tsc_khz;
4699 __this_cpu_write(cpu_tsc_khz, khz);
4102} 4700}
4103 4701
4104static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val, 4702static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
@@ -4109,24 +4707,63 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
4109 struct kvm_vcpu *vcpu; 4707 struct kvm_vcpu *vcpu;
4110 int i, send_ipi = 0; 4708 int i, send_ipi = 0;
4111 4709
4710 /*
4711 * We allow guests to temporarily run on slowing clocks,
4712 * provided we notify them after, or to run on accelerating
4713 * clocks, provided we notify them before. Thus time never
4714 * goes backwards.
4715 *
4716 * However, we have a problem. We can't atomically update
4717 * the frequency of a given CPU from this function; it is
4718 * merely a notifier, which can be called from any CPU.
4719 * Changing the TSC frequency at arbitrary points in time
4720 * requires a recomputation of local variables related to
4721 * the TSC for each VCPU. We must flag these local variables
4722 * to be updated and be sure the update takes place with the
4723 * new frequency before any guests proceed.
4724 *
4725 * Unfortunately, the combination of hotplug CPU and frequency
4726 * change creates an intractable locking scenario; the order
4727 * of when these callouts happen is undefined with respect to
4728 * CPU hotplug, and they can race with each other. As such,
4729 * merely setting per_cpu(cpu_tsc_khz) = X during a hotadd is
4730 * undefined; you can actually have a CPU frequency change take
4731 * place in between the computation of X and the setting of the
4732 * variable. To protect against this problem, all updates of
4733 * the per_cpu tsc_khz variable are done in an interrupt
4734 * protected IPI, and all callers wishing to update the value
4735 * must wait for a synchronous IPI to complete (which is trivial
4736 * if the caller is on the CPU already). This establishes the
4737 * necessary total order on variable updates.
4738 *
4739 * Note that because a guest time update may take place
4740 * anytime after the setting of the VCPU's request bit, the
4741 * correct TSC value must be set before the request. However,
4742 * to ensure the update actually makes it to any guest which
4743 * starts running in hardware virtualization between the set
4744 * and the acquisition of the spinlock, we must also ping the
4745 * CPU after setting the request bit.
4746 *
4747 */
4748
4112 if (val == CPUFREQ_PRECHANGE && freq->old > freq->new) 4749 if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
4113 return 0; 4750 return 0;
4114 if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new) 4751 if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
4115 return 0; 4752 return 0;
4116 per_cpu(cpu_tsc_khz, freq->cpu) = freq->new;
4117 4753
4118 spin_lock(&kvm_lock); 4754 smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
4755
4756 raw_spin_lock(&kvm_lock);
4119 list_for_each_entry(kvm, &vm_list, vm_list) { 4757 list_for_each_entry(kvm, &vm_list, vm_list) {
4120 kvm_for_each_vcpu(i, vcpu, kvm) { 4758 kvm_for_each_vcpu(i, vcpu, kvm) {
4121 if (vcpu->cpu != freq->cpu) 4759 if (vcpu->cpu != freq->cpu)
4122 continue; 4760 continue;
4123 if (!kvm_request_guest_time_update(vcpu)) 4761 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
4124 continue;
4125 if (vcpu->cpu != smp_processor_id()) 4762 if (vcpu->cpu != smp_processor_id())
4126 send_ipi++; 4763 send_ipi = 1;
4127 } 4764 }
4128 } 4765 }
4129 spin_unlock(&kvm_lock); 4766 raw_spin_unlock(&kvm_lock);
4130 4767
4131 if (freq->old < freq->new && send_ipi) { 4768 if (freq->old < freq->new && send_ipi) {
4132 /* 4769 /*
@@ -4141,32 +4778,59 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
4141 * guest context is entered kvmclock will be updated, 4778 * guest context is entered kvmclock will be updated,
4142 * so the guest will not see stale values. 4779 * so the guest will not see stale values.
4143 */ 4780 */
4144 smp_call_function_single(freq->cpu, bounce_off, NULL, 1); 4781 smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
4145 } 4782 }
4146 return 0; 4783 return 0;
4147} 4784}
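
The block comment above the notifier reduces to two rules: a guest may be told about a slower clock only after the change and about a faster clock only before it, so kvmclock time never goes backwards, and every per-CPU tsc_khz update is funnelled through an IPI so the updates are totally ordered. A standalone user-space sketch of just the PRECHANGE/POSTCHANGE filtering is shown below; the names and the main() driver are illustrative only, and the real notifier additionally walks vm_list and kicks remote CPUs.

#include <stdio.h>
#include <stdbool.h>

enum phase { PRECHANGE, POSTCHANGE };

/*
 * Mirror of the two early returns in kvmclock_cpufreq_notifier():
 * act on PRECHANGE only when the clock speeds up, and on POSTCHANGE
 * only when it slows down, so guest time never goes backwards.
 */
static bool should_update_guests(enum phase p, unsigned long old_khz,
				 unsigned long new_khz)
{
	if (p == PRECHANGE && old_khz > new_khz)
		return false;
	if (p == POSTCHANGE && old_khz < new_khz)
		return false;
	return true;
}

int main(void)
{
	printf("%d\n", should_update_guests(PRECHANGE, 2000000, 2600000));  /* 1 */
	printf("%d\n", should_update_guests(PRECHANGE, 2600000, 2000000));  /* 0 */
	printf("%d\n", should_update_guests(POSTCHANGE, 2600000, 2000000)); /* 1 */
	return 0;
}
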
4148 4785
4149static struct notifier_block kvmclock_cpufreq_notifier_block = { 4786static struct notifier_block kvmclock_cpufreq_notifier_block = {
4150 .notifier_call = kvmclock_cpufreq_notifier 4787 .notifier_call = kvmclock_cpufreq_notifier
4788};
4789
4790static int kvmclock_cpu_notifier(struct notifier_block *nfb,
4791 unsigned long action, void *hcpu)
4792{
4793 unsigned int cpu = (unsigned long)hcpu;
4794
4795 switch (action) {
4796 case CPU_ONLINE:
4797 case CPU_DOWN_FAILED:
4798 smp_call_function_single(cpu, tsc_khz_changed, NULL, 1);
4799 break;
4800 case CPU_DOWN_PREPARE:
4801 smp_call_function_single(cpu, tsc_bad, NULL, 1);
4802 break;
4803 }
4804 return NOTIFY_OK;
4805}
4806
4807static struct notifier_block kvmclock_cpu_notifier_block = {
4808 .notifier_call = kvmclock_cpu_notifier,
4809 .priority = -INT_MAX
4151}; 4810};
4152 4811
4153static void kvm_timer_init(void) 4812static void kvm_timer_init(void)
4154{ 4813{
4155 int cpu; 4814 int cpu;
4156 4815
4816 max_tsc_khz = tsc_khz;
4817 register_hotcpu_notifier(&kvmclock_cpu_notifier_block);
4157 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { 4818 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
4819#ifdef CONFIG_CPU_FREQ
4820 struct cpufreq_policy policy;
4821 memset(&policy, 0, sizeof(policy));
4822 cpu = get_cpu();
4823 cpufreq_get_policy(&policy, cpu);
4824 if (policy.cpuinfo.max_freq)
4825 max_tsc_khz = policy.cpuinfo.max_freq;
4826 put_cpu();
4827#endif
4158 cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block, 4828 cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
4159 CPUFREQ_TRANSITION_NOTIFIER); 4829 CPUFREQ_TRANSITION_NOTIFIER);
4160 for_each_online_cpu(cpu) {
4161 unsigned long khz = cpufreq_get(cpu);
4162 if (!khz)
4163 khz = tsc_khz;
4164 per_cpu(cpu_tsc_khz, cpu) = khz;
4165 }
4166 } else {
4167 for_each_possible_cpu(cpu)
4168 per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
4169 } 4830 }
4831 pr_debug("kvm: max_tsc_khz = %ld\n", max_tsc_khz);
4832 for_each_online_cpu(cpu)
4833 smp_call_function_single(cpu, tsc_khz_changed, NULL, 1);
4170} 4834}
4171 4835
4172static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu); 4836static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu);
@@ -4244,7 +4908,6 @@ int kvm_arch_init(void *opaque)
4244 4908
4245 kvm_x86_ops = ops; 4909 kvm_x86_ops = ops;
4246 kvm_mmu_set_nonpresent_ptes(0ull, 0ull); 4910 kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
4247 kvm_mmu_set_base_ptes(PT_PRESENT_MASK);
4248 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, 4911 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
4249 PT_DIRTY_MASK, PT64_NX_MASK, 0); 4912 PT_DIRTY_MASK, PT64_NX_MASK, 0);
4250 4913
@@ -4268,6 +4931,7 @@ void kvm_arch_exit(void)
4268 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) 4931 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
4269 cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block, 4932 cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
4270 CPUFREQ_TRANSITION_NOTIFIER); 4933 CPUFREQ_TRANSITION_NOTIFIER);
4934 unregister_hotcpu_notifier(&kvmclock_cpu_notifier_block);
4271 kvm_x86_ops = NULL; 4935 kvm_x86_ops = NULL;
4272 kvm_mmu_module_exit(); 4936 kvm_mmu_module_exit();
4273} 4937}
@@ -4403,8 +5067,9 @@ out:
4403} 5067}
4404EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); 5068EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
4405 5069
4406int kvm_fix_hypercall(struct kvm_vcpu *vcpu) 5070int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
4407{ 5071{
5072 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4408 char instruction[3]; 5073 char instruction[3];
4409 unsigned long rip = kvm_rip_read(vcpu); 5074 unsigned long rip = kvm_rip_read(vcpu);
4410 5075
@@ -4417,21 +5082,8 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
4417 5082
4418 kvm_x86_ops->patch_hypercall(vcpu, instruction); 5083 kvm_x86_ops->patch_hypercall(vcpu, instruction);
4419 5084
4420 return emulator_write_emulated(rip, instruction, 3, NULL, vcpu); 5085 return emulator_write_emulated(&vcpu->arch.emulate_ctxt,
4421} 5086 rip, instruction, 3, NULL);
4422
4423void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
4424{
4425 struct desc_ptr dt = { limit, base };
4426
4427 kvm_x86_ops->set_gdt(vcpu, &dt);
4428}
4429
4430void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
4431{
4432 struct desc_ptr dt = { limit, base };
4433
4434 kvm_x86_ops->set_idt(vcpu, &dt);
4435} 5087}
4436 5088
4437static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) 5089static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
@@ -4482,12 +5134,6 @@ struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
4482 best = e; 5134 best = e;
4483 break; 5135 break;
4484 } 5136 }
4485 /*
4486 * Both basic or both extended?
4487 */
4488 if (((e->function ^ function) & 0x80000000) == 0)
4489 if (!best || e->function > best->function)
4490 best = e;
4491 } 5137 }
4492 return best; 5138 return best;
4493} 5139}
@@ -4507,6 +5153,27 @@ not_found:
4507 return 36; 5153 return 36;
4508} 5154}
4509 5155
5156/*
5157 * If no match is found, check whether we exceed the vCPU's limit
5158 * and return the content of the highest valid _standard_ leaf instead.
5159 * This is to satisfy the CPUID specification.
5160 */
5161static struct kvm_cpuid_entry2* check_cpuid_limit(struct kvm_vcpu *vcpu,
5162 u32 function, u32 index)
5163{
5164 struct kvm_cpuid_entry2 *maxlevel;
5165
5166 maxlevel = kvm_find_cpuid_entry(vcpu, function & 0x80000000, 0);
5167 if (!maxlevel || maxlevel->eax >= function)
5168 return NULL;
5169 if (function & 0x80000000) {
5170 maxlevel = kvm_find_cpuid_entry(vcpu, 0, 0);
5171 if (!maxlevel)
5172 return NULL;
5173 }
5174 return kvm_find_cpuid_entry(vcpu, maxlevel->eax, index);
5175}
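
check_cpuid_limit() mimics what hardware does for out-of-range CPUID requests: if the function number is above the reported maximum (leaf 0.EAX for the basic range, leaf 0x80000000.EAX for the extended range), the data of the highest valid basic leaf is returned instead. A self-contained sketch of the same lookup-with-fallback over a flat table follows; the table contents and helper names are made up, and the kernel walks vcpu->arch.cpuid_entries rather than a static array.

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

struct leaf { uint32_t function; uint32_t eax; };

/* Toy table: basic leaves 0..2, extended leaves 0x80000000..0x80000001. */
static const struct leaf table[] = {
	{ 0x00000000, 2 },           /* leaf 0: EAX = highest basic leaf   */
	{ 0x00000001, 0x000306c3 },
	{ 0x00000002, 0x76036301 },
	{ 0x80000000, 0x80000001 },  /* EAX = highest extended leaf        */
	{ 0x80000001, 0x20100800 },
};

static const struct leaf *find_leaf(uint32_t function)
{
	size_t i;

	for (i = 0; i < sizeof(table) / sizeof(table[0]); i++)
		if (table[i].function == function)
			return &table[i];
	return NULL;
}

/* Same idea as check_cpuid_limit(): requests above the limit fall back
 * to the highest valid basic leaf, as the CPUID spec describes. */
static const struct leaf *find_with_limit(uint32_t function)
{
	const struct leaf *max, *hit = find_leaf(function);

	if (hit)
		return hit;
	max = find_leaf(function & 0x80000000);
	if (!max || max->eax >= function)
		return NULL;
	if (function & 0x80000000) {
		max = find_leaf(0);
		if (!max)
			return NULL;
	}
	return find_leaf(max->eax);
}

int main(void)
{
	const struct leaf *l = find_with_limit(0x80000008); /* above the limit */

	printf("%s\n", l ? "fell back to the highest basic leaf" : "no data");
	return 0;
}
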
5176
4510void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) 5177void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
4511{ 5178{
4512 u32 function, index; 5179 u32 function, index;
@@ -4519,6 +5186,10 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
4519 kvm_register_write(vcpu, VCPU_REGS_RCX, 0); 5186 kvm_register_write(vcpu, VCPU_REGS_RCX, 0);
4520 kvm_register_write(vcpu, VCPU_REGS_RDX, 0); 5187 kvm_register_write(vcpu, VCPU_REGS_RDX, 0);
4521 best = kvm_find_cpuid_entry(vcpu, function, index); 5188 best = kvm_find_cpuid_entry(vcpu, function, index);
5189
5190 if (!best)
5191 best = check_cpuid_limit(vcpu, function, index);
5192
4522 if (best) { 5193 if (best) {
4523 kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax); 5194 kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax);
4524 kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx); 5195 kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx);
@@ -4675,6 +5346,7 @@ static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
4675static int vcpu_enter_guest(struct kvm_vcpu *vcpu) 5346static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
4676{ 5347{
4677 int r; 5348 int r;
5349 bool nmi_pending;
4678 bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && 5350 bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
4679 vcpu->run->request_interrupt_window; 5351 vcpu->run->request_interrupt_window;
4680 5352
@@ -4683,8 +5355,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
4683 kvm_mmu_unload(vcpu); 5355 kvm_mmu_unload(vcpu);
4684 if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) 5356 if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
4685 __kvm_migrate_timers(vcpu); 5357 __kvm_migrate_timers(vcpu);
4686 if (kvm_check_request(KVM_REQ_KVMCLOCK_UPDATE, vcpu)) 5358 if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) {
4687 kvm_write_guest_time(vcpu); 5359 r = kvm_guest_time_update(vcpu);
5360 if (unlikely(r))
5361 goto out;
5362 }
4688 if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu)) 5363 if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
4689 kvm_mmu_sync_roots(vcpu); 5364 kvm_mmu_sync_roots(vcpu);
4690 if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) 5365 if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
@@ -4703,12 +5378,41 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
4703 vcpu->fpu_active = 0; 5378 vcpu->fpu_active = 0;
4704 kvm_x86_ops->fpu_deactivate(vcpu); 5379 kvm_x86_ops->fpu_deactivate(vcpu);
4705 } 5380 }
5381 if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) {
5382 /* Page is swapped out. Do synthetic halt */
5383 vcpu->arch.apf.halted = true;
5384 r = 1;
5385 goto out;
5386 }
4706 } 5387 }
4707 5388
4708 r = kvm_mmu_reload(vcpu); 5389 r = kvm_mmu_reload(vcpu);
4709 if (unlikely(r)) 5390 if (unlikely(r))
4710 goto out; 5391 goto out;
4711 5392
5393 /*
5394 * An NMI can be injected between local nmi_pending read and
5395 * vcpu->arch.nmi_pending read inside inject_pending_event().
5396 * But in that case, KVM_REQ_EVENT will be set, which makes
5397 * the race described above benign.
5398 */
5399 nmi_pending = ACCESS_ONCE(vcpu->arch.nmi_pending);
5400
5401 if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
5402 inject_pending_event(vcpu);
5403
5404 /* enable NMI/IRQ window open exits if needed */
5405 if (nmi_pending)
5406 kvm_x86_ops->enable_nmi_window(vcpu);
5407 else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
5408 kvm_x86_ops->enable_irq_window(vcpu);
5409
5410 if (kvm_lapic_enabled(vcpu)) {
5411 update_cr8_intercept(vcpu);
5412 kvm_lapic_sync_to_vapic(vcpu);
5413 }
5414 }
5415
4712 preempt_disable(); 5416 preempt_disable();
4713 5417
4714 kvm_x86_ops->prepare_guest_switch(vcpu); 5418 kvm_x86_ops->prepare_guest_switch(vcpu);
@@ -4716,34 +5420,26 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
4716 kvm_load_guest_fpu(vcpu); 5420 kvm_load_guest_fpu(vcpu);
4717 kvm_load_guest_xcr0(vcpu); 5421 kvm_load_guest_xcr0(vcpu);
4718 5422
4719 atomic_set(&vcpu->guest_mode, 1); 5423 vcpu->mode = IN_GUEST_MODE;
4720 smp_wmb(); 5424
 5425 /* We should set ->mode before checking ->requests;
 5426 * see the comment in make_all_cpus_request().
5427 */
5428 smp_mb();
4721 5429
4722 local_irq_disable(); 5430 local_irq_disable();
4723 5431
4724 if (!atomic_read(&vcpu->guest_mode) || vcpu->requests 5432 if (vcpu->mode == EXITING_GUEST_MODE || vcpu->requests
4725 || need_resched() || signal_pending(current)) { 5433 || need_resched() || signal_pending(current)) {
4726 atomic_set(&vcpu->guest_mode, 0); 5434 vcpu->mode = OUTSIDE_GUEST_MODE;
4727 smp_wmb(); 5435 smp_wmb();
4728 local_irq_enable(); 5436 local_irq_enable();
4729 preempt_enable(); 5437 preempt_enable();
5438 kvm_x86_ops->cancel_injection(vcpu);
4730 r = 1; 5439 r = 1;
4731 goto out; 5440 goto out;
4732 } 5441 }
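
The sequence above is a store-then-load handshake: the vCPU thread publishes vcpu->mode = IN_GUEST_MODE, issues a full barrier, and only then inspects vcpu->requests, while the requesting side (per the comment in make_all_cpus_request) sets the request bit first and reads the mode afterwards to decide whether an IPI is needed. With a full barrier on both sides, at least one thread is guaranteed to observe the other's write, so a request can never be missed while the IPI is also skipped. A minimal two-thread sketch of that pattern using C11 atomics and threads (illustrative only, not the kernel's primitives):

#include <stdatomic.h>
#include <stdio.h>
#include <threads.h>

static atomic_int mode;      /* 0 = outside guest, 1 = in guest */
static atomic_int requests;  /* pending request bits            */

static int vcpu_thread(void *arg)
{
	(void)arg;
	atomic_store_explicit(&mode, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);        /* smp_mb() */
	if (atomic_load_explicit(&requests, memory_order_relaxed)) {
		/* bail out before entering the guest, like the code above */
		atomic_store_explicit(&mode, 0, memory_order_relaxed);
		puts("vcpu: saw request, not entering guest");
	} else {
		puts("vcpu: entering guest");
	}
	return 0;
}

static int requester_thread(void *arg)
{
	(void)arg;
	atomic_fetch_or_explicit(&requests, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);        /* smp_mb() */
	if (atomic_load_explicit(&mode, memory_order_relaxed) == 1)
		puts("requester: vcpu already in guest, would send IPI");
	return 0;
}

int main(void)
{
	thrd_t a, b;

	thrd_create(&a, vcpu_thread, NULL);
	thrd_create(&b, requester_thread, NULL);
	thrd_join(a, NULL);
	thrd_join(b, NULL);
	return 0;
}
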
4733 5442
4734 inject_pending_event(vcpu);
4735
4736 /* enable NMI/IRQ window open exits if needed */
4737 if (vcpu->arch.nmi_pending)
4738 kvm_x86_ops->enable_nmi_window(vcpu);
4739 else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
4740 kvm_x86_ops->enable_irq_window(vcpu);
4741
4742 if (kvm_lapic_enabled(vcpu)) {
4743 update_cr8_intercept(vcpu);
4744 kvm_lapic_sync_to_vapic(vcpu);
4745 }
4746
4747 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); 5443 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
4748 5444
4749 kvm_guest_enter(); 5445 kvm_guest_enter();
@@ -4769,7 +5465,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
4769 if (hw_breakpoint_active()) 5465 if (hw_breakpoint_active())
4770 hw_breakpoint_restore(); 5466 hw_breakpoint_restore();
4771 5467
4772 atomic_set(&vcpu->guest_mode, 0); 5468 kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc);
5469
5470 vcpu->mode = OUTSIDE_GUEST_MODE;
4773 smp_wmb(); 5471 smp_wmb();
4774 local_irq_enable(); 5472 local_irq_enable();
4775 5473
@@ -4826,7 +5524,8 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
4826 5524
4827 r = 1; 5525 r = 1;
4828 while (r > 0) { 5526 while (r > 0) {
4829 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) 5527 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
5528 !vcpu->arch.apf.halted)
4830 r = vcpu_enter_guest(vcpu); 5529 r = vcpu_enter_guest(vcpu);
4831 else { 5530 else {
4832 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); 5531 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
@@ -4839,6 +5538,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
4839 vcpu->arch.mp_state = 5538 vcpu->arch.mp_state =
4840 KVM_MP_STATE_RUNNABLE; 5539 KVM_MP_STATE_RUNNABLE;
4841 case KVM_MP_STATE_RUNNABLE: 5540 case KVM_MP_STATE_RUNNABLE:
5541 vcpu->arch.apf.halted = false;
4842 break; 5542 break;
4843 case KVM_MP_STATE_SIPI_RECEIVED: 5543 case KVM_MP_STATE_SIPI_RECEIVED:
4844 default: 5544 default:
@@ -4860,6 +5560,9 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
4860 vcpu->run->exit_reason = KVM_EXIT_INTR; 5560 vcpu->run->exit_reason = KVM_EXIT_INTR;
4861 ++vcpu->stat.request_irq_exits; 5561 ++vcpu->stat.request_irq_exits;
4862 } 5562 }
5563
5564 kvm_check_async_pf_completion(vcpu);
5565
4863 if (signal_pending(current)) { 5566 if (signal_pending(current)) {
4864 r = -EINTR; 5567 r = -EINTR;
4865 vcpu->run->exit_reason = KVM_EXIT_INTR; 5568 vcpu->run->exit_reason = KVM_EXIT_INTR;
@@ -4879,11 +5582,49 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
4879 return r; 5582 return r;
4880} 5583}
4881 5584
5585static int complete_mmio(struct kvm_vcpu *vcpu)
5586{
5587 struct kvm_run *run = vcpu->run;
5588 int r;
5589
5590 if (!(vcpu->arch.pio.count || vcpu->mmio_needed))
5591 return 1;
5592
5593 if (vcpu->mmio_needed) {
5594 vcpu->mmio_needed = 0;
5595 if (!vcpu->mmio_is_write)
5596 memcpy(vcpu->mmio_data + vcpu->mmio_index,
5597 run->mmio.data, 8);
5598 vcpu->mmio_index += 8;
5599 if (vcpu->mmio_index < vcpu->mmio_size) {
5600 run->exit_reason = KVM_EXIT_MMIO;
5601 run->mmio.phys_addr = vcpu->mmio_phys_addr + vcpu->mmio_index;
5602 memcpy(run->mmio.data, vcpu->mmio_data + vcpu->mmio_index, 8);
5603 run->mmio.len = min(vcpu->mmio_size - vcpu->mmio_index, 8);
5604 run->mmio.is_write = vcpu->mmio_is_write;
5605 vcpu->mmio_needed = 1;
5606 return 0;
5607 }
5608 if (vcpu->mmio_is_write)
5609 return 1;
5610 vcpu->mmio_read_completed = 1;
5611 }
5612 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
5613 r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
5614 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
5615 if (r != EMULATE_DONE)
5616 return 0;
5617 return 1;
5618}
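
complete_mmio() resumes a large MMIO access piecewise: each KVM_EXIT_MMIO round trip to userspace carries at most 8 bytes in run->mmio.data, mmio_index tracks progress, and only once mmio_index reaches mmio_size is the emulator re-entered with EMULTYPE_NO_DECODE. A toy model of that chunked resume loop, with made-up names and a fake userspace side, might look like this:

#include <stdio.h>
#include <string.h>

/* Illustrative only: a 24-byte MMIO read satisfied 8 bytes per "exit". */
struct mmio_state {
	unsigned char data[32];
	unsigned int index;
	unsigned int size;
};

/* Pretend userspace answer for the chunk starting at 'offset'. */
static void userspace_fill(unsigned char *chunk, unsigned int offset)
{
	memset(chunk, 0xa0 + offset / 8, 8);
}

int main(void)
{
	struct mmio_state s = { .index = 0, .size = 24 };
	unsigned char chunk[8];

	while (s.index < s.size) {
		userspace_fill(chunk, s.index);          /* KVM_EXIT_MMIO  */
		memcpy(s.data + s.index, chunk, 8);      /* complete_mmio  */
		s.index += 8;
	}
	printf("completed %u bytes in %u exits\n", s.size, s.size / 8);
	return 0;
}
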
5619
4882int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 5620int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
4883{ 5621{
4884 int r; 5622 int r;
4885 sigset_t sigsaved; 5623 sigset_t sigsaved;
4886 5624
5625 if (!tsk_used_math(current) && init_fpu(current))
5626 return -ENOMEM;
5627
4887 if (vcpu->sigset_active) 5628 if (vcpu->sigset_active)
4888 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); 5629 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
4889 5630
@@ -4895,24 +5636,17 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
4895 } 5636 }
4896 5637
4897 /* re-sync apic's tpr */ 5638 /* re-sync apic's tpr */
4898 if (!irqchip_in_kernel(vcpu->kvm)) 5639 if (!irqchip_in_kernel(vcpu->kvm)) {
4899 kvm_set_cr8(vcpu, kvm_run->cr8); 5640 if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) {
4900 5641 r = -EINVAL;
4901 if (vcpu->arch.pio.count || vcpu->mmio_needed ||
4902 vcpu->arch.emulate_ctxt.restart) {
4903 if (vcpu->mmio_needed) {
4904 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
4905 vcpu->mmio_read_completed = 1;
4906 vcpu->mmio_needed = 0;
4907 }
4908 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
4909 r = emulate_instruction(vcpu, 0, 0, EMULTYPE_NO_DECODE);
4910 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
4911 if (r != EMULATE_DONE) {
4912 r = 0;
4913 goto out; 5642 goto out;
4914 } 5643 }
4915 } 5644 }
5645
5646 r = complete_mmio(vcpu);
5647 if (r <= 0)
5648 goto out;
5649
4916 if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) 5650 if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL)
4917 kvm_register_write(vcpu, VCPU_REGS_RAX, 5651 kvm_register_write(vcpu, VCPU_REGS_RAX,
4918 kvm_run->hypercall.ret); 5652 kvm_run->hypercall.ret);
@@ -4929,6 +5663,18 @@ out:
4929 5663
4930int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 5664int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
4931{ 5665{
5666 if (vcpu->arch.emulate_regs_need_sync_to_vcpu) {
5667 /*
5668 * We are here if userspace calls get_regs() in the middle of
5669 * instruction emulation. Registers state needs to be copied
5670 * back from emulation context to vcpu. Usrapace shouldn't do
5671 * that usually, but some bad designed PV devices (vmware
5672 * backdoor interface) need this to work
5673 */
5674 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
5675 memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
5676 vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
5677 }
4932 regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); 5678 regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
4933 regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX); 5679 regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX);
4934 regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX); 5680 regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX);
@@ -4956,6 +5702,9 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
4956 5702
4957int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 5703int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
4958{ 5704{
5705 vcpu->arch.emulate_regs_need_sync_from_vcpu = true;
5706 vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
5707
4959 kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax); 5708 kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax);
4960 kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx); 5709 kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx);
4961 kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx); 5710 kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx);
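
The two flags added in these hunks implement a lazy, directional sync between the vCPU's architectural registers and the emulator's decode cache: set_regs() marks the emulator copy stale, an interrupted emulation marks the vCPU copy stale, and each side copies only when its counterpart is newer. A small standalone sketch of the same idea, with hypothetical structure and function names:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define NR_REGS 16

struct cpu_state {
	unsigned long vcpu_regs[NR_REGS];   /* architectural copy      */
	unsigned long emul_regs[NR_REGS];   /* emulator's working copy */
	bool sync_to_vcpu;                  /* emulator copy is newer  */
	bool sync_from_vcpu;                /* vcpu copy is newer      */
};

static void get_regs(struct cpu_state *s, unsigned long *out)
{
	if (s->sync_to_vcpu) {              /* flush emulator state first */
		memcpy(s->vcpu_regs, s->emul_regs, sizeof(s->vcpu_regs));
		s->sync_to_vcpu = false;
	}
	memcpy(out, s->vcpu_regs, sizeof(s->vcpu_regs));
}

static void set_regs(struct cpu_state *s, const unsigned long *in)
{
	memcpy(s->vcpu_regs, in, sizeof(s->vcpu_regs));
	s->sync_from_vcpu = true;           /* emulator must re-read     */
	s->sync_to_vcpu = false;            /* its stale copy is dropped */
}

int main(void)
{
	struct cpu_state s = { .emul_regs = { [0] = 42 }, .sync_to_vcpu = true };
	unsigned long regs[NR_REGS];

	get_regs(&s, regs);
	printf("rax seen by userspace: %lu\n", regs[0]); /* 42 */
	return 0;
}
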
@@ -4980,6 +5729,8 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
4980 5729
4981 vcpu->arch.exception.pending = false; 5730 vcpu->arch.exception.pending = false;
4982 5731
5732 kvm_make_request(KVM_REQ_EVENT, vcpu);
5733
4983 return 0; 5734 return 0;
4984} 5735}
4985 5736
@@ -5017,7 +5768,7 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
5017 5768
5018 sregs->cr0 = kvm_read_cr0(vcpu); 5769 sregs->cr0 = kvm_read_cr0(vcpu);
5019 sregs->cr2 = vcpu->arch.cr2; 5770 sregs->cr2 = vcpu->arch.cr2;
5020 sregs->cr3 = vcpu->arch.cr3; 5771 sregs->cr3 = kvm_read_cr3(vcpu);
5021 sregs->cr4 = kvm_read_cr4(vcpu); 5772 sregs->cr4 = kvm_read_cr4(vcpu);
5022 sregs->cr8 = kvm_get_cr8(vcpu); 5773 sregs->cr8 = kvm_get_cr8(vcpu);
5023 sregs->efer = vcpu->arch.efer; 5774 sregs->efer = vcpu->arch.efer;
@@ -5043,6 +5794,7 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
5043 struct kvm_mp_state *mp_state) 5794 struct kvm_mp_state *mp_state)
5044{ 5795{
5045 vcpu->arch.mp_state = mp_state->mp_state; 5796 vcpu->arch.mp_state = mp_state->mp_state;
5797 kvm_make_request(KVM_REQ_EVENT, vcpu);
5046 return 0; 5798 return 0;
5047} 5799}
5048 5800
@@ -5050,24 +5802,11 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
5050 bool has_error_code, u32 error_code) 5802 bool has_error_code, u32 error_code)
5051{ 5803{
5052 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; 5804 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
5053 int cs_db, cs_l, ret; 5805 int ret;
5054 cache_all_regs(vcpu);
5055
5056 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
5057 5806
5058 vcpu->arch.emulate_ctxt.vcpu = vcpu; 5807 init_emulate_ctxt(vcpu);
5059 vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
5060 vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu);
5061 vcpu->arch.emulate_ctxt.mode =
5062 (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
5063 (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
5064 ? X86EMUL_MODE_VM86 : cs_l
5065 ? X86EMUL_MODE_PROT64 : cs_db
5066 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
5067 memset(c, 0, sizeof(struct decode_cache));
5068 memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
5069 5808
5070 ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, &emulate_ops, 5809 ret = emulator_task_switch(&vcpu->arch.emulate_ctxt,
5071 tss_selector, reason, has_error_code, 5810 tss_selector, reason, has_error_code,
5072 error_code); 5811 error_code);
5073 5812
@@ -5076,7 +5815,8 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
5076 5815
5077 memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); 5816 memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
5078 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); 5817 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
5079 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); 5818 kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
5819 kvm_make_request(KVM_REQ_EVENT, vcpu);
5080 return EMULATE_DONE; 5820 return EMULATE_DONE;
5081} 5821}
5082EXPORT_SYMBOL_GPL(kvm_task_switch); 5822EXPORT_SYMBOL_GPL(kvm_task_switch);
@@ -5085,7 +5825,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
5085 struct kvm_sregs *sregs) 5825 struct kvm_sregs *sregs)
5086{ 5826{
5087 int mmu_reset_needed = 0; 5827 int mmu_reset_needed = 0;
5088 int pending_vec, max_bits; 5828 int pending_vec, max_bits, idx;
5089 struct desc_ptr dt; 5829 struct desc_ptr dt;
5090 5830
5091 dt.size = sregs->idt.limit; 5831 dt.size = sregs->idt.limit;
@@ -5096,8 +5836,9 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
5096 kvm_x86_ops->set_gdt(vcpu, &dt); 5836 kvm_x86_ops->set_gdt(vcpu, &dt);
5097 5837
5098 vcpu->arch.cr2 = sregs->cr2; 5838 vcpu->arch.cr2 = sregs->cr2;
5099 mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3; 5839 mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
5100 vcpu->arch.cr3 = sregs->cr3; 5840 vcpu->arch.cr3 = sregs->cr3;
5841 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
5101 5842
5102 kvm_set_cr8(vcpu, sregs->cr8); 5843 kvm_set_cr8(vcpu, sregs->cr8);
5103 5844
@@ -5111,10 +5852,15 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
5111 5852
5112 mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; 5853 mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
5113 kvm_x86_ops->set_cr4(vcpu, sregs->cr4); 5854 kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
5855 if (sregs->cr4 & X86_CR4_OSXSAVE)
5856 update_cpuid(vcpu);
5857
5858 idx = srcu_read_lock(&vcpu->kvm->srcu);
5114 if (!is_long_mode(vcpu) && is_pae(vcpu)) { 5859 if (!is_long_mode(vcpu) && is_pae(vcpu)) {
5115 load_pdptrs(vcpu, vcpu->arch.cr3); 5860 load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
5116 mmu_reset_needed = 1; 5861 mmu_reset_needed = 1;
5117 } 5862 }
5863 srcu_read_unlock(&vcpu->kvm->srcu, idx);
5118 5864
5119 if (mmu_reset_needed) 5865 if (mmu_reset_needed)
5120 kvm_mmu_reset_context(vcpu); 5866 kvm_mmu_reset_context(vcpu);
@@ -5125,8 +5871,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
5125 if (pending_vec < max_bits) { 5871 if (pending_vec < max_bits) {
5126 kvm_queue_interrupt(vcpu, pending_vec, false); 5872 kvm_queue_interrupt(vcpu, pending_vec, false);
5127 pr_debug("Set back pending irq %d\n", pending_vec); 5873 pr_debug("Set back pending irq %d\n", pending_vec);
5128 if (irqchip_in_kernel(vcpu->kvm))
5129 kvm_pic_clear_isr_ack(vcpu->kvm);
5130 } 5874 }
5131 5875
5132 kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); 5876 kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
@@ -5147,6 +5891,8 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
5147 !is_protmode(vcpu)) 5891 !is_protmode(vcpu))
5148 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 5892 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
5149 5893
5894 kvm_make_request(KVM_REQ_EVENT, vcpu);
5895
5150 return 0; 5896 return 0;
5151} 5897}
5152 5898
@@ -5320,10 +6066,7 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
5320 6066
5321void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) 6067void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
5322{ 6068{
5323 if (vcpu->arch.time_page) { 6069 kvmclock_reset(vcpu);
5324 kvm_release_page_dirty(vcpu->arch.time_page);
5325 vcpu->arch.time_page = NULL;
5326 }
5327 6070
5328 free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); 6071 free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
5329 fx_free(vcpu); 6072 fx_free(vcpu);
@@ -5333,6 +6076,10 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
5333struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, 6076struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
5334 unsigned int id) 6077 unsigned int id)
5335{ 6078{
6079 if (check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0)
6080 printk_once(KERN_WARNING
6081 "kvm: SMP vm created on host with unstable TSC; "
6082 "guest TSC will not be reliable\n");
5336 return kvm_x86_ops->vcpu_create(kvm, id); 6083 return kvm_x86_ops->vcpu_create(kvm, id);
5337} 6084}
5338 6085
@@ -5357,6 +6104,8 @@ free_vcpu:
5357 6104
5358void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) 6105void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
5359{ 6106{
6107 vcpu->arch.apf.msr_val = 0;
6108
5360 vcpu_load(vcpu); 6109 vcpu_load(vcpu);
5361 kvm_mmu_unload(vcpu); 6110 kvm_mmu_unload(vcpu);
5362 vcpu_put(vcpu); 6111 vcpu_put(vcpu);
@@ -5375,22 +6124,29 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
5375 vcpu->arch.dr6 = DR6_FIXED_1; 6124 vcpu->arch.dr6 = DR6_FIXED_1;
5376 vcpu->arch.dr7 = DR7_FIXED_1; 6125 vcpu->arch.dr7 = DR7_FIXED_1;
5377 6126
6127 kvm_make_request(KVM_REQ_EVENT, vcpu);
6128 vcpu->arch.apf.msr_val = 0;
6129
6130 kvmclock_reset(vcpu);
6131
6132 kvm_clear_async_pf_completion_queue(vcpu);
6133 kvm_async_pf_hash_reset(vcpu);
6134 vcpu->arch.apf.halted = false;
6135
5378 return kvm_x86_ops->vcpu_reset(vcpu); 6136 return kvm_x86_ops->vcpu_reset(vcpu);
5379} 6137}
5380 6138
5381int kvm_arch_hardware_enable(void *garbage) 6139int kvm_arch_hardware_enable(void *garbage)
5382{ 6140{
5383 /* 6141 struct kvm *kvm;
5384 * Since this may be called from a hotplug notifcation, 6142 struct kvm_vcpu *vcpu;
5385 * we can't get the CPU frequency directly. 6143 int i;
5386 */
5387 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
5388 int cpu = raw_smp_processor_id();
5389 per_cpu(cpu_tsc_khz, cpu) = 0;
5390 }
5391 6144
5392 kvm_shared_msr_cpu_online(); 6145 kvm_shared_msr_cpu_online();
5393 6146 list_for_each_entry(kvm, &vm_list, vm_list)
6147 kvm_for_each_vcpu(i, vcpu, kvm)
6148 if (vcpu->cpu == smp_processor_id())
6149 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
5394 return kvm_x86_ops->hardware_enable(garbage); 6150 return kvm_x86_ops->hardware_enable(garbage);
5395} 6151}
5396 6152
@@ -5424,7 +6180,11 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
5424 BUG_ON(vcpu->kvm == NULL); 6180 BUG_ON(vcpu->kvm == NULL);
5425 kvm = vcpu->kvm; 6181 kvm = vcpu->kvm;
5426 6182
6183 vcpu->arch.emulate_ctxt.ops = &emulate_ops;
6184 vcpu->arch.walk_mmu = &vcpu->arch.mmu;
5427 vcpu->arch.mmu.root_hpa = INVALID_PAGE; 6185 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
6186 vcpu->arch.mmu.translate_gpa = translate_gpa;
6187 vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
5428 if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu)) 6188 if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu))
5429 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 6189 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
5430 else 6190 else
@@ -5437,6 +6197,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
5437 } 6197 }
5438 vcpu->arch.pio_data = page_address(page); 6198 vcpu->arch.pio_data = page_address(page);
5439 6199
6200 kvm_init_tsc_catchup(vcpu, max_tsc_khz);
6201
5440 r = kvm_mmu_create(vcpu); 6202 r = kvm_mmu_create(vcpu);
5441 if (r < 0) 6203 if (r < 0)
5442 goto fail_free_pio_data; 6204 goto fail_free_pio_data;
@@ -5458,6 +6220,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
5458 if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) 6220 if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL))
5459 goto fail_free_mce_banks; 6221 goto fail_free_mce_banks;
5460 6222
6223 kvm_async_pf_hash_reset(vcpu);
6224
5461 return 0; 6225 return 0;
5462fail_free_mce_banks: 6226fail_free_mce_banks:
5463 kfree(vcpu->arch.mce_banks); 6227 kfree(vcpu->arch.mce_banks);
@@ -5483,22 +6247,17 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
5483 free_page((unsigned long)vcpu->arch.pio_data); 6247 free_page((unsigned long)vcpu->arch.pio_data);
5484} 6248}
5485 6249
5486struct kvm *kvm_arch_create_vm(void) 6250int kvm_arch_init_vm(struct kvm *kvm)
5487{ 6251{
5488 struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
5489
5490 if (!kvm)
5491 return ERR_PTR(-ENOMEM);
5492
5493 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); 6252 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
5494 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); 6253 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
5495 6254
5496 /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ 6255 /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
5497 set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); 6256 set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
5498 6257
5499 rdtscll(kvm->arch.vm_init_tsc); 6258 raw_spin_lock_init(&kvm->arch.tsc_write_lock);
5500 6259
5501 return kvm; 6260 return 0;
5502} 6261}
5503 6262
5504static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) 6263static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
@@ -5516,8 +6275,10 @@ static void kvm_free_vcpus(struct kvm *kvm)
5516 /* 6275 /*
5517 * Unpin any mmu pages first. 6276 * Unpin any mmu pages first.
5518 */ 6277 */
5519 kvm_for_each_vcpu(i, vcpu, kvm) 6278 kvm_for_each_vcpu(i, vcpu, kvm) {
6279 kvm_clear_async_pf_completion_queue(vcpu);
5520 kvm_unload_vcpu_mmu(vcpu); 6280 kvm_unload_vcpu_mmu(vcpu);
6281 }
5521 kvm_for_each_vcpu(i, vcpu, kvm) 6282 kvm_for_each_vcpu(i, vcpu, kvm)
5522 kvm_arch_vcpu_free(vcpu); 6283 kvm_arch_vcpu_free(vcpu);
5523 6284
@@ -5541,13 +6302,10 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
5541 kfree(kvm->arch.vpic); 6302 kfree(kvm->arch.vpic);
5542 kfree(kvm->arch.vioapic); 6303 kfree(kvm->arch.vioapic);
5543 kvm_free_vcpus(kvm); 6304 kvm_free_vcpus(kvm);
5544 kvm_free_physmem(kvm);
5545 if (kvm->arch.apic_access_page) 6305 if (kvm->arch.apic_access_page)
5546 put_page(kvm->arch.apic_access_page); 6306 put_page(kvm->arch.apic_access_page);
5547 if (kvm->arch.ept_identity_pagetable) 6307 if (kvm->arch.ept_identity_pagetable)
5548 put_page(kvm->arch.ept_identity_pagetable); 6308 put_page(kvm->arch.ept_identity_pagetable);
5549 cleanup_srcu_struct(&kvm->srcu);
5550 kfree(kvm);
5551} 6309}
5552 6310
5553int kvm_arch_prepare_memory_region(struct kvm *kvm, 6311int kvm_arch_prepare_memory_region(struct kvm *kvm,
@@ -5595,7 +6353,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
5595 int user_alloc) 6353 int user_alloc)
5596{ 6354{
5597 6355
5598 int npages = mem->memory_size >> PAGE_SHIFT; 6356 int nr_mmu_pages = 0, npages = mem->memory_size >> PAGE_SHIFT;
5599 6357
5600 if (!user_alloc && !old.user_alloc && old.rmap && !npages) { 6358 if (!user_alloc && !old.user_alloc && old.rmap && !npages) {
5601 int ret; 6359 int ret;
@@ -5610,12 +6368,12 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
5610 "failed to munmap memory\n"); 6368 "failed to munmap memory\n");
5611 } 6369 }
5612 6370
6371 if (!kvm->arch.n_requested_mmu_pages)
6372 nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
6373
5613 spin_lock(&kvm->mmu_lock); 6374 spin_lock(&kvm->mmu_lock);
5614 if (!kvm->arch.n_requested_mmu_pages) { 6375 if (nr_mmu_pages)
5615 unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
5616 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); 6376 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
5617 }
5618
5619 kvm_mmu_slot_remove_write_access(kvm, mem->slot); 6377 kvm_mmu_slot_remove_write_access(kvm, mem->slot);
5620 spin_unlock(&kvm->mmu_lock); 6378 spin_unlock(&kvm->mmu_lock);
5621} 6379}
@@ -5628,7 +6386,9 @@ void kvm_arch_flush_shadow(struct kvm *kvm)
5628 6386
5629int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) 6387int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
5630{ 6388{
5631 return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE 6389 return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
6390 !vcpu->arch.apf.halted)
6391 || !list_empty_careful(&vcpu->async_pf.done)
5632 || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED 6392 || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
5633 || vcpu->arch.nmi_pending || 6393 || vcpu->arch.nmi_pending ||
5634 (kvm_arch_interrupt_allowed(vcpu) && 6394 (kvm_arch_interrupt_allowed(vcpu) &&
@@ -5647,7 +6407,7 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
5647 6407
5648 me = get_cpu(); 6408 me = get_cpu();
5649 if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) 6409 if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
5650 if (atomic_xchg(&vcpu->guest_mode, 0)) 6410 if (kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE)
5651 smp_send_reschedule(cpu); 6411 smp_send_reschedule(cpu);
5652 put_cpu(); 6412 put_cpu();
5653} 6413}
@@ -5683,9 +6443,151 @@ void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
5683 kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip)) 6443 kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
5684 rflags |= X86_EFLAGS_TF; 6444 rflags |= X86_EFLAGS_TF;
5685 kvm_x86_ops->set_rflags(vcpu, rflags); 6445 kvm_x86_ops->set_rflags(vcpu, rflags);
6446 kvm_make_request(KVM_REQ_EVENT, vcpu);
5686} 6447}
5687EXPORT_SYMBOL_GPL(kvm_set_rflags); 6448EXPORT_SYMBOL_GPL(kvm_set_rflags);
5688 6449
6450void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
6451{
6452 int r;
6453
6454 if ((vcpu->arch.mmu.direct_map != work->arch.direct_map) ||
6455 is_error_page(work->page))
6456 return;
6457
6458 r = kvm_mmu_reload(vcpu);
6459 if (unlikely(r))
6460 return;
6461
6462 if (!vcpu->arch.mmu.direct_map &&
6463 work->arch.cr3 != vcpu->arch.mmu.get_cr3(vcpu))
6464 return;
6465
6466 vcpu->arch.mmu.page_fault(vcpu, work->gva, 0, true);
6467}
6468
6469static inline u32 kvm_async_pf_hash_fn(gfn_t gfn)
6470{
6471 return hash_32(gfn & 0xffffffff, order_base_2(ASYNC_PF_PER_VCPU));
6472}
6473
6474static inline u32 kvm_async_pf_next_probe(u32 key)
6475{
6476 return (key + 1) & (roundup_pow_of_two(ASYNC_PF_PER_VCPU) - 1);
6477}
6478
6479static void kvm_add_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
6480{
6481 u32 key = kvm_async_pf_hash_fn(gfn);
6482
6483 while (vcpu->arch.apf.gfns[key] != ~0)
6484 key = kvm_async_pf_next_probe(key);
6485
6486 vcpu->arch.apf.gfns[key] = gfn;
6487}
6488
6489static u32 kvm_async_pf_gfn_slot(struct kvm_vcpu *vcpu, gfn_t gfn)
6490{
6491 int i;
6492 u32 key = kvm_async_pf_hash_fn(gfn);
6493
6494 for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU) &&
6495 (vcpu->arch.apf.gfns[key] != gfn &&
6496 vcpu->arch.apf.gfns[key] != ~0); i++)
6497 key = kvm_async_pf_next_probe(key);
6498
6499 return key;
6500}
6501
6502bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
6503{
6504 return vcpu->arch.apf.gfns[kvm_async_pf_gfn_slot(vcpu, gfn)] == gfn;
6505}
6506
6507static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
6508{
6509 u32 i, j, k;
6510
6511 i = j = kvm_async_pf_gfn_slot(vcpu, gfn);
6512 while (true) {
6513 vcpu->arch.apf.gfns[i] = ~0;
6514 do {
6515 j = kvm_async_pf_next_probe(j);
6516 if (vcpu->arch.apf.gfns[j] == ~0)
6517 return;
6518 k = kvm_async_pf_hash_fn(vcpu->arch.apf.gfns[j]);
6519 /*
6520 * k lies cyclically in ]i,j]
6521 * | i.k.j |
6522 * |....j i.k.| or |.k..j i...|
6523 */
6524 } while ((i <= j) ? (i < k && k <= j) : (i < k || k <= j));
6525 vcpu->arch.apf.gfns[i] = vcpu->arch.apf.gfns[j];
6526 i = j;
6527 }
6528}
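
The gfn functions above implement a small open-addressed hash table: roundup_pow_of_two(ASYNC_PF_PER_VCPU) slots, ~0 as the empty marker, hash_32() picking the home slot, linear probing on insert and lookup, and backward-shift deletion (the "k lies cyclically in ]i,j]" test) so no tombstones are ever left behind. A self-contained sketch of the same table follows; TABLE_SIZE, hash_fn and the function names are illustrative stand-ins for the kernel's helpers.

#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

#define TABLE_SIZE 64U                  /* power of two */
#define EMPTY      (~(uint64_t)0)

static uint64_t slots[TABLE_SIZE];

static uint32_t hash_fn(uint64_t gfn)
{
	return (uint32_t)(gfn * 2654435761u) & (TABLE_SIZE - 1);
}

static uint32_t next_probe(uint32_t key)
{
	return (key + 1) & (TABLE_SIZE - 1);
}

static void add_gfn(uint64_t gfn)
{
	uint32_t key = hash_fn(gfn);

	while (slots[key] != EMPTY)
		key = next_probe(key);
	slots[key] = gfn;
}

static uint32_t gfn_slot(uint64_t gfn)
{
	uint32_t key = hash_fn(gfn);
	uint32_t i;

	for (i = 0; i < TABLE_SIZE &&
	     slots[key] != gfn && slots[key] != EMPTY; i++)
		key = next_probe(key);
	return key;
}

static bool find_gfn(uint64_t gfn)
{
	return slots[gfn_slot(gfn)] == gfn;
}

/* Backward-shift deletion: keep pulling back any later entry whose home
 * slot lies cyclically outside (i, j], exactly as the comment above shows. */
static void del_gfn(uint64_t gfn)
{
	uint32_t i, j, k;

	i = j = gfn_slot(gfn);
	while (true) {
		slots[i] = EMPTY;
		do {
			j = next_probe(j);
			if (slots[j] == EMPTY)
				return;
			k = hash_fn(slots[j]);
		} while ((i <= j) ? (i < k && k <= j) : (i < k || k <= j));
		slots[i] = slots[j];
		i = j;
	}
}

int main(void)
{
	uint32_t i;

	for (i = 0; i < TABLE_SIZE; i++)
		slots[i] = EMPTY;

	add_gfn(0x1000);
	add_gfn(0x1000 + TABLE_SIZE);       /* forced collision */
	del_gfn(0x1000);
	printf("colliding gfn still found: %d\n",
	       find_gfn(0x1000 + TABLE_SIZE));  /* 1 */
	return 0;
}
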
6529
6530static int apf_put_user(struct kvm_vcpu *vcpu, u32 val)
6531{
6532
6533 return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val,
6534 sizeof(val));
6535}
6536
6537void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
6538 struct kvm_async_pf *work)
6539{
6540 struct x86_exception fault;
6541
6542 trace_kvm_async_pf_not_present(work->arch.token, work->gva);
6543 kvm_add_async_pf_gfn(vcpu, work->arch.gfn);
6544
6545 if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) ||
6546 (vcpu->arch.apf.send_user_only &&
6547 kvm_x86_ops->get_cpl(vcpu) == 0))
6548 kvm_make_request(KVM_REQ_APF_HALT, vcpu);
6549 else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_NOT_PRESENT)) {
6550 fault.vector = PF_VECTOR;
6551 fault.error_code_valid = true;
6552 fault.error_code = 0;
6553 fault.nested_page_fault = false;
6554 fault.address = work->arch.token;
6555 kvm_inject_page_fault(vcpu, &fault);
6556 }
6557}
6558
6559void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
6560 struct kvm_async_pf *work)
6561{
6562 struct x86_exception fault;
6563
6564 trace_kvm_async_pf_ready(work->arch.token, work->gva);
6565 if (is_error_page(work->page))
6566 work->arch.token = ~0; /* broadcast wakeup */
6567 else
6568 kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
6569
6570 if ((vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) &&
6571 !apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) {
6572 fault.vector = PF_VECTOR;
6573 fault.error_code_valid = true;
6574 fault.error_code = 0;
6575 fault.nested_page_fault = false;
6576 fault.address = work->arch.token;
6577 kvm_inject_page_fault(vcpu, &fault);
6578 }
6579 vcpu->arch.apf.halted = false;
6580}
6581
6582bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu)
6583{
6584 if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED))
6585 return true;
6586 else
6587 return !kvm_event_needs_reinjection(vcpu) &&
6588 kvm_x86_ops->interrupt_allowed(vcpu);
6589}
6590
5689EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); 6591EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
5690EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); 6592EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
5691EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault); 6593EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index b7a404722d2b..e407ed3df817 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -50,6 +50,11 @@ static inline int is_long_mode(struct kvm_vcpu *vcpu)
50#endif 50#endif
51} 51}
52 52
53static inline bool mmu_is_nested(struct kvm_vcpu *vcpu)
54{
55 return vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu;
56}
57
53static inline int is_pae(struct kvm_vcpu *vcpu) 58static inline int is_pae(struct kvm_vcpu *vcpu)
54{ 59{
55 return kvm_read_cr4_bits(vcpu, X86_CR4_PAE); 60 return kvm_read_cr4_bits(vcpu, X86_CR4_PAE);
@@ -65,7 +70,15 @@ static inline int is_paging(struct kvm_vcpu *vcpu)
65 return kvm_read_cr0_bits(vcpu, X86_CR0_PG); 70 return kvm_read_cr0_bits(vcpu, X86_CR0_PG);
66} 71}
67 72
73static inline u32 bit(int bitno)
74{
75 return 1 << (bitno & 31);
76}
77
68void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); 78void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
69void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); 79void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
80int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
81
82void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data);
70 83
71#endif 84#endif