author    Glenn Elliott <gelliott@cs.unc.edu>    2012-03-04 19:47:13 -0500
committer Glenn Elliott <gelliott@cs.unc.edu>    2012-03-04 19:47:13 -0500
commit    c71c03bda1e86c9d5198c5d83f712e695c4f2a1e (patch)
tree      ecb166cb3e2b7e2adb3b5e292245fefd23381ac8 /arch/x86/kvm
parent    ea53c912f8a86a8567697115b6a0d8152beee5c8 (diff)
parent    6a00f206debf8a5c8899055726ad127dbeeed098 (diff)

Merge branch 'mpi-master' into wip-k-fmlp

Conflicts:
	litmus/sched_cedf.c

Diffstat (limited to 'arch/x86/kvm')
-rw-r--r--  arch/x86/kvm/Kconfig           |    8
-rw-r--r--  arch/x86/kvm/Makefile          |    3
-rw-r--r--  arch/x86/kvm/emulate.c         | 3945
-rw-r--r--  arch/x86/kvm/i8254.c           |   11
-rw-r--r--  arch/x86/kvm/i8254.h           |    2
-rw-r--r--  arch/x86/kvm/i8259.c           |   48
-rw-r--r--  arch/x86/kvm/irq.c             |    9
-rw-r--r--  arch/x86/kvm/irq.h             |    4
-rw-r--r--  arch/x86/kvm/kvm_cache_regs.h  |   31
-rw-r--r--  arch/x86/kvm/lapic.c           |   30
-rw-r--r--  arch/x86/kvm/lapic.h           |    1
-rw-r--r--  arch/x86/kvm/mmu.c             | 1371
-rw-r--r--  arch/x86/kvm/mmu.h             |    9
-rw-r--r--  arch/x86/kvm/mmu_audit.c       |  304
-rw-r--r--  arch/x86/kvm/mmutrace.h        |   19
-rw-r--r--  arch/x86/kvm/paging_tmpl.h     |  410
-rw-r--r--  arch/x86/kvm/svm.c             | 1644
-rw-r--r--  arch/x86/kvm/timer.c           |    4
-rw-r--r--  arch/x86/kvm/trace.h           |   25
-rw-r--r--  arch/x86/kvm/vmx.c             |  724
-rw-r--r--  arch/x86/kvm/x86.c             | 1860
-rw-r--r--  arch/x86/kvm/x86.h             |   13

22 files changed, 7051 insertions, 3424 deletions
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 970bbd479516..50f63648ce1b 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -28,6 +28,7 @@ config KVM
 	select HAVE_KVM_IRQCHIP
 	select HAVE_KVM_EVENTFD
 	select KVM_APIC_ARCHITECTURE
+	select KVM_ASYNC_PF
 	select USER_RETURN_NOTIFIER
 	select KVM_MMIO
 	---help---
@@ -64,6 +65,13 @@ config KVM_AMD
 	  To compile this as a module, choose M here: the module
 	  will be called kvm-amd.
 
+config KVM_MMU_AUDIT
+	bool "Audit KVM MMU"
+	depends on KVM && TRACEPOINTS
+	---help---
+	 This option adds a R/W kVM module parameter 'mmu_audit', which allows
+	 audit KVM MMU at runtime.
+
 # OK, it's a little counter-intuitive to do this, but it puts it neatly under
 # the virtualization menu.
 source drivers/vhost/Kconfig
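
A note on the KVM_MMU_AUDIT hunk above: its help text describes 'mmu_audit' as a read/write (R/W) module parameter, i.e. a knob that can be flipped while kvm is loaded. The real hook lives in the new mmu_audit.c, which is not shown in this excerpt; as a rough, illustrative sketch only (the names mmu_audit_set, mmu_audit_ops and the pr_info messages are placeholders, not code from this patch), a runtime-writable boolean parameter with a change callback is typically wired up in kernel C like this:

	#include <linux/kernel.h>
	#include <linux/module.h>
	#include <linux/moduleparam.h>

	static bool mmu_audit;	/* current state of the audit knob */

	/* Runs whenever /sys/module/kvm/parameters/mmu_audit is written. */
	static int mmu_audit_set(const char *val, const struct kernel_param *kp)
	{
		bool was_on = mmu_audit;
		int ret = param_set_bool(val, kp);	/* parses "0/1/y/n" into mmu_audit */

		if (ret)
			return ret;
		if (mmu_audit && !was_on)
			pr_info("mmu audit enabled\n");		/* e.g. register audit hooks */
		else if (!mmu_audit && was_on)
			pr_info("mmu audit disabled\n");	/* e.g. unregister them */
		return 0;
	}

	static const struct kernel_param_ops mmu_audit_ops = {
		.set = mmu_audit_set,
		.get = param_get_bool,
	};

	/* 0644: root may toggle auditing at runtime via sysfs. */
	module_param_cb(mmu_audit, &mmu_audit_ops, &mmu_audit, 0644);

With something along these lines in place, writing 1 to /sys/module/kvm/parameters/mmu_audit turns auditing on without reloading the module, which is what the help text means by "at runtime".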
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 31a7035c4bd9..f15501f431c8 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -1,5 +1,5 @@
 
-EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm
+ccflags-y += -Ivirt/kvm -Iarch/x86/kvm
 
 CFLAGS_x86.o := -I.
 CFLAGS_svm.o := -I.
@@ -9,6 +9,7 @@ kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
 				coalesced_mmio.o irq_comm.o eventfd.o \
 				assigned-dev.o)
 kvm-$(CONFIG_IOMMU_API)	+= $(addprefix ../../../virt/kvm/, iommu.o)
+kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(addprefix ../../../virt/kvm/, async_pf.o)
 
 kvm-y			+= x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
 			   i8254.o timer.o
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 66ca98aafdd6..adc98675cda0 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -9,7 +9,7 @@
  * privileged instructions:
  *
  * Copyright (C) 2006 Qumranet
- * Copyright 2010 Red Hat, Inc. and/or its affilates.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
  *
  * Avi Kivity <avi@qumranet.com>
  * Yaniv Kamay <yaniv@qumranet.com>
@@ -20,16 +20,8 @@
  * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
  */
 
-#ifndef __KERNEL__
-#include <stdio.h>
-#include <stdint.h>
-#include <public/xen.h>
-#define DPRINTF(_f, _a ...) printf(_f , ## _a)
-#else
 #include <linux/kvm_host.h>
 #include "kvm_cache_regs.h"
-#define DPRINTF(x...) do {} while (0)
-#endif
 #include <linux/module.h>
 #include <asm/kvm_emulate.h>
 
@@ -51,39 +43,50 @@
 #define ImplicitOps (1<<1)	/* Implicit in opcode. No generic decode. */
 #define DstReg      (2<<1)	/* Register operand. */
 #define DstMem      (3<<1)	/* Memory operand. */
 #define DstAcc      (4<<1)	/* Destination Accumulator */
 #define DstDI       (5<<1)	/* Destination is in ES:(E)DI */
 #define DstMem64    (6<<1)	/* 64bit memory operand */
-#define DstMask     (7<<1)
+#define DstImmUByte (7<<1)	/* 8-bit unsigned immediate operand */
+#define DstDX       (8<<1)	/* Destination is in DX register */
+#define DstMask     (0xf<<1)
 /* Source operand type. */
-#define SrcNone     (0<<4)	/* No source operand. */
-#define SrcImplicit (0<<4)	/* Source operand is implicit in the opcode. */
-#define SrcReg      (1<<4)	/* Register operand. */
-#define SrcMem      (2<<4)	/* Memory operand. */
-#define SrcMem16    (3<<4)	/* Memory operand (16-bit). */
-#define SrcMem32    (4<<4)	/* Memory operand (32-bit). */
-#define SrcImm      (5<<4)	/* Immediate operand. */
-#define SrcImmByte  (6<<4)	/* 8-bit sign-extended immediate operand. */
-#define SrcOne      (7<<4)	/* Implied '1' */
-#define SrcImmUByte (8<<4)	/* 8-bit unsigned immediate operand. */
-#define SrcImmU     (9<<4)	/* Immediate operand, unsigned */
-#define SrcSI       (0xa<<4)	/* Source is in the DS:RSI */
-#define SrcImmFAddr (0xb<<4)	/* Source is immediate far address */
-#define SrcMemFAddr (0xc<<4)	/* Source is far address in memory */
-#define SrcAcc      (0xd<<4)	/* Source Accumulator */
-#define SrcMask     (0xf<<4)
+#define SrcNone     (0<<5)	/* No source operand. */
+#define SrcReg      (1<<5)	/* Register operand. */
+#define SrcMem      (2<<5)	/* Memory operand. */
+#define SrcMem16    (3<<5)	/* Memory operand (16-bit). */
+#define SrcMem32    (4<<5)	/* Memory operand (32-bit). */
+#define SrcImm      (5<<5)	/* Immediate operand. */
+#define SrcImmByte  (6<<5)	/* 8-bit sign-extended immediate operand. */
+#define SrcOne      (7<<5)	/* Implied '1' */
+#define SrcImmUByte (8<<5)	/* 8-bit unsigned immediate operand. */
+#define SrcImmU     (9<<5)	/* Immediate operand, unsigned */
+#define SrcSI       (0xa<<5)	/* Source is in the DS:RSI */
+#define SrcImmFAddr (0xb<<5)	/* Source is immediate far address */
+#define SrcMemFAddr (0xc<<5)	/* Source is far address in memory */
+#define SrcAcc      (0xd<<5)	/* Source Accumulator */
+#define SrcImmU16   (0xe<<5)	/* Immediate operand, unsigned, 16 bits */
+#define SrcDX       (0xf<<5)	/* Source is in DX register */
+#define SrcMask     (0xf<<5)
 /* Generic ModRM decode. */
-#define ModRM       (1<<8)
+#define ModRM       (1<<9)
 /* Destination is only written; never read. */
-#define Mov         (1<<9)
-#define BitOp       (1<<10)
-#define MemAbs      (1<<11)	/* Memory operand is absolute displacement */
-#define String      (1<<12)	/* String instruction (rep capable) */
-#define Stack       (1<<13)	/* Stack instruction (push/pop) */
-#define Group       (1<<14)	/* Bits 3:5 of modrm byte extend opcode */
-#define GroupDual   (1<<15)	/* Alternate decoding of mod == 3 */
-#define GroupMask   0xff	/* Group number stored in bits 0:7 */
+#define Mov         (1<<10)
+#define BitOp       (1<<11)
+#define MemAbs      (1<<12)	/* Memory operand is absolute displacement */
+#define String      (1<<13)	/* String instruction (rep capable) */
+#define Stack       (1<<14)	/* Stack instruction (push/pop) */
+#define GroupMask   (7<<15)	/* Opcode uses one of the group mechanisms */
+#define Group       (1<<15)	/* Bits 3:5 of modrm byte extend opcode */
+#define GroupDual   (2<<15)	/* Alternate decoding of mod == 3 */
+#define Prefix      (3<<15)	/* Instruction varies with 66/f2/f3 prefix */
+#define RMExt       (4<<15)	/* Opcode extension in ModRM r/m if mod == 3 */
+#define Sse         (1<<18)	/* SSE Vector instruction */
 /* Misc flags */
+#define Prot        (1<<21) /* instruction generates #UD if not in prot-mode */
+#define VendorSpecific (1<<22) /* Vendor specific instruction */
+#define NoAccess    (1<<23) /* Don't access memory (lea/invlpg/verr etc) */
+#define Op3264      (1<<24) /* Operand is 64b in long mode, 32b otherwise */
+#define Undefined   (1<<25) /* No Such Instruction */
 #define Lock        (1<<26) /* lock prefix is allowed for the instruction */
 #define Priv        (1<<27) /* instruction generates #GP if current CPL != 0 */
 #define No64        (1<<28)
@@ -92,285 +95,40 @@ | |||
92 | #define Src2CL (1<<29) | 95 | #define Src2CL (1<<29) |
93 | #define Src2ImmByte (2<<29) | 96 | #define Src2ImmByte (2<<29) |
94 | #define Src2One (3<<29) | 97 | #define Src2One (3<<29) |
98 | #define Src2Imm (4<<29) | ||
95 | #define Src2Mask (7<<29) | 99 | #define Src2Mask (7<<29) |
96 | 100 | ||
97 | enum { | 101 | #define X2(x...) x, x |
98 | Group1_80, Group1_81, Group1_82, Group1_83, | 102 | #define X3(x...) X2(x), x |
99 | Group1A, Group3_Byte, Group3, Group4, Group5, Group7, | 103 | #define X4(x...) X2(x), X2(x) |
100 | Group8, Group9, | 104 | #define X5(x...) X4(x), x |
101 | }; | 105 | #define X6(x...) X4(x), X2(x) |
102 | 106 | #define X7(x...) X4(x), X3(x) | |
103 | static u32 opcode_table[256] = { | 107 | #define X8(x...) X4(x), X4(x) |
104 | /* 0x00 - 0x07 */ | 108 | #define X16(x...) X8(x), X8(x) |
105 | ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, | 109 | |
106 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | 110 | struct opcode { |
107 | ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, | 111 | u32 flags; |
108 | ImplicitOps | Stack | No64, ImplicitOps | Stack | No64, | 112 | u8 intercept; |
109 | /* 0x08 - 0x0F */ | 113 | union { |
110 | ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, | 114 | int (*execute)(struct x86_emulate_ctxt *ctxt); |
111 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | 115 | struct opcode *group; |
112 | ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, | 116 | struct group_dual *gdual; |
113 | ImplicitOps | Stack | No64, 0, | 117 | struct gprefix *gprefix; |
114 | /* 0x10 - 0x17 */ | 118 | } u; |
115 | ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, | 119 | int (*check_perm)(struct x86_emulate_ctxt *ctxt); |
116 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
117 | ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, | ||
118 | ImplicitOps | Stack | No64, ImplicitOps | Stack | No64, | ||
119 | /* 0x18 - 0x1F */ | ||
120 | ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, | ||
121 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
122 | ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, | ||
123 | ImplicitOps | Stack | No64, ImplicitOps | Stack | No64, | ||
124 | /* 0x20 - 0x27 */ | ||
125 | ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, | ||
126 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
127 | ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0, | ||
128 | /* 0x28 - 0x2F */ | ||
129 | ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, | ||
130 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
131 | ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0, | ||
132 | /* 0x30 - 0x37 */ | ||
133 | ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, | ||
134 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
135 | ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0, | ||
136 | /* 0x38 - 0x3F */ | ||
137 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
138 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
139 | ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, | ||
140 | 0, 0, | ||
141 | /* 0x40 - 0x47 */ | ||
142 | DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, | ||
143 | /* 0x48 - 0x4F */ | ||
144 | DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, | ||
145 | /* 0x50 - 0x57 */ | ||
146 | SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, | ||
147 | SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, | ||
148 | /* 0x58 - 0x5F */ | ||
149 | DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack, | ||
150 | DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack, | ||
151 | /* 0x60 - 0x67 */ | ||
152 | ImplicitOps | Stack | No64, ImplicitOps | Stack | No64, | ||
153 | 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ , | ||
154 | 0, 0, 0, 0, | ||
155 | /* 0x68 - 0x6F */ | ||
156 | SrcImm | Mov | Stack, 0, SrcImmByte | Mov | Stack, 0, | ||
157 | DstDI | ByteOp | Mov | String, DstDI | Mov | String, /* insb, insw/insd */ | ||
158 | SrcSI | ByteOp | ImplicitOps | String, SrcSI | ImplicitOps | String, /* outsb, outsw/outsd */ | ||
159 | /* 0x70 - 0x77 */ | ||
160 | SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, | ||
161 | SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, | ||
162 | /* 0x78 - 0x7F */ | ||
163 | SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, | ||
164 | SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, | ||
165 | /* 0x80 - 0x87 */ | ||
166 | Group | Group1_80, Group | Group1_81, | ||
167 | Group | Group1_82, Group | Group1_83, | ||
168 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
169 | ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, | ||
170 | /* 0x88 - 0x8F */ | ||
171 | ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, | ||
172 | ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
173 | DstMem | SrcNone | ModRM | Mov, ModRM | DstReg, | ||
174 | ImplicitOps | SrcMem16 | ModRM, Group | Group1A, | ||
175 | /* 0x90 - 0x97 */ | ||
176 | DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, | ||
177 | /* 0x98 - 0x9F */ | ||
178 | 0, 0, SrcImmFAddr | No64, 0, | ||
179 | ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, | ||
180 | /* 0xA0 - 0xA7 */ | ||
181 | ByteOp | DstAcc | SrcMem | Mov | MemAbs, DstAcc | SrcMem | Mov | MemAbs, | ||
182 | ByteOp | DstMem | SrcAcc | Mov | MemAbs, DstMem | SrcAcc | Mov | MemAbs, | ||
183 | ByteOp | SrcSI | DstDI | Mov | String, SrcSI | DstDI | Mov | String, | ||
184 | ByteOp | SrcSI | DstDI | String, SrcSI | DstDI | String, | ||
185 | /* 0xA8 - 0xAF */ | ||
186 | DstAcc | SrcImmByte | ByteOp, DstAcc | SrcImm, ByteOp | DstDI | Mov | String, DstDI | Mov | String, | ||
187 | ByteOp | SrcSI | DstAcc | Mov | String, SrcSI | DstAcc | Mov | String, | ||
188 | ByteOp | DstDI | String, DstDI | String, | ||
189 | /* 0xB0 - 0xB7 */ | ||
190 | ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, | ||
191 | ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, | ||
192 | ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, | ||
193 | ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, | ||
194 | /* 0xB8 - 0xBF */ | ||
195 | DstReg | SrcImm | Mov, DstReg | SrcImm | Mov, | ||
196 | DstReg | SrcImm | Mov, DstReg | SrcImm | Mov, | ||
197 | DstReg | SrcImm | Mov, DstReg | SrcImm | Mov, | ||
198 | DstReg | SrcImm | Mov, DstReg | SrcImm | Mov, | ||
199 | /* 0xC0 - 0xC7 */ | ||
200 | ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, | ||
201 | 0, ImplicitOps | Stack, 0, 0, | ||
202 | ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov, | ||
203 | /* 0xC8 - 0xCF */ | ||
204 | 0, 0, 0, ImplicitOps | Stack, | ||
205 | ImplicitOps, SrcImmByte, ImplicitOps | No64, ImplicitOps, | ||
206 | /* 0xD0 - 0xD7 */ | ||
207 | ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, | ||
208 | ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, | ||
209 | 0, 0, 0, 0, | ||
210 | /* 0xD8 - 0xDF */ | ||
211 | 0, 0, 0, 0, 0, 0, 0, 0, | ||
212 | /* 0xE0 - 0xE7 */ | ||
213 | 0, 0, 0, 0, | ||
214 | ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc, | ||
215 | ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc, | ||
216 | /* 0xE8 - 0xEF */ | ||
217 | SrcImm | Stack, SrcImm | ImplicitOps, | ||
218 | SrcImmFAddr | No64, SrcImmByte | ImplicitOps, | ||
219 | SrcNone | ByteOp | DstAcc, SrcNone | DstAcc, | ||
220 | SrcNone | ByteOp | DstAcc, SrcNone | DstAcc, | ||
221 | /* 0xF0 - 0xF7 */ | ||
222 | 0, 0, 0, 0, | ||
223 | ImplicitOps | Priv, ImplicitOps, Group | Group3_Byte, Group | Group3, | ||
224 | /* 0xF8 - 0xFF */ | ||
225 | ImplicitOps, 0, ImplicitOps, ImplicitOps, | ||
226 | ImplicitOps, ImplicitOps, Group | Group4, Group | Group5, | ||
227 | }; | ||
228 | |||
229 | static u32 twobyte_table[256] = { | ||
230 | /* 0x00 - 0x0F */ | ||
231 | 0, Group | GroupDual | Group7, 0, 0, | ||
232 | 0, ImplicitOps, ImplicitOps | Priv, 0, | ||
233 | ImplicitOps | Priv, ImplicitOps | Priv, 0, 0, | ||
234 | 0, ImplicitOps | ModRM, 0, 0, | ||
235 | /* 0x10 - 0x1F */ | ||
236 | 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0, | ||
237 | /* 0x20 - 0x2F */ | ||
238 | ModRM | ImplicitOps | Priv, ModRM | Priv, | ||
239 | ModRM | ImplicitOps | Priv, ModRM | Priv, | ||
240 | 0, 0, 0, 0, | ||
241 | 0, 0, 0, 0, 0, 0, 0, 0, | ||
242 | /* 0x30 - 0x3F */ | ||
243 | ImplicitOps | Priv, 0, ImplicitOps | Priv, 0, | ||
244 | ImplicitOps, ImplicitOps | Priv, 0, 0, | ||
245 | 0, 0, 0, 0, 0, 0, 0, 0, | ||
246 | /* 0x40 - 0x47 */ | ||
247 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
248 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
249 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
250 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
251 | /* 0x48 - 0x4F */ | ||
252 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
253 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
254 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
255 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
256 | /* 0x50 - 0x5F */ | ||
257 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
258 | /* 0x60 - 0x6F */ | ||
259 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
260 | /* 0x70 - 0x7F */ | ||
261 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
262 | /* 0x80 - 0x8F */ | ||
263 | SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, | ||
264 | SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, | ||
265 | /* 0x90 - 0x9F */ | ||
266 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
267 | /* 0xA0 - 0xA7 */ | ||
268 | ImplicitOps | Stack, ImplicitOps | Stack, | ||
269 | 0, DstMem | SrcReg | ModRM | BitOp, | ||
270 | DstMem | SrcReg | Src2ImmByte | ModRM, | ||
271 | DstMem | SrcReg | Src2CL | ModRM, 0, 0, | ||
272 | /* 0xA8 - 0xAF */ | ||
273 | ImplicitOps | Stack, ImplicitOps | Stack, | ||
274 | 0, DstMem | SrcReg | ModRM | BitOp | Lock, | ||
275 | DstMem | SrcReg | Src2ImmByte | ModRM, | ||
276 | DstMem | SrcReg | Src2CL | ModRM, | ||
277 | ModRM, 0, | ||
278 | /* 0xB0 - 0xB7 */ | ||
279 | ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, | ||
280 | 0, DstMem | SrcReg | ModRM | BitOp | Lock, | ||
281 | 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, | ||
282 | DstReg | SrcMem16 | ModRM | Mov, | ||
283 | /* 0xB8 - 0xBF */ | ||
284 | 0, 0, | ||
285 | Group | Group8, DstMem | SrcReg | ModRM | BitOp | Lock, | ||
286 | 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, | ||
287 | DstReg | SrcMem16 | ModRM | Mov, | ||
288 | /* 0xC0 - 0xCF */ | ||
289 | 0, 0, 0, DstMem | SrcReg | ModRM | Mov, | ||
290 | 0, 0, 0, Group | GroupDual | Group9, | ||
291 | 0, 0, 0, 0, 0, 0, 0, 0, | ||
292 | /* 0xD0 - 0xDF */ | ||
293 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
294 | /* 0xE0 - 0xEF */ | ||
295 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
296 | /* 0xF0 - 0xFF */ | ||
297 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 | ||
298 | }; | 120 | }; |
299 | 121 | ||
300 | static u32 group_table[] = { | 122 | struct group_dual { |
301 | [Group1_80*8] = | 123 | struct opcode mod012[8]; |
302 | ByteOp | DstMem | SrcImm | ModRM | Lock, | 124 | struct opcode mod3[8]; |
303 | ByteOp | DstMem | SrcImm | ModRM | Lock, | ||
304 | ByteOp | DstMem | SrcImm | ModRM | Lock, | ||
305 | ByteOp | DstMem | SrcImm | ModRM | Lock, | ||
306 | ByteOp | DstMem | SrcImm | ModRM | Lock, | ||
307 | ByteOp | DstMem | SrcImm | ModRM | Lock, | ||
308 | ByteOp | DstMem | SrcImm | ModRM | Lock, | ||
309 | ByteOp | DstMem | SrcImm | ModRM, | ||
310 | [Group1_81*8] = | ||
311 | DstMem | SrcImm | ModRM | Lock, | ||
312 | DstMem | SrcImm | ModRM | Lock, | ||
313 | DstMem | SrcImm | ModRM | Lock, | ||
314 | DstMem | SrcImm | ModRM | Lock, | ||
315 | DstMem | SrcImm | ModRM | Lock, | ||
316 | DstMem | SrcImm | ModRM | Lock, | ||
317 | DstMem | SrcImm | ModRM | Lock, | ||
318 | DstMem | SrcImm | ModRM, | ||
319 | [Group1_82*8] = | ||
320 | ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, | ||
321 | ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, | ||
322 | ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, | ||
323 | ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, | ||
324 | ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, | ||
325 | ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, | ||
326 | ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, | ||
327 | ByteOp | DstMem | SrcImm | ModRM | No64, | ||
328 | [Group1_83*8] = | ||
329 | DstMem | SrcImmByte | ModRM | Lock, | ||
330 | DstMem | SrcImmByte | ModRM | Lock, | ||
331 | DstMem | SrcImmByte | ModRM | Lock, | ||
332 | DstMem | SrcImmByte | ModRM | Lock, | ||
333 | DstMem | SrcImmByte | ModRM | Lock, | ||
334 | DstMem | SrcImmByte | ModRM | Lock, | ||
335 | DstMem | SrcImmByte | ModRM | Lock, | ||
336 | DstMem | SrcImmByte | ModRM, | ||
337 | [Group1A*8] = | ||
338 | DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0, | ||
339 | [Group3_Byte*8] = | ||
340 | ByteOp | SrcImm | DstMem | ModRM, ByteOp | SrcImm | DstMem | ModRM, | ||
341 | ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, | ||
342 | 0, 0, 0, 0, | ||
343 | [Group3*8] = | ||
344 | DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, | ||
345 | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, | ||
346 | 0, 0, 0, 0, | ||
347 | [Group4*8] = | ||
348 | ByteOp | DstMem | SrcNone | ModRM | Lock, ByteOp | DstMem | SrcNone | ModRM | Lock, | ||
349 | 0, 0, 0, 0, 0, 0, | ||
350 | [Group5*8] = | ||
351 | DstMem | SrcNone | ModRM | Lock, DstMem | SrcNone | ModRM | Lock, | ||
352 | SrcMem | ModRM | Stack, 0, | ||
353 | SrcMem | ModRM | Stack, SrcMemFAddr | ModRM | ImplicitOps, | ||
354 | SrcMem | ModRM | Stack, 0, | ||
355 | [Group7*8] = | ||
356 | 0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv, | ||
357 | SrcNone | ModRM | DstMem | Mov, 0, | ||
358 | SrcMem16 | ModRM | Mov | Priv, SrcMem | ModRM | ByteOp | Priv, | ||
359 | [Group8*8] = | ||
360 | 0, 0, 0, 0, | ||
361 | DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM | Lock, | ||
362 | DstMem | SrcImmByte | ModRM | Lock, DstMem | SrcImmByte | ModRM | Lock, | ||
363 | [Group9*8] = | ||
364 | 0, DstMem64 | ModRM | Lock, 0, 0, 0, 0, 0, 0, | ||
365 | }; | 125 | }; |
366 | 126 | ||
367 | static u32 group2_table[] = { | 127 | struct gprefix { |
368 | [Group7*8] = | 128 | struct opcode pfx_no; |
369 | SrcNone | ModRM | Priv, 0, 0, SrcNone | ModRM | Priv, | 129 | struct opcode pfx_66; |
370 | SrcNone | ModRM | DstMem | Mov, 0, | 130 | struct opcode pfx_f2; |
371 | SrcMem16 | ModRM | Mov | Priv, 0, | 131 | struct opcode pfx_f3; |
372 | [Group9*8] = | ||
373 | 0, 0, 0, 0, 0, 0, 0, 0, | ||
374 | }; | 132 | }; |
375 | 133 | ||
376 | /* EFLAGS bit definitions. */ | 134 | /* EFLAGS bit definitions. */ |
@@ -392,6 +150,9 @@ static u32 group2_table[] = { | |||
392 | #define EFLG_PF (1<<2) | 150 | #define EFLG_PF (1<<2) |
393 | #define EFLG_CF (1<<0) | 151 | #define EFLG_CF (1<<0) |
394 | 152 | ||
153 | #define EFLG_RESERVED_ZEROS_MASK 0xffc0802a | ||
154 | #define EFLG_RESERVED_ONE_MASK 2 | ||
155 | |||
395 | /* | 156 | /* |
396 | * Instruction emulation: | 157 | * Instruction emulation: |
397 | * Most instructions are emulated directly via a fragment of inline assembly | 158 | * Most instructions are emulated directly via a fragment of inline assembly |
@@ -444,13 +205,13 @@ static u32 group2_table[] = { | |||
444 | #define ON64(x) | 205 | #define ON64(x) |
445 | #endif | 206 | #endif |
446 | 207 | ||
447 | #define ____emulate_2op(_op, _src, _dst, _eflags, _x, _y, _suffix) \ | 208 | #define ____emulate_2op(_op, _src, _dst, _eflags, _x, _y, _suffix, _dsttype) \ |
448 | do { \ | 209 | do { \ |
449 | __asm__ __volatile__ ( \ | 210 | __asm__ __volatile__ ( \ |
450 | _PRE_EFLAGS("0", "4", "2") \ | 211 | _PRE_EFLAGS("0", "4", "2") \ |
451 | _op _suffix " %"_x"3,%1; " \ | 212 | _op _suffix " %"_x"3,%1; " \ |
452 | _POST_EFLAGS("0", "4", "2") \ | 213 | _POST_EFLAGS("0", "4", "2") \ |
453 | : "=m" (_eflags), "=m" ((_dst).val), \ | 214 | : "=m" (_eflags), "+q" (*(_dsttype*)&(_dst).val),\ |
454 | "=&r" (_tmp) \ | 215 | "=&r" (_tmp) \ |
455 | : _y ((_src).val), "i" (EFLAGS_MASK)); \ | 216 | : _y ((_src).val), "i" (EFLAGS_MASK)); \ |
456 | } while (0) | 217 | } while (0) |
@@ -463,13 +224,13 @@ static u32 group2_table[] = { | |||
463 | \ | 224 | \ |
464 | switch ((_dst).bytes) { \ | 225 | switch ((_dst).bytes) { \ |
465 | case 2: \ | 226 | case 2: \ |
466 | ____emulate_2op(_op,_src,_dst,_eflags,_wx,_wy,"w"); \ | 227 | ____emulate_2op(_op,_src,_dst,_eflags,_wx,_wy,"w",u16);\ |
467 | break; \ | 228 | break; \ |
468 | case 4: \ | 229 | case 4: \ |
469 | ____emulate_2op(_op,_src,_dst,_eflags,_lx,_ly,"l"); \ | 230 | ____emulate_2op(_op,_src,_dst,_eflags,_lx,_ly,"l",u32);\ |
470 | break; \ | 231 | break; \ |
471 | case 8: \ | 232 | case 8: \ |
472 | ON64(____emulate_2op(_op,_src,_dst,_eflags,_qx,_qy,"q")); \ | 233 | ON64(____emulate_2op(_op,_src,_dst,_eflags,_qx,_qy,"q",u64)); \ |
473 | break; \ | 234 | break; \ |
474 | } \ | 235 | } \ |
475 | } while (0) | 236 | } while (0) |
@@ -479,7 +240,7 @@ static u32 group2_table[] = { | |||
479 | unsigned long _tmp; \ | 240 | unsigned long _tmp; \ |
480 | switch ((_dst).bytes) { \ | 241 | switch ((_dst).bytes) { \ |
481 | case 1: \ | 242 | case 1: \ |
482 | ____emulate_2op(_op,_src,_dst,_eflags,_bx,_by,"b"); \ | 243 | ____emulate_2op(_op,_src,_dst,_eflags,_bx,_by,"b",u8); \ |
483 | break; \ | 244 | break; \ |
484 | default: \ | 245 | default: \ |
485 | __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ | 246 | __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ |
@@ -504,42 +265,42 @@ static u32 group2_table[] = { | |||
504 | "w", "r", _LO32, "r", "", "r") | 265 | "w", "r", _LO32, "r", "", "r") |
505 | 266 | ||
506 | /* Instruction has three operands and one operand is stored in ECX register */ | 267 | /* Instruction has three operands and one operand is stored in ECX register */ |
507 | #define __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, _suffix, _type) \ | 268 | #define __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, _suffix, _type) \ |
508 | do { \ | 269 | do { \ |
509 | unsigned long _tmp; \ | 270 | unsigned long _tmp; \ |
510 | _type _clv = (_cl).val; \ | 271 | _type _clv = (_cl).val; \ |
511 | _type _srcv = (_src).val; \ | 272 | _type _srcv = (_src).val; \ |
512 | _type _dstv = (_dst).val; \ | 273 | _type _dstv = (_dst).val; \ |
513 | \ | 274 | \ |
514 | __asm__ __volatile__ ( \ | 275 | __asm__ __volatile__ ( \ |
515 | _PRE_EFLAGS("0", "5", "2") \ | 276 | _PRE_EFLAGS("0", "5", "2") \ |
516 | _op _suffix " %4,%1 \n" \ | 277 | _op _suffix " %4,%1 \n" \ |
517 | _POST_EFLAGS("0", "5", "2") \ | 278 | _POST_EFLAGS("0", "5", "2") \ |
518 | : "=m" (_eflags), "+r" (_dstv), "=&r" (_tmp) \ | 279 | : "=m" (_eflags), "+r" (_dstv), "=&r" (_tmp) \ |
519 | : "c" (_clv) , "r" (_srcv), "i" (EFLAGS_MASK) \ | 280 | : "c" (_clv) , "r" (_srcv), "i" (EFLAGS_MASK) \ |
520 | ); \ | 281 | ); \ |
521 | \ | 282 | \ |
522 | (_cl).val = (unsigned long) _clv; \ | 283 | (_cl).val = (unsigned long) _clv; \ |
523 | (_src).val = (unsigned long) _srcv; \ | 284 | (_src).val = (unsigned long) _srcv; \ |
524 | (_dst).val = (unsigned long) _dstv; \ | 285 | (_dst).val = (unsigned long) _dstv; \ |
525 | } while (0) | 286 | } while (0) |
526 | 287 | ||
527 | #define emulate_2op_cl(_op, _cl, _src, _dst, _eflags) \ | 288 | #define emulate_2op_cl(_op, _cl, _src, _dst, _eflags) \ |
528 | do { \ | 289 | do { \ |
529 | switch ((_dst).bytes) { \ | 290 | switch ((_dst).bytes) { \ |
530 | case 2: \ | 291 | case 2: \ |
531 | __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ | 292 | __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ |
532 | "w", unsigned short); \ | 293 | "w", unsigned short); \ |
533 | break; \ | 294 | break; \ |
534 | case 4: \ | 295 | case 4: \ |
535 | __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ | 296 | __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ |
536 | "l", unsigned int); \ | 297 | "l", unsigned int); \ |
537 | break; \ | 298 | break; \ |
538 | case 8: \ | 299 | case 8: \ |
539 | ON64(__emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ | 300 | ON64(__emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ |
540 | "q", unsigned long)); \ | 301 | "q", unsigned long)); \ |
541 | break; \ | 302 | break; \ |
542 | } \ | 303 | } \ |
543 | } while (0) | 304 | } while (0) |
544 | 305 | ||
545 | #define __emulate_1op(_op, _dst, _eflags, _suffix) \ | 306 | #define __emulate_1op(_op, _dst, _eflags, _suffix) \ |
@@ -566,6 +327,86 @@ static u32 group2_table[] = { | |||
566 | } \ | 327 | } \ |
567 | } while (0) | 328 | } while (0) |
568 | 329 | ||
330 | #define __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, _suffix) \ | ||
331 | do { \ | ||
332 | unsigned long _tmp; \ | ||
333 | \ | ||
334 | __asm__ __volatile__ ( \ | ||
335 | _PRE_EFLAGS("0", "4", "1") \ | ||
336 | _op _suffix " %5; " \ | ||
337 | _POST_EFLAGS("0", "4", "1") \ | ||
338 | : "=m" (_eflags), "=&r" (_tmp), \ | ||
339 | "+a" (_rax), "+d" (_rdx) \ | ||
340 | : "i" (EFLAGS_MASK), "m" ((_src).val), \ | ||
341 | "a" (_rax), "d" (_rdx)); \ | ||
342 | } while (0) | ||
343 | |||
344 | #define __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, _eflags, _suffix, _ex) \ | ||
345 | do { \ | ||
346 | unsigned long _tmp; \ | ||
347 | \ | ||
348 | __asm__ __volatile__ ( \ | ||
349 | _PRE_EFLAGS("0", "5", "1") \ | ||
350 | "1: \n\t" \ | ||
351 | _op _suffix " %6; " \ | ||
352 | "2: \n\t" \ | ||
353 | _POST_EFLAGS("0", "5", "1") \ | ||
354 | ".pushsection .fixup,\"ax\" \n\t" \ | ||
355 | "3: movb $1, %4 \n\t" \ | ||
356 | "jmp 2b \n\t" \ | ||
357 | ".popsection \n\t" \ | ||
358 | _ASM_EXTABLE(1b, 3b) \ | ||
359 | : "=m" (_eflags), "=&r" (_tmp), \ | ||
360 | "+a" (_rax), "+d" (_rdx), "+qm"(_ex) \ | ||
361 | : "i" (EFLAGS_MASK), "m" ((_src).val), \ | ||
362 | "a" (_rax), "d" (_rdx)); \ | ||
363 | } while (0) | ||
364 | |||
365 | /* instruction has only one source operand, destination is implicit (e.g. mul, div, imul, idiv) */ | ||
366 | #define emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags) \ | ||
367 | do { \ | ||
368 | switch((_src).bytes) { \ | ||
369 | case 1: \ | ||
370 | __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \ | ||
371 | _eflags, "b"); \ | ||
372 | break; \ | ||
373 | case 2: \ | ||
374 | __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \ | ||
375 | _eflags, "w"); \ | ||
376 | break; \ | ||
377 | case 4: \ | ||
378 | __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \ | ||
379 | _eflags, "l"); \ | ||
380 | break; \ | ||
381 | case 8: \ | ||
382 | ON64(__emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \ | ||
383 | _eflags, "q")); \ | ||
384 | break; \ | ||
385 | } \ | ||
386 | } while (0) | ||
387 | |||
388 | #define emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, _eflags, _ex) \ | ||
389 | do { \ | ||
390 | switch((_src).bytes) { \ | ||
391 | case 1: \ | ||
392 | __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \ | ||
393 | _eflags, "b", _ex); \ | ||
394 | break; \ | ||
395 | case 2: \ | ||
396 | __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \ | ||
397 | _eflags, "w", _ex); \ | ||
398 | break; \ | ||
399 | case 4: \ | ||
400 | __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \ | ||
401 | _eflags, "l", _ex); \ | ||
402 | break; \ | ||
403 | case 8: ON64( \ | ||
404 | __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \ | ||
405 | _eflags, "q", _ex)); \ | ||
406 | break; \ | ||
407 | } \ | ||
408 | } while (0) | ||
409 | |||
569 | /* Fetch next part of the instruction being emulated. */ | 410 | /* Fetch next part of the instruction being emulated. */ |
570 | #define insn_fetch(_type, _size, _eip) \ | 411 | #define insn_fetch(_type, _size, _eip) \ |
571 | ({ unsigned long _x; \ | 412 | ({ unsigned long _x; \ |
@@ -576,13 +417,33 @@ static u32 group2_table[] = { | |||
576 | (_type)_x; \ | 417 | (_type)_x; \ |
577 | }) | 418 | }) |
578 | 419 | ||
579 | #define insn_fetch_arr(_arr, _size, _eip) \ | 420 | #define insn_fetch_arr(_arr, _size, _eip) \ |
580 | ({ rc = do_insn_fetch(ctxt, ops, (_eip), _arr, (_size)); \ | 421 | ({ rc = do_insn_fetch(ctxt, ops, (_eip), _arr, (_size)); \ |
581 | if (rc != X86EMUL_CONTINUE) \ | 422 | if (rc != X86EMUL_CONTINUE) \ |
582 | goto done; \ | 423 | goto done; \ |
583 | (_eip) += (_size); \ | 424 | (_eip) += (_size); \ |
584 | }) | 425 | }) |
585 | 426 | ||
427 | static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt, | ||
428 | enum x86_intercept intercept, | ||
429 | enum x86_intercept_stage stage) | ||
430 | { | ||
431 | struct x86_instruction_info info = { | ||
432 | .intercept = intercept, | ||
433 | .rep_prefix = ctxt->decode.rep_prefix, | ||
434 | .modrm_mod = ctxt->decode.modrm_mod, | ||
435 | .modrm_reg = ctxt->decode.modrm_reg, | ||
436 | .modrm_rm = ctxt->decode.modrm_rm, | ||
437 | .src_val = ctxt->decode.src.val64, | ||
438 | .src_bytes = ctxt->decode.src.bytes, | ||
439 | .dst_bytes = ctxt->decode.dst.bytes, | ||
440 | .ad_bytes = ctxt->decode.ad_bytes, | ||
441 | .next_rip = ctxt->eip, | ||
442 | }; | ||
443 | |||
444 | return ctxt->ops->intercept(ctxt, &info, stage); | ||
445 | } | ||
446 | |||
586 | static inline unsigned long ad_mask(struct decode_cache *c) | 447 | static inline unsigned long ad_mask(struct decode_cache *c) |
587 | { | 448 | { |
588 | return (1UL << (c->ad_bytes << 3)) - 1; | 449 | return (1UL << (c->ad_bytes << 3)) - 1; |
@@ -599,9 +460,9 @@ address_mask(struct decode_cache *c, unsigned long reg) | |||
599 | } | 460 | } |
600 | 461 | ||
601 | static inline unsigned long | 462 | static inline unsigned long |
602 | register_address(struct decode_cache *c, unsigned long base, unsigned long reg) | 463 | register_address(struct decode_cache *c, unsigned long reg) |
603 | { | 464 | { |
604 | return base + address_mask(c, reg); | 465 | return address_mask(c, reg); |
605 | } | 466 | } |
606 | 467 | ||
607 | static inline void | 468 | static inline void |
@@ -618,6 +479,13 @@ static inline void jmp_rel(struct decode_cache *c, int rel) | |||
618 | register_address_increment(c, &c->eip, rel); | 479 | register_address_increment(c, &c->eip, rel); |
619 | } | 480 | } |
620 | 481 | ||
482 | static u32 desc_limit_scaled(struct desc_struct *desc) | ||
483 | { | ||
484 | u32 limit = get_desc_limit(desc); | ||
485 | |||
486 | return desc->g ? (limit << 12) | 0xfff : limit; | ||
487 | } | ||
488 | |||
621 | static void set_seg_override(struct decode_cache *c, int seg) | 489 | static void set_seg_override(struct decode_cache *c, int seg) |
622 | { | 490 | { |
623 | c->has_seg_override = true; | 491 | c->has_seg_override = true; |
@@ -630,60 +498,177 @@ static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, | |||
630 | if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS) | 498 | if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS) |
631 | return 0; | 499 | return 0; |
632 | 500 | ||
633 | return ops->get_cached_segment_base(seg, ctxt->vcpu); | 501 | return ops->get_cached_segment_base(ctxt, seg); |
634 | } | 502 | } |
635 | 503 | ||
636 | static unsigned long seg_override_base(struct x86_emulate_ctxt *ctxt, | 504 | static unsigned seg_override(struct x86_emulate_ctxt *ctxt, |
637 | struct x86_emulate_ops *ops, | 505 | struct decode_cache *c) |
638 | struct decode_cache *c) | ||
639 | { | 506 | { |
640 | if (!c->has_seg_override) | 507 | if (!c->has_seg_override) |
641 | return 0; | 508 | return 0; |
642 | 509 | ||
643 | return seg_base(ctxt, ops, c->seg_override); | 510 | return c->seg_override; |
644 | } | 511 | } |
645 | 512 | ||
646 | static unsigned long es_base(struct x86_emulate_ctxt *ctxt, | 513 | static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec, |
647 | struct x86_emulate_ops *ops) | 514 | u32 error, bool valid) |
648 | { | 515 | { |
649 | return seg_base(ctxt, ops, VCPU_SREG_ES); | 516 | ctxt->exception.vector = vec; |
517 | ctxt->exception.error_code = error; | ||
518 | ctxt->exception.error_code_valid = valid; | ||
519 | return X86EMUL_PROPAGATE_FAULT; | ||
650 | } | 520 | } |
651 | 521 | ||
652 | static unsigned long ss_base(struct x86_emulate_ctxt *ctxt, | 522 | static int emulate_db(struct x86_emulate_ctxt *ctxt) |
653 | struct x86_emulate_ops *ops) | 523 | { |
524 | return emulate_exception(ctxt, DB_VECTOR, 0, false); | ||
525 | } | ||
526 | |||
527 | static int emulate_gp(struct x86_emulate_ctxt *ctxt, int err) | ||
528 | { | ||
529 | return emulate_exception(ctxt, GP_VECTOR, err, true); | ||
530 | } | ||
531 | |||
532 | static int emulate_ss(struct x86_emulate_ctxt *ctxt, int err) | ||
533 | { | ||
534 | return emulate_exception(ctxt, SS_VECTOR, err, true); | ||
535 | } | ||
536 | |||
537 | static int emulate_ud(struct x86_emulate_ctxt *ctxt) | ||
538 | { | ||
539 | return emulate_exception(ctxt, UD_VECTOR, 0, false); | ||
540 | } | ||
541 | |||
542 | static int emulate_ts(struct x86_emulate_ctxt *ctxt, int err) | ||
543 | { | ||
544 | return emulate_exception(ctxt, TS_VECTOR, err, true); | ||
545 | } | ||
546 | |||
547 | static int emulate_de(struct x86_emulate_ctxt *ctxt) | ||
654 | { | 548 | { |
655 | return seg_base(ctxt, ops, VCPU_SREG_SS); | 549 | return emulate_exception(ctxt, DE_VECTOR, 0, false); |
656 | } | 550 | } |
657 | 551 | ||
658 | static void emulate_exception(struct x86_emulate_ctxt *ctxt, int vec, | 552 | static int emulate_nm(struct x86_emulate_ctxt *ctxt) |
659 | u32 error, bool valid) | ||
660 | { | 553 | { |
661 | ctxt->exception = vec; | 554 | return emulate_exception(ctxt, NM_VECTOR, 0, false); |
662 | ctxt->error_code = error; | ||
663 | ctxt->error_code_valid = valid; | ||
664 | ctxt->restart = false; | ||
665 | } | 555 | } |
666 | 556 | ||
667 | static void emulate_gp(struct x86_emulate_ctxt *ctxt, int err) | 557 | static u16 get_segment_selector(struct x86_emulate_ctxt *ctxt, unsigned seg) |
668 | { | 558 | { |
669 | emulate_exception(ctxt, GP_VECTOR, err, true); | 559 | u16 selector; |
560 | struct desc_struct desc; | ||
561 | |||
562 | ctxt->ops->get_segment(ctxt, &selector, &desc, NULL, seg); | ||
563 | return selector; | ||
670 | } | 564 | } |
671 | 565 | ||
672 | static void emulate_pf(struct x86_emulate_ctxt *ctxt, unsigned long addr, | 566 | static void set_segment_selector(struct x86_emulate_ctxt *ctxt, u16 selector, |
673 | int err) | 567 | unsigned seg) |
674 | { | 568 | { |
675 | ctxt->cr2 = addr; | 569 | u16 dummy; |
676 | emulate_exception(ctxt, PF_VECTOR, err, true); | 570 | u32 base3; |
571 | struct desc_struct desc; | ||
572 | |||
573 | ctxt->ops->get_segment(ctxt, &dummy, &desc, &base3, seg); | ||
574 | ctxt->ops->set_segment(ctxt, selector, &desc, base3, seg); | ||
677 | } | 575 | } |
678 | 576 | ||
679 | static void emulate_ud(struct x86_emulate_ctxt *ctxt) | 577 | static int __linearize(struct x86_emulate_ctxt *ctxt, |
578 | struct segmented_address addr, | ||
579 | unsigned size, bool write, bool fetch, | ||
580 | ulong *linear) | ||
680 | { | 581 | { |
681 | emulate_exception(ctxt, UD_VECTOR, 0, false); | 582 | struct decode_cache *c = &ctxt->decode; |
583 | struct desc_struct desc; | ||
584 | bool usable; | ||
585 | ulong la; | ||
586 | u32 lim; | ||
587 | u16 sel; | ||
588 | unsigned cpl, rpl; | ||
589 | |||
590 | la = seg_base(ctxt, ctxt->ops, addr.seg) + addr.ea; | ||
591 | switch (ctxt->mode) { | ||
592 | case X86EMUL_MODE_REAL: | ||
593 | break; | ||
594 | case X86EMUL_MODE_PROT64: | ||
595 | if (((signed long)la << 16) >> 16 != la) | ||
596 | return emulate_gp(ctxt, 0); | ||
597 | break; | ||
598 | default: | ||
599 | usable = ctxt->ops->get_segment(ctxt, &sel, &desc, NULL, | ||
600 | addr.seg); | ||
601 | if (!usable) | ||
602 | goto bad; | ||
603 | /* code segment or read-only data segment */ | ||
604 | if (((desc.type & 8) || !(desc.type & 2)) && write) | ||
605 | goto bad; | ||
606 | /* unreadable code segment */ | ||
607 | if (!fetch && (desc.type & 8) && !(desc.type & 2)) | ||
608 | goto bad; | ||
609 | lim = desc_limit_scaled(&desc); | ||
610 | if ((desc.type & 8) || !(desc.type & 4)) { | ||
611 | /* expand-up segment */ | ||
612 | if (addr.ea > lim || (u32)(addr.ea + size - 1) > lim) | ||
613 | goto bad; | ||
614 | } else { | ||
615 | /* exapand-down segment */ | ||
616 | if (addr.ea <= lim || (u32)(addr.ea + size - 1) <= lim) | ||
617 | goto bad; | ||
618 | lim = desc.d ? 0xffffffff : 0xffff; | ||
619 | if (addr.ea > lim || (u32)(addr.ea + size - 1) > lim) | ||
620 | goto bad; | ||
621 | } | ||
622 | cpl = ctxt->ops->cpl(ctxt); | ||
623 | rpl = sel & 3; | ||
624 | cpl = max(cpl, rpl); | ||
625 | if (!(desc.type & 8)) { | ||
626 | /* data segment */ | ||
627 | if (cpl > desc.dpl) | ||
628 | goto bad; | ||
629 | } else if ((desc.type & 8) && !(desc.type & 4)) { | ||
630 | /* nonconforming code segment */ | ||
631 | if (cpl != desc.dpl) | ||
632 | goto bad; | ||
633 | } else if ((desc.type & 8) && (desc.type & 4)) { | ||
634 | /* conforming code segment */ | ||
635 | if (cpl < desc.dpl) | ||
636 | goto bad; | ||
637 | } | ||
638 | break; | ||
639 | } | ||
640 | if (fetch ? ctxt->mode != X86EMUL_MODE_PROT64 : c->ad_bytes != 8) | ||
641 | la &= (u32)-1; | ||
642 | *linear = la; | ||
643 | return X86EMUL_CONTINUE; | ||
644 | bad: | ||
645 | if (addr.seg == VCPU_SREG_SS) | ||
646 | return emulate_ss(ctxt, addr.seg); | ||
647 | else | ||
648 | return emulate_gp(ctxt, addr.seg); | ||
682 | } | 649 | } |
683 | 650 | ||
684 | static void emulate_ts(struct x86_emulate_ctxt *ctxt, int err) | 651 | static int linearize(struct x86_emulate_ctxt *ctxt, |
652 | struct segmented_address addr, | ||
653 | unsigned size, bool write, | ||
654 | ulong *linear) | ||
685 | { | 655 | { |
686 | emulate_exception(ctxt, TS_VECTOR, err, true); | 656 | return __linearize(ctxt, addr, size, write, false, linear); |
657 | } | ||
658 | |||
659 | |||
660 | static int segmented_read_std(struct x86_emulate_ctxt *ctxt, | ||
661 | struct segmented_address addr, | ||
662 | void *data, | ||
663 | unsigned size) | ||
664 | { | ||
665 | int rc; | ||
666 | ulong linear; | ||
667 | |||
668 | rc = linearize(ctxt, addr, size, false, &linear); | ||
669 | if (rc != X86EMUL_CONTINUE) | ||
670 | return rc; | ||
671 | return ctxt->ops->read_std(ctxt, linear, data, size, &ctxt->exception); | ||
687 | } | 672 | } |
688 | 673 | ||
689 | static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, | 674 | static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, |
@@ -695,10 +680,15 @@ static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, | |||
695 | int size, cur_size; | 680 | int size, cur_size; |
696 | 681 | ||
697 | if (eip == fc->end) { | 682 | if (eip == fc->end) { |
683 | unsigned long linear; | ||
684 | struct segmented_address addr = { .seg=VCPU_SREG_CS, .ea=eip}; | ||
698 | cur_size = fc->end - fc->start; | 685 | cur_size = fc->end - fc->start; |
699 | size = min(15UL - cur_size, PAGE_SIZE - offset_in_page(eip)); | 686 | size = min(15UL - cur_size, PAGE_SIZE - offset_in_page(eip)); |
700 | rc = ops->fetch(ctxt->cs_base + eip, fc->data + cur_size, | 687 | rc = __linearize(ctxt, addr, size, false, true, &linear); |
701 | size, ctxt->vcpu, NULL); | 688 | if (rc != X86EMUL_CONTINUE) |
689 | return rc; | ||
690 | rc = ops->fetch(ctxt, linear, fc->data + cur_size, | ||
691 | size, &ctxt->exception); | ||
702 | if (rc != X86EMUL_CONTINUE) | 692 | if (rc != X86EMUL_CONTINUE) |
703 | return rc; | 693 | return rc; |
704 | fc->end += size; | 694 | fc->end += size; |
@@ -741,8 +731,7 @@ static void *decode_register(u8 modrm_reg, unsigned long *regs, | |||
741 | } | 731 | } |
742 | 732 | ||
743 | static int read_descriptor(struct x86_emulate_ctxt *ctxt, | 733 | static int read_descriptor(struct x86_emulate_ctxt *ctxt, |
744 | struct x86_emulate_ops *ops, | 734 | struct segmented_address addr, |
745 | void *ptr, | ||
746 | u16 *size, unsigned long *address, int op_bytes) | 735 | u16 *size, unsigned long *address, int op_bytes) |
747 | { | 736 | { |
748 | int rc; | 737 | int rc; |
@@ -750,12 +739,11 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt, | |||
750 | if (op_bytes == 2) | 739 | if (op_bytes == 2) |
751 | op_bytes = 3; | 740 | op_bytes = 3; |
752 | *address = 0; | 741 | *address = 0; |
753 | rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2, | 742 | rc = segmented_read_std(ctxt, addr, size, 2); |
754 | ctxt->vcpu, NULL); | ||
755 | if (rc != X86EMUL_CONTINUE) | 743 | if (rc != X86EMUL_CONTINUE) |
756 | return rc; | 744 | return rc; |
757 | rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes, | 745 | addr.ea += 2; |
758 | ctxt->vcpu, NULL); | 746 | rc = segmented_read_std(ctxt, addr, address, op_bytes); |
759 | return rc; | 747 | return rc; |
760 | } | 748 | } |
761 | 749 | ||
@@ -794,7 +782,81 @@ static int test_cc(unsigned int condition, unsigned int flags) | |||
794 | return (!!rc ^ (condition & 1)); | 782 | return (!!rc ^ (condition & 1)); |
795 | } | 783 | } |
796 | 784 | ||
797 | static void decode_register_operand(struct operand *op, | 785 | static void fetch_register_operand(struct operand *op) |
786 | { | ||
787 | switch (op->bytes) { | ||
788 | case 1: | ||
789 | op->val = *(u8 *)op->addr.reg; | ||
790 | break; | ||
791 | case 2: | ||
792 | op->val = *(u16 *)op->addr.reg; | ||
793 | break; | ||
794 | case 4: | ||
795 | op->val = *(u32 *)op->addr.reg; | ||
796 | break; | ||
797 | case 8: | ||
798 | op->val = *(u64 *)op->addr.reg; | ||
799 | break; | ||
800 | } | ||
801 | } | ||
802 | |||
803 | static void read_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, int reg) | ||
804 | { | ||
805 | ctxt->ops->get_fpu(ctxt); | ||
806 | switch (reg) { | ||
807 | case 0: asm("movdqu %%xmm0, %0" : "=m"(*data)); break; | ||
808 | case 1: asm("movdqu %%xmm1, %0" : "=m"(*data)); break; | ||
809 | case 2: asm("movdqu %%xmm2, %0" : "=m"(*data)); break; | ||
810 | case 3: asm("movdqu %%xmm3, %0" : "=m"(*data)); break; | ||
811 | case 4: asm("movdqu %%xmm4, %0" : "=m"(*data)); break; | ||
812 | case 5: asm("movdqu %%xmm5, %0" : "=m"(*data)); break; | ||
813 | case 6: asm("movdqu %%xmm6, %0" : "=m"(*data)); break; | ||
814 | case 7: asm("movdqu %%xmm7, %0" : "=m"(*data)); break; | ||
815 | #ifdef CONFIG_X86_64 | ||
816 | case 8: asm("movdqu %%xmm8, %0" : "=m"(*data)); break; | ||
817 | case 9: asm("movdqu %%xmm9, %0" : "=m"(*data)); break; | ||
818 | case 10: asm("movdqu %%xmm10, %0" : "=m"(*data)); break; | ||
819 | case 11: asm("movdqu %%xmm11, %0" : "=m"(*data)); break; | ||
820 | case 12: asm("movdqu %%xmm12, %0" : "=m"(*data)); break; | ||
821 | case 13: asm("movdqu %%xmm13, %0" : "=m"(*data)); break; | ||
822 | case 14: asm("movdqu %%xmm14, %0" : "=m"(*data)); break; | ||
823 | case 15: asm("movdqu %%xmm15, %0" : "=m"(*data)); break; | ||
824 | #endif | ||
825 | default: BUG(); | ||
826 | } | ||
827 | ctxt->ops->put_fpu(ctxt); | ||
828 | } | ||
829 | |||
830 | static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, | ||
831 | int reg) | ||
832 | { | ||
833 | ctxt->ops->get_fpu(ctxt); | ||
834 | switch (reg) { | ||
835 | case 0: asm("movdqu %0, %%xmm0" : : "m"(*data)); break; | ||
836 | case 1: asm("movdqu %0, %%xmm1" : : "m"(*data)); break; | ||
837 | case 2: asm("movdqu %0, %%xmm2" : : "m"(*data)); break; | ||
838 | case 3: asm("movdqu %0, %%xmm3" : : "m"(*data)); break; | ||
839 | case 4: asm("movdqu %0, %%xmm4" : : "m"(*data)); break; | ||
840 | case 5: asm("movdqu %0, %%xmm5" : : "m"(*data)); break; | ||
841 | case 6: asm("movdqu %0, %%xmm6" : : "m"(*data)); break; | ||
842 | case 7: asm("movdqu %0, %%xmm7" : : "m"(*data)); break; | ||
843 | #ifdef CONFIG_X86_64 | ||
844 | case 8: asm("movdqu %0, %%xmm8" : : "m"(*data)); break; | ||
845 | case 9: asm("movdqu %0, %%xmm9" : : "m"(*data)); break; | ||
846 | case 10: asm("movdqu %0, %%xmm10" : : "m"(*data)); break; | ||
847 | case 11: asm("movdqu %0, %%xmm11" : : "m"(*data)); break; | ||
848 | case 12: asm("movdqu %0, %%xmm12" : : "m"(*data)); break; | ||
849 | case 13: asm("movdqu %0, %%xmm13" : : "m"(*data)); break; | ||
850 | case 14: asm("movdqu %0, %%xmm14" : : "m"(*data)); break; | ||
851 | case 15: asm("movdqu %0, %%xmm15" : : "m"(*data)); break; | ||
852 | #endif | ||
853 | default: BUG(); | ||
854 | } | ||
855 | ctxt->ops->put_fpu(ctxt); | ||
856 | } | ||
857 | |||
858 | static void decode_register_operand(struct x86_emulate_ctxt *ctxt, | ||
859 | struct operand *op, | ||
798 | struct decode_cache *c, | 860 | struct decode_cache *c, |
799 | int inhibit_bytereg) | 861 | int inhibit_bytereg) |
800 | { | 862 | { |
@@ -803,36 +865,36 @@ static void decode_register_operand(struct operand *op, | |||
803 | 865 | ||
804 | if (!(c->d & ModRM)) | 866 | if (!(c->d & ModRM)) |
805 | reg = (c->b & 7) | ((c->rex_prefix & 1) << 3); | 867 | reg = (c->b & 7) | ((c->rex_prefix & 1) << 3); |
868 | |||
869 | if (c->d & Sse) { | ||
870 | op->type = OP_XMM; | ||
871 | op->bytes = 16; | ||
872 | op->addr.xmm = reg; | ||
873 | read_sse_reg(ctxt, &op->vec_val, reg); | ||
874 | return; | ||
875 | } | ||
876 | |||
806 | op->type = OP_REG; | 877 | op->type = OP_REG; |
807 | if ((c->d & ByteOp) && !inhibit_bytereg) { | 878 | if ((c->d & ByteOp) && !inhibit_bytereg) { |
808 | op->ptr = decode_register(reg, c->regs, highbyte_regs); | 879 | op->addr.reg = decode_register(reg, c->regs, highbyte_regs); |
809 | op->val = *(u8 *)op->ptr; | ||
810 | op->bytes = 1; | 880 | op->bytes = 1; |
811 | } else { | 881 | } else { |
812 | op->ptr = decode_register(reg, c->regs, 0); | 882 | op->addr.reg = decode_register(reg, c->regs, 0); |
813 | op->bytes = c->op_bytes; | 883 | op->bytes = c->op_bytes; |
814 | switch (op->bytes) { | ||
815 | case 2: | ||
816 | op->val = *(u16 *)op->ptr; | ||
817 | break; | ||
818 | case 4: | ||
819 | op->val = *(u32 *)op->ptr; | ||
820 | break; | ||
821 | case 8: | ||
822 | op->val = *(u64 *) op->ptr; | ||
823 | break; | ||
824 | } | ||
825 | } | 884 | } |
885 | fetch_register_operand(op); | ||
826 | op->orig_val = op->val; | 886 | op->orig_val = op->val; |
827 | } | 887 | } |
828 | 888 | ||
829 | static int decode_modrm(struct x86_emulate_ctxt *ctxt, | 889 | static int decode_modrm(struct x86_emulate_ctxt *ctxt, |
830 | struct x86_emulate_ops *ops) | 890 | struct x86_emulate_ops *ops, |
891 | struct operand *op) | ||
831 | { | 892 | { |
832 | struct decode_cache *c = &ctxt->decode; | 893 | struct decode_cache *c = &ctxt->decode; |
833 | u8 sib; | 894 | u8 sib; |
834 | int index_reg = 0, base_reg = 0, scale; | 895 | int index_reg = 0, base_reg = 0, scale; |
835 | int rc = X86EMUL_CONTINUE; | 896 | int rc = X86EMUL_CONTINUE; |
897 | ulong modrm_ea = 0; | ||
836 | 898 | ||
837 | if (c->rex_prefix) { | 899 | if (c->rex_prefix) { |
838 | c->modrm_reg = (c->rex_prefix & 4) << 1; /* REX.R */ | 900 | c->modrm_reg = (c->rex_prefix & 4) << 1; /* REX.R */ |
@@ -844,16 +906,26 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, | |||
844 | c->modrm_mod |= (c->modrm & 0xc0) >> 6; | 906 | c->modrm_mod |= (c->modrm & 0xc0) >> 6; |
845 | c->modrm_reg |= (c->modrm & 0x38) >> 3; | 907 | c->modrm_reg |= (c->modrm & 0x38) >> 3; |
846 | c->modrm_rm |= (c->modrm & 0x07); | 908 | c->modrm_rm |= (c->modrm & 0x07); |
847 | c->modrm_ea = 0; | 909 | c->modrm_seg = VCPU_SREG_DS; |
848 | c->use_modrm_ea = 1; | ||
849 | 910 | ||
850 | if (c->modrm_mod == 3) { | 911 | if (c->modrm_mod == 3) { |
851 | c->modrm_ptr = decode_register(c->modrm_rm, | 912 | op->type = OP_REG; |
913 | op->bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | ||
914 | op->addr.reg = decode_register(c->modrm_rm, | ||
852 | c->regs, c->d & ByteOp); | 915 | c->regs, c->d & ByteOp); |
853 | c->modrm_val = *(unsigned long *)c->modrm_ptr; | 916 | if (c->d & Sse) { |
917 | op->type = OP_XMM; | ||
918 | op->bytes = 16; | ||
919 | op->addr.xmm = c->modrm_rm; | ||
920 | read_sse_reg(ctxt, &op->vec_val, c->modrm_rm); | ||
921 | return rc; | ||
922 | } | ||
923 | fetch_register_operand(op); | ||
854 | return rc; | 924 | return rc; |
855 | } | 925 | } |
856 | 926 | ||
927 | op->type = OP_MEM; | ||
928 | |||
857 | if (c->ad_bytes == 2) { | 929 | if (c->ad_bytes == 2) { |
858 | unsigned bx = c->regs[VCPU_REGS_RBX]; | 930 | unsigned bx = c->regs[VCPU_REGS_RBX]; |
859 | unsigned bp = c->regs[VCPU_REGS_RBP]; | 931 | unsigned bp = c->regs[VCPU_REGS_RBP]; |
@@ -864,47 +936,46 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, | |||
864 | switch (c->modrm_mod) { | 936 | switch (c->modrm_mod) { |
865 | case 0: | 937 | case 0: |
866 | if (c->modrm_rm == 6) | 938 | if (c->modrm_rm == 6) |
867 | c->modrm_ea += insn_fetch(u16, 2, c->eip); | 939 | modrm_ea += insn_fetch(u16, 2, c->eip); |
868 | break; | 940 | break; |
869 | case 1: | 941 | case 1: |
870 | c->modrm_ea += insn_fetch(s8, 1, c->eip); | 942 | modrm_ea += insn_fetch(s8, 1, c->eip); |
871 | break; | 943 | break; |
872 | case 2: | 944 | case 2: |
873 | c->modrm_ea += insn_fetch(u16, 2, c->eip); | 945 | modrm_ea += insn_fetch(u16, 2, c->eip); |
874 | break; | 946 | break; |
875 | } | 947 | } |
876 | switch (c->modrm_rm) { | 948 | switch (c->modrm_rm) { |
877 | case 0: | 949 | case 0: |
878 | c->modrm_ea += bx + si; | 950 | modrm_ea += bx + si; |
879 | break; | 951 | break; |
880 | case 1: | 952 | case 1: |
881 | c->modrm_ea += bx + di; | 953 | modrm_ea += bx + di; |
882 | break; | 954 | break; |
883 | case 2: | 955 | case 2: |
884 | c->modrm_ea += bp + si; | 956 | modrm_ea += bp + si; |
885 | break; | 957 | break; |
886 | case 3: | 958 | case 3: |
887 | c->modrm_ea += bp + di; | 959 | modrm_ea += bp + di; |
888 | break; | 960 | break; |
889 | case 4: | 961 | case 4: |
890 | c->modrm_ea += si; | 962 | modrm_ea += si; |
891 | break; | 963 | break; |
892 | case 5: | 964 | case 5: |
893 | c->modrm_ea += di; | 965 | modrm_ea += di; |
894 | break; | 966 | break; |
895 | case 6: | 967 | case 6: |
896 | if (c->modrm_mod != 0) | 968 | if (c->modrm_mod != 0) |
897 | c->modrm_ea += bp; | 969 | modrm_ea += bp; |
898 | break; | 970 | break; |
899 | case 7: | 971 | case 7: |
900 | c->modrm_ea += bx; | 972 | modrm_ea += bx; |
901 | break; | 973 | break; |
902 | } | 974 | } |
903 | if (c->modrm_rm == 2 || c->modrm_rm == 3 || | 975 | if (c->modrm_rm == 2 || c->modrm_rm == 3 || |
904 | (c->modrm_rm == 6 && c->modrm_mod != 0)) | 976 | (c->modrm_rm == 6 && c->modrm_mod != 0)) |
905 | if (!c->has_seg_override) | 977 | c->modrm_seg = VCPU_SREG_SS; |
906 | set_seg_override(c, VCPU_SREG_SS); | 978 | modrm_ea = (u16)modrm_ea; |
907 | c->modrm_ea = (u16)c->modrm_ea; | ||
908 | } else { | 979 | } else { |
909 | /* 32/64-bit ModR/M decode. */ | 980 | /* 32/64-bit ModR/M decode. */ |
910 | if ((c->modrm_rm & 7) == 4) { | 981 | if ((c->modrm_rm & 7) == 4) { |
@@ -914,410 +985,74 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, | |||
914 | scale = sib >> 6; | 985 | scale = sib >> 6; |
915 | 986 | ||
916 | if ((base_reg & 7) == 5 && c->modrm_mod == 0) | 987 | if ((base_reg & 7) == 5 && c->modrm_mod == 0) |
917 | c->modrm_ea += insn_fetch(s32, 4, c->eip); | 988 | modrm_ea += insn_fetch(s32, 4, c->eip); |
918 | else | 989 | else |
919 | c->modrm_ea += c->regs[base_reg]; | 990 | modrm_ea += c->regs[base_reg]; |
920 | if (index_reg != 4) | 991 | if (index_reg != 4) |
921 | c->modrm_ea += c->regs[index_reg] << scale; | 992 | modrm_ea += c->regs[index_reg] << scale; |
922 | } else if ((c->modrm_rm & 7) == 5 && c->modrm_mod == 0) { | 993 | } else if ((c->modrm_rm & 7) == 5 && c->modrm_mod == 0) { |
923 | if (ctxt->mode == X86EMUL_MODE_PROT64) | 994 | if (ctxt->mode == X86EMUL_MODE_PROT64) |
924 | c->rip_relative = 1; | 995 | c->rip_relative = 1; |
925 | } else | 996 | } else |
926 | c->modrm_ea += c->regs[c->modrm_rm]; | 997 | modrm_ea += c->regs[c->modrm_rm]; |
927 | switch (c->modrm_mod) { | 998 | switch (c->modrm_mod) { |
928 | case 0: | 999 | case 0: |
929 | if (c->modrm_rm == 5) | 1000 | if (c->modrm_rm == 5) |
930 | c->modrm_ea += insn_fetch(s32, 4, c->eip); | 1001 | modrm_ea += insn_fetch(s32, 4, c->eip); |
931 | break; | 1002 | break; |
932 | case 1: | 1003 | case 1: |
933 | c->modrm_ea += insn_fetch(s8, 1, c->eip); | 1004 | modrm_ea += insn_fetch(s8, 1, c->eip); |
934 | break; | 1005 | break; |
935 | case 2: | 1006 | case 2: |
936 | c->modrm_ea += insn_fetch(s32, 4, c->eip); | 1007 | modrm_ea += insn_fetch(s32, 4, c->eip); |
937 | break; | 1008 | break; |
938 | } | 1009 | } |
939 | } | 1010 | } |
1011 | op->addr.mem.ea = modrm_ea; | ||
940 | done: | 1012 | done: |
941 | return rc; | 1013 | return rc; |
942 | } | 1014 | } |
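The rewritten decode_modrm() above now accumulates the effective address in a local modrm_ea and stores it into op->addr.mem.ea, but the 16-bit addressing table itself is unchanged. For reference, a minimal standalone sketch of that classic base/index table; the helper name and the simplified register struct are hypothetical, not the emulator's types.

#include <stdint.h>

struct regs16 { uint16_t bx, bp, si, di; };

static uint16_t modrm16_base_index(const struct regs16 *r, unsigned rm, unsigned mod)
{
        switch (rm & 7) {
        case 0: return r->bx + r->si;
        case 1: return r->bx + r->di;
        case 2: return r->bp + r->si;   /* BP-based: default segment SS */
        case 3: return r->bp + r->di;   /* BP-based: default segment SS */
        case 4: return r->si;
        case 5: return r->di;
        case 6: return (mod == 0) ? 0 : r->bp;  /* mod==0: disp16 only, no base */
        case 7: return r->bx;
        }
        return 0;
}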
943 | 1015 | ||
944 | static int decode_abs(struct x86_emulate_ctxt *ctxt, | 1016 | static int decode_abs(struct x86_emulate_ctxt *ctxt, |
945 | struct x86_emulate_ops *ops) | 1017 | struct x86_emulate_ops *ops, |
1018 | struct operand *op) | ||
946 | { | 1019 | { |
947 | struct decode_cache *c = &ctxt->decode; | 1020 | struct decode_cache *c = &ctxt->decode; |
948 | int rc = X86EMUL_CONTINUE; | 1021 | int rc = X86EMUL_CONTINUE; |
949 | 1022 | ||
1023 | op->type = OP_MEM; | ||
950 | switch (c->ad_bytes) { | 1024 | switch (c->ad_bytes) { |
951 | case 2: | 1025 | case 2: |
952 | c->modrm_ea = insn_fetch(u16, 2, c->eip); | 1026 | op->addr.mem.ea = insn_fetch(u16, 2, c->eip); |
953 | break; | 1027 | break; |
954 | case 4: | 1028 | case 4: |
955 | c->modrm_ea = insn_fetch(u32, 4, c->eip); | 1029 | op->addr.mem.ea = insn_fetch(u32, 4, c->eip); |
956 | break; | 1030 | break; |
957 | case 8: | 1031 | case 8: |
958 | c->modrm_ea = insn_fetch(u64, 8, c->eip); | 1032 | op->addr.mem.ea = insn_fetch(u64, 8, c->eip); |
959 | break; | 1033 | break; |
960 | } | 1034 | } |
961 | done: | 1035 | done: |
962 | return rc; | 1036 | return rc; |
963 | } | 1037 | } |
964 | 1038 | ||
965 | int | 1039 | static void fetch_bit_operand(struct decode_cache *c) |
966 | x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | ||
967 | { | 1040 | { |
968 | struct decode_cache *c = &ctxt->decode; | 1041 | long sv = 0, mask; |
969 | int rc = X86EMUL_CONTINUE; | ||
970 | int mode = ctxt->mode; | ||
971 | int def_op_bytes, def_ad_bytes, group; | ||
972 | |||
973 | |||
974 | /* we cannot decode insn before we complete previous rep insn */ | ||
975 | WARN_ON(ctxt->restart); | ||
976 | |||
977 | c->eip = ctxt->eip; | ||
978 | c->fetch.start = c->fetch.end = c->eip; | ||
979 | ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS); | ||
980 | |||
981 | switch (mode) { | ||
982 | case X86EMUL_MODE_REAL: | ||
983 | case X86EMUL_MODE_VM86: | ||
984 | case X86EMUL_MODE_PROT16: | ||
985 | def_op_bytes = def_ad_bytes = 2; | ||
986 | break; | ||
987 | case X86EMUL_MODE_PROT32: | ||
988 | def_op_bytes = def_ad_bytes = 4; | ||
989 | break; | ||
990 | #ifdef CONFIG_X86_64 | ||
991 | case X86EMUL_MODE_PROT64: | ||
992 | def_op_bytes = 4; | ||
993 | def_ad_bytes = 8; | ||
994 | break; | ||
995 | #endif | ||
996 | default: | ||
997 | return -1; | ||
998 | } | ||
999 | |||
1000 | c->op_bytes = def_op_bytes; | ||
1001 | c->ad_bytes = def_ad_bytes; | ||
1002 | |||
1003 | /* Legacy prefixes. */ | ||
1004 | for (;;) { | ||
1005 | switch (c->b = insn_fetch(u8, 1, c->eip)) { | ||
1006 | case 0x66: /* operand-size override */ | ||
1007 | /* switch between 2/4 bytes */ | ||
1008 | c->op_bytes = def_op_bytes ^ 6; | ||
1009 | break; | ||
1010 | case 0x67: /* address-size override */ | ||
1011 | if (mode == X86EMUL_MODE_PROT64) | ||
1012 | /* switch between 4/8 bytes */ | ||
1013 | c->ad_bytes = def_ad_bytes ^ 12; | ||
1014 | else | ||
1015 | /* switch between 2/4 bytes */ | ||
1016 | c->ad_bytes = def_ad_bytes ^ 6; | ||
1017 | break; | ||
1018 | case 0x26: /* ES override */ | ||
1019 | case 0x2e: /* CS override */ | ||
1020 | case 0x36: /* SS override */ | ||
1021 | case 0x3e: /* DS override */ | ||
1022 | set_seg_override(c, (c->b >> 3) & 3); | ||
1023 | break; | ||
1024 | case 0x64: /* FS override */ | ||
1025 | case 0x65: /* GS override */ | ||
1026 | set_seg_override(c, c->b & 7); | ||
1027 | break; | ||
1028 | case 0x40 ... 0x4f: /* REX */ | ||
1029 | if (mode != X86EMUL_MODE_PROT64) | ||
1030 | goto done_prefixes; | ||
1031 | c->rex_prefix = c->b; | ||
1032 | continue; | ||
1033 | case 0xf0: /* LOCK */ | ||
1034 | c->lock_prefix = 1; | ||
1035 | break; | ||
1036 | case 0xf2: /* REPNE/REPNZ */ | ||
1037 | c->rep_prefix = REPNE_PREFIX; | ||
1038 | break; | ||
1039 | case 0xf3: /* REP/REPE/REPZ */ | ||
1040 | c->rep_prefix = REPE_PREFIX; | ||
1041 | break; | ||
1042 | default: | ||
1043 | goto done_prefixes; | ||
1044 | } | ||
1045 | |||
1046 | /* Any legacy prefix after a REX prefix nullifies its effect. */ | ||
1047 | 1042 | ||
1048 | c->rex_prefix = 0; | 1043 | if (c->dst.type == OP_MEM && c->src.type == OP_REG) { |
1049 | } | 1044 | mask = ~(c->dst.bytes * 8 - 1); |
1050 | |||
1051 | done_prefixes: | ||
1052 | |||
1053 | /* REX prefix. */ | ||
1054 | if (c->rex_prefix) | ||
1055 | if (c->rex_prefix & 8) | ||
1056 | c->op_bytes = 8; /* REX.W */ | ||
1057 | 1045 | ||
1058 | /* Opcode byte(s). */ | 1046 | if (c->src.bytes == 2) |
1059 | c->d = opcode_table[c->b]; | 1047 | sv = (s16)c->src.val & (s16)mask; |
1060 | if (c->d == 0) { | 1048 | else if (c->src.bytes == 4) |
1061 | /* Two-byte opcode? */ | 1049 | sv = (s32)c->src.val & (s32)mask; |
1062 | if (c->b == 0x0f) { | ||
1063 | c->twobyte = 1; | ||
1064 | c->b = insn_fetch(u8, 1, c->eip); | ||
1065 | c->d = twobyte_table[c->b]; | ||
1066 | } | ||
1067 | } | ||
1068 | |||
1069 | if (c->d & Group) { | ||
1070 | group = c->d & GroupMask; | ||
1071 | c->modrm = insn_fetch(u8, 1, c->eip); | ||
1072 | --c->eip; | ||
1073 | |||
1074 | group = (group << 3) + ((c->modrm >> 3) & 7); | ||
1075 | if ((c->d & GroupDual) && (c->modrm >> 6) == 3) | ||
1076 | c->d = group2_table[group]; | ||
1077 | else | ||
1078 | c->d = group_table[group]; | ||
1079 | } | ||
1080 | 1050 | ||
1081 | /* Unrecognised? */ | 1051 | c->dst.addr.mem.ea += (sv >> 3); |
1082 | if (c->d == 0) { | ||
1083 | DPRINTF("Cannot emulate %02x\n", c->b); | ||
1084 | return -1; | ||
1085 | } | 1052 | } |
1086 | 1053 | ||
1087 | if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack)) | 1054 | /* only subword offset */ |
1088 | c->op_bytes = 8; | 1055 | c->src.val &= (c->dst.bytes << 3) - 1; |
1089 | |||
1090 | /* ModRM and SIB bytes. */ | ||
1091 | if (c->d & ModRM) | ||
1092 | rc = decode_modrm(ctxt, ops); | ||
1093 | else if (c->d & MemAbs) | ||
1094 | rc = decode_abs(ctxt, ops); | ||
1095 | if (rc != X86EMUL_CONTINUE) | ||
1096 | goto done; | ||
1097 | |||
1098 | if (!c->has_seg_override) | ||
1099 | set_seg_override(c, VCPU_SREG_DS); | ||
1100 | |||
1101 | if (!(!c->twobyte && c->b == 0x8d)) | ||
1102 | c->modrm_ea += seg_override_base(ctxt, ops, c); | ||
1103 | |||
1104 | if (c->ad_bytes != 8) | ||
1105 | c->modrm_ea = (u32)c->modrm_ea; | ||
1106 | |||
1107 | if (c->rip_relative) | ||
1108 | c->modrm_ea += c->eip; | ||
1109 | |||
1110 | /* | ||
1111 | * Decode and fetch the source operand: register, memory | ||
1112 | * or immediate. | ||
1113 | */ | ||
1114 | switch (c->d & SrcMask) { | ||
1115 | case SrcNone: | ||
1116 | break; | ||
1117 | case SrcReg: | ||
1118 | decode_register_operand(&c->src, c, 0); | ||
1119 | break; | ||
1120 | case SrcMem16: | ||
1121 | c->src.bytes = 2; | ||
1122 | goto srcmem_common; | ||
1123 | case SrcMem32: | ||
1124 | c->src.bytes = 4; | ||
1125 | goto srcmem_common; | ||
1126 | case SrcMem: | ||
1127 | c->src.bytes = (c->d & ByteOp) ? 1 : | ||
1128 | c->op_bytes; | ||
1129 | /* Don't fetch the address for invlpg: it could be unmapped. */ | ||
1130 | if (c->twobyte && c->b == 0x01 && c->modrm_reg == 7) | ||
1131 | break; | ||
1132 | srcmem_common: | ||
1133 | /* | ||
1134 | * For instructions with a ModR/M byte, switch to register | ||
1135 | * access if Mod = 3. | ||
1136 | */ | ||
1137 | if ((c->d & ModRM) && c->modrm_mod == 3) { | ||
1138 | c->src.type = OP_REG; | ||
1139 | c->src.val = c->modrm_val; | ||
1140 | c->src.ptr = c->modrm_ptr; | ||
1141 | break; | ||
1142 | } | ||
1143 | c->src.type = OP_MEM; | ||
1144 | c->src.ptr = (unsigned long *)c->modrm_ea; | ||
1145 | c->src.val = 0; | ||
1146 | break; | ||
1147 | case SrcImm: | ||
1148 | case SrcImmU: | ||
1149 | c->src.type = OP_IMM; | ||
1150 | c->src.ptr = (unsigned long *)c->eip; | ||
1151 | c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | ||
1152 | if (c->src.bytes == 8) | ||
1153 | c->src.bytes = 4; | ||
1154 | /* NB. Immediates are sign-extended as necessary. */ | ||
1155 | switch (c->src.bytes) { | ||
1156 | case 1: | ||
1157 | c->src.val = insn_fetch(s8, 1, c->eip); | ||
1158 | break; | ||
1159 | case 2: | ||
1160 | c->src.val = insn_fetch(s16, 2, c->eip); | ||
1161 | break; | ||
1162 | case 4: | ||
1163 | c->src.val = insn_fetch(s32, 4, c->eip); | ||
1164 | break; | ||
1165 | } | ||
1166 | if ((c->d & SrcMask) == SrcImmU) { | ||
1167 | switch (c->src.bytes) { | ||
1168 | case 1: | ||
1169 | c->src.val &= 0xff; | ||
1170 | break; | ||
1171 | case 2: | ||
1172 | c->src.val &= 0xffff; | ||
1173 | break; | ||
1174 | case 4: | ||
1175 | c->src.val &= 0xffffffff; | ||
1176 | break; | ||
1177 | } | ||
1178 | } | ||
1179 | break; | ||
1180 | case SrcImmByte: | ||
1181 | case SrcImmUByte: | ||
1182 | c->src.type = OP_IMM; | ||
1183 | c->src.ptr = (unsigned long *)c->eip; | ||
1184 | c->src.bytes = 1; | ||
1185 | if ((c->d & SrcMask) == SrcImmByte) | ||
1186 | c->src.val = insn_fetch(s8, 1, c->eip); | ||
1187 | else | ||
1188 | c->src.val = insn_fetch(u8, 1, c->eip); | ||
1189 | break; | ||
1190 | case SrcAcc: | ||
1191 | c->src.type = OP_REG; | ||
1192 | c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | ||
1193 | c->src.ptr = &c->regs[VCPU_REGS_RAX]; | ||
1194 | switch (c->src.bytes) { | ||
1195 | case 1: | ||
1196 | c->src.val = *(u8 *)c->src.ptr; | ||
1197 | break; | ||
1198 | case 2: | ||
1199 | c->src.val = *(u16 *)c->src.ptr; | ||
1200 | break; | ||
1201 | case 4: | ||
1202 | c->src.val = *(u32 *)c->src.ptr; | ||
1203 | break; | ||
1204 | case 8: | ||
1205 | c->src.val = *(u64 *)c->src.ptr; | ||
1206 | break; | ||
1207 | } | ||
1208 | break; | ||
1209 | case SrcOne: | ||
1210 | c->src.bytes = 1; | ||
1211 | c->src.val = 1; | ||
1212 | break; | ||
1213 | case SrcSI: | ||
1214 | c->src.type = OP_MEM; | ||
1215 | c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | ||
1216 | c->src.ptr = (unsigned long *) | ||
1217 | register_address(c, seg_override_base(ctxt, ops, c), | ||
1218 | c->regs[VCPU_REGS_RSI]); | ||
1219 | c->src.val = 0; | ||
1220 | break; | ||
1221 | case SrcImmFAddr: | ||
1222 | c->src.type = OP_IMM; | ||
1223 | c->src.ptr = (unsigned long *)c->eip; | ||
1224 | c->src.bytes = c->op_bytes + 2; | ||
1225 | insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip); | ||
1226 | break; | ||
1227 | case SrcMemFAddr: | ||
1228 | c->src.type = OP_MEM; | ||
1229 | c->src.ptr = (unsigned long *)c->modrm_ea; | ||
1230 | c->src.bytes = c->op_bytes + 2; | ||
1231 | break; | ||
1232 | } | ||
1233 | |||
1234 | /* | ||
1235 | * Decode and fetch the second source operand: register, memory | ||
1236 | * or immediate. | ||
1237 | */ | ||
1238 | switch (c->d & Src2Mask) { | ||
1239 | case Src2None: | ||
1240 | break; | ||
1241 | case Src2CL: | ||
1242 | c->src2.bytes = 1; | ||
1243 | c->src2.val = c->regs[VCPU_REGS_RCX] & 0x8; | ||
1244 | break; | ||
1245 | case Src2ImmByte: | ||
1246 | c->src2.type = OP_IMM; | ||
1247 | c->src2.ptr = (unsigned long *)c->eip; | ||
1248 | c->src2.bytes = 1; | ||
1249 | c->src2.val = insn_fetch(u8, 1, c->eip); | ||
1250 | break; | ||
1251 | case Src2One: | ||
1252 | c->src2.bytes = 1; | ||
1253 | c->src2.val = 1; | ||
1254 | break; | ||
1255 | } | ||
1256 | |||
1257 | /* Decode and fetch the destination operand: register or memory. */ | ||
1258 | switch (c->d & DstMask) { | ||
1259 | case ImplicitOps: | ||
1260 | /* Special instructions do their own operand decoding. */ | ||
1261 | return 0; | ||
1262 | case DstReg: | ||
1263 | decode_register_operand(&c->dst, c, | ||
1264 | c->twobyte && (c->b == 0xb6 || c->b == 0xb7)); | ||
1265 | break; | ||
1266 | case DstMem: | ||
1267 | case DstMem64: | ||
1268 | if ((c->d & ModRM) && c->modrm_mod == 3) { | ||
1269 | c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | ||
1270 | c->dst.type = OP_REG; | ||
1271 | c->dst.val = c->dst.orig_val = c->modrm_val; | ||
1272 | c->dst.ptr = c->modrm_ptr; | ||
1273 | break; | ||
1274 | } | ||
1275 | c->dst.type = OP_MEM; | ||
1276 | c->dst.ptr = (unsigned long *)c->modrm_ea; | ||
1277 | if ((c->d & DstMask) == DstMem64) | ||
1278 | c->dst.bytes = 8; | ||
1279 | else | ||
1280 | c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | ||
1281 | c->dst.val = 0; | ||
1282 | if (c->d & BitOp) { | ||
1283 | unsigned long mask = ~(c->dst.bytes * 8 - 1); | ||
1284 | |||
1285 | c->dst.ptr = (void *)c->dst.ptr + | ||
1286 | (c->src.val & mask) / 8; | ||
1287 | } | ||
1288 | break; | ||
1289 | case DstAcc: | ||
1290 | c->dst.type = OP_REG; | ||
1291 | c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | ||
1292 | c->dst.ptr = &c->regs[VCPU_REGS_RAX]; | ||
1293 | switch (c->dst.bytes) { | ||
1294 | case 1: | ||
1295 | c->dst.val = *(u8 *)c->dst.ptr; | ||
1296 | break; | ||
1297 | case 2: | ||
1298 | c->dst.val = *(u16 *)c->dst.ptr; | ||
1299 | break; | ||
1300 | case 4: | ||
1301 | c->dst.val = *(u32 *)c->dst.ptr; | ||
1302 | break; | ||
1303 | case 8: | ||
1304 | c->dst.val = *(u64 *)c->dst.ptr; | ||
1305 | break; | ||
1306 | } | ||
1307 | c->dst.orig_val = c->dst.val; | ||
1308 | break; | ||
1309 | case DstDI: | ||
1310 | c->dst.type = OP_MEM; | ||
1311 | c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | ||
1312 | c->dst.ptr = (unsigned long *) | ||
1313 | register_address(c, es_base(ctxt, ops), | ||
1314 | c->regs[VCPU_REGS_RDI]); | ||
1315 | c->dst.val = 0; | ||
1316 | break; | ||
1317 | } | ||
1318 | |||
1319 | done: | ||
1320 | return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; | ||
1321 | } | 1056 | } |
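fetch_bit_operand() above handles BT/BTS/BTR/BTC with a memory destination: the register bit offset may point outside the addressed word, so the byte address is adjusted by the signed byte part of the offset and only the sub-word bit index is kept. A self-contained sketch of that split; the helper name and signature are illustrative only.

#include <stdint.h>

static void split_bit_offset(int64_t bit_off, unsigned op_bytes,
                             int64_t *byte_adjust, unsigned *bit_in_op)
{
        int64_t mask = ~(int64_t)(op_bytes * 8 - 1);

        /* signed byte displacement added to the memory operand address */
        *byte_adjust = (bit_off & mask) >> 3;
        /* bit index kept within the operand width */
        *bit_in_op   = bit_off & (op_bytes * 8 - 1);
}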
1322 | 1057 | ||
1323 | static int read_emulated(struct x86_emulate_ctxt *ctxt, | 1058 | static int read_emulated(struct x86_emulate_ctxt *ctxt, |
@@ -1326,7 +1061,6 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt, | |||
1326 | { | 1061 | { |
1327 | int rc; | 1062 | int rc; |
1328 | struct read_cache *mc = &ctxt->decode.mem_read; | 1063 | struct read_cache *mc = &ctxt->decode.mem_read; |
1329 | u32 err; | ||
1330 | 1064 | ||
1331 | while (size) { | 1065 | while (size) { |
1332 | int n = min(size, 8u); | 1066 | int n = min(size, 8u); |
@@ -1334,10 +1068,8 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt, | |||
1334 | if (mc->pos < mc->end) | 1068 | if (mc->pos < mc->end) |
1335 | goto read_cached; | 1069 | goto read_cached; |
1336 | 1070 | ||
1337 | rc = ops->read_emulated(addr, mc->data + mc->end, n, &err, | 1071 | rc = ops->read_emulated(ctxt, addr, mc->data + mc->end, n, |
1338 | ctxt->vcpu); | 1072 | &ctxt->exception); |
1339 | if (rc == X86EMUL_PROPAGATE_FAULT) | ||
1340 | emulate_pf(ctxt, addr, err); | ||
1341 | if (rc != X86EMUL_CONTINUE) | 1073 | if (rc != X86EMUL_CONTINUE) |
1342 | return rc; | 1074 | return rc; |
1343 | mc->end += n; | 1075 | mc->end += n; |
@@ -1351,6 +1083,50 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt, | |||
1351 | return X86EMUL_CONTINUE; | 1083 | return X86EMUL_CONTINUE; |
1352 | } | 1084 | } |
1353 | 1085 | ||
1086 | static int segmented_read(struct x86_emulate_ctxt *ctxt, | ||
1087 | struct segmented_address addr, | ||
1088 | void *data, | ||
1089 | unsigned size) | ||
1090 | { | ||
1091 | int rc; | ||
1092 | ulong linear; | ||
1093 | |||
1094 | rc = linearize(ctxt, addr, size, false, &linear); | ||
1095 | if (rc != X86EMUL_CONTINUE) | ||
1096 | return rc; | ||
1097 | return read_emulated(ctxt, ctxt->ops, linear, data, size); | ||
1098 | } | ||
1099 | |||
1100 | static int segmented_write(struct x86_emulate_ctxt *ctxt, | ||
1101 | struct segmented_address addr, | ||
1102 | const void *data, | ||
1103 | unsigned size) | ||
1104 | { | ||
1105 | int rc; | ||
1106 | ulong linear; | ||
1107 | |||
1108 | rc = linearize(ctxt, addr, size, true, &linear); | ||
1109 | if (rc != X86EMUL_CONTINUE) | ||
1110 | return rc; | ||
1111 | return ctxt->ops->write_emulated(ctxt, linear, data, size, | ||
1112 | &ctxt->exception); | ||
1113 | } | ||
1114 | |||
1115 | static int segmented_cmpxchg(struct x86_emulate_ctxt *ctxt, | ||
1116 | struct segmented_address addr, | ||
1117 | const void *orig_data, const void *data, | ||
1118 | unsigned size) | ||
1119 | { | ||
1120 | int rc; | ||
1121 | ulong linear; | ||
1122 | |||
1123 | rc = linearize(ctxt, addr, size, true, &linear); | ||
1124 | if (rc != X86EMUL_CONTINUE) | ||
1125 | return rc; | ||
1126 | return ctxt->ops->cmpxchg_emulated(ctxt, linear, orig_data, data, | ||
1127 | size, &ctxt->exception); | ||
1128 | } | ||
1129 | |||
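The new segmented_read/write/cmpxchg helpers above all follow one pattern: linearize the segment:offset address (passing whether the access is a write), then hand the linear address to the flat accessor with ctxt->exception for fault reporting. A minimal sketch of that pattern, with stand-in callback types rather than the emulator's real hooks.

typedef int (*linearize_fn)(unsigned seg, unsigned long ea, unsigned size,
                            int write, unsigned long *linear);
typedef int (*flat_read_fn)(unsigned long linear, void *data, unsigned size);

static int segmented_read_sketch(linearize_fn linearize, flat_read_fn flat_read,
                                 unsigned seg, unsigned long ea,
                                 void *data, unsigned size)
{
        unsigned long linear;
        int rc = linearize(seg, ea, size, 0 /* read */, &linear);

        if (rc != 0)
                return rc;      /* segment limit / canonical check failed */
        return flat_read(linear, data, size);
}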
1354 | static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, | 1130 | static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, |
1355 | struct x86_emulate_ops *ops, | 1131 | struct x86_emulate_ops *ops, |
1356 | unsigned int size, unsigned short port, | 1132 | unsigned int size, unsigned short port, |
@@ -1371,7 +1147,7 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, | |||
1371 | if (n == 0) | 1147 | if (n == 0) |
1372 | n = 1; | 1148 | n = 1; |
1373 | rc->pos = rc->end = 0; | 1149 | rc->pos = rc->end = 0; |
1374 | if (!ops->pio_in_emulated(size, port, rc->data, n, ctxt->vcpu)) | 1150 | if (!ops->pio_in_emulated(ctxt, size, port, rc->data, n)) |
1375 | return 0; | 1151 | return 0; |
1376 | rc->end = n * size; | 1152 | rc->end = n * size; |
1377 | } | 1153 | } |
@@ -1381,27 +1157,22 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, | |||
1381 | return 1; | 1157 | return 1; |
1382 | } | 1158 | } |
1383 | 1159 | ||
1384 | static u32 desc_limit_scaled(struct desc_struct *desc) | ||
1385 | { | ||
1386 | u32 limit = get_desc_limit(desc); | ||
1387 | |||
1388 | return desc->g ? (limit << 12) | 0xfff : limit; | ||
1389 | } | ||
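desc_limit_scaled() disappears from this spot but is still called below (e.g. in get_descriptor_table_ptr), so it presumably lives elsewhere after this commit. For reference, the granularity scaling it performs, as a hypothetical standalone helper: with G=1 the 20-bit limit counts 4 KiB pages.

#include <stdint.h>

static uint32_t limit_scaled(uint32_t raw_limit, int g_bit)
{
        return g_bit ? (raw_limit << 12) | 0xfff : raw_limit;
}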
1390 | |||
1391 | static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt, | 1160 | static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt, |
1392 | struct x86_emulate_ops *ops, | 1161 | struct x86_emulate_ops *ops, |
1393 | u16 selector, struct desc_ptr *dt) | 1162 | u16 selector, struct desc_ptr *dt) |
1394 | { | 1163 | { |
1395 | if (selector & 1 << 2) { | 1164 | if (selector & 1 << 2) { |
1396 | struct desc_struct desc; | 1165 | struct desc_struct desc; |
1166 | u16 sel; | ||
1167 | |||
1397 | memset (dt, 0, sizeof *dt); | 1168 | memset (dt, 0, sizeof *dt); |
1398 | if (!ops->get_cached_descriptor(&desc, VCPU_SREG_LDTR, ctxt->vcpu)) | 1169 | if (!ops->get_segment(ctxt, &sel, &desc, NULL, VCPU_SREG_LDTR)) |
1399 | return; | 1170 | return; |
1400 | 1171 | ||
1401 | dt->size = desc_limit_scaled(&desc); /* what if limit > 65535? */ | 1172 | dt->size = desc_limit_scaled(&desc); /* what if limit > 65535? */ |
1402 | dt->address = get_desc_base(&desc); | 1173 | dt->address = get_desc_base(&desc); |
1403 | } else | 1174 | } else |
1404 | ops->get_gdt(dt, ctxt->vcpu); | 1175 | ops->get_gdt(ctxt, dt); |
1405 | } | 1176 | } |
1406 | 1177 | ||
1407 | /* allowed just for 8 bytes segments */ | 1178 | /* allowed just for 8 bytes segments */ |
@@ -1412,19 +1183,14 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, | |||
1412 | struct desc_ptr dt; | 1183 | struct desc_ptr dt; |
1413 | u16 index = selector >> 3; | 1184 | u16 index = selector >> 3; |
1414 | int ret; | 1185 | int ret; |
1415 | u32 err; | ||
1416 | ulong addr; | 1186 | ulong addr; |
1417 | 1187 | ||
1418 | get_descriptor_table_ptr(ctxt, ops, selector, &dt); | 1188 | get_descriptor_table_ptr(ctxt, ops, selector, &dt); |
1419 | 1189 | ||
1420 | if (dt.size < index * 8 + 7) { | 1190 | if (dt.size < index * 8 + 7) |
1421 | emulate_gp(ctxt, selector & 0xfffc); | 1191 | return emulate_gp(ctxt, selector & 0xfffc); |
1422 | return X86EMUL_PROPAGATE_FAULT; | ||
1423 | } | ||
1424 | addr = dt.address + index * 8; | 1192 | addr = dt.address + index * 8; |
1425 | ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu, &err); | 1193 | ret = ops->read_std(ctxt, addr, desc, sizeof *desc, &ctxt->exception); |
1426 | if (ret == X86EMUL_PROPAGATE_FAULT) | ||
1427 | emulate_pf(ctxt, addr, err); | ||
1428 | 1194 | ||
1429 | return ret; | 1195 | return ret; |
1430 | } | 1196 | } |
@@ -1436,25 +1202,21 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt, | |||
1436 | { | 1202 | { |
1437 | struct desc_ptr dt; | 1203 | struct desc_ptr dt; |
1438 | u16 index = selector >> 3; | 1204 | u16 index = selector >> 3; |
1439 | u32 err; | ||
1440 | ulong addr; | 1205 | ulong addr; |
1441 | int ret; | 1206 | int ret; |
1442 | 1207 | ||
1443 | get_descriptor_table_ptr(ctxt, ops, selector, &dt); | 1208 | get_descriptor_table_ptr(ctxt, ops, selector, &dt); |
1444 | 1209 | ||
1445 | if (dt.size < index * 8 + 7) { | 1210 | if (dt.size < index * 8 + 7) |
1446 | emulate_gp(ctxt, selector & 0xfffc); | 1211 | return emulate_gp(ctxt, selector & 0xfffc); |
1447 | return X86EMUL_PROPAGATE_FAULT; | ||
1448 | } | ||
1449 | 1212 | ||
1450 | addr = dt.address + index * 8; | 1213 | addr = dt.address + index * 8; |
1451 | ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err); | 1214 | ret = ops->write_std(ctxt, addr, desc, sizeof *desc, &ctxt->exception); |
1452 | if (ret == X86EMUL_PROPAGATE_FAULT) | ||
1453 | emulate_pf(ctxt, addr, err); | ||
1454 | 1215 | ||
1455 | return ret; | 1216 | return ret; |
1456 | } | 1217 | } |
1457 | 1218 | ||
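Both descriptor helpers above locate an entry the same way: bit 2 of the selector chooses GDT vs. LDT, bits 15:3 index 8-byte entries, and an out-of-range index raises #GP with the selector's RPL/TI bits cleared. A sketch of that lookup with the table base and limit as plain parameters, not the emulator API.

#include <stdint.h>
#include <stdbool.h>

static bool descriptor_addr(uint16_t selector, uint64_t table_base,
                            uint32_t table_limit, uint64_t *entry_addr)
{
        uint32_t index = selector >> 3;

        if (table_limit < index * 8 + 7)
                return false;           /* would raise #GP(selector & 0xfffc) */
        *entry_addr = table_base + index * 8;
        return true;
}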
1219 | /* Does not support long mode */ | ||
1458 | static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, | 1220 | static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, |
1459 | struct x86_emulate_ops *ops, | 1221 | struct x86_emulate_ops *ops, |
1460 | u16 selector, int seg) | 1222 | u16 selector, int seg) |
@@ -1509,7 +1271,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, | |||
1509 | 1271 | ||
1510 | rpl = selector & 3; | 1272 | rpl = selector & 3; |
1511 | dpl = seg_desc.dpl; | 1273 | dpl = seg_desc.dpl; |
1512 | cpl = ops->cpl(ctxt->vcpu); | 1274 | cpl = ops->cpl(ctxt); |
1513 | 1275 | ||
1514 | switch (seg) { | 1276 | switch (seg) { |
1515 | case VCPU_SREG_SS: | 1277 | case VCPU_SREG_SS: |
@@ -1565,63 +1327,59 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, | |||
1565 | return ret; | 1327 | return ret; |
1566 | } | 1328 | } |
1567 | load: | 1329 | load: |
1568 | ops->set_segment_selector(selector, seg, ctxt->vcpu); | 1330 | ops->set_segment(ctxt, selector, &seg_desc, 0, seg); |
1569 | ops->set_cached_descriptor(&seg_desc, seg, ctxt->vcpu); | ||
1570 | return X86EMUL_CONTINUE; | 1331 | return X86EMUL_CONTINUE; |
1571 | exception: | 1332 | exception: |
1572 | emulate_exception(ctxt, err_vec, err_code, true); | 1333 | emulate_exception(ctxt, err_vec, err_code, true); |
1573 | return X86EMUL_PROPAGATE_FAULT; | 1334 | return X86EMUL_PROPAGATE_FAULT; |
1574 | } | 1335 | } |
1575 | 1336 | ||
1576 | static inline int writeback(struct x86_emulate_ctxt *ctxt, | 1337 | static void write_register_operand(struct operand *op) |
1577 | struct x86_emulate_ops *ops) | 1338 | { |
1339 | /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */ | ||
1340 | switch (op->bytes) { | ||
1341 | case 1: | ||
1342 | *(u8 *)op->addr.reg = (u8)op->val; | ||
1343 | break; | ||
1344 | case 2: | ||
1345 | *(u16 *)op->addr.reg = (u16)op->val; | ||
1346 | break; | ||
1347 | case 4: | ||
1348 | *op->addr.reg = (u32)op->val; | ||
1349 | break; /* 64b: zero-extend */ | ||
1350 | case 8: | ||
1351 | *op->addr.reg = op->val; | ||
1352 | break; | ||
1353 | } | ||
1354 | } | ||
1355 | |||
1356 | static int writeback(struct x86_emulate_ctxt *ctxt) | ||
1578 | { | 1357 | { |
1579 | int rc; | 1358 | int rc; |
1580 | struct decode_cache *c = &ctxt->decode; | 1359 | struct decode_cache *c = &ctxt->decode; |
1581 | u32 err; | ||
1582 | 1360 | ||
1583 | switch (c->dst.type) { | 1361 | switch (c->dst.type) { |
1584 | case OP_REG: | 1362 | case OP_REG: |
1585 | /* The 4-byte case *is* correct: | 1363 | write_register_operand(&c->dst); |
1586 | * in 64-bit mode we zero-extend. | ||
1587 | */ | ||
1588 | switch (c->dst.bytes) { | ||
1589 | case 1: | ||
1590 | *(u8 *)c->dst.ptr = (u8)c->dst.val; | ||
1591 | break; | ||
1592 | case 2: | ||
1593 | *(u16 *)c->dst.ptr = (u16)c->dst.val; | ||
1594 | break; | ||
1595 | case 4: | ||
1596 | *c->dst.ptr = (u32)c->dst.val; | ||
1597 | break; /* 64b: zero-ext */ | ||
1598 | case 8: | ||
1599 | *c->dst.ptr = c->dst.val; | ||
1600 | break; | ||
1601 | } | ||
1602 | break; | 1364 | break; |
1603 | case OP_MEM: | 1365 | case OP_MEM: |
1604 | if (c->lock_prefix) | 1366 | if (c->lock_prefix) |
1605 | rc = ops->cmpxchg_emulated( | 1367 | rc = segmented_cmpxchg(ctxt, |
1606 | (unsigned long)c->dst.ptr, | 1368 | c->dst.addr.mem, |
1607 | &c->dst.orig_val, | 1369 | &c->dst.orig_val, |
1608 | &c->dst.val, | 1370 | &c->dst.val, |
1609 | c->dst.bytes, | 1371 | c->dst.bytes); |
1610 | &err, | ||
1611 | ctxt->vcpu); | ||
1612 | else | 1372 | else |
1613 | rc = ops->write_emulated( | 1373 | rc = segmented_write(ctxt, |
1614 | (unsigned long)c->dst.ptr, | 1374 | c->dst.addr.mem, |
1615 | &c->dst.val, | 1375 | &c->dst.val, |
1616 | c->dst.bytes, | 1376 | c->dst.bytes); |
1617 | &err, | ||
1618 | ctxt->vcpu); | ||
1619 | if (rc == X86EMUL_PROPAGATE_FAULT) | ||
1620 | emulate_pf(ctxt, | ||
1621 | (unsigned long)c->dst.ptr, err); | ||
1622 | if (rc != X86EMUL_CONTINUE) | 1377 | if (rc != X86EMUL_CONTINUE) |
1623 | return rc; | 1378 | return rc; |
1624 | break; | 1379 | break; |
1380 | case OP_XMM: | ||
1381 | write_sse_reg(ctxt, &c->dst.vec_val, c->dst.addr.xmm); | ||
1382 | break; | ||
1625 | case OP_NONE: | 1383 | case OP_NONE: |
1626 | /* no writeback */ | 1384 | /* no writeback */ |
1627 | break; | 1385 | break; |
@@ -1631,29 +1389,30 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt, | |||
1631 | return X86EMUL_CONTINUE; | 1389 | return X86EMUL_CONTINUE; |
1632 | } | 1390 | } |
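write_register_operand(), factored out of writeback() above, encodes the usual register write-back rule: 8- and 16-bit writes merge into the low bits, while a 32-bit write in 64-bit mode replaces the whole register (zero-extension). A standalone sketch of the same rule; the helper is hypothetical and uses an explicit merge for the narrow cases, equivalent on little-endian to the pointer stores in the hunk.

#include <stdint.h>

static void write_gpr(uint64_t *reg, uint64_t val, unsigned bytes)
{
        switch (bytes) {
        case 1: *reg = (*reg & ~0xffull)   | (uint8_t)val;  break;
        case 2: *reg = (*reg & ~0xffffull) | (uint16_t)val; break;
        case 4: *reg = (uint32_t)val; break;    /* zero-extends to 64 bits */
        case 8: *reg = val; break;
        }
}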
1633 | 1391 | ||
1634 | static inline void emulate_push(struct x86_emulate_ctxt *ctxt, | 1392 | static int em_push(struct x86_emulate_ctxt *ctxt) |
1635 | struct x86_emulate_ops *ops) | ||
1636 | { | 1393 | { |
1637 | struct decode_cache *c = &ctxt->decode; | 1394 | struct decode_cache *c = &ctxt->decode; |
1395 | struct segmented_address addr; | ||
1638 | 1396 | ||
1639 | c->dst.type = OP_MEM; | ||
1640 | c->dst.bytes = c->op_bytes; | ||
1641 | c->dst.val = c->src.val; | ||
1642 | register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes); | 1397 | register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes); |
1643 | c->dst.ptr = (void *) register_address(c, ss_base(ctxt, ops), | 1398 | addr.ea = register_address(c, c->regs[VCPU_REGS_RSP]); |
1644 | c->regs[VCPU_REGS_RSP]); | 1399 | addr.seg = VCPU_SREG_SS; |
1400 | |||
1401 | /* Disable writeback. */ | ||
1402 | c->dst.type = OP_NONE; | ||
1403 | return segmented_write(ctxt, addr, &c->src.val, c->op_bytes); | ||
1645 | } | 1404 | } |
1646 | 1405 | ||
1647 | static int emulate_pop(struct x86_emulate_ctxt *ctxt, | 1406 | static int emulate_pop(struct x86_emulate_ctxt *ctxt, |
1648 | struct x86_emulate_ops *ops, | ||
1649 | void *dest, int len) | 1407 | void *dest, int len) |
1650 | { | 1408 | { |
1651 | struct decode_cache *c = &ctxt->decode; | 1409 | struct decode_cache *c = &ctxt->decode; |
1652 | int rc; | 1410 | int rc; |
1411 | struct segmented_address addr; | ||
1653 | 1412 | ||
1654 | rc = read_emulated(ctxt, ops, register_address(c, ss_base(ctxt, ops), | 1413 | addr.ea = register_address(c, c->regs[VCPU_REGS_RSP]); |
1655 | c->regs[VCPU_REGS_RSP]), | 1414 | addr.seg = VCPU_SREG_SS; |
1656 | dest, len); | 1415 | rc = segmented_read(ctxt, addr, dest, len); |
1657 | if (rc != X86EMUL_CONTINUE) | 1416 | if (rc != X86EMUL_CONTINUE) |
1658 | return rc; | 1417 | return rc; |
1659 | 1418 | ||
@@ -1661,6 +1420,13 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt, | |||
1661 | return rc; | 1420 | return rc; |
1662 | } | 1421 | } |
1663 | 1422 | ||
1423 | static int em_pop(struct x86_emulate_ctxt *ctxt) | ||
1424 | { | ||
1425 | struct decode_cache *c = &ctxt->decode; | ||
1426 | |||
1427 | return emulate_pop(ctxt, &c->dst.val, c->op_bytes); | ||
1428 | } | ||
1429 | |||
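em_push() and em_pop() above now route through the segmented helpers, but the stack model is unchanged: a push decrements RSP first and then writes at SS:RSP, a pop reads and then increments. A simplified model with a flat byte array standing in for guest stack memory; names are illustrative only.

#include <stdint.h>
#include <string.h>

static void push_sketch(uint8_t *stack_seg, uint64_t *rsp,
                        const void *val, unsigned op_bytes)
{
        *rsp -= op_bytes;                       /* decrement, then write */
        memcpy(stack_seg + *rsp, val, op_bytes);
}

static void pop_sketch(uint8_t *stack_seg, uint64_t *rsp,
                       void *val, unsigned op_bytes)
{
        memcpy(val, stack_seg + *rsp, op_bytes); /* read, then increment */
        *rsp += op_bytes;
}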
1664 | static int emulate_popf(struct x86_emulate_ctxt *ctxt, | 1430 | static int emulate_popf(struct x86_emulate_ctxt *ctxt, |
1665 | struct x86_emulate_ops *ops, | 1431 | struct x86_emulate_ops *ops, |
1666 | void *dest, int len) | 1432 | void *dest, int len) |
@@ -1668,9 +1434,9 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt, | |||
1668 | int rc; | 1434 | int rc; |
1669 | unsigned long val, change_mask; | 1435 | unsigned long val, change_mask; |
1670 | int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; | 1436 | int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; |
1671 | int cpl = ops->cpl(ctxt->vcpu); | 1437 | int cpl = ops->cpl(ctxt); |
1672 | 1438 | ||
1673 | rc = emulate_pop(ctxt, ops, &val, len); | 1439 | rc = emulate_pop(ctxt, &val, len); |
1674 | if (rc != X86EMUL_CONTINUE) | 1440 | if (rc != X86EMUL_CONTINUE) |
1675 | return rc; | 1441 | return rc; |
1676 | 1442 | ||
@@ -1687,10 +1453,8 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt, | |||
1687 | change_mask |= EFLG_IF; | 1453 | change_mask |= EFLG_IF; |
1688 | break; | 1454 | break; |
1689 | case X86EMUL_MODE_VM86: | 1455 | case X86EMUL_MODE_VM86: |
1690 | if (iopl < 3) { | 1456 | if (iopl < 3) |
1691 | emulate_gp(ctxt, 0); | 1457 | return emulate_gp(ctxt, 0); |
1692 | return X86EMUL_PROPAGATE_FAULT; | ||
1693 | } | ||
1694 | change_mask |= EFLG_IF; | 1458 | change_mask |= EFLG_IF; |
1695 | break; | 1459 | break; |
1696 | default: /* real mode */ | 1460 | default: /* real mode */ |
@@ -1704,14 +1468,24 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt, | |||
1704 | return rc; | 1468 | return rc; |
1705 | } | 1469 | } |
1706 | 1470 | ||
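emulate_popf() above now returns emulate_gp() directly from the VM86 branch. For context, the rules it enforces are the architectural POPF privilege rules: IOPL changes only at CPL 0, IF only when CPL <= IOPL, and virtual-8086 mode faults unless IOPL is 3. A hedged sketch of just that extra-mask computation; the flag macros and helper are illustrative, not the emulator's change_mask code.

#include <stdbool.h>

#define FLG_IF          (1u << 9)
#define FLG_IOPL        (3u << 12)

static bool popf_priv_mask(bool vm86, unsigned cpl, unsigned iopl,
                           unsigned *extra_mask)
{
        *extra_mask = 0;
        if (vm86) {
                if (iopl < 3)
                        return false;           /* #GP(0) */
                *extra_mask = FLG_IF;
                return true;
        }
        if (cpl == 0)
                *extra_mask = FLG_IOPL | FLG_IF;
        else if (cpl <= iopl)
                *extra_mask = FLG_IF;
        return true;                            /* other bits silently preserved */
}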
1707 | static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt, | 1471 | static int em_popf(struct x86_emulate_ctxt *ctxt) |
1708 | struct x86_emulate_ops *ops, int seg) | 1472 | { |
1473 | struct decode_cache *c = &ctxt->decode; | ||
1474 | |||
1475 | c->dst.type = OP_REG; | ||
1476 | c->dst.addr.reg = &ctxt->eflags; | ||
1477 | c->dst.bytes = c->op_bytes; | ||
1478 | return emulate_popf(ctxt, ctxt->ops, &c->dst.val, c->op_bytes); | ||
1479 | } | ||
1480 | |||
1481 | static int emulate_push_sreg(struct x86_emulate_ctxt *ctxt, | ||
1482 | struct x86_emulate_ops *ops, int seg) | ||
1709 | { | 1483 | { |
1710 | struct decode_cache *c = &ctxt->decode; | 1484 | struct decode_cache *c = &ctxt->decode; |
1711 | 1485 | ||
1712 | c->src.val = ops->get_segment_selector(seg, ctxt->vcpu); | 1486 | c->src.val = get_segment_selector(ctxt, seg); |
1713 | 1487 | ||
1714 | emulate_push(ctxt, ops); | 1488 | return em_push(ctxt); |
1715 | } | 1489 | } |
1716 | 1490 | ||
1717 | static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, | 1491 | static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, |
@@ -1721,7 +1495,7 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, | |||
1721 | unsigned long selector; | 1495 | unsigned long selector; |
1722 | int rc; | 1496 | int rc; |
1723 | 1497 | ||
1724 | rc = emulate_pop(ctxt, ops, &selector, c->op_bytes); | 1498 | rc = emulate_pop(ctxt, &selector, c->op_bytes); |
1725 | if (rc != X86EMUL_CONTINUE) | 1499 | if (rc != X86EMUL_CONTINUE) |
1726 | return rc; | 1500 | return rc; |
1727 | 1501 | ||
@@ -1729,8 +1503,7 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, | |||
1729 | return rc; | 1503 | return rc; |
1730 | } | 1504 | } |
1731 | 1505 | ||
1732 | static int emulate_pusha(struct x86_emulate_ctxt *ctxt, | 1506 | static int em_pusha(struct x86_emulate_ctxt *ctxt) |
1733 | struct x86_emulate_ops *ops) | ||
1734 | { | 1507 | { |
1735 | struct decode_cache *c = &ctxt->decode; | 1508 | struct decode_cache *c = &ctxt->decode; |
1736 | unsigned long old_esp = c->regs[VCPU_REGS_RSP]; | 1509 | unsigned long old_esp = c->regs[VCPU_REGS_RSP]; |
@@ -1741,23 +1514,25 @@ static int emulate_pusha(struct x86_emulate_ctxt *ctxt, | |||
1741 | (reg == VCPU_REGS_RSP) ? | 1514 | (reg == VCPU_REGS_RSP) ? |
1742 | (c->src.val = old_esp) : (c->src.val = c->regs[reg]); | 1515 | (c->src.val = old_esp) : (c->src.val = c->regs[reg]); |
1743 | 1516 | ||
1744 | emulate_push(ctxt, ops); | 1517 | rc = em_push(ctxt); |
1745 | |||
1746 | rc = writeback(ctxt, ops); | ||
1747 | if (rc != X86EMUL_CONTINUE) | 1518 | if (rc != X86EMUL_CONTINUE) |
1748 | return rc; | 1519 | return rc; |
1749 | 1520 | ||
1750 | ++reg; | 1521 | ++reg; |
1751 | } | 1522 | } |
1752 | 1523 | ||
1753 | /* Disable writeback. */ | ||
1754 | c->dst.type = OP_NONE; | ||
1755 | |||
1756 | return rc; | 1524 | return rc; |
1757 | } | 1525 | } |
1758 | 1526 | ||
1759 | static int emulate_popa(struct x86_emulate_ctxt *ctxt, | 1527 | static int em_pushf(struct x86_emulate_ctxt *ctxt) |
1760 | struct x86_emulate_ops *ops) | 1528 | { |
1529 | struct decode_cache *c = &ctxt->decode; | ||
1530 | |||
1531 | c->src.val = (unsigned long)ctxt->eflags; | ||
1532 | return em_push(ctxt); | ||
1533 | } | ||
1534 | |||
1535 | static int em_popa(struct x86_emulate_ctxt *ctxt) | ||
1761 | { | 1536 | { |
1762 | struct decode_cache *c = &ctxt->decode; | 1537 | struct decode_cache *c = &ctxt->decode; |
1763 | int rc = X86EMUL_CONTINUE; | 1538 | int rc = X86EMUL_CONTINUE; |
@@ -1770,7 +1545,7 @@ static int emulate_popa(struct x86_emulate_ctxt *ctxt, | |||
1770 | --reg; | 1545 | --reg; |
1771 | } | 1546 | } |
1772 | 1547 | ||
1773 | rc = emulate_pop(ctxt, ops, &c->regs[reg], c->op_bytes); | 1548 | rc = emulate_pop(ctxt, &c->regs[reg], c->op_bytes); |
1774 | if (rc != X86EMUL_CONTINUE) | 1549 | if (rc != X86EMUL_CONTINUE) |
1775 | break; | 1550 | break; |
1776 | --reg; | 1551 | --reg; |
@@ -1778,15 +1553,167 @@ static int emulate_popa(struct x86_emulate_ctxt *ctxt, | |||
1778 | return rc; | 1553 | return rc; |
1779 | } | 1554 | } |
1780 | 1555 | ||
1781 | static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt, | 1556 | int emulate_int_real(struct x86_emulate_ctxt *ctxt, |
1782 | struct x86_emulate_ops *ops) | 1557 | struct x86_emulate_ops *ops, int irq) |
1783 | { | 1558 | { |
1784 | struct decode_cache *c = &ctxt->decode; | 1559 | struct decode_cache *c = &ctxt->decode; |
1560 | int rc; | ||
1561 | struct desc_ptr dt; | ||
1562 | gva_t cs_addr; | ||
1563 | gva_t eip_addr; | ||
1564 | u16 cs, eip; | ||
1785 | 1565 | ||
1786 | return emulate_pop(ctxt, ops, &c->dst.val, c->dst.bytes); | 1566 | /* TODO: Add limit checks */ |
1567 | c->src.val = ctxt->eflags; | ||
1568 | rc = em_push(ctxt); | ||
1569 | if (rc != X86EMUL_CONTINUE) | ||
1570 | return rc; | ||
1571 | |||
1572 | ctxt->eflags &= ~(EFLG_IF | EFLG_TF | EFLG_AC); | ||
1573 | |||
1574 | c->src.val = get_segment_selector(ctxt, VCPU_SREG_CS); | ||
1575 | rc = em_push(ctxt); | ||
1576 | if (rc != X86EMUL_CONTINUE) | ||
1577 | return rc; | ||
1578 | |||
1579 | c->src.val = c->eip; | ||
1580 | rc = em_push(ctxt); | ||
1581 | if (rc != X86EMUL_CONTINUE) | ||
1582 | return rc; | ||
1583 | |||
1584 | ops->get_idt(ctxt, &dt); | ||
1585 | |||
1586 | eip_addr = dt.address + (irq << 2); | ||
1587 | cs_addr = dt.address + (irq << 2) + 2; | ||
1588 | |||
1589 | rc = ops->read_std(ctxt, cs_addr, &cs, 2, &ctxt->exception); | ||
1590 | if (rc != X86EMUL_CONTINUE) | ||
1591 | return rc; | ||
1592 | |||
1593 | rc = ops->read_std(ctxt, eip_addr, &eip, 2, &ctxt->exception); | ||
1594 | if (rc != X86EMUL_CONTINUE) | ||
1595 | return rc; | ||
1596 | |||
1597 | rc = load_segment_descriptor(ctxt, ops, cs, VCPU_SREG_CS); | ||
1598 | if (rc != X86EMUL_CONTINUE) | ||
1599 | return rc; | ||
1600 | |||
1601 | c->eip = eip; | ||
1602 | |||
1603 | return rc; | ||
1787 | } | 1604 | } |
1788 | 1605 | ||
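emulate_int_real(), added above, pushes FLAGS, CS and IP and then fetches the new CS:IP from the real-mode interrupt vector table: each entry is 4 bytes at irq * 4, offset word first, segment word second. A sketch of that lookup with a stand-in guest-memory read callback in place of read_std.

#include <stdint.h>

typedef int (*read_guest_u16_fn)(uint64_t addr, uint16_t *val);

static int ivt_lookup(read_guest_u16_fn read_guest_u16, uint64_t ivt_base,
                      unsigned irq, uint16_t *cs, uint16_t *ip)
{
        int rc = read_guest_u16(ivt_base + irq * 4, ip);        /* offset word */

        if (rc != 0)
                return rc;
        return read_guest_u16(ivt_base + irq * 4 + 2, cs);      /* segment word */
}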
1789 | static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt) | 1606 | static int emulate_int(struct x86_emulate_ctxt *ctxt, |
1607 | struct x86_emulate_ops *ops, int irq) | ||
1608 | { | ||
1609 | switch(ctxt->mode) { | ||
1610 | case X86EMUL_MODE_REAL: | ||
1611 | return emulate_int_real(ctxt, ops, irq); | ||
1612 | case X86EMUL_MODE_VM86: | ||
1613 | case X86EMUL_MODE_PROT16: | ||
1614 | case X86EMUL_MODE_PROT32: | ||
1615 | case X86EMUL_MODE_PROT64: | ||
1616 | default: | ||
1617 | /* Protected mode interrupts unimplemented yet */ | ||
1618 | return X86EMUL_UNHANDLEABLE; | ||
1619 | } | ||
1620 | } | ||
1621 | |||
1622 | static int emulate_iret_real(struct x86_emulate_ctxt *ctxt, | ||
1623 | struct x86_emulate_ops *ops) | ||
1624 | { | ||
1625 | struct decode_cache *c = &ctxt->decode; | ||
1626 | int rc = X86EMUL_CONTINUE; | ||
1627 | unsigned long temp_eip = 0; | ||
1628 | unsigned long temp_eflags = 0; | ||
1629 | unsigned long cs = 0; | ||
1630 | unsigned long mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_TF | | ||
1631 | EFLG_IF | EFLG_DF | EFLG_OF | EFLG_IOPL | EFLG_NT | EFLG_RF | | ||
1632 | EFLG_AC | EFLG_ID | (1 << 1); /* Last one is the reserved bit */ | ||
1633 | unsigned long vm86_mask = EFLG_VM | EFLG_VIF | EFLG_VIP; | ||
1634 | |||
1635 | /* TODO: Add stack limit check */ | ||
1636 | |||
1637 | rc = emulate_pop(ctxt, &temp_eip, c->op_bytes); | ||
1638 | |||
1639 | if (rc != X86EMUL_CONTINUE) | ||
1640 | return rc; | ||
1641 | |||
1642 | if (temp_eip & ~0xffff) | ||
1643 | return emulate_gp(ctxt, 0); | ||
1644 | |||
1645 | rc = emulate_pop(ctxt, &cs, c->op_bytes); | ||
1646 | |||
1647 | if (rc != X86EMUL_CONTINUE) | ||
1648 | return rc; | ||
1649 | |||
1650 | rc = emulate_pop(ctxt, &temp_eflags, c->op_bytes); | ||
1651 | |||
1652 | if (rc != X86EMUL_CONTINUE) | ||
1653 | return rc; | ||
1654 | |||
1655 | rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS); | ||
1656 | |||
1657 | if (rc != X86EMUL_CONTINUE) | ||
1658 | return rc; | ||
1659 | |||
1660 | c->eip = temp_eip; | ||
1661 | |||
1662 | |||
1663 | if (c->op_bytes == 4) | ||
1664 | ctxt->eflags = ((temp_eflags & mask) | (ctxt->eflags & vm86_mask)); | ||
1665 | else if (c->op_bytes == 2) { | ||
1666 | ctxt->eflags &= ~0xffff; | ||
1667 | ctxt->eflags |= temp_eflags; | ||
1668 | } | ||
1669 | |||
1670 | ctxt->eflags &= ~EFLG_RESERVED_ZEROS_MASK; /* Clear reserved zeros */ | ||
1671 | ctxt->eflags |= EFLG_RESERVED_ONE_MASK; | ||
1672 | |||
1673 | return rc; | ||
1674 | } | ||
1675 | |||
1676 | static inline int emulate_iret(struct x86_emulate_ctxt *ctxt, | ||
1677 | struct x86_emulate_ops* ops) | ||
1678 | { | ||
1679 | switch(ctxt->mode) { | ||
1680 | case X86EMUL_MODE_REAL: | ||
1681 | return emulate_iret_real(ctxt, ops); | ||
1682 | case X86EMUL_MODE_VM86: | ||
1683 | case X86EMUL_MODE_PROT16: | ||
1684 | case X86EMUL_MODE_PROT32: | ||
1685 | case X86EMUL_MODE_PROT64: | ||
1686 | default: | ||
1687 | /* iret from protected mode unimplemented yet */ | ||
1688 | return X86EMUL_UNHANDLEABLE; | ||
1689 | } | ||
1690 | } | ||
1691 | |||
1692 | static int em_jmp_far(struct x86_emulate_ctxt *ctxt) | ||
1693 | { | ||
1694 | struct decode_cache *c = &ctxt->decode; | ||
1695 | int rc; | ||
1696 | unsigned short sel; | ||
1697 | |||
1698 | memcpy(&sel, c->src.valptr + c->op_bytes, 2); | ||
1699 | |||
1700 | rc = load_segment_descriptor(ctxt, ctxt->ops, sel, VCPU_SREG_CS); | ||
1701 | if (rc != X86EMUL_CONTINUE) | ||
1702 | return rc; | ||
1703 | |||
1704 | c->eip = 0; | ||
1705 | memcpy(&c->eip, c->src.valptr, c->op_bytes); | ||
1706 | return X86EMUL_CONTINUE; | ||
1707 | } | ||
1708 | |||
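em_jmp_far() above (and emulate_load_segment() later in this series) decode a far pointer as an op_bytes-wide offset followed by a 16-bit selector. A small sketch of that split, assuming little-endian guest data just as the memcpy in the hunk does; the helper is hypothetical.

#include <stdint.h>
#include <string.h>

static void split_far_pointer(const uint8_t *valptr, unsigned op_bytes,
                              uint64_t *offset, uint16_t *selector)
{
        *offset = 0;
        memcpy(offset, valptr, op_bytes);       /* offset comes first */
        memcpy(selector, valptr + op_bytes, 2); /* then the 16-bit selector */
}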
1709 | static int em_grp1a(struct x86_emulate_ctxt *ctxt) | ||
1710 | { | ||
1711 | struct decode_cache *c = &ctxt->decode; | ||
1712 | |||
1713 | return emulate_pop(ctxt, &c->dst.val, c->dst.bytes); | ||
1714 | } | ||
1715 | |||
1716 | static int em_grp2(struct x86_emulate_ctxt *ctxt) | ||
1790 | { | 1717 | { |
1791 | struct decode_cache *c = &ctxt->decode; | 1718 | struct decode_cache *c = &ctxt->decode; |
1792 | switch (c->modrm_reg) { | 1719 | switch (c->modrm_reg) { |
@@ -1813,12 +1740,15 @@ static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt) | |||
1813 | emulate_2op_SrcB("sar", c->src, c->dst, ctxt->eflags); | 1740 | emulate_2op_SrcB("sar", c->src, c->dst, ctxt->eflags); |
1814 | break; | 1741 | break; |
1815 | } | 1742 | } |
1743 | return X86EMUL_CONTINUE; | ||
1816 | } | 1744 | } |
1817 | 1745 | ||
1818 | static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt, | 1746 | static int em_grp3(struct x86_emulate_ctxt *ctxt) |
1819 | struct x86_emulate_ops *ops) | ||
1820 | { | 1747 | { |
1821 | struct decode_cache *c = &ctxt->decode; | 1748 | struct decode_cache *c = &ctxt->decode; |
1749 | unsigned long *rax = &c->regs[VCPU_REGS_RAX]; | ||
1750 | unsigned long *rdx = &c->regs[VCPU_REGS_RDX]; | ||
1751 | u8 de = 0; | ||
1822 | 1752 | ||
1823 | switch (c->modrm_reg) { | 1753 | switch (c->modrm_reg) { |
1824 | case 0 ... 1: /* test */ | 1754 | case 0 ... 1: /* test */ |
@@ -1830,16 +1760,32 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt, | |||
1830 | case 3: /* neg */ | 1760 | case 3: /* neg */ |
1831 | emulate_1op("neg", c->dst, ctxt->eflags); | 1761 | emulate_1op("neg", c->dst, ctxt->eflags); |
1832 | break; | 1762 | break; |
1763 | case 4: /* mul */ | ||
1764 | emulate_1op_rax_rdx("mul", c->src, *rax, *rdx, ctxt->eflags); | ||
1765 | break; | ||
1766 | case 5: /* imul */ | ||
1767 | emulate_1op_rax_rdx("imul", c->src, *rax, *rdx, ctxt->eflags); | ||
1768 | break; | ||
1769 | case 6: /* div */ | ||
1770 | emulate_1op_rax_rdx_ex("div", c->src, *rax, *rdx, | ||
1771 | ctxt->eflags, de); | ||
1772 | break; | ||
1773 | case 7: /* idiv */ | ||
1774 | emulate_1op_rax_rdx_ex("idiv", c->src, *rax, *rdx, | ||
1775 | ctxt->eflags, de); | ||
1776 | break; | ||
1833 | default: | 1777 | default: |
1834 | return 0; | 1778 | return X86EMUL_UNHANDLEABLE; |
1835 | } | 1779 | } |
1836 | return 1; | 1780 | if (de) |
1781 | return emulate_de(ctxt); | ||
1782 | return X86EMUL_CONTINUE; | ||
1837 | } | 1783 | } |
1838 | 1784 | ||
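The new mul/div cases in em_grp3() above use the implicit RDX:RAX operand pair and map a divide fault to emulate_de(). As an illustration of the 64-bit unsigned case only: the dividend is RDX:RAX, the quotient lands in RAX, the remainder in RDX, and a zero divisor or an oversized quotient is #DE. A sketch using the GCC/Clang __int128 extension; names are illustrative.

#include <stdint.h>
#include <stdbool.h>

static bool div_u128_by_u64(uint64_t *rax, uint64_t *rdx, uint64_t divisor)
{
        unsigned __int128 dividend = ((unsigned __int128)*rdx << 64) | *rax;

        if (divisor == 0 || dividend / divisor > UINT64_MAX)
                return false;                   /* #DE: divide error */
        *rax = dividend / divisor;              /* quotient */
        *rdx = dividend % divisor;              /* remainder */
        return true;
}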
1839 | static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt, | 1785 | static int em_grp45(struct x86_emulate_ctxt *ctxt) |
1840 | struct x86_emulate_ops *ops) | ||
1841 | { | 1786 | { |
1842 | struct decode_cache *c = &ctxt->decode; | 1787 | struct decode_cache *c = &ctxt->decode; |
1788 | int rc = X86EMUL_CONTINUE; | ||
1843 | 1789 | ||
1844 | switch (c->modrm_reg) { | 1790 | switch (c->modrm_reg) { |
1845 | case 0: /* inc */ | 1791 | case 0: /* inc */ |
@@ -1853,21 +1799,23 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt, | |||
1853 | old_eip = c->eip; | 1799 | old_eip = c->eip; |
1854 | c->eip = c->src.val; | 1800 | c->eip = c->src.val; |
1855 | c->src.val = old_eip; | 1801 | c->src.val = old_eip; |
1856 | emulate_push(ctxt, ops); | 1802 | rc = em_push(ctxt); |
1857 | break; | 1803 | break; |
1858 | } | 1804 | } |
1859 | case 4: /* jmp abs */ | 1805 | case 4: /* jmp abs */ |
1860 | c->eip = c->src.val; | 1806 | c->eip = c->src.val; |
1861 | break; | 1807 | break; |
1808 | case 5: /* jmp far */ | ||
1809 | rc = em_jmp_far(ctxt); | ||
1810 | break; | ||
1862 | case 6: /* push */ | 1811 | case 6: /* push */ |
1863 | emulate_push(ctxt, ops); | 1812 | rc = em_push(ctxt); |
1864 | break; | 1813 | break; |
1865 | } | 1814 | } |
1866 | return X86EMUL_CONTINUE; | 1815 | return rc; |
1867 | } | 1816 | } |
1868 | 1817 | ||
1869 | static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt, | 1818 | static int em_grp9(struct x86_emulate_ctxt *ctxt) |
1870 | struct x86_emulate_ops *ops) | ||
1871 | { | 1819 | { |
1872 | struct decode_cache *c = &ctxt->decode; | 1820 | struct decode_cache *c = &ctxt->decode; |
1873 | u64 old = c->dst.orig_val64; | 1821 | u64 old = c->dst.orig_val64; |
@@ -1893,25 +1841,44 @@ static int emulate_ret_far(struct x86_emulate_ctxt *ctxt, | |||
1893 | int rc; | 1841 | int rc; |
1894 | unsigned long cs; | 1842 | unsigned long cs; |
1895 | 1843 | ||
1896 | rc = emulate_pop(ctxt, ops, &c->eip, c->op_bytes); | 1844 | rc = emulate_pop(ctxt, &c->eip, c->op_bytes); |
1897 | if (rc != X86EMUL_CONTINUE) | 1845 | if (rc != X86EMUL_CONTINUE) |
1898 | return rc; | 1846 | return rc; |
1899 | if (c->op_bytes == 4) | 1847 | if (c->op_bytes == 4) |
1900 | c->eip = (u32)c->eip; | 1848 | c->eip = (u32)c->eip; |
1901 | rc = emulate_pop(ctxt, ops, &cs, c->op_bytes); | 1849 | rc = emulate_pop(ctxt, &cs, c->op_bytes); |
1902 | if (rc != X86EMUL_CONTINUE) | 1850 | if (rc != X86EMUL_CONTINUE) |
1903 | return rc; | 1851 | return rc; |
1904 | rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS); | 1852 | rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS); |
1905 | return rc; | 1853 | return rc; |
1906 | } | 1854 | } |
1907 | 1855 | ||
1856 | static int emulate_load_segment(struct x86_emulate_ctxt *ctxt, | ||
1857 | struct x86_emulate_ops *ops, int seg) | ||
1858 | { | ||
1859 | struct decode_cache *c = &ctxt->decode; | ||
1860 | unsigned short sel; | ||
1861 | int rc; | ||
1862 | |||
1863 | memcpy(&sel, c->src.valptr + c->op_bytes, 2); | ||
1864 | |||
1865 | rc = load_segment_descriptor(ctxt, ops, sel, seg); | ||
1866 | if (rc != X86EMUL_CONTINUE) | ||
1867 | return rc; | ||
1868 | |||
1869 | c->dst.val = c->src.val; | ||
1870 | return rc; | ||
1871 | } | ||
1872 | |||
1908 | static inline void | 1873 | static inline void |
1909 | setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, | 1874 | setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, |
1910 | struct x86_emulate_ops *ops, struct desc_struct *cs, | 1875 | struct x86_emulate_ops *ops, struct desc_struct *cs, |
1911 | struct desc_struct *ss) | 1876 | struct desc_struct *ss) |
1912 | { | 1877 | { |
1878 | u16 selector; | ||
1879 | |||
1913 | memset(cs, 0, sizeof(struct desc_struct)); | 1880 | memset(cs, 0, sizeof(struct desc_struct)); |
1914 | ops->get_cached_descriptor(cs, VCPU_SREG_CS, ctxt->vcpu); | 1881 | ops->get_segment(ctxt, &selector, cs, NULL, VCPU_SREG_CS); |
1915 | memset(ss, 0, sizeof(struct desc_struct)); | 1882 | memset(ss, 0, sizeof(struct desc_struct)); |
1916 | 1883 | ||
1917 | cs->l = 0; /* will be adjusted later */ | 1884 | cs->l = 0; /* will be adjusted later */ |
@@ -1941,46 +1908,44 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | |||
1941 | struct desc_struct cs, ss; | 1908 | struct desc_struct cs, ss; |
1942 | u64 msr_data; | 1909 | u64 msr_data; |
1943 | u16 cs_sel, ss_sel; | 1910 | u16 cs_sel, ss_sel; |
1911 | u64 efer = 0; | ||
1944 | 1912 | ||
1945 | /* syscall is not available in real mode */ | 1913 | /* syscall is not available in real mode */ |
1946 | if (ctxt->mode == X86EMUL_MODE_REAL || | 1914 | if (ctxt->mode == X86EMUL_MODE_REAL || |
1947 | ctxt->mode == X86EMUL_MODE_VM86) { | 1915 | ctxt->mode == X86EMUL_MODE_VM86) |
1948 | emulate_ud(ctxt); | 1916 | return emulate_ud(ctxt); |
1949 | return X86EMUL_PROPAGATE_FAULT; | ||
1950 | } | ||
1951 | 1917 | ||
1918 | ops->get_msr(ctxt, MSR_EFER, &efer); | ||
1952 | setup_syscalls_segments(ctxt, ops, &cs, &ss); | 1919 | setup_syscalls_segments(ctxt, ops, &cs, &ss); |
1953 | 1920 | ||
1954 | ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); | 1921 | ops->get_msr(ctxt, MSR_STAR, &msr_data); |
1955 | msr_data >>= 32; | 1922 | msr_data >>= 32; |
1956 | cs_sel = (u16)(msr_data & 0xfffc); | 1923 | cs_sel = (u16)(msr_data & 0xfffc); |
1957 | ss_sel = (u16)(msr_data + 8); | 1924 | ss_sel = (u16)(msr_data + 8); |
1958 | 1925 | ||
1959 | if (is_long_mode(ctxt->vcpu)) { | 1926 | if (efer & EFER_LMA) { |
1960 | cs.d = 0; | 1927 | cs.d = 0; |
1961 | cs.l = 1; | 1928 | cs.l = 1; |
1962 | } | 1929 | } |
1963 | ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu); | 1930 | ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS); |
1964 | ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); | 1931 | ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); |
1965 | ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu); | ||
1966 | ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu); | ||
1967 | 1932 | ||
1968 | c->regs[VCPU_REGS_RCX] = c->eip; | 1933 | c->regs[VCPU_REGS_RCX] = c->eip; |
1969 | if (is_long_mode(ctxt->vcpu)) { | 1934 | if (efer & EFER_LMA) { |
1970 | #ifdef CONFIG_X86_64 | 1935 | #ifdef CONFIG_X86_64 |
1971 | c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF; | 1936 | c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF; |
1972 | 1937 | ||
1973 | ops->get_msr(ctxt->vcpu, | 1938 | ops->get_msr(ctxt, |
1974 | ctxt->mode == X86EMUL_MODE_PROT64 ? | 1939 | ctxt->mode == X86EMUL_MODE_PROT64 ? |
1975 | MSR_LSTAR : MSR_CSTAR, &msr_data); | 1940 | MSR_LSTAR : MSR_CSTAR, &msr_data); |
1976 | c->eip = msr_data; | 1941 | c->eip = msr_data; |
1977 | 1942 | ||
1978 | ops->get_msr(ctxt->vcpu, MSR_SYSCALL_MASK, &msr_data); | 1943 | ops->get_msr(ctxt, MSR_SYSCALL_MASK, &msr_data); |
1979 | ctxt->eflags &= ~(msr_data | EFLG_RF); | 1944 | ctxt->eflags &= ~(msr_data | EFLG_RF); |
1980 | #endif | 1945 | #endif |
1981 | } else { | 1946 | } else { |
1982 | /* legacy mode */ | 1947 | /* legacy mode */ |
1983 | ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); | 1948 | ops->get_msr(ctxt, MSR_STAR, &msr_data); |
1984 | c->eip = (u32)msr_data; | 1949 | c->eip = (u32)msr_data; |
1985 | 1950 | ||
1986 | ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); | 1951 | ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); |
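emulate_syscall() above derives the target selectors from MSR_STAR: bits 47:32 give the kernel CS (RPL forced to 0 in the hunk via the 0xfffc mask) and SS is defined as that selector plus 8. A simplified view of the arithmetic; the helper is hypothetical.

#include <stdint.h>

static void star_to_syscall_selectors(uint64_t star, uint16_t *cs, uint16_t *ss)
{
        uint16_t sel = (star >> 32) & 0xfffc;   /* SYSCALL CS, RPL forced to 0 */

        *cs = sel;
        *ss = sel + 8;                          /* SS follows CS in the GDT */
}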
@@ -1996,36 +1961,30 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | |||
1996 | struct desc_struct cs, ss; | 1961 | struct desc_struct cs, ss; |
1997 | u64 msr_data; | 1962 | u64 msr_data; |
1998 | u16 cs_sel, ss_sel; | 1963 | u16 cs_sel, ss_sel; |
1964 | u64 efer = 0; | ||
1999 | 1965 | ||
1966 | ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); | ||
2000 | /* inject #GP if in real mode */ | 1967 | /* inject #GP if in real mode */ |
2001 | if (ctxt->mode == X86EMUL_MODE_REAL) { | 1968 | if (ctxt->mode == X86EMUL_MODE_REAL) |
2002 | emulate_gp(ctxt, 0); | 1969 | return emulate_gp(ctxt, 0); |
2003 | return X86EMUL_PROPAGATE_FAULT; | ||
2004 | } | ||
2005 | 1970 | ||
2006 | /* XXX sysenter/sysexit have not been tested in 64bit mode. | 1971 | /* XXX sysenter/sysexit have not been tested in 64bit mode. |
2007 | * Therefore, we inject an #UD. | 1972 | * Therefore, we inject an #UD. |
2008 | */ | 1973 | */ |
2009 | if (ctxt->mode == X86EMUL_MODE_PROT64) { | 1974 | if (ctxt->mode == X86EMUL_MODE_PROT64) |
2010 | emulate_ud(ctxt); | 1975 | return emulate_ud(ctxt); |
2011 | return X86EMUL_PROPAGATE_FAULT; | ||
2012 | } | ||
2013 | 1976 | ||
2014 | setup_syscalls_segments(ctxt, ops, &cs, &ss); | 1977 | setup_syscalls_segments(ctxt, ops, &cs, &ss); |
2015 | 1978 | ||
2016 | ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); | 1979 | ops->get_msr(ctxt, MSR_IA32_SYSENTER_CS, &msr_data); |
2017 | switch (ctxt->mode) { | 1980 | switch (ctxt->mode) { |
2018 | case X86EMUL_MODE_PROT32: | 1981 | case X86EMUL_MODE_PROT32: |
2019 | if ((msr_data & 0xfffc) == 0x0) { | 1982 | if ((msr_data & 0xfffc) == 0x0) |
2020 | emulate_gp(ctxt, 0); | 1983 | return emulate_gp(ctxt, 0); |
2021 | return X86EMUL_PROPAGATE_FAULT; | ||
2022 | } | ||
2023 | break; | 1984 | break; |
2024 | case X86EMUL_MODE_PROT64: | 1985 | case X86EMUL_MODE_PROT64: |
2025 | if (msr_data == 0x0) { | 1986 | if (msr_data == 0x0) |
2026 | emulate_gp(ctxt, 0); | 1987 | return emulate_gp(ctxt, 0); |
2027 | return X86EMUL_PROPAGATE_FAULT; | ||
2028 | } | ||
2029 | break; | 1988 | break; |
2030 | } | 1989 | } |
2031 | 1990 | ||
@@ -2034,21 +1993,18 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | |||
2034 | cs_sel &= ~SELECTOR_RPL_MASK; | 1993 | cs_sel &= ~SELECTOR_RPL_MASK; |
2035 | ss_sel = cs_sel + 8; | 1994 | ss_sel = cs_sel + 8; |
2036 | ss_sel &= ~SELECTOR_RPL_MASK; | 1995 | ss_sel &= ~SELECTOR_RPL_MASK; |
2037 | if (ctxt->mode == X86EMUL_MODE_PROT64 | 1996 | if (ctxt->mode == X86EMUL_MODE_PROT64 || (efer & EFER_LMA)) { |
2038 | || is_long_mode(ctxt->vcpu)) { | ||
2039 | cs.d = 0; | 1997 | cs.d = 0; |
2040 | cs.l = 1; | 1998 | cs.l = 1; |
2041 | } | 1999 | } |
2042 | 2000 | ||
2043 | ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu); | 2001 | ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS); |
2044 | ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); | 2002 | ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); |
2045 | ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu); | ||
2046 | ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu); | ||
2047 | 2003 | ||
2048 | ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data); | 2004 | ops->get_msr(ctxt, MSR_IA32_SYSENTER_EIP, &msr_data); |
2049 | c->eip = msr_data; | 2005 | c->eip = msr_data; |
2050 | 2006 | ||
2051 | ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data); | 2007 | ops->get_msr(ctxt, MSR_IA32_SYSENTER_ESP, &msr_data); |
2052 | c->regs[VCPU_REGS_RSP] = msr_data; | 2008 | c->regs[VCPU_REGS_RSP] = msr_data; |
2053 | 2009 | ||
2054 | return X86EMUL_CONTINUE; | 2010 | return X86EMUL_CONTINUE; |
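emulate_sysenter() above builds the entry state from the three SYSENTER MSRs: CS from IA32_SYSENTER_CS (with SS = CS + 8), RIP from IA32_SYSENTER_EIP and RSP from IA32_SYSENTER_ESP. A sketch with a placeholder rdmsr callback; the MSR indices are the architectural ones, not taken from this hunk.

#include <stdint.h>

struct sysenter_state { uint16_t cs, ss; uint64_t rip, rsp; };

typedef uint64_t (*rdmsr_fn)(uint32_t msr);

#define IA32_SYSENTER_CS        0x174
#define IA32_SYSENTER_ESP       0x175
#define IA32_SYSENTER_EIP       0x176

static void sysenter_target(rdmsr_fn rdmsr, struct sysenter_state *st)
{
        uint16_t cs = rdmsr(IA32_SYSENTER_CS) & 0xfffc; /* RPL forced to 0 */

        st->cs  = cs;
        st->ss  = cs + 8;
        st->rip = rdmsr(IA32_SYSENTER_EIP);
        st->rsp = rdmsr(IA32_SYSENTER_ESP);
}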
@@ -2065,10 +2021,8 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | |||
2065 | 2021 | ||
2066 | /* inject #GP if in real mode or Virtual 8086 mode */ | 2022 | /* inject #GP if in real mode or Virtual 8086 mode */ |
2067 | if (ctxt->mode == X86EMUL_MODE_REAL || | 2023 | if (ctxt->mode == X86EMUL_MODE_REAL || |
2068 | ctxt->mode == X86EMUL_MODE_VM86) { | 2024 | ctxt->mode == X86EMUL_MODE_VM86) |
2069 | emulate_gp(ctxt, 0); | 2025 | return emulate_gp(ctxt, 0); |
2070 | return X86EMUL_PROPAGATE_FAULT; | ||
2071 | } | ||
2072 | 2026 | ||
2073 | setup_syscalls_segments(ctxt, ops, &cs, &ss); | 2027 | setup_syscalls_segments(ctxt, ops, &cs, &ss); |
2074 | 2028 | ||
@@ -2079,22 +2033,18 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | |||
2079 | 2033 | ||
2080 | cs.dpl = 3; | 2034 | cs.dpl = 3; |
2081 | ss.dpl = 3; | 2035 | ss.dpl = 3; |
2082 | ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); | 2036 | ops->get_msr(ctxt, MSR_IA32_SYSENTER_CS, &msr_data); |
2083 | switch (usermode) { | 2037 | switch (usermode) { |
2084 | case X86EMUL_MODE_PROT32: | 2038 | case X86EMUL_MODE_PROT32: |
2085 | cs_sel = (u16)(msr_data + 16); | 2039 | cs_sel = (u16)(msr_data + 16); |
2086 | if ((msr_data & 0xfffc) == 0x0) { | 2040 | if ((msr_data & 0xfffc) == 0x0) |
2087 | emulate_gp(ctxt, 0); | 2041 | return emulate_gp(ctxt, 0); |
2088 | return X86EMUL_PROPAGATE_FAULT; | ||
2089 | } | ||
2090 | ss_sel = (u16)(msr_data + 24); | 2042 | ss_sel = (u16)(msr_data + 24); |
2091 | break; | 2043 | break; |
2092 | case X86EMUL_MODE_PROT64: | 2044 | case X86EMUL_MODE_PROT64: |
2093 | cs_sel = (u16)(msr_data + 32); | 2045 | cs_sel = (u16)(msr_data + 32); |
2094 | if (msr_data == 0x0) { | 2046 | if (msr_data == 0x0) |
2095 | emulate_gp(ctxt, 0); | 2047 | return emulate_gp(ctxt, 0); |
2096 | return X86EMUL_PROPAGATE_FAULT; | ||
2097 | } | ||
2098 | ss_sel = cs_sel + 8; | 2048 | ss_sel = cs_sel + 8; |
2099 | cs.d = 0; | 2049 | cs.d = 0; |
2100 | cs.l = 1; | 2050 | cs.l = 1; |
@@ -2103,10 +2053,8 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | |||
2103 | cs_sel |= SELECTOR_RPL_MASK; | 2053 | cs_sel |= SELECTOR_RPL_MASK; |
2104 | ss_sel |= SELECTOR_RPL_MASK; | 2054 | ss_sel |= SELECTOR_RPL_MASK; |
2105 | 2055 | ||
2106 | ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu); | 2056 | ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS); |
2107 | ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); | 2057 | ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); |
2108 | ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu); | ||
2109 | ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu); | ||
2110 | 2058 | ||
2111 | c->eip = c->regs[VCPU_REGS_RDX]; | 2059 | c->eip = c->regs[VCPU_REGS_RDX]; |
2112 | c->regs[VCPU_REGS_RSP] = c->regs[VCPU_REGS_RCX]; | 2060 | c->regs[VCPU_REGS_RSP] = c->regs[VCPU_REGS_RCX]; |
@@ -2123,7 +2071,7 @@ static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt, | |||
2123 | if (ctxt->mode == X86EMUL_MODE_VM86) | 2071 | if (ctxt->mode == X86EMUL_MODE_VM86) |
2124 | return true; | 2072 | return true; |
2125 | iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; | 2073 | iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; |
2126 | return ops->cpl(ctxt->vcpu) > iopl; | 2074 | return ops->cpl(ctxt) > iopl; |
2127 | } | 2075 | } |
2128 | 2076 | ||
2129 | static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, | 2077 | static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, |
@@ -2131,24 +2079,27 @@ static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, | |||
2131 | u16 port, u16 len) | 2079 | u16 port, u16 len) |
2132 | { | 2080 | { |
2133 | struct desc_struct tr_seg; | 2081 | struct desc_struct tr_seg; |
2082 | u32 base3; | ||
2134 | int r; | 2083 | int r; |
2135 | u16 io_bitmap_ptr; | 2084 | u16 tr, io_bitmap_ptr, perm, bit_idx = port & 0x7; |
2136 | u8 perm, bit_idx = port & 0x7; | ||
2137 | unsigned mask = (1 << len) - 1; | 2085 | unsigned mask = (1 << len) - 1; |
2086 | unsigned long base; | ||
2138 | 2087 | ||
2139 | ops->get_cached_descriptor(&tr_seg, VCPU_SREG_TR, ctxt->vcpu); | 2088 | ops->get_segment(ctxt, &tr, &tr_seg, &base3, VCPU_SREG_TR); |
2140 | if (!tr_seg.p) | 2089 | if (!tr_seg.p) |
2141 | return false; | 2090 | return false; |
2142 | if (desc_limit_scaled(&tr_seg) < 103) | 2091 | if (desc_limit_scaled(&tr_seg) < 103) |
2143 | return false; | 2092 | return false; |
2144 | r = ops->read_std(get_desc_base(&tr_seg) + 102, &io_bitmap_ptr, 2, | 2093 | base = get_desc_base(&tr_seg); |
2145 | ctxt->vcpu, NULL); | 2094 | #ifdef CONFIG_X86_64 |
2095 | base |= ((u64)base3) << 32; | ||
2096 | #endif | ||
2097 | r = ops->read_std(ctxt, base + 102, &io_bitmap_ptr, 2, NULL); | ||
2146 | if (r != X86EMUL_CONTINUE) | 2098 | if (r != X86EMUL_CONTINUE) |
2147 | return false; | 2099 | return false; |
2148 | if (io_bitmap_ptr + port/8 > desc_limit_scaled(&tr_seg)) | 2100 | if (io_bitmap_ptr + port/8 > desc_limit_scaled(&tr_seg)) |
2149 | return false; | 2101 | return false; |
2150 | r = ops->read_std(get_desc_base(&tr_seg) + io_bitmap_ptr + port/8, | 2102 | r = ops->read_std(ctxt, base + io_bitmap_ptr + port/8, &perm, 2, NULL); |
2151 | &perm, 1, ctxt->vcpu, NULL); | ||
2152 | if (r != X86EMUL_CONTINUE) | 2103 | if (r != X86EMUL_CONTINUE) |
2153 | return false; | 2104 | return false; |
2154 | if ((perm >> bit_idx) & mask) | 2105 | if ((perm >> bit_idx) & mask) |
@@ -2160,9 +2111,15 @@ static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt, | |||
2160 | struct x86_emulate_ops *ops, | 2111 | struct x86_emulate_ops *ops, |
2161 | u16 port, u16 len) | 2112 | u16 port, u16 len) |
2162 | { | 2113 | { |
2114 | if (ctxt->perm_ok) | ||
2115 | return true; | ||
2116 | |||
2163 | if (emulator_bad_iopl(ctxt, ops)) | 2117 | if (emulator_bad_iopl(ctxt, ops)) |
2164 | if (!emulator_io_port_access_allowed(ctxt, ops, port, len)) | 2118 | if (!emulator_io_port_access_allowed(ctxt, ops, port, len)) |
2165 | return false; | 2119 | return false; |
2120 | |||
2121 | ctxt->perm_ok = true; | ||
2122 | |||
2166 | return true; | 2123 | return true; |
2167 | } | 2124 | } |
2168 | 2125 | ||
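
emulator_io_port_access_allowed() above walks the TSS I/O-permission bitmap through ops->get_segment() and ops->read_std(); the new perm_ok flag then caches a successful check so the two guest reads are not repeated for the same instruction. A self-contained sketch of the bitmap test itself, operating on an in-memory TSS buffer; the offsets follow the 32-bit TSS layout (I/O map base at offset 102) and the helper is illustrative, not the kernel's:

#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>

static bool io_allowed(const uint8_t *tss, size_t tss_limit,
                       uint16_t port, uint16_t len)
{
        uint16_t io_base, perm;
        unsigned mask = (1u << len) - 1;

        if (tss_limit < 103)                      /* too small to hold the map base */
                return false;
        memcpy(&io_base, tss + 102, 2);           /* I/O map base field */
        if (io_base + port / 8 + 1 >= tss_limit)  /* bitmap read out of bounds */
                return false;
        memcpy(&perm, tss + io_base + port / 8, 2);
        return ((perm >> (port & 7)) & mask) == 0;  /* all covered bits must be clear */
}

int main(void)
{
        uint8_t tss[256] = {0};
        uint16_t io_base = 104;

        memcpy(tss + 102, &io_base, 2);
        tss[io_base + 0x60 / 8] |= 1 << (0x60 & 7);   /* deny port 0x60 */

        printf("port 0x60: %s\n", io_allowed(tss, sizeof(tss), 0x60, 1) ? "ok" : "#GP");
        printf("port 0x64: %s\n", io_allowed(tss, sizeof(tss), 0x64, 1) ? "ok" : "#GP");
        return 0;
}
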
@@ -2183,11 +2140,11 @@ static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt, | |||
2183 | tss->si = c->regs[VCPU_REGS_RSI]; | 2140 | tss->si = c->regs[VCPU_REGS_RSI]; |
2184 | tss->di = c->regs[VCPU_REGS_RDI]; | 2141 | tss->di = c->regs[VCPU_REGS_RDI]; |
2185 | 2142 | ||
2186 | tss->es = ops->get_segment_selector(VCPU_SREG_ES, ctxt->vcpu); | 2143 | tss->es = get_segment_selector(ctxt, VCPU_SREG_ES); |
2187 | tss->cs = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu); | 2144 | tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS); |
2188 | tss->ss = ops->get_segment_selector(VCPU_SREG_SS, ctxt->vcpu); | 2145 | tss->ss = get_segment_selector(ctxt, VCPU_SREG_SS); |
2189 | tss->ds = ops->get_segment_selector(VCPU_SREG_DS, ctxt->vcpu); | 2146 | tss->ds = get_segment_selector(ctxt, VCPU_SREG_DS); |
2190 | tss->ldt = ops->get_segment_selector(VCPU_SREG_LDTR, ctxt->vcpu); | 2147 | tss->ldt = get_segment_selector(ctxt, VCPU_SREG_LDTR); |
2191 | } | 2148 | } |
2192 | 2149 | ||
2193 | static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt, | 2150 | static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt, |
@@ -2212,11 +2169,11 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt, | |||
2212 | * SDM says that segment selectors are loaded before segment | 2169 | * SDM says that segment selectors are loaded before segment |
2213 | * descriptors | 2170 | * descriptors |
2214 | */ | 2171 | */ |
2215 | ops->set_segment_selector(tss->ldt, VCPU_SREG_LDTR, ctxt->vcpu); | 2172 | set_segment_selector(ctxt, tss->ldt, VCPU_SREG_LDTR); |
2216 | ops->set_segment_selector(tss->es, VCPU_SREG_ES, ctxt->vcpu); | 2173 | set_segment_selector(ctxt, tss->es, VCPU_SREG_ES); |
2217 | ops->set_segment_selector(tss->cs, VCPU_SREG_CS, ctxt->vcpu); | 2174 | set_segment_selector(ctxt, tss->cs, VCPU_SREG_CS); |
2218 | ops->set_segment_selector(tss->ss, VCPU_SREG_SS, ctxt->vcpu); | 2175 | set_segment_selector(ctxt, tss->ss, VCPU_SREG_SS); |
2219 | ops->set_segment_selector(tss->ds, VCPU_SREG_DS, ctxt->vcpu); | 2176 | set_segment_selector(ctxt, tss->ds, VCPU_SREG_DS); |
2220 | 2177 | ||
2221 | /* | 2178 | /* |
2222 | * Now load segment descriptors. If fault happens at this stage | 2179 | * Now load segment descriptors. If fault happens at this stage |
@@ -2248,46 +2205,38 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, | |||
2248 | { | 2205 | { |
2249 | struct tss_segment_16 tss_seg; | 2206 | struct tss_segment_16 tss_seg; |
2250 | int ret; | 2207 | int ret; |
2251 | u32 err, new_tss_base = get_desc_base(new_desc); | 2208 | u32 new_tss_base = get_desc_base(new_desc); |
2252 | 2209 | ||
2253 | ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, | 2210 | ret = ops->read_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg, |
2254 | &err); | 2211 | &ctxt->exception); |
2255 | if (ret == X86EMUL_PROPAGATE_FAULT) { | 2212 | if (ret != X86EMUL_CONTINUE) |
2256 | /* FIXME: need to provide precise fault address */ | 2213 | /* FIXME: need to provide precise fault address */ |
2257 | emulate_pf(ctxt, old_tss_base, err); | ||
2258 | return ret; | 2214 | return ret; |
2259 | } | ||
2260 | 2215 | ||
2261 | save_state_to_tss16(ctxt, ops, &tss_seg); | 2216 | save_state_to_tss16(ctxt, ops, &tss_seg); |
2262 | 2217 | ||
2263 | ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, | 2218 | ret = ops->write_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg, |
2264 | &err); | 2219 | &ctxt->exception); |
2265 | if (ret == X86EMUL_PROPAGATE_FAULT) { | 2220 | if (ret != X86EMUL_CONTINUE) |
2266 | /* FIXME: need to provide precise fault address */ | 2221 | /* FIXME: need to provide precise fault address */ |
2267 | emulate_pf(ctxt, old_tss_base, err); | ||
2268 | return ret; | 2222 | return ret; |
2269 | } | ||
2270 | 2223 | ||
2271 | ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, | 2224 | ret = ops->read_std(ctxt, new_tss_base, &tss_seg, sizeof tss_seg, |
2272 | &err); | 2225 | &ctxt->exception); |
2273 | if (ret == X86EMUL_PROPAGATE_FAULT) { | 2226 | if (ret != X86EMUL_CONTINUE) |
2274 | /* FIXME: need to provide precise fault address */ | 2227 | /* FIXME: need to provide precise fault address */ |
2275 | emulate_pf(ctxt, new_tss_base, err); | ||
2276 | return ret; | 2228 | return ret; |
2277 | } | ||
2278 | 2229 | ||
2279 | if (old_tss_sel != 0xffff) { | 2230 | if (old_tss_sel != 0xffff) { |
2280 | tss_seg.prev_task_link = old_tss_sel; | 2231 | tss_seg.prev_task_link = old_tss_sel; |
2281 | 2232 | ||
2282 | ret = ops->write_std(new_tss_base, | 2233 | ret = ops->write_std(ctxt, new_tss_base, |
2283 | &tss_seg.prev_task_link, | 2234 | &tss_seg.prev_task_link, |
2284 | sizeof tss_seg.prev_task_link, | 2235 | sizeof tss_seg.prev_task_link, |
2285 | ctxt->vcpu, &err); | 2236 | &ctxt->exception); |
2286 | if (ret == X86EMUL_PROPAGATE_FAULT) { | 2237 | if (ret != X86EMUL_CONTINUE) |
2287 | /* FIXME: need to provide precise fault address */ | 2238 | /* FIXME: need to provide precise fault address */ |
2288 | emulate_pf(ctxt, new_tss_base, err); | ||
2289 | return ret; | 2239 | return ret; |
2290 | } | ||
2291 | } | 2240 | } |
2292 | 2241 | ||
2293 | return load_state_from_tss16(ctxt, ops, &tss_seg); | 2242 | return load_state_from_tss16(ctxt, ops, &tss_seg); |
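
The task_switch_16() hunk above drops the emulate_pf() calls at every read_std()/write_std() site: the access helpers now record fault details in ctxt->exception themselves, so callers simply propagate any status other than X86EMUL_CONTINUE. A rough sketch of that pattern with stand-in types, not the kernel's structures:

#include <stddef.h>

enum { X86EMUL_CONTINUE = 0, X86EMUL_PROPAGATE_FAULT = 1 };

struct exception_rec { int vector; unsigned int error_code; };
struct emu_ctxt      { struct exception_rec exception; };

/* the helper records the fault and returns a status code */
static int read_std(struct emu_ctxt *ctxt, unsigned long addr,
                    void *data, size_t size)
{
        (void)addr; (void)data; (void)size;
        ctxt->exception.vector = 14;          /* pretend a #PF happened */
        return X86EMUL_PROPAGATE_FAULT;
}

/* the caller no longer injects the fault itself */
static int load_tss(struct emu_ctxt *ctxt, unsigned long tss_base)
{
        unsigned char tss[104];
        int ret = read_std(ctxt, tss_base, tss, sizeof(tss));

        if (ret != X86EMUL_CONTINUE)
                return ret;                   /* fault already described in ctxt */
        return X86EMUL_CONTINUE;
}

int main(void)
{
        struct emu_ctxt ctxt = { { 0, 0 } };

        return load_tss(&ctxt, 0x1000) == X86EMUL_CONTINUE;
}
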
@@ -2299,7 +2248,7 @@ static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt, | |||
2299 | { | 2248 | { |
2300 | struct decode_cache *c = &ctxt->decode; | 2249 | struct decode_cache *c = &ctxt->decode; |
2301 | 2250 | ||
2302 | tss->cr3 = ops->get_cr(3, ctxt->vcpu); | 2251 | tss->cr3 = ops->get_cr(ctxt, 3); |
2303 | tss->eip = c->eip; | 2252 | tss->eip = c->eip; |
2304 | tss->eflags = ctxt->eflags; | 2253 | tss->eflags = ctxt->eflags; |
2305 | tss->eax = c->regs[VCPU_REGS_RAX]; | 2254 | tss->eax = c->regs[VCPU_REGS_RAX]; |
@@ -2311,13 +2260,13 @@ static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt, | |||
2311 | tss->esi = c->regs[VCPU_REGS_RSI]; | 2260 | tss->esi = c->regs[VCPU_REGS_RSI]; |
2312 | tss->edi = c->regs[VCPU_REGS_RDI]; | 2261 | tss->edi = c->regs[VCPU_REGS_RDI]; |
2313 | 2262 | ||
2314 | tss->es = ops->get_segment_selector(VCPU_SREG_ES, ctxt->vcpu); | 2263 | tss->es = get_segment_selector(ctxt, VCPU_SREG_ES); |
2315 | tss->cs = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu); | 2264 | tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS); |
2316 | tss->ss = ops->get_segment_selector(VCPU_SREG_SS, ctxt->vcpu); | 2265 | tss->ss = get_segment_selector(ctxt, VCPU_SREG_SS); |
2317 | tss->ds = ops->get_segment_selector(VCPU_SREG_DS, ctxt->vcpu); | 2266 | tss->ds = get_segment_selector(ctxt, VCPU_SREG_DS); |
2318 | tss->fs = ops->get_segment_selector(VCPU_SREG_FS, ctxt->vcpu); | 2267 | tss->fs = get_segment_selector(ctxt, VCPU_SREG_FS); |
2319 | tss->gs = ops->get_segment_selector(VCPU_SREG_GS, ctxt->vcpu); | 2268 | tss->gs = get_segment_selector(ctxt, VCPU_SREG_GS); |
2320 | tss->ldt_selector = ops->get_segment_selector(VCPU_SREG_LDTR, ctxt->vcpu); | 2269 | tss->ldt_selector = get_segment_selector(ctxt, VCPU_SREG_LDTR); |
2321 | } | 2270 | } |
2322 | 2271 | ||
2323 | static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, | 2272 | static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, |
@@ -2327,10 +2276,8 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, | |||
2327 | struct decode_cache *c = &ctxt->decode; | 2276 | struct decode_cache *c = &ctxt->decode; |
2328 | int ret; | 2277 | int ret; |
2329 | 2278 | ||
2330 | if (ops->set_cr(3, tss->cr3, ctxt->vcpu)) { | 2279 | if (ops->set_cr(ctxt, 3, tss->cr3)) |
2331 | emulate_gp(ctxt, 0); | 2280 | return emulate_gp(ctxt, 0); |
2332 | return X86EMUL_PROPAGATE_FAULT; | ||
2333 | } | ||
2334 | c->eip = tss->eip; | 2281 | c->eip = tss->eip; |
2335 | ctxt->eflags = tss->eflags | 2; | 2282 | ctxt->eflags = tss->eflags | 2; |
2336 | c->regs[VCPU_REGS_RAX] = tss->eax; | 2283 | c->regs[VCPU_REGS_RAX] = tss->eax; |
@@ -2346,13 +2293,13 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, | |||
2346 | * SDM says that segment selectors are loaded before segment | 2293 | * SDM says that segment selectors are loaded before segment |
2347 | * descriptors | 2294 | * descriptors |
2348 | */ | 2295 | */ |
2349 | ops->set_segment_selector(tss->ldt_selector, VCPU_SREG_LDTR, ctxt->vcpu); | 2296 | set_segment_selector(ctxt, tss->ldt_selector, VCPU_SREG_LDTR); |
2350 | ops->set_segment_selector(tss->es, VCPU_SREG_ES, ctxt->vcpu); | 2297 | set_segment_selector(ctxt, tss->es, VCPU_SREG_ES); |
2351 | ops->set_segment_selector(tss->cs, VCPU_SREG_CS, ctxt->vcpu); | 2298 | set_segment_selector(ctxt, tss->cs, VCPU_SREG_CS); |
2352 | ops->set_segment_selector(tss->ss, VCPU_SREG_SS, ctxt->vcpu); | 2299 | set_segment_selector(ctxt, tss->ss, VCPU_SREG_SS); |
2353 | ops->set_segment_selector(tss->ds, VCPU_SREG_DS, ctxt->vcpu); | 2300 | set_segment_selector(ctxt, tss->ds, VCPU_SREG_DS); |
2354 | ops->set_segment_selector(tss->fs, VCPU_SREG_FS, ctxt->vcpu); | 2301 | set_segment_selector(ctxt, tss->fs, VCPU_SREG_FS); |
2355 | ops->set_segment_selector(tss->gs, VCPU_SREG_GS, ctxt->vcpu); | 2302 | set_segment_selector(ctxt, tss->gs, VCPU_SREG_GS); |
2356 | 2303 | ||
2357 | /* | 2304 | /* |
2358 | * Now load segment descriptors. If fault happens at this stage | 2305 | * Now load segment descriptors. If fault happens at this stage |
@@ -2390,46 +2337,38 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, | |||
2390 | { | 2337 | { |
2391 | struct tss_segment_32 tss_seg; | 2338 | struct tss_segment_32 tss_seg; |
2392 | int ret; | 2339 | int ret; |
2393 | u32 err, new_tss_base = get_desc_base(new_desc); | 2340 | u32 new_tss_base = get_desc_base(new_desc); |
2394 | 2341 | ||
2395 | ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, | 2342 | ret = ops->read_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg, |
2396 | &err); | 2343 | &ctxt->exception); |
2397 | if (ret == X86EMUL_PROPAGATE_FAULT) { | 2344 | if (ret != X86EMUL_CONTINUE) |
2398 | /* FIXME: need to provide precise fault address */ | 2345 | /* FIXME: need to provide precise fault address */ |
2399 | emulate_pf(ctxt, old_tss_base, err); | ||
2400 | return ret; | 2346 | return ret; |
2401 | } | ||
2402 | 2347 | ||
2403 | save_state_to_tss32(ctxt, ops, &tss_seg); | 2348 | save_state_to_tss32(ctxt, ops, &tss_seg); |
2404 | 2349 | ||
2405 | ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, | 2350 | ret = ops->write_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg, |
2406 | &err); | 2351 | &ctxt->exception); |
2407 | if (ret == X86EMUL_PROPAGATE_FAULT) { | 2352 | if (ret != X86EMUL_CONTINUE) |
2408 | /* FIXME: need to provide precise fault address */ | 2353 | /* FIXME: need to provide precise fault address */ |
2409 | emulate_pf(ctxt, old_tss_base, err); | ||
2410 | return ret; | 2354 | return ret; |
2411 | } | ||
2412 | 2355 | ||
2413 | ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, | 2356 | ret = ops->read_std(ctxt, new_tss_base, &tss_seg, sizeof tss_seg, |
2414 | &err); | 2357 | &ctxt->exception); |
2415 | if (ret == X86EMUL_PROPAGATE_FAULT) { | 2358 | if (ret != X86EMUL_CONTINUE) |
2416 | /* FIXME: need to provide precise fault address */ | 2359 | /* FIXME: need to provide precise fault address */ |
2417 | emulate_pf(ctxt, new_tss_base, err); | ||
2418 | return ret; | 2360 | return ret; |
2419 | } | ||
2420 | 2361 | ||
2421 | if (old_tss_sel != 0xffff) { | 2362 | if (old_tss_sel != 0xffff) { |
2422 | tss_seg.prev_task_link = old_tss_sel; | 2363 | tss_seg.prev_task_link = old_tss_sel; |
2423 | 2364 | ||
2424 | ret = ops->write_std(new_tss_base, | 2365 | ret = ops->write_std(ctxt, new_tss_base, |
2425 | &tss_seg.prev_task_link, | 2366 | &tss_seg.prev_task_link, |
2426 | sizeof tss_seg.prev_task_link, | 2367 | sizeof tss_seg.prev_task_link, |
2427 | ctxt->vcpu, &err); | 2368 | &ctxt->exception); |
2428 | if (ret == X86EMUL_PROPAGATE_FAULT) { | 2369 | if (ret != X86EMUL_CONTINUE) |
2429 | /* FIXME: need to provide precise fault address */ | 2370 | /* FIXME: need to provide precise fault address */ |
2430 | emulate_pf(ctxt, new_tss_base, err); | ||
2431 | return ret; | 2371 | return ret; |
2432 | } | ||
2433 | } | 2372 | } |
2434 | 2373 | ||
2435 | return load_state_from_tss32(ctxt, ops, &tss_seg); | 2374 | return load_state_from_tss32(ctxt, ops, &tss_seg); |
@@ -2442,9 +2381,9 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, | |||
2442 | { | 2381 | { |
2443 | struct desc_struct curr_tss_desc, next_tss_desc; | 2382 | struct desc_struct curr_tss_desc, next_tss_desc; |
2444 | int ret; | 2383 | int ret; |
2445 | u16 old_tss_sel = ops->get_segment_selector(VCPU_SREG_TR, ctxt->vcpu); | 2384 | u16 old_tss_sel = get_segment_selector(ctxt, VCPU_SREG_TR); |
2446 | ulong old_tss_base = | 2385 | ulong old_tss_base = |
2447 | ops->get_cached_segment_base(VCPU_SREG_TR, ctxt->vcpu); | 2386 | ops->get_cached_segment_base(ctxt, VCPU_SREG_TR); |
2448 | u32 desc_limit; | 2387 | u32 desc_limit; |
2449 | 2388 | ||
2450 | /* FIXME: old_tss_base == ~0 ? */ | 2389 | /* FIXME: old_tss_base == ~0 ? */ |
@@ -2460,10 +2399,8 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, | |||
2460 | 2399 | ||
2461 | if (reason != TASK_SWITCH_IRET) { | 2400 | if (reason != TASK_SWITCH_IRET) { |
2462 | if ((tss_selector & 3) > next_tss_desc.dpl || | 2401 | if ((tss_selector & 3) > next_tss_desc.dpl || |
2463 | ops->cpl(ctxt->vcpu) > next_tss_desc.dpl) { | 2402 | ops->cpl(ctxt) > next_tss_desc.dpl) |
2464 | emulate_gp(ctxt, 0); | 2403 | return emulate_gp(ctxt, 0); |
2465 | return X86EMUL_PROPAGATE_FAULT; | ||
2466 | } | ||
2467 | } | 2404 | } |
2468 | 2405 | ||
2469 | desc_limit = desc_limit_scaled(&next_tss_desc); | 2406 | desc_limit = desc_limit_scaled(&next_tss_desc); |
@@ -2506,9 +2443,8 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, | |||
2506 | &next_tss_desc); | 2443 | &next_tss_desc); |
2507 | } | 2444 | } |
2508 | 2445 | ||
2509 | ops->set_cr(0, ops->get_cr(0, ctxt->vcpu) | X86_CR0_TS, ctxt->vcpu); | 2446 | ops->set_cr(ctxt, 0, ops->get_cr(ctxt, 0) | X86_CR0_TS); |
2510 | ops->set_cached_descriptor(&next_tss_desc, VCPU_SREG_TR, ctxt->vcpu); | 2447 | ops->set_segment(ctxt, tss_selector, &next_tss_desc, 0, VCPU_SREG_TR); |
2511 | ops->set_segment_selector(tss_selector, VCPU_SREG_TR, ctxt->vcpu); | ||
2512 | 2448 | ||
2513 | if (has_error_code) { | 2449 | if (has_error_code) { |
2514 | struct decode_cache *c = &ctxt->decode; | 2450 | struct decode_cache *c = &ctxt->decode; |
@@ -2516,17 +2452,17 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, | |||
2516 | c->op_bytes = c->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2; | 2452 | c->op_bytes = c->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2; |
2517 | c->lock_prefix = 0; | 2453 | c->lock_prefix = 0; |
2518 | c->src.val = (unsigned long) error_code; | 2454 | c->src.val = (unsigned long) error_code; |
2519 | emulate_push(ctxt, ops); | 2455 | ret = em_push(ctxt); |
2520 | } | 2456 | } |
2521 | 2457 | ||
2522 | return ret; | 2458 | return ret; |
2523 | } | 2459 | } |
2524 | 2460 | ||
2525 | int emulator_task_switch(struct x86_emulate_ctxt *ctxt, | 2461 | int emulator_task_switch(struct x86_emulate_ctxt *ctxt, |
2526 | struct x86_emulate_ops *ops, | ||
2527 | u16 tss_selector, int reason, | 2462 | u16 tss_selector, int reason, |
2528 | bool has_error_code, u32 error_code) | 2463 | bool has_error_code, u32 error_code) |
2529 | { | 2464 | { |
2465 | struct x86_emulate_ops *ops = ctxt->ops; | ||
2530 | struct decode_cache *c = &ctxt->decode; | 2466 | struct decode_cache *c = &ctxt->decode; |
2531 | int rc; | 2467 | int rc; |
2532 | 2468 | ||
@@ -2536,91 +2472,1357 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt, | |||
2536 | rc = emulator_do_task_switch(ctxt, ops, tss_selector, reason, | 2472 | rc = emulator_do_task_switch(ctxt, ops, tss_selector, reason, |
2537 | has_error_code, error_code); | 2473 | has_error_code, error_code); |
2538 | 2474 | ||
2539 | if (rc == X86EMUL_CONTINUE) { | 2475 | if (rc == X86EMUL_CONTINUE) |
2540 | rc = writeback(ctxt, ops); | 2476 | ctxt->eip = c->eip; |
2541 | if (rc == X86EMUL_CONTINUE) | ||
2542 | ctxt->eip = c->eip; | ||
2543 | } | ||
2544 | 2477 | ||
2545 | return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; | 2478 | return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; |
2546 | } | 2479 | } |
2547 | 2480 | ||
2548 | static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned long base, | 2481 | static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned seg, |
2549 | int reg, struct operand *op) | 2482 | int reg, struct operand *op) |
2550 | { | 2483 | { |
2551 | struct decode_cache *c = &ctxt->decode; | 2484 | struct decode_cache *c = &ctxt->decode; |
2552 | int df = (ctxt->eflags & EFLG_DF) ? -1 : 1; | 2485 | int df = (ctxt->eflags & EFLG_DF) ? -1 : 1; |
2553 | 2486 | ||
2554 | register_address_increment(c, &c->regs[reg], df * op->bytes); | 2487 | register_address_increment(c, &c->regs[reg], df * op->bytes); |
2555 | op->ptr = (unsigned long *)register_address(c, base, c->regs[reg]); | 2488 | op->addr.mem.ea = register_address(c, c->regs[reg]); |
2489 | op->addr.mem.seg = seg; | ||
2490 | } | ||
2491 | |||
2492 | static int em_das(struct x86_emulate_ctxt *ctxt) | ||
2493 | { | ||
2494 | struct decode_cache *c = &ctxt->decode; | ||
2495 | u8 al, old_al; | ||
2496 | bool af, cf, old_cf; | ||
2497 | |||
2498 | cf = ctxt->eflags & X86_EFLAGS_CF; | ||
2499 | al = c->dst.val; | ||
2500 | |||
2501 | old_al = al; | ||
2502 | old_cf = cf; | ||
2503 | cf = false; | ||
2504 | af = ctxt->eflags & X86_EFLAGS_AF; | ||
2505 | if ((al & 0x0f) > 9 || af) { | ||
2506 | al -= 6; | ||
2507 | cf = old_cf | (al >= 250); | ||
2508 | af = true; | ||
2509 | } else { | ||
2510 | af = false; | ||
2511 | } | ||
2512 | if (old_al > 0x99 || old_cf) { | ||
2513 | al -= 0x60; | ||
2514 | cf = true; | ||
2515 | } | ||
2516 | |||
2517 | c->dst.val = al; | ||
2518 | /* Set PF, ZF, SF */ | ||
2519 | c->src.type = OP_IMM; | ||
2520 | c->src.val = 0; | ||
2521 | c->src.bytes = 1; | ||
2522 | emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags); | ||
2523 | ctxt->eflags &= ~(X86_EFLAGS_AF | X86_EFLAGS_CF); | ||
2524 | if (cf) | ||
2525 | ctxt->eflags |= X86_EFLAGS_CF; | ||
2526 | if (af) | ||
2527 | ctxt->eflags |= X86_EFLAGS_AF; | ||
2528 | return X86EMUL_CONTINUE; | ||
2529 | } | ||
2530 | |||
2531 | static int em_call_far(struct x86_emulate_ctxt *ctxt) | ||
2532 | { | ||
2533 | struct decode_cache *c = &ctxt->decode; | ||
2534 | u16 sel, old_cs; | ||
2535 | ulong old_eip; | ||
2536 | int rc; | ||
2537 | |||
2538 | old_cs = get_segment_selector(ctxt, VCPU_SREG_CS); | ||
2539 | old_eip = c->eip; | ||
2540 | |||
2541 | memcpy(&sel, c->src.valptr + c->op_bytes, 2); | ||
2542 | if (load_segment_descriptor(ctxt, ctxt->ops, sel, VCPU_SREG_CS)) | ||
2543 | return X86EMUL_CONTINUE; | ||
2544 | |||
2545 | c->eip = 0; | ||
2546 | memcpy(&c->eip, c->src.valptr, c->op_bytes); | ||
2547 | |||
2548 | c->src.val = old_cs; | ||
2549 | rc = em_push(ctxt); | ||
2550 | if (rc != X86EMUL_CONTINUE) | ||
2551 | return rc; | ||
2552 | |||
2553 | c->src.val = old_eip; | ||
2554 | return em_push(ctxt); | ||
2555 | } | ||
2556 | |||
2557 | static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt) | ||
2558 | { | ||
2559 | struct decode_cache *c = &ctxt->decode; | ||
2560 | int rc; | ||
2561 | |||
2562 | c->dst.type = OP_REG; | ||
2563 | c->dst.addr.reg = &c->eip; | ||
2564 | c->dst.bytes = c->op_bytes; | ||
2565 | rc = emulate_pop(ctxt, &c->dst.val, c->op_bytes); | ||
2566 | if (rc != X86EMUL_CONTINUE) | ||
2567 | return rc; | ||
2568 | register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->src.val); | ||
2569 | return X86EMUL_CONTINUE; | ||
2570 | } | ||
2571 | |||
2572 | static int em_add(struct x86_emulate_ctxt *ctxt) | ||
2573 | { | ||
2574 | struct decode_cache *c = &ctxt->decode; | ||
2575 | |||
2576 | emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags); | ||
2577 | return X86EMUL_CONTINUE; | ||
2578 | } | ||
2579 | |||
2580 | static int em_or(struct x86_emulate_ctxt *ctxt) | ||
2581 | { | ||
2582 | struct decode_cache *c = &ctxt->decode; | ||
2583 | |||
2584 | emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags); | ||
2585 | return X86EMUL_CONTINUE; | ||
2586 | } | ||
2587 | |||
2588 | static int em_adc(struct x86_emulate_ctxt *ctxt) | ||
2589 | { | ||
2590 | struct decode_cache *c = &ctxt->decode; | ||
2591 | |||
2592 | emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags); | ||
2593 | return X86EMUL_CONTINUE; | ||
2594 | } | ||
2595 | |||
2596 | static int em_sbb(struct x86_emulate_ctxt *ctxt) | ||
2597 | { | ||
2598 | struct decode_cache *c = &ctxt->decode; | ||
2599 | |||
2600 | emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags); | ||
2601 | return X86EMUL_CONTINUE; | ||
2602 | } | ||
2603 | |||
2604 | static int em_and(struct x86_emulate_ctxt *ctxt) | ||
2605 | { | ||
2606 | struct decode_cache *c = &ctxt->decode; | ||
2607 | |||
2608 | emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags); | ||
2609 | return X86EMUL_CONTINUE; | ||
2610 | } | ||
2611 | |||
2612 | static int em_sub(struct x86_emulate_ctxt *ctxt) | ||
2613 | { | ||
2614 | struct decode_cache *c = &ctxt->decode; | ||
2615 | |||
2616 | emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags); | ||
2617 | return X86EMUL_CONTINUE; | ||
2618 | } | ||
2619 | |||
2620 | static int em_xor(struct x86_emulate_ctxt *ctxt) | ||
2621 | { | ||
2622 | struct decode_cache *c = &ctxt->decode; | ||
2623 | |||
2624 | emulate_2op_SrcV("xor", c->src, c->dst, ctxt->eflags); | ||
2625 | return X86EMUL_CONTINUE; | ||
2626 | } | ||
2627 | |||
2628 | static int em_cmp(struct x86_emulate_ctxt *ctxt) | ||
2629 | { | ||
2630 | struct decode_cache *c = &ctxt->decode; | ||
2631 | |||
2632 | emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags); | ||
2633 | /* Disable writeback. */ | ||
2634 | c->dst.type = OP_NONE; | ||
2635 | return X86EMUL_CONTINUE; | ||
2636 | } | ||
2637 | |||
2638 | static int em_imul(struct x86_emulate_ctxt *ctxt) | ||
2639 | { | ||
2640 | struct decode_cache *c = &ctxt->decode; | ||
2641 | |||
2642 | emulate_2op_SrcV_nobyte("imul", c->src, c->dst, ctxt->eflags); | ||
2643 | return X86EMUL_CONTINUE; | ||
2644 | } | ||
2645 | |||
2646 | static int em_imul_3op(struct x86_emulate_ctxt *ctxt) | ||
2647 | { | ||
2648 | struct decode_cache *c = &ctxt->decode; | ||
2649 | |||
2650 | c->dst.val = c->src2.val; | ||
2651 | return em_imul(ctxt); | ||
2652 | } | ||
2653 | |||
2654 | static int em_cwd(struct x86_emulate_ctxt *ctxt) | ||
2655 | { | ||
2656 | struct decode_cache *c = &ctxt->decode; | ||
2657 | |||
2658 | c->dst.type = OP_REG; | ||
2659 | c->dst.bytes = c->src.bytes; | ||
2660 | c->dst.addr.reg = &c->regs[VCPU_REGS_RDX]; | ||
2661 | c->dst.val = ~((c->src.val >> (c->src.bytes * 8 - 1)) - 1); | ||
2662 | |||
2663 | return X86EMUL_CONTINUE; | ||
2664 | } | ||
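
The one-liner in em_cwd() above is a branch-free sign extension: shift the source's sign bit down to bit 0, subtract 1 and complement, which yields all-ones when the sign bit was set and zero otherwise. That is exactly what CWD/CDQ put into DX/EDX. A quick standalone check of the arithmetic:

#include <stdint.h>
#include <stdio.h>

static uint64_t cwd_high(uint64_t src, unsigned bytes)
{
        return ~((src >> (bytes * 8 - 1)) - 1);
}

int main(void)
{
        printf("cwd(0x8000) -> dx = 0x%04x\n",
               (unsigned)(cwd_high(0x8000, 2) & 0xffff));   /* 0xffff */
        printf("cwd(0x7fff) -> dx = 0x%04x\n",
               (unsigned)(cwd_high(0x7fff, 2) & 0xffff));   /* 0x0000 */
        return 0;
}
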
2665 | |||
2666 | static int em_rdtsc(struct x86_emulate_ctxt *ctxt) | ||
2667 | { | ||
2668 | struct decode_cache *c = &ctxt->decode; | ||
2669 | u64 tsc = 0; | ||
2670 | |||
2671 | ctxt->ops->get_msr(ctxt, MSR_IA32_TSC, &tsc); | ||
2672 | c->regs[VCPU_REGS_RAX] = (u32)tsc; | ||
2673 | c->regs[VCPU_REGS_RDX] = tsc >> 32; | ||
2674 | return X86EMUL_CONTINUE; | ||
2675 | } | ||
2676 | |||
2677 | static int em_mov(struct x86_emulate_ctxt *ctxt) | ||
2678 | { | ||
2679 | struct decode_cache *c = &ctxt->decode; | ||
2680 | c->dst.val = c->src.val; | ||
2681 | return X86EMUL_CONTINUE; | ||
2682 | } | ||
2683 | |||
2684 | static int em_movdqu(struct x86_emulate_ctxt *ctxt) | ||
2685 | { | ||
2686 | struct decode_cache *c = &ctxt->decode; | ||
2687 | memcpy(&c->dst.vec_val, &c->src.vec_val, c->op_bytes); | ||
2688 | return X86EMUL_CONTINUE; | ||
2689 | } | ||
2690 | |||
2691 | static int em_invlpg(struct x86_emulate_ctxt *ctxt) | ||
2692 | { | ||
2693 | struct decode_cache *c = &ctxt->decode; | ||
2694 | int rc; | ||
2695 | ulong linear; | ||
2696 | |||
2697 | rc = linearize(ctxt, c->src.addr.mem, 1, false, &linear); | ||
2698 | if (rc == X86EMUL_CONTINUE) | ||
2699 | ctxt->ops->invlpg(ctxt, linear); | ||
2700 | /* Disable writeback. */ | ||
2701 | c->dst.type = OP_NONE; | ||
2702 | return X86EMUL_CONTINUE; | ||
2703 | } | ||
2704 | |||
2705 | static int em_clts(struct x86_emulate_ctxt *ctxt) | ||
2706 | { | ||
2707 | ulong cr0; | ||
2708 | |||
2709 | cr0 = ctxt->ops->get_cr(ctxt, 0); | ||
2710 | cr0 &= ~X86_CR0_TS; | ||
2711 | ctxt->ops->set_cr(ctxt, 0, cr0); | ||
2712 | return X86EMUL_CONTINUE; | ||
2713 | } | ||
2714 | |||
2715 | static int em_vmcall(struct x86_emulate_ctxt *ctxt) | ||
2716 | { | ||
2717 | struct decode_cache *c = &ctxt->decode; | ||
2718 | int rc; | ||
2719 | |||
2720 | if (c->modrm_mod != 3 || c->modrm_rm != 1) | ||
2721 | return X86EMUL_UNHANDLEABLE; | ||
2722 | |||
2723 | rc = ctxt->ops->fix_hypercall(ctxt); | ||
2724 | if (rc != X86EMUL_CONTINUE) | ||
2725 | return rc; | ||
2726 | |||
2727 | /* Let the processor re-execute the fixed hypercall */ | ||
2728 | c->eip = ctxt->eip; | ||
2729 | /* Disable writeback. */ | ||
2730 | c->dst.type = OP_NONE; | ||
2731 | return X86EMUL_CONTINUE; | ||
2732 | } | ||
2733 | |||
2734 | static int em_lgdt(struct x86_emulate_ctxt *ctxt) | ||
2735 | { | ||
2736 | struct decode_cache *c = &ctxt->decode; | ||
2737 | struct desc_ptr desc_ptr; | ||
2738 | int rc; | ||
2739 | |||
2740 | rc = read_descriptor(ctxt, c->src.addr.mem, | ||
2741 | &desc_ptr.size, &desc_ptr.address, | ||
2742 | c->op_bytes); | ||
2743 | if (rc != X86EMUL_CONTINUE) | ||
2744 | return rc; | ||
2745 | ctxt->ops->set_gdt(ctxt, &desc_ptr); | ||
2746 | /* Disable writeback. */ | ||
2747 | c->dst.type = OP_NONE; | ||
2748 | return X86EMUL_CONTINUE; | ||
2749 | } | ||
2750 | |||
2751 | static int em_vmmcall(struct x86_emulate_ctxt *ctxt) | ||
2752 | { | ||
2753 | struct decode_cache *c = &ctxt->decode; | ||
2754 | int rc; | ||
2755 | |||
2756 | rc = ctxt->ops->fix_hypercall(ctxt); | ||
2757 | |||
2758 | /* Disable writeback. */ | ||
2759 | c->dst.type = OP_NONE; | ||
2760 | return rc; | ||
2761 | } | ||
2762 | |||
2763 | static int em_lidt(struct x86_emulate_ctxt *ctxt) | ||
2764 | { | ||
2765 | struct decode_cache *c = &ctxt->decode; | ||
2766 | struct desc_ptr desc_ptr; | ||
2767 | int rc; | ||
2768 | |||
2769 | rc = read_descriptor(ctxt, c->src.addr.mem, | ||
2770 | &desc_ptr.size, &desc_ptr.address, | ||
2771 | c->op_bytes); | ||
2772 | if (rc != X86EMUL_CONTINUE) | ||
2773 | return rc; | ||
2774 | ctxt->ops->set_idt(ctxt, &desc_ptr); | ||
2775 | /* Disable writeback. */ | ||
2776 | c->dst.type = OP_NONE; | ||
2777 | return X86EMUL_CONTINUE; | ||
2778 | } | ||
2779 | |||
2780 | static int em_smsw(struct x86_emulate_ctxt *ctxt) | ||
2781 | { | ||
2782 | struct decode_cache *c = &ctxt->decode; | ||
2783 | |||
2784 | c->dst.bytes = 2; | ||
2785 | c->dst.val = ctxt->ops->get_cr(ctxt, 0); | ||
2786 | return X86EMUL_CONTINUE; | ||
2787 | } | ||
2788 | |||
2789 | static int em_lmsw(struct x86_emulate_ctxt *ctxt) | ||
2790 | { | ||
2791 | struct decode_cache *c = &ctxt->decode; | ||
2792 | ctxt->ops->set_cr(ctxt, 0, (ctxt->ops->get_cr(ctxt, 0) & ~0x0eul) | ||
2793 | | (c->src.val & 0x0f)); | ||
2794 | c->dst.type = OP_NONE; | ||
2795 | return X86EMUL_CONTINUE; | ||
2796 | } | ||
2797 | |||
2798 | static bool valid_cr(int nr) | ||
2799 | { | ||
2800 | switch (nr) { | ||
2801 | case 0: | ||
2802 | case 2 ... 4: | ||
2803 | case 8: | ||
2804 | return true; | ||
2805 | default: | ||
2806 | return false; | ||
2807 | } | ||
2808 | } | ||
2809 | |||
2810 | static int check_cr_read(struct x86_emulate_ctxt *ctxt) | ||
2811 | { | ||
2812 | struct decode_cache *c = &ctxt->decode; | ||
2813 | |||
2814 | if (!valid_cr(c->modrm_reg)) | ||
2815 | return emulate_ud(ctxt); | ||
2816 | |||
2817 | return X86EMUL_CONTINUE; | ||
2818 | } | ||
2819 | |||
2820 | static int check_cr_write(struct x86_emulate_ctxt *ctxt) | ||
2821 | { | ||
2822 | struct decode_cache *c = &ctxt->decode; | ||
2823 | u64 new_val = c->src.val64; | ||
2824 | int cr = c->modrm_reg; | ||
2825 | u64 efer = 0; | ||
2826 | |||
2827 | static u64 cr_reserved_bits[] = { | ||
2828 | 0xffffffff00000000ULL, | ||
2829 | 0, 0, 0, /* CR3 checked later */ | ||
2830 | CR4_RESERVED_BITS, | ||
2831 | 0, 0, 0, | ||
2832 | CR8_RESERVED_BITS, | ||
2833 | }; | ||
2834 | |||
2835 | if (!valid_cr(cr)) | ||
2836 | return emulate_ud(ctxt); | ||
2837 | |||
2838 | if (new_val & cr_reserved_bits[cr]) | ||
2839 | return emulate_gp(ctxt, 0); | ||
2840 | |||
2841 | switch (cr) { | ||
2842 | case 0: { | ||
2843 | u64 cr4; | ||
2844 | if (((new_val & X86_CR0_PG) && !(new_val & X86_CR0_PE)) || | ||
2845 | ((new_val & X86_CR0_NW) && !(new_val & X86_CR0_CD))) | ||
2846 | return emulate_gp(ctxt, 0); | ||
2847 | |||
2848 | cr4 = ctxt->ops->get_cr(ctxt, 4); | ||
2849 | ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); | ||
2850 | |||
2851 | if ((new_val & X86_CR0_PG) && (efer & EFER_LME) && | ||
2852 | !(cr4 & X86_CR4_PAE)) | ||
2853 | return emulate_gp(ctxt, 0); | ||
2854 | |||
2855 | break; | ||
2856 | } | ||
2857 | case 3: { | ||
2858 | u64 rsvd = 0; | ||
2859 | |||
2860 | ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); | ||
2861 | if (efer & EFER_LMA) | ||
2862 | rsvd = CR3_L_MODE_RESERVED_BITS; | ||
2863 | else if (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_PAE) | ||
2864 | rsvd = CR3_PAE_RESERVED_BITS; | ||
2865 | else if (ctxt->ops->get_cr(ctxt, 0) & X86_CR0_PG) | ||
2866 | rsvd = CR3_NONPAE_RESERVED_BITS; | ||
2867 | |||
2868 | if (new_val & rsvd) | ||
2869 | return emulate_gp(ctxt, 0); | ||
2870 | |||
2871 | break; | ||
2872 | } | ||
2873 | case 4: { | ||
2874 | u64 cr4; | ||
2875 | |||
2876 | cr4 = ctxt->ops->get_cr(ctxt, 4); | ||
2877 | ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); | ||
2878 | |||
2879 | if ((efer & EFER_LMA) && !(new_val & X86_CR4_PAE)) | ||
2880 | return emulate_gp(ctxt, 0); | ||
2881 | |||
2882 | break; | ||
2883 | } | ||
2884 | } | ||
2885 | |||
2886 | return X86EMUL_CONTINUE; | ||
2887 | } | ||
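
check_cr_write() above validates reserved bits and the CR0/CR4/EFER consistency rules before the write is carried out. A condensed sketch of just the CR0 cases; the bit constants are written out locally for the example and the CR3/CR4 reserved-bit checks are omitted:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define CR0_PE   (1u << 0)
#define CR0_NW   (1u << 29)
#define CR0_CD   (1u << 30)
#define CR0_PG   (1u << 31)
#define CR4_PAE  (1u << 5)
#define EFER_LME (1u << 8)

static bool cr0_write_ok(uint32_t new_cr0, uint32_t cr4, uint64_t efer)
{
        if ((new_cr0 & CR0_PG) && !(new_cr0 & CR0_PE))
                return false;                       /* paging without protection */
        if ((new_cr0 & CR0_NW) && !(new_cr0 & CR0_CD))
                return false;                       /* NW set while CD clear */
        if ((new_cr0 & CR0_PG) && (efer & EFER_LME) && !(cr4 & CR4_PAE))
                return false;                       /* long mode needs PAE paging */
        return true;
}

int main(void)
{
        printf("%d\n", cr0_write_ok(CR0_PG | CR0_PE, CR4_PAE, EFER_LME)); /* 1 */
        printf("%d\n", cr0_write_ok(CR0_PG, 0, 0));                       /* 0 */
        return 0;
}
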
2888 | |||
2889 | static int check_dr7_gd(struct x86_emulate_ctxt *ctxt) | ||
2890 | { | ||
2891 | unsigned long dr7; | ||
2892 | |||
2893 | ctxt->ops->get_dr(ctxt, 7, &dr7); | ||
2894 | |||
2895 | /* Check if DR7.Global_Enable is set */ | ||
2896 | return dr7 & (1 << 13); | ||
2897 | } | ||
2898 | |||
2899 | static int check_dr_read(struct x86_emulate_ctxt *ctxt) | ||
2900 | { | ||
2901 | struct decode_cache *c = &ctxt->decode; | ||
2902 | int dr = c->modrm_reg; | ||
2903 | u64 cr4; | ||
2904 | |||
2905 | if (dr > 7) | ||
2906 | return emulate_ud(ctxt); | ||
2907 | |||
2908 | cr4 = ctxt->ops->get_cr(ctxt, 4); | ||
2909 | if ((cr4 & X86_CR4_DE) && (dr == 4 || dr == 5)) | ||
2910 | return emulate_ud(ctxt); | ||
2911 | |||
2912 | if (check_dr7_gd(ctxt)) | ||
2913 | return emulate_db(ctxt); | ||
2914 | |||
2915 | return X86EMUL_CONTINUE; | ||
2916 | } | ||
2917 | |||
2918 | static int check_dr_write(struct x86_emulate_ctxt *ctxt) | ||
2919 | { | ||
2920 | struct decode_cache *c = &ctxt->decode; | ||
2921 | u64 new_val = c->src.val64; | ||
2922 | int dr = c->modrm_reg; | ||
2923 | |||
2924 | if ((dr == 6 || dr == 7) && (new_val & 0xffffffff00000000ULL)) | ||
2925 | return emulate_gp(ctxt, 0); | ||
2926 | |||
2927 | return check_dr_read(ctxt); | ||
2928 | } | ||
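
check_dr_read()/check_dr_write() above encode the debug-register access rules: DR4/DR5 fault with #UD once CR4.DE is set, DR7.GD (bit 13) turns any debug-register access into #DB, and DR6/DR7 writes must keep the upper 32 bits clear. A compact sketch of those rules; the ordering and names here are illustrative:

#include <stdbool.h>
#include <stdint.h>

enum fault { FAULT_NONE, FAULT_UD, FAULT_DB, FAULT_GP };

#define CR4_DE (1u << 3)
#define DR7_GD (1u << 13)

static enum fault check_dr_access(int dr, bool is_write, uint64_t new_val,
                                  uint32_t cr4, uint64_t dr7)
{
        if (dr > 7)
                return FAULT_UD;
        if ((cr4 & CR4_DE) && (dr == 4 || dr == 5))
                return FAULT_UD;              /* DE makes DR4/DR5 undefined */
        if (is_write && (dr == 6 || dr == 7) && (new_val >> 32))
                return FAULT_GP;              /* reserved upper half must stay zero */
        if (dr7 & DR7_GD)
                return FAULT_DB;              /* general-detect trap */
        return FAULT_NONE;
}

int main(void)
{
        /* writing DR5 with CR4.DE set is undefined -> #UD */
        return check_dr_access(5, true, 0, CR4_DE, 0) == FAULT_UD ? 0 : 1;
}
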
2929 | |||
2930 | static int check_svme(struct x86_emulate_ctxt *ctxt) | ||
2931 | { | ||
2932 | u64 efer; | ||
2933 | |||
2934 | ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); | ||
2935 | |||
2936 | if (!(efer & EFER_SVME)) | ||
2937 | return emulate_ud(ctxt); | ||
2938 | |||
2939 | return X86EMUL_CONTINUE; | ||
2940 | } | ||
2941 | |||
2942 | static int check_svme_pa(struct x86_emulate_ctxt *ctxt) | ||
2943 | { | ||
2944 | u64 rax = ctxt->decode.regs[VCPU_REGS_RAX]; | ||
2945 | |||
2946 | /* Valid physical address? */ | ||
2947 | if (rax & 0xffff000000000000ULL) | ||
2948 | return emulate_gp(ctxt, 0); | ||
2949 | |||
2950 | return check_svme(ctxt); | ||
2951 | } | ||
2952 | |||
2953 | static int check_rdtsc(struct x86_emulate_ctxt *ctxt) | ||
2954 | { | ||
2955 | u64 cr4 = ctxt->ops->get_cr(ctxt, 4); | ||
2956 | |||
2957 | if (cr4 & X86_CR4_TSD && ctxt->ops->cpl(ctxt)) | ||
2958 | return emulate_ud(ctxt); | ||
2959 | |||
2960 | return X86EMUL_CONTINUE; | ||
2961 | } | ||
2962 | |||
2963 | static int check_rdpmc(struct x86_emulate_ctxt *ctxt) | ||
2964 | { | ||
2965 | u64 cr4 = ctxt->ops->get_cr(ctxt, 4); | ||
2966 | u64 rcx = ctxt->decode.regs[VCPU_REGS_RCX]; | ||
2967 | |||
2968 | if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt)) || | ||
2969 | (rcx > 3)) | ||
2970 | return emulate_gp(ctxt, 0); | ||
2971 | |||
2972 | return X86EMUL_CONTINUE; | ||
2973 | } | ||
2974 | |||
2975 | static int check_perm_in(struct x86_emulate_ctxt *ctxt) | ||
2976 | { | ||
2977 | struct decode_cache *c = &ctxt->decode; | ||
2978 | |||
2979 | c->dst.bytes = min(c->dst.bytes, 4u); | ||
2980 | if (!emulator_io_permited(ctxt, ctxt->ops, c->src.val, c->dst.bytes)) | ||
2981 | return emulate_gp(ctxt, 0); | ||
2982 | |||
2983 | return X86EMUL_CONTINUE; | ||
2984 | } | ||
2985 | |||
2986 | static int check_perm_out(struct x86_emulate_ctxt *ctxt) | ||
2987 | { | ||
2988 | struct decode_cache *c = &ctxt->decode; | ||
2989 | |||
2990 | c->src.bytes = min(c->src.bytes, 4u); | ||
2991 | if (!emulator_io_permited(ctxt, ctxt->ops, c->dst.val, c->src.bytes)) | ||
2992 | return emulate_gp(ctxt, 0); | ||
2993 | |||
2994 | return X86EMUL_CONTINUE; | ||
2995 | } | ||
2996 | |||
2997 | #define D(_y) { .flags = (_y) } | ||
2998 | #define DI(_y, _i) { .flags = (_y), .intercept = x86_intercept_##_i } | ||
2999 | #define DIP(_y, _i, _p) { .flags = (_y), .intercept = x86_intercept_##_i, \ | ||
3000 | .check_perm = (_p) } | ||
3001 | #define N D(0) | ||
3002 | #define EXT(_f, _e) { .flags = ((_f) | RMExt), .u.group = (_e) } | ||
3003 | #define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) } | ||
3004 | #define GD(_f, _g) { .flags = ((_f) | GroupDual), .u.gdual = (_g) } | ||
3005 | #define I(_f, _e) { .flags = (_f), .u.execute = (_e) } | ||
3006 | #define II(_f, _e, _i) \ | ||
3007 | { .flags = (_f), .u.execute = (_e), .intercept = x86_intercept_##_i } | ||
3008 | #define IIP(_f, _e, _i, _p) \ | ||
3009 | { .flags = (_f), .u.execute = (_e), .intercept = x86_intercept_##_i, \ | ||
3010 | .check_perm = (_p) } | ||
3011 | #define GP(_f, _g) { .flags = ((_f) | Prefix), .u.gprefix = (_g) } | ||
3012 | |||
3013 | #define D2bv(_f) D((_f) | ByteOp), D(_f) | ||
3014 | #define D2bvIP(_f, _i, _p) DIP((_f) | ByteOp, _i, _p), DIP(_f, _i, _p) | ||
3015 | #define I2bv(_f, _e) I((_f) | ByteOp, _e), I(_f, _e) | ||
3016 | |||
3017 | #define I6ALU(_f, _e) I2bv((_f) | DstMem | SrcReg | ModRM, _e), \ | ||
3018 | I2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock, _e), \ | ||
3019 | I2bv(((_f) & ~Lock) | DstAcc | SrcImm, _e) | ||
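
These table macros let one line describe a whole family of encodings; I6ALU(), for instance, expands through I2bv() into the six classic ALU forms (r/m,reg and reg,r/m in byte and word/dword sizes, plus the AL,imm8 and eAX,imm accumulator forms). A stripped-down sketch of that expansion with simplified flag bits, not the kernel's real opcode struct:

#include <stdio.h>

#define ByteOp (1u << 0)
#define DstMem (1u << 1)
#define SrcReg (1u << 2)
#define DstReg (1u << 3)
#define SrcMem (1u << 4)
#define DstAcc (1u << 5)
#define SrcImm (1u << 6)
#define ModRM  (1u << 7)
#define Lock   (1u << 8)

struct opcode {
        unsigned flags;
        int (*execute)(void);          /* stands in for em_add() etc. */
};

#define I(_f, _e)      { .flags = (_f), .execute = (_e) }
#define I2bv(_f, _e)   I((_f) | ByteOp, _e), I(_f, _e)
#define I6ALU(_f, _e)  I2bv((_f) | DstMem | SrcReg | ModRM, _e), \
                       I2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock, _e), \
                       I2bv(((_f) & ~Lock) | DstAcc | SrcImm, _e)

static int em_add(void) { return 0; }

static struct opcode table[] = { I6ALU(Lock, em_add) };

int main(void)
{
        /* one I6ALU() covers the six 0x00-0x05 style ADD encodings */
        printf("entries: %zu\n", sizeof(table) / sizeof(table[0]));
        return 0;
}
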
3020 | |||
3021 | static struct opcode group7_rm1[] = { | ||
3022 | DI(SrcNone | ModRM | Priv, monitor), | ||
3023 | DI(SrcNone | ModRM | Priv, mwait), | ||
3024 | N, N, N, N, N, N, | ||
3025 | }; | ||
3026 | |||
3027 | static struct opcode group7_rm3[] = { | ||
3028 | DIP(SrcNone | ModRM | Prot | Priv, vmrun, check_svme_pa), | ||
3029 | II(SrcNone | ModRM | Prot | VendorSpecific, em_vmmcall, vmmcall), | ||
3030 | DIP(SrcNone | ModRM | Prot | Priv, vmload, check_svme_pa), | ||
3031 | DIP(SrcNone | ModRM | Prot | Priv, vmsave, check_svme_pa), | ||
3032 | DIP(SrcNone | ModRM | Prot | Priv, stgi, check_svme), | ||
3033 | DIP(SrcNone | ModRM | Prot | Priv, clgi, check_svme), | ||
3034 | DIP(SrcNone | ModRM | Prot | Priv, skinit, check_svme), | ||
3035 | DIP(SrcNone | ModRM | Prot | Priv, invlpga, check_svme), | ||
3036 | }; | ||
3037 | |||
3038 | static struct opcode group7_rm7[] = { | ||
3039 | N, | ||
3040 | DIP(SrcNone | ModRM, rdtscp, check_rdtsc), | ||
3041 | N, N, N, N, N, N, | ||
3042 | }; | ||
3043 | |||
3044 | static struct opcode group1[] = { | ||
3045 | I(Lock, em_add), | ||
3046 | I(Lock, em_or), | ||
3047 | I(Lock, em_adc), | ||
3048 | I(Lock, em_sbb), | ||
3049 | I(Lock, em_and), | ||
3050 | I(Lock, em_sub), | ||
3051 | I(Lock, em_xor), | ||
3052 | I(0, em_cmp), | ||
3053 | }; | ||
3054 | |||
3055 | static struct opcode group1A[] = { | ||
3056 | D(DstMem | SrcNone | ModRM | Mov | Stack), N, N, N, N, N, N, N, | ||
3057 | }; | ||
3058 | |||
3059 | static struct opcode group3[] = { | ||
3060 | D(DstMem | SrcImm | ModRM), D(DstMem | SrcImm | ModRM), | ||
3061 | D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock), | ||
3062 | X4(D(SrcMem | ModRM)), | ||
3063 | }; | ||
3064 | |||
3065 | static struct opcode group4[] = { | ||
3066 | D(ByteOp | DstMem | SrcNone | ModRM | Lock), D(ByteOp | DstMem | SrcNone | ModRM | Lock), | ||
3067 | N, N, N, N, N, N, | ||
3068 | }; | ||
3069 | |||
3070 | static struct opcode group5[] = { | ||
3071 | D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock), | ||
3072 | D(SrcMem | ModRM | Stack), | ||
3073 | I(SrcMemFAddr | ModRM | ImplicitOps | Stack, em_call_far), | ||
3074 | D(SrcMem | ModRM | Stack), D(SrcMemFAddr | ModRM | ImplicitOps), | ||
3075 | D(SrcMem | ModRM | Stack), N, | ||
3076 | }; | ||
3077 | |||
3078 | static struct opcode group6[] = { | ||
3079 | DI(ModRM | Prot, sldt), | ||
3080 | DI(ModRM | Prot, str), | ||
3081 | DI(ModRM | Prot | Priv, lldt), | ||
3082 | DI(ModRM | Prot | Priv, ltr), | ||
3083 | N, N, N, N, | ||
3084 | }; | ||
3085 | |||
3086 | static struct group_dual group7 = { { | ||
3087 | DI(ModRM | Mov | DstMem | Priv, sgdt), | ||
3088 | DI(ModRM | Mov | DstMem | Priv, sidt), | ||
3089 | II(ModRM | SrcMem | Priv, em_lgdt, lgdt), | ||
3090 | II(ModRM | SrcMem | Priv, em_lidt, lidt), | ||
3091 | II(SrcNone | ModRM | DstMem | Mov, em_smsw, smsw), N, | ||
3092 | II(SrcMem16 | ModRM | Mov | Priv, em_lmsw, lmsw), | ||
3093 | II(SrcMem | ModRM | ByteOp | Priv | NoAccess, em_invlpg, invlpg), | ||
3094 | }, { | ||
3095 | I(SrcNone | ModRM | Priv | VendorSpecific, em_vmcall), | ||
3096 | EXT(0, group7_rm1), | ||
3097 | N, EXT(0, group7_rm3), | ||
3098 | II(SrcNone | ModRM | DstMem | Mov, em_smsw, smsw), N, | ||
3099 | II(SrcMem16 | ModRM | Mov | Priv, em_lmsw, lmsw), EXT(0, group7_rm7), | ||
3100 | } }; | ||
3101 | |||
3102 | static struct opcode group8[] = { | ||
3103 | N, N, N, N, | ||
3104 | D(DstMem | SrcImmByte | ModRM), D(DstMem | SrcImmByte | ModRM | Lock), | ||
3105 | D(DstMem | SrcImmByte | ModRM | Lock), D(DstMem | SrcImmByte | ModRM | Lock), | ||
3106 | }; | ||
3107 | |||
3108 | static struct group_dual group9 = { { | ||
3109 | N, D(DstMem64 | ModRM | Lock), N, N, N, N, N, N, | ||
3110 | }, { | ||
3111 | N, N, N, N, N, N, N, N, | ||
3112 | } }; | ||
3113 | |||
3114 | static struct opcode group11[] = { | ||
3115 | I(DstMem | SrcImm | ModRM | Mov, em_mov), X7(D(Undefined)), | ||
3116 | }; | ||
3117 | |||
3118 | static struct gprefix pfx_0f_6f_0f_7f = { | ||
3119 | N, N, N, I(Sse, em_movdqu), | ||
3120 | }; | ||
3121 | |||
3122 | static struct opcode opcode_table[256] = { | ||
3123 | /* 0x00 - 0x07 */ | ||
3124 | I6ALU(Lock, em_add), | ||
3125 | D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), | ||
3126 | /* 0x08 - 0x0F */ | ||
3127 | I6ALU(Lock, em_or), | ||
3128 | D(ImplicitOps | Stack | No64), N, | ||
3129 | /* 0x10 - 0x17 */ | ||
3130 | I6ALU(Lock, em_adc), | ||
3131 | D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), | ||
3132 | /* 0x18 - 0x1F */ | ||
3133 | I6ALU(Lock, em_sbb), | ||
3134 | D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), | ||
3135 | /* 0x20 - 0x27 */ | ||
3136 | I6ALU(Lock, em_and), N, N, | ||
3137 | /* 0x28 - 0x2F */ | ||
3138 | I6ALU(Lock, em_sub), N, I(ByteOp | DstAcc | No64, em_das), | ||
3139 | /* 0x30 - 0x37 */ | ||
3140 | I6ALU(Lock, em_xor), N, N, | ||
3141 | /* 0x38 - 0x3F */ | ||
3142 | I6ALU(0, em_cmp), N, N, | ||
3143 | /* 0x40 - 0x4F */ | ||
3144 | X16(D(DstReg)), | ||
3145 | /* 0x50 - 0x57 */ | ||
3146 | X8(I(SrcReg | Stack, em_push)), | ||
3147 | /* 0x58 - 0x5F */ | ||
3148 | X8(I(DstReg | Stack, em_pop)), | ||
3149 | /* 0x60 - 0x67 */ | ||
3150 | I(ImplicitOps | Stack | No64, em_pusha), | ||
3151 | I(ImplicitOps | Stack | No64, em_popa), | ||
3152 | N, D(DstReg | SrcMem32 | ModRM | Mov) /* movsxd (x86/64) */ , | ||
3153 | N, N, N, N, | ||
3154 | /* 0x68 - 0x6F */ | ||
3155 | I(SrcImm | Mov | Stack, em_push), | ||
3156 | I(DstReg | SrcMem | ModRM | Src2Imm, em_imul_3op), | ||
3157 | I(SrcImmByte | Mov | Stack, em_push), | ||
3158 | I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op), | ||
3159 | D2bvIP(DstDI | SrcDX | Mov | String, ins, check_perm_in), /* insb, insw/insd */ | ||
3160 | D2bvIP(SrcSI | DstDX | String, outs, check_perm_out), /* outsb, outsw/outsd */ | ||
3161 | /* 0x70 - 0x7F */ | ||
3162 | X16(D(SrcImmByte)), | ||
3163 | /* 0x80 - 0x87 */ | ||
3164 | G(ByteOp | DstMem | SrcImm | ModRM | Group, group1), | ||
3165 | G(DstMem | SrcImm | ModRM | Group, group1), | ||
3166 | G(ByteOp | DstMem | SrcImm | ModRM | No64 | Group, group1), | ||
3167 | G(DstMem | SrcImmByte | ModRM | Group, group1), | ||
3168 | D2bv(DstMem | SrcReg | ModRM), D2bv(DstMem | SrcReg | ModRM | Lock), | ||
3169 | /* 0x88 - 0x8F */ | ||
3170 | I2bv(DstMem | SrcReg | ModRM | Mov, em_mov), | ||
3171 | I2bv(DstReg | SrcMem | ModRM | Mov, em_mov), | ||
3172 | D(DstMem | SrcNone | ModRM | Mov), D(ModRM | SrcMem | NoAccess | DstReg), | ||
3173 | D(ImplicitOps | SrcMem16 | ModRM), G(0, group1A), | ||
3174 | /* 0x90 - 0x97 */ | ||
3175 | DI(SrcAcc | DstReg, pause), X7(D(SrcAcc | DstReg)), | ||
3176 | /* 0x98 - 0x9F */ | ||
3177 | D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd), | ||
3178 | I(SrcImmFAddr | No64, em_call_far), N, | ||
3179 | II(ImplicitOps | Stack, em_pushf, pushf), | ||
3180 | II(ImplicitOps | Stack, em_popf, popf), N, N, | ||
3181 | /* 0xA0 - 0xA7 */ | ||
3182 | I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov), | ||
3183 | I2bv(DstMem | SrcAcc | Mov | MemAbs, em_mov), | ||
3184 | I2bv(SrcSI | DstDI | Mov | String, em_mov), | ||
3185 | I2bv(SrcSI | DstDI | String, em_cmp), | ||
3186 | /* 0xA8 - 0xAF */ | ||
3187 | D2bv(DstAcc | SrcImm), | ||
3188 | I2bv(SrcAcc | DstDI | Mov | String, em_mov), | ||
3189 | I2bv(SrcSI | DstAcc | Mov | String, em_mov), | ||
3190 | I2bv(SrcAcc | DstDI | String, em_cmp), | ||
3191 | /* 0xB0 - 0xB7 */ | ||
3192 | X8(I(ByteOp | DstReg | SrcImm | Mov, em_mov)), | ||
3193 | /* 0xB8 - 0xBF */ | ||
3194 | X8(I(DstReg | SrcImm | Mov, em_mov)), | ||
3195 | /* 0xC0 - 0xC7 */ | ||
3196 | D2bv(DstMem | SrcImmByte | ModRM), | ||
3197 | I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm), | ||
3198 | D(ImplicitOps | Stack), | ||
3199 | D(DstReg | SrcMemFAddr | ModRM | No64), D(DstReg | SrcMemFAddr | ModRM | No64), | ||
3200 | G(ByteOp, group11), G(0, group11), | ||
3201 | /* 0xC8 - 0xCF */ | ||
3202 | N, N, N, D(ImplicitOps | Stack), | ||
3203 | D(ImplicitOps), DI(SrcImmByte, intn), | ||
3204 | D(ImplicitOps | No64), DI(ImplicitOps, iret), | ||
3205 | /* 0xD0 - 0xD7 */ | ||
3206 | D2bv(DstMem | SrcOne | ModRM), D2bv(DstMem | ModRM), | ||
3207 | N, N, N, N, | ||
3208 | /* 0xD8 - 0xDF */ | ||
3209 | N, N, N, N, N, N, N, N, | ||
3210 | /* 0xE0 - 0xE7 */ | ||
3211 | X4(D(SrcImmByte)), | ||
3212 | D2bvIP(SrcImmUByte | DstAcc, in, check_perm_in), | ||
3213 | D2bvIP(SrcAcc | DstImmUByte, out, check_perm_out), | ||
3214 | /* 0xE8 - 0xEF */ | ||
3215 | D(SrcImm | Stack), D(SrcImm | ImplicitOps), | ||
3216 | D(SrcImmFAddr | No64), D(SrcImmByte | ImplicitOps), | ||
3217 | D2bvIP(SrcDX | DstAcc, in, check_perm_in), | ||
3218 | D2bvIP(SrcAcc | DstDX, out, check_perm_out), | ||
3219 | /* 0xF0 - 0xF7 */ | ||
3220 | N, DI(ImplicitOps, icebp), N, N, | ||
3221 | DI(ImplicitOps | Priv, hlt), D(ImplicitOps), | ||
3222 | G(ByteOp, group3), G(0, group3), | ||
3223 | /* 0xF8 - 0xFF */ | ||
3224 | D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), | ||
3225 | D(ImplicitOps), D(ImplicitOps), G(0, group4), G(0, group5), | ||
3226 | }; | ||
3227 | |||
3228 | static struct opcode twobyte_table[256] = { | ||
3229 | /* 0x00 - 0x0F */ | ||
3230 | G(0, group6), GD(0, &group7), N, N, | ||
3231 | N, D(ImplicitOps | VendorSpecific), DI(ImplicitOps | Priv, clts), N, | ||
3232 | DI(ImplicitOps | Priv, invd), DI(ImplicitOps | Priv, wbinvd), N, N, | ||
3233 | N, D(ImplicitOps | ModRM), N, N, | ||
3234 | /* 0x10 - 0x1F */ | ||
3235 | N, N, N, N, N, N, N, N, D(ImplicitOps | ModRM), N, N, N, N, N, N, N, | ||
3236 | /* 0x20 - 0x2F */ | ||
3237 | DIP(ModRM | DstMem | Priv | Op3264, cr_read, check_cr_read), | ||
3238 | DIP(ModRM | DstMem | Priv | Op3264, dr_read, check_dr_read), | ||
3239 | DIP(ModRM | SrcMem | Priv | Op3264, cr_write, check_cr_write), | ||
3240 | DIP(ModRM | SrcMem | Priv | Op3264, dr_write, check_dr_write), | ||
3241 | N, N, N, N, | ||
3242 | N, N, N, N, N, N, N, N, | ||
3243 | /* 0x30 - 0x3F */ | ||
3244 | DI(ImplicitOps | Priv, wrmsr), | ||
3245 | IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc), | ||
3246 | DI(ImplicitOps | Priv, rdmsr), | ||
3247 | DIP(ImplicitOps | Priv, rdpmc, check_rdpmc), | ||
3248 | D(ImplicitOps | VendorSpecific), D(ImplicitOps | Priv | VendorSpecific), | ||
3249 | N, N, | ||
3250 | N, N, N, N, N, N, N, N, | ||
3251 | /* 0x40 - 0x4F */ | ||
3252 | X16(D(DstReg | SrcMem | ModRM | Mov)), | ||
3253 | /* 0x50 - 0x5F */ | ||
3254 | N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, | ||
3255 | /* 0x60 - 0x6F */ | ||
3256 | N, N, N, N, | ||
3257 | N, N, N, N, | ||
3258 | N, N, N, N, | ||
3259 | N, N, N, GP(SrcMem | DstReg | ModRM | Mov, &pfx_0f_6f_0f_7f), | ||
3260 | /* 0x70 - 0x7F */ | ||
3261 | N, N, N, N, | ||
3262 | N, N, N, N, | ||
3263 | N, N, N, N, | ||
3264 | N, N, N, GP(SrcReg | DstMem | ModRM | Mov, &pfx_0f_6f_0f_7f), | ||
3265 | /* 0x80 - 0x8F */ | ||
3266 | X16(D(SrcImm)), | ||
3267 | /* 0x90 - 0x9F */ | ||
3268 | X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)), | ||
3269 | /* 0xA0 - 0xA7 */ | ||
3270 | D(ImplicitOps | Stack), D(ImplicitOps | Stack), | ||
3271 | DI(ImplicitOps, cpuid), D(DstMem | SrcReg | ModRM | BitOp), | ||
3272 | D(DstMem | SrcReg | Src2ImmByte | ModRM), | ||
3273 | D(DstMem | SrcReg | Src2CL | ModRM), N, N, | ||
3274 | /* 0xA8 - 0xAF */ | ||
3275 | D(ImplicitOps | Stack), D(ImplicitOps | Stack), | ||
3276 | DI(ImplicitOps, rsm), D(DstMem | SrcReg | ModRM | BitOp | Lock), | ||
3277 | D(DstMem | SrcReg | Src2ImmByte | ModRM), | ||
3278 | D(DstMem | SrcReg | Src2CL | ModRM), | ||
3279 | D(ModRM), I(DstReg | SrcMem | ModRM, em_imul), | ||
3280 | /* 0xB0 - 0xB7 */ | ||
3281 | D2bv(DstMem | SrcReg | ModRM | Lock), | ||
3282 | D(DstReg | SrcMemFAddr | ModRM), D(DstMem | SrcReg | ModRM | BitOp | Lock), | ||
3283 | D(DstReg | SrcMemFAddr | ModRM), D(DstReg | SrcMemFAddr | ModRM), | ||
3284 | D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), | ||
3285 | /* 0xB8 - 0xBF */ | ||
3286 | N, N, | ||
3287 | G(BitOp, group8), D(DstMem | SrcReg | ModRM | BitOp | Lock), | ||
3288 | D(DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), | ||
3289 | D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), | ||
3290 | /* 0xC0 - 0xCF */ | ||
3291 | D2bv(DstMem | SrcReg | ModRM | Lock), | ||
3292 | N, D(DstMem | SrcReg | ModRM | Mov), | ||
3293 | N, N, N, GD(0, &group9), | ||
3294 | N, N, N, N, N, N, N, N, | ||
3295 | /* 0xD0 - 0xDF */ | ||
3296 | N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, | ||
3297 | /* 0xE0 - 0xEF */ | ||
3298 | N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, | ||
3299 | /* 0xF0 - 0xFF */ | ||
3300 | N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N | ||
3301 | }; | ||
3302 | |||
3303 | #undef D | ||
3304 | #undef N | ||
3305 | #undef G | ||
3306 | #undef GD | ||
3307 | #undef I | ||
3308 | #undef GP | ||
3309 | #undef EXT | ||
3310 | |||
3311 | #undef D2bv | ||
3312 | #undef D2bvIP | ||
3313 | #undef I2bv | ||
3314 | #undef I6ALU | ||
3315 | |||
3316 | static unsigned imm_size(struct decode_cache *c) | ||
3317 | { | ||
3318 | unsigned size; | ||
3319 | |||
3320 | size = (c->d & ByteOp) ? 1 : c->op_bytes; | ||
3321 | if (size == 8) | ||
3322 | size = 4; | ||
3323 | return size; | ||
3324 | } | ||
3325 | |||
3326 | static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op, | ||
3327 | unsigned size, bool sign_extension) | ||
3328 | { | ||
3329 | struct decode_cache *c = &ctxt->decode; | ||
3330 | struct x86_emulate_ops *ops = ctxt->ops; | ||
3331 | int rc = X86EMUL_CONTINUE; | ||
3332 | |||
3333 | op->type = OP_IMM; | ||
3334 | op->bytes = size; | ||
3335 | op->addr.mem.ea = c->eip; | ||
3336 | /* NB. Immediates are sign-extended as necessary. */ | ||
3337 | switch (op->bytes) { | ||
3338 | case 1: | ||
3339 | op->val = insn_fetch(s8, 1, c->eip); | ||
3340 | break; | ||
3341 | case 2: | ||
3342 | op->val = insn_fetch(s16, 2, c->eip); | ||
3343 | break; | ||
3344 | case 4: | ||
3345 | op->val = insn_fetch(s32, 4, c->eip); | ||
3346 | break; | ||
3347 | } | ||
3348 | if (!sign_extension) { | ||
3349 | switch (op->bytes) { | ||
3350 | case 1: | ||
3351 | op->val &= 0xff; | ||
3352 | break; | ||
3353 | case 2: | ||
3354 | op->val &= 0xffff; | ||
3355 | break; | ||
3356 | case 4: | ||
3357 | op->val &= 0xffffffff; | ||
3358 | break; | ||
3359 | } | ||
3360 | } | ||
3361 | done: | ||
3362 | return rc; | ||
2556 | } | 3363 | } |
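
imm_size() above caps an 8-byte operand size at a 4-byte immediate (x86 immediates other than MOV's imm64 are at most 32 bits and are sign-extended), and decode_imm() masks the value back down for the unsigned SrcImmU forms. A small standalone illustration of that fetch-and-extend step; the byte-assembly helper is ad hoc, not the kernel's insn_fetch():

#include <stdint.h>
#include <stdio.h>

static int64_t fetch_imm(const uint8_t *p, unsigned size, int sign_extend)
{
        int64_t val = 0;

        switch (size) {
        case 1:
                val = (int8_t)p[0];
                break;
        case 2:
                val = (int16_t)(p[0] | p[1] << 8);
                break;
        case 4: {
                uint32_t u = (uint32_t)p[0] | (uint32_t)p[1] << 8 |
                             (uint32_t)p[2] << 16 | (uint32_t)p[3] << 24;
                val = (int32_t)u;
                break;
        }
        }
        if (!sign_extend)
                val &= (1LL << (size * 8)) - 1;   /* zero-extend instead */
        return val;
}

int main(void)
{
        uint8_t imm[] = { 0xff, 0xff, 0xff, 0xff };

        printf("signed:   %lld\n", (long long)fetch_imm(imm, 4, 1));  /* -1 */
        printf("unsigned: %lld\n", (long long)fetch_imm(imm, 4, 0));  /* 4294967295 */
        return 0;
}
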
2557 | 3364 | ||
2558 | int | 3365 | int |
2559 | x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | 3366 | x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) |
2560 | { | 3367 | { |
3368 | struct x86_emulate_ops *ops = ctxt->ops; | ||
3369 | struct decode_cache *c = &ctxt->decode; | ||
3370 | int rc = X86EMUL_CONTINUE; | ||
3371 | int mode = ctxt->mode; | ||
3372 | int def_op_bytes, def_ad_bytes, goffset, simd_prefix; | ||
3373 | bool op_prefix = false; | ||
3374 | struct opcode opcode; | ||
3375 | struct operand memop = { .type = OP_NONE }, *memopp = NULL; | ||
3376 | |||
3377 | c->eip = ctxt->eip; | ||
3378 | c->fetch.start = c->eip; | ||
3379 | c->fetch.end = c->fetch.start + insn_len; | ||
3380 | if (insn_len > 0) | ||
3381 | memcpy(c->fetch.data, insn, insn_len); | ||
3382 | |||
3383 | switch (mode) { | ||
3384 | case X86EMUL_MODE_REAL: | ||
3385 | case X86EMUL_MODE_VM86: | ||
3386 | case X86EMUL_MODE_PROT16: | ||
3387 | def_op_bytes = def_ad_bytes = 2; | ||
3388 | break; | ||
3389 | case X86EMUL_MODE_PROT32: | ||
3390 | def_op_bytes = def_ad_bytes = 4; | ||
3391 | break; | ||
3392 | #ifdef CONFIG_X86_64 | ||
3393 | case X86EMUL_MODE_PROT64: | ||
3394 | def_op_bytes = 4; | ||
3395 | def_ad_bytes = 8; | ||
3396 | break; | ||
3397 | #endif | ||
3398 | default: | ||
3399 | return -1; | ||
3400 | } | ||
3401 | |||
3402 | c->op_bytes = def_op_bytes; | ||
3403 | c->ad_bytes = def_ad_bytes; | ||
3404 | |||
3405 | /* Legacy prefixes. */ | ||
3406 | for (;;) { | ||
3407 | switch (c->b = insn_fetch(u8, 1, c->eip)) { | ||
3408 | case 0x66: /* operand-size override */ | ||
3409 | op_prefix = true; | ||
3410 | /* switch between 2/4 bytes */ | ||
3411 | c->op_bytes = def_op_bytes ^ 6; | ||
3412 | break; | ||
3413 | case 0x67: /* address-size override */ | ||
3414 | if (mode == X86EMUL_MODE_PROT64) | ||
3415 | /* switch between 4/8 bytes */ | ||
3416 | c->ad_bytes = def_ad_bytes ^ 12; | ||
3417 | else | ||
3418 | /* switch between 2/4 bytes */ | ||
3419 | c->ad_bytes = def_ad_bytes ^ 6; | ||
3420 | break; | ||
3421 | case 0x26: /* ES override */ | ||
3422 | case 0x2e: /* CS override */ | ||
3423 | case 0x36: /* SS override */ | ||
3424 | case 0x3e: /* DS override */ | ||
3425 | set_seg_override(c, (c->b >> 3) & 3); | ||
3426 | break; | ||
3427 | case 0x64: /* FS override */ | ||
3428 | case 0x65: /* GS override */ | ||
3429 | set_seg_override(c, c->b & 7); | ||
3430 | break; | ||
3431 | case 0x40 ... 0x4f: /* REX */ | ||
3432 | if (mode != X86EMUL_MODE_PROT64) | ||
3433 | goto done_prefixes; | ||
3434 | c->rex_prefix = c->b; | ||
3435 | continue; | ||
3436 | case 0xf0: /* LOCK */ | ||
3437 | c->lock_prefix = 1; | ||
3438 | break; | ||
3439 | case 0xf2: /* REPNE/REPNZ */ | ||
3440 | case 0xf3: /* REP/REPE/REPZ */ | ||
3441 | c->rep_prefix = c->b; | ||
3442 | break; | ||
3443 | default: | ||
3444 | goto done_prefixes; | ||
3445 | } | ||
3446 | |||
3447 | /* Any legacy prefix after a REX prefix nullifies its effect. */ | ||
3448 | |||
3449 | c->rex_prefix = 0; | ||
3450 | } | ||
3451 | |||
3452 | done_prefixes: | ||
3453 | |||
3454 | /* REX prefix. */ | ||
3455 | if (c->rex_prefix & 8) | ||
3456 | c->op_bytes = 8; /* REX.W */ | ||
3457 | |||
3458 | /* Opcode byte(s). */ | ||
3459 | opcode = opcode_table[c->b]; | ||
3460 | /* Two-byte opcode? */ | ||
3461 | if (c->b == 0x0f) { | ||
3462 | c->twobyte = 1; | ||
3463 | c->b = insn_fetch(u8, 1, c->eip); | ||
3464 | opcode = twobyte_table[c->b]; | ||
3465 | } | ||
3466 | c->d = opcode.flags; | ||
3467 | |||
3468 | while (c->d & GroupMask) { | ||
3469 | switch (c->d & GroupMask) { | ||
3470 | case Group: | ||
3471 | c->modrm = insn_fetch(u8, 1, c->eip); | ||
3472 | --c->eip; | ||
3473 | goffset = (c->modrm >> 3) & 7; | ||
3474 | opcode = opcode.u.group[goffset]; | ||
3475 | break; | ||
3476 | case GroupDual: | ||
3477 | c->modrm = insn_fetch(u8, 1, c->eip); | ||
3478 | --c->eip; | ||
3479 | goffset = (c->modrm >> 3) & 7; | ||
3480 | if ((c->modrm >> 6) == 3) | ||
3481 | opcode = opcode.u.gdual->mod3[goffset]; | ||
3482 | else | ||
3483 | opcode = opcode.u.gdual->mod012[goffset]; | ||
3484 | break; | ||
3485 | case RMExt: | ||
3486 | goffset = c->modrm & 7; | ||
3487 | opcode = opcode.u.group[goffset]; | ||
3488 | break; | ||
3489 | case Prefix: | ||
3490 | if (c->rep_prefix && op_prefix) | ||
3491 | return X86EMUL_UNHANDLEABLE; | ||
3492 | simd_prefix = op_prefix ? 0x66 : c->rep_prefix; | ||
3493 | switch (simd_prefix) { | ||
3494 | case 0x00: opcode = opcode.u.gprefix->pfx_no; break; | ||
3495 | case 0x66: opcode = opcode.u.gprefix->pfx_66; break; | ||
3496 | case 0xf2: opcode = opcode.u.gprefix->pfx_f2; break; | ||
3497 | case 0xf3: opcode = opcode.u.gprefix->pfx_f3; break; | ||
3498 | } | ||
3499 | break; | ||
3500 | default: | ||
3501 | return X86EMUL_UNHANDLEABLE; | ||
3502 | } | ||
3503 | |||
3504 | c->d &= ~GroupMask; | ||
3505 | c->d |= opcode.flags; | ||
3506 | } | ||
3507 | |||
3508 | c->execute = opcode.u.execute; | ||
3509 | c->check_perm = opcode.check_perm; | ||
3510 | c->intercept = opcode.intercept; | ||
3511 | |||
3512 | /* Unrecognised? */ | ||
3513 | if (c->d == 0 || (c->d & Undefined)) | ||
3514 | return -1; | ||
3515 | |||
3516 | if (!(c->d & VendorSpecific) && ctxt->only_vendor_specific_insn) | ||
3517 | return -1; | ||
3518 | |||
3519 | if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack)) | ||
3520 | c->op_bytes = 8; | ||
3521 | |||
3522 | if (c->d & Op3264) { | ||
3523 | if (mode == X86EMUL_MODE_PROT64) | ||
3524 | c->op_bytes = 8; | ||
3525 | else | ||
3526 | c->op_bytes = 4; | ||
3527 | } | ||
3528 | |||
3529 | if (c->d & Sse) | ||
3530 | c->op_bytes = 16; | ||
3531 | |||
3532 | /* ModRM and SIB bytes. */ | ||
3533 | if (c->d & ModRM) { | ||
3534 | rc = decode_modrm(ctxt, ops, &memop); | ||
3535 | if (!c->has_seg_override) | ||
3536 | set_seg_override(c, c->modrm_seg); | ||
3537 | } else if (c->d & MemAbs) | ||
3538 | rc = decode_abs(ctxt, ops, &memop); | ||
3539 | if (rc != X86EMUL_CONTINUE) | ||
3540 | goto done; | ||
3541 | |||
3542 | if (!c->has_seg_override) | ||
3543 | set_seg_override(c, VCPU_SREG_DS); | ||
3544 | |||
3545 | memop.addr.mem.seg = seg_override(ctxt, c); | ||
3546 | |||
3547 | if (memop.type == OP_MEM && c->ad_bytes != 8) | ||
3548 | memop.addr.mem.ea = (u32)memop.addr.mem.ea; | ||
3549 | |||
3550 | /* | ||
3551 | * Decode and fetch the source operand: register, memory | ||
3552 | * or immediate. | ||
3553 | */ | ||
3554 | switch (c->d & SrcMask) { | ||
3555 | case SrcNone: | ||
3556 | break; | ||
3557 | case SrcReg: | ||
3558 | decode_register_operand(ctxt, &c->src, c, 0); | ||
3559 | break; | ||
3560 | case SrcMem16: | ||
3561 | memop.bytes = 2; | ||
3562 | goto srcmem_common; | ||
3563 | case SrcMem32: | ||
3564 | memop.bytes = 4; | ||
3565 | goto srcmem_common; | ||
3566 | case SrcMem: | ||
3567 | memop.bytes = (c->d & ByteOp) ? 1 : | ||
3568 | c->op_bytes; | ||
3569 | srcmem_common: | ||
3570 | c->src = memop; | ||
3571 | memopp = &c->src; | ||
3572 | break; | ||
3573 | case SrcImmU16: | ||
3574 | rc = decode_imm(ctxt, &c->src, 2, false); | ||
3575 | break; | ||
3576 | case SrcImm: | ||
3577 | rc = decode_imm(ctxt, &c->src, imm_size(c), true); | ||
3578 | break; | ||
3579 | case SrcImmU: | ||
3580 | rc = decode_imm(ctxt, &c->src, imm_size(c), false); | ||
3581 | break; | ||
3582 | case SrcImmByte: | ||
3583 | rc = decode_imm(ctxt, &c->src, 1, true); | ||
3584 | break; | ||
3585 | case SrcImmUByte: | ||
3586 | rc = decode_imm(ctxt, &c->src, 1, false); | ||
3587 | break; | ||
3588 | case SrcAcc: | ||
3589 | c->src.type = OP_REG; | ||
3590 | c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | ||
3591 | c->src.addr.reg = &c->regs[VCPU_REGS_RAX]; | ||
3592 | fetch_register_operand(&c->src); | ||
3593 | break; | ||
3594 | case SrcOne: | ||
3595 | c->src.bytes = 1; | ||
3596 | c->src.val = 1; | ||
3597 | break; | ||
3598 | case SrcSI: | ||
3599 | c->src.type = OP_MEM; | ||
3600 | c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | ||
3601 | c->src.addr.mem.ea = | ||
3602 | register_address(c, c->regs[VCPU_REGS_RSI]); | ||
3603 | c->src.addr.mem.seg = seg_override(ctxt, c); | ||
3604 | c->src.val = 0; | ||
3605 | break; | ||
3606 | case SrcImmFAddr: | ||
3607 | c->src.type = OP_IMM; | ||
3608 | c->src.addr.mem.ea = c->eip; | ||
3609 | c->src.bytes = c->op_bytes + 2; | ||
3610 | insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip); | ||
3611 | break; | ||
3612 | case SrcMemFAddr: | ||
3613 | memop.bytes = c->op_bytes + 2; | ||
3614 | goto srcmem_common; | ||
3615 | break; | ||
3616 | case SrcDX: | ||
3617 | c->src.type = OP_REG; | ||
3618 | c->src.bytes = 2; | ||
3619 | c->src.addr.reg = &c->regs[VCPU_REGS_RDX]; | ||
3620 | fetch_register_operand(&c->src); | ||
3621 | break; | ||
3622 | } | ||
3623 | |||
3624 | if (rc != X86EMUL_CONTINUE) | ||
3625 | goto done; | ||
3626 | |||
3627 | /* | ||
3628 | * Decode and fetch the second source operand: register, memory | ||
3629 | * or immediate. | ||
3630 | */ | ||
3631 | switch (c->d & Src2Mask) { | ||
3632 | case Src2None: | ||
3633 | break; | ||
3634 | case Src2CL: | ||
3635 | c->src2.bytes = 1; | ||
3636 | c->src2.val = c->regs[VCPU_REGS_RCX] & 0x8; | ||
3637 | break; | ||
3638 | case Src2ImmByte: | ||
3639 | rc = decode_imm(ctxt, &c->src2, 1, true); | ||
3640 | break; | ||
3641 | case Src2One: | ||
3642 | c->src2.bytes = 1; | ||
3643 | c->src2.val = 1; | ||
3644 | break; | ||
3645 | case Src2Imm: | ||
3646 | rc = decode_imm(ctxt, &c->src2, imm_size(c), true); | ||
3647 | break; | ||
3648 | } | ||
3649 | |||
3650 | if (rc != X86EMUL_CONTINUE) | ||
3651 | goto done; | ||
3652 | |||
3653 | /* Decode and fetch the destination operand: register or memory. */ | ||
3654 | switch (c->d & DstMask) { | ||
3655 | case DstReg: | ||
3656 | decode_register_operand(ctxt, &c->dst, c, | ||
3657 | c->twobyte && (c->b == 0xb6 || c->b == 0xb7)); | ||
3658 | break; | ||
3659 | case DstImmUByte: | ||
3660 | c->dst.type = OP_IMM; | ||
3661 | c->dst.addr.mem.ea = c->eip; | ||
3662 | c->dst.bytes = 1; | ||
3663 | c->dst.val = insn_fetch(u8, 1, c->eip); | ||
3664 | break; | ||
3665 | case DstMem: | ||
3666 | case DstMem64: | ||
3667 | c->dst = memop; | ||
3668 | memopp = &c->dst; | ||
3669 | if ((c->d & DstMask) == DstMem64) | ||
3670 | c->dst.bytes = 8; | ||
3671 | else | ||
3672 | c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | ||
3673 | if (c->d & BitOp) | ||
3674 | fetch_bit_operand(c); | ||
3675 | c->dst.orig_val = c->dst.val; | ||
3676 | break; | ||
3677 | case DstAcc: | ||
3678 | c->dst.type = OP_REG; | ||
3679 | c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | ||
3680 | c->dst.addr.reg = &c->regs[VCPU_REGS_RAX]; | ||
3681 | fetch_register_operand(&c->dst); | ||
3682 | c->dst.orig_val = c->dst.val; | ||
3683 | break; | ||
3684 | case DstDI: | ||
3685 | c->dst.type = OP_MEM; | ||
3686 | c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | ||
3687 | c->dst.addr.mem.ea = | ||
3688 | register_address(c, c->regs[VCPU_REGS_RDI]); | ||
3689 | c->dst.addr.mem.seg = VCPU_SREG_ES; | ||
3690 | c->dst.val = 0; | ||
3691 | break; | ||
3692 | case DstDX: | ||
3693 | c->dst.type = OP_REG; | ||
3694 | c->dst.bytes = 2; | ||
3695 | c->dst.addr.reg = &c->regs[VCPU_REGS_RDX]; | ||
3696 | fetch_register_operand(&c->dst); | ||
3697 | break; | ||
3698 | case ImplicitOps: | ||
3699 | /* Special instructions do their own operand decoding. */ | ||
3700 | default: | ||
3701 | c->dst.type = OP_NONE; /* Disable writeback. */ | ||
3702 | break; | ||
3703 | } | ||
3704 | |||
3705 | done: | ||
3706 | if (memopp && memopp->type == OP_MEM && c->rip_relative) | ||
3707 | memopp->addr.mem.ea += c->eip; | ||
3708 | |||
3709 | return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; | ||
3710 | } | ||
3711 | |||
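
The legacy-prefix loop in x86_decode_insn() above flips the operand and address widths arithmetically: XOR-ing the mode default with 6 toggles between 2 and 4 bytes, and XOR-ing with 12 toggles between 4 and 8 bytes, so no per-mode branching is needed for each prefix byte. A minimal userspace sketch of that arithmetic (illustrative only, not part of this patch):

	#include <assert.h>
	#include <stdio.h>

	int main(void)
	{
		int def_op_bytes = 4, def_ad_bytes = 8;	/* 64-bit mode defaults */

		assert((def_op_bytes ^ 6) == 2);	/* 0x66: 4-byte operands -> 2 */
		assert((2 ^ 6) == 4);			/* 0x66 in a 16-bit mode: 2 -> 4 */
		assert((def_ad_bytes ^ 12) == 4);	/* 0x67 in long mode: 8-byte addresses -> 4 */
		assert((4 ^ 6) == 2);			/* 0x67 in a 32-bit mode: 4 -> 2 */

		printf("prefix size toggles check out\n");
		return 0;
	}

Because the XOR is always applied to the default width rather than the current one, a repeated prefix byte is harmless -- the result is the same either way.
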
3712 | static bool string_insn_completed(struct x86_emulate_ctxt *ctxt) | ||
3713 | { | ||
3714 | struct decode_cache *c = &ctxt->decode; | ||
3715 | |||
3716 | /* The second termination condition only applies for REPE | ||
3717 | * and REPNE. Test if the repeat string operation prefix is | ||
3718 | * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the | ||
3719 | * corresponding termination condition according to: | ||
3720 | * - if REPE/REPZ and ZF = 0 then done | ||
3721 | * - if REPNE/REPNZ and ZF = 1 then done | ||
3722 | */ | ||
3723 | if (((c->b == 0xa6) || (c->b == 0xa7) || | ||
3724 | (c->b == 0xae) || (c->b == 0xaf)) | ||
3725 | && (((c->rep_prefix == REPE_PREFIX) && | ||
3726 | ((ctxt->eflags & EFLG_ZF) == 0)) | ||
3727 | || ((c->rep_prefix == REPNE_PREFIX) && | ||
3728 | ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)))) | ||
3729 | return true; | ||
3730 | |||
3731 | return false; | ||
3732 | } | ||
3733 | |||
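
The helper above encodes the two REP termination rules its comment describes: every REP-prefixed string instruction stops when the count register reaches zero (that case is checked separately in x86_emulate_insn()), and CMPS/SCAS additionally stop when ZF disagrees with the REPE/REPNE variant in use. A compact restatement of the predicate, as a standalone sketch rather than kernel code:

	#include <stdbool.h>

	/* repe = REPE/REPZ prefix, otherwise REPNE/REPNZ; zf = current ZF value */
	static bool rep_string_done(unsigned long count, bool is_cmps_or_scas,
				    bool repe, bool zf)
	{
		if (count == 0)			/* first condition, all string ops */
			return true;
		if (!is_cmps_or_scas)		/* second condition applies only to CMPS/SCAS */
			return false;
		return repe ? !zf : zf;		/* REPE stops on ZF=0, REPNE stops on ZF=1 */
	}

Splitting the count check (done before the iteration) from the ZF check (done after writeback) matches the order in which hardware evaluates the two conditions.
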
3734 | int | ||
3735 | x86_emulate_insn(struct x86_emulate_ctxt *ctxt) | ||
3736 | { | ||
3737 | struct x86_emulate_ops *ops = ctxt->ops; | ||
2561 | u64 msr_data; | 3738 | u64 msr_data; |
2562 | struct decode_cache *c = &ctxt->decode; | 3739 | struct decode_cache *c = &ctxt->decode; |
2563 | int rc = X86EMUL_CONTINUE; | 3740 | int rc = X86EMUL_CONTINUE; |
2564 | int saved_dst_type = c->dst.type; | 3741 | int saved_dst_type = c->dst.type; |
3742 | int irq; /* Used for int 3, int, and into */ | ||
2565 | 3743 | ||
2566 | ctxt->decode.mem_read.pos = 0; | 3744 | ctxt->decode.mem_read.pos = 0; |
2567 | 3745 | ||
2568 | if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) { | 3746 | if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) { |
2569 | emulate_ud(ctxt); | 3747 | rc = emulate_ud(ctxt); |
2570 | goto done; | 3748 | goto done; |
2571 | } | 3749 | } |
2572 | 3750 | ||
2573 | /* LOCK prefix is allowed only with some instructions */ | 3751 | /* LOCK prefix is allowed only with some instructions */ |
2574 | if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) { | 3752 | if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) { |
2575 | emulate_ud(ctxt); | 3753 | rc = emulate_ud(ctxt); |
3754 | goto done; | ||
3755 | } | ||
3756 | |||
3757 | if ((c->d & SrcMask) == SrcMemFAddr && c->src.type != OP_MEM) { | ||
3758 | rc = emulate_ud(ctxt); | ||
3759 | goto done; | ||
3760 | } | ||
3761 | |||
3762 | if ((c->d & Sse) | ||
3763 | && ((ops->get_cr(ctxt, 0) & X86_CR0_EM) | ||
3764 | || !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) { | ||
3765 | rc = emulate_ud(ctxt); | ||
3766 | goto done; | ||
3767 | } | ||
3768 | |||
3769 | if ((c->d & Sse) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) { | ||
3770 | rc = emulate_nm(ctxt); | ||
2576 | goto done; | 3771 | goto done; |
2577 | } | 3772 | } |
2578 | 3773 | ||
3774 | if (unlikely(ctxt->guest_mode) && c->intercept) { | ||
3775 | rc = emulator_check_intercept(ctxt, c->intercept, | ||
3776 | X86_ICPT_PRE_EXCEPT); | ||
3777 | if (rc != X86EMUL_CONTINUE) | ||
3778 | goto done; | ||
3779 | } | ||
3780 | |||
2579 | /* Privileged instruction can be executed only in CPL=0 */ | 3781 | /* Privileged instruction can be executed only in CPL=0 */ |
2580 | if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) { | 3782 | if ((c->d & Priv) && ops->cpl(ctxt)) { |
2581 | emulate_gp(ctxt, 0); | 3783 | rc = emulate_gp(ctxt, 0); |
2582 | goto done; | 3784 | goto done; |
2583 | } | 3785 | } |
2584 | 3786 | ||
3787 | /* Instruction can only be executed in protected mode */ | ||
3788 | if ((c->d & Prot) && !(ctxt->mode & X86EMUL_MODE_PROT)) { | ||
3789 | rc = emulate_ud(ctxt); | ||
3790 | goto done; | ||
3791 | } | ||
3792 | |||
3793 | /* Do instruction specific permission checks */ | ||
3794 | if (c->check_perm) { | ||
3795 | rc = c->check_perm(ctxt); | ||
3796 | if (rc != X86EMUL_CONTINUE) | ||
3797 | goto done; | ||
3798 | } | ||
3799 | |||
3800 | if (unlikely(ctxt->guest_mode) && c->intercept) { | ||
3801 | rc = emulator_check_intercept(ctxt, c->intercept, | ||
3802 | X86_ICPT_POST_EXCEPT); | ||
3803 | if (rc != X86EMUL_CONTINUE) | ||
3804 | goto done; | ||
3805 | } | ||
3806 | |||
2585 | if (c->rep_prefix && (c->d & String)) { | 3807 | if (c->rep_prefix && (c->d & String)) { |
2586 | ctxt->restart = true; | ||
2587 | /* All REP prefixes have the same first termination condition */ | 3808 | /* All REP prefixes have the same first termination condition */ |
2588 | if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) { | 3809 | if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) { |
2589 | string_done: | ||
2590 | ctxt->restart = false; | ||
2591 | ctxt->eip = c->eip; | 3810 | ctxt->eip = c->eip; |
2592 | goto done; | 3811 | goto done; |
2593 | } | 3812 | } |
2594 | /* The second termination condition only applies for REPE | ||
2595 | * and REPNE. Test if the repeat string operation prefix is | ||
2596 | * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the | ||
2597 | * corresponding termination condition according to: | ||
2598 | * - if REPE/REPZ and ZF = 0 then done | ||
2599 | * - if REPNE/REPNZ and ZF = 1 then done | ||
2600 | */ | ||
2601 | if ((c->b == 0xa6) || (c->b == 0xa7) || | ||
2602 | (c->b == 0xae) || (c->b == 0xaf)) { | ||
2603 | if ((c->rep_prefix == REPE_PREFIX) && | ||
2604 | ((ctxt->eflags & EFLG_ZF) == 0)) | ||
2605 | goto string_done; | ||
2606 | if ((c->rep_prefix == REPNE_PREFIX) && | ||
2607 | ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) | ||
2608 | goto string_done; | ||
2609 | } | ||
2610 | c->eip = ctxt->eip; | ||
2611 | } | 3813 | } |
2612 | 3814 | ||
2613 | if (c->src.type == OP_MEM) { | 3815 | if ((c->src.type == OP_MEM) && !(c->d & NoAccess)) { |
2614 | rc = read_emulated(ctxt, ops, (unsigned long)c->src.ptr, | 3816 | rc = segmented_read(ctxt, c->src.addr.mem, |
2615 | c->src.valptr, c->src.bytes); | 3817 | c->src.valptr, c->src.bytes); |
2616 | if (rc != X86EMUL_CONTINUE) | 3818 | if (rc != X86EMUL_CONTINUE) |
2617 | goto done; | 3819 | goto done; |
2618 | c->src.orig_val64 = c->src.val64; | 3820 | c->src.orig_val64 = c->src.val64; |
2619 | } | 3821 | } |
2620 | 3822 | ||
2621 | if (c->src2.type == OP_MEM) { | 3823 | if (c->src2.type == OP_MEM) { |
2622 | rc = read_emulated(ctxt, ops, (unsigned long)c->src2.ptr, | 3824 | rc = segmented_read(ctxt, c->src2.addr.mem, |
2623 | &c->src2.val, c->src2.bytes); | 3825 | &c->src2.val, c->src2.bytes); |
2624 | if (rc != X86EMUL_CONTINUE) | 3826 | if (rc != X86EMUL_CONTINUE) |
2625 | goto done; | 3827 | goto done; |
2626 | } | 3828 | } |
@@ -2631,7 +3833,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | |||
2631 | 3833 | ||
2632 | if ((c->dst.type == OP_MEM) && !(c->d & Mov)) { | 3834 | if ((c->dst.type == OP_MEM) && !(c->d & Mov)) { |
2633 | /* optimisation - avoid slow emulated read if Mov */ | 3835 | /* optimisation - avoid slow emulated read if Mov */ |
2634 | rc = read_emulated(ctxt, ops, (unsigned long)c->dst.ptr, | 3836 | rc = segmented_read(ctxt, c->dst.addr.mem, |
2635 | &c->dst.val, c->dst.bytes); | 3837 | &c->dst.val, c->dst.bytes); |
2636 | if (rc != X86EMUL_CONTINUE) | 3838 | if (rc != X86EMUL_CONTINUE) |
2637 | goto done; | 3839 | goto done; |
@@ -2640,68 +3842,44 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | |||
2640 | 3842 | ||
2641 | special_insn: | 3843 | special_insn: |
2642 | 3844 | ||
3845 | if (unlikely(ctxt->guest_mode) && c->intercept) { | ||
3846 | rc = emulator_check_intercept(ctxt, c->intercept, | ||
3847 | X86_ICPT_POST_MEMACCESS); | ||
3848 | if (rc != X86EMUL_CONTINUE) | ||
3849 | goto done; | ||
3850 | } | ||
3851 | |||
3852 | if (c->execute) { | ||
3853 | rc = c->execute(ctxt); | ||
3854 | if (rc != X86EMUL_CONTINUE) | ||
3855 | goto done; | ||
3856 | goto writeback; | ||
3857 | } | ||
3858 | |||
2643 | if (c->twobyte) | 3859 | if (c->twobyte) |
2644 | goto twobyte_insn; | 3860 | goto twobyte_insn; |
2645 | 3861 | ||
2646 | switch (c->b) { | 3862 | switch (c->b) { |
2647 | case 0x00 ... 0x05: | ||
2648 | add: /* add */ | ||
2649 | emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags); | ||
2650 | break; | ||
2651 | case 0x06: /* push es */ | 3863 | case 0x06: /* push es */ |
2652 | emulate_push_sreg(ctxt, ops, VCPU_SREG_ES); | 3864 | rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_ES); |
2653 | break; | 3865 | break; |
2654 | case 0x07: /* pop es */ | 3866 | case 0x07: /* pop es */ |
2655 | rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES); | 3867 | rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES); |
2656 | if (rc != X86EMUL_CONTINUE) | ||
2657 | goto done; | ||
2658 | break; | ||
2659 | case 0x08 ... 0x0d: | ||
2660 | or: /* or */ | ||
2661 | emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags); | ||
2662 | break; | 3868 | break; |
2663 | case 0x0e: /* push cs */ | 3869 | case 0x0e: /* push cs */ |
2664 | emulate_push_sreg(ctxt, ops, VCPU_SREG_CS); | 3870 | rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_CS); |
2665 | break; | ||
2666 | case 0x10 ... 0x15: | ||
2667 | adc: /* adc */ | ||
2668 | emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags); | ||
2669 | break; | 3871 | break; |
2670 | case 0x16: /* push ss */ | 3872 | case 0x16: /* push ss */ |
2671 | emulate_push_sreg(ctxt, ops, VCPU_SREG_SS); | 3873 | rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_SS); |
2672 | break; | 3874 | break; |
2673 | case 0x17: /* pop ss */ | 3875 | case 0x17: /* pop ss */ |
2674 | rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS); | 3876 | rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS); |
2675 | if (rc != X86EMUL_CONTINUE) | ||
2676 | goto done; | ||
2677 | break; | ||
2678 | case 0x18 ... 0x1d: | ||
2679 | sbb: /* sbb */ | ||
2680 | emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags); | ||
2681 | break; | 3877 | break; |
2682 | case 0x1e: /* push ds */ | 3878 | case 0x1e: /* push ds */ |
2683 | emulate_push_sreg(ctxt, ops, VCPU_SREG_DS); | 3879 | rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_DS); |
2684 | break; | 3880 | break; |
2685 | case 0x1f: /* pop ds */ | 3881 | case 0x1f: /* pop ds */ |
2686 | rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS); | 3882 | rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS); |
2687 | if (rc != X86EMUL_CONTINUE) | ||
2688 | goto done; | ||
2689 | break; | ||
2690 | case 0x20 ... 0x25: | ||
2691 | and: /* and */ | ||
2692 | emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags); | ||
2693 | break; | ||
2694 | case 0x28 ... 0x2d: | ||
2695 | sub: /* sub */ | ||
2696 | emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags); | ||
2697 | break; | ||
2698 | case 0x30 ... 0x35: | ||
2699 | xor: /* xor */ | ||
2700 | emulate_2op_SrcV("xor", c->src, c->dst, ctxt->eflags); | ||
2701 | break; | ||
2702 | case 0x38 ... 0x3d: | ||
2703 | cmp: /* cmp */ | ||
2704 | emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags); | ||
2705 | break; | 3883 | break; |
2706 | case 0x40 ... 0x47: /* inc r16/r32 */ | 3884 | case 0x40 ... 0x47: /* inc r16/r32 */ |
2707 | emulate_1op("inc", c->dst, ctxt->eflags); | 3885 | emulate_1op("inc", c->dst, ctxt->eflags); |
@@ -2709,83 +3887,24 @@ special_insn: | |||
2709 | case 0x48 ... 0x4f: /* dec r16/r32 */ | 3887 | case 0x48 ... 0x4f: /* dec r16/r32 */ |
2710 | emulate_1op("dec", c->dst, ctxt->eflags); | 3888 | emulate_1op("dec", c->dst, ctxt->eflags); |
2711 | break; | 3889 | break; |
2712 | case 0x50 ... 0x57: /* push reg */ | ||
2713 | emulate_push(ctxt, ops); | ||
2714 | break; | ||
2715 | case 0x58 ... 0x5f: /* pop reg */ | ||
2716 | pop_instruction: | ||
2717 | rc = emulate_pop(ctxt, ops, &c->dst.val, c->op_bytes); | ||
2718 | if (rc != X86EMUL_CONTINUE) | ||
2719 | goto done; | ||
2720 | break; | ||
2721 | case 0x60: /* pusha */ | ||
2722 | rc = emulate_pusha(ctxt, ops); | ||
2723 | if (rc != X86EMUL_CONTINUE) | ||
2724 | goto done; | ||
2725 | break; | ||
2726 | case 0x61: /* popa */ | ||
2727 | rc = emulate_popa(ctxt, ops); | ||
2728 | if (rc != X86EMUL_CONTINUE) | ||
2729 | goto done; | ||
2730 | break; | ||
2731 | case 0x63: /* movsxd */ | 3890 | case 0x63: /* movsxd */ |
2732 | if (ctxt->mode != X86EMUL_MODE_PROT64) | 3891 | if (ctxt->mode != X86EMUL_MODE_PROT64) |
2733 | goto cannot_emulate; | 3892 | goto cannot_emulate; |
2734 | c->dst.val = (s32) c->src.val; | 3893 | c->dst.val = (s32) c->src.val; |
2735 | break; | 3894 | break; |
2736 | case 0x68: /* push imm */ | ||
2737 | case 0x6a: /* push imm8 */ | ||
2738 | emulate_push(ctxt, ops); | ||
2739 | break; | ||
2740 | case 0x6c: /* insb */ | 3895 | case 0x6c: /* insb */ |
2741 | case 0x6d: /* insw/insd */ | 3896 | case 0x6d: /* insw/insd */ |
2742 | c->dst.bytes = min(c->dst.bytes, 4u); | 3897 | c->src.val = c->regs[VCPU_REGS_RDX]; |
2743 | if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], | 3898 | goto do_io_in; |
2744 | c->dst.bytes)) { | ||
2745 | emulate_gp(ctxt, 0); | ||
2746 | goto done; | ||
2747 | } | ||
2748 | if (!pio_in_emulated(ctxt, ops, c->dst.bytes, | ||
2749 | c->regs[VCPU_REGS_RDX], &c->dst.val)) | ||
2750 | goto done; /* IO is needed, skip writeback */ | ||
2751 | break; | ||
2752 | case 0x6e: /* outsb */ | 3899 | case 0x6e: /* outsb */ |
2753 | case 0x6f: /* outsw/outsd */ | 3900 | case 0x6f: /* outsw/outsd */ |
2754 | c->src.bytes = min(c->src.bytes, 4u); | 3901 | c->dst.val = c->regs[VCPU_REGS_RDX]; |
2755 | if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], | 3902 | goto do_io_out; |
2756 | c->src.bytes)) { | ||
2757 | emulate_gp(ctxt, 0); | ||
2758 | goto done; | ||
2759 | } | ||
2760 | ops->pio_out_emulated(c->src.bytes, c->regs[VCPU_REGS_RDX], | ||
2761 | &c->src.val, 1, ctxt->vcpu); | ||
2762 | |||
2763 | c->dst.type = OP_NONE; /* nothing to writeback */ | ||
2764 | break; | 3903 | break; |
2765 | case 0x70 ... 0x7f: /* jcc (short) */ | 3904 | case 0x70 ... 0x7f: /* jcc (short) */ |
2766 | if (test_cc(c->b, ctxt->eflags)) | 3905 | if (test_cc(c->b, ctxt->eflags)) |
2767 | jmp_rel(c, c->src.val); | 3906 | jmp_rel(c, c->src.val); |
2768 | break; | 3907 | break; |
2769 | case 0x80 ... 0x83: /* Grp1 */ | ||
2770 | switch (c->modrm_reg) { | ||
2771 | case 0: | ||
2772 | goto add; | ||
2773 | case 1: | ||
2774 | goto or; | ||
2775 | case 2: | ||
2776 | goto adc; | ||
2777 | case 3: | ||
2778 | goto sbb; | ||
2779 | case 4: | ||
2780 | goto and; | ||
2781 | case 5: | ||
2782 | goto sub; | ||
2783 | case 6: | ||
2784 | goto xor; | ||
2785 | case 7: | ||
2786 | goto cmp; | ||
2787 | } | ||
2788 | break; | ||
2789 | case 0x84 ... 0x85: | 3908 | case 0x84 ... 0x85: |
2790 | test: | 3909 | test: |
2791 | emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); | 3910 | emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); |
@@ -2793,38 +3912,24 @@ special_insn: | |||
2793 | case 0x86 ... 0x87: /* xchg */ | 3912 | case 0x86 ... 0x87: /* xchg */ |
2794 | xchg: | 3913 | xchg: |
2795 | /* Write back the register source. */ | 3914 | /* Write back the register source. */ |
2796 | switch (c->dst.bytes) { | 3915 | c->src.val = c->dst.val; |
2797 | case 1: | 3916 | write_register_operand(&c->src); |
2798 | *(u8 *) c->src.ptr = (u8) c->dst.val; | ||
2799 | break; | ||
2800 | case 2: | ||
2801 | *(u16 *) c->src.ptr = (u16) c->dst.val; | ||
2802 | break; | ||
2803 | case 4: | ||
2804 | *c->src.ptr = (u32) c->dst.val; | ||
2805 | break; /* 64b reg: zero-extend */ | ||
2806 | case 8: | ||
2807 | *c->src.ptr = c->dst.val; | ||
2808 | break; | ||
2809 | } | ||
2810 | /* | 3917 | /* |
2811 | * Write back the memory destination with implicit LOCK | 3918 | * Write back the memory destination with implicit LOCK |
2812 | * prefix. | 3919 | * prefix. |
2813 | */ | 3920 | */ |
2814 | c->dst.val = c->src.val; | 3921 | c->dst.val = c->src.orig_val; |
2815 | c->lock_prefix = 1; | 3922 | c->lock_prefix = 1; |
2816 | break; | 3923 | break; |
2817 | case 0x88 ... 0x8b: /* mov */ | ||
2818 | goto mov; | ||
2819 | case 0x8c: /* mov r/m, sreg */ | 3924 | case 0x8c: /* mov r/m, sreg */ |
2820 | if (c->modrm_reg > VCPU_SREG_GS) { | 3925 | if (c->modrm_reg > VCPU_SREG_GS) { |
2821 | emulate_ud(ctxt); | 3926 | rc = emulate_ud(ctxt); |
2822 | goto done; | 3927 | goto done; |
2823 | } | 3928 | } |
2824 | c->dst.val = ops->get_segment_selector(c->modrm_reg, ctxt->vcpu); | 3929 | c->dst.val = get_segment_selector(ctxt, c->modrm_reg); |
2825 | break; | 3930 | break; |
2826 | case 0x8d: /* lea r16/r32, m */ | 3931 | case 0x8d: /* lea r16/r32, m */ |
2827 | c->dst.val = c->modrm_ea; | 3932 | c->dst.val = c->src.addr.mem.ea; |
2828 | break; | 3933 | break; |
2829 | case 0x8e: { /* mov seg, r/m16 */ | 3934 | case 0x8e: { /* mov seg, r/m16 */ |
2830 | uint16_t sel; | 3935 | uint16_t sel; |
@@ -2833,7 +3938,7 @@ special_insn: | |||
2833 | 3938 | ||
2834 | if (c->modrm_reg == VCPU_SREG_CS || | 3939 | if (c->modrm_reg == VCPU_SREG_CS || |
2835 | c->modrm_reg > VCPU_SREG_GS) { | 3940 | c->modrm_reg > VCPU_SREG_GS) { |
2836 | emulate_ud(ctxt); | 3941 | rc = emulate_ud(ctxt); |
2837 | goto done; | 3942 | goto done; |
2838 | } | 3943 | } |
2839 | 3944 | ||
@@ -2846,76 +3951,72 @@ special_insn: | |||
2846 | break; | 3951 | break; |
2847 | } | 3952 | } |
2848 | case 0x8f: /* pop (sole member of Grp1a) */ | 3953 | case 0x8f: /* pop (sole member of Grp1a) */ |
2849 | rc = emulate_grp1a(ctxt, ops); | 3954 | rc = em_grp1a(ctxt); |
2850 | if (rc != X86EMUL_CONTINUE) | ||
2851 | goto done; | ||
2852 | break; | 3955 | break; |
2853 | case 0x90: /* nop / xchg r8,rax */ | 3956 | case 0x90 ... 0x97: /* nop / xchg reg, rax */ |
2854 | if (c->dst.ptr == (unsigned long *)&c->regs[VCPU_REGS_RAX]) { | 3957 | if (c->dst.addr.reg == &c->regs[VCPU_REGS_RAX]) |
2855 | c->dst.type = OP_NONE; /* nop */ | ||
2856 | break; | 3958 | break; |
2857 | } | ||
2858 | case 0x91 ... 0x97: /* xchg reg,rax */ | ||
2859 | c->src.type = OP_REG; | ||
2860 | c->src.bytes = c->op_bytes; | ||
2861 | c->src.ptr = (unsigned long *) &c->regs[VCPU_REGS_RAX]; | ||
2862 | c->src.val = *(c->src.ptr); | ||
2863 | goto xchg; | 3959 | goto xchg; |
2864 | case 0x9c: /* pushf */ | 3960 | case 0x98: /* cbw/cwde/cdqe */ |
2865 | c->src.val = (unsigned long) ctxt->eflags; | 3961 | switch (c->op_bytes) { |
2866 | emulate_push(ctxt, ops); | 3962 | case 2: c->dst.val = (s8)c->dst.val; break; |
2867 | break; | 3963 | case 4: c->dst.val = (s16)c->dst.val; break; |
2868 | case 0x9d: /* popf */ | 3964 | case 8: c->dst.val = (s32)c->dst.val; break; |
2869 | c->dst.type = OP_REG; | 3965 | } |
2870 | c->dst.ptr = (unsigned long *) &ctxt->eflags; | ||
2871 | c->dst.bytes = c->op_bytes; | ||
2872 | rc = emulate_popf(ctxt, ops, &c->dst.val, c->op_bytes); | ||
2873 | if (rc != X86EMUL_CONTINUE) | ||
2874 | goto done; | ||
2875 | break; | 3966 | break; |
2876 | case 0xa0 ... 0xa3: /* mov */ | ||
2877 | case 0xa4 ... 0xa5: /* movs */ | ||
2878 | goto mov; | ||
2879 | case 0xa6 ... 0xa7: /* cmps */ | ||
2880 | c->dst.type = OP_NONE; /* Disable writeback. */ | ||
2881 | DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr); | ||
2882 | goto cmp; | ||
2883 | case 0xa8 ... 0xa9: /* test ax, imm */ | 3967 | case 0xa8 ... 0xa9: /* test ax, imm */ |
2884 | goto test; | 3968 | goto test; |
2885 | case 0xaa ... 0xab: /* stos */ | ||
2886 | c->dst.val = c->regs[VCPU_REGS_RAX]; | ||
2887 | break; | ||
2888 | case 0xac ... 0xad: /* lods */ | ||
2889 | goto mov; | ||
2890 | case 0xae ... 0xaf: /* scas */ | ||
2891 | DPRINTF("Urk! I don't handle SCAS.\n"); | ||
2892 | goto cannot_emulate; | ||
2893 | case 0xb0 ... 0xbf: /* mov r, imm */ | ||
2894 | goto mov; | ||
2895 | case 0xc0 ... 0xc1: | 3969 | case 0xc0 ... 0xc1: |
2896 | emulate_grp2(ctxt); | 3970 | rc = em_grp2(ctxt); |
2897 | break; | 3971 | break; |
2898 | case 0xc3: /* ret */ | 3972 | case 0xc3: /* ret */ |
2899 | c->dst.type = OP_REG; | 3973 | c->dst.type = OP_REG; |
2900 | c->dst.ptr = &c->eip; | 3974 | c->dst.addr.reg = &c->eip; |
2901 | c->dst.bytes = c->op_bytes; | 3975 | c->dst.bytes = c->op_bytes; |
2902 | goto pop_instruction; | 3976 | rc = em_pop(ctxt); |
2903 | case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */ | 3977 | break; |
2904 | mov: | 3978 | case 0xc4: /* les */ |
2905 | c->dst.val = c->src.val; | 3979 | rc = emulate_load_segment(ctxt, ops, VCPU_SREG_ES); |
3980 | break; | ||
3981 | case 0xc5: /* lds */ | ||
3982 | rc = emulate_load_segment(ctxt, ops, VCPU_SREG_DS); | ||
2906 | break; | 3983 | break; |
2907 | case 0xcb: /* ret far */ | 3984 | case 0xcb: /* ret far */ |
2908 | rc = emulate_ret_far(ctxt, ops); | 3985 | rc = emulate_ret_far(ctxt, ops); |
2909 | if (rc != X86EMUL_CONTINUE) | 3986 | break; |
2910 | goto done; | 3987 | case 0xcc: /* int3 */ |
3988 | irq = 3; | ||
3989 | goto do_interrupt; | ||
3990 | case 0xcd: /* int n */ | ||
3991 | irq = c->src.val; | ||
3992 | do_interrupt: | ||
3993 | rc = emulate_int(ctxt, ops, irq); | ||
3994 | break; | ||
3995 | case 0xce: /* into */ | ||
3996 | if (ctxt->eflags & EFLG_OF) { | ||
3997 | irq = 4; | ||
3998 | goto do_interrupt; | ||
3999 | } | ||
4000 | break; | ||
4001 | case 0xcf: /* iret */ | ||
4002 | rc = emulate_iret(ctxt, ops); | ||
2911 | break; | 4003 | break; |
2912 | case 0xd0 ... 0xd1: /* Grp2 */ | 4004 | case 0xd0 ... 0xd1: /* Grp2 */ |
2913 | c->src.val = 1; | 4005 | rc = em_grp2(ctxt); |
2914 | emulate_grp2(ctxt); | ||
2915 | break; | 4006 | break; |
2916 | case 0xd2 ... 0xd3: /* Grp2 */ | 4007 | case 0xd2 ... 0xd3: /* Grp2 */ |
2917 | c->src.val = c->regs[VCPU_REGS_RCX]; | 4008 | c->src.val = c->regs[VCPU_REGS_RCX]; |
2918 | emulate_grp2(ctxt); | 4009 | rc = em_grp2(ctxt); |
4010 | break; | ||
4011 | case 0xe0 ... 0xe2: /* loop/loopz/loopnz */ | ||
4012 | register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1); | ||
4013 | if (address_mask(c, c->regs[VCPU_REGS_RCX]) != 0 && | ||
4014 | (c->b == 0xe2 || test_cc(c->b ^ 0x5, ctxt->eflags))) | ||
4015 | jmp_rel(c, c->src.val); | ||
4016 | break; | ||
4017 | case 0xe3: /* jcxz/jecxz/jrcxz */ | ||
4018 | if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) | ||
4019 | jmp_rel(c, c->src.val); | ||
2919 | break; | 4020 | break; |
2920 | case 0xe4: /* inb */ | 4021 | case 0xe4: /* inb */ |
2921 | case 0xe5: /* in */ | 4022 | case 0xe5: /* in */ |
@@ -2927,23 +4028,14 @@ special_insn: | |||
2927 | long int rel = c->src.val; | 4028 | long int rel = c->src.val; |
2928 | c->src.val = (unsigned long) c->eip; | 4029 | c->src.val = (unsigned long) c->eip; |
2929 | jmp_rel(c, rel); | 4030 | jmp_rel(c, rel); |
2930 | emulate_push(ctxt, ops); | 4031 | rc = em_push(ctxt); |
2931 | break; | 4032 | break; |
2932 | } | 4033 | } |
2933 | case 0xe9: /* jmp rel */ | 4034 | case 0xe9: /* jmp rel */ |
2934 | goto jmp; | 4035 | goto jmp; |
2935 | case 0xea: { /* jmp far */ | 4036 | case 0xea: /* jmp far */ |
2936 | unsigned short sel; | 4037 | rc = em_jmp_far(ctxt); |
2937 | jump_far: | ||
2938 | memcpy(&sel, c->src.valptr + c->op_bytes, 2); | ||
2939 | |||
2940 | if (load_segment_descriptor(ctxt, ops, sel, VCPU_SREG_CS)) | ||
2941 | goto done; | ||
2942 | |||
2943 | c->eip = 0; | ||
2944 | memcpy(&c->eip, c->src.valptr, c->op_bytes); | ||
2945 | break; | 4038 | break; |
2946 | } | ||
2947 | case 0xeb: | 4039 | case 0xeb: |
2948 | jmp: /* jmp rel short */ | 4040 | jmp: /* jmp rel short */ |
2949 | jmp_rel(c, c->src.val); | 4041 | jmp_rel(c, c->src.val); |
@@ -2951,87 +4043,71 @@ special_insn: | |||
2951 | break; | 4043 | break; |
2952 | case 0xec: /* in al,dx */ | 4044 | case 0xec: /* in al,dx */ |
2953 | case 0xed: /* in (e/r)ax,dx */ | 4045 | case 0xed: /* in (e/r)ax,dx */ |
2954 | c->src.val = c->regs[VCPU_REGS_RDX]; | ||
2955 | do_io_in: | 4046 | do_io_in: |
2956 | c->dst.bytes = min(c->dst.bytes, 4u); | ||
2957 | if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) { | ||
2958 | emulate_gp(ctxt, 0); | ||
2959 | goto done; | ||
2960 | } | ||
2961 | if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val, | 4047 | if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val, |
2962 | &c->dst.val)) | 4048 | &c->dst.val)) |
2963 | goto done; /* IO is needed */ | 4049 | goto done; /* IO is needed */ |
2964 | break; | 4050 | break; |
2965 | case 0xee: /* out dx,al */ | 4051 | case 0xee: /* out dx,al */ |
2966 | case 0xef: /* out dx,(e/r)ax */ | 4052 | case 0xef: /* out dx,(e/r)ax */ |
2967 | c->src.val = c->regs[VCPU_REGS_RDX]; | ||
2968 | do_io_out: | 4053 | do_io_out: |
2969 | c->dst.bytes = min(c->dst.bytes, 4u); | 4054 | ops->pio_out_emulated(ctxt, c->src.bytes, c->dst.val, |
2970 | if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) { | 4055 | &c->src.val, 1); |
2971 | emulate_gp(ctxt, 0); | ||
2972 | goto done; | ||
2973 | } | ||
2974 | ops->pio_out_emulated(c->dst.bytes, c->src.val, &c->dst.val, 1, | ||
2975 | ctxt->vcpu); | ||
2976 | c->dst.type = OP_NONE; /* Disable writeback. */ | 4056 | c->dst.type = OP_NONE; /* Disable writeback. */ |
2977 | break; | 4057 | break; |
2978 | case 0xf4: /* hlt */ | 4058 | case 0xf4: /* hlt */ |
2979 | ctxt->vcpu->arch.halt_request = 1; | 4059 | ctxt->ops->halt(ctxt); |
2980 | break; | 4060 | break; |
2981 | case 0xf5: /* cmc */ | 4061 | case 0xf5: /* cmc */ |
2982 | /* complement carry flag from eflags reg */ | 4062 | /* complement carry flag from eflags reg */ |
2983 | ctxt->eflags ^= EFLG_CF; | 4063 | ctxt->eflags ^= EFLG_CF; |
2984 | c->dst.type = OP_NONE; /* Disable writeback. */ | ||
2985 | break; | 4064 | break; |
2986 | case 0xf6 ... 0xf7: /* Grp3 */ | 4065 | case 0xf6 ... 0xf7: /* Grp3 */ |
2987 | if (!emulate_grp3(ctxt, ops)) | 4066 | rc = em_grp3(ctxt); |
2988 | goto cannot_emulate; | ||
2989 | break; | 4067 | break; |
2990 | case 0xf8: /* clc */ | 4068 | case 0xf8: /* clc */ |
2991 | ctxt->eflags &= ~EFLG_CF; | 4069 | ctxt->eflags &= ~EFLG_CF; |
2992 | c->dst.type = OP_NONE; /* Disable writeback. */ | 4070 | break; |
4071 | case 0xf9: /* stc */ | ||
4072 | ctxt->eflags |= EFLG_CF; | ||
2993 | break; | 4073 | break; |
2994 | case 0xfa: /* cli */ | 4074 | case 0xfa: /* cli */ |
2995 | if (emulator_bad_iopl(ctxt, ops)) { | 4075 | if (emulator_bad_iopl(ctxt, ops)) { |
2996 | emulate_gp(ctxt, 0); | 4076 | rc = emulate_gp(ctxt, 0); |
2997 | goto done; | 4077 | goto done; |
2998 | } else { | 4078 | } else |
2999 | ctxt->eflags &= ~X86_EFLAGS_IF; | 4079 | ctxt->eflags &= ~X86_EFLAGS_IF; |
3000 | c->dst.type = OP_NONE; /* Disable writeback. */ | ||
3001 | } | ||
3002 | break; | 4080 | break; |
3003 | case 0xfb: /* sti */ | 4081 | case 0xfb: /* sti */ |
3004 | if (emulator_bad_iopl(ctxt, ops)) { | 4082 | if (emulator_bad_iopl(ctxt, ops)) { |
3005 | emulate_gp(ctxt, 0); | 4083 | rc = emulate_gp(ctxt, 0); |
3006 | goto done; | 4084 | goto done; |
3007 | } else { | 4085 | } else { |
3008 | ctxt->interruptibility = KVM_X86_SHADOW_INT_STI; | 4086 | ctxt->interruptibility = KVM_X86_SHADOW_INT_STI; |
3009 | ctxt->eflags |= X86_EFLAGS_IF; | 4087 | ctxt->eflags |= X86_EFLAGS_IF; |
3010 | c->dst.type = OP_NONE; /* Disable writeback. */ | ||
3011 | } | 4088 | } |
3012 | break; | 4089 | break; |
3013 | case 0xfc: /* cld */ | 4090 | case 0xfc: /* cld */ |
3014 | ctxt->eflags &= ~EFLG_DF; | 4091 | ctxt->eflags &= ~EFLG_DF; |
3015 | c->dst.type = OP_NONE; /* Disable writeback. */ | ||
3016 | break; | 4092 | break; |
3017 | case 0xfd: /* std */ | 4093 | case 0xfd: /* std */ |
3018 | ctxt->eflags |= EFLG_DF; | 4094 | ctxt->eflags |= EFLG_DF; |
3019 | c->dst.type = OP_NONE; /* Disable writeback. */ | ||
3020 | break; | 4095 | break; |
3021 | case 0xfe: /* Grp4 */ | 4096 | case 0xfe: /* Grp4 */ |
3022 | grp45: | 4097 | rc = em_grp45(ctxt); |
3023 | rc = emulate_grp45(ctxt, ops); | ||
3024 | if (rc != X86EMUL_CONTINUE) | ||
3025 | goto done; | ||
3026 | break; | 4098 | break; |
3027 | case 0xff: /* Grp5 */ | 4099 | case 0xff: /* Grp5 */ |
3028 | if (c->modrm_reg == 5) | 4100 | rc = em_grp45(ctxt); |
3029 | goto jump_far; | 4101 | break; |
3030 | goto grp45; | 4102 | default: |
4103 | goto cannot_emulate; | ||
3031 | } | 4104 | } |
3032 | 4105 | ||
4106 | if (rc != X86EMUL_CONTINUE) | ||
4107 | goto done; | ||
4108 | |||
3033 | writeback: | 4109 | writeback: |
3034 | rc = writeback(ctxt, ops); | 4110 | rc = writeback(ctxt); |
3035 | if (rc != X86EMUL_CONTINUE) | 4111 | if (rc != X86EMUL_CONTINUE) |
3036 | goto done; | 4112 | goto done; |
3037 | 4113 | ||
@@ -3042,165 +4118,82 @@ writeback: | |||
3042 | c->dst.type = saved_dst_type; | 4118 | c->dst.type = saved_dst_type; |
3043 | 4119 | ||
3044 | if ((c->d & SrcMask) == SrcSI) | 4120 | if ((c->d & SrcMask) == SrcSI) |
3045 | string_addr_inc(ctxt, seg_override_base(ctxt, ops, c), | 4121 | string_addr_inc(ctxt, seg_override(ctxt, c), |
3046 | VCPU_REGS_RSI, &c->src); | 4122 | VCPU_REGS_RSI, &c->src); |
3047 | 4123 | ||
3048 | if ((c->d & DstMask) == DstDI) | 4124 | if ((c->d & DstMask) == DstDI) |
3049 | string_addr_inc(ctxt, es_base(ctxt, ops), VCPU_REGS_RDI, | 4125 | string_addr_inc(ctxt, VCPU_SREG_ES, VCPU_REGS_RDI, |
3050 | &c->dst); | 4126 | &c->dst); |
3051 | 4127 | ||
3052 | if (c->rep_prefix && (c->d & String)) { | 4128 | if (c->rep_prefix && (c->d & String)) { |
3053 | struct read_cache *rc = &ctxt->decode.io_read; | 4129 | struct read_cache *r = &ctxt->decode.io_read; |
3054 | register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1); | 4130 | register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1); |
3055 | /* | 4131 | |
3056 | * Re-enter guest when pio read ahead buffer is empty or, | 4132 | if (!string_insn_completed(ctxt)) { |
3057 | * if it is not used, after each 1024 iteration. | 4133 | /* |
3058 | */ | 4134 | * Re-enter guest when pio read ahead buffer is empty |
3059 | if ((rc->end == 0 && !(c->regs[VCPU_REGS_RCX] & 0x3ff)) || | 4135 | * or, if it is not used, after each 1024 iteration. |
3060 | (rc->end != 0 && rc->end == rc->pos)) | 4136 | */ |
3061 | ctxt->restart = false; | 4137 | if ((r->end != 0 || c->regs[VCPU_REGS_RCX] & 0x3ff) && |
4138 | (r->end == 0 || r->end != r->pos)) { | ||
4139 | /* | ||
4140 | * Reset read cache. Usually happens before | ||
4141 | * decode, but since instruction is restarted | ||
4142 | * we have to do it here. | ||
4143 | */ | ||
4144 | ctxt->decode.mem_read.end = 0; | ||
4145 | return EMULATION_RESTART; | ||
4146 | } | ||
4147 | goto done; /* skip rip writeback */ | ||
4148 | } | ||
3062 | } | 4149 | } |
3063 | /* | 4150 | |
3064 | * reset read cache here in case string instruction is restared | ||
3065 | * without decoding | ||
3066 | */ | ||
3067 | ctxt->decode.mem_read.end = 0; | ||
3068 | ctxt->eip = c->eip; | 4151 | ctxt->eip = c->eip; |
3069 | 4152 | ||
3070 | done: | 4153 | done: |
3071 | return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; | 4154 | if (rc == X86EMUL_PROPAGATE_FAULT) |
4155 | ctxt->have_exception = true; | ||
4156 | if (rc == X86EMUL_INTERCEPTED) | ||
4157 | return EMULATION_INTERCEPTED; | ||
4158 | |||
4159 | return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; | ||
3072 | 4160 | ||
3073 | twobyte_insn: | 4161 | twobyte_insn: |
3074 | switch (c->b) { | 4162 | switch (c->b) { |
3075 | case 0x01: /* lgdt, lidt, lmsw */ | ||
3076 | switch (c->modrm_reg) { | ||
3077 | u16 size; | ||
3078 | unsigned long address; | ||
3079 | |||
3080 | case 0: /* vmcall */ | ||
3081 | if (c->modrm_mod != 3 || c->modrm_rm != 1) | ||
3082 | goto cannot_emulate; | ||
3083 | |||
3084 | rc = kvm_fix_hypercall(ctxt->vcpu); | ||
3085 | if (rc != X86EMUL_CONTINUE) | ||
3086 | goto done; | ||
3087 | |||
3088 | /* Let the processor re-execute the fixed hypercall */ | ||
3089 | c->eip = ctxt->eip; | ||
3090 | /* Disable writeback. */ | ||
3091 | c->dst.type = OP_NONE; | ||
3092 | break; | ||
3093 | case 2: /* lgdt */ | ||
3094 | rc = read_descriptor(ctxt, ops, c->src.ptr, | ||
3095 | &size, &address, c->op_bytes); | ||
3096 | if (rc != X86EMUL_CONTINUE) | ||
3097 | goto done; | ||
3098 | realmode_lgdt(ctxt->vcpu, size, address); | ||
3099 | /* Disable writeback. */ | ||
3100 | c->dst.type = OP_NONE; | ||
3101 | break; | ||
3102 | case 3: /* lidt/vmmcall */ | ||
3103 | if (c->modrm_mod == 3) { | ||
3104 | switch (c->modrm_rm) { | ||
3105 | case 1: | ||
3106 | rc = kvm_fix_hypercall(ctxt->vcpu); | ||
3107 | if (rc != X86EMUL_CONTINUE) | ||
3108 | goto done; | ||
3109 | break; | ||
3110 | default: | ||
3111 | goto cannot_emulate; | ||
3112 | } | ||
3113 | } else { | ||
3114 | rc = read_descriptor(ctxt, ops, c->src.ptr, | ||
3115 | &size, &address, | ||
3116 | c->op_bytes); | ||
3117 | if (rc != X86EMUL_CONTINUE) | ||
3118 | goto done; | ||
3119 | realmode_lidt(ctxt->vcpu, size, address); | ||
3120 | } | ||
3121 | /* Disable writeback. */ | ||
3122 | c->dst.type = OP_NONE; | ||
3123 | break; | ||
3124 | case 4: /* smsw */ | ||
3125 | c->dst.bytes = 2; | ||
3126 | c->dst.val = ops->get_cr(0, ctxt->vcpu); | ||
3127 | break; | ||
3128 | case 6: /* lmsw */ | ||
3129 | ops->set_cr(0, (ops->get_cr(0, ctxt->vcpu) & ~0x0ful) | | ||
3130 | (c->src.val & 0x0f), ctxt->vcpu); | ||
3131 | c->dst.type = OP_NONE; | ||
3132 | break; | ||
3133 | case 5: /* not defined */ | ||
3134 | emulate_ud(ctxt); | ||
3135 | goto done; | ||
3136 | case 7: /* invlpg*/ | ||
3137 | emulate_invlpg(ctxt->vcpu, c->modrm_ea); | ||
3138 | /* Disable writeback. */ | ||
3139 | c->dst.type = OP_NONE; | ||
3140 | break; | ||
3141 | default: | ||
3142 | goto cannot_emulate; | ||
3143 | } | ||
3144 | break; | ||
3145 | case 0x05: /* syscall */ | 4163 | case 0x05: /* syscall */ |
3146 | rc = emulate_syscall(ctxt, ops); | 4164 | rc = emulate_syscall(ctxt, ops); |
3147 | if (rc != X86EMUL_CONTINUE) | ||
3148 | goto done; | ||
3149 | else | ||
3150 | goto writeback; | ||
3151 | break; | 4165 | break; |
3152 | case 0x06: | 4166 | case 0x06: |
3153 | emulate_clts(ctxt->vcpu); | 4167 | rc = em_clts(ctxt); |
3154 | c->dst.type = OP_NONE; | ||
3155 | break; | 4168 | break; |
3156 | case 0x09: /* wbinvd */ | 4169 | case 0x09: /* wbinvd */ |
3157 | kvm_emulate_wbinvd(ctxt->vcpu); | 4170 | (ctxt->ops->wbinvd)(ctxt); |
3158 | c->dst.type = OP_NONE; | ||
3159 | break; | 4171 | break; |
3160 | case 0x08: /* invd */ | 4172 | case 0x08: /* invd */ |
3161 | case 0x0d: /* GrpP (prefetch) */ | 4173 | case 0x0d: /* GrpP (prefetch) */ |
3162 | case 0x18: /* Grp16 (prefetch/nop) */ | 4174 | case 0x18: /* Grp16 (prefetch/nop) */ |
3163 | c->dst.type = OP_NONE; | ||
3164 | break; | 4175 | break; |
3165 | case 0x20: /* mov cr, reg */ | 4176 | case 0x20: /* mov cr, reg */ |
3166 | switch (c->modrm_reg) { | 4177 | c->dst.val = ops->get_cr(ctxt, c->modrm_reg); |
3167 | case 1: | ||
3168 | case 5 ... 7: | ||
3169 | case 9 ... 15: | ||
3170 | emulate_ud(ctxt); | ||
3171 | goto done; | ||
3172 | } | ||
3173 | c->regs[c->modrm_rm] = ops->get_cr(c->modrm_reg, ctxt->vcpu); | ||
3174 | c->dst.type = OP_NONE; /* no writeback */ | ||
3175 | break; | 4178 | break; |
3176 | case 0x21: /* mov from dr to reg */ | 4179 | case 0x21: /* mov from dr to reg */ |
3177 | if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && | 4180 | ops->get_dr(ctxt, c->modrm_reg, &c->dst.val); |
3178 | (c->modrm_reg == 4 || c->modrm_reg == 5)) { | ||
3179 | emulate_ud(ctxt); | ||
3180 | goto done; | ||
3181 | } | ||
3182 | ops->get_dr(c->modrm_reg, &c->regs[c->modrm_rm], ctxt->vcpu); | ||
3183 | c->dst.type = OP_NONE; /* no writeback */ | ||
3184 | break; | 4181 | break; |
3185 | case 0x22: /* mov reg, cr */ | 4182 | case 0x22: /* mov reg, cr */ |
3186 | if (ops->set_cr(c->modrm_reg, c->modrm_val, ctxt->vcpu)) { | 4183 | if (ops->set_cr(ctxt, c->modrm_reg, c->src.val)) { |
3187 | emulate_gp(ctxt, 0); | 4184 | emulate_gp(ctxt, 0); |
4185 | rc = X86EMUL_PROPAGATE_FAULT; | ||
3188 | goto done; | 4186 | goto done; |
3189 | } | 4187 | } |
3190 | c->dst.type = OP_NONE; | 4188 | c->dst.type = OP_NONE; |
3191 | break; | 4189 | break; |
3192 | case 0x23: /* mov from reg to dr */ | 4190 | case 0x23: /* mov from reg to dr */ |
3193 | if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && | 4191 | if (ops->set_dr(ctxt, c->modrm_reg, c->src.val & |
3194 | (c->modrm_reg == 4 || c->modrm_reg == 5)) { | ||
3195 | emulate_ud(ctxt); | ||
3196 | goto done; | ||
3197 | } | ||
3198 | |||
3199 | if (ops->set_dr(c->modrm_reg, c->regs[c->modrm_rm] & | ||
3200 | ((ctxt->mode == X86EMUL_MODE_PROT64) ? | 4192 | ((ctxt->mode == X86EMUL_MODE_PROT64) ? |
3201 | ~0ULL : ~0U), ctxt->vcpu) < 0) { | 4193 | ~0ULL : ~0U)) < 0) { |
3202 | /* #UD condition is already handled by the code above */ | 4194 | /* #UD condition is already handled by the code above */ |
3203 | emulate_gp(ctxt, 0); | 4195 | emulate_gp(ctxt, 0); |
4196 | rc = X86EMUL_PROPAGATE_FAULT; | ||
3204 | goto done; | 4197 | goto done; |
3205 | } | 4198 | } |
3206 | 4199 | ||
@@ -3210,38 +4203,30 @@ twobyte_insn: | |||
3210 | /* wrmsr */ | 4203 | /* wrmsr */ |
3211 | msr_data = (u32)c->regs[VCPU_REGS_RAX] | 4204 | msr_data = (u32)c->regs[VCPU_REGS_RAX] |
3212 | | ((u64)c->regs[VCPU_REGS_RDX] << 32); | 4205 | | ((u64)c->regs[VCPU_REGS_RDX] << 32); |
3213 | if (ops->set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) { | 4206 | if (ops->set_msr(ctxt, c->regs[VCPU_REGS_RCX], msr_data)) { |
3214 | emulate_gp(ctxt, 0); | 4207 | emulate_gp(ctxt, 0); |
4208 | rc = X86EMUL_PROPAGATE_FAULT; | ||
3215 | goto done; | 4209 | goto done; |
3216 | } | 4210 | } |
3217 | rc = X86EMUL_CONTINUE; | 4211 | rc = X86EMUL_CONTINUE; |
3218 | c->dst.type = OP_NONE; | ||
3219 | break; | 4212 | break; |
3220 | case 0x32: | 4213 | case 0x32: |
3221 | /* rdmsr */ | 4214 | /* rdmsr */ |
3222 | if (ops->get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) { | 4215 | if (ops->get_msr(ctxt, c->regs[VCPU_REGS_RCX], &msr_data)) { |
3223 | emulate_gp(ctxt, 0); | 4216 | emulate_gp(ctxt, 0); |
4217 | rc = X86EMUL_PROPAGATE_FAULT; | ||
3224 | goto done; | 4218 | goto done; |
3225 | } else { | 4219 | } else { |
3226 | c->regs[VCPU_REGS_RAX] = (u32)msr_data; | 4220 | c->regs[VCPU_REGS_RAX] = (u32)msr_data; |
3227 | c->regs[VCPU_REGS_RDX] = msr_data >> 32; | 4221 | c->regs[VCPU_REGS_RDX] = msr_data >> 32; |
3228 | } | 4222 | } |
3229 | rc = X86EMUL_CONTINUE; | 4223 | rc = X86EMUL_CONTINUE; |
3230 | c->dst.type = OP_NONE; | ||
3231 | break; | 4224 | break; |
3232 | case 0x34: /* sysenter */ | 4225 | case 0x34: /* sysenter */ |
3233 | rc = emulate_sysenter(ctxt, ops); | 4226 | rc = emulate_sysenter(ctxt, ops); |
3234 | if (rc != X86EMUL_CONTINUE) | ||
3235 | goto done; | ||
3236 | else | ||
3237 | goto writeback; | ||
3238 | break; | 4227 | break; |
3239 | case 0x35: /* sysexit */ | 4228 | case 0x35: /* sysexit */ |
3240 | rc = emulate_sysexit(ctxt, ops); | 4229 | rc = emulate_sysexit(ctxt, ops); |
3241 | if (rc != X86EMUL_CONTINUE) | ||
3242 | goto done; | ||
3243 | else | ||
3244 | goto writeback; | ||
3245 | break; | 4230 | break; |
3246 | case 0x40 ... 0x4f: /* cmov */ | 4231 | case 0x40 ... 0x4f: /* cmov */ |
3247 | c->dst.val = c->dst.orig_val = c->src.val; | 4232 | c->dst.val = c->dst.orig_val = c->src.val; |
@@ -3251,15 +4236,15 @@ twobyte_insn: | |||
3251 | case 0x80 ... 0x8f: /* jnz rel, etc*/ | 4236 | case 0x80 ... 0x8f: /* jnz rel, etc*/ |
3252 | if (test_cc(c->b, ctxt->eflags)) | 4237 | if (test_cc(c->b, ctxt->eflags)) |
3253 | jmp_rel(c, c->src.val); | 4238 | jmp_rel(c, c->src.val); |
3254 | c->dst.type = OP_NONE; | 4239 | break; |
4240 | case 0x90 ... 0x9f: /* setcc r/m8 */ | ||
4241 | c->dst.val = test_cc(c->b, ctxt->eflags); | ||
3255 | break; | 4242 | break; |
3256 | case 0xa0: /* push fs */ | 4243 | case 0xa0: /* push fs */ |
3257 | emulate_push_sreg(ctxt, ops, VCPU_SREG_FS); | 4244 | rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_FS); |
3258 | break; | 4245 | break; |
3259 | case 0xa1: /* pop fs */ | 4246 | case 0xa1: /* pop fs */ |
3260 | rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS); | 4247 | rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS); |
3261 | if (rc != X86EMUL_CONTINUE) | ||
3262 | goto done; | ||
3263 | break; | 4248 | break; |
3264 | case 0xa3: | 4249 | case 0xa3: |
3265 | bt: /* bt */ | 4250 | bt: /* bt */ |
@@ -3273,17 +4258,13 @@ twobyte_insn: | |||
3273 | emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags); | 4258 | emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags); |
3274 | break; | 4259 | break; |
3275 | case 0xa8: /* push gs */ | 4260 | case 0xa8: /* push gs */ |
3276 | emulate_push_sreg(ctxt, ops, VCPU_SREG_GS); | 4261 | rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_GS); |
3277 | break; | 4262 | break; |
3278 | case 0xa9: /* pop gs */ | 4263 | case 0xa9: /* pop gs */ |
3279 | rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS); | 4264 | rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS); |
3280 | if (rc != X86EMUL_CONTINUE) | ||
3281 | goto done; | ||
3282 | break; | 4265 | break; |
3283 | case 0xab: | 4266 | case 0xab: |
3284 | bts: /* bts */ | 4267 | bts: /* bts */ |
3285 | /* only subword offset */ | ||
3286 | c->src.val &= (c->dst.bytes << 3) - 1; | ||
3287 | emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags); | 4268 | emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags); |
3288 | break; | 4269 | break; |
3289 | case 0xac: /* shrd imm8, r, r/m */ | 4270 | case 0xac: /* shrd imm8, r, r/m */ |
@@ -3306,15 +4287,22 @@ twobyte_insn: | |||
3306 | } else { | 4287 | } else { |
3307 | /* Failure: write the value we saw to EAX. */ | 4288 | /* Failure: write the value we saw to EAX. */ |
3308 | c->dst.type = OP_REG; | 4289 | c->dst.type = OP_REG; |
3309 | c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; | 4290 | c->dst.addr.reg = (unsigned long *)&c->regs[VCPU_REGS_RAX]; |
3310 | } | 4291 | } |
3311 | break; | 4292 | break; |
4293 | case 0xb2: /* lss */ | ||
4294 | rc = emulate_load_segment(ctxt, ops, VCPU_SREG_SS); | ||
4295 | break; | ||
3312 | case 0xb3: | 4296 | case 0xb3: |
3313 | btr: /* btr */ | 4297 | btr: /* btr */ |
3314 | /* only subword offset */ | ||
3315 | c->src.val &= (c->dst.bytes << 3) - 1; | ||
3316 | emulate_2op_SrcV_nobyte("btr", c->src, c->dst, ctxt->eflags); | 4298 | emulate_2op_SrcV_nobyte("btr", c->src, c->dst, ctxt->eflags); |
3317 | break; | 4299 | break; |
4300 | case 0xb4: /* lfs */ | ||
4301 | rc = emulate_load_segment(ctxt, ops, VCPU_SREG_FS); | ||
4302 | break; | ||
4303 | case 0xb5: /* lgs */ | ||
4304 | rc = emulate_load_segment(ctxt, ops, VCPU_SREG_GS); | ||
4305 | break; | ||
3318 | case 0xb6 ... 0xb7: /* movzx */ | 4306 | case 0xb6 ... 0xb7: /* movzx */ |
3319 | c->dst.bytes = c->op_bytes; | 4307 | c->dst.bytes = c->op_bytes; |
3320 | c->dst.val = (c->d & ByteOp) ? (u8) c->src.val | 4308 | c->dst.val = (c->d & ByteOp) ? (u8) c->src.val |
@@ -3334,29 +4322,60 @@ twobyte_insn: | |||
3334 | break; | 4322 | break; |
3335 | case 0xbb: | 4323 | case 0xbb: |
3336 | btc: /* btc */ | 4324 | btc: /* btc */ |
3337 | /* only subword offset */ | ||
3338 | c->src.val &= (c->dst.bytes << 3) - 1; | ||
3339 | emulate_2op_SrcV_nobyte("btc", c->src, c->dst, ctxt->eflags); | 4325 | emulate_2op_SrcV_nobyte("btc", c->src, c->dst, ctxt->eflags); |
3340 | break; | 4326 | break; |
4327 | case 0xbc: { /* bsf */ | ||
4328 | u8 zf; | ||
4329 | __asm__ ("bsf %2, %0; setz %1" | ||
4330 | : "=r"(c->dst.val), "=q"(zf) | ||
4331 | : "r"(c->src.val)); | ||
4332 | ctxt->eflags &= ~X86_EFLAGS_ZF; | ||
4333 | if (zf) { | ||
4334 | ctxt->eflags |= X86_EFLAGS_ZF; | ||
4335 | c->dst.type = OP_NONE; /* Disable writeback. */ | ||
4336 | } | ||
4337 | break; | ||
4338 | } | ||
4339 | case 0xbd: { /* bsr */ | ||
4340 | u8 zf; | ||
4341 | __asm__ ("bsr %2, %0; setz %1" | ||
4342 | : "=r"(c->dst.val), "=q"(zf) | ||
4343 | : "r"(c->src.val)); | ||
4344 | ctxt->eflags &= ~X86_EFLAGS_ZF; | ||
4345 | if (zf) { | ||
4346 | ctxt->eflags |= X86_EFLAGS_ZF; | ||
4347 | c->dst.type = OP_NONE; /* Disable writeback. */ | ||
4348 | } | ||
4349 | break; | ||
4350 | } | ||
3341 | case 0xbe ... 0xbf: /* movsx */ | 4351 | case 0xbe ... 0xbf: /* movsx */ |
3342 | c->dst.bytes = c->op_bytes; | 4352 | c->dst.bytes = c->op_bytes; |
3343 | c->dst.val = (c->d & ByteOp) ? (s8) c->src.val : | 4353 | c->dst.val = (c->d & ByteOp) ? (s8) c->src.val : |
3344 | (s16) c->src.val; | 4354 | (s16) c->src.val; |
3345 | break; | 4355 | break; |
4356 | case 0xc0 ... 0xc1: /* xadd */ | ||
4357 | emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags); | ||
4358 | /* Write back the register source. */ | ||
4359 | c->src.val = c->dst.orig_val; | ||
4360 | write_register_operand(&c->src); | ||
4361 | break; | ||
3346 | case 0xc3: /* movnti */ | 4362 | case 0xc3: /* movnti */ |
3347 | c->dst.bytes = c->op_bytes; | 4363 | c->dst.bytes = c->op_bytes; |
3348 | c->dst.val = (c->op_bytes == 4) ? (u32) c->src.val : | 4364 | c->dst.val = (c->op_bytes == 4) ? (u32) c->src.val : |
3349 | (u64) c->src.val; | 4365 | (u64) c->src.val; |
3350 | break; | 4366 | break; |
3351 | case 0xc7: /* Grp9 (cmpxchg8b) */ | 4367 | case 0xc7: /* Grp9 (cmpxchg8b) */ |
3352 | rc = emulate_grp9(ctxt, ops); | 4368 | rc = em_grp9(ctxt); |
3353 | if (rc != X86EMUL_CONTINUE) | ||
3354 | goto done; | ||
3355 | break; | 4369 | break; |
4370 | default: | ||
4371 | goto cannot_emulate; | ||
3356 | } | 4372 | } |
4373 | |||
4374 | if (rc != X86EMUL_CONTINUE) | ||
4375 | goto done; | ||
4376 | |||
3357 | goto writeback; | 4377 | goto writeback; |
3358 | 4378 | ||
3359 | cannot_emulate: | 4379 | cannot_emulate: |
3360 | DPRINTF("Cannot emulate %02x\n", c->b); | 4380 | return EMULATION_FAILED; |
3361 | return -1; | ||
3362 | } | 4381 | } |
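
The new 0xbc/0xbd handlers added above lean on the BSF/BSR flag behaviour: a zero source sets ZF and leaves the destination contents undefined, which is why the emulator raises ZF itself and disables writeback in that case. A small userspace sketch (x86-only, illustrative, not taken from the patch) exercising the same inline-asm pattern:

	#include <stdio.h>

	int main(void)
	{
		unsigned long src = 0x48, dst = 0;
		unsigned char zf;

		/* bsf: index of the lowest set bit; setz captures ZF afterwards */
		__asm__ ("bsf %2, %0; setz %1"
			 : "=r" (dst), "=q" (zf)
			 : "r" (src));
		printf("bsf(0x%lx) = %lu, zf = %u\n", src, dst, zf);	/* expect 3, 0 */

		src = 0;
		__asm__ ("bsf %2, %0; setz %1"
			 : "=r" (dst), "=q" (zf)
			 : "r" (src));
		printf("zero source: zf = %u (destination undefined, writeback would be skipped)\n", zf);
		return 0;
	}

Mirroring the emulator, the sketch only trusts the destination when ZF came back clear; on a zero source it reports ZF and ignores the destination, just as the handler sets dst.type to OP_NONE.
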
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index ddeb2314b522..efad72385058 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c | |||
@@ -5,7 +5,7 @@ | |||
5 | * Copyright (c) 2006 Intel Corporation | 5 | * Copyright (c) 2006 Intel Corporation |
6 | * Copyright (c) 2007 Keir Fraser, XenSource Inc | 6 | * Copyright (c) 2007 Keir Fraser, XenSource Inc |
7 | * Copyright (c) 2008 Intel Corporation | 7 | * Copyright (c) 2008 Intel Corporation |
8 | * Copyright 2009 Red Hat, Inc. and/or its affilates. | 8 | * Copyright 2009 Red Hat, Inc. and/or its affiliates. |
9 | * | 9 | * |
10 | * Permission is hereby granted, free of charge, to any person obtaining a copy | 10 | * Permission is hereby granted, free of charge, to any person obtaining a copy |
11 | * of this software and associated documentation files (the "Software"), to deal | 11 | * of this software and associated documentation files (the "Software"), to deal |
@@ -232,15 +232,6 @@ static void pit_latch_status(struct kvm *kvm, int channel) | |||
232 | } | 232 | } |
233 | } | 233 | } |
234 | 234 | ||
235 | int pit_has_pending_timer(struct kvm_vcpu *vcpu) | ||
236 | { | ||
237 | struct kvm_pit *pit = vcpu->kvm->arch.vpit; | ||
238 | |||
239 | if (pit && kvm_vcpu_is_bsp(vcpu) && pit->pit_state.irq_ack) | ||
240 | return atomic_read(&pit->pit_state.pit_timer.pending); | ||
241 | return 0; | ||
242 | } | ||
243 | |||
244 | static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian) | 235 | static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian) |
245 | { | 236 | { |
246 | struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state, | 237 | struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state, |
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index 46d08ca0b48f..51a97426e791 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h | |||
@@ -33,7 +33,6 @@ struct kvm_kpit_state { | |||
33 | }; | 33 | }; |
34 | 34 | ||
35 | struct kvm_pit { | 35 | struct kvm_pit { |
36 | unsigned long base_addresss; | ||
37 | struct kvm_io_device dev; | 36 | struct kvm_io_device dev; |
38 | struct kvm_io_device speaker_dev; | 37 | struct kvm_io_device speaker_dev; |
39 | struct kvm *kvm; | 38 | struct kvm *kvm; |
@@ -51,7 +50,6 @@ struct kvm_pit { | |||
51 | #define KVM_MAX_PIT_INTR_INTERVAL HZ / 100 | 50 | #define KVM_MAX_PIT_INTR_INTERVAL HZ / 100 |
52 | #define KVM_PIT_CHANNEL_MASK 0x3 | 51 | #define KVM_PIT_CHANNEL_MASK 0x3 |
53 | 52 | ||
54 | void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu); | ||
55 | void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val, int hpet_legacy_start); | 53 | void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val, int hpet_legacy_start); |
56 | struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags); | 54 | struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags); |
57 | void kvm_free_pit(struct kvm *kvm); | 55 | void kvm_free_pit(struct kvm *kvm); |
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 4b7b73ce2098..19fe855e7953 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c | |||
@@ -3,7 +3,7 @@ | |||
3 | * | 3 | * |
4 | * Copyright (c) 2003-2004 Fabrice Bellard | 4 | * Copyright (c) 2003-2004 Fabrice Bellard |
5 | * Copyright (c) 2007 Intel Corporation | 5 | * Copyright (c) 2007 Intel Corporation |
6 | * Copyright 2009 Red Hat, Inc. and/or its affilates. | 6 | * Copyright 2009 Red Hat, Inc. and/or its affiliates. |
7 | * | 7 | * |
8 | * Permission is hereby granted, free of charge, to any person obtaining a copy | 8 | * Permission is hereby granted, free of charge, to any person obtaining a copy |
9 | * of this software and associated documentation files (the "Software"), to deal | 9 | * of this software and associated documentation files (the "Software"), to deal |
@@ -39,7 +39,7 @@ static void pic_irq_request(struct kvm *kvm, int level); | |||
39 | static void pic_lock(struct kvm_pic *s) | 39 | static void pic_lock(struct kvm_pic *s) |
40 | __acquires(&s->lock) | 40 | __acquires(&s->lock) |
41 | { | 41 | { |
42 | raw_spin_lock(&s->lock); | 42 | spin_lock(&s->lock); |
43 | } | 43 | } |
44 | 44 | ||
45 | static void pic_unlock(struct kvm_pic *s) | 45 | static void pic_unlock(struct kvm_pic *s) |
@@ -51,7 +51,7 @@ static void pic_unlock(struct kvm_pic *s) | |||
51 | 51 | ||
52 | s->wakeup_needed = false; | 52 | s->wakeup_needed = false; |
53 | 53 | ||
54 | raw_spin_unlock(&s->lock); | 54 | spin_unlock(&s->lock); |
55 | 55 | ||
56 | if (wakeup) { | 56 | if (wakeup) { |
57 | kvm_for_each_vcpu(i, vcpu, s->kvm) { | 57 | kvm_for_each_vcpu(i, vcpu, s->kvm) { |
@@ -62,11 +62,9 @@ static void pic_unlock(struct kvm_pic *s) | |||
62 | } | 62 | } |
63 | 63 | ||
64 | if (!found) | 64 | if (!found) |
65 | found = s->kvm->bsp_vcpu; | ||
66 | |||
67 | if (!found) | ||
68 | return; | 65 | return; |
69 | 66 | ||
67 | kvm_make_request(KVM_REQ_EVENT, found); | ||
70 | kvm_vcpu_kick(found); | 68 | kvm_vcpu_kick(found); |
71 | } | 69 | } |
72 | } | 70 | } |
@@ -74,7 +72,6 @@ static void pic_unlock(struct kvm_pic *s) | |||
74 | static void pic_clear_isr(struct kvm_kpic_state *s, int irq) | 72 | static void pic_clear_isr(struct kvm_kpic_state *s, int irq) |
75 | { | 73 | { |
76 | s->isr &= ~(1 << irq); | 74 | s->isr &= ~(1 << irq); |
77 | s->isr_ack |= (1 << irq); | ||
78 | if (s != &s->pics_state->pics[0]) | 75 | if (s != &s->pics_state->pics[0]) |
79 | irq += 8; | 76 | irq += 8; |
80 | /* | 77 | /* |
@@ -88,16 +85,6 @@ static void pic_clear_isr(struct kvm_kpic_state *s, int irq) | |||
88 | pic_lock(s->pics_state); | 85 | pic_lock(s->pics_state); |
89 | } | 86 | } |
90 | 87 | ||
91 | void kvm_pic_clear_isr_ack(struct kvm *kvm) | ||
92 | { | ||
93 | struct kvm_pic *s = pic_irqchip(kvm); | ||
94 | |||
95 | pic_lock(s); | ||
96 | s->pics[0].isr_ack = 0xff; | ||
97 | s->pics[1].isr_ack = 0xff; | ||
98 | pic_unlock(s); | ||
99 | } | ||
100 | |||
101 | /* | 88 | /* |
102 | * set irq level. If an edge is detected, then the IRR is set to 1 | 89 | * set irq level. If an edge is detected, then the IRR is set to 1 |
103 | */ | 90 | */ |
@@ -280,7 +267,6 @@ void kvm_pic_reset(struct kvm_kpic_state *s) | |||
280 | s->irr = 0; | 267 | s->irr = 0; |
281 | s->imr = 0; | 268 | s->imr = 0; |
282 | s->isr = 0; | 269 | s->isr = 0; |
283 | s->isr_ack = 0xff; | ||
284 | s->priority_add = 0; | 270 | s->priority_add = 0; |
285 | s->irq_base = 0; | 271 | s->irq_base = 0; |
286 | s->read_reg_select = 0; | 272 | s->read_reg_select = 0; |
@@ -308,13 +294,17 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val) | |||
308 | addr &= 1; | 294 | addr &= 1; |
309 | if (addr == 0) { | 295 | if (addr == 0) { |
310 | if (val & 0x10) { | 296 | if (val & 0x10) { |
311 | kvm_pic_reset(s); /* init */ | ||
312 | /* | ||
313 | * deassert a pending interrupt | ||
314 | */ | ||
315 | pic_irq_request(s->pics_state->kvm, 0); | ||
316 | s->init_state = 1; | ||
317 | s->init4 = val & 1; | 297 | s->init4 = val & 1; |
298 | s->last_irr = 0; | ||
299 | s->imr = 0; | ||
300 | s->priority_add = 0; | ||
301 | s->special_mask = 0; | ||
302 | s->read_reg_select = 0; | ||
303 | if (!s->init4) { | ||
304 | s->special_fully_nested_mode = 0; | ||
305 | s->auto_eoi = 0; | ||
306 | } | ||
307 | s->init_state = 1; | ||
318 | if (val & 0x02) | 308 | if (val & 0x02) |
319 | printk(KERN_ERR "single mode not supported"); | 309 | printk(KERN_ERR "single mode not supported"); |
320 | if (val & 0x08) | 310 | if (val & 0x08) |
@@ -540,15 +530,11 @@ static int picdev_read(struct kvm_io_device *this, | |||
540 | */ | 530 | */ |
541 | static void pic_irq_request(struct kvm *kvm, int level) | 531 | static void pic_irq_request(struct kvm *kvm, int level) |
542 | { | 532 | { |
543 | struct kvm_vcpu *vcpu = kvm->bsp_vcpu; | ||
544 | struct kvm_pic *s = pic_irqchip(kvm); | 533 | struct kvm_pic *s = pic_irqchip(kvm); |
545 | int irq = pic_get_irq(&s->pics[0]); | ||
546 | 534 | ||
547 | s->output = level; | 535 | if (!s->output) |
548 | if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) { | ||
549 | s->pics[0].isr_ack &= ~(1 << irq); | ||
550 | s->wakeup_needed = true; | 536 | s->wakeup_needed = true; |
551 | } | 537 | s->output = level; |
552 | } | 538 | } |
553 | 539 | ||
554 | static const struct kvm_io_device_ops picdev_ops = { | 540 | static const struct kvm_io_device_ops picdev_ops = { |
@@ -564,7 +550,7 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm) | |||
564 | s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL); | 550 | s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL); |
565 | if (!s) | 551 | if (!s) |
566 | return NULL; | 552 | return NULL; |
567 | raw_spin_lock_init(&s->lock); | 553 | spin_lock_init(&s->lock); |
568 | s->kvm = kvm; | 554 | s->kvm = kvm; |
569 | s->pics[0].elcr_mask = 0xf8; | 555 | s->pics[0].elcr_mask = 0xf8; |
570 | s->pics[1].elcr_mask = 0xde; | 556 | s->pics[1].elcr_mask = 0xde; |
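The i8259 changes above drop the per-IRQ isr_ack bookkeeping and make pic_irq_request() purely level-based: a wakeup is queued whenever the output line was previously low, and the lock becomes an ordinary spinlock. A standalone sketch of that simplified request logic (plain C with illustrative field names, not the kernel code):

#include <stdbool.h>
#include <stdio.h>

struct pic_demo {
	int output;          /* current level driven toward the CPU */
	bool wakeup_needed;  /* set when the line was low before this request */
};

/* mirrors the rewritten pic_irq_request(): no isr_ack tracking, just the level */
static void pic_irq_request_demo(struct pic_demo *s, int level)
{
	if (!s->output)
		s->wakeup_needed = true;
	s->output = level;
}

int main(void)
{
	struct pic_demo s = { 0, false };
	pic_irq_request_demo(&s, 1);            /* line was low: wakeup */
	printf("assert:    wakeup=%d\n", s.wakeup_needed);
	s.wakeup_needed = false;
	pic_irq_request_demo(&s, 1);            /* line already high: no new wakeup */
	printf("re-assert: wakeup=%d\n", s.wakeup_needed);
	return 0;
}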
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 2095a049835e..7e06ba1618bd 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c | |||
@@ -1,7 +1,7 @@ | |||
1 | /* | 1 | /* |
2 | * irq.c: API for in kernel interrupt controller | 2 | * irq.c: API for in kernel interrupt controller |
3 | * Copyright (c) 2007, Intel Corporation. | 3 | * Copyright (c) 2007, Intel Corporation. |
4 | * Copyright 2009 Red Hat, Inc. and/or its affilates. | 4 | * Copyright 2009 Red Hat, Inc. and/or its affiliates. |
5 | * | 5 | * |
6 | * This program is free software; you can redistribute it and/or modify it | 6 | * This program is free software; you can redistribute it and/or modify it |
7 | * under the terms and conditions of the GNU General Public License, | 7 | * under the terms and conditions of the GNU General Public License, |
@@ -33,12 +33,7 @@ | |||
33 | */ | 33 | */ |
34 | int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) | 34 | int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) |
35 | { | 35 | { |
36 | int ret; | 36 | return apic_has_pending_timer(vcpu); |
37 | |||
38 | ret = pit_has_pending_timer(vcpu); | ||
39 | ret |= apic_has_pending_timer(vcpu); | ||
40 | |||
41 | return ret; | ||
42 | } | 37 | } |
43 | EXPORT_SYMBOL(kvm_cpu_has_pending_timer); | 38 | EXPORT_SYMBOL(kvm_cpu_has_pending_timer); |
44 | 39 | ||
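With pit_has_pending_timer() removed, kvm_cpu_has_pending_timer() reduces to the LAPIC query alone. A before/after sketch in plain C; the stub functions and their return values are invented for the demo and are not kernel APIs:

#include <stdio.h>

static int apic_has_pending_timer_stub(void) { return 1; }  /* pretend LAPIC timer fired */
static int pit_has_pending_timer_stub(void)  { return 0; }  /* no longer consulted */

/* before: OR of the PIT and LAPIC pending state */
static int has_pending_timer_old(void)
{
	return pit_has_pending_timer_stub() | apic_has_pending_timer_stub();
}

/* after: only the LAPIC is asked; PIT interrupt reinjection is handled elsewhere */
static int has_pending_timer_new(void)
{
	return apic_has_pending_timer_stub();
}

int main(void)
{
	printf("old=%d new=%d\n", has_pending_timer_old(), has_pending_timer_new());
	return 0;
}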
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 63c314502993..53e2d084bffb 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h | |||
@@ -60,7 +60,7 @@ struct kvm_kpic_state { | |||
60 | }; | 60 | }; |
61 | 61 | ||
62 | struct kvm_pic { | 62 | struct kvm_pic { |
63 | raw_spinlock_t lock; | 63 | spinlock_t lock; |
64 | bool wakeup_needed; | 64 | bool wakeup_needed; |
65 | unsigned pending_acks; | 65 | unsigned pending_acks; |
66 | struct kvm *kvm; | 66 | struct kvm *kvm; |
@@ -75,7 +75,6 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm); | |||
75 | void kvm_destroy_pic(struct kvm *kvm); | 75 | void kvm_destroy_pic(struct kvm *kvm); |
76 | int kvm_pic_read_irq(struct kvm *kvm); | 76 | int kvm_pic_read_irq(struct kvm *kvm); |
77 | void kvm_pic_update_irq(struct kvm_pic *s); | 77 | void kvm_pic_update_irq(struct kvm_pic *s); |
78 | void kvm_pic_clear_isr_ack(struct kvm *kvm); | ||
79 | 78 | ||
80 | static inline struct kvm_pic *pic_irqchip(struct kvm *kvm) | 79 | static inline struct kvm_pic *pic_irqchip(struct kvm *kvm) |
81 | { | 80 | { |
@@ -100,7 +99,6 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu); | |||
100 | void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu); | 99 | void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu); |
101 | void __kvm_migrate_timers(struct kvm_vcpu *vcpu); | 100 | void __kvm_migrate_timers(struct kvm_vcpu *vcpu); |
102 | 101 | ||
103 | int pit_has_pending_timer(struct kvm_vcpu *vcpu); | ||
104 | int apic_has_pending_timer(struct kvm_vcpu *vcpu); | 102 | int apic_has_pending_timer(struct kvm_vcpu *vcpu); |
105 | 103 | ||
106 | #endif | 104 | #endif |
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index 6491ac8e755b..3377d53fcd36 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h | |||
@@ -42,7 +42,14 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index) | |||
42 | (unsigned long *)&vcpu->arch.regs_avail)) | 42 | (unsigned long *)&vcpu->arch.regs_avail)) |
43 | kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR); | 43 | kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR); |
44 | 44 | ||
45 | return vcpu->arch.pdptrs[index]; | 45 | return vcpu->arch.walk_mmu->pdptrs[index]; |
46 | } | ||
47 | |||
48 | static inline u64 kvm_pdptr_read_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, int index) | ||
49 | { | ||
50 | load_pdptrs(vcpu, mmu, mmu->get_cr3(vcpu)); | ||
51 | |||
52 | return mmu->pdptrs[index]; | ||
46 | } | 53 | } |
47 | 54 | ||
48 | static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask) | 55 | static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask) |
@@ -66,6 +73,13 @@ static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask) | |||
66 | return vcpu->arch.cr4 & mask; | 73 | return vcpu->arch.cr4 & mask; |
67 | } | 74 | } |
68 | 75 | ||
76 | static inline ulong kvm_read_cr3(struct kvm_vcpu *vcpu) | ||
77 | { | ||
78 | if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail)) | ||
79 | kvm_x86_ops->decache_cr3(vcpu); | ||
80 | return vcpu->arch.cr3; | ||
81 | } | ||
82 | |||
69 | static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu) | 83 | static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu) |
70 | { | 84 | { |
71 | return kvm_read_cr4_bits(vcpu, ~0UL); | 85 | return kvm_read_cr4_bits(vcpu, ~0UL); |
@@ -77,4 +91,19 @@ static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu) | |||
77 | | ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32); | 91 | | ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32); |
78 | } | 92 | } |
79 | 93 | ||
94 | static inline void enter_guest_mode(struct kvm_vcpu *vcpu) | ||
95 | { | ||
96 | vcpu->arch.hflags |= HF_GUEST_MASK; | ||
97 | } | ||
98 | |||
99 | static inline void leave_guest_mode(struct kvm_vcpu *vcpu) | ||
100 | { | ||
101 | vcpu->arch.hflags &= ~HF_GUEST_MASK; | ||
102 | } | ||
103 | |||
104 | static inline bool is_guest_mode(struct kvm_vcpu *vcpu) | ||
105 | { | ||
106 | return vcpu->arch.hflags & HF_GUEST_MASK; | ||
107 | } | ||
108 | |||
80 | #endif | 109 | #endif |
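The new kvm_cache_regs.h helpers are thin accessors: kvm_read_cr3() pulls CR3 through the register-availability cache, and enter_guest_mode()/leave_guest_mode()/is_guest_mode() just set, clear, and test one bit in vcpu->arch.hflags. A self-contained sketch of the flag idiom; the flag value below is made up for the demo, only the bit manipulation mirrors the diff:

#include <stdbool.h>
#include <stdio.h>

#define HF_GUEST_MASK_DEMO (1U << 4)   /* illustrative; the real HF_GUEST_MASK is defined elsewhere */

struct vcpu_demo { unsigned int hflags; };

static void enter_guest_mode_demo(struct vcpu_demo *v) { v->hflags |= HF_GUEST_MASK_DEMO; }
static void leave_guest_mode_demo(struct vcpu_demo *v) { v->hflags &= ~HF_GUEST_MASK_DEMO; }
static bool is_guest_mode_demo(const struct vcpu_demo *v) { return v->hflags & HF_GUEST_MASK_DEMO; }

int main(void)
{
	struct vcpu_demo v = { 0 };
	enter_guest_mode_demo(&v);
	printf("in guest mode: %d\n", is_guest_mode_demo(&v));  /* 1 */
	leave_guest_mode_demo(&v);
	printf("in guest mode: %d\n", is_guest_mode_demo(&v));  /* 0 */
	return 0;
}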
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 77d8c0f4817d..2b2255b1f04b 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c | |||
@@ -5,7 +5,7 @@ | |||
5 | * Copyright (C) 2006 Qumranet, Inc. | 5 | * Copyright (C) 2006 Qumranet, Inc. |
6 | * Copyright (C) 2007 Novell | 6 | * Copyright (C) 2007 Novell |
7 | * Copyright (C) 2007 Intel | 7 | * Copyright (C) 2007 Intel |
8 | * Copyright 2009 Red Hat, Inc. and/or its affilates. | 8 | * Copyright 2009 Red Hat, Inc. and/or its affiliates. |
9 | * | 9 | * |
10 | * Authors: | 10 | * Authors: |
11 | * Dor Laor <dor.laor@qumranet.com> | 11 | * Dor Laor <dor.laor@qumranet.com> |
@@ -259,9 +259,10 @@ static inline int apic_find_highest_isr(struct kvm_lapic *apic) | |||
259 | 259 | ||
260 | static void apic_update_ppr(struct kvm_lapic *apic) | 260 | static void apic_update_ppr(struct kvm_lapic *apic) |
261 | { | 261 | { |
262 | u32 tpr, isrv, ppr; | 262 | u32 tpr, isrv, ppr, old_ppr; |
263 | int isr; | 263 | int isr; |
264 | 264 | ||
265 | old_ppr = apic_get_reg(apic, APIC_PROCPRI); | ||
265 | tpr = apic_get_reg(apic, APIC_TASKPRI); | 266 | tpr = apic_get_reg(apic, APIC_TASKPRI); |
266 | isr = apic_find_highest_isr(apic); | 267 | isr = apic_find_highest_isr(apic); |
267 | isrv = (isr != -1) ? isr : 0; | 268 | isrv = (isr != -1) ? isr : 0; |
@@ -274,7 +275,11 @@ static void apic_update_ppr(struct kvm_lapic *apic) | |||
274 | apic_debug("vlapic %p, ppr 0x%x, isr 0x%x, isrv 0x%x", | 275 | apic_debug("vlapic %p, ppr 0x%x, isr 0x%x, isrv 0x%x", |
275 | apic, ppr, isr, isrv); | 276 | apic, ppr, isr, isrv); |
276 | 277 | ||
277 | apic_set_reg(apic, APIC_PROCPRI, ppr); | 278 | if (old_ppr != ppr) { |
279 | apic_set_reg(apic, APIC_PROCPRI, ppr); | ||
280 | if (ppr < old_ppr) | ||
281 | kvm_make_request(KVM_REQ_EVENT, apic->vcpu); | ||
282 | } | ||
278 | } | 283 | } |
279 | 284 | ||
280 | static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr) | 285 | static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr) |
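The apic_update_ppr() hunk above now compares the old and new PPR, writes the register only when the value changed, and requests KVM_REQ_EVENT only when the PPR drops, since only a lower processor priority can unmask a previously blocked interrupt. The decision logic in isolation, as a plain-C sketch rather than the kernel function:

#include <stdio.h>

/* returns nonzero when an event recheck should be requested */
static int ppr_update_needs_event(unsigned int old_ppr, unsigned int new_ppr)
{
	if (old_ppr == new_ppr)
		return 0;              /* unchanged: no register write, no event */
	return new_ppr < old_ppr;      /* lower threshold may unmask pending IRQs */
}

int main(void)
{
	printf("0x30 -> 0x30: %d\n", ppr_update_needs_event(0x30, 0x30)); /* 0 */
	printf("0x30 -> 0x40: %d\n", ppr_update_needs_event(0x30, 0x40)); /* 0 */
	printf("0x40 -> 0x30: %d\n", ppr_update_needs_event(0x40, 0x30)); /* 1 */
	return 0;
}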
@@ -391,6 +396,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, | |||
391 | break; | 396 | break; |
392 | } | 397 | } |
393 | 398 | ||
399 | kvm_make_request(KVM_REQ_EVENT, vcpu); | ||
394 | kvm_vcpu_kick(vcpu); | 400 | kvm_vcpu_kick(vcpu); |
395 | break; | 401 | break; |
396 | 402 | ||
@@ -411,11 +417,8 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, | |||
411 | case APIC_DM_INIT: | 417 | case APIC_DM_INIT: |
412 | if (level) { | 418 | if (level) { |
413 | result = 1; | 419 | result = 1; |
414 | if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) | ||
415 | printk(KERN_DEBUG | ||
416 | "INIT on a runnable vcpu %d\n", | ||
417 | vcpu->vcpu_id); | ||
418 | vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; | 420 | vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; |
421 | kvm_make_request(KVM_REQ_EVENT, vcpu); | ||
419 | kvm_vcpu_kick(vcpu); | 422 | kvm_vcpu_kick(vcpu); |
420 | } else { | 423 | } else { |
421 | apic_debug("Ignoring de-assert INIT to vcpu %d\n", | 424 | apic_debug("Ignoring de-assert INIT to vcpu %d\n", |
@@ -430,6 +433,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, | |||
430 | result = 1; | 433 | result = 1; |
431 | vcpu->arch.sipi_vector = vector; | 434 | vcpu->arch.sipi_vector = vector; |
432 | vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED; | 435 | vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED; |
436 | kvm_make_request(KVM_REQ_EVENT, vcpu); | ||
433 | kvm_vcpu_kick(vcpu); | 437 | kvm_vcpu_kick(vcpu); |
434 | } | 438 | } |
435 | break; | 439 | break; |
@@ -475,6 +479,7 @@ static void apic_set_eoi(struct kvm_lapic *apic) | |||
475 | trigger_mode = IOAPIC_EDGE_TRIG; | 479 | trigger_mode = IOAPIC_EDGE_TRIG; |
476 | if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI)) | 480 | if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI)) |
477 | kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); | 481 | kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); |
482 | kvm_make_request(KVM_REQ_EVENT, apic->vcpu); | ||
478 | } | 483 | } |
479 | 484 | ||
480 | static void apic_send_ipi(struct kvm_lapic *apic) | 485 | static void apic_send_ipi(struct kvm_lapic *apic) |
@@ -866,8 +871,8 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu) | |||
866 | 871 | ||
867 | hrtimer_cancel(&vcpu->arch.apic->lapic_timer.timer); | 872 | hrtimer_cancel(&vcpu->arch.apic->lapic_timer.timer); |
868 | 873 | ||
869 | if (vcpu->arch.apic->regs_page) | 874 | if (vcpu->arch.apic->regs) |
870 | __free_page(vcpu->arch.apic->regs_page); | 875 | free_page((unsigned long)vcpu->arch.apic->regs); |
871 | 876 | ||
872 | kfree(vcpu->arch.apic); | 877 | kfree(vcpu->arch.apic); |
873 | } | 878 | } |
@@ -1056,14 +1061,12 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) | |||
1056 | 1061 | ||
1057 | vcpu->arch.apic = apic; | 1062 | vcpu->arch.apic = apic; |
1058 | 1063 | ||
1059 | apic->regs_page = alloc_page(GFP_KERNEL); | 1064 | apic->regs = (void *)get_zeroed_page(GFP_KERNEL); |
1060 | if (apic->regs_page == NULL) { | 1065 | if (!apic->regs) { |
1061 | printk(KERN_ERR "malloc apic regs error for vcpu %x\n", | 1066 | printk(KERN_ERR "malloc apic regs error for vcpu %x\n", |
1062 | vcpu->vcpu_id); | 1067 | vcpu->vcpu_id); |
1063 | goto nomem_free_apic; | 1068 | goto nomem_free_apic; |
1064 | } | 1069 | } |
1065 | apic->regs = page_address(apic->regs_page); | ||
1066 | memset(apic->regs, 0, PAGE_SIZE); | ||
1067 | apic->vcpu = vcpu; | 1070 | apic->vcpu = vcpu; |
1068 | 1071 | ||
1069 | hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC, | 1072 | hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC, |
@@ -1152,6 +1155,7 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu) | |||
1152 | update_divide_count(apic); | 1155 | update_divide_count(apic); |
1153 | start_apic_timer(apic); | 1156 | start_apic_timer(apic); |
1154 | apic->irr_pending = true; | 1157 | apic->irr_pending = true; |
1158 | kvm_make_request(KVM_REQ_EVENT, vcpu); | ||
1155 | } | 1159 | } |
1156 | 1160 | ||
1157 | void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) | 1161 | void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) |
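kvm_create_lapic() and kvm_free_lapic() above drop the intermediate struct page (regs_page) and keep only the mapped pointer: the register page is obtained pre-zeroed with get_zeroed_page() and released with free_page(). A userspace analogue of the before/after allocation pattern, illustrative only since the kernel calls differ:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define PAGE_SIZE_DEMO 4096   /* stand-in for the kernel's PAGE_SIZE */

int main(void)
{
	/* old pattern: allocate, then derive the mapping and zero it by hand
	 * (alloc_page() + page_address() + memset() in the kernel) */
	void *old_style = malloc(PAGE_SIZE_DEMO);
	if (!old_style)
		return 1;
	memset(old_style, 0, PAGE_SIZE_DEMO);

	/* new pattern: one call returns an already-zeroed page-sized region
	 * (get_zeroed_page() in the kernel; calloc() is the closest analogue here) */
	void *new_style = calloc(1, PAGE_SIZE_DEMO);
	if (!new_style)
		return 1;

	printf("both buffers zeroed: %d\n",
	       ((char *)old_style)[123] == 0 && ((char *)new_style)[123] == 0);
	free(old_style);
	free(new_style);
	return 0;
}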
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index f5fe32c5edad..52c9e6b9e725 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h | |||
@@ -13,7 +13,6 @@ struct kvm_lapic { | |||
13 | u32 divide_count; | 13 | u32 divide_count; |
14 | struct kvm_vcpu *vcpu; | 14 | struct kvm_vcpu *vcpu; |
15 | bool irr_pending; | 15 | bool irr_pending; |
16 | struct page *regs_page; | ||
17 | void *regs; | 16 | void *regs; |
18 | gpa_t vapic_addr; | 17 | gpa_t vapic_addr; |
19 | struct page *vapic_page; | 18 | struct page *vapic_page; |
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 311f6dad8951..aee38623b768 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c | |||
@@ -7,7 +7,7 @@ | |||
7 | * MMU support | 7 | * MMU support |
8 | * | 8 | * |
9 | * Copyright (C) 2006 Qumranet, Inc. | 9 | * Copyright (C) 2006 Qumranet, Inc. |
10 | * Copyright 2010 Red Hat, Inc. and/or its affilates. | 10 | * Copyright 2010 Red Hat, Inc. and/or its affiliates. |
11 | * | 11 | * |
12 | * Authors: | 12 | * Authors: |
13 | * Yaniv Kamay <yaniv@qumranet.com> | 13 | * Yaniv Kamay <yaniv@qumranet.com> |
@@ -18,9 +18,11 @@ | |||
18 | * | 18 | * |
19 | */ | 19 | */ |
20 | 20 | ||
21 | #include "irq.h" | ||
21 | #include "mmu.h" | 22 | #include "mmu.h" |
22 | #include "x86.h" | 23 | #include "x86.h" |
23 | #include "kvm_cache_regs.h" | 24 | #include "kvm_cache_regs.h" |
25 | #include "x86.h" | ||
24 | 26 | ||
25 | #include <linux/kvm_host.h> | 27 | #include <linux/kvm_host.h> |
26 | #include <linux/types.h> | 28 | #include <linux/types.h> |
@@ -49,15 +51,25 @@ | |||
49 | */ | 51 | */ |
50 | bool tdp_enabled = false; | 52 | bool tdp_enabled = false; |
51 | 53 | ||
52 | #undef MMU_DEBUG | 54 | enum { |
55 | AUDIT_PRE_PAGE_FAULT, | ||
56 | AUDIT_POST_PAGE_FAULT, | ||
57 | AUDIT_PRE_PTE_WRITE, | ||
58 | AUDIT_POST_PTE_WRITE, | ||
59 | AUDIT_PRE_SYNC, | ||
60 | AUDIT_POST_SYNC | ||
61 | }; | ||
53 | 62 | ||
54 | #undef AUDIT | 63 | char *audit_point_name[] = { |
64 | "pre page fault", | ||
65 | "post page fault", | ||
66 | "pre pte write", | ||
67 | "post pte write", | ||
68 | "pre sync", | ||
69 | "post sync" | ||
70 | }; | ||
55 | 71 | ||
56 | #ifdef AUDIT | 72 | #undef MMU_DEBUG |
57 | static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg); | ||
58 | #else | ||
59 | static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {} | ||
60 | #endif | ||
61 | 73 | ||
62 | #ifdef MMU_DEBUG | 74 | #ifdef MMU_DEBUG |
63 | 75 | ||
@@ -71,7 +83,7 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {} | |||
71 | 83 | ||
72 | #endif | 84 | #endif |
73 | 85 | ||
74 | #if defined(MMU_DEBUG) || defined(AUDIT) | 86 | #ifdef MMU_DEBUG |
75 | static int dbg = 0; | 87 | static int dbg = 0; |
76 | module_param(dbg, bool, 0644); | 88 | module_param(dbg, bool, 0644); |
77 | #endif | 89 | #endif |
@@ -89,6 +101,8 @@ module_param(oos_shadow, bool, 0644); | |||
89 | } | 101 | } |
90 | #endif | 102 | #endif |
91 | 103 | ||
104 | #define PTE_PREFETCH_NUM 8 | ||
105 | |||
92 | #define PT_FIRST_AVAIL_BITS_SHIFT 9 | 106 | #define PT_FIRST_AVAIL_BITS_SHIFT 9 |
93 | #define PT64_SECOND_AVAIL_BITS_SHIFT 52 | 107 | #define PT64_SECOND_AVAIL_BITS_SHIFT 52 |
94 | 108 | ||
@@ -97,9 +111,6 @@ module_param(oos_shadow, bool, 0644); | |||
97 | #define PT64_LEVEL_SHIFT(level) \ | 111 | #define PT64_LEVEL_SHIFT(level) \ |
98 | (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS) | 112 | (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS) |
99 | 113 | ||
100 | #define PT64_LEVEL_MASK(level) \ | ||
101 | (((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level)) | ||
102 | |||
103 | #define PT64_INDEX(address, level)\ | 114 | #define PT64_INDEX(address, level)\ |
104 | (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1)) | 115 | (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1)) |
105 | 116 | ||
@@ -109,8 +120,6 @@ module_param(oos_shadow, bool, 0644); | |||
109 | #define PT32_LEVEL_SHIFT(level) \ | 120 | #define PT32_LEVEL_SHIFT(level) \ |
110 | (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS) | 121 | (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS) |
111 | 122 | ||
112 | #define PT32_LEVEL_MASK(level) \ | ||
113 | (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level)) | ||
114 | #define PT32_LVL_OFFSET_MASK(level) \ | 123 | #define PT32_LVL_OFFSET_MASK(level) \ |
115 | (PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \ | 124 | (PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \ |
116 | * PT32_LEVEL_BITS))) - 1)) | 125 | * PT32_LEVEL_BITS))) - 1)) |
@@ -178,10 +187,10 @@ typedef void (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte); | |||
178 | static struct kmem_cache *pte_chain_cache; | 187 | static struct kmem_cache *pte_chain_cache; |
179 | static struct kmem_cache *rmap_desc_cache; | 188 | static struct kmem_cache *rmap_desc_cache; |
180 | static struct kmem_cache *mmu_page_header_cache; | 189 | static struct kmem_cache *mmu_page_header_cache; |
190 | static struct percpu_counter kvm_total_used_mmu_pages; | ||
181 | 191 | ||
182 | static u64 __read_mostly shadow_trap_nonpresent_pte; | 192 | static u64 __read_mostly shadow_trap_nonpresent_pte; |
183 | static u64 __read_mostly shadow_notrap_nonpresent_pte; | 193 | static u64 __read_mostly shadow_notrap_nonpresent_pte; |
184 | static u64 __read_mostly shadow_base_present_pte; | ||
185 | static u64 __read_mostly shadow_nx_mask; | 194 | static u64 __read_mostly shadow_nx_mask; |
186 | static u64 __read_mostly shadow_x_mask; /* mutually exclusive with nx_mask */ | 195 | static u64 __read_mostly shadow_x_mask; /* mutually exclusive with nx_mask */ |
187 | static u64 __read_mostly shadow_user_mask; | 196 | static u64 __read_mostly shadow_user_mask; |
@@ -200,12 +209,6 @@ void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte) | |||
200 | } | 209 | } |
201 | EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes); | 210 | EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes); |
202 | 211 | ||
203 | void kvm_mmu_set_base_ptes(u64 base_pte) | ||
204 | { | ||
205 | shadow_base_present_pte = base_pte; | ||
206 | } | ||
207 | EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes); | ||
208 | |||
209 | void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, | 212 | void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, |
210 | u64 dirty_mask, u64 nx_mask, u64 x_mask) | 213 | u64 dirty_mask, u64 nx_mask, u64 x_mask) |
211 | { | 214 | { |
@@ -299,18 +302,50 @@ static u64 __xchg_spte(u64 *sptep, u64 new_spte) | |||
299 | #endif | 302 | #endif |
300 | } | 303 | } |
301 | 304 | ||
305 | static bool spte_has_volatile_bits(u64 spte) | ||
306 | { | ||
307 | if (!shadow_accessed_mask) | ||
308 | return false; | ||
309 | |||
310 | if (!is_shadow_present_pte(spte)) | ||
311 | return false; | ||
312 | |||
313 | if ((spte & shadow_accessed_mask) && | ||
314 | (!is_writable_pte(spte) || (spte & shadow_dirty_mask))) | ||
315 | return false; | ||
316 | |||
317 | return true; | ||
318 | } | ||
319 | |||
320 | static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask) | ||
321 | { | ||
322 | return (old_spte & bit_mask) && !(new_spte & bit_mask); | ||
323 | } | ||
324 | |||
302 | static void update_spte(u64 *sptep, u64 new_spte) | 325 | static void update_spte(u64 *sptep, u64 new_spte) |
303 | { | 326 | { |
304 | u64 old_spte; | 327 | u64 mask, old_spte = *sptep; |
328 | |||
329 | WARN_ON(!is_rmap_spte(new_spte)); | ||
330 | |||
331 | new_spte |= old_spte & shadow_dirty_mask; | ||
332 | |||
333 | mask = shadow_accessed_mask; | ||
334 | if (is_writable_pte(old_spte)) | ||
335 | mask |= shadow_dirty_mask; | ||
305 | 336 | ||
306 | if (!shadow_accessed_mask || (new_spte & shadow_accessed_mask) || | 337 | if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask) |
307 | !is_rmap_spte(*sptep)) | ||
308 | __set_spte(sptep, new_spte); | 338 | __set_spte(sptep, new_spte); |
309 | else { | 339 | else |
310 | old_spte = __xchg_spte(sptep, new_spte); | 340 | old_spte = __xchg_spte(sptep, new_spte); |
311 | if (old_spte & shadow_accessed_mask) | 341 | |
312 | mark_page_accessed(pfn_to_page(spte_to_pfn(old_spte))); | 342 | if (!shadow_accessed_mask) |
313 | } | 343 | return; |
344 | |||
345 | if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask)) | ||
346 | kvm_set_pfn_accessed(spte_to_pfn(old_spte)); | ||
347 | if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask)) | ||
348 | kvm_set_pfn_dirty(spte_to_pfn(old_spte)); | ||
314 | } | 349 | } |
315 | 350 | ||
316 | static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, | 351 | static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, |
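The update_spte() rewrite above keys off the new spte_has_volatile_bits() test: if the accessed bit is already set, and the spte is either read-only or already dirty, the hardware has nothing left to set asynchronously, so a plain store is safe; otherwise an atomic exchange is used and any accessed/dirty bits that were cleared are propagated to the page. A simplified standalone model of the predicate, with a bit layout invented for the demo:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define DEMO_PRESENT   (1ULL << 0)
#define DEMO_WRITABLE  (1ULL << 1)
#define DEMO_ACCESSED  (1ULL << 5)
#define DEMO_DIRTY     (1ULL << 6)

static bool spte_has_volatile_bits_demo(uint64_t spte)
{
	if (!(spte & DEMO_PRESENT))
		return false;
	/* accessed already set and, if writable, dirty already set:
	 * nothing for hardware to flip underneath us */
	if ((spte & DEMO_ACCESSED) &&
	    (!(spte & DEMO_WRITABLE) || (spte & DEMO_DIRTY)))
		return false;
	return true;
}

int main(void)
{
	uint64_t fresh   = DEMO_PRESENT | DEMO_WRITABLE;   /* A/D may still be set by HW */
	uint64_t settled = DEMO_PRESENT | DEMO_ACCESSED;   /* read-only and already referenced */
	printf("fresh:   volatile=%d (needs atomic xchg)\n", spte_has_volatile_bits_demo(fresh));
	printf("settled: volatile=%d (plain store is fine)\n", spte_has_volatile_bits_demo(settled));
	return 0;
}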
@@ -339,15 +374,15 @@ static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc, | |||
339 | static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache, | 374 | static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache, |
340 | int min) | 375 | int min) |
341 | { | 376 | { |
342 | struct page *page; | 377 | void *page; |
343 | 378 | ||
344 | if (cache->nobjs >= min) | 379 | if (cache->nobjs >= min) |
345 | return 0; | 380 | return 0; |
346 | while (cache->nobjs < ARRAY_SIZE(cache->objects)) { | 381 | while (cache->nobjs < ARRAY_SIZE(cache->objects)) { |
347 | page = alloc_page(GFP_KERNEL); | 382 | page = (void *)__get_free_page(GFP_KERNEL); |
348 | if (!page) | 383 | if (!page) |
349 | return -ENOMEM; | 384 | return -ENOMEM; |
350 | cache->objects[cache->nobjs++] = page_address(page); | 385 | cache->objects[cache->nobjs++] = page; |
351 | } | 386 | } |
352 | return 0; | 387 | return 0; |
353 | } | 388 | } |
@@ -367,7 +402,7 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu) | |||
367 | if (r) | 402 | if (r) |
368 | goto out; | 403 | goto out; |
369 | r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, | 404 | r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, |
370 | rmap_desc_cache, 4); | 405 | rmap_desc_cache, 4 + PTE_PREFETCH_NUM); |
371 | if (r) | 406 | if (r) |
372 | goto out; | 407 | goto out; |
373 | r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8); | 408 | r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8); |
@@ -437,46 +472,46 @@ static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn) | |||
437 | } | 472 | } |
438 | 473 | ||
439 | /* | 474 | /* |
440 | * Return the pointer to the largepage write count for a given | 475 | * Return the pointer to the large page information for a given gfn, |
441 | * gfn, handling slots that are not large page aligned. | 476 | * handling slots that are not large page aligned. |
442 | */ | 477 | */ |
443 | static int *slot_largepage_idx(gfn_t gfn, | 478 | static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn, |
444 | struct kvm_memory_slot *slot, | 479 | struct kvm_memory_slot *slot, |
445 | int level) | 480 | int level) |
446 | { | 481 | { |
447 | unsigned long idx; | 482 | unsigned long idx; |
448 | 483 | ||
449 | idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - | 484 | idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - |
450 | (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); | 485 | (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); |
451 | return &slot->lpage_info[level - 2][idx].write_count; | 486 | return &slot->lpage_info[level - 2][idx]; |
452 | } | 487 | } |
453 | 488 | ||
454 | static void account_shadowed(struct kvm *kvm, gfn_t gfn) | 489 | static void account_shadowed(struct kvm *kvm, gfn_t gfn) |
455 | { | 490 | { |
456 | struct kvm_memory_slot *slot; | 491 | struct kvm_memory_slot *slot; |
457 | int *write_count; | 492 | struct kvm_lpage_info *linfo; |
458 | int i; | 493 | int i; |
459 | 494 | ||
460 | slot = gfn_to_memslot(kvm, gfn); | 495 | slot = gfn_to_memslot(kvm, gfn); |
461 | for (i = PT_DIRECTORY_LEVEL; | 496 | for (i = PT_DIRECTORY_LEVEL; |
462 | i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { | 497 | i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { |
463 | write_count = slot_largepage_idx(gfn, slot, i); | 498 | linfo = lpage_info_slot(gfn, slot, i); |
464 | *write_count += 1; | 499 | linfo->write_count += 1; |
465 | } | 500 | } |
466 | } | 501 | } |
467 | 502 | ||
468 | static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) | 503 | static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) |
469 | { | 504 | { |
470 | struct kvm_memory_slot *slot; | 505 | struct kvm_memory_slot *slot; |
471 | int *write_count; | 506 | struct kvm_lpage_info *linfo; |
472 | int i; | 507 | int i; |
473 | 508 | ||
474 | slot = gfn_to_memslot(kvm, gfn); | 509 | slot = gfn_to_memslot(kvm, gfn); |
475 | for (i = PT_DIRECTORY_LEVEL; | 510 | for (i = PT_DIRECTORY_LEVEL; |
476 | i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { | 511 | i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { |
477 | write_count = slot_largepage_idx(gfn, slot, i); | 512 | linfo = lpage_info_slot(gfn, slot, i); |
478 | *write_count -= 1; | 513 | linfo->write_count -= 1; |
479 | WARN_ON(*write_count < 0); | 514 | WARN_ON(linfo->write_count < 0); |
480 | } | 515 | } |
481 | } | 516 | } |
482 | 517 | ||
@@ -485,12 +520,12 @@ static int has_wrprotected_page(struct kvm *kvm, | |||
485 | int level) | 520 | int level) |
486 | { | 521 | { |
487 | struct kvm_memory_slot *slot; | 522 | struct kvm_memory_slot *slot; |
488 | int *largepage_idx; | 523 | struct kvm_lpage_info *linfo; |
489 | 524 | ||
490 | slot = gfn_to_memslot(kvm, gfn); | 525 | slot = gfn_to_memslot(kvm, gfn); |
491 | if (slot) { | 526 | if (slot) { |
492 | largepage_idx = slot_largepage_idx(gfn, slot, level); | 527 | linfo = lpage_info_slot(gfn, slot, level); |
493 | return *largepage_idx; | 528 | return linfo->write_count; |
494 | } | 529 | } |
495 | 530 | ||
496 | return 1; | 531 | return 1; |
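The lpage_info_slot() refactor folds the repeated write_count/rmap_pde index arithmetic into one helper: the index is simply the gfn and the slot's base gfn shifted down to large-page granularity and subtracted. A quick arithmetic sketch of that computation, with example values chosen for the demo:

#include <stdio.h>

static unsigned long lpage_index_demo(unsigned long gfn, unsigned long base_gfn,
				      unsigned int gfn_shift)
{
	return (gfn >> gfn_shift) - (base_gfn >> gfn_shift);
}

int main(void)
{
	/* 2 MiB pages on x86: 9 gfn bits per large page */
	unsigned long base_gfn = 0x1000;
	printf("gfn 0x1000 -> idx %lu\n", lpage_index_demo(0x1000, base_gfn, 9)); /* 0 */
	printf("gfn 0x13ff -> idx %lu\n", lpage_index_demo(0x13ff, base_gfn, 9)); /* 1 */
	printf("gfn 0x1400 -> idx %lu\n", lpage_index_demo(0x1400, base_gfn, 9)); /* 2 */
	return 0;
}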
@@ -514,14 +549,28 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn) | |||
514 | return ret; | 549 | return ret; |
515 | } | 550 | } |
516 | 551 | ||
517 | static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) | 552 | static struct kvm_memory_slot * |
553 | gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn, | ||
554 | bool no_dirty_log) | ||
518 | { | 555 | { |
519 | struct kvm_memory_slot *slot; | 556 | struct kvm_memory_slot *slot; |
520 | int host_level, level, max_level; | ||
521 | 557 | ||
522 | slot = gfn_to_memslot(vcpu->kvm, large_gfn); | 558 | slot = gfn_to_memslot(vcpu->kvm, gfn); |
523 | if (slot && slot->dirty_bitmap) | 559 | if (!slot || slot->flags & KVM_MEMSLOT_INVALID || |
524 | return PT_PAGE_TABLE_LEVEL; | 560 | (no_dirty_log && slot->dirty_bitmap)) |
561 | slot = NULL; | ||
562 | |||
563 | return slot; | ||
564 | } | ||
565 | |||
566 | static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn) | ||
567 | { | ||
568 | return !gfn_to_memslot_dirty_bitmap(vcpu, large_gfn, true); | ||
569 | } | ||
570 | |||
571 | static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) | ||
572 | { | ||
573 | int host_level, level, max_level; | ||
525 | 574 | ||
526 | host_level = host_mapping_level(vcpu->kvm, large_gfn); | 575 | host_level = host_mapping_level(vcpu->kvm, large_gfn); |
527 | 576 | ||
@@ -545,16 +594,15 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) | |||
545 | static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level) | 594 | static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level) |
546 | { | 595 | { |
547 | struct kvm_memory_slot *slot; | 596 | struct kvm_memory_slot *slot; |
548 | unsigned long idx; | 597 | struct kvm_lpage_info *linfo; |
549 | 598 | ||
550 | slot = gfn_to_memslot(kvm, gfn); | 599 | slot = gfn_to_memslot(kvm, gfn); |
551 | if (likely(level == PT_PAGE_TABLE_LEVEL)) | 600 | if (likely(level == PT_PAGE_TABLE_LEVEL)) |
552 | return &slot->rmap[gfn - slot->base_gfn]; | 601 | return &slot->rmap[gfn - slot->base_gfn]; |
553 | 602 | ||
554 | idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - | 603 | linfo = lpage_info_slot(gfn, slot, level); |
555 | (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); | ||
556 | 604 | ||
557 | return &slot->lpage_info[level - 2][idx].rmap_pde; | 605 | return &linfo->rmap_pde; |
558 | } | 606 | } |
559 | 607 | ||
560 | /* | 608 | /* |
@@ -591,6 +639,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) | |||
591 | desc->sptes[0] = (u64 *)*rmapp; | 639 | desc->sptes[0] = (u64 *)*rmapp; |
592 | desc->sptes[1] = spte; | 640 | desc->sptes[1] = spte; |
593 | *rmapp = (unsigned long)desc | 1; | 641 | *rmapp = (unsigned long)desc | 1; |
642 | ++count; | ||
594 | } else { | 643 | } else { |
595 | rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte); | 644 | rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte); |
596 | desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); | 645 | desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); |
@@ -603,7 +652,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) | |||
603 | desc = desc->more; | 652 | desc = desc->more; |
604 | } | 653 | } |
605 | for (i = 0; desc->sptes[i]; ++i) | 654 | for (i = 0; desc->sptes[i]; ++i) |
606 | ; | 655 | ++count; |
607 | desc->sptes[i] = spte; | 656 | desc->sptes[i] = spte; |
608 | } | 657 | } |
609 | return count; | 658 | return count; |
@@ -645,18 +694,17 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) | |||
645 | gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); | 694 | gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); |
646 | rmapp = gfn_to_rmap(kvm, gfn, sp->role.level); | 695 | rmapp = gfn_to_rmap(kvm, gfn, sp->role.level); |
647 | if (!*rmapp) { | 696 | if (!*rmapp) { |
648 | printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte); | 697 | printk(KERN_ERR "rmap_remove: %p 0->BUG\n", spte); |
649 | BUG(); | 698 | BUG(); |
650 | } else if (!(*rmapp & 1)) { | 699 | } else if (!(*rmapp & 1)) { |
651 | rmap_printk("rmap_remove: %p %llx 1->0\n", spte, *spte); | 700 | rmap_printk("rmap_remove: %p 1->0\n", spte); |
652 | if ((u64 *)*rmapp != spte) { | 701 | if ((u64 *)*rmapp != spte) { |
653 | printk(KERN_ERR "rmap_remove: %p %llx 1->BUG\n", | 702 | printk(KERN_ERR "rmap_remove: %p 1->BUG\n", spte); |
654 | spte, *spte); | ||
655 | BUG(); | 703 | BUG(); |
656 | } | 704 | } |
657 | *rmapp = 0; | 705 | *rmapp = 0; |
658 | } else { | 706 | } else { |
659 | rmap_printk("rmap_remove: %p %llx many->many\n", spte, *spte); | 707 | rmap_printk("rmap_remove: %p many->many\n", spte); |
660 | desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); | 708 | desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); |
661 | prev_desc = NULL; | 709 | prev_desc = NULL; |
662 | while (desc) { | 710 | while (desc) { |
@@ -670,35 +718,36 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) | |||
670 | prev_desc = desc; | 718 | prev_desc = desc; |
671 | desc = desc->more; | 719 | desc = desc->more; |
672 | } | 720 | } |
673 | pr_err("rmap_remove: %p %llx many->many\n", spte, *spte); | 721 | pr_err("rmap_remove: %p many->many\n", spte); |
674 | BUG(); | 722 | BUG(); |
675 | } | 723 | } |
676 | } | 724 | } |
677 | 725 | ||
678 | static void set_spte_track_bits(u64 *sptep, u64 new_spte) | 726 | static int set_spte_track_bits(u64 *sptep, u64 new_spte) |
679 | { | 727 | { |
680 | pfn_t pfn; | 728 | pfn_t pfn; |
681 | u64 old_spte = *sptep; | 729 | u64 old_spte = *sptep; |
682 | 730 | ||
683 | if (!shadow_accessed_mask || !is_shadow_present_pte(old_spte) || | 731 | if (!spte_has_volatile_bits(old_spte)) |
684 | old_spte & shadow_accessed_mask) { | ||
685 | __set_spte(sptep, new_spte); | 732 | __set_spte(sptep, new_spte); |
686 | } else | 733 | else |
687 | old_spte = __xchg_spte(sptep, new_spte); | 734 | old_spte = __xchg_spte(sptep, new_spte); |
688 | 735 | ||
689 | if (!is_rmap_spte(old_spte)) | 736 | if (!is_rmap_spte(old_spte)) |
690 | return; | 737 | return 0; |
738 | |||
691 | pfn = spte_to_pfn(old_spte); | 739 | pfn = spte_to_pfn(old_spte); |
692 | if (!shadow_accessed_mask || old_spte & shadow_accessed_mask) | 740 | if (!shadow_accessed_mask || old_spte & shadow_accessed_mask) |
693 | kvm_set_pfn_accessed(pfn); | 741 | kvm_set_pfn_accessed(pfn); |
694 | if (is_writable_pte(old_spte)) | 742 | if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask)) |
695 | kvm_set_pfn_dirty(pfn); | 743 | kvm_set_pfn_dirty(pfn); |
744 | return 1; | ||
696 | } | 745 | } |
697 | 746 | ||
698 | static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte) | 747 | static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte) |
699 | { | 748 | { |
700 | set_spte_track_bits(sptep, new_spte); | 749 | if (set_spte_track_bits(sptep, new_spte)) |
701 | rmap_remove(kvm, sptep); | 750 | rmap_remove(kvm, sptep); |
702 | } | 751 | } |
703 | 752 | ||
704 | static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) | 753 | static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) |
@@ -746,13 +795,6 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) | |||
746 | } | 795 | } |
747 | spte = rmap_next(kvm, rmapp, spte); | 796 | spte = rmap_next(kvm, rmapp, spte); |
748 | } | 797 | } |
749 | if (write_protected) { | ||
750 | pfn_t pfn; | ||
751 | |||
752 | spte = rmap_next(kvm, rmapp, NULL); | ||
753 | pfn = spte_to_pfn(*spte); | ||
754 | kvm_set_pfn_dirty(pfn); | ||
755 | } | ||
756 | 798 | ||
757 | /* check for huge page mappings */ | 799 | /* check for huge page mappings */ |
758 | for (i = PT_DIRECTORY_LEVEL; | 800 | for (i = PT_DIRECTORY_LEVEL; |
@@ -848,19 +890,16 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, | |||
848 | end = start + (memslot->npages << PAGE_SHIFT); | 890 | end = start + (memslot->npages << PAGE_SHIFT); |
849 | if (hva >= start && hva < end) { | 891 | if (hva >= start && hva < end) { |
850 | gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; | 892 | gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; |
893 | gfn_t gfn = memslot->base_gfn + gfn_offset; | ||
851 | 894 | ||
852 | ret = handler(kvm, &memslot->rmap[gfn_offset], data); | 895 | ret = handler(kvm, &memslot->rmap[gfn_offset], data); |
853 | 896 | ||
854 | for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) { | 897 | for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) { |
855 | unsigned long idx; | 898 | struct kvm_lpage_info *linfo; |
856 | int sh; | 899 | |
857 | 900 | linfo = lpage_info_slot(gfn, memslot, | |
858 | sh = KVM_HPAGE_GFN_SHIFT(PT_DIRECTORY_LEVEL+j); | 901 | PT_DIRECTORY_LEVEL + j); |
859 | idx = ((memslot->base_gfn+gfn_offset) >> sh) - | 902 | ret |= handler(kvm, &linfo->rmap_pde, data); |
860 | (memslot->base_gfn >> sh); | ||
861 | ret |= handler(kvm, | ||
862 | &memslot->lpage_info[j][idx].rmap_pde, | ||
863 | data); | ||
864 | } | 903 | } |
865 | trace_kvm_age_page(hva, memslot, ret); | 904 | trace_kvm_age_page(hva, memslot, ret); |
866 | retval |= ret; | 905 | retval |= ret; |
@@ -911,6 +950,35 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, | |||
911 | return young; | 950 | return young; |
912 | } | 951 | } |
913 | 952 | ||
953 | static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, | ||
954 | unsigned long data) | ||
955 | { | ||
956 | u64 *spte; | ||
957 | int young = 0; | ||
958 | |||
959 | /* | ||
960 | * If there's no access bit in the secondary pte set by the | ||
961 | * hardware it's up to gup-fast/gup to set the access bit in | ||
962 | * the primary pte or in the page structure. | ||
963 | */ | ||
964 | if (!shadow_accessed_mask) | ||
965 | goto out; | ||
966 | |||
967 | spte = rmap_next(kvm, rmapp, NULL); | ||
968 | while (spte) { | ||
969 | u64 _spte = *spte; | ||
970 | BUG_ON(!(_spte & PT_PRESENT_MASK)); | ||
971 | young = _spte & PT_ACCESSED_MASK; | ||
972 | if (young) { | ||
973 | young = 1; | ||
974 | break; | ||
975 | } | ||
976 | spte = rmap_next(kvm, rmapp, spte); | ||
977 | } | ||
978 | out: | ||
979 | return young; | ||
980 | } | ||
981 | |||
914 | #define RMAP_RECYCLE_THRESHOLD 1000 | 982 | #define RMAP_RECYCLE_THRESHOLD 1000 |
915 | 983 | ||
916 | static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) | 984 | static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) |
@@ -931,6 +999,11 @@ int kvm_age_hva(struct kvm *kvm, unsigned long hva) | |||
931 | return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp); | 999 | return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp); |
932 | } | 1000 | } |
933 | 1001 | ||
1002 | int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) | ||
1003 | { | ||
1004 | return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp); | ||
1005 | } | ||
1006 | |||
934 | #ifdef MMU_DEBUG | 1007 | #ifdef MMU_DEBUG |
935 | static int is_empty_shadow_page(u64 *spt) | 1008 | static int is_empty_shadow_page(u64 *spt) |
936 | { | 1009 | { |
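kvm_test_age_rmapp() and kvm_test_age_hva(), added above, are a read-only variant of the aging walk: they report whether any spte mapping the hva has been referenced, without clearing the accessed bit, and bail out early when the hardware provides no accessed bit at all. A stripped-down model over a plain array; the real code walks the rmap descriptor chain:

#include <stdint.h>
#include <stdio.h>

#define DEMO_PRESENT  (1ULL << 0)
#define DEMO_ACCESSED (1ULL << 5)   /* stand-in for PT_ACCESSED_MASK */

static int test_age_demo(const uint64_t *sptes, int n)
{
	for (int i = 0; i < n; i++) {
		if (sptes[i] & DEMO_ACCESSED)
			return 1;   /* referenced; accessed bit left untouched */
	}
	return 0;
}

int main(void)
{
	uint64_t chain[3] = { DEMO_PRESENT, DEMO_PRESENT, DEMO_PRESENT | DEMO_ACCESSED };
	printf("young=%d\n", test_age_demo(chain, 3));  /* 1: last entry was referenced */
	return 0;
}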
@@ -947,16 +1020,28 @@ static int is_empty_shadow_page(u64 *spt) | |||
947 | } | 1020 | } |
948 | #endif | 1021 | #endif |
949 | 1022 | ||
1023 | /* | ||
1024 | * This value is the sum of all of the kvm instances's | ||
1025 | * kvm->arch.n_used_mmu_pages values. We need a global, | ||
1026 | * aggregate version in order to make the slab shrinker | ||
1027 | * faster | ||
1028 | */ | ||
1029 | static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr) | ||
1030 | { | ||
1031 | kvm->arch.n_used_mmu_pages += nr; | ||
1032 | percpu_counter_add(&kvm_total_used_mmu_pages, nr); | ||
1033 | } | ||
1034 | |||
950 | static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) | 1035 | static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) |
951 | { | 1036 | { |
952 | ASSERT(is_empty_shadow_page(sp->spt)); | 1037 | ASSERT(is_empty_shadow_page(sp->spt)); |
953 | hlist_del(&sp->hash_link); | 1038 | hlist_del(&sp->hash_link); |
954 | list_del(&sp->link); | 1039 | list_del(&sp->link); |
955 | __free_page(virt_to_page(sp->spt)); | 1040 | free_page((unsigned long)sp->spt); |
956 | if (!sp->role.direct) | 1041 | if (!sp->role.direct) |
957 | __free_page(virt_to_page(sp->gfns)); | 1042 | free_page((unsigned long)sp->gfns); |
958 | kmem_cache_free(mmu_page_header_cache, sp); | 1043 | kmem_cache_free(mmu_page_header_cache, sp); |
959 | ++kvm->arch.n_free_mmu_pages; | 1044 | kvm_mod_used_mmu_pages(kvm, -1); |
960 | } | 1045 | } |
961 | 1046 | ||
962 | static unsigned kvm_page_table_hashfn(gfn_t gfn) | 1047 | static unsigned kvm_page_table_hashfn(gfn_t gfn) |
@@ -979,7 +1064,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, | |||
979 | bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); | 1064 | bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); |
980 | sp->multimapped = 0; | 1065 | sp->multimapped = 0; |
981 | sp->parent_pte = parent_pte; | 1066 | sp->parent_pte = parent_pte; |
982 | --vcpu->kvm->arch.n_free_mmu_pages; | 1067 | kvm_mod_used_mmu_pages(vcpu->kvm, +1); |
983 | return sp; | 1068 | return sp; |
984 | } | 1069 | } |
985 | 1070 | ||
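The accounting change above replaces the per-VM free-page count with a used-page count, and kvm_mod_used_mmu_pages() also feeds the global kvm_total_used_mmu_pages percpu_counter so the slab shrinker can read one aggregate instead of walking every VM. A toy model of the dual bookkeeping, using a plain long where the kernel uses a percpu_counter:

#include <stdio.h>

static long total_used_mmu_pages_demo;     /* global aggregate across all VMs */

struct vm_demo { long n_used_mmu_pages; }; /* per-VM count */

static void mod_used_mmu_pages_demo(struct vm_demo *vm, int nr)
{
	vm->n_used_mmu_pages += nr;
	total_used_mmu_pages_demo += nr;   /* kept in step with the per-VM value */
}

int main(void)
{
	struct vm_demo a = { 0 }, b = { 0 };
	mod_used_mmu_pages_demo(&a, +1);   /* page allocated for VM a */
	mod_used_mmu_pages_demo(&b, +1);   /* page allocated for VM b */
	mod_used_mmu_pages_demo(&a, -1);   /* page freed by VM a */
	printf("a=%ld b=%ld total=%ld\n",
	       a.n_used_mmu_pages, b.n_used_mmu_pages, total_used_mmu_pages_demo);
	return 0;
}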
@@ -1110,7 +1195,7 @@ static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, | |||
1110 | } | 1195 | } |
1111 | 1196 | ||
1112 | static int nonpaging_sync_page(struct kvm_vcpu *vcpu, | 1197 | static int nonpaging_sync_page(struct kvm_vcpu *vcpu, |
1113 | struct kvm_mmu_page *sp, bool clear_unsync) | 1198 | struct kvm_mmu_page *sp) |
1114 | { | 1199 | { |
1115 | return 1; | 1200 | return 1; |
1116 | } | 1201 | } |
@@ -1119,6 +1204,13 @@ static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva) | |||
1119 | { | 1204 | { |
1120 | } | 1205 | } |
1121 | 1206 | ||
1207 | static void nonpaging_update_pte(struct kvm_vcpu *vcpu, | ||
1208 | struct kvm_mmu_page *sp, u64 *spte, | ||
1209 | const void *pte) | ||
1210 | { | ||
1211 | WARN_ON(1); | ||
1212 | } | ||
1213 | |||
1122 | #define KVM_PAGE_ARRAY_NR 16 | 1214 | #define KVM_PAGE_ARRAY_NR 16 |
1123 | 1215 | ||
1124 | struct kvm_mmu_pages { | 1216 | struct kvm_mmu_pages { |
@@ -1240,7 +1332,7 @@ static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, | |||
1240 | if (clear_unsync) | 1332 | if (clear_unsync) |
1241 | kvm_unlink_unsync_page(vcpu->kvm, sp); | 1333 | kvm_unlink_unsync_page(vcpu->kvm, sp); |
1242 | 1334 | ||
1243 | if (vcpu->arch.mmu.sync_page(vcpu, sp, clear_unsync)) { | 1335 | if (vcpu->arch.mmu.sync_page(vcpu, sp)) { |
1244 | kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); | 1336 | kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); |
1245 | return 1; | 1337 | return 1; |
1246 | } | 1338 | } |
@@ -1281,12 +1373,12 @@ static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) | |||
1281 | continue; | 1373 | continue; |
1282 | 1374 | ||
1283 | WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); | 1375 | WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); |
1376 | kvm_unlink_unsync_page(vcpu->kvm, s); | ||
1284 | if ((s->role.cr4_pae != !!is_pae(vcpu)) || | 1377 | if ((s->role.cr4_pae != !!is_pae(vcpu)) || |
1285 | (vcpu->arch.mmu.sync_page(vcpu, s, true))) { | 1378 | (vcpu->arch.mmu.sync_page(vcpu, s))) { |
1286 | kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list); | 1379 | kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list); |
1287 | continue; | 1380 | continue; |
1288 | } | 1381 | } |
1289 | kvm_unlink_unsync_page(vcpu->kvm, s); | ||
1290 | flush = true; | 1382 | flush = true; |
1291 | } | 1383 | } |
1292 | 1384 | ||
@@ -1403,7 +1495,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, | |||
1403 | if (role.direct) | 1495 | if (role.direct) |
1404 | role.cr4_pae = 0; | 1496 | role.cr4_pae = 0; |
1405 | role.access = access; | 1497 | role.access = access; |
1406 | if (!tdp_enabled && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { | 1498 | if (!vcpu->arch.mmu.direct_map |
1499 | && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { | ||
1407 | quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); | 1500 | quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); |
1408 | quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; | 1501 | quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; |
1409 | role.quadrant = quadrant; | 1502 | role.quadrant = quadrant; |
@@ -1458,6 +1551,12 @@ static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator, | |||
1458 | iterator->addr = addr; | 1551 | iterator->addr = addr; |
1459 | iterator->shadow_addr = vcpu->arch.mmu.root_hpa; | 1552 | iterator->shadow_addr = vcpu->arch.mmu.root_hpa; |
1460 | iterator->level = vcpu->arch.mmu.shadow_root_level; | 1553 | iterator->level = vcpu->arch.mmu.shadow_root_level; |
1554 | |||
1555 | if (iterator->level == PT64_ROOT_LEVEL && | ||
1556 | vcpu->arch.mmu.root_level < PT64_ROOT_LEVEL && | ||
1557 | !vcpu->arch.mmu.direct_map) | ||
1558 | --iterator->level; | ||
1559 | |||
1461 | if (iterator->level == PT32E_ROOT_LEVEL) { | 1560 | if (iterator->level == PT32E_ROOT_LEVEL) { |
1462 | iterator->shadow_addr | 1561 | iterator->shadow_addr |
1463 | = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; | 1562 | = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; |
@@ -1665,41 +1764,31 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, | |||
1665 | 1764 | ||
1666 | /* | 1765 | /* |
1667 | * Changing the number of mmu pages allocated to the vm | 1766 | * Changing the number of mmu pages allocated to the vm |
1668 | * Note: if kvm_nr_mmu_pages is too small, you will get dead lock | 1767 | * Note: if goal_nr_mmu_pages is too small, you will get dead lock |
1669 | */ | 1768 | */ |
1670 | void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) | 1769 | void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages) |
1671 | { | 1770 | { |
1672 | int used_pages; | ||
1673 | LIST_HEAD(invalid_list); | 1771 | LIST_HEAD(invalid_list); |
1674 | |||
1675 | used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages; | ||
1676 | used_pages = max(0, used_pages); | ||
1677 | |||
1678 | /* | 1772 | /* |
1679 | * If we set the number of mmu pages to be smaller than the | 1773 | * If we set the number of mmu pages to be smaller than the |
1680 | * number of active pages, we must free some mmu pages before we | 1774 | * number of active pages, we must free some mmu pages before we |
1681 | * change the value | 1775 | * change the value |
1682 | */ | 1776 | */ |
1683 | 1777 | ||
1684 | if (used_pages > kvm_nr_mmu_pages) { | 1778 | if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) { |
1685 | while (used_pages > kvm_nr_mmu_pages && | 1779 | while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages && |
1686 | !list_empty(&kvm->arch.active_mmu_pages)) { | 1780 | !list_empty(&kvm->arch.active_mmu_pages)) { |
1687 | struct kvm_mmu_page *page; | 1781 | struct kvm_mmu_page *page; |
1688 | 1782 | ||
1689 | page = container_of(kvm->arch.active_mmu_pages.prev, | 1783 | page = container_of(kvm->arch.active_mmu_pages.prev, |
1690 | struct kvm_mmu_page, link); | 1784 | struct kvm_mmu_page, link); |
1691 | used_pages -= kvm_mmu_prepare_zap_page(kvm, page, | 1785 | kvm_mmu_prepare_zap_page(kvm, page, &invalid_list); |
1692 | &invalid_list); | 1786 | kvm_mmu_commit_zap_page(kvm, &invalid_list); |
1693 | } | 1787 | } |
1694 | kvm_mmu_commit_zap_page(kvm, &invalid_list); | 1788 | goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages; |
1695 | kvm_nr_mmu_pages = used_pages; | ||
1696 | kvm->arch.n_free_mmu_pages = 0; | ||
1697 | } | 1789 | } |
1698 | else | ||
1699 | kvm->arch.n_free_mmu_pages += kvm_nr_mmu_pages | ||
1700 | - kvm->arch.n_alloc_mmu_pages; | ||
1701 | 1790 | ||
1702 | kvm->arch.n_alloc_mmu_pages = kvm_nr_mmu_pages; | 1791 | kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages; |
1703 | } | 1792 | } |
1704 | 1793 | ||
1705 | static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) | 1794 | static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) |
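With n_used_mmu_pages maintained directly, the resize path above no longer derives "used" from alloc minus free: it zaps shadow pages from the tail of the active list until the used count meets the goal (or the list empties), then records the result in n_max_mmu_pages. A control-flow sketch with plain counters standing in for the page lists:

#include <stdio.h>

static void change_mmu_pages_demo(long *n_used, long *n_max, long goal)
{
	if (*n_used > goal) {
		while (*n_used > goal)
			(*n_used)--;      /* stands in for zapping one shadow page */
		/* the real loop can also stop early when the active list empties,
		 * so the cap is never reported below what is actually left */
		goal = *n_used;
	}
	*n_max = goal;
}

int main(void)
{
	long used = 120, max = 0;
	change_mmu_pages_demo(&used, &max, 100);   /* shrink toward the goal */
	printf("after shrink: used=%ld max=%ld\n", used, max);
	change_mmu_pages_demo(&used, &max, 400);   /* growing just raises the cap */
	printf("after grow:   used=%ld max=%ld\n", used, max);
	return 0;
}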
@@ -1709,11 +1798,11 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) | |||
1709 | LIST_HEAD(invalid_list); | 1798 | LIST_HEAD(invalid_list); |
1710 | int r; | 1799 | int r; |
1711 | 1800 | ||
1712 | pgprintk("%s: looking for gfn %lx\n", __func__, gfn); | 1801 | pgprintk("%s: looking for gfn %llx\n", __func__, gfn); |
1713 | r = 0; | 1802 | r = 0; |
1714 | 1803 | ||
1715 | for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) { | 1804 | for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) { |
1716 | pgprintk("%s: gfn %lx role %x\n", __func__, gfn, | 1805 | pgprintk("%s: gfn %llx role %x\n", __func__, gfn, |
1717 | sp->role.word); | 1806 | sp->role.word); |
1718 | r = 1; | 1807 | r = 1; |
1719 | kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); | 1808 | kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); |
@@ -1729,7 +1818,7 @@ static void mmu_unshadow(struct kvm *kvm, gfn_t gfn) | |||
1729 | LIST_HEAD(invalid_list); | 1818 | LIST_HEAD(invalid_list); |
1730 | 1819 | ||
1731 | for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) { | 1820 | for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) { |
1732 | pgprintk("%s: zap %lx %x\n", | 1821 | pgprintk("%s: zap %llx %x\n", |
1733 | __func__, gfn, sp->role.word); | 1822 | __func__, gfn, sp->role.word); |
1734 | kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); | 1823 | kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); |
1735 | } | 1824 | } |
@@ -1915,9 +2004,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
1915 | unsigned pte_access, int user_fault, | 2004 | unsigned pte_access, int user_fault, |
1916 | int write_fault, int dirty, int level, | 2005 | int write_fault, int dirty, int level, |
1917 | gfn_t gfn, pfn_t pfn, bool speculative, | 2006 | gfn_t gfn, pfn_t pfn, bool speculative, |
1918 | bool can_unsync, bool reset_host_protection) | 2007 | bool can_unsync, bool host_writable) |
1919 | { | 2008 | { |
1920 | u64 spte; | 2009 | u64 spte, entry = *sptep; |
1921 | int ret = 0; | 2010 | int ret = 0; |
1922 | 2011 | ||
1923 | /* | 2012 | /* |
@@ -1925,7 +2014,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
1925 | * whether the guest actually used the pte (in order to detect | 2014 | * whether the guest actually used the pte (in order to detect |
1926 | * demand paging). | 2015 | * demand paging). |
1927 | */ | 2016 | */ |
1928 | spte = shadow_base_present_pte | shadow_dirty_mask; | 2017 | spte = PT_PRESENT_MASK; |
1929 | if (!speculative) | 2018 | if (!speculative) |
1930 | spte |= shadow_accessed_mask; | 2019 | spte |= shadow_accessed_mask; |
1931 | if (!dirty) | 2020 | if (!dirty) |
@@ -1942,14 +2031,16 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
1942 | spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn, | 2031 | spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn, |
1943 | kvm_is_mmio_pfn(pfn)); | 2032 | kvm_is_mmio_pfn(pfn)); |
1944 | 2033 | ||
1945 | if (reset_host_protection) | 2034 | if (host_writable) |
1946 | spte |= SPTE_HOST_WRITEABLE; | 2035 | spte |= SPTE_HOST_WRITEABLE; |
2036 | else | ||
2037 | pte_access &= ~ACC_WRITE_MASK; | ||
1947 | 2038 | ||
1948 | spte |= (u64)pfn << PAGE_SHIFT; | 2039 | spte |= (u64)pfn << PAGE_SHIFT; |
1949 | 2040 | ||
1950 | if ((pte_access & ACC_WRITE_MASK) | 2041 | if ((pte_access & ACC_WRITE_MASK) |
1951 | || (!tdp_enabled && write_fault && !is_write_protection(vcpu) | 2042 | || (!vcpu->arch.mmu.direct_map && write_fault |
1952 | && !user_fault)) { | 2043 | && !is_write_protection(vcpu) && !user_fault)) { |
1953 | 2044 | ||
1954 | if (level > PT_PAGE_TABLE_LEVEL && | 2045 | if (level > PT_PAGE_TABLE_LEVEL && |
1955 | has_wrprotected_page(vcpu->kvm, gfn, level)) { | 2046 | has_wrprotected_page(vcpu->kvm, gfn, level)) { |
@@ -1960,7 +2051,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
1960 | 2051 | ||
1961 | spte |= PT_WRITABLE_MASK; | 2052 | spte |= PT_WRITABLE_MASK; |
1962 | 2053 | ||
1963 | if (!tdp_enabled && !(pte_access & ACC_WRITE_MASK)) | 2054 | if (!vcpu->arch.mmu.direct_map |
2055 | && !(pte_access & ACC_WRITE_MASK)) | ||
1964 | spte &= ~PT_USER_MASK; | 2056 | spte &= ~PT_USER_MASK; |
1965 | 2057 | ||
1966 | /* | 2058 | /* |
@@ -1973,7 +2065,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
1973 | goto set_pte; | 2065 | goto set_pte; |
1974 | 2066 | ||
1975 | if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { | 2067 | if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { |
1976 | pgprintk("%s: found shadow page for %lx, marking ro\n", | 2068 | pgprintk("%s: found shadow page for %llx, marking ro\n", |
1977 | __func__, gfn); | 2069 | __func__, gfn); |
1978 | ret = 1; | 2070 | ret = 1; |
1979 | pte_access &= ~ACC_WRITE_MASK; | 2071 | pte_access &= ~ACC_WRITE_MASK; |
@@ -1986,9 +2078,15 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
1986 | mark_page_dirty(vcpu->kvm, gfn); | 2078 | mark_page_dirty(vcpu->kvm, gfn); |
1987 | 2079 | ||
1988 | set_pte: | 2080 | set_pte: |
1989 | if (is_writable_pte(*sptep) && !is_writable_pte(spte)) | ||
1990 | kvm_set_pfn_dirty(pfn); | ||
1991 | update_spte(sptep, spte); | 2081 | update_spte(sptep, spte); |
2082 | /* | ||
2083 | * If we overwrite a writable spte with a read-only one we | ||
2084 | * should flush remote TLBs. Otherwise rmap_write_protect | ||
2085 | * will find a read-only spte, even though the writable spte | ||
2086 | * might be cached on a CPU's TLB. | ||
2087 | */ | ||
2088 | if (is_writable_pte(entry) && !is_writable_pte(*sptep)) | ||
2089 | kvm_flush_remote_tlbs(vcpu->kvm); | ||
1992 | done: | 2090 | done: |
1993 | return ret; | 2091 | return ret; |
1994 | } | 2092 | } |
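The tail of set_spte() now compares the spte captured on entry with the value just written: if a writable mapping became read-only, another CPU may still hold the stale writable translation in its TLB, so a remote flush is forced. The predicate on its own, with a bit value invented for the demo:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define DEMO_WRITABLE (1ULL << 1)   /* stand-in for PT_WRITABLE_MASK */

static bool needs_remote_tlb_flush(uint64_t old_spte, uint64_t new_spte)
{
	return (old_spte & DEMO_WRITABLE) && !(new_spte & DEMO_WRITABLE);
}

int main(void)
{
	printf("RW -> RO: flush=%d\n", needs_remote_tlb_flush(DEMO_WRITABLE, 0));             /* 1 */
	printf("RO -> RW: flush=%d\n", needs_remote_tlb_flush(0, DEMO_WRITABLE));             /* 0 */
	printf("RW -> RW: flush=%d\n", needs_remote_tlb_flush(DEMO_WRITABLE, DEMO_WRITABLE)); /* 0 */
	return 0;
}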
@@ -1998,13 +2096,13 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
1998 | int user_fault, int write_fault, int dirty, | 2096 | int user_fault, int write_fault, int dirty, |
1999 | int *ptwrite, int level, gfn_t gfn, | 2097 | int *ptwrite, int level, gfn_t gfn, |
2000 | pfn_t pfn, bool speculative, | 2098 | pfn_t pfn, bool speculative, |
2001 | bool reset_host_protection) | 2099 | bool host_writable) |
2002 | { | 2100 | { |
2003 | int was_rmapped = 0; | 2101 | int was_rmapped = 0; |
2004 | int rmap_count; | 2102 | int rmap_count; |
2005 | 2103 | ||
2006 | pgprintk("%s: spte %llx access %x write_fault %d" | 2104 | pgprintk("%s: spte %llx access %x write_fault %d" |
2007 | " user_fault %d gfn %lx\n", | 2105 | " user_fault %d gfn %llx\n", |
2008 | __func__, *sptep, pt_access, | 2106 | __func__, *sptep, pt_access, |
2009 | write_fault, user_fault, gfn); | 2107 | write_fault, user_fault, gfn); |
2010 | 2108 | ||
@@ -2023,7 +2121,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
2023 | __set_spte(sptep, shadow_trap_nonpresent_pte); | 2121 | __set_spte(sptep, shadow_trap_nonpresent_pte); |
2024 | kvm_flush_remote_tlbs(vcpu->kvm); | 2122 | kvm_flush_remote_tlbs(vcpu->kvm); |
2025 | } else if (pfn != spte_to_pfn(*sptep)) { | 2123 | } else if (pfn != spte_to_pfn(*sptep)) { |
2026 | pgprintk("hfn old %lx new %lx\n", | 2124 | pgprintk("hfn old %llx new %llx\n", |
2027 | spte_to_pfn(*sptep), pfn); | 2125 | spte_to_pfn(*sptep), pfn); |
2028 | drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); | 2126 | drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); |
2029 | kvm_flush_remote_tlbs(vcpu->kvm); | 2127 | kvm_flush_remote_tlbs(vcpu->kvm); |
@@ -2033,14 +2131,14 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
2033 | 2131 | ||
2034 | if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault, | 2132 | if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault, |
2035 | dirty, level, gfn, pfn, speculative, true, | 2133 | dirty, level, gfn, pfn, speculative, true, |
2036 | reset_host_protection)) { | 2134 | host_writable)) { |
2037 | if (write_fault) | 2135 | if (write_fault) |
2038 | *ptwrite = 1; | 2136 | *ptwrite = 1; |
2039 | kvm_mmu_flush_tlb(vcpu); | 2137 | kvm_mmu_flush_tlb(vcpu); |
2040 | } | 2138 | } |
2041 | 2139 | ||
2042 | pgprintk("%s: setting spte %llx\n", __func__, *sptep); | 2140 | pgprintk("%s: setting spte %llx\n", __func__, *sptep); |
2043 | pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n", | 2141 | pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n", |
2044 | is_large_pte(*sptep)? "2MB" : "4kB", | 2142 | is_large_pte(*sptep)? "2MB" : "4kB", |
2045 | *sptep & PT_PRESENT_MASK ?"RW":"R", gfn, | 2143 | *sptep & PT_PRESENT_MASK ?"RW":"R", gfn, |
2046 | *sptep, sptep); | 2144 | *sptep, sptep); |
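Several pgprintk format strings in this and the neighbouring hunks switch from %lx to %llx: gfn_t and pfn_t are 64-bit values, and %lx only matches them on 64-bit builds where unsigned long is 64 bits wide. A tiny illustration of the difference, using plain printf rather than the kernel's pgprintk:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t gfn = 0x123456789abcULL;   /* a gfn_t-sized value */

    /* %llx matches a 64-bit value on both 32-bit and 64-bit builds. */
    printf("gfn %llx\n", (unsigned long long)gfn);

    /* The portable userspace spelling of the same thing uses PRIx64. */
    printf("gfn %" PRIx64 "\n", gfn);
    return 0;
}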
@@ -2064,8 +2162,95 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) | |||
2064 | { | 2162 | { |
2065 | } | 2163 | } |
2066 | 2164 | ||
2165 | static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, | ||
2166 | bool no_dirty_log) | ||
2167 | { | ||
2168 | struct kvm_memory_slot *slot; | ||
2169 | unsigned long hva; | ||
2170 | |||
2171 | slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log); | ||
2172 | if (!slot) { | ||
2173 | get_page(bad_page); | ||
2174 | return page_to_pfn(bad_page); | ||
2175 | } | ||
2176 | |||
2177 | hva = gfn_to_hva_memslot(slot, gfn); | ||
2178 | |||
2179 | return hva_to_pfn_atomic(vcpu->kvm, hva); | ||
2180 | } | ||
2181 | |||
2182 | static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, | ||
2183 | struct kvm_mmu_page *sp, | ||
2184 | u64 *start, u64 *end) | ||
2185 | { | ||
2186 | struct page *pages[PTE_PREFETCH_NUM]; | ||
2187 | unsigned access = sp->role.access; | ||
2188 | int i, ret; | ||
2189 | gfn_t gfn; | ||
2190 | |||
2191 | gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt); | ||
2192 | if (!gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK)) | ||
2193 | return -1; | ||
2194 | |||
2195 | ret = gfn_to_page_many_atomic(vcpu->kvm, gfn, pages, end - start); | ||
2196 | if (ret <= 0) | ||
2197 | return -1; | ||
2198 | |||
2199 | for (i = 0; i < ret; i++, gfn++, start++) | ||
2200 | mmu_set_spte(vcpu, start, ACC_ALL, | ||
2201 | access, 0, 0, 1, NULL, | ||
2202 | sp->role.level, gfn, | ||
2203 | page_to_pfn(pages[i]), true, true); | ||
2204 | |||
2205 | return 0; | ||
2206 | } | ||
2207 | |||
2208 | static void __direct_pte_prefetch(struct kvm_vcpu *vcpu, | ||
2209 | struct kvm_mmu_page *sp, u64 *sptep) | ||
2210 | { | ||
2211 | u64 *spte, *start = NULL; | ||
2212 | int i; | ||
2213 | |||
2214 | WARN_ON(!sp->role.direct); | ||
2215 | |||
2216 | i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1); | ||
2217 | spte = sp->spt + i; | ||
2218 | |||
2219 | for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) { | ||
2220 | if (*spte != shadow_trap_nonpresent_pte || spte == sptep) { | ||
2221 | if (!start) | ||
2222 | continue; | ||
2223 | if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0) | ||
2224 | break; | ||
2225 | start = NULL; | ||
2226 | } else if (!start) | ||
2227 | start = spte; | ||
2228 | } | ||
2229 | } | ||
2230 | |||
2231 | static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) | ||
2232 | { | ||
2233 | struct kvm_mmu_page *sp; | ||
2234 | |||
2235 | /* | ||
2236 | * Since there is no accessed bit on EPT, there is no way to | ||
2237 | * distinguish between actually accessed translations | ||
2238 | * and prefetched ones, so disable pte prefetch if EPT is | ||
2239 | * enabled. | ||
2240 | */ | ||
2241 | if (!shadow_accessed_mask) | ||
2242 | return; | ||
2243 | |||
2244 | sp = page_header(__pa(sptep)); | ||
2245 | if (sp->role.level > PT_PAGE_TABLE_LEVEL) | ||
2246 | return; | ||
2247 | |||
2248 | __direct_pte_prefetch(vcpu, sp, sptep); | ||
2249 | } | ||
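direct_pte_prefetch() above scans an aligned window of PTE_PREFETCH_NUM shadow entries around the faulting spte and batches each run of still-nonpresent entries into a single gfn_to_page_many_atomic() call. The index alignment and run batching can be modelled in isolation; the sketch below uses toy types and a hypothetical prefetch_range() callback in place of the real KVM helpers:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define PTE_PREFETCH_NUM 8            /* must stay a power of two */
#define NONPRESENT       0            /* stand-in for shadow_trap_nonpresent_pte */

/* Hypothetical stand-in for gfn_to_page_many_atomic() + mmu_set_spte(). */
static void prefetch_range(size_t first, size_t last_excl)
{
    printf("prefetch sptes [%zu, %zu)\n", first, last_excl);
}

static void direct_pte_prefetch_model(const uint64_t *spt, size_t fault_idx)
{
    /* Align the scan window down to a PTE_PREFETCH_NUM boundary. */
    size_t base = fault_idx & ~(size_t)(PTE_PREFETCH_NUM - 1);
    size_t start = SIZE_MAX;          /* SIZE_MAX means "no open run" */

    for (size_t i = base; i < base + PTE_PREFETCH_NUM; i++) {
        if (spt[i] != NONPRESENT || i == fault_idx) {
            /* A present entry (or the faulting slot) closes the run. */
            if (start == SIZE_MAX)
                continue;
            prefetch_range(start, i);
            start = SIZE_MAX;
        } else if (start == SIZE_MAX) {
            start = i;                /* open a new run of empty entries */
        }
    }
    /* As in the kernel loop, a run still open at the end is not prefetched. */
}

int main(void)
{
    uint64_t spt[16] = { [3] = 1, [9] = 1 };  /* two already-present sptes */
    direct_pte_prefetch_model(spt, 5);        /* fault on entry 5 */
    return 0;
}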
2250 | |||
2067 | static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, | 2251 | static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, |
2068 | int level, gfn_t gfn, pfn_t pfn) | 2252 | int map_writable, int level, gfn_t gfn, pfn_t pfn, |
2253 | bool prefault) | ||
2069 | { | 2254 | { |
2070 | struct kvm_shadow_walk_iterator iterator; | 2255 | struct kvm_shadow_walk_iterator iterator; |
2071 | struct kvm_mmu_page *sp; | 2256 | struct kvm_mmu_page *sp; |
@@ -2074,9 +2259,12 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, | |||
2074 | 2259 | ||
2075 | for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { | 2260 | for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { |
2076 | if (iterator.level == level) { | 2261 | if (iterator.level == level) { |
2077 | mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL, | 2262 | unsigned pte_access = ACC_ALL; |
2263 | |||
2264 | mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access, | ||
2078 | 0, write, 1, &pt_write, | 2265 | 0, write, 1, &pt_write, |
2079 | level, gfn, pfn, false, true); | 2266 | level, gfn, pfn, prefault, map_writable); |
2267 | direct_pte_prefetch(vcpu, iterator.sptep); | ||
2080 | ++vcpu->stat.pf_fixed; | 2268 | ++vcpu->stat.pf_fixed; |
2081 | break; | 2269 | break; |
2082 | } | 2270 | } |
@@ -2098,28 +2286,31 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, | |||
2098 | __set_spte(iterator.sptep, | 2286 | __set_spte(iterator.sptep, |
2099 | __pa(sp->spt) | 2287 | __pa(sp->spt) |
2100 | | PT_PRESENT_MASK | PT_WRITABLE_MASK | 2288 | | PT_PRESENT_MASK | PT_WRITABLE_MASK |
2101 | | shadow_user_mask | shadow_x_mask); | 2289 | | shadow_user_mask | shadow_x_mask |
2290 | | shadow_accessed_mask); | ||
2102 | } | 2291 | } |
2103 | } | 2292 | } |
2104 | return pt_write; | 2293 | return pt_write; |
2105 | } | 2294 | } |
2106 | 2295 | ||
2107 | static void kvm_send_hwpoison_signal(struct kvm *kvm, gfn_t gfn) | 2296 | static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk) |
2108 | { | 2297 | { |
2109 | char buf[1]; | 2298 | siginfo_t info; |
2110 | void __user *hva; | ||
2111 | int r; | ||
2112 | 2299 | ||
2113 | /* Touch the page, so send SIGBUS */ | 2300 | info.si_signo = SIGBUS; |
2114 | hva = (void __user *)gfn_to_hva(kvm, gfn); | 2301 | info.si_errno = 0; |
2115 | r = copy_from_user(buf, hva, 1); | 2302 | info.si_code = BUS_MCEERR_AR; |
2303 | info.si_addr = (void __user *)address; | ||
2304 | info.si_addr_lsb = PAGE_SHIFT; | ||
2305 | |||
2306 | send_sig_info(SIGBUS, &info, tsk); | ||
2116 | } | 2307 | } |
2117 | 2308 | ||
2118 | static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn) | 2309 | static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn) |
2119 | { | 2310 | { |
2120 | kvm_release_pfn_clean(pfn); | 2311 | kvm_release_pfn_clean(pfn); |
2121 | if (is_hwpoison_pfn(pfn)) { | 2312 | if (is_hwpoison_pfn(pfn)) { |
2122 | kvm_send_hwpoison_signal(kvm, gfn); | 2313 | kvm_send_hwpoison_signal(gfn_to_hva(kvm, gfn), current); |
2123 | return 0; | 2314 | return 0; |
2124 | } else if (is_fault_pfn(pfn)) | 2315 | } else if (is_fault_pfn(pfn)) |
2125 | return -EFAULT; | 2316 | return -EFAULT; |
@@ -2127,27 +2318,81 @@ static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn) | |||
2127 | return 1; | 2318 | return 1; |
2128 | } | 2319 | } |
2129 | 2320 | ||
2130 | static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) | 2321 | static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, |
2322 | gfn_t *gfnp, pfn_t *pfnp, int *levelp) | ||
2323 | { | ||
2324 | pfn_t pfn = *pfnp; | ||
2325 | gfn_t gfn = *gfnp; | ||
2326 | int level = *levelp; | ||
2327 | |||
2328 | /* | ||
2329 | * Check if it's a transparent hugepage. If this were a | ||
2330 | * hugetlbfs page, level wouldn't be set to | ||
2331 | * PT_PAGE_TABLE_LEVEL and there would be no adjustment done | ||
2332 | * here. | ||
2333 | */ | ||
2334 | if (!is_error_pfn(pfn) && !kvm_is_mmio_pfn(pfn) && | ||
2335 | level == PT_PAGE_TABLE_LEVEL && | ||
2336 | PageTransCompound(pfn_to_page(pfn)) && | ||
2337 | !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) { | ||
2338 | unsigned long mask; | ||
2339 | /* | ||
2340 | * mmu_notifier_retry was successful and we hold the | ||
2341 | * mmu_lock here, so the pmd can't start splitting | ||
2342 | * under us, and in turn | ||
2343 | * __split_huge_page_refcount() can't run under | ||
2344 | * us, so we can safely transfer the refcount from | ||
2345 | * PG_tail to PG_head as we switch the pfn from tail | ||
2346 | * to head. | ||
2347 | */ | ||
2348 | *levelp = level = PT_DIRECTORY_LEVEL; | ||
2349 | mask = KVM_PAGES_PER_HPAGE(level) - 1; | ||
2350 | VM_BUG_ON((gfn & mask) != (pfn & mask)); | ||
2351 | if (pfn & mask) { | ||
2352 | gfn &= ~mask; | ||
2353 | *gfnp = gfn; | ||
2354 | kvm_release_pfn_clean(pfn); | ||
2355 | pfn &= ~mask; | ||
2356 | if (!get_page_unless_zero(pfn_to_page(pfn))) | ||
2357 | BUG(); | ||
2358 | *pfnp = pfn; | ||
2359 | } | ||
2360 | } | ||
2361 | } | ||
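transparent_hugepage_adjust() above promotes a 4K mapping to a 2M one only when gfn and pfn share the same offset inside the huge page, and then rounds both down to the huge-page boundary. The alignment arithmetic is easy to see in isolation; the sketch below uses a hypothetical PAGES_PER_HPAGE constant in place of KVM_PAGES_PER_HPAGE(level) and leaves out the page-refcount handling:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define PAGES_PER_HPAGE 512u           /* one 2M huge page = 512 * 4K pages */

/*
 * Round a (gfn, pfn) pair down to a huge-page boundary. The promotion is
 * only legal when both share the same offset within the huge page, i.e.
 * (gfn & mask) == (pfn & mask); the kernel asserts this with VM_BUG_ON().
 */
static void hugepage_adjust(uint64_t *gfn, uint64_t *pfn)
{
    uint64_t mask = PAGES_PER_HPAGE - 1;

    assert((*gfn & mask) == (*pfn & mask));

    if (*pfn & mask) {                 /* not yet aligned: round both down */
        *gfn &= ~mask;
        *pfn &= ~mask;                 /* the kernel also moves the page
                                          reference from tail to head page */
    }
}

int main(void)
{
    uint64_t gfn = 0x1234 * 512 + 7, pfn = 0x8000 * 512 + 7;

    hugepage_adjust(&gfn, &pfn);
    printf("gfn %llx pfn %llx\n",
           (unsigned long long)gfn, (unsigned long long)pfn);
    return 0;
}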
2362 | |||
2363 | static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, | ||
2364 | gva_t gva, pfn_t *pfn, bool write, bool *writable); | ||
2365 | |||
2366 | static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn, | ||
2367 | bool prefault) | ||
2131 | { | 2368 | { |
2132 | int r; | 2369 | int r; |
2133 | int level; | 2370 | int level; |
2371 | int force_pt_level; | ||
2134 | pfn_t pfn; | 2372 | pfn_t pfn; |
2135 | unsigned long mmu_seq; | 2373 | unsigned long mmu_seq; |
2374 | bool map_writable; | ||
2136 | 2375 | ||
2137 | level = mapping_level(vcpu, gfn); | 2376 | force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn); |
2138 | 2377 | if (likely(!force_pt_level)) { | |
2139 | /* | 2378 | level = mapping_level(vcpu, gfn); |
2140 | * This path builds a PAE pagetable - so we can map 2mb pages at | 2379 | /* |
2141 | * maximum. Therefore check if the level is larger than that. | 2380 | * This path builds a PAE pagetable - so we can map |
2142 | */ | 2381 | * 2mb pages at maximum. Therefore check if the level |
2143 | if (level > PT_DIRECTORY_LEVEL) | 2382 | * is larger than that. |
2144 | level = PT_DIRECTORY_LEVEL; | 2383 | */ |
2384 | if (level > PT_DIRECTORY_LEVEL) | ||
2385 | level = PT_DIRECTORY_LEVEL; | ||
2145 | 2386 | ||
2146 | gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); | 2387 | gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); |
2388 | } else | ||
2389 | level = PT_PAGE_TABLE_LEVEL; | ||
2147 | 2390 | ||
2148 | mmu_seq = vcpu->kvm->mmu_notifier_seq; | 2391 | mmu_seq = vcpu->kvm->mmu_notifier_seq; |
2149 | smp_rmb(); | 2392 | smp_rmb(); |
2150 | pfn = gfn_to_pfn(vcpu->kvm, gfn); | 2393 | |
2394 | if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable)) | ||
2395 | return 0; | ||
2151 | 2396 | ||
2152 | /* mmio */ | 2397 | /* mmio */ |
2153 | if (is_error_pfn(pfn)) | 2398 | if (is_error_pfn(pfn)) |
@@ -2157,7 +2402,10 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) | |||
2157 | if (mmu_notifier_retry(vcpu, mmu_seq)) | 2402 | if (mmu_notifier_retry(vcpu, mmu_seq)) |
2158 | goto out_unlock; | 2403 | goto out_unlock; |
2159 | kvm_mmu_free_some_pages(vcpu); | 2404 | kvm_mmu_free_some_pages(vcpu); |
2160 | r = __direct_map(vcpu, v, write, level, gfn, pfn); | 2405 | if (likely(!force_pt_level)) |
2406 | transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); | ||
2407 | r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn, | ||
2408 | prefault); | ||
2161 | spin_unlock(&vcpu->kvm->mmu_lock); | 2409 | spin_unlock(&vcpu->kvm->mmu_lock); |
2162 | 2410 | ||
2163 | 2411 | ||
@@ -2179,7 +2427,9 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu) | |||
2179 | if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) | 2427 | if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) |
2180 | return; | 2428 | return; |
2181 | spin_lock(&vcpu->kvm->mmu_lock); | 2429 | spin_lock(&vcpu->kvm->mmu_lock); |
2182 | if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { | 2430 | if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL && |
2431 | (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL || | ||
2432 | vcpu->arch.mmu.direct_map)) { | ||
2183 | hpa_t root = vcpu->arch.mmu.root_hpa; | 2433 | hpa_t root = vcpu->arch.mmu.root_hpa; |
2184 | 2434 | ||
2185 | sp = page_header(root); | 2435 | sp = page_header(root); |
@@ -2222,83 +2472,163 @@ static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn) | |||
2222 | return ret; | 2472 | return ret; |
2223 | } | 2473 | } |
2224 | 2474 | ||
2225 | static int mmu_alloc_roots(struct kvm_vcpu *vcpu) | 2475 | static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) |
2226 | { | 2476 | { |
2227 | int i; | ||
2228 | gfn_t root_gfn; | ||
2229 | struct kvm_mmu_page *sp; | 2477 | struct kvm_mmu_page *sp; |
2230 | int direct = 0; | 2478 | unsigned i; |
2231 | u64 pdptr; | ||
2232 | |||
2233 | root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT; | ||
2234 | 2479 | ||
2235 | if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { | 2480 | if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { |
2481 | spin_lock(&vcpu->kvm->mmu_lock); | ||
2482 | kvm_mmu_free_some_pages(vcpu); | ||
2483 | sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_LEVEL, | ||
2484 | 1, ACC_ALL, NULL); | ||
2485 | ++sp->root_count; | ||
2486 | spin_unlock(&vcpu->kvm->mmu_lock); | ||
2487 | vcpu->arch.mmu.root_hpa = __pa(sp->spt); | ||
2488 | } else if (vcpu->arch.mmu.shadow_root_level == PT32E_ROOT_LEVEL) { | ||
2489 | for (i = 0; i < 4; ++i) { | ||
2490 | hpa_t root = vcpu->arch.mmu.pae_root[i]; | ||
2491 | |||
2492 | ASSERT(!VALID_PAGE(root)); | ||
2493 | spin_lock(&vcpu->kvm->mmu_lock); | ||
2494 | kvm_mmu_free_some_pages(vcpu); | ||
2495 | sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT), | ||
2496 | i << 30, | ||
2497 | PT32_ROOT_LEVEL, 1, ACC_ALL, | ||
2498 | NULL); | ||
2499 | root = __pa(sp->spt); | ||
2500 | ++sp->root_count; | ||
2501 | spin_unlock(&vcpu->kvm->mmu_lock); | ||
2502 | vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK; | ||
2503 | } | ||
2504 | vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); | ||
2505 | } else | ||
2506 | BUG(); | ||
2507 | |||
2508 | return 0; | ||
2509 | } | ||
2510 | |||
2511 | static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) | ||
2512 | { | ||
2513 | struct kvm_mmu_page *sp; | ||
2514 | u64 pdptr, pm_mask; | ||
2515 | gfn_t root_gfn; | ||
2516 | int i; | ||
2517 | |||
2518 | root_gfn = vcpu->arch.mmu.get_cr3(vcpu) >> PAGE_SHIFT; | ||
2519 | |||
2520 | if (mmu_check_root(vcpu, root_gfn)) | ||
2521 | return 1; | ||
2522 | |||
2523 | /* | ||
2524 | * Do we shadow a long mode page table? If so we need to | ||
2525 | * write-protect the guest's page table root. | ||
2526 | */ | ||
2527 | if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { | ||
2236 | hpa_t root = vcpu->arch.mmu.root_hpa; | 2528 | hpa_t root = vcpu->arch.mmu.root_hpa; |
2237 | 2529 | ||
2238 | ASSERT(!VALID_PAGE(root)); | 2530 | ASSERT(!VALID_PAGE(root)); |
2239 | if (mmu_check_root(vcpu, root_gfn)) | 2531 | |
2240 | return 1; | ||
2241 | if (tdp_enabled) { | ||
2242 | direct = 1; | ||
2243 | root_gfn = 0; | ||
2244 | } | ||
2245 | spin_lock(&vcpu->kvm->mmu_lock); | 2532 | spin_lock(&vcpu->kvm->mmu_lock); |
2246 | kvm_mmu_free_some_pages(vcpu); | 2533 | kvm_mmu_free_some_pages(vcpu); |
2247 | sp = kvm_mmu_get_page(vcpu, root_gfn, 0, | 2534 | sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL, |
2248 | PT64_ROOT_LEVEL, direct, | 2535 | 0, ACC_ALL, NULL); |
2249 | ACC_ALL, NULL); | ||
2250 | root = __pa(sp->spt); | 2536 | root = __pa(sp->spt); |
2251 | ++sp->root_count; | 2537 | ++sp->root_count; |
2252 | spin_unlock(&vcpu->kvm->mmu_lock); | 2538 | spin_unlock(&vcpu->kvm->mmu_lock); |
2253 | vcpu->arch.mmu.root_hpa = root; | 2539 | vcpu->arch.mmu.root_hpa = root; |
2254 | return 0; | 2540 | return 0; |
2255 | } | 2541 | } |
2256 | direct = !is_paging(vcpu); | 2542 | |
2543 | /* | ||
2544 | * We shadow a 32 bit page table. This may be a legacy 2-level | ||
2545 | * or a PAE 3-level page table. In either case we need to be aware that | ||
2546 | * the shadow page table may be a PAE or a long mode page table. | ||
2547 | */ | ||
2548 | pm_mask = PT_PRESENT_MASK; | ||
2549 | if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) | ||
2550 | pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK; | ||
2551 | |||
2257 | for (i = 0; i < 4; ++i) { | 2552 | for (i = 0; i < 4; ++i) { |
2258 | hpa_t root = vcpu->arch.mmu.pae_root[i]; | 2553 | hpa_t root = vcpu->arch.mmu.pae_root[i]; |
2259 | 2554 | ||
2260 | ASSERT(!VALID_PAGE(root)); | 2555 | ASSERT(!VALID_PAGE(root)); |
2261 | if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { | 2556 | if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { |
2262 | pdptr = kvm_pdptr_read(vcpu, i); | 2557 | pdptr = kvm_pdptr_read_mmu(vcpu, &vcpu->arch.mmu, i); |
2263 | if (!is_present_gpte(pdptr)) { | 2558 | if (!is_present_gpte(pdptr)) { |
2264 | vcpu->arch.mmu.pae_root[i] = 0; | 2559 | vcpu->arch.mmu.pae_root[i] = 0; |
2265 | continue; | 2560 | continue; |
2266 | } | 2561 | } |
2267 | root_gfn = pdptr >> PAGE_SHIFT; | 2562 | root_gfn = pdptr >> PAGE_SHIFT; |
2268 | } else if (vcpu->arch.mmu.root_level == 0) | 2563 | if (mmu_check_root(vcpu, root_gfn)) |
2269 | root_gfn = 0; | 2564 | return 1; |
2270 | if (mmu_check_root(vcpu, root_gfn)) | ||
2271 | return 1; | ||
2272 | if (tdp_enabled) { | ||
2273 | direct = 1; | ||
2274 | root_gfn = i << 30; | ||
2275 | } | 2565 | } |
2276 | spin_lock(&vcpu->kvm->mmu_lock); | 2566 | spin_lock(&vcpu->kvm->mmu_lock); |
2277 | kvm_mmu_free_some_pages(vcpu); | 2567 | kvm_mmu_free_some_pages(vcpu); |
2278 | sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, | 2568 | sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, |
2279 | PT32_ROOT_LEVEL, direct, | 2569 | PT32_ROOT_LEVEL, 0, |
2280 | ACC_ALL, NULL); | 2570 | ACC_ALL, NULL); |
2281 | root = __pa(sp->spt); | 2571 | root = __pa(sp->spt); |
2282 | ++sp->root_count; | 2572 | ++sp->root_count; |
2283 | spin_unlock(&vcpu->kvm->mmu_lock); | 2573 | spin_unlock(&vcpu->kvm->mmu_lock); |
2284 | 2574 | ||
2285 | vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK; | 2575 | vcpu->arch.mmu.pae_root[i] = root | pm_mask; |
2286 | } | 2576 | } |
2287 | vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); | 2577 | vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); |
2578 | |||
2579 | /* | ||
2580 | * If we shadow a 32 bit page table with a long mode page | ||
2581 | * table we enter this path. | ||
2582 | */ | ||
2583 | if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { | ||
2584 | if (vcpu->arch.mmu.lm_root == NULL) { | ||
2585 | /* | ||
2586 | * The additional page necessary for this is only | ||
2587 | * allocated on demand. | ||
2588 | */ | ||
2589 | |||
2590 | u64 *lm_root; | ||
2591 | |||
2592 | lm_root = (void*)get_zeroed_page(GFP_KERNEL); | ||
2593 | if (lm_root == NULL) | ||
2594 | return 1; | ||
2595 | |||
2596 | lm_root[0] = __pa(vcpu->arch.mmu.pae_root) | pm_mask; | ||
2597 | |||
2598 | vcpu->arch.mmu.lm_root = lm_root; | ||
2599 | } | ||
2600 | |||
2601 | vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.lm_root); | ||
2602 | } | ||
2603 | |||
2288 | return 0; | 2604 | return 0; |
2289 | } | 2605 | } |
2290 | 2606 | ||
2607 | static int mmu_alloc_roots(struct kvm_vcpu *vcpu) | ||
2608 | { | ||
2609 | if (vcpu->arch.mmu.direct_map) | ||
2610 | return mmu_alloc_direct_roots(vcpu); | ||
2611 | else | ||
2612 | return mmu_alloc_shadow_roots(vcpu); | ||
2613 | } | ||
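The old mmu_alloc_roots() mixed the TDP and shadow-paging cases in one body; it is now split into mmu_alloc_direct_roots() for direct-mapped contexts and mmu_alloc_shadow_roots() for shadowed guest page tables, with pm_mask deciding which bits the four pae_root entries carry and lm_root being allocated on demand when a 32-bit guest is shadowed by a long-mode table. A compact decision sketch of the split, with stand-in constants rather than the real kvm_mmu structures:

#include <stdbool.h>
#include <stdio.h>

enum { PT32_ROOT_LEVEL = 2, PT32E_ROOT_LEVEL = 3, PT64_ROOT_LEVEL = 4 };

/* Bits carried by the four pae_root[] entries when shadowing a 32-bit guest. */
#define PT_PRESENT  0x001ull
#define PT_WRITABLE 0x002ull
#define PT_USER     0x004ull
#define PT_ACCESSED 0x020ull

static void describe_root_alloc(bool direct_map, int shadow_root_level,
                                int guest_root_level)
{
    if (direct_map) {
        /* mmu_alloc_direct_roots(): one PT64 root or four PAE roots. */
        printf("direct roots, shadow level %d\n", shadow_root_level);
        return;
    }

    /* mmu_alloc_shadow_roots() */
    if (guest_root_level == PT64_ROOT_LEVEL) {
        printf("single shadowed long-mode root\n");
        return;
    }

    /*
     * 32-bit or PAE guest: four PAE roots. When the shadow table is a
     * long-mode table, its upper-level entries behave like ordinary PDEs,
     * so they also need the accessed/writable/user bits (pm_mask above).
     */
    unsigned long long pm_mask = PT_PRESENT;
    if (shadow_root_level == PT64_ROOT_LEVEL)
        pm_mask |= PT_ACCESSED | PT_WRITABLE | PT_USER;
    printf("four pae roots, pm_mask %#llx%s\n", pm_mask,
           shadow_root_level == PT64_ROOT_LEVEL ?
           " (plus an on-demand lm_root level)" : "");
}

int main(void)
{
    describe_root_alloc(true,  PT64_ROOT_LEVEL, 0);                /* TDP */
    describe_root_alloc(false, PT64_ROOT_LEVEL, PT32E_ROOT_LEVEL); /* PAE guest, 64-bit shadow */
    return 0;
}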
2614 | |||
2291 | static void mmu_sync_roots(struct kvm_vcpu *vcpu) | 2615 | static void mmu_sync_roots(struct kvm_vcpu *vcpu) |
2292 | { | 2616 | { |
2293 | int i; | 2617 | int i; |
2294 | struct kvm_mmu_page *sp; | 2618 | struct kvm_mmu_page *sp; |
2295 | 2619 | ||
2620 | if (vcpu->arch.mmu.direct_map) | ||
2621 | return; | ||
2622 | |||
2296 | if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) | 2623 | if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) |
2297 | return; | 2624 | return; |
2298 | if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { | 2625 | |
2626 | trace_kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); | ||
2627 | if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { | ||
2299 | hpa_t root = vcpu->arch.mmu.root_hpa; | 2628 | hpa_t root = vcpu->arch.mmu.root_hpa; |
2300 | sp = page_header(root); | 2629 | sp = page_header(root); |
2301 | mmu_sync_children(vcpu, sp); | 2630 | mmu_sync_children(vcpu, sp); |
2631 | trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); | ||
2302 | return; | 2632 | return; |
2303 | } | 2633 | } |
2304 | for (i = 0; i < 4; ++i) { | 2634 | for (i = 0; i < 4; ++i) { |
@@ -2310,6 +2640,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu) | |||
2310 | mmu_sync_children(vcpu, sp); | 2640 | mmu_sync_children(vcpu, sp); |
2311 | } | 2641 | } |
2312 | } | 2642 | } |
2643 | trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); | ||
2313 | } | 2644 | } |
2314 | 2645 | ||
2315 | void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) | 2646 | void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) |
@@ -2320,15 +2651,24 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) | |||
2320 | } | 2651 | } |
2321 | 2652 | ||
2322 | static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr, | 2653 | static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr, |
2323 | u32 access, u32 *error) | 2654 | u32 access, struct x86_exception *exception) |
2324 | { | 2655 | { |
2325 | if (error) | 2656 | if (exception) |
2326 | *error = 0; | 2657 | exception->error_code = 0; |
2327 | return vaddr; | 2658 | return vaddr; |
2328 | } | 2659 | } |
2329 | 2660 | ||
2661 | static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr, | ||
2662 | u32 access, | ||
2663 | struct x86_exception *exception) | ||
2664 | { | ||
2665 | if (exception) | ||
2666 | exception->error_code = 0; | ||
2667 | return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access); | ||
2668 | } | ||
2669 | |||
2330 | static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, | 2670 | static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, |
2331 | u32 error_code) | 2671 | u32 error_code, bool prefault) |
2332 | { | 2672 | { |
2333 | gfn_t gfn; | 2673 | gfn_t gfn; |
2334 | int r; | 2674 | int r; |
@@ -2344,17 +2684,68 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, | |||
2344 | gfn = gva >> PAGE_SHIFT; | 2684 | gfn = gva >> PAGE_SHIFT; |
2345 | 2685 | ||
2346 | return nonpaging_map(vcpu, gva & PAGE_MASK, | 2686 | return nonpaging_map(vcpu, gva & PAGE_MASK, |
2347 | error_code & PFERR_WRITE_MASK, gfn); | 2687 | error_code & PFERR_WRITE_MASK, gfn, prefault); |
2688 | } | ||
2689 | |||
2690 | static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn) | ||
2691 | { | ||
2692 | struct kvm_arch_async_pf arch; | ||
2693 | |||
2694 | arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id; | ||
2695 | arch.gfn = gfn; | ||
2696 | arch.direct_map = vcpu->arch.mmu.direct_map; | ||
2697 | arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu); | ||
2698 | |||
2699 | return kvm_setup_async_pf(vcpu, gva, gfn, &arch); | ||
2700 | } | ||
2701 | |||
2702 | static bool can_do_async_pf(struct kvm_vcpu *vcpu) | ||
2703 | { | ||
2704 | if (unlikely(!irqchip_in_kernel(vcpu->kvm) || | ||
2705 | kvm_event_needs_reinjection(vcpu))) | ||
2706 | return false; | ||
2707 | |||
2708 | return kvm_x86_ops->interrupt_allowed(vcpu); | ||
2709 | } | ||
2710 | |||
2711 | static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, | ||
2712 | gva_t gva, pfn_t *pfn, bool write, bool *writable) | ||
2713 | { | ||
2714 | bool async; | ||
2715 | |||
2716 | *pfn = gfn_to_pfn_async(vcpu->kvm, gfn, &async, write, writable); | ||
2717 | |||
2718 | if (!async) | ||
2719 | return false; /* *pfn has correct page already */ | ||
2720 | |||
2721 | put_page(pfn_to_page(*pfn)); | ||
2722 | |||
2723 | if (!prefault && can_do_async_pf(vcpu)) { | ||
2724 | trace_kvm_try_async_get_page(gva, gfn); | ||
2725 | if (kvm_find_async_pf_gfn(vcpu, gfn)) { | ||
2726 | trace_kvm_async_pf_doublefault(gva, gfn); | ||
2727 | kvm_make_request(KVM_REQ_APF_HALT, vcpu); | ||
2728 | return true; | ||
2729 | } else if (kvm_arch_setup_async_pf(vcpu, gva, gfn)) | ||
2730 | return true; | ||
2731 | } | ||
2732 | |||
2733 | *pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write, writable); | ||
2734 | |||
2735 | return false; | ||
2348 | } | 2736 | } |
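try_async_pf() above is the gate for the new asynchronous page fault path: if gfn_to_pfn_async() reports that the page would have to be faulted in, the vCPU either queues an async work item and lets the guest keep running, halts when the same gfn already has an async fault in flight, or falls back to a blocking gfn_to_pfn_prot(). The decision can be summarized with a small standalone model; the struct and enum names here are stand-ins, not KVM API:

#include <stdbool.h>
#include <stdio.h>

enum pf_path {
    PF_SYNC_FAST,     /* page already resident: continue with the fault */
    PF_ASYNC_QUEUED,  /* async work queued: guest keeps running */
    PF_HALT,          /* same gfn already pending: halt until it completes */
    PF_SYNC_SLOW,     /* async not possible: blocking fault-in */
};

/* Inputs the real code derives from the vcpu and from gfn_to_pfn_async(). */
struct pf_state {
    bool page_resident;      /* !async returned by gfn_to_pfn_async() */
    bool prefault;           /* this fault is itself an async-pf retry */
    bool can_do_async;       /* in-kernel irqchip, no pending event, IF=1 */
    bool gfn_already_queued; /* kvm_find_async_pf_gfn() hit */
    bool setup_ok;           /* kvm_arch_setup_async_pf() succeeded */
};

static enum pf_path choose_path(const struct pf_state *s)
{
    if (s->page_resident)
        return PF_SYNC_FAST;
    if (!s->prefault && s->can_do_async) {
        if (s->gfn_already_queued)
            return PF_HALT;          /* KVM_REQ_APF_HALT in the patch */
        if (s->setup_ok)
            return PF_ASYNC_QUEUED;
    }
    return PF_SYNC_SLOW;             /* gfn_to_pfn_prot() fallback */
}

int main(void)
{
    struct pf_state s = { .page_resident = false, .prefault = false,
                          .can_do_async = true, .gfn_already_queued = false,
                          .setup_ok = true };
    printf("path = %d\n", choose_path(&s));   /* PF_ASYNC_QUEUED */
    return 0;
}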
2349 | 2737 | ||
2350 | static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, | 2738 | static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, |
2351 | u32 error_code) | 2739 | bool prefault) |
2352 | { | 2740 | { |
2353 | pfn_t pfn; | 2741 | pfn_t pfn; |
2354 | int r; | 2742 | int r; |
2355 | int level; | 2743 | int level; |
2744 | int force_pt_level; | ||
2356 | gfn_t gfn = gpa >> PAGE_SHIFT; | 2745 | gfn_t gfn = gpa >> PAGE_SHIFT; |
2357 | unsigned long mmu_seq; | 2746 | unsigned long mmu_seq; |
2747 | int write = error_code & PFERR_WRITE_MASK; | ||
2748 | bool map_writable; | ||
2358 | 2749 | ||
2359 | ASSERT(vcpu); | 2750 | ASSERT(vcpu); |
2360 | ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); | 2751 | ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); |
@@ -2363,21 +2754,30 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, | |||
2363 | if (r) | 2754 | if (r) |
2364 | return r; | 2755 | return r; |
2365 | 2756 | ||
2366 | level = mapping_level(vcpu, gfn); | 2757 | force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn); |
2367 | 2758 | if (likely(!force_pt_level)) { | |
2368 | gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); | 2759 | level = mapping_level(vcpu, gfn); |
2760 | gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); | ||
2761 | } else | ||
2762 | level = PT_PAGE_TABLE_LEVEL; | ||
2369 | 2763 | ||
2370 | mmu_seq = vcpu->kvm->mmu_notifier_seq; | 2764 | mmu_seq = vcpu->kvm->mmu_notifier_seq; |
2371 | smp_rmb(); | 2765 | smp_rmb(); |
2372 | pfn = gfn_to_pfn(vcpu->kvm, gfn); | 2766 | |
2767 | if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable)) | ||
2768 | return 0; | ||
2769 | |||
2770 | /* mmio */ | ||
2373 | if (is_error_pfn(pfn)) | 2771 | if (is_error_pfn(pfn)) |
2374 | return kvm_handle_bad_page(vcpu->kvm, gfn, pfn); | 2772 | return kvm_handle_bad_page(vcpu->kvm, gfn, pfn); |
2375 | spin_lock(&vcpu->kvm->mmu_lock); | 2773 | spin_lock(&vcpu->kvm->mmu_lock); |
2376 | if (mmu_notifier_retry(vcpu, mmu_seq)) | 2774 | if (mmu_notifier_retry(vcpu, mmu_seq)) |
2377 | goto out_unlock; | 2775 | goto out_unlock; |
2378 | kvm_mmu_free_some_pages(vcpu); | 2776 | kvm_mmu_free_some_pages(vcpu); |
2379 | r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK, | 2777 | if (likely(!force_pt_level)) |
2380 | level, gfn, pfn); | 2778 | transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); |
2779 | r = __direct_map(vcpu, gpa, write, map_writable, | ||
2780 | level, gfn, pfn, prefault); | ||
2381 | spin_unlock(&vcpu->kvm->mmu_lock); | 2781 | spin_unlock(&vcpu->kvm->mmu_lock); |
2382 | 2782 | ||
2383 | return r; | 2783 | return r; |
@@ -2393,10 +2793,9 @@ static void nonpaging_free(struct kvm_vcpu *vcpu) | |||
2393 | mmu_free_roots(vcpu); | 2793 | mmu_free_roots(vcpu); |
2394 | } | 2794 | } |
2395 | 2795 | ||
2396 | static int nonpaging_init_context(struct kvm_vcpu *vcpu) | 2796 | static int nonpaging_init_context(struct kvm_vcpu *vcpu, |
2797 | struct kvm_mmu *context) | ||
2397 | { | 2798 | { |
2398 | struct kvm_mmu *context = &vcpu->arch.mmu; | ||
2399 | |||
2400 | context->new_cr3 = nonpaging_new_cr3; | 2799 | context->new_cr3 = nonpaging_new_cr3; |
2401 | context->page_fault = nonpaging_page_fault; | 2800 | context->page_fault = nonpaging_page_fault; |
2402 | context->gva_to_gpa = nonpaging_gva_to_gpa; | 2801 | context->gva_to_gpa = nonpaging_gva_to_gpa; |
@@ -2404,9 +2803,12 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu) | |||
2404 | context->prefetch_page = nonpaging_prefetch_page; | 2803 | context->prefetch_page = nonpaging_prefetch_page; |
2405 | context->sync_page = nonpaging_sync_page; | 2804 | context->sync_page = nonpaging_sync_page; |
2406 | context->invlpg = nonpaging_invlpg; | 2805 | context->invlpg = nonpaging_invlpg; |
2806 | context->update_pte = nonpaging_update_pte; | ||
2407 | context->root_level = 0; | 2807 | context->root_level = 0; |
2408 | context->shadow_root_level = PT32E_ROOT_LEVEL; | 2808 | context->shadow_root_level = PT32E_ROOT_LEVEL; |
2409 | context->root_hpa = INVALID_PAGE; | 2809 | context->root_hpa = INVALID_PAGE; |
2810 | context->direct_map = true; | ||
2811 | context->nx = false; | ||
2410 | return 0; | 2812 | return 0; |
2411 | } | 2813 | } |
2412 | 2814 | ||
@@ -2418,15 +2820,19 @@ void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) | |||
2418 | 2820 | ||
2419 | static void paging_new_cr3(struct kvm_vcpu *vcpu) | 2821 | static void paging_new_cr3(struct kvm_vcpu *vcpu) |
2420 | { | 2822 | { |
2421 | pgprintk("%s: cr3 %lx\n", __func__, vcpu->arch.cr3); | 2823 | pgprintk("%s: cr3 %lx\n", __func__, kvm_read_cr3(vcpu)); |
2422 | mmu_free_roots(vcpu); | 2824 | mmu_free_roots(vcpu); |
2423 | } | 2825 | } |
2424 | 2826 | ||
2827 | static unsigned long get_cr3(struct kvm_vcpu *vcpu) | ||
2828 | { | ||
2829 | return kvm_read_cr3(vcpu); | ||
2830 | } | ||
2831 | |||
2425 | static void inject_page_fault(struct kvm_vcpu *vcpu, | 2832 | static void inject_page_fault(struct kvm_vcpu *vcpu, |
2426 | u64 addr, | 2833 | struct x86_exception *fault) |
2427 | u32 err_code) | ||
2428 | { | 2834 | { |
2429 | kvm_inject_page_fault(vcpu, addr, err_code); | 2835 | vcpu->arch.mmu.inject_page_fault(vcpu, fault); |
2430 | } | 2836 | } |
2431 | 2837 | ||
2432 | static void paging_free(struct kvm_vcpu *vcpu) | 2838 | static void paging_free(struct kvm_vcpu *vcpu) |
@@ -2434,12 +2840,12 @@ static void paging_free(struct kvm_vcpu *vcpu) | |||
2434 | nonpaging_free(vcpu); | 2840 | nonpaging_free(vcpu); |
2435 | } | 2841 | } |
2436 | 2842 | ||
2437 | static bool is_rsvd_bits_set(struct kvm_vcpu *vcpu, u64 gpte, int level) | 2843 | static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level) |
2438 | { | 2844 | { |
2439 | int bit7; | 2845 | int bit7; |
2440 | 2846 | ||
2441 | bit7 = (gpte >> 7) & 1; | 2847 | bit7 = (gpte >> 7) & 1; |
2442 | return (gpte & vcpu->arch.mmu.rsvd_bits_mask[bit7][level-1]) != 0; | 2848 | return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0; |
2443 | } | 2849 | } |
2444 | 2850 | ||
2445 | #define PTTYPE 64 | 2851 | #define PTTYPE 64 |
@@ -2450,13 +2856,14 @@ static bool is_rsvd_bits_set(struct kvm_vcpu *vcpu, u64 gpte, int level) | |||
2450 | #include "paging_tmpl.h" | 2856 | #include "paging_tmpl.h" |
2451 | #undef PTTYPE | 2857 | #undef PTTYPE |
2452 | 2858 | ||
2453 | static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level) | 2859 | static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, |
2860 | struct kvm_mmu *context, | ||
2861 | int level) | ||
2454 | { | 2862 | { |
2455 | struct kvm_mmu *context = &vcpu->arch.mmu; | ||
2456 | int maxphyaddr = cpuid_maxphyaddr(vcpu); | 2863 | int maxphyaddr = cpuid_maxphyaddr(vcpu); |
2457 | u64 exb_bit_rsvd = 0; | 2864 | u64 exb_bit_rsvd = 0; |
2458 | 2865 | ||
2459 | if (!is_nx(vcpu)) | 2866 | if (!context->nx) |
2460 | exb_bit_rsvd = rsvd_bits(63, 63); | 2867 | exb_bit_rsvd = rsvd_bits(63, 63); |
2461 | switch (level) { | 2868 | switch (level) { |
2462 | case PT32_ROOT_LEVEL: | 2869 | case PT32_ROOT_LEVEL: |
@@ -2511,9 +2918,13 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level) | |||
2511 | } | 2918 | } |
2512 | } | 2919 | } |
2513 | 2920 | ||
2514 | static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) | 2921 | static int paging64_init_context_common(struct kvm_vcpu *vcpu, |
2922 | struct kvm_mmu *context, | ||
2923 | int level) | ||
2515 | { | 2924 | { |
2516 | struct kvm_mmu *context = &vcpu->arch.mmu; | 2925 | context->nx = is_nx(vcpu); |
2926 | |||
2927 | reset_rsvds_bits_mask(vcpu, context, level); | ||
2517 | 2928 | ||
2518 | ASSERT(is_pae(vcpu)); | 2929 | ASSERT(is_pae(vcpu)); |
2519 | context->new_cr3 = paging_new_cr3; | 2930 | context->new_cr3 = paging_new_cr3; |
@@ -2522,24 +2933,28 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) | |||
2522 | context->prefetch_page = paging64_prefetch_page; | 2933 | context->prefetch_page = paging64_prefetch_page; |
2523 | context->sync_page = paging64_sync_page; | 2934 | context->sync_page = paging64_sync_page; |
2524 | context->invlpg = paging64_invlpg; | 2935 | context->invlpg = paging64_invlpg; |
2936 | context->update_pte = paging64_update_pte; | ||
2525 | context->free = paging_free; | 2937 | context->free = paging_free; |
2526 | context->root_level = level; | 2938 | context->root_level = level; |
2527 | context->shadow_root_level = level; | 2939 | context->shadow_root_level = level; |
2528 | context->root_hpa = INVALID_PAGE; | 2940 | context->root_hpa = INVALID_PAGE; |
2941 | context->direct_map = false; | ||
2529 | return 0; | 2942 | return 0; |
2530 | } | 2943 | } |
2531 | 2944 | ||
2532 | static int paging64_init_context(struct kvm_vcpu *vcpu) | 2945 | static int paging64_init_context(struct kvm_vcpu *vcpu, |
2946 | struct kvm_mmu *context) | ||
2533 | { | 2947 | { |
2534 | reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL); | 2948 | return paging64_init_context_common(vcpu, context, PT64_ROOT_LEVEL); |
2535 | return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL); | ||
2536 | } | 2949 | } |
2537 | 2950 | ||
2538 | static int paging32_init_context(struct kvm_vcpu *vcpu) | 2951 | static int paging32_init_context(struct kvm_vcpu *vcpu, |
2952 | struct kvm_mmu *context) | ||
2539 | { | 2953 | { |
2540 | struct kvm_mmu *context = &vcpu->arch.mmu; | 2954 | context->nx = false; |
2955 | |||
2956 | reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL); | ||
2541 | 2957 | ||
2542 | reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL); | ||
2543 | context->new_cr3 = paging_new_cr3; | 2958 | context->new_cr3 = paging_new_cr3; |
2544 | context->page_fault = paging32_page_fault; | 2959 | context->page_fault = paging32_page_fault; |
2545 | context->gva_to_gpa = paging32_gva_to_gpa; | 2960 | context->gva_to_gpa = paging32_gva_to_gpa; |
@@ -2547,44 +2962,57 @@ static int paging32_init_context(struct kvm_vcpu *vcpu) | |||
2547 | context->prefetch_page = paging32_prefetch_page; | 2962 | context->prefetch_page = paging32_prefetch_page; |
2548 | context->sync_page = paging32_sync_page; | 2963 | context->sync_page = paging32_sync_page; |
2549 | context->invlpg = paging32_invlpg; | 2964 | context->invlpg = paging32_invlpg; |
2965 | context->update_pte = paging32_update_pte; | ||
2550 | context->root_level = PT32_ROOT_LEVEL; | 2966 | context->root_level = PT32_ROOT_LEVEL; |
2551 | context->shadow_root_level = PT32E_ROOT_LEVEL; | 2967 | context->shadow_root_level = PT32E_ROOT_LEVEL; |
2552 | context->root_hpa = INVALID_PAGE; | 2968 | context->root_hpa = INVALID_PAGE; |
2969 | context->direct_map = false; | ||
2553 | return 0; | 2970 | return 0; |
2554 | } | 2971 | } |
2555 | 2972 | ||
2556 | static int paging32E_init_context(struct kvm_vcpu *vcpu) | 2973 | static int paging32E_init_context(struct kvm_vcpu *vcpu, |
2974 | struct kvm_mmu *context) | ||
2557 | { | 2975 | { |
2558 | reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL); | 2976 | return paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL); |
2559 | return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL); | ||
2560 | } | 2977 | } |
2561 | 2978 | ||
2562 | static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) | 2979 | static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) |
2563 | { | 2980 | { |
2564 | struct kvm_mmu *context = &vcpu->arch.mmu; | 2981 | struct kvm_mmu *context = vcpu->arch.walk_mmu; |
2565 | 2982 | ||
2983 | context->base_role.word = 0; | ||
2566 | context->new_cr3 = nonpaging_new_cr3; | 2984 | context->new_cr3 = nonpaging_new_cr3; |
2567 | context->page_fault = tdp_page_fault; | 2985 | context->page_fault = tdp_page_fault; |
2568 | context->free = nonpaging_free; | 2986 | context->free = nonpaging_free; |
2569 | context->prefetch_page = nonpaging_prefetch_page; | 2987 | context->prefetch_page = nonpaging_prefetch_page; |
2570 | context->sync_page = nonpaging_sync_page; | 2988 | context->sync_page = nonpaging_sync_page; |
2571 | context->invlpg = nonpaging_invlpg; | 2989 | context->invlpg = nonpaging_invlpg; |
2990 | context->update_pte = nonpaging_update_pte; | ||
2572 | context->shadow_root_level = kvm_x86_ops->get_tdp_level(); | 2991 | context->shadow_root_level = kvm_x86_ops->get_tdp_level(); |
2573 | context->root_hpa = INVALID_PAGE; | 2992 | context->root_hpa = INVALID_PAGE; |
2993 | context->direct_map = true; | ||
2994 | context->set_cr3 = kvm_x86_ops->set_tdp_cr3; | ||
2995 | context->get_cr3 = get_cr3; | ||
2996 | context->inject_page_fault = kvm_inject_page_fault; | ||
2997 | context->nx = is_nx(vcpu); | ||
2574 | 2998 | ||
2575 | if (!is_paging(vcpu)) { | 2999 | if (!is_paging(vcpu)) { |
3000 | context->nx = false; | ||
2576 | context->gva_to_gpa = nonpaging_gva_to_gpa; | 3001 | context->gva_to_gpa = nonpaging_gva_to_gpa; |
2577 | context->root_level = 0; | 3002 | context->root_level = 0; |
2578 | } else if (is_long_mode(vcpu)) { | 3003 | } else if (is_long_mode(vcpu)) { |
2579 | reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL); | 3004 | context->nx = is_nx(vcpu); |
3005 | reset_rsvds_bits_mask(vcpu, context, PT64_ROOT_LEVEL); | ||
2580 | context->gva_to_gpa = paging64_gva_to_gpa; | 3006 | context->gva_to_gpa = paging64_gva_to_gpa; |
2581 | context->root_level = PT64_ROOT_LEVEL; | 3007 | context->root_level = PT64_ROOT_LEVEL; |
2582 | } else if (is_pae(vcpu)) { | 3008 | } else if (is_pae(vcpu)) { |
2583 | reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL); | 3009 | context->nx = is_nx(vcpu); |
3010 | reset_rsvds_bits_mask(vcpu, context, PT32E_ROOT_LEVEL); | ||
2584 | context->gva_to_gpa = paging64_gva_to_gpa; | 3011 | context->gva_to_gpa = paging64_gva_to_gpa; |
2585 | context->root_level = PT32E_ROOT_LEVEL; | 3012 | context->root_level = PT32E_ROOT_LEVEL; |
2586 | } else { | 3013 | } else { |
2587 | reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL); | 3014 | context->nx = false; |
3015 | reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL); | ||
2588 | context->gva_to_gpa = paging32_gva_to_gpa; | 3016 | context->gva_to_gpa = paging32_gva_to_gpa; |
2589 | context->root_level = PT32_ROOT_LEVEL; | 3017 | context->root_level = PT32_ROOT_LEVEL; |
2590 | } | 3018 | } |
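With TDP the hardware resolves the guest's own page tables, but KVM still needs a software walker (gva_to_gpa) for emulation, so init_kvm_tdp_mmu() now derives the walker, root_level and nx handling from the guest's current paging mode instead of leaving them implicit. The mapping the hunk establishes is essentially a four-way switch; a condensed sketch with stand-in types (the real code also calls reset_rsvds_bits_mask() in each paging branch):

#include <stdbool.h>
#include <stdio.h>

enum { ROOT_NONE = 0, PT32_ROOT_LEVEL = 2, PT32E_ROOT_LEVEL = 3,
       PT64_ROOT_LEVEL = 4 };

struct guest_mode { bool paging, long_mode, pae, nx; };

struct walk_setup { int root_level; bool nx; const char *gva_to_gpa; };

static struct walk_setup tdp_walk_setup(struct guest_mode g)
{
    if (!g.paging)
        return (struct walk_setup){ ROOT_NONE, false, "nonpaging_gva_to_gpa" };
    if (g.long_mode)
        return (struct walk_setup){ PT64_ROOT_LEVEL, g.nx, "paging64_gva_to_gpa" };
    if (g.pae)
        return (struct walk_setup){ PT32E_ROOT_LEVEL, g.nx, "paging64_gva_to_gpa" };
    return (struct walk_setup){ PT32_ROOT_LEVEL, false, "paging32_gva_to_gpa" };
}

int main(void)
{
    struct guest_mode g = { .paging = true, .pae = true, .nx = true };
    struct walk_setup w = tdp_walk_setup(g);

    printf("root_level %d nx %d walker %s\n", w.root_level, w.nx, w.gva_to_gpa);
    return 0;
}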
@@ -2592,33 +3020,81 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) | |||
2592 | return 0; | 3020 | return 0; |
2593 | } | 3021 | } |
2594 | 3022 | ||
2595 | static int init_kvm_softmmu(struct kvm_vcpu *vcpu) | 3023 | int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context) |
2596 | { | 3024 | { |
2597 | int r; | 3025 | int r; |
2598 | |||
2599 | ASSERT(vcpu); | 3026 | ASSERT(vcpu); |
2600 | ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); | 3027 | ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); |
2601 | 3028 | ||
2602 | if (!is_paging(vcpu)) | 3029 | if (!is_paging(vcpu)) |
2603 | r = nonpaging_init_context(vcpu); | 3030 | r = nonpaging_init_context(vcpu, context); |
2604 | else if (is_long_mode(vcpu)) | 3031 | else if (is_long_mode(vcpu)) |
2605 | r = paging64_init_context(vcpu); | 3032 | r = paging64_init_context(vcpu, context); |
2606 | else if (is_pae(vcpu)) | 3033 | else if (is_pae(vcpu)) |
2607 | r = paging32E_init_context(vcpu); | 3034 | r = paging32E_init_context(vcpu, context); |
2608 | else | 3035 | else |
2609 | r = paging32_init_context(vcpu); | 3036 | r = paging32_init_context(vcpu, context); |
2610 | 3037 | ||
2611 | vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu); | 3038 | vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu); |
2612 | vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu); | 3039 | vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu); |
2613 | 3040 | ||
2614 | return r; | 3041 | return r; |
2615 | } | 3042 | } |
3043 | EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); | ||
2616 | 3044 | ||
2617 | static int init_kvm_mmu(struct kvm_vcpu *vcpu) | 3045 | static int init_kvm_softmmu(struct kvm_vcpu *vcpu) |
2618 | { | 3046 | { |
2619 | vcpu->arch.update_pte.pfn = bad_pfn; | 3047 | int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu); |
2620 | 3048 | ||
2621 | if (tdp_enabled) | 3049 | vcpu->arch.walk_mmu->set_cr3 = kvm_x86_ops->set_cr3; |
3050 | vcpu->arch.walk_mmu->get_cr3 = get_cr3; | ||
3051 | vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; | ||
3052 | |||
3053 | return r; | ||
3054 | } | ||
3055 | |||
3056 | static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu) | ||
3057 | { | ||
3058 | struct kvm_mmu *g_context = &vcpu->arch.nested_mmu; | ||
3059 | |||
3060 | g_context->get_cr3 = get_cr3; | ||
3061 | g_context->inject_page_fault = kvm_inject_page_fault; | ||
3062 | |||
3063 | /* | ||
3064 | * Note that arch.mmu.gva_to_gpa translates l2_gva to l1_gpa. The | ||
3065 | * translation of l2_gpa to l1_gpa addresses is done using the | ||
3066 | * arch.nested_mmu.gva_to_gpa function. Basically the gva_to_gpa | ||
3067 | * functions between mmu and nested_mmu are swapped. | ||
3068 | */ | ||
3069 | if (!is_paging(vcpu)) { | ||
3070 | g_context->nx = false; | ||
3071 | g_context->root_level = 0; | ||
3072 | g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested; | ||
3073 | } else if (is_long_mode(vcpu)) { | ||
3074 | g_context->nx = is_nx(vcpu); | ||
3075 | reset_rsvds_bits_mask(vcpu, g_context, PT64_ROOT_LEVEL); | ||
3076 | g_context->root_level = PT64_ROOT_LEVEL; | ||
3077 | g_context->gva_to_gpa = paging64_gva_to_gpa_nested; | ||
3078 | } else if (is_pae(vcpu)) { | ||
3079 | g_context->nx = is_nx(vcpu); | ||
3080 | reset_rsvds_bits_mask(vcpu, g_context, PT32E_ROOT_LEVEL); | ||
3081 | g_context->root_level = PT32E_ROOT_LEVEL; | ||
3082 | g_context->gva_to_gpa = paging64_gva_to_gpa_nested; | ||
3083 | } else { | ||
3084 | g_context->nx = false; | ||
3085 | reset_rsvds_bits_mask(vcpu, g_context, PT32_ROOT_LEVEL); | ||
3086 | g_context->root_level = PT32_ROOT_LEVEL; | ||
3087 | g_context->gva_to_gpa = paging32_gva_to_gpa_nested; | ||
3088 | } | ||
3089 | |||
3090 | return 0; | ||
3091 | } | ||
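init_kvm_nested_mmu() configures the walker used when the guest itself runs a nested guest: an L2 virtual address is first resolved through L2's own page tables, and every guest-physical address that walk produces is then pushed through translate_gpa() to obtain an L1 guest-physical address, which is why the *_gva_to_gpa_nested variants end in a translate_gpa() call. A toy sketch of the composition, with made-up translation functions standing in for the real page-table walks:

#include <stdint.h>
#include <stdio.h>

typedef uint64_t gva_t;
typedef uint64_t gpa_t;

/* Hypothetical stand-in: walk the L2 guest's own page tables ... */
static gpa_t l2_walk(gva_t l2_gva)          { return l2_gva ^ 0x1000; }
/* ... and then the L1 guest's tables (translate_gpa in the patch). */
static gpa_t l2_gpa_to_l1_gpa(gpa_t l2_gpa) { return l2_gpa + 0x40000000; }

/*
 * Composition set up by init_kvm_nested_mmu(): resolve the L2 virtual
 * address through the L2 page tables, then push the resulting guest-
 * physical address through the L1 translation.
 */
static gpa_t nested_gva_to_l1_gpa(gva_t l2_gva)
{
    gpa_t l2_gpa = l2_walk(l2_gva);
    return l2_gpa_to_l1_gpa(l2_gpa);
}

int main(void)
{
    printf("l1_gpa = %#llx\n",
           (unsigned long long)nested_gva_to_l1_gpa(0x7000));
    return 0;
}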
3092 | |||
3093 | static int init_kvm_mmu(struct kvm_vcpu *vcpu) | ||
3094 | { | ||
3095 | if (mmu_is_nested(vcpu)) | ||
3096 | return init_kvm_nested_mmu(vcpu); | ||
3097 | else if (tdp_enabled) | ||
2622 | return init_kvm_tdp_mmu(vcpu); | 3098 | return init_kvm_tdp_mmu(vcpu); |
2623 | else | 3099 | else |
2624 | return init_kvm_softmmu(vcpu); | 3100 | return init_kvm_softmmu(vcpu); |
@@ -2653,7 +3129,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) | |||
2653 | if (r) | 3129 | if (r) |
2654 | goto out; | 3130 | goto out; |
2655 | /* set_cr3() should ensure TLB has been flushed */ | 3131 | /* set_cr3() should ensure TLB has been flushed */ |
2656 | kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa); | 3132 | vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa); |
2657 | out: | 3133 | out: |
2658 | return r; | 3134 | return r; |
2659 | } | 3135 | } |
@@ -2663,6 +3139,7 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu) | |||
2663 | { | 3139 | { |
2664 | mmu_free_roots(vcpu); | 3140 | mmu_free_roots(vcpu); |
2665 | } | 3141 | } |
3142 | EXPORT_SYMBOL_GPL(kvm_mmu_unload); | ||
2666 | 3143 | ||
2667 | static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, | 3144 | static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, |
2668 | struct kvm_mmu_page *sp, | 3145 | struct kvm_mmu_page *sp, |
@@ -2686,8 +3163,7 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, | |||
2686 | } | 3163 | } |
2687 | 3164 | ||
2688 | static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, | 3165 | static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, |
2689 | struct kvm_mmu_page *sp, | 3166 | struct kvm_mmu_page *sp, u64 *spte, |
2690 | u64 *spte, | ||
2691 | const void *new) | 3167 | const void *new) |
2692 | { | 3168 | { |
2693 | if (sp->role.level != PT_PAGE_TABLE_LEVEL) { | 3169 | if (sp->role.level != PT_PAGE_TABLE_LEVEL) { |
@@ -2695,14 +3171,8 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, | |||
2695 | return; | 3171 | return; |
2696 | } | 3172 | } |
2697 | 3173 | ||
2698 | if (is_rsvd_bits_set(vcpu, *(u64 *)new, PT_PAGE_TABLE_LEVEL)) | ||
2699 | return; | ||
2700 | |||
2701 | ++vcpu->kvm->stat.mmu_pte_updated; | 3174 | ++vcpu->kvm->stat.mmu_pte_updated; |
2702 | if (!sp->role.cr4_pae) | 3175 | vcpu->arch.mmu.update_pte(vcpu, sp, spte, new); |
2703 | paging32_update_pte(vcpu, sp, spte, new); | ||
2704 | else | ||
2705 | paging64_update_pte(vcpu, sp, spte, new); | ||
2706 | } | 3176 | } |
2707 | 3177 | ||
2708 | static bool need_remote_flush(u64 old, u64 new) | 3178 | static bool need_remote_flush(u64 old, u64 new) |
@@ -2737,28 +3207,6 @@ static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu) | |||
2737 | return !!(spte && (*spte & shadow_accessed_mask)); | 3207 | return !!(spte && (*spte & shadow_accessed_mask)); |
2738 | } | 3208 | } |
2739 | 3209 | ||
2740 | static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, | ||
2741 | u64 gpte) | ||
2742 | { | ||
2743 | gfn_t gfn; | ||
2744 | pfn_t pfn; | ||
2745 | |||
2746 | if (!is_present_gpte(gpte)) | ||
2747 | return; | ||
2748 | gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; | ||
2749 | |||
2750 | vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq; | ||
2751 | smp_rmb(); | ||
2752 | pfn = gfn_to_pfn(vcpu->kvm, gfn); | ||
2753 | |||
2754 | if (is_error_pfn(pfn)) { | ||
2755 | kvm_release_pfn_clean(pfn); | ||
2756 | return; | ||
2757 | } | ||
2758 | vcpu->arch.update_pte.gfn = gfn; | ||
2759 | vcpu->arch.update_pte.pfn = pfn; | ||
2760 | } | ||
2761 | |||
2762 | static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn) | 3210 | static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn) |
2763 | { | 3211 | { |
2764 | u64 *spte = vcpu->arch.last_pte_updated; | 3212 | u64 *spte = vcpu->arch.last_pte_updated; |
@@ -2780,21 +3228,13 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, | |||
2780 | struct kvm_mmu_page *sp; | 3228 | struct kvm_mmu_page *sp; |
2781 | struct hlist_node *node; | 3229 | struct hlist_node *node; |
2782 | LIST_HEAD(invalid_list); | 3230 | LIST_HEAD(invalid_list); |
2783 | u64 entry, gentry; | 3231 | u64 entry, gentry, *spte; |
2784 | u64 *spte; | 3232 | unsigned pte_size, page_offset, misaligned, quadrant, offset; |
2785 | unsigned offset = offset_in_page(gpa); | 3233 | int level, npte, invlpg_counter, r, flooded = 0; |
2786 | unsigned pte_size; | ||
2787 | unsigned page_offset; | ||
2788 | unsigned misaligned; | ||
2789 | unsigned quadrant; | ||
2790 | int level; | ||
2791 | int flooded = 0; | ||
2792 | int npte; | ||
2793 | int r; | ||
2794 | int invlpg_counter; | ||
2795 | bool remote_flush, local_flush, zap_page; | 3234 | bool remote_flush, local_flush, zap_page; |
2796 | 3235 | ||
2797 | zap_page = remote_flush = local_flush = false; | 3236 | zap_page = remote_flush = local_flush = false; |
3237 | offset = offset_in_page(gpa); | ||
2798 | 3238 | ||
2799 | pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); | 3239 | pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); |
2800 | 3240 | ||
@@ -2802,9 +3242,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, | |||
2802 | 3242 | ||
2803 | /* | 3243 | /* |
2804 | * Assume that the pte write on a page table of the same type | 3244 | * Assume that the pte write on a page table of the same type |
2805 | * as the current vcpu paging mode. This is nearly always true | 3245 | * as the current vcpu paging mode since we update the sptes only |
2806 | * (might be false while changing modes). Note it is verified later | 3246 | * when they have the same mode. |
2807 | * by update_pte(). | ||
2808 | */ | 3247 | */ |
2809 | if ((is_pae(vcpu) && bytes == 4) || !new) { | 3248 | if ((is_pae(vcpu) && bytes == 4) || !new) { |
2810 | /* Handle a 32-bit guest writing two halves of a 64-bit gpte */ | 3249 | /* Handle a 32-bit guest writing two halves of a 64-bit gpte */ |
@@ -2830,15 +3269,14 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, | |||
2830 | break; | 3269 | break; |
2831 | } | 3270 | } |
2832 | 3271 | ||
2833 | mmu_guess_page_from_pte_write(vcpu, gpa, gentry); | ||
2834 | spin_lock(&vcpu->kvm->mmu_lock); | 3272 | spin_lock(&vcpu->kvm->mmu_lock); |
2835 | if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter) | 3273 | if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter) |
2836 | gentry = 0; | 3274 | gentry = 0; |
2837 | kvm_mmu_access_page(vcpu, gfn); | ||
2838 | kvm_mmu_free_some_pages(vcpu); | 3275 | kvm_mmu_free_some_pages(vcpu); |
2839 | ++vcpu->kvm->stat.mmu_pte_write; | 3276 | ++vcpu->kvm->stat.mmu_pte_write; |
2840 | kvm_mmu_audit(vcpu, "pre pte write"); | 3277 | trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE); |
2841 | if (guest_initiated) { | 3278 | if (guest_initiated) { |
3279 | kvm_mmu_access_page(vcpu, gfn); | ||
2842 | if (gfn == vcpu->arch.last_pt_write_gfn | 3280 | if (gfn == vcpu->arch.last_pt_write_gfn |
2843 | && !last_updated_pte_accessed(vcpu)) { | 3281 | && !last_updated_pte_accessed(vcpu)) { |
2844 | ++vcpu->arch.last_pt_write_count; | 3282 | ++vcpu->arch.last_pt_write_count; |
@@ -2910,12 +3348,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, | |||
2910 | } | 3348 | } |
2911 | mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush); | 3349 | mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush); |
2912 | kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); | 3350 | kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); |
2913 | kvm_mmu_audit(vcpu, "post pte write"); | 3351 | trace_kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE); |
2914 | spin_unlock(&vcpu->kvm->mmu_lock); | 3352 | spin_unlock(&vcpu->kvm->mmu_lock); |
2915 | if (!is_error_pfn(vcpu->arch.update_pte.pfn)) { | ||
2916 | kvm_release_pfn_clean(vcpu->arch.update_pte.pfn); | ||
2917 | vcpu->arch.update_pte.pfn = bad_pfn; | ||
2918 | } | ||
2919 | } | 3353 | } |
2920 | 3354 | ||
2921 | int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) | 3355 | int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) |
@@ -2923,7 +3357,7 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) | |||
2923 | gpa_t gpa; | 3357 | gpa_t gpa; |
2924 | int r; | 3358 | int r; |
2925 | 3359 | ||
2926 | if (tdp_enabled) | 3360 | if (vcpu->arch.mmu.direct_map) |
2927 | return 0; | 3361 | return 0; |
2928 | 3362 | ||
2929 | gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); | 3363 | gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); |
@@ -2937,29 +3371,27 @@ EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt); | |||
2937 | 3371 | ||
2938 | void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) | 3372 | void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) |
2939 | { | 3373 | { |
2940 | int free_pages; | ||
2941 | LIST_HEAD(invalid_list); | 3374 | LIST_HEAD(invalid_list); |
2942 | 3375 | ||
2943 | free_pages = vcpu->kvm->arch.n_free_mmu_pages; | 3376 | while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES && |
2944 | while (free_pages < KVM_REFILL_PAGES && | ||
2945 | !list_empty(&vcpu->kvm->arch.active_mmu_pages)) { | 3377 | !list_empty(&vcpu->kvm->arch.active_mmu_pages)) { |
2946 | struct kvm_mmu_page *sp; | 3378 | struct kvm_mmu_page *sp; |
2947 | 3379 | ||
2948 | sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev, | 3380 | sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev, |
2949 | struct kvm_mmu_page, link); | 3381 | struct kvm_mmu_page, link); |
2950 | free_pages += kvm_mmu_prepare_zap_page(vcpu->kvm, sp, | 3382 | kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); |
2951 | &invalid_list); | 3383 | kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); |
2952 | ++vcpu->kvm->stat.mmu_recycled; | 3384 | ++vcpu->kvm->stat.mmu_recycled; |
2953 | } | 3385 | } |
2954 | kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); | ||
2955 | } | 3386 | } |
2956 | 3387 | ||
2957 | int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) | 3388 | int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code, |
3389 | void *insn, int insn_len) | ||
2958 | { | 3390 | { |
2959 | int r; | 3391 | int r; |
2960 | enum emulation_result er; | 3392 | enum emulation_result er; |
2961 | 3393 | ||
2962 | r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code); | 3394 | r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false); |
2963 | if (r < 0) | 3395 | if (r < 0) |
2964 | goto out; | 3396 | goto out; |
2965 | 3397 | ||
@@ -2972,7 +3404,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) | |||
2972 | if (r) | 3404 | if (r) |
2973 | goto out; | 3405 | goto out; |
2974 | 3406 | ||
2975 | er = emulate_instruction(vcpu, cr2, error_code, 0); | 3407 | er = x86_emulate_instruction(vcpu, cr2, 0, insn, insn_len); |
2976 | 3408 | ||
2977 | switch (er) { | 3409 | switch (er) { |
2978 | case EMULATE_DONE: | 3410 | case EMULATE_DONE: |
@@ -3013,6 +3445,8 @@ EXPORT_SYMBOL_GPL(kvm_disable_tdp); | |||
3013 | static void free_mmu_pages(struct kvm_vcpu *vcpu) | 3445 | static void free_mmu_pages(struct kvm_vcpu *vcpu) |
3014 | { | 3446 | { |
3015 | free_page((unsigned long)vcpu->arch.mmu.pae_root); | 3447 | free_page((unsigned long)vcpu->arch.mmu.pae_root); |
3448 | if (vcpu->arch.mmu.lm_root != NULL) | ||
3449 | free_page((unsigned long)vcpu->arch.mmu.lm_root); | ||
3016 | } | 3450 | } |
3017 | 3451 | ||
3018 | static int alloc_mmu_pages(struct kvm_vcpu *vcpu) | 3452 | static int alloc_mmu_pages(struct kvm_vcpu *vcpu) |
@@ -3054,15 +3488,6 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu) | |||
3054 | return init_kvm_mmu(vcpu); | 3488 | return init_kvm_mmu(vcpu); |
3055 | } | 3489 | } |
3056 | 3490 | ||
3057 | void kvm_mmu_destroy(struct kvm_vcpu *vcpu) | ||
3058 | { | ||
3059 | ASSERT(vcpu); | ||
3060 | |||
3061 | destroy_kvm_mmu(vcpu); | ||
3062 | free_mmu_pages(vcpu); | ||
3063 | mmu_free_memory_caches(vcpu); | ||
3064 | } | ||
3065 | |||
3066 | void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) | 3491 | void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) |
3067 | { | 3492 | { |
3068 | struct kvm_mmu_page *sp; | 3493 | struct kvm_mmu_page *sp; |
@@ -3075,10 +3500,22 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) | |||
3075 | continue; | 3500 | continue; |
3076 | 3501 | ||
3077 | pt = sp->spt; | 3502 | pt = sp->spt; |
3078 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) | 3503 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { |
3504 | if (!is_shadow_present_pte(pt[i]) || | ||
3505 | !is_last_spte(pt[i], sp->role.level)) | ||
3506 | continue; | ||
3507 | |||
3508 | if (is_large_pte(pt[i])) { | ||
3509 | drop_spte(kvm, &pt[i], | ||
3510 | shadow_trap_nonpresent_pte); | ||
3511 | --kvm->stat.lpages; | ||
3512 | continue; | ||
3513 | } | ||
3514 | |||
3079 | /* avoid RMW */ | 3515 | /* avoid RMW */ |
3080 | if (is_writable_pte(pt[i])) | 3516 | if (is_writable_pte(pt[i])) |
3081 | pt[i] &= ~PT_WRITABLE_MASK; | 3517 | update_spte(&pt[i], pt[i] & ~PT_WRITABLE_MASK); |
3518 | } | ||
3082 | } | 3519 | } |
3083 | kvm_flush_remote_tlbs(kvm); | 3520 | kvm_flush_remote_tlbs(kvm); |
3084 | } | 3521 | } |
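kvm_mmu_slot_remove_write_access() used to clear the writable bit of every entry unconditionally; the loop now skips non-present and non-leaf entries, drops large sptes instead of merely write-protecting them (which lines up with the new mapping_level_dirty_bitmap() check that forces 4K mappings on slots with dirty logging), and clears the bit through update_spte() on ordinary leaf sptes. A simplified model of the per-entry decision, with toy bit definitions in place of the kernel's:

#include <stdint.h>
#include <stdio.h>

#define SPTE_PRESENT  (1ull << 0)
#define SPTE_WRITABLE (1ull << 1)
#define SPTE_LARGE    (1ull << 7)
#define ENTRIES       8                /* PT64_ENT_PER_PAGE is really 512 */

static void write_protect_leaf_page(uint64_t *pt)
{
    for (int i = 0; i < ENTRIES; i++) {
        /*
         * The kernel additionally skips entries that are not last-level
         * sptes (is_last_spte()); this toy table only holds leaves.
         */
        if (!(pt[i] & SPTE_PRESENT))
            continue;

        if (pt[i] & SPTE_LARGE) {
            /* Large mapping: dropped rather than write-protected. */
            pt[i] = 0;
            continue;
        }

        /* Clear only the writable bit; done via update_spte() upstream. */
        if (pt[i] & SPTE_WRITABLE)
            pt[i] &= ~SPTE_WRITABLE;
    }
}

int main(void)
{
    uint64_t pt[ENTRIES] = {
        SPTE_PRESENT | SPTE_WRITABLE,
        SPTE_PRESENT | SPTE_WRITABLE | SPTE_LARGE,
        0,
    };

    write_protect_leaf_page(pt);
    for (int i = 0; i < 3; i++)
        printf("pt[%d] = %#llx\n", i, (unsigned long long)pt[i]);
    return 0;
}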
@@ -3108,27 +3545,27 @@ static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm, | |||
3108 | return kvm_mmu_prepare_zap_page(kvm, page, invalid_list); | 3545 | return kvm_mmu_prepare_zap_page(kvm, page, invalid_list); |
3109 | } | 3546 | } |
3110 | 3547 | ||
3111 | static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) | 3548 | static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc) |
3112 | { | 3549 | { |
3113 | struct kvm *kvm; | 3550 | struct kvm *kvm; |
3114 | struct kvm *kvm_freed = NULL; | 3551 | struct kvm *kvm_freed = NULL; |
3115 | int cache_count = 0; | 3552 | int nr_to_scan = sc->nr_to_scan; |
3553 | |||
3554 | if (nr_to_scan == 0) | ||
3555 | goto out; | ||
3116 | 3556 | ||
3117 | spin_lock(&kvm_lock); | 3557 | raw_spin_lock(&kvm_lock); |
3118 | 3558 | ||
3119 | list_for_each_entry(kvm, &vm_list, vm_list) { | 3559 | list_for_each_entry(kvm, &vm_list, vm_list) { |
3120 | int npages, idx, freed_pages; | 3560 | int idx, freed_pages; |
3121 | LIST_HEAD(invalid_list); | 3561 | LIST_HEAD(invalid_list); |
3122 | 3562 | ||
3123 | idx = srcu_read_lock(&kvm->srcu); | 3563 | idx = srcu_read_lock(&kvm->srcu); |
3124 | spin_lock(&kvm->mmu_lock); | 3564 | spin_lock(&kvm->mmu_lock); |
3125 | npages = kvm->arch.n_alloc_mmu_pages - | 3565 | if (!kvm_freed && nr_to_scan > 0 && |
3126 | kvm->arch.n_free_mmu_pages; | 3566 | kvm->arch.n_used_mmu_pages > 0) { |
3127 | cache_count += npages; | ||
3128 | if (!kvm_freed && nr_to_scan > 0 && npages > 0) { | ||
3129 | freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm, | 3567 | freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm, |
3130 | &invalid_list); | 3568 | &invalid_list); |
3131 | cache_count -= freed_pages; | ||
3132 | kvm_freed = kvm; | 3569 | kvm_freed = kvm; |
3133 | } | 3570 | } |
3134 | nr_to_scan--; | 3571 | nr_to_scan--; |
@@ -3140,9 +3577,10 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) | |||
3140 | if (kvm_freed) | 3577 | if (kvm_freed) |
3141 | list_move_tail(&kvm_freed->vm_list, &vm_list); | 3578 | list_move_tail(&kvm_freed->vm_list, &vm_list); |
3142 | 3579 | ||
3143 | spin_unlock(&kvm_lock); | 3580 | raw_spin_unlock(&kvm_lock); |
3144 | 3581 | ||
3145 | return cache_count; | 3582 | out: |
3583 | return percpu_counter_read_positive(&kvm_total_used_mmu_pages); | ||
3146 | } | 3584 | } |
3147 | 3585 | ||
3148 | static struct shrinker mmu_shrinker = { | 3586 | static struct shrinker mmu_shrinker = { |
@@ -3160,12 +3598,6 @@ static void mmu_destroy_caches(void) | |||
3160 | kmem_cache_destroy(mmu_page_header_cache); | 3598 | kmem_cache_destroy(mmu_page_header_cache); |
3161 | } | 3599 | } |
3162 | 3600 | ||
3163 | void kvm_mmu_module_exit(void) | ||
3164 | { | ||
3165 | mmu_destroy_caches(); | ||
3166 | unregister_shrinker(&mmu_shrinker); | ||
3167 | } | ||
3168 | |||
3169 | int kvm_mmu_module_init(void) | 3601 | int kvm_mmu_module_init(void) |
3170 | { | 3602 | { |
3171 | pte_chain_cache = kmem_cache_create("kvm_pte_chain", | 3603 | pte_chain_cache = kmem_cache_create("kvm_pte_chain", |
@@ -3185,6 +3617,9 @@ int kvm_mmu_module_init(void) | |||
3185 | if (!mmu_page_header_cache) | 3617 | if (!mmu_page_header_cache) |
3186 | goto nomem; | 3618 | goto nomem; |
3187 | 3619 | ||
3620 | if (percpu_counter_init(&kvm_total_used_mmu_pages, 0)) | ||
3621 | goto nomem; | ||
3622 | |||
3188 | register_shrinker(&mmu_shrinker); | 3623 | register_shrinker(&mmu_shrinker); |
3189 | 3624 | ||
3190 | return 0; | 3625 | return 0; |
@@ -3259,7 +3694,7 @@ static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu, | |||
3259 | 3694 | ||
3260 | static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu) | 3695 | static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu) |
3261 | { | 3696 | { |
3262 | (void)kvm_set_cr3(vcpu, vcpu->arch.cr3); | 3697 | (void)kvm_set_cr3(vcpu, kvm_read_cr3(vcpu)); |
3263 | return 1; | 3698 | return 1; |
3264 | } | 3699 | } |
3265 | 3700 | ||
@@ -3355,271 +3790,25 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]) | |||
3355 | } | 3790 | } |
3356 | EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy); | 3791 | EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy); |
3357 | 3792 | ||
3358 | #ifdef AUDIT | 3793 | void kvm_mmu_destroy(struct kvm_vcpu *vcpu) |
3359 | |||
3360 | static const char *audit_msg; | ||
3361 | |||
3362 | static gva_t canonicalize(gva_t gva) | ||
3363 | { | ||
3364 | #ifdef CONFIG_X86_64 | ||
3365 | gva = (long long)(gva << 16) >> 16; | ||
3366 | #endif | ||
3367 | return gva; | ||
3368 | } | ||
3369 | |||
3370 | |||
3371 | typedef void (*inspect_spte_fn) (struct kvm *kvm, u64 *sptep); | ||
3372 | |||
3373 | static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp, | ||
3374 | inspect_spte_fn fn) | ||
3375 | { | ||
3376 | int i; | ||
3377 | |||
3378 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { | ||
3379 | u64 ent = sp->spt[i]; | ||
3380 | |||
3381 | if (is_shadow_present_pte(ent)) { | ||
3382 | if (!is_last_spte(ent, sp->role.level)) { | ||
3383 | struct kvm_mmu_page *child; | ||
3384 | child = page_header(ent & PT64_BASE_ADDR_MASK); | ||
3385 | __mmu_spte_walk(kvm, child, fn); | ||
3386 | } else | ||
3387 | fn(kvm, &sp->spt[i]); | ||
3388 | } | ||
3389 | } | ||
3390 | } | ||
3391 | |||
3392 | static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn) | ||
3393 | { | ||
3394 | int i; | ||
3395 | struct kvm_mmu_page *sp; | ||
3396 | |||
3397 | if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) | ||
3398 | return; | ||
3399 | if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { | ||
3400 | hpa_t root = vcpu->arch.mmu.root_hpa; | ||
3401 | sp = page_header(root); | ||
3402 | __mmu_spte_walk(vcpu->kvm, sp, fn); | ||
3403 | return; | ||
3404 | } | ||
3405 | for (i = 0; i < 4; ++i) { | ||
3406 | hpa_t root = vcpu->arch.mmu.pae_root[i]; | ||
3407 | |||
3408 | if (root && VALID_PAGE(root)) { | ||
3409 | root &= PT64_BASE_ADDR_MASK; | ||
3410 | sp = page_header(root); | ||
3411 | __mmu_spte_walk(vcpu->kvm, sp, fn); | ||
3412 | } | ||
3413 | } | ||
3414 | return; | ||
3415 | } | ||
3416 | |||
3417 | static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, | ||
3418 | gva_t va, int level) | ||
3419 | { | ||
3420 | u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK); | ||
3421 | int i; | ||
3422 | gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1)); | ||
3423 | |||
3424 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) { | ||
3425 | u64 ent = pt[i]; | ||
3426 | |||
3427 | if (ent == shadow_trap_nonpresent_pte) | ||
3428 | continue; | ||
3429 | |||
3430 | va = canonicalize(va); | ||
3431 | if (is_shadow_present_pte(ent) && !is_last_spte(ent, level)) | ||
3432 | audit_mappings_page(vcpu, ent, va, level - 1); | ||
3433 | else { | ||
3434 | gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, va, NULL); | ||
3435 | gfn_t gfn = gpa >> PAGE_SHIFT; | ||
3436 | pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn); | ||
3437 | hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT; | ||
3438 | |||
3439 | if (is_error_pfn(pfn)) { | ||
3440 | kvm_release_pfn_clean(pfn); | ||
3441 | continue; | ||
3442 | } | ||
3443 | |||
3444 | if (is_shadow_present_pte(ent) | ||
3445 | && (ent & PT64_BASE_ADDR_MASK) != hpa) | ||
3446 | printk(KERN_ERR "xx audit error: (%s) levels %d" | ||
3447 | " gva %lx gpa %llx hpa %llx ent %llx %d\n", | ||
3448 | audit_msg, vcpu->arch.mmu.root_level, | ||
3449 | va, gpa, hpa, ent, | ||
3450 | is_shadow_present_pte(ent)); | ||
3451 | else if (ent == shadow_notrap_nonpresent_pte | ||
3452 | && !is_error_hpa(hpa)) | ||
3453 | printk(KERN_ERR "audit: (%s) notrap shadow," | ||
3454 | " valid guest gva %lx\n", audit_msg, va); | ||
3455 | kvm_release_pfn_clean(pfn); | ||
3456 | |||
3457 | } | ||
3458 | } | ||
3459 | } | ||
3460 | |||
3461 | static void audit_mappings(struct kvm_vcpu *vcpu) | ||
3462 | { | ||
3463 | unsigned i; | ||
3464 | |||
3465 | if (vcpu->arch.mmu.root_level == 4) | ||
3466 | audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4); | ||
3467 | else | ||
3468 | for (i = 0; i < 4; ++i) | ||
3469 | if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK) | ||
3470 | audit_mappings_page(vcpu, | ||
3471 | vcpu->arch.mmu.pae_root[i], | ||
3472 | i << 30, | ||
3473 | 2); | ||
3474 | } | ||
3475 | |||
3476 | static int count_rmaps(struct kvm_vcpu *vcpu) | ||
3477 | { | ||
3478 | struct kvm *kvm = vcpu->kvm; | ||
3479 | struct kvm_memslots *slots; | ||
3480 | int nmaps = 0; | ||
3481 | int i, j, k, idx; | ||
3482 | |||
3483 | idx = srcu_read_lock(&kvm->srcu); | ||
3484 | slots = kvm_memslots(kvm); | ||
3485 | for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { | ||
3486 | struct kvm_memory_slot *m = &slots->memslots[i]; | ||
3487 | struct kvm_rmap_desc *d; | ||
3488 | |||
3489 | for (j = 0; j < m->npages; ++j) { | ||
3490 | unsigned long *rmapp = &m->rmap[j]; | ||
3491 | |||
3492 | if (!*rmapp) | ||
3493 | continue; | ||
3494 | if (!(*rmapp & 1)) { | ||
3495 | ++nmaps; | ||
3496 | continue; | ||
3497 | } | ||
3498 | d = (struct kvm_rmap_desc *)(*rmapp & ~1ul); | ||
3499 | while (d) { | ||
3500 | for (k = 0; k < RMAP_EXT; ++k) | ||
3501 | if (d->sptes[k]) | ||
3502 | ++nmaps; | ||
3503 | else | ||
3504 | break; | ||
3505 | d = d->more; | ||
3506 | } | ||
3507 | } | ||
3508 | } | ||
3509 | srcu_read_unlock(&kvm->srcu, idx); | ||
3510 | return nmaps; | ||
3511 | } | ||
3512 | |||
3513 | void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) | ||
3514 | { | ||
3515 | unsigned long *rmapp; | ||
3516 | struct kvm_mmu_page *rev_sp; | ||
3517 | gfn_t gfn; | ||
3518 | |||
3519 | if (is_writable_pte(*sptep)) { | ||
3520 | rev_sp = page_header(__pa(sptep)); | ||
3521 | gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt); | ||
3522 | |||
3523 | if (!gfn_to_memslot(kvm, gfn)) { | ||
3524 | if (!printk_ratelimit()) | ||
3525 | return; | ||
3526 | printk(KERN_ERR "%s: no memslot for gfn %ld\n", | ||
3527 | audit_msg, gfn); | ||
3528 | printk(KERN_ERR "%s: index %ld of sp (gfn=%lx)\n", | ||
3529 | audit_msg, (long int)(sptep - rev_sp->spt), | ||
3530 | rev_sp->gfn); | ||
3531 | dump_stack(); | ||
3532 | return; | ||
3533 | } | ||
3534 | |||
3535 | rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level); | ||
3536 | if (!*rmapp) { | ||
3537 | if (!printk_ratelimit()) | ||
3538 | return; | ||
3539 | printk(KERN_ERR "%s: no rmap for writable spte %llx\n", | ||
3540 | audit_msg, *sptep); | ||
3541 | dump_stack(); | ||
3542 | } | ||
3543 | } | ||
3544 | |||
3545 | } | ||
3546 | |||
3547 | void audit_writable_sptes_have_rmaps(struct kvm_vcpu *vcpu) | ||
3548 | { | ||
3549 | mmu_spte_walk(vcpu, inspect_spte_has_rmap); | ||
3550 | } | ||
3551 | |||
3552 | static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu) | ||
3553 | { | 3794 | { |
3554 | struct kvm_mmu_page *sp; | 3795 | ASSERT(vcpu); |
3555 | int i; | ||
3556 | |||
3557 | list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) { | ||
3558 | u64 *pt = sp->spt; | ||
3559 | |||
3560 | if (sp->role.level != PT_PAGE_TABLE_LEVEL) | ||
3561 | continue; | ||
3562 | |||
3563 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { | ||
3564 | u64 ent = pt[i]; | ||
3565 | |||
3566 | if (!(ent & PT_PRESENT_MASK)) | ||
3567 | continue; | ||
3568 | if (!is_writable_pte(ent)) | ||
3569 | continue; | ||
3570 | inspect_spte_has_rmap(vcpu->kvm, &pt[i]); | ||
3571 | } | ||
3572 | } | ||
3573 | return; | ||
3574 | } | ||
3575 | 3796 | ||
3576 | static void audit_rmap(struct kvm_vcpu *vcpu) | 3797 | destroy_kvm_mmu(vcpu); |
3577 | { | 3798 | free_mmu_pages(vcpu); |
3578 | check_writable_mappings_rmap(vcpu); | 3799 | mmu_free_memory_caches(vcpu); |
3579 | count_rmaps(vcpu); | ||
3580 | } | 3800 | } |
3581 | 3801 | ||
3582 | static void audit_write_protection(struct kvm_vcpu *vcpu) | 3802 | #ifdef CONFIG_KVM_MMU_AUDIT |
3583 | { | 3803 | #include "mmu_audit.c" |
3584 | struct kvm_mmu_page *sp; | 3804 | #else |
3585 | struct kvm_memory_slot *slot; | 3805 | static void mmu_audit_disable(void) { } |
3586 | unsigned long *rmapp; | 3806 | #endif |
3587 | u64 *spte; | ||
3588 | gfn_t gfn; | ||
3589 | |||
3590 | list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) { | ||
3591 | if (sp->role.direct) | ||
3592 | continue; | ||
3593 | if (sp->unsync) | ||
3594 | continue; | ||
3595 | |||
3596 | slot = gfn_to_memslot(vcpu->kvm, sp->gfn); | ||
3597 | rmapp = &slot->rmap[gfn - slot->base_gfn]; | ||
3598 | |||
3599 | spte = rmap_next(vcpu->kvm, rmapp, NULL); | ||
3600 | while (spte) { | ||
3601 | if (is_writable_pte(*spte)) | ||
3602 | printk(KERN_ERR "%s: (%s) shadow page has " | ||
3603 | "writable mappings: gfn %lx role %x\n", | ||
3604 | __func__, audit_msg, sp->gfn, | ||
3605 | sp->role.word); | ||
3606 | spte = rmap_next(vcpu->kvm, rmapp, spte); | ||
3607 | } | ||
3608 | } | ||
3609 | } | ||
3610 | 3807 | ||
3611 | static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) | 3808 | void kvm_mmu_module_exit(void) |
3612 | { | 3809 | { |
3613 | int olddbg = dbg; | 3810 | mmu_destroy_caches(); |
3614 | 3811 | percpu_counter_destroy(&kvm_total_used_mmu_pages); | |
3615 | dbg = 0; | 3812 | unregister_shrinker(&mmu_shrinker); |
3616 | audit_msg = msg; | 3813 | mmu_audit_disable(); |
3617 | audit_rmap(vcpu); | ||
3618 | audit_write_protection(vcpu); | ||
3619 | if (strcmp("pre pte write", audit_msg) != 0) | ||
3620 | audit_mappings(vcpu); | ||
3621 | audit_writable_sptes_have_rmaps(vcpu); | ||
3622 | dbg = olddbg; | ||
3623 | } | 3814 | } |
3624 | |||
3625 | #endif | ||
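The mmu_shrink() hunk above moves to the shrinker callback form that receives a struct shrink_control instead of separate nr_to_scan/gfp_mask arguments. A minimal illustrative sketch of that callback shape, outside KVM and not part of this commit (all names hypothetical):

#include <linux/mm.h>

static int example_count;	/* illustrative cache-size bookkeeping */

/* Report the cache size when sc->nr_to_scan == 0, otherwise try to reclaim. */
static int example_shrink(struct shrinker *shrink, struct shrink_control *sc)
{
	int nr_to_scan = sc->nr_to_scan;

	if (nr_to_scan == 0)
		goto out;

	/* reclaim up to nr_to_scan objects here, updating example_count */

out:
	return example_count;
}

static struct shrinker example_shrinker = {
	.shrink = example_shrink,
	.seeks = DEFAULT_SEEKS,
};

/* registered once at init with register_shrinker(&example_shrinker) */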
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index be66759321a5..7086ca85d3e7 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h | |||
@@ -49,10 +49,17 @@ | |||
49 | #define PFERR_FETCH_MASK (1U << 4) | 49 | #define PFERR_FETCH_MASK (1U << 4) |
50 | 50 | ||
51 | int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]); | 51 | int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]); |
52 | int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context); | ||
53 | |||
54 | static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) | ||
55 | { | ||
56 | return kvm->arch.n_max_mmu_pages - | ||
57 | kvm->arch.n_used_mmu_pages; | ||
58 | } | ||
52 | 59 | ||
53 | static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) | 60 | static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) |
54 | { | 61 | { |
55 | if (unlikely(vcpu->kvm->arch.n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES)) | 62 | if (unlikely(kvm_mmu_available_pages(vcpu->kvm) < KVM_MIN_FREE_MMU_PAGES)) |
56 | __kvm_mmu_free_some_pages(vcpu); | 63 | __kvm_mmu_free_some_pages(vcpu); |
57 | } | 64 | } |
58 | 65 | ||
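The accounting change visible above (n_used_mmu_pages plus the global kvm_total_used_mmu_pages that the shrinker reports) rests on the generic percpu_counter API. A hedged, self-contained sketch of that pattern with illustrative names, not the KVM code:

#include <linux/percpu_counter.h>

static struct percpu_counter total_used_pages;

static int accounting_init(void)
{
	/* may fail under memory pressure, like the nomem path seen above */
	return percpu_counter_init(&total_used_pages, 0);
}

static void account_pages(long nr)
{
	percpu_counter_add(&total_used_pages, nr);	/* nr may be negative */
}

static unsigned long total_used_snapshot(void)
{
	/* cheap, approximate read that never returns a negative value */
	return percpu_counter_read_positive(&total_used_pages);
}

static void accounting_exit(void)
{
	percpu_counter_destroy(&total_used_pages);
}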
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c new file mode 100644 index 000000000000..5f6223b8bcf7 --- /dev/null +++ b/arch/x86/kvm/mmu_audit.c | |||
@@ -0,0 +1,304 @@ | |||
1 | /* | ||
2 | * mmu_audit.c: | ||
3 | * | ||
4 | * Audit code for KVM MMU | ||
5 | * | ||
6 | * Copyright (C) 2006 Qumranet, Inc. | ||
7 | * Copyright 2010 Red Hat, Inc. and/or its affiliates. | ||
8 | * | ||
9 | * Authors: | ||
10 | * Yaniv Kamay <yaniv@qumranet.com> | ||
11 | * Avi Kivity <avi@qumranet.com> | ||
12 | * Marcelo Tosatti <mtosatti@redhat.com> | ||
13 | * Xiao Guangrong <xiaoguangrong@cn.fujitsu.com> | ||
14 | * | ||
15 | * This work is licensed under the terms of the GNU GPL, version 2. See | ||
16 | * the COPYING file in the top-level directory. | ||
17 | * | ||
18 | */ | ||
19 | |||
20 | #include <linux/ratelimit.h> | ||
21 | |||
22 | #define audit_printk(kvm, fmt, args...) \ | ||
23 | printk(KERN_ERR "audit: (%s) error: " \ | ||
24 | fmt, audit_point_name[kvm->arch.audit_point], ##args) | ||
25 | |||
26 | typedef void (*inspect_spte_fn) (struct kvm_vcpu *vcpu, u64 *sptep, int level); | ||
27 | |||
28 | static void __mmu_spte_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, | ||
29 | inspect_spte_fn fn, int level) | ||
30 | { | ||
31 | int i; | ||
32 | |||
33 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { | ||
34 | u64 *ent = sp->spt; | ||
35 | |||
36 | fn(vcpu, ent + i, level); | ||
37 | |||
38 | if (is_shadow_present_pte(ent[i]) && | ||
39 | !is_last_spte(ent[i], level)) { | ||
40 | struct kvm_mmu_page *child; | ||
41 | |||
42 | child = page_header(ent[i] & PT64_BASE_ADDR_MASK); | ||
43 | __mmu_spte_walk(vcpu, child, fn, level - 1); | ||
44 | } | ||
45 | } | ||
46 | } | ||
47 | |||
48 | static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn) | ||
49 | { | ||
50 | int i; | ||
51 | struct kvm_mmu_page *sp; | ||
52 | |||
53 | if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) | ||
54 | return; | ||
55 | |||
56 | if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { | ||
57 | hpa_t root = vcpu->arch.mmu.root_hpa; | ||
58 | |||
59 | sp = page_header(root); | ||
60 | __mmu_spte_walk(vcpu, sp, fn, PT64_ROOT_LEVEL); | ||
61 | return; | ||
62 | } | ||
63 | |||
64 | for (i = 0; i < 4; ++i) { | ||
65 | hpa_t root = vcpu->arch.mmu.pae_root[i]; | ||
66 | |||
67 | if (root && VALID_PAGE(root)) { | ||
68 | root &= PT64_BASE_ADDR_MASK; | ||
69 | sp = page_header(root); | ||
70 | __mmu_spte_walk(vcpu, sp, fn, 2); | ||
71 | } | ||
72 | } | ||
73 | |||
74 | return; | ||
75 | } | ||
76 | |||
77 | typedef void (*sp_handler) (struct kvm *kvm, struct kvm_mmu_page *sp); | ||
78 | |||
79 | static void walk_all_active_sps(struct kvm *kvm, sp_handler fn) | ||
80 | { | ||
81 | struct kvm_mmu_page *sp; | ||
82 | |||
83 | list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) | ||
84 | fn(kvm, sp); | ||
85 | } | ||
86 | |||
87 | static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level) | ||
88 | { | ||
89 | struct kvm_mmu_page *sp; | ||
90 | gfn_t gfn; | ||
91 | pfn_t pfn; | ||
92 | hpa_t hpa; | ||
93 | |||
94 | sp = page_header(__pa(sptep)); | ||
95 | |||
96 | if (sp->unsync) { | ||
97 | if (level != PT_PAGE_TABLE_LEVEL) { | ||
98 | audit_printk(vcpu->kvm, "unsync sp: %p " | ||
99 | "level = %d\n", sp, level); | ||
100 | return; | ||
101 | } | ||
102 | |||
103 | if (*sptep == shadow_notrap_nonpresent_pte) { | ||
104 | audit_printk(vcpu->kvm, "notrap spte in unsync " | ||
105 | "sp: %p\n", sp); | ||
106 | return; | ||
107 | } | ||
108 | } | ||
109 | |||
110 | if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) { | ||
111 | audit_printk(vcpu->kvm, "notrap spte in direct sp: %p\n", | ||
112 | sp); | ||
113 | return; | ||
114 | } | ||
115 | |||
116 | if (!is_shadow_present_pte(*sptep) || !is_last_spte(*sptep, level)) | ||
117 | return; | ||
118 | |||
119 | gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); | ||
120 | pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn); | ||
121 | |||
122 | if (is_error_pfn(pfn)) { | ||
123 | kvm_release_pfn_clean(pfn); | ||
124 | return; | ||
125 | } | ||
126 | |||
127 | hpa = pfn << PAGE_SHIFT; | ||
128 | if ((*sptep & PT64_BASE_ADDR_MASK) != hpa) | ||
129 | audit_printk(vcpu->kvm, "levels %d pfn %llx hpa %llx " | ||
130 | "ent %llxn", vcpu->arch.mmu.root_level, pfn, | ||
131 | hpa, *sptep); | ||
132 | } | ||
133 | |||
134 | static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) | ||
135 | { | ||
136 | unsigned long *rmapp; | ||
137 | struct kvm_mmu_page *rev_sp; | ||
138 | gfn_t gfn; | ||
139 | |||
140 | |||
141 | rev_sp = page_header(__pa(sptep)); | ||
142 | gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt); | ||
143 | |||
144 | if (!gfn_to_memslot(kvm, gfn)) { | ||
145 | if (!printk_ratelimit()) | ||
146 | return; | ||
147 | audit_printk(kvm, "no memslot for gfn %llx\n", gfn); | ||
148 | audit_printk(kvm, "index %ld of sp (gfn=%llx)\n", | ||
149 | (long int)(sptep - rev_sp->spt), rev_sp->gfn); | ||
150 | dump_stack(); | ||
151 | return; | ||
152 | } | ||
153 | |||
154 | rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level); | ||
155 | if (!*rmapp) { | ||
156 | if (!printk_ratelimit()) | ||
157 | return; | ||
158 | audit_printk(kvm, "no rmap for writable spte %llx\n", | ||
159 | *sptep); | ||
160 | dump_stack(); | ||
161 | } | ||
162 | } | ||
163 | |||
164 | static void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu, u64 *sptep, int level) | ||
165 | { | ||
166 | if (is_shadow_present_pte(*sptep) && is_last_spte(*sptep, level)) | ||
167 | inspect_spte_has_rmap(vcpu->kvm, sptep); | ||
168 | } | ||
169 | |||
170 | static void audit_spte_after_sync(struct kvm_vcpu *vcpu, u64 *sptep, int level) | ||
171 | { | ||
172 | struct kvm_mmu_page *sp = page_header(__pa(sptep)); | ||
173 | |||
174 | if (vcpu->kvm->arch.audit_point == AUDIT_POST_SYNC && sp->unsync) | ||
175 | audit_printk(vcpu->kvm, "meet unsync sp(%p) after sync " | ||
176 | "root.\n", sp); | ||
177 | } | ||
178 | |||
179 | static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp) | ||
180 | { | ||
181 | int i; | ||
182 | |||
183 | if (sp->role.level != PT_PAGE_TABLE_LEVEL) | ||
184 | return; | ||
185 | |||
186 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { | ||
187 | if (!is_rmap_spte(sp->spt[i])) | ||
188 | continue; | ||
189 | |||
190 | inspect_spte_has_rmap(kvm, sp->spt + i); | ||
191 | } | ||
192 | } | ||
193 | |||
194 | static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp) | ||
195 | { | ||
196 | struct kvm_memory_slot *slot; | ||
197 | unsigned long *rmapp; | ||
198 | u64 *spte; | ||
199 | |||
200 | if (sp->role.direct || sp->unsync || sp->role.invalid) | ||
201 | return; | ||
202 | |||
203 | slot = gfn_to_memslot(kvm, sp->gfn); | ||
204 | rmapp = &slot->rmap[sp->gfn - slot->base_gfn]; | ||
205 | |||
206 | spte = rmap_next(kvm, rmapp, NULL); | ||
207 | while (spte) { | ||
208 | if (is_writable_pte(*spte)) | ||
209 | audit_printk(kvm, "shadow page has writable " | ||
210 | "mappings: gfn %llx role %x\n", | ||
211 | sp->gfn, sp->role.word); | ||
212 | spte = rmap_next(kvm, rmapp, spte); | ||
213 | } | ||
214 | } | ||
215 | |||
216 | static void audit_sp(struct kvm *kvm, struct kvm_mmu_page *sp) | ||
217 | { | ||
218 | check_mappings_rmap(kvm, sp); | ||
219 | audit_write_protection(kvm, sp); | ||
220 | } | ||
221 | |||
222 | static void audit_all_active_sps(struct kvm *kvm) | ||
223 | { | ||
224 | walk_all_active_sps(kvm, audit_sp); | ||
225 | } | ||
226 | |||
227 | static void audit_spte(struct kvm_vcpu *vcpu, u64 *sptep, int level) | ||
228 | { | ||
229 | audit_sptes_have_rmaps(vcpu, sptep, level); | ||
230 | audit_mappings(vcpu, sptep, level); | ||
231 | audit_spte_after_sync(vcpu, sptep, level); | ||
232 | } | ||
233 | |||
234 | static void audit_vcpu_spte(struct kvm_vcpu *vcpu) | ||
235 | { | ||
236 | mmu_spte_walk(vcpu, audit_spte); | ||
237 | } | ||
238 | |||
239 | static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int point) | ||
240 | { | ||
241 | static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10); | ||
242 | |||
243 | if (!__ratelimit(&ratelimit_state)) | ||
244 | return; | ||
245 | |||
246 | vcpu->kvm->arch.audit_point = point; | ||
247 | audit_all_active_sps(vcpu->kvm); | ||
248 | audit_vcpu_spte(vcpu); | ||
249 | } | ||
250 | |||
251 | static bool mmu_audit; | ||
252 | |||
253 | static void mmu_audit_enable(void) | ||
254 | { | ||
255 | int ret; | ||
256 | |||
257 | if (mmu_audit) | ||
258 | return; | ||
259 | |||
260 | ret = register_trace_kvm_mmu_audit(kvm_mmu_audit, NULL); | ||
261 | WARN_ON(ret); | ||
262 | |||
263 | mmu_audit = true; | ||
264 | } | ||
265 | |||
266 | static void mmu_audit_disable(void) | ||
267 | { | ||
268 | if (!mmu_audit) | ||
269 | return; | ||
270 | |||
271 | unregister_trace_kvm_mmu_audit(kvm_mmu_audit, NULL); | ||
272 | tracepoint_synchronize_unregister(); | ||
273 | mmu_audit = false; | ||
274 | } | ||
275 | |||
276 | static int mmu_audit_set(const char *val, const struct kernel_param *kp) | ||
277 | { | ||
278 | int ret; | ||
279 | unsigned long enable; | ||
280 | |||
281 | ret = strict_strtoul(val, 10, &enable); | ||
282 | if (ret < 0) | ||
283 | return -EINVAL; | ||
284 | |||
285 | switch (enable) { | ||
286 | case 0: | ||
287 | mmu_audit_disable(); | ||
288 | break; | ||
289 | case 1: | ||
290 | mmu_audit_enable(); | ||
291 | break; | ||
292 | default: | ||
293 | return -EINVAL; | ||
294 | } | ||
295 | |||
296 | return 0; | ||
297 | } | ||
298 | |||
299 | static struct kernel_param_ops audit_param_ops = { | ||
300 | .set = mmu_audit_set, | ||
301 | .get = param_get_bool, | ||
302 | }; | ||
303 | |||
304 | module_param_cb(mmu_audit, &audit_param_ops, &mmu_audit, 0644); | ||
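mmu_audit.c above exposes its on/off switch through module_param_cb() with a custom setter that runs side effects (probe registration) when the value flips. A minimal sketch of that pattern with a hypothetical parameter name, not the KVM code:

#include <linux/kernel.h>
#include <linux/moduleparam.h>

static bool feature_on;

/* custom setter: validate the value and run enable/disable work */
static int feature_set(const char *val, const struct kernel_param *kp)
{
	unsigned long enable;

	if (strict_strtoul(val, 10, &enable) < 0 || enable > 1)
		return -EINVAL;

	/* enable/disable the feature here before publishing the new state */
	feature_on = enable;
	return 0;
}

static struct kernel_param_ops feature_param_ops = {
	.set = feature_set,
	.get = param_get_bool,	/* the default getter is enough for a bool */
};

/* typically reachable via /sys/module/<module>/parameters/feature */
module_param_cb(feature, &feature_param_ops, &feature_on, 0644);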
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index 3aab0f0930ef..b60b4fdb3eda 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h | |||
@@ -195,6 +195,25 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page, | |||
195 | 195 | ||
196 | TP_ARGS(sp) | 196 | TP_ARGS(sp) |
197 | ); | 197 | ); |
198 | |||
199 | TRACE_EVENT( | ||
200 | kvm_mmu_audit, | ||
201 | TP_PROTO(struct kvm_vcpu *vcpu, int audit_point), | ||
202 | TP_ARGS(vcpu, audit_point), | ||
203 | |||
204 | TP_STRUCT__entry( | ||
205 | __field(struct kvm_vcpu *, vcpu) | ||
206 | __field(int, audit_point) | ||
207 | ), | ||
208 | |||
209 | TP_fast_assign( | ||
210 | __entry->vcpu = vcpu; | ||
211 | __entry->audit_point = audit_point; | ||
212 | ), | ||
213 | |||
214 | TP_printk("vcpu:%d %s", __entry->vcpu->cpu, | ||
215 | audit_point_name[__entry->audit_point]) | ||
216 | ); | ||
198 | #endif /* _TRACE_KVMMMU_H */ | 217 | #endif /* _TRACE_KVMMMU_H */ |
199 | 218 | ||
200 | #undef TRACE_INCLUDE_PATH | 219 | #undef TRACE_INCLUDE_PATH |
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 51ef9097960d..9d03ad4dd5ec 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h | |||
@@ -7,7 +7,7 @@ | |||
7 | * MMU support | 7 | * MMU support |
8 | * | 8 | * |
9 | * Copyright (C) 2006 Qumranet, Inc. | 9 | * Copyright (C) 2006 Qumranet, Inc. |
10 | * Copyright 2010 Red Hat, Inc. and/or its affilates. | 10 | * Copyright 2010 Red Hat, Inc. and/or its affiliates. |
11 | * | 11 | * |
12 | * Authors: | 12 | * Authors: |
13 | * Yaniv Kamay <yaniv@qumranet.com> | 13 | * Yaniv Kamay <yaniv@qumranet.com> |
@@ -31,7 +31,6 @@ | |||
31 | #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl) | 31 | #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl) |
32 | #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) | 32 | #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) |
33 | #define PT_INDEX(addr, level) PT64_INDEX(addr, level) | 33 | #define PT_INDEX(addr, level) PT64_INDEX(addr, level) |
34 | #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level) | ||
35 | #define PT_LEVEL_BITS PT64_LEVEL_BITS | 34 | #define PT_LEVEL_BITS PT64_LEVEL_BITS |
36 | #ifdef CONFIG_X86_64 | 35 | #ifdef CONFIG_X86_64 |
37 | #define PT_MAX_FULL_LEVELS 4 | 36 | #define PT_MAX_FULL_LEVELS 4 |
@@ -48,7 +47,6 @@ | |||
48 | #define PT_LVL_ADDR_MASK(lvl) PT32_LVL_ADDR_MASK(lvl) | 47 | #define PT_LVL_ADDR_MASK(lvl) PT32_LVL_ADDR_MASK(lvl) |
49 | #define PT_LVL_OFFSET_MASK(lvl) PT32_LVL_OFFSET_MASK(lvl) | 48 | #define PT_LVL_OFFSET_MASK(lvl) PT32_LVL_OFFSET_MASK(lvl) |
50 | #define PT_INDEX(addr, level) PT32_INDEX(addr, level) | 49 | #define PT_INDEX(addr, level) PT32_INDEX(addr, level) |
51 | #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level) | ||
52 | #define PT_LEVEL_BITS PT32_LEVEL_BITS | 50 | #define PT_LEVEL_BITS PT32_LEVEL_BITS |
53 | #define PT_MAX_FULL_LEVELS 2 | 51 | #define PT_MAX_FULL_LEVELS 2 |
54 | #define CMPXCHG cmpxchg | 52 | #define CMPXCHG cmpxchg |
@@ -67,11 +65,12 @@ struct guest_walker { | |||
67 | int level; | 65 | int level; |
68 | gfn_t table_gfn[PT_MAX_FULL_LEVELS]; | 66 | gfn_t table_gfn[PT_MAX_FULL_LEVELS]; |
69 | pt_element_t ptes[PT_MAX_FULL_LEVELS]; | 67 | pt_element_t ptes[PT_MAX_FULL_LEVELS]; |
68 | pt_element_t prefetch_ptes[PTE_PREFETCH_NUM]; | ||
70 | gpa_t pte_gpa[PT_MAX_FULL_LEVELS]; | 69 | gpa_t pte_gpa[PT_MAX_FULL_LEVELS]; |
71 | unsigned pt_access; | 70 | unsigned pt_access; |
72 | unsigned pte_access; | 71 | unsigned pte_access; |
73 | gfn_t gfn; | 72 | gfn_t gfn; |
74 | u32 error_code; | 73 | struct x86_exception fault; |
75 | }; | 74 | }; |
76 | 75 | ||
77 | static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl) | 76 | static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl) |
@@ -79,15 +78,19 @@ static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl) | |||
79 | return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT; | 78 | return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT; |
80 | } | 79 | } |
81 | 80 | ||
82 | static bool FNAME(cmpxchg_gpte)(struct kvm *kvm, | 81 | static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, |
83 | gfn_t table_gfn, unsigned index, | 82 | pt_element_t __user *ptep_user, unsigned index, |
84 | pt_element_t orig_pte, pt_element_t new_pte) | 83 | pt_element_t orig_pte, pt_element_t new_pte) |
85 | { | 84 | { |
85 | int npages; | ||
86 | pt_element_t ret; | 86 | pt_element_t ret; |
87 | pt_element_t *table; | 87 | pt_element_t *table; |
88 | struct page *page; | 88 | struct page *page; |
89 | 89 | ||
90 | page = gfn_to_page(kvm, table_gfn); | 90 | npages = get_user_pages_fast((unsigned long)ptep_user, 1, 1, &page); |
91 | /* Check if the user is doing something meaningless. */ | ||
92 | if (unlikely(npages != 1)) | ||
93 | return -EFAULT; | ||
91 | 94 | ||
92 | table = kmap_atomic(page, KM_USER0); | 95 | table = kmap_atomic(page, KM_USER0); |
93 | ret = CMPXCHG(&table[index], orig_pte, new_pte); | 96 | ret = CMPXCHG(&table[index], orig_pte, new_pte); |
@@ -104,7 +107,7 @@ static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte) | |||
104 | 107 | ||
105 | access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; | 108 | access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; |
106 | #if PTTYPE == 64 | 109 | #if PTTYPE == 64 |
107 | if (is_nx(vcpu)) | 110 | if (vcpu->arch.mmu.nx) |
108 | access &= ~(gpte >> PT64_NX_SHIFT); | 111 | access &= ~(gpte >> PT64_NX_SHIFT); |
109 | #endif | 112 | #endif |
110 | return access; | 113 | return access; |
@@ -113,26 +116,33 @@ static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte) | |||
113 | /* | 116 | /* |
114 | * Fetch a guest pte for a guest virtual address | 117 | * Fetch a guest pte for a guest virtual address |
115 | */ | 118 | */ |
116 | static int FNAME(walk_addr)(struct guest_walker *walker, | 119 | static int FNAME(walk_addr_generic)(struct guest_walker *walker, |
117 | struct kvm_vcpu *vcpu, gva_t addr, | 120 | struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, |
118 | int write_fault, int user_fault, int fetch_fault) | 121 | gva_t addr, u32 access) |
119 | { | 122 | { |
120 | pt_element_t pte; | 123 | pt_element_t pte; |
124 | pt_element_t __user *uninitialized_var(ptep_user); | ||
121 | gfn_t table_gfn; | 125 | gfn_t table_gfn; |
122 | unsigned index, pt_access, uninitialized_var(pte_access); | 126 | unsigned index, pt_access, uninitialized_var(pte_access); |
123 | gpa_t pte_gpa; | 127 | gpa_t pte_gpa; |
124 | bool eperm, present, rsvd_fault; | 128 | bool eperm, present, rsvd_fault; |
129 | int offset, write_fault, user_fault, fetch_fault; | ||
130 | |||
131 | write_fault = access & PFERR_WRITE_MASK; | ||
132 | user_fault = access & PFERR_USER_MASK; | ||
133 | fetch_fault = access & PFERR_FETCH_MASK; | ||
125 | 134 | ||
126 | trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault, | 135 | trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault, |
127 | fetch_fault); | 136 | fetch_fault); |
128 | walk: | 137 | walk: |
129 | present = true; | 138 | present = true; |
130 | eperm = rsvd_fault = false; | 139 | eperm = rsvd_fault = false; |
131 | walker->level = vcpu->arch.mmu.root_level; | 140 | walker->level = mmu->root_level; |
132 | pte = vcpu->arch.cr3; | 141 | pte = mmu->get_cr3(vcpu); |
142 | |||
133 | #if PTTYPE == 64 | 143 | #if PTTYPE == 64 |
134 | if (!is_long_mode(vcpu)) { | 144 | if (walker->level == PT32E_ROOT_LEVEL) { |
135 | pte = kvm_pdptr_read(vcpu, (addr >> 30) & 3); | 145 | pte = kvm_pdptr_read_mmu(vcpu, mmu, (addr >> 30) & 3); |
136 | trace_kvm_mmu_paging_element(pte, walker->level); | 146 | trace_kvm_mmu_paging_element(pte, walker->level); |
137 | if (!is_present_gpte(pte)) { | 147 | if (!is_present_gpte(pte)) { |
138 | present = false; | 148 | present = false; |
@@ -142,54 +152,80 @@ walk: | |||
142 | } | 152 | } |
143 | #endif | 153 | #endif |
144 | ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) || | 154 | ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) || |
145 | (vcpu->arch.cr3 & CR3_NONPAE_RESERVED_BITS) == 0); | 155 | (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0); |
146 | 156 | ||
147 | pt_access = ACC_ALL; | 157 | pt_access = ACC_ALL; |
148 | 158 | ||
149 | for (;;) { | 159 | for (;;) { |
160 | gfn_t real_gfn; | ||
161 | unsigned long host_addr; | ||
162 | |||
150 | index = PT_INDEX(addr, walker->level); | 163 | index = PT_INDEX(addr, walker->level); |
151 | 164 | ||
152 | table_gfn = gpte_to_gfn(pte); | 165 | table_gfn = gpte_to_gfn(pte); |
153 | pte_gpa = gfn_to_gpa(table_gfn); | 166 | offset = index * sizeof(pt_element_t); |
154 | pte_gpa += index * sizeof(pt_element_t); | 167 | pte_gpa = gfn_to_gpa(table_gfn) + offset; |
155 | walker->table_gfn[walker->level - 1] = table_gfn; | 168 | walker->table_gfn[walker->level - 1] = table_gfn; |
156 | walker->pte_gpa[walker->level - 1] = pte_gpa; | 169 | walker->pte_gpa[walker->level - 1] = pte_gpa; |
157 | 170 | ||
158 | if (kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte))) { | 171 | real_gfn = mmu->translate_gpa(vcpu, gfn_to_gpa(table_gfn), |
172 | PFERR_USER_MASK|PFERR_WRITE_MASK); | ||
173 | if (unlikely(real_gfn == UNMAPPED_GVA)) { | ||
174 | present = false; | ||
175 | break; | ||
176 | } | ||
177 | real_gfn = gpa_to_gfn(real_gfn); | ||
178 | |||
179 | host_addr = gfn_to_hva(vcpu->kvm, real_gfn); | ||
180 | if (unlikely(kvm_is_error_hva(host_addr))) { | ||
181 | present = false; | ||
182 | break; | ||
183 | } | ||
184 | |||
185 | ptep_user = (pt_element_t __user *)((void *)host_addr + offset); | ||
186 | if (unlikely(__copy_from_user(&pte, ptep_user, sizeof(pte)))) { | ||
159 | present = false; | 187 | present = false; |
160 | break; | 188 | break; |
161 | } | 189 | } |
162 | 190 | ||
163 | trace_kvm_mmu_paging_element(pte, walker->level); | 191 | trace_kvm_mmu_paging_element(pte, walker->level); |
164 | 192 | ||
165 | if (!is_present_gpte(pte)) { | 193 | if (unlikely(!is_present_gpte(pte))) { |
166 | present = false; | 194 | present = false; |
167 | break; | 195 | break; |
168 | } | 196 | } |
169 | 197 | ||
170 | if (is_rsvd_bits_set(vcpu, pte, walker->level)) { | 198 | if (unlikely(is_rsvd_bits_set(&vcpu->arch.mmu, pte, |
199 | walker->level))) { | ||
171 | rsvd_fault = true; | 200 | rsvd_fault = true; |
172 | break; | 201 | break; |
173 | } | 202 | } |
174 | 203 | ||
175 | if (write_fault && !is_writable_pte(pte)) | 204 | if (unlikely(write_fault && !is_writable_pte(pte) |
176 | if (user_fault || is_write_protection(vcpu)) | 205 | && (user_fault || is_write_protection(vcpu)))) |
177 | eperm = true; | 206 | eperm = true; |
178 | 207 | ||
179 | if (user_fault && !(pte & PT_USER_MASK)) | 208 | if (unlikely(user_fault && !(pte & PT_USER_MASK))) |
180 | eperm = true; | 209 | eperm = true; |
181 | 210 | ||
182 | #if PTTYPE == 64 | 211 | #if PTTYPE == 64 |
183 | if (fetch_fault && (pte & PT64_NX_MASK)) | 212 | if (unlikely(fetch_fault && (pte & PT64_NX_MASK))) |
184 | eperm = true; | 213 | eperm = true; |
185 | #endif | 214 | #endif |
186 | 215 | ||
187 | if (!eperm && !rsvd_fault && !(pte & PT_ACCESSED_MASK)) { | 216 | if (!eperm && !rsvd_fault |
217 | && unlikely(!(pte & PT_ACCESSED_MASK))) { | ||
218 | int ret; | ||
188 | trace_kvm_mmu_set_accessed_bit(table_gfn, index, | 219 | trace_kvm_mmu_set_accessed_bit(table_gfn, index, |
189 | sizeof(pte)); | 220 | sizeof(pte)); |
190 | if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, | 221 | ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, |
191 | index, pte, pte|PT_ACCESSED_MASK)) | 222 | pte, pte|PT_ACCESSED_MASK); |
223 | if (unlikely(ret < 0)) { | ||
224 | present = false; | ||
225 | break; | ||
226 | } else if (ret) | ||
192 | goto walk; | 227 | goto walk; |
228 | |||
193 | mark_page_dirty(vcpu->kvm, table_gfn); | 229 | mark_page_dirty(vcpu->kvm, table_gfn); |
194 | pte |= PT_ACCESSED_MASK; | 230 | pte |= PT_ACCESSED_MASK; |
195 | } | 231 | } |
@@ -204,17 +240,28 @@ walk: | |||
204 | (PTTYPE == 64 || is_pse(vcpu))) || | 240 | (PTTYPE == 64 || is_pse(vcpu))) || |
205 | ((walker->level == PT_PDPE_LEVEL) && | 241 | ((walker->level == PT_PDPE_LEVEL) && |
206 | is_large_pte(pte) && | 242 | is_large_pte(pte) && |
207 | is_long_mode(vcpu))) { | 243 | mmu->root_level == PT64_ROOT_LEVEL)) { |
208 | int lvl = walker->level; | 244 | int lvl = walker->level; |
245 | gpa_t real_gpa; | ||
246 | gfn_t gfn; | ||
247 | u32 ac; | ||
209 | 248 | ||
210 | walker->gfn = gpte_to_gfn_lvl(pte, lvl); | 249 | gfn = gpte_to_gfn_lvl(pte, lvl); |
211 | walker->gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) | 250 | gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) >> PAGE_SHIFT; |
212 | >> PAGE_SHIFT; | ||
213 | 251 | ||
214 | if (PTTYPE == 32 && | 252 | if (PTTYPE == 32 && |
215 | walker->level == PT_DIRECTORY_LEVEL && | 253 | walker->level == PT_DIRECTORY_LEVEL && |
216 | is_cpuid_PSE36()) | 254 | is_cpuid_PSE36()) |
217 | walker->gfn += pse36_gfn_delta(pte); | 255 | gfn += pse36_gfn_delta(pte); |
256 | |||
257 | ac = write_fault | fetch_fault | user_fault; | ||
258 | |||
259 | real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), | ||
260 | ac); | ||
261 | if (real_gpa == UNMAPPED_GVA) | ||
262 | return 0; | ||
263 | |||
264 | walker->gfn = real_gpa >> PAGE_SHIFT; | ||
218 | 265 | ||
219 | break; | 266 | break; |
220 | } | 267 | } |
@@ -223,17 +270,21 @@ walk: | |||
223 | --walker->level; | 270 | --walker->level; |
224 | } | 271 | } |
225 | 272 | ||
226 | if (!present || eperm || rsvd_fault) | 273 | if (unlikely(!present || eperm || rsvd_fault)) |
227 | goto error; | 274 | goto error; |
228 | 275 | ||
229 | if (write_fault && !is_dirty_gpte(pte)) { | 276 | if (write_fault && unlikely(!is_dirty_gpte(pte))) { |
230 | bool ret; | 277 | int ret; |
231 | 278 | ||
232 | trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); | 279 | trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); |
233 | ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte, | 280 | ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, |
234 | pte|PT_DIRTY_MASK); | 281 | pte, pte|PT_DIRTY_MASK); |
235 | if (ret) | 282 | if (unlikely(ret < 0)) { |
283 | present = false; | ||
284 | goto error; | ||
285 | } else if (ret) | ||
236 | goto walk; | 286 | goto walk; |
287 | |||
237 | mark_page_dirty(vcpu->kvm, table_gfn); | 288 | mark_page_dirty(vcpu->kvm, table_gfn); |
238 | pte |= PT_DIRTY_MASK; | 289 | pte |= PT_DIRTY_MASK; |
239 | walker->ptes[walker->level - 1] = pte; | 290 | walker->ptes[walker->level - 1] = pte; |
@@ -246,52 +297,87 @@ walk: | |||
246 | return 1; | 297 | return 1; |
247 | 298 | ||
248 | error: | 299 | error: |
249 | walker->error_code = 0; | 300 | walker->fault.vector = PF_VECTOR; |
301 | walker->fault.error_code_valid = true; | ||
302 | walker->fault.error_code = 0; | ||
250 | if (present) | 303 | if (present) |
251 | walker->error_code |= PFERR_PRESENT_MASK; | 304 | walker->fault.error_code |= PFERR_PRESENT_MASK; |
252 | if (write_fault) | 305 | |
253 | walker->error_code |= PFERR_WRITE_MASK; | 306 | walker->fault.error_code |= write_fault | user_fault; |
254 | if (user_fault) | 307 | |
255 | walker->error_code |= PFERR_USER_MASK; | 308 | if (fetch_fault && mmu->nx) |
256 | if (fetch_fault && is_nx(vcpu)) | 309 | walker->fault.error_code |= PFERR_FETCH_MASK; |
257 | walker->error_code |= PFERR_FETCH_MASK; | ||
258 | if (rsvd_fault) | 310 | if (rsvd_fault) |
259 | walker->error_code |= PFERR_RSVD_MASK; | 311 | walker->fault.error_code |= PFERR_RSVD_MASK; |
260 | trace_kvm_mmu_walker_error(walker->error_code); | 312 | |
313 | walker->fault.address = addr; | ||
314 | walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu; | ||
315 | |||
316 | trace_kvm_mmu_walker_error(walker->fault.error_code); | ||
261 | return 0; | 317 | return 0; |
262 | } | 318 | } |
263 | 319 | ||
320 | static int FNAME(walk_addr)(struct guest_walker *walker, | ||
321 | struct kvm_vcpu *vcpu, gva_t addr, u32 access) | ||
322 | { | ||
323 | return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.mmu, addr, | ||
324 | access); | ||
325 | } | ||
326 | |||
327 | static int FNAME(walk_addr_nested)(struct guest_walker *walker, | ||
328 | struct kvm_vcpu *vcpu, gva_t addr, | ||
329 | u32 access) | ||
330 | { | ||
331 | return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu, | ||
332 | addr, access); | ||
333 | } | ||
334 | |||
335 | static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu, | ||
336 | struct kvm_mmu_page *sp, u64 *spte, | ||
337 | pt_element_t gpte) | ||
338 | { | ||
339 | u64 nonpresent = shadow_trap_nonpresent_pte; | ||
340 | |||
341 | if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) | ||
342 | goto no_present; | ||
343 | |||
344 | if (!is_present_gpte(gpte)) { | ||
345 | if (!sp->unsync) | ||
346 | nonpresent = shadow_notrap_nonpresent_pte; | ||
347 | goto no_present; | ||
348 | } | ||
349 | |||
350 | if (!(gpte & PT_ACCESSED_MASK)) | ||
351 | goto no_present; | ||
352 | |||
353 | return false; | ||
354 | |||
355 | no_present: | ||
356 | drop_spte(vcpu->kvm, spte, nonpresent); | ||
357 | return true; | ||
358 | } | ||
359 | |||
264 | static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, | 360 | static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, |
265 | u64 *spte, const void *pte) | 361 | u64 *spte, const void *pte) |
266 | { | 362 | { |
267 | pt_element_t gpte; | 363 | pt_element_t gpte; |
268 | unsigned pte_access; | 364 | unsigned pte_access; |
269 | pfn_t pfn; | 365 | pfn_t pfn; |
270 | u64 new_spte; | ||
271 | 366 | ||
272 | gpte = *(const pt_element_t *)pte; | 367 | gpte = *(const pt_element_t *)pte; |
273 | if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) { | 368 | if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) |
274 | if (!is_present_gpte(gpte)) { | ||
275 | if (sp->unsync) | ||
276 | new_spte = shadow_trap_nonpresent_pte; | ||
277 | else | ||
278 | new_spte = shadow_notrap_nonpresent_pte; | ||
279 | __set_spte(spte, new_spte); | ||
280 | } | ||
281 | return; | 369 | return; |
282 | } | 370 | |
283 | pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); | 371 | pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); |
284 | pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); | 372 | pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); |
285 | if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn) | 373 | pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte)); |
374 | if (is_error_pfn(pfn)) { | ||
375 | kvm_release_pfn_clean(pfn); | ||
286 | return; | 376 | return; |
287 | pfn = vcpu->arch.update_pte.pfn; | 377 | } |
288 | if (is_error_pfn(pfn)) | 378 | |
289 | return; | ||
290 | if (mmu_notifier_retry(vcpu, vcpu->arch.update_pte.mmu_seq)) | ||
291 | return; | ||
292 | kvm_get_pfn(pfn); | ||
293 | /* | 379 | /* |
294 | * we call mmu_set_spte() with reset_host_protection = true beacuse that | 380 | * we call mmu_set_spte() with host_writable = true because that |
295 | * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1). | 381 | * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1). |
296 | */ | 382 | */ |
297 | mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, | 383 | mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, |
@@ -302,21 +388,87 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, | |||
302 | static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu, | 388 | static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu, |
303 | struct guest_walker *gw, int level) | 389 | struct guest_walker *gw, int level) |
304 | { | 390 | { |
305 | int r; | ||
306 | pt_element_t curr_pte; | 391 | pt_element_t curr_pte; |
307 | 392 | gpa_t base_gpa, pte_gpa = gw->pte_gpa[level - 1]; | |
308 | r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 1], | 393 | u64 mask; |
394 | int r, index; | ||
395 | |||
396 | if (level == PT_PAGE_TABLE_LEVEL) { | ||
397 | mask = PTE_PREFETCH_NUM * sizeof(pt_element_t) - 1; | ||
398 | base_gpa = pte_gpa & ~mask; | ||
399 | index = (pte_gpa - base_gpa) / sizeof(pt_element_t); | ||
400 | |||
401 | r = kvm_read_guest_atomic(vcpu->kvm, base_gpa, | ||
402 | gw->prefetch_ptes, sizeof(gw->prefetch_ptes)); | ||
403 | curr_pte = gw->prefetch_ptes[index]; | ||
404 | } else | ||
405 | r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, | ||
309 | &curr_pte, sizeof(curr_pte)); | 406 | &curr_pte, sizeof(curr_pte)); |
407 | |||
310 | return r || curr_pte != gw->ptes[level - 1]; | 408 | return r || curr_pte != gw->ptes[level - 1]; |
311 | } | 409 | } |
312 | 410 | ||
411 | static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, | ||
412 | u64 *sptep) | ||
413 | { | ||
414 | struct kvm_mmu_page *sp; | ||
415 | pt_element_t *gptep = gw->prefetch_ptes; | ||
416 | u64 *spte; | ||
417 | int i; | ||
418 | |||
419 | sp = page_header(__pa(sptep)); | ||
420 | |||
421 | if (sp->role.level > PT_PAGE_TABLE_LEVEL) | ||
422 | return; | ||
423 | |||
424 | if (sp->role.direct) | ||
425 | return __direct_pte_prefetch(vcpu, sp, sptep); | ||
426 | |||
427 | i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1); | ||
428 | spte = sp->spt + i; | ||
429 | |||
430 | for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) { | ||
431 | pt_element_t gpte; | ||
432 | unsigned pte_access; | ||
433 | gfn_t gfn; | ||
434 | pfn_t pfn; | ||
435 | bool dirty; | ||
436 | |||
437 | if (spte == sptep) | ||
438 | continue; | ||
439 | |||
440 | if (*spte != shadow_trap_nonpresent_pte) | ||
441 | continue; | ||
442 | |||
443 | gpte = gptep[i]; | ||
444 | |||
445 | if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) | ||
446 | continue; | ||
447 | |||
448 | pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); | ||
449 | gfn = gpte_to_gfn(gpte); | ||
450 | dirty = is_dirty_gpte(gpte); | ||
451 | pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, | ||
452 | (pte_access & ACC_WRITE_MASK) && dirty); | ||
453 | if (is_error_pfn(pfn)) { | ||
454 | kvm_release_pfn_clean(pfn); | ||
455 | break; | ||
456 | } | ||
457 | |||
458 | mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, | ||
459 | dirty, NULL, PT_PAGE_TABLE_LEVEL, gfn, | ||
460 | pfn, true, true); | ||
461 | } | ||
462 | } | ||
463 | |||
313 | /* | 464 | /* |
314 | * Fetch a shadow pte for a specific level in the paging hierarchy. | 465 | * Fetch a shadow pte for a specific level in the paging hierarchy. |
315 | */ | 466 | */ |
316 | static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, | 467 | static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, |
317 | struct guest_walker *gw, | 468 | struct guest_walker *gw, |
318 | int user_fault, int write_fault, int hlevel, | 469 | int user_fault, int write_fault, int hlevel, |
319 | int *ptwrite, pfn_t pfn) | 470 | int *ptwrite, pfn_t pfn, bool map_writable, |
471 | bool prefault) | ||
320 | { | 472 | { |
321 | unsigned access = gw->pt_access; | 473 | unsigned access = gw->pt_access; |
322 | struct kvm_mmu_page *sp = NULL; | 474 | struct kvm_mmu_page *sp = NULL; |
@@ -390,7 +542,8 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, | |||
390 | 542 | ||
391 | mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access, | 543 | mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access, |
392 | user_fault, write_fault, dirty, ptwrite, it.level, | 544 | user_fault, write_fault, dirty, ptwrite, it.level, |
393 | gw->gfn, pfn, false, true); | 545 | gw->gfn, pfn, prefault, map_writable); |
546 | FNAME(pte_prefetch)(vcpu, gw, it.sptep); | ||
394 | 547 | ||
395 | return it.sptep; | 548 | return it.sptep; |
396 | 549 | ||
@@ -415,22 +568,22 @@ out_gpte_changed: | |||
415 | * Returns: 1 if we need to emulate the instruction, 0 otherwise, or | 568 | * Returns: 1 if we need to emulate the instruction, 0 otherwise, or |
416 | * a negative value on error. | 569 | * a negative value on error. |
417 | */ | 570 | */ |
418 | static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, | 571 | static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, |
419 | u32 error_code) | 572 | bool prefault) |
420 | { | 573 | { |
421 | int write_fault = error_code & PFERR_WRITE_MASK; | 574 | int write_fault = error_code & PFERR_WRITE_MASK; |
422 | int user_fault = error_code & PFERR_USER_MASK; | 575 | int user_fault = error_code & PFERR_USER_MASK; |
423 | int fetch_fault = error_code & PFERR_FETCH_MASK; | ||
424 | struct guest_walker walker; | 576 | struct guest_walker walker; |
425 | u64 *sptep; | 577 | u64 *sptep; |
426 | int write_pt = 0; | 578 | int write_pt = 0; |
427 | int r; | 579 | int r; |
428 | pfn_t pfn; | 580 | pfn_t pfn; |
429 | int level = PT_PAGE_TABLE_LEVEL; | 581 | int level = PT_PAGE_TABLE_LEVEL; |
582 | int force_pt_level; | ||
430 | unsigned long mmu_seq; | 583 | unsigned long mmu_seq; |
584 | bool map_writable; | ||
431 | 585 | ||
432 | pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); | 586 | pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); |
433 | kvm_mmu_audit(vcpu, "pre page fault"); | ||
434 | 587 | ||
435 | r = mmu_topup_memory_caches(vcpu); | 588 | r = mmu_topup_memory_caches(vcpu); |
436 | if (r) | 589 | if (r) |
@@ -439,27 +592,36 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, | |||
439 | /* | 592 | /* |
440 | * Look up the guest pte for the faulting address. | 593 | * Look up the guest pte for the faulting address. |
441 | */ | 594 | */ |
442 | r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault, | 595 | r = FNAME(walk_addr)(&walker, vcpu, addr, error_code); |
443 | fetch_fault); | ||
444 | 596 | ||
445 | /* | 597 | /* |
446 | * The page is not mapped by the guest. Let the guest handle it. | 598 | * The page is not mapped by the guest. Let the guest handle it. |
447 | */ | 599 | */ |
448 | if (!r) { | 600 | if (!r) { |
449 | pgprintk("%s: guest page fault\n", __func__); | 601 | pgprintk("%s: guest page fault\n", __func__); |
450 | inject_page_fault(vcpu, addr, walker.error_code); | 602 | if (!prefault) { |
451 | vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ | 603 | inject_page_fault(vcpu, &walker.fault); |
604 | /* reset fork detector */ | ||
605 | vcpu->arch.last_pt_write_count = 0; | ||
606 | } | ||
452 | return 0; | 607 | return 0; |
453 | } | 608 | } |
454 | 609 | ||
455 | if (walker.level >= PT_DIRECTORY_LEVEL) { | 610 | if (walker.level >= PT_DIRECTORY_LEVEL) |
611 | force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn); | ||
612 | else | ||
613 | force_pt_level = 1; | ||
614 | if (!force_pt_level) { | ||
456 | level = min(walker.level, mapping_level(vcpu, walker.gfn)); | 615 | level = min(walker.level, mapping_level(vcpu, walker.gfn)); |
457 | walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); | 616 | walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); |
458 | } | 617 | } |
459 | 618 | ||
460 | mmu_seq = vcpu->kvm->mmu_notifier_seq; | 619 | mmu_seq = vcpu->kvm->mmu_notifier_seq; |
461 | smp_rmb(); | 620 | smp_rmb(); |
462 | pfn = gfn_to_pfn(vcpu->kvm, walker.gfn); | 621 | |
622 | if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault, | ||
623 | &map_writable)) | ||
624 | return 0; | ||
463 | 625 | ||
464 | /* mmio */ | 626 | /* mmio */ |
465 | if (is_error_pfn(pfn)) | 627 | if (is_error_pfn(pfn)) |
@@ -468,9 +630,13 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, | |||
468 | spin_lock(&vcpu->kvm->mmu_lock); | 630 | spin_lock(&vcpu->kvm->mmu_lock); |
469 | if (mmu_notifier_retry(vcpu, mmu_seq)) | 631 | if (mmu_notifier_retry(vcpu, mmu_seq)) |
470 | goto out_unlock; | 632 | goto out_unlock; |
633 | |||
634 | trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); | ||
471 | kvm_mmu_free_some_pages(vcpu); | 635 | kvm_mmu_free_some_pages(vcpu); |
636 | if (!force_pt_level) | ||
637 | transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level); | ||
472 | sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, | 638 | sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, |
473 | level, &write_pt, pfn); | 639 | level, &write_pt, pfn, map_writable, prefault); |
474 | (void)sptep; | 640 | (void)sptep; |
475 | pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__, | 641 | pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__, |
476 | sptep, *sptep, write_pt); | 642 | sptep, *sptep, write_pt); |
@@ -479,7 +645,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, | |||
479 | vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ | 645 | vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ |
480 | 646 | ||
481 | ++vcpu->stat.pf_fixed; | 647 | ++vcpu->stat.pf_fixed; |
482 | kvm_mmu_audit(vcpu, "post page fault (fixed)"); | 648 | trace_kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); |
483 | spin_unlock(&vcpu->kvm->mmu_lock); | 649 | spin_unlock(&vcpu->kvm->mmu_lock); |
484 | 650 | ||
485 | return write_pt; | 651 | return write_pt; |
@@ -550,22 +716,38 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) | |||
550 | } | 716 | } |
551 | 717 | ||
552 | static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, | 718 | static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, |
553 | u32 *error) | 719 | struct x86_exception *exception) |
720 | { | ||
721 | struct guest_walker walker; | ||
722 | gpa_t gpa = UNMAPPED_GVA; | ||
723 | int r; | ||
724 | |||
725 | r = FNAME(walk_addr)(&walker, vcpu, vaddr, access); | ||
726 | |||
727 | if (r) { | ||
728 | gpa = gfn_to_gpa(walker.gfn); | ||
729 | gpa |= vaddr & ~PAGE_MASK; | ||
730 | } else if (exception) | ||
731 | *exception = walker.fault; | ||
732 | |||
733 | return gpa; | ||
734 | } | ||
735 | |||
736 | static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, | ||
737 | u32 access, | ||
738 | struct x86_exception *exception) | ||
554 | { | 739 | { |
555 | struct guest_walker walker; | 740 | struct guest_walker walker; |
556 | gpa_t gpa = UNMAPPED_GVA; | 741 | gpa_t gpa = UNMAPPED_GVA; |
557 | int r; | 742 | int r; |
558 | 743 | ||
559 | r = FNAME(walk_addr)(&walker, vcpu, vaddr, | 744 | r = FNAME(walk_addr_nested)(&walker, vcpu, vaddr, access); |
560 | !!(access & PFERR_WRITE_MASK), | ||
561 | !!(access & PFERR_USER_MASK), | ||
562 | !!(access & PFERR_FETCH_MASK)); | ||
563 | 745 | ||
564 | if (r) { | 746 | if (r) { |
565 | gpa = gfn_to_gpa(walker.gfn); | 747 | gpa = gfn_to_gpa(walker.gfn); |
566 | gpa |= vaddr & ~PAGE_MASK; | 748 | gpa |= vaddr & ~PAGE_MASK; |
567 | } else if (error) | 749 | } else if (exception) |
568 | *error = walker.error_code; | 750 | *exception = walker.fault; |
569 | 751 | ||
570 | return gpa; | 752 | return gpa; |
571 | } | 753 | } |
@@ -604,12 +786,19 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu, | |||
604 | * Using the cached information from sp->gfns is safe because: | 786 | * Using the cached information from sp->gfns is safe because: |
605 | * - The spte has a reference to the struct page, so the pfn for a given gfn | 787 | * - The spte has a reference to the struct page, so the pfn for a given gfn |
606 | * can't change unless all sptes pointing to it are nuked first. | 788 | * can't change unless all sptes pointing to it are nuked first. |
789 | * | ||
790 | * Note: | ||
791 | * We should flush all tlbs if a spte is dropped, even though the guest is | ||
792 | * responsible for it. If we do not, kvm_mmu_notifier_invalidate_page and | ||
793 | * kvm_mmu_notifier_invalidate_range_start may decide the mapped page is no | ||
794 | * longer used by the guest and skip the tlb flush, leaving the guest able | ||
795 | * to access the freed pages. | ||
796 | * We increase kvm->tlbs_dirty to delay the tlb flush in this case. | ||
607 | */ | 797 | */ |
608 | static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, | 798 | static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) |
609 | bool clear_unsync) | ||
610 | { | 799 | { |
611 | int i, offset, nr_present; | 800 | int i, offset, nr_present; |
612 | bool reset_host_protection; | 801 | bool host_writable; |
613 | gpa_t first_pte_gpa; | 802 | gpa_t first_pte_gpa; |
614 | 803 | ||
615 | offset = nr_present = 0; | 804 | offset = nr_present = 0; |
@@ -638,31 +827,27 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, | |||
638 | return -EINVAL; | 827 | return -EINVAL; |
639 | 828 | ||
640 | gfn = gpte_to_gfn(gpte); | 829 | gfn = gpte_to_gfn(gpte); |
641 | if (is_rsvd_bits_set(vcpu, gpte, PT_PAGE_TABLE_LEVEL) | ||
642 | || gfn != sp->gfns[i] || !is_present_gpte(gpte) | ||
643 | || !(gpte & PT_ACCESSED_MASK)) { | ||
644 | u64 nonpresent; | ||
645 | 830 | ||
646 | if (is_present_gpte(gpte) || !clear_unsync) | 831 | if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) { |
647 | nonpresent = shadow_trap_nonpresent_pte; | 832 | vcpu->kvm->tlbs_dirty++; |
648 | else | 833 | continue; |
649 | nonpresent = shadow_notrap_nonpresent_pte; | 834 | } |
650 | drop_spte(vcpu->kvm, &sp->spt[i], nonpresent); | 835 | |
836 | if (gfn != sp->gfns[i]) { | ||
837 | drop_spte(vcpu->kvm, &sp->spt[i], | ||
838 | shadow_trap_nonpresent_pte); | ||
839 | vcpu->kvm->tlbs_dirty++; | ||
651 | continue; | 840 | continue; |
652 | } | 841 | } |
653 | 842 | ||
654 | nr_present++; | 843 | nr_present++; |
655 | pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); | 844 | pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); |
656 | if (!(sp->spt[i] & SPTE_HOST_WRITEABLE)) { | 845 | host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE; |
657 | pte_access &= ~ACC_WRITE_MASK; | 846 | |
658 | reset_host_protection = 0; | ||
659 | } else { | ||
660 | reset_host_protection = 1; | ||
661 | } | ||
662 | set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, | 847 | set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, |
663 | is_dirty_gpte(gpte), PT_PAGE_TABLE_LEVEL, gfn, | 848 | is_dirty_gpte(gpte), PT_PAGE_TABLE_LEVEL, gfn, |
664 | spte_to_pfn(sp->spt[i]), true, false, | 849 | spte_to_pfn(sp->spt[i]), true, false, |
665 | reset_host_protection); | 850 | host_writable); |
666 | } | 851 | } |
667 | 852 | ||
668 | return !nr_present; | 853 | return !nr_present; |
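The note above boils down to a simple batching pattern: paths that drop a stale spte bump kvm->tlbs_dirty instead of flushing right away, and the notifier paths flush once if anything was deferred. The standalone toy below only illustrates that batching idea; the struct and function names (toy_kvm, drop_stale_spte, invalidate_page) are invented for the example and are not the kernel's.

```c
/* Toy model of the deferred-flush idea described in the comment above. */
#include <stdio.h>

struct toy_kvm {
	long tlbs_dirty;	/* drops whose flush was deferred */
	long flushes;		/* actual flushes performed */
};

static void drop_stale_spte(struct toy_kvm *kvm)
{
	/* ... zap the shadow PTE here ... */
	kvm->tlbs_dirty++;	/* defer the TLB flush */
}

static void invalidate_page(struct toy_kvm *kvm)
{
	if (kvm->tlbs_dirty) {	/* someone deferred a flush */
		kvm->flushes++;
		kvm->tlbs_dirty = 0;
	}
}

int main(void)
{
	struct toy_kvm kvm = { 0, 0 };

	drop_stale_spte(&kvm);
	drop_stale_spte(&kvm);
	invalidate_page(&kvm);	/* one flush covers both drops */
	printf("flushes = %ld\n", kvm.flushes);	/* prints 1 */
	return 0;
}
```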
@@ -673,7 +858,6 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, | |||
673 | #undef FNAME | 858 | #undef FNAME |
674 | #undef PT_BASE_ADDR_MASK | 859 | #undef PT_BASE_ADDR_MASK |
675 | #undef PT_INDEX | 860 | #undef PT_INDEX |
676 | #undef PT_LEVEL_MASK | ||
677 | #undef PT_LVL_ADDR_MASK | 861 | #undef PT_LVL_ADDR_MASK |
678 | #undef PT_LVL_OFFSET_MASK | 862 | #undef PT_LVL_OFFSET_MASK |
679 | #undef PT_LEVEL_BITS | 863 | #undef PT_LEVEL_BITS |
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 8a3f9f64f86f..506e4fe23adc 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c | |||
@@ -4,7 +4,7 @@ | |||
4 | * AMD SVM support | 4 | * AMD SVM support |
5 | * | 5 | * |
6 | * Copyright (C) 2006 Qumranet, Inc. | 6 | * Copyright (C) 2006 Qumranet, Inc. |
7 | * Copyright 2010 Red Hat, Inc. and/or its affilates. | 7 | * Copyright 2010 Red Hat, Inc. and/or its affiliates. |
8 | * | 8 | * |
9 | * Authors: | 9 | * Authors: |
10 | * Yaniv Kamay <yaniv@qumranet.com> | 10 | * Yaniv Kamay <yaniv@qumranet.com> |
@@ -31,6 +31,7 @@ | |||
31 | 31 | ||
32 | #include <asm/tlbflush.h> | 32 | #include <asm/tlbflush.h> |
33 | #include <asm/desc.h> | 33 | #include <asm/desc.h> |
34 | #include <asm/kvm_para.h> | ||
34 | 35 | ||
35 | #include <asm/virtext.h> | 36 | #include <asm/virtext.h> |
36 | #include "trace.h" | 37 | #include "trace.h" |
@@ -50,6 +51,10 @@ MODULE_LICENSE("GPL"); | |||
50 | #define SVM_FEATURE_LBRV (1 << 1) | 51 | #define SVM_FEATURE_LBRV (1 << 1) |
51 | #define SVM_FEATURE_SVML (1 << 2) | 52 | #define SVM_FEATURE_SVML (1 << 2) |
52 | #define SVM_FEATURE_NRIP (1 << 3) | 53 | #define SVM_FEATURE_NRIP (1 << 3) |
54 | #define SVM_FEATURE_TSC_RATE (1 << 4) | ||
55 | #define SVM_FEATURE_VMCB_CLEAN (1 << 5) | ||
56 | #define SVM_FEATURE_FLUSH_ASID (1 << 6) | ||
57 | #define SVM_FEATURE_DECODE_ASSIST (1 << 7) | ||
53 | #define SVM_FEATURE_PAUSE_FILTER (1 << 10) | 58 | #define SVM_FEATURE_PAUSE_FILTER (1 << 10) |
54 | 59 | ||
55 | #define NESTED_EXIT_HOST 0 /* Exit handled on host level */ | 60 | #define NESTED_EXIT_HOST 0 /* Exit handled on host level */ |
@@ -58,6 +63,10 @@ MODULE_LICENSE("GPL"); | |||
58 | 63 | ||
59 | #define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) | 64 | #define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) |
60 | 65 | ||
66 | #define TSC_RATIO_RSVD 0xffffff0000000000ULL | ||
67 | #define TSC_RATIO_MIN 0x0000000000000001ULL | ||
68 | #define TSC_RATIO_MAX 0x000000ffffffffffULL | ||
69 | |||
61 | static bool erratum_383_found __read_mostly; | 70 | static bool erratum_383_found __read_mostly; |
62 | 71 | ||
63 | static const u32 host_save_user_msrs[] = { | 72 | static const u32 host_save_user_msrs[] = { |
@@ -89,13 +98,13 @@ struct nested_state { | |||
89 | bool exit_required; | 98 | bool exit_required; |
90 | 99 | ||
91 | /* cache for intercepts of the guest */ | 100 | /* cache for intercepts of the guest */ |
92 | u16 intercept_cr_read; | 101 | u32 intercept_cr; |
93 | u16 intercept_cr_write; | 102 | u32 intercept_dr; |
94 | u16 intercept_dr_read; | ||
95 | u16 intercept_dr_write; | ||
96 | u32 intercept_exceptions; | 103 | u32 intercept_exceptions; |
97 | u64 intercept; | 104 | u64 intercept; |
98 | 105 | ||
106 | /* Nested Paging related state */ | ||
107 | u64 nested_cr3; | ||
99 | }; | 108 | }; |
100 | 109 | ||
101 | #define MSRPM_OFFSETS 16 | 110 | #define MSRPM_OFFSETS 16 |
@@ -113,18 +122,31 @@ struct vcpu_svm { | |||
113 | u64 next_rip; | 122 | u64 next_rip; |
114 | 123 | ||
115 | u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS]; | 124 | u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS]; |
116 | u64 host_gs_base; | 125 | struct { |
126 | u16 fs; | ||
127 | u16 gs; | ||
128 | u16 ldt; | ||
129 | u64 gs_base; | ||
130 | } host; | ||
117 | 131 | ||
118 | u32 *msrpm; | 132 | u32 *msrpm; |
119 | 133 | ||
134 | ulong nmi_iret_rip; | ||
135 | |||
120 | struct nested_state nested; | 136 | struct nested_state nested; |
121 | 137 | ||
122 | bool nmi_singlestep; | 138 | bool nmi_singlestep; |
123 | 139 | ||
124 | unsigned int3_injected; | 140 | unsigned int3_injected; |
125 | unsigned long int3_rip; | 141 | unsigned long int3_rip; |
142 | u32 apf_reason; | ||
143 | |||
144 | u64 tsc_ratio; | ||
126 | }; | 145 | }; |
127 | 146 | ||
147 | static DEFINE_PER_CPU(u64, current_tsc_ratio); | ||
148 | #define TSC_RATIO_DEFAULT 0x0100000000ULL | ||
149 | |||
128 | #define MSR_INVALID 0xffffffffU | 150 | #define MSR_INVALID 0xffffffffU |
129 | 151 | ||
130 | static struct svm_direct_access_msrs { | 152 | static struct svm_direct_access_msrs { |
@@ -169,15 +191,153 @@ static int nested_svm_intercept(struct vcpu_svm *svm); | |||
169 | static int nested_svm_vmexit(struct vcpu_svm *svm); | 191 | static int nested_svm_vmexit(struct vcpu_svm *svm); |
170 | static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, | 192 | static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, |
171 | bool has_error_code, u32 error_code); | 193 | bool has_error_code, u32 error_code); |
194 | static u64 __scale_tsc(u64 ratio, u64 tsc); | ||
195 | |||
196 | enum { | ||
197 | VMCB_INTERCEPTS, /* Intercept vectors, TSC offset, | ||
198 | pause filter count */ | ||
199 | VMCB_PERM_MAP, /* IOPM Base and MSRPM Base */ | ||
200 | VMCB_ASID, /* ASID */ | ||
201 | VMCB_INTR, /* int_ctl, int_vector */ | ||
202 | VMCB_NPT, /* npt_en, nCR3, gPAT */ | ||
203 | VMCB_CR, /* CR0, CR3, CR4, EFER */ | ||
204 | VMCB_DR, /* DR6, DR7 */ | ||
205 | VMCB_DT, /* GDT, IDT */ | ||
206 | VMCB_SEG, /* CS, DS, SS, ES, CPL */ | ||
207 | VMCB_CR2, /* CR2 only */ | ||
208 | VMCB_LBR, /* DBGCTL, BR_FROM, BR_TO, LAST_EX_FROM, LAST_EX_TO */ | ||
209 | VMCB_DIRTY_MAX, | ||
210 | }; | ||
211 | |||
212 | /* TPR and CR2 are always written before VMRUN */ | ||
213 | #define VMCB_ALWAYS_DIRTY_MASK ((1U << VMCB_INTR) | (1U << VMCB_CR2)) | ||
214 | |||
215 | static inline void mark_all_dirty(struct vmcb *vmcb) | ||
216 | { | ||
217 | vmcb->control.clean = 0; | ||
218 | } | ||
219 | |||
220 | static inline void mark_all_clean(struct vmcb *vmcb) | ||
221 | { | ||
222 | vmcb->control.clean = ((1 << VMCB_DIRTY_MAX) - 1) | ||
223 | & ~VMCB_ALWAYS_DIRTY_MASK; | ||
224 | } | ||
225 | |||
226 | static inline void mark_dirty(struct vmcb *vmcb, int bit) | ||
227 | { | ||
228 | vmcb->control.clean &= ~(1 << bit); | ||
229 | } | ||
172 | 230 | ||
173 | static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) | 231 | static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) |
174 | { | 232 | { |
175 | return container_of(vcpu, struct vcpu_svm, vcpu); | 233 | return container_of(vcpu, struct vcpu_svm, vcpu); |
176 | } | 234 | } |
177 | 235 | ||
178 | static inline bool is_nested(struct vcpu_svm *svm) | 236 | static void recalc_intercepts(struct vcpu_svm *svm) |
237 | { | ||
238 | struct vmcb_control_area *c, *h; | ||
239 | struct nested_state *g; | ||
240 | |||
241 | mark_dirty(svm->vmcb, VMCB_INTERCEPTS); | ||
242 | |||
243 | if (!is_guest_mode(&svm->vcpu)) | ||
244 | return; | ||
245 | |||
246 | c = &svm->vmcb->control; | ||
247 | h = &svm->nested.hsave->control; | ||
248 | g = &svm->nested; | ||
249 | |||
250 | c->intercept_cr = h->intercept_cr | g->intercept_cr; | ||
251 | c->intercept_dr = h->intercept_dr | g->intercept_dr; | ||
252 | c->intercept_exceptions = h->intercept_exceptions | g->intercept_exceptions; | ||
253 | c->intercept = h->intercept | g->intercept; | ||
254 | } | ||
255 | |||
256 | static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm) | ||
257 | { | ||
258 | if (is_guest_mode(&svm->vcpu)) | ||
259 | return svm->nested.hsave; | ||
260 | else | ||
261 | return svm->vmcb; | ||
262 | } | ||
263 | |||
264 | static inline void set_cr_intercept(struct vcpu_svm *svm, int bit) | ||
265 | { | ||
266 | struct vmcb *vmcb = get_host_vmcb(svm); | ||
267 | |||
268 | vmcb->control.intercept_cr |= (1U << bit); | ||
269 | |||
270 | recalc_intercepts(svm); | ||
271 | } | ||
272 | |||
273 | static inline void clr_cr_intercept(struct vcpu_svm *svm, int bit) | ||
274 | { | ||
275 | struct vmcb *vmcb = get_host_vmcb(svm); | ||
276 | |||
277 | vmcb->control.intercept_cr &= ~(1U << bit); | ||
278 | |||
279 | recalc_intercepts(svm); | ||
280 | } | ||
281 | |||
282 | static inline bool is_cr_intercept(struct vcpu_svm *svm, int bit) | ||
179 | { | 283 | { |
180 | return svm->nested.vmcb; | 284 | struct vmcb *vmcb = get_host_vmcb(svm); |
285 | |||
286 | return vmcb->control.intercept_cr & (1U << bit); | ||
287 | } | ||
288 | |||
289 | static inline void set_dr_intercept(struct vcpu_svm *svm, int bit) | ||
290 | { | ||
291 | struct vmcb *vmcb = get_host_vmcb(svm); | ||
292 | |||
293 | vmcb->control.intercept_dr |= (1U << bit); | ||
294 | |||
295 | recalc_intercepts(svm); | ||
296 | } | ||
297 | |||
298 | static inline void clr_dr_intercept(struct vcpu_svm *svm, int bit) | ||
299 | { | ||
300 | struct vmcb *vmcb = get_host_vmcb(svm); | ||
301 | |||
302 | vmcb->control.intercept_dr &= ~(1U << bit); | ||
303 | |||
304 | recalc_intercepts(svm); | ||
305 | } | ||
306 | |||
307 | static inline void set_exception_intercept(struct vcpu_svm *svm, int bit) | ||
308 | { | ||
309 | struct vmcb *vmcb = get_host_vmcb(svm); | ||
310 | |||
311 | vmcb->control.intercept_exceptions |= (1U << bit); | ||
312 | |||
313 | recalc_intercepts(svm); | ||
314 | } | ||
315 | |||
316 | static inline void clr_exception_intercept(struct vcpu_svm *svm, int bit) | ||
317 | { | ||
318 | struct vmcb *vmcb = get_host_vmcb(svm); | ||
319 | |||
320 | vmcb->control.intercept_exceptions &= ~(1U << bit); | ||
321 | |||
322 | recalc_intercepts(svm); | ||
323 | } | ||
324 | |||
325 | static inline void set_intercept(struct vcpu_svm *svm, int bit) | ||
326 | { | ||
327 | struct vmcb *vmcb = get_host_vmcb(svm); | ||
328 | |||
329 | vmcb->control.intercept |= (1ULL << bit); | ||
330 | |||
331 | recalc_intercepts(svm); | ||
332 | } | ||
333 | |||
334 | static inline void clr_intercept(struct vcpu_svm *svm, int bit) | ||
335 | { | ||
336 | struct vmcb *vmcb = get_host_vmcb(svm); | ||
337 | |||
338 | vmcb->control.intercept &= ~(1ULL << bit); | ||
339 | |||
340 | recalc_intercepts(svm); | ||
181 | } | 341 | } |
182 | 342 | ||
183 | static inline void enable_gif(struct vcpu_svm *svm) | 343 | static inline void enable_gif(struct vcpu_svm *svm) |
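The clean-bits machinery introduced above follows a small protocol: every time the host touches a piece of VMCB state it clears the matching clean bit via mark_dirty(), and after a VMRUN everything that was not touched is marked clean again so hardware with the VMCB_CLEAN feature can skip reloading it. Below is a standalone toy of that bookkeeping; the enum and VMCB_ALWAYS_DIRTY_MASK mirror the hunk above, while the main() flow is invented purely for illustration.

```c
/* Toy model of the VMCB clean-bit bookkeeping (not kernel code). */
#include <stdio.h>
#include <stdint.h>

enum { VMCB_INTERCEPTS, VMCB_PERM_MAP, VMCB_ASID, VMCB_INTR, VMCB_NPT,
       VMCB_CR, VMCB_DR, VMCB_DT, VMCB_SEG, VMCB_CR2, VMCB_LBR,
       VMCB_DIRTY_MAX };

/* TPR and CR2 are always written before VMRUN, so never marked clean. */
#define VMCB_ALWAYS_DIRTY_MASK ((1U << VMCB_INTR) | (1U << VMCB_CR2))

int main(void)
{
	uint32_t clean = 0;	/* fresh VMCB: everything must be loaded */

	/* after a VMRUN, state the host did not touch may be cached */
	clean = ((1U << VMCB_DIRTY_MAX) - 1) & ~VMCB_ALWAYS_DIRTY_MASK;

	clean &= ~(1U << VMCB_CR);	/* host then touched CR state */

	printf("CR must be reloaded: %s\n",
	       (clean & (1U << VMCB_CR)) ? "no" : "yes");	/* yes */
	return 0;
}
```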
@@ -218,7 +378,6 @@ struct svm_cpu_data { | |||
218 | }; | 378 | }; |
219 | 379 | ||
220 | static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data); | 380 | static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data); |
221 | static uint32_t svm_features; | ||
222 | 381 | ||
223 | struct svm_init_data { | 382 | struct svm_init_data { |
224 | int cpu; | 383 | int cpu; |
@@ -254,11 +413,6 @@ static u32 svm_msrpm_offset(u32 msr) | |||
254 | 413 | ||
255 | #define MAX_INST_SIZE 15 | 414 | #define MAX_INST_SIZE 15 |
256 | 415 | ||
257 | static inline u32 svm_has(u32 feat) | ||
258 | { | ||
259 | return svm_features & feat; | ||
260 | } | ||
261 | |||
262 | static inline void clgi(void) | 416 | static inline void clgi(void) |
263 | { | 417 | { |
264 | asm volatile (__ex(SVM_CLGI)); | 418 | asm volatile (__ex(SVM_CLGI)); |
@@ -274,14 +428,13 @@ static inline void invlpga(unsigned long addr, u32 asid) | |||
274 | asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid)); | 428 | asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid)); |
275 | } | 429 | } |
276 | 430 | ||
277 | static inline void force_new_asid(struct kvm_vcpu *vcpu) | 431 | static int get_npt_level(void) |
278 | { | ||
279 | to_svm(vcpu)->asid_generation--; | ||
280 | } | ||
281 | |||
282 | static inline void flush_guest_tlb(struct kvm_vcpu *vcpu) | ||
283 | { | 432 | { |
284 | force_new_asid(vcpu); | 433 | #ifdef CONFIG_X86_64 |
434 | return PT64_ROOT_LEVEL; | ||
435 | #else | ||
436 | return PT32E_ROOT_LEVEL; | ||
437 | #endif | ||
285 | } | 438 | } |
286 | 439 | ||
287 | static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) | 440 | static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) |
@@ -291,6 +444,7 @@ static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) | |||
291 | efer &= ~EFER_LME; | 444 | efer &= ~EFER_LME; |
292 | 445 | ||
293 | to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME; | 446 | to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME; |
447 | mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR); | ||
294 | } | 448 | } |
295 | 449 | ||
296 | static int is_external_interrupt(u32 info) | 450 | static int is_external_interrupt(u32 info) |
@@ -328,7 +482,7 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) | |||
328 | svm->next_rip = svm->vmcb->control.next_rip; | 482 | svm->next_rip = svm->vmcb->control.next_rip; |
329 | 483 | ||
330 | if (!svm->next_rip) { | 484 | if (!svm->next_rip) { |
331 | if (emulate_instruction(vcpu, 0, 0, EMULTYPE_SKIP) != | 485 | if (emulate_instruction(vcpu, EMULTYPE_SKIP) != |
332 | EMULATE_DONE) | 486 | EMULATE_DONE) |
333 | printk(KERN_DEBUG "%s: NOP\n", __func__); | 487 | printk(KERN_DEBUG "%s: NOP\n", __func__); |
334 | return; | 488 | return; |
@@ -355,7 +509,7 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, | |||
355 | nested_svm_check_exception(svm, nr, has_error_code, error_code)) | 509 | nested_svm_check_exception(svm, nr, has_error_code, error_code)) |
356 | return; | 510 | return; |
357 | 511 | ||
358 | if (nr == BP_VECTOR && !svm_has(SVM_FEATURE_NRIP)) { | 512 | if (nr == BP_VECTOR && !static_cpu_has(X86_FEATURE_NRIPS)) { |
359 | unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu); | 513 | unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu); |
360 | 514 | ||
361 | /* | 515 | /* |
@@ -416,6 +570,10 @@ static int has_svm(void) | |||
416 | 570 | ||
417 | static void svm_hardware_disable(void *garbage) | 571 | static void svm_hardware_disable(void *garbage) |
418 | { | 572 | { |
573 | /* Make sure we clean up behind us */ | ||
574 | if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) | ||
575 | wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT); | ||
576 | |||
419 | cpu_svm_disable(); | 577 | cpu_svm_disable(); |
420 | } | 578 | } |
421 | 579 | ||
@@ -457,6 +615,11 @@ static int svm_hardware_enable(void *garbage) | |||
457 | 615 | ||
458 | wrmsrl(MSR_VM_HSAVE_PA, page_to_pfn(sd->save_area) << PAGE_SHIFT); | 616 | wrmsrl(MSR_VM_HSAVE_PA, page_to_pfn(sd->save_area) << PAGE_SHIFT); |
459 | 617 | ||
618 | if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) { | ||
619 | wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT); | ||
620 | __get_cpu_var(current_tsc_ratio) = TSC_RATIO_DEFAULT; | ||
621 | } | ||
622 | |||
460 | svm_init_erratum_383(); | 623 | svm_init_erratum_383(); |
461 | 624 | ||
462 | return 0; | 625 | return 0; |
@@ -638,6 +801,23 @@ static __init int svm_hardware_setup(void) | |||
638 | if (boot_cpu_has(X86_FEATURE_FXSR_OPT)) | 801 | if (boot_cpu_has(X86_FEATURE_FXSR_OPT)) |
639 | kvm_enable_efer_bits(EFER_FFXSR); | 802 | kvm_enable_efer_bits(EFER_FFXSR); |
640 | 803 | ||
804 | if (boot_cpu_has(X86_FEATURE_TSCRATEMSR)) { | ||
805 | u64 max; | ||
806 | |||
807 | kvm_has_tsc_control = true; | ||
808 | |||
809 | /* | ||
810 | * Make sure the user can only configure tsc_khz values that | ||
811 | * fit into a signed integer. | ||
812 | * A minimum value is not needed because it will always be 1 on | ||
813 | * all machines, and a value of 0 is used to disable TSC scaling | ||
814 | * for the vcpu. | ||
815 | */ | ||
816 | max = min(0x7fffffffULL, __scale_tsc(tsc_khz, TSC_RATIO_MAX)); | ||
817 | |||
818 | kvm_max_guest_tsc_khz = max; | ||
819 | } | ||
820 | |||
641 | if (nested) { | 821 | if (nested) { |
642 | printk(KERN_INFO "kvm: Nested Virtualization enabled\n"); | 822 | printk(KERN_INFO "kvm: Nested Virtualization enabled\n"); |
643 | kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE); | 823 | kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE); |
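The kvm_max_guest_tsc_khz computation above scales the host tsc_khz by TSC_RATIO_MAX (an 8.32 fixed-point value just under 256) using the __scale_tsc() helper added further down, then clamps the result to fit a signed 32-bit integer. The standalone sketch below reproduces that arithmetic for an assumed 2 GHz host; the 2000000 kHz figure is only an example.

```c
/* Reproduce the max-guest-TSC computation for a hypothetical 2 GHz host. */
#include <stdio.h>
#include <stdint.h>

#define TSC_RATIO_MAX 0x000000ffffffffffULL	/* 8.32 fixed point, ~256.0 */

static uint64_t scale_tsc(uint64_t ratio, uint64_t tsc)
{
	uint64_t mult = ratio >> 32;
	uint64_t frac = ratio & 0xffffffffULL;

	return tsc * mult + (tsc >> 32) * frac +
	       (((tsc & 0xffffffffULL) * frac) >> 32);
}

int main(void)
{
	uint64_t tsc_khz = 2000000;	/* assumed host frequency, 2 GHz */
	uint64_t max = scale_tsc(tsc_khz, TSC_RATIO_MAX);

	if (max > 0x7fffffffULL)	/* keep it inside a signed int */
		max = 0x7fffffffULL;

	/* Prints roughly 511999999 kHz, i.e. about 256x the host rate. */
	printf("kvm_max_guest_tsc_khz ~= %llu\n", (unsigned long long)max);
	return 0;
}
```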
@@ -649,9 +829,7 @@ static __init int svm_hardware_setup(void) | |||
649 | goto err; | 829 | goto err; |
650 | } | 830 | } |
651 | 831 | ||
652 | svm_features = cpuid_edx(SVM_CPUID_FUNC); | 832 | if (!boot_cpu_has(X86_FEATURE_NPT)) |
653 | |||
654 | if (!svm_has(SVM_FEATURE_NPT)) | ||
655 | npt_enabled = false; | 833 | npt_enabled = false; |
656 | 834 | ||
657 | if (npt_enabled && !npt) { | 835 | if (npt_enabled && !npt) { |
@@ -701,68 +879,161 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type) | |||
701 | seg->base = 0; | 879 | seg->base = 0; |
702 | } | 880 | } |
703 | 881 | ||
882 | static u64 __scale_tsc(u64 ratio, u64 tsc) | ||
883 | { | ||
884 | u64 mult, frac, _tsc; | ||
885 | |||
886 | mult = ratio >> 32; | ||
887 | frac = ratio & ((1ULL << 32) - 1); | ||
888 | |||
889 | _tsc = tsc; | ||
890 | _tsc *= mult; | ||
891 | _tsc += (tsc >> 32) * frac; | ||
892 | _tsc += ((tsc & ((1ULL << 32) - 1)) * frac) >> 32; | ||
893 | |||
894 | return _tsc; | ||
895 | } | ||
896 | |||
897 | static u64 svm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc) | ||
898 | { | ||
899 | struct vcpu_svm *svm = to_svm(vcpu); | ||
900 | u64 _tsc = tsc; | ||
901 | |||
902 | if (svm->tsc_ratio != TSC_RATIO_DEFAULT) | ||
903 | _tsc = __scale_tsc(svm->tsc_ratio, tsc); | ||
904 | |||
905 | return _tsc; | ||
906 | } | ||
907 | |||
908 | static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz) | ||
909 | { | ||
910 | struct vcpu_svm *svm = to_svm(vcpu); | ||
911 | u64 ratio; | ||
912 | u64 khz; | ||
913 | |||
914 | /* TSC scaling supported? */ | ||
915 | if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) | ||
916 | return; | ||
917 | |||
918 | /* TSC-Scaling disabled or guest TSC same frequency as host TSC? */ | ||
919 | if (user_tsc_khz == 0) { | ||
920 | vcpu->arch.virtual_tsc_khz = 0; | ||
921 | svm->tsc_ratio = TSC_RATIO_DEFAULT; | ||
922 | return; | ||
923 | } | ||
924 | |||
925 | khz = user_tsc_khz; | ||
926 | |||
927 | /* TSC scaling required - calculate ratio */ | ||
928 | ratio = khz << 32; | ||
929 | do_div(ratio, tsc_khz); | ||
930 | |||
931 | if (ratio == 0 || ratio & TSC_RATIO_RSVD) { | ||
932 | WARN_ONCE(1, "Invalid TSC ratio - virtual-tsc-khz=%u\n", | ||
933 | user_tsc_khz); | ||
934 | return; | ||
935 | } | ||
936 | vcpu->arch.virtual_tsc_khz = user_tsc_khz; | ||
937 | svm->tsc_ratio = ratio; | ||
938 | } | ||
939 | |||
940 | static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) | ||
941 | { | ||
942 | struct vcpu_svm *svm = to_svm(vcpu); | ||
943 | u64 g_tsc_offset = 0; | ||
944 | |||
945 | if (is_guest_mode(vcpu)) { | ||
946 | g_tsc_offset = svm->vmcb->control.tsc_offset - | ||
947 | svm->nested.hsave->control.tsc_offset; | ||
948 | svm->nested.hsave->control.tsc_offset = offset; | ||
949 | } | ||
950 | |||
951 | svm->vmcb->control.tsc_offset = offset + g_tsc_offset; | ||
952 | |||
953 | mark_dirty(svm->vmcb, VMCB_INTERCEPTS); | ||
954 | } | ||
955 | |||
956 | static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment) | ||
957 | { | ||
958 | struct vcpu_svm *svm = to_svm(vcpu); | ||
959 | |||
960 | svm->vmcb->control.tsc_offset += adjustment; | ||
961 | if (is_guest_mode(vcpu)) | ||
962 | svm->nested.hsave->control.tsc_offset += adjustment; | ||
963 | mark_dirty(svm->vmcb, VMCB_INTERCEPTS); | ||
964 | } | ||
965 | |||
966 | static u64 svm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) | ||
967 | { | ||
968 | u64 tsc; | ||
969 | |||
970 | tsc = svm_scale_tsc(vcpu, native_read_tsc()); | ||
971 | |||
972 | return target_tsc - tsc; | ||
973 | } | ||
974 | |||
704 | static void init_vmcb(struct vcpu_svm *svm) | 975 | static void init_vmcb(struct vcpu_svm *svm) |
705 | { | 976 | { |
706 | struct vmcb_control_area *control = &svm->vmcb->control; | 977 | struct vmcb_control_area *control = &svm->vmcb->control; |
707 | struct vmcb_save_area *save = &svm->vmcb->save; | 978 | struct vmcb_save_area *save = &svm->vmcb->save; |
708 | 979 | ||
709 | svm->vcpu.fpu_active = 1; | 980 | svm->vcpu.fpu_active = 1; |
981 | svm->vcpu.arch.hflags = 0; | ||
710 | 982 | ||
711 | control->intercept_cr_read = INTERCEPT_CR0_MASK | | 983 | set_cr_intercept(svm, INTERCEPT_CR0_READ); |
712 | INTERCEPT_CR3_MASK | | 984 | set_cr_intercept(svm, INTERCEPT_CR3_READ); |
713 | INTERCEPT_CR4_MASK; | 985 | set_cr_intercept(svm, INTERCEPT_CR4_READ); |
714 | 986 | set_cr_intercept(svm, INTERCEPT_CR0_WRITE); | |
715 | control->intercept_cr_write = INTERCEPT_CR0_MASK | | 987 | set_cr_intercept(svm, INTERCEPT_CR3_WRITE); |
716 | INTERCEPT_CR3_MASK | | 988 | set_cr_intercept(svm, INTERCEPT_CR4_WRITE); |
717 | INTERCEPT_CR4_MASK | | 989 | set_cr_intercept(svm, INTERCEPT_CR8_WRITE); |
718 | INTERCEPT_CR8_MASK; | 990 | |
719 | 991 | set_dr_intercept(svm, INTERCEPT_DR0_READ); | |
720 | control->intercept_dr_read = INTERCEPT_DR0_MASK | | 992 | set_dr_intercept(svm, INTERCEPT_DR1_READ); |
721 | INTERCEPT_DR1_MASK | | 993 | set_dr_intercept(svm, INTERCEPT_DR2_READ); |
722 | INTERCEPT_DR2_MASK | | 994 | set_dr_intercept(svm, INTERCEPT_DR3_READ); |
723 | INTERCEPT_DR3_MASK | | 995 | set_dr_intercept(svm, INTERCEPT_DR4_READ); |
724 | INTERCEPT_DR4_MASK | | 996 | set_dr_intercept(svm, INTERCEPT_DR5_READ); |
725 | INTERCEPT_DR5_MASK | | 997 | set_dr_intercept(svm, INTERCEPT_DR6_READ); |
726 | INTERCEPT_DR6_MASK | | 998 | set_dr_intercept(svm, INTERCEPT_DR7_READ); |
727 | INTERCEPT_DR7_MASK; | 999 | |
728 | 1000 | set_dr_intercept(svm, INTERCEPT_DR0_WRITE); | |
729 | control->intercept_dr_write = INTERCEPT_DR0_MASK | | 1001 | set_dr_intercept(svm, INTERCEPT_DR1_WRITE); |
730 | INTERCEPT_DR1_MASK | | 1002 | set_dr_intercept(svm, INTERCEPT_DR2_WRITE); |
731 | INTERCEPT_DR2_MASK | | 1003 | set_dr_intercept(svm, INTERCEPT_DR3_WRITE); |
732 | INTERCEPT_DR3_MASK | | 1004 | set_dr_intercept(svm, INTERCEPT_DR4_WRITE); |
733 | INTERCEPT_DR4_MASK | | 1005 | set_dr_intercept(svm, INTERCEPT_DR5_WRITE); |
734 | INTERCEPT_DR5_MASK | | 1006 | set_dr_intercept(svm, INTERCEPT_DR6_WRITE); |
735 | INTERCEPT_DR6_MASK | | 1007 | set_dr_intercept(svm, INTERCEPT_DR7_WRITE); |
736 | INTERCEPT_DR7_MASK; | 1008 | |
737 | 1009 | set_exception_intercept(svm, PF_VECTOR); | |
738 | control->intercept_exceptions = (1 << PF_VECTOR) | | 1010 | set_exception_intercept(svm, UD_VECTOR); |
739 | (1 << UD_VECTOR) | | 1011 | set_exception_intercept(svm, MC_VECTOR); |
740 | (1 << MC_VECTOR); | 1012 | |
741 | 1013 | set_intercept(svm, INTERCEPT_INTR); | |
742 | 1014 | set_intercept(svm, INTERCEPT_NMI); | |
743 | control->intercept = (1ULL << INTERCEPT_INTR) | | 1015 | set_intercept(svm, INTERCEPT_SMI); |
744 | (1ULL << INTERCEPT_NMI) | | 1016 | set_intercept(svm, INTERCEPT_SELECTIVE_CR0); |
745 | (1ULL << INTERCEPT_SMI) | | 1017 | set_intercept(svm, INTERCEPT_CPUID); |
746 | (1ULL << INTERCEPT_SELECTIVE_CR0) | | 1018 | set_intercept(svm, INTERCEPT_INVD); |
747 | (1ULL << INTERCEPT_CPUID) | | 1019 | set_intercept(svm, INTERCEPT_HLT); |
748 | (1ULL << INTERCEPT_INVD) | | 1020 | set_intercept(svm, INTERCEPT_INVLPG); |
749 | (1ULL << INTERCEPT_HLT) | | 1021 | set_intercept(svm, INTERCEPT_INVLPGA); |
750 | (1ULL << INTERCEPT_INVLPG) | | 1022 | set_intercept(svm, INTERCEPT_IOIO_PROT); |
751 | (1ULL << INTERCEPT_INVLPGA) | | 1023 | set_intercept(svm, INTERCEPT_MSR_PROT); |
752 | (1ULL << INTERCEPT_IOIO_PROT) | | 1024 | set_intercept(svm, INTERCEPT_TASK_SWITCH); |
753 | (1ULL << INTERCEPT_MSR_PROT) | | 1025 | set_intercept(svm, INTERCEPT_SHUTDOWN); |
754 | (1ULL << INTERCEPT_TASK_SWITCH) | | 1026 | set_intercept(svm, INTERCEPT_VMRUN); |
755 | (1ULL << INTERCEPT_SHUTDOWN) | | 1027 | set_intercept(svm, INTERCEPT_VMMCALL); |
756 | (1ULL << INTERCEPT_VMRUN) | | 1028 | set_intercept(svm, INTERCEPT_VMLOAD); |
757 | (1ULL << INTERCEPT_VMMCALL) | | 1029 | set_intercept(svm, INTERCEPT_VMSAVE); |
758 | (1ULL << INTERCEPT_VMLOAD) | | 1030 | set_intercept(svm, INTERCEPT_STGI); |
759 | (1ULL << INTERCEPT_VMSAVE) | | 1031 | set_intercept(svm, INTERCEPT_CLGI); |
760 | (1ULL << INTERCEPT_STGI) | | 1032 | set_intercept(svm, INTERCEPT_SKINIT); |
761 | (1ULL << INTERCEPT_CLGI) | | 1033 | set_intercept(svm, INTERCEPT_WBINVD); |
762 | (1ULL << INTERCEPT_SKINIT) | | 1034 | set_intercept(svm, INTERCEPT_MONITOR); |
763 | (1ULL << INTERCEPT_WBINVD) | | 1035 | set_intercept(svm, INTERCEPT_MWAIT); |
764 | (1ULL << INTERCEPT_MONITOR) | | 1036 | set_intercept(svm, INTERCEPT_XSETBV); |
765 | (1ULL << INTERCEPT_MWAIT); | ||
766 | 1037 | ||
767 | control->iopm_base_pa = iopm_base; | 1038 | control->iopm_base_pa = iopm_base; |
768 | control->msrpm_base_pa = __pa(svm->msrpm); | 1039 | control->msrpm_base_pa = __pa(svm->msrpm); |
@@ -793,10 +1064,10 @@ static void init_vmcb(struct vcpu_svm *svm) | |||
793 | init_sys_seg(&save->ldtr, SEG_TYPE_LDT); | 1064 | init_sys_seg(&save->ldtr, SEG_TYPE_LDT); |
794 | init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16); | 1065 | init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16); |
795 | 1066 | ||
796 | save->efer = EFER_SVME; | 1067 | svm_set_efer(&svm->vcpu, 0); |
797 | save->dr6 = 0xffff0ff0; | 1068 | save->dr6 = 0xffff0ff0; |
798 | save->dr7 = 0x400; | 1069 | save->dr7 = 0x400; |
799 | save->rflags = 2; | 1070 | kvm_set_rflags(&svm->vcpu, 2); |
800 | save->rip = 0x0000fff0; | 1071 | save->rip = 0x0000fff0; |
801 | svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip; | 1072 | svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip; |
802 | 1073 | ||
@@ -804,8 +1075,8 @@ static void init_vmcb(struct vcpu_svm *svm) | |||
804 | * This is the guest-visible cr0 value. | 1075 | * This is the guest-visible cr0 value. |
805 | * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0. | 1076 | * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0. |
806 | */ | 1077 | */ |
807 | svm->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; | 1078 | svm->vcpu.arch.cr0 = 0; |
808 | (void)kvm_set_cr0(&svm->vcpu, svm->vcpu.arch.cr0); | 1079 | (void)kvm_set_cr0(&svm->vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET); |
809 | 1080 | ||
810 | save->cr4 = X86_CR4_PAE; | 1081 | save->cr4 = X86_CR4_PAE; |
811 | /* rdx = ?? */ | 1082 | /* rdx = ?? */ |
@@ -813,25 +1084,27 @@ static void init_vmcb(struct vcpu_svm *svm) | |||
813 | if (npt_enabled) { | 1084 | if (npt_enabled) { |
814 | /* Setup VMCB for Nested Paging */ | 1085 | /* Setup VMCB for Nested Paging */ |
815 | control->nested_ctl = 1; | 1086 | control->nested_ctl = 1; |
816 | control->intercept &= ~((1ULL << INTERCEPT_TASK_SWITCH) | | 1087 | clr_intercept(svm, INTERCEPT_TASK_SWITCH); |
817 | (1ULL << INTERCEPT_INVLPG)); | 1088 | clr_intercept(svm, INTERCEPT_INVLPG); |
818 | control->intercept_exceptions &= ~(1 << PF_VECTOR); | 1089 | clr_exception_intercept(svm, PF_VECTOR); |
819 | control->intercept_cr_read &= ~INTERCEPT_CR3_MASK; | 1090 | clr_cr_intercept(svm, INTERCEPT_CR3_READ); |
820 | control->intercept_cr_write &= ~INTERCEPT_CR3_MASK; | 1091 | clr_cr_intercept(svm, INTERCEPT_CR3_WRITE); |
821 | save->g_pat = 0x0007040600070406ULL; | 1092 | save->g_pat = 0x0007040600070406ULL; |
822 | save->cr3 = 0; | 1093 | save->cr3 = 0; |
823 | save->cr4 = 0; | 1094 | save->cr4 = 0; |
824 | } | 1095 | } |
825 | force_new_asid(&svm->vcpu); | 1096 | svm->asid_generation = 0; |
826 | 1097 | ||
827 | svm->nested.vmcb = 0; | 1098 | svm->nested.vmcb = 0; |
828 | svm->vcpu.arch.hflags = 0; | 1099 | svm->vcpu.arch.hflags = 0; |
829 | 1100 | ||
830 | if (svm_has(SVM_FEATURE_PAUSE_FILTER)) { | 1101 | if (boot_cpu_has(X86_FEATURE_PAUSEFILTER)) { |
831 | control->pause_filter_count = 3000; | 1102 | control->pause_filter_count = 3000; |
832 | control->intercept |= (1ULL << INTERCEPT_PAUSE); | 1103 | set_intercept(svm, INTERCEPT_PAUSE); |
833 | } | 1104 | } |
834 | 1105 | ||
1106 | mark_all_dirty(svm->vmcb); | ||
1107 | |||
835 | enable_gif(svm); | 1108 | enable_gif(svm); |
836 | } | 1109 | } |
837 | 1110 | ||
@@ -867,6 +1140,8 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) | |||
867 | goto out; | 1140 | goto out; |
868 | } | 1141 | } |
869 | 1142 | ||
1143 | svm->tsc_ratio = TSC_RATIO_DEFAULT; | ||
1144 | |||
870 | err = kvm_vcpu_init(&svm->vcpu, kvm, id); | 1145 | err = kvm_vcpu_init(&svm->vcpu, kvm, id); |
871 | if (err) | 1146 | if (err) |
872 | goto free_svm; | 1147 | goto free_svm; |
@@ -901,7 +1176,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) | |||
901 | svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; | 1176 | svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; |
902 | svm->asid_generation = 0; | 1177 | svm->asid_generation = 0; |
903 | init_vmcb(svm); | 1178 | init_vmcb(svm); |
904 | svm->vmcb->control.tsc_offset = 0-native_read_tsc(); | 1179 | kvm_write_tsc(&svm->vcpu, 0); |
905 | 1180 | ||
906 | err = fx_init(&svm->vcpu); | 1181 | err = fx_init(&svm->vcpu); |
907 | if (err) | 1182 | if (err) |
@@ -947,25 +1222,25 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) | |||
947 | int i; | 1222 | int i; |
948 | 1223 | ||
949 | if (unlikely(cpu != vcpu->cpu)) { | 1224 | if (unlikely(cpu != vcpu->cpu)) { |
950 | u64 delta; | ||
951 | |||
952 | if (check_tsc_unstable()) { | ||
953 | /* | ||
954 | * Make sure that the guest sees a monotonically | ||
955 | * increasing TSC. | ||
956 | */ | ||
957 | delta = vcpu->arch.host_tsc - native_read_tsc(); | ||
958 | svm->vmcb->control.tsc_offset += delta; | ||
959 | if (is_nested(svm)) | ||
960 | svm->nested.hsave->control.tsc_offset += delta; | ||
961 | } | ||
962 | vcpu->cpu = cpu; | ||
963 | kvm_migrate_timers(vcpu); | ||
964 | svm->asid_generation = 0; | 1225 | svm->asid_generation = 0; |
1226 | mark_all_dirty(svm->vmcb); | ||
965 | } | 1227 | } |
966 | 1228 | ||
1229 | #ifdef CONFIG_X86_64 | ||
1230 | rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host.gs_base); | ||
1231 | #endif | ||
1232 | savesegment(fs, svm->host.fs); | ||
1233 | savesegment(gs, svm->host.gs); | ||
1234 | svm->host.ldt = kvm_read_ldt(); | ||
1235 | |||
967 | for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) | 1236 | for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) |
968 | rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); | 1237 | rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); |
1238 | |||
1239 | if (static_cpu_has(X86_FEATURE_TSCRATEMSR) && | ||
1240 | svm->tsc_ratio != __get_cpu_var(current_tsc_ratio)) { | ||
1241 | __get_cpu_var(current_tsc_ratio) = svm->tsc_ratio; | ||
1242 | wrmsrl(MSR_AMD64_TSC_RATIO, svm->tsc_ratio); | ||
1243 | } | ||
969 | } | 1244 | } |
970 | 1245 | ||
971 | static void svm_vcpu_put(struct kvm_vcpu *vcpu) | 1246 | static void svm_vcpu_put(struct kvm_vcpu *vcpu) |
@@ -974,10 +1249,18 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu) | |||
974 | int i; | 1249 | int i; |
975 | 1250 | ||
976 | ++vcpu->stat.host_state_reload; | 1251 | ++vcpu->stat.host_state_reload; |
1252 | kvm_load_ldt(svm->host.ldt); | ||
1253 | #ifdef CONFIG_X86_64 | ||
1254 | loadsegment(fs, svm->host.fs); | ||
1255 | wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs); | ||
1256 | load_gs_index(svm->host.gs); | ||
1257 | #else | ||
1258 | #ifdef CONFIG_X86_32_LAZY_GS | ||
1259 | loadsegment(gs, svm->host.gs); | ||
1260 | #endif | ||
1261 | #endif | ||
977 | for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) | 1262 | for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) |
978 | wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); | 1263 | wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); |
979 | |||
980 | vcpu->arch.host_tsc = native_read_tsc(); | ||
981 | } | 1264 | } |
982 | 1265 | ||
983 | static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) | 1266 | static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) |
@@ -995,7 +1278,7 @@ static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) | |||
995 | switch (reg) { | 1278 | switch (reg) { |
996 | case VCPU_EXREG_PDPTR: | 1279 | case VCPU_EXREG_PDPTR: |
997 | BUG_ON(!npt_enabled); | 1280 | BUG_ON(!npt_enabled); |
998 | load_pdptrs(vcpu, vcpu->arch.cr3); | 1281 | load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)); |
999 | break; | 1282 | break; |
1000 | default: | 1283 | default: |
1001 | BUG(); | 1284 | BUG(); |
@@ -1004,12 +1287,12 @@ static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) | |||
1004 | 1287 | ||
1005 | static void svm_set_vintr(struct vcpu_svm *svm) | 1288 | static void svm_set_vintr(struct vcpu_svm *svm) |
1006 | { | 1289 | { |
1007 | svm->vmcb->control.intercept |= 1ULL << INTERCEPT_VINTR; | 1290 | set_intercept(svm, INTERCEPT_VINTR); |
1008 | } | 1291 | } |
1009 | 1292 | ||
1010 | static void svm_clear_vintr(struct vcpu_svm *svm) | 1293 | static void svm_clear_vintr(struct vcpu_svm *svm) |
1011 | { | 1294 | { |
1012 | svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VINTR); | 1295 | clr_intercept(svm, INTERCEPT_VINTR); |
1013 | } | 1296 | } |
1014 | 1297 | ||
1015 | static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg) | 1298 | static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg) |
@@ -1124,6 +1407,7 @@ static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) | |||
1124 | 1407 | ||
1125 | svm->vmcb->save.idtr.limit = dt->size; | 1408 | svm->vmcb->save.idtr.limit = dt->size; |
1126 | svm->vmcb->save.idtr.base = dt->address ; | 1409 | svm->vmcb->save.idtr.base = dt->address ; |
1410 | mark_dirty(svm->vmcb, VMCB_DT); | ||
1127 | } | 1411 | } |
1128 | 1412 | ||
1129 | static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) | 1413 | static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) |
@@ -1140,19 +1424,23 @@ static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) | |||
1140 | 1424 | ||
1141 | svm->vmcb->save.gdtr.limit = dt->size; | 1425 | svm->vmcb->save.gdtr.limit = dt->size; |
1142 | svm->vmcb->save.gdtr.base = dt->address ; | 1426 | svm->vmcb->save.gdtr.base = dt->address ; |
1427 | mark_dirty(svm->vmcb, VMCB_DT); | ||
1143 | } | 1428 | } |
1144 | 1429 | ||
1145 | static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) | 1430 | static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) |
1146 | { | 1431 | { |
1147 | } | 1432 | } |
1148 | 1433 | ||
1434 | static void svm_decache_cr3(struct kvm_vcpu *vcpu) | ||
1435 | { | ||
1436 | } | ||
1437 | |||
1149 | static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) | 1438 | static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) |
1150 | { | 1439 | { |
1151 | } | 1440 | } |
1152 | 1441 | ||
1153 | static void update_cr0_intercept(struct vcpu_svm *svm) | 1442 | static void update_cr0_intercept(struct vcpu_svm *svm) |
1154 | { | 1443 | { |
1155 | struct vmcb *vmcb = svm->vmcb; | ||
1156 | ulong gcr0 = svm->vcpu.arch.cr0; | 1444 | ulong gcr0 = svm->vcpu.arch.cr0; |
1157 | u64 *hcr0 = &svm->vmcb->save.cr0; | 1445 | u64 *hcr0 = &svm->vmcb->save.cr0; |
1158 | 1446 | ||
@@ -1162,27 +1450,14 @@ static void update_cr0_intercept(struct vcpu_svm *svm) | |||
1162 | *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK) | 1450 | *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK) |
1163 | | (gcr0 & SVM_CR0_SELECTIVE_MASK); | 1451 | | (gcr0 & SVM_CR0_SELECTIVE_MASK); |
1164 | 1452 | ||
1453 | mark_dirty(svm->vmcb, VMCB_CR); | ||
1165 | 1454 | ||
1166 | if (gcr0 == *hcr0 && svm->vcpu.fpu_active) { | 1455 | if (gcr0 == *hcr0 && svm->vcpu.fpu_active) { |
1167 | vmcb->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK; | 1456 | clr_cr_intercept(svm, INTERCEPT_CR0_READ); |
1168 | vmcb->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK; | 1457 | clr_cr_intercept(svm, INTERCEPT_CR0_WRITE); |
1169 | if (is_nested(svm)) { | ||
1170 | struct vmcb *hsave = svm->nested.hsave; | ||
1171 | |||
1172 | hsave->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK; | ||
1173 | hsave->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK; | ||
1174 | vmcb->control.intercept_cr_read |= svm->nested.intercept_cr_read; | ||
1175 | vmcb->control.intercept_cr_write |= svm->nested.intercept_cr_write; | ||
1176 | } | ||
1177 | } else { | 1458 | } else { |
1178 | svm->vmcb->control.intercept_cr_read |= INTERCEPT_CR0_MASK; | 1459 | set_cr_intercept(svm, INTERCEPT_CR0_READ); |
1179 | svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR0_MASK; | 1460 | set_cr_intercept(svm, INTERCEPT_CR0_WRITE); |
1180 | if (is_nested(svm)) { | ||
1181 | struct vmcb *hsave = svm->nested.hsave; | ||
1182 | |||
1183 | hsave->control.intercept_cr_read |= INTERCEPT_CR0_MASK; | ||
1184 | hsave->control.intercept_cr_write |= INTERCEPT_CR0_MASK; | ||
1185 | } | ||
1186 | } | 1461 | } |
1187 | } | 1462 | } |
1188 | 1463 | ||
@@ -1190,27 +1465,6 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) | |||
1190 | { | 1465 | { |
1191 | struct vcpu_svm *svm = to_svm(vcpu); | 1466 | struct vcpu_svm *svm = to_svm(vcpu); |
1192 | 1467 | ||
1193 | if (is_nested(svm)) { | ||
1194 | /* | ||
1195 | * We are here because we run in nested mode, the host kvm | ||
1196 | * intercepts cr0 writes but the l1 hypervisor does not. | ||
1197 | * But the L1 hypervisor may intercept selective cr0 writes. | ||
1198 | * This needs to be checked here. | ||
1199 | */ | ||
1200 | unsigned long old, new; | ||
1201 | |||
1202 | /* Remove bits that would trigger a real cr0 write intercept */ | ||
1203 | old = vcpu->arch.cr0 & SVM_CR0_SELECTIVE_MASK; | ||
1204 | new = cr0 & SVM_CR0_SELECTIVE_MASK; | ||
1205 | |||
1206 | if (old == new) { | ||
1207 | /* cr0 write with ts and mp unchanged */ | ||
1208 | svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE; | ||
1209 | if (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE) | ||
1210 | return; | ||
1211 | } | ||
1212 | } | ||
1213 | |||
1214 | #ifdef CONFIG_X86_64 | 1468 | #ifdef CONFIG_X86_64 |
1215 | if (vcpu->arch.efer & EFER_LME) { | 1469 | if (vcpu->arch.efer & EFER_LME) { |
1216 | if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { | 1470 | if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { |
@@ -1238,6 +1492,7 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) | |||
1238 | */ | 1492 | */ |
1239 | cr0 &= ~(X86_CR0_CD | X86_CR0_NW); | 1493 | cr0 &= ~(X86_CR0_CD | X86_CR0_NW); |
1240 | svm->vmcb->save.cr0 = cr0; | 1494 | svm->vmcb->save.cr0 = cr0; |
1495 | mark_dirty(svm->vmcb, VMCB_CR); | ||
1241 | update_cr0_intercept(svm); | 1496 | update_cr0_intercept(svm); |
1242 | } | 1497 | } |
1243 | 1498 | ||
@@ -1247,13 +1502,14 @@ static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) | |||
1247 | unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4; | 1502 | unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4; |
1248 | 1503 | ||
1249 | if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE)) | 1504 | if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE)) |
1250 | force_new_asid(vcpu); | 1505 | svm_flush_tlb(vcpu); |
1251 | 1506 | ||
1252 | vcpu->arch.cr4 = cr4; | 1507 | vcpu->arch.cr4 = cr4; |
1253 | if (!npt_enabled) | 1508 | if (!npt_enabled) |
1254 | cr4 |= X86_CR4_PAE; | 1509 | cr4 |= X86_CR4_PAE; |
1255 | cr4 |= host_cr4_mce; | 1510 | cr4 |= host_cr4_mce; |
1256 | to_svm(vcpu)->vmcb->save.cr4 = cr4; | 1511 | to_svm(vcpu)->vmcb->save.cr4 = cr4; |
1512 | mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR); | ||
1257 | } | 1513 | } |
1258 | 1514 | ||
1259 | static void svm_set_segment(struct kvm_vcpu *vcpu, | 1515 | static void svm_set_segment(struct kvm_vcpu *vcpu, |
@@ -1282,26 +1538,25 @@ static void svm_set_segment(struct kvm_vcpu *vcpu, | |||
1282 | = (svm->vmcb->save.cs.attrib | 1538 | = (svm->vmcb->save.cs.attrib |
1283 | >> SVM_SELECTOR_DPL_SHIFT) & 3; | 1539 | >> SVM_SELECTOR_DPL_SHIFT) & 3; |
1284 | 1540 | ||
1541 | mark_dirty(svm->vmcb, VMCB_SEG); | ||
1285 | } | 1542 | } |
1286 | 1543 | ||
1287 | static void update_db_intercept(struct kvm_vcpu *vcpu) | 1544 | static void update_db_intercept(struct kvm_vcpu *vcpu) |
1288 | { | 1545 | { |
1289 | struct vcpu_svm *svm = to_svm(vcpu); | 1546 | struct vcpu_svm *svm = to_svm(vcpu); |
1290 | 1547 | ||
1291 | svm->vmcb->control.intercept_exceptions &= | 1548 | clr_exception_intercept(svm, DB_VECTOR); |
1292 | ~((1 << DB_VECTOR) | (1 << BP_VECTOR)); | 1549 | clr_exception_intercept(svm, BP_VECTOR); |
1293 | 1550 | ||
1294 | if (svm->nmi_singlestep) | 1551 | if (svm->nmi_singlestep) |
1295 | svm->vmcb->control.intercept_exceptions |= (1 << DB_VECTOR); | 1552 | set_exception_intercept(svm, DB_VECTOR); |
1296 | 1553 | ||
1297 | if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { | 1554 | if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { |
1298 | if (vcpu->guest_debug & | 1555 | if (vcpu->guest_debug & |
1299 | (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) | 1556 | (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) |
1300 | svm->vmcb->control.intercept_exceptions |= | 1557 | set_exception_intercept(svm, DB_VECTOR); |
1301 | 1 << DB_VECTOR; | ||
1302 | if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) | 1558 | if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) |
1303 | svm->vmcb->control.intercept_exceptions |= | 1559 | set_exception_intercept(svm, BP_VECTOR); |
1304 | 1 << BP_VECTOR; | ||
1305 | } else | 1560 | } else |
1306 | vcpu->guest_debug = 0; | 1561 | vcpu->guest_debug = 0; |
1307 | } | 1562 | } |
@@ -1315,21 +1570,9 @@ static void svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) | |||
1315 | else | 1570 | else |
1316 | svm->vmcb->save.dr7 = vcpu->arch.dr7; | 1571 | svm->vmcb->save.dr7 = vcpu->arch.dr7; |
1317 | 1572 | ||
1318 | update_db_intercept(vcpu); | 1573 | mark_dirty(svm->vmcb, VMCB_DR); |
1319 | } | ||
1320 | |||
1321 | static void load_host_msrs(struct kvm_vcpu *vcpu) | ||
1322 | { | ||
1323 | #ifdef CONFIG_X86_64 | ||
1324 | wrmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base); | ||
1325 | #endif | ||
1326 | } | ||
1327 | 1574 | ||
1328 | static void save_host_msrs(struct kvm_vcpu *vcpu) | 1575 | update_db_intercept(vcpu); |
1329 | { | ||
1330 | #ifdef CONFIG_X86_64 | ||
1331 | rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base); | ||
1332 | #endif | ||
1333 | } | 1576 | } |
1334 | 1577 | ||
1335 | static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) | 1578 | static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) |
@@ -1342,6 +1585,8 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) | |||
1342 | 1585 | ||
1343 | svm->asid_generation = sd->asid_generation; | 1586 | svm->asid_generation = sd->asid_generation; |
1344 | svm->vmcb->control.asid = sd->next_asid++; | 1587 | svm->vmcb->control.asid = sd->next_asid++; |
1588 | |||
1589 | mark_dirty(svm->vmcb, VMCB_ASID); | ||
1345 | } | 1590 | } |
1346 | 1591 | ||
1347 | static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value) | 1592 | static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value) |
@@ -1349,20 +1594,40 @@ static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value) | |||
1349 | struct vcpu_svm *svm = to_svm(vcpu); | 1594 | struct vcpu_svm *svm = to_svm(vcpu); |
1350 | 1595 | ||
1351 | svm->vmcb->save.dr7 = value; | 1596 | svm->vmcb->save.dr7 = value; |
1597 | mark_dirty(svm->vmcb, VMCB_DR); | ||
1352 | } | 1598 | } |
1353 | 1599 | ||
1354 | static int pf_interception(struct vcpu_svm *svm) | 1600 | static int pf_interception(struct vcpu_svm *svm) |
1355 | { | 1601 | { |
1356 | u64 fault_address; | 1602 | u64 fault_address = svm->vmcb->control.exit_info_2; |
1357 | u32 error_code; | 1603 | u32 error_code; |
1604 | int r = 1; | ||
1358 | 1605 | ||
1359 | fault_address = svm->vmcb->control.exit_info_2; | 1606 | switch (svm->apf_reason) { |
1360 | error_code = svm->vmcb->control.exit_info_1; | 1607 | default: |
1361 | 1608 | error_code = svm->vmcb->control.exit_info_1; | |
1362 | trace_kvm_page_fault(fault_address, error_code); | 1609 | |
1363 | if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu)) | 1610 | trace_kvm_page_fault(fault_address, error_code); |
1364 | kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); | 1611 | if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu)) |
1365 | return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); | 1612 | kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); |
1613 | r = kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code, | ||
1614 | svm->vmcb->control.insn_bytes, | ||
1615 | svm->vmcb->control.insn_len); | ||
1616 | break; | ||
1617 | case KVM_PV_REASON_PAGE_NOT_PRESENT: | ||
1618 | svm->apf_reason = 0; | ||
1619 | local_irq_disable(); | ||
1620 | kvm_async_pf_task_wait(fault_address); | ||
1621 | local_irq_enable(); | ||
1622 | break; | ||
1623 | case KVM_PV_REASON_PAGE_READY: | ||
1624 | svm->apf_reason = 0; | ||
1625 | local_irq_disable(); | ||
1626 | kvm_async_pf_task_wake(fault_address); | ||
1627 | local_irq_enable(); | ||
1628 | break; | ||
1629 | } | ||
1630 | return r; | ||
1366 | } | 1631 | } |
1367 | 1632 | ||
1368 | static int db_interception(struct vcpu_svm *svm) | 1633 | static int db_interception(struct vcpu_svm *svm) |
@@ -1410,7 +1675,7 @@ static int ud_interception(struct vcpu_svm *svm) | |||
1410 | { | 1675 | { |
1411 | int er; | 1676 | int er; |
1412 | 1677 | ||
1413 | er = emulate_instruction(&svm->vcpu, 0, 0, EMULTYPE_TRAP_UD); | 1678 | er = emulate_instruction(&svm->vcpu, EMULTYPE_TRAP_UD); |
1414 | if (er != EMULATE_DONE) | 1679 | if (er != EMULATE_DONE) |
1415 | kvm_queue_exception(&svm->vcpu, UD_VECTOR); | 1680 | kvm_queue_exception(&svm->vcpu, UD_VECTOR); |
1416 | return 1; | 1681 | return 1; |
@@ -1419,21 +1684,8 @@ static int ud_interception(struct vcpu_svm *svm) | |||
1419 | static void svm_fpu_activate(struct kvm_vcpu *vcpu) | 1684 | static void svm_fpu_activate(struct kvm_vcpu *vcpu) |
1420 | { | 1685 | { |
1421 | struct vcpu_svm *svm = to_svm(vcpu); | 1686 | struct vcpu_svm *svm = to_svm(vcpu); |
1422 | u32 excp; | ||
1423 | |||
1424 | if (is_nested(svm)) { | ||
1425 | u32 h_excp, n_excp; | ||
1426 | |||
1427 | h_excp = svm->nested.hsave->control.intercept_exceptions; | ||
1428 | n_excp = svm->nested.intercept_exceptions; | ||
1429 | h_excp &= ~(1 << NM_VECTOR); | ||
1430 | excp = h_excp | n_excp; | ||
1431 | } else { | ||
1432 | excp = svm->vmcb->control.intercept_exceptions; | ||
1433 | excp &= ~(1 << NM_VECTOR); | ||
1434 | } | ||
1435 | 1687 | ||
1436 | svm->vmcb->control.intercept_exceptions = excp; | 1688 | clr_exception_intercept(svm, NM_VECTOR); |
1437 | 1689 | ||
1438 | svm->vcpu.fpu_active = 1; | 1690 | svm->vcpu.fpu_active = 1; |
1439 | update_cr0_intercept(svm); | 1691 | update_cr0_intercept(svm); |
@@ -1540,7 +1792,7 @@ static int io_interception(struct vcpu_svm *svm) | |||
1540 | string = (io_info & SVM_IOIO_STR_MASK) != 0; | 1792 | string = (io_info & SVM_IOIO_STR_MASK) != 0; |
1541 | in = (io_info & SVM_IOIO_TYPE_MASK) != 0; | 1793 | in = (io_info & SVM_IOIO_TYPE_MASK) != 0; |
1542 | if (string || in) | 1794 | if (string || in) |
1543 | return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE; | 1795 | return emulate_instruction(vcpu, 0) == EMULATE_DONE; |
1544 | 1796 | ||
1545 | port = io_info >> 16; | 1797 | port = io_info >> 16; |
1546 | size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; | 1798 | size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; |
@@ -1581,6 +1833,56 @@ static int vmmcall_interception(struct vcpu_svm *svm) | |||
1581 | return 1; | 1833 | return 1; |
1582 | } | 1834 | } |
1583 | 1835 | ||
1836 | static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu) | ||
1837 | { | ||
1838 | struct vcpu_svm *svm = to_svm(vcpu); | ||
1839 | |||
1840 | return svm->nested.nested_cr3; | ||
1841 | } | ||
1842 | |||
1843 | static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu, | ||
1844 | unsigned long root) | ||
1845 | { | ||
1846 | struct vcpu_svm *svm = to_svm(vcpu); | ||
1847 | |||
1848 | svm->vmcb->control.nested_cr3 = root; | ||
1849 | mark_dirty(svm->vmcb, VMCB_NPT); | ||
1850 | svm_flush_tlb(vcpu); | ||
1851 | } | ||
1852 | |||
1853 | static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu, | ||
1854 | struct x86_exception *fault) | ||
1855 | { | ||
1856 | struct vcpu_svm *svm = to_svm(vcpu); | ||
1857 | |||
1858 | svm->vmcb->control.exit_code = SVM_EXIT_NPF; | ||
1859 | svm->vmcb->control.exit_code_hi = 0; | ||
1860 | svm->vmcb->control.exit_info_1 = fault->error_code; | ||
1861 | svm->vmcb->control.exit_info_2 = fault->address; | ||
1862 | |||
1863 | nested_svm_vmexit(svm); | ||
1864 | } | ||
1865 | |||
1866 | static int nested_svm_init_mmu_context(struct kvm_vcpu *vcpu) | ||
1867 | { | ||
1868 | int r; | ||
1869 | |||
1870 | r = kvm_init_shadow_mmu(vcpu, &vcpu->arch.mmu); | ||
1871 | |||
1872 | vcpu->arch.mmu.set_cr3 = nested_svm_set_tdp_cr3; | ||
1873 | vcpu->arch.mmu.get_cr3 = nested_svm_get_tdp_cr3; | ||
1874 | vcpu->arch.mmu.inject_page_fault = nested_svm_inject_npf_exit; | ||
1875 | vcpu->arch.mmu.shadow_root_level = get_npt_level(); | ||
1876 | vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; | ||
1877 | |||
1878 | return r; | ||
1879 | } | ||
1880 | |||
1881 | static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu) | ||
1882 | { | ||
1883 | vcpu->arch.walk_mmu = &vcpu->arch.mmu; | ||
1884 | } | ||
1885 | |||
1584 | static int nested_svm_check_permissions(struct vcpu_svm *svm) | 1886 | static int nested_svm_check_permissions(struct vcpu_svm *svm) |
1585 | { | 1887 | { |
1586 | if (!(svm->vcpu.arch.efer & EFER_SVME) | 1888 | if (!(svm->vcpu.arch.efer & EFER_SVME) |
@@ -1602,7 +1904,7 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, | |||
1602 | { | 1904 | { |
1603 | int vmexit; | 1905 | int vmexit; |
1604 | 1906 | ||
1605 | if (!is_nested(svm)) | 1907 | if (!is_guest_mode(&svm->vcpu)) |
1606 | return 0; | 1908 | return 0; |
1607 | 1909 | ||
1608 | svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr; | 1910 | svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr; |
@@ -1620,7 +1922,7 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, | |||
1620 | /* This function returns true if it is safe to enable the irq window */ | 1922 | /* This function returns true if it is safe to enable the irq window */ |
1621 | static inline bool nested_svm_intr(struct vcpu_svm *svm) | 1923 | static inline bool nested_svm_intr(struct vcpu_svm *svm) |
1622 | { | 1924 | { |
1623 | if (!is_nested(svm)) | 1925 | if (!is_guest_mode(&svm->vcpu)) |
1624 | return true; | 1926 | return true; |
1625 | 1927 | ||
1626 | if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK)) | 1928 | if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK)) |
@@ -1629,6 +1931,14 @@ static inline bool nested_svm_intr(struct vcpu_svm *svm) | |||
1629 | if (!(svm->vcpu.arch.hflags & HF_HIF_MASK)) | 1931 | if (!(svm->vcpu.arch.hflags & HF_HIF_MASK)) |
1630 | return false; | 1932 | return false; |
1631 | 1933 | ||
1934 | /* | ||
1935 | * If a vmexit was already requested (by an intercepted exception, | ||
1936 | * for instance), do not overwrite it with an "external interrupt" | ||
1937 | * vmexit. | ||
1938 | */ | ||
1939 | if (svm->nested.exit_required) | ||
1940 | return false; | ||
1941 | |||
1632 | svm->vmcb->control.exit_code = SVM_EXIT_INTR; | 1942 | svm->vmcb->control.exit_code = SVM_EXIT_INTR; |
1633 | svm->vmcb->control.exit_info_1 = 0; | 1943 | svm->vmcb->control.exit_info_1 = 0; |
1634 | svm->vmcb->control.exit_info_2 = 0; | 1944 | svm->vmcb->control.exit_info_2 = 0; |
@@ -1651,7 +1961,7 @@ static inline bool nested_svm_intr(struct vcpu_svm *svm) | |||
1651 | /* This function returns true if it is safe to enable the nmi window */ | 1961 | /* This function returns true if it is safe to enable the nmi window */ |
1652 | static inline bool nested_svm_nmi(struct vcpu_svm *svm) | 1962 | static inline bool nested_svm_nmi(struct vcpu_svm *svm) |
1653 | { | 1963 | { |
1654 | if (!is_nested(svm)) | 1964 | if (!is_guest_mode(&svm->vcpu)) |
1655 | return true; | 1965 | return true; |
1656 | 1966 | ||
1657 | if (!(svm->nested.intercept & (1ULL << INTERCEPT_NMI))) | 1967 | if (!(svm->nested.intercept & (1ULL << INTERCEPT_NMI))) |
@@ -1750,8 +2060,8 @@ static int nested_svm_exit_special(struct vcpu_svm *svm) | |||
1750 | return NESTED_EXIT_HOST; | 2060 | return NESTED_EXIT_HOST; |
1751 | break; | 2061 | break; |
1752 | case SVM_EXIT_EXCP_BASE + PF_VECTOR: | 2062 | case SVM_EXIT_EXCP_BASE + PF_VECTOR: |
1753 | /* When we're shadowing, trap PFs */ | 2063 | /* When we're shadowing, trap PFs, but not async PF */ |
1754 | if (!npt_enabled) | 2064 | if (!npt_enabled && svm->apf_reason == 0) |
1755 | return NESTED_EXIT_HOST; | 2065 | return NESTED_EXIT_HOST; |
1756 | break; | 2066 | break; |
1757 | case SVM_EXIT_EXCP_BASE + NM_VECTOR: | 2067 | case SVM_EXIT_EXCP_BASE + NM_VECTOR: |
@@ -1779,27 +2089,15 @@ static int nested_svm_intercept(struct vcpu_svm *svm) | |||
1779 | case SVM_EXIT_IOIO: | 2089 | case SVM_EXIT_IOIO: |
1780 | vmexit = nested_svm_intercept_ioio(svm); | 2090 | vmexit = nested_svm_intercept_ioio(svm); |
1781 | break; | 2091 | break; |
1782 | case SVM_EXIT_READ_CR0 ... SVM_EXIT_READ_CR8: { | 2092 | case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: { |
1783 | u32 cr_bits = 1 << (exit_code - SVM_EXIT_READ_CR0); | 2093 | u32 bit = 1U << (exit_code - SVM_EXIT_READ_CR0); |
1784 | if (svm->nested.intercept_cr_read & cr_bits) | 2094 | if (svm->nested.intercept_cr & bit) |
1785 | vmexit = NESTED_EXIT_DONE; | ||
1786 | break; | ||
1787 | } | ||
1788 | case SVM_EXIT_WRITE_CR0 ... SVM_EXIT_WRITE_CR8: { | ||
1789 | u32 cr_bits = 1 << (exit_code - SVM_EXIT_WRITE_CR0); | ||
1790 | if (svm->nested.intercept_cr_write & cr_bits) | ||
1791 | vmexit = NESTED_EXIT_DONE; | ||
1792 | break; | ||
1793 | } | ||
1794 | case SVM_EXIT_READ_DR0 ... SVM_EXIT_READ_DR7: { | ||
1795 | u32 dr_bits = 1 << (exit_code - SVM_EXIT_READ_DR0); | ||
1796 | if (svm->nested.intercept_dr_read & dr_bits) | ||
1797 | vmexit = NESTED_EXIT_DONE; | 2095 | vmexit = NESTED_EXIT_DONE; |
1798 | break; | 2096 | break; |
1799 | } | 2097 | } |
1800 | case SVM_EXIT_WRITE_DR0 ... SVM_EXIT_WRITE_DR7: { | 2098 | case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: { |
1801 | u32 dr_bits = 1 << (exit_code - SVM_EXIT_WRITE_DR0); | 2099 | u32 bit = 1U << (exit_code - SVM_EXIT_READ_DR0); |
1802 | if (svm->nested.intercept_dr_write & dr_bits) | 2100 | if (svm->nested.intercept_dr & bit) |
1803 | vmexit = NESTED_EXIT_DONE; | 2101 | vmexit = NESTED_EXIT_DONE; |
1804 | break; | 2102 | break; |
1805 | } | 2103 | } |
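The rewritten nested intercept check above works because the CR-read and CR-write exit codes form one contiguous range, so a single bit index into the combined intercept_cr word covers both. The toy check below demonstrates that under the assumption that CR-write exits start 0x10 above the CR-read exits and that INTERCEPT_CR3_WRITE is defined as 16 + 3; those constants are assumed from the usual SVM layout, not quoted from the patch.

```c
/* Toy check of the combined CR-intercept bitmap (constants are assumed). */
#include <stdio.h>
#include <stdint.h>

#define SVM_EXIT_READ_CR0   0x000	/* assumed exit-code base          */
#define SVM_EXIT_WRITE_CR3  0x013	/* assumed: writes start at +0x10  */
#define INTERCEPT_CR3_WRITE (16 + 3)	/* assumed, matches the layout     */

int main(void)
{
	uint32_t intercept_cr = 1U << INTERCEPT_CR3_WRITE; /* L1 wants CR3 writes */
	uint32_t bit = 1U << (SVM_EXIT_WRITE_CR3 - SVM_EXIT_READ_CR0); /* bit 19 */

	/* Both expressions land on bit 19, so the nested vmexit is taken. */
	printf("nested vmexit: %s\n", (intercept_cr & bit) ? "yes" : "no");
	return 0;
}
```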
@@ -1807,6 +2105,10 @@ static int nested_svm_intercept(struct vcpu_svm *svm) | |||
1807 | u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE); | 2105 | u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE); |
1808 | if (svm->nested.intercept_exceptions & excp_bits) | 2106 | if (svm->nested.intercept_exceptions & excp_bits) |
1809 | vmexit = NESTED_EXIT_DONE; | 2107 | vmexit = NESTED_EXIT_DONE; |
2108 | /* async page faults always cause a vmexit */ | ||
2109 | else if ((exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) && | ||
2110 | svm->apf_reason != 0) | ||
2111 | vmexit = NESTED_EXIT_DONE; | ||
1810 | break; | 2112 | break; |
1811 | } | 2113 | } |
1812 | case SVM_EXIT_ERR: { | 2114 | case SVM_EXIT_ERR: { |
@@ -1840,10 +2142,8 @@ static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *fr | |||
1840 | struct vmcb_control_area *dst = &dst_vmcb->control; | 2142 | struct vmcb_control_area *dst = &dst_vmcb->control; |
1841 | struct vmcb_control_area *from = &from_vmcb->control; | 2143 | struct vmcb_control_area *from = &from_vmcb->control; |
1842 | 2144 | ||
1843 | dst->intercept_cr_read = from->intercept_cr_read; | 2145 | dst->intercept_cr = from->intercept_cr; |
1844 | dst->intercept_cr_write = from->intercept_cr_write; | 2146 | dst->intercept_dr = from->intercept_dr; |
1845 | dst->intercept_dr_read = from->intercept_dr_read; | ||
1846 | dst->intercept_dr_write = from->intercept_dr_write; | ||
1847 | dst->intercept_exceptions = from->intercept_exceptions; | 2147 | dst->intercept_exceptions = from->intercept_exceptions; |
1848 | dst->intercept = from->intercept; | 2148 | dst->intercept = from->intercept; |
1849 | dst->iopm_base_pa = from->iopm_base_pa; | 2149 | dst->iopm_base_pa = from->iopm_base_pa; |
@@ -1884,7 +2184,8 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) | |||
1884 | if (!nested_vmcb) | 2184 | if (!nested_vmcb) |
1885 | return 1; | 2185 | return 1; |
1886 | 2186 | ||
1887 | /* Exit nested SVM mode */ | 2187 | /* Exit Guest-Mode */ |
2188 | leave_guest_mode(&svm->vcpu); | ||
1888 | svm->nested.vmcb = 0; | 2189 | svm->nested.vmcb = 0; |
1889 | 2190 | ||
1890 | /* Give the current vmcb to the guest */ | 2191 | /* Give the current vmcb to the guest */ |
@@ -1896,11 +2197,12 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) | |||
1896 | nested_vmcb->save.ds = vmcb->save.ds; | 2197 | nested_vmcb->save.ds = vmcb->save.ds; |
1897 | nested_vmcb->save.gdtr = vmcb->save.gdtr; | 2198 | nested_vmcb->save.gdtr = vmcb->save.gdtr; |
1898 | nested_vmcb->save.idtr = vmcb->save.idtr; | 2199 | nested_vmcb->save.idtr = vmcb->save.idtr; |
2200 | nested_vmcb->save.efer = svm->vcpu.arch.efer; | ||
1899 | nested_vmcb->save.cr0 = kvm_read_cr0(&svm->vcpu); | 2201 | nested_vmcb->save.cr0 = kvm_read_cr0(&svm->vcpu); |
1900 | nested_vmcb->save.cr3 = svm->vcpu.arch.cr3; | 2202 | nested_vmcb->save.cr3 = kvm_read_cr3(&svm->vcpu); |
1901 | nested_vmcb->save.cr2 = vmcb->save.cr2; | 2203 | nested_vmcb->save.cr2 = vmcb->save.cr2; |
1902 | nested_vmcb->save.cr4 = svm->vcpu.arch.cr4; | 2204 | nested_vmcb->save.cr4 = svm->vcpu.arch.cr4; |
1903 | nested_vmcb->save.rflags = vmcb->save.rflags; | 2205 | nested_vmcb->save.rflags = kvm_get_rflags(&svm->vcpu); |
1904 | nested_vmcb->save.rip = vmcb->save.rip; | 2206 | nested_vmcb->save.rip = vmcb->save.rip; |
1905 | nested_vmcb->save.rsp = vmcb->save.rsp; | 2207 | nested_vmcb->save.rsp = vmcb->save.rsp; |
1906 | nested_vmcb->save.rax = vmcb->save.rax; | 2208 | nested_vmcb->save.rax = vmcb->save.rax; |
@@ -1917,6 +2219,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) | |||
1917 | nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2; | 2219 | nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2; |
1918 | nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info; | 2220 | nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info; |
1919 | nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err; | 2221 | nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err; |
2222 | nested_vmcb->control.next_rip = vmcb->control.next_rip; | ||
1920 | 2223 | ||
1921 | /* | 2224 | /* |
1922 | * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have | 2225 | * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have |
@@ -1947,6 +2250,8 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) | |||
1947 | kvm_clear_exception_queue(&svm->vcpu); | 2250 | kvm_clear_exception_queue(&svm->vcpu); |
1948 | kvm_clear_interrupt_queue(&svm->vcpu); | 2251 | kvm_clear_interrupt_queue(&svm->vcpu); |
1949 | 2252 | ||
2253 | svm->nested.nested_cr3 = 0; | ||
2254 | |||
1950 | /* Restore selected save entries */ | 2255 | /* Restore selected save entries */ |
1951 | svm->vmcb->save.es = hsave->save.es; | 2256 | svm->vmcb->save.es = hsave->save.es; |
1952 | svm->vmcb->save.cs = hsave->save.cs; | 2257 | svm->vmcb->save.cs = hsave->save.cs; |
@@ -1954,7 +2259,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) | |||
1954 | svm->vmcb->save.ds = hsave->save.ds; | 2259 | svm->vmcb->save.ds = hsave->save.ds; |
1955 | svm->vmcb->save.gdtr = hsave->save.gdtr; | 2260 | svm->vmcb->save.gdtr = hsave->save.gdtr; |
1956 | svm->vmcb->save.idtr = hsave->save.idtr; | 2261 | svm->vmcb->save.idtr = hsave->save.idtr; |
1957 | svm->vmcb->save.rflags = hsave->save.rflags; | 2262 | kvm_set_rflags(&svm->vcpu, hsave->save.rflags); |
1958 | svm_set_efer(&svm->vcpu, hsave->save.efer); | 2263 | svm_set_efer(&svm->vcpu, hsave->save.efer); |
1959 | svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE); | 2264 | svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE); |
1960 | svm_set_cr4(&svm->vcpu, hsave->save.cr4); | 2265 | svm_set_cr4(&svm->vcpu, hsave->save.cr4); |
@@ -1971,8 +2276,11 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) | |||
1971 | svm->vmcb->save.cpl = 0; | 2276 | svm->vmcb->save.cpl = 0; |
1972 | svm->vmcb->control.exit_int_info = 0; | 2277 | svm->vmcb->control.exit_int_info = 0; |
1973 | 2278 | ||
2279 | mark_all_dirty(svm->vmcb); | ||
2280 | |||
1974 | nested_svm_unmap(page); | 2281 | nested_svm_unmap(page); |
1975 | 2282 | ||
2283 | nested_svm_uninit_mmu_context(&svm->vcpu); | ||
1976 | kvm_mmu_reset_context(&svm->vcpu); | 2284 | kvm_mmu_reset_context(&svm->vcpu); |
1977 | kvm_mmu_load(&svm->vcpu); | 2285 | kvm_mmu_load(&svm->vcpu); |
1978 | 2286 | ||
@@ -2012,6 +2320,20 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) | |||
2012 | return true; | 2320 | return true; |
2013 | } | 2321 | } |
2014 | 2322 | ||
2323 | static bool nested_vmcb_checks(struct vmcb *vmcb) | ||
2324 | { | ||
2325 | if ((vmcb->control.intercept & (1ULL << INTERCEPT_VMRUN)) == 0) | ||
2326 | return false; | ||
2327 | |||
2328 | if (vmcb->control.asid == 0) | ||
2329 | return false; | ||
2330 | |||
2331 | if (vmcb->control.nested_ctl && !npt_enabled) | ||
2332 | return false; | ||
2333 | |||
2334 | return true; | ||
2335 | } | ||
2336 | |||
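nested_vmcb_checks() is the new sanity gate for VMRUN emulation: the L1 VMCB must keep VMRUN intercepted, must not use ASID 0, and may only ask for nested paging when the host itself has NPT enabled; a failing check is reported back to L1 as SVM_EXIT_ERR in the vmrun hunk that follows. A hedged sketch of the same validation pattern against a plain struct; the struct, the bit position, and the error value are simplified stand-ins, not the kernel's definitions.

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define INTERCEPT_VMRUN_BIT 32        /* assumed bit position, for illustration */

    struct vmcb_ctl {                     /* simplified stand-in for vmcb_control_area */
        uint64_t intercept;
        uint32_t asid;
        uint64_t nested_ctl;
        uint32_t exit_code;
    };

    static bool npt_enabled;              /* pretend the host has NPT disabled */

    static bool vmcb_checks(const struct vmcb_ctl *c)
    {
        if (!(c->intercept & (1ULL << INTERCEPT_VMRUN_BIT)))
            return false;                 /* L1 must keep VMRUN intercepted */
        if (c->asid == 0)
            return false;                 /* ASID 0 belongs to the host */
        if (c->nested_ctl && !npt_enabled)
            return false;                 /* NPT requested but unavailable */
        return true;
    }

    int main(void)
    {
        struct vmcb_ctl c = { .intercept = 1ULL << INTERCEPT_VMRUN_BIT,
                              .asid = 1, .nested_ctl = 1 };

        if (!vmcb_checks(&c)) {
            c.exit_code = 0xffffffffu;    /* stands in for SVM_EXIT_ERR */
            printf("VMRUN refused, exit_code=%#x\n", (unsigned)c.exit_code);
        }
        return 0;
    }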
2015 | static bool nested_svm_vmrun(struct vcpu_svm *svm) | 2337 | static bool nested_svm_vmrun(struct vcpu_svm *svm) |
2016 | { | 2338 | { |
2017 | struct vmcb *nested_vmcb; | 2339 | struct vmcb *nested_vmcb; |
@@ -2026,14 +2348,25 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) | |||
2026 | if (!nested_vmcb) | 2348 | if (!nested_vmcb) |
2027 | return false; | 2349 | return false; |
2028 | 2350 | ||
2029 | trace_kvm_nested_vmrun(svm->vmcb->save.rip - 3, vmcb_gpa, | 2351 | if (!nested_vmcb_checks(nested_vmcb)) { |
2352 | nested_vmcb->control.exit_code = SVM_EXIT_ERR; | ||
2353 | nested_vmcb->control.exit_code_hi = 0; | ||
2354 | nested_vmcb->control.exit_info_1 = 0; | ||
2355 | nested_vmcb->control.exit_info_2 = 0; | ||
2356 | |||
2357 | nested_svm_unmap(page); | ||
2358 | |||
2359 | return false; | ||
2360 | } | ||
2361 | |||
2362 | trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa, | ||
2030 | nested_vmcb->save.rip, | 2363 | nested_vmcb->save.rip, |
2031 | nested_vmcb->control.int_ctl, | 2364 | nested_vmcb->control.int_ctl, |
2032 | nested_vmcb->control.event_inj, | 2365 | nested_vmcb->control.event_inj, |
2033 | nested_vmcb->control.nested_ctl); | 2366 | nested_vmcb->control.nested_ctl); |
2034 | 2367 | ||
2035 | trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr_read, | 2368 | trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr & 0xffff, |
2036 | nested_vmcb->control.intercept_cr_write, | 2369 | nested_vmcb->control.intercept_cr >> 16, |
2037 | nested_vmcb->control.intercept_exceptions, | 2370 | nested_vmcb->control.intercept_exceptions, |
2038 | nested_vmcb->control.intercept); | 2371 | nested_vmcb->control.intercept); |
2039 | 2372 | ||
@@ -2054,22 +2387,28 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) | |||
2054 | hsave->save.efer = svm->vcpu.arch.efer; | 2387 | hsave->save.efer = svm->vcpu.arch.efer; |
2055 | hsave->save.cr0 = kvm_read_cr0(&svm->vcpu); | 2388 | hsave->save.cr0 = kvm_read_cr0(&svm->vcpu); |
2056 | hsave->save.cr4 = svm->vcpu.arch.cr4; | 2389 | hsave->save.cr4 = svm->vcpu.arch.cr4; |
2057 | hsave->save.rflags = vmcb->save.rflags; | 2390 | hsave->save.rflags = kvm_get_rflags(&svm->vcpu); |
2058 | hsave->save.rip = svm->next_rip; | 2391 | hsave->save.rip = kvm_rip_read(&svm->vcpu); |
2059 | hsave->save.rsp = vmcb->save.rsp; | 2392 | hsave->save.rsp = vmcb->save.rsp; |
2060 | hsave->save.rax = vmcb->save.rax; | 2393 | hsave->save.rax = vmcb->save.rax; |
2061 | if (npt_enabled) | 2394 | if (npt_enabled) |
2062 | hsave->save.cr3 = vmcb->save.cr3; | 2395 | hsave->save.cr3 = vmcb->save.cr3; |
2063 | else | 2396 | else |
2064 | hsave->save.cr3 = svm->vcpu.arch.cr3; | 2397 | hsave->save.cr3 = kvm_read_cr3(&svm->vcpu); |
2065 | 2398 | ||
2066 | copy_vmcb_control_area(hsave, vmcb); | 2399 | copy_vmcb_control_area(hsave, vmcb); |
2067 | 2400 | ||
2068 | if (svm->vmcb->save.rflags & X86_EFLAGS_IF) | 2401 | if (kvm_get_rflags(&svm->vcpu) & X86_EFLAGS_IF) |
2069 | svm->vcpu.arch.hflags |= HF_HIF_MASK; | 2402 | svm->vcpu.arch.hflags |= HF_HIF_MASK; |
2070 | else | 2403 | else |
2071 | svm->vcpu.arch.hflags &= ~HF_HIF_MASK; | 2404 | svm->vcpu.arch.hflags &= ~HF_HIF_MASK; |
2072 | 2405 | ||
2406 | if (nested_vmcb->control.nested_ctl) { | ||
2407 | kvm_mmu_unload(&svm->vcpu); | ||
2408 | svm->nested.nested_cr3 = nested_vmcb->control.nested_cr3; | ||
2409 | nested_svm_init_mmu_context(&svm->vcpu); | ||
2410 | } | ||
2411 | |||
2073 | /* Load the nested guest state */ | 2412 | /* Load the nested guest state */ |
2074 | svm->vmcb->save.es = nested_vmcb->save.es; | 2413 | svm->vmcb->save.es = nested_vmcb->save.es; |
2075 | svm->vmcb->save.cs = nested_vmcb->save.cs; | 2414 | svm->vmcb->save.cs = nested_vmcb->save.cs; |
@@ -2077,7 +2416,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) | |||
2077 | svm->vmcb->save.ds = nested_vmcb->save.ds; | 2416 | svm->vmcb->save.ds = nested_vmcb->save.ds; |
2078 | svm->vmcb->save.gdtr = nested_vmcb->save.gdtr; | 2417 | svm->vmcb->save.gdtr = nested_vmcb->save.gdtr; |
2079 | svm->vmcb->save.idtr = nested_vmcb->save.idtr; | 2418 | svm->vmcb->save.idtr = nested_vmcb->save.idtr; |
2080 | svm->vmcb->save.rflags = nested_vmcb->save.rflags; | 2419 | kvm_set_rflags(&svm->vcpu, nested_vmcb->save.rflags); |
2081 | svm_set_efer(&svm->vcpu, nested_vmcb->save.efer); | 2420 | svm_set_efer(&svm->vcpu, nested_vmcb->save.efer); |
2082 | svm_set_cr0(&svm->vcpu, nested_vmcb->save.cr0); | 2421 | svm_set_cr0(&svm->vcpu, nested_vmcb->save.cr0); |
2083 | svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4); | 2422 | svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4); |
@@ -2107,14 +2446,12 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) | |||
2107 | svm->nested.vmcb_iopm = nested_vmcb->control.iopm_base_pa & ~0x0fffULL; | 2446 | svm->nested.vmcb_iopm = nested_vmcb->control.iopm_base_pa & ~0x0fffULL; |
2108 | 2447 | ||
2109 | /* cache intercepts */ | 2448 | /* cache intercepts */ |
2110 | svm->nested.intercept_cr_read = nested_vmcb->control.intercept_cr_read; | 2449 | svm->nested.intercept_cr = nested_vmcb->control.intercept_cr; |
2111 | svm->nested.intercept_cr_write = nested_vmcb->control.intercept_cr_write; | 2450 | svm->nested.intercept_dr = nested_vmcb->control.intercept_dr; |
2112 | svm->nested.intercept_dr_read = nested_vmcb->control.intercept_dr_read; | ||
2113 | svm->nested.intercept_dr_write = nested_vmcb->control.intercept_dr_write; | ||
2114 | svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions; | 2451 | svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions; |
2115 | svm->nested.intercept = nested_vmcb->control.intercept; | 2452 | svm->nested.intercept = nested_vmcb->control.intercept; |
2116 | 2453 | ||
2117 | force_new_asid(&svm->vcpu); | 2454 | svm_flush_tlb(&svm->vcpu); |
2118 | svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK; | 2455 | svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK; |
2119 | if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK) | 2456 | if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK) |
2120 | svm->vcpu.arch.hflags |= HF_VINTR_MASK; | 2457 | svm->vcpu.arch.hflags |= HF_VINTR_MASK; |
@@ -2123,29 +2460,12 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) | |||
2123 | 2460 | ||
2124 | if (svm->vcpu.arch.hflags & HF_VINTR_MASK) { | 2461 | if (svm->vcpu.arch.hflags & HF_VINTR_MASK) { |
2125 | /* We only want the cr8 intercept bits of the guest */ | 2462 | /* We only want the cr8 intercept bits of the guest */ |
2126 | svm->vmcb->control.intercept_cr_read &= ~INTERCEPT_CR8_MASK; | 2463 | clr_cr_intercept(svm, INTERCEPT_CR8_READ); |
2127 | svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK; | 2464 | clr_cr_intercept(svm, INTERCEPT_CR8_WRITE); |
2128 | } | 2465 | } |
2129 | 2466 | ||
2130 | /* We don't want to see VMMCALLs from a nested guest */ | 2467 | /* We don't want to see VMMCALLs from a nested guest */ |
2131 | svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VMMCALL); | 2468 | clr_intercept(svm, INTERCEPT_VMMCALL); |
2132 | |||
2133 | /* | ||
2134 | * We don't want a nested guest to be more powerful than the guest, so | ||
2135 | * all intercepts are ORed | ||
2136 | */ | ||
2137 | svm->vmcb->control.intercept_cr_read |= | ||
2138 | nested_vmcb->control.intercept_cr_read; | ||
2139 | svm->vmcb->control.intercept_cr_write |= | ||
2140 | nested_vmcb->control.intercept_cr_write; | ||
2141 | svm->vmcb->control.intercept_dr_read |= | ||
2142 | nested_vmcb->control.intercept_dr_read; | ||
2143 | svm->vmcb->control.intercept_dr_write |= | ||
2144 | nested_vmcb->control.intercept_dr_write; | ||
2145 | svm->vmcb->control.intercept_exceptions |= | ||
2146 | nested_vmcb->control.intercept_exceptions; | ||
2147 | |||
2148 | svm->vmcb->control.intercept |= nested_vmcb->control.intercept; | ||
2149 | 2469 | ||
2150 | svm->vmcb->control.lbr_ctl = nested_vmcb->control.lbr_ctl; | 2470 | svm->vmcb->control.lbr_ctl = nested_vmcb->control.lbr_ctl; |
2151 | svm->vmcb->control.int_vector = nested_vmcb->control.int_vector; | 2471 | svm->vmcb->control.int_vector = nested_vmcb->control.int_vector; |
@@ -2156,11 +2476,21 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) | |||
2156 | 2476 | ||
2157 | nested_svm_unmap(page); | 2477 | nested_svm_unmap(page); |
2158 | 2478 | ||
2159 | /* nested_vmcb is our indicator if nested SVM is activated */ | 2479 | /* Enter Guest-Mode */ |
2480 | enter_guest_mode(&svm->vcpu); | ||
2481 | |||
2482 | /* | ||
2483 | * Merge guest and host intercepts - must be called with vcpu in | ||
2484 | * guest-mode to take effect here | ||
2485 | */ | ||
2486 | recalc_intercepts(svm); | ||
2487 | |||
2160 | svm->nested.vmcb = vmcb_gpa; | 2488 | svm->nested.vmcb = vmcb_gpa; |
2161 | 2489 | ||
2162 | enable_gif(svm); | 2490 | enable_gif(svm); |
2163 | 2491 | ||
2492 | mark_all_dirty(svm->vmcb); | ||
2493 | |||
2164 | return true; | 2494 | return true; |
2165 | } | 2495 | } |
2166 | 2496 | ||
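The open-coded block removed above ORed every intercept word by hand; recalc_intercepts(), called once the vcpu has entered guest-mode, keeps the same invariant: while the nested guest runs, the effective intercepts are the union of what the host needs and what L1 asked for, so L1 can never grant its guest fewer exits than the host requires. A hedged standalone model of that union; the struct and field names are made up for the example.

    #include <stdint.h>

    struct icpt {                  /* made-up container for the intercept words */
        uint32_t cr, dr;
        uint32_t exceptions;
        uint64_t insn;
    };

    /* Effective intercepts for the nested guest: host OR guest, per word.
     * This models what the removed block did explicitly and what
     * recalc_intercepts() is responsible for after this change. */
    static void merge_intercepts(struct icpt *eff,
                                 const struct icpt *host, const struct icpt *guest)
    {
        eff->cr         = host->cr         | guest->cr;
        eff->dr         = host->dr         | guest->dr;
        eff->exceptions = host->exceptions | guest->exceptions;
        eff->insn       = host->insn       | guest->insn;
    }

    int main(void)
    {
        struct icpt host  = { .insn = 1ULL << 0 };  /* host keeps one instruction exit */
        struct icpt guest = { .cr = 1u << 16 };     /* L1 wants a CR0 write exit */
        struct icpt eff;

        merge_intercepts(&eff, &host, &guest);
        return (eff.insn && eff.cr) ? 0 : 1;
    }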
@@ -2188,13 +2518,13 @@ static int vmload_interception(struct vcpu_svm *svm) | |||
2188 | if (nested_svm_check_permissions(svm)) | 2518 | if (nested_svm_check_permissions(svm)) |
2189 | return 1; | 2519 | return 1; |
2190 | 2520 | ||
2191 | svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; | ||
2192 | skip_emulated_instruction(&svm->vcpu); | ||
2193 | |||
2194 | nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page); | 2521 | nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page); |
2195 | if (!nested_vmcb) | 2522 | if (!nested_vmcb) |
2196 | return 1; | 2523 | return 1; |
2197 | 2524 | ||
2525 | svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; | ||
2526 | skip_emulated_instruction(&svm->vcpu); | ||
2527 | |||
2198 | nested_svm_vmloadsave(nested_vmcb, svm->vmcb); | 2528 | nested_svm_vmloadsave(nested_vmcb, svm->vmcb); |
2199 | nested_svm_unmap(page); | 2529 | nested_svm_unmap(page); |
2200 | 2530 | ||
@@ -2209,13 +2539,13 @@ static int vmsave_interception(struct vcpu_svm *svm) | |||
2209 | if (nested_svm_check_permissions(svm)) | 2539 | if (nested_svm_check_permissions(svm)) |
2210 | return 1; | 2540 | return 1; |
2211 | 2541 | ||
2212 | svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; | ||
2213 | skip_emulated_instruction(&svm->vcpu); | ||
2214 | |||
2215 | nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page); | 2542 | nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page); |
2216 | if (!nested_vmcb) | 2543 | if (!nested_vmcb) |
2217 | return 1; | 2544 | return 1; |
2218 | 2545 | ||
2546 | svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; | ||
2547 | skip_emulated_instruction(&svm->vcpu); | ||
2548 | |||
2219 | nested_svm_vmloadsave(svm->vmcb, nested_vmcb); | 2549 | nested_svm_vmloadsave(svm->vmcb, nested_vmcb); |
2220 | nested_svm_unmap(page); | 2550 | nested_svm_unmap(page); |
2221 | 2551 | ||
@@ -2227,8 +2557,8 @@ static int vmrun_interception(struct vcpu_svm *svm) | |||
2227 | if (nested_svm_check_permissions(svm)) | 2557 | if (nested_svm_check_permissions(svm)) |
2228 | return 1; | 2558 | return 1; |
2229 | 2559 | ||
2230 | svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; | 2560 | /* Save rip after vmrun instruction */ |
2231 | skip_emulated_instruction(&svm->vcpu); | 2561 | kvm_rip_write(&svm->vcpu, kvm_rip_read(&svm->vcpu) + 3); |
2232 | 2562 | ||
2233 | if (!nested_svm_vmrun(svm)) | 2563 | if (!nested_svm_vmrun(svm)) |
2234 | return 1; | 2564 | return 1; |
@@ -2257,6 +2587,7 @@ static int stgi_interception(struct vcpu_svm *svm) | |||
2257 | 2587 | ||
2258 | svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; | 2588 | svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; |
2259 | skip_emulated_instruction(&svm->vcpu); | 2589 | skip_emulated_instruction(&svm->vcpu); |
2590 | kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); | ||
2260 | 2591 | ||
2261 | enable_gif(svm); | 2592 | enable_gif(svm); |
2262 | 2593 | ||
@@ -2277,6 +2608,8 @@ static int clgi_interception(struct vcpu_svm *svm) | |||
2277 | svm_clear_vintr(svm); | 2608 | svm_clear_vintr(svm); |
2278 | svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; | 2609 | svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; |
2279 | 2610 | ||
2611 | mark_dirty(svm->vmcb, VMCB_INTR); | ||
2612 | |||
2280 | return 1; | 2613 | return 1; |
2281 | } | 2614 | } |
2282 | 2615 | ||
@@ -2303,6 +2636,19 @@ static int skinit_interception(struct vcpu_svm *svm) | |||
2303 | return 1; | 2636 | return 1; |
2304 | } | 2637 | } |
2305 | 2638 | ||
2639 | static int xsetbv_interception(struct vcpu_svm *svm) | ||
2640 | { | ||
2641 | u64 new_bv = kvm_read_edx_eax(&svm->vcpu); | ||
2642 | u32 index = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX); | ||
2643 | |||
2644 | if (kvm_set_xcr(&svm->vcpu, index, new_bv) == 0) { | ||
2645 | svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; | ||
2646 | skip_emulated_instruction(&svm->vcpu); | ||
2647 | } | ||
2648 | |||
2649 | return 1; | ||
2650 | } | ||
2651 | |||
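xsetbv_interception() rebuilds the XSETBV operands exactly as the instruction defines them: ECX selects the extended control register and EDX:EAX carries the 64-bit value, which is what kvm_read_edx_eax() returns. A tiny hedged sketch of that packing; combine_edx_eax() below is an illustrative stand-in, not the kernel helper.

    #include <stdint.h>
    #include <stdio.h>

    /* EDX:EAX -> 64-bit value, the operand layout XSETBV uses. */
    static uint64_t combine_edx_eax(uint32_t edx, uint32_t eax)
    {
        return ((uint64_t)edx << 32) | eax;
    }

    int main(void)
    {
        uint32_t xcr_index = 0;                           /* XCR0 */
        uint64_t new_bv = combine_edx_eax(0x0, 0x7);      /* x87 | SSE | AVX */

        printf("xsetbv: xcr%u <- %#llx\n",
               (unsigned)xcr_index, (unsigned long long)new_bv);
        return 0;
    }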
2306 | static int invalid_op_interception(struct vcpu_svm *svm) | 2652 | static int invalid_op_interception(struct vcpu_svm *svm) |
2307 | { | 2653 | { |
2308 | kvm_queue_exception(&svm->vcpu, UD_VECTOR); | 2654 | kvm_queue_exception(&svm->vcpu, UD_VECTOR); |
@@ -2384,34 +2730,162 @@ static int cpuid_interception(struct vcpu_svm *svm) | |||
2384 | static int iret_interception(struct vcpu_svm *svm) | 2730 | static int iret_interception(struct vcpu_svm *svm) |
2385 | { | 2731 | { |
2386 | ++svm->vcpu.stat.nmi_window_exits; | 2732 | ++svm->vcpu.stat.nmi_window_exits; |
2387 | svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_IRET); | 2733 | clr_intercept(svm, INTERCEPT_IRET); |
2388 | svm->vcpu.arch.hflags |= HF_IRET_MASK; | 2734 | svm->vcpu.arch.hflags |= HF_IRET_MASK; |
2735 | svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu); | ||
2389 | return 1; | 2736 | return 1; |
2390 | } | 2737 | } |
2391 | 2738 | ||
2392 | static int invlpg_interception(struct vcpu_svm *svm) | 2739 | static int invlpg_interception(struct vcpu_svm *svm) |
2393 | { | 2740 | { |
2394 | return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE; | 2741 | if (!static_cpu_has(X86_FEATURE_DECODEASSISTS)) |
2742 | return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE; | ||
2743 | |||
2744 | kvm_mmu_invlpg(&svm->vcpu, svm->vmcb->control.exit_info_1); | ||
2745 | skip_emulated_instruction(&svm->vcpu); | ||
2746 | return 1; | ||
2395 | } | 2747 | } |
2396 | 2748 | ||
2397 | static int emulate_on_interception(struct vcpu_svm *svm) | 2749 | static int emulate_on_interception(struct vcpu_svm *svm) |
2398 | { | 2750 | { |
2399 | return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE; | 2751 | return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE; |
2752 | } | ||
2753 | |||
2754 | bool check_selective_cr0_intercepted(struct vcpu_svm *svm, unsigned long val) | ||
2755 | { | ||
2756 | unsigned long cr0 = svm->vcpu.arch.cr0; | ||
2757 | bool ret = false; | ||
2758 | u64 intercept; | ||
2759 | |||
2760 | intercept = svm->nested.intercept; | ||
2761 | |||
2762 | if (!is_guest_mode(&svm->vcpu) || | ||
2763 | (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0)))) | ||
2764 | return false; | ||
2765 | |||
2766 | cr0 &= ~SVM_CR0_SELECTIVE_MASK; | ||
2767 | val &= ~SVM_CR0_SELECTIVE_MASK; | ||
2768 | |||
2769 | if (cr0 ^ val) { | ||
2770 | svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE; | ||
2771 | ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE); | ||
2772 | } | ||
2773 | |||
2774 | return ret; | ||
2775 | } | ||
2776 | |||
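check_selective_cr0_intercepted() forwards a CR0 write to L1 as SVM_EXIT_CR0_SEL_WRITE only when a bit outside SVM_CR0_SELECTIVE_MASK actually changes; writes that merely toggle the masked bits stay with the host. A hedged sketch of that comparison; the mask value (TS and MP) is an assumption for illustration.

    #include <stdbool.h>
    #include <stdio.h>

    /* Assumed for illustration: the selective-write intercept ignores TS and MP. */
    #define CR0_MP (1ul << 1)
    #define CR0_TS (1ul << 3)
    #define CR0_SELECTIVE_MASK (CR0_TS | CR0_MP)

    /* True if writing 'val' over 'cur' changes a bit the selective intercept watches. */
    static bool selective_cr0_write_trips(unsigned long cur, unsigned long val)
    {
        cur &= ~CR0_SELECTIVE_MASK;
        val &= ~CR0_SELECTIVE_MASK;
        return (cur ^ val) != 0;
    }

    int main(void)
    {
        /* Toggling only TS must not trip the intercept; clearing PE must. */
        printf("%d %d\n",
               selective_cr0_write_trips(0x11, 0x11 | CR0_TS),
               selective_cr0_write_trips(0x11, 0x10));
        return 0;
    }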
2777 | #define CR_VALID (1ULL << 63) | ||
2778 | |||
2779 | static int cr_interception(struct vcpu_svm *svm) | ||
2780 | { | ||
2781 | int reg, cr; | ||
2782 | unsigned long val; | ||
2783 | int err; | ||
2784 | |||
2785 | if (!static_cpu_has(X86_FEATURE_DECODEASSISTS)) | ||
2786 | return emulate_on_interception(svm); | ||
2787 | |||
2788 | if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0)) | ||
2789 | return emulate_on_interception(svm); | ||
2790 | |||
2791 | reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK; | ||
2792 | cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0; | ||
2793 | |||
2794 | err = 0; | ||
2795 | if (cr >= 16) { /* mov to cr */ | ||
2796 | cr -= 16; | ||
2797 | val = kvm_register_read(&svm->vcpu, reg); | ||
2798 | switch (cr) { | ||
2799 | case 0: | ||
2800 | if (!check_selective_cr0_intercepted(svm, val)) | ||
2801 | err = kvm_set_cr0(&svm->vcpu, val); | ||
2802 | else | ||
2803 | return 1; | ||
2804 | |||
2805 | break; | ||
2806 | case 3: | ||
2807 | err = kvm_set_cr3(&svm->vcpu, val); | ||
2808 | break; | ||
2809 | case 4: | ||
2810 | err = kvm_set_cr4(&svm->vcpu, val); | ||
2811 | break; | ||
2812 | case 8: | ||
2813 | err = kvm_set_cr8(&svm->vcpu, val); | ||
2814 | break; | ||
2815 | default: | ||
2816 | WARN(1, "unhandled write to CR%d", cr); | ||
2817 | kvm_queue_exception(&svm->vcpu, UD_VECTOR); | ||
2818 | return 1; | ||
2819 | } | ||
2820 | } else { /* mov from cr */ | ||
2821 | switch (cr) { | ||
2822 | case 0: | ||
2823 | val = kvm_read_cr0(&svm->vcpu); | ||
2824 | break; | ||
2825 | case 2: | ||
2826 | val = svm->vcpu.arch.cr2; | ||
2827 | break; | ||
2828 | case 3: | ||
2829 | val = kvm_read_cr3(&svm->vcpu); | ||
2830 | break; | ||
2831 | case 4: | ||
2832 | val = kvm_read_cr4(&svm->vcpu); | ||
2833 | break; | ||
2834 | case 8: | ||
2835 | val = kvm_get_cr8(&svm->vcpu); | ||
2836 | break; | ||
2837 | default: | ||
2838 | WARN(1, "unhandled read from CR%d", cr); | ||
2839 | kvm_queue_exception(&svm->vcpu, UD_VECTOR); | ||
2840 | return 1; | ||
2841 | } | ||
2842 | kvm_register_write(&svm->vcpu, reg, val); | ||
2843 | } | ||
2844 | kvm_complete_insn_gp(&svm->vcpu, err); | ||
2845 | |||
2846 | return 1; | ||
2847 | } | ||
2848 | |||
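With decode assists, cr_interception() gets everything from the VMCB: the GPR index sits in the low bits of exit_info_1 (SVM_EXITINFO_REG_MASK) and the CR number plus access direction follow from the exit code, since each write exit is 16 above the matching read exit. A hedged sketch of that decode step; the constant values here are illustrative assumptions.

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define EXIT_READ_CR0      0x000u   /* assumed exit-code layout */
    #define EXITINFO_REG_MASK  0x0fu    /* assumed GPR field in exit_info_1 */

    struct cr_access { unsigned cr; bool is_write; unsigned gpr; };

    static struct cr_access decode_cr_exit(uint32_t exit_code, uint64_t exit_info_1)
    {
        struct cr_access a;
        unsigned idx = exit_code - EXIT_READ_CR0;

        a.is_write = idx >= 16;             /* write exits follow the 16 read exits */
        a.cr  = a.is_write ? idx - 16 : idx;
        a.gpr = (unsigned)(exit_info_1 & EXITINFO_REG_MASK);
        return a;
    }

    int main(void)
    {
        struct cr_access a = decode_cr_exit(0x014, 0x3);  /* a CR4 write via GPR 3 */

        printf("cr%u %s, gpr=%u\n", a.cr, a.is_write ? "write" : "read", a.gpr);
        return 0;
    }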
2849 | static int dr_interception(struct vcpu_svm *svm) | ||
2850 | { | ||
2851 | int reg, dr; | ||
2852 | unsigned long val; | ||
2853 | int err; | ||
2854 | |||
2855 | if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS)) | ||
2856 | return emulate_on_interception(svm); | ||
2857 | |||
2858 | reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK; | ||
2859 | dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0; | ||
2860 | |||
2861 | if (dr >= 16) { /* mov to DRn */ | ||
2862 | val = kvm_register_read(&svm->vcpu, reg); | ||
2863 | kvm_set_dr(&svm->vcpu, dr - 16, val); | ||
2864 | } else { | ||
2865 | err = kvm_get_dr(&svm->vcpu, dr, &val); | ||
2866 | if (!err) | ||
2867 | kvm_register_write(&svm->vcpu, reg, val); | ||
2868 | } | ||
2869 | |||
2870 | skip_emulated_instruction(&svm->vcpu); | ||
2871 | |||
2872 | return 1; | ||
2400 | } | 2873 | } |
2401 | 2874 | ||
2402 | static int cr8_write_interception(struct vcpu_svm *svm) | 2875 | static int cr8_write_interception(struct vcpu_svm *svm) |
2403 | { | 2876 | { |
2404 | struct kvm_run *kvm_run = svm->vcpu.run; | 2877 | struct kvm_run *kvm_run = svm->vcpu.run; |
2878 | int r; | ||
2405 | 2879 | ||
2406 | u8 cr8_prev = kvm_get_cr8(&svm->vcpu); | 2880 | u8 cr8_prev = kvm_get_cr8(&svm->vcpu); |
2407 | /* instruction emulation calls kvm_set_cr8() */ | 2881 | /* instruction emulation calls kvm_set_cr8() */ |
2408 | emulate_instruction(&svm->vcpu, 0, 0, 0); | 2882 | r = cr_interception(svm); |
2409 | if (irqchip_in_kernel(svm->vcpu.kvm)) { | 2883 | if (irqchip_in_kernel(svm->vcpu.kvm)) { |
2410 | svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK; | 2884 | clr_cr_intercept(svm, INTERCEPT_CR8_WRITE); |
2411 | return 1; | 2885 | return r; |
2412 | } | 2886 | } |
2413 | if (cr8_prev <= kvm_get_cr8(&svm->vcpu)) | 2887 | if (cr8_prev <= kvm_get_cr8(&svm->vcpu)) |
2414 | return 1; | 2888 | return r; |
2415 | kvm_run->exit_reason = KVM_EXIT_SET_TPR; | 2889 | kvm_run->exit_reason = KVM_EXIT_SET_TPR; |
2416 | return 0; | 2890 | return 0; |
2417 | } | 2891 | } |
@@ -2422,14 +2896,11 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) | |||
2422 | 2896 | ||
2423 | switch (ecx) { | 2897 | switch (ecx) { |
2424 | case MSR_IA32_TSC: { | 2898 | case MSR_IA32_TSC: { |
2425 | u64 tsc_offset; | 2899 | struct vmcb *vmcb = get_host_vmcb(svm); |
2426 | 2900 | ||
2427 | if (is_nested(svm)) | 2901 | *data = vmcb->control.tsc_offset + |
2428 | tsc_offset = svm->nested.hsave->control.tsc_offset; | 2902 | svm_scale_tsc(vcpu, native_read_tsc()); |
2429 | else | ||
2430 | tsc_offset = svm->vmcb->control.tsc_offset; | ||
2431 | 2903 | ||
2432 | *data = tsc_offset + native_read_tsc(); | ||
2433 | break; | 2904 | break; |
2434 | } | 2905 | } |
2435 | case MSR_STAR: | 2906 | case MSR_STAR: |
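The MSR_IA32_TSC read now derives the guest view from the host VMCB's tsc_offset plus a scaled host TSC, so the same expression is correct while a nested guest runs (get_host_vmcb() then resolves to hsave). A hedged sketch of the arithmetic; the 8.32 fixed-point ratio format is modeled here purely for the example and is not a claim about svm_scale_tsc() itself.

    #include <stdint.h>

    /* Guest TSC as the read path computes it: scale the host TSC, then add
     * the VMCB's tsc_offset. The ratio is modeled as 8.32 fixed point. */
    static uint64_t guest_tsc(uint64_t host_tsc, uint64_t ratio_8_32,
                              int64_t tsc_offset)
    {
        uint64_t scaled = (uint64_t)(((unsigned __int128)host_tsc * ratio_8_32) >> 32);

        return scaled + (uint64_t)tsc_offset;
    }

    int main(void)
    {
        /* Ratio 1.0 with an offset of -100 ticks: guest sees host_tsc - 100. */
        return guest_tsc(1000, 1ULL << 32, -100) == 900 ? 0 : 1;
    }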
@@ -2542,20 +3013,9 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) | |||
2542 | struct vcpu_svm *svm = to_svm(vcpu); | 3013 | struct vcpu_svm *svm = to_svm(vcpu); |
2543 | 3014 | ||
2544 | switch (ecx) { | 3015 | switch (ecx) { |
2545 | case MSR_IA32_TSC: { | 3016 | case MSR_IA32_TSC: |
2546 | u64 tsc_offset = data - native_read_tsc(); | 3017 | kvm_write_tsc(vcpu, data); |
2547 | u64 g_tsc_offset = 0; | ||
2548 | |||
2549 | if (is_nested(svm)) { | ||
2550 | g_tsc_offset = svm->vmcb->control.tsc_offset - | ||
2551 | svm->nested.hsave->control.tsc_offset; | ||
2552 | svm->nested.hsave->control.tsc_offset = tsc_offset; | ||
2553 | } | ||
2554 | |||
2555 | svm->vmcb->control.tsc_offset = tsc_offset + g_tsc_offset; | ||
2556 | |||
2557 | break; | 3018 | break; |
2558 | } | ||
2559 | case MSR_STAR: | 3019 | case MSR_STAR: |
2560 | svm->vmcb->save.star = data; | 3020 | svm->vmcb->save.star = data; |
2561 | break; | 3021 | break; |
@@ -2585,7 +3045,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) | |||
2585 | svm->vmcb->save.sysenter_esp = data; | 3045 | svm->vmcb->save.sysenter_esp = data; |
2586 | break; | 3046 | break; |
2587 | case MSR_IA32_DEBUGCTLMSR: | 3047 | case MSR_IA32_DEBUGCTLMSR: |
2588 | if (!svm_has(SVM_FEATURE_LBRV)) { | 3048 | if (!boot_cpu_has(X86_FEATURE_LBRV)) { |
2589 | pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n", | 3049 | pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n", |
2590 | __func__, data); | 3050 | __func__, data); |
2591 | break; | 3051 | break; |
@@ -2594,6 +3054,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) | |||
2594 | return 1; | 3054 | return 1; |
2595 | 3055 | ||
2596 | svm->vmcb->save.dbgctl = data; | 3056 | svm->vmcb->save.dbgctl = data; |
3057 | mark_dirty(svm->vmcb, VMCB_LBR); | ||
2597 | if (data & (1ULL<<0)) | 3058 | if (data & (1ULL<<0)) |
2598 | svm_enable_lbrv(svm); | 3059 | svm_enable_lbrv(svm); |
2599 | else | 3060 | else |
@@ -2643,8 +3104,10 @@ static int interrupt_window_interception(struct vcpu_svm *svm) | |||
2643 | { | 3104 | { |
2644 | struct kvm_run *kvm_run = svm->vcpu.run; | 3105 | struct kvm_run *kvm_run = svm->vcpu.run; |
2645 | 3106 | ||
3107 | kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); | ||
2646 | svm_clear_vintr(svm); | 3108 | svm_clear_vintr(svm); |
2647 | svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; | 3109 | svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; |
3110 | mark_dirty(svm->vmcb, VMCB_INTR); | ||
2648 | /* | 3111 | /* |
2649 | * If the user space waits to inject interrupts, exit as soon as | 3112 | * If the user space waits to inject interrupts, exit as soon as |
2650 | * possible | 3113 | * possible |
@@ -2667,31 +3130,31 @@ static int pause_interception(struct vcpu_svm *svm) | |||
2667 | } | 3130 | } |
2668 | 3131 | ||
2669 | static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { | 3132 | static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { |
2670 | [SVM_EXIT_READ_CR0] = emulate_on_interception, | 3133 | [SVM_EXIT_READ_CR0] = cr_interception, |
2671 | [SVM_EXIT_READ_CR3] = emulate_on_interception, | 3134 | [SVM_EXIT_READ_CR3] = cr_interception, |
2672 | [SVM_EXIT_READ_CR4] = emulate_on_interception, | 3135 | [SVM_EXIT_READ_CR4] = cr_interception, |
2673 | [SVM_EXIT_READ_CR8] = emulate_on_interception, | 3136 | [SVM_EXIT_READ_CR8] = cr_interception, |
2674 | [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, | 3137 | [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, |
2675 | [SVM_EXIT_WRITE_CR0] = emulate_on_interception, | 3138 | [SVM_EXIT_WRITE_CR0] = cr_interception, |
2676 | [SVM_EXIT_WRITE_CR3] = emulate_on_interception, | 3139 | [SVM_EXIT_WRITE_CR3] = cr_interception, |
2677 | [SVM_EXIT_WRITE_CR4] = emulate_on_interception, | 3140 | [SVM_EXIT_WRITE_CR4] = cr_interception, |
2678 | [SVM_EXIT_WRITE_CR8] = cr8_write_interception, | 3141 | [SVM_EXIT_WRITE_CR8] = cr8_write_interception, |
2679 | [SVM_EXIT_READ_DR0] = emulate_on_interception, | 3142 | [SVM_EXIT_READ_DR0] = dr_interception, |
2680 | [SVM_EXIT_READ_DR1] = emulate_on_interception, | 3143 | [SVM_EXIT_READ_DR1] = dr_interception, |
2681 | [SVM_EXIT_READ_DR2] = emulate_on_interception, | 3144 | [SVM_EXIT_READ_DR2] = dr_interception, |
2682 | [SVM_EXIT_READ_DR3] = emulate_on_interception, | 3145 | [SVM_EXIT_READ_DR3] = dr_interception, |
2683 | [SVM_EXIT_READ_DR4] = emulate_on_interception, | 3146 | [SVM_EXIT_READ_DR4] = dr_interception, |
2684 | [SVM_EXIT_READ_DR5] = emulate_on_interception, | 3147 | [SVM_EXIT_READ_DR5] = dr_interception, |
2685 | [SVM_EXIT_READ_DR6] = emulate_on_interception, | 3148 | [SVM_EXIT_READ_DR6] = dr_interception, |
2686 | [SVM_EXIT_READ_DR7] = emulate_on_interception, | 3149 | [SVM_EXIT_READ_DR7] = dr_interception, |
2687 | [SVM_EXIT_WRITE_DR0] = emulate_on_interception, | 3150 | [SVM_EXIT_WRITE_DR0] = dr_interception, |
2688 | [SVM_EXIT_WRITE_DR1] = emulate_on_interception, | 3151 | [SVM_EXIT_WRITE_DR1] = dr_interception, |
2689 | [SVM_EXIT_WRITE_DR2] = emulate_on_interception, | 3152 | [SVM_EXIT_WRITE_DR2] = dr_interception, |
2690 | [SVM_EXIT_WRITE_DR3] = emulate_on_interception, | 3153 | [SVM_EXIT_WRITE_DR3] = dr_interception, |
2691 | [SVM_EXIT_WRITE_DR4] = emulate_on_interception, | 3154 | [SVM_EXIT_WRITE_DR4] = dr_interception, |
2692 | [SVM_EXIT_WRITE_DR5] = emulate_on_interception, | 3155 | [SVM_EXIT_WRITE_DR5] = dr_interception, |
2693 | [SVM_EXIT_WRITE_DR6] = emulate_on_interception, | 3156 | [SVM_EXIT_WRITE_DR6] = dr_interception, |
2694 | [SVM_EXIT_WRITE_DR7] = emulate_on_interception, | 3157 | [SVM_EXIT_WRITE_DR7] = dr_interception, |
2695 | [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception, | 3158 | [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception, |
2696 | [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception, | 3159 | [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception, |
2697 | [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception, | 3160 | [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception, |
@@ -2724,100 +3187,121 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { | |||
2724 | [SVM_EXIT_WBINVD] = emulate_on_interception, | 3187 | [SVM_EXIT_WBINVD] = emulate_on_interception, |
2725 | [SVM_EXIT_MONITOR] = invalid_op_interception, | 3188 | [SVM_EXIT_MONITOR] = invalid_op_interception, |
2726 | [SVM_EXIT_MWAIT] = invalid_op_interception, | 3189 | [SVM_EXIT_MWAIT] = invalid_op_interception, |
3190 | [SVM_EXIT_XSETBV] = xsetbv_interception, | ||
2727 | [SVM_EXIT_NPF] = pf_interception, | 3191 | [SVM_EXIT_NPF] = pf_interception, |
2728 | }; | 3192 | }; |
2729 | 3193 | ||
2730 | void dump_vmcb(struct kvm_vcpu *vcpu) | 3194 | static void dump_vmcb(struct kvm_vcpu *vcpu) |
2731 | { | 3195 | { |
2732 | struct vcpu_svm *svm = to_svm(vcpu); | 3196 | struct vcpu_svm *svm = to_svm(vcpu); |
2733 | struct vmcb_control_area *control = &svm->vmcb->control; | 3197 | struct vmcb_control_area *control = &svm->vmcb->control; |
2734 | struct vmcb_save_area *save = &svm->vmcb->save; | 3198 | struct vmcb_save_area *save = &svm->vmcb->save; |
2735 | 3199 | ||
2736 | pr_err("VMCB Control Area:\n"); | 3200 | pr_err("VMCB Control Area:\n"); |
2737 | pr_err("cr_read: %04x\n", control->intercept_cr_read); | 3201 | pr_err("%-20s%04x\n", "cr_read:", control->intercept_cr & 0xffff); |
2738 | pr_err("cr_write: %04x\n", control->intercept_cr_write); | 3202 | pr_err("%-20s%04x\n", "cr_write:", control->intercept_cr >> 16); |
2739 | pr_err("dr_read: %04x\n", control->intercept_dr_read); | 3203 | pr_err("%-20s%04x\n", "dr_read:", control->intercept_dr & 0xffff); |
2740 | pr_err("dr_write: %04x\n", control->intercept_dr_write); | 3204 | pr_err("%-20s%04x\n", "dr_write:", control->intercept_dr >> 16); |
2741 | pr_err("exceptions: %08x\n", control->intercept_exceptions); | 3205 | pr_err("%-20s%08x\n", "exceptions:", control->intercept_exceptions); |
2742 | pr_err("intercepts: %016llx\n", control->intercept); | 3206 | pr_err("%-20s%016llx\n", "intercepts:", control->intercept); |
2743 | pr_err("pause filter count: %d\n", control->pause_filter_count); | 3207 | pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count); |
2744 | pr_err("iopm_base_pa: %016llx\n", control->iopm_base_pa); | 3208 | pr_err("%-20s%016llx\n", "iopm_base_pa:", control->iopm_base_pa); |
2745 | pr_err("msrpm_base_pa: %016llx\n", control->msrpm_base_pa); | 3209 | pr_err("%-20s%016llx\n", "msrpm_base_pa:", control->msrpm_base_pa); |
2746 | pr_err("tsc_offset: %016llx\n", control->tsc_offset); | 3210 | pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset); |
2747 | pr_err("asid: %d\n", control->asid); | 3211 | pr_err("%-20s%d\n", "asid:", control->asid); |
2748 | pr_err("tlb_ctl: %d\n", control->tlb_ctl); | 3212 | pr_err("%-20s%d\n", "tlb_ctl:", control->tlb_ctl); |
2749 | pr_err("int_ctl: %08x\n", control->int_ctl); | 3213 | pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl); |
2750 | pr_err("int_vector: %08x\n", control->int_vector); | 3214 | pr_err("%-20s%08x\n", "int_vector:", control->int_vector); |
2751 | pr_err("int_state: %08x\n", control->int_state); | 3215 | pr_err("%-20s%08x\n", "int_state:", control->int_state); |
2752 | pr_err("exit_code: %08x\n", control->exit_code); | 3216 | pr_err("%-20s%08x\n", "exit_code:", control->exit_code); |
2753 | pr_err("exit_info1: %016llx\n", control->exit_info_1); | 3217 | pr_err("%-20s%016llx\n", "exit_info1:", control->exit_info_1); |
2754 | pr_err("exit_info2: %016llx\n", control->exit_info_2); | 3218 | pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2); |
2755 | pr_err("exit_int_info: %08x\n", control->exit_int_info); | 3219 | pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info); |
2756 | pr_err("exit_int_info_err: %08x\n", control->exit_int_info_err); | 3220 | pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err); |
2757 | pr_err("nested_ctl: %lld\n", control->nested_ctl); | 3221 | pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl); |
2758 | pr_err("nested_cr3: %016llx\n", control->nested_cr3); | 3222 | pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3); |
2759 | pr_err("event_inj: %08x\n", control->event_inj); | 3223 | pr_err("%-20s%08x\n", "event_inj:", control->event_inj); |
2760 | pr_err("event_inj_err: %08x\n", control->event_inj_err); | 3224 | pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err); |
2761 | pr_err("lbr_ctl: %lld\n", control->lbr_ctl); | 3225 | pr_err("%-20s%lld\n", "lbr_ctl:", control->lbr_ctl); |
2762 | pr_err("next_rip: %016llx\n", control->next_rip); | 3226 | pr_err("%-20s%016llx\n", "next_rip:", control->next_rip); |
2763 | pr_err("VMCB State Save Area:\n"); | 3227 | pr_err("VMCB State Save Area:\n"); |
2764 | pr_err("es: s: %04x a: %04x l: %08x b: %016llx\n", | 3228 | pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", |
2765 | save->es.selector, save->es.attrib, | 3229 | "es:", |
2766 | save->es.limit, save->es.base); | 3230 | save->es.selector, save->es.attrib, |
2767 | pr_err("cs: s: %04x a: %04x l: %08x b: %016llx\n", | 3231 | save->es.limit, save->es.base); |
2768 | save->cs.selector, save->cs.attrib, | 3232 | pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", |
2769 | save->cs.limit, save->cs.base); | 3233 | "cs:", |
2770 | pr_err("ss: s: %04x a: %04x l: %08x b: %016llx\n", | 3234 | save->cs.selector, save->cs.attrib, |
2771 | save->ss.selector, save->ss.attrib, | 3235 | save->cs.limit, save->cs.base); |
2772 | save->ss.limit, save->ss.base); | 3236 | pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", |
2773 | pr_err("ds: s: %04x a: %04x l: %08x b: %016llx\n", | 3237 | "ss:", |
2774 | save->ds.selector, save->ds.attrib, | 3238 | save->ss.selector, save->ss.attrib, |
2775 | save->ds.limit, save->ds.base); | 3239 | save->ss.limit, save->ss.base); |
2776 | pr_err("fs: s: %04x a: %04x l: %08x b: %016llx\n", | 3240 | pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", |
2777 | save->fs.selector, save->fs.attrib, | 3241 | "ds:", |
2778 | save->fs.limit, save->fs.base); | 3242 | save->ds.selector, save->ds.attrib, |
2779 | pr_err("gs: s: %04x a: %04x l: %08x b: %016llx\n", | 3243 | save->ds.limit, save->ds.base); |
2780 | save->gs.selector, save->gs.attrib, | 3244 | pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", |
2781 | save->gs.limit, save->gs.base); | 3245 | "fs:", |
2782 | pr_err("gdtr: s: %04x a: %04x l: %08x b: %016llx\n", | 3246 | save->fs.selector, save->fs.attrib, |
2783 | save->gdtr.selector, save->gdtr.attrib, | 3247 | save->fs.limit, save->fs.base); |
2784 | save->gdtr.limit, save->gdtr.base); | 3248 | pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", |
2785 | pr_err("ldtr: s: %04x a: %04x l: %08x b: %016llx\n", | 3249 | "gs:", |
2786 | save->ldtr.selector, save->ldtr.attrib, | 3250 | save->gs.selector, save->gs.attrib, |
2787 | save->ldtr.limit, save->ldtr.base); | 3251 | save->gs.limit, save->gs.base); |
2788 | pr_err("idtr: s: %04x a: %04x l: %08x b: %016llx\n", | 3252 | pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", |
2789 | save->idtr.selector, save->idtr.attrib, | 3253 | "gdtr:", |
2790 | save->idtr.limit, save->idtr.base); | 3254 | save->gdtr.selector, save->gdtr.attrib, |
2791 | pr_err("tr: s: %04x a: %04x l: %08x b: %016llx\n", | 3255 | save->gdtr.limit, save->gdtr.base); |
2792 | save->tr.selector, save->tr.attrib, | 3256 | pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", |
2793 | save->tr.limit, save->tr.base); | 3257 | "ldtr:", |
3258 | save->ldtr.selector, save->ldtr.attrib, | ||
3259 | save->ldtr.limit, save->ldtr.base); | ||
3260 | pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", | ||
3261 | "idtr:", | ||
3262 | save->idtr.selector, save->idtr.attrib, | ||
3263 | save->idtr.limit, save->idtr.base); | ||
3264 | pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", | ||
3265 | "tr:", | ||
3266 | save->tr.selector, save->tr.attrib, | ||
3267 | save->tr.limit, save->tr.base); | ||
2794 | pr_err("cpl: %d efer: %016llx\n", | 3268 | pr_err("cpl: %d efer: %016llx\n", |
2795 | save->cpl, save->efer); | 3269 | save->cpl, save->efer); |
2796 | pr_err("cr0: %016llx cr2: %016llx\n", | 3270 | pr_err("%-15s %016llx %-13s %016llx\n", |
2797 | save->cr0, save->cr2); | 3271 | "cr0:", save->cr0, "cr2:", save->cr2); |
2798 | pr_err("cr3: %016llx cr4: %016llx\n", | 3272 | pr_err("%-15s %016llx %-13s %016llx\n", |
2799 | save->cr3, save->cr4); | 3273 | "cr3:", save->cr3, "cr4:", save->cr4); |
2800 | pr_err("dr6: %016llx dr7: %016llx\n", | 3274 | pr_err("%-15s %016llx %-13s %016llx\n", |
2801 | save->dr6, save->dr7); | 3275 | "dr6:", save->dr6, "dr7:", save->dr7); |
2802 | pr_err("rip: %016llx rflags: %016llx\n", | 3276 | pr_err("%-15s %016llx %-13s %016llx\n", |
2803 | save->rip, save->rflags); | 3277 | "rip:", save->rip, "rflags:", save->rflags); |
2804 | pr_err("rsp: %016llx rax: %016llx\n", | 3278 | pr_err("%-15s %016llx %-13s %016llx\n", |
2805 | save->rsp, save->rax); | 3279 | "rsp:", save->rsp, "rax:", save->rax); |
2806 | pr_err("star: %016llx lstar: %016llx\n", | 3280 | pr_err("%-15s %016llx %-13s %016llx\n", |
2807 | save->star, save->lstar); | 3281 | "star:", save->star, "lstar:", save->lstar); |
2808 | pr_err("cstar: %016llx sfmask: %016llx\n", | 3282 | pr_err("%-15s %016llx %-13s %016llx\n", |
2809 | save->cstar, save->sfmask); | 3283 | "cstar:", save->cstar, "sfmask:", save->sfmask); |
2810 | pr_err("kernel_gs_base: %016llx sysenter_cs: %016llx\n", | 3284 | pr_err("%-15s %016llx %-13s %016llx\n", |
2811 | save->kernel_gs_base, save->sysenter_cs); | 3285 | "kernel_gs_base:", save->kernel_gs_base, |
2812 | pr_err("sysenter_esp: %016llx sysenter_eip: %016llx\n", | 3286 | "sysenter_cs:", save->sysenter_cs); |
2813 | save->sysenter_esp, save->sysenter_eip); | 3287 | pr_err("%-15s %016llx %-13s %016llx\n", |
2814 | pr_err("gpat: %016llx dbgctl: %016llx\n", | 3288 | "sysenter_esp:", save->sysenter_esp, |
2815 | save->g_pat, save->dbgctl); | 3289 | "sysenter_eip:", save->sysenter_eip); |
2816 | pr_err("br_from: %016llx br_to: %016llx\n", | 3290 | pr_err("%-15s %016llx %-13s %016llx\n", |
2817 | save->br_from, save->br_to); | 3291 | "gpat:", save->g_pat, "dbgctl:", save->dbgctl); |
2818 | pr_err("excp_from: %016llx excp_to: %016llx\n", | 3292 | pr_err("%-15s %016llx %-13s %016llx\n", |
2819 | save->last_excp_from, save->last_excp_to); | 3293 | "br_from:", save->br_from, "br_to:", save->br_to); |
2820 | 3294 | pr_err("%-15s %016llx %-13s %016llx\n", | |
3295 | "excp_from:", save->last_excp_from, | ||
3296 | "excp_to:", save->last_excp_to); | ||
3297 | } | ||
3298 | |||
3299 | static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) | ||
3300 | { | ||
3301 | struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control; | ||
3302 | |||
3303 | *info1 = control->exit_info_1; | ||
3304 | *info2 = control->exit_info_2; | ||
2821 | } | 3305 | } |
2822 | 3306 | ||
2823 | static int handle_exit(struct kvm_vcpu *vcpu) | 3307 | static int handle_exit(struct kvm_vcpu *vcpu) |
@@ -2826,9 +3310,9 @@ static int handle_exit(struct kvm_vcpu *vcpu) | |||
2826 | struct kvm_run *kvm_run = vcpu->run; | 3310 | struct kvm_run *kvm_run = vcpu->run; |
2827 | u32 exit_code = svm->vmcb->control.exit_code; | 3311 | u32 exit_code = svm->vmcb->control.exit_code; |
2828 | 3312 | ||
2829 | trace_kvm_exit(exit_code, vcpu); | 3313 | trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM); |
2830 | 3314 | ||
2831 | if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR0_MASK)) | 3315 | if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE)) |
2832 | vcpu->arch.cr0 = svm->vmcb->save.cr0; | 3316 | vcpu->arch.cr0 = svm->vmcb->save.cr0; |
2833 | if (npt_enabled) | 3317 | if (npt_enabled) |
2834 | vcpu->arch.cr3 = svm->vmcb->save.cr3; | 3318 | vcpu->arch.cr3 = svm->vmcb->save.cr3; |
@@ -2840,7 +3324,7 @@ static int handle_exit(struct kvm_vcpu *vcpu) | |||
2840 | return 1; | 3324 | return 1; |
2841 | } | 3325 | } |
2842 | 3326 | ||
2843 | if (is_nested(svm)) { | 3327 | if (is_guest_mode(vcpu)) { |
2844 | int vmexit; | 3328 | int vmexit; |
2845 | 3329 | ||
2846 | trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code, | 3330 | trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code, |
@@ -2871,7 +3355,8 @@ static int handle_exit(struct kvm_vcpu *vcpu) | |||
2871 | 3355 | ||
2872 | if (is_external_interrupt(svm->vmcb->control.exit_int_info) && | 3356 | if (is_external_interrupt(svm->vmcb->control.exit_int_info) && |
2873 | exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR && | 3357 | exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR && |
2874 | exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH) | 3358 | exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH && |
3359 | exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI) | ||
2875 | printk(KERN_ERR "%s: unexpected exit_int_info 0x%x " | 3360 | printk(KERN_ERR "%s: unexpected exit_int_info 0x%x " |
2876 | "exit_code 0x%x\n", | 3361 | "exit_code 0x%x\n", |
2877 | __func__, svm->vmcb->control.exit_int_info, | 3362 | __func__, svm->vmcb->control.exit_int_info, |
@@ -2902,7 +3387,6 @@ static void pre_svm_run(struct vcpu_svm *svm) | |||
2902 | 3387 | ||
2903 | struct svm_cpu_data *sd = per_cpu(svm_data, cpu); | 3388 | struct svm_cpu_data *sd = per_cpu(svm_data, cpu); |
2904 | 3389 | ||
2905 | svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING; | ||
2906 | /* FIXME: handle wraparound of asid_generation */ | 3390 | /* FIXME: handle wraparound of asid_generation */ |
2907 | if (svm->asid_generation != sd->asid_generation) | 3391 | if (svm->asid_generation != sd->asid_generation) |
2908 | new_asid(svm, sd); | 3392 | new_asid(svm, sd); |
@@ -2914,7 +3398,7 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu) | |||
2914 | 3398 | ||
2915 | svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI; | 3399 | svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI; |
2916 | vcpu->arch.hflags |= HF_NMI_MASK; | 3400 | vcpu->arch.hflags |= HF_NMI_MASK; |
2917 | svm->vmcb->control.intercept |= (1ULL << INTERCEPT_IRET); | 3401 | set_intercept(svm, INTERCEPT_IRET); |
2918 | ++vcpu->stat.nmi_injections; | 3402 | ++vcpu->stat.nmi_injections; |
2919 | } | 3403 | } |
2920 | 3404 | ||
@@ -2927,6 +3411,7 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq) | |||
2927 | control->int_ctl &= ~V_INTR_PRIO_MASK; | 3411 | control->int_ctl &= ~V_INTR_PRIO_MASK; |
2928 | control->int_ctl |= V_IRQ_MASK | | 3412 | control->int_ctl |= V_IRQ_MASK | |
2929 | ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT); | 3413 | ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT); |
3414 | mark_dirty(svm->vmcb, VMCB_INTR); | ||
2930 | } | 3415 | } |
2931 | 3416 | ||
2932 | static void svm_set_irq(struct kvm_vcpu *vcpu) | 3417 | static void svm_set_irq(struct kvm_vcpu *vcpu) |
@@ -2946,14 +3431,14 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) | |||
2946 | { | 3431 | { |
2947 | struct vcpu_svm *svm = to_svm(vcpu); | 3432 | struct vcpu_svm *svm = to_svm(vcpu); |
2948 | 3433 | ||
2949 | if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK)) | 3434 | if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK)) |
2950 | return; | 3435 | return; |
2951 | 3436 | ||
2952 | if (irr == -1) | 3437 | if (irr == -1) |
2953 | return; | 3438 | return; |
2954 | 3439 | ||
2955 | if (tpr >= irr) | 3440 | if (tpr >= irr) |
2956 | svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR8_MASK; | 3441 | set_cr_intercept(svm, INTERCEPT_CR8_WRITE); |
2957 | } | 3442 | } |
2958 | 3443 | ||
2959 | static int svm_nmi_allowed(struct kvm_vcpu *vcpu) | 3444 | static int svm_nmi_allowed(struct kvm_vcpu *vcpu) |
@@ -2981,10 +3466,10 @@ static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) | |||
2981 | 3466 | ||
2982 | if (masked) { | 3467 | if (masked) { |
2983 | svm->vcpu.arch.hflags |= HF_NMI_MASK; | 3468 | svm->vcpu.arch.hflags |= HF_NMI_MASK; |
2984 | svm->vmcb->control.intercept |= (1ULL << INTERCEPT_IRET); | 3469 | set_intercept(svm, INTERCEPT_IRET); |
2985 | } else { | 3470 | } else { |
2986 | svm->vcpu.arch.hflags &= ~HF_NMI_MASK; | 3471 | svm->vcpu.arch.hflags &= ~HF_NMI_MASK; |
2987 | svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_IRET); | 3472 | clr_intercept(svm, INTERCEPT_IRET); |
2988 | } | 3473 | } |
2989 | } | 3474 | } |
2990 | 3475 | ||
@@ -2998,9 +3483,9 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu) | |||
2998 | (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)) | 3483 | (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)) |
2999 | return 0; | 3484 | return 0; |
3000 | 3485 | ||
3001 | ret = !!(vmcb->save.rflags & X86_EFLAGS_IF); | 3486 | ret = !!(kvm_get_rflags(vcpu) & X86_EFLAGS_IF); |
3002 | 3487 | ||
3003 | if (is_nested(svm)) | 3488 | if (is_guest_mode(vcpu)) |
3004 | return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK); | 3489 | return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK); |
3005 | 3490 | ||
3006 | return ret; | 3491 | return ret; |
@@ -3046,7 +3531,12 @@ static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr) | |||
3046 | 3531 | ||
3047 | static void svm_flush_tlb(struct kvm_vcpu *vcpu) | 3532 | static void svm_flush_tlb(struct kvm_vcpu *vcpu) |
3048 | { | 3533 | { |
3049 | force_new_asid(vcpu); | 3534 | struct vcpu_svm *svm = to_svm(vcpu); |
3535 | |||
3536 | if (static_cpu_has(X86_FEATURE_FLUSHBYASID)) | ||
3537 | svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID; | ||
3538 | else | ||
3539 | svm->asid_generation--; | ||
3050 | } | 3540 | } |
3051 | 3541 | ||
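svm_flush_tlb() no longer forces a new ASID unconditionally: on CPUs with flush-by-ASID it just schedules TLB_CONTROL_FLUSH_ASID for the next VMRUN, otherwise it rolls the vcpu's asid_generation back so pre_svm_run() hands out a fresh ASID. A short hedged model of that decision; the names and encodings are stand-ins.

    #include <stdbool.h>
    #include <stdint.h>

    enum { TLB_DO_NOTHING = 0, TLB_FLUSH_ASID = 3 };  /* illustrative encodings */

    struct vcpu_state {
        uint8_t  tlb_ctl;           /* written into the VMCB before VMRUN */
        uint32_t asid_generation;   /* compared against the per-CPU generation */
    };

    static void flush_guest_tlb(struct vcpu_state *v, bool cpu_has_flushbyasid)
    {
        if (cpu_has_flushbyasid)
            v->tlb_ctl = TLB_FLUSH_ASID;   /* targeted flush on the next VMRUN */
        else
            v->asid_generation--;          /* mismatch forces new_asid() next run */
    }

    int main(void)
    {
        struct vcpu_state v = { .tlb_ctl = TLB_DO_NOTHING, .asid_generation = 7 };

        flush_guest_tlb(&v, true);
        return v.tlb_ctl == TLB_FLUSH_ASID ? 0 : 1;
    }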
3052 | static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu) | 3542 | static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu) |
@@ -3057,10 +3547,10 @@ static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu) | |||
3057 | { | 3547 | { |
3058 | struct vcpu_svm *svm = to_svm(vcpu); | 3548 | struct vcpu_svm *svm = to_svm(vcpu); |
3059 | 3549 | ||
3060 | if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK)) | 3550 | if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK)) |
3061 | return; | 3551 | return; |
3062 | 3552 | ||
3063 | if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR8_MASK)) { | 3553 | if (!is_cr_intercept(svm, INTERCEPT_CR8_WRITE)) { |
3064 | int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK; | 3554 | int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK; |
3065 | kvm_set_cr8(vcpu, cr8); | 3555 | kvm_set_cr8(vcpu, cr8); |
3066 | } | 3556 | } |
@@ -3071,7 +3561,7 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu) | |||
3071 | struct vcpu_svm *svm = to_svm(vcpu); | 3561 | struct vcpu_svm *svm = to_svm(vcpu); |
3072 | u64 cr8; | 3562 | u64 cr8; |
3073 | 3563 | ||
3074 | if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK)) | 3564 | if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK)) |
3075 | return; | 3565 | return; |
3076 | 3566 | ||
3077 | cr8 = kvm_get_cr8(vcpu); | 3567 | cr8 = kvm_get_cr8(vcpu); |
@@ -3088,8 +3578,15 @@ static void svm_complete_interrupts(struct vcpu_svm *svm) | |||
3088 | 3578 | ||
3089 | svm->int3_injected = 0; | 3579 | svm->int3_injected = 0; |
3090 | 3580 | ||
3091 | if (svm->vcpu.arch.hflags & HF_IRET_MASK) | 3581 | /* |
3582 | * If we've made progress since setting HF_IRET_MASK, we've | ||
3583 | * executed an IRET and can allow NMI injection. | ||
3584 | */ | ||
3585 | if ((svm->vcpu.arch.hflags & HF_IRET_MASK) | ||
3586 | && kvm_rip_read(&svm->vcpu) != svm->nmi_iret_rip) { | ||
3092 | svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK); | 3587 | svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK); |
3588 | kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); | ||
3589 | } | ||
3093 | 3590 | ||
3094 | svm->vcpu.arch.nmi_injected = false; | 3591 | svm->vcpu.arch.nmi_injected = false; |
3095 | kvm_clear_exception_queue(&svm->vcpu); | 3592 | kvm_clear_exception_queue(&svm->vcpu); |
@@ -3098,6 +3595,8 @@ static void svm_complete_interrupts(struct vcpu_svm *svm) | |||
3098 | if (!(exitintinfo & SVM_EXITINTINFO_VALID)) | 3595 | if (!(exitintinfo & SVM_EXITINTINFO_VALID)) |
3099 | return; | 3596 | return; |
3100 | 3597 | ||
3598 | kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); | ||
3599 | |||
3101 | vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK; | 3600 | vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK; |
3102 | type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK; | 3601 | type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK; |
3103 | 3602 | ||
@@ -3134,6 +3633,17 @@ static void svm_complete_interrupts(struct vcpu_svm *svm) | |||
3134 | } | 3633 | } |
3135 | } | 3634 | } |
3136 | 3635 | ||
3636 | static void svm_cancel_injection(struct kvm_vcpu *vcpu) | ||
3637 | { | ||
3638 | struct vcpu_svm *svm = to_svm(vcpu); | ||
3639 | struct vmcb_control_area *control = &svm->vmcb->control; | ||
3640 | |||
3641 | control->exit_int_info = control->event_inj; | ||
3642 | control->exit_int_info_err = control->event_inj_err; | ||
3643 | control->event_inj = 0; | ||
3644 | svm_complete_interrupts(svm); | ||
3645 | } | ||
3646 | |||
3137 | #ifdef CONFIG_X86_64 | 3647 | #ifdef CONFIG_X86_64 |
3138 | #define R "r" | 3648 | #define R "r" |
3139 | #else | 3649 | #else |
@@ -3143,9 +3653,6 @@ static void svm_complete_interrupts(struct vcpu_svm *svm) | |||
3143 | static void svm_vcpu_run(struct kvm_vcpu *vcpu) | 3653 | static void svm_vcpu_run(struct kvm_vcpu *vcpu) |
3144 | { | 3654 | { |
3145 | struct vcpu_svm *svm = to_svm(vcpu); | 3655 | struct vcpu_svm *svm = to_svm(vcpu); |
3146 | u16 fs_selector; | ||
3147 | u16 gs_selector; | ||
3148 | u16 ldt_selector; | ||
3149 | 3656 | ||
3150 | svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; | 3657 | svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; |
3151 | svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; | 3658 | svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; |
@@ -3162,14 +3669,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) | |||
3162 | 3669 | ||
3163 | sync_lapic_to_cr8(vcpu); | 3670 | sync_lapic_to_cr8(vcpu); |
3164 | 3671 | ||
3165 | save_host_msrs(vcpu); | ||
3166 | savesegment(fs, fs_selector); | ||
3167 | savesegment(gs, gs_selector); | ||
3168 | ldt_selector = kvm_read_ldt(); | ||
3169 | svm->vmcb->save.cr2 = vcpu->arch.cr2; | 3672 | svm->vmcb->save.cr2 = vcpu->arch.cr2; |
3170 | /* required for live migration with NPT */ | ||
3171 | if (npt_enabled) | ||
3172 | svm->vmcb->save.cr3 = vcpu->arch.cr3; | ||
3173 | 3673 | ||
3174 | clgi(); | 3674 | clgi(); |
3175 | 3675 | ||
@@ -3246,31 +3746,44 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) | |||
3246 | #endif | 3746 | #endif |
3247 | ); | 3747 | ); |
3248 | 3748 | ||
3249 | vcpu->arch.cr2 = svm->vmcb->save.cr2; | ||
3250 | vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; | ||
3251 | vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; | ||
3252 | vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; | ||
3253 | |||
3254 | load_host_msrs(vcpu); | ||
3255 | loadsegment(fs, fs_selector); | ||
3256 | #ifdef CONFIG_X86_64 | 3749 | #ifdef CONFIG_X86_64 |
3257 | load_gs_index(gs_selector); | 3750 | wrmsrl(MSR_GS_BASE, svm->host.gs_base); |
3258 | wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs); | ||
3259 | #else | 3751 | #else |
3260 | loadsegment(gs, gs_selector); | 3752 | loadsegment(fs, svm->host.fs); |
3753 | #ifndef CONFIG_X86_32_LAZY_GS | ||
3754 | loadsegment(gs, svm->host.gs); | ||
3755 | #endif | ||
3261 | #endif | 3756 | #endif |
3262 | kvm_load_ldt(ldt_selector); | ||
3263 | 3757 | ||
3264 | reload_tss(vcpu); | 3758 | reload_tss(vcpu); |
3265 | 3759 | ||
3266 | local_irq_disable(); | 3760 | local_irq_disable(); |
3267 | 3761 | ||
3762 | vcpu->arch.cr2 = svm->vmcb->save.cr2; | ||
3763 | vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; | ||
3764 | vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; | ||
3765 | vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; | ||
3766 | |||
3767 | if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) | ||
3768 | kvm_before_handle_nmi(&svm->vcpu); | ||
3769 | |||
3268 | stgi(); | 3770 | stgi(); |
3269 | 3771 | ||
3772 | /* Any pending NMI will happen here */ | ||
3773 | |||
3774 | if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) | ||
3775 | kvm_after_handle_nmi(&svm->vcpu); | ||
3776 | |||
3270 | sync_cr8_to_lapic(vcpu); | 3777 | sync_cr8_to_lapic(vcpu); |
3271 | 3778 | ||
3272 | svm->next_rip = 0; | 3779 | svm->next_rip = 0; |
3273 | 3780 | ||
3781 | svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING; | ||
3782 | |||
3783 | /* if exit due to PF check for async PF */ | ||
3784 | if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) | ||
3785 | svm->apf_reason = kvm_read_and_reset_pf_reason(); | ||
3786 | |||
3274 | if (npt_enabled) { | 3787 | if (npt_enabled) { |
3275 | vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR); | 3788 | vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR); |
3276 | vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR); | 3789 | vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR); |
@@ -3283,6 +3796,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) | |||
3283 | if (unlikely(svm->vmcb->control.exit_code == | 3796 | if (unlikely(svm->vmcb->control.exit_code == |
3284 | SVM_EXIT_EXCP_BASE + MC_VECTOR)) | 3797 | SVM_EXIT_EXCP_BASE + MC_VECTOR)) |
3285 | svm_handle_mce(svm); | 3798 | svm_handle_mce(svm); |
3799 | |||
3800 | mark_all_clean(svm->vmcb); | ||
3286 | } | 3801 | } |
3287 | 3802 | ||
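mark_dirty(), mark_all_dirty() and mark_all_clean() implement the VMCB clean-bits optimization visible throughout this patch: any software write to a VMCB area clears that area's clean bit, and after VMRUN has consumed the state the run loop marks everything clean so the CPU may skip reloading unchanged areas. A hedged model of the bookkeeping; the bit indices are placeholders, not the architectural assignments.

    #include <stdint.h>

    /* Placeholder clean-bit indices; the real set is defined by the SVM spec. */
    enum vmcb_clean_bit { VMCB_INTR = 0, VMCB_NPT, VMCB_CR, VMCB_LBR, VMCB_NR_BITS };

    struct vmcb_model { uint32_t clean; };

    static void mark_dirty(struct vmcb_model *v, enum vmcb_clean_bit bit)
    {
        v->clean &= ~(1u << bit);             /* this area changed, reload it */
    }

    static void mark_all_dirty(struct vmcb_model *v)
    {
        v->clean = 0;                         /* e.g. after switching VMCBs */
    }

    static void mark_all_clean(struct vmcb_model *v)
    {
        v->clean = (1u << VMCB_NR_BITS) - 1;  /* after VMRUN consumed the state */
    }

    int main(void)
    {
        struct vmcb_model v = { 0 };

        mark_all_dirty(&v);
        mark_all_clean(&v);
        mark_dirty(&v, VMCB_CR);              /* CR area touched since last VMRUN */
        return (v.clean & (1u << VMCB_CR)) ? 1 : 0;
    }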
3288 | #undef R | 3803 | #undef R |
@@ -3291,14 +3806,23 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) | |||
3291 | { | 3806 | { |
3292 | struct vcpu_svm *svm = to_svm(vcpu); | 3807 | struct vcpu_svm *svm = to_svm(vcpu); |
3293 | 3808 | ||
3294 | if (npt_enabled) { | ||
3295 | svm->vmcb->control.nested_cr3 = root; | ||
3296 | force_new_asid(vcpu); | ||
3297 | return; | ||
3298 | } | ||
3299 | |||
3300 | svm->vmcb->save.cr3 = root; | 3809 | svm->vmcb->save.cr3 = root; |
3301 | force_new_asid(vcpu); | 3810 | mark_dirty(svm->vmcb, VMCB_CR); |
3811 | svm_flush_tlb(vcpu); | ||
3812 | } | ||
3813 | |||
3814 | static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root) | ||
3815 | { | ||
3816 | struct vcpu_svm *svm = to_svm(vcpu); | ||
3817 | |||
3818 | svm->vmcb->control.nested_cr3 = root; | ||
3819 | mark_dirty(svm->vmcb, VMCB_NPT); | ||
3820 | |||
3821 | /* Also sync guest cr3 here in case we live migrate */ | ||
3822 | svm->vmcb->save.cr3 = kvm_read_cr3(vcpu); | ||
3823 | mark_dirty(svm->vmcb, VMCB_CR); | ||
3824 | |||
3825 | svm_flush_tlb(vcpu); | ||
3302 | } | 3826 | } |
3303 | 3827 | ||
3304 | static int is_disabled(void) | 3828 | static int is_disabled(void) |
@@ -3333,15 +3857,6 @@ static bool svm_cpu_has_accelerated_tpr(void) | |||
3333 | return false; | 3857 | return false; |
3334 | } | 3858 | } |
3335 | 3859 | ||
3336 | static int get_npt_level(void) | ||
3337 | { | ||
3338 | #ifdef CONFIG_X86_64 | ||
3339 | return PT64_ROOT_LEVEL; | ||
3340 | #else | ||
3341 | return PT32E_ROOT_LEVEL; | ||
3342 | #endif | ||
3343 | } | ||
3344 | |||
3345 | static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) | 3860 | static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) |
3346 | { | 3861 | { |
3347 | return 0; | 3862 | return 0; |
@@ -3354,12 +3869,25 @@ static void svm_cpuid_update(struct kvm_vcpu *vcpu) | |||
3354 | static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) | 3869 | static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) |
3355 | { | 3870 | { |
3356 | switch (func) { | 3871 | switch (func) { |
3872 | case 0x80000001: | ||
3873 | if (nested) | ||
3874 | entry->ecx |= (1 << 2); /* Set SVM bit */ | ||
3875 | break; | ||
3357 | case 0x8000000A: | 3876 | case 0x8000000A: |
3358 | entry->eax = 1; /* SVM revision 1 */ | 3877 | entry->eax = 1; /* SVM revision 1 */ |
3359 | entry->ebx = 8; /* Let's support 8 ASIDs in case we add proper | 3878 | entry->ebx = 8; /* Let's support 8 ASIDs in case we add proper |
3360 | ASID emulation to nested SVM */ | 3879 | ASID emulation to nested SVM */ |
3361 | entry->ecx = 0; /* Reserved */ | 3880 | entry->ecx = 0; /* Reserved */ |
3362 | entry->edx = 0; /* Do not support any additional features */ | 3881 | entry->edx = 0; /* By default, do not support any |
3882 | additional features */ | ||
3883 | |||
3884 | /* Support next_rip if host supports it */ | ||
3885 | if (boot_cpu_has(X86_FEATURE_NRIPS)) | ||
3886 | entry->edx |= SVM_FEATURE_NRIP; | ||
3887 | |||
3888 | /* Support NPT for the guest if enabled */ | ||
3889 | if (npt_enabled) | ||
3890 | entry->edx |= SVM_FEATURE_NPT; | ||
3363 | 3891 | ||
3364 | break; | 3892 | break; |
3365 | } | 3893 | } |
@@ -3414,6 +3942,7 @@ static const struct trace_print_flags svm_exit_reasons_str[] = { | |||
3414 | { SVM_EXIT_WBINVD, "wbinvd" }, | 3942 | { SVM_EXIT_WBINVD, "wbinvd" }, |
3415 | { SVM_EXIT_MONITOR, "monitor" }, | 3943 | { SVM_EXIT_MONITOR, "monitor" }, |
3416 | { SVM_EXIT_MWAIT, "mwait" }, | 3944 | { SVM_EXIT_MWAIT, "mwait" }, |
3945 | { SVM_EXIT_XSETBV, "xsetbv" }, | ||
3417 | { SVM_EXIT_NPF, "npf" }, | 3946 | { SVM_EXIT_NPF, "npf" }, |
3418 | { -1, NULL } | 3947 | { -1, NULL } |
3419 | }; | 3948 | }; |
@@ -3437,12 +3966,190 @@ static void svm_fpu_deactivate(struct kvm_vcpu *vcpu) | |||
3437 | { | 3966 | { |
3438 | struct vcpu_svm *svm = to_svm(vcpu); | 3967 | struct vcpu_svm *svm = to_svm(vcpu); |
3439 | 3968 | ||
3440 | svm->vmcb->control.intercept_exceptions |= 1 << NM_VECTOR; | 3969 | set_exception_intercept(svm, NM_VECTOR); |
3441 | if (is_nested(svm)) | ||
3442 | svm->nested.hsave->control.intercept_exceptions |= 1 << NM_VECTOR; | ||
3443 | update_cr0_intercept(svm); | 3970 | update_cr0_intercept(svm); |
3444 | } | 3971 | } |
3445 | 3972 | ||
3973 | #define PRE_EX(exit) { .exit_code = (exit), \ | ||
3974 | .stage = X86_ICPT_PRE_EXCEPT, } | ||
3975 | #define POST_EX(exit) { .exit_code = (exit), \ | ||
3976 | .stage = X86_ICPT_POST_EXCEPT, } | ||
3977 | #define POST_MEM(exit) { .exit_code = (exit), \ | ||
3978 | .stage = X86_ICPT_POST_MEMACCESS, } | ||
3979 | |||
3980 | static struct __x86_intercept { | ||
3981 | u32 exit_code; | ||
3982 | enum x86_intercept_stage stage; | ||
3983 | } x86_intercept_map[] = { | ||
3984 | [x86_intercept_cr_read] = POST_EX(SVM_EXIT_READ_CR0), | ||
3985 | [x86_intercept_cr_write] = POST_EX(SVM_EXIT_WRITE_CR0), | ||
3986 | [x86_intercept_clts] = POST_EX(SVM_EXIT_WRITE_CR0), | ||
3987 | [x86_intercept_lmsw] = POST_EX(SVM_EXIT_WRITE_CR0), | ||
3988 | [x86_intercept_smsw] = POST_EX(SVM_EXIT_READ_CR0), | ||
3989 | [x86_intercept_dr_read] = POST_EX(SVM_EXIT_READ_DR0), | ||
3990 | [x86_intercept_dr_write] = POST_EX(SVM_EXIT_WRITE_DR0), | ||
3991 | [x86_intercept_sldt] = POST_EX(SVM_EXIT_LDTR_READ), | ||
3992 | [x86_intercept_str] = POST_EX(SVM_EXIT_TR_READ), | ||
3993 | [x86_intercept_lldt] = POST_EX(SVM_EXIT_LDTR_WRITE), | ||
3994 | [x86_intercept_ltr] = POST_EX(SVM_EXIT_TR_WRITE), | ||
3995 | [x86_intercept_sgdt] = POST_EX(SVM_EXIT_GDTR_READ), | ||
3996 | [x86_intercept_sidt] = POST_EX(SVM_EXIT_IDTR_READ), | ||
3997 | [x86_intercept_lgdt] = POST_EX(SVM_EXIT_GDTR_WRITE), | ||
3998 | [x86_intercept_lidt] = POST_EX(SVM_EXIT_IDTR_WRITE), | ||
3999 | [x86_intercept_vmrun] = POST_EX(SVM_EXIT_VMRUN), | ||
4000 | [x86_intercept_vmmcall] = POST_EX(SVM_EXIT_VMMCALL), | ||
4001 | [x86_intercept_vmload] = POST_EX(SVM_EXIT_VMLOAD), | ||
4002 | [x86_intercept_vmsave] = POST_EX(SVM_EXIT_VMSAVE), | ||
4003 | [x86_intercept_stgi] = POST_EX(SVM_EXIT_STGI), | ||
4004 | [x86_intercept_clgi] = POST_EX(SVM_EXIT_CLGI), | ||
4005 | [x86_intercept_skinit] = POST_EX(SVM_EXIT_SKINIT), | ||
4006 | [x86_intercept_invlpga] = POST_EX(SVM_EXIT_INVLPGA), | ||
4007 | [x86_intercept_rdtscp] = POST_EX(SVM_EXIT_RDTSCP), | ||
4008 | [x86_intercept_monitor] = POST_MEM(SVM_EXIT_MONITOR), | ||
4009 | [x86_intercept_mwait] = POST_EX(SVM_EXIT_MWAIT), | ||
4010 | [x86_intercept_invlpg] = POST_EX(SVM_EXIT_INVLPG), | ||
4011 | [x86_intercept_invd] = POST_EX(SVM_EXIT_INVD), | ||
4012 | [x86_intercept_wbinvd] = POST_EX(SVM_EXIT_WBINVD), | ||
4013 | [x86_intercept_wrmsr] = POST_EX(SVM_EXIT_MSR), | ||
4014 | [x86_intercept_rdtsc] = POST_EX(SVM_EXIT_RDTSC), | ||
4015 | [x86_intercept_rdmsr] = POST_EX(SVM_EXIT_MSR), | ||
4016 | [x86_intercept_rdpmc] = POST_EX(SVM_EXIT_RDPMC), | ||
4017 | [x86_intercept_cpuid] = PRE_EX(SVM_EXIT_CPUID), | ||
4018 | [x86_intercept_rsm] = PRE_EX(SVM_EXIT_RSM), | ||
4019 | [x86_intercept_pause] = PRE_EX(SVM_EXIT_PAUSE), | ||
4020 | [x86_intercept_pushf] = PRE_EX(SVM_EXIT_PUSHF), | ||
4021 | [x86_intercept_popf] = PRE_EX(SVM_EXIT_POPF), | ||
4022 | [x86_intercept_intn] = PRE_EX(SVM_EXIT_SWINT), | ||
4023 | [x86_intercept_iret] = PRE_EX(SVM_EXIT_IRET), | ||
4024 | [x86_intercept_icebp] = PRE_EX(SVM_EXIT_ICEBP), | ||
4025 | [x86_intercept_hlt] = POST_EX(SVM_EXIT_HLT), | ||
4026 | [x86_intercept_in] = POST_EX(SVM_EXIT_IOIO), | ||
4027 | [x86_intercept_ins] = POST_EX(SVM_EXIT_IOIO), | ||
4028 | [x86_intercept_out] = POST_EX(SVM_EXIT_IOIO), | ||
4029 | [x86_intercept_outs] = POST_EX(SVM_EXIT_IOIO), | ||
4030 | }; | ||
4031 | |||
4032 | #undef PRE_EX | ||
4033 | #undef POST_EX | ||
4034 | #undef POST_MEM | ||
4035 | |||
4036 | static int svm_check_intercept(struct kvm_vcpu *vcpu, | ||
4037 | struct x86_instruction_info *info, | ||
4038 | enum x86_intercept_stage stage) | ||
4039 | { | ||
4040 | struct vcpu_svm *svm = to_svm(vcpu); | ||
4041 | int vmexit, ret = X86EMUL_CONTINUE; | ||
4042 | struct __x86_intercept icpt_info; | ||
4043 | struct vmcb *vmcb = svm->vmcb; | ||
4044 | |||
4045 | if (info->intercept >= ARRAY_SIZE(x86_intercept_map)) | ||
4046 | goto out; | ||
4047 | |||
4048 | icpt_info = x86_intercept_map[info->intercept]; | ||
4049 | |||
4050 | if (stage != icpt_info.stage) | ||
4051 | goto out; | ||
4052 | |||
4053 | switch (icpt_info.exit_code) { | ||
4054 | case SVM_EXIT_READ_CR0: | ||
4055 | if (info->intercept == x86_intercept_cr_read) | ||
4056 | icpt_info.exit_code += info->modrm_reg; | ||
4057 | break; | ||
4058 | case SVM_EXIT_WRITE_CR0: { | ||
4059 | unsigned long cr0, val; | ||
4060 | u64 intercept; | ||
4061 | |||
4062 | if (info->intercept == x86_intercept_cr_write) | ||
4063 | icpt_info.exit_code += info->modrm_reg; | ||
4064 | |||
4065 | if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0) | ||
4066 | break; | ||
4067 | |||
4068 | intercept = svm->nested.intercept; | ||
4069 | |||
4070 | if (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0))) | ||
4071 | break; | ||
4072 | |||
4073 | cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK; | ||
4074 | val = info->src_val & ~SVM_CR0_SELECTIVE_MASK; | ||
4075 | |||
4076 | if (info->intercept == x86_intercept_lmsw) { | ||
4077 | cr0 &= 0xfUL; | ||
4078 | val &= 0xfUL; | ||
4079 | /* lmsw can't clear PE - catch this here */ | ||
4080 | if (cr0 & X86_CR0_PE) | ||
4081 | val |= X86_CR0_PE; | ||
4082 | } | ||
4083 | |||
4084 | if (cr0 ^ val) | ||
4085 | icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE; | ||
4086 | |||
4087 | break; | ||
4088 | } | ||
4089 | case SVM_EXIT_READ_DR0: | ||
4090 | case SVM_EXIT_WRITE_DR0: | ||
4091 | icpt_info.exit_code += info->modrm_reg; | ||
4092 | break; | ||
4093 | case SVM_EXIT_MSR: | ||
4094 | if (info->intercept == x86_intercept_wrmsr) | ||
4095 | vmcb->control.exit_info_1 = 1; | ||
4096 | else | ||
4097 | vmcb->control.exit_info_1 = 0; | ||
4098 | break; | ||
4099 | case SVM_EXIT_PAUSE: | ||
4100 | /* | ||
4101 | * We only get this intercept for NOP; PAUSE is | ||
4102 | * REP NOP, so check the REP prefix here. | ||
4103 | */ | ||
4104 | if (info->rep_prefix != REPE_PREFIX) | ||
4105 | goto out; | ||
4106 | case SVM_EXIT_IOIO: { | ||
4107 | u64 exit_info; | ||
4108 | u32 bytes; | ||
4109 | |||
4110 | exit_info = (vcpu->arch.regs[VCPU_REGS_RDX] & 0xffff) << 16; | ||
4111 | |||
4112 | if (info->intercept == x86_intercept_in || | ||
4113 | info->intercept == x86_intercept_ins) { | ||
4114 | exit_info |= SVM_IOIO_TYPE_MASK; | ||
4115 | bytes = info->src_bytes; | ||
4116 | } else { | ||
4117 | bytes = info->dst_bytes; | ||
4118 | } | ||
4119 | |||
4120 | if (info->intercept == x86_intercept_outs || | ||
4121 | info->intercept == x86_intercept_ins) | ||
4122 | exit_info |= SVM_IOIO_STR_MASK; | ||
4123 | |||
4124 | if (info->rep_prefix) | ||
4125 | exit_info |= SVM_IOIO_REP_MASK; | ||
4126 | |||
4127 | bytes = min(bytes, 4u); | ||
4128 | |||
4129 | exit_info |= bytes << SVM_IOIO_SIZE_SHIFT; | ||
4130 | |||
4131 | exit_info |= (u32)info->ad_bytes << (SVM_IOIO_ASIZE_SHIFT - 1); | ||
4132 | |||
4133 | vmcb->control.exit_info_1 = exit_info; | ||
4134 | vmcb->control.exit_info_2 = info->next_rip; | ||
4135 | |||
4136 | break; | ||
4137 | } | ||
4138 | default: | ||
4139 | break; | ||
4140 | } | ||
4141 | |||
4142 | vmcb->control.next_rip = info->next_rip; | ||
4143 | vmcb->control.exit_code = icpt_info.exit_code; | ||
4144 | vmexit = nested_svm_exit_handled(svm); | ||
4145 | |||
4146 | ret = (vmexit == NESTED_EXIT_DONE) ? X86EMUL_INTERCEPTED | ||
4147 | : X86EMUL_CONTINUE; | ||
4148 | |||
4149 | out: | ||
4150 | return ret; | ||
4151 | } | ||
4152 | |||
3446 | static struct kvm_x86_ops svm_x86_ops = { | 4153 | static struct kvm_x86_ops svm_x86_ops = { |
3447 | .cpu_has_kvm_support = has_svm, | 4154 | .cpu_has_kvm_support = has_svm, |
3448 | .disabled_by_bios = is_disabled, | 4155 | .disabled_by_bios = is_disabled, |
@@ -3470,6 +4177,7 @@ static struct kvm_x86_ops svm_x86_ops = { | |||
3470 | .get_cpl = svm_get_cpl, | 4177 | .get_cpl = svm_get_cpl, |
3471 | .get_cs_db_l_bits = kvm_get_cs_db_l_bits, | 4178 | .get_cs_db_l_bits = kvm_get_cs_db_l_bits, |
3472 | .decache_cr0_guest_bits = svm_decache_cr0_guest_bits, | 4179 | .decache_cr0_guest_bits = svm_decache_cr0_guest_bits, |
4180 | .decache_cr3 = svm_decache_cr3, | ||
3473 | .decache_cr4_guest_bits = svm_decache_cr4_guest_bits, | 4181 | .decache_cr4_guest_bits = svm_decache_cr4_guest_bits, |
3474 | .set_cr0 = svm_set_cr0, | 4182 | .set_cr0 = svm_set_cr0, |
3475 | .set_cr3 = svm_set_cr3, | 4183 | .set_cr3 = svm_set_cr3, |
@@ -3497,6 +4205,7 @@ static struct kvm_x86_ops svm_x86_ops = { | |||
3497 | .set_irq = svm_set_irq, | 4205 | .set_irq = svm_set_irq, |
3498 | .set_nmi = svm_inject_nmi, | 4206 | .set_nmi = svm_inject_nmi, |
3499 | .queue_exception = svm_queue_exception, | 4207 | .queue_exception = svm_queue_exception, |
4208 | .cancel_injection = svm_cancel_injection, | ||
3500 | .interrupt_allowed = svm_interrupt_allowed, | 4209 | .interrupt_allowed = svm_interrupt_allowed, |
3501 | .nmi_allowed = svm_nmi_allowed, | 4210 | .nmi_allowed = svm_nmi_allowed, |
3502 | .get_nmi_mask = svm_get_nmi_mask, | 4211 | .get_nmi_mask = svm_get_nmi_mask, |
@@ -3509,7 +4218,9 @@ static struct kvm_x86_ops svm_x86_ops = { | |||
3509 | .get_tdp_level = get_npt_level, | 4218 | .get_tdp_level = get_npt_level, |
3510 | .get_mt_mask = svm_get_mt_mask, | 4219 | .get_mt_mask = svm_get_mt_mask, |
3511 | 4220 | ||
4221 | .get_exit_info = svm_get_exit_info, | ||
3512 | .exit_reasons_str = svm_exit_reasons_str, | 4222 | .exit_reasons_str = svm_exit_reasons_str, |
4223 | |||
3513 | .get_lpage_level = svm_get_lpage_level, | 4224 | .get_lpage_level = svm_get_lpage_level, |
3514 | 4225 | ||
3515 | .cpuid_update = svm_cpuid_update, | 4226 | .cpuid_update = svm_cpuid_update, |
@@ -3519,6 +4230,15 @@ static struct kvm_x86_ops svm_x86_ops = { | |||
3519 | .set_supported_cpuid = svm_set_supported_cpuid, | 4230 | .set_supported_cpuid = svm_set_supported_cpuid, |
3520 | 4231 | ||
3521 | .has_wbinvd_exit = svm_has_wbinvd_exit, | 4232 | .has_wbinvd_exit = svm_has_wbinvd_exit, |
4233 | |||
4234 | .set_tsc_khz = svm_set_tsc_khz, | ||
4235 | .write_tsc_offset = svm_write_tsc_offset, | ||
4236 | .adjust_tsc_offset = svm_adjust_tsc_offset, | ||
4237 | .compute_tsc_offset = svm_compute_tsc_offset, | ||
4238 | |||
4239 | .set_tdp_cr3 = set_tdp_cr3, | ||
4240 | |||
4241 | .check_intercept = svm_check_intercept, | ||
3522 | }; | 4242 | }; |
3523 | 4243 | ||
3524 | static int __init svm_init(void) | 4244 | static int __init svm_init(void) |
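Several of the svm.c hunks above call mark_dirty() and mark_all_clean(), whose definitions fall outside this diff. As a rough sketch of the VMCB clean-bits bookkeeping those helpers imply (struct layout, bit indices and the always-dirty mask below are illustrative assumptions, not the kernel's actual values):

	#include <stdint.h>

	/* Illustrative stand-ins; the real bit indices live in svm.c/svm.h. */
	enum { VMCB_CR = 3, VMCB_NPT = 4, VMCB_DIRTY_MAX = 12 };
	#define VMCB_ALWAYS_DIRTY_MASK 0u	/* assumption: no group is always dirty */

	struct vmcb_sketch {
		uint32_t clean;	/* bit set => CPU may keep its cached copy of that state group */
	};

	/* Invalidate one state group so hardware reloads it on the next VMRUN. */
	static inline void mark_dirty(struct vmcb_sketch *vmcb, int bit)
	{
		vmcb->clean &= ~(1u << bit);
	}

	/* After a VMRUN/#VMEXIT round trip, every group is back in sync. */
	static inline void mark_all_clean(struct vmcb_sketch *vmcb)
	{
		vmcb->clean = ((1u << VMCB_DIRTY_MAX) - 1) & ~VMCB_ALWAYS_DIRTY_MASK;
	}

This matches the usage pattern in the hunks above: svm_set_cr3()/set_tdp_cr3() mark VMCB_CR or VMCB_NPT dirty after touching the corresponding fields, and svm_vcpu_run() marks everything clean once the hardware has consumed the VMCB.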
diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c index e16a0dbe74d8..abd86e865be3 100644 --- a/arch/x86/kvm/timer.c +++ b/arch/x86/kvm/timer.c | |||
@@ -6,7 +6,7 @@ | |||
6 | * | 6 | * |
7 | * timer support | 7 | * timer support |
8 | * | 8 | * |
9 | * Copyright 2010 Red Hat, Inc. and/or its affilates. | 9 | * Copyright 2010 Red Hat, Inc. and/or its affiliates. |
10 | * | 10 | * |
11 | * This work is licensed under the terms of the GNU GPL, version 2. See | 11 | * This work is licensed under the terms of the GNU GPL, version 2. See |
12 | * the COPYING file in the top-level directory. | 12 | * the COPYING file in the top-level directory. |
@@ -25,7 +25,7 @@ static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer) | |||
25 | 25 | ||
26 | /* | 26 | /* |
27 | * There is a race window between reading and incrementing, but we do | 27 | * There is a race window between reading and incrementing, but we do |
28 | * not care about potentially loosing timer events in the !reinject | 28 | * not care about potentially losing timer events in the !reinject |
29 | * case anyway. Note: KVM_REQ_PENDING_TIMER is implicitly checked | 29 | * case anyway. Note: KVM_REQ_PENDING_TIMER is implicitly checked |
30 | * in vcpu_enter_guest. | 30 | * in vcpu_enter_guest. |
31 | */ | 31 | */ |
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index a6544b8e7c0f..db932760ea82 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h | |||
@@ -62,21 +62,21 @@ TRACE_EVENT(kvm_hv_hypercall, | |||
62 | TP_ARGS(code, fast, rep_cnt, rep_idx, ingpa, outgpa), | 62 | TP_ARGS(code, fast, rep_cnt, rep_idx, ingpa, outgpa), |
63 | 63 | ||
64 | TP_STRUCT__entry( | 64 | TP_STRUCT__entry( |
65 | __field( __u16, code ) | ||
66 | __field( bool, fast ) | ||
67 | __field( __u16, rep_cnt ) | 65 | __field( __u16, rep_cnt ) |
68 | __field( __u16, rep_idx ) | 66 | __field( __u16, rep_idx ) |
69 | __field( __u64, ingpa ) | 67 | __field( __u64, ingpa ) |
70 | __field( __u64, outgpa ) | 68 | __field( __u64, outgpa ) |
69 | __field( __u16, code ) | ||
70 | __field( bool, fast ) | ||
71 | ), | 71 | ), |
72 | 72 | ||
73 | TP_fast_assign( | 73 | TP_fast_assign( |
74 | __entry->code = code; | ||
75 | __entry->fast = fast; | ||
76 | __entry->rep_cnt = rep_cnt; | 74 | __entry->rep_cnt = rep_cnt; |
77 | __entry->rep_idx = rep_idx; | 75 | __entry->rep_idx = rep_idx; |
78 | __entry->ingpa = ingpa; | 76 | __entry->ingpa = ingpa; |
79 | __entry->outgpa = outgpa; | 77 | __entry->outgpa = outgpa; |
78 | __entry->code = code; | ||
79 | __entry->fast = fast; | ||
80 | ), | 80 | ), |
81 | 81 | ||
82 | TP_printk("code 0x%x %s cnt 0x%x idx 0x%x in 0x%llx out 0x%llx", | 82 | TP_printk("code 0x%x %s cnt 0x%x idx 0x%x in 0x%llx out 0x%llx", |
@@ -178,27 +178,36 @@ TRACE_EVENT(kvm_apic, | |||
178 | #define trace_kvm_apic_read(reg, val) trace_kvm_apic(0, reg, val) | 178 | #define trace_kvm_apic_read(reg, val) trace_kvm_apic(0, reg, val) |
179 | #define trace_kvm_apic_write(reg, val) trace_kvm_apic(1, reg, val) | 179 | #define trace_kvm_apic_write(reg, val) trace_kvm_apic(1, reg, val) |
180 | 180 | ||
181 | #define KVM_ISA_VMX 1 | ||
182 | #define KVM_ISA_SVM 2 | ||
183 | |||
181 | /* | 184 | /* |
182 | * Tracepoint for kvm guest exit: | 185 | * Tracepoint for kvm guest exit: |
183 | */ | 186 | */ |
184 | TRACE_EVENT(kvm_exit, | 187 | TRACE_EVENT(kvm_exit, |
185 | TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu), | 188 | TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu, u32 isa), |
186 | TP_ARGS(exit_reason, vcpu), | 189 | TP_ARGS(exit_reason, vcpu, isa), |
187 | 190 | ||
188 | TP_STRUCT__entry( | 191 | TP_STRUCT__entry( |
189 | __field( unsigned int, exit_reason ) | 192 | __field( unsigned int, exit_reason ) |
190 | __field( unsigned long, guest_rip ) | 193 | __field( unsigned long, guest_rip ) |
194 | __field( u32, isa ) | ||
195 | __field( u64, info1 ) | ||
196 | __field( u64, info2 ) | ||
191 | ), | 197 | ), |
192 | 198 | ||
193 | TP_fast_assign( | 199 | TP_fast_assign( |
194 | __entry->exit_reason = exit_reason; | 200 | __entry->exit_reason = exit_reason; |
195 | __entry->guest_rip = kvm_rip_read(vcpu); | 201 | __entry->guest_rip = kvm_rip_read(vcpu); |
202 | __entry->isa = isa; | ||
203 | kvm_x86_ops->get_exit_info(vcpu, &__entry->info1, | ||
204 | &__entry->info2); | ||
196 | ), | 205 | ), |
197 | 206 | ||
198 | TP_printk("reason %s rip 0x%lx", | 207 | TP_printk("reason %s rip 0x%lx info %llx %llx", |
199 | ftrace_print_symbols_seq(p, __entry->exit_reason, | 208 | ftrace_print_symbols_seq(p, __entry->exit_reason, |
200 | kvm_x86_ops->exit_reasons_str), | 209 | kvm_x86_ops->exit_reasons_str), |
201 | __entry->guest_rip) | 210 | __entry->guest_rip, __entry->info1, __entry->info2) |
202 | ); | 211 | ); |
203 | 212 | ||
204 | /* | 213 | /* |
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 7bddfab12013..d48ec60ea421 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c | |||
@@ -5,7 +5,7 @@ | |||
5 | * machines without emulation or binary translation. | 5 | * machines without emulation or binary translation. |
6 | * | 6 | * |
7 | * Copyright (C) 2006 Qumranet, Inc. | 7 | * Copyright (C) 2006 Qumranet, Inc. |
8 | * Copyright 2010 Red Hat, Inc. and/or its affilates. | 8 | * Copyright 2010 Red Hat, Inc. and/or its affiliates. |
9 | * | 9 | * |
10 | * Authors: | 10 | * Authors: |
11 | * Avi Kivity <avi@qumranet.com> | 11 | * Avi Kivity <avi@qumranet.com> |
@@ -69,6 +69,9 @@ module_param(emulate_invalid_guest_state, bool, S_IRUGO); | |||
69 | static int __read_mostly vmm_exclusive = 1; | 69 | static int __read_mostly vmm_exclusive = 1; |
70 | module_param(vmm_exclusive, bool, S_IRUGO); | 70 | module_param(vmm_exclusive, bool, S_IRUGO); |
71 | 71 | ||
72 | static int __read_mostly yield_on_hlt = 1; | ||
73 | module_param(yield_on_hlt, bool, S_IRUGO); | ||
74 | |||
72 | #define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \ | 75 | #define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \ |
73 | (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD) | 76 | (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD) |
74 | #define KVM_GUEST_CR0_MASK \ | 77 | #define KVM_GUEST_CR0_MASK \ |
@@ -90,14 +93,14 @@ module_param(vmm_exclusive, bool, S_IRUGO); | |||
90 | * These 2 parameters are used to config the controls for Pause-Loop Exiting: | 93 | * These 2 parameters are used to config the controls for Pause-Loop Exiting: |
91 | * ple_gap: upper bound on the amount of time between two successive | 94 | * ple_gap: upper bound on the amount of time between two successive |
92 | * executions of PAUSE in a loop. Also indicate if ple enabled. | 95 | * executions of PAUSE in a loop. Also indicate if ple enabled. |
93 | * According to test, this time is usually small than 41 cycles. | 96 | * According to test, this time is usually smaller than 128 cycles. |
94 | * ple_window: upper bound on the amount of time a guest is allowed to execute | 97 | * ple_window: upper bound on the amount of time a guest is allowed to execute |
95 | * in a PAUSE loop. Tests indicate that most spinlocks are held for | 98 | * in a PAUSE loop. Tests indicate that most spinlocks are held for |
96 | * less than 2^12 cycles | 99 | * less than 2^12 cycles |
97 | * Time is measured based on a counter that runs at the same rate as the TSC, | 100 | * Time is measured based on a counter that runs at the same rate as the TSC, |
98 | * refer SDM volume 3b section 21.6.13 & 22.1.3. | 101 | * refer SDM volume 3b section 21.6.13 & 22.1.3. |
99 | */ | 102 | */ |
100 | #define KVM_VMX_DEFAULT_PLE_GAP 41 | 103 | #define KVM_VMX_DEFAULT_PLE_GAP 128 |
101 | #define KVM_VMX_DEFAULT_PLE_WINDOW 4096 | 104 | #define KVM_VMX_DEFAULT_PLE_WINDOW 4096 |
102 | static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP; | 105 | static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP; |
103 | module_param(ple_gap, int, S_IRUGO); | 106 | module_param(ple_gap, int, S_IRUGO); |
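Both yield_on_hlt and ple_gap are exposed read-only (S_IRUGO), so they can only be chosen at module load time, e.g. with something along the lines of "modprobe kvm-intel ple_gap=0 yield_on_hlt=0". The parameter names are taken from the declarations above; treating ple_gap=0 as "disable Pause-Loop Exiting" follows from the comment that ple_gap also indicates whether PLE is enabled, and is an inference rather than behaviour documented in this diff.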
@@ -125,7 +128,11 @@ struct vcpu_vmx { | |||
125 | unsigned long host_rsp; | 128 | unsigned long host_rsp; |
126 | int launched; | 129 | int launched; |
127 | u8 fail; | 130 | u8 fail; |
131 | u8 cpl; | ||
132 | bool nmi_known_unmasked; | ||
133 | u32 exit_intr_info; | ||
128 | u32 idt_vectoring_info; | 134 | u32 idt_vectoring_info; |
135 | ulong rflags; | ||
129 | struct shared_msr_entry *guest_msrs; | 136 | struct shared_msr_entry *guest_msrs; |
130 | int nmsrs; | 137 | int nmsrs; |
131 | int save_nmsrs; | 138 | int save_nmsrs; |
@@ -154,12 +161,11 @@ struct vcpu_vmx { | |||
154 | u32 limit; | 161 | u32 limit; |
155 | u32 ar; | 162 | u32 ar; |
156 | } tr, es, ds, fs, gs; | 163 | } tr, es, ds, fs, gs; |
157 | struct { | ||
158 | bool pending; | ||
159 | u8 vector; | ||
160 | unsigned rip; | ||
161 | } irq; | ||
162 | } rmode; | 164 | } rmode; |
165 | struct { | ||
166 | u32 bitmask; /* 4 bits per segment (1 bit per field) */ | ||
167 | struct kvm_save_segment seg[8]; | ||
168 | } segment_cache; | ||
163 | int vpid; | 169 | int vpid; |
164 | bool emulation_required; | 170 | bool emulation_required; |
165 | 171 | ||
@@ -172,15 +178,25 @@ struct vcpu_vmx { | |||
172 | bool rdtscp_enabled; | 178 | bool rdtscp_enabled; |
173 | }; | 179 | }; |
174 | 180 | ||
181 | enum segment_cache_field { | ||
182 | SEG_FIELD_SEL = 0, | ||
183 | SEG_FIELD_BASE = 1, | ||
184 | SEG_FIELD_LIMIT = 2, | ||
185 | SEG_FIELD_AR = 3, | ||
186 | |||
187 | SEG_FIELD_NR = 4 | ||
188 | }; | ||
189 | |||
175 | static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) | 190 | static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) |
176 | { | 191 | { |
177 | return container_of(vcpu, struct vcpu_vmx, vcpu); | 192 | return container_of(vcpu, struct vcpu_vmx, vcpu); |
178 | } | 193 | } |
179 | 194 | ||
180 | static int init_rmode(struct kvm *kvm); | ||
181 | static u64 construct_eptp(unsigned long root_hpa); | 195 | static u64 construct_eptp(unsigned long root_hpa); |
182 | static void kvm_cpu_vmxon(u64 addr); | 196 | static void kvm_cpu_vmxon(u64 addr); |
183 | static void kvm_cpu_vmxoff(void); | 197 | static void kvm_cpu_vmxoff(void); |
198 | static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); | ||
199 | static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); | ||
184 | 200 | ||
185 | static DEFINE_PER_CPU(struct vmcs *, vmxarea); | 201 | static DEFINE_PER_CPU(struct vmcs *, vmxarea); |
186 | static DEFINE_PER_CPU(struct vmcs *, current_vmcs); | 202 | static DEFINE_PER_CPU(struct vmcs *, current_vmcs); |
@@ -192,6 +208,8 @@ static unsigned long *vmx_io_bitmap_b; | |||
192 | static unsigned long *vmx_msr_bitmap_legacy; | 208 | static unsigned long *vmx_msr_bitmap_legacy; |
193 | static unsigned long *vmx_msr_bitmap_longmode; | 209 | static unsigned long *vmx_msr_bitmap_longmode; |
194 | 210 | ||
211 | static bool cpu_has_load_ia32_efer; | ||
212 | |||
195 | static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS); | 213 | static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS); |
196 | static DEFINE_SPINLOCK(vmx_vpid_lock); | 214 | static DEFINE_SPINLOCK(vmx_vpid_lock); |
197 | 215 | ||
@@ -476,7 +494,7 @@ static void vmcs_clear(struct vmcs *vmcs) | |||
476 | u8 error; | 494 | u8 error; |
477 | 495 | ||
478 | asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0" | 496 | asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0" |
479 | : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) | 497 | : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr) |
480 | : "cc", "memory"); | 498 | : "cc", "memory"); |
481 | if (error) | 499 | if (error) |
482 | printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n", | 500 | printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n", |
@@ -489,7 +507,7 @@ static void vmcs_load(struct vmcs *vmcs) | |||
489 | u8 error; | 507 | u8 error; |
490 | 508 | ||
491 | asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0" | 509 | asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0" |
492 | : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) | 510 | : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr) |
493 | : "cc", "memory"); | 511 | : "cc", "memory"); |
494 | if (error) | 512 | if (error) |
495 | printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n", | 513 | printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n", |
@@ -505,7 +523,6 @@ static void __vcpu_clear(void *arg) | |||
505 | vmcs_clear(vmx->vmcs); | 523 | vmcs_clear(vmx->vmcs); |
506 | if (per_cpu(current_vmcs, cpu) == vmx->vmcs) | 524 | if (per_cpu(current_vmcs, cpu) == vmx->vmcs) |
507 | per_cpu(current_vmcs, cpu) = NULL; | 525 | per_cpu(current_vmcs, cpu) = NULL; |
508 | rdtscll(vmx->vcpu.arch.host_tsc); | ||
509 | list_del(&vmx->local_vcpus_link); | 526 | list_del(&vmx->local_vcpus_link); |
510 | vmx->vcpu.cpu = -1; | 527 | vmx->vcpu.cpu = -1; |
511 | vmx->launched = 0; | 528 | vmx->launched = 0; |
@@ -570,10 +587,10 @@ static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa) | |||
570 | 587 | ||
571 | static unsigned long vmcs_readl(unsigned long field) | 588 | static unsigned long vmcs_readl(unsigned long field) |
572 | { | 589 | { |
573 | unsigned long value; | 590 | unsigned long value = 0; |
574 | 591 | ||
575 | asm volatile (__ex(ASM_VMX_VMREAD_RDX_RAX) | 592 | asm volatile (__ex(ASM_VMX_VMREAD_RDX_RAX) |
576 | : "=a"(value) : "d"(field) : "cc"); | 593 | : "+a"(value) : "d"(field) : "cc"); |
577 | return value; | 594 | return value; |
578 | } | 595 | } |
579 | 596 | ||
@@ -642,6 +659,62 @@ static void vmcs_set_bits(unsigned long field, u32 mask) | |||
642 | vmcs_writel(field, vmcs_readl(field) | mask); | 659 | vmcs_writel(field, vmcs_readl(field) | mask); |
643 | } | 660 | } |
644 | 661 | ||
662 | static void vmx_segment_cache_clear(struct vcpu_vmx *vmx) | ||
663 | { | ||
664 | vmx->segment_cache.bitmask = 0; | ||
665 | } | ||
666 | |||
667 | static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg, | ||
668 | unsigned field) | ||
669 | { | ||
670 | bool ret; | ||
671 | u32 mask = 1 << (seg * SEG_FIELD_NR + field); | ||
672 | |||
673 | if (!(vmx->vcpu.arch.regs_avail & (1 << VCPU_EXREG_SEGMENTS))) { | ||
674 | vmx->vcpu.arch.regs_avail |= (1 << VCPU_EXREG_SEGMENTS); | ||
675 | vmx->segment_cache.bitmask = 0; | ||
676 | } | ||
677 | ret = vmx->segment_cache.bitmask & mask; | ||
678 | vmx->segment_cache.bitmask |= mask; | ||
679 | return ret; | ||
680 | } | ||
681 | |||
682 | static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg) | ||
683 | { | ||
684 | u16 *p = &vmx->segment_cache.seg[seg].selector; | ||
685 | |||
686 | if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL)) | ||
687 | *p = vmcs_read16(kvm_vmx_segment_fields[seg].selector); | ||
688 | return *p; | ||
689 | } | ||
690 | |||
691 | static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg) | ||
692 | { | ||
693 | ulong *p = &vmx->segment_cache.seg[seg].base; | ||
694 | |||
695 | if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE)) | ||
696 | *p = vmcs_readl(kvm_vmx_segment_fields[seg].base); | ||
697 | return *p; | ||
698 | } | ||
699 | |||
700 | static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg) | ||
701 | { | ||
702 | u32 *p = &vmx->segment_cache.seg[seg].limit; | ||
703 | |||
704 | if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT)) | ||
705 | *p = vmcs_read32(kvm_vmx_segment_fields[seg].limit); | ||
706 | return *p; | ||
707 | } | ||
708 | |||
709 | static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg) | ||
710 | { | ||
711 | u32 *p = &vmx->segment_cache.seg[seg].ar; | ||
712 | |||
713 | if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR)) | ||
714 | *p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes); | ||
715 | return *p; | ||
716 | } | ||
717 | |||
645 | static void update_exception_bitmap(struct kvm_vcpu *vcpu) | 718 | static void update_exception_bitmap(struct kvm_vcpu *vcpu) |
646 | { | 719 | { |
647 | u32 eb; | 720 | u32 eb; |
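The segment cache introduced above is a plain read-through cache: one validity bit per (segment, field) pair, with a miss falling back to a VMCS read. A minimal standalone model of that pattern, omitting the VCPU_EXREG_SEGMENTS lazy invalidation and using illustrative names:

	#include <stdbool.h>
	#include <stdint.h>

	enum { SEG_FIELD_SEL, SEG_FIELD_BASE, SEG_FIELD_LIMIT, SEG_FIELD_AR, SEG_FIELD_NR };

	struct seg_cache {
		uint32_t bitmask;	/* 4 bits per segment (1 bit per field) */
		uint16_t selector[8];
	};

	/* Returns true on a hit; on a miss, marks the field valid for next time. */
	static bool cache_test_set(struct seg_cache *c, unsigned seg, unsigned field)
	{
		uint32_t mask = 1u << (seg * SEG_FIELD_NR + field);
		bool hit = c->bitmask & mask;

		c->bitmask |= mask;
		return hit;
	}

	static uint16_t read_seg_selector(struct seg_cache *c, unsigned seg,
					  uint16_t (*vmcs_read_sel)(unsigned seg))
	{
		if (!cache_test_set(c, seg, SEG_FIELD_SEL))
			c->selector[seg] = vmcs_read_sel(seg);	/* only the miss touches the VMCS */
		return c->selector[seg];
	}

Repeated vmx_get_segment() calls between VM exits then cost one VMREAD per field instead of one per call, which is why the later hunks add vmx_segment_cache_clear() after every VMWRITE that could invalidate the cached values.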
@@ -666,6 +739,12 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) | |||
666 | unsigned i; | 739 | unsigned i; |
667 | struct msr_autoload *m = &vmx->msr_autoload; | 740 | struct msr_autoload *m = &vmx->msr_autoload; |
668 | 741 | ||
742 | if (msr == MSR_EFER && cpu_has_load_ia32_efer) { | ||
743 | vmcs_clear_bits(VM_ENTRY_CONTROLS, VM_ENTRY_LOAD_IA32_EFER); | ||
744 | vmcs_clear_bits(VM_EXIT_CONTROLS, VM_EXIT_LOAD_IA32_EFER); | ||
745 | return; | ||
746 | } | ||
747 | |||
669 | for (i = 0; i < m->nr; ++i) | 748 | for (i = 0; i < m->nr; ++i) |
670 | if (m->guest[i].index == msr) | 749 | if (m->guest[i].index == msr) |
671 | break; | 750 | break; |
@@ -685,6 +764,14 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, | |||
685 | unsigned i; | 764 | unsigned i; |
686 | struct msr_autoload *m = &vmx->msr_autoload; | 765 | struct msr_autoload *m = &vmx->msr_autoload; |
687 | 766 | ||
767 | if (msr == MSR_EFER && cpu_has_load_ia32_efer) { | ||
768 | vmcs_write64(GUEST_IA32_EFER, guest_val); | ||
769 | vmcs_write64(HOST_IA32_EFER, host_val); | ||
770 | vmcs_set_bits(VM_ENTRY_CONTROLS, VM_ENTRY_LOAD_IA32_EFER); | ||
771 | vmcs_set_bits(VM_EXIT_CONTROLS, VM_EXIT_LOAD_IA32_EFER); | ||
772 | return; | ||
773 | } | ||
774 | |||
688 | for (i = 0; i < m->nr; ++i) | 775 | for (i = 0; i < m->nr; ++i) |
689 | if (m->guest[i].index == msr) | 776 | if (m->guest[i].index == msr) |
690 | break; | 777 | break; |
@@ -706,11 +793,10 @@ static void reload_tss(void) | |||
706 | /* | 793 | /* |
707 | * VT restores TR but not its size. Useless. | 794 | * VT restores TR but not its size. Useless. |
708 | */ | 795 | */ |
709 | struct desc_ptr gdt; | 796 | struct desc_ptr *gdt = &__get_cpu_var(host_gdt); |
710 | struct desc_struct *descs; | 797 | struct desc_struct *descs; |
711 | 798 | ||
712 | native_store_gdt(&gdt); | 799 | descs = (void *)gdt->address; |
713 | descs = (void *)gdt.address; | ||
714 | descs[GDT_ENTRY_TSS].type = 9; /* available TSS */ | 800 | descs[GDT_ENTRY_TSS].type = 9; /* available TSS */ |
715 | load_TR_desc(); | 801 | load_TR_desc(); |
716 | } | 802 | } |
@@ -753,7 +839,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) | |||
753 | 839 | ||
754 | static unsigned long segment_base(u16 selector) | 840 | static unsigned long segment_base(u16 selector) |
755 | { | 841 | { |
756 | struct desc_ptr gdt; | 842 | struct desc_ptr *gdt = &__get_cpu_var(host_gdt); |
757 | struct desc_struct *d; | 843 | struct desc_struct *d; |
758 | unsigned long table_base; | 844 | unsigned long table_base; |
759 | unsigned long v; | 845 | unsigned long v; |
@@ -761,8 +847,7 @@ static unsigned long segment_base(u16 selector) | |||
761 | if (!(selector & ~3)) | 847 | if (!(selector & ~3)) |
762 | return 0; | 848 | return 0; |
763 | 849 | ||
764 | native_store_gdt(&gdt); | 850 | table_base = gdt->address; |
765 | table_base = gdt.address; | ||
766 | 851 | ||
767 | if (selector & 4) { /* from ldt */ | 852 | if (selector & 4) { /* from ldt */ |
768 | u16 ldt_selector = kvm_read_ldt(); | 853 | u16 ldt_selector = kvm_read_ldt(); |
@@ -828,10 +913,9 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) | |||
828 | #endif | 913 | #endif |
829 | 914 | ||
830 | #ifdef CONFIG_X86_64 | 915 | #ifdef CONFIG_X86_64 |
831 | if (is_long_mode(&vmx->vcpu)) { | 916 | rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); |
832 | rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); | 917 | if (is_long_mode(&vmx->vcpu)) |
833 | wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); | 918 | wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); |
834 | } | ||
835 | #endif | 919 | #endif |
836 | for (i = 0; i < vmx->save_nmsrs; ++i) | 920 | for (i = 0; i < vmx->save_nmsrs; ++i) |
837 | kvm_set_shared_msr(vmx->guest_msrs[i].index, | 921 | kvm_set_shared_msr(vmx->guest_msrs[i].index, |
@@ -846,23 +930,23 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx) | |||
846 | 930 | ||
847 | ++vmx->vcpu.stat.host_state_reload; | 931 | ++vmx->vcpu.stat.host_state_reload; |
848 | vmx->host_state.loaded = 0; | 932 | vmx->host_state.loaded = 0; |
849 | if (vmx->host_state.fs_reload_needed) | 933 | #ifdef CONFIG_X86_64 |
850 | loadsegment(fs, vmx->host_state.fs_sel); | 934 | if (is_long_mode(&vmx->vcpu)) |
935 | rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); | ||
936 | #endif | ||
851 | if (vmx->host_state.gs_ldt_reload_needed) { | 937 | if (vmx->host_state.gs_ldt_reload_needed) { |
852 | kvm_load_ldt(vmx->host_state.ldt_sel); | 938 | kvm_load_ldt(vmx->host_state.ldt_sel); |
853 | #ifdef CONFIG_X86_64 | 939 | #ifdef CONFIG_X86_64 |
854 | load_gs_index(vmx->host_state.gs_sel); | 940 | load_gs_index(vmx->host_state.gs_sel); |
855 | wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs); | ||
856 | #else | 941 | #else |
857 | loadsegment(gs, vmx->host_state.gs_sel); | 942 | loadsegment(gs, vmx->host_state.gs_sel); |
858 | #endif | 943 | #endif |
859 | } | 944 | } |
945 | if (vmx->host_state.fs_reload_needed) | ||
946 | loadsegment(fs, vmx->host_state.fs_sel); | ||
860 | reload_tss(); | 947 | reload_tss(); |
861 | #ifdef CONFIG_X86_64 | 948 | #ifdef CONFIG_X86_64 |
862 | if (is_long_mode(&vmx->vcpu)) { | 949 | wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); |
863 | rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); | ||
864 | wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); | ||
865 | } | ||
866 | #endif | 950 | #endif |
867 | if (current_thread_info()->status & TS_USEDFPU) | 951 | if (current_thread_info()->status & TS_USEDFPU) |
868 | clts(); | 952 | clts(); |
@@ -883,7 +967,6 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx) | |||
883 | static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) | 967 | static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) |
884 | { | 968 | { |
885 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 969 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
886 | u64 tsc_this, delta, new_offset; | ||
887 | u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); | 970 | u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); |
888 | 971 | ||
889 | if (!vmm_exclusive) | 972 | if (!vmm_exclusive) |
@@ -897,37 +980,24 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) | |||
897 | } | 980 | } |
898 | 981 | ||
899 | if (vcpu->cpu != cpu) { | 982 | if (vcpu->cpu != cpu) { |
900 | struct desc_ptr dt; | 983 | struct desc_ptr *gdt = &__get_cpu_var(host_gdt); |
901 | unsigned long sysenter_esp; | 984 | unsigned long sysenter_esp; |
902 | 985 | ||
903 | kvm_migrate_timers(vcpu); | ||
904 | kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); | 986 | kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); |
905 | local_irq_disable(); | 987 | local_irq_disable(); |
906 | list_add(&vmx->local_vcpus_link, | 988 | list_add(&vmx->local_vcpus_link, |
907 | &per_cpu(vcpus_on_cpu, cpu)); | 989 | &per_cpu(vcpus_on_cpu, cpu)); |
908 | local_irq_enable(); | 990 | local_irq_enable(); |
909 | 991 | ||
910 | vcpu->cpu = cpu; | ||
911 | /* | 992 | /* |
912 | * Linux uses per-cpu TSS and GDT, so set these when switching | 993 | * Linux uses per-cpu TSS and GDT, so set these when switching |
913 | * processors. | 994 | * processors. |
914 | */ | 995 | */ |
915 | vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */ | 996 | vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */ |
916 | native_store_gdt(&dt); | 997 | vmcs_writel(HOST_GDTR_BASE, gdt->address); /* 22.2.4 */ |
917 | vmcs_writel(HOST_GDTR_BASE, dt.address); /* 22.2.4 */ | ||
918 | 998 | ||
919 | rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); | 999 | rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); |
920 | vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ | 1000 | vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ |
921 | |||
922 | /* | ||
923 | * Make sure the time stamp counter is monotonous. | ||
924 | */ | ||
925 | rdtscll(tsc_this); | ||
926 | if (tsc_this < vcpu->arch.host_tsc) { | ||
927 | delta = vcpu->arch.host_tsc - tsc_this; | ||
928 | new_offset = vmcs_read64(TSC_OFFSET) + delta; | ||
929 | vmcs_write64(TSC_OFFSET, new_offset); | ||
930 | } | ||
931 | } | 1001 | } |
932 | } | 1002 | } |
933 | 1003 | ||
@@ -972,17 +1042,24 @@ static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) | |||
972 | { | 1042 | { |
973 | unsigned long rflags, save_rflags; | 1043 | unsigned long rflags, save_rflags; |
974 | 1044 | ||
975 | rflags = vmcs_readl(GUEST_RFLAGS); | 1045 | if (!test_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail)) { |
976 | if (to_vmx(vcpu)->rmode.vm86_active) { | 1046 | __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail); |
977 | rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS; | 1047 | rflags = vmcs_readl(GUEST_RFLAGS); |
978 | save_rflags = to_vmx(vcpu)->rmode.save_rflags; | 1048 | if (to_vmx(vcpu)->rmode.vm86_active) { |
979 | rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; | 1049 | rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS; |
1050 | save_rflags = to_vmx(vcpu)->rmode.save_rflags; | ||
1051 | rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; | ||
1052 | } | ||
1053 | to_vmx(vcpu)->rflags = rflags; | ||
980 | } | 1054 | } |
981 | return rflags; | 1055 | return to_vmx(vcpu)->rflags; |
982 | } | 1056 | } |
983 | 1057 | ||
984 | static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) | 1058 | static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) |
985 | { | 1059 | { |
1060 | __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail); | ||
1061 | __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); | ||
1062 | to_vmx(vcpu)->rflags = rflags; | ||
986 | if (to_vmx(vcpu)->rmode.vm86_active) { | 1063 | if (to_vmx(vcpu)->rmode.vm86_active) { |
987 | to_vmx(vcpu)->rmode.save_rflags = rflags; | 1064 | to_vmx(vcpu)->rmode.save_rflags = rflags; |
988 | rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; | 1065 | rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; |
@@ -1031,6 +1108,17 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) | |||
1031 | vmx_set_interrupt_shadow(vcpu, 0); | 1108 | vmx_set_interrupt_shadow(vcpu, 0); |
1032 | } | 1109 | } |
1033 | 1110 | ||
1111 | static void vmx_clear_hlt(struct kvm_vcpu *vcpu) | ||
1112 | { | ||
1113 | /* Ensure that we clear the HLT state in the VMCS. We don't need to | ||
1114 | * explicitly skip the instruction because if the HLT state is set, then | ||
1115 | * the instruction is already executing and RIP has already been | ||
1116 | * advanced. */ | ||
1117 | if (!yield_on_hlt && | ||
1118 | vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT) | ||
1119 | vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); | ||
1120 | } | ||
1121 | |||
1034 | static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, | 1122 | static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, |
1035 | bool has_error_code, u32 error_code, | 1123 | bool has_error_code, u32 error_code, |
1036 | bool reinject) | 1124 | bool reinject) |
@@ -1044,16 +1132,11 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, | |||
1044 | } | 1132 | } |
1045 | 1133 | ||
1046 | if (vmx->rmode.vm86_active) { | 1134 | if (vmx->rmode.vm86_active) { |
1047 | vmx->rmode.irq.pending = true; | 1135 | int inc_eip = 0; |
1048 | vmx->rmode.irq.vector = nr; | ||
1049 | vmx->rmode.irq.rip = kvm_rip_read(vcpu); | ||
1050 | if (kvm_exception_is_soft(nr)) | 1136 | if (kvm_exception_is_soft(nr)) |
1051 | vmx->rmode.irq.rip += | 1137 | inc_eip = vcpu->arch.event_exit_inst_len; |
1052 | vmx->vcpu.arch.event_exit_inst_len; | 1138 | if (kvm_inject_realmode_interrupt(vcpu, nr, inc_eip) != EMULATE_DONE) |
1053 | intr_info |= INTR_TYPE_SOFT_INTR; | 1139 | kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); |
1054 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); | ||
1055 | vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); | ||
1056 | kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1); | ||
1057 | return; | 1140 | return; |
1058 | } | 1141 | } |
1059 | 1142 | ||
@@ -1065,6 +1148,7 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, | |||
1065 | intr_info |= INTR_TYPE_HARD_EXCEPTION; | 1148 | intr_info |= INTR_TYPE_HARD_EXCEPTION; |
1066 | 1149 | ||
1067 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); | 1150 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); |
1151 | vmx_clear_hlt(vcpu); | ||
1068 | } | 1152 | } |
1069 | 1153 | ||
1070 | static bool vmx_rdtscp_supported(void) | 1154 | static bool vmx_rdtscp_supported(void) |
@@ -1149,12 +1233,32 @@ static u64 guest_read_tsc(void) | |||
1149 | } | 1233 | } |
1150 | 1234 | ||
1151 | /* | 1235 | /* |
1152 | * writes 'guest_tsc' into guest's timestamp counter "register" | 1236 | * Empty call-back. Needs to be implemented when VMX enables the SET_TSC_KHZ |
1153 | * guest_tsc = host_tsc + tsc_offset ==> tsc_offset = guest_tsc - host_tsc | 1237 | * ioctl. In this case the call-back should update internal vmx state to make |
1238 | * the changes effective. | ||
1239 | */ | ||
1240 | static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz) | ||
1241 | { | ||
1242 | /* Nothing to do here */ | ||
1243 | } | ||
1244 | |||
1245 | /* | ||
1246 | * writes 'offset' into guest's timestamp counter offset register | ||
1154 | */ | 1247 | */ |
1155 | static void guest_write_tsc(u64 guest_tsc, u64 host_tsc) | 1248 | static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) |
1156 | { | 1249 | { |
1157 | vmcs_write64(TSC_OFFSET, guest_tsc - host_tsc); | 1250 | vmcs_write64(TSC_OFFSET, offset); |
1251 | } | ||
1252 | |||
1253 | static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment) | ||
1254 | { | ||
1255 | u64 offset = vmcs_read64(TSC_OFFSET); | ||
1256 | vmcs_write64(TSC_OFFSET, offset + adjustment); | ||
1257 | } | ||
1258 | |||
1259 | static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) | ||
1260 | { | ||
1261 | return target_tsc - native_read_tsc(); | ||
1158 | } | 1262 | } |
1159 | 1263 | ||
1160 | /* | 1264 | /* |
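The removed guest_write_tsc() helper is replaced above by explicit offset callbacks; the arithmetic they share is simply guest_tsc = host_tsc + TSC_OFFSET. A toy, self-contained model of that relation (the real code writes the VMCS TSC_OFFSET field rather than a variable):

	#include <stdint.h>

	static uint64_t tsc_offset;	/* stands in for the VMCS TSC_OFFSET field */

	/* Mirrors vmx_compute_tsc_offset(): pick the offset that makes the guest see target_tsc. */
	static uint64_t compute_tsc_offset(uint64_t host_tsc, uint64_t target_tsc)
	{
		return target_tsc - host_tsc;
	}

	/* Mirrors vmx_adjust_tsc_offset(): adjustments are purely additive. */
	static void adjust_tsc_offset(int64_t adjustment)
	{
		tsc_offset += (uint64_t)adjustment;
	}

	/* What RDTSC returns in the guest while offsetting is enabled. */
	static uint64_t guest_observed_tsc(uint64_t host_tsc)
	{
		return host_tsc + tsc_offset;
	}

With kvm_write_tsc() (called from the MSR_IA32_TSC write path below) centralizing this in common x86 code, the vendor backend only needs these hooks plus set_tsc_khz, as listed in svm_x86_ops earlier in this diff.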
@@ -1227,7 +1331,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) | |||
1227 | { | 1331 | { |
1228 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 1332 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
1229 | struct shared_msr_entry *msr; | 1333 | struct shared_msr_entry *msr; |
1230 | u64 host_tsc; | ||
1231 | int ret = 0; | 1334 | int ret = 0; |
1232 | 1335 | ||
1233 | switch (msr_index) { | 1336 | switch (msr_index) { |
@@ -1237,9 +1340,11 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) | |||
1237 | break; | 1340 | break; |
1238 | #ifdef CONFIG_X86_64 | 1341 | #ifdef CONFIG_X86_64 |
1239 | case MSR_FS_BASE: | 1342 | case MSR_FS_BASE: |
1343 | vmx_segment_cache_clear(vmx); | ||
1240 | vmcs_writel(GUEST_FS_BASE, data); | 1344 | vmcs_writel(GUEST_FS_BASE, data); |
1241 | break; | 1345 | break; |
1242 | case MSR_GS_BASE: | 1346 | case MSR_GS_BASE: |
1347 | vmx_segment_cache_clear(vmx); | ||
1243 | vmcs_writel(GUEST_GS_BASE, data); | 1348 | vmcs_writel(GUEST_GS_BASE, data); |
1244 | break; | 1349 | break; |
1245 | case MSR_KERNEL_GS_BASE: | 1350 | case MSR_KERNEL_GS_BASE: |
@@ -1257,8 +1362,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) | |||
1257 | vmcs_writel(GUEST_SYSENTER_ESP, data); | 1362 | vmcs_writel(GUEST_SYSENTER_ESP, data); |
1258 | break; | 1363 | break; |
1259 | case MSR_IA32_TSC: | 1364 | case MSR_IA32_TSC: |
1260 | rdtscll(host_tsc); | 1365 | kvm_write_tsc(vcpu, data); |
1261 | guest_write_tsc(data, host_tsc); | ||
1262 | break; | 1366 | break; |
1263 | case MSR_IA32_CR_PAT: | 1367 | case MSR_IA32_CR_PAT: |
1264 | if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { | 1368 | if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { |
@@ -1328,16 +1432,25 @@ static __init int vmx_disabled_by_bios(void) | |||
1328 | 1432 | ||
1329 | rdmsrl(MSR_IA32_FEATURE_CONTROL, msr); | 1433 | rdmsrl(MSR_IA32_FEATURE_CONTROL, msr); |
1330 | if (msr & FEATURE_CONTROL_LOCKED) { | 1434 | if (msr & FEATURE_CONTROL_LOCKED) { |
1435 | /* launched w/ TXT and VMX disabled */ | ||
1331 | if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX) | 1436 | if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX) |
1332 | && tboot_enabled()) | 1437 | && tboot_enabled()) |
1333 | return 1; | 1438 | return 1; |
1439 | /* launched w/o TXT and VMX only enabled w/ TXT */ | ||
1440 | if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX) | ||
1441 | && (msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX) | ||
1442 | && !tboot_enabled()) { | ||
1443 | printk(KERN_WARNING "kvm: disable TXT in the BIOS or " | ||
1444 | "activate TXT before enabling KVM\n"); | ||
1445 | return 1; | ||
1446 | } | ||
1447 | /* launched w/o TXT and VMX disabled */ | ||
1334 | if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX) | 1448 | if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX) |
1335 | && !tboot_enabled()) | 1449 | && !tboot_enabled()) |
1336 | return 1; | 1450 | return 1; |
1337 | } | 1451 | } |
1338 | 1452 | ||
1339 | return 0; | 1453 | return 0; |
1340 | /* locked but not enabled */ | ||
1341 | } | 1454 | } |
1342 | 1455 | ||
1343 | static void kvm_cpu_vmxon(u64 addr) | 1456 | static void kvm_cpu_vmxon(u64 addr) |
@@ -1427,6 +1540,14 @@ static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, | |||
1427 | return 0; | 1540 | return 0; |
1428 | } | 1541 | } |
1429 | 1542 | ||
1543 | static __init bool allow_1_setting(u32 msr, u32 ctl) | ||
1544 | { | ||
1545 | u32 vmx_msr_low, vmx_msr_high; | ||
1546 | |||
1547 | rdmsr(msr, vmx_msr_low, vmx_msr_high); | ||
1548 | return vmx_msr_high & ctl; | ||
1549 | } | ||
1550 | |||
1430 | static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) | 1551 | static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) |
1431 | { | 1552 | { |
1432 | u32 vmx_msr_low, vmx_msr_high; | 1553 | u32 vmx_msr_low, vmx_msr_high; |
@@ -1443,7 +1564,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) | |||
1443 | &_pin_based_exec_control) < 0) | 1564 | &_pin_based_exec_control) < 0) |
1444 | return -EIO; | 1565 | return -EIO; |
1445 | 1566 | ||
1446 | min = CPU_BASED_HLT_EXITING | | 1567 | min = |
1447 | #ifdef CONFIG_X86_64 | 1568 | #ifdef CONFIG_X86_64 |
1448 | CPU_BASED_CR8_LOAD_EXITING | | 1569 | CPU_BASED_CR8_LOAD_EXITING | |
1449 | CPU_BASED_CR8_STORE_EXITING | | 1570 | CPU_BASED_CR8_STORE_EXITING | |
@@ -1456,6 +1577,10 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) | |||
1456 | CPU_BASED_MWAIT_EXITING | | 1577 | CPU_BASED_MWAIT_EXITING | |
1457 | CPU_BASED_MONITOR_EXITING | | 1578 | CPU_BASED_MONITOR_EXITING | |
1458 | CPU_BASED_INVLPG_EXITING; | 1579 | CPU_BASED_INVLPG_EXITING; |
1580 | |||
1581 | if (yield_on_hlt) | ||
1582 | min |= CPU_BASED_HLT_EXITING; | ||
1583 | |||
1459 | opt = CPU_BASED_TPR_SHADOW | | 1584 | opt = CPU_BASED_TPR_SHADOW | |
1460 | CPU_BASED_USE_MSR_BITMAPS | | 1585 | CPU_BASED_USE_MSR_BITMAPS | |
1461 | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; | 1586 | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; |
@@ -1537,6 +1662,12 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) | |||
1537 | vmcs_conf->vmexit_ctrl = _vmexit_control; | 1662 | vmcs_conf->vmexit_ctrl = _vmexit_control; |
1538 | vmcs_conf->vmentry_ctrl = _vmentry_control; | 1663 | vmcs_conf->vmentry_ctrl = _vmentry_control; |
1539 | 1664 | ||
1665 | cpu_has_load_ia32_efer = | ||
1666 | allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS, | ||
1667 | VM_ENTRY_LOAD_IA32_EFER) | ||
1668 | && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS, | ||
1669 | VM_EXIT_LOAD_IA32_EFER); | ||
1670 | |||
1540 | return 0; | 1671 | return 0; |
1541 | } | 1672 | } |
1542 | 1673 | ||
@@ -1657,6 +1788,9 @@ static void enter_pmode(struct kvm_vcpu *vcpu) | |||
1657 | vmx->emulation_required = 1; | 1788 | vmx->emulation_required = 1; |
1658 | vmx->rmode.vm86_active = 0; | 1789 | vmx->rmode.vm86_active = 0; |
1659 | 1790 | ||
1791 | vmx_segment_cache_clear(vmx); | ||
1792 | |||
1793 | vmcs_write16(GUEST_TR_SELECTOR, vmx->rmode.tr.selector); | ||
1660 | vmcs_writel(GUEST_TR_BASE, vmx->rmode.tr.base); | 1794 | vmcs_writel(GUEST_TR_BASE, vmx->rmode.tr.base); |
1661 | vmcs_write32(GUEST_TR_LIMIT, vmx->rmode.tr.limit); | 1795 | vmcs_write32(GUEST_TR_LIMIT, vmx->rmode.tr.limit); |
1662 | vmcs_write32(GUEST_TR_AR_BYTES, vmx->rmode.tr.ar); | 1796 | vmcs_write32(GUEST_TR_AR_BYTES, vmx->rmode.tr.ar); |
@@ -1679,6 +1813,8 @@ static void enter_pmode(struct kvm_vcpu *vcpu) | |||
1679 | fix_pmode_dataseg(VCPU_SREG_GS, &vmx->rmode.gs); | 1813 | fix_pmode_dataseg(VCPU_SREG_GS, &vmx->rmode.gs); |
1680 | fix_pmode_dataseg(VCPU_SREG_FS, &vmx->rmode.fs); | 1814 | fix_pmode_dataseg(VCPU_SREG_FS, &vmx->rmode.fs); |
1681 | 1815 | ||
1816 | vmx_segment_cache_clear(vmx); | ||
1817 | |||
1682 | vmcs_write16(GUEST_SS_SELECTOR, 0); | 1818 | vmcs_write16(GUEST_SS_SELECTOR, 0); |
1683 | vmcs_write32(GUEST_SS_AR_BYTES, 0x93); | 1819 | vmcs_write32(GUEST_SS_AR_BYTES, 0x93); |
1684 | 1820 | ||
@@ -1710,9 +1846,13 @@ static void fix_rmode_seg(int seg, struct kvm_save_segment *save) | |||
1710 | save->limit = vmcs_read32(sf->limit); | 1846 | save->limit = vmcs_read32(sf->limit); |
1711 | save->ar = vmcs_read32(sf->ar_bytes); | 1847 | save->ar = vmcs_read32(sf->ar_bytes); |
1712 | vmcs_write16(sf->selector, save->base >> 4); | 1848 | vmcs_write16(sf->selector, save->base >> 4); |
1713 | vmcs_write32(sf->base, save->base & 0xfffff); | 1849 | vmcs_write32(sf->base, save->base & 0xffff0); |
1714 | vmcs_write32(sf->limit, 0xffff); | 1850 | vmcs_write32(sf->limit, 0xffff); |
1715 | vmcs_write32(sf->ar_bytes, 0xf3); | 1851 | vmcs_write32(sf->ar_bytes, 0xf3); |
1852 | if (save->base & 0xf) | ||
1853 | printk_once(KERN_WARNING "kvm: segment base is not paragraph" | ||
1854 | " aligned when entering protected mode (seg=%d)", | ||
1855 | seg); | ||
1716 | } | 1856 | } |
1717 | 1857 | ||
1718 | static void enter_rmode(struct kvm_vcpu *vcpu) | 1858 | static void enter_rmode(struct kvm_vcpu *vcpu) |
@@ -1726,6 +1866,21 @@ static void enter_rmode(struct kvm_vcpu *vcpu) | |||
1726 | vmx->emulation_required = 1; | 1866 | vmx->emulation_required = 1; |
1727 | vmx->rmode.vm86_active = 1; | 1867 | vmx->rmode.vm86_active = 1; |
1728 | 1868 | ||
1869 | /* | ||
1870 | * Very old userspace does not call KVM_SET_TSS_ADDR before entering | ||
1871 | * vcpu. Call it here with phys address pointing 16M below 4G. | ||
1872 | */ | ||
1873 | if (!vcpu->kvm->arch.tss_addr) { | ||
1874 | printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR needs to be " | ||
1875 | "called before entering vcpu\n"); | ||
1876 | srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); | ||
1877 | vmx_set_tss_addr(vcpu->kvm, 0xfeffd000); | ||
1878 | vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); | ||
1879 | } | ||
1880 | |||
1881 | vmx_segment_cache_clear(vmx); | ||
1882 | |||
1883 | vmx->rmode.tr.selector = vmcs_read16(GUEST_TR_SELECTOR); | ||
1729 | vmx->rmode.tr.base = vmcs_readl(GUEST_TR_BASE); | 1884 | vmx->rmode.tr.base = vmcs_readl(GUEST_TR_BASE); |
1730 | vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm)); | 1885 | vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm)); |
1731 | 1886 | ||
@@ -1764,7 +1919,6 @@ static void enter_rmode(struct kvm_vcpu *vcpu) | |||
1764 | 1919 | ||
1765 | continue_rmode: | 1920 | continue_rmode: |
1766 | kvm_mmu_reset_context(vcpu); | 1921 | kvm_mmu_reset_context(vcpu); |
1767 | init_rmode(vcpu->kvm); | ||
1768 | } | 1922 | } |
1769 | 1923 | ||
1770 | static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) | 1924 | static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) |
@@ -1802,6 +1956,8 @@ static void enter_lmode(struct kvm_vcpu *vcpu) | |||
1802 | { | 1956 | { |
1803 | u32 guest_tr_ar; | 1957 | u32 guest_tr_ar; |
1804 | 1958 | ||
1959 | vmx_segment_cache_clear(to_vmx(vcpu)); | ||
1960 | |||
1805 | guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES); | 1961 | guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES); |
1806 | if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) { | 1962 | if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) { |
1807 | printk(KERN_DEBUG "%s: tss fixup for long mode. \n", | 1963 | printk(KERN_DEBUG "%s: tss fixup for long mode. \n", |
@@ -1841,6 +1997,13 @@ static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) | |||
1841 | vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits; | 1997 | vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits; |
1842 | } | 1998 | } |
1843 | 1999 | ||
2000 | static void vmx_decache_cr3(struct kvm_vcpu *vcpu) | ||
2001 | { | ||
2002 | if (enable_ept && is_paging(vcpu)) | ||
2003 | vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); | ||
2004 | __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); | ||
2005 | } | ||
2006 | |||
1844 | static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) | 2007 | static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) |
1845 | { | 2008 | { |
1846 | ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits; | 2009 | ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits; |
@@ -1856,20 +2019,20 @@ static void ept_load_pdptrs(struct kvm_vcpu *vcpu) | |||
1856 | return; | 2019 | return; |
1857 | 2020 | ||
1858 | if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { | 2021 | if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { |
1859 | vmcs_write64(GUEST_PDPTR0, vcpu->arch.pdptrs[0]); | 2022 | vmcs_write64(GUEST_PDPTR0, vcpu->arch.mmu.pdptrs[0]); |
1860 | vmcs_write64(GUEST_PDPTR1, vcpu->arch.pdptrs[1]); | 2023 | vmcs_write64(GUEST_PDPTR1, vcpu->arch.mmu.pdptrs[1]); |
1861 | vmcs_write64(GUEST_PDPTR2, vcpu->arch.pdptrs[2]); | 2024 | vmcs_write64(GUEST_PDPTR2, vcpu->arch.mmu.pdptrs[2]); |
1862 | vmcs_write64(GUEST_PDPTR3, vcpu->arch.pdptrs[3]); | 2025 | vmcs_write64(GUEST_PDPTR3, vcpu->arch.mmu.pdptrs[3]); |
1863 | } | 2026 | } |
1864 | } | 2027 | } |
1865 | 2028 | ||
1866 | static void ept_save_pdptrs(struct kvm_vcpu *vcpu) | 2029 | static void ept_save_pdptrs(struct kvm_vcpu *vcpu) |
1867 | { | 2030 | { |
1868 | if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { | 2031 | if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { |
1869 | vcpu->arch.pdptrs[0] = vmcs_read64(GUEST_PDPTR0); | 2032 | vcpu->arch.mmu.pdptrs[0] = vmcs_read64(GUEST_PDPTR0); |
1870 | vcpu->arch.pdptrs[1] = vmcs_read64(GUEST_PDPTR1); | 2033 | vcpu->arch.mmu.pdptrs[1] = vmcs_read64(GUEST_PDPTR1); |
1871 | vcpu->arch.pdptrs[2] = vmcs_read64(GUEST_PDPTR2); | 2034 | vcpu->arch.mmu.pdptrs[2] = vmcs_read64(GUEST_PDPTR2); |
1872 | vcpu->arch.pdptrs[3] = vmcs_read64(GUEST_PDPTR3); | 2035 | vcpu->arch.mmu.pdptrs[3] = vmcs_read64(GUEST_PDPTR3); |
1873 | } | 2036 | } |
1874 | 2037 | ||
1875 | __set_bit(VCPU_EXREG_PDPTR, | 2038 | __set_bit(VCPU_EXREG_PDPTR, |
@@ -1884,6 +2047,8 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, | |||
1884 | unsigned long cr0, | 2047 | unsigned long cr0, |
1885 | struct kvm_vcpu *vcpu) | 2048 | struct kvm_vcpu *vcpu) |
1886 | { | 2049 | { |
2050 | if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail)) | ||
2051 | vmx_decache_cr3(vcpu); | ||
1887 | if (!(cr0 & X86_CR0_PG)) { | 2052 | if (!(cr0 & X86_CR0_PG)) { |
1888 | /* From paging/starting to nonpaging */ | 2053 | /* From paging/starting to nonpaging */ |
1889 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, | 2054 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, |
@@ -1941,6 +2106,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) | |||
1941 | vmcs_writel(CR0_READ_SHADOW, cr0); | 2106 | vmcs_writel(CR0_READ_SHADOW, cr0); |
1942 | vmcs_writel(GUEST_CR0, hw_cr0); | 2107 | vmcs_writel(GUEST_CR0, hw_cr0); |
1943 | vcpu->arch.cr0 = cr0; | 2108 | vcpu->arch.cr0 = cr0; |
2109 | __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); | ||
1944 | } | 2110 | } |
1945 | 2111 | ||
1946 | static u64 construct_eptp(unsigned long root_hpa) | 2112 | static u64 construct_eptp(unsigned long root_hpa) |
@@ -1964,7 +2130,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) | |||
1964 | if (enable_ept) { | 2130 | if (enable_ept) { |
1965 | eptp = construct_eptp(cr3); | 2131 | eptp = construct_eptp(cr3); |
1966 | vmcs_write64(EPT_POINTER, eptp); | 2132 | vmcs_write64(EPT_POINTER, eptp); |
1967 | guest_cr3 = is_paging(vcpu) ? vcpu->arch.cr3 : | 2133 | guest_cr3 = is_paging(vcpu) ? kvm_read_cr3(vcpu) : |
1968 | vcpu->kvm->arch.ept_identity_map_addr; | 2134 | vcpu->kvm->arch.ept_identity_map_addr; |
1969 | ept_load_pdptrs(vcpu); | 2135 | ept_load_pdptrs(vcpu); |
1970 | } | 2136 | } |
@@ -1992,23 +2158,39 @@ static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) | |||
1992 | vmcs_writel(GUEST_CR4, hw_cr4); | 2158 | vmcs_writel(GUEST_CR4, hw_cr4); |
1993 | } | 2159 | } |
1994 | 2160 | ||
1995 | static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) | ||
1996 | { | ||
1997 | struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; | ||
1998 | |||
1999 | return vmcs_readl(sf->base); | ||
2000 | } | ||
2001 | |||
2002 | static void vmx_get_segment(struct kvm_vcpu *vcpu, | 2161 | static void vmx_get_segment(struct kvm_vcpu *vcpu, |
2003 | struct kvm_segment *var, int seg) | 2162 | struct kvm_segment *var, int seg) |
2004 | { | 2163 | { |
2005 | struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; | 2164 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
2165 | struct kvm_save_segment *save; | ||
2006 | u32 ar; | 2166 | u32 ar; |
2007 | 2167 | ||
2008 | var->base = vmcs_readl(sf->base); | 2168 | if (vmx->rmode.vm86_active |
2009 | var->limit = vmcs_read32(sf->limit); | 2169 | && (seg == VCPU_SREG_TR || seg == VCPU_SREG_ES |
2010 | var->selector = vmcs_read16(sf->selector); | 2170 | || seg == VCPU_SREG_DS || seg == VCPU_SREG_FS |
2011 | ar = vmcs_read32(sf->ar_bytes); | 2171 | || seg == VCPU_SREG_GS) |
2172 | && !emulate_invalid_guest_state) { | ||
2173 | switch (seg) { | ||
2174 | case VCPU_SREG_TR: save = &vmx->rmode.tr; break; | ||
2175 | case VCPU_SREG_ES: save = &vmx->rmode.es; break; | ||
2176 | case VCPU_SREG_DS: save = &vmx->rmode.ds; break; | ||
2177 | case VCPU_SREG_FS: save = &vmx->rmode.fs; break; | ||
2178 | case VCPU_SREG_GS: save = &vmx->rmode.gs; break; | ||
2179 | default: BUG(); | ||
2180 | } | ||
2181 | var->selector = save->selector; | ||
2182 | var->base = save->base; | ||
2183 | var->limit = save->limit; | ||
2184 | ar = save->ar; | ||
2185 | if (seg == VCPU_SREG_TR | ||
2186 | || var->selector == vmx_read_guest_seg_selector(vmx, seg)) | ||
2187 | goto use_saved_rmode_seg; | ||
2188 | } | ||
2189 | var->base = vmx_read_guest_seg_base(vmx, seg); | ||
2190 | var->limit = vmx_read_guest_seg_limit(vmx, seg); | ||
2191 | var->selector = vmx_read_guest_seg_selector(vmx, seg); | ||
2192 | ar = vmx_read_guest_seg_ar(vmx, seg); | ||
2193 | use_saved_rmode_seg: | ||
2012 | if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state) | 2194 | if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state) |
2013 | ar = 0; | 2195 | ar = 0; |
2014 | var->type = ar & 15; | 2196 | var->type = ar & 15; |
@@ -2022,17 +2204,39 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu, | |||
2022 | var->unusable = (ar >> 16) & 1; | 2204 | var->unusable = (ar >> 16) & 1; |
2023 | } | 2205 | } |
2024 | 2206 | ||
2025 | static int vmx_get_cpl(struct kvm_vcpu *vcpu) | 2207 | static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) |
2208 | { | ||
2209 | struct kvm_segment s; | ||
2210 | |||
2211 | if (to_vmx(vcpu)->rmode.vm86_active) { | ||
2212 | vmx_get_segment(vcpu, &s, seg); | ||
2213 | return s.base; | ||
2214 | } | ||
2215 | return vmx_read_guest_seg_base(to_vmx(vcpu), seg); | ||
2216 | } | ||
2217 | |||
2218 | static int __vmx_get_cpl(struct kvm_vcpu *vcpu) | ||
2026 | { | 2219 | { |
2027 | if (!is_protmode(vcpu)) | 2220 | if (!is_protmode(vcpu)) |
2028 | return 0; | 2221 | return 0; |
2029 | 2222 | ||
2030 | if (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) /* if virtual 8086 */ | 2223 | if (!is_long_mode(vcpu) |
2224 | && (kvm_get_rflags(vcpu) & X86_EFLAGS_VM)) /* if virtual 8086 */ | ||
2031 | return 3; | 2225 | return 3; |
2032 | 2226 | ||
2033 | return vmcs_read16(GUEST_CS_SELECTOR) & 3; | 2227 | return vmx_read_guest_seg_selector(to_vmx(vcpu), VCPU_SREG_CS) & 3; |
2034 | } | 2228 | } |
2035 | 2229 | ||
2230 | static int vmx_get_cpl(struct kvm_vcpu *vcpu) | ||
2231 | { | ||
2232 | if (!test_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail)) { | ||
2233 | __set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); | ||
2234 | to_vmx(vcpu)->cpl = __vmx_get_cpl(vcpu); | ||
2235 | } | ||
2236 | return to_vmx(vcpu)->cpl; | ||
2237 | } | ||
2238 | |||
2239 | |||
2036 | static u32 vmx_segment_access_rights(struct kvm_segment *var) | 2240 | static u32 vmx_segment_access_rights(struct kvm_segment *var) |
2037 | { | 2241 | { |
2038 | u32 ar; | 2242 | u32 ar; |
@@ -2062,7 +2266,10 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, | |||
2062 | struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; | 2266 | struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; |
2063 | u32 ar; | 2267 | u32 ar; |
2064 | 2268 | ||
2269 | vmx_segment_cache_clear(vmx); | ||
2270 | |||
2065 | if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) { | 2271 | if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) { |
2272 | vmcs_write16(sf->selector, var->selector); | ||
2066 | vmx->rmode.tr.selector = var->selector; | 2273 | vmx->rmode.tr.selector = var->selector; |
2067 | vmx->rmode.tr.base = var->base; | 2274 | vmx->rmode.tr.base = var->base; |
2068 | vmx->rmode.tr.limit = var->limit; | 2275 | vmx->rmode.tr.limit = var->limit; |
@@ -2097,11 +2304,12 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, | |||
2097 | ar |= 0x1; /* Accessed */ | 2304 | ar |= 0x1; /* Accessed */ |
2098 | 2305 | ||
2099 | vmcs_write32(sf->ar_bytes, ar); | 2306 | vmcs_write32(sf->ar_bytes, ar); |
2307 | __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); | ||
2100 | } | 2308 | } |
2101 | 2309 | ||
2102 | static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) | 2310 | static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) |
2103 | { | 2311 | { |
2104 | u32 ar = vmcs_read32(GUEST_CS_AR_BYTES); | 2312 | u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS); |
2105 | 2313 | ||
2106 | *db = (ar >> 14) & 1; | 2314 | *db = (ar >> 14) & 1; |
2107 | *l = (ar >> 13) & 1; | 2315 | *l = (ar >> 13) & 1; |
@@ -2323,11 +2531,12 @@ static bool guest_state_valid(struct kvm_vcpu *vcpu) | |||
2323 | 2531 | ||
2324 | static int init_rmode_tss(struct kvm *kvm) | 2532 | static int init_rmode_tss(struct kvm *kvm) |
2325 | { | 2533 | { |
2326 | gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT; | 2534 | gfn_t fn; |
2327 | u16 data = 0; | 2535 | u16 data = 0; |
2328 | int ret = 0; | 2536 | int r, idx, ret = 0; |
2329 | int r; | ||
2330 | 2537 | ||
2538 | idx = srcu_read_lock(&kvm->srcu); | ||
2539 | fn = rmode_tss_base(kvm) >> PAGE_SHIFT; | ||
2331 | r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); | 2540 | r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); |
2332 | if (r < 0) | 2541 | if (r < 0) |
2333 | goto out; | 2542 | goto out; |
@@ -2351,12 +2560,13 @@ static int init_rmode_tss(struct kvm *kvm) | |||
2351 | 2560 | ||
2352 | ret = 1; | 2561 | ret = 1; |
2353 | out: | 2562 | out: |
2563 | srcu_read_unlock(&kvm->srcu, idx); | ||
2354 | return ret; | 2564 | return ret; |
2355 | } | 2565 | } |
2356 | 2566 | ||
2357 | static int init_rmode_identity_map(struct kvm *kvm) | 2567 | static int init_rmode_identity_map(struct kvm *kvm) |
2358 | { | 2568 | { |
2359 | int i, r, ret; | 2569 | int i, idx, r, ret; |
2360 | pfn_t identity_map_pfn; | 2570 | pfn_t identity_map_pfn; |
2361 | u32 tmp; | 2571 | u32 tmp; |
2362 | 2572 | ||
@@ -2371,6 +2581,7 @@ static int init_rmode_identity_map(struct kvm *kvm) | |||
2371 | return 1; | 2581 | return 1; |
2372 | ret = 0; | 2582 | ret = 0; |
2373 | identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT; | 2583 | identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT; |
2584 | idx = srcu_read_lock(&kvm->srcu); | ||
2374 | r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE); | 2585 | r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE); |
2375 | if (r < 0) | 2586 | if (r < 0) |
2376 | goto out; | 2587 | goto out; |
@@ -2386,6 +2597,7 @@ static int init_rmode_identity_map(struct kvm *kvm) | |||
2386 | kvm->arch.ept_identity_pagetable_done = true; | 2597 | kvm->arch.ept_identity_pagetable_done = true; |
2387 | ret = 1; | 2598 | ret = 1; |
2388 | out: | 2599 | out: |
2600 | srcu_read_unlock(&kvm->srcu, idx); | ||
2389 | return ret; | 2601 | return ret; |
2390 | } | 2602 | } |
2391 | 2603 | ||
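
Note on the two hunks above: the SRCU read-side critical section moves into init_rmode_tss() and init_rmode_identity_map() themselves. The old init_rmode() wrapper, deleted further down, used to take srcu_read_lock(&kvm->srcu) once around both calls; with each helper locking on its own, init_rmode_tss() can later be called from vmx_set_tss_addr() and init_rmode_identity_map() from vcpu creation, as the following hunks do. The lock matters because the guest-page helpers dereference the memslot array, which is published under kvm->srcu. Below is a rough user-space sketch of the same shape, with a pthread rwlock standing in for SRCU; every name in it is invented for illustration.

    #include <pthread.h>
    #include <stdio.h>

    /* Toy "memslot array"; in the kernel it is published under kvm->srcu,
     * here a pthread rwlock plays that role for the sketch. */
    struct memslots { int nslots; };

    static pthread_rwlock_t slots_lock = PTHREAD_RWLOCK_INITIALIZER;
    static struct memslots *slots;

    /* Analogue of kvm_clear_guest_page(): must run with the read side held,
     * because it dereferences the protected slots pointer. */
    static int clear_guest_page(int slot)
    {
        return slot < slots->nslots ? 0 : -1;
    }

    /* Analogue of the patched init_rmode_tss(): enters its own read-side
     * critical section instead of relying on an outer wrapper. */
    static int init_rmode_tss(void)
    {
        int ret;

        pthread_rwlock_rdlock(&slots_lock);   /* idx = srcu_read_lock(&kvm->srcu)  */
        ret = clear_guest_page(0);
        pthread_rwlock_unlock(&slots_lock);   /* srcu_read_unlock(&kvm->srcu, idx) */
        return ret;
    }

    int main(void)
    {
        struct memslots initial = { .nslots = 4 };

        slots = &initial;
        printf("init_rmode_tss -> %d\n", init_rmode_tss());
        return 0;
    }
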
@@ -2515,7 +2727,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) | |||
2515 | { | 2727 | { |
2516 | u32 host_sysenter_cs, msr_low, msr_high; | 2728 | u32 host_sysenter_cs, msr_low, msr_high; |
2517 | u32 junk; | 2729 | u32 junk; |
2518 | u64 host_pat, tsc_this, tsc_base; | 2730 | u64 host_pat; |
2519 | unsigned long a; | 2731 | unsigned long a; |
2520 | struct desc_ptr dt; | 2732 | struct desc_ptr dt; |
2521 | int i; | 2733 | int i; |
@@ -2656,32 +2868,11 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) | |||
2656 | vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE; | 2868 | vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE; |
2657 | vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); | 2869 | vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); |
2658 | 2870 | ||
2659 | tsc_base = vmx->vcpu.kvm->arch.vm_init_tsc; | 2871 | kvm_write_tsc(&vmx->vcpu, 0); |
2660 | rdtscll(tsc_this); | ||
2661 | if (tsc_this < vmx->vcpu.kvm->arch.vm_init_tsc) | ||
2662 | tsc_base = tsc_this; | ||
2663 | |||
2664 | guest_write_tsc(0, tsc_base); | ||
2665 | 2872 | ||
2666 | return 0; | 2873 | return 0; |
2667 | } | 2874 | } |
2668 | 2875 | ||
2669 | static int init_rmode(struct kvm *kvm) | ||
2670 | { | ||
2671 | int idx, ret = 0; | ||
2672 | |||
2673 | idx = srcu_read_lock(&kvm->srcu); | ||
2674 | if (!init_rmode_tss(kvm)) | ||
2675 | goto exit; | ||
2676 | if (!init_rmode_identity_map(kvm)) | ||
2677 | goto exit; | ||
2678 | |||
2679 | ret = 1; | ||
2680 | exit: | ||
2681 | srcu_read_unlock(&kvm->srcu, idx); | ||
2682 | return ret; | ||
2683 | } | ||
2684 | |||
2685 | static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) | 2876 | static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) |
2686 | { | 2877 | { |
2687 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 2878 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
@@ -2689,10 +2880,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) | |||
2689 | int ret; | 2880 | int ret; |
2690 | 2881 | ||
2691 | vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); | 2882 | vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); |
2692 | if (!init_rmode(vmx->vcpu.kvm)) { | ||
2693 | ret = -ENOMEM; | ||
2694 | goto out; | ||
2695 | } | ||
2696 | 2883 | ||
2697 | vmx->rmode.vm86_active = 0; | 2884 | vmx->rmode.vm86_active = 0; |
2698 | 2885 | ||
@@ -2709,6 +2896,8 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) | |||
2709 | if (ret != 0) | 2896 | if (ret != 0) |
2710 | goto out; | 2897 | goto out; |
2711 | 2898 | ||
2899 | vmx_segment_cache_clear(vmx); | ||
2900 | |||
2712 | seg_setup(VCPU_SREG_CS); | 2901 | seg_setup(VCPU_SREG_CS); |
2713 | /* | 2902 | /* |
2714 | * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode | 2903 | * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode |
@@ -2757,7 +2946,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) | |||
2757 | vmcs_writel(GUEST_IDTR_BASE, 0); | 2946 | vmcs_writel(GUEST_IDTR_BASE, 0); |
2758 | vmcs_write32(GUEST_IDTR_LIMIT, 0xffff); | 2947 | vmcs_write32(GUEST_IDTR_LIMIT, 0xffff); |
2759 | 2948 | ||
2760 | vmcs_write32(GUEST_ACTIVITY_STATE, 0); | 2949 | vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); |
2761 | vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); | 2950 | vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); |
2762 | vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0); | 2951 | vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0); |
2763 | 2952 | ||
@@ -2772,7 +2961,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) | |||
2772 | vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); | 2961 | vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); |
2773 | if (vm_need_tpr_shadow(vmx->vcpu.kvm)) | 2962 | if (vm_need_tpr_shadow(vmx->vcpu.kvm)) |
2774 | vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, | 2963 | vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, |
2775 | page_to_phys(vmx->vcpu.arch.apic->regs_page)); | 2964 | __pa(vmx->vcpu.arch.apic->regs)); |
2776 | vmcs_write32(TPR_THRESHOLD, 0); | 2965 | vmcs_write32(TPR_THRESHOLD, 0); |
2777 | } | 2966 | } |
2778 | 2967 | ||
@@ -2819,6 +3008,10 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu) | |||
2819 | return; | 3008 | return; |
2820 | } | 3009 | } |
2821 | 3010 | ||
3011 | if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { | ||
3012 | enable_irq_window(vcpu); | ||
3013 | return; | ||
3014 | } | ||
2822 | cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); | 3015 | cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); |
2823 | cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING; | 3016 | cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING; |
2824 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); | 3017 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); |
@@ -2834,16 +3027,11 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu) | |||
2834 | 3027 | ||
2835 | ++vcpu->stat.irq_injections; | 3028 | ++vcpu->stat.irq_injections; |
2836 | if (vmx->rmode.vm86_active) { | 3029 | if (vmx->rmode.vm86_active) { |
2837 | vmx->rmode.irq.pending = true; | 3030 | int inc_eip = 0; |
2838 | vmx->rmode.irq.vector = irq; | ||
2839 | vmx->rmode.irq.rip = kvm_rip_read(vcpu); | ||
2840 | if (vcpu->arch.interrupt.soft) | 3031 | if (vcpu->arch.interrupt.soft) |
2841 | vmx->rmode.irq.rip += | 3032 | inc_eip = vcpu->arch.event_exit_inst_len; |
2842 | vmx->vcpu.arch.event_exit_inst_len; | 3033 | if (kvm_inject_realmode_interrupt(vcpu, irq, inc_eip) != EMULATE_DONE) |
2843 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, | 3034 | kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); |
2844 | irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK); | ||
2845 | vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); | ||
2846 | kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1); | ||
2847 | return; | 3035 | return; |
2848 | } | 3036 | } |
2849 | intr = irq | INTR_INFO_VALID_MASK; | 3037 | intr = irq | INTR_INFO_VALID_MASK; |
@@ -2854,6 +3042,7 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu) | |||
2854 | } else | 3042 | } else |
2855 | intr |= INTR_TYPE_EXT_INTR; | 3043 | intr |= INTR_TYPE_EXT_INTR; |
2856 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr); | 3044 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr); |
3045 | vmx_clear_hlt(vcpu); | ||
2857 | } | 3046 | } |
2858 | 3047 | ||
2859 | static void vmx_inject_nmi(struct kvm_vcpu *vcpu) | 3048 | static void vmx_inject_nmi(struct kvm_vcpu *vcpu) |
@@ -2874,19 +3063,15 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) | |||
2874 | } | 3063 | } |
2875 | 3064 | ||
2876 | ++vcpu->stat.nmi_injections; | 3065 | ++vcpu->stat.nmi_injections; |
3066 | vmx->nmi_known_unmasked = false; | ||
2877 | if (vmx->rmode.vm86_active) { | 3067 | if (vmx->rmode.vm86_active) { |
2878 | vmx->rmode.irq.pending = true; | 3068 | if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0) != EMULATE_DONE) |
2879 | vmx->rmode.irq.vector = NMI_VECTOR; | 3069 | kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); |
2880 | vmx->rmode.irq.rip = kvm_rip_read(vcpu); | ||
2881 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, | ||
2882 | NMI_VECTOR | INTR_TYPE_SOFT_INTR | | ||
2883 | INTR_INFO_VALID_MASK); | ||
2884 | vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); | ||
2885 | kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1); | ||
2886 | return; | 3070 | return; |
2887 | } | 3071 | } |
2888 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, | 3072 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, |
2889 | INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); | 3073 | INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); |
3074 | vmx_clear_hlt(vcpu); | ||
2890 | } | 3075 | } |
2891 | 3076 | ||
2892 | static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) | 3077 | static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) |
@@ -2895,13 +3080,16 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) | |||
2895 | return 0; | 3080 | return 0; |
2896 | 3081 | ||
2897 | return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & | 3082 | return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & |
2898 | (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_NMI)); | 3083 | (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI |
3084 | | GUEST_INTR_STATE_NMI)); | ||
2899 | } | 3085 | } |
2900 | 3086 | ||
2901 | static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) | 3087 | static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) |
2902 | { | 3088 | { |
2903 | if (!cpu_has_virtual_nmis()) | 3089 | if (!cpu_has_virtual_nmis()) |
2904 | return to_vmx(vcpu)->soft_vnmi_blocked; | 3090 | return to_vmx(vcpu)->soft_vnmi_blocked; |
3091 | if (to_vmx(vcpu)->nmi_known_unmasked) | ||
3092 | return false; | ||
2905 | return vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI; | 3093 | return vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI; |
2906 | } | 3094 | } |
2907 | 3095 | ||
@@ -2915,6 +3103,7 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) | |||
2915 | vmx->vnmi_blocked_time = 0; | 3103 | vmx->vnmi_blocked_time = 0; |
2916 | } | 3104 | } |
2917 | } else { | 3105 | } else { |
3106 | vmx->nmi_known_unmasked = !masked; | ||
2918 | if (masked) | 3107 | if (masked) |
2919 | vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, | 3108 | vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, |
2920 | GUEST_INTR_STATE_NMI); | 3109 | GUEST_INTR_STATE_NMI); |
@@ -2945,6 +3134,9 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) | |||
2945 | if (ret) | 3134 | if (ret) |
2946 | return ret; | 3135 | return ret; |
2947 | kvm->arch.tss_addr = addr; | 3136 | kvm->arch.tss_addr = addr; |
3137 | if (!init_rmode_tss(kvm)) | ||
3138 | return -ENOMEM; | ||
3139 | |||
2948 | return 0; | 3140 | return 0; |
2949 | } | 3141 | } |
2950 | 3142 | ||
@@ -2956,7 +3148,7 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu, | |||
2956 | * Cause the #SS fault with 0 error code in VM86 mode. | 3148 | * Cause the #SS fault with 0 error code in VM86 mode. |
2957 | */ | 3149 | */ |
2958 | if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) | 3150 | if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) |
2959 | if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE) | 3151 | if (emulate_instruction(vcpu, 0) == EMULATE_DONE) |
2960 | return 1; | 3152 | return 1; |
2961 | /* | 3153 | /* |
2962 | * Forward all other exceptions that are valid in real mode. | 3154 | * Forward all other exceptions that are valid in real mode. |
@@ -3029,7 +3221,7 @@ static int handle_exception(struct kvm_vcpu *vcpu) | |||
3029 | enum emulation_result er; | 3221 | enum emulation_result er; |
3030 | 3222 | ||
3031 | vect_info = vmx->idt_vectoring_info; | 3223 | vect_info = vmx->idt_vectoring_info; |
3032 | intr_info = vmcs_read32(VM_EXIT_INTR_INFO); | 3224 | intr_info = vmx->exit_intr_info; |
3033 | 3225 | ||
3034 | if (is_machine_check(intr_info)) | 3226 | if (is_machine_check(intr_info)) |
3035 | return handle_machine_check(vcpu); | 3227 | return handle_machine_check(vcpu); |
@@ -3053,14 +3245,13 @@ static int handle_exception(struct kvm_vcpu *vcpu) | |||
3053 | } | 3245 | } |
3054 | 3246 | ||
3055 | if (is_invalid_opcode(intr_info)) { | 3247 | if (is_invalid_opcode(intr_info)) { |
3056 | er = emulate_instruction(vcpu, 0, 0, EMULTYPE_TRAP_UD); | 3248 | er = emulate_instruction(vcpu, EMULTYPE_TRAP_UD); |
3057 | if (er != EMULATE_DONE) | 3249 | if (er != EMULATE_DONE) |
3058 | kvm_queue_exception(vcpu, UD_VECTOR); | 3250 | kvm_queue_exception(vcpu, UD_VECTOR); |
3059 | return 1; | 3251 | return 1; |
3060 | } | 3252 | } |
3061 | 3253 | ||
3062 | error_code = 0; | 3254 | error_code = 0; |
3063 | rip = kvm_rip_read(vcpu); | ||
3064 | if (intr_info & INTR_INFO_DELIVER_CODE_MASK) | 3255 | if (intr_info & INTR_INFO_DELIVER_CODE_MASK) |
3065 | error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); | 3256 | error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); |
3066 | if (is_page_fault(intr_info)) { | 3257 | if (is_page_fault(intr_info)) { |
@@ -3072,7 +3263,7 @@ static int handle_exception(struct kvm_vcpu *vcpu) | |||
3072 | 3263 | ||
3073 | if (kvm_event_needs_reinjection(vcpu)) | 3264 | if (kvm_event_needs_reinjection(vcpu)) |
3074 | kvm_mmu_unprotect_page_virt(vcpu, cr2); | 3265 | kvm_mmu_unprotect_page_virt(vcpu, cr2); |
3075 | return kvm_mmu_page_fault(vcpu, cr2, error_code); | 3266 | return kvm_mmu_page_fault(vcpu, cr2, error_code, NULL, 0); |
3076 | } | 3267 | } |
3077 | 3268 | ||
3078 | if (vmx->rmode.vm86_active && | 3269 | if (vmx->rmode.vm86_active && |
@@ -3107,6 +3298,7 @@ static int handle_exception(struct kvm_vcpu *vcpu) | |||
3107 | vmx->vcpu.arch.event_exit_inst_len = | 3298 | vmx->vcpu.arch.event_exit_inst_len = |
3108 | vmcs_read32(VM_EXIT_INSTRUCTION_LEN); | 3299 | vmcs_read32(VM_EXIT_INSTRUCTION_LEN); |
3109 | kvm_run->exit_reason = KVM_EXIT_DEBUG; | 3300 | kvm_run->exit_reason = KVM_EXIT_DEBUG; |
3301 | rip = kvm_rip_read(vcpu); | ||
3110 | kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip; | 3302 | kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip; |
3111 | kvm_run->debug.arch.exception = ex_no; | 3303 | kvm_run->debug.arch.exception = ex_no; |
3112 | break; | 3304 | break; |
@@ -3144,7 +3336,7 @@ static int handle_io(struct kvm_vcpu *vcpu) | |||
3144 | ++vcpu->stat.io_exits; | 3336 | ++vcpu->stat.io_exits; |
3145 | 3337 | ||
3146 | if (string || in) | 3338 | if (string || in) |
3147 | return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE; | 3339 | return emulate_instruction(vcpu, 0) == EMULATE_DONE; |
3148 | 3340 | ||
3149 | port = exit_qualification >> 16; | 3341 | port = exit_qualification >> 16; |
3150 | size = (exit_qualification & 7) + 1; | 3342 | size = (exit_qualification & 7) + 1; |
@@ -3164,14 +3356,6 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) | |||
3164 | hypercall[2] = 0xc1; | 3356 | hypercall[2] = 0xc1; |
3165 | } | 3357 | } |
3166 | 3358 | ||
3167 | static void complete_insn_gp(struct kvm_vcpu *vcpu, int err) | ||
3168 | { | ||
3169 | if (err) | ||
3170 | kvm_inject_gp(vcpu, 0); | ||
3171 | else | ||
3172 | skip_emulated_instruction(vcpu); | ||
3173 | } | ||
3174 | |||
3175 | static int handle_cr(struct kvm_vcpu *vcpu) | 3359 | static int handle_cr(struct kvm_vcpu *vcpu) |
3176 | { | 3360 | { |
3177 | unsigned long exit_qualification, val; | 3361 | unsigned long exit_qualification, val; |
@@ -3189,21 +3373,21 @@ static int handle_cr(struct kvm_vcpu *vcpu) | |||
3189 | switch (cr) { | 3373 | switch (cr) { |
3190 | case 0: | 3374 | case 0: |
3191 | err = kvm_set_cr0(vcpu, val); | 3375 | err = kvm_set_cr0(vcpu, val); |
3192 | complete_insn_gp(vcpu, err); | 3376 | kvm_complete_insn_gp(vcpu, err); |
3193 | return 1; | 3377 | return 1; |
3194 | case 3: | 3378 | case 3: |
3195 | err = kvm_set_cr3(vcpu, val); | 3379 | err = kvm_set_cr3(vcpu, val); |
3196 | complete_insn_gp(vcpu, err); | 3380 | kvm_complete_insn_gp(vcpu, err); |
3197 | return 1; | 3381 | return 1; |
3198 | case 4: | 3382 | case 4: |
3199 | err = kvm_set_cr4(vcpu, val); | 3383 | err = kvm_set_cr4(vcpu, val); |
3200 | complete_insn_gp(vcpu, err); | 3384 | kvm_complete_insn_gp(vcpu, err); |
3201 | return 1; | 3385 | return 1; |
3202 | case 8: { | 3386 | case 8: { |
3203 | u8 cr8_prev = kvm_get_cr8(vcpu); | 3387 | u8 cr8_prev = kvm_get_cr8(vcpu); |
3204 | u8 cr8 = kvm_register_read(vcpu, reg); | 3388 | u8 cr8 = kvm_register_read(vcpu, reg); |
3205 | kvm_set_cr8(vcpu, cr8); | 3389 | err = kvm_set_cr8(vcpu, cr8); |
3206 | skip_emulated_instruction(vcpu); | 3390 | kvm_complete_insn_gp(vcpu, err); |
3207 | if (irqchip_in_kernel(vcpu->kvm)) | 3391 | if (irqchip_in_kernel(vcpu->kvm)) |
3208 | return 1; | 3392 | return 1; |
3209 | if (cr8_prev <= cr8) | 3393 | if (cr8_prev <= cr8) |
@@ -3222,8 +3406,9 @@ static int handle_cr(struct kvm_vcpu *vcpu) | |||
3222 | case 1: /*mov from cr*/ | 3406 | case 1: /*mov from cr*/ |
3223 | switch (cr) { | 3407 | switch (cr) { |
3224 | case 3: | 3408 | case 3: |
3225 | kvm_register_write(vcpu, reg, vcpu->arch.cr3); | 3409 | val = kvm_read_cr3(vcpu); |
3226 | trace_kvm_cr_read(cr, vcpu->arch.cr3); | 3410 | kvm_register_write(vcpu, reg, val); |
3411 | trace_kvm_cr_read(cr, val); | ||
3227 | skip_emulated_instruction(vcpu); | 3412 | skip_emulated_instruction(vcpu); |
3228 | return 1; | 3413 | return 1; |
3229 | case 8: | 3414 | case 8: |
@@ -3346,6 +3531,7 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu) | |||
3346 | 3531 | ||
3347 | static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) | 3532 | static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) |
3348 | { | 3533 | { |
3534 | kvm_make_request(KVM_REQ_EVENT, vcpu); | ||
3349 | return 1; | 3535 | return 1; |
3350 | } | 3536 | } |
3351 | 3537 | ||
@@ -3358,6 +3544,8 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu) | |||
3358 | cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; | 3544 | cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; |
3359 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); | 3545 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); |
3360 | 3546 | ||
3547 | kvm_make_request(KVM_REQ_EVENT, vcpu); | ||
3548 | |||
3361 | ++vcpu->stat.irq_window_exits; | 3549 | ++vcpu->stat.irq_window_exits; |
3362 | 3550 | ||
3363 | /* | 3551 | /* |
@@ -3392,6 +3580,11 @@ static int handle_vmx_insn(struct kvm_vcpu *vcpu) | |||
3392 | return 1; | 3580 | return 1; |
3393 | } | 3581 | } |
3394 | 3582 | ||
3583 | static int handle_invd(struct kvm_vcpu *vcpu) | ||
3584 | { | ||
3585 | return emulate_instruction(vcpu, 0) == EMULATE_DONE; | ||
3586 | } | ||
3587 | |||
3395 | static int handle_invlpg(struct kvm_vcpu *vcpu) | 3588 | static int handle_invlpg(struct kvm_vcpu *vcpu) |
3396 | { | 3589 | { |
3397 | unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | 3590 | unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); |
@@ -3420,7 +3613,7 @@ static int handle_xsetbv(struct kvm_vcpu *vcpu) | |||
3420 | 3613 | ||
3421 | static int handle_apic_access(struct kvm_vcpu *vcpu) | 3614 | static int handle_apic_access(struct kvm_vcpu *vcpu) |
3422 | { | 3615 | { |
3423 | return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE; | 3616 | return emulate_instruction(vcpu, 0) == EMULATE_DONE; |
3424 | } | 3617 | } |
3425 | 3618 | ||
3426 | static int handle_task_switch(struct kvm_vcpu *vcpu) | 3619 | static int handle_task_switch(struct kvm_vcpu *vcpu) |
@@ -3442,9 +3635,7 @@ static int handle_task_switch(struct kvm_vcpu *vcpu) | |||
3442 | switch (type) { | 3635 | switch (type) { |
3443 | case INTR_TYPE_NMI_INTR: | 3636 | case INTR_TYPE_NMI_INTR: |
3444 | vcpu->arch.nmi_injected = false; | 3637 | vcpu->arch.nmi_injected = false; |
3445 | if (cpu_has_virtual_nmis()) | 3638 | vmx_set_nmi_mask(vcpu, true); |
3446 | vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, | ||
3447 | GUEST_INTR_STATE_NMI); | ||
3448 | break; | 3639 | break; |
3449 | case INTR_TYPE_EXT_INTR: | 3640 | case INTR_TYPE_EXT_INTR: |
3450 | case INTR_TYPE_SOFT_INTR: | 3641 | case INTR_TYPE_SOFT_INTR: |
@@ -3519,7 +3710,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) | |||
3519 | 3710 | ||
3520 | gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); | 3711 | gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); |
3521 | trace_kvm_page_fault(gpa, exit_qualification); | 3712 | trace_kvm_page_fault(gpa, exit_qualification); |
3522 | return kvm_mmu_page_fault(vcpu, gpa & PAGE_MASK, 0); | 3713 | return kvm_mmu_page_fault(vcpu, gpa, exit_qualification & 0x3, NULL, 0); |
3523 | } | 3714 | } |
3524 | 3715 | ||
3525 | static u64 ept_rsvd_mask(u64 spte, int level) | 3716 | static u64 ept_rsvd_mask(u64 spte, int level) |
@@ -3614,6 +3805,7 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu) | |||
3614 | cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING; | 3805 | cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING; |
3615 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); | 3806 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); |
3616 | ++vcpu->stat.nmi_window_exits; | 3807 | ++vcpu->stat.nmi_window_exits; |
3808 | kvm_make_request(KVM_REQ_EVENT, vcpu); | ||
3617 | 3809 | ||
3618 | return 1; | 3810 | return 1; |
3619 | } | 3811 | } |
@@ -3623,9 +3815,18 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) | |||
3623 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 3815 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
3624 | enum emulation_result err = EMULATE_DONE; | 3816 | enum emulation_result err = EMULATE_DONE; |
3625 | int ret = 1; | 3817 | int ret = 1; |
3818 | u32 cpu_exec_ctrl; | ||
3819 | bool intr_window_requested; | ||
3820 | |||
3821 | cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); | ||
3822 | intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING; | ||
3626 | 3823 | ||
3627 | while (!guest_state_valid(vcpu)) { | 3824 | while (!guest_state_valid(vcpu)) { |
3628 | err = emulate_instruction(vcpu, 0, 0, 0); | 3825 | if (intr_window_requested |
3826 | && (kvm_get_rflags(&vmx->vcpu) & X86_EFLAGS_IF)) | ||
3827 | return handle_interrupt_window(&vmx->vcpu); | ||
3828 | |||
3829 | err = emulate_instruction(vcpu, 0); | ||
3629 | 3830 | ||
3630 | if (err == EMULATE_DO_MMIO) { | 3831 | if (err == EMULATE_DO_MMIO) { |
3631 | ret = 0; | 3832 | ret = 0; |
@@ -3682,6 +3883,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { | |||
3682 | [EXIT_REASON_MSR_WRITE] = handle_wrmsr, | 3883 | [EXIT_REASON_MSR_WRITE] = handle_wrmsr, |
3683 | [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, | 3884 | [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, |
3684 | [EXIT_REASON_HLT] = handle_halt, | 3885 | [EXIT_REASON_HLT] = handle_halt, |
3886 | [EXIT_REASON_INVD] = handle_invd, | ||
3685 | [EXIT_REASON_INVLPG] = handle_invlpg, | 3887 | [EXIT_REASON_INVLPG] = handle_invlpg, |
3686 | [EXIT_REASON_VMCALL] = handle_vmcall, | 3888 | [EXIT_REASON_VMCALL] = handle_vmcall, |
3687 | [EXIT_REASON_VMCLEAR] = handle_vmx_insn, | 3889 | [EXIT_REASON_VMCLEAR] = handle_vmx_insn, |
@@ -3709,6 +3911,12 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { | |||
3709 | static const int kvm_vmx_max_exit_handlers = | 3911 | static const int kvm_vmx_max_exit_handlers = |
3710 | ARRAY_SIZE(kvm_vmx_exit_handlers); | 3912 | ARRAY_SIZE(kvm_vmx_exit_handlers); |
3711 | 3913 | ||
3914 | static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) | ||
3915 | { | ||
3916 | *info1 = vmcs_readl(EXIT_QUALIFICATION); | ||
3917 | *info2 = vmcs_read32(VM_EXIT_INTR_INFO); | ||
3918 | } | ||
3919 | |||
3712 | /* | 3920 | /* |
3713 | * The guest has exited. See if we can fix it or if we need userspace | 3921 | * The guest has exited. See if we can fix it or if we need userspace |
3714 | * assistance. | 3922 | * assistance. |
@@ -3719,17 +3927,12 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) | |||
3719 | u32 exit_reason = vmx->exit_reason; | 3927 | u32 exit_reason = vmx->exit_reason; |
3720 | u32 vectoring_info = vmx->idt_vectoring_info; | 3928 | u32 vectoring_info = vmx->idt_vectoring_info; |
3721 | 3929 | ||
3722 | trace_kvm_exit(exit_reason, vcpu); | 3930 | trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX); |
3723 | 3931 | ||
3724 | /* If guest state is invalid, start emulating */ | 3932 | /* If guest state is invalid, start emulating */ |
3725 | if (vmx->emulation_required && emulate_invalid_guest_state) | 3933 | if (vmx->emulation_required && emulate_invalid_guest_state) |
3726 | return handle_invalid_guest_state(vcpu); | 3934 | return handle_invalid_guest_state(vcpu); |
3727 | 3935 | ||
3728 | /* Access CR3 don't cause VMExit in paging mode, so we need | ||
3729 | * to sync with guest real CR3. */ | ||
3730 | if (enable_ept && is_paging(vcpu)) | ||
3731 | vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); | ||
3732 | |||
3733 | if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) { | 3936 | if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) { |
3734 | vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; | 3937 | vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; |
3735 | vcpu->run->fail_entry.hardware_entry_failure_reason | 3938 | vcpu->run->fail_entry.hardware_entry_failure_reason |
@@ -3790,23 +3993,19 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) | |||
3790 | vmcs_write32(TPR_THRESHOLD, irr); | 3993 | vmcs_write32(TPR_THRESHOLD, irr); |
3791 | } | 3994 | } |
3792 | 3995 | ||
3793 | static void vmx_complete_interrupts(struct vcpu_vmx *vmx) | 3996 | static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) |
3794 | { | 3997 | { |
3795 | u32 exit_intr_info; | 3998 | u32 exit_intr_info; |
3796 | u32 idt_vectoring_info = vmx->idt_vectoring_info; | ||
3797 | bool unblock_nmi; | ||
3798 | u8 vector; | ||
3799 | int type; | ||
3800 | bool idtv_info_valid; | ||
3801 | 3999 | ||
3802 | exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); | 4000 | if (!(vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY |
4001 | || vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI)) | ||
4002 | return; | ||
3803 | 4003 | ||
3804 | vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); | 4004 | vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); |
4005 | exit_intr_info = vmx->exit_intr_info; | ||
3805 | 4006 | ||
3806 | /* Handle machine checks before interrupts are enabled */ | 4007 | /* Handle machine checks before interrupts are enabled */ |
3807 | if ((vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY) | 4008 | if (is_machine_check(exit_intr_info)) |
3808 | || (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI | ||
3809 | && is_machine_check(exit_intr_info))) | ||
3810 | kvm_machine_check(); | 4009 | kvm_machine_check(); |
3811 | 4010 | ||
3812 | /* We need to handle NMIs before interrupts are enabled */ | 4011 | /* We need to handle NMIs before interrupts are enabled */ |
@@ -3816,10 +4015,25 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) | |||
3816 | asm("int $2"); | 4015 | asm("int $2"); |
3817 | kvm_after_handle_nmi(&vmx->vcpu); | 4016 | kvm_after_handle_nmi(&vmx->vcpu); |
3818 | } | 4017 | } |
4018 | } | ||
3819 | 4019 | ||
3820 | idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; | 4020 | static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) |
4021 | { | ||
4022 | u32 exit_intr_info; | ||
4023 | bool unblock_nmi; | ||
4024 | u8 vector; | ||
4025 | bool idtv_info_valid; | ||
4026 | |||
4027 | idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK; | ||
3821 | 4028 | ||
3822 | if (cpu_has_virtual_nmis()) { | 4029 | if (cpu_has_virtual_nmis()) { |
4030 | if (vmx->nmi_known_unmasked) | ||
4031 | return; | ||
4032 | /* | ||
4033 | * Can't use vmx->exit_intr_info since we're not sure what | ||
4034 | * the exit reason is. | ||
4035 | */ | ||
4036 | exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); | ||
3823 | unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; | 4037 | unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; |
3824 | vector = exit_intr_info & INTR_INFO_VECTOR_MASK; | 4038 | vector = exit_intr_info & INTR_INFO_VECTOR_MASK; |
3825 | /* | 4039 | /* |
@@ -3836,9 +4050,25 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) | |||
3836 | vector != DF_VECTOR && !idtv_info_valid) | 4050 | vector != DF_VECTOR && !idtv_info_valid) |
3837 | vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, | 4051 | vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, |
3838 | GUEST_INTR_STATE_NMI); | 4052 | GUEST_INTR_STATE_NMI); |
4053 | else | ||
4054 | vmx->nmi_known_unmasked = | ||
4055 | !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) | ||
4056 | & GUEST_INTR_STATE_NMI); | ||
3839 | } else if (unlikely(vmx->soft_vnmi_blocked)) | 4057 | } else if (unlikely(vmx->soft_vnmi_blocked)) |
3840 | vmx->vnmi_blocked_time += | 4058 | vmx->vnmi_blocked_time += |
3841 | ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time)); | 4059 | ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time)); |
4060 | } | ||
4061 | |||
4062 | static void __vmx_complete_interrupts(struct vcpu_vmx *vmx, | ||
4063 | u32 idt_vectoring_info, | ||
4064 | int instr_len_field, | ||
4065 | int error_code_field) | ||
4066 | { | ||
4067 | u8 vector; | ||
4068 | int type; | ||
4069 | bool idtv_info_valid; | ||
4070 | |||
4071 | idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; | ||
3842 | 4072 | ||
3843 | vmx->vcpu.arch.nmi_injected = false; | 4073 | vmx->vcpu.arch.nmi_injected = false; |
3844 | kvm_clear_exception_queue(&vmx->vcpu); | 4074 | kvm_clear_exception_queue(&vmx->vcpu); |
@@ -3847,6 +4077,8 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) | |||
3847 | if (!idtv_info_valid) | 4077 | if (!idtv_info_valid) |
3848 | return; | 4078 | return; |
3849 | 4079 | ||
4080 | kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu); | ||
4081 | |||
3850 | vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK; | 4082 | vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK; |
3851 | type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK; | 4083 | type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK; |
3852 | 4084 | ||
@@ -3858,23 +4090,22 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) | |||
3858 | * Clear bit "block by NMI" before VM entry if a NMI | 4090 | * Clear bit "block by NMI" before VM entry if a NMI |
3859 | * delivery faulted. | 4091 | * delivery faulted. |
3860 | */ | 4092 | */ |
3861 | vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, | 4093 | vmx_set_nmi_mask(&vmx->vcpu, false); |
3862 | GUEST_INTR_STATE_NMI); | ||
3863 | break; | 4094 | break; |
3864 | case INTR_TYPE_SOFT_EXCEPTION: | 4095 | case INTR_TYPE_SOFT_EXCEPTION: |
3865 | vmx->vcpu.arch.event_exit_inst_len = | 4096 | vmx->vcpu.arch.event_exit_inst_len = |
3866 | vmcs_read32(VM_EXIT_INSTRUCTION_LEN); | 4097 | vmcs_read32(instr_len_field); |
3867 | /* fall through */ | 4098 | /* fall through */ |
3868 | case INTR_TYPE_HARD_EXCEPTION: | 4099 | case INTR_TYPE_HARD_EXCEPTION: |
3869 | if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { | 4100 | if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { |
3870 | u32 err = vmcs_read32(IDT_VECTORING_ERROR_CODE); | 4101 | u32 err = vmcs_read32(error_code_field); |
3871 | kvm_queue_exception_e(&vmx->vcpu, vector, err); | 4102 | kvm_queue_exception_e(&vmx->vcpu, vector, err); |
3872 | } else | 4103 | } else |
3873 | kvm_queue_exception(&vmx->vcpu, vector); | 4104 | kvm_queue_exception(&vmx->vcpu, vector); |
3874 | break; | 4105 | break; |
3875 | case INTR_TYPE_SOFT_INTR: | 4106 | case INTR_TYPE_SOFT_INTR: |
3876 | vmx->vcpu.arch.event_exit_inst_len = | 4107 | vmx->vcpu.arch.event_exit_inst_len = |
3877 | vmcs_read32(VM_EXIT_INSTRUCTION_LEN); | 4108 | vmcs_read32(instr_len_field); |
3878 | /* fall through */ | 4109 | /* fall through */ |
3879 | case INTR_TYPE_EXT_INTR: | 4110 | case INTR_TYPE_EXT_INTR: |
3880 | kvm_queue_interrupt(&vmx->vcpu, vector, | 4111 | kvm_queue_interrupt(&vmx->vcpu, vector, |
@@ -3885,27 +4116,21 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) | |||
3885 | } | 4116 | } |
3886 | } | 4117 | } |
3887 | 4118 | ||
3888 | /* | 4119 | static void vmx_complete_interrupts(struct vcpu_vmx *vmx) |
3889 | * Failure to inject an interrupt should give us the information | ||
3890 | * in IDT_VECTORING_INFO_FIELD. However, if the failure occurs | ||
3891 | * when fetching the interrupt redirection bitmap in the real-mode | ||
3892 | * tss, this doesn't happen. So we do it ourselves. | ||
3893 | */ | ||
3894 | static void fixup_rmode_irq(struct vcpu_vmx *vmx) | ||
3895 | { | 4120 | { |
3896 | vmx->rmode.irq.pending = 0; | 4121 | __vmx_complete_interrupts(vmx, vmx->idt_vectoring_info, |
3897 | if (kvm_rip_read(&vmx->vcpu) + 1 != vmx->rmode.irq.rip) | 4122 | VM_EXIT_INSTRUCTION_LEN, |
3898 | return; | 4123 | IDT_VECTORING_ERROR_CODE); |
3899 | kvm_rip_write(&vmx->vcpu, vmx->rmode.irq.rip); | 4124 | } |
3900 | if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) { | 4125 | |
3901 | vmx->idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK; | 4126 | static void vmx_cancel_injection(struct kvm_vcpu *vcpu) |
3902 | vmx->idt_vectoring_info |= INTR_TYPE_EXT_INTR; | 4127 | { |
3903 | return; | 4128 | __vmx_complete_interrupts(to_vmx(vcpu), |
3904 | } | 4129 | vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), |
3905 | vmx->idt_vectoring_info = | 4130 | VM_ENTRY_INSTRUCTION_LEN, |
3906 | VECTORING_INFO_VALID_MASK | 4131 | VM_ENTRY_EXCEPTION_ERROR_CODE); |
3907 | | INTR_TYPE_EXT_INTR | 4132 | |
3908 | | vmx->rmode.irq.vector; | 4133 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); |
3909 | } | 4134 | } |
3910 | 4135 | ||
3911 | #ifdef CONFIG_X86_64 | 4136 | #ifdef CONFIG_X86_64 |
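
Note on the hunk above: the old vmx_complete_interrupts() body becomes __vmx_complete_interrupts(), parameterized by which VMCS fields carry the instruction length and the error code. The exit path keeps feeding it the IDT-vectoring/VM-exit fields, while the new vmx_cancel_injection() callback feeds it the VM-entry fields, so an event that was queued for injection but never delivered is put back on the software queues, and VM_ENTRY_INTR_INFO_FIELD is then cleared so stale state is not re-injected on the next entry. A toy stand-alone sketch of that one-worker, two-field-set shape; the field names and values here are invented, not real VMCS encodings.

    #include <stdio.h>

    /* A toy "VMCS": an array indexed by field id, standing in for vmcs_read32(). */
    enum field {
        EXIT_INSTR_LEN, EXIT_ERROR_CODE,
        ENTRY_INSTR_LEN, ENTRY_ERROR_CODE,
        NR_FIELDS
    };

    static unsigned int vmcs[NR_FIELDS] = { 3, 0x10, 2, 0x20 };

    static unsigned int vmcs_read(enum field f)
    {
        return vmcs[f];
    }

    /* One worker, parameterized by which fields describe the event -- the same
     * shape as __vmx_complete_interrupts(vmx, info, instr_len_field, err_field). */
    static void complete_event(unsigned int info,
                               enum field instr_len_field,
                               enum field error_code_field)
    {
        printf("vector %u, len %u, err %#x\n",
               info & 0xff,
               vmcs_read(instr_len_field),
               vmcs_read(error_code_field));
    }

    /* Exit path: event described by the exit-side fields. */
    static void complete_interrupts(unsigned int idt_vectoring_info)
    {
        complete_event(idt_vectoring_info, EXIT_INSTR_LEN, EXIT_ERROR_CODE);
    }

    /* Cancel path: the event that was queued for entry but never delivered. */
    static void cancel_injection(unsigned int entry_intr_info)
    {
        complete_event(entry_intr_info, ENTRY_INSTR_LEN, ENTRY_ERROR_CODE);
    }

    int main(void)
    {
        complete_interrupts(0x80000030u);   /* pretend exit-side vectoring info  */
        cancel_injection(0x80000020u);      /* pretend cancelled entry injection */
        return 0;
    }
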
@@ -3916,7 +4141,7 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx) | |||
3916 | #define Q "l" | 4141 | #define Q "l" |
3917 | #endif | 4142 | #endif |
3918 | 4143 | ||
3919 | static void vmx_vcpu_run(struct kvm_vcpu *vcpu) | 4144 | static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) |
3920 | { | 4145 | { |
3921 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 4146 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
3922 | 4147 | ||
@@ -3945,6 +4170,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) | |||
3945 | asm( | 4170 | asm( |
3946 | /* Store host registers */ | 4171 | /* Store host registers */ |
3947 | "push %%"R"dx; push %%"R"bp;" | 4172 | "push %%"R"dx; push %%"R"bp;" |
4173 | "push %%"R"cx \n\t" /* placeholder for guest rcx */ | ||
3948 | "push %%"R"cx \n\t" | 4174 | "push %%"R"cx \n\t" |
3949 | "cmp %%"R"sp, %c[host_rsp](%0) \n\t" | 4175 | "cmp %%"R"sp, %c[host_rsp](%0) \n\t" |
3950 | "je 1f \n\t" | 4176 | "je 1f \n\t" |
@@ -3986,10 +4212,11 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) | |||
3986 | ".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t" | 4212 | ".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t" |
3987 | ".Lkvm_vmx_return: " | 4213 | ".Lkvm_vmx_return: " |
3988 | /* Save guest registers, load host registers, keep flags */ | 4214 | /* Save guest registers, load host registers, keep flags */ |
3989 | "xchg %0, (%%"R"sp) \n\t" | 4215 | "mov %0, %c[wordsize](%%"R"sp) \n\t" |
4216 | "pop %0 \n\t" | ||
3990 | "mov %%"R"ax, %c[rax](%0) \n\t" | 4217 | "mov %%"R"ax, %c[rax](%0) \n\t" |
3991 | "mov %%"R"bx, %c[rbx](%0) \n\t" | 4218 | "mov %%"R"bx, %c[rbx](%0) \n\t" |
3992 | "push"Q" (%%"R"sp); pop"Q" %c[rcx](%0) \n\t" | 4219 | "pop"Q" %c[rcx](%0) \n\t" |
3993 | "mov %%"R"dx, %c[rdx](%0) \n\t" | 4220 | "mov %%"R"dx, %c[rdx](%0) \n\t" |
3994 | "mov %%"R"si, %c[rsi](%0) \n\t" | 4221 | "mov %%"R"si, %c[rsi](%0) \n\t" |
3995 | "mov %%"R"di, %c[rdi](%0) \n\t" | 4222 | "mov %%"R"di, %c[rdi](%0) \n\t" |
@@ -4007,7 +4234,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) | |||
4007 | "mov %%cr2, %%"R"ax \n\t" | 4234 | "mov %%cr2, %%"R"ax \n\t" |
4008 | "mov %%"R"ax, %c[cr2](%0) \n\t" | 4235 | "mov %%"R"ax, %c[cr2](%0) \n\t" |
4009 | 4236 | ||
4010 | "pop %%"R"bp; pop %%"R"bp; pop %%"R"dx \n\t" | 4237 | "pop %%"R"bp; pop %%"R"dx \n\t" |
4011 | "setbe %c[fail](%0) \n\t" | 4238 | "setbe %c[fail](%0) \n\t" |
4012 | : : "c"(vmx), "d"((unsigned long)HOST_RSP), | 4239 | : : "c"(vmx), "d"((unsigned long)HOST_RSP), |
4013 | [launched]"i"(offsetof(struct vcpu_vmx, launched)), | 4240 | [launched]"i"(offsetof(struct vcpu_vmx, launched)), |
@@ -4030,25 +4257,32 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) | |||
4030 | [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])), | 4257 | [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])), |
4031 | [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])), | 4258 | [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])), |
4032 | #endif | 4259 | #endif |
4033 | [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)) | 4260 | [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)), |
4261 | [wordsize]"i"(sizeof(ulong)) | ||
4034 | : "cc", "memory" | 4262 | : "cc", "memory" |
4035 | , R"bx", R"di", R"si" | 4263 | , R"ax", R"bx", R"di", R"si" |
4036 | #ifdef CONFIG_X86_64 | 4264 | #ifdef CONFIG_X86_64 |
4037 | , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" | 4265 | , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" |
4038 | #endif | 4266 | #endif |
4039 | ); | 4267 | ); |
4040 | 4268 | ||
4041 | vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP) | 4269 | vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP) |
4042 | | (1 << VCPU_EXREG_PDPTR)); | 4270 | | (1 << VCPU_EXREG_RFLAGS) |
4271 | | (1 << VCPU_EXREG_CPL) | ||
4272 | | (1 << VCPU_EXREG_PDPTR) | ||
4273 | | (1 << VCPU_EXREG_SEGMENTS) | ||
4274 | | (1 << VCPU_EXREG_CR3)); | ||
4043 | vcpu->arch.regs_dirty = 0; | 4275 | vcpu->arch.regs_dirty = 0; |
4044 | 4276 | ||
4045 | vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); | 4277 | vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); |
4046 | if (vmx->rmode.irq.pending) | ||
4047 | fixup_rmode_irq(vmx); | ||
4048 | 4278 | ||
4049 | asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); | 4279 | asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); |
4050 | vmx->launched = 1; | 4280 | vmx->launched = 1; |
4051 | 4281 | ||
4282 | vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); | ||
4283 | |||
4284 | vmx_complete_atomic_exit(vmx); | ||
4285 | vmx_recover_nmi_blocking(vmx); | ||
4052 | vmx_complete_interrupts(vmx); | 4286 | vmx_complete_interrupts(vmx); |
4053 | } | 4287 | } |
4054 | 4288 | ||
@@ -4106,8 +4340,8 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) | |||
4106 | goto free_vcpu; | 4340 | goto free_vcpu; |
4107 | 4341 | ||
4108 | vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); | 4342 | vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); |
4343 | err = -ENOMEM; | ||
4109 | if (!vmx->guest_msrs) { | 4344 | if (!vmx->guest_msrs) { |
4110 | err = -ENOMEM; | ||
4111 | goto uninit_vcpu; | 4345 | goto uninit_vcpu; |
4112 | } | 4346 | } |
4113 | 4347 | ||
@@ -4119,21 +4353,26 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) | |||
4119 | 4353 | ||
4120 | cpu = get_cpu(); | 4354 | cpu = get_cpu(); |
4121 | vmx_vcpu_load(&vmx->vcpu, cpu); | 4355 | vmx_vcpu_load(&vmx->vcpu, cpu); |
4356 | vmx->vcpu.cpu = cpu; | ||
4122 | err = vmx_vcpu_setup(vmx); | 4357 | err = vmx_vcpu_setup(vmx); |
4123 | vmx_vcpu_put(&vmx->vcpu); | 4358 | vmx_vcpu_put(&vmx->vcpu); |
4124 | put_cpu(); | 4359 | put_cpu(); |
4125 | if (err) | 4360 | if (err) |
4126 | goto free_vmcs; | 4361 | goto free_vmcs; |
4127 | if (vm_need_virtualize_apic_accesses(kvm)) | 4362 | if (vm_need_virtualize_apic_accesses(kvm)) |
4128 | if (alloc_apic_access_page(kvm) != 0) | 4363 | err = alloc_apic_access_page(kvm); |
4364 | if (err) | ||
4129 | goto free_vmcs; | 4365 | goto free_vmcs; |
4130 | 4366 | ||
4131 | if (enable_ept) { | 4367 | if (enable_ept) { |
4132 | if (!kvm->arch.ept_identity_map_addr) | 4368 | if (!kvm->arch.ept_identity_map_addr) |
4133 | kvm->arch.ept_identity_map_addr = | 4369 | kvm->arch.ept_identity_map_addr = |
4134 | VMX_EPT_IDENTITY_PAGETABLE_ADDR; | 4370 | VMX_EPT_IDENTITY_PAGETABLE_ADDR; |
4371 | err = -ENOMEM; | ||
4135 | if (alloc_identity_pagetable(kvm) != 0) | 4372 | if (alloc_identity_pagetable(kvm) != 0) |
4136 | goto free_vmcs; | 4373 | goto free_vmcs; |
4374 | if (!init_rmode_identity_map(kvm)) | ||
4375 | goto free_vmcs; | ||
4137 | } | 4376 | } |
4138 | 4377 | ||
4139 | return &vmx->vcpu; | 4378 | return &vmx->vcpu; |
@@ -4249,11 +4488,6 @@ static int vmx_get_lpage_level(void) | |||
4249 | return PT_PDPE_LEVEL; | 4488 | return PT_PDPE_LEVEL; |
4250 | } | 4489 | } |
4251 | 4490 | ||
4252 | static inline u32 bit(int bitno) | ||
4253 | { | ||
4254 | return 1 << (bitno & 31); | ||
4255 | } | ||
4256 | |||
4257 | static void vmx_cpuid_update(struct kvm_vcpu *vcpu) | 4491 | static void vmx_cpuid_update(struct kvm_vcpu *vcpu) |
4258 | { | 4492 | { |
4259 | struct kvm_cpuid_entry2 *best; | 4493 | struct kvm_cpuid_entry2 *best; |
@@ -4280,6 +4514,13 @@ static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) | |||
4280 | { | 4514 | { |
4281 | } | 4515 | } |
4282 | 4516 | ||
4517 | static int vmx_check_intercept(struct kvm_vcpu *vcpu, | ||
4518 | struct x86_instruction_info *info, | ||
4519 | enum x86_intercept_stage stage) | ||
4520 | { | ||
4521 | return X86EMUL_CONTINUE; | ||
4522 | } | ||
4523 | |||
4283 | static struct kvm_x86_ops vmx_x86_ops = { | 4524 | static struct kvm_x86_ops vmx_x86_ops = { |
4284 | .cpu_has_kvm_support = cpu_has_kvm_support, | 4525 | .cpu_has_kvm_support = cpu_has_kvm_support, |
4285 | .disabled_by_bios = vmx_disabled_by_bios, | 4526 | .disabled_by_bios = vmx_disabled_by_bios, |
@@ -4307,6 +4548,7 @@ static struct kvm_x86_ops vmx_x86_ops = { | |||
4307 | .get_cpl = vmx_get_cpl, | 4548 | .get_cpl = vmx_get_cpl, |
4308 | .get_cs_db_l_bits = vmx_get_cs_db_l_bits, | 4549 | .get_cs_db_l_bits = vmx_get_cs_db_l_bits, |
4309 | .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits, | 4550 | .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits, |
4551 | .decache_cr3 = vmx_decache_cr3, | ||
4310 | .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits, | 4552 | .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits, |
4311 | .set_cr0 = vmx_set_cr0, | 4553 | .set_cr0 = vmx_set_cr0, |
4312 | .set_cr3 = vmx_set_cr3, | 4554 | .set_cr3 = vmx_set_cr3, |
@@ -4334,6 +4576,7 @@ static struct kvm_x86_ops vmx_x86_ops = { | |||
4334 | .set_irq = vmx_inject_irq, | 4576 | .set_irq = vmx_inject_irq, |
4335 | .set_nmi = vmx_inject_nmi, | 4577 | .set_nmi = vmx_inject_nmi, |
4336 | .queue_exception = vmx_queue_exception, | 4578 | .queue_exception = vmx_queue_exception, |
4579 | .cancel_injection = vmx_cancel_injection, | ||
4337 | .interrupt_allowed = vmx_interrupt_allowed, | 4580 | .interrupt_allowed = vmx_interrupt_allowed, |
4338 | .nmi_allowed = vmx_nmi_allowed, | 4581 | .nmi_allowed = vmx_nmi_allowed, |
4339 | .get_nmi_mask = vmx_get_nmi_mask, | 4582 | .get_nmi_mask = vmx_get_nmi_mask, |
@@ -4346,7 +4589,9 @@ static struct kvm_x86_ops vmx_x86_ops = { | |||
4346 | .get_tdp_level = get_ept_level, | 4589 | .get_tdp_level = get_ept_level, |
4347 | .get_mt_mask = vmx_get_mt_mask, | 4590 | .get_mt_mask = vmx_get_mt_mask, |
4348 | 4591 | ||
4592 | .get_exit_info = vmx_get_exit_info, | ||
4349 | .exit_reasons_str = vmx_exit_reasons_str, | 4593 | .exit_reasons_str = vmx_exit_reasons_str, |
4594 | |||
4350 | .get_lpage_level = vmx_get_lpage_level, | 4595 | .get_lpage_level = vmx_get_lpage_level, |
4351 | 4596 | ||
4352 | .cpuid_update = vmx_cpuid_update, | 4597 | .cpuid_update = vmx_cpuid_update, |
@@ -4356,6 +4601,15 @@ static struct kvm_x86_ops vmx_x86_ops = { | |||
4356 | .set_supported_cpuid = vmx_set_supported_cpuid, | 4601 | .set_supported_cpuid = vmx_set_supported_cpuid, |
4357 | 4602 | ||
4358 | .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, | 4603 | .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, |
4604 | |||
4605 | .set_tsc_khz = vmx_set_tsc_khz, | ||
4606 | .write_tsc_offset = vmx_write_tsc_offset, | ||
4607 | .adjust_tsc_offset = vmx_adjust_tsc_offset, | ||
4608 | .compute_tsc_offset = vmx_compute_tsc_offset, | ||
4609 | |||
4610 | .set_tdp_cr3 = vmx_set_cr3, | ||
4611 | |||
4612 | .check_intercept = vmx_check_intercept, | ||
4359 | }; | 4613 | }; |
4360 | 4614 | ||
4361 | static int __init vmx_init(void) | 4615 | static int __init vmx_init(void) |
@@ -4417,8 +4671,6 @@ static int __init vmx_init(void) | |||
4417 | 4671 | ||
4418 | if (enable_ept) { | 4672 | if (enable_ept) { |
4419 | bypass_guest_pf = 0; | 4673 | bypass_guest_pf = 0; |
4420 | kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK | | ||
4421 | VMX_EPT_WRITABLE_MASK); | ||
4422 | kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull, | 4674 | kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull, |
4423 | VMX_EPT_EXECUTABLE_MASK); | 4675 | VMX_EPT_EXECUTABLE_MASK); |
4424 | kvm_enable_tdp(); | 4676 | kvm_enable_tdp(); |
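
Between the two files: the vmx_x86_ops additions at the end of vmx.c (.decache_cr3, .cancel_injection, .get_exit_info, the four TSC hooks, .set_tdp_cr3, .check_intercept) are the backend half of hooks the common x86 code starts using in this merge; kvm_write_tsc() is already called from vmx_vcpu_setup() above, and vmx_check_intercept() appears to be a stub that lets every emulated instruction continue (X86EMUL_CONTINUE). A tiny sketch of the ops-table pattern itself, with invented hook names and values:

    #include <stdio.h>

    /* Toy analogue of a kvm_x86_ops-style table: a struct of backend hooks. */
    struct backend_ops {
        void (*get_exit_info)(unsigned long *info1, unsigned long *info2);
        int  (*check_intercept)(int stage);     /* 0 == "let it continue" */
    };

    static void toy_get_exit_info(unsigned long *info1, unsigned long *info2)
    {
        *info1 = 0x1234;                        /* would be EXIT_QUALIFICATION */
        *info2 = 0x5678;                        /* would be VM_EXIT_INTR_INFO  */
    }

    /* Like vmx_check_intercept(): a stub that never blocks anything. */
    static int toy_check_intercept(int stage)
    {
        (void)stage;
        return 0;
    }

    static const struct backend_ops ops = {
        .get_exit_info   = toy_get_exit_info,
        .check_intercept = toy_check_intercept,
    };

    int main(void)
    {
        unsigned long q, intr;

        ops.get_exit_info(&q, &intr);
        printf("exit info %#lx / %#lx, intercept -> %d\n",
               q, intr, ops.check_intercept(1));
        return 0;
    }
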
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3a09c625d526..77c9d8673dc4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c | |||
@@ -6,7 +6,7 @@ | |||
6 | * Copyright (C) 2006 Qumranet, Inc. | 6 | * Copyright (C) 2006 Qumranet, Inc. |
7 | * Copyright (C) 2008 Qumranet, Inc. | 7 | * Copyright (C) 2008 Qumranet, Inc. |
8 | * Copyright IBM Corporation, 2008 | 8 | * Copyright IBM Corporation, 2008 |
9 | * Copyright 2010 Red Hat, Inc. and/or its affilates. | 9 | * Copyright 2010 Red Hat, Inc. and/or its affiliates. |
10 | * | 10 | * |
11 | * Authors: | 11 | * Authors: |
12 | * Avi Kivity <avi@qumranet.com> | 12 | * Avi Kivity <avi@qumranet.com> |
@@ -43,6 +43,7 @@ | |||
43 | #include <linux/slab.h> | 43 | #include <linux/slab.h> |
44 | #include <linux/perf_event.h> | 44 | #include <linux/perf_event.h> |
45 | #include <linux/uaccess.h> | 45 | #include <linux/uaccess.h> |
46 | #include <linux/hash.h> | ||
46 | #include <trace/events/kvm.h> | 47 | #include <trace/events/kvm.h> |
47 | 48 | ||
48 | #define CREATE_TRACE_POINTS | 49 | #define CREATE_TRACE_POINTS |
@@ -55,32 +56,25 @@ | |||
55 | #include <asm/mce.h> | 56 | #include <asm/mce.h> |
56 | #include <asm/i387.h> | 57 | #include <asm/i387.h> |
57 | #include <asm/xcr.h> | 58 | #include <asm/xcr.h> |
59 | #include <asm/pvclock.h> | ||
60 | #include <asm/div64.h> | ||
58 | 61 | ||
59 | #define MAX_IO_MSRS 256 | 62 | #define MAX_IO_MSRS 256 |
60 | #define CR0_RESERVED_BITS \ | ||
61 | (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ | ||
62 | | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \ | ||
63 | | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG)) | ||
64 | #define CR4_RESERVED_BITS \ | ||
65 | (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ | ||
66 | | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ | ||
67 | | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \ | ||
68 | | X86_CR4_OSXSAVE \ | ||
69 | | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) | ||
70 | |||
71 | #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) | ||
72 | |||
73 | #define KVM_MAX_MCE_BANKS 32 | 63 | #define KVM_MAX_MCE_BANKS 32 |
74 | #define KVM_MCE_CAP_SUPPORTED MCG_CTL_P | 64 | #define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P) |
65 | |||
66 | #define emul_to_vcpu(ctxt) \ | ||
67 | container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt) | ||
75 | 68 | ||
76 | /* EFER defaults: | 69 | /* EFER defaults: |
77 | * - enable syscall per default because its emulated by KVM | 70 | * - enable syscall per default because its emulated by KVM |
78 | * - enable LME and LMA per default on 64 bit KVM | 71 | * - enable LME and LMA per default on 64 bit KVM |
79 | */ | 72 | */ |
80 | #ifdef CONFIG_X86_64 | 73 | #ifdef CONFIG_X86_64 |
81 | static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffafeULL; | 74 | static |
75 | u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA)); | ||
82 | #else | 76 | #else |
83 | static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL; | 77 | static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE); |
84 | #endif | 78 | #endif |
85 | 79 | ||
86 | #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM | 80 | #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM |
@@ -96,6 +90,11 @@ EXPORT_SYMBOL_GPL(kvm_x86_ops); | |||
96 | int ignore_msrs = 0; | 90 | int ignore_msrs = 0; |
97 | module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR); | 91 | module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR); |
98 | 92 | ||
93 | bool kvm_has_tsc_control; | ||
94 | EXPORT_SYMBOL_GPL(kvm_has_tsc_control); | ||
95 | u32 kvm_max_guest_tsc_khz; | ||
96 | EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz); | ||
97 | |||
99 | #define KVM_NR_SHARED_MSRS 16 | 98 | #define KVM_NR_SHARED_MSRS 16 |
100 | 99 | ||
101 | struct kvm_shared_msrs_global { | 100 | struct kvm_shared_msrs_global { |
@@ -153,9 +152,13 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { | |||
153 | 152 | ||
154 | u64 __read_mostly host_xcr0; | 153 | u64 __read_mostly host_xcr0; |
155 | 154 | ||
156 | static inline u32 bit(int bitno) | 155 | int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt); |
156 | |||
157 | static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu) | ||
157 | { | 158 | { |
158 | return 1 << (bitno & 31); | 159 | int i; |
160 | for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU); i++) | ||
161 | vcpu->arch.apf.gfns[i] = ~0; | ||
159 | } | 162 | } |
160 | 163 | ||
161 | static void kvm_on_user_return(struct user_return_notifier *urn) | 164 | static void kvm_on_user_return(struct user_return_notifier *urn) |
@@ -282,6 +285,8 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, | |||
282 | u32 prev_nr; | 285 | u32 prev_nr; |
283 | int class1, class2; | 286 | int class1, class2; |
284 | 287 | ||
288 | kvm_make_request(KVM_REQ_EVENT, vcpu); | ||
289 | |||
285 | if (!vcpu->arch.exception.pending) { | 290 | if (!vcpu->arch.exception.pending) { |
286 | queue: | 291 | queue: |
287 | vcpu->arch.exception.pending = true; | 292 | vcpu->arch.exception.pending = true; |
@@ -327,16 +332,33 @@ void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr) | |||
327 | } | 332 | } |
328 | EXPORT_SYMBOL_GPL(kvm_requeue_exception); | 333 | EXPORT_SYMBOL_GPL(kvm_requeue_exception); |
329 | 334 | ||
330 | void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr, | 335 | void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err) |
331 | u32 error_code) | 336 | { |
337 | if (err) | ||
338 | kvm_inject_gp(vcpu, 0); | ||
339 | else | ||
340 | kvm_x86_ops->skip_emulated_instruction(vcpu); | ||
341 | } | ||
342 | EXPORT_SYMBOL_GPL(kvm_complete_insn_gp); | ||
343 | |||
344 | void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) | ||
332 | { | 345 | { |
333 | ++vcpu->stat.pf_guest; | 346 | ++vcpu->stat.pf_guest; |
334 | vcpu->arch.cr2 = addr; | 347 | vcpu->arch.cr2 = fault->address; |
335 | kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); | 348 | kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code); |
349 | } | ||
350 | |||
351 | void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) | ||
352 | { | ||
353 | if (mmu_is_nested(vcpu) && !fault->nested_page_fault) | ||
354 | vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault); | ||
355 | else | ||
356 | vcpu->arch.mmu.inject_page_fault(vcpu, fault); | ||
336 | } | 357 | } |
337 | 358 | ||
338 | void kvm_inject_nmi(struct kvm_vcpu *vcpu) | 359 | void kvm_inject_nmi(struct kvm_vcpu *vcpu) |
339 | { | 360 | { |
361 | kvm_make_request(KVM_REQ_EVENT, vcpu); | ||
340 | vcpu->arch.nmi_pending = 1; | 362 | vcpu->arch.nmi_pending = 1; |
341 | } | 363 | } |
342 | EXPORT_SYMBOL_GPL(kvm_inject_nmi); | 364 | EXPORT_SYMBOL_GPL(kvm_inject_nmi); |
@@ -367,18 +389,49 @@ bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl) | |||
367 | EXPORT_SYMBOL_GPL(kvm_require_cpl); | 389 | EXPORT_SYMBOL_GPL(kvm_require_cpl); |
368 | 390 | ||
369 | /* | 391 | /* |
392 | * This function will be used to read from the physical memory of the currently | ||
393 | * running guest. Unlike kvm_read_guest_page, this function can read | ||
394 | * either from guest physical memory or from the guest's guest physical memory. | ||
395 | */ | ||
396 | int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, | ||
397 | gfn_t ngfn, void *data, int offset, int len, | ||
398 | u32 access) | ||
399 | { | ||
400 | gfn_t real_gfn; | ||
401 | gpa_t ngpa; | ||
402 | |||
403 | ngpa = gfn_to_gpa(ngfn); | ||
404 | real_gfn = mmu->translate_gpa(vcpu, ngpa, access); | ||
405 | if (real_gfn == UNMAPPED_GVA) | ||
406 | return -EFAULT; | ||
407 | |||
408 | real_gfn = gpa_to_gfn(real_gfn); | ||
409 | |||
410 | return kvm_read_guest_page(vcpu->kvm, real_gfn, data, offset, len); | ||
411 | } | ||
412 | EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu); | ||
413 | |||
414 | int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, | ||
415 | void *data, int offset, int len, u32 access) | ||
416 | { | ||
417 | return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn, | ||
418 | data, offset, len, access); | ||
419 | } | ||
420 | |||
421 | /* | ||
370 | * Load the pae pdptrs. Return true if they are all valid. | 422
371 | */ | 423 | */ |
372 | int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) | 424 | int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3) |
373 | { | 425 | { |
374 | gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT; | 426 | gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT; |
375 | unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2; | 427 | unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2; |
376 | int i; | 428 | int i; |
377 | int ret; | 429 | int ret; |
378 | u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; | 430 | u64 pdpte[ARRAY_SIZE(mmu->pdptrs)]; |
379 | 431 | ||
380 | ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte, | 432 | ret = kvm_read_guest_page_mmu(vcpu, mmu, pdpt_gfn, pdpte, |
381 | offset * sizeof(u64), sizeof(pdpte)); | 433 | offset * sizeof(u64), sizeof(pdpte), |
434 | PFERR_USER_MASK|PFERR_WRITE_MASK); | ||
382 | if (ret < 0) { | 435 | if (ret < 0) { |
383 | ret = 0; | 436 | ret = 0; |
384 | goto out; | 437 | goto out; |
@@ -392,7 +445,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) | |||
392 | } | 445 | } |
393 | ret = 1; | 446 | ret = 1; |
394 | 447 | ||
395 | memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs)); | 448 | memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs)); |
396 | __set_bit(VCPU_EXREG_PDPTR, | 449 | __set_bit(VCPU_EXREG_PDPTR, |
397 | (unsigned long *)&vcpu->arch.regs_avail); | 450 | (unsigned long *)&vcpu->arch.regs_avail); |
398 | __set_bit(VCPU_EXREG_PDPTR, | 451 | __set_bit(VCPU_EXREG_PDPTR, |
@@ -405,8 +458,10 @@ EXPORT_SYMBOL_GPL(load_pdptrs); | |||
405 | 458 | ||
406 | static bool pdptrs_changed(struct kvm_vcpu *vcpu) | 459 | static bool pdptrs_changed(struct kvm_vcpu *vcpu) |
407 | { | 460 | { |
408 | u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; | 461 | u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)]; |
409 | bool changed = true; | 462 | bool changed = true; |
463 | int offset; | ||
464 | gfn_t gfn; | ||
410 | int r; | 465 | int r; |
411 | 466 | ||
412 | if (is_long_mode(vcpu) || !is_pae(vcpu)) | 467 | if (is_long_mode(vcpu) || !is_pae(vcpu)) |
@@ -416,10 +471,13 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu) | |||
416 | (unsigned long *)&vcpu->arch.regs_avail)) | 471 | (unsigned long *)&vcpu->arch.regs_avail)) |
417 | return true; | 472 | return true; |
418 | 473 | ||
419 | r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte)); | 474 | gfn = (kvm_read_cr3(vcpu) & ~31u) >> PAGE_SHIFT; |
475 | offset = (kvm_read_cr3(vcpu) & ~31u) & (PAGE_SIZE - 1); | ||
476 | r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte), | ||
477 | PFERR_USER_MASK | PFERR_WRITE_MASK); | ||
420 | if (r < 0) | 478 | if (r < 0) |
421 | goto out; | 479 | goto out; |
422 | changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0; | 480 | changed = memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0; |
423 | out: | 481 | out: |
424 | 482 | ||
425 | return changed; | 483 | return changed; |
@@ -458,12 +516,18 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) | |||
458 | return 1; | 516 | return 1; |
459 | } else | 517 | } else |
460 | #endif | 518 | #endif |
461 | if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) | 519 | if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, |
520 | kvm_read_cr3(vcpu))) | ||
462 | return 1; | 521 | return 1; |
463 | } | 522 | } |
464 | 523 | ||
465 | kvm_x86_ops->set_cr0(vcpu, cr0); | 524 | kvm_x86_ops->set_cr0(vcpu, cr0); |
466 | 525 | ||
526 | if ((cr0 ^ old_cr0) & X86_CR0_PG) { | ||
527 | kvm_clear_async_pf_completion_queue(vcpu); | ||
528 | kvm_async_pf_hash_reset(vcpu); | ||
529 | } | ||
530 | |||
467 | if ((cr0 ^ old_cr0) & update_bits) | 531 | if ((cr0 ^ old_cr0) & update_bits) |
468 | kvm_mmu_reset_context(vcpu); | 532 | kvm_mmu_reset_context(vcpu); |
469 | return 0; | 533 | return 0; |
@@ -547,7 +611,8 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) | |||
547 | return 1; | 611 | return 1; |
548 | } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) | 612 | } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) |
549 | && ((cr4 ^ old_cr4) & pdptr_bits) | 613 | && ((cr4 ^ old_cr4) & pdptr_bits) |
550 | && !load_pdptrs(vcpu, vcpu->arch.cr3)) | 614 | && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, |
615 | kvm_read_cr3(vcpu))) | ||
551 | return 1; | 616 | return 1; |
552 | 617 | ||
553 | if (cr4 & X86_CR4_VMXE) | 618 | if (cr4 & X86_CR4_VMXE) |
@@ -567,7 +632,7 @@ EXPORT_SYMBOL_GPL(kvm_set_cr4); | |||
567 | 632 | ||
568 | int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) | 633 | int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) |
569 | { | 634 | { |
570 | if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) { | 635 | if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) { |
571 | kvm_mmu_sync_roots(vcpu); | 636 | kvm_mmu_sync_roots(vcpu); |
572 | kvm_mmu_flush_tlb(vcpu); | 637 | kvm_mmu_flush_tlb(vcpu); |
573 | return 0; | 638 | return 0; |
@@ -580,7 +645,8 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) | |||
580 | if (is_pae(vcpu)) { | 645 | if (is_pae(vcpu)) { |
581 | if (cr3 & CR3_PAE_RESERVED_BITS) | 646 | if (cr3 & CR3_PAE_RESERVED_BITS) |
582 | return 1; | 647 | return 1; |
583 | if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) | 648 | if (is_paging(vcpu) && |
649 | !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) | ||
584 | return 1; | 650 | return 1; |
585 | } | 651 | } |
586 | /* | 652 | /* |
@@ -601,12 +667,13 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) | |||
601 | if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT))) | 667 | if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT))) |
602 | return 1; | 668 | return 1; |
603 | vcpu->arch.cr3 = cr3; | 669 | vcpu->arch.cr3 = cr3; |
670 | __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); | ||
604 | vcpu->arch.mmu.new_cr3(vcpu); | 671 | vcpu->arch.mmu.new_cr3(vcpu); |
605 | return 0; | 672 | return 0; |
606 | } | 673 | } |
607 | EXPORT_SYMBOL_GPL(kvm_set_cr3); | 674 | EXPORT_SYMBOL_GPL(kvm_set_cr3); |
608 | 675 | ||
609 | int __kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) | 676 | int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) |
610 | { | 677 | { |
611 | if (cr8 & CR8_RESERVED_BITS) | 678 | if (cr8 & CR8_RESERVED_BITS) |
612 | return 1; | 679 | return 1; |
@@ -616,12 +683,6 @@ int __kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) | |||
616 | vcpu->arch.cr8 = cr8; | 683 | vcpu->arch.cr8 = cr8; |
617 | return 0; | 684 | return 0; |
618 | } | 685 | } |
619 | |||
620 | void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) | ||
621 | { | ||
622 | if (__kvm_set_cr8(vcpu, cr8)) | ||
623 | kvm_inject_gp(vcpu, 0); | ||
624 | } | ||
625 | EXPORT_SYMBOL_GPL(kvm_set_cr8); | 686 | EXPORT_SYMBOL_GPL(kvm_set_cr8); |
626 | 687 | ||
627 | unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) | 688 | unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) |
@@ -726,18 +787,18 @@ EXPORT_SYMBOL_GPL(kvm_get_dr); | |||
726 | * kvm-specific. Those are put in the beginning of the list. | 787 | * kvm-specific. Those are put in the beginning of the list. |
727 | */ | 788 | */ |
728 | 789 | ||
729 | #define KVM_SAVE_MSRS_BEGIN 7 | 790 | #define KVM_SAVE_MSRS_BEGIN 8 |
730 | static u32 msrs_to_save[] = { | 791 | static u32 msrs_to_save[] = { |
731 | MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, | 792 | MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, |
732 | MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, | 793 | MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, |
733 | HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, | 794 | HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, |
734 | HV_X64_MSR_APIC_ASSIST_PAGE, | 795 | HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, |
735 | MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, | 796 | MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, |
736 | MSR_STAR, | 797 | MSR_STAR, |
737 | #ifdef CONFIG_X86_64 | 798 | #ifdef CONFIG_X86_64 |
738 | MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, | 799 | MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, |
739 | #endif | 800 | #endif |
740 | MSR_IA32_TSC, MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA | 801 | MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA |
741 | }; | 802 | }; |
742 | 803 | ||
743 | static unsigned num_msrs_to_save; | 804 | static unsigned num_msrs_to_save; |
@@ -781,7 +842,6 @@ static int set_efer(struct kvm_vcpu *vcpu, u64 efer) | |||
781 | kvm_x86_ops->set_efer(vcpu, efer); | 842 | kvm_x86_ops->set_efer(vcpu, efer); |
782 | 843 | ||
783 | vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled; | 844 | vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled; |
784 | kvm_mmu_reset_context(vcpu); | ||
785 | 845 | ||
786 | /* Update reserved bits */ | 846 | /* Update reserved bits */ |
787 | if ((efer ^ old_efer) & EFER_NX) | 847 | if ((efer ^ old_efer) & EFER_NX) |
@@ -838,7 +898,7 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) | |||
838 | 898 | ||
839 | /* | 899 | /* |
840 | * The guest calculates current wall clock time by adding | 900 | * The guest calculates current wall clock time by adding |
841 | * system time (updated by kvm_write_guest_time below) to the | 901 | * system time (updated by kvm_guest_time_update below) to the |
842 | * wall clock specified here. guest system time equals host | 902 | * wall clock specified here. guest system time equals host |
843 | * system time for us, thus we must fill in host boot time here. | 903 | * system time for us, thus we must fill in host boot time here. |
844 | */ | 904 | */ |
@@ -866,65 +926,235 @@ static uint32_t div_frac(uint32_t dividend, uint32_t divisor) | |||
866 | return quotient; | 926 | return quotient; |
867 | } | 927 | } |
868 | 928 | ||
869 | static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock) | 929 | static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz, |
930 | s8 *pshift, u32 *pmultiplier) | ||
870 | { | 931 | { |
871 | uint64_t nsecs = 1000000000LL; | 932 | uint64_t scaled64; |
872 | int32_t shift = 0; | 933 | int32_t shift = 0; |
873 | uint64_t tps64; | 934 | uint64_t tps64; |
874 | uint32_t tps32; | 935 | uint32_t tps32; |
875 | 936 | ||
876 | tps64 = tsc_khz * 1000LL; | 937 | tps64 = base_khz * 1000LL; |
877 | while (tps64 > nsecs*2) { | 938 | scaled64 = scaled_khz * 1000LL; |
939 | while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) { | ||
878 | tps64 >>= 1; | 940 | tps64 >>= 1; |
879 | shift--; | 941 | shift--; |
880 | } | 942 | } |
881 | 943 | ||
882 | tps32 = (uint32_t)tps64; | 944 | tps32 = (uint32_t)tps64; |
883 | while (tps32 <= (uint32_t)nsecs) { | 945 | while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000ULL) { |
884 | tps32 <<= 1; | 946 | if (scaled64 & 0xffffffff00000000ULL || tps32 & 0x80000000) |
947 | scaled64 >>= 1; | ||
948 | else | ||
949 | tps32 <<= 1; | ||
885 | shift++; | 950 | shift++; |
886 | } | 951 | } |
887 | 952 | ||
888 | hv_clock->tsc_shift = shift; | 953 | *pshift = shift; |
889 | hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32); | 954 | *pmultiplier = div_frac(scaled64, tps32); |
890 | 955 | ||
891 | pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n", | 956 | pr_debug("%s: base_khz %u => %u, shift %d, mul %u\n", |
892 | __func__, tsc_khz, hv_clock->tsc_shift, | 957 | __func__, base_khz, scaled_khz, shift, *pmultiplier); |
893 | hv_clock->tsc_to_system_mul); | 958 | } |
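The (shift, multiplier) pair produced by kvm_get_time_scale() above follows the pvclock convention used elsewhere in this patch (compute_guest_tsc() feeds such a pair to pvclock_scale_delta()): the multiplier is a 32.32 fixed-point factor applied after pre-shifting the tick delta. A minimal standalone sketch of that consumption, assuming GCC's unsigned __int128 for brevity (the kernel does the same math without 128-bit types):

#include <stdint.h>

/* Sketch only: convert a raw delta using a (shift, mult) pair from
 * kvm_get_time_scale().  Pre-shift the delta, then keep the high 32 bits
 * of a 64x32 multiply, mirroring pvclock_scale_delta(). */
static uint64_t scale_delta(uint64_t delta, uint32_t mult, int8_t shift)
{
        if (shift < 0)
                delta >>= -shift;
        else
                delta <<= shift;
        return (uint64_t)(((unsigned __int128)delta * mult) >> 32);
}

With base_khz set to the TSC rate and scaled_khz to NSEC_PER_SEC / 1000, as kvm_guest_time_update() does below, scale_delta() turns TSC cycles into nanoseconds; kvm_init_tsc_catchup() swaps the arguments to convert in the other direction.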
959 | |||
960 | static inline u64 get_kernel_ns(void) | ||
961 | { | ||
962 | struct timespec ts; | ||
963 | |||
964 | WARN_ON(preemptible()); | ||
965 | ktime_get_ts(&ts); | ||
966 | monotonic_to_bootbased(&ts); | ||
967 | return timespec_to_ns(&ts); | ||
894 | } | 968 | } |
895 | 969 | ||
896 | static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); | 970 | static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); |
971 | unsigned long max_tsc_khz; | ||
897 | 972 | ||
898 | static void kvm_write_guest_time(struct kvm_vcpu *v) | 973 | static inline int kvm_tsc_changes_freq(void) |
974 | { | ||
975 | int cpu = get_cpu(); | ||
976 | int ret = !boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && | ||
977 | cpufreq_quick_get(cpu) != 0; | ||
978 | put_cpu(); | ||
979 | return ret; | ||
980 | } | ||
981 | |||
982 | static u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu) | ||
983 | { | ||
984 | if (vcpu->arch.virtual_tsc_khz) | ||
985 | return vcpu->arch.virtual_tsc_khz; | ||
986 | else | ||
987 | return __this_cpu_read(cpu_tsc_khz); | ||
988 | } | ||
989 | |||
990 | static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) | ||
991 | { | ||
992 | u64 ret; | ||
993 | |||
994 | WARN_ON(preemptible()); | ||
995 | if (kvm_tsc_changes_freq()) | ||
996 | printk_once(KERN_WARNING | ||
997 | "kvm: unreliable cycle conversion on adjustable rate TSC\n"); | ||
998 | ret = nsec * vcpu_tsc_khz(vcpu); | ||
999 | do_div(ret, USEC_PER_SEC); | ||
1000 | return ret; | ||
1001 | } | ||
1002 | |||
1003 | static void kvm_init_tsc_catchup(struct kvm_vcpu *vcpu, u32 this_tsc_khz) | ||
1004 | { | ||
1005 | /* Compute a scale to convert nanoseconds in TSC cycles */ | ||
1006 | kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000, | ||
1007 | &vcpu->arch.tsc_catchup_shift, | ||
1008 | &vcpu->arch.tsc_catchup_mult); | ||
1009 | } | ||
1010 | |||
1011 | static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) | ||
1012 | { | ||
1013 | u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.last_tsc_nsec, | ||
1014 | vcpu->arch.tsc_catchup_mult, | ||
1015 | vcpu->arch.tsc_catchup_shift); | ||
1016 | tsc += vcpu->arch.last_tsc_write; | ||
1017 | return tsc; | ||
1018 | } | ||
1019 | |||
1020 | void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) | ||
1021 | { | ||
1022 | struct kvm *kvm = vcpu->kvm; | ||
1023 | u64 offset, ns, elapsed; | ||
1024 | unsigned long flags; | ||
1025 | s64 sdiff; | ||
1026 | |||
1027 | raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); | ||
1028 | offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); | ||
1029 | ns = get_kernel_ns(); | ||
1030 | elapsed = ns - kvm->arch.last_tsc_nsec; | ||
1031 | sdiff = data - kvm->arch.last_tsc_write; | ||
1032 | if (sdiff < 0) | ||
1033 | sdiff = -sdiff; | ||
1034 | |||
1035 | /* | ||
1036 | * Special case: a TSC write close to another CPU's, both in value | ||
1037 | * and in time (within 5 seconds), is interpreted as an attempt to synchronize. | ||
1038 | * The 5 seconds is to accommodate host load / swapping as | ||
1039 | * well as any reset of TSC during the boot process. | ||
1040 | * | ||
1041 | * In that case, for a reliable TSC, we can match TSC offsets, | ||
1042 | * or make a best guess using the elapsed value. | ||
1043 | */ | ||
1044 | if (sdiff < nsec_to_cycles(vcpu, 5ULL * NSEC_PER_SEC) && | ||
1045 | elapsed < 5ULL * NSEC_PER_SEC) { | ||
1046 | if (!check_tsc_unstable()) { | ||
1047 | offset = kvm->arch.last_tsc_offset; | ||
1048 | pr_debug("kvm: matched tsc offset for %llu\n", data); | ||
1049 | } else { | ||
1050 | u64 delta = nsec_to_cycles(vcpu, elapsed); | ||
1051 | offset += delta; | ||
1052 | pr_debug("kvm: adjusted tsc offset by %llu\n", delta); | ||
1053 | } | ||
1054 | ns = kvm->arch.last_tsc_nsec; | ||
1055 | } | ||
1056 | kvm->arch.last_tsc_nsec = ns; | ||
1057 | kvm->arch.last_tsc_write = data; | ||
1058 | kvm->arch.last_tsc_offset = offset; | ||
1059 | kvm_x86_ops->write_tsc_offset(vcpu, offset); | ||
1060 | raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); | ||
1061 | |||
1062 | /* Reset of TSC must disable overshoot protection below */ | ||
1063 | vcpu->arch.hv_clock.tsc_timestamp = 0; | ||
1064 | vcpu->arch.last_tsc_write = data; | ||
1065 | vcpu->arch.last_tsc_nsec = ns; | ||
1066 | } | ||
1067 | EXPORT_SYMBOL_GPL(kvm_write_tsc); | ||
1068 | |||
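A worked illustration of the 5-second window in kvm_write_tsc() above, using assumed numbers: nsec_to_cycles() computes cycles = nsec * tsc_khz / USEC_PER_SEC, so for a 2,000,000 kHz (2 GHz) guest TSC the threshold is 5,000,000,000 * 2,000,000 / 1,000,000 = 10^10 cycles. Two TSC writes that differ by fewer cycles than that and arrive within 5 seconds of wall-clock time are treated as a synchronization attempt: with a stable host TSC the second vCPU reuses last_tsc_offset, otherwise the offset is advanced by nsec_to_cycles(elapsed).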
1069 | static int kvm_guest_time_update(struct kvm_vcpu *v) | ||
899 | { | 1070 | { |
900 | struct timespec ts; | ||
901 | unsigned long flags; | 1071 | unsigned long flags; |
902 | struct kvm_vcpu_arch *vcpu = &v->arch; | 1072 | struct kvm_vcpu_arch *vcpu = &v->arch; |
903 | void *shared_kaddr; | 1073 | void *shared_kaddr; |
904 | unsigned long this_tsc_khz; | 1074 | unsigned long this_tsc_khz; |
1075 | s64 kernel_ns, max_kernel_ns; | ||
1076 | u64 tsc_timestamp; | ||
905 | 1077 | ||
906 | if ((!vcpu->time_page)) | 1078 | /* Keep irq disabled to prevent changes to the clock */ |
907 | return; | 1079 | local_irq_save(flags); |
1080 | kvm_get_msr(v, MSR_IA32_TSC, &tsc_timestamp); | ||
1081 | kernel_ns = get_kernel_ns(); | ||
1082 | this_tsc_khz = vcpu_tsc_khz(v); | ||
1083 | if (unlikely(this_tsc_khz == 0)) { | ||
1084 | local_irq_restore(flags); | ||
1085 | kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); | ||
1086 | return 1; | ||
1087 | } | ||
908 | 1088 | ||
909 | this_tsc_khz = get_cpu_var(cpu_tsc_khz); | 1089 | /* |
910 | if (unlikely(vcpu->hv_clock_tsc_khz != this_tsc_khz)) { | 1090 | * We may have to catch up the TSC to match elapsed wall clock |
911 | kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock); | 1091 | * time for two reasons, even if kvmclock is used. |
912 | vcpu->hv_clock_tsc_khz = this_tsc_khz; | 1092 | * 1) CPU could have been running below the maximum TSC rate |
1093 | * 2) Broken TSC compensation resets the base at each VCPU | ||
1094 | * entry to avoid unknown leaps of TSC even when running | ||
1095 | * again on the same CPU. This may cause apparent elapsed | ||
1096 | * time to disappear, and the guest to stand still or run | ||
1097 | * very slowly. | ||
1098 | */ | ||
1099 | if (vcpu->tsc_catchup) { | ||
1100 | u64 tsc = compute_guest_tsc(v, kernel_ns); | ||
1101 | if (tsc > tsc_timestamp) { | ||
1102 | kvm_x86_ops->adjust_tsc_offset(v, tsc - tsc_timestamp); | ||
1103 | tsc_timestamp = tsc; | ||
1104 | } | ||
913 | } | 1105 | } |
914 | put_cpu_var(cpu_tsc_khz); | ||
915 | 1106 | ||
916 | /* Keep irq disabled to prevent changes to the clock */ | ||
917 | local_irq_save(flags); | ||
918 | kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp); | ||
919 | ktime_get_ts(&ts); | ||
920 | monotonic_to_bootbased(&ts); | ||
921 | local_irq_restore(flags); | 1107 | local_irq_restore(flags); |
922 | 1108 | ||
923 | /* With all the info we got, fill in the values */ | 1109 | if (!vcpu->time_page) |
1110 | return 0; | ||
924 | 1111 | ||
925 | vcpu->hv_clock.system_time = ts.tv_nsec + | 1112 | /* |
926 | (NSEC_PER_SEC * (u64)ts.tv_sec) + v->kvm->arch.kvmclock_offset; | 1113 | * Time as measured by the TSC may go backwards when resetting the base |
1114 | * tsc_timestamp. The reason for this is that the TSC resolution is | ||
1115 | * higher than the resolution of the other clock scales. Thus, many | ||
1116 | * possible measurements of the TSC correspond to one measurement of any | ||
1117 | * other clock, and so a spread of values is possible. This is not a | ||
1118 | * problem for the computation of the nanosecond clock; with TSC rates | ||
1119 | * around 1 GHz, there can only be a few cycles which correspond to one | ||
1120 | * nanosecond value, and any path through this code will inevitably | ||
1121 | * take longer than that. However, with the kernel_ns value itself, | ||
1122 | * the precision may be much lower, down to HZ granularity. If the | ||
1123 | * first sampling of TSC against kernel_ns ends in the low part of the | ||
1124 | * range, and the second in the high end of the range, we can get: | ||
1125 | * | ||
1126 | * (TSC - offset_low) * S + kns_old > (TSC - offset_high) * S + kns_new | ||
1127 | * | ||
1128 | * As the sampling errors potentially range in the thousands of cycles, | ||
1129 | * it is possible such a time value has already been observed by the | ||
1130 | * guest. To protect against this, we must compute the system time as | ||
1131 | * observed by the guest and ensure the new system time is greater. | ||
1132 | */ | ||
1133 | max_kernel_ns = 0; | ||
1134 | if (vcpu->hv_clock.tsc_timestamp && vcpu->last_guest_tsc) { | ||
1135 | max_kernel_ns = vcpu->last_guest_tsc - | ||
1136 | vcpu->hv_clock.tsc_timestamp; | ||
1137 | max_kernel_ns = pvclock_scale_delta(max_kernel_ns, | ||
1138 | vcpu->hv_clock.tsc_to_system_mul, | ||
1139 | vcpu->hv_clock.tsc_shift); | ||
1140 | max_kernel_ns += vcpu->last_kernel_ns; | ||
1141 | } | ||
927 | 1142 | ||
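A worked illustration of the clamp being prepared above, with assumed numbers: suppose last_kernel_ns = 10,000,000 ns and the cycles elapsed since hv_clock.tsc_timestamp scale (via tsc_to_system_mul / tsc_shift) to 4,003,000 ns, giving max_kernel_ns = 14,003,000 ns. If the coarser ktime-based sample returns kernel_ns = 14,000,000 ns, publishing it directly would let the guest read a time 3 us earlier than one it may already have observed; the kernel_ns = max_kernel_ns clamp a few lines below prevents that backward step.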
1143 | if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) { | ||
1144 | kvm_get_time_scale(NSEC_PER_SEC / 1000, this_tsc_khz, | ||
1145 | &vcpu->hv_clock.tsc_shift, | ||
1146 | &vcpu->hv_clock.tsc_to_system_mul); | ||
1147 | vcpu->hw_tsc_khz = this_tsc_khz; | ||
1148 | } | ||
1149 | |||
1150 | if (max_kernel_ns > kernel_ns) | ||
1151 | kernel_ns = max_kernel_ns; | ||
1152 | |||
1153 | /* With all the info we got, fill in the values */ | ||
1154 | vcpu->hv_clock.tsc_timestamp = tsc_timestamp; | ||
1155 | vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; | ||
1156 | vcpu->last_kernel_ns = kernel_ns; | ||
1157 | vcpu->last_guest_tsc = tsc_timestamp; | ||
928 | vcpu->hv_clock.flags = 0; | 1158 | vcpu->hv_clock.flags = 0; |
929 | 1159 | ||
930 | /* | 1160 | /* |
@@ -942,16 +1172,7 @@ static void kvm_write_guest_time(struct kvm_vcpu *v) | |||
942 | kunmap_atomic(shared_kaddr, KM_USER0); | 1172 | kunmap_atomic(shared_kaddr, KM_USER0); |
943 | 1173 | ||
944 | mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT); | 1174 | mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT); |
945 | } | 1175 | return 0; |
946 | |||
947 | static int kvm_request_guest_time_update(struct kvm_vcpu *v) | ||
948 | { | ||
949 | struct kvm_vcpu_arch *vcpu = &v->arch; | ||
950 | |||
951 | if (!vcpu->time_page) | ||
952 | return 0; | ||
953 | kvm_make_request(KVM_REQ_KVMCLOCK_UPDATE, v); | ||
954 | return 1; | ||
955 | } | 1176 | } |
956 | 1177 | ||
957 | static bool msr_mtrr_valid(unsigned msr) | 1178 | static bool msr_mtrr_valid(unsigned msr) |
@@ -1214,6 +1435,38 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data) | |||
1214 | return 0; | 1435 | return 0; |
1215 | } | 1436 | } |
1216 | 1437 | ||
1438 | static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) | ||
1439 | { | ||
1440 | gpa_t gpa = data & ~0x3f; | ||
1441 | |||
1442 | /* Bits 2:5 are reserved and should be zero */ | ||
1443 | if (data & 0x3c) | ||
1444 | return 1; | ||
1445 | |||
1446 | vcpu->arch.apf.msr_val = data; | ||
1447 | |||
1448 | if (!(data & KVM_ASYNC_PF_ENABLED)) { | ||
1449 | kvm_clear_async_pf_completion_queue(vcpu); | ||
1450 | kvm_async_pf_hash_reset(vcpu); | ||
1451 | return 0; | ||
1452 | } | ||
1453 | |||
1454 | if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa)) | ||
1455 | return 1; | ||
1456 | |||
1457 | vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS); | ||
1458 | kvm_async_pf_wakeup_all(vcpu); | ||
1459 | return 0; | ||
1460 | } | ||
1461 | |||
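Given the MSR layout enforced by kvm_pv_enable_async_pf() above (bit 0 = KVM_ASYNC_PF_ENABLED, bit 1 = KVM_ASYNC_PF_SEND_ALWAYS, bits 2:5 reserved, and the 64-byte-aligned data area address in the remaining bits), the guest-side registration looks roughly like the sketch below. This is illustrative only; the real guest code lives in arch/x86/kernel/kvm.c, and the helper name here is made up.

#include <linux/percpu.h>
#include <asm/kvm_para.h>        /* MSR_KVM_ASYNC_PF_EN, KVM_ASYNC_PF_ENABLED */
#include <asm/msr.h>

/* Sketch of the guest side: publish a 64-byte-aligned per-CPU data area
 * and turn on async page faults for this vCPU. */
static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);

static void sketch_enable_async_pf(void)
{
        u64 pa = __pa(this_cpu_ptr(&apf_reason));  /* low 6 bits are zero */

        wrmsrl(MSR_KVM_ASYNC_PF_EN, pa | KVM_ASYNC_PF_ENABLED);
}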
1462 | static void kvmclock_reset(struct kvm_vcpu *vcpu) | ||
1463 | { | ||
1464 | if (vcpu->arch.time_page) { | ||
1465 | kvm_release_page_dirty(vcpu->arch.time_page); | ||
1466 | vcpu->arch.time_page = NULL; | ||
1467 | } | ||
1468 | } | ||
1469 | |||
1217 | int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) | 1470 | int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) |
1218 | { | 1471 | { |
1219 | switch (msr) { | 1472 | switch (msr) { |
@@ -1271,12 +1524,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) | |||
1271 | break; | 1524 | break; |
1272 | case MSR_KVM_SYSTEM_TIME_NEW: | 1525 | case MSR_KVM_SYSTEM_TIME_NEW: |
1273 | case MSR_KVM_SYSTEM_TIME: { | 1526 | case MSR_KVM_SYSTEM_TIME: { |
1274 | if (vcpu->arch.time_page) { | 1527 | kvmclock_reset(vcpu); |
1275 | kvm_release_page_dirty(vcpu->arch.time_page); | ||
1276 | vcpu->arch.time_page = NULL; | ||
1277 | } | ||
1278 | 1528 | ||
1279 | vcpu->arch.time = data; | 1529 | vcpu->arch.time = data; |
1530 | kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); | ||
1280 | 1531 | ||
1281 | /* we verify if the enable bit is set... */ | 1532 | /* we verify if the enable bit is set... */ |
1282 | if (!(data & 1)) | 1533 | if (!(data & 1)) |
@@ -1292,10 +1543,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) | |||
1292 | kvm_release_page_clean(vcpu->arch.time_page); | 1543 | kvm_release_page_clean(vcpu->arch.time_page); |
1293 | vcpu->arch.time_page = NULL; | 1544 | vcpu->arch.time_page = NULL; |
1294 | } | 1545 | } |
1295 | |||
1296 | kvm_request_guest_time_update(vcpu); | ||
1297 | break; | 1546 | break; |
1298 | } | 1547 | } |
1548 | case MSR_KVM_ASYNC_PF_EN: | ||
1549 | if (kvm_pv_enable_async_pf(vcpu, data)) | ||
1550 | return 1; | ||
1551 | break; | ||
1299 | case MSR_IA32_MCG_CTL: | 1552 | case MSR_IA32_MCG_CTL: |
1300 | case MSR_IA32_MCG_STATUS: | 1553 | case MSR_IA32_MCG_STATUS: |
1301 | case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: | 1554 | case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: |
@@ -1330,6 +1583,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) | |||
1330 | pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " | 1583 | pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " |
1331 | "0x%x data 0x%llx\n", msr, data); | 1584 | "0x%x data 0x%llx\n", msr, data); |
1332 | break; | 1585 | break; |
1586 | case MSR_K7_CLK_CTL: | ||
1587 | /* | ||
1588 | * Ignore all writes to this no longer documented MSR. | ||
1589 | * Writes are only relevant for old K7 processors, | ||
1590 | * all pre-dating SVM, but writing it is a workaround | ||
1591 | * recommended by AMD for those chips. It is possible to specify the | ||
1592 | * affected processor models on the command line, hence | ||
1593 | * the need to ignore the workaround. | ||
1594 | */ | ||
1595 | break; | ||
1333 | case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: | 1596 | case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: |
1334 | if (kvm_hv_msr_partition_wide(msr)) { | 1597 | if (kvm_hv_msr_partition_wide(msr)) { |
1335 | int r; | 1598 | int r; |
@@ -1340,6 +1603,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) | |||
1340 | } else | 1603 | } else |
1341 | return set_msr_hyperv(vcpu, msr, data); | 1604 | return set_msr_hyperv(vcpu, msr, data); |
1342 | break; | 1605 | break; |
1606 | case MSR_IA32_BBL_CR_CTL3: | ||
1607 | /* Drop writes to this legacy MSR -- see rdmsr | ||
1608 | * counterpart for further detail. | ||
1609 | */ | ||
1610 | pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data); | ||
1611 | break; | ||
1343 | default: | 1612 | default: |
1344 | if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) | 1613 | if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) |
1345 | return xen_hvm_config(vcpu, data); | 1614 | return xen_hvm_config(vcpu, data); |
@@ -1522,6 +1791,20 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) | |||
1522 | case 0xcd: /* fsb frequency */ | 1791 | case 0xcd: /* fsb frequency */ |
1523 | data = 3; | 1792 | data = 3; |
1524 | break; | 1793 | break; |
1794 | /* | ||
1795 | * MSR_EBC_FREQUENCY_ID | ||
1796 | * Conservative value valid for even the basic CPU models. | ||
1797 | * Models 0,1: 000 in bits 23:21 indicating a bus speed of | ||
1798 | * 100MHz; model 2: 000 in bits 18:16 indicating 100MHz, | ||
1799 | * and 266MHz for models 3 and 4. Set the Core Clock | ||
1800 | * Frequency to System Bus Frequency Ratio to 1 (bits | ||
1801 | * 31:24) even though these are only valid for CPU | ||
1802 | * models > 2, because guests may otherwise end up dividing | ||
1803 | * or multiplying by zero. | ||
1804 | */ | ||
1805 | case MSR_EBC_FREQUENCY_ID: | ||
1806 | data = 1 << 24; | ||
1807 | break; | ||
1525 | case MSR_IA32_APICBASE: | 1808 | case MSR_IA32_APICBASE: |
1526 | data = kvm_get_apic_base(vcpu); | 1809 | data = kvm_get_apic_base(vcpu); |
1527 | break; | 1810 | break; |
@@ -1548,6 +1831,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) | |||
1548 | case MSR_KVM_SYSTEM_TIME_NEW: | 1831 | case MSR_KVM_SYSTEM_TIME_NEW: |
1549 | data = vcpu->arch.time; | 1832 | data = vcpu->arch.time; |
1550 | break; | 1833 | break; |
1834 | case MSR_KVM_ASYNC_PF_EN: | ||
1835 | data = vcpu->arch.apf.msr_val; | ||
1836 | break; | ||
1551 | case MSR_IA32_P5_MC_ADDR: | 1837 | case MSR_IA32_P5_MC_ADDR: |
1552 | case MSR_IA32_P5_MC_TYPE: | 1838 | case MSR_IA32_P5_MC_TYPE: |
1553 | case MSR_IA32_MCG_CAP: | 1839 | case MSR_IA32_MCG_CAP: |
@@ -1555,6 +1841,18 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) | |||
1555 | case MSR_IA32_MCG_STATUS: | 1841 | case MSR_IA32_MCG_STATUS: |
1556 | case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: | 1842 | case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: |
1557 | return get_msr_mce(vcpu, msr, pdata); | 1843 | return get_msr_mce(vcpu, msr, pdata); |
1844 | case MSR_K7_CLK_CTL: | ||
1845 | /* | ||
1846 | * Provide expected ramp-up count for K7. All other | ||
1847 | * are set to zero, indicating minimum divisors for | ||
1848 | * every field. | ||
1849 | * | ||
1850 | * This prevents guest kernels on AMD host with CPU | ||
1851 | * type 6, model 8 and higher from exploding due to | ||
1852 | * the rdmsr failing. | ||
1853 | */ | ||
1854 | data = 0x20000000; | ||
1855 | break; | ||
1558 | case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: | 1856 | case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: |
1559 | if (kvm_hv_msr_partition_wide(msr)) { | 1857 | if (kvm_hv_msr_partition_wide(msr)) { |
1560 | int r; | 1858 | int r; |
@@ -1565,6 +1863,19 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) | |||
1565 | } else | 1863 | } else |
1566 | return get_msr_hyperv(vcpu, msr, pdata); | 1864 | return get_msr_hyperv(vcpu, msr, pdata); |
1567 | break; | 1865 | break; |
1866 | case MSR_IA32_BBL_CR_CTL3: | ||
1867 | /* This legacy MSR exists but isn't fully documented in current | ||
1868 | * silicon. It is however accessed by winxp in very narrow | ||
1869 | * scenarios where it sets bit #19, itself documented as | ||
1870 | * a "reserved" bit. Best effort attempt to source coherent | ||
1871 | * read data here should the balance of the register be | ||
1872 | * interpreted by the guest: | ||
1873 | * | ||
1874 | * L2 cache control register 3: 64GB range, 256KB size, | ||
1875 | * enabled, latency 0x1, configured | ||
1876 | */ | ||
1877 | data = 0xbe702111; | ||
1878 | break; | ||
1568 | default: | 1879 | default: |
1569 | if (!ignore_msrs) { | 1880 | if (!ignore_msrs) { |
1570 | pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); | 1881 | pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); |
@@ -1665,6 +1976,7 @@ int kvm_dev_ioctl_check_extension(long ext) | |||
1665 | case KVM_CAP_NOP_IO_DELAY: | 1976 | case KVM_CAP_NOP_IO_DELAY: |
1666 | case KVM_CAP_MP_STATE: | 1977 | case KVM_CAP_MP_STATE: |
1667 | case KVM_CAP_SYNC_MMU: | 1978 | case KVM_CAP_SYNC_MMU: |
1979 | case KVM_CAP_USER_NMI: | ||
1668 | case KVM_CAP_REINJECT_CONTROL: | 1980 | case KVM_CAP_REINJECT_CONTROL: |
1669 | case KVM_CAP_IRQ_INJECT_STATUS: | 1981 | case KVM_CAP_IRQ_INJECT_STATUS: |
1670 | case KVM_CAP_ASSIGN_DEV_IRQ: | 1982 | case KVM_CAP_ASSIGN_DEV_IRQ: |
@@ -1683,6 +1995,8 @@ int kvm_dev_ioctl_check_extension(long ext) | |||
1683 | case KVM_CAP_DEBUGREGS: | 1995 | case KVM_CAP_DEBUGREGS: |
1684 | case KVM_CAP_X86_ROBUST_SINGLESTEP: | 1996 | case KVM_CAP_X86_ROBUST_SINGLESTEP: |
1685 | case KVM_CAP_XSAVE: | 1997 | case KVM_CAP_XSAVE: |
1998 | case KVM_CAP_ASYNC_PF: | ||
1999 | case KVM_CAP_GET_TSC_KHZ: | ||
1686 | r = 1; | 2000 | r = 1; |
1687 | break; | 2001 | break; |
1688 | case KVM_CAP_COALESCED_MMIO: | 2002 | case KVM_CAP_COALESCED_MMIO: |
@@ -1709,6 +2023,9 @@ int kvm_dev_ioctl_check_extension(long ext) | |||
1709 | case KVM_CAP_XCRS: | 2023 | case KVM_CAP_XCRS: |
1710 | r = cpu_has_xsave; | 2024 | r = cpu_has_xsave; |
1711 | break; | 2025 | break; |
2026 | case KVM_CAP_TSC_CONTROL: | ||
2027 | r = kvm_has_tsc_control; | ||
2028 | break; | ||
1712 | default: | 2029 | default: |
1713 | r = 0; | 2030 | r = 0; |
1714 | break; | 2031 | break; |
@@ -1808,19 +2125,33 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) | |||
1808 | } | 2125 | } |
1809 | 2126 | ||
1810 | kvm_x86_ops->vcpu_load(vcpu, cpu); | 2127 | kvm_x86_ops->vcpu_load(vcpu, cpu); |
1811 | if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) { | 2128 | if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) { |
1812 | unsigned long khz = cpufreq_quick_get(cpu); | 2129 | /* Make sure TSC doesn't go backwards */ |
1813 | if (!khz) | 2130 | s64 tsc_delta; |
1814 | khz = tsc_khz; | 2131 | u64 tsc; |
1815 | per_cpu(cpu_tsc_khz, cpu) = khz; | 2132 | |
2133 | kvm_get_msr(vcpu, MSR_IA32_TSC, &tsc); | ||
2134 | tsc_delta = !vcpu->arch.last_guest_tsc ? 0 : | ||
2135 | tsc - vcpu->arch.last_guest_tsc; | ||
2136 | |||
2137 | if (tsc_delta < 0) | ||
2138 | mark_tsc_unstable("KVM discovered backwards TSC"); | ||
2139 | if (check_tsc_unstable()) { | ||
2140 | kvm_x86_ops->adjust_tsc_offset(vcpu, -tsc_delta); | ||
2141 | vcpu->arch.tsc_catchup = 1; | ||
2142 | } | ||
2143 | kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); | ||
2144 | if (vcpu->cpu != cpu) | ||
2145 | kvm_migrate_timers(vcpu); | ||
2146 | vcpu->cpu = cpu; | ||
1816 | } | 2147 | } |
1817 | kvm_request_guest_time_update(vcpu); | ||
1818 | } | 2148 | } |
1819 | 2149 | ||
1820 | void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) | 2150 | void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) |
1821 | { | 2151 | { |
1822 | kvm_x86_ops->vcpu_put(vcpu); | 2152 | kvm_x86_ops->vcpu_put(vcpu); |
1823 | kvm_put_guest_fpu(vcpu); | 2153 | kvm_put_guest_fpu(vcpu); |
2154 | kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc); | ||
1824 | } | 2155 | } |
1825 | 2156 | ||
1826 | static int is_efer_nx(void) | 2157 | static int is_efer_nx(void) |
@@ -1937,6 +2268,11 @@ out: | |||
1937 | return r; | 2268 | return r; |
1938 | } | 2269 | } |
1939 | 2270 | ||
2271 | static void cpuid_mask(u32 *word, int wordnum) | ||
2272 | { | ||
2273 | *word &= boot_cpu_data.x86_capability[wordnum]; | ||
2274 | } | ||
2275 | |||
1940 | static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, | 2276 | static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, |
1941 | u32 index) | 2277 | u32 index) |
1942 | { | 2278 | { |
@@ -1991,13 +2327,20 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, | |||
1991 | 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ | | 2327 | 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ | |
1992 | 0 /* Reserved, DCA */ | F(XMM4_1) | | 2328 | 0 /* Reserved, DCA */ | F(XMM4_1) | |
1993 | F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | | 2329 | F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | |
1994 | 0 /* Reserved, AES */ | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX); | 2330 | 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) | |
2331 | F(F16C); | ||
1995 | /* cpuid 0x80000001.ecx */ | 2332 | /* cpuid 0x80000001.ecx */ |
1996 | const u32 kvm_supported_word6_x86_features = | 2333 | const u32 kvm_supported_word6_x86_features = |
1997 | F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ | | 2334 | F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ | |
1998 | F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | | 2335 | F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | |
1999 | F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(SSE5) | | 2336 | F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) | |
2000 | 0 /* SKINIT */ | 0 /* WDT */; | 2337 | 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM); |
2338 | |||
2339 | /* cpuid 0xC0000001.edx */ | ||
2340 | const u32 kvm_supported_word5_x86_features = | ||
2341 | F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) | | ||
2342 | F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) | | ||
2343 | F(PMM) | F(PMM_EN); | ||
2001 | 2344 | ||
2002 | /* all calls to cpuid_count() should be made on the same cpu */ | 2345 | /* all calls to cpuid_count() should be made on the same cpu */ |
2003 | get_cpu(); | 2346 | get_cpu(); |
@@ -2010,7 +2353,9 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, | |||
2010 | break; | 2353 | break; |
2011 | case 1: | 2354 | case 1: |
2012 | entry->edx &= kvm_supported_word0_x86_features; | 2355 | entry->edx &= kvm_supported_word0_x86_features; |
2356 | cpuid_mask(&entry->edx, 0); | ||
2013 | entry->ecx &= kvm_supported_word4_x86_features; | 2357 | entry->ecx &= kvm_supported_word4_x86_features; |
2358 | cpuid_mask(&entry->ecx, 4); | ||
2014 | /* we support x2apic emulation even if host does not support | 2359 | /* we support x2apic emulation even if host does not support |
2015 | * it since we emulate x2apic in software */ | 2360 | * it since we emulate x2apic in software */ |
2016 | entry->ecx |= F(X2APIC); | 2361 | entry->ecx |= F(X2APIC); |
@@ -2068,9 +2413,9 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, | |||
2068 | int i; | 2413 | int i; |
2069 | 2414 | ||
2070 | entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; | 2415 | entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; |
2071 | for (i = 1; *nent < maxnent; ++i) { | 2416 | for (i = 1; *nent < maxnent && i < 64; ++i) { |
2072 | if (entry[i - 1].eax == 0 && i != 2) | 2417 | if (entry[i].eax == 0) |
2073 | break; | 2418 | continue; |
2074 | do_cpuid_1_ent(&entry[i], function, i); | 2419 | do_cpuid_1_ent(&entry[i], function, i); |
2075 | entry[i].flags |= | 2420 | entry[i].flags |= |
2076 | KVM_CPUID_FLAG_SIGNIFCANT_INDEX; | 2421 | KVM_CPUID_FLAG_SIGNIFCANT_INDEX; |
@@ -2091,6 +2436,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, | |||
2091 | entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) | | 2436 | entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) | |
2092 | (1 << KVM_FEATURE_NOP_IO_DELAY) | | 2437 | (1 << KVM_FEATURE_NOP_IO_DELAY) | |
2093 | (1 << KVM_FEATURE_CLOCKSOURCE2) | | 2438 | (1 << KVM_FEATURE_CLOCKSOURCE2) | |
2439 | (1 << KVM_FEATURE_ASYNC_PF) | | ||
2094 | (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT); | 2440 | (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT); |
2095 | entry->ebx = 0; | 2441 | entry->ebx = 0; |
2096 | entry->ecx = 0; | 2442 | entry->ecx = 0; |
@@ -2101,7 +2447,23 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, | |||
2101 | break; | 2447 | break; |
2102 | case 0x80000001: | 2448 | case 0x80000001: |
2103 | entry->edx &= kvm_supported_word1_x86_features; | 2449 | entry->edx &= kvm_supported_word1_x86_features; |
2450 | cpuid_mask(&entry->edx, 1); | ||
2104 | entry->ecx &= kvm_supported_word6_x86_features; | 2451 | entry->ecx &= kvm_supported_word6_x86_features; |
2452 | cpuid_mask(&entry->ecx, 6); | ||
2453 | break; | ||
2454 | /* Add support for Centaur's CPUID instruction */ | ||
2455 | case 0xC0000000: | ||
2456 | /* Just support up to 0xC0000004 now */ | ||
2457 | entry->eax = min(entry->eax, 0xC0000004); | ||
2458 | break; | ||
2459 | case 0xC0000001: | ||
2460 | entry->edx &= kvm_supported_word5_x86_features; | ||
2461 | cpuid_mask(&entry->edx, 5); | ||
2462 | break; | ||
2463 | case 0xC0000002: | ||
2464 | case 0xC0000003: | ||
2465 | case 0xC0000004: | ||
2466 | /* Now nothing to do, reserved for the future */ | ||
2105 | break; | 2467 | break; |
2106 | } | 2468 | } |
2107 | 2469 | ||
@@ -2149,6 +2511,26 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, | |||
2149 | if (nent >= cpuid->nent) | 2511 | if (nent >= cpuid->nent) |
2150 | goto out_free; | 2512 | goto out_free; |
2151 | 2513 | ||
2514 | /* Add support for Centaur's CPUID instruction. */ | ||
2515 | if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR) { | ||
2516 | do_cpuid_ent(&cpuid_entries[nent], 0xC0000000, 0, | ||
2517 | &nent, cpuid->nent); | ||
2518 | |||
2519 | r = -E2BIG; | ||
2520 | if (nent >= cpuid->nent) | ||
2521 | goto out_free; | ||
2522 | |||
2523 | limit = cpuid_entries[nent - 1].eax; | ||
2524 | for (func = 0xC0000001; | ||
2525 | func <= limit && nent < cpuid->nent; ++func) | ||
2526 | do_cpuid_ent(&cpuid_entries[nent], func, 0, | ||
2527 | &nent, cpuid->nent); | ||
2528 | |||
2529 | r = -E2BIG; | ||
2530 | if (nent >= cpuid->nent) | ||
2531 | goto out_free; | ||
2532 | } | ||
2533 | |||
2152 | do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_SIGNATURE, 0, &nent, | 2534 | do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_SIGNATURE, 0, &nent, |
2153 | cpuid->nent); | 2535 | cpuid->nent); |
2154 | 2536 | ||
@@ -2203,6 +2585,7 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, | |||
2203 | return -ENXIO; | 2585 | return -ENXIO; |
2204 | 2586 | ||
2205 | kvm_queue_interrupt(vcpu, irq->irq, false); | 2587 | kvm_queue_interrupt(vcpu, irq->irq, false); |
2588 | kvm_make_request(KVM_REQ_EVENT, vcpu); | ||
2206 | 2589 | ||
2207 | return 0; | 2590 | return 0; |
2208 | } | 2591 | } |
@@ -2272,9 +2655,6 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, | |||
2272 | if (mce->status & MCI_STATUS_UC) { | 2655 | if (mce->status & MCI_STATUS_UC) { |
2273 | if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) || | 2656 | if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) || |
2274 | !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) { | 2657 | !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) { |
2275 | printk(KERN_DEBUG "kvm: set_mce: " | ||
2276 | "injects mce exception while " | ||
2277 | "previous one is in progress!\n"); | ||
2278 | kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); | 2658 | kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); |
2279 | return 0; | 2659 | return 0; |
2280 | } | 2660 | } |
@@ -2305,6 +2685,7 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, | |||
2305 | !kvm_exception_is_soft(vcpu->arch.exception.nr); | 2685 | !kvm_exception_is_soft(vcpu->arch.exception.nr); |
2306 | events->exception.nr = vcpu->arch.exception.nr; | 2686 | events->exception.nr = vcpu->arch.exception.nr; |
2307 | events->exception.has_error_code = vcpu->arch.exception.has_error_code; | 2687 | events->exception.has_error_code = vcpu->arch.exception.has_error_code; |
2688 | events->exception.pad = 0; | ||
2308 | events->exception.error_code = vcpu->arch.exception.error_code; | 2689 | events->exception.error_code = vcpu->arch.exception.error_code; |
2309 | 2690 | ||
2310 | events->interrupt.injected = | 2691 | events->interrupt.injected = |
@@ -2318,12 +2699,14 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, | |||
2318 | events->nmi.injected = vcpu->arch.nmi_injected; | 2699 | events->nmi.injected = vcpu->arch.nmi_injected; |
2319 | events->nmi.pending = vcpu->arch.nmi_pending; | 2700 | events->nmi.pending = vcpu->arch.nmi_pending; |
2320 | events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu); | 2701 | events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu); |
2702 | events->nmi.pad = 0; | ||
2321 | 2703 | ||
2322 | events->sipi_vector = vcpu->arch.sipi_vector; | 2704 | events->sipi_vector = vcpu->arch.sipi_vector; |
2323 | 2705 | ||
2324 | events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING | 2706 | events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING |
2325 | | KVM_VCPUEVENT_VALID_SIPI_VECTOR | 2707 | | KVM_VCPUEVENT_VALID_SIPI_VECTOR |
2326 | | KVM_VCPUEVENT_VALID_SHADOW); | 2708 | | KVM_VCPUEVENT_VALID_SHADOW); |
2709 | memset(&events->reserved, 0, sizeof(events->reserved)); | ||
2327 | } | 2710 | } |
2328 | 2711 | ||
2329 | static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, | 2712 | static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, |
@@ -2342,8 +2725,6 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, | |||
2342 | vcpu->arch.interrupt.pending = events->interrupt.injected; | 2725 | vcpu->arch.interrupt.pending = events->interrupt.injected; |
2343 | vcpu->arch.interrupt.nr = events->interrupt.nr; | 2726 | vcpu->arch.interrupt.nr = events->interrupt.nr; |
2344 | vcpu->arch.interrupt.soft = events->interrupt.soft; | 2727 | vcpu->arch.interrupt.soft = events->interrupt.soft; |
2345 | if (vcpu->arch.interrupt.pending && irqchip_in_kernel(vcpu->kvm)) | ||
2346 | kvm_pic_clear_isr_ack(vcpu->kvm); | ||
2347 | if (events->flags & KVM_VCPUEVENT_VALID_SHADOW) | 2728 | if (events->flags & KVM_VCPUEVENT_VALID_SHADOW) |
2348 | kvm_x86_ops->set_interrupt_shadow(vcpu, | 2729 | kvm_x86_ops->set_interrupt_shadow(vcpu, |
2349 | events->interrupt.shadow); | 2730 | events->interrupt.shadow); |
@@ -2356,6 +2737,8 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, | |||
2356 | if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR) | 2737 | if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR) |
2357 | vcpu->arch.sipi_vector = events->sipi_vector; | 2738 | vcpu->arch.sipi_vector = events->sipi_vector; |
2358 | 2739 | ||
2740 | kvm_make_request(KVM_REQ_EVENT, vcpu); | ||
2741 | |||
2359 | return 0; | 2742 | return 0; |
2360 | } | 2743 | } |
2361 | 2744 | ||
@@ -2366,6 +2749,7 @@ static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu, | |||
2366 | dbgregs->dr6 = vcpu->arch.dr6; | 2749 | dbgregs->dr6 = vcpu->arch.dr6; |
2367 | dbgregs->dr7 = vcpu->arch.dr7; | 2750 | dbgregs->dr7 = vcpu->arch.dr7; |
2368 | dbgregs->flags = 0; | 2751 | dbgregs->flags = 0; |
2752 | memset(&dbgregs->reserved, 0, sizeof(dbgregs->reserved)); | ||
2369 | } | 2753 | } |
2370 | 2754 | ||
2371 | static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, | 2755 | static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, |
@@ -2715,6 +3099,32 @@ long kvm_arch_vcpu_ioctl(struct file *filp, | |||
2715 | r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs); | 3099 | r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs); |
2716 | break; | 3100 | break; |
2717 | } | 3101 | } |
3102 | case KVM_SET_TSC_KHZ: { | ||
3103 | u32 user_tsc_khz; | ||
3104 | |||
3105 | r = -EINVAL; | ||
3106 | if (!kvm_has_tsc_control) | ||
3107 | break; | ||
3108 | |||
3109 | user_tsc_khz = (u32)arg; | ||
3110 | |||
3111 | if (user_tsc_khz >= kvm_max_guest_tsc_khz) | ||
3112 | goto out; | ||
3113 | |||
3114 | kvm_x86_ops->set_tsc_khz(vcpu, user_tsc_khz); | ||
3115 | |||
3116 | r = 0; | ||
3117 | goto out; | ||
3118 | } | ||
3119 | case KVM_GET_TSC_KHZ: { | ||
3120 | r = -EIO; | ||
3121 | if (check_tsc_unstable()) | ||
3122 | goto out; | ||
3123 | |||
3124 | r = vcpu_tsc_khz(vcpu); | ||
3125 | |||
3126 | goto out; | ||
3127 | } | ||
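The two new vcpu ioctls above are driven from userspace as plain ioctl() calls; a hedged sketch, assuming headers that define KVM_CAP_TSC_CONTROL and KVM_SET_TSC_KHZ and a vcpu fd obtained via KVM_CREATE_VCPU:

#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Sketch: cap a vCPU's guest-visible TSC rate, if the host supports it.
 * kvm_fd is the /dev/kvm fd, vcpu_fd the per-vCPU fd. */
static int cap_guest_tsc_khz(int kvm_fd, int vcpu_fd, unsigned int khz)
{
        if (ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_TSC_CONTROL) <= 0)
                return -1;              /* hardware TSC scaling unavailable */
        return ioctl(vcpu_fd, KVM_SET_TSC_KHZ, khz);
}

KVM_GET_TSC_KHZ on the same vcpu fd returns the effective rate, or -EIO when the host TSC is unstable, matching the handler above.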
2718 | default: | 3128 | default: |
2719 | r = -EINVAL; | 3129 | r = -EINVAL; |
2720 | } | 3130 | } |
@@ -2759,7 +3169,7 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, | |||
2759 | 3169 | ||
2760 | static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) | 3170 | static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) |
2761 | { | 3171 | { |
2762 | return kvm->arch.n_alloc_mmu_pages; | 3172 | return kvm->arch.n_max_mmu_pages; |
2763 | } | 3173 | } |
2764 | 3174 | ||
2765 | static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) | 3175 | static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) |
@@ -2795,18 +3205,18 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) | |||
2795 | r = 0; | 3205 | r = 0; |
2796 | switch (chip->chip_id) { | 3206 | switch (chip->chip_id) { |
2797 | case KVM_IRQCHIP_PIC_MASTER: | 3207 | case KVM_IRQCHIP_PIC_MASTER: |
2798 | raw_spin_lock(&pic_irqchip(kvm)->lock); | 3208 | spin_lock(&pic_irqchip(kvm)->lock); |
2799 | memcpy(&pic_irqchip(kvm)->pics[0], | 3209 | memcpy(&pic_irqchip(kvm)->pics[0], |
2800 | &chip->chip.pic, | 3210 | &chip->chip.pic, |
2801 | sizeof(struct kvm_pic_state)); | 3211 | sizeof(struct kvm_pic_state)); |
2802 | raw_spin_unlock(&pic_irqchip(kvm)->lock); | 3212 | spin_unlock(&pic_irqchip(kvm)->lock); |
2803 | break; | 3213 | break; |
2804 | case KVM_IRQCHIP_PIC_SLAVE: | 3214 | case KVM_IRQCHIP_PIC_SLAVE: |
2805 | raw_spin_lock(&pic_irqchip(kvm)->lock); | 3215 | spin_lock(&pic_irqchip(kvm)->lock); |
2806 | memcpy(&pic_irqchip(kvm)->pics[1], | 3216 | memcpy(&pic_irqchip(kvm)->pics[1], |
2807 | &chip->chip.pic, | 3217 | &chip->chip.pic, |
2808 | sizeof(struct kvm_pic_state)); | 3218 | sizeof(struct kvm_pic_state)); |
2809 | raw_spin_unlock(&pic_irqchip(kvm)->lock); | 3219 | spin_unlock(&pic_irqchip(kvm)->lock); |
2810 | break; | 3220 | break; |
2811 | case KVM_IRQCHIP_IOAPIC: | 3221 | case KVM_IRQCHIP_IOAPIC: |
2812 | r = kvm_set_ioapic(kvm, &chip->chip.ioapic); | 3222 | r = kvm_set_ioapic(kvm, &chip->chip.ioapic); |
@@ -2849,6 +3259,7 @@ static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) | |||
2849 | sizeof(ps->channels)); | 3259 | sizeof(ps->channels)); |
2850 | ps->flags = kvm->arch.vpit->pit_state.flags; | 3260 | ps->flags = kvm->arch.vpit->pit_state.flags; |
2851 | mutex_unlock(&kvm->arch.vpit->pit_state.lock); | 3261 | mutex_unlock(&kvm->arch.vpit->pit_state.lock); |
3262 | memset(&ps->reserved, 0, sizeof(ps->reserved)); | ||
2852 | return r; | 3263 | return r; |
2853 | } | 3264 | } |
2854 | 3265 | ||
@@ -2912,24 +3323,18 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, | |||
2912 | struct kvm_memslots *slots, *old_slots; | 3323 | struct kvm_memslots *slots, *old_slots; |
2913 | unsigned long *dirty_bitmap; | 3324 | unsigned long *dirty_bitmap; |
2914 | 3325 | ||
2915 | spin_lock(&kvm->mmu_lock); | 3326 | dirty_bitmap = memslot->dirty_bitmap_head; |
2916 | kvm_mmu_slot_remove_write_access(kvm, log->slot); | 3327 | if (memslot->dirty_bitmap == dirty_bitmap) |
2917 | spin_unlock(&kvm->mmu_lock); | 3328 | dirty_bitmap += n / sizeof(long); |
2918 | |||
2919 | r = -ENOMEM; | ||
2920 | dirty_bitmap = vmalloc(n); | ||
2921 | if (!dirty_bitmap) | ||
2922 | goto out; | ||
2923 | memset(dirty_bitmap, 0, n); | 3329 | memset(dirty_bitmap, 0, n); |
2924 | 3330 | ||
2925 | r = -ENOMEM; | 3331 | r = -ENOMEM; |
2926 | slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); | 3332 | slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); |
2927 | if (!slots) { | 3333 | if (!slots) |
2928 | vfree(dirty_bitmap); | ||
2929 | goto out; | 3334 | goto out; |
2930 | } | ||
2931 | memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); | 3335 | memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); |
2932 | slots->memslots[log->slot].dirty_bitmap = dirty_bitmap; | 3336 | slots->memslots[log->slot].dirty_bitmap = dirty_bitmap; |
3337 | slots->generation++; | ||
2933 | 3338 | ||
2934 | old_slots = kvm->memslots; | 3339 | old_slots = kvm->memslots; |
2935 | rcu_assign_pointer(kvm->memslots, slots); | 3340 | rcu_assign_pointer(kvm->memslots, slots); |
@@ -2937,12 +3342,13 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, | |||
2937 | dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap; | 3342 | dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap; |
2938 | kfree(old_slots); | 3343 | kfree(old_slots); |
2939 | 3344 | ||
3345 | spin_lock(&kvm->mmu_lock); | ||
3346 | kvm_mmu_slot_remove_write_access(kvm, log->slot); | ||
3347 | spin_unlock(&kvm->mmu_lock); | ||
3348 | |||
2940 | r = -EFAULT; | 3349 | r = -EFAULT; |
2941 | if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) { | 3350 | if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) |
2942 | vfree(dirty_bitmap); | ||
2943 | goto out; | 3351 | goto out; |
2944 | } | ||
2945 | vfree(dirty_bitmap); | ||
2946 | } else { | 3352 | } else { |
2947 | r = -EFAULT; | 3353 | r = -EFAULT; |
2948 | if (clear_user(log->dirty_bitmap, n)) | 3354 | if (clear_user(log->dirty_bitmap, n)) |
@@ -3009,8 +3415,10 @@ long kvm_arch_vm_ioctl(struct file *filp, | |||
3009 | if (vpic) { | 3415 | if (vpic) { |
3010 | r = kvm_ioapic_init(kvm); | 3416 | r = kvm_ioapic_init(kvm); |
3011 | if (r) { | 3417 | if (r) { |
3418 | mutex_lock(&kvm->slots_lock); | ||
3012 | kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, | 3419 | kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, |
3013 | &vpic->dev); | 3420 | &vpic->dev); |
3421 | mutex_unlock(&kvm->slots_lock); | ||
3014 | kfree(vpic); | 3422 | kfree(vpic); |
3015 | goto create_irqchip_unlock; | 3423 | goto create_irqchip_unlock; |
3016 | } | 3424 | } |
@@ -3021,10 +3429,12 @@ long kvm_arch_vm_ioctl(struct file *filp, | |||
3021 | smp_wmb(); | 3429 | smp_wmb(); |
3022 | r = kvm_setup_default_irq_routing(kvm); | 3430 | r = kvm_setup_default_irq_routing(kvm); |
3023 | if (r) { | 3431 | if (r) { |
3432 | mutex_lock(&kvm->slots_lock); | ||
3024 | mutex_lock(&kvm->irq_lock); | 3433 | mutex_lock(&kvm->irq_lock); |
3025 | kvm_ioapic_destroy(kvm); | 3434 | kvm_ioapic_destroy(kvm); |
3026 | kvm_destroy_pic(kvm); | 3435 | kvm_destroy_pic(kvm); |
3027 | mutex_unlock(&kvm->irq_lock); | 3436 | mutex_unlock(&kvm->irq_lock); |
3437 | mutex_unlock(&kvm->slots_lock); | ||
3028 | } | 3438 | } |
3029 | create_irqchip_unlock: | 3439 | create_irqchip_unlock: |
3030 | mutex_unlock(&kvm->lock); | 3440 | mutex_unlock(&kvm->lock); |
@@ -3200,7 +3610,6 @@ long kvm_arch_vm_ioctl(struct file *filp, | |||
3200 | break; | 3610 | break; |
3201 | } | 3611 | } |
3202 | case KVM_SET_CLOCK: { | 3612 | case KVM_SET_CLOCK: { |
3203 | struct timespec now; | ||
3204 | struct kvm_clock_data user_ns; | 3613 | struct kvm_clock_data user_ns; |
3205 | u64 now_ns; | 3614 | u64 now_ns; |
3206 | s64 delta; | 3615 | s64 delta; |
@@ -3214,21 +3623,23 @@ long kvm_arch_vm_ioctl(struct file *filp, | |||
3214 | goto out; | 3623 | goto out; |
3215 | 3624 | ||
3216 | r = 0; | 3625 | r = 0; |
3217 | ktime_get_ts(&now); | 3626 | local_irq_disable(); |
3218 | now_ns = timespec_to_ns(&now); | 3627 | now_ns = get_kernel_ns(); |
3219 | delta = user_ns.clock - now_ns; | 3628 | delta = user_ns.clock - now_ns; |
3629 | local_irq_enable(); | ||
3220 | kvm->arch.kvmclock_offset = delta; | 3630 | kvm->arch.kvmclock_offset = delta; |
3221 | break; | 3631 | break; |
3222 | } | 3632 | } |
3223 | case KVM_GET_CLOCK: { | 3633 | case KVM_GET_CLOCK: { |
3224 | struct timespec now; | ||
3225 | struct kvm_clock_data user_ns; | 3634 | struct kvm_clock_data user_ns; |
3226 | u64 now_ns; | 3635 | u64 now_ns; |
3227 | 3636 | ||
3228 | ktime_get_ts(&now); | 3637 | local_irq_disable(); |
3229 | now_ns = timespec_to_ns(&now); | 3638 | now_ns = get_kernel_ns(); |
3230 | user_ns.clock = kvm->arch.kvmclock_offset + now_ns; | 3639 | user_ns.clock = kvm->arch.kvmclock_offset + now_ns; |
3640 | local_irq_enable(); | ||
3231 | user_ns.flags = 0; | 3641 | user_ns.flags = 0; |
3642 | memset(&user_ns.pad, 0, sizeof(user_ns.pad)); | ||
3232 | 3643 | ||
3233 | r = -EFAULT; | 3644 | r = -EFAULT; |
3234 | if (copy_to_user(argp, &user_ns, sizeof(user_ns))) | 3645 | if (copy_to_user(argp, &user_ns, sizeof(user_ns))) |
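KVM_SET_CLOCK above boils down to storing an offset, kvmclock_offset = requested - now, and KVM_GET_CLOCK reports now + offset, with interrupts disabled around the read so the timestamp and the offset refer to the same instant. The same bookkeeping in a standalone sketch, with CLOCK_MONOTONIC standing in for get_kernel_ns(); the helper names are made up for the illustration.

#define _POSIX_C_SOURCE 199309L	/* for clock_gettime() */
#include <stdio.h>
#include <stdint.h>
#include <time.h>

static int64_t clock_offset_ns;	/* plays the role of kvm->arch.kvmclock_offset */

static uint64_t now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ull + ts.tv_nsec;
}

/* KVM_SET_CLOCK analogue: remember how far the requested clock is from ours. */
static void set_guest_clock(uint64_t requested)
{
	clock_offset_ns = (int64_t)(requested - now_ns());
}

/* KVM_GET_CLOCK analogue: report our clock shifted by the stored offset. */
static uint64_t get_guest_clock(void)
{
	return now_ns() + clock_offset_ns;
}

int main(void)
{
	set_guest_clock(1000000000ull);	/* guest believes it is at t = 1s */
	printf("guest clock now: %llu ns\n",
	       (unsigned long long)get_guest_clock());
	return 0;
}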
@@ -3263,20 +3674,43 @@ static void kvm_init_msr_list(void) | |||
3263 | static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len, | 3674 | static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len, |
3264 | const void *v) | 3675 | const void *v) |
3265 | { | 3676 | { |
3266 | if (vcpu->arch.apic && | 3677 | int handled = 0; |
3267 | !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v)) | 3678 | int n; |
3268 | return 0; | 3679 | |
3680 | do { | ||
3681 | n = min(len, 8); | ||
3682 | if (!(vcpu->arch.apic && | ||
3683 | !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, n, v)) | ||
3684 | && kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, n, v)) | ||
3685 | break; | ||
3686 | handled += n; | ||
3687 | addr += n; | ||
3688 | len -= n; | ||
3689 | v += n; | ||
3690 | } while (len); | ||
3269 | 3691 | ||
3270 | return kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, len, v); | 3692 | return handled; |
3271 | } | 3693 | } |
3272 | 3694 | ||
3273 | static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) | 3695 | static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) |
3274 | { | 3696 | { |
3275 | if (vcpu->arch.apic && | 3697 | int handled = 0; |
3276 | !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, len, v)) | 3698 | int n; |
3277 | return 0; | 3699 | |
3700 | do { | ||
3701 | n = min(len, 8); | ||
3702 | if (!(vcpu->arch.apic && | ||
3703 | !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, n, v)) | ||
3704 | && kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, n, v)) | ||
3705 | break; | ||
3706 | trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, *(u64 *)v); | ||
3707 | handled += n; | ||
3708 | addr += n; | ||
3709 | len -= n; | ||
3710 | v += n; | ||
3711 | } while (len); | ||
3278 | 3712 | ||
3279 | return kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, len, v); | 3713 | return handled; |
3280 | } | 3714 | } |
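vcpu_mmio_write() and vcpu_mmio_read() above now walk a request in pieces of at most eight bytes and return how many bytes the in-kernel devices actually handled, so the caller only hands the unhandled tail to userspace. The loop shape on its own, with handle_chunk() standing in for the APIC/io-bus dispatch (everything here is illustrative, not kernel API):

#include <stdio.h>
#include <string.h>

/* Pretend only the first 12 bytes of the region are backed by a device. */
static int handle_chunk(unsigned long addr, int len, const void *v)
{
	(void)v;
	return addr + len <= 12;	/* 1 = handled, 0 = not ours */
}

static int dispatch(unsigned long addr, int len, const void *v)
{
	int handled = 0;

	do {
		int n = len < 8 ? len : 8;

		if (!handle_chunk(addr, n, v))
			break;		/* stop at the first miss */
		handled += n;
		addr += n;
		len -= n;
		v = (const char *)v + n;
	} while (len);

	return handled;			/* may be less than the original len */
}

int main(void)
{
	char buf[20];

	memset(buf, 0, sizeof(buf));
	printf("handled %d of 20 bytes\n", dispatch(0, 20, buf));
	return 0;
}

Eight bytes is the natural chunk because the mmio data buffer in struct kvm_run holds eight bytes per exit, which is also why the hunks below cap run->mmio.len at min(mmio_size, 8).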
3281 | 3715 | ||
3282 | static void kvm_set_segment(struct kvm_vcpu *vcpu, | 3716 | static void kvm_set_segment(struct kvm_vcpu *vcpu, |
@@ -3291,49 +3725,71 @@ void kvm_get_segment(struct kvm_vcpu *vcpu, | |||
3291 | kvm_x86_ops->get_segment(vcpu, var, seg); | 3725 | kvm_x86_ops->get_segment(vcpu, var, seg); |
3292 | } | 3726 | } |
3293 | 3727 | ||
3294 | gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) | 3728 | static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access) |
3729 | { | ||
3730 | return gpa; | ||
3731 | } | ||
3732 | |||
3733 | static gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access) | ||
3734 | { | ||
3735 | gpa_t t_gpa; | ||
3736 | struct x86_exception exception; | ||
3737 | |||
3738 | BUG_ON(!mmu_is_nested(vcpu)); | ||
3739 | |||
3740 | /* NPT walks are always user-walks */ | ||
3741 | access |= PFERR_USER_MASK; | ||
3742 | t_gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, &exception); | ||
3743 | |||
3744 | return t_gpa; | ||
3745 | } | ||
3746 | |||
3747 | gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, | ||
3748 | struct x86_exception *exception) | ||
3295 | { | 3749 | { |
3296 | u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; | 3750 | u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; |
3297 | return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); | 3751 | return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); |
3298 | } | 3752 | } |
3299 | 3753 | ||
3300 | gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) | 3754 | gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, |
3755 | struct x86_exception *exception) | ||
3301 | { | 3756 | { |
3302 | u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; | 3757 | u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; |
3303 | access |= PFERR_FETCH_MASK; | 3758 | access |= PFERR_FETCH_MASK; |
3304 | return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); | 3759 | return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); |
3305 | } | 3760 | } |
3306 | 3761 | ||
3307 | gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) | 3762 | gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, |
3763 | struct x86_exception *exception) | ||
3308 | { | 3764 | { |
3309 | u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; | 3765 | u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; |
3310 | access |= PFERR_WRITE_MASK; | 3766 | access |= PFERR_WRITE_MASK; |
3311 | return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); | 3767 | return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); |
3312 | } | 3768 | } |
3313 | 3769 | ||
3314 | /* uses this to access any guest's mapped memory without checking CPL */ | 3770 | /* uses this to access any guest's mapped memory without checking CPL */ |
3315 | gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) | 3771 | gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, |
3772 | struct x86_exception *exception) | ||
3316 | { | 3773 | { |
3317 | return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, 0, error); | 3774 | return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, exception); |
3318 | } | 3775 | } |
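The four gva-to-gpa helpers above differ only in the page-fault error-code bits folded into the walk: a user bit when CPL is 3, plus a write or fetch bit depending on the caller, and nothing at all for the "system" variant. The composition is plain bit arithmetic; the mask values in this sketch follow the x86 page-fault error-code layout but should be read as illustrative rather than as the kernel's definitions.

#include <stdio.h>

#define PFERR_WRITE_MASK  (1u << 1)
#define PFERR_USER_MASK   (1u << 2)
#define PFERR_FETCH_MASK  (1u << 4)

enum access_kind { ACC_READ, ACC_WRITE, ACC_FETCH, ACC_SYSTEM };

static unsigned int build_access(int cpl, enum access_kind kind)
{
	unsigned int access = 0;

	if (kind == ACC_SYSTEM)		/* "system" walks skip the CPL check */
		return 0;
	if (cpl == 3)
		access |= PFERR_USER_MASK;
	if (kind == ACC_WRITE)
		access |= PFERR_WRITE_MASK;
	else if (kind == ACC_FETCH)
		access |= PFERR_FETCH_MASK;
	return access;
}

int main(void)
{
	printf("user write  : %#x\n", build_access(3, ACC_WRITE));
	printf("kernel fetch: %#x\n", build_access(0, ACC_FETCH));
	return 0;
}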
3319 | 3776 | ||
3320 | static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, | 3777 | static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, |
3321 | struct kvm_vcpu *vcpu, u32 access, | 3778 | struct kvm_vcpu *vcpu, u32 access, |
3322 | u32 *error) | 3779 | struct x86_exception *exception) |
3323 | { | 3780 | { |
3324 | void *data = val; | 3781 | void *data = val; |
3325 | int r = X86EMUL_CONTINUE; | 3782 | int r = X86EMUL_CONTINUE; |
3326 | 3783 | ||
3327 | while (bytes) { | 3784 | while (bytes) { |
3328 | gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr, access, error); | 3785 | gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access, |
3786 | exception); | ||
3329 | unsigned offset = addr & (PAGE_SIZE-1); | 3787 | unsigned offset = addr & (PAGE_SIZE-1); |
3330 | unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset); | 3788 | unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset); |
3331 | int ret; | 3789 | int ret; |
3332 | 3790 | ||
3333 | if (gpa == UNMAPPED_GVA) { | 3791 | if (gpa == UNMAPPED_GVA) |
3334 | r = X86EMUL_PROPAGATE_FAULT; | 3792 | return X86EMUL_PROPAGATE_FAULT; |
3335 | goto out; | ||
3336 | } | ||
3337 | ret = kvm_read_guest(vcpu->kvm, gpa, data, toread); | 3793 | ret = kvm_read_guest(vcpu->kvm, gpa, data, toread); |
3338 | if (ret < 0) { | 3794 | if (ret < 0) { |
3339 | r = X86EMUL_IO_NEEDED; | 3795 | r = X86EMUL_IO_NEEDED; |
@@ -3349,47 +3805,56 @@ out: | |||
3349 | } | 3805 | } |
3350 | 3806 | ||
3351 | /* used for instruction fetching */ | 3807 | /* used for instruction fetching */ |
3352 | static int kvm_fetch_guest_virt(gva_t addr, void *val, unsigned int bytes, | 3808 | static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt, |
3353 | struct kvm_vcpu *vcpu, u32 *error) | 3809 | gva_t addr, void *val, unsigned int bytes, |
3810 | struct x86_exception *exception) | ||
3354 | { | 3811 | { |
3812 | struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); | ||
3355 | u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; | 3813 | u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; |
3814 | |||
3356 | return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, | 3815 | return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, |
3357 | access | PFERR_FETCH_MASK, error); | 3816 | access | PFERR_FETCH_MASK, |
3817 | exception); | ||
3358 | } | 3818 | } |
3359 | 3819 | ||
3360 | static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, | 3820 | static int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt, |
3361 | struct kvm_vcpu *vcpu, u32 *error) | 3821 | gva_t addr, void *val, unsigned int bytes, |
3822 | struct x86_exception *exception) | ||
3362 | { | 3823 | { |
3824 | struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); | ||
3363 | u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; | 3825 | u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; |
3826 | |||
3364 | return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, | 3827 | return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, |
3365 | error); | 3828 | exception); |
3366 | } | 3829 | } |
3367 | 3830 | ||
3368 | static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes, | 3831 | static int kvm_read_guest_virt_system(struct x86_emulate_ctxt *ctxt, |
3369 | struct kvm_vcpu *vcpu, u32 *error) | 3832 | gva_t addr, void *val, unsigned int bytes, |
3833 | struct x86_exception *exception) | ||
3370 | { | 3834 | { |
3371 | return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, error); | 3835 | struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); |
3836 | return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception); | ||
3372 | } | 3837 | } |
3373 | 3838 | ||
3374 | static int kvm_write_guest_virt_system(gva_t addr, void *val, | 3839 | static int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, |
3840 | gva_t addr, void *val, | ||
3375 | unsigned int bytes, | 3841 | unsigned int bytes, |
3376 | struct kvm_vcpu *vcpu, | 3842 | struct x86_exception *exception) |
3377 | u32 *error) | ||
3378 | { | 3843 | { |
3844 | struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); | ||
3379 | void *data = val; | 3845 | void *data = val; |
3380 | int r = X86EMUL_CONTINUE; | 3846 | int r = X86EMUL_CONTINUE; |
3381 | 3847 | ||
3382 | while (bytes) { | 3848 | while (bytes) { |
3383 | gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr, | 3849 | gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, |
3384 | PFERR_WRITE_MASK, error); | 3850 | PFERR_WRITE_MASK, |
3851 | exception); | ||
3385 | unsigned offset = addr & (PAGE_SIZE-1); | 3852 | unsigned offset = addr & (PAGE_SIZE-1); |
3386 | unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); | 3853 | unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); |
3387 | int ret; | 3854 | int ret; |
3388 | 3855 | ||
3389 | if (gpa == UNMAPPED_GVA) { | 3856 | if (gpa == UNMAPPED_GVA) |
3390 | r = X86EMUL_PROPAGATE_FAULT; | 3857 | return X86EMUL_PROPAGATE_FAULT; |
3391 | goto out; | ||
3392 | } | ||
3393 | ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite); | 3858 | ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite); |
3394 | if (ret < 0) { | 3859 | if (ret < 0) { |
3395 | r = X86EMUL_IO_NEEDED; | 3860 | r = X86EMUL_IO_NEEDED; |
@@ -3404,13 +3869,15 @@ out: | |||
3404 | return r; | 3869 | return r; |
3405 | } | 3870 | } |
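kvm_read_guest_virt_helper() and kvm_write_guest_virt_system() above both clamp every iteration to the end of the current guest page (offset = addr & (PAGE_SIZE-1), chunk = min(bytes, PAGE_SIZE - offset)) so that each piece gets its own translation. The slicing in isolation, assuming 4 KiB pages:

#include <stdio.h>

#define PAGE_SIZE 4096u

static void slice_by_page(unsigned long addr, unsigned int bytes)
{
	while (bytes) {
		unsigned int offset = addr & (PAGE_SIZE - 1);
		unsigned int chunk  = PAGE_SIZE - offset;

		if (chunk > bytes)
			chunk = bytes;
		printf("page %#lx: offset %u, %u bytes\n",
		       addr & ~(unsigned long)(PAGE_SIZE - 1), offset, chunk);
		addr  += chunk;
		bytes -= chunk;
	}
}

int main(void)
{
	slice_by_page(0x1ff0, 0x30);	/* 48 bytes straddling a page boundary */
	return 0;
}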
3406 | 3871 | ||
3407 | static int emulator_read_emulated(unsigned long addr, | 3872 | static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt, |
3873 | unsigned long addr, | ||
3408 | void *val, | 3874 | void *val, |
3409 | unsigned int bytes, | 3875 | unsigned int bytes, |
3410 | unsigned int *error_code, | 3876 | struct x86_exception *exception) |
3411 | struct kvm_vcpu *vcpu) | ||
3412 | { | 3877 | { |
3878 | struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); | ||
3413 | gpa_t gpa; | 3879 | gpa_t gpa; |
3880 | int handled; | ||
3414 | 3881 | ||
3415 | if (vcpu->mmio_read_completed) { | 3882 | if (vcpu->mmio_read_completed) { |
3416 | memcpy(val, vcpu->mmio_data, bytes); | 3883 | memcpy(val, vcpu->mmio_data, bytes); |
@@ -3420,7 +3887,7 @@ static int emulator_read_emulated(unsigned long addr, | |||
3420 | return X86EMUL_CONTINUE; | 3887 | return X86EMUL_CONTINUE; |
3421 | } | 3888 | } |
3422 | 3889 | ||
3423 | gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, error_code); | 3890 | gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, exception); |
3424 | 3891 | ||
3425 | if (gpa == UNMAPPED_GVA) | 3892 | if (gpa == UNMAPPED_GVA) |
3426 | return X86EMUL_PROPAGATE_FAULT; | 3893 | return X86EMUL_PROPAGATE_FAULT; |
@@ -3429,32 +3896,38 @@ static int emulator_read_emulated(unsigned long addr, | |||
3429 | if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) | 3896 | if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) |
3430 | goto mmio; | 3897 | goto mmio; |
3431 | 3898 | ||
3432 | if (kvm_read_guest_virt(addr, val, bytes, vcpu, NULL) | 3899 | if (kvm_read_guest_virt(ctxt, addr, val, bytes, exception) |
3433 | == X86EMUL_CONTINUE) | 3900 | == X86EMUL_CONTINUE) |
3434 | return X86EMUL_CONTINUE; | 3901 | return X86EMUL_CONTINUE; |
3435 | 3902 | ||
3436 | mmio: | 3903 | mmio: |
3437 | /* | 3904 | /* |
3438 | * Is this MMIO handled locally? | 3905 | * Is this MMIO handled locally? |
3439 | */ | 3906 | */ |
3440 | if (!vcpu_mmio_read(vcpu, gpa, bytes, val)) { | 3907 | handled = vcpu_mmio_read(vcpu, gpa, bytes, val); |
3441 | trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, gpa, *(u64 *)val); | 3908 | |
3909 | if (handled == bytes) | ||
3442 | return X86EMUL_CONTINUE; | 3910 | return X86EMUL_CONTINUE; |
3443 | } | 3911 | |
3912 | gpa += handled; | ||
3913 | bytes -= handled; | ||
3914 | val += handled; | ||
3444 | 3915 | ||
3445 | trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0); | 3916 | trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0); |
3446 | 3917 | ||
3447 | vcpu->mmio_needed = 1; | 3918 | vcpu->mmio_needed = 1; |
3448 | vcpu->run->exit_reason = KVM_EXIT_MMIO; | 3919 | vcpu->run->exit_reason = KVM_EXIT_MMIO; |
3449 | vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; | 3920 | vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; |
3450 | vcpu->run->mmio.len = vcpu->mmio_size = bytes; | 3921 | vcpu->mmio_size = bytes; |
3922 | vcpu->run->mmio.len = min(vcpu->mmio_size, 8); | ||
3451 | vcpu->run->mmio.is_write = vcpu->mmio_is_write = 0; | 3923 | vcpu->run->mmio.is_write = vcpu->mmio_is_write = 0; |
3924 | vcpu->mmio_index = 0; | ||
3452 | 3925 | ||
3453 | return X86EMUL_IO_NEEDED; | 3926 | return X86EMUL_IO_NEEDED; |
3454 | } | 3927 | } |
3455 | 3928 | ||
3456 | int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, | 3929 | int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, |
3457 | const void *val, int bytes) | 3930 | const void *val, int bytes) |
3458 | { | 3931 | { |
3459 | int ret; | 3932 | int ret; |
3460 | 3933 | ||
@@ -3468,12 +3941,13 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, | |||
3468 | static int emulator_write_emulated_onepage(unsigned long addr, | 3941 | static int emulator_write_emulated_onepage(unsigned long addr, |
3469 | const void *val, | 3942 | const void *val, |
3470 | unsigned int bytes, | 3943 | unsigned int bytes, |
3471 | unsigned int *error_code, | 3944 | struct x86_exception *exception, |
3472 | struct kvm_vcpu *vcpu) | 3945 | struct kvm_vcpu *vcpu) |
3473 | { | 3946 | { |
3474 | gpa_t gpa; | 3947 | gpa_t gpa; |
3948 | int handled; | ||
3475 | 3949 | ||
3476 | gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error_code); | 3950 | gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception); |
3477 | 3951 | ||
3478 | if (gpa == UNMAPPED_GVA) | 3952 | if (gpa == UNMAPPED_GVA) |
3479 | return X86EMUL_PROPAGATE_FAULT; | 3953 | return X86EMUL_PROPAGATE_FAULT; |
@@ -3490,31 +3964,41 @@ mmio: | |||
3490 | /* | 3964 | /* |
3491 | * Is this MMIO handled locally? | 3965 | * Is this MMIO handled locally? |
3492 | */ | 3966 | */ |
3493 | if (!vcpu_mmio_write(vcpu, gpa, bytes, val)) | 3967 | handled = vcpu_mmio_write(vcpu, gpa, bytes, val); |
3968 | if (handled == bytes) | ||
3494 | return X86EMUL_CONTINUE; | 3969 | return X86EMUL_CONTINUE; |
3495 | 3970 | ||
3971 | gpa += handled; | ||
3972 | bytes -= handled; | ||
3973 | val += handled; | ||
3974 | |||
3496 | vcpu->mmio_needed = 1; | 3975 | vcpu->mmio_needed = 1; |
3976 | memcpy(vcpu->mmio_data, val, bytes); | ||
3497 | vcpu->run->exit_reason = KVM_EXIT_MMIO; | 3977 | vcpu->run->exit_reason = KVM_EXIT_MMIO; |
3498 | vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; | 3978 | vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; |
3499 | vcpu->run->mmio.len = vcpu->mmio_size = bytes; | 3979 | vcpu->mmio_size = bytes; |
3980 | vcpu->run->mmio.len = min(vcpu->mmio_size, 8); | ||
3500 | vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1; | 3981 | vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1; |
3501 | memcpy(vcpu->run->mmio.data, val, bytes); | 3982 | memcpy(vcpu->run->mmio.data, vcpu->mmio_data, 8); |
3983 | vcpu->mmio_index = 0; | ||
3502 | 3984 | ||
3503 | return X86EMUL_CONTINUE; | 3985 | return X86EMUL_CONTINUE; |
3504 | } | 3986 | } |
3505 | 3987 | ||
3506 | int emulator_write_emulated(unsigned long addr, | 3988 | int emulator_write_emulated(struct x86_emulate_ctxt *ctxt, |
3989 | unsigned long addr, | ||
3507 | const void *val, | 3990 | const void *val, |
3508 | unsigned int bytes, | 3991 | unsigned int bytes, |
3509 | unsigned int *error_code, | 3992 | struct x86_exception *exception) |
3510 | struct kvm_vcpu *vcpu) | ||
3511 | { | 3993 | { |
3994 | struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); | ||
3995 | |||
3512 | /* Crossing a page boundary? */ | 3996 | /* Crossing a page boundary? */ |
3513 | if (((addr + bytes - 1) ^ addr) & PAGE_MASK) { | 3997 | if (((addr + bytes - 1) ^ addr) & PAGE_MASK) { |
3514 | int rc, now; | 3998 | int rc, now; |
3515 | 3999 | ||
3516 | now = -addr & ~PAGE_MASK; | 4000 | now = -addr & ~PAGE_MASK; |
3517 | rc = emulator_write_emulated_onepage(addr, val, now, error_code, | 4001 | rc = emulator_write_emulated_onepage(addr, val, now, exception, |
3518 | vcpu); | 4002 | vcpu); |
3519 | if (rc != X86EMUL_CONTINUE) | 4003 | if (rc != X86EMUL_CONTINUE) |
3520 | return rc; | 4004 | return rc; |
@@ -3522,7 +4006,7 @@ int emulator_write_emulated(unsigned long addr, | |||
3522 | val += now; | 4006 | val += now; |
3523 | bytes -= now; | 4007 | bytes -= now; |
3524 | } | 4008 | } |
3525 | return emulator_write_emulated_onepage(addr, val, bytes, error_code, | 4009 | return emulator_write_emulated_onepage(addr, val, bytes, exception, |
3526 | vcpu); | 4010 | vcpu); |
3527 | } | 4011 | } |
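emulator_write_emulated() keeps the old split logic: ((addr + bytes - 1) ^ addr) & PAGE_MASK is non-zero exactly when the first and last byte of the access land on different pages, and now = -addr & ~PAGE_MASK is the number of bytes left on the first page. A quick standalone check of that arithmetic:

#include <stdio.h>

#define PAGE_SIZE 4096ul
#define PAGE_MASK (~(PAGE_SIZE - 1))

int main(void)
{
	unsigned long addr  = 0x1ffa;	/* 6 bytes before a page boundary */
	unsigned long bytes = 16;

	int crosses = (((addr + bytes - 1) ^ addr) & PAGE_MASK) != 0;
	unsigned long first = -addr & ~PAGE_MASK;	/* bytes on the first page */

	printf("crosses=%d, first chunk=%lu, second chunk=%lu\n",
	       crosses, first, bytes - first);
	return 0;
}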
3528 | 4012 | ||
@@ -3536,13 +4020,14 @@ int emulator_write_emulated(unsigned long addr, | |||
3536 | (cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u64 *)(new)) == *(u64 *)(old)) | 4020 | (cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u64 *)(new)) == *(u64 *)(old)) |
3537 | #endif | 4021 | #endif |
3538 | 4022 | ||
3539 | static int emulator_cmpxchg_emulated(unsigned long addr, | 4023 | static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, |
4024 | unsigned long addr, | ||
3540 | const void *old, | 4025 | const void *old, |
3541 | const void *new, | 4026 | const void *new, |
3542 | unsigned int bytes, | 4027 | unsigned int bytes, |
3543 | unsigned int *error_code, | 4028 | struct x86_exception *exception) |
3544 | struct kvm_vcpu *vcpu) | ||
3545 | { | 4029 | { |
4030 | struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); | ||
3546 | gpa_t gpa; | 4031 | gpa_t gpa; |
3547 | struct page *page; | 4032 | struct page *page; |
3548 | char *kaddr; | 4033 | char *kaddr; |
@@ -3598,7 +4083,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr, | |||
3598 | emul_write: | 4083 | emul_write: |
3599 | printk_once(KERN_WARNING "kvm: emulating exchange as write\n"); | 4084 | printk_once(KERN_WARNING "kvm: emulating exchange as write\n"); |
3600 | 4085 | ||
3601 | return emulator_write_emulated(addr, new, bytes, error_code, vcpu); | 4086 | return emulator_write_emulated(ctxt, addr, new, bytes, exception); |
3602 | } | 4087 | } |
3603 | 4088 | ||
3604 | static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) | 4089 | static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) |
@@ -3617,13 +4102,16 @@ static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) | |||
3617 | } | 4102 | } |
3618 | 4103 | ||
3619 | 4104 | ||
3620 | static int emulator_pio_in_emulated(int size, unsigned short port, void *val, | 4105 | static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt, |
3621 | unsigned int count, struct kvm_vcpu *vcpu) | 4106 | int size, unsigned short port, void *val, |
4107 | unsigned int count) | ||
3622 | { | 4108 | { |
4109 | struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); | ||
4110 | |||
3623 | if (vcpu->arch.pio.count) | 4111 | if (vcpu->arch.pio.count) |
3624 | goto data_avail; | 4112 | goto data_avail; |
3625 | 4113 | ||
3626 | trace_kvm_pio(1, port, size, 1); | 4114 | trace_kvm_pio(0, port, size, count); |
3627 | 4115 | ||
3628 | vcpu->arch.pio.port = port; | 4116 | vcpu->arch.pio.port = port; |
3629 | vcpu->arch.pio.in = 1; | 4117 | vcpu->arch.pio.in = 1; |
@@ -3647,11 +4135,13 @@ static int emulator_pio_in_emulated(int size, unsigned short port, void *val, | |||
3647 | return 0; | 4135 | return 0; |
3648 | } | 4136 | } |
3649 | 4137 | ||
3650 | static int emulator_pio_out_emulated(int size, unsigned short port, | 4138 | static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt, |
3651 | const void *val, unsigned int count, | 4139 | int size, unsigned short port, |
3652 | struct kvm_vcpu *vcpu) | 4140 | const void *val, unsigned int count) |
3653 | { | 4141 | { |
3654 | trace_kvm_pio(0, port, size, 1); | 4142 | struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); |
4143 | |||
4144 | trace_kvm_pio(1, port, size, count); | ||
3655 | 4145 | ||
3656 | vcpu->arch.pio.port = port; | 4146 | vcpu->arch.pio.port = port; |
3657 | vcpu->arch.pio.in = 0; | 4147 | vcpu->arch.pio.in = 0; |
@@ -3680,10 +4170,9 @@ static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) | |||
3680 | return kvm_x86_ops->get_segment_base(vcpu, seg); | 4170 | return kvm_x86_ops->get_segment_base(vcpu, seg); |
3681 | } | 4171 | } |
3682 | 4172 | ||
3683 | int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) | 4173 | static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address) |
3684 | { | 4174 | { |
3685 | kvm_mmu_invlpg(vcpu, address); | 4175 | kvm_mmu_invlpg(emul_to_vcpu(ctxt), address); |
3686 | return X86EMUL_CONTINUE; | ||
3687 | } | 4176 | } |
3688 | 4177 | ||
3689 | int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu) | 4178 | int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu) |
@@ -3692,31 +4181,33 @@ int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu) | |||
3692 | return X86EMUL_CONTINUE; | 4181 | return X86EMUL_CONTINUE; |
3693 | 4182 | ||
3694 | if (kvm_x86_ops->has_wbinvd_exit()) { | 4183 | if (kvm_x86_ops->has_wbinvd_exit()) { |
4184 | int cpu = get_cpu(); | ||
4185 | |||
4186 | cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask); | ||
3695 | smp_call_function_many(vcpu->arch.wbinvd_dirty_mask, | 4187 | smp_call_function_many(vcpu->arch.wbinvd_dirty_mask, |
3696 | wbinvd_ipi, NULL, 1); | 4188 | wbinvd_ipi, NULL, 1); |
4189 | put_cpu(); | ||
3697 | cpumask_clear(vcpu->arch.wbinvd_dirty_mask); | 4190 | cpumask_clear(vcpu->arch.wbinvd_dirty_mask); |
3698 | } | 4191 | } else |
3699 | wbinvd(); | 4192 | wbinvd(); |
3700 | return X86EMUL_CONTINUE; | 4193 | return X86EMUL_CONTINUE; |
3701 | } | 4194 | } |
3702 | EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd); | 4195 | EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd); |
3703 | 4196 | ||
3704 | int emulate_clts(struct kvm_vcpu *vcpu) | 4197 | static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt) |
3705 | { | 4198 | { |
3706 | kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); | 4199 | kvm_emulate_wbinvd(emul_to_vcpu(ctxt)); |
3707 | kvm_x86_ops->fpu_activate(vcpu); | ||
3708 | return X86EMUL_CONTINUE; | ||
3709 | } | 4200 | } |
3710 | 4201 | ||
3711 | int emulator_get_dr(int dr, unsigned long *dest, struct kvm_vcpu *vcpu) | 4202 | int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest) |
3712 | { | 4203 | { |
3713 | return _kvm_get_dr(vcpu, dr, dest); | 4204 | return _kvm_get_dr(emul_to_vcpu(ctxt), dr, dest); |
3714 | } | 4205 | } |
3715 | 4206 | ||
3716 | int emulator_set_dr(int dr, unsigned long value, struct kvm_vcpu *vcpu) | 4207 | int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) |
3717 | { | 4208 | { |
3718 | 4209 | ||
3719 | return __kvm_set_dr(vcpu, dr, value); | 4210 | return __kvm_set_dr(emul_to_vcpu(ctxt), dr, value); |
3720 | } | 4211 | } |
3721 | 4212 | ||
3722 | static u64 mk_cr_64(u64 curr_cr, u32 new_val) | 4213 | static u64 mk_cr_64(u64 curr_cr, u32 new_val) |
@@ -3724,8 +4215,9 @@ static u64 mk_cr_64(u64 curr_cr, u32 new_val) | |||
3724 | return (curr_cr & ~((1ULL << 32) - 1)) | new_val; | 4215 | return (curr_cr & ~((1ULL << 32) - 1)) | new_val; |
3725 | } | 4216 | } |
3726 | 4217 | ||
3727 | static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu) | 4218 | static unsigned long emulator_get_cr(struct x86_emulate_ctxt *ctxt, int cr) |
3728 | { | 4219 | { |
4220 | struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); | ||
3729 | unsigned long value; | 4221 | unsigned long value; |
3730 | 4222 | ||
3731 | switch (cr) { | 4223 | switch (cr) { |
@@ -3736,7 +4228,7 @@ static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu) | |||
3736 | value = vcpu->arch.cr2; | 4228 | value = vcpu->arch.cr2; |
3737 | break; | 4229 | break; |
3738 | case 3: | 4230 | case 3: |
3739 | value = vcpu->arch.cr3; | 4231 | value = kvm_read_cr3(vcpu); |
3740 | break; | 4232 | break; |
3741 | case 4: | 4233 | case 4: |
3742 | value = kvm_read_cr4(vcpu); | 4234 | value = kvm_read_cr4(vcpu); |
@@ -3752,8 +4244,9 @@ static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu) | |||
3752 | return value; | 4244 | return value; |
3753 | } | 4245 | } |
3754 | 4246 | ||
3755 | static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu) | 4247 | static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val) |
3756 | { | 4248 | { |
4249 | struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); | ||
3757 | int res = 0; | 4250 | int res = 0; |
3758 | 4251 | ||
3759 | switch (cr) { | 4252 | switch (cr) { |
@@ -3770,7 +4263,7 @@ static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu) | |||
3770 | res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val)); | 4263 | res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val)); |
3771 | break; | 4264 | break; |
3772 | case 8: | 4265 | case 8: |
3773 | res = __kvm_set_cr8(vcpu, val & 0xfUL); | 4266 | res = kvm_set_cr8(vcpu, val); |
3774 | break; | 4267 | break; |
3775 | default: | 4268 | default: |
3776 | vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); | 4269 | vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); |
@@ -3780,28 +4273,45 @@ static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu) | |||
3780 | return res; | 4273 | return res; |
3781 | } | 4274 | } |
3782 | 4275 | ||
3783 | static int emulator_get_cpl(struct kvm_vcpu *vcpu) | 4276 | static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt) |
3784 | { | 4277 | { |
3785 | return kvm_x86_ops->get_cpl(vcpu); | 4278 | return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt)); |
3786 | } | 4279 | } |
3787 | 4280 | ||
3788 | static void emulator_get_gdt(struct desc_ptr *dt, struct kvm_vcpu *vcpu) | 4281 | static void emulator_get_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) |
3789 | { | 4282 | { |
3790 | kvm_x86_ops->get_gdt(vcpu, dt); | 4283 | kvm_x86_ops->get_gdt(emul_to_vcpu(ctxt), dt); |
3791 | } | 4284 | } |
3792 | 4285 | ||
3793 | static unsigned long emulator_get_cached_segment_base(int seg, | 4286 | static void emulator_get_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) |
3794 | struct kvm_vcpu *vcpu) | ||
3795 | { | 4287 | { |
3796 | return get_segment_base(vcpu, seg); | 4288 | kvm_x86_ops->get_idt(emul_to_vcpu(ctxt), dt); |
3797 | } | 4289 | } |
3798 | 4290 | ||
3799 | static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg, | 4291 | static void emulator_set_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) |
3800 | struct kvm_vcpu *vcpu) | 4292 | { |
4293 | kvm_x86_ops->set_gdt(emul_to_vcpu(ctxt), dt); | ||
4294 | } | ||
4295 | |||
4296 | static void emulator_set_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) | ||
4297 | { | ||
4298 | kvm_x86_ops->set_idt(emul_to_vcpu(ctxt), dt); | ||
4299 | } | ||
4300 | |||
4301 | static unsigned long emulator_get_cached_segment_base( | ||
4302 | struct x86_emulate_ctxt *ctxt, int seg) | ||
4303 | { | ||
4304 | return get_segment_base(emul_to_vcpu(ctxt), seg); | ||
4305 | } | ||
4306 | |||
4307 | static bool emulator_get_segment(struct x86_emulate_ctxt *ctxt, u16 *selector, | ||
4308 | struct desc_struct *desc, u32 *base3, | ||
4309 | int seg) | ||
3801 | { | 4310 | { |
3802 | struct kvm_segment var; | 4311 | struct kvm_segment var; |
3803 | 4312 | ||
3804 | kvm_get_segment(vcpu, &var, seg); | 4313 | kvm_get_segment(emul_to_vcpu(ctxt), &var, seg); |
4314 | *selector = var.selector; | ||
3805 | 4315 | ||
3806 | if (var.unusable) | 4316 | if (var.unusable) |
3807 | return false; | 4317 | return false; |
@@ -3810,6 +4320,10 @@ static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg, | |||
3810 | var.limit >>= 12; | 4320 | var.limit >>= 12; |
3811 | set_desc_limit(desc, var.limit); | 4321 | set_desc_limit(desc, var.limit); |
3812 | set_desc_base(desc, (unsigned long)var.base); | 4322 | set_desc_base(desc, (unsigned long)var.base); |
4323 | #ifdef CONFIG_X86_64 | ||
4324 | if (base3) | ||
4325 | *base3 = var.base >> 32; | ||
4326 | #endif | ||
3813 | desc->type = var.type; | 4327 | desc->type = var.type; |
3814 | desc->s = var.s; | 4328 | desc->s = var.s; |
3815 | desc->dpl = var.dpl; | 4329 | desc->dpl = var.dpl; |
@@ -3822,15 +4336,18 @@ static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg, | |||
3822 | return true; | 4336 | return true; |
3823 | } | 4337 | } |
3824 | 4338 | ||
3825 | static void emulator_set_cached_descriptor(struct desc_struct *desc, int seg, | 4339 | static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector, |
3826 | struct kvm_vcpu *vcpu) | 4340 | struct desc_struct *desc, u32 base3, |
4341 | int seg) | ||
3827 | { | 4342 | { |
4343 | struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); | ||
3828 | struct kvm_segment var; | 4344 | struct kvm_segment var; |
3829 | 4345 | ||
3830 | /* needed to preserve selector */ | 4346 | var.selector = selector; |
3831 | kvm_get_segment(vcpu, &var, seg); | ||
3832 | |||
3833 | var.base = get_desc_base(desc); | 4347 | var.base = get_desc_base(desc); |
4348 | #ifdef CONFIG_X86_64 | ||
4349 | var.base |= ((u64)base3) << 32; | ||
4350 | #endif | ||
3834 | var.limit = get_desc_limit(desc); | 4351 | var.limit = get_desc_limit(desc); |
3835 | if (desc->g) | 4352 | if (desc->g) |
3836 | var.limit = (var.limit << 12) | 0xfff; | 4353 | var.limit = (var.limit << 12) | 0xfff; |
@@ -3850,22 +4367,44 @@ static void emulator_set_cached_descriptor(struct desc_struct *desc, int seg, | |||
3850 | return; | 4367 | return; |
3851 | } | 4368 | } |
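The new get_segment/set_segment callbacks carry the selector and, on 64-bit, an extra base3 word, because a system descriptor's base no longer fits in the 32-bit base of desc_struct: the low half lives in the descriptor and the high 32 bits travel separately, while a granular limit is scaled as (limit << 12) | 0xfff. The packing and unpacking in isolation; the struct below is a simplification for the sketch, not the kernel's desc_struct.

#include <stdio.h>
#include <stdint.h>

struct fake_desc {
	uint32_t base;		/* low 32 bits of the segment base */
	uint32_t limit;		/* raw 20-bit limit field */
	unsigned g : 1;		/* granularity: limit counted in 4 KiB units */
};

static void pack(uint64_t base, struct fake_desc *d, uint32_t *base3)
{
	d->base = (uint32_t)base;
	*base3  = (uint32_t)(base >> 32);
}

static uint64_t unpack_base(const struct fake_desc *d, uint32_t base3)
{
	return ((uint64_t)base3 << 32) | d->base;
}

static uint64_t effective_limit(const struct fake_desc *d)
{
	return d->g ? ((uint64_t)d->limit << 12) | 0xfff : d->limit;
}

int main(void)
{
	struct fake_desc d = { .limit = 0xfffff, .g = 1 };
	uint32_t base3;

	pack(0xfffff80000001000ull, &d, &base3);
	printf("base back: %#llx, limit: %#llx\n",
	       (unsigned long long)unpack_base(&d, base3),
	       (unsigned long long)effective_limit(&d));
	return 0;
}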
3852 | 4369 | ||
3853 | static u16 emulator_get_segment_selector(int seg, struct kvm_vcpu *vcpu) | 4370 | static int emulator_get_msr(struct x86_emulate_ctxt *ctxt, |
4371 | u32 msr_index, u64 *pdata) | ||
3854 | { | 4372 | { |
3855 | struct kvm_segment kvm_seg; | 4373 | return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata); |
4374 | } | ||
3856 | 4375 | ||
3857 | kvm_get_segment(vcpu, &kvm_seg, seg); | 4376 | static int emulator_set_msr(struct x86_emulate_ctxt *ctxt, |
3858 | return kvm_seg.selector; | 4377 | u32 msr_index, u64 data) |
4378 | { | ||
4379 | return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data); | ||
4380 | } | ||
4381 | |||
4382 | static void emulator_halt(struct x86_emulate_ctxt *ctxt) | ||
4383 | { | ||
4384 | emul_to_vcpu(ctxt)->arch.halt_request = 1; | ||
4385 | } | ||
4386 | |||
4387 | static void emulator_get_fpu(struct x86_emulate_ctxt *ctxt) | ||
4388 | { | ||
4389 | preempt_disable(); | ||
4390 | kvm_load_guest_fpu(emul_to_vcpu(ctxt)); | ||
4391 | /* | ||
4392 | * CR0.TS may reference the host fpu state, not the guest fpu state, | ||
4393 | * so it may be clear at this point. | ||
4394 | */ | ||
4395 | clts(); | ||
3859 | } | 4396 | } |
3860 | 4397 | ||
3861 | static void emulator_set_segment_selector(u16 sel, int seg, | 4398 | static void emulator_put_fpu(struct x86_emulate_ctxt *ctxt) |
3862 | struct kvm_vcpu *vcpu) | ||
3863 | { | 4399 | { |
3864 | struct kvm_segment kvm_seg; | 4400 | preempt_enable(); |
4401 | } | ||
3865 | 4402 | ||
3866 | kvm_get_segment(vcpu, &kvm_seg, seg); | 4403 | static int emulator_intercept(struct x86_emulate_ctxt *ctxt, |
3867 | kvm_seg.selector = sel; | 4404 | struct x86_instruction_info *info, |
3868 | kvm_set_segment(vcpu, &kvm_seg, seg); | 4405 | enum x86_intercept_stage stage) |
4406 | { | ||
4407 | return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage); | ||
3869 | } | 4408 | } |
3870 | 4409 | ||
3871 | static struct x86_emulate_ops emulate_ops = { | 4410 | static struct x86_emulate_ops emulate_ops = { |
@@ -3875,21 +4414,29 @@ static struct x86_emulate_ops emulate_ops = { | |||
3875 | .read_emulated = emulator_read_emulated, | 4414 | .read_emulated = emulator_read_emulated, |
3876 | .write_emulated = emulator_write_emulated, | 4415 | .write_emulated = emulator_write_emulated, |
3877 | .cmpxchg_emulated = emulator_cmpxchg_emulated, | 4416 | .cmpxchg_emulated = emulator_cmpxchg_emulated, |
4417 | .invlpg = emulator_invlpg, | ||
3878 | .pio_in_emulated = emulator_pio_in_emulated, | 4418 | .pio_in_emulated = emulator_pio_in_emulated, |
3879 | .pio_out_emulated = emulator_pio_out_emulated, | 4419 | .pio_out_emulated = emulator_pio_out_emulated, |
3880 | .get_cached_descriptor = emulator_get_cached_descriptor, | 4420 | .get_segment = emulator_get_segment, |
3881 | .set_cached_descriptor = emulator_set_cached_descriptor, | 4421 | .set_segment = emulator_set_segment, |
3882 | .get_segment_selector = emulator_get_segment_selector, | ||
3883 | .set_segment_selector = emulator_set_segment_selector, | ||
3884 | .get_cached_segment_base = emulator_get_cached_segment_base, | 4422 | .get_cached_segment_base = emulator_get_cached_segment_base, |
3885 | .get_gdt = emulator_get_gdt, | 4423 | .get_gdt = emulator_get_gdt, |
4424 | .get_idt = emulator_get_idt, | ||
4425 | .set_gdt = emulator_set_gdt, | ||
4426 | .set_idt = emulator_set_idt, | ||
3886 | .get_cr = emulator_get_cr, | 4427 | .get_cr = emulator_get_cr, |
3887 | .set_cr = emulator_set_cr, | 4428 | .set_cr = emulator_set_cr, |
3888 | .cpl = emulator_get_cpl, | 4429 | .cpl = emulator_get_cpl, |
3889 | .get_dr = emulator_get_dr, | 4430 | .get_dr = emulator_get_dr, |
3890 | .set_dr = emulator_set_dr, | 4431 | .set_dr = emulator_set_dr, |
3891 | .set_msr = kvm_set_msr, | 4432 | .set_msr = emulator_set_msr, |
3892 | .get_msr = kvm_get_msr, | 4433 | .get_msr = emulator_get_msr, |
4434 | .halt = emulator_halt, | ||
4435 | .wbinvd = emulator_wbinvd, | ||
4436 | .fix_hypercall = emulator_fix_hypercall, | ||
4437 | .get_fpu = emulator_get_fpu, | ||
4438 | .put_fpu = emulator_put_fpu, | ||
4439 | .intercept = emulator_intercept, | ||
3893 | }; | 4440 | }; |
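The reshaped emulate_ops table is the visible half of this patch series' theme: every callback now receives the x86_emulate_ctxt and derives the vcpu itself via emul_to_vcpu() instead of being handed a struct kvm_vcpu directly, so the emulator core stays ignorant of KVM internals. The pattern reduced to a skeleton (all names below are invented for the sketch):

#include <stdio.h>

struct ctxt;				/* opaque to the core engine */

struct engine_ops {
	int  (*read)(struct ctxt *c, unsigned long addr);
	void (*halt)(struct ctxt *c);
};

struct ctxt {
	const struct engine_ops *ops;
	void *backend;			/* e.g. the vcpu, known only to the callbacks */
};

struct backend { int halted; };

static int my_read(struct ctxt *c, unsigned long addr)
{
	(void)c;
	return (int)(addr & 0xff);
}

static void my_halt(struct ctxt *c)
{
	((struct backend *)c->backend)->halted = 1;
}

static const struct engine_ops my_ops = { .read = my_read, .halt = my_halt };

int main(void)
{
	struct backend be = { 0 };
	struct ctxt c = { .ops = &my_ops, .backend = &be };

	printf("read: %d\n", c.ops->read(&c, 0x1234));
	c.ops->halt(&c);
	printf("halted: %d\n", be.halted);
	return 0;
}

Keeping the backend pointer inside the context is what lets one ops table serve any vcpu without global state.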
3894 | 4441 | ||
3895 | static void cache_all_regs(struct kvm_vcpu *vcpu) | 4442 | static void cache_all_regs(struct kvm_vcpu *vcpu) |
@@ -3917,23 +4464,89 @@ static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) | |||
3917 | static void inject_emulated_exception(struct kvm_vcpu *vcpu) | 4464 | static void inject_emulated_exception(struct kvm_vcpu *vcpu) |
3918 | { | 4465 | { |
3919 | struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; | 4466 | struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; |
3920 | if (ctxt->exception == PF_VECTOR) | 4467 | if (ctxt->exception.vector == PF_VECTOR) |
3921 | kvm_inject_page_fault(vcpu, ctxt->cr2, ctxt->error_code); | 4468 | kvm_propagate_fault(vcpu, &ctxt->exception); |
3922 | else if (ctxt->error_code_valid) | 4469 | else if (ctxt->exception.error_code_valid) |
3923 | kvm_queue_exception_e(vcpu, ctxt->exception, ctxt->error_code); | 4470 | kvm_queue_exception_e(vcpu, ctxt->exception.vector, |
4471 | ctxt->exception.error_code); | ||
4472 | else | ||
4473 | kvm_queue_exception(vcpu, ctxt->exception.vector); | ||
4474 | } | ||
4475 | |||
4476 | static void init_emulate_ctxt(struct kvm_vcpu *vcpu) | ||
4477 | { | ||
4478 | struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; | ||
4479 | int cs_db, cs_l; | ||
4480 | |||
4481 | /* | ||
4482 | * TODO: fix emulate.c to use guest_read/write_register | ||
4483 | * instead of direct ->regs accesses, can save hundred cycles | ||
4484 | * on Intel for instructions that don't read/change RSP, for | ||
4485 | * for example. | ||
4486 | */ | ||
4487 | cache_all_regs(vcpu); | ||
4488 | |||
4489 | kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); | ||
4490 | |||
4491 | vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu); | ||
4492 | vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu); | ||
4493 | vcpu->arch.emulate_ctxt.mode = | ||
4494 | (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : | ||
4495 | (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) | ||
4496 | ? X86EMUL_MODE_VM86 : cs_l | ||
4497 | ? X86EMUL_MODE_PROT64 : cs_db | ||
4498 | ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; | ||
4499 | vcpu->arch.emulate_ctxt.guest_mode = is_guest_mode(vcpu); | ||
4500 | memset(c, 0, sizeof(struct decode_cache)); | ||
4501 | memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); | ||
4502 | vcpu->arch.emulate_regs_need_sync_from_vcpu = false; | ||
4503 | } | ||
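init_emulate_ctxt() condenses the mode selection into one conditional chain: real mode when CR0.PE is clear, VM86 when EFLAGS.VM is set, and otherwise 64-, 32- or 16-bit protected mode from the CS.L and CS.D bits. The same decision written out as an ordinary if/else ladder:

#include <stdio.h>

enum x86_mode { MODE_REAL, MODE_VM86, MODE_PROT16, MODE_PROT32, MODE_PROT64 };

static enum x86_mode pick_mode(int protected_mode, int eflags_vm,
			       int cs_l, int cs_db)
{
	if (!protected_mode)
		return MODE_REAL;
	if (eflags_vm)
		return MODE_VM86;
	if (cs_l)
		return MODE_PROT64;
	return cs_db ? MODE_PROT32 : MODE_PROT16;
}

int main(void)
{
	/* 64-bit code segment in long mode */
	printf("mode = %d (expect %d)\n", pick_mode(1, 0, 1, 0), MODE_PROT64);
	return 0;
}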
4504 | |||
4505 | int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip) | ||
4506 | { | ||
4507 | struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; | ||
4508 | int ret; | ||
4509 | |||
4510 | init_emulate_ctxt(vcpu); | ||
4511 | |||
4512 | vcpu->arch.emulate_ctxt.decode.op_bytes = 2; | ||
4513 | vcpu->arch.emulate_ctxt.decode.ad_bytes = 2; | ||
4514 | vcpu->arch.emulate_ctxt.decode.eip = vcpu->arch.emulate_ctxt.eip + | ||
4515 | inc_eip; | ||
4516 | ret = emulate_int_real(&vcpu->arch.emulate_ctxt, &emulate_ops, irq); | ||
4517 | |||
4518 | if (ret != X86EMUL_CONTINUE) | ||
4519 | return EMULATE_FAIL; | ||
4520 | |||
4521 | vcpu->arch.emulate_ctxt.eip = c->eip; | ||
4522 | memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); | ||
4523 | kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); | ||
4524 | kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); | ||
4525 | |||
4526 | if (irq == NMI_VECTOR) | ||
4527 | vcpu->arch.nmi_pending = false; | ||
3924 | else | 4528 | else |
3925 | kvm_queue_exception(vcpu, ctxt->exception); | 4529 | vcpu->arch.interrupt.pending = false; |
4530 | |||
4531 | return EMULATE_DONE; | ||
3926 | } | 4532 | } |
4533 | EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt); | ||
3927 | 4534 | ||
3928 | static int handle_emulation_failure(struct kvm_vcpu *vcpu) | 4535 | static int handle_emulation_failure(struct kvm_vcpu *vcpu) |
3929 | { | 4536 | { |
4537 | int r = EMULATE_DONE; | ||
4538 | |||
3930 | ++vcpu->stat.insn_emulation_fail; | 4539 | ++vcpu->stat.insn_emulation_fail; |
3931 | trace_kvm_emulate_insn_failed(vcpu); | 4540 | trace_kvm_emulate_insn_failed(vcpu); |
3932 | vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; | 4541 | if (!is_guest_mode(vcpu)) { |
3933 | vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; | 4542 | vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; |
3934 | vcpu->run->internal.ndata = 0; | 4543 | vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; |
4544 | vcpu->run->internal.ndata = 0; | ||
4545 | r = EMULATE_FAIL; | ||
4546 | } | ||
3935 | kvm_queue_exception(vcpu, UD_VECTOR); | 4547 | kvm_queue_exception(vcpu, UD_VECTOR); |
3936 | return EMULATE_FAIL; | 4548 | |
4549 | return r; | ||
3937 | } | 4550 | } |
3938 | 4551 | ||
3939 | static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva) | 4552 | static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva) |
@@ -3962,74 +4575,34 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva) | |||
3962 | return false; | 4575 | return false; |
3963 | } | 4576 | } |
3964 | 4577 | ||
3965 | int emulate_instruction(struct kvm_vcpu *vcpu, | 4578 | int x86_emulate_instruction(struct kvm_vcpu *vcpu, |
3966 | unsigned long cr2, | 4579 | unsigned long cr2, |
3967 | u16 error_code, | 4580 | int emulation_type, |
3968 | int emulation_type) | 4581 | void *insn, |
4582 | int insn_len) | ||
3969 | { | 4583 | { |
3970 | int r; | 4584 | int r; |
3971 | struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; | 4585 | struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; |
4586 | bool writeback = true; | ||
3972 | 4587 | ||
3973 | kvm_clear_exception_queue(vcpu); | 4588 | kvm_clear_exception_queue(vcpu); |
3974 | vcpu->arch.mmio_fault_cr2 = cr2; | ||
3975 | /* | ||
3976 | * TODO: fix emulate.c to use guest_read/write_register | ||
3977 | * instead of direct ->regs accesses, can save hundred cycles | ||
3978 | * on Intel for instructions that don't read/change RSP, for | ||
3979 | * for example. | ||
3980 | */ | ||
3981 | cache_all_regs(vcpu); | ||
3982 | 4589 | ||
3983 | if (!(emulation_type & EMULTYPE_NO_DECODE)) { | 4590 | if (!(emulation_type & EMULTYPE_NO_DECODE)) { |
3984 | int cs_db, cs_l; | 4591 | init_emulate_ctxt(vcpu); |
3985 | kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); | ||
3986 | |||
3987 | vcpu->arch.emulate_ctxt.vcpu = vcpu; | ||
3988 | vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu); | ||
3989 | vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu); | ||
3990 | vcpu->arch.emulate_ctxt.mode = | ||
3991 | (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : | ||
3992 | (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) | ||
3993 | ? X86EMUL_MODE_VM86 : cs_l | ||
3994 | ? X86EMUL_MODE_PROT64 : cs_db | ||
3995 | ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; | ||
3996 | memset(c, 0, sizeof(struct decode_cache)); | ||
3997 | memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); | ||
3998 | vcpu->arch.emulate_ctxt.interruptibility = 0; | 4592 | vcpu->arch.emulate_ctxt.interruptibility = 0; |
3999 | vcpu->arch.emulate_ctxt.exception = -1; | 4593 | vcpu->arch.emulate_ctxt.have_exception = false; |
4000 | 4594 | vcpu->arch.emulate_ctxt.perm_ok = false; | |
4001 | r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); | ||
4002 | trace_kvm_emulate_insn_start(vcpu); | ||
4003 | 4595 | ||
4004 | /* Only allow emulation of specific instructions on #UD | 4596 | vcpu->arch.emulate_ctxt.only_vendor_specific_insn |
4005 | * (namely VMMCALL, sysenter, sysexit, syscall)*/ | 4597 | = emulation_type & EMULTYPE_TRAP_UD; |
4006 | if (emulation_type & EMULTYPE_TRAP_UD) { | ||
4007 | if (!c->twobyte) | ||
4008 | return EMULATE_FAIL; | ||
4009 | switch (c->b) { | ||
4010 | case 0x01: /* VMMCALL */ | ||
4011 | if (c->modrm_mod != 3 || c->modrm_rm != 1) | ||
4012 | return EMULATE_FAIL; | ||
4013 | break; | ||
4014 | case 0x34: /* sysenter */ | ||
4015 | case 0x35: /* sysexit */ | ||
4016 | if (c->modrm_mod != 0 || c->modrm_rm != 0) | ||
4017 | return EMULATE_FAIL; | ||
4018 | break; | ||
4019 | case 0x05: /* syscall */ | ||
4020 | if (c->modrm_mod != 0 || c->modrm_rm != 0) | ||
4021 | return EMULATE_FAIL; | ||
4022 | break; | ||
4023 | default: | ||
4024 | return EMULATE_FAIL; | ||
4025 | } | ||
4026 | 4598 | ||
4027 | if (!(c->modrm_reg == 0 || c->modrm_reg == 3)) | 4599 | r = x86_decode_insn(&vcpu->arch.emulate_ctxt, insn, insn_len); |
4028 | return EMULATE_FAIL; | ||
4029 | } | ||
4030 | 4600 | ||
4601 | trace_kvm_emulate_insn_start(vcpu); | ||
4031 | ++vcpu->stat.insn_emulation; | 4602 | ++vcpu->stat.insn_emulation; |
4032 | if (r) { | 4603 | if (r) { |
4604 | if (emulation_type & EMULTYPE_TRAP_UD) | ||
4605 | return EMULATE_FAIL; | ||
4033 | if (reexecute_instruction(vcpu, cr2)) | 4606 | if (reexecute_instruction(vcpu, cr2)) |
4034 | return EMULATE_DONE; | 4607 | return EMULATE_DONE; |
4035 | if (emulation_type & EMULTYPE_SKIP) | 4608 | if (emulation_type & EMULTYPE_SKIP) |
@@ -4043,62 +4616,87 @@ int emulate_instruction(struct kvm_vcpu *vcpu, | |||
4043 | return EMULATE_DONE; | 4616 | return EMULATE_DONE; |
4044 | } | 4617 | } |
4045 | 4618 | ||
4046 | /* this is needed for vmware backdor interface to work since it | 4619 | /* this is needed for vmware backdoor interface to work since it |
4047 | changes registers values during IO operation */ | 4620 | changes registers values during IO operation */ |
4048 | memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); | 4621 | if (vcpu->arch.emulate_regs_need_sync_from_vcpu) { |
4622 | vcpu->arch.emulate_regs_need_sync_from_vcpu = false; | ||
4623 | memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); | ||
4624 | } | ||
4049 | 4625 | ||
4050 | restart: | 4626 | restart: |
4051 | r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); | 4627 | r = x86_emulate_insn(&vcpu->arch.emulate_ctxt); |
4628 | |||
4629 | if (r == EMULATION_INTERCEPTED) | ||
4630 | return EMULATE_DONE; | ||
4052 | 4631 | ||
4053 | if (r) { /* emulation failed */ | 4632 | if (r == EMULATION_FAILED) { |
4054 | if (reexecute_instruction(vcpu, cr2)) | 4633 | if (reexecute_instruction(vcpu, cr2)) |
4055 | return EMULATE_DONE; | 4634 | return EMULATE_DONE; |
4056 | 4635 | ||
4057 | return handle_emulation_failure(vcpu); | 4636 | return handle_emulation_failure(vcpu); |
4058 | } | 4637 | } |
4059 | 4638 | ||
4060 | toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility); | 4639 | if (vcpu->arch.emulate_ctxt.have_exception) { |
4061 | kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); | ||
4062 | memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); | ||
4063 | kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); | ||
4064 | |||
4065 | if (vcpu->arch.emulate_ctxt.exception >= 0) { | ||
4066 | inject_emulated_exception(vcpu); | 4640 | inject_emulated_exception(vcpu); |
4067 | return EMULATE_DONE; | 4641 | r = EMULATE_DONE; |
4068 | } | 4642 | } else if (vcpu->arch.pio.count) { |
4069 | |||
4070 | if (vcpu->arch.pio.count) { | ||
4071 | if (!vcpu->arch.pio.in) | 4643 | if (!vcpu->arch.pio.in) |
4072 | vcpu->arch.pio.count = 0; | 4644 | vcpu->arch.pio.count = 0; |
4073 | return EMULATE_DO_MMIO; | 4645 | else |
4074 | } | 4646 | writeback = false; |
4075 | 4647 | r = EMULATE_DO_MMIO; | |
4076 | if (vcpu->mmio_needed) { | 4648 | } else if (vcpu->mmio_needed) { |
4077 | if (vcpu->mmio_is_write) | 4649 | if (!vcpu->mmio_is_write) |
4078 | vcpu->mmio_needed = 0; | 4650 | writeback = false; |
4079 | return EMULATE_DO_MMIO; | 4651 | r = EMULATE_DO_MMIO; |
4080 | } | 4652 | } else if (r == EMULATION_RESTART) |
4081 | |||
4082 | if (vcpu->arch.emulate_ctxt.restart) | ||
4083 | goto restart; | 4653 | goto restart; |
4654 | else | ||
4655 | r = EMULATE_DONE; | ||
4656 | |||
4657 | if (writeback) { | ||
4658 | toggle_interruptibility(vcpu, | ||
4659 | vcpu->arch.emulate_ctxt.interruptibility); | ||
4660 | kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); | ||
4661 | kvm_make_request(KVM_REQ_EVENT, vcpu); | ||
4662 | memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); | ||
4663 | vcpu->arch.emulate_regs_need_sync_to_vcpu = false; | ||
4664 | kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); | ||
4665 | } else | ||
4666 | vcpu->arch.emulate_regs_need_sync_to_vcpu = true; | ||
4084 | 4667 | ||
4085 | return EMULATE_DONE; | 4668 | return r; |
4086 | } | 4669 | } |
4087 | EXPORT_SYMBOL_GPL(emulate_instruction); | 4670 | EXPORT_SYMBOL_GPL(x86_emulate_instruction); |
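x86_emulate_instruction() now defers the register/RIP/RFLAGS writeback whenever the instruction is still waiting for inbound PIO or MMIO data, and the emulate_regs_need_sync_* flags record which side holds the fresher state. A toy model of that decision; the names and cases are invented and the real function handles more outcomes:

#include <stdio.h>

enum outcome { DONE, NEED_PIO_IN, NEED_MMIO_IN, FAILED };

/* Returns 1 if the caller should copy emulator state back into the vcpu now. */
static int should_write_back(enum outcome r)
{
	switch (r) {
	case NEED_PIO_IN:
	case NEED_MMIO_IN:
		return 0;	/* data not available yet; commit after the exit completes */
	case DONE:
	case FAILED:
	default:
		return 1;
	}
}

int main(void)
{
	printf("mmio-in: writeback=%d\n", should_write_back(NEED_MMIO_IN));
	printf("done   : writeback=%d\n", should_write_back(DONE));
	return 0;
}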
4088 | 4671 | ||
4089 | int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port) | 4672 | int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port) |
4090 | { | 4673 | { |
4091 | unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX); | 4674 | unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX); |
4092 | int ret = emulator_pio_out_emulated(size, port, &val, 1, vcpu); | 4675 | int ret = emulator_pio_out_emulated(&vcpu->arch.emulate_ctxt, |
4676 | size, port, &val, 1); | ||
4093 | /* do not return to emulator after return from userspace */ | 4677 | /* do not return to emulator after return from userspace */ |
4094 | vcpu->arch.pio.count = 0; | 4678 | vcpu->arch.pio.count = 0; |
4095 | return ret; | 4679 | return ret; |
4096 | } | 4680 | } |
4097 | EXPORT_SYMBOL_GPL(kvm_fast_pio_out); | 4681 | EXPORT_SYMBOL_GPL(kvm_fast_pio_out); |
4098 | 4682 | ||
4099 | static void bounce_off(void *info) | 4683 | static void tsc_bad(void *info) |
4100 | { | 4684 | { |
4101 | /* nothing */ | 4685 | __this_cpu_write(cpu_tsc_khz, 0); |
4686 | } | ||
4687 | |||
4688 | static void tsc_khz_changed(void *data) | ||
4689 | { | ||
4690 | struct cpufreq_freqs *freq = data; | ||
4691 | unsigned long khz = 0; | ||
4692 | |||
4693 | if (data) | ||
4694 | khz = freq->new; | ||
4695 | else if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) | ||
4696 | khz = cpufreq_quick_get(raw_smp_processor_id()); | ||
4697 | if (!khz) | ||
4698 | khz = tsc_khz; | ||
4699 | __this_cpu_write(cpu_tsc_khz, khz); | ||
4102 | } | 4700 | } |
4103 | 4701 | ||
4104 | static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val, | 4702 | static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val, |
@@ -4109,24 +4707,63 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va | |||
4109 | struct kvm_vcpu *vcpu; | 4707 | struct kvm_vcpu *vcpu; |
4110 | int i, send_ipi = 0; | 4708 | int i, send_ipi = 0; |
4111 | 4709 | ||
4710 | /* | ||
4711 | * We allow guests to temporarily run on slowing clocks, | ||
4712 | * provided we notify them after, or to run on accelerating | ||
4713 | * clocks, provided we notify them before. Thus time never | ||
4714 | * goes backwards. | ||
4715 | * | ||
4716 | * However, we have a problem. We can't atomically update | ||
4717 | * the frequency of a given CPU from this function; it is | ||
4718 | * merely a notifier, which can be called from any CPU. | ||
4719 | * Changing the TSC frequency at arbitrary points in time | ||
4720 | * requires a recomputation of local variables related to | ||
4721 | * the TSC for each VCPU. We must flag these local variables | ||
4722 | * to be updated and be sure the update takes place with the | ||
4723 | * new frequency before any guests proceed. | ||
4724 | * | ||
4725 | * Unfortunately, the combination of hotplug CPU and frequency | ||
4726 | * change creates an intractable locking scenario; the order | ||
4727 | * of when these callouts happen is undefined with respect to | ||
4728 | * CPU hotplug, and they can race with each other. As such, | ||
4729 | * merely setting per_cpu(cpu_tsc_khz) = X during a hotadd is | ||
4730 | * undefined; you can actually have a CPU frequency change take | ||
4731 | * place in between the computation of X and the setting of the | ||
4732 | * variable. To protect against this problem, all updates of | ||
4733 | * the per_cpu tsc_khz variable are done in an interrupt | ||
4734 | * protected IPI, and all callers wishing to update the value | ||
4735 | * must wait for a synchronous IPI to complete (which is trivial | ||
4736 | * if the caller is on the CPU already). This establishes the | ||
4737 | * necessary total order on variable updates. | ||
4738 | * | ||
4739 | * Note that because a guest time update may take place | ||
4740 | * anytime after the setting of the VCPU's request bit, the | ||
4741 | * correct TSC value must be set before the request. However, | ||
4742 | * to ensure the update actually makes it to any guest which | ||
4743 | * starts running in hardware virtualization between the set | ||
4744 | * and the acquisition of the spinlock, we must also ping the | ||
4745 | * CPU after setting the request bit. | ||
4746 | * | ||
4747 | */ | ||
4748 | |||
4112 | if (val == CPUFREQ_PRECHANGE && freq->old > freq->new) | 4749 | if (val == CPUFREQ_PRECHANGE && freq->old > freq->new) |
4113 | return 0; | 4750 | return 0; |
4114 | if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new) | 4751 | if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new) |
4115 | return 0; | 4752 | return 0; |
4116 | per_cpu(cpu_tsc_khz, freq->cpu) = freq->new; | ||
4117 | 4753 | ||
4118 | spin_lock(&kvm_lock); | 4754 | smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1); |
4755 | |||
4756 | raw_spin_lock(&kvm_lock); | ||
4119 | list_for_each_entry(kvm, &vm_list, vm_list) { | 4757 | list_for_each_entry(kvm, &vm_list, vm_list) { |
4120 | kvm_for_each_vcpu(i, vcpu, kvm) { | 4758 | kvm_for_each_vcpu(i, vcpu, kvm) { |
4121 | if (vcpu->cpu != freq->cpu) | 4759 | if (vcpu->cpu != freq->cpu) |
4122 | continue; | 4760 | continue; |
4123 | if (!kvm_request_guest_time_update(vcpu)) | 4761 | kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); |
4124 | continue; | ||
4125 | if (vcpu->cpu != smp_processor_id()) | 4762 | if (vcpu->cpu != smp_processor_id()) |
4126 | send_ipi++; | 4763 | send_ipi = 1; |
4127 | } | 4764 | } |
4128 | } | 4765 | } |
4129 | spin_unlock(&kvm_lock); | 4766 | raw_spin_unlock(&kvm_lock); |
4130 | 4767 | ||
4131 | if (freq->old < freq->new && send_ipi) { | 4768 | if (freq->old < freq->new && send_ipi) { |
4132 | /* | 4769 | /* |
@@ -4141,32 +4778,59 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va | |||
4141 | * guest context is entered kvmclock will be updated, | 4778 | * guest context is entered kvmclock will be updated, |
4142 | * so the guest will not see stale values. | 4779 | * so the guest will not see stale values. |
4143 | */ | 4780 | */ |
4144 | smp_call_function_single(freq->cpu, bounce_off, NULL, 1); | 4781 | smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1); |
4145 | } | 4782 | } |
4146 | return 0; | 4783 | return 0; |
4147 | } | 4784 | } |
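The comment block added above states the invariant the notifier enforces: a guest may briefly run on a clock that has slowed as long as it is told afterwards, and must be told before running on a faster one, so guest time never appears to go backwards. That is why PRECHANGE events are ignored for slowdowns and POSTCHANGE events for speedups. The filter on its own:

#include <stdio.h>

enum phase { PRECHANGE, POSTCHANGE };

/* Return 1 when this notification is the one we must act on. */
static int should_update(enum phase phase, unsigned long old_khz,
			 unsigned long new_khz)
{
	if (phase == PRECHANGE && old_khz > new_khz)
		return 0;	/* slowing down: wait for POSTCHANGE */
	if (phase == POSTCHANGE && old_khz < new_khz)
		return 0;	/* speeding up: already handled at PRECHANGE */
	return 1;
}

int main(void)
{
	printf("slowdown, PRECHANGE : %d\n", should_update(PRECHANGE, 2000000, 1000000));
	printf("slowdown, POSTCHANGE: %d\n", should_update(POSTCHANGE, 2000000, 1000000));
	printf("speedup,  PRECHANGE : %d\n", should_update(PRECHANGE, 1000000, 2000000));
	printf("speedup,  POSTCHANGE: %d\n", should_update(POSTCHANGE, 1000000, 2000000));
	return 0;
}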
4148 | 4785 | ||
4149 | static struct notifier_block kvmclock_cpufreq_notifier_block = { | 4786 | static struct notifier_block kvmclock_cpufreq_notifier_block = { |
4150 | .notifier_call = kvmclock_cpufreq_notifier | 4787 | .notifier_call = kvmclock_cpufreq_notifier |
4788 | }; | ||
4789 | |||
4790 | static int kvmclock_cpu_notifier(struct notifier_block *nfb, | ||
4791 | unsigned long action, void *hcpu) | ||
4792 | { | ||
4793 | unsigned int cpu = (unsigned long)hcpu; | ||
4794 | |||
4795 | switch (action) { | ||
4796 | case CPU_ONLINE: | ||
4797 | case CPU_DOWN_FAILED: | ||
4798 | smp_call_function_single(cpu, tsc_khz_changed, NULL, 1); | ||
4799 | break; | ||
4800 | case CPU_DOWN_PREPARE: | ||
4801 | smp_call_function_single(cpu, tsc_bad, NULL, 1); | ||
4802 | break; | ||
4803 | } | ||
4804 | return NOTIFY_OK; | ||
4805 | } | ||
4806 | |||
4807 | static struct notifier_block kvmclock_cpu_notifier_block = { | ||
4808 | .notifier_call = kvmclock_cpu_notifier, | ||
4809 | .priority = -INT_MAX | ||
4151 | }; | 4810 | }; |
4152 | 4811 | ||
4153 | static void kvm_timer_init(void) | 4812 | static void kvm_timer_init(void) |
4154 | { | 4813 | { |
4155 | int cpu; | 4814 | int cpu; |
4156 | 4815 | ||
4816 | max_tsc_khz = tsc_khz; | ||
4817 | register_hotcpu_notifier(&kvmclock_cpu_notifier_block); | ||
4157 | if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { | 4818 | if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { |
4819 | #ifdef CONFIG_CPU_FREQ | ||
4820 | struct cpufreq_policy policy; | ||
4821 | memset(&policy, 0, sizeof(policy)); | ||
4822 | cpu = get_cpu(); | ||
4823 | cpufreq_get_policy(&policy, cpu); | ||
4824 | if (policy.cpuinfo.max_freq) | ||
4825 | max_tsc_khz = policy.cpuinfo.max_freq; | ||
4826 | put_cpu(); | ||
4827 | #endif | ||
4158 | cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block, | 4828 | cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block, |
4159 | CPUFREQ_TRANSITION_NOTIFIER); | 4829 | CPUFREQ_TRANSITION_NOTIFIER); |
4160 | for_each_online_cpu(cpu) { | ||
4161 | unsigned long khz = cpufreq_get(cpu); | ||
4162 | if (!khz) | ||
4163 | khz = tsc_khz; | ||
4164 | per_cpu(cpu_tsc_khz, cpu) = khz; | ||
4165 | } | ||
4166 | } else { | ||
4167 | for_each_possible_cpu(cpu) | ||
4168 | per_cpu(cpu_tsc_khz, cpu) = tsc_khz; | ||
4169 | } | 4830 | } |
4831 | pr_debug("kvm: max_tsc_khz = %ld\n", max_tsc_khz); | ||
4832 | for_each_online_cpu(cpu) | ||
4833 | smp_call_function_single(cpu, tsc_khz_changed, NULL, 1); | ||
4170 | } | 4834 | } |
4171 | 4835 | ||
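The rewritten kvm_timer_init() above seeds max_tsc_khz from tsc_khz and, on hosts without a constant TSC, raises it to the cpufreq policy's cpuinfo.max_freq before broadcasting tsc_khz_changed to every online CPU. Below is a minimal user-space sketch of the same "prefer the cpufreq maximum, otherwise fall back" probe; the sysfs path and helper name are assumptions for illustration only, not anything KVM itself uses.

    /* Sketch: read the cpufreq maximum for cpu0 in kHz, falling back to a
     * caller-supplied default when the sysfs node is absent -- analogous to
     * kvm_timer_init() preferring policy.cpuinfo.max_freq over tsc_khz.
     * The sysfs path is an assumption about the host. */
    #include <stdio.h>

    static unsigned long max_cpu_khz(unsigned long fallback_khz)
    {
        FILE *f = fopen("/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq", "r");
        unsigned long khz = 0;

        if (f) {
            if (fscanf(f, "%lu", &khz) != 1)
                khz = 0;
            fclose(f);
        }
        return khz ? khz : fallback_khz;
    }

    int main(void)
    {
        /* 2000000 kHz plays the role of tsc_khz here. */
        printf("max khz: %lu\n", max_cpu_khz(2000000UL));
        return 0;
    }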
4172 | static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu); | 4836 | static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu); |
@@ -4244,7 +4908,6 @@ int kvm_arch_init(void *opaque) | |||
4244 | 4908 | ||
4245 | kvm_x86_ops = ops; | 4909 | kvm_x86_ops = ops; |
4246 | kvm_mmu_set_nonpresent_ptes(0ull, 0ull); | 4910 | kvm_mmu_set_nonpresent_ptes(0ull, 0ull); |
4247 | kvm_mmu_set_base_ptes(PT_PRESENT_MASK); | ||
4248 | kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, | 4911 | kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, |
4249 | PT_DIRTY_MASK, PT64_NX_MASK, 0); | 4912 | PT_DIRTY_MASK, PT64_NX_MASK, 0); |
4250 | 4913 | ||
@@ -4268,6 +4931,7 @@ void kvm_arch_exit(void) | |||
4268 | if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) | 4931 | if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) |
4269 | cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block, | 4932 | cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block, |
4270 | CPUFREQ_TRANSITION_NOTIFIER); | 4933 | CPUFREQ_TRANSITION_NOTIFIER); |
4934 | unregister_hotcpu_notifier(&kvmclock_cpu_notifier_block); | ||
4271 | kvm_x86_ops = NULL; | 4935 | kvm_x86_ops = NULL; |
4272 | kvm_mmu_module_exit(); | 4936 | kvm_mmu_module_exit(); |
4273 | } | 4937 | } |
@@ -4403,8 +5067,9 @@ out: | |||
4403 | } | 5067 | } |
4404 | EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); | 5068 | EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); |
4405 | 5069 | ||
4406 | int kvm_fix_hypercall(struct kvm_vcpu *vcpu) | 5070 | int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt) |
4407 | { | 5071 | { |
5072 | struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); | ||
4408 | char instruction[3]; | 5073 | char instruction[3]; |
4409 | unsigned long rip = kvm_rip_read(vcpu); | 5074 | unsigned long rip = kvm_rip_read(vcpu); |
4410 | 5075 | ||
@@ -4417,21 +5082,8 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu) | |||
4417 | 5082 | ||
4418 | kvm_x86_ops->patch_hypercall(vcpu, instruction); | 5083 | kvm_x86_ops->patch_hypercall(vcpu, instruction); |
4419 | 5084 | ||
4420 | return emulator_write_emulated(rip, instruction, 3, NULL, vcpu); | 5085 | return emulator_write_emulated(&vcpu->arch.emulate_ctxt, |
4421 | } | 5086 | rip, instruction, 3, NULL); |
4422 | |||
4423 | void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) | ||
4424 | { | ||
4425 | struct desc_ptr dt = { limit, base }; | ||
4426 | |||
4427 | kvm_x86_ops->set_gdt(vcpu, &dt); | ||
4428 | } | ||
4429 | |||
4430 | void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) | ||
4431 | { | ||
4432 | struct desc_ptr dt = { limit, base }; | ||
4433 | |||
4434 | kvm_x86_ops->set_idt(vcpu, &dt); | ||
4435 | } | 5087 | } |
4436 | 5088 | ||
4437 | static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) | 5089 | static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) |
@@ -4482,12 +5134,6 @@ struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, | |||
4482 | best = e; | 5134 | best = e; |
4483 | break; | 5135 | break; |
4484 | } | 5136 | } |
4485 | /* | ||
4486 | * Both basic or both extended? | ||
4487 | */ | ||
4488 | if (((e->function ^ function) & 0x80000000) == 0) | ||
4489 | if (!best || e->function > best->function) | ||
4490 | best = e; | ||
4491 | } | 5137 | } |
4492 | return best; | 5138 | return best; |
4493 | } | 5139 | } |
@@ -4507,6 +5153,27 @@ not_found: | |||
4507 | return 36; | 5153 | return 36; |
4508 | } | 5154 | } |
4509 | 5155 | ||
5156 | /* | ||
5157 | * If no match is found, check whether we exceed the vCPU's limit | ||
5158 | * and return the content of the highest valid _standard_ leaf instead. | ||
5159 | * This is to satisfy the CPUID specification. | ||
5160 | */ | ||
5161 | static struct kvm_cpuid_entry2* check_cpuid_limit(struct kvm_vcpu *vcpu, | ||
5162 | u32 function, u32 index) | ||
5163 | { | ||
5164 | struct kvm_cpuid_entry2 *maxlevel; | ||
5165 | |||
5166 | maxlevel = kvm_find_cpuid_entry(vcpu, function & 0x80000000, 0); | ||
5167 | if (!maxlevel || maxlevel->eax >= function) | ||
5168 | return NULL; | ||
5169 | if (function & 0x80000000) { | ||
5170 | maxlevel = kvm_find_cpuid_entry(vcpu, 0, 0); | ||
5171 | if (!maxlevel) | ||
5172 | return NULL; | ||
5173 | } | ||
5174 | return kvm_find_cpuid_entry(vcpu, maxlevel->eax, index); | ||
5175 | } | ||
5176 | |||
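check_cpuid_limit() encodes the architectural rule that a CPUID request above the reported maximum leaf is answered with the contents of the highest valid standard leaf. The sketch below probes that behaviour from user space with the compiler-provided <cpuid.h> helpers; the exact out-of-range result is CPU- and configuration-dependent, so treat the output as illustrative rather than a conformance check.

    /* Probe how this CPU answers an out-of-range CPUID leaf.
     * Needs an x86 compiler that ships <cpuid.h> (GCC/Clang). */
    #include <stdio.h>
    #include <cpuid.h>

    int main(void)
    {
        unsigned int a, b, c, d, max_basic;

        __cpuid(0, a, b, c, d);              /* leaf 0: EAX = highest basic leaf */
        max_basic = a;

        __cpuid(max_basic + 1, a, b, c, d);  /* one past the advertised limit */
        printf("highest basic leaf: 0x%x\n", max_basic);
        printf("out-of-range leaf:  %08x %08x %08x %08x\n", a, b, c, d);
        return 0;
    }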
4510 | void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) | 5177 | void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) |
4511 | { | 5178 | { |
4512 | u32 function, index; | 5179 | u32 function, index; |
@@ -4519,6 +5186,10 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) | |||
4519 | kvm_register_write(vcpu, VCPU_REGS_RCX, 0); | 5186 | kvm_register_write(vcpu, VCPU_REGS_RCX, 0); |
4520 | kvm_register_write(vcpu, VCPU_REGS_RDX, 0); | 5187 | kvm_register_write(vcpu, VCPU_REGS_RDX, 0); |
4521 | best = kvm_find_cpuid_entry(vcpu, function, index); | 5188 | best = kvm_find_cpuid_entry(vcpu, function, index); |
5189 | |||
5190 | if (!best) | ||
5191 | best = check_cpuid_limit(vcpu, function, index); | ||
5192 | |||
4522 | if (best) { | 5193 | if (best) { |
4523 | kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax); | 5194 | kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax); |
4524 | kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx); | 5195 | kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx); |
@@ -4675,6 +5346,7 @@ static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu) | |||
4675 | static int vcpu_enter_guest(struct kvm_vcpu *vcpu) | 5346 | static int vcpu_enter_guest(struct kvm_vcpu *vcpu) |
4676 | { | 5347 | { |
4677 | int r; | 5348 | int r; |
5349 | bool nmi_pending; | ||
4678 | bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && | 5350 | bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && |
4679 | vcpu->run->request_interrupt_window; | 5351 | vcpu->run->request_interrupt_window; |
4680 | 5352 | ||
@@ -4683,8 +5355,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) | |||
4683 | kvm_mmu_unload(vcpu); | 5355 | kvm_mmu_unload(vcpu); |
4684 | if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) | 5356 | if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) |
4685 | __kvm_migrate_timers(vcpu); | 5357 | __kvm_migrate_timers(vcpu); |
4686 | if (kvm_check_request(KVM_REQ_KVMCLOCK_UPDATE, vcpu)) | 5358 | if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) { |
4687 | kvm_write_guest_time(vcpu); | 5359 | r = kvm_guest_time_update(vcpu); |
5360 | if (unlikely(r)) | ||
5361 | goto out; | ||
5362 | } | ||
4688 | if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu)) | 5363 | if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu)) |
4689 | kvm_mmu_sync_roots(vcpu); | 5364 | kvm_mmu_sync_roots(vcpu); |
4690 | if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) | 5365 | if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) |
@@ -4703,12 +5378,41 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) | |||
4703 | vcpu->fpu_active = 0; | 5378 | vcpu->fpu_active = 0; |
4704 | kvm_x86_ops->fpu_deactivate(vcpu); | 5379 | kvm_x86_ops->fpu_deactivate(vcpu); |
4705 | } | 5380 | } |
5381 | if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) { | ||
5382 | /* Page is swapped out. Do synthetic halt */ | ||
5383 | vcpu->arch.apf.halted = true; | ||
5384 | r = 1; | ||
5385 | goto out; | ||
5386 | } | ||
4706 | } | 5387 | } |
4707 | 5388 | ||
4708 | r = kvm_mmu_reload(vcpu); | 5389 | r = kvm_mmu_reload(vcpu); |
4709 | if (unlikely(r)) | 5390 | if (unlikely(r)) |
4710 | goto out; | 5391 | goto out; |
4711 | 5392 | ||
5393 | /* | ||
5394 | * An NMI can be injected between local nmi_pending read and | ||
5395 | * vcpu->arch.nmi_pending read inside inject_pending_event(). | ||
5396 | * But in that case, KVM_REQ_EVENT will be set, which makes | ||
5397 | * the race described above benign. | ||
5398 | */ | ||
5399 | nmi_pending = ACCESS_ONCE(vcpu->arch.nmi_pending); | ||
5400 | |||
5401 | if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { | ||
5402 | inject_pending_event(vcpu); | ||
5403 | |||
5404 | /* enable NMI/IRQ window open exits if needed */ | ||
5405 | if (nmi_pending) | ||
5406 | kvm_x86_ops->enable_nmi_window(vcpu); | ||
5407 | else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) | ||
5408 | kvm_x86_ops->enable_irq_window(vcpu); | ||
5409 | |||
5410 | if (kvm_lapic_enabled(vcpu)) { | ||
5411 | update_cr8_intercept(vcpu); | ||
5412 | kvm_lapic_sync_to_vapic(vcpu); | ||
5413 | } | ||
5414 | } | ||
5415 | |||
4712 | preempt_disable(); | 5416 | preempt_disable(); |
4713 | 5417 | ||
4714 | kvm_x86_ops->prepare_guest_switch(vcpu); | 5418 | kvm_x86_ops->prepare_guest_switch(vcpu); |
@@ -4716,34 +5420,26 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) | |||
4716 | kvm_load_guest_fpu(vcpu); | 5420 | kvm_load_guest_fpu(vcpu); |
4717 | kvm_load_guest_xcr0(vcpu); | 5421 | kvm_load_guest_xcr0(vcpu); |
4718 | 5422 | ||
4719 | atomic_set(&vcpu->guest_mode, 1); | 5423 | vcpu->mode = IN_GUEST_MODE; |
4720 | smp_wmb(); | 5424 | |
5425 | /* We should set ->mode before checking ->requests, | ||
5426 | * see the comment in make_all_cpus_request. | ||
5427 | */ | ||
5428 | smp_mb(); | ||
4721 | 5429 | ||
4722 | local_irq_disable(); | 5430 | local_irq_disable(); |
4723 | 5431 | ||
4724 | if (!atomic_read(&vcpu->guest_mode) || vcpu->requests | 5432 | if (vcpu->mode == EXITING_GUEST_MODE || vcpu->requests |
4725 | || need_resched() || signal_pending(current)) { | 5433 | || need_resched() || signal_pending(current)) { |
4726 | atomic_set(&vcpu->guest_mode, 0); | 5434 | vcpu->mode = OUTSIDE_GUEST_MODE; |
4727 | smp_wmb(); | 5435 | smp_wmb(); |
4728 | local_irq_enable(); | 5436 | local_irq_enable(); |
4729 | preempt_enable(); | 5437 | preempt_enable(); |
5438 | kvm_x86_ops->cancel_injection(vcpu); | ||
4730 | r = 1; | 5439 | r = 1; |
4731 | goto out; | 5440 | goto out; |
4732 | } | 5441 | } |
4733 | 5442 | ||
4734 | inject_pending_event(vcpu); | ||
4735 | |||
4736 | /* enable NMI/IRQ window open exits if needed */ | ||
4737 | if (vcpu->arch.nmi_pending) | ||
4738 | kvm_x86_ops->enable_nmi_window(vcpu); | ||
4739 | else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) | ||
4740 | kvm_x86_ops->enable_irq_window(vcpu); | ||
4741 | |||
4742 | if (kvm_lapic_enabled(vcpu)) { | ||
4743 | update_cr8_intercept(vcpu); | ||
4744 | kvm_lapic_sync_to_vapic(vcpu); | ||
4745 | } | ||
4746 | |||
4747 | srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); | 5443 | srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); |
4748 | 5444 | ||
4749 | kvm_guest_enter(); | 5445 | kvm_guest_enter(); |
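The reordering in this hunk also tightens the entry handshake: the vCPU publishes vcpu->mode = IN_GUEST_MODE, executes a full barrier, and only then checks vcpu->requests, while kvm_vcpu_kick() posts the request before reading the mode, so at least one side always observes the other. A minimal sketch of the same pattern with C11 seq_cst atomics standing in for smp_mb(); the names and structure here are illustrative, not KVM's.

    /* Sketch of the mode/request handshake using C11 atomics; seq_cst
     * operations play the role of smp_mb(). Illustrative only. */
    #include <stdatomic.h>
    #include <stdio.h>
    #include <pthread.h>

    enum { OUTSIDE_GUEST, IN_GUEST };

    static atomic_int mode = OUTSIDE_GUEST;
    static atomic_int request;
    static atomic_int kicked;

    static void *vcpu_thread(void *arg)
    {
        (void)arg;
        atomic_store(&mode, IN_GUEST);          /* publish the mode ... */
        if (atomic_load(&request))              /* ... then check requests */
            atomic_store(&mode, OUTSIDE_GUEST); /* bail out before "entering" */
        return NULL;
    }

    static void *kicker_thread(void *arg)
    {
        (void)arg;
        atomic_store(&request, 1);              /* post the request ... */
        if (atomic_load(&mode) == IN_GUEST)     /* ... then check the mode */
            atomic_store(&kicked, 1);           /* would send the reschedule IPI */
        return NULL;
    }

    int main(void)
    {
        pthread_t a, b;

        pthread_create(&a, NULL, vcpu_thread, NULL);
        pthread_create(&b, NULL, kicker_thread, NULL);
        pthread_join(a, NULL);
        pthread_join(b, NULL);

        /* Invariant: the vcpu saw the request, or the kicker saw IN_GUEST. */
        printf("request seen: %d, kicked: %d\n",
               atomic_load(&mode) == OUTSIDE_GUEST, atomic_load(&kicked));
        return 0;
    }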
@@ -4769,7 +5465,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) | |||
4769 | if (hw_breakpoint_active()) | 5465 | if (hw_breakpoint_active()) |
4770 | hw_breakpoint_restore(); | 5466 | hw_breakpoint_restore(); |
4771 | 5467 | ||
4772 | atomic_set(&vcpu->guest_mode, 0); | 5468 | kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc); |
5469 | |||
5470 | vcpu->mode = OUTSIDE_GUEST_MODE; | ||
4773 | smp_wmb(); | 5471 | smp_wmb(); |
4774 | local_irq_enable(); | 5472 | local_irq_enable(); |
4775 | 5473 | ||
@@ -4826,7 +5524,8 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) | |||
4826 | 5524 | ||
4827 | r = 1; | 5525 | r = 1; |
4828 | while (r > 0) { | 5526 | while (r > 0) { |
4829 | if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) | 5527 | if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && |
5528 | !vcpu->arch.apf.halted) | ||
4830 | r = vcpu_enter_guest(vcpu); | 5529 | r = vcpu_enter_guest(vcpu); |
4831 | else { | 5530 | else { |
4832 | srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); | 5531 | srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); |
@@ -4839,6 +5538,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) | |||
4839 | vcpu->arch.mp_state = | 5538 | vcpu->arch.mp_state = |
4840 | KVM_MP_STATE_RUNNABLE; | 5539 | KVM_MP_STATE_RUNNABLE; |
4841 | case KVM_MP_STATE_RUNNABLE: | 5540 | case KVM_MP_STATE_RUNNABLE: |
5541 | vcpu->arch.apf.halted = false; | ||
4842 | break; | 5542 | break; |
4843 | case KVM_MP_STATE_SIPI_RECEIVED: | 5543 | case KVM_MP_STATE_SIPI_RECEIVED: |
4844 | default: | 5544 | default: |
@@ -4860,6 +5560,9 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) | |||
4860 | vcpu->run->exit_reason = KVM_EXIT_INTR; | 5560 | vcpu->run->exit_reason = KVM_EXIT_INTR; |
4861 | ++vcpu->stat.request_irq_exits; | 5561 | ++vcpu->stat.request_irq_exits; |
4862 | } | 5562 | } |
5563 | |||
5564 | kvm_check_async_pf_completion(vcpu); | ||
5565 | |||
4863 | if (signal_pending(current)) { | 5566 | if (signal_pending(current)) { |
4864 | r = -EINTR; | 5567 | r = -EINTR; |
4865 | vcpu->run->exit_reason = KVM_EXIT_INTR; | 5568 | vcpu->run->exit_reason = KVM_EXIT_INTR; |
@@ -4879,11 +5582,49 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) | |||
4879 | return r; | 5582 | return r; |
4880 | } | 5583 | } |
4881 | 5584 | ||
5585 | static int complete_mmio(struct kvm_vcpu *vcpu) | ||
5586 | { | ||
5587 | struct kvm_run *run = vcpu->run; | ||
5588 | int r; | ||
5589 | |||
5590 | if (!(vcpu->arch.pio.count || vcpu->mmio_needed)) | ||
5591 | return 1; | ||
5592 | |||
5593 | if (vcpu->mmio_needed) { | ||
5594 | vcpu->mmio_needed = 0; | ||
5595 | if (!vcpu->mmio_is_write) | ||
5596 | memcpy(vcpu->mmio_data + vcpu->mmio_index, | ||
5597 | run->mmio.data, 8); | ||
5598 | vcpu->mmio_index += 8; | ||
5599 | if (vcpu->mmio_index < vcpu->mmio_size) { | ||
5600 | run->exit_reason = KVM_EXIT_MMIO; | ||
5601 | run->mmio.phys_addr = vcpu->mmio_phys_addr + vcpu->mmio_index; | ||
5602 | memcpy(run->mmio.data, vcpu->mmio_data + vcpu->mmio_index, 8); | ||
5603 | run->mmio.len = min(vcpu->mmio_size - vcpu->mmio_index, 8); | ||
5604 | run->mmio.is_write = vcpu->mmio_is_write; | ||
5605 | vcpu->mmio_needed = 1; | ||
5606 | return 0; | ||
5607 | } | ||
5608 | if (vcpu->mmio_is_write) | ||
5609 | return 1; | ||
5610 | vcpu->mmio_read_completed = 1; | ||
5611 | } | ||
5612 | vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); | ||
5613 | r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE); | ||
5614 | srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); | ||
5615 | if (r != EMULATE_DONE) | ||
5616 | return 0; | ||
5617 | return 1; | ||
5618 | } | ||
5619 | |||
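complete_mmio() drains an MMIO access wider than 8 bytes across several exits to userspace: each exit carries one 8-byte chunk in run->mmio, mmio_index advances by 8, and instruction emulation resumes only once the whole access has been transferred. Below is a standalone sketch of that chunking state machine, with a plain buffer standing in for the kvm_run area; the struct and function names are made up for illustration.

    /* Sketch of chunked MMIO completion: a "device access" larger than
     * 8 bytes is handed out one 8-byte chunk per simulated exit. */
    #include <stdio.h>
    #include <string.h>

    struct mmio_state {
        unsigned char data[32];
        unsigned int index;
        unsigned int size;
    };

    /* Hands out the next chunk; returns 1 once the final chunk has gone. */
    static int complete_chunk(struct mmio_state *s, unsigned char *run_data)
    {
        if (s->index >= s->size)
            return 1;
        memcpy(run_data, s->data + s->index, 8);   /* next chunk to "userspace" */
        s->index += 8;
        return s->index >= s->size;
    }

    int main(void)
    {
        struct mmio_state s = { .size = 24 };
        unsigned char run_data[8];
        int chunks = 0;

        memset(s.data, 0xab, sizeof(s.data));
        for (;;) {
            int done = complete_chunk(&s, run_data);
            chunks++;
            if (done)
                break;
        }
        printf("transferred in %d chunks of 8 bytes\n", chunks);
        return 0;
    }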
4882 | int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | 5620 | int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) |
4883 | { | 5621 | { |
4884 | int r; | 5622 | int r; |
4885 | sigset_t sigsaved; | 5623 | sigset_t sigsaved; |
4886 | 5624 | ||
5625 | if (!tsk_used_math(current) && init_fpu(current)) | ||
5626 | return -ENOMEM; | ||
5627 | |||
4887 | if (vcpu->sigset_active) | 5628 | if (vcpu->sigset_active) |
4888 | sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); | 5629 | sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); |
4889 | 5630 | ||
@@ -4895,24 +5636,17 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
4895 | } | 5636 | } |
4896 | 5637 | ||
4897 | /* re-sync apic's tpr */ | 5638 | /* re-sync apic's tpr */ |
4898 | if (!irqchip_in_kernel(vcpu->kvm)) | 5639 | if (!irqchip_in_kernel(vcpu->kvm)) { |
4899 | kvm_set_cr8(vcpu, kvm_run->cr8); | 5640 | if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) { |
4900 | 5641 | r = -EINVAL; | |
4901 | if (vcpu->arch.pio.count || vcpu->mmio_needed || | ||
4902 | vcpu->arch.emulate_ctxt.restart) { | ||
4903 | if (vcpu->mmio_needed) { | ||
4904 | memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); | ||
4905 | vcpu->mmio_read_completed = 1; | ||
4906 | vcpu->mmio_needed = 0; | ||
4907 | } | ||
4908 | vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); | ||
4909 | r = emulate_instruction(vcpu, 0, 0, EMULTYPE_NO_DECODE); | ||
4910 | srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); | ||
4911 | if (r != EMULATE_DONE) { | ||
4912 | r = 0; | ||
4913 | goto out; | 5642 | goto out; |
4914 | } | 5643 | } |
4915 | } | 5644 | } |
5645 | |||
5646 | r = complete_mmio(vcpu); | ||
5647 | if (r <= 0) | ||
5648 | goto out; | ||
5649 | |||
4916 | if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) | 5650 | if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) |
4917 | kvm_register_write(vcpu, VCPU_REGS_RAX, | 5651 | kvm_register_write(vcpu, VCPU_REGS_RAX, |
4918 | kvm_run->hypercall.ret); | 5652 | kvm_run->hypercall.ret); |
@@ -4929,6 +5663,18 @@ out: | |||
4929 | 5663 | ||
4930 | int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) | 5664 | int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) |
4931 | { | 5665 | { |
5666 | if (vcpu->arch.emulate_regs_need_sync_to_vcpu) { | ||
5667 | /* | ||
5668 | * We are here if userspace calls get_regs() in the middle of | ||
5669 | * instruction emulation. Register state needs to be copied | ||
5670 | * back from emulation context to vcpu. Userspace shouldn't do | ||
5671 | * that usually, but some badly designed PV devices (vmware | ||
5672 | * backdoor interface) need this to work | ||
5673 | */ | ||
5674 | struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; | ||
5675 | memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); | ||
5676 | vcpu->arch.emulate_regs_need_sync_to_vcpu = false; | ||
5677 | } | ||
4932 | regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); | 5678 | regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); |
4933 | regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX); | 5679 | regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX); |
4934 | regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX); | 5680 | regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX); |
@@ -4956,6 +5702,9 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) | |||
4956 | 5702 | ||
4957 | int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) | 5703 | int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) |
4958 | { | 5704 | { |
5705 | vcpu->arch.emulate_regs_need_sync_from_vcpu = true; | ||
5706 | vcpu->arch.emulate_regs_need_sync_to_vcpu = false; | ||
5707 | |||
4959 | kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax); | 5708 | kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax); |
4960 | kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx); | 5709 | kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx); |
4961 | kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx); | 5710 | kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx); |
@@ -4980,6 +5729,8 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) | |||
4980 | 5729 | ||
4981 | vcpu->arch.exception.pending = false; | 5730 | vcpu->arch.exception.pending = false; |
4982 | 5731 | ||
5732 | kvm_make_request(KVM_REQ_EVENT, vcpu); | ||
5733 | |||
4983 | return 0; | 5734 | return 0; |
4984 | } | 5735 | } |
4985 | 5736 | ||
@@ -5017,7 +5768,7 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, | |||
5017 | 5768 | ||
5018 | sregs->cr0 = kvm_read_cr0(vcpu); | 5769 | sregs->cr0 = kvm_read_cr0(vcpu); |
5019 | sregs->cr2 = vcpu->arch.cr2; | 5770 | sregs->cr2 = vcpu->arch.cr2; |
5020 | sregs->cr3 = vcpu->arch.cr3; | 5771 | sregs->cr3 = kvm_read_cr3(vcpu); |
5021 | sregs->cr4 = kvm_read_cr4(vcpu); | 5772 | sregs->cr4 = kvm_read_cr4(vcpu); |
5022 | sregs->cr8 = kvm_get_cr8(vcpu); | 5773 | sregs->cr8 = kvm_get_cr8(vcpu); |
5023 | sregs->efer = vcpu->arch.efer; | 5774 | sregs->efer = vcpu->arch.efer; |
@@ -5043,6 +5794,7 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, | |||
5043 | struct kvm_mp_state *mp_state) | 5794 | struct kvm_mp_state *mp_state) |
5044 | { | 5795 | { |
5045 | vcpu->arch.mp_state = mp_state->mp_state; | 5796 | vcpu->arch.mp_state = mp_state->mp_state; |
5797 | kvm_make_request(KVM_REQ_EVENT, vcpu); | ||
5046 | return 0; | 5798 | return 0; |
5047 | } | 5799 | } |
5048 | 5800 | ||
@@ -5050,24 +5802,11 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, | |||
5050 | bool has_error_code, u32 error_code) | 5802 | bool has_error_code, u32 error_code) |
5051 | { | 5803 | { |
5052 | struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; | 5804 | struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; |
5053 | int cs_db, cs_l, ret; | 5805 | int ret; |
5054 | cache_all_regs(vcpu); | ||
5055 | |||
5056 | kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); | ||
5057 | 5806 | ||
5058 | vcpu->arch.emulate_ctxt.vcpu = vcpu; | 5807 | init_emulate_ctxt(vcpu); |
5059 | vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu); | ||
5060 | vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu); | ||
5061 | vcpu->arch.emulate_ctxt.mode = | ||
5062 | (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : | ||
5063 | (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) | ||
5064 | ? X86EMUL_MODE_VM86 : cs_l | ||
5065 | ? X86EMUL_MODE_PROT64 : cs_db | ||
5066 | ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; | ||
5067 | memset(c, 0, sizeof(struct decode_cache)); | ||
5068 | memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); | ||
5069 | 5808 | ||
5070 | ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, &emulate_ops, | 5809 | ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, |
5071 | tss_selector, reason, has_error_code, | 5810 | tss_selector, reason, has_error_code, |
5072 | error_code); | 5811 | error_code); |
5073 | 5812 | ||
@@ -5076,7 +5815,8 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, | |||
5076 | 5815 | ||
5077 | memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); | 5816 | memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); |
5078 | kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); | 5817 | kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); |
5079 | kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); | 5818 | kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); |
5819 | kvm_make_request(KVM_REQ_EVENT, vcpu); | ||
5080 | return EMULATE_DONE; | 5820 | return EMULATE_DONE; |
5081 | } | 5821 | } |
5082 | EXPORT_SYMBOL_GPL(kvm_task_switch); | 5822 | EXPORT_SYMBOL_GPL(kvm_task_switch); |
@@ -5085,7 +5825,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, | |||
5085 | struct kvm_sregs *sregs) | 5825 | struct kvm_sregs *sregs) |
5086 | { | 5826 | { |
5087 | int mmu_reset_needed = 0; | 5827 | int mmu_reset_needed = 0; |
5088 | int pending_vec, max_bits; | 5828 | int pending_vec, max_bits, idx; |
5089 | struct desc_ptr dt; | 5829 | struct desc_ptr dt; |
5090 | 5830 | ||
5091 | dt.size = sregs->idt.limit; | 5831 | dt.size = sregs->idt.limit; |
@@ -5096,8 +5836,9 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, | |||
5096 | kvm_x86_ops->set_gdt(vcpu, &dt); | 5836 | kvm_x86_ops->set_gdt(vcpu, &dt); |
5097 | 5837 | ||
5098 | vcpu->arch.cr2 = sregs->cr2; | 5838 | vcpu->arch.cr2 = sregs->cr2; |
5099 | mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3; | 5839 | mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3; |
5100 | vcpu->arch.cr3 = sregs->cr3; | 5840 | vcpu->arch.cr3 = sregs->cr3; |
5841 | __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); | ||
5101 | 5842 | ||
5102 | kvm_set_cr8(vcpu, sregs->cr8); | 5843 | kvm_set_cr8(vcpu, sregs->cr8); |
5103 | 5844 | ||
@@ -5111,10 +5852,15 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, | |||
5111 | 5852 | ||
5112 | mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; | 5853 | mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; |
5113 | kvm_x86_ops->set_cr4(vcpu, sregs->cr4); | 5854 | kvm_x86_ops->set_cr4(vcpu, sregs->cr4); |
5855 | if (sregs->cr4 & X86_CR4_OSXSAVE) | ||
5856 | update_cpuid(vcpu); | ||
5857 | |||
5858 | idx = srcu_read_lock(&vcpu->kvm->srcu); | ||
5114 | if (!is_long_mode(vcpu) && is_pae(vcpu)) { | 5859 | if (!is_long_mode(vcpu) && is_pae(vcpu)) { |
5115 | load_pdptrs(vcpu, vcpu->arch.cr3); | 5860 | load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)); |
5116 | mmu_reset_needed = 1; | 5861 | mmu_reset_needed = 1; |
5117 | } | 5862 | } |
5863 | srcu_read_unlock(&vcpu->kvm->srcu, idx); | ||
5118 | 5864 | ||
5119 | if (mmu_reset_needed) | 5865 | if (mmu_reset_needed) |
5120 | kvm_mmu_reset_context(vcpu); | 5866 | kvm_mmu_reset_context(vcpu); |
@@ -5125,8 +5871,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, | |||
5125 | if (pending_vec < max_bits) { | 5871 | if (pending_vec < max_bits) { |
5126 | kvm_queue_interrupt(vcpu, pending_vec, false); | 5872 | kvm_queue_interrupt(vcpu, pending_vec, false); |
5127 | pr_debug("Set back pending irq %d\n", pending_vec); | 5873 | pr_debug("Set back pending irq %d\n", pending_vec); |
5128 | if (irqchip_in_kernel(vcpu->kvm)) | ||
5129 | kvm_pic_clear_isr_ack(vcpu->kvm); | ||
5130 | } | 5874 | } |
5131 | 5875 | ||
5132 | kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); | 5876 | kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); |
@@ -5147,6 +5891,8 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, | |||
5147 | !is_protmode(vcpu)) | 5891 | !is_protmode(vcpu)) |
5148 | vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; | 5892 | vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; |
5149 | 5893 | ||
5894 | kvm_make_request(KVM_REQ_EVENT, vcpu); | ||
5895 | |||
5150 | return 0; | 5896 | return 0; |
5151 | } | 5897 | } |
5152 | 5898 | ||
@@ -5320,10 +6066,7 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) | |||
5320 | 6066 | ||
5321 | void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) | 6067 | void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) |
5322 | { | 6068 | { |
5323 | if (vcpu->arch.time_page) { | 6069 | kvmclock_reset(vcpu); |
5324 | kvm_release_page_dirty(vcpu->arch.time_page); | ||
5325 | vcpu->arch.time_page = NULL; | ||
5326 | } | ||
5327 | 6070 | ||
5328 | free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); | 6071 | free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); |
5329 | fx_free(vcpu); | 6072 | fx_free(vcpu); |
@@ -5333,6 +6076,10 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) | |||
5333 | struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, | 6076 | struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, |
5334 | unsigned int id) | 6077 | unsigned int id) |
5335 | { | 6078 | { |
6079 | if (check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0) | ||
6080 | printk_once(KERN_WARNING | ||
6081 | "kvm: SMP vm created on host with unstable TSC; " | ||
6082 | "guest TSC will not be reliable\n"); | ||
5336 | return kvm_x86_ops->vcpu_create(kvm, id); | 6083 | return kvm_x86_ops->vcpu_create(kvm, id); |
5337 | } | 6084 | } |
5338 | 6085 | ||
@@ -5357,6 +6104,8 @@ free_vcpu: | |||
5357 | 6104 | ||
5358 | void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) | 6105 | void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) |
5359 | { | 6106 | { |
6107 | vcpu->arch.apf.msr_val = 0; | ||
6108 | |||
5360 | vcpu_load(vcpu); | 6109 | vcpu_load(vcpu); |
5361 | kvm_mmu_unload(vcpu); | 6110 | kvm_mmu_unload(vcpu); |
5362 | vcpu_put(vcpu); | 6111 | vcpu_put(vcpu); |
@@ -5375,22 +6124,29 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) | |||
5375 | vcpu->arch.dr6 = DR6_FIXED_1; | 6124 | vcpu->arch.dr6 = DR6_FIXED_1; |
5376 | vcpu->arch.dr7 = DR7_FIXED_1; | 6125 | vcpu->arch.dr7 = DR7_FIXED_1; |
5377 | 6126 | ||
6127 | kvm_make_request(KVM_REQ_EVENT, vcpu); | ||
6128 | vcpu->arch.apf.msr_val = 0; | ||
6129 | |||
6130 | kvmclock_reset(vcpu); | ||
6131 | |||
6132 | kvm_clear_async_pf_completion_queue(vcpu); | ||
6133 | kvm_async_pf_hash_reset(vcpu); | ||
6134 | vcpu->arch.apf.halted = false; | ||
6135 | |||
5378 | return kvm_x86_ops->vcpu_reset(vcpu); | 6136 | return kvm_x86_ops->vcpu_reset(vcpu); |
5379 | } | 6137 | } |
5380 | 6138 | ||
5381 | int kvm_arch_hardware_enable(void *garbage) | 6139 | int kvm_arch_hardware_enable(void *garbage) |
5382 | { | 6140 | { |
5383 | /* | 6141 | struct kvm *kvm; |
5384 | * Since this may be called from a hotplug notifcation, | 6142 | struct kvm_vcpu *vcpu; |
5385 | * we can't get the CPU frequency directly. | 6143 | int i; |
5386 | */ | ||
5387 | if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { | ||
5388 | int cpu = raw_smp_processor_id(); | ||
5389 | per_cpu(cpu_tsc_khz, cpu) = 0; | ||
5390 | } | ||
5391 | 6144 | ||
5392 | kvm_shared_msr_cpu_online(); | 6145 | kvm_shared_msr_cpu_online(); |
5393 | 6146 | list_for_each_entry(kvm, &vm_list, vm_list) | |
6147 | kvm_for_each_vcpu(i, vcpu, kvm) | ||
6148 | if (vcpu->cpu == smp_processor_id()) | ||
6149 | kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); | ||
5394 | return kvm_x86_ops->hardware_enable(garbage); | 6150 | return kvm_x86_ops->hardware_enable(garbage); |
5395 | } | 6151 | } |
5396 | 6152 | ||
@@ -5424,7 +6180,11 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) | |||
5424 | BUG_ON(vcpu->kvm == NULL); | 6180 | BUG_ON(vcpu->kvm == NULL); |
5425 | kvm = vcpu->kvm; | 6181 | kvm = vcpu->kvm; |
5426 | 6182 | ||
6183 | vcpu->arch.emulate_ctxt.ops = &emulate_ops; | ||
6184 | vcpu->arch.walk_mmu = &vcpu->arch.mmu; | ||
5427 | vcpu->arch.mmu.root_hpa = INVALID_PAGE; | 6185 | vcpu->arch.mmu.root_hpa = INVALID_PAGE; |
6186 | vcpu->arch.mmu.translate_gpa = translate_gpa; | ||
6187 | vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa; | ||
5428 | if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu)) | 6188 | if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu)) |
5429 | vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; | 6189 | vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; |
5430 | else | 6190 | else |
@@ -5437,6 +6197,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) | |||
5437 | } | 6197 | } |
5438 | vcpu->arch.pio_data = page_address(page); | 6198 | vcpu->arch.pio_data = page_address(page); |
5439 | 6199 | ||
6200 | kvm_init_tsc_catchup(vcpu, max_tsc_khz); | ||
6201 | |||
5440 | r = kvm_mmu_create(vcpu); | 6202 | r = kvm_mmu_create(vcpu); |
5441 | if (r < 0) | 6203 | if (r < 0) |
5442 | goto fail_free_pio_data; | 6204 | goto fail_free_pio_data; |
@@ -5458,6 +6220,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) | |||
5458 | if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) | 6220 | if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) |
5459 | goto fail_free_mce_banks; | 6221 | goto fail_free_mce_banks; |
5460 | 6222 | ||
6223 | kvm_async_pf_hash_reset(vcpu); | ||
6224 | |||
5461 | return 0; | 6225 | return 0; |
5462 | fail_free_mce_banks: | 6226 | fail_free_mce_banks: |
5463 | kfree(vcpu->arch.mce_banks); | 6227 | kfree(vcpu->arch.mce_banks); |
@@ -5483,22 +6247,17 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) | |||
5483 | free_page((unsigned long)vcpu->arch.pio_data); | 6247 | free_page((unsigned long)vcpu->arch.pio_data); |
5484 | } | 6248 | } |
5485 | 6249 | ||
5486 | struct kvm *kvm_arch_create_vm(void) | 6250 | int kvm_arch_init_vm(struct kvm *kvm) |
5487 | { | 6251 | { |
5488 | struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL); | ||
5489 | |||
5490 | if (!kvm) | ||
5491 | return ERR_PTR(-ENOMEM); | ||
5492 | |||
5493 | INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); | 6252 | INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); |
5494 | INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); | 6253 | INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); |
5495 | 6254 | ||
5496 | /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ | 6255 | /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ |
5497 | set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); | 6256 | set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); |
5498 | 6257 | ||
5499 | rdtscll(kvm->arch.vm_init_tsc); | 6258 | raw_spin_lock_init(&kvm->arch.tsc_write_lock); |
5500 | 6259 | ||
5501 | return kvm; | 6260 | return 0; |
5502 | } | 6261 | } |
5503 | 6262 | ||
5504 | static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) | 6263 | static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) |
@@ -5516,8 +6275,10 @@ static void kvm_free_vcpus(struct kvm *kvm) | |||
5516 | /* | 6275 | /* |
5517 | * Unpin any mmu pages first. | 6276 | * Unpin any mmu pages first. |
5518 | */ | 6277 | */ |
5519 | kvm_for_each_vcpu(i, vcpu, kvm) | 6278 | kvm_for_each_vcpu(i, vcpu, kvm) { |
6279 | kvm_clear_async_pf_completion_queue(vcpu); | ||
5520 | kvm_unload_vcpu_mmu(vcpu); | 6280 | kvm_unload_vcpu_mmu(vcpu); |
6281 | } | ||
5521 | kvm_for_each_vcpu(i, vcpu, kvm) | 6282 | kvm_for_each_vcpu(i, vcpu, kvm) |
5522 | kvm_arch_vcpu_free(vcpu); | 6283 | kvm_arch_vcpu_free(vcpu); |
5523 | 6284 | ||
@@ -5541,13 +6302,10 @@ void kvm_arch_destroy_vm(struct kvm *kvm) | |||
5541 | kfree(kvm->arch.vpic); | 6302 | kfree(kvm->arch.vpic); |
5542 | kfree(kvm->arch.vioapic); | 6303 | kfree(kvm->arch.vioapic); |
5543 | kvm_free_vcpus(kvm); | 6304 | kvm_free_vcpus(kvm); |
5544 | kvm_free_physmem(kvm); | ||
5545 | if (kvm->arch.apic_access_page) | 6305 | if (kvm->arch.apic_access_page) |
5546 | put_page(kvm->arch.apic_access_page); | 6306 | put_page(kvm->arch.apic_access_page); |
5547 | if (kvm->arch.ept_identity_pagetable) | 6307 | if (kvm->arch.ept_identity_pagetable) |
5548 | put_page(kvm->arch.ept_identity_pagetable); | 6308 | put_page(kvm->arch.ept_identity_pagetable); |
5549 | cleanup_srcu_struct(&kvm->srcu); | ||
5550 | kfree(kvm); | ||
5551 | } | 6309 | } |
5552 | 6310 | ||
5553 | int kvm_arch_prepare_memory_region(struct kvm *kvm, | 6311 | int kvm_arch_prepare_memory_region(struct kvm *kvm, |
@@ -5595,7 +6353,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, | |||
5595 | int user_alloc) | 6353 | int user_alloc) |
5596 | { | 6354 | { |
5597 | 6355 | ||
5598 | int npages = mem->memory_size >> PAGE_SHIFT; | 6356 | int nr_mmu_pages = 0, npages = mem->memory_size >> PAGE_SHIFT; |
5599 | 6357 | ||
5600 | if (!user_alloc && !old.user_alloc && old.rmap && !npages) { | 6358 | if (!user_alloc && !old.user_alloc && old.rmap && !npages) { |
5601 | int ret; | 6359 | int ret; |
@@ -5610,12 +6368,12 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, | |||
5610 | "failed to munmap memory\n"); | 6368 | "failed to munmap memory\n"); |
5611 | } | 6369 | } |
5612 | 6370 | ||
6371 | if (!kvm->arch.n_requested_mmu_pages) | ||
6372 | nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm); | ||
6373 | |||
5613 | spin_lock(&kvm->mmu_lock); | 6374 | spin_lock(&kvm->mmu_lock); |
5614 | if (!kvm->arch.n_requested_mmu_pages) { | 6375 | if (nr_mmu_pages) |
5615 | unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm); | ||
5616 | kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); | 6376 | kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); |
5617 | } | ||
5618 | |||
5619 | kvm_mmu_slot_remove_write_access(kvm, mem->slot); | 6377 | kvm_mmu_slot_remove_write_access(kvm, mem->slot); |
5620 | spin_unlock(&kvm->mmu_lock); | 6378 | spin_unlock(&kvm->mmu_lock); |
5621 | } | 6379 | } |
@@ -5628,7 +6386,9 @@ void kvm_arch_flush_shadow(struct kvm *kvm) | |||
5628 | 6386 | ||
5629 | int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) | 6387 | int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) |
5630 | { | 6388 | { |
5631 | return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE | 6389 | return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && |
6390 | !vcpu->arch.apf.halted) | ||
6391 | || !list_empty_careful(&vcpu->async_pf.done) | ||
5632 | || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED | 6392 | || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED |
5633 | || vcpu->arch.nmi_pending || | 6393 | || vcpu->arch.nmi_pending || |
5634 | (kvm_arch_interrupt_allowed(vcpu) && | 6394 | (kvm_arch_interrupt_allowed(vcpu) && |
@@ -5647,7 +6407,7 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu) | |||
5647 | 6407 | ||
5648 | me = get_cpu(); | 6408 | me = get_cpu(); |
5649 | if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) | 6409 | if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) |
5650 | if (atomic_xchg(&vcpu->guest_mode, 0)) | 6410 | if (kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE) |
5651 | smp_send_reschedule(cpu); | 6411 | smp_send_reschedule(cpu); |
5652 | put_cpu(); | 6412 | put_cpu(); |
5653 | } | 6413 | } |
@@ -5683,9 +6443,151 @@ void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) | |||
5683 | kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip)) | 6443 | kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip)) |
5684 | rflags |= X86_EFLAGS_TF; | 6444 | rflags |= X86_EFLAGS_TF; |
5685 | kvm_x86_ops->set_rflags(vcpu, rflags); | 6445 | kvm_x86_ops->set_rflags(vcpu, rflags); |
6446 | kvm_make_request(KVM_REQ_EVENT, vcpu); | ||
5686 | } | 6447 | } |
5687 | EXPORT_SYMBOL_GPL(kvm_set_rflags); | 6448 | EXPORT_SYMBOL_GPL(kvm_set_rflags); |
5688 | 6449 | ||
6450 | void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) | ||
6451 | { | ||
6452 | int r; | ||
6453 | |||
6454 | if ((vcpu->arch.mmu.direct_map != work->arch.direct_map) || | ||
6455 | is_error_page(work->page)) | ||
6456 | return; | ||
6457 | |||
6458 | r = kvm_mmu_reload(vcpu); | ||
6459 | if (unlikely(r)) | ||
6460 | return; | ||
6461 | |||
6462 | if (!vcpu->arch.mmu.direct_map && | ||
6463 | work->arch.cr3 != vcpu->arch.mmu.get_cr3(vcpu)) | ||
6464 | return; | ||
6465 | |||
6466 | vcpu->arch.mmu.page_fault(vcpu, work->gva, 0, true); | ||
6467 | } | ||
6468 | |||
6469 | static inline u32 kvm_async_pf_hash_fn(gfn_t gfn) | ||
6470 | { | ||
6471 | return hash_32(gfn & 0xffffffff, order_base_2(ASYNC_PF_PER_VCPU)); | ||
6472 | } | ||
6473 | |||
6474 | static inline u32 kvm_async_pf_next_probe(u32 key) | ||
6475 | { | ||
6476 | return (key + 1) & (roundup_pow_of_two(ASYNC_PF_PER_VCPU) - 1); | ||
6477 | } | ||
6478 | |||
6479 | static void kvm_add_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) | ||
6480 | { | ||
6481 | u32 key = kvm_async_pf_hash_fn(gfn); | ||
6482 | |||
6483 | while (vcpu->arch.apf.gfns[key] != ~0) | ||
6484 | key = kvm_async_pf_next_probe(key); | ||
6485 | |||
6486 | vcpu->arch.apf.gfns[key] = gfn; | ||
6487 | } | ||
6488 | |||
6489 | static u32 kvm_async_pf_gfn_slot(struct kvm_vcpu *vcpu, gfn_t gfn) | ||
6490 | { | ||
6491 | int i; | ||
6492 | u32 key = kvm_async_pf_hash_fn(gfn); | ||
6493 | |||
6494 | for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU) && | ||
6495 | (vcpu->arch.apf.gfns[key] != gfn && | ||
6496 | vcpu->arch.apf.gfns[key] != ~0); i++) | ||
6497 | key = kvm_async_pf_next_probe(key); | ||
6498 | |||
6499 | return key; | ||
6500 | } | ||
6501 | |||
6502 | bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) | ||
6503 | { | ||
6504 | return vcpu->arch.apf.gfns[kvm_async_pf_gfn_slot(vcpu, gfn)] == gfn; | ||
6505 | } | ||
6506 | |||
6507 | static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) | ||
6508 | { | ||
6509 | u32 i, j, k; | ||
6510 | |||
6511 | i = j = kvm_async_pf_gfn_slot(vcpu, gfn); | ||
6512 | while (true) { | ||
6513 | vcpu->arch.apf.gfns[i] = ~0; | ||
6514 | do { | ||
6515 | j = kvm_async_pf_next_probe(j); | ||
6516 | if (vcpu->arch.apf.gfns[j] == ~0) | ||
6517 | return; | ||
6518 | k = kvm_async_pf_hash_fn(vcpu->arch.apf.gfns[j]); | ||
6519 | /* | ||
6520 | * k lies cyclically in ]i,j] | ||
6521 | * | i.k.j | | ||
6522 | * |....j i.k.| or |.k..j i...| | ||
6523 | */ | ||
6524 | } while ((i <= j) ? (i < k && k <= j) : (i < k || k <= j)); | ||
6525 | vcpu->arch.apf.gfns[i] = vcpu->arch.apf.gfns[j]; | ||
6526 | i = j; | ||
6527 | } | ||
6528 | } | ||
6529 | |||
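kvm_del_async_pf_gfn() is the classic deletion step for an open-addressed, linearly probed hash table: after emptying slot i it scans forward and shifts back any later entry whose home slot does not lie cyclically in ]i, j], so lookups never stop early at a false empty slot. The following is a compact user-space version of the same add/find/delete trio over a small power-of-two table, using ~0 as the empty marker; it mirrors the structure of the code above but is not the kernel code itself.

    /* Open-addressed hash with back-shift deletion, mirroring the
     * gfn-hash logic above on a tiny 8-slot table. */
    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>

    #define NSLOTS 8U                      /* must be a power of two */
    #define EMPTY  (~(uint64_t)0)

    static uint64_t slots[NSLOTS];

    static uint32_t hash_fn(uint64_t key)  { return (uint32_t)(key * 2654435761u) & (NSLOTS - 1); }
    static uint32_t next_probe(uint32_t k) { return (k + 1) & (NSLOTS - 1); }

    static void add_key(uint64_t key)
    {
        uint32_t k = hash_fn(key);
        while (slots[k] != EMPTY)
            k = next_probe(k);
        slots[k] = key;
    }

    static uint32_t find_slot(uint64_t key)
    {
        uint32_t k = hash_fn(key);
        while (slots[k] != key && slots[k] != EMPTY)
            k = next_probe(k);
        return k;
    }

    static void del_key(uint64_t key)      /* assumes the key is present */
    {
        uint32_t i, j, k;

        i = j = find_slot(key);
        for (;;) {
            slots[i] = EMPTY;
            do {
                j = next_probe(j);
                if (slots[j] == EMPTY)
                    return;
                k = hash_fn(slots[j]);
                /* keep scanning while k lies cyclically in ]i, j] */
            } while ((i <= j) ? (i < k && k <= j) : (i < k || k <= j));
            slots[i] = slots[j];           /* shift the entry back into the hole */
            i = j;
        }
    }

    int main(void)
    {
        memset(slots, 0xff, sizeof(slots));   /* every slot starts EMPTY */
        add_key(1); add_key(9); add_key(17);  /* all share the same home slot */
        del_key(1);
        printf("9 found: %d, 17 found: %d\n",
               slots[find_slot(9)] == 9, slots[find_slot(17)] == 17);
        return 0;
    }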
6530 | static int apf_put_user(struct kvm_vcpu *vcpu, u32 val) | ||
6531 | { | ||
6532 | |||
6533 | return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val, | ||
6534 | sizeof(val)); | ||
6535 | } | ||
6536 | |||
6537 | void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, | ||
6538 | struct kvm_async_pf *work) | ||
6539 | { | ||
6540 | struct x86_exception fault; | ||
6541 | |||
6542 | trace_kvm_async_pf_not_present(work->arch.token, work->gva); | ||
6543 | kvm_add_async_pf_gfn(vcpu, work->arch.gfn); | ||
6544 | |||
6545 | if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) || | ||
6546 | (vcpu->arch.apf.send_user_only && | ||
6547 | kvm_x86_ops->get_cpl(vcpu) == 0)) | ||
6548 | kvm_make_request(KVM_REQ_APF_HALT, vcpu); | ||
6549 | else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_NOT_PRESENT)) { | ||
6550 | fault.vector = PF_VECTOR; | ||
6551 | fault.error_code_valid = true; | ||
6552 | fault.error_code = 0; | ||
6553 | fault.nested_page_fault = false; | ||
6554 | fault.address = work->arch.token; | ||
6555 | kvm_inject_page_fault(vcpu, &fault); | ||
6556 | } | ||
6557 | } | ||
6558 | |||
6559 | void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, | ||
6560 | struct kvm_async_pf *work) | ||
6561 | { | ||
6562 | struct x86_exception fault; | ||
6563 | |||
6564 | trace_kvm_async_pf_ready(work->arch.token, work->gva); | ||
6565 | if (is_error_page(work->page)) | ||
6566 | work->arch.token = ~0; /* broadcast wakeup */ | ||
6567 | else | ||
6568 | kvm_del_async_pf_gfn(vcpu, work->arch.gfn); | ||
6569 | |||
6570 | if ((vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) && | ||
6571 | !apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) { | ||
6572 | fault.vector = PF_VECTOR; | ||
6573 | fault.error_code_valid = true; | ||
6574 | fault.error_code = 0; | ||
6575 | fault.nested_page_fault = false; | ||
6576 | fault.address = work->arch.token; | ||
6577 | kvm_inject_page_fault(vcpu, &fault); | ||
6578 | } | ||
6579 | vcpu->arch.apf.halted = false; | ||
6580 | } | ||
6581 | |||
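Taken together, kvm_arch_async_page_not_present() and kvm_arch_async_page_present() implement a token handshake with the guest: a PAGE_NOT_PRESENT fault parks the faulting context under work->arch.token, and a later PAGE_READY fault with the same token (or the ~0 broadcast token) releases it. The sketch below is a purely conceptual model of that bookkeeping with invented data structures; the real guest-side handling lives elsewhere in the tree.

    /* Conceptual sketch of the token handshake: PAGE_NOT_PRESENT parks work
     * under a token, PAGE_READY (or the ~0 broadcast token) releases it. */
    #include <stdio.h>
    #include <stdint.h>

    #define MAX_TOKENS 4
    static uint32_t parked[MAX_TOKENS];
    static unsigned int nparked;

    static void page_not_present(uint32_t token)
    {
        if (nparked < MAX_TOKENS)
            parked[nparked++] = token;                 /* park the faulting context */
    }

    static void page_ready(uint32_t token)
    {
        unsigned int i = 0;

        while (i < nparked) {
            if (token == ~0u || parked[i] == token)    /* ~0: wake everyone */
                parked[i] = parked[--nparked];         /* unpark */
            else
                i++;
        }
    }

    int main(void)
    {
        page_not_present(0x10);
        page_not_present(0x20);
        page_ready(0x10);
        printf("still parked: %u\n", nparked);    /* 1 */
        page_ready(~0u);
        printf("after broadcast: %u\n", nparked); /* 0 */
        return 0;
    }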
6582 | bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu) | ||
6583 | { | ||
6584 | if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED)) | ||
6585 | return true; | ||
6586 | else | ||
6587 | return !kvm_event_needs_reinjection(vcpu) && | ||
6588 | kvm_x86_ops->interrupt_allowed(vcpu); | ||
6589 | } | ||
6590 | |||
5689 | EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); | 6591 | EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); |
5690 | EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); | 6592 | EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); |
5691 | EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault); | 6593 | EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault); |
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index b7a404722d2b..e407ed3df817 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h | |||
@@ -50,6 +50,11 @@ static inline int is_long_mode(struct kvm_vcpu *vcpu) | |||
50 | #endif | 50 | #endif |
51 | } | 51 | } |
52 | 52 | ||
53 | static inline bool mmu_is_nested(struct kvm_vcpu *vcpu) | ||
54 | { | ||
55 | return vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu; | ||
56 | } | ||
57 | |||
53 | static inline int is_pae(struct kvm_vcpu *vcpu) | 58 | static inline int is_pae(struct kvm_vcpu *vcpu) |
54 | { | 59 | { |
55 | return kvm_read_cr4_bits(vcpu, X86_CR4_PAE); | 60 | return kvm_read_cr4_bits(vcpu, X86_CR4_PAE); |
@@ -65,7 +70,15 @@ static inline int is_paging(struct kvm_vcpu *vcpu) | |||
65 | return kvm_read_cr0_bits(vcpu, X86_CR0_PG); | 70 | return kvm_read_cr0_bits(vcpu, X86_CR0_PG); |
66 | } | 71 | } |
67 | 72 | ||
73 | static inline u32 bit(int bitno) | ||
74 | { | ||
75 | return 1 << (bitno & 31); | ||
76 | } | ||
77 | |||
68 | void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); | 78 | void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); |
69 | void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); | 79 | void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); |
80 | int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); | ||
81 | |||
82 | void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data); | ||
70 | 83 | ||
71 | #endif | 84 | #endif |