path: root/arch/x86/kvm
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r--  arch/x86/kvm/emulate.c      | 1247
-rw-r--r--  arch/x86/kvm/i8259.c        |   53
-rw-r--r--  arch/x86/kvm/irq.h          |    1
-rw-r--r--  arch/x86/kvm/kvm_timer.h    |    4
-rw-r--r--  arch/x86/kvm/mmu.c          |  198
-rw-r--r--  arch/x86/kvm/mmutrace.h     |   13
-rw-r--r--  arch/x86/kvm/paging_tmpl.h  |   37
-rw-r--r--  arch/x86/kvm/svm.c          |  916
-rw-r--r--  arch/x86/kvm/timer.c        |    3
-rw-r--r--  arch/x86/kvm/trace.h        |  165
-rw-r--r--  arch/x86/kvm/vmx.c          |  297
-rw-r--r--  arch/x86/kvm/x86.c          | 1506
-rw-r--r--  arch/x86/kvm/x86.h          |    7
13 files changed, 2553 insertions, 1894 deletions
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 4dade6ac0827..5ac0bb465ed6 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -33,6 +33,7 @@
33#include <asm/kvm_emulate.h> 33#include <asm/kvm_emulate.h>
34 34
35#include "x86.h" 35#include "x86.h"
36#include "tss.h"
36 37
37/* 38/*
38 * Opcode effective-address decode tables. 39 * Opcode effective-address decode tables.
@@ -50,6 +51,8 @@
50#define DstReg (2<<1) /* Register operand. */ 51#define DstReg (2<<1) /* Register operand. */
51#define DstMem (3<<1) /* Memory operand. */ 52#define DstMem (3<<1) /* Memory operand. */
52#define DstAcc (4<<1) /* Destination Accumulator */ 53#define DstAcc (4<<1) /* Destination Accumulator */
54#define DstDI (5<<1) /* Destination is in ES:(E)DI */
55#define DstMem64 (6<<1) /* 64bit memory operand */
53#define DstMask (7<<1) 56#define DstMask (7<<1)
54/* Source operand type. */ 57/* Source operand type. */
55#define SrcNone (0<<4) /* No source operand. */ 58#define SrcNone (0<<4) /* No source operand. */
@@ -63,6 +66,7 @@
63#define SrcOne (7<<4) /* Implied '1' */ 66#define SrcOne (7<<4) /* Implied '1' */
64#define SrcImmUByte (8<<4) /* 8-bit unsigned immediate operand. */ 67#define SrcImmUByte (8<<4) /* 8-bit unsigned immediate operand. */
65#define SrcImmU (9<<4) /* Immediate operand, unsigned */ 68#define SrcImmU (9<<4) /* Immediate operand, unsigned */
69#define SrcSI (0xa<<4) /* Source is in the DS:RSI */
66#define SrcMask (0xf<<4) 70#define SrcMask (0xf<<4)
67/* Generic ModRM decode. */ 71/* Generic ModRM decode. */
68#define ModRM (1<<8) 72#define ModRM (1<<8)
@@ -85,6 +89,9 @@
85#define Src2ImmByte (2<<29) 89#define Src2ImmByte (2<<29)
86#define Src2One (3<<29) 90#define Src2One (3<<29)
87#define Src2Imm16 (4<<29) 91#define Src2Imm16 (4<<29)
92#define Src2Mem16 (5<<29) /* Used for Ep encoding. First argument has to be
93 in memory and second argument is located
94 immediately after the first one in memory. */
88#define Src2Mask (7<<29) 95#define Src2Mask (7<<29)
89 96
90enum { 97enum {
@@ -147,8 +154,8 @@ static u32 opcode_table[256] = {
147 0, 0, 0, 0, 154 0, 0, 0, 0,
148 /* 0x68 - 0x6F */ 155 /* 0x68 - 0x6F */
149 SrcImm | Mov | Stack, 0, SrcImmByte | Mov | Stack, 0, 156 SrcImm | Mov | Stack, 0, SrcImmByte | Mov | Stack, 0,
150 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */ 157 DstDI | ByteOp | Mov | String, DstDI | Mov | String, /* insb, insw/insd */
151 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */ 158 SrcSI | ByteOp | ImplicitOps | String, SrcSI | ImplicitOps | String, /* outsb, outsw/outsd */
152 /* 0x70 - 0x77 */ 159 /* 0x70 - 0x77 */
153 SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, 160 SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
154 SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, 161 SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
@@ -173,12 +180,12 @@ static u32 opcode_table[256] = {
173 /* 0xA0 - 0xA7 */ 180 /* 0xA0 - 0xA7 */
174 ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs, 181 ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs,
175 ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs, 182 ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs,
176 ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, 183 ByteOp | SrcSI | DstDI | Mov | String, SrcSI | DstDI | Mov | String,
177 ByteOp | ImplicitOps | String, ImplicitOps | String, 184 ByteOp | SrcSI | DstDI | String, SrcSI | DstDI | String,
178 /* 0xA8 - 0xAF */ 185 /* 0xA8 - 0xAF */
179 0, 0, ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, 186 0, 0, ByteOp | DstDI | Mov | String, DstDI | Mov | String,
180 ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, 187 ByteOp | SrcSI | DstAcc | Mov | String, SrcSI | DstAcc | Mov | String,
181 ByteOp | ImplicitOps | String, ImplicitOps | String, 188 ByteOp | DstDI | String, DstDI | String,
182 /* 0xB0 - 0xB7 */ 189 /* 0xB0 - 0xB7 */
183 ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, 190 ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
184 ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, 191 ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
@@ -204,13 +211,13 @@ static u32 opcode_table[256] = {
204 0, 0, 0, 0, 0, 0, 0, 0, 211 0, 0, 0, 0, 0, 0, 0, 0,
205 /* 0xE0 - 0xE7 */ 212 /* 0xE0 - 0xE7 */
206 0, 0, 0, 0, 213 0, 0, 0, 0,
207 ByteOp | SrcImmUByte, SrcImmUByte, 214 ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc,
208 ByteOp | SrcImmUByte, SrcImmUByte, 215 ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc,
209 /* 0xE8 - 0xEF */ 216 /* 0xE8 - 0xEF */
210 SrcImm | Stack, SrcImm | ImplicitOps, 217 SrcImm | Stack, SrcImm | ImplicitOps,
211 SrcImmU | Src2Imm16 | No64, SrcImmByte | ImplicitOps, 218 SrcImmU | Src2Imm16 | No64, SrcImmByte | ImplicitOps,
212 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, 219 SrcNone | ByteOp | DstAcc, SrcNone | DstAcc,
213 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, 220 SrcNone | ByteOp | DstAcc, SrcNone | DstAcc,
214 /* 0xF0 - 0xF7 */ 221 /* 0xF0 - 0xF7 */
215 0, 0, 0, 0, 222 0, 0, 0, 0,
216 ImplicitOps | Priv, ImplicitOps, Group | Group3_Byte, Group | Group3, 223 ImplicitOps | Priv, ImplicitOps, Group | Group3_Byte, Group | Group3,
@@ -343,7 +350,8 @@ static u32 group_table[] = {
343 [Group5*8] = 350 [Group5*8] =
344 DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, 351 DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
345 SrcMem | ModRM | Stack, 0, 352 SrcMem | ModRM | Stack, 0,
346 SrcMem | ModRM | Stack, 0, SrcMem | ModRM | Stack, 0, 353 SrcMem | ModRM | Stack, SrcMem | ModRM | Src2Mem16 | ImplicitOps,
354 SrcMem | ModRM | Stack, 0,
347 [Group7*8] = 355 [Group7*8] =
348 0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv, 356 0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv,
349 SrcNone | ModRM | DstMem | Mov, 0, 357 SrcNone | ModRM | DstMem | Mov, 0,
@@ -353,14 +361,14 @@ static u32 group_table[] = {
353 DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM | Lock, 361 DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM | Lock,
354 DstMem | SrcImmByte | ModRM | Lock, DstMem | SrcImmByte | ModRM | Lock, 362 DstMem | SrcImmByte | ModRM | Lock, DstMem | SrcImmByte | ModRM | Lock,
355 [Group9*8] = 363 [Group9*8] =
356 0, ImplicitOps | ModRM | Lock, 0, 0, 0, 0, 0, 0, 364 0, DstMem64 | ModRM | Lock, 0, 0, 0, 0, 0, 0,
357}; 365};
358 366
359static u32 group2_table[] = { 367static u32 group2_table[] = {
360 [Group7*8] = 368 [Group7*8] =
361 SrcNone | ModRM | Priv, 0, 0, SrcNone | ModRM, 369 SrcNone | ModRM | Priv, 0, 0, SrcNone | ModRM | Priv,
362 SrcNone | ModRM | DstMem | Mov, 0, 370 SrcNone | ModRM | DstMem | Mov, 0,
363 SrcMem16 | ModRM | Mov, 0, 371 SrcMem16 | ModRM | Mov | Priv, 0,
364 [Group9*8] = 372 [Group9*8] =
365 0, 0, 0, 0, 0, 0, 0, 0, 373 0, 0, 0, 0, 0, 0, 0, 0,
366}; 374};
@@ -562,7 +570,7 @@ static u32 group2_table[] = {
562#define insn_fetch(_type, _size, _eip) \ 570#define insn_fetch(_type, _size, _eip) \
563({ unsigned long _x; \ 571({ unsigned long _x; \
564 rc = do_insn_fetch(ctxt, ops, (_eip), &_x, (_size)); \ 572 rc = do_insn_fetch(ctxt, ops, (_eip), &_x, (_size)); \
565 if (rc != 0) \ 573 if (rc != X86EMUL_CONTINUE) \
566 goto done; \ 574 goto done; \
567 (_eip) += (_size); \ 575 (_eip) += (_size); \
568 (_type)_x; \ 576 (_type)_x; \
@@ -638,40 +646,40 @@ static unsigned long ss_base(struct x86_emulate_ctxt *ctxt)
638 646
639static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, 647static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
640 struct x86_emulate_ops *ops, 648 struct x86_emulate_ops *ops,
641 unsigned long linear, u8 *dest) 649 unsigned long eip, u8 *dest)
642{ 650{
643 struct fetch_cache *fc = &ctxt->decode.fetch; 651 struct fetch_cache *fc = &ctxt->decode.fetch;
644 int rc; 652 int rc;
645 int size; 653 int size, cur_size;
646 654
647 if (linear < fc->start || linear >= fc->end) { 655 if (eip == fc->end) {
648 size = min(15UL, PAGE_SIZE - offset_in_page(linear)); 656 cur_size = fc->end - fc->start;
649 rc = ops->fetch(linear, fc->data, size, ctxt->vcpu, NULL); 657 size = min(15UL - cur_size, PAGE_SIZE - offset_in_page(eip));
650 if (rc) 658 rc = ops->fetch(ctxt->cs_base + eip, fc->data + cur_size,
659 size, ctxt->vcpu, NULL);
660 if (rc != X86EMUL_CONTINUE)
651 return rc; 661 return rc;
652 fc->start = linear; 662 fc->end += size;
653 fc->end = linear + size;
654 } 663 }
655 *dest = fc->data[linear - fc->start]; 664 *dest = fc->data[eip - fc->start];
656 return 0; 665 return X86EMUL_CONTINUE;
657} 666}
658 667
659static int do_insn_fetch(struct x86_emulate_ctxt *ctxt, 668static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
660 struct x86_emulate_ops *ops, 669 struct x86_emulate_ops *ops,
661 unsigned long eip, void *dest, unsigned size) 670 unsigned long eip, void *dest, unsigned size)
662{ 671{
663 int rc = 0; 672 int rc;
664 673
665 /* x86 instructions are limited to 15 bytes. */ 674 /* x86 instructions are limited to 15 bytes. */
666 if (eip + size - ctxt->decode.eip_orig > 15) 675 if (eip + size - ctxt->eip > 15)
667 return X86EMUL_UNHANDLEABLE; 676 return X86EMUL_UNHANDLEABLE;
668 eip += ctxt->cs_base;
669 while (size--) { 677 while (size--) {
670 rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++); 678 rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++);
671 if (rc) 679 if (rc != X86EMUL_CONTINUE)
672 return rc; 680 return rc;
673 } 681 }
674 return 0; 682 return X86EMUL_CONTINUE;
675} 683}
676 684
677/* 685/*
@@ -702,7 +710,7 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt,
702 *address = 0; 710 *address = 0;
703 rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2, 711 rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2,
704 ctxt->vcpu, NULL); 712 ctxt->vcpu, NULL);
705 if (rc) 713 if (rc != X86EMUL_CONTINUE)
706 return rc; 714 return rc;
707 rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes, 715 rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes,
708 ctxt->vcpu, NULL); 716 ctxt->vcpu, NULL);
@@ -782,7 +790,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
782 struct decode_cache *c = &ctxt->decode; 790 struct decode_cache *c = &ctxt->decode;
783 u8 sib; 791 u8 sib;
784 int index_reg = 0, base_reg = 0, scale; 792 int index_reg = 0, base_reg = 0, scale;
785 int rc = 0; 793 int rc = X86EMUL_CONTINUE;
786 794
787 if (c->rex_prefix) { 795 if (c->rex_prefix) {
788 c->modrm_reg = (c->rex_prefix & 4) << 1; /* REX.R */ 796 c->modrm_reg = (c->rex_prefix & 4) << 1; /* REX.R */
@@ -895,7 +903,7 @@ static int decode_abs(struct x86_emulate_ctxt *ctxt,
895 struct x86_emulate_ops *ops) 903 struct x86_emulate_ops *ops)
896{ 904{
897 struct decode_cache *c = &ctxt->decode; 905 struct decode_cache *c = &ctxt->decode;
898 int rc = 0; 906 int rc = X86EMUL_CONTINUE;
899 907
900 switch (c->ad_bytes) { 908 switch (c->ad_bytes) {
901 case 2: 909 case 2:
@@ -916,14 +924,18 @@ int
916x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) 924x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
917{ 925{
918 struct decode_cache *c = &ctxt->decode; 926 struct decode_cache *c = &ctxt->decode;
919 int rc = 0; 927 int rc = X86EMUL_CONTINUE;
920 int mode = ctxt->mode; 928 int mode = ctxt->mode;
921 int def_op_bytes, def_ad_bytes, group; 929 int def_op_bytes, def_ad_bytes, group;
922 930
923 /* Shadow copy of register state. Committed on successful emulation. */
924 931
932 /* we cannot decode insn before we complete previous rep insn */
933 WARN_ON(ctxt->restart);
934
935 /* Shadow copy of register state. Committed on successful emulation. */
925 memset(c, 0, sizeof(struct decode_cache)); 936 memset(c, 0, sizeof(struct decode_cache));
926 c->eip = c->eip_orig = kvm_rip_read(ctxt->vcpu); 937 c->eip = ctxt->eip;
938 c->fetch.start = c->fetch.end = c->eip;
927 ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS); 939 ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS);
928 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); 940 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
929 941
@@ -1015,11 +1027,6 @@ done_prefixes:
1015 } 1027 }
1016 } 1028 }
1017 1029
1018 if (mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
1019 kvm_report_emulation_failure(ctxt->vcpu, "invalid x86/64 instruction");
1020 return -1;
1021 }
1022
1023 if (c->d & Group) { 1030 if (c->d & Group) {
1024 group = c->d & GroupMask; 1031 group = c->d & GroupMask;
1025 c->modrm = insn_fetch(u8, 1, c->eip); 1032 c->modrm = insn_fetch(u8, 1, c->eip);
@@ -1046,7 +1053,7 @@ done_prefixes:
1046 rc = decode_modrm(ctxt, ops); 1053 rc = decode_modrm(ctxt, ops);
1047 else if (c->d & MemAbs) 1054 else if (c->d & MemAbs)
1048 rc = decode_abs(ctxt, ops); 1055 rc = decode_abs(ctxt, ops);
1049 if (rc) 1056 if (rc != X86EMUL_CONTINUE)
1050 goto done; 1057 goto done;
1051 1058
1052 if (!c->has_seg_override) 1059 if (!c->has_seg_override)
@@ -1057,6 +1064,10 @@ done_prefixes:
1057 1064
1058 if (c->ad_bytes != 8) 1065 if (c->ad_bytes != 8)
1059 c->modrm_ea = (u32)c->modrm_ea; 1066 c->modrm_ea = (u32)c->modrm_ea;
1067
1068 if (c->rip_relative)
1069 c->modrm_ea += c->eip;
1070
1060 /* 1071 /*
1061 * Decode and fetch the source operand: register, memory 1072 * Decode and fetch the source operand: register, memory
1062 * or immediate. 1073 * or immediate.
@@ -1091,6 +1102,8 @@ done_prefixes:
1091 break; 1102 break;
1092 } 1103 }
1093 c->src.type = OP_MEM; 1104 c->src.type = OP_MEM;
1105 c->src.ptr = (unsigned long *)c->modrm_ea;
1106 c->src.val = 0;
1094 break; 1107 break;
1095 case SrcImm: 1108 case SrcImm:
1096 case SrcImmU: 1109 case SrcImmU:
@@ -1139,6 +1152,14 @@ done_prefixes:
1139 c->src.bytes = 1; 1152 c->src.bytes = 1;
1140 c->src.val = 1; 1153 c->src.val = 1;
1141 break; 1154 break;
1155 case SrcSI:
1156 c->src.type = OP_MEM;
1157 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1158 c->src.ptr = (unsigned long *)
1159 register_address(c, seg_override_base(ctxt, c),
1160 c->regs[VCPU_REGS_RSI]);
1161 c->src.val = 0;
1162 break;
1142 } 1163 }
1143 1164
1144 /* 1165 /*
@@ -1168,6 +1189,12 @@ done_prefixes:
1168 c->src2.bytes = 1; 1189 c->src2.bytes = 1;
1169 c->src2.val = 1; 1190 c->src2.val = 1;
1170 break; 1191 break;
1192 case Src2Mem16:
1193 c->src2.type = OP_MEM;
1194 c->src2.bytes = 2;
1195 c->src2.ptr = (unsigned long *)(c->modrm_ea + c->src.bytes);
1196 c->src2.val = 0;
1197 break;
1171 } 1198 }
1172 1199
1173 /* Decode and fetch the destination operand: register or memory. */ 1200 /* Decode and fetch the destination operand: register or memory. */
@@ -1180,6 +1207,7 @@ done_prefixes:
1180 c->twobyte && (c->b == 0xb6 || c->b == 0xb7)); 1207 c->twobyte && (c->b == 0xb6 || c->b == 0xb7));
1181 break; 1208 break;
1182 case DstMem: 1209 case DstMem:
1210 case DstMem64:
1183 if ((c->d & ModRM) && c->modrm_mod == 3) { 1211 if ((c->d & ModRM) && c->modrm_mod == 3) {
1184 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 1212 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1185 c->dst.type = OP_REG; 1213 c->dst.type = OP_REG;
@@ -1188,12 +1216,24 @@ done_prefixes:
1188 break; 1216 break;
1189 } 1217 }
1190 c->dst.type = OP_MEM; 1218 c->dst.type = OP_MEM;
1219 c->dst.ptr = (unsigned long *)c->modrm_ea;
1220 if ((c->d & DstMask) == DstMem64)
1221 c->dst.bytes = 8;
1222 else
1223 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1224 c->dst.val = 0;
1225 if (c->d & BitOp) {
1226 unsigned long mask = ~(c->dst.bytes * 8 - 1);
1227
1228 c->dst.ptr = (void *)c->dst.ptr +
1229 (c->src.val & mask) / 8;
1230 }
1191 break; 1231 break;
1192 case DstAcc: 1232 case DstAcc:
1193 c->dst.type = OP_REG; 1233 c->dst.type = OP_REG;
1194 c->dst.bytes = c->op_bytes; 1234 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1195 c->dst.ptr = &c->regs[VCPU_REGS_RAX]; 1235 c->dst.ptr = &c->regs[VCPU_REGS_RAX];
1196 switch (c->op_bytes) { 1236 switch (c->dst.bytes) {
1197 case 1: 1237 case 1:
1198 c->dst.val = *(u8 *)c->dst.ptr; 1238 c->dst.val = *(u8 *)c->dst.ptr;
1199 break; 1239 break;
@@ -1203,18 +1243,248 @@ done_prefixes:
1203 case 4: 1243 case 4:
1204 c->dst.val = *(u32 *)c->dst.ptr; 1244 c->dst.val = *(u32 *)c->dst.ptr;
1205 break; 1245 break;
1246 case 8:
1247 c->dst.val = *(u64 *)c->dst.ptr;
1248 break;
1206 } 1249 }
1207 c->dst.orig_val = c->dst.val; 1250 c->dst.orig_val = c->dst.val;
1208 break; 1251 break;
1252 case DstDI:
1253 c->dst.type = OP_MEM;
1254 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1255 c->dst.ptr = (unsigned long *)
1256 register_address(c, es_base(ctxt),
1257 c->regs[VCPU_REGS_RDI]);
1258 c->dst.val = 0;
1259 break;
1209 } 1260 }
1210 1261
1211 if (c->rip_relative)
1212 c->modrm_ea += c->eip;
1213
1214done: 1262done:
1215 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; 1263 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
1216} 1264}
1217 1265
1266static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
1267 struct x86_emulate_ops *ops,
1268 unsigned int size, unsigned short port,
1269 void *dest)
1270{
1271 struct read_cache *rc = &ctxt->decode.io_read;
1272
1273 if (rc->pos == rc->end) { /* refill pio read ahead */
1274 struct decode_cache *c = &ctxt->decode;
1275 unsigned int in_page, n;
1276 unsigned int count = c->rep_prefix ?
1277 address_mask(c, c->regs[VCPU_REGS_RCX]) : 1;
1278 in_page = (ctxt->eflags & EFLG_DF) ?
1279 offset_in_page(c->regs[VCPU_REGS_RDI]) :
1280 PAGE_SIZE - offset_in_page(c->regs[VCPU_REGS_RDI]);
1281 n = min(min(in_page, (unsigned int)sizeof(rc->data)) / size,
1282 count);
1283 if (n == 0)
1284 n = 1;
1285 rc->pos = rc->end = 0;
1286 if (!ops->pio_in_emulated(size, port, rc->data, n, ctxt->vcpu))
1287 return 0;
1288 rc->end = n * size;
1289 }
1290
1291 memcpy(dest, rc->data + rc->pos, size);
1292 rc->pos += size;
1293 return 1;
1294}
1295
1296static u32 desc_limit_scaled(struct desc_struct *desc)
1297{
1298 u32 limit = get_desc_limit(desc);
1299
1300 return desc->g ? (limit << 12) | 0xfff : limit;
1301}
1302
1303static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
1304 struct x86_emulate_ops *ops,
1305 u16 selector, struct desc_ptr *dt)
1306{
1307 if (selector & 1 << 2) {
1308 struct desc_struct desc;
1309 memset (dt, 0, sizeof *dt);
1310 if (!ops->get_cached_descriptor(&desc, VCPU_SREG_LDTR, ctxt->vcpu))
1311 return;
1312
1313 dt->size = desc_limit_scaled(&desc); /* what if limit > 65535? */
1314 dt->address = get_desc_base(&desc);
1315 } else
1316 ops->get_gdt(dt, ctxt->vcpu);
1317}
1318
1319/* allowed just for 8-byte segments */
1320static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1321 struct x86_emulate_ops *ops,
1322 u16 selector, struct desc_struct *desc)
1323{
1324 struct desc_ptr dt;
1325 u16 index = selector >> 3;
1326 int ret;
1327 u32 err;
1328 ulong addr;
1329
1330 get_descriptor_table_ptr(ctxt, ops, selector, &dt);
1331
1332 if (dt.size < index * 8 + 7) {
1333 kvm_inject_gp(ctxt->vcpu, selector & 0xfffc);
1334 return X86EMUL_PROPAGATE_FAULT;
1335 }
1336 addr = dt.address + index * 8;
1337 ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu, &err);
1338 if (ret == X86EMUL_PROPAGATE_FAULT)
1339 kvm_inject_page_fault(ctxt->vcpu, addr, err);
1340
1341 return ret;
1342}
1343
1344/* allowed just for 8-byte segments */
1345static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1346 struct x86_emulate_ops *ops,
1347 u16 selector, struct desc_struct *desc)
1348{
1349 struct desc_ptr dt;
1350 u16 index = selector >> 3;
1351 u32 err;
1352 ulong addr;
1353 int ret;
1354
1355 get_descriptor_table_ptr(ctxt, ops, selector, &dt);
1356
1357 if (dt.size < index * 8 + 7) {
1358 kvm_inject_gp(ctxt->vcpu, selector & 0xfffc);
1359 return X86EMUL_PROPAGATE_FAULT;
1360 }
1361
1362 addr = dt.address + index * 8;
1363 ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err);
1364 if (ret == X86EMUL_PROPAGATE_FAULT)
1365 kvm_inject_page_fault(ctxt->vcpu, addr, err);
1366
1367 return ret;
1368}
1369
1370static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1371 struct x86_emulate_ops *ops,
1372 u16 selector, int seg)
1373{
1374 struct desc_struct seg_desc;
1375 u8 dpl, rpl, cpl;
1376 unsigned err_vec = GP_VECTOR;
1377 u32 err_code = 0;
1378 bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */
1379 int ret;
1380
1381 memset(&seg_desc, 0, sizeof seg_desc);
1382
1383 if ((seg <= VCPU_SREG_GS && ctxt->mode == X86EMUL_MODE_VM86)
1384 || ctxt->mode == X86EMUL_MODE_REAL) {
1385 /* set real mode segment descriptor */
1386 set_desc_base(&seg_desc, selector << 4);
1387 set_desc_limit(&seg_desc, 0xffff);
1388 seg_desc.type = 3;
1389 seg_desc.p = 1;
1390 seg_desc.s = 1;
1391 goto load;
1392 }
1393
1394 /* NULL selector is not valid for TR, CS and SS */
1395 if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR)
1396 && null_selector)
1397 goto exception;
1398
1399 /* TR should be in GDT only */
1400 if (seg == VCPU_SREG_TR && (selector & (1 << 2)))
1401 goto exception;
1402
1403 if (null_selector) /* for NULL selector skip all following checks */
1404 goto load;
1405
1406 ret = read_segment_descriptor(ctxt, ops, selector, &seg_desc);
1407 if (ret != X86EMUL_CONTINUE)
1408 return ret;
1409
1410 err_code = selector & 0xfffc;
1411 err_vec = GP_VECTOR;
1412
1413 /* can't load system descriptor into segment selector */
1414 if (seg <= VCPU_SREG_GS && !seg_desc.s)
1415 goto exception;
1416
1417 if (!seg_desc.p) {
1418 err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR;
1419 goto exception;
1420 }
1421
1422 rpl = selector & 3;
1423 dpl = seg_desc.dpl;
1424 cpl = ops->cpl(ctxt->vcpu);
1425
1426 switch (seg) {
1427 case VCPU_SREG_SS:
1428 /*
1429 * segment is not a writable data segment or segment
1430 * selector's RPL != CPL or DPL != CPL
1431 */
1432 if (rpl != cpl || (seg_desc.type & 0xa) != 0x2 || dpl != cpl)
1433 goto exception;
1434 break;
1435 case VCPU_SREG_CS:
1436 if (!(seg_desc.type & 8))
1437 goto exception;
1438
1439 if (seg_desc.type & 4) {
1440 /* conforming */
1441 if (dpl > cpl)
1442 goto exception;
1443 } else {
1444 /* nonconforming */
1445 if (rpl > cpl || dpl != cpl)
1446 goto exception;
1447 }
1448 /* CS(RPL) <- CPL */
1449 selector = (selector & 0xfffc) | cpl;
1450 break;
1451 case VCPU_SREG_TR:
1452 if (seg_desc.s || (seg_desc.type != 1 && seg_desc.type != 9))
1453 goto exception;
1454 break;
1455 case VCPU_SREG_LDTR:
1456 if (seg_desc.s || seg_desc.type != 2)
1457 goto exception;
1458 break;
1459 default: /* DS, ES, FS, or GS */
1460 /*
1461 * segment is not a data or readable code segment or
1462 * ((segment is a data or nonconforming code segment)
1463 * and (both RPL and CPL > DPL))
1464 */
1465 if ((seg_desc.type & 0xa) == 0x8 ||
1466 (((seg_desc.type & 0xc) != 0xc) &&
1467 (rpl > dpl && cpl > dpl)))
1468 goto exception;
1469 break;
1470 }
1471
1472 if (seg_desc.s) {
1473 /* mark segment as accessed */
1474 seg_desc.type |= 1;
1475 ret = write_segment_descriptor(ctxt, ops, selector, &seg_desc);
1476 if (ret != X86EMUL_CONTINUE)
1477 return ret;
1478 }
1479load:
1480 ops->set_segment_selector(selector, seg, ctxt->vcpu);
1481 ops->set_cached_descriptor(&seg_desc, seg, ctxt->vcpu);
1482 return X86EMUL_CONTINUE;
1483exception:
1484 kvm_queue_exception_e(ctxt->vcpu, err_vec, err_code);
1485 return X86EMUL_PROPAGATE_FAULT;
1486}
1487
1218static inline void emulate_push(struct x86_emulate_ctxt *ctxt) 1488static inline void emulate_push(struct x86_emulate_ctxt *ctxt)
1219{ 1489{
1220 struct decode_cache *c = &ctxt->decode; 1490 struct decode_cache *c = &ctxt->decode;
@@ -1251,7 +1521,7 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
1251 int rc; 1521 int rc;
1252 unsigned long val, change_mask; 1522 unsigned long val, change_mask;
1253 int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; 1523 int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
1254 int cpl = kvm_x86_ops->get_cpl(ctxt->vcpu); 1524 int cpl = ops->cpl(ctxt->vcpu);
1255 1525
1256 rc = emulate_pop(ctxt, ops, &val, len); 1526 rc = emulate_pop(ctxt, ops, &val, len);
1257 if (rc != X86EMUL_CONTINUE) 1527 if (rc != X86EMUL_CONTINUE)
@@ -1306,10 +1576,10 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
1306 int rc; 1576 int rc;
1307 1577
1308 rc = emulate_pop(ctxt, ops, &selector, c->op_bytes); 1578 rc = emulate_pop(ctxt, ops, &selector, c->op_bytes);
1309 if (rc != 0) 1579 if (rc != X86EMUL_CONTINUE)
1310 return rc; 1580 return rc;
1311 1581
1312 rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)selector, seg); 1582 rc = load_segment_descriptor(ctxt, ops, (u16)selector, seg);
1313 return rc; 1583 return rc;
1314} 1584}
1315 1585
@@ -1332,7 +1602,7 @@ static int emulate_popa(struct x86_emulate_ctxt *ctxt,
1332 struct x86_emulate_ops *ops) 1602 struct x86_emulate_ops *ops)
1333{ 1603{
1334 struct decode_cache *c = &ctxt->decode; 1604 struct decode_cache *c = &ctxt->decode;
1335 int rc = 0; 1605 int rc = X86EMUL_CONTINUE;
1336 int reg = VCPU_REGS_RDI; 1606 int reg = VCPU_REGS_RDI;
1337 1607
1338 while (reg >= VCPU_REGS_RAX) { 1608 while (reg >= VCPU_REGS_RAX) {
@@ -1343,7 +1613,7 @@ static int emulate_popa(struct x86_emulate_ctxt *ctxt,
1343 } 1613 }
1344 1614
1345 rc = emulate_pop(ctxt, ops, &c->regs[reg], c->op_bytes); 1615 rc = emulate_pop(ctxt, ops, &c->regs[reg], c->op_bytes);
1346 if (rc != 0) 1616 if (rc != X86EMUL_CONTINUE)
1347 break; 1617 break;
1348 --reg; 1618 --reg;
1349 } 1619 }
@@ -1354,12 +1624,8 @@ static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
1354 struct x86_emulate_ops *ops) 1624 struct x86_emulate_ops *ops)
1355{ 1625{
1356 struct decode_cache *c = &ctxt->decode; 1626 struct decode_cache *c = &ctxt->decode;
1357 int rc;
1358 1627
1359 rc = emulate_pop(ctxt, ops, &c->dst.val, c->dst.bytes); 1628 return emulate_pop(ctxt, ops, &c->dst.val, c->dst.bytes);
1360 if (rc != 0)
1361 return rc;
1362 return 0;
1363} 1629}
1364 1630
1365static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt) 1631static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt)
@@ -1395,7 +1661,6 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
1395 struct x86_emulate_ops *ops) 1661 struct x86_emulate_ops *ops)
1396{ 1662{
1397 struct decode_cache *c = &ctxt->decode; 1663 struct decode_cache *c = &ctxt->decode;
1398 int rc = 0;
1399 1664
1400 switch (c->modrm_reg) { 1665 switch (c->modrm_reg) {
1401 case 0 ... 1: /* test */ 1666 case 0 ... 1: /* test */
@@ -1408,11 +1673,9 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
1408 emulate_1op("neg", c->dst, ctxt->eflags); 1673 emulate_1op("neg", c->dst, ctxt->eflags);
1409 break; 1674 break;
1410 default: 1675 default:
1411 DPRINTF("Cannot emulate %02x\n", c->b); 1676 return 0;
1412 rc = X86EMUL_UNHANDLEABLE;
1413 break;
1414 } 1677 }
1415 return rc; 1678 return 1;
1416} 1679}
1417 1680
1418static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt, 1681static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
@@ -1442,20 +1705,14 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
1442 emulate_push(ctxt); 1705 emulate_push(ctxt);
1443 break; 1706 break;
1444 } 1707 }
1445 return 0; 1708 return X86EMUL_CONTINUE;
1446} 1709}
1447 1710
1448static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt, 1711static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,
1449 struct x86_emulate_ops *ops, 1712 struct x86_emulate_ops *ops)
1450 unsigned long memop)
1451{ 1713{
1452 struct decode_cache *c = &ctxt->decode; 1714 struct decode_cache *c = &ctxt->decode;
1453 u64 old, new; 1715 u64 old = c->dst.orig_val;
1454 int rc;
1455
1456 rc = ops->read_emulated(memop, &old, 8, ctxt->vcpu);
1457 if (rc != X86EMUL_CONTINUE)
1458 return rc;
1459 1716
1460 if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) || 1717 if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) ||
1461 ((u32) (old >> 32) != (u32) c->regs[VCPU_REGS_RDX])) { 1718 ((u32) (old >> 32) != (u32) c->regs[VCPU_REGS_RDX])) {
@@ -1463,17 +1720,13 @@ static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,
1463 c->regs[VCPU_REGS_RAX] = (u32) (old >> 0); 1720 c->regs[VCPU_REGS_RAX] = (u32) (old >> 0);
1464 c->regs[VCPU_REGS_RDX] = (u32) (old >> 32); 1721 c->regs[VCPU_REGS_RDX] = (u32) (old >> 32);
1465 ctxt->eflags &= ~EFLG_ZF; 1722 ctxt->eflags &= ~EFLG_ZF;
1466
1467 } else { 1723 } else {
1468 new = ((u64)c->regs[VCPU_REGS_RCX] << 32) | 1724 c->dst.val = ((u64)c->regs[VCPU_REGS_RCX] << 32) |
1469 (u32) c->regs[VCPU_REGS_RBX]; 1725 (u32) c->regs[VCPU_REGS_RBX];
1470 1726
1471 rc = ops->cmpxchg_emulated(memop, &old, &new, 8, ctxt->vcpu);
1472 if (rc != X86EMUL_CONTINUE)
1473 return rc;
1474 ctxt->eflags |= EFLG_ZF; 1727 ctxt->eflags |= EFLG_ZF;
1475 } 1728 }
1476 return 0; 1729 return X86EMUL_CONTINUE;
1477} 1730}
1478 1731
1479static int emulate_ret_far(struct x86_emulate_ctxt *ctxt, 1732static int emulate_ret_far(struct x86_emulate_ctxt *ctxt,
@@ -1484,14 +1737,14 @@ static int emulate_ret_far(struct x86_emulate_ctxt *ctxt,
1484 unsigned long cs; 1737 unsigned long cs;
1485 1738
1486 rc = emulate_pop(ctxt, ops, &c->eip, c->op_bytes); 1739 rc = emulate_pop(ctxt, ops, &c->eip, c->op_bytes);
1487 if (rc) 1740 if (rc != X86EMUL_CONTINUE)
1488 return rc; 1741 return rc;
1489 if (c->op_bytes == 4) 1742 if (c->op_bytes == 4)
1490 c->eip = (u32)c->eip; 1743 c->eip = (u32)c->eip;
1491 rc = emulate_pop(ctxt, ops, &cs, c->op_bytes); 1744 rc = emulate_pop(ctxt, ops, &cs, c->op_bytes);
1492 if (rc) 1745 if (rc != X86EMUL_CONTINUE)
1493 return rc; 1746 return rc;
1494 rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)cs, VCPU_SREG_CS); 1747 rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS);
1495 return rc; 1748 return rc;
1496} 1749}
1497 1750
@@ -1544,7 +1797,7 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
1544 default: 1797 default:
1545 break; 1798 break;
1546 } 1799 }
1547 return 0; 1800 return X86EMUL_CONTINUE;
1548} 1801}
1549 1802
1550static void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask) 1803static void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask)
@@ -1598,8 +1851,11 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt)
1598 u64 msr_data; 1851 u64 msr_data;
1599 1852
1600 /* syscall is not available in real mode */ 1853 /* syscall is not available in real mode */
1601 if (ctxt->mode == X86EMUL_MODE_REAL || ctxt->mode == X86EMUL_MODE_VM86) 1854 if (ctxt->mode == X86EMUL_MODE_REAL ||
1602 return X86EMUL_UNHANDLEABLE; 1855 ctxt->mode == X86EMUL_MODE_VM86) {
1856 kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
1857 return X86EMUL_PROPAGATE_FAULT;
1858 }
1603 1859
1604 setup_syscalls_segments(ctxt, &cs, &ss); 1860 setup_syscalls_segments(ctxt, &cs, &ss);
1605 1861
@@ -1649,14 +1905,16 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt)
1649 /* inject #GP if in real mode */ 1905 /* inject #GP if in real mode */
1650 if (ctxt->mode == X86EMUL_MODE_REAL) { 1906 if (ctxt->mode == X86EMUL_MODE_REAL) {
1651 kvm_inject_gp(ctxt->vcpu, 0); 1907 kvm_inject_gp(ctxt->vcpu, 0);
1652 return X86EMUL_UNHANDLEABLE; 1908 return X86EMUL_PROPAGATE_FAULT;
1653 } 1909 }
1654 1910
1655 /* XXX sysenter/sysexit have not been tested in 64bit mode. 1911 /* XXX sysenter/sysexit have not been tested in 64bit mode.
1656 * Therefore, we inject an #UD. 1912 * Therefore, we inject an #UD.
1657 */ 1913 */
1658 if (ctxt->mode == X86EMUL_MODE_PROT64) 1914 if (ctxt->mode == X86EMUL_MODE_PROT64) {
1659 return X86EMUL_UNHANDLEABLE; 1915 kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
1916 return X86EMUL_PROPAGATE_FAULT;
1917 }
1660 1918
1661 setup_syscalls_segments(ctxt, &cs, &ss); 1919 setup_syscalls_segments(ctxt, &cs, &ss);
1662 1920
@@ -1711,7 +1969,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt)
1711 if (ctxt->mode == X86EMUL_MODE_REAL || 1969 if (ctxt->mode == X86EMUL_MODE_REAL ||
1712 ctxt->mode == X86EMUL_MODE_VM86) { 1970 ctxt->mode == X86EMUL_MODE_VM86) {
1713 kvm_inject_gp(ctxt->vcpu, 0); 1971 kvm_inject_gp(ctxt->vcpu, 0);
1714 return X86EMUL_UNHANDLEABLE; 1972 return X86EMUL_PROPAGATE_FAULT;
1715 } 1973 }
1716 1974
1717 setup_syscalls_segments(ctxt, &cs, &ss); 1975 setup_syscalls_segments(ctxt, &cs, &ss);
@@ -1756,7 +2014,8 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt)
1756 return X86EMUL_CONTINUE; 2014 return X86EMUL_CONTINUE;
1757} 2015}
1758 2016
1759static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt) 2017static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt,
2018 struct x86_emulate_ops *ops)
1760{ 2019{
1761 int iopl; 2020 int iopl;
1762 if (ctxt->mode == X86EMUL_MODE_REAL) 2021 if (ctxt->mode == X86EMUL_MODE_REAL)
@@ -1764,7 +2023,7 @@ static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt)
1764 if (ctxt->mode == X86EMUL_MODE_VM86) 2023 if (ctxt->mode == X86EMUL_MODE_VM86)
1765 return true; 2024 return true;
1766 iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; 2025 iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
1767 return kvm_x86_ops->get_cpl(ctxt->vcpu) > iopl; 2026 return ops->cpl(ctxt->vcpu) > iopl;
1768} 2027}
1769 2028
1770static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, 2029static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
@@ -1801,22 +2060,419 @@ static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt,
1801 struct x86_emulate_ops *ops, 2060 struct x86_emulate_ops *ops,
1802 u16 port, u16 len) 2061 u16 port, u16 len)
1803{ 2062{
1804 if (emulator_bad_iopl(ctxt)) 2063 if (emulator_bad_iopl(ctxt, ops))
1805 if (!emulator_io_port_access_allowed(ctxt, ops, port, len)) 2064 if (!emulator_io_port_access_allowed(ctxt, ops, port, len))
1806 return false; 2065 return false;
1807 return true; 2066 return true;
1808} 2067}
1809 2068
2069static u32 get_cached_descriptor_base(struct x86_emulate_ctxt *ctxt,
2070 struct x86_emulate_ops *ops,
2071 int seg)
2072{
2073 struct desc_struct desc;
2074 if (ops->get_cached_descriptor(&desc, seg, ctxt->vcpu))
2075 return get_desc_base(&desc);
2076 else
2077 return ~0;
2078}
2079
2080static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt,
2081 struct x86_emulate_ops *ops,
2082 struct tss_segment_16 *tss)
2083{
2084 struct decode_cache *c = &ctxt->decode;
2085
2086 tss->ip = c->eip;
2087 tss->flag = ctxt->eflags;
2088 tss->ax = c->regs[VCPU_REGS_RAX];
2089 tss->cx = c->regs[VCPU_REGS_RCX];
2090 tss->dx = c->regs[VCPU_REGS_RDX];
2091 tss->bx = c->regs[VCPU_REGS_RBX];
2092 tss->sp = c->regs[VCPU_REGS_RSP];
2093 tss->bp = c->regs[VCPU_REGS_RBP];
2094 tss->si = c->regs[VCPU_REGS_RSI];
2095 tss->di = c->regs[VCPU_REGS_RDI];
2096
2097 tss->es = ops->get_segment_selector(VCPU_SREG_ES, ctxt->vcpu);
2098 tss->cs = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu);
2099 tss->ss = ops->get_segment_selector(VCPU_SREG_SS, ctxt->vcpu);
2100 tss->ds = ops->get_segment_selector(VCPU_SREG_DS, ctxt->vcpu);
2101 tss->ldt = ops->get_segment_selector(VCPU_SREG_LDTR, ctxt->vcpu);
2102}
2103
2104static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
2105 struct x86_emulate_ops *ops,
2106 struct tss_segment_16 *tss)
2107{
2108 struct decode_cache *c = &ctxt->decode;
2109 int ret;
2110
2111 c->eip = tss->ip;
2112 ctxt->eflags = tss->flag | 2;
2113 c->regs[VCPU_REGS_RAX] = tss->ax;
2114 c->regs[VCPU_REGS_RCX] = tss->cx;
2115 c->regs[VCPU_REGS_RDX] = tss->dx;
2116 c->regs[VCPU_REGS_RBX] = tss->bx;
2117 c->regs[VCPU_REGS_RSP] = tss->sp;
2118 c->regs[VCPU_REGS_RBP] = tss->bp;
2119 c->regs[VCPU_REGS_RSI] = tss->si;
2120 c->regs[VCPU_REGS_RDI] = tss->di;
2121
2122 /*
2123 * SDM says that segment selectors are loaded before segment
2124 * descriptors
2125 */
2126 ops->set_segment_selector(tss->ldt, VCPU_SREG_LDTR, ctxt->vcpu);
2127 ops->set_segment_selector(tss->es, VCPU_SREG_ES, ctxt->vcpu);
2128 ops->set_segment_selector(tss->cs, VCPU_SREG_CS, ctxt->vcpu);
2129 ops->set_segment_selector(tss->ss, VCPU_SREG_SS, ctxt->vcpu);
2130 ops->set_segment_selector(tss->ds, VCPU_SREG_DS, ctxt->vcpu);
2131
2132 /*
2133 * Now load segment descriptors. If a fault happens at this stage
2134 * it is handled in the context of the new task
2135 */
2136 ret = load_segment_descriptor(ctxt, ops, tss->ldt, VCPU_SREG_LDTR);
2137 if (ret != X86EMUL_CONTINUE)
2138 return ret;
2139 ret = load_segment_descriptor(ctxt, ops, tss->es, VCPU_SREG_ES);
2140 if (ret != X86EMUL_CONTINUE)
2141 return ret;
2142 ret = load_segment_descriptor(ctxt, ops, tss->cs, VCPU_SREG_CS);
2143 if (ret != X86EMUL_CONTINUE)
2144 return ret;
2145 ret = load_segment_descriptor(ctxt, ops, tss->ss, VCPU_SREG_SS);
2146 if (ret != X86EMUL_CONTINUE)
2147 return ret;
2148 ret = load_segment_descriptor(ctxt, ops, tss->ds, VCPU_SREG_DS);
2149 if (ret != X86EMUL_CONTINUE)
2150 return ret;
2151
2152 return X86EMUL_CONTINUE;
2153}
2154
2155static int task_switch_16(struct x86_emulate_ctxt *ctxt,
2156 struct x86_emulate_ops *ops,
2157 u16 tss_selector, u16 old_tss_sel,
2158 ulong old_tss_base, struct desc_struct *new_desc)
2159{
2160 struct tss_segment_16 tss_seg;
2161 int ret;
2162 u32 err, new_tss_base = get_desc_base(new_desc);
2163
2164 ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
2165 &err);
2166 if (ret == X86EMUL_PROPAGATE_FAULT) {
2167 /* FIXME: need to provide precise fault address */
2168 kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err);
2169 return ret;
2170 }
2171
2172 save_state_to_tss16(ctxt, ops, &tss_seg);
2173
2174 ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
2175 &err);
2176 if (ret == X86EMUL_PROPAGATE_FAULT) {
2177 /* FIXME: need to provide precise fault address */
2178 kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err);
2179 return ret;
2180 }
2181
2182 ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
2183 &err);
2184 if (ret == X86EMUL_PROPAGATE_FAULT) {
2185 /* FIXME: need to provide precise fault address */
2186 kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err);
2187 return ret;
2188 }
2189
2190 if (old_tss_sel != 0xffff) {
2191 tss_seg.prev_task_link = old_tss_sel;
2192
2193 ret = ops->write_std(new_tss_base,
2194 &tss_seg.prev_task_link,
2195 sizeof tss_seg.prev_task_link,
2196 ctxt->vcpu, &err);
2197 if (ret == X86EMUL_PROPAGATE_FAULT) {
2198 /* FIXME: need to provide precise fault address */
2199 kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err);
2200 return ret;
2201 }
2202 }
2203
2204 return load_state_from_tss16(ctxt, ops, &tss_seg);
2205}
2206
2207static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt,
2208 struct x86_emulate_ops *ops,
2209 struct tss_segment_32 *tss)
2210{
2211 struct decode_cache *c = &ctxt->decode;
2212
2213 tss->cr3 = ops->get_cr(3, ctxt->vcpu);
2214 tss->eip = c->eip;
2215 tss->eflags = ctxt->eflags;
2216 tss->eax = c->regs[VCPU_REGS_RAX];
2217 tss->ecx = c->regs[VCPU_REGS_RCX];
2218 tss->edx = c->regs[VCPU_REGS_RDX];
2219 tss->ebx = c->regs[VCPU_REGS_RBX];
2220 tss->esp = c->regs[VCPU_REGS_RSP];
2221 tss->ebp = c->regs[VCPU_REGS_RBP];
2222 tss->esi = c->regs[VCPU_REGS_RSI];
2223 tss->edi = c->regs[VCPU_REGS_RDI];
2224
2225 tss->es = ops->get_segment_selector(VCPU_SREG_ES, ctxt->vcpu);
2226 tss->cs = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu);
2227 tss->ss = ops->get_segment_selector(VCPU_SREG_SS, ctxt->vcpu);
2228 tss->ds = ops->get_segment_selector(VCPU_SREG_DS, ctxt->vcpu);
2229 tss->fs = ops->get_segment_selector(VCPU_SREG_FS, ctxt->vcpu);
2230 tss->gs = ops->get_segment_selector(VCPU_SREG_GS, ctxt->vcpu);
2231 tss->ldt_selector = ops->get_segment_selector(VCPU_SREG_LDTR, ctxt->vcpu);
2232}
2233
2234static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
2235 struct x86_emulate_ops *ops,
2236 struct tss_segment_32 *tss)
2237{
2238 struct decode_cache *c = &ctxt->decode;
2239 int ret;
2240
2241 ops->set_cr(3, tss->cr3, ctxt->vcpu);
2242 c->eip = tss->eip;
2243 ctxt->eflags = tss->eflags | 2;
2244 c->regs[VCPU_REGS_RAX] = tss->eax;
2245 c->regs[VCPU_REGS_RCX] = tss->ecx;
2246 c->regs[VCPU_REGS_RDX] = tss->edx;
2247 c->regs[VCPU_REGS_RBX] = tss->ebx;
2248 c->regs[VCPU_REGS_RSP] = tss->esp;
2249 c->regs[VCPU_REGS_RBP] = tss->ebp;
2250 c->regs[VCPU_REGS_RSI] = tss->esi;
2251 c->regs[VCPU_REGS_RDI] = tss->edi;
2252
2253 /*
2254 * SDM says that segment selectors are loaded before segment
2255 * descriptors
2256 */
2257 ops->set_segment_selector(tss->ldt_selector, VCPU_SREG_LDTR, ctxt->vcpu);
2258 ops->set_segment_selector(tss->es, VCPU_SREG_ES, ctxt->vcpu);
2259 ops->set_segment_selector(tss->cs, VCPU_SREG_CS, ctxt->vcpu);
2260 ops->set_segment_selector(tss->ss, VCPU_SREG_SS, ctxt->vcpu);
2261 ops->set_segment_selector(tss->ds, VCPU_SREG_DS, ctxt->vcpu);
2262 ops->set_segment_selector(tss->fs, VCPU_SREG_FS, ctxt->vcpu);
2263 ops->set_segment_selector(tss->gs, VCPU_SREG_GS, ctxt->vcpu);
2264
2265 /*
2266 * Now load segment descriptors. If a fault happens at this stage
2267 * it is handled in the context of the new task
2268 */
2269 ret = load_segment_descriptor(ctxt, ops, tss->ldt_selector, VCPU_SREG_LDTR);
2270 if (ret != X86EMUL_CONTINUE)
2271 return ret;
2272 ret = load_segment_descriptor(ctxt, ops, tss->es, VCPU_SREG_ES);
2273 if (ret != X86EMUL_CONTINUE)
2274 return ret;
2275 ret = load_segment_descriptor(ctxt, ops, tss->cs, VCPU_SREG_CS);
2276 if (ret != X86EMUL_CONTINUE)
2277 return ret;
2278 ret = load_segment_descriptor(ctxt, ops, tss->ss, VCPU_SREG_SS);
2279 if (ret != X86EMUL_CONTINUE)
2280 return ret;
2281 ret = load_segment_descriptor(ctxt, ops, tss->ds, VCPU_SREG_DS);
2282 if (ret != X86EMUL_CONTINUE)
2283 return ret;
2284 ret = load_segment_descriptor(ctxt, ops, tss->fs, VCPU_SREG_FS);
2285 if (ret != X86EMUL_CONTINUE)
2286 return ret;
2287 ret = load_segment_descriptor(ctxt, ops, tss->gs, VCPU_SREG_GS);
2288 if (ret != X86EMUL_CONTINUE)
2289 return ret;
2290
2291 return X86EMUL_CONTINUE;
2292}
2293
2294static int task_switch_32(struct x86_emulate_ctxt *ctxt,
2295 struct x86_emulate_ops *ops,
2296 u16 tss_selector, u16 old_tss_sel,
2297 ulong old_tss_base, struct desc_struct *new_desc)
2298{
2299 struct tss_segment_32 tss_seg;
2300 int ret;
2301 u32 err, new_tss_base = get_desc_base(new_desc);
2302
2303 ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
2304 &err);
2305 if (ret == X86EMUL_PROPAGATE_FAULT) {
2306 /* FIXME: need to provide precise fault address */
2307 kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err);
2308 return ret;
2309 }
2310
2311 save_state_to_tss32(ctxt, ops, &tss_seg);
2312
2313 ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
2314 &err);
2315 if (ret == X86EMUL_PROPAGATE_FAULT) {
2316 /* FIXME: need to provide precise fault address */
2317 kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err);
2318 return ret;
2319 }
2320
2321 ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
2322 &err);
2323 if (ret == X86EMUL_PROPAGATE_FAULT) {
2324 /* FIXME: need to provide precise fault address */
2325 kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err);
2326 return ret;
2327 }
2328
2329 if (old_tss_sel != 0xffff) {
2330 tss_seg.prev_task_link = old_tss_sel;
2331
2332 ret = ops->write_std(new_tss_base,
2333 &tss_seg.prev_task_link,
2334 sizeof tss_seg.prev_task_link,
2335 ctxt->vcpu, &err);
2336 if (ret == X86EMUL_PROPAGATE_FAULT) {
2337 /* FIXME: need to provide precise fault address */
2338 kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err);
2339 return ret;
2340 }
2341 }
2342
2343 return load_state_from_tss32(ctxt, ops, &tss_seg);
2344}
2345
2346static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2347 struct x86_emulate_ops *ops,
2348 u16 tss_selector, int reason,
2349 bool has_error_code, u32 error_code)
2350{
2351 struct desc_struct curr_tss_desc, next_tss_desc;
2352 int ret;
2353 u16 old_tss_sel = ops->get_segment_selector(VCPU_SREG_TR, ctxt->vcpu);
2354 ulong old_tss_base =
2355 get_cached_descriptor_base(ctxt, ops, VCPU_SREG_TR);
2356 u32 desc_limit;
2357
2358 /* FIXME: old_tss_base == ~0 ? */
2359
2360 ret = read_segment_descriptor(ctxt, ops, tss_selector, &next_tss_desc);
2361 if (ret != X86EMUL_CONTINUE)
2362 return ret;
2363 ret = read_segment_descriptor(ctxt, ops, old_tss_sel, &curr_tss_desc);
2364 if (ret != X86EMUL_CONTINUE)
2365 return ret;
2366
2367 /* FIXME: check that next_tss_desc is tss */
2368
2369 if (reason != TASK_SWITCH_IRET) {
2370 if ((tss_selector & 3) > next_tss_desc.dpl ||
2371 ops->cpl(ctxt->vcpu) > next_tss_desc.dpl) {
2372 kvm_inject_gp(ctxt->vcpu, 0);
2373 return X86EMUL_PROPAGATE_FAULT;
2374 }
2375 }
2376
2377 desc_limit = desc_limit_scaled(&next_tss_desc);
2378 if (!next_tss_desc.p ||
2379 ((desc_limit < 0x67 && (next_tss_desc.type & 8)) ||
2380 desc_limit < 0x2b)) {
2381 kvm_queue_exception_e(ctxt->vcpu, TS_VECTOR,
2382 tss_selector & 0xfffc);
2383 return X86EMUL_PROPAGATE_FAULT;
2384 }
2385
2386 if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
2387 curr_tss_desc.type &= ~(1 << 1); /* clear busy flag */
2388 write_segment_descriptor(ctxt, ops, old_tss_sel,
2389 &curr_tss_desc);
2390 }
2391
2392 if (reason == TASK_SWITCH_IRET)
2393 ctxt->eflags = ctxt->eflags & ~X86_EFLAGS_NT;
2394
2395 /* set back link to prev task only if NT bit is set in eflags
2396 note that old_tss_sel is not used after this point */
2397 if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
2398 old_tss_sel = 0xffff;
2399
2400 if (next_tss_desc.type & 8)
2401 ret = task_switch_32(ctxt, ops, tss_selector, old_tss_sel,
2402 old_tss_base, &next_tss_desc);
2403 else
2404 ret = task_switch_16(ctxt, ops, tss_selector, old_tss_sel,
2405 old_tss_base, &next_tss_desc);
2406 if (ret != X86EMUL_CONTINUE)
2407 return ret;
2408
2409 if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE)
2410 ctxt->eflags = ctxt->eflags | X86_EFLAGS_NT;
2411
2412 if (reason != TASK_SWITCH_IRET) {
2413 next_tss_desc.type |= (1 << 1); /* set busy flag */
2414 write_segment_descriptor(ctxt, ops, tss_selector,
2415 &next_tss_desc);
2416 }
2417
2418 ops->set_cr(0, ops->get_cr(0, ctxt->vcpu) | X86_CR0_TS, ctxt->vcpu);
2419 ops->set_cached_descriptor(&next_tss_desc, VCPU_SREG_TR, ctxt->vcpu);
2420 ops->set_segment_selector(tss_selector, VCPU_SREG_TR, ctxt->vcpu);
2421
2422 if (has_error_code) {
2423 struct decode_cache *c = &ctxt->decode;
2424
2425 c->op_bytes = c->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2;
2426 c->lock_prefix = 0;
2427 c->src.val = (unsigned long) error_code;
2428 emulate_push(ctxt);
2429 }
2430
2431 return ret;
2432}
2433
2434int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
2435 struct x86_emulate_ops *ops,
2436 u16 tss_selector, int reason,
2437 bool has_error_code, u32 error_code)
2438{
2439 struct decode_cache *c = &ctxt->decode;
2440 int rc;
2441
2442 memset(c, 0, sizeof(struct decode_cache));
2443 c->eip = ctxt->eip;
2444 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
2445 c->dst.type = OP_NONE;
2446
2447 rc = emulator_do_task_switch(ctxt, ops, tss_selector, reason,
2448 has_error_code, error_code);
2449
2450 if (rc == X86EMUL_CONTINUE) {
2451 memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
2452 kvm_rip_write(ctxt->vcpu, c->eip);
2453 rc = writeback(ctxt, ops);
2454 }
2455
2456 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
2457}
2458
2459static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned long base,
2460 int reg, struct operand *op)
2461{
2462 struct decode_cache *c = &ctxt->decode;
2463 int df = (ctxt->eflags & EFLG_DF) ? -1 : 1;
2464
2465 register_address_increment(c, &c->regs[reg], df * op->bytes);
2466 op->ptr = (unsigned long *)register_address(c, base, c->regs[reg]);
2467}
2468
1810int 2469int
1811x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) 2470x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1812{ 2471{
1813 unsigned long memop = 0;
1814 u64 msr_data; 2472 u64 msr_data;
1815 unsigned long saved_eip = 0;
1816 struct decode_cache *c = &ctxt->decode; 2473 struct decode_cache *c = &ctxt->decode;
1817 unsigned int port; 2474 int rc = X86EMUL_CONTINUE;
1818 int io_dir_in; 2475 int saved_dst_type = c->dst.type;
1819 int rc = 0;
1820 2476
1821 ctxt->interruptibility = 0; 2477 ctxt->interruptibility = 0;
1822 2478
@@ -1826,26 +2482,30 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1826 */ 2482 */
1827 2483
1828 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); 2484 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
1829 saved_eip = c->eip; 2485
2486 if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
2487 kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
2488 goto done;
2489 }
1830 2490
1831 /* LOCK prefix is allowed only with some instructions */ 2491 /* LOCK prefix is allowed only with some instructions */
1832 if (c->lock_prefix && !(c->d & Lock)) { 2492 if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) {
1833 kvm_queue_exception(ctxt->vcpu, UD_VECTOR); 2493 kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
1834 goto done; 2494 goto done;
1835 } 2495 }
1836 2496
1837 /* Privileged instruction can be executed only in CPL=0 */ 2497 /* Privileged instruction can be executed only in CPL=0 */
1838 if ((c->d & Priv) && kvm_x86_ops->get_cpl(ctxt->vcpu)) { 2498 if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) {
1839 kvm_inject_gp(ctxt->vcpu, 0); 2499 kvm_inject_gp(ctxt->vcpu, 0);
1840 goto done; 2500 goto done;
1841 } 2501 }
1842 2502
1843 if (((c->d & ModRM) && (c->modrm_mod != 3)) || (c->d & MemAbs))
1844 memop = c->modrm_ea;
1845
1846 if (c->rep_prefix && (c->d & String)) { 2503 if (c->rep_prefix && (c->d & String)) {
2504 ctxt->restart = true;
1847 /* All REP prefixes have the same first termination condition */ 2505 /* All REP prefixes have the same first termination condition */
1848 if (c->regs[VCPU_REGS_RCX] == 0) { 2506 if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) {
2507 string_done:
2508 ctxt->restart = false;
1849 kvm_rip_write(ctxt->vcpu, c->eip); 2509 kvm_rip_write(ctxt->vcpu, c->eip);
1850 goto done; 2510 goto done;
1851 } 2511 }
@@ -1857,25 +2517,18 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1857 * - if REPNE/REPNZ and ZF = 1 then done 2517 * - if REPNE/REPNZ and ZF = 1 then done
1858 */ 2518 */
1859 if ((c->b == 0xa6) || (c->b == 0xa7) || 2519 if ((c->b == 0xa6) || (c->b == 0xa7) ||
1860 (c->b == 0xae) || (c->b == 0xaf)) { 2520 (c->b == 0xae) || (c->b == 0xaf)) {
1861 if ((c->rep_prefix == REPE_PREFIX) && 2521 if ((c->rep_prefix == REPE_PREFIX) &&
1862 ((ctxt->eflags & EFLG_ZF) == 0)) { 2522 ((ctxt->eflags & EFLG_ZF) == 0))
1863 kvm_rip_write(ctxt->vcpu, c->eip); 2523 goto string_done;
1864 goto done;
1865 }
1866 if ((c->rep_prefix == REPNE_PREFIX) && 2524 if ((c->rep_prefix == REPNE_PREFIX) &&
1867 ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) { 2525 ((ctxt->eflags & EFLG_ZF) == EFLG_ZF))
1868 kvm_rip_write(ctxt->vcpu, c->eip); 2526 goto string_done;
1869 goto done;
1870 }
1871 } 2527 }
1872 c->regs[VCPU_REGS_RCX]--; 2528 c->eip = ctxt->eip;
1873 c->eip = kvm_rip_read(ctxt->vcpu);
1874 } 2529 }
1875 2530
1876 if (c->src.type == OP_MEM) { 2531 if (c->src.type == OP_MEM) {
1877 c->src.ptr = (unsigned long *)memop;
1878 c->src.val = 0;
1879 rc = ops->read_emulated((unsigned long)c->src.ptr, 2532 rc = ops->read_emulated((unsigned long)c->src.ptr,
1880 &c->src.val, 2533 &c->src.val,
1881 c->src.bytes, 2534 c->src.bytes,
@@ -1885,29 +2538,25 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1885 c->src.orig_val = c->src.val; 2538 c->src.orig_val = c->src.val;
1886 } 2539 }
1887 2540
2541 if (c->src2.type == OP_MEM) {
2542 rc = ops->read_emulated((unsigned long)c->src2.ptr,
2543 &c->src2.val,
2544 c->src2.bytes,
2545 ctxt->vcpu);
2546 if (rc != X86EMUL_CONTINUE)
2547 goto done;
2548 }
2549
1888 if ((c->d & DstMask) == ImplicitOps) 2550 if ((c->d & DstMask) == ImplicitOps)
1889 goto special_insn; 2551 goto special_insn;
1890 2552
1891 2553
1892 if (c->dst.type == OP_MEM) { 2554 if ((c->dst.type == OP_MEM) && !(c->d & Mov)) {
1893 c->dst.ptr = (unsigned long *)memop; 2555 /* optimisation - avoid slow emulated read if Mov */
1894 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 2556 rc = ops->read_emulated((unsigned long)c->dst.ptr, &c->dst.val,
1895 c->dst.val = 0; 2557 c->dst.bytes, ctxt->vcpu);
1896 if (c->d & BitOp) { 2558 if (rc != X86EMUL_CONTINUE)
1897 unsigned long mask = ~(c->dst.bytes * 8 - 1); 2559 goto done;
1898
1899 c->dst.ptr = (void *)c->dst.ptr +
1900 (c->src.val & mask) / 8;
1901 }
1902 if (!(c->d & Mov)) {
1903 /* optimisation - avoid slow emulated read */
1904 rc = ops->read_emulated((unsigned long)c->dst.ptr,
1905 &c->dst.val,
1906 c->dst.bytes,
1907 ctxt->vcpu);
1908 if (rc != X86EMUL_CONTINUE)
1909 goto done;
1910 }
1911 } 2560 }
1912 c->dst.orig_val = c->dst.val; 2561 c->dst.orig_val = c->dst.val;
1913 2562
@@ -1926,7 +2575,7 @@ special_insn:
1926 break; 2575 break;
1927 case 0x07: /* pop es */ 2576 case 0x07: /* pop es */
1928 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES); 2577 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES);
1929 if (rc != 0) 2578 if (rc != X86EMUL_CONTINUE)
1930 goto done; 2579 goto done;
1931 break; 2580 break;
1932 case 0x08 ... 0x0d: 2581 case 0x08 ... 0x0d:
@@ -1945,7 +2594,7 @@ special_insn:
1945 break; 2594 break;
1946 case 0x17: /* pop ss */ 2595 case 0x17: /* pop ss */
1947 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS); 2596 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS);
1948 if (rc != 0) 2597 if (rc != X86EMUL_CONTINUE)
1949 goto done; 2598 goto done;
1950 break; 2599 break;
1951 case 0x18 ... 0x1d: 2600 case 0x18 ... 0x1d:
@@ -1957,7 +2606,7 @@ special_insn:
1957 break; 2606 break;
1958 case 0x1f: /* pop ds */ 2607 case 0x1f: /* pop ds */
1959 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS); 2608 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS);
1960 if (rc != 0) 2609 if (rc != X86EMUL_CONTINUE)
1961 goto done; 2610 goto done;
1962 break; 2611 break;
1963 case 0x20 ... 0x25: 2612 case 0x20 ... 0x25:
@@ -1988,7 +2637,7 @@ special_insn:
1988 case 0x58 ... 0x5f: /* pop reg */ 2637 case 0x58 ... 0x5f: /* pop reg */
1989 pop_instruction: 2638 pop_instruction:
1990 rc = emulate_pop(ctxt, ops, &c->dst.val, c->op_bytes); 2639 rc = emulate_pop(ctxt, ops, &c->dst.val, c->op_bytes);
1991 if (rc != 0) 2640 if (rc != X86EMUL_CONTINUE)
1992 goto done; 2641 goto done;
1993 break; 2642 break;
1994 case 0x60: /* pusha */ 2643 case 0x60: /* pusha */
@@ -1996,7 +2645,7 @@ special_insn:
1996 break; 2645 break;
1997 case 0x61: /* popa */ 2646 case 0x61: /* popa */
1998 rc = emulate_popa(ctxt, ops); 2647 rc = emulate_popa(ctxt, ops);
1999 if (rc != 0) 2648 if (rc != X86EMUL_CONTINUE)
2000 goto done; 2649 goto done;
2001 break; 2650 break;
2002 case 0x63: /* movsxd */ 2651 case 0x63: /* movsxd */
@@ -2010,47 +2659,29 @@ special_insn:
2010 break; 2659 break;
2011 case 0x6c: /* insb */ 2660 case 0x6c: /* insb */
2012 case 0x6d: /* insw/insd */ 2661 case 0x6d: /* insw/insd */
2662 c->dst.bytes = min(c->dst.bytes, 4u);
2013 if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], 2663 if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX],
2014 (c->d & ByteOp) ? 1 : c->op_bytes)) { 2664 c->dst.bytes)) {
2015 kvm_inject_gp(ctxt->vcpu, 0); 2665 kvm_inject_gp(ctxt->vcpu, 0);
2016 goto done; 2666 goto done;
2017 } 2667 }
2018 if (kvm_emulate_pio_string(ctxt->vcpu, 2668 if (!pio_in_emulated(ctxt, ops, c->dst.bytes,
2019 1, 2669 c->regs[VCPU_REGS_RDX], &c->dst.val))
2020 (c->d & ByteOp) ? 1 : c->op_bytes, 2670 goto done; /* IO is needed, skip writeback */
2021 c->rep_prefix ? 2671 break;
2022 address_mask(c, c->regs[VCPU_REGS_RCX]) : 1,
2023 (ctxt->eflags & EFLG_DF),
2024 register_address(c, es_base(ctxt),
2025 c->regs[VCPU_REGS_RDI]),
2026 c->rep_prefix,
2027 c->regs[VCPU_REGS_RDX]) == 0) {
2028 c->eip = saved_eip;
2029 return -1;
2030 }
2031 return 0;
2032 case 0x6e: /* outsb */ 2672 case 0x6e: /* outsb */
2033 case 0x6f: /* outsw/outsd */ 2673 case 0x6f: /* outsw/outsd */
2674 c->src.bytes = min(c->src.bytes, 4u);
2034 if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], 2675 if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX],
2035 (c->d & ByteOp) ? 1 : c->op_bytes)) { 2676 c->src.bytes)) {
2036 kvm_inject_gp(ctxt->vcpu, 0); 2677 kvm_inject_gp(ctxt->vcpu, 0);
2037 goto done; 2678 goto done;
2038 } 2679 }
2039 if (kvm_emulate_pio_string(ctxt->vcpu, 2680 ops->pio_out_emulated(c->src.bytes, c->regs[VCPU_REGS_RDX],
2040 0, 2681 &c->src.val, 1, ctxt->vcpu);
2041 (c->d & ByteOp) ? 1 : c->op_bytes, 2682
2042 c->rep_prefix ? 2683 c->dst.type = OP_NONE; /* nothing to writeback */
2043 address_mask(c, c->regs[VCPU_REGS_RCX]) : 1, 2684 break;
2044 (ctxt->eflags & EFLG_DF),
2045 register_address(c,
2046 seg_override_base(ctxt, c),
2047 c->regs[VCPU_REGS_RSI]),
2048 c->rep_prefix,
2049 c->regs[VCPU_REGS_RDX]) == 0) {
2050 c->eip = saved_eip;
2051 return -1;
2052 }
2053 return 0;
2054 case 0x70 ... 0x7f: /* jcc (short) */ 2685 case 0x70 ... 0x7f: /* jcc (short) */
2055 if (test_cc(c->b, ctxt->eflags)) 2686 if (test_cc(c->b, ctxt->eflags))
2056 jmp_rel(c, c->src.val); 2687 jmp_rel(c, c->src.val);
@@ -2107,12 +2738,11 @@ special_insn:
2107 case 0x8c: { /* mov r/m, sreg */ 2738 case 0x8c: { /* mov r/m, sreg */
2108 struct kvm_segment segreg; 2739 struct kvm_segment segreg;
2109 2740
2110 if (c->modrm_reg <= 5) 2741 if (c->modrm_reg <= VCPU_SREG_GS)
2111 kvm_get_segment(ctxt->vcpu, &segreg, c->modrm_reg); 2742 kvm_get_segment(ctxt->vcpu, &segreg, c->modrm_reg);
2112 else { 2743 else {
2113 printk(KERN_INFO "0x8c: Invalid segreg in modrm byte 0x%02x\n", 2744 kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
2114 c->modrm); 2745 goto done;
2115 goto cannot_emulate;
2116 } 2746 }
2117 c->dst.val = segreg.selector; 2747 c->dst.val = segreg.selector;
2118 break; 2748 break;
@@ -2132,16 +2762,16 @@ special_insn:
2132 } 2762 }
2133 2763
2134 if (c->modrm_reg == VCPU_SREG_SS) 2764 if (c->modrm_reg == VCPU_SREG_SS)
2135 toggle_interruptibility(ctxt, X86_SHADOW_INT_MOV_SS); 2765 toggle_interruptibility(ctxt, KVM_X86_SHADOW_INT_MOV_SS);
2136 2766
2137 rc = kvm_load_segment_descriptor(ctxt->vcpu, sel, c->modrm_reg); 2767 rc = load_segment_descriptor(ctxt, ops, sel, c->modrm_reg);
2138 2768
2139 c->dst.type = OP_NONE; /* Disable writeback. */ 2769 c->dst.type = OP_NONE; /* Disable writeback. */
2140 break; 2770 break;
2141 } 2771 }
2142 case 0x8f: /* pop (sole member of Grp1a) */ 2772 case 0x8f: /* pop (sole member of Grp1a) */
2143 rc = emulate_grp1a(ctxt, ops); 2773 rc = emulate_grp1a(ctxt, ops);
2144 if (rc != 0) 2774 if (rc != X86EMUL_CONTINUE)
2145 goto done; 2775 goto done;
2146 break; 2776 break;
2147 case 0x90: /* nop / xchg r8,rax */ 2777 case 0x90: /* nop / xchg r8,rax */
@@ -2175,89 +2805,16 @@ special_insn:
2175 c->dst.val = (unsigned long)c->regs[VCPU_REGS_RAX]; 2805 c->dst.val = (unsigned long)c->regs[VCPU_REGS_RAX];
2176 break; 2806 break;
2177 case 0xa4 ... 0xa5: /* movs */ 2807 case 0xa4 ... 0xa5: /* movs */
2178 c->dst.type = OP_MEM; 2808 goto mov;
2179 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
2180 c->dst.ptr = (unsigned long *)register_address(c,
2181 es_base(ctxt),
2182 c->regs[VCPU_REGS_RDI]);
2183 rc = ops->read_emulated(register_address(c,
2184 seg_override_base(ctxt, c),
2185 c->regs[VCPU_REGS_RSI]),
2186 &c->dst.val,
2187 c->dst.bytes, ctxt->vcpu);
2188 if (rc != X86EMUL_CONTINUE)
2189 goto done;
2190 register_address_increment(c, &c->regs[VCPU_REGS_RSI],
2191 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
2192 : c->dst.bytes);
2193 register_address_increment(c, &c->regs[VCPU_REGS_RDI],
2194 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
2195 : c->dst.bytes);
2196 break;
2197 case 0xa6 ... 0xa7: /* cmps */ 2809 case 0xa6 ... 0xa7: /* cmps */
2198 c->src.type = OP_NONE; /* Disable writeback. */
2199 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
2200 c->src.ptr = (unsigned long *)register_address(c,
2201 seg_override_base(ctxt, c),
2202 c->regs[VCPU_REGS_RSI]);
2203 rc = ops->read_emulated((unsigned long)c->src.ptr,
2204 &c->src.val,
2205 c->src.bytes,
2206 ctxt->vcpu);
2207 if (rc != X86EMUL_CONTINUE)
2208 goto done;
2209
2210 c->dst.type = OP_NONE; /* Disable writeback. */ 2810 c->dst.type = OP_NONE; /* Disable writeback. */
2211 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
2212 c->dst.ptr = (unsigned long *)register_address(c,
2213 es_base(ctxt),
2214 c->regs[VCPU_REGS_RDI]);
2215 rc = ops->read_emulated((unsigned long)c->dst.ptr,
2216 &c->dst.val,
2217 c->dst.bytes,
2218 ctxt->vcpu);
2219 if (rc != X86EMUL_CONTINUE)
2220 goto done;
2221
2222 DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr); 2811 DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr);
2223 2812 goto cmp;
2224 emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
2225
2226 register_address_increment(c, &c->regs[VCPU_REGS_RSI],
2227 (ctxt->eflags & EFLG_DF) ? -c->src.bytes
2228 : c->src.bytes);
2229 register_address_increment(c, &c->regs[VCPU_REGS_RDI],
2230 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
2231 : c->dst.bytes);
2232
2233 break;
2234 case 0xaa ... 0xab: /* stos */ 2813 case 0xaa ... 0xab: /* stos */
2235 c->dst.type = OP_MEM;
2236 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
2237 c->dst.ptr = (unsigned long *)register_address(c,
2238 es_base(ctxt),
2239 c->regs[VCPU_REGS_RDI]);
2240 c->dst.val = c->regs[VCPU_REGS_RAX]; 2814 c->dst.val = c->regs[VCPU_REGS_RAX];
2241 register_address_increment(c, &c->regs[VCPU_REGS_RDI],
2242 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
2243 : c->dst.bytes);
2244 break; 2815 break;
2245 case 0xac ... 0xad: /* lods */ 2816 case 0xac ... 0xad: /* lods */
2246 c->dst.type = OP_REG; 2817 goto mov;
2247 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
2248 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
2249 rc = ops->read_emulated(register_address(c,
2250 seg_override_base(ctxt, c),
2251 c->regs[VCPU_REGS_RSI]),
2252 &c->dst.val,
2253 c->dst.bytes,
2254 ctxt->vcpu);
2255 if (rc != X86EMUL_CONTINUE)
2256 goto done;
2257 register_address_increment(c, &c->regs[VCPU_REGS_RSI],
2258 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
2259 : c->dst.bytes);
2260 break;
2261 case 0xae ... 0xaf: /* scas */ 2818 case 0xae ... 0xaf: /* scas */
2262 DPRINTF("Urk! I don't handle SCAS.\n"); 2819 DPRINTF("Urk! I don't handle SCAS.\n");
2263 goto cannot_emulate; 2820 goto cannot_emulate;
@@ -2277,7 +2834,7 @@ special_insn:
2277 break; 2834 break;
2278 case 0xcb: /* ret far */ 2835 case 0xcb: /* ret far */
2279 rc = emulate_ret_far(ctxt, ops); 2836 rc = emulate_ret_far(ctxt, ops);
2280 if (rc) 2837 if (rc != X86EMUL_CONTINUE)
2281 goto done; 2838 goto done;
2282 break; 2839 break;
2283 case 0xd0 ... 0xd1: /* Grp2 */ 2840 case 0xd0 ... 0xd1: /* Grp2 */
@@ -2290,14 +2847,10 @@ special_insn:
2290 break; 2847 break;
2291 case 0xe4: /* inb */ 2848 case 0xe4: /* inb */
2292 case 0xe5: /* in */ 2849 case 0xe5: /* in */
2293 port = c->src.val; 2850 goto do_io_in;
2294 io_dir_in = 1;
2295 goto do_io;
2296 case 0xe6: /* outb */ 2851 case 0xe6: /* outb */
2297 case 0xe7: /* out */ 2852 case 0xe7: /* out */
2298 port = c->src.val; 2853 goto do_io_out;
2299 io_dir_in = 0;
2300 goto do_io;
2301 case 0xe8: /* call (near) */ { 2854 case 0xe8: /* call (near) */ {
2302 long int rel = c->src.val; 2855 long int rel = c->src.val;
2303 c->src.val = (unsigned long) c->eip; 2856 c->src.val = (unsigned long) c->eip;
@@ -2308,8 +2861,9 @@ special_insn:
2308 case 0xe9: /* jmp rel */ 2861 case 0xe9: /* jmp rel */
2309 goto jmp; 2862 goto jmp;
2310 case 0xea: /* jmp far */ 2863 case 0xea: /* jmp far */
2311 if (kvm_load_segment_descriptor(ctxt->vcpu, c->src2.val, 2864 jump_far:
2312 VCPU_SREG_CS)) 2865 if (load_segment_descriptor(ctxt, ops, c->src2.val,
2866 VCPU_SREG_CS))
2313 goto done; 2867 goto done;
2314 2868
2315 c->eip = c->src.val; 2869 c->eip = c->src.val;
@@ -2321,25 +2875,29 @@ special_insn:
2321 break; 2875 break;
2322 case 0xec: /* in al,dx */ 2876 case 0xec: /* in al,dx */
2323 case 0xed: /* in (e/r)ax,dx */ 2877 case 0xed: /* in (e/r)ax,dx */
2324 port = c->regs[VCPU_REGS_RDX]; 2878 c->src.val = c->regs[VCPU_REGS_RDX];
2325 io_dir_in = 1; 2879 do_io_in:
2326 goto do_io; 2880 c->dst.bytes = min(c->dst.bytes, 4u);
2881 if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) {
2882 kvm_inject_gp(ctxt->vcpu, 0);
2883 goto done;
2884 }
2885 if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val,
2886 &c->dst.val))
2887 goto done; /* IO is needed */
2888 break;
2327 case 0xee: /* out al,dx */ 2889 case 0xee: /* out al,dx */
2328 case 0xef: /* out (e/r)ax,dx */ 2890 case 0xef: /* out (e/r)ax,dx */
2329 port = c->regs[VCPU_REGS_RDX]; 2891 c->src.val = c->regs[VCPU_REGS_RDX];
2330 io_dir_in = 0; 2892 do_io_out:
2331 do_io: 2893 c->dst.bytes = min(c->dst.bytes, 4u);
2332 if (!emulator_io_permited(ctxt, ops, port, 2894 if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) {
2333 (c->d & ByteOp) ? 1 : c->op_bytes)) {
2334 kvm_inject_gp(ctxt->vcpu, 0); 2895 kvm_inject_gp(ctxt->vcpu, 0);
2335 goto done; 2896 goto done;
2336 } 2897 }
2337 if (kvm_emulate_pio(ctxt->vcpu, io_dir_in, 2898 ops->pio_out_emulated(c->dst.bytes, c->src.val, &c->dst.val, 1,
2338 (c->d & ByteOp) ? 1 : c->op_bytes, 2899 ctxt->vcpu);
2339 port) != 0) { 2900 c->dst.type = OP_NONE; /* Disable writeback. */
2340 c->eip = saved_eip;
2341 goto cannot_emulate;
2342 }
2343 break; 2901 break;
2344 case 0xf4: /* hlt */ 2902 case 0xf4: /* hlt */
2345 ctxt->vcpu->arch.halt_request = 1; 2903 ctxt->vcpu->arch.halt_request = 1;
@@ -2350,16 +2908,15 @@ special_insn:
2350 c->dst.type = OP_NONE; /* Disable writeback. */ 2908 c->dst.type = OP_NONE; /* Disable writeback. */
2351 break; 2909 break;
2352 case 0xf6 ... 0xf7: /* Grp3 */ 2910 case 0xf6 ... 0xf7: /* Grp3 */
2353 rc = emulate_grp3(ctxt, ops); 2911 if (!emulate_grp3(ctxt, ops))
2354 if (rc != 0) 2912 goto cannot_emulate;
2355 goto done;
2356 break; 2913 break;
2357 case 0xf8: /* clc */ 2914 case 0xf8: /* clc */
2358 ctxt->eflags &= ~EFLG_CF; 2915 ctxt->eflags &= ~EFLG_CF;
2359 c->dst.type = OP_NONE; /* Disable writeback. */ 2916 c->dst.type = OP_NONE; /* Disable writeback. */
2360 break; 2917 break;
2361 case 0xfa: /* cli */ 2918 case 0xfa: /* cli */
2362 if (emulator_bad_iopl(ctxt)) 2919 if (emulator_bad_iopl(ctxt, ops))
2363 kvm_inject_gp(ctxt->vcpu, 0); 2920 kvm_inject_gp(ctxt->vcpu, 0);
2364 else { 2921 else {
2365 ctxt->eflags &= ~X86_EFLAGS_IF; 2922 ctxt->eflags &= ~X86_EFLAGS_IF;
@@ -2367,10 +2924,10 @@ special_insn:
2367 } 2924 }
2368 break; 2925 break;
2369 case 0xfb: /* sti */ 2926 case 0xfb: /* sti */
2370 if (emulator_bad_iopl(ctxt)) 2927 if (emulator_bad_iopl(ctxt, ops))
2371 kvm_inject_gp(ctxt->vcpu, 0); 2928 kvm_inject_gp(ctxt->vcpu, 0);
2372 else { 2929 else {
2373 toggle_interruptibility(ctxt, X86_SHADOW_INT_STI); 2930 toggle_interruptibility(ctxt, KVM_X86_SHADOW_INT_STI);
2374 ctxt->eflags |= X86_EFLAGS_IF; 2931 ctxt->eflags |= X86_EFLAGS_IF;
2375 c->dst.type = OP_NONE; /* Disable writeback. */ 2932 c->dst.type = OP_NONE; /* Disable writeback. */
2376 } 2933 }
@@ -2383,28 +2940,55 @@ special_insn:
2383 ctxt->eflags |= EFLG_DF; 2940 ctxt->eflags |= EFLG_DF;
2384 c->dst.type = OP_NONE; /* Disable writeback. */ 2941 c->dst.type = OP_NONE; /* Disable writeback. */
2385 break; 2942 break;
2386 case 0xfe ... 0xff: /* Grp4/Grp5 */ 2943 case 0xfe: /* Grp4 */
2944 grp45:
2387 rc = emulate_grp45(ctxt, ops); 2945 rc = emulate_grp45(ctxt, ops);
2388 if (rc != 0) 2946 if (rc != X86EMUL_CONTINUE)
2389 goto done; 2947 goto done;
2390 break; 2948 break;
2949 case 0xff: /* Grp5 */
2950 if (c->modrm_reg == 5)
2951 goto jump_far;
2952 goto grp45;
2391 } 2953 }
2392 2954
2393writeback: 2955writeback:
2394 rc = writeback(ctxt, ops); 2956 rc = writeback(ctxt, ops);
2395 if (rc != 0) 2957 if (rc != X86EMUL_CONTINUE)
2396 goto done; 2958 goto done;
2397 2959
2960 /*
2961 * restore dst type in case the decoding will be reused
2962 * (happens for string instruction )
2963 */
2964 c->dst.type = saved_dst_type;
2965
2966 if ((c->d & SrcMask) == SrcSI)
2967 string_addr_inc(ctxt, seg_override_base(ctxt, c), VCPU_REGS_RSI,
2968 &c->src);
2969
2970 if ((c->d & DstMask) == DstDI)
2971 string_addr_inc(ctxt, es_base(ctxt), VCPU_REGS_RDI, &c->dst);
2972
2973 if (c->rep_prefix && (c->d & String)) {
2974 struct read_cache *rc = &ctxt->decode.io_read;
2975 register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1);
2976 /*
2977 * Re-enter guest when pio read ahead buffer is empty or,
2978 * if it is not used, after each 1024 iteration.
2979 */
2980 if ((rc->end == 0 && !(c->regs[VCPU_REGS_RCX] & 0x3ff)) ||
2981 (rc->end != 0 && rc->end == rc->pos))
2982 ctxt->restart = false;
2983 }
2984
2398 /* Commit shadow register state. */ 2985 /* Commit shadow register state. */
2399 memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs); 2986 memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
2400 kvm_rip_write(ctxt->vcpu, c->eip); 2987 kvm_rip_write(ctxt->vcpu, c->eip);
2988 ops->set_rflags(ctxt->vcpu, ctxt->eflags);
2401 2989
2402done: 2990done:
2403 if (rc == X86EMUL_UNHANDLEABLE) { 2991 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
2404 c->eip = saved_eip;
2405 return -1;
2406 }
2407 return 0;
2408 2992
2409twobyte_insn: 2993twobyte_insn:
2410 switch (c->b) { 2994 switch (c->b) {
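
The hunk above replaces the per-opcode movs/stos/lods bookkeeping with generic post-writeback handling: operands decoded as SrcSI/DstDI get their index registers advanced once, and a rep prefix decrements RCX and clears ctxt->restart when the PIO read-ahead cache drains or, if unused, every 1024 iterations. A minimal standalone sketch of that control flow follows; the struct and field names are simplified stand-ins, not the kernel's.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define EFLG_DF (1u << 10)

struct read_cache { unsigned pos, end; };

struct emu_state {
	unsigned long rsi, rdi, rcx;
	unsigned eflags;
	bool restart;              /* re-run the decoded instruction? */
	struct read_cache io_read; /* PIO read-ahead buffer */
};

/* Advance a string index register by +size or -size depending on EFLAGS.DF. */
static void string_addr_inc(struct emu_state *st, unsigned long *reg, int size)
{
	*reg += (st->eflags & EFLG_DF) ? -size : size;
}

/* Decide whether to keep looping inside the emulator after one iteration. */
static void rep_epilogue(struct emu_state *st)
{
	st->rcx--;
	/*
	 * Re-enter the guest when the read-ahead buffer is empty or, if it
	 * is unused, after each 1024 iterations, mirroring the heuristic in
	 * the writeback path above.
	 */
	if ((st->io_read.end == 0 && !(st->rcx & 0x3ff)) ||
	    (st->io_read.end != 0 && st->io_read.end == st->io_read.pos))
		st->restart = false;
}

int main(void)
{
	struct emu_state st = { .rcx = 3, .restart = true };

	while (st.restart && st.rcx) {
		string_addr_inc(&st, &st.rsi, 2); /* e.g. a word-sized lods */
		rep_epilogue(&st);
	}
	printf("rsi=%lu rcx=%lu\n", st.rsi, st.rcx);
	return 0;
}
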
@@ -2418,18 +3002,18 @@ twobyte_insn:
2418 goto cannot_emulate; 3002 goto cannot_emulate;
2419 3003
2420 rc = kvm_fix_hypercall(ctxt->vcpu); 3004 rc = kvm_fix_hypercall(ctxt->vcpu);
2421 if (rc) 3005 if (rc != X86EMUL_CONTINUE)
2422 goto done; 3006 goto done;
2423 3007
2424 /* Let the processor re-execute the fixed hypercall */ 3008 /* Let the processor re-execute the fixed hypercall */
2425 c->eip = kvm_rip_read(ctxt->vcpu); 3009 c->eip = ctxt->eip;
2426 /* Disable writeback. */ 3010 /* Disable writeback. */
2427 c->dst.type = OP_NONE; 3011 c->dst.type = OP_NONE;
2428 break; 3012 break;
2429 case 2: /* lgdt */ 3013 case 2: /* lgdt */
2430 rc = read_descriptor(ctxt, ops, c->src.ptr, 3014 rc = read_descriptor(ctxt, ops, c->src.ptr,
2431 &size, &address, c->op_bytes); 3015 &size, &address, c->op_bytes);
2432 if (rc) 3016 if (rc != X86EMUL_CONTINUE)
2433 goto done; 3017 goto done;
2434 realmode_lgdt(ctxt->vcpu, size, address); 3018 realmode_lgdt(ctxt->vcpu, size, address);
2435 /* Disable writeback. */ 3019 /* Disable writeback. */
@@ -2440,7 +3024,7 @@ twobyte_insn:
2440 switch (c->modrm_rm) { 3024 switch (c->modrm_rm) {
2441 case 1: 3025 case 1:
2442 rc = kvm_fix_hypercall(ctxt->vcpu); 3026 rc = kvm_fix_hypercall(ctxt->vcpu);
2443 if (rc) 3027 if (rc != X86EMUL_CONTINUE)
2444 goto done; 3028 goto done;
2445 break; 3029 break;
2446 default: 3030 default:
@@ -2450,7 +3034,7 @@ twobyte_insn:
2450 rc = read_descriptor(ctxt, ops, c->src.ptr, 3034 rc = read_descriptor(ctxt, ops, c->src.ptr,
2451 &size, &address, 3035 &size, &address,
2452 c->op_bytes); 3036 c->op_bytes);
2453 if (rc) 3037 if (rc != X86EMUL_CONTINUE)
2454 goto done; 3038 goto done;
2455 realmode_lidt(ctxt->vcpu, size, address); 3039 realmode_lidt(ctxt->vcpu, size, address);
2456 } 3040 }
@@ -2459,15 +3043,18 @@ twobyte_insn:
2459 break; 3043 break;
2460 case 4: /* smsw */ 3044 case 4: /* smsw */
2461 c->dst.bytes = 2; 3045 c->dst.bytes = 2;
2462 c->dst.val = realmode_get_cr(ctxt->vcpu, 0); 3046 c->dst.val = ops->get_cr(0, ctxt->vcpu);
2463 break; 3047 break;
2464 case 6: /* lmsw */ 3048 case 6: /* lmsw */
2465 realmode_lmsw(ctxt->vcpu, (u16)c->src.val, 3049 ops->set_cr(0, (ops->get_cr(0, ctxt->vcpu) & ~0x0ful) |
2466 &ctxt->eflags); 3050 (c->src.val & 0x0f), ctxt->vcpu);
2467 c->dst.type = OP_NONE; 3051 c->dst.type = OP_NONE;
2468 break; 3052 break;
3053 case 5: /* not defined */
3054 kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
3055 goto done;
2469 case 7: /* invlpg*/ 3056 case 7: /* invlpg*/
2470 emulate_invlpg(ctxt->vcpu, memop); 3057 emulate_invlpg(ctxt->vcpu, c->modrm_ea);
2471 /* Disable writeback. */ 3058 /* Disable writeback. */
2472 c->dst.type = OP_NONE; 3059 c->dst.type = OP_NONE;
2473 break; 3060 break;
@@ -2493,54 +3080,54 @@ twobyte_insn:
2493 c->dst.type = OP_NONE; 3080 c->dst.type = OP_NONE;
2494 break; 3081 break;
2495 case 0x20: /* mov cr, reg */ 3082 case 0x20: /* mov cr, reg */
2496 if (c->modrm_mod != 3) 3083 switch (c->modrm_reg) {
2497 goto cannot_emulate; 3084 case 1:
2498 c->regs[c->modrm_rm] = 3085 case 5 ... 7:
2499 realmode_get_cr(ctxt->vcpu, c->modrm_reg); 3086 case 9 ... 15:
3087 kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
3088 goto done;
3089 }
3090 c->regs[c->modrm_rm] = ops->get_cr(c->modrm_reg, ctxt->vcpu);
2500 c->dst.type = OP_NONE; /* no writeback */ 3091 c->dst.type = OP_NONE; /* no writeback */
2501 break; 3092 break;
2502 case 0x21: /* mov from dr to reg */ 3093 case 0x21: /* mov from dr to reg */
2503 if (c->modrm_mod != 3) 3094 if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
2504 goto cannot_emulate; 3095 (c->modrm_reg == 4 || c->modrm_reg == 5)) {
2505 rc = emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]); 3096 kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
2506 if (rc) 3097 goto done;
2507 goto cannot_emulate; 3098 }
3099 emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]);
2508 c->dst.type = OP_NONE; /* no writeback */ 3100 c->dst.type = OP_NONE; /* no writeback */
2509 break; 3101 break;
2510 case 0x22: /* mov reg, cr */ 3102 case 0x22: /* mov reg, cr */
2511 if (c->modrm_mod != 3) 3103 ops->set_cr(c->modrm_reg, c->modrm_val, ctxt->vcpu);
2512 goto cannot_emulate;
2513 realmode_set_cr(ctxt->vcpu,
2514 c->modrm_reg, c->modrm_val, &ctxt->eflags);
2515 c->dst.type = OP_NONE; 3104 c->dst.type = OP_NONE;
2516 break; 3105 break;
2517 case 0x23: /* mov from reg to dr */ 3106 case 0x23: /* mov from reg to dr */
2518 if (c->modrm_mod != 3) 3107 if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
2519 goto cannot_emulate; 3108 (c->modrm_reg == 4 || c->modrm_reg == 5)) {
2520 rc = emulator_set_dr(ctxt, c->modrm_reg, 3109 kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
2521 c->regs[c->modrm_rm]); 3110 goto done;
2522 if (rc) 3111 }
2523 goto cannot_emulate; 3112 emulator_set_dr(ctxt, c->modrm_reg, c->regs[c->modrm_rm]);
2524 c->dst.type = OP_NONE; /* no writeback */ 3113 c->dst.type = OP_NONE; /* no writeback */
2525 break; 3114 break;
2526 case 0x30: 3115 case 0x30:
2527 /* wrmsr */ 3116 /* wrmsr */
2528 msr_data = (u32)c->regs[VCPU_REGS_RAX] 3117 msr_data = (u32)c->regs[VCPU_REGS_RAX]
2529 | ((u64)c->regs[VCPU_REGS_RDX] << 32); 3118 | ((u64)c->regs[VCPU_REGS_RDX] << 32);
2530 rc = kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data); 3119 if (kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) {
2531 if (rc) {
2532 kvm_inject_gp(ctxt->vcpu, 0); 3120 kvm_inject_gp(ctxt->vcpu, 0);
2533 c->eip = kvm_rip_read(ctxt->vcpu); 3121 goto done;
2534 } 3122 }
2535 rc = X86EMUL_CONTINUE; 3123 rc = X86EMUL_CONTINUE;
2536 c->dst.type = OP_NONE; 3124 c->dst.type = OP_NONE;
2537 break; 3125 break;
2538 case 0x32: 3126 case 0x32:
2539 /* rdmsr */ 3127 /* rdmsr */
2540 rc = kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data); 3128 if (kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) {
2541 if (rc) {
2542 kvm_inject_gp(ctxt->vcpu, 0); 3129 kvm_inject_gp(ctxt->vcpu, 0);
2543 c->eip = kvm_rip_read(ctxt->vcpu); 3130 goto done;
2544 } else { 3131 } else {
2545 c->regs[VCPU_REGS_RAX] = (u32)msr_data; 3132 c->regs[VCPU_REGS_RAX] = (u32)msr_data;
2546 c->regs[VCPU_REGS_RDX] = msr_data >> 32; 3133 c->regs[VCPU_REGS_RDX] = msr_data >> 32;
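
The CR/DR hunks above replace the blanket modrm_mod checks with architectural filtering: a mov from a nonexistent control register (CR1, CR5-CR7, CR9-CR15) now raises #UD, and DR4/DR5 accesses raise #UD whenever CR4.DE is set. A compact standalone version of those two checks, with bit values per the SDM and illustrative names:

#include <stdio.h>

#define X86_CR4_DE (1u << 3)

enum emu_exc { EMU_OK, EMU_UD };

/* #UD-filter for "mov reg, crN": only CR0, CR2, CR3, CR4 and CR8 exist. */
static enum emu_exc check_cr_read(unsigned cr)
{
	switch (cr) {
	case 1:
	case 5 ... 7:   /* GCC/clang case-range extension, as in the kernel */
	case 9 ... 15:
		return EMU_UD;
	default:
		return EMU_OK;
	}
}

/* With CR4.DE set, DR4/DR5 accesses are undefined instead of aliasing DR6/DR7. */
static enum emu_exc check_dr_access(unsigned dr, unsigned long cr4)
{
	if ((cr4 & X86_CR4_DE) && (dr == 4 || dr == 5))
		return EMU_UD;
	return EMU_OK;
}

int main(void)
{
	printf("mov r,cr5 -> %s\n", check_cr_read(5) ? "#UD" : "ok");
	printf("mov r,dr4 (CR4.DE=1) -> %s\n",
	       check_dr_access(4, X86_CR4_DE) ? "#UD" : "ok");
	return 0;
}
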
@@ -2577,7 +3164,7 @@ twobyte_insn:
2577 break; 3164 break;
2578 case 0xa1: /* pop fs */ 3165 case 0xa1: /* pop fs */
2579 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS); 3166 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS);
2580 if (rc != 0) 3167 if (rc != X86EMUL_CONTINUE)
2581 goto done; 3168 goto done;
2582 break; 3169 break;
2583 case 0xa3: 3170 case 0xa3:
@@ -2596,7 +3183,7 @@ twobyte_insn:
2596 break; 3183 break;
2597 case 0xa9: /* pop gs */ 3184 case 0xa9: /* pop gs */
2598 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS); 3185 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS);
2599 if (rc != 0) 3186 if (rc != X86EMUL_CONTINUE)
2600 goto done; 3187 goto done;
2601 break; 3188 break;
2602 case 0xab: 3189 case 0xab:
@@ -2668,16 +3255,14 @@ twobyte_insn:
2668 (u64) c->src.val; 3255 (u64) c->src.val;
2669 break; 3256 break;
2670 case 0xc7: /* Grp9 (cmpxchg8b) */ 3257 case 0xc7: /* Grp9 (cmpxchg8b) */
2671 rc = emulate_grp9(ctxt, ops, memop); 3258 rc = emulate_grp9(ctxt, ops);
2672 if (rc != 0) 3259 if (rc != X86EMUL_CONTINUE)
2673 goto done; 3260 goto done;
2674 c->dst.type = OP_NONE;
2675 break; 3261 break;
2676 } 3262 }
2677 goto writeback; 3263 goto writeback;
2678 3264
2679cannot_emulate: 3265cannot_emulate:
2680 DPRINTF("Cannot emulate %02x\n", c->b); 3266 DPRINTF("Cannot emulate %02x\n", c->b);
2681 c->eip = saved_eip;
2682 return -1; 3267 return -1;
2683} 3268}
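
Throughout the emulate.c changes, the in/out and ins/outs handlers converge on one pattern: clamp the access width to 4 bytes, check I/O permission, inject #GP(0) on failure, and otherwise hand the transfer to the pio_in_emulated/pio_out_emulated callbacks (a read may have to bail out until userspace completes the I/O). A rough sketch of that shape; the helpers below are invented stand-ins with an arbitrary demo policy, not the kernel's implementations.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-in: the real code consults the TSS I/O permission bitmap when needed. */
static bool io_permitted(uint16_t port, unsigned bytes)
{
	return port != 0x70 || bytes == 1; /* arbitrary demo policy */
}

static void inject_gp0(void)
{
	puts("#GP(0) injected");
}

/* Stand-in for the read callback; false means "exit to userspace first". */
static bool pio_in(unsigned bytes, uint16_t port, unsigned long *val)
{
	*val = 0x12345678u & ((bytes < 4) ? (1u << (bytes * 8)) - 1 : ~0u);
	return true;
}

/* The common "in"/"ins" shape: clamp, permission-check, then transfer. */
static int emulate_io_in(uint16_t port, unsigned op_bytes, unsigned long *dst)
{
	unsigned bytes = op_bytes > 4 ? 4 : op_bytes; /* min(bytes, 4u) */

	if (!io_permitted(port, bytes)) {
		inject_gp0();
		return -1;
	}
	if (!pio_in(bytes, port, dst))
		return 1; /* I/O pending, skip writeback for now */
	return 0;
}

int main(void)
{
	unsigned long val;

	if (emulate_io_in(0x60, 8, &val) == 0)
		printf("read %#lx\n", val);
	return 0;
}
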
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index a790fa128a9f..93825ff3338f 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -33,6 +33,29 @@
33#include <linux/kvm_host.h> 33#include <linux/kvm_host.h>
34#include "trace.h" 34#include "trace.h"
35 35
36static void pic_lock(struct kvm_pic *s)
37 __acquires(&s->lock)
38{
39 raw_spin_lock(&s->lock);
40}
41
42static void pic_unlock(struct kvm_pic *s)
43 __releases(&s->lock)
44{
45 bool wakeup = s->wakeup_needed;
46 struct kvm_vcpu *vcpu;
47
48 s->wakeup_needed = false;
49
50 raw_spin_unlock(&s->lock);
51
52 if (wakeup) {
53 vcpu = s->kvm->bsp_vcpu;
54 if (vcpu)
55 kvm_vcpu_kick(vcpu);
56 }
57}
58
36static void pic_clear_isr(struct kvm_kpic_state *s, int irq) 59static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
37{ 60{
38 s->isr &= ~(1 << irq); 61 s->isr &= ~(1 << irq);
@@ -45,19 +68,19 @@ static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
45 * Other interrupt may be delivered to PIC while lock is dropped but 68 * Other interrupt may be delivered to PIC while lock is dropped but
46 * it should be safe since PIC state is already updated at this stage. 69 * it should be safe since PIC state is already updated at this stage.
47 */ 70 */
48 raw_spin_unlock(&s->pics_state->lock); 71 pic_unlock(s->pics_state);
49 kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq); 72 kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq);
50 raw_spin_lock(&s->pics_state->lock); 73 pic_lock(s->pics_state);
51} 74}
52 75
53void kvm_pic_clear_isr_ack(struct kvm *kvm) 76void kvm_pic_clear_isr_ack(struct kvm *kvm)
54{ 77{
55 struct kvm_pic *s = pic_irqchip(kvm); 78 struct kvm_pic *s = pic_irqchip(kvm);
56 79
57 raw_spin_lock(&s->lock); 80 pic_lock(s);
58 s->pics[0].isr_ack = 0xff; 81 s->pics[0].isr_ack = 0xff;
59 s->pics[1].isr_ack = 0xff; 82 s->pics[1].isr_ack = 0xff;
60 raw_spin_unlock(&s->lock); 83 pic_unlock(s);
61} 84}
62 85
63/* 86/*
@@ -158,9 +181,9 @@ static void pic_update_irq(struct kvm_pic *s)
158 181
159void kvm_pic_update_irq(struct kvm_pic *s) 182void kvm_pic_update_irq(struct kvm_pic *s)
160{ 183{
161 raw_spin_lock(&s->lock); 184 pic_lock(s);
162 pic_update_irq(s); 185 pic_update_irq(s);
163 raw_spin_unlock(&s->lock); 186 pic_unlock(s);
164} 187}
165 188
166int kvm_pic_set_irq(void *opaque, int irq, int level) 189int kvm_pic_set_irq(void *opaque, int irq, int level)
@@ -168,14 +191,14 @@ int kvm_pic_set_irq(void *opaque, int irq, int level)
168 struct kvm_pic *s = opaque; 191 struct kvm_pic *s = opaque;
169 int ret = -1; 192 int ret = -1;
170 193
171 raw_spin_lock(&s->lock); 194 pic_lock(s);
172 if (irq >= 0 && irq < PIC_NUM_PINS) { 195 if (irq >= 0 && irq < PIC_NUM_PINS) {
173 ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); 196 ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level);
174 pic_update_irq(s); 197 pic_update_irq(s);
175 trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr, 198 trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr,
176 s->pics[irq >> 3].imr, ret == 0); 199 s->pics[irq >> 3].imr, ret == 0);
177 } 200 }
178 raw_spin_unlock(&s->lock); 201 pic_unlock(s);
179 202
180 return ret; 203 return ret;
181} 204}
@@ -205,7 +228,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
205 int irq, irq2, intno; 228 int irq, irq2, intno;
206 struct kvm_pic *s = pic_irqchip(kvm); 229 struct kvm_pic *s = pic_irqchip(kvm);
207 230
208 raw_spin_lock(&s->lock); 231 pic_lock(s);
209 irq = pic_get_irq(&s->pics[0]); 232 irq = pic_get_irq(&s->pics[0]);
210 if (irq >= 0) { 233 if (irq >= 0) {
211 pic_intack(&s->pics[0], irq); 234 pic_intack(&s->pics[0], irq);
@@ -230,7 +253,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
230 intno = s->pics[0].irq_base + irq; 253 intno = s->pics[0].irq_base + irq;
231 } 254 }
232 pic_update_irq(s); 255 pic_update_irq(s);
233 raw_spin_unlock(&s->lock); 256 pic_unlock(s);
234 257
235 return intno; 258 return intno;
236} 259}
@@ -444,7 +467,7 @@ static int picdev_write(struct kvm_io_device *this,
444 printk(KERN_ERR "PIC: non byte write\n"); 467 printk(KERN_ERR "PIC: non byte write\n");
445 return 0; 468 return 0;
446 } 469 }
447 raw_spin_lock(&s->lock); 470 pic_lock(s);
448 switch (addr) { 471 switch (addr) {
449 case 0x20: 472 case 0x20:
450 case 0x21: 473 case 0x21:
@@ -457,7 +480,7 @@ static int picdev_write(struct kvm_io_device *this,
457 elcr_ioport_write(&s->pics[addr & 1], addr, data); 480 elcr_ioport_write(&s->pics[addr & 1], addr, data);
458 break; 481 break;
459 } 482 }
460 raw_spin_unlock(&s->lock); 483 pic_unlock(s);
461 return 0; 484 return 0;
462} 485}
463 486
@@ -474,7 +497,7 @@ static int picdev_read(struct kvm_io_device *this,
474 printk(KERN_ERR "PIC: non byte read\n"); 497 printk(KERN_ERR "PIC: non byte read\n");
475 return 0; 498 return 0;
476 } 499 }
477 raw_spin_lock(&s->lock); 500 pic_lock(s);
478 switch (addr) { 501 switch (addr) {
479 case 0x20: 502 case 0x20:
480 case 0x21: 503 case 0x21:
@@ -488,7 +511,7 @@ static int picdev_read(struct kvm_io_device *this,
488 break; 511 break;
489 } 512 }
490 *(unsigned char *)val = data; 513 *(unsigned char *)val = data;
491 raw_spin_unlock(&s->lock); 514 pic_unlock(s);
492 return 0; 515 return 0;
493} 516}
494 517
@@ -505,7 +528,7 @@ static void pic_irq_request(void *opaque, int level)
505 s->output = level; 528 s->output = level;
506 if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) { 529 if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) {
507 s->pics[0].isr_ack &= ~(1 << irq); 530 s->pics[0].isr_ack &= ~(1 << irq);
508 kvm_vcpu_kick(vcpu); 531 s->wakeup_needed = true;
509 } 532 }
510} 533}
511 534
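
The pic_lock()/pic_unlock() wrappers above let code running under the PIC spinlock merely mark wakeup_needed and defer the actual kvm_vcpu_kick() until after the raw spinlock is dropped. The same defer-work-past-the-unlock pattern, reduced to a standalone pthread sketch with illustrative names:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct pic {
	pthread_mutex_t lock;
	bool wakeup_needed;
};

static void wake_vcpu(void)
{
	/* Stand-in for kvm_vcpu_kick(); must not run under pic->lock. */
	puts("kicking vcpu");
}

static void pic_lock(struct pic *s)
{
	pthread_mutex_lock(&s->lock);
}

static void pic_unlock(struct pic *s)
{
	bool wakeup = s->wakeup_needed;

	s->wakeup_needed = false;
	pthread_mutex_unlock(&s->lock);

	/* Do the kick only after the lock is released. */
	if (wakeup)
		wake_vcpu();
}

int main(void)
{
	struct pic s = { PTHREAD_MUTEX_INITIALIZER, false };

	pic_lock(&s);
	s.wakeup_needed = true; /* e.g. pic_irq_request() raised output */
	pic_unlock(&s);
	return 0;
}
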
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 34b15915754d..cd1f362f413d 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -63,6 +63,7 @@ struct kvm_kpic_state {
63 63
64struct kvm_pic { 64struct kvm_pic {
65 raw_spinlock_t lock; 65 raw_spinlock_t lock;
66 bool wakeup_needed;
66 unsigned pending_acks; 67 unsigned pending_acks;
67 struct kvm *kvm; 68 struct kvm *kvm;
68 struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ 69 struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
diff --git a/arch/x86/kvm/kvm_timer.h b/arch/x86/kvm/kvm_timer.h
index 55c7524dda54..64bc6ea78d90 100644
--- a/arch/x86/kvm/kvm_timer.h
+++ b/arch/x86/kvm/kvm_timer.h
@@ -10,9 +10,7 @@ struct kvm_timer {
10}; 10};
11 11
12struct kvm_timer_ops { 12struct kvm_timer_ops {
13 bool (*is_periodic)(struct kvm_timer *); 13 bool (*is_periodic)(struct kvm_timer *);
14}; 14};
15 15
16
17enum hrtimer_restart kvm_timer_fn(struct hrtimer *data); 16enum hrtimer_restart kvm_timer_fn(struct hrtimer *data);
18
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 48aeee8eefb0..ddfa8658fb6d 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -148,7 +148,6 @@ module_param(oos_shadow, bool, 0644);
148 148
149#include <trace/events/kvm.h> 149#include <trace/events/kvm.h>
150 150
151#undef TRACE_INCLUDE_FILE
152#define CREATE_TRACE_POINTS 151#define CREATE_TRACE_POINTS
153#include "mmutrace.h" 152#include "mmutrace.h"
154 153
@@ -174,12 +173,7 @@ struct kvm_shadow_walk_iterator {
174 shadow_walk_okay(&(_walker)); \ 173 shadow_walk_okay(&(_walker)); \
175 shadow_walk_next(&(_walker))) 174 shadow_walk_next(&(_walker)))
176 175
177 176typedef int (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp);
178struct kvm_unsync_walk {
179 int (*entry) (struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk);
180};
181
182typedef int (*mmu_parent_walk_fn) (struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp);
183 177
184static struct kmem_cache *pte_chain_cache; 178static struct kmem_cache *pte_chain_cache;
185static struct kmem_cache *rmap_desc_cache; 179static struct kmem_cache *rmap_desc_cache;
@@ -327,7 +321,6 @@ static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
327 page = alloc_page(GFP_KERNEL); 321 page = alloc_page(GFP_KERNEL);
328 if (!page) 322 if (!page)
329 return -ENOMEM; 323 return -ENOMEM;
330 set_page_private(page, 0);
331 cache->objects[cache->nobjs++] = page_address(page); 324 cache->objects[cache->nobjs++] = page_address(page);
332 } 325 }
333 return 0; 326 return 0;
@@ -438,9 +431,9 @@ static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
438 int i; 431 int i;
439 432
440 gfn = unalias_gfn(kvm, gfn); 433 gfn = unalias_gfn(kvm, gfn);
434 slot = gfn_to_memslot_unaliased(kvm, gfn);
441 for (i = PT_DIRECTORY_LEVEL; 435 for (i = PT_DIRECTORY_LEVEL;
442 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { 436 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
443 slot = gfn_to_memslot_unaliased(kvm, gfn);
444 write_count = slot_largepage_idx(gfn, slot, i); 437 write_count = slot_largepage_idx(gfn, slot, i);
445 *write_count -= 1; 438 *write_count -= 1;
446 WARN_ON(*write_count < 0); 439 WARN_ON(*write_count < 0);
@@ -654,7 +647,6 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
654static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) 647static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
655{ 648{
656 struct kvm_rmap_desc *desc; 649 struct kvm_rmap_desc *desc;
657 struct kvm_rmap_desc *prev_desc;
658 u64 *prev_spte; 650 u64 *prev_spte;
659 int i; 651 int i;
660 652
@@ -666,7 +658,6 @@ static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
666 return NULL; 658 return NULL;
667 } 659 }
668 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); 660 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
669 prev_desc = NULL;
670 prev_spte = NULL; 661 prev_spte = NULL;
671 while (desc) { 662 while (desc) {
672 for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) { 663 for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) {
@@ -794,7 +785,7 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
794 int retval = 0; 785 int retval = 0;
795 struct kvm_memslots *slots; 786 struct kvm_memslots *slots;
796 787
797 slots = rcu_dereference(kvm->memslots); 788 slots = kvm_memslots(kvm);
798 789
799 for (i = 0; i < slots->nmemslots; i++) { 790 for (i = 0; i < slots->nmemslots; i++) {
800 struct kvm_memory_slot *memslot = &slots->memslots[i]; 791 struct kvm_memory_slot *memslot = &slots->memslots[i];
@@ -925,7 +916,6 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
925 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); 916 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
926 set_page_private(virt_to_page(sp->spt), (unsigned long)sp); 917 set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
927 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); 918 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
928 INIT_LIST_HEAD(&sp->oos_link);
929 bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); 919 bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
930 sp->multimapped = 0; 920 sp->multimapped = 0;
931 sp->parent_pte = parent_pte; 921 sp->parent_pte = parent_pte;
@@ -1009,8 +999,7 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
1009} 999}
1010 1000
1011 1001
1012static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, 1002static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn)
1013 mmu_parent_walk_fn fn)
1014{ 1003{
1015 struct kvm_pte_chain *pte_chain; 1004 struct kvm_pte_chain *pte_chain;
1016 struct hlist_node *node; 1005 struct hlist_node *node;
@@ -1019,8 +1008,8 @@ static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
1019 1008
1020 if (!sp->multimapped && sp->parent_pte) { 1009 if (!sp->multimapped && sp->parent_pte) {
1021 parent_sp = page_header(__pa(sp->parent_pte)); 1010 parent_sp = page_header(__pa(sp->parent_pte));
1022 fn(vcpu, parent_sp); 1011 fn(parent_sp);
1023 mmu_parent_walk(vcpu, parent_sp, fn); 1012 mmu_parent_walk(parent_sp, fn);
1024 return; 1013 return;
1025 } 1014 }
1026 hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) 1015 hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
@@ -1028,8 +1017,8 @@ static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
1028 if (!pte_chain->parent_ptes[i]) 1017 if (!pte_chain->parent_ptes[i])
1029 break; 1018 break;
1030 parent_sp = page_header(__pa(pte_chain->parent_ptes[i])); 1019 parent_sp = page_header(__pa(pte_chain->parent_ptes[i]));
1031 fn(vcpu, parent_sp); 1020 fn(parent_sp);
1032 mmu_parent_walk(vcpu, parent_sp, fn); 1021 mmu_parent_walk(parent_sp, fn);
1033 } 1022 }
1034} 1023}
1035 1024
@@ -1066,16 +1055,15 @@ static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp)
1066 } 1055 }
1067} 1056}
1068 1057
1069static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) 1058static int unsync_walk_fn(struct kvm_mmu_page *sp)
1070{ 1059{
1071 kvm_mmu_update_parents_unsync(sp); 1060 kvm_mmu_update_parents_unsync(sp);
1072 return 1; 1061 return 1;
1073} 1062}
1074 1063
1075static void kvm_mmu_mark_parents_unsync(struct kvm_vcpu *vcpu, 1064static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
1076 struct kvm_mmu_page *sp)
1077{ 1065{
1078 mmu_parent_walk(vcpu, sp, unsync_walk_fn); 1066 mmu_parent_walk(sp, unsync_walk_fn);
1079 kvm_mmu_update_parents_unsync(sp); 1067 kvm_mmu_update_parents_unsync(sp);
1080} 1068}
1081 1069
@@ -1209,7 +1197,7 @@ static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp);
1209 1197
1210static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) 1198static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1211{ 1199{
1212 if (sp->role.glevels != vcpu->arch.mmu.root_level) { 1200 if (sp->role.cr4_pae != !!is_pae(vcpu)) {
1213 kvm_mmu_zap_page(vcpu->kvm, sp); 1201 kvm_mmu_zap_page(vcpu->kvm, sp);
1214 return 1; 1202 return 1;
1215 } 1203 }
@@ -1331,6 +1319,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
1331 role = vcpu->arch.mmu.base_role; 1319 role = vcpu->arch.mmu.base_role;
1332 role.level = level; 1320 role.level = level;
1333 role.direct = direct; 1321 role.direct = direct;
1322 if (role.direct)
1323 role.cr4_pae = 0;
1334 role.access = access; 1324 role.access = access;
1335 if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { 1325 if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
1336 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); 1326 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
@@ -1351,7 +1341,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
1351 mmu_page_add_parent_pte(vcpu, sp, parent_pte); 1341 mmu_page_add_parent_pte(vcpu, sp, parent_pte);
1352 if (sp->unsync_children) { 1342 if (sp->unsync_children) {
1353 set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests); 1343 set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests);
1354 kvm_mmu_mark_parents_unsync(vcpu, sp); 1344 kvm_mmu_mark_parents_unsync(sp);
1355 } 1345 }
1356 trace_kvm_mmu_get_page(sp, false); 1346 trace_kvm_mmu_get_page(sp, false);
1357 return sp; 1347 return sp;
@@ -1490,8 +1480,8 @@ static int mmu_zap_unsync_children(struct kvm *kvm,
1490 for_each_sp(pages, sp, parents, i) { 1480 for_each_sp(pages, sp, parents, i) {
1491 kvm_mmu_zap_page(kvm, sp); 1481 kvm_mmu_zap_page(kvm, sp);
1492 mmu_pages_clear_parents(&parents); 1482 mmu_pages_clear_parents(&parents);
1483 zapped++;
1493 } 1484 }
1494 zapped += pages.nr;
1495 kvm_mmu_pages_init(parent, &parents, &pages); 1485 kvm_mmu_pages_init(parent, &parents, &pages);
1496 } 1486 }
1497 1487
@@ -1542,14 +1532,16 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
1542 */ 1532 */
1543 1533
1544 if (used_pages > kvm_nr_mmu_pages) { 1534 if (used_pages > kvm_nr_mmu_pages) {
1545 while (used_pages > kvm_nr_mmu_pages) { 1535 while (used_pages > kvm_nr_mmu_pages &&
1536 !list_empty(&kvm->arch.active_mmu_pages)) {
1546 struct kvm_mmu_page *page; 1537 struct kvm_mmu_page *page;
1547 1538
1548 page = container_of(kvm->arch.active_mmu_pages.prev, 1539 page = container_of(kvm->arch.active_mmu_pages.prev,
1549 struct kvm_mmu_page, link); 1540 struct kvm_mmu_page, link);
1550 kvm_mmu_zap_page(kvm, page); 1541 used_pages -= kvm_mmu_zap_page(kvm, page);
1551 used_pages--; 1542 used_pages--;
1552 } 1543 }
1544 kvm_nr_mmu_pages = used_pages;
1553 kvm->arch.n_free_mmu_pages = 0; 1545 kvm->arch.n_free_mmu_pages = 0;
1554 } 1546 }
1555 else 1547 else
@@ -1571,13 +1563,14 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
1571 r = 0; 1563 r = 0;
1572 index = kvm_page_table_hashfn(gfn); 1564 index = kvm_page_table_hashfn(gfn);
1573 bucket = &kvm->arch.mmu_page_hash[index]; 1565 bucket = &kvm->arch.mmu_page_hash[index];
1566restart:
1574 hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) 1567 hlist_for_each_entry_safe(sp, node, n, bucket, hash_link)
1575 if (sp->gfn == gfn && !sp->role.direct) { 1568 if (sp->gfn == gfn && !sp->role.direct) {
1576 pgprintk("%s: gfn %lx role %x\n", __func__, gfn, 1569 pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
1577 sp->role.word); 1570 sp->role.word);
1578 r = 1; 1571 r = 1;
1579 if (kvm_mmu_zap_page(kvm, sp)) 1572 if (kvm_mmu_zap_page(kvm, sp))
1580 n = bucket->first; 1573 goto restart;
1581 } 1574 }
1582 return r; 1575 return r;
1583} 1576}
@@ -1591,12 +1584,14 @@ static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
1591 1584
1592 index = kvm_page_table_hashfn(gfn); 1585 index = kvm_page_table_hashfn(gfn);
1593 bucket = &kvm->arch.mmu_page_hash[index]; 1586 bucket = &kvm->arch.mmu_page_hash[index];
1587restart:
1594 hlist_for_each_entry_safe(sp, node, nn, bucket, hash_link) { 1588 hlist_for_each_entry_safe(sp, node, nn, bucket, hash_link) {
1595 if (sp->gfn == gfn && !sp->role.direct 1589 if (sp->gfn == gfn && !sp->role.direct
1596 && !sp->role.invalid) { 1590 && !sp->role.invalid) {
1597 pgprintk("%s: zap %lx %x\n", 1591 pgprintk("%s: zap %lx %x\n",
1598 __func__, gfn, sp->role.word); 1592 __func__, gfn, sp->role.word);
1599 kvm_mmu_zap_page(kvm, sp); 1593 if (kvm_mmu_zap_page(kvm, sp))
1594 goto restart;
1600 } 1595 }
1601 } 1596 }
1602} 1597}
@@ -1623,20 +1618,6 @@ static void mmu_convert_notrap(struct kvm_mmu_page *sp)
1623 } 1618 }
1624} 1619}
1625 1620
1626struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
1627{
1628 struct page *page;
1629
1630 gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
1631
1632 if (gpa == UNMAPPED_GVA)
1633 return NULL;
1634
1635 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
1636
1637 return page;
1638}
1639
1640/* 1621/*
1641 * The function is based on mtrr_type_lookup() in 1622 * The function is based on mtrr_type_lookup() in
1642 * arch/x86/kernel/cpu/mtrr/generic.c 1623 * arch/x86/kernel/cpu/mtrr/generic.c
@@ -1762,7 +1743,7 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1762 ++vcpu->kvm->stat.mmu_unsync; 1743 ++vcpu->kvm->stat.mmu_unsync;
1763 sp->unsync = 1; 1744 sp->unsync = 1;
1764 1745
1765 kvm_mmu_mark_parents_unsync(vcpu, sp); 1746 kvm_mmu_mark_parents_unsync(sp);
1766 1747
1767 mmu_convert_notrap(sp); 1748 mmu_convert_notrap(sp);
1768 return 0; 1749 return 0;
@@ -2296,13 +2277,19 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
2296 /* no rsvd bits for 2 level 4K page table entries */ 2277 /* no rsvd bits for 2 level 4K page table entries */
2297 context->rsvd_bits_mask[0][1] = 0; 2278 context->rsvd_bits_mask[0][1] = 0;
2298 context->rsvd_bits_mask[0][0] = 0; 2279 context->rsvd_bits_mask[0][0] = 0;
2280 context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
2281
2282 if (!is_pse(vcpu)) {
2283 context->rsvd_bits_mask[1][1] = 0;
2284 break;
2285 }
2286
2299 if (is_cpuid_PSE36()) 2287 if (is_cpuid_PSE36())
2300 /* 36bits PSE 4MB page */ 2288 /* 36bits PSE 4MB page */
2301 context->rsvd_bits_mask[1][1] = rsvd_bits(17, 21); 2289 context->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
2302 else 2290 else
2303 /* 32 bits PSE 4MB page */ 2291 /* 32 bits PSE 4MB page */
2304 context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21); 2292 context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
2305 context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0];
2306 break; 2293 break;
2307 case PT32E_ROOT_LEVEL: 2294 case PT32E_ROOT_LEVEL:
2308 context->rsvd_bits_mask[0][2] = 2295 context->rsvd_bits_mask[0][2] =
@@ -2315,7 +2302,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
2315 context->rsvd_bits_mask[1][1] = exb_bit_rsvd | 2302 context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
2316 rsvd_bits(maxphyaddr, 62) | 2303 rsvd_bits(maxphyaddr, 62) |
2317 rsvd_bits(13, 20); /* large page */ 2304 rsvd_bits(13, 20); /* large page */
2318 context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0]; 2305 context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
2319 break; 2306 break;
2320 case PT64_ROOT_LEVEL: 2307 case PT64_ROOT_LEVEL:
2321 context->rsvd_bits_mask[0][3] = exb_bit_rsvd | 2308 context->rsvd_bits_mask[0][3] = exb_bit_rsvd |
@@ -2333,7 +2320,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
2333 context->rsvd_bits_mask[1][1] = exb_bit_rsvd | 2320 context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
2334 rsvd_bits(maxphyaddr, 51) | 2321 rsvd_bits(maxphyaddr, 51) |
2335 rsvd_bits(13, 20); /* large page */ 2322 rsvd_bits(13, 20); /* large page */
2336 context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0]; 2323 context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
2337 break; 2324 break;
2338 } 2325 }
2339} 2326}
@@ -2435,7 +2422,7 @@ static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
2435 else 2422 else
2436 r = paging32_init_context(vcpu); 2423 r = paging32_init_context(vcpu);
2437 2424
2438 vcpu->arch.mmu.base_role.glevels = vcpu->arch.mmu.root_level; 2425 vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
2439 2426
2440 return r; 2427 return r;
2441} 2428}
@@ -2524,7 +2511,7 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
2524 } 2511 }
2525 2512
2526 ++vcpu->kvm->stat.mmu_pte_updated; 2513 ++vcpu->kvm->stat.mmu_pte_updated;
2527 if (sp->role.glevels == PT32_ROOT_LEVEL) 2514 if (!sp->role.cr4_pae)
2528 paging32_update_pte(vcpu, sp, spte, new); 2515 paging32_update_pte(vcpu, sp, spte, new);
2529 else 2516 else
2530 paging64_update_pte(vcpu, sp, spte, new); 2517 paging64_update_pte(vcpu, sp, spte, new);
@@ -2559,36 +2546,11 @@ static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
2559} 2546}
2560 2547
2561static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, 2548static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2562 const u8 *new, int bytes) 2549 u64 gpte)
2563{ 2550{
2564 gfn_t gfn; 2551 gfn_t gfn;
2565 int r;
2566 u64 gpte = 0;
2567 pfn_t pfn; 2552 pfn_t pfn;
2568 2553
2569 if (bytes != 4 && bytes != 8)
2570 return;
2571
2572 /*
2573 * Assume that the pte write on a page table of the same type
2574 * as the current vcpu paging mode. This is nearly always true
2575 * (might be false while changing modes). Note it is verified later
2576 * by update_pte().
2577 */
2578 if (is_pae(vcpu)) {
2579 /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
2580 if ((bytes == 4) && (gpa % 4 == 0)) {
2581 r = kvm_read_guest(vcpu->kvm, gpa & ~(u64)7, &gpte, 8);
2582 if (r)
2583 return;
2584 memcpy((void *)&gpte + (gpa % 8), new, 4);
2585 } else if ((bytes == 8) && (gpa % 8 == 0)) {
2586 memcpy((void *)&gpte, new, 8);
2587 }
2588 } else {
2589 if ((bytes == 4) && (gpa % 4 == 0))
2590 memcpy((void *)&gpte, new, 4);
2591 }
2592 if (!is_present_gpte(gpte)) 2554 if (!is_present_gpte(gpte))
2593 return; 2555 return;
2594 gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; 2556 gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
@@ -2637,10 +2599,46 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2637 int flooded = 0; 2599 int flooded = 0;
2638 int npte; 2600 int npte;
2639 int r; 2601 int r;
2602 int invlpg_counter;
2640 2603
2641 pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); 2604 pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
2642 mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes); 2605
2606 invlpg_counter = atomic_read(&vcpu->kvm->arch.invlpg_counter);
2607
2608 /*
2609 * Assume that the pte write on a page table of the same type
2610 * as the current vcpu paging mode. This is nearly always true
2611 * (might be false while changing modes). Note it is verified later
2612 * by update_pte().
2613 */
2614 if ((is_pae(vcpu) && bytes == 4) || !new) {
2615 /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
2616 if (is_pae(vcpu)) {
2617 gpa &= ~(gpa_t)7;
2618 bytes = 8;
2619 }
2620 r = kvm_read_guest(vcpu->kvm, gpa, &gentry, min(bytes, 8));
2621 if (r)
2622 gentry = 0;
2623 new = (const u8 *)&gentry;
2624 }
2625
2626 switch (bytes) {
2627 case 4:
2628 gentry = *(const u32 *)new;
2629 break;
2630 case 8:
2631 gentry = *(const u64 *)new;
2632 break;
2633 default:
2634 gentry = 0;
2635 break;
2636 }
2637
2638 mmu_guess_page_from_pte_write(vcpu, gpa, gentry);
2643 spin_lock(&vcpu->kvm->mmu_lock); 2639 spin_lock(&vcpu->kvm->mmu_lock);
2640 if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter)
2641 gentry = 0;
2644 kvm_mmu_access_page(vcpu, gfn); 2642 kvm_mmu_access_page(vcpu, gfn);
2645 kvm_mmu_free_some_pages(vcpu); 2643 kvm_mmu_free_some_pages(vcpu);
2646 ++vcpu->kvm->stat.mmu_pte_write; 2644 ++vcpu->kvm->stat.mmu_pte_write;
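
The kvm_mmu_pte_write() hunk above now rebuilds the full guest PTE up front: a 4-byte write by a PAE guest (or a NULL new pointer, as passed from the invlpg path) causes the whole naturally aligned 8-byte entry to be re-read from guest memory, and gentry is then taken from the buffer according to the write size. A rough standalone sketch of that alignment/widening step; guest_read() and the little fake guest page table are stand-ins for kvm_read_guest() and real guest memory, and little-endian layout is assumed.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Fake 16-byte guest page table so the sketch is self-contained. */
static uint8_t guest_mem[16] = {
	0x01, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, /* one 64-bit gpte */
};

static int guest_read(uint64_t gpa, void *buf, unsigned len)
{
	if (gpa + len > sizeof(guest_mem))
		return -1;
	memcpy(buf, guest_mem + gpa, len);
	return 0;
}

/* Widen a partial (4-byte) PAE pte write into the full 8-byte entry. */
static uint64_t reconstruct_gpte(uint64_t gpa, const uint8_t *new,
				 unsigned bytes, int guest_is_pae)
{
	uint64_t gentry = 0;

	if ((guest_is_pae && bytes == 4) || !new) {
		if (guest_is_pae) {
			gpa &= ~(uint64_t)7; /* align to the 8-byte entry */
			bytes = 8;
		}
		/* Relies on the write having already landed in guest memory. */
		if (guest_read(gpa, &gentry, bytes < 8 ? bytes : 8))
			gentry = 0;
		return gentry;
	}

	if (bytes == 4)
		memcpy(&gentry, new, 4); /* 32-bit non-PAE entry */
	else if (bytes == 8)
		memcpy(&gentry, new, 8);
	return gentry;
}

int main(void)
{
	uint32_t low = 0x00001001;

	/* The emulator has already committed the 4-byte write at gpa 4. */
	memcpy(guest_mem + 4, &low, 4);
	printf("gentry = %#llx\n",
	       (unsigned long long)reconstruct_gpte(4, (uint8_t *)&low, 4, 1));
	return 0;
}
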
@@ -2659,10 +2657,12 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2659 } 2657 }
2660 index = kvm_page_table_hashfn(gfn); 2658 index = kvm_page_table_hashfn(gfn);
2661 bucket = &vcpu->kvm->arch.mmu_page_hash[index]; 2659 bucket = &vcpu->kvm->arch.mmu_page_hash[index];
2660
2661restart:
2662 hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) { 2662 hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) {
2663 if (sp->gfn != gfn || sp->role.direct || sp->role.invalid) 2663 if (sp->gfn != gfn || sp->role.direct || sp->role.invalid)
2664 continue; 2664 continue;
2665 pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8; 2665 pte_size = sp->role.cr4_pae ? 8 : 4;
2666 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); 2666 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
2667 misaligned |= bytes < 4; 2667 misaligned |= bytes < 4;
2668 if (misaligned || flooded) { 2668 if (misaligned || flooded) {
@@ -2679,14 +2679,14 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2679 pgprintk("misaligned: gpa %llx bytes %d role %x\n", 2679 pgprintk("misaligned: gpa %llx bytes %d role %x\n",
2680 gpa, bytes, sp->role.word); 2680 gpa, bytes, sp->role.word);
2681 if (kvm_mmu_zap_page(vcpu->kvm, sp)) 2681 if (kvm_mmu_zap_page(vcpu->kvm, sp))
2682 n = bucket->first; 2682 goto restart;
2683 ++vcpu->kvm->stat.mmu_flooded; 2683 ++vcpu->kvm->stat.mmu_flooded;
2684 continue; 2684 continue;
2685 } 2685 }
2686 page_offset = offset; 2686 page_offset = offset;
2687 level = sp->role.level; 2687 level = sp->role.level;
2688 npte = 1; 2688 npte = 1;
2689 if (sp->role.glevels == PT32_ROOT_LEVEL) { 2689 if (!sp->role.cr4_pae) {
2690 page_offset <<= 1; /* 32->64 */ 2690 page_offset <<= 1; /* 32->64 */
2691 /* 2691 /*
2692 * A 32-bit pde maps 4MB while the shadow pdes map 2692 * A 32-bit pde maps 4MB while the shadow pdes map
@@ -2704,20 +2704,11 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2704 continue; 2704 continue;
2705 } 2705 }
2706 spte = &sp->spt[page_offset / sizeof(*spte)]; 2706 spte = &sp->spt[page_offset / sizeof(*spte)];
2707 if ((gpa & (pte_size - 1)) || (bytes < pte_size)) {
2708 gentry = 0;
2709 r = kvm_read_guest_atomic(vcpu->kvm,
2710 gpa & ~(u64)(pte_size - 1),
2711 &gentry, pte_size);
2712 new = (const void *)&gentry;
2713 if (r < 0)
2714 new = NULL;
2715 }
2716 while (npte--) { 2707 while (npte--) {
2717 entry = *spte; 2708 entry = *spte;
2718 mmu_pte_write_zap_pte(vcpu, sp, spte); 2709 mmu_pte_write_zap_pte(vcpu, sp, spte);
2719 if (new) 2710 if (gentry)
2720 mmu_pte_write_new_pte(vcpu, sp, spte, new); 2711 mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
2721 mmu_pte_write_flush_tlb(vcpu, entry, *spte); 2712 mmu_pte_write_flush_tlb(vcpu, entry, *spte);
2722 ++spte; 2713 ++spte;
2723 } 2714 }
@@ -2897,10 +2888,11 @@ void kvm_mmu_zap_all(struct kvm *kvm)
2897 struct kvm_mmu_page *sp, *node; 2888 struct kvm_mmu_page *sp, *node;
2898 2889
2899 spin_lock(&kvm->mmu_lock); 2890 spin_lock(&kvm->mmu_lock);
2891restart:
2900 list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) 2892 list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
2901 if (kvm_mmu_zap_page(kvm, sp)) 2893 if (kvm_mmu_zap_page(kvm, sp))
2902 node = container_of(kvm->arch.active_mmu_pages.next, 2894 goto restart;
2903 struct kvm_mmu_page, link); 2895
2904 spin_unlock(&kvm->mmu_lock); 2896 spin_unlock(&kvm->mmu_lock);
2905 2897
2906 kvm_flush_remote_tlbs(kvm); 2898 kvm_flush_remote_tlbs(kvm);
@@ -3008,7 +3000,8 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
3008 unsigned int nr_pages = 0; 3000 unsigned int nr_pages = 0;
3009 struct kvm_memslots *slots; 3001 struct kvm_memslots *slots;
3010 3002
3011 slots = rcu_dereference(kvm->memslots); 3003 slots = kvm_memslots(kvm);
3004
3012 for (i = 0; i < slots->nmemslots; i++) 3005 for (i = 0; i < slots->nmemslots; i++)
3013 nr_pages += slots->memslots[i].npages; 3006 nr_pages += slots->memslots[i].npages;
3014 3007
@@ -3171,8 +3164,7 @@ static gva_t canonicalize(gva_t gva)
3171} 3164}
3172 3165
3173 3166
3174typedef void (*inspect_spte_fn) (struct kvm *kvm, struct kvm_mmu_page *sp, 3167typedef void (*inspect_spte_fn) (struct kvm *kvm, u64 *sptep);
3175 u64 *sptep);
3176 3168
3177static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp, 3169static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp,
3178 inspect_spte_fn fn) 3170 inspect_spte_fn fn)
@@ -3188,7 +3180,7 @@ static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp,
3188 child = page_header(ent & PT64_BASE_ADDR_MASK); 3180 child = page_header(ent & PT64_BASE_ADDR_MASK);
3189 __mmu_spte_walk(kvm, child, fn); 3181 __mmu_spte_walk(kvm, child, fn);
3190 } else 3182 } else
3191 fn(kvm, sp, &sp->spt[i]); 3183 fn(kvm, &sp->spt[i]);
3192 } 3184 }
3193 } 3185 }
3194} 3186}
@@ -3279,11 +3271,13 @@ static void audit_mappings(struct kvm_vcpu *vcpu)
3279 3271
3280static int count_rmaps(struct kvm_vcpu *vcpu) 3272static int count_rmaps(struct kvm_vcpu *vcpu)
3281{ 3273{
3274 struct kvm *kvm = vcpu->kvm;
3275 struct kvm_memslots *slots;
3282 int nmaps = 0; 3276 int nmaps = 0;
3283 int i, j, k, idx; 3277 int i, j, k, idx;
3284 3278
3285 idx = srcu_read_lock(&kvm->srcu); 3279 idx = srcu_read_lock(&kvm->srcu);
3286 slots = rcu_dereference(kvm->memslots); 3280 slots = kvm_memslots(kvm);
3287 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { 3281 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
3288 struct kvm_memory_slot *m = &slots->memslots[i]; 3282 struct kvm_memory_slot *m = &slots->memslots[i];
3289 struct kvm_rmap_desc *d; 3283 struct kvm_rmap_desc *d;
@@ -3312,7 +3306,7 @@ static int count_rmaps(struct kvm_vcpu *vcpu)
3312 return nmaps; 3306 return nmaps;
3313} 3307}
3314 3308
3315void inspect_spte_has_rmap(struct kvm *kvm, struct kvm_mmu_page *sp, u64 *sptep) 3309void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
3316{ 3310{
3317 unsigned long *rmapp; 3311 unsigned long *rmapp;
3318 struct kvm_mmu_page *rev_sp; 3312 struct kvm_mmu_page *rev_sp;
@@ -3328,14 +3322,14 @@ void inspect_spte_has_rmap(struct kvm *kvm, struct kvm_mmu_page *sp, u64 *sptep)
3328 printk(KERN_ERR "%s: no memslot for gfn %ld\n", 3322 printk(KERN_ERR "%s: no memslot for gfn %ld\n",
3329 audit_msg, gfn); 3323 audit_msg, gfn);
3330 printk(KERN_ERR "%s: index %ld of sp (gfn=%lx)\n", 3324 printk(KERN_ERR "%s: index %ld of sp (gfn=%lx)\n",
3331 audit_msg, sptep - rev_sp->spt, 3325 audit_msg, (long int)(sptep - rev_sp->spt),
3332 rev_sp->gfn); 3326 rev_sp->gfn);
3333 dump_stack(); 3327 dump_stack();
3334 return; 3328 return;
3335 } 3329 }
3336 3330
3337 rmapp = gfn_to_rmap(kvm, rev_sp->gfns[sptep - rev_sp->spt], 3331 rmapp = gfn_to_rmap(kvm, rev_sp->gfns[sptep - rev_sp->spt],
3338 is_large_pte(*sptep)); 3332 rev_sp->role.level);
3339 if (!*rmapp) { 3333 if (!*rmapp) {
3340 if (!printk_ratelimit()) 3334 if (!printk_ratelimit())
3341 return; 3335 return;
@@ -3370,7 +3364,7 @@ static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu)
3370 continue; 3364 continue;
3371 if (!(ent & PT_WRITABLE_MASK)) 3365 if (!(ent & PT_WRITABLE_MASK))
3372 continue; 3366 continue;
3373 inspect_spte_has_rmap(vcpu->kvm, sp, &pt[i]); 3367 inspect_spte_has_rmap(vcpu->kvm, &pt[i]);
3374 } 3368 }
3375 } 3369 }
3376 return; 3370 return;
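
Several mmu.c hunks above replace ad-hoc attempts to repair the iterator (resetting n to bucket->first, container_of() on the list head's next) with a restart: label: kvm_mmu_zap_page() can remove additional pages from the same list, so the only safe reaction to a non-zero return is to start the walk over. A generic standalone illustration of that restart-on-mutation idiom over a plain singly linked list (not the kernel's hlist API):

#include <stdio.h>
#include <stdlib.h>

struct node {
	int val;
	int doomed_with; /* value of another node removed alongside this one */
	struct node *next;
};

static struct node *head;

static void push(int val, int doomed_with)
{
	struct node *n = malloc(sizeof(*n));

	n->val = val;
	n->doomed_with = doomed_with;
	n->next = head;
	head = n;
}

static void erase(int val)
{
	struct node **p = &head;

	while (*p) {
		if ((*p)->val == val) {
			struct node *dead = *p;

			*p = dead->next;
			free(dead);
			return;
		}
		p = &(*p)->next;
	}
}

/* "Zapping" one node may take an arbitrary other node with it. */
static int zap(struct node *n)
{
	int extra = n->doomed_with;

	erase(n->val);
	if (extra) {
		erase(extra);
		return 1; /* the list changed beyond the cursor */
	}
	return 0;
}

static void zap_matching(int threshold)
{
	struct node *n, *next;

restart:
	for (n = head; n; n = next) {
		next = n->next;
		/* If zap() removed more than n, the saved cursor may be stale. */
		if (n->val >= threshold && zap(n))
			goto restart;
	}
}

int main(void)
{
	push(1, 0); push(2, 5); push(3, 0); push(4, 0); push(5, 0);
	zap_matching(2);
	for (struct node *n = head; n; n = n->next)
		printf("%d ", n->val);
	putchar('\n');
	return 0;
}
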
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 3e4a5c6ca2a9..bc4f7f0be2b1 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -6,8 +6,6 @@
6 6
7#undef TRACE_SYSTEM 7#undef TRACE_SYSTEM
8#define TRACE_SYSTEM kvmmmu 8#define TRACE_SYSTEM kvmmmu
9#define TRACE_INCLUDE_PATH .
10#define TRACE_INCLUDE_FILE mmutrace
11 9
12#define KVM_MMU_PAGE_FIELDS \ 10#define KVM_MMU_PAGE_FIELDS \
13 __field(__u64, gfn) \ 11 __field(__u64, gfn) \
@@ -30,14 +28,14 @@
30 \ 28 \
31 role.word = __entry->role; \ 29 role.word = __entry->role; \
32 \ 30 \
33 trace_seq_printf(p, "sp gfn %llx %u/%u q%u%s %s%s %spge" \ 31 trace_seq_printf(p, "sp gfn %llx %u%s q%u%s %s%s" \
34 " %snxe root %u %s%c", \ 32 " %snxe root %u %s%c", \
35 __entry->gfn, role.level, role.glevels, \ 33 __entry->gfn, role.level, \
34 role.cr4_pae ? " pae" : "", \
36 role.quadrant, \ 35 role.quadrant, \
37 role.direct ? " direct" : "", \ 36 role.direct ? " direct" : "", \
38 access_str[role.access], \ 37 access_str[role.access], \
39 role.invalid ? " invalid" : "", \ 38 role.invalid ? " invalid" : "", \
40 role.cr4_pge ? "" : "!", \
41 role.nxe ? "" : "!", \ 39 role.nxe ? "" : "!", \
42 __entry->root_count, \ 40 __entry->root_count, \
43 __entry->unsync ? "unsync" : "sync", 0); \ 41 __entry->unsync ? "unsync" : "sync", 0); \
@@ -216,5 +214,10 @@ TRACE_EVENT(
216 214
217#endif /* _TRACE_KVMMMU_H */ 215#endif /* _TRACE_KVMMMU_H */
218 216
217#undef TRACE_INCLUDE_PATH
218#define TRACE_INCLUDE_PATH .
219#undef TRACE_INCLUDE_FILE
220#define TRACE_INCLUDE_FILE mmutrace
221
219/* This part must be outside protection */ 222/* This part must be outside protection */
220#include <trace/define_trace.h> 223#include <trace/define_trace.h>
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 81eab9a50e6a..d0cc07eb6eda 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -170,7 +170,7 @@ walk:
170 goto access_error; 170 goto access_error;
171 171
172#if PTTYPE == 64 172#if PTTYPE == 64
173 if (fetch_fault && is_nx(vcpu) && (pte & PT64_NX_MASK)) 173 if (fetch_fault && (pte & PT64_NX_MASK))
174 goto access_error; 174 goto access_error;
175#endif 175#endif
176 176
@@ -190,10 +190,10 @@ walk:
190 190
191 if ((walker->level == PT_PAGE_TABLE_LEVEL) || 191 if ((walker->level == PT_PAGE_TABLE_LEVEL) ||
192 ((walker->level == PT_DIRECTORY_LEVEL) && 192 ((walker->level == PT_DIRECTORY_LEVEL) &&
193 (pte & PT_PAGE_SIZE_MASK) && 193 is_large_pte(pte) &&
194 (PTTYPE == 64 || is_pse(vcpu))) || 194 (PTTYPE == 64 || is_pse(vcpu))) ||
195 ((walker->level == PT_PDPE_LEVEL) && 195 ((walker->level == PT_PDPE_LEVEL) &&
196 (pte & PT_PAGE_SIZE_MASK) && 196 is_large_pte(pte) &&
197 is_long_mode(vcpu))) { 197 is_long_mode(vcpu))) {
198 int lvl = walker->level; 198 int lvl = walker->level;
199 199
@@ -258,11 +258,17 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
258 pt_element_t gpte; 258 pt_element_t gpte;
259 unsigned pte_access; 259 unsigned pte_access;
260 pfn_t pfn; 260 pfn_t pfn;
261 u64 new_spte;
261 262
262 gpte = *(const pt_element_t *)pte; 263 gpte = *(const pt_element_t *)pte;
263 if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) { 264 if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
264 if (!is_present_gpte(gpte)) 265 if (!is_present_gpte(gpte)) {
265 __set_spte(spte, shadow_notrap_nonpresent_pte); 266 if (page->unsync)
267 new_spte = shadow_trap_nonpresent_pte;
268 else
269 new_spte = shadow_notrap_nonpresent_pte;
270 __set_spte(spte, new_spte);
271 }
266 return; 272 return;
267 } 273 }
268 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); 274 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
@@ -457,6 +463,7 @@ out_unlock:
457static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) 463static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
458{ 464{
459 struct kvm_shadow_walk_iterator iterator; 465 struct kvm_shadow_walk_iterator iterator;
466 gpa_t pte_gpa = -1;
460 int level; 467 int level;
461 u64 *sptep; 468 u64 *sptep;
462 int need_flush = 0; 469 int need_flush = 0;
@@ -470,6 +477,10 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
470 if (level == PT_PAGE_TABLE_LEVEL || 477 if (level == PT_PAGE_TABLE_LEVEL ||
471 ((level == PT_DIRECTORY_LEVEL && is_large_pte(*sptep))) || 478 ((level == PT_DIRECTORY_LEVEL && is_large_pte(*sptep))) ||
472 ((level == PT_PDPE_LEVEL && is_large_pte(*sptep)))) { 479 ((level == PT_PDPE_LEVEL && is_large_pte(*sptep)))) {
480 struct kvm_mmu_page *sp = page_header(__pa(sptep));
481
482 pte_gpa = (sp->gfn << PAGE_SHIFT);
483 pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
473 484
474 if (is_shadow_present_pte(*sptep)) { 485 if (is_shadow_present_pte(*sptep)) {
475 rmap_remove(vcpu->kvm, sptep); 486 rmap_remove(vcpu->kvm, sptep);
@@ -487,7 +498,17 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
487 498
488 if (need_flush) 499 if (need_flush)
489 kvm_flush_remote_tlbs(vcpu->kvm); 500 kvm_flush_remote_tlbs(vcpu->kvm);
501
502 atomic_inc(&vcpu->kvm->arch.invlpg_counter);
503
490 spin_unlock(&vcpu->kvm->mmu_lock); 504 spin_unlock(&vcpu->kvm->mmu_lock);
505
506 if (pte_gpa == -1)
507 return;
508
509 if (mmu_topup_memory_caches(vcpu))
510 return;
511 kvm_mmu_pte_write(vcpu, pte_gpa, NULL, sizeof(pt_element_t), 0);
491} 512}
492 513
493static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, 514static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
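
[Editor's sketch, not part of the patch] The invlpg change above computes the guest-physical address of the guest PTE that backs the shadow entry: the shadow page's gfn shifted by PAGE_SHIFT plus the entry index times the guest PTE size. A standalone illustration of that arithmetic, assuming the 64-bit case (8-byte pt_element_t, PAGE_SHIFT of 12):

	#include <stdint.h>
	#include <stdio.h>

	#define PAGE_SHIFT 12

	typedef uint64_t pt_element_t;

	static uint64_t pte_gpa_of(uint64_t gfn, const pt_element_t *spt,
				   const pt_element_t *sptep)
	{
		uint64_t index = (uint64_t)(sptep - spt);   /* entry index within the page */
		return (gfn << PAGE_SHIFT) + index * sizeof(pt_element_t);
	}

	int main(void)
	{
		pt_element_t spt[512] = { 0 };
		/* entry 5 of the shadow page for guest frame 0x1234 */
		printf("0x%llx\n",
		       (unsigned long long)pte_gpa_of(0x1234, spt, &spt[5]));
		return 0;   /* prints 0x1234028 */
	}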
@@ -551,12 +572,15 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
551{ 572{
552 int i, offset, nr_present; 573 int i, offset, nr_present;
553 bool reset_host_protection; 574 bool reset_host_protection;
575 gpa_t first_pte_gpa;
554 576
555 offset = nr_present = 0; 577 offset = nr_present = 0;
556 578
557 if (PTTYPE == 32) 579 if (PTTYPE == 32)
558 offset = sp->role.quadrant << PT64_LEVEL_BITS; 580 offset = sp->role.quadrant << PT64_LEVEL_BITS;
559 581
582 first_pte_gpa = gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t);
583
560 for (i = 0; i < PT64_ENT_PER_PAGE; i++) { 584 for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
561 unsigned pte_access; 585 unsigned pte_access;
562 pt_element_t gpte; 586 pt_element_t gpte;
@@ -566,8 +590,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
566 if (!is_shadow_present_pte(sp->spt[i])) 590 if (!is_shadow_present_pte(sp->spt[i]))
567 continue; 591 continue;
568 592
569 pte_gpa = gfn_to_gpa(sp->gfn); 593 pte_gpa = first_pte_gpa + i * sizeof(pt_element_t);
570 pte_gpa += (i+offset) * sizeof(pt_element_t);
571 594
572 if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte, 595 if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
573 sizeof(pt_element_t))) 596 sizeof(pt_element_t)))
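
[Editor's sketch, not part of the patch] The sync_page hunk hoists the quadrant-derived offset out of the loop: gfn_to_gpa(gfn) + (i + offset) * sizeof(pte) is precomputed once as first_pte_gpa and then advanced per entry. A quick check of that equivalence, with gfn_to_gpa assumed to be a shift by 12 and the concrete values chosen only for illustration:

	#include <assert.h>
	#include <stdint.h>

	#define PAGE_SHIFT 12
	static uint64_t gfn_to_gpa(uint64_t gfn) { return gfn << PAGE_SHIFT; }

	int main(void)
	{
		const uint64_t gfn = 0xabcd, pte_size = 8;
		const unsigned offset = 512;    /* stands in for quadrant << PT64_LEVEL_BITS */
		const uint64_t first_pte_gpa = gfn_to_gpa(gfn) + offset * pte_size;

		for (unsigned i = 0; i < 512; i++)
			assert(first_pte_gpa + i * pte_size ==
			       gfn_to_gpa(gfn) + (i + offset) * pte_size);
		return 0;
	}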
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 445c59411ed0..ab78eb8ba899 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -44,10 +44,11 @@ MODULE_LICENSE("GPL");
44#define SEG_TYPE_LDT 2 44#define SEG_TYPE_LDT 2
45#define SEG_TYPE_BUSY_TSS16 3 45#define SEG_TYPE_BUSY_TSS16 3
46 46
47#define SVM_FEATURE_NPT (1 << 0) 47#define SVM_FEATURE_NPT (1 << 0)
48#define SVM_FEATURE_LBRV (1 << 1) 48#define SVM_FEATURE_LBRV (1 << 1)
49#define SVM_FEATURE_SVML (1 << 2) 49#define SVM_FEATURE_SVML (1 << 2)
50#define SVM_FEATURE_PAUSE_FILTER (1 << 10) 50#define SVM_FEATURE_NRIP (1 << 3)
51#define SVM_FEATURE_PAUSE_FILTER (1 << 10)
51 52
52#define NESTED_EXIT_HOST 0 /* Exit handled on host level */ 53#define NESTED_EXIT_HOST 0 /* Exit handled on host level */
53#define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */ 54#define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */
@@ -70,6 +71,7 @@ struct kvm_vcpu;
70struct nested_state { 71struct nested_state {
71 struct vmcb *hsave; 72 struct vmcb *hsave;
72 u64 hsave_msr; 73 u64 hsave_msr;
74 u64 vm_cr_msr;
73 u64 vmcb; 75 u64 vmcb;
74 76
75 /* These are the merged vectors */ 77 /* These are the merged vectors */
@@ -77,6 +79,7 @@ struct nested_state {
77 79
78 /* gpa pointers to the real vectors */ 80 /* gpa pointers to the real vectors */
79 u64 vmcb_msrpm; 81 u64 vmcb_msrpm;
82 u64 vmcb_iopm;
80 83
81 /* A VMEXIT is required but not yet emulated */ 84 /* A VMEXIT is required but not yet emulated */
82 bool exit_required; 85 bool exit_required;
@@ -91,6 +94,9 @@ struct nested_state {
91 94
92}; 95};
93 96
97#define MSRPM_OFFSETS 16
98static u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
99
94struct vcpu_svm { 100struct vcpu_svm {
95 struct kvm_vcpu vcpu; 101 struct kvm_vcpu vcpu;
96 struct vmcb *vmcb; 102 struct vmcb *vmcb;
@@ -110,13 +116,39 @@ struct vcpu_svm {
110 struct nested_state nested; 116 struct nested_state nested;
111 117
112 bool nmi_singlestep; 118 bool nmi_singlestep;
119
120 unsigned int3_injected;
121 unsigned long int3_rip;
122};
123
124#define MSR_INVALID 0xffffffffU
125
126static struct svm_direct_access_msrs {
127 u32 index; /* Index of the MSR */
128 bool always; /* True if intercept is always on */
129} direct_access_msrs[] = {
130 { .index = MSR_K6_STAR, .always = true },
131 { .index = MSR_IA32_SYSENTER_CS, .always = true },
132#ifdef CONFIG_X86_64
133 { .index = MSR_GS_BASE, .always = true },
134 { .index = MSR_FS_BASE, .always = true },
135 { .index = MSR_KERNEL_GS_BASE, .always = true },
136 { .index = MSR_LSTAR, .always = true },
137 { .index = MSR_CSTAR, .always = true },
138 { .index = MSR_SYSCALL_MASK, .always = true },
139#endif
140 { .index = MSR_IA32_LASTBRANCHFROMIP, .always = false },
141 { .index = MSR_IA32_LASTBRANCHTOIP, .always = false },
142 { .index = MSR_IA32_LASTINTFROMIP, .always = false },
143 { .index = MSR_IA32_LASTINTTOIP, .always = false },
144 { .index = MSR_INVALID, .always = false },
113}; 145};
114 146
115/* enable NPT for AMD64 and X86 with PAE */ 147/* enable NPT for AMD64 and X86 with PAE */
116#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) 148#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
117static bool npt_enabled = true; 149static bool npt_enabled = true;
118#else 150#else
119static bool npt_enabled = false; 151static bool npt_enabled;
120#endif 152#endif
121static int npt = 1; 153static int npt = 1;
122 154
@@ -129,6 +161,7 @@ static void svm_flush_tlb(struct kvm_vcpu *vcpu);
129static void svm_complete_interrupts(struct vcpu_svm *svm); 161static void svm_complete_interrupts(struct vcpu_svm *svm);
130 162
131static int nested_svm_exit_handled(struct vcpu_svm *svm); 163static int nested_svm_exit_handled(struct vcpu_svm *svm);
164static int nested_svm_intercept(struct vcpu_svm *svm);
132static int nested_svm_vmexit(struct vcpu_svm *svm); 165static int nested_svm_vmexit(struct vcpu_svm *svm);
133static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, 166static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
134 bool has_error_code, u32 error_code); 167 bool has_error_code, u32 error_code);
@@ -163,8 +196,8 @@ static unsigned long iopm_base;
163struct kvm_ldttss_desc { 196struct kvm_ldttss_desc {
164 u16 limit0; 197 u16 limit0;
165 u16 base0; 198 u16 base0;
166 unsigned base1 : 8, type : 5, dpl : 2, p : 1; 199 unsigned base1:8, type:5, dpl:2, p:1;
167 unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8; 200 unsigned limit1:4, zero0:3, g:1, base2:8;
168 u32 base3; 201 u32 base3;
169 u32 zero1; 202 u32 zero1;
170} __attribute__((packed)); 203} __attribute__((packed));
@@ -194,6 +227,27 @@ static u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
194#define MSRS_RANGE_SIZE 2048 227#define MSRS_RANGE_SIZE 2048
195#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2) 228#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)
196 229
230static u32 svm_msrpm_offset(u32 msr)
231{
232 u32 offset;
233 int i;
234
235 for (i = 0; i < NUM_MSR_MAPS; i++) {
236 if (msr < msrpm_ranges[i] ||
237 msr >= msrpm_ranges[i] + MSRS_IN_RANGE)
238 continue;
239
240 offset = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8 */
241 offset += (i * MSRS_RANGE_SIZE); /* add range offset */
242
243 /* Now we have the u8 offset - but need the u32 offset */
244 return offset / 4;
245 }
246
247 /* MSR not in any range */
248 return MSR_INVALID;
249}
250
197#define MAX_INST_SIZE 15 251#define MAX_INST_SIZE 15
198 252
199static inline u32 svm_has(u32 feat) 253static inline u32 svm_has(u32 feat)
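
[Editor's sketch, not part of the patch] svm_msrpm_offset() above maps an MSR number to a u32 offset inside the MSR permission map: two intercept bits per MSR, three 2K ranges. A standalone restatement with the constants copied from the hunk; MSR_INVALID stands in for "not covered by any range":

	#include <stdint.h>
	#include <stdio.h>

	#define MSR_INVALID      0xffffffffU
	#define NUM_MSR_MAPS     3
	#define MSRS_RANGE_SIZE  2048                      /* bytes per range */
	#define MSRS_IN_RANGE    (MSRS_RANGE_SIZE * 8 / 2) /* 2 bits per MSR */

	static const uint32_t msrpm_ranges[NUM_MSR_MAPS] = { 0, 0xc0000000, 0xc0010000 };

	static uint32_t msrpm_offset(uint32_t msr)
	{
		for (int i = 0; i < NUM_MSR_MAPS; i++) {
			if (msr < msrpm_ranges[i] ||
			    msr >= msrpm_ranges[i] + MSRS_IN_RANGE)
				continue;

			uint32_t offset = (msr - msrpm_ranges[i]) / 4; /* 4 MSRs per byte */
			offset += i * MSRS_RANGE_SIZE;                 /* byte offset of the range */
			return offset / 4;                             /* byte -> u32 offset */
		}
		return MSR_INVALID;
	}

	int main(void)
	{
		printf("%u %u %u\n",
		       msrpm_offset(0x174),        /* MSR_IA32_SYSENTER_CS */
		       msrpm_offset(0xc0000081),   /* MSR_K6_STAR */
		       msrpm_offset(0x4711));      /* outside all ranges -> MSR_INVALID */
		return 0;
	}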
@@ -213,7 +267,7 @@ static inline void stgi(void)
213 267
214static inline void invlpga(unsigned long addr, u32 asid) 268static inline void invlpga(unsigned long addr, u32 asid)
215{ 269{
216 asm volatile (__ex(SVM_INVLPGA) :: "a"(addr), "c"(asid)); 270 asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid));
217} 271}
218 272
219static inline void force_new_asid(struct kvm_vcpu *vcpu) 273static inline void force_new_asid(struct kvm_vcpu *vcpu)
@@ -235,23 +289,6 @@ static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
235 vcpu->arch.efer = efer; 289 vcpu->arch.efer = efer;
236} 290}
237 291
238static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
239 bool has_error_code, u32 error_code)
240{
241 struct vcpu_svm *svm = to_svm(vcpu);
242
243 /* If we are within a nested VM we'd better #VMEXIT and let the
244 guest handle the exception */
245 if (nested_svm_check_exception(svm, nr, has_error_code, error_code))
246 return;
247
248 svm->vmcb->control.event_inj = nr
249 | SVM_EVTINJ_VALID
250 | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
251 | SVM_EVTINJ_TYPE_EXEPT;
252 svm->vmcb->control.event_inj_err = error_code;
253}
254
255static int is_external_interrupt(u32 info) 292static int is_external_interrupt(u32 info)
256{ 293{
257 info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID; 294 info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
@@ -264,7 +301,7 @@ static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
264 u32 ret = 0; 301 u32 ret = 0;
265 302
266 if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) 303 if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
267 ret |= X86_SHADOW_INT_STI | X86_SHADOW_INT_MOV_SS; 304 ret |= KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS;
268 return ret & mask; 305 return ret & mask;
269} 306}
270 307
@@ -283,6 +320,9 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
283{ 320{
284 struct vcpu_svm *svm = to_svm(vcpu); 321 struct vcpu_svm *svm = to_svm(vcpu);
285 322
323 if (svm->vmcb->control.next_rip != 0)
324 svm->next_rip = svm->vmcb->control.next_rip;
325
286 if (!svm->next_rip) { 326 if (!svm->next_rip) {
287 if (emulate_instruction(vcpu, 0, 0, EMULTYPE_SKIP) != 327 if (emulate_instruction(vcpu, 0, 0, EMULTYPE_SKIP) !=
288 EMULATE_DONE) 328 EMULATE_DONE)
@@ -297,6 +337,41 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
297 svm_set_interrupt_shadow(vcpu, 0); 337 svm_set_interrupt_shadow(vcpu, 0);
298} 338}
299 339
340static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
341 bool has_error_code, u32 error_code)
342{
343 struct vcpu_svm *svm = to_svm(vcpu);
344
345 /*
346 * If we are within a nested VM we'd better #VMEXIT and let the guest
347 * handle the exception
348 */
349 if (nested_svm_check_exception(svm, nr, has_error_code, error_code))
350 return;
351
352 if (nr == BP_VECTOR && !svm_has(SVM_FEATURE_NRIP)) {
353 unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu);
354
355 /*
356 * For guest debugging where we have to reinject #BP if some
357 * INT3 is guest-owned:
358 * Emulate nRIP by moving RIP forward. Will fail if injection
359 * raises a fault that is not intercepted. Still better than
360 * failing in all cases.
361 */
362 skip_emulated_instruction(&svm->vcpu);
363 rip = kvm_rip_read(&svm->vcpu);
364 svm->int3_rip = rip + svm->vmcb->save.cs.base;
365 svm->int3_injected = rip - old_rip;
366 }
367
368 svm->vmcb->control.event_inj = nr
369 | SVM_EVTINJ_VALID
370 | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
371 | SVM_EVTINJ_TYPE_EXEPT;
372 svm->vmcb->control.event_inj_err = error_code;
373}
374
300static int has_svm(void) 375static int has_svm(void)
301{ 376{
302 const char *msg; 377 const char *msg;
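
[Editor's sketch, not part of the patch] svm_queue_exception() above composes the VMCB EVENTINJ field: vector in bits 0-7, type in bits 8-10, "deliver error code" in bit 11, valid in bit 31. The #defines below follow the SVM spec and the kernel's svm.h but are reproduced here only for illustration:

	#include <stdint.h>
	#include <stdio.h>

	#define SVM_EVTINJ_VEC_MASK    0xffU
	#define SVM_EVTINJ_TYPE_EXEPT  (3U << 8)
	#define SVM_EVTINJ_VALID_ERR   (1U << 11)
	#define SVM_EVTINJ_VALID       (1U << 31)

	static uint32_t encode_exception(unsigned nr, int has_error_code)
	{
		return (nr & SVM_EVTINJ_VEC_MASK)
		     | SVM_EVTINJ_VALID
		     | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
		     | SVM_EVTINJ_TYPE_EXEPT;
	}

	int main(void)
	{
		/* #PF (vector 14) with an error code */
		printf("0x%08x\n", encode_exception(14, 1));
		return 0;
	}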
@@ -319,7 +394,7 @@ static int svm_hardware_enable(void *garbage)
319 394
320 struct svm_cpu_data *sd; 395 struct svm_cpu_data *sd;
321 uint64_t efer; 396 uint64_t efer;
322 struct descriptor_table gdt_descr; 397 struct desc_ptr gdt_descr;
323 struct desc_struct *gdt; 398 struct desc_struct *gdt;
324 int me = raw_smp_processor_id(); 399 int me = raw_smp_processor_id();
325 400
@@ -344,8 +419,8 @@ static int svm_hardware_enable(void *garbage)
344 sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1; 419 sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
345 sd->next_asid = sd->max_asid + 1; 420 sd->next_asid = sd->max_asid + 1;
346 421
347 kvm_get_gdt(&gdt_descr); 422 native_store_gdt(&gdt_descr);
348 gdt = (struct desc_struct *)gdt_descr.base; 423 gdt = (struct desc_struct *)gdt_descr.address;
349 sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS); 424 sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
350 425
351 wrmsrl(MSR_EFER, efer | EFER_SVME); 426 wrmsrl(MSR_EFER, efer | EFER_SVME);
@@ -391,42 +466,98 @@ err_1:
391 466
392} 467}
393 468
469static bool valid_msr_intercept(u32 index)
470{
471 int i;
472
473 for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++)
474 if (direct_access_msrs[i].index == index)
475 return true;
476
477 return false;
478}
479
394static void set_msr_interception(u32 *msrpm, unsigned msr, 480static void set_msr_interception(u32 *msrpm, unsigned msr,
395 int read, int write) 481 int read, int write)
396{ 482{
483 u8 bit_read, bit_write;
484 unsigned long tmp;
485 u32 offset;
486
487 /*
 488 * If this warning triggers, extend the direct_access_msrs list at the
489 * beginning of the file
490 */
491 WARN_ON(!valid_msr_intercept(msr));
492
493 offset = svm_msrpm_offset(msr);
494 bit_read = 2 * (msr & 0x0f);
495 bit_write = 2 * (msr & 0x0f) + 1;
496 tmp = msrpm[offset];
497
498 BUG_ON(offset == MSR_INVALID);
499
500 read ? clear_bit(bit_read, &tmp) : set_bit(bit_read, &tmp);
501 write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp);
502
503 msrpm[offset] = tmp;
504}
505
506static void svm_vcpu_init_msrpm(u32 *msrpm)
507{
397 int i; 508 int i;
398 509
399 for (i = 0; i < NUM_MSR_MAPS; i++) { 510 memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));
400 if (msr >= msrpm_ranges[i] && 511
401 msr < msrpm_ranges[i] + MSRS_IN_RANGE) { 512 for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
402 u32 msr_offset = (i * MSRS_IN_RANGE + msr - 513 if (!direct_access_msrs[i].always)
403 msrpm_ranges[i]) * 2; 514 continue;
404 515
405 u32 *base = msrpm + (msr_offset / 32); 516 set_msr_interception(msrpm, direct_access_msrs[i].index, 1, 1);
406 u32 msr_shift = msr_offset % 32; 517 }
407 u32 mask = ((write) ? 0 : 2) | ((read) ? 0 : 1); 518}
408 *base = (*base & ~(0x3 << msr_shift)) | 519
409 (mask << msr_shift); 520static void add_msr_offset(u32 offset)
521{
522 int i;
523
524 for (i = 0; i < MSRPM_OFFSETS; ++i) {
525
526 /* Offset already in list? */
527 if (msrpm_offsets[i] == offset)
410 return; 528 return;
411 } 529
530 /* Slot used by another offset? */
531 if (msrpm_offsets[i] != MSR_INVALID)
532 continue;
533
534 /* Add offset to list */
535 msrpm_offsets[i] = offset;
536
537 return;
412 } 538 }
539
540 /*
 541 * If this BUG triggers, the msrpm_offsets table has overflowed. Just
542 * increase MSRPM_OFFSETS in this case.
543 */
413 BUG(); 544 BUG();
414} 545}
415 546
416static void svm_vcpu_init_msrpm(u32 *msrpm) 547static void init_msrpm_offsets(void)
417{ 548{
418 memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER)); 549 int i;
419 550
420#ifdef CONFIG_X86_64 551 memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets));
421 set_msr_interception(msrpm, MSR_GS_BASE, 1, 1); 552
422 set_msr_interception(msrpm, MSR_FS_BASE, 1, 1); 553 for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
423 set_msr_interception(msrpm, MSR_KERNEL_GS_BASE, 1, 1); 554 u32 offset;
424 set_msr_interception(msrpm, MSR_LSTAR, 1, 1); 555
425 set_msr_interception(msrpm, MSR_CSTAR, 1, 1); 556 offset = svm_msrpm_offset(direct_access_msrs[i].index);
426 set_msr_interception(msrpm, MSR_SYSCALL_MASK, 1, 1); 557 BUG_ON(offset == MSR_INVALID);
427#endif 558
428 set_msr_interception(msrpm, MSR_K6_STAR, 1, 1); 559 add_msr_offset(offset);
429 set_msr_interception(msrpm, MSR_IA32_SYSENTER_CS, 1, 1); 560 }
430} 561}
431 562
432static void svm_enable_lbrv(struct vcpu_svm *svm) 563static void svm_enable_lbrv(struct vcpu_svm *svm)
@@ -467,6 +598,8 @@ static __init int svm_hardware_setup(void)
467 memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER)); 598 memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER));
468 iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT; 599 iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
469 600
601 init_msrpm_offsets();
602
470 if (boot_cpu_has(X86_FEATURE_NX)) 603 if (boot_cpu_has(X86_FEATURE_NX))
471 kvm_enable_efer_bits(EFER_NX); 604 kvm_enable_efer_bits(EFER_NX);
472 605
@@ -523,7 +656,7 @@ static void init_seg(struct vmcb_seg *seg)
523{ 656{
524 seg->selector = 0; 657 seg->selector = 0;
525 seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK | 658 seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
526 SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */ 659 SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
527 seg->limit = 0xffff; 660 seg->limit = 0xffff;
528 seg->base = 0; 661 seg->base = 0;
529} 662}
@@ -543,16 +676,16 @@ static void init_vmcb(struct vcpu_svm *svm)
543 676
544 svm->vcpu.fpu_active = 1; 677 svm->vcpu.fpu_active = 1;
545 678
546 control->intercept_cr_read = INTERCEPT_CR0_MASK | 679 control->intercept_cr_read = INTERCEPT_CR0_MASK |
547 INTERCEPT_CR3_MASK | 680 INTERCEPT_CR3_MASK |
548 INTERCEPT_CR4_MASK; 681 INTERCEPT_CR4_MASK;
549 682
550 control->intercept_cr_write = INTERCEPT_CR0_MASK | 683 control->intercept_cr_write = INTERCEPT_CR0_MASK |
551 INTERCEPT_CR3_MASK | 684 INTERCEPT_CR3_MASK |
552 INTERCEPT_CR4_MASK | 685 INTERCEPT_CR4_MASK |
553 INTERCEPT_CR8_MASK; 686 INTERCEPT_CR8_MASK;
554 687
555 control->intercept_dr_read = INTERCEPT_DR0_MASK | 688 control->intercept_dr_read = INTERCEPT_DR0_MASK |
556 INTERCEPT_DR1_MASK | 689 INTERCEPT_DR1_MASK |
557 INTERCEPT_DR2_MASK | 690 INTERCEPT_DR2_MASK |
558 INTERCEPT_DR3_MASK | 691 INTERCEPT_DR3_MASK |
@@ -561,7 +694,7 @@ static void init_vmcb(struct vcpu_svm *svm)
561 INTERCEPT_DR6_MASK | 694 INTERCEPT_DR6_MASK |
562 INTERCEPT_DR7_MASK; 695 INTERCEPT_DR7_MASK;
563 696
564 control->intercept_dr_write = INTERCEPT_DR0_MASK | 697 control->intercept_dr_write = INTERCEPT_DR0_MASK |
565 INTERCEPT_DR1_MASK | 698 INTERCEPT_DR1_MASK |
566 INTERCEPT_DR2_MASK | 699 INTERCEPT_DR2_MASK |
567 INTERCEPT_DR3_MASK | 700 INTERCEPT_DR3_MASK |
@@ -575,7 +708,7 @@ static void init_vmcb(struct vcpu_svm *svm)
575 (1 << MC_VECTOR); 708 (1 << MC_VECTOR);
576 709
577 710
578 control->intercept = (1ULL << INTERCEPT_INTR) | 711 control->intercept = (1ULL << INTERCEPT_INTR) |
579 (1ULL << INTERCEPT_NMI) | 712 (1ULL << INTERCEPT_NMI) |
580 (1ULL << INTERCEPT_SMI) | 713 (1ULL << INTERCEPT_SMI) |
581 (1ULL << INTERCEPT_SELECTIVE_CR0) | 714 (1ULL << INTERCEPT_SELECTIVE_CR0) |
@@ -636,7 +769,8 @@ static void init_vmcb(struct vcpu_svm *svm)
636 save->rip = 0x0000fff0; 769 save->rip = 0x0000fff0;
637 svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip; 770 svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
638 771
639 /* This is the guest-visible cr0 value. 772 /*
773 * This is the guest-visible cr0 value.
640 * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0. 774 * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
641 */ 775 */
642 svm->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; 776 svm->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
@@ -706,30 +840,30 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
706 if (err) 840 if (err)
707 goto free_svm; 841 goto free_svm;
708 842
843 err = -ENOMEM;
709 page = alloc_page(GFP_KERNEL); 844 page = alloc_page(GFP_KERNEL);
710 if (!page) { 845 if (!page)
711 err = -ENOMEM;
712 goto uninit; 846 goto uninit;
713 }
714 847
715 err = -ENOMEM;
716 msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER); 848 msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
717 if (!msrpm_pages) 849 if (!msrpm_pages)
718 goto uninit; 850 goto free_page1;
719 851
720 nested_msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER); 852 nested_msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
721 if (!nested_msrpm_pages) 853 if (!nested_msrpm_pages)
722 goto uninit; 854 goto free_page2;
723
724 svm->msrpm = page_address(msrpm_pages);
725 svm_vcpu_init_msrpm(svm->msrpm);
726 855
727 hsave_page = alloc_page(GFP_KERNEL); 856 hsave_page = alloc_page(GFP_KERNEL);
728 if (!hsave_page) 857 if (!hsave_page)
729 goto uninit; 858 goto free_page3;
859
730 svm->nested.hsave = page_address(hsave_page); 860 svm->nested.hsave = page_address(hsave_page);
731 861
862 svm->msrpm = page_address(msrpm_pages);
863 svm_vcpu_init_msrpm(svm->msrpm);
864
732 svm->nested.msrpm = page_address(nested_msrpm_pages); 865 svm->nested.msrpm = page_address(nested_msrpm_pages);
866 svm_vcpu_init_msrpm(svm->nested.msrpm);
733 867
734 svm->vmcb = page_address(page); 868 svm->vmcb = page_address(page);
735 clear_page(svm->vmcb); 869 clear_page(svm->vmcb);
@@ -744,6 +878,12 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
744 878
745 return &svm->vcpu; 879 return &svm->vcpu;
746 880
881free_page3:
882 __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER);
883free_page2:
884 __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER);
885free_page1:
886 __free_page(page);
747uninit: 887uninit:
748 kvm_vcpu_uninit(&svm->vcpu); 888 kvm_vcpu_uninit(&svm->vcpu);
749free_svm: 889free_svm:
@@ -877,7 +1017,8 @@ static void svm_get_segment(struct kvm_vcpu *vcpu,
877 var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1; 1017 var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
878 var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1; 1018 var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1;
879 1019
880 /* AMD's VMCB does not have an explicit unusable field, so emulate it 1020 /*
1021 * AMD's VMCB does not have an explicit unusable field, so emulate it
881 * for cross vendor migration purposes by "not present" 1022 * for cross vendor migration purposes by "not present"
882 */ 1023 */
883 var->unusable = !var->present || (var->type == 0); 1024 var->unusable = !var->present || (var->type == 0);
@@ -913,7 +1054,8 @@ static void svm_get_segment(struct kvm_vcpu *vcpu,
913 var->type |= 0x1; 1054 var->type |= 0x1;
914 break; 1055 break;
915 case VCPU_SREG_SS: 1056 case VCPU_SREG_SS:
916 /* On AMD CPUs sometimes the DB bit in the segment 1057 /*
1058 * On AMD CPUs sometimes the DB bit in the segment
917 * descriptor is left as 1, although the whole segment has 1059 * descriptor is left as 1, although the whole segment has
918 * been made unusable. Clear it here to pass an Intel VMX 1060 * been made unusable. Clear it here to pass an Intel VMX
919 * entry check when cross vendor migrating. 1061 * entry check when cross vendor migrating.
@@ -931,36 +1073,36 @@ static int svm_get_cpl(struct kvm_vcpu *vcpu)
931 return save->cpl; 1073 return save->cpl;
932} 1074}
933 1075
934static void svm_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) 1076static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
935{ 1077{
936 struct vcpu_svm *svm = to_svm(vcpu); 1078 struct vcpu_svm *svm = to_svm(vcpu);
937 1079
938 dt->limit = svm->vmcb->save.idtr.limit; 1080 dt->size = svm->vmcb->save.idtr.limit;
939 dt->base = svm->vmcb->save.idtr.base; 1081 dt->address = svm->vmcb->save.idtr.base;
940} 1082}
941 1083
942static void svm_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) 1084static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
943{ 1085{
944 struct vcpu_svm *svm = to_svm(vcpu); 1086 struct vcpu_svm *svm = to_svm(vcpu);
945 1087
946 svm->vmcb->save.idtr.limit = dt->limit; 1088 svm->vmcb->save.idtr.limit = dt->size;
947 svm->vmcb->save.idtr.base = dt->base ; 1089 svm->vmcb->save.idtr.base = dt->address ;
948} 1090}
949 1091
950static void svm_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) 1092static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
951{ 1093{
952 struct vcpu_svm *svm = to_svm(vcpu); 1094 struct vcpu_svm *svm = to_svm(vcpu);
953 1095
954 dt->limit = svm->vmcb->save.gdtr.limit; 1096 dt->size = svm->vmcb->save.gdtr.limit;
955 dt->base = svm->vmcb->save.gdtr.base; 1097 dt->address = svm->vmcb->save.gdtr.base;
956} 1098}
957 1099
958static void svm_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) 1100static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
959{ 1101{
960 struct vcpu_svm *svm = to_svm(vcpu); 1102 struct vcpu_svm *svm = to_svm(vcpu);
961 1103
962 svm->vmcb->save.gdtr.limit = dt->limit; 1104 svm->vmcb->save.gdtr.limit = dt->size;
963 svm->vmcb->save.gdtr.base = dt->base ; 1105 svm->vmcb->save.gdtr.base = dt->address ;
964} 1106}
965 1107
966static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) 1108static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
@@ -973,6 +1115,7 @@ static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
973 1115
974static void update_cr0_intercept(struct vcpu_svm *svm) 1116static void update_cr0_intercept(struct vcpu_svm *svm)
975{ 1117{
1118 struct vmcb *vmcb = svm->vmcb;
976 ulong gcr0 = svm->vcpu.arch.cr0; 1119 ulong gcr0 = svm->vcpu.arch.cr0;
977 u64 *hcr0 = &svm->vmcb->save.cr0; 1120 u64 *hcr0 = &svm->vmcb->save.cr0;
978 1121
@@ -984,11 +1127,25 @@ static void update_cr0_intercept(struct vcpu_svm *svm)
984 1127
985 1128
986 if (gcr0 == *hcr0 && svm->vcpu.fpu_active) { 1129 if (gcr0 == *hcr0 && svm->vcpu.fpu_active) {
987 svm->vmcb->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK; 1130 vmcb->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK;
988 svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK; 1131 vmcb->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK;
1132 if (is_nested(svm)) {
1133 struct vmcb *hsave = svm->nested.hsave;
1134
1135 hsave->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK;
1136 hsave->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK;
1137 vmcb->control.intercept_cr_read |= svm->nested.intercept_cr_read;
1138 vmcb->control.intercept_cr_write |= svm->nested.intercept_cr_write;
1139 }
989 } else { 1140 } else {
990 svm->vmcb->control.intercept_cr_read |= INTERCEPT_CR0_MASK; 1141 svm->vmcb->control.intercept_cr_read |= INTERCEPT_CR0_MASK;
991 svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR0_MASK; 1142 svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR0_MASK;
1143 if (is_nested(svm)) {
1144 struct vmcb *hsave = svm->nested.hsave;
1145
1146 hsave->control.intercept_cr_read |= INTERCEPT_CR0_MASK;
1147 hsave->control.intercept_cr_write |= INTERCEPT_CR0_MASK;
1148 }
992 } 1149 }
993} 1150}
994 1151
@@ -996,6 +1153,27 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
996{ 1153{
997 struct vcpu_svm *svm = to_svm(vcpu); 1154 struct vcpu_svm *svm = to_svm(vcpu);
998 1155
1156 if (is_nested(svm)) {
1157 /*
1158 * We are here because we run in nested mode, the host kvm
1159 * intercepts cr0 writes but the l1 hypervisor does not.
1160 * But the L1 hypervisor may intercept selective cr0 writes.
1161 * This needs to be checked here.
1162 */
1163 unsigned long old, new;
1164
1165 /* Remove bits that would trigger a real cr0 write intercept */
1166 old = vcpu->arch.cr0 & SVM_CR0_SELECTIVE_MASK;
1167 new = cr0 & SVM_CR0_SELECTIVE_MASK;
1168
1169 if (old == new) {
1170 /* cr0 write with ts and mp unchanged */
1171 svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
1172 if (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE)
1173 return;
1174 }
1175 }
1176
999#ifdef CONFIG_X86_64 1177#ifdef CONFIG_X86_64
1000 if (vcpu->arch.efer & EFER_LME) { 1178 if (vcpu->arch.efer & EFER_LME) {
1001 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { 1179 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
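
[Editor's sketch, not part of the patch] The nested-mode check added to svm_set_cr0() compares only the selectively interceptable CR0 bits of the old and new values before reporting SVM_EXIT_CR0_SEL_WRITE to the L1. The mask below (TS and MP) is assumed to match the kernel's SVM_CR0_SELECTIVE_MASK and is shown only for illustration:

	#include <stdbool.h>

	#define X86_CR0_MP             (1UL << 1)
	#define X86_CR0_TS             (1UL << 3)
	#define SVM_CR0_SELECTIVE_MASK (X86_CR0_TS | X86_CR0_MP)

	/* condition the hunk checks before raising SVM_EXIT_CR0_SEL_WRITE */
	static bool ts_mp_unchanged(unsigned long old_cr0, unsigned long new_cr0)
	{
		return (old_cr0 & SVM_CR0_SELECTIVE_MASK) ==
		       (new_cr0 & SVM_CR0_SELECTIVE_MASK);
	}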
@@ -1129,70 +1307,11 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
1129 svm->vmcb->control.asid = sd->next_asid++; 1307 svm->vmcb->control.asid = sd->next_asid++;
1130} 1308}
1131 1309
1132static int svm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *dest) 1310static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
1133{
1134 struct vcpu_svm *svm = to_svm(vcpu);
1135
1136 switch (dr) {
1137 case 0 ... 3:
1138 *dest = vcpu->arch.db[dr];
1139 break;
1140 case 4:
1141 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
1142 return EMULATE_FAIL; /* will re-inject UD */
1143 /* fall through */
1144 case 6:
1145 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
1146 *dest = vcpu->arch.dr6;
1147 else
1148 *dest = svm->vmcb->save.dr6;
1149 break;
1150 case 5:
1151 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
1152 return EMULATE_FAIL; /* will re-inject UD */
1153 /* fall through */
1154 case 7:
1155 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
1156 *dest = vcpu->arch.dr7;
1157 else
1158 *dest = svm->vmcb->save.dr7;
1159 break;
1160 }
1161
1162 return EMULATE_DONE;
1163}
1164
1165static int svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value)
1166{ 1311{
1167 struct vcpu_svm *svm = to_svm(vcpu); 1312 struct vcpu_svm *svm = to_svm(vcpu);
1168 1313
1169 switch (dr) { 1314 svm->vmcb->save.dr7 = value;
1170 case 0 ... 3:
1171 vcpu->arch.db[dr] = value;
1172 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
1173 vcpu->arch.eff_db[dr] = value;
1174 break;
1175 case 4:
1176 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
1177 return EMULATE_FAIL; /* will re-inject UD */
1178 /* fall through */
1179 case 6:
1180 vcpu->arch.dr6 = (value & DR6_VOLATILE) | DR6_FIXED_1;
1181 break;
1182 case 5:
1183 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
1184 return EMULATE_FAIL; /* will re-inject UD */
1185 /* fall through */
1186 case 7:
1187 vcpu->arch.dr7 = (value & DR7_VOLATILE) | DR7_FIXED_1;
1188 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
1189 svm->vmcb->save.dr7 = vcpu->arch.dr7;
1190 vcpu->arch.switch_db_regs = (value & DR7_BP_EN_MASK);
1191 }
1192 break;
1193 }
1194
1195 return EMULATE_DONE;
1196} 1315}
1197 1316
1198static int pf_interception(struct vcpu_svm *svm) 1317static int pf_interception(struct vcpu_svm *svm)
@@ -1229,7 +1348,7 @@ static int db_interception(struct vcpu_svm *svm)
1229 } 1348 }
1230 1349
1231 if (svm->vcpu.guest_debug & 1350 if (svm->vcpu.guest_debug &
1232 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)){ 1351 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) {
1233 kvm_run->exit_reason = KVM_EXIT_DEBUG; 1352 kvm_run->exit_reason = KVM_EXIT_DEBUG;
1234 kvm_run->debug.arch.pc = 1353 kvm_run->debug.arch.pc =
1235 svm->vmcb->save.cs.base + svm->vmcb->save.rip; 1354 svm->vmcb->save.cs.base + svm->vmcb->save.rip;
@@ -1263,7 +1382,22 @@ static int ud_interception(struct vcpu_svm *svm)
1263static void svm_fpu_activate(struct kvm_vcpu *vcpu) 1382static void svm_fpu_activate(struct kvm_vcpu *vcpu)
1264{ 1383{
1265 struct vcpu_svm *svm = to_svm(vcpu); 1384 struct vcpu_svm *svm = to_svm(vcpu);
1266 svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); 1385 u32 excp;
1386
1387 if (is_nested(svm)) {
1388 u32 h_excp, n_excp;
1389
1390 h_excp = svm->nested.hsave->control.intercept_exceptions;
1391 n_excp = svm->nested.intercept_exceptions;
1392 h_excp &= ~(1 << NM_VECTOR);
1393 excp = h_excp | n_excp;
1394 } else {
1395 excp = svm->vmcb->control.intercept_exceptions;
1396 excp &= ~(1 << NM_VECTOR);
1397 }
1398
1399 svm->vmcb->control.intercept_exceptions = excp;
1400
1267 svm->vcpu.fpu_active = 1; 1401 svm->vcpu.fpu_active = 1;
1268 update_cr0_intercept(svm); 1402 update_cr0_intercept(svm);
1269} 1403}
@@ -1304,29 +1438,23 @@ static int shutdown_interception(struct vcpu_svm *svm)
1304 1438
1305static int io_interception(struct vcpu_svm *svm) 1439static int io_interception(struct vcpu_svm *svm)
1306{ 1440{
1441 struct kvm_vcpu *vcpu = &svm->vcpu;
1307 u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */ 1442 u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
1308 int size, in, string; 1443 int size, in, string;
1309 unsigned port; 1444 unsigned port;
1310 1445
1311 ++svm->vcpu.stat.io_exits; 1446 ++svm->vcpu.stat.io_exits;
1312
1313 svm->next_rip = svm->vmcb->control.exit_info_2;
1314
1315 string = (io_info & SVM_IOIO_STR_MASK) != 0; 1447 string = (io_info & SVM_IOIO_STR_MASK) != 0;
1316
1317 if (string) {
1318 if (emulate_instruction(&svm->vcpu,
1319 0, 0, 0) == EMULATE_DO_MMIO)
1320 return 0;
1321 return 1;
1322 }
1323
1324 in = (io_info & SVM_IOIO_TYPE_MASK) != 0; 1448 in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
1449 if (string || in)
1450 return !(emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO);
1451
1325 port = io_info >> 16; 1452 port = io_info >> 16;
1326 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; 1453 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
1327 1454 svm->next_rip = svm->vmcb->control.exit_info_2;
1328 skip_emulated_instruction(&svm->vcpu); 1455 skip_emulated_instruction(&svm->vcpu);
1329 return kvm_emulate_pio(&svm->vcpu, in, size, port); 1456
1457 return kvm_fast_pio_out(vcpu, size, port);
1330} 1458}
1331 1459
1332static int nmi_interception(struct vcpu_svm *svm) 1460static int nmi_interception(struct vcpu_svm *svm)
@@ -1379,6 +1507,8 @@ static int nested_svm_check_permissions(struct vcpu_svm *svm)
1379static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, 1507static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
1380 bool has_error_code, u32 error_code) 1508 bool has_error_code, u32 error_code)
1381{ 1509{
1510 int vmexit;
1511
1382 if (!is_nested(svm)) 1512 if (!is_nested(svm))
1383 return 0; 1513 return 0;
1384 1514
@@ -1387,21 +1517,28 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
1387 svm->vmcb->control.exit_info_1 = error_code; 1517 svm->vmcb->control.exit_info_1 = error_code;
1388 svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2; 1518 svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
1389 1519
1390 return nested_svm_exit_handled(svm); 1520 vmexit = nested_svm_intercept(svm);
1521 if (vmexit == NESTED_EXIT_DONE)
1522 svm->nested.exit_required = true;
1523
1524 return vmexit;
1391} 1525}
1392 1526
 1393static inline int nested_svm_intr(struct vcpu_svm *svm) 1527/* This function returns true if it is safe to enable the irq window */
1528static inline bool nested_svm_intr(struct vcpu_svm *svm)
1394{ 1529{
1395 if (!is_nested(svm)) 1530 if (!is_nested(svm))
1396 return 0; 1531 return true;
1397 1532
1398 if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK)) 1533 if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
1399 return 0; 1534 return true;
1400 1535
1401 if (!(svm->vcpu.arch.hflags & HF_HIF_MASK)) 1536 if (!(svm->vcpu.arch.hflags & HF_HIF_MASK))
1402 return 0; 1537 return false;
1403 1538
1404 svm->vmcb->control.exit_code = SVM_EXIT_INTR; 1539 svm->vmcb->control.exit_code = SVM_EXIT_INTR;
1540 svm->vmcb->control.exit_info_1 = 0;
1541 svm->vmcb->control.exit_info_2 = 0;
1405 1542
1406 if (svm->nested.intercept & 1ULL) { 1543 if (svm->nested.intercept & 1ULL) {
1407 /* 1544 /*
@@ -1412,21 +1549,40 @@ static inline int nested_svm_intr(struct vcpu_svm *svm)
1412 */ 1549 */
1413 svm->nested.exit_required = true; 1550 svm->nested.exit_required = true;
1414 trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip); 1551 trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
1415 return 1; 1552 return false;
1416 } 1553 }
1417 1554
1418 return 0; 1555 return true;
1556}
1557
 1558/* This function returns true if it is safe to enable the nmi window */
1559static inline bool nested_svm_nmi(struct vcpu_svm *svm)
1560{
1561 if (!is_nested(svm))
1562 return true;
1563
1564 if (!(svm->nested.intercept & (1ULL << INTERCEPT_NMI)))
1565 return true;
1566
1567 svm->vmcb->control.exit_code = SVM_EXIT_NMI;
1568 svm->nested.exit_required = true;
1569
1570 return false;
1419} 1571}
1420 1572
1421static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, enum km_type idx) 1573static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, struct page **_page)
1422{ 1574{
1423 struct page *page; 1575 struct page *page;
1424 1576
1577 might_sleep();
1578
1425 page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT); 1579 page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT);
1426 if (is_error_page(page)) 1580 if (is_error_page(page))
1427 goto error; 1581 goto error;
1428 1582
1429 return kmap_atomic(page, idx); 1583 *_page = page;
1584
1585 return kmap(page);
1430 1586
1431error: 1587error:
1432 kvm_release_page_clean(page); 1588 kvm_release_page_clean(page);
@@ -1435,61 +1591,55 @@ error:
1435 return NULL; 1591 return NULL;
1436} 1592}
1437 1593
1438static void nested_svm_unmap(void *addr, enum km_type idx) 1594static void nested_svm_unmap(struct page *page)
1439{ 1595{
1440 struct page *page; 1596 kunmap(page);
1597 kvm_release_page_dirty(page);
1598}
1441 1599
1442 if (!addr) 1600static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
1443 return; 1601{
1602 unsigned port;
1603 u8 val, bit;
1604 u64 gpa;
1444 1605
1445 page = kmap_atomic_to_page(addr); 1606 if (!(svm->nested.intercept & (1ULL << INTERCEPT_IOIO_PROT)))
1607 return NESTED_EXIT_HOST;
1446 1608
1447 kunmap_atomic(addr, idx); 1609 port = svm->vmcb->control.exit_info_1 >> 16;
1448 kvm_release_page_dirty(page); 1610 gpa = svm->nested.vmcb_iopm + (port / 8);
1611 bit = port % 8;
1612 val = 0;
1613
1614 if (kvm_read_guest(svm->vcpu.kvm, gpa, &val, 1))
1615 val &= (1 << bit);
1616
1617 return val ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
1449} 1618}
1450 1619
1451static bool nested_svm_exit_handled_msr(struct vcpu_svm *svm) 1620static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
1452{ 1621{
1453 u32 param = svm->vmcb->control.exit_info_1 & 1; 1622 u32 offset, msr, value;
1454 u32 msr = svm->vcpu.arch.regs[VCPU_REGS_RCX]; 1623 int write, mask;
1455 bool ret = false;
1456 u32 t0, t1;
1457 u8 *msrpm;
1458 1624
1459 if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT))) 1625 if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
1460 return false; 1626 return NESTED_EXIT_HOST;
1461
1462 msrpm = nested_svm_map(svm, svm->nested.vmcb_msrpm, KM_USER0);
1463 1627
1464 if (!msrpm) 1628 msr = svm->vcpu.arch.regs[VCPU_REGS_RCX];
1465 goto out; 1629 offset = svm_msrpm_offset(msr);
1630 write = svm->vmcb->control.exit_info_1 & 1;
1631 mask = 1 << ((2 * (msr & 0xf)) + write);
1466 1632
1467 switch (msr) { 1633 if (offset == MSR_INVALID)
1468 case 0 ... 0x1fff: 1634 return NESTED_EXIT_DONE;
1469 t0 = (msr * 2) % 8;
1470 t1 = msr / 8;
1471 break;
1472 case 0xc0000000 ... 0xc0001fff:
1473 t0 = (8192 + msr - 0xc0000000) * 2;
1474 t1 = (t0 / 8);
1475 t0 %= 8;
1476 break;
1477 case 0xc0010000 ... 0xc0011fff:
1478 t0 = (16384 + msr - 0xc0010000) * 2;
1479 t1 = (t0 / 8);
1480 t0 %= 8;
1481 break;
1482 default:
1483 ret = true;
1484 goto out;
1485 }
1486 1635
1487 ret = msrpm[t1] & ((1 << param) << t0); 1636 /* Offset is in 32 bit units but need in 8 bit units */
1637 offset *= 4;
1488 1638
1489out: 1639 if (kvm_read_guest(svm->vcpu.kvm, svm->nested.vmcb_msrpm + offset, &value, 4))
1490 nested_svm_unmap(msrpm, KM_USER0); 1640 return NESTED_EXIT_DONE;
1491 1641
1492 return ret; 1642 return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
1493} 1643}
1494 1644
1495static int nested_svm_exit_special(struct vcpu_svm *svm) 1645static int nested_svm_exit_special(struct vcpu_svm *svm)
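
[Editor's sketch, not part of the patch] nested_svm_intercept_ioio() above consults the L1 guest's IO permission bitmap: one bit per port, so the byte lives at iopm_base + port/8 and the bit index is port % 8. The guest-memory read is replaced by a plain array here to keep the sketch standalone:

	#include <stdbool.h>
	#include <stdint.h>

	static bool nested_intercepts_port(const uint8_t *iopm, unsigned port)
	{
		uint8_t byte = iopm[port / 8];
		return byte & (1U << (port % 8));
	}

	int main(void)
	{
		uint8_t iopm[8192] = { 0 };             /* 64K ports, one bit each */
		iopm[0x3f8 / 8] |= 1U << (0x3f8 % 8);   /* intercept COM1 data port */
		return !nested_intercepts_port(iopm, 0x3f8);
	}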
@@ -1500,16 +1650,19 @@ static int nested_svm_exit_special(struct vcpu_svm *svm)
1500 case SVM_EXIT_INTR: 1650 case SVM_EXIT_INTR:
1501 case SVM_EXIT_NMI: 1651 case SVM_EXIT_NMI:
1502 return NESTED_EXIT_HOST; 1652 return NESTED_EXIT_HOST;
1503 /* For now we are always handling NPFs when using them */
1504 case SVM_EXIT_NPF: 1653 case SVM_EXIT_NPF:
1654 /* For now we are always handling NPFs when using them */
1505 if (npt_enabled) 1655 if (npt_enabled)
1506 return NESTED_EXIT_HOST; 1656 return NESTED_EXIT_HOST;
1507 break; 1657 break;
1508 /* When we're shadowing, trap PFs */
1509 case SVM_EXIT_EXCP_BASE + PF_VECTOR: 1658 case SVM_EXIT_EXCP_BASE + PF_VECTOR:
1659 /* When we're shadowing, trap PFs */
1510 if (!npt_enabled) 1660 if (!npt_enabled)
1511 return NESTED_EXIT_HOST; 1661 return NESTED_EXIT_HOST;
1512 break; 1662 break;
1663 case SVM_EXIT_EXCP_BASE + NM_VECTOR:
1664 nm_interception(svm);
1665 break;
1513 default: 1666 default:
1514 break; 1667 break;
1515 } 1668 }
@@ -1520,7 +1673,7 @@ static int nested_svm_exit_special(struct vcpu_svm *svm)
1520/* 1673/*
1521 * If this function returns true, this #vmexit was already handled 1674 * If this function returns true, this #vmexit was already handled
1522 */ 1675 */
1523static int nested_svm_exit_handled(struct vcpu_svm *svm) 1676static int nested_svm_intercept(struct vcpu_svm *svm)
1524{ 1677{
1525 u32 exit_code = svm->vmcb->control.exit_code; 1678 u32 exit_code = svm->vmcb->control.exit_code;
1526 int vmexit = NESTED_EXIT_HOST; 1679 int vmexit = NESTED_EXIT_HOST;
@@ -1529,6 +1682,9 @@ static int nested_svm_exit_handled(struct vcpu_svm *svm)
1529 case SVM_EXIT_MSR: 1682 case SVM_EXIT_MSR:
1530 vmexit = nested_svm_exit_handled_msr(svm); 1683 vmexit = nested_svm_exit_handled_msr(svm);
1531 break; 1684 break;
1685 case SVM_EXIT_IOIO:
1686 vmexit = nested_svm_intercept_ioio(svm);
1687 break;
1532 case SVM_EXIT_READ_CR0 ... SVM_EXIT_READ_CR8: { 1688 case SVM_EXIT_READ_CR0 ... SVM_EXIT_READ_CR8: {
1533 u32 cr_bits = 1 << (exit_code - SVM_EXIT_READ_CR0); 1689 u32 cr_bits = 1 << (exit_code - SVM_EXIT_READ_CR0);
1534 if (svm->nested.intercept_cr_read & cr_bits) 1690 if (svm->nested.intercept_cr_read & cr_bits)
@@ -1566,9 +1722,17 @@ static int nested_svm_exit_handled(struct vcpu_svm *svm)
1566 } 1722 }
1567 } 1723 }
1568 1724
1569 if (vmexit == NESTED_EXIT_DONE) { 1725 return vmexit;
1726}
1727
1728static int nested_svm_exit_handled(struct vcpu_svm *svm)
1729{
1730 int vmexit;
1731
1732 vmexit = nested_svm_intercept(svm);
1733
1734 if (vmexit == NESTED_EXIT_DONE)
1570 nested_svm_vmexit(svm); 1735 nested_svm_vmexit(svm);
1571 }
1572 1736
1573 return vmexit; 1737 return vmexit;
1574} 1738}
@@ -1610,6 +1774,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1610 struct vmcb *nested_vmcb; 1774 struct vmcb *nested_vmcb;
1611 struct vmcb *hsave = svm->nested.hsave; 1775 struct vmcb *hsave = svm->nested.hsave;
1612 struct vmcb *vmcb = svm->vmcb; 1776 struct vmcb *vmcb = svm->vmcb;
1777 struct page *page;
1613 1778
1614 trace_kvm_nested_vmexit_inject(vmcb->control.exit_code, 1779 trace_kvm_nested_vmexit_inject(vmcb->control.exit_code,
1615 vmcb->control.exit_info_1, 1780 vmcb->control.exit_info_1,
@@ -1617,10 +1782,13 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1617 vmcb->control.exit_int_info, 1782 vmcb->control.exit_int_info,
1618 vmcb->control.exit_int_info_err); 1783 vmcb->control.exit_int_info_err);
1619 1784
1620 nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, KM_USER0); 1785 nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, &page);
1621 if (!nested_vmcb) 1786 if (!nested_vmcb)
1622 return 1; 1787 return 1;
1623 1788
1789 /* Exit nested SVM mode */
1790 svm->nested.vmcb = 0;
1791
1624 /* Give the current vmcb to the guest */ 1792 /* Give the current vmcb to the guest */
1625 disable_gif(svm); 1793 disable_gif(svm);
1626 1794
@@ -1630,9 +1798,13 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1630 nested_vmcb->save.ds = vmcb->save.ds; 1798 nested_vmcb->save.ds = vmcb->save.ds;
1631 nested_vmcb->save.gdtr = vmcb->save.gdtr; 1799 nested_vmcb->save.gdtr = vmcb->save.gdtr;
1632 nested_vmcb->save.idtr = vmcb->save.idtr; 1800 nested_vmcb->save.idtr = vmcb->save.idtr;
1801 nested_vmcb->save.cr0 = kvm_read_cr0(&svm->vcpu);
1633 if (npt_enabled) 1802 if (npt_enabled)
1634 nested_vmcb->save.cr3 = vmcb->save.cr3; 1803 nested_vmcb->save.cr3 = vmcb->save.cr3;
1804 else
1805 nested_vmcb->save.cr3 = svm->vcpu.arch.cr3;
1635 nested_vmcb->save.cr2 = vmcb->save.cr2; 1806 nested_vmcb->save.cr2 = vmcb->save.cr2;
1807 nested_vmcb->save.cr4 = svm->vcpu.arch.cr4;
1636 nested_vmcb->save.rflags = vmcb->save.rflags; 1808 nested_vmcb->save.rflags = vmcb->save.rflags;
1637 nested_vmcb->save.rip = vmcb->save.rip; 1809 nested_vmcb->save.rip = vmcb->save.rip;
1638 nested_vmcb->save.rsp = vmcb->save.rsp; 1810 nested_vmcb->save.rsp = vmcb->save.rsp;
@@ -1704,10 +1876,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1704 svm->vmcb->save.cpl = 0; 1876 svm->vmcb->save.cpl = 0;
1705 svm->vmcb->control.exit_int_info = 0; 1877 svm->vmcb->control.exit_int_info = 0;
1706 1878
1707 /* Exit nested SVM mode */ 1879 nested_svm_unmap(page);
1708 svm->nested.vmcb = 0;
1709
1710 nested_svm_unmap(nested_vmcb, KM_USER0);
1711 1880
1712 kvm_mmu_reset_context(&svm->vcpu); 1881 kvm_mmu_reset_context(&svm->vcpu);
1713 kvm_mmu_load(&svm->vcpu); 1882 kvm_mmu_load(&svm->vcpu);
@@ -1717,19 +1886,33 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1717 1886
1718static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) 1887static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
1719{ 1888{
1720 u32 *nested_msrpm; 1889 /*
1890 * This function merges the msr permission bitmaps of kvm and the
 1891 * nested vmcb. It is optimized in that it only merges the parts where
1892 * the kvm msr permission bitmap may contain zero bits
1893 */
1721 int i; 1894 int i;
1722 1895
1723 nested_msrpm = nested_svm_map(svm, svm->nested.vmcb_msrpm, KM_USER0); 1896 if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
1724 if (!nested_msrpm) 1897 return true;
1725 return false;
1726 1898
1727 for (i=0; i< PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER) / 4; i++) 1899 for (i = 0; i < MSRPM_OFFSETS; i++) {
1728 svm->nested.msrpm[i] = svm->msrpm[i] | nested_msrpm[i]; 1900 u32 value, p;
1901 u64 offset;
1729 1902
1730 svm->vmcb->control.msrpm_base_pa = __pa(svm->nested.msrpm); 1903 if (msrpm_offsets[i] == 0xffffffff)
1904 break;
1905
1906 p = msrpm_offsets[i];
1907 offset = svm->nested.vmcb_msrpm + (p * 4);
1908
1909 if (kvm_read_guest(svm->vcpu.kvm, offset, &value, 4))
1910 return false;
1911
1912 svm->nested.msrpm[p] = svm->msrpm[p] | value;
1913 }
1731 1914
1732 nested_svm_unmap(nested_msrpm, KM_USER0); 1915 svm->vmcb->control.msrpm_base_pa = __pa(svm->nested.msrpm);
1733 1916
1734 return true; 1917 return true;
1735} 1918}
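
[Editor's sketch, not part of the patch] The merge loop in nested_svm_vmrun_msrpm() above only touches the u32 chunks whose offsets were recorded in msrpm_offsets, i.e. the places where KVM's own bitmap may contain zero bits, and ORs them with the L1 guest's bitmap. Guest memory access is modeled as a plain array in this sketch:

	#include <stdint.h>

	#define MSRPM_OFFSETS 16
	#define MSR_INVALID   0xffffffffU

	static void merge_msrpm(uint32_t *merged, const uint32_t *host,
				const uint32_t *nested, const uint32_t *offsets)
	{
		for (int i = 0; i < MSRPM_OFFSETS; i++) {
			uint32_t p = offsets[i];

			if (p == MSR_INVALID)        /* end of the recorded offsets */
				break;
			/* a set bit in either map keeps the intercept */
			merged[p] = host[p] | nested[p];
		}
	}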
@@ -1739,26 +1922,34 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
1739 struct vmcb *nested_vmcb; 1922 struct vmcb *nested_vmcb;
1740 struct vmcb *hsave = svm->nested.hsave; 1923 struct vmcb *hsave = svm->nested.hsave;
1741 struct vmcb *vmcb = svm->vmcb; 1924 struct vmcb *vmcb = svm->vmcb;
1925 struct page *page;
1926 u64 vmcb_gpa;
1927
1928 vmcb_gpa = svm->vmcb->save.rax;
1742 1929
1743 nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, KM_USER0); 1930 nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
1744 if (!nested_vmcb) 1931 if (!nested_vmcb)
1745 return false; 1932 return false;
1746 1933
1747 /* nested_vmcb is our indicator if nested SVM is activated */ 1934 trace_kvm_nested_vmrun(svm->vmcb->save.rip - 3, vmcb_gpa,
1748 svm->nested.vmcb = svm->vmcb->save.rax;
1749
1750 trace_kvm_nested_vmrun(svm->vmcb->save.rip - 3, svm->nested.vmcb,
1751 nested_vmcb->save.rip, 1935 nested_vmcb->save.rip,
1752 nested_vmcb->control.int_ctl, 1936 nested_vmcb->control.int_ctl,
1753 nested_vmcb->control.event_inj, 1937 nested_vmcb->control.event_inj,
1754 nested_vmcb->control.nested_ctl); 1938 nested_vmcb->control.nested_ctl);
1755 1939
1940 trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr_read,
1941 nested_vmcb->control.intercept_cr_write,
1942 nested_vmcb->control.intercept_exceptions,
1943 nested_vmcb->control.intercept);
1944
1756 /* Clear internal status */ 1945 /* Clear internal status */
1757 kvm_clear_exception_queue(&svm->vcpu); 1946 kvm_clear_exception_queue(&svm->vcpu);
1758 kvm_clear_interrupt_queue(&svm->vcpu); 1947 kvm_clear_interrupt_queue(&svm->vcpu);
1759 1948
1760 /* Save the old vmcb, so we don't need to pick what we save, but 1949 /*
1761 can restore everything when a VMEXIT occurs */ 1950 * Save the old vmcb, so we don't need to pick what we save, but can
1951 * restore everything when a VMEXIT occurs
1952 */
1762 hsave->save.es = vmcb->save.es; 1953 hsave->save.es = vmcb->save.es;
1763 hsave->save.cs = vmcb->save.cs; 1954 hsave->save.cs = vmcb->save.cs;
1764 hsave->save.ss = vmcb->save.ss; 1955 hsave->save.ss = vmcb->save.ss;
@@ -1798,14 +1989,17 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
1798 if (npt_enabled) { 1989 if (npt_enabled) {
1799 svm->vmcb->save.cr3 = nested_vmcb->save.cr3; 1990 svm->vmcb->save.cr3 = nested_vmcb->save.cr3;
1800 svm->vcpu.arch.cr3 = nested_vmcb->save.cr3; 1991 svm->vcpu.arch.cr3 = nested_vmcb->save.cr3;
1801 } else { 1992 } else
1802 kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3); 1993 kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3);
1803 kvm_mmu_reset_context(&svm->vcpu); 1994
1804 } 1995 /* Guest paging mode is active - reset mmu */
1996 kvm_mmu_reset_context(&svm->vcpu);
1997
1805 svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2; 1998 svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2;
1806 kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, nested_vmcb->save.rax); 1999 kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, nested_vmcb->save.rax);
1807 kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, nested_vmcb->save.rsp); 2000 kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, nested_vmcb->save.rsp);
1808 kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, nested_vmcb->save.rip); 2001 kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, nested_vmcb->save.rip);
2002
1809 /* In case we don't even reach vcpu_run, the fields are not updated */ 2003 /* In case we don't even reach vcpu_run, the fields are not updated */
1810 svm->vmcb->save.rax = nested_vmcb->save.rax; 2004 svm->vmcb->save.rax = nested_vmcb->save.rax;
1811 svm->vmcb->save.rsp = nested_vmcb->save.rsp; 2005 svm->vmcb->save.rsp = nested_vmcb->save.rsp;
@@ -1814,22 +2008,8 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
1814 svm->vmcb->save.dr6 = nested_vmcb->save.dr6; 2008 svm->vmcb->save.dr6 = nested_vmcb->save.dr6;
1815 svm->vmcb->save.cpl = nested_vmcb->save.cpl; 2009 svm->vmcb->save.cpl = nested_vmcb->save.cpl;
1816 2010
1817 /* We don't want a nested guest to be more powerful than the guest, 2011 svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa & ~0x0fffULL;
1818 so all intercepts are ORed */ 2012 svm->nested.vmcb_iopm = nested_vmcb->control.iopm_base_pa & ~0x0fffULL;
1819 svm->vmcb->control.intercept_cr_read |=
1820 nested_vmcb->control.intercept_cr_read;
1821 svm->vmcb->control.intercept_cr_write |=
1822 nested_vmcb->control.intercept_cr_write;
1823 svm->vmcb->control.intercept_dr_read |=
1824 nested_vmcb->control.intercept_dr_read;
1825 svm->vmcb->control.intercept_dr_write |=
1826 nested_vmcb->control.intercept_dr_write;
1827 svm->vmcb->control.intercept_exceptions |=
1828 nested_vmcb->control.intercept_exceptions;
1829
1830 svm->vmcb->control.intercept |= nested_vmcb->control.intercept;
1831
1832 svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa;
1833 2013
1834 /* cache intercepts */ 2014 /* cache intercepts */
1835 svm->nested.intercept_cr_read = nested_vmcb->control.intercept_cr_read; 2015 svm->nested.intercept_cr_read = nested_vmcb->control.intercept_cr_read;
@@ -1846,13 +2026,40 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
1846 else 2026 else
1847 svm->vcpu.arch.hflags &= ~HF_VINTR_MASK; 2027 svm->vcpu.arch.hflags &= ~HF_VINTR_MASK;
1848 2028
2029 if (svm->vcpu.arch.hflags & HF_VINTR_MASK) {
2030 /* We only want the cr8 intercept bits of the guest */
2031 svm->vmcb->control.intercept_cr_read &= ~INTERCEPT_CR8_MASK;
2032 svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK;
2033 }
2034
2035 /*
2036 * We don't want a nested guest to be more powerful than the guest, so
2037 * all intercepts are ORed
2038 */
2039 svm->vmcb->control.intercept_cr_read |=
2040 nested_vmcb->control.intercept_cr_read;
2041 svm->vmcb->control.intercept_cr_write |=
2042 nested_vmcb->control.intercept_cr_write;
2043 svm->vmcb->control.intercept_dr_read |=
2044 nested_vmcb->control.intercept_dr_read;
2045 svm->vmcb->control.intercept_dr_write |=
2046 nested_vmcb->control.intercept_dr_write;
2047 svm->vmcb->control.intercept_exceptions |=
2048 nested_vmcb->control.intercept_exceptions;
2049
2050 svm->vmcb->control.intercept |= nested_vmcb->control.intercept;
2051
2052 svm->vmcb->control.lbr_ctl = nested_vmcb->control.lbr_ctl;
1849 svm->vmcb->control.int_vector = nested_vmcb->control.int_vector; 2053 svm->vmcb->control.int_vector = nested_vmcb->control.int_vector;
1850 svm->vmcb->control.int_state = nested_vmcb->control.int_state; 2054 svm->vmcb->control.int_state = nested_vmcb->control.int_state;
1851 svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset; 2055 svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset;
1852 svm->vmcb->control.event_inj = nested_vmcb->control.event_inj; 2056 svm->vmcb->control.event_inj = nested_vmcb->control.event_inj;
1853 svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err; 2057 svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err;
1854 2058
1855 nested_svm_unmap(nested_vmcb, KM_USER0); 2059 nested_svm_unmap(page);
2060
2061 /* nested_vmcb is our indicator if nested SVM is activated */
2062 svm->nested.vmcb = vmcb_gpa;
1856 2063
1857 enable_gif(svm); 2064 enable_gif(svm);
1858 2065
@@ -1878,6 +2085,7 @@ static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
1878static int vmload_interception(struct vcpu_svm *svm) 2085static int vmload_interception(struct vcpu_svm *svm)
1879{ 2086{
1880 struct vmcb *nested_vmcb; 2087 struct vmcb *nested_vmcb;
2088 struct page *page;
1881 2089
1882 if (nested_svm_check_permissions(svm)) 2090 if (nested_svm_check_permissions(svm))
1883 return 1; 2091 return 1;
@@ -1885,12 +2093,12 @@ static int vmload_interception(struct vcpu_svm *svm)
1885 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 2093 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
1886 skip_emulated_instruction(&svm->vcpu); 2094 skip_emulated_instruction(&svm->vcpu);
1887 2095
1888 nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, KM_USER0); 2096 nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
1889 if (!nested_vmcb) 2097 if (!nested_vmcb)
1890 return 1; 2098 return 1;
1891 2099
1892 nested_svm_vmloadsave(nested_vmcb, svm->vmcb); 2100 nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
1893 nested_svm_unmap(nested_vmcb, KM_USER0); 2101 nested_svm_unmap(page);
1894 2102
1895 return 1; 2103 return 1;
1896} 2104}
@@ -1898,6 +2106,7 @@ static int vmload_interception(struct vcpu_svm *svm)
1898static int vmsave_interception(struct vcpu_svm *svm) 2106static int vmsave_interception(struct vcpu_svm *svm)
1899{ 2107{
1900 struct vmcb *nested_vmcb; 2108 struct vmcb *nested_vmcb;
2109 struct page *page;
1901 2110
1902 if (nested_svm_check_permissions(svm)) 2111 if (nested_svm_check_permissions(svm))
1903 return 1; 2112 return 1;
@@ -1905,12 +2114,12 @@ static int vmsave_interception(struct vcpu_svm *svm)
1905 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 2114 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
1906 skip_emulated_instruction(&svm->vcpu); 2115 skip_emulated_instruction(&svm->vcpu);
1907 2116
1908 nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, KM_USER0); 2117 nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
1909 if (!nested_vmcb) 2118 if (!nested_vmcb)
1910 return 1; 2119 return 1;
1911 2120
1912 nested_svm_vmloadsave(svm->vmcb, nested_vmcb); 2121 nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
1913 nested_svm_unmap(nested_vmcb, KM_USER0); 2122 nested_svm_unmap(page);
1914 2123
1915 return 1; 2124 return 1;
1916} 2125}
@@ -2013,6 +2222,8 @@ static int task_switch_interception(struct vcpu_svm *svm)
2013 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK; 2222 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK;
2014 uint32_t idt_v = 2223 uint32_t idt_v =
2015 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID; 2224 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID;
2225 bool has_error_code = false;
2226 u32 error_code = 0;
2016 2227
2017 tss_selector = (u16)svm->vmcb->control.exit_info_1; 2228 tss_selector = (u16)svm->vmcb->control.exit_info_1;
2018 2229
@@ -2033,6 +2244,12 @@ static int task_switch_interception(struct vcpu_svm *svm)
2033 svm->vcpu.arch.nmi_injected = false; 2244 svm->vcpu.arch.nmi_injected = false;
2034 break; 2245 break;
2035 case SVM_EXITINTINFO_TYPE_EXEPT: 2246 case SVM_EXITINTINFO_TYPE_EXEPT:
2247 if (svm->vmcb->control.exit_info_2 &
2248 (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)) {
2249 has_error_code = true;
2250 error_code =
2251 (u32)svm->vmcb->control.exit_info_2;
2252 }
2036 kvm_clear_exception_queue(&svm->vcpu); 2253 kvm_clear_exception_queue(&svm->vcpu);
2037 break; 2254 break;
2038 case SVM_EXITINTINFO_TYPE_INTR: 2255 case SVM_EXITINTINFO_TYPE_INTR:
@@ -2049,7 +2266,14 @@ static int task_switch_interception(struct vcpu_svm *svm)
2049 (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) 2266 (int_vec == OF_VECTOR || int_vec == BP_VECTOR)))
2050 skip_emulated_instruction(&svm->vcpu); 2267 skip_emulated_instruction(&svm->vcpu);
2051 2268
2052 return kvm_task_switch(&svm->vcpu, tss_selector, reason); 2269 if (kvm_task_switch(&svm->vcpu, tss_selector, reason,
2270 has_error_code, error_code) == EMULATE_FAIL) {
2271 svm->vcpu.run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
2272 svm->vcpu.run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
2273 svm->vcpu.run->internal.ndata = 0;
2274 return 0;
2275 }
2276 return 1;
2053} 2277}
2054 2278
2055static int cpuid_interception(struct vcpu_svm *svm) 2279static int cpuid_interception(struct vcpu_svm *svm)
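
The task-switch hunks above make SVM forward a pending exception error code to kvm_task_switch() and report an internal error when emulation fails. The error code is carried in exit_info_2: a flag bit says whether one is present, the low 32 bits hold the value. A standalone sketch of that extraction; the bit position is written here as a local constant and should be treated as an assumption:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Assumed bit position in exit_info_2 for "task switch has error code". */
#define SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE 44

static bool ts_error_code(uint64_t exit_info_2, uint32_t *error_code)
{
	if (!(exit_info_2 & (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)))
		return false;
	*error_code = (uint32_t)exit_info_2;   /* low 32 bits */
	return true;
}

int main(void)
{
	uint32_t ec;
	uint64_t info = (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE) | 0x18;

	if (ts_error_code(info, &ec))
		printf("task switch carries error code 0x%x\n", ec);
	return 0;
}
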
@@ -2140,9 +2364,11 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
2140 case MSR_IA32_SYSENTER_ESP: 2364 case MSR_IA32_SYSENTER_ESP:
2141 *data = svm->sysenter_esp; 2365 *data = svm->sysenter_esp;
2142 break; 2366 break;
2143 /* Nobody will change the following 5 values in the VMCB so 2367 /*
2144 we can safely return them on rdmsr. They will always be 0 2368 * Nobody will change the following 5 values in the VMCB so we can
2145 until LBRV is implemented. */ 2369 * safely return them on rdmsr. They will always be 0 until LBRV is
2370 * implemented.
2371 */
2146 case MSR_IA32_DEBUGCTLMSR: 2372 case MSR_IA32_DEBUGCTLMSR:
2147 *data = svm->vmcb->save.dbgctl; 2373 *data = svm->vmcb->save.dbgctl;
2148 break; 2374 break;
@@ -2162,7 +2388,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
2162 *data = svm->nested.hsave_msr; 2388 *data = svm->nested.hsave_msr;
2163 break; 2389 break;
2164 case MSR_VM_CR: 2390 case MSR_VM_CR:
2165 *data = 0; 2391 *data = svm->nested.vm_cr_msr;
2166 break; 2392 break;
2167 case MSR_IA32_UCODE_REV: 2393 case MSR_IA32_UCODE_REV:
2168 *data = 0x01000065; 2394 *data = 0x01000065;
@@ -2192,6 +2418,31 @@ static int rdmsr_interception(struct vcpu_svm *svm)
2192 return 1; 2418 return 1;
2193} 2419}
2194 2420
2421static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
2422{
2423 struct vcpu_svm *svm = to_svm(vcpu);
2424 int svm_dis, chg_mask;
2425
2426 if (data & ~SVM_VM_CR_VALID_MASK)
2427 return 1;
2428
2429 chg_mask = SVM_VM_CR_VALID_MASK;
2430
2431 if (svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK)
2432 chg_mask &= ~(SVM_VM_CR_SVM_LOCK_MASK | SVM_VM_CR_SVM_DIS_MASK);
2433
2434 svm->nested.vm_cr_msr &= ~chg_mask;
2435 svm->nested.vm_cr_msr |= (data & chg_mask);
2436
2437 svm_dis = svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK;
2438
2439 /* check for svm_disable while efer.svme is set */
2440 if (svm_dis && (vcpu->arch.efer & EFER_SVME))
2441 return 1;
2442
2443 return 0;
2444}
2445
2195static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) 2446static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
2196{ 2447{
2197 struct vcpu_svm *svm = to_svm(vcpu); 2448 struct vcpu_svm *svm = to_svm(vcpu);
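
svm_set_vm_cr() above accepts guest writes to MSR_VM_CR, but once SVMDIS is set the LOCK and SVMDIS bits become read-only, and a write that leaves SVM disabled while EFER.SVME is already on is rejected. A standalone sketch of the masking logic; the bit values and the two-bit valid mask are local assumptions for illustration (the real valid mask may cover more bits):

#include <stdint.h>
#include <stdio.h>

#define VM_CR_SVM_LOCK 0x0008ULL   /* assumed bit positions */
#define VM_CR_SVM_DIS  0x0010ULL
#define VM_CR_VALID    (VM_CR_SVM_LOCK | VM_CR_SVM_DIS)

/* Returns 0 on success, 1 if the write should #GP, mirroring svm_set_vm_cr(). */
static int set_vm_cr(uint64_t *vm_cr, uint64_t data, int efer_svme)
{
	uint64_t chg_mask = VM_CR_VALID;

	if (data & ~VM_CR_VALID)
		return 1;                       /* reserved bits set */

	/* Once SVMDIS is set, LOCK and SVMDIS become read-only. */
	if (*vm_cr & VM_CR_SVM_DIS)
		chg_mask &= ~(VM_CR_SVM_LOCK | VM_CR_SVM_DIS);

	*vm_cr = (*vm_cr & ~chg_mask) | (data & chg_mask);

	/* Disabling SVM while EFER.SVME is set is rejected. */
	if ((*vm_cr & VM_CR_SVM_DIS) && efer_svme)
		return 1;

	return 0;
}

int main(void)
{
	uint64_t vm_cr = 0;

	printf("%d\n", set_vm_cr(&vm_cr, VM_CR_SVM_DIS, 0)); /* 0: sets SVMDIS */
	printf("%d\n", set_vm_cr(&vm_cr, 0, 0));             /* 0: but SVMDIS is sticky */
	printf("vm_cr = 0x%llx\n", (unsigned long long)vm_cr);
	return 0;
}
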
@@ -2258,6 +2509,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
2258 svm->nested.hsave_msr = data; 2509 svm->nested.hsave_msr = data;
2259 break; 2510 break;
2260 case MSR_VM_CR: 2511 case MSR_VM_CR:
2512 return svm_set_vm_cr(vcpu, data);
2261 case MSR_VM_IGNNE: 2513 case MSR_VM_IGNNE:
2262 pr_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); 2514 pr_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
2263 break; 2515 break;
@@ -2321,16 +2573,16 @@ static int pause_interception(struct vcpu_svm *svm)
2321} 2573}
2322 2574
2323static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { 2575static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
2324 [SVM_EXIT_READ_CR0] = emulate_on_interception, 2576 [SVM_EXIT_READ_CR0] = emulate_on_interception,
2325 [SVM_EXIT_READ_CR3] = emulate_on_interception, 2577 [SVM_EXIT_READ_CR3] = emulate_on_interception,
2326 [SVM_EXIT_READ_CR4] = emulate_on_interception, 2578 [SVM_EXIT_READ_CR4] = emulate_on_interception,
2327 [SVM_EXIT_READ_CR8] = emulate_on_interception, 2579 [SVM_EXIT_READ_CR8] = emulate_on_interception,
2328 [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, 2580 [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception,
2329 [SVM_EXIT_WRITE_CR0] = emulate_on_interception, 2581 [SVM_EXIT_WRITE_CR0] = emulate_on_interception,
2330 [SVM_EXIT_WRITE_CR3] = emulate_on_interception, 2582 [SVM_EXIT_WRITE_CR3] = emulate_on_interception,
2331 [SVM_EXIT_WRITE_CR4] = emulate_on_interception, 2583 [SVM_EXIT_WRITE_CR4] = emulate_on_interception,
2332 [SVM_EXIT_WRITE_CR8] = cr8_write_interception, 2584 [SVM_EXIT_WRITE_CR8] = cr8_write_interception,
2333 [SVM_EXIT_READ_DR0] = emulate_on_interception, 2585 [SVM_EXIT_READ_DR0] = emulate_on_interception,
2334 [SVM_EXIT_READ_DR1] = emulate_on_interception, 2586 [SVM_EXIT_READ_DR1] = emulate_on_interception,
2335 [SVM_EXIT_READ_DR2] = emulate_on_interception, 2587 [SVM_EXIT_READ_DR2] = emulate_on_interception,
2336 [SVM_EXIT_READ_DR3] = emulate_on_interception, 2588 [SVM_EXIT_READ_DR3] = emulate_on_interception,
@@ -2349,15 +2601,14 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
2349 [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception, 2601 [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception,
2350 [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception, 2602 [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception,
2351 [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception, 2603 [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception,
2352 [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception, 2604 [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception,
2353 [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception, 2605 [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception,
2354 [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception, 2606 [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception,
2355 [SVM_EXIT_INTR] = intr_interception, 2607 [SVM_EXIT_INTR] = intr_interception,
2356 [SVM_EXIT_NMI] = nmi_interception, 2608 [SVM_EXIT_NMI] = nmi_interception,
2357 [SVM_EXIT_SMI] = nop_on_interception, 2609 [SVM_EXIT_SMI] = nop_on_interception,
2358 [SVM_EXIT_INIT] = nop_on_interception, 2610 [SVM_EXIT_INIT] = nop_on_interception,
2359 [SVM_EXIT_VINTR] = interrupt_window_interception, 2611 [SVM_EXIT_VINTR] = interrupt_window_interception,
2360 /* [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, */
2361 [SVM_EXIT_CPUID] = cpuid_interception, 2612 [SVM_EXIT_CPUID] = cpuid_interception,
2362 [SVM_EXIT_IRET] = iret_interception, 2613 [SVM_EXIT_IRET] = iret_interception,
2363 [SVM_EXIT_INVD] = emulate_on_interception, 2614 [SVM_EXIT_INVD] = emulate_on_interception,
@@ -2365,7 +2616,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
2365 [SVM_EXIT_HLT] = halt_interception, 2616 [SVM_EXIT_HLT] = halt_interception,
2366 [SVM_EXIT_INVLPG] = invlpg_interception, 2617 [SVM_EXIT_INVLPG] = invlpg_interception,
2367 [SVM_EXIT_INVLPGA] = invlpga_interception, 2618 [SVM_EXIT_INVLPGA] = invlpga_interception,
2368 [SVM_EXIT_IOIO] = io_interception, 2619 [SVM_EXIT_IOIO] = io_interception,
2369 [SVM_EXIT_MSR] = msr_interception, 2620 [SVM_EXIT_MSR] = msr_interception,
2370 [SVM_EXIT_TASK_SWITCH] = task_switch_interception, 2621 [SVM_EXIT_TASK_SWITCH] = task_switch_interception,
2371 [SVM_EXIT_SHUTDOWN] = shutdown_interception, 2622 [SVM_EXIT_SHUTDOWN] = shutdown_interception,
@@ -2388,7 +2639,7 @@ static int handle_exit(struct kvm_vcpu *vcpu)
2388 struct kvm_run *kvm_run = vcpu->run; 2639 struct kvm_run *kvm_run = vcpu->run;
2389 u32 exit_code = svm->vmcb->control.exit_code; 2640 u32 exit_code = svm->vmcb->control.exit_code;
2390 2641
2391 trace_kvm_exit(exit_code, svm->vmcb->save.rip); 2642 trace_kvm_exit(exit_code, vcpu);
2392 2643
2393 if (unlikely(svm->nested.exit_required)) { 2644 if (unlikely(svm->nested.exit_required)) {
2394 nested_svm_vmexit(svm); 2645 nested_svm_vmexit(svm);
@@ -2506,6 +2757,9 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
2506{ 2757{
2507 struct vcpu_svm *svm = to_svm(vcpu); 2758 struct vcpu_svm *svm = to_svm(vcpu);
2508 2759
2760 if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK))
2761 return;
2762
2509 if (irr == -1) 2763 if (irr == -1)
2510 return; 2764 return;
2511 2765
@@ -2563,13 +2817,13 @@ static void enable_irq_window(struct kvm_vcpu *vcpu)
2563{ 2817{
2564 struct vcpu_svm *svm = to_svm(vcpu); 2818 struct vcpu_svm *svm = to_svm(vcpu);
2565 2819
2566 nested_svm_intr(svm); 2820 /*
2567 2821 * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes
2568 /* In case GIF=0 we can't rely on the CPU to tell us when 2822 * 1, because that's a separate STGI/VMRUN intercept. The next time we
2569 * GIF becomes 1, because that's a separate STGI/VMRUN intercept. 2823 * get that intercept, this function will be called again though and
2570 * The next time we get that intercept, this function will be 2824 * we'll get the vintr intercept.
2571 * called again though and we'll get the vintr intercept. */ 2825 */
2572 if (gif_set(svm)) { 2826 if (gif_set(svm) && nested_svm_intr(svm)) {
2573 svm_set_vintr(svm); 2827 svm_set_vintr(svm);
2574 svm_inject_irq(svm, 0x0); 2828 svm_inject_irq(svm, 0x0);
2575 } 2829 }
@@ -2583,12 +2837,15 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
2583 == HF_NMI_MASK) 2837 == HF_NMI_MASK)
2584 return; /* IRET will cause a vm exit */ 2838 return; /* IRET will cause a vm exit */
2585 2839
2586 /* Something prevents NMI from been injected. Single step over 2840 /*
2587 possible problem (IRET or exception injection or interrupt 2841 * Something prevents NMI from been injected. Single step over possible
2588 shadow) */ 2842 * problem (IRET or exception injection or interrupt shadow)
2589 svm->nmi_singlestep = true; 2843 */
2590 svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); 2844 if (gif_set(svm) && nested_svm_nmi(svm)) {
2591 update_db_intercept(vcpu); 2845 svm->nmi_singlestep = true;
2846 svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
2847 update_db_intercept(vcpu);
2848 }
2592} 2849}
2593 2850
2594static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr) 2851static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
@@ -2609,6 +2866,9 @@ static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
2609{ 2866{
2610 struct vcpu_svm *svm = to_svm(vcpu); 2867 struct vcpu_svm *svm = to_svm(vcpu);
2611 2868
2869 if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK))
2870 return;
2871
2612 if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR8_MASK)) { 2872 if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR8_MASK)) {
2613 int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK; 2873 int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
2614 kvm_set_cr8(vcpu, cr8); 2874 kvm_set_cr8(vcpu, cr8);
@@ -2620,6 +2880,9 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
2620 struct vcpu_svm *svm = to_svm(vcpu); 2880 struct vcpu_svm *svm = to_svm(vcpu);
2621 u64 cr8; 2881 u64 cr8;
2622 2882
2883 if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK))
2884 return;
2885
2623 cr8 = kvm_get_cr8(vcpu); 2886 cr8 = kvm_get_cr8(vcpu);
2624 svm->vmcb->control.int_ctl &= ~V_TPR_MASK; 2887 svm->vmcb->control.int_ctl &= ~V_TPR_MASK;
2625 svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK; 2888 svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
@@ -2630,6 +2893,9 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
2630 u8 vector; 2893 u8 vector;
2631 int type; 2894 int type;
2632 u32 exitintinfo = svm->vmcb->control.exit_int_info; 2895 u32 exitintinfo = svm->vmcb->control.exit_int_info;
2896 unsigned int3_injected = svm->int3_injected;
2897
2898 svm->int3_injected = 0;
2633 2899
2634 if (svm->vcpu.arch.hflags & HF_IRET_MASK) 2900 if (svm->vcpu.arch.hflags & HF_IRET_MASK)
2635 svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK); 2901 svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
@@ -2649,12 +2915,21 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
2649 svm->vcpu.arch.nmi_injected = true; 2915 svm->vcpu.arch.nmi_injected = true;
2650 break; 2916 break;
2651 case SVM_EXITINTINFO_TYPE_EXEPT: 2917 case SVM_EXITINTINFO_TYPE_EXEPT:
2652 /* In case of software exception do not reinject an exception
2653 vector, but re-execute and instruction instead */
2654 if (is_nested(svm)) 2918 if (is_nested(svm))
2655 break; 2919 break;
2656 if (kvm_exception_is_soft(vector)) 2920 /*
2921 * In case of software exceptions, do not reinject the vector,
2922 * but re-execute the instruction instead. Rewind RIP first
2923 * if we emulated INT3 before.
2924 */
2925 if (kvm_exception_is_soft(vector)) {
2926 if (vector == BP_VECTOR && int3_injected &&
2927 kvm_is_linear_rip(&svm->vcpu, svm->int3_rip))
2928 kvm_rip_write(&svm->vcpu,
2929 kvm_rip_read(&svm->vcpu) -
2930 int3_injected);
2657 break; 2931 break;
2932 }
2658 if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) { 2933 if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
2659 u32 err = svm->vmcb->control.exit_int_info_err; 2934 u32 err = svm->vmcb->control.exit_int_info_err;
2660 kvm_queue_exception_e(&svm->vcpu, vector, err); 2935 kvm_queue_exception_e(&svm->vcpu, vector, err);
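
svm_complete_interrupts() above now remembers how many bytes an injected INT3 advanced RIP (int3_injected), together with the linear RIP it was injected at, and rewinds RIP before re-executing the instruction if the guest is still sitting at that address. The arithmetic itself is tiny; a standalone sketch:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t rip = 0x401001;        /* RIP after the INT3 was skipped */
	uint64_t int3_rip = 0x401001;   /* linear RIP recorded at injection time */
	unsigned int int3_injected = 1; /* bytes the injected INT3 advanced RIP by */

	/* Re-injection path: rewind only if we really advanced past this INT3. */
	if (int3_injected && rip == int3_rip)
		rip -= int3_injected;

	printf("re-execute at 0x%llx\n", (unsigned long long)rip);
	return 0;
}
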
@@ -2875,24 +3150,24 @@ static void svm_cpuid_update(struct kvm_vcpu *vcpu)
2875} 3150}
2876 3151
2877static const struct trace_print_flags svm_exit_reasons_str[] = { 3152static const struct trace_print_flags svm_exit_reasons_str[] = {
2878 { SVM_EXIT_READ_CR0, "read_cr0" }, 3153 { SVM_EXIT_READ_CR0, "read_cr0" },
2879 { SVM_EXIT_READ_CR3, "read_cr3" }, 3154 { SVM_EXIT_READ_CR3, "read_cr3" },
2880 { SVM_EXIT_READ_CR4, "read_cr4" }, 3155 { SVM_EXIT_READ_CR4, "read_cr4" },
2881 { SVM_EXIT_READ_CR8, "read_cr8" }, 3156 { SVM_EXIT_READ_CR8, "read_cr8" },
2882 { SVM_EXIT_WRITE_CR0, "write_cr0" }, 3157 { SVM_EXIT_WRITE_CR0, "write_cr0" },
2883 { SVM_EXIT_WRITE_CR3, "write_cr3" }, 3158 { SVM_EXIT_WRITE_CR3, "write_cr3" },
2884 { SVM_EXIT_WRITE_CR4, "write_cr4" }, 3159 { SVM_EXIT_WRITE_CR4, "write_cr4" },
2885 { SVM_EXIT_WRITE_CR8, "write_cr8" }, 3160 { SVM_EXIT_WRITE_CR8, "write_cr8" },
2886 { SVM_EXIT_READ_DR0, "read_dr0" }, 3161 { SVM_EXIT_READ_DR0, "read_dr0" },
2887 { SVM_EXIT_READ_DR1, "read_dr1" }, 3162 { SVM_EXIT_READ_DR1, "read_dr1" },
2888 { SVM_EXIT_READ_DR2, "read_dr2" }, 3163 { SVM_EXIT_READ_DR2, "read_dr2" },
2889 { SVM_EXIT_READ_DR3, "read_dr3" }, 3164 { SVM_EXIT_READ_DR3, "read_dr3" },
2890 { SVM_EXIT_WRITE_DR0, "write_dr0" }, 3165 { SVM_EXIT_WRITE_DR0, "write_dr0" },
2891 { SVM_EXIT_WRITE_DR1, "write_dr1" }, 3166 { SVM_EXIT_WRITE_DR1, "write_dr1" },
2892 { SVM_EXIT_WRITE_DR2, "write_dr2" }, 3167 { SVM_EXIT_WRITE_DR2, "write_dr2" },
2893 { SVM_EXIT_WRITE_DR3, "write_dr3" }, 3168 { SVM_EXIT_WRITE_DR3, "write_dr3" },
2894 { SVM_EXIT_WRITE_DR5, "write_dr5" }, 3169 { SVM_EXIT_WRITE_DR5, "write_dr5" },
2895 { SVM_EXIT_WRITE_DR7, "write_dr7" }, 3170 { SVM_EXIT_WRITE_DR7, "write_dr7" },
2896 { SVM_EXIT_EXCP_BASE + DB_VECTOR, "DB excp" }, 3171 { SVM_EXIT_EXCP_BASE + DB_VECTOR, "DB excp" },
2897 { SVM_EXIT_EXCP_BASE + BP_VECTOR, "BP excp" }, 3172 { SVM_EXIT_EXCP_BASE + BP_VECTOR, "BP excp" },
2898 { SVM_EXIT_EXCP_BASE + UD_VECTOR, "UD excp" }, 3173 { SVM_EXIT_EXCP_BASE + UD_VECTOR, "UD excp" },
@@ -2941,8 +3216,10 @@ static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
2941{ 3216{
2942 struct vcpu_svm *svm = to_svm(vcpu); 3217 struct vcpu_svm *svm = to_svm(vcpu);
2943 3218
2944 update_cr0_intercept(svm);
2945 svm->vmcb->control.intercept_exceptions |= 1 << NM_VECTOR; 3219 svm->vmcb->control.intercept_exceptions |= 1 << NM_VECTOR;
3220 if (is_nested(svm))
3221 svm->nested.hsave->control.intercept_exceptions |= 1 << NM_VECTOR;
3222 update_cr0_intercept(svm);
2946} 3223}
2947 3224
2948static struct kvm_x86_ops svm_x86_ops = { 3225static struct kvm_x86_ops svm_x86_ops = {
@@ -2981,8 +3258,7 @@ static struct kvm_x86_ops svm_x86_ops = {
2981 .set_idt = svm_set_idt, 3258 .set_idt = svm_set_idt,
2982 .get_gdt = svm_get_gdt, 3259 .get_gdt = svm_get_gdt,
2983 .set_gdt = svm_set_gdt, 3260 .set_gdt = svm_set_gdt,
2984 .get_dr = svm_get_dr, 3261 .set_dr7 = svm_set_dr7,
2985 .set_dr = svm_set_dr,
2986 .cache_reg = svm_cache_reg, 3262 .cache_reg = svm_cache_reg,
2987 .get_rflags = svm_get_rflags, 3263 .get_rflags = svm_get_rflags,
2988 .set_rflags = svm_set_rflags, 3264 .set_rflags = svm_set_rflags,
diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c
index eea40439066c..4ddadb1a5ffe 100644
--- a/arch/x86/kvm/timer.c
+++ b/arch/x86/kvm/timer.c
@@ -12,7 +12,8 @@ static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer)
12 /* 12 /*
13 * There is a race window between reading and incrementing, but we do 13 * There is a race window between reading and incrementing, but we do
14 * not care about potentially loosing timer events in the !reinject 14 * not care about potentially loosing timer events in the !reinject
15 * case anyway. 15 * case anyway. Note: KVM_REQ_PENDING_TIMER is implicitly checked
16 * in vcpu_enter_guest.
16 */ 17 */
17 if (ktimer->reinject || !atomic_read(&ktimer->pending)) { 18 if (ktimer->reinject || !atomic_read(&ktimer->pending)) {
18 atomic_inc(&ktimer->pending); 19 atomic_inc(&ktimer->pending);
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 6ad30a29f044..a6544b8e7c0f 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -5,8 +5,6 @@
5 5
6#undef TRACE_SYSTEM 6#undef TRACE_SYSTEM
7#define TRACE_SYSTEM kvm 7#define TRACE_SYSTEM kvm
8#define TRACE_INCLUDE_PATH arch/x86/kvm
9#define TRACE_INCLUDE_FILE trace
10 8
11/* 9/*
12 * Tracepoint for guest mode entry. 10 * Tracepoint for guest mode entry.
@@ -184,8 +182,8 @@ TRACE_EVENT(kvm_apic,
184 * Tracepoint for kvm guest exit: 182 * Tracepoint for kvm guest exit:
185 */ 183 */
186TRACE_EVENT(kvm_exit, 184TRACE_EVENT(kvm_exit,
187 TP_PROTO(unsigned int exit_reason, unsigned long guest_rip), 185 TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu),
188 TP_ARGS(exit_reason, guest_rip), 186 TP_ARGS(exit_reason, vcpu),
189 187
190 TP_STRUCT__entry( 188 TP_STRUCT__entry(
191 __field( unsigned int, exit_reason ) 189 __field( unsigned int, exit_reason )
@@ -194,7 +192,7 @@ TRACE_EVENT(kvm_exit,
194 192
195 TP_fast_assign( 193 TP_fast_assign(
196 __entry->exit_reason = exit_reason; 194 __entry->exit_reason = exit_reason;
197 __entry->guest_rip = guest_rip; 195 __entry->guest_rip = kvm_rip_read(vcpu);
198 ), 196 ),
199 197
200 TP_printk("reason %s rip 0x%lx", 198 TP_printk("reason %s rip 0x%lx",
@@ -221,6 +219,38 @@ TRACE_EVENT(kvm_inj_virq,
221 TP_printk("irq %u", __entry->irq) 219 TP_printk("irq %u", __entry->irq)
222); 220);
223 221
222#define EXS(x) { x##_VECTOR, "#" #x }
223
224#define kvm_trace_sym_exc \
225 EXS(DE), EXS(DB), EXS(BP), EXS(OF), EXS(BR), EXS(UD), EXS(NM), \
226 EXS(DF), EXS(TS), EXS(NP), EXS(SS), EXS(GP), EXS(PF), \
227 EXS(MF), EXS(MC)
228
229/*
230 * Tracepoint for kvm interrupt injection:
231 */
232TRACE_EVENT(kvm_inj_exception,
233 TP_PROTO(unsigned exception, bool has_error, unsigned error_code),
234 TP_ARGS(exception, has_error, error_code),
235
236 TP_STRUCT__entry(
237 __field( u8, exception )
238 __field( u8, has_error )
239 __field( u32, error_code )
240 ),
241
242 TP_fast_assign(
243 __entry->exception = exception;
244 __entry->has_error = has_error;
245 __entry->error_code = error_code;
246 ),
247
248 TP_printk("%s (0x%x)",
249 __print_symbolic(__entry->exception, kvm_trace_sym_exc),
250 /* FIXME: don't print error_code if not present */
251 __entry->has_error ? __entry->error_code : 0)
252);
253
224/* 254/*
225 * Tracepoint for page fault. 255 * Tracepoint for page fault.
226 */ 256 */
@@ -413,12 +443,34 @@ TRACE_EVENT(kvm_nested_vmrun,
413 ), 443 ),
414 444
415 TP_printk("rip: 0x%016llx vmcb: 0x%016llx nrip: 0x%016llx int_ctl: 0x%08x " 445 TP_printk("rip: 0x%016llx vmcb: 0x%016llx nrip: 0x%016llx int_ctl: 0x%08x "
416 "event_inj: 0x%08x npt: %s\n", 446 "event_inj: 0x%08x npt: %s",
417 __entry->rip, __entry->vmcb, __entry->nested_rip, 447 __entry->rip, __entry->vmcb, __entry->nested_rip,
418 __entry->int_ctl, __entry->event_inj, 448 __entry->int_ctl, __entry->event_inj,
419 __entry->npt ? "on" : "off") 449 __entry->npt ? "on" : "off")
420); 450);
421 451
452TRACE_EVENT(kvm_nested_intercepts,
453 TP_PROTO(__u16 cr_read, __u16 cr_write, __u32 exceptions, __u64 intercept),
454 TP_ARGS(cr_read, cr_write, exceptions, intercept),
455
456 TP_STRUCT__entry(
457 __field( __u16, cr_read )
458 __field( __u16, cr_write )
459 __field( __u32, exceptions )
460 __field( __u64, intercept )
461 ),
462
463 TP_fast_assign(
464 __entry->cr_read = cr_read;
465 __entry->cr_write = cr_write;
466 __entry->exceptions = exceptions;
467 __entry->intercept = intercept;
468 ),
469
470 TP_printk("cr_read: %04x cr_write: %04x excp: %08x intercept: %016llx",
471 __entry->cr_read, __entry->cr_write, __entry->exceptions,
472 __entry->intercept)
473);
422/* 474/*
423 * Tracepoint for #VMEXIT while nested 475 * Tracepoint for #VMEXIT while nested
424 */ 476 */
@@ -447,7 +499,7 @@ TRACE_EVENT(kvm_nested_vmexit,
447 __entry->exit_int_info_err = exit_int_info_err; 499 __entry->exit_int_info_err = exit_int_info_err;
448 ), 500 ),
449 TP_printk("rip: 0x%016llx reason: %s ext_inf1: 0x%016llx " 501 TP_printk("rip: 0x%016llx reason: %s ext_inf1: 0x%016llx "
450 "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x\n", 502 "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x",
451 __entry->rip, 503 __entry->rip,
452 ftrace_print_symbols_seq(p, __entry->exit_code, 504 ftrace_print_symbols_seq(p, __entry->exit_code,
453 kvm_x86_ops->exit_reasons_str), 505 kvm_x86_ops->exit_reasons_str),
@@ -482,7 +534,7 @@ TRACE_EVENT(kvm_nested_vmexit_inject,
482 ), 534 ),
483 535
484 TP_printk("reason: %s ext_inf1: 0x%016llx " 536 TP_printk("reason: %s ext_inf1: 0x%016llx "
485 "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x\n", 537 "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x",
486 ftrace_print_symbols_seq(p, __entry->exit_code, 538 ftrace_print_symbols_seq(p, __entry->exit_code,
487 kvm_x86_ops->exit_reasons_str), 539 kvm_x86_ops->exit_reasons_str),
488 __entry->exit_info1, __entry->exit_info2, 540 __entry->exit_info1, __entry->exit_info2,
@@ -504,7 +556,7 @@ TRACE_EVENT(kvm_nested_intr_vmexit,
504 __entry->rip = rip 556 __entry->rip = rip
505 ), 557 ),
506 558
507 TP_printk("rip: 0x%016llx\n", __entry->rip) 559 TP_printk("rip: 0x%016llx", __entry->rip)
508); 560);
509 561
510/* 562/*
@@ -526,7 +578,7 @@ TRACE_EVENT(kvm_invlpga,
526 __entry->address = address; 578 __entry->address = address;
527 ), 579 ),
528 580
529 TP_printk("rip: 0x%016llx asid: %d address: 0x%016llx\n", 581 TP_printk("rip: 0x%016llx asid: %d address: 0x%016llx",
530 __entry->rip, __entry->asid, __entry->address) 582 __entry->rip, __entry->asid, __entry->address)
531); 583);
532 584
@@ -547,11 +599,102 @@ TRACE_EVENT(kvm_skinit,
547 __entry->slb = slb; 599 __entry->slb = slb;
548 ), 600 ),
549 601
550 TP_printk("rip: 0x%016llx slb: 0x%08x\n", 602 TP_printk("rip: 0x%016llx slb: 0x%08x",
551 __entry->rip, __entry->slb) 603 __entry->rip, __entry->slb)
552); 604);
553 605
606#define __print_insn(insn, ilen) ({ \
607 int i; \
608 const char *ret = p->buffer + p->len; \
609 \
610 for (i = 0; i < ilen; ++i) \
611 trace_seq_printf(p, " %02x", insn[i]); \
612 trace_seq_printf(p, "%c", 0); \
613 ret; \
614 })
615
616#define KVM_EMUL_INSN_F_CR0_PE (1 << 0)
617#define KVM_EMUL_INSN_F_EFL_VM (1 << 1)
618#define KVM_EMUL_INSN_F_CS_D (1 << 2)
619#define KVM_EMUL_INSN_F_CS_L (1 << 3)
620
621#define kvm_trace_symbol_emul_flags \
622 { 0, "real" }, \
623 { KVM_EMUL_INSN_F_CR0_PE \
624 | KVM_EMUL_INSN_F_EFL_VM, "vm16" }, \
625 { KVM_EMUL_INSN_F_CR0_PE, "prot16" }, \
626 { KVM_EMUL_INSN_F_CR0_PE \
627 | KVM_EMUL_INSN_F_CS_D, "prot32" }, \
628 { KVM_EMUL_INSN_F_CR0_PE \
629 | KVM_EMUL_INSN_F_CS_L, "prot64" }
630
631#define kei_decode_mode(mode) ({ \
632 u8 flags = 0xff; \
633 switch (mode) { \
634 case X86EMUL_MODE_REAL: \
635 flags = 0; \
636 break; \
637 case X86EMUL_MODE_VM86: \
638 flags = KVM_EMUL_INSN_F_EFL_VM; \
639 break; \
640 case X86EMUL_MODE_PROT16: \
641 flags = KVM_EMUL_INSN_F_CR0_PE; \
642 break; \
643 case X86EMUL_MODE_PROT32: \
644 flags = KVM_EMUL_INSN_F_CR0_PE \
645 | KVM_EMUL_INSN_F_CS_D; \
646 break; \
647 case X86EMUL_MODE_PROT64: \
648 flags = KVM_EMUL_INSN_F_CR0_PE \
649 | KVM_EMUL_INSN_F_CS_L; \
650 break; \
651 } \
652 flags; \
653 })
654
655TRACE_EVENT(kvm_emulate_insn,
656 TP_PROTO(struct kvm_vcpu *vcpu, __u8 failed),
657 TP_ARGS(vcpu, failed),
658
659 TP_STRUCT__entry(
660 __field( __u64, rip )
661 __field( __u32, csbase )
662 __field( __u8, len )
663 __array( __u8, insn, 15 )
664 __field( __u8, flags )
665 __field( __u8, failed )
666 ),
667
668 TP_fast_assign(
669 __entry->rip = vcpu->arch.emulate_ctxt.decode.fetch.start;
670 __entry->csbase = kvm_x86_ops->get_segment_base(vcpu, VCPU_SREG_CS);
671 __entry->len = vcpu->arch.emulate_ctxt.decode.eip
672 - vcpu->arch.emulate_ctxt.decode.fetch.start;
673 memcpy(__entry->insn,
674 vcpu->arch.emulate_ctxt.decode.fetch.data,
675 15);
676 __entry->flags = kei_decode_mode(vcpu->arch.emulate_ctxt.mode);
677 __entry->failed = failed;
678 ),
679
680 TP_printk("%x:%llx:%s (%s)%s",
681 __entry->csbase, __entry->rip,
682 __print_insn(__entry->insn, __entry->len),
683 __print_symbolic(__entry->flags,
684 kvm_trace_symbol_emul_flags),
685 __entry->failed ? " failed" : ""
686 )
687 );
688
689#define trace_kvm_emulate_insn_start(vcpu) trace_kvm_emulate_insn(vcpu, 0)
690#define trace_kvm_emulate_insn_failed(vcpu) trace_kvm_emulate_insn(vcpu, 1)
691
554#endif /* _TRACE_KVM_H */ 692#endif /* _TRACE_KVM_H */
555 693
694#undef TRACE_INCLUDE_PATH
695#define TRACE_INCLUDE_PATH arch/x86/kvm
696#undef TRACE_INCLUDE_FILE
697#define TRACE_INCLUDE_FILE trace
698
556/* This part must be outside protection */ 699/* This part must be outside protection */
557#include <trace/define_trace.h> 700#include <trace/define_trace.h>
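
kei_decode_mode() above folds the emulator mode into CR0_PE/EFL_VM/CS_D/CS_L flag bits so kvm_emulate_insn can label an instruction as real, vm16, prot16, prot32 or prot64. A simplified, self-consistent sketch of that mapping; the enum and flag names below are local, not the kernel's:

#include <stdio.h>

#define F_CR0_PE (1 << 0)
#define F_EFL_VM (1 << 1)
#define F_CS_D   (1 << 2)
#define F_CS_L   (1 << 3)

enum emul_mode { MODE_REAL, MODE_VM86, MODE_PROT16, MODE_PROT32, MODE_PROT64 };

static unsigned decode_mode(enum emul_mode mode)
{
	switch (mode) {
	case MODE_REAL:   return 0;
	case MODE_VM86:   return F_EFL_VM;
	case MODE_PROT16: return F_CR0_PE;
	case MODE_PROT32: return F_CR0_PE | F_CS_D;
	case MODE_PROT64: return F_CR0_PE | F_CS_L;
	}
	return 0xff;
}

static const char *mode_name(unsigned flags)
{
	if (flags & F_EFL_VM)
		return "vm16";
	if (flags & F_CS_L)
		return "prot64";
	if (flags & F_CS_D)
		return "prot32";
	if (flags & F_CR0_PE)
		return "prot16";
	return "real";
}

int main(void)
{
	printf("%s\n", mode_name(decode_mode(MODE_PROT64)));
	return 0;
}
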
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 82be6dac3d25..54c0035a63f0 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -77,6 +77,8 @@ module_param(emulate_invalid_guest_state, bool, S_IRUGO);
77#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) 77#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
78#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE) 78#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
79 79
80#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))
81
80/* 82/*
81 * These 2 parameters are used to config the controls for Pause-Loop Exiting: 83 * These 2 parameters are used to config the controls for Pause-Loop Exiting:
82 * ple_gap: upper bound on the amount of time between two successive 84 * ple_gap: upper bound on the amount of time between two successive
@@ -131,7 +133,7 @@ struct vcpu_vmx {
131 } host_state; 133 } host_state;
132 struct { 134 struct {
133 int vm86_active; 135 int vm86_active;
134 u8 save_iopl; 136 ulong save_rflags;
135 struct kvm_save_segment { 137 struct kvm_save_segment {
136 u16 selector; 138 u16 selector;
137 unsigned long base; 139 unsigned long base;
@@ -232,56 +234,56 @@ static const u32 vmx_msr_index[] = {
232}; 234};
233#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index) 235#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
234 236
235static inline int is_page_fault(u32 intr_info) 237static inline bool is_page_fault(u32 intr_info)
236{ 238{
237 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | 239 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
238 INTR_INFO_VALID_MASK)) == 240 INTR_INFO_VALID_MASK)) ==
239 (INTR_TYPE_HARD_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK); 241 (INTR_TYPE_HARD_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK);
240} 242}
241 243
242static inline int is_no_device(u32 intr_info) 244static inline bool is_no_device(u32 intr_info)
243{ 245{
244 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | 246 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
245 INTR_INFO_VALID_MASK)) == 247 INTR_INFO_VALID_MASK)) ==
246 (INTR_TYPE_HARD_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK); 248 (INTR_TYPE_HARD_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
247} 249}
248 250
249static inline int is_invalid_opcode(u32 intr_info) 251static inline bool is_invalid_opcode(u32 intr_info)
250{ 252{
251 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | 253 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
252 INTR_INFO_VALID_MASK)) == 254 INTR_INFO_VALID_MASK)) ==
253 (INTR_TYPE_HARD_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK); 255 (INTR_TYPE_HARD_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK);
254} 256}
255 257
256static inline int is_external_interrupt(u32 intr_info) 258static inline bool is_external_interrupt(u32 intr_info)
257{ 259{
258 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) 260 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
259 == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); 261 == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
260} 262}
261 263
262static inline int is_machine_check(u32 intr_info) 264static inline bool is_machine_check(u32 intr_info)
263{ 265{
264 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | 266 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
265 INTR_INFO_VALID_MASK)) == 267 INTR_INFO_VALID_MASK)) ==
266 (INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK); 268 (INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK);
267} 269}
268 270
269static inline int cpu_has_vmx_msr_bitmap(void) 271static inline bool cpu_has_vmx_msr_bitmap(void)
270{ 272{
271 return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS; 273 return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS;
272} 274}
273 275
274static inline int cpu_has_vmx_tpr_shadow(void) 276static inline bool cpu_has_vmx_tpr_shadow(void)
275{ 277{
276 return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW; 278 return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW;
277} 279}
278 280
279static inline int vm_need_tpr_shadow(struct kvm *kvm) 281static inline bool vm_need_tpr_shadow(struct kvm *kvm)
280{ 282{
281 return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm)); 283 return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm));
282} 284}
283 285
284static inline int cpu_has_secondary_exec_ctrls(void) 286static inline bool cpu_has_secondary_exec_ctrls(void)
285{ 287{
286 return vmcs_config.cpu_based_exec_ctrl & 288 return vmcs_config.cpu_based_exec_ctrl &
287 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; 289 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
@@ -301,80 +303,80 @@ static inline bool cpu_has_vmx_flexpriority(void)
301 303
302static inline bool cpu_has_vmx_ept_execute_only(void) 304static inline bool cpu_has_vmx_ept_execute_only(void)
303{ 305{
304 return !!(vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT); 306 return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT;
305} 307}
306 308
307static inline bool cpu_has_vmx_eptp_uncacheable(void) 309static inline bool cpu_has_vmx_eptp_uncacheable(void)
308{ 310{
309 return !!(vmx_capability.ept & VMX_EPTP_UC_BIT); 311 return vmx_capability.ept & VMX_EPTP_UC_BIT;
310} 312}
311 313
312static inline bool cpu_has_vmx_eptp_writeback(void) 314static inline bool cpu_has_vmx_eptp_writeback(void)
313{ 315{
314 return !!(vmx_capability.ept & VMX_EPTP_WB_BIT); 316 return vmx_capability.ept & VMX_EPTP_WB_BIT;
315} 317}
316 318
317static inline bool cpu_has_vmx_ept_2m_page(void) 319static inline bool cpu_has_vmx_ept_2m_page(void)
318{ 320{
319 return !!(vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT); 321 return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT;
320} 322}
321 323
322static inline bool cpu_has_vmx_ept_1g_page(void) 324static inline bool cpu_has_vmx_ept_1g_page(void)
323{ 325{
324 return !!(vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT); 326 return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT;
325} 327}
326 328
327static inline int cpu_has_vmx_invept_individual_addr(void) 329static inline bool cpu_has_vmx_invept_individual_addr(void)
328{ 330{
329 return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT); 331 return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT;
330} 332}
331 333
332static inline int cpu_has_vmx_invept_context(void) 334static inline bool cpu_has_vmx_invept_context(void)
333{ 335{
334 return !!(vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT); 336 return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT;
335} 337}
336 338
337static inline int cpu_has_vmx_invept_global(void) 339static inline bool cpu_has_vmx_invept_global(void)
338{ 340{
339 return !!(vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT); 341 return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT;
340} 342}
341 343
342static inline int cpu_has_vmx_ept(void) 344static inline bool cpu_has_vmx_ept(void)
343{ 345{
344 return vmcs_config.cpu_based_2nd_exec_ctrl & 346 return vmcs_config.cpu_based_2nd_exec_ctrl &
345 SECONDARY_EXEC_ENABLE_EPT; 347 SECONDARY_EXEC_ENABLE_EPT;
346} 348}
347 349
348static inline int cpu_has_vmx_unrestricted_guest(void) 350static inline bool cpu_has_vmx_unrestricted_guest(void)
349{ 351{
350 return vmcs_config.cpu_based_2nd_exec_ctrl & 352 return vmcs_config.cpu_based_2nd_exec_ctrl &
351 SECONDARY_EXEC_UNRESTRICTED_GUEST; 353 SECONDARY_EXEC_UNRESTRICTED_GUEST;
352} 354}
353 355
354static inline int cpu_has_vmx_ple(void) 356static inline bool cpu_has_vmx_ple(void)
355{ 357{
356 return vmcs_config.cpu_based_2nd_exec_ctrl & 358 return vmcs_config.cpu_based_2nd_exec_ctrl &
357 SECONDARY_EXEC_PAUSE_LOOP_EXITING; 359 SECONDARY_EXEC_PAUSE_LOOP_EXITING;
358} 360}
359 361
360static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm) 362static inline bool vm_need_virtualize_apic_accesses(struct kvm *kvm)
361{ 363{
362 return flexpriority_enabled && irqchip_in_kernel(kvm); 364 return flexpriority_enabled && irqchip_in_kernel(kvm);
363} 365}
364 366
365static inline int cpu_has_vmx_vpid(void) 367static inline bool cpu_has_vmx_vpid(void)
366{ 368{
367 return vmcs_config.cpu_based_2nd_exec_ctrl & 369 return vmcs_config.cpu_based_2nd_exec_ctrl &
368 SECONDARY_EXEC_ENABLE_VPID; 370 SECONDARY_EXEC_ENABLE_VPID;
369} 371}
370 372
371static inline int cpu_has_vmx_rdtscp(void) 373static inline bool cpu_has_vmx_rdtscp(void)
372{ 374{
373 return vmcs_config.cpu_based_2nd_exec_ctrl & 375 return vmcs_config.cpu_based_2nd_exec_ctrl &
374 SECONDARY_EXEC_RDTSCP; 376 SECONDARY_EXEC_RDTSCP;
375} 377}
376 378
377static inline int cpu_has_virtual_nmis(void) 379static inline bool cpu_has_virtual_nmis(void)
378{ 380{
379 return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; 381 return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
380} 382}
@@ -598,11 +600,11 @@ static void reload_tss(void)
598 /* 600 /*
599 * VT restores TR but not its size. Useless. 601 * VT restores TR but not its size. Useless.
600 */ 602 */
601 struct descriptor_table gdt; 603 struct desc_ptr gdt;
602 struct desc_struct *descs; 604 struct desc_struct *descs;
603 605
604 kvm_get_gdt(&gdt); 606 native_store_gdt(&gdt);
605 descs = (void *)gdt.base; 607 descs = (void *)gdt.address;
606 descs[GDT_ENTRY_TSS].type = 9; /* available TSS */ 608 descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
607 load_TR_desc(); 609 load_TR_desc();
608} 610}
@@ -632,6 +634,43 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
632 return true; 634 return true;
633} 635}
634 636
637static unsigned long segment_base(u16 selector)
638{
639 struct desc_ptr gdt;
640 struct desc_struct *d;
641 unsigned long table_base;
642 unsigned long v;
643
644 if (!(selector & ~3))
645 return 0;
646
647 native_store_gdt(&gdt);
648 table_base = gdt.address;
649
650 if (selector & 4) { /* from ldt */
651 u16 ldt_selector = kvm_read_ldt();
652
653 if (!(ldt_selector & ~3))
654 return 0;
655
656 table_base = segment_base(ldt_selector);
657 }
658 d = (struct desc_struct *)(table_base + (selector & ~7));
659 v = get_desc_base(d);
660#ifdef CONFIG_X86_64
661 if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
662 v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
663#endif
664 return v;
665}
666
667static inline unsigned long kvm_read_tr_base(void)
668{
669 u16 tr;
670 asm("str %0" : "=g"(tr));
671 return segment_base(tr);
672}
673
635static void vmx_save_host_state(struct kvm_vcpu *vcpu) 674static void vmx_save_host_state(struct kvm_vcpu *vcpu)
636{ 675{
637 struct vcpu_vmx *vmx = to_vmx(vcpu); 676 struct vcpu_vmx *vmx = to_vmx(vcpu);
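
segment_base() above walks the GDT, or the LDT when the selector's TI bit is set, and reassembles the descriptor base, including the extra base[63:32] dword that 64-bit system descriptors (TSS/LDT) carry. A standalone sketch of the bit layout it relies on:

#include <stdint.h>
#include <stdio.h>

/* base[15:0] sits in bits 16..31, base[23:16] in bits 32..39,
 * base[31:24] in bits 56..63 of a legacy 8-byte descriptor. */
static uint32_t desc_base32(uint64_t d)
{
	return (uint32_t)(((d >> 16) & 0xffffff) | (((d >> 56) & 0xff) << 24));
}

/* 64-bit TSS/LDT descriptors are 16 bytes; the second quadword's low
 * 32 bits hold base[63:32]. */
static uint64_t desc_base64(uint64_t lo, uint64_t hi)
{
	return desc_base32(lo) | ((hi & 0xffffffffULL) << 32);
}

static uint64_t make_desc(uint32_t base)
{
	return ((uint64_t)(base & 0xffff) << 16) |
	       ((uint64_t)((base >> 16) & 0xff) << 32) |
	       ((uint64_t)(base >> 24) << 56);
}

int main(void)
{
	uint64_t lo = make_desc(0xdeadbeef);

	printf("base32 = 0x%x\n", desc_base32(lo));              /* 0xdeadbeef */
	printf("base64 = 0x%llx\n",
	       (unsigned long long)desc_base64(lo, 0x1));        /* 0x1deadbeef */
	return 0;
}
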
@@ -756,7 +795,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
756 } 795 }
757 796
758 if (vcpu->cpu != cpu) { 797 if (vcpu->cpu != cpu) {
759 struct descriptor_table dt; 798 struct desc_ptr dt;
760 unsigned long sysenter_esp; 799 unsigned long sysenter_esp;
761 800
762 vcpu->cpu = cpu; 801 vcpu->cpu = cpu;
@@ -765,8 +804,8 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
765 * processors. 804 * processors.
766 */ 805 */
767 vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */ 806 vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */
768 kvm_get_gdt(&dt); 807 native_store_gdt(&dt);
769 vmcs_writel(HOST_GDTR_BASE, dt.base); /* 22.2.4 */ 808 vmcs_writel(HOST_GDTR_BASE, dt.address); /* 22.2.4 */
770 809
771 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); 810 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
772 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ 811 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
@@ -818,18 +857,23 @@ static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
818 857
819static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) 858static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
820{ 859{
821 unsigned long rflags; 860 unsigned long rflags, save_rflags;
822 861
823 rflags = vmcs_readl(GUEST_RFLAGS); 862 rflags = vmcs_readl(GUEST_RFLAGS);
824 if (to_vmx(vcpu)->rmode.vm86_active) 863 if (to_vmx(vcpu)->rmode.vm86_active) {
825 rflags &= ~(unsigned long)(X86_EFLAGS_IOPL | X86_EFLAGS_VM); 864 rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
865 save_rflags = to_vmx(vcpu)->rmode.save_rflags;
866 rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
867 }
826 return rflags; 868 return rflags;
827} 869}
828 870
829static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) 871static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
830{ 872{
831 if (to_vmx(vcpu)->rmode.vm86_active) 873 if (to_vmx(vcpu)->rmode.vm86_active) {
874 to_vmx(vcpu)->rmode.save_rflags = rflags;
832 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; 875 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
876 }
833 vmcs_writel(GUEST_RFLAGS, rflags); 877 vmcs_writel(GUEST_RFLAGS, rflags);
834} 878}
835 879
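
With the change above, VMX keeps the full guest RFLAGS in rmode.save_rflags while vm86-based real-mode emulation is active, rather than only the IOPL field. Hardware owns every bit except IOPL and VM: reads stitch the two copies together, writes stash the guest value and force IOPL=3 and VM=1 into the VMCS. A standalone sketch of the merge, using the architectural IOPL and VM bit values:

#include <stdint.h>
#include <stdio.h>

#define X86_EFLAGS_IOPL 0x3000UL   /* bits 12-13 */
#define X86_EFLAGS_VM   0x20000UL  /* bit 17 */
#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))

static unsigned long hw_rflags;    /* stands in for the VMCS GUEST_RFLAGS field */
static unsigned long save_rflags;  /* stands in for rmode.save_rflags */

static void set_rflags_rmode(unsigned long rflags)
{
	save_rflags = rflags;
	hw_rflags = rflags | X86_EFLAGS_IOPL | X86_EFLAGS_VM;
}

static unsigned long get_rflags_rmode(void)
{
	unsigned long rflags = hw_rflags & RMODE_GUEST_OWNED_EFLAGS_BITS;

	return rflags | (save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS);
}

int main(void)
{
	set_rflags_rmode(0x202);      /* guest sets IF, IOPL 0, VM clear */
	printf("hw    = 0x%lx\n", hw_rflags);           /* 0x23202 */
	printf("guest = 0x%lx\n", get_rflags_rmode());  /* 0x202 again */
	return 0;
}
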
@@ -839,9 +883,9 @@ static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
839 int ret = 0; 883 int ret = 0;
840 884
841 if (interruptibility & GUEST_INTR_STATE_STI) 885 if (interruptibility & GUEST_INTR_STATE_STI)
842 ret |= X86_SHADOW_INT_STI; 886 ret |= KVM_X86_SHADOW_INT_STI;
843 if (interruptibility & GUEST_INTR_STATE_MOV_SS) 887 if (interruptibility & GUEST_INTR_STATE_MOV_SS)
844 ret |= X86_SHADOW_INT_MOV_SS; 888 ret |= KVM_X86_SHADOW_INT_MOV_SS;
845 889
846 return ret & mask; 890 return ret & mask;
847} 891}
@@ -853,9 +897,9 @@ static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
853 897
854 interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS); 898 interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
855 899
856 if (mask & X86_SHADOW_INT_MOV_SS) 900 if (mask & KVM_X86_SHADOW_INT_MOV_SS)
857 interruptibility |= GUEST_INTR_STATE_MOV_SS; 901 interruptibility |= GUEST_INTR_STATE_MOV_SS;
858 if (mask & X86_SHADOW_INT_STI) 902 else if (mask & KVM_X86_SHADOW_INT_STI)
859 interruptibility |= GUEST_INTR_STATE_STI; 903 interruptibility |= GUEST_INTR_STATE_STI;
860 904
861 if ((interruptibility != interruptibility_old)) 905 if ((interruptibility != interruptibility_old))
@@ -1483,8 +1527,8 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
1483 vmcs_write32(GUEST_TR_AR_BYTES, vmx->rmode.tr.ar); 1527 vmcs_write32(GUEST_TR_AR_BYTES, vmx->rmode.tr.ar);
1484 1528
1485 flags = vmcs_readl(GUEST_RFLAGS); 1529 flags = vmcs_readl(GUEST_RFLAGS);
1486 flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM); 1530 flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
1487 flags |= (vmx->rmode.save_iopl << IOPL_SHIFT); 1531 flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
1488 vmcs_writel(GUEST_RFLAGS, flags); 1532 vmcs_writel(GUEST_RFLAGS, flags);
1489 1533
1490 vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) | 1534 vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
@@ -1514,7 +1558,7 @@ static gva_t rmode_tss_base(struct kvm *kvm)
1514 struct kvm_memslots *slots; 1558 struct kvm_memslots *slots;
1515 gfn_t base_gfn; 1559 gfn_t base_gfn;
1516 1560
1517 slots = rcu_dereference(kvm->memslots); 1561 slots = kvm_memslots(kvm);
1518 base_gfn = kvm->memslots->memslots[0].base_gfn + 1562 base_gfn = kvm->memslots->memslots[0].base_gfn +
1519 kvm->memslots->memslots[0].npages - 3; 1563 kvm->memslots->memslots[0].npages - 3;
1520 return base_gfn << PAGE_SHIFT; 1564 return base_gfn << PAGE_SHIFT;
@@ -1557,8 +1601,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
1557 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); 1601 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
1558 1602
1559 flags = vmcs_readl(GUEST_RFLAGS); 1603 flags = vmcs_readl(GUEST_RFLAGS);
1560 vmx->rmode.save_iopl 1604 vmx->rmode.save_rflags = flags;
1561 = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
1562 1605
1563 flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; 1606 flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
1564 1607
@@ -1928,28 +1971,28 @@ static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
1928 *l = (ar >> 13) & 1; 1971 *l = (ar >> 13) & 1;
1929} 1972}
1930 1973
1931static void vmx_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) 1974static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1932{ 1975{
1933 dt->limit = vmcs_read32(GUEST_IDTR_LIMIT); 1976 dt->size = vmcs_read32(GUEST_IDTR_LIMIT);
1934 dt->base = vmcs_readl(GUEST_IDTR_BASE); 1977 dt->address = vmcs_readl(GUEST_IDTR_BASE);
1935} 1978}
1936 1979
1937static void vmx_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) 1980static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1938{ 1981{
1939 vmcs_write32(GUEST_IDTR_LIMIT, dt->limit); 1982 vmcs_write32(GUEST_IDTR_LIMIT, dt->size);
1940 vmcs_writel(GUEST_IDTR_BASE, dt->base); 1983 vmcs_writel(GUEST_IDTR_BASE, dt->address);
1941} 1984}
1942 1985
1943static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) 1986static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1944{ 1987{
1945 dt->limit = vmcs_read32(GUEST_GDTR_LIMIT); 1988 dt->size = vmcs_read32(GUEST_GDTR_LIMIT);
1946 dt->base = vmcs_readl(GUEST_GDTR_BASE); 1989 dt->address = vmcs_readl(GUEST_GDTR_BASE);
1947} 1990}
1948 1991
1949static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) 1992static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1950{ 1993{
1951 vmcs_write32(GUEST_GDTR_LIMIT, dt->limit); 1994 vmcs_write32(GUEST_GDTR_LIMIT, dt->size);
1952 vmcs_writel(GUEST_GDTR_BASE, dt->base); 1995 vmcs_writel(GUEST_GDTR_BASE, dt->address);
1953} 1996}
1954 1997
1955static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg) 1998static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
@@ -2290,6 +2333,16 @@ static void allocate_vpid(struct vcpu_vmx *vmx)
2290 spin_unlock(&vmx_vpid_lock); 2333 spin_unlock(&vmx_vpid_lock);
2291} 2334}
2292 2335
2336static void free_vpid(struct vcpu_vmx *vmx)
2337{
2338 if (!enable_vpid)
2339 return;
2340 spin_lock(&vmx_vpid_lock);
2341 if (vmx->vpid != 0)
2342 __clear_bit(vmx->vpid, vmx_vpid_bitmap);
2343 spin_unlock(&vmx_vpid_lock);
2344}
2345
2293static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr) 2346static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr)
2294{ 2347{
2295 int f = sizeof(unsigned long); 2348 int f = sizeof(unsigned long);
@@ -2328,7 +2381,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2328 u32 junk; 2381 u32 junk;
2329 u64 host_pat, tsc_this, tsc_base; 2382 u64 host_pat, tsc_this, tsc_base;
2330 unsigned long a; 2383 unsigned long a;
2331 struct descriptor_table dt; 2384 struct desc_ptr dt;
2332 int i; 2385 int i;
2333 unsigned long kvm_vmx_return; 2386 unsigned long kvm_vmx_return;
2334 u32 exec_control; 2387 u32 exec_control;
@@ -2409,8 +2462,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2409 2462
2410 vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ 2463 vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */
2411 2464
2412 kvm_get_idt(&dt); 2465 native_store_idt(&dt);
2413 vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */ 2466 vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */
2414 2467
2415 asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return)); 2468 asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return));
2416 vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */ 2469 vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */
@@ -2942,22 +2995,20 @@ static int handle_io(struct kvm_vcpu *vcpu)
2942 int size, in, string; 2995 int size, in, string;
2943 unsigned port; 2996 unsigned port;
2944 2997
2945 ++vcpu->stat.io_exits;
2946 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 2998 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
2947 string = (exit_qualification & 16) != 0; 2999 string = (exit_qualification & 16) != 0;
3000 in = (exit_qualification & 8) != 0;
2948 3001
2949 if (string) { 3002 ++vcpu->stat.io_exits;
2950 if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO)
2951 return 0;
2952 return 1;
2953 }
2954 3003
2955 size = (exit_qualification & 7) + 1; 3004 if (string || in)
2956 in = (exit_qualification & 8) != 0; 3005 return !(emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO);
2957 port = exit_qualification >> 16;
2958 3006
3007 port = exit_qualification >> 16;
3008 size = (exit_qualification & 7) + 1;
2959 skip_emulated_instruction(vcpu); 3009 skip_emulated_instruction(vcpu);
2960 return kvm_emulate_pio(vcpu, in, size, port); 3010
3011 return kvm_fast_pio_out(vcpu, size, port);
2961} 3012}
2962 3013
2963static void 3014static void
@@ -3048,19 +3099,9 @@ static int handle_cr(struct kvm_vcpu *vcpu)
3048 return 0; 3099 return 0;
3049} 3100}
3050 3101
3051static int check_dr_alias(struct kvm_vcpu *vcpu)
3052{
3053 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
3054 kvm_queue_exception(vcpu, UD_VECTOR);
3055 return -1;
3056 }
3057 return 0;
3058}
3059
3060static int handle_dr(struct kvm_vcpu *vcpu) 3102static int handle_dr(struct kvm_vcpu *vcpu)
3061{ 3103{
3062 unsigned long exit_qualification; 3104 unsigned long exit_qualification;
3063 unsigned long val;
3064 int dr, reg; 3105 int dr, reg;
3065 3106
3066 /* Do not handle if the CPL > 0, will trigger GP on re-entry */ 3107 /* Do not handle if the CPL > 0, will trigger GP on re-entry */
@@ -3095,67 +3136,20 @@ static int handle_dr(struct kvm_vcpu *vcpu)
3095 dr = exit_qualification & DEBUG_REG_ACCESS_NUM; 3136 dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
3096 reg = DEBUG_REG_ACCESS_REG(exit_qualification); 3137 reg = DEBUG_REG_ACCESS_REG(exit_qualification);
3097 if (exit_qualification & TYPE_MOV_FROM_DR) { 3138 if (exit_qualification & TYPE_MOV_FROM_DR) {
3098 switch (dr) { 3139 unsigned long val;
3099 case 0 ... 3: 3140 if (!kvm_get_dr(vcpu, dr, &val))
3100 val = vcpu->arch.db[dr]; 3141 kvm_register_write(vcpu, reg, val);
3101 break; 3142 } else
3102 case 4: 3143 kvm_set_dr(vcpu, dr, vcpu->arch.regs[reg]);
3103 if (check_dr_alias(vcpu) < 0)
3104 return 1;
3105 /* fall through */
3106 case 6:
3107 val = vcpu->arch.dr6;
3108 break;
3109 case 5:
3110 if (check_dr_alias(vcpu) < 0)
3111 return 1;
3112 /* fall through */
3113 default: /* 7 */
3114 val = vcpu->arch.dr7;
3115 break;
3116 }
3117 kvm_register_write(vcpu, reg, val);
3118 } else {
3119 val = vcpu->arch.regs[reg];
3120 switch (dr) {
3121 case 0 ... 3:
3122 vcpu->arch.db[dr] = val;
3123 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
3124 vcpu->arch.eff_db[dr] = val;
3125 break;
3126 case 4:
3127 if (check_dr_alias(vcpu) < 0)
3128 return 1;
3129 /* fall through */
3130 case 6:
3131 if (val & 0xffffffff00000000ULL) {
3132 kvm_inject_gp(vcpu, 0);
3133 return 1;
3134 }
3135 vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
3136 break;
3137 case 5:
3138 if (check_dr_alias(vcpu) < 0)
3139 return 1;
3140 /* fall through */
3141 default: /* 7 */
3142 if (val & 0xffffffff00000000ULL) {
3143 kvm_inject_gp(vcpu, 0);
3144 return 1;
3145 }
3146 vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
3147 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
3148 vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
3149 vcpu->arch.switch_db_regs =
3150 (val & DR7_BP_EN_MASK);
3151 }
3152 break;
3153 }
3154 }
3155 skip_emulated_instruction(vcpu); 3144 skip_emulated_instruction(vcpu);
3156 return 1; 3145 return 1;
3157} 3146}
3158 3147
3148static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
3149{
3150 vmcs_writel(GUEST_DR7, val);
3151}
3152
3159static int handle_cpuid(struct kvm_vcpu *vcpu) 3153static int handle_cpuid(struct kvm_vcpu *vcpu)
3160{ 3154{
3161 kvm_emulate_cpuid(vcpu); 3155 kvm_emulate_cpuid(vcpu);
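
handle_dr() above drops its open-coded switch, including check_dr_alias(), and defers to kvm_get_dr()/kvm_set_dr(), which centralize the rule that DR4/DR5 alias DR6/DR7 unless CR4.DE is set, in which case the access raises #UD. A standalone sketch of just that aliasing rule (the -1 return meaning "inject #UD" is a convention local to this sketch):

#include <stdbool.h>
#include <stdio.h>

/* Returns the debug register actually accessed, or -1 when CR4.DE turns a
 * DR4/DR5 access into #UD instead of aliasing it to DR6/DR7. */
static int resolve_dr(int dr, bool cr4_de)
{
	switch (dr) {
	case 4:
		return cr4_de ? -1 : 6;
	case 5:
		return cr4_de ? -1 : 7;
	default:
		return dr;	/* DR0-DR3, DR6 and DR7 map to themselves */
	}
}

int main(void)
{
	printf("DR4, CR4.DE=0 -> DR%d\n", resolve_dr(4, false)); /* DR6 */
	printf("DR5, CR4.DE=1 -> %d\n",  resolve_dr(5, true));   /* -1: #UD */
	return 0;
}
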
@@ -3287,6 +3281,8 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
3287{ 3281{
3288 struct vcpu_vmx *vmx = to_vmx(vcpu); 3282 struct vcpu_vmx *vmx = to_vmx(vcpu);
3289 unsigned long exit_qualification; 3283 unsigned long exit_qualification;
3284 bool has_error_code = false;
3285 u32 error_code = 0;
3290 u16 tss_selector; 3286 u16 tss_selector;
3291 int reason, type, idt_v; 3287 int reason, type, idt_v;
3292 3288
@@ -3309,6 +3305,13 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
3309 kvm_clear_interrupt_queue(vcpu); 3305 kvm_clear_interrupt_queue(vcpu);
3310 break; 3306 break;
3311 case INTR_TYPE_HARD_EXCEPTION: 3307 case INTR_TYPE_HARD_EXCEPTION:
3308 if (vmx->idt_vectoring_info &
3309 VECTORING_INFO_DELIVER_CODE_MASK) {
3310 has_error_code = true;
3311 error_code =
3312 vmcs_read32(IDT_VECTORING_ERROR_CODE);
3313 }
3314 /* fall through */
3312 case INTR_TYPE_SOFT_EXCEPTION: 3315 case INTR_TYPE_SOFT_EXCEPTION:
3313 kvm_clear_exception_queue(vcpu); 3316 kvm_clear_exception_queue(vcpu);
3314 break; 3317 break;
@@ -3323,8 +3326,13 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
3323 type != INTR_TYPE_NMI_INTR)) 3326 type != INTR_TYPE_NMI_INTR))
3324 skip_emulated_instruction(vcpu); 3327 skip_emulated_instruction(vcpu);
3325 3328
3326 if (!kvm_task_switch(vcpu, tss_selector, reason)) 3329 if (kvm_task_switch(vcpu, tss_selector, reason,
3330 has_error_code, error_code) == EMULATE_FAIL) {
3331 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
3332 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
3333 vcpu->run->internal.ndata = 0;
3327 return 0; 3334 return 0;
3335 }
3328 3336
3329 /* clear all local breakpoint enable flags */ 3337 /* clear all local breakpoint enable flags */
3330 vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~55); 3338 vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~55);
@@ -3569,7 +3577,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
3569 u32 exit_reason = vmx->exit_reason; 3577 u32 exit_reason = vmx->exit_reason;
3570 u32 vectoring_info = vmx->idt_vectoring_info; 3578 u32 vectoring_info = vmx->idt_vectoring_info;
3571 3579
3572 trace_kvm_exit(exit_reason, kvm_rip_read(vcpu)); 3580 trace_kvm_exit(exit_reason, vcpu);
3573 3581
3574 /* If guest state is invalid, start emulating */ 3582 /* If guest state is invalid, start emulating */
3575 if (vmx->emulation_required && emulate_invalid_guest_state) 3583 if (vmx->emulation_required && emulate_invalid_guest_state)
@@ -3918,10 +3926,7 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
3918{ 3926{
3919 struct vcpu_vmx *vmx = to_vmx(vcpu); 3927 struct vcpu_vmx *vmx = to_vmx(vcpu);
3920 3928
3921 spin_lock(&vmx_vpid_lock); 3929 free_vpid(vmx);
3922 if (vmx->vpid != 0)
3923 __clear_bit(vmx->vpid, vmx_vpid_bitmap);
3924 spin_unlock(&vmx_vpid_lock);
3925 vmx_free_vmcs(vcpu); 3930 vmx_free_vmcs(vcpu);
3926 kfree(vmx->guest_msrs); 3931 kfree(vmx->guest_msrs);
3927 kvm_vcpu_uninit(vcpu); 3932 kvm_vcpu_uninit(vcpu);
@@ -3983,6 +3988,7 @@ free_msrs:
3983uninit_vcpu: 3988uninit_vcpu:
3984 kvm_vcpu_uninit(&vmx->vcpu); 3989 kvm_vcpu_uninit(&vmx->vcpu);
3985free_vcpu: 3990free_vcpu:
3991 free_vpid(vmx);
3986 kmem_cache_free(kvm_vcpu_cache, vmx); 3992 kmem_cache_free(kvm_vcpu_cache, vmx);
3987 return ERR_PTR(err); 3993 return ERR_PTR(err);
3988} 3994}
@@ -4149,6 +4155,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
4149 .set_idt = vmx_set_idt, 4155 .set_idt = vmx_set_idt,
4150 .get_gdt = vmx_get_gdt, 4156 .get_gdt = vmx_get_gdt,
4151 .set_gdt = vmx_set_gdt, 4157 .set_gdt = vmx_set_gdt,
4158 .set_dr7 = vmx_set_dr7,
4152 .cache_reg = vmx_cache_reg, 4159 .cache_reg = vmx_cache_reg,
4153 .get_rflags = vmx_get_rflags, 4160 .get_rflags = vmx_get_rflags,
4154 .set_rflags = vmx_set_rflags, 4161 .set_rflags = vmx_set_rflags,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 21b9b6aa3e88..848c814e8c3c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -42,7 +42,7 @@
42#include <linux/slab.h> 42#include <linux/slab.h>
43#include <linux/perf_event.h> 43#include <linux/perf_event.h>
44#include <trace/events/kvm.h> 44#include <trace/events/kvm.h>
45#undef TRACE_INCLUDE_FILE 45
46#define CREATE_TRACE_POINTS 46#define CREATE_TRACE_POINTS
47#include "trace.h" 47#include "trace.h"
48 48
@@ -224,34 +224,6 @@ static void drop_user_return_notifiers(void *ignore)
224 kvm_on_user_return(&smsr->urn); 224 kvm_on_user_return(&smsr->urn);
225} 225}
226 226
227unsigned long segment_base(u16 selector)
228{
229 struct descriptor_table gdt;
230 struct desc_struct *d;
231 unsigned long table_base;
232 unsigned long v;
233
234 if (selector == 0)
235 return 0;
236
237 kvm_get_gdt(&gdt);
238 table_base = gdt.base;
239
240 if (selector & 4) { /* from ldt */
241 u16 ldt_selector = kvm_read_ldt();
242
243 table_base = segment_base(ldt_selector);
244 }
245 d = (struct desc_struct *)(table_base + (selector & ~7));
246 v = get_desc_base(d);
247#ifdef CONFIG_X86_64
248 if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
249 v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
250#endif
251 return v;
252}
253EXPORT_SYMBOL_GPL(segment_base);
254
255u64 kvm_get_apic_base(struct kvm_vcpu *vcpu) 227u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
256{ 228{
257 if (irqchip_in_kernel(vcpu->kvm)) 229 if (irqchip_in_kernel(vcpu->kvm))
@@ -434,8 +406,6 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
434 406
435#ifdef CONFIG_X86_64 407#ifdef CONFIG_X86_64
436 if (cr0 & 0xffffffff00000000UL) { 408 if (cr0 & 0xffffffff00000000UL) {
437 printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
438 cr0, kvm_read_cr0(vcpu));
439 kvm_inject_gp(vcpu, 0); 409 kvm_inject_gp(vcpu, 0);
440 return; 410 return;
441 } 411 }
@@ -444,14 +414,11 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
444 cr0 &= ~CR0_RESERVED_BITS; 414 cr0 &= ~CR0_RESERVED_BITS;
445 415
446 if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) { 416 if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
447 printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
448 kvm_inject_gp(vcpu, 0); 417 kvm_inject_gp(vcpu, 0);
449 return; 418 return;
450 } 419 }
451 420
452 if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) { 421 if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
453 printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
454 "and a clear PE flag\n");
455 kvm_inject_gp(vcpu, 0); 422 kvm_inject_gp(vcpu, 0);
456 return; 423 return;
457 } 424 }
@@ -462,15 +429,11 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
462 int cs_db, cs_l; 429 int cs_db, cs_l;
463 430
464 if (!is_pae(vcpu)) { 431 if (!is_pae(vcpu)) {
465 printk(KERN_DEBUG "set_cr0: #GP, start paging "
466 "in long mode while PAE is disabled\n");
467 kvm_inject_gp(vcpu, 0); 432 kvm_inject_gp(vcpu, 0);
468 return; 433 return;
469 } 434 }
470 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 435 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
471 if (cs_l) { 436 if (cs_l) {
472 printk(KERN_DEBUG "set_cr0: #GP, start paging "
473 "in long mode while CS.L == 1\n");
474 kvm_inject_gp(vcpu, 0); 437 kvm_inject_gp(vcpu, 0);
475 return; 438 return;
476 439
@@ -478,8 +441,6 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
478 } else 441 } else
479#endif 442#endif
480 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) { 443 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
481 printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
482 "reserved bits\n");
483 kvm_inject_gp(vcpu, 0); 444 kvm_inject_gp(vcpu, 0);
484 return; 445 return;
485 } 446 }
@@ -487,7 +448,6 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
487 } 448 }
488 449
489 kvm_x86_ops->set_cr0(vcpu, cr0); 450 kvm_x86_ops->set_cr0(vcpu, cr0);
490 vcpu->arch.cr0 = cr0;
491 451
492 kvm_mmu_reset_context(vcpu); 452 kvm_mmu_reset_context(vcpu);
493 return; 453 return;
@@ -506,34 +466,28 @@ void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
506 unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE; 466 unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
507 467
508 if (cr4 & CR4_RESERVED_BITS) { 468 if (cr4 & CR4_RESERVED_BITS) {
509 printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
510 kvm_inject_gp(vcpu, 0); 469 kvm_inject_gp(vcpu, 0);
511 return; 470 return;
512 } 471 }
513 472
514 if (is_long_mode(vcpu)) { 473 if (is_long_mode(vcpu)) {
515 if (!(cr4 & X86_CR4_PAE)) { 474 if (!(cr4 & X86_CR4_PAE)) {
516 printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
517 "in long mode\n");
518 kvm_inject_gp(vcpu, 0); 475 kvm_inject_gp(vcpu, 0);
519 return; 476 return;
520 } 477 }
521 } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) 478 } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
522 && ((cr4 ^ old_cr4) & pdptr_bits) 479 && ((cr4 ^ old_cr4) & pdptr_bits)
523 && !load_pdptrs(vcpu, vcpu->arch.cr3)) { 480 && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
524 printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
525 kvm_inject_gp(vcpu, 0); 481 kvm_inject_gp(vcpu, 0);
526 return; 482 return;
527 } 483 }
528 484
529 if (cr4 & X86_CR4_VMXE) { 485 if (cr4 & X86_CR4_VMXE) {
530 printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
531 kvm_inject_gp(vcpu, 0); 486 kvm_inject_gp(vcpu, 0);
532 return; 487 return;
533 } 488 }
534 kvm_x86_ops->set_cr4(vcpu, cr4); 489 kvm_x86_ops->set_cr4(vcpu, cr4);
535 vcpu->arch.cr4 = cr4; 490 vcpu->arch.cr4 = cr4;
536 vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled;
537 kvm_mmu_reset_context(vcpu); 491 kvm_mmu_reset_context(vcpu);
538} 492}
539EXPORT_SYMBOL_GPL(kvm_set_cr4); 493EXPORT_SYMBOL_GPL(kvm_set_cr4);
@@ -548,21 +502,16 @@ void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
548 502
549 if (is_long_mode(vcpu)) { 503 if (is_long_mode(vcpu)) {
550 if (cr3 & CR3_L_MODE_RESERVED_BITS) { 504 if (cr3 & CR3_L_MODE_RESERVED_BITS) {
551 printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
552 kvm_inject_gp(vcpu, 0); 505 kvm_inject_gp(vcpu, 0);
553 return; 506 return;
554 } 507 }
555 } else { 508 } else {
556 if (is_pae(vcpu)) { 509 if (is_pae(vcpu)) {
557 if (cr3 & CR3_PAE_RESERVED_BITS) { 510 if (cr3 & CR3_PAE_RESERVED_BITS) {
558 printk(KERN_DEBUG
559 "set_cr3: #GP, reserved bits\n");
560 kvm_inject_gp(vcpu, 0); 511 kvm_inject_gp(vcpu, 0);
561 return; 512 return;
562 } 513 }
563 if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) { 514 if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
564 printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
565 "reserved bits\n");
566 kvm_inject_gp(vcpu, 0); 515 kvm_inject_gp(vcpu, 0);
567 return; 516 return;
568 } 517 }
@@ -594,7 +543,6 @@ EXPORT_SYMBOL_GPL(kvm_set_cr3);
594void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) 543void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
595{ 544{
596 if (cr8 & CR8_RESERVED_BITS) { 545 if (cr8 & CR8_RESERVED_BITS) {
597 printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
598 kvm_inject_gp(vcpu, 0); 546 kvm_inject_gp(vcpu, 0);
599 return; 547 return;
600 } 548 }
@@ -614,6 +562,80 @@ unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
614} 562}
615EXPORT_SYMBOL_GPL(kvm_get_cr8); 563EXPORT_SYMBOL_GPL(kvm_get_cr8);
616 564
565int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
566{
567 switch (dr) {
568 case 0 ... 3:
569 vcpu->arch.db[dr] = val;
570 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
571 vcpu->arch.eff_db[dr] = val;
572 break;
573 case 4:
574 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
575 kvm_queue_exception(vcpu, UD_VECTOR);
576 return 1;
577 }
578 /* fall through */
579 case 6:
580 if (val & 0xffffffff00000000ULL) {
581 kvm_inject_gp(vcpu, 0);
582 return 1;
583 }
584 vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
585 break;
586 case 5:
587 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
588 kvm_queue_exception(vcpu, UD_VECTOR);
589 return 1;
590 }
591 /* fall through */
592 default: /* 7 */
593 if (val & 0xffffffff00000000ULL) {
594 kvm_inject_gp(vcpu, 0);
595 return 1;
596 }
597 vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
598 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
599 kvm_x86_ops->set_dr7(vcpu, vcpu->arch.dr7);
600 vcpu->arch.switch_db_regs = (val & DR7_BP_EN_MASK);
601 }
602 break;
603 }
604
605 return 0;
606}
607EXPORT_SYMBOL_GPL(kvm_set_dr);
608
609int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
610{
611 switch (dr) {
612 case 0 ... 3:
613 *val = vcpu->arch.db[dr];
614 break;
615 case 4:
616 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
617 kvm_queue_exception(vcpu, UD_VECTOR);
618 return 1;
619 }
620 /* fall through */
621 case 6:
622 *val = vcpu->arch.dr6;
623 break;
624 case 5:
625 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
626 kvm_queue_exception(vcpu, UD_VECTOR);
627 return 1;
628 }
629 /* fall through */
630 default: /* 7 */
631 *val = vcpu->arch.dr7;
632 break;
633 }
634
635 return 0;
636}
637EXPORT_SYMBOL_GPL(kvm_get_dr);
638
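For reference, the DR6 handling added above reduces to a mask-and-or over the volatile and fixed-to-one bit sets. A minimal standalone sketch, assuming the architectural DR6 layout (the EX_* names and constants are illustrative, not the kernel's own defines):

#include <stdint.h>

/* Illustrative constants assuming the architectural DR6 layout:
 * bits 0-3 (B0-B3) and 13-15 (BD, BS, BT) are guest-writable,
 * the remaining defined bits read back as 1. */
#define EX_DR6_VOLATILE 0x0000e00fULL
#define EX_DR6_FIXED_1  0xffff0ff0ULL

uint64_t ex_sanitize_dr6(uint64_t val)
{
	/* mirrors the DR6 case in kvm_set_dr(): a write of 0 reads back as 0xffff0ff0 */
	return (val & EX_DR6_VOLATILE) | EX_DR6_FIXED_1;
}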
617static inline u32 bit(int bitno) 639static inline u32 bit(int bitno)
618{ 640{
619 return 1 << (bitno & 31); 641 return 1 << (bitno & 31);
@@ -650,15 +672,12 @@ static u32 emulated_msrs[] = {
650static void set_efer(struct kvm_vcpu *vcpu, u64 efer) 672static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
651{ 673{
652 if (efer & efer_reserved_bits) { 674 if (efer & efer_reserved_bits) {
653 printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
654 efer);
655 kvm_inject_gp(vcpu, 0); 675 kvm_inject_gp(vcpu, 0);
656 return; 676 return;
657 } 677 }
658 678
659 if (is_paging(vcpu) 679 if (is_paging(vcpu)
660 && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) { 680 && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) {
661 printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
662 kvm_inject_gp(vcpu, 0); 681 kvm_inject_gp(vcpu, 0);
663 return; 682 return;
664 } 683 }
@@ -668,7 +687,6 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
668 687
669 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); 688 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
670 if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) { 689 if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) {
671 printk(KERN_DEBUG "set_efer: #GP, enable FFXSR w/o CPUID capability\n");
672 kvm_inject_gp(vcpu, 0); 690 kvm_inject_gp(vcpu, 0);
673 return; 691 return;
674 } 692 }
@@ -679,7 +697,6 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
679 697
680 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); 698 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
681 if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) { 699 if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) {
682 printk(KERN_DEBUG "set_efer: #GP, enable SVM w/o SVM\n");
683 kvm_inject_gp(vcpu, 0); 700 kvm_inject_gp(vcpu, 0);
684 return; 701 return;
685 } 702 }
@@ -968,9 +985,13 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
968 if (msr >= MSR_IA32_MC0_CTL && 985 if (msr >= MSR_IA32_MC0_CTL &&
969 msr < MSR_IA32_MC0_CTL + 4 * bank_num) { 986 msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
970 u32 offset = msr - MSR_IA32_MC0_CTL; 987 u32 offset = msr - MSR_IA32_MC0_CTL;
971 /* only 0 or all 1s can be written to IA32_MCi_CTL */ 988 /* only 0 or all 1s can be written to IA32_MCi_CTL
 989 * some Linux kernels, though, clear bit 10 in bank 4 to
 990 * work around a BIOS/GART TBL issue on AMD K8s; ignore
 991 * this to avoid an uncaught #GP in the guest
992 */
972 if ((offset & 0x3) == 0 && 993 if ((offset & 0x3) == 0 &&
973 data != 0 && data != ~(u64)0) 994 data != 0 && (data | (1 << 10)) != ~(u64)0)
974 return -1; 995 return -1;
975 vcpu->arch.mce_banks[offset] = data; 996 vcpu->arch.mce_banks[offset] = data;
976 break; 997 break;
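The relaxed IA32_MCi_CTL check above accepts exactly three values; restated as a standalone predicate (illustrative, not kernel code):

#include <stdint.h>

/* 0, all ones, and all ones with bit 10 cleared (the AMD K8 GART erratum
 * workaround mentioned in the comment) pass; anything else is rejected. */
int ex_mci_ctl_write_ok(uint64_t data)
{
	return data == 0 || (data | (1ULL << 10)) == ~(uint64_t)0;
}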
@@ -1114,6 +1135,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1114 break; 1135 break;
1115 case MSR_K7_HWCR: 1136 case MSR_K7_HWCR:
1116 data &= ~(u64)0x40; /* ignore flush filter disable */ 1137 data &= ~(u64)0x40; /* ignore flush filter disable */
1138 data &= ~(u64)0x100; /* ignore ignne emulation enable */
1117 if (data != 0) { 1139 if (data != 0) {
1118 pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n", 1140 pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
1119 data); 1141 data);
@@ -1572,6 +1594,7 @@ int kvm_dev_ioctl_check_extension(long ext)
1572 case KVM_CAP_HYPERV_VAPIC: 1594 case KVM_CAP_HYPERV_VAPIC:
1573 case KVM_CAP_HYPERV_SPIN: 1595 case KVM_CAP_HYPERV_SPIN:
1574 case KVM_CAP_PCI_SEGMENT: 1596 case KVM_CAP_PCI_SEGMENT:
1597 case KVM_CAP_DEBUGREGS:
1575 case KVM_CAP_X86_ROBUST_SINGLESTEP: 1598 case KVM_CAP_X86_ROBUST_SINGLESTEP:
1576 r = 1; 1599 r = 1;
1577 break; 1600 break;
@@ -2124,14 +2147,20 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
2124{ 2147{
2125 vcpu_load(vcpu); 2148 vcpu_load(vcpu);
2126 2149
2127 events->exception.injected = vcpu->arch.exception.pending; 2150 events->exception.injected =
2151 vcpu->arch.exception.pending &&
2152 !kvm_exception_is_soft(vcpu->arch.exception.nr);
2128 events->exception.nr = vcpu->arch.exception.nr; 2153 events->exception.nr = vcpu->arch.exception.nr;
2129 events->exception.has_error_code = vcpu->arch.exception.has_error_code; 2154 events->exception.has_error_code = vcpu->arch.exception.has_error_code;
2130 events->exception.error_code = vcpu->arch.exception.error_code; 2155 events->exception.error_code = vcpu->arch.exception.error_code;
2131 2156
2132 events->interrupt.injected = vcpu->arch.interrupt.pending; 2157 events->interrupt.injected =
2158 vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft;
2133 events->interrupt.nr = vcpu->arch.interrupt.nr; 2159 events->interrupt.nr = vcpu->arch.interrupt.nr;
2134 events->interrupt.soft = vcpu->arch.interrupt.soft; 2160 events->interrupt.soft = 0;
2161 events->interrupt.shadow =
2162 kvm_x86_ops->get_interrupt_shadow(vcpu,
2163 KVM_X86_SHADOW_INT_MOV_SS | KVM_X86_SHADOW_INT_STI);
2135 2164
2136 events->nmi.injected = vcpu->arch.nmi_injected; 2165 events->nmi.injected = vcpu->arch.nmi_injected;
2137 events->nmi.pending = vcpu->arch.nmi_pending; 2166 events->nmi.pending = vcpu->arch.nmi_pending;
@@ -2140,7 +2169,8 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
2140 events->sipi_vector = vcpu->arch.sipi_vector; 2169 events->sipi_vector = vcpu->arch.sipi_vector;
2141 2170
2142 events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING 2171 events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
2143 | KVM_VCPUEVENT_VALID_SIPI_VECTOR); 2172 | KVM_VCPUEVENT_VALID_SIPI_VECTOR
2173 | KVM_VCPUEVENT_VALID_SHADOW);
2144 2174
2145 vcpu_put(vcpu); 2175 vcpu_put(vcpu);
2146} 2176}
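From userspace, the new shadow field returned by KVM_GET_VCPU_EVENTS can be tested against the two flags used above; a minimal sketch (assumes a kernel exporting these uapi definitions):

#include <linux/kvm.h>

/* Non-zero means the vcpu is still inside an interrupt shadow created by
 * MOV SS or STI, as reported by the ioctl above. */
int ex_in_interrupt_shadow(const struct kvm_vcpu_events *ev)
{
	return (ev->interrupt.shadow &
		(KVM_X86_SHADOW_INT_MOV_SS | KVM_X86_SHADOW_INT_STI)) != 0;
}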
@@ -2149,7 +2179,8 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
2149 struct kvm_vcpu_events *events) 2179 struct kvm_vcpu_events *events)
2150{ 2180{
2151 if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING 2181 if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
2152 | KVM_VCPUEVENT_VALID_SIPI_VECTOR)) 2182 | KVM_VCPUEVENT_VALID_SIPI_VECTOR
2183 | KVM_VCPUEVENT_VALID_SHADOW))
2153 return -EINVAL; 2184 return -EINVAL;
2154 2185
2155 vcpu_load(vcpu); 2186 vcpu_load(vcpu);
@@ -2164,6 +2195,9 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
2164 vcpu->arch.interrupt.soft = events->interrupt.soft; 2195 vcpu->arch.interrupt.soft = events->interrupt.soft;
2165 if (vcpu->arch.interrupt.pending && irqchip_in_kernel(vcpu->kvm)) 2196 if (vcpu->arch.interrupt.pending && irqchip_in_kernel(vcpu->kvm))
2166 kvm_pic_clear_isr_ack(vcpu->kvm); 2197 kvm_pic_clear_isr_ack(vcpu->kvm);
2198 if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
2199 kvm_x86_ops->set_interrupt_shadow(vcpu,
2200 events->interrupt.shadow);
2167 2201
2168 vcpu->arch.nmi_injected = events->nmi.injected; 2202 vcpu->arch.nmi_injected = events->nmi.injected;
2169 if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) 2203 if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING)
@@ -2178,6 +2212,36 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
2178 return 0; 2212 return 0;
2179} 2213}
2180 2214
2215static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
2216 struct kvm_debugregs *dbgregs)
2217{
2218 vcpu_load(vcpu);
2219
2220 memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
2221 dbgregs->dr6 = vcpu->arch.dr6;
2222 dbgregs->dr7 = vcpu->arch.dr7;
2223 dbgregs->flags = 0;
2224
2225 vcpu_put(vcpu);
2226}
2227
2228static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
2229 struct kvm_debugregs *dbgregs)
2230{
2231 if (dbgregs->flags)
2232 return -EINVAL;
2233
2234 vcpu_load(vcpu);
2235
2236 memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
2237 vcpu->arch.dr6 = dbgregs->dr6;
2238 vcpu->arch.dr7 = dbgregs->dr7;
2239
2240 vcpu_put(vcpu);
2241
2242 return 0;
2243}
2244
2181long kvm_arch_vcpu_ioctl(struct file *filp, 2245long kvm_arch_vcpu_ioctl(struct file *filp,
2182 unsigned int ioctl, unsigned long arg) 2246 unsigned int ioctl, unsigned long arg)
2183{ 2247{
@@ -2356,6 +2420,29 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
2356 r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events); 2420 r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events);
2357 break; 2421 break;
2358 } 2422 }
2423 case KVM_GET_DEBUGREGS: {
2424 struct kvm_debugregs dbgregs;
2425
2426 kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs);
2427
2428 r = -EFAULT;
2429 if (copy_to_user(argp, &dbgregs,
2430 sizeof(struct kvm_debugregs)))
2431 break;
2432 r = 0;
2433 break;
2434 }
2435 case KVM_SET_DEBUGREGS: {
2436 struct kvm_debugregs dbgregs;
2437
2438 r = -EFAULT;
2439 if (copy_from_user(&dbgregs, argp,
2440 sizeof(struct kvm_debugregs)))
2441 break;
2442
2443 r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs);
2444 break;
2445 }
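A minimal userspace sketch of the two new ioctls (error handling trimmed; vcpu_fd is assumed to come from KVM_CREATE_VCPU):

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* Read the vcpu's debug registers, arm DR0 as a local breakpoint on addr,
 * and write everything back via KVM_SET_DEBUGREGS. */
int ex_arm_dr0_breakpoint(int vcpu_fd, unsigned long addr)
{
	struct kvm_debugregs dbg;

	if (ioctl(vcpu_fd, KVM_GET_DEBUGREGS, &dbg) < 0)
		return -1;
	dbg.db[0] = addr;	/* DR0: linear address of the breakpoint */
	dbg.dr7 |= 0x1;		/* DR7.L0: enable it locally */
	dbg.flags = 0;		/* must be zero, see the -EINVAL check above */
	return ioctl(vcpu_fd, KVM_SET_DEBUGREGS, &dbg);
}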
2359 default: 2446 default:
2360 r = -EINVAL; 2447 r = -EINVAL;
2361 } 2448 }
@@ -2409,7 +2496,7 @@ gfn_t unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn)
2409 struct kvm_mem_alias *alias; 2496 struct kvm_mem_alias *alias;
2410 struct kvm_mem_aliases *aliases; 2497 struct kvm_mem_aliases *aliases;
2411 2498
2412 aliases = rcu_dereference(kvm->arch.aliases); 2499 aliases = kvm_aliases(kvm);
2413 2500
2414 for (i = 0; i < aliases->naliases; ++i) { 2501 for (i = 0; i < aliases->naliases; ++i) {
2415 alias = &aliases->aliases[i]; 2502 alias = &aliases->aliases[i];
@@ -2428,7 +2515,7 @@ gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
2428 struct kvm_mem_alias *alias; 2515 struct kvm_mem_alias *alias;
2429 struct kvm_mem_aliases *aliases; 2516 struct kvm_mem_aliases *aliases;
2430 2517
2431 aliases = rcu_dereference(kvm->arch.aliases); 2518 aliases = kvm_aliases(kvm);
2432 2519
2433 for (i = 0; i < aliases->naliases; ++i) { 2520 for (i = 0; i < aliases->naliases; ++i) {
2434 alias = &aliases->aliases[i]; 2521 alias = &aliases->aliases[i];
@@ -2636,8 +2723,9 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm,
2636int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, 2723int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
2637 struct kvm_dirty_log *log) 2724 struct kvm_dirty_log *log)
2638{ 2725{
2639 int r, n, i; 2726 int r, i;
2640 struct kvm_memory_slot *memslot; 2727 struct kvm_memory_slot *memslot;
2728 unsigned long n;
2641 unsigned long is_dirty = 0; 2729 unsigned long is_dirty = 0;
2642 unsigned long *dirty_bitmap = NULL; 2730 unsigned long *dirty_bitmap = NULL;
2643 2731
@@ -2652,7 +2740,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
2652 if (!memslot->dirty_bitmap) 2740 if (!memslot->dirty_bitmap)
2653 goto out; 2741 goto out;
2654 2742
2655 n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; 2743 n = kvm_dirty_bitmap_bytes(memslot);
2656 2744
2657 r = -ENOMEM; 2745 r = -ENOMEM;
2658 dirty_bitmap = vmalloc(n); 2746 dirty_bitmap = vmalloc(n);
@@ -2822,11 +2910,13 @@ long kvm_arch_vm_ioctl(struct file *filp,
2822 r = -EFAULT; 2910 r = -EFAULT;
2823 if (copy_from_user(&irq_event, argp, sizeof irq_event)) 2911 if (copy_from_user(&irq_event, argp, sizeof irq_event))
2824 goto out; 2912 goto out;
2913 r = -ENXIO;
2825 if (irqchip_in_kernel(kvm)) { 2914 if (irqchip_in_kernel(kvm)) {
2826 __s32 status; 2915 __s32 status;
2827 status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 2916 status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
2828 irq_event.irq, irq_event.level); 2917 irq_event.irq, irq_event.level);
2829 if (ioctl == KVM_IRQ_LINE_STATUS) { 2918 if (ioctl == KVM_IRQ_LINE_STATUS) {
2919 r = -EFAULT;
2830 irq_event.status = status; 2920 irq_event.status = status;
2831 if (copy_to_user(argp, &irq_event, 2921 if (copy_to_user(argp, &irq_event,
2832 sizeof irq_event)) 2922 sizeof irq_event))
@@ -3042,6 +3132,18 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
3042 return kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, len, v); 3132 return kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, len, v);
3043} 3133}
3044 3134
3135static void kvm_set_segment(struct kvm_vcpu *vcpu,
3136 struct kvm_segment *var, int seg)
3137{
3138 kvm_x86_ops->set_segment(vcpu, var, seg);
3139}
3140
3141void kvm_get_segment(struct kvm_vcpu *vcpu,
3142 struct kvm_segment *var, int seg)
3143{
3144 kvm_x86_ops->get_segment(vcpu, var, seg);
3145}
3146
3045gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) 3147gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
3046{ 3148{
3047 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; 3149 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
@@ -3122,14 +3224,17 @@ static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes,
3122 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, error); 3224 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, error);
3123} 3225}
3124 3226
3125static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes, 3227static int kvm_write_guest_virt_system(gva_t addr, void *val,
3126 struct kvm_vcpu *vcpu, u32 *error) 3228 unsigned int bytes,
3229 struct kvm_vcpu *vcpu,
3230 u32 *error)
3127{ 3231{
3128 void *data = val; 3232 void *data = val;
3129 int r = X86EMUL_CONTINUE; 3233 int r = X86EMUL_CONTINUE;
3130 3234
3131 while (bytes) { 3235 while (bytes) {
3132 gpa_t gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error); 3236 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr,
3237 PFERR_WRITE_MASK, error);
3133 unsigned offset = addr & (PAGE_SIZE-1); 3238 unsigned offset = addr & (PAGE_SIZE-1);
3134 unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); 3239 unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
3135 int ret; 3240 int ret;
@@ -3152,7 +3257,6 @@ out:
3152 return r; 3257 return r;
3153} 3258}
3154 3259
3155
3156static int emulator_read_emulated(unsigned long addr, 3260static int emulator_read_emulated(unsigned long addr,
3157 void *val, 3261 void *val,
3158 unsigned int bytes, 3262 unsigned int bytes,
@@ -3255,9 +3359,9 @@ mmio:
3255} 3359}
3256 3360
3257int emulator_write_emulated(unsigned long addr, 3361int emulator_write_emulated(unsigned long addr,
3258 const void *val, 3362 const void *val,
3259 unsigned int bytes, 3363 unsigned int bytes,
3260 struct kvm_vcpu *vcpu) 3364 struct kvm_vcpu *vcpu)
3261{ 3365{
3262 /* Crossing a page boundary? */ 3366 /* Crossing a page boundary? */
3263 if (((addr + bytes - 1) ^ addr) & PAGE_MASK) { 3367 if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
@@ -3275,45 +3379,150 @@ int emulator_write_emulated(unsigned long addr,
3275} 3379}
3276EXPORT_SYMBOL_GPL(emulator_write_emulated); 3380EXPORT_SYMBOL_GPL(emulator_write_emulated);
3277 3381
3382#define CMPXCHG_TYPE(t, ptr, old, new) \
3383 (cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old))
3384
3385#ifdef CONFIG_X86_64
3386# define CMPXCHG64(ptr, old, new) CMPXCHG_TYPE(u64, ptr, old, new)
3387#else
3388# define CMPXCHG64(ptr, old, new) \
3389 (cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u64 *)(new)) == *(u64 *)(old))
3390#endif
3391
3278static int emulator_cmpxchg_emulated(unsigned long addr, 3392static int emulator_cmpxchg_emulated(unsigned long addr,
3279 const void *old, 3393 const void *old,
3280 const void *new, 3394 const void *new,
3281 unsigned int bytes, 3395 unsigned int bytes,
3282 struct kvm_vcpu *vcpu) 3396 struct kvm_vcpu *vcpu)
3283{ 3397{
3284 printk_once(KERN_WARNING "kvm: emulating exchange as write\n"); 3398 gpa_t gpa;
3285#ifndef CONFIG_X86_64 3399 struct page *page;
3286 /* a guest's cmpxchg8b has to be emulated atomically */ 3400 char *kaddr;
3287 if (bytes == 8) { 3401 bool exchanged;
3288 gpa_t gpa;
3289 struct page *page;
3290 char *kaddr;
3291 u64 val;
3292 3402
3293 gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL); 3403 /* a guest's cmpxchg8b has to be emulated atomically */
3404 if (bytes > 8 || (bytes & (bytes - 1)))
3405 goto emul_write;
3294 3406
3295 if (gpa == UNMAPPED_GVA || 3407 gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL);
3296 (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
3297 goto emul_write;
3298 3408
3299 if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK)) 3409 if (gpa == UNMAPPED_GVA ||
3300 goto emul_write; 3410 (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
3411 goto emul_write;
3301 3412
3302 val = *(u64 *)new; 3413 if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
3414 goto emul_write;
3303 3415
3304 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); 3416 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
3305 3417
3306 kaddr = kmap_atomic(page, KM_USER0); 3418 kaddr = kmap_atomic(page, KM_USER0);
3307 set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val); 3419 kaddr += offset_in_page(gpa);
3308 kunmap_atomic(kaddr, KM_USER0); 3420 switch (bytes) {
3309 kvm_release_page_dirty(page); 3421 case 1:
3422 exchanged = CMPXCHG_TYPE(u8, kaddr, old, new);
3423 break;
3424 case 2:
3425 exchanged = CMPXCHG_TYPE(u16, kaddr, old, new);
3426 break;
3427 case 4:
3428 exchanged = CMPXCHG_TYPE(u32, kaddr, old, new);
3429 break;
3430 case 8:
3431 exchanged = CMPXCHG64(kaddr, old, new);
3432 break;
3433 default:
3434 BUG();
3310 } 3435 }
3436 kunmap_atomic(kaddr, KM_USER0);
3437 kvm_release_page_dirty(page);
3438
3439 if (!exchanged)
3440 return X86EMUL_CMPXCHG_FAILED;
3441
3442 kvm_mmu_pte_write(vcpu, gpa, new, bytes, 1);
3443
3444 return X86EMUL_CONTINUE;
3445
3311emul_write: 3446emul_write:
3312#endif 3447 printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
3313 3448
3314 return emulator_write_emulated(addr, new, bytes, vcpu); 3449 return emulator_write_emulated(addr, new, bytes, vcpu);
3315} 3450}
3316 3451
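The CMPXCHG_TYPE()/CMPXCHG64() helpers above succeed only when memory still holds the expected old value; an illustrative stand-in built on a compiler builtin rather than the kernel's cmpxchg():

#include <stdbool.h>
#include <stdint.h>

/* The exchange happens only if *ptr still equals *old, and the return value
 * reports whether it did -- exactly the condition the macros above test. */
bool ex_cmpxchg_u32(uint32_t *ptr, const uint32_t *old, const uint32_t *new_val)
{
	return __sync_bool_compare_and_swap(ptr, *old, *new_val);
}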
3452static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
3453{
3454 /* TODO: String I/O for in kernel device */
3455 int r;
3456
3457 if (vcpu->arch.pio.in)
3458 r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port,
3459 vcpu->arch.pio.size, pd);
3460 else
3461 r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
3462 vcpu->arch.pio.port, vcpu->arch.pio.size,
3463 pd);
3464 return r;
3465}
3466
3467
3468static int emulator_pio_in_emulated(int size, unsigned short port, void *val,
3469 unsigned int count, struct kvm_vcpu *vcpu)
3470{
3471 if (vcpu->arch.pio.count)
3472 goto data_avail;
3473
3474 trace_kvm_pio(1, port, size, 1);
3475
3476 vcpu->arch.pio.port = port;
3477 vcpu->arch.pio.in = 1;
3478 vcpu->arch.pio.count = count;
3479 vcpu->arch.pio.size = size;
3480
3481 if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
3482 data_avail:
3483 memcpy(val, vcpu->arch.pio_data, size * count);
3484 vcpu->arch.pio.count = 0;
3485 return 1;
3486 }
3487
3488 vcpu->run->exit_reason = KVM_EXIT_IO;
3489 vcpu->run->io.direction = KVM_EXIT_IO_IN;
3490 vcpu->run->io.size = size;
3491 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
3492 vcpu->run->io.count = count;
3493 vcpu->run->io.port = port;
3494
3495 return 0;
3496}
3497
3498static int emulator_pio_out_emulated(int size, unsigned short port,
3499 const void *val, unsigned int count,
3500 struct kvm_vcpu *vcpu)
3501{
3502 trace_kvm_pio(0, port, size, 1);
3503
3504 vcpu->arch.pio.port = port;
3505 vcpu->arch.pio.in = 0;
3506 vcpu->arch.pio.count = count;
3507 vcpu->arch.pio.size = size;
3508
3509 memcpy(vcpu->arch.pio_data, val, size * count);
3510
3511 if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
3512 vcpu->arch.pio.count = 0;
3513 return 1;
3514 }
3515
3516 vcpu->run->exit_reason = KVM_EXIT_IO;
3517 vcpu->run->io.direction = KVM_EXIT_IO_OUT;
3518 vcpu->run->io.size = size;
3519 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
3520 vcpu->run->io.count = count;
3521 vcpu->run->io.port = port;
3522
3523 return 0;
3524}
3525
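For the pio-in path above, userspace completes the KVM_EXIT_IO exit by filling the shared PIO area and calling KVM_RUN again; a minimal sketch (run is assumed to be the mmap'ed struct kvm_run):

#include <string.h>
#include <linux/kvm.h>

/* Copy the device data into the region named by io.data_offset; on the next
 * KVM_RUN, emulator_pio_in_emulated() finds pio.count != 0 and hands the
 * bytes to the guest. */
void ex_complete_pio_in(struct kvm_run *run, const void *dev_data)
{
	unsigned char *dst = (unsigned char *)run + run->io.data_offset;

	memcpy(dst, dev_data, (size_t)run->io.size * run->io.count);
}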
3317static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) 3526static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
3318{ 3527{
3319 return kvm_x86_ops->get_segment_base(vcpu, seg); 3528 return kvm_x86_ops->get_segment_base(vcpu, seg);
@@ -3334,14 +3543,14 @@ int emulate_clts(struct kvm_vcpu *vcpu)
3334 3543
3335int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest) 3544int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
3336{ 3545{
3337 return kvm_x86_ops->get_dr(ctxt->vcpu, dr, dest); 3546 return kvm_get_dr(ctxt->vcpu, dr, dest);
3338} 3547}
3339 3548
3340int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) 3549int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
3341{ 3550{
3342 unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U; 3551 unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
3343 3552
3344 return kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask); 3553 return kvm_set_dr(ctxt->vcpu, dr, value & mask);
3345} 3554}
3346 3555
3347void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) 3556void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
@@ -3362,12 +3571,167 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
3362} 3571}
3363EXPORT_SYMBOL_GPL(kvm_report_emulation_failure); 3572EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
3364 3573
3574static u64 mk_cr_64(u64 curr_cr, u32 new_val)
3575{
3576 return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
3577}
3578
3579static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu)
3580{
3581 unsigned long value;
3582
3583 switch (cr) {
3584 case 0:
3585 value = kvm_read_cr0(vcpu);
3586 break;
3587 case 2:
3588 value = vcpu->arch.cr2;
3589 break;
3590 case 3:
3591 value = vcpu->arch.cr3;
3592 break;
3593 case 4:
3594 value = kvm_read_cr4(vcpu);
3595 break;
3596 case 8:
3597 value = kvm_get_cr8(vcpu);
3598 break;
3599 default:
3600 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
3601 return 0;
3602 }
3603
3604 return value;
3605}
3606
3607static void emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu)
3608{
3609 switch (cr) {
3610 case 0:
3611 kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
3612 break;
3613 case 2:
3614 vcpu->arch.cr2 = val;
3615 break;
3616 case 3:
3617 kvm_set_cr3(vcpu, val);
3618 break;
3619 case 4:
3620 kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
3621 break;
3622 case 8:
3623 kvm_set_cr8(vcpu, val & 0xfUL);
3624 break;
3625 default:
3626 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
3627 }
3628}
3629
3630static int emulator_get_cpl(struct kvm_vcpu *vcpu)
3631{
3632 return kvm_x86_ops->get_cpl(vcpu);
3633}
3634
3635static void emulator_get_gdt(struct desc_ptr *dt, struct kvm_vcpu *vcpu)
3636{
3637 kvm_x86_ops->get_gdt(vcpu, dt);
3638}
3639
3640static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg,
3641 struct kvm_vcpu *vcpu)
3642{
3643 struct kvm_segment var;
3644
3645 kvm_get_segment(vcpu, &var, seg);
3646
3647 if (var.unusable)
3648 return false;
3649
3650 if (var.g)
3651 var.limit >>= 12;
3652 set_desc_limit(desc, var.limit);
3653 set_desc_base(desc, (unsigned long)var.base);
3654 desc->type = var.type;
3655 desc->s = var.s;
3656 desc->dpl = var.dpl;
3657 desc->p = var.present;
3658 desc->avl = var.avl;
3659 desc->l = var.l;
3660 desc->d = var.db;
3661 desc->g = var.g;
3662
3663 return true;
3664}
3665
3666static void emulator_set_cached_descriptor(struct desc_struct *desc, int seg,
3667 struct kvm_vcpu *vcpu)
3668{
3669 struct kvm_segment var;
3670
3671 /* needed to preserve selector */
3672 kvm_get_segment(vcpu, &var, seg);
3673
3674 var.base = get_desc_base(desc);
3675 var.limit = get_desc_limit(desc);
3676 if (desc->g)
3677 var.limit = (var.limit << 12) | 0xfff;
3678 var.type = desc->type;
3679 var.present = desc->p;
3680 var.dpl = desc->dpl;
3681 var.db = desc->d;
3682 var.s = desc->s;
3683 var.l = desc->l;
3684 var.g = desc->g;
3685 var.avl = desc->avl;
3686 var.present = desc->p;
3687 var.unusable = !var.present;
3688 var.padding = 0;
3689
3690 kvm_set_segment(vcpu, &var, seg);
3691 return;
3692}
3693
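The two descriptor helpers above convert between KVM's byte-granular segment limit and the 20-bit descriptor field: with G=1 a stored limit of 0xfffff expands to 0xffffffff. A small restatement (illustrative names):

/* Descriptor limit scaling used above: with G=1 the limit counts 4K pages. */
unsigned int ex_expand_limit(unsigned int limit20, int g)
{
	return g ? (limit20 << 12) | 0xfff : limit20;
}

unsigned int ex_pack_limit(unsigned int byte_limit, int g)
{
	return g ? byte_limit >> 12 : byte_limit;
}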
3694static u16 emulator_get_segment_selector(int seg, struct kvm_vcpu *vcpu)
3695{
3696 struct kvm_segment kvm_seg;
3697
3698 kvm_get_segment(vcpu, &kvm_seg, seg);
3699 return kvm_seg.selector;
3700}
3701
3702static void emulator_set_segment_selector(u16 sel, int seg,
3703 struct kvm_vcpu *vcpu)
3704{
3705 struct kvm_segment kvm_seg;
3706
3707 kvm_get_segment(vcpu, &kvm_seg, seg);
3708 kvm_seg.selector = sel;
3709 kvm_set_segment(vcpu, &kvm_seg, seg);
3710}
3711
3712static void emulator_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
3713{
3714 kvm_x86_ops->set_rflags(vcpu, rflags);
3715}
3716
3365static struct x86_emulate_ops emulate_ops = { 3717static struct x86_emulate_ops emulate_ops = {
3366 .read_std = kvm_read_guest_virt_system, 3718 .read_std = kvm_read_guest_virt_system,
3719 .write_std = kvm_write_guest_virt_system,
3367 .fetch = kvm_fetch_guest_virt, 3720 .fetch = kvm_fetch_guest_virt,
3368 .read_emulated = emulator_read_emulated, 3721 .read_emulated = emulator_read_emulated,
3369 .write_emulated = emulator_write_emulated, 3722 .write_emulated = emulator_write_emulated,
3370 .cmpxchg_emulated = emulator_cmpxchg_emulated, 3723 .cmpxchg_emulated = emulator_cmpxchg_emulated,
3724 .pio_in_emulated = emulator_pio_in_emulated,
3725 .pio_out_emulated = emulator_pio_out_emulated,
3726 .get_cached_descriptor = emulator_get_cached_descriptor,
3727 .set_cached_descriptor = emulator_set_cached_descriptor,
3728 .get_segment_selector = emulator_get_segment_selector,
3729 .set_segment_selector = emulator_set_segment_selector,
3730 .get_gdt = emulator_get_gdt,
3731 .get_cr = emulator_get_cr,
3732 .set_cr = emulator_set_cr,
3733 .cpl = emulator_get_cpl,
3734 .set_rflags = emulator_set_rflags,
3371}; 3735};
3372 3736
3373static void cache_all_regs(struct kvm_vcpu *vcpu) 3737static void cache_all_regs(struct kvm_vcpu *vcpu)
@@ -3398,14 +3762,14 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
3398 cache_all_regs(vcpu); 3762 cache_all_regs(vcpu);
3399 3763
3400 vcpu->mmio_is_write = 0; 3764 vcpu->mmio_is_write = 0;
3401 vcpu->arch.pio.string = 0;
3402 3765
3403 if (!(emulation_type & EMULTYPE_NO_DECODE)) { 3766 if (!(emulation_type & EMULTYPE_NO_DECODE)) {
3404 int cs_db, cs_l; 3767 int cs_db, cs_l;
3405 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 3768 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
3406 3769
3407 vcpu->arch.emulate_ctxt.vcpu = vcpu; 3770 vcpu->arch.emulate_ctxt.vcpu = vcpu;
3408 vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu); 3771 vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
3772 vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu);
3409 vcpu->arch.emulate_ctxt.mode = 3773 vcpu->arch.emulate_ctxt.mode =
3410 (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : 3774 (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
3411 (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) 3775 (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
@@ -3414,6 +3778,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
3414 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; 3778 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
3415 3779
3416 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); 3780 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
3781 trace_kvm_emulate_insn_start(vcpu);
3417 3782
3418 /* Only allow emulation of specific instructions on #UD 3783 /* Only allow emulation of specific instructions on #UD
3419 * (namely VMMCALL, sysenter, sysexit, syscall)*/ 3784 * (namely VMMCALL, sysenter, sysexit, syscall)*/
@@ -3446,6 +3811,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
3446 ++vcpu->stat.insn_emulation; 3811 ++vcpu->stat.insn_emulation;
3447 if (r) { 3812 if (r) {
3448 ++vcpu->stat.insn_emulation_fail; 3813 ++vcpu->stat.insn_emulation_fail;
3814 trace_kvm_emulate_insn_failed(vcpu);
3449 if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) 3815 if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
3450 return EMULATE_DONE; 3816 return EMULATE_DONE;
3451 return EMULATE_FAIL; 3817 return EMULATE_FAIL;
@@ -3457,16 +3823,20 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
3457 return EMULATE_DONE; 3823 return EMULATE_DONE;
3458 } 3824 }
3459 3825
3826restart:
3460 r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); 3827 r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
3461 shadow_mask = vcpu->arch.emulate_ctxt.interruptibility; 3828 shadow_mask = vcpu->arch.emulate_ctxt.interruptibility;
3462 3829
3463 if (r == 0) 3830 if (r == 0)
3464 kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask); 3831 kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask);
3465 3832
3466 if (vcpu->arch.pio.string) 3833 if (vcpu->arch.pio.count) {
3834 if (!vcpu->arch.pio.in)
3835 vcpu->arch.pio.count = 0;
3467 return EMULATE_DO_MMIO; 3836 return EMULATE_DO_MMIO;
3837 }
3468 3838
3469 if ((r || vcpu->mmio_is_write) && run) { 3839 if (r || vcpu->mmio_is_write) {
3470 run->exit_reason = KVM_EXIT_MMIO; 3840 run->exit_reason = KVM_EXIT_MMIO;
3471 run->mmio.phys_addr = vcpu->mmio_phys_addr; 3841 run->mmio.phys_addr = vcpu->mmio_phys_addr;
3472 memcpy(run->mmio.data, vcpu->mmio_data, 8); 3842 memcpy(run->mmio.data, vcpu->mmio_data, 8);
@@ -3476,222 +3846,41 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
3476 3846
3477 if (r) { 3847 if (r) {
3478 if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) 3848 if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
3479 return EMULATE_DONE; 3849 goto done;
3480 if (!vcpu->mmio_needed) { 3850 if (!vcpu->mmio_needed) {
3851 ++vcpu->stat.insn_emulation_fail;
3852 trace_kvm_emulate_insn_failed(vcpu);
3481 kvm_report_emulation_failure(vcpu, "mmio"); 3853 kvm_report_emulation_failure(vcpu, "mmio");
3482 return EMULATE_FAIL; 3854 return EMULATE_FAIL;
3483 } 3855 }
3484 return EMULATE_DO_MMIO; 3856 return EMULATE_DO_MMIO;
3485 } 3857 }
3486 3858
3487 kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
3488
3489 if (vcpu->mmio_is_write) { 3859 if (vcpu->mmio_is_write) {
3490 vcpu->mmio_needed = 0; 3860 vcpu->mmio_needed = 0;
3491 return EMULATE_DO_MMIO; 3861 return EMULATE_DO_MMIO;
3492 } 3862 }
3493 3863
3494 return EMULATE_DONE; 3864done:
3495} 3865 if (vcpu->arch.exception.pending)
3496EXPORT_SYMBOL_GPL(emulate_instruction); 3866 vcpu->arch.emulate_ctxt.restart = false;
3497
3498static int pio_copy_data(struct kvm_vcpu *vcpu)
3499{
3500 void *p = vcpu->arch.pio_data;
3501 gva_t q = vcpu->arch.pio.guest_gva;
3502 unsigned bytes;
3503 int ret;
3504 u32 error_code;
3505
3506 bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count;
3507 if (vcpu->arch.pio.in)
3508 ret = kvm_write_guest_virt(q, p, bytes, vcpu, &error_code);
3509 else
3510 ret = kvm_read_guest_virt(q, p, bytes, vcpu, &error_code);
3511
3512 if (ret == X86EMUL_PROPAGATE_FAULT)
3513 kvm_inject_page_fault(vcpu, q, error_code);
3514
3515 return ret;
3516}
3517
3518int complete_pio(struct kvm_vcpu *vcpu)
3519{
3520 struct kvm_pio_request *io = &vcpu->arch.pio;
3521 long delta;
3522 int r;
3523 unsigned long val;
3524
3525 if (!io->string) {
3526 if (io->in) {
3527 val = kvm_register_read(vcpu, VCPU_REGS_RAX);
3528 memcpy(&val, vcpu->arch.pio_data, io->size);
3529 kvm_register_write(vcpu, VCPU_REGS_RAX, val);
3530 }
3531 } else {
3532 if (io->in) {
3533 r = pio_copy_data(vcpu);
3534 if (r)
3535 goto out;
3536 }
3537
3538 delta = 1;
3539 if (io->rep) {
3540 delta *= io->cur_count;
3541 /*
3542 * The size of the register should really depend on
3543 * current address size.
3544 */
3545 val = kvm_register_read(vcpu, VCPU_REGS_RCX);
3546 val -= delta;
3547 kvm_register_write(vcpu, VCPU_REGS_RCX, val);
3548 }
3549 if (io->down)
3550 delta = -delta;
3551 delta *= io->size;
3552 if (io->in) {
3553 val = kvm_register_read(vcpu, VCPU_REGS_RDI);
3554 val += delta;
3555 kvm_register_write(vcpu, VCPU_REGS_RDI, val);
3556 } else {
3557 val = kvm_register_read(vcpu, VCPU_REGS_RSI);
3558 val += delta;
3559 kvm_register_write(vcpu, VCPU_REGS_RSI, val);
3560 }
3561 }
3562out:
3563 io->count -= io->cur_count;
3564 io->cur_count = 0;
3565
3566 return 0;
3567}
3568 3867
3569static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) 3868 if (vcpu->arch.emulate_ctxt.restart)
3570{ 3869 goto restart;
3571 /* TODO: String I/O for in kernel device */
3572 int r;
3573 3870
3574 if (vcpu->arch.pio.in) 3871 return EMULATE_DONE;
3575 r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port,
3576 vcpu->arch.pio.size, pd);
3577 else
3578 r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
3579 vcpu->arch.pio.port, vcpu->arch.pio.size,
3580 pd);
3581 return r;
3582}
3583
3584static int pio_string_write(struct kvm_vcpu *vcpu)
3585{
3586 struct kvm_pio_request *io = &vcpu->arch.pio;
3587 void *pd = vcpu->arch.pio_data;
3588 int i, r = 0;
3589
3590 for (i = 0; i < io->cur_count; i++) {
3591 if (kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
3592 io->port, io->size, pd)) {
3593 r = -EOPNOTSUPP;
3594 break;
3595 }
3596 pd += io->size;
3597 }
3598 return r;
3599}
3600
3601int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port)
3602{
3603 unsigned long val;
3604
3605 trace_kvm_pio(!in, port, size, 1);
3606
3607 vcpu->run->exit_reason = KVM_EXIT_IO;
3608 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
3609 vcpu->run->io.size = vcpu->arch.pio.size = size;
3610 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
3611 vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = 1;
3612 vcpu->run->io.port = vcpu->arch.pio.port = port;
3613 vcpu->arch.pio.in = in;
3614 vcpu->arch.pio.string = 0;
3615 vcpu->arch.pio.down = 0;
3616 vcpu->arch.pio.rep = 0;
3617
3618 if (!vcpu->arch.pio.in) {
3619 val = kvm_register_read(vcpu, VCPU_REGS_RAX);
3620 memcpy(vcpu->arch.pio_data, &val, 4);
3621 }
3622
3623 if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
3624 complete_pio(vcpu);
3625 return 1;
3626 }
3627 return 0;
3628} 3872}
3629EXPORT_SYMBOL_GPL(kvm_emulate_pio); 3873EXPORT_SYMBOL_GPL(emulate_instruction);
3630 3874
3631int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in, 3875int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port)
3632 int size, unsigned long count, int down,
3633 gva_t address, int rep, unsigned port)
3634{ 3876{
3635 unsigned now, in_page; 3877 unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX);
3636 int ret = 0; 3878 int ret = emulator_pio_out_emulated(size, port, &val, 1, vcpu);
3637 3879 /* do not return to emulator after return from userspace */
3638 trace_kvm_pio(!in, port, size, count); 3880 vcpu->arch.pio.count = 0;
3639
3640 vcpu->run->exit_reason = KVM_EXIT_IO;
3641 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
3642 vcpu->run->io.size = vcpu->arch.pio.size = size;
3643 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
3644 vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count;
3645 vcpu->run->io.port = vcpu->arch.pio.port = port;
3646 vcpu->arch.pio.in = in;
3647 vcpu->arch.pio.string = 1;
3648 vcpu->arch.pio.down = down;
3649 vcpu->arch.pio.rep = rep;
3650
3651 if (!count) {
3652 kvm_x86_ops->skip_emulated_instruction(vcpu);
3653 return 1;
3654 }
3655
3656 if (!down)
3657 in_page = PAGE_SIZE - offset_in_page(address);
3658 else
3659 in_page = offset_in_page(address) + size;
3660 now = min(count, (unsigned long)in_page / size);
3661 if (!now)
3662 now = 1;
3663 if (down) {
3664 /*
3665 * String I/O in reverse. Yuck. Kill the guest, fix later.
3666 */
3667 pr_unimpl(vcpu, "guest string pio down\n");
3668 kvm_inject_gp(vcpu, 0);
3669 return 1;
3670 }
3671 vcpu->run->io.count = now;
3672 vcpu->arch.pio.cur_count = now;
3673
3674 if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count)
3675 kvm_x86_ops->skip_emulated_instruction(vcpu);
3676
3677 vcpu->arch.pio.guest_gva = address;
3678
3679 if (!vcpu->arch.pio.in) {
3680 /* string PIO write */
3681 ret = pio_copy_data(vcpu);
3682 if (ret == X86EMUL_PROPAGATE_FAULT)
3683 return 1;
3684 if (ret == 0 && !pio_string_write(vcpu)) {
3685 complete_pio(vcpu);
3686 if (vcpu->arch.pio.count == 0)
3687 ret = 1;
3688 }
3689 }
3690 /* no string PIO read support yet */
3691
3692 return ret; 3881 return ret;
3693} 3882}
3694EXPORT_SYMBOL_GPL(kvm_emulate_pio_string); 3883EXPORT_SYMBOL_GPL(kvm_fast_pio_out);
3695 3884
3696static void bounce_off(void *info) 3885static void bounce_off(void *info)
3697{ 3886{
@@ -4014,85 +4203,20 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
4014 return emulator_write_emulated(rip, instruction, 3, vcpu); 4203 return emulator_write_emulated(rip, instruction, 3, vcpu);
4015} 4204}
4016 4205
4017static u64 mk_cr_64(u64 curr_cr, u32 new_val)
4018{
4019 return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
4020}
4021
4022void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) 4206void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
4023{ 4207{
4024 struct descriptor_table dt = { limit, base }; 4208 struct desc_ptr dt = { limit, base };
4025 4209
4026 kvm_x86_ops->set_gdt(vcpu, &dt); 4210 kvm_x86_ops->set_gdt(vcpu, &dt);
4027} 4211}
4028 4212
4029void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) 4213void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
4030{ 4214{
4031 struct descriptor_table dt = { limit, base }; 4215 struct desc_ptr dt = { limit, base };
4032 4216
4033 kvm_x86_ops->set_idt(vcpu, &dt); 4217 kvm_x86_ops->set_idt(vcpu, &dt);
4034} 4218}
4035 4219
4036void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
4037 unsigned long *rflags)
4038{
4039 kvm_lmsw(vcpu, msw);
4040 *rflags = kvm_get_rflags(vcpu);
4041}
4042
4043unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
4044{
4045 unsigned long value;
4046
4047 switch (cr) {
4048 case 0:
4049 value = kvm_read_cr0(vcpu);
4050 break;
4051 case 2:
4052 value = vcpu->arch.cr2;
4053 break;
4054 case 3:
4055 value = vcpu->arch.cr3;
4056 break;
4057 case 4:
4058 value = kvm_read_cr4(vcpu);
4059 break;
4060 case 8:
4061 value = kvm_get_cr8(vcpu);
4062 break;
4063 default:
4064 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
4065 return 0;
4066 }
4067
4068 return value;
4069}
4070
4071void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
4072 unsigned long *rflags)
4073{
4074 switch (cr) {
4075 case 0:
4076 kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
4077 *rflags = kvm_get_rflags(vcpu);
4078 break;
4079 case 2:
4080 vcpu->arch.cr2 = val;
4081 break;
4082 case 3:
4083 kvm_set_cr3(vcpu, val);
4084 break;
4085 case 4:
4086 kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
4087 break;
4088 case 8:
4089 kvm_set_cr8(vcpu, val & 0xfUL);
4090 break;
4091 default:
4092 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
4093 }
4094}
4095
4096static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) 4220static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
4097{ 4221{
4098 struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i]; 4222 struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
@@ -4156,9 +4280,13 @@ int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
4156{ 4280{
4157 struct kvm_cpuid_entry2 *best; 4281 struct kvm_cpuid_entry2 *best;
4158 4282
4283 best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0);
4284 if (!best || best->eax < 0x80000008)
4285 goto not_found;
4159 best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0); 4286 best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
4160 if (best) 4287 if (best)
4161 return best->eax & 0xff; 4288 return best->eax & 0xff;
4289not_found:
4162 return 36; 4290 return 36;
4163} 4291}
4164 4292
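The added guard only trusts leaf 0x80000008 once the maximum extended leaf (CPUID 0x80000000, EAX) confirms it exists; restated standalone (illustrative):

/* MAXPHYADDR lives in EAX[7:0] of CPUID leaf 0x80000008; fall back to the
 * architectural default of 36 bits when that leaf is not implemented. */
int ex_maxphyaddr(unsigned int max_ext_leaf, unsigned int leaf_80000008_eax)
{
	if (max_ext_leaf < 0x80000008)
		return 36;
	return leaf_80000008_eax & 0xff;
}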
@@ -4272,6 +4400,9 @@ static void inject_pending_event(struct kvm_vcpu *vcpu)
4272{ 4400{
4273 /* try to reinject previous events if any */ 4401 /* try to reinject previous events if any */
4274 if (vcpu->arch.exception.pending) { 4402 if (vcpu->arch.exception.pending) {
4403 trace_kvm_inj_exception(vcpu->arch.exception.nr,
4404 vcpu->arch.exception.has_error_code,
4405 vcpu->arch.exception.error_code);
4275 kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr, 4406 kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
4276 vcpu->arch.exception.has_error_code, 4407 vcpu->arch.exception.has_error_code,
4277 vcpu->arch.exception.error_code); 4408 vcpu->arch.exception.error_code);
@@ -4532,24 +4663,17 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
4532 if (!irqchip_in_kernel(vcpu->kvm)) 4663 if (!irqchip_in_kernel(vcpu->kvm))
4533 kvm_set_cr8(vcpu, kvm_run->cr8); 4664 kvm_set_cr8(vcpu, kvm_run->cr8);
4534 4665
4535 if (vcpu->arch.pio.cur_count) { 4666 if (vcpu->arch.pio.count || vcpu->mmio_needed ||
4536 r = complete_pio(vcpu); 4667 vcpu->arch.emulate_ctxt.restart) {
4537 if (r) 4668 if (vcpu->mmio_needed) {
4538 goto out; 4669 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
4539 } 4670 vcpu->mmio_read_completed = 1;
4540 if (vcpu->mmio_needed) { 4671 vcpu->mmio_needed = 0;
4541 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); 4672 }
4542 vcpu->mmio_read_completed = 1;
4543 vcpu->mmio_needed = 0;
4544
4545 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); 4673 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
4546 r = emulate_instruction(vcpu, vcpu->arch.mmio_fault_cr2, 0, 4674 r = emulate_instruction(vcpu, 0, 0, EMULTYPE_NO_DECODE);
4547 EMULTYPE_NO_DECODE);
4548 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); 4675 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
4549 if (r == EMULATE_DO_MMIO) { 4676 if (r == EMULATE_DO_MMIO) {
4550 /*
4551 * Read-modify-write. Back to userspace.
4552 */
4553 r = 0; 4677 r = 0;
4554 goto out; 4678 goto out;
4555 } 4679 }
@@ -4632,12 +4756,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
4632 return 0; 4756 return 0;
4633} 4757}
4634 4758
4635void kvm_get_segment(struct kvm_vcpu *vcpu,
4636 struct kvm_segment *var, int seg)
4637{
4638 kvm_x86_ops->get_segment(vcpu, var, seg);
4639}
4640
4641void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) 4759void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
4642{ 4760{
4643 struct kvm_segment cs; 4761 struct kvm_segment cs;
@@ -4651,7 +4769,7 @@ EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
4651int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, 4769int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
4652 struct kvm_sregs *sregs) 4770 struct kvm_sregs *sregs)
4653{ 4771{
4654 struct descriptor_table dt; 4772 struct desc_ptr dt;
4655 4773
4656 vcpu_load(vcpu); 4774 vcpu_load(vcpu);
4657 4775
@@ -4666,11 +4784,11 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
4666 kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); 4784 kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
4667 4785
4668 kvm_x86_ops->get_idt(vcpu, &dt); 4786 kvm_x86_ops->get_idt(vcpu, &dt);
4669 sregs->idt.limit = dt.limit; 4787 sregs->idt.limit = dt.size;
4670 sregs->idt.base = dt.base; 4788 sregs->idt.base = dt.address;
4671 kvm_x86_ops->get_gdt(vcpu, &dt); 4789 kvm_x86_ops->get_gdt(vcpu, &dt);
4672 sregs->gdt.limit = dt.limit; 4790 sregs->gdt.limit = dt.size;
4673 sregs->gdt.base = dt.base; 4791 sregs->gdt.base = dt.address;
4674 4792
4675 sregs->cr0 = kvm_read_cr0(vcpu); 4793 sregs->cr0 = kvm_read_cr0(vcpu);
4676 sregs->cr2 = vcpu->arch.cr2; 4794 sregs->cr2 = vcpu->arch.cr2;
@@ -4709,559 +4827,33 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
4709 return 0; 4827 return 0;
4710} 4828}
4711 4829
4712static void kvm_set_segment(struct kvm_vcpu *vcpu, 4830int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
4713 struct kvm_segment *var, int seg) 4831 bool has_error_code, u32 error_code)
4714{
4715 kvm_x86_ops->set_segment(vcpu, var, seg);
4716}
4717
4718static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector,
4719 struct kvm_segment *kvm_desct)
4720{
4721 kvm_desct->base = get_desc_base(seg_desc);
4722 kvm_desct->limit = get_desc_limit(seg_desc);
4723 if (seg_desc->g) {
4724 kvm_desct->limit <<= 12;
4725 kvm_desct->limit |= 0xfff;
4726 }
4727 kvm_desct->selector = selector;
4728 kvm_desct->type = seg_desc->type;
4729 kvm_desct->present = seg_desc->p;
4730 kvm_desct->dpl = seg_desc->dpl;
4731 kvm_desct->db = seg_desc->d;
4732 kvm_desct->s = seg_desc->s;
4733 kvm_desct->l = seg_desc->l;
4734 kvm_desct->g = seg_desc->g;
4735 kvm_desct->avl = seg_desc->avl;
4736 if (!selector)
4737 kvm_desct->unusable = 1;
4738 else
4739 kvm_desct->unusable = 0;
4740 kvm_desct->padding = 0;
4741}
4742
4743static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu,
4744 u16 selector,
4745 struct descriptor_table *dtable)
4746{
4747 if (selector & 1 << 2) {
4748 struct kvm_segment kvm_seg;
4749
4750 kvm_get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR);
4751
4752 if (kvm_seg.unusable)
4753 dtable->limit = 0;
4754 else
4755 dtable->limit = kvm_seg.limit;
4756 dtable->base = kvm_seg.base;
4757 }
4758 else
4759 kvm_x86_ops->get_gdt(vcpu, dtable);
4760}
4761
4762/* allowed just for 8 bytes segments */
4763static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
4764 struct desc_struct *seg_desc)
4765{
4766 struct descriptor_table dtable;
4767 u16 index = selector >> 3;
4768 int ret;
4769 u32 err;
4770 gva_t addr;
4771
4772 get_segment_descriptor_dtable(vcpu, selector, &dtable);
4773
4774 if (dtable.limit < index * 8 + 7) {
4775 kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc);
4776 return X86EMUL_PROPAGATE_FAULT;
4777 }
4778 addr = dtable.base + index * 8;
4779 ret = kvm_read_guest_virt_system(addr, seg_desc, sizeof(*seg_desc),
4780 vcpu, &err);
4781 if (ret == X86EMUL_PROPAGATE_FAULT)
4782 kvm_inject_page_fault(vcpu, addr, err);
4783
4784 return ret;
4785}
4786
4787/* allowed just for 8 bytes segments */
4788static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
4789 struct desc_struct *seg_desc)
4790{
4791 struct descriptor_table dtable;
4792 u16 index = selector >> 3;
4793
4794 get_segment_descriptor_dtable(vcpu, selector, &dtable);
4795
4796 if (dtable.limit < index * 8 + 7)
4797 return 1;
4798 return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu, NULL);
4799}
4800
4801static gpa_t get_tss_base_addr_write(struct kvm_vcpu *vcpu,
4802 struct desc_struct *seg_desc)
4803{
4804 u32 base_addr = get_desc_base(seg_desc);
4805
4806 return kvm_mmu_gva_to_gpa_write(vcpu, base_addr, NULL);
4807}
4808
4809static gpa_t get_tss_base_addr_read(struct kvm_vcpu *vcpu,
4810 struct desc_struct *seg_desc)
4811{
4812 u32 base_addr = get_desc_base(seg_desc);
4813
4814 return kvm_mmu_gva_to_gpa_read(vcpu, base_addr, NULL);
4815}
4816
4817static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg)
4818{
4819 struct kvm_segment kvm_seg;
4820
4821 kvm_get_segment(vcpu, &kvm_seg, seg);
4822 return kvm_seg.selector;
4823}
4824
4825static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg)
4826{
4827 struct kvm_segment segvar = {
4828 .base = selector << 4,
4829 .limit = 0xffff,
4830 .selector = selector,
4831 .type = 3,
4832 .present = 1,
4833 .dpl = 3,
4834 .db = 0,
4835 .s = 1,
4836 .l = 0,
4837 .g = 0,
4838 .avl = 0,
4839 .unusable = 0,
4840 };
4841 kvm_x86_ops->set_segment(vcpu, &segvar, seg);
4842 return X86EMUL_CONTINUE;
4843}
4844
4845static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg)
4846{ 4832{
4847 return (seg != VCPU_SREG_LDTR) && 4833 int cs_db, cs_l, ret;
4848 (seg != VCPU_SREG_TR) && 4834 cache_all_regs(vcpu);
4849 (kvm_get_rflags(vcpu) & X86_EFLAGS_VM);
4850}
4851
4852int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg)
4853{
4854 struct kvm_segment kvm_seg;
4855 struct desc_struct seg_desc;
4856 u8 dpl, rpl, cpl;
4857 unsigned err_vec = GP_VECTOR;
4858 u32 err_code = 0;
4859 bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */
4860 int ret;
4861 4835
4862 if (is_vm86_segment(vcpu, seg) || !is_protmode(vcpu)) 4836 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
4863 return kvm_load_realmode_segment(vcpu, selector, seg);
4864 4837
4865 /* NULL selector is not valid for TR, CS and SS */ 4838 vcpu->arch.emulate_ctxt.vcpu = vcpu;
4866 if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR) 4839 vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
4867 && null_selector) 4840 vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu);
4868 goto exception; 4841 vcpu->arch.emulate_ctxt.mode =
4842 (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
4843 (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
4844 ? X86EMUL_MODE_VM86 : cs_l
4845 ? X86EMUL_MODE_PROT64 : cs_db
4846 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
4869 4847
4870 /* TR should be in GDT only */ 4848 ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, &emulate_ops,
4871 if (seg == VCPU_SREG_TR && (selector & (1 << 2))) 4849 tss_selector, reason, has_error_code,
4872 goto exception; 4850 error_code);
4873 4851
4874 ret = load_guest_segment_descriptor(vcpu, selector, &seg_desc);
4875 if (ret) 4852 if (ret)
4876 return ret; 4853 return EMULATE_FAIL;
4877
4878 seg_desct_to_kvm_desct(&seg_desc, selector, &kvm_seg);
4879
4880 if (null_selector) { /* for NULL selector skip all following checks */
4881 kvm_seg.unusable = 1;
4882 goto load;
4883 }
4884
4885 err_code = selector & 0xfffc;
4886 err_vec = GP_VECTOR;
4887
4888 /* can't load a system descriptor into a segment selector */
4889 if (seg <= VCPU_SREG_GS && !kvm_seg.s)
4890 goto exception;
4891
4892 if (!kvm_seg.present) {
4893 err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR;
4894 goto exception;
4895 }
4896
4897 rpl = selector & 3;
4898 dpl = kvm_seg.dpl;
4899 cpl = kvm_x86_ops->get_cpl(vcpu);
4900
4901 switch (seg) {
4902 case VCPU_SREG_SS:
4903 /*
4904 * segment is not a writable data segment or segment
4905 * selector's RPL != CPL or segment selector's RPL != CPL
4906 */
4907 if (rpl != cpl || (kvm_seg.type & 0xa) != 0x2 || dpl != cpl)
4908 goto exception;
4909 break;
4910 case VCPU_SREG_CS:
4911 if (!(kvm_seg.type & 8))
4912 goto exception;
4913
4914 if (kvm_seg.type & 4) {
4915 /* conforming */
4916 if (dpl > cpl)
4917 goto exception;
4918 } else {
4919 /* nonconforming */
4920 if (rpl > cpl || dpl != cpl)
4921 goto exception;
4922 }
4923 /* CS(RPL) <- CPL */
4924 selector = (selector & 0xfffc) | cpl;
4925 break;
4926 case VCPU_SREG_TR:
4927 if (kvm_seg.s || (kvm_seg.type != 1 && kvm_seg.type != 9))
4928 goto exception;
4929 break;
4930 case VCPU_SREG_LDTR:
4931 if (kvm_seg.s || kvm_seg.type != 2)
4932 goto exception;
4933 break;
4934 default: /* DS, ES, FS, or GS */
4935 /*
4936 * segment is not a data or readable code segment or
4937 * ((segment is a data or nonconforming code segment)
4938 * and (both RPL and CPL > DPL))
4939 */
4940 if ((kvm_seg.type & 0xa) == 0x8 ||
4941 (((kvm_seg.type & 0xc) != 0xc) && (rpl > dpl && cpl > dpl)))
4942 goto exception;
4943 break;
4944 }
4945
4946 if (!kvm_seg.unusable && kvm_seg.s) {
4947 /* mark segment as accessed */
4948 kvm_seg.type |= 1;
4949 seg_desc.type |= 1;
4950 save_guest_segment_descriptor(vcpu, selector, &seg_desc);
4951 }
4952load:
4953 kvm_set_segment(vcpu, &kvm_seg, seg);
4954 return X86EMUL_CONTINUE;
4955exception:
4956 kvm_queue_exception_e(vcpu, err_vec, err_code);
4957 return X86EMUL_PROPAGATE_FAULT;
4958}
4959
4960static void save_state_to_tss32(struct kvm_vcpu *vcpu,
4961 struct tss_segment_32 *tss)
4962{
4963 tss->cr3 = vcpu->arch.cr3;
4964 tss->eip = kvm_rip_read(vcpu);
4965 tss->eflags = kvm_get_rflags(vcpu);
4966 tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
4967 tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
4968 tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX);
4969 tss->ebx = kvm_register_read(vcpu, VCPU_REGS_RBX);
4970 tss->esp = kvm_register_read(vcpu, VCPU_REGS_RSP);
4971 tss->ebp = kvm_register_read(vcpu, VCPU_REGS_RBP);
4972 tss->esi = kvm_register_read(vcpu, VCPU_REGS_RSI);
4973 tss->edi = kvm_register_read(vcpu, VCPU_REGS_RDI);
4974 tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
4975 tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
4976 tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
4977 tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
4978 tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS);
4979 tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS);
4980 tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR);
4981}
4982
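save_state_to_tss32() fills a struct tss_segment_32 that mirrors the hardware 32-bit TSS (104 bytes, which is why kvm_task_switch() below rejects a new TSS descriptor whose limit is under 0x67). A rough sketch of that layout, following the SDM; the authoritative kernel definition lives in arch/x86/kvm/tss.h.

	#include <stdint.h>

	/* Rough sketch of the hardware 32-bit TSS layout (offsets per the SDM). */
	struct tss32_sketch {
		uint32_t prev_task_link;		/* 0x00: back link, written on CALL/gate switches */
		uint32_t esp0, ss0;			/* 0x04: ring-0 stack */
		uint32_t esp1, ss1, esp2, ss2;		/* 0x0c: ring-1/2 stacks */
		uint32_t cr3;				/* 0x1c */
		uint32_t eip, eflags;			/* 0x20 */
		uint32_t eax, ecx, edx, ebx;		/* 0x28 */
		uint32_t esp, ebp, esi, edi;		/* 0x38 */
		uint32_t es, cs, ss, ds, fs, gs;	/* 0x48: segment selectors */
		uint32_t ldt_selector;			/* 0x60 */
		uint16_t t;				/* 0x64: debug trap flag */
		uint16_t io_map;			/* 0x66: I/O map base; descriptor limit must be >= 0x67 */
	};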
4983static void kvm_load_segment_selector(struct kvm_vcpu *vcpu, u16 sel, int seg)
4984{
4985 struct kvm_segment kvm_seg;
4986 kvm_get_segment(vcpu, &kvm_seg, seg);
4987 kvm_seg.selector = sel;
4988 kvm_set_segment(vcpu, &kvm_seg, seg);
4989}
4990
4991static int load_state_from_tss32(struct kvm_vcpu *vcpu,
4992 struct tss_segment_32 *tss)
4993{
4994 kvm_set_cr3(vcpu, tss->cr3);
4995
4996 kvm_rip_write(vcpu, tss->eip);
4997 kvm_set_rflags(vcpu, tss->eflags | 2);
4998
4999 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax);
5000 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx);
5001 kvm_register_write(vcpu, VCPU_REGS_RDX, tss->edx);
5002 kvm_register_write(vcpu, VCPU_REGS_RBX, tss->ebx);
5003 kvm_register_write(vcpu, VCPU_REGS_RSP, tss->esp);
5004 kvm_register_write(vcpu, VCPU_REGS_RBP, tss->ebp);
5005 kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi);
5006 kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi);
5007
5008 /*
5009 * SDM says that segment selectors are loaded before segment
5010 * descriptors
5011 */
5012 kvm_load_segment_selector(vcpu, tss->ldt_selector, VCPU_SREG_LDTR);
5013 kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES);
5014 kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS);
5015 kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS);
5016 kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS);
5017 kvm_load_segment_selector(vcpu, tss->fs, VCPU_SREG_FS);
5018 kvm_load_segment_selector(vcpu, tss->gs, VCPU_SREG_GS);
5019
5020 /*
5021	 * Now load segment descriptors. If a fault happens at this stage,
5022	 * it is handled in the context of the new task
5023 */
5024 if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, VCPU_SREG_LDTR))
5025 return 1;
5026
5027 if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES))
5028 return 1;
5029
5030 if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS))
5031 return 1;
5032
5033 if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS))
5034 return 1;
5035
5036 if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS))
5037 return 1;
5038
5039 if (kvm_load_segment_descriptor(vcpu, tss->fs, VCPU_SREG_FS))
5040 return 1;
5041
5042 if (kvm_load_segment_descriptor(vcpu, tss->gs, VCPU_SREG_GS))
5043 return 1;
5044 return 0;
5045}
5046
5047static void save_state_to_tss16(struct kvm_vcpu *vcpu,
5048 struct tss_segment_16 *tss)
5049{
5050 tss->ip = kvm_rip_read(vcpu);
5051 tss->flag = kvm_get_rflags(vcpu);
5052 tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX);
5053 tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX);
5054 tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX);
5055 tss->bx = kvm_register_read(vcpu, VCPU_REGS_RBX);
5056 tss->sp = kvm_register_read(vcpu, VCPU_REGS_RSP);
5057 tss->bp = kvm_register_read(vcpu, VCPU_REGS_RBP);
5058 tss->si = kvm_register_read(vcpu, VCPU_REGS_RSI);
5059 tss->di = kvm_register_read(vcpu, VCPU_REGS_RDI);
5060
5061 tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
5062 tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
5063 tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
5064 tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
5065 tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR);
5066}
5067
5068static int load_state_from_tss16(struct kvm_vcpu *vcpu,
5069 struct tss_segment_16 *tss)
5070{
5071 kvm_rip_write(vcpu, tss->ip);
5072 kvm_set_rflags(vcpu, tss->flag | 2);
5073 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax);
5074 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx);
5075 kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx);
5076 kvm_register_write(vcpu, VCPU_REGS_RBX, tss->bx);
5077 kvm_register_write(vcpu, VCPU_REGS_RSP, tss->sp);
5078 kvm_register_write(vcpu, VCPU_REGS_RBP, tss->bp);
5079 kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si);
5080 kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di);
5081
5082 /*
5083 * SDM says that segment selectors are loaded before segment
5084 * descriptors
5085 */
5086 kvm_load_segment_selector(vcpu, tss->ldt, VCPU_SREG_LDTR);
5087 kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES);
5088 kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS);
5089 kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS);
5090 kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS);
5091
5092 /*
5093	 * Now load segment descriptors. If a fault happens at this stage,
5094	 * it is handled in the context of the new task
5095 */
5096 if (kvm_load_segment_descriptor(vcpu, tss->ldt, VCPU_SREG_LDTR))
5097 return 1;
5098
5099 if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES))
5100 return 1;
5101
5102 if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS))
5103 return 1;
5104
5105 if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS))
5106 return 1;
5107
5108 if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS))
5109 return 1;
5110 return 0;
5111}
5112
5113static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
5114 u16 old_tss_sel, u32 old_tss_base,
5115 struct desc_struct *nseg_desc)
5116{
5117 struct tss_segment_16 tss_segment_16;
5118 int ret = 0;
5119
5120 if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
5121 sizeof tss_segment_16))
5122 goto out;
5123
5124 save_state_to_tss16(vcpu, &tss_segment_16);
5125
5126 if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
5127 sizeof tss_segment_16))
5128 goto out;
5129
5130 if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc),
5131 &tss_segment_16, sizeof tss_segment_16))
5132 goto out;
5133
5134 if (old_tss_sel != 0xffff) {
5135 tss_segment_16.prev_task_link = old_tss_sel;
5136
5137 if (kvm_write_guest(vcpu->kvm,
5138 get_tss_base_addr_write(vcpu, nseg_desc),
5139 &tss_segment_16.prev_task_link,
5140 sizeof tss_segment_16.prev_task_link))
5141 goto out;
5142 }
5143
5144 if (load_state_from_tss16(vcpu, &tss_segment_16))
5145 goto out;
5146
5147 ret = 1;
5148out:
5149 return ret;
5150}
5151
5152static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
5153 u16 old_tss_sel, u32 old_tss_base,
5154 struct desc_struct *nseg_desc)
5155{
5156 struct tss_segment_32 tss_segment_32;
5157 int ret = 0;
5158
5159 if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
5160 sizeof tss_segment_32))
5161 goto out;
5162
5163 save_state_to_tss32(vcpu, &tss_segment_32);
5164
5165 if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
5166 sizeof tss_segment_32))
5167 goto out;
5168
5169 if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc),
5170 &tss_segment_32, sizeof tss_segment_32))
5171 goto out;
5172
5173 if (old_tss_sel != 0xffff) {
5174 tss_segment_32.prev_task_link = old_tss_sel;
5175
5176 if (kvm_write_guest(vcpu->kvm,
5177 get_tss_base_addr_write(vcpu, nseg_desc),
5178 &tss_segment_32.prev_task_link,
5179 sizeof tss_segment_32.prev_task_link))
5180 goto out;
5181 }
5182
5183 if (load_state_from_tss32(vcpu, &tss_segment_32))
5184 goto out;
5185
5186 ret = 1;
5187out:
5188 return ret;
5189}
5190
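kvm_task_switch() below applies different bookkeeping depending on why the switch happens (IRET, JMP, CALL or a task gate). A compact restatement of those rules, assuming the TASK_SWITCH_* values from arch/x86/include/asm/kvm_host.h; the struct and helper are illustrative only.

	#include <stdbool.h>

	/* Illustrative summary of kvm_task_switch()'s per-reason bookkeeping. */
	struct ts_effects {
		bool clear_old_busy;	/* clear the B bit in the old TSS descriptor */
		bool set_new_busy;	/* set the B bit in the new TSS descriptor */
		bool set_nt;		/* set EFLAGS.NT in the new context */
		bool write_back_link;	/* store the old TSS selector in the new TSS */
	};

	static struct ts_effects ts_effects_for(int reason)
	{
		switch (reason) {
		case TASK_SWITCH_IRET:	/* returning: old task left non-busy, NT cleared */
			return (struct ts_effects){ .clear_old_busy = true };
		case TASK_SWITCH_JMP:	/* jump: old task abandoned, new one becomes busy */
			return (struct ts_effects){ .clear_old_busy = true,
						    .set_new_busy = true };
		case TASK_SWITCH_CALL:
		case TASK_SWITCH_GATE:	/* nested switch: link back to old task, set NT */
		default:
			return (struct ts_effects){ .set_new_busy = true,
						    .set_nt = true,
						    .write_back_link = true };
		}
	}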
5191int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
5192{
5193 struct kvm_segment tr_seg;
5194 struct desc_struct cseg_desc;
5195 struct desc_struct nseg_desc;
5196 int ret = 0;
5197 u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR);
5198 u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR);
5199
5200 old_tss_base = kvm_mmu_gva_to_gpa_write(vcpu, old_tss_base, NULL);
5201
5202	/* FIXME: Handle errors. Failure to read either TSS or its
5203	 * descriptor should generate a page fault.
5204 */
5205 if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc))
5206 goto out;
5207
5208 if (load_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc))
5209 goto out;
5210
5211 if (reason != TASK_SWITCH_IRET) {
5212 int cpl;
5213
5214 cpl = kvm_x86_ops->get_cpl(vcpu);
5215 if ((tss_selector & 3) > nseg_desc.dpl || cpl > nseg_desc.dpl) {
5216 kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
5217 return 1;
5218 }
5219 }
5220 4854
5221 if (!nseg_desc.p || get_desc_limit(&nseg_desc) < 0x67) { 4855 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
5222 kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc); 4856 return EMULATE_DONE;
5223 return 1;
5224 }
5225
5226 if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
5227		cseg_desc.type &= ~(1 << 1); /* clear the busy (B) flag */
5228 save_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc);
5229 }
5230
5231 if (reason == TASK_SWITCH_IRET) {
5232 u32 eflags = kvm_get_rflags(vcpu);
5233 kvm_set_rflags(vcpu, eflags & ~X86_EFLAGS_NT);
5234 }
5235
5236	/* set back link to prev task only if NT bit is set in eflags;
5237	   note that old_tss_sel is not used after this point */
5238 if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
5239 old_tss_sel = 0xffff;
5240
5241 if (nseg_desc.type & 8)
5242 ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel,
5243 old_tss_base, &nseg_desc);
5244 else
5245 ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_sel,
5246 old_tss_base, &nseg_desc);
5247
5248 if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) {
5249 u32 eflags = kvm_get_rflags(vcpu);
5250 kvm_set_rflags(vcpu, eflags | X86_EFLAGS_NT);
5251 }
5252
5253 if (reason != TASK_SWITCH_IRET) {
5254 nseg_desc.type |= (1 << 1);
5255 save_guest_segment_descriptor(vcpu, tss_selector,
5256 &nseg_desc);
5257 }
5258
5259 kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0(vcpu) | X86_CR0_TS);
5260 seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg);
5261 tr_seg.type = 11;
5262 kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR);
5263out:
5264 return ret;
5265} 4857}
5266EXPORT_SYMBOL_GPL(kvm_task_switch); 4858EXPORT_SYMBOL_GPL(kvm_task_switch);
5267 4859
@@ -5270,15 +4862,15 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
5270{ 4862{
5271 int mmu_reset_needed = 0; 4863 int mmu_reset_needed = 0;
5272 int pending_vec, max_bits; 4864 int pending_vec, max_bits;
5273 struct descriptor_table dt; 4865 struct desc_ptr dt;
5274 4866
5275 vcpu_load(vcpu); 4867 vcpu_load(vcpu);
5276 4868
5277 dt.limit = sregs->idt.limit; 4869 dt.size = sregs->idt.limit;
5278 dt.base = sregs->idt.base; 4870 dt.address = sregs->idt.base;
5279 kvm_x86_ops->set_idt(vcpu, &dt); 4871 kvm_x86_ops->set_idt(vcpu, &dt);
5280 dt.limit = sregs->gdt.limit; 4872 dt.size = sregs->gdt.limit;
5281 dt.base = sregs->gdt.base; 4873 dt.address = sregs->gdt.base;
5282 kvm_x86_ops->set_gdt(vcpu, &dt); 4874 kvm_x86_ops->set_gdt(vcpu, &dt);
5283 4875
5284 vcpu->arch.cr2 = sregs->cr2; 4876 vcpu->arch.cr2 = sregs->cr2;
@@ -5377,11 +4969,9 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
5377 vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK); 4969 vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK);
5378 } 4970 }
5379 4971
5380 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { 4972 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
5381 vcpu->arch.singlestep_cs = 4973 vcpu->arch.singlestep_rip = kvm_rip_read(vcpu) +
5382 get_segment_selector(vcpu, VCPU_SREG_CS); 4974 get_segment_base(vcpu, VCPU_SREG_CS);
5383 vcpu->arch.singlestep_rip = kvm_rip_read(vcpu);
5384 }
5385 4975
5386 /* 4976 /*
5387 * Trigger an rflags update that will inject or remove the trace 4977 * Trigger an rflags update that will inject or remove the trace
@@ -5872,13 +5462,22 @@ int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
5872 return kvm_x86_ops->interrupt_allowed(vcpu); 5462 return kvm_x86_ops->interrupt_allowed(vcpu);
5873} 5463}
5874 5464
5465bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip)
5466{
5467 unsigned long current_rip = kvm_rip_read(vcpu) +
5468 get_segment_base(vcpu, VCPU_SREG_CS);
5469
5470 return current_rip == linear_rip;
5471}
5472EXPORT_SYMBOL_GPL(kvm_is_linear_rip);
5473
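kvm_is_linear_rip() compares linear addresses (CS base plus RIP) rather than raw RIP values, so the single-step re-arm in kvm_set_rflags() below stops matching once the guest changes code segments. A toy illustration with made-up values:

	#include <stdio.h>

	/* Illustration only: why the single-step bookkeeping compares linear RIPs. */
	static unsigned long linear_rip(unsigned long cs_base, unsigned long rip)
	{
		return cs_base + rip;
	}

	int main(void)
	{
		unsigned long armed = linear_rip(0x10000, 0x234);	/* 0x10234 */

		/* Same CS base, same RIP: still at the armed instruction. */
		printf("%d\n", linear_rip(0x10000, 0x234) == armed);	/* 1 */

		/* A far transfer changed the CS base; RIP alone would still match. */
		printf("%d\n", linear_rip(0x20000, 0x234) == armed);	/* 0 */
		return 0;
	}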
5875unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu) 5474unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
5876{ 5475{
5877 unsigned long rflags; 5476 unsigned long rflags;
5878 5477
5879 rflags = kvm_x86_ops->get_rflags(vcpu); 5478 rflags = kvm_x86_ops->get_rflags(vcpu);
5880 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) 5479 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
5881 rflags &= ~(unsigned long)(X86_EFLAGS_TF | X86_EFLAGS_RF); 5480 rflags &= ~X86_EFLAGS_TF;
5882 return rflags; 5481 return rflags;
5883} 5482}
5884EXPORT_SYMBOL_GPL(kvm_get_rflags); 5483EXPORT_SYMBOL_GPL(kvm_get_rflags);
@@ -5886,10 +5485,8 @@ EXPORT_SYMBOL_GPL(kvm_get_rflags);
5886void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) 5485void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
5887{ 5486{
5888 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP && 5487 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
5889 vcpu->arch.singlestep_cs == 5488 kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
5890 get_segment_selector(vcpu, VCPU_SREG_CS) && 5489 rflags |= X86_EFLAGS_TF;
5891 vcpu->arch.singlestep_rip == kvm_rip_read(vcpu))
5892 rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
5893 kvm_x86_ops->set_rflags(vcpu, rflags); 5490 kvm_x86_ops->set_rflags(vcpu, rflags);
5894} 5491}
5895EXPORT_SYMBOL_GPL(kvm_set_rflags); 5492EXPORT_SYMBOL_GPL(kvm_set_rflags);
@@ -5905,3 +5502,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject);
5905EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit); 5502EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
5906EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga); 5503EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);
5907EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit); 5504EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
5505EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index b7a404722d2b..f4b54458285b 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -65,6 +65,13 @@ static inline int is_paging(struct kvm_vcpu *vcpu)
65 return kvm_read_cr0_bits(vcpu, X86_CR0_PG); 65 return kvm_read_cr0_bits(vcpu, X86_CR0_PG);
66} 66}
67 67
68static inline struct kvm_mem_aliases *kvm_aliases(struct kvm *kvm)
69{
70 return rcu_dereference_check(kvm->arch.aliases,
71 srcu_read_lock_held(&kvm->srcu)
72 || lockdep_is_held(&kvm->slots_lock));
73}
74
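kvm_aliases() asserts that the caller is either in an SRCU read-side critical section or holds slots_lock. A hedged usage sketch; the caller below is illustrative, and the naliases field is assumed to follow the existing struct kvm_mem_aliases definition.

	/* Illustrative caller only: dereferencing the alias table under SRCU. */
	static int count_mem_aliases(struct kvm *kvm)
	{
		struct kvm_mem_aliases *aliases;
		int idx, n;

		idx = srcu_read_lock(&kvm->srcu);	/* satisfies rcu_dereference_check() */
		aliases = kvm_aliases(kvm);
		n = aliases ? aliases->naliases : 0;
		srcu_read_unlock(&kvm->srcu, idx);

		return n;
	}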
68void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); 75void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
69void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); 76void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
70 77