author     Thomas Gleixner <tglx@linutronix.de>   2011-02-22 12:24:26 -0500
committer  Thomas Gleixner <tglx@linutronix.de>   2011-02-22 12:41:48 -0500
commit     695884fb8acd9857e0e7120ccb2150e30f4b8fef
tree       49aa424c1a021ce432e9fa5ea29d37a23e4e30cc /arch/x86/kvm
parent     5df91509d324d44cfb11e55d9cb02fe18b53b045
parent     04bea68b2f0eeebb089ecc67b618795925268b4a
Merge branch 'devicetree/for-x86' of git://git.secretlab.ca/git/linux-2.6 into x86/platform
Reason: x86 devicetree support for ce4100 depends on those device tree changes scheduled for .39.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r--  arch/x86/kvm/Kconfig           |    1
-rw-r--r--  arch/x86/kvm/Makefile          |    3
-rw-r--r--  arch/x86/kvm/emulate.c         |  367
-rw-r--r--  arch/x86/kvm/i8259.c           |    2
-rw-r--r--  arch/x86/kvm/kvm_cache_regs.h  |   22
-rw-r--r--  arch/x86/kvm/lapic.c           |    3
-rw-r--r--  arch/x86/kvm/mmu.c             |  379
-rw-r--r--  arch/x86/kvm/mmu_audit.c       |   39
-rw-r--r--  arch/x86/kvm/paging_tmpl.h     |  156
-rw-r--r--  arch/x86/kvm/svm.c             |  861
-rw-r--r--  arch/x86/kvm/trace.h           |   17
-rw-r--r--  arch/x86/kvm/vmx.c             |  180
-rw-r--r--  arch/x86/kvm/x86.c             |  487
-rw-r--r--  arch/x86/kvm/x86.h             |    5
14 files changed, 1627 insertions(+), 895 deletions(-)
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index ddc131ff438f..50f63648ce1b 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -28,6 +28,7 @@ config KVM
28 select HAVE_KVM_IRQCHIP 28 select HAVE_KVM_IRQCHIP
29 select HAVE_KVM_EVENTFD 29 select HAVE_KVM_EVENTFD
30 select KVM_APIC_ARCHITECTURE 30 select KVM_APIC_ARCHITECTURE
31 select KVM_ASYNC_PF
31 select USER_RETURN_NOTIFIER 32 select USER_RETURN_NOTIFIER
32 select KVM_MMIO 33 select KVM_MMIO
33 ---help--- 34 ---help---
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 31a7035c4bd9..f15501f431c8 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -1,5 +1,5 @@
1 1
2EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm 2ccflags-y += -Ivirt/kvm -Iarch/x86/kvm
3 3
4CFLAGS_x86.o := -I. 4CFLAGS_x86.o := -I.
5CFLAGS_svm.o := -I. 5CFLAGS_svm.o := -I.
@@ -9,6 +9,7 @@ kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
9 coalesced_mmio.o irq_comm.o eventfd.o \ 9 coalesced_mmio.o irq_comm.o eventfd.o \
10 assigned-dev.o) 10 assigned-dev.o)
11kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o) 11kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o)
12kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o)
12 13
13kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ 14kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
14 i8254.o timer.o 15 i8254.o timer.o
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 38b6e8dafaff..caf966781d25 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -20,16 +20,8 @@
20 * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4 20 * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
21 */ 21 */
22 22
23#ifndef __KERNEL__
24#include <stdio.h>
25#include <stdint.h>
26#include <public/xen.h>
27#define DPRINTF(_f, _a ...) printf(_f , ## _a)
28#else
29#include <linux/kvm_host.h> 23#include <linux/kvm_host.h>
30#include "kvm_cache_regs.h" 24#include "kvm_cache_regs.h"
31#define DPRINTF(x...) do {} while (0)
32#endif
33#include <linux/module.h> 25#include <linux/module.h>
34#include <asm/kvm_emulate.h> 26#include <asm/kvm_emulate.h>
35 27
@@ -418,9 +410,9 @@ address_mask(struct decode_cache *c, unsigned long reg)
418} 410}
419 411
420static inline unsigned long 412static inline unsigned long
421register_address(struct decode_cache *c, unsigned long base, unsigned long reg) 413register_address(struct decode_cache *c, unsigned long reg)
422{ 414{
423 return base + address_mask(c, reg); 415 return address_mask(c, reg);
424} 416}
425 417
426static inline void 418static inline void
@@ -452,60 +444,55 @@ static unsigned long seg_base(struct x86_emulate_ctxt *ctxt,
452 return ops->get_cached_segment_base(seg, ctxt->vcpu); 444 return ops->get_cached_segment_base(seg, ctxt->vcpu);
453} 445}
454 446
455static unsigned long seg_override_base(struct x86_emulate_ctxt *ctxt, 447static unsigned seg_override(struct x86_emulate_ctxt *ctxt,
456 struct x86_emulate_ops *ops, 448 struct x86_emulate_ops *ops,
457 struct decode_cache *c) 449 struct decode_cache *c)
458{ 450{
459 if (!c->has_seg_override) 451 if (!c->has_seg_override)
460 return 0; 452 return 0;
461 453
462 return seg_base(ctxt, ops, c->seg_override); 454 return c->seg_override;
463} 455}
464 456
465static unsigned long es_base(struct x86_emulate_ctxt *ctxt, 457static ulong linear(struct x86_emulate_ctxt *ctxt,
466 struct x86_emulate_ops *ops) 458 struct segmented_address addr)
467{ 459{
468 return seg_base(ctxt, ops, VCPU_SREG_ES); 460 struct decode_cache *c = &ctxt->decode;
469} 461 ulong la;
470
471static unsigned long ss_base(struct x86_emulate_ctxt *ctxt,
472 struct x86_emulate_ops *ops)
473{
474 return seg_base(ctxt, ops, VCPU_SREG_SS);
475}
476 462
477static void emulate_exception(struct x86_emulate_ctxt *ctxt, int vec, 463 la = seg_base(ctxt, ctxt->ops, addr.seg) + addr.ea;
478 u32 error, bool valid) 464 if (c->ad_bytes != 8)
479{ 465 la &= (u32)-1;
480 ctxt->exception = vec; 466 return la;
481 ctxt->error_code = error;
482 ctxt->error_code_valid = valid;
483} 467}
484 468
485static void emulate_gp(struct x86_emulate_ctxt *ctxt, int err) 469static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec,
470 u32 error, bool valid)
486{ 471{
487 emulate_exception(ctxt, GP_VECTOR, err, true); 472 ctxt->exception.vector = vec;
473 ctxt->exception.error_code = error;
474 ctxt->exception.error_code_valid = valid;
475 return X86EMUL_PROPAGATE_FAULT;
488} 476}
489 477
490static void emulate_pf(struct x86_emulate_ctxt *ctxt) 478static int emulate_gp(struct x86_emulate_ctxt *ctxt, int err)
491{ 479{
492 emulate_exception(ctxt, PF_VECTOR, 0, true); 480 return emulate_exception(ctxt, GP_VECTOR, err, true);
493} 481}
494 482
495static void emulate_ud(struct x86_emulate_ctxt *ctxt) 483static int emulate_ud(struct x86_emulate_ctxt *ctxt)
496{ 484{
497 emulate_exception(ctxt, UD_VECTOR, 0, false); 485 return emulate_exception(ctxt, UD_VECTOR, 0, false);
498} 486}
499 487
500static void emulate_ts(struct x86_emulate_ctxt *ctxt, int err) 488static int emulate_ts(struct x86_emulate_ctxt *ctxt, int err)
501{ 489{
502 emulate_exception(ctxt, TS_VECTOR, err, true); 490 return emulate_exception(ctxt, TS_VECTOR, err, true);
503} 491}
504 492
505static int emulate_de(struct x86_emulate_ctxt *ctxt) 493static int emulate_de(struct x86_emulate_ctxt *ctxt)
506{ 494{
507 emulate_exception(ctxt, DE_VECTOR, 0, false); 495 return emulate_exception(ctxt, DE_VECTOR, 0, false);
508 return X86EMUL_PROPAGATE_FAULT;
509} 496}
510 497
511static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, 498static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
@@ -520,7 +507,7 @@ static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
520 cur_size = fc->end - fc->start; 507 cur_size = fc->end - fc->start;
521 size = min(15UL - cur_size, PAGE_SIZE - offset_in_page(eip)); 508 size = min(15UL - cur_size, PAGE_SIZE - offset_in_page(eip));
522 rc = ops->fetch(ctxt->cs_base + eip, fc->data + cur_size, 509 rc = ops->fetch(ctxt->cs_base + eip, fc->data + cur_size,
523 size, ctxt->vcpu, NULL); 510 size, ctxt->vcpu, &ctxt->exception);
524 if (rc != X86EMUL_CONTINUE) 511 if (rc != X86EMUL_CONTINUE)
525 return rc; 512 return rc;
526 fc->end += size; 513 fc->end += size;
@@ -564,7 +551,7 @@ static void *decode_register(u8 modrm_reg, unsigned long *regs,
564 551
565static int read_descriptor(struct x86_emulate_ctxt *ctxt, 552static int read_descriptor(struct x86_emulate_ctxt *ctxt,
566 struct x86_emulate_ops *ops, 553 struct x86_emulate_ops *ops,
567 ulong addr, 554 struct segmented_address addr,
568 u16 *size, unsigned long *address, int op_bytes) 555 u16 *size, unsigned long *address, int op_bytes)
569{ 556{
570 int rc; 557 int rc;
@@ -572,10 +559,13 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt,
572 if (op_bytes == 2) 559 if (op_bytes == 2)
573 op_bytes = 3; 560 op_bytes = 3;
574 *address = 0; 561 *address = 0;
575 rc = ops->read_std(addr, (unsigned long *)size, 2, ctxt->vcpu, NULL); 562 rc = ops->read_std(linear(ctxt, addr), (unsigned long *)size, 2,
563 ctxt->vcpu, &ctxt->exception);
576 if (rc != X86EMUL_CONTINUE) 564 if (rc != X86EMUL_CONTINUE)
577 return rc; 565 return rc;
578 rc = ops->read_std(addr + 2, address, op_bytes, ctxt->vcpu, NULL); 566 addr.ea += 2;
567 rc = ops->read_std(linear(ctxt, addr), address, op_bytes,
568 ctxt->vcpu, &ctxt->exception);
579 return rc; 569 return rc;
580} 570}
581 571
@@ -768,7 +758,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
768 break; 758 break;
769 } 759 }
770 } 760 }
771 op->addr.mem = modrm_ea; 761 op->addr.mem.ea = modrm_ea;
772done: 762done:
773 return rc; 763 return rc;
774} 764}
@@ -783,13 +773,13 @@ static int decode_abs(struct x86_emulate_ctxt *ctxt,
783 op->type = OP_MEM; 773 op->type = OP_MEM;
784 switch (c->ad_bytes) { 774 switch (c->ad_bytes) {
785 case 2: 775 case 2:
786 op->addr.mem = insn_fetch(u16, 2, c->eip); 776 op->addr.mem.ea = insn_fetch(u16, 2, c->eip);
787 break; 777 break;
788 case 4: 778 case 4:
789 op->addr.mem = insn_fetch(u32, 4, c->eip); 779 op->addr.mem.ea = insn_fetch(u32, 4, c->eip);
790 break; 780 break;
791 case 8: 781 case 8:
792 op->addr.mem = insn_fetch(u64, 8, c->eip); 782 op->addr.mem.ea = insn_fetch(u64, 8, c->eip);
793 break; 783 break;
794 } 784 }
795done: 785done:
@@ -808,7 +798,7 @@ static void fetch_bit_operand(struct decode_cache *c)
808 else if (c->src.bytes == 4) 798 else if (c->src.bytes == 4)
809 sv = (s32)c->src.val & (s32)mask; 799 sv = (s32)c->src.val & (s32)mask;
810 800
811 c->dst.addr.mem += (sv >> 3); 801 c->dst.addr.mem.ea += (sv >> 3);
812 } 802 }
813 803
814 /* only subword offset */ 804 /* only subword offset */
@@ -821,7 +811,6 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt,
821{ 811{
822 int rc; 812 int rc;
823 struct read_cache *mc = &ctxt->decode.mem_read; 813 struct read_cache *mc = &ctxt->decode.mem_read;
824 u32 err;
825 814
826 while (size) { 815 while (size) {
827 int n = min(size, 8u); 816 int n = min(size, 8u);
@@ -829,10 +818,8 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt,
829 if (mc->pos < mc->end) 818 if (mc->pos < mc->end)
830 goto read_cached; 819 goto read_cached;
831 820
832 rc = ops->read_emulated(addr, mc->data + mc->end, n, &err, 821 rc = ops->read_emulated(addr, mc->data + mc->end, n,
833 ctxt->vcpu); 822 &ctxt->exception, ctxt->vcpu);
834 if (rc == X86EMUL_PROPAGATE_FAULT)
835 emulate_pf(ctxt);
836 if (rc != X86EMUL_CONTINUE) 823 if (rc != X86EMUL_CONTINUE)
837 return rc; 824 return rc;
838 mc->end += n; 825 mc->end += n;
@@ -907,19 +894,15 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
907 struct desc_ptr dt; 894 struct desc_ptr dt;
908 u16 index = selector >> 3; 895 u16 index = selector >> 3;
909 int ret; 896 int ret;
910 u32 err;
911 ulong addr; 897 ulong addr;
912 898
913 get_descriptor_table_ptr(ctxt, ops, selector, &dt); 899 get_descriptor_table_ptr(ctxt, ops, selector, &dt);
914 900
915 if (dt.size < index * 8 + 7) { 901 if (dt.size < index * 8 + 7)
916 emulate_gp(ctxt, selector & 0xfffc); 902 return emulate_gp(ctxt, selector & 0xfffc);
917 return X86EMUL_PROPAGATE_FAULT;
918 }
919 addr = dt.address + index * 8; 903 addr = dt.address + index * 8;
920 ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu, &err); 904 ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu,
921 if (ret == X86EMUL_PROPAGATE_FAULT) 905 &ctxt->exception);
922 emulate_pf(ctxt);
923 906
924 return ret; 907 return ret;
925} 908}
@@ -931,21 +914,17 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
931{ 914{
932 struct desc_ptr dt; 915 struct desc_ptr dt;
933 u16 index = selector >> 3; 916 u16 index = selector >> 3;
934 u32 err;
935 ulong addr; 917 ulong addr;
936 int ret; 918 int ret;
937 919
938 get_descriptor_table_ptr(ctxt, ops, selector, &dt); 920 get_descriptor_table_ptr(ctxt, ops, selector, &dt);
939 921
940 if (dt.size < index * 8 + 7) { 922 if (dt.size < index * 8 + 7)
941 emulate_gp(ctxt, selector & 0xfffc); 923 return emulate_gp(ctxt, selector & 0xfffc);
942 return X86EMUL_PROPAGATE_FAULT;
943 }
944 924
945 addr = dt.address + index * 8; 925 addr = dt.address + index * 8;
946 ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err); 926 ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu,
947 if (ret == X86EMUL_PROPAGATE_FAULT) 927 &ctxt->exception);
948 emulate_pf(ctxt);
949 928
950 return ret; 929 return ret;
951} 930}
@@ -1092,7 +1071,6 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
1092{ 1071{
1093 int rc; 1072 int rc;
1094 struct decode_cache *c = &ctxt->decode; 1073 struct decode_cache *c = &ctxt->decode;
1095 u32 err;
1096 1074
1097 switch (c->dst.type) { 1075 switch (c->dst.type) {
1098 case OP_REG: 1076 case OP_REG:
@@ -1101,21 +1079,19 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
1101 case OP_MEM: 1079 case OP_MEM:
1102 if (c->lock_prefix) 1080 if (c->lock_prefix)
1103 rc = ops->cmpxchg_emulated( 1081 rc = ops->cmpxchg_emulated(
1104 c->dst.addr.mem, 1082 linear(ctxt, c->dst.addr.mem),
1105 &c->dst.orig_val, 1083 &c->dst.orig_val,
1106 &c->dst.val, 1084 &c->dst.val,
1107 c->dst.bytes, 1085 c->dst.bytes,
1108 &err, 1086 &ctxt->exception,
1109 ctxt->vcpu); 1087 ctxt->vcpu);
1110 else 1088 else
1111 rc = ops->write_emulated( 1089 rc = ops->write_emulated(
1112 c->dst.addr.mem, 1090 linear(ctxt, c->dst.addr.mem),
1113 &c->dst.val, 1091 &c->dst.val,
1114 c->dst.bytes, 1092 c->dst.bytes,
1115 &err, 1093 &ctxt->exception,
1116 ctxt->vcpu); 1094 ctxt->vcpu);
1117 if (rc == X86EMUL_PROPAGATE_FAULT)
1118 emulate_pf(ctxt);
1119 if (rc != X86EMUL_CONTINUE) 1095 if (rc != X86EMUL_CONTINUE)
1120 return rc; 1096 return rc;
1121 break; 1097 break;
@@ -1137,8 +1113,8 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt,
1137 c->dst.bytes = c->op_bytes; 1113 c->dst.bytes = c->op_bytes;
1138 c->dst.val = c->src.val; 1114 c->dst.val = c->src.val;
1139 register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes); 1115 register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes);
1140 c->dst.addr.mem = register_address(c, ss_base(ctxt, ops), 1116 c->dst.addr.mem.ea = register_address(c, c->regs[VCPU_REGS_RSP]);
1141 c->regs[VCPU_REGS_RSP]); 1117 c->dst.addr.mem.seg = VCPU_SREG_SS;
1142} 1118}
1143 1119
1144static int emulate_pop(struct x86_emulate_ctxt *ctxt, 1120static int emulate_pop(struct x86_emulate_ctxt *ctxt,
@@ -1147,10 +1123,11 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt,
1147{ 1123{
1148 struct decode_cache *c = &ctxt->decode; 1124 struct decode_cache *c = &ctxt->decode;
1149 int rc; 1125 int rc;
1126 struct segmented_address addr;
1150 1127
1151 rc = read_emulated(ctxt, ops, register_address(c, ss_base(ctxt, ops), 1128 addr.ea = register_address(c, c->regs[VCPU_REGS_RSP]);
1152 c->regs[VCPU_REGS_RSP]), 1129 addr.seg = VCPU_SREG_SS;
1153 dest, len); 1130 rc = read_emulated(ctxt, ops, linear(ctxt, addr), dest, len);
1154 if (rc != X86EMUL_CONTINUE) 1131 if (rc != X86EMUL_CONTINUE)
1155 return rc; 1132 return rc;
1156 1133
@@ -1184,10 +1161,8 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
1184 change_mask |= EFLG_IF; 1161 change_mask |= EFLG_IF;
1185 break; 1162 break;
1186 case X86EMUL_MODE_VM86: 1163 case X86EMUL_MODE_VM86:
1187 if (iopl < 3) { 1164 if (iopl < 3)
1188 emulate_gp(ctxt, 0); 1165 return emulate_gp(ctxt, 0);
1189 return X86EMUL_PROPAGATE_FAULT;
1190 }
1191 change_mask |= EFLG_IF; 1166 change_mask |= EFLG_IF;
1192 break; 1167 break;
1193 default: /* real mode */ 1168 default: /* real mode */
@@ -1198,9 +1173,6 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
1198 *(unsigned long *)dest = 1173 *(unsigned long *)dest =
1199 (ctxt->eflags & ~change_mask) | (val & change_mask); 1174 (ctxt->eflags & ~change_mask) | (val & change_mask);
1200 1175
1201 if (rc == X86EMUL_PROPAGATE_FAULT)
1202 emulate_pf(ctxt);
1203
1204 return rc; 1176 return rc;
1205} 1177}
1206 1178
@@ -1287,7 +1259,6 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt,
1287 gva_t cs_addr; 1259 gva_t cs_addr;
1288 gva_t eip_addr; 1260 gva_t eip_addr;
1289 u16 cs, eip; 1261 u16 cs, eip;
1290 u32 err;
1291 1262
1292 /* TODO: Add limit checks */ 1263 /* TODO: Add limit checks */
1293 c->src.val = ctxt->eflags; 1264 c->src.val = ctxt->eflags;
@@ -1317,11 +1288,11 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt,
1317 eip_addr = dt.address + (irq << 2); 1288 eip_addr = dt.address + (irq << 2);
1318 cs_addr = dt.address + (irq << 2) + 2; 1289 cs_addr = dt.address + (irq << 2) + 2;
1319 1290
1320 rc = ops->read_std(cs_addr, &cs, 2, ctxt->vcpu, &err); 1291 rc = ops->read_std(cs_addr, &cs, 2, ctxt->vcpu, &ctxt->exception);
1321 if (rc != X86EMUL_CONTINUE) 1292 if (rc != X86EMUL_CONTINUE)
1322 return rc; 1293 return rc;
1323 1294
1324 rc = ops->read_std(eip_addr, &eip, 2, ctxt->vcpu, &err); 1295 rc = ops->read_std(eip_addr, &eip, 2, ctxt->vcpu, &ctxt->exception);
1325 if (rc != X86EMUL_CONTINUE) 1296 if (rc != X86EMUL_CONTINUE)
1326 return rc; 1297 return rc;
1327 1298
@@ -1370,10 +1341,8 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt,
1370 if (rc != X86EMUL_CONTINUE) 1341 if (rc != X86EMUL_CONTINUE)
1371 return rc; 1342 return rc;
1372 1343
1373 if (temp_eip & ~0xffff) { 1344 if (temp_eip & ~0xffff)
1374 emulate_gp(ctxt, 0); 1345 return emulate_gp(ctxt, 0);
1375 return X86EMUL_PROPAGATE_FAULT;
1376 }
1377 1346
1378 rc = emulate_pop(ctxt, ops, &cs, c->op_bytes); 1347 rc = emulate_pop(ctxt, ops, &cs, c->op_bytes);
1379 1348
@@ -1624,10 +1593,8 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1624 1593
1625 /* syscall is not available in real mode */ 1594 /* syscall is not available in real mode */
1626 if (ctxt->mode == X86EMUL_MODE_REAL || 1595 if (ctxt->mode == X86EMUL_MODE_REAL ||
1627 ctxt->mode == X86EMUL_MODE_VM86) { 1596 ctxt->mode == X86EMUL_MODE_VM86)
1628 emulate_ud(ctxt); 1597 return emulate_ud(ctxt);
1629 return X86EMUL_PROPAGATE_FAULT;
1630 }
1631 1598
1632 setup_syscalls_segments(ctxt, ops, &cs, &ss); 1599 setup_syscalls_segments(ctxt, ops, &cs, &ss);
1633 1600
@@ -1678,34 +1645,26 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1678 u16 cs_sel, ss_sel; 1645 u16 cs_sel, ss_sel;
1679 1646
1680 /* inject #GP if in real mode */ 1647 /* inject #GP if in real mode */
1681 if (ctxt->mode == X86EMUL_MODE_REAL) { 1648 if (ctxt->mode == X86EMUL_MODE_REAL)
1682 emulate_gp(ctxt, 0); 1649 return emulate_gp(ctxt, 0);
1683 return X86EMUL_PROPAGATE_FAULT;
1684 }
1685 1650
1686 /* XXX sysenter/sysexit have not been tested in 64bit mode. 1651 /* XXX sysenter/sysexit have not been tested in 64bit mode.
1687 * Therefore, we inject an #UD. 1652 * Therefore, we inject an #UD.
1688 */ 1653 */
1689 if (ctxt->mode == X86EMUL_MODE_PROT64) { 1654 if (ctxt->mode == X86EMUL_MODE_PROT64)
1690 emulate_ud(ctxt); 1655 return emulate_ud(ctxt);
1691 return X86EMUL_PROPAGATE_FAULT;
1692 }
1693 1656
1694 setup_syscalls_segments(ctxt, ops, &cs, &ss); 1657 setup_syscalls_segments(ctxt, ops, &cs, &ss);
1695 1658
1696 ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); 1659 ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);
1697 switch (ctxt->mode) { 1660 switch (ctxt->mode) {
1698 case X86EMUL_MODE_PROT32: 1661 case X86EMUL_MODE_PROT32:
1699 if ((msr_data & 0xfffc) == 0x0) { 1662 if ((msr_data & 0xfffc) == 0x0)
1700 emulate_gp(ctxt, 0); 1663 return emulate_gp(ctxt, 0);
1701 return X86EMUL_PROPAGATE_FAULT;
1702 }
1703 break; 1664 break;
1704 case X86EMUL_MODE_PROT64: 1665 case X86EMUL_MODE_PROT64:
1705 if (msr_data == 0x0) { 1666 if (msr_data == 0x0)
1706 emulate_gp(ctxt, 0); 1667 return emulate_gp(ctxt, 0);
1707 return X86EMUL_PROPAGATE_FAULT;
1708 }
1709 break; 1668 break;
1710 } 1669 }
1711 1670
@@ -1745,10 +1704,8 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1745 1704
1746 /* inject #GP if in real mode or Virtual 8086 mode */ 1705 /* inject #GP if in real mode or Virtual 8086 mode */
1747 if (ctxt->mode == X86EMUL_MODE_REAL || 1706 if (ctxt->mode == X86EMUL_MODE_REAL ||
1748 ctxt->mode == X86EMUL_MODE_VM86) { 1707 ctxt->mode == X86EMUL_MODE_VM86)
1749 emulate_gp(ctxt, 0); 1708 return emulate_gp(ctxt, 0);
1750 return X86EMUL_PROPAGATE_FAULT;
1751 }
1752 1709
1753 setup_syscalls_segments(ctxt, ops, &cs, &ss); 1710 setup_syscalls_segments(ctxt, ops, &cs, &ss);
1754 1711
@@ -1763,18 +1720,14 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1763 switch (usermode) { 1720 switch (usermode) {
1764 case X86EMUL_MODE_PROT32: 1721 case X86EMUL_MODE_PROT32:
1765 cs_sel = (u16)(msr_data + 16); 1722 cs_sel = (u16)(msr_data + 16);
1766 if ((msr_data & 0xfffc) == 0x0) { 1723 if ((msr_data & 0xfffc) == 0x0)
1767 emulate_gp(ctxt, 0); 1724 return emulate_gp(ctxt, 0);
1768 return X86EMUL_PROPAGATE_FAULT;
1769 }
1770 ss_sel = (u16)(msr_data + 24); 1725 ss_sel = (u16)(msr_data + 24);
1771 break; 1726 break;
1772 case X86EMUL_MODE_PROT64: 1727 case X86EMUL_MODE_PROT64:
1773 cs_sel = (u16)(msr_data + 32); 1728 cs_sel = (u16)(msr_data + 32);
1774 if (msr_data == 0x0) { 1729 if (msr_data == 0x0)
1775 emulate_gp(ctxt, 0); 1730 return emulate_gp(ctxt, 0);
1776 return X86EMUL_PROPAGATE_FAULT;
1777 }
1778 ss_sel = cs_sel + 8; 1731 ss_sel = cs_sel + 8;
1779 cs.d = 0; 1732 cs.d = 0;
1780 cs.l = 1; 1733 cs.l = 1;
@@ -1934,33 +1887,27 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
1934{ 1887{
1935 struct tss_segment_16 tss_seg; 1888 struct tss_segment_16 tss_seg;
1936 int ret; 1889 int ret;
1937 u32 err, new_tss_base = get_desc_base(new_desc); 1890 u32 new_tss_base = get_desc_base(new_desc);
1938 1891
1939 ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, 1892 ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
1940 &err); 1893 &ctxt->exception);
1941 if (ret == X86EMUL_PROPAGATE_FAULT) { 1894 if (ret != X86EMUL_CONTINUE)
1942 /* FIXME: need to provide precise fault address */ 1895 /* FIXME: need to provide precise fault address */
1943 emulate_pf(ctxt);
1944 return ret; 1896 return ret;
1945 }
1946 1897
1947 save_state_to_tss16(ctxt, ops, &tss_seg); 1898 save_state_to_tss16(ctxt, ops, &tss_seg);
1948 1899
1949 ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, 1900 ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
1950 &err); 1901 &ctxt->exception);
1951 if (ret == X86EMUL_PROPAGATE_FAULT) { 1902 if (ret != X86EMUL_CONTINUE)
1952 /* FIXME: need to provide precise fault address */ 1903 /* FIXME: need to provide precise fault address */
1953 emulate_pf(ctxt);
1954 return ret; 1904 return ret;
1955 }
1956 1905
1957 ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, 1906 ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
1958 &err); 1907 &ctxt->exception);
1959 if (ret == X86EMUL_PROPAGATE_FAULT) { 1908 if (ret != X86EMUL_CONTINUE)
1960 /* FIXME: need to provide precise fault address */ 1909 /* FIXME: need to provide precise fault address */
1961 emulate_pf(ctxt);
1962 return ret; 1910 return ret;
1963 }
1964 1911
1965 if (old_tss_sel != 0xffff) { 1912 if (old_tss_sel != 0xffff) {
1966 tss_seg.prev_task_link = old_tss_sel; 1913 tss_seg.prev_task_link = old_tss_sel;
@@ -1968,12 +1915,10 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
1968 ret = ops->write_std(new_tss_base, 1915 ret = ops->write_std(new_tss_base,
1969 &tss_seg.prev_task_link, 1916 &tss_seg.prev_task_link,
1970 sizeof tss_seg.prev_task_link, 1917 sizeof tss_seg.prev_task_link,
1971 ctxt->vcpu, &err); 1918 ctxt->vcpu, &ctxt->exception);
1972 if (ret == X86EMUL_PROPAGATE_FAULT) { 1919 if (ret != X86EMUL_CONTINUE)
1973 /* FIXME: need to provide precise fault address */ 1920 /* FIXME: need to provide precise fault address */
1974 emulate_pf(ctxt);
1975 return ret; 1921 return ret;
1976 }
1977 } 1922 }
1978 1923
1979 return load_state_from_tss16(ctxt, ops, &tss_seg); 1924 return load_state_from_tss16(ctxt, ops, &tss_seg);
@@ -2013,10 +1958,8 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
2013 struct decode_cache *c = &ctxt->decode; 1958 struct decode_cache *c = &ctxt->decode;
2014 int ret; 1959 int ret;
2015 1960
2016 if (ops->set_cr(3, tss->cr3, ctxt->vcpu)) { 1961 if (ops->set_cr(3, tss->cr3, ctxt->vcpu))
2017 emulate_gp(ctxt, 0); 1962 return emulate_gp(ctxt, 0);
2018 return X86EMUL_PROPAGATE_FAULT;
2019 }
2020 c->eip = tss->eip; 1963 c->eip = tss->eip;
2021 ctxt->eflags = tss->eflags | 2; 1964 ctxt->eflags = tss->eflags | 2;
2022 c->regs[VCPU_REGS_RAX] = tss->eax; 1965 c->regs[VCPU_REGS_RAX] = tss->eax;
@@ -2076,33 +2019,27 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
2076{ 2019{
2077 struct tss_segment_32 tss_seg; 2020 struct tss_segment_32 tss_seg;
2078 int ret; 2021 int ret;
2079 u32 err, new_tss_base = get_desc_base(new_desc); 2022 u32 new_tss_base = get_desc_base(new_desc);
2080 2023
2081 ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, 2024 ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
2082 &err); 2025 &ctxt->exception);
2083 if (ret == X86EMUL_PROPAGATE_FAULT) { 2026 if (ret != X86EMUL_CONTINUE)
2084 /* FIXME: need to provide precise fault address */ 2027 /* FIXME: need to provide precise fault address */
2085 emulate_pf(ctxt);
2086 return ret; 2028 return ret;
2087 }
2088 2029
2089 save_state_to_tss32(ctxt, ops, &tss_seg); 2030 save_state_to_tss32(ctxt, ops, &tss_seg);
2090 2031
2091 ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, 2032 ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
2092 &err); 2033 &ctxt->exception);
2093 if (ret == X86EMUL_PROPAGATE_FAULT) { 2034 if (ret != X86EMUL_CONTINUE)
2094 /* FIXME: need to provide precise fault address */ 2035 /* FIXME: need to provide precise fault address */
2095 emulate_pf(ctxt);
2096 return ret; 2036 return ret;
2097 }
2098 2037
2099 ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, 2038 ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
2100 &err); 2039 &ctxt->exception);
2101 if (ret == X86EMUL_PROPAGATE_FAULT) { 2040 if (ret != X86EMUL_CONTINUE)
2102 /* FIXME: need to provide precise fault address */ 2041 /* FIXME: need to provide precise fault address */
2103 emulate_pf(ctxt);
2104 return ret; 2042 return ret;
2105 }
2106 2043
2107 if (old_tss_sel != 0xffff) { 2044 if (old_tss_sel != 0xffff) {
2108 tss_seg.prev_task_link = old_tss_sel; 2045 tss_seg.prev_task_link = old_tss_sel;
@@ -2110,12 +2047,10 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
2110 ret = ops->write_std(new_tss_base, 2047 ret = ops->write_std(new_tss_base,
2111 &tss_seg.prev_task_link, 2048 &tss_seg.prev_task_link,
2112 sizeof tss_seg.prev_task_link, 2049 sizeof tss_seg.prev_task_link,
2113 ctxt->vcpu, &err); 2050 ctxt->vcpu, &ctxt->exception);
2114 if (ret == X86EMUL_PROPAGATE_FAULT) { 2051 if (ret != X86EMUL_CONTINUE)
2115 /* FIXME: need to provide precise fault address */ 2052 /* FIXME: need to provide precise fault address */
2116 emulate_pf(ctxt);
2117 return ret; 2053 return ret;
2118 }
2119 } 2054 }
2120 2055
2121 return load_state_from_tss32(ctxt, ops, &tss_seg); 2056 return load_state_from_tss32(ctxt, ops, &tss_seg);
@@ -2146,10 +2081,8 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2146 2081
2147 if (reason != TASK_SWITCH_IRET) { 2082 if (reason != TASK_SWITCH_IRET) {
2148 if ((tss_selector & 3) > next_tss_desc.dpl || 2083 if ((tss_selector & 3) > next_tss_desc.dpl ||
2149 ops->cpl(ctxt->vcpu) > next_tss_desc.dpl) { 2084 ops->cpl(ctxt->vcpu) > next_tss_desc.dpl)
2150 emulate_gp(ctxt, 0); 2085 return emulate_gp(ctxt, 0);
2151 return X86EMUL_PROPAGATE_FAULT;
2152 }
2153 } 2086 }
2154 2087
2155 desc_limit = desc_limit_scaled(&next_tss_desc); 2088 desc_limit = desc_limit_scaled(&next_tss_desc);
@@ -2231,14 +2164,15 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
2231 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; 2164 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
2232} 2165}
2233 2166
2234static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned long base, 2167static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned seg,
2235 int reg, struct operand *op) 2168 int reg, struct operand *op)
2236{ 2169{
2237 struct decode_cache *c = &ctxt->decode; 2170 struct decode_cache *c = &ctxt->decode;
2238 int df = (ctxt->eflags & EFLG_DF) ? -1 : 1; 2171 int df = (ctxt->eflags & EFLG_DF) ? -1 : 1;
2239 2172
2240 register_address_increment(c, &c->regs[reg], df * op->bytes); 2173 register_address_increment(c, &c->regs[reg], df * op->bytes);
2241 op->addr.mem = register_address(c, base, c->regs[reg]); 2174 op->addr.mem.ea = register_address(c, c->regs[reg]);
2175 op->addr.mem.seg = seg;
2242} 2176}
2243 2177
2244static int em_push(struct x86_emulate_ctxt *ctxt) 2178static int em_push(struct x86_emulate_ctxt *ctxt)
@@ -2369,10 +2303,8 @@ static int em_rdtsc(struct x86_emulate_ctxt *ctxt)
2369 struct decode_cache *c = &ctxt->decode; 2303 struct decode_cache *c = &ctxt->decode;
2370 u64 tsc = 0; 2304 u64 tsc = 0;
2371 2305
2372 if (cpl > 0 && (ctxt->ops->get_cr(4, ctxt->vcpu) & X86_CR4_TSD)) { 2306 if (cpl > 0 && (ctxt->ops->get_cr(4, ctxt->vcpu) & X86_CR4_TSD))
2373 emulate_gp(ctxt, 0); 2307 return emulate_gp(ctxt, 0);
2374 return X86EMUL_PROPAGATE_FAULT;
2375 }
2376 ctxt->ops->get_msr(ctxt->vcpu, MSR_IA32_TSC, &tsc); 2308 ctxt->ops->get_msr(ctxt->vcpu, MSR_IA32_TSC, &tsc);
2377 c->regs[VCPU_REGS_RAX] = (u32)tsc; 2309 c->regs[VCPU_REGS_RAX] = (u32)tsc;
2378 c->regs[VCPU_REGS_RDX] = tsc >> 32; 2310 c->regs[VCPU_REGS_RDX] = tsc >> 32;
@@ -2647,7 +2579,7 @@ static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op,
2647 2579
2648 op->type = OP_IMM; 2580 op->type = OP_IMM;
2649 op->bytes = size; 2581 op->bytes = size;
2650 op->addr.mem = c->eip; 2582 op->addr.mem.ea = c->eip;
2651 /* NB. Immediates are sign-extended as necessary. */ 2583 /* NB. Immediates are sign-extended as necessary. */
2652 switch (op->bytes) { 2584 switch (op->bytes) {
2653 case 1: 2585 case 1:
@@ -2678,7 +2610,7 @@ done:
2678} 2610}
2679 2611
2680int 2612int
2681x86_decode_insn(struct x86_emulate_ctxt *ctxt) 2613x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
2682{ 2614{
2683 struct x86_emulate_ops *ops = ctxt->ops; 2615 struct x86_emulate_ops *ops = ctxt->ops;
2684 struct decode_cache *c = &ctxt->decode; 2616 struct decode_cache *c = &ctxt->decode;
@@ -2689,7 +2621,10 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt)
2689 struct operand memop = { .type = OP_NONE }; 2621 struct operand memop = { .type = OP_NONE };
2690 2622
2691 c->eip = ctxt->eip; 2623 c->eip = ctxt->eip;
2692 c->fetch.start = c->fetch.end = c->eip; 2624 c->fetch.start = c->eip;
2625 c->fetch.end = c->fetch.start + insn_len;
2626 if (insn_len > 0)
2627 memcpy(c->fetch.data, insn, insn_len);
2693 ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS); 2628 ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS);
2694 2629
2695 switch (mode) { 2630 switch (mode) {
@@ -2803,10 +2738,8 @@ done_prefixes:
2803 c->execute = opcode.u.execute; 2738 c->execute = opcode.u.execute;
2804 2739
2805 /* Unrecognised? */ 2740 /* Unrecognised? */
2806 if (c->d == 0 || (c->d & Undefined)) { 2741 if (c->d == 0 || (c->d & Undefined))
2807 DPRINTF("Cannot emulate %02x\n", c->b);
2808 return -1; 2742 return -1;
2809 }
2810 2743
2811 if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack)) 2744 if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack))
2812 c->op_bytes = 8; 2745 c->op_bytes = 8;
@@ -2831,14 +2764,13 @@ done_prefixes:
2831 if (!c->has_seg_override) 2764 if (!c->has_seg_override)
2832 set_seg_override(c, VCPU_SREG_DS); 2765 set_seg_override(c, VCPU_SREG_DS);
2833 2766
2834 if (memop.type == OP_MEM && !(!c->twobyte && c->b == 0x8d)) 2767 memop.addr.mem.seg = seg_override(ctxt, ops, c);
2835 memop.addr.mem += seg_override_base(ctxt, ops, c);
2836 2768
2837 if (memop.type == OP_MEM && c->ad_bytes != 8) 2769 if (memop.type == OP_MEM && c->ad_bytes != 8)
2838 memop.addr.mem = (u32)memop.addr.mem; 2770 memop.addr.mem.ea = (u32)memop.addr.mem.ea;
2839 2771
2840 if (memop.type == OP_MEM && c->rip_relative) 2772 if (memop.type == OP_MEM && c->rip_relative)
2841 memop.addr.mem += c->eip; 2773 memop.addr.mem.ea += c->eip;
2842 2774
2843 /* 2775 /*
2844 * Decode and fetch the source operand: register, memory 2776 * Decode and fetch the source operand: register, memory
@@ -2890,14 +2822,14 @@ done_prefixes:
2890 case SrcSI: 2822 case SrcSI:
2891 c->src.type = OP_MEM; 2823 c->src.type = OP_MEM;
2892 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 2824 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
2893 c->src.addr.mem = 2825 c->src.addr.mem.ea =
2894 register_address(c, seg_override_base(ctxt, ops, c), 2826 register_address(c, c->regs[VCPU_REGS_RSI]);
2895 c->regs[VCPU_REGS_RSI]); 2827 c->src.addr.mem.seg = seg_override(ctxt, ops, c),
2896 c->src.val = 0; 2828 c->src.val = 0;
2897 break; 2829 break;
2898 case SrcImmFAddr: 2830 case SrcImmFAddr:
2899 c->src.type = OP_IMM; 2831 c->src.type = OP_IMM;
2900 c->src.addr.mem = c->eip; 2832 c->src.addr.mem.ea = c->eip;
2901 c->src.bytes = c->op_bytes + 2; 2833 c->src.bytes = c->op_bytes + 2;
2902 insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip); 2834 insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip);
2903 break; 2835 break;
@@ -2944,7 +2876,7 @@ done_prefixes:
2944 break; 2876 break;
2945 case DstImmUByte: 2877 case DstImmUByte:
2946 c->dst.type = OP_IMM; 2878 c->dst.type = OP_IMM;
2947 c->dst.addr.mem = c->eip; 2879 c->dst.addr.mem.ea = c->eip;
2948 c->dst.bytes = 1; 2880 c->dst.bytes = 1;
2949 c->dst.val = insn_fetch(u8, 1, c->eip); 2881 c->dst.val = insn_fetch(u8, 1, c->eip);
2950 break; 2882 break;
@@ -2969,9 +2901,9 @@ done_prefixes:
2969 case DstDI: 2901 case DstDI:
2970 c->dst.type = OP_MEM; 2902 c->dst.type = OP_MEM;
2971 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 2903 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
2972 c->dst.addr.mem = 2904 c->dst.addr.mem.ea =
2973 register_address(c, es_base(ctxt, ops), 2905 register_address(c, c->regs[VCPU_REGS_RDI]);
2974 c->regs[VCPU_REGS_RDI]); 2906 c->dst.addr.mem.seg = VCPU_SREG_ES;
2975 c->dst.val = 0; 2907 c->dst.val = 0;
2976 break; 2908 break;
2977 case ImplicitOps: 2909 case ImplicitOps:
@@ -3020,24 +2952,24 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
3020 ctxt->decode.mem_read.pos = 0; 2952 ctxt->decode.mem_read.pos = 0;
3021 2953
3022 if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) { 2954 if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
3023 emulate_ud(ctxt); 2955 rc = emulate_ud(ctxt);
3024 goto done; 2956 goto done;
3025 } 2957 }
3026 2958
3027 /* LOCK prefix is allowed only with some instructions */ 2959 /* LOCK prefix is allowed only with some instructions */
3028 if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) { 2960 if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) {
3029 emulate_ud(ctxt); 2961 rc = emulate_ud(ctxt);
3030 goto done; 2962 goto done;
3031 } 2963 }
3032 2964
3033 if ((c->d & SrcMask) == SrcMemFAddr && c->src.type != OP_MEM) { 2965 if ((c->d & SrcMask) == SrcMemFAddr && c->src.type != OP_MEM) {
3034 emulate_ud(ctxt); 2966 rc = emulate_ud(ctxt);
3035 goto done; 2967 goto done;
3036 } 2968 }
3037 2969
3038 /* Privileged instruction can be executed only in CPL=0 */ 2970 /* Privileged instruction can be executed only in CPL=0 */
3039 if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) { 2971 if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) {
3040 emulate_gp(ctxt, 0); 2972 rc = emulate_gp(ctxt, 0);
3041 goto done; 2973 goto done;
3042 } 2974 }
3043 2975
@@ -3050,7 +2982,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
3050 } 2982 }
3051 2983
3052 if ((c->src.type == OP_MEM) && !(c->d & NoAccess)) { 2984 if ((c->src.type == OP_MEM) && !(c->d & NoAccess)) {
3053 rc = read_emulated(ctxt, ops, c->src.addr.mem, 2985 rc = read_emulated(ctxt, ops, linear(ctxt, c->src.addr.mem),
3054 c->src.valptr, c->src.bytes); 2986 c->src.valptr, c->src.bytes);
3055 if (rc != X86EMUL_CONTINUE) 2987 if (rc != X86EMUL_CONTINUE)
3056 goto done; 2988 goto done;
@@ -3058,7 +2990,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
3058 } 2990 }
3059 2991
3060 if (c->src2.type == OP_MEM) { 2992 if (c->src2.type == OP_MEM) {
3061 rc = read_emulated(ctxt, ops, c->src2.addr.mem, 2993 rc = read_emulated(ctxt, ops, linear(ctxt, c->src2.addr.mem),
3062 &c->src2.val, c->src2.bytes); 2994 &c->src2.val, c->src2.bytes);
3063 if (rc != X86EMUL_CONTINUE) 2995 if (rc != X86EMUL_CONTINUE)
3064 goto done; 2996 goto done;
@@ -3070,7 +3002,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
3070 3002
3071 if ((c->dst.type == OP_MEM) && !(c->d & Mov)) { 3003 if ((c->dst.type == OP_MEM) && !(c->d & Mov)) {
3072 /* optimisation - avoid slow emulated read if Mov */ 3004 /* optimisation - avoid slow emulated read if Mov */
3073 rc = read_emulated(ctxt, ops, c->dst.addr.mem, 3005 rc = read_emulated(ctxt, ops, linear(ctxt, c->dst.addr.mem),
3074 &c->dst.val, c->dst.bytes); 3006 &c->dst.val, c->dst.bytes);
3075 if (rc != X86EMUL_CONTINUE) 3007 if (rc != X86EMUL_CONTINUE)
3076 goto done; 3008 goto done;
@@ -3215,13 +3147,13 @@ special_insn:
3215 break; 3147 break;
3216 case 0x8c: /* mov r/m, sreg */ 3148 case 0x8c: /* mov r/m, sreg */
3217 if (c->modrm_reg > VCPU_SREG_GS) { 3149 if (c->modrm_reg > VCPU_SREG_GS) {
3218 emulate_ud(ctxt); 3150 rc = emulate_ud(ctxt);
3219 goto done; 3151 goto done;
3220 } 3152 }
3221 c->dst.val = ops->get_segment_selector(c->modrm_reg, ctxt->vcpu); 3153 c->dst.val = ops->get_segment_selector(c->modrm_reg, ctxt->vcpu);
3222 break; 3154 break;
3223 case 0x8d: /* lea r16/r32, m */ 3155 case 0x8d: /* lea r16/r32, m */
3224 c->dst.val = c->src.addr.mem; 3156 c->dst.val = c->src.addr.mem.ea;
3225 break; 3157 break;
3226 case 0x8e: { /* mov seg, r/m16 */ 3158 case 0x8e: { /* mov seg, r/m16 */
3227 uint16_t sel; 3159 uint16_t sel;
@@ -3230,7 +3162,7 @@ special_insn:
3230 3162
3231 if (c->modrm_reg == VCPU_SREG_CS || 3163 if (c->modrm_reg == VCPU_SREG_CS ||
3232 c->modrm_reg > VCPU_SREG_GS) { 3164 c->modrm_reg > VCPU_SREG_GS) {
3233 emulate_ud(ctxt); 3165 rc = emulate_ud(ctxt);
3234 goto done; 3166 goto done;
3235 } 3167 }
3236 3168
@@ -3268,7 +3200,6 @@ special_insn:
3268 break; 3200 break;
3269 case 0xa6 ... 0xa7: /* cmps */ 3201 case 0xa6 ... 0xa7: /* cmps */
3270 c->dst.type = OP_NONE; /* Disable writeback. */ 3202 c->dst.type = OP_NONE; /* Disable writeback. */
3271 DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.addr.mem, c->dst.addr.mem);
3272 goto cmp; 3203 goto cmp;
3273 case 0xa8 ... 0xa9: /* test ax, imm */ 3204 case 0xa8 ... 0xa9: /* test ax, imm */
3274 goto test; 3205 goto test;
@@ -3363,7 +3294,7 @@ special_insn:
3363 do_io_in: 3294 do_io_in:
3364 c->dst.bytes = min(c->dst.bytes, 4u); 3295 c->dst.bytes = min(c->dst.bytes, 4u);
3365 if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) { 3296 if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) {
3366 emulate_gp(ctxt, 0); 3297 rc = emulate_gp(ctxt, 0);
3367 goto done; 3298 goto done;
3368 } 3299 }
3369 if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val, 3300 if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val,
@@ -3377,7 +3308,7 @@ special_insn:
3377 c->src.bytes = min(c->src.bytes, 4u); 3308 c->src.bytes = min(c->src.bytes, 4u);
3378 if (!emulator_io_permited(ctxt, ops, c->dst.val, 3309 if (!emulator_io_permited(ctxt, ops, c->dst.val,
3379 c->src.bytes)) { 3310 c->src.bytes)) {
3380 emulate_gp(ctxt, 0); 3311 rc = emulate_gp(ctxt, 0);
3381 goto done; 3312 goto done;
3382 } 3313 }
3383 ops->pio_out_emulated(c->src.bytes, c->dst.val, 3314 ops->pio_out_emulated(c->src.bytes, c->dst.val,
@@ -3402,14 +3333,14 @@ special_insn:
3402 break; 3333 break;
3403 case 0xfa: /* cli */ 3334 case 0xfa: /* cli */
3404 if (emulator_bad_iopl(ctxt, ops)) { 3335 if (emulator_bad_iopl(ctxt, ops)) {
3405 emulate_gp(ctxt, 0); 3336 rc = emulate_gp(ctxt, 0);
3406 goto done; 3337 goto done;
3407 } else 3338 } else
3408 ctxt->eflags &= ~X86_EFLAGS_IF; 3339 ctxt->eflags &= ~X86_EFLAGS_IF;
3409 break; 3340 break;
3410 case 0xfb: /* sti */ 3341 case 0xfb: /* sti */
3411 if (emulator_bad_iopl(ctxt, ops)) { 3342 if (emulator_bad_iopl(ctxt, ops)) {
3412 emulate_gp(ctxt, 0); 3343 rc = emulate_gp(ctxt, 0);
3413 goto done; 3344 goto done;
3414 } else { 3345 } else {
3415 ctxt->interruptibility = KVM_X86_SHADOW_INT_STI; 3346 ctxt->interruptibility = KVM_X86_SHADOW_INT_STI;
@@ -3449,11 +3380,11 @@ writeback:
3449 c->dst.type = saved_dst_type; 3380 c->dst.type = saved_dst_type;
3450 3381
3451 if ((c->d & SrcMask) == SrcSI) 3382 if ((c->d & SrcMask) == SrcSI)
3452 string_addr_inc(ctxt, seg_override_base(ctxt, ops, c), 3383 string_addr_inc(ctxt, seg_override(ctxt, ops, c),
3453 VCPU_REGS_RSI, &c->src); 3384 VCPU_REGS_RSI, &c->src);
3454 3385
3455 if ((c->d & DstMask) == DstDI) 3386 if ((c->d & DstMask) == DstDI)
3456 string_addr_inc(ctxt, es_base(ctxt, ops), VCPU_REGS_RDI, 3387 string_addr_inc(ctxt, VCPU_SREG_ES, VCPU_REGS_RDI,
3457 &c->dst); 3388 &c->dst);
3458 3389
3459 if (c->rep_prefix && (c->d & String)) { 3390 if (c->rep_prefix && (c->d & String)) {
@@ -3482,6 +3413,8 @@ writeback:
3482 ctxt->eip = c->eip; 3413 ctxt->eip = c->eip;
3483 3414
3484done: 3415done:
3416 if (rc == X86EMUL_PROPAGATE_FAULT)
3417 ctxt->have_exception = true;
3485 return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; 3418 return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
3486 3419
3487twobyte_insn: 3420twobyte_insn:
@@ -3544,9 +3477,11 @@ twobyte_insn:
3544 break; 3477 break;
3545 case 5: /* not defined */ 3478 case 5: /* not defined */
3546 emulate_ud(ctxt); 3479 emulate_ud(ctxt);
3480 rc = X86EMUL_PROPAGATE_FAULT;
3547 goto done; 3481 goto done;
3548 case 7: /* invlpg*/ 3482 case 7: /* invlpg*/
3549 emulate_invlpg(ctxt->vcpu, c->src.addr.mem); 3483 emulate_invlpg(ctxt->vcpu,
3484 linear(ctxt, c->src.addr.mem));
3550 /* Disable writeback. */ 3485 /* Disable writeback. */
3551 c->dst.type = OP_NONE; 3486 c->dst.type = OP_NONE;
3552 break; 3487 break;
@@ -3573,6 +3508,7 @@ twobyte_insn:
3573 case 5 ... 7: 3508 case 5 ... 7:
3574 case 9 ... 15: 3509 case 9 ... 15:
3575 emulate_ud(ctxt); 3510 emulate_ud(ctxt);
3511 rc = X86EMUL_PROPAGATE_FAULT;
3576 goto done; 3512 goto done;
3577 } 3513 }
3578 c->dst.val = ops->get_cr(c->modrm_reg, ctxt->vcpu); 3514 c->dst.val = ops->get_cr(c->modrm_reg, ctxt->vcpu);
@@ -3581,6 +3517,7 @@ twobyte_insn:
3581 if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && 3517 if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
3582 (c->modrm_reg == 4 || c->modrm_reg == 5)) { 3518 (c->modrm_reg == 4 || c->modrm_reg == 5)) {
3583 emulate_ud(ctxt); 3519 emulate_ud(ctxt);
3520 rc = X86EMUL_PROPAGATE_FAULT;
3584 goto done; 3521 goto done;
3585 } 3522 }
3586 ops->get_dr(c->modrm_reg, &c->dst.val, ctxt->vcpu); 3523 ops->get_dr(c->modrm_reg, &c->dst.val, ctxt->vcpu);
@@ -3588,6 +3525,7 @@ twobyte_insn:
3588 case 0x22: /* mov reg, cr */ 3525 case 0x22: /* mov reg, cr */
3589 if (ops->set_cr(c->modrm_reg, c->src.val, ctxt->vcpu)) { 3526 if (ops->set_cr(c->modrm_reg, c->src.val, ctxt->vcpu)) {
3590 emulate_gp(ctxt, 0); 3527 emulate_gp(ctxt, 0);
3528 rc = X86EMUL_PROPAGATE_FAULT;
3591 goto done; 3529 goto done;
3592 } 3530 }
3593 c->dst.type = OP_NONE; 3531 c->dst.type = OP_NONE;
@@ -3596,6 +3534,7 @@ twobyte_insn:
3596 if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && 3534 if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
3597 (c->modrm_reg == 4 || c->modrm_reg == 5)) { 3535 (c->modrm_reg == 4 || c->modrm_reg == 5)) {
3598 emulate_ud(ctxt); 3536 emulate_ud(ctxt);
3537 rc = X86EMUL_PROPAGATE_FAULT;
3599 goto done; 3538 goto done;
3600 } 3539 }
3601 3540
@@ -3604,6 +3543,7 @@ twobyte_insn:
3604 ~0ULL : ~0U), ctxt->vcpu) < 0) { 3543 ~0ULL : ~0U), ctxt->vcpu) < 0) {
3605 /* #UD condition is already handled by the code above */ 3544 /* #UD condition is already handled by the code above */
3606 emulate_gp(ctxt, 0); 3545 emulate_gp(ctxt, 0);
3546 rc = X86EMUL_PROPAGATE_FAULT;
3607 goto done; 3547 goto done;
3608 } 3548 }
3609 3549
@@ -3615,6 +3555,7 @@ twobyte_insn:
3615 | ((u64)c->regs[VCPU_REGS_RDX] << 32); 3555 | ((u64)c->regs[VCPU_REGS_RDX] << 32);
3616 if (ops->set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) { 3556 if (ops->set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) {
3617 emulate_gp(ctxt, 0); 3557 emulate_gp(ctxt, 0);
3558 rc = X86EMUL_PROPAGATE_FAULT;
3618 goto done; 3559 goto done;
3619 } 3560 }
3620 rc = X86EMUL_CONTINUE; 3561 rc = X86EMUL_CONTINUE;
@@ -3623,6 +3564,7 @@ twobyte_insn:
3623 /* rdmsr */ 3564 /* rdmsr */
3624 if (ops->get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) { 3565 if (ops->get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) {
3625 emulate_gp(ctxt, 0); 3566 emulate_gp(ctxt, 0);
3567 rc = X86EMUL_PROPAGATE_FAULT;
3626 goto done; 3568 goto done;
3627 } else { 3569 } else {
3628 c->regs[VCPU_REGS_RAX] = (u32)msr_data; 3570 c->regs[VCPU_REGS_RAX] = (u32)msr_data;
@@ -3785,6 +3727,5 @@ twobyte_insn:
3785 goto writeback; 3727 goto writeback;
3786 3728
3787cannot_emulate: 3729cannot_emulate:
3788 DPRINTF("Cannot emulate %02x\n", c->b);
3789 return -1; 3730 return -1;
3790} 3731}
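The emulate.c changes above switch memory operands from flat addresses to (segment, effective address) pairs that are flattened by linear() only when memory is actually accessed. A minimal standalone sketch of that pattern follows; the struct layout and the fixed segment-base table are hypothetical simplifications, not the kernel's real types.

#include <stdio.h>
#include <stdint.h>

/* Hypothetical stand-ins for the emulator's segmented_address and context. */
struct segmented_address {
	unsigned long ea;   /* effective address (offset within the segment) */
	unsigned seg;       /* segment register index, e.g. SS or ES */
};

struct emu_ctxt {
	int ad_bytes;                 /* address size: 2, 4 or 8 bytes */
	unsigned long seg_base[6];    /* cached segment bases, one per segment */
};

/* Flatten a segmented address into a linear address, masking to 32 bits
 * unless the emulator is using 64-bit addressing (ad_bytes == 8). */
static unsigned long linear(const struct emu_ctxt *ctxt,
			    struct segmented_address addr)
{
	unsigned long la = ctxt->seg_base[addr.seg] + addr.ea;

	if (ctxt->ad_bytes != 8)
		la &= 0xffffffffUL;
	return la;
}

int main(void)
{
	struct emu_ctxt ctxt = { .ad_bytes = 4 };
	struct segmented_address push_addr = { .ea = 0xfffc, .seg = 2 /* SS */ };

	ctxt.seg_base[2] = 0x10000;   /* pretend SS has a non-zero base */
	printf("linear = %#lx\n", linear(&ctxt, push_addr));
	return 0;
}

Under that model, emulate_push() in the diff only records the RSP-derived offset plus VCPU_SREG_SS, and the stack-segment base is folded in at access time rather than at decode time.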
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index f628234fbeca..3cece05e4ac4 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -575,6 +575,8 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
575 s->pics[1].elcr_mask = 0xde; 575 s->pics[1].elcr_mask = 0xde;
576 s->pics[0].pics_state = s; 576 s->pics[0].pics_state = s;
577 s->pics[1].pics_state = s; 577 s->pics[1].pics_state = s;
578 s->pics[0].isr_ack = 0xff;
579 s->pics[1].isr_ack = 0xff;
578 580
579 /* 581 /*
580 * Initialize PIO device 582 * Initialize PIO device
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index 975bb45329a1..3377d53fcd36 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -73,6 +73,13 @@ static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask)
73 return vcpu->arch.cr4 & mask; 73 return vcpu->arch.cr4 & mask;
74} 74}
75 75
76static inline ulong kvm_read_cr3(struct kvm_vcpu *vcpu)
77{
78 if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
79 kvm_x86_ops->decache_cr3(vcpu);
80 return vcpu->arch.cr3;
81}
82
76static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu) 83static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu)
77{ 84{
78 return kvm_read_cr4_bits(vcpu, ~0UL); 85 return kvm_read_cr4_bits(vcpu, ~0UL);
@@ -84,4 +91,19 @@ static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu)
84 | ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32); 91 | ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32);
85} 92}
86 93
94static inline void enter_guest_mode(struct kvm_vcpu *vcpu)
95{
96 vcpu->arch.hflags |= HF_GUEST_MASK;
97}
98
99static inline void leave_guest_mode(struct kvm_vcpu *vcpu)
100{
101 vcpu->arch.hflags &= ~HF_GUEST_MASK;
102}
103
104static inline bool is_guest_mode(struct kvm_vcpu *vcpu)
105{
106 return vcpu->arch.hflags & HF_GUEST_MASK;
107}
108
87#endif 109#endif
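The kvm_read_cr3() helper added above follows the same lazy-caching pattern as the other accessors in this header: a per-register availability bit says whether the cached value is current, and the vendor callback is asked to refresh it only on a miss. A minimal, self-contained sketch of that pattern, with made-up names rather than the real vcpu layout:

#include <stdio.h>

enum cached_reg { REG_CR3, NR_CACHED_REGS };

struct vcpu {
	unsigned long regs_avail;              /* bit n set => cache entry n is valid */
	unsigned long cached[NR_CACHED_REGS];  /* cached register values */
	/* Vendor hook that re-reads the register and refreshes the cache;
	 * here it only simulates that. */
	void (*decache)(struct vcpu *vcpu, enum cached_reg reg);
};

static void fake_decache(struct vcpu *vcpu, enum cached_reg reg)
{
	vcpu->cached[reg] = 0x1234000;        /* pretend we read hardware state */
	vcpu->regs_avail |= 1UL << reg;       /* mark the cache entry valid */
}

/* Read CR3, refreshing the cache only when the availability bit is clear. */
static unsigned long read_cr3(struct vcpu *vcpu)
{
	if (!(vcpu->regs_avail & (1UL << REG_CR3)))
		vcpu->decache(vcpu, REG_CR3);
	return vcpu->cached[REG_CR3];
}

int main(void)
{
	struct vcpu v = { .decache = fake_decache };

	printf("cr3 = %#lx (first read misses)\n", read_cr3(&v));
	printf("cr3 = %#lx (second read hits the cache)\n", read_cr3(&v));
	return 0;
}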
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 413f8973a855..93cf9d0d3653 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -277,7 +277,8 @@ static void apic_update_ppr(struct kvm_lapic *apic)
277 277
278 if (old_ppr != ppr) { 278 if (old_ppr != ppr) {
279 apic_set_reg(apic, APIC_PROCPRI, ppr); 279 apic_set_reg(apic, APIC_PROCPRI, ppr);
280 kvm_make_request(KVM_REQ_EVENT, apic->vcpu); 280 if (ppr < old_ppr)
281 kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
281 } 282 }
282} 283}
283 284
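The lapic.c hunk above stops requesting an event re-check when the processor priority rises, since only a drop in PPR can newly unmask a pending interrupt. A tiny sketch of that guard, using a made-up flag in place of KVM_REQ_EVENT:

#include <stdbool.h>
#include <stdio.h>

static bool event_check_requested;

/* Update the cached processor priority; ask for an interrupt re-evaluation
 * only when the priority dropped, because a higher PPR can only mask more
 * interrupts, never unmask one. */
static void update_ppr(unsigned *cached_ppr, unsigned new_ppr)
{
	unsigned old_ppr = *cached_ppr;

	if (old_ppr != new_ppr) {
		*cached_ppr = new_ppr;
		if (new_ppr < old_ppr)
			event_check_requested = true;
	}
}

int main(void)
{
	unsigned ppr = 0x40;

	update_ppr(&ppr, 0x80);    /* priority rose: no re-check needed */
	printf("after rise: requested=%d\n", event_check_requested);
	update_ppr(&ppr, 0x20);    /* priority dropped: re-check pending IRQs */
	printf("after drop: requested=%d\n", event_check_requested);
	return 0;
}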
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index fb8b376bf28c..f02b8edc3d44 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -18,9 +18,11 @@
18 * 18 *
19 */ 19 */
20 20
21#include "irq.h"
21#include "mmu.h" 22#include "mmu.h"
22#include "x86.h" 23#include "x86.h"
23#include "kvm_cache_regs.h" 24#include "kvm_cache_regs.h"
25#include "x86.h"
24 26
25#include <linux/kvm_host.h> 27#include <linux/kvm_host.h>
26#include <linux/types.h> 28#include <linux/types.h>
@@ -194,7 +196,6 @@ static struct percpu_counter kvm_total_used_mmu_pages;
194 196
195static u64 __read_mostly shadow_trap_nonpresent_pte; 197static u64 __read_mostly shadow_trap_nonpresent_pte;
196static u64 __read_mostly shadow_notrap_nonpresent_pte; 198static u64 __read_mostly shadow_notrap_nonpresent_pte;
197static u64 __read_mostly shadow_base_present_pte;
198static u64 __read_mostly shadow_nx_mask; 199static u64 __read_mostly shadow_nx_mask;
199static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ 200static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
200static u64 __read_mostly shadow_user_mask; 201static u64 __read_mostly shadow_user_mask;
@@ -213,12 +214,6 @@ void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
213} 214}
214EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes); 215EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);
215 216
216void kvm_mmu_set_base_ptes(u64 base_pte)
217{
218 shadow_base_present_pte = base_pte;
219}
220EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes);
221
222void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, 217void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
223 u64 dirty_mask, u64 nx_mask, u64 x_mask) 218 u64 dirty_mask, u64 nx_mask, u64 x_mask)
224{ 219{
@@ -482,46 +477,46 @@ static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
482} 477}
483 478
484/* 479/*
485 * Return the pointer to the largepage write count for a given 480 * Return the pointer to the large page information for a given gfn,
486 * gfn, handling slots that are not large page aligned. 481 * handling slots that are not large page aligned.
487 */ 482 */
488static int *slot_largepage_idx(gfn_t gfn, 483static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
489 struct kvm_memory_slot *slot, 484 struct kvm_memory_slot *slot,
490 int level) 485 int level)
491{ 486{
492 unsigned long idx; 487 unsigned long idx;
493 488
494 idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - 489 idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
495 (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); 490 (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
496 return &slot->lpage_info[level - 2][idx].write_count; 491 return &slot->lpage_info[level - 2][idx];
497} 492}
498 493
499static void account_shadowed(struct kvm *kvm, gfn_t gfn) 494static void account_shadowed(struct kvm *kvm, gfn_t gfn)
500{ 495{
501 struct kvm_memory_slot *slot; 496 struct kvm_memory_slot *slot;
502 int *write_count; 497 struct kvm_lpage_info *linfo;
503 int i; 498 int i;
504 499
505 slot = gfn_to_memslot(kvm, gfn); 500 slot = gfn_to_memslot(kvm, gfn);
506 for (i = PT_DIRECTORY_LEVEL; 501 for (i = PT_DIRECTORY_LEVEL;
507 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { 502 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
508 write_count = slot_largepage_idx(gfn, slot, i); 503 linfo = lpage_info_slot(gfn, slot, i);
509 *write_count += 1; 504 linfo->write_count += 1;
510 } 505 }
511} 506}
512 507
513static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) 508static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
514{ 509{
515 struct kvm_memory_slot *slot; 510 struct kvm_memory_slot *slot;
516 int *write_count; 511 struct kvm_lpage_info *linfo;
517 int i; 512 int i;
518 513
519 slot = gfn_to_memslot(kvm, gfn); 514 slot = gfn_to_memslot(kvm, gfn);
520 for (i = PT_DIRECTORY_LEVEL; 515 for (i = PT_DIRECTORY_LEVEL;
521 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { 516 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
522 write_count = slot_largepage_idx(gfn, slot, i); 517 linfo = lpage_info_slot(gfn, slot, i);
523 *write_count -= 1; 518 linfo->write_count -= 1;
524 WARN_ON(*write_count < 0); 519 WARN_ON(linfo->write_count < 0);
525 } 520 }
526} 521}
527 522
@@ -530,12 +525,12 @@ static int has_wrprotected_page(struct kvm *kvm,
530 int level) 525 int level)
531{ 526{
532 struct kvm_memory_slot *slot; 527 struct kvm_memory_slot *slot;
533 int *largepage_idx; 528 struct kvm_lpage_info *linfo;
534 529
535 slot = gfn_to_memslot(kvm, gfn); 530 slot = gfn_to_memslot(kvm, gfn);
536 if (slot) { 531 if (slot) {
537 largepage_idx = slot_largepage_idx(gfn, slot, level); 532 linfo = lpage_info_slot(gfn, slot, level);
538 return *largepage_idx; 533 return linfo->write_count;
539 } 534 }
540 535
541 return 1; 536 return 1;
@@ -559,14 +554,18 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
559 return ret; 554 return ret;
560} 555}
561 556
562static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) 557static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn)
563{ 558{
564 struct kvm_memory_slot *slot; 559 struct kvm_memory_slot *slot;
565 int host_level, level, max_level;
566
567 slot = gfn_to_memslot(vcpu->kvm, large_gfn); 560 slot = gfn_to_memslot(vcpu->kvm, large_gfn);
568 if (slot && slot->dirty_bitmap) 561 if (slot && slot->dirty_bitmap)
569 return PT_PAGE_TABLE_LEVEL; 562 return true;
563 return false;
564}
565
566static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
567{
568 int host_level, level, max_level;
570 569
571 host_level = host_mapping_level(vcpu->kvm, large_gfn); 570 host_level = host_mapping_level(vcpu->kvm, large_gfn);
572 571
@@ -590,16 +589,15 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
590static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level) 589static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
591{ 590{
592 struct kvm_memory_slot *slot; 591 struct kvm_memory_slot *slot;
593 unsigned long idx; 592 struct kvm_lpage_info *linfo;
594 593
595 slot = gfn_to_memslot(kvm, gfn); 594 slot = gfn_to_memslot(kvm, gfn);
596 if (likely(level == PT_PAGE_TABLE_LEVEL)) 595 if (likely(level == PT_PAGE_TABLE_LEVEL))
597 return &slot->rmap[gfn - slot->base_gfn]; 596 return &slot->rmap[gfn - slot->base_gfn];
598 597
599 idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - 598 linfo = lpage_info_slot(gfn, slot, level);
600 (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
601 599
602 return &slot->lpage_info[level - 2][idx].rmap_pde; 600 return &linfo->rmap_pde;
603} 601}
604 602
605/* 603/*
@@ -887,19 +885,16 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
887 end = start + (memslot->npages << PAGE_SHIFT); 885 end = start + (memslot->npages << PAGE_SHIFT);
888 if (hva >= start && hva < end) { 886 if (hva >= start && hva < end) {
889 gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; 887 gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
888 gfn_t gfn = memslot->base_gfn + gfn_offset;
890 889
891 ret = handler(kvm, &memslot->rmap[gfn_offset], data); 890 ret = handler(kvm, &memslot->rmap[gfn_offset], data);
892 891
893 for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) { 892 for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
894 unsigned long idx; 893 struct kvm_lpage_info *linfo;
895 int sh; 894
896 895 linfo = lpage_info_slot(gfn, memslot,
897 sh = KVM_HPAGE_GFN_SHIFT(PT_DIRECTORY_LEVEL+j); 896 PT_DIRECTORY_LEVEL + j);
898 idx = ((memslot->base_gfn+gfn_offset) >> sh) - 897 ret |= handler(kvm, &linfo->rmap_pde, data);
899 (memslot->base_gfn >> sh);
900 ret |= handler(kvm,
901 &memslot->lpage_info[j][idx].rmap_pde,
902 data);
903 } 898 }
904 trace_kvm_age_page(hva, memslot, ret); 899 trace_kvm_age_page(hva, memslot, ret);
905 retval |= ret; 900 retval |= ret;
@@ -950,6 +945,35 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
950 return young; 945 return young;
951} 946}
952 947
948static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
949 unsigned long data)
950{
951 u64 *spte;
952 int young = 0;
953
954 /*
955 * If there's no access bit in the secondary pte set by the
956 * hardware it's up to gup-fast/gup to set the access bit in
957 * the primary pte or in the page structure.
958 */
959 if (!shadow_accessed_mask)
960 goto out;
961
962 spte = rmap_next(kvm, rmapp, NULL);
963 while (spte) {
964 u64 _spte = *spte;
965 BUG_ON(!(_spte & PT_PRESENT_MASK));
966 young = _spte & PT_ACCESSED_MASK;
967 if (young) {
968 young = 1;
969 break;
970 }
971 spte = rmap_next(kvm, rmapp, spte);
972 }
973out:
974 return young;
975}
976
953#define RMAP_RECYCLE_THRESHOLD 1000 977#define RMAP_RECYCLE_THRESHOLD 1000
954 978
955static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) 979static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
@@ -970,6 +994,11 @@ int kvm_age_hva(struct kvm *kvm, unsigned long hva)
970 return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp); 994 return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp);
971} 995}
972 996
997int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
998{
999 return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp);
1000}
1001
973#ifdef MMU_DEBUG 1002#ifdef MMU_DEBUG
974static int is_empty_shadow_page(u64 *spt) 1003static int is_empty_shadow_page(u64 *spt)
975{ 1004{
@@ -1161,7 +1190,7 @@ static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
1161} 1190}
1162 1191
1163static int nonpaging_sync_page(struct kvm_vcpu *vcpu, 1192static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
1164 struct kvm_mmu_page *sp, bool clear_unsync) 1193 struct kvm_mmu_page *sp)
1165{ 1194{
1166 return 1; 1195 return 1;
1167} 1196}
@@ -1291,7 +1320,7 @@ static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
1291 if (clear_unsync) 1320 if (clear_unsync)
1292 kvm_unlink_unsync_page(vcpu->kvm, sp); 1321 kvm_unlink_unsync_page(vcpu->kvm, sp);
1293 1322
1294 if (vcpu->arch.mmu.sync_page(vcpu, sp, clear_unsync)) { 1323 if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
1295 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); 1324 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
1296 return 1; 1325 return 1;
1297 } 1326 }
@@ -1332,12 +1361,12 @@ static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn)
1332 continue; 1361 continue;
1333 1362
1334 WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); 1363 WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
1364 kvm_unlink_unsync_page(vcpu->kvm, s);
1335 if ((s->role.cr4_pae != !!is_pae(vcpu)) || 1365 if ((s->role.cr4_pae != !!is_pae(vcpu)) ||
1336 (vcpu->arch.mmu.sync_page(vcpu, s, true))) { 1366 (vcpu->arch.mmu.sync_page(vcpu, s))) {
1337 kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list); 1367 kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list);
1338 continue; 1368 continue;
1339 } 1369 }
1340 kvm_unlink_unsync_page(vcpu->kvm, s);
1341 flush = true; 1370 flush = true;
1342 } 1371 }
1343 1372
@@ -1963,9 +1992,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1963 unsigned pte_access, int user_fault, 1992 unsigned pte_access, int user_fault,
1964 int write_fault, int dirty, int level, 1993 int write_fault, int dirty, int level,
1965 gfn_t gfn, pfn_t pfn, bool speculative, 1994 gfn_t gfn, pfn_t pfn, bool speculative,
1966 bool can_unsync, bool reset_host_protection) 1995 bool can_unsync, bool host_writable)
1967{ 1996{
1968 u64 spte; 1997 u64 spte, entry = *sptep;
1969 int ret = 0; 1998 int ret = 0;
1970 1999
1971 /* 2000 /*
@@ -1973,7 +2002,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1973 * whether the guest actually used the pte (in order to detect 2002 * whether the guest actually used the pte (in order to detect
1974 * demand paging). 2003 * demand paging).
1975 */ 2004 */
1976 spte = shadow_base_present_pte; 2005 spte = PT_PRESENT_MASK;
1977 if (!speculative) 2006 if (!speculative)
1978 spte |= shadow_accessed_mask; 2007 spte |= shadow_accessed_mask;
1979 if (!dirty) 2008 if (!dirty)
@@ -1990,8 +2019,10 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1990 spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn, 2019 spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
1991 kvm_is_mmio_pfn(pfn)); 2020 kvm_is_mmio_pfn(pfn));
1992 2021
1993 if (reset_host_protection) 2022 if (host_writable)
1994 spte |= SPTE_HOST_WRITEABLE; 2023 spte |= SPTE_HOST_WRITEABLE;
2024 else
2025 pte_access &= ~ACC_WRITE_MASK;
1995 2026
1996 spte |= (u64)pfn << PAGE_SHIFT; 2027 spte |= (u64)pfn << PAGE_SHIFT;
1997 2028
@@ -2036,6 +2067,14 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2036 2067
2037set_pte: 2068set_pte:
2038 update_spte(sptep, spte); 2069 update_spte(sptep, spte);
2070 /*
2071 * If we overwrite a writable spte with a read-only one we
2072 * should flush remote TLBs. Otherwise rmap_write_protect
2073 * will find a read-only spte, even though the writable spte
2074 * might be cached on a CPU's TLB.
2075 */
2076 if (is_writable_pte(entry) && !is_writable_pte(*sptep))
2077 kvm_flush_remote_tlbs(vcpu->kvm);
2039done: 2078done:
2040 return ret; 2079 return ret;
2041} 2080}
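The comment added at the end of set_spte() above explains the new flush: if the old shadow PTE was writable and the new one is not, a stale writable translation may still sit in a remote CPU's TLB, so a remote flush is forced. A minimal user-space model of just that check; the spte bit and the flush function are simplified stand-ins, not the kernel's definitions.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SPTE_WRITABLE (1ULL << 1)   /* stand-in for PT_WRITABLE_MASK */

static bool is_writable_spte(uint64_t spte)
{
        return spte & SPTE_WRITABLE;
}

/* Placeholder for kvm_flush_remote_tlbs(); here it only reports the flush. */
static void flush_remote_tlbs(void)
{
        puts("remote TLB flush");
}

/* Install a new spte and flush if write permission was just taken away. */
static void update_spte_model(uint64_t *sptep, uint64_t new_spte)
{
        uint64_t old = *sptep;

        *sptep = new_spte;
        if (is_writable_spte(old) && !is_writable_spte(new_spte))
                flush_remote_tlbs();
}

int main(void)
{
        uint64_t spte = 0x1000 | SPTE_WRITABLE;

        update_spte_model(&spte, 0x1000);   /* writable -> read-only: flushes */
        update_spte_model(&spte, 0x1000);   /* read-only -> read-only: no flush */
        return 0;
}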
@@ -2045,7 +2084,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2045 int user_fault, int write_fault, int dirty, 2084 int user_fault, int write_fault, int dirty,
2046 int *ptwrite, int level, gfn_t gfn, 2085 int *ptwrite, int level, gfn_t gfn,
2047 pfn_t pfn, bool speculative, 2086 pfn_t pfn, bool speculative,
2048 bool reset_host_protection) 2087 bool host_writable)
2049{ 2088{
2050 int was_rmapped = 0; 2089 int was_rmapped = 0;
2051 int rmap_count; 2090 int rmap_count;
@@ -2080,7 +2119,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2080 2119
2081 if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault, 2120 if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
2082 dirty, level, gfn, pfn, speculative, true, 2121 dirty, level, gfn, pfn, speculative, true,
2083 reset_host_protection)) { 2122 host_writable)) {
2084 if (write_fault) 2123 if (write_fault)
2085 *ptwrite = 1; 2124 *ptwrite = 1;
2086 kvm_mmu_flush_tlb(vcpu); 2125 kvm_mmu_flush_tlb(vcpu);
@@ -2211,7 +2250,8 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
2211} 2250}
2212 2251
2213static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, 2252static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
2214 int level, gfn_t gfn, pfn_t pfn) 2253 int map_writable, int level, gfn_t gfn, pfn_t pfn,
2254 bool prefault)
2215{ 2255{
2216 struct kvm_shadow_walk_iterator iterator; 2256 struct kvm_shadow_walk_iterator iterator;
2217 struct kvm_mmu_page *sp; 2257 struct kvm_mmu_page *sp;
@@ -2220,9 +2260,11 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
2220 2260
2221 for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { 2261 for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
2222 if (iterator.level == level) { 2262 if (iterator.level == level) {
2223 mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL, 2263 unsigned pte_access = ACC_ALL;
2264
2265 mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access,
2224 0, write, 1, &pt_write, 2266 0, write, 1, &pt_write,
2225 level, gfn, pfn, false, true); 2267 level, gfn, pfn, prefault, map_writable);
2226 direct_pte_prefetch(vcpu, iterator.sptep); 2268 direct_pte_prefetch(vcpu, iterator.sptep);
2227 ++vcpu->stat.pf_fixed; 2269 ++vcpu->stat.pf_fixed;
2228 break; 2270 break;
@@ -2277,27 +2319,81 @@ static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
2277 return 1; 2319 return 1;
2278} 2320}
2279 2321
2280static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) 2322static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
2323 gfn_t *gfnp, pfn_t *pfnp, int *levelp)
2324{
2325 pfn_t pfn = *pfnp;
2326 gfn_t gfn = *gfnp;
2327 int level = *levelp;
2328
2329 /*
2330 * Check if it's a transparent hugepage. If this would be an
2331 * hugetlbfs page, level wouldn't be set to
2332 * PT_PAGE_TABLE_LEVEL and there would be no adjustment done
2333 * here.
2334 */
2335 if (!is_error_pfn(pfn) && !kvm_is_mmio_pfn(pfn) &&
2336 level == PT_PAGE_TABLE_LEVEL &&
2337 PageTransCompound(pfn_to_page(pfn)) &&
2338 !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) {
2339 unsigned long mask;
2340 /*
2341 * mmu_notifier_retry was successful and we hold the
2342 * mmu_lock here, so the pmd can't become splitting
2343 * from under us, and in turn
2344 * __split_huge_page_refcount() can't run from under
2345 * us and we can safely transfer the refcount from
2346 * PG_tail to PG_head as we switch the pfn to tail to
2347 * head.
2348 */
2349 *levelp = level = PT_DIRECTORY_LEVEL;
2350 mask = KVM_PAGES_PER_HPAGE(level) - 1;
2351 VM_BUG_ON((gfn & mask) != (pfn & mask));
2352 if (pfn & mask) {
2353 gfn &= ~mask;
2354 *gfnp = gfn;
2355 kvm_release_pfn_clean(pfn);
2356 pfn &= ~mask;
2357 if (!get_page_unless_zero(pfn_to_page(pfn)))
2358 BUG();
2359 *pfnp = pfn;
2360 }
2361 }
2362}
2363
2364static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
2365 gva_t gva, pfn_t *pfn, bool write, bool *writable);
2366
2367static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
2368 bool prefault)
2281{ 2369{
2282 int r; 2370 int r;
2283 int level; 2371 int level;
2372 int force_pt_level;
2284 pfn_t pfn; 2373 pfn_t pfn;
2285 unsigned long mmu_seq; 2374 unsigned long mmu_seq;
2375 bool map_writable;
2286 2376
2287 level = mapping_level(vcpu, gfn); 2377 force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
2288 2378 if (likely(!force_pt_level)) {
2289 /* 2379 level = mapping_level(vcpu, gfn);
2290 * This path builds a PAE pagetable - so we can map 2mb pages at 2380 /*
2291 * maximum. Therefore check if the level is larger than that. 2381 * This path builds a PAE pagetable - so we can map
2292 */ 2382 * 2mb pages at maximum. Therefore check if the level
2293 if (level > PT_DIRECTORY_LEVEL) 2383 * is larger than that.
2294 level = PT_DIRECTORY_LEVEL; 2384 */
2385 if (level > PT_DIRECTORY_LEVEL)
2386 level = PT_DIRECTORY_LEVEL;
2295 2387
2296 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); 2388 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
2389 } else
2390 level = PT_PAGE_TABLE_LEVEL;
2297 2391
2298 mmu_seq = vcpu->kvm->mmu_notifier_seq; 2392 mmu_seq = vcpu->kvm->mmu_notifier_seq;
2299 smp_rmb(); 2393 smp_rmb();
2300 pfn = gfn_to_pfn(vcpu->kvm, gfn); 2394
2395 if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))
2396 return 0;
2301 2397
2302 /* mmio */ 2398 /* mmio */
2303 if (is_error_pfn(pfn)) 2399 if (is_error_pfn(pfn))
@@ -2307,7 +2403,10 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
2307 if (mmu_notifier_retry(vcpu, mmu_seq)) 2403 if (mmu_notifier_retry(vcpu, mmu_seq))
2308 goto out_unlock; 2404 goto out_unlock;
2309 kvm_mmu_free_some_pages(vcpu); 2405 kvm_mmu_free_some_pages(vcpu);
2310 r = __direct_map(vcpu, v, write, level, gfn, pfn); 2406 if (likely(!force_pt_level))
2407 transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
2408 r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn,
2409 prefault);
2311 spin_unlock(&vcpu->kvm->mmu_lock); 2410 spin_unlock(&vcpu->kvm->mmu_lock);
2312 2411
2313 2412
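transparent_hugepage_adjust() above only promotes the mapping when the gfn and pfn are misaligned by the same amount inside the huge page, then rounds both down by the same mask before installing the large spte. That arithmetic can be checked in isolation; the demo below assumes x86 with 4KB base pages and a 2MB huge page (512 small pages), and the sample frame numbers are arbitrary.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define PAGES_PER_2M_HPAGE 512ULL            /* 2MB / 4KB */

int main(void)
{
        uint64_t mask = PAGES_PER_2M_HPAGE - 1;   /* low 9 bits */
        uint64_t gfn = 0x12345;                   /* guest frame number */
        uint64_t pfn = 0x98745;                   /* host frame number */

        /* The huge mapping is only legal if gfn and pfn are offset by the
         * same amount inside the 2MB region. */
        assert((gfn & mask) == (pfn & mask));

        /* Round both down to the start of the huge page, as the code above
         * does before mapping at PT_DIRECTORY_LEVEL. */
        gfn &= ~mask;
        pfn &= ~mask;

        printf("huge mapping: gfn 0x%llx -> pfn 0x%llx (%llu small pages)\n",
               (unsigned long long)gfn, (unsigned long long)pfn,
               (unsigned long long)PAGES_PER_2M_HPAGE);
        return 0;
}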
@@ -2394,7 +2493,8 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
2394 ASSERT(!VALID_PAGE(root)); 2493 ASSERT(!VALID_PAGE(root));
2395 spin_lock(&vcpu->kvm->mmu_lock); 2494 spin_lock(&vcpu->kvm->mmu_lock);
2396 kvm_mmu_free_some_pages(vcpu); 2495 kvm_mmu_free_some_pages(vcpu);
2397 sp = kvm_mmu_get_page(vcpu, i << 30, i << 30, 2496 sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT),
2497 i << 30,
2398 PT32_ROOT_LEVEL, 1, ACC_ALL, 2498 PT32_ROOT_LEVEL, 1, ACC_ALL,
2399 NULL); 2499 NULL);
2400 root = __pa(sp->spt); 2500 root = __pa(sp->spt);
@@ -2529,6 +2629,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
2529 hpa_t root = vcpu->arch.mmu.root_hpa; 2629 hpa_t root = vcpu->arch.mmu.root_hpa;
2530 sp = page_header(root); 2630 sp = page_header(root);
2531 mmu_sync_children(vcpu, sp); 2631 mmu_sync_children(vcpu, sp);
2632 trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
2532 return; 2633 return;
2533 } 2634 }
2534 for (i = 0; i < 4; ++i) { 2635 for (i = 0; i < 4; ++i) {
@@ -2551,23 +2652,24 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
2551} 2652}
2552 2653
2553static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr, 2654static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
2554 u32 access, u32 *error) 2655 u32 access, struct x86_exception *exception)
2555{ 2656{
2556 if (error) 2657 if (exception)
2557 *error = 0; 2658 exception->error_code = 0;
2558 return vaddr; 2659 return vaddr;
2559} 2660}
2560 2661
2561static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr, 2662static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
2562 u32 access, u32 *error) 2663 u32 access,
2664 struct x86_exception *exception)
2563{ 2665{
2564 if (error) 2666 if (exception)
2565 *error = 0; 2667 exception->error_code = 0;
2566 return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access); 2668 return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access);
2567} 2669}
2568 2670
2569static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, 2671static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
2570 u32 error_code) 2672 u32 error_code, bool prefault)
2571{ 2673{
2572 gfn_t gfn; 2674 gfn_t gfn;
2573 int r; 2675 int r;
@@ -2583,17 +2685,68 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
2583 gfn = gva >> PAGE_SHIFT; 2685 gfn = gva >> PAGE_SHIFT;
2584 2686
2585 return nonpaging_map(vcpu, gva & PAGE_MASK, 2687 return nonpaging_map(vcpu, gva & PAGE_MASK,
2586 error_code & PFERR_WRITE_MASK, gfn); 2688 error_code & PFERR_WRITE_MASK, gfn, prefault);
2689}
2690
2691static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
2692{
2693 struct kvm_arch_async_pf arch;
2694
2695 arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
2696 arch.gfn = gfn;
2697 arch.direct_map = vcpu->arch.mmu.direct_map;
2698 arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu);
2699
2700 return kvm_setup_async_pf(vcpu, gva, gfn, &arch);
2701}
2702
2703static bool can_do_async_pf(struct kvm_vcpu *vcpu)
2704{
2705 if (unlikely(!irqchip_in_kernel(vcpu->kvm) ||
2706 kvm_event_needs_reinjection(vcpu)))
2707 return false;
2708
2709 return kvm_x86_ops->interrupt_allowed(vcpu);
2587} 2710}
2588 2711
2589static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, 2712static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
2590 u32 error_code) 2713 gva_t gva, pfn_t *pfn, bool write, bool *writable)
2714{
2715 bool async;
2716
2717 *pfn = gfn_to_pfn_async(vcpu->kvm, gfn, &async, write, writable);
2718
2719 if (!async)
2720 return false; /* *pfn has correct page already */
2721
2722 put_page(pfn_to_page(*pfn));
2723
2724 if (!prefault && can_do_async_pf(vcpu)) {
2725 trace_kvm_try_async_get_page(gva, gfn);
2726 if (kvm_find_async_pf_gfn(vcpu, gfn)) {
2727 trace_kvm_async_pf_doublefault(gva, gfn);
2728 kvm_make_request(KVM_REQ_APF_HALT, vcpu);
2729 return true;
2730 } else if (kvm_arch_setup_async_pf(vcpu, gva, gfn))
2731 return true;
2732 }
2733
2734 *pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write, writable);
2735
2736 return false;
2737}
2738
2739static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
2740 bool prefault)
2591{ 2741{
2592 pfn_t pfn; 2742 pfn_t pfn;
2593 int r; 2743 int r;
2594 int level; 2744 int level;
2745 int force_pt_level;
2595 gfn_t gfn = gpa >> PAGE_SHIFT; 2746 gfn_t gfn = gpa >> PAGE_SHIFT;
2596 unsigned long mmu_seq; 2747 unsigned long mmu_seq;
2748 int write = error_code & PFERR_WRITE_MASK;
2749 bool map_writable;
2597 2750
2598 ASSERT(vcpu); 2751 ASSERT(vcpu);
2599 ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); 2752 ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
@@ -2602,21 +2755,30 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
2602 if (r) 2755 if (r)
2603 return r; 2756 return r;
2604 2757
2605 level = mapping_level(vcpu, gfn); 2758 force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
2606 2759 if (likely(!force_pt_level)) {
2607 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); 2760 level = mapping_level(vcpu, gfn);
2761 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
2762 } else
2763 level = PT_PAGE_TABLE_LEVEL;
2608 2764
2609 mmu_seq = vcpu->kvm->mmu_notifier_seq; 2765 mmu_seq = vcpu->kvm->mmu_notifier_seq;
2610 smp_rmb(); 2766 smp_rmb();
2611 pfn = gfn_to_pfn(vcpu->kvm, gfn); 2767
2768 if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
2769 return 0;
2770
2771 /* mmio */
2612 if (is_error_pfn(pfn)) 2772 if (is_error_pfn(pfn))
2613 return kvm_handle_bad_page(vcpu->kvm, gfn, pfn); 2773 return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
2614 spin_lock(&vcpu->kvm->mmu_lock); 2774 spin_lock(&vcpu->kvm->mmu_lock);
2615 if (mmu_notifier_retry(vcpu, mmu_seq)) 2775 if (mmu_notifier_retry(vcpu, mmu_seq))
2616 goto out_unlock; 2776 goto out_unlock;
2617 kvm_mmu_free_some_pages(vcpu); 2777 kvm_mmu_free_some_pages(vcpu);
2618 r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK, 2778 if (likely(!force_pt_level))
2619 level, gfn, pfn); 2779 transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
2780 r = __direct_map(vcpu, gpa, write, map_writable,
2781 level, gfn, pfn, prefault);
2620 spin_unlock(&vcpu->kvm->mmu_lock); 2782 spin_unlock(&vcpu->kvm->mmu_lock);
2621 2783
2622 return r; 2784 return r;
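kvm_arch_setup_async_pf() in the hunk above packs the identity of an outstanding async page fault into a single token (modelled here as 32-bit): a per-vcpu sequence number in the upper bits and the vcpu id in the low 12 bits. A stand-alone sketch of that encoding; only the 12-bit split is taken from the code above, the struct and helper names are invented.

#include <stdint.h>
#include <stdio.h>

struct apf_token {
        uint32_t id;        /* per-vcpu sequence number */
        uint32_t vcpu_id;   /* which vcpu the fault belongs to */
};

static uint32_t pack_token(struct apf_token t)
{
        /* Same shape as arch.token = (apf.id++ << 12) | vcpu_id */
        return (t.id << 12) | (t.vcpu_id & 0xfff);
}

static struct apf_token unpack_token(uint32_t token)
{
        struct apf_token t = {
                .id      = token >> 12,
                .vcpu_id = token & 0xfff,
        };
        return t;
}

int main(void)
{
        struct apf_token t = { .id = 7, .vcpu_id = 3 };
        uint32_t token = pack_token(t);
        struct apf_token back = unpack_token(token);

        printf("token=0x%x id=%u vcpu=%u\n", token, back.id, back.vcpu_id);
        return 0;
}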
@@ -2658,18 +2820,19 @@ void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
2658 2820
2659static void paging_new_cr3(struct kvm_vcpu *vcpu) 2821static void paging_new_cr3(struct kvm_vcpu *vcpu)
2660{ 2822{
2661 pgprintk("%s: cr3 %lx\n", __func__, vcpu->arch.cr3); 2823 pgprintk("%s: cr3 %lx\n", __func__, kvm_read_cr3(vcpu));
2662 mmu_free_roots(vcpu); 2824 mmu_free_roots(vcpu);
2663} 2825}
2664 2826
2665static unsigned long get_cr3(struct kvm_vcpu *vcpu) 2827static unsigned long get_cr3(struct kvm_vcpu *vcpu)
2666{ 2828{
2667 return vcpu->arch.cr3; 2829 return kvm_read_cr3(vcpu);
2668} 2830}
2669 2831
2670static void inject_page_fault(struct kvm_vcpu *vcpu) 2832static void inject_page_fault(struct kvm_vcpu *vcpu,
2833 struct x86_exception *fault)
2671{ 2834{
2672 vcpu->arch.mmu.inject_page_fault(vcpu); 2835 vcpu->arch.mmu.inject_page_fault(vcpu, fault);
2673} 2836}
2674 2837
2675static void paging_free(struct kvm_vcpu *vcpu) 2838static void paging_free(struct kvm_vcpu *vcpu)
@@ -2815,6 +2978,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
2815{ 2978{
2816 struct kvm_mmu *context = vcpu->arch.walk_mmu; 2979 struct kvm_mmu *context = vcpu->arch.walk_mmu;
2817 2980
2981 context->base_role.word = 0;
2818 context->new_cr3 = nonpaging_new_cr3; 2982 context->new_cr3 = nonpaging_new_cr3;
2819 context->page_fault = tdp_page_fault; 2983 context->page_fault = tdp_page_fault;
2820 context->free = nonpaging_free; 2984 context->free = nonpaging_free;
@@ -3007,9 +3171,6 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
3007 return; 3171 return;
3008 } 3172 }
3009 3173
3010 if (is_rsvd_bits_set(&vcpu->arch.mmu, *(u64 *)new, PT_PAGE_TABLE_LEVEL))
3011 return;
3012
3013 ++vcpu->kvm->stat.mmu_pte_updated; 3174 ++vcpu->kvm->stat.mmu_pte_updated;
3014 if (!sp->role.cr4_pae) 3175 if (!sp->role.cr4_pae)
3015 paging32_update_pte(vcpu, sp, spte, new); 3176 paging32_update_pte(vcpu, sp, spte, new);
@@ -3263,12 +3424,13 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
3263 } 3424 }
3264} 3425}
3265 3426
3266int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) 3427int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
3428 void *insn, int insn_len)
3267{ 3429{
3268 int r; 3430 int r;
3269 enum emulation_result er; 3431 enum emulation_result er;
3270 3432
3271 r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code); 3433 r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false);
3272 if (r < 0) 3434 if (r < 0)
3273 goto out; 3435 goto out;
3274 3436
@@ -3281,7 +3443,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
3281 if (r) 3443 if (r)
3282 goto out; 3444 goto out;
3283 3445
3284 er = emulate_instruction(vcpu, cr2, error_code, 0); 3446 er = x86_emulate_instruction(vcpu, cr2, 0, insn, insn_len);
3285 3447
3286 switch (er) { 3448 switch (er) {
3287 case EMULATE_DONE: 3449 case EMULATE_DONE:
@@ -3376,11 +3538,14 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
3376 if (!test_bit(slot, sp->slot_bitmap)) 3538 if (!test_bit(slot, sp->slot_bitmap))
3377 continue; 3539 continue;
3378 3540
3541 if (sp->role.level != PT_PAGE_TABLE_LEVEL)
3542 continue;
3543
3379 pt = sp->spt; 3544 pt = sp->spt;
3380 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) 3545 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
3381 /* avoid RMW */ 3546 /* avoid RMW */
3382 if (is_writable_pte(pt[i])) 3547 if (is_writable_pte(pt[i]))
3383 pt[i] &= ~PT_WRITABLE_MASK; 3548 update_spte(&pt[i], pt[i] & ~PT_WRITABLE_MASK);
3384 } 3549 }
3385 kvm_flush_remote_tlbs(kvm); 3550 kvm_flush_remote_tlbs(kvm);
3386} 3551}
@@ -3462,13 +3627,6 @@ static void mmu_destroy_caches(void)
3462 kmem_cache_destroy(mmu_page_header_cache); 3627 kmem_cache_destroy(mmu_page_header_cache);
3463} 3628}
3464 3629
3465void kvm_mmu_module_exit(void)
3466{
3467 mmu_destroy_caches();
3468 percpu_counter_destroy(&kvm_total_used_mmu_pages);
3469 unregister_shrinker(&mmu_shrinker);
3470}
3471
3472int kvm_mmu_module_init(void) 3630int kvm_mmu_module_init(void)
3473{ 3631{
3474 pte_chain_cache = kmem_cache_create("kvm_pte_chain", 3632 pte_chain_cache = kmem_cache_create("kvm_pte_chain",
@@ -3565,7 +3723,7 @@ static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu,
3565 3723
3566static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu) 3724static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu)
3567{ 3725{
3568 (void)kvm_set_cr3(vcpu, vcpu->arch.cr3); 3726 (void)kvm_set_cr3(vcpu, kvm_read_cr3(vcpu));
3569 return 1; 3727 return 1;
3570} 3728}
3571 3729
@@ -3661,12 +3819,6 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
3661} 3819}
3662EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy); 3820EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);
3663 3821
3664#ifdef CONFIG_KVM_MMU_AUDIT
3665#include "mmu_audit.c"
3666#else
3667static void mmu_audit_disable(void) { }
3668#endif
3669
3670void kvm_mmu_destroy(struct kvm_vcpu *vcpu) 3822void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
3671{ 3823{
3672 ASSERT(vcpu); 3824 ASSERT(vcpu);
@@ -3674,5 +3826,18 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
3674 destroy_kvm_mmu(vcpu); 3826 destroy_kvm_mmu(vcpu);
3675 free_mmu_pages(vcpu); 3827 free_mmu_pages(vcpu);
3676 mmu_free_memory_caches(vcpu); 3828 mmu_free_memory_caches(vcpu);
3829}
3830
3831#ifdef CONFIG_KVM_MMU_AUDIT
3832#include "mmu_audit.c"
3833#else
3834static void mmu_audit_disable(void) { }
3835#endif
3836
3837void kvm_mmu_module_exit(void)
3838{
3839 mmu_destroy_caches();
3840 percpu_counter_destroy(&kvm_total_used_mmu_pages);
3841 unregister_shrinker(&mmu_shrinker);
3677 mmu_audit_disable(); 3842 mmu_audit_disable();
3678} 3843}
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index ba2bcdde6221..5f6223b8bcf7 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -19,11 +19,9 @@
19 19
20#include <linux/ratelimit.h> 20#include <linux/ratelimit.h>
21 21
22static int audit_point; 22#define audit_printk(kvm, fmt, args...) \
23
24#define audit_printk(fmt, args...) \
25 printk(KERN_ERR "audit: (%s) error: " \ 23 printk(KERN_ERR "audit: (%s) error: " \
26 fmt, audit_point_name[audit_point], ##args) 24 fmt, audit_point_name[kvm->arch.audit_point], ##args)
27 25
28typedef void (*inspect_spte_fn) (struct kvm_vcpu *vcpu, u64 *sptep, int level); 26typedef void (*inspect_spte_fn) (struct kvm_vcpu *vcpu, u64 *sptep, int level);
29 27
@@ -97,18 +95,21 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
97 95
98 if (sp->unsync) { 96 if (sp->unsync) {
99 if (level != PT_PAGE_TABLE_LEVEL) { 97 if (level != PT_PAGE_TABLE_LEVEL) {
100 audit_printk("unsync sp: %p level = %d\n", sp, level); 98 audit_printk(vcpu->kvm, "unsync sp: %p "
99 "level = %d\n", sp, level);
101 return; 100 return;
102 } 101 }
103 102
104 if (*sptep == shadow_notrap_nonpresent_pte) { 103 if (*sptep == shadow_notrap_nonpresent_pte) {
105 audit_printk("notrap spte in unsync sp: %p\n", sp); 104 audit_printk(vcpu->kvm, "notrap spte in unsync "
105 "sp: %p\n", sp);
106 return; 106 return;
107 } 107 }
108 } 108 }
109 109
110 if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) { 110 if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) {
111 audit_printk("notrap spte in direct sp: %p\n", sp); 111 audit_printk(vcpu->kvm, "notrap spte in direct sp: %p\n",
112 sp);
112 return; 113 return;
113 } 114 }
114 115
@@ -125,8 +126,9 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
125 126
126 hpa = pfn << PAGE_SHIFT; 127 hpa = pfn << PAGE_SHIFT;
127 if ((*sptep & PT64_BASE_ADDR_MASK) != hpa) 128 if ((*sptep & PT64_BASE_ADDR_MASK) != hpa)
128 audit_printk("levels %d pfn %llx hpa %llx ent %llxn", 129 audit_printk(vcpu->kvm, "levels %d pfn %llx hpa %llx "
129 vcpu->arch.mmu.root_level, pfn, hpa, *sptep); 130 "ent %llxn", vcpu->arch.mmu.root_level, pfn,
131 hpa, *sptep);
130} 132}
131 133
132static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) 134static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
@@ -142,8 +144,8 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
142 if (!gfn_to_memslot(kvm, gfn)) { 144 if (!gfn_to_memslot(kvm, gfn)) {
143 if (!printk_ratelimit()) 145 if (!printk_ratelimit())
144 return; 146 return;
145 audit_printk("no memslot for gfn %llx\n", gfn); 147 audit_printk(kvm, "no memslot for gfn %llx\n", gfn);
146 audit_printk("index %ld of sp (gfn=%llx)\n", 148 audit_printk(kvm, "index %ld of sp (gfn=%llx)\n",
147 (long int)(sptep - rev_sp->spt), rev_sp->gfn); 149 (long int)(sptep - rev_sp->spt), rev_sp->gfn);
148 dump_stack(); 150 dump_stack();
149 return; 151 return;
@@ -153,7 +155,8 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
153 if (!*rmapp) { 155 if (!*rmapp) {
154 if (!printk_ratelimit()) 156 if (!printk_ratelimit())
155 return; 157 return;
156 audit_printk("no rmap for writable spte %llx\n", *sptep); 158 audit_printk(kvm, "no rmap for writable spte %llx\n",
159 *sptep);
157 dump_stack(); 160 dump_stack();
158 } 161 }
159} 162}
@@ -168,8 +171,9 @@ static void audit_spte_after_sync(struct kvm_vcpu *vcpu, u64 *sptep, int level)
168{ 171{
169 struct kvm_mmu_page *sp = page_header(__pa(sptep)); 172 struct kvm_mmu_page *sp = page_header(__pa(sptep));
170 173
171 if (audit_point == AUDIT_POST_SYNC && sp->unsync) 174 if (vcpu->kvm->arch.audit_point == AUDIT_POST_SYNC && sp->unsync)
172 audit_printk("meet unsync sp(%p) after sync root.\n", sp); 175 audit_printk(vcpu->kvm, "meet unsync sp(%p) after sync "
176 "root.\n", sp);
173} 177}
174 178
175static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp) 179static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp)
@@ -202,8 +206,9 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
202 spte = rmap_next(kvm, rmapp, NULL); 206 spte = rmap_next(kvm, rmapp, NULL);
203 while (spte) { 207 while (spte) {
204 if (is_writable_pte(*spte)) 208 if (is_writable_pte(*spte))
205 audit_printk("shadow page has writable mappings: gfn " 209 audit_printk(kvm, "shadow page has writable "
206 "%llx role %x\n", sp->gfn, sp->role.word); 210 "mappings: gfn %llx role %x\n",
211 sp->gfn, sp->role.word);
207 spte = rmap_next(kvm, rmapp, spte); 212 spte = rmap_next(kvm, rmapp, spte);
208 } 213 }
209} 214}
@@ -238,7 +243,7 @@ static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int point)
238 if (!__ratelimit(&ratelimit_state)) 243 if (!__ratelimit(&ratelimit_state))
239 return; 244 return;
240 245
241 audit_point = point; 246 vcpu->kvm->arch.audit_point = point;
242 audit_all_active_sps(vcpu->kvm); 247 audit_all_active_sps(vcpu->kvm);
243 audit_vcpu_spte(vcpu); 248 audit_vcpu_spte(vcpu);
244} 249}
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index cd7a833a3b52..6bccc24c4181 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -72,7 +72,7 @@ struct guest_walker {
72 unsigned pt_access; 72 unsigned pt_access;
73 unsigned pte_access; 73 unsigned pte_access;
74 gfn_t gfn; 74 gfn_t gfn;
75 u32 error_code; 75 struct x86_exception fault;
76}; 76};
77 77
78static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl) 78static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
@@ -266,21 +266,23 @@ walk:
266 return 1; 266 return 1;
267 267
268error: 268error:
269 walker->error_code = 0; 269 walker->fault.vector = PF_VECTOR;
270 walker->fault.error_code_valid = true;
271 walker->fault.error_code = 0;
270 if (present) 272 if (present)
271 walker->error_code |= PFERR_PRESENT_MASK; 273 walker->fault.error_code |= PFERR_PRESENT_MASK;
272 274
273 walker->error_code |= write_fault | user_fault; 275 walker->fault.error_code |= write_fault | user_fault;
274 276
275 if (fetch_fault && mmu->nx) 277 if (fetch_fault && mmu->nx)
276 walker->error_code |= PFERR_FETCH_MASK; 278 walker->fault.error_code |= PFERR_FETCH_MASK;
277 if (rsvd_fault) 279 if (rsvd_fault)
278 walker->error_code |= PFERR_RSVD_MASK; 280 walker->fault.error_code |= PFERR_RSVD_MASK;
279 281
280 vcpu->arch.fault.address = addr; 282 walker->fault.address = addr;
281 vcpu->arch.fault.error_code = walker->error_code; 283 walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu;
282 284
283 trace_kvm_mmu_walker_error(walker->error_code); 285 trace_kvm_mmu_walker_error(walker->fault.error_code);
284 return 0; 286 return 0;
285} 287}
286 288
@@ -299,25 +301,42 @@ static int FNAME(walk_addr_nested)(struct guest_walker *walker,
299 addr, access); 301 addr, access);
300} 302}
301 303
304static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
305 struct kvm_mmu_page *sp, u64 *spte,
306 pt_element_t gpte)
307{
308 u64 nonpresent = shadow_trap_nonpresent_pte;
309
310 if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
311 goto no_present;
312
313 if (!is_present_gpte(gpte)) {
314 if (!sp->unsync)
315 nonpresent = shadow_notrap_nonpresent_pte;
316 goto no_present;
317 }
318
319 if (!(gpte & PT_ACCESSED_MASK))
320 goto no_present;
321
322 return false;
323
324no_present:
325 drop_spte(vcpu->kvm, spte, nonpresent);
326 return true;
327}
328
302static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, 329static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
303 u64 *spte, const void *pte) 330 u64 *spte, const void *pte)
304{ 331{
305 pt_element_t gpte; 332 pt_element_t gpte;
306 unsigned pte_access; 333 unsigned pte_access;
307 pfn_t pfn; 334 pfn_t pfn;
308 u64 new_spte;
309 335
310 gpte = *(const pt_element_t *)pte; 336 gpte = *(const pt_element_t *)pte;
311 if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) { 337 if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
312 if (!is_present_gpte(gpte)) {
313 if (sp->unsync)
314 new_spte = shadow_trap_nonpresent_pte;
315 else
316 new_spte = shadow_notrap_nonpresent_pte;
317 __set_spte(spte, new_spte);
318 }
319 return; 338 return;
320 } 339
321 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); 340 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
322 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); 341 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
323 if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn) 342 if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn)
@@ -329,7 +348,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
329 return; 348 return;
330 kvm_get_pfn(pfn); 349 kvm_get_pfn(pfn);
331 /* 350 /*
 331 /* 350 /*
 332 * we call mmu_set_spte() with reset_host_protection = true because that 351 * we call mmu_set_spte() with host_writable = true because that
334 */ 353 */
335 mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, 354 mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
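FNAME(prefetch_invalid_gpte)() above folds the three reasons for refusing to shadow a guest PTE during prefetch (reserved bits set, not present, accessed bit clear) into one helper that also drops the stale spte. The stand-alone model below shows only the decision; the bit positions and names are invented for the demo.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define GPTE_PRESENT   (1ULL << 0)
#define GPTE_ACCESSED  (1ULL << 5)
/* Pretend bits 62:52 are reserved, purely for this demo. */
#define GPTE_RSVD_MASK (0x7ffULL << 52)

static bool gpte_invalid_for_prefetch(uint64_t gpte)
{
        if (gpte & GPTE_RSVD_MASK)        /* reserved bits set */
                return true;
        if (!(gpte & GPTE_PRESENT))       /* guest marked it not present */
                return true;
        if (!(gpte & GPTE_ACCESSED))      /* never used; not worth shadowing */
                return true;
        return false;
}

int main(void)
{
        uint64_t ok  = GPTE_PRESENT | GPTE_ACCESSED | 0x1000;
        uint64_t bad = GPTE_PRESENT | 0x1000;   /* accessed bit clear */

        printf("ok:  %s\n", gpte_invalid_for_prefetch(ok)  ? "skip" : "prefetch");
        printf("bad: %s\n", gpte_invalid_for_prefetch(bad) ? "skip" : "prefetch");
        return 0;
}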
@@ -364,7 +383,6 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
364 u64 *sptep) 383 u64 *sptep)
365{ 384{
366 struct kvm_mmu_page *sp; 385 struct kvm_mmu_page *sp;
367 struct kvm_mmu *mmu = &vcpu->arch.mmu;
368 pt_element_t *gptep = gw->prefetch_ptes; 386 pt_element_t *gptep = gw->prefetch_ptes;
369 u64 *spte; 387 u64 *spte;
370 int i; 388 int i;
@@ -395,14 +413,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
395 413
396 gpte = gptep[i]; 414 gpte = gptep[i];
397 415
398 if (!is_present_gpte(gpte) || 416 if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
399 is_rsvd_bits_set(mmu, gpte, PT_PAGE_TABLE_LEVEL)) {
400 if (!sp->unsync)
401 __set_spte(spte, shadow_notrap_nonpresent_pte);
402 continue;
403 }
404
405 if (!(gpte & PT_ACCESSED_MASK))
406 continue; 417 continue;
407 418
408 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); 419 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
@@ -427,7 +438,8 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
427static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, 438static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
428 struct guest_walker *gw, 439 struct guest_walker *gw,
429 int user_fault, int write_fault, int hlevel, 440 int user_fault, int write_fault, int hlevel,
430 int *ptwrite, pfn_t pfn) 441 int *ptwrite, pfn_t pfn, bool map_writable,
442 bool prefault)
431{ 443{
432 unsigned access = gw->pt_access; 444 unsigned access = gw->pt_access;
433 struct kvm_mmu_page *sp = NULL; 445 struct kvm_mmu_page *sp = NULL;
@@ -501,7 +513,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
501 513
502 mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access, 514 mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access,
503 user_fault, write_fault, dirty, ptwrite, it.level, 515 user_fault, write_fault, dirty, ptwrite, it.level,
504 gw->gfn, pfn, false, true); 516 gw->gfn, pfn, prefault, map_writable);
505 FNAME(pte_prefetch)(vcpu, gw, it.sptep); 517 FNAME(pte_prefetch)(vcpu, gw, it.sptep);
506 518
507 return it.sptep; 519 return it.sptep;
@@ -527,8 +539,8 @@ out_gpte_changed:
527 * Returns: 1 if we need to emulate the instruction, 0 otherwise, or 539 * Returns: 1 if we need to emulate the instruction, 0 otherwise, or
528 * a negative value on error. 540 * a negative value on error.
529 */ 541 */
530static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, 542static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
531 u32 error_code) 543 bool prefault)
532{ 544{
533 int write_fault = error_code & PFERR_WRITE_MASK; 545 int write_fault = error_code & PFERR_WRITE_MASK;
534 int user_fault = error_code & PFERR_USER_MASK; 546 int user_fault = error_code & PFERR_USER_MASK;
@@ -538,7 +550,9 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
538 int r; 550 int r;
539 pfn_t pfn; 551 pfn_t pfn;
540 int level = PT_PAGE_TABLE_LEVEL; 552 int level = PT_PAGE_TABLE_LEVEL;
553 int force_pt_level;
541 unsigned long mmu_seq; 554 unsigned long mmu_seq;
555 bool map_writable;
542 556
543 pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); 557 pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
544 558
@@ -556,19 +570,29 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
556 */ 570 */
557 if (!r) { 571 if (!r) {
558 pgprintk("%s: guest page fault\n", __func__); 572 pgprintk("%s: guest page fault\n", __func__);
559 inject_page_fault(vcpu); 573 if (!prefault) {
560 vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ 574 inject_page_fault(vcpu, &walker.fault);
575 /* reset fork detector */
576 vcpu->arch.last_pt_write_count = 0;
577 }
561 return 0; 578 return 0;
562 } 579 }
563 580
564 if (walker.level >= PT_DIRECTORY_LEVEL) { 581 if (walker.level >= PT_DIRECTORY_LEVEL)
582 force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn);
583 else
584 force_pt_level = 1;
585 if (!force_pt_level) {
565 level = min(walker.level, mapping_level(vcpu, walker.gfn)); 586 level = min(walker.level, mapping_level(vcpu, walker.gfn));
566 walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); 587 walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1);
567 } 588 }
568 589
569 mmu_seq = vcpu->kvm->mmu_notifier_seq; 590 mmu_seq = vcpu->kvm->mmu_notifier_seq;
570 smp_rmb(); 591 smp_rmb();
571 pfn = gfn_to_pfn(vcpu->kvm, walker.gfn); 592
593 if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault,
594 &map_writable))
595 return 0;
572 596
573 /* mmio */ 597 /* mmio */
574 if (is_error_pfn(pfn)) 598 if (is_error_pfn(pfn))
@@ -580,8 +604,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
580 604
581 trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); 605 trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
582 kvm_mmu_free_some_pages(vcpu); 606 kvm_mmu_free_some_pages(vcpu);
607 if (!force_pt_level)
608 transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
583 sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, 609 sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
584 level, &write_pt, pfn); 610 level, &write_pt, pfn, map_writable, prefault);
585 (void)sptep; 611 (void)sptep;
586 pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__, 612 pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__,
587 sptep, *sptep, write_pt); 613 sptep, *sptep, write_pt);
@@ -661,7 +687,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
661} 687}
662 688
663static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, 689static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
664 u32 *error) 690 struct x86_exception *exception)
665{ 691{
666 struct guest_walker walker; 692 struct guest_walker walker;
667 gpa_t gpa = UNMAPPED_GVA; 693 gpa_t gpa = UNMAPPED_GVA;
@@ -672,14 +698,15 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
672 if (r) { 698 if (r) {
673 gpa = gfn_to_gpa(walker.gfn); 699 gpa = gfn_to_gpa(walker.gfn);
674 gpa |= vaddr & ~PAGE_MASK; 700 gpa |= vaddr & ~PAGE_MASK;
675 } else if (error) 701 } else if (exception)
676 *error = walker.error_code; 702 *exception = walker.fault;
677 703
678 return gpa; 704 return gpa;
679} 705}
680 706
681static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, 707static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
682 u32 access, u32 *error) 708 u32 access,
709 struct x86_exception *exception)
683{ 710{
684 struct guest_walker walker; 711 struct guest_walker walker;
685 gpa_t gpa = UNMAPPED_GVA; 712 gpa_t gpa = UNMAPPED_GVA;
@@ -690,8 +717,8 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
690 if (r) { 717 if (r) {
691 gpa = gfn_to_gpa(walker.gfn); 718 gpa = gfn_to_gpa(walker.gfn);
692 gpa |= vaddr & ~PAGE_MASK; 719 gpa |= vaddr & ~PAGE_MASK;
693 } else if (error) 720 } else if (exception)
694 *error = walker.error_code; 721 *exception = walker.fault;
695 722
696 return gpa; 723 return gpa;
697} 724}
@@ -730,12 +757,19 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
730 * Using the cached information from sp->gfns is safe because: 757 * Using the cached information from sp->gfns is safe because:
731 * - The spte has a reference to the struct page, so the pfn for a given gfn 758 * - The spte has a reference to the struct page, so the pfn for a given gfn
732 * can't change unless all sptes pointing to it are nuked first. 759 * can't change unless all sptes pointing to it are nuked first.
760 *
761 * Note:
762 * We should flush all tlbs if spte is dropped even though guest is
763 * responsible for it. Since if we don't, kvm_mmu_notifier_invalidate_page
764 * and kvm_mmu_notifier_invalidate_range_start detect the mapping page isn't
765 * used by guest then tlbs are not flushed, so guest is allowed to access the
766 * freed pages.
767 * And we increase kvm->tlbs_dirty to delay tlbs flush in this case.
733 */ 768 */
734static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, 769static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
735 bool clear_unsync)
736{ 770{
737 int i, offset, nr_present; 771 int i, offset, nr_present;
738 bool reset_host_protection; 772 bool host_writable;
739 gpa_t first_pte_gpa; 773 gpa_t first_pte_gpa;
740 774
741 offset = nr_present = 0; 775 offset = nr_present = 0;
@@ -764,31 +798,27 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
764 return -EINVAL; 798 return -EINVAL;
765 799
766 gfn = gpte_to_gfn(gpte); 800 gfn = gpte_to_gfn(gpte);
767 if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)
768 || gfn != sp->gfns[i] || !is_present_gpte(gpte)
769 || !(gpte & PT_ACCESSED_MASK)) {
770 u64 nonpresent;
771 801
772 if (is_present_gpte(gpte) || !clear_unsync) 802 if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
773 nonpresent = shadow_trap_nonpresent_pte; 803 vcpu->kvm->tlbs_dirty++;
774 else 804 continue;
775 nonpresent = shadow_notrap_nonpresent_pte; 805 }
776 drop_spte(vcpu->kvm, &sp->spt[i], nonpresent); 806
807 if (gfn != sp->gfns[i]) {
808 drop_spte(vcpu->kvm, &sp->spt[i],
809 shadow_trap_nonpresent_pte);
810 vcpu->kvm->tlbs_dirty++;
777 continue; 811 continue;
778 } 812 }
779 813
780 nr_present++; 814 nr_present++;
781 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); 815 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
782 if (!(sp->spt[i] & SPTE_HOST_WRITEABLE)) { 816 host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE;
783 pte_access &= ~ACC_WRITE_MASK; 817
784 reset_host_protection = 0;
785 } else {
786 reset_host_protection = 1;
787 }
788 set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, 818 set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
789 is_dirty_gpte(gpte), PT_PAGE_TABLE_LEVEL, gfn, 819 is_dirty_gpte(gpte), PT_PAGE_TABLE_LEVEL, gfn,
790 spte_to_pfn(sp->spt[i]), true, false, 820 spte_to_pfn(sp->spt[i]), true, false,
791 reset_host_protection); 821 host_writable);
792 } 822 }
793 823
794 return !nr_present; 824 return !nr_present;
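The note added above FNAME(sync_page) describes deferring TLB flushes: when a stale shadow PTE is dropped, kvm->tlbs_dirty is incremented instead of flushing immediately, and a later path performs one flush covering all of them. A toy model of that accounting pattern; flush_count is invented just to make the saving visible.

#include <stdio.h>

static unsigned long tlbs_dirty;   /* drops recorded but not yet flushed */
static unsigned long flush_count;  /* how many real flushes happened */

/* Drop a stale shadow PTE: record the debt instead of flushing now. */
static void drop_stale_spte(void)
{
        tlbs_dirty++;
}

/* Later, a single flush settles all the recorded drops at once. */
static void flush_if_dirty(void)
{
        if (tlbs_dirty) {
                flush_count++;
                tlbs_dirty = 0;
        }
}

int main(void)
{
        for (int i = 0; i < 8; i++)     /* eight stale entries found */
                drop_stale_spte();
        flush_if_dirty();               /* one flush instead of eight */

        printf("flushes issued: %lu\n", flush_count);
        return 0;
}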
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 82e144a4e514..25bd1bc5aad2 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -31,6 +31,7 @@
31 31
32#include <asm/tlbflush.h> 32#include <asm/tlbflush.h>
33#include <asm/desc.h> 33#include <asm/desc.h>
34#include <asm/kvm_para.h>
34 35
35#include <asm/virtext.h> 36#include <asm/virtext.h>
36#include "trace.h" 37#include "trace.h"
@@ -50,6 +51,10 @@ MODULE_LICENSE("GPL");
50#define SVM_FEATURE_LBRV (1 << 1) 51#define SVM_FEATURE_LBRV (1 << 1)
51#define SVM_FEATURE_SVML (1 << 2) 52#define SVM_FEATURE_SVML (1 << 2)
52#define SVM_FEATURE_NRIP (1 << 3) 53#define SVM_FEATURE_NRIP (1 << 3)
54#define SVM_FEATURE_TSC_RATE (1 << 4)
55#define SVM_FEATURE_VMCB_CLEAN (1 << 5)
56#define SVM_FEATURE_FLUSH_ASID (1 << 6)
57#define SVM_FEATURE_DECODE_ASSIST (1 << 7)
53#define SVM_FEATURE_PAUSE_FILTER (1 << 10) 58#define SVM_FEATURE_PAUSE_FILTER (1 << 10)
54 59
55#define NESTED_EXIT_HOST 0 /* Exit handled on host level */ 60#define NESTED_EXIT_HOST 0 /* Exit handled on host level */
@@ -97,10 +102,8 @@ struct nested_state {
97 unsigned long vmexit_rax; 102 unsigned long vmexit_rax;
98 103
99 /* cache for intercepts of the guest */ 104 /* cache for intercepts of the guest */
100 u16 intercept_cr_read; 105 u32 intercept_cr;
101 u16 intercept_cr_write; 106 u32 intercept_dr;
102 u16 intercept_dr_read;
103 u16 intercept_dr_write;
104 u32 intercept_exceptions; 107 u32 intercept_exceptions;
105 u64 intercept; 108 u64 intercept;
106 109
@@ -123,7 +126,12 @@ struct vcpu_svm {
123 u64 next_rip; 126 u64 next_rip;
124 127
125 u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS]; 128 u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
126 u64 host_gs_base; 129 struct {
130 u16 fs;
131 u16 gs;
132 u16 ldt;
133 u64 gs_base;
134 } host;
127 135
128 u32 *msrpm; 136 u32 *msrpm;
129 137
@@ -133,6 +141,7 @@ struct vcpu_svm {
133 141
134 unsigned int3_injected; 142 unsigned int3_injected;
135 unsigned long int3_rip; 143 unsigned long int3_rip;
144 u32 apf_reason;
136}; 145};
137 146
138#define MSR_INVALID 0xffffffffU 147#define MSR_INVALID 0xffffffffU
@@ -180,14 +189,151 @@ static int nested_svm_vmexit(struct vcpu_svm *svm);
180static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, 189static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
181 bool has_error_code, u32 error_code); 190 bool has_error_code, u32 error_code);
182 191
192enum {
193 VMCB_INTERCEPTS, /* Intercept vectors, TSC offset,
194 pause filter count */
195 VMCB_PERM_MAP, /* IOPM Base and MSRPM Base */
196 VMCB_ASID, /* ASID */
197 VMCB_INTR, /* int_ctl, int_vector */
198 VMCB_NPT, /* npt_en, nCR3, gPAT */
199 VMCB_CR, /* CR0, CR3, CR4, EFER */
200 VMCB_DR, /* DR6, DR7 */
201 VMCB_DT, /* GDT, IDT */
202 VMCB_SEG, /* CS, DS, SS, ES, CPL */
203 VMCB_CR2, /* CR2 only */
204 VMCB_LBR, /* DBGCTL, BR_FROM, BR_TO, LAST_EX_FROM, LAST_EX_TO */
205 VMCB_DIRTY_MAX,
206};
207
208/* TPR and CR2 are always written before VMRUN */
209#define VMCB_ALWAYS_DIRTY_MASK ((1U << VMCB_INTR) | (1U << VMCB_CR2))
210
211static inline void mark_all_dirty(struct vmcb *vmcb)
212{
213 vmcb->control.clean = 0;
214}
215
216static inline void mark_all_clean(struct vmcb *vmcb)
217{
218 vmcb->control.clean = ((1 << VMCB_DIRTY_MAX) - 1)
219 & ~VMCB_ALWAYS_DIRTY_MASK;
220}
221
222static inline void mark_dirty(struct vmcb *vmcb, int bit)
223{
224 vmcb->control.clean &= ~(1 << bit);
225}
226
183static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) 227static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
184{ 228{
185 return container_of(vcpu, struct vcpu_svm, vcpu); 229 return container_of(vcpu, struct vcpu_svm, vcpu);
186} 230}
187 231
188static inline bool is_nested(struct vcpu_svm *svm) 232static void recalc_intercepts(struct vcpu_svm *svm)
233{
234 struct vmcb_control_area *c, *h;
235 struct nested_state *g;
236
237 mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
238
239 if (!is_guest_mode(&svm->vcpu))
240 return;
241
242 c = &svm->vmcb->control;
243 h = &svm->nested.hsave->control;
244 g = &svm->nested;
245
246 c->intercept_cr = h->intercept_cr | g->intercept_cr;
247 c->intercept_dr = h->intercept_dr | g->intercept_dr;
248 c->intercept_exceptions = h->intercept_exceptions | g->intercept_exceptions;
249 c->intercept = h->intercept | g->intercept;
250}
251
252static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm)
253{
254 if (is_guest_mode(&svm->vcpu))
255 return svm->nested.hsave;
256 else
257 return svm->vmcb;
258}
259
260static inline void set_cr_intercept(struct vcpu_svm *svm, int bit)
261{
262 struct vmcb *vmcb = get_host_vmcb(svm);
263
264 vmcb->control.intercept_cr |= (1U << bit);
265
266 recalc_intercepts(svm);
267}
268
269static inline void clr_cr_intercept(struct vcpu_svm *svm, int bit)
270{
271 struct vmcb *vmcb = get_host_vmcb(svm);
272
273 vmcb->control.intercept_cr &= ~(1U << bit);
274
275 recalc_intercepts(svm);
276}
277
278static inline bool is_cr_intercept(struct vcpu_svm *svm, int bit)
279{
280 struct vmcb *vmcb = get_host_vmcb(svm);
281
282 return vmcb->control.intercept_cr & (1U << bit);
283}
284
285static inline void set_dr_intercept(struct vcpu_svm *svm, int bit)
286{
287 struct vmcb *vmcb = get_host_vmcb(svm);
288
289 vmcb->control.intercept_dr |= (1U << bit);
290
291 recalc_intercepts(svm);
292}
293
294static inline void clr_dr_intercept(struct vcpu_svm *svm, int bit)
295{
296 struct vmcb *vmcb = get_host_vmcb(svm);
297
298 vmcb->control.intercept_dr &= ~(1U << bit);
299
300 recalc_intercepts(svm);
301}
302
303static inline void set_exception_intercept(struct vcpu_svm *svm, int bit)
304{
305 struct vmcb *vmcb = get_host_vmcb(svm);
306
307 vmcb->control.intercept_exceptions |= (1U << bit);
308
309 recalc_intercepts(svm);
310}
311
312static inline void clr_exception_intercept(struct vcpu_svm *svm, int bit)
189{ 313{
190 return svm->nested.vmcb; 314 struct vmcb *vmcb = get_host_vmcb(svm);
315
316 vmcb->control.intercept_exceptions &= ~(1U << bit);
317
318 recalc_intercepts(svm);
319}
320
321static inline void set_intercept(struct vcpu_svm *svm, int bit)
322{
323 struct vmcb *vmcb = get_host_vmcb(svm);
324
325 vmcb->control.intercept |= (1ULL << bit);
326
327 recalc_intercepts(svm);
328}
329
330static inline void clr_intercept(struct vcpu_svm *svm, int bit)
331{
332 struct vmcb *vmcb = get_host_vmcb(svm);
333
334 vmcb->control.intercept &= ~(1ULL << bit);
335
336 recalc_intercepts(svm);
191} 337}
192 338
193static inline void enable_gif(struct vcpu_svm *svm) 339static inline void enable_gif(struct vcpu_svm *svm)
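The VMCB_* enum, VMCB_ALWAYS_DIRTY_MASK and the mark_*_dirty helpers introduced above implement the VMCB "clean bits" optimisation: each bit of vmcb->control.clean covers one group of VMCB fields, software clears a bit whenever it modifies that group, and the CPU may skip reloading groups whose bit is still set at VMRUN. A compact user-space model of the bookkeeping; the group names kept here are a subset and the struct is simplified.

#include <stdint.h>
#include <stdio.h>

enum {
        CLEAN_INTERCEPTS,
        CLEAN_CR,
        CLEAN_CR2,
        CLEAN_DIRTY_MAX,
};

/* CR2-style groups that are rewritten before every VMRUN anyway. */
#define ALWAYS_DIRTY_MASK (1U << CLEAN_CR2)

struct vmcb_model {
        uint32_t clean;     /* bit set => hardware may reuse its cached copy */
};

static void mark_all_dirty(struct vmcb_model *v)
{
        v->clean = 0;
}

static void mark_all_clean(struct vmcb_model *v)
{
        v->clean = ((1U << CLEAN_DIRTY_MAX) - 1) & ~ALWAYS_DIRTY_MASK;
}

static void mark_dirty(struct vmcb_model *v, int bit)
{
        v->clean &= ~(1U << bit);
}

int main(void)
{
        struct vmcb_model v;

        mark_all_dirty(&v);          /* e.g. first run on a new cpu */
        mark_all_clean(&v);          /* after a successful VMRUN */
        mark_dirty(&v, CLEAN_CR);    /* guest touched CR0: invalidate group */

        printf("clean bits: 0x%x\n", v.clean);
        return 0;
}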
@@ -264,11 +410,6 @@ static u32 svm_msrpm_offset(u32 msr)
264 410
265#define MAX_INST_SIZE 15 411#define MAX_INST_SIZE 15
266 412
267static inline u32 svm_has(u32 feat)
268{
269 return svm_features & feat;
270}
271
272static inline void clgi(void) 413static inline void clgi(void)
273{ 414{
274 asm volatile (__ex(SVM_CLGI)); 415 asm volatile (__ex(SVM_CLGI));
@@ -284,16 +425,6 @@ static inline void invlpga(unsigned long addr, u32 asid)
284 asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid)); 425 asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid));
285} 426}
286 427
287static inline void force_new_asid(struct kvm_vcpu *vcpu)
288{
289 to_svm(vcpu)->asid_generation--;
290}
291
292static inline void flush_guest_tlb(struct kvm_vcpu *vcpu)
293{
294 force_new_asid(vcpu);
295}
296
297static int get_npt_level(void) 428static int get_npt_level(void)
298{ 429{
299#ifdef CONFIG_X86_64 430#ifdef CONFIG_X86_64
@@ -310,6 +441,7 @@ static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
310 efer &= ~EFER_LME; 441 efer &= ~EFER_LME;
311 442
312 to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME; 443 to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME;
444 mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
313} 445}
314 446
315static int is_external_interrupt(u32 info) 447static int is_external_interrupt(u32 info)
@@ -347,7 +479,7 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
347 svm->next_rip = svm->vmcb->control.next_rip; 479 svm->next_rip = svm->vmcb->control.next_rip;
348 480
349 if (!svm->next_rip) { 481 if (!svm->next_rip) {
350 if (emulate_instruction(vcpu, 0, 0, EMULTYPE_SKIP) != 482 if (emulate_instruction(vcpu, EMULTYPE_SKIP) !=
351 EMULATE_DONE) 483 EMULATE_DONE)
352 printk(KERN_DEBUG "%s: NOP\n", __func__); 484 printk(KERN_DEBUG "%s: NOP\n", __func__);
353 return; 485 return;
@@ -374,7 +506,7 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
374 nested_svm_check_exception(svm, nr, has_error_code, error_code)) 506 nested_svm_check_exception(svm, nr, has_error_code, error_code))
375 return; 507 return;
376 508
377 if (nr == BP_VECTOR && !svm_has(SVM_FEATURE_NRIP)) { 509 if (nr == BP_VECTOR && !static_cpu_has(X86_FEATURE_NRIPS)) {
378 unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu); 510 unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu);
379 511
380 /* 512 /*
@@ -670,7 +802,7 @@ static __init int svm_hardware_setup(void)
670 802
671 svm_features = cpuid_edx(SVM_CPUID_FUNC); 803 svm_features = cpuid_edx(SVM_CPUID_FUNC);
672 804
673 if (!svm_has(SVM_FEATURE_NPT)) 805 if (!boot_cpu_has(X86_FEATURE_NPT))
674 npt_enabled = false; 806 npt_enabled = false;
675 807
676 if (npt_enabled && !npt) { 808 if (npt_enabled && !npt) {
@@ -725,13 +857,15 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
725 struct vcpu_svm *svm = to_svm(vcpu); 857 struct vcpu_svm *svm = to_svm(vcpu);
726 u64 g_tsc_offset = 0; 858 u64 g_tsc_offset = 0;
727 859
728 if (is_nested(svm)) { 860 if (is_guest_mode(vcpu)) {
729 g_tsc_offset = svm->vmcb->control.tsc_offset - 861 g_tsc_offset = svm->vmcb->control.tsc_offset -
730 svm->nested.hsave->control.tsc_offset; 862 svm->nested.hsave->control.tsc_offset;
731 svm->nested.hsave->control.tsc_offset = offset; 863 svm->nested.hsave->control.tsc_offset = offset;
732 } 864 }
733 865
734 svm->vmcb->control.tsc_offset = offset + g_tsc_offset; 866 svm->vmcb->control.tsc_offset = offset + g_tsc_offset;
867
868 mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
735} 869}
736 870
737static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment) 871static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
@@ -739,8 +873,9 @@ static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
739 struct vcpu_svm *svm = to_svm(vcpu); 873 struct vcpu_svm *svm = to_svm(vcpu);
740 874
741 svm->vmcb->control.tsc_offset += adjustment; 875 svm->vmcb->control.tsc_offset += adjustment;
742 if (is_nested(svm)) 876 if (is_guest_mode(vcpu))
743 svm->nested.hsave->control.tsc_offset += adjustment; 877 svm->nested.hsave->control.tsc_offset += adjustment;
878 mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
744} 879}
745 880
746static void init_vmcb(struct vcpu_svm *svm) 881static void init_vmcb(struct vcpu_svm *svm)
@@ -749,62 +884,62 @@ static void init_vmcb(struct vcpu_svm *svm)
749 struct vmcb_save_area *save = &svm->vmcb->save; 884 struct vmcb_save_area *save = &svm->vmcb->save;
750 885
751 svm->vcpu.fpu_active = 1; 886 svm->vcpu.fpu_active = 1;
887 svm->vcpu.arch.hflags = 0;
752 888
753 control->intercept_cr_read = INTERCEPT_CR0_MASK | 889 set_cr_intercept(svm, INTERCEPT_CR0_READ);
754 INTERCEPT_CR3_MASK | 890 set_cr_intercept(svm, INTERCEPT_CR3_READ);
755 INTERCEPT_CR4_MASK; 891 set_cr_intercept(svm, INTERCEPT_CR4_READ);
756 892 set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
757 control->intercept_cr_write = INTERCEPT_CR0_MASK | 893 set_cr_intercept(svm, INTERCEPT_CR3_WRITE);
758 INTERCEPT_CR3_MASK | 894 set_cr_intercept(svm, INTERCEPT_CR4_WRITE);
759 INTERCEPT_CR4_MASK | 895 set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
760 INTERCEPT_CR8_MASK; 896
761 897 set_dr_intercept(svm, INTERCEPT_DR0_READ);
762 control->intercept_dr_read = INTERCEPT_DR0_MASK | 898 set_dr_intercept(svm, INTERCEPT_DR1_READ);
763 INTERCEPT_DR1_MASK | 899 set_dr_intercept(svm, INTERCEPT_DR2_READ);
764 INTERCEPT_DR2_MASK | 900 set_dr_intercept(svm, INTERCEPT_DR3_READ);
765 INTERCEPT_DR3_MASK | 901 set_dr_intercept(svm, INTERCEPT_DR4_READ);
766 INTERCEPT_DR4_MASK | 902 set_dr_intercept(svm, INTERCEPT_DR5_READ);
767 INTERCEPT_DR5_MASK | 903 set_dr_intercept(svm, INTERCEPT_DR6_READ);
768 INTERCEPT_DR6_MASK | 904 set_dr_intercept(svm, INTERCEPT_DR7_READ);
769 INTERCEPT_DR7_MASK; 905
770 906 set_dr_intercept(svm, INTERCEPT_DR0_WRITE);
771 control->intercept_dr_write = INTERCEPT_DR0_MASK | 907 set_dr_intercept(svm, INTERCEPT_DR1_WRITE);
772 INTERCEPT_DR1_MASK | 908 set_dr_intercept(svm, INTERCEPT_DR2_WRITE);
773 INTERCEPT_DR2_MASK | 909 set_dr_intercept(svm, INTERCEPT_DR3_WRITE);
774 INTERCEPT_DR3_MASK | 910 set_dr_intercept(svm, INTERCEPT_DR4_WRITE);
775 INTERCEPT_DR4_MASK | 911 set_dr_intercept(svm, INTERCEPT_DR5_WRITE);
776 INTERCEPT_DR5_MASK | 912 set_dr_intercept(svm, INTERCEPT_DR6_WRITE);
777 INTERCEPT_DR6_MASK | 913 set_dr_intercept(svm, INTERCEPT_DR7_WRITE);
778 INTERCEPT_DR7_MASK; 914
779 915 set_exception_intercept(svm, PF_VECTOR);
780 control->intercept_exceptions = (1 << PF_VECTOR) | 916 set_exception_intercept(svm, UD_VECTOR);
781 (1 << UD_VECTOR) | 917 set_exception_intercept(svm, MC_VECTOR);
782 (1 << MC_VECTOR); 918
783 919 set_intercept(svm, INTERCEPT_INTR);
784 920 set_intercept(svm, INTERCEPT_NMI);
785 control->intercept = (1ULL << INTERCEPT_INTR) | 921 set_intercept(svm, INTERCEPT_SMI);
786 (1ULL << INTERCEPT_NMI) | 922 set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
787 (1ULL << INTERCEPT_SMI) | 923 set_intercept(svm, INTERCEPT_CPUID);
788 (1ULL << INTERCEPT_SELECTIVE_CR0) | 924 set_intercept(svm, INTERCEPT_INVD);
789 (1ULL << INTERCEPT_CPUID) | 925 set_intercept(svm, INTERCEPT_HLT);
790 (1ULL << INTERCEPT_INVD) | 926 set_intercept(svm, INTERCEPT_INVLPG);
791 (1ULL << INTERCEPT_HLT) | 927 set_intercept(svm, INTERCEPT_INVLPGA);
792 (1ULL << INTERCEPT_INVLPG) | 928 set_intercept(svm, INTERCEPT_IOIO_PROT);
793 (1ULL << INTERCEPT_INVLPGA) | 929 set_intercept(svm, INTERCEPT_MSR_PROT);
794 (1ULL << INTERCEPT_IOIO_PROT) | 930 set_intercept(svm, INTERCEPT_TASK_SWITCH);
795 (1ULL << INTERCEPT_MSR_PROT) | 931 set_intercept(svm, INTERCEPT_SHUTDOWN);
796 (1ULL << INTERCEPT_TASK_SWITCH) | 932 set_intercept(svm, INTERCEPT_VMRUN);
797 (1ULL << INTERCEPT_SHUTDOWN) | 933 set_intercept(svm, INTERCEPT_VMMCALL);
798 (1ULL << INTERCEPT_VMRUN) | 934 set_intercept(svm, INTERCEPT_VMLOAD);
799 (1ULL << INTERCEPT_VMMCALL) | 935 set_intercept(svm, INTERCEPT_VMSAVE);
800 (1ULL << INTERCEPT_VMLOAD) | 936 set_intercept(svm, INTERCEPT_STGI);
801 (1ULL << INTERCEPT_VMSAVE) | 937 set_intercept(svm, INTERCEPT_CLGI);
802 (1ULL << INTERCEPT_STGI) | 938 set_intercept(svm, INTERCEPT_SKINIT);
803 (1ULL << INTERCEPT_CLGI) | 939 set_intercept(svm, INTERCEPT_WBINVD);
804 (1ULL << INTERCEPT_SKINIT) | 940 set_intercept(svm, INTERCEPT_MONITOR);
805 (1ULL << INTERCEPT_WBINVD) | 941 set_intercept(svm, INTERCEPT_MWAIT);
806 (1ULL << INTERCEPT_MONITOR) | 942 set_intercept(svm, INTERCEPT_XSETBV);
807 (1ULL << INTERCEPT_MWAIT);
808 943
809 control->iopm_base_pa = iopm_base; 944 control->iopm_base_pa = iopm_base;
810 control->msrpm_base_pa = __pa(svm->msrpm); 945 control->msrpm_base_pa = __pa(svm->msrpm);
@@ -855,25 +990,27 @@ static void init_vmcb(struct vcpu_svm *svm)
855 if (npt_enabled) { 990 if (npt_enabled) {
856 /* Setup VMCB for Nested Paging */ 991 /* Setup VMCB for Nested Paging */
857 control->nested_ctl = 1; 992 control->nested_ctl = 1;
858 control->intercept &= ~((1ULL << INTERCEPT_TASK_SWITCH) | 993 clr_intercept(svm, INTERCEPT_TASK_SWITCH);
859 (1ULL << INTERCEPT_INVLPG)); 994 clr_intercept(svm, INTERCEPT_INVLPG);
860 control->intercept_exceptions &= ~(1 << PF_VECTOR); 995 clr_exception_intercept(svm, PF_VECTOR);
861 control->intercept_cr_read &= ~INTERCEPT_CR3_MASK; 996 clr_cr_intercept(svm, INTERCEPT_CR3_READ);
862 control->intercept_cr_write &= ~INTERCEPT_CR3_MASK; 997 clr_cr_intercept(svm, INTERCEPT_CR3_WRITE);
863 save->g_pat = 0x0007040600070406ULL; 998 save->g_pat = 0x0007040600070406ULL;
864 save->cr3 = 0; 999 save->cr3 = 0;
865 save->cr4 = 0; 1000 save->cr4 = 0;
866 } 1001 }
867 force_new_asid(&svm->vcpu); 1002 svm->asid_generation = 0;
868 1003
869 svm->nested.vmcb = 0; 1004 svm->nested.vmcb = 0;
870 svm->vcpu.arch.hflags = 0; 1005 svm->vcpu.arch.hflags = 0;
871 1006
872 if (svm_has(SVM_FEATURE_PAUSE_FILTER)) { 1007 if (boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
873 control->pause_filter_count = 3000; 1008 control->pause_filter_count = 3000;
874 control->intercept |= (1ULL << INTERCEPT_PAUSE); 1009 set_intercept(svm, INTERCEPT_PAUSE);
875 } 1010 }
876 1011
1012 mark_all_dirty(svm->vmcb);
1013
877 enable_gif(svm); 1014 enable_gif(svm);
878} 1015}
879 1016
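The set_intercept()/clr_intercept() calls above (and their cr/dr/exception variants) replace the open-coded mask arithmetic on the left; the helpers themselves are defined earlier in svm.c and are not part of the hunks shown here. A minimal sketch of the pattern, assuming — as the later hunks suggest — that the helpers edit the VMCB returned by get_host_vmcb() and then fold the nested guest's intercepts back in via recalc_intercepts():

static inline void set_intercept(struct vcpu_svm *svm, int bit)
{
	struct vmcb *vmcb = get_host_vmcb(svm);	/* L1 VMCB, or hsave while in guest mode */

	vmcb->control.intercept |= (1ULL << bit);
	recalc_intercepts(svm);			/* re-merge guest and host intercepts */
}

static inline void clr_intercept(struct vcpu_svm *svm, int bit)
{
	struct vmcb *vmcb = get_host_vmcb(svm);

	vmcb->control.intercept &= ~(1ULL << bit);
	recalc_intercepts(svm);
}

The CR, DR and exception variants follow the same shape on intercept_cr, intercept_dr and intercept_exceptions, which is what lets init_vmcb() drop the long mask expressions.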
@@ -990,8 +1127,16 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
990 1127
991 if (unlikely(cpu != vcpu->cpu)) { 1128 if (unlikely(cpu != vcpu->cpu)) {
992 svm->asid_generation = 0; 1129 svm->asid_generation = 0;
1130 mark_all_dirty(svm->vmcb);
993 } 1131 }
994 1132
1133#ifdef CONFIG_X86_64
1134 rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host.gs_base);
1135#endif
1136 savesegment(fs, svm->host.fs);
1137 savesegment(gs, svm->host.gs);
1138 svm->host.ldt = kvm_read_ldt();
1139
995 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) 1140 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
996 rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); 1141 rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
997} 1142}
@@ -1002,6 +1147,14 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
1002 int i; 1147 int i;
1003 1148
1004 ++vcpu->stat.host_state_reload; 1149 ++vcpu->stat.host_state_reload;
1150 kvm_load_ldt(svm->host.ldt);
1151#ifdef CONFIG_X86_64
1152 loadsegment(fs, svm->host.fs);
1153 load_gs_index(svm->host.gs);
1154 wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs);
1155#else
1156 loadsegment(gs, svm->host.gs);
1157#endif
1005 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) 1158 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
1006 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); 1159 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
1007} 1160}
@@ -1021,7 +1174,7 @@ static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
1021 switch (reg) { 1174 switch (reg) {
1022 case VCPU_EXREG_PDPTR: 1175 case VCPU_EXREG_PDPTR:
1023 BUG_ON(!npt_enabled); 1176 BUG_ON(!npt_enabled);
1024 load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3); 1177 load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
1025 break; 1178 break;
1026 default: 1179 default:
1027 BUG(); 1180 BUG();
@@ -1030,12 +1183,12 @@ static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
1030 1183
1031static void svm_set_vintr(struct vcpu_svm *svm) 1184static void svm_set_vintr(struct vcpu_svm *svm)
1032{ 1185{
1033 svm->vmcb->control.intercept |= 1ULL << INTERCEPT_VINTR; 1186 set_intercept(svm, INTERCEPT_VINTR);
1034} 1187}
1035 1188
1036static void svm_clear_vintr(struct vcpu_svm *svm) 1189static void svm_clear_vintr(struct vcpu_svm *svm)
1037{ 1190{
1038 svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VINTR); 1191 clr_intercept(svm, INTERCEPT_VINTR);
1039} 1192}
1040 1193
1041static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg) 1194static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
@@ -1150,6 +1303,7 @@ static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1150 1303
1151 svm->vmcb->save.idtr.limit = dt->size; 1304 svm->vmcb->save.idtr.limit = dt->size;
1152 svm->vmcb->save.idtr.base = dt->address ; 1305 svm->vmcb->save.idtr.base = dt->address ;
1306 mark_dirty(svm->vmcb, VMCB_DT);
1153} 1307}
1154 1308
1155static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 1309static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
@@ -1166,19 +1320,23 @@ static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1166 1320
1167 svm->vmcb->save.gdtr.limit = dt->size; 1321 svm->vmcb->save.gdtr.limit = dt->size;
1168 svm->vmcb->save.gdtr.base = dt->address ; 1322 svm->vmcb->save.gdtr.base = dt->address ;
1323 mark_dirty(svm->vmcb, VMCB_DT);
1169} 1324}
1170 1325
1171static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) 1326static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
1172{ 1327{
1173} 1328}
1174 1329
1330static void svm_decache_cr3(struct kvm_vcpu *vcpu)
1331{
1332}
1333
1175static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) 1334static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
1176{ 1335{
1177} 1336}
1178 1337
1179static void update_cr0_intercept(struct vcpu_svm *svm) 1338static void update_cr0_intercept(struct vcpu_svm *svm)
1180{ 1339{
1181 struct vmcb *vmcb = svm->vmcb;
1182 ulong gcr0 = svm->vcpu.arch.cr0; 1340 ulong gcr0 = svm->vcpu.arch.cr0;
1183 u64 *hcr0 = &svm->vmcb->save.cr0; 1341 u64 *hcr0 = &svm->vmcb->save.cr0;
1184 1342
@@ -1188,27 +1346,14 @@ static void update_cr0_intercept(struct vcpu_svm *svm)
1188 *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK) 1346 *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK)
1189 | (gcr0 & SVM_CR0_SELECTIVE_MASK); 1347 | (gcr0 & SVM_CR0_SELECTIVE_MASK);
1190 1348
1349 mark_dirty(svm->vmcb, VMCB_CR);
1191 1350
1192 if (gcr0 == *hcr0 && svm->vcpu.fpu_active) { 1351 if (gcr0 == *hcr0 && svm->vcpu.fpu_active) {
1193 vmcb->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK; 1352 clr_cr_intercept(svm, INTERCEPT_CR0_READ);
1194 vmcb->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK; 1353 clr_cr_intercept(svm, INTERCEPT_CR0_WRITE);
1195 if (is_nested(svm)) {
1196 struct vmcb *hsave = svm->nested.hsave;
1197
1198 hsave->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK;
1199 hsave->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK;
1200 vmcb->control.intercept_cr_read |= svm->nested.intercept_cr_read;
1201 vmcb->control.intercept_cr_write |= svm->nested.intercept_cr_write;
1202 }
1203 } else { 1354 } else {
1204 svm->vmcb->control.intercept_cr_read |= INTERCEPT_CR0_MASK; 1355 set_cr_intercept(svm, INTERCEPT_CR0_READ);
1205 svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR0_MASK; 1356 set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
1206 if (is_nested(svm)) {
1207 struct vmcb *hsave = svm->nested.hsave;
1208
1209 hsave->control.intercept_cr_read |= INTERCEPT_CR0_MASK;
1210 hsave->control.intercept_cr_write |= INTERCEPT_CR0_MASK;
1211 }
1212 } 1357 }
1213} 1358}
1214 1359
@@ -1216,7 +1361,7 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1216{ 1361{
1217 struct vcpu_svm *svm = to_svm(vcpu); 1362 struct vcpu_svm *svm = to_svm(vcpu);
1218 1363
1219 if (is_nested(svm)) { 1364 if (is_guest_mode(vcpu)) {
1220 /* 1365 /*
1221 * We are here because we run in nested mode, the host kvm 1366 * We are here because we run in nested mode, the host kvm
1222 * intercepts cr0 writes but the l1 hypervisor does not. 1367 * intercepts cr0 writes but the l1 hypervisor does not.
@@ -1268,6 +1413,7 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1268 */ 1413 */
1269 cr0 &= ~(X86_CR0_CD | X86_CR0_NW); 1414 cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
1270 svm->vmcb->save.cr0 = cr0; 1415 svm->vmcb->save.cr0 = cr0;
1416 mark_dirty(svm->vmcb, VMCB_CR);
1271 update_cr0_intercept(svm); 1417 update_cr0_intercept(svm);
1272} 1418}
1273 1419
@@ -1277,13 +1423,14 @@ static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1277 unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4; 1423 unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4;
1278 1424
1279 if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE)) 1425 if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
1280 force_new_asid(vcpu); 1426 svm_flush_tlb(vcpu);
1281 1427
1282 vcpu->arch.cr4 = cr4; 1428 vcpu->arch.cr4 = cr4;
1283 if (!npt_enabled) 1429 if (!npt_enabled)
1284 cr4 |= X86_CR4_PAE; 1430 cr4 |= X86_CR4_PAE;
1285 cr4 |= host_cr4_mce; 1431 cr4 |= host_cr4_mce;
1286 to_svm(vcpu)->vmcb->save.cr4 = cr4; 1432 to_svm(vcpu)->vmcb->save.cr4 = cr4;
1433 mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
1287} 1434}
1288 1435
1289static void svm_set_segment(struct kvm_vcpu *vcpu, 1436static void svm_set_segment(struct kvm_vcpu *vcpu,
@@ -1312,26 +1459,25 @@ static void svm_set_segment(struct kvm_vcpu *vcpu,
1312 = (svm->vmcb->save.cs.attrib 1459 = (svm->vmcb->save.cs.attrib
1313 >> SVM_SELECTOR_DPL_SHIFT) & 3; 1460 >> SVM_SELECTOR_DPL_SHIFT) & 3;
1314 1461
1462 mark_dirty(svm->vmcb, VMCB_SEG);
1315} 1463}
1316 1464
1317static void update_db_intercept(struct kvm_vcpu *vcpu) 1465static void update_db_intercept(struct kvm_vcpu *vcpu)
1318{ 1466{
1319 struct vcpu_svm *svm = to_svm(vcpu); 1467 struct vcpu_svm *svm = to_svm(vcpu);
1320 1468
1321 svm->vmcb->control.intercept_exceptions &= 1469 clr_exception_intercept(svm, DB_VECTOR);
1322 ~((1 << DB_VECTOR) | (1 << BP_VECTOR)); 1470 clr_exception_intercept(svm, BP_VECTOR);
1323 1471
1324 if (svm->nmi_singlestep) 1472 if (svm->nmi_singlestep)
1325 svm->vmcb->control.intercept_exceptions |= (1 << DB_VECTOR); 1473 set_exception_intercept(svm, DB_VECTOR);
1326 1474
1327 if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { 1475 if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
1328 if (vcpu->guest_debug & 1476 if (vcpu->guest_debug &
1329 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) 1477 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
1330 svm->vmcb->control.intercept_exceptions |= 1478 set_exception_intercept(svm, DB_VECTOR);
1331 1 << DB_VECTOR;
1332 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) 1479 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
1333 svm->vmcb->control.intercept_exceptions |= 1480 set_exception_intercept(svm, BP_VECTOR);
1334 1 << BP_VECTOR;
1335 } else 1481 } else
1336 vcpu->guest_debug = 0; 1482 vcpu->guest_debug = 0;
1337} 1483}
@@ -1345,21 +1491,9 @@ static void svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
1345 else 1491 else
1346 svm->vmcb->save.dr7 = vcpu->arch.dr7; 1492 svm->vmcb->save.dr7 = vcpu->arch.dr7;
1347 1493
1348 update_db_intercept(vcpu); 1494 mark_dirty(svm->vmcb, VMCB_DR);
1349}
1350
1351static void load_host_msrs(struct kvm_vcpu *vcpu)
1352{
1353#ifdef CONFIG_X86_64
1354 wrmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base);
1355#endif
1356}
1357 1495
1358static void save_host_msrs(struct kvm_vcpu *vcpu) 1496 update_db_intercept(vcpu);
1359{
1360#ifdef CONFIG_X86_64
1361 rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base);
1362#endif
1363} 1497}
1364 1498
1365static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) 1499static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
@@ -1372,6 +1506,8 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
1372 1506
1373 svm->asid_generation = sd->asid_generation; 1507 svm->asid_generation = sd->asid_generation;
1374 svm->vmcb->control.asid = sd->next_asid++; 1508 svm->vmcb->control.asid = sd->next_asid++;
1509
1510 mark_dirty(svm->vmcb, VMCB_ASID);
1375} 1511}
1376 1512
1377static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value) 1513static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
@@ -1379,20 +1515,40 @@ static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
1379 struct vcpu_svm *svm = to_svm(vcpu); 1515 struct vcpu_svm *svm = to_svm(vcpu);
1380 1516
1381 svm->vmcb->save.dr7 = value; 1517 svm->vmcb->save.dr7 = value;
1518 mark_dirty(svm->vmcb, VMCB_DR);
1382} 1519}
1383 1520
1384static int pf_interception(struct vcpu_svm *svm) 1521static int pf_interception(struct vcpu_svm *svm)
1385{ 1522{
1386 u64 fault_address; 1523 u64 fault_address = svm->vmcb->control.exit_info_2;
1387 u32 error_code; 1524 u32 error_code;
1525 int r = 1;
1388 1526
1389 fault_address = svm->vmcb->control.exit_info_2; 1527 switch (svm->apf_reason) {
1390 error_code = svm->vmcb->control.exit_info_1; 1528 default:
1391 1529 error_code = svm->vmcb->control.exit_info_1;
1392 trace_kvm_page_fault(fault_address, error_code); 1530
1393 if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu)) 1531 trace_kvm_page_fault(fault_address, error_code);
1394 kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); 1532 if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu))
1395 return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); 1533 kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
1534 r = kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code,
1535 svm->vmcb->control.insn_bytes,
1536 svm->vmcb->control.insn_len);
1537 break;
1538 case KVM_PV_REASON_PAGE_NOT_PRESENT:
1539 svm->apf_reason = 0;
1540 local_irq_disable();
1541 kvm_async_pf_task_wait(fault_address);
1542 local_irq_enable();
1543 break;
1544 case KVM_PV_REASON_PAGE_READY:
1545 svm->apf_reason = 0;
1546 local_irq_disable();
1547 kvm_async_pf_task_wake(fault_address);
1548 local_irq_enable();
1549 break;
1550 }
1551 return r;
1396} 1552}
1397 1553
1398static int db_interception(struct vcpu_svm *svm) 1554static int db_interception(struct vcpu_svm *svm)
@@ -1440,7 +1596,7 @@ static int ud_interception(struct vcpu_svm *svm)
1440{ 1596{
1441 int er; 1597 int er;
1442 1598
1443 er = emulate_instruction(&svm->vcpu, 0, 0, EMULTYPE_TRAP_UD); 1599 er = emulate_instruction(&svm->vcpu, EMULTYPE_TRAP_UD);
1444 if (er != EMULATE_DONE) 1600 if (er != EMULATE_DONE)
1445 kvm_queue_exception(&svm->vcpu, UD_VECTOR); 1601 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
1446 return 1; 1602 return 1;
@@ -1449,21 +1605,8 @@ static int ud_interception(struct vcpu_svm *svm)
1449static void svm_fpu_activate(struct kvm_vcpu *vcpu) 1605static void svm_fpu_activate(struct kvm_vcpu *vcpu)
1450{ 1606{
1451 struct vcpu_svm *svm = to_svm(vcpu); 1607 struct vcpu_svm *svm = to_svm(vcpu);
1452 u32 excp;
1453
1454 if (is_nested(svm)) {
1455 u32 h_excp, n_excp;
1456
1457 h_excp = svm->nested.hsave->control.intercept_exceptions;
1458 n_excp = svm->nested.intercept_exceptions;
1459 h_excp &= ~(1 << NM_VECTOR);
1460 excp = h_excp | n_excp;
1461 } else {
1462 excp = svm->vmcb->control.intercept_exceptions;
1463 excp &= ~(1 << NM_VECTOR);
1464 }
1465 1608
1466 svm->vmcb->control.intercept_exceptions = excp; 1609 clr_exception_intercept(svm, NM_VECTOR);
1467 1610
1468 svm->vcpu.fpu_active = 1; 1611 svm->vcpu.fpu_active = 1;
1469 update_cr0_intercept(svm); 1612 update_cr0_intercept(svm);
@@ -1570,7 +1713,7 @@ static int io_interception(struct vcpu_svm *svm)
1570 string = (io_info & SVM_IOIO_STR_MASK) != 0; 1713 string = (io_info & SVM_IOIO_STR_MASK) != 0;
1571 in = (io_info & SVM_IOIO_TYPE_MASK) != 0; 1714 in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
1572 if (string || in) 1715 if (string || in)
1573 return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE; 1716 return emulate_instruction(vcpu, 0) == EMULATE_DONE;
1574 1717
1575 port = io_info >> 16; 1718 port = io_info >> 16;
1576 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; 1719 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
@@ -1624,17 +1767,19 @@ static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu,
1624 struct vcpu_svm *svm = to_svm(vcpu); 1767 struct vcpu_svm *svm = to_svm(vcpu);
1625 1768
1626 svm->vmcb->control.nested_cr3 = root; 1769 svm->vmcb->control.nested_cr3 = root;
1627 force_new_asid(vcpu); 1770 mark_dirty(svm->vmcb, VMCB_NPT);
1771 svm_flush_tlb(vcpu);
1628} 1772}
1629 1773
1630static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu) 1774static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
1775 struct x86_exception *fault)
1631{ 1776{
1632 struct vcpu_svm *svm = to_svm(vcpu); 1777 struct vcpu_svm *svm = to_svm(vcpu);
1633 1778
1634 svm->vmcb->control.exit_code = SVM_EXIT_NPF; 1779 svm->vmcb->control.exit_code = SVM_EXIT_NPF;
1635 svm->vmcb->control.exit_code_hi = 0; 1780 svm->vmcb->control.exit_code_hi = 0;
1636 svm->vmcb->control.exit_info_1 = vcpu->arch.fault.error_code; 1781 svm->vmcb->control.exit_info_1 = fault->error_code;
1637 svm->vmcb->control.exit_info_2 = vcpu->arch.fault.address; 1782 svm->vmcb->control.exit_info_2 = fault->address;
1638 1783
1639 nested_svm_vmexit(svm); 1784 nested_svm_vmexit(svm);
1640} 1785}
@@ -1680,7 +1825,7 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
1680{ 1825{
1681 int vmexit; 1826 int vmexit;
1682 1827
1683 if (!is_nested(svm)) 1828 if (!is_guest_mode(&svm->vcpu))
1684 return 0; 1829 return 0;
1685 1830
1686 svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr; 1831 svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
@@ -1698,7 +1843,7 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
1698/* This function returns true if it is safe to enable the irq window */ 1843
1699static inline bool nested_svm_intr(struct vcpu_svm *svm) 1844static inline bool nested_svm_intr(struct vcpu_svm *svm)
1700{ 1845{
1701 if (!is_nested(svm)) 1846 if (!is_guest_mode(&svm->vcpu))
1702 return true; 1847 return true;
1703 1848
1704 if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK)) 1849 if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
@@ -1737,7 +1882,7 @@ static inline bool nested_svm_intr(struct vcpu_svm *svm)
1737/* This function returns true if it is safe to enable the nmi window */ 1882/* This function returns true if it is safe to enable the nmi window */
1738static inline bool nested_svm_nmi(struct vcpu_svm *svm) 1883static inline bool nested_svm_nmi(struct vcpu_svm *svm)
1739{ 1884{
1740 if (!is_nested(svm)) 1885 if (!is_guest_mode(&svm->vcpu))
1741 return true; 1886 return true;
1742 1887
1743 if (!(svm->nested.intercept & (1ULL << INTERCEPT_NMI))) 1888 if (!(svm->nested.intercept & (1ULL << INTERCEPT_NMI)))
@@ -1836,8 +1981,8 @@ static int nested_svm_exit_special(struct vcpu_svm *svm)
1836 return NESTED_EXIT_HOST; 1981 return NESTED_EXIT_HOST;
1837 break; 1982 break;
1838 case SVM_EXIT_EXCP_BASE + PF_VECTOR: 1983 case SVM_EXIT_EXCP_BASE + PF_VECTOR:
1839 /* When we're shadowing, trap PFs */ 1984 /* When we're shadowing, trap PFs, but not async PF */
1840 if (!npt_enabled) 1985 if (!npt_enabled && svm->apf_reason == 0)
1841 return NESTED_EXIT_HOST; 1986 return NESTED_EXIT_HOST;
1842 break; 1987 break;
1843 case SVM_EXIT_EXCP_BASE + NM_VECTOR: 1988 case SVM_EXIT_EXCP_BASE + NM_VECTOR:
@@ -1865,27 +2010,15 @@ static int nested_svm_intercept(struct vcpu_svm *svm)
1865 case SVM_EXIT_IOIO: 2010 case SVM_EXIT_IOIO:
1866 vmexit = nested_svm_intercept_ioio(svm); 2011 vmexit = nested_svm_intercept_ioio(svm);
1867 break; 2012 break;
1868 case SVM_EXIT_READ_CR0 ... SVM_EXIT_READ_CR8: { 2013 case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: {
1869 u32 cr_bits = 1 << (exit_code - SVM_EXIT_READ_CR0); 2014 u32 bit = 1U << (exit_code - SVM_EXIT_READ_CR0);
1870 if (svm->nested.intercept_cr_read & cr_bits) 2015 if (svm->nested.intercept_cr & bit)
1871 vmexit = NESTED_EXIT_DONE; 2016 vmexit = NESTED_EXIT_DONE;
1872 break; 2017 break;
1873 } 2018 }
1874 case SVM_EXIT_WRITE_CR0 ... SVM_EXIT_WRITE_CR8: { 2019 case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: {
1875 u32 cr_bits = 1 << (exit_code - SVM_EXIT_WRITE_CR0); 2020 u32 bit = 1U << (exit_code - SVM_EXIT_READ_DR0);
1876 if (svm->nested.intercept_cr_write & cr_bits) 2021 if (svm->nested.intercept_dr & bit)
1877 vmexit = NESTED_EXIT_DONE;
1878 break;
1879 }
1880 case SVM_EXIT_READ_DR0 ... SVM_EXIT_READ_DR7: {
1881 u32 dr_bits = 1 << (exit_code - SVM_EXIT_READ_DR0);
1882 if (svm->nested.intercept_dr_read & dr_bits)
1883 vmexit = NESTED_EXIT_DONE;
1884 break;
1885 }
1886 case SVM_EXIT_WRITE_DR0 ... SVM_EXIT_WRITE_DR7: {
1887 u32 dr_bits = 1 << (exit_code - SVM_EXIT_WRITE_DR0);
1888 if (svm->nested.intercept_dr_write & dr_bits)
1889 vmexit = NESTED_EXIT_DONE; 2022 vmexit = NESTED_EXIT_DONE;
1890 break; 2023 break;
1891 } 2024 }
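The collapsed range cases above work because the patch merges the separate read/write vectors into single 32-bit fields, intercept_cr and intercept_dr, with reads in bits 0-15 and writes in bits 16-31; SVM_EXIT_WRITE_CR0 sits exactly 16 exit codes above SVM_EXIT_READ_CR0, so one shift covers both halves (the trace_kvm_nested_intercepts and dump_vmcb hunks split the field the same way with & 0xffff and >> 16). A small sketch of the mapping, assuming the standard SVM exit-code layout:

/* Sketch: exit code -> bit in the merged 32-bit CR intercept vector. */
static u32 cr_intercept_bit(u32 exit_code)
{
	/* reads:  SVM_EXIT_READ_CR0  + n -> bit n        (0..15)
	 * writes: SVM_EXIT_WRITE_CR0 + n -> bit n + 16   (16..31) */
	return 1U << (exit_code - SVM_EXIT_READ_CR0);
}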
@@ -1893,6 +2026,10 @@ static int nested_svm_intercept(struct vcpu_svm *svm)
1893 u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE); 2026 u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
1894 if (svm->nested.intercept_exceptions & excp_bits) 2027 if (svm->nested.intercept_exceptions & excp_bits)
1895 vmexit = NESTED_EXIT_DONE; 2028 vmexit = NESTED_EXIT_DONE;
 2029 /* async page fault always causes a vmexit */
2030 else if ((exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) &&
2031 svm->apf_reason != 0)
2032 vmexit = NESTED_EXIT_DONE;
1896 break; 2033 break;
1897 } 2034 }
1898 case SVM_EXIT_ERR: { 2035 case SVM_EXIT_ERR: {
@@ -1926,10 +2063,8 @@ static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *fr
1926 struct vmcb_control_area *dst = &dst_vmcb->control; 2063 struct vmcb_control_area *dst = &dst_vmcb->control;
1927 struct vmcb_control_area *from = &from_vmcb->control; 2064 struct vmcb_control_area *from = &from_vmcb->control;
1928 2065
1929 dst->intercept_cr_read = from->intercept_cr_read; 2066 dst->intercept_cr = from->intercept_cr;
1930 dst->intercept_cr_write = from->intercept_cr_write; 2067 dst->intercept_dr = from->intercept_dr;
1931 dst->intercept_dr_read = from->intercept_dr_read;
1932 dst->intercept_dr_write = from->intercept_dr_write;
1933 dst->intercept_exceptions = from->intercept_exceptions; 2068 dst->intercept_exceptions = from->intercept_exceptions;
1934 dst->intercept = from->intercept; 2069 dst->intercept = from->intercept;
1935 dst->iopm_base_pa = from->iopm_base_pa; 2070 dst->iopm_base_pa = from->iopm_base_pa;
@@ -1970,7 +2105,8 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1970 if (!nested_vmcb) 2105 if (!nested_vmcb)
1971 return 1; 2106 return 1;
1972 2107
1973 /* Exit nested SVM mode */ 2108 /* Exit Guest-Mode */
2109 leave_guest_mode(&svm->vcpu);
1974 svm->nested.vmcb = 0; 2110 svm->nested.vmcb = 0;
1975 2111
1976 /* Give the current vmcb to the guest */ 2112 /* Give the current vmcb to the guest */
@@ -1984,7 +2120,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1984 nested_vmcb->save.idtr = vmcb->save.idtr; 2120 nested_vmcb->save.idtr = vmcb->save.idtr;
1985 nested_vmcb->save.efer = svm->vcpu.arch.efer; 2121 nested_vmcb->save.efer = svm->vcpu.arch.efer;
1986 nested_vmcb->save.cr0 = kvm_read_cr0(&svm->vcpu); 2122 nested_vmcb->save.cr0 = kvm_read_cr0(&svm->vcpu);
1987 nested_vmcb->save.cr3 = svm->vcpu.arch.cr3; 2123 nested_vmcb->save.cr3 = kvm_read_cr3(&svm->vcpu);
1988 nested_vmcb->save.cr2 = vmcb->save.cr2; 2124 nested_vmcb->save.cr2 = vmcb->save.cr2;
1989 nested_vmcb->save.cr4 = svm->vcpu.arch.cr4; 2125 nested_vmcb->save.cr4 = svm->vcpu.arch.cr4;
1990 nested_vmcb->save.rflags = vmcb->save.rflags; 2126 nested_vmcb->save.rflags = vmcb->save.rflags;
@@ -2061,6 +2197,8 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
2061 svm->vmcb->save.cpl = 0; 2197 svm->vmcb->save.cpl = 0;
2062 svm->vmcb->control.exit_int_info = 0; 2198 svm->vmcb->control.exit_int_info = 0;
2063 2199
2200 mark_all_dirty(svm->vmcb);
2201
2064 nested_svm_unmap(page); 2202 nested_svm_unmap(page);
2065 2203
2066 nested_svm_uninit_mmu_context(&svm->vcpu); 2204 nested_svm_uninit_mmu_context(&svm->vcpu);
@@ -2148,8 +2286,8 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
2148 nested_vmcb->control.event_inj, 2286 nested_vmcb->control.event_inj,
2149 nested_vmcb->control.nested_ctl); 2287 nested_vmcb->control.nested_ctl);
2150 2288
2151 trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr_read, 2289 trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr & 0xffff,
2152 nested_vmcb->control.intercept_cr_write, 2290 nested_vmcb->control.intercept_cr >> 16,
2153 nested_vmcb->control.intercept_exceptions, 2291 nested_vmcb->control.intercept_exceptions,
2154 nested_vmcb->control.intercept); 2292 nested_vmcb->control.intercept);
2155 2293
@@ -2177,7 +2315,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
2177 if (npt_enabled) 2315 if (npt_enabled)
2178 hsave->save.cr3 = vmcb->save.cr3; 2316 hsave->save.cr3 = vmcb->save.cr3;
2179 else 2317 else
2180 hsave->save.cr3 = svm->vcpu.arch.cr3; 2318 hsave->save.cr3 = kvm_read_cr3(&svm->vcpu);
2181 2319
2182 copy_vmcb_control_area(hsave, vmcb); 2320 copy_vmcb_control_area(hsave, vmcb);
2183 2321
@@ -2229,14 +2367,12 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
2229 svm->nested.vmcb_iopm = nested_vmcb->control.iopm_base_pa & ~0x0fffULL; 2367 svm->nested.vmcb_iopm = nested_vmcb->control.iopm_base_pa & ~0x0fffULL;
2230 2368
2231 /* cache intercepts */ 2369 /* cache intercepts */
2232 svm->nested.intercept_cr_read = nested_vmcb->control.intercept_cr_read; 2370 svm->nested.intercept_cr = nested_vmcb->control.intercept_cr;
2233 svm->nested.intercept_cr_write = nested_vmcb->control.intercept_cr_write; 2371 svm->nested.intercept_dr = nested_vmcb->control.intercept_dr;
2234 svm->nested.intercept_dr_read = nested_vmcb->control.intercept_dr_read;
2235 svm->nested.intercept_dr_write = nested_vmcb->control.intercept_dr_write;
2236 svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions; 2372 svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions;
2237 svm->nested.intercept = nested_vmcb->control.intercept; 2373 svm->nested.intercept = nested_vmcb->control.intercept;
2238 2374
2239 force_new_asid(&svm->vcpu); 2375 svm_flush_tlb(&svm->vcpu);
2240 svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK; 2376 svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK;
2241 if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK) 2377 if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK)
2242 svm->vcpu.arch.hflags |= HF_VINTR_MASK; 2378 svm->vcpu.arch.hflags |= HF_VINTR_MASK;
@@ -2245,29 +2381,12 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
2245 2381
2246 if (svm->vcpu.arch.hflags & HF_VINTR_MASK) { 2382 if (svm->vcpu.arch.hflags & HF_VINTR_MASK) {
2247 /* We only want the cr8 intercept bits of the guest */ 2383 /* We only want the cr8 intercept bits of the guest */
2248 svm->vmcb->control.intercept_cr_read &= ~INTERCEPT_CR8_MASK; 2384 clr_cr_intercept(svm, INTERCEPT_CR8_READ);
2249 svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK; 2385 clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
2250 } 2386 }
2251 2387
2252 /* We don't want to see VMMCALLs from a nested guest */ 2388 /* We don't want to see VMMCALLs from a nested guest */
2253 svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VMMCALL); 2389 clr_intercept(svm, INTERCEPT_VMMCALL);
2254
2255 /*
2256 * We don't want a nested guest to be more powerful than the guest, so
2257 * all intercepts are ORed
2258 */
2259 svm->vmcb->control.intercept_cr_read |=
2260 nested_vmcb->control.intercept_cr_read;
2261 svm->vmcb->control.intercept_cr_write |=
2262 nested_vmcb->control.intercept_cr_write;
2263 svm->vmcb->control.intercept_dr_read |=
2264 nested_vmcb->control.intercept_dr_read;
2265 svm->vmcb->control.intercept_dr_write |=
2266 nested_vmcb->control.intercept_dr_write;
2267 svm->vmcb->control.intercept_exceptions |=
2268 nested_vmcb->control.intercept_exceptions;
2269
2270 svm->vmcb->control.intercept |= nested_vmcb->control.intercept;
2271 2390
2272 svm->vmcb->control.lbr_ctl = nested_vmcb->control.lbr_ctl; 2391 svm->vmcb->control.lbr_ctl = nested_vmcb->control.lbr_ctl;
2273 svm->vmcb->control.int_vector = nested_vmcb->control.int_vector; 2392 svm->vmcb->control.int_vector = nested_vmcb->control.int_vector;
@@ -2278,11 +2397,21 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
2278 2397
2279 nested_svm_unmap(page); 2398 nested_svm_unmap(page);
2280 2399
2281 /* nested_vmcb is our indicator if nested SVM is activated */ 2400 /* Enter Guest-Mode */
2401 enter_guest_mode(&svm->vcpu);
2402
2403 /*
2404 * Merge guest and host intercepts - must be called with vcpu in
 2405 * guest-mode to take effect here
2406 */
2407 recalc_intercepts(svm);
2408
2282 svm->nested.vmcb = vmcb_gpa; 2409 svm->nested.vmcb = vmcb_gpa;
2283 2410
2284 enable_gif(svm); 2411 enable_gif(svm);
2285 2412
2413 mark_all_dirty(svm->vmcb);
2414
2286 return true; 2415 return true;
2287} 2416}
2288 2417
@@ -2400,6 +2529,8 @@ static int clgi_interception(struct vcpu_svm *svm)
2400 svm_clear_vintr(svm); 2529 svm_clear_vintr(svm);
2401 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; 2530 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
2402 2531
2532 mark_dirty(svm->vmcb, VMCB_INTR);
2533
2403 return 1; 2534 return 1;
2404} 2535}
2405 2536
@@ -2426,6 +2557,19 @@ static int skinit_interception(struct vcpu_svm *svm)
2426 return 1; 2557 return 1;
2427} 2558}
2428 2559
2560static int xsetbv_interception(struct vcpu_svm *svm)
2561{
2562 u64 new_bv = kvm_read_edx_eax(&svm->vcpu);
2563 u32 index = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX);
2564
2565 if (kvm_set_xcr(&svm->vcpu, index, new_bv) == 0) {
2566 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2567 skip_emulated_instruction(&svm->vcpu);
2568 }
2569
2570 return 1;
2571}
2572
2429static int invalid_op_interception(struct vcpu_svm *svm) 2573static int invalid_op_interception(struct vcpu_svm *svm)
2430{ 2574{
2431 kvm_queue_exception(&svm->vcpu, UD_VECTOR); 2575 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
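For context on the new xsetbv_interception() above: XSETBV takes the XCR index in ECX and the new value in EDX:EAX, and the instruction encoding is three bytes long, which is why the handler advances next_rip by 3. A sketch of what the guest side might execute (the values are illustrative only, not taken from this patch):

#include <stdint.h>

/* guest-side sketch: enable x87 + SSE state in XCR0 */
static void guest_set_xcr0(void)
{
	uint32_t xcr_index = 0;		/* XCR0 */
	uint64_t new_xcr0  = 0x3;	/* bit 0: x87, bit 1: SSE */

	asm volatile("xsetbv"
		     :
		     : "c"(xcr_index),
		       "a"((uint32_t)new_xcr0),
		       "d"((uint32_t)(new_xcr0 >> 32)));
}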
@@ -2507,19 +2651,92 @@ static int cpuid_interception(struct vcpu_svm *svm)
2507static int iret_interception(struct vcpu_svm *svm) 2651static int iret_interception(struct vcpu_svm *svm)
2508{ 2652{
2509 ++svm->vcpu.stat.nmi_window_exits; 2653 ++svm->vcpu.stat.nmi_window_exits;
2510 svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_IRET); 2654 clr_intercept(svm, INTERCEPT_IRET);
2511 svm->vcpu.arch.hflags |= HF_IRET_MASK; 2655 svm->vcpu.arch.hflags |= HF_IRET_MASK;
2512 return 1; 2656 return 1;
2513} 2657}
2514 2658
2515static int invlpg_interception(struct vcpu_svm *svm) 2659static int invlpg_interception(struct vcpu_svm *svm)
2516{ 2660{
2517 return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE; 2661 if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
2662 return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
2663
2664 kvm_mmu_invlpg(&svm->vcpu, svm->vmcb->control.exit_info_1);
2665 skip_emulated_instruction(&svm->vcpu);
2666 return 1;
2518} 2667}
2519 2668
2520static int emulate_on_interception(struct vcpu_svm *svm) 2669static int emulate_on_interception(struct vcpu_svm *svm)
2521{ 2670{
2522 return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE; 2671 return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
2672}
2673
2674#define CR_VALID (1ULL << 63)
2675
2676static int cr_interception(struct vcpu_svm *svm)
2677{
2678 int reg, cr;
2679 unsigned long val;
2680 int err;
2681
2682 if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
2683 return emulate_on_interception(svm);
2684
2685 if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0))
2686 return emulate_on_interception(svm);
2687
2688 reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
2689 cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0;
2690
2691 err = 0;
2692 if (cr >= 16) { /* mov to cr */
2693 cr -= 16;
2694 val = kvm_register_read(&svm->vcpu, reg);
2695 switch (cr) {
2696 case 0:
2697 err = kvm_set_cr0(&svm->vcpu, val);
2698 break;
2699 case 3:
2700 err = kvm_set_cr3(&svm->vcpu, val);
2701 break;
2702 case 4:
2703 err = kvm_set_cr4(&svm->vcpu, val);
2704 break;
2705 case 8:
2706 err = kvm_set_cr8(&svm->vcpu, val);
2707 break;
2708 default:
2709 WARN(1, "unhandled write to CR%d", cr);
2710 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
2711 return 1;
2712 }
2713 } else { /* mov from cr */
2714 switch (cr) {
2715 case 0:
2716 val = kvm_read_cr0(&svm->vcpu);
2717 break;
2718 case 2:
2719 val = svm->vcpu.arch.cr2;
2720 break;
2721 case 3:
2722 val = kvm_read_cr3(&svm->vcpu);
2723 break;
2724 case 4:
2725 val = kvm_read_cr4(&svm->vcpu);
2726 break;
2727 case 8:
2728 val = kvm_get_cr8(&svm->vcpu);
2729 break;
2730 default:
2731 WARN(1, "unhandled read from CR%d", cr);
2732 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
2733 return 1;
2734 }
2735 kvm_register_write(&svm->vcpu, reg, val);
2736 }
2737 kvm_complete_insn_gp(&svm->vcpu, err);
2738
2739 return 1;
2523} 2740}
2524 2741
2525static int cr0_write_interception(struct vcpu_svm *svm) 2742static int cr0_write_interception(struct vcpu_svm *svm)
@@ -2527,7 +2744,7 @@ static int cr0_write_interception(struct vcpu_svm *svm)
2527 struct kvm_vcpu *vcpu = &svm->vcpu; 2744 struct kvm_vcpu *vcpu = &svm->vcpu;
2528 int r; 2745 int r;
2529 2746
2530 r = emulate_instruction(&svm->vcpu, 0, 0, 0); 2747 r = cr_interception(svm);
2531 2748
2532 if (svm->nested.vmexit_rip) { 2749 if (svm->nested.vmexit_rip) {
2533 kvm_register_write(vcpu, VCPU_REGS_RIP, svm->nested.vmexit_rip); 2750 kvm_register_write(vcpu, VCPU_REGS_RIP, svm->nested.vmexit_rip);
@@ -2536,22 +2753,47 @@ static int cr0_write_interception(struct vcpu_svm *svm)
2536 svm->nested.vmexit_rip = 0; 2753 svm->nested.vmexit_rip = 0;
2537 } 2754 }
2538 2755
2539 return r == EMULATE_DONE; 2756 return r;
2757}
2758
2759static int dr_interception(struct vcpu_svm *svm)
2760{
2761 int reg, dr;
2762 unsigned long val;
2763 int err;
2764
2765 if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS))
2766 return emulate_on_interception(svm);
2767
2768 reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
2769 dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;
2770
2771 if (dr >= 16) { /* mov to DRn */
2772 val = kvm_register_read(&svm->vcpu, reg);
2773 kvm_set_dr(&svm->vcpu, dr - 16, val);
2774 } else {
2775 err = kvm_get_dr(&svm->vcpu, dr, &val);
2776 if (!err)
2777 kvm_register_write(&svm->vcpu, reg, val);
2778 }
2779
2780 return 1;
2540} 2781}
2541 2782
2542static int cr8_write_interception(struct vcpu_svm *svm) 2783static int cr8_write_interception(struct vcpu_svm *svm)
2543{ 2784{
2544 struct kvm_run *kvm_run = svm->vcpu.run; 2785 struct kvm_run *kvm_run = svm->vcpu.run;
2786 int r;
2545 2787
2546 u8 cr8_prev = kvm_get_cr8(&svm->vcpu); 2788 u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
2547 /* instruction emulation calls kvm_set_cr8() */ 2789 /* instruction emulation calls kvm_set_cr8() */
2548 emulate_instruction(&svm->vcpu, 0, 0, 0); 2790 r = cr_interception(svm);
2549 if (irqchip_in_kernel(svm->vcpu.kvm)) { 2791 if (irqchip_in_kernel(svm->vcpu.kvm)) {
2550 svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK; 2792 clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
2551 return 1; 2793 return r;
2552 } 2794 }
2553 if (cr8_prev <= kvm_get_cr8(&svm->vcpu)) 2795 if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
2554 return 1; 2796 return r;
2555 kvm_run->exit_reason = KVM_EXIT_SET_TPR; 2797 kvm_run->exit_reason = KVM_EXIT_SET_TPR;
2556 return 0; 2798 return 0;
2557} 2799}
@@ -2562,14 +2804,9 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
2562 2804
2563 switch (ecx) { 2805 switch (ecx) {
2564 case MSR_IA32_TSC: { 2806 case MSR_IA32_TSC: {
2565 u64 tsc_offset; 2807 struct vmcb *vmcb = get_host_vmcb(svm);
2566 2808
2567 if (is_nested(svm)) 2809 *data = vmcb->control.tsc_offset + native_read_tsc();
2568 tsc_offset = svm->nested.hsave->control.tsc_offset;
2569 else
2570 tsc_offset = svm->vmcb->control.tsc_offset;
2571
2572 *data = tsc_offset + native_read_tsc();
2573 break; 2810 break;
2574 } 2811 }
2575 case MSR_STAR: 2812 case MSR_STAR:
@@ -2714,7 +2951,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
2714 svm->vmcb->save.sysenter_esp = data; 2951 svm->vmcb->save.sysenter_esp = data;
2715 break; 2952 break;
2716 case MSR_IA32_DEBUGCTLMSR: 2953 case MSR_IA32_DEBUGCTLMSR:
2717 if (!svm_has(SVM_FEATURE_LBRV)) { 2954 if (!boot_cpu_has(X86_FEATURE_LBRV)) {
2718 pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n", 2955 pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
2719 __func__, data); 2956 __func__, data);
2720 break; 2957 break;
@@ -2723,6 +2960,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
2723 return 1; 2960 return 1;
2724 2961
2725 svm->vmcb->save.dbgctl = data; 2962 svm->vmcb->save.dbgctl = data;
2963 mark_dirty(svm->vmcb, VMCB_LBR);
2726 if (data & (1ULL<<0)) 2964 if (data & (1ULL<<0))
2727 svm_enable_lbrv(svm); 2965 svm_enable_lbrv(svm);
2728 else 2966 else
@@ -2775,6 +3013,7 @@ static int interrupt_window_interception(struct vcpu_svm *svm)
2775 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); 3013 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
2776 svm_clear_vintr(svm); 3014 svm_clear_vintr(svm);
2777 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; 3015 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
3016 mark_dirty(svm->vmcb, VMCB_INTR);
2778 /* 3017 /*
2779 * If the user space waits to inject interrupts, exit as soon as 3018 * If the user space waits to inject interrupts, exit as soon as
2780 * possible 3019 * possible
@@ -2797,31 +3036,31 @@ static int pause_interception(struct vcpu_svm *svm)
2797} 3036}
2798 3037
2799static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { 3038static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
2800 [SVM_EXIT_READ_CR0] = emulate_on_interception, 3039 [SVM_EXIT_READ_CR0] = cr_interception,
2801 [SVM_EXIT_READ_CR3] = emulate_on_interception, 3040 [SVM_EXIT_READ_CR3] = cr_interception,
2802 [SVM_EXIT_READ_CR4] = emulate_on_interception, 3041 [SVM_EXIT_READ_CR4] = cr_interception,
2803 [SVM_EXIT_READ_CR8] = emulate_on_interception, 3042 [SVM_EXIT_READ_CR8] = cr_interception,
2804 [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, 3043 [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception,
2805 [SVM_EXIT_WRITE_CR0] = cr0_write_interception, 3044 [SVM_EXIT_WRITE_CR0] = cr0_write_interception,
2806 [SVM_EXIT_WRITE_CR3] = emulate_on_interception, 3045 [SVM_EXIT_WRITE_CR3] = cr_interception,
2807 [SVM_EXIT_WRITE_CR4] = emulate_on_interception, 3046 [SVM_EXIT_WRITE_CR4] = cr_interception,
2808 [SVM_EXIT_WRITE_CR8] = cr8_write_interception, 3047 [SVM_EXIT_WRITE_CR8] = cr8_write_interception,
2809 [SVM_EXIT_READ_DR0] = emulate_on_interception, 3048 [SVM_EXIT_READ_DR0] = dr_interception,
2810 [SVM_EXIT_READ_DR1] = emulate_on_interception, 3049 [SVM_EXIT_READ_DR1] = dr_interception,
2811 [SVM_EXIT_READ_DR2] = emulate_on_interception, 3050 [SVM_EXIT_READ_DR2] = dr_interception,
2812 [SVM_EXIT_READ_DR3] = emulate_on_interception, 3051 [SVM_EXIT_READ_DR3] = dr_interception,
2813 [SVM_EXIT_READ_DR4] = emulate_on_interception, 3052 [SVM_EXIT_READ_DR4] = dr_interception,
2814 [SVM_EXIT_READ_DR5] = emulate_on_interception, 3053 [SVM_EXIT_READ_DR5] = dr_interception,
2815 [SVM_EXIT_READ_DR6] = emulate_on_interception, 3054 [SVM_EXIT_READ_DR6] = dr_interception,
2816 [SVM_EXIT_READ_DR7] = emulate_on_interception, 3055 [SVM_EXIT_READ_DR7] = dr_interception,
2817 [SVM_EXIT_WRITE_DR0] = emulate_on_interception, 3056 [SVM_EXIT_WRITE_DR0] = dr_interception,
2818 [SVM_EXIT_WRITE_DR1] = emulate_on_interception, 3057 [SVM_EXIT_WRITE_DR1] = dr_interception,
2819 [SVM_EXIT_WRITE_DR2] = emulate_on_interception, 3058 [SVM_EXIT_WRITE_DR2] = dr_interception,
2820 [SVM_EXIT_WRITE_DR3] = emulate_on_interception, 3059 [SVM_EXIT_WRITE_DR3] = dr_interception,
2821 [SVM_EXIT_WRITE_DR4] = emulate_on_interception, 3060 [SVM_EXIT_WRITE_DR4] = dr_interception,
2822 [SVM_EXIT_WRITE_DR5] = emulate_on_interception, 3061 [SVM_EXIT_WRITE_DR5] = dr_interception,
2823 [SVM_EXIT_WRITE_DR6] = emulate_on_interception, 3062 [SVM_EXIT_WRITE_DR6] = dr_interception,
2824 [SVM_EXIT_WRITE_DR7] = emulate_on_interception, 3063 [SVM_EXIT_WRITE_DR7] = dr_interception,
2825 [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception, 3064 [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception,
2826 [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception, 3065 [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception,
2827 [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception, 3066 [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception,
@@ -2854,6 +3093,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
2854 [SVM_EXIT_WBINVD] = emulate_on_interception, 3093 [SVM_EXIT_WBINVD] = emulate_on_interception,
2855 [SVM_EXIT_MONITOR] = invalid_op_interception, 3094 [SVM_EXIT_MONITOR] = invalid_op_interception,
2856 [SVM_EXIT_MWAIT] = invalid_op_interception, 3095 [SVM_EXIT_MWAIT] = invalid_op_interception,
3096 [SVM_EXIT_XSETBV] = xsetbv_interception,
2857 [SVM_EXIT_NPF] = pf_interception, 3097 [SVM_EXIT_NPF] = pf_interception,
2858}; 3098};
2859 3099
@@ -2864,10 +3104,10 @@ void dump_vmcb(struct kvm_vcpu *vcpu)
2864 struct vmcb_save_area *save = &svm->vmcb->save; 3104 struct vmcb_save_area *save = &svm->vmcb->save;
2865 3105
2866 pr_err("VMCB Control Area:\n"); 3106 pr_err("VMCB Control Area:\n");
2867 pr_err("cr_read: %04x\n", control->intercept_cr_read); 3107 pr_err("cr_read: %04x\n", control->intercept_cr & 0xffff);
2868 pr_err("cr_write: %04x\n", control->intercept_cr_write); 3108 pr_err("cr_write: %04x\n", control->intercept_cr >> 16);
2869 pr_err("dr_read: %04x\n", control->intercept_dr_read); 3109 pr_err("dr_read: %04x\n", control->intercept_dr & 0xffff);
2870 pr_err("dr_write: %04x\n", control->intercept_dr_write); 3110 pr_err("dr_write: %04x\n", control->intercept_dr >> 16);
2871 pr_err("exceptions: %08x\n", control->intercept_exceptions); 3111 pr_err("exceptions: %08x\n", control->intercept_exceptions);
2872 pr_err("intercepts: %016llx\n", control->intercept); 3112 pr_err("intercepts: %016llx\n", control->intercept);
2873 pr_err("pause filter count: %d\n", control->pause_filter_count); 3113 pr_err("pause filter count: %d\n", control->pause_filter_count);
@@ -2950,15 +3190,23 @@ void dump_vmcb(struct kvm_vcpu *vcpu)
2950 3190
2951} 3191}
2952 3192
3193static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
3194{
3195 struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
3196
3197 *info1 = control->exit_info_1;
3198 *info2 = control->exit_info_2;
3199}
3200
2953static int handle_exit(struct kvm_vcpu *vcpu) 3201static int handle_exit(struct kvm_vcpu *vcpu)
2954{ 3202{
2955 struct vcpu_svm *svm = to_svm(vcpu); 3203 struct vcpu_svm *svm = to_svm(vcpu);
2956 struct kvm_run *kvm_run = vcpu->run; 3204 struct kvm_run *kvm_run = vcpu->run;
2957 u32 exit_code = svm->vmcb->control.exit_code; 3205 u32 exit_code = svm->vmcb->control.exit_code;
2958 3206
2959 trace_kvm_exit(exit_code, vcpu); 3207 trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM);
2960 3208
2961 if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR0_MASK)) 3209 if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE))
2962 vcpu->arch.cr0 = svm->vmcb->save.cr0; 3210 vcpu->arch.cr0 = svm->vmcb->save.cr0;
2963 if (npt_enabled) 3211 if (npt_enabled)
2964 vcpu->arch.cr3 = svm->vmcb->save.cr3; 3212 vcpu->arch.cr3 = svm->vmcb->save.cr3;
@@ -2970,7 +3218,7 @@ static int handle_exit(struct kvm_vcpu *vcpu)
2970 return 1; 3218 return 1;
2971 } 3219 }
2972 3220
2973 if (is_nested(svm)) { 3221 if (is_guest_mode(vcpu)) {
2974 int vmexit; 3222 int vmexit;
2975 3223
2976 trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code, 3224 trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code,
@@ -3033,7 +3281,6 @@ static void pre_svm_run(struct vcpu_svm *svm)
3033 3281
3034 struct svm_cpu_data *sd = per_cpu(svm_data, cpu); 3282 struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
3035 3283
3036 svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
3037 /* FIXME: handle wraparound of asid_generation */ 3284 /* FIXME: handle wraparound of asid_generation */
3038 if (svm->asid_generation != sd->asid_generation) 3285 if (svm->asid_generation != sd->asid_generation)
3039 new_asid(svm, sd); 3286 new_asid(svm, sd);
@@ -3045,7 +3292,7 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu)
3045 3292
3046 svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI; 3293 svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
3047 vcpu->arch.hflags |= HF_NMI_MASK; 3294 vcpu->arch.hflags |= HF_NMI_MASK;
3048 svm->vmcb->control.intercept |= (1ULL << INTERCEPT_IRET); 3295 set_intercept(svm, INTERCEPT_IRET);
3049 ++vcpu->stat.nmi_injections; 3296 ++vcpu->stat.nmi_injections;
3050} 3297}
3051 3298
@@ -3058,6 +3305,7 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
3058 control->int_ctl &= ~V_INTR_PRIO_MASK; 3305 control->int_ctl &= ~V_INTR_PRIO_MASK;
3059 control->int_ctl |= V_IRQ_MASK | 3306 control->int_ctl |= V_IRQ_MASK |
3060 ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT); 3307 ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
3308 mark_dirty(svm->vmcb, VMCB_INTR);
3061} 3309}
3062 3310
3063static void svm_set_irq(struct kvm_vcpu *vcpu) 3311static void svm_set_irq(struct kvm_vcpu *vcpu)
@@ -3077,14 +3325,14 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
3077{ 3325{
3078 struct vcpu_svm *svm = to_svm(vcpu); 3326 struct vcpu_svm *svm = to_svm(vcpu);
3079 3327
3080 if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK)) 3328 if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))
3081 return; 3329 return;
3082 3330
3083 if (irr == -1) 3331 if (irr == -1)
3084 return; 3332 return;
3085 3333
3086 if (tpr >= irr) 3334 if (tpr >= irr)
3087 svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR8_MASK; 3335 set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
3088} 3336}
3089 3337
3090static int svm_nmi_allowed(struct kvm_vcpu *vcpu) 3338static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
@@ -3112,10 +3360,10 @@ static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
3112 3360
3113 if (masked) { 3361 if (masked) {
3114 svm->vcpu.arch.hflags |= HF_NMI_MASK; 3362 svm->vcpu.arch.hflags |= HF_NMI_MASK;
3115 svm->vmcb->control.intercept |= (1ULL << INTERCEPT_IRET); 3363 set_intercept(svm, INTERCEPT_IRET);
3116 } else { 3364 } else {
3117 svm->vcpu.arch.hflags &= ~HF_NMI_MASK; 3365 svm->vcpu.arch.hflags &= ~HF_NMI_MASK;
3118 svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_IRET); 3366 clr_intercept(svm, INTERCEPT_IRET);
3119 } 3367 }
3120} 3368}
3121 3369
@@ -3131,7 +3379,7 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
3131 3379
3132 ret = !!(vmcb->save.rflags & X86_EFLAGS_IF); 3380 ret = !!(vmcb->save.rflags & X86_EFLAGS_IF);
3133 3381
3134 if (is_nested(svm)) 3382 if (is_guest_mode(vcpu))
3135 return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK); 3383 return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK);
3136 3384
3137 return ret; 3385 return ret;
@@ -3177,7 +3425,12 @@ static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
3177 3425
3178static void svm_flush_tlb(struct kvm_vcpu *vcpu) 3426static void svm_flush_tlb(struct kvm_vcpu *vcpu)
3179{ 3427{
3180 force_new_asid(vcpu); 3428 struct vcpu_svm *svm = to_svm(vcpu);
3429
3430 if (static_cpu_has(X86_FEATURE_FLUSHBYASID))
3431 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
3432 else
3433 svm->asid_generation--;
3181} 3434}
3182 3435
3183static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu) 3436static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
@@ -3188,10 +3441,10 @@ static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
3188{ 3441{
3189 struct vcpu_svm *svm = to_svm(vcpu); 3442 struct vcpu_svm *svm = to_svm(vcpu);
3190 3443
3191 if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK)) 3444 if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))
3192 return; 3445 return;
3193 3446
3194 if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR8_MASK)) { 3447 if (!is_cr_intercept(svm, INTERCEPT_CR8_WRITE)) {
3195 int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK; 3448 int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
3196 kvm_set_cr8(vcpu, cr8); 3449 kvm_set_cr8(vcpu, cr8);
3197 } 3450 }
@@ -3202,7 +3455,7 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
3202 struct vcpu_svm *svm = to_svm(vcpu); 3455 struct vcpu_svm *svm = to_svm(vcpu);
3203 u64 cr8; 3456 u64 cr8;
3204 3457
3205 if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK)) 3458 if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))
3206 return; 3459 return;
3207 3460
3208 cr8 = kvm_get_cr8(vcpu); 3461 cr8 = kvm_get_cr8(vcpu);
@@ -3289,9 +3542,6 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu)
3289static void svm_vcpu_run(struct kvm_vcpu *vcpu) 3542static void svm_vcpu_run(struct kvm_vcpu *vcpu)
3290{ 3543{
3291 struct vcpu_svm *svm = to_svm(vcpu); 3544 struct vcpu_svm *svm = to_svm(vcpu);
3292 u16 fs_selector;
3293 u16 gs_selector;
3294 u16 ldt_selector;
3295 3545
3296 svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; 3546 svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
3297 svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; 3547 svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
@@ -3308,10 +3558,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
3308 3558
3309 sync_lapic_to_cr8(vcpu); 3559 sync_lapic_to_cr8(vcpu);
3310 3560
3311 save_host_msrs(vcpu);
3312 savesegment(fs, fs_selector);
3313 savesegment(gs, gs_selector);
3314 ldt_selector = kvm_read_ldt();
3315 svm->vmcb->save.cr2 = vcpu->arch.cr2; 3561 svm->vmcb->save.cr2 = vcpu->arch.cr2;
3316 3562
3317 clgi(); 3563 clgi();
@@ -3389,20 +3635,11 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
3389#endif 3635#endif
3390 ); 3636 );
3391 3637
3392 vcpu->arch.cr2 = svm->vmcb->save.cr2;
3393 vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
3394 vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
3395 vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
3396
3397 load_host_msrs(vcpu);
3398 loadsegment(fs, fs_selector);
3399#ifdef CONFIG_X86_64 3638#ifdef CONFIG_X86_64
3400 load_gs_index(gs_selector); 3639 wrmsrl(MSR_GS_BASE, svm->host.gs_base);
3401 wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs);
3402#else 3640#else
3403 loadsegment(gs, gs_selector); 3641 loadsegment(fs, svm->host.fs);
3404#endif 3642#endif
3405 kvm_load_ldt(ldt_selector);
3406 3643
3407 reload_tss(vcpu); 3644 reload_tss(vcpu);
3408 3645
@@ -3410,10 +3647,21 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
3410 3647
3411 stgi(); 3648 stgi();
3412 3649
3650 vcpu->arch.cr2 = svm->vmcb->save.cr2;
3651 vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
3652 vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
3653 vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
3654
3413 sync_cr8_to_lapic(vcpu); 3655 sync_cr8_to_lapic(vcpu);
3414 3656
3415 svm->next_rip = 0; 3657 svm->next_rip = 0;
3416 3658
3659 svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
3660
3661 /* if exit due to PF check for async PF */
3662 if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
3663 svm->apf_reason = kvm_read_and_reset_pf_reason();
3664
3417 if (npt_enabled) { 3665 if (npt_enabled) {
3418 vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR); 3666 vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR);
3419 vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR); 3667 vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR);
@@ -3426,6 +3674,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
3426 if (unlikely(svm->vmcb->control.exit_code == 3674 if (unlikely(svm->vmcb->control.exit_code ==
3427 SVM_EXIT_EXCP_BASE + MC_VECTOR)) 3675 SVM_EXIT_EXCP_BASE + MC_VECTOR))
3428 svm_handle_mce(svm); 3676 svm_handle_mce(svm);
3677
3678 mark_all_clean(svm->vmcb);
3429} 3679}
3430 3680
3431#undef R 3681#undef R
@@ -3435,7 +3685,8 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
3435 struct vcpu_svm *svm = to_svm(vcpu); 3685 struct vcpu_svm *svm = to_svm(vcpu);
3436 3686
3437 svm->vmcb->save.cr3 = root; 3687 svm->vmcb->save.cr3 = root;
3438 force_new_asid(vcpu); 3688 mark_dirty(svm->vmcb, VMCB_CR);
3689 svm_flush_tlb(vcpu);
3439} 3690}
3440 3691
3441static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root) 3692static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root)
@@ -3443,11 +3694,13 @@ static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root)
3443 struct vcpu_svm *svm = to_svm(vcpu); 3694 struct vcpu_svm *svm = to_svm(vcpu);
3444 3695
3445 svm->vmcb->control.nested_cr3 = root; 3696 svm->vmcb->control.nested_cr3 = root;
3697 mark_dirty(svm->vmcb, VMCB_NPT);
3446 3698
3447 /* Also sync guest cr3 here in case we live migrate */ 3699 /* Also sync guest cr3 here in case we live migrate */
3448 svm->vmcb->save.cr3 = vcpu->arch.cr3; 3700 svm->vmcb->save.cr3 = kvm_read_cr3(vcpu);
3701 mark_dirty(svm->vmcb, VMCB_CR);
3449 3702
3450 force_new_asid(vcpu); 3703 svm_flush_tlb(vcpu);
3451} 3704}
3452 3705
3453static int is_disabled(void) 3706static int is_disabled(void)
@@ -3507,7 +3760,7 @@ static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
3507 additional features */ 3760 additional features */
3508 3761
3509 /* Support next_rip if host supports it */ 3762 /* Support next_rip if host supports it */
3510 if (svm_has(SVM_FEATURE_NRIP)) 3763 if (boot_cpu_has(X86_FEATURE_NRIPS))
3511 entry->edx |= SVM_FEATURE_NRIP; 3764 entry->edx |= SVM_FEATURE_NRIP;
3512 3765
3513 /* Support NPT for the guest if enabled */ 3766 /* Support NPT for the guest if enabled */
@@ -3567,6 +3820,7 @@ static const struct trace_print_flags svm_exit_reasons_str[] = {
3567 { SVM_EXIT_WBINVD, "wbinvd" }, 3820 { SVM_EXIT_WBINVD, "wbinvd" },
3568 { SVM_EXIT_MONITOR, "monitor" }, 3821 { SVM_EXIT_MONITOR, "monitor" },
3569 { SVM_EXIT_MWAIT, "mwait" }, 3822 { SVM_EXIT_MWAIT, "mwait" },
3823 { SVM_EXIT_XSETBV, "xsetbv" },
3570 { SVM_EXIT_NPF, "npf" }, 3824 { SVM_EXIT_NPF, "npf" },
3571 { -1, NULL } 3825 { -1, NULL }
3572}; 3826};
@@ -3590,9 +3844,7 @@ static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
3590{ 3844{
3591 struct vcpu_svm *svm = to_svm(vcpu); 3845 struct vcpu_svm *svm = to_svm(vcpu);
3592 3846
3593 svm->vmcb->control.intercept_exceptions |= 1 << NM_VECTOR; 3847 set_exception_intercept(svm, NM_VECTOR);
3594 if (is_nested(svm))
3595 svm->nested.hsave->control.intercept_exceptions |= 1 << NM_VECTOR;
3596 update_cr0_intercept(svm); 3848 update_cr0_intercept(svm);
3597} 3849}
3598 3850
@@ -3623,6 +3875,7 @@ static struct kvm_x86_ops svm_x86_ops = {
3623 .get_cpl = svm_get_cpl, 3875 .get_cpl = svm_get_cpl,
3624 .get_cs_db_l_bits = kvm_get_cs_db_l_bits, 3876 .get_cs_db_l_bits = kvm_get_cs_db_l_bits,
3625 .decache_cr0_guest_bits = svm_decache_cr0_guest_bits, 3877 .decache_cr0_guest_bits = svm_decache_cr0_guest_bits,
3878 .decache_cr3 = svm_decache_cr3,
3626 .decache_cr4_guest_bits = svm_decache_cr4_guest_bits, 3879 .decache_cr4_guest_bits = svm_decache_cr4_guest_bits,
3627 .set_cr0 = svm_set_cr0, 3880 .set_cr0 = svm_set_cr0,
3628 .set_cr3 = svm_set_cr3, 3881 .set_cr3 = svm_set_cr3,
@@ -3663,7 +3916,9 @@ static struct kvm_x86_ops svm_x86_ops = {
3663 .get_tdp_level = get_npt_level, 3916 .get_tdp_level = get_npt_level,
3664 .get_mt_mask = svm_get_mt_mask, 3917 .get_mt_mask = svm_get_mt_mask,
3665 3918
3919 .get_exit_info = svm_get_exit_info,
3666 .exit_reasons_str = svm_exit_reasons_str, 3920 .exit_reasons_str = svm_exit_reasons_str,
3921
3667 .get_lpage_level = svm_get_lpage_level, 3922 .get_lpage_level = svm_get_lpage_level,
3668 3923
3669 .cpuid_update = svm_cpuid_update, 3924 .cpuid_update = svm_cpuid_update,
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index a6544b8e7c0f..1357d7cf4ec8 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -178,27 +178,36 @@ TRACE_EVENT(kvm_apic,
178#define trace_kvm_apic_read(reg, val) trace_kvm_apic(0, reg, val) 178#define trace_kvm_apic_read(reg, val) trace_kvm_apic(0, reg, val)
179#define trace_kvm_apic_write(reg, val) trace_kvm_apic(1, reg, val) 179#define trace_kvm_apic_write(reg, val) trace_kvm_apic(1, reg, val)
180 180
181#define KVM_ISA_VMX 1
182#define KVM_ISA_SVM 2
183
181/* 184/*
182 * Tracepoint for kvm guest exit: 185 * Tracepoint for kvm guest exit:
183 */ 186 */
184TRACE_EVENT(kvm_exit, 187TRACE_EVENT(kvm_exit,
185 TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu), 188 TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu, u32 isa),
186 TP_ARGS(exit_reason, vcpu), 189 TP_ARGS(exit_reason, vcpu, isa),
187 190
188 TP_STRUCT__entry( 191 TP_STRUCT__entry(
189 __field( unsigned int, exit_reason ) 192 __field( unsigned int, exit_reason )
190 __field( unsigned long, guest_rip ) 193 __field( unsigned long, guest_rip )
194 __field( u32, isa )
195 __field( u64, info1 )
196 __field( u64, info2 )
191 ), 197 ),
192 198
193 TP_fast_assign( 199 TP_fast_assign(
194 __entry->exit_reason = exit_reason; 200 __entry->exit_reason = exit_reason;
195 __entry->guest_rip = kvm_rip_read(vcpu); 201 __entry->guest_rip = kvm_rip_read(vcpu);
202 __entry->isa = isa;
203 kvm_x86_ops->get_exit_info(vcpu, &__entry->info1,
204 &__entry->info2);
196 ), 205 ),
197 206
198 TP_printk("reason %s rip 0x%lx", 207 TP_printk("reason %s rip 0x%lx info %llx %llx",
199 ftrace_print_symbols_seq(p, __entry->exit_reason, 208 ftrace_print_symbols_seq(p, __entry->exit_reason,
200 kvm_x86_ops->exit_reasons_str), 209 kvm_x86_ops->exit_reasons_str),
201 __entry->guest_rip) 210 __entry->guest_rip, __entry->info1, __entry->info2)
202); 211);
203 212
204/* 213/*
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 8da0e45ff7c9..bf89ec2cfb82 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -69,6 +69,9 @@ module_param(emulate_invalid_guest_state, bool, S_IRUGO);
69static int __read_mostly vmm_exclusive = 1; 69static int __read_mostly vmm_exclusive = 1;
70module_param(vmm_exclusive, bool, S_IRUGO); 70module_param(vmm_exclusive, bool, S_IRUGO);
71 71
72static int __read_mostly yield_on_hlt = 1;
73module_param(yield_on_hlt, bool, S_IRUGO);
74
72#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \ 75#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \
73 (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD) 76 (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
74#define KVM_GUEST_CR0_MASK \ 77#define KVM_GUEST_CR0_MASK \
@@ -177,6 +180,7 @@ static int init_rmode(struct kvm *kvm);
177static u64 construct_eptp(unsigned long root_hpa); 180static u64 construct_eptp(unsigned long root_hpa);
178static void kvm_cpu_vmxon(u64 addr); 181static void kvm_cpu_vmxon(u64 addr);
179static void kvm_cpu_vmxoff(void); 182static void kvm_cpu_vmxoff(void);
183static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
180 184
181static DEFINE_PER_CPU(struct vmcs *, vmxarea); 185static DEFINE_PER_CPU(struct vmcs *, vmxarea);
182static DEFINE_PER_CPU(struct vmcs *, current_vmcs); 186static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -188,6 +192,8 @@ static unsigned long *vmx_io_bitmap_b;
188static unsigned long *vmx_msr_bitmap_legacy; 192static unsigned long *vmx_msr_bitmap_legacy;
189static unsigned long *vmx_msr_bitmap_longmode; 193static unsigned long *vmx_msr_bitmap_longmode;
190 194
195static bool cpu_has_load_ia32_efer;
196
191static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS); 197static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
192static DEFINE_SPINLOCK(vmx_vpid_lock); 198static DEFINE_SPINLOCK(vmx_vpid_lock);
193 199
@@ -472,7 +478,7 @@ static void vmcs_clear(struct vmcs *vmcs)
472 u8 error; 478 u8 error;
473 479
474 asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0" 480 asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0"
475 : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) 481 : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
476 : "cc", "memory"); 482 : "cc", "memory");
477 if (error) 483 if (error)
478 printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n", 484 printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
@@ -485,7 +491,7 @@ static void vmcs_load(struct vmcs *vmcs)
485 u8 error; 491 u8 error;
486 492
487 asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0" 493 asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
488 : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) 494 : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
489 : "cc", "memory"); 495 : "cc", "memory");
490 if (error) 496 if (error)
491 printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n", 497 printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
@@ -565,10 +571,10 @@ static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa)
565 571
566static unsigned long vmcs_readl(unsigned long field) 572static unsigned long vmcs_readl(unsigned long field)
567{ 573{
568 unsigned long value; 574 unsigned long value = 0;
569 575
570 asm volatile (__ex(ASM_VMX_VMREAD_RDX_RAX) 576 asm volatile (__ex(ASM_VMX_VMREAD_RDX_RAX)
571 : "=a"(value) : "d"(field) : "cc"); 577 : "+a"(value) : "d"(field) : "cc");
572 return value; 578 return value;
573} 579}
574 580
@@ -661,6 +667,12 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
661 unsigned i; 667 unsigned i;
662 struct msr_autoload *m = &vmx->msr_autoload; 668 struct msr_autoload *m = &vmx->msr_autoload;
663 669
670 if (msr == MSR_EFER && cpu_has_load_ia32_efer) {
671 vmcs_clear_bits(VM_ENTRY_CONTROLS, VM_ENTRY_LOAD_IA32_EFER);
672 vmcs_clear_bits(VM_EXIT_CONTROLS, VM_EXIT_LOAD_IA32_EFER);
673 return;
674 }
675
664 for (i = 0; i < m->nr; ++i) 676 for (i = 0; i < m->nr; ++i)
665 if (m->guest[i].index == msr) 677 if (m->guest[i].index == msr)
666 break; 678 break;
@@ -680,6 +692,14 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
680 unsigned i; 692 unsigned i;
681 struct msr_autoload *m = &vmx->msr_autoload; 693 struct msr_autoload *m = &vmx->msr_autoload;
682 694
695 if (msr == MSR_EFER && cpu_has_load_ia32_efer) {
696 vmcs_write64(GUEST_IA32_EFER, guest_val);
697 vmcs_write64(HOST_IA32_EFER, host_val);
698 vmcs_set_bits(VM_ENTRY_CONTROLS, VM_ENTRY_LOAD_IA32_EFER);
699 vmcs_set_bits(VM_EXIT_CONTROLS, VM_EXIT_LOAD_IA32_EFER);
700 return;
701 }
702
683 for (i = 0; i < m->nr; ++i) 703 for (i = 0; i < m->nr; ++i)
684 if (m->guest[i].index == msr) 704 if (m->guest[i].index == msr)
685 break; 705 break;
@@ -821,10 +841,9 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
821#endif 841#endif
822 842
823#ifdef CONFIG_X86_64 843#ifdef CONFIG_X86_64
824 if (is_long_mode(&vmx->vcpu)) { 844 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
825 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); 845 if (is_long_mode(&vmx->vcpu))
826 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); 846 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
827 }
828#endif 847#endif
829 for (i = 0; i < vmx->save_nmsrs; ++i) 848 for (i = 0; i < vmx->save_nmsrs; ++i)
830 kvm_set_shared_msr(vmx->guest_msrs[i].index, 849 kvm_set_shared_msr(vmx->guest_msrs[i].index,
@@ -839,23 +858,23 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
839 858
840 ++vmx->vcpu.stat.host_state_reload; 859 ++vmx->vcpu.stat.host_state_reload;
841 vmx->host_state.loaded = 0; 860 vmx->host_state.loaded = 0;
842 if (vmx->host_state.fs_reload_needed) 861#ifdef CONFIG_X86_64
843 loadsegment(fs, vmx->host_state.fs_sel); 862 if (is_long_mode(&vmx->vcpu))
863 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
864#endif
844 if (vmx->host_state.gs_ldt_reload_needed) { 865 if (vmx->host_state.gs_ldt_reload_needed) {
845 kvm_load_ldt(vmx->host_state.ldt_sel); 866 kvm_load_ldt(vmx->host_state.ldt_sel);
846#ifdef CONFIG_X86_64 867#ifdef CONFIG_X86_64
847 load_gs_index(vmx->host_state.gs_sel); 868 load_gs_index(vmx->host_state.gs_sel);
848 wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs);
849#else 869#else
850 loadsegment(gs, vmx->host_state.gs_sel); 870 loadsegment(gs, vmx->host_state.gs_sel);
851#endif 871#endif
852 } 872 }
873 if (vmx->host_state.fs_reload_needed)
874 loadsegment(fs, vmx->host_state.fs_sel);
853 reload_tss(); 875 reload_tss();
854#ifdef CONFIG_X86_64 876#ifdef CONFIG_X86_64
855 if (is_long_mode(&vmx->vcpu)) { 877 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
856 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
857 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
858 }
859#endif 878#endif
860 if (current_thread_info()->status & TS_USEDFPU) 879 if (current_thread_info()->status & TS_USEDFPU)
861 clts(); 880 clts();
@@ -1010,6 +1029,17 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
1010 vmx_set_interrupt_shadow(vcpu, 0); 1029 vmx_set_interrupt_shadow(vcpu, 0);
1011} 1030}
1012 1031
1032static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
1033{
1034 /* Ensure that we clear the HLT state in the VMCS. We don't need to
1035 * explicitly skip the instruction because if the HLT state is set, then
1036 * the instruction is already executing and RIP has already been
1037 * advanced. */
1038 if (!yield_on_hlt &&
1039 vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
1040 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
1041}
1042
1013static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, 1043static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
1014 bool has_error_code, u32 error_code, 1044 bool has_error_code, u32 error_code,
1015 bool reinject) 1045 bool reinject)
@@ -1036,6 +1066,7 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
1036 intr_info |= INTR_TYPE_HARD_EXCEPTION; 1066 intr_info |= INTR_TYPE_HARD_EXCEPTION;
1037 1067
1038 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); 1068 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
1069 vmx_clear_hlt(vcpu);
1039} 1070}
1040 1071
1041static bool vmx_rdtscp_supported(void) 1072static bool vmx_rdtscp_supported(void)
@@ -1306,8 +1337,11 @@ static __init int vmx_disabled_by_bios(void)
1306 && tboot_enabled()) 1337 && tboot_enabled())
1307 return 1; 1338 return 1;
1308 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX) 1339 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
1309 && !tboot_enabled()) 1340 && !tboot_enabled()) {
1341 printk(KERN_WARNING "kvm: disable TXT in the BIOS or "
1342 " activate TXT before enabling KVM\n");
1310 return 1; 1343 return 1;
1344 }
1311 } 1345 }
1312 1346
1313 return 0; 1347 return 0;
@@ -1401,6 +1435,14 @@ static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
1401 return 0; 1435 return 0;
1402} 1436}
1403 1437
1438static __init bool allow_1_setting(u32 msr, u32 ctl)
1439{
1440 u32 vmx_msr_low, vmx_msr_high;
1441
1442 rdmsr(msr, vmx_msr_low, vmx_msr_high);
1443 return vmx_msr_high & ctl;
1444}
1445
1404static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) 1446static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
1405{ 1447{
1406 u32 vmx_msr_low, vmx_msr_high; 1448 u32 vmx_msr_low, vmx_msr_high;
@@ -1417,7 +1459,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
1417 &_pin_based_exec_control) < 0) 1459 &_pin_based_exec_control) < 0)
1418 return -EIO; 1460 return -EIO;
1419 1461
1420 min = CPU_BASED_HLT_EXITING | 1462 min =
1421#ifdef CONFIG_X86_64 1463#ifdef CONFIG_X86_64
1422 CPU_BASED_CR8_LOAD_EXITING | 1464 CPU_BASED_CR8_LOAD_EXITING |
1423 CPU_BASED_CR8_STORE_EXITING | 1465 CPU_BASED_CR8_STORE_EXITING |
@@ -1430,6 +1472,10 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
1430 CPU_BASED_MWAIT_EXITING | 1472 CPU_BASED_MWAIT_EXITING |
1431 CPU_BASED_MONITOR_EXITING | 1473 CPU_BASED_MONITOR_EXITING |
1432 CPU_BASED_INVLPG_EXITING; 1474 CPU_BASED_INVLPG_EXITING;
1475
1476 if (yield_on_hlt)
1477 min |= CPU_BASED_HLT_EXITING;
1478
1433 opt = CPU_BASED_TPR_SHADOW | 1479 opt = CPU_BASED_TPR_SHADOW |
1434 CPU_BASED_USE_MSR_BITMAPS | 1480 CPU_BASED_USE_MSR_BITMAPS |
1435 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; 1481 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
@@ -1511,6 +1557,12 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
1511 vmcs_conf->vmexit_ctrl = _vmexit_control; 1557 vmcs_conf->vmexit_ctrl = _vmexit_control;
1512 vmcs_conf->vmentry_ctrl = _vmentry_control; 1558 vmcs_conf->vmentry_ctrl = _vmentry_control;
1513 1559
1560 cpu_has_load_ia32_efer =
1561 allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS,
1562 VM_ENTRY_LOAD_IA32_EFER)
1563 && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS,
1564 VM_EXIT_LOAD_IA32_EFER);
1565
1514 return 0; 1566 return 0;
1515} 1567}
1516 1568
@@ -1684,9 +1736,13 @@ static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
1684 save->limit = vmcs_read32(sf->limit); 1736 save->limit = vmcs_read32(sf->limit);
1685 save->ar = vmcs_read32(sf->ar_bytes); 1737 save->ar = vmcs_read32(sf->ar_bytes);
1686 vmcs_write16(sf->selector, save->base >> 4); 1738 vmcs_write16(sf->selector, save->base >> 4);
1687 vmcs_write32(sf->base, save->base & 0xfffff); 1739 vmcs_write32(sf->base, save->base & 0xffff0);
1688 vmcs_write32(sf->limit, 0xffff); 1740 vmcs_write32(sf->limit, 0xffff);
1689 vmcs_write32(sf->ar_bytes, 0xf3); 1741 vmcs_write32(sf->ar_bytes, 0xf3);
1742 if (save->base & 0xf)
1743 printk_once(KERN_WARNING "kvm: segment base is not paragraph"
1744 " aligned when entering protected mode (seg=%d)",
1745 seg);
1690} 1746}
1691 1747
1692static void enter_rmode(struct kvm_vcpu *vcpu) 1748static void enter_rmode(struct kvm_vcpu *vcpu)
@@ -1815,6 +1871,13 @@ static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
1815 vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits; 1871 vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits;
1816} 1872}
1817 1873
1874static void vmx_decache_cr3(struct kvm_vcpu *vcpu)
1875{
1876 if (enable_ept && is_paging(vcpu))
1877 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
1878 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
1879}
1880
1818static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) 1881static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
1819{ 1882{
1820 ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits; 1883 ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
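
The new vmx_decache_cr3() and the VCPU_EXREG_CR3 bit it sets support lazy CR3 reads: GUEST_CR3 is only read back from the VMCS when someone actually asks for it. The accessor itself lives in kvm_cache_regs.h (also touched by this series but not shown here); given how kvm_read_cr3() is used throughout the rest of the diff, it presumably amounts to something like:

static inline ulong kvm_read_cr3(struct kvm_vcpu *vcpu)
{
	/* Refresh the cached value via the vendor hook (vmx_decache_cr3 /
	 * svm_decache_cr3) if CR3 has not been read since the last exit. */
	if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
		kvm_x86_ops->decache_cr3(vcpu);
	return vcpu->arch.cr3;
}
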
@@ -1858,6 +1921,7 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
1858 unsigned long cr0, 1921 unsigned long cr0,
1859 struct kvm_vcpu *vcpu) 1922 struct kvm_vcpu *vcpu)
1860{ 1923{
1924 vmx_decache_cr3(vcpu);
1861 if (!(cr0 & X86_CR0_PG)) { 1925 if (!(cr0 & X86_CR0_PG)) {
1862 /* From paging/starting to nonpaging */ 1926 /* From paging/starting to nonpaging */
1863 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, 1927 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
@@ -1938,7 +2002,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
1938 if (enable_ept) { 2002 if (enable_ept) {
1939 eptp = construct_eptp(cr3); 2003 eptp = construct_eptp(cr3);
1940 vmcs_write64(EPT_POINTER, eptp); 2004 vmcs_write64(EPT_POINTER, eptp);
1941 guest_cr3 = is_paging(vcpu) ? vcpu->arch.cr3 : 2005 guest_cr3 = is_paging(vcpu) ? kvm_read_cr3(vcpu) :
1942 vcpu->kvm->arch.ept_identity_map_addr; 2006 vcpu->kvm->arch.ept_identity_map_addr;
1943 ept_load_pdptrs(vcpu); 2007 ept_load_pdptrs(vcpu);
1944 } 2008 }
@@ -2726,7 +2790,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2726 vmcs_writel(GUEST_IDTR_BASE, 0); 2790 vmcs_writel(GUEST_IDTR_BASE, 0);
2727 vmcs_write32(GUEST_IDTR_LIMIT, 0xffff); 2791 vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
2728 2792
2729 vmcs_write32(GUEST_ACTIVITY_STATE, 0); 2793 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
2730 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); 2794 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
2731 vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0); 2795 vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
2732 2796
@@ -2788,6 +2852,10 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
2788 return; 2852 return;
2789 } 2853 }
2790 2854
2855 if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
2856 enable_irq_window(vcpu);
2857 return;
2858 }
2791 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 2859 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
2792 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING; 2860 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING;
2793 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); 2861 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
@@ -2815,6 +2883,7 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu)
2815 } else 2883 } else
2816 intr |= INTR_TYPE_EXT_INTR; 2884 intr |= INTR_TYPE_EXT_INTR;
2817 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr); 2885 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
2886 vmx_clear_hlt(vcpu);
2818} 2887}
2819 2888
2820static void vmx_inject_nmi(struct kvm_vcpu *vcpu) 2889static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
@@ -2842,6 +2911,7 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
2842 } 2911 }
2843 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 2912 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2844 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); 2913 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
2914 vmx_clear_hlt(vcpu);
2845} 2915}
2846 2916
2847static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) 2917static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
@@ -2850,7 +2920,8 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
2850 return 0; 2920 return 0;
2851 2921
2852 return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 2922 return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
2853 (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_NMI)); 2923 (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI
2924 | GUEST_INTR_STATE_NMI));
2854} 2925}
2855 2926
2856static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) 2927static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
@@ -2911,7 +2982,7 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
2911 * Cause the #SS fault with 0 error code in VM86 mode. 2982 * Cause the #SS fault with 0 error code in VM86 mode.
2912 */ 2983 */
2913 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) 2984 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0)
2914 if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE) 2985 if (emulate_instruction(vcpu, 0) == EMULATE_DONE)
2915 return 1; 2986 return 1;
2916 /* 2987 /*
2917 * Forward all other exceptions that are valid in real mode. 2988 * Forward all other exceptions that are valid in real mode.
@@ -3008,7 +3079,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
3008 } 3079 }
3009 3080
3010 if (is_invalid_opcode(intr_info)) { 3081 if (is_invalid_opcode(intr_info)) {
3011 er = emulate_instruction(vcpu, 0, 0, EMULTYPE_TRAP_UD); 3082 er = emulate_instruction(vcpu, EMULTYPE_TRAP_UD);
3012 if (er != EMULATE_DONE) 3083 if (er != EMULATE_DONE)
3013 kvm_queue_exception(vcpu, UD_VECTOR); 3084 kvm_queue_exception(vcpu, UD_VECTOR);
3014 return 1; 3085 return 1;
@@ -3027,7 +3098,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
3027 3098
3028 if (kvm_event_needs_reinjection(vcpu)) 3099 if (kvm_event_needs_reinjection(vcpu))
3029 kvm_mmu_unprotect_page_virt(vcpu, cr2); 3100 kvm_mmu_unprotect_page_virt(vcpu, cr2);
3030 return kvm_mmu_page_fault(vcpu, cr2, error_code); 3101 return kvm_mmu_page_fault(vcpu, cr2, error_code, NULL, 0);
3031 } 3102 }
3032 3103
3033 if (vmx->rmode.vm86_active && 3104 if (vmx->rmode.vm86_active &&
@@ -3099,7 +3170,7 @@ static int handle_io(struct kvm_vcpu *vcpu)
3099 ++vcpu->stat.io_exits; 3170 ++vcpu->stat.io_exits;
3100 3171
3101 if (string || in) 3172 if (string || in)
3102 return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE; 3173 return emulate_instruction(vcpu, 0) == EMULATE_DONE;
3103 3174
3104 port = exit_qualification >> 16; 3175 port = exit_qualification >> 16;
3105 size = (exit_qualification & 7) + 1; 3176 size = (exit_qualification & 7) + 1;
@@ -3119,14 +3190,6 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
3119 hypercall[2] = 0xc1; 3190 hypercall[2] = 0xc1;
3120} 3191}
3121 3192
3122static void complete_insn_gp(struct kvm_vcpu *vcpu, int err)
3123{
3124 if (err)
3125 kvm_inject_gp(vcpu, 0);
3126 else
3127 skip_emulated_instruction(vcpu);
3128}
3129
3130static int handle_cr(struct kvm_vcpu *vcpu) 3193static int handle_cr(struct kvm_vcpu *vcpu)
3131{ 3194{
3132 unsigned long exit_qualification, val; 3195 unsigned long exit_qualification, val;
@@ -3144,21 +3207,21 @@ static int handle_cr(struct kvm_vcpu *vcpu)
3144 switch (cr) { 3207 switch (cr) {
3145 case 0: 3208 case 0:
3146 err = kvm_set_cr0(vcpu, val); 3209 err = kvm_set_cr0(vcpu, val);
3147 complete_insn_gp(vcpu, err); 3210 kvm_complete_insn_gp(vcpu, err);
3148 return 1; 3211 return 1;
3149 case 3: 3212 case 3:
3150 err = kvm_set_cr3(vcpu, val); 3213 err = kvm_set_cr3(vcpu, val);
3151 complete_insn_gp(vcpu, err); 3214 kvm_complete_insn_gp(vcpu, err);
3152 return 1; 3215 return 1;
3153 case 4: 3216 case 4:
3154 err = kvm_set_cr4(vcpu, val); 3217 err = kvm_set_cr4(vcpu, val);
3155 complete_insn_gp(vcpu, err); 3218 kvm_complete_insn_gp(vcpu, err);
3156 return 1; 3219 return 1;
3157 case 8: { 3220 case 8: {
3158 u8 cr8_prev = kvm_get_cr8(vcpu); 3221 u8 cr8_prev = kvm_get_cr8(vcpu);
3159 u8 cr8 = kvm_register_read(vcpu, reg); 3222 u8 cr8 = kvm_register_read(vcpu, reg);
3160 kvm_set_cr8(vcpu, cr8); 3223 err = kvm_set_cr8(vcpu, cr8);
3161 skip_emulated_instruction(vcpu); 3224 kvm_complete_insn_gp(vcpu, err);
3162 if (irqchip_in_kernel(vcpu->kvm)) 3225 if (irqchip_in_kernel(vcpu->kvm))
3163 return 1; 3226 return 1;
3164 if (cr8_prev <= cr8) 3227 if (cr8_prev <= cr8)
@@ -3177,8 +3240,9 @@ static int handle_cr(struct kvm_vcpu *vcpu)
3177 case 1: /*mov from cr*/ 3240 case 1: /*mov from cr*/
3178 switch (cr) { 3241 switch (cr) {
3179 case 3: 3242 case 3:
3180 kvm_register_write(vcpu, reg, vcpu->arch.cr3); 3243 val = kvm_read_cr3(vcpu);
3181 trace_kvm_cr_read(cr, vcpu->arch.cr3); 3244 kvm_register_write(vcpu, reg, val);
3245 trace_kvm_cr_read(cr, val);
3182 skip_emulated_instruction(vcpu); 3246 skip_emulated_instruction(vcpu);
3183 return 1; 3247 return 1;
3184 case 8: 3248 case 8:
@@ -3350,6 +3414,11 @@ static int handle_vmx_insn(struct kvm_vcpu *vcpu)
3350 return 1; 3414 return 1;
3351} 3415}
3352 3416
3417static int handle_invd(struct kvm_vcpu *vcpu)
3418{
3419 return emulate_instruction(vcpu, 0) == EMULATE_DONE;
3420}
3421
3353static int handle_invlpg(struct kvm_vcpu *vcpu) 3422static int handle_invlpg(struct kvm_vcpu *vcpu)
3354{ 3423{
3355 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 3424 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
@@ -3378,7 +3447,7 @@ static int handle_xsetbv(struct kvm_vcpu *vcpu)
3378 3447
3379static int handle_apic_access(struct kvm_vcpu *vcpu) 3448static int handle_apic_access(struct kvm_vcpu *vcpu)
3380{ 3449{
3381 return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE; 3450 return emulate_instruction(vcpu, 0) == EMULATE_DONE;
3382} 3451}
3383 3452
3384static int handle_task_switch(struct kvm_vcpu *vcpu) 3453static int handle_task_switch(struct kvm_vcpu *vcpu)
@@ -3477,7 +3546,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
3477 3546
3478 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); 3547 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
3479 trace_kvm_page_fault(gpa, exit_qualification); 3548 trace_kvm_page_fault(gpa, exit_qualification);
3480 return kvm_mmu_page_fault(vcpu, gpa & PAGE_MASK, 0); 3549 return kvm_mmu_page_fault(vcpu, gpa, exit_qualification & 0x3, NULL, 0);
3481} 3550}
3482 3551
3483static u64 ept_rsvd_mask(u64 spte, int level) 3552static u64 ept_rsvd_mask(u64 spte, int level)
@@ -3593,7 +3662,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
3593 && (kvm_get_rflags(&vmx->vcpu) & X86_EFLAGS_IF)) 3662 && (kvm_get_rflags(&vmx->vcpu) & X86_EFLAGS_IF))
3594 return handle_interrupt_window(&vmx->vcpu); 3663 return handle_interrupt_window(&vmx->vcpu);
3595 3664
3596 err = emulate_instruction(vcpu, 0, 0, 0); 3665 err = emulate_instruction(vcpu, 0);
3597 3666
3598 if (err == EMULATE_DO_MMIO) { 3667 if (err == EMULATE_DO_MMIO) {
3599 ret = 0; 3668 ret = 0;
@@ -3650,6 +3719,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
3650 [EXIT_REASON_MSR_WRITE] = handle_wrmsr, 3719 [EXIT_REASON_MSR_WRITE] = handle_wrmsr,
3651 [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, 3720 [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window,
3652 [EXIT_REASON_HLT] = handle_halt, 3721 [EXIT_REASON_HLT] = handle_halt,
3722 [EXIT_REASON_INVD] = handle_invd,
3653 [EXIT_REASON_INVLPG] = handle_invlpg, 3723 [EXIT_REASON_INVLPG] = handle_invlpg,
3654 [EXIT_REASON_VMCALL] = handle_vmcall, 3724 [EXIT_REASON_VMCALL] = handle_vmcall,
3655 [EXIT_REASON_VMCLEAR] = handle_vmx_insn, 3725 [EXIT_REASON_VMCLEAR] = handle_vmx_insn,
@@ -3677,6 +3747,12 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
3677static const int kvm_vmx_max_exit_handlers = 3747static const int kvm_vmx_max_exit_handlers =
3678 ARRAY_SIZE(kvm_vmx_exit_handlers); 3748 ARRAY_SIZE(kvm_vmx_exit_handlers);
3679 3749
3750static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
3751{
3752 *info1 = vmcs_readl(EXIT_QUALIFICATION);
3753 *info2 = vmcs_read32(VM_EXIT_INTR_INFO);
3754}
3755
3680/* 3756/*
3681 * The guest has exited. See if we can fix it or if we need userspace 3757 * The guest has exited. See if we can fix it or if we need userspace
3682 * assistance. 3758 * assistance.
@@ -3687,17 +3763,12 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
3687 u32 exit_reason = vmx->exit_reason; 3763 u32 exit_reason = vmx->exit_reason;
3688 u32 vectoring_info = vmx->idt_vectoring_info; 3764 u32 vectoring_info = vmx->idt_vectoring_info;
3689 3765
3690 trace_kvm_exit(exit_reason, vcpu); 3766 trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX);
3691 3767
3692 /* If guest state is invalid, start emulating */ 3768 /* If guest state is invalid, start emulating */
3693 if (vmx->emulation_required && emulate_invalid_guest_state) 3769 if (vmx->emulation_required && emulate_invalid_guest_state)
3694 return handle_invalid_guest_state(vcpu); 3770 return handle_invalid_guest_state(vcpu);
3695 3771
3696 /* Access CR3 don't cause VMExit in paging mode, so we need
3697 * to sync with guest real CR3. */
3698 if (enable_ept && is_paging(vcpu))
3699 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
3700
3701 if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) { 3772 if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
3702 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; 3773 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
3703 vcpu->run->fail_entry.hardware_entry_failure_reason 3774 vcpu->run->fail_entry.hardware_entry_failure_reason
@@ -4014,7 +4085,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
4014 ); 4085 );
4015 4086
4016 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP) 4087 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
4017 | (1 << VCPU_EXREG_PDPTR)); 4088 | (1 << VCPU_EXREG_PDPTR)
4089 | (1 << VCPU_EXREG_CR3));
4018 vcpu->arch.regs_dirty = 0; 4090 vcpu->arch.regs_dirty = 0;
4019 4091
4020 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); 4092 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
@@ -4228,11 +4300,6 @@ static int vmx_get_lpage_level(void)
4228 return PT_PDPE_LEVEL; 4300 return PT_PDPE_LEVEL;
4229} 4301}
4230 4302
4231static inline u32 bit(int bitno)
4232{
4233 return 1 << (bitno & 31);
4234}
4235
4236static void vmx_cpuid_update(struct kvm_vcpu *vcpu) 4303static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
4237{ 4304{
4238 struct kvm_cpuid_entry2 *best; 4305 struct kvm_cpuid_entry2 *best;
@@ -4286,6 +4353,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
4286 .get_cpl = vmx_get_cpl, 4353 .get_cpl = vmx_get_cpl,
4287 .get_cs_db_l_bits = vmx_get_cs_db_l_bits, 4354 .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
4288 .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits, 4355 .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits,
4356 .decache_cr3 = vmx_decache_cr3,
4289 .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits, 4357 .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
4290 .set_cr0 = vmx_set_cr0, 4358 .set_cr0 = vmx_set_cr0,
4291 .set_cr3 = vmx_set_cr3, 4359 .set_cr3 = vmx_set_cr3,
@@ -4326,7 +4394,9 @@ static struct kvm_x86_ops vmx_x86_ops = {
4326 .get_tdp_level = get_ept_level, 4394 .get_tdp_level = get_ept_level,
4327 .get_mt_mask = vmx_get_mt_mask, 4395 .get_mt_mask = vmx_get_mt_mask,
4328 4396
4397 .get_exit_info = vmx_get_exit_info,
4329 .exit_reasons_str = vmx_exit_reasons_str, 4398 .exit_reasons_str = vmx_exit_reasons_str,
4399
4330 .get_lpage_level = vmx_get_lpage_level, 4400 .get_lpage_level = vmx_get_lpage_level,
4331 4401
4332 .cpuid_update = vmx_cpuid_update, 4402 .cpuid_update = vmx_cpuid_update,
@@ -4402,8 +4472,6 @@ static int __init vmx_init(void)
4402 4472
4403 if (enable_ept) { 4473 if (enable_ept) {
4404 bypass_guest_pf = 0; 4474 bypass_guest_pf = 0;
4405 kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK |
4406 VMX_EPT_WRITABLE_MASK);
4407 kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull, 4475 kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
4408 VMX_EPT_EXECUTABLE_MASK); 4476 VMX_EPT_EXECUTABLE_MASK);
4409 kvm_enable_tdp(); 4477 kvm_enable_tdp();
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index cdac9e592aa5..bcc0efce85bf 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -43,6 +43,7 @@
43#include <linux/slab.h> 43#include <linux/slab.h>
44#include <linux/perf_event.h> 44#include <linux/perf_event.h>
45#include <linux/uaccess.h> 45#include <linux/uaccess.h>
46#include <linux/hash.h>
46#include <trace/events/kvm.h> 47#include <trace/events/kvm.h>
47 48
48#define CREATE_TRACE_POINTS 49#define CREATE_TRACE_POINTS
@@ -155,9 +156,11 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
155 156
156u64 __read_mostly host_xcr0; 157u64 __read_mostly host_xcr0;
157 158
158static inline u32 bit(int bitno) 159static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
159{ 160{
160 return 1 << (bitno & 31); 161 int i;
162 for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU); i++)
163 vcpu->arch.apf.gfns[i] = ~0;
161} 164}
162 165
163static void kvm_on_user_return(struct user_return_notifier *urn) 166static void kvm_on_user_return(struct user_return_notifier *urn)
@@ -331,23 +334,28 @@ void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
331} 334}
332EXPORT_SYMBOL_GPL(kvm_requeue_exception); 335EXPORT_SYMBOL_GPL(kvm_requeue_exception);
333 336
334void kvm_inject_page_fault(struct kvm_vcpu *vcpu) 337void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
335{ 338{
336 unsigned error_code = vcpu->arch.fault.error_code; 339 if (err)
340 kvm_inject_gp(vcpu, 0);
341 else
342 kvm_x86_ops->skip_emulated_instruction(vcpu);
343}
344EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
337 345
346void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
347{
338 ++vcpu->stat.pf_guest; 348 ++vcpu->stat.pf_guest;
339 vcpu->arch.cr2 = vcpu->arch.fault.address; 349 vcpu->arch.cr2 = fault->address;
340 kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); 350 kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
341} 351}
342 352
343void kvm_propagate_fault(struct kvm_vcpu *vcpu) 353void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
344{ 354{
345 if (mmu_is_nested(vcpu) && !vcpu->arch.fault.nested) 355 if (mmu_is_nested(vcpu) && !fault->nested_page_fault)
346 vcpu->arch.nested_mmu.inject_page_fault(vcpu); 356 vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault);
347 else 357 else
348 vcpu->arch.mmu.inject_page_fault(vcpu); 358 vcpu->arch.mmu.inject_page_fault(vcpu, fault);
349
350 vcpu->arch.fault.nested = false;
351} 359}
352 360
353void kvm_inject_nmi(struct kvm_vcpu *vcpu) 361void kvm_inject_nmi(struct kvm_vcpu *vcpu)
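
This hunk, like the emulator changes further down, replaces the old error-code-pointer plumbing with a struct x86_exception passed by reference. The structure is defined in the emulator headers rather than in this excerpt; from the fields used here (vector, error_code_valid, error_code, address, nested_page_fault) it is presumably shaped roughly like this:

struct x86_exception {
	u8 vector;		/* e.g. PF_VECTOR, GP_VECTOR */
	bool error_code_valid;
	u16 error_code;
	bool nested_page_fault;	/* fault taken during a nested walk */
	u64 address;		/* cr2, or nested-fault gpa */
};
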
@@ -465,8 +473,8 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu)
465 (unsigned long *)&vcpu->arch.regs_avail)) 473 (unsigned long *)&vcpu->arch.regs_avail))
466 return true; 474 return true;
467 475
468 gfn = (vcpu->arch.cr3 & ~31u) >> PAGE_SHIFT; 476 gfn = (kvm_read_cr3(vcpu) & ~31u) >> PAGE_SHIFT;
469 offset = (vcpu->arch.cr3 & ~31u) & (PAGE_SIZE - 1); 477 offset = (kvm_read_cr3(vcpu) & ~31u) & (PAGE_SIZE - 1);
470 r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte), 478 r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
471 PFERR_USER_MASK | PFERR_WRITE_MASK); 479 PFERR_USER_MASK | PFERR_WRITE_MASK);
472 if (r < 0) 480 if (r < 0)
@@ -511,12 +519,15 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
511 } else 519 } else
512#endif 520#endif
513 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, 521 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
514 vcpu->arch.cr3)) 522 kvm_read_cr3(vcpu)))
515 return 1; 523 return 1;
516 } 524 }
517 525
518 kvm_x86_ops->set_cr0(vcpu, cr0); 526 kvm_x86_ops->set_cr0(vcpu, cr0);
519 527
528 if ((cr0 ^ old_cr0) & X86_CR0_PG)
529 kvm_clear_async_pf_completion_queue(vcpu);
530
520 if ((cr0 ^ old_cr0) & update_bits) 531 if ((cr0 ^ old_cr0) & update_bits)
521 kvm_mmu_reset_context(vcpu); 532 kvm_mmu_reset_context(vcpu);
522 return 0; 533 return 0;
@@ -600,7 +611,8 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
600 return 1; 611 return 1;
601 } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) 612 } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
602 && ((cr4 ^ old_cr4) & pdptr_bits) 613 && ((cr4 ^ old_cr4) & pdptr_bits)
603 && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3)) 614 && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
615 kvm_read_cr3(vcpu)))
604 return 1; 616 return 1;
605 617
606 if (cr4 & X86_CR4_VMXE) 618 if (cr4 & X86_CR4_VMXE)
@@ -620,7 +632,7 @@ EXPORT_SYMBOL_GPL(kvm_set_cr4);
620 632
621int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) 633int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
622{ 634{
623 if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) { 635 if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
624 kvm_mmu_sync_roots(vcpu); 636 kvm_mmu_sync_roots(vcpu);
625 kvm_mmu_flush_tlb(vcpu); 637 kvm_mmu_flush_tlb(vcpu);
626 return 0; 638 return 0;
@@ -655,12 +667,13 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
655 if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT))) 667 if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
656 return 1; 668 return 1;
657 vcpu->arch.cr3 = cr3; 669 vcpu->arch.cr3 = cr3;
670 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
658 vcpu->arch.mmu.new_cr3(vcpu); 671 vcpu->arch.mmu.new_cr3(vcpu);
659 return 0; 672 return 0;
660} 673}
661EXPORT_SYMBOL_GPL(kvm_set_cr3); 674EXPORT_SYMBOL_GPL(kvm_set_cr3);
662 675
663int __kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) 676int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
664{ 677{
665 if (cr8 & CR8_RESERVED_BITS) 678 if (cr8 & CR8_RESERVED_BITS)
666 return 1; 679 return 1;
@@ -670,12 +683,6 @@ int __kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
670 vcpu->arch.cr8 = cr8; 683 vcpu->arch.cr8 = cr8;
671 return 0; 684 return 0;
672} 685}
673
674void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
675{
676 if (__kvm_set_cr8(vcpu, cr8))
677 kvm_inject_gp(vcpu, 0);
678}
679EXPORT_SYMBOL_GPL(kvm_set_cr8); 686EXPORT_SYMBOL_GPL(kvm_set_cr8);
680 687
681unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) 688unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
@@ -780,12 +787,12 @@ EXPORT_SYMBOL_GPL(kvm_get_dr);
780 * kvm-specific. Those are put in the beginning of the list. 787 * kvm-specific. Those are put in the beginning of the list.
781 */ 788 */
782 789
783#define KVM_SAVE_MSRS_BEGIN 7 790#define KVM_SAVE_MSRS_BEGIN 8
784static u32 msrs_to_save[] = { 791static u32 msrs_to_save[] = {
785 MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, 792 MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
786 MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, 793 MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
787 HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, 794 HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
788 HV_X64_MSR_APIC_ASSIST_PAGE, 795 HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN,
789 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, 796 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
790 MSR_STAR, 797 MSR_STAR,
791#ifdef CONFIG_X86_64 798#ifdef CONFIG_X86_64
@@ -835,7 +842,6 @@ static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
835 kvm_x86_ops->set_efer(vcpu, efer); 842 kvm_x86_ops->set_efer(vcpu, efer);
836 843
837 vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled; 844 vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
838 kvm_mmu_reset_context(vcpu);
839 845
840 /* Update reserved bits */ 846 /* Update reserved bits */
841 if ((efer ^ old_efer) & EFER_NX) 847 if ((efer ^ old_efer) & EFER_NX)
@@ -981,7 +987,7 @@ static inline u64 nsec_to_cycles(u64 nsec)
981 if (kvm_tsc_changes_freq()) 987 if (kvm_tsc_changes_freq())
982 printk_once(KERN_WARNING 988 printk_once(KERN_WARNING
983 "kvm: unreliable cycle conversion on adjustable rate TSC\n"); 989 "kvm: unreliable cycle conversion on adjustable rate TSC\n");
984 ret = nsec * __get_cpu_var(cpu_tsc_khz); 990 ret = nsec * __this_cpu_read(cpu_tsc_khz);
985 do_div(ret, USEC_PER_SEC); 991 do_div(ret, USEC_PER_SEC);
986 return ret; 992 return ret;
987} 993}
@@ -1066,7 +1072,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
1066 local_irq_save(flags); 1072 local_irq_save(flags);
1067 kvm_get_msr(v, MSR_IA32_TSC, &tsc_timestamp); 1073 kvm_get_msr(v, MSR_IA32_TSC, &tsc_timestamp);
1068 kernel_ns = get_kernel_ns(); 1074 kernel_ns = get_kernel_ns();
1069 this_tsc_khz = __get_cpu_var(cpu_tsc_khz); 1075 this_tsc_khz = __this_cpu_read(cpu_tsc_khz);
1070 1076
1071 if (unlikely(this_tsc_khz == 0)) { 1077 if (unlikely(this_tsc_khz == 0)) {
1072 local_irq_restore(flags); 1078 local_irq_restore(flags);
@@ -1423,6 +1429,30 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1423 return 0; 1429 return 0;
1424} 1430}
1425 1431
1432static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
1433{
1434 gpa_t gpa = data & ~0x3f;
1435
1436 /* Bits 2:5 are reserved, should be zero */
1437 if (data & 0x3c)
1438 return 1;
1439
1440 vcpu->arch.apf.msr_val = data;
1441
1442 if (!(data & KVM_ASYNC_PF_ENABLED)) {
1443 kvm_clear_async_pf_completion_queue(vcpu);
1444 kvm_async_pf_hash_reset(vcpu);
1445 return 0;
1446 }
1447
1448 if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa))
1449 return 1;
1450
1451 vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS);
1452 kvm_async_pf_wakeup_all(vcpu);
1453 return 0;
1454}
1455
1426int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) 1456int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1427{ 1457{
1428 switch (msr) { 1458 switch (msr) {
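
kvm_pv_enable_async_pf() above is the host side of the new MSR_KVM_ASYNC_PF_EN interface: bits 0 and 1 of the written value are KVM_ASYNC_PF_ENABLED and KVM_ASYNC_PF_SEND_ALWAYS, bits 2:5 must be zero, and the remaining bits give the guest-physical address of a shared data area, which therefore has to be 64-byte aligned. Purely as an illustration (the guest-side code is not part of this diff, and the structure below is made up), a guest could enable the feature like so:

/* Hypothetical shared area; the real layout is defined in the kvm_para
 * headers, not in this patch. Only the 64-byte alignment matters here,
 * since the low 6 bits of the MSR value carry flags. */
static struct {
	u32 reason;
	u8 pad[60];
} apf_data __aligned(64);

static void enable_kvm_async_pf(void)
{
	wrmsrl(MSR_KVM_ASYNC_PF_EN, __pa(&apf_data) | KVM_ASYNC_PF_ENABLED);
}
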
@@ -1504,6 +1534,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1504 } 1534 }
1505 break; 1535 break;
1506 } 1536 }
1537 case MSR_KVM_ASYNC_PF_EN:
1538 if (kvm_pv_enable_async_pf(vcpu, data))
1539 return 1;
1540 break;
1507 case MSR_IA32_MCG_CTL: 1541 case MSR_IA32_MCG_CTL:
1508 case MSR_IA32_MCG_STATUS: 1542 case MSR_IA32_MCG_STATUS:
1509 case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: 1543 case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
@@ -1780,6 +1814,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1780 case MSR_KVM_SYSTEM_TIME_NEW: 1814 case MSR_KVM_SYSTEM_TIME_NEW:
1781 data = vcpu->arch.time; 1815 data = vcpu->arch.time;
1782 break; 1816 break;
1817 case MSR_KVM_ASYNC_PF_EN:
1818 data = vcpu->arch.apf.msr_val;
1819 break;
1783 case MSR_IA32_P5_MC_ADDR: 1820 case MSR_IA32_P5_MC_ADDR:
1784 case MSR_IA32_P5_MC_TYPE: 1821 case MSR_IA32_P5_MC_TYPE:
1785 case MSR_IA32_MCG_CAP: 1822 case MSR_IA32_MCG_CAP:
@@ -1909,6 +1946,7 @@ int kvm_dev_ioctl_check_extension(long ext)
1909 case KVM_CAP_NOP_IO_DELAY: 1946 case KVM_CAP_NOP_IO_DELAY:
1910 case KVM_CAP_MP_STATE: 1947 case KVM_CAP_MP_STATE:
1911 case KVM_CAP_SYNC_MMU: 1948 case KVM_CAP_SYNC_MMU:
1949 case KVM_CAP_USER_NMI:
1912 case KVM_CAP_REINJECT_CONTROL: 1950 case KVM_CAP_REINJECT_CONTROL:
1913 case KVM_CAP_IRQ_INJECT_STATUS: 1951 case KVM_CAP_IRQ_INJECT_STATUS:
1914 case KVM_CAP_ASSIGN_DEV_IRQ: 1952 case KVM_CAP_ASSIGN_DEV_IRQ:
@@ -1927,6 +1965,7 @@ int kvm_dev_ioctl_check_extension(long ext)
1927 case KVM_CAP_DEBUGREGS: 1965 case KVM_CAP_DEBUGREGS:
1928 case KVM_CAP_X86_ROBUST_SINGLESTEP: 1966 case KVM_CAP_X86_ROBUST_SINGLESTEP:
1929 case KVM_CAP_XSAVE: 1967 case KVM_CAP_XSAVE:
1968 case KVM_CAP_ASYNC_PF:
1930 r = 1; 1969 r = 1;
1931 break; 1970 break;
1932 case KVM_CAP_COALESCED_MMIO: 1971 case KVM_CAP_COALESCED_MMIO:
@@ -2190,6 +2229,11 @@ out:
2190 return r; 2229 return r;
2191} 2230}
2192 2231
2232static void cpuid_mask(u32 *word, int wordnum)
2233{
2234 *word &= boot_cpu_data.x86_capability[wordnum];
2235}
2236
2193static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, 2237static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
2194 u32 index) 2238 u32 index)
2195{ 2239{
@@ -2264,7 +2308,9 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
2264 break; 2308 break;
2265 case 1: 2309 case 1:
2266 entry->edx &= kvm_supported_word0_x86_features; 2310 entry->edx &= kvm_supported_word0_x86_features;
2311 cpuid_mask(&entry->edx, 0);
2267 entry->ecx &= kvm_supported_word4_x86_features; 2312 entry->ecx &= kvm_supported_word4_x86_features;
2313 cpuid_mask(&entry->ecx, 4);
2268 /* we support x2apic emulation even if host does not support 2314 /* we support x2apic emulation even if host does not support
2269 * it since we emulate x2apic in software */ 2315 * it since we emulate x2apic in software */
2270 entry->ecx |= F(X2APIC); 2316 entry->ecx |= F(X2APIC);
@@ -2355,7 +2401,9 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
2355 break; 2401 break;
2356 case 0x80000001: 2402 case 0x80000001:
2357 entry->edx &= kvm_supported_word1_x86_features; 2403 entry->edx &= kvm_supported_word1_x86_features;
2404 cpuid_mask(&entry->edx, 1);
2358 entry->ecx &= kvm_supported_word6_x86_features; 2405 entry->ecx &= kvm_supported_word6_x86_features;
2406 cpuid_mask(&entry->ecx, 6);
2359 break; 2407 break;
2360 } 2408 }
2361 2409
@@ -3174,20 +3222,18 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
3174 struct kvm_memslots *slots, *old_slots; 3222 struct kvm_memslots *slots, *old_slots;
3175 unsigned long *dirty_bitmap; 3223 unsigned long *dirty_bitmap;
3176 3224
3177 r = -ENOMEM; 3225 dirty_bitmap = memslot->dirty_bitmap_head;
3178 dirty_bitmap = vmalloc(n); 3226 if (memslot->dirty_bitmap == dirty_bitmap)
3179 if (!dirty_bitmap) 3227 dirty_bitmap += n / sizeof(long);
3180 goto out;
3181 memset(dirty_bitmap, 0, n); 3228 memset(dirty_bitmap, 0, n);
3182 3229
3183 r = -ENOMEM; 3230 r = -ENOMEM;
3184 slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); 3231 slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
3185 if (!slots) { 3232 if (!slots)
3186 vfree(dirty_bitmap);
3187 goto out; 3233 goto out;
3188 }
3189 memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); 3234 memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
3190 slots->memslots[log->slot].dirty_bitmap = dirty_bitmap; 3235 slots->memslots[log->slot].dirty_bitmap = dirty_bitmap;
3236 slots->generation++;
3191 3237
3192 old_slots = kvm->memslots; 3238 old_slots = kvm->memslots;
3193 rcu_assign_pointer(kvm->memslots, slots); 3239 rcu_assign_pointer(kvm->memslots, slots);
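
The dirty-log rework above drops the per-call vmalloc() and instead assumes a double-buffered dirty bitmap: memslot->dirty_bitmap_head points at an allocation of twice the bitmap size, and memslot->dirty_bitmap at whichever half is currently live. Only the flip is visible in this file; a sketch of the selection it performs (the allocation of the two halves happens in the generic KVM code, outside this diff):

/* Pick the half that is not currently published: it gets zeroed, installed
 * through the new kvm_memslots copy, and the retired half can then be
 * copied to userspace without any temporary allocation. */
static unsigned long *dirty_bitmap_spare_half(struct kvm_memory_slot *slot,
					      unsigned long bytes)
{
	unsigned long *half = slot->dirty_bitmap_head;

	if (slot->dirty_bitmap == half)
		half += bytes / sizeof(long);
	return half;
}
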
@@ -3200,11 +3246,8 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
3200 spin_unlock(&kvm->mmu_lock); 3246 spin_unlock(&kvm->mmu_lock);
3201 3247
3202 r = -EFAULT; 3248 r = -EFAULT;
3203 if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) { 3249 if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n))
3204 vfree(dirty_bitmap);
3205 goto out; 3250 goto out;
3206 }
3207 vfree(dirty_bitmap);
3208 } else { 3251 } else {
3209 r = -EFAULT; 3252 r = -EFAULT;
3210 if (clear_user(log->dirty_bitmap, n)) 3253 if (clear_user(log->dirty_bitmap, n))
@@ -3271,8 +3314,10 @@ long kvm_arch_vm_ioctl(struct file *filp,
3271 if (vpic) { 3314 if (vpic) {
3272 r = kvm_ioapic_init(kvm); 3315 r = kvm_ioapic_init(kvm);
3273 if (r) { 3316 if (r) {
3317 mutex_lock(&kvm->slots_lock);
3274 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, 3318 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
3275 &vpic->dev); 3319 &vpic->dev);
3320 mutex_unlock(&kvm->slots_lock);
3276 kfree(vpic); 3321 kfree(vpic);
3277 goto create_irqchip_unlock; 3322 goto create_irqchip_unlock;
3278 } 3323 }
@@ -3283,10 +3328,12 @@ long kvm_arch_vm_ioctl(struct file *filp,
3283 smp_wmb(); 3328 smp_wmb();
3284 r = kvm_setup_default_irq_routing(kvm); 3329 r = kvm_setup_default_irq_routing(kvm);
3285 if (r) { 3330 if (r) {
3331 mutex_lock(&kvm->slots_lock);
3286 mutex_lock(&kvm->irq_lock); 3332 mutex_lock(&kvm->irq_lock);
3287 kvm_ioapic_destroy(kvm); 3333 kvm_ioapic_destroy(kvm);
3288 kvm_destroy_pic(kvm); 3334 kvm_destroy_pic(kvm);
3289 mutex_unlock(&kvm->irq_lock); 3335 mutex_unlock(&kvm->irq_lock);
3336 mutex_unlock(&kvm->slots_lock);
3290 } 3337 }
3291 create_irqchip_unlock: 3338 create_irqchip_unlock:
3292 mutex_unlock(&kvm->lock); 3339 mutex_unlock(&kvm->lock);
@@ -3562,63 +3609,63 @@ static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
3562static gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access) 3609static gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
3563{ 3610{
3564 gpa_t t_gpa; 3611 gpa_t t_gpa;
3565 u32 error; 3612 struct x86_exception exception;
3566 3613
3567 BUG_ON(!mmu_is_nested(vcpu)); 3614 BUG_ON(!mmu_is_nested(vcpu));
3568 3615
3569 /* NPT walks are always user-walks */ 3616 /* NPT walks are always user-walks */
3570 access |= PFERR_USER_MASK; 3617 access |= PFERR_USER_MASK;
3571 t_gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, &error); 3618 t_gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, &exception);
3572 if (t_gpa == UNMAPPED_GVA)
3573 vcpu->arch.fault.nested = true;
3574 3619
3575 return t_gpa; 3620 return t_gpa;
3576} 3621}
3577 3622
3578gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) 3623gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
3624 struct x86_exception *exception)
3579{ 3625{
3580 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; 3626 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3581 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error); 3627 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
3582} 3628}
3583 3629
3584 gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) 3630 gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
3631 struct x86_exception *exception)
3585{ 3632{
3586 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; 3633 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3587 access |= PFERR_FETCH_MASK; 3634 access |= PFERR_FETCH_MASK;
3588 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error); 3635 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
3589} 3636}
3590 3637
3591gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) 3638gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
3639 struct x86_exception *exception)
3592{ 3640{
3593 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; 3641 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3594 access |= PFERR_WRITE_MASK; 3642 access |= PFERR_WRITE_MASK;
3595 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error); 3643 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
3596} 3644}
3597 3645
3598/* uses this to access any guest's mapped memory without checking CPL */ 3646/* uses this to access any guest's mapped memory without checking CPL */
3599gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) 3647gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
3648 struct x86_exception *exception)
3600{ 3649{
3601 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, error); 3650 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, exception);
3602} 3651}
3603 3652
3604static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, 3653static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
3605 struct kvm_vcpu *vcpu, u32 access, 3654 struct kvm_vcpu *vcpu, u32 access,
3606 u32 *error) 3655 struct x86_exception *exception)
3607{ 3656{
3608 void *data = val; 3657 void *data = val;
3609 int r = X86EMUL_CONTINUE; 3658 int r = X86EMUL_CONTINUE;
3610 3659
3611 while (bytes) { 3660 while (bytes) {
3612 gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access, 3661 gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access,
3613 error); 3662 exception);
3614 unsigned offset = addr & (PAGE_SIZE-1); 3663 unsigned offset = addr & (PAGE_SIZE-1);
3615 unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset); 3664 unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
3616 int ret; 3665 int ret;
3617 3666
3618 if (gpa == UNMAPPED_GVA) { 3667 if (gpa == UNMAPPED_GVA)
3619 r = X86EMUL_PROPAGATE_FAULT; 3668 return X86EMUL_PROPAGATE_FAULT;
3620 goto out;
3621 }
3622 ret = kvm_read_guest(vcpu->kvm, gpa, data, toread); 3669 ret = kvm_read_guest(vcpu->kvm, gpa, data, toread);
3623 if (ret < 0) { 3670 if (ret < 0) {
3624 r = X86EMUL_IO_NEEDED; 3671 r = X86EMUL_IO_NEEDED;
@@ -3635,31 +3682,35 @@ out:
3635 3682
3636/* used for instruction fetching */ 3683/* used for instruction fetching */
3637static int kvm_fetch_guest_virt(gva_t addr, void *val, unsigned int bytes, 3684static int kvm_fetch_guest_virt(gva_t addr, void *val, unsigned int bytes,
3638 struct kvm_vcpu *vcpu, u32 *error) 3685 struct kvm_vcpu *vcpu,
3686 struct x86_exception *exception)
3639{ 3687{
3640 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; 3688 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3641 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 3689 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu,
3642 access | PFERR_FETCH_MASK, error); 3690 access | PFERR_FETCH_MASK,
3691 exception);
3643} 3692}
3644 3693
3645static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, 3694static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
3646 struct kvm_vcpu *vcpu, u32 *error) 3695 struct kvm_vcpu *vcpu,
3696 struct x86_exception *exception)
3647{ 3697{
3648 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; 3698 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3649 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, 3699 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
3650 error); 3700 exception);
3651} 3701}
3652 3702
3653static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes, 3703static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes,
3654 struct kvm_vcpu *vcpu, u32 *error) 3704 struct kvm_vcpu *vcpu,
3705 struct x86_exception *exception)
3655{ 3706{
3656 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, error); 3707 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception);
3657} 3708}
3658 3709
3659static int kvm_write_guest_virt_system(gva_t addr, void *val, 3710static int kvm_write_guest_virt_system(gva_t addr, void *val,
3660 unsigned int bytes, 3711 unsigned int bytes,
3661 struct kvm_vcpu *vcpu, 3712 struct kvm_vcpu *vcpu,
3662 u32 *error) 3713 struct x86_exception *exception)
3663{ 3714{
3664 void *data = val; 3715 void *data = val;
3665 int r = X86EMUL_CONTINUE; 3716 int r = X86EMUL_CONTINUE;
@@ -3667,15 +3718,13 @@ static int kvm_write_guest_virt_system(gva_t addr, void *val,
3667 while (bytes) { 3718 while (bytes) {
3668 gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, 3719 gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr,
3669 PFERR_WRITE_MASK, 3720 PFERR_WRITE_MASK,
3670 error); 3721 exception);
3671 unsigned offset = addr & (PAGE_SIZE-1); 3722 unsigned offset = addr & (PAGE_SIZE-1);
3672 unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); 3723 unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
3673 int ret; 3724 int ret;
3674 3725
3675 if (gpa == UNMAPPED_GVA) { 3726 if (gpa == UNMAPPED_GVA)
3676 r = X86EMUL_PROPAGATE_FAULT; 3727 return X86EMUL_PROPAGATE_FAULT;
3677 goto out;
3678 }
3679 ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite); 3728 ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite);
3680 if (ret < 0) { 3729 if (ret < 0) {
3681 r = X86EMUL_IO_NEEDED; 3730 r = X86EMUL_IO_NEEDED;
@@ -3693,7 +3742,7 @@ out:
3693static int emulator_read_emulated(unsigned long addr, 3742static int emulator_read_emulated(unsigned long addr,
3694 void *val, 3743 void *val,
3695 unsigned int bytes, 3744 unsigned int bytes,
3696 unsigned int *error_code, 3745 struct x86_exception *exception,
3697 struct kvm_vcpu *vcpu) 3746 struct kvm_vcpu *vcpu)
3698{ 3747{
3699 gpa_t gpa; 3748 gpa_t gpa;
@@ -3706,7 +3755,7 @@ static int emulator_read_emulated(unsigned long addr,
3706 return X86EMUL_CONTINUE; 3755 return X86EMUL_CONTINUE;
3707 } 3756 }
3708 3757
3709 gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, error_code); 3758 gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, exception);
3710 3759
3711 if (gpa == UNMAPPED_GVA) 3760 if (gpa == UNMAPPED_GVA)
3712 return X86EMUL_PROPAGATE_FAULT; 3761 return X86EMUL_PROPAGATE_FAULT;
@@ -3715,8 +3764,8 @@ static int emulator_read_emulated(unsigned long addr,
3715 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 3764 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
3716 goto mmio; 3765 goto mmio;
3717 3766
3718 if (kvm_read_guest_virt(addr, val, bytes, vcpu, NULL) 3767 if (kvm_read_guest_virt(addr, val, bytes, vcpu, exception)
3719 == X86EMUL_CONTINUE) 3768 == X86EMUL_CONTINUE)
3720 return X86EMUL_CONTINUE; 3769 return X86EMUL_CONTINUE;
3721 3770
3722mmio: 3771mmio:
@@ -3740,7 +3789,7 @@ mmio:
3740} 3789}
3741 3790
3742int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, 3791int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
3743 const void *val, int bytes) 3792 const void *val, int bytes)
3744{ 3793{
3745 int ret; 3794 int ret;
3746 3795
@@ -3754,12 +3803,12 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
3754static int emulator_write_emulated_onepage(unsigned long addr, 3803static int emulator_write_emulated_onepage(unsigned long addr,
3755 const void *val, 3804 const void *val,
3756 unsigned int bytes, 3805 unsigned int bytes,
3757 unsigned int *error_code, 3806 struct x86_exception *exception,
3758 struct kvm_vcpu *vcpu) 3807 struct kvm_vcpu *vcpu)
3759{ 3808{
3760 gpa_t gpa; 3809 gpa_t gpa;
3761 3810
3762 gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error_code); 3811 gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);
3763 3812
3764 if (gpa == UNMAPPED_GVA) 3813 if (gpa == UNMAPPED_GVA)
3765 return X86EMUL_PROPAGATE_FAULT; 3814 return X86EMUL_PROPAGATE_FAULT;
@@ -3792,7 +3841,7 @@ mmio:
3792int emulator_write_emulated(unsigned long addr, 3841int emulator_write_emulated(unsigned long addr,
3793 const void *val, 3842 const void *val,
3794 unsigned int bytes, 3843 unsigned int bytes,
3795 unsigned int *error_code, 3844 struct x86_exception *exception,
3796 struct kvm_vcpu *vcpu) 3845 struct kvm_vcpu *vcpu)
3797{ 3846{
3798 /* Crossing a page boundary? */ 3847 /* Crossing a page boundary? */
@@ -3800,7 +3849,7 @@ int emulator_write_emulated(unsigned long addr,
3800 int rc, now; 3849 int rc, now;
3801 3850
3802 now = -addr & ~PAGE_MASK; 3851 now = -addr & ~PAGE_MASK;
3803 rc = emulator_write_emulated_onepage(addr, val, now, error_code, 3852 rc = emulator_write_emulated_onepage(addr, val, now, exception,
3804 vcpu); 3853 vcpu);
3805 if (rc != X86EMUL_CONTINUE) 3854 if (rc != X86EMUL_CONTINUE)
3806 return rc; 3855 return rc;
@@ -3808,7 +3857,7 @@ int emulator_write_emulated(unsigned long addr,
3808 val += now; 3857 val += now;
3809 bytes -= now; 3858 bytes -= now;
3810 } 3859 }
3811 return emulator_write_emulated_onepage(addr, val, bytes, error_code, 3860 return emulator_write_emulated_onepage(addr, val, bytes, exception,
3812 vcpu); 3861 vcpu);
3813} 3862}
3814 3863
@@ -3826,7 +3875,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
3826 const void *old, 3875 const void *old,
3827 const void *new, 3876 const void *new,
3828 unsigned int bytes, 3877 unsigned int bytes,
3829 unsigned int *error_code, 3878 struct x86_exception *exception,
3830 struct kvm_vcpu *vcpu) 3879 struct kvm_vcpu *vcpu)
3831{ 3880{
3832 gpa_t gpa; 3881 gpa_t gpa;
@@ -3884,7 +3933,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
3884emul_write: 3933emul_write:
3885 printk_once(KERN_WARNING "kvm: emulating exchange as write\n"); 3934 printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
3886 3935
3887 return emulator_write_emulated(addr, new, bytes, error_code, vcpu); 3936 return emulator_write_emulated(addr, new, bytes, exception, vcpu);
3888} 3937}
3889 3938
3890static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) 3939static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
@@ -3909,7 +3958,7 @@ static int emulator_pio_in_emulated(int size, unsigned short port, void *val,
3909 if (vcpu->arch.pio.count) 3958 if (vcpu->arch.pio.count)
3910 goto data_avail; 3959 goto data_avail;
3911 3960
3912 trace_kvm_pio(0, port, size, 1); 3961 trace_kvm_pio(0, port, size, count);
3913 3962
3914 vcpu->arch.pio.port = port; 3963 vcpu->arch.pio.port = port;
3915 vcpu->arch.pio.in = 1; 3964 vcpu->arch.pio.in = 1;
@@ -3937,7 +3986,7 @@ static int emulator_pio_out_emulated(int size, unsigned short port,
3937 const void *val, unsigned int count, 3986 const void *val, unsigned int count,
3938 struct kvm_vcpu *vcpu) 3987 struct kvm_vcpu *vcpu)
3939{ 3988{
3940 trace_kvm_pio(1, port, size, 1); 3989 trace_kvm_pio(1, port, size, count);
3941 3990
3942 vcpu->arch.pio.port = port; 3991 vcpu->arch.pio.port = port;
3943 vcpu->arch.pio.in = 0; 3992 vcpu->arch.pio.in = 0;
@@ -3978,13 +4027,15 @@ int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
3978 return X86EMUL_CONTINUE; 4027 return X86EMUL_CONTINUE;
3979 4028
3980 if (kvm_x86_ops->has_wbinvd_exit()) { 4029 if (kvm_x86_ops->has_wbinvd_exit()) {
3981 preempt_disable(); 4030 int cpu = get_cpu();
4031
4032 cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
3982 smp_call_function_many(vcpu->arch.wbinvd_dirty_mask, 4033 smp_call_function_many(vcpu->arch.wbinvd_dirty_mask,
3983 wbinvd_ipi, NULL, 1); 4034 wbinvd_ipi, NULL, 1);
3984 preempt_enable(); 4035 put_cpu();
3985 cpumask_clear(vcpu->arch.wbinvd_dirty_mask); 4036 cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
3986 } 4037 } else
3987 wbinvd(); 4038 wbinvd();
3988 return X86EMUL_CONTINUE; 4039 return X86EMUL_CONTINUE;
3989} 4040}
3990EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd); 4041EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd);
@@ -4024,7 +4075,7 @@ static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu)
4024 value = vcpu->arch.cr2; 4075 value = vcpu->arch.cr2;
4025 break; 4076 break;
4026 case 3: 4077 case 3:
4027 value = vcpu->arch.cr3; 4078 value = kvm_read_cr3(vcpu);
4028 break; 4079 break;
4029 case 4: 4080 case 4:
4030 value = kvm_read_cr4(vcpu); 4081 value = kvm_read_cr4(vcpu);
@@ -4058,7 +4109,7 @@ static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu)
4058 res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val)); 4109 res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
4059 break; 4110 break;
4060 case 8: 4111 case 8:
4061 res = __kvm_set_cr8(vcpu, val & 0xfUL); 4112 res = kvm_set_cr8(vcpu, val);
4062 break; 4113 break;
4063 default: 4114 default:
4064 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); 4115 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
@@ -4211,12 +4262,13 @@ static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
4211static void inject_emulated_exception(struct kvm_vcpu *vcpu) 4262static void inject_emulated_exception(struct kvm_vcpu *vcpu)
4212{ 4263{
4213 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; 4264 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
4214 if (ctxt->exception == PF_VECTOR) 4265 if (ctxt->exception.vector == PF_VECTOR)
4215 kvm_propagate_fault(vcpu); 4266 kvm_propagate_fault(vcpu, &ctxt->exception);
4216 else if (ctxt->error_code_valid) 4267 else if (ctxt->exception.error_code_valid)
4217 kvm_queue_exception_e(vcpu, ctxt->exception, ctxt->error_code); 4268 kvm_queue_exception_e(vcpu, ctxt->exception.vector,
4269 ctxt->exception.error_code);
4218 else 4270 else
4219 kvm_queue_exception(vcpu, ctxt->exception); 4271 kvm_queue_exception(vcpu, ctxt->exception.vector);
4220} 4272}
4221 4273
4222static void init_emulate_ctxt(struct kvm_vcpu *vcpu) 4274static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
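
The hunk above replaces the loose exception fields in the emulator context (vector, error_code, error_code_valid) with a single struct x86_exception, which also carries the faulting address and a nested-page-fault flag (the async-PF hunks further down fill those). A minimal userspace sketch of the same three-way dispatch done by inject_emulated_exception(); the struct layout and the printf helpers are simplifications, not the kernel definitions:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define PF_VECTOR 14

/* Simplified local mirror of the fields this diff gathers into one struct. */
struct toy_exception {
    unsigned int vector;
    bool         error_code_valid;
    unsigned int error_code;
    bool         nested_page_fault;  /* set by the async-PF code below */
    uint64_t     address;            /* faulting address for #PF */
};

/* printf stand-ins for kvm_propagate_fault()/kvm_queue_exception*(). */
static void propagate_fault(const struct toy_exception *f)
{
    printf("#PF, address=0x%llx, error code=0x%x\n",
           (unsigned long long)f->address, f->error_code);
}

static void queue_exception_e(unsigned int vec, unsigned int ec)
{
    printf("vector %u with error code 0x%x\n", vec, ec);
}

static void queue_exception(unsigned int vec)
{
    printf("vector %u without error code\n", vec);
}

/* Same three-way dispatch as inject_emulated_exception() in the hunk above. */
static void inject(const struct toy_exception *f)
{
    if (f->vector == PF_VECTOR)
        propagate_fault(f);
    else if (f->error_code_valid)
        queue_exception_e(f->vector, f->error_code);
    else
        queue_exception(f->vector);
}

int main(void)
{
    struct toy_exception pf = {
        .vector = PF_VECTOR,
        .error_code_valid = true,
        .error_code = 0x2,
        .address = 0xdeadb000,
    };
    struct toy_exception ud = { .vector = 6 };   /* #UD, no error code */

    inject(&pf);
    inject(&ud);
    return 0;
}
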
@@ -4272,13 +4324,19 @@ EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt);
4272 4324
4273static int handle_emulation_failure(struct kvm_vcpu *vcpu) 4325static int handle_emulation_failure(struct kvm_vcpu *vcpu)
4274{ 4326{
4327 int r = EMULATE_DONE;
4328
4275 ++vcpu->stat.insn_emulation_fail; 4329 ++vcpu->stat.insn_emulation_fail;
4276 trace_kvm_emulate_insn_failed(vcpu); 4330 trace_kvm_emulate_insn_failed(vcpu);
4277 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 4331 if (!is_guest_mode(vcpu)) {
4278 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; 4332 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
4279 vcpu->run->internal.ndata = 0; 4333 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
4334 vcpu->run->internal.ndata = 0;
4335 r = EMULATE_FAIL;
4336 }
4280 kvm_queue_exception(vcpu, UD_VECTOR); 4337 kvm_queue_exception(vcpu, UD_VECTOR);
4281 return EMULATE_FAIL; 4338
4339 return r;
4282} 4340}
4283 4341
4284static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva) 4342static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
@@ -4307,10 +4365,11 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
4307 return false; 4365 return false;
4308} 4366}
4309 4367
4310int emulate_instruction(struct kvm_vcpu *vcpu, 4368int x86_emulate_instruction(struct kvm_vcpu *vcpu,
4311 unsigned long cr2, 4369 unsigned long cr2,
4312 u16 error_code, 4370 int emulation_type,
4313 int emulation_type) 4371 void *insn,
4372 int insn_len)
4314{ 4373{
4315 int r; 4374 int r;
4316 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; 4375 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
@@ -4328,10 +4387,10 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
4328 if (!(emulation_type & EMULTYPE_NO_DECODE)) { 4387 if (!(emulation_type & EMULTYPE_NO_DECODE)) {
4329 init_emulate_ctxt(vcpu); 4388 init_emulate_ctxt(vcpu);
4330 vcpu->arch.emulate_ctxt.interruptibility = 0; 4389 vcpu->arch.emulate_ctxt.interruptibility = 0;
4331 vcpu->arch.emulate_ctxt.exception = -1; 4390 vcpu->arch.emulate_ctxt.have_exception = false;
4332 vcpu->arch.emulate_ctxt.perm_ok = false; 4391 vcpu->arch.emulate_ctxt.perm_ok = false;
4333 4392
4334 r = x86_decode_insn(&vcpu->arch.emulate_ctxt); 4393 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, insn, insn_len);
4335 if (r == X86EMUL_PROPAGATE_FAULT) 4394 if (r == X86EMUL_PROPAGATE_FAULT)
4336 goto done; 4395 goto done;
4337 4396
@@ -4394,7 +4453,7 @@ restart:
4394 } 4453 }
4395 4454
4396done: 4455done:
4397 if (vcpu->arch.emulate_ctxt.exception >= 0) { 4456 if (vcpu->arch.emulate_ctxt.have_exception) {
4398 inject_emulated_exception(vcpu); 4457 inject_emulated_exception(vcpu);
4399 r = EMULATE_DONE; 4458 r = EMULATE_DONE;
4400 } else if (vcpu->arch.pio.count) { 4459 } else if (vcpu->arch.pio.count) {
@@ -4418,7 +4477,7 @@ done:
4418 4477
4419 return r; 4478 return r;
4420} 4479}
4421EXPORT_SYMBOL_GPL(emulate_instruction); 4480EXPORT_SYMBOL_GPL(x86_emulate_instruction);
4422 4481
4423int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port) 4482int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port)
4424{ 4483{
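
The exported entry point becomes x86_emulate_instruction() and gains insn/insn_len, presumably so exits that already supply the instruction bytes (hardware decode assists) can hand them to the decoder instead of having it fetch from guest memory. The kvm_arch_vcpu_ioctl_run() hunk below still calls a two-argument emulate_instruction(), so a thin wrapper passing NULL/0 is assumed to exist in a header outside this diff. A toy userspace model of that calling convention; all names and the fake memory are illustrative only:

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Fake "guest memory" the fallback path reads from. */
static const uint8_t fake_guest_mem[16] = { 0x90 };    /* nop */

static int decode(const uint8_t *insn, size_t insn_len)
{
    uint8_t buf[15];

    if (insn == NULL || insn_len == 0) {
        /* No pre-fetched bytes: fetch from guest memory ourselves. */
        memcpy(buf, fake_guest_mem, sizeof(buf));
        insn = buf;
        insn_len = sizeof(buf);
        printf("decoder fetched %zu bytes itself\n", insn_len);
    } else {
        printf("decoder reused %zu pre-fetched bytes\n", insn_len);
    }
    return insn[0];
}

/* Mirrors the widened x86_emulate_instruction() parameter list. */
static int emulate_full(unsigned long cr2, int emulation_type,
                        const uint8_t *insn, size_t insn_len)
{
    (void)cr2;
    (void)emulation_type;
    return decode(insn, insn_len);
}

/* Assumed thin wrapper keeping the old two-argument form alive. */
static int emulate_short(int emulation_type)
{
    return emulate_full(0, emulation_type, NULL, 0);
}

int main(void)
{
    const uint8_t prefetched[] = { 0x90 };              /* nop */

    emulate_full(0, 0, prefetched, sizeof(prefetched)); /* bytes provided */
    emulate_short(0);                                   /* legacy path */
    return 0;
}
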
@@ -4432,7 +4491,7 @@ EXPORT_SYMBOL_GPL(kvm_fast_pio_out);
4432 4491
4433static void tsc_bad(void *info) 4492static void tsc_bad(void *info)
4434{ 4493{
4435 __get_cpu_var(cpu_tsc_khz) = 0; 4494 __this_cpu_write(cpu_tsc_khz, 0);
4436} 4495}
4437 4496
4438static void tsc_khz_changed(void *data) 4497static void tsc_khz_changed(void *data)
@@ -4446,7 +4505,7 @@ static void tsc_khz_changed(void *data)
4446 khz = cpufreq_quick_get(raw_smp_processor_id()); 4505 khz = cpufreq_quick_get(raw_smp_processor_id());
4447 if (!khz) 4506 if (!khz)
4448 khz = tsc_khz; 4507 khz = tsc_khz;
4449 __get_cpu_var(cpu_tsc_khz) = khz; 4508 __this_cpu_write(cpu_tsc_khz, khz);
4450} 4509}
4451 4510
4452static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val, 4511static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
@@ -4569,9 +4628,11 @@ static void kvm_timer_init(void)
4569#ifdef CONFIG_CPU_FREQ 4628#ifdef CONFIG_CPU_FREQ
4570 struct cpufreq_policy policy; 4629 struct cpufreq_policy policy;
4571 memset(&policy, 0, sizeof(policy)); 4630 memset(&policy, 0, sizeof(policy));
4572 cpufreq_get_policy(&policy, get_cpu()); 4631 cpu = get_cpu();
4632 cpufreq_get_policy(&policy, cpu);
4573 if (policy.cpuinfo.max_freq) 4633 if (policy.cpuinfo.max_freq)
4574 max_tsc_khz = policy.cpuinfo.max_freq; 4634 max_tsc_khz = policy.cpuinfo.max_freq;
4635 put_cpu();
4575#endif 4636#endif
4576 cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block, 4637 cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
4577 CPUFREQ_TRANSITION_NOTIFIER); 4638 CPUFREQ_TRANSITION_NOTIFIER);
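
In kvm_timer_init() the old code fed get_cpu() straight into cpufreq_get_policy() with no matching put_cpu(), so the preemption disable taken by get_cpu() was never dropped; the hunk keeps the cpu id and balances the pair, the same pattern used in the kvm_emulate_wbinvd() hunk further up. A toy counter makes the imbalance visible; toy_get_cpu()/toy_put_cpu() are stand-ins, not the kernel primitives:

#include <stdio.h>

static int preempt_count;

static int toy_get_cpu(void)
{
    preempt_count++;      /* real get_cpu() disables preemption */
    return 0;             /* pretend we run on CPU 0 */
}

static void toy_put_cpu(void)
{
    preempt_count--;      /* real put_cpu() re-enables preemption */
}

static void query_policy(int cpu)
{
    printf("querying cpufreq policy for cpu %d\n", cpu);
}

int main(void)
{
    /* Old pattern: the value returned by get_cpu() is consumed,
     * but nothing ever calls put_cpu(). */
    query_policy(toy_get_cpu());
    printf("old pattern: preempt_count=%d (leaked)\n", preempt_count);

    preempt_count = 0;

    /* New pattern from the hunk: keep the cpu id and balance the pair. */
    int cpu = toy_get_cpu();
    query_policy(cpu);
    toy_put_cpu();
    printf("new pattern: preempt_count=%d\n", preempt_count);
    return 0;
}
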
@@ -4656,7 +4717,6 @@ int kvm_arch_init(void *opaque)
4656 4717
4657 kvm_x86_ops = ops; 4718 kvm_x86_ops = ops;
4658 kvm_mmu_set_nonpresent_ptes(0ull, 0ull); 4719 kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
4659 kvm_mmu_set_base_ptes(PT_PRESENT_MASK);
4660 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, 4720 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
4661 PT_DIRTY_MASK, PT64_NX_MASK, 0); 4721 PT_DIRTY_MASK, PT64_NX_MASK, 0);
4662 4722
@@ -5119,6 +5179,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5119 vcpu->fpu_active = 0; 5179 vcpu->fpu_active = 0;
5120 kvm_x86_ops->fpu_deactivate(vcpu); 5180 kvm_x86_ops->fpu_deactivate(vcpu);
5121 } 5181 }
5182 if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) {
5183 /* Page is swapped out. Do synthetic halt */
5184 vcpu->arch.apf.halted = true;
5185 r = 1;
5186 goto out;
5187 }
5122 } 5188 }
5123 5189
5124 r = kvm_mmu_reload(vcpu); 5190 r = kvm_mmu_reload(vcpu);
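
vcpu_enter_guest() now turns a pending KVM_REQ_APF_HALT request, raised when the page the guest is touching is swapped out, into a synthetic halt instead of entering the guest. kvm_check_request() behaves like a test-and-clear on a per-vCPU request bitmap; a toy model of that pattern, with illustrative field and bit names:

#include <stdio.h>
#include <stdbool.h>

#define REQ_APF_HALT 0           /* bit number is illustrative only */

struct toy_vcpu {
    unsigned long requests;
    bool halted;
};

static void make_request(struct toy_vcpu *v, int req)
{
    v->requests |= 1UL << req;
}

/* Test-and-clear, like kvm_check_request() in the hunk above. */
static bool check_request(struct toy_vcpu *v, int req)
{
    if (v->requests & (1UL << req)) {
        v->requests &= ~(1UL << req);
        return true;
    }
    return false;
}

/* Convert a pending "page is swapped out" request into a synthetic halt. */
static int enter_guest(struct toy_vcpu *v)
{
    if (check_request(v, REQ_APF_HALT)) {
        v->halted = true;
        return 1;                /* back to the run loop */
    }
    printf("entering guest\n");
    return 1;
}

int main(void)
{
    struct toy_vcpu v = { 0 };

    make_request(&v, REQ_APF_HALT);
    enter_guest(&v);
    printf("halted=%d\n", v.halted);   /* 1: wait until the page is ready */
    return 0;
}
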
@@ -5247,7 +5313,8 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
5247 5313
5248 r = 1; 5314 r = 1;
5249 while (r > 0) { 5315 while (r > 0) {
5250 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) 5316 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
5317 !vcpu->arch.apf.halted)
5251 r = vcpu_enter_guest(vcpu); 5318 r = vcpu_enter_guest(vcpu);
5252 else { 5319 else {
5253 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); 5320 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
@@ -5260,6 +5327,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
5260 vcpu->arch.mp_state = 5327 vcpu->arch.mp_state =
5261 KVM_MP_STATE_RUNNABLE; 5328 KVM_MP_STATE_RUNNABLE;
5262 case KVM_MP_STATE_RUNNABLE: 5329 case KVM_MP_STATE_RUNNABLE:
5330 vcpu->arch.apf.halted = false;
5263 break; 5331 break;
5264 case KVM_MP_STATE_SIPI_RECEIVED: 5332 case KVM_MP_STATE_SIPI_RECEIVED:
5265 default: 5333 default:
@@ -5281,6 +5349,9 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
5281 vcpu->run->exit_reason = KVM_EXIT_INTR; 5349 vcpu->run->exit_reason = KVM_EXIT_INTR;
5282 ++vcpu->stat.request_irq_exits; 5350 ++vcpu->stat.request_irq_exits;
5283 } 5351 }
5352
5353 kvm_check_async_pf_completion(vcpu);
5354
5284 if (signal_pending(current)) { 5355 if (signal_pending(current)) {
5285 r = -EINTR; 5356 r = -EINTR;
5286 vcpu->run->exit_reason = KVM_EXIT_INTR; 5357 vcpu->run->exit_reason = KVM_EXIT_INTR;
@@ -5305,6 +5376,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
5305 int r; 5376 int r;
5306 sigset_t sigsaved; 5377 sigset_t sigsaved;
5307 5378
5379 if (!tsk_used_math(current) && init_fpu(current))
5380 return -ENOMEM;
5381
5308 if (vcpu->sigset_active) 5382 if (vcpu->sigset_active)
5309 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); 5383 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
5310 5384
@@ -5316,8 +5390,12 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
5316 } 5390 }
5317 5391
5318 /* re-sync apic's tpr */ 5392 /* re-sync apic's tpr */
5319 if (!irqchip_in_kernel(vcpu->kvm)) 5393 if (!irqchip_in_kernel(vcpu->kvm)) {
5320 kvm_set_cr8(vcpu, kvm_run->cr8); 5394 if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) {
5395 r = -EINVAL;
5396 goto out;
5397 }
5398 }
5321 5399
5322 if (vcpu->arch.pio.count || vcpu->mmio_needed) { 5400 if (vcpu->arch.pio.count || vcpu->mmio_needed) {
5323 if (vcpu->mmio_needed) { 5401 if (vcpu->mmio_needed) {
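
With the userspace irqchip, the TPR passed in kvm_run->cr8 is now validated: kvm_set_cr8() is expected to reject bad values itself (the emulator hunk above stopped masking with 0xf before calling it) and the run ioctl turns a failure into -EINVAL. A sketch of such a validating setter; the reserved-bit mask is an assumption:

#include <stdio.h>
#include <errno.h>

#define CR8_RESERVED_BITS (~0xfUL)   /* assumption: TPR is the low 4 bits */

struct toy_vcpu {
    unsigned long cr8;
};

/* Returns non-zero on failure, like the kvm_set_cr8() call in the hunk. */
static int set_cr8(struct toy_vcpu *vcpu, unsigned long cr8)
{
    if (cr8 & CR8_RESERVED_BITS)
        return 1;
    vcpu->cr8 = cr8;
    return 0;
}

int main(void)
{
    struct toy_vcpu v = { 0 };

    if (set_cr8(&v, 0x5))
        printf("unexpected failure\n");
    if (set_cr8(&v, 0x1f))
        printf("rejected 0x1f -> ioctl would return %d\n", -EINVAL);
    return 0;
}
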
@@ -5326,7 +5404,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
5326 vcpu->mmio_needed = 0; 5404 vcpu->mmio_needed = 0;
5327 } 5405 }
5328 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); 5406 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
5329 r = emulate_instruction(vcpu, 0, 0, EMULTYPE_NO_DECODE); 5407 r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
5330 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); 5408 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
5331 if (r != EMULATE_DONE) { 5409 if (r != EMULATE_DONE) {
5332 r = 0; 5410 r = 0;
@@ -5439,7 +5517,7 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
5439 5517
5440 sregs->cr0 = kvm_read_cr0(vcpu); 5518 sregs->cr0 = kvm_read_cr0(vcpu);
5441 sregs->cr2 = vcpu->arch.cr2; 5519 sregs->cr2 = vcpu->arch.cr2;
5442 sregs->cr3 = vcpu->arch.cr3; 5520 sregs->cr3 = kvm_read_cr3(vcpu);
5443 sregs->cr4 = kvm_read_cr4(vcpu); 5521 sregs->cr4 = kvm_read_cr4(vcpu);
5444 sregs->cr8 = kvm_get_cr8(vcpu); 5522 sregs->cr8 = kvm_get_cr8(vcpu);
5445 sregs->efer = vcpu->arch.efer; 5523 sregs->efer = vcpu->arch.efer;
@@ -5507,8 +5585,9 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
5507 kvm_x86_ops->set_gdt(vcpu, &dt); 5585 kvm_x86_ops->set_gdt(vcpu, &dt);
5508 5586
5509 vcpu->arch.cr2 = sregs->cr2; 5587 vcpu->arch.cr2 = sregs->cr2;
5510 mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3; 5588 mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
5511 vcpu->arch.cr3 = sregs->cr3; 5589 vcpu->arch.cr3 = sregs->cr3;
5590 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
5512 5591
5513 kvm_set_cr8(vcpu, sregs->cr8); 5592 kvm_set_cr8(vcpu, sregs->cr8);
5514 5593
@@ -5522,8 +5601,10 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
5522 5601
5523 mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; 5602 mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
5524 kvm_x86_ops->set_cr4(vcpu, sregs->cr4); 5603 kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
5604 if (sregs->cr4 & X86_CR4_OSXSAVE)
5605 update_cpuid(vcpu);
5525 if (!is_long_mode(vcpu) && is_pae(vcpu)) { 5606 if (!is_long_mode(vcpu) && is_pae(vcpu)) {
5526 load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3); 5607 load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
5527 mmu_reset_needed = 1; 5608 mmu_reset_needed = 1;
5528 } 5609 }
5529 5610
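
SET_SREGS now calls update_cpuid() when the new CR4 has OSXSAVE set, because the OSXSAVE bit the guest sees in CPUID leaf 1 mirrors CR4.OSXSAVE and the cached guest CPUID would otherwise go stale. A toy model of that dependency; the bit positions are the architectural ones as far as I can tell, and the structure is simplified:

#include <stdio.h>
#include <stdint.h>

#define X86_CR4_OSXSAVE    (1UL << 18)   /* CR4.OSXSAVE */
#define CPUID1_ECX_OSXSAVE (1U << 27)    /* CPUID.01H:ECX.OSXSAVE */

struct toy_vcpu {
    unsigned long cr4;
    uint32_t cpuid1_ecx;        /* cached leaf-1 ECX shown to the guest */
};

/* Mirror CR4.OSXSAVE into the guest-visible CPUID bit; this dependency is
 * what makes the added update_cpuid() call necessary. */
static void update_cpuid(struct toy_vcpu *v)
{
    if (v->cr4 & X86_CR4_OSXSAVE)
        v->cpuid1_ecx |= CPUID1_ECX_OSXSAVE;
    else
        v->cpuid1_ecx &= ~CPUID1_ECX_OSXSAVE;
}

static void set_cr4(struct toy_vcpu *v, unsigned long cr4)
{
    v->cr4 = cr4;
    if (cr4 & X86_CR4_OSXSAVE)           /* as in the hunk above */
        update_cpuid(v);
}

int main(void)
{
    struct toy_vcpu v = { 0 };

    set_cr4(&v, X86_CR4_OSXSAVE);
    printf("guest sees OSXSAVE: %d\n",
           !!(v.cpuid1_ecx & CPUID1_ECX_OSXSAVE));
    return 0;
}
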
@@ -5774,6 +5855,8 @@ free_vcpu:
5774 5855
5775void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) 5856void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
5776{ 5857{
5858 vcpu->arch.apf.msr_val = 0;
5859
5777 vcpu_load(vcpu); 5860 vcpu_load(vcpu);
5778 kvm_mmu_unload(vcpu); 5861 kvm_mmu_unload(vcpu);
5779 vcpu_put(vcpu); 5862 vcpu_put(vcpu);
@@ -5793,6 +5876,11 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
5793 vcpu->arch.dr7 = DR7_FIXED_1; 5876 vcpu->arch.dr7 = DR7_FIXED_1;
5794 5877
5795 kvm_make_request(KVM_REQ_EVENT, vcpu); 5878 kvm_make_request(KVM_REQ_EVENT, vcpu);
5879 vcpu->arch.apf.msr_val = 0;
5880
5881 kvm_clear_async_pf_completion_queue(vcpu);
5882 kvm_async_pf_hash_reset(vcpu);
5883 vcpu->arch.apf.halted = false;
5796 5884
5797 return kvm_x86_ops->vcpu_reset(vcpu); 5885 return kvm_x86_ops->vcpu_reset(vcpu);
5798} 5886}
@@ -5882,6 +5970,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
5882 if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) 5970 if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL))
5883 goto fail_free_mce_banks; 5971 goto fail_free_mce_banks;
5884 5972
5973 kvm_async_pf_hash_reset(vcpu);
5974
5885 return 0; 5975 return 0;
5886fail_free_mce_banks: 5976fail_free_mce_banks:
5887 kfree(vcpu->arch.mce_banks); 5977 kfree(vcpu->arch.mce_banks);
@@ -5907,13 +5997,8 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
5907 free_page((unsigned long)vcpu->arch.pio_data); 5997 free_page((unsigned long)vcpu->arch.pio_data);
5908} 5998}
5909 5999
5910struct kvm *kvm_arch_create_vm(void) 6000int kvm_arch_init_vm(struct kvm *kvm)
5911{ 6001{
5912 struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
5913
5914 if (!kvm)
5915 return ERR_PTR(-ENOMEM);
5916
5917 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); 6002 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
5918 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); 6003 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
5919 6004
@@ -5922,7 +6007,7 @@ struct kvm *kvm_arch_create_vm(void)
5922 6007
5923 spin_lock_init(&kvm->arch.tsc_write_lock); 6008 spin_lock_init(&kvm->arch.tsc_write_lock);
5924 6009
5925 return kvm; 6010 return 0;
5926} 6011}
5927 6012
5928static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) 6013static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
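
kvm_arch_create_vm(), which used to allocate struct kvm itself, becomes kvm_arch_init_vm() and only initializes the x86 parts of a kvm that generic code has already allocated; the kvm_arch_destroy_vm() hunk below correspondingly stops freeing the memslots, the srcu struct and the kvm itself. A toy sketch of the split; the generic side shown here is an assumption about code outside this diff:

#include <stdio.h>
#include <stdlib.h>

struct toy_kvm {
    int arch_initialized;
};

/* Arch hook after this change: no allocation, just arch-specific setup. */
static int arch_init_vm(struct toy_kvm *kvm)
{
    kvm->arch_initialized = 1;
    return 0;
}

static void arch_destroy_vm(struct toy_kvm *kvm)
{
    kvm->arch_initialized = 0;   /* arch teardown only, no free(kvm) */
}

/* Generic side (assumed): it owns the allocation, calls the arch hook,
 * and unwinds if the hook fails. */
static struct toy_kvm *create_vm(void)
{
    struct toy_kvm *kvm = calloc(1, sizeof(*kvm));

    if (!kvm)
        return NULL;
    if (arch_init_vm(kvm)) {
        free(kvm);
        return NULL;
    }
    return kvm;
}

static void destroy_vm(struct toy_kvm *kvm)
{
    arch_destroy_vm(kvm);
    free(kvm);
}

int main(void)
{
    struct toy_kvm *kvm = create_vm();

    if (!kvm)
        return 1;
    printf("vm created, arch_initialized=%d\n", kvm->arch_initialized);
    destroy_vm(kvm);
    return 0;
}
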
@@ -5940,8 +6025,10 @@ static void kvm_free_vcpus(struct kvm *kvm)
5940 /* 6025 /*
5941 * Unpin any mmu pages first. 6026 * Unpin any mmu pages first.
5942 */ 6027 */
5943 kvm_for_each_vcpu(i, vcpu, kvm) 6028 kvm_for_each_vcpu(i, vcpu, kvm) {
6029 kvm_clear_async_pf_completion_queue(vcpu);
5944 kvm_unload_vcpu_mmu(vcpu); 6030 kvm_unload_vcpu_mmu(vcpu);
6031 }
5945 kvm_for_each_vcpu(i, vcpu, kvm) 6032 kvm_for_each_vcpu(i, vcpu, kvm)
5946 kvm_arch_vcpu_free(vcpu); 6033 kvm_arch_vcpu_free(vcpu);
5947 6034
@@ -5965,13 +6052,10 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
5965 kfree(kvm->arch.vpic); 6052 kfree(kvm->arch.vpic);
5966 kfree(kvm->arch.vioapic); 6053 kfree(kvm->arch.vioapic);
5967 kvm_free_vcpus(kvm); 6054 kvm_free_vcpus(kvm);
5968 kvm_free_physmem(kvm);
5969 if (kvm->arch.apic_access_page) 6055 if (kvm->arch.apic_access_page)
5970 put_page(kvm->arch.apic_access_page); 6056 put_page(kvm->arch.apic_access_page);
5971 if (kvm->arch.ept_identity_pagetable) 6057 if (kvm->arch.ept_identity_pagetable)
5972 put_page(kvm->arch.ept_identity_pagetable); 6058 put_page(kvm->arch.ept_identity_pagetable);
5973 cleanup_srcu_struct(&kvm->srcu);
5974 kfree(kvm);
5975} 6059}
5976 6060
5977int kvm_arch_prepare_memory_region(struct kvm *kvm, 6061int kvm_arch_prepare_memory_region(struct kvm *kvm,
@@ -6052,7 +6136,9 @@ void kvm_arch_flush_shadow(struct kvm *kvm)
6052 6136
6053int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) 6137int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
6054{ 6138{
6055 return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE 6139 return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
6140 !vcpu->arch.apf.halted)
6141 || !list_empty_careful(&vcpu->async_pf.done)
6056 || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED 6142 || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
6057 || vcpu->arch.nmi_pending || 6143 || vcpu->arch.nmi_pending ||
6058 (kvm_arch_interrupt_allowed(vcpu) && 6144 (kvm_arch_interrupt_allowed(vcpu) &&
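
kvm_arch_vcpu_runnable() must wake a vCPU that was synthetically halted for an async page fault once a completed fault sits on its async_pf.done list, hence the extra !list_empty_careful() clause. A cut-down version of the predicate keeping only the fields this diff touches; the NMI and interrupt-window clauses are omitted:

#include <stdio.h>
#include <stdbool.h>

enum mp_state { MP_RUNNABLE, MP_HALTED, MP_SIPI_RECEIVED };

struct toy_vcpu {
    enum mp_state mp_state;
    bool apf_halted;         /* synthetic halt from the APF hunks above */
    int  apf_done_pending;   /* stands in for !list_empty(&async_pf.done) */
};

static bool vcpu_runnable(const struct toy_vcpu *v)
{
    return (v->mp_state == MP_RUNNABLE && !v->apf_halted) ||
           v->apf_done_pending ||
           v->mp_state == MP_SIPI_RECEIVED;
}

int main(void)
{
    struct toy_vcpu v = { .mp_state = MP_RUNNABLE, .apf_halted = true };

    printf("runnable while APF-halted: %d\n", vcpu_runnable(&v));       /* 0 */
    v.apf_done_pending = 1;      /* a page the guest waited on arrived */
    printf("runnable once a completion queued: %d\n", vcpu_runnable(&v)); /* 1 */
    return 0;
}
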
@@ -6111,6 +6197,147 @@ void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
6111} 6197}
6112EXPORT_SYMBOL_GPL(kvm_set_rflags); 6198EXPORT_SYMBOL_GPL(kvm_set_rflags);
6113 6199
6200void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
6201{
6202 int r;
6203
6204 if ((vcpu->arch.mmu.direct_map != work->arch.direct_map) ||
6205 is_error_page(work->page))
6206 return;
6207
6208 r = kvm_mmu_reload(vcpu);
6209 if (unlikely(r))
6210 return;
6211
6212 if (!vcpu->arch.mmu.direct_map &&
6213 work->arch.cr3 != vcpu->arch.mmu.get_cr3(vcpu))
6214 return;
6215
6216 vcpu->arch.mmu.page_fault(vcpu, work->gva, 0, true);
6217}
6218
6219static inline u32 kvm_async_pf_hash_fn(gfn_t gfn)
6220{
6221 return hash_32(gfn & 0xffffffff, order_base_2(ASYNC_PF_PER_VCPU));
6222}
6223
6224static inline u32 kvm_async_pf_next_probe(u32 key)
6225{
6226 return (key + 1) & (roundup_pow_of_two(ASYNC_PF_PER_VCPU) - 1);
6227}
6228
6229static void kvm_add_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
6230{
6231 u32 key = kvm_async_pf_hash_fn(gfn);
6232
6233 while (vcpu->arch.apf.gfns[key] != ~0)
6234 key = kvm_async_pf_next_probe(key);
6235
6236 vcpu->arch.apf.gfns[key] = gfn;
6237}
6238
6239static u32 kvm_async_pf_gfn_slot(struct kvm_vcpu *vcpu, gfn_t gfn)
6240{
6241 int i;
6242 u32 key = kvm_async_pf_hash_fn(gfn);
6243
6244 for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU) &&
6245 (vcpu->arch.apf.gfns[key] != gfn &&
6246 vcpu->arch.apf.gfns[key] != ~0); i++)
6247 key = kvm_async_pf_next_probe(key);
6248
6249 return key;
6250}
6251
6252bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
6253{
6254 return vcpu->arch.apf.gfns[kvm_async_pf_gfn_slot(vcpu, gfn)] == gfn;
6255}
6256
6257static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
6258{
6259 u32 i, j, k;
6260
6261 i = j = kvm_async_pf_gfn_slot(vcpu, gfn);
6262 while (true) {
6263 vcpu->arch.apf.gfns[i] = ~0;
6264 do {
6265 j = kvm_async_pf_next_probe(j);
6266 if (vcpu->arch.apf.gfns[j] == ~0)
6267 return;
6268 k = kvm_async_pf_hash_fn(vcpu->arch.apf.gfns[j]);
6269 /*
6270 * k lies cyclically in ]i,j]
6271 * | i.k.j |
6272 * |....j i.k.| or |.k..j i...|
6273 */
6274 } while ((i <= j) ? (i < k && k <= j) : (i < k || k <= j));
6275 vcpu->arch.apf.gfns[i] = vcpu->arch.apf.gfns[j];
6276 i = j;
6277 }
6278}
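
kvm_add/find/del_async_pf_gfn() above implement a small open-addressing hash of outstanding async-PF gfns: linear probing, ~0 as the empty-slot sentinel, and the classic relocation fix-up on delete so entries further down a probe chain stay reachable. A self-contained userspace sketch of the same scheme; the table size and the mixing function are stand-ins for roundup_pow_of_two(ASYNC_PF_PER_VCPU) and hash_32():

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define TABLE_SIZE 64U               /* stand-in, must be a power of two */
#define EMPTY      (~0ULL)           /* same empty-slot sentinel (~0) as above */

static uint64_t table[TABLE_SIZE];

/* Stand-in for hash_32(); any reasonable mixing function will do here. */
static uint32_t hash_gfn(uint64_t gfn)
{
    return (uint32_t)(gfn * 0x9e3779b9u) & (TABLE_SIZE - 1);
}

static uint32_t next_probe(uint32_t key)
{
    return (key + 1) & (TABLE_SIZE - 1);
}

static void add_gfn(uint64_t gfn)
{
    uint32_t key = hash_gfn(gfn);

    while (table[key] != EMPTY)
        key = next_probe(key);
    table[key] = gfn;
}

static uint32_t gfn_slot(uint64_t gfn)
{
    uint32_t key = hash_gfn(gfn);
    unsigned int i;

    for (i = 0; i < TABLE_SIZE &&
                table[key] != gfn && table[key] != EMPTY; i++)
        key = next_probe(key);
    return key;
}

static bool find_gfn(uint64_t gfn)
{
    return table[gfn_slot(gfn)] == gfn;
}

/*
 * Delete under linear probing: after emptying slot i, walk the probe chain
 * and pull back any entry whose home slot k does NOT lie cyclically in
 * ]i, j], exactly the fix-up kvm_del_async_pf_gfn() performs above.
 */
static void del_gfn(uint64_t gfn)
{
    uint32_t i, j, k;

    i = j = gfn_slot(gfn);
    while (true) {
        table[i] = EMPTY;
        do {
            j = next_probe(j);
            if (table[j] == EMPTY)
                return;
            k = hash_gfn(table[j]);
        } while ((i <= j) ? (i < k && k <= j) : (i < k || k <= j));
        table[i] = table[j];
        i = j;
    }
}

int main(void)
{
    unsigned int n;

    for (n = 0; n < TABLE_SIZE; n++)
        table[n] = EMPTY;

    add_gfn(0x100);
    add_gfn(0x200);
    add_gfn(0x300);
    del_gfn(0x200);

    printf("0x100 present: %d\n", find_gfn(0x100));  /* 1 */
    printf("0x200 present: %d\n", find_gfn(0x200));  /* 0 */
    printf("0x300 present: %d\n", find_gfn(0x300));  /* 1 */
    return 0;
}
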
6279
6280static int apf_put_user(struct kvm_vcpu *vcpu, u32 val)
6281{
6282
6283 return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val,
6284 sizeof(val));
6285}
6286
6287void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
6288 struct kvm_async_pf *work)
6289{
6290 struct x86_exception fault;
6291
6292 trace_kvm_async_pf_not_present(work->arch.token, work->gva);
6293 kvm_add_async_pf_gfn(vcpu, work->arch.gfn);
6294
6295 if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) ||
6296 (vcpu->arch.apf.send_user_only &&
6297 kvm_x86_ops->get_cpl(vcpu) == 0))
6298 kvm_make_request(KVM_REQ_APF_HALT, vcpu);
6299 else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_NOT_PRESENT)) {
6300 fault.vector = PF_VECTOR;
6301 fault.error_code_valid = true;
6302 fault.error_code = 0;
6303 fault.nested_page_fault = false;
6304 fault.address = work->arch.token;
6305 kvm_inject_page_fault(vcpu, &fault);
6306 }
6307}
6308
6309void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
6310 struct kvm_async_pf *work)
6311{
6312 struct x86_exception fault;
6313
6314 trace_kvm_async_pf_ready(work->arch.token, work->gva);
6315 if (is_error_page(work->page))
6316 work->arch.token = ~0; /* broadcast wakeup */
6317 else
6318 kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
6319
6320 if ((vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) &&
6321 !apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) {
6322 fault.vector = PF_VECTOR;
6323 fault.error_code_valid = true;
6324 fault.error_code = 0;
6325 fault.nested_page_fault = false;
6326 fault.address = work->arch.token;
6327 kvm_inject_page_fault(vcpu, &fault);
6328 }
6329 vcpu->arch.apf.halted = false;
6330}
6331
6332bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu)
6333{
6334 if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED))
6335 return true;
6336 else
6337 return !kvm_event_needs_reinjection(vcpu) &&
6338 kvm_x86_ops->interrupt_allowed(vcpu);
6339}
6340
6114EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); 6341EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
6115EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); 6342EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
6116EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault); 6343EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 2cea414489f3..c600da830ce0 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -70,6 +70,11 @@ static inline int is_paging(struct kvm_vcpu *vcpu)
70 return kvm_read_cr0_bits(vcpu, X86_CR0_PG); 70 return kvm_read_cr0_bits(vcpu, X86_CR0_PG);
71} 71}
72 72
73static inline u32 bit(int bitno)
74{
75 return 1 << (bitno & 31);
76}
77
73void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); 78void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
74void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); 79void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
75int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq); 80int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq);
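
The new bit() helper masks the bit number with 31, so the shift of a 32-bit value stays in range for any input (much like hardware shift counts being taken modulo the operand width) rather than being undefined for bitno >= 32. A quick demonstration of the same shape:

#include <stdio.h>

/* Same shape as the helper added to x86.h above; 1u avoids shifting into
 * the sign bit, and the "& 31" keeps the shift count in range. */
static inline unsigned int bit(int bitno)
{
    return 1u << (bitno & 31);
}

int main(void)
{
    printf("bit(5)  = 0x%08x\n", bit(5));    /* 0x00000020 */
    printf("bit(31) = 0x%08x\n", bit(31));   /* 0x80000000 */
    printf("bit(37) = 0x%08x\n", bit(37));   /* same as bit(5): count masked */
    return 0;
}
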