path: root/arch/x86/kvm
author		Linus Torvalds <torvalds@linux-foundation.org>	2015-02-13 12:55:09 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2015-02-13 12:55:09 -0500
commit		b9085bcbf5f43adf60533f9b635b2e7faeed0fe9 (patch)
tree		e397abf5682a45c096e75b3d0fa99c8e228425fc /arch/x86/kvm
parent		c7d7b98671552abade78834c522b7308bda73c0d (diff)
parent		6557bada461afeaa920a189fae2cff7c8fdce39f (diff)
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull KVM update from Paolo Bonzini:
 "Fairly small update, but there are some interesting new features.

  Common:
     Optional support for adding a small amount of polling on each HLT
     instruction executed in the guest (or equivalent for other
     architectures). This can improve latency up to 50% on some
     scenarios (e.g. O_DSYNC writes or TCP_RR netperf tests). This also
     has to be enabled manually for now, but the plan is to auto-tune
     this in the future.

  ARM/ARM64:
     The highlights are support for GICv3 emulation and dirty page
     tracking

  s390:
     Several optimizations and bugfixes. Also a first: a feature
     exposed by KVM (UUID and long guest name in /proc/sysinfo) before
     it is available in IBM's hypervisor! :)

  MIPS:
     Bugfixes.

  x86:
     Support for PML (page modification logging, a new feature in
     Broadwell Xeons that speeds up dirty page tracking), nested
     virtualization improvements (nested APICv---a nice optimization),
     usual round of emulation fixes.

     There is also a new option to reduce latency of the TSC deadline
     timer in the guest; this needs to be tuned manually.

     Some commits are common between this pull and Catalin's; I see you
     have already included his tree.

  Powerpc:
     Nothing yet. The KVM/PPC changes will come in through the PPC
     maintainers, because I haven't received them yet and I might end
     up being offline for some part of next week"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (130 commits)
  KVM: ia64: drop kvm.h from installed user headers
  KVM: x86: fix build with !CONFIG_SMP
  KVM: x86: emulate: correct page fault error code for NoWrite instructions
  KVM: Disable compat ioctl for s390
  KVM: s390: add cpu model support
  KVM: s390: use facilities and cpu_id per KVM
  KVM: s390/CPACF: Choose crypto control block format
  s390/kernel: Update /proc/sysinfo file with Extended Name and UUID
  KVM: s390: reenable LPP facility
  KVM: s390: floating irqs: fix user triggerable endless loop
  kvm: add halt_poll_ns module parameter
  kvm: remove KVM_MMIO_SIZE
  KVM: MIPS: Don't leak FPU/DSP to guest
  KVM: MIPS: Disable HTW while in guest
  KVM: nVMX: Enable nested posted interrupt processing
  KVM: nVMX: Enable nested virtual interrupt delivery
  KVM: nVMX: Enable nested apic register virtualization
  KVM: nVMX: Make nested control MSRs per-cpu
  KVM: nVMX: Enable nested virtualize x2apic mode
  KVM: nVMX: Prepare for using hardware MSR bitmap
  ...
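Editor's note (not part of the commit): the two latency features called out above -- HLT polling and the reduced-latency TSC deadline timer -- ship disabled and, per the shortlog, are controlled by module parameters of kvm.ko (halt_poll_ns; the timer advance is tuned via lapic_timer_advance_ns, used in the lapic.c hunks below). The sketch that follows is a hypothetical host-side helper, not code from this merge; it assumes those parameters are exposed under the usual /sys/module/kvm/parameters/ paths and take nanosecond values.

/*
 * Hypothetical illustration only -- not part of this merge.  Assumes the new
 * knobs appear as writable kvm.ko module parameters in sysfs:
 *   /sys/module/kvm/parameters/halt_poll_ns
 *   /sys/module/kvm/parameters/lapic_timer_advance_ns
 */
#include <stdio.h>

static int write_param(const char *path, long ns)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return -1;
	}
	/* Module parameters accept a plain decimal string. */
	fprintf(f, "%ld\n", ns);
	return fclose(f);
}

int main(void)
{
	/* Poll for up to 500 us after a guest HLT before scheduling out. */
	write_param("/sys/module/kvm/parameters/halt_poll_ns", 500000);
	/* Arm the emulated TSC deadline timer ~1 us early, then busy-wait. */
	write_param("/sys/module/kvm/parameters/lapic_timer_advance_ns", 1000);
	return 0;
}

The same values can also be passed at module load time; as the commit message notes, both mechanisms stay off unless enabled and tuned by hand.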
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r--	arch/x86/kvm/Kconfig	1
-rw-r--r--	arch/x86/kvm/emulate.c	230
-rw-r--r--	arch/x86/kvm/ioapic.h	2
-rw-r--r--	arch/x86/kvm/iommu.c	4
-rw-r--r--	arch/x86/kvm/lapic.c	147
-rw-r--r--	arch/x86/kvm/lapic.h	6
-rw-r--r--	arch/x86/kvm/mmu.c	351
-rw-r--r--	arch/x86/kvm/mmu.h	17
-rw-r--r--	arch/x86/kvm/svm.c	4
-rw-r--r--	arch/x86/kvm/trace.h	38
-rw-r--r--	arch/x86/kvm/vmx.c	1086
-rw-r--r--	arch/x86/kvm/x86.c	209
-rw-r--r--	arch/x86/kvm/x86.h	3
13 files changed, 1673 insertions, 425 deletions
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 7dc7ba577ecd..413a7bf9efbb 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -39,6 +39,7 @@ config KVM
 	select PERF_EVENTS
 	select HAVE_KVM_MSI
 	select HAVE_KVM_CPU_RELAX_INTERCEPT
+	select KVM_GENERIC_DIRTYLOG_READ_PROTECT
 	select KVM_VFIO
 	select SRCU
 	---help---
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index de12c1d379f1..e0b794a84c35 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -86,6 +86,7 @@
 #define DstAcc (OpAcc << DstShift)
 #define DstDI (OpDI << DstShift)
 #define DstMem64 (OpMem64 << DstShift)
+#define DstMem16 (OpMem16 << DstShift)
 #define DstImmUByte (OpImmUByte << DstShift)
 #define DstDX (OpDX << DstShift)
 #define DstAccLo (OpAccLo << DstShift)
@@ -124,6 +125,7 @@
 #define RMExt (4<<15) /* Opcode extension in ModRM r/m if mod == 3 */
 #define Escape (5<<15) /* Escape to coprocessor instruction */
 #define InstrDual (6<<15) /* Alternate instruction decoding of mod == 3 */
+#define ModeDual (7<<15) /* Different instruction for 32/64 bit */
 #define Sse (1<<18) /* SSE Vector instruction */
 /* Generic ModRM decode. */
 #define ModRM (1<<19)
@@ -165,10 +167,10 @@
 #define NoMod ((u64)1 << 47) /* Mod field is ignored */
 #define Intercept ((u64)1 << 48) /* Has valid intercept field */
 #define CheckPerm ((u64)1 << 49) /* Has valid check_perm field */
-#define NoBigReal ((u64)1 << 50) /* No big real mode */
 #define PrivUD ((u64)1 << 51) /* #UD instead of #GP on CPL > 0 */
 #define NearBranch ((u64)1 << 52) /* Near branches */
 #define No16 ((u64)1 << 53) /* No 16 bit operand */
+#define IncSP ((u64)1 << 54) /* SP is incremented before ModRM calc */
 
 #define DstXacc (DstAccLo | SrcAccHi | SrcWrite)
 
@@ -213,6 +215,7 @@ struct opcode {
 		const struct gprefix *gprefix;
 		const struct escape *esc;
 		const struct instr_dual *idual;
+		const struct mode_dual *mdual;
 		void (*fastop)(struct fastop *fake);
 	} u;
 	int (*check_perm)(struct x86_emulate_ctxt *ctxt);
@@ -240,6 +243,11 @@ struct instr_dual {
 	struct opcode mod3;
 };
 
+struct mode_dual {
+	struct opcode mode32;
+	struct opcode mode64;
+};
+
 /* EFLAGS bit definitions. */
 #define EFLG_ID (1<<21)
 #define EFLG_VIP (1<<20)
@@ -262,6 +270,13 @@ struct instr_dual {
 #define EFLG_RESERVED_ZEROS_MASK 0xffc0802a
 #define EFLG_RESERVED_ONE_MASK 2
 
+enum x86_transfer_type {
+	X86_TRANSFER_NONE,
+	X86_TRANSFER_CALL_JMP,
+	X86_TRANSFER_RET,
+	X86_TRANSFER_TASK_SWITCH,
+};
+
 static ulong reg_read(struct x86_emulate_ctxt *ctxt, unsigned nr)
 {
 	if (!(ctxt->regs_valid & (1 << nr))) {
@@ -669,9 +684,13 @@ static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt,
 		}
 		if (addr.ea > lim)
 			goto bad;
-		*max_size = min_t(u64, ~0u, (u64)lim + 1 - addr.ea);
-		if (size > *max_size)
-			goto bad;
+		if (lim == 0xffffffff)
+			*max_size = ~0u;
+		else {
+			*max_size = (u64)lim + 1 - addr.ea;
+			if (size > *max_size)
+				goto bad;
+		}
 		la &= (u32)-1;
 		break;
 	}
@@ -722,19 +741,26 @@ static int assign_eip_far(struct x86_emulate_ctxt *ctxt, ulong dst,
 			  const struct desc_struct *cs_desc)
 {
 	enum x86emul_mode mode = ctxt->mode;
+	int rc;
 
 #ifdef CONFIG_X86_64
-	if (ctxt->mode >= X86EMUL_MODE_PROT32 && cs_desc->l) {
-		u64 efer = 0;
+	if (ctxt->mode >= X86EMUL_MODE_PROT16) {
+		if (cs_desc->l) {
+			u64 efer = 0;
 
 			ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
 			if (efer & EFER_LMA)
 				mode = X86EMUL_MODE_PROT64;
+		} else
+			mode = X86EMUL_MODE_PROT32; /* temporary value */
 	}
 #endif
 	if (mode == X86EMUL_MODE_PROT16 || mode == X86EMUL_MODE_PROT32)
 		mode = cs_desc->d ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
-	return assign_eip(ctxt, dst, mode);
+	rc = assign_eip(ctxt, dst, mode);
+	if (rc == X86EMUL_CONTINUE)
+		ctxt->mode = mode;
+	return rc;
 }
 
 static inline int jmp_rel(struct x86_emulate_ctxt *ctxt, int rel)
@@ -1057,8 +1083,6 @@ static int em_fnstcw(struct x86_emulate_ctxt *ctxt)
 	asm volatile("fnstcw %0": "+m"(fcw));
 	ctxt->ops->put_fpu(ctxt);
 
-	/* force 2 byte destination */
-	ctxt->dst.bytes = 2;
 	ctxt->dst.val = fcw;
 
 	return X86EMUL_CONTINUE;
@@ -1075,8 +1099,6 @@ static int em_fnstsw(struct x86_emulate_ctxt *ctxt)
 	asm volatile("fnstsw %0": "+m"(fsw));
 	ctxt->ops->put_fpu(ctxt);
 
-	/* force 2 byte destination */
-	ctxt->dst.bytes = 2;
 	ctxt->dst.val = fsw;
 
 	return X86EMUL_CONTINUE;
@@ -1223,6 +1245,10 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
 		else {
 			modrm_ea += reg_read(ctxt, base_reg);
 			adjust_modrm_seg(ctxt, base_reg);
+			/* Increment ESP on POP [ESP] */
+			if ((ctxt->d & IncSP) &&
+			    base_reg == VCPU_REGS_RSP)
+				modrm_ea += ctxt->op_bytes;
 		}
 		if (index_reg != 4)
 			modrm_ea += reg_read(ctxt, index_reg) << scale;
@@ -1435,10 +1461,8 @@ static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
 		ops->get_gdt(ctxt, dt);
 }
 
-/* allowed just for 8 bytes segments */
-static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
-				   u16 selector, struct desc_struct *desc,
-				   ulong *desc_addr_p)
+static int get_descriptor_ptr(struct x86_emulate_ctxt *ctxt,
+			      u16 selector, ulong *desc_addr_p)
 {
 	struct desc_ptr dt;
 	u16 index = selector >> 3;
@@ -1449,8 +1473,34 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 	if (dt.size < index * 8 + 7)
 		return emulate_gp(ctxt, selector & 0xfffc);
 
-	*desc_addr_p = addr = dt.address + index * 8;
-	return ctxt->ops->read_std(ctxt, addr, desc, sizeof *desc,
+	addr = dt.address + index * 8;
+
+#ifdef CONFIG_X86_64
+	if (addr >> 32 != 0) {
+		u64 efer = 0;
+
+		ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
+		if (!(efer & EFER_LMA))
+			addr &= (u32)-1;
+	}
+#endif
+
+	*desc_addr_p = addr;
+	return X86EMUL_CONTINUE;
+}
+
+/* allowed just for 8 bytes segments */
+static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
+				   u16 selector, struct desc_struct *desc,
+				   ulong *desc_addr_p)
+{
+	int rc;
+
+	rc = get_descriptor_ptr(ctxt, selector, desc_addr_p);
+	if (rc != X86EMUL_CONTINUE)
+		return rc;
+
+	return ctxt->ops->read_std(ctxt, *desc_addr_p, desc, sizeof(*desc),
 				   &ctxt->exception);
 }
 
@@ -1458,16 +1508,13 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 				    u16 selector, struct desc_struct *desc)
 {
-	struct desc_ptr dt;
-	u16 index = selector >> 3;
+	int rc;
 	ulong addr;
 
-	get_descriptor_table_ptr(ctxt, selector, &dt);
-
-	if (dt.size < index * 8 + 7)
-		return emulate_gp(ctxt, selector & 0xfffc);
+	rc = get_descriptor_ptr(ctxt, selector, &addr);
+	if (rc != X86EMUL_CONTINUE)
+		return rc;
 
-	addr = dt.address + index * 8;
 	return ctxt->ops->write_std(ctxt, addr, desc, sizeof *desc,
 				    &ctxt->exception);
 }
@@ -1475,7 +1522,7 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 /* Does not support long mode */
 static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 				     u16 selector, int seg, u8 cpl,
-				     bool in_task_switch,
+				     enum x86_transfer_type transfer,
 				     struct desc_struct *desc)
 {
 	struct desc_struct seg_desc, old_desc;
@@ -1529,11 +1576,15 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 		return ret;
 
 	err_code = selector & 0xfffc;
-	err_vec = in_task_switch ? TS_VECTOR : GP_VECTOR;
+	err_vec = (transfer == X86_TRANSFER_TASK_SWITCH) ? TS_VECTOR :
+							   GP_VECTOR;
 
 	/* can't load system descriptor into segment selector */
-	if (seg <= VCPU_SREG_GS && !seg_desc.s)
+	if (seg <= VCPU_SREG_GS && !seg_desc.s) {
+		if (transfer == X86_TRANSFER_CALL_JMP)
+			return X86EMUL_UNHANDLEABLE;
 		goto exception;
+	}
 
 	if (!seg_desc.p) {
 		err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR;
@@ -1605,10 +1656,13 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 
 	if (seg_desc.s) {
 		/* mark segment as accessed */
-		seg_desc.type |= 1;
-		ret = write_segment_descriptor(ctxt, selector, &seg_desc);
-		if (ret != X86EMUL_CONTINUE)
-			return ret;
+		if (!(seg_desc.type & 1)) {
+			seg_desc.type |= 1;
+			ret = write_segment_descriptor(ctxt, selector,
+						       &seg_desc);
+			if (ret != X86EMUL_CONTINUE)
+				return ret;
+		}
 	} else if (ctxt->mode == X86EMUL_MODE_PROT64) {
 		ret = ctxt->ops->read_std(ctxt, desc_addr+8, &base3,
 				sizeof(base3), &ctxt->exception);
@@ -1631,7 +1685,8 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 				   u16 selector, int seg)
 {
 	u8 cpl = ctxt->ops->cpl(ctxt);
-	return __load_segment_descriptor(ctxt, selector, seg, cpl, false, NULL);
+	return __load_segment_descriptor(ctxt, selector, seg, cpl,
+					 X86_TRANSFER_NONE, NULL);
 }
 
 static void write_register_operand(struct operand *op)
@@ -1828,12 +1883,14 @@ static int em_pop_sreg(struct x86_emulate_ctxt *ctxt)
 	unsigned long selector;
 	int rc;
 
-	rc = emulate_pop(ctxt, &selector, ctxt->op_bytes);
+	rc = emulate_pop(ctxt, &selector, 2);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 
 	if (ctxt->modrm_reg == VCPU_SREG_SS)
 		ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS;
+	if (ctxt->op_bytes > 2)
+		rsp_increment(ctxt, ctxt->op_bytes - 2);
 
 	rc = load_segment_descriptor(ctxt, (u16)selector, seg);
 	return rc;
@@ -2007,6 +2064,7 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt)
 
 	ctxt->eflags &= ~EFLG_RESERVED_ZEROS_MASK; /* Clear reserved zeros */
 	ctxt->eflags |= EFLG_RESERVED_ONE_MASK;
+	ctxt->ops->set_nmi_mask(ctxt, false);
 
 	return rc;
 }
@@ -2041,7 +2099,8 @@ static int em_jmp_far(struct x86_emulate_ctxt *ctxt)
 
 	memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2);
 
-	rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl, false,
+	rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl,
+				       X86_TRANSFER_CALL_JMP,
 				       &new_desc);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
@@ -2130,7 +2189,8 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt)
 	/* Outer-privilege level return is not implemented */
 	if (ctxt->mode >= X86EMUL_MODE_PROT16 && (cs & 3) > cpl)
 		return X86EMUL_UNHANDLEABLE;
-	rc = __load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS, cpl, false,
+	rc = __load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS, cpl,
+				       X86_TRANSFER_RET,
 				       &new_desc);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
@@ -2163,12 +2223,15 @@ static int em_cmpxchg(struct x86_emulate_ctxt *ctxt)
 	fastop(ctxt, em_cmp);
 
 	if (ctxt->eflags & EFLG_ZF) {
-		/* Success: write back to memory. */
+		/* Success: write back to memory; no update of EAX */
+		ctxt->src.type = OP_NONE;
 		ctxt->dst.val = ctxt->src.orig_val;
 	} else {
 		/* Failure: write the value we saw to EAX. */
-		ctxt->dst.type = OP_REG;
-		ctxt->dst.addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX);
+		ctxt->src.type = OP_REG;
+		ctxt->src.addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX);
+		ctxt->src.val = ctxt->dst.orig_val;
+		/* Create write-cycle to dest by writing the same value */
 		ctxt->dst.val = ctxt->dst.orig_val;
 	}
 	return X86EMUL_CONTINUE;
@@ -2556,23 +2619,23 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
 	 * it is handled in a context of new task
 	 */
 	ret = __load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR, cpl,
-					true, NULL);
+					X86_TRANSFER_TASK_SWITCH, NULL);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
 	ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl,
-					true, NULL);
+					X86_TRANSFER_TASK_SWITCH, NULL);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
 	ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl,
-					true, NULL);
+					X86_TRANSFER_TASK_SWITCH, NULL);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
 	ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl,
-					true, NULL);
+					X86_TRANSFER_TASK_SWITCH, NULL);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
 	ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl,
-					true, NULL);
+					X86_TRANSFER_TASK_SWITCH, NULL);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
 
@@ -2694,31 +2757,31 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
 	 * it is handled in a context of new task
 	 */
 	ret = __load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR,
-					cpl, true, NULL);
+					cpl, X86_TRANSFER_TASK_SWITCH, NULL);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
 	ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl,
-					true, NULL);
+					X86_TRANSFER_TASK_SWITCH, NULL);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
 	ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl,
-					true, NULL);
+					X86_TRANSFER_TASK_SWITCH, NULL);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
 	ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl,
-					true, NULL);
+					X86_TRANSFER_TASK_SWITCH, NULL);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
 	ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl,
-					true, NULL);
+					X86_TRANSFER_TASK_SWITCH, NULL);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
 	ret = __load_segment_descriptor(ctxt, tss->fs, VCPU_SREG_FS, cpl,
-					true, NULL);
+					X86_TRANSFER_TASK_SWITCH, NULL);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
 	ret = __load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS, cpl,
-					true, NULL);
+					X86_TRANSFER_TASK_SWITCH, NULL);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
 
@@ -2739,7 +2802,6 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
 	ret = ops->read_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg,
 			    &ctxt->exception);
 	if (ret != X86EMUL_CONTINUE)
-		/* FIXME: need to provide precise fault address */
 		return ret;
 
 	save_state_to_tss32(ctxt, &tss_seg);
@@ -2748,13 +2810,11 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
 	ret = ops->write_std(ctxt, old_tss_base + eip_offset, &tss_seg.eip,
 			     ldt_sel_offset - eip_offset, &ctxt->exception);
 	if (ret != X86EMUL_CONTINUE)
-		/* FIXME: need to provide precise fault address */
 		return ret;
 
 	ret = ops->read_std(ctxt, new_tss_base, &tss_seg, sizeof tss_seg,
 			    &ctxt->exception);
 	if (ret != X86EMUL_CONTINUE)
-		/* FIXME: need to provide precise fault address */
 		return ret;
 
 	if (old_tss_sel != 0xffff) {
@@ -2765,7 +2825,6 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
 				  sizeof tss_seg.prev_task_link,
 				  &ctxt->exception);
 		if (ret != X86EMUL_CONTINUE)
-			/* FIXME: need to provide precise fault address */
 			return ret;
 	}
 
@@ -2999,15 +3058,16 @@ static int em_call_far(struct x86_emulate_ctxt *ctxt)
 	struct desc_struct old_desc, new_desc;
 	const struct x86_emulate_ops *ops = ctxt->ops;
 	int cpl = ctxt->ops->cpl(ctxt);
+	enum x86emul_mode prev_mode = ctxt->mode;
 
 	old_eip = ctxt->_eip;
 	ops->get_segment(ctxt, &old_cs, &old_desc, NULL, VCPU_SREG_CS);
 
 	memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2);
-	rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl, false,
-				       &new_desc);
+	rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl,
+				       X86_TRANSFER_CALL_JMP, &new_desc);
 	if (rc != X86EMUL_CONTINUE)
-		return X86EMUL_CONTINUE;
+		return rc;
 
 	rc = assign_eip_far(ctxt, ctxt->src.val, &new_desc);
 	if (rc != X86EMUL_CONTINUE)
@@ -3022,11 +3082,14 @@ static int em_call_far(struct x86_emulate_ctxt *ctxt)
 	rc = em_push(ctxt);
 	/* If we failed, we tainted the memory, but the very least we should
 	   restore cs */
-	if (rc != X86EMUL_CONTINUE)
+	if (rc != X86EMUL_CONTINUE) {
+		pr_warn_once("faulting far call emulation tainted memory\n");
 		goto fail;
+	}
 	return rc;
 fail:
 	ops->set_segment(ctxt, old_cs, &old_desc, 0, VCPU_SREG_CS);
+	ctxt->mode = prev_mode;
 	return rc;
 
 }
@@ -3477,6 +3540,12 @@ static int em_clflush(struct x86_emulate_ctxt *ctxt)
 	return X86EMUL_CONTINUE;
 }
 
+static int em_movsxd(struct x86_emulate_ctxt *ctxt)
+{
+	ctxt->dst.val = (s32) ctxt->src.val;
+	return X86EMUL_CONTINUE;
+}
+
 static bool valid_cr(int nr)
 {
 	switch (nr) {
@@ -3676,6 +3745,7 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt)
 #define G(_f, _g) { .flags = ((_f) | Group | ModRM), .u.group = (_g) }
 #define GD(_f, _g) { .flags = ((_f) | GroupDual | ModRM), .u.gdual = (_g) }
 #define ID(_f, _i) { .flags = ((_f) | InstrDual | ModRM), .u.idual = (_i) }
+#define MD(_f, _m) { .flags = ((_f) | ModeDual), .u.mdual = (_m) }
 #define E(_f, _e) { .flags = ((_f) | Escape | ModRM), .u.esc = (_e) }
 #define I(_f, _e) { .flags = (_f), .u.execute = (_e) }
 #define F(_f, _e) { .flags = (_f) | Fastop, .u.fastop = (_e) }
@@ -3738,7 +3808,7 @@ static const struct opcode group1[] = {
 };
 
 static const struct opcode group1A[] = {
-	I(DstMem | SrcNone | Mov | Stack, em_pop), N, N, N, N, N, N, N,
+	I(DstMem | SrcNone | Mov | Stack | IncSP, em_pop), N, N, N, N, N, N, N,
 };
 
 static const struct opcode group2[] = {
@@ -3854,7 +3924,7 @@ static const struct gprefix pfx_0f_e7 = {
 };
 
 static const struct escape escape_d9 = { {
-	N, N, N, N, N, N, N, I(DstMem, em_fnstcw),
+	N, N, N, N, N, N, N, I(DstMem16 | Mov, em_fnstcw),
 }, {
 	/* 0xC0 - 0xC7 */
 	N, N, N, N, N, N, N, N,
@@ -3896,7 +3966,7 @@ static const struct escape escape_db = { {
 } };
 
 static const struct escape escape_dd = { {
-	N, N, N, N, N, N, N, I(DstMem, em_fnstsw),
+	N, N, N, N, N, N, N, I(DstMem16 | Mov, em_fnstsw),
}, {
 	/* 0xC0 - 0xC7 */
 	N, N, N, N, N, N, N, N,
@@ -3920,6 +3990,10 @@ static const struct instr_dual instr_dual_0f_c3 = {
 	I(DstMem | SrcReg | ModRM | No16 | Mov, em_mov), N
 };
 
+static const struct mode_dual mode_dual_63 = {
+	N, I(DstReg | SrcMem32 | ModRM | Mov, em_movsxd)
+};
+
 static const struct opcode opcode_table[256] = {
 	/* 0x00 - 0x07 */
 	F6ALU(Lock, em_add),
@@ -3954,7 +4028,7 @@ static const struct opcode opcode_table[256] = {
 	/* 0x60 - 0x67 */
 	I(ImplicitOps | Stack | No64, em_pusha),
 	I(ImplicitOps | Stack | No64, em_popa),
-	N, D(DstReg | SrcMem32 | ModRM | Mov) /* movsxd (x86/64) */ ,
+	N, MD(ModRM, &mode_dual_63),
 	N, N, N, N,
 	/* 0x68 - 0x6F */
 	I(SrcImm | Mov | Stack, em_push),
@@ -4010,8 +4084,8 @@ static const struct opcode opcode_table[256] = {
 	G(ByteOp, group11), G(0, group11),
 	/* 0xC8 - 0xCF */
 	I(Stack | SrcImmU16 | Src2ImmByte, em_enter), I(Stack, em_leave),
-	I(ImplicitOps | Stack | SrcImmU16, em_ret_far_imm),
-	I(ImplicitOps | Stack, em_ret_far),
+	I(ImplicitOps | SrcImmU16, em_ret_far_imm),
+	I(ImplicitOps, em_ret_far),
 	D(ImplicitOps), DI(SrcImmByte, intn),
 	D(ImplicitOps | No64), II(ImplicitOps, em_iret, iret),
 	/* 0xD0 - 0xD7 */
@@ -4108,7 +4182,7 @@ static const struct opcode twobyte_table[256] = {
 	F(DstMem | SrcReg | Src2CL | ModRM, em_shrd),
 	GD(0, &group15), F(DstReg | SrcMem | ModRM, em_imul),
 	/* 0xB0 - 0xB7 */
-	I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_cmpxchg),
+	I2bv(DstMem | SrcReg | ModRM | Lock | PageTable | SrcWrite, em_cmpxchg),
 	I(DstReg | SrcMemFAddr | ModRM | Src2SS, em_lseg),
 	F(DstMem | SrcReg | ModRM | BitOp | Lock, em_btr),
 	I(DstReg | SrcMemFAddr | ModRM | Src2FS, em_lseg),
@@ -4174,6 +4248,8 @@ static const struct opcode opcode_map_0f_38[256] = {
 #undef I
 #undef GP
 #undef EXT
+#undef MD
+#undef ID
 
 #undef D2bv
 #undef D2bvIP
@@ -4563,6 +4639,12 @@ done_prefixes:
 			else
 				opcode = opcode.u.idual->mod012;
 			break;
+		case ModeDual:
+			if (ctxt->mode == X86EMUL_MODE_PROT64)
+				opcode = opcode.u.mdual->mode64;
+			else
+				opcode = opcode.u.mdual->mode32;
+			break;
 		default:
 			return EMULATION_FAILED;
 		}
@@ -4860,8 +4942,13 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
 		/* optimisation - avoid slow emulated read if Mov */
 		rc = segmented_read(ctxt, ctxt->dst.addr.mem,
 				   &ctxt->dst.val, ctxt->dst.bytes);
-		if (rc != X86EMUL_CONTINUE)
+		if (rc != X86EMUL_CONTINUE) {
+			if (!(ctxt->d & NoWrite) &&
+			    rc == X86EMUL_PROPAGATE_FAULT &&
+			    ctxt->exception.vector == PF_VECTOR)
+				ctxt->exception.error_code |= PFERR_WRITE_MASK;
 			goto done;
+		}
 	}
 	ctxt->dst.orig_val = ctxt->dst.val;
 
@@ -4899,11 +4986,6 @@ special_insn:
 		goto threebyte_insn;
 
 	switch (ctxt->b) {
-	case 0x63:		/* movsxd */
-		if (ctxt->mode != X86EMUL_MODE_PROT64)
-			goto cannot_emulate;
-		ctxt->dst.val = (s32) ctxt->src.val;
-		break;
 	case 0x70 ... 0x7f: /* jcc (short) */
 		if (test_cc(ctxt->b, ctxt->eflags))
 			rc = jmp_rel(ctxt, ctxt->src.val);
diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h
index 3c9195535ffc..c2e36d934af4 100644
--- a/arch/x86/kvm/ioapic.h
+++ b/arch/x86/kvm/ioapic.h
@@ -98,7 +98,7 @@ static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm)
 }
 
 void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu);
-int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
+bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
 		int short_hand, unsigned int dest, int dest_mode);
 int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2);
 void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector,
diff --git a/arch/x86/kvm/iommu.c b/arch/x86/kvm/iommu.c
index 17b73eeac8a4..7dbced309ddb 100644
--- a/arch/x86/kvm/iommu.c
+++ b/arch/x86/kvm/iommu.c
@@ -138,7 +138,7 @@ int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
 
 		gfn += page_size >> PAGE_SHIFT;
 
-
+		cond_resched();
 	}
 
 	return 0;
@@ -306,6 +306,8 @@ static void kvm_iommu_put_pages(struct kvm *kvm,
 		kvm_unpin_pages(kvm, pfn, unmap_pages);
 
 		gfn += unmap_pages;
+
+		cond_resched();
 	}
 }
 
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index d52dcf0776ea..e55b5fc344eb 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -33,6 +33,7 @@
 #include <asm/page.h>
 #include <asm/current.h>
 #include <asm/apicdef.h>
+#include <asm/delay.h>
 #include <linux/atomic.h>
 #include <linux/jump_label.h>
 #include "kvm_cache_regs.h"
@@ -327,17 +328,24 @@ static u8 count_vectors(void *bitmap)
 	return count;
 }
 
-void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir)
+void __kvm_apic_update_irr(u32 *pir, void *regs)
 {
 	u32 i, pir_val;
-	struct kvm_lapic *apic = vcpu->arch.apic;
 
 	for (i = 0; i <= 7; i++) {
 		pir_val = xchg(&pir[i], 0);
 		if (pir_val)
-			*((u32 *)(apic->regs + APIC_IRR + i * 0x10)) |= pir_val;
+			*((u32 *)(regs + APIC_IRR + i * 0x10)) |= pir_val;
 	}
 }
+EXPORT_SYMBOL_GPL(__kvm_apic_update_irr);
+
+void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+
+	__kvm_apic_update_irr(pir, apic->regs);
+}
 EXPORT_SYMBOL_GPL(kvm_apic_update_irr);
 
 static inline void apic_set_irr(int vec, struct kvm_lapic *apic)
@@ -405,7 +413,7 @@ static inline void apic_set_isr(int vec, struct kvm_lapic *apic)
 	 * because the processor can modify ISR under the hood. Instead
 	 * just set SVI.
 	 */
-	if (unlikely(kvm_apic_vid_enabled(vcpu->kvm)))
+	if (unlikely(kvm_x86_ops->hwapic_isr_update))
 		kvm_x86_ops->hwapic_isr_update(vcpu->kvm, vec);
 	else {
 		++apic->isr_count;
@@ -453,7 +461,7 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic)
 	 * on the other hand isr_count and highest_isr_cache are unused
 	 * and must be left alone.
 	 */
-	if (unlikely(kvm_apic_vid_enabled(vcpu->kvm)))
+	if (unlikely(kvm_x86_ops->hwapic_isr_update))
 		kvm_x86_ops->hwapic_isr_update(vcpu->kvm,
 					       apic_find_highest_isr(apic));
 	else {
@@ -580,55 +588,48 @@ static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr)
 		apic_update_ppr(apic);
 }
 
-static int kvm_apic_broadcast(struct kvm_lapic *apic, u32 dest)
+static bool kvm_apic_broadcast(struct kvm_lapic *apic, u32 dest)
 {
 	return dest == (apic_x2apic_mode(apic) ?
 			X2APIC_BROADCAST : APIC_BROADCAST);
 }
 
-int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 dest)
+static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 dest)
 {
 	return kvm_apic_id(apic) == dest || kvm_apic_broadcast(apic, dest);
 }
 
-int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda)
+static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda)
 {
-	int result = 0;
 	u32 logical_id;
 
 	if (kvm_apic_broadcast(apic, mda))
-		return 1;
+		return true;
 
-	if (apic_x2apic_mode(apic)) {
-		logical_id = kvm_apic_get_reg(apic, APIC_LDR);
-		return logical_id & mda;
-	}
+	logical_id = kvm_apic_get_reg(apic, APIC_LDR);
 
-	logical_id = GET_APIC_LOGICAL_ID(kvm_apic_get_reg(apic, APIC_LDR));
+	if (apic_x2apic_mode(apic))
+		return ((logical_id >> 16) == (mda >> 16))
+		       && (logical_id & mda & 0xffff) != 0;
+
+	logical_id = GET_APIC_LOGICAL_ID(logical_id);
 
 	switch (kvm_apic_get_reg(apic, APIC_DFR)) {
 	case APIC_DFR_FLAT:
-		if (logical_id & mda)
-			result = 1;
-		break;
+		return (logical_id & mda) != 0;
 	case APIC_DFR_CLUSTER:
-		if (((logical_id >> 4) == (mda >> 0x4))
-		    && (logical_id & mda & 0xf))
-			result = 1;
-		break;
+		return ((logical_id >> 4) == (mda >> 4))
+		       && (logical_id & mda & 0xf) != 0;
 	default:
 		apic_debug("Bad DFR vcpu %d: %08x\n",
 			   apic->vcpu->vcpu_id, kvm_apic_get_reg(apic, APIC_DFR));
-		break;
+		return false;
 	}
-
-	return result;
 }
 
-int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
+bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
 			int short_hand, unsigned int dest, int dest_mode)
 {
-	int result = 0;
 	struct kvm_lapic *target = vcpu->arch.apic;
 
 	apic_debug("target %p, source %p, dest 0x%x, "
@@ -638,29 +639,21 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
 	ASSERT(target);
 	switch (short_hand) {
 	case APIC_DEST_NOSHORT:
-		if (dest_mode == 0)
-			/* Physical mode. */
-			result = kvm_apic_match_physical_addr(target, dest);
+		if (dest_mode == APIC_DEST_PHYSICAL)
+			return kvm_apic_match_physical_addr(target, dest);
 		else
-			/* Logical mode. */
-			result = kvm_apic_match_logical_addr(target, dest);
-		break;
+			return kvm_apic_match_logical_addr(target, dest);
 	case APIC_DEST_SELF:
-		result = (target == source);
-		break;
+		return target == source;
 	case APIC_DEST_ALLINC:
-		result = 1;
-		break;
+		return true;
 	case APIC_DEST_ALLBUT:
-		result = (target != source);
-		break;
+		return target != source;
 	default:
 		apic_debug("kvm: apic: Bad dest shorthand value %x\n",
 			   short_hand);
-		break;
+		return false;
 	}
-
-	return result;
 }
 
 bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
@@ -693,7 +686,7 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
 
 	ret = true;
 
-	if (irq->dest_mode == 0) { /* physical mode */
+	if (irq->dest_mode == APIC_DEST_PHYSICAL) {
 		if (irq->dest_id >= ARRAY_SIZE(map->phys_map))
 			goto out;
 
@@ -1076,25 +1069,72 @@ static void apic_timer_expired(struct kvm_lapic *apic)
 {
 	struct kvm_vcpu *vcpu = apic->vcpu;
 	wait_queue_head_t *q = &vcpu->wq;
+	struct kvm_timer *ktimer = &apic->lapic_timer;
 
-	/*
-	 * Note: KVM_REQ_PENDING_TIMER is implicitly checked in
-	 * vcpu_enter_guest.
-	 */
 	if (atomic_read(&apic->lapic_timer.pending))
 		return;
 
 	atomic_inc(&apic->lapic_timer.pending);
-	/* FIXME: this code should not know anything about vcpus */
-	kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
+	kvm_set_pending_timer(vcpu);
 
 	if (waitqueue_active(q))
 		wake_up_interruptible(q);
+
+	if (apic_lvtt_tscdeadline(apic))
+		ktimer->expired_tscdeadline = ktimer->tscdeadline;
+}
+
+/*
+ * On APICv, this test will cause a busy wait
+ * during a higher-priority task.
+ */
+
+static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+	u32 reg = kvm_apic_get_reg(apic, APIC_LVTT);
+
+	if (kvm_apic_hw_enabled(apic)) {
+		int vec = reg & APIC_VECTOR_MASK;
+		void *bitmap = apic->regs + APIC_ISR;
+
+		if (kvm_x86_ops->deliver_posted_interrupt)
+			bitmap = apic->regs + APIC_IRR;
+
+		if (apic_test_vector(vec, bitmap))
+			return true;
+	}
+	return false;
+}
+
+void wait_lapic_expire(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+	u64 guest_tsc, tsc_deadline;
+
+	if (!kvm_vcpu_has_lapic(vcpu))
+		return;
+
+	if (apic->lapic_timer.expired_tscdeadline == 0)
+		return;
+
+	if (!lapic_timer_int_injected(vcpu))
+		return;
+
+	tsc_deadline = apic->lapic_timer.expired_tscdeadline;
+	apic->lapic_timer.expired_tscdeadline = 0;
+	guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc());
+	trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline);
+
+	/* __delay is delay_tsc whenever the hardware has TSC, thus always. */
+	if (guest_tsc < tsc_deadline)
+		__delay(tsc_deadline - guest_tsc);
 }
 
 static void start_apic_timer(struct kvm_lapic *apic)
 {
 	ktime_t now;
+
 	atomic_set(&apic->lapic_timer.pending, 0);
 
 	if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) {
@@ -1140,6 +1180,7 @@ static void start_apic_timer(struct kvm_lapic *apic)
 		/* lapic timer in tsc deadline mode */
 		u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline;
 		u64 ns = 0;
+		ktime_t expire;
 		struct kvm_vcpu *vcpu = apic->vcpu;
 		unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz;
 		unsigned long flags;
@@ -1154,8 +1195,10 @@ static void start_apic_timer(struct kvm_lapic *apic)
 		if (likely(tscdeadline > guest_tsc)) {
 			ns = (tscdeadline - guest_tsc) * 1000000ULL;
 			do_div(ns, this_tsc_khz);
+			expire = ktime_add_ns(now, ns);
+			expire = ktime_sub_ns(expire, lapic_timer_advance_ns);
 			hrtimer_start(&apic->lapic_timer.timer,
-				ktime_add_ns(now, ns), HRTIMER_MODE_ABS);
+				      expire, HRTIMER_MODE_ABS);
 		} else
 			apic_timer_expired(apic);
 
@@ -1745,7 +1788,9 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
 	if (kvm_x86_ops->hwapic_irr_update)
 		kvm_x86_ops->hwapic_irr_update(vcpu,
 				apic_find_highest_irr(apic));
-	kvm_x86_ops->hwapic_isr_update(vcpu->kvm, apic_find_highest_isr(apic));
+	if (unlikely(kvm_x86_ops->hwapic_isr_update))
+		kvm_x86_ops->hwapic_isr_update(vcpu->kvm,
+				apic_find_highest_isr(apic));
 	kvm_make_request(KVM_REQ_EVENT, vcpu);
 	kvm_rtc_eoi_tracking_restore_one(vcpu);
 }
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index c674fce53cf9..0bc6c656625b 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -14,6 +14,7 @@ struct kvm_timer {
 	u32 timer_mode;
 	u32 timer_mode_mask;
 	u64 tscdeadline;
+	u64 expired_tscdeadline;
 	atomic_t pending;	/* accumulated triggered timers */
 };
 
@@ -56,9 +57,8 @@ u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
 void kvm_apic_set_version(struct kvm_vcpu *vcpu);
 
 void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr);
+void __kvm_apic_update_irr(u32 *pir, void *regs);
 void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir);
-int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 dest);
-int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda);
 int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
 		unsigned long *dest_map);
 int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type);
@@ -170,4 +170,6 @@ static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu)
 
 bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector);
 
+void wait_lapic_expire(struct kvm_vcpu *vcpu);
+
 #endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index f83fc6c5e0ba..cee759299a35 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -63,30 +63,16 @@ enum {
63#undef MMU_DEBUG 63#undef MMU_DEBUG
64 64
65#ifdef MMU_DEBUG 65#ifdef MMU_DEBUG
66static bool dbg = 0;
67module_param(dbg, bool, 0644);
66 68
67#define pgprintk(x...) do { if (dbg) printk(x); } while (0) 69#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
68#define rmap_printk(x...) do { if (dbg) printk(x); } while (0) 70#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
69 71#define MMU_WARN_ON(x) WARN_ON(x)
70#else 72#else
71
72#define pgprintk(x...) do { } while (0) 73#define pgprintk(x...) do { } while (0)
73#define rmap_printk(x...) do { } while (0) 74#define rmap_printk(x...) do { } while (0)
74 75#define MMU_WARN_ON(x) do { } while (0)
75#endif
76
77#ifdef MMU_DEBUG
78static bool dbg = 0;
79module_param(dbg, bool, 0644);
80#endif
81
82#ifndef MMU_DEBUG
83#define ASSERT(x) do { } while (0)
84#else
85#define ASSERT(x) \
86 if (!(x)) { \
87 printk(KERN_WARNING "assertion failed %s:%d: %s\n", \
88 __FILE__, __LINE__, #x); \
89 }
90#endif 76#endif
91 77
92#define PTE_PREFETCH_NUM 8 78#define PTE_PREFETCH_NUM 8
@@ -546,6 +532,11 @@ static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask)
546 return (old_spte & bit_mask) && !(new_spte & bit_mask); 532 return (old_spte & bit_mask) && !(new_spte & bit_mask);
547} 533}
548 534
535static bool spte_is_bit_changed(u64 old_spte, u64 new_spte, u64 bit_mask)
536{
537 return (old_spte & bit_mask) != (new_spte & bit_mask);
538}
539
549/* Rules for using mmu_spte_set: 540/* Rules for using mmu_spte_set:
550 * Set the sptep from nonpresent to present. 541 * Set the sptep from nonpresent to present.
551 * Note: the sptep being assigned *must* be either not present 542 * Note: the sptep being assigned *must* be either not present
@@ -596,6 +587,14 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
596 if (!shadow_accessed_mask) 587 if (!shadow_accessed_mask)
597 return ret; 588 return ret;
598 589
590 /*
591 * Flush TLB when accessed/dirty bits are changed in the page tables,
592 * to guarantee consistency between TLB and page tables.
593 */
594 if (spte_is_bit_changed(old_spte, new_spte,
595 shadow_accessed_mask | shadow_dirty_mask))
596 ret = true;
597
599 if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask)) 598 if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask))
600 kvm_set_pfn_accessed(spte_to_pfn(old_spte)); 599 kvm_set_pfn_accessed(spte_to_pfn(old_spte));
601 if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask)) 600 if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask))
@@ -1216,6 +1215,60 @@ static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
1216 return flush; 1215 return flush;
1217} 1216}
1218 1217
1218static bool spte_clear_dirty(struct kvm *kvm, u64 *sptep)
1219{
1220 u64 spte = *sptep;
1221
1222 rmap_printk("rmap_clear_dirty: spte %p %llx\n", sptep, *sptep);
1223
1224 spte &= ~shadow_dirty_mask;
1225
1226 return mmu_spte_update(sptep, spte);
1227}
1228
1229static bool __rmap_clear_dirty(struct kvm *kvm, unsigned long *rmapp)
1230{
1231 u64 *sptep;
1232 struct rmap_iterator iter;
1233 bool flush = false;
1234
1235 for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
1236 BUG_ON(!(*sptep & PT_PRESENT_MASK));
1237
1238 flush |= spte_clear_dirty(kvm, sptep);
1239 sptep = rmap_get_next(&iter);
1240 }
1241
1242 return flush;
1243}
1244
1245static bool spte_set_dirty(struct kvm *kvm, u64 *sptep)
1246{
1247 u64 spte = *sptep;
1248
1249 rmap_printk("rmap_set_dirty: spte %p %llx\n", sptep, *sptep);
1250
1251 spte |= shadow_dirty_mask;
1252
1253 return mmu_spte_update(sptep, spte);
1254}
1255
1256static bool __rmap_set_dirty(struct kvm *kvm, unsigned long *rmapp)
1257{
1258 u64 *sptep;
1259 struct rmap_iterator iter;
1260 bool flush = false;
1261
1262 for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
1263 BUG_ON(!(*sptep & PT_PRESENT_MASK));
1264
1265 flush |= spte_set_dirty(kvm, sptep);
1266 sptep = rmap_get_next(&iter);
1267 }
1268
1269 return flush;
1270}
1271
1219/** 1272/**
1220 * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages 1273 * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages
1221 * @kvm: kvm instance 1274 * @kvm: kvm instance
@@ -1226,7 +1279,7 @@ static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
1226 * Used when we do not need to care about huge page mappings: e.g. during dirty 1279 * Used when we do not need to care about huge page mappings: e.g. during dirty
1227 * logging we do not have any such mappings. 1280 * logging we do not have any such mappings.
1228 */ 1281 */
1229void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, 1282static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
1230 struct kvm_memory_slot *slot, 1283 struct kvm_memory_slot *slot,
1231 gfn_t gfn_offset, unsigned long mask) 1284 gfn_t gfn_offset, unsigned long mask)
1232{ 1285{
@@ -1242,6 +1295,53 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
1242 } 1295 }
1243} 1296}
1244 1297
1298/**
1299 * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages
1300 * @kvm: kvm instance
1301 * @slot: slot to clear D-bit
1302 * @gfn_offset: start of the BITS_PER_LONG pages we care about
1303 * @mask: indicates which pages we should clear D-bit
1304 *
1305 * Used for PML to re-log the dirty GPAs after userspace querying dirty_bitmap.
1306 */
1307void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1308 struct kvm_memory_slot *slot,
1309 gfn_t gfn_offset, unsigned long mask)
1310{
1311 unsigned long *rmapp;
1312
1313 while (mask) {
1314 rmapp = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
1315 PT_PAGE_TABLE_LEVEL, slot);
1316 __rmap_clear_dirty(kvm, rmapp);
1317
1318 /* clear the first set bit */
1319 mask &= mask - 1;
1320 }
1321}
1322EXPORT_SYMBOL_GPL(kvm_mmu_clear_dirty_pt_masked);
1323
1324/**
1325 * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
1326 * PT level pages.
1327 *
1328 * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
1329 * enable dirty logging for them.
1330 *
1331 * Used when we do not need to care about huge page mappings: e.g. during dirty
1332 * logging we do not have any such mappings.
1333 */
1334void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
1335 struct kvm_memory_slot *slot,
1336 gfn_t gfn_offset, unsigned long mask)
1337{
1338 if (kvm_x86_ops->enable_log_dirty_pt_masked)
1339 kvm_x86_ops->enable_log_dirty_pt_masked(kvm, slot, gfn_offset,
1340 mask);
1341 else
1342 kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
1343}
1344
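kvm_arch_mmu_enable_log_dirty_pt_masked is a thin dispatcher: when the vendor module provides enable_log_dirty_pt_masked (PML on VMX), dirty pages only need their D-bit cleared; otherwise the pages are write-protected so that the first write faults and gets logged. A small sketch of that dispatch-with-fallback shape (the struct and function names here are illustrative, not the kernel's):

#include <stdio.h>

struct dirty_log_ops {
        void (*enable_log_dirty_pt_masked)(unsigned long gfn, unsigned long mask);
};

static void write_protect_pt_masked(unsigned long gfn, unsigned long mask)
{
        printf("write-protect gfn window 0x%lx, mask 0x%lx\n", gfn, mask);
}

static void enable_log_dirty(const struct dirty_log_ops *ops,
                             unsigned long gfn, unsigned long mask)
{
        if (ops->enable_log_dirty_pt_masked)
                ops->enable_log_dirty_pt_masked(gfn, mask);   /* e.g. PML: clear D-bits */
        else
                write_protect_pt_masked(gfn, mask);           /* fallback: fault on write */
}

int main(void)
{
        struct dirty_log_ops no_pml = { 0 };

        enable_log_dirty(&no_pml, 0x1000, 0x3);
        return 0;
}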
1245static bool rmap_write_protect(struct kvm *kvm, u64 gfn) 1345static bool rmap_write_protect(struct kvm *kvm, u64 gfn)
1246{ 1346{
1247 struct kvm_memory_slot *slot; 1347 struct kvm_memory_slot *slot;
@@ -1536,7 +1636,7 @@ static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr)
1536 1636
1537static void kvm_mmu_free_page(struct kvm_mmu_page *sp) 1637static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
1538{ 1638{
1539 ASSERT(is_empty_shadow_page(sp->spt)); 1639 MMU_WARN_ON(!is_empty_shadow_page(sp->spt));
1540 hlist_del(&sp->hash_link); 1640 hlist_del(&sp->hash_link);
1541 list_del(&sp->link); 1641 list_del(&sp->link);
1542 free_page((unsigned long)sp->spt); 1642 free_page((unsigned long)sp->spt);
@@ -2501,8 +2601,10 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2501 } 2601 }
2502 } 2602 }
2503 2603
2504 if (pte_access & ACC_WRITE_MASK) 2604 if (pte_access & ACC_WRITE_MASK) {
2505 mark_page_dirty(vcpu->kvm, gfn); 2605 mark_page_dirty(vcpu->kvm, gfn);
2606 spte |= shadow_dirty_mask;
2607 }
2506 2608
2507set_pte: 2609set_pte:
2508 if (mmu_spte_update(sptep, spte)) 2610 if (mmu_spte_update(sptep, spte))
@@ -2818,6 +2920,18 @@ fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
2818 */ 2920 */
2819 gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); 2921 gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
2820 2922
2923 /*
 2924 * Theoretically we could also set the dirty bit (and flush the TLB) here
 2925 * in order to eliminate unnecessary PML logging. See the comments in
 2926 * set_spte. But fast_page_fault is very unlikely to happen with PML
 2927 * enabled, so we do not do this. This might result in the same GPA
 2928 * being logged in the PML buffer again when the write really happens,
 2929 * and eventually in mark_page_dirty being called for it twice. But that
 2930 * is harmless. It also avoids the TLB flush needed after setting the
 2931 * dirty bit, so non-PML cases won't be impacted.
2932 *
2933 * Compare with set_spte where instead shadow_dirty_mask is set.
2934 */
2821 if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) == spte) 2935 if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) == spte)
2822 mark_page_dirty(vcpu->kvm, gfn); 2936 mark_page_dirty(vcpu->kvm, gfn);
2823 2937
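The cmpxchg64 in fast_pf_fix_direct_spte only makes the spte writable if it still holds the value read earlier, so a concurrent update simply makes the fast path give up. A userspace sketch of that compare-and-swap pattern using C11 atomics (the bit layout and names are illustrative, not KVM's):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define SPTE_WRITABLE (1ULL << 1)   /* stand-in for PT_WRITABLE_MASK */

/* Set the writable bit only if the spte still holds the value we read
 * earlier; if it changed under us, report failure so the caller can bail. */
static int fix_spte_writable(_Atomic uint64_t *sptep, uint64_t expected)
{
        uint64_t desired = expected | SPTE_WRITABLE;

        return atomic_compare_exchange_strong(sptep, &expected, desired);
}

int main(void)
{
        _Atomic uint64_t spte = 0x80000000000000f1ULL;
        uint64_t snapshot = atomic_load(&spte);

        if (fix_spte_writable(&spte, snapshot))
                printf("spte is now 0x%llx -> would call mark_page_dirty()\n",
                       (unsigned long long)atomic_load(&spte));
        else
                printf("spte changed concurrently, nothing to log\n");
        return 0;
}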
@@ -3041,7 +3155,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
3041 for (i = 0; i < 4; ++i) { 3155 for (i = 0; i < 4; ++i) {
3042 hpa_t root = vcpu->arch.mmu.pae_root[i]; 3156 hpa_t root = vcpu->arch.mmu.pae_root[i];
3043 3157
3044 ASSERT(!VALID_PAGE(root)); 3158 MMU_WARN_ON(VALID_PAGE(root));
3045 spin_lock(&vcpu->kvm->mmu_lock); 3159 spin_lock(&vcpu->kvm->mmu_lock);
3046 make_mmu_pages_available(vcpu); 3160 make_mmu_pages_available(vcpu);
3047 sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT), 3161 sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT),
@@ -3079,7 +3193,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
3079 if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { 3193 if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
3080 hpa_t root = vcpu->arch.mmu.root_hpa; 3194 hpa_t root = vcpu->arch.mmu.root_hpa;
3081 3195
3082 ASSERT(!VALID_PAGE(root)); 3196 MMU_WARN_ON(VALID_PAGE(root));
3083 3197
3084 spin_lock(&vcpu->kvm->mmu_lock); 3198 spin_lock(&vcpu->kvm->mmu_lock);
3085 make_mmu_pages_available(vcpu); 3199 make_mmu_pages_available(vcpu);
@@ -3104,7 +3218,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
3104 for (i = 0; i < 4; ++i) { 3218 for (i = 0; i < 4; ++i) {
3105 hpa_t root = vcpu->arch.mmu.pae_root[i]; 3219 hpa_t root = vcpu->arch.mmu.pae_root[i];
3106 3220
3107 ASSERT(!VALID_PAGE(root)); 3221 MMU_WARN_ON(VALID_PAGE(root));
3108 if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { 3222 if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
3109 pdptr = vcpu->arch.mmu.get_pdptr(vcpu, i); 3223 pdptr = vcpu->arch.mmu.get_pdptr(vcpu, i);
3110 if (!is_present_gpte(pdptr)) { 3224 if (!is_present_gpte(pdptr)) {
@@ -3329,8 +3443,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
3329 if (r) 3443 if (r)
3330 return r; 3444 return r;
3331 3445
3332 ASSERT(vcpu); 3446 MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
3333 ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
3334 3447
3335 gfn = gva >> PAGE_SHIFT; 3448 gfn = gva >> PAGE_SHIFT;
3336 3449
@@ -3396,8 +3509,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
3396 int write = error_code & PFERR_WRITE_MASK; 3509 int write = error_code & PFERR_WRITE_MASK;
3397 bool map_writable; 3510 bool map_writable;
3398 3511
3399 ASSERT(vcpu); 3512 MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
3400 ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
3401 3513
3402 if (unlikely(error_code & PFERR_RSVD_MASK)) { 3514 if (unlikely(error_code & PFERR_RSVD_MASK)) {
3403 r = handle_mmio_page_fault(vcpu, gpa, error_code, true); 3515 r = handle_mmio_page_fault(vcpu, gpa, error_code, true);
@@ -3718,7 +3830,7 @@ static void paging64_init_context_common(struct kvm_vcpu *vcpu,
3718 update_permission_bitmask(vcpu, context, false); 3830 update_permission_bitmask(vcpu, context, false);
3719 update_last_pte_bitmap(vcpu, context); 3831 update_last_pte_bitmap(vcpu, context);
3720 3832
3721 ASSERT(is_pae(vcpu)); 3833 MMU_WARN_ON(!is_pae(vcpu));
3722 context->page_fault = paging64_page_fault; 3834 context->page_fault = paging64_page_fault;
3723 context->gva_to_gpa = paging64_gva_to_gpa; 3835 context->gva_to_gpa = paging64_gva_to_gpa;
3724 context->sync_page = paging64_sync_page; 3836 context->sync_page = paging64_sync_page;
@@ -3763,7 +3875,7 @@ static void paging32E_init_context(struct kvm_vcpu *vcpu,
3763 3875
3764static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) 3876static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
3765{ 3877{
3766 struct kvm_mmu *context = vcpu->arch.walk_mmu; 3878 struct kvm_mmu *context = &vcpu->arch.mmu;
3767 3879
3768 context->base_role.word = 0; 3880 context->base_role.word = 0;
3769 context->page_fault = tdp_page_fault; 3881 context->page_fault = tdp_page_fault;
@@ -3803,11 +3915,12 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
3803 update_last_pte_bitmap(vcpu, context); 3915 update_last_pte_bitmap(vcpu, context);
3804} 3916}
3805 3917
3806void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context) 3918void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu)
3807{ 3919{
3808 bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP); 3920 bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
3809 ASSERT(vcpu); 3921 struct kvm_mmu *context = &vcpu->arch.mmu;
3810 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); 3922
3923 MMU_WARN_ON(VALID_PAGE(context->root_hpa));
3811 3924
3812 if (!is_paging(vcpu)) 3925 if (!is_paging(vcpu))
3813 nonpaging_init_context(vcpu, context); 3926 nonpaging_init_context(vcpu, context);
@@ -3818,19 +3931,19 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
3818 else 3931 else
3819 paging32_init_context(vcpu, context); 3932 paging32_init_context(vcpu, context);
3820 3933
3821 vcpu->arch.mmu.base_role.nxe = is_nx(vcpu); 3934 context->base_role.nxe = is_nx(vcpu);
3822 vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu); 3935 context->base_role.cr4_pae = !!is_pae(vcpu);
3823 vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu); 3936 context->base_role.cr0_wp = is_write_protection(vcpu);
3824 vcpu->arch.mmu.base_role.smep_andnot_wp 3937 context->base_role.smep_andnot_wp
3825 = smep && !is_write_protection(vcpu); 3938 = smep && !is_write_protection(vcpu);
3826} 3939}
3827EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); 3940EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
3828 3941
3829void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context, 3942void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly)
3830 bool execonly)
3831{ 3943{
3832 ASSERT(vcpu); 3944 struct kvm_mmu *context = &vcpu->arch.mmu;
3833 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); 3945
3946 MMU_WARN_ON(VALID_PAGE(context->root_hpa));
3834 3947
3835 context->shadow_root_level = kvm_x86_ops->get_tdp_level(); 3948 context->shadow_root_level = kvm_x86_ops->get_tdp_level();
3836 3949
@@ -3851,11 +3964,13 @@ EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
3851 3964
3852static void init_kvm_softmmu(struct kvm_vcpu *vcpu) 3965static void init_kvm_softmmu(struct kvm_vcpu *vcpu)
3853{ 3966{
3854 kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu); 3967 struct kvm_mmu *context = &vcpu->arch.mmu;
3855 vcpu->arch.walk_mmu->set_cr3 = kvm_x86_ops->set_cr3; 3968
3856 vcpu->arch.walk_mmu->get_cr3 = get_cr3; 3969 kvm_init_shadow_mmu(vcpu);
3857 vcpu->arch.walk_mmu->get_pdptr = kvm_pdptr_read; 3970 context->set_cr3 = kvm_x86_ops->set_cr3;
3858 vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; 3971 context->get_cr3 = get_cr3;
3972 context->get_pdptr = kvm_pdptr_read;
3973 context->inject_page_fault = kvm_inject_page_fault;
3859} 3974}
3860 3975
3861static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) 3976static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
@@ -3900,17 +4015,15 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
3900static void init_kvm_mmu(struct kvm_vcpu *vcpu) 4015static void init_kvm_mmu(struct kvm_vcpu *vcpu)
3901{ 4016{
3902 if (mmu_is_nested(vcpu)) 4017 if (mmu_is_nested(vcpu))
3903 return init_kvm_nested_mmu(vcpu); 4018 init_kvm_nested_mmu(vcpu);
3904 else if (tdp_enabled) 4019 else if (tdp_enabled)
3905 return init_kvm_tdp_mmu(vcpu); 4020 init_kvm_tdp_mmu(vcpu);
3906 else 4021 else
3907 return init_kvm_softmmu(vcpu); 4022 init_kvm_softmmu(vcpu);
3908} 4023}
3909 4024
3910void kvm_mmu_reset_context(struct kvm_vcpu *vcpu) 4025void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
3911{ 4026{
3912 ASSERT(vcpu);
3913
3914 kvm_mmu_unload(vcpu); 4027 kvm_mmu_unload(vcpu);
3915 init_kvm_mmu(vcpu); 4028 init_kvm_mmu(vcpu);
3916} 4029}
@@ -4266,8 +4379,6 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
4266 struct page *page; 4379 struct page *page;
4267 int i; 4380 int i;
4268 4381
4269 ASSERT(vcpu);
4270
4271 /* 4382 /*
4272 * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64. 4383 * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
4273 * Therefore we need to allocate shadow page tables in the first 4384 * Therefore we need to allocate shadow page tables in the first
@@ -4286,8 +4397,6 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
4286 4397
4287int kvm_mmu_create(struct kvm_vcpu *vcpu) 4398int kvm_mmu_create(struct kvm_vcpu *vcpu)
4288{ 4399{
4289 ASSERT(vcpu);
4290
4291 vcpu->arch.walk_mmu = &vcpu->arch.mmu; 4400 vcpu->arch.walk_mmu = &vcpu->arch.mmu;
4292 vcpu->arch.mmu.root_hpa = INVALID_PAGE; 4401 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
4293 vcpu->arch.mmu.translate_gpa = translate_gpa; 4402 vcpu->arch.mmu.translate_gpa = translate_gpa;
@@ -4298,19 +4407,18 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
4298 4407
4299void kvm_mmu_setup(struct kvm_vcpu *vcpu) 4408void kvm_mmu_setup(struct kvm_vcpu *vcpu)
4300{ 4409{
4301 ASSERT(vcpu); 4410 MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa));
4302 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
4303 4411
4304 init_kvm_mmu(vcpu); 4412 init_kvm_mmu(vcpu);
4305} 4413}
4306 4414
4307void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) 4415void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
4416 struct kvm_memory_slot *memslot)
4308{ 4417{
4309 struct kvm_memory_slot *memslot;
4310 gfn_t last_gfn; 4418 gfn_t last_gfn;
4311 int i; 4419 int i;
4420 bool flush = false;
4312 4421
4313 memslot = id_to_memslot(kvm->memslots, slot);
4314 last_gfn = memslot->base_gfn + memslot->npages - 1; 4422 last_gfn = memslot->base_gfn + memslot->npages - 1;
4315 4423
4316 spin_lock(&kvm->mmu_lock); 4424 spin_lock(&kvm->mmu_lock);
@@ -4325,7 +4433,8 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
4325 4433
4326 for (index = 0; index <= last_index; ++index, ++rmapp) { 4434 for (index = 0; index <= last_index; ++index, ++rmapp) {
4327 if (*rmapp) 4435 if (*rmapp)
4328 __rmap_write_protect(kvm, rmapp, false); 4436 flush |= __rmap_write_protect(kvm, rmapp,
4437 false);
4329 4438
4330 if (need_resched() || spin_needbreak(&kvm->mmu_lock)) 4439 if (need_resched() || spin_needbreak(&kvm->mmu_lock))
4331 cond_resched_lock(&kvm->mmu_lock); 4440 cond_resched_lock(&kvm->mmu_lock);
@@ -4352,8 +4461,124 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
4352 * instead of PT_WRITABLE_MASK, that means it does not depend 4461 * instead of PT_WRITABLE_MASK, that means it does not depend
4353 * on PT_WRITABLE_MASK anymore. 4462 * on PT_WRITABLE_MASK anymore.
4354 */ 4463 */
4355 kvm_flush_remote_tlbs(kvm); 4464 if (flush)
4465 kvm_flush_remote_tlbs(kvm);
4466}
4467
4468void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
4469 struct kvm_memory_slot *memslot)
4470{
4471 gfn_t last_gfn;
4472 unsigned long *rmapp;
4473 unsigned long last_index, index;
4474 bool flush = false;
4475
4476 last_gfn = memslot->base_gfn + memslot->npages - 1;
4477
4478 spin_lock(&kvm->mmu_lock);
4479
4480 rmapp = memslot->arch.rmap[PT_PAGE_TABLE_LEVEL - 1];
4481 last_index = gfn_to_index(last_gfn, memslot->base_gfn,
4482 PT_PAGE_TABLE_LEVEL);
4483
4484 for (index = 0; index <= last_index; ++index, ++rmapp) {
4485 if (*rmapp)
4486 flush |= __rmap_clear_dirty(kvm, rmapp);
4487
4488 if (need_resched() || spin_needbreak(&kvm->mmu_lock))
4489 cond_resched_lock(&kvm->mmu_lock);
4490 }
4491
4492 spin_unlock(&kvm->mmu_lock);
4493
4494 lockdep_assert_held(&kvm->slots_lock);
4495
4496 /*
 4497 * It's also safe to flush TLBs outside of the mmu lock here: this
 4498 * function is currently only used for dirty logging, and flushing the
 4499 * TLB outside of the lock still guarantees that no dirty pages are
 4500 * lost from the dirty_bitmap.
4501 */
4502 if (flush)
4503 kvm_flush_remote_tlbs(kvm);
4504}
4505EXPORT_SYMBOL_GPL(kvm_mmu_slot_leaf_clear_dirty);
4506
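kvm_mmu_slot_leaf_clear_dirty accumulates a single flush flag while it walks the rmaps under mmu_lock and issues one remote TLB flush afterwards, only if some spte actually changed. A condensed sketch of that accumulate-then-flush pattern; the D-bit position and flush_tlbs() are stand-ins, not kernel definitions:

#include <stdbool.h>
#include <stdio.h>

/* Pretend bit 6 is the dirty bit; return whether the spte changed. */
static bool clear_dirty_one(unsigned long *spte)
{
        bool was_dirty = *spte & 0x40;

        *spte &= ~0x40UL;
        return was_dirty;                /* true => a TLB flush is needed */
}

static void flush_tlbs(void)
{
        puts("remote TLB flush");
}

int main(void)
{
        unsigned long sptes[] = { 0x47, 0x07, 0x43 };
        bool flush = false;

        for (unsigned int i = 0; i < sizeof(sptes) / sizeof(sptes[0]); i++)
                flush |= clear_dirty_one(&sptes[i]);

        if (flush)                       /* flush once, after the whole walk */
                flush_tlbs();
        return 0;
}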
4507void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
4508 struct kvm_memory_slot *memslot)
4509{
4510 gfn_t last_gfn;
4511 int i;
4512 bool flush = false;
4513
4514 last_gfn = memslot->base_gfn + memslot->npages - 1;
4515
4516 spin_lock(&kvm->mmu_lock);
4517
4518 for (i = PT_PAGE_TABLE_LEVEL + 1; /* skip rmap for 4K page */
4519 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
4520 unsigned long *rmapp;
4521 unsigned long last_index, index;
4522
4523 rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL];
4524 last_index = gfn_to_index(last_gfn, memslot->base_gfn, i);
4525
4526 for (index = 0; index <= last_index; ++index, ++rmapp) {
4527 if (*rmapp)
4528 flush |= __rmap_write_protect(kvm, rmapp,
4529 false);
4530
4531 if (need_resched() || spin_needbreak(&kvm->mmu_lock))
4532 cond_resched_lock(&kvm->mmu_lock);
4533 }
4534 }
4535 spin_unlock(&kvm->mmu_lock);
4536
4537 /* see kvm_mmu_slot_remove_write_access */
4538 lockdep_assert_held(&kvm->slots_lock);
4539
4540 if (flush)
4541 kvm_flush_remote_tlbs(kvm);
4542}
4543EXPORT_SYMBOL_GPL(kvm_mmu_slot_largepage_remove_write_access);
4544
4545void kvm_mmu_slot_set_dirty(struct kvm *kvm,
4546 struct kvm_memory_slot *memslot)
4547{
4548 gfn_t last_gfn;
4549 int i;
4550 bool flush = false;
4551
4552 last_gfn = memslot->base_gfn + memslot->npages - 1;
4553
4554 spin_lock(&kvm->mmu_lock);
4555
4556 for (i = PT_PAGE_TABLE_LEVEL;
4557 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
4558 unsigned long *rmapp;
4559 unsigned long last_index, index;
4560
4561 rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL];
4562 last_index = gfn_to_index(last_gfn, memslot->base_gfn, i);
4563
4564 for (index = 0; index <= last_index; ++index, ++rmapp) {
4565 if (*rmapp)
4566 flush |= __rmap_set_dirty(kvm, rmapp);
4567
4568 if (need_resched() || spin_needbreak(&kvm->mmu_lock))
4569 cond_resched_lock(&kvm->mmu_lock);
4570 }
4571 }
4572
4573 spin_unlock(&kvm->mmu_lock);
4574
4575 lockdep_assert_held(&kvm->slots_lock);
4576
4577 /* see kvm_mmu_slot_leaf_clear_dirty */
4578 if (flush)
4579 kvm_flush_remote_tlbs(kvm);
4356} 4580}
4581EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
4357 4582
4358#define BATCH_ZAP_PAGES 10 4583#define BATCH_ZAP_PAGES 10
4359static void kvm_zap_obsolete_pages(struct kvm *kvm) 4584static void kvm_zap_obsolete_pages(struct kvm *kvm)
@@ -4606,8 +4831,6 @@ EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);
4606 4831
4607void kvm_mmu_destroy(struct kvm_vcpu *vcpu) 4832void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
4608{ 4833{
4609 ASSERT(vcpu);
4610
4611 kvm_mmu_unload(vcpu); 4834 kvm_mmu_unload(vcpu);
4612 free_mmu_pages(vcpu); 4835 free_mmu_pages(vcpu);
4613 mmu_free_memory_caches(vcpu); 4836 mmu_free_memory_caches(vcpu);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index bde8ee725754..c7d65637c851 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -44,18 +44,6 @@
44#define PT_DIRECTORY_LEVEL 2 44#define PT_DIRECTORY_LEVEL 2
45#define PT_PAGE_TABLE_LEVEL 1 45#define PT_PAGE_TABLE_LEVEL 1
46 46
47#define PFERR_PRESENT_BIT 0
48#define PFERR_WRITE_BIT 1
49#define PFERR_USER_BIT 2
50#define PFERR_RSVD_BIT 3
51#define PFERR_FETCH_BIT 4
52
53#define PFERR_PRESENT_MASK (1U << PFERR_PRESENT_BIT)
54#define PFERR_WRITE_MASK (1U << PFERR_WRITE_BIT)
55#define PFERR_USER_MASK (1U << PFERR_USER_BIT)
56#define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT)
57#define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT)
58
59static inline u64 rsvd_bits(int s, int e) 47static inline u64 rsvd_bits(int s, int e)
60{ 48{
61 return ((1ULL << (e - s + 1)) - 1) << s; 49 return ((1ULL << (e - s + 1)) - 1) << s;
@@ -81,9 +69,8 @@ enum {
81}; 69};
82 70
83int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct); 71int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct);
84void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context); 72void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu);
85void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context, 73void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly);
86 bool execonly);
87void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, 74void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
88 bool ept); 75 bool ept);
89 76
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 41dd0387cccb..a17d848c6d42 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2003,8 +2003,8 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
2003 2003
2004static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu) 2004static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
2005{ 2005{
2006 kvm_init_shadow_mmu(vcpu, &vcpu->arch.mmu); 2006 WARN_ON(mmu_is_nested(vcpu));
2007 2007 kvm_init_shadow_mmu(vcpu);
2008 vcpu->arch.mmu.set_cr3 = nested_svm_set_tdp_cr3; 2008 vcpu->arch.mmu.set_cr3 = nested_svm_set_tdp_cr3;
2009 vcpu->arch.mmu.get_cr3 = nested_svm_get_tdp_cr3; 2009 vcpu->arch.mmu.get_cr3 = nested_svm_get_tdp_cr3;
2010 vcpu->arch.mmu.get_pdptr = nested_svm_get_tdp_pdptr; 2010 vcpu->arch.mmu.get_pdptr = nested_svm_get_tdp_pdptr;
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index c2a34bb5ad93..7c7bc8bef21f 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -848,6 +848,24 @@ TRACE_EVENT(kvm_track_tsc,
848 848
849#endif /* CONFIG_X86_64 */ 849#endif /* CONFIG_X86_64 */
850 850
851/*
852 * Tracepoint for PML full VMEXIT.
853 */
854TRACE_EVENT(kvm_pml_full,
855 TP_PROTO(unsigned int vcpu_id),
856 TP_ARGS(vcpu_id),
857
858 TP_STRUCT__entry(
859 __field( unsigned int, vcpu_id )
860 ),
861
862 TP_fast_assign(
863 __entry->vcpu_id = vcpu_id;
864 ),
865
866 TP_printk("vcpu %d: PML full", __entry->vcpu_id)
867);
868
851TRACE_EVENT(kvm_ple_window, 869TRACE_EVENT(kvm_ple_window,
852 TP_PROTO(bool grow, unsigned int vcpu_id, int new, int old), 870 TP_PROTO(bool grow, unsigned int vcpu_id, int new, int old),
853 TP_ARGS(grow, vcpu_id, new, old), 871 TP_ARGS(grow, vcpu_id, new, old),
@@ -914,6 +932,26 @@ TRACE_EVENT(kvm_pvclock_update,
914 __entry->flags) 932 __entry->flags)
915); 933);
916 934
935TRACE_EVENT(kvm_wait_lapic_expire,
936 TP_PROTO(unsigned int vcpu_id, s64 delta),
937 TP_ARGS(vcpu_id, delta),
938
939 TP_STRUCT__entry(
940 __field( unsigned int, vcpu_id )
941 __field( s64, delta )
942 ),
943
944 TP_fast_assign(
945 __entry->vcpu_id = vcpu_id;
946 __entry->delta = delta;
947 ),
948
949 TP_printk("vcpu %u: delta %lld (%s)",
950 __entry->vcpu_id,
951 __entry->delta,
952 __entry->delta < 0 ? "early" : "late")
953);
954
917#endif /* _TRACE_KVM_H */ 955#endif /* _TRACE_KVM_H */
918 956
919#undef TRACE_INCLUDE_PATH 957#undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index d4c58d884838..3f73bfad0349 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -45,6 +45,7 @@
45#include <asm/perf_event.h> 45#include <asm/perf_event.h>
46#include <asm/debugreg.h> 46#include <asm/debugreg.h>
47#include <asm/kexec.h> 47#include <asm/kexec.h>
48#include <asm/apic.h>
48 49
49#include "trace.h" 50#include "trace.h"
50 51
@@ -101,6 +102,9 @@ module_param(nested, bool, S_IRUGO);
101 102
102static u64 __read_mostly host_xss; 103static u64 __read_mostly host_xss;
103 104
105static bool __read_mostly enable_pml = 1;
106module_param_named(pml, enable_pml, bool, S_IRUGO);
107
104#define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD) 108#define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
105#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE) 109#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE)
106#define KVM_VM_CR0_ALWAYS_ON \ 110#define KVM_VM_CR0_ALWAYS_ON \
@@ -215,7 +219,12 @@ struct __packed vmcs12 {
215 u64 tsc_offset; 219 u64 tsc_offset;
216 u64 virtual_apic_page_addr; 220 u64 virtual_apic_page_addr;
217 u64 apic_access_addr; 221 u64 apic_access_addr;
222 u64 posted_intr_desc_addr;
218 u64 ept_pointer; 223 u64 ept_pointer;
224 u64 eoi_exit_bitmap0;
225 u64 eoi_exit_bitmap1;
226 u64 eoi_exit_bitmap2;
227 u64 eoi_exit_bitmap3;
219 u64 xss_exit_bitmap; 228 u64 xss_exit_bitmap;
220 u64 guest_physical_address; 229 u64 guest_physical_address;
221 u64 vmcs_link_pointer; 230 u64 vmcs_link_pointer;
@@ -330,6 +339,7 @@ struct __packed vmcs12 {
330 u32 vmx_preemption_timer_value; 339 u32 vmx_preemption_timer_value;
331 u32 padding32[7]; /* room for future expansion */ 340 u32 padding32[7]; /* room for future expansion */
332 u16 virtual_processor_id; 341 u16 virtual_processor_id;
342 u16 posted_intr_nv;
333 u16 guest_es_selector; 343 u16 guest_es_selector;
334 u16 guest_cs_selector; 344 u16 guest_cs_selector;
335 u16 guest_ss_selector; 345 u16 guest_ss_selector;
@@ -338,6 +348,7 @@ struct __packed vmcs12 {
338 u16 guest_gs_selector; 348 u16 guest_gs_selector;
339 u16 guest_ldtr_selector; 349 u16 guest_ldtr_selector;
340 u16 guest_tr_selector; 350 u16 guest_tr_selector;
351 u16 guest_intr_status;
341 u16 host_es_selector; 352 u16 host_es_selector;
342 u16 host_cs_selector; 353 u16 host_cs_selector;
343 u16 host_ss_selector; 354 u16 host_ss_selector;
@@ -401,6 +412,10 @@ struct nested_vmx {
401 */ 412 */
402 struct page *apic_access_page; 413 struct page *apic_access_page;
403 struct page *virtual_apic_page; 414 struct page *virtual_apic_page;
415 struct page *pi_desc_page;
416 struct pi_desc *pi_desc;
417 bool pi_pending;
418 u16 posted_intr_nv;
404 u64 msr_ia32_feature_control; 419 u64 msr_ia32_feature_control;
405 420
406 struct hrtimer preemption_timer; 421 struct hrtimer preemption_timer;
@@ -408,6 +423,23 @@ struct nested_vmx {
408 423
409 /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */ 424 /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */
410 u64 vmcs01_debugctl; 425 u64 vmcs01_debugctl;
426
427 u32 nested_vmx_procbased_ctls_low;
428 u32 nested_vmx_procbased_ctls_high;
429 u32 nested_vmx_true_procbased_ctls_low;
430 u32 nested_vmx_secondary_ctls_low;
431 u32 nested_vmx_secondary_ctls_high;
432 u32 nested_vmx_pinbased_ctls_low;
433 u32 nested_vmx_pinbased_ctls_high;
434 u32 nested_vmx_exit_ctls_low;
435 u32 nested_vmx_exit_ctls_high;
436 u32 nested_vmx_true_exit_ctls_low;
437 u32 nested_vmx_entry_ctls_low;
438 u32 nested_vmx_entry_ctls_high;
439 u32 nested_vmx_true_entry_ctls_low;
440 u32 nested_vmx_misc_low;
441 u32 nested_vmx_misc_high;
442 u32 nested_vmx_ept_caps;
411}; 443};
412 444
413#define POSTED_INTR_ON 0 445#define POSTED_INTR_ON 0
@@ -511,6 +543,10 @@ struct vcpu_vmx {
511 /* Dynamic PLE window. */ 543 /* Dynamic PLE window. */
512 int ple_window; 544 int ple_window;
513 bool ple_window_dirty; 545 bool ple_window_dirty;
546
547 /* Support for PML */
548#define PML_ENTITY_NUM 512
549 struct page *pml_pg;
514}; 550};
515 551
516enum segment_cache_field { 552enum segment_cache_field {
@@ -594,6 +630,7 @@ static int max_shadow_read_write_fields =
594 630
595static const unsigned short vmcs_field_to_offset_table[] = { 631static const unsigned short vmcs_field_to_offset_table[] = {
596 FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id), 632 FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
633 FIELD(POSTED_INTR_NV, posted_intr_nv),
597 FIELD(GUEST_ES_SELECTOR, guest_es_selector), 634 FIELD(GUEST_ES_SELECTOR, guest_es_selector),
598 FIELD(GUEST_CS_SELECTOR, guest_cs_selector), 635 FIELD(GUEST_CS_SELECTOR, guest_cs_selector),
599 FIELD(GUEST_SS_SELECTOR, guest_ss_selector), 636 FIELD(GUEST_SS_SELECTOR, guest_ss_selector),
@@ -602,6 +639,7 @@ static const unsigned short vmcs_field_to_offset_table[] = {
602 FIELD(GUEST_GS_SELECTOR, guest_gs_selector), 639 FIELD(GUEST_GS_SELECTOR, guest_gs_selector),
603 FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector), 640 FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector),
604 FIELD(GUEST_TR_SELECTOR, guest_tr_selector), 641 FIELD(GUEST_TR_SELECTOR, guest_tr_selector),
642 FIELD(GUEST_INTR_STATUS, guest_intr_status),
605 FIELD(HOST_ES_SELECTOR, host_es_selector), 643 FIELD(HOST_ES_SELECTOR, host_es_selector),
606 FIELD(HOST_CS_SELECTOR, host_cs_selector), 644 FIELD(HOST_CS_SELECTOR, host_cs_selector),
607 FIELD(HOST_SS_SELECTOR, host_ss_selector), 645 FIELD(HOST_SS_SELECTOR, host_ss_selector),
@@ -618,7 +656,12 @@ static const unsigned short vmcs_field_to_offset_table[] = {
618 FIELD64(TSC_OFFSET, tsc_offset), 656 FIELD64(TSC_OFFSET, tsc_offset),
619 FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr), 657 FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr),
620 FIELD64(APIC_ACCESS_ADDR, apic_access_addr), 658 FIELD64(APIC_ACCESS_ADDR, apic_access_addr),
659 FIELD64(POSTED_INTR_DESC_ADDR, posted_intr_desc_addr),
621 FIELD64(EPT_POINTER, ept_pointer), 660 FIELD64(EPT_POINTER, ept_pointer),
661 FIELD64(EOI_EXIT_BITMAP0, eoi_exit_bitmap0),
662 FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1),
663 FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2),
664 FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3),
622 FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap), 665 FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap),
623 FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address), 666 FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
624 FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer), 667 FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
@@ -766,6 +809,7 @@ static void kvm_cpu_vmxon(u64 addr);
766static void kvm_cpu_vmxoff(void); 809static void kvm_cpu_vmxoff(void);
767static bool vmx_mpx_supported(void); 810static bool vmx_mpx_supported(void);
768static bool vmx_xsaves_supported(void); 811static bool vmx_xsaves_supported(void);
812static int vmx_vm_has_apicv(struct kvm *kvm);
769static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); 813static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
770static void vmx_set_segment(struct kvm_vcpu *vcpu, 814static void vmx_set_segment(struct kvm_vcpu *vcpu,
771 struct kvm_segment *var, int seg); 815 struct kvm_segment *var, int seg);
@@ -793,6 +837,7 @@ static unsigned long *vmx_msr_bitmap_legacy;
793static unsigned long *vmx_msr_bitmap_longmode; 837static unsigned long *vmx_msr_bitmap_longmode;
794static unsigned long *vmx_msr_bitmap_legacy_x2apic; 838static unsigned long *vmx_msr_bitmap_legacy_x2apic;
795static unsigned long *vmx_msr_bitmap_longmode_x2apic; 839static unsigned long *vmx_msr_bitmap_longmode_x2apic;
840static unsigned long *vmx_msr_bitmap_nested;
796static unsigned long *vmx_vmread_bitmap; 841static unsigned long *vmx_vmread_bitmap;
797static unsigned long *vmx_vmwrite_bitmap; 842static unsigned long *vmx_vmwrite_bitmap;
798 843
@@ -959,16 +1004,6 @@ static inline bool cpu_has_vmx_ept_execute_only(void)
959 return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT; 1004 return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT;
960} 1005}
961 1006
962static inline bool cpu_has_vmx_eptp_uncacheable(void)
963{
964 return vmx_capability.ept & VMX_EPTP_UC_BIT;
965}
966
967static inline bool cpu_has_vmx_eptp_writeback(void)
968{
969 return vmx_capability.ept & VMX_EPTP_WB_BIT;
970}
971
972static inline bool cpu_has_vmx_ept_2m_page(void) 1007static inline bool cpu_has_vmx_ept_2m_page(void)
973{ 1008{
974 return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT; 1009 return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT;
@@ -1073,6 +1108,11 @@ static inline bool cpu_has_vmx_shadow_vmcs(void)
1073 SECONDARY_EXEC_SHADOW_VMCS; 1108 SECONDARY_EXEC_SHADOW_VMCS;
1074} 1109}
1075 1110
1111static inline bool cpu_has_vmx_pml(void)
1112{
1113 return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_PML;
1114}
1115
1076static inline bool report_flexpriority(void) 1116static inline bool report_flexpriority(void)
1077{ 1117{
1078 return flexpriority_enabled; 1118 return flexpriority_enabled;
@@ -1112,6 +1152,26 @@ static inline bool nested_cpu_has_xsaves(struct vmcs12 *vmcs12)
1112 vmx_xsaves_supported(); 1152 vmx_xsaves_supported();
1113} 1153}
1114 1154
1155static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12)
1156{
1157 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
1158}
1159
1160static inline bool nested_cpu_has_apic_reg_virt(struct vmcs12 *vmcs12)
1161{
1162 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_APIC_REGISTER_VIRT);
1163}
1164
1165static inline bool nested_cpu_has_vid(struct vmcs12 *vmcs12)
1166{
1167 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
1168}
1169
1170static inline bool nested_cpu_has_posted_intr(struct vmcs12 *vmcs12)
1171{
1172 return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR;
1173}
1174
1115static inline bool is_exception(u32 intr_info) 1175static inline bool is_exception(u32 intr_info)
1116{ 1176{
1117 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) 1177 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -2284,20 +2344,8 @@ static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
2284 * if the corresponding bit in the (32-bit) control field *must* be on, and a 2344 * if the corresponding bit in the (32-bit) control field *must* be on, and a
2285 * bit in the high half is on if the corresponding bit in the control field 2345 * bit in the high half is on if the corresponding bit in the control field
2286 * may be on. See also vmx_control_verify(). 2346 * may be on. See also vmx_control_verify().
2287 * TODO: allow these variables to be modified (downgraded) by module options
2288 * or other means.
2289 */ 2347 */
2290static u32 nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high; 2348static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
2291static u32 nested_vmx_true_procbased_ctls_low;
2292static u32 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high;
2293static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high;
2294static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high;
2295static u32 nested_vmx_true_exit_ctls_low;
2296static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high;
2297static u32 nested_vmx_true_entry_ctls_low;
2298static u32 nested_vmx_misc_low, nested_vmx_misc_high;
2299static u32 nested_vmx_ept_caps;
2300static __init void nested_vmx_setup_ctls_msrs(void)
2301{ 2349{
2302 /* 2350 /*
2303 * Note that as a general rule, the high half of the MSRs (bits in 2351 * Note that as a general rule, the high half of the MSRs (bits in
@@ -2316,57 +2364,74 @@ static __init void nested_vmx_setup_ctls_msrs(void)
2316 2364
2317 /* pin-based controls */ 2365 /* pin-based controls */
2318 rdmsr(MSR_IA32_VMX_PINBASED_CTLS, 2366 rdmsr(MSR_IA32_VMX_PINBASED_CTLS,
2319 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high); 2367 vmx->nested.nested_vmx_pinbased_ctls_low,
2320 nested_vmx_pinbased_ctls_low |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 2368 vmx->nested.nested_vmx_pinbased_ctls_high);
2321 nested_vmx_pinbased_ctls_high &= PIN_BASED_EXT_INTR_MASK | 2369 vmx->nested.nested_vmx_pinbased_ctls_low |=
2322 PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS; 2370 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
2323 nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | 2371 vmx->nested.nested_vmx_pinbased_ctls_high &=
2372 PIN_BASED_EXT_INTR_MASK |
2373 PIN_BASED_NMI_EXITING |
2374 PIN_BASED_VIRTUAL_NMIS;
2375 vmx->nested.nested_vmx_pinbased_ctls_high |=
2376 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
2324 PIN_BASED_VMX_PREEMPTION_TIMER; 2377 PIN_BASED_VMX_PREEMPTION_TIMER;
2378 if (vmx_vm_has_apicv(vmx->vcpu.kvm))
2379 vmx->nested.nested_vmx_pinbased_ctls_high |=
2380 PIN_BASED_POSTED_INTR;
2325 2381
2326 /* exit controls */ 2382 /* exit controls */
2327 rdmsr(MSR_IA32_VMX_EXIT_CTLS, 2383 rdmsr(MSR_IA32_VMX_EXIT_CTLS,
2328 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high); 2384 vmx->nested.nested_vmx_exit_ctls_low,
2329 nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; 2385 vmx->nested.nested_vmx_exit_ctls_high);
2386 vmx->nested.nested_vmx_exit_ctls_low =
2387 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
2330 2388
2331 nested_vmx_exit_ctls_high &= 2389 vmx->nested.nested_vmx_exit_ctls_high &=
2332#ifdef CONFIG_X86_64 2390#ifdef CONFIG_X86_64
2333 VM_EXIT_HOST_ADDR_SPACE_SIZE | 2391 VM_EXIT_HOST_ADDR_SPACE_SIZE |
2334#endif 2392#endif
2335 VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT; 2393 VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
2336 nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | 2394 vmx->nested.nested_vmx_exit_ctls_high |=
2395 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
2337 VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | 2396 VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
2338 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT; 2397 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;
2339 2398
2340 if (vmx_mpx_supported()) 2399 if (vmx_mpx_supported())
2341 nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS; 2400 vmx->nested.nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
2342 2401
2343 /* We support free control of debug control saving. */ 2402 /* We support free control of debug control saving. */
2344 nested_vmx_true_exit_ctls_low = nested_vmx_exit_ctls_low & 2403 vmx->nested.nested_vmx_true_exit_ctls_low =
2404 vmx->nested.nested_vmx_exit_ctls_low &
2345 ~VM_EXIT_SAVE_DEBUG_CONTROLS; 2405 ~VM_EXIT_SAVE_DEBUG_CONTROLS;
2346 2406
2347 /* entry controls */ 2407 /* entry controls */
2348 rdmsr(MSR_IA32_VMX_ENTRY_CTLS, 2408 rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
2349 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high); 2409 vmx->nested.nested_vmx_entry_ctls_low,
2350 nested_vmx_entry_ctls_low = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; 2410 vmx->nested.nested_vmx_entry_ctls_high);
2351 nested_vmx_entry_ctls_high &= 2411 vmx->nested.nested_vmx_entry_ctls_low =
2412 VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
2413 vmx->nested.nested_vmx_entry_ctls_high &=
2352#ifdef CONFIG_X86_64 2414#ifdef CONFIG_X86_64
2353 VM_ENTRY_IA32E_MODE | 2415 VM_ENTRY_IA32E_MODE |
2354#endif 2416#endif
2355 VM_ENTRY_LOAD_IA32_PAT; 2417 VM_ENTRY_LOAD_IA32_PAT;
2356 nested_vmx_entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | 2418 vmx->nested.nested_vmx_entry_ctls_high |=
2357 VM_ENTRY_LOAD_IA32_EFER); 2419 (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER);
2358 if (vmx_mpx_supported()) 2420 if (vmx_mpx_supported())
2359 nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS; 2421 vmx->nested.nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
2360 2422
2361 /* We support free control of debug control loading. */ 2423 /* We support free control of debug control loading. */
2362 nested_vmx_true_entry_ctls_low = nested_vmx_entry_ctls_low & 2424 vmx->nested.nested_vmx_true_entry_ctls_low =
2425 vmx->nested.nested_vmx_entry_ctls_low &
2363 ~VM_ENTRY_LOAD_DEBUG_CONTROLS; 2426 ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
2364 2427
2365 /* cpu-based controls */ 2428 /* cpu-based controls */
2366 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, 2429 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
2367 nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high); 2430 vmx->nested.nested_vmx_procbased_ctls_low,
2368 nested_vmx_procbased_ctls_low = CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 2431 vmx->nested.nested_vmx_procbased_ctls_high);
2369 nested_vmx_procbased_ctls_high &= 2432 vmx->nested.nested_vmx_procbased_ctls_low =
2433 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
2434 vmx->nested.nested_vmx_procbased_ctls_high &=
2370 CPU_BASED_VIRTUAL_INTR_PENDING | 2435 CPU_BASED_VIRTUAL_INTR_PENDING |
2371 CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING | 2436 CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING |
2372 CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING | 2437 CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
@@ -2386,45 +2451,55 @@ static __init void nested_vmx_setup_ctls_msrs(void)
2386 * can use it to avoid exits to L1 - even when L0 runs L2 2451 * can use it to avoid exits to L1 - even when L0 runs L2
2387 * without MSR bitmaps. 2452 * without MSR bitmaps.
2388 */ 2453 */
2389 nested_vmx_procbased_ctls_high |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | 2454 vmx->nested.nested_vmx_procbased_ctls_high |=
2455 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
2390 CPU_BASED_USE_MSR_BITMAPS; 2456 CPU_BASED_USE_MSR_BITMAPS;
2391 2457
2392 /* We support free control of CR3 access interception. */ 2458 /* We support free control of CR3 access interception. */
2393 nested_vmx_true_procbased_ctls_low = nested_vmx_procbased_ctls_low & 2459 vmx->nested.nested_vmx_true_procbased_ctls_low =
2460 vmx->nested.nested_vmx_procbased_ctls_low &
2394 ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING); 2461 ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
2395 2462
2396 /* secondary cpu-based controls */ 2463 /* secondary cpu-based controls */
2397 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2, 2464 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
2398 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high); 2465 vmx->nested.nested_vmx_secondary_ctls_low,
2399 nested_vmx_secondary_ctls_low = 0; 2466 vmx->nested.nested_vmx_secondary_ctls_high);
2400 nested_vmx_secondary_ctls_high &= 2467 vmx->nested.nested_vmx_secondary_ctls_low = 0;
2468 vmx->nested.nested_vmx_secondary_ctls_high &=
2401 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 2469 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
2470 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
2471 SECONDARY_EXEC_APIC_REGISTER_VIRT |
2472 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
2402 SECONDARY_EXEC_WBINVD_EXITING | 2473 SECONDARY_EXEC_WBINVD_EXITING |
2403 SECONDARY_EXEC_XSAVES; 2474 SECONDARY_EXEC_XSAVES;
2404 2475
2405 if (enable_ept) { 2476 if (enable_ept) {
2406 /* nested EPT: emulate EPT also to L1 */ 2477 /* nested EPT: emulate EPT also to L1 */
2407 nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT | 2478 vmx->nested.nested_vmx_secondary_ctls_high |=
2479 SECONDARY_EXEC_ENABLE_EPT |
2408 SECONDARY_EXEC_UNRESTRICTED_GUEST; 2480 SECONDARY_EXEC_UNRESTRICTED_GUEST;
2409 nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT | 2481 vmx->nested.nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
2410 VMX_EPTP_WB_BIT | VMX_EPT_2MB_PAGE_BIT | 2482 VMX_EPTP_WB_BIT | VMX_EPT_2MB_PAGE_BIT |
2411 VMX_EPT_INVEPT_BIT; 2483 VMX_EPT_INVEPT_BIT;
2412 nested_vmx_ept_caps &= vmx_capability.ept; 2484 vmx->nested.nested_vmx_ept_caps &= vmx_capability.ept;
2413 /* 2485 /*
2414 * For nested guests, we don't do anything specific 2486 * For nested guests, we don't do anything specific
2415 * for single context invalidation. Hence, only advertise 2487 * for single context invalidation. Hence, only advertise
2416 * support for global context invalidation. 2488 * support for global context invalidation.
2417 */ 2489 */
2418 nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT; 2490 vmx->nested.nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT;
2419 } else 2491 } else
2420 nested_vmx_ept_caps = 0; 2492 vmx->nested.nested_vmx_ept_caps = 0;
2421 2493
2422 /* miscellaneous data */ 2494 /* miscellaneous data */
2423 rdmsr(MSR_IA32_VMX_MISC, nested_vmx_misc_low, nested_vmx_misc_high); 2495 rdmsr(MSR_IA32_VMX_MISC,
2424 nested_vmx_misc_low &= VMX_MISC_SAVE_EFER_LMA; 2496 vmx->nested.nested_vmx_misc_low,
2425 nested_vmx_misc_low |= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | 2497 vmx->nested.nested_vmx_misc_high);
2498 vmx->nested.nested_vmx_misc_low &= VMX_MISC_SAVE_EFER_LMA;
2499 vmx->nested.nested_vmx_misc_low |=
2500 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
2426 VMX_MISC_ACTIVITY_HLT; 2501 VMX_MISC_ACTIVITY_HLT;
2427 nested_vmx_misc_high = 0; 2502 vmx->nested.nested_vmx_misc_high = 0;
2428} 2503}
2429 2504
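Each nested_vmx_*_ctls_low/high pair set up above follows the convention described in the comment before nested_vmx_setup_ctls_msrs: the low half lists control bits that must be 1, the high half lists bits that may be 1, and the pair is reported to L1 packed into a single 64-bit MSR. A small sketch of that packing and of the corresponding validity check, assuming (as usual) that every must-be-1 bit is also reported as allowed-1:

#include <stdint.h>
#include <stdio.h>

static uint64_t pack_control_msr(uint32_t low, uint32_t high)
{
        return low | ((uint64_t)high << 32);
}

static int control_ok(uint32_t control, uint32_t low, uint32_t high)
{
        return (control & low) == low &&      /* all required bits set  */
               (control & ~high) == 0;        /* no disallowed bits set */
}

int main(void)
{
        uint32_t low = 0x00000016, high = 0x0000fff6;

        printf("MSR value: 0x%016llx\n",
               (unsigned long long)pack_control_msr(low, high));
        printf("0x36 ok? %d\n", control_ok(0x36, low, high));  /* 1: valid            */
        printf("0x20 ok? %d\n", control_ok(0x20, low, high));  /* 0: required bit off */
        return 0;
}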
2430static inline bool vmx_control_verify(u32 control, u32 low, u32 high) 2505static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
@@ -2443,6 +2518,8 @@ static inline u64 vmx_control_msr(u32 low, u32 high)
2443/* Returns 0 on success, non-0 otherwise. */ 2518/* Returns 0 on success, non-0 otherwise. */
2444static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) 2519static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
2445{ 2520{
2521 struct vcpu_vmx *vmx = to_vmx(vcpu);
2522
2446 switch (msr_index) { 2523 switch (msr_index) {
2447 case MSR_IA32_VMX_BASIC: 2524 case MSR_IA32_VMX_BASIC:
2448 /* 2525 /*
@@ -2457,36 +2534,44 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
2457 break; 2534 break;
2458 case MSR_IA32_VMX_TRUE_PINBASED_CTLS: 2535 case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
2459 case MSR_IA32_VMX_PINBASED_CTLS: 2536 case MSR_IA32_VMX_PINBASED_CTLS:
2460 *pdata = vmx_control_msr(nested_vmx_pinbased_ctls_low, 2537 *pdata = vmx_control_msr(
2461 nested_vmx_pinbased_ctls_high); 2538 vmx->nested.nested_vmx_pinbased_ctls_low,
2539 vmx->nested.nested_vmx_pinbased_ctls_high);
2462 break; 2540 break;
2463 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: 2541 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
2464 *pdata = vmx_control_msr(nested_vmx_true_procbased_ctls_low, 2542 *pdata = vmx_control_msr(
2465 nested_vmx_procbased_ctls_high); 2543 vmx->nested.nested_vmx_true_procbased_ctls_low,
2544 vmx->nested.nested_vmx_procbased_ctls_high);
2466 break; 2545 break;
2467 case MSR_IA32_VMX_PROCBASED_CTLS: 2546 case MSR_IA32_VMX_PROCBASED_CTLS:
2468 *pdata = vmx_control_msr(nested_vmx_procbased_ctls_low, 2547 *pdata = vmx_control_msr(
2469 nested_vmx_procbased_ctls_high); 2548 vmx->nested.nested_vmx_procbased_ctls_low,
2549 vmx->nested.nested_vmx_procbased_ctls_high);
2470 break; 2550 break;
2471 case MSR_IA32_VMX_TRUE_EXIT_CTLS: 2551 case MSR_IA32_VMX_TRUE_EXIT_CTLS:
2472 *pdata = vmx_control_msr(nested_vmx_true_exit_ctls_low, 2552 *pdata = vmx_control_msr(
2473 nested_vmx_exit_ctls_high); 2553 vmx->nested.nested_vmx_true_exit_ctls_low,
2554 vmx->nested.nested_vmx_exit_ctls_high);
2474 break; 2555 break;
2475 case MSR_IA32_VMX_EXIT_CTLS: 2556 case MSR_IA32_VMX_EXIT_CTLS:
2476 *pdata = vmx_control_msr(nested_vmx_exit_ctls_low, 2557 *pdata = vmx_control_msr(
2477 nested_vmx_exit_ctls_high); 2558 vmx->nested.nested_vmx_exit_ctls_low,
2559 vmx->nested.nested_vmx_exit_ctls_high);
2478 break; 2560 break;
2479 case MSR_IA32_VMX_TRUE_ENTRY_CTLS: 2561 case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
2480 *pdata = vmx_control_msr(nested_vmx_true_entry_ctls_low, 2562 *pdata = vmx_control_msr(
2481 nested_vmx_entry_ctls_high); 2563 vmx->nested.nested_vmx_true_entry_ctls_low,
2564 vmx->nested.nested_vmx_entry_ctls_high);
2482 break; 2565 break;
2483 case MSR_IA32_VMX_ENTRY_CTLS: 2566 case MSR_IA32_VMX_ENTRY_CTLS:
2484 *pdata = vmx_control_msr(nested_vmx_entry_ctls_low, 2567 *pdata = vmx_control_msr(
2485 nested_vmx_entry_ctls_high); 2568 vmx->nested.nested_vmx_entry_ctls_low,
2569 vmx->nested.nested_vmx_entry_ctls_high);
2486 break; 2570 break;
2487 case MSR_IA32_VMX_MISC: 2571 case MSR_IA32_VMX_MISC:
2488 *pdata = vmx_control_msr(nested_vmx_misc_low, 2572 *pdata = vmx_control_msr(
2489 nested_vmx_misc_high); 2573 vmx->nested.nested_vmx_misc_low,
2574 vmx->nested.nested_vmx_misc_high);
2490 break; 2575 break;
2491 /* 2576 /*
2492 * These MSRs specify bits which the guest must keep fixed (on or off) 2577 * These MSRs specify bits which the guest must keep fixed (on or off)
@@ -2511,12 +2596,13 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
2511 *pdata = 0x2e; /* highest index: VMX_PREEMPTION_TIMER_VALUE */ 2596 *pdata = 0x2e; /* highest index: VMX_PREEMPTION_TIMER_VALUE */
2512 break; 2597 break;
2513 case MSR_IA32_VMX_PROCBASED_CTLS2: 2598 case MSR_IA32_VMX_PROCBASED_CTLS2:
2514 *pdata = vmx_control_msr(nested_vmx_secondary_ctls_low, 2599 *pdata = vmx_control_msr(
2515 nested_vmx_secondary_ctls_high); 2600 vmx->nested.nested_vmx_secondary_ctls_low,
2601 vmx->nested.nested_vmx_secondary_ctls_high);
2516 break; 2602 break;
2517 case MSR_IA32_VMX_EPT_VPID_CAP: 2603 case MSR_IA32_VMX_EPT_VPID_CAP:
2518 /* Currently, no nested vpid support */ 2604 /* Currently, no nested vpid support */
2519 *pdata = nested_vmx_ept_caps; 2605 *pdata = vmx->nested.nested_vmx_ept_caps;
2520 break; 2606 break;
2521 default: 2607 default:
2522 return 1; 2608 return 1;
@@ -2929,7 +3015,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
2929 SECONDARY_EXEC_APIC_REGISTER_VIRT | 3015 SECONDARY_EXEC_APIC_REGISTER_VIRT |
2930 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 3016 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
2931 SECONDARY_EXEC_SHADOW_VMCS | 3017 SECONDARY_EXEC_SHADOW_VMCS |
2932 SECONDARY_EXEC_XSAVES; 3018 SECONDARY_EXEC_XSAVES |
3019 SECONDARY_EXEC_ENABLE_PML;
2933 if (adjust_vmx_controls(min2, opt2, 3020 if (adjust_vmx_controls(min2, opt2,
2934 MSR_IA32_VMX_PROCBASED_CTLS2, 3021 MSR_IA32_VMX_PROCBASED_CTLS2,
2935 &_cpu_based_2nd_exec_control) < 0) 3022 &_cpu_based_2nd_exec_control) < 0)
@@ -4159,6 +4246,52 @@ static void __vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
4159 } 4246 }
4160} 4247}
4161 4248
4249/*
 4250 * If an MSR is allowed (not intercepted) by L0, check whether L1 allows it
 4251 * too. The corresponding bit is cleared only if both L0 and L1 allow it.
4252 */
4253static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1,
4254 unsigned long *msr_bitmap_nested,
4255 u32 msr, int type)
4256{
4257 int f = sizeof(unsigned long);
4258
4259 if (!cpu_has_vmx_msr_bitmap()) {
4260 WARN_ON(1);
4261 return;
4262 }
4263
4264 /*
4265 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
4266 * have the write-low and read-high bitmap offsets the wrong way round.
4267 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
4268 */
4269 if (msr <= 0x1fff) {
4270 if (type & MSR_TYPE_R &&
4271 !test_bit(msr, msr_bitmap_l1 + 0x000 / f))
4272 /* read-low */
4273 __clear_bit(msr, msr_bitmap_nested + 0x000 / f);
4274
4275 if (type & MSR_TYPE_W &&
4276 !test_bit(msr, msr_bitmap_l1 + 0x800 / f))
4277 /* write-low */
4278 __clear_bit(msr, msr_bitmap_nested + 0x800 / f);
4279
4280 } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
4281 msr &= 0x1fff;
4282 if (type & MSR_TYPE_R &&
4283 !test_bit(msr, msr_bitmap_l1 + 0x400 / f))
4284 /* read-high */
4285 __clear_bit(msr, msr_bitmap_nested + 0x400 / f);
4286
4287 if (type & MSR_TYPE_W &&
4288 !test_bit(msr, msr_bitmap_l1 + 0xc00 / f))
4289 /* write-high */
4290 __clear_bit(msr, msr_bitmap_nested + 0xc00 / f);
4291
4292 }
4293}
4294
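nested_vmx_disable_intercept_for_msr relies on the MSR-bitmap layout spelled out in its comment: one 4 KiB page split into read-low (0x000), read-high (0x400), write-low (0x800) and write-high (0xc00) quarters, each covering 0x2000 MSR numbers with one bit per MSR, bit set meaning the access is intercepted. A userspace sketch of that addressing, byte-granular for simplicity:

#include <stdint.h>
#include <string.h>
#include <stdio.h>

static int msr_bitmap_offset(uint32_t msr, int write)
{
        if (msr <= 0x1fff)
                return write ? 0x800 : 0x000;        /* low MSR range       */
        if (msr >= 0xc0000000 && msr <= 0xc0001fff)
                return write ? 0xc00 : 0x400;        /* high MSR range      */
        return -1;                                   /* always intercepted  */
}

static int msr_intercepted(const uint8_t *bitmap, uint32_t msr, int write)
{
        int off = msr_bitmap_offset(msr, write);
        uint32_t bit = msr & 0x1fff;

        if (off < 0)
                return 1;
        return (bitmap[off + bit / 8] >> (bit % 8)) & 1;
}

int main(void)
{
        uint8_t bitmap[4096];

        memset(bitmap, 0xff, sizeof(bitmap));        /* intercept everything */
        /* allow reads of MSR 0xc0000100: clear its read-high bit */
        bitmap[0x400 + (0xc0000100 & 0x1fff) / 8] &= ~(1 << ((0xc0000100 & 0x1fff) % 8));

        printf("read  0xc0000100 intercepted? %d\n",
               msr_intercepted(bitmap, 0xc0000100, 0));   /* 0: allowed */
        printf("write 0xc0000100 intercepted? %d\n",
               msr_intercepted(bitmap, 0xc0000100, 1));   /* 1: trapped */
        return 0;
}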
4162static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only) 4295static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
4163{ 4296{
4164 if (!longmode_only) 4297 if (!longmode_only)
@@ -4197,6 +4330,64 @@ static int vmx_vm_has_apicv(struct kvm *kvm)
4197 return enable_apicv && irqchip_in_kernel(kvm); 4330 return enable_apicv && irqchip_in_kernel(kvm);
4198} 4331}
4199 4332
4333static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
4334{
4335 struct vcpu_vmx *vmx = to_vmx(vcpu);
4336 int max_irr;
4337 void *vapic_page;
4338 u16 status;
4339
4340 if (vmx->nested.pi_desc &&
4341 vmx->nested.pi_pending) {
4342 vmx->nested.pi_pending = false;
4343 if (!pi_test_and_clear_on(vmx->nested.pi_desc))
4344 return 0;
4345
4346 max_irr = find_last_bit(
4347 (unsigned long *)vmx->nested.pi_desc->pir, 256);
4348
4349 if (max_irr == 256)
4350 return 0;
4351
4352 vapic_page = kmap(vmx->nested.virtual_apic_page);
4353 if (!vapic_page) {
4354 WARN_ON(1);
4355 return -ENOMEM;
4356 }
4357 __kvm_apic_update_irr(vmx->nested.pi_desc->pir, vapic_page);
4358 kunmap(vmx->nested.virtual_apic_page);
4359
4360 status = vmcs_read16(GUEST_INTR_STATUS);
4361 if ((u8)max_irr > ((u8)status & 0xff)) {
4362 status &= ~0xff;
4363 status |= (u8)max_irr;
4364 vmcs_write16(GUEST_INTR_STATUS, status);
4365 }
4366 }
4367 return 0;
4368}
4369
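vmx_complete_nested_posted_interrupt scans the 256-bit posted-interrupt request bitmap for the highest pending vector and raises the low byte of GUEST_INTR_STATUS (the RVI) to it if it is higher. A sketch of that scan and merge, with __builtin_clzll doing the work of find_last_bit:

#include <stdint.h>
#include <stdio.h>

/* Return the highest vector set in a 256-bit PIR, or -1 if none. */
static int pir_find_highest(const uint64_t pir[4])
{
        for (int w = 3; w >= 0; w--) {
                if (!pir[w])
                        continue;
                return w * 64 + 63 - __builtin_clzll(pir[w]);
        }
        return -1;
}

int main(void)
{
        uint64_t pir[4] = { 0 };
        uint16_t status = 0x0020;        /* current RVI = 0x20 */
        int max_irr;

        pir[0] |= 1ULL << 0x31;          /* vector 0x31 posted by L1 */
        max_irr = pir_find_highest(pir);

        if (max_irr >= 0 && (uint8_t)max_irr > (uint8_t)status) {
                status = (status & ~0xff) | (uint8_t)max_irr;
                printf("GUEST_INTR_STATUS -> 0x%04x\n", status);
        }
        return 0;
}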
4370static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
4371 int vector)
4372{
4373 struct vcpu_vmx *vmx = to_vmx(vcpu);
4374
4375 if (is_guest_mode(vcpu) &&
4376 vector == vmx->nested.posted_intr_nv) {
4377 /* the PIR and ON have been set by L1. */
4378 if (vcpu->mode == IN_GUEST_MODE)
4379 apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
4380 POSTED_INTR_VECTOR);
4381 /*
 4382 * If the posted interrupt is not recognized by hardware,
 4383 * it will be delivered at the next vmentry.
4384 */
4385 vmx->nested.pi_pending = true;
4386 kvm_make_request(KVM_REQ_EVENT, vcpu);
4387 return 0;
4388 }
4389 return -1;
4390}
4200/* 4391/*
4201 * Send interrupt to vcpu via posted interrupt way. 4392 * Send interrupt to vcpu via posted interrupt way.
4202 * 1. If target vcpu is running(non-root mode), send posted interrupt 4393 * 1. If target vcpu is running(non-root mode), send posted interrupt
@@ -4209,6 +4400,10 @@ static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
4209 struct vcpu_vmx *vmx = to_vmx(vcpu); 4400 struct vcpu_vmx *vmx = to_vmx(vcpu);
4210 int r; 4401 int r;
4211 4402
4403 r = vmx_deliver_nested_posted_interrupt(vcpu, vector);
4404 if (!r)
4405 return;
4406
4212 if (pi_test_and_set_pir(vector, &vmx->pi_desc)) 4407 if (pi_test_and_set_pir(vector, &vmx->pi_desc))
4213 return; 4408 return;
4214 4409
@@ -4360,6 +4555,9 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
4360 a current VMCS12 4555 a current VMCS12
4361 */ 4556 */
4362 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; 4557 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
 4558 /* PML is enabled/disabled when creating/destroying the vcpu */
4559 exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
4560
4363 return exec_control; 4561 return exec_control;
4364} 4562}
4365 4563
@@ -4986,11 +5184,12 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
4986 hypercall[2] = 0xc1; 5184 hypercall[2] = 0xc1;
4987} 5185}
4988 5186
4989static bool nested_cr0_valid(struct vmcs12 *vmcs12, unsigned long val) 5187static bool nested_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val)
4990{ 5188{
4991 unsigned long always_on = VMXON_CR0_ALWAYSON; 5189 unsigned long always_on = VMXON_CR0_ALWAYSON;
5190 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
4992 5191
4993 if (nested_vmx_secondary_ctls_high & 5192 if (to_vmx(vcpu)->nested.nested_vmx_secondary_ctls_high &
4994 SECONDARY_EXEC_UNRESTRICTED_GUEST && 5193 SECONDARY_EXEC_UNRESTRICTED_GUEST &&
4995 nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) 5194 nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST))
4996 always_on &= ~(X86_CR0_PE | X86_CR0_PG); 5195 always_on &= ~(X86_CR0_PE | X86_CR0_PG);
@@ -5015,7 +5214,7 @@ static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
5015 val = (val & ~vmcs12->cr0_guest_host_mask) | 5214 val = (val & ~vmcs12->cr0_guest_host_mask) |
5016 (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask); 5215 (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask);
5017 5216
5018 if (!nested_cr0_valid(vmcs12, val)) 5217 if (!nested_cr0_valid(vcpu, val))
5019 return 1; 5218 return 1;
5020 5219
5021 if (kvm_set_cr0(vcpu, val)) 5220 if (kvm_set_cr0(vcpu, val))
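handle_set_cr0 builds the effective CR0 by taking guest-owned bits from the value L2 tried to write and host-owned bits from the shadowed vmcs12->guest_cr0, selected by cr0_guest_host_mask. The merge is a plain select-by-mask, shown here in isolation:

#include <stdint.h>
#include <stdio.h>

/* Bits set in 'mask' are owned by the host and come from guest_cr0;
 * the rest come from the value the guest just tried to write. */
static uint64_t merge_cr0(uint64_t written, uint64_t guest_cr0, uint64_t mask)
{
        return (written & ~mask) | (guest_cr0 & mask);
}

int main(void)
{
        /* host owns PG (bit 31); the guest wrote CR0 with PG clear */
        uint64_t val = merge_cr0(0x00000011, 0x80000011, 1ULL << 31);

        printf("effective cr0 = 0x%llx\n", (unsigned long long)val);
        return 0;
}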
@@ -5817,13 +6016,21 @@ static __init int hardware_setup(void)
5817 (unsigned long *)__get_free_page(GFP_KERNEL); 6016 (unsigned long *)__get_free_page(GFP_KERNEL);
5818 if (!vmx_msr_bitmap_longmode_x2apic) 6017 if (!vmx_msr_bitmap_longmode_x2apic)
5819 goto out4; 6018 goto out4;
6019
6020 if (nested) {
6021 vmx_msr_bitmap_nested =
6022 (unsigned long *)__get_free_page(GFP_KERNEL);
6023 if (!vmx_msr_bitmap_nested)
6024 goto out5;
6025 }
6026
5820 vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); 6027 vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
5821 if (!vmx_vmread_bitmap) 6028 if (!vmx_vmread_bitmap)
5822 goto out5; 6029 goto out6;
5823 6030
5824 vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); 6031 vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
5825 if (!vmx_vmwrite_bitmap) 6032 if (!vmx_vmwrite_bitmap)
5826 goto out6; 6033 goto out7;
5827 6034
5828 memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); 6035 memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
5829 memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); 6036 memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
@@ -5839,10 +6046,12 @@ static __init int hardware_setup(void)
5839 6046
5840 memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE); 6047 memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
5841 memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE); 6048 memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
6049 if (nested)
6050 memset(vmx_msr_bitmap_nested, 0xff, PAGE_SIZE);
5842 6051
5843 if (setup_vmcs_config(&vmcs_config) < 0) { 6052 if (setup_vmcs_config(&vmcs_config) < 0) {
5844 r = -EIO; 6053 r = -EIO;
5845 goto out7; 6054 goto out8;
5846 } 6055 }
5847 6056
5848 if (boot_cpu_has(X86_FEATURE_NX)) 6057 if (boot_cpu_has(X86_FEATURE_NX))
@@ -5868,16 +6077,16 @@ static __init int hardware_setup(void)
5868 if (!cpu_has_vmx_unrestricted_guest()) 6077 if (!cpu_has_vmx_unrestricted_guest())
5869 enable_unrestricted_guest = 0; 6078 enable_unrestricted_guest = 0;
5870 6079
5871 if (!cpu_has_vmx_flexpriority()) { 6080 if (!cpu_has_vmx_flexpriority())
5872 flexpriority_enabled = 0; 6081 flexpriority_enabled = 0;
5873 6082
5874 /* 6083 /*
5875 * set_apic_access_page_addr() is used to reload apic access 6084 * set_apic_access_page_addr() is used to reload apic access
5876 * page upon invalidation. No need to do anything if the 6085 * page upon invalidation. No need to do anything if not
5877 * processor does not have the APIC_ACCESS_ADDR VMCS field. 6086 * using the APIC_ACCESS_ADDR VMCS field.
5878 */ 6087 */
6088 if (!flexpriority_enabled)
5879 kvm_x86_ops->set_apic_access_page_addr = NULL; 6089 kvm_x86_ops->set_apic_access_page_addr = NULL;
5880 }
5881 6090
5882 if (!cpu_has_vmx_tpr_shadow()) 6091 if (!cpu_has_vmx_tpr_shadow())
5883 kvm_x86_ops->update_cr8_intercept = NULL; 6092 kvm_x86_ops->update_cr8_intercept = NULL;
@@ -5895,13 +6104,11 @@ static __init int hardware_setup(void)
5895 kvm_x86_ops->update_cr8_intercept = NULL; 6104 kvm_x86_ops->update_cr8_intercept = NULL;
5896 else { 6105 else {
5897 kvm_x86_ops->hwapic_irr_update = NULL; 6106 kvm_x86_ops->hwapic_irr_update = NULL;
6107 kvm_x86_ops->hwapic_isr_update = NULL;
5898 kvm_x86_ops->deliver_posted_interrupt = NULL; 6108 kvm_x86_ops->deliver_posted_interrupt = NULL;
5899 kvm_x86_ops->sync_pir_to_irr = vmx_sync_pir_to_irr_dummy; 6109 kvm_x86_ops->sync_pir_to_irr = vmx_sync_pir_to_irr_dummy;
5900 } 6110 }
5901 6111
5902 if (nested)
5903 nested_vmx_setup_ctls_msrs();
5904
5905 vmx_disable_intercept_for_msr(MSR_FS_BASE, false); 6112 vmx_disable_intercept_for_msr(MSR_FS_BASE, false);
5906 vmx_disable_intercept_for_msr(MSR_GS_BASE, false); 6113 vmx_disable_intercept_for_msr(MSR_GS_BASE, false);
5907 vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true); 6114 vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true);
@@ -5945,12 +6152,29 @@ static __init int hardware_setup(void)
5945 6152
5946 update_ple_window_actual_max(); 6153 update_ple_window_actual_max();
5947 6154
6155 /*
6156 * Only enable PML when hardware supports PML feature, and both EPT
6157 * and EPT A/D bit features are enabled -- PML depends on them to work.
6158 */
6159 if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml())
6160 enable_pml = 0;
6161
6162 if (!enable_pml) {
6163 kvm_x86_ops->slot_enable_log_dirty = NULL;
6164 kvm_x86_ops->slot_disable_log_dirty = NULL;
6165 kvm_x86_ops->flush_log_dirty = NULL;
6166 kvm_x86_ops->enable_log_dirty_pt_masked = NULL;
6167 }
6168
5948 return alloc_kvm_area(); 6169 return alloc_kvm_area();
5949 6170
5950out7: 6171out8:
5951 free_page((unsigned long)vmx_vmwrite_bitmap); 6172 free_page((unsigned long)vmx_vmwrite_bitmap);
5952out6: 6173out7:
5953 free_page((unsigned long)vmx_vmread_bitmap); 6174 free_page((unsigned long)vmx_vmread_bitmap);
6175out6:
6176 if (nested)
6177 free_page((unsigned long)vmx_msr_bitmap_nested);
5954out5: 6178out5:
5955 free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); 6179 free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
5956out4: 6180out4:
@@ -5977,6 +6201,8 @@ static __exit void hardware_unsetup(void)
5977 free_page((unsigned long)vmx_io_bitmap_a); 6201 free_page((unsigned long)vmx_io_bitmap_a);
5978 free_page((unsigned long)vmx_vmwrite_bitmap); 6202 free_page((unsigned long)vmx_vmwrite_bitmap);
5979 free_page((unsigned long)vmx_vmread_bitmap); 6203 free_page((unsigned long)vmx_vmread_bitmap);
6204 if (nested)
6205 free_page((unsigned long)vmx_msr_bitmap_nested);
5980 6206
5981 free_kvm_area(); 6207 free_kvm_area();
5982} 6208}
@@ -6143,6 +6369,13 @@ static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
6143 */ 6369 */
6144} 6370}
6145 6371
6372static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
6373{
 6374	/* TODO: do something smarter than simply resetting the guest here. */
6375 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
6376 pr_warn("kvm: nested vmx abort, indicator %d\n", indicator);
6377}
6378
6146static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) 6379static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
6147{ 6380{
6148 struct vcpu_vmx *vmx = 6381 struct vcpu_vmx *vmx =
@@ -6432,6 +6665,7 @@ static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
6432 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); 6665 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
6433 vmcs_write64(VMCS_LINK_POINTER, -1ull); 6666 vmcs_write64(VMCS_LINK_POINTER, -1ull);
6434 } 6667 }
6668 vmx->nested.posted_intr_nv = -1;
6435 kunmap(vmx->nested.current_vmcs12_page); 6669 kunmap(vmx->nested.current_vmcs12_page);
6436 nested_release_page(vmx->nested.current_vmcs12_page); 6670 nested_release_page(vmx->nested.current_vmcs12_page);
6437 vmx->nested.current_vmptr = -1ull; 6671 vmx->nested.current_vmptr = -1ull;
@@ -6460,6 +6694,12 @@ static void free_nested(struct vcpu_vmx *vmx)
6460 nested_release_page(vmx->nested.virtual_apic_page); 6694 nested_release_page(vmx->nested.virtual_apic_page);
6461 vmx->nested.virtual_apic_page = NULL; 6695 vmx->nested.virtual_apic_page = NULL;
6462 } 6696 }
6697 if (vmx->nested.pi_desc_page) {
6698 kunmap(vmx->nested.pi_desc_page);
6699 nested_release_page(vmx->nested.pi_desc_page);
6700 vmx->nested.pi_desc_page = NULL;
6701 vmx->nested.pi_desc = NULL;
6702 }
6463 6703
6464 nested_free_all_saved_vmcss(vmx); 6704 nested_free_all_saved_vmcss(vmx);
6465} 6705}
@@ -6893,6 +7133,7 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
6893/* Emulate the INVEPT instruction */ 7133/* Emulate the INVEPT instruction */
6894static int handle_invept(struct kvm_vcpu *vcpu) 7134static int handle_invept(struct kvm_vcpu *vcpu)
6895{ 7135{
7136 struct vcpu_vmx *vmx = to_vmx(vcpu);
6896 u32 vmx_instruction_info, types; 7137 u32 vmx_instruction_info, types;
6897 unsigned long type; 7138 unsigned long type;
6898 gva_t gva; 7139 gva_t gva;
@@ -6901,8 +7142,9 @@ static int handle_invept(struct kvm_vcpu *vcpu)
6901 u64 eptp, gpa; 7142 u64 eptp, gpa;
6902 } operand; 7143 } operand;
6903 7144
6904 if (!(nested_vmx_secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) || 7145 if (!(vmx->nested.nested_vmx_secondary_ctls_high &
6905 !(nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) { 7146 SECONDARY_EXEC_ENABLE_EPT) ||
7147 !(vmx->nested.nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) {
6906 kvm_queue_exception(vcpu, UD_VECTOR); 7148 kvm_queue_exception(vcpu, UD_VECTOR);
6907 return 1; 7149 return 1;
6908 } 7150 }
@@ -6918,7 +7160,7 @@ static int handle_invept(struct kvm_vcpu *vcpu)
6918 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 7160 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
6919 type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); 7161 type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
6920 7162
6921 types = (nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; 7163 types = (vmx->nested.nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
6922 7164
6923 if (!(types & (1UL << type))) { 7165 if (!(types & (1UL << type))) {
6924 nested_vmx_failValid(vcpu, 7166 nested_vmx_failValid(vcpu,
@@ -6960,6 +7202,31 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
6960 return 1; 7202 return 1;
6961} 7203}
6962 7204
7205static int handle_pml_full(struct kvm_vcpu *vcpu)
7206{
7207 unsigned long exit_qualification;
7208
7209 trace_kvm_pml_full(vcpu->vcpu_id);
7210
7211 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
7212
7213 /*
 7214	 * If the PML buffer filled up while executing IRET from an NMI,
 7215	 * the "blocked by NMI" bit has to be set before the next VM entry.
7216 */
7217 if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
7218 cpu_has_virtual_nmis() &&
7219 (exit_qualification & INTR_INFO_UNBLOCK_NMI))
7220 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
7221 GUEST_INTR_STATE_NMI);
7222
7223 /*
7224 * PML buffer already flushed at beginning of VMEXIT. Nothing to do
 7225	 * here, and there's no userspace involvement needed for PML.
7226 */
7227 return 1;
7228}
7229
6963/* 7230/*
6964 * The exit handlers return 1 if the exit was handled fully and guest execution 7231 * The exit handlers return 1 if the exit was handled fully and guest execution
6965 * may resume. Otherwise they set the kvm_run parameter to indicate what needs 7232 * may resume. Otherwise they set the kvm_run parameter to indicate what needs
@@ -7008,6 +7275,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
7008 [EXIT_REASON_INVVPID] = handle_invvpid, 7275 [EXIT_REASON_INVVPID] = handle_invvpid,
7009 [EXIT_REASON_XSAVES] = handle_xsaves, 7276 [EXIT_REASON_XSAVES] = handle_xsaves,
7010 [EXIT_REASON_XRSTORS] = handle_xrstors, 7277 [EXIT_REASON_XRSTORS] = handle_xrstors,
7278 [EXIT_REASON_PML_FULL] = handle_pml_full,
7011}; 7279};
7012 7280
7013static const int kvm_vmx_max_exit_handlers = 7281static const int kvm_vmx_max_exit_handlers =
@@ -7275,6 +7543,10 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
7275 case EXIT_REASON_APIC_ACCESS: 7543 case EXIT_REASON_APIC_ACCESS:
7276 return nested_cpu_has2(vmcs12, 7544 return nested_cpu_has2(vmcs12,
7277 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); 7545 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
7546 case EXIT_REASON_APIC_WRITE:
7547 case EXIT_REASON_EOI_INDUCED:
7548 /* apic_write and eoi_induced should exit unconditionally. */
7549 return 1;
7278 case EXIT_REASON_EPT_VIOLATION: 7550 case EXIT_REASON_EPT_VIOLATION:
7279 /* 7551 /*
7280 * L0 always deals with the EPT violation. If nested EPT is 7552 * L0 always deals with the EPT violation. If nested EPT is
@@ -7314,6 +7586,89 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
7314 *info2 = vmcs_read32(VM_EXIT_INTR_INFO); 7586 *info2 = vmcs_read32(VM_EXIT_INTR_INFO);
7315} 7587}
7316 7588
7589static int vmx_enable_pml(struct vcpu_vmx *vmx)
7590{
7591 struct page *pml_pg;
7592 u32 exec_control;
7593
7594 pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO);
7595 if (!pml_pg)
7596 return -ENOMEM;
7597
7598 vmx->pml_pg = pml_pg;
7599
7600 vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
7601 vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
7602
7603 exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
7604 exec_control |= SECONDARY_EXEC_ENABLE_PML;
7605 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
7606
7607 return 0;
7608}
7609
7610static void vmx_disable_pml(struct vcpu_vmx *vmx)
7611{
7612 u32 exec_control;
7613
7614 ASSERT(vmx->pml_pg);
7615 __free_page(vmx->pml_pg);
7616 vmx->pml_pg = NULL;
7617
7618 exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
7619 exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
7620 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
7621}
7622
7623static void vmx_flush_pml_buffer(struct vcpu_vmx *vmx)
7624{
7625 struct kvm *kvm = vmx->vcpu.kvm;
7626 u64 *pml_buf;
7627 u16 pml_idx;
7628
7629 pml_idx = vmcs_read16(GUEST_PML_INDEX);
7630
7631 /* Do nothing if PML buffer is empty */
7632 if (pml_idx == (PML_ENTITY_NUM - 1))
7633 return;
7634
7635 /* PML index always points to next available PML buffer entity */
7636 if (pml_idx >= PML_ENTITY_NUM)
7637 pml_idx = 0;
7638 else
7639 pml_idx++;
7640
7641 pml_buf = page_address(vmx->pml_pg);
7642 for (; pml_idx < PML_ENTITY_NUM; pml_idx++) {
7643 u64 gpa;
7644
7645 gpa = pml_buf[pml_idx];
7646 WARN_ON(gpa & (PAGE_SIZE - 1));
7647 mark_page_dirty(kvm, gpa >> PAGE_SHIFT);
7648 }
7649
7650 /* reset PML index */
7651 vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
7652}
7653
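As a side note on the arithmetic above: a minimal standalone sketch (plain C, not kernel code) of how the valid PML slots are derived from GUEST_PML_INDEX, assuming PML_ENTITY_NUM is 512 eight-byte entries per 4 KiB page as in the VMX headers, and that hardware fills the page from the last slot downwards:

	#include <stdint.h>
	#include <stdio.h>

	#define PML_ENTITY_NUM 512	/* entries per PML page, 8 bytes each */

	/* Walk the valid entries the same way vmx_flush_pml_buffer() does:
	 * the hardware index points at the next free slot and counts down,
	 * so the filled slots are pml_idx+1 .. PML_ENTITY_NUM-1, or all of
	 * them once the index has underflowed and reads back >= 512. */
	static void walk_pml(const uint64_t *pml_buf, uint16_t pml_idx)
	{
		if (pml_idx == PML_ENTITY_NUM - 1)
			return;			/* buffer empty, nothing logged */

		if (pml_idx >= PML_ENTITY_NUM)
			pml_idx = 0;		/* buffer full: every slot valid */
		else
			pml_idx++;		/* first valid slot */

		for (; pml_idx < PML_ENTITY_NUM; pml_idx++)
			printf("dirty gfn 0x%llx\n",
			       (unsigned long long)(pml_buf[pml_idx] >> 12));
	}

	int main(void)
	{
		uint64_t buf[PML_ENTITY_NUM] = { 0 };

		buf[511] = 0x1234000;	/* pretend hardware logged one GPA ... */
		walk_pml(buf, 510);	/* ... and decremented the index to 510 */
		return 0;
	}

After the walk the kernel rewinds GUEST_PML_INDEX to PML_ENTITY_NUM - 1 so the hardware can reuse the whole page.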
7654/*
 7655	 * Flush all vcpus' PML buffers and propagate the logged GPAs to dirty_bitmap.
7656 * Called before reporting dirty_bitmap to userspace.
7657 */
7658static void kvm_flush_pml_buffers(struct kvm *kvm)
7659{
7660 int i;
7661 struct kvm_vcpu *vcpu;
7662 /*
 7663	 * We only need to kick each vcpu out of guest mode here: the PML
 7664	 * buffer is flushed at the beginning of every VMEXIT, so only vcpus
 7665	 * currently running in guest mode can still have unflushed GPAs in
 7666	 * their PML buffers.
7667 */
7668 kvm_for_each_vcpu(i, vcpu, kvm)
7669 kvm_vcpu_kick(vcpu);
7670}
7671
7317/* 7672/*
7318 * The guest has exited. See if we can fix it or if we need userspace 7673 * The guest has exited. See if we can fix it or if we need userspace
7319 * assistance. 7674 * assistance.
@@ -7324,6 +7679,16 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
7324 u32 exit_reason = vmx->exit_reason; 7679 u32 exit_reason = vmx->exit_reason;
7325 u32 vectoring_info = vmx->idt_vectoring_info; 7680 u32 vectoring_info = vmx->idt_vectoring_info;
7326 7681
7682 /*
 7683	 * Flush the GPAs logged in the PML buffer so that dirty_bitmap is
 7684	 * kept up to date. A further benefit: in kvm_vm_ioctl_get_dirty_log,
 7685	 * before querying dirty_bitmap we only need to kick all vcpus out of
 7686	 * guest mode, since any vcpu already in root mode must have flushed
 7687	 * its PML buffer already.
7688 */
7689 if (enable_pml)
7690 vmx_flush_pml_buffer(vmx);
7691
7327 /* If guest state is invalid, start emulating */ 7692 /* If guest state is invalid, start emulating */
7328 if (vmx->emulation_required) 7693 if (vmx->emulation_required)
7329 return handle_invalid_guest_state(vcpu); 7694 return handle_invalid_guest_state(vcpu);
@@ -7471,9 +7836,6 @@ static void vmx_hwapic_isr_update(struct kvm *kvm, int isr)
7471 u16 status; 7836 u16 status;
7472 u8 old; 7837 u8 old;
7473 7838
7474 if (!vmx_vm_has_apicv(kvm))
7475 return;
7476
7477 if (isr == -1) 7839 if (isr == -1)
7478 isr = 0; 7840 isr = 0;
7479 7841
@@ -7973,6 +8335,8 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
7973{ 8335{
7974 struct vcpu_vmx *vmx = to_vmx(vcpu); 8336 struct vcpu_vmx *vmx = to_vmx(vcpu);
7975 8337
8338 if (enable_pml)
8339 vmx_disable_pml(vmx);
7976 free_vpid(vmx); 8340 free_vpid(vmx);
7977 leave_guest_mode(vcpu); 8341 leave_guest_mode(vcpu);
7978 vmx_load_vmcs01(vcpu); 8342 vmx_load_vmcs01(vcpu);
@@ -8040,9 +8404,25 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
8040 goto free_vmcs; 8404 goto free_vmcs;
8041 } 8405 }
8042 8406
8407 if (nested)
8408 nested_vmx_setup_ctls_msrs(vmx);
8409
8410 vmx->nested.posted_intr_nv = -1;
8043 vmx->nested.current_vmptr = -1ull; 8411 vmx->nested.current_vmptr = -1ull;
8044 vmx->nested.current_vmcs12 = NULL; 8412 vmx->nested.current_vmcs12 = NULL;
8045 8413
8414 /*
 8415	 * If PML is turned on, a failure to enable PML simply fails vcpu
 8416	 * creation, which keeps the PML logic simple: we never have to deal
 8417	 * with cases such as PML being enabled on only some of the guest's
 8418	 * vcpus.
8419 */
8420 if (enable_pml) {
8421 err = vmx_enable_pml(vmx);
8422 if (err)
8423 goto free_vmcs;
8424 }
8425
8046 return &vmx->vcpu; 8426 return &vmx->vcpu;
8047 8427
8048free_vmcs: 8428free_vmcs:
@@ -8184,9 +8564,10 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu)
8184 8564
8185static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) 8565static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
8186{ 8566{
8187 kvm_init_shadow_ept_mmu(vcpu, &vcpu->arch.mmu, 8567 WARN_ON(mmu_is_nested(vcpu));
8188 nested_vmx_ept_caps & VMX_EPT_EXECUTE_ONLY_BIT); 8568 kvm_init_shadow_ept_mmu(vcpu,
8189 8569 to_vmx(vcpu)->nested.nested_vmx_ept_caps &
8570 VMX_EPT_EXECUTE_ONLY_BIT);
8190 vcpu->arch.mmu.set_cr3 = vmx_set_cr3; 8571 vcpu->arch.mmu.set_cr3 = vmx_set_cr3;
8191 vcpu->arch.mmu.get_cr3 = nested_ept_get_cr3; 8572 vcpu->arch.mmu.get_cr3 = nested_ept_get_cr3;
8192 vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault; 8573 vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault;
@@ -8199,6 +8580,18 @@ static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
8199 vcpu->arch.walk_mmu = &vcpu->arch.mmu; 8580 vcpu->arch.walk_mmu = &vcpu->arch.mmu;
8200} 8581}
8201 8582
8583static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
8584 u16 error_code)
8585{
8586 bool inequality, bit;
8587
8588 bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0;
8589 inequality =
8590 (error_code & vmcs12->page_fault_error_code_mask) !=
8591 vmcs12->page_fault_error_code_match;
8592 return inequality ^ bit;
8593}
8594
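The inequality ^ bit expression implements the architectural PFEC_MASK/PFEC_MATCH rule: with EB.PF set, only matching error codes cause a vmexit to L1; with EB.PF clear, only mismatching ones do. A small standalone sketch (illustrative values, not kernel code):

	#include <stdbool.h>
	#include <stdint.h>

	/* Same decision as nested_vmx_is_page_fault_vmexit(), written out.
	 * "match" means (error_code & mask) == match_value. */
	static bool pf_goes_to_l1(uint32_t exception_bitmap,
				  uint32_t pfec_mask, uint32_t pfec_match,
				  uint16_t error_code)
	{
		bool eb_pf = exception_bitmap & (1u << 14);	/* PF_VECTOR == 14 */
		bool mismatch = (error_code & pfec_mask) != pfec_match;

		/* EB.PF = 1: vmexit to L1 only on a match.
		 * EB.PF = 0: vmexit to L1 only on a mismatch. */
		return mismatch ^ eb_pf;
	}

	int main(void)
	{
		/* EB.PF set; mask/match select only write faults (bit 1 set) */
		return pf_goes_to_l1(1u << 14, 0x2, 0x2, /*error_code=*/0x3) ? 0 : 1;
	}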
8202static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, 8595static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
8203 struct x86_exception *fault) 8596 struct x86_exception *fault)
8204{ 8597{
@@ -8206,8 +8599,7 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
8206 8599
8207 WARN_ON(!is_guest_mode(vcpu)); 8600 WARN_ON(!is_guest_mode(vcpu));
8208 8601
8209 /* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */ 8602 if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code))
8210 if (vmcs12->exception_bitmap & (1u << PF_VECTOR))
8211 nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason, 8603 nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason,
8212 vmcs_read32(VM_EXIT_INTR_INFO), 8604 vmcs_read32(VM_EXIT_INTR_INFO),
8213 vmcs_readl(EXIT_QUALIFICATION)); 8605 vmcs_readl(EXIT_QUALIFICATION));
@@ -8261,6 +8653,31 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
8261 return false; 8653 return false;
8262 } 8654 }
8263 8655
8656 if (nested_cpu_has_posted_intr(vmcs12)) {
8657 if (!IS_ALIGNED(vmcs12->posted_intr_desc_addr, 64))
8658 return false;
8659
8660 if (vmx->nested.pi_desc_page) { /* shouldn't happen */
8661 kunmap(vmx->nested.pi_desc_page);
8662 nested_release_page(vmx->nested.pi_desc_page);
8663 }
8664 vmx->nested.pi_desc_page =
8665 nested_get_page(vcpu, vmcs12->posted_intr_desc_addr);
8666 if (!vmx->nested.pi_desc_page)
8667 return false;
8668
8669 vmx->nested.pi_desc =
8670 (struct pi_desc *)kmap(vmx->nested.pi_desc_page);
8671 if (!vmx->nested.pi_desc) {
8672 nested_release_page_clean(vmx->nested.pi_desc_page);
8673 return false;
8674 }
8675 vmx->nested.pi_desc =
8676 (struct pi_desc *)((void *)vmx->nested.pi_desc +
8677 (unsigned long)(vmcs12->posted_intr_desc_addr &
8678 (PAGE_SIZE - 1)));
8679 }
8680
8264 return true; 8681 return true;
8265} 8682}
8266 8683
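The pointer arithmetic above just re-applies the descriptor's offset within its page after kmap(). A trivial standalone sketch with a hypothetical posted_intr_desc_addr:

	#include <stdint.h>
	#include <stdio.h>

	#define PAGE_SIZE 4096ULL

	int main(void)
	{
		/* hypothetical value L1 might put in posted_intr_desc_addr */
		uint64_t pi_desc_addr = 0x12345040ULL;

		uint64_t page_gpa = pi_desc_addr & ~(PAGE_SIZE - 1);	/* page to map   */
		uint64_t offset   = pi_desc_addr &  (PAGE_SIZE - 1);	/* offset inside */

		printf("map gpa 0x%llx, descriptor at offset 0x%llx (64-byte aligned: %s)\n",
		       (unsigned long long)page_gpa, (unsigned long long)offset,
		       (pi_desc_addr & 63) == 0 ? "yes" : "no");
		return 0;
	}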
@@ -8286,6 +8703,310 @@ static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu)
8286 ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL); 8703 ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL);
8287} 8704}
8288 8705
8706static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
8707 struct vmcs12 *vmcs12)
8708{
8709 int maxphyaddr;
8710 u64 addr;
8711
8712 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
8713 return 0;
8714
8715 if (vmcs12_read_any(vcpu, MSR_BITMAP, &addr)) {
8716 WARN_ON(1);
8717 return -EINVAL;
8718 }
8719 maxphyaddr = cpuid_maxphyaddr(vcpu);
8720
8721 if (!PAGE_ALIGNED(vmcs12->msr_bitmap) ||
8722 ((addr + PAGE_SIZE) >> maxphyaddr))
8723 return -EINVAL;
8724
8725 return 0;
8726}
8727
8728/*
 8729 * Merge L0's and L1's MSR bitmaps; return false to indicate that
 8730 * the hardware MSR bitmap should not be used.
8731 */
8732static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
8733 struct vmcs12 *vmcs12)
8734{
8735 int msr;
8736 struct page *page;
8737 unsigned long *msr_bitmap;
8738
8739 if (!nested_cpu_has_virt_x2apic_mode(vmcs12))
8740 return false;
8741
8742 page = nested_get_page(vcpu, vmcs12->msr_bitmap);
8743 if (!page) {
8744 WARN_ON(1);
8745 return false;
8746 }
8747 msr_bitmap = (unsigned long *)kmap(page);
8748 if (!msr_bitmap) {
8749 nested_release_page_clean(page);
8750 WARN_ON(1);
8751 return false;
8752 }
8753
8754 if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
8755 if (nested_cpu_has_apic_reg_virt(vmcs12))
8756 for (msr = 0x800; msr <= 0x8ff; msr++)
8757 nested_vmx_disable_intercept_for_msr(
8758 msr_bitmap,
8759 vmx_msr_bitmap_nested,
8760 msr, MSR_TYPE_R);
8761 /* TPR is allowed */
8762 nested_vmx_disable_intercept_for_msr(msr_bitmap,
8763 vmx_msr_bitmap_nested,
8764 APIC_BASE_MSR + (APIC_TASKPRI >> 4),
8765 MSR_TYPE_R | MSR_TYPE_W);
8766 if (nested_cpu_has_vid(vmcs12)) {
8767 /* EOI and self-IPI are allowed */
8768 nested_vmx_disable_intercept_for_msr(
8769 msr_bitmap,
8770 vmx_msr_bitmap_nested,
8771 APIC_BASE_MSR + (APIC_EOI >> 4),
8772 MSR_TYPE_W);
8773 nested_vmx_disable_intercept_for_msr(
8774 msr_bitmap,
8775 vmx_msr_bitmap_nested,
8776 APIC_BASE_MSR + (APIC_SELF_IPI >> 4),
8777 MSR_TYPE_W);
8778 }
8779 } else {
8780 /*
8781 * Enable reading intercept of all the x2apic
8782 * MSRs. We should not rely on vmcs12 to do any
8783 * optimizations here, it may have been modified
8784 * by L1.
8785 */
8786 for (msr = 0x800; msr <= 0x8ff; msr++)
8787 __vmx_enable_intercept_for_msr(
8788 vmx_msr_bitmap_nested,
8789 msr,
8790 MSR_TYPE_R);
8791
8792 __vmx_enable_intercept_for_msr(
8793 vmx_msr_bitmap_nested,
8794 APIC_BASE_MSR + (APIC_TASKPRI >> 4),
8795 MSR_TYPE_W);
8796 __vmx_enable_intercept_for_msr(
8797 vmx_msr_bitmap_nested,
8798 APIC_BASE_MSR + (APIC_EOI >> 4),
8799 MSR_TYPE_W);
8800 __vmx_enable_intercept_for_msr(
8801 vmx_msr_bitmap_nested,
8802 APIC_BASE_MSR + (APIC_SELF_IPI >> 4),
8803 MSR_TYPE_W);
8804 }
8805 kunmap(page);
8806 nested_release_page_clean(page);
8807
8808 return true;
8809}
8810
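The MSR numbers used here follow the x2APIC convention that MSR 0x800 + (xAPIC MMIO offset >> 4) maps each 16-byte-aligned APIC register, which is why the 0x800-0x8ff range and the APIC_BASE_MSR + (reg >> 4) expressions appear. A standalone sketch of the mapping (usual offsets assumed: TPR 0x80, EOI 0xB0, SELF_IPI 0x3F0):

	#include <stdint.h>
	#include <stdio.h>

	#define APIC_BASE_MSR	0x800	/* first x2APIC MSR */

	/* x2APIC exposes each 16-byte-aligned xAPIC MMIO register as one MSR:
	 * MSR index = 0x800 + (MMIO offset >> 4). */
	static uint32_t x2apic_msr(uint32_t mmio_offset)
	{
		return APIC_BASE_MSR + (mmio_offset >> 4);
	}

	int main(void)
	{
		printf("TPR      (offset 0x080) -> MSR 0x%x\n", x2apic_msr(0x080)); /* 0x808 */
		printf("EOI      (offset 0x0b0) -> MSR 0x%x\n", x2apic_msr(0x0b0)); /* 0x80b */
		printf("SELF_IPI (offset 0x3f0) -> MSR 0x%x\n", x2apic_msr(0x3f0)); /* 0x83f */
		return 0;
	}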
8811static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
8812 struct vmcs12 *vmcs12)
8813{
8814 if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
8815 !nested_cpu_has_apic_reg_virt(vmcs12) &&
8816 !nested_cpu_has_vid(vmcs12) &&
8817 !nested_cpu_has_posted_intr(vmcs12))
8818 return 0;
8819
8820 /*
8821 * If virtualize x2apic mode is enabled,
8822 * virtualize apic access must be disabled.
8823 */
8824 if (nested_cpu_has_virt_x2apic_mode(vmcs12) &&
8825 nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
8826 return -EINVAL;
8827
8828 /*
8829 * If virtual interrupt delivery is enabled,
8830 * we must exit on external interrupts.
8831 */
8832 if (nested_cpu_has_vid(vmcs12) &&
8833 !nested_exit_on_intr(vcpu))
8834 return -EINVAL;
8835
8836 /*
 8837	 * Bits 15:8 must be zero in posted_intr_nv; the descriptor
 8838	 * address has already been checked in
 8839	 * nested_get_vmcs12_pages.
8840 */
8841 if (nested_cpu_has_posted_intr(vmcs12) &&
8842 (!nested_cpu_has_vid(vmcs12) ||
8843 !nested_exit_intr_ack_set(vcpu) ||
8844 vmcs12->posted_intr_nv & 0xff00))
8845 return -EINVAL;
8846
8847 /* tpr shadow is needed by all apicv features. */
8848 if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
8849 return -EINVAL;
8850
8851 return 0;
8852}
8853
8854static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
8855 unsigned long count_field,
8856 unsigned long addr_field,
8857 int maxphyaddr)
8858{
8859 u64 count, addr;
8860
8861 if (vmcs12_read_any(vcpu, count_field, &count) ||
8862 vmcs12_read_any(vcpu, addr_field, &addr)) {
8863 WARN_ON(1);
8864 return -EINVAL;
8865 }
8866 if (count == 0)
8867 return 0;
8868 if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr ||
8869 (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr) {
8870 pr_warn_ratelimited(
8871 "nVMX: invalid MSR switch (0x%lx, %d, %llu, 0x%08llx)",
8872 addr_field, maxphyaddr, count, addr);
8873 return -EINVAL;
8874 }
8875 return 0;
8876}
8877
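A standalone restatement of the validity test above, assuming 16-byte entries (the vmx_msr_entry layout) and a hypothetical 36-bit guest physical address width: the switch area must be 16-byte aligned and must end below 2^maxphyaddr.

	#include <stdbool.h>
	#include <stdint.h>

	#define VMX_MSR_ENTRY_SIZE 16	/* index, reserved, value */

	/* Mirror of nested_vmx_check_msr_switch()'s range check. */
	static bool msr_switch_area_ok(uint64_t addr, uint64_t count, int maxphyaddr)
	{
		if (count == 0)
			return true;	/* feature simply unused */
		if (addr & 15)
			return false;	/* must be 16-byte aligned */
		if (addr >> maxphyaddr)
			return false;	/* start beyond guest physical width */
		if ((addr + count * VMX_MSR_ENTRY_SIZE - 1) >> maxphyaddr)
			return false;	/* end beyond guest physical width */
		return true;
	}

	int main(void)
	{
		/* hypothetical: 36-bit maxphyaddr, 4-entry area near the top */
		return msr_switch_area_ok(0xfffffff00ULL, 4, 36) ? 0 : 1;
	}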
8878static int nested_vmx_check_msr_switch_controls(struct kvm_vcpu *vcpu,
8879 struct vmcs12 *vmcs12)
8880{
8881 int maxphyaddr;
8882
8883 if (vmcs12->vm_exit_msr_load_count == 0 &&
8884 vmcs12->vm_exit_msr_store_count == 0 &&
8885 vmcs12->vm_entry_msr_load_count == 0)
8886 return 0; /* Fast path */
8887 maxphyaddr = cpuid_maxphyaddr(vcpu);
8888 if (nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_LOAD_COUNT,
8889 VM_EXIT_MSR_LOAD_ADDR, maxphyaddr) ||
8890 nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_STORE_COUNT,
8891 VM_EXIT_MSR_STORE_ADDR, maxphyaddr) ||
8892 nested_vmx_check_msr_switch(vcpu, VM_ENTRY_MSR_LOAD_COUNT,
8893 VM_ENTRY_MSR_LOAD_ADDR, maxphyaddr))
8894 return -EINVAL;
8895 return 0;
8896}
8897
8898static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
8899 struct vmx_msr_entry *e)
8900{
8901 /* x2APIC MSR accesses are not allowed */
8902 if (apic_x2apic_mode(vcpu->arch.apic) && e->index >> 8 == 0x8)
8903 return -EINVAL;
8904 if (e->index == MSR_IA32_UCODE_WRITE || /* SDM Table 35-2 */
8905 e->index == MSR_IA32_UCODE_REV)
8906 return -EINVAL;
8907 if (e->reserved != 0)
8908 return -EINVAL;
8909 return 0;
8910}
8911
8912static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu,
8913 struct vmx_msr_entry *e)
8914{
8915 if (e->index == MSR_FS_BASE ||
8916 e->index == MSR_GS_BASE ||
8917 e->index == MSR_IA32_SMM_MONITOR_CTL || /* SMM is not supported */
8918 nested_vmx_msr_check_common(vcpu, e))
8919 return -EINVAL;
8920 return 0;
8921}
8922
8923static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu,
8924 struct vmx_msr_entry *e)
8925{
8926 if (e->index == MSR_IA32_SMBASE || /* SMM is not supported */
8927 nested_vmx_msr_check_common(vcpu, e))
8928 return -EINVAL;
8929 return 0;
8930}
8931
8932/*
 8933 * Load the guest's/host's MSRs at nested entry/exit.
 8934 * Returns 0 on success, or the 1-based index of the failing entry.
8935 */
8936static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
8937{
8938 u32 i;
8939 struct vmx_msr_entry e;
8940 struct msr_data msr;
8941
8942 msr.host_initiated = false;
8943 for (i = 0; i < count; i++) {
8944 if (kvm_read_guest(vcpu->kvm, gpa + i * sizeof(e),
8945 &e, sizeof(e))) {
8946 pr_warn_ratelimited(
8947 "%s cannot read MSR entry (%u, 0x%08llx)\n",
8948 __func__, i, gpa + i * sizeof(e));
8949 goto fail;
8950 }
8951 if (nested_vmx_load_msr_check(vcpu, &e)) {
8952 pr_warn_ratelimited(
8953 "%s check failed (%u, 0x%x, 0x%x)\n",
8954 __func__, i, e.index, e.reserved);
8955 goto fail;
8956 }
8957 msr.index = e.index;
8958 msr.data = e.value;
8959 if (kvm_set_msr(vcpu, &msr)) {
8960 pr_warn_ratelimited(
8961 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
8962 __func__, i, e.index, e.value);
8963 goto fail;
8964 }
8965 }
8966 return 0;
8967fail:
8968 return i + 1;
8969}
8970
8971static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
8972{
8973 u32 i;
8974 struct vmx_msr_entry e;
8975
8976 for (i = 0; i < count; i++) {
8977 if (kvm_read_guest(vcpu->kvm,
8978 gpa + i * sizeof(e),
8979 &e, 2 * sizeof(u32))) {
8980 pr_warn_ratelimited(
8981 "%s cannot read MSR entry (%u, 0x%08llx)\n",
8982 __func__, i, gpa + i * sizeof(e));
8983 return -EINVAL;
8984 }
8985 if (nested_vmx_store_msr_check(vcpu, &e)) {
8986 pr_warn_ratelimited(
8987 "%s check failed (%u, 0x%x, 0x%x)\n",
8988 __func__, i, e.index, e.reserved);
8989 return -EINVAL;
8990 }
8991 if (kvm_get_msr(vcpu, e.index, &e.value)) {
8992 pr_warn_ratelimited(
8993 "%s cannot read MSR (%u, 0x%x)\n",
8994 __func__, i, e.index);
8995 return -EINVAL;
8996 }
8997 if (kvm_write_guest(vcpu->kvm,
8998 gpa + i * sizeof(e) +
8999 offsetof(struct vmx_msr_entry, value),
9000 &e.value, sizeof(e.value))) {
9001 pr_warn_ratelimited(
9002 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
9003 __func__, i, e.index, e.value);
9004 return -EINVAL;
9005 }
9006 }
9007 return 0;
9008}
9009
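Both helpers walk the architectural VM-entry/VM-exit MSR load/store area in guest memory: an array of 16-byte entries (index, reserved, value). A small sketch of the addressing the loops above rely on, with a hypothetical GPA and entry number:

	#include <stddef.h>
	#include <stdint.h>
	#include <stdio.h>

	/* One entry of the MSR load/store area, 16 bytes
	 * (mirrors struct vmx_msr_entry in the kernel headers). */
	struct msr_area_entry {
		uint32_t index;
		uint32_t reserved;	/* must be zero */
		uint64_t value;
	};

	int main(void)
	{
		/* hypothetical: area at GPA 0x1000, third entry (i == 2) */
		uint64_t area_gpa = 0x1000;
		unsigned int i = 2;

		/* where the load/store loops read the whole entry from ... */
		uint64_t entry_gpa = area_gpa + i * sizeof(struct msr_area_entry);
		/* ... and where nested_vmx_store_msr() writes back only the value */
		uint64_t value_gpa = entry_gpa + offsetof(struct msr_area_entry, value);

		printf("entry %u at 0x%llx, value field at 0x%llx\n", i,
		       (unsigned long long)entry_gpa, (unsigned long long)value_gpa);
		return 0;
	}

The "entry index for failure" convention (i + 1, so 0 still means success) matches what nested_vmx_entry_failure() then reports to L1 as the exit qualification.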
8289/* 9010/*
8290 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested 9011 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
8291 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it 9012 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
@@ -8365,8 +9086,23 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
8365 9086
8366 exec_control = vmcs12->pin_based_vm_exec_control; 9087 exec_control = vmcs12->pin_based_vm_exec_control;
8367 exec_control |= vmcs_config.pin_based_exec_ctrl; 9088 exec_control |= vmcs_config.pin_based_exec_ctrl;
8368 exec_control &= ~(PIN_BASED_VMX_PREEMPTION_TIMER | 9089 exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
8369 PIN_BASED_POSTED_INTR); 9090
9091 if (nested_cpu_has_posted_intr(vmcs12)) {
9092 /*
9093 * Note that we use L0's vector here and in
9094 * vmx_deliver_nested_posted_interrupt.
9095 */
9096 vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
9097 vmx->nested.pi_pending = false;
9098 vmcs_write64(POSTED_INTR_NV, POSTED_INTR_VECTOR);
9099 vmcs_write64(POSTED_INTR_DESC_ADDR,
9100 page_to_phys(vmx->nested.pi_desc_page) +
9101 (unsigned long)(vmcs12->posted_intr_desc_addr &
9102 (PAGE_SIZE - 1)));
9103 } else
9104 exec_control &= ~PIN_BASED_POSTED_INTR;
9105
8370 vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control); 9106 vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control);
8371 9107
8372 vmx->nested.preemption_timer_expired = false; 9108 vmx->nested.preemption_timer_expired = false;
@@ -8423,12 +9159,26 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
8423 else 9159 else
8424 vmcs_write64(APIC_ACCESS_ADDR, 9160 vmcs_write64(APIC_ACCESS_ADDR,
8425 page_to_phys(vmx->nested.apic_access_page)); 9161 page_to_phys(vmx->nested.apic_access_page));
8426 } else if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) { 9162 } else if (!(nested_cpu_has_virt_x2apic_mode(vmcs12)) &&
9163 (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))) {
8427 exec_control |= 9164 exec_control |=
8428 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 9165 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
8429 kvm_vcpu_reload_apic_access_page(vcpu); 9166 kvm_vcpu_reload_apic_access_page(vcpu);
8430 } 9167 }
8431 9168
9169 if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) {
9170 vmcs_write64(EOI_EXIT_BITMAP0,
9171 vmcs12->eoi_exit_bitmap0);
9172 vmcs_write64(EOI_EXIT_BITMAP1,
9173 vmcs12->eoi_exit_bitmap1);
9174 vmcs_write64(EOI_EXIT_BITMAP2,
9175 vmcs12->eoi_exit_bitmap2);
9176 vmcs_write64(EOI_EXIT_BITMAP3,
9177 vmcs12->eoi_exit_bitmap3);
9178 vmcs_write16(GUEST_INTR_STATUS,
9179 vmcs12->guest_intr_status);
9180 }
9181
8432 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); 9182 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
8433 } 9183 }
8434 9184
@@ -8462,11 +9212,17 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
8462 vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); 9212 vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
8463 } 9213 }
8464 9214
9215 if (cpu_has_vmx_msr_bitmap() &&
9216 exec_control & CPU_BASED_USE_MSR_BITMAPS &&
9217 nested_vmx_merge_msr_bitmap(vcpu, vmcs12)) {
9218 vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_nested));
9219 } else
9220 exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
9221
8465 /* 9222 /*
8466 * Merging of IO and MSR bitmaps not currently supported. 9223 * Merging of IO bitmap not currently supported.
8467 * Rather, exit every time. 9224 * Rather, exit every time.
8468 */ 9225 */
8469 exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
8470 exec_control &= ~CPU_BASED_USE_IO_BITMAPS; 9226 exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
8471 exec_control |= CPU_BASED_UNCOND_IO_EXITING; 9227 exec_control |= CPU_BASED_UNCOND_IO_EXITING;
8472 9228
@@ -8582,6 +9338,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
8582 int cpu; 9338 int cpu;
8583 struct loaded_vmcs *vmcs02; 9339 struct loaded_vmcs *vmcs02;
8584 bool ia32e; 9340 bool ia32e;
9341 u32 msr_entry_idx;
8585 9342
8586 if (!nested_vmx_check_permission(vcpu) || 9343 if (!nested_vmx_check_permission(vcpu) ||
8587 !nested_vmx_check_vmcs12(vcpu)) 9344 !nested_vmx_check_vmcs12(vcpu))
@@ -8616,41 +9373,42 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
8616 return 1; 9373 return 1;
8617 } 9374 }
8618 9375
8619 if ((vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_MSR_BITMAPS) && 9376 if (!nested_get_vmcs12_pages(vcpu, vmcs12)) {
8620 !PAGE_ALIGNED(vmcs12->msr_bitmap)) {
8621 /*TODO: Also verify bits beyond physical address width are 0*/ 9377 /*TODO: Also verify bits beyond physical address width are 0*/
8622 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 9378 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
8623 return 1; 9379 return 1;
8624 } 9380 }
8625 9381
8626 if (!nested_get_vmcs12_pages(vcpu, vmcs12)) { 9382 if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12)) {
8627 /*TODO: Also verify bits beyond physical address width are 0*/
8628 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 9383 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
8629 return 1; 9384 return 1;
8630 } 9385 }
8631 9386
8632 if (vmcs12->vm_entry_msr_load_count > 0 || 9387 if (nested_vmx_check_apicv_controls(vcpu, vmcs12)) {
8633 vmcs12->vm_exit_msr_load_count > 0 || 9388 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
8634 vmcs12->vm_exit_msr_store_count > 0) { 9389 return 1;
8635 pr_warn_ratelimited("%s: VMCS MSR_{LOAD,STORE} unsupported\n", 9390 }
8636 __func__); 9391
9392 if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12)) {
8637 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 9393 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
8638 return 1; 9394 return 1;
8639 } 9395 }
8640 9396
8641 if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, 9397 if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
8642 nested_vmx_true_procbased_ctls_low, 9398 vmx->nested.nested_vmx_true_procbased_ctls_low,
8643 nested_vmx_procbased_ctls_high) || 9399 vmx->nested.nested_vmx_procbased_ctls_high) ||
8644 !vmx_control_verify(vmcs12->secondary_vm_exec_control, 9400 !vmx_control_verify(vmcs12->secondary_vm_exec_control,
8645 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high) || 9401 vmx->nested.nested_vmx_secondary_ctls_low,
9402 vmx->nested.nested_vmx_secondary_ctls_high) ||
8646 !vmx_control_verify(vmcs12->pin_based_vm_exec_control, 9403 !vmx_control_verify(vmcs12->pin_based_vm_exec_control,
8647 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high) || 9404 vmx->nested.nested_vmx_pinbased_ctls_low,
9405 vmx->nested.nested_vmx_pinbased_ctls_high) ||
8648 !vmx_control_verify(vmcs12->vm_exit_controls, 9406 !vmx_control_verify(vmcs12->vm_exit_controls,
8649 nested_vmx_true_exit_ctls_low, 9407 vmx->nested.nested_vmx_true_exit_ctls_low,
8650 nested_vmx_exit_ctls_high) || 9408 vmx->nested.nested_vmx_exit_ctls_high) ||
8651 !vmx_control_verify(vmcs12->vm_entry_controls, 9409 !vmx_control_verify(vmcs12->vm_entry_controls,
8652 nested_vmx_true_entry_ctls_low, 9410 vmx->nested.nested_vmx_true_entry_ctls_low,
8653 nested_vmx_entry_ctls_high)) 9411 vmx->nested.nested_vmx_entry_ctls_high))
8654 { 9412 {
8655 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 9413 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
8656 return 1; 9414 return 1;
@@ -8663,7 +9421,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
8663 return 1; 9421 return 1;
8664 } 9422 }
8665 9423
8666 if (!nested_cr0_valid(vmcs12, vmcs12->guest_cr0) || 9424 if (!nested_cr0_valid(vcpu, vmcs12->guest_cr0) ||
8667 ((vmcs12->guest_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) { 9425 ((vmcs12->guest_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) {
8668 nested_vmx_entry_failure(vcpu, vmcs12, 9426 nested_vmx_entry_failure(vcpu, vmcs12,
8669 EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT); 9427 EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT);
@@ -8739,10 +9497,21 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
8739 9497
8740 vmx_segment_cache_clear(vmx); 9498 vmx_segment_cache_clear(vmx);
8741 9499
8742 vmcs12->launch_state = 1;
8743
8744 prepare_vmcs02(vcpu, vmcs12); 9500 prepare_vmcs02(vcpu, vmcs12);
8745 9501
9502 msr_entry_idx = nested_vmx_load_msr(vcpu,
9503 vmcs12->vm_entry_msr_load_addr,
9504 vmcs12->vm_entry_msr_load_count);
9505 if (msr_entry_idx) {
9506 leave_guest_mode(vcpu);
9507 vmx_load_vmcs01(vcpu);
9508 nested_vmx_entry_failure(vcpu, vmcs12,
9509 EXIT_REASON_MSR_LOAD_FAIL, msr_entry_idx);
9510 return 1;
9511 }
9512
9513 vmcs12->launch_state = 1;
9514
8746 if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) 9515 if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT)
8747 return kvm_emulate_halt(vcpu); 9516 return kvm_emulate_halt(vcpu);
8748 9517
@@ -8869,9 +9638,10 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr)
8869 if (vmx->nested.nested_run_pending) 9638 if (vmx->nested.nested_run_pending)
8870 return -EBUSY; 9639 return -EBUSY;
8871 nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); 9640 nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
9641 return 0;
8872 } 9642 }
8873 9643
8874 return 0; 9644 return vmx_complete_nested_posted_interrupt(vcpu);
8875} 9645}
8876 9646
8877static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) 9647static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
@@ -8981,6 +9751,9 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
8981 vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); 9751 vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
8982 } 9752 }
8983 9753
9754 if (nested_cpu_has_vid(vmcs12))
9755 vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS);
9756
8984 vmcs12->vm_entry_controls = 9757 vmcs12->vm_entry_controls =
8985 (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | 9758 (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
8986 (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); 9759 (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);
@@ -9172,6 +9945,13 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
9172 9945
9173 kvm_set_dr(vcpu, 7, 0x400); 9946 kvm_set_dr(vcpu, 7, 0x400);
9174 vmcs_write64(GUEST_IA32_DEBUGCTL, 0); 9947 vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
9948
9949 if (cpu_has_vmx_msr_bitmap())
9950 vmx_set_msr_bitmap(vcpu);
9951
9952 if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
9953 vmcs12->vm_exit_msr_load_count))
9954 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
9175} 9955}
9176 9956
9177/* 9957/*
@@ -9193,6 +9973,10 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
9193 prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, 9973 prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
9194 exit_qualification); 9974 exit_qualification);
9195 9975
9976 if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr,
9977 vmcs12->vm_exit_msr_store_count))
9978 nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL);
9979
9196 vmx_load_vmcs01(vcpu); 9980 vmx_load_vmcs01(vcpu);
9197 9981
9198 if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT) 9982 if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT)
@@ -9235,6 +10019,12 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
9235 nested_release_page(vmx->nested.virtual_apic_page); 10019 nested_release_page(vmx->nested.virtual_apic_page);
9236 vmx->nested.virtual_apic_page = NULL; 10020 vmx->nested.virtual_apic_page = NULL;
9237 } 10021 }
10022 if (vmx->nested.pi_desc_page) {
10023 kunmap(vmx->nested.pi_desc_page);
10024 nested_release_page(vmx->nested.pi_desc_page);
10025 vmx->nested.pi_desc_page = NULL;
10026 vmx->nested.pi_desc = NULL;
10027 }
9238 10028
9239 /* 10029 /*
9240 * We are now running in L2, mmu_notifier will force to reload the 10030 * We are now running in L2, mmu_notifier will force to reload the
@@ -9301,6 +10091,31 @@ static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
9301 shrink_ple_window(vcpu); 10091 shrink_ple_window(vcpu);
9302} 10092}
9303 10093
10094static void vmx_slot_enable_log_dirty(struct kvm *kvm,
10095 struct kvm_memory_slot *slot)
10096{
10097 kvm_mmu_slot_leaf_clear_dirty(kvm, slot);
10098 kvm_mmu_slot_largepage_remove_write_access(kvm, slot);
10099}
10100
10101static void vmx_slot_disable_log_dirty(struct kvm *kvm,
10102 struct kvm_memory_slot *slot)
10103{
10104 kvm_mmu_slot_set_dirty(kvm, slot);
10105}
10106
10107static void vmx_flush_log_dirty(struct kvm *kvm)
10108{
10109 kvm_flush_pml_buffers(kvm);
10110}
10111
10112static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
10113 struct kvm_memory_slot *memslot,
10114 gfn_t offset, unsigned long mask)
10115{
10116 kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask);
10117}
10118
9304static struct kvm_x86_ops vmx_x86_ops = { 10119static struct kvm_x86_ops vmx_x86_ops = {
9305 .cpu_has_kvm_support = cpu_has_kvm_support, 10120 .cpu_has_kvm_support = cpu_has_kvm_support,
9306 .disabled_by_bios = vmx_disabled_by_bios, 10121 .disabled_by_bios = vmx_disabled_by_bios,
@@ -9409,6 +10224,11 @@ static struct kvm_x86_ops vmx_x86_ops = {
9409 .check_nested_events = vmx_check_nested_events, 10224 .check_nested_events = vmx_check_nested_events,
9410 10225
9411 .sched_in = vmx_sched_in, 10226 .sched_in = vmx_sched_in,
10227
10228 .slot_enable_log_dirty = vmx_slot_enable_log_dirty,
10229 .slot_disable_log_dirty = vmx_slot_disable_log_dirty,
10230 .flush_log_dirty = vmx_flush_log_dirty,
10231 .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked,
9412}; 10232};
9413 10233
9414static int __init vmx_init(void) 10234static int __init vmx_init(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c259814200bd..bd7a70be41b3 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -108,6 +108,10 @@ EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);
108static u32 tsc_tolerance_ppm = 250; 108static u32 tsc_tolerance_ppm = 250;
109module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR); 109module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
110 110
111/* lapic timer advance (tscdeadline mode only) in nanoseconds */
112unsigned int lapic_timer_advance_ns = 0;
113module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR);
114
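Since the parameter is declared S_IRUGO | S_IWUSR, it should also be adjustable at run time through /sys/module/kvm/parameters/lapic_timer_advance_ns; the default of 0 presumably keeps the existing behaviour, with the advance only taking effect for guests using the TSC-deadline timer.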
111static bool backwards_tsc_observed = false; 115static bool backwards_tsc_observed = false;
112 116
113#define KVM_NR_SHARED_MSRS 16 117#define KVM_NR_SHARED_MSRS 16
@@ -141,6 +145,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
141 { "irq_window", VCPU_STAT(irq_window_exits) }, 145 { "irq_window", VCPU_STAT(irq_window_exits) },
142 { "nmi_window", VCPU_STAT(nmi_window_exits) }, 146 { "nmi_window", VCPU_STAT(nmi_window_exits) },
143 { "halt_exits", VCPU_STAT(halt_exits) }, 147 { "halt_exits", VCPU_STAT(halt_exits) },
148 { "halt_successful_poll", VCPU_STAT(halt_successful_poll) },
144 { "halt_wakeup", VCPU_STAT(halt_wakeup) }, 149 { "halt_wakeup", VCPU_STAT(halt_wakeup) },
145 { "hypercalls", VCPU_STAT(hypercalls) }, 150 { "hypercalls", VCPU_STAT(hypercalls) },
146 { "request_irq", VCPU_STAT(request_irq_exits) }, 151 { "request_irq", VCPU_STAT(request_irq_exits) },
@@ -492,7 +497,7 @@ int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
492} 497}
493EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu); 498EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu);
494 499
495int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, 500static int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
496 void *data, int offset, int len, u32 access) 501 void *data, int offset, int len, u32 access)
497{ 502{
498 return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn, 503 return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn,
@@ -643,7 +648,7 @@ static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
643 } 648 }
644} 649}
645 650
646int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) 651static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
647{ 652{
648 u64 xcr0 = xcr; 653 u64 xcr0 = xcr;
649 u64 old_xcr0 = vcpu->arch.xcr0; 654 u64 old_xcr0 = vcpu->arch.xcr0;
@@ -1083,6 +1088,15 @@ static void update_pvclock_gtod(struct timekeeper *tk)
1083} 1088}
1084#endif 1089#endif
1085 1090
1091void kvm_set_pending_timer(struct kvm_vcpu *vcpu)
1092{
1093 /*
1094 * Note: KVM_REQ_PENDING_TIMER is implicitly checked in
1095 * vcpu_enter_guest. This function is only called from
1096 * the physical CPU that is running vcpu.
1097 */
1098 kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
1099}
1086 1100
1087static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) 1101static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
1088{ 1102{
@@ -1180,7 +1194,7 @@ static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0);
1180#endif 1194#endif
1181 1195
1182static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); 1196static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
1183unsigned long max_tsc_khz; 1197static unsigned long max_tsc_khz;
1184 1198
1185static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) 1199static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
1186{ 1200{
@@ -1234,7 +1248,7 @@ static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
1234 return tsc; 1248 return tsc;
1235} 1249}
1236 1250
1237void kvm_track_tsc_matching(struct kvm_vcpu *vcpu) 1251static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
1238{ 1252{
1239#ifdef CONFIG_X86_64 1253#ifdef CONFIG_X86_64
1240 bool vcpus_matched; 1254 bool vcpus_matched;
@@ -1529,7 +1543,8 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
1529 &ka->master_cycle_now); 1543 &ka->master_cycle_now);
1530 1544
1531 ka->use_master_clock = host_tsc_clocksource && vcpus_matched 1545 ka->use_master_clock = host_tsc_clocksource && vcpus_matched
1532 && !backwards_tsc_observed; 1546 && !backwards_tsc_observed
1547 && !ka->boot_vcpu_runs_old_kvmclock;
1533 1548
1534 if (ka->use_master_clock) 1549 if (ka->use_master_clock)
1535 atomic_set(&kvm_guest_has_master_clock, 1); 1550 atomic_set(&kvm_guest_has_master_clock, 1);
@@ -2161,8 +2176,20 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2161 case MSR_KVM_SYSTEM_TIME_NEW: 2176 case MSR_KVM_SYSTEM_TIME_NEW:
2162 case MSR_KVM_SYSTEM_TIME: { 2177 case MSR_KVM_SYSTEM_TIME: {
2163 u64 gpa_offset; 2178 u64 gpa_offset;
2179 struct kvm_arch *ka = &vcpu->kvm->arch;
2180
2164 kvmclock_reset(vcpu); 2181 kvmclock_reset(vcpu);
2165 2182
2183 if (vcpu->vcpu_id == 0 && !msr_info->host_initiated) {
2184 bool tmp = (msr == MSR_KVM_SYSTEM_TIME);
2185
2186 if (ka->boot_vcpu_runs_old_kvmclock != tmp)
2187 set_bit(KVM_REQ_MASTERCLOCK_UPDATE,
2188 &vcpu->requests);
2189
2190 ka->boot_vcpu_runs_old_kvmclock = tmp;
2191 }
2192
2166 vcpu->arch.time = data; 2193 vcpu->arch.time = data;
2167 kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu); 2194 kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
2168 2195
@@ -2324,6 +2351,7 @@ int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
2324{ 2351{
2325 return kvm_x86_ops->get_msr(vcpu, msr_index, pdata); 2352 return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
2326} 2353}
2354EXPORT_SYMBOL_GPL(kvm_get_msr);
2327 2355
2328static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) 2356static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
2329{ 2357{
@@ -2738,6 +2766,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
2738 case KVM_CAP_READONLY_MEM: 2766 case KVM_CAP_READONLY_MEM:
2739 case KVM_CAP_HYPERV_TIME: 2767 case KVM_CAP_HYPERV_TIME:
2740 case KVM_CAP_IOAPIC_POLARITY_IGNORED: 2768 case KVM_CAP_IOAPIC_POLARITY_IGNORED:
2769 case KVM_CAP_TSC_DEADLINE_TIMER:
2741#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT 2770#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
2742 case KVM_CAP_ASSIGN_DEV_IRQ: 2771 case KVM_CAP_ASSIGN_DEV_IRQ:
2743 case KVM_CAP_PCI_2_3: 2772 case KVM_CAP_PCI_2_3:
@@ -2776,9 +2805,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
2776 case KVM_CAP_TSC_CONTROL: 2805 case KVM_CAP_TSC_CONTROL:
2777 r = kvm_has_tsc_control; 2806 r = kvm_has_tsc_control;
2778 break; 2807 break;
2779 case KVM_CAP_TSC_DEADLINE_TIMER:
2780 r = boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER);
2781 break;
2782 default: 2808 default:
2783 r = 0; 2809 r = 0;
2784 break; 2810 break;
@@ -3734,83 +3760,43 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm,
3734 * @kvm: kvm instance 3760 * @kvm: kvm instance
3735 * @log: slot id and address to which we copy the log 3761 * @log: slot id and address to which we copy the log
3736 * 3762 *
3737 * We need to keep it in mind that VCPU threads can write to the bitmap 3763 * Steps 1-4 below provide a general overview of dirty page logging. See
3738 * concurrently. So, to avoid losing data, we keep the following order for 3764 * kvm_get_dirty_log_protect() function description for additional details.
3739 * each bit: 3765 *
 3766 * We call kvm_get_dirty_log_protect() to handle steps 1-3; upon return we
 3767 * always flush the TLB (step 4), even if a previous step failed and the
 3768 * dirty bitmap may be corrupt. Regardless of the outcome, the KVM logging
 3769 * API does not preclude a subsequent dirty log read by user space. Flushing
 3770 * the TLB ensures that writes will be marked dirty for the next log read.
3740 * 3771 *
3741 * 1. Take a snapshot of the bit and clear it if needed. 3772 * 1. Take a snapshot of the bit and clear it if needed.
3742 * 2. Write protect the corresponding page. 3773 * 2. Write protect the corresponding page.
3743 * 3. Flush TLB's if needed. 3774 * 3. Copy the snapshot to the userspace.
3744 * 4. Copy the snapshot to the userspace. 3775 * 4. Flush TLB's if needed.
3745 *
3746 * Between 2 and 3, the guest may write to the page using the remaining TLB
3747 * entry. This is not a problem because the page will be reported dirty at
3748 * step 4 using the snapshot taken before and step 3 ensures that successive
3749 * writes will be logged for the next call.
3750 */ 3776 */
3751int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) 3777int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
3752{ 3778{
3753 int r;
3754 struct kvm_memory_slot *memslot;
3755 unsigned long n, i;
3756 unsigned long *dirty_bitmap;
3757 unsigned long *dirty_bitmap_buffer;
3758 bool is_dirty = false; 3779 bool is_dirty = false;
3780 int r;
3759 3781
3760 mutex_lock(&kvm->slots_lock); 3782 mutex_lock(&kvm->slots_lock);
3761 3783
3762 r = -EINVAL; 3784 /*
3763 if (log->slot >= KVM_USER_MEM_SLOTS) 3785 * Flush potentially hardware-cached dirty pages to dirty_bitmap.
3764 goto out; 3786 */
3765 3787 if (kvm_x86_ops->flush_log_dirty)
3766 memslot = id_to_memslot(kvm->memslots, log->slot); 3788 kvm_x86_ops->flush_log_dirty(kvm);
3767
3768 dirty_bitmap = memslot->dirty_bitmap;
3769 r = -ENOENT;
3770 if (!dirty_bitmap)
3771 goto out;
3772
3773 n = kvm_dirty_bitmap_bytes(memslot);
3774
3775 dirty_bitmap_buffer = dirty_bitmap + n / sizeof(long);
3776 memset(dirty_bitmap_buffer, 0, n);
3777
3778 spin_lock(&kvm->mmu_lock);
3779
3780 for (i = 0; i < n / sizeof(long); i++) {
3781 unsigned long mask;
3782 gfn_t offset;
3783
3784 if (!dirty_bitmap[i])
3785 continue;
3786
3787 is_dirty = true;
3788
3789 mask = xchg(&dirty_bitmap[i], 0);
3790 dirty_bitmap_buffer[i] = mask;
3791
3792 offset = i * BITS_PER_LONG;
3793 kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, mask);
3794 }
3795
3796 spin_unlock(&kvm->mmu_lock);
3797 3789
3798 /* See the comments in kvm_mmu_slot_remove_write_access(). */ 3790 r = kvm_get_dirty_log_protect(kvm, log, &is_dirty);
3799 lockdep_assert_held(&kvm->slots_lock);
3800 3791
3801 /* 3792 /*
3802 * All the TLBs can be flushed out of mmu lock, see the comments in 3793 * All the TLBs can be flushed out of mmu lock, see the comments in
3803 * kvm_mmu_slot_remove_write_access(). 3794 * kvm_mmu_slot_remove_write_access().
3804 */ 3795 */
3796 lockdep_assert_held(&kvm->slots_lock);
3805 if (is_dirty) 3797 if (is_dirty)
3806 kvm_flush_remote_tlbs(kvm); 3798 kvm_flush_remote_tlbs(kvm);
3807 3799
3808 r = -EFAULT;
3809 if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
3810 goto out;
3811
3812 r = 0;
3813out:
3814 mutex_unlock(&kvm->slots_lock); 3800 mutex_unlock(&kvm->slots_lock);
3815 return r; 3801 return r;
3816} 3802}
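For context, the path above is what a VMM exercises when it enables dirty logging on a memory slot and then calls KVM_GET_DIRTY_LOG. A hedged userspace sketch using the standard KVM ioctls (vm_fd is assumed to be an initialized VM descriptor and guest_mem an already-mapped backing region; error handling trimmed):

	#include <linux/kvm.h>
	#include <string.h>
	#include <sys/ioctl.h>

	/* Enable dirty logging on slot 0, then fetch the dirty bitmap once.
	 * With PML (this series) the kernel flushes each vcpu's PML buffer
	 * into dirty_bitmap before the copy; without it, pages are
	 * write-protected and faults mark them dirty. */
	int fetch_dirty_log(int vm_fd, void *guest_mem, size_t mem_size,
			    unsigned long *bitmap, size_t bitmap_bytes)
	{
		struct kvm_userspace_memory_region region = {
			.slot = 0,
			.flags = KVM_MEM_LOG_DIRTY_PAGES,
			.guest_phys_addr = 0,
			.memory_size = mem_size,
			.userspace_addr = (unsigned long)guest_mem,
		};
		struct kvm_dirty_log log = {
			.slot = 0,
			.dirty_bitmap = bitmap,
		};

		if (ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region) < 0)
			return -1;

		memset(bitmap, 0, bitmap_bytes);
		return ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log);
	}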
@@ -4516,6 +4502,8 @@ int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr,
4516 if (rc != X86EMUL_CONTINUE) 4502 if (rc != X86EMUL_CONTINUE)
4517 return rc; 4503 return rc;
4518 addr += now; 4504 addr += now;
4505 if (ctxt->mode != X86EMUL_MODE_PROT64)
4506 addr = (u32)addr;
4519 val += now; 4507 val += now;
4520 bytes -= now; 4508 bytes -= now;
4521 } 4509 }
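The (u32) truncation makes split accesses wrap at 4 GiB when the vcpu is not in 64-bit mode, instead of letting the second half of the access run past the 32-bit address space. A tiny illustration with a made-up address:

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint64_t addr = 0xfffffffeULL;	/* hypothetical linear address */
		unsigned int now = 2;		/* bytes handled in the first chunk */

		addr += now;			/* 0x100000000 ... */
		addr = (uint32_t)addr;		/* ... wraps to 0 outside 64-bit mode */

		printf("next chunk starts at 0x%llx\n", (unsigned long long)addr);
		return 0;
	}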
@@ -4984,6 +4972,11 @@ static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg, ulon
4984 kvm_register_write(emul_to_vcpu(ctxt), reg, val); 4972 kvm_register_write(emul_to_vcpu(ctxt), reg, val);
4985} 4973}
4986 4974
4975static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked)
4976{
4977 kvm_x86_ops->set_nmi_mask(emul_to_vcpu(ctxt), masked);
4978}
4979
4987static const struct x86_emulate_ops emulate_ops = { 4980static const struct x86_emulate_ops emulate_ops = {
4988 .read_gpr = emulator_read_gpr, 4981 .read_gpr = emulator_read_gpr,
4989 .write_gpr = emulator_write_gpr, 4982 .write_gpr = emulator_write_gpr,
@@ -5019,6 +5012,7 @@ static const struct x86_emulate_ops emulate_ops = {
5019 .put_fpu = emulator_put_fpu, 5012 .put_fpu = emulator_put_fpu,
5020 .intercept = emulator_intercept, 5013 .intercept = emulator_intercept,
5021 .get_cpuid = emulator_get_cpuid, 5014 .get_cpuid = emulator_get_cpuid,
5015 .set_nmi_mask = emulator_set_nmi_mask,
5022}; 5016};
5023 5017
5024static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) 5018static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
@@ -6311,6 +6305,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
6311 } 6305 }
6312 6306
6313 trace_kvm_entry(vcpu->vcpu_id); 6307 trace_kvm_entry(vcpu->vcpu_id);
6308 wait_lapic_expire(vcpu);
6314 kvm_x86_ops->run(vcpu); 6309 kvm_x86_ops->run(vcpu);
6315 6310
6316 /* 6311 /*
@@ -7041,15 +7036,13 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
7041 return r; 7036 return r;
7042} 7037}
7043 7038
7044int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) 7039void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
7045{ 7040{
7046 int r;
7047 struct msr_data msr; 7041 struct msr_data msr;
7048 struct kvm *kvm = vcpu->kvm; 7042 struct kvm *kvm = vcpu->kvm;
7049 7043
7050 r = vcpu_load(vcpu); 7044 if (vcpu_load(vcpu))
7051 if (r) 7045 return;
7052 return r;
7053 msr.data = 0x0; 7046 msr.data = 0x0;
7054 msr.index = MSR_IA32_TSC; 7047 msr.index = MSR_IA32_TSC;
7055 msr.host_initiated = true; 7048 msr.host_initiated = true;
@@ -7058,8 +7051,6 @@ int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
7058 7051
7059 schedule_delayed_work(&kvm->arch.kvmclock_sync_work, 7052 schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
7060 KVMCLOCK_SYNC_PERIOD); 7053 KVMCLOCK_SYNC_PERIOD);
7061
7062 return r;
7063} 7054}
7064 7055
7065void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) 7056void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
@@ -7549,12 +7540,62 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
7549 return 0; 7540 return 0;
7550} 7541}
7551 7542
7543static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
7544 struct kvm_memory_slot *new)
7545{
7546 /* Still write protect RO slot */
7547 if (new->flags & KVM_MEM_READONLY) {
7548 kvm_mmu_slot_remove_write_access(kvm, new);
7549 return;
7550 }
7551
7552 /*
7553 * Call kvm_x86_ops dirty logging hooks when they are valid.
7554 *
7555 * kvm_x86_ops->slot_disable_log_dirty is called when:
7556 *
 7557	 *  - KVM_MR_CREATE with dirty logging disabled
 7558	 *  - KVM_MR_FLAGS_ONLY with dirty logging disabled in the new flags
7559 *
 7560	 * The reason is that, with PML, we need to set the D-bit for any slot
 7561	 * with dirty logging disabled in order to eliminate unnecessary GPA
 7562	 * logging in the PML buffer (and potential PML-buffer-full VMEXITs).
 7563	 * This guarantees that leaving PML enabled for the guest's lifetime
 7564	 * adds no additional overhead while the guest is running with dirty
 7565	 * logging disabled for its memory slots.
7566 *
 7567	 * kvm_x86_ops->slot_enable_log_dirty is called when switching a new
 7568	 * slot into dirty logging mode.
 7569	 *
 7570	 * If the kvm_x86_ops dirty logging hooks are not set, fall back to write protection.
7571 *
7572 * In case of write protect:
7573 *
7574 * Write protect all pages for dirty logging.
7575 *
7576 * All the sptes including the large sptes which point to this
7577 * slot are set to readonly. We can not create any new large
7578 * spte on this slot until the end of the logging.
7579 *
7580 * See the comments in fast_page_fault().
7581 */
7582 if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
7583 if (kvm_x86_ops->slot_enable_log_dirty)
7584 kvm_x86_ops->slot_enable_log_dirty(kvm, new);
7585 else
7586 kvm_mmu_slot_remove_write_access(kvm, new);
7587 } else {
7588 if (kvm_x86_ops->slot_disable_log_dirty)
7589 kvm_x86_ops->slot_disable_log_dirty(kvm, new);
7590 }
7591}
7592
7552void kvm_arch_commit_memory_region(struct kvm *kvm, 7593void kvm_arch_commit_memory_region(struct kvm *kvm,
7553 struct kvm_userspace_memory_region *mem, 7594 struct kvm_userspace_memory_region *mem,
7554 const struct kvm_memory_slot *old, 7595 const struct kvm_memory_slot *old,
7555 enum kvm_mr_change change) 7596 enum kvm_mr_change change)
7556{ 7597{
7557 7598 struct kvm_memory_slot *new;
7558 int nr_mmu_pages = 0; 7599 int nr_mmu_pages = 0;
7559 7600
7560 if ((mem->slot >= KVM_USER_MEM_SLOTS) && (change == KVM_MR_DELETE)) { 7601 if ((mem->slot >= KVM_USER_MEM_SLOTS) && (change == KVM_MR_DELETE)) {
@@ -7573,17 +7614,20 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
7573 7614
7574 if (nr_mmu_pages) 7615 if (nr_mmu_pages)
7575 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); 7616 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
7617
7618 /* It's OK to get 'new' slot here as it has already been installed */
7619 new = id_to_memslot(kvm->memslots, mem->slot);
7620
7576 /* 7621 /*
7577 * Write protect all pages for dirty logging. 7622 * Set up write protection and/or dirty logging for the new slot.
7578 * 7623 *
7579 * All the sptes including the large sptes which point to this 7624 * For KVM_MR_DELETE and KVM_MR_MOVE, the shadow pages of old slot have
7580 * slot are set to readonly. We can not create any new large 7625 * been zapped so no dirty logging is needed for the old slot. For
7581 * spte on this slot until the end of the logging. 7626 * KVM_MR_FLAGS_ONLY, the old slot is essentially the same one as the
7582 * 7627 * new and it's also covered when dealing with the new slot.
7583 * See the comments in fast_page_fault().
7584 */ 7628 */
7585 if ((change != KVM_MR_DELETE) && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES)) 7629 if (change != KVM_MR_DELETE)
7586 kvm_mmu_slot_remove_write_access(kvm, mem->slot); 7630 kvm_mmu_slot_apply_flags(kvm, new);
7587} 7631}
7588 7632
7589void kvm_arch_flush_shadow_all(struct kvm *kvm) 7633void kvm_arch_flush_shadow_all(struct kvm *kvm)
@@ -7837,3 +7881,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
7837EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts); 7881EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
7838EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset); 7882EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset);
7839EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window); 7883EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window);
7884EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index cc1d61af6140..f5fef1868096 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -147,6 +147,7 @@ static inline void kvm_register_writel(struct kvm_vcpu *vcpu,
147 147
148void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); 148void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
149void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); 149void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
150void kvm_set_pending_timer(struct kvm_vcpu *vcpu);
150int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); 151int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
151 152
152void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr); 153void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr);
@@ -170,5 +171,7 @@ extern u64 kvm_supported_xcr0(void);
170 171
171extern unsigned int min_timer_period_us; 172extern unsigned int min_timer_period_us;
172 173
174extern unsigned int lapic_timer_advance_ns;
175
173extern struct static_key kvm_no_apic_vcpu; 176extern struct static_key kvm_no_apic_vcpu;
174#endif 177#endif