diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2015-02-13 12:55:09 -0500 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2015-02-13 12:55:09 -0500 |
commit | b9085bcbf5f43adf60533f9b635b2e7faeed0fe9 (patch) | |
tree | e397abf5682a45c096e75b3d0fa99c8e228425fc /arch/x86/kvm | |
parent | c7d7b98671552abade78834c522b7308bda73c0d (diff) | |
parent | 6557bada461afeaa920a189fae2cff7c8fdce39f (diff) |
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull KVM update from Paolo Bonzini:
"Fairly small update, but there are some interesting new features.
Common:
Optional support for adding a small amount of polling on each HLT
instruction executed in the guest (or equivalent for other
architectures). This can improve latency by up to 50% in some
scenarios (e.g. O_DSYNC writes or TCP_RR netperf tests). This
also has to be enabled manually for now, but the plan is to
auto-tune this in the future.
ARM/ARM64:
The highlights are support for GICv3 emulation and dirty page
tracking.
s390:
Several optimizations and bugfixes. Also a first: a feature
exposed by KVM (UUID and long guest name in /proc/sysinfo) before
it is available in IBM's hypervisor! :)
MIPS:
Bugfixes.
x86:
Support for PML (page modification logging, a new feature in
Broadwell Xeons that speeds up dirty page tracking), nested
virtualization improvements (nested APICv---a nice optimization),
and the usual round of emulation fixes.
There is also a new option to reduce latency of the TSC deadline
timer in the guest; this needs to be tuned manually.
Some commits are common between this pull and Catalin's; I see you
have already included his tree.
Powerpc:
Nothing yet.
The KVM/PPC changes will come in through the PPC maintainers,
because I haven't received them yet and I might end up being
offline for some part of next week"
* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (130 commits)
KVM: ia64: drop kvm.h from installed user headers
KVM: x86: fix build with !CONFIG_SMP
KVM: x86: emulate: correct page fault error code for NoWrite instructions
KVM: Disable compat ioctl for s390
KVM: s390: add cpu model support
KVM: s390: use facilities and cpu_id per KVM
KVM: s390/CPACF: Choose crypto control block format
s390/kernel: Update /proc/sysinfo file with Extended Name and UUID
KVM: s390: reenable LPP facility
KVM: s390: floating irqs: fix user triggerable endless loop
kvm: add halt_poll_ns module parameter
kvm: remove KVM_MMIO_SIZE
KVM: MIPS: Don't leak FPU/DSP to guest
KVM: MIPS: Disable HTW while in guest
KVM: nVMX: Enable nested posted interrupt processing
KVM: nVMX: Enable nested virtual interrupt delivery
KVM: nVMX: Enable nested apic register virtualization
KVM: nVMX: Make nested control MSRs per-cpu
KVM: nVMX: Enable nested virtualize x2apic mode
KVM: nVMX: Prepare for using hardware MSR bitmap
...
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r-- | arch/x86/kvm/Kconfig | 1 | ||||
-rw-r--r-- | arch/x86/kvm/emulate.c | 230 | ||||
-rw-r--r-- | arch/x86/kvm/ioapic.h | 2 | ||||
-rw-r--r-- | arch/x86/kvm/iommu.c | 4 | ||||
-rw-r--r-- | arch/x86/kvm/lapic.c | 147 | ||||
-rw-r--r-- | arch/x86/kvm/lapic.h | 6 | ||||
-rw-r--r-- | arch/x86/kvm/mmu.c | 351 | ||||
-rw-r--r-- | arch/x86/kvm/mmu.h | 17 | ||||
-rw-r--r-- | arch/x86/kvm/svm.c | 4 | ||||
-rw-r--r-- | arch/x86/kvm/trace.h | 38 | ||||
-rw-r--r-- | arch/x86/kvm/vmx.c | 1086 | ||||
-rw-r--r-- | arch/x86/kvm/x86.c | 209 | ||||
-rw-r--r-- | arch/x86/kvm/x86.h | 3 |
13 files changed, 1673 insertions, 425 deletions
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 7dc7ba577ecd..413a7bf9efbb 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig | |||
@@ -39,6 +39,7 @@ config KVM | |||
39 | select PERF_EVENTS | 39 | select PERF_EVENTS |
40 | select HAVE_KVM_MSI | 40 | select HAVE_KVM_MSI |
41 | select HAVE_KVM_CPU_RELAX_INTERCEPT | 41 | select HAVE_KVM_CPU_RELAX_INTERCEPT |
42 | select KVM_GENERIC_DIRTYLOG_READ_PROTECT | ||
42 | select KVM_VFIO | 43 | select KVM_VFIO |
43 | select SRCU | 44 | select SRCU |
44 | ---help--- | 45 | ---help--- |
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index de12c1d379f1..e0b794a84c35 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c | |||
@@ -86,6 +86,7 @@ | |||
86 | #define DstAcc (OpAcc << DstShift) | 86 | #define DstAcc (OpAcc << DstShift) |
87 | #define DstDI (OpDI << DstShift) | 87 | #define DstDI (OpDI << DstShift) |
88 | #define DstMem64 (OpMem64 << DstShift) | 88 | #define DstMem64 (OpMem64 << DstShift) |
89 | #define DstMem16 (OpMem16 << DstShift) | ||
89 | #define DstImmUByte (OpImmUByte << DstShift) | 90 | #define DstImmUByte (OpImmUByte << DstShift) |
90 | #define DstDX (OpDX << DstShift) | 91 | #define DstDX (OpDX << DstShift) |
91 | #define DstAccLo (OpAccLo << DstShift) | 92 | #define DstAccLo (OpAccLo << DstShift) |
@@ -124,6 +125,7 @@ | |||
124 | #define RMExt (4<<15) /* Opcode extension in ModRM r/m if mod == 3 */ | 125 | #define RMExt (4<<15) /* Opcode extension in ModRM r/m if mod == 3 */ |
125 | #define Escape (5<<15) /* Escape to coprocessor instruction */ | 126 | #define Escape (5<<15) /* Escape to coprocessor instruction */ |
126 | #define InstrDual (6<<15) /* Alternate instruction decoding of mod == 3 */ | 127 | #define InstrDual (6<<15) /* Alternate instruction decoding of mod == 3 */ |
128 | #define ModeDual (7<<15) /* Different instruction for 32/64 bit */ | ||
127 | #define Sse (1<<18) /* SSE Vector instruction */ | 129 | #define Sse (1<<18) /* SSE Vector instruction */ |
128 | /* Generic ModRM decode. */ | 130 | /* Generic ModRM decode. */ |
129 | #define ModRM (1<<19) | 131 | #define ModRM (1<<19) |
@@ -165,10 +167,10 @@ | |||
165 | #define NoMod ((u64)1 << 47) /* Mod field is ignored */ | 167 | #define NoMod ((u64)1 << 47) /* Mod field is ignored */ |
166 | #define Intercept ((u64)1 << 48) /* Has valid intercept field */ | 168 | #define Intercept ((u64)1 << 48) /* Has valid intercept field */ |
167 | #define CheckPerm ((u64)1 << 49) /* Has valid check_perm field */ | 169 | #define CheckPerm ((u64)1 << 49) /* Has valid check_perm field */ |
168 | #define NoBigReal ((u64)1 << 50) /* No big real mode */ | ||
169 | #define PrivUD ((u64)1 << 51) /* #UD instead of #GP on CPL > 0 */ | 170 | #define PrivUD ((u64)1 << 51) /* #UD instead of #GP on CPL > 0 */ |
170 | #define NearBranch ((u64)1 << 52) /* Near branches */ | 171 | #define NearBranch ((u64)1 << 52) /* Near branches */ |
171 | #define No16 ((u64)1 << 53) /* No 16 bit operand */ | 172 | #define No16 ((u64)1 << 53) /* No 16 bit operand */ |
173 | #define IncSP ((u64)1 << 54) /* SP is incremented before ModRM calc */ | ||
172 | 174 | ||
173 | #define DstXacc (DstAccLo | SrcAccHi | SrcWrite) | 175 | #define DstXacc (DstAccLo | SrcAccHi | SrcWrite) |
174 | 176 | ||
@@ -213,6 +215,7 @@ struct opcode { | |||
213 | const struct gprefix *gprefix; | 215 | const struct gprefix *gprefix; |
214 | const struct escape *esc; | 216 | const struct escape *esc; |
215 | const struct instr_dual *idual; | 217 | const struct instr_dual *idual; |
218 | const struct mode_dual *mdual; | ||
216 | void (*fastop)(struct fastop *fake); | 219 | void (*fastop)(struct fastop *fake); |
217 | } u; | 220 | } u; |
218 | int (*check_perm)(struct x86_emulate_ctxt *ctxt); | 221 | int (*check_perm)(struct x86_emulate_ctxt *ctxt); |
@@ -240,6 +243,11 @@ struct instr_dual { | |||
240 | struct opcode mod3; | 243 | struct opcode mod3; |
241 | }; | 244 | }; |
242 | 245 | ||
246 | struct mode_dual { | ||
247 | struct opcode mode32; | ||
248 | struct opcode mode64; | ||
249 | }; | ||
250 | |||
243 | /* EFLAGS bit definitions. */ | 251 | /* EFLAGS bit definitions. */ |
244 | #define EFLG_ID (1<<21) | 252 | #define EFLG_ID (1<<21) |
245 | #define EFLG_VIP (1<<20) | 253 | #define EFLG_VIP (1<<20) |
@@ -262,6 +270,13 @@ struct instr_dual { | |||
262 | #define EFLG_RESERVED_ZEROS_MASK 0xffc0802a | 270 | #define EFLG_RESERVED_ZEROS_MASK 0xffc0802a |
263 | #define EFLG_RESERVED_ONE_MASK 2 | 271 | #define EFLG_RESERVED_ONE_MASK 2 |
264 | 272 | ||
273 | enum x86_transfer_type { | ||
274 | X86_TRANSFER_NONE, | ||
275 | X86_TRANSFER_CALL_JMP, | ||
276 | X86_TRANSFER_RET, | ||
277 | X86_TRANSFER_TASK_SWITCH, | ||
278 | }; | ||
279 | |||
265 | static ulong reg_read(struct x86_emulate_ctxt *ctxt, unsigned nr) | 280 | static ulong reg_read(struct x86_emulate_ctxt *ctxt, unsigned nr) |
266 | { | 281 | { |
267 | if (!(ctxt->regs_valid & (1 << nr))) { | 282 | if (!(ctxt->regs_valid & (1 << nr))) { |
@@ -669,9 +684,13 @@ static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt, | |||
669 | } | 684 | } |
670 | if (addr.ea > lim) | 685 | if (addr.ea > lim) |
671 | goto bad; | 686 | goto bad; |
672 | *max_size = min_t(u64, ~0u, (u64)lim + 1 - addr.ea); | 687 | if (lim == 0xffffffff) |
673 | if (size > *max_size) | 688 | *max_size = ~0u; |
674 | goto bad; | 689 | else { |
690 | *max_size = (u64)lim + 1 - addr.ea; | ||
691 | if (size > *max_size) | ||
692 | goto bad; | ||
693 | } | ||
675 | la &= (u32)-1; | 694 | la &= (u32)-1; |
676 | break; | 695 | break; |
677 | } | 696 | } |
@@ -722,19 +741,26 @@ static int assign_eip_far(struct x86_emulate_ctxt *ctxt, ulong dst, | |||
722 | const struct desc_struct *cs_desc) | 741 | const struct desc_struct *cs_desc) |
723 | { | 742 | { |
724 | enum x86emul_mode mode = ctxt->mode; | 743 | enum x86emul_mode mode = ctxt->mode; |
744 | int rc; | ||
725 | 745 | ||
726 | #ifdef CONFIG_X86_64 | 746 | #ifdef CONFIG_X86_64 |
727 | if (ctxt->mode >= X86EMUL_MODE_PROT32 && cs_desc->l) { | 747 | if (ctxt->mode >= X86EMUL_MODE_PROT16) { |
728 | u64 efer = 0; | 748 | if (cs_desc->l) { |
749 | u64 efer = 0; | ||
729 | 750 | ||
730 | ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); | 751 | ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); |
731 | if (efer & EFER_LMA) | 752 | if (efer & EFER_LMA) |
732 | mode = X86EMUL_MODE_PROT64; | 753 | mode = X86EMUL_MODE_PROT64; |
754 | } else | ||
755 | mode = X86EMUL_MODE_PROT32; /* temporary value */ | ||
733 | } | 756 | } |
734 | #endif | 757 | #endif |
735 | if (mode == X86EMUL_MODE_PROT16 || mode == X86EMUL_MODE_PROT32) | 758 | if (mode == X86EMUL_MODE_PROT16 || mode == X86EMUL_MODE_PROT32) |
736 | mode = cs_desc->d ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; | 759 | mode = cs_desc->d ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; |
737 | return assign_eip(ctxt, dst, mode); | 760 | rc = assign_eip(ctxt, dst, mode); |
761 | if (rc == X86EMUL_CONTINUE) | ||
762 | ctxt->mode = mode; | ||
763 | return rc; | ||
738 | } | 764 | } |
739 | 765 | ||
740 | static inline int jmp_rel(struct x86_emulate_ctxt *ctxt, int rel) | 766 | static inline int jmp_rel(struct x86_emulate_ctxt *ctxt, int rel) |
@@ -1057,8 +1083,6 @@ static int em_fnstcw(struct x86_emulate_ctxt *ctxt) | |||
1057 | asm volatile("fnstcw %0": "+m"(fcw)); | 1083 | asm volatile("fnstcw %0": "+m"(fcw)); |
1058 | ctxt->ops->put_fpu(ctxt); | 1084 | ctxt->ops->put_fpu(ctxt); |
1059 | 1085 | ||
1060 | /* force 2 byte destination */ | ||
1061 | ctxt->dst.bytes = 2; | ||
1062 | ctxt->dst.val = fcw; | 1086 | ctxt->dst.val = fcw; |
1063 | 1087 | ||
1064 | return X86EMUL_CONTINUE; | 1088 | return X86EMUL_CONTINUE; |
@@ -1075,8 +1099,6 @@ static int em_fnstsw(struct x86_emulate_ctxt *ctxt) | |||
1075 | asm volatile("fnstsw %0": "+m"(fsw)); | 1099 | asm volatile("fnstsw %0": "+m"(fsw)); |
1076 | ctxt->ops->put_fpu(ctxt); | 1100 | ctxt->ops->put_fpu(ctxt); |
1077 | 1101 | ||
1078 | /* force 2 byte destination */ | ||
1079 | ctxt->dst.bytes = 2; | ||
1080 | ctxt->dst.val = fsw; | 1102 | ctxt->dst.val = fsw; |
1081 | 1103 | ||
1082 | return X86EMUL_CONTINUE; | 1104 | return X86EMUL_CONTINUE; |
@@ -1223,6 +1245,10 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, | |||
1223 | else { | 1245 | else { |
1224 | modrm_ea += reg_read(ctxt, base_reg); | 1246 | modrm_ea += reg_read(ctxt, base_reg); |
1225 | adjust_modrm_seg(ctxt, base_reg); | 1247 | adjust_modrm_seg(ctxt, base_reg); |
1248 | /* Increment ESP on POP [ESP] */ | ||
1249 | if ((ctxt->d & IncSP) && | ||
1250 | base_reg == VCPU_REGS_RSP) | ||
1251 | modrm_ea += ctxt->op_bytes; | ||
1226 | } | 1252 | } |
1227 | if (index_reg != 4) | 1253 | if (index_reg != 4) |
1228 | modrm_ea += reg_read(ctxt, index_reg) << scale; | 1254 | modrm_ea += reg_read(ctxt, index_reg) << scale; |
@@ -1435,10 +1461,8 @@ static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt, | |||
1435 | ops->get_gdt(ctxt, dt); | 1461 | ops->get_gdt(ctxt, dt); |
1436 | } | 1462 | } |
1437 | 1463 | ||
1438 | /* allowed just for 8 bytes segments */ | 1464 | static int get_descriptor_ptr(struct x86_emulate_ctxt *ctxt, |
1439 | static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, | 1465 | u16 selector, ulong *desc_addr_p) |
1440 | u16 selector, struct desc_struct *desc, | ||
1441 | ulong *desc_addr_p) | ||
1442 | { | 1466 | { |
1443 | struct desc_ptr dt; | 1467 | struct desc_ptr dt; |
1444 | u16 index = selector >> 3; | 1468 | u16 index = selector >> 3; |
@@ -1449,8 +1473,34 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, | |||
1449 | if (dt.size < index * 8 + 7) | 1473 | if (dt.size < index * 8 + 7) |
1450 | return emulate_gp(ctxt, selector & 0xfffc); | 1474 | return emulate_gp(ctxt, selector & 0xfffc); |
1451 | 1475 | ||
1452 | *desc_addr_p = addr = dt.address + index * 8; | 1476 | addr = dt.address + index * 8; |
1453 | return ctxt->ops->read_std(ctxt, addr, desc, sizeof *desc, | 1477 | |
1478 | #ifdef CONFIG_X86_64 | ||
1479 | if (addr >> 32 != 0) { | ||
1480 | u64 efer = 0; | ||
1481 | |||
1482 | ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); | ||
1483 | if (!(efer & EFER_LMA)) | ||
1484 | addr &= (u32)-1; | ||
1485 | } | ||
1486 | #endif | ||
1487 | |||
1488 | *desc_addr_p = addr; | ||
1489 | return X86EMUL_CONTINUE; | ||
1490 | } | ||
1491 | |||
1492 | /* allowed just for 8 bytes segments */ | ||
1493 | static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, | ||
1494 | u16 selector, struct desc_struct *desc, | ||
1495 | ulong *desc_addr_p) | ||
1496 | { | ||
1497 | int rc; | ||
1498 | |||
1499 | rc = get_descriptor_ptr(ctxt, selector, desc_addr_p); | ||
1500 | if (rc != X86EMUL_CONTINUE) | ||
1501 | return rc; | ||
1502 | |||
1503 | return ctxt->ops->read_std(ctxt, *desc_addr_p, desc, sizeof(*desc), | ||
1454 | &ctxt->exception); | 1504 | &ctxt->exception); |
1455 | } | 1505 | } |
1456 | 1506 | ||
@@ -1458,16 +1508,13 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, | |||
1458 | static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt, | 1508 | static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt, |
1459 | u16 selector, struct desc_struct *desc) | 1509 | u16 selector, struct desc_struct *desc) |
1460 | { | 1510 | { |
1461 | struct desc_ptr dt; | 1511 | int rc; |
1462 | u16 index = selector >> 3; | ||
1463 | ulong addr; | 1512 | ulong addr; |
1464 | 1513 | ||
1465 | get_descriptor_table_ptr(ctxt, selector, &dt); | 1514 | rc = get_descriptor_ptr(ctxt, selector, &addr); |
1466 | 1515 | if (rc != X86EMUL_CONTINUE) | |
1467 | if (dt.size < index * 8 + 7) | 1516 | return rc; |
1468 | return emulate_gp(ctxt, selector & 0xfffc); | ||
1469 | 1517 | ||
1470 | addr = dt.address + index * 8; | ||
1471 | return ctxt->ops->write_std(ctxt, addr, desc, sizeof *desc, | 1518 | return ctxt->ops->write_std(ctxt, addr, desc, sizeof *desc, |
1472 | &ctxt->exception); | 1519 | &ctxt->exception); |
1473 | } | 1520 | } |
@@ -1475,7 +1522,7 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt, | |||
1475 | /* Does not support long mode */ | 1522 | /* Does not support long mode */ |
1476 | static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, | 1523 | static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, |
1477 | u16 selector, int seg, u8 cpl, | 1524 | u16 selector, int seg, u8 cpl, |
1478 | bool in_task_switch, | 1525 | enum x86_transfer_type transfer, |
1479 | struct desc_struct *desc) | 1526 | struct desc_struct *desc) |
1480 | { | 1527 | { |
1481 | struct desc_struct seg_desc, old_desc; | 1528 | struct desc_struct seg_desc, old_desc; |
@@ -1529,11 +1576,15 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, | |||
1529 | return ret; | 1576 | return ret; |
1530 | 1577 | ||
1531 | err_code = selector & 0xfffc; | 1578 | err_code = selector & 0xfffc; |
1532 | err_vec = in_task_switch ? TS_VECTOR : GP_VECTOR; | 1579 | err_vec = (transfer == X86_TRANSFER_TASK_SWITCH) ? TS_VECTOR : |
1580 | GP_VECTOR; | ||
1533 | 1581 | ||
1534 | /* can't load system descriptor into segment selector */ | 1582 | /* can't load system descriptor into segment selector */ |
1535 | if (seg <= VCPU_SREG_GS && !seg_desc.s) | 1583 | if (seg <= VCPU_SREG_GS && !seg_desc.s) { |
1584 | if (transfer == X86_TRANSFER_CALL_JMP) | ||
1585 | return X86EMUL_UNHANDLEABLE; | ||
1536 | goto exception; | 1586 | goto exception; |
1587 | } | ||
1537 | 1588 | ||
1538 | if (!seg_desc.p) { | 1589 | if (!seg_desc.p) { |
1539 | err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR; | 1590 | err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR; |
@@ -1605,10 +1656,13 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, | |||
1605 | 1656 | ||
1606 | if (seg_desc.s) { | 1657 | if (seg_desc.s) { |
1607 | /* mark segment as accessed */ | 1658 | /* mark segment as accessed */ |
1608 | seg_desc.type |= 1; | 1659 | if (!(seg_desc.type & 1)) { |
1609 | ret = write_segment_descriptor(ctxt, selector, &seg_desc); | 1660 | seg_desc.type |= 1; |
1610 | if (ret != X86EMUL_CONTINUE) | 1661 | ret = write_segment_descriptor(ctxt, selector, |
1611 | return ret; | 1662 | &seg_desc); |
1663 | if (ret != X86EMUL_CONTINUE) | ||
1664 | return ret; | ||
1665 | } | ||
1612 | } else if (ctxt->mode == X86EMUL_MODE_PROT64) { | 1666 | } else if (ctxt->mode == X86EMUL_MODE_PROT64) { |
1613 | ret = ctxt->ops->read_std(ctxt, desc_addr+8, &base3, | 1667 | ret = ctxt->ops->read_std(ctxt, desc_addr+8, &base3, |
1614 | sizeof(base3), &ctxt->exception); | 1668 | sizeof(base3), &ctxt->exception); |
@@ -1631,7 +1685,8 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, | |||
1631 | u16 selector, int seg) | 1685 | u16 selector, int seg) |
1632 | { | 1686 | { |
1633 | u8 cpl = ctxt->ops->cpl(ctxt); | 1687 | u8 cpl = ctxt->ops->cpl(ctxt); |
1634 | return __load_segment_descriptor(ctxt, selector, seg, cpl, false, NULL); | 1688 | return __load_segment_descriptor(ctxt, selector, seg, cpl, |
1689 | X86_TRANSFER_NONE, NULL); | ||
1635 | } | 1690 | } |
1636 | 1691 | ||
1637 | static void write_register_operand(struct operand *op) | 1692 | static void write_register_operand(struct operand *op) |
@@ -1828,12 +1883,14 @@ static int em_pop_sreg(struct x86_emulate_ctxt *ctxt) | |||
1828 | unsigned long selector; | 1883 | unsigned long selector; |
1829 | int rc; | 1884 | int rc; |
1830 | 1885 | ||
1831 | rc = emulate_pop(ctxt, &selector, ctxt->op_bytes); | 1886 | rc = emulate_pop(ctxt, &selector, 2); |
1832 | if (rc != X86EMUL_CONTINUE) | 1887 | if (rc != X86EMUL_CONTINUE) |
1833 | return rc; | 1888 | return rc; |
1834 | 1889 | ||
1835 | if (ctxt->modrm_reg == VCPU_SREG_SS) | 1890 | if (ctxt->modrm_reg == VCPU_SREG_SS) |
1836 | ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS; | 1891 | ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS; |
1892 | if (ctxt->op_bytes > 2) | ||
1893 | rsp_increment(ctxt, ctxt->op_bytes - 2); | ||
1837 | 1894 | ||
1838 | rc = load_segment_descriptor(ctxt, (u16)selector, seg); | 1895 | rc = load_segment_descriptor(ctxt, (u16)selector, seg); |
1839 | return rc; | 1896 | return rc; |
@@ -2007,6 +2064,7 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt) | |||
2007 | 2064 | ||
2008 | ctxt->eflags &= ~EFLG_RESERVED_ZEROS_MASK; /* Clear reserved zeros */ | 2065 | ctxt->eflags &= ~EFLG_RESERVED_ZEROS_MASK; /* Clear reserved zeros */ |
2009 | ctxt->eflags |= EFLG_RESERVED_ONE_MASK; | 2066 | ctxt->eflags |= EFLG_RESERVED_ONE_MASK; |
2067 | ctxt->ops->set_nmi_mask(ctxt, false); | ||
2010 | 2068 | ||
2011 | return rc; | 2069 | return rc; |
2012 | } | 2070 | } |
@@ -2041,7 +2099,8 @@ static int em_jmp_far(struct x86_emulate_ctxt *ctxt) | |||
2041 | 2099 | ||
2042 | memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2); | 2100 | memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2); |
2043 | 2101 | ||
2044 | rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl, false, | 2102 | rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl, |
2103 | X86_TRANSFER_CALL_JMP, | ||
2045 | &new_desc); | 2104 | &new_desc); |
2046 | if (rc != X86EMUL_CONTINUE) | 2105 | if (rc != X86EMUL_CONTINUE) |
2047 | return rc; | 2106 | return rc; |
@@ -2130,7 +2189,8 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt) | |||
2130 | /* Outer-privilege level return is not implemented */ | 2189 | /* Outer-privilege level return is not implemented */ |
2131 | if (ctxt->mode >= X86EMUL_MODE_PROT16 && (cs & 3) > cpl) | 2190 | if (ctxt->mode >= X86EMUL_MODE_PROT16 && (cs & 3) > cpl) |
2132 | return X86EMUL_UNHANDLEABLE; | 2191 | return X86EMUL_UNHANDLEABLE; |
2133 | rc = __load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS, cpl, false, | 2192 | rc = __load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS, cpl, |
2193 | X86_TRANSFER_RET, | ||
2134 | &new_desc); | 2194 | &new_desc); |
2135 | if (rc != X86EMUL_CONTINUE) | 2195 | if (rc != X86EMUL_CONTINUE) |
2136 | return rc; | 2196 | return rc; |
@@ -2163,12 +2223,15 @@ static int em_cmpxchg(struct x86_emulate_ctxt *ctxt) | |||
2163 | fastop(ctxt, em_cmp); | 2223 | fastop(ctxt, em_cmp); |
2164 | 2224 | ||
2165 | if (ctxt->eflags & EFLG_ZF) { | 2225 | if (ctxt->eflags & EFLG_ZF) { |
2166 | /* Success: write back to memory. */ | 2226 | /* Success: write back to memory; no update of EAX */ |
2227 | ctxt->src.type = OP_NONE; | ||
2167 | ctxt->dst.val = ctxt->src.orig_val; | 2228 | ctxt->dst.val = ctxt->src.orig_val; |
2168 | } else { | 2229 | } else { |
2169 | /* Failure: write the value we saw to EAX. */ | 2230 | /* Failure: write the value we saw to EAX. */ |
2170 | ctxt->dst.type = OP_REG; | 2231 | ctxt->src.type = OP_REG; |
2171 | ctxt->dst.addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX); | 2232 | ctxt->src.addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX); |
2233 | ctxt->src.val = ctxt->dst.orig_val; | ||
2234 | /* Create write-cycle to dest by writing the same value */ | ||
2172 | ctxt->dst.val = ctxt->dst.orig_val; | 2235 | ctxt->dst.val = ctxt->dst.orig_val; |
2173 | } | 2236 | } |
2174 | return X86EMUL_CONTINUE; | 2237 | return X86EMUL_CONTINUE; |
@@ -2556,23 +2619,23 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt, | |||
2556 | * it is handled in a context of new task | 2619 | * it is handled in a context of new task |
2557 | */ | 2620 | */ |
2558 | ret = __load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR, cpl, | 2621 | ret = __load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR, cpl, |
2559 | true, NULL); | 2622 | X86_TRANSFER_TASK_SWITCH, NULL); |
2560 | if (ret != X86EMUL_CONTINUE) | 2623 | if (ret != X86EMUL_CONTINUE) |
2561 | return ret; | 2624 | return ret; |
2562 | ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl, | 2625 | ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl, |
2563 | true, NULL); | 2626 | X86_TRANSFER_TASK_SWITCH, NULL); |
2564 | if (ret != X86EMUL_CONTINUE) | 2627 | if (ret != X86EMUL_CONTINUE) |
2565 | return ret; | 2628 | return ret; |
2566 | ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl, | 2629 | ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl, |
2567 | true, NULL); | 2630 | X86_TRANSFER_TASK_SWITCH, NULL); |
2568 | if (ret != X86EMUL_CONTINUE) | 2631 | if (ret != X86EMUL_CONTINUE) |
2569 | return ret; | 2632 | return ret; |
2570 | ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl, | 2633 | ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl, |
2571 | true, NULL); | 2634 | X86_TRANSFER_TASK_SWITCH, NULL); |
2572 | if (ret != X86EMUL_CONTINUE) | 2635 | if (ret != X86EMUL_CONTINUE) |
2573 | return ret; | 2636 | return ret; |
2574 | ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl, | 2637 | ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl, |
2575 | true, NULL); | 2638 | X86_TRANSFER_TASK_SWITCH, NULL); |
2576 | if (ret != X86EMUL_CONTINUE) | 2639 | if (ret != X86EMUL_CONTINUE) |
2577 | return ret; | 2640 | return ret; |
2578 | 2641 | ||
@@ -2694,31 +2757,31 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, | |||
2694 | * it is handled in a context of new task | 2757 | * it is handled in a context of new task |
2695 | */ | 2758 | */ |
2696 | ret = __load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR, | 2759 | ret = __load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR, |
2697 | cpl, true, NULL); | 2760 | cpl, X86_TRANSFER_TASK_SWITCH, NULL); |
2698 | if (ret != X86EMUL_CONTINUE) | 2761 | if (ret != X86EMUL_CONTINUE) |
2699 | return ret; | 2762 | return ret; |
2700 | ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl, | 2763 | ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl, |
2701 | true, NULL); | 2764 | X86_TRANSFER_TASK_SWITCH, NULL); |
2702 | if (ret != X86EMUL_CONTINUE) | 2765 | if (ret != X86EMUL_CONTINUE) |
2703 | return ret; | 2766 | return ret; |
2704 | ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl, | 2767 | ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl, |
2705 | true, NULL); | 2768 | X86_TRANSFER_TASK_SWITCH, NULL); |
2706 | if (ret != X86EMUL_CONTINUE) | 2769 | if (ret != X86EMUL_CONTINUE) |
2707 | return ret; | 2770 | return ret; |
2708 | ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl, | 2771 | ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl, |
2709 | true, NULL); | 2772 | X86_TRANSFER_TASK_SWITCH, NULL); |
2710 | if (ret != X86EMUL_CONTINUE) | 2773 | if (ret != X86EMUL_CONTINUE) |
2711 | return ret; | 2774 | return ret; |
2712 | ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl, | 2775 | ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl, |
2713 | true, NULL); | 2776 | X86_TRANSFER_TASK_SWITCH, NULL); |
2714 | if (ret != X86EMUL_CONTINUE) | 2777 | if (ret != X86EMUL_CONTINUE) |
2715 | return ret; | 2778 | return ret; |
2716 | ret = __load_segment_descriptor(ctxt, tss->fs, VCPU_SREG_FS, cpl, | 2779 | ret = __load_segment_descriptor(ctxt, tss->fs, VCPU_SREG_FS, cpl, |
2717 | true, NULL); | 2780 | X86_TRANSFER_TASK_SWITCH, NULL); |
2718 | if (ret != X86EMUL_CONTINUE) | 2781 | if (ret != X86EMUL_CONTINUE) |
2719 | return ret; | 2782 | return ret; |
2720 | ret = __load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS, cpl, | 2783 | ret = __load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS, cpl, |
2721 | true, NULL); | 2784 | X86_TRANSFER_TASK_SWITCH, NULL); |
2722 | if (ret != X86EMUL_CONTINUE) | 2785 | if (ret != X86EMUL_CONTINUE) |
2723 | return ret; | 2786 | return ret; |
2724 | 2787 | ||
@@ -2739,7 +2802,6 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, | |||
2739 | ret = ops->read_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg, | 2802 | ret = ops->read_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg, |
2740 | &ctxt->exception); | 2803 | &ctxt->exception); |
2741 | if (ret != X86EMUL_CONTINUE) | 2804 | if (ret != X86EMUL_CONTINUE) |
2742 | /* FIXME: need to provide precise fault address */ | ||
2743 | return ret; | 2805 | return ret; |
2744 | 2806 | ||
2745 | save_state_to_tss32(ctxt, &tss_seg); | 2807 | save_state_to_tss32(ctxt, &tss_seg); |
@@ -2748,13 +2810,11 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, | |||
2748 | ret = ops->write_std(ctxt, old_tss_base + eip_offset, &tss_seg.eip, | 2810 | ret = ops->write_std(ctxt, old_tss_base + eip_offset, &tss_seg.eip, |
2749 | ldt_sel_offset - eip_offset, &ctxt->exception); | 2811 | ldt_sel_offset - eip_offset, &ctxt->exception); |
2750 | if (ret != X86EMUL_CONTINUE) | 2812 | if (ret != X86EMUL_CONTINUE) |
2751 | /* FIXME: need to provide precise fault address */ | ||
2752 | return ret; | 2813 | return ret; |
2753 | 2814 | ||
2754 | ret = ops->read_std(ctxt, new_tss_base, &tss_seg, sizeof tss_seg, | 2815 | ret = ops->read_std(ctxt, new_tss_base, &tss_seg, sizeof tss_seg, |
2755 | &ctxt->exception); | 2816 | &ctxt->exception); |
2756 | if (ret != X86EMUL_CONTINUE) | 2817 | if (ret != X86EMUL_CONTINUE) |
2757 | /* FIXME: need to provide precise fault address */ | ||
2758 | return ret; | 2818 | return ret; |
2759 | 2819 | ||
2760 | if (old_tss_sel != 0xffff) { | 2820 | if (old_tss_sel != 0xffff) { |
@@ -2765,7 +2825,6 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, | |||
2765 | sizeof tss_seg.prev_task_link, | 2825 | sizeof tss_seg.prev_task_link, |
2766 | &ctxt->exception); | 2826 | &ctxt->exception); |
2767 | if (ret != X86EMUL_CONTINUE) | 2827 | if (ret != X86EMUL_CONTINUE) |
2768 | /* FIXME: need to provide precise fault address */ | ||
2769 | return ret; | 2828 | return ret; |
2770 | } | 2829 | } |
2771 | 2830 | ||
@@ -2999,15 +3058,16 @@ static int em_call_far(struct x86_emulate_ctxt *ctxt) | |||
2999 | struct desc_struct old_desc, new_desc; | 3058 | struct desc_struct old_desc, new_desc; |
3000 | const struct x86_emulate_ops *ops = ctxt->ops; | 3059 | const struct x86_emulate_ops *ops = ctxt->ops; |
3001 | int cpl = ctxt->ops->cpl(ctxt); | 3060 | int cpl = ctxt->ops->cpl(ctxt); |
3061 | enum x86emul_mode prev_mode = ctxt->mode; | ||
3002 | 3062 | ||
3003 | old_eip = ctxt->_eip; | 3063 | old_eip = ctxt->_eip; |
3004 | ops->get_segment(ctxt, &old_cs, &old_desc, NULL, VCPU_SREG_CS); | 3064 | ops->get_segment(ctxt, &old_cs, &old_desc, NULL, VCPU_SREG_CS); |
3005 | 3065 | ||
3006 | memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2); | 3066 | memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2); |
3007 | rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl, false, | 3067 | rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl, |
3008 | &new_desc); | 3068 | X86_TRANSFER_CALL_JMP, &new_desc); |
3009 | if (rc != X86EMUL_CONTINUE) | 3069 | if (rc != X86EMUL_CONTINUE) |
3010 | return X86EMUL_CONTINUE; | 3070 | return rc; |
3011 | 3071 | ||
3012 | rc = assign_eip_far(ctxt, ctxt->src.val, &new_desc); | 3072 | rc = assign_eip_far(ctxt, ctxt->src.val, &new_desc); |
3013 | if (rc != X86EMUL_CONTINUE) | 3073 | if (rc != X86EMUL_CONTINUE) |
@@ -3022,11 +3082,14 @@ static int em_call_far(struct x86_emulate_ctxt *ctxt) | |||
3022 | rc = em_push(ctxt); | 3082 | rc = em_push(ctxt); |
3023 | /* If we failed, we tainted the memory, but the very least we should | 3083 | /* If we failed, we tainted the memory, but the very least we should |
3024 | restore cs */ | 3084 | restore cs */ |
3025 | if (rc != X86EMUL_CONTINUE) | 3085 | if (rc != X86EMUL_CONTINUE) { |
3086 | pr_warn_once("faulting far call emulation tainted memory\n"); | ||
3026 | goto fail; | 3087 | goto fail; |
3088 | } | ||
3027 | return rc; | 3089 | return rc; |
3028 | fail: | 3090 | fail: |
3029 | ops->set_segment(ctxt, old_cs, &old_desc, 0, VCPU_SREG_CS); | 3091 | ops->set_segment(ctxt, old_cs, &old_desc, 0, VCPU_SREG_CS); |
3092 | ctxt->mode = prev_mode; | ||
3030 | return rc; | 3093 | return rc; |
3031 | 3094 | ||
3032 | } | 3095 | } |
@@ -3477,6 +3540,12 @@ static int em_clflush(struct x86_emulate_ctxt *ctxt) | |||
3477 | return X86EMUL_CONTINUE; | 3540 | return X86EMUL_CONTINUE; |
3478 | } | 3541 | } |
3479 | 3542 | ||
3543 | static int em_movsxd(struct x86_emulate_ctxt *ctxt) | ||
3544 | { | ||
3545 | ctxt->dst.val = (s32) ctxt->src.val; | ||
3546 | return X86EMUL_CONTINUE; | ||
3547 | } | ||
3548 | |||
3480 | static bool valid_cr(int nr) | 3549 | static bool valid_cr(int nr) |
3481 | { | 3550 | { |
3482 | switch (nr) { | 3551 | switch (nr) { |
@@ -3676,6 +3745,7 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt) | |||
3676 | #define G(_f, _g) { .flags = ((_f) | Group | ModRM), .u.group = (_g) } | 3745 | #define G(_f, _g) { .flags = ((_f) | Group | ModRM), .u.group = (_g) } |
3677 | #define GD(_f, _g) { .flags = ((_f) | GroupDual | ModRM), .u.gdual = (_g) } | 3746 | #define GD(_f, _g) { .flags = ((_f) | GroupDual | ModRM), .u.gdual = (_g) } |
3678 | #define ID(_f, _i) { .flags = ((_f) | InstrDual | ModRM), .u.idual = (_i) } | 3747 | #define ID(_f, _i) { .flags = ((_f) | InstrDual | ModRM), .u.idual = (_i) } |
3748 | #define MD(_f, _m) { .flags = ((_f) | ModeDual), .u.mdual = (_m) } | ||
3679 | #define E(_f, _e) { .flags = ((_f) | Escape | ModRM), .u.esc = (_e) } | 3749 | #define E(_f, _e) { .flags = ((_f) | Escape | ModRM), .u.esc = (_e) } |
3680 | #define I(_f, _e) { .flags = (_f), .u.execute = (_e) } | 3750 | #define I(_f, _e) { .flags = (_f), .u.execute = (_e) } |
3681 | #define F(_f, _e) { .flags = (_f) | Fastop, .u.fastop = (_e) } | 3751 | #define F(_f, _e) { .flags = (_f) | Fastop, .u.fastop = (_e) } |
@@ -3738,7 +3808,7 @@ static const struct opcode group1[] = { | |||
3738 | }; | 3808 | }; |
3739 | 3809 | ||
3740 | static const struct opcode group1A[] = { | 3810 | static const struct opcode group1A[] = { |
3741 | I(DstMem | SrcNone | Mov | Stack, em_pop), N, N, N, N, N, N, N, | 3811 | I(DstMem | SrcNone | Mov | Stack | IncSP, em_pop), N, N, N, N, N, N, N, |
3742 | }; | 3812 | }; |
3743 | 3813 | ||
3744 | static const struct opcode group2[] = { | 3814 | static const struct opcode group2[] = { |
@@ -3854,7 +3924,7 @@ static const struct gprefix pfx_0f_e7 = { | |||
3854 | }; | 3924 | }; |
3855 | 3925 | ||
3856 | static const struct escape escape_d9 = { { | 3926 | static const struct escape escape_d9 = { { |
3857 | N, N, N, N, N, N, N, I(DstMem, em_fnstcw), | 3927 | N, N, N, N, N, N, N, I(DstMem16 | Mov, em_fnstcw), |
3858 | }, { | 3928 | }, { |
3859 | /* 0xC0 - 0xC7 */ | 3929 | /* 0xC0 - 0xC7 */ |
3860 | N, N, N, N, N, N, N, N, | 3930 | N, N, N, N, N, N, N, N, |
@@ -3896,7 +3966,7 @@ static const struct escape escape_db = { { | |||
3896 | } }; | 3966 | } }; |
3897 | 3967 | ||
3898 | static const struct escape escape_dd = { { | 3968 | static const struct escape escape_dd = { { |
3899 | N, N, N, N, N, N, N, I(DstMem, em_fnstsw), | 3969 | N, N, N, N, N, N, N, I(DstMem16 | Mov, em_fnstsw), |
3900 | }, { | 3970 | }, { |
3901 | /* 0xC0 - 0xC7 */ | 3971 | /* 0xC0 - 0xC7 */ |
3902 | N, N, N, N, N, N, N, N, | 3972 | N, N, N, N, N, N, N, N, |
@@ -3920,6 +3990,10 @@ static const struct instr_dual instr_dual_0f_c3 = { | |||
3920 | I(DstMem | SrcReg | ModRM | No16 | Mov, em_mov), N | 3990 | I(DstMem | SrcReg | ModRM | No16 | Mov, em_mov), N |
3921 | }; | 3991 | }; |
3922 | 3992 | ||
3993 | static const struct mode_dual mode_dual_63 = { | ||
3994 | N, I(DstReg | SrcMem32 | ModRM | Mov, em_movsxd) | ||
3995 | }; | ||
3996 | |||
3923 | static const struct opcode opcode_table[256] = { | 3997 | static const struct opcode opcode_table[256] = { |
3924 | /* 0x00 - 0x07 */ | 3998 | /* 0x00 - 0x07 */ |
3925 | F6ALU(Lock, em_add), | 3999 | F6ALU(Lock, em_add), |
@@ -3954,7 +4028,7 @@ static const struct opcode opcode_table[256] = { | |||
3954 | /* 0x60 - 0x67 */ | 4028 | /* 0x60 - 0x67 */ |
3955 | I(ImplicitOps | Stack | No64, em_pusha), | 4029 | I(ImplicitOps | Stack | No64, em_pusha), |
3956 | I(ImplicitOps | Stack | No64, em_popa), | 4030 | I(ImplicitOps | Stack | No64, em_popa), |
3957 | N, D(DstReg | SrcMem32 | ModRM | Mov) /* movsxd (x86/64) */ , | 4031 | N, MD(ModRM, &mode_dual_63), |
3958 | N, N, N, N, | 4032 | N, N, N, N, |
3959 | /* 0x68 - 0x6F */ | 4033 | /* 0x68 - 0x6F */ |
3960 | I(SrcImm | Mov | Stack, em_push), | 4034 | I(SrcImm | Mov | Stack, em_push), |
@@ -4010,8 +4084,8 @@ static const struct opcode opcode_table[256] = { | |||
4010 | G(ByteOp, group11), G(0, group11), | 4084 | G(ByteOp, group11), G(0, group11), |
4011 | /* 0xC8 - 0xCF */ | 4085 | /* 0xC8 - 0xCF */ |
4012 | I(Stack | SrcImmU16 | Src2ImmByte, em_enter), I(Stack, em_leave), | 4086 | I(Stack | SrcImmU16 | Src2ImmByte, em_enter), I(Stack, em_leave), |
4013 | I(ImplicitOps | Stack | SrcImmU16, em_ret_far_imm), | 4087 | I(ImplicitOps | SrcImmU16, em_ret_far_imm), |
4014 | I(ImplicitOps | Stack, em_ret_far), | 4088 | I(ImplicitOps, em_ret_far), |
4015 | D(ImplicitOps), DI(SrcImmByte, intn), | 4089 | D(ImplicitOps), DI(SrcImmByte, intn), |
4016 | D(ImplicitOps | No64), II(ImplicitOps, em_iret, iret), | 4090 | D(ImplicitOps | No64), II(ImplicitOps, em_iret, iret), |
4017 | /* 0xD0 - 0xD7 */ | 4091 | /* 0xD0 - 0xD7 */ |
@@ -4108,7 +4182,7 @@ static const struct opcode twobyte_table[256] = { | |||
4108 | F(DstMem | SrcReg | Src2CL | ModRM, em_shrd), | 4182 | F(DstMem | SrcReg | Src2CL | ModRM, em_shrd), |
4109 | GD(0, &group15), F(DstReg | SrcMem | ModRM, em_imul), | 4183 | GD(0, &group15), F(DstReg | SrcMem | ModRM, em_imul), |
4110 | /* 0xB0 - 0xB7 */ | 4184 | /* 0xB0 - 0xB7 */ |
4111 | I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_cmpxchg), | 4185 | I2bv(DstMem | SrcReg | ModRM | Lock | PageTable | SrcWrite, em_cmpxchg), |
4112 | I(DstReg | SrcMemFAddr | ModRM | Src2SS, em_lseg), | 4186 | I(DstReg | SrcMemFAddr | ModRM | Src2SS, em_lseg), |
4113 | F(DstMem | SrcReg | ModRM | BitOp | Lock, em_btr), | 4187 | F(DstMem | SrcReg | ModRM | BitOp | Lock, em_btr), |
4114 | I(DstReg | SrcMemFAddr | ModRM | Src2FS, em_lseg), | 4188 | I(DstReg | SrcMemFAddr | ModRM | Src2FS, em_lseg), |
@@ -4174,6 +4248,8 @@ static const struct opcode opcode_map_0f_38[256] = { | |||
4174 | #undef I | 4248 | #undef I |
4175 | #undef GP | 4249 | #undef GP |
4176 | #undef EXT | 4250 | #undef EXT |
4251 | #undef MD | ||
4252 | #undef ID | ||
4177 | 4253 | ||
4178 | #undef D2bv | 4254 | #undef D2bv |
4179 | #undef D2bvIP | 4255 | #undef D2bvIP |
@@ -4563,6 +4639,12 @@ done_prefixes: | |||
4563 | else | 4639 | else |
4564 | opcode = opcode.u.idual->mod012; | 4640 | opcode = opcode.u.idual->mod012; |
4565 | break; | 4641 | break; |
4642 | case ModeDual: | ||
4643 | if (ctxt->mode == X86EMUL_MODE_PROT64) | ||
4644 | opcode = opcode.u.mdual->mode64; | ||
4645 | else | ||
4646 | opcode = opcode.u.mdual->mode32; | ||
4647 | break; | ||
4566 | default: | 4648 | default: |
4567 | return EMULATION_FAILED; | 4649 | return EMULATION_FAILED; |
4568 | } | 4650 | } |
@@ -4860,8 +4942,13 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) | |||
4860 | /* optimisation - avoid slow emulated read if Mov */ | 4942 | /* optimisation - avoid slow emulated read if Mov */ |
4861 | rc = segmented_read(ctxt, ctxt->dst.addr.mem, | 4943 | rc = segmented_read(ctxt, ctxt->dst.addr.mem, |
4862 | &ctxt->dst.val, ctxt->dst.bytes); | 4944 | &ctxt->dst.val, ctxt->dst.bytes); |
4863 | if (rc != X86EMUL_CONTINUE) | 4945 | if (rc != X86EMUL_CONTINUE) { |
4946 | if (!(ctxt->d & NoWrite) && | ||
4947 | rc == X86EMUL_PROPAGATE_FAULT && | ||
4948 | ctxt->exception.vector == PF_VECTOR) | ||
4949 | ctxt->exception.error_code |= PFERR_WRITE_MASK; | ||
4864 | goto done; | 4950 | goto done; |
4951 | } | ||
4865 | } | 4952 | } |
4866 | ctxt->dst.orig_val = ctxt->dst.val; | 4953 | ctxt->dst.orig_val = ctxt->dst.val; |
4867 | 4954 | ||
@@ -4899,11 +4986,6 @@ special_insn: | |||
4899 | goto threebyte_insn; | 4986 | goto threebyte_insn; |
4900 | 4987 | ||
4901 | switch (ctxt->b) { | 4988 | switch (ctxt->b) { |
4902 | case 0x63: /* movsxd */ | ||
4903 | if (ctxt->mode != X86EMUL_MODE_PROT64) | ||
4904 | goto cannot_emulate; | ||
4905 | ctxt->dst.val = (s32) ctxt->src.val; | ||
4906 | break; | ||
4907 | case 0x70 ... 0x7f: /* jcc (short) */ | 4989 | case 0x70 ... 0x7f: /* jcc (short) */ |
4908 | if (test_cc(ctxt->b, ctxt->eflags)) | 4990 | if (test_cc(ctxt->b, ctxt->eflags)) |
4909 | rc = jmp_rel(ctxt, ctxt->src.val); | 4991 | rc = jmp_rel(ctxt, ctxt->src.val); |
diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index 3c9195535ffc..c2e36d934af4 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h | |||
@@ -98,7 +98,7 @@ static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm) | |||
98 | } | 98 | } |
99 | 99 | ||
100 | void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu); | 100 | void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu); |
101 | int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, | 101 | bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, |
102 | int short_hand, unsigned int dest, int dest_mode); | 102 | int short_hand, unsigned int dest, int dest_mode); |
103 | int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2); | 103 | int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2); |
104 | void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector, | 104 | void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector, |
diff --git a/arch/x86/kvm/iommu.c b/arch/x86/kvm/iommu.c index 17b73eeac8a4..7dbced309ddb 100644 --- a/arch/x86/kvm/iommu.c +++ b/arch/x86/kvm/iommu.c | |||
@@ -138,7 +138,7 @@ int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot) | |||
138 | 138 | ||
139 | gfn += page_size >> PAGE_SHIFT; | 139 | gfn += page_size >> PAGE_SHIFT; |
140 | 140 | ||
141 | 141 | cond_resched(); | |
142 | } | 142 | } |
143 | 143 | ||
144 | return 0; | 144 | return 0; |
@@ -306,6 +306,8 @@ static void kvm_iommu_put_pages(struct kvm *kvm, | |||
306 | kvm_unpin_pages(kvm, pfn, unmap_pages); | 306 | kvm_unpin_pages(kvm, pfn, unmap_pages); |
307 | 307 | ||
308 | gfn += unmap_pages; | 308 | gfn += unmap_pages; |
309 | |||
310 | cond_resched(); | ||
309 | } | 311 | } |
310 | } | 312 | } |
311 | 313 | ||
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index d52dcf0776ea..e55b5fc344eb 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c | |||
@@ -33,6 +33,7 @@ | |||
33 | #include <asm/page.h> | 33 | #include <asm/page.h> |
34 | #include <asm/current.h> | 34 | #include <asm/current.h> |
35 | #include <asm/apicdef.h> | 35 | #include <asm/apicdef.h> |
36 | #include <asm/delay.h> | ||
36 | #include <linux/atomic.h> | 37 | #include <linux/atomic.h> |
37 | #include <linux/jump_label.h> | 38 | #include <linux/jump_label.h> |
38 | #include "kvm_cache_regs.h" | 39 | #include "kvm_cache_regs.h" |
@@ -327,17 +328,24 @@ static u8 count_vectors(void *bitmap) | |||
327 | return count; | 328 | return count; |
328 | } | 329 | } |
329 | 330 | ||
330 | void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir) | 331 | void __kvm_apic_update_irr(u32 *pir, void *regs) |
331 | { | 332 | { |
332 | u32 i, pir_val; | 333 | u32 i, pir_val; |
333 | struct kvm_lapic *apic = vcpu->arch.apic; | ||
334 | 334 | ||
335 | for (i = 0; i <= 7; i++) { | 335 | for (i = 0; i <= 7; i++) { |
336 | pir_val = xchg(&pir[i], 0); | 336 | pir_val = xchg(&pir[i], 0); |
337 | if (pir_val) | 337 | if (pir_val) |
338 | *((u32 *)(apic->regs + APIC_IRR + i * 0x10)) |= pir_val; | 338 | *((u32 *)(regs + APIC_IRR + i * 0x10)) |= pir_val; |
339 | } | 339 | } |
340 | } | 340 | } |
341 | EXPORT_SYMBOL_GPL(__kvm_apic_update_irr); | ||
342 | |||
343 | void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir) | ||
344 | { | ||
345 | struct kvm_lapic *apic = vcpu->arch.apic; | ||
346 | |||
347 | __kvm_apic_update_irr(pir, apic->regs); | ||
348 | } | ||
341 | EXPORT_SYMBOL_GPL(kvm_apic_update_irr); | 349 | EXPORT_SYMBOL_GPL(kvm_apic_update_irr); |
342 | 350 | ||
343 | static inline void apic_set_irr(int vec, struct kvm_lapic *apic) | 351 | static inline void apic_set_irr(int vec, struct kvm_lapic *apic) |
@@ -405,7 +413,7 @@ static inline void apic_set_isr(int vec, struct kvm_lapic *apic) | |||
405 | * because the processor can modify ISR under the hood. Instead | 413 | * because the processor can modify ISR under the hood. Instead |
406 | * just set SVI. | 414 | * just set SVI. |
407 | */ | 415 | */ |
408 | if (unlikely(kvm_apic_vid_enabled(vcpu->kvm))) | 416 | if (unlikely(kvm_x86_ops->hwapic_isr_update)) |
409 | kvm_x86_ops->hwapic_isr_update(vcpu->kvm, vec); | 417 | kvm_x86_ops->hwapic_isr_update(vcpu->kvm, vec); |
410 | else { | 418 | else { |
411 | ++apic->isr_count; | 419 | ++apic->isr_count; |
@@ -453,7 +461,7 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic) | |||
453 | * on the other hand isr_count and highest_isr_cache are unused | 461 | * on the other hand isr_count and highest_isr_cache are unused |
454 | * and must be left alone. | 462 | * and must be left alone. |
455 | */ | 463 | */ |
456 | if (unlikely(kvm_apic_vid_enabled(vcpu->kvm))) | 464 | if (unlikely(kvm_x86_ops->hwapic_isr_update)) |
457 | kvm_x86_ops->hwapic_isr_update(vcpu->kvm, | 465 | kvm_x86_ops->hwapic_isr_update(vcpu->kvm, |
458 | apic_find_highest_isr(apic)); | 466 | apic_find_highest_isr(apic)); |
459 | else { | 467 | else { |
@@ -580,55 +588,48 @@ static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr) | |||
580 | apic_update_ppr(apic); | 588 | apic_update_ppr(apic); |
581 | } | 589 | } |
582 | 590 | ||
583 | static int kvm_apic_broadcast(struct kvm_lapic *apic, u32 dest) | 591 | static bool kvm_apic_broadcast(struct kvm_lapic *apic, u32 dest) |
584 | { | 592 | { |
585 | return dest == (apic_x2apic_mode(apic) ? | 593 | return dest == (apic_x2apic_mode(apic) ? |
586 | X2APIC_BROADCAST : APIC_BROADCAST); | 594 | X2APIC_BROADCAST : APIC_BROADCAST); |
587 | } | 595 | } |
588 | 596 | ||
589 | int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 dest) | 597 | static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 dest) |
590 | { | 598 | { |
591 | return kvm_apic_id(apic) == dest || kvm_apic_broadcast(apic, dest); | 599 | return kvm_apic_id(apic) == dest || kvm_apic_broadcast(apic, dest); |
592 | } | 600 | } |
593 | 601 | ||
594 | int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda) | 602 | static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda) |
595 | { | 603 | { |
596 | int result = 0; | ||
597 | u32 logical_id; | 604 | u32 logical_id; |
598 | 605 | ||
599 | if (kvm_apic_broadcast(apic, mda)) | 606 | if (kvm_apic_broadcast(apic, mda)) |
600 | return 1; | 607 | return true; |
601 | 608 | ||
602 | if (apic_x2apic_mode(apic)) { | 609 | logical_id = kvm_apic_get_reg(apic, APIC_LDR); |
603 | logical_id = kvm_apic_get_reg(apic, APIC_LDR); | ||
604 | return logical_id & mda; | ||
605 | } | ||
606 | 610 | ||
607 | logical_id = GET_APIC_LOGICAL_ID(kvm_apic_get_reg(apic, APIC_LDR)); | 611 | if (apic_x2apic_mode(apic)) |
612 | return ((logical_id >> 16) == (mda >> 16)) | ||
613 | && (logical_id & mda & 0xffff) != 0; | ||
614 | |||
615 | logical_id = GET_APIC_LOGICAL_ID(logical_id); | ||
608 | 616 | ||
609 | switch (kvm_apic_get_reg(apic, APIC_DFR)) { | 617 | switch (kvm_apic_get_reg(apic, APIC_DFR)) { |
610 | case APIC_DFR_FLAT: | 618 | case APIC_DFR_FLAT: |
611 | if (logical_id & mda) | 619 | return (logical_id & mda) != 0; |
612 | result = 1; | ||
613 | break; | ||
614 | case APIC_DFR_CLUSTER: | 620 | case APIC_DFR_CLUSTER: |
615 | if (((logical_id >> 4) == (mda >> 0x4)) | 621 | return ((logical_id >> 4) == (mda >> 4)) |
616 | && (logical_id & mda & 0xf)) | 622 | && (logical_id & mda & 0xf) != 0; |
617 | result = 1; | ||
618 | break; | ||
619 | default: | 623 | default: |
620 | apic_debug("Bad DFR vcpu %d: %08x\n", | 624 | apic_debug("Bad DFR vcpu %d: %08x\n", |
621 | apic->vcpu->vcpu_id, kvm_apic_get_reg(apic, APIC_DFR)); | 625 | apic->vcpu->vcpu_id, kvm_apic_get_reg(apic, APIC_DFR)); |
622 | break; | 626 | return false; |
623 | } | 627 | } |
624 | |||
625 | return result; | ||
626 | } | 628 | } |
627 | 629 | ||
628 | int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, | 630 | bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, |
629 | int short_hand, unsigned int dest, int dest_mode) | 631 | int short_hand, unsigned int dest, int dest_mode) |
630 | { | 632 | { |
631 | int result = 0; | ||
632 | struct kvm_lapic *target = vcpu->arch.apic; | 633 | struct kvm_lapic *target = vcpu->arch.apic; |
633 | 634 | ||
634 | apic_debug("target %p, source %p, dest 0x%x, " | 635 | apic_debug("target %p, source %p, dest 0x%x, " |
@@ -638,29 +639,21 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, | |||
638 | ASSERT(target); | 639 | ASSERT(target); |
639 | switch (short_hand) { | 640 | switch (short_hand) { |
640 | case APIC_DEST_NOSHORT: | 641 | case APIC_DEST_NOSHORT: |
641 | if (dest_mode == 0) | 642 | if (dest_mode == APIC_DEST_PHYSICAL) |
642 | /* Physical mode. */ | 643 | return kvm_apic_match_physical_addr(target, dest); |
643 | result = kvm_apic_match_physical_addr(target, dest); | ||
644 | else | 644 | else |
645 | /* Logical mode. */ | 645 | return kvm_apic_match_logical_addr(target, dest); |
646 | result = kvm_apic_match_logical_addr(target, dest); | ||
647 | break; | ||
648 | case APIC_DEST_SELF: | 646 | case APIC_DEST_SELF: |
649 | result = (target == source); | 647 | return target == source; |
650 | break; | ||
651 | case APIC_DEST_ALLINC: | 648 | case APIC_DEST_ALLINC: |
652 | result = 1; | 649 | return true; |
653 | break; | ||
654 | case APIC_DEST_ALLBUT: | 650 | case APIC_DEST_ALLBUT: |
655 | result = (target != source); | 651 | return target != source; |
656 | break; | ||
657 | default: | 652 | default: |
658 | apic_debug("kvm: apic: Bad dest shorthand value %x\n", | 653 | apic_debug("kvm: apic: Bad dest shorthand value %x\n", |
659 | short_hand); | 654 | short_hand); |
660 | break; | 655 | return false; |
661 | } | 656 | } |
662 | |||
663 | return result; | ||
664 | } | 657 | } |
665 | 658 | ||
666 | bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, | 659 | bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, |
@@ -693,7 +686,7 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, | |||
693 | 686 | ||
694 | ret = true; | 687 | ret = true; |
695 | 688 | ||
696 | if (irq->dest_mode == 0) { /* physical mode */ | 689 | if (irq->dest_mode == APIC_DEST_PHYSICAL) { |
697 | if (irq->dest_id >= ARRAY_SIZE(map->phys_map)) | 690 | if (irq->dest_id >= ARRAY_SIZE(map->phys_map)) |
698 | goto out; | 691 | goto out; |
699 | 692 | ||
@@ -1076,25 +1069,72 @@ static void apic_timer_expired(struct kvm_lapic *apic) | |||
1076 | { | 1069 | { |
1077 | struct kvm_vcpu *vcpu = apic->vcpu; | 1070 | struct kvm_vcpu *vcpu = apic->vcpu; |
1078 | wait_queue_head_t *q = &vcpu->wq; | 1071 | wait_queue_head_t *q = &vcpu->wq; |
1072 | struct kvm_timer *ktimer = &apic->lapic_timer; | ||
1079 | 1073 | ||
1080 | /* | ||
1081 | * Note: KVM_REQ_PENDING_TIMER is implicitly checked in | ||
1082 | * vcpu_enter_guest. | ||
1083 | */ | ||
1084 | if (atomic_read(&apic->lapic_timer.pending)) | 1074 | if (atomic_read(&apic->lapic_timer.pending)) |
1085 | return; | 1075 | return; |
1086 | 1076 | ||
1087 | atomic_inc(&apic->lapic_timer.pending); | 1077 | atomic_inc(&apic->lapic_timer.pending); |
1088 | /* FIXME: this code should not know anything about vcpus */ | 1078 | kvm_set_pending_timer(vcpu); |
1089 | kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); | ||
1090 | 1079 | ||
1091 | if (waitqueue_active(q)) | 1080 | if (waitqueue_active(q)) |
1092 | wake_up_interruptible(q); | 1081 | wake_up_interruptible(q); |
1082 | |||
1083 | if (apic_lvtt_tscdeadline(apic)) | ||
1084 | ktimer->expired_tscdeadline = ktimer->tscdeadline; | ||
1085 | } | ||
1086 | |||
1087 | /* | ||
1088 | * On APICv, this test will cause a busy wait | ||
1089 | * during a higher-priority task. | ||
1090 | */ | ||
1091 | |||
1092 | static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu) | ||
1093 | { | ||
1094 | struct kvm_lapic *apic = vcpu->arch.apic; | ||
1095 | u32 reg = kvm_apic_get_reg(apic, APIC_LVTT); | ||
1096 | |||
1097 | if (kvm_apic_hw_enabled(apic)) { | ||
1098 | int vec = reg & APIC_VECTOR_MASK; | ||
1099 | void *bitmap = apic->regs + APIC_ISR; | ||
1100 | |||
1101 | if (kvm_x86_ops->deliver_posted_interrupt) | ||
1102 | bitmap = apic->regs + APIC_IRR; | ||
1103 | |||
1104 | if (apic_test_vector(vec, bitmap)) | ||
1105 | return true; | ||
1106 | } | ||
1107 | return false; | ||
1108 | } | ||
1109 | |||
1110 | void wait_lapic_expire(struct kvm_vcpu *vcpu) | ||
1111 | { | ||
1112 | struct kvm_lapic *apic = vcpu->arch.apic; | ||
1113 | u64 guest_tsc, tsc_deadline; | ||
1114 | |||
1115 | if (!kvm_vcpu_has_lapic(vcpu)) | ||
1116 | return; | ||
1117 | |||
1118 | if (apic->lapic_timer.expired_tscdeadline == 0) | ||
1119 | return; | ||
1120 | |||
1121 | if (!lapic_timer_int_injected(vcpu)) | ||
1122 | return; | ||
1123 | |||
1124 | tsc_deadline = apic->lapic_timer.expired_tscdeadline; | ||
1125 | apic->lapic_timer.expired_tscdeadline = 0; | ||
1126 | guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc()); | ||
1127 | trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline); | ||
1128 | |||
1129 | /* __delay is delay_tsc whenever the hardware has TSC, thus always. */ | ||
1130 | if (guest_tsc < tsc_deadline) | ||
1131 | __delay(tsc_deadline - guest_tsc); | ||
1093 | } | 1132 | } |
1094 | 1133 | ||
1095 | static void start_apic_timer(struct kvm_lapic *apic) | 1134 | static void start_apic_timer(struct kvm_lapic *apic) |
1096 | { | 1135 | { |
1097 | ktime_t now; | 1136 | ktime_t now; |
1137 | |||
1098 | atomic_set(&apic->lapic_timer.pending, 0); | 1138 | atomic_set(&apic->lapic_timer.pending, 0); |
1099 | 1139 | ||
1100 | if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) { | 1140 | if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) { |
@@ -1140,6 +1180,7 @@ static void start_apic_timer(struct kvm_lapic *apic) | |||
1140 | /* lapic timer in tsc deadline mode */ | 1180 | /* lapic timer in tsc deadline mode */ |
1141 | u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline; | 1181 | u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline; |
1142 | u64 ns = 0; | 1182 | u64 ns = 0; |
1183 | ktime_t expire; | ||
1143 | struct kvm_vcpu *vcpu = apic->vcpu; | 1184 | struct kvm_vcpu *vcpu = apic->vcpu; |
1144 | unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz; | 1185 | unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz; |
1145 | unsigned long flags; | 1186 | unsigned long flags; |
@@ -1154,8 +1195,10 @@ static void start_apic_timer(struct kvm_lapic *apic) | |||
1154 | if (likely(tscdeadline > guest_tsc)) { | 1195 | if (likely(tscdeadline > guest_tsc)) { |
1155 | ns = (tscdeadline - guest_tsc) * 1000000ULL; | 1196 | ns = (tscdeadline - guest_tsc) * 1000000ULL; |
1156 | do_div(ns, this_tsc_khz); | 1197 | do_div(ns, this_tsc_khz); |
1198 | expire = ktime_add_ns(now, ns); | ||
1199 | expire = ktime_sub_ns(expire, lapic_timer_advance_ns); | ||
1157 | hrtimer_start(&apic->lapic_timer.timer, | 1200 | hrtimer_start(&apic->lapic_timer.timer, |
1158 | ktime_add_ns(now, ns), HRTIMER_MODE_ABS); | 1201 | expire, HRTIMER_MODE_ABS); |
1159 | } else | 1202 | } else |
1160 | apic_timer_expired(apic); | 1203 | apic_timer_expired(apic); |
1161 | 1204 | ||
@@ -1745,7 +1788,9 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu, | |||
1745 | if (kvm_x86_ops->hwapic_irr_update) | 1788 | if (kvm_x86_ops->hwapic_irr_update) |
1746 | kvm_x86_ops->hwapic_irr_update(vcpu, | 1789 | kvm_x86_ops->hwapic_irr_update(vcpu, |
1747 | apic_find_highest_irr(apic)); | 1790 | apic_find_highest_irr(apic)); |
1748 | kvm_x86_ops->hwapic_isr_update(vcpu->kvm, apic_find_highest_isr(apic)); | 1791 | if (unlikely(kvm_x86_ops->hwapic_isr_update)) |
1792 | kvm_x86_ops->hwapic_isr_update(vcpu->kvm, | ||
1793 | apic_find_highest_isr(apic)); | ||
1749 | kvm_make_request(KVM_REQ_EVENT, vcpu); | 1794 | kvm_make_request(KVM_REQ_EVENT, vcpu); |
1750 | kvm_rtc_eoi_tracking_restore_one(vcpu); | 1795 | kvm_rtc_eoi_tracking_restore_one(vcpu); |
1751 | } | 1796 | } |
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index c674fce53cf9..0bc6c656625b 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h | |||
@@ -14,6 +14,7 @@ struct kvm_timer { | |||
14 | u32 timer_mode; | 14 | u32 timer_mode; |
15 | u32 timer_mode_mask; | 15 | u32 timer_mode_mask; |
16 | u64 tscdeadline; | 16 | u64 tscdeadline; |
17 | u64 expired_tscdeadline; | ||
17 | atomic_t pending; /* accumulated triggered timers */ | 18 | atomic_t pending; /* accumulated triggered timers */ |
18 | }; | 19 | }; |
19 | 20 | ||
@@ -56,9 +57,8 @@ u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu); | |||
56 | void kvm_apic_set_version(struct kvm_vcpu *vcpu); | 57 | void kvm_apic_set_version(struct kvm_vcpu *vcpu); |
57 | 58 | ||
58 | void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr); | 59 | void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr); |
60 | void __kvm_apic_update_irr(u32 *pir, void *regs); | ||
59 | void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir); | 61 | void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir); |
60 | int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 dest); | ||
61 | int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda); | ||
62 | int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, | 62 | int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, |
63 | unsigned long *dest_map); | 63 | unsigned long *dest_map); |
64 | int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type); | 64 | int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type); |
@@ -170,4 +170,6 @@ static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu) | |||
170 | 170 | ||
171 | bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector); | 171 | bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector); |
172 | 172 | ||
173 | void wait_lapic_expire(struct kvm_vcpu *vcpu); | ||
174 | |||
173 | #endif | 175 | #endif |
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index f83fc6c5e0ba..cee759299a35 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c | |||
@@ -63,30 +63,16 @@ enum { | |||
63 | #undef MMU_DEBUG | 63 | #undef MMU_DEBUG |
64 | 64 | ||
65 | #ifdef MMU_DEBUG | 65 | #ifdef MMU_DEBUG |
66 | static bool dbg = 0; | ||
67 | module_param(dbg, bool, 0644); | ||
66 | 68 | ||
67 | #define pgprintk(x...) do { if (dbg) printk(x); } while (0) | 69 | #define pgprintk(x...) do { if (dbg) printk(x); } while (0) |
68 | #define rmap_printk(x...) do { if (dbg) printk(x); } while (0) | 70 | #define rmap_printk(x...) do { if (dbg) printk(x); } while (0) |
69 | 71 | #define MMU_WARN_ON(x) WARN_ON(x) | |
70 | #else | 72 | #else |
71 | |||
72 | #define pgprintk(x...) do { } while (0) | 73 | #define pgprintk(x...) do { } while (0) |
73 | #define rmap_printk(x...) do { } while (0) | 74 | #define rmap_printk(x...) do { } while (0) |
74 | 75 | #define MMU_WARN_ON(x) do { } while (0) | |
75 | #endif | ||
76 | |||
77 | #ifdef MMU_DEBUG | ||
78 | static bool dbg = 0; | ||
79 | module_param(dbg, bool, 0644); | ||
80 | #endif | ||
81 | |||
82 | #ifndef MMU_DEBUG | ||
83 | #define ASSERT(x) do { } while (0) | ||
84 | #else | ||
85 | #define ASSERT(x) \ | ||
86 | if (!(x)) { \ | ||
87 | printk(KERN_WARNING "assertion failed %s:%d: %s\n", \ | ||
88 | __FILE__, __LINE__, #x); \ | ||
89 | } | ||
90 | #endif | 76 | #endif |
91 | 77 | ||
92 | #define PTE_PREFETCH_NUM 8 | 78 | #define PTE_PREFETCH_NUM 8 |
@@ -546,6 +532,11 @@ static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask) | |||
546 | return (old_spte & bit_mask) && !(new_spte & bit_mask); | 532 | return (old_spte & bit_mask) && !(new_spte & bit_mask); |
547 | } | 533 | } |
548 | 534 | ||
535 | static bool spte_is_bit_changed(u64 old_spte, u64 new_spte, u64 bit_mask) | ||
536 | { | ||
537 | return (old_spte & bit_mask) != (new_spte & bit_mask); | ||
538 | } | ||
539 | |||
549 | /* Rules for using mmu_spte_set: | 540 | /* Rules for using mmu_spte_set: |
550 | * Set the sptep from nonpresent to present. | 541 | * Set the sptep from nonpresent to present. |
551 | * Note: the sptep being assigned *must* be either not present | 542 | * Note: the sptep being assigned *must* be either not present |
@@ -596,6 +587,14 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) | |||
596 | if (!shadow_accessed_mask) | 587 | if (!shadow_accessed_mask) |
597 | return ret; | 588 | return ret; |
598 | 589 | ||
590 | /* | ||
591 | * Flush TLB when accessed/dirty bits are changed in the page tables, | ||
592 | * to guarantee consistency between TLB and page tables. | ||
593 | */ | ||
594 | if (spte_is_bit_changed(old_spte, new_spte, | ||
595 | shadow_accessed_mask | shadow_dirty_mask)) | ||
596 | ret = true; | ||
597 | |||
599 | if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask)) | 598 | if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask)) |
600 | kvm_set_pfn_accessed(spte_to_pfn(old_spte)); | 599 | kvm_set_pfn_accessed(spte_to_pfn(old_spte)); |
601 | if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask)) | 600 | if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask)) |
@@ -1216,6 +1215,60 @@ static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, | |||
1216 | return flush; | 1215 | return flush; |
1217 | } | 1216 | } |
1218 | 1217 | ||
1218 | static bool spte_clear_dirty(struct kvm *kvm, u64 *sptep) | ||
1219 | { | ||
1220 | u64 spte = *sptep; | ||
1221 | |||
1222 | rmap_printk("rmap_clear_dirty: spte %p %llx\n", sptep, *sptep); | ||
1223 | |||
1224 | spte &= ~shadow_dirty_mask; | ||
1225 | |||
1226 | return mmu_spte_update(sptep, spte); | ||
1227 | } | ||
1228 | |||
1229 | static bool __rmap_clear_dirty(struct kvm *kvm, unsigned long *rmapp) | ||
1230 | { | ||
1231 | u64 *sptep; | ||
1232 | struct rmap_iterator iter; | ||
1233 | bool flush = false; | ||
1234 | |||
1235 | for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { | ||
1236 | BUG_ON(!(*sptep & PT_PRESENT_MASK)); | ||
1237 | |||
1238 | flush |= spte_clear_dirty(kvm, sptep); | ||
1239 | sptep = rmap_get_next(&iter); | ||
1240 | } | ||
1241 | |||
1242 | return flush; | ||
1243 | } | ||
1244 | |||
1245 | static bool spte_set_dirty(struct kvm *kvm, u64 *sptep) | ||
1246 | { | ||
1247 | u64 spte = *sptep; | ||
1248 | |||
1249 | rmap_printk("rmap_set_dirty: spte %p %llx\n", sptep, *sptep); | ||
1250 | |||
1251 | spte |= shadow_dirty_mask; | ||
1252 | |||
1253 | return mmu_spte_update(sptep, spte); | ||
1254 | } | ||
1255 | |||
1256 | static bool __rmap_set_dirty(struct kvm *kvm, unsigned long *rmapp) | ||
1257 | { | ||
1258 | u64 *sptep; | ||
1259 | struct rmap_iterator iter; | ||
1260 | bool flush = false; | ||
1261 | |||
1262 | for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { | ||
1263 | BUG_ON(!(*sptep & PT_PRESENT_MASK)); | ||
1264 | |||
1265 | flush |= spte_set_dirty(kvm, sptep); | ||
1266 | sptep = rmap_get_next(&iter); | ||
1267 | } | ||
1268 | |||
1269 | return flush; | ||
1270 | } | ||
1271 | |||
1219 | /** | 1272 | /** |
1220 | * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages | 1273 | * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages |
1221 | * @kvm: kvm instance | 1274 | * @kvm: kvm instance |
@@ -1226,7 +1279,7 @@ static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, | |||
1226 | * Used when we do not need to care about huge page mappings: e.g. during dirty | 1279 | * Used when we do not need to care about huge page mappings: e.g. during dirty |
1227 | * logging we do not have any such mappings. | 1280 | * logging we do not have any such mappings. |
1228 | */ | 1281 | */ |
1229 | void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, | 1282 | static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, |
1230 | struct kvm_memory_slot *slot, | 1283 | struct kvm_memory_slot *slot, |
1231 | gfn_t gfn_offset, unsigned long mask) | 1284 | gfn_t gfn_offset, unsigned long mask) |
1232 | { | 1285 | { |
@@ -1242,6 +1295,53 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, | |||
1242 | } | 1295 | } |
1243 | } | 1296 | } |
1244 | 1297 | ||
1298 | /** | ||
1299 | * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages | ||
1300 | * @kvm: kvm instance | ||
1301 | * @slot: slot to clear D-bit | ||
1302 | * @gfn_offset: start of the BITS_PER_LONG pages we care about | ||
1303 | * @mask: indicates which pages we should clear D-bit | ||
1304 | * | ||
1305 | * Used for PML to re-log the dirty GPAs after userspace querying dirty_bitmap. | ||
1306 | */ | ||
1307 | void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, | ||
1308 | struct kvm_memory_slot *slot, | ||
1309 | gfn_t gfn_offset, unsigned long mask) | ||
1310 | { | ||
1311 | unsigned long *rmapp; | ||
1312 | |||
1313 | while (mask) { | ||
1314 | rmapp = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), | ||
1315 | PT_PAGE_TABLE_LEVEL, slot); | ||
1316 | __rmap_clear_dirty(kvm, rmapp); | ||
1317 | |||
1318 | /* clear the first set bit */ | ||
1319 | mask &= mask - 1; | ||
1320 | } | ||
1321 | } | ||
1322 | EXPORT_SYMBOL_GPL(kvm_mmu_clear_dirty_pt_masked); | ||
1323 | |||
1324 | /** | ||
1325 | * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected | ||
1326 | * PT level pages. | ||
1327 | * | ||
1328 | * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to | ||
1329 | * enable dirty logging for them. | ||
1330 | * | ||
1331 | * Used when we do not need to care about huge page mappings: e.g. during dirty | ||
1332 | * logging we do not have any such mappings. | ||
1333 | */ | ||
1334 | void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, | ||
1335 | struct kvm_memory_slot *slot, | ||
1336 | gfn_t gfn_offset, unsigned long mask) | ||
1337 | { | ||
1338 | if (kvm_x86_ops->enable_log_dirty_pt_masked) | ||
1339 | kvm_x86_ops->enable_log_dirty_pt_masked(kvm, slot, gfn_offset, | ||
1340 | mask); | ||
1341 | else | ||
1342 | kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); | ||
1343 | } | ||
1344 | |||
1245 | static bool rmap_write_protect(struct kvm *kvm, u64 gfn) | 1345 | static bool rmap_write_protect(struct kvm *kvm, u64 gfn) |
1246 | { | 1346 | { |
1247 | struct kvm_memory_slot *slot; | 1347 | struct kvm_memory_slot *slot; |
@@ -1536,7 +1636,7 @@ static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr) | |||
1536 | 1636 | ||
1537 | static void kvm_mmu_free_page(struct kvm_mmu_page *sp) | 1637 | static void kvm_mmu_free_page(struct kvm_mmu_page *sp) |
1538 | { | 1638 | { |
1539 | ASSERT(is_empty_shadow_page(sp->spt)); | 1639 | MMU_WARN_ON(!is_empty_shadow_page(sp->spt)); |
1540 | hlist_del(&sp->hash_link); | 1640 | hlist_del(&sp->hash_link); |
1541 | list_del(&sp->link); | 1641 | list_del(&sp->link); |
1542 | free_page((unsigned long)sp->spt); | 1642 | free_page((unsigned long)sp->spt); |
@@ -2501,8 +2601,10 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
2501 | } | 2601 | } |
2502 | } | 2602 | } |
2503 | 2603 | ||
2504 | if (pte_access & ACC_WRITE_MASK) | 2604 | if (pte_access & ACC_WRITE_MASK) { |
2505 | mark_page_dirty(vcpu->kvm, gfn); | 2605 | mark_page_dirty(vcpu->kvm, gfn); |
2606 | spte |= shadow_dirty_mask; | ||
2607 | } | ||
2506 | 2608 | ||
2507 | set_pte: | 2609 | set_pte: |
2508 | if (mmu_spte_update(sptep, spte)) | 2610 | if (mmu_spte_update(sptep, spte)) |
@@ -2818,6 +2920,18 @@ fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, | |||
2818 | */ | 2920 | */ |
2819 | gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); | 2921 | gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); |
2820 | 2922 | ||
2923 | /* | ||
2924 | * Theoretically we could also set dirty bit (and flush TLB) here in | ||
2925 | * order to eliminate unnecessary PML logging. See comments in | ||
2926 | * set_spte. But fast_page_fault is very unlikely to happen with PML | ||
2927 | * enabled, so we do not do this. This might result in the same GPA | ||
2928 | * to be logged in PML buffer again when the write really happens, and | ||
2929 | * eventually to be called by mark_page_dirty twice. But it's also no | ||
2930 | * harm. This also avoids the TLB flush needed after setting dirty bit | ||
2931 | * so non-PML cases won't be impacted. | ||
2932 | * | ||
2933 | * Compare with set_spte where instead shadow_dirty_mask is set. | ||
2934 | */ | ||
2821 | if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) == spte) | 2935 | if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) == spte) |
2822 | mark_page_dirty(vcpu->kvm, gfn); | 2936 | mark_page_dirty(vcpu->kvm, gfn); |
2823 | 2937 | ||
@@ -3041,7 +3155,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) | |||
3041 | for (i = 0; i < 4; ++i) { | 3155 | for (i = 0; i < 4; ++i) { |
3042 | hpa_t root = vcpu->arch.mmu.pae_root[i]; | 3156 | hpa_t root = vcpu->arch.mmu.pae_root[i]; |
3043 | 3157 | ||
3044 | ASSERT(!VALID_PAGE(root)); | 3158 | MMU_WARN_ON(VALID_PAGE(root)); |
3045 | spin_lock(&vcpu->kvm->mmu_lock); | 3159 | spin_lock(&vcpu->kvm->mmu_lock); |
3046 | make_mmu_pages_available(vcpu); | 3160 | make_mmu_pages_available(vcpu); |
3047 | sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT), | 3161 | sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT), |
@@ -3079,7 +3193,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) | |||
3079 | if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { | 3193 | if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { |
3080 | hpa_t root = vcpu->arch.mmu.root_hpa; | 3194 | hpa_t root = vcpu->arch.mmu.root_hpa; |
3081 | 3195 | ||
3082 | ASSERT(!VALID_PAGE(root)); | 3196 | MMU_WARN_ON(VALID_PAGE(root)); |
3083 | 3197 | ||
3084 | spin_lock(&vcpu->kvm->mmu_lock); | 3198 | spin_lock(&vcpu->kvm->mmu_lock); |
3085 | make_mmu_pages_available(vcpu); | 3199 | make_mmu_pages_available(vcpu); |
@@ -3104,7 +3218,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) | |||
3104 | for (i = 0; i < 4; ++i) { | 3218 | for (i = 0; i < 4; ++i) { |
3105 | hpa_t root = vcpu->arch.mmu.pae_root[i]; | 3219 | hpa_t root = vcpu->arch.mmu.pae_root[i]; |
3106 | 3220 | ||
3107 | ASSERT(!VALID_PAGE(root)); | 3221 | MMU_WARN_ON(VALID_PAGE(root)); |
3108 | if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { | 3222 | if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { |
3109 | pdptr = vcpu->arch.mmu.get_pdptr(vcpu, i); | 3223 | pdptr = vcpu->arch.mmu.get_pdptr(vcpu, i); |
3110 | if (!is_present_gpte(pdptr)) { | 3224 | if (!is_present_gpte(pdptr)) { |
@@ -3329,8 +3443,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, | |||
3329 | if (r) | 3443 | if (r) |
3330 | return r; | 3444 | return r; |
3331 | 3445 | ||
3332 | ASSERT(vcpu); | 3446 | MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); |
3333 | ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); | ||
3334 | 3447 | ||
3335 | gfn = gva >> PAGE_SHIFT; | 3448 | gfn = gva >> PAGE_SHIFT; |
3336 | 3449 | ||
@@ -3396,8 +3509,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, | |||
3396 | int write = error_code & PFERR_WRITE_MASK; | 3509 | int write = error_code & PFERR_WRITE_MASK; |
3397 | bool map_writable; | 3510 | bool map_writable; |
3398 | 3511 | ||
3399 | ASSERT(vcpu); | 3512 | MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); |
3400 | ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); | ||
3401 | 3513 | ||
3402 | if (unlikely(error_code & PFERR_RSVD_MASK)) { | 3514 | if (unlikely(error_code & PFERR_RSVD_MASK)) { |
3403 | r = handle_mmio_page_fault(vcpu, gpa, error_code, true); | 3515 | r = handle_mmio_page_fault(vcpu, gpa, error_code, true); |
@@ -3718,7 +3830,7 @@ static void paging64_init_context_common(struct kvm_vcpu *vcpu, | |||
3718 | update_permission_bitmask(vcpu, context, false); | 3830 | update_permission_bitmask(vcpu, context, false); |
3719 | update_last_pte_bitmap(vcpu, context); | 3831 | update_last_pte_bitmap(vcpu, context); |
3720 | 3832 | ||
3721 | ASSERT(is_pae(vcpu)); | 3833 | MMU_WARN_ON(!is_pae(vcpu)); |
3722 | context->page_fault = paging64_page_fault; | 3834 | context->page_fault = paging64_page_fault; |
3723 | context->gva_to_gpa = paging64_gva_to_gpa; | 3835 | context->gva_to_gpa = paging64_gva_to_gpa; |
3724 | context->sync_page = paging64_sync_page; | 3836 | context->sync_page = paging64_sync_page; |
@@ -3763,7 +3875,7 @@ static void paging32E_init_context(struct kvm_vcpu *vcpu, | |||
3763 | 3875 | ||
3764 | static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) | 3876 | static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) |
3765 | { | 3877 | { |
3766 | struct kvm_mmu *context = vcpu->arch.walk_mmu; | 3878 | struct kvm_mmu *context = &vcpu->arch.mmu; |
3767 | 3879 | ||
3768 | context->base_role.word = 0; | 3880 | context->base_role.word = 0; |
3769 | context->page_fault = tdp_page_fault; | 3881 | context->page_fault = tdp_page_fault; |
@@ -3803,11 +3915,12 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) | |||
3803 | update_last_pte_bitmap(vcpu, context); | 3915 | update_last_pte_bitmap(vcpu, context); |
3804 | } | 3916 | } |
3805 | 3917 | ||
3806 | void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context) | 3918 | void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu) |
3807 | { | 3919 | { |
3808 | bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP); | 3920 | bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP); |
3809 | ASSERT(vcpu); | 3921 | struct kvm_mmu *context = &vcpu->arch.mmu; |
3810 | ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); | 3922 | |
3923 | MMU_WARN_ON(VALID_PAGE(context->root_hpa)); | ||
3811 | 3924 | ||
3812 | if (!is_paging(vcpu)) | 3925 | if (!is_paging(vcpu)) |
3813 | nonpaging_init_context(vcpu, context); | 3926 | nonpaging_init_context(vcpu, context); |
@@ -3818,19 +3931,19 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context) | |||
3818 | else | 3931 | else |
3819 | paging32_init_context(vcpu, context); | 3932 | paging32_init_context(vcpu, context); |
3820 | 3933 | ||
3821 | vcpu->arch.mmu.base_role.nxe = is_nx(vcpu); | 3934 | context->base_role.nxe = is_nx(vcpu); |
3822 | vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu); | 3935 | context->base_role.cr4_pae = !!is_pae(vcpu); |
3823 | vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu); | 3936 | context->base_role.cr0_wp = is_write_protection(vcpu); |
3824 | vcpu->arch.mmu.base_role.smep_andnot_wp | 3937 | context->base_role.smep_andnot_wp |
3825 | = smep && !is_write_protection(vcpu); | 3938 | = smep && !is_write_protection(vcpu); |
3826 | } | 3939 | } |
3827 | EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); | 3940 | EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); |
3828 | 3941 | ||
3829 | void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context, | 3942 | void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly) |
3830 | bool execonly) | ||
3831 | { | 3943 | { |
3832 | ASSERT(vcpu); | 3944 | struct kvm_mmu *context = &vcpu->arch.mmu; |
3833 | ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); | 3945 | |
3946 | MMU_WARN_ON(VALID_PAGE(context->root_hpa)); | ||
3834 | 3947 | ||
3835 | context->shadow_root_level = kvm_x86_ops->get_tdp_level(); | 3948 | context->shadow_root_level = kvm_x86_ops->get_tdp_level(); |
3836 | 3949 | ||
@@ -3851,11 +3964,13 @@ EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu); | |||
3851 | 3964 | ||
3852 | static void init_kvm_softmmu(struct kvm_vcpu *vcpu) | 3965 | static void init_kvm_softmmu(struct kvm_vcpu *vcpu) |
3853 | { | 3966 | { |
3854 | kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu); | 3967 | struct kvm_mmu *context = &vcpu->arch.mmu; |
3855 | vcpu->arch.walk_mmu->set_cr3 = kvm_x86_ops->set_cr3; | 3968 | |
3856 | vcpu->arch.walk_mmu->get_cr3 = get_cr3; | 3969 | kvm_init_shadow_mmu(vcpu); |
3857 | vcpu->arch.walk_mmu->get_pdptr = kvm_pdptr_read; | 3970 | context->set_cr3 = kvm_x86_ops->set_cr3; |
3858 | vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; | 3971 | context->get_cr3 = get_cr3; |
3972 | context->get_pdptr = kvm_pdptr_read; | ||
3973 | context->inject_page_fault = kvm_inject_page_fault; | ||
3859 | } | 3974 | } |
3860 | 3975 | ||
3861 | static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) | 3976 | static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) |
@@ -3900,17 +4015,15 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) | |||
3900 | static void init_kvm_mmu(struct kvm_vcpu *vcpu) | 4015 | static void init_kvm_mmu(struct kvm_vcpu *vcpu) |
3901 | { | 4016 | { |
3902 | if (mmu_is_nested(vcpu)) | 4017 | if (mmu_is_nested(vcpu)) |
3903 | return init_kvm_nested_mmu(vcpu); | 4018 | init_kvm_nested_mmu(vcpu); |
3904 | else if (tdp_enabled) | 4019 | else if (tdp_enabled) |
3905 | return init_kvm_tdp_mmu(vcpu); | 4020 | init_kvm_tdp_mmu(vcpu); |
3906 | else | 4021 | else |
3907 | return init_kvm_softmmu(vcpu); | 4022 | init_kvm_softmmu(vcpu); |
3908 | } | 4023 | } |
3909 | 4024 | ||
3910 | void kvm_mmu_reset_context(struct kvm_vcpu *vcpu) | 4025 | void kvm_mmu_reset_context(struct kvm_vcpu *vcpu) |
3911 | { | 4026 | { |
3912 | ASSERT(vcpu); | ||
3913 | |||
3914 | kvm_mmu_unload(vcpu); | 4027 | kvm_mmu_unload(vcpu); |
3915 | init_kvm_mmu(vcpu); | 4028 | init_kvm_mmu(vcpu); |
3916 | } | 4029 | } |
@@ -4266,8 +4379,6 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu) | |||
4266 | struct page *page; | 4379 | struct page *page; |
4267 | int i; | 4380 | int i; |
4268 | 4381 | ||
4269 | ASSERT(vcpu); | ||
4270 | |||
4271 | /* | 4382 | /* |
4272 | * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64. | 4383 | * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64. |
4273 | * Therefore we need to allocate shadow page tables in the first | 4384 | * Therefore we need to allocate shadow page tables in the first |
@@ -4286,8 +4397,6 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu) | |||
4286 | 4397 | ||
4287 | int kvm_mmu_create(struct kvm_vcpu *vcpu) | 4398 | int kvm_mmu_create(struct kvm_vcpu *vcpu) |
4288 | { | 4399 | { |
4289 | ASSERT(vcpu); | ||
4290 | |||
4291 | vcpu->arch.walk_mmu = &vcpu->arch.mmu; | 4400 | vcpu->arch.walk_mmu = &vcpu->arch.mmu; |
4292 | vcpu->arch.mmu.root_hpa = INVALID_PAGE; | 4401 | vcpu->arch.mmu.root_hpa = INVALID_PAGE; |
4293 | vcpu->arch.mmu.translate_gpa = translate_gpa; | 4402 | vcpu->arch.mmu.translate_gpa = translate_gpa; |
@@ -4298,19 +4407,18 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu) | |||
4298 | 4407 | ||
4299 | void kvm_mmu_setup(struct kvm_vcpu *vcpu) | 4408 | void kvm_mmu_setup(struct kvm_vcpu *vcpu) |
4300 | { | 4409 | { |
4301 | ASSERT(vcpu); | 4410 | MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa)); |
4302 | ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); | ||
4303 | 4411 | ||
4304 | init_kvm_mmu(vcpu); | 4412 | init_kvm_mmu(vcpu); |
4305 | } | 4413 | } |
4306 | 4414 | ||
4307 | void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) | 4415 | void kvm_mmu_slot_remove_write_access(struct kvm *kvm, |
4416 | struct kvm_memory_slot *memslot) | ||
4308 | { | 4417 | { |
4309 | struct kvm_memory_slot *memslot; | ||
4310 | gfn_t last_gfn; | 4418 | gfn_t last_gfn; |
4311 | int i; | 4419 | int i; |
4420 | bool flush = false; | ||
4312 | 4421 | ||
4313 | memslot = id_to_memslot(kvm->memslots, slot); | ||
4314 | last_gfn = memslot->base_gfn + memslot->npages - 1; | 4422 | last_gfn = memslot->base_gfn + memslot->npages - 1; |
4315 | 4423 | ||
4316 | spin_lock(&kvm->mmu_lock); | 4424 | spin_lock(&kvm->mmu_lock); |
@@ -4325,7 +4433,8 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) | |||
4325 | 4433 | ||
4326 | for (index = 0; index <= last_index; ++index, ++rmapp) { | 4434 | for (index = 0; index <= last_index; ++index, ++rmapp) { |
4327 | if (*rmapp) | 4435 | if (*rmapp) |
4328 | __rmap_write_protect(kvm, rmapp, false); | 4436 | flush |= __rmap_write_protect(kvm, rmapp, |
4437 | false); | ||
4329 | 4438 | ||
4330 | if (need_resched() || spin_needbreak(&kvm->mmu_lock)) | 4439 | if (need_resched() || spin_needbreak(&kvm->mmu_lock)) |
4331 | cond_resched_lock(&kvm->mmu_lock); | 4440 | cond_resched_lock(&kvm->mmu_lock); |
@@ -4352,8 +4461,124 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) | |||
4352 | * instead of PT_WRITABLE_MASK, that means it does not depend | 4461 | * instead of PT_WRITABLE_MASK, that means it does not depend |
4353 | * on PT_WRITABLE_MASK anymore. | 4462 | * on PT_WRITABLE_MASK anymore. |
4354 | */ | 4463 | */ |
4355 | kvm_flush_remote_tlbs(kvm); | 4464 | if (flush) |
4465 | kvm_flush_remote_tlbs(kvm); | ||
4466 | } | ||
4467 | |||
4468 | void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, | ||
4469 | struct kvm_memory_slot *memslot) | ||
4470 | { | ||
4471 | gfn_t last_gfn; | ||
4472 | unsigned long *rmapp; | ||
4473 | unsigned long last_index, index; | ||
4474 | bool flush = false; | ||
4475 | |||
4476 | last_gfn = memslot->base_gfn + memslot->npages - 1; | ||
4477 | |||
4478 | spin_lock(&kvm->mmu_lock); | ||
4479 | |||
4480 | rmapp = memslot->arch.rmap[PT_PAGE_TABLE_LEVEL - 1]; | ||
4481 | last_index = gfn_to_index(last_gfn, memslot->base_gfn, | ||
4482 | PT_PAGE_TABLE_LEVEL); | ||
4483 | |||
4484 | for (index = 0; index <= last_index; ++index, ++rmapp) { | ||
4485 | if (*rmapp) | ||
4486 | flush |= __rmap_clear_dirty(kvm, rmapp); | ||
4487 | |||
4488 | if (need_resched() || spin_needbreak(&kvm->mmu_lock)) | ||
4489 | cond_resched_lock(&kvm->mmu_lock); | ||
4490 | } | ||
4491 | |||
4492 | spin_unlock(&kvm->mmu_lock); | ||
4493 | |||
4494 | lockdep_assert_held(&kvm->slots_lock); | ||
4495 | |||
4496 | /* | ||
4497 | * It's also safe to flush TLBs out of mmu lock here as currently this | ||
4498 | * function is only used for dirty logging, in which case flushing TLB | ||
4499 | * out of mmu lock also guarantees no dirty pages will be lost in | ||
4500 | * dirty_bitmap. | ||
4501 | */ | ||
4502 | if (flush) | ||
4503 | kvm_flush_remote_tlbs(kvm); | ||
4504 | } | ||
4505 | EXPORT_SYMBOL_GPL(kvm_mmu_slot_leaf_clear_dirty); | ||
4506 | |||
4507 | void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm, | ||
4508 | struct kvm_memory_slot *memslot) | ||
4509 | { | ||
4510 | gfn_t last_gfn; | ||
4511 | int i; | ||
4512 | bool flush = false; | ||
4513 | |||
4514 | last_gfn = memslot->base_gfn + memslot->npages - 1; | ||
4515 | |||
4516 | spin_lock(&kvm->mmu_lock); | ||
4517 | |||
4518 | for (i = PT_PAGE_TABLE_LEVEL + 1; /* skip rmap for 4K page */ | ||
4519 | i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { | ||
4520 | unsigned long *rmapp; | ||
4521 | unsigned long last_index, index; | ||
4522 | |||
4523 | rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL]; | ||
4524 | last_index = gfn_to_index(last_gfn, memslot->base_gfn, i); | ||
4525 | |||
4526 | for (index = 0; index <= last_index; ++index, ++rmapp) { | ||
4527 | if (*rmapp) | ||
4528 | flush |= __rmap_write_protect(kvm, rmapp, | ||
4529 | false); | ||
4530 | |||
4531 | if (need_resched() || spin_needbreak(&kvm->mmu_lock)) | ||
4532 | cond_resched_lock(&kvm->mmu_lock); | ||
4533 | } | ||
4534 | } | ||
4535 | spin_unlock(&kvm->mmu_lock); | ||
4536 | |||
4537 | /* see kvm_mmu_slot_remove_write_access */ | ||
4538 | lockdep_assert_held(&kvm->slots_lock); | ||
4539 | |||
4540 | if (flush) | ||
4541 | kvm_flush_remote_tlbs(kvm); | ||
4542 | } | ||
4543 | EXPORT_SYMBOL_GPL(kvm_mmu_slot_largepage_remove_write_access); | ||
4544 | |||
4545 | void kvm_mmu_slot_set_dirty(struct kvm *kvm, | ||
4546 | struct kvm_memory_slot *memslot) | ||
4547 | { | ||
4548 | gfn_t last_gfn; | ||
4549 | int i; | ||
4550 | bool flush = false; | ||
4551 | |||
4552 | last_gfn = memslot->base_gfn + memslot->npages - 1; | ||
4553 | |||
4554 | spin_lock(&kvm->mmu_lock); | ||
4555 | |||
4556 | for (i = PT_PAGE_TABLE_LEVEL; | ||
4557 | i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { | ||
4558 | unsigned long *rmapp; | ||
4559 | unsigned long last_index, index; | ||
4560 | |||
4561 | rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL]; | ||
4562 | last_index = gfn_to_index(last_gfn, memslot->base_gfn, i); | ||
4563 | |||
4564 | for (index = 0; index <= last_index; ++index, ++rmapp) { | ||
4565 | if (*rmapp) | ||
4566 | flush |= __rmap_set_dirty(kvm, rmapp); | ||
4567 | |||
4568 | if (need_resched() || spin_needbreak(&kvm->mmu_lock)) | ||
4569 | cond_resched_lock(&kvm->mmu_lock); | ||
4570 | } | ||
4571 | } | ||
4572 | |||
4573 | spin_unlock(&kvm->mmu_lock); | ||
4574 | |||
4575 | lockdep_assert_held(&kvm->slots_lock); | ||
4576 | |||
4577 | /* see kvm_mmu_slot_leaf_clear_dirty */ | ||
4578 | if (flush) | ||
4579 | kvm_flush_remote_tlbs(kvm); | ||
4356 | } | 4580 | } |
4581 | EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty); | ||
4357 | 4582 | ||
4358 | #define BATCH_ZAP_PAGES 10 | 4583 | #define BATCH_ZAP_PAGES 10 |
4359 | static void kvm_zap_obsolete_pages(struct kvm *kvm) | 4584 | static void kvm_zap_obsolete_pages(struct kvm *kvm) |
@@ -4606,8 +4831,6 @@ EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy); | |||
4606 | 4831 | ||
4607 | void kvm_mmu_destroy(struct kvm_vcpu *vcpu) | 4832 | void kvm_mmu_destroy(struct kvm_vcpu *vcpu) |
4608 | { | 4833 | { |
4609 | ASSERT(vcpu); | ||
4610 | |||
4611 | kvm_mmu_unload(vcpu); | 4834 | kvm_mmu_unload(vcpu); |
4612 | free_mmu_pages(vcpu); | 4835 | free_mmu_pages(vcpu); |
4613 | mmu_free_memory_caches(vcpu); | 4836 | mmu_free_memory_caches(vcpu); |
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index bde8ee725754..c7d65637c851 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h | |||
@@ -44,18 +44,6 @@ | |||
44 | #define PT_DIRECTORY_LEVEL 2 | 44 | #define PT_DIRECTORY_LEVEL 2 |
45 | #define PT_PAGE_TABLE_LEVEL 1 | 45 | #define PT_PAGE_TABLE_LEVEL 1 |
46 | 46 | ||
47 | #define PFERR_PRESENT_BIT 0 | ||
48 | #define PFERR_WRITE_BIT 1 | ||
49 | #define PFERR_USER_BIT 2 | ||
50 | #define PFERR_RSVD_BIT 3 | ||
51 | #define PFERR_FETCH_BIT 4 | ||
52 | |||
53 | #define PFERR_PRESENT_MASK (1U << PFERR_PRESENT_BIT) | ||
54 | #define PFERR_WRITE_MASK (1U << PFERR_WRITE_BIT) | ||
55 | #define PFERR_USER_MASK (1U << PFERR_USER_BIT) | ||
56 | #define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT) | ||
57 | #define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT) | ||
58 | |||
59 | static inline u64 rsvd_bits(int s, int e) | 47 | static inline u64 rsvd_bits(int s, int e) |
60 | { | 48 | { |
61 | return ((1ULL << (e - s + 1)) - 1) << s; | 49 | return ((1ULL << (e - s + 1)) - 1) << s; |
@@ -81,9 +69,8 @@ enum { | |||
81 | }; | 69 | }; |
82 | 70 | ||
83 | int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct); | 71 | int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct); |
84 | void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context); | 72 | void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu); |
85 | void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context, | 73 | void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly); |
86 | bool execonly); | ||
87 | void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, | 74 | void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, |
88 | bool ept); | 75 | bool ept); |
89 | 76 | ||
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 41dd0387cccb..a17d848c6d42 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c | |||
@@ -2003,8 +2003,8 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu, | |||
2003 | 2003 | ||
2004 | static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu) | 2004 | static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu) |
2005 | { | 2005 | { |
2006 | kvm_init_shadow_mmu(vcpu, &vcpu->arch.mmu); | 2006 | WARN_ON(mmu_is_nested(vcpu)); |
2007 | 2007 | kvm_init_shadow_mmu(vcpu); | |
2008 | vcpu->arch.mmu.set_cr3 = nested_svm_set_tdp_cr3; | 2008 | vcpu->arch.mmu.set_cr3 = nested_svm_set_tdp_cr3; |
2009 | vcpu->arch.mmu.get_cr3 = nested_svm_get_tdp_cr3; | 2009 | vcpu->arch.mmu.get_cr3 = nested_svm_get_tdp_cr3; |
2010 | vcpu->arch.mmu.get_pdptr = nested_svm_get_tdp_pdptr; | 2010 | vcpu->arch.mmu.get_pdptr = nested_svm_get_tdp_pdptr; |
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index c2a34bb5ad93..7c7bc8bef21f 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h | |||
@@ -848,6 +848,24 @@ TRACE_EVENT(kvm_track_tsc, | |||
848 | 848 | ||
849 | #endif /* CONFIG_X86_64 */ | 849 | #endif /* CONFIG_X86_64 */ |
850 | 850 | ||
851 | /* | ||
852 | * Tracepoint for PML full VMEXIT. | ||
853 | */ | ||
854 | TRACE_EVENT(kvm_pml_full, | ||
855 | TP_PROTO(unsigned int vcpu_id), | ||
856 | TP_ARGS(vcpu_id), | ||
857 | |||
858 | TP_STRUCT__entry( | ||
859 | __field( unsigned int, vcpu_id ) | ||
860 | ), | ||
861 | |||
862 | TP_fast_assign( | ||
863 | __entry->vcpu_id = vcpu_id; | ||
864 | ), | ||
865 | |||
866 | TP_printk("vcpu %d: PML full", __entry->vcpu_id) | ||
867 | ); | ||
868 | |||
851 | TRACE_EVENT(kvm_ple_window, | 869 | TRACE_EVENT(kvm_ple_window, |
852 | TP_PROTO(bool grow, unsigned int vcpu_id, int new, int old), | 870 | TP_PROTO(bool grow, unsigned int vcpu_id, int new, int old), |
853 | TP_ARGS(grow, vcpu_id, new, old), | 871 | TP_ARGS(grow, vcpu_id, new, old), |
@@ -914,6 +932,26 @@ TRACE_EVENT(kvm_pvclock_update, | |||
914 | __entry->flags) | 932 | __entry->flags) |
915 | ); | 933 | ); |
916 | 934 | ||
935 | TRACE_EVENT(kvm_wait_lapic_expire, | ||
936 | TP_PROTO(unsigned int vcpu_id, s64 delta), | ||
937 | TP_ARGS(vcpu_id, delta), | ||
938 | |||
939 | TP_STRUCT__entry( | ||
940 | __field( unsigned int, vcpu_id ) | ||
941 | __field( s64, delta ) | ||
942 | ), | ||
943 | |||
944 | TP_fast_assign( | ||
945 | __entry->vcpu_id = vcpu_id; | ||
946 | __entry->delta = delta; | ||
947 | ), | ||
948 | |||
949 | TP_printk("vcpu %u: delta %lld (%s)", | ||
950 | __entry->vcpu_id, | ||
951 | __entry->delta, | ||
952 | __entry->delta < 0 ? "early" : "late") | ||
953 | ); | ||
954 | |||
917 | #endif /* _TRACE_KVM_H */ | 955 | #endif /* _TRACE_KVM_H */ |
918 | 956 | ||
919 | #undef TRACE_INCLUDE_PATH | 957 | #undef TRACE_INCLUDE_PATH |
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d4c58d884838..3f73bfad0349 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c | |||
@@ -45,6 +45,7 @@ | |||
45 | #include <asm/perf_event.h> | 45 | #include <asm/perf_event.h> |
46 | #include <asm/debugreg.h> | 46 | #include <asm/debugreg.h> |
47 | #include <asm/kexec.h> | 47 | #include <asm/kexec.h> |
48 | #include <asm/apic.h> | ||
48 | 49 | ||
49 | #include "trace.h" | 50 | #include "trace.h" |
50 | 51 | ||
@@ -101,6 +102,9 @@ module_param(nested, bool, S_IRUGO); | |||
101 | 102 | ||
102 | static u64 __read_mostly host_xss; | 103 | static u64 __read_mostly host_xss; |
103 | 104 | ||
105 | static bool __read_mostly enable_pml = 1; | ||
106 | module_param_named(pml, enable_pml, bool, S_IRUGO); | ||
107 | |||
104 | #define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD) | 108 | #define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD) |
105 | #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE) | 109 | #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE) |
106 | #define KVM_VM_CR0_ALWAYS_ON \ | 110 | #define KVM_VM_CR0_ALWAYS_ON \ |
@@ -215,7 +219,12 @@ struct __packed vmcs12 { | |||
215 | u64 tsc_offset; | 219 | u64 tsc_offset; |
216 | u64 virtual_apic_page_addr; | 220 | u64 virtual_apic_page_addr; |
217 | u64 apic_access_addr; | 221 | u64 apic_access_addr; |
222 | u64 posted_intr_desc_addr; | ||
218 | u64 ept_pointer; | 223 | u64 ept_pointer; |
224 | u64 eoi_exit_bitmap0; | ||
225 | u64 eoi_exit_bitmap1; | ||
226 | u64 eoi_exit_bitmap2; | ||
227 | u64 eoi_exit_bitmap3; | ||
219 | u64 xss_exit_bitmap; | 228 | u64 xss_exit_bitmap; |
220 | u64 guest_physical_address; | 229 | u64 guest_physical_address; |
221 | u64 vmcs_link_pointer; | 230 | u64 vmcs_link_pointer; |
@@ -330,6 +339,7 @@ struct __packed vmcs12 { | |||
330 | u32 vmx_preemption_timer_value; | 339 | u32 vmx_preemption_timer_value; |
331 | u32 padding32[7]; /* room for future expansion */ | 340 | u32 padding32[7]; /* room for future expansion */ |
332 | u16 virtual_processor_id; | 341 | u16 virtual_processor_id; |
342 | u16 posted_intr_nv; | ||
333 | u16 guest_es_selector; | 343 | u16 guest_es_selector; |
334 | u16 guest_cs_selector; | 344 | u16 guest_cs_selector; |
335 | u16 guest_ss_selector; | 345 | u16 guest_ss_selector; |
@@ -338,6 +348,7 @@ struct __packed vmcs12 { | |||
338 | u16 guest_gs_selector; | 348 | u16 guest_gs_selector; |
339 | u16 guest_ldtr_selector; | 349 | u16 guest_ldtr_selector; |
340 | u16 guest_tr_selector; | 350 | u16 guest_tr_selector; |
351 | u16 guest_intr_status; | ||
341 | u16 host_es_selector; | 352 | u16 host_es_selector; |
342 | u16 host_cs_selector; | 353 | u16 host_cs_selector; |
343 | u16 host_ss_selector; | 354 | u16 host_ss_selector; |
@@ -401,6 +412,10 @@ struct nested_vmx { | |||
401 | */ | 412 | */ |
402 | struct page *apic_access_page; | 413 | struct page *apic_access_page; |
403 | struct page *virtual_apic_page; | 414 | struct page *virtual_apic_page; |
415 | struct page *pi_desc_page; | ||
416 | struct pi_desc *pi_desc; | ||
417 | bool pi_pending; | ||
418 | u16 posted_intr_nv; | ||
404 | u64 msr_ia32_feature_control; | 419 | u64 msr_ia32_feature_control; |
405 | 420 | ||
406 | struct hrtimer preemption_timer; | 421 | struct hrtimer preemption_timer; |
@@ -408,6 +423,23 @@ struct nested_vmx { | |||
408 | 423 | ||
409 | /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */ | 424 | /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */ |
410 | u64 vmcs01_debugctl; | 425 | u64 vmcs01_debugctl; |
426 | |||
427 | u32 nested_vmx_procbased_ctls_low; | ||
428 | u32 nested_vmx_procbased_ctls_high; | ||
429 | u32 nested_vmx_true_procbased_ctls_low; | ||
430 | u32 nested_vmx_secondary_ctls_low; | ||
431 | u32 nested_vmx_secondary_ctls_high; | ||
432 | u32 nested_vmx_pinbased_ctls_low; | ||
433 | u32 nested_vmx_pinbased_ctls_high; | ||
434 | u32 nested_vmx_exit_ctls_low; | ||
435 | u32 nested_vmx_exit_ctls_high; | ||
436 | u32 nested_vmx_true_exit_ctls_low; | ||
437 | u32 nested_vmx_entry_ctls_low; | ||
438 | u32 nested_vmx_entry_ctls_high; | ||
439 | u32 nested_vmx_true_entry_ctls_low; | ||
440 | u32 nested_vmx_misc_low; | ||
441 | u32 nested_vmx_misc_high; | ||
442 | u32 nested_vmx_ept_caps; | ||
411 | }; | 443 | }; |
412 | 444 | ||
413 | #define POSTED_INTR_ON 0 | 445 | #define POSTED_INTR_ON 0 |
@@ -511,6 +543,10 @@ struct vcpu_vmx { | |||
511 | /* Dynamic PLE window. */ | 543 | /* Dynamic PLE window. */ |
512 | int ple_window; | 544 | int ple_window; |
513 | bool ple_window_dirty; | 545 | bool ple_window_dirty; |
546 | |||
547 | /* Support for PML */ | ||
548 | #define PML_ENTITY_NUM 512 | ||
549 | struct page *pml_pg; | ||
514 | }; | 550 | }; |
515 | 551 | ||
516 | enum segment_cache_field { | 552 | enum segment_cache_field { |
@@ -594,6 +630,7 @@ static int max_shadow_read_write_fields = | |||
594 | 630 | ||
595 | static const unsigned short vmcs_field_to_offset_table[] = { | 631 | static const unsigned short vmcs_field_to_offset_table[] = { |
596 | FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id), | 632 | FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id), |
633 | FIELD(POSTED_INTR_NV, posted_intr_nv), | ||
597 | FIELD(GUEST_ES_SELECTOR, guest_es_selector), | 634 | FIELD(GUEST_ES_SELECTOR, guest_es_selector), |
598 | FIELD(GUEST_CS_SELECTOR, guest_cs_selector), | 635 | FIELD(GUEST_CS_SELECTOR, guest_cs_selector), |
599 | FIELD(GUEST_SS_SELECTOR, guest_ss_selector), | 636 | FIELD(GUEST_SS_SELECTOR, guest_ss_selector), |
@@ -602,6 +639,7 @@ static const unsigned short vmcs_field_to_offset_table[] = { | |||
602 | FIELD(GUEST_GS_SELECTOR, guest_gs_selector), | 639 | FIELD(GUEST_GS_SELECTOR, guest_gs_selector), |
603 | FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector), | 640 | FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector), |
604 | FIELD(GUEST_TR_SELECTOR, guest_tr_selector), | 641 | FIELD(GUEST_TR_SELECTOR, guest_tr_selector), |
642 | FIELD(GUEST_INTR_STATUS, guest_intr_status), | ||
605 | FIELD(HOST_ES_SELECTOR, host_es_selector), | 643 | FIELD(HOST_ES_SELECTOR, host_es_selector), |
606 | FIELD(HOST_CS_SELECTOR, host_cs_selector), | 644 | FIELD(HOST_CS_SELECTOR, host_cs_selector), |
607 | FIELD(HOST_SS_SELECTOR, host_ss_selector), | 645 | FIELD(HOST_SS_SELECTOR, host_ss_selector), |
@@ -618,7 +656,12 @@ static const unsigned short vmcs_field_to_offset_table[] = { | |||
618 | FIELD64(TSC_OFFSET, tsc_offset), | 656 | FIELD64(TSC_OFFSET, tsc_offset), |
619 | FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr), | 657 | FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr), |
620 | FIELD64(APIC_ACCESS_ADDR, apic_access_addr), | 658 | FIELD64(APIC_ACCESS_ADDR, apic_access_addr), |
659 | FIELD64(POSTED_INTR_DESC_ADDR, posted_intr_desc_addr), | ||
621 | FIELD64(EPT_POINTER, ept_pointer), | 660 | FIELD64(EPT_POINTER, ept_pointer), |
661 | FIELD64(EOI_EXIT_BITMAP0, eoi_exit_bitmap0), | ||
662 | FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1), | ||
663 | FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2), | ||
664 | FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3), | ||
622 | FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap), | 665 | FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap), |
623 | FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address), | 666 | FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address), |
624 | FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer), | 667 | FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer), |
@@ -766,6 +809,7 @@ static void kvm_cpu_vmxon(u64 addr); | |||
766 | static void kvm_cpu_vmxoff(void); | 809 | static void kvm_cpu_vmxoff(void); |
767 | static bool vmx_mpx_supported(void); | 810 | static bool vmx_mpx_supported(void); |
768 | static bool vmx_xsaves_supported(void); | 811 | static bool vmx_xsaves_supported(void); |
812 | static int vmx_vm_has_apicv(struct kvm *kvm); | ||
769 | static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); | 813 | static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); |
770 | static void vmx_set_segment(struct kvm_vcpu *vcpu, | 814 | static void vmx_set_segment(struct kvm_vcpu *vcpu, |
771 | struct kvm_segment *var, int seg); | 815 | struct kvm_segment *var, int seg); |
@@ -793,6 +837,7 @@ static unsigned long *vmx_msr_bitmap_legacy; | |||
793 | static unsigned long *vmx_msr_bitmap_longmode; | 837 | static unsigned long *vmx_msr_bitmap_longmode; |
794 | static unsigned long *vmx_msr_bitmap_legacy_x2apic; | 838 | static unsigned long *vmx_msr_bitmap_legacy_x2apic; |
795 | static unsigned long *vmx_msr_bitmap_longmode_x2apic; | 839 | static unsigned long *vmx_msr_bitmap_longmode_x2apic; |
840 | static unsigned long *vmx_msr_bitmap_nested; | ||
796 | static unsigned long *vmx_vmread_bitmap; | 841 | static unsigned long *vmx_vmread_bitmap; |
797 | static unsigned long *vmx_vmwrite_bitmap; | 842 | static unsigned long *vmx_vmwrite_bitmap; |
798 | 843 | ||
@@ -959,16 +1004,6 @@ static inline bool cpu_has_vmx_ept_execute_only(void) | |||
959 | return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT; | 1004 | return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT; |
960 | } | 1005 | } |
961 | 1006 | ||
962 | static inline bool cpu_has_vmx_eptp_uncacheable(void) | ||
963 | { | ||
964 | return vmx_capability.ept & VMX_EPTP_UC_BIT; | ||
965 | } | ||
966 | |||
967 | static inline bool cpu_has_vmx_eptp_writeback(void) | ||
968 | { | ||
969 | return vmx_capability.ept & VMX_EPTP_WB_BIT; | ||
970 | } | ||
971 | |||
972 | static inline bool cpu_has_vmx_ept_2m_page(void) | 1007 | static inline bool cpu_has_vmx_ept_2m_page(void) |
973 | { | 1008 | { |
974 | return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT; | 1009 | return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT; |
@@ -1073,6 +1108,11 @@ static inline bool cpu_has_vmx_shadow_vmcs(void) | |||
1073 | SECONDARY_EXEC_SHADOW_VMCS; | 1108 | SECONDARY_EXEC_SHADOW_VMCS; |
1074 | } | 1109 | } |
1075 | 1110 | ||
1111 | static inline bool cpu_has_vmx_pml(void) | ||
1112 | { | ||
1113 | return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_PML; | ||
1114 | } | ||
1115 | |||
1076 | static inline bool report_flexpriority(void) | 1116 | static inline bool report_flexpriority(void) |
1077 | { | 1117 | { |
1078 | return flexpriority_enabled; | 1118 | return flexpriority_enabled; |
@@ -1112,6 +1152,26 @@ static inline bool nested_cpu_has_xsaves(struct vmcs12 *vmcs12) | |||
1112 | vmx_xsaves_supported(); | 1152 | vmx_xsaves_supported(); |
1113 | } | 1153 | } |
1114 | 1154 | ||
1155 | static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12) | ||
1156 | { | ||
1157 | return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE); | ||
1158 | } | ||
1159 | |||
1160 | static inline bool nested_cpu_has_apic_reg_virt(struct vmcs12 *vmcs12) | ||
1161 | { | ||
1162 | return nested_cpu_has2(vmcs12, SECONDARY_EXEC_APIC_REGISTER_VIRT); | ||
1163 | } | ||
1164 | |||
1165 | static inline bool nested_cpu_has_vid(struct vmcs12 *vmcs12) | ||
1166 | { | ||
1167 | return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); | ||
1168 | } | ||
1169 | |||
1170 | static inline bool nested_cpu_has_posted_intr(struct vmcs12 *vmcs12) | ||
1171 | { | ||
1172 | return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR; | ||
1173 | } | ||
1174 | |||
1115 | static inline bool is_exception(u32 intr_info) | 1175 | static inline bool is_exception(u32 intr_info) |
1116 | { | 1176 | { |
1117 | return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) | 1177 | return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) |
@@ -2284,20 +2344,8 @@ static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu) | |||
2284 | * if the corresponding bit in the (32-bit) control field *must* be on, and a | 2344 | * if the corresponding bit in the (32-bit) control field *must* be on, and a |
2285 | * bit in the high half is on if the corresponding bit in the control field | 2345 | * bit in the high half is on if the corresponding bit in the control field |
2286 | * may be on. See also vmx_control_verify(). | 2346 | * may be on. See also vmx_control_verify(). |
2287 | * TODO: allow these variables to be modified (downgraded) by module options | ||
2288 | * or other means. | ||
2289 | */ | 2347 | */ |
2290 | static u32 nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high; | 2348 | static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) |
2291 | static u32 nested_vmx_true_procbased_ctls_low; | ||
2292 | static u32 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high; | ||
2293 | static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high; | ||
2294 | static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high; | ||
2295 | static u32 nested_vmx_true_exit_ctls_low; | ||
2296 | static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high; | ||
2297 | static u32 nested_vmx_true_entry_ctls_low; | ||
2298 | static u32 nested_vmx_misc_low, nested_vmx_misc_high; | ||
2299 | static u32 nested_vmx_ept_caps; | ||
2300 | static __init void nested_vmx_setup_ctls_msrs(void) | ||
2301 | { | 2349 | { |
2302 | /* | 2350 | /* |
2303 | * Note that as a general rule, the high half of the MSRs (bits in | 2351 | * Note that as a general rule, the high half of the MSRs (bits in |
@@ -2316,57 +2364,74 @@ static __init void nested_vmx_setup_ctls_msrs(void) | |||
2316 | 2364 | ||
2317 | /* pin-based controls */ | 2365 | /* pin-based controls */ |
2318 | rdmsr(MSR_IA32_VMX_PINBASED_CTLS, | 2366 | rdmsr(MSR_IA32_VMX_PINBASED_CTLS, |
2319 | nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high); | 2367 | vmx->nested.nested_vmx_pinbased_ctls_low, |
2320 | nested_vmx_pinbased_ctls_low |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; | 2368 | vmx->nested.nested_vmx_pinbased_ctls_high); |
2321 | nested_vmx_pinbased_ctls_high &= PIN_BASED_EXT_INTR_MASK | | 2369 | vmx->nested.nested_vmx_pinbased_ctls_low |= |
2322 | PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS; | 2370 | PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; |
2323 | nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | | 2371 | vmx->nested.nested_vmx_pinbased_ctls_high &= |
2372 | PIN_BASED_EXT_INTR_MASK | | ||
2373 | PIN_BASED_NMI_EXITING | | ||
2374 | PIN_BASED_VIRTUAL_NMIS; | ||
2375 | vmx->nested.nested_vmx_pinbased_ctls_high |= | ||
2376 | PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | | ||
2324 | PIN_BASED_VMX_PREEMPTION_TIMER; | 2377 | PIN_BASED_VMX_PREEMPTION_TIMER; |
2378 | if (vmx_vm_has_apicv(vmx->vcpu.kvm)) | ||
2379 | vmx->nested.nested_vmx_pinbased_ctls_high |= | ||
2380 | PIN_BASED_POSTED_INTR; | ||
2325 | 2381 | ||
2326 | /* exit controls */ | 2382 | /* exit controls */ |
2327 | rdmsr(MSR_IA32_VMX_EXIT_CTLS, | 2383 | rdmsr(MSR_IA32_VMX_EXIT_CTLS, |
2328 | nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high); | 2384 | vmx->nested.nested_vmx_exit_ctls_low, |
2329 | nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; | 2385 | vmx->nested.nested_vmx_exit_ctls_high); |
2386 | vmx->nested.nested_vmx_exit_ctls_low = | ||
2387 | VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; | ||
2330 | 2388 | ||
2331 | nested_vmx_exit_ctls_high &= | 2389 | vmx->nested.nested_vmx_exit_ctls_high &= |
2332 | #ifdef CONFIG_X86_64 | 2390 | #ifdef CONFIG_X86_64 |
2333 | VM_EXIT_HOST_ADDR_SPACE_SIZE | | 2391 | VM_EXIT_HOST_ADDR_SPACE_SIZE | |
2334 | #endif | 2392 | #endif |
2335 | VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT; | 2393 | VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT; |
2336 | nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | | 2394 | vmx->nested.nested_vmx_exit_ctls_high |= |
2395 | VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | | ||
2337 | VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | | 2396 | VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | |
2338 | VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT; | 2397 | VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT; |
2339 | 2398 | ||
2340 | if (vmx_mpx_supported()) | 2399 | if (vmx_mpx_supported()) |
2341 | nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS; | 2400 | vmx->nested.nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS; |
2342 | 2401 | ||
2343 | /* We support free control of debug control saving. */ | 2402 | /* We support free control of debug control saving. */ |
2344 | nested_vmx_true_exit_ctls_low = nested_vmx_exit_ctls_low & | 2403 | vmx->nested.nested_vmx_true_exit_ctls_low = |
2404 | vmx->nested.nested_vmx_exit_ctls_low & | ||
2345 | ~VM_EXIT_SAVE_DEBUG_CONTROLS; | 2405 | ~VM_EXIT_SAVE_DEBUG_CONTROLS; |
2346 | 2406 | ||
2347 | /* entry controls */ | 2407 | /* entry controls */ |
2348 | rdmsr(MSR_IA32_VMX_ENTRY_CTLS, | 2408 | rdmsr(MSR_IA32_VMX_ENTRY_CTLS, |
2349 | nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high); | 2409 | vmx->nested.nested_vmx_entry_ctls_low, |
2350 | nested_vmx_entry_ctls_low = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; | 2410 | vmx->nested.nested_vmx_entry_ctls_high); |
2351 | nested_vmx_entry_ctls_high &= | 2411 | vmx->nested.nested_vmx_entry_ctls_low = |
2412 | VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; | ||
2413 | vmx->nested.nested_vmx_entry_ctls_high &= | ||
2352 | #ifdef CONFIG_X86_64 | 2414 | #ifdef CONFIG_X86_64 |
2353 | VM_ENTRY_IA32E_MODE | | 2415 | VM_ENTRY_IA32E_MODE | |
2354 | #endif | 2416 | #endif |
2355 | VM_ENTRY_LOAD_IA32_PAT; | 2417 | VM_ENTRY_LOAD_IA32_PAT; |
2356 | nested_vmx_entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | | 2418 | vmx->nested.nested_vmx_entry_ctls_high |= |
2357 | VM_ENTRY_LOAD_IA32_EFER); | 2419 | (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER); |
2358 | if (vmx_mpx_supported()) | 2420 | if (vmx_mpx_supported()) |
2359 | nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS; | 2421 | vmx->nested.nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS; |
2360 | 2422 | ||
2361 | /* We support free control of debug control loading. */ | 2423 | /* We support free control of debug control loading. */ |
2362 | nested_vmx_true_entry_ctls_low = nested_vmx_entry_ctls_low & | 2424 | vmx->nested.nested_vmx_true_entry_ctls_low = |
2425 | vmx->nested.nested_vmx_entry_ctls_low & | ||
2363 | ~VM_ENTRY_LOAD_DEBUG_CONTROLS; | 2426 | ~VM_ENTRY_LOAD_DEBUG_CONTROLS; |
2364 | 2427 | ||
2365 | /* cpu-based controls */ | 2428 | /* cpu-based controls */ |
2366 | rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, | 2429 | rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, |
2367 | nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high); | 2430 | vmx->nested.nested_vmx_procbased_ctls_low, |
2368 | nested_vmx_procbased_ctls_low = CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; | 2431 | vmx->nested.nested_vmx_procbased_ctls_high); |
2369 | nested_vmx_procbased_ctls_high &= | 2432 | vmx->nested.nested_vmx_procbased_ctls_low = |
2433 | CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; | ||
2434 | vmx->nested.nested_vmx_procbased_ctls_high &= | ||
2370 | CPU_BASED_VIRTUAL_INTR_PENDING | | 2435 | CPU_BASED_VIRTUAL_INTR_PENDING | |
2371 | CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING | | 2436 | CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING | |
2372 | CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING | | 2437 | CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING | |
@@ -2386,45 +2451,55 @@ static __init void nested_vmx_setup_ctls_msrs(void) | |||
2386 | * can use it to avoid exits to L1 - even when L0 runs L2 | 2451 | * can use it to avoid exits to L1 - even when L0 runs L2 |
2387 | * without MSR bitmaps. | 2452 | * without MSR bitmaps. |
2388 | */ | 2453 | */ |
2389 | nested_vmx_procbased_ctls_high |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | | 2454 | vmx->nested.nested_vmx_procbased_ctls_high |= |
2455 | CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | | ||
2390 | CPU_BASED_USE_MSR_BITMAPS; | 2456 | CPU_BASED_USE_MSR_BITMAPS; |
2391 | 2457 | ||
2392 | /* We support free control of CR3 access interception. */ | 2458 | /* We support free control of CR3 access interception. */ |
2393 | nested_vmx_true_procbased_ctls_low = nested_vmx_procbased_ctls_low & | 2459 | vmx->nested.nested_vmx_true_procbased_ctls_low = |
2460 | vmx->nested.nested_vmx_procbased_ctls_low & | ||
2394 | ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING); | 2461 | ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING); |
2395 | 2462 | ||
2396 | /* secondary cpu-based controls */ | 2463 | /* secondary cpu-based controls */ |
2397 | rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2, | 2464 | rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2, |
2398 | nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high); | 2465 | vmx->nested.nested_vmx_secondary_ctls_low, |
2399 | nested_vmx_secondary_ctls_low = 0; | 2466 | vmx->nested.nested_vmx_secondary_ctls_high); |
2400 | nested_vmx_secondary_ctls_high &= | 2467 | vmx->nested.nested_vmx_secondary_ctls_low = 0; |
2468 | vmx->nested.nested_vmx_secondary_ctls_high &= | ||
2401 | SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | | 2469 | SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | |
2470 | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | | ||
2471 | SECONDARY_EXEC_APIC_REGISTER_VIRT | | ||
2472 | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | | ||
2402 | SECONDARY_EXEC_WBINVD_EXITING | | 2473 | SECONDARY_EXEC_WBINVD_EXITING | |
2403 | SECONDARY_EXEC_XSAVES; | 2474 | SECONDARY_EXEC_XSAVES; |
2404 | 2475 | ||
2405 | if (enable_ept) { | 2476 | if (enable_ept) { |
2406 | /* nested EPT: emulate EPT also to L1 */ | 2477 | /* nested EPT: emulate EPT also to L1 */ |
2407 | nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT | | 2478 | vmx->nested.nested_vmx_secondary_ctls_high |= |
2479 | SECONDARY_EXEC_ENABLE_EPT | | ||
2408 | SECONDARY_EXEC_UNRESTRICTED_GUEST; | 2480 | SECONDARY_EXEC_UNRESTRICTED_GUEST; |
2409 | nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT | | 2481 | vmx->nested.nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT | |
2410 | VMX_EPTP_WB_BIT | VMX_EPT_2MB_PAGE_BIT | | 2482 | VMX_EPTP_WB_BIT | VMX_EPT_2MB_PAGE_BIT | |
2411 | VMX_EPT_INVEPT_BIT; | 2483 | VMX_EPT_INVEPT_BIT; |
2412 | nested_vmx_ept_caps &= vmx_capability.ept; | 2484 | vmx->nested.nested_vmx_ept_caps &= vmx_capability.ept; |
2413 | /* | 2485 | /* |
2414 | * For nested guests, we don't do anything specific | 2486 | * For nested guests, we don't do anything specific |
2415 | * for single context invalidation. Hence, only advertise | 2487 | * for single context invalidation. Hence, only advertise |
2416 | * support for global context invalidation. | 2488 | * support for global context invalidation. |
2417 | */ | 2489 | */ |
2418 | nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT; | 2490 | vmx->nested.nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT; |
2419 | } else | 2491 | } else |
2420 | nested_vmx_ept_caps = 0; | 2492 | vmx->nested.nested_vmx_ept_caps = 0; |
2421 | 2493 | ||
2422 | /* miscellaneous data */ | 2494 | /* miscellaneous data */ |
2423 | rdmsr(MSR_IA32_VMX_MISC, nested_vmx_misc_low, nested_vmx_misc_high); | 2495 | rdmsr(MSR_IA32_VMX_MISC, |
2424 | nested_vmx_misc_low &= VMX_MISC_SAVE_EFER_LMA; | 2496 | vmx->nested.nested_vmx_misc_low, |
2425 | nested_vmx_misc_low |= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | | 2497 | vmx->nested.nested_vmx_misc_high); |
2498 | vmx->nested.nested_vmx_misc_low &= VMX_MISC_SAVE_EFER_LMA; | ||
2499 | vmx->nested.nested_vmx_misc_low |= | ||
2500 | VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | | ||
2426 | VMX_MISC_ACTIVITY_HLT; | 2501 | VMX_MISC_ACTIVITY_HLT; |
2427 | nested_vmx_misc_high = 0; | 2502 | vmx->nested.nested_vmx_misc_high = 0; |
2428 | } | 2503 | } |
2429 | 2504 | ||
2430 | static inline bool vmx_control_verify(u32 control, u32 low, u32 high) | 2505 | static inline bool vmx_control_verify(u32 control, u32 low, u32 high) |
@@ -2443,6 +2518,8 @@ static inline u64 vmx_control_msr(u32 low, u32 high) | |||
2443 | /* Returns 0 on success, non-0 otherwise. */ | 2518 | /* Returns 0 on success, non-0 otherwise. */ |
2444 | static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) | 2519 | static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) |
2445 | { | 2520 | { |
2521 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
2522 | |||
2446 | switch (msr_index) { | 2523 | switch (msr_index) { |
2447 | case MSR_IA32_VMX_BASIC: | 2524 | case MSR_IA32_VMX_BASIC: |
2448 | /* | 2525 | /* |
@@ -2457,36 +2534,44 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) | |||
2457 | break; | 2534 | break; |
2458 | case MSR_IA32_VMX_TRUE_PINBASED_CTLS: | 2535 | case MSR_IA32_VMX_TRUE_PINBASED_CTLS: |
2459 | case MSR_IA32_VMX_PINBASED_CTLS: | 2536 | case MSR_IA32_VMX_PINBASED_CTLS: |
2460 | *pdata = vmx_control_msr(nested_vmx_pinbased_ctls_low, | 2537 | *pdata = vmx_control_msr( |
2461 | nested_vmx_pinbased_ctls_high); | 2538 | vmx->nested.nested_vmx_pinbased_ctls_low, |
2539 | vmx->nested.nested_vmx_pinbased_ctls_high); | ||
2462 | break; | 2540 | break; |
2463 | case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: | 2541 | case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: |
2464 | *pdata = vmx_control_msr(nested_vmx_true_procbased_ctls_low, | 2542 | *pdata = vmx_control_msr( |
2465 | nested_vmx_procbased_ctls_high); | 2543 | vmx->nested.nested_vmx_true_procbased_ctls_low, |
2544 | vmx->nested.nested_vmx_procbased_ctls_high); | ||
2466 | break; | 2545 | break; |
2467 | case MSR_IA32_VMX_PROCBASED_CTLS: | 2546 | case MSR_IA32_VMX_PROCBASED_CTLS: |
2468 | *pdata = vmx_control_msr(nested_vmx_procbased_ctls_low, | 2547 | *pdata = vmx_control_msr( |
2469 | nested_vmx_procbased_ctls_high); | 2548 | vmx->nested.nested_vmx_procbased_ctls_low, |
2549 | vmx->nested.nested_vmx_procbased_ctls_high); | ||
2470 | break; | 2550 | break; |
2471 | case MSR_IA32_VMX_TRUE_EXIT_CTLS: | 2551 | case MSR_IA32_VMX_TRUE_EXIT_CTLS: |
2472 | *pdata = vmx_control_msr(nested_vmx_true_exit_ctls_low, | 2552 | *pdata = vmx_control_msr( |
2473 | nested_vmx_exit_ctls_high); | 2553 | vmx->nested.nested_vmx_true_exit_ctls_low, |
2554 | vmx->nested.nested_vmx_exit_ctls_high); | ||
2474 | break; | 2555 | break; |
2475 | case MSR_IA32_VMX_EXIT_CTLS: | 2556 | case MSR_IA32_VMX_EXIT_CTLS: |
2476 | *pdata = vmx_control_msr(nested_vmx_exit_ctls_low, | 2557 | *pdata = vmx_control_msr( |
2477 | nested_vmx_exit_ctls_high); | 2558 | vmx->nested.nested_vmx_exit_ctls_low, |
2559 | vmx->nested.nested_vmx_exit_ctls_high); | ||
2478 | break; | 2560 | break; |
2479 | case MSR_IA32_VMX_TRUE_ENTRY_CTLS: | 2561 | case MSR_IA32_VMX_TRUE_ENTRY_CTLS: |
2480 | *pdata = vmx_control_msr(nested_vmx_true_entry_ctls_low, | 2562 | *pdata = vmx_control_msr( |
2481 | nested_vmx_entry_ctls_high); | 2563 | vmx->nested.nested_vmx_true_entry_ctls_low, |
2564 | vmx->nested.nested_vmx_entry_ctls_high); | ||
2482 | break; | 2565 | break; |
2483 | case MSR_IA32_VMX_ENTRY_CTLS: | 2566 | case MSR_IA32_VMX_ENTRY_CTLS: |
2484 | *pdata = vmx_control_msr(nested_vmx_entry_ctls_low, | 2567 | *pdata = vmx_control_msr( |
2485 | nested_vmx_entry_ctls_high); | 2568 | vmx->nested.nested_vmx_entry_ctls_low, |
2569 | vmx->nested.nested_vmx_entry_ctls_high); | ||
2486 | break; | 2570 | break; |
2487 | case MSR_IA32_VMX_MISC: | 2571 | case MSR_IA32_VMX_MISC: |
2488 | *pdata = vmx_control_msr(nested_vmx_misc_low, | 2572 | *pdata = vmx_control_msr( |
2489 | nested_vmx_misc_high); | 2573 | vmx->nested.nested_vmx_misc_low, |
2574 | vmx->nested.nested_vmx_misc_high); | ||
2490 | break; | 2575 | break; |
2491 | /* | 2576 | /* |
2492 | * These MSRs specify bits which the guest must keep fixed (on or off) | 2577 | * These MSRs specify bits which the guest must keep fixed (on or off) |
@@ -2511,12 +2596,13 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) | |||
2511 | *pdata = 0x2e; /* highest index: VMX_PREEMPTION_TIMER_VALUE */ | 2596 | *pdata = 0x2e; /* highest index: VMX_PREEMPTION_TIMER_VALUE */ |
2512 | break; | 2597 | break; |
2513 | case MSR_IA32_VMX_PROCBASED_CTLS2: | 2598 | case MSR_IA32_VMX_PROCBASED_CTLS2: |
2514 | *pdata = vmx_control_msr(nested_vmx_secondary_ctls_low, | 2599 | *pdata = vmx_control_msr( |
2515 | nested_vmx_secondary_ctls_high); | 2600 | vmx->nested.nested_vmx_secondary_ctls_low, |
2601 | vmx->nested.nested_vmx_secondary_ctls_high); | ||
2516 | break; | 2602 | break; |
2517 | case MSR_IA32_VMX_EPT_VPID_CAP: | 2603 | case MSR_IA32_VMX_EPT_VPID_CAP: |
2518 | /* Currently, no nested vpid support */ | 2604 | /* Currently, no nested vpid support */ |
2519 | *pdata = nested_vmx_ept_caps; | 2605 | *pdata = vmx->nested.nested_vmx_ept_caps; |
2520 | break; | 2606 | break; |
2521 | default: | 2607 | default: |
2522 | return 1; | 2608 | return 1; |
@@ -2929,7 +3015,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) | |||
2929 | SECONDARY_EXEC_APIC_REGISTER_VIRT | | 3015 | SECONDARY_EXEC_APIC_REGISTER_VIRT | |
2930 | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | | 3016 | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | |
2931 | SECONDARY_EXEC_SHADOW_VMCS | | 3017 | SECONDARY_EXEC_SHADOW_VMCS | |
2932 | SECONDARY_EXEC_XSAVES; | 3018 | SECONDARY_EXEC_XSAVES | |
3019 | SECONDARY_EXEC_ENABLE_PML; | ||
2933 | if (adjust_vmx_controls(min2, opt2, | 3020 | if (adjust_vmx_controls(min2, opt2, |
2934 | MSR_IA32_VMX_PROCBASED_CTLS2, | 3021 | MSR_IA32_VMX_PROCBASED_CTLS2, |
2935 | &_cpu_based_2nd_exec_control) < 0) | 3022 | &_cpu_based_2nd_exec_control) < 0) |
@@ -4159,6 +4246,52 @@ static void __vmx_enable_intercept_for_msr(unsigned long *msr_bitmap, | |||
4159 | } | 4246 | } |
4160 | } | 4247 | } |
4161 | 4248 | ||
4249 | /* | ||
4250 | * If an MSR is allowed by L0, we should check whether it is also allowed by L1. | ||
4251 | * The corresponding bit is cleared only when both L0 and L1 allow it. | ||
4252 | */ | ||
4253 | static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1, | ||
4254 | unsigned long *msr_bitmap_nested, | ||
4255 | u32 msr, int type) | ||
4256 | { | ||
4257 | int f = sizeof(unsigned long); | ||
4258 | |||
4259 | if (!cpu_has_vmx_msr_bitmap()) { | ||
4260 | WARN_ON(1); | ||
4261 | return; | ||
4262 | } | ||
4263 | |||
4264 | /* | ||
4265 | * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals | ||
4266 | * have the write-low and read-high bitmap offsets the wrong way round. | ||
4267 | * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. | ||
4268 | */ | ||
4269 | if (msr <= 0x1fff) { | ||
4270 | if (type & MSR_TYPE_R && | ||
4271 | !test_bit(msr, msr_bitmap_l1 + 0x000 / f)) | ||
4272 | /* read-low */ | ||
4273 | __clear_bit(msr, msr_bitmap_nested + 0x000 / f); | ||
4274 | |||
4275 | if (type & MSR_TYPE_W && | ||
4276 | !test_bit(msr, msr_bitmap_l1 + 0x800 / f)) | ||
4277 | /* write-low */ | ||
4278 | __clear_bit(msr, msr_bitmap_nested + 0x800 / f); | ||
4279 | |||
4280 | } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { | ||
4281 | msr &= 0x1fff; | ||
4282 | if (type & MSR_TYPE_R && | ||
4283 | !test_bit(msr, msr_bitmap_l1 + 0x400 / f)) | ||
4284 | /* read-high */ | ||
4285 | __clear_bit(msr, msr_bitmap_nested + 0x400 / f); | ||
4286 | |||
4287 | if (type & MSR_TYPE_W && | ||
4288 | !test_bit(msr, msr_bitmap_l1 + 0xc00 / f)) | ||
4289 | /* write-high */ | ||
4290 | __clear_bit(msr, msr_bitmap_nested + 0xc00 / f); | ||
4291 | |||
4292 | } | ||
4293 | } | ||
4294 | |||
4162 | static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only) | 4295 | static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only) |
4163 | { | 4296 | { |
4164 | if (!longmode_only) | 4297 | if (!longmode_only) |
@@ -4197,6 +4330,64 @@ static int vmx_vm_has_apicv(struct kvm *kvm) | |||
4197 | return enable_apicv && irqchip_in_kernel(kvm); | 4330 | return enable_apicv && irqchip_in_kernel(kvm); |
4198 | } | 4331 | } |
4199 | 4332 | ||
4333 | static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) | ||
4334 | { | ||
4335 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
4336 | int max_irr; | ||
4337 | void *vapic_page; | ||
4338 | u16 status; | ||
4339 | |||
4340 | if (vmx->nested.pi_desc && | ||
4341 | vmx->nested.pi_pending) { | ||
4342 | vmx->nested.pi_pending = false; | ||
4343 | if (!pi_test_and_clear_on(vmx->nested.pi_desc)) | ||
4344 | return 0; | ||
4345 | |||
4346 | max_irr = find_last_bit( | ||
4347 | (unsigned long *)vmx->nested.pi_desc->pir, 256); | ||
4348 | |||
4349 | if (max_irr == 256) | ||
4350 | return 0; | ||
4351 | |||
4352 | vapic_page = kmap(vmx->nested.virtual_apic_page); | ||
4353 | if (!vapic_page) { | ||
4354 | WARN_ON(1); | ||
4355 | return -ENOMEM; | ||
4356 | } | ||
4357 | __kvm_apic_update_irr(vmx->nested.pi_desc->pir, vapic_page); | ||
4358 | kunmap(vmx->nested.virtual_apic_page); | ||
4359 | |||
4360 | status = vmcs_read16(GUEST_INTR_STATUS); | ||
4361 | if ((u8)max_irr > ((u8)status & 0xff)) { | ||
4362 | status &= ~0xff; | ||
4363 | status |= (u8)max_irr; | ||
4364 | vmcs_write16(GUEST_INTR_STATUS, status); | ||
4365 | } | ||
4366 | } | ||
4367 | return 0; | ||
4368 | } | ||
4369 | |||
4370 | static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, | ||
4371 | int vector) | ||
4372 | { | ||
4373 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
4374 | |||
4375 | if (is_guest_mode(vcpu) && | ||
4376 | vector == vmx->nested.posted_intr_nv) { | ||
4377 | /* the PIR and ON have been set by L1. */ | ||
4378 | if (vcpu->mode == IN_GUEST_MODE) | ||
4379 | apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), | ||
4380 | POSTED_INTR_VECTOR); | ||
4381 | /* | ||
4382 | * If the posted interrupt is not recognized by hardware, | ||
4383 | * we will complete its delivery on the next vmentry. | ||
4384 | */ | ||
4385 | vmx->nested.pi_pending = true; | ||
4386 | kvm_make_request(KVM_REQ_EVENT, vcpu); | ||
4387 | return 0; | ||
4388 | } | ||
4389 | return -1; | ||
4390 | } | ||
4200 | /* | 4391 | /* |
4201 | * Send interrupt to vcpu via posted interrupt way. | 4392 | * Send interrupt to vcpu via posted interrupt way. |
4202 | * 1. If target vcpu is running(non-root mode), send posted interrupt | 4393 | * 1. If target vcpu is running(non-root mode), send posted interrupt |
@@ -4209,6 +4400,10 @@ static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) | |||
4209 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 4400 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
4210 | int r; | 4401 | int r; |
4211 | 4402 | ||
4403 | r = vmx_deliver_nested_posted_interrupt(vcpu, vector); | ||
4404 | if (!r) | ||
4405 | return; | ||
4406 | |||
4212 | if (pi_test_and_set_pir(vector, &vmx->pi_desc)) | 4407 | if (pi_test_and_set_pir(vector, &vmx->pi_desc)) |
4213 | return; | 4408 | return; |
4214 | 4409 | ||
@@ -4360,6 +4555,9 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) | |||
4360 | a current VMCS12 | 4555 | a current VMCS12 |
4361 | */ | 4556 | */ |
4362 | exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; | 4557 | exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; |
4558 | /* PML is enabled/disabled when creating/destroying a vcpu */ | ||
4559 | exec_control &= ~SECONDARY_EXEC_ENABLE_PML; | ||
4560 | |||
4363 | return exec_control; | 4561 | return exec_control; |
4364 | } | 4562 | } |
4365 | 4563 | ||
@@ -4986,11 +5184,12 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) | |||
4986 | hypercall[2] = 0xc1; | 5184 | hypercall[2] = 0xc1; |
4987 | } | 5185 | } |
4988 | 5186 | ||
4989 | static bool nested_cr0_valid(struct vmcs12 *vmcs12, unsigned long val) | 5187 | static bool nested_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val) |
4990 | { | 5188 | { |
4991 | unsigned long always_on = VMXON_CR0_ALWAYSON; | 5189 | unsigned long always_on = VMXON_CR0_ALWAYSON; |
5190 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | ||
4992 | 5191 | ||
4993 | if (nested_vmx_secondary_ctls_high & | 5192 | if (to_vmx(vcpu)->nested.nested_vmx_secondary_ctls_high & |
4994 | SECONDARY_EXEC_UNRESTRICTED_GUEST && | 5193 | SECONDARY_EXEC_UNRESTRICTED_GUEST && |
4995 | nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) | 5194 | nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) |
4996 | always_on &= ~(X86_CR0_PE | X86_CR0_PG); | 5195 | always_on &= ~(X86_CR0_PE | X86_CR0_PG); |
@@ -5015,7 +5214,7 @@ static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) | |||
5015 | val = (val & ~vmcs12->cr0_guest_host_mask) | | 5214 | val = (val & ~vmcs12->cr0_guest_host_mask) | |
5016 | (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask); | 5215 | (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask); |
5017 | 5216 | ||
5018 | if (!nested_cr0_valid(vmcs12, val)) | 5217 | if (!nested_cr0_valid(vcpu, val)) |
5019 | return 1; | 5218 | return 1; |
5020 | 5219 | ||
5021 | if (kvm_set_cr0(vcpu, val)) | 5220 | if (kvm_set_cr0(vcpu, val)) |
@@ -5817,13 +6016,21 @@ static __init int hardware_setup(void) | |||
5817 | (unsigned long *)__get_free_page(GFP_KERNEL); | 6016 | (unsigned long *)__get_free_page(GFP_KERNEL); |
5818 | if (!vmx_msr_bitmap_longmode_x2apic) | 6017 | if (!vmx_msr_bitmap_longmode_x2apic) |
5819 | goto out4; | 6018 | goto out4; |
6019 | |||
6020 | if (nested) { | ||
6021 | vmx_msr_bitmap_nested = | ||
6022 | (unsigned long *)__get_free_page(GFP_KERNEL); | ||
6023 | if (!vmx_msr_bitmap_nested) | ||
6024 | goto out5; | ||
6025 | } | ||
6026 | |||
5820 | vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); | 6027 | vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); |
5821 | if (!vmx_vmread_bitmap) | 6028 | if (!vmx_vmread_bitmap) |
5822 | goto out5; | 6029 | goto out6; |
5823 | 6030 | ||
5824 | vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); | 6031 | vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); |
5825 | if (!vmx_vmwrite_bitmap) | 6032 | if (!vmx_vmwrite_bitmap) |
5826 | goto out6; | 6033 | goto out7; |
5827 | 6034 | ||
5828 | memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); | 6035 | memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); |
5829 | memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); | 6036 | memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); |
@@ -5839,10 +6046,12 @@ static __init int hardware_setup(void) | |||
5839 | 6046 | ||
5840 | memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE); | 6047 | memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE); |
5841 | memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE); | 6048 | memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE); |
6049 | if (nested) | ||
6050 | memset(vmx_msr_bitmap_nested, 0xff, PAGE_SIZE); | ||
5842 | 6051 | ||
5843 | if (setup_vmcs_config(&vmcs_config) < 0) { | 6052 | if (setup_vmcs_config(&vmcs_config) < 0) { |
5844 | r = -EIO; | 6053 | r = -EIO; |
5845 | goto out7; | 6054 | goto out8; |
5846 | } | 6055 | } |
5847 | 6056 | ||
5848 | if (boot_cpu_has(X86_FEATURE_NX)) | 6057 | if (boot_cpu_has(X86_FEATURE_NX)) |
@@ -5868,16 +6077,16 @@ static __init int hardware_setup(void) | |||
5868 | if (!cpu_has_vmx_unrestricted_guest()) | 6077 | if (!cpu_has_vmx_unrestricted_guest()) |
5869 | enable_unrestricted_guest = 0; | 6078 | enable_unrestricted_guest = 0; |
5870 | 6079 | ||
5871 | if (!cpu_has_vmx_flexpriority()) { | 6080 | if (!cpu_has_vmx_flexpriority()) |
5872 | flexpriority_enabled = 0; | 6081 | flexpriority_enabled = 0; |
5873 | 6082 | ||
5874 | /* | 6083 | /* |
5875 | * set_apic_access_page_addr() is used to reload apic access | 6084 | * set_apic_access_page_addr() is used to reload apic access |
5876 | * page upon invalidation. No need to do anything if the | 6085 | * page upon invalidation. No need to do anything if not |
5877 | * processor does not have the APIC_ACCESS_ADDR VMCS field. | 6086 | * using the APIC_ACCESS_ADDR VMCS field. |
5878 | */ | 6087 | */ |
6088 | if (!flexpriority_enabled) | ||
5879 | kvm_x86_ops->set_apic_access_page_addr = NULL; | 6089 | kvm_x86_ops->set_apic_access_page_addr = NULL; |
5880 | } | ||
5881 | 6090 | ||
5882 | if (!cpu_has_vmx_tpr_shadow()) | 6091 | if (!cpu_has_vmx_tpr_shadow()) |
5883 | kvm_x86_ops->update_cr8_intercept = NULL; | 6092 | kvm_x86_ops->update_cr8_intercept = NULL; |
@@ -5895,13 +6104,11 @@ static __init int hardware_setup(void) | |||
5895 | kvm_x86_ops->update_cr8_intercept = NULL; | 6104 | kvm_x86_ops->update_cr8_intercept = NULL; |
5896 | else { | 6105 | else { |
5897 | kvm_x86_ops->hwapic_irr_update = NULL; | 6106 | kvm_x86_ops->hwapic_irr_update = NULL; |
6107 | kvm_x86_ops->hwapic_isr_update = NULL; | ||
5898 | kvm_x86_ops->deliver_posted_interrupt = NULL; | 6108 | kvm_x86_ops->deliver_posted_interrupt = NULL; |
5899 | kvm_x86_ops->sync_pir_to_irr = vmx_sync_pir_to_irr_dummy; | 6109 | kvm_x86_ops->sync_pir_to_irr = vmx_sync_pir_to_irr_dummy; |
5900 | } | 6110 | } |
5901 | 6111 | ||
5902 | if (nested) | ||
5903 | nested_vmx_setup_ctls_msrs(); | ||
5904 | |||
5905 | vmx_disable_intercept_for_msr(MSR_FS_BASE, false); | 6112 | vmx_disable_intercept_for_msr(MSR_FS_BASE, false); |
5906 | vmx_disable_intercept_for_msr(MSR_GS_BASE, false); | 6113 | vmx_disable_intercept_for_msr(MSR_GS_BASE, false); |
5907 | vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true); | 6114 | vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true); |
@@ -5945,12 +6152,29 @@ static __init int hardware_setup(void) | |||
5945 | 6152 | ||
5946 | update_ple_window_actual_max(); | 6153 | update_ple_window_actual_max(); |
5947 | 6154 | ||
6155 | /* | ||
6156 | * Only enable PML when hardware supports PML feature, and both EPT | ||
6157 | * and EPT A/D bit features are enabled -- PML depends on them to work. | ||
6158 | */ | ||
6159 | if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml()) | ||
6160 | enable_pml = 0; | ||
6161 | |||
6162 | if (!enable_pml) { | ||
6163 | kvm_x86_ops->slot_enable_log_dirty = NULL; | ||
6164 | kvm_x86_ops->slot_disable_log_dirty = NULL; | ||
6165 | kvm_x86_ops->flush_log_dirty = NULL; | ||
6166 | kvm_x86_ops->enable_log_dirty_pt_masked = NULL; | ||
6167 | } | ||
6168 | |||
5948 | return alloc_kvm_area(); | 6169 | return alloc_kvm_area(); |
5949 | 6170 | ||
5950 | out7: | 6171 | out8: |
5951 | free_page((unsigned long)vmx_vmwrite_bitmap); | 6172 | free_page((unsigned long)vmx_vmwrite_bitmap); |
5952 | out6: | 6173 | out7: |
5953 | free_page((unsigned long)vmx_vmread_bitmap); | 6174 | free_page((unsigned long)vmx_vmread_bitmap); |
6175 | out6: | ||
6176 | if (nested) | ||
6177 | free_page((unsigned long)vmx_msr_bitmap_nested); | ||
5954 | out5: | 6178 | out5: |
5955 | free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); | 6179 | free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); |
5956 | out4: | 6180 | out4: |
@@ -5977,6 +6201,8 @@ static __exit void hardware_unsetup(void) | |||
5977 | free_page((unsigned long)vmx_io_bitmap_a); | 6201 | free_page((unsigned long)vmx_io_bitmap_a); |
5978 | free_page((unsigned long)vmx_vmwrite_bitmap); | 6202 | free_page((unsigned long)vmx_vmwrite_bitmap); |
5979 | free_page((unsigned long)vmx_vmread_bitmap); | 6203 | free_page((unsigned long)vmx_vmread_bitmap); |
6204 | if (nested) | ||
6205 | free_page((unsigned long)vmx_msr_bitmap_nested); | ||
5980 | 6206 | ||
5981 | free_kvm_area(); | 6207 | free_kvm_area(); |
5982 | } | 6208 | } |
@@ -6143,6 +6369,13 @@ static void nested_vmx_failValid(struct kvm_vcpu *vcpu, | |||
6143 | */ | 6369 | */ |
6144 | } | 6370 | } |
6145 | 6371 | ||
6372 | static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator) | ||
6373 | { | ||
6374 | /* TODO: not to reset guest simply here. */ | ||
6375 | kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); | ||
6376 | pr_warn("kvm: nested vmx abort, indicator %d\n", indicator); | ||
6377 | } | ||
6378 | |||
6146 | static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) | 6379 | static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) |
6147 | { | 6380 | { |
6148 | struct vcpu_vmx *vmx = | 6381 | struct vcpu_vmx *vmx = |
@@ -6432,6 +6665,7 @@ static inline void nested_release_vmcs12(struct vcpu_vmx *vmx) | |||
6432 | vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); | 6665 | vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); |
6433 | vmcs_write64(VMCS_LINK_POINTER, -1ull); | 6666 | vmcs_write64(VMCS_LINK_POINTER, -1ull); |
6434 | } | 6667 | } |
6668 | vmx->nested.posted_intr_nv = -1; | ||
6435 | kunmap(vmx->nested.current_vmcs12_page); | 6669 | kunmap(vmx->nested.current_vmcs12_page); |
6436 | nested_release_page(vmx->nested.current_vmcs12_page); | 6670 | nested_release_page(vmx->nested.current_vmcs12_page); |
6437 | vmx->nested.current_vmptr = -1ull; | 6671 | vmx->nested.current_vmptr = -1ull; |
@@ -6460,6 +6694,12 @@ static void free_nested(struct vcpu_vmx *vmx) | |||
6460 | nested_release_page(vmx->nested.virtual_apic_page); | 6694 | nested_release_page(vmx->nested.virtual_apic_page); |
6461 | vmx->nested.virtual_apic_page = NULL; | 6695 | vmx->nested.virtual_apic_page = NULL; |
6462 | } | 6696 | } |
6697 | if (vmx->nested.pi_desc_page) { | ||
6698 | kunmap(vmx->nested.pi_desc_page); | ||
6699 | nested_release_page(vmx->nested.pi_desc_page); | ||
6700 | vmx->nested.pi_desc_page = NULL; | ||
6701 | vmx->nested.pi_desc = NULL; | ||
6702 | } | ||
6463 | 6703 | ||
6464 | nested_free_all_saved_vmcss(vmx); | 6704 | nested_free_all_saved_vmcss(vmx); |
6465 | } | 6705 | } |
@@ -6893,6 +7133,7 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu) | |||
6893 | /* Emulate the INVEPT instruction */ | 7133 | /* Emulate the INVEPT instruction */ |
6894 | static int handle_invept(struct kvm_vcpu *vcpu) | 7134 | static int handle_invept(struct kvm_vcpu *vcpu) |
6895 | { | 7135 | { |
7136 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
6896 | u32 vmx_instruction_info, types; | 7137 | u32 vmx_instruction_info, types; |
6897 | unsigned long type; | 7138 | unsigned long type; |
6898 | gva_t gva; | 7139 | gva_t gva; |
@@ -6901,8 +7142,9 @@ static int handle_invept(struct kvm_vcpu *vcpu) | |||
6901 | u64 eptp, gpa; | 7142 | u64 eptp, gpa; |
6902 | } operand; | 7143 | } operand; |
6903 | 7144 | ||
6904 | if (!(nested_vmx_secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) || | 7145 | if (!(vmx->nested.nested_vmx_secondary_ctls_high & |
6905 | !(nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) { | 7146 | SECONDARY_EXEC_ENABLE_EPT) || |
7147 | !(vmx->nested.nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) { | ||
6906 | kvm_queue_exception(vcpu, UD_VECTOR); | 7148 | kvm_queue_exception(vcpu, UD_VECTOR); |
6907 | return 1; | 7149 | return 1; |
6908 | } | 7150 | } |
@@ -6918,7 +7160,7 @@ static int handle_invept(struct kvm_vcpu *vcpu) | |||
6918 | vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); | 7160 | vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); |
6919 | type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); | 7161 | type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); |
6920 | 7162 | ||
6921 | types = (nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; | 7163 | types = (vmx->nested.nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; |
6922 | 7164 | ||
6923 | if (!(types & (1UL << type))) { | 7165 | if (!(types & (1UL << type))) { |
6924 | nested_vmx_failValid(vcpu, | 7166 | nested_vmx_failValid(vcpu, |
@@ -6960,6 +7202,31 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) | |||
6960 | return 1; | 7202 | return 1; |
6961 | } | 7203 | } |
6962 | 7204 | ||
7205 | static int handle_pml_full(struct kvm_vcpu *vcpu) | ||
7206 | { | ||
7207 | unsigned long exit_qualification; | ||
7208 | |||
7209 | trace_kvm_pml_full(vcpu->vcpu_id); | ||
7210 | |||
7211 | exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | ||
7212 | |||
7213 | /* | ||
7214 | * PML buffer FULL happened while executing iret from NMI, | ||
7215 | * "blocked by NMI" bit has to be set before next VM entry. | ||
7216 | */ | ||
7217 | if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && | ||
7218 | cpu_has_virtual_nmis() && | ||
7219 | (exit_qualification & INTR_INFO_UNBLOCK_NMI)) | ||
7220 | vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, | ||
7221 | GUEST_INTR_STATE_NMI); | ||
7222 | |||
7223 | /* | ||
7224 | * PML buffer already flushed at beginning of VMEXIT. Nothing to do | ||
7225 | * here.., and there's no userspace involvement needed for PML. | ||
7226 | */ | ||
7227 | return 1; | ||
7228 | } | ||
7229 | |||
6963 | /* | 7230 | /* |
6964 | * The exit handlers return 1 if the exit was handled fully and guest execution | 7231 | * The exit handlers return 1 if the exit was handled fully and guest execution |
6965 | * may resume. Otherwise they set the kvm_run parameter to indicate what needs | 7232 | * may resume. Otherwise they set the kvm_run parameter to indicate what needs |
@@ -7008,6 +7275,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { | |||
7008 | [EXIT_REASON_INVVPID] = handle_invvpid, | 7275 | [EXIT_REASON_INVVPID] = handle_invvpid, |
7009 | [EXIT_REASON_XSAVES] = handle_xsaves, | 7276 | [EXIT_REASON_XSAVES] = handle_xsaves, |
7010 | [EXIT_REASON_XRSTORS] = handle_xrstors, | 7277 | [EXIT_REASON_XRSTORS] = handle_xrstors, |
7278 | [EXIT_REASON_PML_FULL] = handle_pml_full, | ||
7011 | }; | 7279 | }; |
7012 | 7280 | ||
7013 | static const int kvm_vmx_max_exit_handlers = | 7281 | static const int kvm_vmx_max_exit_handlers = |
@@ -7275,6 +7543,10 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) | |||
7275 | case EXIT_REASON_APIC_ACCESS: | 7543 | case EXIT_REASON_APIC_ACCESS: |
7276 | return nested_cpu_has2(vmcs12, | 7544 | return nested_cpu_has2(vmcs12, |
7277 | SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); | 7545 | SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); |
7546 | case EXIT_REASON_APIC_WRITE: | ||
7547 | case EXIT_REASON_EOI_INDUCED: | ||
7548 | /* apic_write and eoi_induced should exit unconditionally. */ | ||
7549 | return 1; | ||
7278 | case EXIT_REASON_EPT_VIOLATION: | 7550 | case EXIT_REASON_EPT_VIOLATION: |
7279 | /* | 7551 | /* |
7280 | * L0 always deals with the EPT violation. If nested EPT is | 7552 | * L0 always deals with the EPT violation. If nested EPT is |
@@ -7314,6 +7586,89 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) | |||
7314 | *info2 = vmcs_read32(VM_EXIT_INTR_INFO); | 7586 | *info2 = vmcs_read32(VM_EXIT_INTR_INFO); |
7315 | } | 7587 | } |
7316 | 7588 | ||
7589 | static int vmx_enable_pml(struct vcpu_vmx *vmx) | ||
7590 | { | ||
7591 | struct page *pml_pg; | ||
7592 | u32 exec_control; | ||
7593 | |||
7594 | pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO); | ||
7595 | if (!pml_pg) | ||
7596 | return -ENOMEM; | ||
7597 | |||
7598 | vmx->pml_pg = pml_pg; | ||
7599 | |||
7600 | vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); | ||
7601 | vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); | ||
7602 | |||
7603 | exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); | ||
7604 | exec_control |= SECONDARY_EXEC_ENABLE_PML; | ||
7605 | vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); | ||
7606 | |||
7607 | return 0; | ||
7608 | } | ||
7609 | |||
7610 | static void vmx_disable_pml(struct vcpu_vmx *vmx) | ||
7611 | { | ||
7612 | u32 exec_control; | ||
7613 | |||
7614 | ASSERT(vmx->pml_pg); | ||
7615 | __free_page(vmx->pml_pg); | ||
7616 | vmx->pml_pg = NULL; | ||
7617 | |||
7618 | exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); | ||
7619 | exec_control &= ~SECONDARY_EXEC_ENABLE_PML; | ||
7620 | vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); | ||
7621 | } | ||
7622 | |||
7623 | static void vmx_flush_pml_buffer(struct vcpu_vmx *vmx) | ||
7624 | { | ||
7625 | struct kvm *kvm = vmx->vcpu.kvm; | ||
7626 | u64 *pml_buf; | ||
7627 | u16 pml_idx; | ||
7628 | |||
7629 | pml_idx = vmcs_read16(GUEST_PML_INDEX); | ||
7630 | |||
7631 | /* Do nothing if PML buffer is empty */ | ||
7632 | if (pml_idx == (PML_ENTITY_NUM - 1)) | ||
7633 | return; | ||
7634 | |||
7635 | /* PML index always points to next available PML buffer entity */ | ||
7636 | if (pml_idx >= PML_ENTITY_NUM) | ||
7637 | pml_idx = 0; | ||
7638 | else | ||
7639 | pml_idx++; | ||
7640 | |||
7641 | pml_buf = page_address(vmx->pml_pg); | ||
7642 | for (; pml_idx < PML_ENTITY_NUM; pml_idx++) { | ||
7643 | u64 gpa; | ||
7644 | |||
7645 | gpa = pml_buf[pml_idx]; | ||
7646 | WARN_ON(gpa & (PAGE_SIZE - 1)); | ||
7647 | mark_page_dirty(kvm, gpa >> PAGE_SHIFT); | ||
7648 | } | ||
7649 | |||
7650 | /* reset PML index */ | ||
7651 | vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); | ||
7652 | } | ||
7653 | |||
7654 | /* | ||
7655 | * Flush all vcpus' PML buffer and update logged GPAs to dirty_bitmap. | ||
7656 | * Called before reporting dirty_bitmap to userspace. | ||
7657 | */ | ||
7658 | static void kvm_flush_pml_buffers(struct kvm *kvm) | ||
7659 | { | ||
7660 | int i; | ||
7661 | struct kvm_vcpu *vcpu; | ||
7662 | /* | ||
7663 | * We only need to kick vcpu out of guest mode here, as PML buffer | ||
7664 | * is flushed at beginning of all VMEXITs, and it's obvious that only | ||
7665 | * vcpus running in guest are possible to have unflushed GPAs in PML | ||
7666 | * buffer. | ||
7667 | */ | ||
7668 | kvm_for_each_vcpu(i, vcpu, kvm) | ||
7669 | kvm_vcpu_kick(vcpu); | ||
7670 | } | ||
7671 | |||
7317 | /* | 7672 | /* |
7318 | * The guest has exited. See if we can fix it or if we need userspace | 7673 | * The guest has exited. See if we can fix it or if we need userspace |
7319 | * assistance. | 7674 | * assistance. |
@@ -7324,6 +7679,16 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) | |||
7324 | u32 exit_reason = vmx->exit_reason; | 7679 | u32 exit_reason = vmx->exit_reason; |
7325 | u32 vectoring_info = vmx->idt_vectoring_info; | 7680 | u32 vectoring_info = vmx->idt_vectoring_info; |
7326 | 7681 | ||
7682 | /* | ||
7683 | * Flush logged GPAs PML buffer, this will make dirty_bitmap more | ||
7684 | * updated. Another good is, in kvm_vm_ioctl_get_dirty_log, before | ||
7685 | * querying dirty_bitmap, we only need to kick all vcpus out of guest | ||
7686 | * mode as if vcpus is in root mode, the PML buffer must has been | ||
7687 | * flushed already. | ||
7688 | */ | ||
7689 | if (enable_pml) | ||
7690 | vmx_flush_pml_buffer(vmx); | ||
7691 | |||
7327 | /* If guest state is invalid, start emulating */ | 7692 | /* If guest state is invalid, start emulating */ |
7328 | if (vmx->emulation_required) | 7693 | if (vmx->emulation_required) |
7329 | return handle_invalid_guest_state(vcpu); | 7694 | return handle_invalid_guest_state(vcpu); |
@@ -7471,9 +7836,6 @@ static void vmx_hwapic_isr_update(struct kvm *kvm, int isr) | |||
7471 | u16 status; | 7836 | u16 status; |
7472 | u8 old; | 7837 | u8 old; |
7473 | 7838 | ||
7474 | if (!vmx_vm_has_apicv(kvm)) | ||
7475 | return; | ||
7476 | |||
7477 | if (isr == -1) | 7839 | if (isr == -1) |
7478 | isr = 0; | 7840 | isr = 0; |
7479 | 7841 | ||
@@ -7973,6 +8335,8 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) | |||
7973 | { | 8335 | { |
7974 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 8336 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
7975 | 8337 | ||
8338 | if (enable_pml) | ||
8339 | vmx_disable_pml(vmx); | ||
7976 | free_vpid(vmx); | 8340 | free_vpid(vmx); |
7977 | leave_guest_mode(vcpu); | 8341 | leave_guest_mode(vcpu); |
7978 | vmx_load_vmcs01(vcpu); | 8342 | vmx_load_vmcs01(vcpu); |
@@ -8040,9 +8404,25 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) | |||
8040 | goto free_vmcs; | 8404 | goto free_vmcs; |
8041 | } | 8405 | } |
8042 | 8406 | ||
8407 | if (nested) | ||
8408 | nested_vmx_setup_ctls_msrs(vmx); | ||
8409 | |||
8410 | vmx->nested.posted_intr_nv = -1; | ||
8043 | vmx->nested.current_vmptr = -1ull; | 8411 | vmx->nested.current_vmptr = -1ull; |
8044 | vmx->nested.current_vmcs12 = NULL; | 8412 | vmx->nested.current_vmcs12 = NULL; |
8045 | 8413 | ||
8414 | /* | ||
8415 | * If PML is turned on, failure on enabling PML just results in failure | ||
8416 | * of creating the vcpu, therefore we can simplify PML logic (by | ||
8417 | * avoiding dealing with cases, such as enabling PML partially on vcpus | ||
8418 | * for the guest, etc. | ||
8419 | */ | ||
8420 | if (enable_pml) { | ||
8421 | err = vmx_enable_pml(vmx); | ||
8422 | if (err) | ||
8423 | goto free_vmcs; | ||
8424 | } | ||
8425 | |||
8046 | return &vmx->vcpu; | 8426 | return &vmx->vcpu; |
8047 | 8427 | ||
8048 | free_vmcs: | 8428 | free_vmcs: |
@@ -8184,9 +8564,10 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu) | |||
8184 | 8564 | ||
8185 | static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) | 8565 | static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) |
8186 | { | 8566 | { |
8187 | kvm_init_shadow_ept_mmu(vcpu, &vcpu->arch.mmu, | 8567 | WARN_ON(mmu_is_nested(vcpu)); |
8188 | nested_vmx_ept_caps & VMX_EPT_EXECUTE_ONLY_BIT); | 8568 | kvm_init_shadow_ept_mmu(vcpu, |
8189 | 8569 | to_vmx(vcpu)->nested.nested_vmx_ept_caps & | |
8570 | VMX_EPT_EXECUTE_ONLY_BIT); | ||
8190 | vcpu->arch.mmu.set_cr3 = vmx_set_cr3; | 8571 | vcpu->arch.mmu.set_cr3 = vmx_set_cr3; |
8191 | vcpu->arch.mmu.get_cr3 = nested_ept_get_cr3; | 8572 | vcpu->arch.mmu.get_cr3 = nested_ept_get_cr3; |
8192 | vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault; | 8573 | vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault; |
@@ -8199,6 +8580,18 @@ static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu) | |||
8199 | vcpu->arch.walk_mmu = &vcpu->arch.mmu; | 8580 | vcpu->arch.walk_mmu = &vcpu->arch.mmu; |
8200 | } | 8581 | } |
8201 | 8582 | ||
8583 | static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12, | ||
8584 | u16 error_code) | ||
8585 | { | ||
8586 | bool inequality, bit; | ||
8587 | |||
8588 | bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0; | ||
8589 | inequality = | ||
8590 | (error_code & vmcs12->page_fault_error_code_mask) != | ||
8591 | vmcs12->page_fault_error_code_match; | ||
8592 | return inequality ^ bit; | ||
8593 | } | ||
8594 | |||
8202 | static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, | 8595 | static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, |
8203 | struct x86_exception *fault) | 8596 | struct x86_exception *fault) |
8204 | { | 8597 | { |
@@ -8206,8 +8599,7 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, | |||
8206 | 8599 | ||
8207 | WARN_ON(!is_guest_mode(vcpu)); | 8600 | WARN_ON(!is_guest_mode(vcpu)); |
8208 | 8601 | ||
8209 | /* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */ | 8602 | if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code)) |
8210 | if (vmcs12->exception_bitmap & (1u << PF_VECTOR)) | ||
8211 | nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason, | 8603 | nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason, |
8212 | vmcs_read32(VM_EXIT_INTR_INFO), | 8604 | vmcs_read32(VM_EXIT_INTR_INFO), |
8213 | vmcs_readl(EXIT_QUALIFICATION)); | 8605 | vmcs_readl(EXIT_QUALIFICATION)); |
@@ -8261,6 +8653,31 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu, | |||
8261 | return false; | 8653 | return false; |
8262 | } | 8654 | } |
8263 | 8655 | ||
8656 | if (nested_cpu_has_posted_intr(vmcs12)) { | ||
8657 | if (!IS_ALIGNED(vmcs12->posted_intr_desc_addr, 64)) | ||
8658 | return false; | ||
8659 | |||
8660 | if (vmx->nested.pi_desc_page) { /* shouldn't happen */ | ||
8661 | kunmap(vmx->nested.pi_desc_page); | ||
8662 | nested_release_page(vmx->nested.pi_desc_page); | ||
8663 | } | ||
8664 | vmx->nested.pi_desc_page = | ||
8665 | nested_get_page(vcpu, vmcs12->posted_intr_desc_addr); | ||
8666 | if (!vmx->nested.pi_desc_page) | ||
8667 | return false; | ||
8668 | |||
8669 | vmx->nested.pi_desc = | ||
8670 | (struct pi_desc *)kmap(vmx->nested.pi_desc_page); | ||
8671 | if (!vmx->nested.pi_desc) { | ||
8672 | nested_release_page_clean(vmx->nested.pi_desc_page); | ||
8673 | return false; | ||
8674 | } | ||
8675 | vmx->nested.pi_desc = | ||
8676 | (struct pi_desc *)((void *)vmx->nested.pi_desc + | ||
8677 | (unsigned long)(vmcs12->posted_intr_desc_addr & | ||
8678 | (PAGE_SIZE - 1))); | ||
8679 | } | ||
8680 | |||
8264 | return true; | 8681 | return true; |
8265 | } | 8682 | } |
8266 | 8683 | ||
@@ -8286,6 +8703,310 @@ static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu) | |||
8286 | ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL); | 8703 | ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL); |
8287 | } | 8704 | } |
8288 | 8705 | ||
8706 | static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu, | ||
8707 | struct vmcs12 *vmcs12) | ||
8708 | { | ||
8709 | int maxphyaddr; | ||
8710 | u64 addr; | ||
8711 | |||
8712 | if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) | ||
8713 | return 0; | ||
8714 | |||
8715 | if (vmcs12_read_any(vcpu, MSR_BITMAP, &addr)) { | ||
8716 | WARN_ON(1); | ||
8717 | return -EINVAL; | ||
8718 | } | ||
8719 | maxphyaddr = cpuid_maxphyaddr(vcpu); | ||
8720 | |||
8721 | if (!PAGE_ALIGNED(vmcs12->msr_bitmap) || | ||
8722 | ((addr + PAGE_SIZE) >> maxphyaddr)) | ||
8723 | return -EINVAL; | ||
8724 | |||
8725 | return 0; | ||
8726 | } | ||
8727 | |||
8728 | /* | ||
8729 | * Merge L0's and L1's MSR bitmap, return false to indicate that | ||
8730 | * we do not use the hardware. | ||
8731 | */ | ||
8732 | static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, | ||
8733 | struct vmcs12 *vmcs12) | ||
8734 | { | ||
8735 | int msr; | ||
8736 | struct page *page; | ||
8737 | unsigned long *msr_bitmap; | ||
8738 | |||
8739 | if (!nested_cpu_has_virt_x2apic_mode(vmcs12)) | ||
8740 | return false; | ||
8741 | |||
8742 | page = nested_get_page(vcpu, vmcs12->msr_bitmap); | ||
8743 | if (!page) { | ||
8744 | WARN_ON(1); | ||
8745 | return false; | ||
8746 | } | ||
8747 | msr_bitmap = (unsigned long *)kmap(page); | ||
8748 | if (!msr_bitmap) { | ||
8749 | nested_release_page_clean(page); | ||
8750 | WARN_ON(1); | ||
8751 | return false; | ||
8752 | } | ||
8753 | |||
8754 | if (nested_cpu_has_virt_x2apic_mode(vmcs12)) { | ||
8755 | if (nested_cpu_has_apic_reg_virt(vmcs12)) | ||
8756 | for (msr = 0x800; msr <= 0x8ff; msr++) | ||
8757 | nested_vmx_disable_intercept_for_msr( | ||
8758 | msr_bitmap, | ||
8759 | vmx_msr_bitmap_nested, | ||
8760 | msr, MSR_TYPE_R); | ||
8761 | /* TPR is allowed */ | ||
8762 | nested_vmx_disable_intercept_for_msr(msr_bitmap, | ||
8763 | vmx_msr_bitmap_nested, | ||
8764 | APIC_BASE_MSR + (APIC_TASKPRI >> 4), | ||
8765 | MSR_TYPE_R | MSR_TYPE_W); | ||
8766 | if (nested_cpu_has_vid(vmcs12)) { | ||
8767 | /* EOI and self-IPI are allowed */ | ||
8768 | nested_vmx_disable_intercept_for_msr( | ||
8769 | msr_bitmap, | ||
8770 | vmx_msr_bitmap_nested, | ||
8771 | APIC_BASE_MSR + (APIC_EOI >> 4), | ||
8772 | MSR_TYPE_W); | ||
8773 | nested_vmx_disable_intercept_for_msr( | ||
8774 | msr_bitmap, | ||
8775 | vmx_msr_bitmap_nested, | ||
8776 | APIC_BASE_MSR + (APIC_SELF_IPI >> 4), | ||
8777 | MSR_TYPE_W); | ||
8778 | } | ||
8779 | } else { | ||
8780 | /* | ||
8781 | * Enable reading intercept of all the x2apic | ||
8782 | * MSRs. We should not rely on vmcs12 to do any | ||
8783 | * optimizations here, it may have been modified | ||
8784 | * by L1. | ||
8785 | */ | ||
8786 | for (msr = 0x800; msr <= 0x8ff; msr++) | ||
8787 | __vmx_enable_intercept_for_msr( | ||
8788 | vmx_msr_bitmap_nested, | ||
8789 | msr, | ||
8790 | MSR_TYPE_R); | ||
8791 | |||
8792 | __vmx_enable_intercept_for_msr( | ||
8793 | vmx_msr_bitmap_nested, | ||
8794 | APIC_BASE_MSR + (APIC_TASKPRI >> 4), | ||
8795 | MSR_TYPE_W); | ||
8796 | __vmx_enable_intercept_for_msr( | ||
8797 | vmx_msr_bitmap_nested, | ||
8798 | APIC_BASE_MSR + (APIC_EOI >> 4), | ||
8799 | MSR_TYPE_W); | ||
8800 | __vmx_enable_intercept_for_msr( | ||
8801 | vmx_msr_bitmap_nested, | ||
8802 | APIC_BASE_MSR + (APIC_SELF_IPI >> 4), | ||
8803 | MSR_TYPE_W); | ||
8804 | } | ||
8805 | kunmap(page); | ||
8806 | nested_release_page_clean(page); | ||
8807 | |||
8808 | return true; | ||
8809 | } | ||
8810 | |||
8811 | static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, | ||
8812 | struct vmcs12 *vmcs12) | ||
8813 | { | ||
8814 | if (!nested_cpu_has_virt_x2apic_mode(vmcs12) && | ||
8815 | !nested_cpu_has_apic_reg_virt(vmcs12) && | ||
8816 | !nested_cpu_has_vid(vmcs12) && | ||
8817 | !nested_cpu_has_posted_intr(vmcs12)) | ||
8818 | return 0; | ||
8819 | |||
8820 | /* | ||
8821 | * If virtualize x2apic mode is enabled, | ||
8822 | * virtualize apic access must be disabled. | ||
8823 | */ | ||
8824 | if (nested_cpu_has_virt_x2apic_mode(vmcs12) && | ||
8825 | nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) | ||
8826 | return -EINVAL; | ||
8827 | |||
8828 | /* | ||
8829 | * If virtual interrupt delivery is enabled, | ||
8830 | * we must exit on external interrupts. | ||
8831 | */ | ||
8832 | if (nested_cpu_has_vid(vmcs12) && | ||
8833 | !nested_exit_on_intr(vcpu)) | ||
8834 | return -EINVAL; | ||
8835 | |||
8836 | /* | ||
8837 | * bits 15:8 should be zero in posted_intr_nv, | ||
8838 | * the descriptor address has been already checked | ||
8839 | * in nested_get_vmcs12_pages. | ||
8840 | */ | ||
8841 | if (nested_cpu_has_posted_intr(vmcs12) && | ||
8842 | (!nested_cpu_has_vid(vmcs12) || | ||
8843 | !nested_exit_intr_ack_set(vcpu) || | ||
8844 | vmcs12->posted_intr_nv & 0xff00)) | ||
8845 | return -EINVAL; | ||
8846 | |||
8847 | /* tpr shadow is needed by all apicv features. */ | ||
8848 | if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) | ||
8849 | return -EINVAL; | ||
8850 | |||
8851 | return 0; | ||
8852 | } | ||
8853 | |||
8854 | static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, | ||
8855 | unsigned long count_field, | ||
8856 | unsigned long addr_field, | ||
8857 | int maxphyaddr) | ||
8858 | { | ||
8859 | u64 count, addr; | ||
8860 | |||
8861 | if (vmcs12_read_any(vcpu, count_field, &count) || | ||
8862 | vmcs12_read_any(vcpu, addr_field, &addr)) { | ||
8863 | WARN_ON(1); | ||
8864 | return -EINVAL; | ||
8865 | } | ||
8866 | if (count == 0) | ||
8867 | return 0; | ||
8868 | if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr || | ||
8869 | (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr) { | ||
8870 | pr_warn_ratelimited( | ||
8871 | "nVMX: invalid MSR switch (0x%lx, %d, %llu, 0x%08llx)", | ||
8872 | addr_field, maxphyaddr, count, addr); | ||
8873 | return -EINVAL; | ||
8874 | } | ||
8875 | return 0; | ||
8876 | } | ||
8877 | |||
8878 | static int nested_vmx_check_msr_switch_controls(struct kvm_vcpu *vcpu, | ||
8879 | struct vmcs12 *vmcs12) | ||
8880 | { | ||
8881 | int maxphyaddr; | ||
8882 | |||
8883 | if (vmcs12->vm_exit_msr_load_count == 0 && | ||
8884 | vmcs12->vm_exit_msr_store_count == 0 && | ||
8885 | vmcs12->vm_entry_msr_load_count == 0) | ||
8886 | return 0; /* Fast path */ | ||
8887 | maxphyaddr = cpuid_maxphyaddr(vcpu); | ||
8888 | if (nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_LOAD_COUNT, | ||
8889 | VM_EXIT_MSR_LOAD_ADDR, maxphyaddr) || | ||
8890 | nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_STORE_COUNT, | ||
8891 | VM_EXIT_MSR_STORE_ADDR, maxphyaddr) || | ||
8892 | nested_vmx_check_msr_switch(vcpu, VM_ENTRY_MSR_LOAD_COUNT, | ||
8893 | VM_ENTRY_MSR_LOAD_ADDR, maxphyaddr)) | ||
8894 | return -EINVAL; | ||
8895 | return 0; | ||
8896 | } | ||
8897 | |||
8898 | static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu, | ||
8899 | struct vmx_msr_entry *e) | ||
8900 | { | ||
8901 | /* x2APIC MSR accesses are not allowed */ | ||
8902 | if (apic_x2apic_mode(vcpu->arch.apic) && e->index >> 8 == 0x8) | ||
8903 | return -EINVAL; | ||
8904 | if (e->index == MSR_IA32_UCODE_WRITE || /* SDM Table 35-2 */ | ||
8905 | e->index == MSR_IA32_UCODE_REV) | ||
8906 | return -EINVAL; | ||
8907 | if (e->reserved != 0) | ||
8908 | return -EINVAL; | ||
8909 | return 0; | ||
8910 | } | ||
8911 | |||
8912 | static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu, | ||
8913 | struct vmx_msr_entry *e) | ||
8914 | { | ||
8915 | if (e->index == MSR_FS_BASE || | ||
8916 | e->index == MSR_GS_BASE || | ||
8917 | e->index == MSR_IA32_SMM_MONITOR_CTL || /* SMM is not supported */ | ||
8918 | nested_vmx_msr_check_common(vcpu, e)) | ||
8919 | return -EINVAL; | ||
8920 | return 0; | ||
8921 | } | ||
8922 | |||
8923 | static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu, | ||
8924 | struct vmx_msr_entry *e) | ||
8925 | { | ||
8926 | if (e->index == MSR_IA32_SMBASE || /* SMM is not supported */ | ||
8927 | nested_vmx_msr_check_common(vcpu, e)) | ||
8928 | return -EINVAL; | ||
8929 | return 0; | ||
8930 | } | ||
8931 | |||
8932 | /* | ||
8933 | * Load guest's/host's msr at nested entry/exit. | ||
8934 | * return 0 for success, entry index for failure. | ||
8935 | */ | ||
8936 | static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) | ||
8937 | { | ||
8938 | u32 i; | ||
8939 | struct vmx_msr_entry e; | ||
8940 | struct msr_data msr; | ||
8941 | |||
8942 | msr.host_initiated = false; | ||
8943 | for (i = 0; i < count; i++) { | ||
8944 | if (kvm_read_guest(vcpu->kvm, gpa + i * sizeof(e), | ||
8945 | &e, sizeof(e))) { | ||
8946 | pr_warn_ratelimited( | ||
8947 | "%s cannot read MSR entry (%u, 0x%08llx)\n", | ||
8948 | __func__, i, gpa + i * sizeof(e)); | ||
8949 | goto fail; | ||
8950 | } | ||
8951 | if (nested_vmx_load_msr_check(vcpu, &e)) { | ||
8952 | pr_warn_ratelimited( | ||
8953 | "%s check failed (%u, 0x%x, 0x%x)\n", | ||
8954 | __func__, i, e.index, e.reserved); | ||
8955 | goto fail; | ||
8956 | } | ||
8957 | msr.index = e.index; | ||
8958 | msr.data = e.value; | ||
8959 | if (kvm_set_msr(vcpu, &msr)) { | ||
8960 | pr_warn_ratelimited( | ||
8961 | "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", | ||
8962 | __func__, i, e.index, e.value); | ||
8963 | goto fail; | ||
8964 | } | ||
8965 | } | ||
8966 | return 0; | ||
8967 | fail: | ||
8968 | return i + 1; | ||
8969 | } | ||
8970 | |||
8971 | static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) | ||
8972 | { | ||
8973 | u32 i; | ||
8974 | struct vmx_msr_entry e; | ||
8975 | |||
8976 | for (i = 0; i < count; i++) { | ||
8977 | if (kvm_read_guest(vcpu->kvm, | ||
8978 | gpa + i * sizeof(e), | ||
8979 | &e, 2 * sizeof(u32))) { | ||
8980 | pr_warn_ratelimited( | ||
8981 | "%s cannot read MSR entry (%u, 0x%08llx)\n", | ||
8982 | __func__, i, gpa + i * sizeof(e)); | ||
8983 | return -EINVAL; | ||
8984 | } | ||
8985 | if (nested_vmx_store_msr_check(vcpu, &e)) { | ||
8986 | pr_warn_ratelimited( | ||
8987 | "%s check failed (%u, 0x%x, 0x%x)\n", | ||
8988 | __func__, i, e.index, e.reserved); | ||
8989 | return -EINVAL; | ||
8990 | } | ||
8991 | if (kvm_get_msr(vcpu, e.index, &e.value)) { | ||
8992 | pr_warn_ratelimited( | ||
8993 | "%s cannot read MSR (%u, 0x%x)\n", | ||
8994 | __func__, i, e.index); | ||
8995 | return -EINVAL; | ||
8996 | } | ||
8997 | if (kvm_write_guest(vcpu->kvm, | ||
8998 | gpa + i * sizeof(e) + | ||
8999 | offsetof(struct vmx_msr_entry, value), | ||
9000 | &e.value, sizeof(e.value))) { | ||
9001 | pr_warn_ratelimited( | ||
9002 | "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", | ||
9003 | __func__, i, e.index, e.value); | ||
9004 | return -EINVAL; | ||
9005 | } | ||
9006 | } | ||
9007 | return 0; | ||
9008 | } | ||
9009 | |||
8289 | /* | 9010 | /* |
8290 | * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested | 9011 | * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested |
8291 | * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it | 9012 | * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it |
@@ -8365,8 +9086,23 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) | |||
8365 | 9086 | ||
8366 | exec_control = vmcs12->pin_based_vm_exec_control; | 9087 | exec_control = vmcs12->pin_based_vm_exec_control; |
8367 | exec_control |= vmcs_config.pin_based_exec_ctrl; | 9088 | exec_control |= vmcs_config.pin_based_exec_ctrl; |
8368 | exec_control &= ~(PIN_BASED_VMX_PREEMPTION_TIMER | | 9089 | exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; |
8369 | PIN_BASED_POSTED_INTR); | 9090 | |
9091 | if (nested_cpu_has_posted_intr(vmcs12)) { | ||
9092 | /* | ||
9093 | * Note that we use L0's vector here and in | ||
9094 | * vmx_deliver_nested_posted_interrupt. | ||
9095 | */ | ||
9096 | vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; | ||
9097 | vmx->nested.pi_pending = false; | ||
9098 | vmcs_write64(POSTED_INTR_NV, POSTED_INTR_VECTOR); | ||
9099 | vmcs_write64(POSTED_INTR_DESC_ADDR, | ||
9100 | page_to_phys(vmx->nested.pi_desc_page) + | ||
9101 | (unsigned long)(vmcs12->posted_intr_desc_addr & | ||
9102 | (PAGE_SIZE - 1))); | ||
9103 | } else | ||
9104 | exec_control &= ~PIN_BASED_POSTED_INTR; | ||
9105 | |||
8370 | vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control); | 9106 | vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control); |
8371 | 9107 | ||
8372 | vmx->nested.preemption_timer_expired = false; | 9108 | vmx->nested.preemption_timer_expired = false; |
@@ -8423,12 +9159,26 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) | |||
8423 | else | 9159 | else |
8424 | vmcs_write64(APIC_ACCESS_ADDR, | 9160 | vmcs_write64(APIC_ACCESS_ADDR, |
8425 | page_to_phys(vmx->nested.apic_access_page)); | 9161 | page_to_phys(vmx->nested.apic_access_page)); |
8426 | } else if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) { | 9162 | } else if (!(nested_cpu_has_virt_x2apic_mode(vmcs12)) && |
9163 | (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))) { | ||
8427 | exec_control |= | 9164 | exec_control |= |
8428 | SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; | 9165 | SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; |
8429 | kvm_vcpu_reload_apic_access_page(vcpu); | 9166 | kvm_vcpu_reload_apic_access_page(vcpu); |
8430 | } | 9167 | } |
8431 | 9168 | ||
9169 | if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) { | ||
9170 | vmcs_write64(EOI_EXIT_BITMAP0, | ||
9171 | vmcs12->eoi_exit_bitmap0); | ||
9172 | vmcs_write64(EOI_EXIT_BITMAP1, | ||
9173 | vmcs12->eoi_exit_bitmap1); | ||
9174 | vmcs_write64(EOI_EXIT_BITMAP2, | ||
9175 | vmcs12->eoi_exit_bitmap2); | ||
9176 | vmcs_write64(EOI_EXIT_BITMAP3, | ||
9177 | vmcs12->eoi_exit_bitmap3); | ||
9178 | vmcs_write16(GUEST_INTR_STATUS, | ||
9179 | vmcs12->guest_intr_status); | ||
9180 | } | ||
9181 | |||
8432 | vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); | 9182 | vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); |
8433 | } | 9183 | } |
8434 | 9184 | ||
@@ -8462,11 +9212,17 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) | |||
8462 | vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); | 9212 | vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); |
8463 | } | 9213 | } |
8464 | 9214 | ||
9215 | if (cpu_has_vmx_msr_bitmap() && | ||
9216 | exec_control & CPU_BASED_USE_MSR_BITMAPS && | ||
9217 | nested_vmx_merge_msr_bitmap(vcpu, vmcs12)) { | ||
9218 | vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_nested)); | ||
9219 | } else | ||
9220 | exec_control &= ~CPU_BASED_USE_MSR_BITMAPS; | ||
9221 | |||
8465 | /* | 9222 | /* |
8466 | * Merging of IO and MSR bitmaps not currently supported. | 9223 | * Merging of IO bitmap not currently supported. |
8467 | * Rather, exit every time. | 9224 | * Rather, exit every time. |
8468 | */ | 9225 | */ |
8469 | exec_control &= ~CPU_BASED_USE_MSR_BITMAPS; | ||
8470 | exec_control &= ~CPU_BASED_USE_IO_BITMAPS; | 9226 | exec_control &= ~CPU_BASED_USE_IO_BITMAPS; |
8471 | exec_control |= CPU_BASED_UNCOND_IO_EXITING; | 9227 | exec_control |= CPU_BASED_UNCOND_IO_EXITING; |
8472 | 9228 | ||
@@ -8582,6 +9338,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) | |||
8582 | int cpu; | 9338 | int cpu; |
8583 | struct loaded_vmcs *vmcs02; | 9339 | struct loaded_vmcs *vmcs02; |
8584 | bool ia32e; | 9340 | bool ia32e; |
9341 | u32 msr_entry_idx; | ||
8585 | 9342 | ||
8586 | if (!nested_vmx_check_permission(vcpu) || | 9343 | if (!nested_vmx_check_permission(vcpu) || |
8587 | !nested_vmx_check_vmcs12(vcpu)) | 9344 | !nested_vmx_check_vmcs12(vcpu)) |
@@ -8616,41 +9373,42 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) | |||
8616 | return 1; | 9373 | return 1; |
8617 | } | 9374 | } |
8618 | 9375 | ||
8619 | if ((vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_MSR_BITMAPS) && | 9376 | if (!nested_get_vmcs12_pages(vcpu, vmcs12)) { |
8620 | !PAGE_ALIGNED(vmcs12->msr_bitmap)) { | ||
8621 | /*TODO: Also verify bits beyond physical address width are 0*/ | 9377 | /*TODO: Also verify bits beyond physical address width are 0*/ |
8622 | nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); | 9378 | nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); |
8623 | return 1; | 9379 | return 1; |
8624 | } | 9380 | } |
8625 | 9381 | ||
8626 | if (!nested_get_vmcs12_pages(vcpu, vmcs12)) { | 9382 | if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12)) { |
8627 | /*TODO: Also verify bits beyond physical address width are 0*/ | ||
8628 | nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); | 9383 | nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); |
8629 | return 1; | 9384 | return 1; |
8630 | } | 9385 | } |
8631 | 9386 | ||
8632 | if (vmcs12->vm_entry_msr_load_count > 0 || | 9387 | if (nested_vmx_check_apicv_controls(vcpu, vmcs12)) { |
8633 | vmcs12->vm_exit_msr_load_count > 0 || | 9388 | nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); |
8634 | vmcs12->vm_exit_msr_store_count > 0) { | 9389 | return 1; |
8635 | pr_warn_ratelimited("%s: VMCS MSR_{LOAD,STORE} unsupported\n", | 9390 | } |
8636 | __func__); | 9391 | |
9392 | if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12)) { | ||
8637 | nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); | 9393 | nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); |
8638 | return 1; | 9394 | return 1; |
8639 | } | 9395 | } |
8640 | 9396 | ||
8641 | if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, | 9397 | if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, |
8642 | nested_vmx_true_procbased_ctls_low, | 9398 | vmx->nested.nested_vmx_true_procbased_ctls_low, |
8643 | nested_vmx_procbased_ctls_high) || | 9399 | vmx->nested.nested_vmx_procbased_ctls_high) || |
8644 | !vmx_control_verify(vmcs12->secondary_vm_exec_control, | 9400 | !vmx_control_verify(vmcs12->secondary_vm_exec_control, |
8645 | nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high) || | 9401 | vmx->nested.nested_vmx_secondary_ctls_low, |
9402 | vmx->nested.nested_vmx_secondary_ctls_high) || | ||
8646 | !vmx_control_verify(vmcs12->pin_based_vm_exec_control, | 9403 | !vmx_control_verify(vmcs12->pin_based_vm_exec_control, |
8647 | nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high) || | 9404 | vmx->nested.nested_vmx_pinbased_ctls_low, |
9405 | vmx->nested.nested_vmx_pinbased_ctls_high) || | ||
8648 | !vmx_control_verify(vmcs12->vm_exit_controls, | 9406 | !vmx_control_verify(vmcs12->vm_exit_controls, |
8649 | nested_vmx_true_exit_ctls_low, | 9407 | vmx->nested.nested_vmx_true_exit_ctls_low, |
8650 | nested_vmx_exit_ctls_high) || | 9408 | vmx->nested.nested_vmx_exit_ctls_high) || |
8651 | !vmx_control_verify(vmcs12->vm_entry_controls, | 9409 | !vmx_control_verify(vmcs12->vm_entry_controls, |
8652 | nested_vmx_true_entry_ctls_low, | 9410 | vmx->nested.nested_vmx_true_entry_ctls_low, |
8653 | nested_vmx_entry_ctls_high)) | 9411 | vmx->nested.nested_vmx_entry_ctls_high)) |
8654 | { | 9412 | { |
8655 | nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); | 9413 | nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); |
8656 | return 1; | 9414 | return 1; |
@@ -8663,7 +9421,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) | |||
8663 | return 1; | 9421 | return 1; |
8664 | } | 9422 | } |
8665 | 9423 | ||
8666 | if (!nested_cr0_valid(vmcs12, vmcs12->guest_cr0) || | 9424 | if (!nested_cr0_valid(vcpu, vmcs12->guest_cr0) || |
8667 | ((vmcs12->guest_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) { | 9425 | ((vmcs12->guest_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) { |
8668 | nested_vmx_entry_failure(vcpu, vmcs12, | 9426 | nested_vmx_entry_failure(vcpu, vmcs12, |
8669 | EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT); | 9427 | EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT); |
@@ -8739,10 +9497,21 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) | |||
8739 | 9497 | ||
8740 | vmx_segment_cache_clear(vmx); | 9498 | vmx_segment_cache_clear(vmx); |
8741 | 9499 | ||
8742 | vmcs12->launch_state = 1; | ||
8743 | |||
8744 | prepare_vmcs02(vcpu, vmcs12); | 9500 | prepare_vmcs02(vcpu, vmcs12); |
8745 | 9501 | ||
9502 | msr_entry_idx = nested_vmx_load_msr(vcpu, | ||
9503 | vmcs12->vm_entry_msr_load_addr, | ||
9504 | vmcs12->vm_entry_msr_load_count); | ||
9505 | if (msr_entry_idx) { | ||
9506 | leave_guest_mode(vcpu); | ||
9507 | vmx_load_vmcs01(vcpu); | ||
9508 | nested_vmx_entry_failure(vcpu, vmcs12, | ||
9509 | EXIT_REASON_MSR_LOAD_FAIL, msr_entry_idx); | ||
9510 | return 1; | ||
9511 | } | ||
9512 | |||
9513 | vmcs12->launch_state = 1; | ||
9514 | |||
8746 | if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) | 9515 | if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) |
8747 | return kvm_emulate_halt(vcpu); | 9516 | return kvm_emulate_halt(vcpu); |
8748 | 9517 | ||
@@ -8869,9 +9638,10 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) | |||
8869 | if (vmx->nested.nested_run_pending) | 9638 | if (vmx->nested.nested_run_pending) |
8870 | return -EBUSY; | 9639 | return -EBUSY; |
8871 | nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); | 9640 | nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); |
9641 | return 0; | ||
8872 | } | 9642 | } |
8873 | 9643 | ||
8874 | return 0; | 9644 | return vmx_complete_nested_posted_interrupt(vcpu); |
8875 | } | 9645 | } |
8876 | 9646 | ||
8877 | static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) | 9647 | static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) |
@@ -8981,6 +9751,9 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, | |||
8981 | vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); | 9751 | vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); |
8982 | } | 9752 | } |
8983 | 9753 | ||
9754 | if (nested_cpu_has_vid(vmcs12)) | ||
9755 | vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS); | ||
9756 | |||
8984 | vmcs12->vm_entry_controls = | 9757 | vmcs12->vm_entry_controls = |
8985 | (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | | 9758 | (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | |
8986 | (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); | 9759 | (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); |
@@ -9172,6 +9945,13 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, | |||
9172 | 9945 | ||
9173 | kvm_set_dr(vcpu, 7, 0x400); | 9946 | kvm_set_dr(vcpu, 7, 0x400); |
9174 | vmcs_write64(GUEST_IA32_DEBUGCTL, 0); | 9947 | vmcs_write64(GUEST_IA32_DEBUGCTL, 0); |
9948 | |||
9949 | if (cpu_has_vmx_msr_bitmap()) | ||
9950 | vmx_set_msr_bitmap(vcpu); | ||
9951 | |||
9952 | if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, | ||
9953 | vmcs12->vm_exit_msr_load_count)) | ||
9954 | nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); | ||
9175 | } | 9955 | } |
9176 | 9956 | ||
9177 | /* | 9957 | /* |
@@ -9193,6 +9973,10 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, | |||
9193 | prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, | 9973 | prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, |
9194 | exit_qualification); | 9974 | exit_qualification); |
9195 | 9975 | ||
9976 | if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr, | ||
9977 | vmcs12->vm_exit_msr_store_count)) | ||
9978 | nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL); | ||
9979 | |||
9196 | vmx_load_vmcs01(vcpu); | 9980 | vmx_load_vmcs01(vcpu); |
9197 | 9981 | ||
9198 | if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT) | 9982 | if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT) |
@@ -9235,6 +10019,12 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, | |||
9235 | nested_release_page(vmx->nested.virtual_apic_page); | 10019 | nested_release_page(vmx->nested.virtual_apic_page); |
9236 | vmx->nested.virtual_apic_page = NULL; | 10020 | vmx->nested.virtual_apic_page = NULL; |
9237 | } | 10021 | } |
10022 | if (vmx->nested.pi_desc_page) { | ||
10023 | kunmap(vmx->nested.pi_desc_page); | ||
10024 | nested_release_page(vmx->nested.pi_desc_page); | ||
10025 | vmx->nested.pi_desc_page = NULL; | ||
10026 | vmx->nested.pi_desc = NULL; | ||
10027 | } | ||
9238 | 10028 | ||
9239 | /* | 10029 | /* |
9240 | * We are now running in L2, mmu_notifier will force to reload the | 10030 | * We are now running in L2, mmu_notifier will force to reload the |
@@ -9301,6 +10091,31 @@ static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu) | |||
9301 | shrink_ple_window(vcpu); | 10091 | shrink_ple_window(vcpu); |
9302 | } | 10092 | } |
9303 | 10093 | ||
10094 | static void vmx_slot_enable_log_dirty(struct kvm *kvm, | ||
10095 | struct kvm_memory_slot *slot) | ||
10096 | { | ||
10097 | kvm_mmu_slot_leaf_clear_dirty(kvm, slot); | ||
10098 | kvm_mmu_slot_largepage_remove_write_access(kvm, slot); | ||
10099 | } | ||
10100 | |||
10101 | static void vmx_slot_disable_log_dirty(struct kvm *kvm, | ||
10102 | struct kvm_memory_slot *slot) | ||
10103 | { | ||
10104 | kvm_mmu_slot_set_dirty(kvm, slot); | ||
10105 | } | ||
10106 | |||
10107 | static void vmx_flush_log_dirty(struct kvm *kvm) | ||
10108 | { | ||
10109 | kvm_flush_pml_buffers(kvm); | ||
10110 | } | ||
10111 | |||
10112 | static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm, | ||
10113 | struct kvm_memory_slot *memslot, | ||
10114 | gfn_t offset, unsigned long mask) | ||
10115 | { | ||
10116 | kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask); | ||
10117 | } | ||
10118 | |||
9304 | static struct kvm_x86_ops vmx_x86_ops = { | 10119 | static struct kvm_x86_ops vmx_x86_ops = { |
9305 | .cpu_has_kvm_support = cpu_has_kvm_support, | 10120 | .cpu_has_kvm_support = cpu_has_kvm_support, |
9306 | .disabled_by_bios = vmx_disabled_by_bios, | 10121 | .disabled_by_bios = vmx_disabled_by_bios, |
@@ -9409,6 +10224,11 @@ static struct kvm_x86_ops vmx_x86_ops = { | |||
9409 | .check_nested_events = vmx_check_nested_events, | 10224 | .check_nested_events = vmx_check_nested_events, |
9410 | 10225 | ||
9411 | .sched_in = vmx_sched_in, | 10226 | .sched_in = vmx_sched_in, |
10227 | |||
10228 | .slot_enable_log_dirty = vmx_slot_enable_log_dirty, | ||
10229 | .slot_disable_log_dirty = vmx_slot_disable_log_dirty, | ||
10230 | .flush_log_dirty = vmx_flush_log_dirty, | ||
10231 | .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked, | ||
9412 | }; | 10232 | }; |
9413 | 10233 | ||
9414 | static int __init vmx_init(void) | 10234 | static int __init vmx_init(void) |
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c259814200bd..bd7a70be41b3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c | |||
@@ -108,6 +108,10 @@ EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz); | |||
108 | static u32 tsc_tolerance_ppm = 250; | 108 | static u32 tsc_tolerance_ppm = 250; |
109 | module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR); | 109 | module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR); |
110 | 110 | ||
111 | /* lapic timer advance (tscdeadline mode only) in nanoseconds */ | ||
112 | unsigned int lapic_timer_advance_ns = 0; | ||
113 | module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR); | ||
114 | |||
111 | static bool backwards_tsc_observed = false; | 115 | static bool backwards_tsc_observed = false; |
112 | 116 | ||
113 | #define KVM_NR_SHARED_MSRS 16 | 117 | #define KVM_NR_SHARED_MSRS 16 |
@@ -141,6 +145,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { | |||
141 | { "irq_window", VCPU_STAT(irq_window_exits) }, | 145 | { "irq_window", VCPU_STAT(irq_window_exits) }, |
142 | { "nmi_window", VCPU_STAT(nmi_window_exits) }, | 146 | { "nmi_window", VCPU_STAT(nmi_window_exits) }, |
143 | { "halt_exits", VCPU_STAT(halt_exits) }, | 147 | { "halt_exits", VCPU_STAT(halt_exits) }, |
148 | { "halt_successful_poll", VCPU_STAT(halt_successful_poll) }, | ||
144 | { "halt_wakeup", VCPU_STAT(halt_wakeup) }, | 149 | { "halt_wakeup", VCPU_STAT(halt_wakeup) }, |
145 | { "hypercalls", VCPU_STAT(hypercalls) }, | 150 | { "hypercalls", VCPU_STAT(hypercalls) }, |
146 | { "request_irq", VCPU_STAT(request_irq_exits) }, | 151 | { "request_irq", VCPU_STAT(request_irq_exits) }, |
@@ -492,7 +497,7 @@ int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, | |||
492 | } | 497 | } |
493 | EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu); | 498 | EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu); |
494 | 499 | ||
495 | int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, | 500 | static int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, |
496 | void *data, int offset, int len, u32 access) | 501 | void *data, int offset, int len, u32 access) |
497 | { | 502 | { |
498 | return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn, | 503 | return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn, |
@@ -643,7 +648,7 @@ static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu) | |||
643 | } | 648 | } |
644 | } | 649 | } |
645 | 650 | ||
646 | int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) | 651 | static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) |
647 | { | 652 | { |
648 | u64 xcr0 = xcr; | 653 | u64 xcr0 = xcr; |
649 | u64 old_xcr0 = vcpu->arch.xcr0; | 654 | u64 old_xcr0 = vcpu->arch.xcr0; |
@@ -1083,6 +1088,15 @@ static void update_pvclock_gtod(struct timekeeper *tk) | |||
1083 | } | 1088 | } |
1084 | #endif | 1089 | #endif |
1085 | 1090 | ||
1091 | void kvm_set_pending_timer(struct kvm_vcpu *vcpu) | ||
1092 | { | ||
1093 | /* | ||
1094 | * Note: KVM_REQ_PENDING_TIMER is implicitly checked in | ||
1095 | * vcpu_enter_guest. This function is only called from | ||
1096 | * the physical CPU that is running vcpu. | ||
1097 | */ | ||
1098 | kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); | ||
1099 | } | ||
1086 | 1100 | ||
1087 | static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) | 1101 | static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) |
1088 | { | 1102 | { |
@@ -1180,7 +1194,7 @@ static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0); | |||
1180 | #endif | 1194 | #endif |
1181 | 1195 | ||
1182 | static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); | 1196 | static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); |
1183 | unsigned long max_tsc_khz; | 1197 | static unsigned long max_tsc_khz; |
1184 | 1198 | ||
1185 | static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) | 1199 | static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) |
1186 | { | 1200 | { |
@@ -1234,7 +1248,7 @@ static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) | |||
1234 | return tsc; | 1248 | return tsc; |
1235 | } | 1249 | } |
1236 | 1250 | ||
1237 | void kvm_track_tsc_matching(struct kvm_vcpu *vcpu) | 1251 | static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu) |
1238 | { | 1252 | { |
1239 | #ifdef CONFIG_X86_64 | 1253 | #ifdef CONFIG_X86_64 |
1240 | bool vcpus_matched; | 1254 | bool vcpus_matched; |
@@ -1529,7 +1543,8 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm) | |||
1529 | &ka->master_cycle_now); | 1543 | &ka->master_cycle_now); |
1530 | 1544 | ||
1531 | ka->use_master_clock = host_tsc_clocksource && vcpus_matched | 1545 | ka->use_master_clock = host_tsc_clocksource && vcpus_matched |
1532 | && !backwards_tsc_observed; | 1546 | && !backwards_tsc_observed |
1547 | && !ka->boot_vcpu_runs_old_kvmclock; | ||
1533 | 1548 | ||
1534 | if (ka->use_master_clock) | 1549 | if (ka->use_master_clock) |
1535 | atomic_set(&kvm_guest_has_master_clock, 1); | 1550 | atomic_set(&kvm_guest_has_master_clock, 1); |
@@ -2161,8 +2176,20 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) | |||
2161 | case MSR_KVM_SYSTEM_TIME_NEW: | 2176 | case MSR_KVM_SYSTEM_TIME_NEW: |
2162 | case MSR_KVM_SYSTEM_TIME: { | 2177 | case MSR_KVM_SYSTEM_TIME: { |
2163 | u64 gpa_offset; | 2178 | u64 gpa_offset; |
2179 | struct kvm_arch *ka = &vcpu->kvm->arch; | ||
2180 | |||
2164 | kvmclock_reset(vcpu); | 2181 | kvmclock_reset(vcpu); |
2165 | 2182 | ||
2183 | if (vcpu->vcpu_id == 0 && !msr_info->host_initiated) { | ||
2184 | bool tmp = (msr == MSR_KVM_SYSTEM_TIME); | ||
2185 | |||
2186 | if (ka->boot_vcpu_runs_old_kvmclock != tmp) | ||
2187 | set_bit(KVM_REQ_MASTERCLOCK_UPDATE, | ||
2188 | &vcpu->requests); | ||
2189 | |||
2190 | ka->boot_vcpu_runs_old_kvmclock = tmp; | ||
2191 | } | ||
2192 | |||
2166 | vcpu->arch.time = data; | 2193 | vcpu->arch.time = data; |
2167 | kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu); | 2194 | kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu); |
2168 | 2195 | ||
@@ -2324,6 +2351,7 @@ int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) | |||
2324 | { | 2351 | { |
2325 | return kvm_x86_ops->get_msr(vcpu, msr_index, pdata); | 2352 | return kvm_x86_ops->get_msr(vcpu, msr_index, pdata); |
2326 | } | 2353 | } |
2354 | EXPORT_SYMBOL_GPL(kvm_get_msr); | ||
2327 | 2355 | ||
2328 | static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) | 2356 | static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) |
2329 | { | 2357 | { |
@@ -2738,6 +2766,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) | |||
2738 | case KVM_CAP_READONLY_MEM: | 2766 | case KVM_CAP_READONLY_MEM: |
2739 | case KVM_CAP_HYPERV_TIME: | 2767 | case KVM_CAP_HYPERV_TIME: |
2740 | case KVM_CAP_IOAPIC_POLARITY_IGNORED: | 2768 | case KVM_CAP_IOAPIC_POLARITY_IGNORED: |
2769 | case KVM_CAP_TSC_DEADLINE_TIMER: | ||
2741 | #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT | 2770 | #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT |
2742 | case KVM_CAP_ASSIGN_DEV_IRQ: | 2771 | case KVM_CAP_ASSIGN_DEV_IRQ: |
2743 | case KVM_CAP_PCI_2_3: | 2772 | case KVM_CAP_PCI_2_3: |
@@ -2776,9 +2805,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) | |||
2776 | case KVM_CAP_TSC_CONTROL: | 2805 | case KVM_CAP_TSC_CONTROL: |
2777 | r = kvm_has_tsc_control; | 2806 | r = kvm_has_tsc_control; |
2778 | break; | 2807 | break; |
2779 | case KVM_CAP_TSC_DEADLINE_TIMER: | ||
2780 | r = boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER); | ||
2781 | break; | ||
2782 | default: | 2808 | default: |
2783 | r = 0; | 2809 | r = 0; |
2784 | break; | 2810 | break; |
@@ -3734,83 +3760,43 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm, | |||
3734 | * @kvm: kvm instance | 3760 | * @kvm: kvm instance |
3735 | * @log: slot id and address to which we copy the log | 3761 | * @log: slot id and address to which we copy the log |
3736 | * | 3762 | * |
3737 | * We need to keep it in mind that VCPU threads can write to the bitmap | 3763 | * Steps 1-4 below provide general overview of dirty page logging. See |
3738 | * concurrently. So, to avoid losing data, we keep the following order for | 3764 | * kvm_get_dirty_log_protect() function description for additional details. |
3739 | * each bit: | 3765 | * |
3766 | * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we | ||
3767 | * always flush the TLB (step 4) even if previous step failed and the dirty | ||
3768 | * bitmap may be corrupt. Regardless of previous outcome the KVM logging API | ||
3769 | * does not preclude user space subsequent dirty log read. Flushing TLB ensures | ||
3770 | * writes will be marked dirty for next log read. | ||
3740 | * | 3771 | * |
3741 | * 1. Take a snapshot of the bit and clear it if needed. | 3772 | * 1. Take a snapshot of the bit and clear it if needed. |
3742 | * 2. Write protect the corresponding page. | 3773 | * 2. Write protect the corresponding page. |
3743 | * 3. Flush TLB's if needed. | 3774 | * 3. Copy the snapshot to the userspace. |
3744 | * 4. Copy the snapshot to the userspace. | 3775 | * 4. Flush TLB's if needed. |
3745 | * | ||
3746 | * Between 2 and 3, the guest may write to the page using the remaining TLB | ||
3747 | * entry. This is not a problem because the page will be reported dirty at | ||
3748 | * step 4 using the snapshot taken before and step 3 ensures that successive | ||
3749 | * writes will be logged for the next call. | ||
3750 | */ | 3776 | */ |
3751 | int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) | 3777 | int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) |
3752 | { | 3778 | { |
3753 | int r; | ||
3754 | struct kvm_memory_slot *memslot; | ||
3755 | unsigned long n, i; | ||
3756 | unsigned long *dirty_bitmap; | ||
3757 | unsigned long *dirty_bitmap_buffer; | ||
3758 | bool is_dirty = false; | 3779 | bool is_dirty = false; |
3780 | int r; | ||
3759 | 3781 | ||
3760 | mutex_lock(&kvm->slots_lock); | 3782 | mutex_lock(&kvm->slots_lock); |
3761 | 3783 | ||
3762 | r = -EINVAL; | 3784 | /* |
3763 | if (log->slot >= KVM_USER_MEM_SLOTS) | 3785 | * Flush potentially hardware-cached dirty pages to dirty_bitmap. |
3764 | goto out; | 3786 | */ |
3765 | 3787 | if (kvm_x86_ops->flush_log_dirty) | |
3766 | memslot = id_to_memslot(kvm->memslots, log->slot); | 3788 | kvm_x86_ops->flush_log_dirty(kvm); |
3767 | |||
3768 | dirty_bitmap = memslot->dirty_bitmap; | ||
3769 | r = -ENOENT; | ||
3770 | if (!dirty_bitmap) | ||
3771 | goto out; | ||
3772 | |||
3773 | n = kvm_dirty_bitmap_bytes(memslot); | ||
3774 | |||
3775 | dirty_bitmap_buffer = dirty_bitmap + n / sizeof(long); | ||
3776 | memset(dirty_bitmap_buffer, 0, n); | ||
3777 | |||
3778 | spin_lock(&kvm->mmu_lock); | ||
3779 | |||
3780 | for (i = 0; i < n / sizeof(long); i++) { | ||
3781 | unsigned long mask; | ||
3782 | gfn_t offset; | ||
3783 | |||
3784 | if (!dirty_bitmap[i]) | ||
3785 | continue; | ||
3786 | |||
3787 | is_dirty = true; | ||
3788 | |||
3789 | mask = xchg(&dirty_bitmap[i], 0); | ||
3790 | dirty_bitmap_buffer[i] = mask; | ||
3791 | |||
3792 | offset = i * BITS_PER_LONG; | ||
3793 | kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, mask); | ||
3794 | } | ||
3795 | |||
3796 | spin_unlock(&kvm->mmu_lock); | ||
3797 | 3789 | ||
3798 | /* See the comments in kvm_mmu_slot_remove_write_access(). */ | 3790 | r = kvm_get_dirty_log_protect(kvm, log, &is_dirty); |
3799 | lockdep_assert_held(&kvm->slots_lock); | ||
3800 | 3791 | ||
3801 | /* | 3792 | /* |
3802 | * All the TLBs can be flushed out of mmu lock, see the comments in | 3793 | * All the TLBs can be flushed out of mmu lock, see the comments in |
3803 | * kvm_mmu_slot_remove_write_access(). | 3794 | * kvm_mmu_slot_remove_write_access(). |
3804 | */ | 3795 | */ |
3796 | lockdep_assert_held(&kvm->slots_lock); | ||
3805 | if (is_dirty) | 3797 | if (is_dirty) |
3806 | kvm_flush_remote_tlbs(kvm); | 3798 | kvm_flush_remote_tlbs(kvm); |
3807 | 3799 | ||
3808 | r = -EFAULT; | ||
3809 | if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n)) | ||
3810 | goto out; | ||
3811 | |||
3812 | r = 0; | ||
3813 | out: | ||
3814 | mutex_unlock(&kvm->slots_lock); | 3800 | mutex_unlock(&kvm->slots_lock); |
3815 | return r; | 3801 | return r; |
3816 | } | 3802 | } |
@@ -4516,6 +4502,8 @@ int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr, | |||
4516 | if (rc != X86EMUL_CONTINUE) | 4502 | if (rc != X86EMUL_CONTINUE) |
4517 | return rc; | 4503 | return rc; |
4518 | addr += now; | 4504 | addr += now; |
4505 | if (ctxt->mode != X86EMUL_MODE_PROT64) | ||
4506 | addr = (u32)addr; | ||
4519 | val += now; | 4507 | val += now; |
4520 | bytes -= now; | 4508 | bytes -= now; |
4521 | } | 4509 | } |
@@ -4984,6 +4972,11 @@ static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg, ulon | |||
4984 | kvm_register_write(emul_to_vcpu(ctxt), reg, val); | 4972 | kvm_register_write(emul_to_vcpu(ctxt), reg, val); |
4985 | } | 4973 | } |
4986 | 4974 | ||
4975 | static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked) | ||
4976 | { | ||
4977 | kvm_x86_ops->set_nmi_mask(emul_to_vcpu(ctxt), masked); | ||
4978 | } | ||
4979 | |||
4987 | static const struct x86_emulate_ops emulate_ops = { | 4980 | static const struct x86_emulate_ops emulate_ops = { |
4988 | .read_gpr = emulator_read_gpr, | 4981 | .read_gpr = emulator_read_gpr, |
4989 | .write_gpr = emulator_write_gpr, | 4982 | .write_gpr = emulator_write_gpr, |
@@ -5019,6 +5012,7 @@ static const struct x86_emulate_ops emulate_ops = { | |||
5019 | .put_fpu = emulator_put_fpu, | 5012 | .put_fpu = emulator_put_fpu, |
5020 | .intercept = emulator_intercept, | 5013 | .intercept = emulator_intercept, |
5021 | .get_cpuid = emulator_get_cpuid, | 5014 | .get_cpuid = emulator_get_cpuid, |
5015 | .set_nmi_mask = emulator_set_nmi_mask, | ||
5022 | }; | 5016 | }; |
5023 | 5017 | ||
5024 | static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) | 5018 | static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) |
@@ -6311,6 +6305,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) | |||
6311 | } | 6305 | } |
6312 | 6306 | ||
6313 | trace_kvm_entry(vcpu->vcpu_id); | 6307 | trace_kvm_entry(vcpu->vcpu_id); |
6308 | wait_lapic_expire(vcpu); | ||
6314 | kvm_x86_ops->run(vcpu); | 6309 | kvm_x86_ops->run(vcpu); |
6315 | 6310 | ||
6316 | /* | 6311 | /* |
@@ -7041,15 +7036,13 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) | |||
7041 | return r; | 7036 | return r; |
7042 | } | 7037 | } |
7043 | 7038 | ||
7044 | int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) | 7039 | void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) |
7045 | { | 7040 | { |
7046 | int r; | ||
7047 | struct msr_data msr; | 7041 | struct msr_data msr; |
7048 | struct kvm *kvm = vcpu->kvm; | 7042 | struct kvm *kvm = vcpu->kvm; |
7049 | 7043 | ||
7050 | r = vcpu_load(vcpu); | 7044 | if (vcpu_load(vcpu)) |
7051 | if (r) | 7045 | return; |
7052 | return r; | ||
7053 | msr.data = 0x0; | 7046 | msr.data = 0x0; |
7054 | msr.index = MSR_IA32_TSC; | 7047 | msr.index = MSR_IA32_TSC; |
7055 | msr.host_initiated = true; | 7048 | msr.host_initiated = true; |
@@ -7058,8 +7051,6 @@ int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) | |||
7058 | 7051 | ||
7059 | schedule_delayed_work(&kvm->arch.kvmclock_sync_work, | 7052 | schedule_delayed_work(&kvm->arch.kvmclock_sync_work, |
7060 | KVMCLOCK_SYNC_PERIOD); | 7053 | KVMCLOCK_SYNC_PERIOD); |
7061 | |||
7062 | return r; | ||
7063 | } | 7054 | } |
7064 | 7055 | ||
7065 | void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) | 7056 | void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) |
@@ -7549,12 +7540,62 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, | |||
7549 | return 0; | 7540 | return 0; |
7550 | } | 7541 | } |
7551 | 7542 | ||
7543 | static void kvm_mmu_slot_apply_flags(struct kvm *kvm, | ||
7544 | struct kvm_memory_slot *new) | ||
7545 | { | ||
7546 | /* Still write protect RO slot */ | ||
7547 | if (new->flags & KVM_MEM_READONLY) { | ||
7548 | kvm_mmu_slot_remove_write_access(kvm, new); | ||
7549 | return; | ||
7550 | } | ||
7551 | |||
7552 | /* | ||
7553 | * Call kvm_x86_ops dirty logging hooks when they are valid. | ||
7554 | * | ||
7555 | * kvm_x86_ops->slot_disable_log_dirty is called when: | ||
7556 | * | ||
7557 | * - KVM_MR_CREATE with dirty logging is disabled | ||
7558 | * - KVM_MR_FLAGS_ONLY with dirty logging is disabled in new flag | ||
7559 | * | ||
7560 | * The reason is, in case of PML, we need to set D-bit for any slots | ||
7561 | * with dirty logging disabled in order to eliminate unnecessary GPA | ||
7562 | * logging in PML buffer (and potential PML buffer full VMEXIT). This | ||
7563 | * guarantees leaving PML enabled during guest's lifetime won't have | ||
7564 | * any additional overhead from PML when guest is running with dirty | ||
7565 | * logging disabled for memory slots. | ||
7566 | * | ||
7567 | * kvm_x86_ops->slot_enable_log_dirty is called when switching new slot | ||
7568 | * to dirty logging mode. | ||
7569 | * | ||
7570 | * If kvm_x86_ops dirty logging hooks are invalid, use write protect. | ||
7571 | * | ||
7572 | * In case of write protect: | ||
7573 | * | ||
7574 | * Write protect all pages for dirty logging. | ||
7575 | * | ||
7576 | * All the sptes including the large sptes which point to this | ||
7577 | * slot are set to readonly. We can not create any new large | ||
7578 | * spte on this slot until the end of the logging. | ||
7579 | * | ||
7580 | * See the comments in fast_page_fault(). | ||
7581 | */ | ||
7582 | if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) { | ||
7583 | if (kvm_x86_ops->slot_enable_log_dirty) | ||
7584 | kvm_x86_ops->slot_enable_log_dirty(kvm, new); | ||
7585 | else | ||
7586 | kvm_mmu_slot_remove_write_access(kvm, new); | ||
7587 | } else { | ||
7588 | if (kvm_x86_ops->slot_disable_log_dirty) | ||
7589 | kvm_x86_ops->slot_disable_log_dirty(kvm, new); | ||
7590 | } | ||
7591 | } | ||
7592 | |||
7552 | void kvm_arch_commit_memory_region(struct kvm *kvm, | 7593 | void kvm_arch_commit_memory_region(struct kvm *kvm, |
7553 | struct kvm_userspace_memory_region *mem, | 7594 | struct kvm_userspace_memory_region *mem, |
7554 | const struct kvm_memory_slot *old, | 7595 | const struct kvm_memory_slot *old, |
7555 | enum kvm_mr_change change) | 7596 | enum kvm_mr_change change) |
7556 | { | 7597 | { |
7557 | 7598 | struct kvm_memory_slot *new; | |
7558 | int nr_mmu_pages = 0; | 7599 | int nr_mmu_pages = 0; |
7559 | 7600 | ||
7560 | if ((mem->slot >= KVM_USER_MEM_SLOTS) && (change == KVM_MR_DELETE)) { | 7601 | if ((mem->slot >= KVM_USER_MEM_SLOTS) && (change == KVM_MR_DELETE)) { |
@@ -7573,17 +7614,20 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, | |||
7573 | 7614 | ||
7574 | if (nr_mmu_pages) | 7615 | if (nr_mmu_pages) |
7575 | kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); | 7616 | kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); |
7617 | |||
7618 | /* It's OK to get 'new' slot here as it has already been installed */ | ||
7619 | new = id_to_memslot(kvm->memslots, mem->slot); | ||
7620 | |||
7576 | /* | 7621 | /* |
7577 | * Write protect all pages for dirty logging. | 7622 | * Set up write protection and/or dirty logging for the new slot. |
7578 | * | 7623 | * |
7579 | * All the sptes including the large sptes which point to this | 7624 | * For KVM_MR_DELETE and KVM_MR_MOVE, the shadow pages of old slot have |
7580 | * slot are set to readonly. We can not create any new large | 7625 | * been zapped so no dirty logging stuff is needed for old slot. For |
7581 | * spte on this slot until the end of the logging. | 7626 | * KVM_MR_FLAGS_ONLY, the old slot is essentially the same one as the |
7582 | * | 7627 | * new and it's also covered when dealing with the new slot. |
7583 | * See the comments in fast_page_fault(). | ||
7584 | */ | 7628 | */ |
7585 | if ((change != KVM_MR_DELETE) && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES)) | 7629 | if (change != KVM_MR_DELETE) |
7586 | kvm_mmu_slot_remove_write_access(kvm, mem->slot); | 7630 | kvm_mmu_slot_apply_flags(kvm, new); |
7587 | } | 7631 | } |
7588 | 7632 | ||
7589 | void kvm_arch_flush_shadow_all(struct kvm *kvm) | 7633 | void kvm_arch_flush_shadow_all(struct kvm *kvm) |
@@ -7837,3 +7881,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit); | |||
7837 | EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts); | 7881 | EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts); |
7838 | EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset); | 7882 | EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset); |
7839 | EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window); | 7883 | EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window); |
7884 | EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full); | ||
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index cc1d61af6140..f5fef1868096 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h | |||
@@ -147,6 +147,7 @@ static inline void kvm_register_writel(struct kvm_vcpu *vcpu, | |||
147 | 147 | ||
148 | void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); | 148 | void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); |
149 | void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); | 149 | void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); |
150 | void kvm_set_pending_timer(struct kvm_vcpu *vcpu); | ||
150 | int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); | 151 | int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); |
151 | 152 | ||
152 | void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr); | 153 | void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr); |
@@ -170,5 +171,7 @@ extern u64 kvm_supported_xcr0(void); | |||
170 | 171 | ||
171 | extern unsigned int min_timer_period_us; | 172 | extern unsigned int min_timer_period_us; |
172 | 173 | ||
174 | extern unsigned int lapic_timer_advance_ns; | ||
175 | |||
173 | extern struct static_key kvm_no_apic_vcpu; | 176 | extern struct static_key kvm_no_apic_vcpu; |
174 | #endif | 177 | #endif |