path: root/arch/x86/kvm
author		Linus Torvalds <torvalds@linux-foundation.org>	2015-02-13 12:55:09 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2015-02-13 12:55:09 -0500
commit		b9085bcbf5f43adf60533f9b635b2e7faeed0fe9 (patch)
tree		e397abf5682a45c096e75b3d0fa99c8e228425fc /arch/x86/kvm
parent		c7d7b98671552abade78834c522b7308bda73c0d (diff)
parent		6557bada461afeaa920a189fae2cff7c8fdce39f (diff)
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull KVM update from Paolo Bonzini:
 "Fairly small update, but there are some interesting new features.

  Common:
     Optional support for adding a small amount of polling on each HLT
     instruction executed in the guest (or equivalent for other
     architectures). This can improve latency up to 50% on some
     scenarios (e.g. O_DSYNC writes or TCP_RR netperf tests). This also
     has to be enabled manually for now, but the plan is to auto-tune
     this in the future.

  ARM/ARM64:
     The highlights are support for GICv3 emulation and dirty page
     tracking

  s390:
     Several optimizations and bugfixes. Also a first: a feature
     exposed by KVM (UUID and long guest name in /proc/sysinfo) before
     it is available in IBM's hypervisor! :)

  MIPS:
     Bugfixes.

  x86:
     Support for PML (page modification logging, a new feature in
     Broadwell Xeons that speeds up dirty page tracking), nested
     virtualization improvements (nested APICv---a nice optimization),
     usual round of emulation fixes.

     There is also a new option to reduce latency of the TSC deadline
     timer in the guest; this needs to be tuned manually.

     Some commits are common between this pull and Catalin's; I see you
     have already included his tree.

  Powerpc:
     Nothing yet. The KVM/PPC changes will come in through the PPC
     maintainers, because I haven't received them yet and I might end
     up being offline for some part of next week"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (130 commits)
  KVM: ia64: drop kvm.h from installed user headers
  KVM: x86: fix build with !CONFIG_SMP
  KVM: x86: emulate: correct page fault error code for NoWrite instructions
  KVM: Disable compat ioctl for s390
  KVM: s390: add cpu model support
  KVM: s390: use facilities and cpu_id per KVM
  KVM: s390/CPACF: Choose crypto control block format
  s390/kernel: Update /proc/sysinfo file with Extended Name and UUID
  KVM: s390: reenable LPP facility
  KVM: s390: floating irqs: fix user triggerable endless loop
  kvm: add halt_poll_ns module parameter
  kvm: remove KVM_MMIO_SIZE
  KVM: MIPS: Don't leak FPU/DSP to guest
  KVM: MIPS: Disable HTW while in guest
  KVM: nVMX: Enable nested posted interrupt processing
  KVM: nVMX: Enable nested virtual interrupt delivery
  KVM: nVMX: Enable nested apic register virtualization
  KVM: nVMX: Make nested control MSRs per-cpu
  KVM: nVMX: Enable nested virtualize x2apic mode
  KVM: nVMX: Prepare for using hardware MSR bitmap
  ...
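Editor's note (not part of the commit): the two latency features called out above -- HLT polling and the reduced-latency TSC deadline timer -- ship disabled and, per the shortlog, are controlled by module parameters of kvm.ko (halt_poll_ns; the timer advance is tuned via lapic_timer_advance_ns, used in the lapic.c hunks below). The sketch that follows is a hypothetical host-side helper, not code from this merge; it assumes those parameters are exposed under the usual /sys/module/kvm/parameters/ paths and take nanosecond values.

/*
 * Hypothetical illustration only -- not part of this merge.  Assumes the new
 * knobs appear as writable kvm.ko module parameters in sysfs:
 *   /sys/module/kvm/parameters/halt_poll_ns
 *   /sys/module/kvm/parameters/lapic_timer_advance_ns
 */
#include <stdio.h>

static int write_param(const char *path, long ns)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return -1;
	}
	/* Module parameters accept a plain decimal string. */
	fprintf(f, "%ld\n", ns);
	return fclose(f);
}

int main(void)
{
	/* Poll for up to 500 us after a guest HLT before scheduling out. */
	write_param("/sys/module/kvm/parameters/halt_poll_ns", 500000);
	/* Arm the emulated TSC deadline timer ~1 us early, then busy-wait. */
	write_param("/sys/module/kvm/parameters/lapic_timer_advance_ns", 1000);
	return 0;
}

The same values can also be passed at module load time; as the commit message notes, both mechanisms stay off unless enabled and tuned by hand.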
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r--	arch/x86/kvm/Kconfig	1
-rw-r--r--	arch/x86/kvm/emulate.c	230
-rw-r--r--	arch/x86/kvm/ioapic.h	2
-rw-r--r--	arch/x86/kvm/iommu.c	4
-rw-r--r--	arch/x86/kvm/lapic.c	147
-rw-r--r--	arch/x86/kvm/lapic.h	6
-rw-r--r--	arch/x86/kvm/mmu.c	351
-rw-r--r--	arch/x86/kvm/mmu.h	17
-rw-r--r--	arch/x86/kvm/svm.c	4
-rw-r--r--	arch/x86/kvm/trace.h	38
-rw-r--r--	arch/x86/kvm/vmx.c	1086
-rw-r--r--	arch/x86/kvm/x86.c	209
-rw-r--r--	arch/x86/kvm/x86.h	3
13 files changed, 1673 insertions, 425 deletions
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 7dc7ba577ecd..413a7bf9efbb 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -39,6 +39,7 @@ config KVM
 	select PERF_EVENTS
 	select HAVE_KVM_MSI
 	select HAVE_KVM_CPU_RELAX_INTERCEPT
+	select KVM_GENERIC_DIRTYLOG_READ_PROTECT
 	select KVM_VFIO
 	select SRCU
 	---help---
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index de12c1d379f1..e0b794a84c35 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -86,6 +86,7 @@
 #define DstAcc (OpAcc << DstShift)
 #define DstDI (OpDI << DstShift)
 #define DstMem64 (OpMem64 << DstShift)
+#define DstMem16 (OpMem16 << DstShift)
 #define DstImmUByte (OpImmUByte << DstShift)
 #define DstDX (OpDX << DstShift)
 #define DstAccLo (OpAccLo << DstShift)
@@ -124,6 +125,7 @@
 #define RMExt (4<<15) /* Opcode extension in ModRM r/m if mod == 3 */
 #define Escape (5<<15) /* Escape to coprocessor instruction */
 #define InstrDual (6<<15) /* Alternate instruction decoding of mod == 3 */
+#define ModeDual (7<<15) /* Different instruction for 32/64 bit */
 #define Sse (1<<18) /* SSE Vector instruction */
 /* Generic ModRM decode. */
 #define ModRM (1<<19)
@@ -165,10 +167,10 @@
 #define NoMod ((u64)1 << 47) /* Mod field is ignored */
 #define Intercept ((u64)1 << 48) /* Has valid intercept field */
 #define CheckPerm ((u64)1 << 49) /* Has valid check_perm field */
-#define NoBigReal ((u64)1 << 50) /* No big real mode */
 #define PrivUD ((u64)1 << 51) /* #UD instead of #GP on CPL > 0 */
 #define NearBranch ((u64)1 << 52) /* Near branches */
 #define No16 ((u64)1 << 53) /* No 16 bit operand */
+#define IncSP ((u64)1 << 54) /* SP is incremented before ModRM calc */
 
 #define DstXacc (DstAccLo | SrcAccHi | SrcWrite)
 
@@ -213,6 +215,7 @@ struct opcode {
 		const struct gprefix *gprefix;
 		const struct escape *esc;
 		const struct instr_dual *idual;
+		const struct mode_dual *mdual;
 		void (*fastop)(struct fastop *fake);
 	} u;
 	int (*check_perm)(struct x86_emulate_ctxt *ctxt);
@@ -240,6 +243,11 @@ struct instr_dual {
 	struct opcode mod3;
 };
 
+struct mode_dual {
+	struct opcode mode32;
+	struct opcode mode64;
+};
+
 /* EFLAGS bit definitions. */
 #define EFLG_ID (1<<21)
 #define EFLG_VIP (1<<20)
@@ -262,6 +270,13 @@ struct instr_dual {
 #define EFLG_RESERVED_ZEROS_MASK 0xffc0802a
 #define EFLG_RESERVED_ONE_MASK 2
 
+enum x86_transfer_type {
+	X86_TRANSFER_NONE,
+	X86_TRANSFER_CALL_JMP,
+	X86_TRANSFER_RET,
+	X86_TRANSFER_TASK_SWITCH,
+};
+
 static ulong reg_read(struct x86_emulate_ctxt *ctxt, unsigned nr)
 {
 	if (!(ctxt->regs_valid & (1 << nr))) {
@@ -669,9 +684,13 @@ static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt,
 		}
 		if (addr.ea > lim)
 			goto bad;
-		*max_size = min_t(u64, ~0u, (u64)lim + 1 - addr.ea);
-		if (size > *max_size)
-			goto bad;
+		if (lim == 0xffffffff)
+			*max_size = ~0u;
+		else {
+			*max_size = (u64)lim + 1 - addr.ea;
+			if (size > *max_size)
+				goto bad;
+		}
 		la &= (u32)-1;
 		break;
 	}
@@ -722,19 +741,26 @@ static int assign_eip_far(struct x86_emulate_ctxt *ctxt, ulong dst,
 			  const struct desc_struct *cs_desc)
 {
 	enum x86emul_mode mode = ctxt->mode;
+	int rc;
 
 #ifdef CONFIG_X86_64
-	if (ctxt->mode >= X86EMUL_MODE_PROT32 && cs_desc->l) {
-		u64 efer = 0;
+	if (ctxt->mode >= X86EMUL_MODE_PROT16) {
+		if (cs_desc->l) {
+			u64 efer = 0;
 
 			ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
 			if (efer & EFER_LMA)
 				mode = X86EMUL_MODE_PROT64;
+		} else
+			mode = X86EMUL_MODE_PROT32; /* temporary value */
 	}
 #endif
 	if (mode == X86EMUL_MODE_PROT16 || mode == X86EMUL_MODE_PROT32)
 		mode = cs_desc->d ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
-	return assign_eip(ctxt, dst, mode);
+	rc = assign_eip(ctxt, dst, mode);
+	if (rc == X86EMUL_CONTINUE)
+		ctxt->mode = mode;
+	return rc;
 }
 
 static inline int jmp_rel(struct x86_emulate_ctxt *ctxt, int rel)
@@ -1057,8 +1083,6 @@ static int em_fnstcw(struct x86_emulate_ctxt *ctxt)
 	asm volatile("fnstcw %0": "+m"(fcw));
 	ctxt->ops->put_fpu(ctxt);
 
-	/* force 2 byte destination */
-	ctxt->dst.bytes = 2;
 	ctxt->dst.val = fcw;
 
 	return X86EMUL_CONTINUE;
@@ -1075,8 +1099,6 @@ static int em_fnstsw(struct x86_emulate_ctxt *ctxt)
 	asm volatile("fnstsw %0": "+m"(fsw));
 	ctxt->ops->put_fpu(ctxt);
 
-	/* force 2 byte destination */
-	ctxt->dst.bytes = 2;
 	ctxt->dst.val = fsw;
 
 	return X86EMUL_CONTINUE;
@@ -1223,6 +1245,10 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
 		else {
 			modrm_ea += reg_read(ctxt, base_reg);
 			adjust_modrm_seg(ctxt, base_reg);
+			/* Increment ESP on POP [ESP] */
+			if ((ctxt->d & IncSP) &&
+			    base_reg == VCPU_REGS_RSP)
+				modrm_ea += ctxt->op_bytes;
 		}
 		if (index_reg != 4)
 			modrm_ea += reg_read(ctxt, index_reg) << scale;
@@ -1435,10 +1461,8 @@ static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
 		ops->get_gdt(ctxt, dt);
 }
 
-/* allowed just for 8 bytes segments */
-static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
-				   u16 selector, struct desc_struct *desc,
-				   ulong *desc_addr_p)
+static int get_descriptor_ptr(struct x86_emulate_ctxt *ctxt,
+			      u16 selector, ulong *desc_addr_p)
 {
 	struct desc_ptr dt;
 	u16 index = selector >> 3;
@@ -1449,8 +1473,34 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 	if (dt.size < index * 8 + 7)
 		return emulate_gp(ctxt, selector & 0xfffc);
 
-	*desc_addr_p = addr = dt.address + index * 8;
-	return ctxt->ops->read_std(ctxt, addr, desc, sizeof *desc,
+	addr = dt.address + index * 8;
+
+#ifdef CONFIG_X86_64
+	if (addr >> 32 != 0) {
+		u64 efer = 0;
+
+		ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
+		if (!(efer & EFER_LMA))
+			addr &= (u32)-1;
+	}
+#endif
+
+	*desc_addr_p = addr;
+	return X86EMUL_CONTINUE;
+}
+
+/* allowed just for 8 bytes segments */
+static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
+				   u16 selector, struct desc_struct *desc,
+				   ulong *desc_addr_p)
+{
+	int rc;
+
+	rc = get_descriptor_ptr(ctxt, selector, desc_addr_p);
+	if (rc != X86EMUL_CONTINUE)
+		return rc;
+
+	return ctxt->ops->read_std(ctxt, *desc_addr_p, desc, sizeof(*desc),
 				   &ctxt->exception);
 }
 
@@ -1458,16 +1508,13 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 				    u16 selector, struct desc_struct *desc)
 {
-	struct desc_ptr dt;
-	u16 index = selector >> 3;
+	int rc;
 	ulong addr;
 
-	get_descriptor_table_ptr(ctxt, selector, &dt);
-
-	if (dt.size < index * 8 + 7)
-		return emulate_gp(ctxt, selector & 0xfffc);
+	rc = get_descriptor_ptr(ctxt, selector, &addr);
+	if (rc != X86EMUL_CONTINUE)
+		return rc;
 
-	addr = dt.address + index * 8;
 	return ctxt->ops->write_std(ctxt, addr, desc, sizeof *desc,
 				    &ctxt->exception);
 }
@@ -1475,7 +1522,7 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 /* Does not support long mode */
 static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 				     u16 selector, int seg, u8 cpl,
-				     bool in_task_switch,
+				     enum x86_transfer_type transfer,
 				     struct desc_struct *desc)
 {
 	struct desc_struct seg_desc, old_desc;
@@ -1529,11 +1576,15 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 		return ret;
 
 	err_code = selector & 0xfffc;
-	err_vec = in_task_switch ? TS_VECTOR : GP_VECTOR;
+	err_vec = (transfer == X86_TRANSFER_TASK_SWITCH) ? TS_VECTOR :
+							   GP_VECTOR;
 
 	/* can't load system descriptor into segment selector */
-	if (seg <= VCPU_SREG_GS && !seg_desc.s)
+	if (seg <= VCPU_SREG_GS && !seg_desc.s) {
+		if (transfer == X86_TRANSFER_CALL_JMP)
+			return X86EMUL_UNHANDLEABLE;
 		goto exception;
+	}
 
 	if (!seg_desc.p) {
 		err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR;
@@ -1605,10 +1656,13 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 
 	if (seg_desc.s) {
 		/* mark segment as accessed */
-		seg_desc.type |= 1;
-		ret = write_segment_descriptor(ctxt, selector, &seg_desc);
-		if (ret != X86EMUL_CONTINUE)
-			return ret;
+		if (!(seg_desc.type & 1)) {
+			seg_desc.type |= 1;
+			ret = write_segment_descriptor(ctxt, selector,
+						       &seg_desc);
+			if (ret != X86EMUL_CONTINUE)
+				return ret;
+		}
 	} else if (ctxt->mode == X86EMUL_MODE_PROT64) {
 		ret = ctxt->ops->read_std(ctxt, desc_addr+8, &base3,
 				sizeof(base3), &ctxt->exception);
@@ -1631,7 +1685,8 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 				   u16 selector, int seg)
 {
 	u8 cpl = ctxt->ops->cpl(ctxt);
-	return __load_segment_descriptor(ctxt, selector, seg, cpl, false, NULL);
+	return __load_segment_descriptor(ctxt, selector, seg, cpl,
+					 X86_TRANSFER_NONE, NULL);
 }
 
 static void write_register_operand(struct operand *op)
@@ -1828,12 +1883,14 @@ static int em_pop_sreg(struct x86_emulate_ctxt *ctxt)
 	unsigned long selector;
 	int rc;
 
-	rc = emulate_pop(ctxt, &selector, ctxt->op_bytes);
+	rc = emulate_pop(ctxt, &selector, 2);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 
 	if (ctxt->modrm_reg == VCPU_SREG_SS)
 		ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS;
+	if (ctxt->op_bytes > 2)
+		rsp_increment(ctxt, ctxt->op_bytes - 2);
 
 	rc = load_segment_descriptor(ctxt, (u16)selector, seg);
 	return rc;
@@ -2007,6 +2064,7 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt)
 
 	ctxt->eflags &= ~EFLG_RESERVED_ZEROS_MASK; /* Clear reserved zeros */
 	ctxt->eflags |= EFLG_RESERVED_ONE_MASK;
+	ctxt->ops->set_nmi_mask(ctxt, false);
 
 	return rc;
 }
@@ -2041,7 +2099,8 @@ static int em_jmp_far(struct x86_emulate_ctxt *ctxt)
 
 	memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2);
 
-	rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl, false,
+	rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl,
+				       X86_TRANSFER_CALL_JMP,
 				       &new_desc);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
@@ -2130,7 +2189,8 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt)
 	/* Outer-privilege level return is not implemented */
 	if (ctxt->mode >= X86EMUL_MODE_PROT16 && (cs & 3) > cpl)
 		return X86EMUL_UNHANDLEABLE;
-	rc = __load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS, cpl, false,
+	rc = __load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS, cpl,
+				       X86_TRANSFER_RET,
 				       &new_desc);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
@@ -2163,12 +2223,15 @@ static int em_cmpxchg(struct x86_emulate_ctxt *ctxt)
 	fastop(ctxt, em_cmp);
 
 	if (ctxt->eflags & EFLG_ZF) {
-		/* Success: write back to memory. */
+		/* Success: write back to memory; no update of EAX */
+		ctxt->src.type = OP_NONE;
 		ctxt->dst.val = ctxt->src.orig_val;
 	} else {
 		/* Failure: write the value we saw to EAX. */
-		ctxt->dst.type = OP_REG;
-		ctxt->dst.addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX);
+		ctxt->src.type = OP_REG;
+		ctxt->src.addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX);
+		ctxt->src.val = ctxt->dst.orig_val;
+		/* Create write-cycle to dest by writing the same value */
 		ctxt->dst.val = ctxt->dst.orig_val;
 	}
 	return X86EMUL_CONTINUE;
@@ -2556,23 +2619,23 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
 	 * it is handled in a context of new task
 	 */
 	ret = __load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR, cpl,
-					true, NULL);
+					X86_TRANSFER_TASK_SWITCH, NULL);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
 	ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl,
-					true, NULL);
+					X86_TRANSFER_TASK_SWITCH, NULL);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
 	ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl,
-					true, NULL);
+					X86_TRANSFER_TASK_SWITCH, NULL);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
 	ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl,
-					true, NULL);
+					X86_TRANSFER_TASK_SWITCH, NULL);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
 	ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl,
-					true, NULL);
+					X86_TRANSFER_TASK_SWITCH, NULL);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
 
@@ -2694,31 +2757,31 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
 	 * it is handled in a context of new task
 	 */
 	ret = __load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR,
-					cpl, true, NULL);
+					cpl, X86_TRANSFER_TASK_SWITCH, NULL);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
 	ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl,
-					true, NULL);
+					X86_TRANSFER_TASK_SWITCH, NULL);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
 	ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl,
-					true, NULL);
+					X86_TRANSFER_TASK_SWITCH, NULL);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
 	ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl,
-					true, NULL);
+					X86_TRANSFER_TASK_SWITCH, NULL);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
 	ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl,
-					true, NULL);
+					X86_TRANSFER_TASK_SWITCH, NULL);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
 	ret = __load_segment_descriptor(ctxt, tss->fs, VCPU_SREG_FS, cpl,
-					true, NULL);
+					X86_TRANSFER_TASK_SWITCH, NULL);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
 	ret = __load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS, cpl,
-					true, NULL);
+					X86_TRANSFER_TASK_SWITCH, NULL);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
 
@@ -2739,7 +2802,6 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
 	ret = ops->read_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg,
 			    &ctxt->exception);
 	if (ret != X86EMUL_CONTINUE)
-		/* FIXME: need to provide precise fault address */
 		return ret;
 
 	save_state_to_tss32(ctxt, &tss_seg);
@@ -2748,13 +2810,11 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
 	ret = ops->write_std(ctxt, old_tss_base + eip_offset, &tss_seg.eip,
 			     ldt_sel_offset - eip_offset, &ctxt->exception);
 	if (ret != X86EMUL_CONTINUE)
-		/* FIXME: need to provide precise fault address */
 		return ret;
 
 	ret = ops->read_std(ctxt, new_tss_base, &tss_seg, sizeof tss_seg,
 			    &ctxt->exception);
 	if (ret != X86EMUL_CONTINUE)
-		/* FIXME: need to provide precise fault address */
 		return ret;
 
 	if (old_tss_sel != 0xffff) {
@@ -2765,7 +2825,6 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
 				  sizeof tss_seg.prev_task_link,
 				  &ctxt->exception);
 		if (ret != X86EMUL_CONTINUE)
-			/* FIXME: need to provide precise fault address */
 			return ret;
 	}
 
@@ -2999,15 +3058,16 @@ static int em_call_far(struct x86_emulate_ctxt *ctxt)
 	struct desc_struct old_desc, new_desc;
 	const struct x86_emulate_ops *ops = ctxt->ops;
 	int cpl = ctxt->ops->cpl(ctxt);
+	enum x86emul_mode prev_mode = ctxt->mode;
 
 	old_eip = ctxt->_eip;
 	ops->get_segment(ctxt, &old_cs, &old_desc, NULL, VCPU_SREG_CS);
 
 	memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2);
-	rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl, false,
-				       &new_desc);
+	rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl,
+				       X86_TRANSFER_CALL_JMP, &new_desc);
 	if (rc != X86EMUL_CONTINUE)
-		return X86EMUL_CONTINUE;
+		return rc;
 
 	rc = assign_eip_far(ctxt, ctxt->src.val, &new_desc);
 	if (rc != X86EMUL_CONTINUE)
@@ -3022,11 +3082,14 @@ static int em_call_far(struct x86_emulate_ctxt *ctxt)
 	rc = em_push(ctxt);
 	/* If we failed, we tainted the memory, but the very least we should
 	   restore cs */
-	if (rc != X86EMUL_CONTINUE)
+	if (rc != X86EMUL_CONTINUE) {
+		pr_warn_once("faulting far call emulation tainted memory\n");
 		goto fail;
+	}
 	return rc;
 fail:
 	ops->set_segment(ctxt, old_cs, &old_desc, 0, VCPU_SREG_CS);
+	ctxt->mode = prev_mode;
 	return rc;
 
 }
@@ -3477,6 +3540,12 @@ static int em_clflush(struct x86_emulate_ctxt *ctxt)
 	return X86EMUL_CONTINUE;
 }
 
+static int em_movsxd(struct x86_emulate_ctxt *ctxt)
+{
+	ctxt->dst.val = (s32) ctxt->src.val;
+	return X86EMUL_CONTINUE;
+}
+
 static bool valid_cr(int nr)
 {
 	switch (nr) {
@@ -3676,6 +3745,7 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt)
 #define G(_f, _g) { .flags = ((_f) | Group | ModRM), .u.group = (_g) }
 #define GD(_f, _g) { .flags = ((_f) | GroupDual | ModRM), .u.gdual = (_g) }
 #define ID(_f, _i) { .flags = ((_f) | InstrDual | ModRM), .u.idual = (_i) }
+#define MD(_f, _m) { .flags = ((_f) | ModeDual), .u.mdual = (_m) }
 #define E(_f, _e) { .flags = ((_f) | Escape | ModRM), .u.esc = (_e) }
 #define I(_f, _e) { .flags = (_f), .u.execute = (_e) }
 #define F(_f, _e) { .flags = (_f) | Fastop, .u.fastop = (_e) }
@@ -3738,7 +3808,7 @@ static const struct opcode group1[] = {
 };
 
 static const struct opcode group1A[] = {
-	I(DstMem | SrcNone | Mov | Stack, em_pop), N, N, N, N, N, N, N,
+	I(DstMem | SrcNone | Mov | Stack | IncSP, em_pop), N, N, N, N, N, N, N,
 };
 
 static const struct opcode group2[] = {
@@ -3854,7 +3924,7 @@ static const struct gprefix pfx_0f_e7 = {
 };
 
 static const struct escape escape_d9 = { {
-	N, N, N, N, N, N, N, I(DstMem, em_fnstcw),
+	N, N, N, N, N, N, N, I(DstMem16 | Mov, em_fnstcw),
 }, {
 	/* 0xC0 - 0xC7 */
 	N, N, N, N, N, N, N, N,
@@ -3896,7 +3966,7 @@ static const struct escape escape_db = { {
 } };
 
 static const struct escape escape_dd = { {
-	N, N, N, N, N, N, N, I(DstMem, em_fnstsw),
+	N, N, N, N, N, N, N, I(DstMem16 | Mov, em_fnstsw),
}, {
 	/* 0xC0 - 0xC7 */
 	N, N, N, N, N, N, N, N,
@@ -3920,6 +3990,10 @@ static const struct instr_dual instr_dual_0f_c3 = {
 	I(DstMem | SrcReg | ModRM | No16 | Mov, em_mov), N
 };
 
+static const struct mode_dual mode_dual_63 = {
+	N, I(DstReg | SrcMem32 | ModRM | Mov, em_movsxd)
+};
+
 static const struct opcode opcode_table[256] = {
 	/* 0x00 - 0x07 */
 	F6ALU(Lock, em_add),
@@ -3954,7 +4028,7 @@ static const struct opcode opcode_table[256] = {
 	/* 0x60 - 0x67 */
 	I(ImplicitOps | Stack | No64, em_pusha),
 	I(ImplicitOps | Stack | No64, em_popa),
-	N, D(DstReg | SrcMem32 | ModRM | Mov) /* movsxd (x86/64) */ ,
+	N, MD(ModRM, &mode_dual_63),
 	N, N, N, N,
 	/* 0x68 - 0x6F */
 	I(SrcImm | Mov | Stack, em_push),
@@ -4010,8 +4084,8 @@ static const struct opcode opcode_table[256] = {
 	G(ByteOp, group11), G(0, group11),
 	/* 0xC8 - 0xCF */
 	I(Stack | SrcImmU16 | Src2ImmByte, em_enter), I(Stack, em_leave),
-	I(ImplicitOps | Stack | SrcImmU16, em_ret_far_imm),
-	I(ImplicitOps | Stack, em_ret_far),
+	I(ImplicitOps | SrcImmU16, em_ret_far_imm),
+	I(ImplicitOps, em_ret_far),
 	D(ImplicitOps), DI(SrcImmByte, intn),
 	D(ImplicitOps | No64), II(ImplicitOps, em_iret, iret),
 	/* 0xD0 - 0xD7 */
@@ -4108,7 +4182,7 @@ static const struct opcode twobyte_table[256] = {
 	F(DstMem | SrcReg | Src2CL | ModRM, em_shrd),
 	GD(0, &group15), F(DstReg | SrcMem | ModRM, em_imul),
 	/* 0xB0 - 0xB7 */
-	I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_cmpxchg),
+	I2bv(DstMem | SrcReg | ModRM | Lock | PageTable | SrcWrite, em_cmpxchg),
 	I(DstReg | SrcMemFAddr | ModRM | Src2SS, em_lseg),
 	F(DstMem | SrcReg | ModRM | BitOp | Lock, em_btr),
 	I(DstReg | SrcMemFAddr | ModRM | Src2FS, em_lseg),
@@ -4174,6 +4248,8 @@ static const struct opcode opcode_map_0f_38[256] = {
 #undef I
 #undef GP
 #undef EXT
+#undef MD
+#undef ID
 
 #undef D2bv
 #undef D2bvIP
@@ -4563,6 +4639,12 @@ done_prefixes:
 			else
 				opcode = opcode.u.idual->mod012;
 			break;
+		case ModeDual:
+			if (ctxt->mode == X86EMUL_MODE_PROT64)
+				opcode = opcode.u.mdual->mode64;
+			else
+				opcode = opcode.u.mdual->mode32;
+			break;
 		default:
 			return EMULATION_FAILED;
 		}
@@ -4860,8 +4942,13 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
 		/* optimisation - avoid slow emulated read if Mov */
 		rc = segmented_read(ctxt, ctxt->dst.addr.mem,
 				   &ctxt->dst.val, ctxt->dst.bytes);
-		if (rc != X86EMUL_CONTINUE)
+		if (rc != X86EMUL_CONTINUE) {
+			if (!(ctxt->d & NoWrite) &&
+			    rc == X86EMUL_PROPAGATE_FAULT &&
+			    ctxt->exception.vector == PF_VECTOR)
+				ctxt->exception.error_code |= PFERR_WRITE_MASK;
 			goto done;
+		}
 	}
 	ctxt->dst.orig_val = ctxt->dst.val;
 
@@ -4899,11 +4986,6 @@ special_insn:
 		goto threebyte_insn;
 
 	switch (ctxt->b) {
-	case 0x63:		/* movsxd */
-		if (ctxt->mode != X86EMUL_MODE_PROT64)
-			goto cannot_emulate;
-		ctxt->dst.val = (s32) ctxt->src.val;
-		break;
 	case 0x70 ... 0x7f: /* jcc (short) */
 		if (test_cc(ctxt->b, ctxt->eflags))
 			rc = jmp_rel(ctxt, ctxt->src.val);
diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h
index 3c9195535ffc..c2e36d934af4 100644
--- a/arch/x86/kvm/ioapic.h
+++ b/arch/x86/kvm/ioapic.h
@@ -98,7 +98,7 @@ static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm)
 }
 
 void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu);
-int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
+bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
 		int short_hand, unsigned int dest, int dest_mode);
 int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2);
 void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector,
diff --git a/arch/x86/kvm/iommu.c b/arch/x86/kvm/iommu.c
index 17b73eeac8a4..7dbced309ddb 100644
--- a/arch/x86/kvm/iommu.c
+++ b/arch/x86/kvm/iommu.c
@@ -138,7 +138,7 @@ int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
 
 		gfn += page_size >> PAGE_SHIFT;
 
-
+		cond_resched();
 	}
 
 	return 0;
@@ -306,6 +306,8 @@ static void kvm_iommu_put_pages(struct kvm *kvm,
 		kvm_unpin_pages(kvm, pfn, unmap_pages);
 
 		gfn += unmap_pages;
+
+		cond_resched();
 	}
 }
 
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index d52dcf0776ea..e55b5fc344eb 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -33,6 +33,7 @@
 #include <asm/page.h>
 #include <asm/current.h>
 #include <asm/apicdef.h>
+#include <asm/delay.h>
 #include <linux/atomic.h>
 #include <linux/jump_label.h>
 #include "kvm_cache_regs.h"
@@ -327,17 +328,24 @@ static u8 count_vectors(void *bitmap)
 	return count;
 }
 
-void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir)
+void __kvm_apic_update_irr(u32 *pir, void *regs)
 {
 	u32 i, pir_val;
-	struct kvm_lapic *apic = vcpu->arch.apic;
 
 	for (i = 0; i <= 7; i++) {
 		pir_val = xchg(&pir[i], 0);
 		if (pir_val)
-			*((u32 *)(apic->regs + APIC_IRR + i * 0x10)) |= pir_val;
+			*((u32 *)(regs + APIC_IRR + i * 0x10)) |= pir_val;
 	}
 }
+EXPORT_SYMBOL_GPL(__kvm_apic_update_irr);
+
+void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+
+	__kvm_apic_update_irr(pir, apic->regs);
+}
 EXPORT_SYMBOL_GPL(kvm_apic_update_irr);
 
 static inline void apic_set_irr(int vec, struct kvm_lapic *apic)
@@ -405,7 +413,7 @@ static inline void apic_set_isr(int vec, struct kvm_lapic *apic)
 	 * because the processor can modify ISR under the hood. Instead
 	 * just set SVI.
 	 */
-	if (unlikely(kvm_apic_vid_enabled(vcpu->kvm)))
+	if (unlikely(kvm_x86_ops->hwapic_isr_update))
 		kvm_x86_ops->hwapic_isr_update(vcpu->kvm, vec);
 	else {
 		++apic->isr_count;
@@ -453,7 +461,7 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic)
 	 * on the other hand isr_count and highest_isr_cache are unused
 	 * and must be left alone.
 	 */
-	if (unlikely(kvm_apic_vid_enabled(vcpu->kvm)))
+	if (unlikely(kvm_x86_ops->hwapic_isr_update))
 		kvm_x86_ops->hwapic_isr_update(vcpu->kvm,
 					       apic_find_highest_isr(apic));
 	else {
@@ -580,55 +588,48 @@ static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr)
 		apic_update_ppr(apic);
 }
 
-static int kvm_apic_broadcast(struct kvm_lapic *apic, u32 dest)
+static bool kvm_apic_broadcast(struct kvm_lapic *apic, u32 dest)
 {
 	return dest == (apic_x2apic_mode(apic) ?
 			X2APIC_BROADCAST : APIC_BROADCAST);
 }
 
-int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 dest)
+static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 dest)
 {
 	return kvm_apic_id(apic) == dest || kvm_apic_broadcast(apic, dest);
 }
 
-int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda)
+static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda)
 {
-	int result = 0;
 	u32 logical_id;
 
 	if (kvm_apic_broadcast(apic, mda))
-		return 1;
+		return true;
 
-	if (apic_x2apic_mode(apic)) {
-		logical_id = kvm_apic_get_reg(apic, APIC_LDR);
-		return logical_id & mda;
-	}
+	logical_id = kvm_apic_get_reg(apic, APIC_LDR);
 
-	logical_id = GET_APIC_LOGICAL_ID(kvm_apic_get_reg(apic, APIC_LDR));
+	if (apic_x2apic_mode(apic))
+		return ((logical_id >> 16) == (mda >> 16))
+		       && (logical_id & mda & 0xffff) != 0;
+
+	logical_id = GET_APIC_LOGICAL_ID(logical_id);
 
 	switch (kvm_apic_get_reg(apic, APIC_DFR)) {
 	case APIC_DFR_FLAT:
-		if (logical_id & mda)
-			result = 1;
-		break;
+		return (logical_id & mda) != 0;
 	case APIC_DFR_CLUSTER:
-		if (((logical_id >> 4) == (mda >> 0x4))
-		    && (logical_id & mda & 0xf))
-			result = 1;
-		break;
+		return ((logical_id >> 4) == (mda >> 4))
+		       && (logical_id & mda & 0xf) != 0;
 	default:
 		apic_debug("Bad DFR vcpu %d: %08x\n",
 			   apic->vcpu->vcpu_id, kvm_apic_get_reg(apic, APIC_DFR));
-		break;
+		return false;
 	}
-
-	return result;
 }
 
-int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
+bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
 			int short_hand, unsigned int dest, int dest_mode)
 {
-	int result = 0;
 	struct kvm_lapic *target = vcpu->arch.apic;
 
 	apic_debug("target %p, source %p, dest 0x%x, "
@@ -638,29 +639,21 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
 	ASSERT(target);
 	switch (short_hand) {
 	case APIC_DEST_NOSHORT:
-		if (dest_mode == 0)
-			/* Physical mode. */
-			result = kvm_apic_match_physical_addr(target, dest);
+		if (dest_mode == APIC_DEST_PHYSICAL)
+			return kvm_apic_match_physical_addr(target, dest);
 		else
-			/* Logical mode. */
-			result = kvm_apic_match_logical_addr(target, dest);
-		break;
+			return kvm_apic_match_logical_addr(target, dest);
 	case APIC_DEST_SELF:
-		result = (target == source);
-		break;
+		return target == source;
 	case APIC_DEST_ALLINC:
-		result = 1;
-		break;
+		return true;
 	case APIC_DEST_ALLBUT:
-		result = (target != source);
-		break;
+		return target != source;
 	default:
 		apic_debug("kvm: apic: Bad dest shorthand value %x\n",
 			   short_hand);
-		break;
+		return false;
 	}
-
-	return result;
 }
 
 bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
@@ -693,7 +686,7 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
 
 	ret = true;
 
-	if (irq->dest_mode == 0) { /* physical mode */
+	if (irq->dest_mode == APIC_DEST_PHYSICAL) {
 		if (irq->dest_id >= ARRAY_SIZE(map->phys_map))
 			goto out;
 
@@ -1076,25 +1069,72 @@ static void apic_timer_expired(struct kvm_lapic *apic)
 {
 	struct kvm_vcpu *vcpu = apic->vcpu;
 	wait_queue_head_t *q = &vcpu->wq;
+	struct kvm_timer *ktimer = &apic->lapic_timer;
 
-	/*
-	 * Note: KVM_REQ_PENDING_TIMER is implicitly checked in
-	 * vcpu_enter_guest.
-	 */
 	if (atomic_read(&apic->lapic_timer.pending))
 		return;
 
 	atomic_inc(&apic->lapic_timer.pending);
-	/* FIXME: this code should not know anything about vcpus */
-	kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
+	kvm_set_pending_timer(vcpu);
 
 	if (waitqueue_active(q))
 		wake_up_interruptible(q);
+
+	if (apic_lvtt_tscdeadline(apic))
+		ktimer->expired_tscdeadline = ktimer->tscdeadline;
+}
+
+/*
+ * On APICv, this test will cause a busy wait
+ * during a higher-priority task.
+ */
+
+static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+	u32 reg = kvm_apic_get_reg(apic, APIC_LVTT);
+
+	if (kvm_apic_hw_enabled(apic)) {
+		int vec = reg & APIC_VECTOR_MASK;
+		void *bitmap = apic->regs + APIC_ISR;
+
+		if (kvm_x86_ops->deliver_posted_interrupt)
+			bitmap = apic->regs + APIC_IRR;
+
+		if (apic_test_vector(vec, bitmap))
+			return true;
+	}
+	return false;
+}
+
+void wait_lapic_expire(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+	u64 guest_tsc, tsc_deadline;
+
+	if (!kvm_vcpu_has_lapic(vcpu))
+		return;
+
+	if (apic->lapic_timer.expired_tscdeadline == 0)
+		return;
+
+	if (!lapic_timer_int_injected(vcpu))
+		return;
+
+	tsc_deadline = apic->lapic_timer.expired_tscdeadline;
+	apic->lapic_timer.expired_tscdeadline = 0;
+	guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc());
+	trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline);
+
+	/* __delay is delay_tsc whenever the hardware has TSC, thus always. */
+	if (guest_tsc < tsc_deadline)
+		__delay(tsc_deadline - guest_tsc);
 }
 
 static void start_apic_timer(struct kvm_lapic *apic)
 {
 	ktime_t now;
+
 	atomic_set(&apic->lapic_timer.pending, 0);
 
 	if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) {
@@ -1140,6 +1180,7 @@ static void start_apic_timer(struct kvm_lapic *apic)
 		/* lapic timer in tsc deadline mode */
 		u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline;
 		u64 ns = 0;
+		ktime_t expire;
 		struct kvm_vcpu *vcpu = apic->vcpu;
 		unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz;
 		unsigned long flags;
@@ -1154,8 +1195,10 @@ static void start_apic_timer(struct kvm_lapic *apic)
 		if (likely(tscdeadline > guest_tsc)) {
 			ns = (tscdeadline - guest_tsc) * 1000000ULL;
 			do_div(ns, this_tsc_khz);
+			expire = ktime_add_ns(now, ns);
+			expire = ktime_sub_ns(expire, lapic_timer_advance_ns);
 			hrtimer_start(&apic->lapic_timer.timer,
-				ktime_add_ns(now, ns), HRTIMER_MODE_ABS);
+				      expire, HRTIMER_MODE_ABS);
 		} else
 			apic_timer_expired(apic);
 
@@ -1745,7 +1788,9 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
 	if (kvm_x86_ops->hwapic_irr_update)
 		kvm_x86_ops->hwapic_irr_update(vcpu,
 				apic_find_highest_irr(apic));
-	kvm_x86_ops->hwapic_isr_update(vcpu->kvm, apic_find_highest_isr(apic));
+	if (unlikely(kvm_x86_ops->hwapic_isr_update))
+		kvm_x86_ops->hwapic_isr_update(vcpu->kvm,
+				apic_find_highest_isr(apic));
 	kvm_make_request(KVM_REQ_EVENT, vcpu);
 	kvm_rtc_eoi_tracking_restore_one(vcpu);
 }
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index c674fce53cf9..0bc6c656625b 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -14,6 +14,7 @@ struct kvm_timer {
 	u32 timer_mode;
 	u32 timer_mode_mask;
 	u64 tscdeadline;
+	u64 expired_tscdeadline;
 	atomic_t pending;	/* accumulated triggered timers */
 };
 
@@ -56,9 +57,8 @@ u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
 void kvm_apic_set_version(struct kvm_vcpu *vcpu);
 
 void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr);
+void __kvm_apic_update_irr(u32 *pir, void *regs);
 void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir);
-int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 dest);
-int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda);
 int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
 		unsigned long *dest_map);
 int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type);
@@ -170,4 +170,6 @@ static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu)
 
 bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector);
 
+void wait_lapic_expire(struct kvm_vcpu *vcpu);
+
 #endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index f83fc6c5e0ba..cee759299a35 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -63,30 +63,16 @@ enum {
63#undef MMU_DEBUG 63#undef MMU_DEBUG
64 64
65#ifdef MMU_DEBUG 65#ifdef MMU_DEBUG
66static bool dbg = 0;
67module_param(dbg, bool, 0644);
66 68
67#define pgprintk(x...) do { if (dbg) printk(x); } while (0) 69#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
68#define rmap_printk(x...) do { if (dbg) printk(x); } while (0) 70#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
69 71#define MMU_WARN_ON(x) WARN_ON(x)
70#else 72#else
71
72#define pgprintk(x...) do { } while (0) 73#define pgprintk(x...) do { } while (0)
73#define rmap_printk(x...) do { } while (0) 74#define rmap_printk(x...) do { } while (0)
74 75#define MMU_WARN_ON(x) do { } while (0)
75#endif
76
77#ifdef MMU_DEBUG
78static bool dbg = 0;
79module_param(dbg, bool, 0644);
80#endif
81
82#ifndef MMU_DEBUG
83#define ASSERT(x) do { } while (0)
84#else
85#define ASSERT(x) \
86 if (!(x)) { \
87 printk(KERN_WARNING "assertion failed %s:%d: %s\n", \
88 __FILE__, __LINE__, #x); \
89 }
90#endif 76#endif
91 77
92#define PTE_PREFETCH_NUM 8 78#define PTE_PREFETCH_NUM 8
@@ -546,6 +532,11 @@ static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask)
546 return (old_spte & bit_mask) && !(new_spte & bit_mask); 532 return (old_spte & bit_mask) && !(new_spte & bit_mask);
547} 533}
548 534
535static bool spte_is_bit_changed(u64 old_spte, u64 new_spte, u64 bit_mask)
536{
537 return (old_spte & bit_mask) != (new_spte & bit_mask);
538}
539
549/* Rules for using mmu_spte_set: 540/* Rules for using mmu_spte_set:
550 * Set the sptep from nonpresent to present. 541 * Set the sptep from nonpresent to present.
551 * Note: the sptep being assigned *must* be either not present 542 * Note: the sptep being assigned *must* be either not present
@@ -596,6 +587,14 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
596 if (!shadow_accessed_mask) 587 if (!shadow_accessed_mask)
597 return ret; 588 return ret;
598 589
590 /*
591 * Flush TLB when accessed/dirty bits are changed in the page tables,
592 * to guarantee consistency between TLB and page tables.
593 */
594 if (spte_is_bit_changed(old_spte, new_spte,
595 shadow_accessed_mask | shadow_dirty_mask))
596 ret = true;
597
599 if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask)) 598 if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask))
600 kvm_set_pfn_accessed(spte_to_pfn(old_spte)); 599 kvm_set_pfn_accessed(spte_to_pfn(old_spte));
601 if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask)) 600 if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask))
@@ -1216,6 +1215,60 @@ static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
1216 return flush; 1215 return flush;
1217} 1216}
1218 1217
1218static bool spte_clear_dirty(struct kvm *kvm, u64 *sptep)
1219{
1220 u64 spte = *sptep;
1221
1222 rmap_printk("rmap_clear_dirty: spte %p %llx\n", sptep, *sptep);
1223
1224 spte &= ~shadow_dirty_mask;
1225
1226 return mmu_spte_update(sptep, spte);
1227}
1228
1229static bool __rmap_clear_dirty(struct kvm *kvm, unsigned long *rmapp)
1230{
1231 u64 *sptep;
1232 struct rmap_iterator iter;
1233 bool flush = false;
1234
1235 for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
1236 BUG_ON(!(*sptep & PT_PRESENT_MASK));
1237
1238 flush |= spte_clear_dirty(kvm, sptep);
1239 sptep = rmap_get_next(&iter);
1240 }
1241
1242 return flush;
1243}
1244
1245static bool spte_set_dirty(struct kvm *kvm, u64 *sptep)
1246{
1247 u64 spte = *sptep;
1248
1249 rmap_printk("rmap_set_dirty: spte %p %llx\n", sptep, *sptep);
1250
1251 spte |= shadow_dirty_mask;
1252
1253 return mmu_spte_update(sptep, spte);
1254}
1255
1256static bool __rmap_set_dirty(struct kvm *kvm, unsigned long *rmapp)
1257{
1258 u64 *sptep;
1259 struct rmap_iterator iter;
1260 bool flush = false;
1261
1262 for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
1263 BUG_ON(!(*sptep & PT_PRESENT_MASK));
1264
1265 flush |= spte_set_dirty(kvm, sptep);
1266 sptep = rmap_get_next(&iter);
1267 }
1268
1269 return flush;
1270}
1271
1219/** 1272/**
1220 * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages 1273 * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages
1221 * @kvm: kvm instance 1274 * @kvm: kvm instance
@@ -1226,7 +1279,7 @@ static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
1226 * Used when we do not need to care about huge page mappings: e.g. during dirty 1279 * Used when we do not need to care about huge page mappings: e.g. during dirty
1227 * logging we do not have any such mappings. 1280 * logging we do not have any such mappings.
1228 */ 1281 */
1229void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, 1282static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
1230 struct kvm_memory_slot *slot, 1283 struct kvm_memory_slot *slot,
1231 gfn_t gfn_offset, unsigned long mask) 1284 gfn_t gfn_offset, unsigned long mask)
1232{ 1285{
@@ -1242,6 +1295,53 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
1242 } 1295 }
1243} 1296}
1244 1297
1298/**
1299 * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages
1300 * @kvm: kvm instance
1301 * @slot: slot to clear D-bit
1302 * @gfn_offset: start of the BITS_PER_LONG pages we care about
1303 * @mask: indicates which pages we should clear D-bit
1304 *
1305 * Used for PML to re-log the dirty GPAs after userspace querying dirty_bitmap.
1306 */
1307void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1308 struct kvm_memory_slot *slot,
1309 gfn_t gfn_offset, unsigned long mask)
1310{
1311 unsigned long *rmapp;
1312
1313 while (mask) {
1314 rmapp = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
1315 PT_PAGE_TABLE_LEVEL, slot);
1316 __rmap_clear_dirty(kvm, rmapp);
1317
1318 /* clear the first set bit */
1319 mask &= mask - 1;
1320 }
1321}
1322EXPORT_SYMBOL_GPL(kvm_mmu_clear_dirty_pt_masked);
1323
1324/**
1325 * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
1326 * PT level pages.
1327 *
1328 * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
1329 * enable dirty logging for them.
1330 *
1331 * Used when we do not need to care about huge page mappings: e.g. during dirty
1332 * logging we do not have any such mappings.
1333 */
1334void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
1335 struct kvm_memory_slot *slot,
1336 gfn_t gfn_offset, unsigned long mask)
1337{
1338 if (kvm_x86_ops->enable_log_dirty_pt_masked)
1339 kvm_x86_ops->enable_log_dirty_pt_masked(kvm, slot, gfn_offset,
1340 mask);
1341 else
1342 kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
1343}
1344
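kvm_arch_mmu_enable_log_dirty_pt_masked is a thin dispatcher: when the vendor module provides enable_log_dirty_pt_masked (PML on VMX), dirty pages only need their D-bit cleared; otherwise the pages are write-protected so that the first write faults and gets logged. A small sketch of that dispatch-with-fallback shape (the struct and function names here are illustrative, not the kernel's):

#include <stdio.h>

struct dirty_log_ops {
        void (*enable_log_dirty_pt_masked)(unsigned long gfn, unsigned long mask);
};

static void write_protect_pt_masked(unsigned long gfn, unsigned long mask)
{
        printf("write-protect gfn window 0x%lx, mask 0x%lx\n", gfn, mask);
}

static void enable_log_dirty(const struct dirty_log_ops *ops,
                             unsigned long gfn, unsigned long mask)
{
        if (ops->enable_log_dirty_pt_masked)
                ops->enable_log_dirty_pt_masked(gfn, mask);   /* e.g. PML: clear D-bits */
        else
                write_protect_pt_masked(gfn, mask);           /* fallback: fault on write */
}

int main(void)
{
        struct dirty_log_ops no_pml = { 0 };

        enable_log_dirty(&no_pml, 0x1000, 0x3);
        return 0;
}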
1245static bool rmap_write_protect(struct kvm *kvm, u64 gfn) 1345static bool rmap_write_protect(struct kvm *kvm, u64 gfn)
1246{ 1346{
1247 struct kvm_memory_slot *slot; 1347 struct kvm_memory_slot *slot;
@@ -1536,7 +1636,7 @@ static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr)
1536 1636
1537static void kvm_mmu_free_page(struct kvm_mmu_page *sp) 1637static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
1538{ 1638{
1539 ASSERT(is_empty_shadow_page(sp->spt)); 1639 MMU_WARN_ON(!is_empty_shadow_page(sp->spt));
1540 hlist_del(&sp->hash_link); 1640 hlist_del(&sp->hash_link);
1541 list_del(&sp->link); 1641 list_del(&sp->link);
1542 free_page((unsigned long)sp->spt); 1642 free_page((unsigned long)sp->spt);
@@ -2501,8 +2601,10 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2501 } 2601 }
2502 } 2602 }
2503 2603
2504 if (pte_access & ACC_WRITE_MASK) 2604 if (pte_access & ACC_WRITE_MASK) {
2505 mark_page_dirty(vcpu->kvm, gfn); 2605 mark_page_dirty(vcpu->kvm, gfn);
2606 spte |= shadow_dirty_mask;
2607 }
2506 2608
2507set_pte: 2609set_pte:
2508 if (mmu_spte_update(sptep, spte)) 2610 if (mmu_spte_update(sptep, spte))
@@ -2818,6 +2920,18 @@ fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
2818 */ 2920 */
2819 gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); 2921 gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
2820 2922
2923 /*
 2924 * Theoretically we could also set the dirty bit (and flush the TLB) here
 2925 * in order to eliminate unnecessary PML logging. See the comments in
 2926 * set_spte. But fast_page_fault is very unlikely to happen with PML
 2927 * enabled, so we do not do this. This might result in the same GPA
 2928 * being logged in the PML buffer again when the write really happens,
 2929 * and eventually in mark_page_dirty being called for it twice. But that
 2930 * is harmless. It also avoids the TLB flush needed after setting the
 2931 * dirty bit, so non-PML cases won't be impacted.
2932 *
2933 * Compare with set_spte where instead shadow_dirty_mask is set.
2934 */
2821 if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) == spte) 2935 if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) == spte)
2822 mark_page_dirty(vcpu->kvm, gfn); 2936 mark_page_dirty(vcpu->kvm, gfn);
2823 2937
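The cmpxchg64 in fast_pf_fix_direct_spte only makes the spte writable if it still holds the value read earlier, so a concurrent update simply makes the fast path give up. A userspace sketch of that compare-and-swap pattern using C11 atomics (the bit layout and names are illustrative, not KVM's):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define SPTE_WRITABLE (1ULL << 1)   /* stand-in for PT_WRITABLE_MASK */

/* Set the writable bit only if the spte still holds the value we read
 * earlier; if it changed under us, report failure so the caller can bail. */
static int fix_spte_writable(_Atomic uint64_t *sptep, uint64_t expected)
{
        uint64_t desired = expected | SPTE_WRITABLE;

        return atomic_compare_exchange_strong(sptep, &expected, desired);
}

int main(void)
{
        _Atomic uint64_t spte = 0x80000000000000f1ULL;
        uint64_t snapshot = atomic_load(&spte);

        if (fix_spte_writable(&spte, snapshot))
                printf("spte is now 0x%llx -> would call mark_page_dirty()\n",
                       (unsigned long long)atomic_load(&spte));
        else
                printf("spte changed concurrently, nothing to log\n");
        return 0;
}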
@@ -3041,7 +3155,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
3041 for (i = 0; i < 4; ++i) { 3155 for (i = 0; i < 4; ++i) {
3042 hpa_t root = vcpu->arch.mmu.pae_root[i]; 3156 hpa_t root = vcpu->arch.mmu.pae_root[i];
3043 3157
3044 ASSERT(!VALID_PAGE(root)); 3158 MMU_WARN_ON(VALID_PAGE(root));
3045 spin_lock(&vcpu->kvm->mmu_lock); 3159 spin_lock(&vcpu->kvm->mmu_lock);
3046 make_mmu_pages_available(vcpu); 3160 make_mmu_pages_available(vcpu);
3047 sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT), 3161 sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT),
@@ -3079,7 +3193,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
3079 if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { 3193 if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
3080 hpa_t root = vcpu->arch.mmu.root_hpa; 3194 hpa_t root = vcpu->arch.mmu.root_hpa;
3081 3195
3082 ASSERT(!VALID_PAGE(root)); 3196 MMU_WARN_ON(VALID_PAGE(root));
3083 3197
3084 spin_lock(&vcpu->kvm->mmu_lock); 3198 spin_lock(&vcpu->kvm->mmu_lock);
3085 make_mmu_pages_available(vcpu); 3199 make_mmu_pages_available(vcpu);
@@ -3104,7 +3218,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
3104 for (i = 0; i < 4; ++i) { 3218 for (i = 0; i < 4; ++i) {
3105 hpa_t root = vcpu->arch.mmu.pae_root[i]; 3219 hpa_t root = vcpu->arch.mmu.pae_root[i];
3106 3220
3107 ASSERT(!VALID_PAGE(root)); 3221 MMU_WARN_ON(VALID_PAGE(root));
3108 if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { 3222 if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
3109 pdptr = vcpu->arch.mmu.get_pdptr(vcpu, i); 3223 pdptr = vcpu->arch.mmu.get_pdptr(vcpu, i);
3110 if (!is_present_gpte(pdptr)) { 3224 if (!is_present_gpte(pdptr)) {
@@ -3329,8 +3443,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
3329 if (r) 3443 if (r)
3330 return r; 3444 return r;
3331 3445
3332 ASSERT(vcpu); 3446 MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
3333 ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
3334 3447
3335 gfn = gva >> PAGE_SHIFT; 3448 gfn = gva >> PAGE_SHIFT;
3336 3449
@@ -3396,8 +3509,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
3396 int write = error_code & PFERR_WRITE_MASK; 3509 int write = error_code & PFERR_WRITE_MASK;
3397 bool map_writable; 3510 bool map_writable;
3398 3511
3399 ASSERT(vcpu); 3512 MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
3400 ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
3401 3513
3402 if (unlikely(error_code & PFERR_RSVD_MASK)) { 3514 if (unlikely(error_code & PFERR_RSVD_MASK)) {
3403 r = handle_mmio_page_fault(vcpu, gpa, error_code, true); 3515 r = handle_mmio_page_fault(vcpu, gpa, error_code, true);
@@ -3718,7 +3830,7 @@ static void paging64_init_context_common(struct kvm_vcpu *vcpu,
3718 update_permission_bitmask(vcpu, context, false); 3830 update_permission_bitmask(vcpu, context, false);
3719 update_last_pte_bitmap(vcpu, context); 3831 update_last_pte_bitmap(vcpu, context);
3720 3832
3721 ASSERT(is_pae(vcpu)); 3833 MMU_WARN_ON(!is_pae(vcpu));
3722 context->page_fault = paging64_page_fault; 3834 context->page_fault = paging64_page_fault;
3723 context->gva_to_gpa = paging64_gva_to_gpa; 3835 context->gva_to_gpa = paging64_gva_to_gpa;
3724 context->sync_page = paging64_sync_page; 3836 context->sync_page = paging64_sync_page;
@@ -3763,7 +3875,7 @@ static void paging32E_init_context(struct kvm_vcpu *vcpu,
3763 3875
3764static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) 3876static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
3765{ 3877{
3766 struct kvm_mmu *context = vcpu->arch.walk_mmu; 3878 struct kvm_mmu *context = &vcpu->arch.mmu;
3767 3879
3768 context->base_role.word = 0; 3880 context->base_role.word = 0;
3769 context->page_fault = tdp_page_fault; 3881 context->page_fault = tdp_page_fault;
@@ -3803,11 +3915,12 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
3803 update_last_pte_bitmap(vcpu, context); 3915 update_last_pte_bitmap(vcpu, context);
3804} 3916}
3805 3917
3806void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context) 3918void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu)
3807{ 3919{
3808 bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP); 3920 bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
3809 ASSERT(vcpu); 3921 struct kvm_mmu *context = &vcpu->arch.mmu;
3810 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); 3922
3923 MMU_WARN_ON(VALID_PAGE(context->root_hpa));
3811 3924
3812 if (!is_paging(vcpu)) 3925 if (!is_paging(vcpu))
3813 nonpaging_init_context(vcpu, context); 3926 nonpaging_init_context(vcpu, context);
@@ -3818,19 +3931,19 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
3818 else 3931 else
3819 paging32_init_context(vcpu, context); 3932 paging32_init_context(vcpu, context);
3820 3933
3821 vcpu->arch.mmu.base_role.nxe = is_nx(vcpu); 3934 context->base_role.nxe = is_nx(vcpu);
3822 vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu); 3935 context->base_role.cr4_pae = !!is_pae(vcpu);
3823 vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu); 3936 context->base_role.cr0_wp = is_write_protection(vcpu);
3824 vcpu->arch.mmu.base_role.smep_andnot_wp 3937 context->base_role.smep_andnot_wp
3825 = smep && !is_write_protection(vcpu); 3938 = smep && !is_write_protection(vcpu);
3826} 3939}
3827EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); 3940EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
3828 3941
3829void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context, 3942void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly)
3830 bool execonly)
3831{ 3943{
3832 ASSERT(vcpu); 3944 struct kvm_mmu *context = &vcpu->arch.mmu;
3833 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); 3945
3946 MMU_WARN_ON(VALID_PAGE(context->root_hpa));
3834 3947
3835 context->shadow_root_level = kvm_x86_ops->get_tdp_level(); 3948 context->shadow_root_level = kvm_x86_ops->get_tdp_level();
3836 3949
@@ -3851,11 +3964,13 @@ EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
3851 3964
3852static void init_kvm_softmmu(struct kvm_vcpu *vcpu) 3965static void init_kvm_softmmu(struct kvm_vcpu *vcpu)
3853{ 3966{
3854 kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu); 3967 struct kvm_mmu *context = &vcpu->arch.mmu;
3855 vcpu->arch.walk_mmu->set_cr3 = kvm_x86_ops->set_cr3; 3968
3856 vcpu->arch.walk_mmu->get_cr3 = get_cr3; 3969 kvm_init_shadow_mmu(vcpu);
3857 vcpu->arch.walk_mmu->get_pdptr = kvm_pdptr_read; 3970 context->set_cr3 = kvm_x86_ops->set_cr3;
3858 vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; 3971 context->get_cr3 = get_cr3;
3972 context->get_pdptr = kvm_pdptr_read;
3973 context->inject_page_fault = kvm_inject_page_fault;
3859} 3974}
3860 3975
3861static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) 3976static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
@@ -3900,17 +4015,15 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
3900static void init_kvm_mmu(struct kvm_vcpu *vcpu) 4015static void init_kvm_mmu(struct kvm_vcpu *vcpu)
3901{ 4016{
3902 if (mmu_is_nested(vcpu)) 4017 if (mmu_is_nested(vcpu))
3903 return init_kvm_nested_mmu(vcpu); 4018 init_kvm_nested_mmu(vcpu);
3904 else if (tdp_enabled) 4019 else if (tdp_enabled)
3905 return init_kvm_tdp_mmu(vcpu); 4020 init_kvm_tdp_mmu(vcpu);
3906 else 4021 else
3907 return init_kvm_softmmu(vcpu); 4022 init_kvm_softmmu(vcpu);
3908} 4023}
3909 4024
3910void kvm_mmu_reset_context(struct kvm_vcpu *vcpu) 4025void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
3911{ 4026{
3912 ASSERT(vcpu);
3913
3914 kvm_mmu_unload(vcpu); 4027 kvm_mmu_unload(vcpu);
3915 init_kvm_mmu(vcpu); 4028 init_kvm_mmu(vcpu);
3916} 4029}
@@ -4266,8 +4379,6 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
4266 struct page *page; 4379 struct page *page;
4267 int i; 4380 int i;
4268 4381
4269 ASSERT(vcpu);
4270
4271 /* 4382 /*
4272 * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64. 4383 * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
4273 * Therefore we need to allocate shadow page tables in the first 4384 * Therefore we need to allocate shadow page tables in the first
@@ -4286,8 +4397,6 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
4286 4397
4287int kvm_mmu_create(struct kvm_vcpu *vcpu) 4398int kvm_mmu_create(struct kvm_vcpu *vcpu)
4288{ 4399{
4289 ASSERT(vcpu);
4290
4291 vcpu->arch.walk_mmu = &vcpu->arch.mmu; 4400 vcpu->arch.walk_mmu = &vcpu->arch.mmu;
4292 vcpu->arch.mmu.root_hpa = INVALID_PAGE; 4401 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
4293 vcpu->arch.mmu.translate_gpa = translate_gpa; 4402 vcpu->arch.mmu.translate_gpa = translate_gpa;
@@ -4298,19 +4407,18 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
4298 4407
4299void kvm_mmu_setup(struct kvm_vcpu *vcpu) 4408void kvm_mmu_setup(struct kvm_vcpu *vcpu)
4300{ 4409{
4301 ASSERT(vcpu); 4410 MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa));
4302 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
4303 4411
4304 init_kvm_mmu(vcpu); 4412 init_kvm_mmu(vcpu);
4305} 4413}
4306 4414
4307void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) 4415void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
4416 struct kvm_memory_slot *memslot)
4308{ 4417{
4309 struct kvm_memory_slot *memslot;
4310 gfn_t last_gfn; 4418 gfn_t last_gfn;
4311 int i; 4419 int i;
4420 bool flush = false;
4312 4421
4313 memslot = id_to_memslot(kvm->memslots, slot);
4314 last_gfn = memslot->base_gfn + memslot->npages - 1; 4422 last_gfn = memslot->base_gfn + memslot->npages - 1;
4315 4423
4316 spin_lock(&kvm->mmu_lock); 4424 spin_lock(&kvm->mmu_lock);
@@ -4325,7 +4433,8 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
4325 4433
4326 for (index = 0; index <= last_index; ++index, ++rmapp) { 4434 for (index = 0; index <= last_index; ++index, ++rmapp) {
4327 if (*rmapp) 4435 if (*rmapp)
4328 __rmap_write_protect(kvm, rmapp, false); 4436 flush |= __rmap_write_protect(kvm, rmapp,
4437 false);
4329 4438
4330 if (need_resched() || spin_needbreak(&kvm->mmu_lock)) 4439 if (need_resched() || spin_needbreak(&kvm->mmu_lock))
4331 cond_resched_lock(&kvm->mmu_lock); 4440 cond_resched_lock(&kvm->mmu_lock);
@@ -4352,8 +4461,124 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
4352 * instead of PT_WRITABLE_MASK, that means it does not depend 4461 * instead of PT_WRITABLE_MASK, that means it does not depend
4353 * on PT_WRITABLE_MASK anymore. 4462 * on PT_WRITABLE_MASK anymore.
4354 */ 4463 */
4355 kvm_flush_remote_tlbs(kvm); 4464 if (flush)
4465 kvm_flush_remote_tlbs(kvm);
4466}
4467
4468void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
4469 struct kvm_memory_slot *memslot)
4470{
4471 gfn_t last_gfn;
4472 unsigned long *rmapp;
4473 unsigned long last_index, index;
4474 bool flush = false;
4475
4476 last_gfn = memslot->base_gfn + memslot->npages - 1;
4477
4478 spin_lock(&kvm->mmu_lock);
4479
4480 rmapp = memslot->arch.rmap[PT_PAGE_TABLE_LEVEL - 1];
4481 last_index = gfn_to_index(last_gfn, memslot->base_gfn,
4482 PT_PAGE_TABLE_LEVEL);
4483
4484 for (index = 0; index <= last_index; ++index, ++rmapp) {
4485 if (*rmapp)
4486 flush |= __rmap_clear_dirty(kvm, rmapp);
4487
4488 if (need_resched() || spin_needbreak(&kvm->mmu_lock))
4489 cond_resched_lock(&kvm->mmu_lock);
4490 }
4491
4492 spin_unlock(&kvm->mmu_lock);
4493
4494 lockdep_assert_held(&kvm->slots_lock);
4495
4496 /*
 4497 * It's also safe to flush TLBs outside of the mmu lock here: this
 4498 * function is currently only used for dirty logging, and flushing the
 4499 * TLB outside of the lock still guarantees that no dirty pages are
 4500 * lost from the dirty_bitmap.
4501 */
4502 if (flush)
4503 kvm_flush_remote_tlbs(kvm);
4504}
4505EXPORT_SYMBOL_GPL(kvm_mmu_slot_leaf_clear_dirty);
4506
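kvm_mmu_slot_leaf_clear_dirty accumulates a single flush flag while it walks the rmaps under mmu_lock and issues one remote TLB flush afterwards, only if some spte actually changed. A condensed sketch of that accumulate-then-flush pattern; the D-bit position and flush_tlbs() are stand-ins, not kernel definitions:

#include <stdbool.h>
#include <stdio.h>

/* Pretend bit 6 is the dirty bit; return whether the spte changed. */
static bool clear_dirty_one(unsigned long *spte)
{
        bool was_dirty = *spte & 0x40;

        *spte &= ~0x40UL;
        return was_dirty;                /* true => a TLB flush is needed */
}

static void flush_tlbs(void)
{
        puts("remote TLB flush");
}

int main(void)
{
        unsigned long sptes[] = { 0x47, 0x07, 0x43 };
        bool flush = false;

        for (unsigned int i = 0; i < sizeof(sptes) / sizeof(sptes[0]); i++)
                flush |= clear_dirty_one(&sptes[i]);

        if (flush)                       /* flush once, after the whole walk */
                flush_tlbs();
        return 0;
}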
4507void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
4508 struct kvm_memory_slot *memslot)
4509{
4510 gfn_t last_gfn;
4511 int i;
4512 bool flush = false;
4513
4514 last_gfn = memslot->base_gfn + memslot->npages - 1;
4515
4516 spin_lock(&kvm->mmu_lock);
4517
4518 for (i = PT_PAGE_TABLE_LEVEL + 1; /* skip rmap for 4K page */
4519 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
4520 unsigned long *rmapp;
4521 unsigned long last_index, index;
4522
4523 rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL];
4524 last_index = gfn_to_index(last_gfn, memslot->base_gfn, i);
4525
4526 for (index = 0; index <= last_index; ++index, ++rmapp) {
4527 if (*rmapp)
4528 flush |= __rmap_write_protect(kvm, rmapp,
4529 false);
4530
4531 if (need_resched() || spin_needbreak(&kvm->mmu_lock))
4532 cond_resched_lock(&kvm->mmu_lock);
4533 }
4534 }
4535 spin_unlock(&kvm->mmu_lock);
4536
4537 /* see kvm_mmu_slot_remove_write_access */
4538 lockdep_assert_held(&kvm->slots_lock);
4539
4540 if (flush)
4541 kvm_flush_remote_tlbs(kvm);
4542}
4543EXPORT_SYMBOL_GPL(kvm_mmu_slot_largepage_remove_write_access);
4544
4545void kvm_mmu_slot_set_dirty(struct kvm *kvm,
4546 struct kvm_memory_slot *memslot)
4547{
4548 gfn_t last_gfn;
4549 int i;
4550 bool flush = false;
4551
4552 last_gfn = memslot->base_gfn + memslot->npages - 1;
4553
4554 spin_lock(&kvm->mmu_lock);
4555
4556 for (i = PT_PAGE_TABLE_LEVEL;
4557 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
4558 unsigned long *rmapp;
4559 unsigned long last_index, index;
4560
4561 rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL];
4562 last_index = gfn_to_index(last_gfn, memslot->base_gfn, i);
4563
4564 for (index = 0; index <= last_index; ++index, ++rmapp) {
4565 if (*rmapp)
4566 flush |= __rmap_set_dirty(kvm, rmapp);
4567
4568 if (need_resched() || spin_needbreak(&kvm->mmu_lock))
4569 cond_resched_lock(&kvm->mmu_lock);
4570 }
4571 }
4572
4573 spin_unlock(&kvm->mmu_lock);
4574
4575 lockdep_assert_held(&kvm->slots_lock);
4576
4577 /* see kvm_mmu_slot_leaf_clear_dirty */
4578 if (flush)
4579 kvm_flush_remote_tlbs(kvm);
4356} 4580}
4581EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
4357 4582
4358#define BATCH_ZAP_PAGES 10 4583#define BATCH_ZAP_PAGES 10
4359static void kvm_zap_obsolete_pages(struct kvm *kvm) 4584static void kvm_zap_obsolete_pages(struct kvm *kvm)
@@ -4606,8 +4831,6 @@ EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);
4606 4831
4607void kvm_mmu_destroy(struct kvm_vcpu *vcpu) 4832void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
4608{ 4833{
4609 ASSERT(vcpu);
4610
4611 kvm_mmu_unload(vcpu); 4834 kvm_mmu_unload(vcpu);
4612 free_mmu_pages(vcpu); 4835 free_mmu_pages(vcpu);
4613 mmu_free_memory_caches(vcpu); 4836 mmu_free_memory_caches(vcpu);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index bde8ee725754..c7d65637c851 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -44,18 +44,6 @@
44#define PT_DIRECTORY_LEVEL 2 44#define PT_DIRECTORY_LEVEL 2
45#define PT_PAGE_TABLE_LEVEL 1 45#define PT_PAGE_TABLE_LEVEL 1
46 46
47#define PFERR_PRESENT_BIT 0
48#define PFERR_WRITE_BIT 1
49#define PFERR_USER_BIT 2
50#define PFERR_RSVD_BIT 3
51#define PFERR_FETCH_BIT 4
52
53#define PFERR_PRESENT_MASK (1U << PFERR_PRESENT_BIT)
54#define PFERR_WRITE_MASK (1U << PFERR_WRITE_BIT)
55#define PFERR_USER_MASK (1U << PFERR_USER_BIT)
56#define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT)
57#define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT)
58
59static inline u64 rsvd_bits(int s, int e) 47static inline u64 rsvd_bits(int s, int e)
60{ 48{
61 return ((1ULL << (e - s + 1)) - 1) << s; 49 return ((1ULL << (e - s + 1)) - 1) << s;
@@ -81,9 +69,8 @@ enum {
81}; 69};
82 70
83int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct); 71int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct);
84void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context); 72void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu);
85void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context, 73void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly);
86 bool execonly);
87void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, 74void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
88 bool ept); 75 bool ept);
89 76
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 41dd0387cccb..a17d848c6d42 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2003,8 +2003,8 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
2003 2003
2004static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu) 2004static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
2005{ 2005{
2006 kvm_init_shadow_mmu(vcpu, &vcpu->arch.mmu); 2006 WARN_ON(mmu_is_nested(vcpu));
2007 2007 kvm_init_shadow_mmu(vcpu);
2008 vcpu->arch.mmu.set_cr3 = nested_svm_set_tdp_cr3; 2008 vcpu->arch.mmu.set_cr3 = nested_svm_set_tdp_cr3;
2009 vcpu->arch.mmu.get_cr3 = nested_svm_get_tdp_cr3; 2009 vcpu->arch.mmu.get_cr3 = nested_svm_get_tdp_cr3;
2010 vcpu->arch.mmu.get_pdptr = nested_svm_get_tdp_pdptr; 2010 vcpu->arch.mmu.get_pdptr = nested_svm_get_tdp_pdptr;
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index c2a34bb5ad93..7c7bc8bef21f 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -848,6 +848,24 @@ TRACE_EVENT(kvm_track_tsc,
848 848
849#endif /* CONFIG_X86_64 */ 849#endif /* CONFIG_X86_64 */
850 850
851/*
852 * Tracepoint for PML full VMEXIT.
853 */
854TRACE_EVENT(kvm_pml_full,
855 TP_PROTO(unsigned int vcpu_id),
856 TP_ARGS(vcpu_id),
857
858 TP_STRUCT__entry(
859 __field( unsigned int, vcpu_id )
860 ),
861
862 TP_fast_assign(
863 __entry->vcpu_id = vcpu_id;
864 ),
865
866 TP_printk("vcpu %d: PML full", __entry->vcpu_id)
867);
868
851TRACE_EVENT(kvm_ple_window, 869TRACE_EVENT(kvm_ple_window,
852 TP_PROTO(bool grow, unsigned int vcpu_id, int new, int old), 870 TP_PROTO(bool grow, unsigned int vcpu_id, int new, int old),
853 TP_ARGS(grow, vcpu_id, new, old), 871 TP_ARGS(grow, vcpu_id, new, old),
@@ -914,6 +932,26 @@ TRACE_EVENT(kvm_pvclock_update,
914 __entry->flags) 932 __entry->flags)
915); 933);
916 934
935TRACE_EVENT(kvm_wait_lapic_expire,
936 TP_PROTO(unsigned int vcpu_id, s64 delta),
937 TP_ARGS(vcpu_id, delta),
938
939 TP_STRUCT__entry(
940 __field( unsigned int, vcpu_id )
941 __field( s64, delta )
942 ),
943
944 TP_fast_assign(
945 __entry->vcpu_id = vcpu_id;
946 __entry->delta = delta;
947 ),
948
949 TP_printk("vcpu %u: delta %lld (%s)",
950 __entry->vcpu_id,
951 __entry->delta,
952 __entry->delta < 0 ? "early" : "late")
953);
954
917#endif /* _TRACE_KVM_H */ 955#endif /* _TRACE_KVM_H */
918 956
919#undef TRACE_INCLUDE_PATH 957#undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index d4c58d884838..3f73bfad0349 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -45,6 +45,7 @@
45#include <asm/perf_event.h> 45#include <asm/perf_event.h>
46#include <asm/debugreg.h> 46#include <asm/debugreg.h>
47#include <asm/kexec.h> 47#include <asm/kexec.h>
48#include <asm/apic.h>
48 49
49#include "trace.h" 50#include "trace.h"
50 51
@@ -101,6 +102,9 @@ module_param(nested, bool, S_IRUGO);
101 102
102static u64 __read_mostly host_xss; 103static u64 __read_mostly host_xss;
103 104
105static bool __read_mostly enable_pml = 1;
106module_param_named(pml, enable_pml, bool, S_IRUGO);
107
104#define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD) 108#define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
105#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE) 109#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE)
106#define KVM_VM_CR0_ALWAYS_ON \ 110#define KVM_VM_CR0_ALWAYS_ON \
@@ -215,7 +219,12 @@ struct __packed vmcs12 {
215 u64 tsc_offset; 219 u64 tsc_offset;
216 u64 virtual_apic_page_addr; 220 u64 virtual_apic_page_addr;
217 u64 apic_access_addr; 221 u64 apic_access_addr;
222 u64 posted_intr_desc_addr;
218 u64 ept_pointer; 223 u64 ept_pointer;
224 u64 eoi_exit_bitmap0;
225 u64 eoi_exit_bitmap1;
226 u64 eoi_exit_bitmap2;
227 u64 eoi_exit_bitmap3;
219 u64 xss_exit_bitmap; 228 u64 xss_exit_bitmap;
220 u64 guest_physical_address; 229 u64 guest_physical_address;
221 u64 vmcs_link_pointer; 230 u64 vmcs_link_pointer;
@@ -330,6 +339,7 @@ struct __packed vmcs12 {
330 u32 vmx_preemption_timer_value; 339 u32 vmx_preemption_timer_value;
331 u32 padding32[7]; /* room for future expansion */ 340 u32 padding32[7]; /* room for future expansion */
332 u16 virtual_processor_id; 341 u16 virtual_processor_id;
342 u16 posted_intr_nv;
333 u16 guest_es_selector; 343 u16 guest_es_selector;
334 u16 guest_cs_selector; 344 u16 guest_cs_selector;
335 u16 guest_ss_selector; 345 u16 guest_ss_selector;
@@ -338,6 +348,7 @@ struct __packed vmcs12 {
338 u16 guest_gs_selector; 348 u16 guest_gs_selector;
339 u16 guest_ldtr_selector; 349 u16 guest_ldtr_selector;
340 u16 guest_tr_selector; 350 u16 guest_tr_selector;
351 u16 guest_intr_status;
341 u16 host_es_selector; 352 u16 host_es_selector;
342 u16 host_cs_selector; 353 u16 host_cs_selector;
343 u16 host_ss_selector; 354 u16 host_ss_selector;
@@ -401,6 +412,10 @@ struct nested_vmx {
401 */ 412 */
402 struct page *apic_access_page; 413 struct page *apic_access_page;
403 struct page *virtual_apic_page; 414 struct page *virtual_apic_page;
415 struct page *pi_desc_page;
416 struct pi_desc *pi_desc;
417 bool pi_pending;
418 u16 posted_intr_nv;
404 u64 msr_ia32_feature_control; 419 u64 msr_ia32_feature_control;
405 420
406 struct hrtimer preemption_timer; 421 struct hrtimer preemption_timer;
@@ -408,6 +423,23 @@ struct nested_vmx {
408 423
409 /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */ 424 /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */
410 u64 vmcs01_debugctl; 425 u64 vmcs01_debugctl;
426
427 u32 nested_vmx_procbased_ctls_low;
428 u32 nested_vmx_procbased_ctls_high;
429 u32 nested_vmx_true_procbased_ctls_low;
430 u32 nested_vmx_secondary_ctls_low;
431 u32 nested_vmx_secondary_ctls_high;
432 u32 nested_vmx_pinbased_ctls_low;
433 u32 nested_vmx_pinbased_ctls_high;
434 u32 nested_vmx_exit_ctls_low;
435 u32 nested_vmx_exit_ctls_high;
436 u32 nested_vmx_true_exit_ctls_low;
437 u32 nested_vmx_entry_ctls_low;
438 u32 nested_vmx_entry_ctls_high;
439 u32 nested_vmx_true_entry_ctls_low;
440 u32 nested_vmx_misc_low;
441 u32 nested_vmx_misc_high;
442 u32 nested_vmx_ept_caps;
411}; 443};
412 444
413#define POSTED_INTR_ON 0 445#define POSTED_INTR_ON 0
@@ -511,6 +543,10 @@ struct vcpu_vmx {
511 /* Dynamic PLE window. */ 543 /* Dynamic PLE window. */
512 int ple_window; 544 int ple_window;
513 bool ple_window_dirty; 545 bool ple_window_dirty;
546
547 /* Support for PML */
548#define PML_ENTITY_NUM 512
549 struct page *pml_pg;
514}; 550};
515 551
516enum segment_cache_field { 552enum segment_cache_field {
@@ -594,6 +630,7 @@ static int max_shadow_read_write_fields =
594 630
595static const unsigned short vmcs_field_to_offset_table[] = { 631static const unsigned short vmcs_field_to_offset_table[] = {
596 FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id), 632 FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
633 FIELD(POSTED_INTR_NV, posted_intr_nv),
597 FIELD(GUEST_ES_SELECTOR, guest_es_selector), 634 FIELD(GUEST_ES_SELECTOR, guest_es_selector),
598 FIELD(GUEST_CS_SELECTOR, guest_cs_selector), 635 FIELD(GUEST_CS_SELECTOR, guest_cs_selector),
599 FIELD(GUEST_SS_SELECTOR, guest_ss_selector), 636 FIELD(GUEST_SS_SELECTOR, guest_ss_selector),
@@ -602,6 +639,7 @@ static const unsigned short vmcs_field_to_offset_table[] = {
602 FIELD(GUEST_GS_SELECTOR, guest_gs_selector), 639 FIELD(GUEST_GS_SELECTOR, guest_gs_selector),
603 FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector), 640 FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector),
604 FIELD(GUEST_TR_SELECTOR, guest_tr_selector), 641 FIELD(GUEST_TR_SELECTOR, guest_tr_selector),
642 FIELD(GUEST_INTR_STATUS, guest_intr_status),
605 FIELD(HOST_ES_SELECTOR, host_es_selector), 643 FIELD(HOST_ES_SELECTOR, host_es_selector),
606 FIELD(HOST_CS_SELECTOR, host_cs_selector), 644 FIELD(HOST_CS_SELECTOR, host_cs_selector),
607 FIELD(HOST_SS_SELECTOR, host_ss_selector), 645 FIELD(HOST_SS_SELECTOR, host_ss_selector),
@@ -618,7 +656,12 @@ static const unsigned short vmcs_field_to_offset_table[] = {
618 FIELD64(TSC_OFFSET, tsc_offset), 656 FIELD64(TSC_OFFSET, tsc_offset),
619 FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr), 657 FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr),
620 FIELD64(APIC_ACCESS_ADDR, apic_access_addr), 658 FIELD64(APIC_ACCESS_ADDR, apic_access_addr),
659 FIELD64(POSTED_INTR_DESC_ADDR, posted_intr_desc_addr),
621 FIELD64(EPT_POINTER, ept_pointer), 660 FIELD64(EPT_POINTER, ept_pointer),
661 FIELD64(EOI_EXIT_BITMAP0, eoi_exit_bitmap0),
662 FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1),
663 FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2),
664 FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3),
622 FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap), 665 FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap),
623 FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address), 666 FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
624 FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer), 667 FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
@@ -766,6 +809,7 @@ static void kvm_cpu_vmxon(u64 addr);
766static void kvm_cpu_vmxoff(void); 809static void kvm_cpu_vmxoff(void);
767static bool vmx_mpx_supported(void); 810static bool vmx_mpx_supported(void);
768static bool vmx_xsaves_supported(void); 811static bool vmx_xsaves_supported(void);
812static int vmx_vm_has_apicv(struct kvm *kvm);
769static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); 813static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
770static void vmx_set_segment(struct kvm_vcpu *vcpu, 814static void vmx_set_segment(struct kvm_vcpu *vcpu,
771 struct kvm_segment *var, int seg); 815 struct kvm_segment *var, int seg);
@@ -793,6 +837,7 @@ static unsigned long *vmx_msr_bitmap_legacy;
793static unsigned long *vmx_msr_bitmap_longmode; 837static unsigned long *vmx_msr_bitmap_longmode;
794static unsigned long *vmx_msr_bitmap_legacy_x2apic; 838static unsigned long *vmx_msr_bitmap_legacy_x2apic;
795static unsigned long *vmx_msr_bitmap_longmode_x2apic; 839static unsigned long *vmx_msr_bitmap_longmode_x2apic;
840static unsigned long *vmx_msr_bitmap_nested;
796static unsigned long *vmx_vmread_bitmap; 841static unsigned long *vmx_vmread_bitmap;
797static unsigned long *vmx_vmwrite_bitmap; 842static unsigned long *vmx_vmwrite_bitmap;
798 843
@@ -959,16 +1004,6 @@ static inline bool cpu_has_vmx_ept_execute_only(void)
959 return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT; 1004 return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT;
960} 1005}
961 1006
962static inline bool cpu_has_vmx_eptp_uncacheable(void)
963{
964 return vmx_capability.ept & VMX_EPTP_UC_BIT;
965}
966
967static inline bool cpu_has_vmx_eptp_writeback(void)
968{
969 return vmx_capability.ept & VMX_EPTP_WB_BIT;
970}
971
972static inline bool cpu_has_vmx_ept_2m_page(void) 1007static inline bool cpu_has_vmx_ept_2m_page(void)
973{ 1008{
974 return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT; 1009 return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT;
@@ -1073,6 +1108,11 @@ static inline bool cpu_has_vmx_shadow_vmcs(void)
1073 SECONDARY_EXEC_SHADOW_VMCS; 1108 SECONDARY_EXEC_SHADOW_VMCS;
1074} 1109}
1075 1110
1111static inline bool cpu_has_vmx_pml(void)
1112{
1113 return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_PML;
1114}
1115
1076static inline bool report_flexpriority(void) 1116static inline bool report_flexpriority(void)
1077{ 1117{
1078 return flexpriority_enabled; 1118 return flexpriority_enabled;
@@ -1112,6 +1152,26 @@ static inline bool nested_cpu_has_xsaves(struct vmcs12 *vmcs12)
1112 vmx_xsaves_supported(); 1152 vmx_xsaves_supported();
1113} 1153}
1114 1154
1155static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12)
1156{
1157 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
1158}
1159
1160static inline bool nested_cpu_has_apic_reg_virt(struct vmcs12 *vmcs12)
1161{
1162 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_APIC_REGISTER_VIRT);
1163}
1164
1165static inline bool nested_cpu_has_vid(struct vmcs12 *vmcs12)
1166{
1167 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
1168}
1169
1170static inline bool nested_cpu_has_posted_intr(struct vmcs12 *vmcs12)
1171{
1172 return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR;
1173}
1174
1115static inline bool is_exception(u32 intr_info) 1175static inline bool is_exception(u32 intr_info)
1116{ 1176{
1117 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) 1177 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -2284,20 +2344,8 @@ static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
2284 * if the corresponding bit in the (32-bit) control field *must* be on, and a 2344 * if the corresponding bit in the (32-bit) control field *must* be on, and a
2285 * bit in the high half is on if the corresponding bit in the control field 2345 * bit in the high half is on if the corresponding bit in the control field
2286 * may be on. See also vmx_control_verify(). 2346 * may be on. See also vmx_control_verify().
2287 * TODO: allow these variables to be modified (downgraded) by module options
2288 * or other means.
2289 */ 2347 */
2290static u32 nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high; 2348static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
2291static u32 nested_vmx_true_procbased_ctls_low;
2292static u32 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high;
2293static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high;
2294static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high;
2295static u32 nested_vmx_true_exit_ctls_low;
2296static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high;
2297static u32 nested_vmx_true_entry_ctls_low;
2298static u32 nested_vmx_misc_low, nested_vmx_misc_high;
2299static u32 nested_vmx_ept_caps;
2300static __init void nested_vmx_setup_ctls_msrs(void)
2301{ 2349{
2302 /* 2350 /*
2303 * Note that as a general rule, the high half of the MSRs (bits in 2351 * Note that as a general rule, the high half of the MSRs (bits in
@@ -2316,57 +2364,74 @@ static __init void nested_vmx_setup_ctls_msrs(void)
2316 2364
2317 /* pin-based controls */ 2365 /* pin-based controls */
2318 rdmsr(MSR_IA32_VMX_PINBASED_CTLS, 2366 rdmsr(MSR_IA32_VMX_PINBASED_CTLS,
2319 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high); 2367 vmx->nested.nested_vmx_pinbased_ctls_low,
2320 nested_vmx_pinbased_ctls_low |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 2368 vmx->nested.nested_vmx_pinbased_ctls_high);
2321 nested_vmx_pinbased_ctls_high &= PIN_BASED_EXT_INTR_MASK | 2369 vmx->nested.nested_vmx_pinbased_ctls_low |=
2322 PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS; 2370 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
2323 nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | 2371 vmx->nested.nested_vmx_pinbased_ctls_high &=
2372 PIN_BASED_EXT_INTR_MASK |
2373 PIN_BASED_NMI_EXITING |
2374 PIN_BASED_VIRTUAL_NMIS;
2375 vmx->nested.nested_vmx_pinbased_ctls_high |=
2376 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
2324 PIN_BASED_VMX_PREEMPTION_TIMER; 2377 PIN_BASED_VMX_PREEMPTION_TIMER;
2378 if (vmx_vm_has_apicv(vmx->vcpu.kvm))
2379 vmx->nested.nested_vmx_pinbased_ctls_high |=
2380 PIN_BASED_POSTED_INTR;
2325 2381
2326 /* exit controls */ 2382 /* exit controls */
2327 rdmsr(MSR_IA32_VMX_EXIT_CTLS, 2383 rdmsr(MSR_IA32_VMX_EXIT_CTLS,
2328 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high); 2384 vmx->nested.nested_vmx_exit_ctls_low,
2329 nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; 2385 vmx->nested.nested_vmx_exit_ctls_high);
2386 vmx->nested.nested_vmx_exit_ctls_low =
2387 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
2330 2388
2331 nested_vmx_exit_ctls_high &= 2389 vmx->nested.nested_vmx_exit_ctls_high &=
2332#ifdef CONFIG_X86_64 2390#ifdef CONFIG_X86_64
2333 VM_EXIT_HOST_ADDR_SPACE_SIZE | 2391 VM_EXIT_HOST_ADDR_SPACE_SIZE |
2334#endif 2392#endif
2335 VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT; 2393 VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
2336 nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | 2394 vmx->nested.nested_vmx_exit_ctls_high |=
2395 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
2337 VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | 2396 VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
2338 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT; 2397 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;
2339 2398
2340 if (vmx_mpx_supported()) 2399 if (vmx_mpx_supported())
2341 nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS; 2400 vmx->nested.nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
2342 2401
2343 /* We support free control of debug control saving. */ 2402 /* We support free control of debug control saving. */
2344 nested_vmx_true_exit_ctls_low = nested_vmx_exit_ctls_low & 2403 vmx->nested.nested_vmx_true_exit_ctls_low =
2404 vmx->nested.nested_vmx_exit_ctls_low &
2345 ~VM_EXIT_SAVE_DEBUG_CONTROLS; 2405 ~VM_EXIT_SAVE_DEBUG_CONTROLS;
2346 2406
2347 /* entry controls */ 2407 /* entry controls */
2348 rdmsr(MSR_IA32_VMX_ENTRY_CTLS, 2408 rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
2349 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high); 2409 vmx->nested.nested_vmx_entry_ctls_low,
2350 nested_vmx_entry_ctls_low = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; 2410 vmx->nested.nested_vmx_entry_ctls_high);
2351 nested_vmx_entry_ctls_high &= 2411 vmx->nested.nested_vmx_entry_ctls_low =
2412 VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
2413 vmx->nested.nested_vmx_entry_ctls_high &=
2352#ifdef CONFIG_X86_64 2414#ifdef CONFIG_X86_64
2353 VM_ENTRY_IA32E_MODE | 2415 VM_ENTRY_IA32E_MODE |
2354#endif 2416#endif
2355 VM_ENTRY_LOAD_IA32_PAT; 2417 VM_ENTRY_LOAD_IA32_PAT;
2356 nested_vmx_entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | 2418 vmx->nested.nested_vmx_entry_ctls_high |=
2357 VM_ENTRY_LOAD_IA32_EFER); 2419 (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER);
2358 if (vmx_mpx_supported()) 2420 if (vmx_mpx_supported())
2359 nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS; 2421 vmx->nested.nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
2360 2422
2361 /* We support free control of debug control loading. */ 2423 /* We support free control of debug control loading. */
2362 nested_vmx_true_entry_ctls_low = nested_vmx_entry_ctls_low & 2424 vmx->nested.nested_vmx_true_entry_ctls_low =
2425 vmx->nested.nested_vmx_entry_ctls_low &
2363 ~VM_ENTRY_LOAD_DEBUG_CONTROLS; 2426 ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
2364 2427
2365 /* cpu-based controls */ 2428 /* cpu-based controls */
2366 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, 2429 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
2367 nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high); 2430 vmx->nested.nested_vmx_procbased_ctls_low,
2368 nested_vmx_procbased_ctls_low = CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 2431 vmx->nested.nested_vmx_procbased_ctls_high);
2369 nested_vmx_procbased_ctls_high &= 2432 vmx->nested.nested_vmx_procbased_ctls_low =
2433 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
2434 vmx->nested.nested_vmx_procbased_ctls_high &=
2370 CPU_BASED_VIRTUAL_INTR_PENDING | 2435 CPU_BASED_VIRTUAL_INTR_PENDING |
2371 CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING | 2436 CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING |
2372 CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING | 2437 CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
@@ -2386,45 +2451,55 @@ static __init void nested_vmx_setup_ctls_msrs(void)
2386 * can use it to avoid exits to L1 - even when L0 runs L2 2451 * can use it to avoid exits to L1 - even when L0 runs L2
2387 * without MSR bitmaps. 2452 * without MSR bitmaps.
2388 */ 2453 */
2389 nested_vmx_procbased_ctls_high |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | 2454 vmx->nested.nested_vmx_procbased_ctls_high |=
2455 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
2390 CPU_BASED_USE_MSR_BITMAPS; 2456 CPU_BASED_USE_MSR_BITMAPS;
2391 2457
2392 /* We support free control of CR3 access interception. */ 2458 /* We support free control of CR3 access interception. */
2393 nested_vmx_true_procbased_ctls_low = nested_vmx_procbased_ctls_low & 2459 vmx->nested.nested_vmx_true_procbased_ctls_low =
2460 vmx->nested.nested_vmx_procbased_ctls_low &
2394 ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING); 2461 ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
2395 2462
2396 /* secondary cpu-based controls */ 2463 /* secondary cpu-based controls */
2397 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2, 2464 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
2398 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high); 2465 vmx->nested.nested_vmx_secondary_ctls_low,
2399 nested_vmx_secondary_ctls_low = 0; 2466 vmx->nested.nested_vmx_secondary_ctls_high);
2400 nested_vmx_secondary_ctls_high &= 2467 vmx->nested.nested_vmx_secondary_ctls_low = 0;
2468 vmx->nested.nested_vmx_secondary_ctls_high &=
2401 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 2469 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
2470 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
2471 SECONDARY_EXEC_APIC_REGISTER_VIRT |
2472 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
2402 SECONDARY_EXEC_WBINVD_EXITING | 2473 SECONDARY_EXEC_WBINVD_EXITING |
2403 SECONDARY_EXEC_XSAVES; 2474 SECONDARY_EXEC_XSAVES;
2404 2475
2405 if (enable_ept) { 2476 if (enable_ept) {
2406 /* nested EPT: emulate EPT also to L1 */ 2477 /* nested EPT: emulate EPT also to L1 */
2407 nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT | 2478 vmx->nested.nested_vmx_secondary_ctls_high |=
2479 SECONDARY_EXEC_ENABLE_EPT |
2408 SECONDARY_EXEC_UNRESTRICTED_GUEST; 2480 SECONDARY_EXEC_UNRESTRICTED_GUEST;
2409 nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT | 2481 vmx->nested.nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
2410 VMX_EPTP_WB_BIT | VMX_EPT_2MB_PAGE_BIT | 2482 VMX_EPTP_WB_BIT | VMX_EPT_2MB_PAGE_BIT |
2411 VMX_EPT_INVEPT_BIT; 2483 VMX_EPT_INVEPT_BIT;
2412 nested_vmx_ept_caps &= vmx_capability.ept; 2484 vmx->nested.nested_vmx_ept_caps &= vmx_capability.ept;
2413 /* 2485 /*
2414 * For nested guests, we don't do anything specific 2486 * For nested guests, we don't do anything specific
2415 * for single context invalidation. Hence, only advertise 2487 * for single context invalidation. Hence, only advertise
2416 * support for global context invalidation. 2488 * support for global context invalidation.
2417 */ 2489 */
2418 nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT; 2490 vmx->nested.nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT;
2419 } else 2491 } else
2420 nested_vmx_ept_caps = 0; 2492 vmx->nested.nested_vmx_ept_caps = 0;
2421 2493
2422 /* miscellaneous data */ 2494 /* miscellaneous data */
2423 rdmsr(MSR_IA32_VMX_MISC, nested_vmx_misc_low, nested_vmx_misc_high); 2495 rdmsr(MSR_IA32_VMX_MISC,
2424 nested_vmx_misc_low &= VMX_MISC_SAVE_EFER_LMA; 2496 vmx->nested.nested_vmx_misc_low,
2425 nested_vmx_misc_low |= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | 2497 vmx->nested.nested_vmx_misc_high);
2498 vmx->nested.nested_vmx_misc_low &= VMX_MISC_SAVE_EFER_LMA;
2499 vmx->nested.nested_vmx_misc_low |=
2500 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
2426 VMX_MISC_ACTIVITY_HLT; 2501 VMX_MISC_ACTIVITY_HLT;
2427 nested_vmx_misc_high = 0; 2502 vmx->nested.nested_vmx_misc_high = 0;
2428} 2503}
2429 2504
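Each nested_vmx_*_ctls_low/high pair set up above follows the convention described in the comment before nested_vmx_setup_ctls_msrs: the low half lists control bits that must be 1, the high half lists bits that may be 1, and the pair is reported to L1 packed into a single 64-bit MSR. A small sketch of that packing and of the corresponding validity check, assuming (as usual) that every must-be-1 bit is also reported as allowed-1:

#include <stdint.h>
#include <stdio.h>

static uint64_t pack_control_msr(uint32_t low, uint32_t high)
{
        return low | ((uint64_t)high << 32);
}

static int control_ok(uint32_t control, uint32_t low, uint32_t high)
{
        return (control & low) == low &&      /* all required bits set  */
               (control & ~high) == 0;        /* no disallowed bits set */
}

int main(void)
{
        uint32_t low = 0x00000016, high = 0x0000fff6;

        printf("MSR value: 0x%016llx\n",
               (unsigned long long)pack_control_msr(low, high));
        printf("0x36 ok? %d\n", control_ok(0x36, low, high));  /* 1: valid            */
        printf("0x20 ok? %d\n", control_ok(0x20, low, high));  /* 0: required bit off */
        return 0;
}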
2430static inline bool vmx_control_verify(u32 control, u32 low, u32 high) 2505static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
@@ -2443,6 +2518,8 @@ static inline u64 vmx_control_msr(u32 low, u32 high)
2443/* Returns 0 on success, non-0 otherwise. */ 2518/* Returns 0 on success, non-0 otherwise. */
2444static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) 2519static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
2445{ 2520{
2521 struct vcpu_vmx *vmx = to_vmx(vcpu);
2522
2446 switch (msr_index) { 2523 switch (msr_index) {
2447 case MSR_IA32_VMX_BASIC: 2524 case MSR_IA32_VMX_BASIC:
2448 /* 2525 /*
@@ -2457,36 +2534,44 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
2457 break; 2534 break;
2458 case MSR_IA32_VMX_TRUE_PINBASED_CTLS: 2535 case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
2459 case MSR_IA32_VMX_PINBASED_CTLS: 2536 case MSR_IA32_VMX_PINBASED_CTLS:
2460 *pdata = vmx_control_msr(nested_vmx_pinbased_ctls_low, 2537 *pdata = vmx_control_msr(
2461 nested_vmx_pinbased_ctls_high); 2538 vmx->nested.nested_vmx_pinbased_ctls_low,
2539 vmx->nested.nested_vmx_pinbased_ctls_high);
2462 break; 2540 break;
2463 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: 2541 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
2464 *pdata = vmx_control_msr(nested_vmx_true_procbased_ctls_low, 2542 *pdata = vmx_control_msr(
2465 nested_vmx_procbased_ctls_high); 2543 vmx->nested.nested_vmx_true_procbased_ctls_low,
2544 vmx->nested.nested_vmx_procbased_ctls_high);
2466 break; 2545 break;
2467 case MSR_IA32_VMX_PROCBASED_CTLS: 2546 case MSR_IA32_VMX_PROCBASED_CTLS:
2468 *pdata = vmx_control_msr(nested_vmx_procbased_ctls_low, 2547 *pdata = vmx_control_msr(
2469 nested_vmx_procbased_ctls_high); 2548 vmx->nested.nested_vmx_procbased_ctls_low,
2549 vmx->nested.nested_vmx_procbased_ctls_high);
2470 break; 2550 break;
2471 case MSR_IA32_VMX_TRUE_EXIT_CTLS: 2551 case MSR_IA32_VMX_TRUE_EXIT_CTLS:
2472 *pdata = vmx_control_msr(nested_vmx_true_exit_ctls_low, 2552 *pdata = vmx_control_msr(
2473 nested_vmx_exit_ctls_high); 2553 vmx->nested.nested_vmx_true_exit_ctls_low,
2554 vmx->nested.nested_vmx_exit_ctls_high);
2474 break; 2555 break;
2475 case MSR_IA32_VMX_EXIT_CTLS: 2556 case MSR_IA32_VMX_EXIT_CTLS:
2476 *pdata = vmx_control_msr(nested_vmx_exit_ctls_low, 2557 *pdata = vmx_control_msr(
2477 nested_vmx_exit_ctls_high); 2558 vmx->nested.nested_vmx_exit_ctls_low,
2559 vmx->nested.nested_vmx_exit_ctls_high);
2478 break; 2560 break;
2479 case MSR_IA32_VMX_TRUE_ENTRY_CTLS: 2561 case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
2480 *pdata = vmx_control_msr(nested_vmx_true_entry_ctls_low, 2562 *pdata = vmx_control_msr(
2481 nested_vmx_entry_ctls_high); 2563 vmx->nested.nested_vmx_true_entry_ctls_low,
2564 vmx->nested.nested_vmx_entry_ctls_high);
2482 break; 2565 break;
2483 case MSR_IA32_VMX_ENTRY_CTLS: 2566 case MSR_IA32_VMX_ENTRY_CTLS:
2484 *pdata = vmx_control_msr(nested_vmx_entry_ctls_low, 2567 *pdata = vmx_control_msr(
2485 nested_vmx_entry_ctls_high); 2568 vmx->nested.nested_vmx_entry_ctls_low,
2569 vmx->nested.nested_vmx_entry_ctls_high);
2486 break; 2570 break;
2487 case MSR_IA32_VMX_MISC: 2571 case MSR_IA32_VMX_MISC:
2488 *pdata = vmx_control_msr(nested_vmx_misc_low, 2572 *pdata = vmx_control_msr(
2489 nested_vmx_misc_high); 2573 vmx->nested.nested_vmx_misc_low,
2574 vmx->nested.nested_vmx_misc_high);
2490 break; 2575 break;
2491 /* 2576 /*
2492 * These MSRs specify bits which the guest must keep fixed (on or off) 2577 * These MSRs specify bits which the guest must keep fixed (on or off)
@@ -2511,12 +2596,13 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
2511 *pdata = 0x2e; /* highest index: VMX_PREEMPTION_TIMER_VALUE */ 2596 *pdata = 0x2e; /* highest index: VMX_PREEMPTION_TIMER_VALUE */
2512 break; 2597 break;
2513 case MSR_IA32_VMX_PROCBASED_CTLS2: 2598 case MSR_IA32_VMX_PROCBASED_CTLS2:
2514 *pdata = vmx_control_msr(nested_vmx_secondary_ctls_low, 2599 *pdata = vmx_control_msr(
2515 nested_vmx_secondary_ctls_high); 2600 vmx->nested.nested_vmx_secondary_ctls_low,
2601 vmx->nested.nested_vmx_secondary_ctls_high);
2516 break; 2602 break;
2517 case MSR_IA32_VMX_EPT_VPID_CAP: 2603 case MSR_IA32_VMX_EPT_VPID_CAP:
2518 /* Currently, no nested vpid support */ 2604 /* Currently, no nested vpid support */
2519 *pdata = nested_vmx_ept_caps; 2605 *pdata = vmx->nested.nested_vmx_ept_caps;
2520 break; 2606 break;
2521 default: 2607 default:
2522 return 1; 2608 return 1;
@@ -2929,7 +3015,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
2929 SECONDARY_EXEC_APIC_REGISTER_VIRT | 3015 SECONDARY_EXEC_APIC_REGISTER_VIRT |
2930 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 3016 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
2931 SECONDARY_EXEC_SHADOW_VMCS | 3017 SECONDARY_EXEC_SHADOW_VMCS |
2932 SECONDARY_EXEC_XSAVES; 3018 SECONDARY_EXEC_XSAVES |
3019 SECONDARY_EXEC_ENABLE_PML;
2933 if (adjust_vmx_controls(min2, opt2, 3020 if (adjust_vmx_controls(min2, opt2,
2934 MSR_IA32_VMX_PROCBASED_CTLS2, 3021 MSR_IA32_VMX_PROCBASED_CTLS2,
2935 &_cpu_based_2nd_exec_control) < 0) 3022 &_cpu_based_2nd_exec_control) < 0)
@@ -4159,6 +4246,52 @@ static void __vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
4159 } 4246 }
4160} 4247}
4161 4248
4249/*
 4250 * If an MSR is allowed (not intercepted) by L0, check whether L1 allows it
 4251 * too. The corresponding bit is cleared only if both L0 and L1 allow it.
4252 */
4253static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1,
4254 unsigned long *msr_bitmap_nested,
4255 u32 msr, int type)
4256{
4257 int f = sizeof(unsigned long);
4258
4259 if (!cpu_has_vmx_msr_bitmap()) {
4260 WARN_ON(1);
4261 return;
4262 }
4263
4264 /*
4265 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
4266 * have the write-low and read-high bitmap offsets the wrong way round.
4267 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
4268 */
4269 if (msr <= 0x1fff) {
4270 if (type & MSR_TYPE_R &&
4271 !test_bit(msr, msr_bitmap_l1 + 0x000 / f))
4272 /* read-low */
4273 __clear_bit(msr, msr_bitmap_nested + 0x000 / f);
4274
4275 if (type & MSR_TYPE_W &&
4276 !test_bit(msr, msr_bitmap_l1 + 0x800 / f))
4277 /* write-low */
4278 __clear_bit(msr, msr_bitmap_nested + 0x800 / f);
4279
4280 } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
4281 msr &= 0x1fff;
4282 if (type & MSR_TYPE_R &&
4283 !test_bit(msr, msr_bitmap_l1 + 0x400 / f))
4284 /* read-high */
4285 __clear_bit(msr, msr_bitmap_nested + 0x400 / f);
4286
4287 if (type & MSR_TYPE_W &&
4288 !test_bit(msr, msr_bitmap_l1 + 0xc00 / f))
4289 /* write-high */
4290 __clear_bit(msr, msr_bitmap_nested + 0xc00 / f);
4291
4292 }
4293}
4294
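nested_vmx_disable_intercept_for_msr relies on the MSR-bitmap layout spelled out in its comment: one 4 KiB page split into read-low (0x000), read-high (0x400), write-low (0x800) and write-high (0xc00) quarters, each covering 0x2000 MSR numbers with one bit per MSR, bit set meaning the access is intercepted. A userspace sketch of that addressing, byte-granular for simplicity:

#include <stdint.h>
#include <string.h>
#include <stdio.h>

static int msr_bitmap_offset(uint32_t msr, int write)
{
        if (msr <= 0x1fff)
                return write ? 0x800 : 0x000;        /* low MSR range       */
        if (msr >= 0xc0000000 && msr <= 0xc0001fff)
                return write ? 0xc00 : 0x400;        /* high MSR range      */
        return -1;                                   /* always intercepted  */
}

static int msr_intercepted(const uint8_t *bitmap, uint32_t msr, int write)
{
        int off = msr_bitmap_offset(msr, write);
        uint32_t bit = msr & 0x1fff;

        if (off < 0)
                return 1;
        return (bitmap[off + bit / 8] >> (bit % 8)) & 1;
}

int main(void)
{
        uint8_t bitmap[4096];

        memset(bitmap, 0xff, sizeof(bitmap));        /* intercept everything */
        /* allow reads of MSR 0xc0000100: clear its read-high bit */
        bitmap[0x400 + (0xc0000100 & 0x1fff) / 8] &= ~(1 << ((0xc0000100 & 0x1fff) % 8));

        printf("read  0xc0000100 intercepted? %d\n",
               msr_intercepted(bitmap, 0xc0000100, 0));   /* 0: allowed */
        printf("write 0xc0000100 intercepted? %d\n",
               msr_intercepted(bitmap, 0xc0000100, 1));   /* 1: trapped */
        return 0;
}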
4162static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only) 4295static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
4163{ 4296{
4164 if (!longmode_only) 4297 if (!longmode_only)
@@ -4197,6 +4330,64 @@ static int vmx_vm_has_apicv(struct kvm *kvm)
4197 return enable_apicv && irqchip_in_kernel(kvm); 4330 return enable_apicv && irqchip_in_kernel(kvm);
4198} 4331}
4199 4332
4333static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
4334{
4335 struct vcpu_vmx *vmx = to_vmx(vcpu);
4336 int max_irr;
4337 void *vapic_page;
4338 u16 status;
4339
4340 if (vmx->nested.pi_desc &&
4341 vmx->nested.pi_pending) {
4342 vmx->nested.pi_pending = false;
4343 if (!pi_test_and_clear_on(vmx->nested.pi_desc))
4344 return 0;
4345
4346 max_irr = find_last_bit(
4347 (unsigned long *)vmx->nested.pi_desc->pir, 256);
4348
4349 if (max_irr == 256)
4350 return 0;
4351
4352 vapic_page = kmap(vmx->nested.virtual_apic_page);
4353 if (!vapic_page) {
4354 WARN_ON(1);
4355 return -ENOMEM;
4356 }
4357 __kvm_apic_update_irr(vmx->nested.pi_desc->pir, vapic_page);
4358 kunmap(vmx->nested.virtual_apic_page);
4359
4360 status = vmcs_read16(GUEST_INTR_STATUS);
4361 if ((u8)max_irr > ((u8)status & 0xff)) {
4362 status &= ~0xff;
4363 status |= (u8)max_irr;
4364 vmcs_write16(GUEST_INTR_STATUS, status);
4365 }
4366 }
4367 return 0;
4368}
4369
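vmx_complete_nested_posted_interrupt scans the 256-bit posted-interrupt request bitmap for the highest pending vector and raises the low byte of GUEST_INTR_STATUS (the RVI) to it if it is higher. A sketch of that scan and merge, with __builtin_clzll doing the work of find_last_bit:

#include <stdint.h>
#include <stdio.h>

/* Return the highest vector set in a 256-bit PIR, or -1 if none. */
static int pir_find_highest(const uint64_t pir[4])
{
        for (int w = 3; w >= 0; w--) {
                if (!pir[w])
                        continue;
                return w * 64 + 63 - __builtin_clzll(pir[w]);
        }
        return -1;
}

int main(void)
{
        uint64_t pir[4] = { 0 };
        uint16_t status = 0x0020;        /* current RVI = 0x20 */
        int max_irr;

        pir[0] |= 1ULL << 0x31;          /* vector 0x31 posted by L1 */
        max_irr = pir_find_highest(pir);

        if (max_irr >= 0 && (uint8_t)max_irr > (uint8_t)status) {
                status = (status & ~0xff) | (uint8_t)max_irr;
                printf("GUEST_INTR_STATUS -> 0x%04x\n", status);
        }
        return 0;
}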
4370static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
4371 int vector)
4372{
4373 struct vcpu_vmx *vmx = to_vmx(vcpu);
4374
4375 if (is_guest_mode(vcpu) &&
4376 vector == vmx->nested.posted_intr_nv) {
4377 /* the PIR and ON have been set by L1. */
4378 if (vcpu->mode == IN_GUEST_MODE)
4379 apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
4380 POSTED_INTR_VECTOR);
4381 /*
 4382 * If the posted interrupt is not recognized by hardware,
 4383 * it will be delivered at the next vmentry.
4384 */
4385 vmx->nested.pi_pending = true;
4386 kvm_make_request(KVM_REQ_EVENT, vcpu);
4387 return 0;
4388 }
4389 return -1;
4390}
4200/* 4391/*
4201 * Send interrupt to vcpu via posted interrupt way. 4392 * Send interrupt to vcpu via posted interrupt way.
4202 * 1. If target vcpu is running(non-root mode), send posted interrupt 4393 * 1. If target vcpu is running(non-root mode), send posted interrupt
@@ -4209,6 +4400,10 @@ static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
4209 struct vcpu_vmx *vmx = to_vmx(vcpu); 4400 struct vcpu_vmx *vmx = to_vmx(vcpu);
4210 int r; 4401 int r;
4211 4402
4403 r = vmx_deliver_nested_posted_interrupt(vcpu, vector);
4404 if (!r)
4405 return;
4406
4212 if (pi_test_and_set_pir(vector, &vmx->pi_desc)) 4407 if (pi_test_and_set_pir(vector, &vmx->pi_desc))
4213 return; 4408 return;
4214 4409
@@ -4360,6 +4555,9 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
4360 a current VMCS12 4555 a current VMCS12
4361 */ 4556 */
4362 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; 4557 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
 4558 /* PML is enabled/disabled when creating/destroying the vcpu */
4559 exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
4560
4363 return exec_control; 4561 return exec_control;
4364} 4562}
4365 4563
@@ -4986,11 +5184,12 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
4986 hypercall[2] = 0xc1; 5184 hypercall[2] = 0xc1;
4987} 5185}
4988 5186
4989static bool nested_cr0_valid(struct vmcs12 *vmcs12, unsigned long val) 5187static bool nested_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val)
4990{ 5188{
4991 unsigned long always_on = VMXON_CR0_ALWAYSON; 5189 unsigned long always_on = VMXON_CR0_ALWAYSON;
5190 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
4992 5191
4993 if (nested_vmx_secondary_ctls_high & 5192 if (to_vmx(vcpu)->nested.nested_vmx_secondary_ctls_high &
4994 SECONDARY_EXEC_UNRESTRICTED_GUEST && 5193 SECONDARY_EXEC_UNRESTRICTED_GUEST &&
4995 nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) 5194 nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST))
4996 always_on &= ~(X86_CR0_PE | X86_CR0_PG); 5195 always_on &= ~(X86_CR0_PE | X86_CR0_PG);
@@ -5015,7 +5214,7 @@ static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
5015 val = (val & ~vmcs12->cr0_guest_host_mask) | 5214 val = (val & ~vmcs12->cr0_guest_host_mask) |
5016 (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask); 5215 (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask);
5017 5216
5018 if (!nested_cr0_valid(vmcs12, val)) 5217 if (!nested_cr0_valid(vcpu, val))
5019 return 1; 5218 return 1;
5020 5219
5021 if (kvm_set_cr0(vcpu, val)) 5220 if (kvm_set_cr0(vcpu, val))
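handle_set_cr0 builds the effective CR0 by taking guest-owned bits from the value L2 tried to write and host-owned bits from the shadowed vmcs12->guest_cr0, selected by cr0_guest_host_mask. The merge is a plain select-by-mask, shown here in isolation:

#include <stdint.h>
#include <stdio.h>

/* Bits set in 'mask' are owned by the host and come from guest_cr0;
 * the rest come from the value the guest just tried to write. */
static uint64_t merge_cr0(uint64_t written, uint64_t guest_cr0, uint64_t mask)
{
        return (written & ~mask) | (guest_cr0 & mask);
}

int main(void)
{
        /* host owns PG (bit 31); the guest wrote CR0 with PG clear */
        uint64_t val = merge_cr0(0x00000011, 0x80000011, 1ULL << 31);

        printf("effective cr0 = 0x%llx\n", (unsigned long long)val);
        return 0;
}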
@@ -5817,13 +6016,21 @@ static __init int hardware_setup(void)
5817 (unsigned long *)__get_free_page(GFP_KERNEL); 6016 (unsigned long *)__get_free_page(GFP_KERNEL);
5818 if (!vmx_msr_bitmap_longmode_x2apic) 6017 if (!vmx_msr_bitmap_longmode_x2apic)
5819 goto out4; 6018 goto out4;
6019
6020 if (nested) {
6021 vmx_msr_bitmap_nested =
6022 (unsigned long *)__get_free_page(GFP_KERNEL);
6023 if (!vmx_msr_bitmap_nested)
6024 goto out5;
6025 }
6026
5820 vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); 6027 vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
5821 if (!vmx_vmread_bitmap) 6028 if (!vmx_vmread_bitmap)
5822 goto out5; 6029 goto out6;
5823 6030
5824 vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); 6031 vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
5825 if (!vmx_vmwrite_bitmap) 6032 if (!vmx_vmwrite_bitmap)
5826 goto out6; 6033 goto out7;
5827 6034
5828 memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); 6035 memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
5829 memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); 6036 memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
@@ -5839,10 +6046,12 @@ static __init int hardware_setup(void)
5839 6046
5840 memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE); 6047 memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
5841 memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE); 6048 memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
6049 if (nested)
6050 memset(vmx_msr_bitmap_nested, 0xff, PAGE_SIZE);
5842 6051
5843 if (setup_vmcs_config(&vmcs_config) < 0) { 6052 if (setup_vmcs_config(&vmcs_config) < 0) {
5844 r = -EIO; 6053 r = -EIO;
5845 goto out7; 6054 goto out8;
5846 } 6055 }
5847 6056
5848 if (boot_cpu_has(X86_FEATURE_NX)) 6057 if (boot_cpu_has(X86_FEATURE_NX))
@@ -5868,16 +6077,16 @@ static __init int hardware_setup(void)
5868 if (!cpu_has_vmx_unrestricted_guest()) 6077 if (!cpu_has_vmx_unrestricted_guest())
5869 enable_unrestricted_guest = 0; 6078 enable_unrestricted_guest = 0;
5870 6079
5871 if (!cpu_has_vmx_flexpriority()) { 6080 if (!cpu_has_vmx_flexpriority())
5872 flexpriority_enabled = 0; 6081 flexpriority_enabled = 0;
5873 6082
5874 /* 6083 /*
5875 * set_apic_access_page_addr() is used to reload apic access 6084 * set_apic_access_page_addr() is used to reload apic access
5876 * page upon invalidation. No need to do anything if the 6085 * page upon invalidation. No need to do anything if not
5877 * processor does not have the APIC_ACCESS_ADDR VMCS field. 6086 * using the APIC_ACCESS_ADDR VMCS field.
5878 */ 6087 */
6088 if (!flexpriority_enabled)
5879 kvm_x86_ops->set_apic_access_page_addr = NULL; 6089 kvm_x86_ops->set_apic_access_page_addr = NULL;
5880 }
5881 6090
5882 if (!cpu_has_vmx_tpr_shadow()) 6091 if (!cpu_has_vmx_tpr_shadow())
5883 kvm_x86_ops->update_cr8_intercept = NULL; 6092 kvm_x86_ops->update_cr8_intercept = NULL;
@@ -5895,13 +6104,11 @@ static __init int hardware_setup(void)
5895 kvm_x86_ops->update_cr8_intercept = NULL; 6104 kvm_x86_ops->update_cr8_intercept = NULL;
5896 else { 6105 else {
5897 kvm_x86_ops->hwapic_irr_update = NULL; 6106 kvm_x86_ops->hwapic_irr_update = NULL;
6107 kvm_x86_ops->hwapic_isr_update = NULL;
5898 kvm_x86_ops->deliver_posted_interrupt = NULL; 6108 kvm_x86_ops->deliver_posted_interrupt = NULL;
5899 kvm_x86_ops->sync_pir_to_irr = vmx_sync_pir_to_irr_dummy; 6109 kvm_x86_ops->sync_pir_to_irr = vmx_sync_pir_to_irr_dummy;
5900 } 6110 }
5901 6111
5902 if (nested)
5903 nested_vmx_setup_ctls_msrs();
5904
5905 vmx_disable_intercept_for_msr(MSR_FS_BASE, false); 6112 vmx_disable_intercept_for_msr(MSR_FS_BASE, false);
5906 vmx_disable_intercept_for_msr(MSR_GS_BASE, false); 6113 vmx_disable_intercept_for_msr(MSR_GS_BASE, false);
5907 vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true); 6114 vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true);
@@ -5945,12 +6152,29 @@ static __init int hardware_setup(void)
5945 6152
5946 update_ple_window_actual_max(); 6153 update_ple_window_actual_max();
5947 6154
6155 /*
6156 * Only enable PML when hardware supports PML feature, and both EPT
6157 * and EPT A/D bit features are enabled -- PML depends on them to work.
6158 */
6159 if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml())
6160 enable_pml = 0;
6161
6162 if (!enable_pml) {
6163 kvm_x86_ops->slot_enable_log_dirty = NULL;
6164 kvm_x86_ops->slot_disable_log_dirty = NULL;
6165 kvm_x86_ops->flush_log_dirty = NULL;
6166 kvm_x86_ops->enable_log_dirty_pt_masked = NULL;
6167 }
6168
5948 return alloc_kvm_area(); 6169 return alloc_kvm_area();
5949 6170
5950out7: 6171out8:
5951 free_page((unsigned long)vmx_vmwrite_bitmap); 6172 free_page((unsigned long)vmx_vmwrite_bitmap);
5952out6: 6173out7:
5953 free_page((unsigned long)vmx_vmread_bitmap); 6174 free_page((unsigned long)vmx_vmread_bitmap);
6175out6:
6176 if (nested)
6177 free_page((unsigned long)vmx_msr_bitmap_nested);
5954out5: 6178out5:
5955 free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); 6179 free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
5956out4: 6180out4:
@@ -5977,6 +6201,8 @@ static __exit void hardware_unsetup(void)
5977 free_page((unsigned long)vmx_io_bitmap_a); 6201 free_page((unsigned long)vmx_io_bitmap_a);
5978 free_page((unsigned long)vmx_vmwrite_bitmap); 6202 free_page((unsigned long)vmx_vmwrite_bitmap);
5979 free_page((unsigned long)vmx_vmread_bitmap); 6203 free_page((unsigned long)vmx_vmread_bitmap);
6204 if (nested)
6205 free_page((unsigned long)vmx_msr_bitmap_nested);
5980 6206
5981 free_kvm_area(); 6207 free_kvm_area();
5982} 6208}
@@ -6143,6 +6369,13 @@ static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
6143 */ 6369 */
6144} 6370}
6145 6371
6372static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
6373{
 6374	/* TODO: do something smarter than simply resetting the guest here. */
6375 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
6376 pr_warn("kvm: nested vmx abort, indicator %d\n", indicator);
6377}
6378
6146static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) 6379static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
6147{ 6380{
6148 struct vcpu_vmx *vmx = 6381 struct vcpu_vmx *vmx =
@@ -6432,6 +6665,7 @@ static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
6432 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); 6665 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
6433 vmcs_write64(VMCS_LINK_POINTER, -1ull); 6666 vmcs_write64(VMCS_LINK_POINTER, -1ull);
6434 } 6667 }
6668 vmx->nested.posted_intr_nv = -1;
6435 kunmap(vmx->nested.current_vmcs12_page); 6669 kunmap(vmx->nested.current_vmcs12_page);
6436 nested_release_page(vmx->nested.current_vmcs12_page); 6670 nested_release_page(vmx->nested.current_vmcs12_page);
6437 vmx->nested.current_vmptr = -1ull; 6671 vmx->nested.current_vmptr = -1ull;
@@ -6460,6 +6694,12 @@ static void free_nested(struct vcpu_vmx *vmx)
6460 nested_release_page(vmx->nested.virtual_apic_page); 6694 nested_release_page(vmx->nested.virtual_apic_page);
6461 vmx->nested.virtual_apic_page = NULL; 6695 vmx->nested.virtual_apic_page = NULL;
6462 } 6696 }
6697 if (vmx->nested.pi_desc_page) {
6698 kunmap(vmx->nested.pi_desc_page);
6699 nested_release_page(vmx->nested.pi_desc_page);
6700 vmx->nested.pi_desc_page = NULL;
6701 vmx->nested.pi_desc = NULL;
6702 }
6463 6703
6464 nested_free_all_saved_vmcss(vmx); 6704 nested_free_all_saved_vmcss(vmx);
6465} 6705}
@@ -6893,6 +7133,7 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
6893/* Emulate the INVEPT instruction */ 7133/* Emulate the INVEPT instruction */
6894static int handle_invept(struct kvm_vcpu *vcpu) 7134static int handle_invept(struct kvm_vcpu *vcpu)
6895{ 7135{
7136 struct vcpu_vmx *vmx = to_vmx(vcpu);
6896 u32 vmx_instruction_info, types; 7137 u32 vmx_instruction_info, types;
6897 unsigned long type; 7138 unsigned long type;
6898 gva_t gva; 7139 gva_t gva;
@@ -6901,8 +7142,9 @@ static int handle_invept(struct kvm_vcpu *vcpu)
6901 u64 eptp, gpa; 7142 u64 eptp, gpa;
6902 } operand; 7143 } operand;
6903 7144
6904 if (!(nested_vmx_secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) || 7145 if (!(vmx->nested.nested_vmx_secondary_ctls_high &
6905 !(nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) { 7146 SECONDARY_EXEC_ENABLE_EPT) ||
7147 !(vmx->nested.nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) {
6906 kvm_queue_exception(vcpu, UD_VECTOR); 7148 kvm_queue_exception(vcpu, UD_VECTOR);
6907 return 1; 7149 return 1;
6908 } 7150 }
@@ -6918,7 +7160,7 @@ static int handle_invept(struct kvm_vcpu *vcpu)
6918 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 7160 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
6919 type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); 7161 type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
6920 7162
6921 types = (nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; 7163 types = (vmx->nested.nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
6922 7164
6923 if (!(types & (1UL << type))) { 7165 if (!(types & (1UL << type))) {
6924 nested_vmx_failValid(vcpu, 7166 nested_vmx_failValid(vcpu,
@@ -6960,6 +7202,31 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
6960 return 1; 7202 return 1;
6961} 7203}
6962 7204
7205static int handle_pml_full(struct kvm_vcpu *vcpu)
7206{
7207 unsigned long exit_qualification;
7208
7209 trace_kvm_pml_full(vcpu->vcpu_id);
7210
7211 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
7212
7213 /*
 7214	 * If the PML buffer filled up while executing IRET from an NMI,
 7215	 * the "blocked by NMI" bit has to be set before the next VM entry.
7216 */
7217 if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
7218 cpu_has_virtual_nmis() &&
7219 (exit_qualification & INTR_INFO_UNBLOCK_NMI))
7220 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
7221 GUEST_INTR_STATE_NMI);
7222
7223 /*
7224 * PML buffer already flushed at beginning of VMEXIT. Nothing to do
 7225	 * here, and there's no userspace involvement needed for PML.
7226 */
7227 return 1;
7228}
7229
6963/* 7230/*
6964 * The exit handlers return 1 if the exit was handled fully and guest execution 7231 * The exit handlers return 1 if the exit was handled fully and guest execution
6965 * may resume. Otherwise they set the kvm_run parameter to indicate what needs 7232 * may resume. Otherwise they set the kvm_run parameter to indicate what needs
@@ -7008,6 +7275,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
7008 [EXIT_REASON_INVVPID] = handle_invvpid, 7275 [EXIT_REASON_INVVPID] = handle_invvpid,
7009 [EXIT_REASON_XSAVES] = handle_xsaves, 7276 [EXIT_REASON_XSAVES] = handle_xsaves,
7010 [EXIT_REASON_XRSTORS] = handle_xrstors, 7277 [EXIT_REASON_XRSTORS] = handle_xrstors,
7278 [EXIT_REASON_PML_FULL] = handle_pml_full,
7011}; 7279};
7012 7280
7013static const int kvm_vmx_max_exit_handlers = 7281static const int kvm_vmx_max_exit_handlers =
@@ -7275,6 +7543,10 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
7275 case EXIT_REASON_APIC_ACCESS: 7543 case EXIT_REASON_APIC_ACCESS:
7276 return nested_cpu_has2(vmcs12, 7544 return nested_cpu_has2(vmcs12,
7277 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); 7545 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
7546 case EXIT_REASON_APIC_WRITE:
7547 case EXIT_REASON_EOI_INDUCED:
7548 /* apic_write and eoi_induced should exit unconditionally. */
7549 return 1;
7278 case EXIT_REASON_EPT_VIOLATION: 7550 case EXIT_REASON_EPT_VIOLATION:
7279 /* 7551 /*
7280 * L0 always deals with the EPT violation. If nested EPT is 7552 * L0 always deals with the EPT violation. If nested EPT is
@@ -7314,6 +7586,89 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
7314 *info2 = vmcs_read32(VM_EXIT_INTR_INFO); 7586 *info2 = vmcs_read32(VM_EXIT_INTR_INFO);
7315} 7587}
7316 7588
7589static int vmx_enable_pml(struct vcpu_vmx *vmx)
7590{
7591 struct page *pml_pg;
7592 u32 exec_control;
7593
7594 pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO);
7595 if (!pml_pg)
7596 return -ENOMEM;
7597
7598 vmx->pml_pg = pml_pg;
7599
7600 vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
7601 vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
7602
7603 exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
7604 exec_control |= SECONDARY_EXEC_ENABLE_PML;
7605 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
7606
7607 return 0;
7608}
7609
7610static void vmx_disable_pml(struct vcpu_vmx *vmx)
7611{
7612 u32 exec_control;
7613
7614 ASSERT(vmx->pml_pg);
7615 __free_page(vmx->pml_pg);
7616 vmx->pml_pg = NULL;
7617
7618 exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
7619 exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
7620 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
7621}
7622
7623static void vmx_flush_pml_buffer(struct vcpu_vmx *vmx)
7624{
7625 struct kvm *kvm = vmx->vcpu.kvm;
7626 u64 *pml_buf;
7627 u16 pml_idx;
7628
7629 pml_idx = vmcs_read16(GUEST_PML_INDEX);
7630
7631 /* Do nothing if PML buffer is empty */
7632 if (pml_idx == (PML_ENTITY_NUM - 1))
7633 return;
7634
7635 /* PML index always points to next available PML buffer entity */
7636 if (pml_idx >= PML_ENTITY_NUM)
7637 pml_idx = 0;
7638 else
7639 pml_idx++;
7640
7641 pml_buf = page_address(vmx->pml_pg);
7642 for (; pml_idx < PML_ENTITY_NUM; pml_idx++) {
7643 u64 gpa;
7644
7645 gpa = pml_buf[pml_idx];
7646 WARN_ON(gpa & (PAGE_SIZE - 1));
7647 mark_page_dirty(kvm, gpa >> PAGE_SHIFT);
7648 }
7649
7650 /* reset PML index */
7651 vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
7652}
7653
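As a side note on the arithmetic above: a minimal standalone sketch (plain C, not kernel code) of how the valid PML slots are derived from GUEST_PML_INDEX, assuming PML_ENTITY_NUM is 512 eight-byte entries per 4 KiB page as in the VMX headers, and that hardware fills the page from the last slot downwards:

	#include <stdint.h>
	#include <stdio.h>

	#define PML_ENTITY_NUM 512	/* entries per PML page, 8 bytes each */

	/* Walk the valid entries the same way vmx_flush_pml_buffer() does:
	 * the hardware index points at the next free slot and counts down,
	 * so the filled slots are pml_idx+1 .. PML_ENTITY_NUM-1, or all of
	 * them once the index has underflowed and reads back >= 512. */
	static void walk_pml(const uint64_t *pml_buf, uint16_t pml_idx)
	{
		if (pml_idx == PML_ENTITY_NUM - 1)
			return;			/* buffer empty, nothing logged */

		if (pml_idx >= PML_ENTITY_NUM)
			pml_idx = 0;		/* buffer full: every slot valid */
		else
			pml_idx++;		/* first valid slot */

		for (; pml_idx < PML_ENTITY_NUM; pml_idx++)
			printf("dirty gfn 0x%llx\n",
			       (unsigned long long)(pml_buf[pml_idx] >> 12));
	}

	int main(void)
	{
		uint64_t buf[PML_ENTITY_NUM] = { 0 };

		buf[511] = 0x1234000;	/* pretend hardware logged one GPA ... */
		walk_pml(buf, 510);	/* ... and decremented the index to 510 */
		return 0;
	}

After the walk the kernel rewinds GUEST_PML_INDEX to PML_ENTITY_NUM - 1 so the hardware can reuse the whole page.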
7654/*
 7655	 * Flush all vcpus' PML buffers and propagate the logged GPAs to dirty_bitmap.
7656 * Called before reporting dirty_bitmap to userspace.
7657 */
7658static void kvm_flush_pml_buffers(struct kvm *kvm)
7659{
7660 int i;
7661 struct kvm_vcpu *vcpu;
7662 /*
 7663	 * We only need to kick each vcpu out of guest mode here: the PML
 7664	 * buffer is flushed at the beginning of every VMEXIT, so only vcpus
 7665	 * currently running in guest mode can still have unflushed GPAs in
 7666	 * their PML buffers.
7667 */
7668 kvm_for_each_vcpu(i, vcpu, kvm)
7669 kvm_vcpu_kick(vcpu);
7670}
7671
7317/* 7672/*
7318 * The guest has exited. See if we can fix it or if we need userspace 7673 * The guest has exited. See if we can fix it or if we need userspace
7319 * assistance. 7674 * assistance.
@@ -7324,6 +7679,16 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
7324 u32 exit_reason = vmx->exit_reason; 7679 u32 exit_reason = vmx->exit_reason;
7325 u32 vectoring_info = vmx->idt_vectoring_info; 7680 u32 vectoring_info = vmx->idt_vectoring_info;
7326 7681
7682 /*
 7683	 * Flush the GPAs logged in the PML buffer so that dirty_bitmap is
 7684	 * kept up to date. A further benefit: in kvm_vm_ioctl_get_dirty_log,
 7685	 * before querying dirty_bitmap we only need to kick all vcpus out of
 7686	 * guest mode, since any vcpu already in root mode must have flushed
 7687	 * its PML buffer already.
7688 */
7689 if (enable_pml)
7690 vmx_flush_pml_buffer(vmx);
7691
7327 /* If guest state is invalid, start emulating */ 7692 /* If guest state is invalid, start emulating */
7328 if (vmx->emulation_required) 7693 if (vmx->emulation_required)
7329 return handle_invalid_guest_state(vcpu); 7694 return handle_invalid_guest_state(vcpu);
@@ -7471,9 +7836,6 @@ static void vmx_hwapic_isr_update(struct kvm *kvm, int isr)
7471 u16 status; 7836 u16 status;
7472 u8 old; 7837 u8 old;
7473 7838
7474 if (!vmx_vm_has_apicv(kvm))
7475 return;
7476
7477 if (isr == -1) 7839 if (isr == -1)
7478 isr = 0; 7840 isr = 0;
7479 7841
@@ -7973,6 +8335,8 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
7973{ 8335{
7974 struct vcpu_vmx *vmx = to_vmx(vcpu); 8336 struct vcpu_vmx *vmx = to_vmx(vcpu);
7975 8337
8338 if (enable_pml)
8339 vmx_disable_pml(vmx);
7976 free_vpid(vmx); 8340 free_vpid(vmx);
7977 leave_guest_mode(vcpu); 8341 leave_guest_mode(vcpu);
7978 vmx_load_vmcs01(vcpu); 8342 vmx_load_vmcs01(vcpu);
@@ -8040,9 +8404,25 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
8040 goto free_vmcs; 8404 goto free_vmcs;
8041 } 8405 }
8042 8406
8407 if (nested)
8408 nested_vmx_setup_ctls_msrs(vmx);
8409
8410 vmx->nested.posted_intr_nv = -1;
8043 vmx->nested.current_vmptr = -1ull; 8411 vmx->nested.current_vmptr = -1ull;
8044 vmx->nested.current_vmcs12 = NULL; 8412 vmx->nested.current_vmcs12 = NULL;
8045 8413
8414 /*
 8415	 * If PML is turned on, a failure to enable PML simply fails vcpu
 8416	 * creation, which keeps the PML logic simple: we never have to deal
 8417	 * with cases such as PML being enabled on only some of the guest's
 8418	 * vcpus.
8419 */
8420 if (enable_pml) {
8421 err = vmx_enable_pml(vmx);
8422 if (err)
8423 goto free_vmcs;
8424 }
8425
8046 return &vmx->vcpu; 8426 return &vmx->vcpu;
8047 8427
8048free_vmcs: 8428free_vmcs:
@@ -8184,9 +8564,10 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu)
8184 8564
8185static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) 8565static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
8186{ 8566{
8187 kvm_init_shadow_ept_mmu(vcpu, &vcpu->arch.mmu, 8567 WARN_ON(mmu_is_nested(vcpu));
8188 nested_vmx_ept_caps & VMX_EPT_EXECUTE_ONLY_BIT); 8568 kvm_init_shadow_ept_mmu(vcpu,
8189 8569 to_vmx(vcpu)->nested.nested_vmx_ept_caps &
8570 VMX_EPT_EXECUTE_ONLY_BIT);
8190 vcpu->arch.mmu.set_cr3 = vmx_set_cr3; 8571 vcpu->arch.mmu.set_cr3 = vmx_set_cr3;
8191 vcpu->arch.mmu.get_cr3 = nested_ept_get_cr3; 8572 vcpu->arch.mmu.get_cr3 = nested_ept_get_cr3;
8192 vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault; 8573 vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault;
@@ -8199,6 +8580,18 @@ static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
8199 vcpu->arch.walk_mmu = &vcpu->arch.mmu; 8580 vcpu->arch.walk_mmu = &vcpu->arch.mmu;
8200} 8581}
8201 8582
8583static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
8584 u16 error_code)
8585{
8586 bool inequality, bit;
8587
8588 bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0;
8589 inequality =
8590 (error_code & vmcs12->page_fault_error_code_mask) !=
8591 vmcs12->page_fault_error_code_match;
8592 return inequality ^ bit;
8593}
8594
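The inequality ^ bit expression implements the architectural PFEC_MASK/PFEC_MATCH rule: with EB.PF set, only matching error codes cause a vmexit to L1; with EB.PF clear, only mismatching ones do. A small standalone sketch (illustrative values, not kernel code):

	#include <stdbool.h>
	#include <stdint.h>

	/* Same decision as nested_vmx_is_page_fault_vmexit(), written out.
	 * "match" means (error_code & mask) == match_value. */
	static bool pf_goes_to_l1(uint32_t exception_bitmap,
				  uint32_t pfec_mask, uint32_t pfec_match,
				  uint16_t error_code)
	{
		bool eb_pf = exception_bitmap & (1u << 14);	/* PF_VECTOR == 14 */
		bool mismatch = (error_code & pfec_mask) != pfec_match;

		/* EB.PF = 1: vmexit to L1 only on a match.
		 * EB.PF = 0: vmexit to L1 only on a mismatch. */
		return mismatch ^ eb_pf;
	}

	int main(void)
	{
		/* EB.PF set; mask/match select only write faults (bit 1 set) */
		return pf_goes_to_l1(1u << 14, 0x2, 0x2, /*error_code=*/0x3) ? 0 : 1;
	}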
8202static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, 8595static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
8203 struct x86_exception *fault) 8596 struct x86_exception *fault)
8204{ 8597{
@@ -8206,8 +8599,7 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
8206 8599
8207 WARN_ON(!is_guest_mode(vcpu)); 8600 WARN_ON(!is_guest_mode(vcpu));
8208 8601
8209 /* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */ 8602 if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code))
8210 if (vmcs12->exception_bitmap & (1u << PF_VECTOR))
8211 nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason, 8603 nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason,
8212 vmcs_read32(VM_EXIT_INTR_INFO), 8604 vmcs_read32(VM_EXIT_INTR_INFO),
8213 vmcs_readl(EXIT_QUALIFICATION)); 8605 vmcs_readl(EXIT_QUALIFICATION));
@@ -8261,6 +8653,31 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
8261 return false; 8653 return false;
8262 } 8654 }
8263 8655
8656 if (nested_cpu_has_posted_intr(vmcs12)) {
8657 if (!IS_ALIGNED(vmcs12->posted_intr_desc_addr, 64))
8658 return false;
8659
8660 if (vmx->nested.pi_desc_page) { /* shouldn't happen */
8661 kunmap(vmx->nested.pi_desc_page);
8662 nested_release_page(vmx->nested.pi_desc_page);
8663 }
8664 vmx->nested.pi_desc_page =
8665 nested_get_page(vcpu, vmcs12->posted_intr_desc_addr);
8666 if (!vmx->nested.pi_desc_page)
8667 return false;
8668
8669 vmx->nested.pi_desc =
8670 (struct pi_desc *)kmap(vmx->nested.pi_desc_page);
8671 if (!vmx->nested.pi_desc) {
8672 nested_release_page_clean(vmx->nested.pi_desc_page);
8673 return false;
8674 }
8675 vmx->nested.pi_desc =
8676 (struct pi_desc *)((void *)vmx->nested.pi_desc +
8677 (unsigned long)(vmcs12->posted_intr_desc_addr &
8678 (PAGE_SIZE - 1)));
8679 }
8680
8264 return true; 8681 return true;
8265} 8682}
8266 8683
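The pointer arithmetic above just re-applies the descriptor's offset within its page after kmap(). A trivial standalone sketch with a hypothetical posted_intr_desc_addr:

	#include <stdint.h>
	#include <stdio.h>

	#define PAGE_SIZE 4096ULL

	int main(void)
	{
		/* hypothetical value L1 might put in posted_intr_desc_addr */
		uint64_t pi_desc_addr = 0x12345040ULL;

		uint64_t page_gpa = pi_desc_addr & ~(PAGE_SIZE - 1);	/* page to map   */
		uint64_t offset   = pi_desc_addr &  (PAGE_SIZE - 1);	/* offset inside */

		printf("map gpa 0x%llx, descriptor at offset 0x%llx (64-byte aligned: %s)\n",
		       (unsigned long long)page_gpa, (unsigned long long)offset,
		       (pi_desc_addr & 63) == 0 ? "yes" : "no");
		return 0;
	}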
@@ -8286,6 +8703,310 @@ static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu)
8286 ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL); 8703 ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL);
8287} 8704}
8288 8705
8706static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
8707 struct vmcs12 *vmcs12)
8708{
8709 int maxphyaddr;
8710 u64 addr;
8711
8712 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
8713 return 0;
8714
8715 if (vmcs12_read_any(vcpu, MSR_BITMAP, &addr)) {
8716 WARN_ON(1);
8717 return -EINVAL;
8718 }
8719 maxphyaddr = cpuid_maxphyaddr(vcpu);
8720
8721 if (!PAGE_ALIGNED(vmcs12->msr_bitmap) ||
8722 ((addr + PAGE_SIZE) >> maxphyaddr))
8723 return -EINVAL;
8724
8725 return 0;
8726}
8727
8728/*
 8729 * Merge L0's and L1's MSR bitmaps; return false to indicate that
 8730 * the hardware MSR bitmap should not be used.
8731 */
8732static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
8733 struct vmcs12 *vmcs12)
8734{
8735 int msr;
8736 struct page *page;
8737 unsigned long *msr_bitmap;
8738
8739 if (!nested_cpu_has_virt_x2apic_mode(vmcs12))
8740 return false;
8741
8742 page = nested_get_page(vcpu, vmcs12->msr_bitmap);
8743 if (!page) {
8744 WARN_ON(1);
8745 return false;
8746 }
8747 msr_bitmap = (unsigned long *)kmap(page);
8748 if (!msr_bitmap) {
8749 nested_release_page_clean(page);
8750 WARN_ON(1);
8751 return false;
8752 }
8753
8754 if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
8755 if (nested_cpu_has_apic_reg_virt(vmcs12))
8756 for (msr = 0x800; msr <= 0x8ff; msr++)
8757 nested_vmx_disable_intercept_for_msr(
8758 msr_bitmap,
8759 vmx_msr_bitmap_nested,
8760 msr, MSR_TYPE_R);
8761 /* TPR is allowed */
8762 nested_vmx_disable_intercept_for_msr(msr_bitmap,
8763 vmx_msr_bitmap_nested,
8764 APIC_BASE_MSR + (APIC_TASKPRI >> 4),
8765 MSR_TYPE_R | MSR_TYPE_W);
8766 if (nested_cpu_has_vid(vmcs12)) {
8767 /* EOI and self-IPI are allowed */
8768 nested_vmx_disable_intercept_for_msr(
8769 msr_bitmap,
8770 vmx_msr_bitmap_nested,
8771 APIC_BASE_MSR + (APIC_EOI >> 4),
8772 MSR_TYPE_W);
8773 nested_vmx_disable_intercept_for_msr(
8774 msr_bitmap,
8775 vmx_msr_bitmap_nested,
8776 APIC_BASE_MSR + (APIC_SELF_IPI >> 4),
8777 MSR_TYPE_W);
8778 }
8779 } else {
8780 /*
8781 * Enable reading intercept of all the x2apic
8782 * MSRs. We should not rely on vmcs12 to do any
8783 * optimizations here, it may have been modified
8784 * by L1.
8785 */
8786 for (msr = 0x800; msr <= 0x8ff; msr++)
8787 __vmx_enable_intercept_for_msr(
8788 vmx_msr_bitmap_nested,
8789 msr,
8790 MSR_TYPE_R);
8791
8792 __vmx_enable_intercept_for_msr(
8793 vmx_msr_bitmap_nested,
8794 APIC_BASE_MSR + (APIC_TASKPRI >> 4),
8795 MSR_TYPE_W);
8796 __vmx_enable_intercept_for_msr(
8797 vmx_msr_bitmap_nested,
8798 APIC_BASE_MSR + (APIC_EOI >> 4),
8799 MSR_TYPE_W);
8800 __vmx_enable_intercept_for_msr(
8801 vmx_msr_bitmap_nested,
8802 APIC_BASE_MSR + (APIC_SELF_IPI >> 4),
8803 MSR_TYPE_W);
8804 }
8805 kunmap(page);
8806 nested_release_page_clean(page);
8807
8808 return true;
8809}
8810
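The MSR numbers used here follow the x2APIC convention that MSR 0x800 + (xAPIC MMIO offset >> 4) maps each 16-byte-aligned APIC register, which is why the 0x800-0x8ff range and the APIC_BASE_MSR + (reg >> 4) expressions appear. A standalone sketch of the mapping (usual offsets assumed: TPR 0x80, EOI 0xB0, SELF_IPI 0x3F0):

	#include <stdint.h>
	#include <stdio.h>

	#define APIC_BASE_MSR	0x800	/* first x2APIC MSR */

	/* x2APIC exposes each 16-byte-aligned xAPIC MMIO register as one MSR:
	 * MSR index = 0x800 + (MMIO offset >> 4). */
	static uint32_t x2apic_msr(uint32_t mmio_offset)
	{
		return APIC_BASE_MSR + (mmio_offset >> 4);
	}

	int main(void)
	{
		printf("TPR      (offset 0x080) -> MSR 0x%x\n", x2apic_msr(0x080)); /* 0x808 */
		printf("EOI      (offset 0x0b0) -> MSR 0x%x\n", x2apic_msr(0x0b0)); /* 0x80b */
		printf("SELF_IPI (offset 0x3f0) -> MSR 0x%x\n", x2apic_msr(0x3f0)); /* 0x83f */
		return 0;
	}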
8811static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
8812 struct vmcs12 *vmcs12)
8813{
8814 if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
8815 !nested_cpu_has_apic_reg_virt(vmcs12) &&
8816 !nested_cpu_has_vid(vmcs12) &&
8817 !nested_cpu_has_posted_intr(vmcs12))
8818 return 0;
8819
8820 /*
8821 * If virtualize x2apic mode is enabled,
8822 * virtualize apic access must be disabled.
8823 */
8824 if (nested_cpu_has_virt_x2apic_mode(vmcs12) &&
8825 nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
8826 return -EINVAL;
8827
8828 /*
8829 * If virtual interrupt delivery is enabled,
8830 * we must exit on external interrupts.
8831 */
8832 if (nested_cpu_has_vid(vmcs12) &&
8833 !nested_exit_on_intr(vcpu))
8834 return -EINVAL;
8835
8836 /*
 8837	 * Bits 15:8 must be zero in posted_intr_nv; the descriptor
 8838	 * address has already been checked in
 8839	 * nested_get_vmcs12_pages.
8840 */
8841 if (nested_cpu_has_posted_intr(vmcs12) &&
8842 (!nested_cpu_has_vid(vmcs12) ||
8843 !nested_exit_intr_ack_set(vcpu) ||
8844 vmcs12->posted_intr_nv & 0xff00))
8845 return -EINVAL;
8846
8847 /* tpr shadow is needed by all apicv features. */
8848 if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
8849 return -EINVAL;
8850
8851 return 0;
8852}
8853
8854static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
8855 unsigned long count_field,
8856 unsigned long addr_field,
8857 int maxphyaddr)
8858{
8859 u64 count, addr;
8860
8861 if (vmcs12_read_any(vcpu, count_field, &count) ||
8862 vmcs12_read_any(vcpu, addr_field, &addr)) {
8863 WARN_ON(1);
8864 return -EINVAL;
8865 }
8866 if (count == 0)
8867 return 0;
8868 if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr ||
8869 (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr) {
8870 pr_warn_ratelimited(
8871 "nVMX: invalid MSR switch (0x%lx, %d, %llu, 0x%08llx)",
8872 addr_field, maxphyaddr, count, addr);
8873 return -EINVAL;
8874 }
8875 return 0;
8876}
8877
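A standalone restatement of the validity test above, assuming 16-byte entries (the vmx_msr_entry layout) and a hypothetical 36-bit guest physical address width: the switch area must be 16-byte aligned and must end below 2^maxphyaddr.

	#include <stdbool.h>
	#include <stdint.h>

	#define VMX_MSR_ENTRY_SIZE 16	/* index, reserved, value */

	/* Mirror of nested_vmx_check_msr_switch()'s range check. */
	static bool msr_switch_area_ok(uint64_t addr, uint64_t count, int maxphyaddr)
	{
		if (count == 0)
			return true;	/* feature simply unused */
		if (addr & 15)
			return false;	/* must be 16-byte aligned */
		if (addr >> maxphyaddr)
			return false;	/* start beyond guest physical width */
		if ((addr + count * VMX_MSR_ENTRY_SIZE - 1) >> maxphyaddr)
			return false;	/* end beyond guest physical width */
		return true;
	}

	int main(void)
	{
		/* hypothetical: 36-bit maxphyaddr, 4-entry area near the top */
		return msr_switch_area_ok(0xfffffff00ULL, 4, 36) ? 0 : 1;
	}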
8878static int nested_vmx_check_msr_switch_controls(struct kvm_vcpu *vcpu,
8879 struct vmcs12 *vmcs12)
8880{
8881 int maxphyaddr;
8882
8883 if (vmcs12->vm_exit_msr_load_count == 0 &&
8884 vmcs12->vm_exit_msr_store_count == 0 &&
8885 vmcs12->vm_entry_msr_load_count == 0)
8886 return 0; /* Fast path */
8887 maxphyaddr = cpuid_maxphyaddr(vcpu);
8888 if (nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_LOAD_COUNT,
8889 VM_EXIT_MSR_LOAD_ADDR, maxphyaddr) ||
8890 nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_STORE_COUNT,
8891 VM_EXIT_MSR_STORE_ADDR, maxphyaddr) ||
8892 nested_vmx_check_msr_switch(vcpu, VM_ENTRY_MSR_LOAD_COUNT,
8893 VM_ENTRY_MSR_LOAD_ADDR, maxphyaddr))
8894 return -EINVAL;
8895 return 0;
8896}
8897
8898static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
8899 struct vmx_msr_entry *e)
8900{
8901 /* x2APIC MSR accesses are not allowed */
8902 if (apic_x2apic_mode(vcpu->arch.apic) && e->index >> 8 == 0x8)
8903 return -EINVAL;
8904 if (e->index == MSR_IA32_UCODE_WRITE || /* SDM Table 35-2 */
8905 e->index == MSR_IA32_UCODE_REV)
8906 return -EINVAL;
8907 if (e->reserved != 0)
8908 return -EINVAL;
8909 return 0;
8910}
8911
8912static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu,
8913 struct vmx_msr_entry *e)
8914{
8915 if (e->index == MSR_FS_BASE ||
8916 e->index == MSR_GS_BASE ||
8917 e->index == MSR_IA32_SMM_MONITOR_CTL || /* SMM is not supported */
8918 nested_vmx_msr_check_common(vcpu, e))
8919 return -EINVAL;
8920 return 0;
8921}
8922
8923static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu,
8924 struct vmx_msr_entry *e)
8925{
8926 if (e->index == MSR_IA32_SMBASE || /* SMM is not supported */
8927 nested_vmx_msr_check_common(vcpu, e))
8928 return -EINVAL;
8929 return 0;
8930}
8931
8932/*
 8933 * Load the guest's/host's MSRs at nested entry/exit.
 8934 * Returns 0 on success, or the 1-based index of the failing entry.
8935 */
8936static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
8937{
8938 u32 i;
8939 struct vmx_msr_entry e;
8940 struct msr_data msr;
8941
8942 msr.host_initiated = false;
8943 for (i = 0; i < count; i++) {
8944 if (kvm_read_guest(vcpu->kvm, gpa + i * sizeof(e),
8945 &e, sizeof(e))) {
8946 pr_warn_ratelimited(
8947 "%s cannot read MSR entry (%u, 0x%08llx)\n",
8948 __func__, i, gpa + i * sizeof(e));
8949 goto fail;
8950 }
8951 if (nested_vmx_load_msr_check(vcpu, &e)) {
8952 pr_warn_ratelimited(
8953 "%s check failed (%u, 0x%x, 0x%x)\n",
8954 __func__, i, e.index, e.reserved);
8955 goto fail;
8956 }
8957 msr.index = e.index;
8958 msr.data = e.value;
8959 if (kvm_set_msr(vcpu, &msr)) {
8960 pr_warn_ratelimited(
8961 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
8962 __func__, i, e.index, e.value);
8963 goto fail;
8964 }
8965 }
8966 return 0;
8967fail:
8968 return i + 1;
8969}
8970
8971static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
8972{
8973 u32 i;
8974 struct vmx_msr_entry e;
8975
8976 for (i = 0; i < count; i++) {
8977 if (kvm_read_guest(vcpu->kvm,
8978 gpa + i * sizeof(e),
8979 &e, 2 * sizeof(u32))) {
8980 pr_warn_ratelimited(
8981 "%s cannot read MSR entry (%u, 0x%08llx)\n",
8982 __func__, i, gpa + i * sizeof(e));
8983 return -EINVAL;
8984 }
8985 if (nested_vmx_store_msr_check(vcpu, &e)) {
8986 pr_warn_ratelimited(
8987 "%s check failed (%u, 0x%x, 0x%x)\n",
8988 __func__, i, e.index, e.reserved);
8989 return -EINVAL;
8990 }
8991 if (kvm_get_msr(vcpu, e.index, &e.value)) {
8992 pr_warn_ratelimited(
8993 "%s cannot read MSR (%u, 0x%x)\n",
8994 __func__, i, e.index);
8995 return -EINVAL;
8996 }
8997 if (kvm_write_guest(vcpu->kvm,
8998 gpa + i * sizeof(e) +
8999 offsetof(struct vmx_msr_entry, value),
9000 &e.value, sizeof(e.value))) {
9001 pr_warn_ratelimited(
9002 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
9003 __func__, i, e.index, e.value);
9004 return -EINVAL;
9005 }
9006 }
9007 return 0;
9008}
9009
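Both helpers walk the architectural VM-entry/VM-exit MSR load/store area in guest memory: an array of 16-byte entries (index, reserved, value). A small sketch of the addressing the loops above rely on, with a hypothetical GPA and entry number:

	#include <stddef.h>
	#include <stdint.h>
	#include <stdio.h>

	/* One entry of the MSR load/store area, 16 bytes
	 * (mirrors struct vmx_msr_entry in the kernel headers). */
	struct msr_area_entry {
		uint32_t index;
		uint32_t reserved;	/* must be zero */
		uint64_t value;
	};

	int main(void)
	{
		/* hypothetical: area at GPA 0x1000, third entry (i == 2) */
		uint64_t area_gpa = 0x1000;
		unsigned int i = 2;

		/* where the load/store loops read the whole entry from ... */
		uint64_t entry_gpa = area_gpa + i * sizeof(struct msr_area_entry);
		/* ... and where nested_vmx_store_msr() writes back only the value */
		uint64_t value_gpa = entry_gpa + offsetof(struct msr_area_entry, value);

		printf("entry %u at 0x%llx, value field at 0x%llx\n", i,
		       (unsigned long long)entry_gpa, (unsigned long long)value_gpa);
		return 0;
	}

The "entry index for failure" convention (i + 1, so 0 still means success) matches what nested_vmx_entry_failure() then reports to L1 as the exit qualification.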
8289/* 9010/*
8290 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested 9011 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
8291 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it 9012 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
@@ -8365,8 +9086,23 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
8365 9086
8366 exec_control = vmcs12->pin_based_vm_exec_control; 9087 exec_control = vmcs12->pin_based_vm_exec_control;
8367 exec_control |= vmcs_config.pin_based_exec_ctrl; 9088 exec_control |= vmcs_config.pin_based_exec_ctrl;
8368 exec_control &= ~(PIN_BASED_VMX_PREEMPTION_TIMER | 9089 exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
8369 PIN_BASED_POSTED_INTR); 9090
9091 if (nested_cpu_has_posted_intr(vmcs12)) {
9092 /*
9093 * Note that we use L0's vector here and in
9094 * vmx_deliver_nested_posted_interrupt.
9095 */
9096 vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
9097 vmx->nested.pi_pending = false;
9098 vmcs_write64(POSTED_INTR_NV, POSTED_INTR_VECTOR);
9099 vmcs_write64(POSTED_INTR_DESC_ADDR,
9100 page_to_phys(vmx->nested.pi_desc_page) +
9101 (unsigned long)(vmcs12->posted_intr_desc_addr &
9102 (PAGE_SIZE - 1)));
9103 } else
9104 exec_control &= ~PIN_BASED_POSTED_INTR;
9105
8370 vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control); 9106 vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control);
8371 9107
8372 vmx->nested.preemption_timer_expired = false; 9108 vmx->nested.preemption_timer_expired = false;
@@ -8423,12 +9159,26 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
8423 else 9159 else
8424 vmcs_write64(APIC_ACCESS_ADDR, 9160 vmcs_write64(APIC_ACCESS_ADDR,
8425 page_to_phys(vmx->nested.apic_access_page)); 9161 page_to_phys(vmx->nested.apic_access_page));
8426 } else if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) { 9162 } else if (!(nested_cpu_has_virt_x2apic_mode(vmcs12)) &&
9163 (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))) {
8427 exec_control |= 9164 exec_control |=
8428 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 9165 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
8429 kvm_vcpu_reload_apic_access_page(vcpu); 9166 kvm_vcpu_reload_apic_access_page(vcpu);
8430 } 9167 }
8431 9168
9169 if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) {
9170 vmcs_write64(EOI_EXIT_BITMAP0,
9171 vmcs12->eoi_exit_bitmap0);
9172 vmcs_write64(EOI_EXIT_BITMAP1,
9173 vmcs12->eoi_exit_bitmap1);
9174 vmcs_write64(EOI_EXIT_BITMAP2,
9175 vmcs12->eoi_exit_bitmap2);
9176 vmcs_write64(EOI_EXIT_BITMAP3,
9177 vmcs12->eoi_exit_bitmap3);
9178 vmcs_write16(GUEST_INTR_STATUS,
9179 vmcs12->guest_intr_status);
9180 }
9181
8432 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); 9182 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
8433 } 9183 }
8434 9184
@@ -8462,11 +9212,17 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
8462 vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); 9212 vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
8463 } 9213 }
8464 9214
9215 if (cpu_has_vmx_msr_bitmap() &&
9216 exec_control & CPU_BASED_USE_MSR_BITMAPS &&
9217 nested_vmx_merge_msr_bitmap(vcpu, vmcs12)) {
9218 vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_nested));
9219 } else
9220 exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
9221
8465 /* 9222 /*
8466 * Merging of IO and MSR bitmaps not currently supported. 9223 * Merging of IO bitmap not currently supported.
8467 * Rather, exit every time. 9224 * Rather, exit every time.
8468 */ 9225 */
8469 exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
8470 exec_control &= ~CPU_BASED_USE_IO_BITMAPS; 9226 exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
8471 exec_control |= CPU_BASED_UNCOND_IO_EXITING; 9227 exec_control |= CPU_BASED_UNCOND_IO_EXITING;
8472 9228
@@ -8582,6 +9338,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
8582 int cpu; 9338 int cpu;
8583 struct loaded_vmcs *vmcs02; 9339 struct loaded_vmcs *vmcs02;
8584 bool ia32e; 9340 bool ia32e;
9341 u32 msr_entry_idx;
8585 9342
8586 if (!nested_vmx_check_permission(vcpu) || 9343 if (!nested_vmx_check_permission(vcpu) ||
8587 !nested_vmx_check_vmcs12(vcpu)) 9344 !nested_vmx_check_vmcs12(vcpu))
@@ -8616,41 +9373,42 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
8616 return 1; 9373 return 1;
8617 } 9374 }
8618 9375
8619 if ((vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_MSR_BITMAPS) && 9376 if (!nested_get_vmcs12_pages(vcpu, vmcs12)) {
8620 !PAGE_ALIGNED(vmcs12->msr_bitmap)) {
8621 /*TODO: Also verify bits beyond physical address width are 0*/ 9377 /*TODO: Also verify bits beyond physical address width are 0*/
8622 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 9378 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
8623 return 1; 9379 return 1;
8624 } 9380 }
8625 9381
8626 if (!nested_get_vmcs12_pages(vcpu, vmcs12)) { 9382 if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12)) {
8627 /*TODO: Also verify bits beyond physical address width are 0*/
8628 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 9383 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
8629 return 1; 9384 return 1;
8630 } 9385 }
8631 9386
8632 if (vmcs12->vm_entry_msr_load_count > 0 || 9387 if (nested_vmx_check_apicv_controls(vcpu, vmcs12)) {
8633 vmcs12->vm_exit_msr_load_count > 0 || 9388 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
8634 vmcs12->vm_exit_msr_store_count > 0) { 9389 return 1;
8635 pr_warn_ratelimited("%s: VMCS MSR_{LOAD,STORE} unsupported\n", 9390 }
8636 __func__); 9391
9392 if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12)) {
8637 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 9393 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
8638 return 1; 9394 return 1;
8639 } 9395 }
8640 9396
8641 if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, 9397 if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
8642 nested_vmx_true_procbased_ctls_low, 9398 vmx->nested.nested_vmx_true_procbased_ctls_low,
8643 nested_vmx_procbased_ctls_high) || 9399 vmx->nested.nested_vmx_procbased_ctls_high) ||
8644 !vmx_control_verify(vmcs12->secondary_vm_exec_control, 9400 !vmx_control_verify(vmcs12->secondary_vm_exec_control,
8645 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high) || 9401 vmx->nested.nested_vmx_secondary_ctls_low,
9402 vmx->nested.nested_vmx_secondary_ctls_high) ||
8646 !vmx_control_verify(vmcs12->pin_based_vm_exec_control, 9403 !vmx_control_verify(vmcs12->pin_based_vm_exec_control,
8647 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high) || 9404 vmx->nested.nested_vmx_pinbased_ctls_low,
9405 vmx->nested.nested_vmx_pinbased_ctls_high) ||
8648 !vmx_control_verify(vmcs12->vm_exit_controls, 9406 !vmx_control_verify(vmcs12->vm_exit_controls,
8649 nested_vmx_true_exit_ctls_low, 9407 vmx->nested.nested_vmx_true_exit_ctls_low,
8650 nested_vmx_exit_ctls_high) || 9408 vmx->nested.nested_vmx_exit_ctls_high) ||
8651 !vmx_control_verify(vmcs12->vm_entry_controls, 9409 !vmx_control_verify(vmcs12->vm_entry_controls,
8652 nested_vmx_true_entry_ctls_low, 9410 vmx->nested.nested_vmx_true_entry_ctls_low,
8653 nested_vmx_entry_ctls_high)) 9411 vmx->nested.nested_vmx_entry_ctls_high))
8654 { 9412 {
8655 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 9413 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
8656 return 1; 9414 return 1;
@@ -8663,7 +9421,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
8663 return 1; 9421 return 1;
8664 } 9422 }
8665 9423
8666 if (!nested_cr0_valid(vmcs12, vmcs12->guest_cr0) || 9424 if (!nested_cr0_valid(vcpu, vmcs12->guest_cr0) ||
8667 ((vmcs12->guest_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) { 9425 ((vmcs12->guest_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) {
8668 nested_vmx_entry_failure(vcpu, vmcs12, 9426 nested_vmx_entry_failure(vcpu, vmcs12,
8669 EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT); 9427 EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT);
@@ -8739,10 +9497,21 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
8739 9497
8740 vmx_segment_cache_clear(vmx); 9498 vmx_segment_cache_clear(vmx);
8741 9499
8742 vmcs12->launch_state = 1;
8743
8744 prepare_vmcs02(vcpu, vmcs12); 9500 prepare_vmcs02(vcpu, vmcs12);
8745 9501
9502 msr_entry_idx = nested_vmx_load_msr(vcpu,
9503 vmcs12->vm_entry_msr_load_addr,
9504 vmcs12->vm_entry_msr_load_count);
9505 if (msr_entry_idx) {
9506 leave_guest_mode(vcpu);
9507 vmx_load_vmcs01(vcpu);
9508 nested_vmx_entry_failure(vcpu, vmcs12,
9509 EXIT_REASON_MSR_LOAD_FAIL, msr_entry_idx);
9510 return 1;
9511 }
9512
9513 vmcs12->launch_state = 1;
9514
8746 if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) 9515 if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT)
8747 return kvm_emulate_halt(vcpu); 9516 return kvm_emulate_halt(vcpu);
8748 9517
@@ -8869,9 +9638,10 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr)
8869 if (vmx->nested.nested_run_pending) 9638 if (vmx->nested.nested_run_pending)
8870 return -EBUSY; 9639 return -EBUSY;
8871 nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); 9640 nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
9641 return 0;
8872 } 9642 }
8873 9643
8874 return 0; 9644 return vmx_complete_nested_posted_interrupt(vcpu);
8875} 9645}
8876 9646
8877static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) 9647static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
@@ -8981,6 +9751,9 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
8981 vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); 9751 vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
8982 } 9752 }
8983 9753
9754 if (nested_cpu_has_vid(vmcs12))
9755 vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS);
9756
8984 vmcs12->vm_entry_controls = 9757 vmcs12->vm_entry_controls =
8985 (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | 9758 (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
8986 (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); 9759 (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);
@@ -9172,6 +9945,13 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
9172 9945
9173 kvm_set_dr(vcpu, 7, 0x400); 9946 kvm_set_dr(vcpu, 7, 0x400);
9174 vmcs_write64(GUEST_IA32_DEBUGCTL, 0); 9947 vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
9948
9949 if (cpu_has_vmx_msr_bitmap())
9950 vmx_set_msr_bitmap(vcpu);
9951
9952 if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
9953 vmcs12->vm_exit_msr_load_count))
9954 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
9175} 9955}
9176 9956
9177/* 9957/*
@@ -9193,6 +9973,10 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
9193 prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, 9973 prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
9194 exit_qualification); 9974 exit_qualification);
9195 9975
9976 if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr,
9977 vmcs12->vm_exit_msr_store_count))
9978 nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL);
9979
9196 vmx_load_vmcs01(vcpu); 9980 vmx_load_vmcs01(vcpu);
9197 9981
9198 if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT) 9982 if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT)
@@ -9235,6 +10019,12 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
9235 nested_release_page(vmx->nested.virtual_apic_page); 10019 nested_release_page(vmx->nested.virtual_apic_page);
9236 vmx->nested.virtual_apic_page = NULL; 10020 vmx->nested.virtual_apic_page = NULL;
9237 } 10021 }
10022 if (vmx->nested.pi_desc_page) {
10023 kunmap(vmx->nested.pi_desc_page);
10024 nested_release_page(vmx->nested.pi_desc_page);
10025 vmx->nested.pi_desc_page = NULL;
10026 vmx->nested.pi_desc = NULL;
10027 }
9238 10028
9239 /* 10029 /*
9240 * We are now running in L2, mmu_notifier will force to reload the 10030 * We are now running in L2, mmu_notifier will force to reload the
@@ -9301,6 +10091,31 @@ static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
9301 shrink_ple_window(vcpu); 10091 shrink_ple_window(vcpu);
9302} 10092}
9303 10093
10094static void vmx_slot_enable_log_dirty(struct kvm *kvm,
10095 struct kvm_memory_slot *slot)
10096{
10097 kvm_mmu_slot_leaf_clear_dirty(kvm, slot);
10098 kvm_mmu_slot_largepage_remove_write_access(kvm, slot);
10099}
10100
10101static void vmx_slot_disable_log_dirty(struct kvm *kvm,
10102 struct kvm_memory_slot *slot)
10103{
10104 kvm_mmu_slot_set_dirty(kvm, slot);
10105}
10106
10107static void vmx_flush_log_dirty(struct kvm *kvm)
10108{
10109 kvm_flush_pml_buffers(kvm);
10110}
10111
10112static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
10113 struct kvm_memory_slot *memslot,
10114 gfn_t offset, unsigned long mask)
10115{
10116 kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask);
10117}
10118
9304static struct kvm_x86_ops vmx_x86_ops = { 10119static struct kvm_x86_ops vmx_x86_ops = {
9305 .cpu_has_kvm_support = cpu_has_kvm_support, 10120 .cpu_has_kvm_support = cpu_has_kvm_support,
9306 .disabled_by_bios = vmx_disabled_by_bios, 10121 .disabled_by_bios = vmx_disabled_by_bios,
@@ -9409,6 +10224,11 @@ static struct kvm_x86_ops vmx_x86_ops = {
9409 .check_nested_events = vmx_check_nested_events, 10224 .check_nested_events = vmx_check_nested_events,
9410 10225
9411 .sched_in = vmx_sched_in, 10226 .sched_in = vmx_sched_in,
10227
10228 .slot_enable_log_dirty = vmx_slot_enable_log_dirty,
10229 .slot_disable_log_dirty = vmx_slot_disable_log_dirty,
10230 .flush_log_dirty = vmx_flush_log_dirty,
10231 .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked,
9412}; 10232};
9413 10233
9414static int __init vmx_init(void) 10234static int __init vmx_init(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c259814200bd..bd7a70be41b3 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -108,6 +108,10 @@ EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);
108static u32 tsc_tolerance_ppm = 250; 108static u32 tsc_tolerance_ppm = 250;
109module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR); 109module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
110 110
111/* lapic timer advance (tscdeadline mode only) in nanoseconds */
112unsigned int lapic_timer_advance_ns = 0;
113module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR);
114
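Since the parameter is declared S_IRUGO | S_IWUSR, it should also be adjustable at run time through /sys/module/kvm/parameters/lapic_timer_advance_ns; the default of 0 presumably keeps the existing behaviour, with the advance only taking effect for guests using the TSC-deadline timer.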
111static bool backwards_tsc_observed = false; 115static bool backwards_tsc_observed = false;
112 116
113#define KVM_NR_SHARED_MSRS 16 117#define KVM_NR_SHARED_MSRS 16
@@ -141,6 +145,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
141 { "irq_window", VCPU_STAT(irq_window_exits) }, 145 { "irq_window", VCPU_STAT(irq_window_exits) },
142 { "nmi_window", VCPU_STAT(nmi_window_exits) }, 146 { "nmi_window", VCPU_STAT(nmi_window_exits) },
143 { "halt_exits", VCPU_STAT(halt_exits) }, 147 { "halt_exits", VCPU_STAT(halt_exits) },
148 { "halt_successful_poll", VCPU_STAT(halt_successful_poll) },
144 { "halt_wakeup", VCPU_STAT(halt_wakeup) }, 149 { "halt_wakeup", VCPU_STAT(halt_wakeup) },
145 { "hypercalls", VCPU_STAT(hypercalls) }, 150 { "hypercalls", VCPU_STAT(hypercalls) },
146 { "request_irq", VCPU_STAT(request_irq_exits) }, 151 { "request_irq", VCPU_STAT(request_irq_exits) },
@@ -492,7 +497,7 @@ int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
492} 497}
493EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu); 498EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu);
494 499
495int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, 500static int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
496 void *data, int offset, int len, u32 access) 501 void *data, int offset, int len, u32 access)
497{ 502{
498 return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn, 503 return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn,
@@ -643,7 +648,7 @@ static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
643 } 648 }
644} 649}
645 650
646int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) 651static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
647{ 652{
648 u64 xcr0 = xcr; 653 u64 xcr0 = xcr;
649 u64 old_xcr0 = vcpu->arch.xcr0; 654 u64 old_xcr0 = vcpu->arch.xcr0;
@@ -1083,6 +1088,15 @@ static void update_pvclock_gtod(struct timekeeper *tk)
1083} 1088}
1084#endif 1089#endif
1085 1090
1091void kvm_set_pending_timer(struct kvm_vcpu *vcpu)
1092{
1093 /*
1094 * Note: KVM_REQ_PENDING_TIMER is implicitly checked in
1095 * vcpu_enter_guest. This function is only called from
1096 * the physical CPU that is running vcpu.
1097 */
1098 kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
1099}
1086 1100
1087static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) 1101static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
1088{ 1102{
@@ -1180,7 +1194,7 @@ static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0);
1180#endif 1194#endif
1181 1195
1182static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); 1196static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
1183unsigned long max_tsc_khz; 1197static unsigned long max_tsc_khz;
1184 1198
1185static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) 1199static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
1186{ 1200{
@@ -1234,7 +1248,7 @@ static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
1234 return tsc; 1248 return tsc;
1235} 1249}
1236 1250
1237void kvm_track_tsc_matching(struct kvm_vcpu *vcpu) 1251static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
1238{ 1252{
1239#ifdef CONFIG_X86_64 1253#ifdef CONFIG_X86_64
1240 bool vcpus_matched; 1254 bool vcpus_matched;
@@ -1529,7 +1543,8 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
1529 &ka->master_cycle_now); 1543 &ka->master_cycle_now);
1530 1544
1531 ka->use_master_clock = host_tsc_clocksource && vcpus_matched 1545 ka->use_master_clock = host_tsc_clocksource && vcpus_matched
1532 && !backwards_tsc_observed; 1546 && !backwards_tsc_observed
1547 && !ka->boot_vcpu_runs_old_kvmclock;
1533 1548
1534 if (ka->use_master_clock) 1549 if (ka->use_master_clock)
1535 atomic_set(&kvm_guest_has_master_clock, 1); 1550 atomic_set(&kvm_guest_has_master_clock, 1);
@@ -2161,8 +2176,20 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2161 case MSR_KVM_SYSTEM_TIME_NEW: 2176 case MSR_KVM_SYSTEM_TIME_NEW:
2162 case MSR_KVM_SYSTEM_TIME: { 2177 case MSR_KVM_SYSTEM_TIME: {
2163 u64 gpa_offset; 2178 u64 gpa_offset;
2179 struct kvm_arch *ka = &vcpu->kvm->arch;
2180
2164 kvmclock_reset(vcpu); 2181 kvmclock_reset(vcpu);
2165 2182
2183 if (vcpu->vcpu_id == 0 && !msr_info->host_initiated) {
2184 bool tmp = (msr == MSR_KVM_SYSTEM_TIME);
2185
2186 if (ka->boot_vcpu_runs_old_kvmclock != tmp)
2187 set_bit(KVM_REQ_MASTERCLOCK_UPDATE,
2188 &vcpu->requests);
2189
2190 ka->boot_vcpu_runs_old_kvmclock = tmp;
2191 }
2192
2166 vcpu->arch.time = data; 2193 vcpu->arch.time = data;
2167 kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu); 2194 kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
2168 2195
@@ -2324,6 +2351,7 @@ int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
2324{ 2351{
2325 return kvm_x86_ops->get_msr(vcpu, msr_index, pdata); 2352 return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
2326} 2353}
2354EXPORT_SYMBOL_GPL(kvm_get_msr);
2327 2355
2328static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) 2356static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
2329{ 2357{
@@ -2738,6 +2766,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
2738 case KVM_CAP_READONLY_MEM: 2766 case KVM_CAP_READONLY_MEM:
2739 case KVM_CAP_HYPERV_TIME: 2767 case KVM_CAP_HYPERV_TIME:
2740 case KVM_CAP_IOAPIC_POLARITY_IGNORED: 2768 case KVM_CAP_IOAPIC_POLARITY_IGNORED:
2769 case KVM_CAP_TSC_DEADLINE_TIMER:
2741#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT 2770#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
2742 case KVM_CAP_ASSIGN_DEV_IRQ: 2771 case KVM_CAP_ASSIGN_DEV_IRQ:
2743 case KVM_CAP_PCI_2_3: 2772 case KVM_CAP_PCI_2_3:
@@ -2776,9 +2805,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
2776 case KVM_CAP_TSC_CONTROL: 2805 case KVM_CAP_TSC_CONTROL:
2777 r = kvm_has_tsc_control; 2806 r = kvm_has_tsc_control;
2778 break; 2807 break;
2779 case KVM_CAP_TSC_DEADLINE_TIMER:
2780 r = boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER);
2781 break;
2782 default: 2808 default:
2783 r = 0; 2809 r = 0;
2784 break; 2810 break;
@@ -3734,83 +3760,43 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm,
3734 * @kvm: kvm instance 3760 * @kvm: kvm instance
3735 * @log: slot id and address to which we copy the log 3761 * @log: slot id and address to which we copy the log
3736 * 3762 *
3737 * We need to keep it in mind that VCPU threads can write to the bitmap 3763 * Steps 1-4 below provide a general overview of dirty page logging. See
3738 * concurrently. So, to avoid losing data, we keep the following order for 3764 * kvm_get_dirty_log_protect() function description for additional details.
3739 * each bit: 3765 *
 3766 * We call kvm_get_dirty_log_protect() to handle steps 1-3; upon return we
 3767 * always flush the TLB (step 4), even if a previous step failed and the
 3768 * dirty bitmap may be corrupt. Regardless of the outcome, the KVM logging
 3769 * API does not preclude a subsequent dirty log read by user space. Flushing
 3770 * the TLB ensures that writes will be marked dirty for the next log read.
3740 * 3771 *
3741 * 1. Take a snapshot of the bit and clear it if needed. 3772 * 1. Take a snapshot of the bit and clear it if needed.
3742 * 2. Write protect the corresponding page. 3773 * 2. Write protect the corresponding page.
3743 * 3. Flush TLB's if needed. 3774 * 3. Copy the snapshot to the userspace.
3744 * 4. Copy the snapshot to the userspace. 3775 * 4. Flush TLB's if needed.
3745 *
3746 * Between 2 and 3, the guest may write to the page using the remaining TLB
3747 * entry. This is not a problem because the page will be reported dirty at
3748 * step 4 using the snapshot taken before and step 3 ensures that successive
3749 * writes will be logged for the next call.
3750 */ 3776 */
3751int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) 3777int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
3752{ 3778{
3753 int r;
3754 struct kvm_memory_slot *memslot;
3755 unsigned long n, i;
3756 unsigned long *dirty_bitmap;
3757 unsigned long *dirty_bitmap_buffer;
3758 bool is_dirty = false; 3779 bool is_dirty = false;
3780 int r;
3759 3781
3760 mutex_lock(&kvm->slots_lock); 3782 mutex_lock(&kvm->slots_lock);
3761 3783
3762 r = -EINVAL; 3784 /*
3763 if (log->slot >= KVM_USER_MEM_SLOTS) 3785 * Flush potentially hardware-cached dirty pages to dirty_bitmap.
3764 goto out; 3786 */
3765 3787 if (kvm_x86_ops->flush_log_dirty)
3766 memslot = id_to_memslot(kvm->memslots, log->slot); 3788 kvm_x86_ops->flush_log_dirty(kvm);
3767
3768 dirty_bitmap = memslot->dirty_bitmap;
3769 r = -ENOENT;
3770 if (!dirty_bitmap)
3771 goto out;
3772
3773 n = kvm_dirty_bitmap_bytes(memslot);
3774
3775 dirty_bitmap_buffer = dirty_bitmap + n / sizeof(long);
3776 memset(dirty_bitmap_buffer, 0, n);
3777
3778 spin_lock(&kvm->mmu_lock);
3779
3780 for (i = 0; i < n / sizeof(long); i++) {
3781 unsigned long mask;
3782 gfn_t offset;
3783
3784 if (!dirty_bitmap[i])
3785 continue;
3786
3787 is_dirty = true;
3788
3789 mask = xchg(&dirty_bitmap[i], 0);
3790 dirty_bitmap_buffer[i] = mask;
3791
3792 offset = i * BITS_PER_LONG;
3793 kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, mask);
3794 }
3795
3796 spin_unlock(&kvm->mmu_lock);
3797 3789
3798 /* See the comments in kvm_mmu_slot_remove_write_access(). */ 3790 r = kvm_get_dirty_log_protect(kvm, log, &is_dirty);
3799 lockdep_assert_held(&kvm->slots_lock);
3800 3791
3801 /* 3792 /*
3802 * All the TLBs can be flushed out of mmu lock, see the comments in 3793 * All the TLBs can be flushed out of mmu lock, see the comments in
3803 * kvm_mmu_slot_remove_write_access(). 3794 * kvm_mmu_slot_remove_write_access().
3804 */ 3795 */
3796 lockdep_assert_held(&kvm->slots_lock);
3805 if (is_dirty) 3797 if (is_dirty)
3806 kvm_flush_remote_tlbs(kvm); 3798 kvm_flush_remote_tlbs(kvm);
3807 3799
3808 r = -EFAULT;
3809 if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
3810 goto out;
3811
3812 r = 0;
3813out:
3814 mutex_unlock(&kvm->slots_lock); 3800 mutex_unlock(&kvm->slots_lock);
3815 return r; 3801 return r;
3816} 3802}
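For context, the path above is what a VMM exercises when it enables dirty logging on a memory slot and then calls KVM_GET_DIRTY_LOG. A hedged userspace sketch using the standard KVM ioctls (vm_fd is assumed to be an initialized VM descriptor and guest_mem an already-mapped backing region; error handling trimmed):

	#include <linux/kvm.h>
	#include <string.h>
	#include <sys/ioctl.h>

	/* Enable dirty logging on slot 0, then fetch the dirty bitmap once.
	 * With PML (this series) the kernel flushes each vcpu's PML buffer
	 * into dirty_bitmap before the copy; without it, pages are
	 * write-protected and faults mark them dirty. */
	int fetch_dirty_log(int vm_fd, void *guest_mem, size_t mem_size,
			    unsigned long *bitmap, size_t bitmap_bytes)
	{
		struct kvm_userspace_memory_region region = {
			.slot = 0,
			.flags = KVM_MEM_LOG_DIRTY_PAGES,
			.guest_phys_addr = 0,
			.memory_size = mem_size,
			.userspace_addr = (unsigned long)guest_mem,
		};
		struct kvm_dirty_log log = {
			.slot = 0,
			.dirty_bitmap = bitmap,
		};

		if (ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region) < 0)
			return -1;

		memset(bitmap, 0, bitmap_bytes);
		return ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log);
	}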
@@ -4516,6 +4502,8 @@ int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr,
4516 if (rc != X86EMUL_CONTINUE) 4502 if (rc != X86EMUL_CONTINUE)
4517 return rc; 4503 return rc;
4518 addr += now; 4504 addr += now;
4505 if (ctxt->mode != X86EMUL_MODE_PROT64)
4506 addr = (u32)addr;
4519 val += now; 4507 val += now;
4520 bytes -= now; 4508 bytes -= now;
4521 } 4509 }
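The (u32) truncation makes split accesses wrap at 4 GiB when the vcpu is not in 64-bit mode, instead of letting the second half of the access run past the 32-bit address space. A tiny illustration with a made-up address:

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint64_t addr = 0xfffffffeULL;	/* hypothetical linear address */
		unsigned int now = 2;		/* bytes handled in the first chunk */

		addr += now;			/* 0x100000000 ... */
		addr = (uint32_t)addr;		/* ... wraps to 0 outside 64-bit mode */

		printf("next chunk starts at 0x%llx\n", (unsigned long long)addr);
		return 0;
	}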
@@ -4984,6 +4972,11 @@ static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg, ulon
4984 kvm_register_write(emul_to_vcpu(ctxt), reg, val); 4972 kvm_register_write(emul_to_vcpu(ctxt), reg, val);
4985} 4973}
4986 4974
4975static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked)
4976{
4977 kvm_x86_ops->set_nmi_mask(emul_to_vcpu(ctxt), masked);
4978}
4979
4987static const struct x86_emulate_ops emulate_ops = { 4980static const struct x86_emulate_ops emulate_ops = {
4988 .read_gpr = emulator_read_gpr, 4981 .read_gpr = emulator_read_gpr,
4989 .write_gpr = emulator_write_gpr, 4982 .write_gpr = emulator_write_gpr,
@@ -5019,6 +5012,7 @@ static const struct x86_emulate_ops emulate_ops = {
5019 .put_fpu = emulator_put_fpu, 5012 .put_fpu = emulator_put_fpu,
5020 .intercept = emulator_intercept, 5013 .intercept = emulator_intercept,
5021 .get_cpuid = emulator_get_cpuid, 5014 .get_cpuid = emulator_get_cpuid,
5015 .set_nmi_mask = emulator_set_nmi_mask,
5022}; 5016};
5023 5017
5024static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) 5018static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
@@ -6311,6 +6305,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
6311 } 6305 }
6312 6306
6313 trace_kvm_entry(vcpu->vcpu_id); 6307 trace_kvm_entry(vcpu->vcpu_id);
6308 wait_lapic_expire(vcpu);
6314 kvm_x86_ops->run(vcpu); 6309 kvm_x86_ops->run(vcpu);
6315 6310
6316 /* 6311 /*
@@ -7041,15 +7036,13 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
7041 return r; 7036 return r;
7042} 7037}
7043 7038
7044int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) 7039void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
7045{ 7040{
7046 int r;
7047 struct msr_data msr; 7041 struct msr_data msr;
7048 struct kvm *kvm = vcpu->kvm; 7042 struct kvm *kvm = vcpu->kvm;
7049 7043
7050 r = vcpu_load(vcpu); 7044 if (vcpu_load(vcpu))
7051 if (r) 7045 return;
7052 return r;
7053 msr.data = 0x0; 7046 msr.data = 0x0;
7054 msr.index = MSR_IA32_TSC; 7047 msr.index = MSR_IA32_TSC;
7055 msr.host_initiated = true; 7048 msr.host_initiated = true;
@@ -7058,8 +7051,6 @@ int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
7058 7051
7059 schedule_delayed_work(&kvm->arch.kvmclock_sync_work, 7052 schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
7060 KVMCLOCK_SYNC_PERIOD); 7053 KVMCLOCK_SYNC_PERIOD);
7061
7062 return r;
7063} 7054}
7064 7055
7065void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) 7056void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
@@ -7549,12 +7540,62 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
7549 return 0; 7540 return 0;
7550} 7541}
7551 7542
7543static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
7544 struct kvm_memory_slot *new)
7545{
7546 /* Still write protect RO slot */
7547 if (new->flags & KVM_MEM_READONLY) {
7548 kvm_mmu_slot_remove_write_access(kvm, new);
7549 return;
7550 }
7551
7552 /*
7553 * Call kvm_x86_ops dirty logging hooks when they are valid.
7554 *
7555 * kvm_x86_ops->slot_disable_log_dirty is called when:
7556 *
 7557	 *  - KVM_MR_CREATE with dirty logging disabled
 7558	 *  - KVM_MR_FLAGS_ONLY with dirty logging disabled in the new flags
7559 *
 7560	 * The reason is that, with PML, we need to set the D-bit for any slot
 7561	 * with dirty logging disabled in order to eliminate unnecessary GPA
 7562	 * logging in the PML buffer (and potential PML-buffer-full VMEXITs).
 7563	 * This guarantees that leaving PML enabled for the guest's lifetime
 7564	 * adds no additional overhead while the guest is running with dirty
 7565	 * logging disabled for its memory slots.
7566 *
 7567	 * kvm_x86_ops->slot_enable_log_dirty is called when switching a new
 7568	 * slot into dirty logging mode.
 7569	 *
 7570	 * If the kvm_x86_ops dirty logging hooks are not set, fall back to write protection.
7571 *
7572 * In case of write protect:
7573 *
7574 * Write protect all pages for dirty logging.
7575 *
7576 * All the sptes including the large sptes which point to this
7577 * slot are set to readonly. We can not create any new large
7578 * spte on this slot until the end of the logging.
7579 *
7580 * See the comments in fast_page_fault().
7581 */
7582 if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
7583 if (kvm_x86_ops->slot_enable_log_dirty)
7584 kvm_x86_ops->slot_enable_log_dirty(kvm, new);
7585 else
7586 kvm_mmu_slot_remove_write_access(kvm, new);
7587 } else {
7588 if (kvm_x86_ops->slot_disable_log_dirty)
7589 kvm_x86_ops->slot_disable_log_dirty(kvm, new);
7590 }
7591}
7592
7552void kvm_arch_commit_memory_region(struct kvm *kvm, 7593void kvm_arch_commit_memory_region(struct kvm *kvm,
7553 struct kvm_userspace_memory_region *mem, 7594 struct kvm_userspace_memory_region *mem,
7554 const struct kvm_memory_slot *old, 7595 const struct kvm_memory_slot *old,
7555 enum kvm_mr_change change) 7596 enum kvm_mr_change change)
7556{ 7597{
7557 7598 struct kvm_memory_slot *new;
7558 int nr_mmu_pages = 0; 7599 int nr_mmu_pages = 0;
7559 7600
7560 if ((mem->slot >= KVM_USER_MEM_SLOTS) && (change == KVM_MR_DELETE)) { 7601 if ((mem->slot >= KVM_USER_MEM_SLOTS) && (change == KVM_MR_DELETE)) {
@@ -7573,17 +7614,20 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
7573 7614
7574 if (nr_mmu_pages) 7615 if (nr_mmu_pages)
7575 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); 7616 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
7617
7618 /* It's OK to get 'new' slot here as it has already been installed */
7619 new = id_to_memslot(kvm->memslots, mem->slot);
7620
7576 /* 7621 /*
7577 * Write protect all pages for dirty logging. 7622 * Set up write protection and/or dirty logging for the new slot.
7578 * 7623 *
7579 * All the sptes including the large sptes which point to this 7624 * For KVM_MR_DELETE and KVM_MR_MOVE, the shadow pages of old slot have
7580 * slot are set to readonly. We can not create any new large 7625 * been zapped so no dirty logging is needed for the old slot. For
7581 * spte on this slot until the end of the logging. 7626 * KVM_MR_FLAGS_ONLY, the old slot is essentially the same one as the
7582 * 7627 * new and it's also covered when dealing with the new slot.
7583 * See the comments in fast_page_fault().
7584 */ 7628 */
7585 if ((change != KVM_MR_DELETE) && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES)) 7629 if (change != KVM_MR_DELETE)
7586 kvm_mmu_slot_remove_write_access(kvm, mem->slot); 7630 kvm_mmu_slot_apply_flags(kvm, new);
7587} 7631}
7588 7632
7589void kvm_arch_flush_shadow_all(struct kvm *kvm) 7633void kvm_arch_flush_shadow_all(struct kvm *kvm)
@@ -7837,3 +7881,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
7837EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts); 7881EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
7838EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset); 7882EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset);
7839EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window); 7883EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window);
7884EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index cc1d61af6140..f5fef1868096 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -147,6 +147,7 @@ static inline void kvm_register_writel(struct kvm_vcpu *vcpu,
147 147
148void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); 148void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
149void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); 149void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
150void kvm_set_pending_timer(struct kvm_vcpu *vcpu);
150int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); 151int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
151 152
152void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr); 153void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr);
@@ -170,5 +171,7 @@ extern u64 kvm_supported_xcr0(void);
170 171
171extern unsigned int min_timer_period_us; 172extern unsigned int min_timer_period_us;
172 173
174extern unsigned int lapic_timer_advance_ns;
175
173extern struct static_key kvm_no_apic_vcpu; 176extern struct static_key kvm_no_apic_vcpu;
174#endif 177#endif