author     Linus Torvalds <torvalds@linux-foundation.org>   2012-03-28 17:35:31 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2012-03-28 17:35:31 -0400
commit     2e7580b0e75d771d93e24e681031a165b1d31071 (patch)
tree       d9449702609eeaab28913a43b5a4434667e09d43 /arch/x86/kvm
parent     d25413efa9536e2f425ea45c7720598035c597bc (diff)
parent     cf9eeac46350b8b43730b7dc5e999757bed089a4 (diff)
Merge branch 'kvm-updates/3.4' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull kvm updates from Avi Kivity:
 "Changes include timekeeping improvements, support for assigning host
  PCI devices that share interrupt lines, s390 user-controlled guests, a
  large ppc update, and random fixes."

This is with the sign-off's fixed, hopefully next merge window we won't
have rebased commits.

* 'kvm-updates/3.4' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (130 commits)
  KVM: Convert intx_mask_lock to spin lock
  KVM: x86: fix kvm_write_tsc() TSC matching thinko
  x86: kvmclock: abstract save/restore sched_clock_state
  KVM: nVMX: Fix erroneous exception bitmap check
  KVM: Ignore the writes to MSR_K7_HWCR(3)
  KVM: MMU: make use of ->root_level in reset_rsvds_bits_mask
  KVM: PMU: add proper support for fixed counter 2
  KVM: PMU: Fix raw event check
  KVM: PMU: warn when pin control is set in eventsel msr
  KVM: VMX: Fix delayed load of shared MSRs
  KVM: use correct tlbs dirty type in cmpxchg
  KVM: Allow host IRQ sharing for assigned PCI 2.3 devices
  KVM: Ensure all vcpus are consistent with in-kernel irqchip settings
  KVM: x86 emulator: Allow PM/VM86 switch during task switch
  KVM: SVM: Fix CPL updates
  KVM: x86 emulator: VM86 segments must have DPL 3
  KVM: x86 emulator: Fix task switch privilege checks
  arch/powerpc/kvm/book3s_hv.c: included linux/sched.h twice
  KVM: x86 emulator: correctly mask pmc index bits in RDPMC instruction emulation
  KVM: mmu_notifier: Flush TLBs before releasing mmu_lock
  ...
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r--   arch/x86/kvm/cpuid.c     |   2
-rw-r--r--   arch/x86/kvm/cpuid.h     |   8
-rw-r--r--   arch/x86/kvm/emulate.c   | 112
-rw-r--r--   arch/x86/kvm/i8259.c     |   1
-rw-r--r--   arch/x86/kvm/lapic.c     |   4
-rw-r--r--   arch/x86/kvm/mmu.c       |  85
-rw-r--r--   arch/x86/kvm/mmu_audit.c |   4
-rw-r--r--   arch/x86/kvm/pmu.c       |  10
-rw-r--r--   arch/x86/kvm/svm.c       | 119
-rw-r--r--   arch/x86/kvm/vmx.c       |  53
-rw-r--r--   arch/x86/kvm/x86.c       | 403
11 files changed, 595 insertions, 206 deletions
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 89b02bfaaca5..9fed5bedaad6 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -236,7 +236,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 const u32 kvm_supported_word6_x86_features =
 F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
 F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
-F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) |
+F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) |
 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM);
 
 /* cpuid 0xC0000001.edx */
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 5b97e1797a6d..26d1fb437eb5 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -43,4 +43,12 @@ static inline bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu)
 return best && (best->ebx & bit(X86_FEATURE_FSGSBASE));
 }
 
+static inline bool guest_cpuid_has_osvw(struct kvm_vcpu *vcpu)
+{
+struct kvm_cpuid_entry2 *best;
+
+best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
+return best && (best->ecx & bit(X86_FEATURE_OSVW));
+}
+
 #endif
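
The guest_cpuid_has_osvw() helper added above lets KVM expose AMD's OSVW (OS Visible Workaround) MSRs only to guests whose CPUID leaf 0x80000001 ECX advertises the feature. The actual call sites live outside this diffstat-limited view, so the following is only an illustrative sketch of such a gate, reusing the vcpu->arch.osvw fields that the svm.c hunks further down populate; the function name is hypothetical:

    /* Illustrative sketch only - not the actual x86.c MSR handler. */
    static int example_get_osvw_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
    {
        if (!guest_cpuid_has_osvw(vcpu))
            return 1;   /* feature not exposed to this guest -> fault */

        switch (msr) {
        case MSR_AMD64_OSVW_ID_LENGTH:
            *data = vcpu->arch.osvw.length;
            return 0;
        case MSR_AMD64_OSVW_STATUS:
            *data = vcpu->arch.osvw.status;
            return 0;
        default:
            return 1;
        }
    }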
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 0982507b962a..83756223f8aa 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -57,6 +57,7 @@
 #define OpDS 23ull /* DS */
 #define OpFS 24ull /* FS */
 #define OpGS 25ull /* GS */
+#define OpMem8 26ull /* 8-bit zero extended memory operand */
 
 #define OpBits 5 /* Width of operand field */
 #define OpMask ((1ull << OpBits) - 1)
@@ -101,6 +102,7 @@
 #define SrcAcc (OpAcc << SrcShift)
 #define SrcImmU16 (OpImmU16 << SrcShift)
 #define SrcDX (OpDX << SrcShift)
+#define SrcMem8 (OpMem8 << SrcShift)
 #define SrcMask (OpMask << SrcShift)
 #define BitOp (1<<11)
 #define MemAbs (1<<12) /* Memory operand is absolute displacement */
@@ -858,8 +860,7 @@ static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data,
 }
 
 static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
-struct operand *op,
-int inhibit_bytereg)
+struct operand *op)
 {
 unsigned reg = ctxt->modrm_reg;
 int highbyte_regs = ctxt->rex_prefix == 0;
@@ -876,7 +877,7 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
 }
 
 op->type = OP_REG;
-if ((ctxt->d & ByteOp) && !inhibit_bytereg) {
+if (ctxt->d & ByteOp) {
 op->addr.reg = decode_register(reg, ctxt->regs, highbyte_regs);
 op->bytes = 1;
 } else {
@@ -1151,6 +1152,22 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
 return 1;
 }
 
+static int read_interrupt_descriptor(struct x86_emulate_ctxt *ctxt,
+u16 index, struct desc_struct *desc)
+{
+struct desc_ptr dt;
+ulong addr;
+
+ctxt->ops->get_idt(ctxt, &dt);
+
+if (dt.size < index * 8 + 7)
+return emulate_gp(ctxt, index << 3 | 0x2);
+
+addr = dt.address + index * 8;
+return ctxt->ops->read_std(ctxt, addr, desc, sizeof *desc,
+&ctxt->exception);
+}
+
 static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
 u16 selector, struct desc_ptr *dt)
 {
@@ -1227,6 +1244,8 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 seg_desc.type = 3;
 seg_desc.p = 1;
 seg_desc.s = 1;
+if (ctxt->mode == X86EMUL_MODE_VM86)
+seg_desc.dpl = 3;
 goto load;
 }
 
@@ -1891,6 +1910,17 @@ setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
 ss->p = 1;
 }
 
+static bool vendor_intel(struct x86_emulate_ctxt *ctxt)
+{
+u32 eax, ebx, ecx, edx;
+
+eax = ecx = 0;
+return ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx)
+&& ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx
+&& ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx
+&& edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx;
+}
+
 static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt)
 {
 struct x86_emulate_ops *ops = ctxt->ops;
@@ -2007,6 +2037,14 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt)
 if (ctxt->mode == X86EMUL_MODE_REAL)
 return emulate_gp(ctxt, 0);
 
+/*
+ * Not recognized on AMD in compat mode (but is recognized in legacy
+ * mode).
+ */
+if ((ctxt->mode == X86EMUL_MODE_PROT32) && (efer & EFER_LMA)
+&& !vendor_intel(ctxt))
+return emulate_ud(ctxt);
+
 /* XXX sysenter/sysexit have not been tested in 64bit mode.
 * Therefore, we inject an #UD.
 */
@@ -2306,6 +2344,8 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
 return emulate_gp(ctxt, 0);
 ctxt->_eip = tss->eip;
 ctxt->eflags = tss->eflags | 2;
+
+/* General purpose registers */
 ctxt->regs[VCPU_REGS_RAX] = tss->eax;
 ctxt->regs[VCPU_REGS_RCX] = tss->ecx;
 ctxt->regs[VCPU_REGS_RDX] = tss->edx;
@@ -2328,6 +2368,24 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
 set_segment_selector(ctxt, tss->gs, VCPU_SREG_GS);
 
 /*
+ * If we're switching between Protected Mode and VM86, we need to make
+ * sure to update the mode before loading the segment descriptors so
+ * that the selectors are interpreted correctly.
+ *
+ * Need to get rflags to the vcpu struct immediately because it
+ * influences the CPL which is checked at least when loading the segment
+ * descriptors and when pushing an error code to the new kernel stack.
+ *
+ * TODO Introduce a separate ctxt->ops->set_cpl callback
+ */
+if (ctxt->eflags & X86_EFLAGS_VM)
+ctxt->mode = X86EMUL_MODE_VM86;
+else
+ctxt->mode = X86EMUL_MODE_PROT32;
+
+ctxt->ops->set_rflags(ctxt, ctxt->eflags);
+
+/*
 * Now load segment descriptors. If fault happenes at this stage
 * it is handled in a context of new task
 */
@@ -2401,7 +2459,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
 }
 
 static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
-u16 tss_selector, int reason,
+u16 tss_selector, int idt_index, int reason,
 bool has_error_code, u32 error_code)
 {
 struct x86_emulate_ops *ops = ctxt->ops;
@@ -2423,12 +2481,35 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
 
 /* FIXME: check that next_tss_desc is tss */
 
-if (reason != TASK_SWITCH_IRET) {
-if ((tss_selector & 3) > next_tss_desc.dpl ||
-ops->cpl(ctxt) > next_tss_desc.dpl)
-return emulate_gp(ctxt, 0);
+/*
+ * Check privileges. The three cases are task switch caused by...
+ *
+ * 1. jmp/call/int to task gate: Check against DPL of the task gate
+ * 2. Exception/IRQ/iret: No check is performed
+ * 3. jmp/call to TSS: Check agains DPL of the TSS
+ */
+if (reason == TASK_SWITCH_GATE) {
+if (idt_index != -1) {
+/* Software interrupts */
+struct desc_struct task_gate_desc;
+int dpl;
+
+ret = read_interrupt_descriptor(ctxt, idt_index,
+&task_gate_desc);
+if (ret != X86EMUL_CONTINUE)
+return ret;
+
+dpl = task_gate_desc.dpl;
+if ((tss_selector & 3) > dpl || ops->cpl(ctxt) > dpl)
+return emulate_gp(ctxt, (idt_index << 3) | 0x2);
+}
+} else if (reason != TASK_SWITCH_IRET) {
+int dpl = next_tss_desc.dpl;
+if ((tss_selector & 3) > dpl || ops->cpl(ctxt) > dpl)
+return emulate_gp(ctxt, tss_selector);
 }
 
+
 desc_limit = desc_limit_scaled(&next_tss_desc);
 if (!next_tss_desc.p ||
 ((desc_limit < 0x67 && (next_tss_desc.type & 8)) ||
@@ -2481,7 +2562,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
 }
 
 int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
-u16 tss_selector, int reason,
+u16 tss_selector, int idt_index, int reason,
 bool has_error_code, u32 error_code)
 {
 int rc;
@@ -2489,7 +2570,7 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
 ctxt->_eip = ctxt->eip;
 ctxt->dst.type = OP_NONE;
 
-rc = emulator_do_task_switch(ctxt, tss_selector, reason,
+rc = emulator_do_task_switch(ctxt, tss_selector, idt_index, reason,
 has_error_code, error_code);
 
 if (rc == X86EMUL_CONTINUE)
@@ -3514,13 +3595,13 @@ static struct opcode twobyte_table[256] = {
 I(DstMem | SrcReg | ModRM | BitOp | Lock, em_btr),
 I(DstReg | SrcMemFAddr | ModRM | Src2FS, em_lseg),
 I(DstReg | SrcMemFAddr | ModRM | Src2GS, em_lseg),
-D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
+D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
 /* 0xB8 - 0xBF */
 N, N,
 G(BitOp, group8),
 I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc),
 I(DstReg | SrcMem | ModRM, em_bsf), I(DstReg | SrcMem | ModRM, em_bsr),
-D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
+D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
 /* 0xC0 - 0xCF */
 D2bv(DstMem | SrcReg | ModRM | Lock),
 N, D(DstMem | SrcReg | ModRM | Mov),
@@ -3602,9 +3683,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
 
 switch (d) {
 case OpReg:
-decode_register_operand(ctxt, op,
-op == &ctxt->dst &&
-ctxt->twobyte && (ctxt->b == 0xb6 || ctxt->b == 0xb7));
+decode_register_operand(ctxt, op);
 break;
 case OpImmUByte:
 rc = decode_imm(ctxt, op, 1, false);
@@ -3656,6 +3735,9 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
 case OpImm:
 rc = decode_imm(ctxt, op, imm_size(ctxt), true);
 break;
+case OpMem8:
+ctxt->memop.bytes = 1;
+goto mem_common;
 case OpMem16:
 ctxt->memop.bytes = 2;
 goto mem_common;
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index b6a73537e1ef..81cf4fa4a2be 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -307,6 +307,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
 if (val & 0x10) {
 s->init4 = val & 1;
 s->last_irr = 0;
+s->irr &= s->elcr;
 s->imr = 0;
 s->priority_add = 0;
 s->special_mask = 0;
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 31bfc6927bc0..858432287ab6 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -433,7 +433,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 break;
 
 case APIC_DM_INIT:
-if (level) {
+if (!trig_mode || level) {
 result = 1;
 vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
 kvm_make_request(KVM_REQ_EVENT, vcpu);
@@ -731,7 +731,7 @@ static void start_apic_timer(struct kvm_lapic *apic)
 u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline;
 u64 ns = 0;
 struct kvm_vcpu *vcpu = apic->vcpu;
-unsigned long this_tsc_khz = vcpu_tsc_khz(vcpu);
+unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz;
 unsigned long flags;
 
 if (unlikely(!tscdeadline || !this_tsc_khz))
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 224b02c3cda9..4cb164268846 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -688,9 +688,8 @@ static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
 {
 unsigned long idx;
 
-idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
-(slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
-return &slot->lpage_info[level - 2][idx];
+idx = gfn_to_index(gfn, slot->base_gfn, level);
+return &slot->arch.lpage_info[level - 2][idx];
 }
 
 static void account_shadowed(struct kvm *kvm, gfn_t gfn)
@@ -946,7 +945,7 @@ static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn)
 }
 }
 
-static unsigned long *__gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level,
+static unsigned long *__gfn_to_rmap(gfn_t gfn, int level,
 struct kvm_memory_slot *slot)
 {
 struct kvm_lpage_info *linfo;
@@ -966,7 +965,7 @@ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
 struct kvm_memory_slot *slot;
 
 slot = gfn_to_memslot(kvm, gfn);
-return __gfn_to_rmap(kvm, gfn, level, slot);
+return __gfn_to_rmap(gfn, level, slot);
 }
 
 static bool rmap_can_add(struct kvm_vcpu *vcpu)
@@ -988,7 +987,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
 return pte_list_add(vcpu, spte, rmapp);
 }
 
-static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
+static u64 *rmap_next(unsigned long *rmapp, u64 *spte)
 {
 return pte_list_next(rmapp, spte);
 }
@@ -1018,8 +1017,8 @@ int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn,
 u64 *spte;
 int i, write_protected = 0;
 
-rmapp = __gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL, slot);
-spte = rmap_next(kvm, rmapp, NULL);
+rmapp = __gfn_to_rmap(gfn, PT_PAGE_TABLE_LEVEL, slot);
+spte = rmap_next(rmapp, NULL);
 while (spte) {
 BUG_ON(!(*spte & PT_PRESENT_MASK));
 rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
@@ -1027,14 +1026,14 @@ int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn,
 mmu_spte_update(spte, *spte & ~PT_WRITABLE_MASK);
 write_protected = 1;
 }
-spte = rmap_next(kvm, rmapp, spte);
+spte = rmap_next(rmapp, spte);
 }
 
 /* check for huge page mappings */
 for (i = PT_DIRECTORY_LEVEL;
 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
-rmapp = __gfn_to_rmap(kvm, gfn, i, slot);
-spte = rmap_next(kvm, rmapp, NULL);
+rmapp = __gfn_to_rmap(gfn, i, slot);
+spte = rmap_next(rmapp, NULL);
 while (spte) {
 BUG_ON(!(*spte & PT_PRESENT_MASK));
 BUG_ON(!is_large_pte(*spte));
@@ -1045,7 +1044,7 @@ int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn,
 spte = NULL;
 write_protected = 1;
 }
-spte = rmap_next(kvm, rmapp, spte);
+spte = rmap_next(rmapp, spte);
 }
 }
 
@@ -1066,7 +1065,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
 u64 *spte;
 int need_tlb_flush = 0;
 
-while ((spte = rmap_next(kvm, rmapp, NULL))) {
+while ((spte = rmap_next(rmapp, NULL))) {
 BUG_ON(!(*spte & PT_PRESENT_MASK));
 rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
 drop_spte(kvm, spte);
@@ -1085,14 +1084,14 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
 
 WARN_ON(pte_huge(*ptep));
 new_pfn = pte_pfn(*ptep);
-spte = rmap_next(kvm, rmapp, NULL);
+spte = rmap_next(rmapp, NULL);
 while (spte) {
 BUG_ON(!is_shadow_present_pte(*spte));
 rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
 need_flush = 1;
 if (pte_write(*ptep)) {
 drop_spte(kvm, spte);
-spte = rmap_next(kvm, rmapp, NULL);
+spte = rmap_next(rmapp, NULL);
 } else {
 new_spte = *spte &~ (PT64_BASE_ADDR_MASK);
 new_spte |= (u64)new_pfn << PAGE_SHIFT;
@@ -1102,7 +1101,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
 new_spte &= ~shadow_accessed_mask;
 mmu_spte_clear_track_bits(spte);
 mmu_spte_set(spte, new_spte);
-spte = rmap_next(kvm, rmapp, spte);
+spte = rmap_next(rmapp, spte);
 }
 }
 if (need_flush)
@@ -1176,7 +1175,7 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 if (!shadow_accessed_mask)
 return kvm_unmap_rmapp(kvm, rmapp, data);
 
-spte = rmap_next(kvm, rmapp, NULL);
+spte = rmap_next(rmapp, NULL);
 while (spte) {
 int _young;
 u64 _spte = *spte;
@@ -1186,7 +1185,7 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 young = 1;
 clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
 }
-spte = rmap_next(kvm, rmapp, spte);
+spte = rmap_next(rmapp, spte);
 }
 return young;
 }
@@ -1205,7 +1204,7 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 if (!shadow_accessed_mask)
 goto out;
 
-spte = rmap_next(kvm, rmapp, NULL);
+spte = rmap_next(rmapp, NULL);
 while (spte) {
 u64 _spte = *spte;
 BUG_ON(!(_spte & PT_PRESENT_MASK));
@@ -1214,7 +1213,7 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 young = 1;
 break;
 }
-spte = rmap_next(kvm, rmapp, spte);
+spte = rmap_next(rmapp, spte);
 }
 out:
 return young;
@@ -1391,11 +1390,6 @@ struct kvm_mmu_pages {
 unsigned int nr;
 };
 
-#define for_each_unsync_children(bitmap, idx) \
-for (idx = find_first_bit(bitmap, 512); \
-idx < 512; \
-idx = find_next_bit(bitmap, 512, idx+1))
-
 static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
 int idx)
 {
@@ -1417,7 +1411,7 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
 {
 int i, ret, nr_unsync_leaf = 0;
 
-for_each_unsync_children(sp->unsync_child_bitmap, i) {
+for_each_set_bit(i, sp->unsync_child_bitmap, 512) {
 struct kvm_mmu_page *child;
 u64 ent = sp->spt[i];
 
@@ -1803,6 +1797,7 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
 {
 if (is_large_pte(*sptep)) {
 drop_spte(vcpu->kvm, sptep);
+--vcpu->kvm->stat.lpages;
 kvm_flush_remote_tlbs(vcpu->kvm);
 }
 }
@@ -3190,15 +3185,14 @@ static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access,
 #undef PTTYPE
 
 static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
-struct kvm_mmu *context,
-int level)
+struct kvm_mmu *context)
 {
 int maxphyaddr = cpuid_maxphyaddr(vcpu);
 u64 exb_bit_rsvd = 0;
 
 if (!context->nx)
 exb_bit_rsvd = rsvd_bits(63, 63);
-switch (level) {
+switch (context->root_level) {
 case PT32_ROOT_LEVEL:
 /* no rsvd bits for 2 level 4K page table entries */
 context->rsvd_bits_mask[0][1] = 0;
@@ -3256,8 +3250,9 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu,
 int level)
 {
 context->nx = is_nx(vcpu);
+context->root_level = level;
 
-reset_rsvds_bits_mask(vcpu, context, level);
+reset_rsvds_bits_mask(vcpu, context);
 
 ASSERT(is_pae(vcpu));
 context->new_cr3 = paging_new_cr3;
@@ -3267,7 +3262,6 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu,
 context->invlpg = paging64_invlpg;
 context->update_pte = paging64_update_pte;
 context->free = paging_free;
-context->root_level = level;
 context->shadow_root_level = level;
 context->root_hpa = INVALID_PAGE;
 context->direct_map = false;
@@ -3284,8 +3278,9 @@ static int paging32_init_context(struct kvm_vcpu *vcpu,
 struct kvm_mmu *context)
 {
 context->nx = false;
+context->root_level = PT32_ROOT_LEVEL;
 
-reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL);
+reset_rsvds_bits_mask(vcpu, context);
 
 context->new_cr3 = paging_new_cr3;
 context->page_fault = paging32_page_fault;
@@ -3294,7 +3289,6 @@ static int paging32_init_context(struct kvm_vcpu *vcpu,
 context->sync_page = paging32_sync_page;
 context->invlpg = paging32_invlpg;
 context->update_pte = paging32_update_pte;
-context->root_level = PT32_ROOT_LEVEL;
 context->shadow_root_level = PT32E_ROOT_LEVEL;
 context->root_hpa = INVALID_PAGE;
 context->direct_map = false;
@@ -3325,7 +3319,6 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 context->get_cr3 = get_cr3;
 context->get_pdptr = kvm_pdptr_read;
 context->inject_page_fault = kvm_inject_page_fault;
-context->nx = is_nx(vcpu);
 
 if (!is_paging(vcpu)) {
 context->nx = false;
@@ -3333,19 +3326,19 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 context->root_level = 0;
 } else if (is_long_mode(vcpu)) {
 context->nx = is_nx(vcpu);
-reset_rsvds_bits_mask(vcpu, context, PT64_ROOT_LEVEL);
-context->gva_to_gpa = paging64_gva_to_gpa;
 context->root_level = PT64_ROOT_LEVEL;
+reset_rsvds_bits_mask(vcpu, context);
+context->gva_to_gpa = paging64_gva_to_gpa;
 } else if (is_pae(vcpu)) {
 context->nx = is_nx(vcpu);
-reset_rsvds_bits_mask(vcpu, context, PT32E_ROOT_LEVEL);
-context->gva_to_gpa = paging64_gva_to_gpa;
 context->root_level = PT32E_ROOT_LEVEL;
+reset_rsvds_bits_mask(vcpu, context);
+context->gva_to_gpa = paging64_gva_to_gpa;
 } else {
 context->nx = false;
-reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL);
-context->gva_to_gpa = paging32_gva_to_gpa;
 context->root_level = PT32_ROOT_LEVEL;
+reset_rsvds_bits_mask(vcpu, context);
+context->gva_to_gpa = paging32_gva_to_gpa;
 }
 
 return 0;
@@ -3408,18 +3401,18 @@ static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
 g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested;
 } else if (is_long_mode(vcpu)) {
 g_context->nx = is_nx(vcpu);
-reset_rsvds_bits_mask(vcpu, g_context, PT64_ROOT_LEVEL);
 g_context->root_level = PT64_ROOT_LEVEL;
+reset_rsvds_bits_mask(vcpu, g_context);
 g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
 } else if (is_pae(vcpu)) {
 g_context->nx = is_nx(vcpu);
-reset_rsvds_bits_mask(vcpu, g_context, PT32E_ROOT_LEVEL);
 g_context->root_level = PT32E_ROOT_LEVEL;
+reset_rsvds_bits_mask(vcpu, g_context);
 g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
 } else {
 g_context->nx = false;
-reset_rsvds_bits_mask(vcpu, g_context, PT32_ROOT_LEVEL);
 g_context->root_level = PT32_ROOT_LEVEL;
+reset_rsvds_bits_mask(vcpu, g_context);
 g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
 }
 
@@ -3555,7 +3548,7 @@ static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
 * If we're seeing too many writes to a page, it may no longer be a page table,
 * or we may be forking, in which case it is better to unmap the page.
 */
-static bool detect_write_flooding(struct kvm_mmu_page *sp, u64 *spte)
+static bool detect_write_flooding(struct kvm_mmu_page *sp)
 {
 /*
 * Skip write-flooding detected for the sp whose level is 1, because
@@ -3664,10 +3657,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 
 mask.cr0_wp = mask.cr4_pae = mask.nxe = 1;
 for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) {
-spte = get_written_sptes(sp, gpa, &npte);
-
 if (detect_write_misaligned(sp, gpa, bytes) ||
-detect_write_flooding(sp, spte)) {
+detect_write_flooding(sp)) {
 zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
 &invalid_list);
 ++vcpu->kvm->stat.mmu_flooded;
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index ea7b4fd34676..715da5a19a5b 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -200,13 +200,13 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
 slot = gfn_to_memslot(kvm, sp->gfn);
 rmapp = &slot->rmap[sp->gfn - slot->base_gfn];
 
-spte = rmap_next(kvm, rmapp, NULL);
+spte = rmap_next(rmapp, NULL);
 while (spte) {
 if (is_writable_pte(*spte))
 audit_printk(kvm, "shadow page has writable "
 "mappings: gfn %llx role %x\n",
 sp->gfn, sp->role.word);
-spte = rmap_next(kvm, rmapp, spte);
+spte = rmap_next(rmapp, spte);
 }
 }
 
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 7aad5446f393..a73f0c104813 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -33,10 +33,11 @@ static struct kvm_arch_event_perf_mapping {
 [4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES },
 [5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
 [6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES },
+[7] = { 0x00, 0x30, PERF_COUNT_HW_REF_CPU_CYCLES },
 };
 
 /* mapping between fixed pmc index and arch_events array */
-int fixed_pmc_events[] = {1, 0, 2};
+int fixed_pmc_events[] = {1, 0, 7};
 
 static bool pmc_is_gp(struct kvm_pmc *pmc)
 {
@@ -210,6 +211,9 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
 unsigned config, type = PERF_TYPE_RAW;
 u8 event_select, unit_mask;
 
+if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL)
+printk_once("kvm pmu: pin control bit is ignored\n");
+
 pmc->eventsel = eventsel;
 
 stop_counter(pmc);
@@ -220,7 +224,7 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
 event_select = eventsel & ARCH_PERFMON_EVENTSEL_EVENT;
 unit_mask = (eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
 
-if (!(event_select & (ARCH_PERFMON_EVENTSEL_EDGE |
+if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE |
 ARCH_PERFMON_EVENTSEL_INV |
 ARCH_PERFMON_EVENTSEL_CMASK))) {
 config = find_arch_event(&pmc->vcpu->arch.pmu, event_select,
@@ -413,7 +417,7 @@ int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data)
 struct kvm_pmc *counters;
 u64 ctr;
 
-pmc &= (3u << 30) - 1;
+pmc &= ~(3u << 30);
 if (!fixed && pmc >= pmu->nr_arch_gp_counters)
 return 1;
 if (fixed && pmc >= pmu->nr_arch_fixed_counters)
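
A quick arithmetic note on the last pmu.c hunk: the old mask, (3u << 30) - 1, evaluates to 0xbfffffff, so it clears only bit 30 of the RDPMC index and leaves bit 31 in place, while ~(3u << 30) is 0x3fffffff and strips both of the top two flag bits before the value is compared against the counter limits. A standalone sketch of the two expressions (the example index value is arbitrary):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint32_t pmc = (1u << 31) | (1u << 30) | 2;   /* arbitrary example index */

        /* old mask keeps bit 31: result 0x80000002 */
        printf("old mask %#010x -> %#x\n", (3u << 30) - 1, pmc & ((3u << 30) - 1));
        /* new mask strips both top bits: result 0x2 */
        printf("new mask %#010x -> %#x\n", ~(3u << 30), pmc & ~(3u << 30));
        return 0;
    }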
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index e385214711cb..e334389e1c75 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -111,6 +111,12 @@ struct nested_state {
 #define MSRPM_OFFSETS 16
 static u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
 
+/*
+ * Set osvw_len to higher value when updated Revision Guides
+ * are published and we know what the new status bits are
+ */
+static uint64_t osvw_len = 4, osvw_status;
+
 struct vcpu_svm {
 struct kvm_vcpu vcpu;
 struct vmcb *vmcb;
@@ -177,11 +183,13 @@ static bool npt_enabled = true;
 #else
 static bool npt_enabled;
 #endif
-static int npt = 1;
 
+/* allow nested paging (virtualized MMU) for all guests */
+static int npt = true;
 module_param(npt, int, S_IRUGO);
 
-static int nested = 1;
+/* allow nested virtualization in KVM/SVM */
+static int nested = true;
 module_param(nested, int, S_IRUGO);
 
 static void svm_flush_tlb(struct kvm_vcpu *vcpu);
@@ -557,6 +565,27 @@ static void svm_init_erratum_383(void)
 erratum_383_found = true;
 }
 
+static void svm_init_osvw(struct kvm_vcpu *vcpu)
+{
+/*
+ * Guests should see errata 400 and 415 as fixed (assuming that
+ * HLT and IO instructions are intercepted).
+ */
+vcpu->arch.osvw.length = (osvw_len >= 3) ? (osvw_len) : 3;
+vcpu->arch.osvw.status = osvw_status & ~(6ULL);
+
+/*
+ * By increasing VCPU's osvw.length to 3 we are telling the guest that
+ * all osvw.status bits inside that length, including bit 0 (which is
+ * reserved for erratum 298), are valid. However, if host processor's
+ * osvw_len is 0 then osvw_status[0] carries no information. We need to
+ * be conservative here and therefore we tell the guest that erratum 298
+ * is present (because we really don't know).
+ */
+if (osvw_len == 0 && boot_cpu_data.x86 == 0x10)
+vcpu->arch.osvw.status |= 1;
+}
+
 static int has_svm(void)
 {
 const char *msg;
@@ -623,6 +652,36 @@ static int svm_hardware_enable(void *garbage)
 __get_cpu_var(current_tsc_ratio) = TSC_RATIO_DEFAULT;
 }
 
+
+/*
+ * Get OSVW bits.
+ *
+ * Note that it is possible to have a system with mixed processor
+ * revisions and therefore different OSVW bits. If bits are not the same
+ * on different processors then choose the worst case (i.e. if erratum
+ * is present on one processor and not on another then assume that the
+ * erratum is present everywhere).
+ */
+if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) {
+uint64_t len, status = 0;
+int err;
+
+len = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &err);
+if (!err)
+status = native_read_msr_safe(MSR_AMD64_OSVW_STATUS,
+&err);
+
+if (err)
+osvw_status = osvw_len = 0;
+else {
+if (len < osvw_len)
+osvw_len = len;
+osvw_status |= status;
+osvw_status &= (1ULL << osvw_len) - 1;
+}
+} else
+osvw_status = osvw_len = 0;
+
 svm_init_erratum_383();
 
 amd_pmu_enable_virt();
@@ -910,20 +969,25 @@ static u64 svm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc)
 return _tsc;
 }
 
-static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
+static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
 {
 struct vcpu_svm *svm = to_svm(vcpu);
 u64 ratio;
 u64 khz;
 
-/* TSC scaling supported? */
-if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR))
+/* Guest TSC same frequency as host TSC? */
+if (!scale) {
+svm->tsc_ratio = TSC_RATIO_DEFAULT;
 return;
+}
 
-/* TSC-Scaling disabled or guest TSC same frequency as host TSC? */
-if (user_tsc_khz == 0) {
-vcpu->arch.virtual_tsc_khz = 0;
-svm->tsc_ratio = TSC_RATIO_DEFAULT;
+/* TSC scaling supported? */
+if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
+if (user_tsc_khz > tsc_khz) {
+vcpu->arch.tsc_catchup = 1;
+vcpu->arch.tsc_always_catchup = 1;
+} else
+WARN(1, "user requested TSC rate below hardware speed\n");
 return;
 }
 
@@ -938,7 +1002,6 @@ static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
 user_tsc_khz);
 return;
 }
-vcpu->arch.virtual_tsc_khz = user_tsc_khz;
 svm->tsc_ratio = ratio;
 }
 
@@ -958,10 +1021,14 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
 mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
 }
 
-static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
+static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host)
 {
 struct vcpu_svm *svm = to_svm(vcpu);
 
+WARN_ON(adjustment < 0);
+if (host)
+adjustment = svm_scale_tsc(vcpu, adjustment);
+
 svm->vmcb->control.tsc_offset += adjustment;
 if (is_guest_mode(vcpu))
 svm->nested.hsave->control.tsc_offset += adjustment;
@@ -1191,6 +1258,8 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
 if (kvm_vcpu_is_bsp(&svm->vcpu))
 svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
 
+svm_init_osvw(&svm->vcpu);
+
 return &svm->vcpu;
 
 free_page4:
@@ -1268,6 +1337,21 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
 }
 
+static void svm_update_cpl(struct kvm_vcpu *vcpu)
+{
+struct vcpu_svm *svm = to_svm(vcpu);
+int cpl;
+
+if (!is_protmode(vcpu))
+cpl = 0;
+else if (svm->vmcb->save.rflags & X86_EFLAGS_VM)
+cpl = 3;
+else
+cpl = svm->vmcb->save.cs.selector & 0x3;
+
+svm->vmcb->save.cpl = cpl;
+}
+
 static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
 {
 return to_svm(vcpu)->vmcb->save.rflags;
@@ -1275,7 +1359,11 @@ static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
 
 static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
 {
+unsigned long old_rflags = to_svm(vcpu)->vmcb->save.rflags;
+
 to_svm(vcpu)->vmcb->save.rflags = rflags;
+if ((old_rflags ^ rflags) & X86_EFLAGS_VM)
+svm_update_cpl(vcpu);
 }
 
 static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
@@ -1543,9 +1631,7 @@ static void svm_set_segment(struct kvm_vcpu *vcpu,
 s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
 }
 if (seg == VCPU_SREG_CS)
-svm->vmcb->save.cpl
-= (svm->vmcb->save.cs.attrib
->> SVM_SELECTOR_DPL_SHIFT) & 3;
+svm_update_cpl(vcpu);
 
 mark_dirty(svm->vmcb, VMCB_SEG);
 }
@@ -2735,7 +2821,10 @@ static int task_switch_interception(struct vcpu_svm *svm)
 (int_vec == OF_VECTOR || int_vec == BP_VECTOR)))
 skip_emulated_instruction(&svm->vcpu);
 
-if (kvm_task_switch(&svm->vcpu, tss_selector, reason,
+if (int_type != SVM_EXITINTINFO_TYPE_SOFT)
+int_vec = -1;
+
+if (kvm_task_switch(&svm->vcpu, tss_selector, int_vec, reason,
 has_error_code, error_code) == EMULATE_FAIL) {
 svm->vcpu.run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
 svm->vcpu.run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
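
The svm_hardware_enable() hunk above merges the per-CPU OSVW MSRs into a single worst-case view: the advertised length shrinks to the shortest one seen and the status bits are OR-ed together, so an erratum flagged on any package is reported to every guest. A small self-contained sketch of that merge, with made-up per-CPU values:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t osvw_len = 4, osvw_status = 0;    /* defaults from the patch */
        uint64_t cpu_len[]    = { 4, 3 };          /* hypothetical per-CPU MSR reads */
        uint64_t cpu_status[] = { 0x2, 0x5 };

        for (int i = 0; i < 2; i++) {
            if (cpu_len[i] < osvw_len)
                osvw_len = cpu_len[i];
            osvw_status |= cpu_status[i];
            osvw_status &= (1ULL << osvw_len) - 1; /* drop bits beyond the valid length */
        }
        /* prints: merged len=3 status=0x7 */
        printf("merged len=%llu status=%#llx\n",
               (unsigned long long)osvw_len, (unsigned long long)osvw_status);
        return 0;
    }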
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 246490f643b6..280751c84724 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -70,9 +70,6 @@ module_param(emulate_invalid_guest_state, bool, S_IRUGO);
 static bool __read_mostly vmm_exclusive = 1;
 module_param(vmm_exclusive, bool, S_IRUGO);
 
-static bool __read_mostly yield_on_hlt = 1;
-module_param(yield_on_hlt, bool, S_IRUGO);
-
 static bool __read_mostly fasteoi = 1;
 module_param(fasteoi, bool, S_IRUGO);
 
@@ -1655,17 +1652,6 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 vmx_set_interrupt_shadow(vcpu, 0);
 }
 
-static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
-{
-/* Ensure that we clear the HLT state in the VMCS. We don't need to
- * explicitly skip the instruction because if the HLT state is set, then
- * the instruction is already executing and RIP has already been
- * advanced. */
-if (!yield_on_hlt &&
-vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
-vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
-}
-
 /*
 * KVM wants to inject page-faults which it got to the guest. This function
 * checks whether in a nested guest, we need to inject them to L1 or L2.
@@ -1678,7 +1664,7 @@ static int nested_pf_handled(struct kvm_vcpu *vcpu)
 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 
 /* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */
-if (!(vmcs12->exception_bitmap & PF_VECTOR))
+if (!(vmcs12->exception_bitmap & (1u << PF_VECTOR)))
 return 0;
 
 nested_vmx_vmexit(vcpu);
@@ -1718,7 +1704,6 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
 intr_info |= INTR_TYPE_HARD_EXCEPTION;
 
 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
-vmx_clear_hlt(vcpu);
 }
 
 static bool vmx_rdtscp_supported(void)
@@ -1817,13 +1802,19 @@ u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu)
 }
 
 /*
- * Empty call-back. Needs to be implemented when VMX enables the SET_TSC_KHZ
- * ioctl. In this case the call-back should update internal vmx state to make
- * the changes effective.
+ * Engage any workarounds for mis-matched TSC rates. Currently limited to
+ * software catchup for faster rates on slower CPUs.
 */
-static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
+static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
 {
-/* Nothing to do here */
+if (!scale)
+return;
+
+if (user_tsc_khz > tsc_khz) {
+vcpu->arch.tsc_catchup = 1;
+vcpu->arch.tsc_always_catchup = 1;
+} else
+WARN(1, "user requested TSC rate below hardware speed\n");
 }
 
 /*
@@ -1850,7 +1841,7 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
 }
 }
 
-static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
+static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host)
 {
 u64 offset = vmcs_read64(TSC_OFFSET);
 vmcs_write64(TSC_OFFSET, offset + adjustment);
@@ -2219,6 +2210,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 msr = find_msr_entry(vmx, msr_index);
 if (msr) {
 msr->data = data;
+if (msr - vmx->guest_msrs < vmx->save_nmsrs)
+kvm_set_shared_msr(msr->index, msr->data,
+msr->mask);
 break;
 }
 ret = kvm_set_msr_common(vcpu, msr_index, data);
@@ -2399,7 +2393,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 &_pin_based_exec_control) < 0)
 return -EIO;
 
-min =
+min = CPU_BASED_HLT_EXITING |
 #ifdef CONFIG_X86_64
 CPU_BASED_CR8_LOAD_EXITING |
 CPU_BASED_CR8_STORE_EXITING |
@@ -2414,9 +2408,6 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 CPU_BASED_INVLPG_EXITING |
 CPU_BASED_RDPMC_EXITING;
 
-if (yield_on_hlt)
-min |= CPU_BASED_HLT_EXITING;
-
 opt = CPU_BASED_TPR_SHADOW |
 CPU_BASED_USE_MSR_BITMAPS |
 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
@@ -4003,7 +3994,6 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu)
 } else
 intr |= INTR_TYPE_EXT_INTR;
 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
-vmx_clear_hlt(vcpu);
 }
 
 static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
@@ -4035,7 +4025,6 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
 }
 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
-vmx_clear_hlt(vcpu);
 }
 
 static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
@@ -4672,9 +4661,10 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
 bool has_error_code = false;
 u32 error_code = 0;
 u16 tss_selector;
-int reason, type, idt_v;
+int reason, type, idt_v, idt_index;
 
 idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
+idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK);
 type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK);
 
 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
@@ -4712,8 +4702,9 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
 type != INTR_TYPE_NMI_INTR))
 skip_emulated_instruction(vcpu);
 
-if (kvm_task_switch(vcpu, tss_selector, reason,
-has_error_code, error_code) == EMULATE_FAIL) {
+if (kvm_task_switch(vcpu, tss_selector,
+type == INTR_TYPE_SOFT_INTR ? idt_index : -1, reason,
+has_error_code, error_code) == EMULATE_FAIL) {
 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
 vcpu->run->internal.ndata = 0;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 54696b5f8443..4044ce0bf7c1 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -97,6 +97,10 @@ EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
97u32 kvm_max_guest_tsc_khz; 97u32 kvm_max_guest_tsc_khz;
98EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz); 98EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);
99 99
100/* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
101static u32 tsc_tolerance_ppm = 250;
102module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
103
100#define KVM_NR_SHARED_MSRS 16 104#define KVM_NR_SHARED_MSRS 16
101 105
102struct kvm_shared_msrs_global { 106struct kvm_shared_msrs_global {
@@ -969,50 +973,51 @@ static inline u64 get_kernel_ns(void)
 static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
 unsigned long max_tsc_khz;
 
-static inline int kvm_tsc_changes_freq(void)
+static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
 {
-	int cpu = get_cpu();
-	int ret = !boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
-		  cpufreq_quick_get(cpu) != 0;
-	put_cpu();
-	return ret;
+	return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult,
+				   vcpu->arch.virtual_tsc_shift);
 }
 
-u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu)
+static u32 adjust_tsc_khz(u32 khz, s32 ppm)
 {
-	if (vcpu->arch.virtual_tsc_khz)
-		return vcpu->arch.virtual_tsc_khz;
-	else
-		return __this_cpu_read(cpu_tsc_khz);
+	u64 v = (u64)khz * (1000000 + ppm);
+	do_div(v, 1000000);
+	return v;
 }
 
-static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
+static void kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
 {
-	u64 ret;
-
-	WARN_ON(preemptible());
-	if (kvm_tsc_changes_freq())
-		printk_once(KERN_WARNING
-		 "kvm: unreliable cycle conversion on adjustable rate TSC\n");
-	ret = nsec * vcpu_tsc_khz(vcpu);
-	do_div(ret, USEC_PER_SEC);
-	return ret;
-}
+	u32 thresh_lo, thresh_hi;
+	int use_scaling = 0;
 
-static void kvm_init_tsc_catchup(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
-{
 	/* Compute a scale to convert nanoseconds in TSC cycles */
 	kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000,
-			   &vcpu->arch.tsc_catchup_shift,
-			   &vcpu->arch.tsc_catchup_mult);
+			   &vcpu->arch.virtual_tsc_shift,
+			   &vcpu->arch.virtual_tsc_mult);
+	vcpu->arch.virtual_tsc_khz = this_tsc_khz;
+
+	/*
+	 * Compute the variation in TSC rate which is acceptable
+	 * within the range of tolerance and decide if the
+	 * rate being applied is within that bounds of the hardware
+	 * rate.  If so, no scaling or compensation need be done.
+	 */
+	thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm);
+	thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm);
+	if (this_tsc_khz < thresh_lo || this_tsc_khz > thresh_hi) {
+		pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", this_tsc_khz, thresh_lo, thresh_hi);
+		use_scaling = 1;
+	}
+	kvm_x86_ops->set_tsc_khz(vcpu, this_tsc_khz, use_scaling);
 }
 
 static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
 {
-	u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.last_tsc_nsec,
-				      vcpu->arch.tsc_catchup_mult,
-				      vcpu->arch.tsc_catchup_shift);
-	tsc += vcpu->arch.last_tsc_write;
+	u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.this_tsc_nsec,
+				      vcpu->arch.virtual_tsc_mult,
+				      vcpu->arch.virtual_tsc_shift);
+	tsc += vcpu->arch.this_tsc_write;
 	return tsc;
 }
 
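To make the tolerance arithmetic concrete, here is a small user-space sketch of the same calculation, separate from the patch; the host frequency is invented and the helper simply mirrors adjust_tsc_khz() above:

#include <stdint.h>
#include <stdio.h>

/* Scale a kHz value by +/- ppm parts per million, as kvm_set_tsc_khz() does. */
static uint32_t adjust_tsc_khz(uint32_t khz, int32_t ppm)
{
	return (uint64_t)khz * (1000000 + ppm) / 1000000;
}

int main(void)
{
	uint32_t host_tsc_khz = 2800000;   /* assumed 2.8 GHz host TSC         */
	uint32_t tolerance_ppm = 250;      /* the module parameter's default   */

	uint32_t lo = adjust_tsc_khz(host_tsc_khz, -(int32_t)tolerance_ppm);
	uint32_t hi = adjust_tsc_khz(host_tsc_khz, tolerance_ppm);

	/* A requested guest rate outside [lo, hi] would turn on scaling. */
	printf("acceptable range: [%u, %u] kHz\n", lo, hi);
	return 0;
}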
@@ -1021,48 +1026,88 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
 	struct kvm *kvm = vcpu->kvm;
 	u64 offset, ns, elapsed;
 	unsigned long flags;
-	s64 sdiff;
+	s64 usdiff;
 
 	raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
 	offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
 	ns = get_kernel_ns();
 	elapsed = ns - kvm->arch.last_tsc_nsec;
-	sdiff = data - kvm->arch.last_tsc_write;
-	if (sdiff < 0)
-		sdiff = -sdiff;
+
+	/* n.b - signed multiplication and division required */
+	usdiff = data - kvm->arch.last_tsc_write;
+#ifdef CONFIG_X86_64
+	usdiff = (usdiff * 1000) / vcpu->arch.virtual_tsc_khz;
+#else
+	/* do_div() only does unsigned */
+	asm("idivl %2; xor %%edx, %%edx"
+	    : "=A"(usdiff)
+	    : "A"(usdiff * 1000), "rm"(vcpu->arch.virtual_tsc_khz));
+#endif
+	do_div(elapsed, 1000);
+	usdiff -= elapsed;
+	if (usdiff < 0)
+		usdiff = -usdiff;
 
 	/*
-	 * Special case: close write to TSC within 5 seconds of
-	 * another CPU is interpreted as an attempt to synchronize
-	 * The 5 seconds is to accommodate host load / swapping as
-	 * well as any reset of TSC during the boot process.
-	 *
-	 * In that case, for a reliable TSC, we can match TSC offsets,
-	 * or make a best guest using elapsed value.
-	 */
-	if (sdiff < nsec_to_cycles(vcpu, 5ULL * NSEC_PER_SEC) &&
-	    elapsed < 5ULL * NSEC_PER_SEC) {
+	 * Special case: TSC write with a small delta (1 second) of virtual
+	 * cycle time against real time is interpreted as an attempt to
+	 * synchronize the CPU.
+	 *
+	 * For a reliable TSC, we can match TSC offsets, and for an unstable
+	 * TSC, we add elapsed time in this computation.  We could let the
+	 * compensation code attempt to catch up if we fall behind, but
+	 * it's better to try to match offsets from the beginning.
+	 */
+	if (usdiff < USEC_PER_SEC &&
+	    vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) {
 		if (!check_tsc_unstable()) {
-			offset = kvm->arch.last_tsc_offset;
+			offset = kvm->arch.cur_tsc_offset;
 			pr_debug("kvm: matched tsc offset for %llu\n", data);
 		} else {
 			u64 delta = nsec_to_cycles(vcpu, elapsed);
-			offset += delta;
+			data += delta;
+			offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
 			pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
 		}
-		ns = kvm->arch.last_tsc_nsec;
+	} else {
+		/*
+		 * We split periods of matched TSC writes into generations.
+		 * For each generation, we track the original measured
+		 * nanosecond time, offset, and write, so if TSCs are in
+		 * sync, we can match exact offset, and if not, we can match
+		 * exact software computation in compute_guest_tsc()
+		 *
+		 * These values are tracked in kvm->arch.cur_xxx variables.
+		 */
+		kvm->arch.cur_tsc_generation++;
+		kvm->arch.cur_tsc_nsec = ns;
+		kvm->arch.cur_tsc_write = data;
+		kvm->arch.cur_tsc_offset = offset;
+		pr_debug("kvm: new tsc generation %u, clock %llu\n",
+			 kvm->arch.cur_tsc_generation, data);
 	}
+
+	/*
+	 * We also track the most recent recorded KHZ, write and time to
+	 * allow the matching interval to be extended at each write.
+	 */
 	kvm->arch.last_tsc_nsec = ns;
 	kvm->arch.last_tsc_write = data;
-	kvm->arch.last_tsc_offset = offset;
-	kvm_x86_ops->write_tsc_offset(vcpu, offset);
-	raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
+	kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz;
 
 	/* Reset of TSC must disable overshoot protection below */
 	vcpu->arch.hv_clock.tsc_timestamp = 0;
-	vcpu->arch.last_tsc_write = data;
-	vcpu->arch.last_tsc_nsec = ns;
+	vcpu->arch.last_guest_tsc = data;
+
+	/* Keep track of which generation this VCPU has synchronized to */
+	vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation;
+	vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
+	vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
+
+	kvm_x86_ops->write_tsc_offset(vcpu, offset);
+	raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
 }
+
 EXPORT_SYMBOL_GPL(kvm_write_tsc);
 
 static int kvm_guest_time_update(struct kvm_vcpu *v)
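A rough standalone model of the matching test above, separate from the patch: convert the cycle delta to microseconds at the virtual TSC rate, subtract the elapsed host time, and treat anything under one second as an attempt to synchronize. All numbers below are invented.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t virtual_tsc_khz = 2000000;   /* 2 GHz guest TSC              */
	int64_t cycle_delta = 150000000;      /* new write - last write       */
	int64_t elapsed_ns = 60000000;        /* host ns since the last write */

	int64_t usdiff = cycle_delta * 1000 / (int64_t)virtual_tsc_khz;
	usdiff -= elapsed_ns / 1000;          /* both sides now in us         */
	if (usdiff < 0)
		usdiff = -usdiff;

	if (usdiff < 1000000)                 /* USEC_PER_SEC                 */
		printf("within 1s: reuse the current TSC generation\n");
	else
		printf("too far apart: start a new TSC generation\n");
	return 0;
}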
@@ -1078,7 +1123,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 	local_irq_save(flags);
 	tsc_timestamp = kvm_x86_ops->read_l1_tsc(v);
 	kernel_ns = get_kernel_ns();
-	this_tsc_khz = vcpu_tsc_khz(v);
+	this_tsc_khz = __get_cpu_var(cpu_tsc_khz);
 	if (unlikely(this_tsc_khz == 0)) {
 		local_irq_restore(flags);
 		kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
@@ -1098,7 +1143,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 	if (vcpu->tsc_catchup) {
 		u64 tsc = compute_guest_tsc(v, kernel_ns);
 		if (tsc > tsc_timestamp) {
-			kvm_x86_ops->adjust_tsc_offset(v, tsc - tsc_timestamp);
+			adjust_tsc_offset_guest(v, tsc - tsc_timestamp);
 			tsc_timestamp = tsc;
 		}
 	}
@@ -1130,7 +1175,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 	 * observed by the guest and ensure the new system time is greater.
 	 */
 	max_kernel_ns = 0;
-	if (vcpu->hv_clock.tsc_timestamp && vcpu->last_guest_tsc) {
+	if (vcpu->hv_clock.tsc_timestamp) {
 		max_kernel_ns = vcpu->last_guest_tsc -
 			vcpu->hv_clock.tsc_timestamp;
 		max_kernel_ns = pvclock_scale_delta(max_kernel_ns,
@@ -1504,6 +1549,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 	case MSR_K7_HWCR:
 		data &= ~(u64)0x40;	/* ignore flush filter disable */
 		data &= ~(u64)0x100;	/* ignore ignne emulation enable */
+		data &= ~(u64)0x8;	/* ignore TLB cache disable */
 		if (data != 0) {
 			pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
 				data);
@@ -1676,6 +1722,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 		 */
 		pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data);
 		break;
+	case MSR_AMD64_OSVW_ID_LENGTH:
+		if (!guest_cpuid_has_osvw(vcpu))
+			return 1;
+		vcpu->arch.osvw.length = data;
+		break;
+	case MSR_AMD64_OSVW_STATUS:
+		if (!guest_cpuid_has_osvw(vcpu))
+			return 1;
+		vcpu->arch.osvw.status = data;
+		break;
 	default:
 		if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
 			return xen_hvm_config(vcpu, data);
@@ -1960,6 +2016,16 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 		 */
 		data = 0xbe702111;
 		break;
+	case MSR_AMD64_OSVW_ID_LENGTH:
+		if (!guest_cpuid_has_osvw(vcpu))
+			return 1;
+		data = vcpu->arch.osvw.length;
+		break;
+	case MSR_AMD64_OSVW_STATUS:
+		if (!guest_cpuid_has_osvw(vcpu))
+			return 1;
+		data = vcpu->arch.osvw.status;
+		break;
 	default:
 		if (kvm_pmu_msr(vcpu, msr))
 			return kvm_pmu_get_msr(vcpu, msr, pdata);
@@ -2080,6 +2146,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_XSAVE:
 	case KVM_CAP_ASYNC_PF:
 	case KVM_CAP_GET_TSC_KHZ:
+	case KVM_CAP_PCI_2_3:
 		r = 1;
 		break;
 	case KVM_CAP_COALESCED_MMIO:
@@ -2214,19 +2281,23 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 	}
 
 	kvm_x86_ops->vcpu_load(vcpu, cpu);
-	if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) {
-		/* Make sure TSC doesn't go backwards */
-		s64 tsc_delta;
-		u64 tsc;
 
-		tsc = kvm_x86_ops->read_l1_tsc(vcpu);
-		tsc_delta = !vcpu->arch.last_guest_tsc ? 0 :
-			     tsc - vcpu->arch.last_guest_tsc;
+	/* Apply any externally detected TSC adjustments (due to suspend) */
+	if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
+		adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment);
+		vcpu->arch.tsc_offset_adjustment = 0;
+		set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
+	}
 
+	if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) {
+		s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
+				native_read_tsc() - vcpu->arch.last_host_tsc;
 		if (tsc_delta < 0)
 			mark_tsc_unstable("KVM discovered backwards TSC");
 		if (check_tsc_unstable()) {
-			kvm_x86_ops->adjust_tsc_offset(vcpu, -tsc_delta);
+			u64 offset = kvm_x86_ops->compute_tsc_offset(vcpu,
+						vcpu->arch.last_guest_tsc);
+			kvm_x86_ops->write_tsc_offset(vcpu, offset);
 			vcpu->arch.tsc_catchup = 1;
 		}
 		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
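The adjustment bookkeeping can be pictured with a toy structure, separate from the patch and with field names assumed rather than taken from the kernel: any delta accumulated while the host was suspended is folded into the offset on the next load and then cleared, so it is never applied twice.

#include <stdint.h>
#include <stdio.h>

/* Toy per-vcpu state mirroring the fields used above (names assumed). */
struct toy_vcpu {
	uint64_t tsc_offset;             /* guest-visible TSC offset          */
	uint64_t tsc_offset_adjustment;  /* accumulated across host suspends  */
};

/* Apply any pending adjustment exactly once, as the load path above does. */
static void toy_vcpu_load(struct toy_vcpu *v)
{
	if (v->tsc_offset_adjustment) {
		v->tsc_offset += v->tsc_offset_adjustment;
		v->tsc_offset_adjustment = 0;  /* don't re-apply on next load */
	}
}

int main(void)
{
	struct toy_vcpu v = { .tsc_offset = 1000, .tsc_offset_adjustment = 500 };

	toy_vcpu_load(&v);
	toy_vcpu_load(&v);  /* second load is a no-op */
	printf("offset=%llu pending=%llu\n",
	       (unsigned long long)v.tsc_offset,
	       (unsigned long long)v.tsc_offset_adjustment);
	return 0;
}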
@@ -2243,7 +2314,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 {
 	kvm_x86_ops->vcpu_put(vcpu);
 	kvm_put_guest_fpu(vcpu);
-	vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu);
+	vcpu->arch.last_host_tsc = native_read_tsc();
 }
 
 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
@@ -2785,26 +2856,21 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		u32 user_tsc_khz;
 
 		r = -EINVAL;
-		if (!kvm_has_tsc_control)
-			break;
-
 		user_tsc_khz = (u32)arg;
 
 		if (user_tsc_khz >= kvm_max_guest_tsc_khz)
 			goto out;
 
-		kvm_x86_ops->set_tsc_khz(vcpu, user_tsc_khz);
+		if (user_tsc_khz == 0)
+			user_tsc_khz = tsc_khz;
+
+		kvm_set_tsc_khz(vcpu, user_tsc_khz);
 
 		r = 0;
 		goto out;
 	}
 	case KVM_GET_TSC_KHZ: {
-		r = -EIO;
-		if (check_tsc_unstable())
-			goto out;
-
-		r = vcpu_tsc_khz(vcpu);
-
+		r = vcpu->arch.virtual_tsc_khz;
 		goto out;
 	}
 	default:
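From user space these paths are reached through the existing per-vCPU ioctls. A rough usage sketch follows, separate from the patch, with error handling on the file descriptors omitted for brevity:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR);
	int vm = ioctl(kvm, KVM_CREATE_VM, 0);
	int vcpu = ioctl(vm, KVM_CREATE_VCPU, 0);

	/* Request a 1.5 GHz virtual TSC; 0 would mean "use the host rate". */
	if (ioctl(vcpu, KVM_SET_TSC_KHZ, 1500000) < 0)
		perror("KVM_SET_TSC_KHZ");

	/* The effective rate now comes straight from virtual_tsc_khz. */
	printf("virtual TSC rate: %d kHz\n", ioctl(vcpu, KVM_GET_TSC_KHZ, 0));
	return 0;
}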
@@ -2815,6 +2881,11 @@ out:
 	return r;
 }
 
+int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
+{
+	return VM_FAULT_SIGBUS;
+}
+
 static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
 {
 	int ret;
@@ -2998,6 +3069,8 @@ static void write_protect_slot(struct kvm *kvm,
 			       unsigned long *dirty_bitmap,
 			       unsigned long nr_dirty_pages)
 {
+	spin_lock(&kvm->mmu_lock);
+
 	/* Not many dirty pages compared to # of shadow pages. */
 	if (nr_dirty_pages < kvm->arch.n_used_mmu_pages) {
 		unsigned long gfn_offset;
@@ -3005,16 +3078,13 @@ static void write_protect_slot(struct kvm *kvm,
 		for_each_set_bit(gfn_offset, dirty_bitmap, memslot->npages) {
 			unsigned long gfn = memslot->base_gfn + gfn_offset;
 
-			spin_lock(&kvm->mmu_lock);
 			kvm_mmu_rmap_write_protect(kvm, gfn, memslot);
-			spin_unlock(&kvm->mmu_lock);
 		}
 		kvm_flush_remote_tlbs(kvm);
-	} else {
-		spin_lock(&kvm->mmu_lock);
+	} else
 		kvm_mmu_slot_remove_write_access(kvm, memslot->id);
-		spin_unlock(&kvm->mmu_lock);
-	}
+
+	spin_unlock(&kvm->mmu_lock);
 }
 
 /*
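The locking change is the classic coarsening pattern: one acquisition around the whole scan instead of one per dirty page. A toy illustration with a pthread mutex follows; the bitmap and "protect" step are stand-ins, not KVM code.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long dirty_bitmap = 0xA5;  /* invented dirty-page bits */

static void write_protect_page(int idx)
{
	printf("write-protecting page %d\n", idx);
}

static void write_protect_all_dirty(void)
{
	pthread_mutex_lock(&lock);        /* one acquisition for the slot   */
	for (int i = 0; i < 8; i++)
		if (dirty_bitmap & (1ul << i))
			write_protect_page(i);
	pthread_mutex_unlock(&lock);      /* released after the full pass   */
}

int main(void)
{
	write_protect_all_dirty();
	return 0;
}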
@@ -3133,6 +3203,9 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		r = -EEXIST;
 		if (kvm->arch.vpic)
 			goto create_irqchip_unlock;
+		r = -EINVAL;
+		if (atomic_read(&kvm->online_vcpus))
+			goto create_irqchip_unlock;
 		r = -ENOMEM;
 		vpic = kvm_create_pic(kvm);
 		if (vpic) {
@@ -4063,6 +4136,11 @@ static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val)
 	return res;
 }
 
+static void emulator_set_rflags(struct x86_emulate_ctxt *ctxt, ulong val)
+{
+	kvm_set_rflags(emul_to_vcpu(ctxt), val);
+}
+
 static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt)
 {
 	return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt));
@@ -4244,6 +4322,7 @@ static struct x86_emulate_ops emulate_ops = {
 	.set_idt             = emulator_set_idt,
 	.get_cr              = emulator_get_cr,
 	.set_cr              = emulator_set_cr,
+	.set_rflags          = emulator_set_rflags,
 	.cpl                 = emulator_get_cpl,
 	.get_dr              = emulator_get_dr,
 	.set_dr              = emulator_set_dr,
@@ -5288,6 +5367,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 		profile_hit(KVM_PROFILING, (void *)rip);
 	}
 
+	if (unlikely(vcpu->arch.tsc_always_catchup))
+		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
 
 	kvm_lapic_sync_from_vapic(vcpu);
 
@@ -5587,15 +5668,15 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
 	return 0;
 }
 
-int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
-		    bool has_error_code, u32 error_code)
+int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
+		    int reason, bool has_error_code, u32 error_code)
 {
 	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
 	int ret;
 
 	init_emulate_ctxt(vcpu);
 
-	ret = emulator_task_switch(ctxt, tss_selector, reason,
+	ret = emulator_task_switch(ctxt, tss_selector, idt_index, reason,
 				   has_error_code, error_code);
 
 	if (ret)
@@ -5928,13 +6009,88 @@ int kvm_arch_hardware_enable(void *garbage)
 	struct kvm *kvm;
 	struct kvm_vcpu *vcpu;
 	int i;
+	int ret;
+	u64 local_tsc;
+	u64 max_tsc = 0;
+	bool stable, backwards_tsc = false;
 
 	kvm_shared_msr_cpu_online();
-	list_for_each_entry(kvm, &vm_list, vm_list)
-		kvm_for_each_vcpu(i, vcpu, kvm)
-			if (vcpu->cpu == smp_processor_id())
-				kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
-	return kvm_x86_ops->hardware_enable(garbage);
+	ret = kvm_x86_ops->hardware_enable(garbage);
+	if (ret != 0)
+		return ret;
+
+	local_tsc = native_read_tsc();
+	stable = !check_tsc_unstable();
+	list_for_each_entry(kvm, &vm_list, vm_list) {
+		kvm_for_each_vcpu(i, vcpu, kvm) {
+			if (!stable && vcpu->cpu == smp_processor_id())
+				set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
+			if (stable && vcpu->arch.last_host_tsc > local_tsc) {
+				backwards_tsc = true;
+				if (vcpu->arch.last_host_tsc > max_tsc)
+					max_tsc = vcpu->arch.last_host_tsc;
+			}
+		}
+	}
+
+	/*
+	 * Sometimes, even reliable TSCs go backwards.  This happens on
+	 * platforms that reset TSC during suspend or hibernate actions, but
+	 * maintain synchronization.  We must compensate.  Fortunately, we can
+	 * detect that condition here, which happens early in CPU bringup,
+	 * before any KVM threads can be running.  Unfortunately, we can't
+	 * bring the TSCs fully up to date with real time, as we aren't yet far
+	 * enough into CPU bringup that we know how much real time has actually
+	 * elapsed; our helper function, get_kernel_ns() will be using boot
+	 * variables that haven't been updated yet.
+	 *
+	 * So we simply find the maximum observed TSC above, then record the
+	 * adjustment to TSC in each VCPU.  When the VCPU later gets loaded,
+	 * the adjustment will be applied.  Note that we accumulate
+	 * adjustments, in case multiple suspend cycles happen before some VCPU
+	 * gets a chance to run again.  In the event that no KVM threads get a
+	 * chance to run, we will miss the entire elapsed period, as we'll have
+	 * reset last_host_tsc, so VCPUs will not have the TSC adjusted and may
+	 * lose cycle time.  This isn't too big a deal, since the loss will be
+	 * uniform across all VCPUs (not to mention the scenario is extremely
+	 * unlikely). It is possible that a second hibernate recovery happens
+	 * much faster than a first, causing the observed TSC here to be
+	 * smaller; this would require additional padding adjustment, which is
+	 * why we set last_host_tsc to the local tsc observed here.
+	 *
+	 * N.B. - this code below runs only on platforms with reliable TSC,
+	 * as that is the only way backwards_tsc is set above.  Also note
+	 * that this runs for ALL vcpus, which is not a bug; all VCPUs should
+	 * have the same delta_cyc adjustment applied if backwards_tsc
+	 * is detected.  Note further, this adjustment is only done once,
+	 * as we reset last_host_tsc on all VCPUs to stop this from being
+	 * called multiple times (one for each physical CPU bringup).
+	 *
+	 * Platforms with unreliable TSCs don't have to deal with this, they
+	 * will be compensated by the logic in vcpu_load, which sets the TSC to
+	 * catchup mode.  This will catchup all VCPUs to real time, but cannot
+	 * guarantee that they stay in perfect synchronization.
+	 */
+	if (backwards_tsc) {
+		u64 delta_cyc = max_tsc - local_tsc;
+		list_for_each_entry(kvm, &vm_list, vm_list) {
+			kvm_for_each_vcpu(i, vcpu, kvm) {
+				vcpu->arch.tsc_offset_adjustment += delta_cyc;
+				vcpu->arch.last_host_tsc = local_tsc;
+			}
+
+			/*
+			 * We have to disable TSC offset matching.. if you were
+			 * booting a VM while issuing an S4 host suspend....
+			 * you may have some problem.  Solving this issue is
+			 * left as an exercise to the reader.
+			 */
+			kvm->arch.last_tsc_nsec = 0;
+			kvm->arch.last_tsc_write = 0;
+		}
+
+	}
+	return 0;
 }
 
 void kvm_arch_hardware_disable(void *garbage)
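The detection pass above can be modelled in a few lines of plain C, separate from the patch; the values are invented, and pending_adjustment stands in for the per-vCPU tsc_offset_adjustment field:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t local_tsc = 1000;                    /* TSC read after resume */
	uint64_t last_host_tsc[3] = { 5000, 7000, 6400 };
	uint64_t pending_adjustment[3] = { 0, 0, 0 };

	/* Find the largest TSC any vCPU observed before the suspend. */
	uint64_t max_tsc = 0;
	int backwards = 0;
	for (int i = 0; i < 3; i++)
		if (last_host_tsc[i] > local_tsc) {
			backwards = 1;
			if (last_host_tsc[i] > max_tsc)
				max_tsc = last_host_tsc[i];
		}

	/* If the TSC went backwards, give every vCPU the same compensation. */
	if (backwards) {
		uint64_t delta = max_tsc - local_tsc;
		for (int i = 0; i < 3; i++) {
			pending_adjustment[i] += delta;   /* applied at next vcpu_load */
			last_host_tsc[i] = local_tsc;     /* don't compensate twice    */
		}
		printf("compensating %llu cycles on all vCPUs\n",
		       (unsigned long long)delta);
	}
	return 0;
}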
@@ -5958,6 +6114,11 @@ void kvm_arch_check_processor_compat(void *rtn)
 	kvm_x86_ops->check_processor_compatibility(rtn);
 }
 
+bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu)
+{
+	return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL);
+}
+
 int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 {
 	struct page *page;
@@ -5980,7 +6141,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 	}
 	vcpu->arch.pio_data = page_address(page);
 
-	kvm_init_tsc_catchup(vcpu, max_tsc_khz);
+	kvm_set_tsc_khz(vcpu, max_tsc_khz);
 
 	r = kvm_mmu_create(vcpu);
 	if (r < 0)
@@ -6032,8 +6193,11 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
 	free_page((unsigned long)vcpu->arch.pio_data);
 }
 
-int kvm_arch_init_vm(struct kvm *kvm)
+int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 {
+	if (type)
+		return -EINVAL;
+
 	INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
 	INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
 
@@ -6093,6 +6257,65 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
 	put_page(kvm->arch.ept_identity_pagetable);
 }
 
+void kvm_arch_free_memslot(struct kvm_memory_slot *free,
+			   struct kvm_memory_slot *dont)
+{
+	int i;
+
+	for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
+		if (!dont || free->arch.lpage_info[i] != dont->arch.lpage_info[i]) {
+			vfree(free->arch.lpage_info[i]);
+			free->arch.lpage_info[i] = NULL;
+		}
+	}
+}
+
+int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
+{
+	int i;
+
+	for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
+		unsigned long ugfn;
+		int lpages;
+		int level = i + 2;
+
+		lpages = gfn_to_index(slot->base_gfn + npages - 1,
+				      slot->base_gfn, level) + 1;
+
+		slot->arch.lpage_info[i] =
+			vzalloc(lpages * sizeof(*slot->arch.lpage_info[i]));
+		if (!slot->arch.lpage_info[i])
+			goto out_free;
+
+		if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
+			slot->arch.lpage_info[i][0].write_count = 1;
+		if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
+			slot->arch.lpage_info[i][lpages - 1].write_count = 1;
+		ugfn = slot->userspace_addr >> PAGE_SHIFT;
+		/*
+		 * If the gfn and userspace address are not aligned wrt each
+		 * other, or if explicitly asked to, disable large page
+		 * support for this slot
+		 */
+		if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) ||
+		    !kvm_largepages_enabled()) {
+			unsigned long j;
+
+			for (j = 0; j < lpages; ++j)
+				slot->arch.lpage_info[i][j].write_count = 1;
+		}
+	}
+
+	return 0;
+
+out_free:
+	for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
+		vfree(slot->arch.lpage_info[i]);
+		slot->arch.lpage_info[i] = NULL;
+	}
+	return -ENOMEM;
+}
+
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
 				struct kvm_memory_slot *memslot,
 				struct kvm_memory_slot old,