10 files changed, 130 insertions, 59 deletions
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 3324d90038e4..3829aa7b663f 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -200,10 +200,12 @@ int __pit_timer_fn(struct kvm_kpit_state *ps)
        atomic_inc(&pt->pending);
        smp_mb__after_atomic_inc();
-        /* FIXME: handle case where the guest is in guest mode */
+        if (vcpu0) {
-        if (vcpu0 && waitqueue_active(&vcpu0->wq)) {
+                set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests);
-                vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+                if (waitqueue_active(&vcpu0->wq)) {
-                wake_up_interruptible(&vcpu0->wq);
+                        vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+                        wake_up_interruptible(&vcpu0->wq);
+                }
        }
        pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period);
@@ -216,7 +218,7 @@ int pit_has_pending_timer(struct kvm_vcpu *vcpu)
 {
        struct kvm_pit *pit = vcpu->kvm->arch.vpit;
-        if (pit && vcpu->vcpu_id == 0)
+        if (pit && vcpu->vcpu_id == 0 && pit->pit_state.inject_pending)
                return atomic_read(&pit->pit_state.pit_timer.pending);
        return 0;
@@ -237,6 +239,19 @@ static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
                return HRTIMER_NORESTART;
 }
+/*
+ * Re-arm the in-kernel PIT hrtimer on behalf of @vcpu.
+ *
+ * Nothing is done unless @vcpu is vcpu 0 and the VM has a PIT
+ * (kvm->arch.vpit is non-NULL).  Otherwise the PIT hrtimer is
+ * cancelled and, only if hrtimer_cancel() reports a nonzero result,
+ * started again with its unchanged absolute expiry time.
+ *
+ * NOTE(review): presumably the cancel/restart pair exists so that the
+ * timer is re-queued on the CPU this vcpu now runs on -- confirm
+ * against the hrtimer API.
+ */
+void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
+{
+        struct kvm_pit *vpit = vcpu->kvm->arch.vpit;
+        struct hrtimer *hrt;
+        /* Only vcpu 0 of a VM that actually has a PIT is of interest. */
+        if (!vpit || vcpu->vcpu_id != 0)
+                return;
+        hrt = &vpit->pit_state.pit_timer.timer;
+        /* A zero result from the cancel means there is nothing to re-arm. */
+        if (!hrtimer_cancel(hrt))
+                return;
+        hrtimer_start(hrt, hrt->expires, HRTIMER_MODE_ABS);
+}
 static void destroy_pit_timer(struct kvm_kpit_timer *pt)
 {
        pr_debug("pit: execute del timer!\n");
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index ce1f583459b1..76d736b5f664 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -94,3 +94,9 @@ void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
        /* TODO: PIT, RTC etc. */
 }
 EXPORT_SYMBOL_GPL(kvm_timer_intr_post);
+/*
+ * Migrate the timers associated with @vcpu: calls the local APIC timer
+ * helper first, then the PIT timer helper.
+ */
+void __kvm_migrate_timers(struct kvm_vcpu *vcpu)
+{
+        __kvm_migrate_apic_timer(vcpu);
+        __kvm_migrate_pit_timer(vcpu);
+}
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 1802134b836f..2a15be2275c0 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -84,6 +84,8 @@ void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
 void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
 void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
 void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu);
+void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu);
+void __kvm_migrate_timers(struct kvm_vcpu *vcpu);
 int pit_has_pending_timer(struct kvm_vcpu *vcpu);
 int apic_has_pending_timer(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 36809d79788b..ebc03f5ae162 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -940,6 +940,7 @@ static int __apic_timer_fn(struct kvm_lapic *apic)
        wait_queue_head_t *q = &apic->vcpu->wq;
        atomic_inc(&apic->timer.pending);
+        set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests);
        if (waitqueue_active(q)) {
                apic->vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
                wake_up_interruptible(q);
@@ -957,7 +958,7 @@ int apic_has_pending_timer(struct kvm_vcpu *vcpu)
 {
        struct kvm_lapic *lapic = vcpu->arch.apic;
-        if (lapic)
+        if (lapic && apic_enabled(lapic) && apic_lvt_enabled(lapic, APIC_LVTT))
                return atomic_read(&lapic->timer.pending);
        return 0;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 36c5406b1813..7e7c3969f7a2 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -640,6 +640,7 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
                        rmap_remove(kvm, spte);
                        --kvm->stat.lpages;
                        set_shadow_pte(spte, shadow_trap_nonpresent_pte);
+                        spte = NULL;
                        write_protected = 1;
                }
                spte = rmap_next(kvm, rmapp, spte);
@@ -658,7 +659,7 @@ static int is_empty_shadow_page(u64 *spt)
        u64 *end;
        for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
-                if (*pos != shadow_trap_nonpresent_pte) {
+                if (is_shadow_present_pte(*pos)) {
                        printk(KERN_ERR "%s: %p %llx\n", __func__,
                               pos, *pos);
                        return 0;
@@ -1082,10 +1083,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
                struct kvm_mmu_page *shadow;
                spte |= PT_WRITABLE_MASK;
-                if (user_fault) {
-                        mmu_unshadow(vcpu->kvm, gfn);
-                        goto unshadowed;
-                }
                shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
                if (shadow ||
@@ -1102,8 +1099,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
                }
        }
-unshadowed:
        if (pte_access & ACC_WRITE_MASK)
                mark_page_dirty(vcpu->kvm, gfn);
@@ -1580,11 +1575,13 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
                                  u64 *spte,
                                  const void *new)
 {
-        if ((sp->role.level != PT_PAGE_TABLE_LEVEL)
+        if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
-            && !vcpu->arch.update_pte.largepage) {
+                if (!vcpu->arch.update_pte.largepage ||
-                ++vcpu->kvm->stat.mmu_pde_zapped;
+                    sp->role.glevels == PT32_ROOT_LEVEL) {
-                return;
+                        ++vcpu->kvm->stat.mmu_pde_zapped;
-        }
+                        return;
+                }
+        }
        ++vcpu->kvm->stat.mmu_pte_updated;
        if (sp->role.glevels == PT32_ROOT_LEVEL)
@@ -1858,6 +1855,7 @@ static void free_mmu_pages(struct kvm_vcpu *vcpu)
                sp = container_of(vcpu->kvm->arch.active_mmu_pages.next,
                                  struct kvm_mmu_page, link);
                kvm_mmu_zap_page(vcpu->kvm, sp);
+                cond_resched();
        }
        free_page((unsigned long)vcpu->arch.mmu.pae_root);
 }
@@ -1996,7 +1994,7 @@ static struct shrinker mmu_shrinker = {
        .seeks = DEFAULT_SEEKS * 10,
 };
-void mmu_destroy_caches(void)
+static void mmu_destroy_caches(void)
 {
        if (pte_chain_cache)
                kmem_cache_destroy(pte_chain_cache);
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 156fe10288ae..934c7b619396 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -418,7 +418,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
        /* mmio */
        if (is_error_pfn(pfn)) {
-                pgprintk("gfn %x is mmio\n", walker.gfn);
+                pgprintk("gfn %lx is mmio\n", walker.gfn);
                kvm_release_pfn_clean(pfn);
                return 1;
        }
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index ab22615eee89..6b0d5fa5bab3 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -688,7 +688,7 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
                delta = vcpu->arch.host_tsc - tsc_this;
                svm->vmcb->control.tsc_offset += delta;
                vcpu->cpu = cpu;
-                kvm_migrate_apic_timer(vcpu);
+                kvm_migrate_timers(vcpu);
        }
        for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index bfe4db11989c..540e95179074 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -566,7 +566,7 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
        load_transition_efer(vmx);
 }
-static void vmx_load_host_state(struct vcpu_vmx *vmx)
+static void __vmx_load_host_state(struct vcpu_vmx *vmx)
 {
        unsigned long flags;
@@ -596,6 +596,13 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx)
        reload_host_efer(vmx);
 }
+/*
+ * Wrapper around __vmx_load_host_state() that brackets the call with
+ * preempt_disable()/preempt_enable(), so the host state is reloaded
+ * with preemption off.
+ */
+static void vmx_load_host_state(struct vcpu_vmx *vmx)
+{
+        preempt_disable();
+        __vmx_load_host_state(vmx);
+        preempt_enable();
+}
 /*
 * Switches to specified vcpu, until a matching vcpu_put(), but assumes
 * vcpu mutex is already taken.
@@ -608,7 +615,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
        if (vcpu->cpu != cpu) {
                vcpu_clear(vmx);
-                kvm_migrate_apic_timer(vcpu);
+                kvm_migrate_timers(vcpu);
                vpid_sync_vcpu_all(vmx);
        }
@@ -654,7 +661,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
 {
-        vmx_load_host_state(to_vmx(vcpu));
+        __vmx_load_host_state(to_vmx(vcpu));
 }
 static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
@@ -884,11 +891,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
        switch (msr_index) {
 #ifdef CONFIG_X86_64
        case MSR_EFER:
+                vmx_load_host_state(vmx);
                ret = kvm_set_msr_common(vcpu, msr_index, data);
-                if (vmx->host_state.loaded) {
-                        reload_host_efer(vmx);
-                        load_transition_efer(vmx);
-                }
                break;
        case MSR_FS_BASE:
                vmcs_writel(GUEST_FS_BASE, data);
@@ -910,11 +914,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
                guest_write_tsc(data);
                break;
        default:
+                vmx_load_host_state(vmx);
                msr = find_msr_entry(vmx, msr_index);
                if (msr) {
                        msr->data = data;
-                        if (vmx->host_state.loaded)
-                                load_msrs(vmx->guest_msrs, vmx->save_nmsrs);
                        break;
                }
                ret = kvm_set_msr_common(vcpu, msr_index, data);
@@ -1036,6 +1039,7 @@ static void hardware_enable(void *garbage)
 static void hardware_disable(void *garbage)
 {
        asm volatile (ASM_VMX_VMXOFF : : : "cc");
+        write_cr4(read_cr4() & ~X86_CR4_VMXE);
 }
 static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 21338bdb28ff..63a77caa59f1 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -492,8 +492,8 @@ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
 static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
 {
        static int version;
-        struct kvm_wall_clock wc;
+        struct pvclock_wall_clock wc;
-        struct timespec wc_ts;
+        struct timespec now, sys, boot;
        if (!wall_clock)
                return;
@@ -502,10 +502,19 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
        kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
-        wc_ts = current_kernel_time();
+        /*
-        wc.wc_sec = wc_ts.tv_sec;
+         * The guest calculates current wall clock time by adding
-        wc.wc_nsec = wc_ts.tv_nsec;
+         * system time (updated by kvm_write_guest_time below) to the
-        wc.wc_version = version;
+         * wall clock specified here.  guest system time equals host
+         * system time for us, thus we must fill in host boot time here.
+         */
+        now = current_kernel_time();
+        ktime_get_ts(&sys);
+        boot = ns_to_timespec(timespec_to_ns(&now) - timespec_to_ns(&sys));
+        wc.sec = boot.tv_sec;
+        wc.nsec = boot.tv_nsec;
+        wc.version = version;
        kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
@@ -513,6 +522,45 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
        kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
 }
+/*
+ * Compute the 32-bit fixed-point fraction (dividend << 32) / divisor.
+ *
+ * The 64-bit numerator is handed to the x86 "divl" instruction as the
+ * EDX:EAX register pair: the "1" constraint preloads EDX with @dividend
+ * and the "0" constraint preloads EAX with 0.  The quotient comes back
+ * in EAX and is returned; the remainder (EDX) is discarded.
+ *
+ * NOTE(review): per the x86 DIV definition the instruction raises a
+ * divide error when @divisor is zero or the quotient does not fit in
+ * 32 bits, so callers presumably must guarantee dividend < divisor --
+ * confirm at each call site.
+ */
+static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
+{
+        uint32_t quotient, remainder;
+        /* Don't try to replace with do_div(), this one calculates
+         * "(dividend << 32) / divisor" */
+        __asm__ ( "divl %4"
+                  : "=a" (quotient), "=d" (remainder)
+                  : "0" (0), "1" (dividend), "r" (divisor) );
+        return quotient;
+}
+/*
+ * Derive the TSC -> nanosecond scaling pair for a TSC running at
+ * @tsc_khz and store it in @hv_clock (tsc_shift, tsc_to_system_mul).
+ *
+ * The TSC rate in Hz is normalised by powers of two into the range
+ * (1e9, 2e9]; the number of doublings (positive) or halvings
+ * (negative) is recorded in tsc_shift.  tsc_to_system_mul is then the
+ * fraction 1e9 / normalised_rate, scaled by 2^32, from div_frac().
+ *
+ * A @tsc_khz of zero cannot be normalised; in that case both fields
+ * are set to zero and the function returns without logging.
+ */
+static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock)
+{
+        uint64_t nsecs = 1000000000LL;
+        int32_t  shift = 0;
+        uint64_t tps64;
+        uint32_t tps32;
+        /*
+         * Zero can never be doubled past 1e9, so the second loop below
+         * would spin forever (and overflow 'shift').  Publish a zero
+         * scale instead of hanging.
+         */
+        if (!tsc_khz) {
+                hv_clock->tsc_shift = 0;
+                hv_clock->tsc_to_system_mul = 0;
+                return;
+        }
+        tps64 = tsc_khz * 1000LL;
+        /* Halve until the rate is at most 2e9 (and thus fits in 32 bits). */
+        while (tps64 > nsecs*2) {
+                tps64 >>= 1;
+                shift--;
+        }
+        tps32 = (uint32_t)tps64;
+        /* Double until the rate exceeds 1e9, keeping div_frac()'s quotient
+         * below 2^32. */
+        while (tps32 <= (uint32_t)nsecs) {
+                tps32 <<= 1;
+                shift++;
+        }
+        hv_clock->tsc_shift = shift;
+        hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32);
+        pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n",
+                 __func__, tsc_khz, hv_clock->tsc_shift,
+                 hv_clock->tsc_to_system_mul);
+}
 static void kvm_write_guest_time(struct kvm_vcpu *v)
 {
        struct timespec ts;
@@ -523,6 +571,11 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
        if ((!vcpu->time_page))
                return;
+        if (unlikely(vcpu->hv_clock_tsc_khz != tsc_khz)) {
+                kvm_set_time_scale(tsc_khz, &vcpu->hv_clock);
+                vcpu->hv_clock_tsc_khz = tsc_khz;
+        }
        /* Keep irq disabled to prevent changes to the clock */
        local_irq_save(flags);
        kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER,
@@ -537,14 +590,14 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
        /*
         * The interface expects us to write an even number signaling that the
         * update is finished. Since the guest won't see the intermediate
-         * state, we just write "2" at the end
+         * state, we just increase by 2 at the end.
         */
-        vcpu->hv_clock.version = 2;
+        vcpu->hv_clock.version += 2;
        shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0);
        memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
-                sizeof(vcpu->hv_clock));
+               sizeof(vcpu->hv_clock));
        kunmap_atomic(shared_kaddr, KM_USER0);
@@ -599,10 +652,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
                /* ...but clean it before doing the actual write */
                vcpu->arch.time_offset = data & ~(PAGE_MASK | 1);
-                vcpu->arch.hv_clock.tsc_to_system_mul =
-                                        clocksource_khz2mult(tsc_khz, 22);
-                vcpu->arch.hv_clock.tsc_shift = 22;
                down_read(&current->mm->mmap_sem);
                vcpu->arch.time_page =
                                gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
@@ -2758,7 +2807,9 @@ again:
        if (vcpu->requests) {
                if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))
-                        __kvm_migrate_apic_timer(vcpu);
+                        __kvm_migrate_timers(vcpu);
+                if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
+                        kvm_x86_ops->tlb_flush(vcpu);
                if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
                                       &vcpu->requests)) {
                        kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS;
@@ -2772,6 +2823,7 @@ again:
                }
        }
+        clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
        kvm_inject_pending_timer_irqs(vcpu);
        preempt_disable();
@@ -2781,21 +2833,13 @@ again:
        local_irq_disable();
-        if (need_resched()) {
+        if (vcpu->requests || need_resched()) {
                local_irq_enable();
                preempt_enable();
                r = 1;
                goto out;
        }
-        if (vcpu->requests)
-                if (test_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) {
-                        local_irq_enable();
-                        preempt_enable();
-                        r = 1;
-                        goto out;
-                }
        if (signal_pending(current)) {
                local_irq_enable();
                preempt_enable();
@@ -2825,9 +2869,6 @@ again:
        kvm_guest_enter();
-        if (vcpu->requests)
-                if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
-                        kvm_x86_ops->tlb_flush(vcpu);
        KVMTRACE_0D(VMENTRY, vcpu, entryexit);
        kvm_x86_ops->run(vcpu, kvm_run);
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index f2a696d6a243..932f216d890c 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -677,8 +677,9 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
        c->use_modrm_ea = 1;
        if (c->modrm_mod == 3) {
-                c->modrm_val = *(unsigned long *)
+                c->modrm_ptr = decode_register(c->modrm_rm,
-                        decode_register(c->modrm_rm, c->regs, c->d & ByteOp);
+                                               c->regs, c->d & ByteOp);
+                c->modrm_val = *(unsigned long *)c->modrm_ptr;
                return rc;
        }
@@ -1005,6 +1006,7 @@ done_prefixes:
                if ((c->d & ModRM) && c->modrm_mod == 3) {
                        c->src.type = OP_REG;
                        c->src.val = c->modrm_val;
+                        c->src.ptr = c->modrm_ptr;
                        break;
                }
                c->src.type = OP_MEM;
@@ -1049,6 +1051,7 @@ done_prefixes:
                if ((c->d & ModRM) && c->modrm_mod == 3) {
                        c->dst.type = OP_REG;
                        c->dst.val = c->dst.orig_val = c->modrm_val;
+                        c->dst.ptr = c->modrm_ptr;
                        break;
                }
                c->dst.type = OP_MEM;
@@ -1724,7 +1727,8 @@ twobyte_insn:
                        if (rc)
                                goto done;
-                        kvm_emulate_hypercall(ctxt->vcpu);
+                        /* Let the processor re-execute the fixed hypercall */
+                        c->eip = ctxt->vcpu->arch.rip;
                        /* Disable writeback. */
                        c->dst.type = OP_NONE;
                        break;