Diffstat (limited to 'arch')
-rw-r--r--  arch/x86/Kconfig            |   5
-rw-r--r--  arch/x86/kernel/Makefile    |   1
-rw-r--r--  arch/x86/kernel/kvmclock.c  |  89
-rw-r--r--  arch/x86/kernel/pvclock.c   | 141
-rw-r--r--  arch/x86/kvm/i8254.c        |   9
-rw-r--r--  arch/x86/kvm/lapic.c        |   1
-rw-r--r--  arch/x86/kvm/mmu.c          |  19
-rw-r--r--  arch/x86/kvm/vmx.c          |  19
-rw-r--r--  arch/x86/kvm/x86.c          |  91
-rw-r--r--  arch/x86/xen/Kconfig        |   3
-rw-r--r--  arch/x86/xen/enlighten.c    |  51
-rw-r--r--  arch/x86/xen/mmu.c          |  19
-rw-r--r--  arch/x86/xen/mmu.h          |  24
-rw-r--r--  arch/x86/xen/time.c         | 132
-rw-r--r--  arch/x86/xen/xen-head.S     |   4
15 files changed, 312 insertions, 296 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 52e18e6d2ba0..e0edaaa6920a 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -383,6 +383,7 @@ config VMI
383config KVM_CLOCK 383config KVM_CLOCK
384 bool "KVM paravirtualized clock" 384 bool "KVM paravirtualized clock"
385 select PARAVIRT 385 select PARAVIRT
386 select PARAVIRT_CLOCK
386 depends on !(X86_VISWS || X86_VOYAGER) 387 depends on !(X86_VISWS || X86_VOYAGER)
387 help 388 help
388 Turning on this option will allow you to run a paravirtualized clock 389 Turning on this option will allow you to run a paravirtualized clock
@@ -410,6 +411,10 @@ config PARAVIRT
410 over full virtualization. However, when run without a hypervisor 411 over full virtualization. However, when run without a hypervisor
411 the kernel is theoretically slower and slightly larger. 412 the kernel is theoretically slower and slightly larger.
412 413
414config PARAVIRT_CLOCK
415 bool
416 default n
417
413endif 418endif
414 419
415config MEMTEST_BOOTPARAM 420config MEMTEST_BOOTPARAM
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 5e618c3b4720..77807d4769c9 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -82,6 +82,7 @@ obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o
82obj-$(CONFIG_KVM_GUEST) += kvm.o 82obj-$(CONFIG_KVM_GUEST) += kvm.o
83obj-$(CONFIG_KVM_CLOCK) += kvmclock.o 83obj-$(CONFIG_KVM_CLOCK) += kvmclock.o
84obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o 84obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o
85obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o
85 86
86obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o 87obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o
87 88
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 08a30986d472..87edf1ceb1df 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -18,6 +18,7 @@
18 18
19#include <linux/clocksource.h> 19#include <linux/clocksource.h>
20#include <linux/kvm_para.h> 20#include <linux/kvm_para.h>
21#include <asm/pvclock.h>
21#include <asm/arch_hooks.h> 22#include <asm/arch_hooks.h>
22#include <asm/msr.h> 23#include <asm/msr.h>
23#include <asm/apic.h> 24#include <asm/apic.h>
@@ -36,18 +37,9 @@ static int parse_no_kvmclock(char *arg)
36early_param("no-kvmclock", parse_no_kvmclock); 37early_param("no-kvmclock", parse_no_kvmclock);
37 38
38/* The hypervisor will put information about time periodically here */ 39/* The hypervisor will put information about time periodically here */
39static DEFINE_PER_CPU_SHARED_ALIGNED(struct kvm_vcpu_time_info, hv_clock); 40static DEFINE_PER_CPU_SHARED_ALIGNED(struct pvclock_vcpu_time_info, hv_clock);
40#define get_clock(cpu, field) per_cpu(hv_clock, cpu).field 41static struct pvclock_wall_clock wall_clock;
41 42
42static inline u64 kvm_get_delta(u64 last_tsc)
43{
44 int cpu = smp_processor_id();
45 u64 delta = native_read_tsc() - last_tsc;
46 return (delta * get_clock(cpu, tsc_to_system_mul)) >> KVM_SCALE;
47}
48
49static struct kvm_wall_clock wall_clock;
50static cycle_t kvm_clock_read(void);
51/* 43/*
52 * The wallclock is the time of day when we booted. Since then, some time may 44 * The wallclock is the time of day when we booted. Since then, some time may
53 * have elapsed since the hypervisor wrote the data. So we try to account for 45 * have elapsed since the hypervisor wrote the data. So we try to account for
@@ -55,64 +47,37 @@ static cycle_t kvm_clock_read(void);
55 */ 47 */
56static unsigned long kvm_get_wallclock(void) 48static unsigned long kvm_get_wallclock(void)
57{ 49{
58 u32 wc_sec, wc_nsec; 50 struct pvclock_vcpu_time_info *vcpu_time;
59 u64 delta;
60 struct timespec ts; 51 struct timespec ts;
61 int version, nsec;
62 int low, high; 52 int low, high;
63 53
64 low = (int)__pa(&wall_clock); 54 low = (int)__pa(&wall_clock);
65 high = ((u64)__pa(&wall_clock) >> 32); 55 high = ((u64)__pa(&wall_clock) >> 32);
56 native_write_msr(MSR_KVM_WALL_CLOCK, low, high);
66 57
67 delta = kvm_clock_read(); 58 vcpu_time = &get_cpu_var(hv_clock);
59 pvclock_read_wallclock(&wall_clock, vcpu_time, &ts);
60 put_cpu_var(hv_clock);
68 61
69 native_write_msr(MSR_KVM_WALL_CLOCK, low, high); 62 return ts.tv_sec;
70 do {
71 version = wall_clock.wc_version;
72 rmb();
73 wc_sec = wall_clock.wc_sec;
74 wc_nsec = wall_clock.wc_nsec;
75 rmb();
76 } while ((wall_clock.wc_version != version) || (version & 1));
77
78 delta = kvm_clock_read() - delta;
79 delta += wc_nsec;
80 nsec = do_div(delta, NSEC_PER_SEC);
81 set_normalized_timespec(&ts, wc_sec + delta, nsec);
82 /*
83 * Of all mechanisms of time adjustment I've tested, this one
84 * was the champion!
85 */
86 return ts.tv_sec + 1;
87} 63}
88 64
89static int kvm_set_wallclock(unsigned long now) 65static int kvm_set_wallclock(unsigned long now)
90{ 66{
91 return 0; 67 return -1;
92} 68}
93 69
94/*
95 * This is our read_clock function. The host puts an tsc timestamp each time
96 * it updates a new time. Without the tsc adjustment, we can have a situation
97 * in which a vcpu starts to run earlier (smaller system_time), but probes
98 * time later (compared to another vcpu), leading to backwards time
99 */
100static cycle_t kvm_clock_read(void) 70static cycle_t kvm_clock_read(void)
101{ 71{
102 u64 last_tsc, now; 72 struct pvclock_vcpu_time_info *src;
103 int cpu; 73 cycle_t ret;
104 74
105 preempt_disable(); 75 src = &get_cpu_var(hv_clock);
106 cpu = smp_processor_id(); 76 ret = pvclock_clocksource_read(src);
107 77 put_cpu_var(hv_clock);
108 last_tsc = get_clock(cpu, tsc_timestamp); 78 return ret;
109 now = get_clock(cpu, system_time);
110
111 now += kvm_get_delta(last_tsc);
112 preempt_enable();
113
114 return now;
115} 79}
80
116static struct clocksource kvm_clock = { 81static struct clocksource kvm_clock = {
117 .name = "kvm-clock", 82 .name = "kvm-clock",
118 .read = kvm_clock_read, 83 .read = kvm_clock_read,
@@ -123,13 +88,14 @@ static struct clocksource kvm_clock = {
123 .flags = CLOCK_SOURCE_IS_CONTINUOUS, 88 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
124}; 89};
125 90
126static int kvm_register_clock(void) 91static int kvm_register_clock(char *txt)
127{ 92{
128 int cpu = smp_processor_id(); 93 int cpu = smp_processor_id();
129 int low, high; 94 int low, high;
130 low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1; 95 low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1;
131 high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32); 96 high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32);
132 97 printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n",
98 cpu, high, low, txt);
133 return native_write_msr_safe(MSR_KVM_SYSTEM_TIME, low, high); 99 return native_write_msr_safe(MSR_KVM_SYSTEM_TIME, low, high);
134} 100}
135 101
@@ -140,12 +106,20 @@ static void kvm_setup_secondary_clock(void)
140 * Now that the first cpu already had this clocksource initialized, 106 * Now that the first cpu already had this clocksource initialized,
141 * we shouldn't fail. 107 * we shouldn't fail.
142 */ 108 */
143 WARN_ON(kvm_register_clock()); 109 WARN_ON(kvm_register_clock("secondary cpu clock"));
144 /* ok, done with our trickery, call native */ 110 /* ok, done with our trickery, call native */
145 setup_secondary_APIC_clock(); 111 setup_secondary_APIC_clock();
146} 112}
147#endif 113#endif
148 114
115#ifdef CONFIG_SMP
116void __init kvm_smp_prepare_boot_cpu(void)
117{
118 WARN_ON(kvm_register_clock("primary cpu clock"));
119 native_smp_prepare_boot_cpu();
120}
121#endif
122
149/* 123/*
150 * After the clock is registered, the host will keep writing to the 124 * After the clock is registered, the host will keep writing to the
151 * registered memory location. If the guest happens to shutdown, this memory 125 * registered memory location. If the guest happens to shutdown, this memory
@@ -174,7 +148,7 @@ void __init kvmclock_init(void)
174 return; 148 return;
175 149
176 if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) { 150 if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) {
177 if (kvm_register_clock()) 151 if (kvm_register_clock("boot clock"))
178 return; 152 return;
179 pv_time_ops.get_wallclock = kvm_get_wallclock; 153 pv_time_ops.get_wallclock = kvm_get_wallclock;
180 pv_time_ops.set_wallclock = kvm_set_wallclock; 154 pv_time_ops.set_wallclock = kvm_set_wallclock;
@@ -182,6 +156,9 @@ void __init kvmclock_init(void)
182#ifdef CONFIG_X86_LOCAL_APIC 156#ifdef CONFIG_X86_LOCAL_APIC
183 pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock; 157 pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock;
184#endif 158#endif
159#ifdef CONFIG_SMP
160 smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
161#endif
185 machine_ops.shutdown = kvm_shutdown; 162 machine_ops.shutdown = kvm_shutdown;
186#ifdef CONFIG_KEXEC 163#ifdef CONFIG_KEXEC
187 machine_ops.crash_shutdown = kvm_crash_shutdown; 164 machine_ops.crash_shutdown = kvm_crash_shutdown;
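
For reference, kvm_register_clock() above hands the guest-physical address of the per-cpu pvclock_vcpu_time_info area to the host through MSR_KVM_SYSTEM_TIME, with the or-ed-in low bit acting as the enable flag. A brief sketch with a made-up address (0x3f4a2000 is purely illustrative, not from the patch):

    __pa(&per_cpu(hv_clock, cpu)) = 0x3f4a2000    /* hypothetical */
    low  = 0x3f4a2000 | 1         = 0x3f4a2001    /* enable bit set */
    high = 0x00000000                             /* area sits below 4 GB */
    native_write_msr_safe(MSR_KVM_SYSTEM_TIME, low, high);
    /* log line produced: "kvm-clock: cpu 0, msr 0:3f4a2001, boot clock" */
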
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
new file mode 100644
index 000000000000..05fbe9a0325a
--- /dev/null
+++ b/arch/x86/kernel/pvclock.c
@@ -0,0 +1,141 @@
1/* paravirtual clock -- common code used by kvm/xen
2
3 This program is free software; you can redistribute it and/or modify
4 it under the terms of the GNU General Public License as published by
5 the Free Software Foundation; either version 2 of the License, or
6 (at your option) any later version.
7
8 This program is distributed in the hope that it will be useful,
9 but WITHOUT ANY WARRANTY; without even the implied warranty of
10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 GNU General Public License for more details.
12
13 You should have received a copy of the GNU General Public License
14 along with this program; if not, write to the Free Software
15 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
16*/
17
18#include <linux/kernel.h>
19#include <linux/percpu.h>
20#include <asm/pvclock.h>
21
22/*
 23 * These are periodically updated
24 * xen: magic shared_info page
25 * kvm: gpa registered via msr
26 * and then copied here.
27 */
28struct pvclock_shadow_time {
29 u64 tsc_timestamp; /* TSC at last update of time vals. */
30 u64 system_timestamp; /* Time, in nanosecs, since boot. */
31 u32 tsc_to_nsec_mul;
32 int tsc_shift;
33 u32 version;
34};
35
36/*
37 * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
38 * yielding a 64-bit result.
39 */
40static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
41{
42 u64 product;
43#ifdef __i386__
44 u32 tmp1, tmp2;
45#endif
46
47 if (shift < 0)
48 delta >>= -shift;
49 else
50 delta <<= shift;
51
52#ifdef __i386__
53 __asm__ (
54 "mul %5 ; "
55 "mov %4,%%eax ; "
56 "mov %%edx,%4 ; "
57 "mul %5 ; "
58 "xor %5,%5 ; "
59 "add %4,%%eax ; "
60 "adc %5,%%edx ; "
61 : "=A" (product), "=r" (tmp1), "=r" (tmp2)
62 : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
63#elif __x86_64__
64 __asm__ (
65 "mul %%rdx ; shrd $32,%%rdx,%%rax"
66 : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
67#else
68#error implement me!
69#endif
70
71 return product;
72}
73
74static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow)
75{
76 u64 delta = native_read_tsc() - shadow->tsc_timestamp;
77 return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
78}
79
80/*
81 * Reads a consistent set of time-base values from hypervisor,
82 * into a shadow data area.
83 */
84static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst,
85 struct pvclock_vcpu_time_info *src)
86{
87 do {
88 dst->version = src->version;
89 rmb(); /* fetch version before data */
90 dst->tsc_timestamp = src->tsc_timestamp;
91 dst->system_timestamp = src->system_time;
92 dst->tsc_to_nsec_mul = src->tsc_to_system_mul;
93 dst->tsc_shift = src->tsc_shift;
94 rmb(); /* test version after fetching data */
95 } while ((src->version & 1) || (dst->version != src->version));
96
97 return dst->version;
98}
99
100cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
101{
102 struct pvclock_shadow_time shadow;
103 unsigned version;
104 cycle_t ret, offset;
105
106 do {
107 version = pvclock_get_time_values(&shadow, src);
108 barrier();
109 offset = pvclock_get_nsec_offset(&shadow);
110 ret = shadow.system_timestamp + offset;
111 barrier();
112 } while (version != src->version);
113
114 return ret;
115}
116
117void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
118 struct pvclock_vcpu_time_info *vcpu_time,
119 struct timespec *ts)
120{
121 u32 version;
122 u64 delta;
123 struct timespec now;
124
125 /* get wallclock at system boot */
126 do {
127 version = wall_clock->version;
128 rmb(); /* fetch version before time */
129 now.tv_sec = wall_clock->sec;
130 now.tv_nsec = wall_clock->nsec;
131 rmb(); /* fetch time before checking version */
132 } while ((wall_clock->version & 1) || (version != wall_clock->version));
133
134 delta = pvclock_clocksource_read(vcpu_time); /* time since system boot */
135 delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec;
136
137 now.tv_nsec = do_div(delta, NSEC_PER_SEC);
138 now.tv_sec = delta;
139
140 set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
141}
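
The heart of the new file is pvclock_clocksource_read(): nanoseconds = system_timestamp + scale_delta(tsc_now - tsc_timestamp, tsc_to_nsec_mul, tsc_shift), re-read until the version field is even and unchanged. Below is a hedged user-space sketch of just the arithmetic, with made-up snapshot values for a 2.4 GHz TSC and a GCC-style 128-bit multiply standing in for the inline assembly; it is illustrative only, not part of the patch:

/* User-space sketch of the pvclock arithmetic (not kernel code).
 * Snapshot values are hypothetical: a 2.4 GHz TSC gives roughly
 * tsc_to_nsec_mul = (1e9 << 32) / 1.2e9 = 3579139413 with tsc_shift = -1,
 * so ns = ((delta >> 1) * mul) >> 32 ~= delta / 2.4. */
#include <stdio.h>
#include <stdint.h>

static uint64_t scale_delta(uint64_t delta, uint32_t mul_frac, int shift)
{
    if (shift < 0)
        delta >>= -shift;
    else
        delta <<= shift;
    /* portable stand-in for the mul/shrd asm: (delta * mul_frac) >> 32 */
    return (uint64_t)(((unsigned __int128)delta * mul_frac) >> 32);
}

int main(void)
{
    uint64_t system_timestamp = 5000000000ULL;  /* ns since boot at last update */
    uint64_t tsc_timestamp    = 12000000000ULL; /* TSC at that update */
    uint64_t tsc_now          = 12002400000ULL; /* 2,400,000 cycles later */
    uint32_t mul              = 3579139413U;
    int      shift            = -1;

    uint64_t ns = system_timestamp +
                  scale_delta(tsc_now - tsc_timestamp, mul, shift);
    printf("pvclock time: %llu ns (~1 ms after the update)\n",
           (unsigned long long)ns);
    return 0;
}
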
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index f2f5d260874e..3829aa7b663f 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -200,9 +200,12 @@ int __pit_timer_fn(struct kvm_kpit_state *ps)
200 200
201 atomic_inc(&pt->pending); 201 atomic_inc(&pt->pending);
202 smp_mb__after_atomic_inc(); 202 smp_mb__after_atomic_inc();
203 if (vcpu0 && waitqueue_active(&vcpu0->wq)) { 203 if (vcpu0) {
204 vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE; 204 set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests);
205 wake_up_interruptible(&vcpu0->wq); 205 if (waitqueue_active(&vcpu0->wq)) {
206 vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE;
207 wake_up_interruptible(&vcpu0->wq);
208 }
206 } 209 }
207 210
208 pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period); 211 pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index c297c50eba63..ebc03f5ae162 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -940,6 +940,7 @@ static int __apic_timer_fn(struct kvm_lapic *apic)
940 wait_queue_head_t *q = &apic->vcpu->wq; 940 wait_queue_head_t *q = &apic->vcpu->wq;
941 941
942 atomic_inc(&apic->timer.pending); 942 atomic_inc(&apic->timer.pending);
943 set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests);
943 if (waitqueue_active(q)) { 944 if (waitqueue_active(q)) {
944 apic->vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 945 apic->vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
945 wake_up_interruptible(q); 946 wake_up_interruptible(q);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index ee3f53098f0c..7e7c3969f7a2 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -640,6 +640,7 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
640 rmap_remove(kvm, spte); 640 rmap_remove(kvm, spte);
641 --kvm->stat.lpages; 641 --kvm->stat.lpages;
642 set_shadow_pte(spte, shadow_trap_nonpresent_pte); 642 set_shadow_pte(spte, shadow_trap_nonpresent_pte);
643 spte = NULL;
643 write_protected = 1; 644 write_protected = 1;
644 } 645 }
645 spte = rmap_next(kvm, rmapp, spte); 646 spte = rmap_next(kvm, rmapp, spte);
@@ -1082,10 +1083,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1082 struct kvm_mmu_page *shadow; 1083 struct kvm_mmu_page *shadow;
1083 1084
1084 spte |= PT_WRITABLE_MASK; 1085 spte |= PT_WRITABLE_MASK;
1085 if (user_fault) {
1086 mmu_unshadow(vcpu->kvm, gfn);
1087 goto unshadowed;
1088 }
1089 1086
1090 shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn); 1087 shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
1091 if (shadow || 1088 if (shadow ||
@@ -1102,8 +1099,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1102 } 1099 }
1103 } 1100 }
1104 1101
1105unshadowed:
1106
1107 if (pte_access & ACC_WRITE_MASK) 1102 if (pte_access & ACC_WRITE_MASK)
1108 mark_page_dirty(vcpu->kvm, gfn); 1103 mark_page_dirty(vcpu->kvm, gfn);
1109 1104
@@ -1580,11 +1575,13 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
1580 u64 *spte, 1575 u64 *spte,
1581 const void *new) 1576 const void *new)
1582{ 1577{
1583 if ((sp->role.level != PT_PAGE_TABLE_LEVEL) 1578 if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
1584 && !vcpu->arch.update_pte.largepage) { 1579 if (!vcpu->arch.update_pte.largepage ||
1585 ++vcpu->kvm->stat.mmu_pde_zapped; 1580 sp->role.glevels == PT32_ROOT_LEVEL) {
1586 return; 1581 ++vcpu->kvm->stat.mmu_pde_zapped;
1587 } 1582 return;
1583 }
1584 }
1588 1585
1589 ++vcpu->kvm->stat.mmu_pte_updated; 1586 ++vcpu->kvm->stat.mmu_pte_updated;
1590 if (sp->role.glevels == PT32_ROOT_LEVEL) 1587 if (sp->role.glevels == PT32_ROOT_LEVEL)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 02efbe75f317..540e95179074 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -566,7 +566,7 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
566 load_transition_efer(vmx); 566 load_transition_efer(vmx);
567} 567}
568 568
569static void vmx_load_host_state(struct vcpu_vmx *vmx) 569static void __vmx_load_host_state(struct vcpu_vmx *vmx)
570{ 570{
571 unsigned long flags; 571 unsigned long flags;
572 572
@@ -596,6 +596,13 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx)
596 reload_host_efer(vmx); 596 reload_host_efer(vmx);
597} 597}
598 598
599static void vmx_load_host_state(struct vcpu_vmx *vmx)
600{
601 preempt_disable();
602 __vmx_load_host_state(vmx);
603 preempt_enable();
604}
605
599/* 606/*
600 * Switches to specified vcpu, until a matching vcpu_put(), but assumes 607 * Switches to specified vcpu, until a matching vcpu_put(), but assumes
601 * vcpu mutex is already taken. 608 * vcpu mutex is already taken.
@@ -654,7 +661,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
654 661
655static void vmx_vcpu_put(struct kvm_vcpu *vcpu) 662static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
656{ 663{
657 vmx_load_host_state(to_vmx(vcpu)); 664 __vmx_load_host_state(to_vmx(vcpu));
658} 665}
659 666
660static void vmx_fpu_activate(struct kvm_vcpu *vcpu) 667static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
@@ -884,11 +891,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
884 switch (msr_index) { 891 switch (msr_index) {
885#ifdef CONFIG_X86_64 892#ifdef CONFIG_X86_64
886 case MSR_EFER: 893 case MSR_EFER:
894 vmx_load_host_state(vmx);
887 ret = kvm_set_msr_common(vcpu, msr_index, data); 895 ret = kvm_set_msr_common(vcpu, msr_index, data);
888 if (vmx->host_state.loaded) {
889 reload_host_efer(vmx);
890 load_transition_efer(vmx);
891 }
892 break; 896 break;
893 case MSR_FS_BASE: 897 case MSR_FS_BASE:
894 vmcs_writel(GUEST_FS_BASE, data); 898 vmcs_writel(GUEST_FS_BASE, data);
@@ -910,11 +914,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
910 guest_write_tsc(data); 914 guest_write_tsc(data);
911 break; 915 break;
912 default: 916 default:
917 vmx_load_host_state(vmx);
913 msr = find_msr_entry(vmx, msr_index); 918 msr = find_msr_entry(vmx, msr_index);
914 if (msr) { 919 if (msr) {
915 msr->data = data; 920 msr->data = data;
916 if (vmx->host_state.loaded)
917 load_msrs(vmx->guest_msrs, vmx->save_nmsrs);
918 break; 921 break;
919 } 922 }
920 ret = kvm_set_msr_common(vcpu, msr_index, data); 923 ret = kvm_set_msr_common(vcpu, msr_index, data);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 00acf1301a15..63a77caa59f1 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -492,8 +492,8 @@ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
492static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) 492static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
493{ 493{
494 static int version; 494 static int version;
495 struct kvm_wall_clock wc; 495 struct pvclock_wall_clock wc;
496 struct timespec wc_ts; 496 struct timespec now, sys, boot;
497 497
498 if (!wall_clock) 498 if (!wall_clock)
499 return; 499 return;
@@ -502,10 +502,19 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
502 502
503 kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); 503 kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
504 504
505 wc_ts = current_kernel_time(); 505 /*
506 wc.wc_sec = wc_ts.tv_sec; 506 * The guest calculates current wall clock time by adding
507 wc.wc_nsec = wc_ts.tv_nsec; 507 * system time (updated by kvm_write_guest_time below) to the
508 wc.wc_version = version; 508 * wall clock specified here. guest system time equals host
509 * system time for us, thus we must fill in host boot time here.
510 */
511 now = current_kernel_time();
512 ktime_get_ts(&sys);
513 boot = ns_to_timespec(timespec_to_ns(&now) - timespec_to_ns(&sys));
514
515 wc.sec = boot.tv_sec;
516 wc.nsec = boot.tv_nsec;
517 wc.version = version;
509 518
510 kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc)); 519 kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
511 520
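
The comment added above is the key subtlety: the guest computes wall time as (value written here) + (its pvclock system time), and since kvm_write_guest_time() below exports the host's monotonic clock as system time, the value written here must be the host's boot wall-clock time, i.e. wall time now minus uptime. A hedged numeric example (all values hypothetical):

    now  (current_kernel_time) = 1213000000.500000000 s
    sys  (ktime_get_ts)        =       3600.250000000 s since host boot
    boot = now - sys           = 1212996400.250000000 s  -> wc.sec / wc.nsec

    guest wall time = boot + guest pvclock system time
                    ~ 1212996400.25 s + 3600.25 s = 1213000000.50 s
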
@@ -513,6 +522,45 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
513 kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); 522 kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
514} 523}
515 524
525static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
526{
527 uint32_t quotient, remainder;
528
529 /* Don't try to replace with do_div(), this one calculates
530 * "(dividend << 32) / divisor" */
531 __asm__ ( "divl %4"
532 : "=a" (quotient), "=d" (remainder)
533 : "0" (0), "1" (dividend), "r" (divisor) );
534 return quotient;
535}
536
537static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock)
538{
539 uint64_t nsecs = 1000000000LL;
540 int32_t shift = 0;
541 uint64_t tps64;
542 uint32_t tps32;
543
544 tps64 = tsc_khz * 1000LL;
545 while (tps64 > nsecs*2) {
546 tps64 >>= 1;
547 shift--;
548 }
549
550 tps32 = (uint32_t)tps64;
551 while (tps32 <= (uint32_t)nsecs) {
552 tps32 <<= 1;
553 shift++;
554 }
555
556 hv_clock->tsc_shift = shift;
557 hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32);
558
559 pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n",
560 __FUNCTION__, tsc_khz, hv_clock->tsc_shift,
561 hv_clock->tsc_to_system_mul);
562}
563
516static void kvm_write_guest_time(struct kvm_vcpu *v) 564static void kvm_write_guest_time(struct kvm_vcpu *v)
517{ 565{
518 struct timespec ts; 566 struct timespec ts;
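
kvm_set_time_scale() picks tsc_shift and tsc_to_system_mul so that the guest-side scale_delta() divides raw TSC ticks down to nanoseconds. A hedged worked example for a hypothetical tsc_khz of 2,400,000 (2.4 GHz; the figure is not from the patch):

    tps64 = 2,400,000 * 1000 = 2.4e9 ticks/s
    2.4e9 > 2 * 1e9  ->  tps64 = 1.2e9, shift = -1   (first loop runs once)
    1.2e9 > 1e9      ->  second loop does not run
    tsc_to_system_mul = div_frac(1e9, 1.2e9) = (1e9 << 32) / 1.2e9 = 3579139413

    guest side: ns = ((delta >> 1) * 3579139413) >> 32 ~= delta / 2.4
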
@@ -523,6 +571,11 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
523 if ((!vcpu->time_page)) 571 if ((!vcpu->time_page))
524 return; 572 return;
525 573
574 if (unlikely(vcpu->hv_clock_tsc_khz != tsc_khz)) {
575 kvm_set_time_scale(tsc_khz, &vcpu->hv_clock);
576 vcpu->hv_clock_tsc_khz = tsc_khz;
577 }
578
526 /* Keep irq disabled to prevent changes to the clock */ 579 /* Keep irq disabled to prevent changes to the clock */
527 local_irq_save(flags); 580 local_irq_save(flags);
528 kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER, 581 kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER,
@@ -537,14 +590,14 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
537 /* 590 /*
538 * The interface expects us to write an even number signaling that the 591 * The interface expects us to write an even number signaling that the
539 * update is finished. Since the guest won't see the intermediate 592 * update is finished. Since the guest won't see the intermediate
540 * state, we just write "2" at the end 593 * state, we just increase by 2 at the end.
541 */ 594 */
542 vcpu->hv_clock.version = 2; 595 vcpu->hv_clock.version += 2;
543 596
544 shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0); 597 shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0);
545 598
546 memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock, 599 memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
547 sizeof(vcpu->hv_clock)); 600 sizeof(vcpu->hv_clock));
548 601
549 kunmap_atomic(shared_kaddr, KM_USER0); 602 kunmap_atomic(shared_kaddr, KM_USER0);
550 603
@@ -599,10 +652,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
599 /* ...but clean it before doing the actual write */ 652 /* ...but clean it before doing the actual write */
600 vcpu->arch.time_offset = data & ~(PAGE_MASK | 1); 653 vcpu->arch.time_offset = data & ~(PAGE_MASK | 1);
601 654
602 vcpu->arch.hv_clock.tsc_to_system_mul =
603 clocksource_khz2mult(tsc_khz, 22);
604 vcpu->arch.hv_clock.tsc_shift = 22;
605
606 down_read(&current->mm->mmap_sem); 655 down_read(&current->mm->mmap_sem);
607 vcpu->arch.time_page = 656 vcpu->arch.time_page =
608 gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT); 657 gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
@@ -2759,6 +2808,8 @@ again:
2759 if (vcpu->requests) { 2808 if (vcpu->requests) {
2760 if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) 2809 if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))
2761 __kvm_migrate_timers(vcpu); 2810 __kvm_migrate_timers(vcpu);
2811 if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
2812 kvm_x86_ops->tlb_flush(vcpu);
2762 if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, 2813 if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
2763 &vcpu->requests)) { 2814 &vcpu->requests)) {
2764 kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS; 2815 kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS;
@@ -2772,6 +2823,7 @@ again:
2772 } 2823 }
2773 } 2824 }
2774 2825
2826 clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
2775 kvm_inject_pending_timer_irqs(vcpu); 2827 kvm_inject_pending_timer_irqs(vcpu);
2776 2828
2777 preempt_disable(); 2829 preempt_disable();
@@ -2781,21 +2833,13 @@ again:
2781 2833
2782 local_irq_disable(); 2834 local_irq_disable();
2783 2835
2784 if (need_resched()) { 2836 if (vcpu->requests || need_resched()) {
2785 local_irq_enable(); 2837 local_irq_enable();
2786 preempt_enable(); 2838 preempt_enable();
2787 r = 1; 2839 r = 1;
2788 goto out; 2840 goto out;
2789 } 2841 }
2790 2842
2791 if (vcpu->requests)
2792 if (test_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) {
2793 local_irq_enable();
2794 preempt_enable();
2795 r = 1;
2796 goto out;
2797 }
2798
2799 if (signal_pending(current)) { 2843 if (signal_pending(current)) {
2800 local_irq_enable(); 2844 local_irq_enable();
2801 preempt_enable(); 2845 preempt_enable();
@@ -2825,9 +2869,6 @@ again:
2825 2869
2826 kvm_guest_enter(); 2870 kvm_guest_enter();
2827 2871
2828 if (vcpu->requests)
2829 if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
2830 kvm_x86_ops->tlb_flush(vcpu);
2831 2872
2832 KVMTRACE_0D(VMENTRY, vcpu, entryexit); 2873 KVMTRACE_0D(VMENTRY, vcpu, entryexit);
2833 kvm_x86_ops->run(vcpu, kvm_run); 2874 kvm_x86_ops->run(vcpu, kvm_run);
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 2e641be2737e..6c388e593bc8 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -5,8 +5,9 @@
5config XEN 5config XEN
6 bool "Xen guest support" 6 bool "Xen guest support"
7 select PARAVIRT 7 select PARAVIRT
8 select PARAVIRT_CLOCK
8 depends on X86_32 9 depends on X86_32
9 depends on X86_CMPXCHG && X86_TSC && !(X86_VISWS || X86_VOYAGER) 10 depends on X86_CMPXCHG && X86_TSC && X86_PAE && !(X86_VISWS || X86_VOYAGER)
10 help 11 help
11 This is the Linux Xen port. Enabling this will allow the 12 This is the Linux Xen port. Enabling this will allow the
12 kernel to boot in a paravirtualized environment under the 13 kernel to boot in a paravirtualized environment under the
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index c048de34d6a1..f09c1c69c37a 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -785,38 +785,35 @@ static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
785static __init void xen_pagetable_setup_start(pgd_t *base) 785static __init void xen_pagetable_setup_start(pgd_t *base)
786{ 786{
787 pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base; 787 pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base;
788 int i;
788 789
789 /* special set_pte for pagetable initialization */ 790 /* special set_pte for pagetable initialization */
790 pv_mmu_ops.set_pte = xen_set_pte_init; 791 pv_mmu_ops.set_pte = xen_set_pte_init;
791 792
792 init_mm.pgd = base; 793 init_mm.pgd = base;
793 /* 794 /*
794 * copy top-level of Xen-supplied pagetable into place. For 795 * copy top-level of Xen-supplied pagetable into place. This
795 * !PAE we can use this as-is, but for PAE it is a stand-in 796 * is a stand-in while we copy the pmd pages.
796 * while we copy the pmd pages.
797 */ 797 */
798 memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t)); 798 memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t));
799 799
800 if (PTRS_PER_PMD > 1) { 800 /*
801 int i; 801 * For PAE, need to allocate new pmds, rather than
802 /* 802 * share Xen's, since Xen doesn't like pmd's being
803 * For PAE, need to allocate new pmds, rather than 803 * shared between address spaces.
804 * share Xen's, since Xen doesn't like pmd's being 804 */
805 * shared between address spaces. 805 for (i = 0; i < PTRS_PER_PGD; i++) {
806 */ 806 if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) {
807 for (i = 0; i < PTRS_PER_PGD; i++) { 807 pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
808 if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) {
809 pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
810 808
811 memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]), 809 memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]),
812 PAGE_SIZE); 810 PAGE_SIZE);
813 811
814 make_lowmem_page_readonly(pmd); 812 make_lowmem_page_readonly(pmd);
815 813
816 set_pgd(&base[i], __pgd(1 + __pa(pmd))); 814 set_pgd(&base[i], __pgd(1 + __pa(pmd)));
817 } else 815 } else
818 pgd_clear(&base[i]); 816 pgd_clear(&base[i]);
819 }
820 } 817 }
821 818
822 /* make sure zero_page is mapped RO so we can use it in pagetables */ 819 /* make sure zero_page is mapped RO so we can use it in pagetables */
@@ -873,17 +870,7 @@ static __init void xen_pagetable_setup_done(pgd_t *base)
873 870
874 /* Actually pin the pagetable down, but we can't set PG_pinned 871 /* Actually pin the pagetable down, but we can't set PG_pinned
875 yet because the page structures don't exist yet. */ 872 yet because the page structures don't exist yet. */
876 { 873 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(base)));
877 unsigned level;
878
879#ifdef CONFIG_X86_PAE
880 level = MMUEXT_PIN_L3_TABLE;
881#else
882 level = MMUEXT_PIN_L2_TABLE;
883#endif
884
885 pin_pagetable_pfn(level, PFN_DOWN(__pa(base)));
886 }
887} 874}
888 875
889/* This is called once we have the cpu_possible_map */ 876/* This is called once we have the cpu_possible_map */
@@ -1093,7 +1080,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1093 .make_pte = xen_make_pte, 1080 .make_pte = xen_make_pte,
1094 .make_pgd = xen_make_pgd, 1081 .make_pgd = xen_make_pgd,
1095 1082
1096#ifdef CONFIG_X86_PAE
1097 .set_pte_atomic = xen_set_pte_atomic, 1083 .set_pte_atomic = xen_set_pte_atomic,
1098 .set_pte_present = xen_set_pte_at, 1084 .set_pte_present = xen_set_pte_at,
1099 .set_pud = xen_set_pud, 1085 .set_pud = xen_set_pud,
@@ -1102,7 +1088,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1102 1088
1103 .make_pmd = xen_make_pmd, 1089 .make_pmd = xen_make_pmd,
1104 .pmd_val = xen_pmd_val, 1090 .pmd_val = xen_pmd_val,
1105#endif /* PAE */
1106 1091
1107 .activate_mm = xen_activate_mm, 1092 .activate_mm = xen_activate_mm,
1108 .dup_mmap = xen_dup_mmap, 1093 .dup_mmap = xen_dup_mmap,
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 265601d5a6ae..df40bf74ea75 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -228,7 +228,7 @@ pmdval_t xen_pmd_val(pmd_t pmd)
228{ 228{
229 return pte_mfn_to_pfn(pmd.pmd); 229 return pte_mfn_to_pfn(pmd.pmd);
230} 230}
231#ifdef CONFIG_X86_PAE 231
232void xen_set_pud(pud_t *ptr, pud_t val) 232void xen_set_pud(pud_t *ptr, pud_t val)
233{ 233{
234 struct multicall_space mcs; 234 struct multicall_space mcs;
@@ -276,12 +276,6 @@ pmd_t xen_make_pmd(pmdval_t pmd)
276 pmd = pte_pfn_to_mfn(pmd); 276 pmd = pte_pfn_to_mfn(pmd);
277 return native_make_pmd(pmd); 277 return native_make_pmd(pmd);
278} 278}
279#else /* !PAE */
280void xen_set_pte(pte_t *ptep, pte_t pte)
281{
282 *ptep = pte;
283}
284#endif /* CONFIG_X86_PAE */
285 279
286/* 280/*
287 (Yet another) pagetable walker. This one is intended for pinning a 281 (Yet another) pagetable walker. This one is intended for pinning a
@@ -434,8 +428,6 @@ static int pin_page(struct page *page, enum pt_level level)
434 read-only, and can be pinned. */ 428 read-only, and can be pinned. */
435void xen_pgd_pin(pgd_t *pgd) 429void xen_pgd_pin(pgd_t *pgd)
436{ 430{
437 unsigned level;
438
439 xen_mc_batch(); 431 xen_mc_batch();
440 432
441 if (pgd_walk(pgd, pin_page, TASK_SIZE)) { 433 if (pgd_walk(pgd, pin_page, TASK_SIZE)) {
@@ -445,14 +437,7 @@ void xen_pgd_pin(pgd_t *pgd)
445 xen_mc_batch(); 437 xen_mc_batch();
446 } 438 }
447 439
448#ifdef CONFIG_X86_PAE 440 xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
449 level = MMUEXT_PIN_L3_TABLE;
450#else
451 level = MMUEXT_PIN_L2_TABLE;
452#endif
453
454 xen_do_pin(level, PFN_DOWN(__pa(pgd)));
455
456 xen_mc_issue(0); 441 xen_mc_issue(0);
457} 442}
458 443
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
index b5e189b1519d..5fe961caffd4 100644
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -37,14 +37,13 @@ void xen_exit_mmap(struct mm_struct *mm);
37void xen_pgd_pin(pgd_t *pgd); 37void xen_pgd_pin(pgd_t *pgd);
38//void xen_pgd_unpin(pgd_t *pgd); 38//void xen_pgd_unpin(pgd_t *pgd);
39 39
40#ifdef CONFIG_X86_PAE 40pteval_t xen_pte_val(pte_t);
41unsigned long long xen_pte_val(pte_t); 41pmdval_t xen_pmd_val(pmd_t);
42unsigned long long xen_pmd_val(pmd_t); 42pgdval_t xen_pgd_val(pgd_t);
43unsigned long long xen_pgd_val(pgd_t);
44 43
45pte_t xen_make_pte(unsigned long long); 44pte_t xen_make_pte(pteval_t);
46pmd_t xen_make_pmd(unsigned long long); 45pmd_t xen_make_pmd(pmdval_t);
47pgd_t xen_make_pgd(unsigned long long); 46pgd_t xen_make_pgd(pgdval_t);
48 47
49void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, 48void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
50 pte_t *ptep, pte_t pteval); 49 pte_t *ptep, pte_t pteval);
@@ -53,15 +52,4 @@ void xen_set_pud(pud_t *ptr, pud_t val);
53void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); 52void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
54void xen_pmd_clear(pmd_t *pmdp); 53void xen_pmd_clear(pmd_t *pmdp);
55 54
56
57#else
58unsigned long xen_pte_val(pte_t);
59unsigned long xen_pmd_val(pmd_t);
60unsigned long xen_pgd_val(pgd_t);
61
62pte_t xen_make_pte(unsigned long);
63pmd_t xen_make_pmd(unsigned long);
64pgd_t xen_make_pgd(unsigned long);
65#endif
66
67#endif /* _XEN_MMU_H */ 55#endif /* _XEN_MMU_H */
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 52b2e3856980..41e217503c96 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -14,6 +14,7 @@
14#include <linux/kernel_stat.h> 14#include <linux/kernel_stat.h>
15#include <linux/math64.h> 15#include <linux/math64.h>
16 16
17#include <asm/pvclock.h>
17#include <asm/xen/hypervisor.h> 18#include <asm/xen/hypervisor.h>
18#include <asm/xen/hypercall.h> 19#include <asm/xen/hypercall.h>
19 20
@@ -31,17 +32,6 @@
31 32
32static cycle_t xen_clocksource_read(void); 33static cycle_t xen_clocksource_read(void);
33 34
34/* These are perodically updated in shared_info, and then copied here. */
35struct shadow_time_info {
36 u64 tsc_timestamp; /* TSC at last update of time vals. */
37 u64 system_timestamp; /* Time, in nanosecs, since boot. */
38 u32 tsc_to_nsec_mul;
39 int tsc_shift;
40 u32 version;
41};
42
43static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
44
45/* runstate info updated by Xen */ 35/* runstate info updated by Xen */
46static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate); 36static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
47 37
@@ -211,7 +201,7 @@ unsigned long long xen_sched_clock(void)
211unsigned long xen_cpu_khz(void) 201unsigned long xen_cpu_khz(void)
212{ 202{
213 u64 xen_khz = 1000000ULL << 32; 203 u64 xen_khz = 1000000ULL << 32;
214 const struct vcpu_time_info *info = 204 const struct pvclock_vcpu_time_info *info =
215 &HYPERVISOR_shared_info->vcpu_info[0].time; 205 &HYPERVISOR_shared_info->vcpu_info[0].time;
216 206
217 do_div(xen_khz, info->tsc_to_system_mul); 207 do_div(xen_khz, info->tsc_to_system_mul);
@@ -223,121 +213,26 @@ unsigned long xen_cpu_khz(void)
223 return xen_khz; 213 return xen_khz;
224} 214}
225 215
226/*
227 * Reads a consistent set of time-base values from Xen, into a shadow data
228 * area.
229 */
230static unsigned get_time_values_from_xen(void)
231{
232 struct vcpu_time_info *src;
233 struct shadow_time_info *dst;
234
235 /* src is shared memory with the hypervisor, so we need to
236 make sure we get a consistent snapshot, even in the face of
237 being preempted. */
238 src = &__get_cpu_var(xen_vcpu)->time;
239 dst = &__get_cpu_var(shadow_time);
240
241 do {
242 dst->version = src->version;
243 rmb(); /* fetch version before data */
244 dst->tsc_timestamp = src->tsc_timestamp;
245 dst->system_timestamp = src->system_time;
246 dst->tsc_to_nsec_mul = src->tsc_to_system_mul;
247 dst->tsc_shift = src->tsc_shift;
248 rmb(); /* test version after fetching data */
249 } while ((src->version & 1) | (dst->version ^ src->version));
250
251 return dst->version;
252}
253
254/*
255 * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
256 * yielding a 64-bit result.
257 */
258static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
259{
260 u64 product;
261#ifdef __i386__
262 u32 tmp1, tmp2;
263#endif
264
265 if (shift < 0)
266 delta >>= -shift;
267 else
268 delta <<= shift;
269
270#ifdef __i386__
271 __asm__ (
272 "mul %5 ; "
273 "mov %4,%%eax ; "
274 "mov %%edx,%4 ; "
275 "mul %5 ; "
276 "xor %5,%5 ; "
277 "add %4,%%eax ; "
278 "adc %5,%%edx ; "
279 : "=A" (product), "=r" (tmp1), "=r" (tmp2)
280 : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
281#elif __x86_64__
282 __asm__ (
283 "mul %%rdx ; shrd $32,%%rdx,%%rax"
284 : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
285#else
286#error implement me!
287#endif
288
289 return product;
290}
291
292static u64 get_nsec_offset(struct shadow_time_info *shadow)
293{
294 u64 now, delta;
295 now = native_read_tsc();
296 delta = now - shadow->tsc_timestamp;
297 return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
298}
299
300static cycle_t xen_clocksource_read(void) 216static cycle_t xen_clocksource_read(void)
301{ 217{
302 struct shadow_time_info *shadow = &get_cpu_var(shadow_time); 218 struct pvclock_vcpu_time_info *src;
303 cycle_t ret; 219 cycle_t ret;
304 unsigned version;
305
306 do {
307 version = get_time_values_from_xen();
308 barrier();
309 ret = shadow->system_timestamp + get_nsec_offset(shadow);
310 barrier();
311 } while (version != __get_cpu_var(xen_vcpu)->time.version);
312
313 put_cpu_var(shadow_time);
314 220
221 src = &get_cpu_var(xen_vcpu)->time;
222 ret = pvclock_clocksource_read(src);
223 put_cpu_var(xen_vcpu);
315 return ret; 224 return ret;
316} 225}
317 226
318static void xen_read_wallclock(struct timespec *ts) 227static void xen_read_wallclock(struct timespec *ts)
319{ 228{
320 const struct shared_info *s = HYPERVISOR_shared_info; 229 struct shared_info *s = HYPERVISOR_shared_info;
321 u32 version; 230 struct pvclock_wall_clock *wall_clock = &(s->wc);
322 u64 delta; 231 struct pvclock_vcpu_time_info *vcpu_time;
323 struct timespec now;
324
325 /* get wallclock at system boot */
326 do {
327 version = s->wc_version;
328 rmb(); /* fetch version before time */
329 now.tv_sec = s->wc_sec;
330 now.tv_nsec = s->wc_nsec;
331 rmb(); /* fetch time before checking version */
332 } while ((s->wc_version & 1) | (version ^ s->wc_version));
333 232
334 delta = xen_clocksource_read(); /* time since system boot */ 233 vcpu_time = &get_cpu_var(xen_vcpu)->time;
335 delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec; 234 pvclock_read_wallclock(wall_clock, vcpu_time, ts);
336 235 put_cpu_var(xen_vcpu);
337 now.tv_nsec = do_div(delta, NSEC_PER_SEC);
338 now.tv_sec = delta;
339
340 set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
341} 236}
342 237
343unsigned long xen_get_wallclock(void) 238unsigned long xen_get_wallclock(void)
@@ -345,7 +240,6 @@ unsigned long xen_get_wallclock(void)
345 struct timespec ts; 240 struct timespec ts;
346 241
347 xen_read_wallclock(&ts); 242 xen_read_wallclock(&ts);
348
349 return ts.tv_sec; 243 return ts.tv_sec;
350} 244}
351 245
@@ -569,8 +463,6 @@ __init void xen_time_init(void)
569{ 463{
570 int cpu = smp_processor_id(); 464 int cpu = smp_processor_id();
571 465
572 get_time_values_from_xen();
573
574 clocksource_register(&xen_clocksource); 466 clocksource_register(&xen_clocksource);
575 467
576 if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) { 468 if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) {
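
xen_cpu_khz() runs the same scaling backwards to recover the TSC frequency from Xen's (tsc_to_system_mul, tsc_shift) pair; the tail of the function is not visible in the hunk above and is assumed here to apply tsc_shift in the opposite direction. Continuing the hypothetical numbers from the kvm_set_time_scale example:

    xen_khz = (1,000,000 << 32) / 3579139413 ~= 1,200,000
    tsc_shift = -1  ->  xen_khz <<= 1  ->  2,400,000 kHz = 2.4 GHz
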
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 3175e973fd0d..6ec3b4f7719b 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -30,11 +30,7 @@ ENTRY(hypercall_page)
30 ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long startup_xen) 30 ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long startup_xen)
31 ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long hypercall_page) 31 ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long hypercall_page)
32 ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb") 32 ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb")
33#ifdef CONFIG_X86_PAE
34 ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes") 33 ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes")
35#else
36 ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "no")
37#endif
38 ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic") 34 ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic")
39 35
40#endif /*CONFIG_XEN */ 36#endif /*CONFIG_XEN */