4 files changed, 43 insertions, 69 deletions
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index 25b1cc07d496..d6b078e9fa28 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -95,7 +95,6 @@ unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src,
 struct pvclock_vsyscall_time_info {
        struct pvclock_vcpu_time_info pvti;
-        u32 migrate_count;
 } __attribute__((__aligned__(SMP_CACHE_BYTES)));
 #define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info)
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index e5ecd20e72dd..2f355d229a58 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -141,46 +141,7 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
        set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
 }
-static struct pvclock_vsyscall_time_info *pvclock_vdso_info;
-static struct pvclock_vsyscall_time_info *
-pvclock_get_vsyscall_user_time_info(int cpu)
-{
-        if (!pvclock_vdso_info) {
-                BUG();
-                return NULL;
-        }
-        return &pvclock_vdso_info[cpu];
-}
-struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu)
-{
-        return &pvclock_get_vsyscall_user_time_info(cpu)->pvti;
-}
 #ifdef CONFIG_X86_64
-static int pvclock_task_migrate(struct notifier_block *nb, unsigned long l,
-                                void *v)
-{
-        struct task_migration_notifier *mn = v;
-        struct pvclock_vsyscall_time_info *pvti;
-        pvti = pvclock_get_vsyscall_user_time_info(mn->from_cpu);
-        /* this is NULL when pvclock vsyscall is not initialized */
-        if (unlikely(pvti == NULL))
-                return NOTIFY_DONE;
-        pvti->migrate_count++;
-        return NOTIFY_DONE;
-}
-static struct notifier_block pvclock_migrate = {
-        .notifier_call = pvclock_task_migrate,
-};
 /*
 * Initialize the generic pvclock vsyscall state.  This will allocate
 * a/some page(s) for the per-vcpu pvclock information, set up a
@@ -194,17 +155,12 @@ int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i,
        WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE);
-        pvclock_vdso_info = i;
        for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) {
                __set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx,
                             __pa(i) + (idx*PAGE_SIZE),
                             PAGE_KERNEL_VVAR);
        }
-        register_task_migration_notifier(&pvclock_migrate);
        return 0;
 }
 #endif
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ed31c31b2485..c73efcd03e29 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1669,12 +1669,28 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
                &guest_hv_clock, sizeof(guest_hv_clock))))
                return 0;
-        /*
+        /* This VCPU is paused, but it's legal for a guest to read another
-         * The interface expects us to write an even number signaling that the
+         * VCPU's kvmclock, so we really have to follow the specification where
-         * update is finished. Since the guest won't see the intermediate
+         * it says that version is odd if data is being modified, and even after
-         * state, we just increase by 2 at the end.
+         * it is consistent.
+         *
+         * Version field updates must be kept separate.  This is because
+         * kvm_write_guest_cached might use a "rep movs" instruction, and
+         * writes within a string instruction are weakly ordered.  So there
+         * are three writes overall.
+         *
+         * As a small optimization, only write the version field in the first
+         * and third write.  The vcpu->pv_time cache is still valid, because the
+         * version field is the first in the struct.
         */
-        vcpu->hv_clock.version = guest_hv_clock.version + 2;
+        BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
+        vcpu->hv_clock.version = guest_hv_clock.version + 1;
+        kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+                                &vcpu->hv_clock,
+                                sizeof(vcpu->hv_clock.version));
+        smp_wmb();
        /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
        pvclock_flags = (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
@@ -1695,6 +1711,13 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
        kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
                                &vcpu->hv_clock,
                                sizeof(vcpu->hv_clock));
+        smp_wmb();
+        vcpu->hv_clock.version++;
+        kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+                                &vcpu->hv_clock,
+                                sizeof(vcpu->hv_clock.version));
        return 0;
 }
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index 40d2473836c9..9793322751e0 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -82,15 +82,18 @@ static notrace cycle_t vread_pvclock(int *mode)
        cycle_t ret;
        u64 last;
        u32 version;
-        u32 migrate_count;
        u8 flags;
        unsigned cpu, cpu1;
        /*
-         * When looping to get a consistent (time-info, tsc) pair, we
+         * Note: the hypervisor must guarantee that:
-         * also need to deal with the possibility we can switch vcpus,
+         * 1. the cpu ID number maps 1:1 to per-CPU pvclock time info.
-         * so make sure we always re-fetch time-info for the current vcpu.
+         * 2. the per-CPU pvclock time info is updated if the
+         *    underlying CPU changes.
+         * 3. the version is increased whenever the underlying CPU
+         *    changes.
+         *
         */
        do {
                cpu = __getcpu() & VGETCPU_CPU_MASK;
@@ -99,27 +102,20 @@ static notrace cycle_t vread_pvclock(int *mode)
                 * __getcpu() calls (Gleb).
                 */
-                /* Make sure migrate_count will change if we leave the VCPU. */
+                pvti = get_pvti(cpu);
-                do {
-                        pvti = get_pvti(cpu);
-                        migrate_count = pvti->migrate_count;
-                        cpu1 = cpu;
-                        cpu = __getcpu() & VGETCPU_CPU_MASK;
-                } while (unlikely(cpu != cpu1));
                version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags);
                /*
                 * Test we're still on the cpu as well as the version.
-                 * - We must read TSC of pvti's VCPU.
+                 * We could have been migrated just after the first
-                 * - KVM doesn't follow the versioning protocol, so data could
+                 * vgetcpu but before fetching the version, so we
-                 *   change before version if we left the VCPU.
+                 * wouldn't notice a version change.
                 */
-                smp_rmb();
+                cpu1 = __getcpu() & VGETCPU_CPU_MASK;
-        } while (unlikely((pvti->pvti.version & 1) ||
+        } while (unlikely(cpu != cpu1 ||
-                          pvti->pvti.version != version ||
+                          (pvti->pvti.version & 1) ||
-                          pvti->migrate_count != migrate_count));
+                          pvti->pvti.version != version));
        if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT)))
                *mode = VCLOCK_NONE;