-rw-r--r--  arch/x86/include/asm/kvm_host.h |   7
-rw-r--r--  arch/x86/kvm/trace.h            |  30
-rw-r--r--  arch/x86/kvm/x86.c              | 235
-rw-r--r--  include/linux/kvm_host.h        |   3
-rw-r--r--  virt/kvm/kvm_main.c             |   5
5 files changed, 272 insertions(+), 8 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index d60535adec9..32f0e4a063b 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -22,6 +22,8 @@
 #include <linux/kvm_para.h>
 #include <linux/kvm_types.h>
 #include <linux/perf_event.h>
+#include <linux/pvclock_gtod.h>
+#include <linux/clocksource.h>
 
 #include <asm/pvclock-abi.h>
 #include <asm/desc.h>
@@ -560,6 +562,11 @@ struct kvm_arch {
 	u64 cur_tsc_offset;
 	u8 cur_tsc_generation;
 
+	spinlock_t pvclock_gtod_sync_lock;
+	bool use_master_clock;
+	u64 master_kernel_ns;
+	cycle_t master_cycle_now;
+
 	struct kvm_xen_hvm_config xen_hvm_config;
 
 	/* fields used by HYPER-V emulation */
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index bca63f04dcc..1d652685608 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -4,6 +4,7 @@
 #include <linux/tracepoint.h>
 #include <asm/vmx.h>
 #include <asm/svm.h>
+#include <asm/clocksource.h>
 
 #undef TRACE_SYSTEM
 #define TRACE_SYSTEM kvm
@@ -754,6 +755,35 @@ TRACE_EVENT(
 		__entry->write ? "Write" : "Read",
 		__entry->gpa_match ? "GPA" : "GVA")
 );
+
+#ifdef CONFIG_X86_64
+
+#define host_clocks				\
+	{VCLOCK_NONE, "none"},			\
+	{VCLOCK_TSC, "tsc"},			\
+	{VCLOCK_HPET, "hpet"}			\
+
+TRACE_EVENT(kvm_update_master_clock,
+	TP_PROTO(bool use_master_clock, unsigned int host_clock),
+	TP_ARGS(use_master_clock, host_clock),
+
+	TP_STRUCT__entry(
+		__field(bool, use_master_clock)
+		__field(unsigned int, host_clock)
+	),
+
+	TP_fast_assign(
+		__entry->use_master_clock = use_master_clock;
+		__entry->host_clock = host_clock;
+	),
+
+	TP_printk("masterclock %d hostclock %s",
+		  __entry->use_master_clock,
+		  __print_symbolic(__entry->host_clock, host_clocks))
+);
+
+#endif /* CONFIG_X86_64 */
+
 #endif /* _TRACE_KVM_H */
 
 #undef TRACE_INCLUDE_PATH
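(Note, not part of the patch: given the TP_printk format and the host_clocks symbolic map above, entries from the new tracepoint should render roughly as

    kvm_update_master_clock: masterclock 1 hostclock tsc

i.e. whether the per-VM master clock was enabled, plus the host clocksource mode observed at update time.)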
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c077b817d1c..a7b97a49d8a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1048,7 +1048,9 @@ static inline u64 get_kernel_ns(void)
 	return timespec_to_ns(&ts);
 }
 
+#ifdef CONFIG_X86_64
 static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0);
+#endif
 
 static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
 unsigned long max_tsc_khz;
@@ -1190,21 +1192,170 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
 
 EXPORT_SYMBOL_GPL(kvm_write_tsc);
 
+#ifdef CONFIG_X86_64
+
+static cycle_t read_tsc(void)
+{
+	cycle_t ret;
+	u64 last;
+
+	/*
+	 * Empirically, a fence (of type that depends on the CPU)
+	 * before rdtsc is enough to ensure that rdtsc is ordered
+	 * with respect to loads.  The various CPU manuals are unclear
+	 * as to whether rdtsc can be reordered with later loads,
+	 * but no one has ever seen it happen.
+	 */
+	rdtsc_barrier();
+	ret = (cycle_t)vget_cycles();
+
+	last = pvclock_gtod_data.clock.cycle_last;
+
+	if (likely(ret >= last))
+		return ret;
+
+	/*
+	 * GCC likes to generate cmov here, but this branch is extremely
+	 * predictable (it's just a function of time and the likely is
+	 * very likely) and there's a data dependence, so force GCC
+	 * to generate a branch instead.  I don't barrier() because
+	 * we don't actually need a barrier, and if this function
+	 * ever gets inlined it will generate worse code.
+	 */
+	asm volatile ("");
+	return last;
+}
+
+static inline u64 vgettsc(cycle_t *cycle_now)
+{
+	long v;
+	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
+
+	*cycle_now = read_tsc();
+
+	v = (*cycle_now - gtod->clock.cycle_last) & gtod->clock.mask;
+	return v * gtod->clock.mult;
+}
+
+static int do_monotonic(struct timespec *ts, cycle_t *cycle_now)
+{
+	unsigned long seq;
+	u64 ns;
+	int mode;
+	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
+
+	ts->tv_nsec = 0;
+	do {
+		seq = read_seqcount_begin(&gtod->seq);
+		mode = gtod->clock.vclock_mode;
+		ts->tv_sec = gtod->monotonic_time_sec;
+		ns = gtod->monotonic_time_snsec;
+		ns += vgettsc(cycle_now);
+		ns >>= gtod->clock.shift;
+	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
+	timespec_add_ns(ts, ns);
+
+	return mode;
+}
+
+/* returns true if host is using tsc clocksource */
+static bool kvm_get_time_and_clockread(s64 *kernel_ns, cycle_t *cycle_now)
+{
+	struct timespec ts;
+
+	/* checked again under seqlock below */
+	if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC)
+		return false;
+
+	if (do_monotonic(&ts, cycle_now) != VCLOCK_TSC)
+		return false;
+
+	monotonic_to_bootbased(&ts);
+	*kernel_ns = timespec_to_ns(&ts);
+
+	return true;
+}
+#endif
+
+/*
+ *
+ * Assuming a stable TSC across physical CPUs, the following condition
+ * is possible. Each numbered line represents an event visible to both
+ * CPUs at the next numbered event.
+ *
+ * "timespecX" represents host monotonic time. "tscX" represents
+ * RDTSC value.
+ *
+ *	VCPU0 on CPU0			|	VCPU1 on CPU1
+ *
+ * 1. read timespec0,tsc0
+ * 2.					| timespec1 = timespec0 + N
+ *					| tsc1 = tsc0 + M
+ * 3. transition to guest		| transition to guest
+ * 4. ret0 = timespec0 + (rdtsc - tsc0) |
+ * 5.					| ret1 = timespec1 + (rdtsc - tsc1)
+ *					| ret1 = timespec0 + N + (rdtsc - (tsc0 + M))
+ *
+ * Since ret0 update is visible to VCPU1 at time 5, to obey monotonicity:
+ *
+ *	- ret0 < ret1
+ *	- timespec0 + (rdtsc - tsc0) < timespec0 + N + (rdtsc - (tsc0 + M))
+ *		...
+ *	- 0 < N - M => M < N
+ *
+ * That is, when timespec0 != timespec1, M < N. Unfortunately that is not
+ * always the case (the difference between two distinct xtime instances
+ * might be smaller than the difference between corresponding TSC reads,
+ * when updating guest vcpus pvclock areas).
+ *
+ * To avoid that problem, do not allow visibility of distinct
+ * system_timestamp/tsc_timestamp values simultaneously: use a master
+ * copy of host monotonic time values. Update that master copy
+ * in lockstep.
+ *
+ * Rely on synchronization of host TSCs for monotonicity.
+ *
+ */
+
+static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
+{
+#ifdef CONFIG_X86_64
+	struct kvm_arch *ka = &kvm->arch;
+	int vclock_mode;
+
+	/*
+	 * If the host uses TSC clock, then passthrough TSC as stable
+	 * to the guest.
+	 */
+	ka->use_master_clock = kvm_get_time_and_clockread(
+					&ka->master_kernel_ns,
+					&ka->master_cycle_now);
+
+	if (ka->use_master_clock)
+		atomic_set(&kvm_guest_has_master_clock, 1);
+
+	vclock_mode = pvclock_gtod_data.clock.vclock_mode;
+	trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode);
+#endif
+}
+
 static int kvm_guest_time_update(struct kvm_vcpu *v)
 {
-	unsigned long flags;
+	unsigned long flags, this_tsc_khz;
 	struct kvm_vcpu_arch *vcpu = &v->arch;
+	struct kvm_arch *ka = &v->kvm->arch;
 	void *shared_kaddr;
-	unsigned long this_tsc_khz;
 	s64 kernel_ns, max_kernel_ns;
-	u64 tsc_timestamp;
+	u64 tsc_timestamp, host_tsc;
 	struct pvclock_vcpu_time_info *guest_hv_clock;
 	u8 pvclock_flags;
+	bool use_master_clock;
+
+	kernel_ns = 0;
+	host_tsc = 0;
 
 	/* Keep irq disabled to prevent changes to the clock */
 	local_irq_save(flags);
-	tsc_timestamp = kvm_x86_ops->read_l1_tsc(v, native_read_tsc());
-	kernel_ns = get_kernel_ns();
 	this_tsc_khz = __get_cpu_var(cpu_tsc_khz);
 	if (unlikely(this_tsc_khz == 0)) {
 		local_irq_restore(flags);
@@ -1213,6 +1364,24 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 	}
 
 	/*
+	 * If the host uses TSC clock, then passthrough TSC as stable
+	 * to the guest.
+	 */
+	spin_lock(&ka->pvclock_gtod_sync_lock);
+	use_master_clock = ka->use_master_clock;
+	if (use_master_clock) {
+		host_tsc = ka->master_cycle_now;
+		kernel_ns = ka->master_kernel_ns;
+	}
+	spin_unlock(&ka->pvclock_gtod_sync_lock);
+	if (!use_master_clock) {
+		host_tsc = native_read_tsc();
+		kernel_ns = get_kernel_ns();
+	}
+
+	tsc_timestamp = kvm_x86_ops->read_l1_tsc(v, host_tsc);
+
+	/*
 	 * We may have to catch up the TSC to match elapsed wall clock
 	 * time for two reasons, even if kvmclock is used.
 	 *   1) CPU could have been running below the maximum TSC rate
@@ -1273,9 +1442,14 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 		vcpu->hw_tsc_khz = this_tsc_khz;
 	}
 
-	if (max_kernel_ns > kernel_ns)
-		kernel_ns = max_kernel_ns;
-
+	/* with a master <monotonic time, tsc value> tuple,
+	 * pvclock clock reads always increase at the (scaled) rate
+	 * of guest TSC - no need to deal with sampling errors.
+	 */
+	if (!use_master_clock) {
+		if (max_kernel_ns > kernel_ns)
+			kernel_ns = max_kernel_ns;
+	}
 	/* With all the info we got, fill in the values */
 	vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
 	vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
@@ -1301,6 +1475,10 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 		vcpu->pvclock_set_guest_stopped_request = false;
 	}
 
+	/* If the host uses TSC clocksource, then it is stable */
+	if (use_master_clock)
+		pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;
+
 	vcpu->hv_clock.flags = pvclock_flags;
 
 	memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
@@ -4912,6 +5090,17 @@ static void kvm_set_mmio_spte_mask(void)
 #ifdef CONFIG_X86_64
 static void pvclock_gtod_update_fn(struct work_struct *work)
 {
+	struct kvm *kvm;
+
+	struct kvm_vcpu *vcpu;
+	int i;
+
+	raw_spin_lock(&kvm_lock);
+	list_for_each_entry(kvm, &vm_list, vm_list)
+		kvm_for_each_vcpu(i, vcpu, kvm)
+			set_bit(KVM_REQ_MASTERCLOCK_UPDATE, &vcpu->requests);
+	atomic_set(&kvm_guest_has_master_clock, 0);
+	raw_spin_unlock(&kvm_lock);
 }
 
 static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);
@@ -5303,6 +5492,29 @@ static void process_nmi(struct kvm_vcpu *vcpu)
 	kvm_make_request(KVM_REQ_EVENT, vcpu);
 }
 
+static void kvm_gen_update_masterclock(struct kvm *kvm)
+{
+#ifdef CONFIG_X86_64
+	int i;
+	struct kvm_vcpu *vcpu;
+	struct kvm_arch *ka = &kvm->arch;
+
+	spin_lock(&ka->pvclock_gtod_sync_lock);
+	kvm_make_mclock_inprogress_request(kvm);
+	/* no guest entries from this point */
+	pvclock_update_vm_gtod_copy(kvm);
+
+	kvm_for_each_vcpu(i, vcpu, kvm)
+		set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
+
+	/* guest entries allowed */
+	kvm_for_each_vcpu(i, vcpu, kvm)
+		clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests);
+
+	spin_unlock(&ka->pvclock_gtod_sync_lock);
+#endif
+}
+
 static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 {
 	int r;
@@ -5315,6 +5527,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 			kvm_mmu_unload(vcpu);
 		if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
 			__kvm_migrate_timers(vcpu);
+		if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu))
+			kvm_gen_update_masterclock(vcpu->kvm);
 		if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) {
 			r = kvm_guest_time_update(vcpu);
 			if (unlikely(r))
@@ -6219,6 +6433,8 @@ int kvm_arch_hardware_enable(void *garbage)
 		kvm_for_each_vcpu(i, vcpu, kvm) {
 			vcpu->arch.tsc_offset_adjustment += delta_cyc;
 			vcpu->arch.last_host_tsc = local_tsc;
+			set_bit(KVM_REQ_MASTERCLOCK_UPDATE,
+				&vcpu->requests);
 		}
 
 		/*
@@ -6356,6 +6572,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 
 	raw_spin_lock_init(&kvm->arch.tsc_write_lock);
 	mutex_init(&kvm->arch.apic_map_lock);
+	spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
+
+	pvclock_update_vm_gtod_copy(kvm);
 
 	return 0;
 }
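(Illustrative aside, not part of the patch: plugging numbers into the monotonicity condition from the comment above, and treating M as already scaled to nanoseconds, suppose VCPU1's pvclock area is written N = 1000 ns of wall time after VCPU0's, but with a TSC sample M = 1500 ns further along. Then ret1 = timespec0 + 1000 + (rdtsc - tsc0 - 1500) = ret0 - 500 ns, so a guest thread migrating from VCPU0 to VCPU1 could see time step backwards; keeping a single master <kernel_ns, tsc> pair per VM removes exactly this case.)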
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 99a47627e04..c94c9985dee 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -131,6 +131,8 @@ static inline bool is_error_page(struct page *page)
 #define KVM_REQ_PMU                16
 #define KVM_REQ_PMI                17
 #define KVM_REQ_WATCHDOG           18
+#define KVM_REQ_MASTERCLOCK_UPDATE 19
+#define KVM_REQ_MCLOCK_INPROGRESS  20
 
 #define KVM_USERSPACE_IRQ_SOURCE_ID		0
 #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID	1
@@ -540,6 +542,7 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);
 
 void kvm_flush_remote_tlbs(struct kvm *kvm);
 void kvm_reload_remote_mmus(struct kvm *kvm);
+void kvm_make_mclock_inprogress_request(struct kvm *kvm);
 
 long kvm_arch_dev_ioctl(struct file *filp,
 			unsigned int ioctl, unsigned long arg);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index e3f5b143158..be3e7bb73b1 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -212,6 +212,11 @@ void kvm_reload_remote_mmus(struct kvm *kvm)
 	make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
 }
 
+void kvm_make_mclock_inprogress_request(struct kvm *kvm)
+{
+	make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS);
+}
+
 int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
 {
 	struct page *page;
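For context, a minimal guest-side sketch (not part of the patch; the helper name is invented, but the pvclock ABI fields and pvclock_scale_delta() are the existing ones): PVCLOCK_TSC_STABLE_BIT, set above when the master clock is in use, tells the guest that all vcpus see a single <system_time, tsc_timestamp> pair, so a kvmclock read can rely on plain TSC-delta scaling without the global "never go backwards" fallback. Version/seqcount handling is omitted here:

#include <asm/msr.h>		/* native_read_tsc() */
#include <asm/pvclock.h>	/* pvclock_scale_delta(), pvclock ABI types */

static u64 kvmclock_read_sketch(const struct pvclock_vcpu_time_info *hv)
{
	/* TSC cycles elapsed since the host sampled tsc_timestamp */
	u64 delta = native_read_tsc() - hv->tsc_timestamp;

	/* scale cycles to nanoseconds with the host-provided mult/shift */
	u64 ns = pvclock_scale_delta(delta, hv->tsc_to_system_mul,
				     hv->tsc_shift);

	/* kvmclock reading = host system_time snapshot + scaled delta */
	return hv->system_time + ns;
}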