author     Marcelo Tosatti <mtosatti@redhat.com>	2012-11-27 20:29:01 -0500
committer  Marcelo Tosatti <mtosatti@redhat.com>	2012-11-27 20:29:13 -0500
commit     d828199e84447795c6669ff0e6c6d55eb9beeff6 (patch)
tree       c11fc58c50234ddf06f1c4ca98a4115c8fe8ac2f /arch/x86/kvm
parent     16e8d74d2da9920f874b10a3d979fb25c01f518f (diff)
KVM: x86: implement PVCLOCK_TSC_STABLE_BIT pvclock flag
KVM added a global variable to guarantee monotonicity in the guest.
One of the reasons for that is that the time between

	1. ktime_get_ts(&timespec);
	2. rdtscll(tsc);

is variable. That is, given a host with stable TSC, suppose that
two VCPUs read the same time via ktime_get_ts() above.

The time required to execute 2. is not the same on those two instances
executing in different VCPUs (cache misses, interrupts...).

If the TSC value that is used by the host to interpolate when
calculating the monotonic time is the same value used to calculate
the tsc_timestamp value stored in the pvclock data structure, and
a single <system_timestamp, tsc_timestamp> tuple is visible to all
vcpus simultaneously, this problem disappears. See comment on top
of pvclock_update_vm_gtod_copy for details.

Monotonicity is then guaranteed by synchronicity of the host TSCs
and guest TSCs.

Set TSC stable pvclock flag in that case, allowing the guest to read
clock from userspace.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
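To make the failure mode concrete, here is a minimal userspace sketch (illustrative only, not part of the commit; the values and the assumed 1 cycle == 1 ns scale are invented) of the scenario described in the comment above pvclock_update_vm_gtod_copy: two vcpus are handed distinct <system_timestamp, tsc_timestamp> pairs whose TSC delta M exceeds the monotonic-time delta N, so the later read returns an earlier time.

/* Illustrative only: shows how per-vcpu <system_timestamp, tsc_timestamp>
 * pairs can yield a non-monotonic guest clock when M > N.  Values are
 * invented and a 1 cycle == 1 ns scale is assumed for simplicity.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t timespec0 = 1000, tsc0 = 5000;	/* sample taken for VCPU0 */
	uint64_t N = 2, M = 5;			/* host time advanced by N ns, TSC by M cycles */
	uint64_t timespec1 = timespec0 + N;	/* sample taken later for VCPU1 */
	uint64_t tsc1 = tsc0 + M;

	uint64_t rdtsc = 5100;			/* same guest RDTSC value read on both vcpus */

	uint64_t ret0 = timespec0 + (rdtsc - tsc0);	/* VCPU0's pvclock read */
	uint64_t ret1 = timespec1 + (rdtsc - tsc1);	/* VCPU1's pvclock read */

	/* ret1 = timespec0 + N + (rdtsc - tsc0) - M, so ret1 < ret0 whenever M > N */
	printf("ret0=%llu ret1=%llu -> %s\n",
	       (unsigned long long)ret0, (unsigned long long)ret1,
	       ret1 >= ret0 ? "monotonic" : "NOT monotonic");
	return 0;
}

With a single master copy of the tuple, both vcpus compute against the same (timespec0, tsc0), and on hosts with synchronized TSCs the subtraction can never go backwards.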
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r--	arch/x86/kvm/trace.h	 30
-rw-r--r--	arch/x86/kvm/x86.c	235
2 files changed, 257 insertions(+), 8 deletions(-)
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index bca63f04dccb..1d6526856080 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -4,6 +4,7 @@
 #include <linux/tracepoint.h>
 #include <asm/vmx.h>
 #include <asm/svm.h>
+#include <asm/clocksource.h>
 
 #undef TRACE_SYSTEM
 #define TRACE_SYSTEM kvm
@@ -754,6 +755,35 @@ TRACE_EVENT(
 		__entry->write ? "Write" : "Read",
 		__entry->gpa_match ? "GPA" : "GVA")
 );
+
+#ifdef CONFIG_X86_64
+
+#define host_clocks				\
+	{VCLOCK_NONE, "none"},			\
+	{VCLOCK_TSC,  "tsc"},			\
+	{VCLOCK_HPET, "hpet"}			\
+
+TRACE_EVENT(kvm_update_master_clock,
+	TP_PROTO(bool use_master_clock, unsigned int host_clock),
+	TP_ARGS(use_master_clock, host_clock),
+
+	TP_STRUCT__entry(
+		__field(		bool,	use_master_clock	)
+		__field(	unsigned int,	host_clock		)
+	),
+
+	TP_fast_assign(
+		__entry->use_master_clock	= use_master_clock;
+		__entry->host_clock		= host_clock;
+	),
+
+	TP_printk("masterclock %d hostclock %s",
+		  __entry->use_master_clock,
+		  __print_symbolic(__entry->host_clock, host_clocks))
+);
+
+#endif /* CONFIG_X86_64 */
+
 #endif /* _TRACE_KVM_H */
 
 #undef TRACE_INCLUDE_PATH
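As a usage note, a payload emitted by the new tracepoint would render along these lines (illustrative output assembled from the TP_printk format and the host_clocks mapping above, assuming the host clocksource is tsc and the master clock was enabled):

	kvm_update_master_clock: masterclock 1 hostclock tsc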
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c077b817d1c3..a7b97a49d8ad 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1048,7 +1048,9 @@ static inline u64 get_kernel_ns(void)
 	return timespec_to_ns(&ts);
 }
 
+#ifdef CONFIG_X86_64
 static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0);
+#endif
 
 static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
 unsigned long max_tsc_khz;
@@ -1190,21 +1192,170 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
 
 EXPORT_SYMBOL_GPL(kvm_write_tsc);
 
+#ifdef CONFIG_X86_64
+
+static cycle_t read_tsc(void)
+{
+	cycle_t ret;
+	u64 last;
+
+	/*
+	 * Empirically, a fence (of type that depends on the CPU)
+	 * before rdtsc is enough to ensure that rdtsc is ordered
+	 * with respect to loads. The various CPU manuals are unclear
+	 * as to whether rdtsc can be reordered with later loads,
+	 * but no one has ever seen it happen.
+	 */
+	rdtsc_barrier();
+	ret = (cycle_t)vget_cycles();
+
+	last = pvclock_gtod_data.clock.cycle_last;
+
+	if (likely(ret >= last))
+		return ret;
+
+	/*
+	 * GCC likes to generate cmov here, but this branch is extremely
+	 * predictable (it's just a function of time and the likely is
+	 * very likely) and there's a data dependence, so force GCC
+	 * to generate a branch instead. I don't barrier() because
+	 * we don't actually need a barrier, and if this function
+	 * ever gets inlined it will generate worse code.
+	 */
+	asm volatile ("");
+	return last;
+}
+
+static inline u64 vgettsc(cycle_t *cycle_now)
+{
+	long v;
+	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
+
+	*cycle_now = read_tsc();
+
+	v = (*cycle_now - gtod->clock.cycle_last) & gtod->clock.mask;
+	return v * gtod->clock.mult;
+}
+
+static int do_monotonic(struct timespec *ts, cycle_t *cycle_now)
+{
+	unsigned long seq;
+	u64 ns;
+	int mode;
+	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
+
+	ts->tv_nsec = 0;
+	do {
+		seq = read_seqcount_begin(&gtod->seq);
+		mode = gtod->clock.vclock_mode;
+		ts->tv_sec = gtod->monotonic_time_sec;
+		ns = gtod->monotonic_time_snsec;
+		ns += vgettsc(cycle_now);
+		ns >>= gtod->clock.shift;
+	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
+	timespec_add_ns(ts, ns);
+
+	return mode;
+}
+
+/* returns true if host is using tsc clocksource */
+static bool kvm_get_time_and_clockread(s64 *kernel_ns, cycle_t *cycle_now)
+{
+	struct timespec ts;
+
+	/* checked again under seqlock below */
+	if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC)
+		return false;
+
+	if (do_monotonic(&ts, cycle_now) != VCLOCK_TSC)
+		return false;
+
+	monotonic_to_bootbased(&ts);
+	*kernel_ns = timespec_to_ns(&ts);
+
+	return true;
+}
+#endif
+
+/*
+ *
+ * Assuming a stable TSC across physical CPUs, the following condition
+ * is possible. Each numbered line represents an event visible to both
+ * CPUs at the next numbered event.
+ *
+ * "timespecX" represents host monotonic time. "tscX" represents
+ * RDTSC value.
+ *
+ *		VCPU0 on CPU0			|	VCPU1 on CPU1
+ *
+ * 1. read timespec0,tsc0
+ * 2.						| timespec1 = timespec0 + N
+ *						| tsc1 = tsc0 + M
+ * 3. transition to guest			| transition to guest
+ * 4. ret0 = timespec0 + (rdtsc - tsc0)		|
+ * 5.						| ret1 = timespec1 + (rdtsc - tsc1)
+ *						| ret1 = timespec0 + N + (rdtsc - (tsc0 + M))
+ *
+ * Since ret0 update is visible to VCPU1 at time 5, to obey monotonicity:
+ *
+ *	- ret0 < ret1
+ *	- timespec0 + (rdtsc - tsc0) < timespec0 + N + (rdtsc - (tsc0 + M))
+ *		...
+ *	- 0 < N - M => M < N
+ *
+ * That is, when timespec0 != timespec1, M < N. Unfortunately that is not
+ * always the case (the difference between two distinct xtime instances
+ * might be smaller than the difference between corresponding TSC reads,
+ * when updating guest vcpus pvclock areas).
+ *
+ * To avoid that problem, do not allow visibility of distinct
+ * system_timestamp/tsc_timestamp values simultaneously: use a master
+ * copy of host monotonic time values. Update that master copy
+ * in lockstep.
+ *
+ * Rely on synchronization of host TSCs for monotonicity.
+ *
+ */
+
+static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
+{
+#ifdef CONFIG_X86_64
+	struct kvm_arch *ka = &kvm->arch;
+	int vclock_mode;
+
+	/*
+	 * If the host uses TSC clock, then passthrough TSC as stable
+	 * to the guest.
+	 */
+	ka->use_master_clock = kvm_get_time_and_clockread(
+					&ka->master_kernel_ns,
+					&ka->master_cycle_now);
+
+	if (ka->use_master_clock)
+		atomic_set(&kvm_guest_has_master_clock, 1);
+
+	vclock_mode = pvclock_gtod_data.clock.vclock_mode;
+	trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode);
+#endif
+}
+
 static int kvm_guest_time_update(struct kvm_vcpu *v)
 {
-	unsigned long flags;
+	unsigned long flags, this_tsc_khz;
 	struct kvm_vcpu_arch *vcpu = &v->arch;
+	struct kvm_arch *ka = &v->kvm->arch;
 	void *shared_kaddr;
-	unsigned long this_tsc_khz;
 	s64 kernel_ns, max_kernel_ns;
-	u64 tsc_timestamp;
+	u64 tsc_timestamp, host_tsc;
 	struct pvclock_vcpu_time_info *guest_hv_clock;
 	u8 pvclock_flags;
+	bool use_master_clock;
+
+	kernel_ns = 0;
+	host_tsc = 0;
 
 	/* Keep irq disabled to prevent changes to the clock */
 	local_irq_save(flags);
-	tsc_timestamp = kvm_x86_ops->read_l1_tsc(v, native_read_tsc());
-	kernel_ns = get_kernel_ns();
 	this_tsc_khz = __get_cpu_var(cpu_tsc_khz);
 	if (unlikely(this_tsc_khz == 0)) {
 		local_irq_restore(flags);
@@ -1213,6 +1364,24 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 	}
 
 	/*
+	 * If the host uses TSC clock, then passthrough TSC as stable
+	 * to the guest.
+	 */
+	spin_lock(&ka->pvclock_gtod_sync_lock);
+	use_master_clock = ka->use_master_clock;
+	if (use_master_clock) {
+		host_tsc = ka->master_cycle_now;
+		kernel_ns = ka->master_kernel_ns;
+	}
+	spin_unlock(&ka->pvclock_gtod_sync_lock);
+	if (!use_master_clock) {
+		host_tsc = native_read_tsc();
+		kernel_ns = get_kernel_ns();
+	}
+
+	tsc_timestamp = kvm_x86_ops->read_l1_tsc(v, host_tsc);
+
+	/*
 	 * We may have to catch up the TSC to match elapsed wall clock
 	 * time for two reasons, even if kvmclock is used.
 	 * 1) CPU could have been running below the maximum TSC rate
@@ -1273,9 +1442,14 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 		vcpu->hw_tsc_khz = this_tsc_khz;
 	}
 
-	if (max_kernel_ns > kernel_ns)
-		kernel_ns = max_kernel_ns;
-
+	/* with a master <monotonic time, tsc value> tuple,
+	 * pvclock clock reads always increase at the (scaled) rate
+	 * of guest TSC - no need to deal with sampling errors.
+	 */
+	if (!use_master_clock) {
+		if (max_kernel_ns > kernel_ns)
+			kernel_ns = max_kernel_ns;
+	}
 	/* With all the info we got, fill in the values */
 	vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
 	vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
@@ -1301,6 +1475,10 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 		vcpu->pvclock_set_guest_stopped_request = false;
 	}
 
+	/* If the host uses TSC clocksource, then it is stable */
+	if (use_master_clock)
+		pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;
+
 	vcpu->hv_clock.flags = pvclock_flags;
 
 	memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
@@ -4912,6 +5090,17 @@ static void kvm_set_mmio_spte_mask(void)
 #ifdef CONFIG_X86_64
 static void pvclock_gtod_update_fn(struct work_struct *work)
 {
+	struct kvm *kvm;
+
+	struct kvm_vcpu *vcpu;
+	int i;
+
+	raw_spin_lock(&kvm_lock);
+	list_for_each_entry(kvm, &vm_list, vm_list)
+		kvm_for_each_vcpu(i, vcpu, kvm)
+			set_bit(KVM_REQ_MASTERCLOCK_UPDATE, &vcpu->requests);
+	atomic_set(&kvm_guest_has_master_clock, 0);
+	raw_spin_unlock(&kvm_lock);
 }
 
 static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);
@@ -5303,6 +5492,29 @@ static void process_nmi(struct kvm_vcpu *vcpu)
 	kvm_make_request(KVM_REQ_EVENT, vcpu);
 }
 
+static void kvm_gen_update_masterclock(struct kvm *kvm)
+{
+#ifdef CONFIG_X86_64
+	int i;
+	struct kvm_vcpu *vcpu;
+	struct kvm_arch *ka = &kvm->arch;
+
+	spin_lock(&ka->pvclock_gtod_sync_lock);
+	kvm_make_mclock_inprogress_request(kvm);
+	/* no guest entries from this point */
+	pvclock_update_vm_gtod_copy(kvm);
+
+	kvm_for_each_vcpu(i, vcpu, kvm)
+		set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
+
+	/* guest entries allowed */
+	kvm_for_each_vcpu(i, vcpu, kvm)
+		clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests);
+
+	spin_unlock(&ka->pvclock_gtod_sync_lock);
+#endif
+}
+
 static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 {
 	int r;
@@ -5315,6 +5527,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 			kvm_mmu_unload(vcpu);
 		if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
 			__kvm_migrate_timers(vcpu);
+		if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu))
+			kvm_gen_update_masterclock(vcpu->kvm);
 		if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) {
 			r = kvm_guest_time_update(vcpu);
 			if (unlikely(r))
@@ -6219,6 +6433,8 @@ int kvm_arch_hardware_enable(void *garbage)
 		kvm_for_each_vcpu(i, vcpu, kvm) {
 			vcpu->arch.tsc_offset_adjustment += delta_cyc;
 			vcpu->arch.last_host_tsc = local_tsc;
+			set_bit(KVM_REQ_MASTERCLOCK_UPDATE,
+				&vcpu->requests);
 		}
 
 		/*
@@ -6356,6 +6572,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 
 	raw_spin_lock_init(&kvm->arch.tsc_write_lock);
 	mutex_init(&kvm->arch.apic_map_lock);
+	spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
+
+	pvclock_update_vm_gtod_copy(kvm);
 
 	return 0;
 }
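For context beyond this patch, the guest side consumes PVCLOCK_TSC_STABLE_BIT roughly as sketched below. This is a simplified, self-contained illustration, not guest kernel code: the struct layout follows the pvclock ABI, but rdtsc(), scale_delta(), last_value and the atomics are stand-ins for the real guest primitives. When the bit is set the per-vcpu result can be returned directly; when clear the guest has to serialize against a global last-seen value to stay monotonic across vcpus, which is exactly the cost the master clock machinery above lets it avoid.

/* Simplified sketch (not kernel code) of a guest pvclock read, showing
 * where PVCLOCK_TSC_STABLE_BIT matters.  Struct layout mirrors the
 * pvclock ABI; everything else is an illustrative stand-in.
 */
#include <stdint.h>

#define PVCLOCK_TSC_STABLE_BIT	(1 << 0)

struct pvclock_vcpu_time_info {
	uint32_t version;
	uint32_t pad0;
	uint64_t tsc_timestamp;
	uint64_t system_time;
	uint32_t tsc_to_system_mul;
	int8_t   tsc_shift;
	uint8_t  flags;
	uint8_t  pad[2];
};

static inline uint64_t rdtsc(void)
{
	uint32_t lo, hi;
	__asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
	return ((uint64_t)hi << 32) | lo;
}

/* shift the TSC delta, then apply the 32.32 fixed-point multiplier */
static uint64_t scale_delta(uint64_t delta, uint32_t mul, int8_t shift)
{
	if (shift < 0)
		delta >>= -shift;
	else
		delta <<= shift;
	return (uint64_t)(((unsigned __int128)delta * mul) >> 32);
}

static uint64_t last_value;	/* global fallback when the TSC is not stable */

uint64_t pvclock_read(volatile struct pvclock_vcpu_time_info *src)
{
	uint32_t version;
	uint64_t ns, last;
	uint8_t flags;

	do {
		version = src->version;
		__sync_synchronize();	/* order version read before field reads */
		ns = src->system_time +
		     scale_delta(rdtsc() - src->tsc_timestamp,
				 src->tsc_to_system_mul, src->tsc_shift);
		flags = src->flags;
		__sync_synchronize();	/* order field reads before version re-check */
	} while ((version & 1) || version != src->version);

	/* Host guarantees a single, globally monotonic tuple: return directly. */
	if (flags & PVCLOCK_TSC_STABLE_BIT)
		return ns;

	/* Otherwise enforce monotonicity against a global last-seen value. */
	do {
		last = __atomic_load_n(&last_value, __ATOMIC_RELAXED);
		if (ns < last)
			return last;
	} while (!__atomic_compare_exchange_n(&last_value, &last, ns, 0,
					      __ATOMIC_RELAXED, __ATOMIC_RELAXED));
	return ns;
}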