Diffstat (limited to 'arch')
-rw-r--r--	arch/x86/include/asm/kvm_host.h	 1
-rw-r--r--	arch/x86/kvm/x86.c	93
2 files changed, 89 insertions(+), 5 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index dd439f13df8..4fbeb84b181 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -423,6 +423,7 @@ struct kvm_vcpu_arch {
 	u64 last_tsc_nsec;
 	u64 last_tsc_write;
 	u64 last_host_tsc;
+	u64 tsc_offset_adjustment;
 	bool tsc_catchup;
 	bool tsc_always_catchup;
 	s8 virtual_tsc_shift;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3b931302fa5..4e9bd23d522 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2252,6 +2252,14 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 	}
 
 	kvm_x86_ops->vcpu_load(vcpu, cpu);
+
+	/* Apply any externally detected TSC adjustments (due to suspend) */
+	if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
+		adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment);
+		vcpu->arch.tsc_offset_adjustment = 0;
+		set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
+	}
+
 	if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) {
 		s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
 			native_read_tsc() - vcpu->arch.last_host_tsc;
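
The hunk above defers the correction: any pending tsc_offset_adjustment is applied once and cleared the next time the vCPU is loaded, and a clock update is requested. A minimal userspace sketch of that deferred-apply pattern (illustrative toy_* names, not the kernel's structures or API):

/* Illustrative only: toy_vcpu stands in for the relevant kvm_vcpu_arch fields. */
#include <stdint.h>
#include <stdio.h>

struct toy_vcpu {
	uint64_t tsc_offset;            /* offset the guest TSC currently runs with */
	uint64_t tsc_offset_adjustment; /* pending correction; may accumulate */
};

/* Models the vcpu_load path above: consume the pending adjustment exactly once. */
static void toy_vcpu_load(struct toy_vcpu *v)
{
	if (v->tsc_offset_adjustment) {
		v->tsc_offset += v->tsc_offset_adjustment;
		v->tsc_offset_adjustment = 0;
		/* the real code also requests KVM_REQ_CLOCK_UPDATE here */
	}
}

int main(void)
{
	struct toy_vcpu v = { .tsc_offset = 1000 };

	v.tsc_offset_adjustment += 500; /* first suspend cycle */
	v.tsc_offset_adjustment += 250; /* second suspend before the vcpu ever ran */
	toy_vcpu_load(&v);
	printf("offset after load: %llu\n", (unsigned long long)v.tsc_offset); /* 1750 */
	return 0;
}

Accumulating into the adjustment field rather than overwriting it is what lets multiple suspend cycles be absorbed before the vCPU runs again.
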
@@ -5964,13 +5972,88 @@ int kvm_arch_hardware_enable(void *garbage)
 	struct kvm *kvm;
 	struct kvm_vcpu *vcpu;
 	int i;
+	int ret;
+	u64 local_tsc;
+	u64 max_tsc = 0;
+	bool stable, backwards_tsc = false;
 
 	kvm_shared_msr_cpu_online();
-	list_for_each_entry(kvm, &vm_list, vm_list)
-		kvm_for_each_vcpu(i, vcpu, kvm)
-			if (vcpu->cpu == smp_processor_id())
-				kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
-	return kvm_x86_ops->hardware_enable(garbage);
+	ret = kvm_x86_ops->hardware_enable(garbage);
+	if (ret != 0)
+		return ret;
+
+	local_tsc = native_read_tsc();
+	stable = !check_tsc_unstable();
+	list_for_each_entry(kvm, &vm_list, vm_list) {
+		kvm_for_each_vcpu(i, vcpu, kvm) {
+			if (!stable && vcpu->cpu == smp_processor_id())
+				set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
+			if (stable && vcpu->arch.last_host_tsc > local_tsc) {
+				backwards_tsc = true;
+				if (vcpu->arch.last_host_tsc > max_tsc)
+					max_tsc = vcpu->arch.last_host_tsc;
+			}
+		}
+	}
+
+	/*
+	 * Sometimes, even reliable TSCs go backwards.  This happens on
+	 * platforms that reset TSC during suspend or hibernate actions, but
+	 * maintain synchronization.  We must compensate.  Fortunately, we can
+	 * detect that condition here, which happens early in CPU bringup,
+	 * before any KVM threads can be running.  Unfortunately, we can't
+	 * bring the TSCs fully up to date with real time, as we aren't yet far
+	 * enough into CPU bringup that we know how much real time has actually
+	 * elapsed; our helper function, get_kernel_ns(), will be using boot
+	 * variables that haven't been updated yet.
+	 *
+	 * So we simply find the maximum observed TSC above, then record the
+	 * adjustment to TSC in each VCPU.  When the VCPU later gets loaded,
+	 * the adjustment will be applied.  Note that we accumulate
+	 * adjustments, in case multiple suspend cycles happen before some VCPU
+	 * gets a chance to run again.  In the event that no KVM threads get a
+	 * chance to run, we will miss the entire elapsed period, as we'll have
+	 * reset last_host_tsc, so VCPUs will not have the TSC adjusted and may
+	 * lose cycle time.  This isn't too big a deal, since the loss will be
+	 * uniform across all VCPUs (not to mention the scenario is extremely
+	 * unlikely).  It is possible that a second hibernate recovery happens
+	 * much faster than a first, causing the observed TSC here to be
+	 * smaller; this would require additional padding adjustment, which is
+	 * why we set last_host_tsc to the local tsc observed here.
+	 *
+	 * N.B. - the code below runs only on platforms with reliable TSC,
+	 * as that is the only way backwards_tsc is set above.  Also note
+	 * that this runs for ALL vcpus, which is not a bug; all VCPUs should
+	 * have the same delta_cyc adjustment applied if backwards_tsc
+	 * is detected.  Note further, this adjustment is only done once,
+	 * as we reset last_host_tsc on all VCPUs to stop this from being
+	 * called multiple times (once for each physical CPU bringup).
+	 *
+	 * Platforms with unreliable TSCs don't have to deal with this; they
+	 * will be compensated by the logic in vcpu_load, which sets the TSC to
+	 * catchup mode.  This will catch all VCPUs up to real time, but cannot
+	 * guarantee that they stay in perfect synchronization.
+	 */
+	if (backwards_tsc) {
+		u64 delta_cyc = max_tsc - local_tsc;
+		list_for_each_entry(kvm, &vm_list, vm_list) {
+			kvm_for_each_vcpu(i, vcpu, kvm) {
+				vcpu->arch.tsc_offset_adjustment += delta_cyc;
+				vcpu->arch.last_host_tsc = local_tsc;
+			}
+
+			/*
+			 * We have to disable TSC offset matching; if you were
+			 * booting a VM while issuing an S4 host suspend,
+			 * you may have some problem.  Solving this issue is
+			 * left as an exercise to the reader.
+			 */
+			kvm->arch.last_tsc_nsec = 0;
+			kvm->arch.last_tsc_write = 0;
+		}
+
+	}
+	return 0;
 }
 
 void kvm_arch_hardware_disable(void *garbage)
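
The new kvm_arch_hardware_enable() logic samples the host TSC once, finds the largest last_host_tsc recorded by any vCPU, and treats a local reading below that maximum as a backwards jump; the difference, delta_cyc, is queued on every vCPU. A self-contained sketch of that arithmetic (plain userspace C with made-up sample values, not the kernel implementation):

/* Illustrative only: toy data modelling the backwards-TSC detection above. */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct toy_vcpu {
	uint64_t last_host_tsc;         /* host TSC sampled when the vcpu was last put */
	uint64_t tsc_offset_adjustment; /* correction applied at the next load */
};

int main(void)
{
	/* TSCs recorded before suspend; the host TSC reset across S4. */
	struct toy_vcpu vcpus[] = {
		{ .last_host_tsc = 900000 },
		{ .last_host_tsc = 950000 },
	};
	uint64_t local_tsc = 12000;     /* freshly read TSC: smaller, so it went backwards */
	uint64_t max_tsc = 0;
	bool backwards_tsc = false;
	size_t i;

	for (i = 0; i < sizeof(vcpus) / sizeof(vcpus[0]); i++) {
		if (vcpus[i].last_host_tsc > local_tsc) {
			backwards_tsc = true;
			if (vcpus[i].last_host_tsc > max_tsc)
				max_tsc = vcpus[i].last_host_tsc;
		}
	}

	if (backwards_tsc) {
		uint64_t delta_cyc = max_tsc - local_tsc;       /* 938000 here */
		for (i = 0; i < sizeof(vcpus) / sizeof(vcpus[0]); i++) {
			vcpus[i].tsc_offset_adjustment += delta_cyc;
			vcpus[i].last_host_tsc = local_tsc;     /* one-shot: don't re-add later */
		}
		printf("delta_cyc = %llu cycles queued for each vcpu\n",
		       (unsigned long long)delta_cyc);
	}
	return 0;
}

Resetting last_host_tsc to local_tsc afterwards is what keeps the same delta from being added again when the remaining physical CPUs are brought back up.
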