diff options
author | Zachary Amsden <zamsden@redhat.com> | 2010-08-20 04:07:30 -0400 |
---|---|---|
committer | Avi Kivity <avi@redhat.com> | 2010-10-24 04:51:24 -0400 |
commit | 1d5f066e0b63271b67eac6d3752f8aa96adcbddb (patch) | |
tree | b3ba7c2783edbc6d9e6658d91e991913ba264684 /arch/x86/kvm/x86.c | |
parent | 347bb4448c2155eb2310923ccaa4be5677649003 (diff) |
KVM: x86: Fix a possible backwards warp of kvmclock
Kernel time, which advances in discrete steps, may progress much slower
than TSC. As a result, when kvmclock is adjusted to a new base, the
apparent time to the guest, which runs at a much higher, nsec scaled
rate based on the current TSC, may have already been observed to have
a larger value (kernel_ns + scaled tsc) than the value to which we are
setting it (kernel_ns + 0).
We must instead compute the clock as potentially observed by the guest
for kernel_ns to make sure it does not go backwards.
Signed-off-by: Zachary Amsden <zamsden@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Diffstat (limited to 'arch/x86/kvm/x86.c')
-rw-r--r-- | arch/x86/kvm/x86.c | 44 |
1 files changed, 42 insertions, 2 deletions
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d0764a258047..d4d33f943d99 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c | |||
@@ -55,6 +55,7 @@ | |||
55 | #include <asm/mce.h> | 55 | #include <asm/mce.h> |
56 | #include <asm/i387.h> | 56 | #include <asm/i387.h> |
57 | #include <asm/xcr.h> | 57 | #include <asm/xcr.h> |
58 | #include <asm/pvclock.h> | ||
58 | 59 | ||
59 | #define MAX_IO_MSRS 256 | 60 | #define MAX_IO_MSRS 256 |
60 | #define CR0_RESERVED_BITS \ | 61 | #define CR0_RESERVED_BITS \ |
@@ -976,14 +977,15 @@ static int kvm_write_guest_time(struct kvm_vcpu *v) | |||
976 | struct kvm_vcpu_arch *vcpu = &v->arch; | 977 | struct kvm_vcpu_arch *vcpu = &v->arch; |
977 | void *shared_kaddr; | 978 | void *shared_kaddr; |
978 | unsigned long this_tsc_khz; | 979 | unsigned long this_tsc_khz; |
979 | s64 kernel_ns; | 980 | s64 kernel_ns, max_kernel_ns; |
981 | u64 tsc_timestamp; | ||
980 | 982 | ||
981 | if ((!vcpu->time_page)) | 983 | if ((!vcpu->time_page)) |
982 | return 0; | 984 | return 0; |
983 | 985 | ||
984 | /* Keep irq disabled to prevent changes to the clock */ | 986 | /* Keep irq disabled to prevent changes to the clock */ |
985 | local_irq_save(flags); | 987 | local_irq_save(flags); |
986 | kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp); | 988 | kvm_get_msr(v, MSR_IA32_TSC, &tsc_timestamp); |
987 | kernel_ns = get_kernel_ns(); | 989 | kernel_ns = get_kernel_ns(); |
988 | this_tsc_khz = __get_cpu_var(cpu_tsc_khz); | 990 | this_tsc_khz = __get_cpu_var(cpu_tsc_khz); |
989 | local_irq_restore(flags); | 991 | local_irq_restore(flags); |
@@ -993,13 +995,49 @@ static int kvm_write_guest_time(struct kvm_vcpu *v) | |||
993 | return 1; | 995 | return 1; |
994 | } | 996 | } |
995 | 997 | ||
998 | /* | ||
999 | * Time as measured by the TSC may go backwards when resetting the base | ||
1000 | * tsc_timestamp. The reason for this is that the TSC resolution is | ||
1001 | * higher than the resolution of the other clock scales. Thus, many | ||
1002 | * possible measurements of the TSC correspond to one measurement of any | ||
1003 | * other clock, and so a spread of values is possible. This is not a | ||
1004 | * problem for the computation of the nanosecond clock; with TSC rates | ||
1005 | * around 1 GHz, there can only be a few cycles which correspond to one | ||
1006 | * nanosecond value, and any path through this code will inevitably | ||
1007 | * take longer than that. However, with the kernel_ns value itself, | ||
1008 | * the precision may be much lower, down to HZ granularity. If the | ||
1009 | * first sampling of TSC against kernel_ns ends in the low part of the | ||
1010 | * range, and the second in the high end of the range, we can get: | ||
1011 | * | ||
1012 | * (TSC - offset_low) * S + kns_old > (TSC - offset_high) * S + kns_new | ||
1013 | * | ||
1014 | * As the sampling errors potentially range in the thousands of cycles, | ||
1015 | * it is possible such a time value has already been observed by the | ||
1016 | * guest. To protect against this, we must compute the system time as | ||
1017 | * observed by the guest and ensure the new system time is greater. | ||
1018 | */ | ||
1019 | max_kernel_ns = 0; | ||
1020 | if (vcpu->hv_clock.tsc_timestamp && vcpu->last_guest_tsc) { | ||
1021 | max_kernel_ns = vcpu->last_guest_tsc - | ||
1022 | vcpu->hv_clock.tsc_timestamp; | ||
1023 | max_kernel_ns = pvclock_scale_delta(max_kernel_ns, | ||
1024 | vcpu->hv_clock.tsc_to_system_mul, | ||
1025 | vcpu->hv_clock.tsc_shift); | ||
1026 | max_kernel_ns += vcpu->last_kernel_ns; | ||
1027 | } | ||
1028 | |||
996 | if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) { | 1029 | if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) { |
997 | kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock); | 1030 | kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock); |
998 | vcpu->hw_tsc_khz = this_tsc_khz; | 1031 | vcpu->hw_tsc_khz = this_tsc_khz; |
999 | } | 1032 | } |
1000 | 1033 | ||
1034 | if (max_kernel_ns > kernel_ns) | ||
1035 | kernel_ns = max_kernel_ns; | ||
1036 | |||
1001 | /* With all the info we got, fill in the values */ | 1037 | /* With all the info we got, fill in the values */ |
1038 | vcpu->hv_clock.tsc_timestamp = tsc_timestamp; | ||
1002 | vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; | 1039 | vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; |
1040 | vcpu->last_kernel_ns = kernel_ns; | ||
1003 | vcpu->hv_clock.flags = 0; | 1041 | vcpu->hv_clock.flags = 0; |
1004 | 1042 | ||
1005 | /* | 1043 | /* |
@@ -4931,6 +4969,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) | |||
4931 | if (hw_breakpoint_active()) | 4969 | if (hw_breakpoint_active()) |
4932 | hw_breakpoint_restore(); | 4970 | hw_breakpoint_restore(); |
4933 | 4971 | ||
4972 | kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc); | ||
4973 | |||
4934 | atomic_set(&vcpu->guest_mode, 0); | 4974 | atomic_set(&vcpu->guest_mode, 0); |
4935 | smp_wmb(); | 4975 | smp_wmb(); |
4936 | local_irq_enable(); | 4976 | local_irq_enable(); |