aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorTomasz Grabiec <tgrabiec@cloudius-systems.com>2014-06-24 03:42:43 -0400
committerPaolo Bonzini <pbonzini@redhat.com>2014-07-09 12:09:57 -0400
commit0d3da0d26e3c3515997c99451ce3b0ad1a69a36c (patch)
treebf4d58245a974f0cbd2980e00cfd61cbb65f5474
parent6cbc5f5a80a9ae5a80bc81efc574b5a85bfd4a84 (diff)
KVM: x86: fix TSC matching
I've observed kvmclock being marked as unstable on a modern single-socket system with a stable TSC and qemu-1.6.2 or qemu-2.0.0. The culprit was failure in TSC matching because of overflow of kvm_arch::nr_vcpus_matched_tsc in case there were multiple TSC writes in a single synchronization cycle. Turns out that qemu does multiple TSC writes during init, below is the evidence of that (qemu-2.0.0): The first one: 0xffffffffa08ff2b4 : vmx_write_tsc_offset+0xa4/0xb0 [kvm_intel] 0xffffffffa04c9c05 : kvm_write_tsc+0x1a5/0x360 [kvm] 0xffffffffa04cfd6b : kvm_arch_vcpu_postcreate+0x4b/0x80 [kvm] 0xffffffffa04b8188 : kvm_vm_ioctl+0x418/0x750 [kvm] The second one: 0xffffffffa08ff2b4 : vmx_write_tsc_offset+0xa4/0xb0 [kvm_intel] 0xffffffffa04c9c05 : kvm_write_tsc+0x1a5/0x360 [kvm] 0xffffffffa090610d : vmx_set_msr+0x29d/0x350 [kvm_intel] 0xffffffffa04be83b : do_set_msr+0x3b/0x60 [kvm] 0xffffffffa04c10a8 : msr_io+0xc8/0x160 [kvm] 0xffffffffa04caeb6 : kvm_arch_vcpu_ioctl+0xc86/0x1060 [kvm] 0xffffffffa04b6797 : kvm_vcpu_ioctl+0xc7/0x5a0 [kvm] #0 kvm_vcpu_ioctl at /build/buildd/qemu-2.0.0+dfsg/kvm-all.c:1780 #1 kvm_put_msrs at /build/buildd/qemu-2.0.0+dfsg/target-i386/kvm.c:1270 #2 kvm_arch_put_registers at /build/buildd/qemu-2.0.0+dfsg/target-i386/kvm.c:1909 #3 kvm_cpu_synchronize_post_init at /build/buildd/qemu-2.0.0+dfsg/kvm-all.c:1641 #4 cpu_synchronize_post_init at /build/buildd/qemu-2.0.0+dfsg/include/sysemu/kvm.h:330 #5 cpu_synchronize_all_post_init () at /build/buildd/qemu-2.0.0+dfsg/cpus.c:521 #6 main at /build/buildd/qemu-2.0.0+dfsg/vl.c:4390 The third one: 0xffffffffa08ff2b4 : vmx_write_tsc_offset+0xa4/0xb0 [kvm_intel] 0xffffffffa04c9c05 : kvm_write_tsc+0x1a5/0x360 [kvm] 0xffffffffa090610d : vmx_set_msr+0x29d/0x350 [kvm_intel] 0xffffffffa04be83b : do_set_msr+0x3b/0x60 [kvm] 0xffffffffa04c10a8 : msr_io+0xc8/0x160 [kvm] 0xffffffffa04caeb6 : kvm_arch_vcpu_ioctl+0xc86/0x1060 [kvm] 0xffffffffa04b6797 : kvm_vcpu_ioctl+0xc7/0x5a0 [kvm] #0 kvm_vcpu_ioctl at 
/build/buildd/qemu-2.0.0+dfsg/kvm-all.c:1780 #1 kvm_put_msrs at /build/buildd/qemu-2.0.0+dfsg/target-i386/kvm.c:1270 #2 kvm_arch_put_registers at /build/buildd/qemu-2.0.0+dfsg/target-i386/kvm.c:1909 #3 kvm_cpu_synchronize_post_reset at /build/buildd/qemu-2.0.0+dfsg/kvm-all.c:1635 #4 cpu_synchronize_post_reset at /build/buildd/qemu-2.0.0+dfsg/include/sysemu/kvm.h:323 #5 cpu_synchronize_all_post_reset () at /build/buildd/qemu-2.0.0+dfsg/cpus.c:512 #6 main at /build/buildd/qemu-2.0.0+dfsg/vl.c:4482 The fix is to count each vCPU only once when matched, so that nr_vcpus_matched_tsc holds the size of the matched set. This is achieved by reusing generation counters. Every vCPU with this_tsc_generation == cur_tsc_generation is in the matched set. The matched set is cleared by setting cur_tsc_generation to a value which no other vCPU is set to (by incrementing it). I needed to bump up the counter size from u8 to u64 to ensure it never overflows. Otherwise, in case the TSC is not written the same number of times on each vCPU, the counter could overflow and incorrectly indicate some vCPUs as being in the matched set. This scenario seems unlikely but I'm not sure if it can be disregarded. Signed-off-by: Tomasz Grabiec <tgrabiec@cloudius-systems.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
-rw-r--r--arch/x86/include/asm/kvm_host.h4
-rw-r--r--arch/x86/kvm/x86.c11
2 files changed, 9 insertions, 6 deletions
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 63e020be3da7..af36f89fe67a 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -448,7 +448,7 @@ struct kvm_vcpu_arch {
448 u64 tsc_offset_adjustment; 448 u64 tsc_offset_adjustment;
449 u64 this_tsc_nsec; 449 u64 this_tsc_nsec;
450 u64 this_tsc_write; 450 u64 this_tsc_write;
451 u8 this_tsc_generation; 451 u64 this_tsc_generation;
452 bool tsc_catchup; 452 bool tsc_catchup;
453 bool tsc_always_catchup; 453 bool tsc_always_catchup;
454 s8 virtual_tsc_shift; 454 s8 virtual_tsc_shift;
@@ -591,7 +591,7 @@ struct kvm_arch {
591 u64 cur_tsc_nsec; 591 u64 cur_tsc_nsec;
592 u64 cur_tsc_write; 592 u64 cur_tsc_write;
593 u64 cur_tsc_offset; 593 u64 cur_tsc_offset;
594 u8 cur_tsc_generation; 594 u64 cur_tsc_generation;
595 int nr_vcpus_matched_tsc; 595 int nr_vcpus_matched_tsc;
596 596
597 spinlock_t pvclock_gtod_sync_lock; 597 spinlock_t pvclock_gtod_sync_lock;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 5a8691b0ed76..f056f855f8e6 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1215,6 +1215,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
1215 unsigned long flags; 1215 unsigned long flags;
1216 s64 usdiff; 1216 s64 usdiff;
1217 bool matched; 1217 bool matched;
1218 bool already_matched;
1218 u64 data = msr->data; 1219 u64 data = msr->data;
1219 1220
1220 raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); 1221 raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
@@ -1279,6 +1280,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
1279 pr_debug("kvm: adjusted tsc offset by %llu\n", delta); 1280 pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
1280 } 1281 }
1281 matched = true; 1282 matched = true;
1283 already_matched = (vcpu->arch.this_tsc_generation == kvm->arch.cur_tsc_generation);
1282 } else { 1284 } else {
1283 /* 1285 /*
1284 * We split periods of matched TSC writes into generations. 1286 * We split periods of matched TSC writes into generations.
@@ -1294,7 +1296,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
1294 kvm->arch.cur_tsc_write = data; 1296 kvm->arch.cur_tsc_write = data;
1295 kvm->arch.cur_tsc_offset = offset; 1297 kvm->arch.cur_tsc_offset = offset;
1296 matched = false; 1298 matched = false;
1297 pr_debug("kvm: new tsc generation %u, clock %llu\n", 1299 pr_debug("kvm: new tsc generation %llu, clock %llu\n",
1298 kvm->arch.cur_tsc_generation, data); 1300 kvm->arch.cur_tsc_generation, data);
1299 } 1301 }
1300 1302
@@ -1319,10 +1321,11 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
1319 raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); 1321 raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
1320 1322
1321 spin_lock(&kvm->arch.pvclock_gtod_sync_lock); 1323 spin_lock(&kvm->arch.pvclock_gtod_sync_lock);
1322 if (matched) 1324 if (!matched) {
1323 kvm->arch.nr_vcpus_matched_tsc++;
1324 else
1325 kvm->arch.nr_vcpus_matched_tsc = 0; 1325 kvm->arch.nr_vcpus_matched_tsc = 0;
1326 } else if (!already_matched) {
1327 kvm->arch.nr_vcpus_matched_tsc++;
1328 }
1326 1329
1327 kvm_track_tsc_matching(vcpu); 1330 kvm_track_tsc_matching(vcpu);
1328 spin_unlock(&kvm->arch.pvclock_gtod_sync_lock); 1331 spin_unlock(&kvm->arch.pvclock_gtod_sync_lock);