author     Marcelo Tosatti <mtosatti@redhat.com>	2012-11-27 20:29:01 -0500
committer  Marcelo Tosatti <mtosatti@redhat.com>	2012-11-27 20:29:13 -0500
commit     d828199e84447795c6669ff0e6c6d55eb9beeff6 (patch)
tree       c11fc58c50234ddf06f1c4ca98a4115c8fe8ac2f /arch/x86/kvm
parent     16e8d74d2da9920f874b10a3d979fb25c01f518f (diff)
KVM: x86: implement PVCLOCK_TSC_STABLE_BIT pvclock flag
KVM added a global variable to guarantee monotonicity in the guest.
One of the reasons for that is that the time between

	1. ktime_get_ts(&timespec);
	2. rdtscll(tsc);

is variable. That is, given a host with stable TSC, suppose that
two VCPUs read the same time via ktime_get_ts() above.

The time required to execute 2. is not the same on those two instances
executing in different VCPUs (cache misses, interrupts...).

If the TSC value that is used by the host to interpolate when
calculating the monotonic time is the same value used to calculate
the tsc_timestamp value stored in the pvclock data structure, and
a single <system_timestamp, tsc_timestamp> tuple is visible to all
vcpus simultaneously, this problem disappears. See comment on top
of pvclock_update_vm_gtod_copy for details.

Monotonicity is then guaranteed by synchronicity of the host TSCs
and guest TSCs.

Set TSC stable pvclock flag in that case, allowing the guest to read
clock from userspace.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
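To make the failure mode concrete, here is a minimal userspace sketch (illustrative only, not part of the commit; the values and the assumed 1 cycle == 1 ns scale are invented) of the scenario described in the comment above pvclock_update_vm_gtod_copy: two vcpus are handed distinct <system_timestamp, tsc_timestamp> pairs whose TSC delta M exceeds the monotonic-time delta N, so the later read returns an earlier time.

/* Illustrative only: shows how per-vcpu <system_timestamp, tsc_timestamp>
 * pairs can yield a non-monotonic guest clock when M > N.  Values are
 * invented and a 1 cycle == 1 ns scale is assumed for simplicity.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t timespec0 = 1000, tsc0 = 5000;	/* sample taken for VCPU0 */
	uint64_t N = 2, M = 5;			/* host time advanced by N ns, TSC by M cycles */
	uint64_t timespec1 = timespec0 + N;	/* sample taken later for VCPU1 */
	uint64_t tsc1 = tsc0 + M;

	uint64_t rdtsc = 5100;			/* same guest RDTSC value read on both vcpus */

	uint64_t ret0 = timespec0 + (rdtsc - tsc0);	/* VCPU0's pvclock read */
	uint64_t ret1 = timespec1 + (rdtsc - tsc1);	/* VCPU1's pvclock read */

	/* ret1 = timespec0 + N + (rdtsc - tsc0) - M, so ret1 < ret0 whenever M > N */
	printf("ret0=%llu ret1=%llu -> %s\n",
	       (unsigned long long)ret0, (unsigned long long)ret1,
	       ret1 >= ret0 ? "monotonic" : "NOT monotonic");
	return 0;
}

With a single master copy of the tuple, both vcpus compute against the same (timespec0, tsc0), and on hosts with synchronized TSCs the subtraction can never go backwards.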
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r--	arch/x86/kvm/trace.h	 30
-rw-r--r--	arch/x86/kvm/x86.c	235
2 files changed, 257 insertions(+), 8 deletions(-)
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index bca63f04dccb..1d6526856080 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -4,6 +4,7 @@
 #include <linux/tracepoint.h>
 #include <asm/vmx.h>
 #include <asm/svm.h>
+#include <asm/clocksource.h>
 
 #undef TRACE_SYSTEM
 #define TRACE_SYSTEM kvm
@@ -754,6 +755,35 @@ TRACE_EVENT(
 		__entry->write ? "Write" : "Read",
 		__entry->gpa_match ? "GPA" : "GVA")
 );
+
+#ifdef CONFIG_X86_64
+
+#define host_clocks				\
+	{VCLOCK_NONE, "none"},			\
+	{VCLOCK_TSC,  "tsc"},			\
+	{VCLOCK_HPET, "hpet"}			\
+
+TRACE_EVENT(kvm_update_master_clock,
+	TP_PROTO(bool use_master_clock, unsigned int host_clock),
+	TP_ARGS(use_master_clock, host_clock),
+
+	TP_STRUCT__entry(
+		__field(		bool,	use_master_clock	)
+		__field(	unsigned int,	host_clock		)
+	),
+
+	TP_fast_assign(
+		__entry->use_master_clock	= use_master_clock;
+		__entry->host_clock		= host_clock;
+	),
+
+	TP_printk("masterclock %d hostclock %s",
+		  __entry->use_master_clock,
+		  __print_symbolic(__entry->host_clock, host_clocks))
+);
+
+#endif /* CONFIG_X86_64 */
+
 #endif /* _TRACE_KVM_H */
 
 #undef TRACE_INCLUDE_PATH
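As a usage note, a payload emitted by the new tracepoint would render along these lines (illustrative output assembled from the TP_printk format and the host_clocks mapping above, assuming the host clocksource is tsc and the master clock was enabled):

	kvm_update_master_clock: masterclock 1 hostclock tsc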
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c077b817d1c3..a7b97a49d8ad 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1048,7 +1048,9 @@ static inline u64 get_kernel_ns(void)
 	return timespec_to_ns(&ts);
 }
 
+#ifdef CONFIG_X86_64
 static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0);
+#endif
 
 static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
 unsigned long max_tsc_khz;
@@ -1190,21 +1192,170 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
 
 EXPORT_SYMBOL_GPL(kvm_write_tsc);
 
+#ifdef CONFIG_X86_64
+
+static cycle_t read_tsc(void)
+{
+	cycle_t ret;
+	u64 last;
+
+	/*
+	 * Empirically, a fence (of type that depends on the CPU)
+	 * before rdtsc is enough to ensure that rdtsc is ordered
+	 * with respect to loads. The various CPU manuals are unclear
+	 * as to whether rdtsc can be reordered with later loads,
+	 * but no one has ever seen it happen.
+	 */
+	rdtsc_barrier();
+	ret = (cycle_t)vget_cycles();
+
+	last = pvclock_gtod_data.clock.cycle_last;
+
+	if (likely(ret >= last))
+		return ret;
+
+	/*
+	 * GCC likes to generate cmov here, but this branch is extremely
+	 * predictable (it's just a function of time and the likely is
+	 * very likely) and there's a data dependence, so force GCC
+	 * to generate a branch instead. I don't barrier() because
+	 * we don't actually need a barrier, and if this function
+	 * ever gets inlined it will generate worse code.
+	 */
+	asm volatile ("");
+	return last;
+}
+
+static inline u64 vgettsc(cycle_t *cycle_now)
+{
+	long v;
+	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
+
+	*cycle_now = read_tsc();
+
+	v = (*cycle_now - gtod->clock.cycle_last) & gtod->clock.mask;
+	return v * gtod->clock.mult;
+}
+
+static int do_monotonic(struct timespec *ts, cycle_t *cycle_now)
+{
+	unsigned long seq;
+	u64 ns;
+	int mode;
+	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
+
+	ts->tv_nsec = 0;
+	do {
+		seq = read_seqcount_begin(&gtod->seq);
+		mode = gtod->clock.vclock_mode;
+		ts->tv_sec = gtod->monotonic_time_sec;
+		ns = gtod->monotonic_time_snsec;
+		ns += vgettsc(cycle_now);
+		ns >>= gtod->clock.shift;
+	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
+	timespec_add_ns(ts, ns);
+
+	return mode;
+}
+
+/* returns true if host is using tsc clocksource */
+static bool kvm_get_time_and_clockread(s64 *kernel_ns, cycle_t *cycle_now)
+{
+	struct timespec ts;
+
+	/* checked again under seqlock below */
+	if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC)
+		return false;
+
+	if (do_monotonic(&ts, cycle_now) != VCLOCK_TSC)
+		return false;
+
+	monotonic_to_bootbased(&ts);
+	*kernel_ns = timespec_to_ns(&ts);
+
+	return true;
+}
+#endif
+
+/*
+ *
+ * Assuming a stable TSC across physical CPUs, the following condition
+ * is possible. Each numbered line represents an event visible to both
+ * CPUs at the next numbered event.
+ *
+ * "timespecX" represents host monotonic time. "tscX" represents
+ * RDTSC value.
+ *
+ *		VCPU0 on CPU0			|	VCPU1 on CPU1
+ *
+ * 1. read timespec0,tsc0
+ * 2.						| timespec1 = timespec0 + N
+ *						| tsc1 = tsc0 + M
+ * 3. transition to guest			| transition to guest
+ * 4. ret0 = timespec0 + (rdtsc - tsc0)		|
+ * 5.						| ret1 = timespec1 + (rdtsc - tsc1)
+ *						| ret1 = timespec0 + N + (rdtsc - (tsc0 + M))
+ *
+ * Since ret0 update is visible to VCPU1 at time 5, to obey monotonicity:
+ *
+ *	- ret0 < ret1
+ *	- timespec0 + (rdtsc - tsc0) < timespec0 + N + (rdtsc - (tsc0 + M))
+ *		...
+ *	- 0 < N - M => M < N
+ *
+ * That is, when timespec0 != timespec1, M < N. Unfortunately that is not
+ * always the case (the difference between two distinct xtime instances
+ * might be smaller than the difference between corresponding TSC reads,
+ * when updating guest vcpus pvclock areas).
+ *
+ * To avoid that problem, do not allow visibility of distinct
+ * system_timestamp/tsc_timestamp values simultaneously: use a master
+ * copy of host monotonic time values. Update that master copy
+ * in lockstep.
+ *
+ * Rely on synchronization of host TSCs for monotonicity.
+ *
+ */
+
+static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
+{
+#ifdef CONFIG_X86_64
+	struct kvm_arch *ka = &kvm->arch;
+	int vclock_mode;
+
+	/*
+	 * If the host uses TSC clock, then passthrough TSC as stable
+	 * to the guest.
+	 */
+	ka->use_master_clock = kvm_get_time_and_clockread(
+					&ka->master_kernel_ns,
+					&ka->master_cycle_now);
+
+	if (ka->use_master_clock)
+		atomic_set(&kvm_guest_has_master_clock, 1);
+
+	vclock_mode = pvclock_gtod_data.clock.vclock_mode;
+	trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode);
+#endif
+}
+
 static int kvm_guest_time_update(struct kvm_vcpu *v)
 {
-	unsigned long flags;
+	unsigned long flags, this_tsc_khz;
 	struct kvm_vcpu_arch *vcpu = &v->arch;
+	struct kvm_arch *ka = &v->kvm->arch;
 	void *shared_kaddr;
-	unsigned long this_tsc_khz;
 	s64 kernel_ns, max_kernel_ns;
-	u64 tsc_timestamp;
+	u64 tsc_timestamp, host_tsc;
 	struct pvclock_vcpu_time_info *guest_hv_clock;
 	u8 pvclock_flags;
+	bool use_master_clock;
+
+	kernel_ns = 0;
+	host_tsc = 0;
 
 	/* Keep irq disabled to prevent changes to the clock */
 	local_irq_save(flags);
-	tsc_timestamp = kvm_x86_ops->read_l1_tsc(v, native_read_tsc());
-	kernel_ns = get_kernel_ns();
 	this_tsc_khz = __get_cpu_var(cpu_tsc_khz);
 	if (unlikely(this_tsc_khz == 0)) {
 		local_irq_restore(flags);
@@ -1213,6 +1364,24 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 	}
 
 	/*
+	 * If the host uses TSC clock, then passthrough TSC as stable
+	 * to the guest.
+	 */
+	spin_lock(&ka->pvclock_gtod_sync_lock);
+	use_master_clock = ka->use_master_clock;
+	if (use_master_clock) {
+		host_tsc = ka->master_cycle_now;
+		kernel_ns = ka->master_kernel_ns;
+	}
+	spin_unlock(&ka->pvclock_gtod_sync_lock);
+	if (!use_master_clock) {
+		host_tsc = native_read_tsc();
+		kernel_ns = get_kernel_ns();
+	}
+
+	tsc_timestamp = kvm_x86_ops->read_l1_tsc(v, host_tsc);
+
+	/*
 	 * We may have to catch up the TSC to match elapsed wall clock
 	 * time for two reasons, even if kvmclock is used.
 	 * 1) CPU could have been running below the maximum TSC rate
@@ -1273,9 +1442,14 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 		vcpu->hw_tsc_khz = this_tsc_khz;
 	}
 
-	if (max_kernel_ns > kernel_ns)
-		kernel_ns = max_kernel_ns;
-
+	/* with a master <monotonic time, tsc value> tuple,
+	 * pvclock clock reads always increase at the (scaled) rate
+	 * of guest TSC - no need to deal with sampling errors.
+	 */
+	if (!use_master_clock) {
+		if (max_kernel_ns > kernel_ns)
+			kernel_ns = max_kernel_ns;
+	}
 	/* With all the info we got, fill in the values */
 	vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
 	vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
@@ -1301,6 +1475,10 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 		vcpu->pvclock_set_guest_stopped_request = false;
 	}
 
+	/* If the host uses TSC clocksource, then it is stable */
+	if (use_master_clock)
+		pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;
+
 	vcpu->hv_clock.flags = pvclock_flags;
 
 	memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
@@ -4912,6 +5090,17 @@ static void kvm_set_mmio_spte_mask(void)
 #ifdef CONFIG_X86_64
 static void pvclock_gtod_update_fn(struct work_struct *work)
 {
+	struct kvm *kvm;
+
+	struct kvm_vcpu *vcpu;
+	int i;
+
+	raw_spin_lock(&kvm_lock);
+	list_for_each_entry(kvm, &vm_list, vm_list)
+		kvm_for_each_vcpu(i, vcpu, kvm)
+			set_bit(KVM_REQ_MASTERCLOCK_UPDATE, &vcpu->requests);
+	atomic_set(&kvm_guest_has_master_clock, 0);
+	raw_spin_unlock(&kvm_lock);
 }
 
 static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);
@@ -5303,6 +5492,29 @@ static void process_nmi(struct kvm_vcpu *vcpu)
 	kvm_make_request(KVM_REQ_EVENT, vcpu);
 }
 
+static void kvm_gen_update_masterclock(struct kvm *kvm)
+{
+#ifdef CONFIG_X86_64
+	int i;
+	struct kvm_vcpu *vcpu;
+	struct kvm_arch *ka = &kvm->arch;
+
+	spin_lock(&ka->pvclock_gtod_sync_lock);
+	kvm_make_mclock_inprogress_request(kvm);
+	/* no guest entries from this point */
+	pvclock_update_vm_gtod_copy(kvm);
+
+	kvm_for_each_vcpu(i, vcpu, kvm)
+		set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
+
+	/* guest entries allowed */
+	kvm_for_each_vcpu(i, vcpu, kvm)
+		clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests);
+
+	spin_unlock(&ka->pvclock_gtod_sync_lock);
+#endif
+}
+
 static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 {
 	int r;
@@ -5315,6 +5527,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 			kvm_mmu_unload(vcpu);
 		if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
 			__kvm_migrate_timers(vcpu);
+		if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu))
+			kvm_gen_update_masterclock(vcpu->kvm);
 		if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) {
 			r = kvm_guest_time_update(vcpu);
 			if (unlikely(r))
@@ -6219,6 +6433,8 @@ int kvm_arch_hardware_enable(void *garbage)
 		kvm_for_each_vcpu(i, vcpu, kvm) {
 			vcpu->arch.tsc_offset_adjustment += delta_cyc;
 			vcpu->arch.last_host_tsc = local_tsc;
+			set_bit(KVM_REQ_MASTERCLOCK_UPDATE,
+				&vcpu->requests);
 		}
 
 		/*
@@ -6356,6 +6572,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 
 	raw_spin_lock_init(&kvm->arch.tsc_write_lock);
 	mutex_init(&kvm->arch.apic_map_lock);
+	spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
+
+	pvclock_update_vm_gtod_copy(kvm);
 
 	return 0;
 }
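For context beyond this patch, the guest side consumes PVCLOCK_TSC_STABLE_BIT roughly as sketched below. This is a simplified, self-contained illustration, not guest kernel code: the struct layout follows the pvclock ABI, but rdtsc(), scale_delta(), last_value and the atomics are stand-ins for the real guest primitives. When the bit is set the per-vcpu result can be returned directly; when clear the guest has to serialize against a global last-seen value to stay monotonic across vcpus, which is exactly the cost the master clock machinery above lets it avoid.

/* Simplified sketch (not kernel code) of a guest pvclock read, showing
 * where PVCLOCK_TSC_STABLE_BIT matters.  Struct layout mirrors the
 * pvclock ABI; everything else is an illustrative stand-in.
 */
#include <stdint.h>

#define PVCLOCK_TSC_STABLE_BIT	(1 << 0)

struct pvclock_vcpu_time_info {
	uint32_t version;
	uint32_t pad0;
	uint64_t tsc_timestamp;
	uint64_t system_time;
	uint32_t tsc_to_system_mul;
	int8_t   tsc_shift;
	uint8_t  flags;
	uint8_t  pad[2];
};

static inline uint64_t rdtsc(void)
{
	uint32_t lo, hi;
	__asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
	return ((uint64_t)hi << 32) | lo;
}

/* shift the TSC delta, then apply the 32.32 fixed-point multiplier */
static uint64_t scale_delta(uint64_t delta, uint32_t mul, int8_t shift)
{
	if (shift < 0)
		delta >>= -shift;
	else
		delta <<= shift;
	return (uint64_t)(((unsigned __int128)delta * mul) >> 32);
}

static uint64_t last_value;	/* global fallback when the TSC is not stable */

uint64_t pvclock_read(volatile struct pvclock_vcpu_time_info *src)
{
	uint32_t version;
	uint64_t ns, last;
	uint8_t flags;

	do {
		version = src->version;
		__sync_synchronize();	/* order version read before field reads */
		ns = src->system_time +
		     scale_delta(rdtsc() - src->tsc_timestamp,
				 src->tsc_to_system_mul, src->tsc_shift);
		flags = src->flags;
		__sync_synchronize();	/* order field reads before version re-check */
	} while ((version & 1) || version != src->version);

	/* Host guarantees a single, globally monotonic tuple: return directly. */
	if (flags & PVCLOCK_TSC_STABLE_BIT)
		return ns;

	/* Otherwise enforce monotonicity against a global last-seen value. */
	do {
		last = __atomic_load_n(&last_value, __ATOMIC_RELAXED);
		if (ns < last)
			return last;
	} while (!__atomic_compare_exchange_n(&last_value, &last, ns, 0,
					      __ATOMIC_RELAXED, __ATOMIC_RELAXED));
	return ns;
}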