-rw-r--r--  arch/x86/include/asm/kvm_host.h |   7
-rw-r--r--  arch/x86/kvm/trace.h            |  30
-rw-r--r--  arch/x86/kvm/x86.c              | 235
-rw-r--r--  include/linux/kvm_host.h        |   3
-rw-r--r--  virt/kvm/kvm_main.c             |   5
5 files changed, 272 insertions(+), 8 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index d60535adec9..32f0e4a063b 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -22,6 +22,8 @@
 #include <linux/kvm_para.h>
 #include <linux/kvm_types.h>
 #include <linux/perf_event.h>
+#include <linux/pvclock_gtod.h>
+#include <linux/clocksource.h>
 
 #include <asm/pvclock-abi.h>
 #include <asm/desc.h>
@@ -560,6 +562,11 @@ struct kvm_arch {
 	u64 cur_tsc_offset;
 	u8 cur_tsc_generation;
 
+	spinlock_t pvclock_gtod_sync_lock;
+	bool use_master_clock;
+	u64 master_kernel_ns;
+	cycle_t master_cycle_now;
+
 	struct kvm_xen_hvm_config xen_hvm_config;
 
 	/* fields used by HYPER-V emulation */
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index bca63f04dcc..1d652685608 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -4,6 +4,7 @@
 #include <linux/tracepoint.h>
 #include <asm/vmx.h>
 #include <asm/svm.h>
+#include <asm/clocksource.h>
 
 #undef TRACE_SYSTEM
 #define TRACE_SYSTEM kvm
@@ -754,6 +755,35 @@ TRACE_EVENT(
 		__entry->write ? "Write" : "Read",
 		__entry->gpa_match ? "GPA" : "GVA")
 );
+
+#ifdef CONFIG_X86_64
+
+#define host_clocks				\
+	{VCLOCK_NONE, "none"},			\
+	{VCLOCK_TSC, "tsc"},			\
+	{VCLOCK_HPET, "hpet"}			\
+
+TRACE_EVENT(kvm_update_master_clock,
+	TP_PROTO(bool use_master_clock, unsigned int host_clock),
+	TP_ARGS(use_master_clock, host_clock),
+
+	TP_STRUCT__entry(
+		__field(bool, use_master_clock)
+		__field(unsigned int, host_clock)
+	),
+
+	TP_fast_assign(
+		__entry->use_master_clock = use_master_clock;
+		__entry->host_clock = host_clock;
+	),
+
+	TP_printk("masterclock %d hostclock %s",
+		  __entry->use_master_clock,
+		  __print_symbolic(__entry->host_clock, host_clocks))
+);
+
+#endif /* CONFIG_X86_64 */
+
 #endif /* _TRACE_KVM_H */
 
 #undef TRACE_INCLUDE_PATH
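(Note, not part of the patch: given the TP_printk format and the host_clocks symbolic map above, entries from the new tracepoint should render roughly as

    kvm_update_master_clock: masterclock 1 hostclock tsc

i.e. whether the per-VM master clock was enabled, plus the host clocksource mode observed at update time.)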
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c077b817d1c..a7b97a49d8a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1048,7 +1048,9 @@ static inline u64 get_kernel_ns(void)
 	return timespec_to_ns(&ts);
 }
 
+#ifdef CONFIG_X86_64
 static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0);
+#endif
 
 static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
 unsigned long max_tsc_khz;
@@ -1190,21 +1192,170 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
 
 EXPORT_SYMBOL_GPL(kvm_write_tsc);
 
+#ifdef CONFIG_X86_64
+
+static cycle_t read_tsc(void)
+{
+	cycle_t ret;
+	u64 last;
+
+	/*
+	 * Empirically, a fence (of type that depends on the CPU)
+	 * before rdtsc is enough to ensure that rdtsc is ordered
+	 * with respect to loads.  The various CPU manuals are unclear
+	 * as to whether rdtsc can be reordered with later loads,
+	 * but no one has ever seen it happen.
+	 */
+	rdtsc_barrier();
+	ret = (cycle_t)vget_cycles();
+
+	last = pvclock_gtod_data.clock.cycle_last;
+
+	if (likely(ret >= last))
+		return ret;
+
+	/*
+	 * GCC likes to generate cmov here, but this branch is extremely
+	 * predictable (it's just a function of time and the likely is
+	 * very likely) and there's a data dependence, so force GCC
+	 * to generate a branch instead.  I don't barrier() because
+	 * we don't actually need a barrier, and if this function
+	 * ever gets inlined it will generate worse code.
+	 */
+	asm volatile ("");
+	return last;
+}
+
+static inline u64 vgettsc(cycle_t *cycle_now)
+{
+	long v;
+	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
+
+	*cycle_now = read_tsc();
+
+	v = (*cycle_now - gtod->clock.cycle_last) & gtod->clock.mask;
+	return v * gtod->clock.mult;
+}
+
+static int do_monotonic(struct timespec *ts, cycle_t *cycle_now)
+{
+	unsigned long seq;
+	u64 ns;
+	int mode;
+	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
+
+	ts->tv_nsec = 0;
+	do {
+		seq = read_seqcount_begin(&gtod->seq);
+		mode = gtod->clock.vclock_mode;
+		ts->tv_sec = gtod->monotonic_time_sec;
+		ns = gtod->monotonic_time_snsec;
+		ns += vgettsc(cycle_now);
+		ns >>= gtod->clock.shift;
+	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
+	timespec_add_ns(ts, ns);
+
+	return mode;
+}
+
+/* returns true if host is using tsc clocksource */
+static bool kvm_get_time_and_clockread(s64 *kernel_ns, cycle_t *cycle_now)
+{
+	struct timespec ts;
+
+	/* checked again under seqlock below */
+	if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC)
+		return false;
+
+	if (do_monotonic(&ts, cycle_now) != VCLOCK_TSC)
+		return false;
+
+	monotonic_to_bootbased(&ts);
+	*kernel_ns = timespec_to_ns(&ts);
+
+	return true;
+}
+#endif
+
+/*
+ *
+ * Assuming a stable TSC across physical CPUs, the following condition
+ * is possible. Each numbered line represents an event visible to both
+ * CPUs at the next numbered event.
+ *
+ * "timespecX" represents host monotonic time. "tscX" represents
+ * RDTSC value.
+ *
+ *	VCPU0 on CPU0			|	VCPU1 on CPU1
+ *
+ * 1. read timespec0,tsc0
+ * 2.					| timespec1 = timespec0 + N
+ *					| tsc1 = tsc0 + M
+ * 3. transition to guest		| transition to guest
+ * 4. ret0 = timespec0 + (rdtsc - tsc0) |
+ * 5.					| ret1 = timespec1 + (rdtsc - tsc1)
+ *					| ret1 = timespec0 + N + (rdtsc - (tsc0 + M))
+ *
+ * Since ret0 update is visible to VCPU1 at time 5, to obey monotonicity:
+ *
+ *	- ret0 < ret1
+ *	- timespec0 + (rdtsc - tsc0) < timespec0 + N + (rdtsc - (tsc0 + M))
+ *		...
+ *	- 0 < N - M => M < N
+ *
+ * That is, when timespec0 != timespec1, M < N. Unfortunately that is not
+ * always the case (the difference between two distinct xtime instances
+ * might be smaller than the difference between corresponding TSC reads,
+ * when updating guest vcpus pvclock areas).
+ *
+ * To avoid that problem, do not allow visibility of distinct
+ * system_timestamp/tsc_timestamp values simultaneously: use a master
+ * copy of host monotonic time values. Update that master copy
+ * in lockstep.
+ *
+ * Rely on synchronization of host TSCs for monotonicity.
+ *
+ */
+
+static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
+{
+#ifdef CONFIG_X86_64
+	struct kvm_arch *ka = &kvm->arch;
+	int vclock_mode;
+
+	/*
+	 * If the host uses TSC clock, then passthrough TSC as stable
+	 * to the guest.
+	 */
+	ka->use_master_clock = kvm_get_time_and_clockread(
+					&ka->master_kernel_ns,
+					&ka->master_cycle_now);
+
+	if (ka->use_master_clock)
+		atomic_set(&kvm_guest_has_master_clock, 1);
+
+	vclock_mode = pvclock_gtod_data.clock.vclock_mode;
+	trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode);
+#endif
+}
+
 static int kvm_guest_time_update(struct kvm_vcpu *v)
 {
-	unsigned long flags;
+	unsigned long flags, this_tsc_khz;
 	struct kvm_vcpu_arch *vcpu = &v->arch;
+	struct kvm_arch *ka = &v->kvm->arch;
 	void *shared_kaddr;
-	unsigned long this_tsc_khz;
 	s64 kernel_ns, max_kernel_ns;
-	u64 tsc_timestamp;
+	u64 tsc_timestamp, host_tsc;
 	struct pvclock_vcpu_time_info *guest_hv_clock;
 	u8 pvclock_flags;
+	bool use_master_clock;
+
+	kernel_ns = 0;
+	host_tsc = 0;
 
 	/* Keep irq disabled to prevent changes to the clock */
 	local_irq_save(flags);
-	tsc_timestamp = kvm_x86_ops->read_l1_tsc(v, native_read_tsc());
-	kernel_ns = get_kernel_ns();
 	this_tsc_khz = __get_cpu_var(cpu_tsc_khz);
 	if (unlikely(this_tsc_khz == 0)) {
 		local_irq_restore(flags);
@@ -1213,6 +1364,24 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 	}
 
 	/*
+	 * If the host uses TSC clock, then passthrough TSC as stable
+	 * to the guest.
+	 */
+	spin_lock(&ka->pvclock_gtod_sync_lock);
+	use_master_clock = ka->use_master_clock;
+	if (use_master_clock) {
+		host_tsc = ka->master_cycle_now;
+		kernel_ns = ka->master_kernel_ns;
+	}
+	spin_unlock(&ka->pvclock_gtod_sync_lock);
+	if (!use_master_clock) {
+		host_tsc = native_read_tsc();
+		kernel_ns = get_kernel_ns();
+	}
+
+	tsc_timestamp = kvm_x86_ops->read_l1_tsc(v, host_tsc);
+
+	/*
 	 * We may have to catch up the TSC to match elapsed wall clock
 	 * time for two reasons, even if kvmclock is used.
 	 *   1) CPU could have been running below the maximum TSC rate
@@ -1273,9 +1442,14 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 		vcpu->hw_tsc_khz = this_tsc_khz;
 	}
 
-	if (max_kernel_ns > kernel_ns)
-		kernel_ns = max_kernel_ns;
-
+	/* with a master <monotonic time, tsc value> tuple,
+	 * pvclock clock reads always increase at the (scaled) rate
+	 * of guest TSC - no need to deal with sampling errors.
+	 */
+	if (!use_master_clock) {
+		if (max_kernel_ns > kernel_ns)
+			kernel_ns = max_kernel_ns;
+	}
 	/* With all the info we got, fill in the values */
 	vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
 	vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
@@ -1301,6 +1475,10 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 		vcpu->pvclock_set_guest_stopped_request = false;
 	}
 
+	/* If the host uses TSC clocksource, then it is stable */
+	if (use_master_clock)
+		pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;
+
 	vcpu->hv_clock.flags = pvclock_flags;
 
 	memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
@@ -4912,6 +5090,17 @@ static void kvm_set_mmio_spte_mask(void)
 #ifdef CONFIG_X86_64
 static void pvclock_gtod_update_fn(struct work_struct *work)
 {
+	struct kvm *kvm;
+
+	struct kvm_vcpu *vcpu;
+	int i;
+
+	raw_spin_lock(&kvm_lock);
+	list_for_each_entry(kvm, &vm_list, vm_list)
+		kvm_for_each_vcpu(i, vcpu, kvm)
+			set_bit(KVM_REQ_MASTERCLOCK_UPDATE, &vcpu->requests);
+	atomic_set(&kvm_guest_has_master_clock, 0);
+	raw_spin_unlock(&kvm_lock);
 }
 
 static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);
@@ -5303,6 +5492,29 @@ static void process_nmi(struct kvm_vcpu *vcpu)
 	kvm_make_request(KVM_REQ_EVENT, vcpu);
 }
 
+static void kvm_gen_update_masterclock(struct kvm *kvm)
+{
+#ifdef CONFIG_X86_64
+	int i;
+	struct kvm_vcpu *vcpu;
+	struct kvm_arch *ka = &kvm->arch;
+
+	spin_lock(&ka->pvclock_gtod_sync_lock);
+	kvm_make_mclock_inprogress_request(kvm);
+	/* no guest entries from this point */
+	pvclock_update_vm_gtod_copy(kvm);
+
+	kvm_for_each_vcpu(i, vcpu, kvm)
+		set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
+
+	/* guest entries allowed */
+	kvm_for_each_vcpu(i, vcpu, kvm)
+		clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests);
+
+	spin_unlock(&ka->pvclock_gtod_sync_lock);
+#endif
+}
+
 static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 {
 	int r;
@@ -5315,6 +5527,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 			kvm_mmu_unload(vcpu);
 		if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
 			__kvm_migrate_timers(vcpu);
+		if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu))
+			kvm_gen_update_masterclock(vcpu->kvm);
 		if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) {
 			r = kvm_guest_time_update(vcpu);
 			if (unlikely(r))
@@ -6219,6 +6433,8 @@ int kvm_arch_hardware_enable(void *garbage)
 		kvm_for_each_vcpu(i, vcpu, kvm) {
 			vcpu->arch.tsc_offset_adjustment += delta_cyc;
 			vcpu->arch.last_host_tsc = local_tsc;
+			set_bit(KVM_REQ_MASTERCLOCK_UPDATE,
+				&vcpu->requests);
 		}
 
 		/*
@@ -6356,6 +6572,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 
 	raw_spin_lock_init(&kvm->arch.tsc_write_lock);
 	mutex_init(&kvm->arch.apic_map_lock);
+	spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
+
+	pvclock_update_vm_gtod_copy(kvm);
 
 	return 0;
 }
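(Illustrative aside, not part of the patch: plugging numbers into the monotonicity condition from the comment above, and treating M as already scaled to nanoseconds, suppose VCPU1's pvclock area is written N = 1000 ns of wall time after VCPU0's, but with a TSC sample M = 1500 ns further along. Then ret1 = timespec0 + 1000 + (rdtsc - tsc0 - 1500) = ret0 - 500 ns, so a guest thread migrating from VCPU0 to VCPU1 could see time step backwards; keeping a single master <kernel_ns, tsc> pair per VM removes exactly this case.)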
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 99a47627e04..c94c9985dee 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -131,6 +131,8 @@ static inline bool is_error_page(struct page *page)
 #define KVM_REQ_PMU                16
 #define KVM_REQ_PMI                17
 #define KVM_REQ_WATCHDOG           18
+#define KVM_REQ_MASTERCLOCK_UPDATE 19
+#define KVM_REQ_MCLOCK_INPROGRESS  20
 
 #define KVM_USERSPACE_IRQ_SOURCE_ID		0
 #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID	1
@@ -540,6 +542,7 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);
 
 void kvm_flush_remote_tlbs(struct kvm *kvm);
 void kvm_reload_remote_mmus(struct kvm *kvm);
+void kvm_make_mclock_inprogress_request(struct kvm *kvm);
 
 long kvm_arch_dev_ioctl(struct file *filp,
 			unsigned int ioctl, unsigned long arg);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index e3f5b143158..be3e7bb73b1 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -212,6 +212,11 @@ void kvm_reload_remote_mmus(struct kvm *kvm)
 	make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
 }
 
+void kvm_make_mclock_inprogress_request(struct kvm *kvm)
+{
+	make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS);
+}
+
 int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
 {
 	struct page *page;
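For context, a minimal guest-side sketch (not part of the patch; the helper name is invented, but the pvclock ABI fields and pvclock_scale_delta() are the existing ones): PVCLOCK_TSC_STABLE_BIT, set above when the master clock is in use, tells the guest that all vcpus see a single <system_time, tsc_timestamp> pair, so a kvmclock read can rely on plain TSC-delta scaling without the global "never go backwards" fallback. Version/seqcount handling is omitted here:

#include <asm/msr.h>		/* native_read_tsc() */
#include <asm/pvclock.h>	/* pvclock_scale_delta(), pvclock ABI types */

static u64 kvmclock_read_sketch(const struct pvclock_vcpu_time_info *hv)
{
	/* TSC cycles elapsed since the host sampled tsc_timestamp */
	u64 delta = native_read_tsc() - hv->tsc_timestamp;

	/* scale cycles to nanoseconds with the host-provided mult/shift */
	u64 ns = pvclock_scale_delta(delta, hv->tsc_to_system_mul,
				     hv->tsc_shift);

	/* kvmclock reading = host system_time snapshot + scaled delta */
	return hv->system_time + ns;
}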