Diffstat (limited to 'arch/x86/kvm/x86.c')
-rw-r--r--  arch/x86/kvm/x86.c  780
1 file changed, 598 insertions(+), 182 deletions(-)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6c2ecf0a806d..2288ad829b32 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6,7 +6,7 @@
  * Copyright (C) 2006 Qumranet, Inc.
  * Copyright (C) 2008 Qumranet, Inc.
  * Copyright IBM Corporation, 2008
- * Copyright 2010 Red Hat, Inc. and/or its affilates.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
  *
  * Authors:
  *   Avi Kivity   <avi@qumranet.com>
@@ -55,6 +55,8 @@
 #include <asm/mce.h>
 #include <asm/i387.h>
 #include <asm/xcr.h>
+#include <asm/pvclock.h>
+#include <asm/div64.h>
 
 #define MAX_IO_MSRS 256
 #define CR0_RESERVED_BITS \
@@ -71,7 +73,7 @@
 #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
 
 #define KVM_MAX_MCE_BANKS 32
-#define KVM_MCE_CAP_SUPPORTED MCG_CTL_P
+#define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P)
 
 /* EFER defaults:
  * - enable syscall per default because its emulated by KVM
@@ -282,6 +284,8 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
 	u32 prev_nr;
 	int class1, class2;
 
+	kvm_make_request(KVM_REQ_EVENT, vcpu);
+
 	if (!vcpu->arch.exception.pending) {
 	queue:
 		vcpu->arch.exception.pending = true;
@@ -327,16 +331,28 @@ void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
 }
 EXPORT_SYMBOL_GPL(kvm_requeue_exception);
 
-void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
-			   u32 error_code)
+void kvm_inject_page_fault(struct kvm_vcpu *vcpu)
 {
+	unsigned error_code = vcpu->arch.fault.error_code;
+
 	++vcpu->stat.pf_guest;
-	vcpu->arch.cr2 = addr;
+	vcpu->arch.cr2 = vcpu->arch.fault.address;
 	kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
 }
 
+void kvm_propagate_fault(struct kvm_vcpu *vcpu)
+{
+	if (mmu_is_nested(vcpu) && !vcpu->arch.fault.nested)
+		vcpu->arch.nested_mmu.inject_page_fault(vcpu);
+	else
+		vcpu->arch.mmu.inject_page_fault(vcpu);
+
+	vcpu->arch.fault.nested = false;
+}
+
 void kvm_inject_nmi(struct kvm_vcpu *vcpu)
 {
+	kvm_make_request(KVM_REQ_EVENT, vcpu);
 	vcpu->arch.nmi_pending = 1;
 }
 EXPORT_SYMBOL_GPL(kvm_inject_nmi);
@@ -367,18 +383,49 @@ bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
 EXPORT_SYMBOL_GPL(kvm_require_cpl);
 
 /*
+ * This function will be used to read from the physical memory of the currently
+ * running guest. The difference from kvm_read_guest_page is that this function
+ * can read from guest physical or from the guest's guest physical memory.
+ */
+int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+			    gfn_t ngfn, void *data, int offset, int len,
+			    u32 access)
+{
+	gfn_t real_gfn;
+	gpa_t ngpa;
+
+	ngpa     = gfn_to_gpa(ngfn);
+	real_gfn = mmu->translate_gpa(vcpu, ngpa, access);
+	if (real_gfn == UNMAPPED_GVA)
+		return -EFAULT;
+
+	real_gfn = gpa_to_gfn(real_gfn);
+
+	return kvm_read_guest_page(vcpu->kvm, real_gfn, data, offset, len);
+}
+EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu);
+
+int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
+			       void *data, int offset, int len, u32 access)
+{
+	return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn,
+				       data, offset, len, access);
+}
+
+/*
  * Load the pae pdptrs.  Return true is they are all valid.
  */
-int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
+int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
 {
 	gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
 	unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
 	int i;
 	int ret;
-	u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
+	u64 pdpte[ARRAY_SIZE(mmu->pdptrs)];
 
-	ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte,
-				  offset * sizeof(u64), sizeof(pdpte));
+	ret = kvm_read_guest_page_mmu(vcpu, mmu, pdpt_gfn, pdpte,
+				      offset * sizeof(u64), sizeof(pdpte),
+				      PFERR_USER_MASK|PFERR_WRITE_MASK);
 	if (ret < 0) {
 		ret = 0;
 		goto out;
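The comment on kvm_read_guest_page_mmu() above is the key idea of this series: with nested paging, a gfn handed to the page-table walker lives in the guest's guest (L2) physical address space and must be translated once more before the host mapping can be used. A standalone toy model of that extra step (illustrative only; the table, names, and numbers below are assumptions, not kernel code):

#include <stdint.h>
#include <stdio.h>

#define UNMAPPED_GVA ((uint64_t)-1)

/* Toy "nested page table": maps an L2 page to an L1 page. */
static uint64_t translate_l2_gpa(uint64_t gpa)
{
	static const uint64_t npt[4] = { 0x8000, 0x9000, UNMAPPED_GVA, 0xb000 };
	uint64_t idx = gpa >> 12;

	if (idx >= 4 || npt[idx] == UNMAPPED_GVA)
		return UNMAPPED_GVA;
	return npt[idx] | (gpa & 0xfff);
}

/* Mirrors the shape of kvm_read_guest_page_mmu(): translate first,
 * fail with -EFAULT if unmapped, then perform the real read. */
static int read_nested(uint64_t ngfn, char *buf)
{
	uint64_t real_gpa = translate_l2_gpa(ngfn << 12);

	if (real_gpa == UNMAPPED_GVA)
		return -14;	/* -EFAULT */
	/* here the kernel would call kvm_read_guest_page(real_gpa >> 12, ...) */
	return snprintf(buf, 32, "read L1 gfn %#llx",
			(unsigned long long)(real_gpa >> 12));
}

int main(void)
{
	char buf[32];

	if (read_nested(1, buf) > 0)
		puts(buf);		/* -> read L1 gfn 0x9 */
	if (read_nested(2, buf) < 0)
		puts("gfn 2: -EFAULT");	/* translation failed */
	return 0;
}

The kernel analogue of translate_l2_gpa() is mmu->translate_gpa(), which later in this patch is translate_nested_gpa() for a nested walker and the identity translate_gpa() otherwise.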
@@ -392,7 +439,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
 	}
 	ret = 1;
 
-	memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
+	memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
 	__set_bit(VCPU_EXREG_PDPTR,
 		  (unsigned long *)&vcpu->arch.regs_avail);
 	__set_bit(VCPU_EXREG_PDPTR,
@@ -405,8 +452,10 @@ EXPORT_SYMBOL_GPL(load_pdptrs);
 
 static bool pdptrs_changed(struct kvm_vcpu *vcpu)
 {
-	u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
+	u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)];
 	bool changed = true;
+	int offset;
+	gfn_t gfn;
 	int r;
 
 	if (is_long_mode(vcpu) || !is_pae(vcpu))
@@ -416,10 +465,13 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu)
 		      (unsigned long *)&vcpu->arch.regs_avail))
 		return true;
 
-	r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));
+	gfn = (vcpu->arch.cr3 & ~31u) >> PAGE_SHIFT;
+	offset = (vcpu->arch.cr3 & ~31u) & (PAGE_SIZE - 1);
+	r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
+				       PFERR_USER_MASK | PFERR_WRITE_MASK);
 	if (r < 0)
 		goto out;
-	changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0;
+	changed = memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0;
 out:
 
 	return changed;
@@ -458,7 +510,8 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 			return 1;
 		} else
 #endif
-		if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3))
+		if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
+						 vcpu->arch.cr3))
 			return 1;
 	}
 
@@ -547,7 +600,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 		return 1;
 	} else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
 		   && ((cr4 ^ old_cr4) & pdptr_bits)
-		   && !load_pdptrs(vcpu, vcpu->arch.cr3))
+		   && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3))
 		return 1;
 
 	if (cr4 & X86_CR4_VMXE)
@@ -580,7 +633,8 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 	if (is_pae(vcpu)) {
 		if (cr3 & CR3_PAE_RESERVED_BITS)
 			return 1;
-		if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3))
+		if (is_paging(vcpu) &&
+		    !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
 			return 1;
 	}
 	/*
@@ -737,7 +791,7 @@ static u32 msrs_to_save[] = {
 #ifdef CONFIG_X86_64
 	MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
 #endif
-	MSR_IA32_TSC, MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
+	MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
 };
 
 static unsigned num_msrs_to_save;
@@ -838,7 +892,7 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
 
 	/*
 	 * The guest calculates current wall clock time by adding
-	 * system time (updated by kvm_write_guest_time below) to the
+	 * system time (updated by kvm_guest_time_update below) to the
 	 * wall clock specified here.  guest system time equals host
 	 * system time for us, thus we must fill in host boot time here.
 	 */
@@ -866,65 +920,229 @@ static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
 	return quotient;
 }
 
-static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock)
+static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz,
+			       s8 *pshift, u32 *pmultiplier)
 {
-	uint64_t nsecs = 1000000000LL;
+	uint64_t scaled64;
 	int32_t  shift = 0;
 	uint64_t tps64;
 	uint32_t tps32;
 
-	tps64 = tsc_khz * 1000LL;
-	while (tps64 > nsecs*2) {
+	tps64 = base_khz * 1000LL;
+	scaled64 = scaled_khz * 1000LL;
+	while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) {
 		tps64 >>= 1;
 		shift--;
 	}
 
 	tps32 = (uint32_t)tps64;
-	while (tps32 <= (uint32_t)nsecs) {
-		tps32 <<= 1;
+	while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000ULL) {
+		if (scaled64 & 0xffffffff00000000ULL || tps32 & 0x80000000)
+			scaled64 >>= 1;
+		else
+			tps32 <<= 1;
 		shift++;
 	}
 
-	hv_clock->tsc_shift = shift;
-	hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32);
+	*pshift = shift;
+	*pmultiplier = div_frac(scaled64, tps32);
 
-	pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n",
-		 __func__, tsc_khz, hv_clock->tsc_shift,
-		 hv_clock->tsc_to_system_mul);
+	pr_debug("%s: base_khz %u => %u, shift %d, mul %u\n",
+		 __func__, base_khz, scaled_khz, shift, *pmultiplier);
+}
+
+static inline u64 get_kernel_ns(void)
+{
+	struct timespec ts;
+
+	WARN_ON(preemptible());
+	ktime_get_ts(&ts);
+	monotonic_to_bootbased(&ts);
+	return timespec_to_ns(&ts);
 }
 
 static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
+unsigned long max_tsc_khz;
 
-static void kvm_write_guest_time(struct kvm_vcpu *v)
+static inline int kvm_tsc_changes_freq(void)
+{
+	int cpu = get_cpu();
+	int ret = !boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
+		  cpufreq_quick_get(cpu) != 0;
+	put_cpu();
+	return ret;
+}
+
+static inline u64 nsec_to_cycles(u64 nsec)
+{
+	u64 ret;
+
+	WARN_ON(preemptible());
+	if (kvm_tsc_changes_freq())
+		printk_once(KERN_WARNING
+		 "kvm: unreliable cycle conversion on adjustable rate TSC\n");
+	ret = nsec * __get_cpu_var(cpu_tsc_khz);
+	do_div(ret, USEC_PER_SEC);
+	return ret;
+}
+
+static void kvm_arch_set_tsc_khz(struct kvm *kvm, u32 this_tsc_khz)
+{
+	/* Compute a scale to convert nanoseconds in TSC cycles */
+	kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000,
+			   &kvm->arch.virtual_tsc_shift,
+			   &kvm->arch.virtual_tsc_mult);
+	kvm->arch.virtual_tsc_khz = this_tsc_khz;
+}
+
+static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
+{
+	u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.last_tsc_nsec,
+				      vcpu->kvm->arch.virtual_tsc_mult,
+				      vcpu->kvm->arch.virtual_tsc_shift);
+	tsc += vcpu->arch.last_tsc_write;
+	return tsc;
+}
+
+void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
+{
+	struct kvm *kvm = vcpu->kvm;
+	u64 offset, ns, elapsed;
+	unsigned long flags;
+	s64 sdiff;
+
+	spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
+	offset = data - native_read_tsc();
+	ns = get_kernel_ns();
+	elapsed = ns - kvm->arch.last_tsc_nsec;
+	sdiff = data - kvm->arch.last_tsc_write;
+	if (sdiff < 0)
+		sdiff = -sdiff;
+
+	/*
+	 * Special case: a close TSC write within 5 seconds of a write on
+	 * another CPU is interpreted as an attempt to synchronize.
+	 * The 5 seconds is to accommodate host load / swapping as
+	 * well as any reset of TSC during the boot process.
+	 *
+	 * In that case, for a reliable TSC, we can match TSC offsets,
+	 * or make a best guess using the elapsed value.
+	 */
+	if (sdiff < nsec_to_cycles(5ULL * NSEC_PER_SEC) &&
+	    elapsed < 5ULL * NSEC_PER_SEC) {
+		if (!check_tsc_unstable()) {
+			offset = kvm->arch.last_tsc_offset;
+			pr_debug("kvm: matched tsc offset for %llu\n", data);
+		} else {
+			u64 delta = nsec_to_cycles(elapsed);
+			offset += delta;
+			pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
+		}
+		ns = kvm->arch.last_tsc_nsec;
+	}
+	kvm->arch.last_tsc_nsec = ns;
+	kvm->arch.last_tsc_write = data;
+	kvm->arch.last_tsc_offset = offset;
+	kvm_x86_ops->write_tsc_offset(vcpu, offset);
+	spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
+
+	/* Reset of TSC must disable overshoot protection below */
+	vcpu->arch.hv_clock.tsc_timestamp = 0;
+	vcpu->arch.last_tsc_write = data;
+	vcpu->arch.last_tsc_nsec = ns;
+}
+EXPORT_SYMBOL_GPL(kvm_write_tsc);
+
+static int kvm_guest_time_update(struct kvm_vcpu *v)
 {
-	struct timespec ts;
 	unsigned long flags;
 	struct kvm_vcpu_arch *vcpu = &v->arch;
 	void *shared_kaddr;
 	unsigned long this_tsc_khz;
+	s64 kernel_ns, max_kernel_ns;
+	u64 tsc_timestamp;
 
-	if ((!vcpu->time_page))
-		return;
+	/* Keep irq disabled to prevent changes to the clock */
+	local_irq_save(flags);
+	kvm_get_msr(v, MSR_IA32_TSC, &tsc_timestamp);
+	kernel_ns = get_kernel_ns();
+	this_tsc_khz = __get_cpu_var(cpu_tsc_khz);
 
-	this_tsc_khz = get_cpu_var(cpu_tsc_khz);
-	if (unlikely(vcpu->hv_clock_tsc_khz != this_tsc_khz)) {
-		kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock);
-		vcpu->hv_clock_tsc_khz = this_tsc_khz;
+	if (unlikely(this_tsc_khz == 0)) {
+		local_irq_restore(flags);
+		kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
+		return 1;
+	}
+
+	/*
+	 * We may have to catch up the TSC to match elapsed wall clock
+	 * time for two reasons, even if kvmclock is used.
+	 * 1) CPU could have been running below the maximum TSC rate
+	 * 2) Broken TSC compensation resets the base at each VCPU
+	 *    entry to avoid unknown leaps of TSC even when running
+	 *    again on the same CPU.  This may cause apparent elapsed
+	 *    time to disappear, and the guest to stand still or run
+	 *    very slowly.
+	 */
+	if (vcpu->tsc_catchup) {
+		u64 tsc = compute_guest_tsc(v, kernel_ns);
+		if (tsc > tsc_timestamp) {
+			kvm_x86_ops->adjust_tsc_offset(v, tsc - tsc_timestamp);
+			tsc_timestamp = tsc;
+		}
 	}
-	put_cpu_var(cpu_tsc_khz);
 
-	/* Keep irq disabled to prevent changes to the clock */
-	local_irq_save(flags);
-	kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp);
-	ktime_get_ts(&ts);
-	monotonic_to_bootbased(&ts);
 	local_irq_restore(flags);
 
-	/* With all the info we got, fill in the values */
+	if (!vcpu->time_page)
+		return 0;
 
-	vcpu->hv_clock.system_time = ts.tv_nsec +
-				     (NSEC_PER_SEC * (u64)ts.tv_sec) + v->kvm->arch.kvmclock_offset;
+	/*
+	 * Time as measured by the TSC may go backwards when resetting the base
+	 * tsc_timestamp.  The reason for this is that the TSC resolution is
+	 * higher than the resolution of the other clock scales.  Thus, many
+	 * possible measurements of the TSC correspond to one measurement of any
+	 * other clock, and so a spread of values is possible.  This is not a
+	 * problem for the computation of the nanosecond clock; with TSC rates
+	 * around 1GHz, there can only be a few cycles which correspond to one
+	 * nanosecond value, and any path through this code will inevitably
+	 * take longer than that.  However, with the kernel_ns value itself,
+	 * the precision may be much lower, down to HZ granularity.  If the
+	 * first sampling of TSC against kernel_ns ends in the low part of the
+	 * range, and the second in the high end of the range, we can get:
+	 *
+	 * (TSC - offset_low) * S + kns_old > (TSC - offset_high) * S + kns_new
+	 *
+	 * As the sampling errors potentially range in the thousands of cycles,
+	 * it is possible such a time value has already been observed by the
+	 * guest.  To protect against this, we must compute the system time as
+	 * observed by the guest and ensure the new system time is greater.
+	 */
+	max_kernel_ns = 0;
+	if (vcpu->hv_clock.tsc_timestamp && vcpu->last_guest_tsc) {
+		max_kernel_ns = vcpu->last_guest_tsc -
+				vcpu->hv_clock.tsc_timestamp;
+		max_kernel_ns = pvclock_scale_delta(max_kernel_ns,
+				    vcpu->hv_clock.tsc_to_system_mul,
+				    vcpu->hv_clock.tsc_shift);
+		max_kernel_ns += vcpu->last_kernel_ns;
+	}
 
+	if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) {
+		kvm_get_time_scale(NSEC_PER_SEC / 1000, this_tsc_khz,
+				   &vcpu->hv_clock.tsc_shift,
+				   &vcpu->hv_clock.tsc_to_system_mul);
+		vcpu->hw_tsc_khz = this_tsc_khz;
+	}
+
+	if (max_kernel_ns > kernel_ns)
+		kernel_ns = max_kernel_ns;
+
+	/* With all the info we got, fill in the values */
+	vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
+	vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
+	vcpu->last_kernel_ns = kernel_ns;
+	vcpu->last_guest_tsc = tsc_timestamp;
 	vcpu->hv_clock.flags = 0;
 
 	/*
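kvm_get_time_scale() above produces the standard pvclock pair (tsc_shift, tsc_to_system_mul): a power-of-two pre-shift plus a 32.32 fixed-point multiplier, applied as ((delta << shift) * mul) >> 32. A self-contained userspace sketch of the same computation, with assumed helper names and made-up frequencies (not part of the patch; __uint128_t assumes GCC/Clang on a 64-bit target):

#include <stdint.h>
#include <stdio.h>

/* Same loop structure as kvm_get_time_scale(); the final division is
 * the 0.32 fixed-point fraction that div_frac() computes bit by bit. */
static void get_time_scale(uint32_t scaled_khz, uint32_t base_khz,
			   int8_t *pshift, uint32_t *pmultiplier)
{
	uint64_t scaled64 = scaled_khz * 1000ULL;
	uint64_t tps64 = base_khz * 1000ULL;
	int32_t shift = 0;
	uint32_t tps32;

	while (tps64 > scaled64 * 2 || tps64 & 0xffffffff00000000ULL) {
		tps64 >>= 1;
		shift--;
	}
	tps32 = (uint32_t)tps64;
	while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000ULL) {
		if (scaled64 & 0xffffffff00000000ULL || tps32 & 0x80000000)
			scaled64 >>= 1;
		else
			tps32 <<= 1;
		shift++;
	}
	*pshift = shift;
	*pmultiplier = (uint32_t)((scaled64 << 32) / tps32);	/* div_frac */
}

/* pvclock_scale_delta() equivalent: cycles -> scaled clock units. */
static uint64_t scale_delta(uint64_t delta, uint32_t mul, int8_t shift)
{
	if (shift < 0)
		delta >>= -shift;
	else
		delta <<= shift;
	return (uint64_t)(((__uint128_t)delta * mul) >> 32);
}

int main(void)
{
	int8_t shift;
	uint32_t mul;

	/* e.g. a 2.8 GHz TSC converted to a 1 GHz nanosecond clock */
	get_time_scale(1000000, 2800000, &shift, &mul);
	printf("shift=%d mul=%u -> %llu ns\n", shift, mul,
	       (unsigned long long)scale_delta(2800000000ULL, mul, shift));
	return 0;
}

Run on these numbers it yields shift = -1 and about 10^9 ns per 2.8x10^9 cycles, which is the conversion kvm_guest_time_update() sets up when it calls kvm_get_time_scale(NSEC_PER_SEC / 1000, this_tsc_khz, ...).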
@@ -942,16 +1160,7 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
 	kunmap_atomic(shared_kaddr, KM_USER0);
 
 	mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
-}
-
-static int kvm_request_guest_time_update(struct kvm_vcpu *v)
-{
-	struct kvm_vcpu_arch *vcpu = &v->arch;
-
-	if (!vcpu->time_page)
-		return 0;
-	kvm_make_request(KVM_REQ_KVMCLOCK_UPDATE, v);
-	return 1;
+	return 0;
 }
 
 static bool msr_mtrr_valid(unsigned msr)
@@ -1277,6 +1486,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 		}
 
 		vcpu->arch.time = data;
+		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
 
 		/* we verify if the enable bit is set... */
 		if (!(data & 1))
@@ -1292,8 +1502,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 			kvm_release_page_clean(vcpu->arch.time_page);
 			vcpu->arch.time_page = NULL;
 		}
-
-		kvm_request_guest_time_update(vcpu);
 		break;
 	}
 	case MSR_IA32_MCG_CTL:
@@ -1330,6 +1538,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 		pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
 			"0x%x data 0x%llx\n", msr, data);
 		break;
+	case MSR_K7_CLK_CTL:
+		/*
+		 * Ignore all writes to this no longer documented MSR.
+		 * Writes are only relevant for old K7 processors,
+		 * all pre-dating SVM, but a recommended workaround from
+		 * AMD for these chips. It is possible to specify the
+		 * affected processor models on the command line, hence
+		 * the need to ignore the workaround.
+		 */
+		break;
 	case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
 		if (kvm_hv_msr_partition_wide(msr)) {
 			int r;
@@ -1522,6 +1740,20 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 	case 0xcd: /* fsb frequency */
 		data = 3;
 		break;
+		/*
+		 * MSR_EBC_FREQUENCY_ID
+		 * Conservative value valid for even the basic CPU models.
+		 * Models 0,1: 000 in bits 23:21 indicating a bus speed of
+		 * 100MHz, model 2 000 in bits 18:16 indicating 100MHz,
+		 * and 266MHz for model 3, or 4. Set Core Clock
+		 * Frequency to System Bus Frequency Ratio to 1 (bits
+		 * 31:24) even though these are only valid for CPU
+		 * models > 2, however guests may end up dividing or
+		 * multiplying by zero otherwise.
+		 */
+	case MSR_EBC_FREQUENCY_ID:
+		data = 1 << 24;
+		break;
 	case MSR_IA32_APICBASE:
 		data = kvm_get_apic_base(vcpu);
 		break;
@@ -1555,6 +1787,18 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 	case MSR_IA32_MCG_STATUS:
 	case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
 		return get_msr_mce(vcpu, msr, pdata);
+	case MSR_K7_CLK_CTL:
+		/*
+		 * Provide expected ramp-up count for K7. All others
+		 * are set to zero, indicating minimum divisors for
+		 * every field.
+		 *
+		 * This prevents guest kernels on AMD host with CPU
+		 * type 6, model 8 and higher from exploding due to
+		 * the rdmsr failing.
+		 */
+		data = 0x20000000;
+		break;
 	case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
 		if (kvm_hv_msr_partition_wide(msr)) {
 			int r;
@@ -1808,19 +2052,28 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 	}
 
 	kvm_x86_ops->vcpu_load(vcpu, cpu);
-	if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) {
-		unsigned long khz = cpufreq_quick_get(cpu);
-		if (!khz)
-			khz = tsc_khz;
-		per_cpu(cpu_tsc_khz, cpu) = khz;
+	if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) {
+		/* Make sure TSC doesn't go backwards */
+		s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
+				native_read_tsc() - vcpu->arch.last_host_tsc;
+		if (tsc_delta < 0)
+			mark_tsc_unstable("KVM discovered backwards TSC");
+		if (check_tsc_unstable()) {
+			kvm_x86_ops->adjust_tsc_offset(vcpu, -tsc_delta);
+			vcpu->arch.tsc_catchup = 1;
+			kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+		}
+		if (vcpu->cpu != cpu)
+			kvm_migrate_timers(vcpu);
+		vcpu->cpu = cpu;
 	}
-	kvm_request_guest_time_update(vcpu);
 }
 
 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 {
 	kvm_x86_ops->vcpu_put(vcpu);
 	kvm_put_guest_fpu(vcpu);
+	vcpu->arch.last_host_tsc = native_read_tsc();
 }
 
 static int is_efer_nx(void)
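The hunk above is the heart of the "TSC must not go backwards" handling: at vcpu_put the host TSC is sampled, and if the TSC read on the destination CPU at the next vcpu_load is lower, the guest's TSC offset is biased upward by the difference. A toy arithmetic model of that compensation (illustrative only; all numbers are invented):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	int64_t last_host_tsc = 1000000;   /* saved at vcpu_put on the old CPU */
	int64_t host_tsc_now  =  400000;   /* first read on the new CPU        */
	int64_t offset        =       0;   /* guest_tsc = host_tsc + offset    */

	int64_t tsc_delta = host_tsc_now - last_host_tsc;   /* -600000 */
	if (tsc_delta < 0)
		offset += -tsc_delta;  /* adjust_tsc_offset(vcpu, -tsc_delta) */

	/* The guest continues from where it left off instead of jumping back. */
	printf("guest tsc = %lld\n", (long long)(host_tsc_now + offset));
	return 0;
}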
@@ -1995,7 +2248,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		F(F16C);
 	/* cpuid 0x80000001.ecx */
 	const u32 kvm_supported_word6_x86_features =
-		F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ |
+		F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
 		F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
 		F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) |
 		0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM);
@@ -2204,6 +2457,7 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
 		return -ENXIO;
 
 	kvm_queue_interrupt(vcpu, irq->irq, false);
+	kvm_make_request(KVM_REQ_EVENT, vcpu);
 
 	return 0;
 }
@@ -2357,6 +2611,8 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
 	if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR)
 		vcpu->arch.sipi_vector = events->sipi_vector;
 
+	kvm_make_request(KVM_REQ_EVENT, vcpu);
+
 	return 0;
 }
 
@@ -2760,7 +3016,7 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
 
 static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
 {
-	return kvm->arch.n_alloc_mmu_pages;
+	return kvm->arch.n_max_mmu_pages;
 }
 
 static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
@@ -2796,18 +3052,18 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
 	r = 0;
 	switch (chip->chip_id) {
 	case KVM_IRQCHIP_PIC_MASTER:
-		raw_spin_lock(&pic_irqchip(kvm)->lock);
+		spin_lock(&pic_irqchip(kvm)->lock);
 		memcpy(&pic_irqchip(kvm)->pics[0],
 			&chip->chip.pic,
 			sizeof(struct kvm_pic_state));
-		raw_spin_unlock(&pic_irqchip(kvm)->lock);
+		spin_unlock(&pic_irqchip(kvm)->lock);
 		break;
 	case KVM_IRQCHIP_PIC_SLAVE:
-		raw_spin_lock(&pic_irqchip(kvm)->lock);
+		spin_lock(&pic_irqchip(kvm)->lock);
 		memcpy(&pic_irqchip(kvm)->pics[1],
 			&chip->chip.pic,
 			sizeof(struct kvm_pic_state));
-		raw_spin_unlock(&pic_irqchip(kvm)->lock);
+		spin_unlock(&pic_irqchip(kvm)->lock);
 		break;
 	case KVM_IRQCHIP_IOAPIC:
 		r = kvm_set_ioapic(kvm, &chip->chip.ioapic);
@@ -3201,7 +3457,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		break;
 	}
 	case KVM_SET_CLOCK: {
-		struct timespec now;
 		struct kvm_clock_data user_ns;
 		u64 now_ns;
 		s64 delta;
@@ -3215,20 +3470,21 @@ long kvm_arch_vm_ioctl(struct file *filp,
 			goto out;
 
 		r = 0;
-		ktime_get_ts(&now);
-		now_ns = timespec_to_ns(&now);
+		local_irq_disable();
+		now_ns = get_kernel_ns();
 		delta = user_ns.clock - now_ns;
+		local_irq_enable();
 		kvm->arch.kvmclock_offset = delta;
 		break;
 	}
 	case KVM_GET_CLOCK: {
-		struct timespec now;
 		struct kvm_clock_data user_ns;
 		u64 now_ns;
 
-		ktime_get_ts(&now);
-		now_ns = timespec_to_ns(&now);
+		local_irq_disable();
+		now_ns = get_kernel_ns();
 		user_ns.clock = kvm->arch.kvmclock_offset + now_ns;
+		local_irq_enable();
 		user_ns.flags = 0;
 
 		r = -EFAULT;
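KVM_SET_CLOCK above only stores a signed delta against the (now irq-protected) boot-based kernel clock; KVM_GET_CLOCK adds it back on top of the current reading. A small sketch of that bookkeeping, with made-up values (illustrative only, not kernel code):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	int64_t host_ns  = 5000000000LL;        /* get_kernel_ns() at SET time */
	int64_t user_ns  = 1000000000LL;        /* clock value requested       */
	int64_t offset   = user_ns - host_ns;   /* kvm->arch.kvmclock_offset   */

	int64_t later_ns = host_ns + 250000000LL;   /* host clock 250 ms later */
	/* KVM_GET_CLOCK: requested value plus elapsed time -> 1250000000 */
	printf("guest clock = %lld\n", (long long)(later_ns + offset));
	return 0;
}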
@@ -3292,30 +3548,51 @@ void kvm_get_segment(struct kvm_vcpu *vcpu,
 	kvm_x86_ops->get_segment(vcpu, var, seg);
 }
 
+static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
+{
+	return gpa;
+}
+
+static gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
+{
+	gpa_t t_gpa;
+	u32 error;
+
+	BUG_ON(!mmu_is_nested(vcpu));
+
+	/* NPT walks are always user-walks */
+	access |= PFERR_USER_MASK;
+	t_gpa  = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, &error);
+	if (t_gpa == UNMAPPED_GVA)
+		vcpu->arch.fault.nested = true;
+
+	return t_gpa;
+}
+
 gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
 {
 	u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
-	return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error);
+	return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error);
 }
 
  gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
 {
 	u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
 	access |= PFERR_FETCH_MASK;
-	return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error);
+	return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error);
 }
 
 gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
 {
 	u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
 	access |= PFERR_WRITE_MASK;
-	return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error);
+	return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error);
 }
 
 /* uses this to access any guest's mapped memory without checking CPL */
 gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
 {
-	return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, 0, error);
+	return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, error);
 }
 
 static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
@@ -3326,7 +3603,8 @@ static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
 	int r = X86EMUL_CONTINUE;
 
 	while (bytes) {
-		gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr, access, error);
+		gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access,
+							    error);
 		unsigned offset = addr & (PAGE_SIZE-1);
 		unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
 		int ret;
@@ -3381,8 +3659,9 @@ static int kvm_write_guest_virt_system(gva_t addr, void *val,
 	int r = X86EMUL_CONTINUE;
 
 	while (bytes) {
-		gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr,
-						      PFERR_WRITE_MASK, error);
+		gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr,
+							    PFERR_WRITE_MASK,
+							    error);
 		unsigned offset = addr & (PAGE_SIZE-1);
 		unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
 		int ret;
@@ -3624,7 +3903,7 @@ static int emulator_pio_in_emulated(int size, unsigned short port, void *val,
 	if (vcpu->arch.pio.count)
 		goto data_avail;
 
-	trace_kvm_pio(1, port, size, 1);
+	trace_kvm_pio(0, port, size, 1);
 
 	vcpu->arch.pio.port = port;
 	vcpu->arch.pio.in = 1;
@@ -3652,7 +3931,7 @@ static int emulator_pio_out_emulated(int size, unsigned short port,
 			      const void *val, unsigned int count,
 			      struct kvm_vcpu *vcpu)
 {
-	trace_kvm_pio(0, port, size, 1);
+	trace_kvm_pio(1, port, size, 1);
 
 	vcpu->arch.pio.port = port;
 	vcpu->arch.pio.in = 0;
@@ -3791,6 +4070,11 @@ static void emulator_get_gdt(struct desc_ptr *dt, struct kvm_vcpu *vcpu)
 	kvm_x86_ops->get_gdt(vcpu, dt);
 }
 
+static void emulator_get_idt(struct desc_ptr *dt, struct kvm_vcpu *vcpu)
+{
+	kvm_x86_ops->get_idt(vcpu, dt);
+}
+
 static unsigned long emulator_get_cached_segment_base(int seg,
 						      struct kvm_vcpu *vcpu)
 {
@@ -3884,6 +4168,7 @@ static struct x86_emulate_ops emulate_ops = {
 	.set_segment_selector = emulator_set_segment_selector,
 	.get_cached_segment_base = emulator_get_cached_segment_base,
 	.get_gdt             = emulator_get_gdt,
+	.get_idt             = emulator_get_idt,
 	.get_cr              = emulator_get_cr,
 	.set_cr              = emulator_set_cr,
 	.cpl                 = emulator_get_cpl,
@@ -3919,13 +4204,64 @@ static void inject_emulated_exception(struct kvm_vcpu *vcpu)
 {
 	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
 	if (ctxt->exception == PF_VECTOR)
-		kvm_inject_page_fault(vcpu, ctxt->cr2, ctxt->error_code);
+		kvm_propagate_fault(vcpu);
 	else if (ctxt->error_code_valid)
 		kvm_queue_exception_e(vcpu, ctxt->exception, ctxt->error_code);
 	else
 		kvm_queue_exception(vcpu, ctxt->exception);
 }
 
+static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
+{
+	struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
+	int cs_db, cs_l;
+
+	cache_all_regs(vcpu);
+
+	kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
+
+	vcpu->arch.emulate_ctxt.vcpu = vcpu;
+	vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
+	vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu);
+	vcpu->arch.emulate_ctxt.mode =
+		(!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
+		(vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
+		? X86EMUL_MODE_VM86 : cs_l
+		? X86EMUL_MODE_PROT64 : cs_db
+		? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
+	memset(c, 0, sizeof(struct decode_cache));
+	memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
+}
+
+int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq)
+{
+	struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
+	int ret;
+
+	init_emulate_ctxt(vcpu);
+
+	vcpu->arch.emulate_ctxt.decode.op_bytes = 2;
+	vcpu->arch.emulate_ctxt.decode.ad_bytes = 2;
+	vcpu->arch.emulate_ctxt.decode.eip = vcpu->arch.emulate_ctxt.eip;
+	ret = emulate_int_real(&vcpu->arch.emulate_ctxt, &emulate_ops, irq);
+
+	if (ret != X86EMUL_CONTINUE)
+		return EMULATE_FAIL;
+
+	vcpu->arch.emulate_ctxt.eip = c->eip;
+	memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
+	kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
+	kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
+
+	if (irq == NMI_VECTOR)
+		vcpu->arch.nmi_pending = false;
+	else
+		vcpu->arch.interrupt.pending = false;
+
+	return EMULATE_DONE;
+}
+EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt);
+
 static int handle_emulation_failure(struct kvm_vcpu *vcpu)
 {
 	++vcpu->stat.insn_emulation_fail;
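init_emulate_ctxt() above folds the emulator CPU-mode selection into one chained conditional expression. Unrolled by the right-associativity of ?:, it reads as the ladder below (a sketch with assumed parameter names and locally defined constants, not the kernel's code):

#include <stdio.h>

enum { X86EMUL_MODE_REAL, X86EMUL_MODE_VM86, X86EMUL_MODE_PROT16,
       X86EMUL_MODE_PROT32, X86EMUL_MODE_PROT64 };

static int emul_mode(int protmode, int vm86, int cs_l, int cs_db)
{
	if (!protmode)
		return X86EMUL_MODE_REAL;	/* CR0.PE clear */
	if (vm86)
		return X86EMUL_MODE_VM86;	/* EFLAGS.VM set */
	if (cs_l)
		return X86EMUL_MODE_PROT64;	/* 64-bit code segment */
	if (cs_db)
		return X86EMUL_MODE_PROT32;	/* CS.D set */
	return X86EMUL_MODE_PROT16;
}

int main(void)
{
	/* 64-bit code segment in protected mode -> PROT64 */
	printf("%d\n", emul_mode(1, 0, 1, 0) == X86EMUL_MODE_PROT64);
	return 0;
}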
@@ -3982,24 +4318,15 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 	cache_all_regs(vcpu);
 
 	if (!(emulation_type & EMULTYPE_NO_DECODE)) {
-		int cs_db, cs_l;
-		kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
-
-		vcpu->arch.emulate_ctxt.vcpu = vcpu;
-		vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
-		vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu);
-		vcpu->arch.emulate_ctxt.mode =
-			(!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
-			(vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
-			? X86EMUL_MODE_VM86 : cs_l
-			? X86EMUL_MODE_PROT64 : cs_db
-			? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
-		memset(c, 0, sizeof(struct decode_cache));
-		memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
+		init_emulate_ctxt(vcpu);
 		vcpu->arch.emulate_ctxt.interruptibility = 0;
 		vcpu->arch.emulate_ctxt.exception = -1;
+		vcpu->arch.emulate_ctxt.perm_ok = false;
+
+		r = x86_decode_insn(&vcpu->arch.emulate_ctxt);
+		if (r == X86EMUL_PROPAGATE_FAULT)
+			goto done;
 
-		r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
 		trace_kvm_emulate_insn_start(vcpu);
 
 		/* Only allow emulation of specific instructions on #UD
@@ -4049,41 +4376,39 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 	memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
 
 restart:
-	r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
+	r = x86_emulate_insn(&vcpu->arch.emulate_ctxt);
 
-	if (r) { /* emulation failed */
+	if (r == EMULATION_FAILED) {
 		if (reexecute_instruction(vcpu, cr2))
 			return EMULATE_DONE;
 
 		return handle_emulation_failure(vcpu);
 	}
 
-	toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility);
-	kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
-	memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
-	kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
-
+done:
 	if (vcpu->arch.emulate_ctxt.exception >= 0) {
 		inject_emulated_exception(vcpu);
-		return EMULATE_DONE;
-	}
-
-	if (vcpu->arch.pio.count) {
+		r = EMULATE_DONE;
+	} else if (vcpu->arch.pio.count) {
 		if (!vcpu->arch.pio.in)
 			vcpu->arch.pio.count = 0;
-		return EMULATE_DO_MMIO;
-	}
-
-	if (vcpu->mmio_needed) {
+		r = EMULATE_DO_MMIO;
+	} else if (vcpu->mmio_needed) {
 		if (vcpu->mmio_is_write)
 			vcpu->mmio_needed = 0;
-		return EMULATE_DO_MMIO;
-	}
-
-	if (vcpu->arch.emulate_ctxt.restart)
+		r = EMULATE_DO_MMIO;
+	} else if (r == EMULATION_RESTART)
 		goto restart;
+	else
+		r = EMULATE_DONE;
 
-	return EMULATE_DONE;
+	toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility);
+	kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
+	kvm_make_request(KVM_REQ_EVENT, vcpu);
+	memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
+	kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
+
+	return r;
 }
 EXPORT_SYMBOL_GPL(emulate_instruction);
 
@@ -4097,9 +4422,23 @@ int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port)
 }
 EXPORT_SYMBOL_GPL(kvm_fast_pio_out);
 
-static void bounce_off(void *info)
+static void tsc_bad(void *info)
+{
+	__get_cpu_var(cpu_tsc_khz) = 0;
+}
+
+static void tsc_khz_changed(void *data)
 {
-	/* nothing */
+	struct cpufreq_freqs *freq = data;
+	unsigned long khz = 0;
+
+	if (data)
+		khz = freq->new;
+	else if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
+		khz = cpufreq_quick_get(raw_smp_processor_id());
+	if (!khz)
+		khz = tsc_khz;
+	__get_cpu_var(cpu_tsc_khz) = khz;
 }
 
 static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
@@ -4110,21 +4449,60 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
 	struct kvm_vcpu *vcpu;
 	int i, send_ipi = 0;
 
+	/*
+	 * We allow guests to temporarily run on slowing clocks,
+	 * provided we notify them after, or to run on accelerating
+	 * clocks, provided we notify them before.  Thus time never
+	 * goes backwards.
+	 *
+	 * However, we have a problem.  We can't atomically update
+	 * the frequency of a given CPU from this function; it is
+	 * merely a notifier, which can be called from any CPU.
+	 * Changing the TSC frequency at arbitrary points in time
+	 * requires a recomputation of local variables related to
+	 * the TSC for each VCPU.  We must flag these local variables
+	 * to be updated and be sure the update takes place with the
+	 * new frequency before any guests proceed.
+	 *
+	 * Unfortunately, the combination of hotplug CPU and frequency
+	 * change creates an intractable locking scenario; the order
+	 * of when these callouts happen is undefined with respect to
+	 * CPU hotplug, and they can race with each other.  As such,
+	 * merely setting per_cpu(cpu_tsc_khz) = X during a hotadd is
+	 * undefined; you can actually have a CPU frequency change take
+	 * place in between the computation of X and the setting of the
+	 * variable.  To protect against this problem, all updates of
+	 * the per_cpu tsc_khz variable are done in an interrupt
+	 * protected IPI, and all callers wishing to update the value
+	 * must wait for a synchronous IPI to complete (which is trivial
+	 * if the caller is on the CPU already).  This establishes the
+	 * necessary total order on variable updates.
+	 *
+	 * Note that because a guest time update may take place
+	 * anytime after the setting of the VCPU's request bit, the
+	 * correct TSC value must be set before the request.  However,
+	 * to ensure the update actually makes it to any guest which
+	 * starts running in hardware virtualization between the set
+	 * and the acquisition of the spinlock, we must also ping the
+	 * CPU after setting the request bit.
+	 *
+	 */
+
 	if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
 		return 0;
 	if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
 		return 0;
-	per_cpu(cpu_tsc_khz, freq->cpu) = freq->new;
+
+	smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
 
 	spin_lock(&kvm_lock);
 	list_for_each_entry(kvm, &vm_list, vm_list) {
 		kvm_for_each_vcpu(i, vcpu, kvm) {
 			if (vcpu->cpu != freq->cpu)
 				continue;
-			if (!kvm_request_guest_time_update(vcpu))
-				continue;
+			kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
 			if (vcpu->cpu != smp_processor_id())
-				send_ipi++;
+				send_ipi = 1;
 		}
 	}
 	spin_unlock(&kvm_lock);
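The early returns at the top of the notifier implement the invariant stated in the comment: guests learn about a slow-down only after it happens (POSTCHANGE) and about a speed-up only before it happens (PRECHANGE), so observed time never jumps backwards. A toy truth table of that filter (illustrative only, not kernel code):

#include <stdio.h>

static int should_update(int postchange, unsigned old_khz, unsigned new_khz)
{
	if (!postchange && old_khz > new_khz)	/* PRECHANGE while slowing  */
		return 0;
	if (postchange && old_khz < new_khz)	/* POSTCHANGE after speed-up */
		return 0;
	return 1;
}

int main(void)
{
	printf("%d %d %d %d\n",
	       should_update(0, 2000, 1000),	/* 0: wait for POSTCHANGE    */
	       should_update(1, 2000, 1000),	/* 1: notify after slow-down */
	       should_update(0, 1000, 2000),	/* 1: notify before speed-up */
	       should_update(1, 1000, 2000));	/* 0: already notified       */
	return 0;
}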
@@ -4142,32 +4520,57 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
 		 * guest context is entered kvmclock will be updated,
 		 * so the guest will not see stale values.
 		 */
-		smp_call_function_single(freq->cpu, bounce_off, NULL, 1);
+		smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
 	}
 	return 0;
 }
 
 static struct notifier_block kvmclock_cpufreq_notifier_block = {
 	.notifier_call  = kvmclock_cpufreq_notifier
+};
+
+static int kvmclock_cpu_notifier(struct notifier_block *nfb,
+				 unsigned long action, void *hcpu)
+{
+	unsigned int cpu = (unsigned long)hcpu;
+
+	switch (action) {
+	case CPU_ONLINE:
+	case CPU_DOWN_FAILED:
+		smp_call_function_single(cpu, tsc_khz_changed, NULL, 1);
+		break;
+	case CPU_DOWN_PREPARE:
+		smp_call_function_single(cpu, tsc_bad, NULL, 1);
+		break;
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block kvmclock_cpu_notifier_block = {
+	.notifier_call  = kvmclock_cpu_notifier,
+	.priority = -INT_MAX
 };
 
 static void kvm_timer_init(void)
 {
 	int cpu;
 
+	max_tsc_khz = tsc_khz;
+	register_hotcpu_notifier(&kvmclock_cpu_notifier_block);
 	if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
+#ifdef CONFIG_CPU_FREQ
+		struct cpufreq_policy policy;
+		memset(&policy, 0, sizeof(policy));
+		cpufreq_get_policy(&policy, get_cpu());
+		if (policy.cpuinfo.max_freq)
+			max_tsc_khz = policy.cpuinfo.max_freq;
+#endif
 		cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
 					  CPUFREQ_TRANSITION_NOTIFIER);
-		for_each_online_cpu(cpu) {
-			unsigned long khz = cpufreq_get(cpu);
-			if (!khz)
-				khz = tsc_khz;
-			per_cpu(cpu_tsc_khz, cpu) = khz;
-		}
-	} else {
-		for_each_possible_cpu(cpu)
-			per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
 	}
+	pr_debug("kvm: max_tsc_khz = %ld\n", max_tsc_khz);
+	for_each_online_cpu(cpu)
+		smp_call_function_single(cpu, tsc_khz_changed, NULL, 1);
 }
 
 static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu);
@@ -4269,6 +4672,7 @@ void kvm_arch_exit(void)
 	if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
 		cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
 					    CPUFREQ_TRANSITION_NOTIFIER);
+	unregister_hotcpu_notifier(&kvmclock_cpu_notifier_block);
 	kvm_x86_ops = NULL;
 	kvm_mmu_module_exit();
 }
@@ -4684,8 +5088,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 			kvm_mmu_unload(vcpu);
 		if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
 			__kvm_migrate_timers(vcpu);
-		if (kvm_check_request(KVM_REQ_KVMCLOCK_UPDATE, vcpu))
-			kvm_write_guest_time(vcpu);
+		if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) {
+			r = kvm_guest_time_update(vcpu);
+			if (unlikely(r))
+				goto out;
+		}
 		if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
 			kvm_mmu_sync_roots(vcpu);
 		if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
@@ -4710,6 +5117,21 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	if (unlikely(r))
 		goto out;
 
+	if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
+		inject_pending_event(vcpu);
+
+		/* enable NMI/IRQ window open exits if needed */
+		if (vcpu->arch.nmi_pending)
+			kvm_x86_ops->enable_nmi_window(vcpu);
+		else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
+			kvm_x86_ops->enable_irq_window(vcpu);
+
+		if (kvm_lapic_enabled(vcpu)) {
+			update_cr8_intercept(vcpu);
+			kvm_lapic_sync_to_vapic(vcpu);
+		}
+	}
+
 	preempt_disable();
 
 	kvm_x86_ops->prepare_guest_switch(vcpu);
@@ -4728,23 +5150,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 		smp_wmb();
 		local_irq_enable();
 		preempt_enable();
+		kvm_x86_ops->cancel_injection(vcpu);
 		r = 1;
 		goto out;
 	}
 
-	inject_pending_event(vcpu);
-
-	/* enable NMI/IRQ window open exits if needed */
-	if (vcpu->arch.nmi_pending)
-		kvm_x86_ops->enable_nmi_window(vcpu);
-	else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
-		kvm_x86_ops->enable_irq_window(vcpu);
-
-	if (kvm_lapic_enabled(vcpu)) {
-		update_cr8_intercept(vcpu);
-		kvm_lapic_sync_to_vapic(vcpu);
-	}
-
 	srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
 
 	kvm_guest_enter();
@@ -4770,6 +5180,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	if (hw_breakpoint_active())
 		hw_breakpoint_restore();
 
+	kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc);
+
 	atomic_set(&vcpu->guest_mode, 0);
 	smp_wmb();
 	local_irq_enable();
@@ -4899,8 +5311,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	if (!irqchip_in_kernel(vcpu->kvm))
 		kvm_set_cr8(vcpu, kvm_run->cr8);
 
-	if (vcpu->arch.pio.count || vcpu->mmio_needed ||
-	    vcpu->arch.emulate_ctxt.restart) {
+	if (vcpu->arch.pio.count || vcpu->mmio_needed) {
 		if (vcpu->mmio_needed) {
 			memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
 			vcpu->mmio_read_completed = 1;
@@ -4981,6 +5392,8 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 
 	vcpu->arch.exception.pending = false;
 
+	kvm_make_request(KVM_REQ_EVENT, vcpu);
+
 	return 0;
 }
 
@@ -5044,6 +5457,7 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
 				    struct kvm_mp_state *mp_state)
 {
 	vcpu->arch.mp_state = mp_state->mp_state;
+	kvm_make_request(KVM_REQ_EVENT, vcpu);
 	return 0;
 }
 
@@ -5051,24 +5465,11 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
 		    bool has_error_code, u32 error_code)
 {
 	struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
-	int cs_db, cs_l, ret;
-	cache_all_regs(vcpu);
-
-	kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
+	int ret;
 
-	vcpu->arch.emulate_ctxt.vcpu = vcpu;
-	vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
-	vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu);
-	vcpu->arch.emulate_ctxt.mode =
-		(!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
-		(vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
-		? X86EMUL_MODE_VM86 : cs_l
-		? X86EMUL_MODE_PROT64 : cs_db
-		? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
-	memset(c, 0, sizeof(struct decode_cache));
-	memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
+	init_emulate_ctxt(vcpu);
 
-	ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, &emulate_ops,
+	ret = emulator_task_switch(&vcpu->arch.emulate_ctxt,
 				   tss_selector, reason, has_error_code,
 				   error_code);
 
@@ -5078,6 +5479,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
 	memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
 	kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
 	kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
+	kvm_make_request(KVM_REQ_EVENT, vcpu);
 	return EMULATE_DONE;
 }
 EXPORT_SYMBOL_GPL(kvm_task_switch);
@@ -5113,7 +5515,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 	mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
 	kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
 	if (!is_long_mode(vcpu) && is_pae(vcpu)) {
-		load_pdptrs(vcpu, vcpu->arch.cr3);
+		load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3);
 		mmu_reset_needed = 1;
 	}
 
@@ -5148,6 +5550,8 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 	    !is_protmode(vcpu))
 		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
 
+	kvm_make_request(KVM_REQ_EVENT, vcpu);
+
 	return 0;
 }
 
@@ -5334,6 +5738,10 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
 struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
 						unsigned int id)
 {
+	if (check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0)
+		printk_once(KERN_WARNING
+		"kvm: SMP vm created on host with unstable TSC; "
+		"guest TSC will not be reliable\n");
 	return kvm_x86_ops->vcpu_create(kvm, id);
 }
 
@@ -5376,22 +5784,22 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
 	vcpu->arch.dr6 = DR6_FIXED_1;
 	vcpu->arch.dr7 = DR7_FIXED_1;
 
+	kvm_make_request(KVM_REQ_EVENT, vcpu);
+
 	return kvm_x86_ops->vcpu_reset(vcpu);
 }
 
 int kvm_arch_hardware_enable(void *garbage)
 {
-	/*
-	 * Since this may be called from a hotplug notifcation,
-	 * we can't get the CPU frequency directly.
-	 */
-	if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
-		int cpu = raw_smp_processor_id();
-		per_cpu(cpu_tsc_khz, cpu) = 0;
-	}
+	struct kvm *kvm;
+	struct kvm_vcpu *vcpu;
+	int i;
 
 	kvm_shared_msr_cpu_online();
-
+	list_for_each_entry(kvm, &vm_list, vm_list)
+		kvm_for_each_vcpu(i, vcpu, kvm)
+			if (vcpu->cpu == smp_processor_id())
+				kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
 	return kvm_x86_ops->hardware_enable(garbage);
 }
 
@@ -5425,7 +5833,11 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 	BUG_ON(vcpu->kvm == NULL);
 	kvm = vcpu->kvm;
 
+	vcpu->arch.emulate_ctxt.ops = &emulate_ops;
+	vcpu->arch.walk_mmu = &vcpu->arch.mmu;
 	vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+	vcpu->arch.mmu.translate_gpa = translate_gpa;
+	vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
 	if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu))
 		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
 	else
@@ -5438,6 +5850,9 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 	}
 	vcpu->arch.pio_data = page_address(page);
 
+	if (!kvm->arch.virtual_tsc_khz)
+		kvm_arch_set_tsc_khz(kvm, max_tsc_khz);
+
 	r = kvm_mmu_create(vcpu);
 	if (r < 0)
 		goto fail_free_pio_data;
@@ -5497,7 +5912,7 @@ struct kvm *kvm_arch_create_vm(void)
 	/* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
 	set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
 
-	rdtscll(kvm->arch.vm_init_tsc);
+	spin_lock_init(&kvm->arch.tsc_write_lock);
 
 	return kvm;
 }
@@ -5684,6 +6099,7 @@ void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
 	    kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
 		rflags |= X86_EFLAGS_TF;
 	kvm_x86_ops->set_rflags(vcpu, rflags);
+	kvm_make_request(KVM_REQ_EVENT, vcpu);
 }
 EXPORT_SYMBOL_GPL(kvm_set_rflags);
 