aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/kvm/x86.c
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm/x86.c')
-rw-r--r--arch/x86/kvm/x86.c403
1 files changed, 313 insertions, 90 deletions
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 54696b5f8443..4044ce0bf7c1 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -97,6 +97,10 @@ EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
97u32 kvm_max_guest_tsc_khz; 97u32 kvm_max_guest_tsc_khz;
98EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz); 98EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);
99 99
100/* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
101static u32 tsc_tolerance_ppm = 250;
102module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
103
100#define KVM_NR_SHARED_MSRS 16 104#define KVM_NR_SHARED_MSRS 16
101 105
102struct kvm_shared_msrs_global { 106struct kvm_shared_msrs_global {
@@ -969,50 +973,51 @@ static inline u64 get_kernel_ns(void)
969static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); 973static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
970unsigned long max_tsc_khz; 974unsigned long max_tsc_khz;
971 975
972static inline int kvm_tsc_changes_freq(void) 976static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
973{ 977{
974 int cpu = get_cpu(); 978 return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult,
975 int ret = !boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && 979 vcpu->arch.virtual_tsc_shift);
976 cpufreq_quick_get(cpu) != 0;
977 put_cpu();
978 return ret;
979} 980}
980 981
981u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu) 982static u32 adjust_tsc_khz(u32 khz, s32 ppm)
982{ 983{
983 if (vcpu->arch.virtual_tsc_khz) 984 u64 v = (u64)khz * (1000000 + ppm);
984 return vcpu->arch.virtual_tsc_khz; 985 do_div(v, 1000000);
985 else 986 return v;
986 return __this_cpu_read(cpu_tsc_khz);
987} 987}
988 988
989static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) 989static void kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
990{ 990{
991 u64 ret; 991 u32 thresh_lo, thresh_hi;
992 992 int use_scaling = 0;
993 WARN_ON(preemptible());
994 if (kvm_tsc_changes_freq())
995 printk_once(KERN_WARNING
996 "kvm: unreliable cycle conversion on adjustable rate TSC\n");
997 ret = nsec * vcpu_tsc_khz(vcpu);
998 do_div(ret, USEC_PER_SEC);
999 return ret;
1000}
1001 993
1002static void kvm_init_tsc_catchup(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
1003{
1004 /* Compute a scale to convert nanoseconds in TSC cycles */ 994 /* Compute a scale to convert nanoseconds in TSC cycles */
1005 kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000, 995 kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000,
1006 &vcpu->arch.tsc_catchup_shift, 996 &vcpu->arch.virtual_tsc_shift,
1007 &vcpu->arch.tsc_catchup_mult); 997 &vcpu->arch.virtual_tsc_mult);
998 vcpu->arch.virtual_tsc_khz = this_tsc_khz;
999
1000 /*
1001 * Compute the variation in TSC rate which is acceptable
1002 * within the range of tolerance and decide if the
1003 * rate being applied is within the bounds of the hardware
1004 * rate. If so, no scaling or compensation need be done.
1005 */
1006 thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm);
1007 thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm);
1008 if (this_tsc_khz < thresh_lo || this_tsc_khz > thresh_hi) {
1009 pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", this_tsc_khz, thresh_lo, thresh_hi);
1010 use_scaling = 1;
1011 }
1012 kvm_x86_ops->set_tsc_khz(vcpu, this_tsc_khz, use_scaling);
1008} 1013}
1009 1014
1010static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) 1015static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
1011{ 1016{
1012 u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.last_tsc_nsec, 1017 u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.this_tsc_nsec,
1013 vcpu->arch.tsc_catchup_mult, 1018 vcpu->arch.virtual_tsc_mult,
1014 vcpu->arch.tsc_catchup_shift); 1019 vcpu->arch.virtual_tsc_shift);
1015 tsc += vcpu->arch.last_tsc_write; 1020 tsc += vcpu->arch.this_tsc_write;
1016 return tsc; 1021 return tsc;
1017} 1022}
1018 1023
@@ -1021,48 +1026,88 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
1021 struct kvm *kvm = vcpu->kvm; 1026 struct kvm *kvm = vcpu->kvm;
1022 u64 offset, ns, elapsed; 1027 u64 offset, ns, elapsed;
1023 unsigned long flags; 1028 unsigned long flags;
1024 s64 sdiff; 1029 s64 usdiff;
1025 1030
1026 raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); 1031 raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
1027 offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); 1032 offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
1028 ns = get_kernel_ns(); 1033 ns = get_kernel_ns();
1029 elapsed = ns - kvm->arch.last_tsc_nsec; 1034 elapsed = ns - kvm->arch.last_tsc_nsec;
1030 sdiff = data - kvm->arch.last_tsc_write; 1035
1031 if (sdiff < 0) 1036 /* n.b - signed multiplication and division required */
1032 sdiff = -sdiff; 1037 usdiff = data - kvm->arch.last_tsc_write;
1038#ifdef CONFIG_X86_64
1039 usdiff = (usdiff * 1000) / vcpu->arch.virtual_tsc_khz;
1040#else
1041 /* do_div() only does unsigned */
1042 asm("idivl %2; xor %%edx, %%edx"
1043 : "=A"(usdiff)
1044 : "A"(usdiff * 1000), "rm"(vcpu->arch.virtual_tsc_khz));
1045#endif
1046 do_div(elapsed, 1000);
1047 usdiff -= elapsed;
1048 if (usdiff < 0)
1049 usdiff = -usdiff;
1033 1050
1034 /* 1051 /*
1035 * Special case: close write to TSC within 5 seconds of 1052 * Special case: TSC write with a small delta (1 second) of virtual
1036 * another CPU is interpreted as an attempt to synchronize 1053 * cycle time against real time is interpreted as an attempt to
1037 * The 5 seconds is to accommodate host load / swapping as 1054 * synchronize the CPU.
1038 * well as any reset of TSC during the boot process. 1055 *
1039 * 1056 * For a reliable TSC, we can match TSC offsets, and for an unstable
1040 * In that case, for a reliable TSC, we can match TSC offsets, 1057 * TSC, we add elapsed time in this computation. We could let the
1041 * or make a best guest using elapsed value. 1058 * compensation code attempt to catch up if we fall behind, but
1042 */ 1059 * it's better to try to match offsets from the beginning.
1043 if (sdiff < nsec_to_cycles(vcpu, 5ULL * NSEC_PER_SEC) && 1060 */
1044 elapsed < 5ULL * NSEC_PER_SEC) { 1061 if (usdiff < USEC_PER_SEC &&
1062 vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) {
1045 if (!check_tsc_unstable()) { 1063 if (!check_tsc_unstable()) {
1046 offset = kvm->arch.last_tsc_offset; 1064 offset = kvm->arch.cur_tsc_offset;
1047 pr_debug("kvm: matched tsc offset for %llu\n", data); 1065 pr_debug("kvm: matched tsc offset for %llu\n", data);
1048 } else { 1066 } else {
1049 u64 delta = nsec_to_cycles(vcpu, elapsed); 1067 u64 delta = nsec_to_cycles(vcpu, elapsed);
1050 offset += delta; 1068 data += delta;
1069 offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
1051 pr_debug("kvm: adjusted tsc offset by %llu\n", delta); 1070 pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
1052 } 1071 }
1053 ns = kvm->arch.last_tsc_nsec; 1072 } else {
1073 /*
1074 * We split periods of matched TSC writes into generations.
1075 * For each generation, we track the original measured
1076 * nanosecond time, offset, and write, so if TSCs are in
1077 * sync, we can match exact offset, and if not, we can match
1078 * exact software computation in compute_guest_tsc()
1079 *
1080 * These values are tracked in kvm->arch.cur_xxx variables.
1081 */
1082 kvm->arch.cur_tsc_generation++;
1083 kvm->arch.cur_tsc_nsec = ns;
1084 kvm->arch.cur_tsc_write = data;
1085 kvm->arch.cur_tsc_offset = offset;
1086 pr_debug("kvm: new tsc generation %u, clock %llu\n",
1087 kvm->arch.cur_tsc_generation, data);
1054 } 1088 }
1089
1090 /*
1091 * We also track the most recent recorded KHZ, write and time to
1092 * allow the matching interval to be extended at each write.
1093 */
1055 kvm->arch.last_tsc_nsec = ns; 1094 kvm->arch.last_tsc_nsec = ns;
1056 kvm->arch.last_tsc_write = data; 1095 kvm->arch.last_tsc_write = data;
1057 kvm->arch.last_tsc_offset = offset; 1096 kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz;
1058 kvm_x86_ops->write_tsc_offset(vcpu, offset);
1059 raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
1060 1097
1061 /* Reset of TSC must disable overshoot protection below */ 1098 /* Reset of TSC must disable overshoot protection below */
1062 vcpu->arch.hv_clock.tsc_timestamp = 0; 1099 vcpu->arch.hv_clock.tsc_timestamp = 0;
1063 vcpu->arch.last_tsc_write = data; 1100 vcpu->arch.last_guest_tsc = data;
1064 vcpu->arch.last_tsc_nsec = ns; 1101
1102 /* Keep track of which generation this VCPU has synchronized to */
1103 vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation;
1104 vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
1105 vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
1106
1107 kvm_x86_ops->write_tsc_offset(vcpu, offset);
1108 raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
1065} 1109}
1110
1066EXPORT_SYMBOL_GPL(kvm_write_tsc); 1111EXPORT_SYMBOL_GPL(kvm_write_tsc);
1067 1112
1068static int kvm_guest_time_update(struct kvm_vcpu *v) 1113static int kvm_guest_time_update(struct kvm_vcpu *v)
@@ -1078,7 +1123,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
1078 local_irq_save(flags); 1123 local_irq_save(flags);
1079 tsc_timestamp = kvm_x86_ops->read_l1_tsc(v); 1124 tsc_timestamp = kvm_x86_ops->read_l1_tsc(v);
1080 kernel_ns = get_kernel_ns(); 1125 kernel_ns = get_kernel_ns();
1081 this_tsc_khz = vcpu_tsc_khz(v); 1126 this_tsc_khz = __get_cpu_var(cpu_tsc_khz);
1082 if (unlikely(this_tsc_khz == 0)) { 1127 if (unlikely(this_tsc_khz == 0)) {
1083 local_irq_restore(flags); 1128 local_irq_restore(flags);
1084 kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); 1129 kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
@@ -1098,7 +1143,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
1098 if (vcpu->tsc_catchup) { 1143 if (vcpu->tsc_catchup) {
1099 u64 tsc = compute_guest_tsc(v, kernel_ns); 1144 u64 tsc = compute_guest_tsc(v, kernel_ns);
1100 if (tsc > tsc_timestamp) { 1145 if (tsc > tsc_timestamp) {
1101 kvm_x86_ops->adjust_tsc_offset(v, tsc - tsc_timestamp); 1146 adjust_tsc_offset_guest(v, tsc - tsc_timestamp);
1102 tsc_timestamp = tsc; 1147 tsc_timestamp = tsc;
1103 } 1148 }
1104 } 1149 }
@@ -1130,7 +1175,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
1130 * observed by the guest and ensure the new system time is greater. 1175 * observed by the guest and ensure the new system time is greater.
1131 */ 1176 */
1132 max_kernel_ns = 0; 1177 max_kernel_ns = 0;
1133 if (vcpu->hv_clock.tsc_timestamp && vcpu->last_guest_tsc) { 1178 if (vcpu->hv_clock.tsc_timestamp) {
1134 max_kernel_ns = vcpu->last_guest_tsc - 1179 max_kernel_ns = vcpu->last_guest_tsc -
1135 vcpu->hv_clock.tsc_timestamp; 1180 vcpu->hv_clock.tsc_timestamp;
1136 max_kernel_ns = pvclock_scale_delta(max_kernel_ns, 1181 max_kernel_ns = pvclock_scale_delta(max_kernel_ns,
@@ -1504,6 +1549,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1504 case MSR_K7_HWCR: 1549 case MSR_K7_HWCR:
1505 data &= ~(u64)0x40; /* ignore flush filter disable */ 1550 data &= ~(u64)0x40; /* ignore flush filter disable */
1506 data &= ~(u64)0x100; /* ignore ignne emulation enable */ 1551 data &= ~(u64)0x100; /* ignore ignne emulation enable */
1552 data &= ~(u64)0x8; /* ignore TLB cache disable */
1507 if (data != 0) { 1553 if (data != 0) {
1508 pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n", 1554 pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
1509 data); 1555 data);
@@ -1676,6 +1722,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1676 */ 1722 */
1677 pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data); 1723 pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data);
1678 break; 1724 break;
1725 case MSR_AMD64_OSVW_ID_LENGTH:
1726 if (!guest_cpuid_has_osvw(vcpu))
1727 return 1;
1728 vcpu->arch.osvw.length = data;
1729 break;
1730 case MSR_AMD64_OSVW_STATUS:
1731 if (!guest_cpuid_has_osvw(vcpu))
1732 return 1;
1733 vcpu->arch.osvw.status = data;
1734 break;
1679 default: 1735 default:
1680 if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) 1736 if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
1681 return xen_hvm_config(vcpu, data); 1737 return xen_hvm_config(vcpu, data);
@@ -1960,6 +2016,16 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1960 */ 2016 */
1961 data = 0xbe702111; 2017 data = 0xbe702111;
1962 break; 2018 break;
2019 case MSR_AMD64_OSVW_ID_LENGTH:
2020 if (!guest_cpuid_has_osvw(vcpu))
2021 return 1;
2022 data = vcpu->arch.osvw.length;
2023 break;
2024 case MSR_AMD64_OSVW_STATUS:
2025 if (!guest_cpuid_has_osvw(vcpu))
2026 return 1;
2027 data = vcpu->arch.osvw.status;
2028 break;
1963 default: 2029 default:
1964 if (kvm_pmu_msr(vcpu, msr)) 2030 if (kvm_pmu_msr(vcpu, msr))
1965 return kvm_pmu_get_msr(vcpu, msr, pdata); 2031 return kvm_pmu_get_msr(vcpu, msr, pdata);
@@ -2080,6 +2146,7 @@ int kvm_dev_ioctl_check_extension(long ext)
2080 case KVM_CAP_XSAVE: 2146 case KVM_CAP_XSAVE:
2081 case KVM_CAP_ASYNC_PF: 2147 case KVM_CAP_ASYNC_PF:
2082 case KVM_CAP_GET_TSC_KHZ: 2148 case KVM_CAP_GET_TSC_KHZ:
2149 case KVM_CAP_PCI_2_3:
2083 r = 1; 2150 r = 1;
2084 break; 2151 break;
2085 case KVM_CAP_COALESCED_MMIO: 2152 case KVM_CAP_COALESCED_MMIO:
@@ -2214,19 +2281,23 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
2214 } 2281 }
2215 2282
2216 kvm_x86_ops->vcpu_load(vcpu, cpu); 2283 kvm_x86_ops->vcpu_load(vcpu, cpu);
2217 if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) {
2218 /* Make sure TSC doesn't go backwards */
2219 s64 tsc_delta;
2220 u64 tsc;
2221 2284
2222 tsc = kvm_x86_ops->read_l1_tsc(vcpu); 2285 /* Apply any externally detected TSC adjustments (due to suspend) */
2223 tsc_delta = !vcpu->arch.last_guest_tsc ? 0 : 2286 if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
2224 tsc - vcpu->arch.last_guest_tsc; 2287 adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment);
2288 vcpu->arch.tsc_offset_adjustment = 0;
2289 set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
2290 }
2225 2291
2292 if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) {
2293 s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
2294 native_read_tsc() - vcpu->arch.last_host_tsc;
2226 if (tsc_delta < 0) 2295 if (tsc_delta < 0)
2227 mark_tsc_unstable("KVM discovered backwards TSC"); 2296 mark_tsc_unstable("KVM discovered backwards TSC");
2228 if (check_tsc_unstable()) { 2297 if (check_tsc_unstable()) {
2229 kvm_x86_ops->adjust_tsc_offset(vcpu, -tsc_delta); 2298 u64 offset = kvm_x86_ops->compute_tsc_offset(vcpu,
2299 vcpu->arch.last_guest_tsc);
2300 kvm_x86_ops->write_tsc_offset(vcpu, offset);
2230 vcpu->arch.tsc_catchup = 1; 2301 vcpu->arch.tsc_catchup = 1;
2231 } 2302 }
2232 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); 2303 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
@@ -2243,7 +2314,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
2243{ 2314{
2244 kvm_x86_ops->vcpu_put(vcpu); 2315 kvm_x86_ops->vcpu_put(vcpu);
2245 kvm_put_guest_fpu(vcpu); 2316 kvm_put_guest_fpu(vcpu);
2246 vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu); 2317 vcpu->arch.last_host_tsc = native_read_tsc();
2247} 2318}
2248 2319
2249static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, 2320static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
@@ -2785,26 +2856,21 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
2785 u32 user_tsc_khz; 2856 u32 user_tsc_khz;
2786 2857
2787 r = -EINVAL; 2858 r = -EINVAL;
2788 if (!kvm_has_tsc_control)
2789 break;
2790
2791 user_tsc_khz = (u32)arg; 2859 user_tsc_khz = (u32)arg;
2792 2860
2793 if (user_tsc_khz >= kvm_max_guest_tsc_khz) 2861 if (user_tsc_khz >= kvm_max_guest_tsc_khz)
2794 goto out; 2862 goto out;
2795 2863
2796 kvm_x86_ops->set_tsc_khz(vcpu, user_tsc_khz); 2864 if (user_tsc_khz == 0)
2865 user_tsc_khz = tsc_khz;
2866
2867 kvm_set_tsc_khz(vcpu, user_tsc_khz);
2797 2868
2798 r = 0; 2869 r = 0;
2799 goto out; 2870 goto out;
2800 } 2871 }
2801 case KVM_GET_TSC_KHZ: { 2872 case KVM_GET_TSC_KHZ: {
2802 r = -EIO; 2873 r = vcpu->arch.virtual_tsc_khz;
2803 if (check_tsc_unstable())
2804 goto out;
2805
2806 r = vcpu_tsc_khz(vcpu);
2807
2808 goto out; 2874 goto out;
2809 } 2875 }
2810 default: 2876 default:
@@ -2815,6 +2881,11 @@ out:
2815 return r; 2881 return r;
2816} 2882}
2817 2883
2884int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
2885{
2886 return VM_FAULT_SIGBUS;
2887}
2888
2818static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr) 2889static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
2819{ 2890{
2820 int ret; 2891 int ret;
@@ -2998,6 +3069,8 @@ static void write_protect_slot(struct kvm *kvm,
2998 unsigned long *dirty_bitmap, 3069 unsigned long *dirty_bitmap,
2999 unsigned long nr_dirty_pages) 3070 unsigned long nr_dirty_pages)
3000{ 3071{
3072 spin_lock(&kvm->mmu_lock);
3073
3001 /* Not many dirty pages compared to # of shadow pages. */ 3074 /* Not many dirty pages compared to # of shadow pages. */
3002 if (nr_dirty_pages < kvm->arch.n_used_mmu_pages) { 3075 if (nr_dirty_pages < kvm->arch.n_used_mmu_pages) {
3003 unsigned long gfn_offset; 3076 unsigned long gfn_offset;
@@ -3005,16 +3078,13 @@ static void write_protect_slot(struct kvm *kvm,
3005 for_each_set_bit(gfn_offset, dirty_bitmap, memslot->npages) { 3078 for_each_set_bit(gfn_offset, dirty_bitmap, memslot->npages) {
3006 unsigned long gfn = memslot->base_gfn + gfn_offset; 3079 unsigned long gfn = memslot->base_gfn + gfn_offset;
3007 3080
3008 spin_lock(&kvm->mmu_lock);
3009 kvm_mmu_rmap_write_protect(kvm, gfn, memslot); 3081 kvm_mmu_rmap_write_protect(kvm, gfn, memslot);
3010 spin_unlock(&kvm->mmu_lock);
3011 } 3082 }
3012 kvm_flush_remote_tlbs(kvm); 3083 kvm_flush_remote_tlbs(kvm);
3013 } else { 3084 } else
3014 spin_lock(&kvm->mmu_lock);
3015 kvm_mmu_slot_remove_write_access(kvm, memslot->id); 3085 kvm_mmu_slot_remove_write_access(kvm, memslot->id);
3016 spin_unlock(&kvm->mmu_lock); 3086
3017 } 3087 spin_unlock(&kvm->mmu_lock);
3018} 3088}
3019 3089
3020/* 3090/*
@@ -3133,6 +3203,9 @@ long kvm_arch_vm_ioctl(struct file *filp,
3133 r = -EEXIST; 3203 r = -EEXIST;
3134 if (kvm->arch.vpic) 3204 if (kvm->arch.vpic)
3135 goto create_irqchip_unlock; 3205 goto create_irqchip_unlock;
3206 r = -EINVAL;
3207 if (atomic_read(&kvm->online_vcpus))
3208 goto create_irqchip_unlock;
3136 r = -ENOMEM; 3209 r = -ENOMEM;
3137 vpic = kvm_create_pic(kvm); 3210 vpic = kvm_create_pic(kvm);
3138 if (vpic) { 3211 if (vpic) {
@@ -4063,6 +4136,11 @@ static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val)
4063 return res; 4136 return res;
4064} 4137}
4065 4138
4139static void emulator_set_rflags(struct x86_emulate_ctxt *ctxt, ulong val)
4140{
4141 kvm_set_rflags(emul_to_vcpu(ctxt), val);
4142}
4143
4066static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt) 4144static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt)
4067{ 4145{
4068 return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt)); 4146 return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt));
@@ -4244,6 +4322,7 @@ static struct x86_emulate_ops emulate_ops = {
4244 .set_idt = emulator_set_idt, 4322 .set_idt = emulator_set_idt,
4245 .get_cr = emulator_get_cr, 4323 .get_cr = emulator_get_cr,
4246 .set_cr = emulator_set_cr, 4324 .set_cr = emulator_set_cr,
4325 .set_rflags = emulator_set_rflags,
4247 .cpl = emulator_get_cpl, 4326 .cpl = emulator_get_cpl,
4248 .get_dr = emulator_get_dr, 4327 .get_dr = emulator_get_dr,
4249 .set_dr = emulator_set_dr, 4328 .set_dr = emulator_set_dr,
@@ -5288,6 +5367,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5288 profile_hit(KVM_PROFILING, (void *)rip); 5367 profile_hit(KVM_PROFILING, (void *)rip);
5289 } 5368 }
5290 5369
5370 if (unlikely(vcpu->arch.tsc_always_catchup))
5371 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
5291 5372
5292 kvm_lapic_sync_from_vapic(vcpu); 5373 kvm_lapic_sync_from_vapic(vcpu);
5293 5374
@@ -5587,15 +5668,15 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
5587 return 0; 5668 return 0;
5588} 5669}
5589 5670
5590int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, 5671int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
5591 bool has_error_code, u32 error_code) 5672 int reason, bool has_error_code, u32 error_code)
5592{ 5673{
5593 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; 5674 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
5594 int ret; 5675 int ret;
5595 5676
5596 init_emulate_ctxt(vcpu); 5677 init_emulate_ctxt(vcpu);
5597 5678
5598 ret = emulator_task_switch(ctxt, tss_selector, reason, 5679 ret = emulator_task_switch(ctxt, tss_selector, idt_index, reason,
5599 has_error_code, error_code); 5680 has_error_code, error_code);
5600 5681
5601 if (ret) 5682 if (ret)
@@ -5928,13 +6009,88 @@ int kvm_arch_hardware_enable(void *garbage)
5928 struct kvm *kvm; 6009 struct kvm *kvm;
5929 struct kvm_vcpu *vcpu; 6010 struct kvm_vcpu *vcpu;
5930 int i; 6011 int i;
6012 int ret;
6013 u64 local_tsc;
6014 u64 max_tsc = 0;
6015 bool stable, backwards_tsc = false;
5931 6016
5932 kvm_shared_msr_cpu_online(); 6017 kvm_shared_msr_cpu_online();
5933 list_for_each_entry(kvm, &vm_list, vm_list) 6018 ret = kvm_x86_ops->hardware_enable(garbage);
5934 kvm_for_each_vcpu(i, vcpu, kvm) 6019 if (ret != 0)
5935 if (vcpu->cpu == smp_processor_id()) 6020 return ret;
5936 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); 6021
5937 return kvm_x86_ops->hardware_enable(garbage); 6022 local_tsc = native_read_tsc();
6023 stable = !check_tsc_unstable();
6024 list_for_each_entry(kvm, &vm_list, vm_list) {
6025 kvm_for_each_vcpu(i, vcpu, kvm) {
6026 if (!stable && vcpu->cpu == smp_processor_id())
6027 set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
6028 if (stable && vcpu->arch.last_host_tsc > local_tsc) {
6029 backwards_tsc = true;
6030 if (vcpu->arch.last_host_tsc > max_tsc)
6031 max_tsc = vcpu->arch.last_host_tsc;
6032 }
6033 }
6034 }
6035
6036 /*
6037 * Sometimes, even reliable TSCs go backwards. This happens on
6038 * platforms that reset TSC during suspend or hibernate actions, but
6039 * maintain synchronization. We must compensate. Fortunately, we can
6040 * detect that condition here, which happens early in CPU bringup,
6041 * before any KVM threads can be running. Unfortunately, we can't
6042 * bring the TSCs fully up to date with real time, as we aren't yet far
6043 * enough into CPU bringup that we know how much real time has actually
6044 * elapsed; our helper function, get_kernel_ns() will be using boot
6045 * variables that haven't been updated yet.
6046 *
6047 * So we simply find the maximum observed TSC above, then record the
6048 * adjustment to TSC in each VCPU. When the VCPU later gets loaded,
6049 * the adjustment will be applied. Note that we accumulate
6050 * adjustments, in case multiple suspend cycles happen before some VCPU
6051 * gets a chance to run again. In the event that no KVM threads get a
6052 * chance to run, we will miss the entire elapsed period, as we'll have
6053 * reset last_host_tsc, so VCPUs will not have the TSC adjusted and may
6054 * lose cycle time. This isn't too big a deal, since the loss will be
6055 * uniform across all VCPUs (not to mention the scenario is extremely
6056 * unlikely). It is possible that a second hibernate recovery happens
6057 * much faster than a first, causing the observed TSC here to be
6058 * smaller; this would require additional padding adjustment, which is
6059 * why we set last_host_tsc to the local tsc observed here.
6060 *
6061 * N.B. - this code below runs only on platforms with reliable TSC,
6062 * as that is the only way backwards_tsc is set above. Also note
6063 * that this runs for ALL vcpus, which is not a bug; all VCPUs should
6064 * have the same delta_cyc adjustment applied if backwards_tsc
6065 * is detected. Note further, this adjustment is only done once,
6066 * as we reset last_host_tsc on all VCPUs to stop this from being
6067 * called multiple times (one for each physical CPU bringup).
6068 *
6069 * Platforms with unreliable TSCs don't have to deal with this, they
6070 * will be compensated by the logic in vcpu_load, which sets the TSC to
6071 * catchup mode. This will catchup all VCPUs to real time, but cannot
6072 * guarantee that they stay in perfect synchronization.
6073 */
6074 if (backwards_tsc) {
6075 u64 delta_cyc = max_tsc - local_tsc;
6076 list_for_each_entry(kvm, &vm_list, vm_list) {
6077 kvm_for_each_vcpu(i, vcpu, kvm) {
6078 vcpu->arch.tsc_offset_adjustment += delta_cyc;
6079 vcpu->arch.last_host_tsc = local_tsc;
6080 }
6081
6082 /*
6083 * We have to disable TSC offset matching.. if you were
6084 * booting a VM while issuing an S4 host suspend....
6085 * you may have some problem. Solving this issue is
6086 * left as an exercise to the reader.
6087 */
6088 kvm->arch.last_tsc_nsec = 0;
6089 kvm->arch.last_tsc_write = 0;
6090 }
6091
6092 }
6093 return 0;
5938} 6094}
5939 6095
5940void kvm_arch_hardware_disable(void *garbage) 6096void kvm_arch_hardware_disable(void *garbage)
@@ -5958,6 +6114,11 @@ void kvm_arch_check_processor_compat(void *rtn)
5958 kvm_x86_ops->check_processor_compatibility(rtn); 6114 kvm_x86_ops->check_processor_compatibility(rtn);
5959} 6115}
5960 6116
6117bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu)
6118{
6119 return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL);
6120}
6121
5961int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) 6122int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
5962{ 6123{
5963 struct page *page; 6124 struct page *page;
@@ -5980,7 +6141,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
5980 } 6141 }
5981 vcpu->arch.pio_data = page_address(page); 6142 vcpu->arch.pio_data = page_address(page);
5982 6143
5983 kvm_init_tsc_catchup(vcpu, max_tsc_khz); 6144 kvm_set_tsc_khz(vcpu, max_tsc_khz);
5984 6145
5985 r = kvm_mmu_create(vcpu); 6146 r = kvm_mmu_create(vcpu);
5986 if (r < 0) 6147 if (r < 0)
@@ -6032,8 +6193,11 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
6032 free_page((unsigned long)vcpu->arch.pio_data); 6193 free_page((unsigned long)vcpu->arch.pio_data);
6033} 6194}
6034 6195
6035int kvm_arch_init_vm(struct kvm *kvm) 6196int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
6036{ 6197{
6198 if (type)
6199 return -EINVAL;
6200
6037 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); 6201 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
6038 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); 6202 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
6039 6203
@@ -6093,6 +6257,65 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
6093 put_page(kvm->arch.ept_identity_pagetable); 6257 put_page(kvm->arch.ept_identity_pagetable);
6094} 6258}
6095 6259
6260void kvm_arch_free_memslot(struct kvm_memory_slot *free,
6261 struct kvm_memory_slot *dont)
6262{
6263 int i;
6264
6265 for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
6266 if (!dont || free->arch.lpage_info[i] != dont->arch.lpage_info[i]) {
6267 vfree(free->arch.lpage_info[i]);
6268 free->arch.lpage_info[i] = NULL;
6269 }
6270 }
6271}
6272
6273int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
6274{
6275 int i;
6276
6277 for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
6278 unsigned long ugfn;
6279 int lpages;
6280 int level = i + 2;
6281
6282 lpages = gfn_to_index(slot->base_gfn + npages - 1,
6283 slot->base_gfn, level) + 1;
6284
6285 slot->arch.lpage_info[i] =
6286 vzalloc(lpages * sizeof(*slot->arch.lpage_info[i]));
6287 if (!slot->arch.lpage_info[i])
6288 goto out_free;
6289
6290 if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
6291 slot->arch.lpage_info[i][0].write_count = 1;
6292 if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
6293 slot->arch.lpage_info[i][lpages - 1].write_count = 1;
6294 ugfn = slot->userspace_addr >> PAGE_SHIFT;
6295 /*
6296 * If the gfn and userspace address are not aligned wrt each
6297 * other, or if explicitly asked to, disable large page
6298 * support for this slot
6299 */
6300 if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) ||
6301 !kvm_largepages_enabled()) {
6302 unsigned long j;
6303
6304 for (j = 0; j < lpages; ++j)
6305 slot->arch.lpage_info[i][j].write_count = 1;
6306 }
6307 }
6308
6309 return 0;
6310
6311out_free:
6312 for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
6313 vfree(slot->arch.lpage_info[i]);
6314 slot->arch.lpage_info[i] = NULL;
6315 }
6316 return -ENOMEM;
6317}
6318
6096int kvm_arch_prepare_memory_region(struct kvm *kvm, 6319int kvm_arch_prepare_memory_region(struct kvm *kvm,
6097 struct kvm_memory_slot *memslot, 6320 struct kvm_memory_slot *memslot,
6098 struct kvm_memory_slot old, 6321 struct kvm_memory_slot old,