Diffstat (limited to 'arch/x86/kvm/x86.c')
-rw-r--r--  arch/x86/kvm/x86.c | 403
1 file changed, 313 insertions, 90 deletions
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 54696b5f8443..4044ce0bf7c1 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -97,6 +97,10 @@ EXPORT_SYMBOL_GPL(kvm_has_tsc_control); | |||
97 | u32 kvm_max_guest_tsc_khz; | 97 | u32 kvm_max_guest_tsc_khz; |
98 | EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz); | 98 | EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz); |
99 | 99 | ||
100 | /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */ | ||
101 | static u32 tsc_tolerance_ppm = 250; | ||
102 | module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR); | ||
103 | |||
100 | #define KVM_NR_SHARED_MSRS 16 | 104 | #define KVM_NR_SHARED_MSRS 16 |
101 | 105 | ||
102 | struct kvm_shared_msrs_global { | 106 | struct kvm_shared_msrs_global { |
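Note: the new tsc_tolerance_ppm knob defaults to 250 parts per million, half of the 500 ppm frequency tolerance NTP assumes for a free-running clock. A minimal user-space sketch (not kernel code; names and the 2.8 GHz rate are assumptions for illustration) of what that window means for a given host rate:

    #include <stdio.h>
    #include <stdint.h>

    /* Mirrors the adjust_tsc_khz() arithmetic added further down:
     * scale a rate in kHz by +/- ppm parts per million. */
    static uint32_t adjust_khz(uint32_t khz, int32_t ppm)
    {
        return (uint64_t)khz * (1000000 + ppm) / 1000000;
    }

    int main(void)
    {
        uint32_t host_khz = 2800000;    /* assumed 2.8 GHz host TSC */
        printf("tolerated guest rate: [%u, %u] kHz\n",
               adjust_khz(host_khz, -250), adjust_khz(host_khz, 250));
        /* 250 ppm also corresponds to roughly 21.6 s of drift per day */
        return 0;
    }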
@@ -969,50 +973,51 @@ static inline u64 get_kernel_ns(void) | |||
969 | static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); | 973 | static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); |
970 | unsigned long max_tsc_khz; | 974 | unsigned long max_tsc_khz; |
971 | 975 | ||
972 | static inline int kvm_tsc_changes_freq(void) | 976 | static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) |
973 | { | 977 | { |
974 | int cpu = get_cpu(); | 978 | return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult, |
975 | int ret = !boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && | 979 | vcpu->arch.virtual_tsc_shift); |
976 | cpufreq_quick_get(cpu) != 0; | ||
977 | put_cpu(); | ||
978 | return ret; | ||
979 | } | 980 | } |
980 | 981 | ||
981 | u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu) | 982 | static u32 adjust_tsc_khz(u32 khz, s32 ppm) |
982 | { | 983 | { |
983 | if (vcpu->arch.virtual_tsc_khz) | 984 | u64 v = (u64)khz * (1000000 + ppm); |
984 | return vcpu->arch.virtual_tsc_khz; | 985 | do_div(v, 1000000); |
985 | else | 986 | return v; |
986 | return __this_cpu_read(cpu_tsc_khz); | ||
987 | } | 987 | } |
988 | 988 | ||
989 | static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) | 989 | static void kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz) |
990 | { | 990 | { |
991 | u64 ret; | 991 | u32 thresh_lo, thresh_hi; |
992 | 992 | int use_scaling = 0; | |
993 | WARN_ON(preemptible()); | ||
994 | if (kvm_tsc_changes_freq()) | ||
995 | printk_once(KERN_WARNING | ||
996 | "kvm: unreliable cycle conversion on adjustable rate TSC\n"); | ||
997 | ret = nsec * vcpu_tsc_khz(vcpu); | ||
998 | do_div(ret, USEC_PER_SEC); | ||
999 | return ret; | ||
1000 | } | ||
1001 | 993 | ||
1002 | static void kvm_init_tsc_catchup(struct kvm_vcpu *vcpu, u32 this_tsc_khz) | ||
1003 | { | ||
1004 | /* Compute a scale to convert nanoseconds in TSC cycles */ | 994 | /* Compute a scale to convert nanoseconds in TSC cycles */ |
1005 | kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000, | 995 | kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000, |
1006 | &vcpu->arch.tsc_catchup_shift, | 996 | &vcpu->arch.virtual_tsc_shift, |
1007 | &vcpu->arch.tsc_catchup_mult); | 997 | &vcpu->arch.virtual_tsc_mult); |
998 | vcpu->arch.virtual_tsc_khz = this_tsc_khz; | ||
999 | |||
1000 | /* | ||
1001 | * Compute the variation in TSC rate which is acceptable | ||
1002 | * within the range of tolerance and decide if the | ||
1003 | * rate being applied is within those bounds of the hardware | ||
1004 | * rate. If so, no scaling or compensation need be done. | ||
1005 | */ | ||
1006 | thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm); | ||
1007 | thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm); | ||
1008 | if (this_tsc_khz < thresh_lo || this_tsc_khz > thresh_hi) { | ||
1009 | pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", this_tsc_khz, thresh_lo, thresh_hi); | ||
1010 | use_scaling = 1; | ||
1011 | } | ||
1012 | kvm_x86_ops->set_tsc_khz(vcpu, this_tsc_khz, use_scaling); | ||
1008 | } | 1013 | } |
1009 | 1014 | ||
1010 | static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) | 1015 | static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) |
1011 | { | 1016 | { |
1012 | u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.last_tsc_nsec, | 1017 | u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.this_tsc_nsec, |
1013 | vcpu->arch.tsc_catchup_mult, | 1018 | vcpu->arch.virtual_tsc_mult, |
1014 | vcpu->arch.tsc_catchup_shift); | 1019 | vcpu->arch.virtual_tsc_shift); |
1015 | tsc += vcpu->arch.last_tsc_write; | 1020 | tsc += vcpu->arch.this_tsc_write; |
1016 | return tsc; | 1021 | return tsc; |
1017 | } | 1022 | } |
1018 | 1023 | ||
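Note: the virtual_tsc_mult/virtual_tsc_shift pair computed above is a fixed-point scale from nanoseconds to guest TSC cycles; nsec_to_cycles() and compute_guest_tsc() both consume it via pvclock_scale_delta(). A simplified, self-contained illustration of the same shape of math, assuming a 32.32 fixed-point format for clarity (the kernel derives the real mult/shift with kvm_get_time_scale()):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        uint32_t tsc_khz = 2800000;             /* assumed 2.8 GHz guest rate */
        /* cycles per nanosecond, stored as a 32.32 fixed-point fraction */
        uint64_t mult = ((uint64_t)tsc_khz << 32) / 1000000;
        uint64_t nsec = 1500;                   /* 1.5 us of elapsed time */
        /* this toy multiply overflows for very large nsec; the kernel
         * version splits the operands to avoid that */
        uint64_t cycles = (nsec * mult) >> 32;
        printf("%llu ns -> %llu cycles\n",
               (unsigned long long)nsec, (unsigned long long)cycles);
        return 0;
    }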
@@ -1021,48 +1026,88 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) | |||
1021 | struct kvm *kvm = vcpu->kvm; | 1026 | struct kvm *kvm = vcpu->kvm; |
1022 | u64 offset, ns, elapsed; | 1027 | u64 offset, ns, elapsed; |
1023 | unsigned long flags; | 1028 | unsigned long flags; |
1024 | s64 sdiff; | 1029 | s64 usdiff; |
1025 | 1030 | ||
1026 | raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); | 1031 | raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); |
1027 | offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); | 1032 | offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); |
1028 | ns = get_kernel_ns(); | 1033 | ns = get_kernel_ns(); |
1029 | elapsed = ns - kvm->arch.last_tsc_nsec; | 1034 | elapsed = ns - kvm->arch.last_tsc_nsec; |
1030 | sdiff = data - kvm->arch.last_tsc_write; | 1035 | |
1031 | if (sdiff < 0) | 1036 | /* n.b - signed multiplication and division required */ |
1032 | sdiff = -sdiff; | 1037 | usdiff = data - kvm->arch.last_tsc_write; |
1038 | #ifdef CONFIG_X86_64 | ||
1039 | usdiff = (usdiff * 1000) / vcpu->arch.virtual_tsc_khz; | ||
1040 | #else | ||
1041 | /* do_div() only does unsigned */ | ||
1042 | asm("idivl %2; xor %%edx, %%edx" | ||
1043 | : "=A"(usdiff) | ||
1044 | : "A"(usdiff * 1000), "rm"(vcpu->arch.virtual_tsc_khz)); | ||
1045 | #endif | ||
1046 | do_div(elapsed, 1000); | ||
1047 | usdiff -= elapsed; | ||
1048 | if (usdiff < 0) | ||
1049 | usdiff = -usdiff; | ||
1033 | 1050 | ||
1034 | /* | 1051 | /* |
1035 | * Special case: close write to TSC within 5 seconds of | 1052 | * Special case: TSC write with a small delta (1 second) of virtual |
1036 | * another CPU is interpreted as an attempt to synchronize | 1053 | * cycle time against real time is interpreted as an attempt to |
1037 | * The 5 seconds is to accommodate host load / swapping as | 1054 | * synchronize the CPU. |
1038 | * well as any reset of TSC during the boot process. | 1055 | * |
1039 | * | 1056 | * For a reliable TSC, we can match TSC offsets, and for an unstable |
1040 | * In that case, for a reliable TSC, we can match TSC offsets, | 1057 | * TSC, we add elapsed time in this computation. We could let the |
1041 | * or make a best guest using elapsed value. | 1058 | * compensation code attempt to catch up if we fall behind, but |
1042 | */ | 1059 | * it's better to try to match offsets from the beginning. |
1043 | if (sdiff < nsec_to_cycles(vcpu, 5ULL * NSEC_PER_SEC) && | 1060 | */ |
1044 | elapsed < 5ULL * NSEC_PER_SEC) { | 1061 | if (usdiff < USEC_PER_SEC && |
1062 | vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) { | ||
1045 | if (!check_tsc_unstable()) { | 1063 | if (!check_tsc_unstable()) { |
1046 | offset = kvm->arch.last_tsc_offset; | 1064 | offset = kvm->arch.cur_tsc_offset; |
1047 | pr_debug("kvm: matched tsc offset for %llu\n", data); | 1065 | pr_debug("kvm: matched tsc offset for %llu\n", data); |
1048 | } else { | 1066 | } else { |
1049 | u64 delta = nsec_to_cycles(vcpu, elapsed); | 1067 | u64 delta = nsec_to_cycles(vcpu, elapsed); |
1050 | offset += delta; | 1068 | data += delta; |
1069 | offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); | ||
1051 | pr_debug("kvm: adjusted tsc offset by %llu\n", delta); | 1070 | pr_debug("kvm: adjusted tsc offset by %llu\n", delta); |
1052 | } | 1071 | } |
1053 | ns = kvm->arch.last_tsc_nsec; | 1072 | } else { |
1073 | /* | ||
1074 | * We split periods of matched TSC writes into generations. | ||
1075 | * For each generation, we track the original measured | ||
1076 | * nanosecond time, offset, and write, so if TSCs are in | ||
1077 | * sync, we can match exact offset, and if not, we can match | ||
1078 | * exact software computation in compute_guest_tsc() | ||
1079 | * | ||
1080 | * These values are tracked in kvm->arch.cur_xxx variables. | ||
1081 | */ | ||
1082 | kvm->arch.cur_tsc_generation++; | ||
1083 | kvm->arch.cur_tsc_nsec = ns; | ||
1084 | kvm->arch.cur_tsc_write = data; | ||
1085 | kvm->arch.cur_tsc_offset = offset; | ||
1086 | pr_debug("kvm: new tsc generation %u, clock %llu\n", | ||
1087 | kvm->arch.cur_tsc_generation, data); | ||
1054 | } | 1088 | } |
1089 | |||
1090 | /* | ||
1091 | * We also track the most recent recorded KHZ, write, and time to | ||
1092 | * allow the matching interval to be extended at each write. | ||
1093 | */ | ||
1055 | kvm->arch.last_tsc_nsec = ns; | 1094 | kvm->arch.last_tsc_nsec = ns; |
1056 | kvm->arch.last_tsc_write = data; | 1095 | kvm->arch.last_tsc_write = data; |
1057 | kvm->arch.last_tsc_offset = offset; | 1096 | kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz; |
1058 | kvm_x86_ops->write_tsc_offset(vcpu, offset); | ||
1059 | raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); | ||
1060 | 1097 | ||
1061 | /* Reset of TSC must disable overshoot protection below */ | 1098 | /* Reset of TSC must disable overshoot protection below */ |
1062 | vcpu->arch.hv_clock.tsc_timestamp = 0; | 1099 | vcpu->arch.hv_clock.tsc_timestamp = 0; |
1063 | vcpu->arch.last_tsc_write = data; | 1100 | vcpu->arch.last_guest_tsc = data; |
1064 | vcpu->arch.last_tsc_nsec = ns; | 1101 | |
1102 | /* Keep track of which generation this VCPU has synchronized to */ | ||
1103 | vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation; | ||
1104 | vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec; | ||
1105 | vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write; | ||
1106 | |||
1107 | kvm_x86_ops->write_tsc_offset(vcpu, offset); | ||
1108 | raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); | ||
1065 | } | 1109 | } |
1110 | |||
1066 | EXPORT_SYMBOL_GPL(kvm_write_tsc); | 1111 | EXPORT_SYMBOL_GPL(kvm_write_tsc); |
1067 | 1112 | ||
1068 | static int kvm_guest_time_update(struct kvm_vcpu *v) | 1113 | static int kvm_guest_time_update(struct kvm_vcpu *v) |
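Note: a compact restatement of the matching test added to kvm_write_tsc() above, as stand-alone C. Two TSC writes are treated as a synchronization attempt when the written values, converted to microseconds of virtual time, advance by nearly the same amount as real time did (within one second); the patch additionally requires that virtual_tsc_khz has not changed since the last write. Names here are local to the sketch, not kernel symbols:

    #include <stdint.h>

    static int writes_look_synchronized(uint64_t data, uint64_t last_write,
                                        uint64_t virtual_tsc_khz,
                                        int64_t elapsed_ns)
    {
        int64_t usdiff = (int64_t)(data - last_write);          /* cycles */
        usdiff = usdiff * 1000 / (int64_t)virtual_tsc_khz;      /* -> microseconds */
        usdiff -= elapsed_ns / 1000;                            /* minus real us */
        if (usdiff < 0)
            usdiff = -usdiff;
        return usdiff < 1000000;                                /* under 1 second */
    }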
@@ -1078,7 +1123,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) | |||
1078 | local_irq_save(flags); | 1123 | local_irq_save(flags); |
1079 | tsc_timestamp = kvm_x86_ops->read_l1_tsc(v); | 1124 | tsc_timestamp = kvm_x86_ops->read_l1_tsc(v); |
1080 | kernel_ns = get_kernel_ns(); | 1125 | kernel_ns = get_kernel_ns(); |
1081 | this_tsc_khz = vcpu_tsc_khz(v); | 1126 | this_tsc_khz = __get_cpu_var(cpu_tsc_khz); |
1082 | if (unlikely(this_tsc_khz == 0)) { | 1127 | if (unlikely(this_tsc_khz == 0)) { |
1083 | local_irq_restore(flags); | 1128 | local_irq_restore(flags); |
1084 | kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); | 1129 | kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); |
@@ -1098,7 +1143,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) | |||
1098 | if (vcpu->tsc_catchup) { | 1143 | if (vcpu->tsc_catchup) { |
1099 | u64 tsc = compute_guest_tsc(v, kernel_ns); | 1144 | u64 tsc = compute_guest_tsc(v, kernel_ns); |
1100 | if (tsc > tsc_timestamp) { | 1145 | if (tsc > tsc_timestamp) { |
1101 | kvm_x86_ops->adjust_tsc_offset(v, tsc - tsc_timestamp); | 1146 | adjust_tsc_offset_guest(v, tsc - tsc_timestamp); |
1102 | tsc_timestamp = tsc; | 1147 | tsc_timestamp = tsc; |
1103 | } | 1148 | } |
1104 | } | 1149 | } |
@@ -1130,7 +1175,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) | |||
1130 | * observed by the guest and ensure the new system time is greater. | 1175 | * observed by the guest and ensure the new system time is greater. |
1131 | */ | 1176 | */ |
1132 | max_kernel_ns = 0; | 1177 | max_kernel_ns = 0; |
1133 | if (vcpu->hv_clock.tsc_timestamp && vcpu->last_guest_tsc) { | 1178 | if (vcpu->hv_clock.tsc_timestamp) { |
1134 | max_kernel_ns = vcpu->last_guest_tsc - | 1179 | max_kernel_ns = vcpu->last_guest_tsc - |
1135 | vcpu->hv_clock.tsc_timestamp; | 1180 | vcpu->hv_clock.tsc_timestamp; |
1136 | max_kernel_ns = pvclock_scale_delta(max_kernel_ns, | 1181 | max_kernel_ns = pvclock_scale_delta(max_kernel_ns, |
@@ -1504,6 +1549,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) | |||
1504 | case MSR_K7_HWCR: | 1549 | case MSR_K7_HWCR: |
1505 | data &= ~(u64)0x40; /* ignore flush filter disable */ | 1550 | data &= ~(u64)0x40; /* ignore flush filter disable */ |
1506 | data &= ~(u64)0x100; /* ignore ignne emulation enable */ | 1551 | data &= ~(u64)0x100; /* ignore ignne emulation enable */ |
1552 | data &= ~(u64)0x8; /* ignore TLB cache disable */ | ||
1507 | if (data != 0) { | 1553 | if (data != 0) { |
1508 | pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n", | 1554 | pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n", |
1509 | data); | 1555 | data); |
@@ -1676,6 +1722,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) | |||
1676 | */ | 1722 | */ |
1677 | pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data); | 1723 | pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data); |
1678 | break; | 1724 | break; |
1725 | case MSR_AMD64_OSVW_ID_LENGTH: | ||
1726 | if (!guest_cpuid_has_osvw(vcpu)) | ||
1727 | return 1; | ||
1728 | vcpu->arch.osvw.length = data; | ||
1729 | break; | ||
1730 | case MSR_AMD64_OSVW_STATUS: | ||
1731 | if (!guest_cpuid_has_osvw(vcpu)) | ||
1732 | return 1; | ||
1733 | vcpu->arch.osvw.status = data; | ||
1734 | break; | ||
1679 | default: | 1735 | default: |
1680 | if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) | 1736 | if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) |
1681 | return xen_hvm_config(vcpu, data); | 1737 | return xen_hvm_config(vcpu, data); |
@@ -1960,6 +2016,16 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) | |||
1960 | */ | 2016 | */ |
1961 | data = 0xbe702111; | 2017 | data = 0xbe702111; |
1962 | break; | 2018 | break; |
2019 | case MSR_AMD64_OSVW_ID_LENGTH: | ||
2020 | if (!guest_cpuid_has_osvw(vcpu)) | ||
2021 | return 1; | ||
2022 | data = vcpu->arch.osvw.length; | ||
2023 | break; | ||
2024 | case MSR_AMD64_OSVW_STATUS: | ||
2025 | if (!guest_cpuid_has_osvw(vcpu)) | ||
2026 | return 1; | ||
2027 | data = vcpu->arch.osvw.status; | ||
2028 | break; | ||
1963 | default: | 2029 | default: |
1964 | if (kvm_pmu_msr(vcpu, msr)) | 2030 | if (kvm_pmu_msr(vcpu, msr)) |
1965 | return kvm_pmu_get_msr(vcpu, msr, pdata); | 2031 | return kvm_pmu_get_msr(vcpu, msr, pdata); |
@@ -2080,6 +2146,7 @@ int kvm_dev_ioctl_check_extension(long ext) | |||
2080 | case KVM_CAP_XSAVE: | 2146 | case KVM_CAP_XSAVE: |
2081 | case KVM_CAP_ASYNC_PF: | 2147 | case KVM_CAP_ASYNC_PF: |
2082 | case KVM_CAP_GET_TSC_KHZ: | 2148 | case KVM_CAP_GET_TSC_KHZ: |
2149 | case KVM_CAP_PCI_2_3: | ||
2083 | r = 1; | 2150 | r = 1; |
2084 | break; | 2151 | break; |
2085 | case KVM_CAP_COALESCED_MMIO: | 2152 | case KVM_CAP_COALESCED_MMIO: |
@@ -2214,19 +2281,23 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) | |||
2214 | } | 2281 | } |
2215 | 2282 | ||
2216 | kvm_x86_ops->vcpu_load(vcpu, cpu); | 2283 | kvm_x86_ops->vcpu_load(vcpu, cpu); |
2217 | if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) { | ||
2218 | /* Make sure TSC doesn't go backwards */ | ||
2219 | s64 tsc_delta; | ||
2220 | u64 tsc; | ||
2221 | 2284 | ||
2222 | tsc = kvm_x86_ops->read_l1_tsc(vcpu); | 2285 | /* Apply any externally detected TSC adjustments (due to suspend) */ |
2223 | tsc_delta = !vcpu->arch.last_guest_tsc ? 0 : | 2286 | if (unlikely(vcpu->arch.tsc_offset_adjustment)) { |
2224 | tsc - vcpu->arch.last_guest_tsc; | 2287 | adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment); |
2288 | vcpu->arch.tsc_offset_adjustment = 0; | ||
2289 | set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests); | ||
2290 | } | ||
2225 | 2291 | ||
2292 | if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) { | ||
2293 | s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 : | ||
2294 | native_read_tsc() - vcpu->arch.last_host_tsc; | ||
2226 | if (tsc_delta < 0) | 2295 | if (tsc_delta < 0) |
2227 | mark_tsc_unstable("KVM discovered backwards TSC"); | 2296 | mark_tsc_unstable("KVM discovered backwards TSC"); |
2228 | if (check_tsc_unstable()) { | 2297 | if (check_tsc_unstable()) { |
2229 | kvm_x86_ops->adjust_tsc_offset(vcpu, -tsc_delta); | 2298 | u64 offset = kvm_x86_ops->compute_tsc_offset(vcpu, |
2299 | vcpu->arch.last_guest_tsc); | ||
2300 | kvm_x86_ops->write_tsc_offset(vcpu, offset); | ||
2230 | vcpu->arch.tsc_catchup = 1; | 2301 | vcpu->arch.tsc_catchup = 1; |
2231 | } | 2302 | } |
2232 | kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); | 2303 | kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); |
@@ -2243,7 +2314,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) | |||
2243 | { | 2314 | { |
2244 | kvm_x86_ops->vcpu_put(vcpu); | 2315 | kvm_x86_ops->vcpu_put(vcpu); |
2245 | kvm_put_guest_fpu(vcpu); | 2316 | kvm_put_guest_fpu(vcpu); |
2246 | vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu); | 2317 | vcpu->arch.last_host_tsc = native_read_tsc(); |
2247 | } | 2318 | } |
2248 | 2319 | ||
2249 | static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, | 2320 | static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, |
@@ -2785,26 +2856,21 @@ long kvm_arch_vcpu_ioctl(struct file *filp, | |||
2785 | u32 user_tsc_khz; | 2856 | u32 user_tsc_khz; |
2786 | 2857 | ||
2787 | r = -EINVAL; | 2858 | r = -EINVAL; |
2788 | if (!kvm_has_tsc_control) | ||
2789 | break; | ||
2790 | |||
2791 | user_tsc_khz = (u32)arg; | 2859 | user_tsc_khz = (u32)arg; |
2792 | 2860 | ||
2793 | if (user_tsc_khz >= kvm_max_guest_tsc_khz) | 2861 | if (user_tsc_khz >= kvm_max_guest_tsc_khz) |
2794 | goto out; | 2862 | goto out; |
2795 | 2863 | ||
2796 | kvm_x86_ops->set_tsc_khz(vcpu, user_tsc_khz); | 2864 | if (user_tsc_khz == 0) |
2865 | user_tsc_khz = tsc_khz; | ||
2866 | |||
2867 | kvm_set_tsc_khz(vcpu, user_tsc_khz); | ||
2797 | 2868 | ||
2798 | r = 0; | 2869 | r = 0; |
2799 | goto out; | 2870 | goto out; |
2800 | } | 2871 | } |
2801 | case KVM_GET_TSC_KHZ: { | 2872 | case KVM_GET_TSC_KHZ: { |
2802 | r = -EIO; | 2873 | r = vcpu->arch.virtual_tsc_khz; |
2803 | if (check_tsc_unstable()) | ||
2804 | goto out; | ||
2805 | |||
2806 | r = vcpu_tsc_khz(vcpu); | ||
2807 | |||
2808 | goto out; | 2874 | goto out; |
2809 | } | 2875 | } |
2810 | default: | 2876 | default: |
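Note: with the changes above, KVM_SET_TSC_KHZ no longer depends on hardware TSC scaling support and KVM_GET_TSC_KHZ no longer fails just because the host TSC is unstable, so user space can always round-trip the value. A minimal usage sketch (error handling trimmed; vcpu_fd is assumed to be a descriptor obtained from KVM_CREATE_VCPU):

    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    static void set_and_query_tsc_khz(int vcpu_fd, unsigned int khz)
    {
        /* khz == 0 now means "use the host rate" */
        if (ioctl(vcpu_fd, KVM_SET_TSC_KHZ, khz) < 0)
            perror("KVM_SET_TSC_KHZ");

        int cur = ioctl(vcpu_fd, KVM_GET_TSC_KHZ, 0);   /* returns kHz directly */
        if (cur > 0)
            printf("vcpu TSC rate: %d kHz\n", cur);
    }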
@@ -2815,6 +2881,11 @@ out: | |||
2815 | return r; | 2881 | return r; |
2816 | } | 2882 | } |
2817 | 2883 | ||
2884 | int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) | ||
2885 | { | ||
2886 | return VM_FAULT_SIGBUS; | ||
2887 | } | ||
2888 | |||
2818 | static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr) | 2889 | static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr) |
2819 | { | 2890 | { |
2820 | int ret; | 2891 | int ret; |
@@ -2998,6 +3069,8 @@ static void write_protect_slot(struct kvm *kvm, | |||
2998 | unsigned long *dirty_bitmap, | 3069 | unsigned long *dirty_bitmap, |
2999 | unsigned long nr_dirty_pages) | 3070 | unsigned long nr_dirty_pages) |
3000 | { | 3071 | { |
3072 | spin_lock(&kvm->mmu_lock); | ||
3073 | |||
3001 | /* Not many dirty pages compared to # of shadow pages. */ | 3074 | /* Not many dirty pages compared to # of shadow pages. */ |
3002 | if (nr_dirty_pages < kvm->arch.n_used_mmu_pages) { | 3075 | if (nr_dirty_pages < kvm->arch.n_used_mmu_pages) { |
3003 | unsigned long gfn_offset; | 3076 | unsigned long gfn_offset; |
@@ -3005,16 +3078,13 @@ static void write_protect_slot(struct kvm *kvm, | |||
3005 | for_each_set_bit(gfn_offset, dirty_bitmap, memslot->npages) { | 3078 | for_each_set_bit(gfn_offset, dirty_bitmap, memslot->npages) { |
3006 | unsigned long gfn = memslot->base_gfn + gfn_offset; | 3079 | unsigned long gfn = memslot->base_gfn + gfn_offset; |
3007 | 3080 | ||
3008 | spin_lock(&kvm->mmu_lock); | ||
3009 | kvm_mmu_rmap_write_protect(kvm, gfn, memslot); | 3081 | kvm_mmu_rmap_write_protect(kvm, gfn, memslot); |
3010 | spin_unlock(&kvm->mmu_lock); | ||
3011 | } | 3082 | } |
3012 | kvm_flush_remote_tlbs(kvm); | 3083 | kvm_flush_remote_tlbs(kvm); |
3013 | } else { | 3084 | } else |
3014 | spin_lock(&kvm->mmu_lock); | ||
3015 | kvm_mmu_slot_remove_write_access(kvm, memslot->id); | 3085 | kvm_mmu_slot_remove_write_access(kvm, memslot->id); |
3016 | spin_unlock(&kvm->mmu_lock); | 3086 | |
3017 | } | 3087 | spin_unlock(&kvm->mmu_lock); |
3018 | } | 3088 | } |
3019 | 3089 | ||
3020 | /* | 3090 | /* |
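Note: the write_protect_slot() change above is a lock-coarsening: instead of taking and dropping mmu_lock once per dirty gfn, the lock is now held across the whole pass and across the slot-wide path as well. The pattern in isolation, as a runnable sketch with a pthread mutex standing in for the kernel spinlock (names are local to the sketch):

    #include <pthread.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

    static void protect_one(unsigned long gfn)
    {
        (void)gfn;      /* stand-in for kvm_mmu_rmap_write_protect() */
    }

    /* Coarsened form: one lock acquisition for the whole bitmap walk
     * instead of one per set bit. */
    static void protect_all(const unsigned long *gfns, int n)
    {
        pthread_mutex_lock(&lock);
        for (int i = 0; i < n; i++)
            protect_one(gfns[i]);
        pthread_mutex_unlock(&lock);
    }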
@@ -3133,6 +3203,9 @@ long kvm_arch_vm_ioctl(struct file *filp, | |||
3133 | r = -EEXIST; | 3203 | r = -EEXIST; |
3134 | if (kvm->arch.vpic) | 3204 | if (kvm->arch.vpic) |
3135 | goto create_irqchip_unlock; | 3205 | goto create_irqchip_unlock; |
3206 | r = -EINVAL; | ||
3207 | if (atomic_read(&kvm->online_vcpus)) | ||
3208 | goto create_irqchip_unlock; | ||
3136 | r = -ENOMEM; | 3209 | r = -ENOMEM; |
3137 | vpic = kvm_create_pic(kvm); | 3210 | vpic = kvm_create_pic(kvm); |
3138 | if (vpic) { | 3211 | if (vpic) { |
@@ -4063,6 +4136,11 @@ static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val) | |||
4063 | return res; | 4136 | return res; |
4064 | } | 4137 | } |
4065 | 4138 | ||
4139 | static void emulator_set_rflags(struct x86_emulate_ctxt *ctxt, ulong val) | ||
4140 | { | ||
4141 | kvm_set_rflags(emul_to_vcpu(ctxt), val); | ||
4142 | } | ||
4143 | |||
4066 | static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt) | 4144 | static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt) |
4067 | { | 4145 | { |
4068 | return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt)); | 4146 | return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt)); |
@@ -4244,6 +4322,7 @@ static struct x86_emulate_ops emulate_ops = { | |||
4244 | .set_idt = emulator_set_idt, | 4322 | .set_idt = emulator_set_idt, |
4245 | .get_cr = emulator_get_cr, | 4323 | .get_cr = emulator_get_cr, |
4246 | .set_cr = emulator_set_cr, | 4324 | .set_cr = emulator_set_cr, |
4325 | .set_rflags = emulator_set_rflags, | ||
4247 | .cpl = emulator_get_cpl, | 4326 | .cpl = emulator_get_cpl, |
4248 | .get_dr = emulator_get_dr, | 4327 | .get_dr = emulator_get_dr, |
4249 | .set_dr = emulator_set_dr, | 4328 | .set_dr = emulator_set_dr, |
@@ -5288,6 +5367,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) | |||
5288 | profile_hit(KVM_PROFILING, (void *)rip); | 5367 | profile_hit(KVM_PROFILING, (void *)rip); |
5289 | } | 5368 | } |
5290 | 5369 | ||
5370 | if (unlikely(vcpu->arch.tsc_always_catchup)) | ||
5371 | kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); | ||
5291 | 5372 | ||
5292 | kvm_lapic_sync_from_vapic(vcpu); | 5373 | kvm_lapic_sync_from_vapic(vcpu); |
5293 | 5374 | ||
@@ -5587,15 +5668,15 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, | |||
5587 | return 0; | 5668 | return 0; |
5588 | } | 5669 | } |
5589 | 5670 | ||
5590 | int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, | 5671 | int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, |
5591 | bool has_error_code, u32 error_code) | 5672 | int reason, bool has_error_code, u32 error_code) |
5592 | { | 5673 | { |
5593 | struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; | 5674 | struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; |
5594 | int ret; | 5675 | int ret; |
5595 | 5676 | ||
5596 | init_emulate_ctxt(vcpu); | 5677 | init_emulate_ctxt(vcpu); |
5597 | 5678 | ||
5598 | ret = emulator_task_switch(ctxt, tss_selector, reason, | 5679 | ret = emulator_task_switch(ctxt, tss_selector, idt_index, reason, |
5599 | has_error_code, error_code); | 5680 | has_error_code, error_code); |
5600 | 5681 | ||
5601 | if (ret) | 5682 | if (ret) |
@@ -5928,13 +6009,88 @@ int kvm_arch_hardware_enable(void *garbage) | |||
5928 | struct kvm *kvm; | 6009 | struct kvm *kvm; |
5929 | struct kvm_vcpu *vcpu; | 6010 | struct kvm_vcpu *vcpu; |
5930 | int i; | 6011 | int i; |
6012 | int ret; | ||
6013 | u64 local_tsc; | ||
6014 | u64 max_tsc = 0; | ||
6015 | bool stable, backwards_tsc = false; | ||
5931 | 6016 | ||
5932 | kvm_shared_msr_cpu_online(); | 6017 | kvm_shared_msr_cpu_online(); |
5933 | list_for_each_entry(kvm, &vm_list, vm_list) | 6018 | ret = kvm_x86_ops->hardware_enable(garbage); |
5934 | kvm_for_each_vcpu(i, vcpu, kvm) | 6019 | if (ret != 0) |
5935 | if (vcpu->cpu == smp_processor_id()) | 6020 | return ret; |
5936 | kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); | 6021 | |
5937 | return kvm_x86_ops->hardware_enable(garbage); | 6022 | local_tsc = native_read_tsc(); |
6023 | stable = !check_tsc_unstable(); | ||
6024 | list_for_each_entry(kvm, &vm_list, vm_list) { | ||
6025 | kvm_for_each_vcpu(i, vcpu, kvm) { | ||
6026 | if (!stable && vcpu->cpu == smp_processor_id()) | ||
6027 | set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests); | ||
6028 | if (stable && vcpu->arch.last_host_tsc > local_tsc) { | ||
6029 | backwards_tsc = true; | ||
6030 | if (vcpu->arch.last_host_tsc > max_tsc) | ||
6031 | max_tsc = vcpu->arch.last_host_tsc; | ||
6032 | } | ||
6033 | } | ||
6034 | } | ||
6035 | |||
6036 | /* | ||
6037 | * Sometimes, even reliable TSCs go backwards. This happens on | ||
6038 | * platforms that reset TSC during suspend or hibernate actions, but | ||
6039 | * maintain synchronization. We must compensate. Fortunately, we can | ||
6040 | * detect that condition here, which happens early in CPU bringup, | ||
6041 | * before any KVM threads can be running. Unfortunately, we can't | ||
6042 | * bring the TSCs fully up to date with real time, as we aren't yet far | ||
6043 | * enough into CPU bringup that we know how much real time has actually | ||
6044 | * elapsed; our helper function, get_kernel_ns() will be using boot | ||
6045 | * variables that haven't been updated yet. | ||
6046 | * | ||
6047 | * So we simply find the maximum observed TSC above, then record the | ||
6048 | * adjustment to TSC in each VCPU. When the VCPU later gets loaded, | ||
6049 | * the adjustment will be applied. Note that we accumulate | ||
6050 | * adjustments, in case multiple suspend cycles happen before some VCPU | ||
6051 | * gets a chance to run again. In the event that no KVM threads get a | ||
6052 | * chance to run, we will miss the entire elapsed period, as we'll have | ||
6053 | * reset last_host_tsc, so VCPUs will not have the TSC adjusted and may | ||
6054 | * lose cycle time. This isn't too big a deal, since the loss will be | ||
6055 | * uniform across all VCPUs (not to mention the scenario is extremely | ||
6056 | * unlikely). It is possible that a second hibernate recovery happens | ||
6057 | * much faster than a first, causing the observed TSC here to be | ||
6058 | * smaller; this would require additional padding adjustment, which is | ||
6059 | * why we set last_host_tsc to the local tsc observed here. | ||
6060 | * | ||
6061 | * N.B. - this code below runs only on platforms with reliable TSC, | ||
6062 | * as that is the only way backwards_tsc is set above. Also note | ||
6063 | * that this runs for ALL vcpus, which is not a bug; all VCPUs should | ||
6064 | * have the same delta_cyc adjustment applied if backwards_tsc | ||
6065 | * is detected. Note further, this adjustment is only done once, | ||
6066 | * as we reset last_host_tsc on all VCPUs to stop this from being | ||
6067 | * called multiple times (one for each physical CPU bringup). | ||
6068 | * | ||
6069 | * Platforms with unreliable TSCs don't have to deal with this; they | ||
6070 | * will be compensated by the logic in vcpu_load, which sets the TSC to | ||
6071 | * catchup mode. This will catchup all VCPUs to real time, but cannot | ||
6072 | * guarantee that they stay in perfect synchronization. | ||
6073 | */ | ||
6074 | if (backwards_tsc) { | ||
6075 | u64 delta_cyc = max_tsc - local_tsc; | ||
6076 | list_for_each_entry(kvm, &vm_list, vm_list) { | ||
6077 | kvm_for_each_vcpu(i, vcpu, kvm) { | ||
6078 | vcpu->arch.tsc_offset_adjustment += delta_cyc; | ||
6079 | vcpu->arch.last_host_tsc = local_tsc; | ||
6080 | } | ||
6081 | |||
6082 | /* | ||
6083 | * We have to disable TSC offset matching; if you were | ||
6084 | * booting a VM while issuing an S4 host suspend, | ||
6085 | * you may have some problem. Solving this issue is | ||
6086 | * left as an exercise to the reader. | ||
6087 | */ | ||
6088 | kvm->arch.last_tsc_nsec = 0; | ||
6089 | kvm->arch.last_tsc_write = 0; | ||
6090 | } | ||
6091 | |||
6092 | } | ||
6093 | return 0; | ||
5938 | } | 6094 | } |
5939 | 6095 | ||
5940 | void kvm_arch_hardware_disable(void *garbage) | 6096 | void kvm_arch_hardware_disable(void *garbage) |
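Note: the comment block above describes the suspend/resume compensation in prose; restated as a stand-alone sketch (names local to the sketch, not kernel symbols): find the largest TSC any VCPU last observed on a host CPU, and if the freshly read TSC is below it, accumulate the difference into a per-VCPU adjustment that vcpu_load applies later.

    #include <stdint.h>

    struct vcpu_sketch {
        uint64_t last_host_tsc;
        uint64_t tsc_offset_adjustment;
    };

    static void compensate_backwards_tsc(struct vcpu_sketch *v, int n,
                                         uint64_t local_tsc /* TSC read now */)
    {
        uint64_t max_tsc = 0;
        for (int i = 0; i < n; i++)
            if (v[i].last_host_tsc > max_tsc)
                max_tsc = v[i].last_host_tsc;

        if (max_tsc > local_tsc) {      /* TSC jumped backwards across suspend */
            uint64_t delta_cyc = max_tsc - local_tsc;
            for (int i = 0; i < n; i++) {
                /* accumulate: several suspend cycles may pass before a
                 * VCPU runs and consumes the adjustment in vcpu_load */
                v[i].tsc_offset_adjustment += delta_cyc;
                v[i].last_host_tsc = local_tsc;
            }
        }
    }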
@@ -5958,6 +6114,11 @@ void kvm_arch_check_processor_compat(void *rtn) | |||
5958 | kvm_x86_ops->check_processor_compatibility(rtn); | 6114 | kvm_x86_ops->check_processor_compatibility(rtn); |
5959 | } | 6115 | } |
5960 | 6116 | ||
6117 | bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu) | ||
6118 | { | ||
6119 | return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL); | ||
6120 | } | ||
6121 | |||
5961 | int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) | 6122 | int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) |
5962 | { | 6123 | { |
5963 | struct page *page; | 6124 | struct page *page; |
@@ -5980,7 +6141,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) | |||
5980 | } | 6141 | } |
5981 | vcpu->arch.pio_data = page_address(page); | 6142 | vcpu->arch.pio_data = page_address(page); |
5982 | 6143 | ||
5983 | kvm_init_tsc_catchup(vcpu, max_tsc_khz); | 6144 | kvm_set_tsc_khz(vcpu, max_tsc_khz); |
5984 | 6145 | ||
5985 | r = kvm_mmu_create(vcpu); | 6146 | r = kvm_mmu_create(vcpu); |
5986 | if (r < 0) | 6147 | if (r < 0) |
@@ -6032,8 +6193,11 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) | |||
6032 | free_page((unsigned long)vcpu->arch.pio_data); | 6193 | free_page((unsigned long)vcpu->arch.pio_data); |
6033 | } | 6194 | } |
6034 | 6195 | ||
6035 | int kvm_arch_init_vm(struct kvm *kvm) | 6196 | int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) |
6036 | { | 6197 | { |
6198 | if (type) | ||
6199 | return -EINVAL; | ||
6200 | |||
6037 | INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); | 6201 | INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); |
6038 | INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); | 6202 | INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); |
6039 | 6203 | ||
@@ -6093,6 +6257,65 @@ void kvm_arch_destroy_vm(struct kvm *kvm) | |||
6093 | put_page(kvm->arch.ept_identity_pagetable); | 6257 | put_page(kvm->arch.ept_identity_pagetable); |
6094 | } | 6258 | } |
6095 | 6259 | ||
6260 | void kvm_arch_free_memslot(struct kvm_memory_slot *free, | ||
6261 | struct kvm_memory_slot *dont) | ||
6262 | { | ||
6263 | int i; | ||
6264 | |||
6265 | for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { | ||
6266 | if (!dont || free->arch.lpage_info[i] != dont->arch.lpage_info[i]) { | ||
6267 | vfree(free->arch.lpage_info[i]); | ||
6268 | free->arch.lpage_info[i] = NULL; | ||
6269 | } | ||
6270 | } | ||
6271 | } | ||
6272 | |||
6273 | int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) | ||
6274 | { | ||
6275 | int i; | ||
6276 | |||
6277 | for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { | ||
6278 | unsigned long ugfn; | ||
6279 | int lpages; | ||
6280 | int level = i + 2; | ||
6281 | |||
6282 | lpages = gfn_to_index(slot->base_gfn + npages - 1, | ||
6283 | slot->base_gfn, level) + 1; | ||
6284 | |||
6285 | slot->arch.lpage_info[i] = | ||
6286 | vzalloc(lpages * sizeof(*slot->arch.lpage_info[i])); | ||
6287 | if (!slot->arch.lpage_info[i]) | ||
6288 | goto out_free; | ||
6289 | |||
6290 | if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1)) | ||
6291 | slot->arch.lpage_info[i][0].write_count = 1; | ||
6292 | if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1)) | ||
6293 | slot->arch.lpage_info[i][lpages - 1].write_count = 1; | ||
6294 | ugfn = slot->userspace_addr >> PAGE_SHIFT; | ||
6295 | /* | ||
6296 | * If the gfn and userspace address are not aligned wrt each | ||
6297 | * other, or if explicitly asked to, disable large page | ||
6298 | * support for this slot | ||
6299 | */ | ||
6300 | if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) || | ||
6301 | !kvm_largepages_enabled()) { | ||
6302 | unsigned long j; | ||
6303 | |||
6304 | for (j = 0; j < lpages; ++j) | ||
6305 | slot->arch.lpage_info[i][j].write_count = 1; | ||
6306 | } | ||
6307 | } | ||
6308 | |||
6309 | return 0; | ||
6310 | |||
6311 | out_free: | ||
6312 | for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { | ||
6313 | vfree(slot->arch.lpage_info[i]); | ||
6314 | slot->arch.lpage_info[i] = NULL; | ||
6315 | } | ||
6316 | return -ENOMEM; | ||
6317 | } | ||
6318 | |||
6096 | int kvm_arch_prepare_memory_region(struct kvm *kvm, | 6319 | int kvm_arch_prepare_memory_region(struct kvm *kvm, |
6097 | struct kvm_memory_slot *memslot, | 6320 | struct kvm_memory_slot *memslot, |
6098 | struct kvm_memory_slot old, | 6321 | struct kvm_memory_slot old, |
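Note: the new kvm_arch_create_memslot() sizes one lpage_info array per large-page level; the count is the number of level-sized chunks the slot touches, including partial chunks at either end. A small stand-alone sketch of that sizing arithmetic (the kernel expresses it through gfn_to_index() and KVM_PAGES_PER_HPAGE; the 9-bits-per-level figure is x86-specific, and the example slot is assumed):

    #include <stdio.h>

    /* On x86 each higher page-table level covers 9 more gfn bits, so a
     * level-N mapping spans 2^(9*(N-1)) 4 KiB pages
     * (level 2 = 2 MiB, level 3 = 1 GiB). */
    static unsigned long lpages_for_slot(unsigned long base_gfn,
                                         unsigned long npages, int level)
    {
        unsigned int shift = 9 * (level - 1);
        unsigned long first = base_gfn >> shift;
        unsigned long last = (base_gfn + npages - 1) >> shift;
        return last - first + 1;
    }

    int main(void)
    {
        /* a 1 GiB slot that starts 1 MiB into guest physical memory
         * needs 513 entries at the 2 MiB level, not 512 */
        printf("%lu\n", lpages_for_slot(0x100, 0x40000, 2));
        return 0;
    }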