aboutsummaryrefslogtreecommitdiffstats
path: root/arch
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2008-06-24 21:09:06 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2008-06-24 21:09:06 -0400
commit919c0d14ae93073a3957c018a6d86ceb1e2e454b (patch)
treeb654cfcee21adaf4842447e4fe21f74aa956bcca /arch
parentde08341a0ef747d607542af3ae441b286f503e35 (diff)
parent6b1ed9086592fd4b066daae222751bb6757ca5eb (diff)
Merge branch 'kvm-updates-2.6.26' of git://git.kernel.org/pub/scm/linux/kernel/git/avi/kvm
* 'kvm-updates-2.6.26' of git://git.kernel.org/pub/scm/linux/kernel/git/avi/kvm: KVM: Remove now unused structs from kvm_para.h x86: KVM guest: Use the paravirt clocksource structs and functions KVM: Make kvm host use the paravirt clocksource structs x86: Make xen use the paravirt clocksource structs and functions x86: Add structs and functions for paravirt clocksource KVM: VMX: Fix host msr corruption with preemption enabled KVM: ioapic: fix lost interrupt when changing a device's irq KVM: MMU: Fix oops on guest userspace access to guest pagetable KVM: MMU: large page update_pte issue with non-PAE 32-bit guests (resend) KVM: MMU: Fix rmap_write_protect() hugepage iteration bug KVM: close timer injection race window in __vcpu_run KVM: Fix race between timer migration and vcpu migration
Diffstat (limited to 'arch')
-rw-r--r--arch/x86/Kconfig5
-rw-r--r--arch/x86/kernel/Makefile1
-rw-r--r--arch/x86/kernel/kvmclock.c89
-rw-r--r--arch/x86/kernel/pvclock.c141
-rw-r--r--arch/x86/kvm/i8254.c9
-rw-r--r--arch/x86/kvm/lapic.c1
-rw-r--r--arch/x86/kvm/mmu.c19
-rw-r--r--arch/x86/kvm/vmx.c19
-rw-r--r--arch/x86/kvm/x86.c91
-rw-r--r--arch/x86/xen/Kconfig1
-rw-r--r--arch/x86/xen/time.c132
11 files changed, 285 insertions, 223 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 52e18e6d2ba0..e0edaaa6920a 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -383,6 +383,7 @@ config VMI
383config KVM_CLOCK 383config KVM_CLOCK
384 bool "KVM paravirtualized clock" 384 bool "KVM paravirtualized clock"
385 select PARAVIRT 385 select PARAVIRT
386 select PARAVIRT_CLOCK
386 depends on !(X86_VISWS || X86_VOYAGER) 387 depends on !(X86_VISWS || X86_VOYAGER)
387 help 388 help
388 Turning on this option will allow you to run a paravirtualized clock 389 Turning on this option will allow you to run a paravirtualized clock
@@ -410,6 +411,10 @@ config PARAVIRT
410 over full virtualization. However, when run without a hypervisor 411 over full virtualization. However, when run without a hypervisor
411 the kernel is theoretically slower and slightly larger. 412 the kernel is theoretically slower and slightly larger.
412 413
414config PARAVIRT_CLOCK
415 bool
416 default n
417
413endif 418endif
414 419
415config MEMTEST_BOOTPARAM 420config MEMTEST_BOOTPARAM
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 5e618c3b4720..77807d4769c9 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -82,6 +82,7 @@ obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o
82obj-$(CONFIG_KVM_GUEST) += kvm.o 82obj-$(CONFIG_KVM_GUEST) += kvm.o
83obj-$(CONFIG_KVM_CLOCK) += kvmclock.o 83obj-$(CONFIG_KVM_CLOCK) += kvmclock.o
84obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o 84obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o
85obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o
85 86
86obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o 87obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o
87 88
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 08a30986d472..87edf1ceb1df 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -18,6 +18,7 @@
18 18
19#include <linux/clocksource.h> 19#include <linux/clocksource.h>
20#include <linux/kvm_para.h> 20#include <linux/kvm_para.h>
21#include <asm/pvclock.h>
21#include <asm/arch_hooks.h> 22#include <asm/arch_hooks.h>
22#include <asm/msr.h> 23#include <asm/msr.h>
23#include <asm/apic.h> 24#include <asm/apic.h>
@@ -36,18 +37,9 @@ static int parse_no_kvmclock(char *arg)
36early_param("no-kvmclock", parse_no_kvmclock); 37early_param("no-kvmclock", parse_no_kvmclock);
37 38
38/* The hypervisor will put information about time periodically here */ 39/* The hypervisor will put information about time periodically here */
39static DEFINE_PER_CPU_SHARED_ALIGNED(struct kvm_vcpu_time_info, hv_clock); 40static DEFINE_PER_CPU_SHARED_ALIGNED(struct pvclock_vcpu_time_info, hv_clock);
40#define get_clock(cpu, field) per_cpu(hv_clock, cpu).field 41static struct pvclock_wall_clock wall_clock;
41 42
42static inline u64 kvm_get_delta(u64 last_tsc)
43{
44 int cpu = smp_processor_id();
45 u64 delta = native_read_tsc() - last_tsc;
46 return (delta * get_clock(cpu, tsc_to_system_mul)) >> KVM_SCALE;
47}
48
49static struct kvm_wall_clock wall_clock;
50static cycle_t kvm_clock_read(void);
51/* 43/*
52 * The wallclock is the time of day when we booted. Since then, some time may 44 * The wallclock is the time of day when we booted. Since then, some time may
53 * have elapsed since the hypervisor wrote the data. So we try to account for 45 * have elapsed since the hypervisor wrote the data. So we try to account for
@@ -55,64 +47,37 @@ static cycle_t kvm_clock_read(void);
55 */ 47 */
56static unsigned long kvm_get_wallclock(void) 48static unsigned long kvm_get_wallclock(void)
57{ 49{
58 u32 wc_sec, wc_nsec; 50 struct pvclock_vcpu_time_info *vcpu_time;
59 u64 delta;
60 struct timespec ts; 51 struct timespec ts;
61 int version, nsec;
62 int low, high; 52 int low, high;
63 53
64 low = (int)__pa(&wall_clock); 54 low = (int)__pa(&wall_clock);
65 high = ((u64)__pa(&wall_clock) >> 32); 55 high = ((u64)__pa(&wall_clock) >> 32);
56 native_write_msr(MSR_KVM_WALL_CLOCK, low, high);
66 57
67 delta = kvm_clock_read(); 58 vcpu_time = &get_cpu_var(hv_clock);
59 pvclock_read_wallclock(&wall_clock, vcpu_time, &ts);
60 put_cpu_var(hv_clock);
68 61
69 native_write_msr(MSR_KVM_WALL_CLOCK, low, high); 62 return ts.tv_sec;
70 do {
71 version = wall_clock.wc_version;
72 rmb();
73 wc_sec = wall_clock.wc_sec;
74 wc_nsec = wall_clock.wc_nsec;
75 rmb();
76 } while ((wall_clock.wc_version != version) || (version & 1));
77
78 delta = kvm_clock_read() - delta;
79 delta += wc_nsec;
80 nsec = do_div(delta, NSEC_PER_SEC);
81 set_normalized_timespec(&ts, wc_sec + delta, nsec);
82 /*
83 * Of all mechanisms of time adjustment I've tested, this one
84 * was the champion!
85 */
86 return ts.tv_sec + 1;
87} 63}
88 64
89static int kvm_set_wallclock(unsigned long now) 65static int kvm_set_wallclock(unsigned long now)
90{ 66{
91 return 0; 67 return -1;
92} 68}
93 69
94/*
95 * This is our read_clock function. The host puts an tsc timestamp each time
96 * it updates a new time. Without the tsc adjustment, we can have a situation
97 * in which a vcpu starts to run earlier (smaller system_time), but probes
98 * time later (compared to another vcpu), leading to backwards time
99 */
100static cycle_t kvm_clock_read(void) 70static cycle_t kvm_clock_read(void)
101{ 71{
102 u64 last_tsc, now; 72 struct pvclock_vcpu_time_info *src;
103 int cpu; 73 cycle_t ret;
104 74
105 preempt_disable(); 75 src = &get_cpu_var(hv_clock);
106 cpu = smp_processor_id(); 76 ret = pvclock_clocksource_read(src);
107 77 put_cpu_var(hv_clock);
108 last_tsc = get_clock(cpu, tsc_timestamp); 78 return ret;
109 now = get_clock(cpu, system_time);
110
111 now += kvm_get_delta(last_tsc);
112 preempt_enable();
113
114 return now;
115} 79}
80
116static struct clocksource kvm_clock = { 81static struct clocksource kvm_clock = {
117 .name = "kvm-clock", 82 .name = "kvm-clock",
118 .read = kvm_clock_read, 83 .read = kvm_clock_read,
@@ -123,13 +88,14 @@ static struct clocksource kvm_clock = {
123 .flags = CLOCK_SOURCE_IS_CONTINUOUS, 88 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
124}; 89};
125 90
126static int kvm_register_clock(void) 91static int kvm_register_clock(char *txt)
127{ 92{
128 int cpu = smp_processor_id(); 93 int cpu = smp_processor_id();
129 int low, high; 94 int low, high;
130 low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1; 95 low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1;
131 high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32); 96 high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32);
132 97 printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n",
98 cpu, high, low, txt);
133 return native_write_msr_safe(MSR_KVM_SYSTEM_TIME, low, high); 99 return native_write_msr_safe(MSR_KVM_SYSTEM_TIME, low, high);
134} 100}
135 101
@@ -140,12 +106,20 @@ static void kvm_setup_secondary_clock(void)
140 * Now that the first cpu already had this clocksource initialized, 106 * Now that the first cpu already had this clocksource initialized,
141 * we shouldn't fail. 107 * we shouldn't fail.
142 */ 108 */
143 WARN_ON(kvm_register_clock()); 109 WARN_ON(kvm_register_clock("secondary cpu clock"));
144 /* ok, done with our trickery, call native */ 110 /* ok, done with our trickery, call native */
145 setup_secondary_APIC_clock(); 111 setup_secondary_APIC_clock();
146} 112}
147#endif 113#endif
148 114
115#ifdef CONFIG_SMP
116void __init kvm_smp_prepare_boot_cpu(void)
117{
118 WARN_ON(kvm_register_clock("primary cpu clock"));
119 native_smp_prepare_boot_cpu();
120}
121#endif
122
149/* 123/*
150 * After the clock is registered, the host will keep writing to the 124 * After the clock is registered, the host will keep writing to the
151 * registered memory location. If the guest happens to shutdown, this memory 125 * registered memory location. If the guest happens to shutdown, this memory
@@ -174,7 +148,7 @@ void __init kvmclock_init(void)
174 return; 148 return;
175 149
176 if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) { 150 if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) {
177 if (kvm_register_clock()) 151 if (kvm_register_clock("boot clock"))
178 return; 152 return;
179 pv_time_ops.get_wallclock = kvm_get_wallclock; 153 pv_time_ops.get_wallclock = kvm_get_wallclock;
180 pv_time_ops.set_wallclock = kvm_set_wallclock; 154 pv_time_ops.set_wallclock = kvm_set_wallclock;
@@ -182,6 +156,9 @@ void __init kvmclock_init(void)
182#ifdef CONFIG_X86_LOCAL_APIC 156#ifdef CONFIG_X86_LOCAL_APIC
183 pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock; 157 pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock;
184#endif 158#endif
159#ifdef CONFIG_SMP
160 smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
161#endif
185 machine_ops.shutdown = kvm_shutdown; 162 machine_ops.shutdown = kvm_shutdown;
186#ifdef CONFIG_KEXEC 163#ifdef CONFIG_KEXEC
187 machine_ops.crash_shutdown = kvm_crash_shutdown; 164 machine_ops.crash_shutdown = kvm_crash_shutdown;
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
new file mode 100644
index 000000000000..05fbe9a0325a
--- /dev/null
+++ b/arch/x86/kernel/pvclock.c
@@ -0,0 +1,141 @@
1/* paravirtual clock -- common code used by kvm/xen
2
3 This program is free software; you can redistribute it and/or modify
4 it under the terms of the GNU General Public License as published by
5 the Free Software Foundation; either version 2 of the License, or
6 (at your option) any later version.
7
8 This program is distributed in the hope that it will be useful,
9 but WITHOUT ANY WARRANTY; without even the implied warranty of
10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 GNU General Public License for more details.
12
13 You should have received a copy of the GNU General Public License
14 along with this program; if not, write to the Free Software
15 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
16*/
17
18#include <linux/kernel.h>
19#include <linux/percpu.h>
20#include <asm/pvclock.h>
21
22/*
23 * These are perodically updated
24 * xen: magic shared_info page
25 * kvm: gpa registered via msr
26 * and then copied here.
27 */
28struct pvclock_shadow_time {
29 u64 tsc_timestamp; /* TSC at last update of time vals. */
30 u64 system_timestamp; /* Time, in nanosecs, since boot. */
31 u32 tsc_to_nsec_mul;
32 int tsc_shift;
33 u32 version;
34};
35
36/*
37 * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
38 * yielding a 64-bit result.
39 */
40static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
41{
42 u64 product;
43#ifdef __i386__
44 u32 tmp1, tmp2;
45#endif
46
47 if (shift < 0)
48 delta >>= -shift;
49 else
50 delta <<= shift;
51
52#ifdef __i386__
53 __asm__ (
54 "mul %5 ; "
55 "mov %4,%%eax ; "
56 "mov %%edx,%4 ; "
57 "mul %5 ; "
58 "xor %5,%5 ; "
59 "add %4,%%eax ; "
60 "adc %5,%%edx ; "
61 : "=A" (product), "=r" (tmp1), "=r" (tmp2)
62 : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
63#elif __x86_64__
64 __asm__ (
65 "mul %%rdx ; shrd $32,%%rdx,%%rax"
66 : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
67#else
68#error implement me!
69#endif
70
71 return product;
72}
73
74static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow)
75{
76 u64 delta = native_read_tsc() - shadow->tsc_timestamp;
77 return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
78}
79
80/*
81 * Reads a consistent set of time-base values from hypervisor,
82 * into a shadow data area.
83 */
84static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst,
85 struct pvclock_vcpu_time_info *src)
86{
87 do {
88 dst->version = src->version;
89 rmb(); /* fetch version before data */
90 dst->tsc_timestamp = src->tsc_timestamp;
91 dst->system_timestamp = src->system_time;
92 dst->tsc_to_nsec_mul = src->tsc_to_system_mul;
93 dst->tsc_shift = src->tsc_shift;
94 rmb(); /* test version after fetching data */
95 } while ((src->version & 1) || (dst->version != src->version));
96
97 return dst->version;
98}
99
100cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
101{
102 struct pvclock_shadow_time shadow;
103 unsigned version;
104 cycle_t ret, offset;
105
106 do {
107 version = pvclock_get_time_values(&shadow, src);
108 barrier();
109 offset = pvclock_get_nsec_offset(&shadow);
110 ret = shadow.system_timestamp + offset;
111 barrier();
112 } while (version != src->version);
113
114 return ret;
115}
116
117void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
118 struct pvclock_vcpu_time_info *vcpu_time,
119 struct timespec *ts)
120{
121 u32 version;
122 u64 delta;
123 struct timespec now;
124
125 /* get wallclock at system boot */
126 do {
127 version = wall_clock->version;
128 rmb(); /* fetch version before time */
129 now.tv_sec = wall_clock->sec;
130 now.tv_nsec = wall_clock->nsec;
131 rmb(); /* fetch time before checking version */
132 } while ((wall_clock->version & 1) || (version != wall_clock->version));
133
134 delta = pvclock_clocksource_read(vcpu_time); /* time since system boot */
135 delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec;
136
137 now.tv_nsec = do_div(delta, NSEC_PER_SEC);
138 now.tv_sec = delta;
139
140 set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
141}
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index f2f5d260874e..3829aa7b663f 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -200,9 +200,12 @@ int __pit_timer_fn(struct kvm_kpit_state *ps)
200 200
201 atomic_inc(&pt->pending); 201 atomic_inc(&pt->pending);
202 smp_mb__after_atomic_inc(); 202 smp_mb__after_atomic_inc();
203 if (vcpu0 && waitqueue_active(&vcpu0->wq)) { 203 if (vcpu0) {
204 vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE; 204 set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests);
205 wake_up_interruptible(&vcpu0->wq); 205 if (waitqueue_active(&vcpu0->wq)) {
206 vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE;
207 wake_up_interruptible(&vcpu0->wq);
208 }
206 } 209 }
207 210
208 pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period); 211 pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index c297c50eba63..ebc03f5ae162 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -940,6 +940,7 @@ static int __apic_timer_fn(struct kvm_lapic *apic)
940 wait_queue_head_t *q = &apic->vcpu->wq; 940 wait_queue_head_t *q = &apic->vcpu->wq;
941 941
942 atomic_inc(&apic->timer.pending); 942 atomic_inc(&apic->timer.pending);
943 set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests);
943 if (waitqueue_active(q)) { 944 if (waitqueue_active(q)) {
944 apic->vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 945 apic->vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
945 wake_up_interruptible(q); 946 wake_up_interruptible(q);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index ee3f53098f0c..7e7c3969f7a2 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -640,6 +640,7 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
640 rmap_remove(kvm, spte); 640 rmap_remove(kvm, spte);
641 --kvm->stat.lpages; 641 --kvm->stat.lpages;
642 set_shadow_pte(spte, shadow_trap_nonpresent_pte); 642 set_shadow_pte(spte, shadow_trap_nonpresent_pte);
643 spte = NULL;
643 write_protected = 1; 644 write_protected = 1;
644 } 645 }
645 spte = rmap_next(kvm, rmapp, spte); 646 spte = rmap_next(kvm, rmapp, spte);
@@ -1082,10 +1083,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1082 struct kvm_mmu_page *shadow; 1083 struct kvm_mmu_page *shadow;
1083 1084
1084 spte |= PT_WRITABLE_MASK; 1085 spte |= PT_WRITABLE_MASK;
1085 if (user_fault) {
1086 mmu_unshadow(vcpu->kvm, gfn);
1087 goto unshadowed;
1088 }
1089 1086
1090 shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn); 1087 shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
1091 if (shadow || 1088 if (shadow ||
@@ -1102,8 +1099,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1102 } 1099 }
1103 } 1100 }
1104 1101
1105unshadowed:
1106
1107 if (pte_access & ACC_WRITE_MASK) 1102 if (pte_access & ACC_WRITE_MASK)
1108 mark_page_dirty(vcpu->kvm, gfn); 1103 mark_page_dirty(vcpu->kvm, gfn);
1109 1104
@@ -1580,11 +1575,13 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
1580 u64 *spte, 1575 u64 *spte,
1581 const void *new) 1576 const void *new)
1582{ 1577{
1583 if ((sp->role.level != PT_PAGE_TABLE_LEVEL) 1578 if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
1584 && !vcpu->arch.update_pte.largepage) { 1579 if (!vcpu->arch.update_pte.largepage ||
1585 ++vcpu->kvm->stat.mmu_pde_zapped; 1580 sp->role.glevels == PT32_ROOT_LEVEL) {
1586 return; 1581 ++vcpu->kvm->stat.mmu_pde_zapped;
1587 } 1582 return;
1583 }
1584 }
1588 1585
1589 ++vcpu->kvm->stat.mmu_pte_updated; 1586 ++vcpu->kvm->stat.mmu_pte_updated;
1590 if (sp->role.glevels == PT32_ROOT_LEVEL) 1587 if (sp->role.glevels == PT32_ROOT_LEVEL)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 02efbe75f317..540e95179074 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -566,7 +566,7 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
566 load_transition_efer(vmx); 566 load_transition_efer(vmx);
567} 567}
568 568
569static void vmx_load_host_state(struct vcpu_vmx *vmx) 569static void __vmx_load_host_state(struct vcpu_vmx *vmx)
570{ 570{
571 unsigned long flags; 571 unsigned long flags;
572 572
@@ -596,6 +596,13 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx)
596 reload_host_efer(vmx); 596 reload_host_efer(vmx);
597} 597}
598 598
599static void vmx_load_host_state(struct vcpu_vmx *vmx)
600{
601 preempt_disable();
602 __vmx_load_host_state(vmx);
603 preempt_enable();
604}
605
599/* 606/*
600 * Switches to specified vcpu, until a matching vcpu_put(), but assumes 607 * Switches to specified vcpu, until a matching vcpu_put(), but assumes
601 * vcpu mutex is already taken. 608 * vcpu mutex is already taken.
@@ -654,7 +661,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
654 661
655static void vmx_vcpu_put(struct kvm_vcpu *vcpu) 662static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
656{ 663{
657 vmx_load_host_state(to_vmx(vcpu)); 664 __vmx_load_host_state(to_vmx(vcpu));
658} 665}
659 666
660static void vmx_fpu_activate(struct kvm_vcpu *vcpu) 667static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
@@ -884,11 +891,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
884 switch (msr_index) { 891 switch (msr_index) {
885#ifdef CONFIG_X86_64 892#ifdef CONFIG_X86_64
886 case MSR_EFER: 893 case MSR_EFER:
894 vmx_load_host_state(vmx);
887 ret = kvm_set_msr_common(vcpu, msr_index, data); 895 ret = kvm_set_msr_common(vcpu, msr_index, data);
888 if (vmx->host_state.loaded) {
889 reload_host_efer(vmx);
890 load_transition_efer(vmx);
891 }
892 break; 896 break;
893 case MSR_FS_BASE: 897 case MSR_FS_BASE:
894 vmcs_writel(GUEST_FS_BASE, data); 898 vmcs_writel(GUEST_FS_BASE, data);
@@ -910,11 +914,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
910 guest_write_tsc(data); 914 guest_write_tsc(data);
911 break; 915 break;
912 default: 916 default:
917 vmx_load_host_state(vmx);
913 msr = find_msr_entry(vmx, msr_index); 918 msr = find_msr_entry(vmx, msr_index);
914 if (msr) { 919 if (msr) {
915 msr->data = data; 920 msr->data = data;
916 if (vmx->host_state.loaded)
917 load_msrs(vmx->guest_msrs, vmx->save_nmsrs);
918 break; 921 break;
919 } 922 }
920 ret = kvm_set_msr_common(vcpu, msr_index, data); 923 ret = kvm_set_msr_common(vcpu, msr_index, data);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 00acf1301a15..63a77caa59f1 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -492,8 +492,8 @@ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
492static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) 492static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
493{ 493{
494 static int version; 494 static int version;
495 struct kvm_wall_clock wc; 495 struct pvclock_wall_clock wc;
496 struct timespec wc_ts; 496 struct timespec now, sys, boot;
497 497
498 if (!wall_clock) 498 if (!wall_clock)
499 return; 499 return;
@@ -502,10 +502,19 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
502 502
503 kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); 503 kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
504 504
505 wc_ts = current_kernel_time(); 505 /*
506 wc.wc_sec = wc_ts.tv_sec; 506 * The guest calculates current wall clock time by adding
507 wc.wc_nsec = wc_ts.tv_nsec; 507 * system time (updated by kvm_write_guest_time below) to the
508 wc.wc_version = version; 508 * wall clock specified here. guest system time equals host
509 * system time for us, thus we must fill in host boot time here.
510 */
511 now = current_kernel_time();
512 ktime_get_ts(&sys);
513 boot = ns_to_timespec(timespec_to_ns(&now) - timespec_to_ns(&sys));
514
515 wc.sec = boot.tv_sec;
516 wc.nsec = boot.tv_nsec;
517 wc.version = version;
509 518
510 kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc)); 519 kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
511 520
@@ -513,6 +522,45 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
513 kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); 522 kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
514} 523}
515 524
525static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
526{
527 uint32_t quotient, remainder;
528
529 /* Don't try to replace with do_div(), this one calculates
530 * "(dividend << 32) / divisor" */
531 __asm__ ( "divl %4"
532 : "=a" (quotient), "=d" (remainder)
533 : "0" (0), "1" (dividend), "r" (divisor) );
534 return quotient;
535}
536
537static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock)
538{
539 uint64_t nsecs = 1000000000LL;
540 int32_t shift = 0;
541 uint64_t tps64;
542 uint32_t tps32;
543
544 tps64 = tsc_khz * 1000LL;
545 while (tps64 > nsecs*2) {
546 tps64 >>= 1;
547 shift--;
548 }
549
550 tps32 = (uint32_t)tps64;
551 while (tps32 <= (uint32_t)nsecs) {
552 tps32 <<= 1;
553 shift++;
554 }
555
556 hv_clock->tsc_shift = shift;
557 hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32);
558
559 pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n",
560 __FUNCTION__, tsc_khz, hv_clock->tsc_shift,
561 hv_clock->tsc_to_system_mul);
562}
563
516static void kvm_write_guest_time(struct kvm_vcpu *v) 564static void kvm_write_guest_time(struct kvm_vcpu *v)
517{ 565{
518 struct timespec ts; 566 struct timespec ts;
@@ -523,6 +571,11 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
523 if ((!vcpu->time_page)) 571 if ((!vcpu->time_page))
524 return; 572 return;
525 573
574 if (unlikely(vcpu->hv_clock_tsc_khz != tsc_khz)) {
575 kvm_set_time_scale(tsc_khz, &vcpu->hv_clock);
576 vcpu->hv_clock_tsc_khz = tsc_khz;
577 }
578
526 /* Keep irq disabled to prevent changes to the clock */ 579 /* Keep irq disabled to prevent changes to the clock */
527 local_irq_save(flags); 580 local_irq_save(flags);
528 kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER, 581 kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER,
@@ -537,14 +590,14 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
537 /* 590 /*
538 * The interface expects us to write an even number signaling that the 591 * The interface expects us to write an even number signaling that the
539 * update is finished. Since the guest won't see the intermediate 592 * update is finished. Since the guest won't see the intermediate
540 * state, we just write "2" at the end 593 * state, we just increase by 2 at the end.
541 */ 594 */
542 vcpu->hv_clock.version = 2; 595 vcpu->hv_clock.version += 2;
543 596
544 shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0); 597 shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0);
545 598
546 memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock, 599 memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
547 sizeof(vcpu->hv_clock)); 600 sizeof(vcpu->hv_clock));
548 601
549 kunmap_atomic(shared_kaddr, KM_USER0); 602 kunmap_atomic(shared_kaddr, KM_USER0);
550 603
@@ -599,10 +652,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
599 /* ...but clean it before doing the actual write */ 652 /* ...but clean it before doing the actual write */
600 vcpu->arch.time_offset = data & ~(PAGE_MASK | 1); 653 vcpu->arch.time_offset = data & ~(PAGE_MASK | 1);
601 654
602 vcpu->arch.hv_clock.tsc_to_system_mul =
603 clocksource_khz2mult(tsc_khz, 22);
604 vcpu->arch.hv_clock.tsc_shift = 22;
605
606 down_read(&current->mm->mmap_sem); 655 down_read(&current->mm->mmap_sem);
607 vcpu->arch.time_page = 656 vcpu->arch.time_page =
608 gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT); 657 gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
@@ -2759,6 +2808,8 @@ again:
2759 if (vcpu->requests) { 2808 if (vcpu->requests) {
2760 if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) 2809 if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))
2761 __kvm_migrate_timers(vcpu); 2810 __kvm_migrate_timers(vcpu);
2811 if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
2812 kvm_x86_ops->tlb_flush(vcpu);
2762 if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, 2813 if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
2763 &vcpu->requests)) { 2814 &vcpu->requests)) {
2764 kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS; 2815 kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS;
@@ -2772,6 +2823,7 @@ again:
2772 } 2823 }
2773 } 2824 }
2774 2825
2826 clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
2775 kvm_inject_pending_timer_irqs(vcpu); 2827 kvm_inject_pending_timer_irqs(vcpu);
2776 2828
2777 preempt_disable(); 2829 preempt_disable();
@@ -2781,21 +2833,13 @@ again:
2781 2833
2782 local_irq_disable(); 2834 local_irq_disable();
2783 2835
2784 if (need_resched()) { 2836 if (vcpu->requests || need_resched()) {
2785 local_irq_enable(); 2837 local_irq_enable();
2786 preempt_enable(); 2838 preempt_enable();
2787 r = 1; 2839 r = 1;
2788 goto out; 2840 goto out;
2789 } 2841 }
2790 2842
2791 if (vcpu->requests)
2792 if (test_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) {
2793 local_irq_enable();
2794 preempt_enable();
2795 r = 1;
2796 goto out;
2797 }
2798
2799 if (signal_pending(current)) { 2843 if (signal_pending(current)) {
2800 local_irq_enable(); 2844 local_irq_enable();
2801 preempt_enable(); 2845 preempt_enable();
@@ -2825,9 +2869,6 @@ again:
2825 2869
2826 kvm_guest_enter(); 2870 kvm_guest_enter();
2827 2871
2828 if (vcpu->requests)
2829 if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
2830 kvm_x86_ops->tlb_flush(vcpu);
2831 2872
2832 KVMTRACE_0D(VMENTRY, vcpu, entryexit); 2873 KVMTRACE_0D(VMENTRY, vcpu, entryexit);
2833 kvm_x86_ops->run(vcpu, kvm_run); 2874 kvm_x86_ops->run(vcpu, kvm_run);
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 525b108411bd..6c388e593bc8 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -5,6 +5,7 @@
5config XEN 5config XEN
6 bool "Xen guest support" 6 bool "Xen guest support"
7 select PARAVIRT 7 select PARAVIRT
8 select PARAVIRT_CLOCK
8 depends on X86_32 9 depends on X86_32
9 depends on X86_CMPXCHG && X86_TSC && X86_PAE && !(X86_VISWS || X86_VOYAGER) 10 depends on X86_CMPXCHG && X86_TSC && X86_PAE && !(X86_VISWS || X86_VOYAGER)
10 help 11 help
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 52b2e3856980..41e217503c96 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -14,6 +14,7 @@
14#include <linux/kernel_stat.h> 14#include <linux/kernel_stat.h>
15#include <linux/math64.h> 15#include <linux/math64.h>
16 16
17#include <asm/pvclock.h>
17#include <asm/xen/hypervisor.h> 18#include <asm/xen/hypervisor.h>
18#include <asm/xen/hypercall.h> 19#include <asm/xen/hypercall.h>
19 20
@@ -31,17 +32,6 @@
31 32
32static cycle_t xen_clocksource_read(void); 33static cycle_t xen_clocksource_read(void);
33 34
34/* These are perodically updated in shared_info, and then copied here. */
35struct shadow_time_info {
36 u64 tsc_timestamp; /* TSC at last update of time vals. */
37 u64 system_timestamp; /* Time, in nanosecs, since boot. */
38 u32 tsc_to_nsec_mul;
39 int tsc_shift;
40 u32 version;
41};
42
43static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
44
45/* runstate info updated by Xen */ 35/* runstate info updated by Xen */
46static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate); 36static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
47 37
@@ -211,7 +201,7 @@ unsigned long long xen_sched_clock(void)
211unsigned long xen_cpu_khz(void) 201unsigned long xen_cpu_khz(void)
212{ 202{
213 u64 xen_khz = 1000000ULL << 32; 203 u64 xen_khz = 1000000ULL << 32;
214 const struct vcpu_time_info *info = 204 const struct pvclock_vcpu_time_info *info =
215 &HYPERVISOR_shared_info->vcpu_info[0].time; 205 &HYPERVISOR_shared_info->vcpu_info[0].time;
216 206
217 do_div(xen_khz, info->tsc_to_system_mul); 207 do_div(xen_khz, info->tsc_to_system_mul);
@@ -223,121 +213,26 @@ unsigned long xen_cpu_khz(void)
223 return xen_khz; 213 return xen_khz;
224} 214}
225 215
226/*
227 * Reads a consistent set of time-base values from Xen, into a shadow data
228 * area.
229 */
230static unsigned get_time_values_from_xen(void)
231{
232 struct vcpu_time_info *src;
233 struct shadow_time_info *dst;
234
235 /* src is shared memory with the hypervisor, so we need to
236 make sure we get a consistent snapshot, even in the face of
237 being preempted. */
238 src = &__get_cpu_var(xen_vcpu)->time;
239 dst = &__get_cpu_var(shadow_time);
240
241 do {
242 dst->version = src->version;
243 rmb(); /* fetch version before data */
244 dst->tsc_timestamp = src->tsc_timestamp;
245 dst->system_timestamp = src->system_time;
246 dst->tsc_to_nsec_mul = src->tsc_to_system_mul;
247 dst->tsc_shift = src->tsc_shift;
248 rmb(); /* test version after fetching data */
249 } while ((src->version & 1) | (dst->version ^ src->version));
250
251 return dst->version;
252}
253
254/*
255 * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
256 * yielding a 64-bit result.
257 */
258static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
259{
260 u64 product;
261#ifdef __i386__
262 u32 tmp1, tmp2;
263#endif
264
265 if (shift < 0)
266 delta >>= -shift;
267 else
268 delta <<= shift;
269
270#ifdef __i386__
271 __asm__ (
272 "mul %5 ; "
273 "mov %4,%%eax ; "
274 "mov %%edx,%4 ; "
275 "mul %5 ; "
276 "xor %5,%5 ; "
277 "add %4,%%eax ; "
278 "adc %5,%%edx ; "
279 : "=A" (product), "=r" (tmp1), "=r" (tmp2)
280 : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
281#elif __x86_64__
282 __asm__ (
283 "mul %%rdx ; shrd $32,%%rdx,%%rax"
284 : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
285#else
286#error implement me!
287#endif
288
289 return product;
290}
291
292static u64 get_nsec_offset(struct shadow_time_info *shadow)
293{
294 u64 now, delta;
295 now = native_read_tsc();
296 delta = now - shadow->tsc_timestamp;
297 return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
298}
299
300static cycle_t xen_clocksource_read(void) 216static cycle_t xen_clocksource_read(void)
301{ 217{
302 struct shadow_time_info *shadow = &get_cpu_var(shadow_time); 218 struct pvclock_vcpu_time_info *src;
303 cycle_t ret; 219 cycle_t ret;
304 unsigned version;
305
306 do {
307 version = get_time_values_from_xen();
308 barrier();
309 ret = shadow->system_timestamp + get_nsec_offset(shadow);
310 barrier();
311 } while (version != __get_cpu_var(xen_vcpu)->time.version);
312
313 put_cpu_var(shadow_time);
314 220
221 src = &get_cpu_var(xen_vcpu)->time;
222 ret = pvclock_clocksource_read(src);
223 put_cpu_var(xen_vcpu);
315 return ret; 224 return ret;
316} 225}
317 226
318static void xen_read_wallclock(struct timespec *ts) 227static void xen_read_wallclock(struct timespec *ts)
319{ 228{
320 const struct shared_info *s = HYPERVISOR_shared_info; 229 struct shared_info *s = HYPERVISOR_shared_info;
321 u32 version; 230 struct pvclock_wall_clock *wall_clock = &(s->wc);
322 u64 delta; 231 struct pvclock_vcpu_time_info *vcpu_time;
323 struct timespec now;
324
325 /* get wallclock at system boot */
326 do {
327 version = s->wc_version;
328 rmb(); /* fetch version before time */
329 now.tv_sec = s->wc_sec;
330 now.tv_nsec = s->wc_nsec;
331 rmb(); /* fetch time before checking version */
332 } while ((s->wc_version & 1) | (version ^ s->wc_version));
333 232
334 delta = xen_clocksource_read(); /* time since system boot */ 233 vcpu_time = &get_cpu_var(xen_vcpu)->time;
335 delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec; 234 pvclock_read_wallclock(wall_clock, vcpu_time, ts);
336 235 put_cpu_var(xen_vcpu);
337 now.tv_nsec = do_div(delta, NSEC_PER_SEC);
338 now.tv_sec = delta;
339
340 set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
341} 236}
342 237
343unsigned long xen_get_wallclock(void) 238unsigned long xen_get_wallclock(void)
@@ -345,7 +240,6 @@ unsigned long xen_get_wallclock(void)
345 struct timespec ts; 240 struct timespec ts;
346 241
347 xen_read_wallclock(&ts); 242 xen_read_wallclock(&ts);
348
349 return ts.tv_sec; 243 return ts.tv_sec;
350} 244}
351 245
@@ -569,8 +463,6 @@ __init void xen_time_init(void)
569{ 463{
570 int cpu = smp_processor_id(); 464 int cpu = smp_processor_id();
571 465
572 get_time_values_from_xen();
573
574 clocksource_register(&xen_clocksource); 466 clocksource_register(&xen_clocksource);
575 467
576 if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) { 468 if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) {