aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
authorIngo Molnar <mingo@elte.hu>2008-06-25 06:30:21 -0400
committerIngo Molnar <mingo@elte.hu>2008-06-25 06:30:21 -0400
commit8b7ef4ec5b1ac8b6feebf5ae9cda85a7514728f8 (patch)
tree8f1f746f98707fb82bcec281ee8f65216eddae36 /arch/x86
parenta1d5a8691f1b6c92491747bea3b778b184fa5837 (diff)
parent543cf4cb3fe6f6cae3651ba918b9c56200b257d0 (diff)
Merge branch 'linus' into x86/fixmap
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig5
-rw-r--r--arch/x86/kernel/Makefile1
-rw-r--r--arch/x86/kernel/geode_32.c5
-rw-r--r--arch/x86/kernel/kvmclock.c89
-rw-r--r--arch/x86/kernel/process_32.c1
-rw-r--r--arch/x86/kernel/process_64.c1
-rw-r--r--arch/x86/kernel/pvclock.c141
-rw-r--r--arch/x86/kernel/setup_32.c10
-rw-r--r--arch/x86/kernel/tsc_32.c18
-rw-r--r--arch/x86/kvm/i8254.c9
-rw-r--r--arch/x86/kvm/lapic.c1
-rw-r--r--arch/x86/kvm/mmu.c19
-rw-r--r--arch/x86/kvm/vmx.c19
-rw-r--r--arch/x86/kvm/x86.c91
-rw-r--r--arch/x86/xen/Kconfig3
-rw-r--r--arch/x86/xen/enlighten.c56
-rw-r--r--arch/x86/xen/mmu.c75
-rw-r--r--arch/x86/xen/mmu.h24
-rw-r--r--arch/x86/xen/time.c132
-rw-r--r--arch/x86/xen/xen-head.S6
20 files changed, 370 insertions, 336 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 52e18e6d2ba0..e0edaaa6920a 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -383,6 +383,7 @@ config VMI
383config KVM_CLOCK 383config KVM_CLOCK
384 bool "KVM paravirtualized clock" 384 bool "KVM paravirtualized clock"
385 select PARAVIRT 385 select PARAVIRT
386 select PARAVIRT_CLOCK
386 depends on !(X86_VISWS || X86_VOYAGER) 387 depends on !(X86_VISWS || X86_VOYAGER)
387 help 388 help
388 Turning on this option will allow you to run a paravirtualized clock 389 Turning on this option will allow you to run a paravirtualized clock
@@ -410,6 +411,10 @@ config PARAVIRT
410 over full virtualization. However, when run without a hypervisor 411 over full virtualization. However, when run without a hypervisor
411 the kernel is theoretically slower and slightly larger. 412 the kernel is theoretically slower and slightly larger.
412 413
414config PARAVIRT_CLOCK
415 bool
416 default n
417
413endif 418endif
414 419
415config MEMTEST_BOOTPARAM 420config MEMTEST_BOOTPARAM
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 5e618c3b4720..77807d4769c9 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -82,6 +82,7 @@ obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o
82obj-$(CONFIG_KVM_GUEST) += kvm.o 82obj-$(CONFIG_KVM_GUEST) += kvm.o
83obj-$(CONFIG_KVM_CLOCK) += kvmclock.o 83obj-$(CONFIG_KVM_CLOCK) += kvmclock.o
84obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o 84obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o
85obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o
85 86
86obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o 87obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o
87 88
diff --git a/arch/x86/kernel/geode_32.c b/arch/x86/kernel/geode_32.c
index e8edd63ab000..9b08e852fd1a 100644
--- a/arch/x86/kernel/geode_32.c
+++ b/arch/x86/kernel/geode_32.c
@@ -166,6 +166,8 @@ int geode_has_vsa2(void)
166 static int has_vsa2 = -1; 166 static int has_vsa2 = -1;
167 167
168 if (has_vsa2 == -1) { 168 if (has_vsa2 == -1) {
169 u16 val;
170
169 /* 171 /*
170 * The VSA has virtual registers that we can query for a 172 * The VSA has virtual registers that we can query for a
171 * signature. 173 * signature.
@@ -173,7 +175,8 @@ int geode_has_vsa2(void)
173 outw(VSA_VR_UNLOCK, VSA_VRC_INDEX); 175 outw(VSA_VR_UNLOCK, VSA_VRC_INDEX);
174 outw(VSA_VR_SIGNATURE, VSA_VRC_INDEX); 176 outw(VSA_VR_SIGNATURE, VSA_VRC_INDEX);
175 177
176 has_vsa2 = (inw(VSA_VRC_DATA) == VSA_SIG); 178 val = inw(VSA_VRC_DATA);
179 has_vsa2 = (val == AMD_VSA_SIG || val == GSW_VSA_SIG);
177 } 180 }
178 181
179 return has_vsa2; 182 return has_vsa2;
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 08a30986d472..87edf1ceb1df 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -18,6 +18,7 @@
18 18
19#include <linux/clocksource.h> 19#include <linux/clocksource.h>
20#include <linux/kvm_para.h> 20#include <linux/kvm_para.h>
21#include <asm/pvclock.h>
21#include <asm/arch_hooks.h> 22#include <asm/arch_hooks.h>
22#include <asm/msr.h> 23#include <asm/msr.h>
23#include <asm/apic.h> 24#include <asm/apic.h>
@@ -36,18 +37,9 @@ static int parse_no_kvmclock(char *arg)
36early_param("no-kvmclock", parse_no_kvmclock); 37early_param("no-kvmclock", parse_no_kvmclock);
37 38
38/* The hypervisor will put information about time periodically here */ 39/* The hypervisor will put information about time periodically here */
39static DEFINE_PER_CPU_SHARED_ALIGNED(struct kvm_vcpu_time_info, hv_clock); 40static DEFINE_PER_CPU_SHARED_ALIGNED(struct pvclock_vcpu_time_info, hv_clock);
40#define get_clock(cpu, field) per_cpu(hv_clock, cpu).field 41static struct pvclock_wall_clock wall_clock;
41 42
42static inline u64 kvm_get_delta(u64 last_tsc)
43{
44 int cpu = smp_processor_id();
45 u64 delta = native_read_tsc() - last_tsc;
46 return (delta * get_clock(cpu, tsc_to_system_mul)) >> KVM_SCALE;
47}
48
49static struct kvm_wall_clock wall_clock;
50static cycle_t kvm_clock_read(void);
51/* 43/*
52 * The wallclock is the time of day when we booted. Since then, some time may 44 * The wallclock is the time of day when we booted. Since then, some time may
53 * have elapsed since the hypervisor wrote the data. So we try to account for 45 * have elapsed since the hypervisor wrote the data. So we try to account for
@@ -55,64 +47,37 @@ static cycle_t kvm_clock_read(void);
55 */ 47 */
56static unsigned long kvm_get_wallclock(void) 48static unsigned long kvm_get_wallclock(void)
57{ 49{
58 u32 wc_sec, wc_nsec; 50 struct pvclock_vcpu_time_info *vcpu_time;
59 u64 delta;
60 struct timespec ts; 51 struct timespec ts;
61 int version, nsec;
62 int low, high; 52 int low, high;
63 53
64 low = (int)__pa(&wall_clock); 54 low = (int)__pa(&wall_clock);
65 high = ((u64)__pa(&wall_clock) >> 32); 55 high = ((u64)__pa(&wall_clock) >> 32);
56 native_write_msr(MSR_KVM_WALL_CLOCK, low, high);
66 57
67 delta = kvm_clock_read(); 58 vcpu_time = &get_cpu_var(hv_clock);
59 pvclock_read_wallclock(&wall_clock, vcpu_time, &ts);
60 put_cpu_var(hv_clock);
68 61
69 native_write_msr(MSR_KVM_WALL_CLOCK, low, high); 62 return ts.tv_sec;
70 do {
71 version = wall_clock.wc_version;
72 rmb();
73 wc_sec = wall_clock.wc_sec;
74 wc_nsec = wall_clock.wc_nsec;
75 rmb();
76 } while ((wall_clock.wc_version != version) || (version & 1));
77
78 delta = kvm_clock_read() - delta;
79 delta += wc_nsec;
80 nsec = do_div(delta, NSEC_PER_SEC);
81 set_normalized_timespec(&ts, wc_sec + delta, nsec);
82 /*
83 * Of all mechanisms of time adjustment I've tested, this one
84 * was the champion!
85 */
86 return ts.tv_sec + 1;
87} 63}
88 64
89static int kvm_set_wallclock(unsigned long now) 65static int kvm_set_wallclock(unsigned long now)
90{ 66{
91 return 0; 67 return -1;
92} 68}
93 69
94/*
95 * This is our read_clock function. The host puts an tsc timestamp each time
96 * it updates a new time. Without the tsc adjustment, we can have a situation
97 * in which a vcpu starts to run earlier (smaller system_time), but probes
98 * time later (compared to another vcpu), leading to backwards time
99 */
100static cycle_t kvm_clock_read(void) 70static cycle_t kvm_clock_read(void)
101{ 71{
102 u64 last_tsc, now; 72 struct pvclock_vcpu_time_info *src;
103 int cpu; 73 cycle_t ret;
104 74
105 preempt_disable(); 75 src = &get_cpu_var(hv_clock);
106 cpu = smp_processor_id(); 76 ret = pvclock_clocksource_read(src);
107 77 put_cpu_var(hv_clock);
108 last_tsc = get_clock(cpu, tsc_timestamp); 78 return ret;
109 now = get_clock(cpu, system_time);
110
111 now += kvm_get_delta(last_tsc);
112 preempt_enable();
113
114 return now;
115} 79}
80
116static struct clocksource kvm_clock = { 81static struct clocksource kvm_clock = {
117 .name = "kvm-clock", 82 .name = "kvm-clock",
118 .read = kvm_clock_read, 83 .read = kvm_clock_read,
@@ -123,13 +88,14 @@ static struct clocksource kvm_clock = {
123 .flags = CLOCK_SOURCE_IS_CONTINUOUS, 88 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
124}; 89};
125 90
126static int kvm_register_clock(void) 91static int kvm_register_clock(char *txt)
127{ 92{
128 int cpu = smp_processor_id(); 93 int cpu = smp_processor_id();
129 int low, high; 94 int low, high;
130 low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1; 95 low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1;
131 high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32); 96 high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32);
132 97 printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n",
98 cpu, high, low, txt);
133 return native_write_msr_safe(MSR_KVM_SYSTEM_TIME, low, high); 99 return native_write_msr_safe(MSR_KVM_SYSTEM_TIME, low, high);
134} 100}
135 101
@@ -140,12 +106,20 @@ static void kvm_setup_secondary_clock(void)
140 * Now that the first cpu already had this clocksource initialized, 106 * Now that the first cpu already had this clocksource initialized,
141 * we shouldn't fail. 107 * we shouldn't fail.
142 */ 108 */
143 WARN_ON(kvm_register_clock()); 109 WARN_ON(kvm_register_clock("secondary cpu clock"));
144 /* ok, done with our trickery, call native */ 110 /* ok, done with our trickery, call native */
145 setup_secondary_APIC_clock(); 111 setup_secondary_APIC_clock();
146} 112}
147#endif 113#endif
148 114
115#ifdef CONFIG_SMP
116void __init kvm_smp_prepare_boot_cpu(void)
117{
118 WARN_ON(kvm_register_clock("primary cpu clock"));
119 native_smp_prepare_boot_cpu();
120}
121#endif
122
149/* 123/*
150 * After the clock is registered, the host will keep writing to the 124 * After the clock is registered, the host will keep writing to the
151 * registered memory location. If the guest happens to shutdown, this memory 125 * registered memory location. If the guest happens to shutdown, this memory
@@ -174,7 +148,7 @@ void __init kvmclock_init(void)
174 return; 148 return;
175 149
176 if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) { 150 if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) {
177 if (kvm_register_clock()) 151 if (kvm_register_clock("boot clock"))
178 return; 152 return;
179 pv_time_ops.get_wallclock = kvm_get_wallclock; 153 pv_time_ops.get_wallclock = kvm_get_wallclock;
180 pv_time_ops.set_wallclock = kvm_set_wallclock; 154 pv_time_ops.set_wallclock = kvm_set_wallclock;
@@ -182,6 +156,9 @@ void __init kvmclock_init(void)
182#ifdef CONFIG_X86_LOCAL_APIC 156#ifdef CONFIG_X86_LOCAL_APIC
183 pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock; 157 pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock;
184#endif 158#endif
159#ifdef CONFIG_SMP
160 smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
161#endif
185 machine_ops.shutdown = kvm_shutdown; 162 machine_ops.shutdown = kvm_shutdown;
186#ifdef CONFIG_KEXEC 163#ifdef CONFIG_KEXEC
187 machine_ops.crash_shutdown = kvm_crash_shutdown; 164 machine_ops.crash_shutdown = kvm_crash_shutdown;
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 6d5483356e74..e2db9ac5c61c 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -333,6 +333,7 @@ void flush_thread(void)
333 /* 333 /*
334 * Forget coprocessor state.. 334 * Forget coprocessor state..
335 */ 335 */
336 tsk->fpu_counter = 0;
336 clear_fpu(tsk); 337 clear_fpu(tsk);
337 clear_used_math(); 338 clear_used_math();
338} 339}
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index ac54ff56df80..c6eb5c91e5f6 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -294,6 +294,7 @@ void flush_thread(void)
294 /* 294 /*
295 * Forget coprocessor state.. 295 * Forget coprocessor state..
296 */ 296 */
297 tsk->fpu_counter = 0;
297 clear_fpu(tsk); 298 clear_fpu(tsk);
298 clear_used_math(); 299 clear_used_math();
299} 300}
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
new file mode 100644
index 000000000000..05fbe9a0325a
--- /dev/null
+++ b/arch/x86/kernel/pvclock.c
@@ -0,0 +1,141 @@
1/* paravirtual clock -- common code used by kvm/xen
2
3 This program is free software; you can redistribute it and/or modify
4 it under the terms of the GNU General Public License as published by
5 the Free Software Foundation; either version 2 of the License, or
6 (at your option) any later version.
7
8 This program is distributed in the hope that it will be useful,
9 but WITHOUT ANY WARRANTY; without even the implied warranty of
10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 GNU General Public License for more details.
12
13 You should have received a copy of the GNU General Public License
14 along with this program; if not, write to the Free Software
15 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
16*/
17
18#include <linux/kernel.h>
19#include <linux/percpu.h>
20#include <asm/pvclock.h>
21
22/*
23 * These are perodically updated
24 * xen: magic shared_info page
25 * kvm: gpa registered via msr
26 * and then copied here.
27 */
28struct pvclock_shadow_time {
29 u64 tsc_timestamp; /* TSC at last update of time vals. */
30 u64 system_timestamp; /* Time, in nanosecs, since boot. */
31 u32 tsc_to_nsec_mul;
32 int tsc_shift;
33 u32 version;
34};
35
36/*
37 * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
38 * yielding a 64-bit result.
39 */
40static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
41{
42 u64 product;
43#ifdef __i386__
44 u32 tmp1, tmp2;
45#endif
46
47 if (shift < 0)
48 delta >>= -shift;
49 else
50 delta <<= shift;
51
52#ifdef __i386__
53 __asm__ (
54 "mul %5 ; "
55 "mov %4,%%eax ; "
56 "mov %%edx,%4 ; "
57 "mul %5 ; "
58 "xor %5,%5 ; "
59 "add %4,%%eax ; "
60 "adc %5,%%edx ; "
61 : "=A" (product), "=r" (tmp1), "=r" (tmp2)
62 : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
63#elif __x86_64__
64 __asm__ (
65 "mul %%rdx ; shrd $32,%%rdx,%%rax"
66 : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
67#else
68#error implement me!
69#endif
70
71 return product;
72}
73
74static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow)
75{
76 u64 delta = native_read_tsc() - shadow->tsc_timestamp;
77 return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
78}
79
80/*
81 * Reads a consistent set of time-base values from hypervisor,
82 * into a shadow data area.
83 */
84static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst,
85 struct pvclock_vcpu_time_info *src)
86{
87 do {
88 dst->version = src->version;
89 rmb(); /* fetch version before data */
90 dst->tsc_timestamp = src->tsc_timestamp;
91 dst->system_timestamp = src->system_time;
92 dst->tsc_to_nsec_mul = src->tsc_to_system_mul;
93 dst->tsc_shift = src->tsc_shift;
94 rmb(); /* test version after fetching data */
95 } while ((src->version & 1) || (dst->version != src->version));
96
97 return dst->version;
98}
99
100cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
101{
102 struct pvclock_shadow_time shadow;
103 unsigned version;
104 cycle_t ret, offset;
105
106 do {
107 version = pvclock_get_time_values(&shadow, src);
108 barrier();
109 offset = pvclock_get_nsec_offset(&shadow);
110 ret = shadow.system_timestamp + offset;
111 barrier();
112 } while (version != src->version);
113
114 return ret;
115}
116
117void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
118 struct pvclock_vcpu_time_info *vcpu_time,
119 struct timespec *ts)
120{
121 u32 version;
122 u64 delta;
123 struct timespec now;
124
125 /* get wallclock at system boot */
126 do {
127 version = wall_clock->version;
128 rmb(); /* fetch version before time */
129 now.tv_sec = wall_clock->sec;
130 now.tv_nsec = wall_clock->nsec;
131 rmb(); /* fetch time before checking version */
132 } while ((wall_clock->version & 1) || (version != wall_clock->version));
133
134 delta = pvclock_clocksource_read(vcpu_time); /* time since system boot */
135 delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec;
136
137 now.tv_nsec = do_div(delta, NSEC_PER_SEC);
138 now.tv_sec = delta;
139
140 set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
141}
diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index 2c5f8b213e86..5a2f8e063887 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -532,10 +532,16 @@ static void __init reserve_crashkernel(void)
532 (unsigned long)(crash_size >> 20), 532 (unsigned long)(crash_size >> 20),
533 (unsigned long)(crash_base >> 20), 533 (unsigned long)(crash_base >> 20),
534 (unsigned long)(total_mem >> 20)); 534 (unsigned long)(total_mem >> 20));
535
536 if (reserve_bootmem(crash_base, crash_size,
537 BOOTMEM_EXCLUSIVE) < 0) {
538 printk(KERN_INFO "crashkernel reservation "
539 "failed - memory is in use\n");
540 return;
541 }
542
535 crashk_res.start = crash_base; 543 crashk_res.start = crash_base;
536 crashk_res.end = crash_base + crash_size - 1; 544 crashk_res.end = crash_base + crash_size - 1;
537 reserve_bootmem(crash_base, crash_size,
538 BOOTMEM_DEFAULT);
539 } else 545 } else
540 printk(KERN_INFO "crashkernel reservation failed - " 546 printk(KERN_INFO "crashkernel reservation failed - "
541 "you have to specify a base address\n"); 547 "you have to specify a base address\n");
diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c
index 068759db63dd..65b70637ad97 100644
--- a/arch/x86/kernel/tsc_32.c
+++ b/arch/x86/kernel/tsc_32.c
@@ -14,7 +14,10 @@
14 14
15#include "mach_timer.h" 15#include "mach_timer.h"
16 16
17static int tsc_disabled; 17/* native_sched_clock() is called before tsc_init(), so
18 we must start with the TSC soft disabled to prevent
19 erroneous rdtsc usage on !cpu_has_tsc processors */
20static int tsc_disabled = -1;
18 21
19/* 22/*
20 * On some systems the TSC frequency does not 23 * On some systems the TSC frequency does not
@@ -402,25 +405,20 @@ void __init tsc_init(void)
402{ 405{
403 int cpu; 406 int cpu;
404 407
405 if (!cpu_has_tsc || tsc_disabled) { 408 if (!cpu_has_tsc || tsc_disabled > 0)
406 /* Disable the TSC in case of !cpu_has_tsc */
407 tsc_disabled = 1;
408 return; 409 return;
409 }
410 410
411 cpu_khz = calculate_cpu_khz(); 411 cpu_khz = calculate_cpu_khz();
412 tsc_khz = cpu_khz; 412 tsc_khz = cpu_khz;
413 413
414 if (!cpu_khz) { 414 if (!cpu_khz) {
415 mark_tsc_unstable("could not calculate TSC khz"); 415 mark_tsc_unstable("could not calculate TSC khz");
416 /*
417 * We need to disable the TSC completely in this case
418 * to prevent sched_clock() from using it.
419 */
420 tsc_disabled = 1;
421 return; 416 return;
422 } 417 }
423 418
419 /* now allow native_sched_clock() to use rdtsc */
420 tsc_disabled = 0;
421
424 printk("Detected %lu.%03lu MHz processor.\n", 422 printk("Detected %lu.%03lu MHz processor.\n",
425 (unsigned long)cpu_khz / 1000, 423 (unsigned long)cpu_khz / 1000,
426 (unsigned long)cpu_khz % 1000); 424 (unsigned long)cpu_khz % 1000);
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index f2f5d260874e..3829aa7b663f 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -200,9 +200,12 @@ int __pit_timer_fn(struct kvm_kpit_state *ps)
200 200
201 atomic_inc(&pt->pending); 201 atomic_inc(&pt->pending);
202 smp_mb__after_atomic_inc(); 202 smp_mb__after_atomic_inc();
203 if (vcpu0 && waitqueue_active(&vcpu0->wq)) { 203 if (vcpu0) {
204 vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE; 204 set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests);
205 wake_up_interruptible(&vcpu0->wq); 205 if (waitqueue_active(&vcpu0->wq)) {
206 vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE;
207 wake_up_interruptible(&vcpu0->wq);
208 }
206 } 209 }
207 210
208 pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period); 211 pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index c297c50eba63..ebc03f5ae162 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -940,6 +940,7 @@ static int __apic_timer_fn(struct kvm_lapic *apic)
940 wait_queue_head_t *q = &apic->vcpu->wq; 940 wait_queue_head_t *q = &apic->vcpu->wq;
941 941
942 atomic_inc(&apic->timer.pending); 942 atomic_inc(&apic->timer.pending);
943 set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests);
943 if (waitqueue_active(q)) { 944 if (waitqueue_active(q)) {
944 apic->vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 945 apic->vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
945 wake_up_interruptible(q); 946 wake_up_interruptible(q);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index ee3f53098f0c..7e7c3969f7a2 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -640,6 +640,7 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
640 rmap_remove(kvm, spte); 640 rmap_remove(kvm, spte);
641 --kvm->stat.lpages; 641 --kvm->stat.lpages;
642 set_shadow_pte(spte, shadow_trap_nonpresent_pte); 642 set_shadow_pte(spte, shadow_trap_nonpresent_pte);
643 spte = NULL;
643 write_protected = 1; 644 write_protected = 1;
644 } 645 }
645 spte = rmap_next(kvm, rmapp, spte); 646 spte = rmap_next(kvm, rmapp, spte);
@@ -1082,10 +1083,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1082 struct kvm_mmu_page *shadow; 1083 struct kvm_mmu_page *shadow;
1083 1084
1084 spte |= PT_WRITABLE_MASK; 1085 spte |= PT_WRITABLE_MASK;
1085 if (user_fault) {
1086 mmu_unshadow(vcpu->kvm, gfn);
1087 goto unshadowed;
1088 }
1089 1086
1090 shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn); 1087 shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
1091 if (shadow || 1088 if (shadow ||
@@ -1102,8 +1099,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1102 } 1099 }
1103 } 1100 }
1104 1101
1105unshadowed:
1106
1107 if (pte_access & ACC_WRITE_MASK) 1102 if (pte_access & ACC_WRITE_MASK)
1108 mark_page_dirty(vcpu->kvm, gfn); 1103 mark_page_dirty(vcpu->kvm, gfn);
1109 1104
@@ -1580,11 +1575,13 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
1580 u64 *spte, 1575 u64 *spte,
1581 const void *new) 1576 const void *new)
1582{ 1577{
1583 if ((sp->role.level != PT_PAGE_TABLE_LEVEL) 1578 if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
1584 && !vcpu->arch.update_pte.largepage) { 1579 if (!vcpu->arch.update_pte.largepage ||
1585 ++vcpu->kvm->stat.mmu_pde_zapped; 1580 sp->role.glevels == PT32_ROOT_LEVEL) {
1586 return; 1581 ++vcpu->kvm->stat.mmu_pde_zapped;
1587 } 1582 return;
1583 }
1584 }
1588 1585
1589 ++vcpu->kvm->stat.mmu_pte_updated; 1586 ++vcpu->kvm->stat.mmu_pte_updated;
1590 if (sp->role.glevels == PT32_ROOT_LEVEL) 1587 if (sp->role.glevels == PT32_ROOT_LEVEL)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 02efbe75f317..540e95179074 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -566,7 +566,7 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
566 load_transition_efer(vmx); 566 load_transition_efer(vmx);
567} 567}
568 568
569static void vmx_load_host_state(struct vcpu_vmx *vmx) 569static void __vmx_load_host_state(struct vcpu_vmx *vmx)
570{ 570{
571 unsigned long flags; 571 unsigned long flags;
572 572
@@ -596,6 +596,13 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx)
596 reload_host_efer(vmx); 596 reload_host_efer(vmx);
597} 597}
598 598
599static void vmx_load_host_state(struct vcpu_vmx *vmx)
600{
601 preempt_disable();
602 __vmx_load_host_state(vmx);
603 preempt_enable();
604}
605
599/* 606/*
600 * Switches to specified vcpu, until a matching vcpu_put(), but assumes 607 * Switches to specified vcpu, until a matching vcpu_put(), but assumes
601 * vcpu mutex is already taken. 608 * vcpu mutex is already taken.
@@ -654,7 +661,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
654 661
655static void vmx_vcpu_put(struct kvm_vcpu *vcpu) 662static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
656{ 663{
657 vmx_load_host_state(to_vmx(vcpu)); 664 __vmx_load_host_state(to_vmx(vcpu));
658} 665}
659 666
660static void vmx_fpu_activate(struct kvm_vcpu *vcpu) 667static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
@@ -884,11 +891,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
884 switch (msr_index) { 891 switch (msr_index) {
885#ifdef CONFIG_X86_64 892#ifdef CONFIG_X86_64
886 case MSR_EFER: 893 case MSR_EFER:
894 vmx_load_host_state(vmx);
887 ret = kvm_set_msr_common(vcpu, msr_index, data); 895 ret = kvm_set_msr_common(vcpu, msr_index, data);
888 if (vmx->host_state.loaded) {
889 reload_host_efer(vmx);
890 load_transition_efer(vmx);
891 }
892 break; 896 break;
893 case MSR_FS_BASE: 897 case MSR_FS_BASE:
894 vmcs_writel(GUEST_FS_BASE, data); 898 vmcs_writel(GUEST_FS_BASE, data);
@@ -910,11 +914,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
910 guest_write_tsc(data); 914 guest_write_tsc(data);
911 break; 915 break;
912 default: 916 default:
917 vmx_load_host_state(vmx);
913 msr = find_msr_entry(vmx, msr_index); 918 msr = find_msr_entry(vmx, msr_index);
914 if (msr) { 919 if (msr) {
915 msr->data = data; 920 msr->data = data;
916 if (vmx->host_state.loaded)
917 load_msrs(vmx->guest_msrs, vmx->save_nmsrs);
918 break; 921 break;
919 } 922 }
920 ret = kvm_set_msr_common(vcpu, msr_index, data); 923 ret = kvm_set_msr_common(vcpu, msr_index, data);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 00acf1301a15..63a77caa59f1 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -492,8 +492,8 @@ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
492static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) 492static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
493{ 493{
494 static int version; 494 static int version;
495 struct kvm_wall_clock wc; 495 struct pvclock_wall_clock wc;
496 struct timespec wc_ts; 496 struct timespec now, sys, boot;
497 497
498 if (!wall_clock) 498 if (!wall_clock)
499 return; 499 return;
@@ -502,10 +502,19 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
502 502
503 kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); 503 kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
504 504
505 wc_ts = current_kernel_time(); 505 /*
506 wc.wc_sec = wc_ts.tv_sec; 506 * The guest calculates current wall clock time by adding
507 wc.wc_nsec = wc_ts.tv_nsec; 507 * system time (updated by kvm_write_guest_time below) to the
508 wc.wc_version = version; 508 * wall clock specified here. guest system time equals host
509 * system time for us, thus we must fill in host boot time here.
510 */
511 now = current_kernel_time();
512 ktime_get_ts(&sys);
513 boot = ns_to_timespec(timespec_to_ns(&now) - timespec_to_ns(&sys));
514
515 wc.sec = boot.tv_sec;
516 wc.nsec = boot.tv_nsec;
517 wc.version = version;
509 518
510 kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc)); 519 kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
511 520
@@ -513,6 +522,45 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
513 kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); 522 kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
514} 523}
515 524
525static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
526{
527 uint32_t quotient, remainder;
528
529 /* Don't try to replace with do_div(), this one calculates
530 * "(dividend << 32) / divisor" */
531 __asm__ ( "divl %4"
532 : "=a" (quotient), "=d" (remainder)
533 : "0" (0), "1" (dividend), "r" (divisor) );
534 return quotient;
535}
536
537static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock)
538{
539 uint64_t nsecs = 1000000000LL;
540 int32_t shift = 0;
541 uint64_t tps64;
542 uint32_t tps32;
543
544 tps64 = tsc_khz * 1000LL;
545 while (tps64 > nsecs*2) {
546 tps64 >>= 1;
547 shift--;
548 }
549
550 tps32 = (uint32_t)tps64;
551 while (tps32 <= (uint32_t)nsecs) {
552 tps32 <<= 1;
553 shift++;
554 }
555
556 hv_clock->tsc_shift = shift;
557 hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32);
558
559 pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n",
560 __FUNCTION__, tsc_khz, hv_clock->tsc_shift,
561 hv_clock->tsc_to_system_mul);
562}
563
516static void kvm_write_guest_time(struct kvm_vcpu *v) 564static void kvm_write_guest_time(struct kvm_vcpu *v)
517{ 565{
518 struct timespec ts; 566 struct timespec ts;
@@ -523,6 +571,11 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
523 if ((!vcpu->time_page)) 571 if ((!vcpu->time_page))
524 return; 572 return;
525 573
574 if (unlikely(vcpu->hv_clock_tsc_khz != tsc_khz)) {
575 kvm_set_time_scale(tsc_khz, &vcpu->hv_clock);
576 vcpu->hv_clock_tsc_khz = tsc_khz;
577 }
578
526 /* Keep irq disabled to prevent changes to the clock */ 579 /* Keep irq disabled to prevent changes to the clock */
527 local_irq_save(flags); 580 local_irq_save(flags);
528 kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER, 581 kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER,
@@ -537,14 +590,14 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
537 /* 590 /*
538 * The interface expects us to write an even number signaling that the 591 * The interface expects us to write an even number signaling that the
539 * update is finished. Since the guest won't see the intermediate 592 * update is finished. Since the guest won't see the intermediate
540 * state, we just write "2" at the end 593 * state, we just increase by 2 at the end.
541 */ 594 */
542 vcpu->hv_clock.version = 2; 595 vcpu->hv_clock.version += 2;
543 596
544 shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0); 597 shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0);
545 598
546 memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock, 599 memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
547 sizeof(vcpu->hv_clock)); 600 sizeof(vcpu->hv_clock));
548 601
549 kunmap_atomic(shared_kaddr, KM_USER0); 602 kunmap_atomic(shared_kaddr, KM_USER0);
550 603
@@ -599,10 +652,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
599 /* ...but clean it before doing the actual write */ 652 /* ...but clean it before doing the actual write */
600 vcpu->arch.time_offset = data & ~(PAGE_MASK | 1); 653 vcpu->arch.time_offset = data & ~(PAGE_MASK | 1);
601 654
602 vcpu->arch.hv_clock.tsc_to_system_mul =
603 clocksource_khz2mult(tsc_khz, 22);
604 vcpu->arch.hv_clock.tsc_shift = 22;
605
606 down_read(&current->mm->mmap_sem); 655 down_read(&current->mm->mmap_sem);
607 vcpu->arch.time_page = 656 vcpu->arch.time_page =
608 gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT); 657 gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
@@ -2759,6 +2808,8 @@ again:
2759 if (vcpu->requests) { 2808 if (vcpu->requests) {
2760 if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) 2809 if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))
2761 __kvm_migrate_timers(vcpu); 2810 __kvm_migrate_timers(vcpu);
2811 if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
2812 kvm_x86_ops->tlb_flush(vcpu);
2762 if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, 2813 if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
2763 &vcpu->requests)) { 2814 &vcpu->requests)) {
2764 kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS; 2815 kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS;
@@ -2772,6 +2823,7 @@ again:
2772 } 2823 }
2773 } 2824 }
2774 2825
2826 clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
2775 kvm_inject_pending_timer_irqs(vcpu); 2827 kvm_inject_pending_timer_irqs(vcpu);
2776 2828
2777 preempt_disable(); 2829 preempt_disable();
@@ -2781,21 +2833,13 @@ again:
2781 2833
2782 local_irq_disable(); 2834 local_irq_disable();
2783 2835
2784 if (need_resched()) { 2836 if (vcpu->requests || need_resched()) {
2785 local_irq_enable(); 2837 local_irq_enable();
2786 preempt_enable(); 2838 preempt_enable();
2787 r = 1; 2839 r = 1;
2788 goto out; 2840 goto out;
2789 } 2841 }
2790 2842
2791 if (vcpu->requests)
2792 if (test_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) {
2793 local_irq_enable();
2794 preempt_enable();
2795 r = 1;
2796 goto out;
2797 }
2798
2799 if (signal_pending(current)) { 2843 if (signal_pending(current)) {
2800 local_irq_enable(); 2844 local_irq_enable();
2801 preempt_enable(); 2845 preempt_enable();
@@ -2825,9 +2869,6 @@ again:
2825 2869
2826 kvm_guest_enter(); 2870 kvm_guest_enter();
2827 2871
2828 if (vcpu->requests)
2829 if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
2830 kvm_x86_ops->tlb_flush(vcpu);
2831 2872
2832 KVMTRACE_0D(VMENTRY, vcpu, entryexit); 2873 KVMTRACE_0D(VMENTRY, vcpu, entryexit);
2833 kvm_x86_ops->run(vcpu, kvm_run); 2874 kvm_x86_ops->run(vcpu, kvm_run);
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 2e641be2737e..6c388e593bc8 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -5,8 +5,9 @@
5config XEN 5config XEN
6 bool "Xen guest support" 6 bool "Xen guest support"
7 select PARAVIRT 7 select PARAVIRT
8 select PARAVIRT_CLOCK
8 depends on X86_32 9 depends on X86_32
9 depends on X86_CMPXCHG && X86_TSC && !(X86_VISWS || X86_VOYAGER) 10 depends on X86_CMPXCHG && X86_TSC && X86_PAE && !(X86_VISWS || X86_VOYAGER)
10 help 11 help
11 This is the Linux Xen port. Enabling this will allow the 12 This is the Linux Xen port. Enabling this will allow the
12 kernel to boot in a paravirtualized environment under the 13 kernel to boot in a paravirtualized environment under the
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 0ad8a64a2e05..2d19382e6555 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -785,38 +785,35 @@ static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
785static __init void xen_pagetable_setup_start(pgd_t *base) 785static __init void xen_pagetable_setup_start(pgd_t *base)
786{ 786{
787 pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base; 787 pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base;
788 int i;
788 789
789 /* special set_pte for pagetable initialization */ 790 /* special set_pte for pagetable initialization */
790 pv_mmu_ops.set_pte = xen_set_pte_init; 791 pv_mmu_ops.set_pte = xen_set_pte_init;
791 792
792 init_mm.pgd = base; 793 init_mm.pgd = base;
793 /* 794 /*
794 * copy top-level of Xen-supplied pagetable into place. For 795 * copy top-level of Xen-supplied pagetable into place. This
795 * !PAE we can use this as-is, but for PAE it is a stand-in 796 * is a stand-in while we copy the pmd pages.
796 * while we copy the pmd pages.
797 */ 797 */
798 memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t)); 798 memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t));
799 799
800 if (PTRS_PER_PMD > 1) { 800 /*
801 int i; 801 * For PAE, need to allocate new pmds, rather than
802 /* 802 * share Xen's, since Xen doesn't like pmd's being
803 * For PAE, need to allocate new pmds, rather than 803 * shared between address spaces.
804 * share Xen's, since Xen doesn't like pmd's being 804 */
805 * shared between address spaces. 805 for (i = 0; i < PTRS_PER_PGD; i++) {
806 */ 806 if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) {
807 for (i = 0; i < PTRS_PER_PGD; i++) { 807 pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
808 if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) {
809 pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
810 808
811 memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]), 809 memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]),
812 PAGE_SIZE); 810 PAGE_SIZE);
813 811
814 make_lowmem_page_readonly(pmd); 812 make_lowmem_page_readonly(pmd);
815 813
816 set_pgd(&base[i], __pgd(1 + __pa(pmd))); 814 set_pgd(&base[i], __pgd(1 + __pa(pmd)));
817 } else 815 } else
818 pgd_clear(&base[i]); 816 pgd_clear(&base[i]);
819 }
820 } 817 }
821 818
822 /* make sure zero_page is mapped RO so we can use it in pagetables */ 819 /* make sure zero_page is mapped RO so we can use it in pagetables */
@@ -873,17 +870,7 @@ static __init void xen_pagetable_setup_done(pgd_t *base)
873 870
874 /* Actually pin the pagetable down, but we can't set PG_pinned 871 /* Actually pin the pagetable down, but we can't set PG_pinned
875 yet because the page structures don't exist yet. */ 872 yet because the page structures don't exist yet. */
876 { 873 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(base)));
877 unsigned level;
878
879#ifdef CONFIG_X86_PAE
880 level = MMUEXT_PIN_L3_TABLE;
881#else
882 level = MMUEXT_PIN_L2_TABLE;
883#endif
884
885 pin_pagetable_pfn(level, PFN_DOWN(__pa(base)));
886 }
887} 874}
888 875
889/* This is called once we have the cpu_possible_map */ 876/* This is called once we have the cpu_possible_map */
@@ -1120,7 +1107,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1120 .make_pte = xen_make_pte, 1107 .make_pte = xen_make_pte,
1121 .make_pgd = xen_make_pgd, 1108 .make_pgd = xen_make_pgd,
1122 1109
1123#ifdef CONFIG_X86_PAE
1124 .set_pte_atomic = xen_set_pte_atomic, 1110 .set_pte_atomic = xen_set_pte_atomic,
1125 .set_pte_present = xen_set_pte_at, 1111 .set_pte_present = xen_set_pte_at,
1126 .set_pud = xen_set_pud, 1112 .set_pud = xen_set_pud,
@@ -1129,7 +1115,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1129 1115
1130 .make_pmd = xen_make_pmd, 1116 .make_pmd = xen_make_pmd,
1131 .pmd_val = xen_pmd_val, 1117 .pmd_val = xen_pmd_val,
1132#endif /* PAE */
1133 1118
1134 .activate_mm = xen_activate_mm, 1119 .activate_mm = xen_activate_mm,
1135 .dup_mmap = xen_dup_mmap, 1120 .dup_mmap = xen_dup_mmap,
@@ -1257,6 +1242,11 @@ asmlinkage void __init xen_start_kernel(void)
1257 if (xen_feature(XENFEAT_supervisor_mode_kernel)) 1242 if (xen_feature(XENFEAT_supervisor_mode_kernel))
1258 pv_info.kernel_rpl = 0; 1243 pv_info.kernel_rpl = 0;
1259 1244
1245 /* Prevent unwanted bits from being set in PTEs. */
1246 __supported_pte_mask &= ~_PAGE_GLOBAL;
1247 if (!is_initial_xendomain())
1248 __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
1249
1260 /* set the limit of our address space */ 1250 /* set the limit of our address space */
1261 xen_reserve_top(); 1251 xen_reserve_top();
1262 1252
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 3525ef523a74..df40bf74ea75 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -179,50 +179,56 @@ out:
179 preempt_enable(); 179 preempt_enable();
180} 180}
181 181
182pteval_t xen_pte_val(pte_t pte) 182/* Assume pteval_t is equivalent to all the other *val_t types. */
183static pteval_t pte_mfn_to_pfn(pteval_t val)
184{
185 if (val & _PAGE_PRESENT) {
186 unsigned long mfn = (val & PTE_MASK) >> PAGE_SHIFT;
187 pteval_t flags = val & ~PTE_MASK;
188 val = (mfn_to_pfn(mfn) << PAGE_SHIFT) | flags;
189 }
190
191 return val;
192}
193
194static pteval_t pte_pfn_to_mfn(pteval_t val)
183{ 195{
184 pteval_t ret = pte.pte; 196 if (val & _PAGE_PRESENT) {
197 unsigned long pfn = (val & PTE_MASK) >> PAGE_SHIFT;
198 pteval_t flags = val & ~PTE_MASK;
199 val = (pfn_to_mfn(pfn) << PAGE_SHIFT) | flags;
200 }
185 201
186 if (ret & _PAGE_PRESENT) 202 return val;
187 ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT; 203}
188 204
189 return ret; 205pteval_t xen_pte_val(pte_t pte)
206{
207 return pte_mfn_to_pfn(pte.pte);
190} 208}
191 209
192pgdval_t xen_pgd_val(pgd_t pgd) 210pgdval_t xen_pgd_val(pgd_t pgd)
193{ 211{
194 pgdval_t ret = pgd.pgd; 212 return pte_mfn_to_pfn(pgd.pgd);
195 if (ret & _PAGE_PRESENT)
196 ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT;
197 return ret;
198} 213}
199 214
200pte_t xen_make_pte(pteval_t pte) 215pte_t xen_make_pte(pteval_t pte)
201{ 216{
202 if (pte & _PAGE_PRESENT) { 217 pte = pte_pfn_to_mfn(pte);
203 pte = phys_to_machine(XPADDR(pte)).maddr; 218 return native_make_pte(pte);
204 pte &= ~(_PAGE_PCD | _PAGE_PWT);
205 }
206
207 return (pte_t){ .pte = pte };
208} 219}
209 220
210pgd_t xen_make_pgd(pgdval_t pgd) 221pgd_t xen_make_pgd(pgdval_t pgd)
211{ 222{
212 if (pgd & _PAGE_PRESENT) 223 pgd = pte_pfn_to_mfn(pgd);
213 pgd = phys_to_machine(XPADDR(pgd)).maddr; 224 return native_make_pgd(pgd);
214
215 return (pgd_t){ pgd };
216} 225}
217 226
218pmdval_t xen_pmd_val(pmd_t pmd) 227pmdval_t xen_pmd_val(pmd_t pmd)
219{ 228{
220 pmdval_t ret = native_pmd_val(pmd); 229 return pte_mfn_to_pfn(pmd.pmd);
221 if (ret & _PAGE_PRESENT)
222 ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT;
223 return ret;
224} 230}
225#ifdef CONFIG_X86_PAE 231
226void xen_set_pud(pud_t *ptr, pud_t val) 232void xen_set_pud(pud_t *ptr, pud_t val)
227{ 233{
228 struct multicall_space mcs; 234 struct multicall_space mcs;
@@ -267,17 +273,9 @@ void xen_pmd_clear(pmd_t *pmdp)
267 273
268pmd_t xen_make_pmd(pmdval_t pmd) 274pmd_t xen_make_pmd(pmdval_t pmd)
269{ 275{
270 if (pmd & _PAGE_PRESENT) 276 pmd = pte_pfn_to_mfn(pmd);
271 pmd = phys_to_machine(XPADDR(pmd)).maddr;
272
273 return native_make_pmd(pmd); 277 return native_make_pmd(pmd);
274} 278}
275#else /* !PAE */
276void xen_set_pte(pte_t *ptep, pte_t pte)
277{
278 *ptep = pte;
279}
280#endif /* CONFIG_X86_PAE */
281 279
282/* 280/*
283 (Yet another) pagetable walker. This one is intended for pinning a 281 (Yet another) pagetable walker. This one is intended for pinning a
@@ -430,8 +428,6 @@ static int pin_page(struct page *page, enum pt_level level)
430 read-only, and can be pinned. */ 428 read-only, and can be pinned. */
431void xen_pgd_pin(pgd_t *pgd) 429void xen_pgd_pin(pgd_t *pgd)
432{ 430{
433 unsigned level;
434
435 xen_mc_batch(); 431 xen_mc_batch();
436 432
437 if (pgd_walk(pgd, pin_page, TASK_SIZE)) { 433 if (pgd_walk(pgd, pin_page, TASK_SIZE)) {
@@ -441,14 +437,7 @@ void xen_pgd_pin(pgd_t *pgd)
441 xen_mc_batch(); 437 xen_mc_batch();
442 } 438 }
443 439
444#ifdef CONFIG_X86_PAE 440 xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
445 level = MMUEXT_PIN_L3_TABLE;
446#else
447 level = MMUEXT_PIN_L2_TABLE;
448#endif
449
450 xen_do_pin(level, PFN_DOWN(__pa(pgd)));
451
452 xen_mc_issue(0); 441 xen_mc_issue(0);
453} 442}
454 443
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
index b5e189b1519d..5fe961caffd4 100644
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -37,14 +37,13 @@ void xen_exit_mmap(struct mm_struct *mm);
37void xen_pgd_pin(pgd_t *pgd); 37void xen_pgd_pin(pgd_t *pgd);
38//void xen_pgd_unpin(pgd_t *pgd); 38//void xen_pgd_unpin(pgd_t *pgd);
39 39
40#ifdef CONFIG_X86_PAE 40pteval_t xen_pte_val(pte_t);
41unsigned long long xen_pte_val(pte_t); 41pmdval_t xen_pmd_val(pmd_t);
42unsigned long long xen_pmd_val(pmd_t); 42pgdval_t xen_pgd_val(pgd_t);
43unsigned long long xen_pgd_val(pgd_t);
44 43
45pte_t xen_make_pte(unsigned long long); 44pte_t xen_make_pte(pteval_t);
46pmd_t xen_make_pmd(unsigned long long); 45pmd_t xen_make_pmd(pmdval_t);
47pgd_t xen_make_pgd(unsigned long long); 46pgd_t xen_make_pgd(pgdval_t);
48 47
49void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, 48void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
50 pte_t *ptep, pte_t pteval); 49 pte_t *ptep, pte_t pteval);
@@ -53,15 +52,4 @@ void xen_set_pud(pud_t *ptr, pud_t val);
53void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); 52void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
54void xen_pmd_clear(pmd_t *pmdp); 53void xen_pmd_clear(pmd_t *pmdp);
55 54
56
57#else
58unsigned long xen_pte_val(pte_t);
59unsigned long xen_pmd_val(pmd_t);
60unsigned long xen_pgd_val(pgd_t);
61
62pte_t xen_make_pte(unsigned long);
63pmd_t xen_make_pmd(unsigned long);
64pgd_t xen_make_pgd(unsigned long);
65#endif
66
67#endif /* _XEN_MMU_H */ 55#endif /* _XEN_MMU_H */
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 52b2e3856980..41e217503c96 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -14,6 +14,7 @@
14#include <linux/kernel_stat.h> 14#include <linux/kernel_stat.h>
15#include <linux/math64.h> 15#include <linux/math64.h>
16 16
17#include <asm/pvclock.h>
17#include <asm/xen/hypervisor.h> 18#include <asm/xen/hypervisor.h>
18#include <asm/xen/hypercall.h> 19#include <asm/xen/hypercall.h>
19 20
@@ -31,17 +32,6 @@
31 32
32static cycle_t xen_clocksource_read(void); 33static cycle_t xen_clocksource_read(void);
33 34
34/* These are perodically updated in shared_info, and then copied here. */
35struct shadow_time_info {
36 u64 tsc_timestamp; /* TSC at last update of time vals. */
37 u64 system_timestamp; /* Time, in nanosecs, since boot. */
38 u32 tsc_to_nsec_mul;
39 int tsc_shift;
40 u32 version;
41};
42
43static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
44
45/* runstate info updated by Xen */ 35/* runstate info updated by Xen */
46static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate); 36static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
47 37
@@ -211,7 +201,7 @@ unsigned long long xen_sched_clock(void)
211unsigned long xen_cpu_khz(void) 201unsigned long xen_cpu_khz(void)
212{ 202{
213 u64 xen_khz = 1000000ULL << 32; 203 u64 xen_khz = 1000000ULL << 32;
214 const struct vcpu_time_info *info = 204 const struct pvclock_vcpu_time_info *info =
215 &HYPERVISOR_shared_info->vcpu_info[0].time; 205 &HYPERVISOR_shared_info->vcpu_info[0].time;
216 206
217 do_div(xen_khz, info->tsc_to_system_mul); 207 do_div(xen_khz, info->tsc_to_system_mul);
@@ -223,121 +213,26 @@ unsigned long xen_cpu_khz(void)
223 return xen_khz; 213 return xen_khz;
224} 214}
225 215
226/*
227 * Reads a consistent set of time-base values from Xen, into a shadow data
228 * area.
229 */
230static unsigned get_time_values_from_xen(void)
231{
232 struct vcpu_time_info *src;
233 struct shadow_time_info *dst;
234
235 /* src is shared memory with the hypervisor, so we need to
236 make sure we get a consistent snapshot, even in the face of
237 being preempted. */
238 src = &__get_cpu_var(xen_vcpu)->time;
239 dst = &__get_cpu_var(shadow_time);
240
241 do {
242 dst->version = src->version;
243 rmb(); /* fetch version before data */
244 dst->tsc_timestamp = src->tsc_timestamp;
245 dst->system_timestamp = src->system_time;
246 dst->tsc_to_nsec_mul = src->tsc_to_system_mul;
247 dst->tsc_shift = src->tsc_shift;
248 rmb(); /* test version after fetching data */
249 } while ((src->version & 1) | (dst->version ^ src->version));
250
251 return dst->version;
252}
253
254/*
255 * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
256 * yielding a 64-bit result.
257 */
258static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
259{
260 u64 product;
261#ifdef __i386__
262 u32 tmp1, tmp2;
263#endif
264
265 if (shift < 0)
266 delta >>= -shift;
267 else
268 delta <<= shift;
269
270#ifdef __i386__
271 __asm__ (
272 "mul %5 ; "
273 "mov %4,%%eax ; "
274 "mov %%edx,%4 ; "
275 "mul %5 ; "
276 "xor %5,%5 ; "
277 "add %4,%%eax ; "
278 "adc %5,%%edx ; "
279 : "=A" (product), "=r" (tmp1), "=r" (tmp2)
280 : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
281#elif __x86_64__
282 __asm__ (
283 "mul %%rdx ; shrd $32,%%rdx,%%rax"
284 : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
285#else
286#error implement me!
287#endif
288
289 return product;
290}
291
292static u64 get_nsec_offset(struct shadow_time_info *shadow)
293{
294 u64 now, delta;
295 now = native_read_tsc();
296 delta = now - shadow->tsc_timestamp;
297 return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
298}
299
300static cycle_t xen_clocksource_read(void) 216static cycle_t xen_clocksource_read(void)
301{ 217{
302 struct shadow_time_info *shadow = &get_cpu_var(shadow_time); 218 struct pvclock_vcpu_time_info *src;
303 cycle_t ret; 219 cycle_t ret;
304 unsigned version;
305
306 do {
307 version = get_time_values_from_xen();
308 barrier();
309 ret = shadow->system_timestamp + get_nsec_offset(shadow);
310 barrier();
311 } while (version != __get_cpu_var(xen_vcpu)->time.version);
312
313 put_cpu_var(shadow_time);
314 220
221 src = &get_cpu_var(xen_vcpu)->time;
222 ret = pvclock_clocksource_read(src);
223 put_cpu_var(xen_vcpu);
315 return ret; 224 return ret;
316} 225}
317 226
318static void xen_read_wallclock(struct timespec *ts) 227static void xen_read_wallclock(struct timespec *ts)
319{ 228{
320 const struct shared_info *s = HYPERVISOR_shared_info; 229 struct shared_info *s = HYPERVISOR_shared_info;
321 u32 version; 230 struct pvclock_wall_clock *wall_clock = &(s->wc);
322 u64 delta; 231 struct pvclock_vcpu_time_info *vcpu_time;
323 struct timespec now;
324
325 /* get wallclock at system boot */
326 do {
327 version = s->wc_version;
328 rmb(); /* fetch version before time */
329 now.tv_sec = s->wc_sec;
330 now.tv_nsec = s->wc_nsec;
331 rmb(); /* fetch time before checking version */
332 } while ((s->wc_version & 1) | (version ^ s->wc_version));
333 232
334 delta = xen_clocksource_read(); /* time since system boot */ 233 vcpu_time = &get_cpu_var(xen_vcpu)->time;
335 delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec; 234 pvclock_read_wallclock(wall_clock, vcpu_time, ts);
336 235 put_cpu_var(xen_vcpu);
337 now.tv_nsec = do_div(delta, NSEC_PER_SEC);
338 now.tv_sec = delta;
339
340 set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
341} 236}
342 237
343unsigned long xen_get_wallclock(void) 238unsigned long xen_get_wallclock(void)
@@ -345,7 +240,6 @@ unsigned long xen_get_wallclock(void)
345 struct timespec ts; 240 struct timespec ts;
346 241
347 xen_read_wallclock(&ts); 242 xen_read_wallclock(&ts);
348
349 return ts.tv_sec; 243 return ts.tv_sec;
350} 244}
351 245
@@ -569,8 +463,6 @@ __init void xen_time_init(void)
569{ 463{
570 int cpu = smp_processor_id(); 464 int cpu = smp_processor_id();
571 465
572 get_time_values_from_xen();
573
574 clocksource_register(&xen_clocksource); 466 clocksource_register(&xen_clocksource);
575 467
576 if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) { 468 if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) {
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 288d587ce73c..6ec3b4f7719b 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -17,7 +17,7 @@ ENTRY(startup_xen)
17 17
18 __FINIT 18 __FINIT
19 19
20.pushsection .bss.page_aligned 20.pushsection .text
21 .align PAGE_SIZE_asm 21 .align PAGE_SIZE_asm
22ENTRY(hypercall_page) 22ENTRY(hypercall_page)
23 .skip 0x1000 23 .skip 0x1000
@@ -30,11 +30,7 @@ ENTRY(hypercall_page)
30 ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long startup_xen) 30 ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long startup_xen)
31 ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long hypercall_page) 31 ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long hypercall_page)
32 ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb") 32 ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb")
33#ifdef CONFIG_X86_PAE
34 ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes") 33 ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes")
35#else
36 ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "no")
37#endif
38 ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic") 34 ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic")
39 35
40#endif /*CONFIG_XEN */ 36#endif /*CONFIG_XEN */