author    Ingo Molnar <mingo@elte.hu>  2007-02-16 04:27:34 -0500
committer Linus Torvalds <torvalds@woody.linux-foundation.org>  2007-02-16 11:13:57 -0500
commit    95492e4646e5de8b43d9a7908d6177fb737b61f0 (patch)
tree      ae25cd206ca76f78d50ac2a206ef012e0ab1d9df /arch/x86_64
parent    92c7e00254b2d0efc1e36ac3e45474ce1871b6b2 (diff)
[PATCH] x86: rewrite SMP TSC sync code
Make the TSC synchronization code more robust, and unify it between x86_64 and i386.

The biggest change is the removal of the 'fix up TSCs' code on x86_64 and i386: in some rare cases it was /causing/ time-warps on SMP systems.

The new code only checks for TSC asynchronicity - and if it can prove a time-warp (i.e. if it can observe the TSC going backwards when going from one CPU to another within a critical section), then the TSC clock-source is turned off.

The TSC synchronization-checking code also got moved into a separate file.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: john stultz <johnstul@us.ibm.com>
Cc: Roman Zippel <zippel@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
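The warp check described above boils down to: serialize TSC reads from two CPUs through one critical section and see whether a later read can ever be smaller than the previous one. Below is a minimal user-space sketch of that idea, assuming GNU/Linux on x86-64 with glibc; the names and the choice of CPUs 0 and 1 are illustrative only, and the authoritative implementation is the tsc_sync.c code added by this patch (which uses a serializing get_cycles_sync() read and a raw spinlock instead).

#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>
#include <stdint.h>
#include <stdio.h>
#include <x86intrin.h>

#define LOOPS 1000000

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static uint64_t last_tsc;	/* last TSC value seen inside the lock */
static uint64_t max_warp;	/* largest observed backwards step */

static void *warp_check(void *arg)
{
	int cpu = (int)(intptr_t)arg;
	uint64_t prev, now;
	cpu_set_t set;
	long i;

	/* Pin this thread so every read comes from one CPU's TSC. */
	CPU_ZERO(&set);
	CPU_SET(cpu, &set);
	pthread_setaffinity_np(pthread_self(), sizeof(set), &set);

	for (i = 0; i < LOOPS; i++) {
		pthread_mutex_lock(&lock);
		prev = last_tsc;
		/* Plain rdtsc for brevity; the kernel uses a serializing read. */
		now = __rdtsc();
		last_tsc = now;
		/* Inside the critical section time must not go backwards. */
		if (prev > now && prev - now > max_warp)
			max_warp = prev - now;
		pthread_mutex_unlock(&lock);
	}
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	/* CPUs 0 and 1 are assumed to exist and be online. */
	pthread_create(&a, NULL, warp_check, (void *)(intptr_t)0);
	pthread_create(&b, NULL, warp_check, (void *)(intptr_t)1);
	pthread_join(a, NULL);
	pthread_join(b, NULL);

	if (max_warp)
		printf("TSC warp observed: %llu cycles\n",
		       (unsigned long long)max_warp);
	else
		printf("no TSC warp observed\n");
	return 0;
}

Build with "gcc -O2 -pthread warp.c". A non-zero max_warp means the two TSCs cannot be used as a single clock, which is exactly the condition under which the new kernel code calls mark_tsc_unstable().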
Diffstat (limited to 'arch/x86_64')
-rw-r--r--  arch/x86_64/kernel/Makefile   |   2
-rw-r--r--  arch/x86_64/kernel/smpboot.c  | 230
-rw-r--r--  arch/x86_64/kernel/time.c     |  11
-rw-r--r--  arch/x86_64/kernel/tsc_sync.c | 187
4 files changed, 210 insertions, 220 deletions
diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile
index ae399458024b..6465eee6d920 100644
--- a/arch/x86_64/kernel/Makefile
+++ b/arch/x86_64/kernel/Makefile
@@ -19,7 +19,7 @@ obj-$(CONFIG_ACPI) += acpi/
 obj-$(CONFIG_X86_MSR) += msr.o
 obj-$(CONFIG_MICROCODE) += microcode.o
 obj-$(CONFIG_X86_CPUID) += cpuid.o
-obj-$(CONFIG_SMP) += smp.o smpboot.o trampoline.o
+obj-$(CONFIG_SMP) += smp.o smpboot.o trampoline.o tsc_sync.o
 obj-y += apic.o nmi.o
 obj-y += io_apic.o mpparse.o \
 	genapic.o genapic_cluster.o genapic_flat.o
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c
index daf19332f0dd..62d828433c30 100644
--- a/arch/x86_64/kernel/smpboot.c
+++ b/arch/x86_64/kernel/smpboot.c
@@ -148,217 +148,6 @@ static void __cpuinit smp_store_cpu_info(int id)
 	print_cpu_info(c);
 }
 
-/*
- * New Funky TSC sync algorithm borrowed from IA64.
- * Main advantage is that it doesn't reset the TSCs fully and
- * in general looks more robust and it works better than my earlier
- * attempts. I believe it was written by David Mosberger. Some minor
- * adjustments for x86-64 by me -AK
- *
- * Original comment reproduced below.
- *
- * Synchronize TSC of the current (slave) CPU with the TSC of the
- * MASTER CPU (normally the time-keeper CPU).  We use a closed loop to
- * eliminate the possibility of unaccounted-for errors (such as
- * getting a machine check in the middle of a calibration step).  The
- * basic idea is for the slave to ask the master what itc value it has
- * and to read its own itc before and after the master responds. Each
- * iteration gives us three timestamps:
- *
- *	slave		master
- *
- *	t0 ---\
- *	           ---\
- *	               --->
- *	                      tm
- *	               /---
- *	           /---
- *	t1 <---
- *
- *
- * The goal is to adjust the slave's TSC such that tm falls exactly
- * half-way between t0 and t1.  If we achieve this, the clocks are
- * synchronized provided the interconnect between the slave and the
- * master is symmetric.  Even if the interconnect were asymmetric, we
- * would still know that the synchronization error is smaller than the
- * roundtrip latency (t0 - t1).
- *
- * When the interconnect is quiet and symmetric, this lets us
- * synchronize the TSC to within one or two cycles.  However, we can
- * only *guarantee* that the synchronization is accurate to within a
- * round-trip time, which is typically in the range of several hundred
- * cycles (e.g., ~500 cycles).  In practice, this means that the TSCs
- * are usually almost perfectly synchronized, but we shouldn't assume
- * that the accuracy is much better than half a micro second or so.
- *
- * [there are other errors like the latency of RDTSC and of the
- * WRMSR. These can also account to hundreds of cycles. So it's
- * probably worse. It claims 153 cycles error on a dual Opteron,
- * but I suspect the numbers are actually somewhat worse -AK]
- */
-
-#define MASTER	0
-#define SLAVE	(SMP_CACHE_BYTES/8)
-
-/* Intentionally don't use cpu_relax() while TSC synchronization
-   because we don't want to go into funky power save modi or cause
-   hypervisors to schedule us away.  Going to sleep would likely affect
-   latency and low latency is the primary objective here. -AK */
-#define no_cpu_relax() barrier()
-
-static __cpuinitdata DEFINE_SPINLOCK(tsc_sync_lock);
-static volatile __cpuinitdata unsigned long go[SLAVE + 1];
-static int notscsync __cpuinitdata;
-
-#undef DEBUG_TSC_SYNC
-
-#define NUM_ROUNDS	64	/* magic value */
-#define NUM_ITERS	5	/* likewise */
-
-/* Callback on boot CPU */
-static __cpuinit void sync_master(void *arg)
-{
-	unsigned long flags, i;
-
-	go[MASTER] = 0;
-
-	local_irq_save(flags);
-	{
-		for (i = 0; i < NUM_ROUNDS*NUM_ITERS; ++i) {
-			while (!go[MASTER])
-				no_cpu_relax();
-			go[MASTER] = 0;
-			rdtscll(go[SLAVE]);
-		}
-	}
-	local_irq_restore(flags);
-}
-
-/*
- * Return the number of cycles by which our tsc differs from the tsc
- * on the master (time-keeper) CPU.  A positive number indicates our
- * tsc is ahead of the master, negative that it is behind.
- */
-static inline long
-get_delta(long *rt, long *master)
-{
-	unsigned long best_t0 = 0, best_t1 = ~0UL, best_tm = 0;
-	unsigned long tcenter, t0, t1, tm;
-	int i;
-
-	for (i = 0; i < NUM_ITERS; ++i) {
-		rdtscll(t0);
-		go[MASTER] = 1;
-		while (!(tm = go[SLAVE]))
-			no_cpu_relax();
-		go[SLAVE] = 0;
-		rdtscll(t1);
-
-		if (t1 - t0 < best_t1 - best_t0)
-			best_t0 = t0, best_t1 = t1, best_tm = tm;
-	}
-
-	*rt = best_t1 - best_t0;
-	*master = best_tm - best_t0;
-
-	/* average best_t0 and best_t1 without overflow: */
-	tcenter = (best_t0/2 + best_t1/2);
-	if (best_t0 % 2 + best_t1 % 2 == 2)
-		++tcenter;
-	return tcenter - best_tm;
-}
-
-static __cpuinit void sync_tsc(unsigned int master)
-{
-	int i, done = 0;
-	long delta, adj, adjust_latency = 0;
-	unsigned long flags, rt, master_time_stamp, bound;
-#ifdef DEBUG_TSC_SYNC
-	static struct syncdebug {
-		long rt;	/* roundtrip time */
-		long master;	/* master's timestamp */
-		long diff;	/* difference between midpoint and master's timestamp */
-		long lat;	/* estimate of tsc adjustment latency */
-	} t[NUM_ROUNDS] __cpuinitdata;
-#endif
-
-	printk(KERN_INFO "CPU %d: Syncing TSC to CPU %u.\n",
-	       smp_processor_id(), master);
-
-	go[MASTER] = 1;
-
-	/* It is dangerous to broadcast IPI as cpus are coming up,
-	 * as they may not be ready to accept them.  So since
-	 * we only need to send the ipi to the boot cpu direct
-	 * the message, and avoid the race.
-	 */
-	smp_call_function_single(master, sync_master, NULL, 1, 0);
-
-	while (go[MASTER])	/* wait for master to be ready */
-		no_cpu_relax();
-
-	spin_lock_irqsave(&tsc_sync_lock, flags);
-	{
-		for (i = 0; i < NUM_ROUNDS; ++i) {
-			delta = get_delta(&rt, &master_time_stamp);
-			if (delta == 0) {
-				done = 1;	/* let's lock on to this... */
-				bound = rt;
-			}
-
-			if (!done) {
-				unsigned long t;
-				if (i > 0) {
-					adjust_latency += -delta;
-					adj = -delta + adjust_latency/4;
-				} else
-					adj = -delta;
-
-				rdtscll(t);
-				wrmsrl(MSR_IA32_TSC, t + adj);
-			}
-#ifdef DEBUG_TSC_SYNC
-			t[i].rt = rt;
-			t[i].master = master_time_stamp;
-			t[i].diff = delta;
-			t[i].lat = adjust_latency/4;
-#endif
-		}
-	}
-	spin_unlock_irqrestore(&tsc_sync_lock, flags);
-
-#ifdef DEBUG_TSC_SYNC
-	for (i = 0; i < NUM_ROUNDS; ++i)
-		printk("rt=%5ld master=%5ld diff=%5ld adjlat=%5ld\n",
-		       t[i].rt, t[i].master, t[i].diff, t[i].lat);
-#endif
-
-	printk(KERN_INFO
-	       "CPU %d: synchronized TSC with CPU %u (last diff %ld cycles, "
-	       "maxerr %lu cycles)\n",
-	       smp_processor_id(), master, delta, rt);
-}
-
-static void __cpuinit tsc_sync_wait(void)
-{
-	/*
-	 * When the CPU has synchronized TSCs assume the BIOS
-	 * or the hardware already synced.  Otherwise we could
-	 * mess up a possible perfect synchronization with a
-	 * not-quite-perfect algorithm.
-	 */
-	if (notscsync || !cpu_has_tsc || !unsynchronized_tsc())
-		return;
-	sync_tsc(0);
-}
-
-static __init int notscsync_setup(char *s)
-{
-	notscsync = 1;
-	return 1;
-}
-__setup("notscsync", notscsync_setup);
-
 static atomic_t init_deasserted __cpuinitdata;
 
 /*
@@ -546,6 +335,11 @@ void __cpuinit start_secondary(void)
 	/* otherwise gcc will move up the smp_processor_id before the cpu_init */
 	barrier();
 
+	/*
+	 * Check TSC sync first:
+	 */
+	check_tsc_sync_target();
+
 	Dprintk("cpu %d: setting up apic clock\n", smp_processor_id());
 	setup_secondary_APIC_clock();
 
@@ -565,14 +359,6 @@ void __cpuinit start_secondary(void)
 	 */
 	set_cpu_sibling_map(smp_processor_id());
 
-	/*
-	 * Wait for TSC sync to not schedule things before.
-	 * We still process interrupts, which could see an inconsistent
-	 * time in that window unfortunately.
-	 * Do this here because TSC sync has global unprotected state.
-	 */
-	tsc_sync_wait();
-
 	/*
 	 * We need to hold call_lock, so there is no inconsistency
 	 * between the time smp_call_function() determines number of
@@ -592,6 +378,7 @@ void __cpuinit start_secondary(void)
 	cpu_set(smp_processor_id(), cpu_online_map);
 	per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
 	spin_unlock(&vector_lock);
+
 	unlock_ipi_call_lock();
 
 	cpu_idle();
@@ -1168,6 +955,11 @@ int __cpuinit __cpu_up(unsigned int cpu)
 	/* Unleash the CPU! */
 	Dprintk("waiting for cpu %d\n", cpu);
 
+	/*
+	 * Make sure and check TSC sync:
+	 */
+	check_tsc_sync_source(cpu);
+
 	while (!cpu_isset(cpu, cpu_online_map))
 		cpu_relax();
 
diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c
index 3cc6886f1fb7..8cb2b2d35f5d 100644
--- a/arch/x86_64/kernel/time.c
+++ b/arch/x86_64/kernel/time.c
@@ -944,12 +944,23 @@ void __init time_init(void)
 #endif
 }
 
+static int tsc_unstable = 0;
+
+void mark_tsc_unstable(void)
+{
+	tsc_unstable = 1;
+}
+EXPORT_SYMBOL_GPL(mark_tsc_unstable);
+
 /*
  * Make an educated guess if the TSC is trustworthy and synchronized
  * over all CPUs.
  */
 __cpuinit int unsynchronized_tsc(void)
 {
+	if (tsc_unstable)
+		return 1;
+
 #ifdef CONFIG_SMP
 	if (apic_is_clustered_box())
 		return 1;
diff --git a/arch/x86_64/kernel/tsc_sync.c b/arch/x86_64/kernel/tsc_sync.c
new file mode 100644
index 000000000000..014f0db45dfa
--- /dev/null
+++ b/arch/x86_64/kernel/tsc_sync.c
@@ -0,0 +1,187 @@
+/*
+ * arch/x86_64/kernel/tsc_sync.c: check TSC synchronization.
+ *
+ * Copyright (C) 2006, Red Hat, Inc., Ingo Molnar
+ *
+ * We check whether all boot CPUs have their TSC's synchronized,
+ * print a warning if not and turn off the TSC clock-source.
+ *
+ * The warp-check is point-to-point between two CPUs, the CPU
+ * initiating the bootup is the 'source CPU', the freshly booting
+ * CPU is the 'target CPU'.
+ *
+ * Only two CPUs may participate - they can enter in any order.
+ * ( The serial nature of the boot logic and the CPU hotplug lock
+ *   protects against more than 2 CPUs entering this code. )
+ */
+#include <linux/spinlock.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/smp.h>
+#include <linux/nmi.h>
+#include <asm/tsc.h>
+
+/*
+ * Entry/exit counters that make sure that both CPUs
+ * run the measurement code at once:
+ */
+static __cpuinitdata atomic_t start_count;
+static __cpuinitdata atomic_t stop_count;
+
+/*
+ * We use a raw spinlock in this exceptional case, because
+ * we want to have the fastest, inlined, non-debug version
+ * of a critical section, to be able to prove TSC time-warps:
+ */
+static __cpuinitdata raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED;
+static __cpuinitdata cycles_t last_tsc;
+static __cpuinitdata cycles_t max_warp;
+static __cpuinitdata int nr_warps;
+
+/*
+ * TSC-warp measurement loop running on both CPUs:
+ */
+static __cpuinit void check_tsc_warp(void)
+{
+	cycles_t start, now, prev, end;
+	int i;
+
+	start = get_cycles_sync();
+	/*
+	 * The measurement runs for 20 msecs:
+	 */
+	end = start + cpu_khz * 20ULL;
+	now = start;
+
+	for (i = 0; ; i++) {
+		/*
+		 * We take the global lock, measure TSC, save the
+		 * previous TSC that was measured (possibly on
+		 * another CPU) and update the previous TSC timestamp.
+		 */
+		__raw_spin_lock(&sync_lock);
+		prev = last_tsc;
+		now = get_cycles_sync();
+		last_tsc = now;
+		__raw_spin_unlock(&sync_lock);
+
+		/*
+		 * Be nice every now and then (and also check whether
+		 * measurement is done [we also insert a 100 million
+		 * loops safety exit, so we dont lock up in case the
+		 * TSC readout is totally broken]):
+		 */
+		if (unlikely(!(i & 7))) {
+			if (now > end || i > 100000000)
+				break;
+			cpu_relax();
+			touch_nmi_watchdog();
+		}
+		/*
+		 * Outside the critical section we can now see whether
+		 * we saw a time-warp of the TSC going backwards:
+		 */
+		if (unlikely(prev > now)) {
+			__raw_spin_lock(&sync_lock);
+			max_warp = max(max_warp, prev - now);
+			nr_warps++;
+			__raw_spin_unlock(&sync_lock);
+		}
+
+	}
+}
+
+/*
+ * Source CPU calls into this - it waits for the freshly booted
+ * target CPU to arrive and then starts the measurement:
+ */
+void __cpuinit check_tsc_sync_source(int cpu)
+{
+	int cpus = 2;
+
+	/*
+	 * No need to check if we already know that the TSC is not
+	 * synchronized:
+	 */
+	if (unsynchronized_tsc())
+		return;
+
+	printk(KERN_INFO "checking TSC synchronization [CPU#%d -> CPU#%d]:",
+		smp_processor_id(), cpu);
+
+	/*
+	 * Reset it - in case this is a second bootup:
+	 */
+	atomic_set(&stop_count, 0);
+
+	/*
+	 * Wait for the target to arrive:
+	 */
+	while (atomic_read(&start_count) != cpus-1)
+		cpu_relax();
+	/*
+	 * Trigger the target to continue into the measurement too:
+	 */
+	atomic_inc(&start_count);
+
+	check_tsc_warp();
+
+	while (atomic_read(&stop_count) != cpus-1)
+		cpu_relax();
+
+	/*
+	 * Reset it - just in case we boot another CPU later:
+	 */
+	atomic_set(&start_count, 0);
+
+	if (nr_warps) {
+		printk("\n");
+		printk(KERN_WARNING "Measured %Ld cycles TSC warp between CPUs,"
+			" turning off TSC clock.\n", max_warp);
+		mark_tsc_unstable();
+		nr_warps = 0;
+		max_warp = 0;
+		last_tsc = 0;
+	} else {
+		printk(" passed.\n");
+	}
+
+	/*
+	 * Let the target continue with the bootup:
+	 */
+	atomic_inc(&stop_count);
+}
+
+/*
+ * Freshly booted CPUs call into this:
+ */
+void __cpuinit check_tsc_sync_target(void)
+{
+	int cpus = 2;
+
+	if (unsynchronized_tsc())
+		return;
+
+	/*
+	 * Register this CPU's participation and wait for the
+	 * source CPU to start the measurement:
+	 */
+	atomic_inc(&start_count);
+	while (atomic_read(&start_count) != cpus)
+		cpu_relax();
+
+	check_tsc_warp();
+
+	/*
+	 * Ok, we are done:
+	 */
+	atomic_inc(&stop_count);
+
+	/*
+	 * Wait for the source CPU to print stuff:
+	 */
+	while (atomic_read(&stop_count) != cpus)
+		cpu_relax();
+}
+#undef NR_LOOPS
+