aboutsummaryrefslogtreecommitdiffstats
path: root/arch
diff options
context:
space:
mode:
authorIngo Molnar <mingo@elte.hu>2007-02-16 04:27:34 -0500
committerLinus Torvalds <torvalds@woody.linux-foundation.org>2007-02-16 11:13:57 -0500
commit95492e4646e5de8b43d9a7908d6177fb737b61f0 (patch)
treeae25cd206ca76f78d50ac2a206ef012e0ab1d9df /arch
parent92c7e00254b2d0efc1e36ac3e45474ce1871b6b2 (diff)
[PATCH] x86: rewrite SMP TSC sync code
make the TSC synchronization code more robust, and unify it between x86_64 and i386. The biggest change is the removal of the 'fix up TSCs' code on x86_64 and i386, in some rare cases it was /causing/ time-warps on SMP systems. The new code only checks for TSC asynchronity - and if it can prove a time-warp (if it can observe the TSC going backwards when going from one CPU to another within a critical section), then the TSC clock-source is turned off. The TSC synchronization-checking code also got moved into a separate file. Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Cc: john stultz <johnstul@us.ibm.com> Cc: Roman Zippel <zippel@linux-m68k.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'arch')
-rw-r--r--arch/i386/kernel/Makefile2
-rw-r--r--arch/i386/kernel/smpboot.c178
-rw-r--r--arch/i386/kernel/tsc.c4
-rw-r--r--arch/i386/kernel/tsc_sync.c1
-rw-r--r--arch/x86_64/kernel/Makefile2
-rw-r--r--arch/x86_64/kernel/smpboot.c230
-rw-r--r--arch/x86_64/kernel/time.c11
-rw-r--r--arch/x86_64/kernel/tsc_sync.c187
8 files changed, 227 insertions, 388 deletions
diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile
index cbe4e601885c..c2b3b79dc436 100644
--- a/arch/i386/kernel/Makefile
+++ b/arch/i386/kernel/Makefile
@@ -18,7 +18,7 @@ obj-$(CONFIG_X86_MSR) += msr.o
18obj-$(CONFIG_X86_CPUID) += cpuid.o 18obj-$(CONFIG_X86_CPUID) += cpuid.o
19obj-$(CONFIG_MICROCODE) += microcode.o 19obj-$(CONFIG_MICROCODE) += microcode.o
20obj-$(CONFIG_APM) += apm.o 20obj-$(CONFIG_APM) += apm.o
21obj-$(CONFIG_X86_SMP) += smp.o smpboot.o 21obj-$(CONFIG_X86_SMP) += smp.o smpboot.o tsc_sync.o
22obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o 22obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o
23obj-$(CONFIG_X86_MPPARSE) += mpparse.o 23obj-$(CONFIG_X86_MPPARSE) += mpparse.o
24obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o 24obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o
diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
index f46a4d095e6c..6ddffe8aabb2 100644
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -94,12 +94,6 @@ cpumask_t cpu_possible_map;
94EXPORT_SYMBOL(cpu_possible_map); 94EXPORT_SYMBOL(cpu_possible_map);
95static cpumask_t smp_commenced_mask; 95static cpumask_t smp_commenced_mask;
96 96
97/* TSC's upper 32 bits can't be written in eariler CPU (before prescott), there
98 * is no way to resync one AP against BP. TBD: for prescott and above, we
99 * should use IA64's algorithm
100 */
101static int __devinitdata tsc_sync_disabled;
102
103/* Per CPU bogomips and other parameters */ 97/* Per CPU bogomips and other parameters */
104struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned; 98struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned;
105EXPORT_SYMBOL(cpu_data); 99EXPORT_SYMBOL(cpu_data);
@@ -216,151 +210,6 @@ valid_k7:
216 ; 210 ;
217} 211}
218 212
219/*
220 * TSC synchronization.
221 *
222 * We first check whether all CPUs have their TSC's synchronized,
223 * then we print a warning if not, and always resync.
224 */
225
226static struct {
227 atomic_t start_flag;
228 atomic_t count_start;
229 atomic_t count_stop;
230 unsigned long long values[NR_CPUS];
231} tsc __cpuinitdata = {
232 .start_flag = ATOMIC_INIT(0),
233 .count_start = ATOMIC_INIT(0),
234 .count_stop = ATOMIC_INIT(0),
235};
236
237#define NR_LOOPS 5
238
239static void __init synchronize_tsc_bp(void)
240{
241 int i;
242 unsigned long long t0;
243 unsigned long long sum, avg;
244 long long delta;
245 unsigned int one_usec;
246 int buggy = 0;
247
248 printk(KERN_INFO "checking TSC synchronization across %u CPUs: ", num_booting_cpus());
249
250 /* convert from kcyc/sec to cyc/usec */
251 one_usec = cpu_khz / 1000;
252
253 atomic_set(&tsc.start_flag, 1);
254 wmb();
255
256 /*
257 * We loop a few times to get a primed instruction cache,
258 * then the last pass is more or less synchronized and
259 * the BP and APs set their cycle counters to zero all at
260 * once. This reduces the chance of having random offsets
261 * between the processors, and guarantees that the maximum
262 * delay between the cycle counters is never bigger than
263 * the latency of information-passing (cachelines) between
264 * two CPUs.
265 */
266 for (i = 0; i < NR_LOOPS; i++) {
267 /*
268 * all APs synchronize but they loop on '== num_cpus'
269 */
270 while (atomic_read(&tsc.count_start) != num_booting_cpus()-1)
271 cpu_relax();
272 atomic_set(&tsc.count_stop, 0);
273 wmb();
274 /*
275 * this lets the APs save their current TSC:
276 */
277 atomic_inc(&tsc.count_start);
278
279 rdtscll(tsc.values[smp_processor_id()]);
280 /*
281 * We clear the TSC in the last loop:
282 */
283 if (i == NR_LOOPS-1)
284 write_tsc(0, 0);
285
286 /*
287 * Wait for all APs to leave the synchronization point:
288 */
289 while (atomic_read(&tsc.count_stop) != num_booting_cpus()-1)
290 cpu_relax();
291 atomic_set(&tsc.count_start, 0);
292 wmb();
293 atomic_inc(&tsc.count_stop);
294 }
295
296 sum = 0;
297 for (i = 0; i < NR_CPUS; i++) {
298 if (cpu_isset(i, cpu_callout_map)) {
299 t0 = tsc.values[i];
300 sum += t0;
301 }
302 }
303 avg = sum;
304 do_div(avg, num_booting_cpus());
305
306 for (i = 0; i < NR_CPUS; i++) {
307 if (!cpu_isset(i, cpu_callout_map))
308 continue;
309 delta = tsc.values[i] - avg;
310 if (delta < 0)
311 delta = -delta;
312 /*
313 * We report bigger than 2 microseconds clock differences.
314 */
315 if (delta > 2*one_usec) {
316 long long realdelta;
317
318 if (!buggy) {
319 buggy = 1;
320 printk("\n");
321 }
322 realdelta = delta;
323 do_div(realdelta, one_usec);
324 if (tsc.values[i] < avg)
325 realdelta = -realdelta;
326
327 if (realdelta)
328 printk(KERN_INFO "CPU#%d had %Ld usecs TSC "
329 "skew, fixed it up.\n", i, realdelta);
330 }
331 }
332 if (!buggy)
333 printk("passed.\n");
334}
335
336static void __cpuinit synchronize_tsc_ap(void)
337{
338 int i;
339
340 /*
341 * Not every cpu is online at the time
342 * this gets called, so we first wait for the BP to
343 * finish SMP initialization:
344 */
345 while (!atomic_read(&tsc.start_flag))
346 cpu_relax();
347
348 for (i = 0; i < NR_LOOPS; i++) {
349 atomic_inc(&tsc.count_start);
350 while (atomic_read(&tsc.count_start) != num_booting_cpus())
351 cpu_relax();
352
353 rdtscll(tsc.values[smp_processor_id()]);
354 if (i == NR_LOOPS-1)
355 write_tsc(0, 0);
356
357 atomic_inc(&tsc.count_stop);
358 while (atomic_read(&tsc.count_stop) != num_booting_cpus())
359 cpu_relax();
360 }
361}
362#undef NR_LOOPS
363
364extern void calibrate_delay(void); 213extern void calibrate_delay(void);
365 214
366static atomic_t init_deasserted; 215static atomic_t init_deasserted;
@@ -446,12 +295,6 @@ static void __cpuinit smp_callin(void)
446 * Allow the master to continue. 295 * Allow the master to continue.
447 */ 296 */
448 cpu_set(cpuid, cpu_callin_map); 297 cpu_set(cpuid, cpu_callin_map);
449
450 /*
451 * Synchronize the TSC with the BP
452 */
453 if (cpu_has_tsc && cpu_khz && !tsc_sync_disabled)
454 synchronize_tsc_ap();
455} 298}
456 299
457static int cpucount; 300static int cpucount;
@@ -554,6 +397,11 @@ static void __cpuinit start_secondary(void *unused)
554 smp_callin(); 397 smp_callin();
555 while (!cpu_isset(smp_processor_id(), smp_commenced_mask)) 398 while (!cpu_isset(smp_processor_id(), smp_commenced_mask))
556 rep_nop(); 399 rep_nop();
400 /*
401 * Check TSC synchronization with the BP:
402 */
403 check_tsc_sync_target();
404
557 setup_secondary_clock(); 405 setup_secondary_clock();
558 if (nmi_watchdog == NMI_IO_APIC) { 406 if (nmi_watchdog == NMI_IO_APIC) {
559 disable_8259A_irq(0); 407 disable_8259A_irq(0);
@@ -1125,8 +973,6 @@ static int __cpuinit __smp_prepare_cpu(int cpu)
1125 info.cpu = cpu; 973 info.cpu = cpu;
1126 INIT_WORK(&info.task, do_warm_boot_cpu); 974 INIT_WORK(&info.task, do_warm_boot_cpu);
1127 975
1128 tsc_sync_disabled = 1;
1129
1130 /* init low mem mapping */ 976 /* init low mem mapping */
1131 clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS, 977 clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS,
1132 min_t(unsigned long, KERNEL_PGD_PTRS, USER_PGD_PTRS)); 978 min_t(unsigned long, KERNEL_PGD_PTRS, USER_PGD_PTRS));
@@ -1134,7 +980,6 @@ static int __cpuinit __smp_prepare_cpu(int cpu)
1134 schedule_work(&info.task); 980 schedule_work(&info.task);
1135 wait_for_completion(&done); 981 wait_for_completion(&done);
1136 982
1137 tsc_sync_disabled = 0;
1138 zap_low_mappings(); 983 zap_low_mappings();
1139 ret = 0; 984 ret = 0;
1140exit: 985exit:
@@ -1331,12 +1176,6 @@ static void __init smp_boot_cpus(unsigned int max_cpus)
1331 smpboot_setup_io_apic(); 1176 smpboot_setup_io_apic();
1332 1177
1333 setup_boot_clock(); 1178 setup_boot_clock();
1334
1335 /*
1336 * Synchronize the TSC with the AP
1337 */
1338 if (cpu_has_tsc && cpucount && cpu_khz)
1339 synchronize_tsc_bp();
1340} 1179}
1341 1180
1342/* These are wrappers to interface to the new boot process. Someone 1181/* These are wrappers to interface to the new boot process. Someone
@@ -1471,9 +1310,16 @@ int __cpuinit __cpu_up(unsigned int cpu)
1471 } 1310 }
1472 1311
1473 local_irq_enable(); 1312 local_irq_enable();
1313
1474 per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; 1314 per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
1475 /* Unleash the CPU! */ 1315 /* Unleash the CPU! */
1476 cpu_set(cpu, smp_commenced_mask); 1316 cpu_set(cpu, smp_commenced_mask);
1317
1318 /*
1319 * Check TSC synchronization with the AP:
1320 */
1321 check_tsc_sync_source(cpu);
1322
1477 while (!cpu_isset(cpu, cpu_online_map)) 1323 while (!cpu_isset(cpu, cpu_online_map))
1478 cpu_relax(); 1324 cpu_relax();
1479 1325
diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c
index 6f6971da761c..0fd93107ff9a 100644
--- a/arch/i386/kernel/tsc.c
+++ b/arch/i386/kernel/tsc.c
@@ -406,8 +406,10 @@ out:
406 * Make an educated guess if the TSC is trustworthy and synchronized 406 * Make an educated guess if the TSC is trustworthy and synchronized
407 * over all CPUs. 407 * over all CPUs.
408 */ 408 */
409static __init int unsynchronized_tsc(void) 409__cpuinit int unsynchronized_tsc(void)
410{ 410{
411 if (!cpu_has_tsc || tsc_unstable)
412 return 1;
411 /* 413 /*
412 * Intel systems are normally all synchronized. 414 * Intel systems are normally all synchronized.
413 * Exceptions must mark TSC as unstable: 415 * Exceptions must mark TSC as unstable:
diff --git a/arch/i386/kernel/tsc_sync.c b/arch/i386/kernel/tsc_sync.c
new file mode 100644
index 000000000000..12424629af87
--- /dev/null
+++ b/arch/i386/kernel/tsc_sync.c
@@ -0,0 +1 @@
#include "../../x86_64/kernel/tsc_sync.c"
diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile
index ae399458024b..6465eee6d920 100644
--- a/arch/x86_64/kernel/Makefile
+++ b/arch/x86_64/kernel/Makefile
@@ -19,7 +19,7 @@ obj-$(CONFIG_ACPI) += acpi/
19obj-$(CONFIG_X86_MSR) += msr.o 19obj-$(CONFIG_X86_MSR) += msr.o
20obj-$(CONFIG_MICROCODE) += microcode.o 20obj-$(CONFIG_MICROCODE) += microcode.o
21obj-$(CONFIG_X86_CPUID) += cpuid.o 21obj-$(CONFIG_X86_CPUID) += cpuid.o
22obj-$(CONFIG_SMP) += smp.o smpboot.o trampoline.o 22obj-$(CONFIG_SMP) += smp.o smpboot.o trampoline.o tsc_sync.o
23obj-y += apic.o nmi.o 23obj-y += apic.o nmi.o
24obj-y += io_apic.o mpparse.o \ 24obj-y += io_apic.o mpparse.o \
25 genapic.o genapic_cluster.o genapic_flat.o 25 genapic.o genapic_cluster.o genapic_flat.o
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c
index daf19332f0dd..62d828433c30 100644
--- a/arch/x86_64/kernel/smpboot.c
+++ b/arch/x86_64/kernel/smpboot.c
@@ -148,217 +148,6 @@ static void __cpuinit smp_store_cpu_info(int id)
148 print_cpu_info(c); 148 print_cpu_info(c);
149} 149}
150 150
151/*
152 * New Funky TSC sync algorithm borrowed from IA64.
153 * Main advantage is that it doesn't reset the TSCs fully and
154 * in general looks more robust and it works better than my earlier
155 * attempts. I believe it was written by David Mosberger. Some minor
156 * adjustments for x86-64 by me -AK
157 *
158 * Original comment reproduced below.
159 *
160 * Synchronize TSC of the current (slave) CPU with the TSC of the
161 * MASTER CPU (normally the time-keeper CPU). We use a closed loop to
162 * eliminate the possibility of unaccounted-for errors (such as
163 * getting a machine check in the middle of a calibration step). The
164 * basic idea is for the slave to ask the master what itc value it has
165 * and to read its own itc before and after the master responds. Each
166 * iteration gives us three timestamps:
167 *
168 * slave master
169 *
170 * t0 ---\
171 * ---\
172 * --->
173 * tm
174 * /---
175 * /---
176 * t1 <---
177 *
178 *
179 * The goal is to adjust the slave's TSC such that tm falls exactly
180 * half-way between t0 and t1. If we achieve this, the clocks are
181 * synchronized provided the interconnect between the slave and the
182 * master is symmetric. Even if the interconnect were asymmetric, we
183 * would still know that the synchronization error is smaller than the
184 * roundtrip latency (t0 - t1).
185 *
186 * When the interconnect is quiet and symmetric, this lets us
187 * synchronize the TSC to within one or two cycles. However, we can
188 * only *guarantee* that the synchronization is accurate to within a
189 * round-trip time, which is typically in the range of several hundred
190 * cycles (e.g., ~500 cycles). In practice, this means that the TSCs
191 * are usually almost perfectly synchronized, but we shouldn't assume
192 * that the accuracy is much better than half a micro second or so.
193 *
194 * [there are other errors like the latency of RDTSC and of the
195 * WRMSR. These can also account to hundreds of cycles. So it's
196 * probably worse. It claims 153 cycles error on a dual Opteron,
197 * but I suspect the numbers are actually somewhat worse -AK]
198 */
199
200#define MASTER 0
201#define SLAVE (SMP_CACHE_BYTES/8)
202
203/* Intentionally don't use cpu_relax() while TSC synchronization
204 because we don't want to go into funky power save modi or cause
205 hypervisors to schedule us away. Going to sleep would likely affect
206 latency and low latency is the primary objective here. -AK */
207#define no_cpu_relax() barrier()
208
209static __cpuinitdata DEFINE_SPINLOCK(tsc_sync_lock);
210static volatile __cpuinitdata unsigned long go[SLAVE + 1];
211static int notscsync __cpuinitdata;
212
213#undef DEBUG_TSC_SYNC
214
215#define NUM_ROUNDS 64 /* magic value */
216#define NUM_ITERS 5 /* likewise */
217
218/* Callback on boot CPU */
219static __cpuinit void sync_master(void *arg)
220{
221 unsigned long flags, i;
222
223 go[MASTER] = 0;
224
225 local_irq_save(flags);
226 {
227 for (i = 0; i < NUM_ROUNDS*NUM_ITERS; ++i) {
228 while (!go[MASTER])
229 no_cpu_relax();
230 go[MASTER] = 0;
231 rdtscll(go[SLAVE]);
232 }
233 }
234 local_irq_restore(flags);
235}
236
237/*
238 * Return the number of cycles by which our tsc differs from the tsc
239 * on the master (time-keeper) CPU. A positive number indicates our
240 * tsc is ahead of the master, negative that it is behind.
241 */
242static inline long
243get_delta(long *rt, long *master)
244{
245 unsigned long best_t0 = 0, best_t1 = ~0UL, best_tm = 0;
246 unsigned long tcenter, t0, t1, tm;
247 int i;
248
249 for (i = 0; i < NUM_ITERS; ++i) {
250 rdtscll(t0);
251 go[MASTER] = 1;
252 while (!(tm = go[SLAVE]))
253 no_cpu_relax();
254 go[SLAVE] = 0;
255 rdtscll(t1);
256
257 if (t1 - t0 < best_t1 - best_t0)
258 best_t0 = t0, best_t1 = t1, best_tm = tm;
259 }
260
261 *rt = best_t1 - best_t0;
262 *master = best_tm - best_t0;
263
264 /* average best_t0 and best_t1 without overflow: */
265 tcenter = (best_t0/2 + best_t1/2);
266 if (best_t0 % 2 + best_t1 % 2 == 2)
267 ++tcenter;
268 return tcenter - best_tm;
269}
270
271static __cpuinit void sync_tsc(unsigned int master)
272{
273 int i, done = 0;
274 long delta, adj, adjust_latency = 0;
275 unsigned long flags, rt, master_time_stamp, bound;
276#ifdef DEBUG_TSC_SYNC
277 static struct syncdebug {
278 long rt; /* roundtrip time */
279 long master; /* master's timestamp */
280 long diff; /* difference between midpoint and master's timestamp */
281 long lat; /* estimate of tsc adjustment latency */
282 } t[NUM_ROUNDS] __cpuinitdata;
283#endif
284
285 printk(KERN_INFO "CPU %d: Syncing TSC to CPU %u.\n",
286 smp_processor_id(), master);
287
288 go[MASTER] = 1;
289
290 /* It is dangerous to broadcast IPI as cpus are coming up,
291 * as they may not be ready to accept them. So since
292 * we only need to send the ipi to the boot cpu direct
293 * the message, and avoid the race.
294 */
295 smp_call_function_single(master, sync_master, NULL, 1, 0);
296
297 while (go[MASTER]) /* wait for master to be ready */
298 no_cpu_relax();
299
300 spin_lock_irqsave(&tsc_sync_lock, flags);
301 {
302 for (i = 0; i < NUM_ROUNDS; ++i) {
303 delta = get_delta(&rt, &master_time_stamp);
304 if (delta == 0) {
305 done = 1; /* let's lock on to this... */
306 bound = rt;
307 }
308
309 if (!done) {
310 unsigned long t;
311 if (i > 0) {
312 adjust_latency += -delta;
313 adj = -delta + adjust_latency/4;
314 } else
315 adj = -delta;
316
317 rdtscll(t);
318 wrmsrl(MSR_IA32_TSC, t + adj);
319 }
320#ifdef DEBUG_TSC_SYNC
321 t[i].rt = rt;
322 t[i].master = master_time_stamp;
323 t[i].diff = delta;
324 t[i].lat = adjust_latency/4;
325#endif
326 }
327 }
328 spin_unlock_irqrestore(&tsc_sync_lock, flags);
329
330#ifdef DEBUG_TSC_SYNC
331 for (i = 0; i < NUM_ROUNDS; ++i)
332 printk("rt=%5ld master=%5ld diff=%5ld adjlat=%5ld\n",
333 t[i].rt, t[i].master, t[i].diff, t[i].lat);
334#endif
335
336 printk(KERN_INFO
337 "CPU %d: synchronized TSC with CPU %u (last diff %ld cycles, "
338 "maxerr %lu cycles)\n",
339 smp_processor_id(), master, delta, rt);
340}
341
342static void __cpuinit tsc_sync_wait(void)
343{
344 /*
345 * When the CPU has synchronized TSCs assume the BIOS
346 * or the hardware already synced. Otherwise we could
347 * mess up a possible perfect synchronization with a
348 * not-quite-perfect algorithm.
349 */
350 if (notscsync || !cpu_has_tsc || !unsynchronized_tsc())
351 return;
352 sync_tsc(0);
353}
354
355static __init int notscsync_setup(char *s)
356{
357 notscsync = 1;
358 return 1;
359}
360__setup("notscsync", notscsync_setup);
361
362static atomic_t init_deasserted __cpuinitdata; 151static atomic_t init_deasserted __cpuinitdata;
363 152
364/* 153/*
@@ -546,6 +335,11 @@ void __cpuinit start_secondary(void)
546 /* otherwise gcc will move up the smp_processor_id before the cpu_init */ 335 /* otherwise gcc will move up the smp_processor_id before the cpu_init */
547 barrier(); 336 barrier();
548 337
338 /*
339 * Check TSC sync first:
340 */
341 check_tsc_sync_target();
342
549 Dprintk("cpu %d: setting up apic clock\n", smp_processor_id()); 343 Dprintk("cpu %d: setting up apic clock\n", smp_processor_id());
550 setup_secondary_APIC_clock(); 344 setup_secondary_APIC_clock();
551 345
@@ -565,14 +359,6 @@ void __cpuinit start_secondary(void)
565 */ 359 */
566 set_cpu_sibling_map(smp_processor_id()); 360 set_cpu_sibling_map(smp_processor_id());
567 361
568 /*
569 * Wait for TSC sync to not schedule things before.
570 * We still process interrupts, which could see an inconsistent
571 * time in that window unfortunately.
572 * Do this here because TSC sync has global unprotected state.
573 */
574 tsc_sync_wait();
575
576 /* 362 /*
577 * We need to hold call_lock, so there is no inconsistency 363 * We need to hold call_lock, so there is no inconsistency
578 * between the time smp_call_function() determines number of 364 * between the time smp_call_function() determines number of
@@ -592,6 +378,7 @@ void __cpuinit start_secondary(void)
592 cpu_set(smp_processor_id(), cpu_online_map); 378 cpu_set(smp_processor_id(), cpu_online_map);
593 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; 379 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
594 spin_unlock(&vector_lock); 380 spin_unlock(&vector_lock);
381
595 unlock_ipi_call_lock(); 382 unlock_ipi_call_lock();
596 383
597 cpu_idle(); 384 cpu_idle();
@@ -1168,6 +955,11 @@ int __cpuinit __cpu_up(unsigned int cpu)
1168 /* Unleash the CPU! */ 955 /* Unleash the CPU! */
1169 Dprintk("waiting for cpu %d\n", cpu); 956 Dprintk("waiting for cpu %d\n", cpu);
1170 957
958 /*
959 * Make sure and check TSC sync:
960 */
961 check_tsc_sync_source(cpu);
962
1171 while (!cpu_isset(cpu, cpu_online_map)) 963 while (!cpu_isset(cpu, cpu_online_map))
1172 cpu_relax(); 964 cpu_relax();
1173 965
diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c
index 3cc6886f1fb7..8cb2b2d35f5d 100644
--- a/arch/x86_64/kernel/time.c
+++ b/arch/x86_64/kernel/time.c
@@ -944,12 +944,23 @@ void __init time_init(void)
944#endif 944#endif
945} 945}
946 946
947static int tsc_unstable = 0;
948
949void mark_tsc_unstable(void)
950{
951 tsc_unstable = 1;
952}
953EXPORT_SYMBOL_GPL(mark_tsc_unstable);
954
947/* 955/*
948 * Make an educated guess if the TSC is trustworthy and synchronized 956 * Make an educated guess if the TSC is trustworthy and synchronized
949 * over all CPUs. 957 * over all CPUs.
950 */ 958 */
951__cpuinit int unsynchronized_tsc(void) 959__cpuinit int unsynchronized_tsc(void)
952{ 960{
961 if (tsc_unstable)
962 return 1;
963
953#ifdef CONFIG_SMP 964#ifdef CONFIG_SMP
954 if (apic_is_clustered_box()) 965 if (apic_is_clustered_box())
955 return 1; 966 return 1;
diff --git a/arch/x86_64/kernel/tsc_sync.c b/arch/x86_64/kernel/tsc_sync.c
new file mode 100644
index 000000000000..014f0db45dfa
--- /dev/null
+++ b/arch/x86_64/kernel/tsc_sync.c
@@ -0,0 +1,187 @@
1/*
2 * arch/x86_64/kernel/tsc_sync.c: check TSC synchronization.
3 *
4 * Copyright (C) 2006, Red Hat, Inc., Ingo Molnar
5 *
6 * We check whether all boot CPUs have their TSC's synchronized,
7 * print a warning if not and turn off the TSC clock-source.
8 *
9 * The warp-check is point-to-point between two CPUs, the CPU
10 * initiating the bootup is the 'source CPU', the freshly booting
11 * CPU is the 'target CPU'.
12 *
13 * Only two CPUs may participate - they can enter in any order.
14 * ( The serial nature of the boot logic and the CPU hotplug lock
15 * protects against more than 2 CPUs entering this code. )
16 */
17#include <linux/spinlock.h>
18#include <linux/kernel.h>
19#include <linux/init.h>
20#include <linux/smp.h>
21#include <linux/nmi.h>
22#include <asm/tsc.h>
23
24/*
25 * Entry/exit counters that make sure that both CPUs
26 * run the measurement code at once:
27 */
28static __cpuinitdata atomic_t start_count;
29static __cpuinitdata atomic_t stop_count;
30
31/*
32 * We use a raw spinlock in this exceptional case, because
33 * we want to have the fastest, inlined, non-debug version
34 * of a critical section, to be able to prove TSC time-warps:
35 */
36static __cpuinitdata raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED;
37static __cpuinitdata cycles_t last_tsc;
38static __cpuinitdata cycles_t max_warp;
39static __cpuinitdata int nr_warps;
40
41/*
42 * TSC-warp measurement loop running on both CPUs:
43 */
44static __cpuinit void check_tsc_warp(void)
45{
46 cycles_t start, now, prev, end;
47 int i;
48
49 start = get_cycles_sync();
50 /*
51 * The measurement runs for 20 msecs:
52 */
53 end = start + cpu_khz * 20ULL;
54 now = start;
55
56 for (i = 0; ; i++) {
57 /*
58 * We take the global lock, measure TSC, save the
59 * previous TSC that was measured (possibly on
60 * another CPU) and update the previous TSC timestamp.
61 */
62 __raw_spin_lock(&sync_lock);
63 prev = last_tsc;
64 now = get_cycles_sync();
65 last_tsc = now;
66 __raw_spin_unlock(&sync_lock);
67
68 /*
69 * Be nice every now and then (and also check whether
70 * measurement is done [we also insert a 100 million
71 * loops safety exit, so we dont lock up in case the
72 * TSC readout is totally broken]):
73 */
74 if (unlikely(!(i & 7))) {
75 if (now > end || i > 100000000)
76 break;
77 cpu_relax();
78 touch_nmi_watchdog();
79 }
80 /*
81 * Outside the critical section we can now see whether
82 * we saw a time-warp of the TSC going backwards:
83 */
84 if (unlikely(prev > now)) {
85 __raw_spin_lock(&sync_lock);
86 max_warp = max(max_warp, prev - now);
87 nr_warps++;
88 __raw_spin_unlock(&sync_lock);
89 }
90
91 }
92}
93
94/*
95 * Source CPU calls into this - it waits for the freshly booted
96 * target CPU to arrive and then starts the measurement:
97 */
98void __cpuinit check_tsc_sync_source(int cpu)
99{
100 int cpus = 2;
101
102 /*
103 * No need to check if we already know that the TSC is not
104 * synchronized:
105 */
106 if (unsynchronized_tsc())
107 return;
108
109 printk(KERN_INFO "checking TSC synchronization [CPU#%d -> CPU#%d]:",
110 smp_processor_id(), cpu);
111
112 /*
113 * Reset it - in case this is a second bootup:
114 */
115 atomic_set(&stop_count, 0);
116
117 /*
118 * Wait for the target to arrive:
119 */
120 while (atomic_read(&start_count) != cpus-1)
121 cpu_relax();
122 /*
123 * Trigger the target to continue into the measurement too:
124 */
125 atomic_inc(&start_count);
126
127 check_tsc_warp();
128
129 while (atomic_read(&stop_count) != cpus-1)
130 cpu_relax();
131
132 /*
133 * Reset it - just in case we boot another CPU later:
134 */
135 atomic_set(&start_count, 0);
136
137 if (nr_warps) {
138 printk("\n");
139 printk(KERN_WARNING "Measured %Ld cycles TSC warp between CPUs,"
140 " turning off TSC clock.\n", max_warp);
141 mark_tsc_unstable();
142 nr_warps = 0;
143 max_warp = 0;
144 last_tsc = 0;
145 } else {
146 printk(" passed.\n");
147 }
148
149 /*
150 * Let the target continue with the bootup:
151 */
152 atomic_inc(&stop_count);
153}
154
155/*
156 * Freshly booted CPUs call into this:
157 */
158void __cpuinit check_tsc_sync_target(void)
159{
160 int cpus = 2;
161
162 if (unsynchronized_tsc())
163 return;
164
165 /*
166 * Register this CPU's participation and wait for the
167 * source CPU to start the measurement:
168 */
169 atomic_inc(&start_count);
170 while (atomic_read(&start_count) != cpus)
171 cpu_relax();
172
173 check_tsc_warp();
174
175 /*
176 * Ok, we are done:
177 */
178 atomic_inc(&stop_count);
179
180 /*
181 * Wait for the source CPU to print stuff:
182 */
183 while (atomic_read(&stop_count) != cpus)
184 cpu_relax();
185}
186#undef NR_LOOPS
187