aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86_64/kernel/smpboot.c
diff options
context:
space:
mode:
authorIngo Molnar <mingo@elte.hu>2007-02-16 04:27:34 -0500
committerLinus Torvalds <torvalds@woody.linux-foundation.org>2007-02-16 11:13:57 -0500
commit95492e4646e5de8b43d9a7908d6177fb737b61f0 (patch)
treeae25cd206ca76f78d50ac2a206ef012e0ab1d9df /arch/x86_64/kernel/smpboot.c
parent92c7e00254b2d0efc1e36ac3e45474ce1871b6b2 (diff)
[PATCH] x86: rewrite SMP TSC sync code
make the TSC synchronization code more robust, and unify it between x86_64 and i386. The biggest change is the removal of the 'fix up TSCs' code on x86_64 and i386, in some rare cases it was /causing/ time-warps on SMP systems. The new code only checks for TSC asynchronity - and if it can prove a time-warp (if it can observe the TSC going backwards when going from one CPU to another within a critical section), then the TSC clock-source is turned off. The TSC synchronization-checking code also got moved into a separate file. Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Cc: john stultz <johnstul@us.ibm.com> Cc: Roman Zippel <zippel@linux-m68k.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'arch/x86_64/kernel/smpboot.c')
-rw-r--r--arch/x86_64/kernel/smpboot.c230
1 files changed, 11 insertions, 219 deletions
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c
index daf19332f0dd..62d828433c30 100644
--- a/arch/x86_64/kernel/smpboot.c
+++ b/arch/x86_64/kernel/smpboot.c
@@ -148,217 +148,6 @@ static void __cpuinit smp_store_cpu_info(int id)
148 print_cpu_info(c); 148 print_cpu_info(c);
149} 149}
150 150
151/*
152 * New Funky TSC sync algorithm borrowed from IA64.
153 * Main advantage is that it doesn't reset the TSCs fully and
154 * in general looks more robust and it works better than my earlier
155 * attempts. I believe it was written by David Mosberger. Some minor
156 * adjustments for x86-64 by me -AK
157 *
158 * Original comment reproduced below.
159 *
160 * Synchronize TSC of the current (slave) CPU with the TSC of the
161 * MASTER CPU (normally the time-keeper CPU). We use a closed loop to
162 * eliminate the possibility of unaccounted-for errors (such as
163 * getting a machine check in the middle of a calibration step). The
164 * basic idea is for the slave to ask the master what itc value it has
165 * and to read its own itc before and after the master responds. Each
166 * iteration gives us three timestamps:
167 *
168 * slave master
169 *
170 * t0 ---\
171 * ---\
172 * --->
173 * tm
174 * /---
175 * /---
176 * t1 <---
177 *
178 *
179 * The goal is to adjust the slave's TSC such that tm falls exactly
180 * half-way between t0 and t1. If we achieve this, the clocks are
181 * synchronized provided the interconnect between the slave and the
182 * master is symmetric. Even if the interconnect were asymmetric, we
183 * would still know that the synchronization error is smaller than the
184 * roundtrip latency (t0 - t1).
185 *
186 * When the interconnect is quiet and symmetric, this lets us
187 * synchronize the TSC to within one or two cycles. However, we can
188 * only *guarantee* that the synchronization is accurate to within a
189 * round-trip time, which is typically in the range of several hundred
190 * cycles (e.g., ~500 cycles). In practice, this means that the TSCs
191 * are usually almost perfectly synchronized, but we shouldn't assume
192 * that the accuracy is much better than half a micro second or so.
193 *
194 * [there are other errors like the latency of RDTSC and of the
195 * WRMSR. These can also account to hundreds of cycles. So it's
196 * probably worse. It claims 153 cycles error on a dual Opteron,
197 * but I suspect the numbers are actually somewhat worse -AK]
198 */
199
200#define MASTER 0
201#define SLAVE (SMP_CACHE_BYTES/8)
202
203/* Intentionally don't use cpu_relax() while TSC synchronization
204 because we don't want to go into funky power save modi or cause
205 hypervisors to schedule us away. Going to sleep would likely affect
206 latency and low latency is the primary objective here. -AK */
207#define no_cpu_relax() barrier()
208
209static __cpuinitdata DEFINE_SPINLOCK(tsc_sync_lock);
210static volatile __cpuinitdata unsigned long go[SLAVE + 1];
211static int notscsync __cpuinitdata;
212
213#undef DEBUG_TSC_SYNC
214
215#define NUM_ROUNDS 64 /* magic value */
216#define NUM_ITERS 5 /* likewise */
217
218/* Callback on boot CPU */
219static __cpuinit void sync_master(void *arg)
220{
221 unsigned long flags, i;
222
223 go[MASTER] = 0;
224
225 local_irq_save(flags);
226 {
227 for (i = 0; i < NUM_ROUNDS*NUM_ITERS; ++i) {
228 while (!go[MASTER])
229 no_cpu_relax();
230 go[MASTER] = 0;
231 rdtscll(go[SLAVE]);
232 }
233 }
234 local_irq_restore(flags);
235}
236
237/*
238 * Return the number of cycles by which our tsc differs from the tsc
239 * on the master (time-keeper) CPU. A positive number indicates our
240 * tsc is ahead of the master, negative that it is behind.
241 */
242static inline long
243get_delta(long *rt, long *master)
244{
245 unsigned long best_t0 = 0, best_t1 = ~0UL, best_tm = 0;
246 unsigned long tcenter, t0, t1, tm;
247 int i;
248
249 for (i = 0; i < NUM_ITERS; ++i) {
250 rdtscll(t0);
251 go[MASTER] = 1;
252 while (!(tm = go[SLAVE]))
253 no_cpu_relax();
254 go[SLAVE] = 0;
255 rdtscll(t1);
256
257 if (t1 - t0 < best_t1 - best_t0)
258 best_t0 = t0, best_t1 = t1, best_tm = tm;
259 }
260
261 *rt = best_t1 - best_t0;
262 *master = best_tm - best_t0;
263
264 /* average best_t0 and best_t1 without overflow: */
265 tcenter = (best_t0/2 + best_t1/2);
266 if (best_t0 % 2 + best_t1 % 2 == 2)
267 ++tcenter;
268 return tcenter - best_tm;
269}
270
271static __cpuinit void sync_tsc(unsigned int master)
272{
273 int i, done = 0;
274 long delta, adj, adjust_latency = 0;
275 unsigned long flags, rt, master_time_stamp, bound;
276#ifdef DEBUG_TSC_SYNC
277 static struct syncdebug {
278 long rt; /* roundtrip time */
279 long master; /* master's timestamp */
280 long diff; /* difference between midpoint and master's timestamp */
281 long lat; /* estimate of tsc adjustment latency */
282 } t[NUM_ROUNDS] __cpuinitdata;
283#endif
284
285 printk(KERN_INFO "CPU %d: Syncing TSC to CPU %u.\n",
286 smp_processor_id(), master);
287
288 go[MASTER] = 1;
289
290 /* It is dangerous to broadcast IPI as cpus are coming up,
291 * as they may not be ready to accept them. So since
292 * we only need to send the ipi to the boot cpu direct
293 * the message, and avoid the race.
294 */
295 smp_call_function_single(master, sync_master, NULL, 1, 0);
296
297 while (go[MASTER]) /* wait for master to be ready */
298 no_cpu_relax();
299
300 spin_lock_irqsave(&tsc_sync_lock, flags);
301 {
302 for (i = 0; i < NUM_ROUNDS; ++i) {
303 delta = get_delta(&rt, &master_time_stamp);
304 if (delta == 0) {
305 done = 1; /* let's lock on to this... */
306 bound = rt;
307 }
308
309 if (!done) {
310 unsigned long t;
311 if (i > 0) {
312 adjust_latency += -delta;
313 adj = -delta + adjust_latency/4;
314 } else
315 adj = -delta;
316
317 rdtscll(t);
318 wrmsrl(MSR_IA32_TSC, t + adj);
319 }
320#ifdef DEBUG_TSC_SYNC
321 t[i].rt = rt;
322 t[i].master = master_time_stamp;
323 t[i].diff = delta;
324 t[i].lat = adjust_latency/4;
325#endif
326 }
327 }
328 spin_unlock_irqrestore(&tsc_sync_lock, flags);
329
330#ifdef DEBUG_TSC_SYNC
331 for (i = 0; i < NUM_ROUNDS; ++i)
332 printk("rt=%5ld master=%5ld diff=%5ld adjlat=%5ld\n",
333 t[i].rt, t[i].master, t[i].diff, t[i].lat);
334#endif
335
336 printk(KERN_INFO
337 "CPU %d: synchronized TSC with CPU %u (last diff %ld cycles, "
338 "maxerr %lu cycles)\n",
339 smp_processor_id(), master, delta, rt);
340}
341
342static void __cpuinit tsc_sync_wait(void)
343{
344 /*
345 * When the CPU has synchronized TSCs assume the BIOS
346 * or the hardware already synced. Otherwise we could
347 * mess up a possible perfect synchronization with a
348 * not-quite-perfect algorithm.
349 */
350 if (notscsync || !cpu_has_tsc || !unsynchronized_tsc())
351 return;
352 sync_tsc(0);
353}
354
355static __init int notscsync_setup(char *s)
356{
357 notscsync = 1;
358 return 1;
359}
360__setup("notscsync", notscsync_setup);
361
362static atomic_t init_deasserted __cpuinitdata; 151static atomic_t init_deasserted __cpuinitdata;
363 152
364/* 153/*
@@ -546,6 +335,11 @@ void __cpuinit start_secondary(void)
546 /* otherwise gcc will move up the smp_processor_id before the cpu_init */ 335 /* otherwise gcc will move up the smp_processor_id before the cpu_init */
547 barrier(); 336 barrier();
548 337
338 /*
339 * Check TSC sync first:
340 */
341 check_tsc_sync_target();
342
549 Dprintk("cpu %d: setting up apic clock\n", smp_processor_id()); 343 Dprintk("cpu %d: setting up apic clock\n", smp_processor_id());
550 setup_secondary_APIC_clock(); 344 setup_secondary_APIC_clock();
551 345
@@ -565,14 +359,6 @@ void __cpuinit start_secondary(void)
565 */ 359 */
566 set_cpu_sibling_map(smp_processor_id()); 360 set_cpu_sibling_map(smp_processor_id());
567 361
568 /*
569 * Wait for TSC sync to not schedule things before.
570 * We still process interrupts, which could see an inconsistent
571 * time in that window unfortunately.
572 * Do this here because TSC sync has global unprotected state.
573 */
574 tsc_sync_wait();
575
576 /* 362 /*
577 * We need to hold call_lock, so there is no inconsistency 363 * We need to hold call_lock, so there is no inconsistency
578 * between the time smp_call_function() determines number of 364 * between the time smp_call_function() determines number of
@@ -592,6 +378,7 @@ void __cpuinit start_secondary(void)
592 cpu_set(smp_processor_id(), cpu_online_map); 378 cpu_set(smp_processor_id(), cpu_online_map);
593 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; 379 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
594 spin_unlock(&vector_lock); 380 spin_unlock(&vector_lock);
381
595 unlock_ipi_call_lock(); 382 unlock_ipi_call_lock();
596 383
597 cpu_idle(); 384 cpu_idle();
@@ -1168,6 +955,11 @@ int __cpuinit __cpu_up(unsigned int cpu)
1168 /* Unleash the CPU! */ 955 /* Unleash the CPU! */
1169 Dprintk("waiting for cpu %d\n", cpu); 956 Dprintk("waiting for cpu %d\n", cpu);
1170 957
958 /*
959 * Make sure and check TSC sync:
960 */
961 check_tsc_sync_source(cpu);
962
1171 while (!cpu_isset(cpu, cpu_online_map)) 963 while (!cpu_isset(cpu, cpu_online_map))
1172 cpu_relax(); 964 cpu_relax();
1173 965