author		Andi Kleen <ak@suse.de>			2005-05-17 00:53:25 -0400
committer	Linus Torvalds <torvalds@ppc970.osdl.org>	2005-05-17 10:59:14 -0400
commit		dda50e716dc9451f40eebfb2902c260e4f62cf34 (patch)
tree		0012e8cd5d702f8ae8aa7bc3518706054b764806 /arch/x86_64/kernel/smpboot.c
parent		93ef70a217637ade3f335303a112b22a134a1ec2 (diff)
[PATCH] x86_64: Update TSC sync algorithm
The new TSC sync algorithm recently submitted did not work too well. The result was that some MP machines, where the TSC came out of the BIOS badly unsynchronized and which did not have HPET support, were nearly unusable because time would jump forwards and backwards between CPUs.

After a lot of research ;-) and some more prototypes I ended up just using the algorithm from IA64, which looks best. It has some internal self tuning that should adapt to changing interconnect latencies. It holds up in my tests so far. I believe it was originally written by David Mosberger; I just ported it over to x86-64. See the inline comment for a description.

This also cleans up the code, because it uses smp_call_function for syncing instead of custom hooks in SMP bootup.

Please note that the cycle numbers it outputs are too optimistic, because they do not take into account the latency of WRMSR and RDTSC, which can be hundreds of cycles. It seems to be able to sync a dual Opteron to 200-300 cycles, which is probably good enough.

There is a timing window during AP bootup where interrupts can see inconsistent time before the TSC is synced. It is hard to avoid, unfortunately, because we can only do the TSC sync after some setup, and we need to enable interrupts before that. I just ignored it for now.

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
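[Editor's sketch: in outline, each round the slave (AP) takes timestamps t0 and t1 around the master's reply tm and nudges its own TSC so that tm lands on the midpoint of [t0, t1]. The following stand-alone C sketch only illustrates that arithmetic; the helper name and sample values are made up and are not part of the patch, which does the real work with rdtscll()/wrmsrl() in get_delta()/sync_tsc() below.]

/* Illustrative sketch of the midpoint adjustment used by the sync loop. */
#include <stdio.h>

/* Overflow-safe midpoint of t0 and t1, in the style of the patch's get_delta(). */
static unsigned long long midpoint(unsigned long long t0, unsigned long long t1)
{
	unsigned long long mid = t0 / 2 + t1 / 2;
	if (t0 % 2 + t1 % 2 == 2)	/* both odd: each half dropped 0.5 */
		++mid;
	return mid;
}

int main(void)
{
	/* One round trip: slave reads t0, master replies with tm, slave reads t1.
	   Values are invented for illustration only. */
	unsigned long long t0 = 1000, tm = 1600, t1 = 1200;

	/* Positive delta means the slave's clock is ahead of the master's. */
	long long delta = (long long)(midpoint(t0, t1) - tm);

	/* The kernel code would now read the TSC and write back (tsc - delta);
	   here we just print the correction that would be applied. */
	printf("midpoint=%llu delta=%lld -> adjust slave TSC by %lld\n",
	       midpoint(t0, t1), delta, -delta);
	return 0;
}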
Diffstat (limited to 'arch/x86_64/kernel/smpboot.c')
-rw-r--r--	arch/x86_64/kernel/smpboot.c	| 257
1 file changed, 182 insertions(+), 75 deletions(-)
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c
index 73f7e8b9543a..17efc40baa32 100644
--- a/arch/x86_64/kernel/smpboot.c
+++ b/arch/x86_64/kernel/smpboot.c
@@ -125,96 +125,210 @@ static void __cpuinit smp_store_cpu_info(int id)
 
 	*c = boot_cpu_data;
 	identify_cpu(c);
+	print_cpu_info(c);
 }
 
 /*
- * Synchronize TSCs of CPUs
+ * New Funky TSC sync algorithm borrowed from IA64.
+ * Main advantage is that it doesn't reset the TSCs fully and
+ * in general looks more robust and it works better than my earlier
+ * attempts. I believe it was written by David Mosberger. Some minor
+ * adjustments for x86-64 by me -AK
  *
- * This new algorithm is less accurate than the old "zero TSCs"
- * one, but we cannot zero TSCs anymore in the new hotplug CPU
- * model.
+ * Original comment reproduced below.
+ *
+ * Synchronize TSC of the current (slave) CPU with the TSC of the
+ * MASTER CPU (normally the time-keeper CPU). We use a closed loop to
+ * eliminate the possibility of unaccounted-for errors (such as
+ * getting a machine check in the middle of a calibration step). The
+ * basic idea is for the slave to ask the master what itc value it has
+ * and to read its own itc before and after the master responds. Each
+ * iteration gives us three timestamps:
+ *
+ *	slave		master
+ *
+ *	t0 ---\
+ *	       ---\
+ *		   --->
+ *			tm
+ *		   /---
+ *	       /---
+ *	t1 <---
+ *
+ *
+ * The goal is to adjust the slave's TSC such that tm falls exactly
+ * half-way between t0 and t1. If we achieve this, the clocks are
+ * synchronized provided the interconnect between the slave and the
+ * master is symmetric. Even if the interconnect were asymmetric, we
+ * would still know that the synchronization error is smaller than the
+ * roundtrip latency (t0 - t1).
+ *
+ * When the interconnect is quiet and symmetric, this lets us
+ * synchronize the TSC to within one or two cycles. However, we can
+ * only *guarantee* that the synchronization is accurate to within a
+ * round-trip time, which is typically in the range of several hundred
+ * cycles (e.g., ~500 cycles). In practice, this means that the TSCs
+ * are usually almost perfectly synchronized, but we shouldn't assume
+ * that the accuracy is much better than half a micro second or so.
+ *
+ * [there are other errors like the latency of RDTSC and of the
+ * WRMSR. These can also account to hundreds of cycles. So it's
+ * probably worse. It claims 153 cycles error on a dual Opteron,
+ * but I suspect the numbers are actually somewhat worse -AK]
  */
 
-static atomic_t __cpuinitdata tsc_flag;
+#define MASTER	0
+#define SLAVE	(SMP_CACHE_BYTES/8)
+
+/* Intentionally don't use cpu_relax() while TSC synchronization
+   because we don't want to go into funky power save modi or cause
+   hypervisors to schedule us away.  Going to sleep would likely affect
+   latency and low latency is the primary objective here. -AK */
+#define no_cpu_relax() barrier()
+
 static __cpuinitdata DEFINE_SPINLOCK(tsc_sync_lock);
-static unsigned long long __cpuinitdata bp_tsc, ap_tsc;
+static volatile __cpuinitdata unsigned long go[SLAVE + 1];
+static int notscsync __cpuinitdata;
+
+#undef DEBUG_TSC_SYNC
 
-#define NR_LOOPS 5
+#define NUM_ROUNDS	64	/* magic value */
+#define NUM_ITERS	5	/* likewise */
 
-static void __cpuinit sync_tsc_bp_init(int init)
+/* Callback on boot CPU */
+static __cpuinit void sync_master(void *arg)
 {
-	if (init)
-		_raw_spin_lock(&tsc_sync_lock);
-	else
-		_raw_spin_unlock(&tsc_sync_lock);
-	atomic_set(&tsc_flag, 0);
+	unsigned long flags, i;
+
+	if (smp_processor_id() != boot_cpu_id)
+		return;
+
+	go[MASTER] = 0;
+
+	local_irq_save(flags);
+	{
+		for (i = 0; i < NUM_ROUNDS*NUM_ITERS; ++i) {
+			while (!go[MASTER])
+				no_cpu_relax();
+			go[MASTER] = 0;
+			rdtscll(go[SLAVE]);
+		}
+	}
+	local_irq_restore(flags);
 }
 
 /*
- * Synchronize TSC on AP with BP.
+ * Return the number of cycles by which our tsc differs from the tsc
+ * on the master (time-keeper) CPU. A positive number indicates our
+ * tsc is ahead of the master, negative that it is behind.
  */
-static void __cpuinit __sync_tsc_ap(void)
+static inline long
+get_delta(long *rt, long *master)
 {
-	if (!cpu_has_tsc)
-		return;
-	Dprintk("AP %d syncing TSC\n", smp_processor_id());
-
-	while (atomic_read(&tsc_flag) != 0)
-		cpu_relax();
-	atomic_inc(&tsc_flag);
-	mb();
-	_raw_spin_lock(&tsc_sync_lock);
-	wrmsrl(MSR_IA32_TSC, bp_tsc);
-	_raw_spin_unlock(&tsc_sync_lock);
-	rdtscll(ap_tsc);
-	mb();
-	atomic_inc(&tsc_flag);
-	mb();
+	unsigned long best_t0 = 0, best_t1 = ~0UL, best_tm = 0;
+	unsigned long tcenter, t0, t1, tm;
+	int i;
+
+	for (i = 0; i < NUM_ITERS; ++i) {
+		rdtscll(t0);
+		go[MASTER] = 1;
+		while (!(tm = go[SLAVE]))
+			no_cpu_relax();
+		go[SLAVE] = 0;
+		rdtscll(t1);
+
+		if (t1 - t0 < best_t1 - best_t0)
+			best_t0 = t0, best_t1 = t1, best_tm = tm;
+	}
+
+	*rt = best_t1 - best_t0;
+	*master = best_tm - best_t0;
+
+	/* average best_t0 and best_t1 without overflow: */
+	tcenter = (best_t0/2 + best_t1/2);
+	if (best_t0 % 2 + best_t1 % 2 == 2)
+		++tcenter;
+	return tcenter - best_tm;
 }
 
-static void __cpuinit sync_tsc_ap(void)
+static __cpuinit void sync_tsc(void)
 {
-	int i;
-	for (i = 0; i < NR_LOOPS; i++)
-		__sync_tsc_ap();
+	int i, done = 0;
+	long delta, adj, adjust_latency = 0;
+	unsigned long flags, rt, master_time_stamp, bound;
+#if DEBUG_TSC_SYNC
+	static struct syncdebug {
+		long rt;	/* roundtrip time */
+		long master;	/* master's timestamp */
+		long diff;	/* difference between midpoint and master's timestamp */
+		long lat;	/* estimate of tsc adjustment latency */
+	} t[NUM_ROUNDS] __cpuinitdata;
+#endif
+
+	go[MASTER] = 1;
+
+	smp_call_function(sync_master, NULL, 1, 0);
+
+	while (go[MASTER])	/* wait for master to be ready */
+		no_cpu_relax();
+
+	spin_lock_irqsave(&tsc_sync_lock, flags);
+	{
+		for (i = 0; i < NUM_ROUNDS; ++i) {
+			delta = get_delta(&rt, &master_time_stamp);
+			if (delta == 0) {
+				done = 1;	/* let's lock on to this... */
+				bound = rt;
+			}
+
+			if (!done) {
+				unsigned long t;
+				if (i > 0) {
+					adjust_latency += -delta;
+					adj = -delta + adjust_latency/4;
+				} else
+					adj = -delta;
+
+				rdtscll(t);
+				wrmsrl(MSR_IA32_TSC, t + adj);
+			}
+#if DEBUG_TSC_SYNC
+			t[i].rt = rt;
+			t[i].master = master_time_stamp;
+			t[i].diff = delta;
+			t[i].lat = adjust_latency/4;
+#endif
+		}
+	}
+	spin_unlock_irqrestore(&tsc_sync_lock, flags);
+
+#if DEBUG_TSC_SYNC
+	for (i = 0; i < NUM_ROUNDS; ++i)
+		printk("rt=%5ld master=%5ld diff=%5ld adjlat=%5ld\n",
+		       t[i].rt, t[i].master, t[i].diff, t[i].lat);
+#endif
+
+	printk(KERN_INFO
+	       "CPU %d: synchronized TSC with CPU %u (last diff %ld cycles, "
+	       "maxerr %lu cycles)\n",
+	       smp_processor_id(), boot_cpu_id, delta, rt);
 }
 
-/*
- * Synchronize TSC from BP to AP.
- */
-static void __cpuinit __sync_tsc_bp(int cpu)
+static void __cpuinit tsc_sync_wait(void)
 {
-	if (!cpu_has_tsc)
+	if (notscsync || !cpu_has_tsc)
 		return;
-
-	/* Wait for AP */
-	while (atomic_read(&tsc_flag) == 0)
-		cpu_relax();
-	/* Save BPs TSC */
-	sync_core();
-	rdtscll(bp_tsc);
-	/* Don't do the sync core here to avoid too much latency. */
-	mb();
-	/* Start the AP */
-	_raw_spin_unlock(&tsc_sync_lock);
-	/* Wait for AP again */
-	while (atomic_read(&tsc_flag) < 2)
-		cpu_relax();
-	rdtscl(bp_tsc);
-	barrier();
+	printk(KERN_INFO "CPU %d: Syncing TSC to CPU %u.\n", smp_processor_id(),
+	       boot_cpu_id);
+	sync_tsc();
 }
 
-static void __cpuinit sync_tsc_bp(int cpu)
+static __init int notscsync_setup(char *s)
 {
-	int i;
-	for (i = 0; i < NR_LOOPS - 1; i++) {
-		__sync_tsc_bp(cpu);
-		sync_tsc_bp_init(1);
-	}
-	__sync_tsc_bp(cpu);
-	printk(KERN_INFO "Synced TSC of CPU %d difference %Ld\n",
-		cpu, ap_tsc - bp_tsc);
+	notscsync = 1;
+	return 0;
 }
+__setup("notscsync", notscsync_setup);
 
 static atomic_t init_deasserted __cpuinitdata;
 
@@ -315,11 +429,6 @@ void __cpuinit start_secondary(void)
 	cpu_init();
 	smp_callin();
 
-	/*
-	 * Synchronize the TSC with the BP
-	 */
-	sync_tsc_ap();
-
 	/* otherwise gcc will move up the smp_processor_id before the cpu_init */
 	barrier();
 
@@ -334,7 +443,6 @@ void __cpuinit start_secondary(void)
 		enable_8259A_irq(0);
 	}
 
-
 	enable_APIC_timer();
 
 	/*
@@ -343,6 +451,11 @@ void __cpuinit start_secondary(void)
 	cpu_set(smp_processor_id(), cpu_online_map);
 	mb();
 
+	/* Wait for TSC sync to not schedule things before.
+	   We still process interrupts, which could see an inconsistent
+	   time in that window unfortunately. */
+	tsc_sync_wait();
+
 	cpu_idle();
 }
 
@@ -600,8 +713,6 @@ static int __cpuinit do_boot_cpu(int cpu, int apicid)
 
 	if (cpu_isset(cpu, cpu_callin_map)) {
 		/* number CPUs logically, starting from 1 (BSP is 0) */
-		Dprintk("OK.\n");
-		print_cpu_info(&cpu_data[cpu]);
 		Dprintk("CPU has booted.\n");
 	} else {
 		boot_error = 1;
@@ -889,18 +1000,14 @@ int __cpuinit __cpu_up(unsigned int cpu)
 		printk("__cpu_up: bad cpu %d\n", cpu);
 		return -EINVAL;
 	}
-	sync_tsc_bp_init(1);
 
 	/* Boot it! */
 	err = do_boot_cpu(cpu, apicid);
 	if (err < 0) {
-		sync_tsc_bp_init(0);
 		Dprintk("do_boot_cpu failed %d\n", err);
 		return err;
 	}
 
-	sync_tsc_bp(cpu);
-
 	/* Unleash the CPU! */
 	Dprintk("waiting for cpu %d\n", cpu);
 