Diffstat (limited to 'arch/x86_64/kernel/smpboot.c')
-rw-r--r--  arch/x86_64/kernel/smpboot.c | 262
1 file changed, 185 insertions(+), 77 deletions(-)
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c
index 73f7e8b9543a..5abdee1e16a5 100644
--- a/arch/x86_64/kernel/smpboot.c
+++ b/arch/x86_64/kernel/smpboot.c
@@ -56,6 +56,7 @@
 #include <asm/kdebug.h>
 #include <asm/tlbflush.h>
 #include <asm/proto.h>
+#include <asm/nmi.h>
 
 /* Change for real CPU hotplug. Note other files need to be fixed
    first too. */
@@ -125,96 +126,210 @@ static void __cpuinit smp_store_cpu_info(int id)
 
         *c = boot_cpu_data;
         identify_cpu(c);
+        print_cpu_info(c);
 }
 
 /*
- * Synchronize TSCs of CPUs
+ * New Funky TSC sync algorithm borrowed from IA64.
+ * The main advantage is that it doesn't reset the TSCs fully and
+ * in general looks more robust and works better than my earlier
+ * attempts. I believe it was written by David Mosberger. Some minor
+ * adjustments for x86-64 by me -AK
  *
- * This new algorithm is less accurate than the old "zero TSCs"
- * one, but we cannot zero TSCs anymore in the new hotplug CPU
- * model.
+ * Original comment reproduced below.
+ *
+ * Synchronize TSC of the current (slave) CPU with the TSC of the
+ * MASTER CPU (normally the time-keeper CPU). We use a closed loop to
+ * eliminate the possibility of unaccounted-for errors (such as
+ * getting a machine check in the middle of a calibration step). The
+ * basic idea is for the slave to ask the master what itc value it has
+ * and to read its own itc before and after the master responds. Each
+ * iteration gives us three timestamps:
+ *
+ *        slave            master
+ *
+ *        t0 ---\
+ *                ---\
+ *                    --->
+ *                          tm
+ *                    /---
+ *                /---
+ *        t1 <---
+ *
+ *
+ * The goal is to adjust the slave's TSC such that tm falls exactly
+ * half-way between t0 and t1. If we achieve this, the clocks are
+ * synchronized provided the interconnect between the slave and the
+ * master is symmetric. Even if the interconnect were asymmetric, we
+ * would still know that the synchronization error is smaller than the
+ * roundtrip latency (t1 - t0).
+ *
+ * When the interconnect is quiet and symmetric, this lets us
+ * synchronize the TSC to within one or two cycles. However, we can
+ * only *guarantee* that the synchronization is accurate to within a
+ * round-trip time, which is typically in the range of several hundred
+ * cycles (e.g., ~500 cycles). In practice, this means that the TSCs
+ * are usually almost perfectly synchronized, but we shouldn't assume
+ * that the accuracy is much better than half a microsecond or so.
+ *
+ * [there are other errors like the latency of RDTSC and of the
+ * WRMSR. These can also amount to hundreds of cycles. So it's
+ * probably worse. It claims 153 cycles error on a dual Opteron,
+ * but I suspect the numbers are actually somewhat worse -AK]
  */
 
-static atomic_t __cpuinitdata tsc_flag;
+#define MASTER  0
+#define SLAVE   (SMP_CACHE_BYTES/8)
+
+/* Intentionally don't use cpu_relax() during TSC synchronization
+   because we don't want to go into funky power save modes or cause
+   hypervisors to schedule us away. Going to sleep would likely affect
+   latency, and low latency is the primary objective here. -AK */
+#define no_cpu_relax()  barrier()
+
 static __cpuinitdata DEFINE_SPINLOCK(tsc_sync_lock);
-static unsigned long long __cpuinitdata bp_tsc, ap_tsc;
+static volatile __cpuinitdata unsigned long go[SLAVE + 1];
+static int notscsync __cpuinitdata;
+
+#undef DEBUG_TSC_SYNC
 
-#define NR_LOOPS 5
+#define NUM_ROUNDS      64      /* magic value */
+#define NUM_ITERS       5       /* likewise */
 
-static void __cpuinit sync_tsc_bp_init(int init)
+/* Callback on boot CPU */
+static __cpuinit void sync_master(void *arg)
 {
-        if (init)
-                _raw_spin_lock(&tsc_sync_lock);
-        else
-                _raw_spin_unlock(&tsc_sync_lock);
-        atomic_set(&tsc_flag, 0);
+        unsigned long flags, i;
+
+        if (smp_processor_id() != boot_cpu_id)
+                return;
+
+        go[MASTER] = 0;
+
+        local_irq_save(flags);
+        {
+                for (i = 0; i < NUM_ROUNDS*NUM_ITERS; ++i) {
+                        while (!go[MASTER])
+                                no_cpu_relax();
+                        go[MASTER] = 0;
+                        rdtscll(go[SLAVE]);
+                }
+        }
+        local_irq_restore(flags);
 }
 
 /*
- * Synchronize TSC on AP with BP.
+ * Return the number of cycles by which our tsc differs from the tsc
+ * on the master (time-keeper) CPU. A positive number indicates our
+ * tsc is ahead of the master, negative that it is behind.
  */
-static void __cpuinit __sync_tsc_ap(void)
+static inline long
+get_delta(long *rt, long *master)
 {
-        if (!cpu_has_tsc)
-                return;
-        Dprintk("AP %d syncing TSC\n", smp_processor_id());
-
-        while (atomic_read(&tsc_flag) != 0)
-                cpu_relax();
-        atomic_inc(&tsc_flag);
-        mb();
-        _raw_spin_lock(&tsc_sync_lock);
-        wrmsrl(MSR_IA32_TSC, bp_tsc);
-        _raw_spin_unlock(&tsc_sync_lock);
-        rdtscll(ap_tsc);
-        mb();
-        atomic_inc(&tsc_flag);
-        mb();
+        unsigned long best_t0 = 0, best_t1 = ~0UL, best_tm = 0;
+        unsigned long tcenter, t0, t1, tm;
+        int i;
+
+        for (i = 0; i < NUM_ITERS; ++i) {
+                rdtscll(t0);
+                go[MASTER] = 1;
+                while (!(tm = go[SLAVE]))
+                        no_cpu_relax();
+                go[SLAVE] = 0;
+                rdtscll(t1);
+
+                if (t1 - t0 < best_t1 - best_t0)
+                        best_t0 = t0, best_t1 = t1, best_tm = tm;
+        }
+
+        *rt = best_t1 - best_t0;
+        *master = best_tm - best_t0;
+
+        /* average best_t0 and best_t1 without overflow: */
+        tcenter = (best_t0/2 + best_t1/2);
+        if (best_t0 % 2 + best_t1 % 2 == 2)
+                ++tcenter;
+        return tcenter - best_tm;
 }
 
-static void __cpuinit sync_tsc_ap(void)
+static __cpuinit void sync_tsc(void)
 {
-        int i;
-        for (i = 0; i < NR_LOOPS; i++)
-                __sync_tsc_ap();
+        int i, done = 0;
+        long delta, adj, adjust_latency = 0;
+        unsigned long flags, rt, master_time_stamp, bound;
+#if DEBUG_TSC_SYNC
+        static struct syncdebug {
+                long rt;        /* roundtrip time */
+                long master;    /* master's timestamp */
+                long diff;      /* difference between midpoint and master's timestamp */
+                long lat;       /* estimate of tsc adjustment latency */
+        } t[NUM_ROUNDS] __cpuinitdata;
+#endif
+
+        go[MASTER] = 1;
+
+        smp_call_function(sync_master, NULL, 1, 0);
+
+        while (go[MASTER])      /* wait for master to be ready */
+                no_cpu_relax();
+
+        spin_lock_irqsave(&tsc_sync_lock, flags);
+        {
+                for (i = 0; i < NUM_ROUNDS; ++i) {
+                        delta = get_delta(&rt, &master_time_stamp);
+                        if (delta == 0) {
+                                done = 1;       /* let's lock on to this... */
+                                bound = rt;
+                        }
+
+                        if (!done) {
+                                unsigned long t;
+                                if (i > 0) {
+                                        adjust_latency += -delta;
+                                        adj = -delta + adjust_latency/4;
+                                } else
+                                        adj = -delta;
+
+                                rdtscll(t);
+                                wrmsrl(MSR_IA32_TSC, t + adj);
+                        }
+#if DEBUG_TSC_SYNC
+                        t[i].rt = rt;
+                        t[i].master = master_time_stamp;
+                        t[i].diff = delta;
+                        t[i].lat = adjust_latency/4;
+#endif
+                }
+        }
+        spin_unlock_irqrestore(&tsc_sync_lock, flags);
+
+#if DEBUG_TSC_SYNC
+        for (i = 0; i < NUM_ROUNDS; ++i)
+                printk("rt=%5ld master=%5ld diff=%5ld adjlat=%5ld\n",
+                       t[i].rt, t[i].master, t[i].diff, t[i].lat);
+#endif
+
+        printk(KERN_INFO
+               "CPU %d: synchronized TSC with CPU %u (last diff %ld cycles, "
+               "maxerr %lu cycles)\n",
+               smp_processor_id(), boot_cpu_id, delta, rt);
 }
 
-/*
- * Synchronize TSC from BP to AP.
- */
-static void __cpuinit __sync_tsc_bp(int cpu)
+static void __cpuinit tsc_sync_wait(void)
 {
-        if (!cpu_has_tsc)
+        if (notscsync || !cpu_has_tsc)
                 return;
-
-        /* Wait for AP */
-        while (atomic_read(&tsc_flag) == 0)
-                cpu_relax();
-        /* Save BPs TSC */
-        sync_core();
-        rdtscll(bp_tsc);
-        /* Don't do the sync core here to avoid too much latency. */
-        mb();
-        /* Start the AP */
-        _raw_spin_unlock(&tsc_sync_lock);
-        /* Wait for AP again */
-        while (atomic_read(&tsc_flag) < 2)
-                cpu_relax();
-        rdtscl(bp_tsc);
-        barrier();
+        printk(KERN_INFO "CPU %d: Syncing TSC to CPU %u.\n", smp_processor_id(),
+               boot_cpu_id);
+        sync_tsc();
 }
 
-static void __cpuinit sync_tsc_bp(int cpu)
+static __init int notscsync_setup(char *s)
 {
-        int i;
-        for (i = 0; i < NR_LOOPS - 1; i++) {
-                __sync_tsc_bp(cpu);
-                sync_tsc_bp_init(1);
-        }
-        __sync_tsc_bp(cpu);
-        printk(KERN_INFO "Synced TSC of CPU %d difference %Ld\n",
-               cpu, ap_tsc - bp_tsc);
+        notscsync = 1;
+        return 0;
 }
+__setup("notscsync", notscsync_setup);
 
 static atomic_t init_deasserted __cpuinitdata;
 
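A note on the midpoint arithmetic in get_delta() above: averaging the two unsigned timestamps naively as (best_t0 + best_t1)/2 could overflow, so the code halves each term first and re-adds the carry when both halves rounded down. Below is a minimal standalone userspace sketch of the same computation; the midpoint() helper and the sample values are local to this example, not part of the patch.

#include <stdio.h>

/* Overflow-free midpoint of two unsigned timestamps, computed the
 * same way get_delta() computes tcenter. */
static unsigned long midpoint(unsigned long t0, unsigned long t1)
{
        unsigned long mid = t0/2 + t1/2;
        if (t0 % 2 + t1 % 2 == 2)       /* both divisions rounded down */
                ++mid;
        return mid;
}

int main(void)
{
        /* Timestamps near ~0UL would overflow a naive (t0 + t1)/2. */
        unsigned long t0 = ~0UL - 10, t1 = ~0UL - 2, tm = ~0UL - 5;

        /* Same sign convention as get_delta(): positive means the
         * slave's midpoint is ahead of the master's timestamp tm.
         * With these values it prints -1, i.e. one cycle behind. */
        long delta = (long)(midpoint(t0, t1) - tm);
        printf("delta = %ld cycles\n", delta);
        return 0;
}

With these near-wraparound inputs the naive sum would wrap, while the halved form yields the exact midpoint, and the final subtraction carries get_delta()'s sign convention through unsigned arithmetic.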
@@ -315,11 +430,6 @@ void __cpuinit start_secondary(void)
         cpu_init();
         smp_callin();
 
-        /*
-         * Synchronize the TSC with the BP
-         */
-        sync_tsc_ap();
-
         /* otherwise gcc will move up the smp_processor_id before the cpu_init */
         barrier();
 
@@ -334,7 +444,6 @@ void __cpuinit start_secondary(void)
                 enable_8259A_irq(0);
         }
 
-
         enable_APIC_timer();
 
         /*
@@ -343,6 +452,11 @@ void __cpuinit start_secondary(void)
         cpu_set(smp_processor_id(), cpu_online_map);
         mb();
 
+        /* Do the TSC sync here so that nothing is scheduled before it.
+           We still process interrupts in that window, and they could
+           unfortunately observe an inconsistent time. */
+        tsc_sync_wait();
+
         cpu_idle();
 }
 
@@ -531,7 +645,6 @@ static int __cpuinit do_boot_cpu(int cpu, int apicid)
                 printk("failed fork for CPU %d\n", cpu);
                 return PTR_ERR(idle);
         }
-        x86_cpu_to_apicid[cpu] = apicid;
 
         cpu_pda[cpu].pcurrent = idle;
 
@@ -600,8 +713,6 @@ static int __cpuinit do_boot_cpu(int cpu, int apicid)
 
         if (cpu_isset(cpu, cpu_callin_map)) {
                 /* number CPUs logically, starting from 1 (BSP is 0) */
-                Dprintk("OK.\n");
-                print_cpu_info(&cpu_data[cpu]);
                 Dprintk("CPU has booted.\n");
         } else {
                 boot_error = 1;
@@ -842,7 +953,6 @@ void __cpuinit smp_prepare_cpus(unsigned int max_cpus)
                         GET_APIC_ID(apic_read(APIC_ID)), boot_cpu_id);
                 /* Or can we switch back to PIC here? */
         }
-        x86_cpu_to_apicid[0] = boot_cpu_id;
 
         /*
          * Now start the IO-APICs
@@ -889,18 +999,14 @@ int __cpuinit __cpu_up(unsigned int cpu)
                 printk("__cpu_up: bad cpu %d\n", cpu);
                 return -EINVAL;
         }
-        sync_tsc_bp_init(1);
 
         /* Boot it! */
         err = do_boot_cpu(cpu, apicid);
         if (err < 0) {
-                sync_tsc_bp_init(0);
                 Dprintk("do_boot_cpu failed %d\n", err);
                 return err;
         }
 
-        sync_tsc_bp(cpu);
-
         /* Unleash the CPU! */
         Dprintk("waiting for cpu %d\n", cpu);
 
@@ -923,4 +1029,6 @@ void __cpuinit smp_cpus_done(unsigned int max_cpus)
 
         detect_siblings();
         time_init_gtod();
+
+        check_nmi_watchdog();
 }
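On the adjustment rule in sync_tsc() above: each round applies adj = -delta plus adjust_latency/4, where adjust_latency accumulates the residual deltas and thereby learns the roughly constant cost of the TSC write itself. The standalone toy model below, assuming an idealized constant write latency LAT (a made-up figure, not a measurement), illustrates the damping; it is a sketch of the feedback idea, not the kernel code path.

#include <stdio.h>

#define NUM_ROUNDS      64
#define LAT             40      /* assumed, idealized write latency in cycles */

int main(void)
{
        long off = 1000;        /* toy slave-minus-master offset in cycles */
        long adjust_latency = 0;
        int i;

        for (i = 0; i < NUM_ROUNDS; ++i) {
                long delta = off;       /* what a perfect get_delta() would report */
                long adj;

                /* same update rule as the patch's sync_tsc() loop */
                if (i > 0) {
                        adjust_latency += -delta;
                        adj = -delta + adjust_latency/4;
                } else
                        adj = -delta;

                off += adj - LAT;       /* the TSC write itself lands LAT cycles late */
                printf("round %2d: delta = %5ld  adj = %5ld\n", i, delta, adj);
        }
        return 0;
}

In this model adjust_latency climbs geometrically toward 4*LAT, so the reported delta decays to zero; without the /4 term the loop would instead settle at a constant delta of -LAT, which is what the damping is there to remove.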