Diffstat (limited to 'arch/x86_64/kernel/smpboot.c')
-rw-r--r--  arch/x86_64/kernel/smpboot.c  263
1 file changed, 186 insertions(+), 77 deletions(-)
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c
index 73f7e8b9543a..f1ec0f345941 100644
--- a/arch/x86_64/kernel/smpboot.c
+++ b/arch/x86_64/kernel/smpboot.c
@@ -56,6 +56,7 @@
 #include <asm/kdebug.h>
 #include <asm/tlbflush.h>
 #include <asm/proto.h>
+#include <asm/nmi.h>
 
 /* Change for real CPU hotplug. Note other files need to be fixed
    first too. */
@@ -93,6 +94,7 @@ int smp_threads_ready;
 
 cpumask_t cpu_sibling_map[NR_CPUS] __cacheline_aligned;
 cpumask_t cpu_core_map[NR_CPUS] __cacheline_aligned;
+EXPORT_SYMBOL(cpu_core_map);
 
 /*
  * Trampoline 80x86 program as an array.
@@ -125,96 +127,210 @@ static void __cpuinit smp_store_cpu_info(int id)
 
         *c = boot_cpu_data;
         identify_cpu(c);
+        print_cpu_info(c);
 }
 
 /*
- * Synchronize TSCs of CPUs
+ * New Funky TSC sync algorithm borrowed from IA64.
+ * Main advantage is that it doesn't reset the TSCs fully and
+ * in general looks more robust and it works better than my earlier
+ * attempts. I believe it was written by David Mosberger. Some minor
+ * adjustments for x86-64 by me -AK
  *
- * This new algorithm is less accurate than the old "zero TSCs"
- * one, but we cannot zero TSCs anymore in the new hotplug CPU
- * model.
+ * Original comment reproduced below.
+ *
+ * Synchronize TSC of the current (slave) CPU with the TSC of the
+ * MASTER CPU (normally the time-keeper CPU).  We use a closed loop to
+ * eliminate the possibility of unaccounted-for errors (such as
+ * getting a machine check in the middle of a calibration step).  The
+ * basic idea is for the slave to ask the master what itc value it has
+ * and to read its own itc before and after the master responds.  Each
+ * iteration gives us three timestamps:
+ *
+ *      slave           master
+ *
+ *      t0 ---\
+ *             ---\
+ *                 --->
+ *                        tm
+ *                 /---
+ *             /---
+ *      t1 <---
+ *
+ *
+ * The goal is to adjust the slave's TSC such that tm falls exactly
+ * half-way between t0 and t1.  If we achieve this, the clocks are
+ * synchronized provided the interconnect between the slave and the
+ * master is symmetric.  Even if the interconnect were asymmetric, we
+ * would still know that the synchronization error is smaller than the
+ * roundtrip latency (t1 - t0).
+ *
+ * When the interconnect is quiet and symmetric, this lets us
+ * synchronize the TSC to within one or two cycles.  However, we can
+ * only *guarantee* that the synchronization is accurate to within a
+ * round-trip time, which is typically in the range of several hundred
+ * cycles (e.g., ~500 cycles).  In practice, this means that the TSCs
+ * are usually almost perfectly synchronized, but we shouldn't assume
+ * that the accuracy is much better than half a microsecond or so.
+ *
+ * [there are other errors like the latency of RDTSC and of the
+ * WRMSR.  These can also account for hundreds of cycles, so it's
+ * probably worse.  It claims 153 cycles error on a dual Opteron,
+ * but I suspect the numbers are actually somewhat worse -AK]
  */
 
-static atomic_t __cpuinitdata tsc_flag;
+#define MASTER  0
+#define SLAVE   (SMP_CACHE_BYTES/8)
+
+/* Intentionally don't use cpu_relax() during TSC synchronization
+   because we don't want to go into funky power save modes or cause
+   hypervisors to schedule us away.  Going to sleep would likely affect
+   latency, and low latency is the primary objective here. -AK */
+#define no_cpu_relax() barrier()
+
 static __cpuinitdata DEFINE_SPINLOCK(tsc_sync_lock);
-static unsigned long long __cpuinitdata bp_tsc, ap_tsc;
+static volatile __cpuinitdata unsigned long go[SLAVE + 1];
+static int notscsync __cpuinitdata;
+
+#undef DEBUG_TSC_SYNC
 
-#define NR_LOOPS 5
+#define NUM_ROUNDS      64      /* magic value */
+#define NUM_ITERS       5       /* likewise */
 
-static void __cpuinit sync_tsc_bp_init(int init)
+/* Callback on boot CPU */
+static __cpuinit void sync_master(void *arg)
 {
-        if (init)
-                _raw_spin_lock(&tsc_sync_lock);
-        else
-                _raw_spin_unlock(&tsc_sync_lock);
-        atomic_set(&tsc_flag, 0);
+        unsigned long flags, i;
+
+        if (smp_processor_id() != boot_cpu_id)
+                return;
+
+        go[MASTER] = 0;
+
+        local_irq_save(flags);
+        {
+                for (i = 0; i < NUM_ROUNDS*NUM_ITERS; ++i) {
+                        while (!go[MASTER])
+                                no_cpu_relax();
+                        go[MASTER] = 0;
+                        rdtscll(go[SLAVE]);
+                }
+        }
+        local_irq_restore(flags);
 }
 
 /*
- * Synchronize TSC on AP with BP.
+ * Return the number of cycles by which our tsc differs from the tsc
+ * on the master (time-keeper) CPU.  A positive number indicates our
+ * tsc is ahead of the master, negative that it is behind.
  */
-static void __cpuinit __sync_tsc_ap(void)
+static inline long
+get_delta(long *rt, long *master)
 {
-        if (!cpu_has_tsc)
-                return;
-        Dprintk("AP %d syncing TSC\n", smp_processor_id());
-
-        while (atomic_read(&tsc_flag) != 0)
-                cpu_relax();
-        atomic_inc(&tsc_flag);
-        mb();
-        _raw_spin_lock(&tsc_sync_lock);
-        wrmsrl(MSR_IA32_TSC, bp_tsc);
-        _raw_spin_unlock(&tsc_sync_lock);
-        rdtscll(ap_tsc);
-        mb();
-        atomic_inc(&tsc_flag);
-        mb();
+        unsigned long best_t0 = 0, best_t1 = ~0UL, best_tm = 0;
+        unsigned long tcenter, t0, t1, tm;
+        int i;
+
+        for (i = 0; i < NUM_ITERS; ++i) {
+                rdtscll(t0);
+                go[MASTER] = 1;
+                while (!(tm = go[SLAVE]))
+                        no_cpu_relax();
+                go[SLAVE] = 0;
+                rdtscll(t1);
+
+                if (t1 - t0 < best_t1 - best_t0)
+                        best_t0 = t0, best_t1 = t1, best_tm = tm;
+        }
+
+        *rt = best_t1 - best_t0;
+        *master = best_tm - best_t0;
+
+        /* average best_t0 and best_t1 without overflow: */
+        tcenter = (best_t0/2 + best_t1/2);
+        if (best_t0 % 2 + best_t1 % 2 == 2)
+                ++tcenter;
+        return tcenter - best_tm;
 }
 
-static void __cpuinit sync_tsc_ap(void)
+static __cpuinit void sync_tsc(void)
 {
-        int i;
-        for (i = 0; i < NR_LOOPS; i++)
-                __sync_tsc_ap();
+        int i, done = 0;
+        long delta, adj, adjust_latency = 0;
+        unsigned long flags, rt, master_time_stamp, bound;
+#if DEBUG_TSC_SYNC
+        static struct syncdebug {
+                long rt;        /* roundtrip time */
+                long master;    /* master's timestamp */
+                long diff;      /* difference between midpoint and master's timestamp */
+                long lat;       /* estimate of tsc adjustment latency */
+        } t[NUM_ROUNDS] __cpuinitdata;
+#endif
+
+        go[MASTER] = 1;
+
+        smp_call_function(sync_master, NULL, 1, 0);
+
+        while (go[MASTER])      /* wait for master to be ready */
+                no_cpu_relax();
+
+        spin_lock_irqsave(&tsc_sync_lock, flags);
+        {
+                for (i = 0; i < NUM_ROUNDS; ++i) {
+                        delta = get_delta(&rt, &master_time_stamp);
+                        if (delta == 0) {
+                                done = 1;       /* let's lock on to this... */
+                                bound = rt;
+                        }
+
+                        if (!done) {
+                                unsigned long t;
+                                if (i > 0) {
+                                        adjust_latency += -delta;
+                                        adj = -delta + adjust_latency/4;
+                                } else
+                                        adj = -delta;
+
+                                rdtscll(t);
+                                wrmsrl(MSR_IA32_TSC, t + adj);
+                        }
+#if DEBUG_TSC_SYNC
+                        t[i].rt = rt;
+                        t[i].master = master_time_stamp;
+                        t[i].diff = delta;
+                        t[i].lat = adjust_latency/4;
+#endif
+                }
+        }
+        spin_unlock_irqrestore(&tsc_sync_lock, flags);
+
+#if DEBUG_TSC_SYNC
+        for (i = 0; i < NUM_ROUNDS; ++i)
+                printk("rt=%5ld master=%5ld diff=%5ld adjlat=%5ld\n",
+                       t[i].rt, t[i].master, t[i].diff, t[i].lat);
+#endif
+
+        printk(KERN_INFO
+               "CPU %d: synchronized TSC with CPU %u (last diff %ld cycles, "
+               "maxerr %lu cycles)\n",
+               smp_processor_id(), boot_cpu_id, delta, rt);
 }
 
-/*
- * Synchronize TSC from BP to AP.
- */
-static void __cpuinit __sync_tsc_bp(int cpu)
+static void __cpuinit tsc_sync_wait(void)
 {
-        if (!cpu_has_tsc)
+        if (notscsync || !cpu_has_tsc)
                 return;
-
-        /* Wait for AP */
-        while (atomic_read(&tsc_flag) == 0)
-                cpu_relax();
-        /* Save BPs TSC */
-        sync_core();
-        rdtscll(bp_tsc);
-        /* Don't do the sync core here to avoid too much latency. */
-        mb();
-        /* Start the AP */
-        _raw_spin_unlock(&tsc_sync_lock);
-        /* Wait for AP again */
-        while (atomic_read(&tsc_flag) < 2)
-                cpu_relax();
-        rdtscl(bp_tsc);
-        barrier();
+        printk(KERN_INFO "CPU %d: Syncing TSC to CPU %u.\n", smp_processor_id(),
+               boot_cpu_id);
+        sync_tsc();
 }
 
-static void __cpuinit sync_tsc_bp(int cpu)
+static __init int notscsync_setup(char *s)
 {
-        int i;
-        for (i = 0; i < NR_LOOPS - 1; i++) {
-                __sync_tsc_bp(cpu);
-                sync_tsc_bp_init(1);
-        }
-        __sync_tsc_bp(cpu);
-        printk(KERN_INFO "Synced TSC of CPU %d difference %Ld\n",
-               cpu, ap_tsc - bp_tsc);
+        notscsync = 1;
+        return 0;
 }
+__setup("notscsync", notscsync_setup);
 
 static atomic_t init_deasserted __cpuinitdata;
 
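For reference, the overflow-safe midpoint arithmetic at the heart of get_delta() can be exercised in isolation. Below is a minimal stand-alone sketch (illustrative user-space code with invented sample timestamps, not part of the patch):

#include <stdio.h>

/* Stand-alone demo of get_delta()'s midpoint math; the sample values
   are made up for illustration.  tcenter is the average of t0 and t1
   computed without risking unsigned overflow; the result is
   tcenter - tm, the slave's offset from the master. */
int main(void)
{
        unsigned long t0 = 1000, t1 = 1500, tm = 1400;
        unsigned long tcenter;

        tcenter = t0/2 + t1/2;          /* halve first: no overflow */
        if (t0 % 2 + t1 % 2 == 2)       /* both odd: restore the lost 1 */
                ++tcenter;

        /* positive: slave ahead of master; negative: behind */
        printf("delta = %ld cycles\n", (long)(tcenter - tm));
        return 0;
}

With these inputs tcenter is 1250 and the printed delta is -150: the slave reads 150 cycles behind the master, so the first round of sync_tsc() would apply adj = +150 through wrmsrl(MSR_IA32_TSC, t + adj). Booting with notscsync on the kernel command line sets the flag registered by notscsync_setup() and skips the synchronization entirely.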
@@ -315,11 +431,6 @@ void __cpuinit start_secondary(void)
 	cpu_init();
 	smp_callin();
 
-	/*
-	 * Synchronize the TSC with the BP
-	 */
-	sync_tsc_ap();
-
 	/* otherwise gcc will move up the smp_processor_id before the cpu_init */
 	barrier();
 
@@ -334,7 +445,6 @@ void __cpuinit start_secondary(void)
 		enable_8259A_irq(0);
 	}
 
-
 	enable_APIC_timer();
 
 	/*
@@ -343,6 +453,11 @@ void __cpuinit start_secondary(void)
 	cpu_set(smp_processor_id(), cpu_online_map);
 	mb();
 
+	/* Wait for TSC sync here so that nothing is scheduled before the
+	   TSCs are synchronized.  We still process interrupts, which could
+	   observe an inconsistent time in that window, unfortunately. */
+	tsc_sync_wait();
+
 	cpu_idle();
 }
 
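The hunk above wires tsc_sync_wait() into start_secondary(), which kicks off the rendezvous between sync_tsc() on the new CPU and sync_master() on the boot CPU. A rough user-space analogue of that go[MASTER]/go[SLAVE] handshake is sketched below, using two threads and CLOCK_MONOTONIC in place of the TSC (hypothetical demo code, not from the patch; the plain volatile spin loops mirror the kernel's approach, though a strict C11 program would use atomics):

#include <pthread.h>
#include <stdio.h>
#include <time.h>

#define MASTER 0
#define SLAVE  8                /* separate slots, as in the patch */
#define ROUNDS 5

static volatile unsigned long go[SLAVE + 1];

static unsigned long now_ns(void)
{
        struct timespec ts;
        clock_gettime(CLOCK_MONOTONIC, &ts);
        return ts.tv_sec * 1000000000UL + ts.tv_nsec;
}

/* Master side: answer each request with its current clock,
   as sync_master() does with rdtscll(go[SLAVE]). */
static void *master_fn(void *arg)
{
        int i;
        for (i = 0; i < ROUNDS; i++) {
                while (!go[MASTER])
                        ;               /* no_cpu_relax() in the patch */
                go[MASTER] = 0;
                go[SLAVE] = now_ns();
        }
        return NULL;
}

int main(void)
{
        pthread_t m;
        unsigned long t0, t1, tm;
        int i;

        pthread_create(&m, NULL, master_fn, NULL);
        for (i = 0; i < ROUNDS; i++) {
                t0 = now_ns();          /* rdtscll(t0) in get_delta() */
                go[MASTER] = 1;         /* ask the master for its time */
                while (!(tm = go[SLAVE]))
                        ;
                go[SLAVE] = 0;
                t1 = now_ns();          /* rdtscll(t1) in get_delta() */
                printf("round %d: rt=%lu ns master=%lu\n", i, t1 - t0, tm);
        }
        pthread_join(m, NULL);
        return 0;
}

Each round matches one iteration of get_delta(): the slave timestamps t0, raises go[MASTER], the master answers with its clock in go[SLAVE], and the slave timestamps t1; the round with the smallest t1 - t0 gives the tightest bound on the clock offset.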
@@ -531,7 +646,6 @@ static int __cpuinit do_boot_cpu(int cpu, int apicid)
 		printk("failed fork for CPU %d\n", cpu);
 		return PTR_ERR(idle);
 	}
-	x86_cpu_to_apicid[cpu] = apicid;
 
 	cpu_pda[cpu].pcurrent = idle;
 
@@ -600,8 +714,6 @@ static int __cpuinit do_boot_cpu(int cpu, int apicid)
 
 	if (cpu_isset(cpu, cpu_callin_map)) {
 		/* number CPUs logically, starting from 1 (BSP is 0) */
-		Dprintk("OK.\n");
-		print_cpu_info(&cpu_data[cpu]);
 		Dprintk("CPU has booted.\n");
 	} else {
 		boot_error = 1;
@@ -842,7 +954,6 @@ void __cpuinit smp_prepare_cpus(unsigned int max_cpus)
 			GET_APIC_ID(apic_read(APIC_ID)), boot_cpu_id);
 		/* Or can we switch back to PIC here? */
 	}
-	x86_cpu_to_apicid[0] = boot_cpu_id;
 
 	/*
 	 * Now start the IO-APICs
@@ -889,18 +1000,14 @@ int __cpuinit __cpu_up(unsigned int cpu)
 		printk("__cpu_up: bad cpu %d\n", cpu);
 		return -EINVAL;
 	}
-	sync_tsc_bp_init(1);
 
 	/* Boot it! */
 	err = do_boot_cpu(cpu, apicid);
 	if (err < 0) {
-		sync_tsc_bp_init(0);
 		Dprintk("do_boot_cpu failed %d\n", err);
 		return err;
 	}
 
-	sync_tsc_bp(cpu);
-
 	/* Unleash the CPU! */
 	Dprintk("waiting for cpu %d\n", cpu);
 
@@ -923,4 +1030,6 @@ void __cpuinit smp_cpus_done(unsigned int max_cpus)
 
 	detect_siblings();
 	time_init_gtod();
+
+	check_nmi_watchdog();
 }