diff options
-rw-r--r-- | arch/x86_64/kernel/smpboot.c | 257 |
1 files changed, 182 insertions, 75 deletions
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index 73f7e8b9543a..17efc40baa32 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c | |||
@@ -125,96 +125,210 @@ static void __cpuinit smp_store_cpu_info(int id) | |||
125 | 125 | ||
126 | *c = boot_cpu_data; | 126 | *c = boot_cpu_data; |
127 | identify_cpu(c); | 127 | identify_cpu(c); |
128 | print_cpu_info(c); | ||
128 | } | 129 | } |
129 | 130 | ||
130 | /* | 131 | /* |
131 | * Synchronize TSCs of CPUs | 132 | * New Funky TSC sync algorithm borrowed from IA64. |
133 | * Main advantage is that it doesn't reset the TSCs fully and | ||
134 | * in general looks more robust and it works better than my earlier | ||
135 | * attempts. I believe it was written by David Mosberger. Some minor | ||
136 | * adjustments for x86-64 by me -AK | ||
132 | * | 137 | * |
133 | * This new algorithm is less accurate than the old "zero TSCs" | 138 | * Original comment reproduced below. |
134 | * one, but we cannot zero TSCs anymore in the new hotplug CPU | 139 | * |
135 | * model. | 140 | * Synchronize TSC of the current (slave) CPU with the TSC of the |
141 | * MASTER CPU (normally the time-keeper CPU). We use a closed loop to | ||
142 | * eliminate the possibility of unaccounted-for errors (such as | ||
143 | * getting a machine check in the middle of a calibration step). The | ||
144 | * basic idea is for the slave to ask the master what itc value it has | ||
145 | * and to read its own itc before and after the master responds. Each | ||
146 | * iteration gives us three timestamps: | ||
147 | * | ||
148 | * slave master | ||
149 | * | ||
150 | * t0 ---\ | ||
151 | * ---\ | ||
152 | * ---> | ||
153 | * tm | ||
154 | * /--- | ||
155 | * /--- | ||
156 | * t1 <--- | ||
157 | * | ||
158 | * | ||
159 | * The goal is to adjust the slave's TSC such that tm falls exactly | ||
160 | * half-way between t0 and t1. If we achieve this, the clocks are | ||
161 | * synchronized provided the interconnect between the slave and the | ||
162 | * master is symmetric. Even if the interconnect were asymmetric, we | ||
163 | * would still know that the synchronization error is smaller than the | ||
164 | * roundtrip latency (t0 - t1). | ||
165 | * | ||
166 | * When the interconnect is quiet and symmetric, this lets us | ||
167 | * synchronize the TSC to within one or two cycles. However, we can | ||
168 | * only *guarantee* that the synchronization is accurate to within a | ||
169 | * round-trip time, which is typically in the range of several hundred | ||
170 | * cycles (e.g., ~500 cycles). In practice, this means that the TSCs | ||
171 | * are usually almost perfectly synchronized, but we shouldn't assume | ||
172 | * that the accuracy is much better than half a microsecond or so. | ||
173 | * | ||
174 | * [there are other errors like the latency of RDTSC and of the | ||
175 | * WRMSR. These can also account for hundreds of cycles. So it's | ||
176 | * probably worse. It claims 153 cycles error on a dual Opteron, | ||
177 | * but I suspect the numbers are actually somewhat worse -AK] | ||
136 | */ | 178 | */ |
137 | 179 | ||
138 | static atomic_t __cpuinitdata tsc_flag; | 180 | #define MASTER 0 |
181 | #define SLAVE (SMP_CACHE_BYTES/8) | ||
182 | |||
183 | /* Intentionally don't use cpu_relax() while TSC synchronization | ||
184 | because we don't want to go into funky power save modi or cause | ||
185 | hypervisors to schedule us away. Going to sleep would likely affect | ||
186 | latency and low latency is the primary objective here. -AK */ | ||
187 | #define no_cpu_relax() barrier() | ||
188 | |||
139 | static __cpuinitdata DEFINE_SPINLOCK(tsc_sync_lock); | 189 | static __cpuinitdata DEFINE_SPINLOCK(tsc_sync_lock); |
140 | static unsigned long long __cpuinitdata bp_tsc, ap_tsc; | 190 | static volatile __cpuinitdata unsigned long go[SLAVE + 1]; |
191 | static int notscsync __cpuinitdata; | ||
192 | |||
193 | #undef DEBUG_TSC_SYNC | ||
141 | 194 | ||
142 | #define NR_LOOPS 5 | 195 | #define NUM_ROUNDS 64 /* magic value */ |
196 | #define NUM_ITERS 5 /* likewise */ | ||
143 | 197 | ||
144 | static void __cpuinit sync_tsc_bp_init(int init) | 198 | /* Callback on boot CPU */ |
199 | static __cpuinit void sync_master(void *arg) | ||
145 | { | 200 | { |
146 | if (init) | 201 | unsigned long flags, i; |
147 | _raw_spin_lock(&tsc_sync_lock); | 202 | |
148 | else | 203 | if (smp_processor_id() != boot_cpu_id) |
149 | _raw_spin_unlock(&tsc_sync_lock); | 204 | return; |
150 | atomic_set(&tsc_flag, 0); | 205 | |
206 | go[MASTER] = 0; | ||
207 | |||
208 | local_irq_save(flags); | ||
209 | { | ||
210 | for (i = 0; i < NUM_ROUNDS*NUM_ITERS; ++i) { | ||
211 | while (!go[MASTER]) | ||
212 | no_cpu_relax(); | ||
213 | go[MASTER] = 0; | ||
214 | rdtscll(go[SLAVE]); | ||
215 | } | ||
216 | } | ||
217 | local_irq_restore(flags); | ||
151 | } | 218 | } |
152 | 219 | ||
153 | /* | 220 | /* |
154 | * Synchronize TSC on AP with BP. | 221 | * Return the number of cycles by which our tsc differs from the tsc |
222 | * on the master (time-keeper) CPU. A positive number indicates our | ||
223 | * tsc is ahead of the master, negative that it is behind. | ||
155 | */ | 224 | */ |
156 | static void __cpuinit __sync_tsc_ap(void) | 225 | static inline long |
226 | get_delta(long *rt, long *master) | ||
157 | { | 227 | { |
158 | if (!cpu_has_tsc) | 228 | unsigned long best_t0 = 0, best_t1 = ~0UL, best_tm = 0; |
159 | return; | 229 | unsigned long tcenter, t0, t1, tm; |
160 | Dprintk("AP %d syncing TSC\n", smp_processor_id()); | 230 | int i; |
161 | 231 | ||
162 | while (atomic_read(&tsc_flag) != 0) | 232 | for (i = 0; i < NUM_ITERS; ++i) { |
163 | cpu_relax(); | 233 | rdtscll(t0); |
164 | atomic_inc(&tsc_flag); | 234 | go[MASTER] = 1; |
165 | mb(); | 235 | while (!(tm = go[SLAVE])) |
166 | _raw_spin_lock(&tsc_sync_lock); | 236 | no_cpu_relax(); |
167 | wrmsrl(MSR_IA32_TSC, bp_tsc); | 237 | go[SLAVE] = 0; |
168 | _raw_spin_unlock(&tsc_sync_lock); | 238 | rdtscll(t1); |
169 | rdtscll(ap_tsc); | 239 | |
170 | mb(); | 240 | if (t1 - t0 < best_t1 - best_t0) |
171 | atomic_inc(&tsc_flag); | 241 | best_t0 = t0, best_t1 = t1, best_tm = tm; |
172 | mb(); | 242 | } |
243 | |||
244 | *rt = best_t1 - best_t0; | ||
245 | *master = best_tm - best_t0; | ||
246 | |||
247 | /* average best_t0 and best_t1 without overflow: */ | ||
248 | tcenter = (best_t0/2 + best_t1/2); | ||
249 | if (best_t0 % 2 + best_t1 % 2 == 2) | ||
250 | ++tcenter; | ||
251 | return tcenter - best_tm; | ||
173 | } | 252 | } |
174 | 253 | ||
175 | static void __cpuinit sync_tsc_ap(void) | 254 | static __cpuinit void sync_tsc(void) |
176 | { | 255 | { |
177 | int i; | 256 | int i, done = 0; |
178 | for (i = 0; i < NR_LOOPS; i++) | 257 | long delta, adj, adjust_latency = 0; |
179 | __sync_tsc_ap(); | 258 | unsigned long flags, rt, master_time_stamp, bound; |
259 | #if DEBUG_TSC_SYNC | ||
260 | static struct syncdebug { | ||
261 | long rt; /* roundtrip time */ | ||
262 | long master; /* master's timestamp */ | ||
263 | long diff; /* difference between midpoint and master's timestamp */ | ||
264 | long lat; /* estimate of tsc adjustment latency */ | ||
265 | } t[NUM_ROUNDS] __cpuinitdata; | ||
266 | #endif | ||
267 | |||
268 | go[MASTER] = 1; | ||
269 | |||
270 | smp_call_function(sync_master, NULL, 1, 0); | ||
271 | |||
272 | while (go[MASTER]) /* wait for master to be ready */ | ||
273 | no_cpu_relax(); | ||
274 | |||
275 | spin_lock_irqsave(&tsc_sync_lock, flags); | ||
276 | { | ||
277 | for (i = 0; i < NUM_ROUNDS; ++i) { | ||
278 | delta = get_delta(&rt, &master_time_stamp); | ||
279 | if (delta == 0) { | ||
280 | done = 1; /* let's lock on to this... */ | ||
281 | bound = rt; | ||
282 | } | ||
283 | |||
284 | if (!done) { | ||
285 | unsigned long t; | ||
286 | if (i > 0) { | ||
287 | adjust_latency += -delta; | ||
288 | adj = -delta + adjust_latency/4; | ||
289 | } else | ||
290 | adj = -delta; | ||
291 | |||
292 | rdtscll(t); | ||
293 | wrmsrl(MSR_IA32_TSC, t + adj); | ||
294 | } | ||
295 | #if DEBUG_TSC_SYNC | ||
296 | t[i].rt = rt; | ||
297 | t[i].master = master_time_stamp; | ||
298 | t[i].diff = delta; | ||
299 | t[i].lat = adjust_latency/4; | ||
300 | #endif | ||
301 | } | ||
302 | } | ||
303 | spin_unlock_irqrestore(&tsc_sync_lock, flags); | ||
304 | |||
305 | #if DEBUG_TSC_SYNC | ||
306 | for (i = 0; i < NUM_ROUNDS; ++i) | ||
307 | printk("rt=%5ld master=%5ld diff=%5ld adjlat=%5ld\n", | ||
308 | t[i].rt, t[i].master, t[i].diff, t[i].lat); | ||
309 | #endif | ||
310 | |||
311 | printk(KERN_INFO | ||
312 | "CPU %d: synchronized TSC with CPU %u (last diff %ld cycles, " | ||
313 | "maxerr %lu cycles)\n", | ||
314 | smp_processor_id(), boot_cpu_id, delta, rt); | ||
180 | } | 315 | } |
181 | 316 | ||
182 | /* | 317 | static void __cpuinit tsc_sync_wait(void) |
183 | * Synchronize TSC from BP to AP. | ||
184 | */ | ||
185 | static void __cpuinit __sync_tsc_bp(int cpu) | ||
186 | { | 318 | { |
187 | if (!cpu_has_tsc) | 319 | if (notscsync || !cpu_has_tsc) |
188 | return; | 320 | return; |
189 | 321 | printk(KERN_INFO "CPU %d: Syncing TSC to CPU %u.\n", smp_processor_id(), | |
190 | /* Wait for AP */ | 322 | boot_cpu_id); |
191 | while (atomic_read(&tsc_flag) == 0) | 323 | sync_tsc(); |
192 | cpu_relax(); | ||
193 | /* Save BPs TSC */ | ||
194 | sync_core(); | ||
195 | rdtscll(bp_tsc); | ||
196 | /* Don't do the sync core here to avoid too much latency. */ | ||
197 | mb(); | ||
198 | /* Start the AP */ | ||
199 | _raw_spin_unlock(&tsc_sync_lock); | ||
200 | /* Wait for AP again */ | ||
201 | while (atomic_read(&tsc_flag) < 2) | ||
202 | cpu_relax(); | ||
203 | rdtscl(bp_tsc); | ||
204 | barrier(); | ||
205 | } | 324 | } |
206 | 325 | ||
207 | static void __cpuinit sync_tsc_bp(int cpu) | 326 | static __init int notscsync_setup(char *s) |
208 | { | 327 | { |
209 | int i; | 328 | notscsync = 1; |
210 | for (i = 0; i < NR_LOOPS - 1; i++) { | 329 | return 0; |
211 | __sync_tsc_bp(cpu); | ||
212 | sync_tsc_bp_init(1); | ||
213 | } | ||
214 | __sync_tsc_bp(cpu); | ||
215 | printk(KERN_INFO "Synced TSC of CPU %d difference %Ld\n", | ||
216 | cpu, ap_tsc - bp_tsc); | ||
217 | } | 330 | } |
331 | __setup("notscsync", notscsync_setup); | ||
218 | 332 | ||
219 | static atomic_t init_deasserted __cpuinitdata; | 333 | static atomic_t init_deasserted __cpuinitdata; |
220 | 334 | ||
@@ -315,11 +429,6 @@ void __cpuinit start_secondary(void) | |||
315 | cpu_init(); | 429 | cpu_init(); |
316 | smp_callin(); | 430 | smp_callin(); |
317 | 431 | ||
318 | /* | ||
319 | * Synchronize the TSC with the BP | ||
320 | */ | ||
321 | sync_tsc_ap(); | ||
322 | |||
323 | /* otherwise gcc will move up the smp_processor_id before the cpu_init */ | 432 | /* otherwise gcc will move up the smp_processor_id before the cpu_init */ |
324 | barrier(); | 433 | barrier(); |
325 | 434 | ||
@@ -334,7 +443,6 @@ void __cpuinit start_secondary(void) | |||
334 | enable_8259A_irq(0); | 443 | enable_8259A_irq(0); |
335 | } | 444 | } |
336 | 445 | ||
337 | |||
338 | enable_APIC_timer(); | 446 | enable_APIC_timer(); |
339 | 447 | ||
340 | /* | 448 | /* |
@@ -343,6 +451,11 @@ void __cpuinit start_secondary(void) | |||
343 | cpu_set(smp_processor_id(), cpu_online_map); | 451 | cpu_set(smp_processor_id(), cpu_online_map); |
344 | mb(); | 452 | mb(); |
345 | 453 | ||
454 | /* Wait for TSC sync to not schedule things before. | ||
455 | We still process interrupts, which could see an inconsistent | ||
456 | time in that window unfortunately. */ | ||
457 | tsc_sync_wait(); | ||
458 | |||
346 | cpu_idle(); | 459 | cpu_idle(); |
347 | } | 460 | } |
348 | 461 | ||
@@ -600,8 +713,6 @@ static int __cpuinit do_boot_cpu(int cpu, int apicid) | |||
600 | 713 | ||
601 | if (cpu_isset(cpu, cpu_callin_map)) { | 714 | if (cpu_isset(cpu, cpu_callin_map)) { |
602 | /* number CPUs logically, starting from 1 (BSP is 0) */ | 715 | /* number CPUs logically, starting from 1 (BSP is 0) */ |
603 | Dprintk("OK.\n"); | ||
604 | print_cpu_info(&cpu_data[cpu]); | ||
605 | Dprintk("CPU has booted.\n"); | 716 | Dprintk("CPU has booted.\n"); |
606 | } else { | 717 | } else { |
607 | boot_error = 1; | 718 | boot_error = 1; |
@@ -889,18 +1000,14 @@ int __cpuinit __cpu_up(unsigned int cpu) | |||
889 | printk("__cpu_up: bad cpu %d\n", cpu); | 1000 | printk("__cpu_up: bad cpu %d\n", cpu); |
890 | return -EINVAL; | 1001 | return -EINVAL; |
891 | } | 1002 | } |
892 | sync_tsc_bp_init(1); | ||
893 | 1003 | ||
894 | /* Boot it! */ | 1004 | /* Boot it! */ |
895 | err = do_boot_cpu(cpu, apicid); | 1005 | err = do_boot_cpu(cpu, apicid); |
896 | if (err < 0) { | 1006 | if (err < 0) { |
897 | sync_tsc_bp_init(0); | ||
898 | Dprintk("do_boot_cpu failed %d\n", err); | 1007 | Dprintk("do_boot_cpu failed %d\n", err); |
899 | return err; | 1008 | return err; |
900 | } | 1009 | } |
901 | 1010 | ||
902 | sync_tsc_bp(cpu); | ||
903 | |||
904 | /* Unleash the CPU! */ | 1011 | /* Unleash the CPU! */ |
905 | Dprintk("waiting for cpu %d\n", cpu); | 1012 | Dprintk("waiting for cpu %d\n", cpu); |
906 | 1013 | ||