Diffstat (limited to 'arch/x86_64/kernel/smpboot.c')
 arch/x86_64/kernel/smpboot.c | 263 ++++++++++++++++++++++++++++++++++++++++++++++----------------
1 file changed, 186 insertions(+), 77 deletions(-)
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c
index 73f7e8b9543a..f1ec0f345941 100644
--- a/arch/x86_64/kernel/smpboot.c
+++ b/arch/x86_64/kernel/smpboot.c
@@ -56,6 +56,7 @@
 #include <asm/kdebug.h>
 #include <asm/tlbflush.h>
 #include <asm/proto.h>
+#include <asm/nmi.h>
 
 /* Change for real CPU hotplug. Note other files need to be fixed
    first too. */
@@ -93,6 +94,7 @@ int smp_threads_ready;
 
 cpumask_t cpu_sibling_map[NR_CPUS] __cacheline_aligned;
 cpumask_t cpu_core_map[NR_CPUS] __cacheline_aligned;
+EXPORT_SYMBOL(cpu_core_map);
 
 /*
  * Trampoline 80x86 program as an array.
@@ -125,96 +127,210 @@ static void __cpuinit smp_store_cpu_info(int id)
 
         *c = boot_cpu_data;
         identify_cpu(c);
+        print_cpu_info(c);
 }
 
 /*
- * Synchronize TSCs of CPUs
+ * New Funky TSC sync algorithm borrowed from IA64.
+ * Main advantage is that it doesn't reset the TSCs fully and
+ * in general looks more robust and it works better than my earlier
+ * attempts. I believe it was written by David Mosberger. Some minor
+ * adjustments for x86-64 by me -AK
  *
- * This new algorithm is less accurate than the old "zero TSCs"
- * one, but we cannot zero TSCs anymore in the new hotplug CPU
- * model.
+ * Original comment reproduced below.
+ *
+ * Synchronize TSC of the current (slave) CPU with the TSC of the
+ * MASTER CPU (normally the time-keeper CPU).  We use a closed loop to
+ * eliminate the possibility of unaccounted-for errors (such as
+ * getting a machine check in the middle of a calibration step).  The
+ * basic idea is for the slave to ask the master what itc value it has
+ * and to read its own itc before and after the master responds.  Each
+ * iteration gives us three timestamps:
+ *
+ *      slave           master
+ *
+ *      t0 ---\
+ *           ---\
+ *               --->
+ *                      tm
+ *               /---
+ *           /---
+ *      t1 <---
+ *
+ *
+ * The goal is to adjust the slave's TSC such that tm falls exactly
+ * half-way between t0 and t1.  If we achieve this, the clocks are
+ * synchronized provided the interconnect between the slave and the
+ * master is symmetric.  Even if the interconnect were asymmetric, we
+ * would still know that the synchronization error is smaller than the
+ * roundtrip latency (t0 - t1).
+ *
+ * When the interconnect is quiet and symmetric, this lets us
+ * synchronize the TSC to within one or two cycles.  However, we can
+ * only *guarantee* that the synchronization is accurate to within a
+ * round-trip time, which is typically in the range of several hundred
+ * cycles (e.g., ~500 cycles).  In practice, this means that the TSCs
+ * are usually almost perfectly synchronized, but we shouldn't assume
+ * that the accuracy is much better than half a micro second or so.
+ *
+ * [there are other errors like the latency of RDTSC and of the
+ * WRMSR. These can also account to hundreds of cycles. So it's
+ * probably worse. It claims 153 cycles error on a dual Opteron,
+ * but I suspect the numbers are actually somewhat worse -AK]
  */
 
-static atomic_t __cpuinitdata tsc_flag;
+#define MASTER  0
+#define SLAVE   (SMP_CACHE_BYTES/8)
+
+/* Intentionally don't use cpu_relax() while TSC synchronization
+   because we don't want to go into funky power save modi or cause
+   hypervisors to schedule us away.  Going to sleep would likely affect
+   latency and low latency is the primary objective here. -AK */
+#define no_cpu_relax() barrier()
+
 static __cpuinitdata DEFINE_SPINLOCK(tsc_sync_lock);
-static unsigned long long __cpuinitdata bp_tsc, ap_tsc;
+static volatile __cpuinitdata unsigned long go[SLAVE + 1];
+static int notscsync __cpuinitdata;
+
+#undef DEBUG_TSC_SYNC
 
-#define NR_LOOPS 5
+#define NUM_ROUNDS      64      /* magic value */
+#define NUM_ITERS       5       /* likewise */
 
-static void __cpuinit sync_tsc_bp_init(int init)
+/* Callback on boot CPU */
+static __cpuinit void sync_master(void *arg)
 {
-        if (init)
-                _raw_spin_lock(&tsc_sync_lock);
-        else
-                _raw_spin_unlock(&tsc_sync_lock);
-        atomic_set(&tsc_flag, 0);
+        unsigned long flags, i;
+
+        if (smp_processor_id() != boot_cpu_id)
+                return;
+
+        go[MASTER] = 0;
+
+        local_irq_save(flags);
+        {
+                for (i = 0; i < NUM_ROUNDS*NUM_ITERS; ++i) {
+                        while (!go[MASTER])
+                                no_cpu_relax();
+                        go[MASTER] = 0;
+                        rdtscll(go[SLAVE]);
+                }
+        }
+        local_irq_restore(flags);
 }
 
 /*
- * Synchronize TSC on AP with BP.
+ * Return the number of cycles by which our tsc differs from the tsc
+ * on the master (time-keeper) CPU.  A positive number indicates our
+ * tsc is ahead of the master, negative that it is behind.
  */
-static void __cpuinit __sync_tsc_ap(void)
+static inline long
+get_delta(long *rt, long *master)
 {
-        if (!cpu_has_tsc)
-                return;
-        Dprintk("AP %d syncing TSC\n", smp_processor_id());
+        unsigned long best_t0 = 0, best_t1 = ~0UL, best_tm = 0;
+        unsigned long tcenter, t0, t1, tm;
+        int i;
 
-        while (atomic_read(&tsc_flag) != 0)
-                cpu_relax();
-        atomic_inc(&tsc_flag);
-        mb();
-        _raw_spin_lock(&tsc_sync_lock);
-        wrmsrl(MSR_IA32_TSC, bp_tsc);
-        _raw_spin_unlock(&tsc_sync_lock);
-        rdtscll(ap_tsc);
-        mb();
-        atomic_inc(&tsc_flag);
-        mb();
+        for (i = 0; i < NUM_ITERS; ++i) {
+                rdtscll(t0);
+                go[MASTER] = 1;
+                while (!(tm = go[SLAVE]))
+                        no_cpu_relax();
+                go[SLAVE] = 0;
+                rdtscll(t1);
+
+                if (t1 - t0 < best_t1 - best_t0)
+                        best_t0 = t0, best_t1 = t1, best_tm = tm;
+        }
+
+        *rt = best_t1 - best_t0;
+        *master = best_tm - best_t0;
+
+        /* average best_t0 and best_t1 without overflow: */
+        tcenter = (best_t0/2 + best_t1/2);
+        if (best_t0 % 2 + best_t1 % 2 == 2)
+                ++tcenter;
+        return tcenter - best_tm;
 }
 
-static void __cpuinit sync_tsc_ap(void)
+static __cpuinit void sync_tsc(void)
 {
-        int i;
-        for (i = 0; i < NR_LOOPS; i++)
-                __sync_tsc_ap();
+        int i, done = 0;
+        long delta, adj, adjust_latency = 0;
+        unsigned long flags, rt, master_time_stamp, bound;
+#if DEBUG_TSC_SYNC
+        static struct syncdebug {
+                long rt;        /* roundtrip time */
+                long master;    /* master's timestamp */
+                long diff;      /* difference between midpoint and master's timestamp */
+                long lat;       /* estimate of tsc adjustment latency */
+        } t[NUM_ROUNDS] __cpuinitdata;
+#endif
+
+        go[MASTER] = 1;
+
+        smp_call_function(sync_master, NULL, 1, 0);
+
+        while (go[MASTER])      /* wait for master to be ready */
+                no_cpu_relax();
+
+        spin_lock_irqsave(&tsc_sync_lock, flags);
+        {
+                for (i = 0; i < NUM_ROUNDS; ++i) {
+                        delta = get_delta(&rt, &master_time_stamp);
+                        if (delta == 0) {
+                                done = 1;       /* let's lock on to this... */
+                                bound = rt;
+                        }
+
+                        if (!done) {
+                                unsigned long t;
+                                if (i > 0) {
+                                        adjust_latency += -delta;
+                                        adj = -delta + adjust_latency/4;
+                                } else
+                                        adj = -delta;
+
+                                rdtscll(t);
+                                wrmsrl(MSR_IA32_TSC, t + adj);
+                        }
+#if DEBUG_TSC_SYNC
+                        t[i].rt = rt;
+                        t[i].master = master_time_stamp;
+                        t[i].diff = delta;
+                        t[i].lat = adjust_latency/4;
+#endif
+                }
+        }
+        spin_unlock_irqrestore(&tsc_sync_lock, flags);
+
+#if DEBUG_TSC_SYNC
+        for (i = 0; i < NUM_ROUNDS; ++i)
+                printk("rt=%5ld master=%5ld diff=%5ld adjlat=%5ld\n",
+                       t[i].rt, t[i].master, t[i].diff, t[i].lat);
+#endif
+
+        printk(KERN_INFO
+               "CPU %d: synchronized TSC with CPU %u (last diff %ld cycles, "
+               "maxerr %lu cycles)\n",
+               smp_processor_id(), boot_cpu_id, delta, rt);
 }
 
-/*
- * Synchronize TSC from BP to AP.
- */
-static void __cpuinit __sync_tsc_bp(int cpu)
+static void __cpuinit tsc_sync_wait(void)
 {
-        if (!cpu_has_tsc)
+        if (notscsync || !cpu_has_tsc)
                 return;
-
-        /* Wait for AP */
-        while (atomic_read(&tsc_flag) == 0)
-                cpu_relax();
-        /* Save BPs TSC */
-        sync_core();
-        rdtscll(bp_tsc);
-        /* Don't do the sync core here to avoid too much latency. */
-        mb();
-        /* Start the AP */
-        _raw_spin_unlock(&tsc_sync_lock);
-        /* Wait for AP again */
-        while (atomic_read(&tsc_flag) < 2)
-                cpu_relax();
-        rdtscl(bp_tsc);
-        barrier();
+        printk(KERN_INFO "CPU %d: Syncing TSC to CPU %u.\n", smp_processor_id(),
+               boot_cpu_id);
+        sync_tsc();
 }
 
-static void __cpuinit sync_tsc_bp(int cpu)
+static __init int notscsync_setup(char *s)
 {
-        int i;
-        for (i = 0; i < NR_LOOPS - 1; i++) {
-                __sync_tsc_bp(cpu);
-                sync_tsc_bp_init(1);
-        }
-        __sync_tsc_bp(cpu);
-        printk(KERN_INFO "Synced TSC of CPU %d difference %Ld\n",
-               cpu, ap_tsc - bp_tsc);
+        notscsync = 1;
+        return 0;
 }
+__setup("notscsync", notscsync_setup);
 
 static atomic_t init_deasserted __cpuinitdata;
 
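The arithmetic at the heart of get_delta() above is easy to check by hand: the slave reads t0, asks the master for its timestamp tm, reads t1, and treats the distance of tm from the midpoint of [t0, t1] as its offset. Below is a minimal standalone sketch of just that calculation in plain userspace C; the function name midpoint_delta and the sample timestamps are invented for illustration, and the kernel gets the real values from RDTSC.

/* Standalone sketch of get_delta()'s midpoint arithmetic; not kernel code. */
#include <stdio.h>

/* Offset of the slave TSC estimated from one t0/tm/t1 sample.
 * Positive: the slave's clock is ahead of the master's; negative: behind. */
static long midpoint_delta(unsigned long t0, unsigned long tm, unsigned long t1)
{
        unsigned long tcenter;

        /* Average t0 and t1 without overflowing unsigned long: halve each
         * term, then add back the bit that is lost if both were odd. */
        tcenter = t0 / 2 + t1 / 2;
        if (t0 % 2 + t1 % 2 == 2)
                ++tcenter;

        return (long)(tcenter - tm);
}

int main(void)
{
        /* Invented sample: the slave reads t0=1000000 and t1=1000400
         * (roundtrip 400 cycles); the master reported tm=1000150, so the
         * slave's midpoint 1000200 says it runs about 50 cycles ahead. */
        unsigned long t0 = 1000000, tm = 1000150, t1 = 1000400;

        printf("roundtrip=%lu delta=%ld\n", t1 - t0, midpoint_delta(t0, tm, t1));
        return 0;
}

In sync_tsc() that delta then drives a WRMSR of the TSC, with the accumulated adjust_latency/4 term damping out the cost of the adjustment itself, and the best roundtrip rt is what the final printk reports as the maximum error bound. The hunk also adds a notscsync kernel command-line flag (bare "notscsync", no argument) that skips the synchronization entirely.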
@@ -315,11 +431,6 @@ void __cpuinit start_secondary(void)
         cpu_init();
         smp_callin();
 
-        /*
-         * Synchronize the TSC with the BP
-         */
-        sync_tsc_ap();
-
         /* otherwise gcc will move up the smp_processor_id before the cpu_init */
         barrier();
 
@@ -334,7 +445,6 @@ void __cpuinit start_secondary(void)
                 enable_8259A_irq(0);
         }
 
-
         enable_APIC_timer();
 
         /*
@@ -343,6 +453,11 @@ void __cpuinit start_secondary(void)
         cpu_set(smp_processor_id(), cpu_online_map);
         mb();
 
+        /* Wait for TSC sync to not schedule things before.
+           We still process interrupts, which could see an inconsistent
+           time in that window unfortunately. */
+        tsc_sync_wait();
+
         cpu_idle();
 }
 
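The hunk above is where a freshly onlined AP actually performs the synchronization, by calling tsc_sync_wait() just before entering cpu_idle(). To make the go[MASTER]/go[SLAVE] handshake between sync_master() and get_delta() easier to follow, here is a userspace analogy with invented names (master, now_ns, go_master, go_slave): two pthreads and CLOCK_MONOTONIC stand in for the boot CPU, the smp_call_function() IPI and RDTSC. It only illustrates the protocol, not the kernel implementation.

/* Userspace analogy of the sync_master()/get_delta() handshake; the names
 * and the use of pthreads/clock_gettime() are illustrative only. */
#include <pthread.h>
#include <stdio.h>
#include <time.h>

#define NUM_ITERS 5

static volatile unsigned long go_master;        /* slave -> master: "reply now" */
static volatile unsigned long go_slave;         /* master -> slave: its timestamp */

static unsigned long now_ns(void)
{
        struct timespec ts;
        clock_gettime(CLOCK_MONOTONIC, &ts);
        return (unsigned long)ts.tv_sec * 1000000000ul + ts.tv_nsec;
}

/* Master side: each time the slave raises go_master, publish a timestamp,
 * which is the role sync_master() plays on the boot CPU. */
static void *master(void *arg)
{
        int i;

        (void)arg;
        for (i = 0; i < NUM_ITERS; i++) {
                while (!go_master)
                        ;                       /* spin, like no_cpu_relax() */
                go_master = 0;
                go_slave = now_ns();            /* stands in for rdtscll(go[SLAVE]) */
        }
        return NULL;
}

int main(void)
{
        unsigned long best_t0 = 0, best_t1 = ~0ul, best_tm = 0;
        unsigned long t0, t1, tm, tcenter;
        pthread_t tid;
        int i;

        pthread_create(&tid, NULL, master, NULL);

        for (i = 0; i < NUM_ITERS; i++) {       /* the get_delta() loop */
                t0 = now_ns();
                go_master = 1;
                while (!(tm = go_slave))
                        ;
                go_slave = 0;
                t1 = now_ns();

                if (t1 - t0 < best_t1 - best_t0)
                        best_t0 = t0, best_t1 = t1, best_tm = tm;
        }
        pthread_join(tid, NULL);

        /* Same overflow-safe midpoint as the kernel code. */
        tcenter = best_t0 / 2 + best_t1 / 2;
        if (best_t0 % 2 + best_t1 % 2 == 2)
                ++tcenter;

        printf("roundtrip=%lu ns, offset vs master=%ld ns\n",
               best_t1 - best_t0, (long)(tcenter - best_tm));
        return 0;
}

Built with cc -pthread, both threads read the same clock, so the reported offset should be close to zero; the interesting number is the roundtrip, which bounds the error just as rt does in the kernel's sync_tsc().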
@@ -531,7 +646,6 @@ static int __cpuinit do_boot_cpu(int cpu, int apicid)
                 printk("failed fork for CPU %d\n", cpu);
                 return PTR_ERR(idle);
         }
-        x86_cpu_to_apicid[cpu] = apicid;
 
         cpu_pda[cpu].pcurrent = idle;
 
@@ -600,8 +714,6 @@ static int __cpuinit do_boot_cpu(int cpu, int apicid)
 
         if (cpu_isset(cpu, cpu_callin_map)) {
                 /* number CPUs logically, starting from 1 (BSP is 0) */
-                Dprintk("OK.\n");
-                print_cpu_info(&cpu_data[cpu]);
                 Dprintk("CPU has booted.\n");
         } else {
                 boot_error = 1;
@@ -842,7 +954,6 @@ void __cpuinit smp_prepare_cpus(unsigned int max_cpus)
                         GET_APIC_ID(apic_read(APIC_ID)), boot_cpu_id);
                 /* Or can we switch back to PIC here? */
         }
-        x86_cpu_to_apicid[0] = boot_cpu_id;
 
         /*
          * Now start the IO-APICs
@@ -889,18 +1000,14 @@ int __cpuinit __cpu_up(unsigned int cpu)
                 printk("__cpu_up: bad cpu %d\n", cpu);
                 return -EINVAL;
         }
-        sync_tsc_bp_init(1);
 
         /* Boot it! */
         err = do_boot_cpu(cpu, apicid);
         if (err < 0) {
-                sync_tsc_bp_init(0);
                 Dprintk("do_boot_cpu failed %d\n", err);
                 return err;
         }
 
-        sync_tsc_bp(cpu);
-
         /* Unleash the CPU! */
         Dprintk("waiting for cpu %d\n", cpu);
 
@@ -923,4 +1030,6 @@ void __cpuinit smp_cpus_done(unsigned int max_cpus)
 
         detect_siblings();
         time_init_gtod();
+
+        check_nmi_watchdog();
 }
