author	Andi Kleen <ak@suse.de>	2005-04-16 18:25:19 -0400
committer	Linus Torvalds <torvalds@ppc970.osdl.org>	2005-04-16 18:25:19 -0400
commit	a8ab26fe5bfeef43bdcde5182ca051ae0647607e (patch)
tree	456f92b42111f83a4568b27efd863f20ffabbe3c /arch
parent	ebfcaa96fccc01301a577c5c56a5f00543cf167e (diff)
[PATCH] x86_64: Switch SMP bootup over to new CPU hotplug state machine
This will allow hotplug CPU in the future and in general cleans up a lot
of crufty code. It also should plug some races that the old hackish way
introduces. Remove one old race workaround in NMI watchdog setup that is
not needed anymore.

I removed the old total sum of bogomips reporting code. The brag value
of BogoMips has been greatly devalued in the last years on the open
market.

Real CPU hotplug will need some more work, but the infrastructure for
it is there now.

One drawback: the new TSC sync algorithm is less accurate than before.
The old way of zeroing TSCs is too intrusive to do later. Instead the
TSC of the BP is duplicated now, which is less accurate.

akpm:
- sync_tsc_bp_init seems to have the sense of `init' inverted.
- SPIN_LOCK_UNLOCKED is deprecated - use DEFINE_SPINLOCK.

Cc: <rusty@rustcorp.com.au>
Cc: <mingo@elte.hu>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
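[Editor's note: for illustration only, not part of the patch. Below is a
minimal userspace sketch of the BP->AP handshake that the new
sync_tsc_bp()/sync_tsc_ap() pair in the diff implements: one side waits
for the other through staged increments of a flag, the BP samples its
TSC, then releases the AP to copy it. The file name is made up, the
staged counter stands in for the kernel's atomic_t tsc_flag, and the
AP-side write is only simulated with a second __rdtsc(), since userspace
cannot issue the patch's wrmsrl(MSR_IA32_TSC, bp_tsc). Assumes x86 and
GCC; build with gcc -O2 -pthread.]

/* tsc_sync_sketch.c - userspace model of the BP->AP TSC copy handshake */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <x86intrin.h>		/* __rdtsc() */

static atomic_int tsc_flag;	/* handshake stage, like the kernel's tsc_flag */
static unsigned long long bp_tsc, ap_tsc;

static void *ap_thread(void *unused)
{
	atomic_fetch_add(&tsc_flag, 1);		/* stage 1: AP is ready */
	while (atomic_load(&tsc_flag) < 2)	/* wait until BP saved bp_tsc */
		;
	/* kernel: wrmsrl(MSR_IA32_TSC, bp_tsc); userspace can only read */
	ap_tsc = __rdtsc();
	atomic_fetch_add(&tsc_flag, 1);		/* stage 3: AP is done */
	return NULL;
}

int main(void)
{
	pthread_t ap;

	pthread_create(&ap, NULL, ap_thread, NULL);
	while (atomic_load(&tsc_flag) < 1)	/* wait for AP */
		;
	bp_tsc = __rdtsc();			/* save BP's TSC */
	atomic_fetch_add(&tsc_flag, 1);		/* stage 2: release the AP */
	while (atomic_load(&tsc_flag) < 3)	/* wait for AP again */
		;
	pthread_join(ap, NULL);
	printf("synced TSC difference %lld\n", (long long)(ap_tsc - bp_tsc));
	return 0;
}

[Any latency between the BP's TSC sample and the AP's copy shows up
directly as residual skew, which is why the message calls this
duplication scheme less accurate than the old zero-all-TSCs barrier.]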
Diffstat (limited to 'arch')
-rw-r--r--	arch/x86_64/kernel/nmi.c	6
-rw-r--r--	arch/x86_64/kernel/smp.c	25
-rw-r--r--	arch/x86_64/kernel/smpboot.c	688
-rw-r--r--	arch/x86_64/kernel/time.c	9
4 files changed, 363 insertions(+), 365 deletions(-)
diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c
index 6ec5217f4b73..72f40ca4f06b 100644
--- a/arch/x86_64/kernel/nmi.c
+++ b/arch/x86_64/kernel/nmi.c
@@ -130,12 +130,6 @@ int __init check_nmi_watchdog (void)
 	mdelay((10*1000)/nmi_hz); // wait 10 ticks
 
 	for (cpu = 0; cpu < NR_CPUS; cpu++) {
-#ifdef CONFIG_SMP
-		/* Check cpu_callin_map here because that is set
-		   after the timer is started. */
-		if (!cpu_isset(cpu, cpu_callin_map))
-			continue;
-#endif
 		if (cpu_pda[cpu].__nmi_count - counts[cpu] <= 5) {
 			printk("CPU#%d: NMI appears to be stuck (%d)!\n",
 				cpu,
diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c
index e3cdbf9a88bd..1e379ed17b1d 100644
--- a/arch/x86_64/kernel/smp.c
+++ b/arch/x86_64/kernel/smp.c
@@ -27,6 +27,7 @@
 #include <asm/mach_apic.h>
 #include <asm/mmu_context.h>
 #include <asm/proto.h>
+#include <asm/apicdef.h>
 
 /*
  * Smarter SMP flushing macros.
@@ -413,3 +414,27 @@ asmlinkage void smp_call_function_interrupt(void)
 		atomic_inc(&call_data->finished);
 	}
 }
+
+int safe_smp_processor_id(void)
+{
+	int apicid, i;
+
+	if (disable_apic)
+		return 0;
+
+	apicid = hard_smp_processor_id();
+	if (x86_cpu_to_apicid[apicid] == apicid)
+		return apicid;
+
+	for (i = 0; i < NR_CPUS; ++i) {
+		if (x86_cpu_to_apicid[i] == apicid)
+			return i;
+	}
+
+	/* No entries in x86_cpu_to_apicid?  Either no MPS|ACPI,
+	 * or called too early.  Either way, we must be CPU 0. */
+	if (x86_cpu_to_apicid[0] == BAD_APICID)
+		return 0;
+
+	return 0; /* Should not happen */
+}
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c
index 7e2de8925b0f..eca01e82d3c7 100644
--- a/arch/x86_64/kernel/smpboot.c
+++ b/arch/x86_64/kernel/smpboot.c
@@ -12,8 +12,7 @@
  *	Pentium Pro and Pentium-II/Xeon MP machines.
  *	Original development of Linux SMP code supported by Caldera.
  *
- *	This code is released under the GNU General Public License version 2 or
- *	later.
+ *	This code is released under the GNU General Public License version 2
  *
  *	Fixes
  *	Felix Koop	:	NR_CPUS used properly
@@ -31,9 +30,13 @@
  *	Tigran Aivazian	:	fixed "0.00 in /proc/uptime on SMP" bug.
  *	Maciej W. Rozycki	:	Bits for genuine 82489DX APICs
  *	Andi Kleen		:	Changed for SMP boot into long mode.
  *	Rusty Russell	:	Hacked into shape for new "hotplug" boot process.
+ *	Andi Kleen	:	Converted to new state machine.
+ *				Various cleanups.
+ *				Probably mostly hotplug CPU ready now.
  */
 
+
 #include <linux/config.h>
 #include <linux/init.h>
 
@@ -54,11 +57,15 @@
 #include <asm/tlbflush.h>
 #include <asm/proto.h>
 
+/* Change for real CPU hotplug. Note other files need to be fixed
+   first too. */
+#define __cpuinit __init
+#define __cpuinitdata __initdata
+
 /* Number of siblings per CPU package */
 int smp_num_siblings = 1;
 /* Package ID of each logical CPU */
 u8 phys_proc_id[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
-/* Core ID of each logical CPU */
 u8 cpu_core_id[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
 EXPORT_SYMBOL(phys_proc_id);
 EXPORT_SYMBOL(cpu_core_id);
@@ -66,13 +73,24 @@ EXPORT_SYMBOL(cpu_core_id);
 /* Bitmask of currently online CPUs */
 cpumask_t cpu_online_map;
 
+EXPORT_SYMBOL(cpu_online_map);
+
+/*
+ * Private maps to synchronize booting between AP and BP.
+ * Probably not needed anymore, but it makes for easier debugging. -AK
+ */
 cpumask_t cpu_callin_map;
 cpumask_t cpu_callout_map;
-static cpumask_t smp_commenced_mask;
+
+cpumask_t cpu_possible_map;
+EXPORT_SYMBOL(cpu_possible_map);
 
 /* Per CPU bogomips and other parameters */
 struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned;
 
+/* Set when the idlers are all forked */
+int smp_threads_ready;
+
 cpumask_t cpu_sibling_map[NR_CPUS] __cacheline_aligned;
 cpumask_t cpu_core_map[NR_CPUS] __cacheline_aligned;
 
@@ -80,8 +98,8 @@ cpumask_t cpu_core_map[NR_CPUS] __cacheline_aligned;
  * Trampoline 80x86 program as an array.
  */
 
-extern unsigned char trampoline_data [];
-extern unsigned char trampoline_end  [];
+extern unsigned char trampoline_data[];
+extern unsigned char trampoline_end[];
 
 /*
  * Currently trivial. Write the real->protected mode
@@ -89,7 +107,7 @@ extern unsigned char trampoline_end [];
  * has made sure it's suitably aligned.
  */
 
-static unsigned long __init setup_trampoline(void)
+static unsigned long __cpuinit setup_trampoline(void)
 {
 	void *tramp = __va(SMP_TRAMPOLINE_BASE);
 	memcpy(tramp, trampoline_data, trampoline_end - trampoline_data);
@@ -101,7 +119,7 @@ static unsigned long __init setup_trampoline(void)
  * a given CPU
  */
 
-static void __init smp_store_cpu_info(int id)
+static void __cpuinit smp_store_cpu_info(int id)
 {
 	struct cpuinfo_x86 *c = cpu_data + id;
 
@@ -110,145 +128,101 @@ static void __init smp_store_cpu_info(int id)
 }
 
 /*
- * TSC synchronization.
+ * Synchronize TSCs of CPUs
  *
- * We first check whether all CPUs have their TSC's synchronized,
- * then we print a warning if not, and always resync.
+ * This new algorithm is less accurate than the old "zero TSCs"
+ * one, but we cannot zero TSCs anymore in the new hotplug CPU
+ * model.
  */
 
-static atomic_t tsc_start_flag = ATOMIC_INIT(0);
-static atomic_t tsc_count_start = ATOMIC_INIT(0);
-static atomic_t tsc_count_stop = ATOMIC_INIT(0);
-static unsigned long long tsc_values[NR_CPUS];
+static atomic_t __cpuinitdata tsc_flag;
+static __cpuinitdata DEFINE_SPINLOCK(tsc_sync_lock);
+static unsigned long long __cpuinitdata bp_tsc, ap_tsc;
 
 #define NR_LOOPS 5
 
-extern unsigned int fast_gettimeoffset_quotient;
-
-static void __init synchronize_tsc_bp (void)
+static void __cpuinit sync_tsc_bp_init(int init)
 {
-	int i;
-	unsigned long long t0;
-	unsigned long long sum, avg;
-	long long delta;
-	long one_usec;
-	int buggy = 0;
+	if (init)
+		_raw_spin_lock(&tsc_sync_lock);
+	else
+		_raw_spin_unlock(&tsc_sync_lock);
+	atomic_set(&tsc_flag, 0);
+}
-
-	printk(KERN_INFO "checking TSC synchronization across %u CPUs: ",num_booting_cpus());
-
-	one_usec = cpu_khz;
-
-	atomic_set(&tsc_start_flag, 1);
-	wmb();
-
-	/*
-	 * We loop a few times to get a primed instruction cache,
-	 * then the last pass is more or less synchronized and
-	 * the BP and APs set their cycle counters to zero all at
-	 * once. This reduces the chance of having random offsets
-	 * between the processors, and guarantees that the maximum
-	 * delay between the cycle counters is never bigger than
-	 * the latency of information-passing (cachelines) between
-	 * two CPUs.
-	 */
-	for (i = 0; i < NR_LOOPS; i++) {
-		/*
-		 * all APs synchronize but they loop on '== num_cpus'
-		 */
-		while (atomic_read(&tsc_count_start) != num_booting_cpus()-1) mb();
-		atomic_set(&tsc_count_stop, 0);
-		wmb();
-		/*
-		 * this lets the APs save their current TSC:
-		 */
-		atomic_inc(&tsc_count_start);
-
-		sync_core();
-		rdtscll(tsc_values[smp_processor_id()]);
-		/*
-		 * We clear the TSC in the last loop:
-		 */
-		if (i == NR_LOOPS-1)
-			write_tsc(0, 0);
-
-		/*
-		 * Wait for all APs to leave the synchronization point:
-		 */
-		while (atomic_read(&tsc_count_stop) != num_booting_cpus()-1) mb();
-		atomic_set(&tsc_count_start, 0);
-		wmb();
-		atomic_inc(&tsc_count_stop);
-	}
-
-	sum = 0;
-	for (i = 0; i < NR_CPUS; i++) {
-		if (cpu_isset(i, cpu_callout_map)) {
-			t0 = tsc_values[i];
-			sum += t0;
-		}
-	}
-	avg = sum / num_booting_cpus();
-
-	sum = 0;
-	for (i = 0; i < NR_CPUS; i++) {
-		if (!cpu_isset(i, cpu_callout_map))
-			continue;
-
-		delta = tsc_values[i] - avg;
-		if (delta < 0)
-			delta = -delta;
-		/*
-		 * We report bigger than 2 microseconds clock differences.
-		 */
-		if (delta > 2*one_usec) {
-			long realdelta;
-			if (!buggy) {
-				buggy = 1;
-				printk("\n");
-			}
-			realdelta = delta / one_usec;
-			if (tsc_values[i] < avg)
-				realdelta = -realdelta;
-
-			printk("BIOS BUG: CPU#%d improperly initialized, has %ld usecs TSC skew! FIXED.\n",
-				i, realdelta);
-		}
 
-		sum += delta;
-	}
-	if (!buggy)
-		printk("passed.\n");
+/*
+ * Synchronize TSC on AP with BP.
+ */
+static void __cpuinit __sync_tsc_ap(void)
+{
+	if (!cpu_has_tsc)
+		return;
+	Dprintk("AP %d syncing TSC\n", smp_processor_id());
+
+	while (atomic_read(&tsc_flag) != 0)
+		cpu_relax();
+	atomic_inc(&tsc_flag);
+	mb();
+	_raw_spin_lock(&tsc_sync_lock);
+	wrmsrl(MSR_IA32_TSC, bp_tsc);
+	_raw_spin_unlock(&tsc_sync_lock);
+	rdtscll(ap_tsc);
+	mb();
+	atomic_inc(&tsc_flag);
+	mb();
 }
 
-static void __init synchronize_tsc_ap (void)
+static void __cpuinit sync_tsc_ap(void)
 {
 	int i;
+	for (i = 0; i < NR_LOOPS; i++)
+		__sync_tsc_ap();
+}
 
-	/*
-	 * Not every cpu is online at the time
-	 * this gets called, so we first wait for the BP to
-	 * finish SMP initialization:
-	 */
-	while (!atomic_read(&tsc_start_flag)) mb();
-
+/*
+ * Synchronize TSC from BP to AP.
+ */
+static void __cpuinit __sync_tsc_bp(int cpu)
+{
+	if (!cpu_has_tsc)
+		return;
-	for (i = 0; i < NR_LOOPS; i++) {
-		atomic_inc(&tsc_count_start);
-		while (atomic_read(&tsc_count_start) != num_booting_cpus()) mb();
 
-		sync_core();
-		rdtscll(tsc_values[smp_processor_id()]);
-		if (i == NR_LOOPS-1)
-			write_tsc(0, 0);
+	/* Wait for AP */
+	while (atomic_read(&tsc_flag) == 0)
+		cpu_relax();
+	/* Save BPs TSC */
+	sync_core();
+	rdtscll(bp_tsc);
+	/* Don't do the sync core here to avoid too much latency. */
+	mb();
+	/* Start the AP */
+	_raw_spin_unlock(&tsc_sync_lock);
+	/* Wait for AP again */
+	while (atomic_read(&tsc_flag) < 2)
+		cpu_relax();
+	rdtscl(bp_tsc);
+	barrier();
+}
 
-		atomic_inc(&tsc_count_stop);
-		while (atomic_read(&tsc_count_stop) != num_booting_cpus()) mb();
+static void __cpuinit sync_tsc_bp(int cpu)
+{
+	int i;
+	for (i = 0; i < NR_LOOPS - 1; i++) {
+		__sync_tsc_bp(cpu);
+		sync_tsc_bp_init(1);
 	}
+	__sync_tsc_bp(cpu);
+	printk(KERN_INFO "Synced TSC of CPU %d difference %Ld\n",
+	       cpu, ap_tsc - bp_tsc);
 }
-#undef NR_LOOPS
 
-static atomic_t init_deasserted;
+static atomic_t init_deasserted __cpuinitdata;
 
-static void __init smp_callin(void)
+/*
+ * Report back to the Boot Processor.
+ * Running on AP.
+ */
+void __cpuinit smp_callin(void)
 {
 	int cpuid, phys_id;
 	unsigned long timeout;
@@ -259,7 +233,8 @@ static void __init smp_callin(void)
 	 * our local APIC. We have to wait for the IPI or we'll
 	 * lock up on an APIC access.
 	 */
-	while (!atomic_read(&init_deasserted));
+	while (!atomic_read(&init_deasserted))
+		cpu_relax();
 
 	/*
 	 * (This works even if the APIC is not enabled.)
@@ -290,7 +265,7 @@ static void __init smp_callin(void)
 		 */
 		if (cpu_isset(cpuid, cpu_callout_map))
 			break;
-		rep_nop();
+		cpu_relax();
 	}
 
 	if (!time_before(jiffies, timeout)) {
@@ -325,20 +300,12 @@ static void __init smp_callin(void)
 	 * Allow the master to continue.
 	 */
 	cpu_set(cpuid, cpu_callin_map);
-
-	/*
-	 * Synchronize the TSC with the BP
-	 */
-	if (cpu_has_tsc)
-		synchronize_tsc_ap();
 }
 
-static int cpucount;
-
 /*
- * Activate a secondary processor.
+ * Setup code on secondary processor (after coming out of the trampoline)
  */
-void __init start_secondary(void)
+void __cpuinit start_secondary(void)
 {
 	/*
 	 * Dont put anything before smp_callin(), SMP
@@ -348,17 +315,18 @@ void __init start_secondary(void)
 	cpu_init();
 	smp_callin();
 
+	/*
+	 * Synchronize the TSC with the BP
+	 */
+	sync_tsc_ap();
+
 	/* otherwise gcc will move up the smp_processor_id before the cpu_init */
 	barrier();
 
-	Dprintk("cpu %d: waiting for commence\n", smp_processor_id());
-	while (!cpu_isset(smp_processor_id(), smp_commenced_mask))
-		rep_nop();
-
 	Dprintk("cpu %d: setting up apic clock\n", smp_processor_id());
 	setup_secondary_APIC_clock();
 
 	Dprintk("cpu %d: enabling apic timer\n", smp_processor_id());
 
 	if (nmi_watchdog == NMI_IO_APIC) {
 		disable_8259A_irq(0);
@@ -367,26 +335,22 @@ void __init start_secondary(void)
 	}
 
 
 	enable_APIC_timer();
 
 	/*
-	 * low-memory mappings have been cleared, flush them from
-	 * the local TLBs too.
+	 * Allow the master to continue.
 	 */
-	local_flush_tlb();
-
-	Dprintk("cpu %d eSetting cpu_online_map\n", smp_processor_id());
 	cpu_set(smp_processor_id(), cpu_online_map);
-	wmb();
+	mb();
 
 	cpu_idle();
 }
 
 extern volatile unsigned long init_rsp;
 extern void (*initial_code)(void);
 
 #if APIC_DEBUG
-static inline void inquire_remote_apic(int apicid)
+static void inquire_remote_apic(int apicid)
 {
 	unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
 	char *names[] = { "ID", "VERSION", "SPIV" };
@@ -423,7 +387,10 @@ static inline void inquire_remote_apic(int apicid)
 }
 #endif
 
-static int __init wakeup_secondary_via_INIT(int phys_apicid, unsigned int start_rip)
+/*
+ * Kick the secondary to wake up.
+ */
+static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int start_rip)
 {
 	unsigned long send_status = 0, accept_status = 0;
 	int maxlvt, timeout, num_starts, j;
@@ -546,33 +513,36 @@ static int __init wakeup_secondary_via_INIT(int phys_apicid, unsigned int start_
 	return (send_status | accept_status);
 }
 
-static void __init do_boot_cpu (int apicid)
+/*
+ * Boot one CPU.
+ */
+static int __cpuinit do_boot_cpu(int cpu, int apicid)
 {
 	struct task_struct *idle;
 	unsigned long boot_error;
-	int timeout, cpu;
+	int timeout;
 	unsigned long start_rip;
-
-	cpu = ++cpucount;
 	/*
 	 * We can't use kernel_thread since we must avoid to
 	 * reschedule the child.
 	 */
 	idle = fork_idle(cpu);
-	if (IS_ERR(idle))
-		panic("failed fork for CPU %d", cpu);
+	if (IS_ERR(idle)) {
+		printk("failed fork for CPU %d\n", cpu);
+		return PTR_ERR(idle);
+	}
 	x86_cpu_to_apicid[cpu] = apicid;
 
 	cpu_pda[cpu].pcurrent = idle;
 
 	start_rip = setup_trampoline();
 
 	init_rsp = idle->thread.rsp;
 	per_cpu(init_tss,cpu).rsp0 = init_rsp;
 	initial_code = start_secondary;
 	clear_ti_thread_flag(idle->thread_info, TIF_FORK);
 
 	printk(KERN_INFO "Booting processor %d/%d rip %lx rsp %lx\n", cpu, apicid,
 	       start_rip, init_rsp);
 
 	/*
@@ -609,7 +579,7 @@ static void __init do_boot_cpu (int apicid)
 	/*
 	 * Starting actual IPI sequence...
 	 */
-	boot_error = wakeup_secondary_via_INIT(apicid, start_rip);
+	boot_error = wakeup_secondary_via_INIT(apicid, start_rip);
 
 	if (!boot_error) {
 		/*
@@ -650,58 +620,131 @@ static void __init do_boot_cpu (int apicid)
 	if (boot_error) {
 		cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */
 		clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */
-		cpucount--;
+		cpu_clear(cpu, cpu_present_map);
+		cpu_clear(cpu, cpu_possible_map);
 		x86_cpu_to_apicid[cpu] = BAD_APICID;
 		x86_cpu_to_log_apicid[cpu] = BAD_APICID;
+		return -EIO;
 	}
+
+	return 0;
 }
 
-static void smp_tune_scheduling (void)
+cycles_t cacheflush_time;
+unsigned long cache_decay_ticks;
+
+/*
+ * Construct cpu_sibling_map[], so that we can tell the sibling CPU
+ * on SMT systems efficiently.
+ */
+static __cpuinit void detect_siblings(void)
 {
-	int cachesize; /* kB */
-	unsigned long bandwidth = 1000; /* MB/s */
-	/*
-	 * Rough estimation for SMP scheduling, this is the number of
-	 * cycles it takes for a fully memory-limited process to flush
-	 * the SMP-local cache.
-	 *
-	 * (For a P5 this pretty much means we will choose another idle
-	 * CPU almost always at wakeup time (this is due to the small
-	 * L1 cache), on PIIs it's around 50-100 usecs, depending on
-	 * the cache size)
-	 */
+	int cpu;
 
-	if (!cpu_khz) {
-		return;
-	} else {
-		cachesize = boot_cpu_data.x86_cache_size;
-		if (cachesize == -1) {
-			cachesize = 16; /* Pentiums, 2x8kB cache */
-			bandwidth = 100;
+	for (cpu = 0; cpu < NR_CPUS; cpu++) {
+		cpus_clear(cpu_sibling_map[cpu]);
+		cpus_clear(cpu_core_map[cpu]);
+	}
+
+	for_each_online_cpu (cpu) {
+		struct cpuinfo_x86 *c = cpu_data + cpu;
+		int siblings = 0;
+		int i;
+		if (smp_num_siblings > 1) {
+			for_each_online_cpu (i) {
+				if (cpu_core_id[cpu] == phys_proc_id[i]) {
+					siblings++;
+					cpu_set(i, cpu_sibling_map[cpu]);
+				}
+			}
+		} else {
+			siblings++;
+			cpu_set(cpu, cpu_sibling_map[cpu]);
+		}
+
+		if (siblings != smp_num_siblings) {
+			printk(KERN_WARNING
+			       "WARNING: %d siblings found for CPU%d, should be %d\n",
+			       siblings, cpu, smp_num_siblings);
+			smp_num_siblings = siblings;
 		}
+		if (c->x86_num_cores > 1) {
+			for_each_online_cpu(i) {
+				if (phys_proc_id[cpu] == phys_proc_id[i])
+					cpu_set(i, cpu_core_map[cpu]);
+			}
+		} else
+			cpu_core_map[cpu] = cpu_sibling_map[cpu];
 	}
 }
 
 /*
- * Cycle through the processors sending APIC IPIs to boot each.
+ * Cleanup possible dangling ends...
  */
-
-static void __init smp_boot_cpus(unsigned int max_cpus)
+static __cpuinit void smp_cleanup_boot(void)
 {
-	unsigned apicid, cpu, bit, kicked;
+	/*
+	 * Paranoid: Set warm reset code and vector here back
+	 * to default values.
+	 */
+	CMOS_WRITE(0, 0xf);
 
-	nmi_watchdog_default();
+	/*
+	 * Reset trampoline flag
+	 */
+	*((volatile int *) phys_to_virt(0x467)) = 0;
 
+#ifndef CONFIG_HOTPLUG_CPU
 	/*
-	 * Setup boot CPU information
+	 * Free pages reserved for SMP bootup.
+	 * When you add hotplug CPU support later remove this
+	 * Note there is more work to be done for later CPU bootup.
 	 */
-	smp_store_cpu_info(0); /* Final full version of the data */
-	printk(KERN_INFO "CPU%d: ", 0);
-	print_cpu_info(&cpu_data[0]);
 
-	current_thread_info()->cpu = 0;
-	smp_tune_scheduling();
+	free_page((unsigned long) __va(PAGE_SIZE));
+	free_page((unsigned long) __va(SMP_TRAMPOLINE_BASE));
+#endif
+}
+
+/*
+ * Fall back to non SMP mode after errors.
+ *
+ * RED-PEN audit/test this more. I bet there is more state messed up here.
+ */
+static __cpuinit void disable_smp(void)
+{
+	cpu_present_map = cpumask_of_cpu(0);
+	cpu_possible_map = cpumask_of_cpu(0);
+	if (smp_found_config)
+		phys_cpu_present_map = physid_mask_of_physid(boot_cpu_id);
+	else
+		phys_cpu_present_map = physid_mask_of_physid(0);
+	cpu_set(0, cpu_sibling_map[0]);
+	cpu_set(0, cpu_core_map[0]);
+}
+
+/*
+ * Handle user cpus=... parameter.
+ */
+static __cpuinit void enforce_max_cpus(unsigned max_cpus)
+{
+	int i, k;
+	k = 0;
+	for (i = 0; i < NR_CPUS; i++) {
+		if (!cpu_possible(i))
+			continue;
+		if (++k > max_cpus) {
+			cpu_clear(i, cpu_possible_map);
+			cpu_clear(i, cpu_present_map);
+		}
+	}
+}
 
+/*
+ * Various sanity checks.
+ */
+static int __cpuinit smp_sanity_check(unsigned max_cpus)
+{
 	if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) {
 		printk("weird, boot CPU (#%d) not listed by the BIOS.\n",
 			hard_smp_processor_id());
@@ -714,15 +757,11 @@ static void __init smp_boot_cpus(unsigned int max_cpus)
 	 */
 	if (!smp_found_config) {
 		printk(KERN_NOTICE "SMP motherboard not detected.\n");
-		io_apic_irqs = 0;
-		cpu_online_map = cpumask_of_cpu(0);
-		cpu_set(0, cpu_sibling_map[0]);
-		cpu_set(0, cpu_core_map[0]);
-		phys_cpu_present_map = physid_mask_of_physid(0);
+		disable_smp();
 		if (APIC_init_uniprocessor())
 			printk(KERN_NOTICE "Local APIC not detected."
 			       " Using dummy APIC emulation.\n");
-		goto smp_done;
+		return -1;
 	}
 
 	/*
@@ -742,213 +781,146 @@ static void __init smp_boot_cpus(unsigned int max_cpus)
 		printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n",
 			boot_cpu_id);
 		printk(KERN_ERR "... forcing use of dummy APIC emulation. (tell your hw vendor)\n");
-		io_apic_irqs = 0;
-		cpu_online_map = cpumask_of_cpu(0);
-		cpu_set(0, cpu_sibling_map[0]);
-		cpu_set(0, cpu_core_map[0]);
-		phys_cpu_present_map = physid_mask_of_physid(0);
-		disable_apic = 1;
-		goto smp_done;
+		nr_ioapics = 0;
+		return -1;
 	}
 
-	verify_local_APIC();
-
 	/*
 	 * If SMP should be disabled, then really disable it!
 	 */
 	if (!max_cpus) {
-		smp_found_config = 0;
 		printk(KERN_INFO "SMP mode deactivated, forcing use of dummy APIC emulation.\n");
-		io_apic_irqs = 0;
-		cpu_online_map = cpumask_of_cpu(0);
-		cpu_set(0, cpu_sibling_map[0]);
-		cpu_set(0, cpu_core_map[0]);
-		phys_cpu_present_map = physid_mask_of_physid(0);
-		disable_apic = 1;
-		goto smp_done;
+		nr_ioapics = 0;
+		return -1;
 	}
 
-	connect_bsp_APIC();
-	setup_local_APIC();
-
-	if (GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_id)
-		BUG();
-
-	x86_cpu_to_apicid[0] = boot_cpu_id;
-
-	/*
-	 * Now scan the CPU present map and fire up the other CPUs.
-	 */
-	Dprintk("CPU present map: %lx\n", physids_coerce(phys_cpu_present_map));
+	return 0;
+}
 
-	kicked = 1;
-	for (bit = 0; kicked < NR_CPUS && bit < MAX_APICS; bit++) {
-		apicid = cpu_present_to_apicid(bit);
-		/*
-		 * Don't even attempt to start the boot CPU!
-		 */
-		if (apicid == boot_cpu_id || (apicid == BAD_APICID))
-			continue;
+/*
+ * Prepare for SMP bootup. The MP table or ACPI has been read
+ * earlier. Just do some sanity checking here and enable APIC mode.
+ */
+void __cpuinit smp_prepare_cpus(unsigned int max_cpus)
+{
+	int i;
 
-		if (!physid_isset(apicid, phys_cpu_present_map))
-			continue;
-		if ((max_cpus >= 0) && (max_cpus <= cpucount+1))
-			continue;
+	nmi_watchdog_default();
+	current_cpu_data = boot_cpu_data;
+	current_thread_info()->cpu = 0;  /* needed? */
 
-		do_boot_cpu(apicid);
-		++kicked;
-	}
+	enforce_max_cpus(max_cpus);
 
 	/*
-	 * Cleanup possible dangling ends...
+	 * Fill in cpu_present_mask
 	 */
-	{
-		/*
-		 * Install writable page 0 entry to set BIOS data area.
-		 */
-		local_flush_tlb();
-
-		/*
-		 * Paranoid: Set warm reset code and vector here back
-		 * to default values.
-		 */
-		CMOS_WRITE(0, 0xf);
-
-		*((volatile int *) phys_to_virt(0x467)) = 0;
+	for (i = 0; i < NR_CPUS; i++) {
+		int apicid = cpu_present_to_apicid(i);
+		if (physid_isset(apicid, phys_cpu_present_map)) {
+			cpu_set(i, cpu_present_map);
+			/* possible map would be different if we supported real
+			   CPU hotplug. */
+			cpu_set(i, cpu_possible_map);
+		}
 	}
 
-	/*
-	 * Allow the user to impress friends.
-	 */
-
-	Dprintk("Before bogomips.\n");
-	if (!cpucount) {
-		printk(KERN_INFO "Only one processor found.\n");
-	} else {
-		unsigned long bogosum = 0;
-		for (cpu = 0; cpu < NR_CPUS; cpu++)
-			if (cpu_isset(cpu, cpu_callout_map))
-				bogosum += cpu_data[cpu].loops_per_jiffy;
-		printk(KERN_INFO "Total of %d processors activated (%lu.%02lu BogoMIPS).\n",
-			cpucount+1,
-			bogosum/(500000/HZ),
-			(bogosum/(5000/HZ))%100);
-		Dprintk("Before bogocount - setting activated=1.\n");
+	if (smp_sanity_check(max_cpus) < 0) {
+		printk(KERN_INFO "SMP disabled\n");
+		disable_smp();
+		return;
 	}
 
+
 	/*
-	 * Construct cpu_sibling_map[], so that we can tell the
-	 * sibling CPU efficiently.
+	 * Switch from PIC to APIC mode.
 	 */
-	for (cpu = 0; cpu < NR_CPUS; cpu++) {
-		cpus_clear(cpu_sibling_map[cpu]);
-		cpus_clear(cpu_core_map[cpu]);
-	}
-
-	for (cpu = 0; cpu < NR_CPUS; cpu++) {
-		struct cpuinfo_x86 *c = cpu_data + cpu;
-		int siblings = 0;
-		int i;
-		if (!cpu_isset(cpu, cpu_callout_map))
-			continue;
-
-		if (smp_num_siblings > 1) {
-			for (i = 0; i < NR_CPUS; i++) {
-				if (!cpu_isset(i, cpu_callout_map))
-					continue;
-				if (phys_proc_id[cpu] == cpu_core_id[i]) {
-					siblings++;
-					cpu_set(i, cpu_sibling_map[cpu]);
-				}
-			}
-		} else {
-			siblings++;
-			cpu_set(cpu, cpu_sibling_map[cpu]);
-		}
+	connect_bsp_APIC();
+	setup_local_APIC();
 
-		if (siblings != smp_num_siblings) {
-			printk(KERN_WARNING
-			"WARNING: %d siblings found for CPU%d, should be %d\n",
-			siblings, cpu, smp_num_siblings);
-			smp_num_siblings = siblings;
-		}
-		if (c->x86_num_cores > 1) {
-			for (i = 0; i < NR_CPUS; i++) {
-				if (!cpu_isset(i, cpu_callout_map))
-					continue;
-				if (phys_proc_id[cpu] == phys_proc_id[i]) {
-					cpu_set(i, cpu_core_map[cpu]);
-				}
-			}
-		} else
-			cpu_core_map[cpu] = cpu_sibling_map[cpu];
+	if (GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_id) {
+		panic("Boot APIC ID in local APIC unexpected (%d vs %d)",
+		      GET_APIC_ID(apic_read(APIC_ID)), boot_cpu_id);
+		/* Or can we switch back to PIC here? */
 	}
-
-	Dprintk("Boot done.\n");
+	x86_cpu_to_apicid[0] = boot_cpu_id;
 
 	/*
-	 * Here we can be sure that there is an IO-APIC in the system. Let's
-	 * go and set it up:
+	 * Now start the IO-APICs
 	 */
 	if (!skip_ioapic_setup && nr_ioapics)
 		setup_IO_APIC();
 	else
 		nr_ioapics = 0;
 
-	setup_boot_APIC_clock();
-
 	/*
-	 * Synchronize the TSC with the AP
+	 * Set up local APIC timer on boot CPU.
 	 */
-	if (cpu_has_tsc && cpucount)
-		synchronize_tsc_bp();
 
- smp_done:
-	time_init_smp();
+	setup_boot_APIC_clock();
 }
 
-/* These are wrappers to interface to the new boot process. Someone
-   who understands all this stuff should rewrite it properly. --RR 15/Jul/02 */
-void __init smp_prepare_cpus(unsigned int max_cpus)
+/*
+ * Early setup to make printk work.
+ */
+void __init smp_prepare_boot_cpu(void)
 {
-	smp_boot_cpus(max_cpus);
+	int me = smp_processor_id();
+	cpu_set(me, cpu_online_map);
+	cpu_set(me, cpu_callout_map);
 }
 
-void __devinit smp_prepare_boot_cpu(void)
+/*
+ * Entry point to boot a CPU.
+ *
+ * This is all __cpuinit, not __devinit for now because we don't support
+ * CPU hotplug (yet).
+ */
+int __cpuinit __cpu_up(unsigned int cpu)
 {
-	cpu_set(smp_processor_id(), cpu_online_map);
-	cpu_set(smp_processor_id(), cpu_callout_map);
-}
+	int err;
+	int apicid = cpu_present_to_apicid(cpu);
 
-int __devinit __cpu_up(unsigned int cpu)
-{
-	/* This only works at boot for x86. See "rewrite" above. */
-	if (cpu_isset(cpu, smp_commenced_mask)) {
-		local_irq_enable();
-		return -ENOSYS;
-	}
+	WARN_ON(irqs_disabled());
 
-	/* In case one didn't come up */
-	if (!cpu_isset(cpu, cpu_callin_map)) {
-		local_irq_enable();
-		return -EIO;
+	Dprintk("++++++++++++++++++++=_---CPU UP %u\n", cpu);
+
+	if (apicid == BAD_APICID || apicid == boot_cpu_id ||
+	    !physid_isset(apicid, phys_cpu_present_map)) {
+		printk("__cpu_up: bad cpu %d\n", cpu);
+		return -EINVAL;
+	}
+	sync_tsc_bp_init(1);
+
+	/* Boot it! */
+	err = do_boot_cpu(cpu, apicid);
+	if (err < 0) {
+		sync_tsc_bp_init(0);
+		Dprintk("do_boot_cpu failed %d\n", err);
+		return err;
 	}
-	local_irq_enable();
+
+	sync_tsc_bp(cpu);
 
 	/* Unleash the CPU! */
 	Dprintk("waiting for cpu %d\n", cpu);
 
-	cpu_set(cpu, smp_commenced_mask);
 	while (!cpu_isset(cpu, cpu_online_map))
-		mb();
+		cpu_relax();
 	return 0;
 }
 
-void __init smp_cpus_done(unsigned int max_cpus)
+/*
+ * Finish the SMP boot.
+ */
+void __cpuinit smp_cpus_done(unsigned int max_cpus)
 {
+	zap_low_mappings();
+	smp_cleanup_boot();
+
 #ifdef CONFIG_X86_IO_APIC
 	setup_ioapic_dest();
 #endif
-	zap_low_mappings();
-}
 
+	detect_siblings();
+	time_init_gtod();
+}
diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c
index c7a1b50b4af3..943a9bfba209 100644
--- a/arch/x86_64/kernel/time.c
+++ b/arch/x86_64/kernel/time.c
@@ -916,9 +916,16 @@ void __init time_init(void)
 	setup_irq(0, &irq0);
 
 	set_cyc2ns_scale(cpu_khz / 1000);
+
+#ifndef CONFIG_SMP
+	time_init_gtod();
+#endif
 }
 
-void __init time_init_smp(void)
+/*
+ * Decide after all CPUs are booted what mode gettimeofday should use.
+ */
+void __init time_init_gtod(void)
 {
 	char *timetype;
 