Diffstat (limited to 'arch/x86/kernel/smpboot.c')
-rw-r--r--	arch/x86/kernel/smpboot.c	176
1 file changed, 88 insertions(+), 88 deletions(-)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 42a2dca984b3..2d5200e56357 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -102,6 +102,8 @@ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
 DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info);
 EXPORT_PER_CPU_SYMBOL(cpu_info);
 
+static DEFINE_PER_CPU(struct completion, die_complete);
+
 atomic_t init_deasserted;
 
 /*
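
Note: die_complete is a statically allocated per-CPU completion, one instance per possible CPU, reachable through the percpu accessors. A minimal sketch of the declare/access pattern, assuming a kernel build context (the helper below is hypothetical; the percpu and completion APIs are real):

    #include <linux/percpu.h>
    #include <linux/completion.h>

    static DEFINE_PER_CPU(struct completion, die_complete);

    /* Hypothetical helper: (re)arm the instance belonging to 'cpu'. */
    static void arm_die_complete(unsigned int cpu)
    {
            struct completion *done = &per_cpu(die_complete, cpu);

            /* Must be re-initialized before every wait/complete cycle. */
            init_completion(done);
    }
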
@@ -111,7 +113,6 @@ atomic_t init_deasserted;
 static void smp_callin(void)
 {
         int cpuid, phys_id;
-        unsigned long timeout;
 
         /*
          * If waken up by an INIT in an 82489DX configuration
@@ -130,37 +131,6 @@ static void smp_callin(void)
          * (This works even if the APIC is not enabled.)
          */
         phys_id = read_apic_id();
-        if (cpumask_test_cpu(cpuid, cpu_callin_mask)) {
-                panic("%s: phys CPU#%d, CPU#%d already present??\n", __func__,
-                                        phys_id, cpuid);
-        }
-        pr_debug("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id);
-
-        /*
-         * STARTUP IPIs are fragile beasts as they might sometimes
-         * trigger some glue motherboard logic. Complete APIC bus
-         * silence for 1 second, this overestimates the time the
-         * boot CPU is spending to send the up to 2 STARTUP IPIs
-         * by a factor of two. This should be enough.
-         */
-
-        /*
-         * Waiting 2s total for startup (udelay is not yet working)
-         */
-        timeout = jiffies + 2*HZ;
-        while (time_before(jiffies, timeout)) {
-                /*
-                 * Has the boot CPU finished it's STARTUP sequence?
-                 */
-                if (cpumask_test_cpu(cpuid, cpu_callout_mask))
-                        break;
-                cpu_relax();
-        }
-
-        if (!time_before(jiffies, timeout)) {
-                panic("%s: CPU%d started up but did not get a callout!\n",
-                        __func__, cpuid);
-        }
 
         /*
          * the boot CPU has finished the init stage and is spinning
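
Note: the deleted block is the classic jiffies-based spin: compute a deadline, poll until time_before(jiffies, deadline) fails, and cpu_relax() between probes (the AP cannot udelay() yet since delay calibration has not run). The same idiom as a self-contained sketch, assuming kernel context; the helper name is hypothetical:

    #include <linux/jiffies.h>

    /* Hypothetical: spin until cond() holds or 'secs' seconds elapse. */
    static bool spin_wait_secs(bool (*cond)(void), unsigned int secs)
    {
            unsigned long deadline = jiffies + secs * HZ;

            while (time_before(jiffies, deadline)) {
                    if (cond())
                            return true;
                    cpu_relax();    /* PAUSE on x86; eases the SMT sibling */
            }
            return cond();          /* final probe avoids a spurious timeout */
    }

The equivalent wait now lives on the boot-CPU side in do_boot_cpu() (see the 10s loop further down), so the AP no longer has to enforce a timeout of its own.
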
@@ -296,11 +266,19 @@ void smp_store_cpu_info(int id)
 }
 
 static bool
+topology_same_node(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
+{
+        int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
+
+        return (cpu_to_node(cpu1) == cpu_to_node(cpu2));
+}
+
+static bool
 topology_sane(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o, const char *name)
 {
         int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
 
-        return !WARN_ONCE(cpu_to_node(cpu1) != cpu_to_node(cpu2),
+        return !WARN_ONCE(!topology_same_node(c, o),
                 "sched: CPU #%d's %s-sibling CPU #%d is not on the same node! "
                 "[node: %d != %d]. Ignoring dependency.\n",
                 cpu1, name, cpu2, cpu_to_node(cpu1), cpu_to_node(cpu2));
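
Note: topology_sane() relies on WARN_ONCE(cond, ...) evaluating to cond while printing its message and a backtrace only the first time cond is true, so !WARN_ONCE(!topology_same_node(c, o), ...) is true exactly when the two CPUs share a node and warns once when they do not. The semantics in isolation (hypothetical check):

    #include <linux/bug.h>

    static bool nodes_match(int node1, int node2)
    {
            /* Evaluates to the condition; warns (with stack trace) once. */
            return !WARN_ONCE(node1 != node2,
                              "node mismatch: %d != %d\n", node1, node2);
    }
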
@@ -341,17 +319,44 @@ static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
         return false;
 }
 
-static bool match_mc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
+/*
+ * Unlike the other levels, we do not enforce keeping a
+ * multicore group inside a NUMA node.  If this happens, we will
+ * discard the MC level of the topology later.
+ */
+static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
 {
-        if (c->phys_proc_id == o->phys_proc_id) {
-                if (cpu_has(c, X86_FEATURE_AMD_DCM))
-                        return true;
-
-                return topology_sane(c, o, "mc");
-        }
+        if (c->phys_proc_id == o->phys_proc_id)
+                return true;
         return false;
 }
 
+static struct sched_domain_topology_level numa_inside_package_topology[] = {
+#ifdef CONFIG_SCHED_SMT
+        { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
+#endif
+#ifdef CONFIG_SCHED_MC
+        { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
+#endif
+        { NULL, },
+};
+/*
+ * set_sched_topology() sets the topology internal to a CPU.  The
+ * NUMA topologies are layered on top of it to build the full
+ * system topology.
+ *
+ * If NUMA nodes are observed to occur within a CPU package, this
+ * function should be called.  It forces the sched domain code to
+ * only use the SMT level for the CPU portion of the topology.
+ * This essentially falls back to relying on NUMA information
+ * from the SRAT table to describe the entire system topology
+ * (except for hyperthreads).
+ */
+static void primarily_use_numa_for_topology(void)
+{
+        set_sched_topology(numa_inside_package_topology);
+}
+
 void set_cpu_sibling_map(int cpu)
 {
         bool has_smt = smp_num_siblings > 1;
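
Note: numa_inside_package_topology[] is the stock SMT/MC layering minus the package-wide level, so nothing above MC is described on the CPU side and the NUMA/SRAT levels take over. For contrast, the default table in kernel/sched/core.c of this era looked roughly like the sketch below (quoted from memory, so treat it as a sketch; the DIE entry is the one being dropped):

    static struct sched_domain_topology_level default_topology[] = {
    #ifdef CONFIG_SCHED_SMT
            { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
    #endif
    #ifdef CONFIG_SCHED_MC
            { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
    #endif
            { cpu_cpu_mask, SD_INIT_NAME(DIE) },    /* whole package */
            { NULL, },
    };
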
@@ -388,7 +393,7 @@ void set_cpu_sibling_map(int cpu)
         for_each_cpu(i, cpu_sibling_setup_mask) {
                 o = &cpu_data(i);
 
-                if ((i == cpu) || (has_mp && match_mc(c, o))) {
+                if ((i == cpu) || (has_mp && match_die(c, o))) {
                         link_mask(core, cpu, i);
 
                         /*
@@ -410,6 +415,8 @@ void set_cpu_sibling_map(int cpu)
                 } else if (i != cpu && !c->booted_cores)
                         c->booted_cores = cpu_data(i).booted_cores;
         }
+        if (match_die(c, o) && !topology_same_node(c, o))
+                primarily_use_numa_for_topology();
 }
 
 
@@ -753,8 +760,8 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
         unsigned long start_ip = real_mode_header->trampoline_start;
 
         unsigned long boot_error = 0;
-        int timeout;
         int cpu0_nmi_registered = 0;
+        unsigned long timeout;
 
         /* Just in case we booted with a single CPU. */
         alternatives_enable_smp();
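
Note: timeout becomes unsigned long because it now holds a jiffies deadline. jiffies is unsigned long, and time_before()/time_after() enforce that with typecheck(), so an int operand fails to build; wraparound is handled by a signed subtraction. Roughly:

    /* Simplified from <linux/jiffies.h>: true while 'a' is before 'b',
     * correct even across a jiffies wrap. */
    #define my_time_before(a, b)    ((long)((a) - (b)) < 0)

    static bool deadline_passed(unsigned long deadline)
    {
            return !time_before(jiffies, deadline);
    }
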
@@ -802,6 +809,15 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
         }
 
         /*
+         * AP might wait on cpu_callout_mask in cpu_init() with
+         * cpu_initialized_mask set if previous attempt to online
+         * it timed-out. Clear cpu_initialized_mask so that after
+         * INIT/SIPI it could start with a clean state.
+         */
+        cpumask_clear_cpu(cpu, cpu_initialized_mask);
+        smp_mb();
+
+        /*
          * Wake up a CPU in difference cases:
          * - Use the method in the APIC driver if it's defined
          * Otherwise,
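
Note: the smp_mb() orders the cpumask_clear_cpu() store ahead of whatever the INIT/SIPI wakeup path does next, so a re-onlined AP cannot observe its stale cpu_initialized_mask bit. The general publish-then-kick shape, as a sketch (the helper name is hypothetical; the mask APIs are real):

    static void restart_ap(int cpu, struct cpumask *initialized_mask)
    {
            cpumask_clear_cpu(cpu, initialized_mask); /* publish clean state */
            smp_mb();       /* clear visible before the wakeup IPI goes out */
            /* ...issue INIT/SIPI to the AP... */
    }
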
@@ -815,53 +831,38 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
 
         if (!boot_error) {
                 /*
-                 * allow APs to start initializing.
+                 * Wait 10s total for a response from AP
                  */
-                pr_debug("Before Callout %d\n", cpu);
-                cpumask_set_cpu(cpu, cpu_callout_mask);
-                pr_debug("After Callout %d\n", cpu);
+                boot_error = -1;
+                timeout = jiffies + 10*HZ;
+                while (time_before(jiffies, timeout)) {
+                        if (cpumask_test_cpu(cpu, cpu_initialized_mask)) {
+                                /*
+                                 * Tell AP to proceed with initialization
+                                 */
+                                cpumask_set_cpu(cpu, cpu_callout_mask);
+                                boot_error = 0;
+                                break;
+                        }
+                        udelay(100);
+                        schedule();
+                }
+        }
 
+        if (!boot_error) {
                 /*
-                 * Wait 5s total for a response
+                 * Wait till AP completes initial initialization
                  */
-                for (timeout = 0; timeout < 50000; timeout++) {
-                        if (cpumask_test_cpu(cpu, cpu_callin_mask))
-                                break;  /* It has booted */
-                        udelay(100);
+                while (!cpumask_test_cpu(cpu, cpu_callin_mask)) {
                         /*
                          * Allow other tasks to run while we wait for the
                          * AP to come online. This also gives a chance
                          * for the MTRR work(triggered by the AP coming online)
                          * to be completed in the stop machine context.
                          */
+                        udelay(100);
                         schedule();
                 }
-
-                if (cpumask_test_cpu(cpu, cpu_callin_mask)) {
-                        print_cpu_msr(&cpu_data(cpu));
-                        pr_debug("CPU%d: has booted.\n", cpu);
-                } else {
-                        boot_error = 1;
-                        if (*trampoline_status == 0xA5A5A5A5)
-                                /* trampoline started but...? */
-                                pr_err("CPU%d: Stuck ??\n", cpu);
-                        else
-                                /* trampoline code not run */
-                                pr_err("CPU%d: Not responding\n", cpu);
-                        if (apic->inquire_remote_apic)
-                                apic->inquire_remote_apic(apicid);
-                }
-        }
-
-        if (boot_error) {
-                /* Try to put things back the way they were before ... */
-                numa_remove_cpu(cpu); /* was set by numa_add_cpu */
-
-                /* was set by do_boot_cpu() */
-                cpumask_clear_cpu(cpu, cpu_callout_mask);
-
-                /* was set by cpu_init() */
-                cpumask_clear_cpu(cpu, cpu_initialized_mask);
-        }
         }
 
         /* mark "stuck" area as not stuck */
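
Note: the restructured loop is the boot-CPU half of a two-phase handshake: wait up to 10s for the AP to flag cpu_initialized_mask, only then grant cpu_callout_mask, then wait (now without an artificial deadline) for cpu_callin_mask. The AP half, per the comment in the previous hunk, sits in cpu_init() and smp_callin(); paraphrased from memory of the same series, not verbatim, it looks roughly like:

    /* AP side, early in cpu_init(): announce arrival, await permission. */
    static void wait_for_master_cpu(int cpu)
    {
            /* "I reached cpu_init()" - phase 1 of the handshake. */
            WARN_ON(cpumask_test_and_set_cpu(cpu, cpu_initialized_mask));

            /* Spin until the boot CPU grants the callout. */
            while (!cpumask_test_cpu(cpu, cpu_callout_mask))
                    cpu_relax();
    }

    /* Later, at the end of smp_callin(): "I am fully up" - phase 2:
     *         cpumask_set_cpu(cpuid, cpu_callin_mask);               */
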
@@ -1326,26 +1327,24 @@ int native_cpu_disable(void)
                 return ret;
 
         clear_local_APIC();
-
+        init_completion(&per_cpu(die_complete, smp_processor_id()));
         cpu_disable_common();
+
         return 0;
 }
 
 void native_cpu_die(unsigned int cpu)
 {
         /* We don't do anything here: idle task is faking death itself. */
-        unsigned int i;
+        wait_for_completion_timeout(&per_cpu(die_complete, cpu), HZ);
 
-        for (i = 0; i < 10; i++) {
-                /* They ack this in play_dead by setting CPU_DEAD */
-                if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
-                        if (system_state == SYSTEM_RUNNING)
-                                pr_info("CPU %u is now offline\n", cpu);
-                        return;
-                }
-                msleep(100);
+        /* They ack this in play_dead() by setting CPU_DEAD */
+        if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
+                if (system_state == SYSTEM_RUNNING)
+                        pr_info("CPU %u is now offline\n", cpu);
+        } else {
+                pr_err("CPU %u didn't die...\n", cpu);
         }
-        pr_err("CPU %u didn't die...\n", cpu);
 }
 
 void play_dead_common(void)
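
Note: wait_for_completion_timeout() sleeps until complete() is called or the timeout elapses; it returns 0 on timeout and the remaining jiffies otherwise, so ten msleep(100) polls collapse into one bounded one-second wait. The contract, sketched (hypothetical caller, real APIs):

    #include <linux/completion.h>

    static void waiter(struct completion *done)
    {
            unsigned long left;

            init_completion(done);          /* arm before kicking the peer */
            /* ...ask the other CPU to die... */

            left = wait_for_completion_timeout(done, HZ);   /* <= 1 second */
            if (!left)
                    pr_err("peer did not signal in time\n");  /* 0 == timeout */
    }

native_cpu_die() still re-checks cpu_state afterwards, so a late or missing complete() degrades to the old "didn't die" error rather than a hang.
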
@@ -1357,6 +1356,7 @@ void play_dead_common(void)
         mb();
         /* Ack it */
         __this_cpu_write(cpu_state, CPU_DEAD);
+        complete(&per_cpu(die_complete, smp_processor_id()));
 
         /*
          * With physical CPU hotplug, we should halt the cpu
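
Note: the dying CPU publishes CPU_DEAD and only then signals; complete() takes the completion's spinlock (which implies the needed ordering) and never sleeps, so it is safe this late in play_dead_common(). The pairing with this patch's waiter, in outline:

    /* Dying CPU (play_dead_common(), above): */
    __this_cpu_write(cpu_state, CPU_DEAD);           /* publish state first */
    complete(&per_cpu(die_complete, smp_processor_id()));

    /* Surviving CPU (native_cpu_die(), earlier in this patch): */
    wait_for_completion_timeout(&per_cpu(die_complete, cpu), HZ);
    if (per_cpu(cpu_state, cpu) == CPU_DEAD)
            pr_info("CPU %u is now offline\n", cpu);
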