aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorIgor Mammedov <imammedo@redhat.com>2014-06-05 09:42:45 -0400
committerIngo Molnar <mingo@kernel.org>2014-06-05 10:33:08 -0400
commit3e1a878b7ccdb31da6d9d2b855c72ad87afeba3f (patch)
treed7d5de640f5d133af34f437b56c547005b437e4b
parentfeef1e8ecbadf24f8e6829c935df8683cabae41b (diff)
x86/smpboot: Initialize secondary CPU only if master CPU will wait for it
Hang is observed on virtual machines during CPU hotplug, especially in big guests with many CPUs. (It is reproducible more often if the host is over-committed.) It happens because the master CPU gives up waiting on the secondary CPU and allows it to run wild. As a result, the AP causes the system to lock up or crash. For example, as described here: https://lkml.org/lkml/2014/3/6/257 If the master CPU has sent the STARTUP IPI successfully, and the AP has signalled to the master CPU that it's ready to start initialization, make the master CPU wait indefinitely till the AP is onlined. To ensure that the AP won't ever run wild, make it wait at early startup till the master CPU confirms its intention to wait for the AP. If the AP doesn't respond in 10 seconds, the master CPU will time out and cancel AP onlining. Signed-off-by: Igor Mammedov <imammedo@redhat.com> Acked-by: Toshi Kani <toshi.kani@hp.com> Cc: Thomas Gleixner <tglx@linutronix.de> Link: http://lkml.kernel.org/r/1401975765-22328-4-git-send-email-imammedo@redhat.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
-rw-r--r--arch/x86/kernel/cpu/common.c27
-rw-r--r--arch/x86/kernel/smpboot.c99
2 files changed, 47 insertions, 79 deletions
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index a135239badb7..a4bcbacdbe0b 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1221,6 +1221,17 @@ static void dbg_restore_debug_regs(void)
1221#define dbg_restore_debug_regs() 1221#define dbg_restore_debug_regs()
1222#endif /* ! CONFIG_KGDB */ 1222#endif /* ! CONFIG_KGDB */
1223 1223
1224static void wait_for_master_cpu(int cpu)
1225{
1226 /*
1227 * wait for ACK from master CPU before continuing
1228 * with AP initialization
1229 */
1230 WARN_ON(cpumask_test_and_set_cpu(cpu, cpu_initialized_mask));
1231 while (!cpumask_test_cpu(cpu, cpu_callout_mask))
1232 cpu_relax();
1233}
1234
1224/* 1235/*
1225 * cpu_init() initializes state that is per-CPU. Some data is already 1236 * cpu_init() initializes state that is per-CPU. Some data is already
1226 * initialized (naturally) in the bootstrap process, such as the GDT 1237 * initialized (naturally) in the bootstrap process, such as the GDT
@@ -1236,16 +1247,17 @@ void cpu_init(void)
1236 struct task_struct *me; 1247 struct task_struct *me;
1237 struct tss_struct *t; 1248 struct tss_struct *t;
1238 unsigned long v; 1249 unsigned long v;
1239 int cpu; 1250 int cpu = stack_smp_processor_id();
1240 int i; 1251 int i;
1241 1252
1253 wait_for_master_cpu(cpu);
1254
1242 /* 1255 /*
1243 * Load microcode on this cpu if a valid microcode is available. 1256 * Load microcode on this cpu if a valid microcode is available.
1244 * This is early microcode loading procedure. 1257 * This is early microcode loading procedure.
1245 */ 1258 */
1246 load_ucode_ap(); 1259 load_ucode_ap();
1247 1260
1248 cpu = stack_smp_processor_id();
1249 t = &per_cpu(init_tss, cpu); 1261 t = &per_cpu(init_tss, cpu);
1250 oist = &per_cpu(orig_ist, cpu); 1262 oist = &per_cpu(orig_ist, cpu);
1251 1263
@@ -1257,9 +1269,6 @@ void cpu_init(void)
1257 1269
1258 me = current; 1270 me = current;
1259 1271
1260 if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask))
1261 panic("CPU#%d already initialized!\n", cpu);
1262
1263 pr_debug("Initializing CPU#%d\n", cpu); 1272 pr_debug("Initializing CPU#%d\n", cpu);
1264 1273
1265 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); 1274 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
@@ -1336,13 +1345,9 @@ void cpu_init(void)
1336 struct tss_struct *t = &per_cpu(init_tss, cpu); 1345 struct tss_struct *t = &per_cpu(init_tss, cpu);
1337 struct thread_struct *thread = &curr->thread; 1346 struct thread_struct *thread = &curr->thread;
1338 1347
1339 show_ucode_info_early(); 1348 wait_for_master_cpu(cpu);
1340 1349
1341 if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask)) { 1350 show_ucode_info_early();
1342 printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
1343 for (;;)
1344 local_irq_enable();
1345 }
1346 1351
1347 printk(KERN_INFO "Initializing CPU#%d\n", cpu); 1352 printk(KERN_INFO "Initializing CPU#%d\n", cpu);
1348 1353
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index ae2fd975b782..bc52fac39dd3 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -111,7 +111,6 @@ atomic_t init_deasserted;
111static void smp_callin(void) 111static void smp_callin(void)
112{ 112{
113 int cpuid, phys_id; 113 int cpuid, phys_id;
114 unsigned long timeout;
115 114
116 /* 115 /*
117 * If waken up by an INIT in an 82489DX configuration 116 * If waken up by an INIT in an 82489DX configuration
@@ -130,37 +129,6 @@ static void smp_callin(void)
130 * (This works even if the APIC is not enabled.) 129 * (This works even if the APIC is not enabled.)
131 */ 130 */
132 phys_id = read_apic_id(); 131 phys_id = read_apic_id();
133 if (cpumask_test_cpu(cpuid, cpu_callin_mask)) {
134 panic("%s: phys CPU#%d, CPU#%d already present??\n", __func__,
135 phys_id, cpuid);
136 }
137 pr_debug("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id);
138
139 /*
140 * STARTUP IPIs are fragile beasts as they might sometimes
141 * trigger some glue motherboard logic. Complete APIC bus
142 * silence for 1 second, this overestimates the time the
143 * boot CPU is spending to send the up to 2 STARTUP IPIs
144 * by a factor of two. This should be enough.
145 */
146
147 /*
148 * Waiting 2s total for startup (udelay is not yet working)
149 */
150 timeout = jiffies + 2*HZ;
151 while (time_before(jiffies, timeout)) {
152 /*
153 * Has the boot CPU finished it's STARTUP sequence?
154 */
155 if (cpumask_test_cpu(cpuid, cpu_callout_mask))
156 break;
157 cpu_relax();
158 }
159
160 if (!time_before(jiffies, timeout)) {
161 panic("%s: CPU%d started up but did not get a callout!\n",
162 __func__, cpuid);
163 }
164 132
165 /* 133 /*
166 * the boot CPU has finished the init stage and is spinning 134 * the boot CPU has finished the init stage and is spinning
@@ -750,8 +718,8 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
750 unsigned long start_ip = real_mode_header->trampoline_start; 718 unsigned long start_ip = real_mode_header->trampoline_start;
751 719
752 unsigned long boot_error = 0; 720 unsigned long boot_error = 0;
753 int timeout;
754 int cpu0_nmi_registered = 0; 721 int cpu0_nmi_registered = 0;
722 unsigned long timeout;
755 723
756 /* Just in case we booted with a single CPU. */ 724 /* Just in case we booted with a single CPU. */
757 alternatives_enable_smp(); 725 alternatives_enable_smp();
@@ -799,6 +767,15 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
799 } 767 }
800 768
801 /* 769 /*
770 * AP might wait on cpu_callout_mask in cpu_init() with
771 * cpu_initialized_mask set if previous attempt to online
772 * it timed-out. Clear cpu_initialized_mask so that after
773 * INIT/SIPI it could start with a clean state.
774 */
775 cpumask_clear_cpu(cpu, cpu_initialized_mask);
776 smp_mb();
777
778 /*
802 * Wake up a CPU in difference cases: 779 * Wake up a CPU in difference cases:
803 * - Use the method in the APIC driver if it's defined 780 * - Use the method in the APIC driver if it's defined
804 * Otherwise, 781 * Otherwise,
@@ -810,55 +787,41 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
810 boot_error = wakeup_cpu_via_init_nmi(cpu, start_ip, apicid, 787 boot_error = wakeup_cpu_via_init_nmi(cpu, start_ip, apicid,
811 &cpu0_nmi_registered); 788 &cpu0_nmi_registered);
812 789
790
813 if (!boot_error) { 791 if (!boot_error) {
814 /* 792 /*
815 * allow APs to start initializing. 793 * Wait 10s total for a response from AP
816 */ 794 */
817 pr_debug("Before Callout %d\n", cpu); 795 boot_error = -1;
818 cpumask_set_cpu(cpu, cpu_callout_mask); 796 timeout = jiffies + 10*HZ;
819 pr_debug("After Callout %d\n", cpu); 797 while (time_before(jiffies, timeout)) {
798 if (cpumask_test_cpu(cpu, cpu_initialized_mask)) {
799 /*
800 * Tell AP to proceed with initialization
801 */
802 cpumask_set_cpu(cpu, cpu_callout_mask);
803 boot_error = 0;
804 break;
805 }
806 udelay(100);
807 schedule();
808 }
809 }
820 810
811 if (!boot_error) {
821 /* 812 /*
822 * Wait 5s total for a response 813 * Wait till AP completes initial initialization
823 */ 814 */
824 for (timeout = 0; timeout < 50000; timeout++) { 815 while (!cpumask_test_cpu(cpu, cpu_callin_mask)) {
825 if (cpumask_test_cpu(cpu, cpu_callin_mask))
826 break; /* It has booted */
827 udelay(100);
828 /* 816 /*
829 * Allow other tasks to run while we wait for the 817 * Allow other tasks to run while we wait for the
830 * AP to come online. This also gives a chance 818 * AP to come online. This also gives a chance
831 * for the MTRR work(triggered by the AP coming online) 819 * for the MTRR work(triggered by the AP coming online)
832 * to be completed in the stop machine context. 820 * to be completed in the stop machine context.
833 */ 821 */
822 udelay(100);
834 schedule(); 823 schedule();
835 } 824 }
836
837 if (cpumask_test_cpu(cpu, cpu_callin_mask)) {
838 print_cpu_msr(&cpu_data(cpu));
839 pr_debug("CPU%d: has booted.\n", cpu);
840 } else {
841 boot_error = 1;
842 if (*trampoline_status == 0xA5A5A5A5)
843 /* trampoline started but...? */
844 pr_err("CPU%d: Stuck ??\n", cpu);
845 else
846 /* trampoline code not run */
847 pr_err("CPU%d: Not responding\n", cpu);
848 if (apic->inquire_remote_apic)
849 apic->inquire_remote_apic(apicid);
850 }
851 }
852
853 if (boot_error) {
854 /* Try to put things back the way they were before ... */
855 numa_remove_cpu(cpu); /* was set by numa_add_cpu */
856
857 /* was set by do_boot_cpu() */
858 cpumask_clear_cpu(cpu, cpu_callout_mask);
859
860 /* was set by cpu_init() */
861 cpumask_clear_cpu(cpu, cpu_initialized_mask);
862 } 825 }
863 826
864 /* mark "stuck" area as not stuck */ 827 /* mark "stuck" area as not stuck */