author     Igor Mammedov <imammedo@redhat.com>   2014-06-20 08:23:11 -0400
committer  Ingo Molnar <mingo@kernel.org>        2014-09-16 05:11:32 -0400
commit     ce4b1b16502b182368cda20a61de2995762c8bcc (patch)
tree       0980e8e7cabc1489114e757ac311dda9986a745c
parent     9e82bf014195d6f0054982c463575cdce24292be (diff)
x86/smpboot: Initialize secondary CPU only if master CPU will wait for it
A hang is observed on virtual machines during CPU hotplug, especially in big guests with many CPUs (it is reproducible more often if the host is over-committed). It happens because the master CPU gives up waiting on the secondary CPU and allows it to run wild. As a result, the AP ends up locking up or crashing the system, for example as described here: https://lkml.org/lkml/2014/3/6/257

If the master CPU has sent the STARTUP IPI successfully, and the AP has signalled to the master CPU that it is ready to start initialization, make the master CPU wait indefinitely until the AP is onlined. To ensure that the AP never runs wild, make it wait at early startup until the master CPU confirms its intention to wait for the AP. If the AP does not respond within 10 seconds, the master CPU times out and cancels AP onlining.

Signed-off-by: Igor Mammedov <imammedo@redhat.com>
Acked-by: Toshi Kani <toshi.kani@hp.com>
Tested-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: xen-devel@lists.xenproject.org
Link: http://lkml.kernel.org/r/1403266991-12233-1-git-send-email-imammedo@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
-rw-r--r--  arch/x86/kernel/cpu/common.c | 29
-rw-r--r--  arch/x86/kernel/smpboot.c    | 98
-rw-r--r--  arch/x86/xen/smp.c           |  2
3 files changed, 50 insertions(+), 79 deletions(-)
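For orientation, the handshake this patch introduces can be sketched as a small user-space C program. This is an illustration only, not the kernel code in the diff below: C11 atomics named ap_initialized, ap_callout and ap_callin stand in for this CPU's bits in cpu_initialized_mask, cpu_callout_mask and cpu_callin_mask, and a wall-clock deadline stands in for the jiffies-based 10 second timeout.

/* Illustrative model of the BSP/AP handshake (not kernel code). */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

static atomic_int ap_initialized;	/* models the AP's cpu_initialized_mask bit */
static atomic_int ap_callout;		/* models its cpu_callout_mask bit */
static atomic_int ap_callin;		/* models its cpu_callin_mask bit */

static void *ap_thread(void *arg)
{
	(void)arg;

	/* AP side, as in wait_for_master_cpu(): announce, then wait for the ACK */
	atomic_store(&ap_initialized, 1);
	while (!atomic_load(&ap_callout))
		;	/* cpu_relax() in the kernel */

	/* ... per-CPU initialization would run here ... */

	atomic_store(&ap_callin, 1);	/* smp_callin(): "I am online" */
	return NULL;
}

int main(void)
{
	pthread_t ap;
	time_t deadline = time(NULL) + 10;	/* stands in for jiffies + 10*HZ */
	int boot_error = -1;

	pthread_create(&ap, NULL, ap_thread, NULL);

	/* BSP side of do_boot_cpu(): bounded wait for the AP to announce itself */
	while (time(NULL) < deadline) {
		if (atomic_load(&ap_initialized)) {
			atomic_store(&ap_callout, 1);	/* tell the AP to proceed */
			boot_error = 0;
			break;
		}
		usleep(100);
	}

	if (boot_error) {
		printf("AP did not respond in time, onlining cancelled\n");
		return 1;
	}

	/* only now wait indefinitely until the AP reports it is fully online */
	while (!atomic_load(&ap_callin))
		usleep(100);

	pthread_join(ap, NULL);
	printf("AP is online\n");
	return 0;
}

Build with e.g. gcc -pthread. On the timeout path, the real do_boot_cpu() additionally clears the AP's cpu_initialized_mask bit before the next INIT/SIPI attempt, as the smpboot.c hunks below show.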
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index e4ab2b42bd6f..426cfedefd04 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1266,6 +1266,19 @@ static void dbg_restore_debug_regs(void)
 #define dbg_restore_debug_regs()
 #endif /* ! CONFIG_KGDB */
 
+static void wait_for_master_cpu(int cpu)
+{
+#ifdef CONFIG_SMP
+	/*
+	 * wait for ACK from master CPU before continuing
+	 * with AP initialization
+	 */
+	WARN_ON(cpumask_test_and_set_cpu(cpu, cpu_initialized_mask));
+	while (!cpumask_test_cpu(cpu, cpu_callout_mask))
+		cpu_relax();
+#endif
+}
+
 /*
  * cpu_init() initializes state that is per-CPU. Some data is already
  * initialized (naturally) in the bootstrap process, such as the GDT
@@ -1281,16 +1294,17 @@ void cpu_init(void)
 	struct task_struct *me;
 	struct tss_struct *t;
 	unsigned long v;
-	int cpu;
+	int cpu = stack_smp_processor_id();
 	int i;
 
+	wait_for_master_cpu(cpu);
+
 	/*
 	 * Load microcode on this cpu if a valid microcode is available.
 	 * This is early microcode loading procedure.
 	 */
	load_ucode_ap();
 
-	cpu = stack_smp_processor_id();
 	t = &per_cpu(init_tss, cpu);
 	oist = &per_cpu(orig_ist, cpu);
 
@@ -1302,9 +1316,6 @@ void cpu_init(void)
 
 	me = current;
 
-	if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask))
-		panic("CPU#%d already initialized!\n", cpu);
-
 	pr_debug("Initializing CPU#%d\n", cpu);
 
 	clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
@@ -1381,13 +1392,9 @@ void cpu_init(void)
 	struct tss_struct *t = &per_cpu(init_tss, cpu);
 	struct thread_struct *thread = &curr->thread;
 
-	show_ucode_info_early();
+	wait_for_master_cpu(cpu);
 
-	if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask)) {
-		printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
-		for (;;)
-			local_irq_enable();
-	}
+	show_ucode_info_early();
 
 	printk(KERN_INFO "Initializing CPU#%d\n", cpu);
 
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 2d872e08fab9..735c420eba2d 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -111,7 +111,6 @@ atomic_t init_deasserted;
 static void smp_callin(void)
 {
 	int cpuid, phys_id;
-	unsigned long timeout;
 
 	/*
 	 * If waken up by an INIT in an 82489DX configuration
@@ -130,37 +129,6 @@ static void smp_callin(void)
 	 * (This works even if the APIC is not enabled.)
 	 */
 	phys_id = read_apic_id();
-	if (cpumask_test_cpu(cpuid, cpu_callin_mask)) {
-		panic("%s: phys CPU#%d, CPU#%d already present??\n", __func__,
-					phys_id, cpuid);
-	}
-	pr_debug("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id);
-
-	/*
-	 * STARTUP IPIs are fragile beasts as they might sometimes
-	 * trigger some glue motherboard logic. Complete APIC bus
-	 * silence for 1 second, this overestimates the time the
-	 * boot CPU is spending to send the up to 2 STARTUP IPIs
-	 * by a factor of two. This should be enough.
-	 */
-
-	/*
-	 * Waiting 2s total for startup (udelay is not yet working)
-	 */
-	timeout = jiffies + 2*HZ;
-	while (time_before(jiffies, timeout)) {
-		/*
-		 * Has the boot CPU finished it's STARTUP sequence?
-		 */
-		if (cpumask_test_cpu(cpuid, cpu_callout_mask))
-			break;
-		cpu_relax();
-	}
-
-	if (!time_before(jiffies, timeout)) {
-		panic("%s: CPU%d started up but did not get a callout!\n",
-		      __func__, cpuid);
-	}
 
 	/*
 	 * the boot CPU has finished the init stage and is spinning
@@ -753,8 +721,8 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
 	unsigned long start_ip = real_mode_header->trampoline_start;
 
 	unsigned long boot_error = 0;
-	int timeout;
 	int cpu0_nmi_registered = 0;
+	unsigned long timeout;
 
 	/* Just in case we booted with a single CPU. */
 	alternatives_enable_smp();
@@ -802,6 +770,15 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
 	}
 
 	/*
+	 * AP might wait on cpu_callout_mask in cpu_init() with
+	 * cpu_initialized_mask set if previous attempt to online
+	 * it timed-out. Clear cpu_initialized_mask so that after
+	 * INIT/SIPI it could start with a clean state.
+	 */
+	cpumask_clear_cpu(cpu, cpu_initialized_mask);
+	smp_mb();
+
+	/*
 	 * Wake up a CPU in difference cases:
 	 * - Use the method in the APIC driver if it's defined
 	 * Otherwise,
@@ -815,53 +792,38 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
 
 	if (!boot_error) {
 		/*
-		 * allow APs to start initializing.
+		 * Wait 10s total for a response from AP
 		 */
-		pr_debug("Before Callout %d\n", cpu);
-		cpumask_set_cpu(cpu, cpu_callout_mask);
-		pr_debug("After Callout %d\n", cpu);
+		boot_error = -1;
+		timeout = jiffies + 10*HZ;
+		while (time_before(jiffies, timeout)) {
+			if (cpumask_test_cpu(cpu, cpu_initialized_mask)) {
+				/*
+				 * Tell AP to proceed with initialization
+				 */
+				cpumask_set_cpu(cpu, cpu_callout_mask);
+				boot_error = 0;
+				break;
+			}
+			udelay(100);
+			schedule();
+		}
+	}
 
+	if (!boot_error) {
 		/*
-		 * Wait 5s total for a response
+		 * Wait till AP completes initial initialization
 		 */
-		for (timeout = 0; timeout < 50000; timeout++) {
-			if (cpumask_test_cpu(cpu, cpu_callin_mask))
-				break;	/* It has booted */
-			udelay(100);
+		while (!cpumask_test_cpu(cpu, cpu_callin_mask)) {
 			/*
 			 * Allow other tasks to run while we wait for the
 			 * AP to come online. This also gives a chance
 			 * for the MTRR work(triggered by the AP coming online)
 			 * to be completed in the stop machine context.
 			 */
+			udelay(100);
 			schedule();
 		}
-
-		if (cpumask_test_cpu(cpu, cpu_callin_mask)) {
-			print_cpu_msr(&cpu_data(cpu));
-			pr_debug("CPU%d: has booted.\n", cpu);
-		} else {
-			boot_error = 1;
-			if (*trampoline_status == 0xA5A5A5A5)
-				/* trampoline started but...? */
-				pr_err("CPU%d: Stuck ??\n", cpu);
-			else
-				/* trampoline code not run */
-				pr_err("CPU%d: Not responding\n", cpu);
-			if (apic->inquire_remote_apic)
-				apic->inquire_remote_apic(apicid);
-		}
-	}
-
-	if (boot_error) {
-		/* Try to put things back the way they were before ... */
-		numa_remove_cpu(cpu); /* was set by numa_add_cpu */
-
-		/* was set by do_boot_cpu() */
-		cpumask_clear_cpu(cpu, cpu_callout_mask);
-
-		/* was set by cpu_init() */
-		cpumask_clear_cpu(cpu, cpu_initialized_mask);
 	}
 
 	/* mark "stuck" area as not stuck */
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 7005974c3ff3..3631e7129e8c 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -360,6 +360,8 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
 	struct desc_struct *gdt;
 	unsigned long gdt_mfn;
 
+	/* used to tell cpu_init() that it can proceed with initialization */
+	cpumask_set_cpu(cpu, cpu_callout_mask);
 	if (cpumask_test_and_set_cpu(cpu, xen_cpu_initialized_map))
 		return 0;
 