aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Russ Anderson <rja@sgi.com> 2007-09-19 17:58:31 -0400
committer: Tony Luck <tony.luck@intel.com> 2007-10-12 18:19:02 -0400
commit: e1b1eb011e15190eb859bad0bcae67679bda7d50 (patch)
tree: d86d48627b32051ec57ec3dbd47e9bcbd01a40e5
parent: 2bc5c282999af41042c2b703bf3a58ca1d7e3ee2 (diff)
[IA64] Fix race when multiple cpus go through MCA
Additional testing uncovered a situation where the MCA recovery code could hang due to a race condition.

According to the SAL spec, SAL sends a rendezvous interrupt to all but the first CPU that goes into MCA. This includes other CPUs that go into MCA at the same time. Those other CPUs will go into the Linux MCA handler (rather than the slave loop) with the rendezvous interrupt pending. When all the CPUs have completed MCA processing and the last monarch completes, freeing all the CPUs, the CPUs with the pending rendezvous interrupt then go into ia64_mca_rendez_int_handler(). In ia64_mca_rendez_int_handler() the CPUs get marked as rendezvoused, but then leave the handler (due to no MCA). That leaves the CPUs marked as rendezvoused _before_ the next MCA event.

When the next MCA hits, the monarch will mistakenly believe that all the CPUs are rendezvoused when they are not, opening up a window where a CPU can get stuck in the slave loop.

This patch avoids leaving CPUs marked as rendezvoused when they are not.

Signed-off-by: Russ Anderson <rja@sgi.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
-rw-r--r-- arch/ia64/kernel/mca.c 47
1 files changed, 25 insertions, 22 deletions
diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c
index 92367faecbbf..cc87025e8f54 100644
--- a/arch/ia64/kernel/mca.c
+++ b/arch/ia64/kernel/mca.c
@@ -701,8 +701,7 @@ ia64_mca_cmc_vector_enable_keventd(struct work_struct *unused)
701/* 701/*
702 * ia64_mca_wakeup 702 * ia64_mca_wakeup
703 * 703 *
704 * Send an inter-cpu interrupt to wake-up a particular cpu 704 * Send an inter-cpu interrupt to wake-up a particular cpu.
705 * and mark that cpu to be out of rendez.
706 * 705 *
707 * Inputs : cpuid 706 * Inputs : cpuid
708 * Outputs : None 707 * Outputs : None
@@ -711,14 +710,12 @@ static void
711ia64_mca_wakeup(int cpu) 710ia64_mca_wakeup(int cpu)
712{ 711{
713 platform_send_ipi(cpu, IA64_MCA_WAKEUP_VECTOR, IA64_IPI_DM_INT, 0); 712 platform_send_ipi(cpu, IA64_MCA_WAKEUP_VECTOR, IA64_IPI_DM_INT, 0);
714 ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_NOTDONE;
715
716} 713}
717 714
718/* 715/*
719 * ia64_mca_wakeup_all 716 * ia64_mca_wakeup_all
720 * 717 *
721 * Wakeup all the cpus which have rendez'ed previously. 718 * Wakeup all the slave cpus which have rendez'ed previously.
722 * 719 *
723 * Inputs : None 720 * Inputs : None
724 * Outputs : None 721 * Outputs : None
@@ -741,7 +738,10 @@ ia64_mca_wakeup_all(void)
741 * 738 *
742 * This is handler used to put slave processors into spinloop 739 * This is handler used to put slave processors into spinloop
743 * while the monarch processor does the mca handling and later 740 * while the monarch processor does the mca handling and later
744 * wake each slave up once the monarch is done. 741 * wake each slave up once the monarch is done. The state
742 * IA64_MCA_RENDEZ_CHECKIN_DONE indicates the cpu is rendez'ed
743 * in SAL. The state IA64_MCA_RENDEZ_CHECKIN_NOTDONE indicates
744 * the cpu has come out of OS rendezvous.
745 * 745 *
746 * Inputs : None 746 * Inputs : None
747 * Outputs : None 747 * Outputs : None
@@ -778,6 +778,7 @@ ia64_mca_rendez_int_handler(int rendez_irq, void *arg)
778 (long)&nd, 0, 0) == NOTIFY_STOP) 778 (long)&nd, 0, 0) == NOTIFY_STOP)
779 ia64_mca_spin(__FUNCTION__); 779 ia64_mca_spin(__FUNCTION__);
780 780
781 ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_NOTDONE;
781 /* Enable all interrupts */ 782 /* Enable all interrupts */
782 local_irq_restore(flags); 783 local_irq_restore(flags);
783 return IRQ_HANDLED; 784 return IRQ_HANDLED;
@@ -1221,26 +1222,27 @@ ia64_mca_handler(struct pt_regs *regs, struct switch_stack *sw,
1221 if (notify_die(DIE_MCA_MONARCH_ENTER, "MCA", regs, (long)&nd, 0, 0) 1222 if (notify_die(DIE_MCA_MONARCH_ENTER, "MCA", regs, (long)&nd, 0, 0)
1222 == NOTIFY_STOP) 1223 == NOTIFY_STOP)
1223 ia64_mca_spin(__FUNCTION__); 1224 ia64_mca_spin(__FUNCTION__);
1225
1226 ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_CONCURRENT_MCA;
1224 if (sos->monarch) { 1227 if (sos->monarch) {
1225 ia64_wait_for_slaves(cpu, "MCA"); 1228 ia64_wait_for_slaves(cpu, "MCA");
1229
1230 /* Wakeup all the processors which are spinning in the
1231 * rendezvous loop. They will leave SAL, then spin in the OS
1232 * with interrupts disabled until this monarch cpu leaves the
1233 * MCA handler. That gets control back to the OS so we can
1234 * backtrace the other cpus, backtrace when spinning in SAL
1235 * does not work.
1236 */
1237 ia64_mca_wakeup_all();
1238 if (notify_die(DIE_MCA_MONARCH_PROCESS, "MCA", regs, (long)&nd, 0, 0)
1239 == NOTIFY_STOP)
1240 ia64_mca_spin(__FUNCTION__);
1226 } else { 1241 } else {
1227 ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_CONCURRENT_MCA;
1228 while (cpu_isset(cpu, mca_cpu)) 1242 while (cpu_isset(cpu, mca_cpu))
1229 cpu_relax(); /* spin until monarch wakes us */ 1243 cpu_relax(); /* spin until monarch wakes us */
1230 ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_NOTDONE;
1231 } 1244 }
1232 1245
1233 /* Wakeup all the processors which are spinning in the rendezvous loop.
1234 * They will leave SAL, then spin in the OS with interrupts disabled
1235 * until this monarch cpu leaves the MCA handler. That gets control
1236 * back to the OS so we can backtrace the other cpus, backtrace when
1237 * spinning in SAL does not work.
1238 */
1239 ia64_mca_wakeup_all();
1240 if (notify_die(DIE_MCA_MONARCH_PROCESS, "MCA", regs, (long)&nd, 0, 0)
1241 == NOTIFY_STOP)
1242 ia64_mca_spin(__FUNCTION__);
1243
1244 /* Get the MCA error record and log it */ 1246 /* Get the MCA error record and log it */
1245 ia64_mca_log_sal_error_record(SAL_INFO_TYPE_MCA); 1247 ia64_mca_log_sal_error_record(SAL_INFO_TYPE_MCA);
1246 1248
@@ -1274,21 +1276,22 @@ ia64_mca_handler(struct pt_regs *regs, struct switch_stack *sw,
1274 /* wake up the next monarch cpu, 1276 /* wake up the next monarch cpu,
1275 * and put this cpu in the rendez loop. 1277 * and put this cpu in the rendez loop.
1276 */ 1278 */
1277 ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_CONCURRENT_MCA;
1278 for_each_online_cpu(i) { 1279 for_each_online_cpu(i) {
1279 if (cpu_isset(i, mca_cpu)) { 1280 if (cpu_isset(i, mca_cpu)) {
1280 monarch_cpu = i; 1281 monarch_cpu = i;
1281 cpu_clear(i, mca_cpu); /* wake next cpu */ 1282 cpu_clear(i, mca_cpu); /* wake next cpu */
1282 while (monarch_cpu != -1) 1283 while (monarch_cpu != -1)
1283 cpu_relax(); /* spin until last cpu leaves */ 1284 cpu_relax(); /* spin until last cpu leaves */
1284 ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_NOTDONE;
1285 set_curr_task(cpu, previous_current); 1285 set_curr_task(cpu, previous_current);
1286 ia64_mc_info.imi_rendez_checkin[cpu]
1287 = IA64_MCA_RENDEZ_CHECKIN_NOTDONE;
1286 return; 1288 return;
1287 } 1289 }
1288 } 1290 }
1289 } 1291 }
1290 set_curr_task(cpu, previous_current); 1292 set_curr_task(cpu, previous_current);
1291 monarch_cpu = -1; 1293 ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_NOTDONE;
1294 monarch_cpu = -1; /* This frees the slaves and previous monarchs */
1292} 1295}
1293 1296
1294static DECLARE_WORK(cmc_disable_work, ia64_mca_cmc_vector_disable_keventd); 1297static DECLARE_WORK(cmc_disable_work, ia64_mca_cmc_vector_disable_keventd);