aboutsummaryrefslogtreecommitdiffstats
path: root/arch/powerpc
diff options
context:
space:
mode:
author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com> 2014-06-11 04:47:56 -0400
committer: Benjamin Herrenschmidt <benh@kernel.crashing.org> 2014-06-11 05:15:13 -0400
commit: 2749a2f26a7c7eb4c7e3901695c8977cdb6b826d (patch)
tree: 5deed010cd60ecc7af5099ddeea0714048573a1a /arch/powerpc
parent: 357b2f3dd9b7e220ddbaef5bcc108f0359dc0fcf (diff)
powerpc/book3s: Fix machine check handling for unhandled errors
The current code does not check for unhandled/unrecovered errors and returns from the interrupt if it is a recoverable exception, which in turn triggers the same machine check exception in a loop, causing the hypervisor to become unresponsive. This patch fixes this situation and forces the hypervisor to panic on unhandled/unrecovered errors. This patch also fixes another issue where the unrecoverable_exception routine was called in real mode in the case of an unrecoverable exception (MSR_RI = 0). This causes another exception, vector 0x300 (data access), during the system crash, leading to confusion while debugging the cause of the crash. Also turn the ME bit off while going down, so that if another MCE is hit during the panic path, the system will checkstop and the hypervisor will be restarted cleanly by the SP. With the above fixes we now print the correct console messages (see below) while crashing the system in the case of unhandled/unrecoverable machine checks. -------------- Severe Machine check interrupt [[Not recovered] Initiator: CPU Error type: UE [Instruction fetch] Effective address: 0000000030002864 Oops: Machine check, sig: 7 [#1] SMP NR_CPUS=2048 NUMA PowerNV Modules linked in: bork(O) bridge stp llc kvm [last unloaded: bork] CPU: 36 PID: 55162 Comm: bash Tainted: G O 3.14.0mce #1 task: c000002d72d022d0 ti: c000000007ec0000 task.ti: c000002d72de4000 NIP: 0000000030002864 LR: 00000000300151a4 CTR: 000000003001518c REGS: c000000007ec3d80 TRAP: 0200 Tainted: G O (3.14.0mce) MSR: 9000000000041002 <SF,HV,ME,RI> CR: 28222848 XER: 20000000 CFAR: 0000000030002838 DAR: d0000000004d0000 DSISR: 00000000 SOFTE: 1 GPR00: 000000003001512c 0000000031f92cb0 0000000030078af0 0000000030002864 GPR04: d0000000004d0000 0000000000000000 0000000030002864 ffffffffffffffc9 GPR08: 0000000000000024 0000000030008af0 000000000000002c c00000000150e728 GPR12: 9000000000041002 0000000031f90000 0000000010142550 0000000040000000 GPR16: 0000000010143cdc 0000000000000000 00000000101306fc 00000000101424dc GPR20: 00000000101424e0 000000001013c6f0 
0000000000000000 0000000000000000 GPR24: 0000000010143ce0 00000000100f6440 c000002d72de7e00 c000002d72860250 GPR28: c000002d72860240 c000002d72ac0038 0000000000000008 0000000000040000 NIP [0000000030002864] 0x30002864 LR [00000000300151a4] 0x300151a4 Call Trace: Instruction dump: XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX ---[ end trace 7285f0beac1e29d3 ]--- Sending IPI to other CPUs IPI complete OPAL V3 detected ! -------------- Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Diffstat (limited to 'arch/powerpc')
-rw-r--r-- arch/powerpc/kernel/exceptions-64s.S 40
1 files changed, 37 insertions, 3 deletions
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 20f11eb4dff7..274a86d001c7 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -1389,6 +1389,7 @@ machine_check_handle_early:
1389 bl save_nvgprs 1389 bl save_nvgprs
1390 addi r3,r1,STACK_FRAME_OVERHEAD 1390 addi r3,r1,STACK_FRAME_OVERHEAD
1391 bl machine_check_early 1391 bl machine_check_early
1392 std r3,RESULT(r1) /* Save result */
1392 ld r12,_MSR(r1) 1393 ld r12,_MSR(r1)
1393#ifdef CONFIG_PPC_P7_NAP 1394#ifdef CONFIG_PPC_P7_NAP
1394 /* 1395 /*
@@ -1443,11 +1444,33 @@ machine_check_handle_early:
1443 */ 1444 */
1444 andi. r11,r12,MSR_RI 1445 andi. r11,r12,MSR_RI
1445 bne 2f 1446 bne 2f
14461: addi r3,r1,STACK_FRAME_OVERHEAD 14471: mfspr r11,SPRN_SRR0
1447 bl unrecoverable_exception 1448 ld r10,PACAKBASE(r13)
1448 b 1b 1449 LOAD_HANDLER(r10,unrecover_mce)
1450 mtspr SPRN_SRR0,r10
1451 ld r10,PACAKMSR(r13)
1452 /*
1453 * We are going down. But there are chances that we might get hit by
1454 * another MCE during panic path and we may run into unstable state
1455 * with no way out. Hence, turn ME bit off while going down, so that
1456 * when another MCE is hit during panic path, system will checkstop
1457 * and hypervisor will get restarted cleanly by SP.
1458 */
1459 li r3,MSR_ME
1460 andc r10,r10,r3 /* Turn off MSR_ME */
1461 mtspr SPRN_SRR1,r10
1462 rfid
1463 b .
14492: 14642:
1450 /* 1465 /*
1466 * Check if we have successfully handled/recovered from error, if not
1467 * then stay on emergency stack and panic.
1468 */
1469 ld r3,RESULT(r1) /* Load result */
1470 cmpdi r3,0 /* see if we handled MCE successfully */
1471
1472 beq 1b /* if !handled then panic */
1473 /*
1451 * Return from MC interrupt. 1474 * Return from MC interrupt.
1452 * Queue up the MCE event so that we can log it later, while 1475 * Queue up the MCE event so that we can log it later, while
1453 * returning from kernel or opal call. 1476 * returning from kernel or opal call.
@@ -1460,6 +1483,17 @@ machine_check_handle_early:
1460 MACHINE_CHECK_HANDLER_WINDUP 1483 MACHINE_CHECK_HANDLER_WINDUP
1461 b machine_check_pSeries 1484 b machine_check_pSeries
1462 1485
1486unrecover_mce:
1487 /* Invoke machine_check_exception to print MCE event and panic. */
1488 addi r3,r1,STACK_FRAME_OVERHEAD
1489 bl .machine_check_exception
1490 /*
1491 * We will not reach here. Even if we did, there is no way out. Call
1492 * unrecoverable_exception and die.
1493 */
14941: addi r3,r1,STACK_FRAME_OVERHEAD
1495 bl .unrecoverable_exception
1496 b 1b
1463/* 1497/*
1464 * r13 points to the PACA, r9 contains the saved CR, 1498 * r13 points to the PACA, r9 contains the saved CR,
1465 * r12 contain the saved SRR1, SRR0 is still ready for return 1499 * r12 contain the saved SRR1, SRR0 is still ready for return