aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMahesh Salgaonkar <mahesh@linux.vnet.ibm.com>2013-10-30 10:36:13 -0400
committerBenjamin Herrenschmidt <benh@kernel.crashing.org>2013-12-05 00:06:06 -0500
commitb63a0ffe35de7e5f9b907bbc2c783e702f7e15af (patch)
tree4ec5c5261fcfcb9789f7ff5a36852fada5dc8564
parent28446de2ce9992f6d13e4594a25fc9c3b9f4517b (diff)
powerpc/powernv: Machine check exception handling.
Add basic error handling in the machine check exception handler. - If MSR_RI isn't set, we cannot recover. - Check if the disposition is set to OpalMCE_DISPOSITION_RECOVERED. - Check if the faulting address is inside the kernel address space; if it is not, send SIGBUS to the process if we hit the exception while in userspace. - If the faulting address is not provided and we get a synchronous machine check while in userspace, kill the task. Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
-rw-r--r--arch/powerpc/include/asm/mce.h1
-rw-r--r--arch/powerpc/kernel/mce.c27
-rw-r--r--arch/powerpc/platforms/powernv/opal.c43
3 files changed, 70 insertions, 1 deletions
diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index 3276b409299c..a2b8c7b35fba 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -193,5 +193,6 @@ extern void release_mce_event(void);
193extern void machine_check_queue_event(void); 193extern void machine_check_queue_event(void);
194extern void machine_check_process_queued_event(void); 194extern void machine_check_process_queued_event(void);
195extern void machine_check_print_event_info(struct machine_check_event *evt); 195extern void machine_check_print_event_info(struct machine_check_event *evt);
196extern uint64_t get_mce_fault_addr(struct machine_check_event *evt);
196 197
197#endif /* __ASM_PPC64_MCE_H__ */ 198#endif /* __ASM_PPC64_MCE_H__ */
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index 1c6d15701c56..c0c52ec1fca7 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -316,3 +316,30 @@ void machine_check_print_event_info(struct machine_check_event *evt)
316 break; 316 break;
317 } 317 }
318} 318}
319
320uint64_t get_mce_fault_addr(struct machine_check_event *evt)
321{
322 switch (evt->error_type) {
323 case MCE_ERROR_TYPE_UE:
324 if (evt->u.ue_error.effective_address_provided)
325 return evt->u.ue_error.effective_address;
326 break;
327 case MCE_ERROR_TYPE_SLB:
328 if (evt->u.slb_error.effective_address_provided)
329 return evt->u.slb_error.effective_address;
330 break;
331 case MCE_ERROR_TYPE_ERAT:
332 if (evt->u.erat_error.effective_address_provided)
333 return evt->u.erat_error.effective_address;
334 break;
335 case MCE_ERROR_TYPE_TLB:
336 if (evt->u.tlb_error.effective_address_provided)
337 return evt->u.tlb_error.effective_address;
338 break;
339 default:
340 case MCE_ERROR_TYPE_UNKNOWN:
341 break;
342 }
343 return 0;
344}
345EXPORT_SYMBOL(get_mce_fault_addr);
diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
index f348bd49487c..01e74cbc67e9 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -18,6 +18,7 @@
18#include <linux/interrupt.h> 18#include <linux/interrupt.h>
19#include <linux/notifier.h> 19#include <linux/notifier.h>
20#include <linux/slab.h> 20#include <linux/slab.h>
21#include <linux/sched.h>
21#include <linux/kobject.h> 22#include <linux/kobject.h>
22#include <asm/opal.h> 23#include <asm/opal.h>
23#include <asm/firmware.h> 24#include <asm/firmware.h>
@@ -251,6 +252,44 @@ int opal_put_chars(uint32_t vtermno, const char *data, int total_len)
251 return written; 252 return written;
252} 253}
253 254
255static int opal_recover_mce(struct pt_regs *regs,
256 struct machine_check_event *evt)
257{
258 int recovered = 0;
259 uint64_t ea = get_mce_fault_addr(evt);
260
261 if (!(regs->msr & MSR_RI)) {
262 /* If MSR_RI isn't set, we cannot recover */
263 recovered = 0;
264 } else if (evt->disposition == MCE_DISPOSITION_RECOVERED) {
265 /* Platform corrected itself */
266 recovered = 1;
267 } else if (ea && !is_kernel_addr(ea)) {
268 /*
269 * Faulting address is not in kernel text. We should be fine.
270 * We need to find which process uses this address.
271 * For now, kill the task if we have received exception when
272 * in userspace.
273 *
274 * TODO: Queue up this address for hwpoisioning later.
275 */
276 if (user_mode(regs) && !is_global_init(current)) {
277 _exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip);
278 recovered = 1;
279 } else
280 recovered = 0;
281 } else if (user_mode(regs) && !is_global_init(current) &&
282 evt->severity == MCE_SEV_ERROR_SYNC) {
283 /*
284 * If we have received a synchronous error when in userspace
285 * kill the task.
286 */
287 _exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip);
288 recovered = 1;
289 }
290 return recovered;
291}
292
254int opal_machine_check(struct pt_regs *regs) 293int opal_machine_check(struct pt_regs *regs)
255{ 294{
256 struct machine_check_event evt; 295 struct machine_check_event evt;
@@ -266,7 +305,9 @@ int opal_machine_check(struct pt_regs *regs)
266 } 305 }
267 machine_check_print_event_info(&evt); 306 machine_check_print_event_info(&evt);
268 307
269 return evt.severity == MCE_SEV_FATAL ? 0 : 1; 308 if (opal_recover_mce(regs, &evt))
309 return 1;
310 return 0;
270} 311}
271 312
272static irqreturn_t opal_interrupt(int irq, void *data) 313static irqreturn_t opal_interrupt(int irq, void *data)