diff options
author | Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com> | 2013-10-30 10:36:13 -0400 |
---|---|---|
committer | Benjamin Herrenschmidt <benh@kernel.crashing.org> | 2013-12-05 00:06:06 -0500 |
commit | b63a0ffe35de7e5f9b907bbc2c783e702f7e15af (patch) | |
tree | 4ec5c5261fcfcb9789f7ff5a36852fada5dc8564 | |
parent | 28446de2ce9992f6d13e4594a25fc9c3b9f4517b (diff) |
powerpc/powernv: Machine check exception handling.
Add basic error handling in machine check exception handler.
- If MSR_RI isn't set, we can not recover.
- Check whether the disposition is set to MCE_DISPOSITION_RECOVERED.
- Check whether the faulting address is inside the kernel address space; if it
is not, send SIGBUS to the process when the exception was taken in userspace.
- If the faulting address is not provided and we get a synchronous machine
check while in userspace, kill the task.
Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
-rw-r--r-- | arch/powerpc/include/asm/mce.h | 1 | ||||
-rw-r--r-- | arch/powerpc/kernel/mce.c | 27 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/opal.c | 43 |
3 files changed, 70 insertions, 1 deletion
diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h index 3276b409299c..a2b8c7b35fba 100644 --- a/arch/powerpc/include/asm/mce.h +++ b/arch/powerpc/include/asm/mce.h | |||
@@ -193,5 +193,6 @@ extern void release_mce_event(void); | |||
193 | extern void machine_check_queue_event(void); | 193 | extern void machine_check_queue_event(void); |
194 | extern void machine_check_process_queued_event(void); | 194 | extern void machine_check_process_queued_event(void); |
195 | extern void machine_check_print_event_info(struct machine_check_event *evt); | 195 | extern void machine_check_print_event_info(struct machine_check_event *evt); |
196 | extern uint64_t get_mce_fault_addr(struct machine_check_event *evt); | ||
196 | 197 | ||
197 | #endif /* __ASM_PPC64_MCE_H__ */ | 198 | #endif /* __ASM_PPC64_MCE_H__ */ |
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index 1c6d15701c56..c0c52ec1fca7 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c | |||
@@ -316,3 +316,30 @@ void machine_check_print_event_info(struct machine_check_event *evt) | |||
316 | break; | 316 | break; |
317 | } | 317 | } |
318 | } | 318 | } |
319 | |||
320 | uint64_t get_mce_fault_addr(struct machine_check_event *evt) | ||
321 | { | ||
322 | switch (evt->error_type) { | ||
323 | case MCE_ERROR_TYPE_UE: | ||
324 | if (evt->u.ue_error.effective_address_provided) | ||
325 | return evt->u.ue_error.effective_address; | ||
326 | break; | ||
327 | case MCE_ERROR_TYPE_SLB: | ||
328 | if (evt->u.slb_error.effective_address_provided) | ||
329 | return evt->u.slb_error.effective_address; | ||
330 | break; | ||
331 | case MCE_ERROR_TYPE_ERAT: | ||
332 | if (evt->u.erat_error.effective_address_provided) | ||
333 | return evt->u.erat_error.effective_address; | ||
334 | break; | ||
335 | case MCE_ERROR_TYPE_TLB: | ||
336 | if (evt->u.tlb_error.effective_address_provided) | ||
337 | return evt->u.tlb_error.effective_address; | ||
338 | break; | ||
339 | default: | ||
340 | case MCE_ERROR_TYPE_UNKNOWN: | ||
341 | break; | ||
342 | } | ||
343 | return 0; | ||
344 | } | ||
345 | EXPORT_SYMBOL(get_mce_fault_addr); | ||
diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index f348bd49487c..01e74cbc67e9 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c | |||
@@ -18,6 +18,7 @@ | |||
18 | #include <linux/interrupt.h> | 18 | #include <linux/interrupt.h> |
19 | #include <linux/notifier.h> | 19 | #include <linux/notifier.h> |
20 | #include <linux/slab.h> | 20 | #include <linux/slab.h> |
21 | #include <linux/sched.h> | ||
21 | #include <linux/kobject.h> | 22 | #include <linux/kobject.h> |
22 | #include <asm/opal.h> | 23 | #include <asm/opal.h> |
23 | #include <asm/firmware.h> | 24 | #include <asm/firmware.h> |
@@ -251,6 +252,44 @@ int opal_put_chars(uint32_t vtermno, const char *data, int total_len) | |||
251 | return written; | 252 | return written; |
252 | } | 253 | } |
253 | 254 | ||
255 | static int opal_recover_mce(struct pt_regs *regs, | ||
256 | struct machine_check_event *evt) | ||
257 | { | ||
258 | int recovered = 0; | ||
259 | uint64_t ea = get_mce_fault_addr(evt); | ||
260 | |||
261 | if (!(regs->msr & MSR_RI)) { | ||
262 | /* If MSR_RI isn't set, we cannot recover */ | ||
263 | recovered = 0; | ||
264 | } else if (evt->disposition == MCE_DISPOSITION_RECOVERED) { | ||
265 | /* Platform corrected itself */ | ||
266 | recovered = 1; | ||
267 | } else if (ea && !is_kernel_addr(ea)) { | ||
268 | /* | ||
269 | * Faulting address is not in kernel text. We should be fine. | ||
270 | * We need to find which process uses this address. | ||
271 | * For now, kill the task if we have received exception when | ||
272 | * in userspace. | ||
273 | * | ||
274 | * TODO: Queue up this address for hwpoisioning later. | ||
275 | */ | ||
276 | if (user_mode(regs) && !is_global_init(current)) { | ||
277 | _exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip); | ||
278 | recovered = 1; | ||
279 | } else | ||
280 | recovered = 0; | ||
281 | } else if (user_mode(regs) && !is_global_init(current) && | ||
282 | evt->severity == MCE_SEV_ERROR_SYNC) { | ||
283 | /* | ||
284 | * If we have received a synchronous error when in userspace | ||
285 | * kill the task. | ||
286 | */ | ||
287 | _exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip); | ||
288 | recovered = 1; | ||
289 | } | ||
290 | return recovered; | ||
291 | } | ||
292 | |||
254 | int opal_machine_check(struct pt_regs *regs) | 293 | int opal_machine_check(struct pt_regs *regs) |
255 | { | 294 | { |
256 | struct machine_check_event evt; | 295 | struct machine_check_event evt; |
@@ -266,7 +305,9 @@ int opal_machine_check(struct pt_regs *regs) | |||
266 | } | 305 | } |
267 | machine_check_print_event_info(&evt); | 306 | machine_check_print_event_info(&evt); |
268 | 307 | ||
269 | return evt.severity == MCE_SEV_FATAL ? 0 : 1; | 308 | if (opal_recover_mce(regs, &evt)) |
309 | return 1; | ||
310 | return 0; | ||
270 | } | 311 | } |
271 | 312 | ||
272 | static irqreturn_t opal_interrupt(int irq, void *data) | 313 | static irqreturn_t opal_interrupt(int irq, void *data) |