aboutsummaryrefslogtreecommitdiffstats
path: root/arch/powerpc/platforms/pseries
diff options
context:
space:
mode:
author: Anton Blanchard <anton@samba.org> 2011-01-11 14:49:19 -0500
committer: Benjamin Herrenschmidt <benh@kernel.crashing.org> 2011-01-20 22:08:38 -0500
commit: d47d1d8af52e37bcf9059dd86878474e5ccc9c2a (patch)
tree: c8af975e293d17c96f06913db78cac6041e01a47 /arch/powerpc/platforms/pseries
parent: e49b1fae0ba4d06b29bd753a961abb447566bf4a (diff)
powerpc: Rework pseries machine check handler
Rework pseries machine check handler:

- If MSR_RI isn't set, we cannot recover even if the machine check was fully recovered
- Rename nonfatal to recovered
- Handle RTAS_DISP_LIMITED_RECOVERY
- Use BUS_MCEERR_AR instead of BUS_ADRERR
- Don't check all the RTAS error log fields when receiving a synchronous machine check. Recent versions of the pseries firmware do not fill them in during a machine check and instead send a follow up error log with the detailed information. If we see a synchronous machine check, and we came from userspace then kill the task.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Diffstat (limited to 'arch/powerpc/platforms/pseries')
-rw-r--r-- arch/powerpc/platforms/pseries/ras.c | 48
1 files changed, 30 insertions, 18 deletions
diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c
index 048e25711c6b..d194150cf342 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -259,31 +259,43 @@ int pSeries_system_reset_exception(struct pt_regs *regs)
259 * Return 1 if corrected (or delivered a signal). 259 * Return 1 if corrected (or delivered a signal).
260 * Return 0 if there is nothing we can do. 260 * Return 0 if there is nothing we can do.
261 */ 261 */
262static int recover_mce(struct pt_regs *regs, struct rtas_error_log * err) 262static int recover_mce(struct pt_regs *regs, struct rtas_error_log *err)
263{ 263{
264 int nonfatal = 0; 264 int recovered = 0;
265 265
266 if (err->disposition == RTAS_DISP_FULLY_RECOVERED) { 266 if (!(regs->msr & MSR_RI)) {
267 /* If MSR_RI isn't set, we cannot recover */
268 recovered = 0;
269
270 } else if (err->disposition == RTAS_DISP_FULLY_RECOVERED) {
267 /* Platform corrected itself */ 271 /* Platform corrected itself */
268 nonfatal = 1; 272 recovered = 1;
269 } else if ((regs->msr & MSR_RI) && 273
270 user_mode(regs) && 274 } else if (err->disposition == RTAS_DISP_LIMITED_RECOVERY) {
271 err->severity == RTAS_SEVERITY_ERROR_SYNC && 275 /* Platform corrected itself but could be degraded */
272 err->disposition == RTAS_DISP_NOT_RECOVERED && 276 printk(KERN_ERR "MCE: limited recovery, system may "
273 err->target == RTAS_TARGET_MEMORY && 277 "be degraded\n");
274 err->type == RTAS_TYPE_ECC_UNCORR && 278 recovered = 1;
275 !(current->pid == 0 || is_global_init(current))) { 279
276 /* Kill off a user process with an ECC error */ 280 } else if (user_mode(regs) && !is_global_init(current) &&
277 printk(KERN_ERR "MCE: uncorrectable ecc error for pid %d\n", 281 err->severity == RTAS_SEVERITY_ERROR_SYNC) {
278 current->pid); 282
279 /* XXX something better for ECC error? */ 283 /*
280 _exception(SIGBUS, regs, BUS_ADRERR, regs->nip); 284 * If we received a synchronous error when in userspace
281 nonfatal = 1; 285 * kill the task. Firmware may report details of the fail
286 * asynchronously, so we can't rely on the target and type
287 * fields being valid here.
288 */
289 printk(KERN_ERR "MCE: uncorrectable error, killing task "
290 "%s:%d\n", current->comm, current->pid);
291
292 _exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip);
293 recovered = 1;
282 } 294 }
283 295
284 log_error((char *)err, ERR_TYPE_RTAS_LOG, 0); 296 log_error((char *)err, ERR_TYPE_RTAS_LOG, 0);
285 297
286 return nonfatal; 298 return recovered;
287} 299}
288 300
289/* 301/*