diff options
Diffstat (limited to 'arch/powerpc/platforms/pseries/ras.c')
| -rw-r--r-- | arch/powerpc/platforms/pseries/ras.c | 102 |
1 files changed, 69 insertions, 33 deletions
diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c index a4fc6da87c2e..c55d7ad9c648 100644 --- a/arch/powerpc/platforms/pseries/ras.c +++ b/arch/powerpc/platforms/pseries/ras.c | |||
| @@ -54,7 +54,8 @@ | |||
| 54 | static unsigned char ras_log_buf[RTAS_ERROR_LOG_MAX]; | 54 | static unsigned char ras_log_buf[RTAS_ERROR_LOG_MAX]; |
| 55 | static DEFINE_SPINLOCK(ras_log_buf_lock); | 55 | static DEFINE_SPINLOCK(ras_log_buf_lock); |
| 56 | 56 | ||
| 57 | static char mce_data_buf[RTAS_ERROR_LOG_MAX]; | 57 | static char global_mce_data_buf[RTAS_ERROR_LOG_MAX]; /* single shared buffer for extended-type RTAS error logs; not protected by any lock */ |
| 58 | static DEFINE_PER_CPU(__u64, mce_data_buf); /* per-cpu 64-bit slot that receives non-extended RTAS error logs */ | ||
| 58 | 59 | ||
| 59 | static int ras_get_sensor_state_token; | 60 | static int ras_get_sensor_state_token; |
| 60 | static int ras_check_exception_token; | 61 | static int ras_check_exception_token; |
| @@ -196,12 +197,24 @@ static irqreturn_t ras_error_interrupt(int irq, void *dev_id) | |||
| 196 | return IRQ_HANDLED; | 197 | return IRQ_HANDLED; |
| 197 | } | 198 | } |
| 198 | 199 | ||
| 199 | /* Get the error information for errors coming through the | 200 | /* |
| 201 | * Some versions of FWNMI place the buffer inside the 4kB page starting at | ||
| 202 | * 0x7000. Other versions place it inside the rtas buffer. We check both; each accepted range stops 16 bytes short of the end of its region. Note that (A) is evaluated up to four times, so pass an expression without side effects. | ||
| 203 | */ | ||
| 204 | #define VALID_FWNMI_BUFFER(A) \ | ||
| 205 | ((((A) >= 0x7000) && ((A) < 0x7ff0)) || \ | ||
| 206 | (((A) >= rtas.base) && ((A) < (rtas.base + rtas.size - 16)))) | ||
| 207 | |||
| 208 | /* | ||
| 209 | * Get the error information for errors coming through the | ||
| 200 | * FWNMI vectors. The pt_regs' r3 will be updated to reflect | 210 | * FWNMI vectors. The pt_regs' r3 will be updated to reflect |
| 201 | * the actual r3 if possible, and a ptr to the error log entry | 211 | * the actual r3 if possible, and a ptr to the error log entry |
| 202 | * will be returned if found. | 212 | * will be returned if found. |
| 203 | * | 213 | * |
| 204 | * The mce_data_buf does not have any locks or protection around it, | 214 | * If the RTAS error is not of the extended type, then we put it in a per |
| 215 | * cpu 64bit buffer. If it is the extended type we use global_mce_data_buf. | ||
| 216 | * | ||
| 217 | * The global_mce_data_buf does not have any locks or protection around it, | ||
| 205 | * if a second machine check comes in, or a system reset is done | 218 | * if a second machine check comes in, or a system reset is done |
| 206 | * before we have logged the error, then we will get corruption in the | 219 | * before we have logged the error, then we will get corruption in the |
| 207 | * error log. This is preferable over holding off on calling | 220 | * error log. This is preferable over holding off on calling |
| @@ -210,20 +223,31 @@ static irqreturn_t ras_error_interrupt(int irq, void *dev_id) | |||
| 210 | */ | 223 | */ |
| 211 | static struct rtas_error_log *fwnmi_get_errinfo(struct pt_regs *regs) | 224 | static struct rtas_error_log *fwnmi_get_errinfo(struct pt_regs *regs) |
| 212 | { | 225 | { |
| 213 | unsigned long errdata = regs->gpr[3]; | ||
| 214 | struct rtas_error_log *errhdr = NULL; | ||
| 215 | unsigned long *savep; | 226 | unsigned long *savep; |
| 227 | struct rtas_error_log *h, *errhdr = NULL; | ||
| 228 | |||
| 229 | if (!VALID_FWNMI_BUFFER(regs->gpr[3])) { | ||
| 230 | printk(KERN_ERR "FWNMI: corrupt r3\n"); | ||
| 231 | return NULL; | ||
| 232 | } | ||
| 216 | 233 | ||
| 217 | if ((errdata >= 0x7000 && errdata < 0x7fff0) || | 234 | savep = __va(regs->gpr[3]); |
| 218 | (errdata >= rtas.base && errdata < rtas.base + rtas.size - 16)) { | 235 | regs->gpr[3] = savep[0]; /* restore original r3 */ |
| 219 | savep = __va(errdata); | 236 | |
| 220 | regs->gpr[3] = savep[0]; /* restore original r3 */ | 237 | /* If it isn't an extended log we can use the per cpu 64bit buffer */ |
| 221 | memset(mce_data_buf, 0, RTAS_ERROR_LOG_MAX); | 238 | h = (struct rtas_error_log *)&savep[1]; |
| 222 | memcpy(mce_data_buf, (char *)(savep + 1), RTAS_ERROR_LOG_MAX); | 239 | if (!h->extended) { |
| 223 | errhdr = (struct rtas_error_log *)mce_data_buf; | 240 | memcpy(&__get_cpu_var(mce_data_buf), h, sizeof(__u64)); |
| 241 | errhdr = (struct rtas_error_log *)&__get_cpu_var(mce_data_buf); | ||
| 224 | } else { | 242 | } else { |
| 225 | printk("FWNMI: corrupt r3\n"); | 243 | int len; |
| 244 | |||
| 245 | len = min_t(int, 8+h->extended_log_length, RTAS_ERROR_LOG_MAX); /* 8 fixed bytes plus the extended log, clamped so the copy never exceeds global_mce_data_buf (was max_t, which overflowed it) */ | ||
| 246 | memset(global_mce_data_buf, 0, RTAS_ERROR_LOG_MAX); | ||
| 247 | memcpy(global_mce_data_buf, h, len); | ||
| 248 | errhdr = (struct rtas_error_log *)global_mce_data_buf; | ||
| 226 | } | 249 | } |
| 250 | |||
| 227 | return errhdr; | 251 | return errhdr; |
| 228 | } | 252 | } |
| 229 | 253 | ||
| @@ -235,7 +259,7 @@ static void fwnmi_release_errinfo(void) | |||
| 235 | { | 259 | { |
| 236 | int ret = rtas_call(rtas_token("ibm,nmi-interlock"), 0, 1, NULL); | 260 | int ret = rtas_call(rtas_token("ibm,nmi-interlock"), 0, 1, NULL); |
| 237 | if (ret != 0) | 261 | if (ret != 0) |
| 238 | printk("FWNMI: nmi-interlock failed: %d\n", ret); | 262 | printk(KERN_ERR "FWNMI: nmi-interlock failed: %d\n", ret); |
| 239 | } | 263 | } |
| 240 | 264 | ||
| 241 | int pSeries_system_reset_exception(struct pt_regs *regs) | 265 | int pSeries_system_reset_exception(struct pt_regs *regs) |
| @@ -259,31 +283,43 @@ int pSeries_system_reset_exception(struct pt_regs *regs) | |||
| 259 | * Return 1 if corrected (or delivered a signal). | 283 | * Return 1 if corrected (or delivered a signal). |
| 260 | * Return 0 if there is nothing we can do. | 284 | * Return 0 if there is nothing we can do. |
| 261 | */ | 285 | */ |
| 262 | static int recover_mce(struct pt_regs *regs, struct rtas_error_log * err) | 286 | static int recover_mce(struct pt_regs *regs, struct rtas_error_log *err) |
| 263 | { | 287 | { |
| 264 | int nonfatal = 0; | 288 | int recovered = 0; /* return value: 1 if corrected or a signal was delivered, 0 if nothing could be done */ |
| 265 | 289 | ||
| 266 | if (err->disposition == RTAS_DISP_FULLY_RECOVERED) { | 290 | if (!(regs->msr & MSR_RI)) { |
| 291 | /* If MSR_RI isn't set, we cannot recover */ | ||
| 292 | recovered = 0; | ||
| 293 | |||
| 294 | } else if (err->disposition == RTAS_DISP_FULLY_RECOVERED) { | ||
| 267 | /* Platform corrected itself */ | 295 | /* Platform corrected itself */ |
| 268 | nonfatal = 1; | 296 | recovered = 1; |
| 269 | } else if ((regs->msr & MSR_RI) && | 297 | |
| 270 | user_mode(regs) && | 298 | } else if (err->disposition == RTAS_DISP_LIMITED_RECOVERY) { |
| 271 | err->severity == RTAS_SEVERITY_ERROR_SYNC && | 299 | /* Platform corrected itself but could be degraded */ |
| 272 | err->disposition == RTAS_DISP_NOT_RECOVERED && | 300 | printk(KERN_ERR "MCE: limited recovery, system may " |
| 273 | err->target == RTAS_TARGET_MEMORY && | 301 | "be degraded\n"); |
| 274 | err->type == RTAS_TYPE_ECC_UNCORR && | 302 | recovered = 1; |
| 275 | !(current->pid == 0 || is_global_init(current))) { | 303 | |
| 276 | /* Kill off a user process with an ECC error */ | 304 | } else if (user_mode(regs) && !is_global_init(current) && |
| 277 | printk(KERN_ERR "MCE: uncorrectable ecc error for pid %d\n", | 305 | err->severity == RTAS_SEVERITY_ERROR_SYNC) { |
| 278 | current->pid); | 306 | |
| 279 | /* XXX something better for ECC error? */ | 307 | /* |
| 280 | _exception(SIGBUS, regs, BUS_ADRERR, regs->nip); | 308 | * If we received a synchronous error when in userspace |
| 281 | nonfatal = 1; | 309 | * kill the task. Firmware may report details of the fail |
| 310 | * asynchronously, so we can't rely on the target and type | ||
| 311 | * fields being valid here. | ||
| 312 | */ | ||
| 313 | printk(KERN_ERR "MCE: uncorrectable error, killing task " | ||
| 314 | "%s:%d\n", current->comm, current->pid); | ||
| 315 | |||
| 316 | _exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip); | ||
| 317 | recovered = 1; | ||
| 282 | } | 318 | } |
| 283 | 319 | ||
| 284 | log_error((char *)err, ERR_TYPE_RTAS_LOG, !nonfatal); | 320 | log_error((char *)err, ERR_TYPE_RTAS_LOG, 0); /* third argument is now a constant 0; the replaced line passed !nonfatal */ |
| 285 | 321 | ||
| 286 | return nonfatal; | 322 | return recovered; |
| 287 | } | 323 | } |
| 288 | 324 | ||
| 289 | /* | 325 | /* |
