aboutsummaryrefslogtreecommitdiffstats
path: root/arch
diff options
context:
space:
mode:
authorAndi Kleen <andi@firstfloor.org>2009-05-27 15:56:54 -0400
committerH. Peter Anvin <hpa@zytor.com>2009-06-03 17:40:39 -0400
commita0189c70e5f17f4253dd7bc575c97469900e23d6 (patch)
treefc9d58a92124812aa0177fcff271f6c33903fafe /arch
parentde8a84d85ad8bb46d01d72ebc57030b95075603c (diff)
x86, mce: remove TSC print heuristic
Previously mce_panic used a simple heuristic to avoid printing old, so far unreported machine check events on an MCE panic. This worked by comparing the TSC value at the start of the machine check handler with the event time stamp and only printing newer ones. This has a couple of issues; in particular, on systems where the TSC is not fully synchronized between CPUs, it could lose events or print old ones. It is also problematic with the full system synchronization that is added by the next patch. Remove the TSC heuristic and replace it with a simple heuristic that prints corrected errors first, then uncorrected errors, and finally the worst machine check as determined by the machine check handler. This simplifies the code because there is no need to pass the original TSC value around. Contains fixes from Ying Huang. [ Impact: bug fix, cleanup ] Signed-off-by: Andi Kleen <ak@linux.intel.com> Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com> Cc: Ying Huang <ying.huang@intel.com> Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Diffstat (limited to 'arch')
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c32
1 files changed, 18 insertions, 14 deletions
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 86806e52fc4..6773610061d 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -158,6 +158,7 @@ void mce_log(struct mce *mce)
158 mcelog.entry[entry].finished = 1; 158 mcelog.entry[entry].finished = 1;
159 wmb(); 159 wmb();
160 160
161 mce->finished = 1;
161 set_bit(0, &notify_user); 162 set_bit(0, &notify_user);
162} 163}
163 164
@@ -190,23 +191,29 @@ static void print_mce(struct mce *m)
190 "and contact your hardware vendor\n"); 191 "and contact your hardware vendor\n");
191} 192}
192 193
193static void mce_panic(char *msg, struct mce *backup, u64 start) 194static void mce_panic(char *msg, struct mce *final)
194{ 195{
195 int i; 196 int i;
196 197
197 bust_spinlocks(1); 198 bust_spinlocks(1);
198 console_verbose(); 199 console_verbose();
200 /* First print corrected ones that are still unlogged */
199 for (i = 0; i < MCE_LOG_LEN; i++) { 201 for (i = 0; i < MCE_LOG_LEN; i++) {
200 u64 tsc = mcelog.entry[i].tsc; 202 struct mce *m = &mcelog.entry[i];
201 203 if ((m->status & MCI_STATUS_VAL) &&
202 if ((s64)(tsc - start) < 0) 204 !(m->status & MCI_STATUS_UC))
205 print_mce(m);
206 }
207 /* Now print uncorrected but with the final one last */
208 for (i = 0; i < MCE_LOG_LEN; i++) {
209 struct mce *m = &mcelog.entry[i];
210 if (!(m->status & MCI_STATUS_VAL))
203 continue; 211 continue;
204 print_mce(&mcelog.entry[i]); 212 if (!final || memcmp(m, final, sizeof(struct mce)))
205 if (backup && mcelog.entry[i].tsc == backup->tsc) 213 print_mce(m);
206 backup = NULL;
207 } 214 }
208 if (backup) 215 if (final)
209 print_mce(backup); 216 print_mce(final);
210 panic(msg); 217 panic(msg);
211} 218}
212 219
@@ -362,7 +369,6 @@ void do_machine_check(struct pt_regs *regs, long error_code)
362{ 369{
363 struct mce m, panicm; 370 struct mce m, panicm;
364 int panicm_found = 0; 371 int panicm_found = 0;
365 u64 mcestart = 0;
366 int i; 372 int i;
367 /* 373 /*
368 * If no_way_out gets set, there is no safe way to recover from this 374 * If no_way_out gets set, there is no safe way to recover from this
@@ -394,7 +400,6 @@ void do_machine_check(struct pt_regs *regs, long error_code)
394 if (!(m.mcgstatus & MCG_STATUS_RIPV)) 400 if (!(m.mcgstatus & MCG_STATUS_RIPV))
395 no_way_out = 1; 401 no_way_out = 1;
396 402
397 rdtscll(mcestart);
398 barrier(); 403 barrier();
399 404
400 for (i = 0; i < banks; i++) { 405 for (i = 0; i < banks; i++) {
@@ -478,7 +483,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
478 * has not set tolerant to an insane level, give up and die. 483 * has not set tolerant to an insane level, give up and die.
479 */ 484 */
480 if (no_way_out && tolerant < 3) 485 if (no_way_out && tolerant < 3)
481 mce_panic("Machine check", &panicm, mcestart); 486 mce_panic("Machine check", &panicm);
482 487
483 /* 488 /*
484 * If the error seems to be unrecoverable, something should be 489 * If the error seems to be unrecoverable, something should be
@@ -506,8 +511,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
506 if (user_space) { 511 if (user_space) {
507 force_sig(SIGBUS, current); 512 force_sig(SIGBUS, current);
508 } else if (panic_on_oops || tolerant < 2) { 513 } else if (panic_on_oops || tolerant < 2) {
509 mce_panic("Uncorrected machine check", 514 mce_panic("Uncorrected machine check", &panicm);
510 &panicm, mcestart);
511 } 515 }
512 } 516 }
513 517