aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Borislav Petkov <bp@suse.de> 2016-11-10 08:10:53 -0500
committer Ingo Molnar <mingo@kernel.org> 2016-11-11 02:08:24 -0500
commit 54467353a96577f840cd2348981417c559b21b4b (patch)
tree dc4a04116165c0f65dcce84ef5db9429fa67b51e
parent c09a8c40e0a0b4994925ac8eba91b85d76f440a3 (diff)
x86/MCE: Correct TSC timestamping of error records
We did have logic in the MCE code which would TSC-timestamp an error record only when it is exact - i.e., when it wasn't detected by polling. This isn't the case anymore. So let's fix that:

We have a valid TSC timestamp in the error record only when it has been a precise detection, i.e., either in the #MC handler or in one of the interrupt handlers (thresholding, deferred, ...). All other error records still have mce.time which contains the wall time in order to be able to place the error record in time at least approximately.

Also, this fixes another bug where machine_check_poll() would clear mce.tsc unconditionally even if we requested precise MCP_TIMESTAMP logging.

The proper fix would be to generate timestamp only when it has been requested and not always. But that would require a more thorough code audit of all mce_gather_info/mce_setup() users. Add a FIXME for now.

Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony <tony.luck@intel.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: kernel test robot <xiaolong.ye@intel.com>
Cc: linux-edac <linux-edac@vger.kernel.org>
Cc: lkp@01.org
Link: http://lkml.kernel.org/r/20161110131053.kybsijfs5venpjnf@pd.tnic
Signed-off-by: Ingo Molnar <mingo@kernel.org>
-rw-r--r-- arch/x86/kernel/cpu/mcheck/mce.c 16
-rw-r--r-- arch/x86/kernel/cpu/mcheck/mce_intel.c 4
2 files changed, 12 insertions, 8 deletions
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 4ca00474804b..aab96f8d52b0 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -706,6 +706,15 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
706 706
707 mce_gather_info(&m, NULL); 707 mce_gather_info(&m, NULL);
708 708
709 /*
710 * m.tsc was set in mce_setup(). Clear it if not requested.
711 *
712 * FIXME: Propagate @flags to mce_gather_info/mce_setup() to avoid
713 * that dance.
714 */
715 if (!(flags & MCP_TIMESTAMP))
716 m.tsc = 0;
717
709 for (i = 0; i < mca_cfg.banks; i++) { 718 for (i = 0; i < mca_cfg.banks; i++) {
710 if (!mce_banks[i].ctl || !test_bit(i, *b)) 719 if (!mce_banks[i].ctl || !test_bit(i, *b))
711 continue; 720 continue;
@@ -713,14 +722,12 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
713 m.misc = 0; 722 m.misc = 0;
714 m.addr = 0; 723 m.addr = 0;
715 m.bank = i; 724 m.bank = i;
716 m.tsc = 0;
717 725
718 barrier(); 726 barrier();
719 m.status = mce_rdmsrl(msr_ops.status(i)); 727 m.status = mce_rdmsrl(msr_ops.status(i));
720 if (!(m.status & MCI_STATUS_VAL)) 728 if (!(m.status & MCI_STATUS_VAL))
721 continue; 729 continue;
722 730
723
724 /* 731 /*
725 * Uncorrected or signalled events are handled by the exception 732 * Uncorrected or signalled events are handled by the exception
726 * handler when it is enabled, so don't process those here. 733 * handler when it is enabled, so don't process those here.
@@ -735,9 +742,6 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
735 742
736 mce_read_aux(&m, i); 743 mce_read_aux(&m, i);
737 744
738 if (!(flags & MCP_TIMESTAMP))
739 m.tsc = 0;
740
741 severity = mce_severity(&m, mca_cfg.tolerant, NULL, false); 745 severity = mce_severity(&m, mca_cfg.tolerant, NULL, false);
742 746
743 if (severity == MCE_DEFERRED_SEVERITY && memory_error(&m)) 747 if (severity == MCE_DEFERRED_SEVERITY && memory_error(&m))
@@ -1394,7 +1398,7 @@ static void mce_timer_fn(unsigned long data)
1394 iv = __this_cpu_read(mce_next_interval); 1398 iv = __this_cpu_read(mce_next_interval);
1395 1399
1396 if (mce_available(this_cpu_ptr(&cpu_info))) { 1400 if (mce_available(this_cpu_ptr(&cpu_info))) {
1397 machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_poll_banks)); 1401 machine_check_poll(0, this_cpu_ptr(&mce_poll_banks));
1398 1402
1399 if (mce_intel_cmci_poll()) { 1403 if (mce_intel_cmci_poll()) {
1400 iv = mce_adjust_timer(iv); 1404 iv = mce_adjust_timer(iv);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index 1defb8ea882c..be0b2fad47c5 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -130,7 +130,7 @@ bool mce_intel_cmci_poll(void)
130 * Reset the counter if we've logged an error in the last poll 130 * Reset the counter if we've logged an error in the last poll
131 * during the storm. 131 * during the storm.
132 */ 132 */
133 if (machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned))) 133 if (machine_check_poll(0, this_cpu_ptr(&mce_banks_owned)))
134 this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL); 134 this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL);
135 else 135 else
136 this_cpu_dec(cmci_backoff_cnt); 136 this_cpu_dec(cmci_backoff_cnt);
@@ -342,7 +342,7 @@ void cmci_recheck(void)
342 return; 342 return;
343 343
344 local_irq_save(flags); 344 local_irq_save(flags);
345 machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned)); 345 machine_check_poll(0, this_cpu_ptr(&mce_banks_owned));
346 local_irq_restore(flags); 346 local_irq_restore(flags);
347} 347}
348 348