aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/kernel/cpu/mcheck/mce.c
diff options
context:
space:
mode:
author: Tony Luck <tony.luck@intel.com> 2012-04-18 18:19:40 -0400
committer: Tony Luck <tony.luck@intel.com> 2012-04-19 12:12:43 -0400
commit: 95022b8cf6ed7f3292b60c8e85fe59a12bfb1c9e (patch)
tree: d0af3bb590eadc51cf0ad8f08a4b91fad2cd3433 /arch/x86/kernel/cpu/mcheck/mce.c
parent: 0034102808e0dbbf3a2394b82b1bb40b5778de9e (diff)
x86/mce: Avoid reading every machine check bank register twice.
Reading machine check bank registers is slow. There is a trend of increasing the number of banks, and the number of cores. The main section of do_machine_check() is a serialized section where each cpu in turn checks every bank. Even on a little two socket SandyBridge-EP system that multiplies out as: 2 sockets * 8 cores * 2 hyperthreads * 20 banks = 640 MSRs. We already scan the banks in parallel in mce_no_way_out() to see if there is a fatal error anywhere in the system. If we build a cache of VALID bits during this scan, we can avoid uselessly re-reading banks that have no data. Note that this cache is only a hint. If the valid bit is set in a shared bank, all cpus that share that bank will see it during the parallel scan, but the first to find it in the sequential scan will (usually) clear the bank. Acked-by: Borislav Petkov <borislav.petkov@amd.com> Signed-off-by: Tony Luck <tony.luck@intel.com>
Diffstat (limited to 'arch/x86/kernel/cpu/mcheck/mce.c')
-rw-r--r-- arch/x86/kernel/cpu/mcheck/mce.c 16
1 files changed, 11 insertions, 5 deletions
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index d086a09c087..66e1c51be08 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -641,16 +641,18 @@ EXPORT_SYMBOL_GPL(machine_check_poll);
641 * Do a quick check if any of the events requires a panic. 641 * Do a quick check if any of the events requires a panic.
642 * This decides if we keep the events around or clear them. 642 * This decides if we keep the events around or clear them.
643 */ 643 */
644static int mce_no_way_out(struct mce *m, char **msg) 644static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp)
645{ 645{
646 int i; 646 int i, ret = 0;
647 647
648 for (i = 0; i < banks; i++) { 648 for (i = 0; i < banks; i++) {
649 m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); 649 m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
650 if (m->status & MCI_STATUS_VAL)
651 __set_bit(i, validp);
650 if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY) 652 if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
651 return 1; 653 ret = 1;
652 } 654 }
653 return 0; 655 return ret;
654} 656}
655 657
656/* 658/*
@@ -1011,6 +1013,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
1011 */ 1013 */
1012 int kill_it = 0; 1014 int kill_it = 0;
1013 DECLARE_BITMAP(toclear, MAX_NR_BANKS); 1015 DECLARE_BITMAP(toclear, MAX_NR_BANKS);
1016 DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
1014 char *msg = "Unknown"; 1017 char *msg = "Unknown";
1015 1018
1016 atomic_inc(&mce_entry); 1019 atomic_inc(&mce_entry);
@@ -1025,7 +1028,8 @@ void do_machine_check(struct pt_regs *regs, long error_code)
1025 final = &__get_cpu_var(mces_seen); 1028 final = &__get_cpu_var(mces_seen);
1026 *final = m; 1029 *final = m;
1027 1030
1028 no_way_out = mce_no_way_out(&m, &msg); 1031 memset(valid_banks, 0, sizeof(valid_banks));
1032 no_way_out = mce_no_way_out(&m, &msg, valid_banks);
1029 1033
1030 barrier(); 1034 barrier();
1031 1035
@@ -1045,6 +1049,8 @@ void do_machine_check(struct pt_regs *regs, long error_code)
1045 order = mce_start(&no_way_out); 1049 order = mce_start(&no_way_out);
1046 for (i = 0; i < banks; i++) { 1050 for (i = 0; i < banks; i++) {
1047 __clear_bit(i, toclear); 1051 __clear_bit(i, toclear);
1052 if (!test_bit(i, valid_banks))
1053 continue;
1048 if (!mce_banks[i].ctl) 1054 if (!mce_banks[i].ctl)
1049 continue; 1055 continue;
1050 1056