aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/kernel/cpu/mcheck/mce_intel.c
diff options
context:
space:
mode:
authorBorislav Petkov <bp@suse.de>2015-01-13 09:08:51 -0500
committerBorislav Petkov <bp@suse.de>2015-02-19 07:24:25 -0500
commit3f2f0680d1161df96a0e8fea16930f1bd487a9cf (patch)
tree29009c1b6dcc24a7dc93ba485983c6ea5f31e0f0 /arch/x86/kernel/cpu/mcheck/mce_intel.c
parent0eac092d8307db61d320f77f9fce40e60b4ffa89 (diff)
x86/MCE/intel: Cleanup CMCI storm logic
Initially, this started with yet another report about a race condition in the CMCI storm adaptive period length thing. Yes, we have to admit, it is fragile and error-prone. So let's simplify it. The simpler logic is: after we enter storm mode, we go straight to polling with CMCI_STORM_INTERVAL, i.e. once a second. We remain in storm mode as long as we see errors being logged while polling. Theoretically, if we see an uninterrupted error stream, we will remain in storm mode indefinitely and keep polling the MSRs. However, when the storm is actually a burst of errors, once we have logged them all, we back out of it after ~5 mins of polling with no more errors logged. If we encounter an error during those 5 minutes, we reset the polling interval to 5 mins. Making machine_check_poll() return a bool denoting whether it has seen an error or not lets us simplify a bunch of code and move the storm handling private to mce_intel.c. Some minor cleanups while at it. Reported-by: Calvin Owens <calvinowens@fb.com> Tested-by: Tony Luck <tony.luck@intel.com> Link: http://lkml.kernel.org/r/1417746575-23299-1-git-send-email-calvinowens@fb.com Signed-off-by: Borislav Petkov <bp@suse.de>
Diffstat (limited to 'arch/x86/kernel/cpu/mcheck/mce_intel.c')
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_intel.c63
1 files changed, 42 insertions, 21 deletions
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index b3c97bafc123..b4a41cf030ed 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -39,6 +39,15 @@
39static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned); 39static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
40 40
41/* 41/*
42 * CMCI storm detection backoff counter
43 *
44 * During storm, we reset this counter to INITIAL_CHECK_INTERVAL in case we've
45 * encountered an error. If not, we decrement it by one. We signal the end of
46 * the CMCI storm when it reaches 0.
47 */
48static DEFINE_PER_CPU(int, cmci_backoff_cnt);
49
50/*
42 * cmci_discover_lock protects against parallel discovery attempts 51 * cmci_discover_lock protects against parallel discovery attempts
43 * which could race against each other. 52 * which could race against each other.
44 */ 53 */
@@ -46,7 +55,7 @@ static DEFINE_RAW_SPINLOCK(cmci_discover_lock);
46 55
47#define CMCI_THRESHOLD 1 56#define CMCI_THRESHOLD 1
48#define CMCI_POLL_INTERVAL (30 * HZ) 57#define CMCI_POLL_INTERVAL (30 * HZ)
49#define CMCI_STORM_INTERVAL (1 * HZ) 58#define CMCI_STORM_INTERVAL (HZ)
50#define CMCI_STORM_THRESHOLD 15 59#define CMCI_STORM_THRESHOLD 15
51 60
52static DEFINE_PER_CPU(unsigned long, cmci_time_stamp); 61static DEFINE_PER_CPU(unsigned long, cmci_time_stamp);
@@ -82,11 +91,21 @@ static int cmci_supported(int *banks)
82 return !!(cap & MCG_CMCI_P); 91 return !!(cap & MCG_CMCI_P);
83} 92}
84 93
85void mce_intel_cmci_poll(void) 94bool mce_intel_cmci_poll(void)
86{ 95{
87 if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE) 96 if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE)
88 return; 97 return false;
89 machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned)); 98
99 /*
100 * Reset the counter if we've logged an error in the last poll
101 * during the storm.
102 */
103 if (machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned)))
104 this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL);
105 else
106 this_cpu_dec(cmci_backoff_cnt);
107
108 return true;
90} 109}
91 110
92void mce_intel_hcpu_update(unsigned long cpu) 111void mce_intel_hcpu_update(unsigned long cpu)
@@ -97,31 +116,32 @@ void mce_intel_hcpu_update(unsigned long cpu)
97 per_cpu(cmci_storm_state, cpu) = CMCI_STORM_NONE; 116 per_cpu(cmci_storm_state, cpu) = CMCI_STORM_NONE;
98} 117}
99 118
100unsigned long mce_intel_adjust_timer(unsigned long interval) 119unsigned long cmci_intel_adjust_timer(unsigned long interval)
101{ 120{
102 int r; 121 if ((this_cpu_read(cmci_backoff_cnt) > 0) &&
103 122 (__this_cpu_read(cmci_storm_state) == CMCI_STORM_ACTIVE)) {
104 if (interval < CMCI_POLL_INTERVAL) 123 mce_notify_irq();
105 return interval; 124 return CMCI_STORM_INTERVAL;
125 }
106 126
107 switch (__this_cpu_read(cmci_storm_state)) { 127 switch (__this_cpu_read(cmci_storm_state)) {
108 case CMCI_STORM_ACTIVE: 128 case CMCI_STORM_ACTIVE:
129
109 /* 130 /*
110 * We switch back to interrupt mode once the poll timer has 131 * We switch back to interrupt mode once the poll timer has
111 * silenced itself. That means no events recorded and the 132 * silenced itself. That means no events recorded and the timer
112 * timer interval is back to our poll interval. 133 * interval is back to our poll interval.
113 */ 134 */
114 __this_cpu_write(cmci_storm_state, CMCI_STORM_SUBSIDED); 135 __this_cpu_write(cmci_storm_state, CMCI_STORM_SUBSIDED);
115 r = atomic_sub_return(1, &cmci_storm_on_cpus); 136 if (!atomic_sub_return(1, &cmci_storm_on_cpus))
116 if (r == 0)
117 pr_notice("CMCI storm subsided: switching to interrupt mode\n"); 137 pr_notice("CMCI storm subsided: switching to interrupt mode\n");
138
118 /* FALLTHROUGH */ 139 /* FALLTHROUGH */
119 140
120 case CMCI_STORM_SUBSIDED: 141 case CMCI_STORM_SUBSIDED:
121 /* 142 /*
122 * We wait for all cpus to go back to SUBSIDED 143 * We wait for all CPUs to go back to SUBSIDED state. When that
123 * state. When that happens we switch back to 144 * happens we switch back to interrupt mode.
124 * interrupt mode.
125 */ 145 */
126 if (!atomic_read(&cmci_storm_on_cpus)) { 146 if (!atomic_read(&cmci_storm_on_cpus)) {
127 __this_cpu_write(cmci_storm_state, CMCI_STORM_NONE); 147 __this_cpu_write(cmci_storm_state, CMCI_STORM_NONE);
@@ -130,10 +150,8 @@ unsigned long mce_intel_adjust_timer(unsigned long interval)
130 } 150 }
131 return CMCI_POLL_INTERVAL; 151 return CMCI_POLL_INTERVAL;
132 default: 152 default:
133 /* 153
134 * We have shiny weather. Let the poll do whatever it 154 /* We have shiny weather. Let the poll do whatever it thinks. */
135 * thinks.
136 */
137 return interval; 155 return interval;
138 } 156 }
139} 157}
@@ -178,7 +196,8 @@ static bool cmci_storm_detect(void)
178 cmci_storm_disable_banks(); 196 cmci_storm_disable_banks();
179 __this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE); 197 __this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE);
180 r = atomic_add_return(1, &cmci_storm_on_cpus); 198 r = atomic_add_return(1, &cmci_storm_on_cpus);
181 mce_timer_kick(CMCI_POLL_INTERVAL); 199 mce_timer_kick(CMCI_STORM_INTERVAL);
200 this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL);
182 201
183 if (r == 1) 202 if (r == 1)
184 pr_notice("CMCI storm detected: switching to poll mode\n"); 203 pr_notice("CMCI storm detected: switching to poll mode\n");
@@ -195,6 +214,7 @@ static void intel_threshold_interrupt(void)
195{ 214{
196 if (cmci_storm_detect()) 215 if (cmci_storm_detect())
197 return; 216 return;
217
198 machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned)); 218 machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned));
199 mce_notify_irq(); 219 mce_notify_irq();
200} 220}
@@ -286,6 +306,7 @@ void cmci_recheck(void)
286 306
287 if (!mce_available(raw_cpu_ptr(&cpu_info)) || !cmci_supported(&banks)) 307 if (!mce_available(raw_cpu_ptr(&cpu_info)) || !cmci_supported(&banks))
288 return; 308 return;
309
289 local_irq_save(flags); 310 local_irq_save(flags);
290 machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned)); 311 machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned));
291 local_irq_restore(flags); 312 local_irq_restore(flags);