aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Borislav Petkov <bp@suse.de> 2017-01-23 13:35:07 -0500
committer Ingo Molnar <mingo@kernel.org> 2017-01-24 03:14:53 -0500
commit 9b052ea4ced0fa1ad30a2eafe86984a16297e6f1 (patch)
tree a9513da62eb15668ced9662d6e6de66aea99441d
parent d4b2ac63b0eae461fc10c9791084be24724ef57a (diff)
x86/ras/therm_throt: Do not log a fake MCE for thermal events
We log a fake bank 128 MCE to note that we're handling a CPU thermal event. However, this confuses people into thinking that their hardware generates MCEs. Hijacking MCA for logging thermal events is a gross misuse anyway and it shouldn't have been done in the first place. And besides we have other means for dealing with thermal events which are much more suitable.

So let's kill the MCE logging part.

Signed-off-by: Borislav Petkov <bp@suse.de>
Acked-by: Ashok Raj <ashok.raj@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Yazen Ghannam <Yazen.Ghannam@amd.com>
Cc: linux-edac <linux-edac@vger.kernel.org>
Link: http://lkml.kernel.org/r/20170105213846.GA12024@gmail.com
Link: http://lkml.kernel.org/r/20170123183514.13356-3-bp@alien8.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
-rw-r--r-- arch/x86/include/asm/mce.h 6
-rw-r--r-- arch/x86/kernel/cpu/mcheck/mce.c 25
-rw-r--r-- arch/x86/kernel/cpu/mcheck/therm_throt.c 30
3 files changed, 11 insertions, 50 deletions
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 5132f2a6c0a2..a09ed05725c2 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -97,10 +97,6 @@
97 97
98#define MCE_OVERFLOW 0 /* bit 0 in flags means overflow */ 98#define MCE_OVERFLOW 0 /* bit 0 in flags means overflow */
99 99
100/* Software defined banks */
101#define MCE_EXTENDED_BANK 128
102#define MCE_THERMAL_BANK (MCE_EXTENDED_BANK + 0)
103
104#define MCE_LOG_LEN 32 100#define MCE_LOG_LEN 32
105#define MCE_LOG_SIGNATURE "MACHINECHECK" 101#define MCE_LOG_SIGNATURE "MACHINECHECK"
106 102
@@ -306,8 +302,6 @@ extern void (*deferred_error_int_vector)(void);
306 302
307void intel_init_thermal(struct cpuinfo_x86 *c); 303void intel_init_thermal(struct cpuinfo_x86 *c);
308 304
309void mce_log_therm_throt_event(__u64 status);
310
311/* Interrupt Handler for core thermal thresholds */ 305/* Interrupt Handler for core thermal thresholds */
312extern int (*platform_thermal_notify)(__u64 msr_val); 306extern int (*platform_thermal_notify)(__u64 msr_val);
313 307
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 00ef43233e03..6eef6fde0f02 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1331,31 +1331,6 @@ static void mce_process_work(struct work_struct *dummy)
1331 mce_gen_pool_process(); 1331 mce_gen_pool_process();
1332} 1332}
1333 1333
1334#ifdef CONFIG_X86_MCE_INTEL
1335/***
1336 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
1337 * @cpu: The CPU on which the event occurred.
1338 * @status: Event status information
1339 *
1340 * This function should be called by the thermal interrupt after the
1341 * event has been processed and the decision was made to log the event
1342 * further.
1343 *
1344 * The status parameter will be saved to the 'status' field of 'struct mce'
1345 * and historically has been the register value of the
1346 * MSR_IA32_THERMAL_STATUS (Intel) msr.
1347 */
1348void mce_log_therm_throt_event(__u64 status)
1349{
1350 struct mce m;
1351
1352 mce_setup(&m);
1353 m.bank = MCE_THERMAL_BANK;
1354 m.status = status;
1355 mce_log(&m);
1356}
1357#endif /* CONFIG_X86_MCE_INTEL */
1358
1359/* 1334/*
1360 * Periodic polling timer for "silent" machine check errors. If the 1335 * Periodic polling timer for "silent" machine check errors. If the
1361 * poller finds an MCE, poll 2x faster. When the poller finds no more 1336 * poller finds an MCE, poll 2x faster. When the poller finds no more
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 465aca8be009..85469f84c921 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -6,7 +6,7 @@
6 * 6 *
7 * Maintains a counter in /sys that keeps track of the number of thermal 7 * Maintains a counter in /sys that keeps track of the number of thermal
8 * events, such that the user knows how bad the thermal problem might be 8 * events, such that the user knows how bad the thermal problem might be
9 * (since the logging to syslog and mcelog is rate limited). 9 * (since the logging to syslog is rate limited).
10 * 10 *
11 * Author: Dmitriy Zavin (dmitriyz@google.com) 11 * Author: Dmitriy Zavin (dmitriyz@google.com)
12 * 12 *
@@ -141,13 +141,8 @@ static struct attribute_group thermal_attr_group = {
141 * IRQ has been acknowledged. 141 * IRQ has been acknowledged.
142 * 142 *
143 * It will take care of rate limiting and printing messages to the syslog. 143 * It will take care of rate limiting and printing messages to the syslog.
144 *
145 * Returns: 0 : Event should NOT be further logged, i.e. still in
146 * "timeout" from previous log message.
147 * 1 : Event should be logged further, and a message has been
148 * printed to the syslog.
149 */ 144 */
150static int therm_throt_process(bool new_event, int event, int level) 145static void therm_throt_process(bool new_event, int event, int level)
151{ 146{
152 struct _thermal_state *state; 147 struct _thermal_state *state;
153 unsigned int this_cpu = smp_processor_id(); 148 unsigned int this_cpu = smp_processor_id();
@@ -162,16 +157,16 @@ static int therm_throt_process(bool new_event, int event, int level)
162 else if (event == POWER_LIMIT_EVENT) 157 else if (event == POWER_LIMIT_EVENT)
163 state = &pstate->core_power_limit; 158 state = &pstate->core_power_limit;
164 else 159 else
165 return 0; 160 return;
166 } else if (level == PACKAGE_LEVEL) { 161 } else if (level == PACKAGE_LEVEL) {
167 if (event == THERMAL_THROTTLING_EVENT) 162 if (event == THERMAL_THROTTLING_EVENT)
168 state = &pstate->package_throttle; 163 state = &pstate->package_throttle;
169 else if (event == POWER_LIMIT_EVENT) 164 else if (event == POWER_LIMIT_EVENT)
170 state = &pstate->package_power_limit; 165 state = &pstate->package_power_limit;
171 else 166 else
172 return 0; 167 return;
173 } else 168 } else
174 return 0; 169 return;
175 170
176 old_event = state->new_event; 171 old_event = state->new_event;
177 state->new_event = new_event; 172 state->new_event = new_event;
@@ -181,7 +176,7 @@ static int therm_throt_process(bool new_event, int event, int level)
181 176
182 if (time_before64(now, state->next_check) && 177 if (time_before64(now, state->next_check) &&
183 state->count != state->last_count) 178 state->count != state->last_count)
184 return 0; 179 return;
185 180
186 state->next_check = now + CHECK_INTERVAL; 181 state->next_check = now + CHECK_INTERVAL;
187 state->last_count = state->count; 182 state->last_count = state->count;
@@ -193,16 +188,14 @@ static int therm_throt_process(bool new_event, int event, int level)
193 this_cpu, 188 this_cpu,
194 level == CORE_LEVEL ? "Core" : "Package", 189 level == CORE_LEVEL ? "Core" : "Package",
195 state->count); 190 state->count);
196 return 1; 191 return;
197 } 192 }
198 if (old_event) { 193 if (old_event) {
199 if (event == THERMAL_THROTTLING_EVENT) 194 if (event == THERMAL_THROTTLING_EVENT)
200 pr_info("CPU%d: %s temperature/speed normal\n", this_cpu, 195 pr_info("CPU%d: %s temperature/speed normal\n", this_cpu,
201 level == CORE_LEVEL ? "Core" : "Package"); 196 level == CORE_LEVEL ? "Core" : "Package");
202 return 1; 197 return;
203 } 198 }
204
205 return 0;
206} 199}
207 200
208static int thresh_event_valid(int level, int event) 201static int thresh_event_valid(int level, int event)
@@ -365,10 +358,9 @@ static void intel_thermal_interrupt(void)
365 /* Check for violation of core thermal thresholds*/ 358 /* Check for violation of core thermal thresholds*/
366 notify_thresholds(msr_val); 359 notify_thresholds(msr_val);
367 360
368 if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT, 361 therm_throt_process(msr_val & THERM_STATUS_PROCHOT,
369 THERMAL_THROTTLING_EVENT, 362 THERMAL_THROTTLING_EVENT,
370 CORE_LEVEL) != 0) 363 CORE_LEVEL);
371 mce_log_therm_throt_event(msr_val);
372 364
373 if (this_cpu_has(X86_FEATURE_PLN) && int_pln_enable) 365 if (this_cpu_has(X86_FEATURE_PLN) && int_pln_enable)
374 therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT, 366 therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT,