author	Borislav Petkov <bp@suse.de>	2015-01-13 09:08:51 -0500
committer	Borislav Petkov <bp@suse.de>	2015-02-19 07:24:25 -0500
commit	3f2f0680d1161df96a0e8fea16930f1bd487a9cf (patch)
tree	29009c1b6dcc24a7dc93ba485983c6ea5f31e0f0
parent	0eac092d8307db61d320f77f9fce40e60b4ffa89 (diff)
x86/MCE/intel: Cleanup CMCI storm logic
Initially, this started with yet another report about a race condition in
the CMCI storm adaptive period length thing. Yes, we have to admit, it is
fragile and error prone. So let's simplify it.

The simpler logic is: now, after we enter storm mode, we go straight to
polling with CMCI_STORM_INTERVAL, i.e. once a second. We remain in storm
mode as long as we see errors being logged while polling.

Theoretically, if we see an uninterrupted error stream, we will remain in
storm mode indefinitely and keep polling the MSRs.

However, when the storm is actually a burst of errors, once we have logged
them all, we back out of it after ~5 mins of polling and no more errors
logged. If we encounter an error during those 5 minutes, we reset the
polling interval to 5 mins.

Making machine_check_poll() return a bool which denotes whether it has seen
an error or not lets us simplify a bunch of code and move the storm
handling private to mce_intel.c.

Some minor cleanups while at it.

Reported-by: Calvin Owens <calvinowens@fb.com>
Tested-by: Tony Luck <tony.luck@intel.com>
Link: http://lkml.kernel.org/r/1417746575-23299-1-git-send-email-calvinowens@fb.com
Signed-off-by: Borislav Petkov <bp@suse.de>
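[Editor's illustration, not part of the patch: a minimal standalone C sketch of the backoff behaviour described above. poll_saw_error() is a hypothetical stand-in for machine_check_poll(); during a storm every logged error re-arms a countdown of INITIAL_CHECK_INTERVAL one-second polls, and the storm is considered over only once that countdown drains to zero without further errors.]

/*
 * Userspace simulation of the CMCI storm backoff: poll once a "second",
 * re-arm the countdown whenever an error is seen, leave storm mode after
 * ~5 error-free minutes.
 */
#include <stdbool.h>
#include <stdio.h>

#define INITIAL_CHECK_INTERVAL	(5 * 60)	/* five minutes' worth of 1s polls */

static int cmci_backoff_cnt = INITIAL_CHECK_INTERVAL;	/* armed when the storm is detected */

/* Hypothetical stand-in for machine_check_poll(): errors stream in for 90s. */
static bool poll_saw_error(int second)
{
	return second < 90;
}

int main(void)
{
	int second = 0;

	for (;;) {
		if (poll_saw_error(second))
			cmci_backoff_cnt = INITIAL_CHECK_INTERVAL;	/* error logged: re-arm */
		else if (--cmci_backoff_cnt <= 0)
			break;						/* ~5 error-free minutes: storm over */
		second++;
	}

	printf("storm subsided after %d one-second polls\n", second);
	return 0;
}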
-rw-r--r--  arch/x86/include/asm/mce.h                 |  8
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-internal.h  |  9
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.c           | 86
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_intel.c     | 63
4 files changed, 96 insertions(+), 70 deletions(-)
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 51b26e895933..13eeea518233 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -183,11 +183,11 @@ typedef DECLARE_BITMAP(mce_banks_t, MAX_NR_BANKS);
 DECLARE_PER_CPU(mce_banks_t, mce_poll_banks);
 
 enum mcp_flags {
-	MCP_TIMESTAMP	= (1 << 0),	/* log time stamp */
-	MCP_UC		= (1 << 1),	/* log uncorrected errors */
-	MCP_DONTLOG	= (1 << 2),	/* only clear, don't log */
+	MCP_TIMESTAMP	= BIT(0),	/* log time stamp */
+	MCP_UC		= BIT(1),	/* log uncorrected errors */
+	MCP_DONTLOG	= BIT(2),	/* only clear, don't log */
 };
-void machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
+bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
 
 int mce_notify_irq(void);
 void mce_notify_process(void);
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index 10b46906767f..e12f0bfb45c1 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -14,6 +14,7 @@ enum severity_level {
 };
 
 #define ATTR_LEN		16
+#define INITIAL_CHECK_INTERVAL	5 * 60 /* 5 minutes */
 
 /* One object for each MCE bank, shared by all CPUs */
 struct mce_bank {
@@ -30,13 +31,13 @@ extern struct mce_bank *mce_banks;
 extern mce_banks_t mce_banks_ce_disabled;
 
 #ifdef CONFIG_X86_MCE_INTEL
-unsigned long mce_intel_adjust_timer(unsigned long interval);
-void mce_intel_cmci_poll(void);
+unsigned long cmci_intel_adjust_timer(unsigned long interval);
+bool mce_intel_cmci_poll(void);
 void mce_intel_hcpu_update(unsigned long cpu);
 void cmci_disable_bank(int bank);
 #else
-# define mce_intel_adjust_timer mce_adjust_timer_default
-static inline void mce_intel_cmci_poll(void) { }
+# define cmci_intel_adjust_timer mce_adjust_timer_default
+static inline bool mce_intel_cmci_poll(void) { return false; }
 static inline void mce_intel_hcpu_update(unsigned long cpu) { }
 static inline void cmci_disable_bank(int bank) { }
 #endif
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index d2c611699cd9..d60cbb8d78f7 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -58,7 +58,7 @@ static DEFINE_MUTEX(mce_chrdev_read_mutex);
 #define CREATE_TRACE_POINTS
 #include <trace/events/mce.h>
 
-#define SPINUNIT 100	/* 100ns */
+#define SPINUNIT		100	/* 100ns */
 
 DEFINE_PER_CPU(unsigned, mce_exception_count);
 
@@ -87,9 +87,6 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
 static DEFINE_PER_CPU(struct mce, mces_seen);
 static int cpu_missing;
 
-/* CMCI storm detection filter */
-static DEFINE_PER_CPU(unsigned long, mce_polled_error);
-
 /*
  * MCA banks polled by the period polling timer for corrected events.
  * With Intel CMCI, this only has MCA banks which do not support CMCI (if any).
@@ -623,8 +620,9 @@ DEFINE_PER_CPU(unsigned, mce_poll_count);
  * is already totally * confused. In this case it's likely it will
  * not fully execute the machine check handler either.
  */
-void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
+bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 {
+	bool error_logged = false;
 	struct mce m;
 	int severity;
 	int i;
@@ -647,7 +645,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 		if (!(m.status & MCI_STATUS_VAL))
 			continue;
 
-		this_cpu_write(mce_polled_error, 1);
+
 		/*
 		 * Uncorrected or signalled events are handled by the exception
 		 * handler when it is enabled, so don't process those here.
@@ -680,8 +678,10 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 		 * Don't get the IP here because it's unlikely to
 		 * have anything to do with the actual error location.
 		 */
-		if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce)
+		if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce) {
+			error_logged = true;
 			mce_log(&m);
+		}
 
 		/*
 		 * Clear state for this bank.
@@ -695,6 +695,8 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 	 */
 
 	sync_core();
+
+	return error_logged;
 }
 EXPORT_SYMBOL_GPL(machine_check_poll);
 
@@ -1311,7 +1313,7 @@ void mce_log_therm_throt_event(__u64 status)
  * poller finds an MCE, poll 2x faster. When the poller finds no more
  * errors, poll 2x slower (up to check_interval seconds).
  */
-static unsigned long check_interval = 5 * 60; /* 5 minutes */
+static unsigned long check_interval = INITIAL_CHECK_INTERVAL;
 
 static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
 static DEFINE_PER_CPU(struct timer_list, mce_timer);
@@ -1321,49 +1323,57 @@ static unsigned long mce_adjust_timer_default(unsigned long interval)
 	return interval;
 }
 
-static unsigned long (*mce_adjust_timer)(unsigned long interval) =
-	mce_adjust_timer_default;
+static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default;
 
-static int cmc_error_seen(void)
+static void __restart_timer(struct timer_list *t, unsigned long interval)
 {
-	unsigned long *v = this_cpu_ptr(&mce_polled_error);
+	unsigned long when = jiffies + interval;
+	unsigned long flags;
 
-	return test_and_clear_bit(0, v);
+	local_irq_save(flags);
+
+	if (timer_pending(t)) {
+		if (time_before(when, t->expires))
+			mod_timer_pinned(t, when);
+	} else {
+		t->expires = round_jiffies(when);
+		add_timer_on(t, smp_processor_id());
+	}
+
+	local_irq_restore(flags);
 }
 
 static void mce_timer_fn(unsigned long data)
 {
 	struct timer_list *t = this_cpu_ptr(&mce_timer);
+	int cpu = smp_processor_id();
 	unsigned long iv;
-	int notify;
 
-	WARN_ON(smp_processor_id() != data);
+	WARN_ON(cpu != data);
+
+	iv = __this_cpu_read(mce_next_interval);
 
 	if (mce_available(this_cpu_ptr(&cpu_info))) {
-		machine_check_poll(MCP_TIMESTAMP,
-				this_cpu_ptr(&mce_poll_banks));
-		mce_intel_cmci_poll();
+		machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_poll_banks));
+
+		if (mce_intel_cmci_poll()) {
+			iv = mce_adjust_timer(iv);
+			goto done;
+		}
 	}
 
 	/*
-	 * Alert userspace if needed. If we logged an MCE, reduce the
-	 * polling interval, otherwise increase the polling interval.
+	 * Alert userspace if needed. If we logged an MCE, reduce the polling
+	 * interval, otherwise increase the polling interval.
 	 */
-	iv = __this_cpu_read(mce_next_interval);
-	notify = mce_notify_irq();
-	notify |= cmc_error_seen();
-	if (notify) {
+	if (mce_notify_irq())
 		iv = max(iv / 2, (unsigned long) HZ/100);
-	} else {
+	else
 		iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
-		iv = mce_adjust_timer(iv);
-	}
+
+done:
 	__this_cpu_write(mce_next_interval, iv);
-	/* Might have become 0 after CMCI storm subsided */
-	if (iv) {
-		t->expires = jiffies + iv;
-		add_timer_on(t, smp_processor_id());
-	}
+	__restart_timer(t, iv);
 }
 
 /*
@@ -1372,16 +1382,10 @@ static void mce_timer_fn(unsigned long data)
 void mce_timer_kick(unsigned long interval)
 {
 	struct timer_list *t = this_cpu_ptr(&mce_timer);
-	unsigned long when = jiffies + interval;
 	unsigned long iv = __this_cpu_read(mce_next_interval);
 
-	if (timer_pending(t)) {
-		if (time_before(when, t->expires))
-			mod_timer_pinned(t, when);
-	} else {
-		t->expires = round_jiffies(when);
-		add_timer_on(t, smp_processor_id());
-	}
+	__restart_timer(t, interval);
+
 	if (interval < iv)
 		__this_cpu_write(mce_next_interval, interval);
 }
@@ -1682,7 +1686,7 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
 	switch (c->x86_vendor) {
 	case X86_VENDOR_INTEL:
 		mce_intel_feature_init(c);
-		mce_adjust_timer = mce_intel_adjust_timer;
+		mce_adjust_timer = cmci_intel_adjust_timer;
 		break;
 	case X86_VENDOR_AMD:
 		mce_amd_feature_init(c);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index b3c97bafc123..b4a41cf030ed 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -39,6 +39,15 @@
 static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
 
 /*
+ * CMCI storm detection backoff counter
+ *
+ * During storm, we reset this counter to INITIAL_CHECK_INTERVAL in case we've
+ * encountered an error. If not, we decrement it by one. We signal the end of
+ * the CMCI storm when it reaches 0.
+ */
+static DEFINE_PER_CPU(int, cmci_backoff_cnt);
+
+/*
  * cmci_discover_lock protects against parallel discovery attempts
  * which could race against each other.
  */
@@ -46,7 +55,7 @@ static DEFINE_RAW_SPINLOCK(cmci_discover_lock);
 
 #define CMCI_THRESHOLD		1
 #define CMCI_POLL_INTERVAL	(30 * HZ)
-#define CMCI_STORM_INTERVAL	(1 * HZ)
+#define CMCI_STORM_INTERVAL	(HZ)
 #define CMCI_STORM_THRESHOLD	15
 
 static DEFINE_PER_CPU(unsigned long, cmci_time_stamp);
@@ -82,11 +91,21 @@ static int cmci_supported(int *banks)
 	return !!(cap & MCG_CMCI_P);
 }
 
-void mce_intel_cmci_poll(void)
+bool mce_intel_cmci_poll(void)
 {
 	if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE)
-		return;
-	machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned));
+		return false;
+
+	/*
+	 * Reset the counter if we've logged an error in the last poll
+	 * during the storm.
+	 */
+	if (machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned)))
+		this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL);
+	else
+		this_cpu_dec(cmci_backoff_cnt);
+
+	return true;
 }
 
 void mce_intel_hcpu_update(unsigned long cpu)
@@ -97,31 +116,32 @@ void mce_intel_hcpu_update(unsigned long cpu)
 	per_cpu(cmci_storm_state, cpu) = CMCI_STORM_NONE;
 }
 
-unsigned long mce_intel_adjust_timer(unsigned long interval)
+unsigned long cmci_intel_adjust_timer(unsigned long interval)
 {
-	int r;
-
-	if (interval < CMCI_POLL_INTERVAL)
-		return interval;
+	if ((this_cpu_read(cmci_backoff_cnt) > 0) &&
+	    (__this_cpu_read(cmci_storm_state) == CMCI_STORM_ACTIVE)) {
+		mce_notify_irq();
+		return CMCI_STORM_INTERVAL;
+	}
 
 	switch (__this_cpu_read(cmci_storm_state)) {
 	case CMCI_STORM_ACTIVE:
+
 		/*
 		 * We switch back to interrupt mode once the poll timer has
-		 * silenced itself. That means no events recorded and the
-		 * timer interval is back to our poll interval.
+		 * silenced itself. That means no events recorded and the timer
+		 * interval is back to our poll interval.
 		 */
 		__this_cpu_write(cmci_storm_state, CMCI_STORM_SUBSIDED);
-		r = atomic_sub_return(1, &cmci_storm_on_cpus);
-		if (r == 0)
+		if (!atomic_sub_return(1, &cmci_storm_on_cpus))
 			pr_notice("CMCI storm subsided: switching to interrupt mode\n");
+
 		/* FALLTHROUGH */
 
 	case CMCI_STORM_SUBSIDED:
 		/*
-		 * We wait for all cpus to go back to SUBSIDED
-		 * state. When that happens we switch back to
-		 * interrupt mode.
+		 * We wait for all CPUs to go back to SUBSIDED state. When that
+		 * happens we switch back to interrupt mode.
 		 */
 		if (!atomic_read(&cmci_storm_on_cpus)) {
 			__this_cpu_write(cmci_storm_state, CMCI_STORM_NONE);
@@ -130,10 +150,8 @@ unsigned long mce_intel_adjust_timer(unsigned long interval)
 		}
 		return CMCI_POLL_INTERVAL;
 	default:
-		/*
-		 * We have shiny weather. Let the poll do whatever it
-		 * thinks.
-		 */
+
+		/* We have shiny weather. Let the poll do whatever it thinks. */
 		return interval;
 	}
 }
@@ -178,7 +196,8 @@ static bool cmci_storm_detect(void)
 	cmci_storm_disable_banks();
 	__this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE);
 	r = atomic_add_return(1, &cmci_storm_on_cpus);
-	mce_timer_kick(CMCI_POLL_INTERVAL);
+	mce_timer_kick(CMCI_STORM_INTERVAL);
+	this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL);
 
 	if (r == 1)
 		pr_notice("CMCI storm detected: switching to poll mode\n");
@@ -195,6 +214,7 @@ static void intel_threshold_interrupt(void)
 {
 	if (cmci_storm_detect())
 		return;
+
 	machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned));
 	mce_notify_irq();
 }
@@ -286,6 +306,7 @@ void cmci_recheck(void)
 
 	if (!mce_available(raw_cpu_ptr(&cpu_info)) || !cmci_supported(&banks))
 		return;
+
 	local_irq_save(flags);
 	machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned));
 	local_irq_restore(flags);