author     Chen Gong <gong.chen@linux.intel.com>   2012-08-09 14:44:51 -0400
committer  Tony Luck <tony.luck@intel.com>         2012-08-09 14:44:51 -0400
commit     55babd8f41f122f5f4c7cebf520c766c983282c6 (patch)
tree       64c2913c55c5d6018051a638557cefcc48bbead6
parent     4670a300a2169e1e922593c5d35b0cdaee112901 (diff)
x86/mce: Add CMCI poll mode
On Intel systems corrected machine check interrupts (CMCI) may be sent
to multiple logical processors; possibly to all processors on the
affected socket (SDM Volume 3B "15.5.1 CMCI Local APIC Interface").
This means that a persistent error (such as a stuck bit in ECC memory)
may cause a storm of interrupts that greatly hinders or prevents
forward progress (probably on many processors).

To solve this we keep track of the rate at which each processor sees
CMCI. If we exceed a threshold, we disable CMCI delivery and switch to
polling the machine check banks. If the storm subsides (none of the
affected processors see any more errors for a complete poll interval)
we re-enable CMCI.

[Tony: Added console messages when storm begins/ends and increased
 storm threshold from 5 to 15 so we have a few more logged entries
 before we disable interrupts and start dropping reports]

Signed-off-by: Chen Gong <gong.chen@linux.intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Chen Gong <gong.chen@linux.intel.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
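[Editor's illustration, not part of the patch: a rough, self-contained sketch of
the rate/threshold idea described above. The names, the plain-seconds clock and
the constants below are invented for the example; the patch itself uses jiffies,
per-CPU variables and cmci_clear()/cmci_reenable(), as shown in the diff.]

    /* Illustrative only; one instance of this state would be kept per CPU. */
    #include <stdbool.h>

    #define STORM_WINDOW_SECS  1   /* count events falling within a 1 second window */
    #define STORM_THRESHOLD    15  /* more than this many events means "storm" */

    struct cmci_rate {
            unsigned long window_start;  /* start of the current window, in seconds */
            unsigned int  count;         /* events seen in the current window */
    };

    /*
     * Record one interrupt at time @now (seconds). Returns true when the rate
     * exceeds the threshold, i.e. the caller should mask the interrupt and
     * fall back to periodic polling until the storm subsides.
     */
    static bool storm_detect(struct cmci_rate *r, unsigned long now)
    {
            if (now - r->window_start <= STORM_WINDOW_SECS) {
                    r->count++;
            } else {
                    /* window expired: restart counting from this event */
                    r->window_start = now;
                    r->count = 1;
            }
            return r->count > STORM_THRESHOLD;
    }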
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-internal.h |  12
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.c          |  47
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_intel.c    | 108
3 files changed, 160 insertions(+), 7 deletions(-)
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index ed44c8a65858..6a05c1d327a9 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -28,6 +28,18 @@ extern int mce_ser;
 
 extern struct mce_bank *mce_banks;
 
+#ifdef CONFIG_X86_MCE_INTEL
+unsigned long mce_intel_adjust_timer(unsigned long interval);
+void mce_intel_cmci_poll(void);
+void mce_intel_hcpu_update(unsigned long cpu);
+#else
+# define mce_intel_adjust_timer mce_adjust_timer_default
+static inline void mce_intel_cmci_poll(void) { }
+static inline void mce_intel_hcpu_update(unsigned long cpu) { }
+#endif
+
+void mce_timer_kick(unsigned long interval);
+
 #ifdef CONFIG_ACPI_APEI
 int apei_write_mce(struct mce *m);
 ssize_t apei_read_mce(struct mce *m, u64 *record_id);
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index b4dde1527edd..8c1beea6cabf 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1260,6 +1260,14 @@ static unsigned long check_interval = 5 * 60; /* 5 minutes */
 static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
 static DEFINE_PER_CPU(struct timer_list, mce_timer);
 
+static unsigned long mce_adjust_timer_default(unsigned long interval)
+{
+        return interval;
+}
+
+static unsigned long (*mce_adjust_timer)(unsigned long interval) =
+        mce_adjust_timer_default;
+
 static void mce_timer_fn(unsigned long data)
 {
         struct timer_list *t = &__get_cpu_var(mce_timer);
@@ -1270,6 +1278,7 @@ static void mce_timer_fn(unsigned long data)
         if (mce_available(__this_cpu_ptr(&cpu_info))) {
                 machine_check_poll(MCP_TIMESTAMP,
                                 &__get_cpu_var(mce_poll_banks));
+                mce_intel_cmci_poll();
         }
 
         /*
@@ -1277,14 +1286,38 @@ static void mce_timer_fn(unsigned long data)
          * polling interval, otherwise increase the polling interval.
          */
         iv = __this_cpu_read(mce_next_interval);
-        if (mce_notify_irq())
+        if (mce_notify_irq()) {
                 iv = max(iv / 2, (unsigned long) HZ/100);
-        else
+        } else {
                 iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
+                iv = mce_adjust_timer(iv);
+        }
         __this_cpu_write(mce_next_interval, iv);
+        /* Might have become 0 after CMCI storm subsided */
+        if (iv) {
+                t->expires = jiffies + iv;
+                add_timer_on(t, smp_processor_id());
+        }
+}
 
-        t->expires = jiffies + iv;
-        add_timer_on(t, smp_processor_id());
+/*
+ * Ensure that the timer is firing in @interval from now.
+ */
+void mce_timer_kick(unsigned long interval)
+{
+        struct timer_list *t = &__get_cpu_var(mce_timer);
+        unsigned long when = jiffies + interval;
+        unsigned long iv = __this_cpu_read(mce_next_interval);
+
+        if (timer_pending(t)) {
+                if (time_before(when, t->expires))
+                        mod_timer_pinned(t, when);
+        } else {
+                t->expires = round_jiffies(when);
+                add_timer_on(t, smp_processor_id());
+        }
+        if (interval < iv)
+                __this_cpu_write(mce_next_interval, interval);
 }
 
 /* Must not be called in IRQ context where del_timer_sync() can deadlock */
@@ -1548,6 +1581,7 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
         switch (c->x86_vendor) {
         case X86_VENDOR_INTEL:
                 mce_intel_feature_init(c);
+                mce_adjust_timer = mce_intel_adjust_timer;
                 break;
         case X86_VENDOR_AMD:
                 mce_amd_feature_init(c);
@@ -1559,7 +1593,7 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
 
 static void mce_start_timer(unsigned int cpu, struct timer_list *t)
 {
-        unsigned long iv = check_interval * HZ;
+        unsigned long iv = mce_adjust_timer(check_interval * HZ);
 
         __this_cpu_write(mce_next_interval, iv);
 
@@ -2272,10 +2306,11 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
                 if (threshold_cpu_callback)
                         threshold_cpu_callback(action, cpu);
                 mce_device_remove(cpu);
+                mce_intel_hcpu_update(cpu);
                 break;
         case CPU_DOWN_PREPARE:
-                del_timer_sync(t);
                 smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
+                del_timer_sync(t);
                 break;
         case CPU_DOWN_FAILED:
                 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index 59648e48a145..098386fed48e 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -15,6 +15,8 @@
 #include <asm/msr.h>
 #include <asm/mce.h>
 
+#include "mce-internal.h"
+
 /*
  * Support for Intel Correct Machine Check Interrupts. This allows
  * the CPU to raise an interrupt when a corrected machine check happened.
@@ -30,7 +32,22 @@ static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
  */
 static DEFINE_RAW_SPINLOCK(cmci_discover_lock);
 
-#define CMCI_THRESHOLD 1
+#define CMCI_THRESHOLD          1
+#define CMCI_POLL_INTERVAL      (30 * HZ)
+#define CMCI_STORM_INTERVAL     (1 * HZ)
+#define CMCI_STORM_THRESHOLD    15
+
+static DEFINE_PER_CPU(unsigned long, cmci_time_stamp);
+static DEFINE_PER_CPU(unsigned int, cmci_storm_cnt);
+static DEFINE_PER_CPU(unsigned int, cmci_storm_state);
+
+enum {
+        CMCI_STORM_NONE,
+        CMCI_STORM_ACTIVE,
+        CMCI_STORM_SUBSIDED,
+};
+
+static atomic_t cmci_storm_on_cpus;
 
 static int cmci_supported(int *banks)
 {
@@ -53,6 +70,93 @@ static int cmci_supported(int *banks)
         return !!(cap & MCG_CMCI_P);
 }
 
+void mce_intel_cmci_poll(void)
+{
+        if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE)
+                return;
+        machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
+}
+
+void mce_intel_hcpu_update(unsigned long cpu)
+{
+        if (per_cpu(cmci_storm_state, cpu) == CMCI_STORM_ACTIVE)
+                atomic_dec(&cmci_storm_on_cpus);
+
+        per_cpu(cmci_storm_state, cpu) = CMCI_STORM_NONE;
+}
+
+unsigned long mce_intel_adjust_timer(unsigned long interval)
+{
+        int r;
+
+        if (interval < CMCI_POLL_INTERVAL)
+                return interval;
+
+        switch (__this_cpu_read(cmci_storm_state)) {
+        case CMCI_STORM_ACTIVE:
+                /*
+                 * We switch back to interrupt mode once the poll timer has
+                 * silenced itself. That means no events recorded and the
+                 * timer interval is back to our poll interval.
+                 */
+                __this_cpu_write(cmci_storm_state, CMCI_STORM_SUBSIDED);
+                r = atomic_sub_return(1, &cmci_storm_on_cpus);
+                if (r == 0)
+                        pr_notice("CMCI storm subsided: switching to interrupt mode\n");
+                /* FALLTHROUGH */
+
+        case CMCI_STORM_SUBSIDED:
+                /*
+                 * We wait for all cpus to go back to SUBSIDED
+                 * state. When that happens we switch back to
+                 * interrupt mode.
+                 */
+                if (!atomic_read(&cmci_storm_on_cpus)) {
+                        __this_cpu_write(cmci_storm_state, CMCI_STORM_NONE);
+                        cmci_reenable();
+                        cmci_recheck();
+                }
+                return CMCI_POLL_INTERVAL;
+        default:
+                /*
+                 * We have shiny weather. Let the poll do whatever it
+                 * thinks.
+                 */
+                return interval;
+        }
+}
+
+static bool cmci_storm_detect(void)
+{
+        unsigned int cnt = __this_cpu_read(cmci_storm_cnt);
+        unsigned long ts = __this_cpu_read(cmci_time_stamp);
+        unsigned long now = jiffies;
+        int r;
+
+        if (__this_cpu_read(cmci_storm_state) != CMCI_STORM_NONE)
+                return true;
+
+        if (time_before_eq(now, ts + CMCI_STORM_INTERVAL)) {
+                cnt++;
+        } else {
+                cnt = 1;
+                __this_cpu_write(cmci_time_stamp, now);
+        }
+        __this_cpu_write(cmci_storm_cnt, cnt);
+
+        if (cnt <= CMCI_STORM_THRESHOLD)
+                return false;
+
+        cmci_clear();
+        __this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE);
+        r = atomic_add_return(1, &cmci_storm_on_cpus);
+        mce_timer_kick(CMCI_POLL_INTERVAL);
+
+        if (r == 1)
+                pr_notice("CMCI storm detected: switching to poll mode\n");
+        return true;
+}
+
 /*
  * The interrupt handler. This is called on every event.
  * Just call the poller directly to log any events.
@@ -61,6 +165,8 @@ static int cmci_supported(int *banks)
  */
 static void intel_threshold_interrupt(void)
 {
+        if (cmci_storm_detect())
+                return;
         machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
         mce_notify_irq();
 }