diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2012-10-01 14:12:13 -0400 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2012-10-01 14:12:13 -0400 |
| commit | 7687b80a4f5a178fe292d071c91ebf273ebe12db (patch) | |
| tree | cc5091575aed17806f7cb1efb1ceec86c39ff8b9 | |
| parent | ac07f5c3cb0cf19258c55cdf210aa4ac91ca7330 (diff) | |
| parent | 39ba5010d349109e53eaf9819bebe3f501bb5edf (diff) | |
Merge branch 'x86-mce-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86/MCE update from Ingo Molnar:
"Various MCE robustness enhancements.
One of the changes adds CMCI (Corrected Machine Check Interrupt) poll
mode on Intel Nehalem+ CPUs, a mode that is automatically entered when
the rate of messages is too high - and exited once the storm is over.
An MCE event storm will roughly look like this:
[ 5342.740616] mce: [Hardware Error]: Machine check events logged
[ 5342.746501] mce: [Hardware Error]: Machine check events logged
[ 5342.757971] CMCI storm detected: switching to poll mode
[ 5372.674957] CMCI storm subsided: switching to interrupt mode
This should make such events more survivable"
* 'x86-mce-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
x86/mce: Provide boot argument to honour bios-set CMCI threshold
x86, MCE: Remove unused defines
x86, mce: Enable MCA support by default
x86/mce: Add CMCI poll mode
x86/mce: Make cmci_discover() quiet
x86: mce: Remove the frozen cases in the hotplug code
x86: mce: Split timer init
x86: mce: Serialize mce injection
x86: mce: Disable preemption when calling raise_local()
| -rw-r--r-- | Documentation/x86/x86_64/boot-options.txt | 7 | ||||
| -rw-r--r-- | arch/x86/Kconfig | 1 | ||||
| -rw-r--r-- | arch/x86/include/asm/mce.h | 13 | ||||
| -rw-r--r-- | arch/x86/kernel/cpu/mcheck/mce-inject.c | 8 | ||||
| -rw-r--r-- | arch/x86/kernel/cpu/mcheck/mce-internal.h | 12 | ||||
| -rw-r--r-- | arch/x86/kernel/cpu/mcheck/mce.c | 94 | ||||
| -rw-r--r-- | arch/x86/kernel/cpu/mcheck/mce_intel.c | 168 |
7 files changed, 244 insertions(+), 59 deletions(-)
diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt index c54b4f503e2a..de38429beb71 100644 --- a/Documentation/x86/x86_64/boot-options.txt +++ b/Documentation/x86/x86_64/boot-options.txt | |||
| @@ -50,6 +50,13 @@ Machine check | |||
| 50 | monarchtimeout: | 50 | monarchtimeout: |
| 51 | Sets the time in us to wait for other CPUs on machine checks. 0 | 51 | Sets the time in us to wait for other CPUs on machine checks. 0 |
| 52 | to disable. | 52 | to disable. |
| 53 | mce=bios_cmci_threshold | ||
| 54 | Don't overwrite the bios-set CMCI threshold. This boot option | ||
| 55 | prevents Linux from overwriting the CMCI threshold set by the | ||
| 56 | bios. Without this option, Linux always sets the CMCI | ||
| 57 | threshold to 1. Enabling this may make memory predictive failure | ||
| 58 | analysis less effective if the bios sets thresholds for memory | ||
| 59 | errors since we will not see details for all errors. | ||
| 53 | 60 | ||
| 54 | nomce (for compatibility with i386): same as mce=off | 61 | nomce (for compatibility with i386): same as mce=off |
| 55 | 62 | ||
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 57fecc1db94d..6cd6f24e1223 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig | |||
| @@ -874,6 +874,7 @@ config X86_REROUTE_FOR_BROKEN_BOOT_IRQS | |||
| 874 | 874 | ||
| 875 | config X86_MCE | 875 | config X86_MCE |
| 876 | bool "Machine Check / overheating reporting" | 876 | bool "Machine Check / overheating reporting" |
| 877 | default y | ||
| 877 | ---help--- | 878 | ---help--- |
| 878 | Machine Check support allows the processor to notify the | 879 | Machine Check support allows the processor to notify the |
| 879 | kernel if it detects a problem (e.g. overheating, data corruption). | 880 | kernel if it detects a problem (e.g. overheating, data corruption). |
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index a3ac52b29cbf..54d73b1f00a0 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h | |||
| @@ -116,19 +116,9 @@ struct mce_log { | |||
| 116 | /* Software defined banks */ | 116 | /* Software defined banks */ |
| 117 | #define MCE_EXTENDED_BANK 128 | 117 | #define MCE_EXTENDED_BANK 128 |
| 118 | #define MCE_THERMAL_BANK MCE_EXTENDED_BANK + 0 | 118 | #define MCE_THERMAL_BANK MCE_EXTENDED_BANK + 0 |
| 119 | 119 | #define K8_MCE_THRESHOLD_BASE (MCE_EXTENDED_BANK + 1) | |
| 120 | #define K8_MCE_THRESHOLD_BASE (MCE_EXTENDED_BANK + 1) /* MCE_AMD */ | ||
| 121 | #define K8_MCE_THRESHOLD_BANK_0 (MCE_THRESHOLD_BASE + 0 * 9) | ||
| 122 | #define K8_MCE_THRESHOLD_BANK_1 (MCE_THRESHOLD_BASE + 1 * 9) | ||
| 123 | #define K8_MCE_THRESHOLD_BANK_2 (MCE_THRESHOLD_BASE + 2 * 9) | ||
| 124 | #define K8_MCE_THRESHOLD_BANK_3 (MCE_THRESHOLD_BASE + 3 * 9) | ||
| 125 | #define K8_MCE_THRESHOLD_BANK_4 (MCE_THRESHOLD_BASE + 4 * 9) | ||
| 126 | #define K8_MCE_THRESHOLD_BANK_5 (MCE_THRESHOLD_BASE + 5 * 9) | ||
| 127 | #define K8_MCE_THRESHOLD_DRAM_ECC (MCE_THRESHOLD_BANK_4 + 0) | ||
| 128 | |||
| 129 | 120 | ||
| 130 | #ifdef __KERNEL__ | 121 | #ifdef __KERNEL__ |
| 131 | |||
| 132 | extern void mce_register_decode_chain(struct notifier_block *nb); | 122 | extern void mce_register_decode_chain(struct notifier_block *nb); |
| 133 | extern void mce_unregister_decode_chain(struct notifier_block *nb); | 123 | extern void mce_unregister_decode_chain(struct notifier_block *nb); |
| 134 | 124 | ||
| @@ -171,6 +161,7 @@ DECLARE_PER_CPU(struct device *, mce_device); | |||
| 171 | #ifdef CONFIG_X86_MCE_INTEL | 161 | #ifdef CONFIG_X86_MCE_INTEL |
| 172 | extern int mce_cmci_disabled; | 162 | extern int mce_cmci_disabled; |
| 173 | extern int mce_ignore_ce; | 163 | extern int mce_ignore_ce; |
| 164 | extern int mce_bios_cmci_threshold; | ||
| 174 | void mce_intel_feature_init(struct cpuinfo_x86 *c); | 165 | void mce_intel_feature_init(struct cpuinfo_x86 *c); |
| 175 | void cmci_clear(void); | 166 | void cmci_clear(void); |
| 176 | void cmci_reenable(void); | 167 | void cmci_reenable(void); |
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index fc4beb393577..ddc72f839332 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c | |||
| @@ -78,6 +78,7 @@ static void raise_exception(struct mce *m, struct pt_regs *pregs) | |||
| 78 | } | 78 | } |
| 79 | 79 | ||
| 80 | static cpumask_var_t mce_inject_cpumask; | 80 | static cpumask_var_t mce_inject_cpumask; |
| 81 | static DEFINE_MUTEX(mce_inject_mutex); | ||
| 81 | 82 | ||
| 82 | static int mce_raise_notify(unsigned int cmd, struct pt_regs *regs) | 83 | static int mce_raise_notify(unsigned int cmd, struct pt_regs *regs) |
| 83 | { | 84 | { |
| @@ -194,7 +195,11 @@ static void raise_mce(struct mce *m) | |||
| 194 | put_online_cpus(); | 195 | put_online_cpus(); |
| 195 | } else | 196 | } else |
| 196 | #endif | 197 | #endif |
| 198 | { | ||
| 199 | preempt_disable(); | ||
| 197 | raise_local(); | 200 | raise_local(); |
| 201 | preempt_enable(); | ||
| 202 | } | ||
| 198 | } | 203 | } |
| 199 | 204 | ||
| 200 | /* Error injection interface */ | 205 | /* Error injection interface */ |
| @@ -225,7 +230,10 @@ static ssize_t mce_write(struct file *filp, const char __user *ubuf, | |||
| 225 | * so do it a jiffie or two later everywhere. | 230 | * so do it a jiffie or two later everywhere. |
| 226 | */ | 231 | */ |
| 227 | schedule_timeout(2); | 232 | schedule_timeout(2); |
| 233 | |||
| 234 | mutex_lock(&mce_inject_mutex); | ||
| 228 | raise_mce(&m); | 235 | raise_mce(&m); |
| 236 | mutex_unlock(&mce_inject_mutex); | ||
| 229 | return usize; | 237 | return usize; |
| 230 | } | 238 | } |
| 231 | 239 | ||
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index ed44c8a65858..6a05c1d327a9 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h | |||
| @@ -28,6 +28,18 @@ extern int mce_ser; | |||
| 28 | 28 | ||
| 29 | extern struct mce_bank *mce_banks; | 29 | extern struct mce_bank *mce_banks; |
| 30 | 30 | ||
| 31 | #ifdef CONFIG_X86_MCE_INTEL | ||
| 32 | unsigned long mce_intel_adjust_timer(unsigned long interval); | ||
| 33 | void mce_intel_cmci_poll(void); | ||
| 34 | void mce_intel_hcpu_update(unsigned long cpu); | ||
| 35 | #else | ||
| 36 | # define mce_intel_adjust_timer mce_adjust_timer_default | ||
| 37 | static inline void mce_intel_cmci_poll(void) { } | ||
| 38 | static inline void mce_intel_hcpu_update(unsigned long cpu) { } | ||
| 39 | #endif | ||
| 40 | |||
| 41 | void mce_timer_kick(unsigned long interval); | ||
| 42 | |||
| 31 | #ifdef CONFIG_ACPI_APEI | 43 | #ifdef CONFIG_ACPI_APEI |
| 32 | int apei_write_mce(struct mce *m); | 44 | int apei_write_mce(struct mce *m); |
| 33 | ssize_t apei_read_mce(struct mce *m, u64 *record_id); | 45 | ssize_t apei_read_mce(struct mce *m, u64 *record_id); |
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 292d0258311c..29e87d3b2843 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c | |||
| @@ -83,6 +83,7 @@ static int mce_dont_log_ce __read_mostly; | |||
| 83 | int mce_cmci_disabled __read_mostly; | 83 | int mce_cmci_disabled __read_mostly; |
| 84 | int mce_ignore_ce __read_mostly; | 84 | int mce_ignore_ce __read_mostly; |
| 85 | int mce_ser __read_mostly; | 85 | int mce_ser __read_mostly; |
| 86 | int mce_bios_cmci_threshold __read_mostly; | ||
| 86 | 87 | ||
| 87 | struct mce_bank *mce_banks __read_mostly; | 88 | struct mce_bank *mce_banks __read_mostly; |
| 88 | 89 | ||
| @@ -1266,6 +1267,14 @@ static unsigned long check_interval = 5 * 60; /* 5 minutes */ | |||
| 1266 | static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */ | 1267 | static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */ |
| 1267 | static DEFINE_PER_CPU(struct timer_list, mce_timer); | 1268 | static DEFINE_PER_CPU(struct timer_list, mce_timer); |
| 1268 | 1269 | ||
| 1270 | static unsigned long mce_adjust_timer_default(unsigned long interval) | ||
| 1271 | { | ||
| 1272 | return interval; | ||
| 1273 | } | ||
| 1274 | |||
| 1275 | static unsigned long (*mce_adjust_timer)(unsigned long interval) = | ||
| 1276 | mce_adjust_timer_default; | ||
| 1277 | |||
| 1269 | static void mce_timer_fn(unsigned long data) | 1278 | static void mce_timer_fn(unsigned long data) |
| 1270 | { | 1279 | { |
| 1271 | struct timer_list *t = &__get_cpu_var(mce_timer); | 1280 | struct timer_list *t = &__get_cpu_var(mce_timer); |
| @@ -1276,6 +1285,7 @@ static void mce_timer_fn(unsigned long data) | |||
| 1276 | if (mce_available(__this_cpu_ptr(&cpu_info))) { | 1285 | if (mce_available(__this_cpu_ptr(&cpu_info))) { |
| 1277 | machine_check_poll(MCP_TIMESTAMP, | 1286 | machine_check_poll(MCP_TIMESTAMP, |
| 1278 | &__get_cpu_var(mce_poll_banks)); | 1287 | &__get_cpu_var(mce_poll_banks)); |
| 1288 | mce_intel_cmci_poll(); | ||
| 1279 | } | 1289 | } |
| 1280 | 1290 | ||
| 1281 | /* | 1291 | /* |
| @@ -1283,14 +1293,38 @@ static void mce_timer_fn(unsigned long data) | |||
| 1283 | * polling interval, otherwise increase the polling interval. | 1293 | * polling interval, otherwise increase the polling interval. |
| 1284 | */ | 1294 | */ |
| 1285 | iv = __this_cpu_read(mce_next_interval); | 1295 | iv = __this_cpu_read(mce_next_interval); |
| 1286 | if (mce_notify_irq()) | 1296 | if (mce_notify_irq()) { |
| 1287 | iv = max(iv / 2, (unsigned long) HZ/100); | 1297 | iv = max(iv / 2, (unsigned long) HZ/100); |
| 1288 | else | 1298 | } else { |
| 1289 | iv = min(iv * 2, round_jiffies_relative(check_interval * HZ)); | 1299 | iv = min(iv * 2, round_jiffies_relative(check_interval * HZ)); |
| 1300 | iv = mce_adjust_timer(iv); | ||
| 1301 | } | ||
| 1290 | __this_cpu_write(mce_next_interval, iv); | 1302 | __this_cpu_write(mce_next_interval, iv); |
| 1303 | /* Might have become 0 after CMCI storm subsided */ | ||
| 1304 | if (iv) { | ||
| 1305 | t->expires = jiffies + iv; | ||
| 1306 | add_timer_on(t, smp_processor_id()); | ||
| 1307 | } | ||
| 1308 | } | ||
| 1291 | 1309 | ||
| 1292 | t->expires = jiffies + iv; | 1310 | /* |
| 1293 | add_timer_on(t, smp_processor_id()); | 1311 | * Ensure that the timer is firing in @interval from now. |
| 1312 | */ | ||
| 1313 | void mce_timer_kick(unsigned long interval) | ||
| 1314 | { | ||
| 1315 | struct timer_list *t = &__get_cpu_var(mce_timer); | ||
| 1316 | unsigned long when = jiffies + interval; | ||
| 1317 | unsigned long iv = __this_cpu_read(mce_next_interval); | ||
| 1318 | |||
| 1319 | if (timer_pending(t)) { | ||
| 1320 | if (time_before(when, t->expires)) | ||
| 1321 | mod_timer_pinned(t, when); | ||
| 1322 | } else { | ||
| 1323 | t->expires = round_jiffies(when); | ||
| 1324 | add_timer_on(t, smp_processor_id()); | ||
| 1325 | } | ||
| 1326 | if (interval < iv) | ||
| 1327 | __this_cpu_write(mce_next_interval, interval); | ||
| 1294 | } | 1328 | } |
| 1295 | 1329 | ||
| 1296 | /* Must not be called in IRQ context where del_timer_sync() can deadlock */ | 1330 | /* Must not be called in IRQ context where del_timer_sync() can deadlock */ |
| @@ -1585,6 +1619,7 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) | |||
| 1585 | switch (c->x86_vendor) { | 1619 | switch (c->x86_vendor) { |
| 1586 | case X86_VENDOR_INTEL: | 1620 | case X86_VENDOR_INTEL: |
| 1587 | mce_intel_feature_init(c); | 1621 | mce_intel_feature_init(c); |
| 1622 | mce_adjust_timer = mce_intel_adjust_timer; | ||
| 1588 | break; | 1623 | break; |
| 1589 | case X86_VENDOR_AMD: | 1624 | case X86_VENDOR_AMD: |
| 1590 | mce_amd_feature_init(c); | 1625 | mce_amd_feature_init(c); |
| @@ -1594,23 +1629,28 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) | |||
| 1594 | } | 1629 | } |
| 1595 | } | 1630 | } |
| 1596 | 1631 | ||
| 1597 | static void __mcheck_cpu_init_timer(void) | 1632 | static void mce_start_timer(unsigned int cpu, struct timer_list *t) |
| 1598 | { | 1633 | { |
| 1599 | struct timer_list *t = &__get_cpu_var(mce_timer); | 1634 | unsigned long iv = mce_adjust_timer(check_interval * HZ); |
| 1600 | unsigned long iv = check_interval * HZ; | ||
| 1601 | 1635 | ||
| 1602 | setup_timer(t, mce_timer_fn, smp_processor_id()); | 1636 | __this_cpu_write(mce_next_interval, iv); |
| 1603 | 1637 | ||
| 1604 | if (mce_ignore_ce) | 1638 | if (mce_ignore_ce || !iv) |
| 1605 | return; | 1639 | return; |
| 1606 | 1640 | ||
| 1607 | __this_cpu_write(mce_next_interval, iv); | ||
| 1608 | if (!iv) | ||
| 1609 | return; | ||
| 1610 | t->expires = round_jiffies(jiffies + iv); | 1641 | t->expires = round_jiffies(jiffies + iv); |
| 1611 | add_timer_on(t, smp_processor_id()); | 1642 | add_timer_on(t, smp_processor_id()); |
| 1612 | } | 1643 | } |
| 1613 | 1644 | ||
| 1645 | static void __mcheck_cpu_init_timer(void) | ||
| 1646 | { | ||
| 1647 | struct timer_list *t = &__get_cpu_var(mce_timer); | ||
| 1648 | unsigned int cpu = smp_processor_id(); | ||
| 1649 | |||
| 1650 | setup_timer(t, mce_timer_fn, cpu); | ||
| 1651 | mce_start_timer(cpu, t); | ||
| 1652 | } | ||
| 1653 | |||
| 1614 | /* Handle unconfigured int18 (should never happen) */ | 1654 | /* Handle unconfigured int18 (should never happen) */ |
| 1615 | static void unexpected_machine_check(struct pt_regs *regs, long error_code) | 1655 | static void unexpected_machine_check(struct pt_regs *regs, long error_code) |
| 1616 | { | 1656 | { |
| @@ -1907,6 +1947,7 @@ static struct miscdevice mce_chrdev_device = { | |||
| 1907 | * check, or 0 to not wait | 1947 | * check, or 0 to not wait |
| 1908 | * mce=bootlog Log MCEs from before booting. Disabled by default on AMD. | 1948 | * mce=bootlog Log MCEs from before booting. Disabled by default on AMD. |
| 1909 | * mce=nobootlog Don't log MCEs from before booting. | 1949 | * mce=nobootlog Don't log MCEs from before booting. |
| 1950 | * mce=bios_cmci_threshold Don't program the CMCI threshold | ||
| 1910 | */ | 1951 | */ |
| 1911 | static int __init mcheck_enable(char *str) | 1952 | static int __init mcheck_enable(char *str) |
| 1912 | { | 1953 | { |
| @@ -1926,6 +1967,8 @@ static int __init mcheck_enable(char *str) | |||
| 1926 | mce_ignore_ce = 1; | 1967 | mce_ignore_ce = 1; |
| 1927 | else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog")) | 1968 | else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog")) |
| 1928 | mce_bootlog = (str[0] == 'b'); | 1969 | mce_bootlog = (str[0] == 'b'); |
| 1970 | else if (!strcmp(str, "bios_cmci_threshold")) | ||
| 1971 | mce_bios_cmci_threshold = 1; | ||
| 1929 | else if (isdigit(str[0])) { | 1972 | else if (isdigit(str[0])) { |
| 1930 | get_option(&str, &tolerant); | 1973 | get_option(&str, &tolerant); |
| 1931 | if (*str == ',') { | 1974 | if (*str == ',') { |
| @@ -2166,6 +2209,11 @@ static struct dev_ext_attribute dev_attr_cmci_disabled = { | |||
| 2166 | &mce_cmci_disabled | 2209 | &mce_cmci_disabled |
| 2167 | }; | 2210 | }; |
| 2168 | 2211 | ||
| 2212 | static struct dev_ext_attribute dev_attr_bios_cmci_threshold = { | ||
| 2213 | __ATTR(bios_cmci_threshold, 0444, device_show_int, NULL), | ||
| 2214 | &mce_bios_cmci_threshold | ||
| 2215 | }; | ||
| 2216 | |||
| 2169 | static struct device_attribute *mce_device_attrs[] = { | 2217 | static struct device_attribute *mce_device_attrs[] = { |
| 2170 | &dev_attr_tolerant.attr, | 2218 | &dev_attr_tolerant.attr, |
| 2171 | &dev_attr_check_interval.attr, | 2219 | &dev_attr_check_interval.attr, |
| @@ -2174,6 +2222,7 @@ static struct device_attribute *mce_device_attrs[] = { | |||
| 2174 | &dev_attr_dont_log_ce.attr, | 2222 | &dev_attr_dont_log_ce.attr, |
| 2175 | &dev_attr_ignore_ce.attr, | 2223 | &dev_attr_ignore_ce.attr, |
| 2176 | &dev_attr_cmci_disabled.attr, | 2224 | &dev_attr_cmci_disabled.attr, |
| 2225 | &dev_attr_bios_cmci_threshold.attr, | ||
| 2177 | NULL | 2226 | NULL |
| 2178 | }; | 2227 | }; |
| 2179 | 2228 | ||
| @@ -2294,38 +2343,33 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
| 2294 | unsigned int cpu = (unsigned long)hcpu; | 2343 | unsigned int cpu = (unsigned long)hcpu; |
| 2295 | struct timer_list *t = &per_cpu(mce_timer, cpu); | 2344 | struct timer_list *t = &per_cpu(mce_timer, cpu); |
| 2296 | 2345 | ||
| 2297 | switch (action) { | 2346 | switch (action & ~CPU_TASKS_FROZEN) { |
| 2298 | case CPU_ONLINE: | 2347 | case CPU_ONLINE: |
| 2299 | case CPU_ONLINE_FROZEN: | ||
| 2300 | mce_device_create(cpu); | 2348 | mce_device_create(cpu); |
| 2301 | if (threshold_cpu_callback) | 2349 | if (threshold_cpu_callback) |
| 2302 | threshold_cpu_callback(action, cpu); | 2350 | threshold_cpu_callback(action, cpu); |
| 2303 | break; | 2351 | break; |
| 2304 | case CPU_DEAD: | 2352 | case CPU_DEAD: |
| 2305 | case CPU_DEAD_FROZEN: | ||
| 2306 | if (threshold_cpu_callback) | 2353 | if (threshold_cpu_callback) |
| 2307 | threshold_cpu_callback(action, cpu); | 2354 | threshold_cpu_callback(action, cpu); |
| 2308 | mce_device_remove(cpu); | 2355 | mce_device_remove(cpu); |
| 2356 | mce_intel_hcpu_update(cpu); | ||
| 2309 | break; | 2357 | break; |
| 2310 | case CPU_DOWN_PREPARE: | 2358 | case CPU_DOWN_PREPARE: |
| 2311 | case CPU_DOWN_PREPARE_FROZEN: | ||
| 2312 | del_timer_sync(t); | ||
| 2313 | smp_call_function_single(cpu, mce_disable_cpu, &action, 1); | 2359 | smp_call_function_single(cpu, mce_disable_cpu, &action, 1); |
| 2360 | del_timer_sync(t); | ||
| 2314 | break; | 2361 | break; |
| 2315 | case CPU_DOWN_FAILED: | 2362 | case CPU_DOWN_FAILED: |
| 2316 | case CPU_DOWN_FAILED_FROZEN: | ||
| 2317 | if (!mce_ignore_ce && check_interval) { | ||
| 2318 | t->expires = round_jiffies(jiffies + | ||
| 2319 | per_cpu(mce_next_interval, cpu)); | ||
| 2320 | add_timer_on(t, cpu); | ||
| 2321 | } | ||
| 2322 | smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); | 2363 | smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); |
| 2364 | mce_start_timer(cpu, t); | ||
| 2323 | break; | 2365 | break; |
| 2324 | case CPU_POST_DEAD: | 2366 | } |
| 2367 | |||
| 2368 | if (action == CPU_POST_DEAD) { | ||
| 2325 | /* intentionally ignoring frozen here */ | 2369 | /* intentionally ignoring frozen here */ |
| 2326 | cmci_rediscover(cpu); | 2370 | cmci_rediscover(cpu); |
| 2327 | break; | ||
| 2328 | } | 2371 | } |
| 2372 | |||
| 2329 | return NOTIFY_OK; | 2373 | return NOTIFY_OK; |
| 2330 | } | 2374 | } |
| 2331 | 2375 | ||
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 38e49bc95ffc..5f88abf07e9c 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c | |||
| @@ -15,6 +15,8 @@ | |||
| 15 | #include <asm/msr.h> | 15 | #include <asm/msr.h> |
| 16 | #include <asm/mce.h> | 16 | #include <asm/mce.h> |
| 17 | 17 | ||
| 18 | #include "mce-internal.h" | ||
| 19 | |||
| 18 | /* | 20 | /* |
| 19 | * Support for Intel Correct Machine Check Interrupts. This allows | 21 | * Support for Intel Correct Machine Check Interrupts. This allows |
| 20 | * the CPU to raise an interrupt when a corrected machine check happened. | 22 | * the CPU to raise an interrupt when a corrected machine check happened. |
| @@ -30,7 +32,22 @@ static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned); | |||
| 30 | */ | 32 | */ |
| 31 | static DEFINE_RAW_SPINLOCK(cmci_discover_lock); | 33 | static DEFINE_RAW_SPINLOCK(cmci_discover_lock); |
| 32 | 34 | ||
| 33 | #define CMCI_THRESHOLD 1 | 35 | #define CMCI_THRESHOLD 1 |
| 36 | #define CMCI_POLL_INTERVAL (30 * HZ) | ||
| 37 | #define CMCI_STORM_INTERVAL (1 * HZ) | ||
| 38 | #define CMCI_STORM_THRESHOLD 15 | ||
| 39 | |||
| 40 | static DEFINE_PER_CPU(unsigned long, cmci_time_stamp); | ||
| 41 | static DEFINE_PER_CPU(unsigned int, cmci_storm_cnt); | ||
| 42 | static DEFINE_PER_CPU(unsigned int, cmci_storm_state); | ||
| 43 | |||
| 44 | enum { | ||
| 45 | CMCI_STORM_NONE, | ||
| 46 | CMCI_STORM_ACTIVE, | ||
| 47 | CMCI_STORM_SUBSIDED, | ||
| 48 | }; | ||
| 49 | |||
| 50 | static atomic_t cmci_storm_on_cpus; | ||
| 34 | 51 | ||
| 35 | static int cmci_supported(int *banks) | 52 | static int cmci_supported(int *banks) |
| 36 | { | 53 | { |
| @@ -53,6 +70,93 @@ static int cmci_supported(int *banks) | |||
| 53 | return !!(cap & MCG_CMCI_P); | 70 | return !!(cap & MCG_CMCI_P); |
| 54 | } | 71 | } |
| 55 | 72 | ||
| 73 | void mce_intel_cmci_poll(void) | ||
| 74 | { | ||
| 75 | if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE) | ||
| 76 | return; | ||
| 77 | machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); | ||
| 78 | } | ||
| 79 | |||
| 80 | void mce_intel_hcpu_update(unsigned long cpu) | ||
| 81 | { | ||
| 82 | if (per_cpu(cmci_storm_state, cpu) == CMCI_STORM_ACTIVE) | ||
| 83 | atomic_dec(&cmci_storm_on_cpus); | ||
| 84 | |||
| 85 | per_cpu(cmci_storm_state, cpu) = CMCI_STORM_NONE; | ||
| 86 | } | ||
| 87 | |||
| 88 | unsigned long mce_intel_adjust_timer(unsigned long interval) | ||
| 89 | { | ||
| 90 | int r; | ||
| 91 | |||
| 92 | if (interval < CMCI_POLL_INTERVAL) | ||
| 93 | return interval; | ||
| 94 | |||
| 95 | switch (__this_cpu_read(cmci_storm_state)) { | ||
| 96 | case CMCI_STORM_ACTIVE: | ||
| 97 | /* | ||
| 98 | * We switch back to interrupt mode once the poll timer has | ||
| 99 | * silenced itself. That means no events recorded and the | ||
| 100 | * timer interval is back to our poll interval. | ||
| 101 | */ | ||
| 102 | __this_cpu_write(cmci_storm_state, CMCI_STORM_SUBSIDED); | ||
| 103 | r = atomic_sub_return(1, &cmci_storm_on_cpus); | ||
| 104 | if (r == 0) | ||
| 105 | pr_notice("CMCI storm subsided: switching to interrupt mode\n"); | ||
| 106 | /* FALLTHROUGH */ | ||
| 107 | |||
| 108 | case CMCI_STORM_SUBSIDED: | ||
| 109 | /* | ||
| 110 | * We wait for all cpus to go back to SUBSIDED | ||
| 111 | * state. When that happens we switch back to | ||
| 112 | * interrupt mode. | ||
| 113 | */ | ||
| 114 | if (!atomic_read(&cmci_storm_on_cpus)) { | ||
| 115 | __this_cpu_write(cmci_storm_state, CMCI_STORM_NONE); | ||
| 116 | cmci_reenable(); | ||
| 117 | cmci_recheck(); | ||
| 118 | } | ||
| 119 | return CMCI_POLL_INTERVAL; | ||
| 120 | default: | ||
| 121 | /* | ||
| 122 | * We have shiny weather. Let the poll do whatever it | ||
| 123 | * thinks. | ||
| 124 | */ | ||
| 125 | return interval; | ||
| 126 | } | ||
| 127 | } | ||
| 128 | |||
| 129 | static bool cmci_storm_detect(void) | ||
| 130 | { | ||
| 131 | unsigned int cnt = __this_cpu_read(cmci_storm_cnt); | ||
| 132 | unsigned long ts = __this_cpu_read(cmci_time_stamp); | ||
| 133 | unsigned long now = jiffies; | ||
| 134 | int r; | ||
| 135 | |||
| 136 | if (__this_cpu_read(cmci_storm_state) != CMCI_STORM_NONE) | ||
| 137 | return true; | ||
| 138 | |||
| 139 | if (time_before_eq(now, ts + CMCI_STORM_INTERVAL)) { | ||
| 140 | cnt++; | ||
| 141 | } else { | ||
| 142 | cnt = 1; | ||
| 143 | __this_cpu_write(cmci_time_stamp, now); | ||
| 144 | } | ||
| 145 | __this_cpu_write(cmci_storm_cnt, cnt); | ||
| 146 | |||
| 147 | if (cnt <= CMCI_STORM_THRESHOLD) | ||
| 148 | return false; | ||
| 149 | |||
| 150 | cmci_clear(); | ||
| 151 | __this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE); | ||
| 152 | r = atomic_add_return(1, &cmci_storm_on_cpus); | ||
| 153 | mce_timer_kick(CMCI_POLL_INTERVAL); | ||
| 154 | |||
| 155 | if (r == 1) | ||
| 156 | pr_notice("CMCI storm detected: switching to poll mode\n"); | ||
| 157 | return true; | ||
| 158 | } | ||
| 159 | |||
| 56 | /* | 160 | /* |
| 57 | * The interrupt handler. This is called on every event. | 161 | * The interrupt handler. This is called on every event. |
| 58 | * Just call the poller directly to log any events. | 162 | * Just call the poller directly to log any events. |
| @@ -61,33 +165,28 @@ static int cmci_supported(int *banks) | |||
| 61 | */ | 165 | */ |
| 62 | static void intel_threshold_interrupt(void) | 166 | static void intel_threshold_interrupt(void) |
| 63 | { | 167 | { |
| 168 | if (cmci_storm_detect()) | ||
| 169 | return; | ||
| 64 | machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); | 170 | machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); |
| 65 | mce_notify_irq(); | 171 | mce_notify_irq(); |
| 66 | } | 172 | } |
| 67 | 173 | ||
| 68 | static void print_update(char *type, int *hdr, int num) | ||
| 69 | { | ||
| 70 | if (*hdr == 0) | ||
| 71 | printk(KERN_INFO "CPU %d MCA banks", smp_processor_id()); | ||
| 72 | *hdr = 1; | ||
| 73 | printk(KERN_CONT " %s:%d", type, num); | ||
| 74 | } | ||
| 75 | |||
| 76 | /* | 174 | /* |
| 77 | * Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks | 175 | * Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks |
| 78 | * on this CPU. Use the algorithm recommended in the SDM to discover shared | 176 | * on this CPU. Use the algorithm recommended in the SDM to discover shared |
| 79 | * banks. | 177 | * banks. |
| 80 | */ | 178 | */ |
| 81 | static void cmci_discover(int banks, int boot) | 179 | static void cmci_discover(int banks) |
| 82 | { | 180 | { |
| 83 | unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned); | 181 | unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned); |
| 84 | unsigned long flags; | 182 | unsigned long flags; |
| 85 | int hdr = 0; | ||
| 86 | int i; | 183 | int i; |
| 184 | int bios_wrong_thresh = 0; | ||
| 87 | 185 | ||
| 88 | raw_spin_lock_irqsave(&cmci_discover_lock, flags); | 186 | raw_spin_lock_irqsave(&cmci_discover_lock, flags); |
| 89 | for (i = 0; i < banks; i++) { | 187 | for (i = 0; i < banks; i++) { |
| 90 | u64 val; | 188 | u64 val; |
| 189 | int bios_zero_thresh = 0; | ||
| 91 | 190 | ||
| 92 | if (test_bit(i, owned)) | 191 | if (test_bit(i, owned)) |
| 93 | continue; | 192 | continue; |
| @@ -96,29 +195,52 @@ static void cmci_discover(int banks, int boot) | |||
| 96 | 195 | ||
| 97 | /* Already owned by someone else? */ | 196 | /* Already owned by someone else? */ |
| 98 | if (val & MCI_CTL2_CMCI_EN) { | 197 | if (val & MCI_CTL2_CMCI_EN) { |
| 99 | if (test_and_clear_bit(i, owned) && !boot) | 198 | clear_bit(i, owned); |
| 100 | print_update("SHD", &hdr, i); | ||
| 101 | __clear_bit(i, __get_cpu_var(mce_poll_banks)); | 199 | __clear_bit(i, __get_cpu_var(mce_poll_banks)); |
| 102 | continue; | 200 | continue; |
| 103 | } | 201 | } |
| 104 | 202 | ||
| 105 | val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK; | 203 | if (!mce_bios_cmci_threshold) { |
| 106 | val |= MCI_CTL2_CMCI_EN | CMCI_THRESHOLD; | 204 | val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK; |
| 205 | val |= CMCI_THRESHOLD; | ||
| 206 | } else if (!(val & MCI_CTL2_CMCI_THRESHOLD_MASK)) { | ||
| 207 | /* | ||
| 208 | * If bios_cmci_threshold boot option was specified | ||
| 209 | * but the threshold is zero, we'll try to initialize | ||
| 210 | * it to 1. | ||
| 211 | */ | ||
| 212 | bios_zero_thresh = 1; | ||
| 213 | val |= CMCI_THRESHOLD; | ||
| 214 | } | ||
| 215 | |||
| 216 | val |= MCI_CTL2_CMCI_EN; | ||
| 107 | wrmsrl(MSR_IA32_MCx_CTL2(i), val); | 217 | wrmsrl(MSR_IA32_MCx_CTL2(i), val); |
| 108 | rdmsrl(MSR_IA32_MCx_CTL2(i), val); | 218 | rdmsrl(MSR_IA32_MCx_CTL2(i), val); |
| 109 | 219 | ||
| 110 | /* Did the enable bit stick? -- the bank supports CMCI */ | 220 | /* Did the enable bit stick? -- the bank supports CMCI */ |
| 111 | if (val & MCI_CTL2_CMCI_EN) { | 221 | if (val & MCI_CTL2_CMCI_EN) { |
| 112 | if (!test_and_set_bit(i, owned) && !boot) | 222 | set_bit(i, owned); |
| 113 | print_update("CMCI", &hdr, i); | ||
| 114 | __clear_bit(i, __get_cpu_var(mce_poll_banks)); | 223 | __clear_bit(i, __get_cpu_var(mce_poll_banks)); |
| 224 | /* | ||
| 225 | * We are able to set thresholds for some banks that | ||
| 226 | * had a threshold of 0. This means the BIOS has not | ||
| 227 | * set the thresholds properly or does not work with | ||
| 228 | * this boot option. Note down now and report later. | ||
| 229 | */ | ||
| 230 | if (mce_bios_cmci_threshold && bios_zero_thresh && | ||
| 231 | (val & MCI_CTL2_CMCI_THRESHOLD_MASK)) | ||
| 232 | bios_wrong_thresh = 1; | ||
| 115 | } else { | 233 | } else { |
| 116 | WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks))); | 234 | WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks))); |
| 117 | } | 235 | } |
| 118 | } | 236 | } |
| 119 | raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); | 237 | raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); |
| 120 | if (hdr) | 238 | if (mce_bios_cmci_threshold && bios_wrong_thresh) { |
| 121 | printk(KERN_CONT "\n"); | 239 | pr_info_once( |
| 240 | "bios_cmci_threshold: Some banks do not have valid thresholds set\n"); | ||
| 241 | pr_info_once( | ||
| 242 | "bios_cmci_threshold: Make sure your BIOS supports this boot option\n"); | ||
| 243 | } | ||
| 122 | } | 244 | } |
| 123 | 245 | ||
| 124 | /* | 246 | /* |
| @@ -156,7 +278,7 @@ void cmci_clear(void) | |||
| 156 | continue; | 278 | continue; |
| 157 | /* Disable CMCI */ | 279 | /* Disable CMCI */ |
| 158 | rdmsrl(MSR_IA32_MCx_CTL2(i), val); | 280 | rdmsrl(MSR_IA32_MCx_CTL2(i), val); |
| 159 | val &= ~(MCI_CTL2_CMCI_EN|MCI_CTL2_CMCI_THRESHOLD_MASK); | 281 | val &= ~MCI_CTL2_CMCI_EN; |
| 160 | wrmsrl(MSR_IA32_MCx_CTL2(i), val); | 282 | wrmsrl(MSR_IA32_MCx_CTL2(i), val); |
| 161 | __clear_bit(i, __get_cpu_var(mce_banks_owned)); | 283 | __clear_bit(i, __get_cpu_var(mce_banks_owned)); |
| 162 | } | 284 | } |
| @@ -186,7 +308,7 @@ void cmci_rediscover(int dying) | |||
| 186 | continue; | 308 | continue; |
| 187 | /* Recheck banks in case CPUs don't all have the same */ | 309 | /* Recheck banks in case CPUs don't all have the same */ |
| 188 | if (cmci_supported(&banks)) | 310 | if (cmci_supported(&banks)) |
| 189 | cmci_discover(banks, 0); | 311 | cmci_discover(banks); |
| 190 | } | 312 | } |
| 191 | 313 | ||
| 192 | set_cpus_allowed_ptr(current, old); | 314 | set_cpus_allowed_ptr(current, old); |
| @@ -200,7 +322,7 @@ void cmci_reenable(void) | |||
| 200 | { | 322 | { |
| 201 | int banks; | 323 | int banks; |
| 202 | if (cmci_supported(&banks)) | 324 | if (cmci_supported(&banks)) |
| 203 | cmci_discover(banks, 0); | 325 | cmci_discover(banks); |
| 204 | } | 326 | } |
| 205 | 327 | ||
| 206 | static void intel_init_cmci(void) | 328 | static void intel_init_cmci(void) |
| @@ -211,7 +333,7 @@ static void intel_init_cmci(void) | |||
| 211 | return; | 333 | return; |
| 212 | 334 | ||
| 213 | mce_threshold_vector = intel_threshold_interrupt; | 335 | mce_threshold_vector = intel_threshold_interrupt; |
| 214 | cmci_discover(banks, 1); | 336 | cmci_discover(banks); |
| 215 | /* | 337 | /* |
| 216 | * For CPU #0 this runs with still disabled APIC, but that's | 338 | * For CPU #0 this runs with still disabled APIC, but that's |
| 217 | * ok because only the vector is set up. We still do another | 339 | * ok because only the vector is set up. We still do another |
