Diffstat (limited to 'arch/x86/kernel')
 arch/x86/kernel/alternative.c             |  17 +-
 arch/x86/kernel/cpu/mcheck/mce_32.c       |  14 -
 arch/x86/kernel/cpu/mcheck/mce_64.c       | 505 ++++++++++++++++++++---------
 arch/x86/kernel/cpu/mcheck/mce_amd_64.c   |   6 +-
 arch/x86/kernel/cpu/mcheck/mce_intel_64.c |   2 +-
 5 files changed, 391 insertions(+), 153 deletions(-)
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index a84ac7b570e6..5b8394a3a6b2 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -414,9 +414,17 @@ void __init alternative_instructions(void)
	   that might execute the to be patched code.
	   Other CPUs are not running. */
	stop_nmi();
-#ifdef CONFIG_X86_MCE
-	stop_mce();
-#endif
+
+	/*
+	 * Don't stop machine check exceptions while patching.
+	 * MCEs only happen when something got corrupted and in this
+	 * case we must do something about the corruption.
+	 * Ignoring it is worse than an unlikely patching race.
+	 * Also machine checks tend to be broadcast and if one CPU
+	 * goes into machine check the others follow quickly, so we don't
+	 * expect a machine check to cause undue problems during code
+	 * patching.
+	 */
 
 	apply_alternatives(__alt_instructions, __alt_instructions_end);
 
@@ -456,9 +464,6 @@ void __init alternative_instructions(void)
 					    (unsigned long)__smp_locks_end);
 
 	restart_nmi();
-#ifdef CONFIG_X86_MCE
-	restart_mce();
-#endif
 }
 
 /**
diff --git a/arch/x86/kernel/cpu/mcheck/mce_32.c b/arch/x86/kernel/cpu/mcheck/mce_32.c
index dfaebce3633e..3552119b091d 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_32.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_32.c
@@ -60,20 +60,6 @@ void mcheck_init(struct cpuinfo_x86 *c)
 	}
 }
 
-static unsigned long old_cr4 __initdata;
-
-void __init stop_mce(void)
-{
-	old_cr4 = read_cr4();
-	clear_in_cr4(X86_CR4_MCE);
-}
-
-void __init restart_mce(void)
-{
-	if (old_cr4 & X86_CR4_MCE)
-		set_in_cr4(X86_CR4_MCE);
-}
-
 static int __init mcheck_disable(char *str)
 {
 	mce_disabled = 1;
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index fe79985ce0f2..a4a7c686ce90 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -3,6 +3,8 @@
  * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
  * Rest from unknown author(s).
  * 2004 Andi Kleen. Rewrote most of it.
+ * Copyright 2008 Intel Corporation
+ * Author: Andi Kleen
  */
 
 #include <linux/init.h>
@@ -24,6 +26,8 @@
 #include <linux/ctype.h>
 #include <linux/kmod.h>
 #include <linux/kdebug.h>
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
 #include <asm/processor.h>
 #include <asm/msr.h>
 #include <asm/mce.h>
@@ -32,7 +36,12 @@
 #include <asm/idle.h>
 
 #define MISC_MCELOG_MINOR	227
-#define NR_SYSFS_BANKS		6
+
+/*
+ * To support more than 128 banks we would need to escape the
+ * predefined Linux-defined extended banks first.
+ */
+#define MAX_NR_BANKS (MCE_EXTENDED_BANK - 1)
 
 atomic_t mce_entry;
 
@@ -47,7 +56,7 @@ static int mce_dont_init;
  */
 static int tolerant = 1;
 static int banks;
-static unsigned long bank[NR_SYSFS_BANKS] = { [0 ... NR_SYSFS_BANKS-1] = ~0UL };
+static u64 *bank;
 static unsigned long notify_user;
 static int rip_msr;
 static int mce_bootlog = -1;
@@ -58,6 +67,14 @@ static char *trigger_argv[2] = { trigger, NULL };
 
 static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
 
+/* Do initial initialization of a struct mce */
+void mce_setup(struct mce *m)
+{
+	memset(m, 0, sizeof(struct mce));
+	m->cpu = smp_processor_id();
+	rdtscll(m->tsc);
+}
+
 /*
  * Lockless MCE logging infrastructure.
  * This avoids deadlocks on printk locks without having to break locks. Also
@@ -119,11 +136,11 @@ static void print_mce(struct mce *m)
 		print_symbol("{%s}", m->ip);
 		printk("\n");
 	}
-	printk(KERN_EMERG "TSC %Lx ", m->tsc);
+	printk(KERN_EMERG "TSC %llx ", m->tsc);
 	if (m->addr)
-		printk("ADDR %Lx ", m->addr);
+		printk("ADDR %llx ", m->addr);
 	if (m->misc)
-		printk("MISC %Lx ", m->misc);
+		printk("MISC %llx ", m->misc);
 	printk("\n");
 	printk(KERN_EMERG "This is not a software problem!\n");
 	printk(KERN_EMERG "Run through mcelog --ascii to decode "
@@ -151,6 +168,8 @@ static void mce_panic(char *msg, struct mce *backup, unsigned long start)
 
 static int mce_available(struct cpuinfo_x86 *c)
 {
+	if (mce_dont_init)
+		return 0;
 	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
 }
 
@@ -172,7 +191,77 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
 }
 
 /*
- * The actual machine check handler
+ * Poll for corrected events or events that happened before reset.
+ * Those are just logged through /dev/mcelog.
+ *
+ * This is executed in standard interrupt context.
+ */
+void machine_check_poll(enum mcp_flags flags)
+{
+	struct mce m;
+	int i;
+
+	mce_setup(&m);
+
+	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
+	for (i = 0; i < banks; i++) {
+		if (!bank[i])
+			continue;
+
+		m.misc = 0;
+		m.addr = 0;
+		m.bank = i;
+		m.tsc = 0;
+
+		barrier();
+		rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
+		if (!(m.status & MCI_STATUS_VAL))
+			continue;
+
+		/*
+		 * Uncorrected events are handled by the exception handler
+		 * when it is enabled. But when the exception is disabled,
+		 * log everything.
+		 *
+		 * TBD: do the same check for MCI_STATUS_EN here?
+		 */
+		if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC))
+			continue;
+
+		if (m.status & MCI_STATUS_MISCV)
+			rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
+		if (m.status & MCI_STATUS_ADDRV)
+			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
+
+		if (!(flags & MCP_TIMESTAMP))
+			m.tsc = 0;
+		/*
+		 * Don't get the IP here because it's unlikely to
+		 * have anything to do with the actual error location.
+		 */
+
+		mce_log(&m);
+		add_taint(TAINT_MACHINE_CHECK);
+
+		/*
+		 * Clear state for this bank.
+		 */
+		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
+	}
+
+	/*
+	 * Don't clear MCG_STATUS here because it's only defined for
+	 * exceptions.
+	 */
+}
+
+/*
+ * The actual machine check handler. This only handles real
+ * exceptions when something got corrupted coming in through int 18.
+ *
+ * This is executed in NMI context, not subject to normal locking rules.
+ * This implies that most kernel services cannot be safely used. Don't
+ * even think about putting a printk in there!
  */
 void do_machine_check(struct pt_regs * regs, long error_code)
 {
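The MSR arithmetic above is worth a note: each machine check bank owns a
group of four architectural MSRs, so bank i's registers sit at a fixed
stride of 4 from the bank-0 addresses. A minimal sketch of that layout;
the MCx_* helper names are illustrative and not part of this patch:

	/* Per-bank MSR layout: CTL, STATUS, ADDR, MISC repeat every 4 MSRs */
	#define MCx_CTL(i)	(MSR_IA32_MC0_CTL + (i) * 4)	/* enable mask   */
	#define MCx_STATUS(i)	(MSR_IA32_MC0_STATUS + (i) * 4)	/* error status  */
	#define MCx_ADDR(i)	(MSR_IA32_MC0_ADDR + (i) * 4)	/* error address */
	#define MCx_MISC(i)	(MSR_IA32_MC0_MISC + (i) * 4)	/* extra info    */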
@@ -190,17 +279,18 @@ void do_machine_check(struct pt_regs * regs, long error_code)
 	 * error.
 	 */
 	int kill_it = 0;
+	DECLARE_BITMAP(toclear, MAX_NR_BANKS);
 
 	atomic_inc(&mce_entry);
 
-	if ((regs
-	     && notify_die(DIE_NMI, "machine check", regs, error_code,
+	if (notify_die(DIE_NMI, "machine check", regs, error_code,
 			   18, SIGKILL) == NOTIFY_STOP)
-	    || !banks)
 		goto out2;
+	if (!banks)
+		goto out2;
+
+	mce_setup(&m);
 
-	memset(&m, 0, sizeof(struct mce));
-	m.cpu = smp_processor_id();
 	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
 	/* if the restart IP is not valid, we're done for */
 	if (!(m.mcgstatus & MCG_STATUS_RIPV))
@@ -210,18 +300,32 @@ void do_machine_check(struct pt_regs * regs, long error_code)
 	barrier();
 
 	for (i = 0; i < banks; i++) {
-		if (i < NR_SYSFS_BANKS && !bank[i])
+		__clear_bit(i, toclear);
+		if (!bank[i])
 			continue;
 
 		m.misc = 0;
 		m.addr = 0;
 		m.bank = i;
-		m.tsc = 0;
 
 		rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
 		if ((m.status & MCI_STATUS_VAL) == 0)
 			continue;
 
+		/*
+		 * Corrected errors are handled by machine_check_poll().
+		 * Leave them alone.
+		 */
+		if ((m.status & MCI_STATUS_UC) == 0)
+			continue;
+
+		/*
+		 * Set taint even when the machine check was not enabled.
+		 */
+		add_taint(TAINT_MACHINE_CHECK);
+
+		__set_bit(i, toclear);
+
 		if (m.status & MCI_STATUS_EN) {
 			/* if PCC was set, there's no way out */
 			no_way_out |= !!(m.status & MCI_STATUS_PCC);
@@ -235,6 +339,12 @@ void do_machine_check(struct pt_regs * regs, long error_code)
 				no_way_out = 1;
 				kill_it = 1;
 			}
+		} else {
+			/*
+			 * Machine check event was not enabled. Clear, but
+			 * ignore.
+			 */
+			continue;
 		}
 
 		if (m.status & MCI_STATUS_MISCV)
@@ -243,10 +353,7 @@ void do_machine_check(struct pt_regs * regs, long error_code)
 			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
 
 		mce_get_rip(&m, regs);
-		if (error_code >= 0)
-			rdtscll(m.tsc);
-		if (error_code != -2)
-			mce_log(&m);
+		mce_log(&m);
 
 		/* Did this bank cause the exception? */
 		/* Assume that the bank with uncorrectable errors did it,
@@ -255,14 +362,8 @@ void do_machine_check(struct pt_regs * regs, long error_code)
 			panicm = m;
 			panicm_found = 1;
 		}
-
-		add_taint(TAINT_MACHINE_CHECK);
 	}
 
-	/* Never do anything final in the polling timer */
-	if (!regs)
-		goto out;
-
 	/* If we didn't find an uncorrectable error, pick
 	   the last one (shouldn't happen, just being safe). */
 	if (!panicm_found)
@@ -309,10 +410,11 @@ void do_machine_check(struct pt_regs * regs, long error_code)
 	/* notify userspace ASAP */
 	set_thread_flag(TIF_MCE_NOTIFY);
 
- out:
 	/* the last thing we do is clear state */
-	for (i = 0; i < banks; i++)
-		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
+	for (i = 0; i < banks; i++) {
+		if (test_bit(i, toclear))
+			wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
+	}
 	wrmsrl(MSR_IA32_MCG_STATUS, 0);
  out2:
 	atomic_dec(&mce_entry);
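A note on the bitmap helpers used in this handler: DECLARE_BITMAP(toclear,
MAX_NR_BANKS) expands to an unsigned long array with one bit per bank, and
the non-atomic __set_bit()/__clear_bit() variants are safe here because the
bitmap lives on the handler's own stack. A hedged sketch of the equivalent
open-coded form:

	/* roughly what DECLARE_BITMAP(toclear, MAX_NR_BANKS) declares: */
	unsigned long toclear[(MAX_NR_BANKS + BITS_PER_LONG - 1) / BITS_PER_LONG];

	/* and __set_bit(i, toclear) is, in effect: */
	toclear[i / BITS_PER_LONG] |= 1UL << (i % BITS_PER_LONG);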
@@ -332,15 +434,13 @@ void do_machine_check(struct pt_regs * regs, long error_code)
  * and historically has been the register value of the
  * MSR_IA32_THERMAL_STATUS (Intel) msr.
  */
-void mce_log_therm_throt_event(unsigned int cpu, __u64 status)
+void mce_log_therm_throt_event(__u64 status)
 {
 	struct mce m;
 
-	memset(&m, 0, sizeof(m));
-	m.cpu = cpu;
+	mce_setup(&m);
 	m.bank = MCE_THERMAL_BANK;
 	m.status = status;
-	rdtscll(m.tsc);
 	mce_log(&m);
 }
 #endif /* CONFIG_X86_MCE_INTEL */
@@ -353,18 +453,17 @@ void mce_log_therm_throt_event(unsigned int cpu, __u64 status)
 
 static int check_interval = 5 * 60; /* 5 minutes */
 static int next_interval; /* in jiffies */
-static void mcheck_timer(struct work_struct *work);
-static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer);
+static void mcheck_timer(unsigned long);
+static DEFINE_PER_CPU(struct timer_list, mce_timer);
 
-static void mcheck_check_cpu(void *info)
+static void mcheck_timer(unsigned long data)
 {
-	if (mce_available(&current_cpu_data))
-		do_machine_check(NULL, 0);
-}
+	struct timer_list *t = &per_cpu(mce_timer, data);
 
-static void mcheck_timer(struct work_struct *work)
-{
-	on_each_cpu(mcheck_check_cpu, NULL, 1);
+	WARN_ON(smp_processor_id() != data);
+
+	if (mce_available(&current_cpu_data))
+		machine_check_poll(MCP_TIMESTAMP);
 
 	/*
 	 * Alert userspace if needed. If we logged an MCE, reduce the
@@ -377,14 +476,21 @@ static void mcheck_timer(struct work_struct *work)
 		      (int)round_jiffies_relative(check_interval*HZ));
 	}
 
-	schedule_delayed_work(&mcheck_work, next_interval);
+	t->expires = jiffies + next_interval;
+	add_timer(t);
 }
 
+static void mce_do_trigger(struct work_struct *work)
+{
+	call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
+}
+
+static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
+
 /*
- * This is only called from process context. This is where we do
- * anything we need to alert userspace about new MCEs. This is called
- * directly from the poller and also from entry.S and idle, thanks to
- * TIF_MCE_NOTIFY.
+ * Notify the user(s) about new machine check events.
+ * Can be called from interrupt context, but not from machine check/NMI
+ * context.
  */
 int mce_notify_user(void)
 {
@@ -394,9 +500,14 @@ int mce_notify_user(void)
 	unsigned long now = jiffies;
 
 	wake_up_interruptible(&mce_wait);
-	if (trigger[0])
-		call_usermodehelper(trigger, trigger_argv, NULL,
-				   UMH_NO_WAIT);
+
+	/*
+	 * There is no risk of missing notifications because
+	 * work_pending is always cleared before the function is
+	 * executed.
+	 */
+	if (trigger[0] && !work_pending(&mce_trigger_work))
+		schedule_work(&mce_trigger_work);
 
 	if (time_after_eq(now, last_print + (check_interval*HZ))) {
 		last_print = now;
@@ -425,63 +536,76 @@ static struct notifier_block mce_idle_notifier = {
 
 static __init int periodic_mcheck_init(void)
 {
-	next_interval = check_interval * HZ;
-	if (next_interval)
-		schedule_delayed_work(&mcheck_work,
-				      round_jiffies_relative(next_interval));
-	idle_notifier_register(&mce_idle_notifier);
-	return 0;
+	idle_notifier_register(&mce_idle_notifier);
+	return 0;
 }
 __initcall(periodic_mcheck_init);
 
-
 /*
  * Initialize Machine Checks for a CPU.
  */
-static void mce_init(void *dummy)
+static int mce_cap_init(void)
 {
 	u64 cap;
-	int i;
+	unsigned b;
 
 	rdmsrl(MSR_IA32_MCG_CAP, cap);
-	banks = cap & 0xff;
-	if (banks > MCE_EXTENDED_BANK) {
-		banks = MCE_EXTENDED_BANK;
-		printk(KERN_INFO "MCE: warning: using only %d banks\n",
-		       MCE_EXTENDED_BANK);
+	b = cap & 0xff;
+	if (b > MAX_NR_BANKS) {
+		printk(KERN_WARNING
+		       "MCE: Using only %u machine check banks out of %u\n",
+			MAX_NR_BANKS, b);
+		b = MAX_NR_BANKS;
 	}
+
+	/* Don't support asymmetric configurations today */
+	WARN_ON(banks != 0 && b != banks);
+	banks = b;
+	if (!bank) {
+		bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
+		if (!bank)
+			return -ENOMEM;
+		memset(bank, 0xff, banks * sizeof(u64));
+	}
+
 	/* Use accurate RIP reporting if available. */
 	if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
 		rip_msr = MSR_IA32_MCG_EIP;
 
-	/* Log the machine checks left over from the previous reset.
-	   This also clears all registers */
-	do_machine_check(NULL, mce_bootlog ? -1 : -2);
+	return 0;
+}
+
+static void mce_init(void *dummy)
+{
+	u64 cap;
+	int i;
+
+	/*
+	 * Log the machine checks left over from the previous reset.
+	 */
+	machine_check_poll(MCP_UC);
 
 	set_in_cr4(X86_CR4_MCE);
 
+	rdmsrl(MSR_IA32_MCG_CAP, cap);
 	if (cap & MCG_CTL_P)
 		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
 
 	for (i = 0; i < banks; i++) {
-		if (i < NR_SYSFS_BANKS)
-			wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
-		else
-			wrmsrl(MSR_IA32_MC0_CTL+4*i, ~0UL);
-
+		wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
 		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
 	}
 }
 
 /* Add per CPU specific workarounds here */
-static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
+static void mce_cpu_quirks(struct cpuinfo_x86 *c)
 {
 	/* This should be disabled by the BIOS, but isn't always */
 	if (c->x86_vendor == X86_VENDOR_AMD) {
-		if(c->x86 == 15)
+		if (c->x86 == 15 && banks > 4)
 			/* disable GART TBL walk error reporting, which trips off
 			   incorrectly with the IOMMU & 3ware & Cerberus. */
-			clear_bit(10, &bank[4]);
+			clear_bit(10, (unsigned long *)&bank[4]);
 		if(c->x86 <= 17 && mce_bootlog < 0)
 			/* Lots of broken BIOS around that don't clear them
 			   by default and leave crap in there. Don't log. */
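The memset(bank, 0xff, ...) in mce_cap_init() leans on the meaning of the
per-bank control MSR: each element of bank[] is later written to that
bank's MCi_CTL register, where every set bit enables reporting of one
error source, so an all-ones pattern enables everything the bank
implements. A small sketch under that assumption (bank_nr is illustrative):

	int bank_nr = 4;			/* example bank           */
	u64 ctl_mask = ~0ULL;			/* enable all error types */

	ctl_mask &= ~(1ULL << 10);		/* mask off error source 10,
						   as the GART quirk above does */
	wrmsrl(MSR_IA32_MC0_CTL + 4*bank_nr, ctl_mask);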
@@ -504,20 +628,38 @@ static void mce_cpu_features(struct cpuinfo_x86 *c)
 	}
 }
 
+static void mce_init_timer(void)
+{
+	struct timer_list *t = &__get_cpu_var(mce_timer);
+
+	/* data race harmless because everyone sets to the same value */
+	if (!next_interval)
+		next_interval = check_interval * HZ;
+	if (!next_interval)
+		return;
+	setup_timer(t, mcheck_timer, smp_processor_id());
+	t->expires = round_jiffies_relative(jiffies + next_interval);
+	add_timer(t);
+}
+
 /*
  * Called for each booted CPU to set up machine checks.
  * Must be called with preempt off.
  */
 void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
 {
-	mce_cpu_quirks(c);
+	if (!mce_available(c))
+		return;
 
-	if (mce_dont_init ||
-	    !mce_available(c))
-		return;
+	if (mce_cap_init() < 0) {
+		mce_dont_init = 1;
+		return;
+	}
+	mce_cpu_quirks(c);
 
 	mce_init(NULL);
 	mce_cpu_features(c);
+	mce_init_timer();
 }
 
 /*
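Design note on mce_init_timer(): a CPU's bank MSRs can only be read from
that CPU, so the old single work item that broadcast with on_each_cpu() is
replaced by one self-rearming timer per CPU, each polling only its own
banks. A reduced sketch of the pattern, with illustrative names rather than
the patch's exact code:

	static DEFINE_PER_CPU(struct timer_list, poll_timer);
	static unsigned long poll_jiffies = 5 * 60 * HZ;

	static void poll_this_cpu(unsigned long cpu)	/* runs on 'cpu' */
	{
		struct timer_list *t = &per_cpu(poll_timer, cpu);

		/* ... read this CPU's own bank MSRs here ... */
		t->expires = jiffies + poll_jiffies;
		add_timer(t);			/* re-queues on the local CPU */
	}

	static void start_poll_timer(void)	/* called once on each CPU */
	{
		struct timer_list *t = &__get_cpu_var(poll_timer);

		setup_timer(t, poll_this_cpu, smp_processor_id());
		t->expires = round_jiffies_relative(jiffies + poll_jiffies);
		add_timer(t);
	}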
@@ -573,7 +715,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
 {
 	unsigned long *cpu_tsc;
 	static DEFINE_MUTEX(mce_read_mutex);
-	unsigned next;
+	unsigned prev, next;
 	char __user *buf = ubuf;
 	int i, err;
 
@@ -592,25 +734,32 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
 	}
 
 	err = 0;
-	for (i = 0; i < next; i++) {
-		unsigned long start = jiffies;
-
-		while (!mcelog.entry[i].finished) {
-			if (time_after_eq(jiffies, start + 2)) {
-				memset(mcelog.entry + i,0, sizeof(struct mce));
-				goto timeout;
+	prev = 0;
+	do {
+		for (i = prev; i < next; i++) {
+			unsigned long start = jiffies;
+
+			while (!mcelog.entry[i].finished) {
+				if (time_after_eq(jiffies, start + 2)) {
+					memset(mcelog.entry + i, 0,
+					       sizeof(struct mce));
+					goto timeout;
+				}
+				cpu_relax();
 			}
-			cpu_relax();
+			smp_rmb();
+			err |= copy_to_user(buf, mcelog.entry + i,
+					    sizeof(struct mce));
+			buf += sizeof(struct mce);
+timeout:
+			;
 		}
-		smp_rmb();
-		err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
-		buf += sizeof(struct mce);
-	timeout:
-		;
-	}
 
-	memset(mcelog.entry, 0, next * sizeof(struct mce));
-	mcelog.next = 0;
+		memset(mcelog.entry + prev, 0,
+		       (next - prev) * sizeof(struct mce));
+		prev = next;
+		next = cmpxchg(&mcelog.next, prev, 0);
+	} while (next != prev);
 
 	synchronize_sched();
 
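The do/while added here closes a race with concurrent writers to the log
buffer: after copying entries [prev, next) out, the reader tries to swing
mcelog.next back to 0 with cmpxchg; if another MCE was logged in the
meantime, the cmpxchg fails, returns the advanced index, and the loop
drains the remainder. The core of the pattern reduced to a sketch, where
consume() stands in for the copy_to_user() sequence:

	unsigned prev, next;

	next = mcelog.next;		/* snapshot of the producer index */
	prev = 0;
	do {
		consume(prev, next);	/* drain entries [prev, next)     */
		prev = next;
		/* returns prev (success) only if no new entry raced in   */
		next = cmpxchg(&mcelog.next, prev, 0);
	} while (next != prev);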
@@ -680,20 +829,6 @@ static struct miscdevice mce_log_device = {
 	&mce_chrdev_ops,
 };
 
-static unsigned long old_cr4 __initdata;
-
-void __init stop_mce(void)
-{
-	old_cr4 = read_cr4();
-	clear_in_cr4(X86_CR4_MCE);
-}
-
-void __init restart_mce(void)
-{
-	if (old_cr4 & X86_CR4_MCE)
-		set_in_cr4(X86_CR4_MCE);
-}
-
 /*
  * Old style boot options parsing. Only for compatibility.
  */
@@ -703,8 +838,7 @@ static int __init mcheck_disable(char *str)
 	return 1;
 }
 
-/* mce=off disables machine check. Note you can re-enable it later
-   using sysfs.
+/* mce=off disables machine check.
    mce=TOLERANCELEVEL (number, see above)
    mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
    mce=nobootlog Don't log MCEs from before booting. */
@@ -728,6 +862,29 @@ __setup("mce=", mcheck_enable);
  * Sysfs support
  */
 
+/*
+ * Disable machine checks on suspend and shutdown. We can't really handle
+ * them later.
+ */
+static int mce_disable(void)
+{
+	int i;
+
+	for (i = 0; i < banks; i++)
+		wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
+	return 0;
+}
+
+static int mce_suspend(struct sys_device *dev, pm_message_t state)
+{
+	return mce_disable();
+}
+
+static int mce_shutdown(struct sys_device *dev)
+{
+	return mce_disable();
+}
+
 /* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
    Only one CPU is active at this time, the others get readded later using
    CPU hotplug. */
@@ -738,20 +895,24 @@ static int mce_resume(struct sys_device *dev)
 	return 0;
 }
 
+static void mce_cpu_restart(void *data)
+{
+	del_timer_sync(&__get_cpu_var(mce_timer));
+	if (mce_available(&current_cpu_data))
+		mce_init(NULL);
+	mce_init_timer();
+}
+
 /* Reinit MCEs after user configuration changes */
 static void mce_restart(void)
 {
-	if (next_interval)
-		cancel_delayed_work(&mcheck_work);
-	/* Timer race is harmless here */
-	on_each_cpu(mce_init, NULL, 1);
 	next_interval = check_interval * HZ;
-	if (next_interval)
-		schedule_delayed_work(&mcheck_work,
-				      round_jiffies_relative(next_interval));
+	on_each_cpu(mce_cpu_restart, NULL, 1);
 }
 
 static struct sysdev_class mce_sysclass = {
+	.suspend = mce_suspend,
+	.shutdown = mce_shutdown,
 	.resume = mce_resume,
 	.name = "machinecheck",
 };
@@ -778,16 +939,26 @@ void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinit
 	}								\
 	static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
 
-/*
- * TBD should generate these dynamically based on number of available banks.
- * Have only 6 contol banks in /sysfs until then.
- */
-ACCESSOR(bank0ctl,bank[0],mce_restart())
-ACCESSOR(bank1ctl,bank[1],mce_restart())
-ACCESSOR(bank2ctl,bank[2],mce_restart())
-ACCESSOR(bank3ctl,bank[3],mce_restart())
-ACCESSOR(bank4ctl,bank[4],mce_restart())
-ACCESSOR(bank5ctl,bank[5],mce_restart())
+static struct sysdev_attribute *bank_attrs;
+
+static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
+			 char *buf)
+{
+	u64 b = bank[attr - bank_attrs];
+	return sprintf(buf, "%llx\n", b);
+}
+
+static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
+			const char *buf, size_t siz)
+{
+	char *end;
+	u64 new = simple_strtoull(buf, &end, 0);
+	if (end == buf)
+		return -EINVAL;
+	bank[attr - bank_attrs] = new;
+	mce_restart();
+	return end-buf;
+}
 
 static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr,
 			char *buf)
@@ -814,8 +985,6 @@ static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
 static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
 ACCESSOR(check_interval,check_interval,mce_restart())
 static struct sysdev_attribute *mce_attributes[] = {
-	&attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
-	&attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl,
 	&attr_tolerant.attr, &attr_check_interval, &attr_trigger,
 	NULL
 };
@@ -845,11 +1014,22 @@ static __cpuinit int mce_create_device(unsigned int cpu)
 		if (err)
 			goto error;
 	}
+	for (i = 0; i < banks; i++) {
+		err = sysdev_create_file(&per_cpu(device_mce, cpu),
+					 &bank_attrs[i]);
+		if (err)
+			goto error2;
+	}
 	cpu_set(cpu, mce_device_initialized);
 
 	return 0;
+error2:
+	while (--i >= 0) {
+		sysdev_remove_file(&per_cpu(device_mce, cpu),
+				   &bank_attrs[i]);
+	}
 error:
-	while (i--) {
+	while (--i >= 0) {
 		sysdev_remove_file(&per_cpu(device_mce,cpu),
 				   mce_attributes[i]);
 	}
@@ -868,15 +1048,40 @@ static __cpuinit void mce_remove_device(unsigned int cpu)
 	for (i = 0; mce_attributes[i]; i++)
 		sysdev_remove_file(&per_cpu(device_mce,cpu),
 			mce_attributes[i]);
+	for (i = 0; i < banks; i++)
+		sysdev_remove_file(&per_cpu(device_mce, cpu),
+			&bank_attrs[i]);
 	sysdev_unregister(&per_cpu(device_mce,cpu));
 	cpu_clear(cpu, mce_device_initialized);
 }
 
+/* Make sure there are no machine checks on offlined CPUs. */
+static void mce_disable_cpu(void *h)
+{
+	int i;
+
+	if (!mce_available(&current_cpu_data))
+		return;
+	for (i = 0; i < banks; i++)
+		wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
+}
+
+static void mce_reenable_cpu(void *h)
+{
+	int i;
+
+	if (!mce_available(&current_cpu_data))
+		return;
+	for (i = 0; i < banks; i++)
+		wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
+}
+
 /* Get notified when a cpu comes on/off. Be hotplug friendly. */
 static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
 				      unsigned long action, void *hcpu)
 {
 	unsigned int cpu = (unsigned long)hcpu;
+	struct timer_list *t = &per_cpu(mce_timer, cpu);
 
 	switch (action) {
 	case CPU_ONLINE:
@@ -891,6 +1096,17 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
 		threshold_cpu_callback(action, cpu);
 		mce_remove_device(cpu);
 		break;
+	case CPU_DOWN_PREPARE:
+	case CPU_DOWN_PREPARE_FROZEN:
+		del_timer_sync(t);
+		smp_call_function_single(cpu, mce_disable_cpu, NULL, 1);
+		break;
+	case CPU_DOWN_FAILED:
+	case CPU_DOWN_FAILED_FROZEN:
+		t->expires = round_jiffies_relative(jiffies + next_interval);
+		add_timer_on(t, cpu);
+		smp_call_function_single(cpu, mce_reenable_cpu, NULL, 1);
+		break;
 	}
 	return NOTIFY_OK;
 }
@@ -899,6 +1115,34 @@ static struct notifier_block mce_cpu_notifier __cpuinitdata = {
 	.notifier_call = mce_cpu_callback,
 };
 
+static __init int mce_init_banks(void)
+{
+	int i;
+
+	bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
+			     GFP_KERNEL);
+	if (!bank_attrs)
+		return -ENOMEM;
+
+	for (i = 0; i < banks; i++) {
+		struct sysdev_attribute *a = &bank_attrs[i];
+		a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i);
+		if (!a->attr.name)
+			goto nomem;
+		a->attr.mode = 0644;
+		a->show = show_bank;
+		a->store = set_bank;
+	}
+	return 0;
+
+nomem:
+	while (--i >= 0)
+		kfree(bank_attrs[i].attr.name);
+	kfree(bank_attrs);
+	bank_attrs = NULL;
+	return -ENOMEM;
+}
+
 static __init int mce_init_device(void)
 {
 	int err;
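Note how mce_init_banks() ties back to show_bank()/set_bank() earlier:
because every per-bank attribute lives in the single bank_attrs array, the
callbacks recover the bank number by pointer subtraction, so one pair of
handlers serves all of the bank%d files created here. A sketch of that
indexing idea (the helper name is illustrative; the sysfs path assumes
this era's sysdev layout):

	/* e.g. /sys/devices/system/machinecheck/machinecheck0/bank3 */
	static unsigned bank_index(struct sysdev_attribute *attr)
	{
		return attr - bank_attrs;   /* attr points into bank_attrs[] */
	}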
@@ -906,6 +1150,11 @@ static __init int mce_init_device(void)
 
 	if (!mce_available(&boot_cpu_data))
 		return -EIO;
+
+	err = mce_init_banks();
+	if (err)
+		return err;
+
 	err = sysdev_class_register(&mce_sysclass);
 	if (err)
 		return err;
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
index f2ee0ae29bd6..e82c8208b81e 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@ -197,9 +197,7 @@ asmlinkage void mce_threshold_interrupt(void)
 	exit_idle();
 	irq_enter();
 
-	memset(&m, 0, sizeof(m));
-	rdtscll(m.tsc);
-	m.cpu = smp_processor_id();
+	mce_setup(&m);
 
 	/* assume first bank caused it */
 	for (bank = 0; bank < NR_BANKS; ++bank) {
@@ -233,7 +231,7 @@ asmlinkage void mce_threshold_interrupt(void)
 
 			/* Log the machine check that caused the threshold
 			   event. */
-			do_machine_check(NULL, 0);
+			machine_check_poll(MCP_TIMESTAMP);
 
 			if (high & MASK_OVERFLOW_HI) {
 				rdmsrl(address, m.misc);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
index f44c36624360..1b1491a76b55 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
@@ -24,7 +24,7 @@ asmlinkage void smp_thermal_interrupt(void)
 
 	rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
 	if (therm_throt_process(msr_val & 1))
-		mce_log_therm_throt_event(smp_processor_id(), msr_val);
+		mce_log_therm_throt_event(msr_val);
 
 	inc_irq_stat(irq_thermal_count);
 	irq_exit();