Diffstat (limited to 'arch/x86/kernel/cpu')
-rw-r--r--  arch/x86/kernel/cpu/bugs.c                 |    4
-rw-r--r--  arch/x86/kernel/cpu/common.c               |    2
-rw-r--r--  arch/x86/kernel/cpu/hypervisor.c           |    4
-rw-r--r--  arch/x86/kernel/cpu/intel.c                |   18
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-severity.c  |  152
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.c           |  288
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_amd.c       |   10
-rw-r--r--  arch/x86/kernel/cpu/mtrr/main.c            |  184
-rw-r--r--  arch/x86/kernel/cpu/perf_event.c           |  171
-rw-r--r--  arch/x86/kernel/cpu/perf_event_amd.c       |   14
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel.c     |  386
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel_ds.c  |   10
-rw-r--r--  arch/x86/kernel/cpu/perf_event_p4.c        |  119
13 files changed, 804 insertions(+), 558 deletions(-)
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 525514cf33c..46674fbb62b 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -62,6 +62,8 @@ static void __init check_fpu(void)
62 return; 62 return;
63 } 63 }
64 64
65 kernel_fpu_begin();
66
65 /* 67 /*
66 * trap_init() enabled FXSR and company _before_ testing for FP 68 * trap_init() enabled FXSR and company _before_ testing for FP
67 * problems here. 69 * problems here.
@@ -80,6 +82,8 @@ static void __init check_fpu(void)
80 : "=m" (*&fdiv_bug) 82 : "=m" (*&fdiv_bug)
81 : "m" (*&x), "m" (*&y)); 83 : "m" (*&x), "m" (*&y));
82 84
85 kernel_fpu_end();
86
83 boot_cpu_data.fdiv_bug = fdiv_bug; 87 boot_cpu_data.fdiv_bug = fdiv_bug;
84 if (boot_cpu_data.fdiv_bug) 88 if (boot_cpu_data.fdiv_bug)
85 printk(KERN_WARNING "Hmm, FPU with FDIV bug.\n"); 89 printk(KERN_WARNING "Hmm, FPU with FDIV bug.\n");
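
The bugs.c change itself is small: the FDIV-bug probe now runs inside a kernel_fpu_begin()/kernel_fpu_end() pair, since the probe executes x87 instructions directly and in-kernel FPU use needs explicit bracketing. A minimal sketch of that bracketing pattern, with illustrative names that are not part of the patch:

#include <linux/init.h>
#include <asm/i387.h>		/* kernel_fpu_begin()/kernel_fpu_end() in this kernel era */

/*
 * Minimal sketch of the pattern the hunk above adds (illustrative only):
 * any in-kernel x87/SSE use sits between kernel_fpu_begin() and
 * kernel_fpu_end(), so the current task's FPU state is saved/restored
 * and preemption stays disabled while the kernel owns the FPU.
 */
static void __init fpu_probe_sketch(void)
{
	kernel_fpu_begin();	/* save FPU context, disable preemption */

	/* issue x87/SSE instructions here, e.g. via inline asm */

	kernel_fpu_end();	/* restore FPU context, re-enable preemption */
}
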
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 22a073d7fbf..62184390a60 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -21,7 +21,7 @@
21#include <linux/topology.h> 21#include <linux/topology.h>
22#include <linux/cpumask.h> 22#include <linux/cpumask.h>
23#include <asm/pgtable.h> 23#include <asm/pgtable.h>
24#include <asm/atomic.h> 24#include <linux/atomic.h>
25#include <asm/proto.h> 25#include <asm/proto.h>
26#include <asm/setup.h> 26#include <asm/setup.h>
27#include <asm/apic.h> 27#include <asm/apic.h>
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index 8095f8611f8..755f64fb074 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -32,11 +32,11 @@
32 */ 32 */
33static const __initconst struct hypervisor_x86 * const hypervisors[] = 33static const __initconst struct hypervisor_x86 * const hypervisors[] =
34{ 34{
35 &x86_hyper_vmware,
36 &x86_hyper_ms_hyperv,
37#ifdef CONFIG_XEN_PVHVM 35#ifdef CONFIG_XEN_PVHVM
38 &x86_hyper_xen_hvm, 36 &x86_hyper_xen_hvm,
39#endif 37#endif
38 &x86_hyper_vmware,
39 &x86_hyper_ms_hyperv,
40}; 40};
41 41
42const struct hypervisor_x86 *x86_hyper; 42const struct hypervisor_x86 *x86_hyper;
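
The hypervisor.c hunk only reorders the hypervisors[] table so that, with CONFIG_XEN_PVHVM enabled, the Xen HVM entry is probed before the generic VMware and Hyper-V entries. The order matters because detection walks the array and stops at the first ->detect() hit; a rough, illustrative sketch of that loop, relying on the hypervisors[] and x86_hyper symbols shown above (the real loop lives further down in the same file):

static void detect_hypervisor_vendor_sketch(void)
{
	const struct hypervisor_x86 *h, * const *p;

	/* the first entry whose ->detect() reports a hit wins */
	for (p = hypervisors; p < hypervisors + ARRAY_SIZE(hypervisors); p++) {
		h = *p;
		if (h->detect()) {
			x86_hyper = h;
			printk(KERN_INFO "Hypervisor detected: %s\n", h->name);
			break;
		}
	}
}
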
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 1edf5ba4fb2..ed6086eedf1 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -456,6 +456,24 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
456 456
457 if (cpu_has(c, X86_FEATURE_VMX)) 457 if (cpu_has(c, X86_FEATURE_VMX))
458 detect_vmx_virtcap(c); 458 detect_vmx_virtcap(c);
459
460 /*
461 * Initialize MSR_IA32_ENERGY_PERF_BIAS if BIOS did not.
462 * x86_energy_perf_policy(8) is available to change it at run-time
463 */
464 if (cpu_has(c, X86_FEATURE_EPB)) {
465 u64 epb;
466
467 rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
468 if ((epb & 0xF) == ENERGY_PERF_BIAS_PERFORMANCE) {
469 printk_once(KERN_WARNING "ENERGY_PERF_BIAS:"
470 " Set to 'normal', was 'performance'\n"
471 "ENERGY_PERF_BIAS: View and update with"
472 " x86_energy_perf_policy(8)\n");
473 epb = (epb & ~0xF) | ENERGY_PERF_BIAS_NORMAL;
474 wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
475 }
476 }
459} 477}
460 478
461#ifdef CONFIG_X86_32 479#ifdef CONFIG_X86_32
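
The new intel.c block sanitizes MSR_IA32_ENERGY_PERF_BIAS when the CPU advertises X86_FEATURE_EPB: the low four bits hold the energy/performance hint (0 is the strongest performance bias, 15 the strongest powersave bias), and firmware that never initializes the MSR leaves it at 0, so the kernel nudges it to the "normal" value once and points the admin at x86_energy_perf_policy(8). As a hedged illustration (not part of the patch), changing the hint later boils down to a masked read-modify-write of the same MSR, using the same rdmsrl()/wrmsrl() helpers as the hunk above:

/*
 * Illustrative sketch only: adjust the EPB hint at run time.
 * hint is 0..15; 0 = maximum performance bias, 15 = maximum powersave.
 */
static void epb_set_hint_sketch(u8 hint)
{
	u64 epb;

	rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
	epb = (epb & ~0xFULL) | (hint & 0xF);
	wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
}
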
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 1e8d66c1336..7395d5f4272 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -43,61 +43,105 @@ static struct severity {
43 unsigned char covered; 43 unsigned char covered;
44 char *msg; 44 char *msg;
45} severities[] = { 45} severities[] = {
46#define KERNEL .context = IN_KERNEL 46#define MCESEV(s, m, c...) { .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c }
47#define USER .context = IN_USER 47#define KERNEL .context = IN_KERNEL
48#define SER .ser = SER_REQUIRED 48#define USER .context = IN_USER
49#define NOSER .ser = NO_SER 49#define SER .ser = SER_REQUIRED
50#define SEV(s) .sev = MCE_ ## s ## _SEVERITY 50#define NOSER .ser = NO_SER
51#define BITCLR(x, s, m, r...) { .mask = x, .result = 0, SEV(s), .msg = m, ## r } 51#define BITCLR(x) .mask = x, .result = 0
52#define BITSET(x, s, m, r...) { .mask = x, .result = x, SEV(s), .msg = m, ## r } 52#define BITSET(x) .mask = x, .result = x
53#define MCGMASK(x, res, s, m, r...) \ 53#define MCGMASK(x, y) .mcgmask = x, .mcgres = y
54 { .mcgmask = x, .mcgres = res, SEV(s), .msg = m, ## r } 54#define MASK(x, y) .mask = x, .result = y
55#define MASK(x, y, s, m, r...) \
56 { .mask = x, .result = y, SEV(s), .msg = m, ## r }
57#define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S) 55#define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S)
58#define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR) 56#define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR)
59#define MCACOD 0xffff 57#define MCACOD 0xffff
60 58
61 BITCLR(MCI_STATUS_VAL, NO, "Invalid"), 59 MCESEV(
62 BITCLR(MCI_STATUS_EN, NO, "Not enabled"), 60 NO, "Invalid",
63 BITSET(MCI_STATUS_PCC, PANIC, "Processor context corrupt"), 61 BITCLR(MCI_STATUS_VAL)
62 ),
63 MCESEV(
64 NO, "Not enabled",
65 BITCLR(MCI_STATUS_EN)
66 ),
67 MCESEV(
68 PANIC, "Processor context corrupt",
69 BITSET(MCI_STATUS_PCC)
70 ),
64 /* When MCIP is not set something is very confused */ 71 /* When MCIP is not set something is very confused */
65 MCGMASK(MCG_STATUS_MCIP, 0, PANIC, "MCIP not set in MCA handler"), 72 MCESEV(
73 PANIC, "MCIP not set in MCA handler",
74 MCGMASK(MCG_STATUS_MCIP, 0)
75 ),
66 /* Neither return not error IP -- no chance to recover -> PANIC */ 76 /* Neither return not error IP -- no chance to recover -> PANIC */
67 MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0, PANIC, 77 MCESEV(
68 "Neither restart nor error IP"), 78 PANIC, "Neither restart nor error IP",
69 MCGMASK(MCG_STATUS_RIPV, 0, PANIC, "In kernel and no restart IP", 79 MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0)
70 KERNEL), 80 ),
71 BITCLR(MCI_STATUS_UC, KEEP, "Corrected error", NOSER), 81 MCESEV(
72 MASK(MCI_STATUS_OVER|MCI_STATUS_UC|MCI_STATUS_EN, MCI_STATUS_UC, SOME, 82 PANIC, "In kernel and no restart IP",
73 "Spurious not enabled", SER), 83 KERNEL, MCGMASK(MCG_STATUS_RIPV, 0)
84 ),
85 MCESEV(
86 KEEP, "Corrected error",
87 NOSER, BITCLR(MCI_STATUS_UC)
88 ),
74 89
75 /* ignore OVER for UCNA */ 90 /* ignore OVER for UCNA */
76 MASK(MCI_UC_SAR, MCI_STATUS_UC, KEEP, 91 MCESEV(
77 "Uncorrected no action required", SER), 92 KEEP, "Uncorrected no action required",
78 MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR, PANIC, 93 SER, MASK(MCI_UC_SAR, MCI_STATUS_UC)
79 "Illegal combination (UCNA with AR=1)", SER), 94 ),
80 MASK(MCI_STATUS_S, 0, KEEP, "Non signalled machine check", SER), 95 MCESEV(
96 PANIC, "Illegal combination (UCNA with AR=1)",
97 SER,
98 MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR)
99 ),
100 MCESEV(
101 KEEP, "Non signalled machine check",
102 SER, BITCLR(MCI_STATUS_S)
103 ),
81 104
82 /* AR add known MCACODs here */ 105 /* AR add known MCACODs here */
83 MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_SAR, PANIC, 106 MCESEV(
84 "Action required with lost events", SER), 107 PANIC, "Action required with lost events",
85 MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_SAR, PANIC, 108 SER, BITSET(MCI_STATUS_OVER|MCI_UC_SAR)
86 "Action required; unknown MCACOD", SER), 109 ),
110 MCESEV(
111 PANIC, "Action required: unknown MCACOD",
112 SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR)
113 ),
87 114
88 /* known AO MCACODs: */ 115 /* known AO MCACODs: */
89 MASK(MCI_UC_SAR|MCI_STATUS_OVER|0xfff0, MCI_UC_S|0xc0, AO, 116 MCESEV(
90 "Action optional: memory scrubbing error", SER), 117 AO, "Action optional: memory scrubbing error",
91 MASK(MCI_UC_SAR|MCI_STATUS_OVER|MCACOD, MCI_UC_S|0x17a, AO, 118 SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|0xfff0, MCI_UC_S|0x00c0)
92 "Action optional: last level cache writeback error", SER), 119 ),
93 120 MCESEV(
94 MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S, SOME, 121 AO, "Action optional: last level cache writeback error",
95 "Action optional unknown MCACOD", SER), 122 SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_S|0x017a)
96 MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S|MCI_STATUS_OVER, SOME, 123 ),
97 "Action optional with lost events", SER), 124 MCESEV(
98 BITSET(MCI_STATUS_UC|MCI_STATUS_OVER, PANIC, "Overflowed uncorrected"), 125 SOME, "Action optional: unknown MCACOD",
99 BITSET(MCI_STATUS_UC, UC, "Uncorrected"), 126 SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S)
100 BITSET(0, SOME, "No match") /* always matches. keep at end */ 127 ),
128 MCESEV(
129 SOME, "Action optional with lost events",
130 SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_S)
131 ),
132
133 MCESEV(
134 PANIC, "Overflowed uncorrected",
135 BITSET(MCI_STATUS_OVER|MCI_STATUS_UC)
136 ),
137 MCESEV(
138 UC, "Uncorrected",
139 BITSET(MCI_STATUS_UC)
140 ),
141 MCESEV(
142 SOME, "No match",
143 BITSET(0)
144 ) /* always matches. keep at end */
101}; 145};
102 146
103/* 147/*
@@ -112,15 +156,15 @@ static int error_context(struct mce *m)
112 return IN_KERNEL; 156 return IN_KERNEL;
113} 157}
114 158
115int mce_severity(struct mce *a, int tolerant, char **msg) 159int mce_severity(struct mce *m, int tolerant, char **msg)
116{ 160{
117 enum context ctx = error_context(a); 161 enum context ctx = error_context(m);
118 struct severity *s; 162 struct severity *s;
119 163
120 for (s = severities;; s++) { 164 for (s = severities;; s++) {
121 if ((a->status & s->mask) != s->result) 165 if ((m->status & s->mask) != s->result)
122 continue; 166 continue;
123 if ((a->mcgstatus & s->mcgmask) != s->mcgres) 167 if ((m->mcgstatus & s->mcgmask) != s->mcgres)
124 continue; 168 continue;
125 if (s->ser == SER_REQUIRED && !mce_ser) 169 if (s->ser == SER_REQUIRED && !mce_ser)
126 continue; 170 continue;
@@ -197,15 +241,15 @@ static const struct file_operations severities_coverage_fops = {
197 241
198static int __init severities_debugfs_init(void) 242static int __init severities_debugfs_init(void)
199{ 243{
200 struct dentry *dmce = NULL, *fseverities_coverage = NULL; 244 struct dentry *dmce, *fsev;
201 245
202 dmce = mce_get_debugfs_dir(); 246 dmce = mce_get_debugfs_dir();
203 if (dmce == NULL) 247 if (!dmce)
204 goto err_out; 248 goto err_out;
205 fseverities_coverage = debugfs_create_file("severities-coverage", 249
206 0444, dmce, NULL, 250 fsev = debugfs_create_file("severities-coverage", 0444, dmce, NULL,
207 &severities_coverage_fops); 251 &severities_coverage_fops);
208 if (fseverities_coverage == NULL) 252 if (!fsev)
209 goto err_out; 253 goto err_out;
210 254
211 return 0; 255 return 0;
@@ -214,4 +258,4 @@ err_out:
214 return -ENOMEM; 258 return -ENOMEM;
215} 259}
216late_initcall(severities_debugfs_init); 260late_initcall(severities_debugfs_init);
217#endif 261#endif /* CONFIG_DEBUG_FS */
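
The mce-severity.c rework is mechanical: instead of several macros that each emit a complete table entry, a single MCESEV() wrapper takes the severity, the message and any number of matcher fragments, so each rule reads as one block. As a reading aid, this is roughly what one entry from the table above expands to under the new #defines (reconstructed from the macros shown in the hunk, not extra code in the patch):

	/* MCESEV(
	 *	PANIC, "In kernel and no restart IP",
	 *	KERNEL, MCGMASK(MCG_STATUS_RIPV, 0)
	 * ) expands to approximately: */
	{
		.sev     = MCE_PANIC_SEVERITY,
		.msg     = "In kernel and no restart IP",
		.context = IN_KERNEL,
		.mcgmask = MCG_STATUS_RIPV,
		.mcgres  = 0,
	},
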
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index ff1ae9b6464..08363b04212 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -10,7 +10,6 @@
10#include <linux/thread_info.h> 10#include <linux/thread_info.h>
11#include <linux/capability.h> 11#include <linux/capability.h>
12#include <linux/miscdevice.h> 12#include <linux/miscdevice.h>
13#include <linux/interrupt.h>
14#include <linux/ratelimit.h> 13#include <linux/ratelimit.h>
15#include <linux/kallsyms.h> 14#include <linux/kallsyms.h>
16#include <linux/rcupdate.h> 15#include <linux/rcupdate.h>
@@ -38,23 +37,20 @@
38#include <linux/mm.h> 37#include <linux/mm.h>
39#include <linux/debugfs.h> 38#include <linux/debugfs.h>
40#include <linux/edac_mce.h> 39#include <linux/edac_mce.h>
40#include <linux/irq_work.h>
41 41
42#include <asm/processor.h> 42#include <asm/processor.h>
43#include <asm/hw_irq.h>
44#include <asm/apic.h>
45#include <asm/idle.h>
46#include <asm/ipi.h>
47#include <asm/mce.h> 43#include <asm/mce.h>
48#include <asm/msr.h> 44#include <asm/msr.h>
49 45
50#include "mce-internal.h" 46#include "mce-internal.h"
51 47
52static DEFINE_MUTEX(mce_read_mutex); 48static DEFINE_MUTEX(mce_chrdev_read_mutex);
53 49
54#define rcu_dereference_check_mce(p) \ 50#define rcu_dereference_check_mce(p) \
55 rcu_dereference_index_check((p), \ 51 rcu_dereference_index_check((p), \
56 rcu_read_lock_sched_held() || \ 52 rcu_read_lock_sched_held() || \
57 lockdep_is_held(&mce_read_mutex)) 53 lockdep_is_held(&mce_chrdev_read_mutex))
58 54
59#define CREATE_TRACE_POINTS 55#define CREATE_TRACE_POINTS
60#include <trace/events/mce.h> 56#include <trace/events/mce.h>
@@ -94,7 +90,8 @@ static unsigned long mce_need_notify;
94static char mce_helper[128]; 90static char mce_helper[128];
95static char *mce_helper_argv[2] = { mce_helper, NULL }; 91static char *mce_helper_argv[2] = { mce_helper, NULL };
96 92
97static DECLARE_WAIT_QUEUE_HEAD(mce_wait); 93static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
94
98static DEFINE_PER_CPU(struct mce, mces_seen); 95static DEFINE_PER_CPU(struct mce, mces_seen);
99static int cpu_missing; 96static int cpu_missing;
100 97
@@ -373,6 +370,31 @@ static void mce_wrmsrl(u32 msr, u64 v)
373} 370}
374 371
375/* 372/*
373 * Collect all global (w.r.t. this processor) status about this machine
374 * check into our "mce" struct so that we can use it later to assess
375 * the severity of the problem as we read per-bank specific details.
376 */
377static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
378{
379 mce_setup(m);
380
381 m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
382 if (regs) {
383 /*
384 * Get the address of the instruction at the time of
385 * the machine check error.
386 */
387 if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
388 m->ip = regs->ip;
389 m->cs = regs->cs;
390 }
391 /* Use accurate RIP reporting if available. */
392 if (rip_msr)
393 m->ip = mce_rdmsrl(rip_msr);
394 }
395}
396
397/*
376 * Simple lockless ring to communicate PFNs from the exception handler with the 398 * Simple lockless ring to communicate PFNs from the exception handler with the
377 * process context work function. This is vastly simplified because there's 399 * process context work function. This is vastly simplified because there's
378 * only a single reader and a single writer. 400 * only a single reader and a single writer.
@@ -443,40 +465,13 @@ static void mce_schedule_work(void)
443 } 465 }
444} 466}
445 467
446/* 468DEFINE_PER_CPU(struct irq_work, mce_irq_work);
447 * Get the address of the instruction at the time of the machine check
448 * error.
449 */
450static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
451{
452
453 if (regs && (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV))) {
454 m->ip = regs->ip;
455 m->cs = regs->cs;
456 } else {
457 m->ip = 0;
458 m->cs = 0;
459 }
460 if (rip_msr)
461 m->ip = mce_rdmsrl(rip_msr);
462}
463 469
464#ifdef CONFIG_X86_LOCAL_APIC 470static void mce_irq_work_cb(struct irq_work *entry)
465/*
466 * Called after interrupts have been reenabled again
467 * when a MCE happened during an interrupts off region
468 * in the kernel.
469 */
470asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs)
471{ 471{
472 ack_APIC_irq();
473 exit_idle();
474 irq_enter();
475 mce_notify_irq(); 472 mce_notify_irq();
476 mce_schedule_work(); 473 mce_schedule_work();
477 irq_exit();
478} 474}
479#endif
480 475
481static void mce_report_event(struct pt_regs *regs) 476static void mce_report_event(struct pt_regs *regs)
482{ 477{
@@ -492,29 +487,7 @@ static void mce_report_event(struct pt_regs *regs)
492 return; 487 return;
493 } 488 }
494 489
495#ifdef CONFIG_X86_LOCAL_APIC 490 irq_work_queue(&__get_cpu_var(mce_irq_work));
496 /*
497 * Without APIC do not notify. The event will be picked
498 * up eventually.
499 */
500 if (!cpu_has_apic)
501 return;
502
503 /*
504 * When interrupts are disabled we cannot use
505 * kernel services safely. Trigger an self interrupt
506 * through the APIC to instead do the notification
507 * after interrupts are reenabled again.
508 */
509 apic->send_IPI_self(MCE_SELF_VECTOR);
510
511 /*
512 * Wait for idle afterwards again so that we don't leave the
513 * APIC in a non idle state because the normal APIC writes
514 * cannot exclude us.
515 */
516 apic_wait_icr_idle();
517#endif
518} 491}
519 492
520DEFINE_PER_CPU(unsigned, mce_poll_count); 493DEFINE_PER_CPU(unsigned, mce_poll_count);
@@ -541,9 +514,8 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
541 514
542 percpu_inc(mce_poll_count); 515 percpu_inc(mce_poll_count);
543 516
544 mce_setup(&m); 517 mce_gather_info(&m, NULL);
545 518
546 m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
547 for (i = 0; i < banks; i++) { 519 for (i = 0; i < banks; i++) {
548 if (!mce_banks[i].ctl || !test_bit(i, *b)) 520 if (!mce_banks[i].ctl || !test_bit(i, *b))
549 continue; 521 continue;
@@ -879,9 +851,9 @@ static int mce_usable_address(struct mce *m)
879{ 851{
880 if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV)) 852 if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
881 return 0; 853 return 0;
882 if ((m->misc & 0x3f) > PAGE_SHIFT) 854 if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
883 return 0; 855 return 0;
884 if (((m->misc >> 6) & 7) != MCM_ADDR_PHYS) 856 if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
885 return 0; 857 return 0;
886 return 1; 858 return 1;
887} 859}
@@ -942,9 +914,8 @@ void do_machine_check(struct pt_regs *regs, long error_code)
942 if (!banks) 914 if (!banks)
943 goto out; 915 goto out;
944 916
945 mce_setup(&m); 917 mce_gather_info(&m, regs);
946 918
947 m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
948 final = &__get_cpu_var(mces_seen); 919 final = &__get_cpu_var(mces_seen);
949 *final = m; 920 *final = m;
950 921
@@ -1028,7 +999,6 @@ void do_machine_check(struct pt_regs *regs, long error_code)
1028 if (severity == MCE_AO_SEVERITY && mce_usable_address(&m)) 999 if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
1029 mce_ring_add(m.addr >> PAGE_SHIFT); 1000 mce_ring_add(m.addr >> PAGE_SHIFT);
1030 1001
1031 mce_get_rip(&m, regs);
1032 mce_log(&m); 1002 mce_log(&m);
1033 1003
1034 if (severity > worst) { 1004 if (severity > worst) {
@@ -1190,7 +1160,8 @@ int mce_notify_irq(void)
1190 clear_thread_flag(TIF_MCE_NOTIFY); 1160 clear_thread_flag(TIF_MCE_NOTIFY);
1191 1161
1192 if (test_and_clear_bit(0, &mce_need_notify)) { 1162 if (test_and_clear_bit(0, &mce_need_notify)) {
1193 wake_up_interruptible(&mce_wait); 1163 /* wake processes polling /dev/mcelog */
1164 wake_up_interruptible(&mce_chrdev_wait);
1194 1165
1195 /* 1166 /*
1196 * There is no risk of missing notifications because 1167 * There is no risk of missing notifications because
@@ -1363,18 +1334,23 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
1363 return 0; 1334 return 0;
1364} 1335}
1365 1336
1366static void __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) 1337static int __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
1367{ 1338{
1368 if (c->x86 != 5) 1339 if (c->x86 != 5)
1369 return; 1340 return 0;
1341
1370 switch (c->x86_vendor) { 1342 switch (c->x86_vendor) {
1371 case X86_VENDOR_INTEL: 1343 case X86_VENDOR_INTEL:
1372 intel_p5_mcheck_init(c); 1344 intel_p5_mcheck_init(c);
1345 return 1;
1373 break; 1346 break;
1374 case X86_VENDOR_CENTAUR: 1347 case X86_VENDOR_CENTAUR:
1375 winchip_mcheck_init(c); 1348 winchip_mcheck_init(c);
1349 return 1;
1376 break; 1350 break;
1377 } 1351 }
1352
1353 return 0;
1378} 1354}
1379 1355
1380static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) 1356static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
@@ -1428,7 +1404,8 @@ void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c)
1428 if (mce_disabled) 1404 if (mce_disabled)
1429 return; 1405 return;
1430 1406
1431 __mcheck_cpu_ancient_init(c); 1407 if (__mcheck_cpu_ancient_init(c))
1408 return;
1432 1409
1433 if (!mce_available(c)) 1410 if (!mce_available(c))
1434 return; 1411 return;
@@ -1444,44 +1421,45 @@ void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c)
1444 __mcheck_cpu_init_vendor(c); 1421 __mcheck_cpu_init_vendor(c);
1445 __mcheck_cpu_init_timer(); 1422 __mcheck_cpu_init_timer();
1446 INIT_WORK(&__get_cpu_var(mce_work), mce_process_work); 1423 INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
1447 1424 init_irq_work(&__get_cpu_var(mce_irq_work), &mce_irq_work_cb);
1448} 1425}
1449 1426
1450/* 1427/*
1451 * Character device to read and clear the MCE log. 1428 * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log.
1452 */ 1429 */
1453 1430
1454static DEFINE_SPINLOCK(mce_state_lock); 1431static DEFINE_SPINLOCK(mce_chrdev_state_lock);
1455static int open_count; /* #times opened */ 1432static int mce_chrdev_open_count; /* #times opened */
1456static int open_exclu; /* already open exclusive? */ 1433static int mce_chrdev_open_exclu; /* already open exclusive? */
1457 1434
1458static int mce_open(struct inode *inode, struct file *file) 1435static int mce_chrdev_open(struct inode *inode, struct file *file)
1459{ 1436{
1460 spin_lock(&mce_state_lock); 1437 spin_lock(&mce_chrdev_state_lock);
1461 1438
1462 if (open_exclu || (open_count && (file->f_flags & O_EXCL))) { 1439 if (mce_chrdev_open_exclu ||
1463 spin_unlock(&mce_state_lock); 1440 (mce_chrdev_open_count && (file->f_flags & O_EXCL))) {
1441 spin_unlock(&mce_chrdev_state_lock);
1464 1442
1465 return -EBUSY; 1443 return -EBUSY;
1466 } 1444 }
1467 1445
1468 if (file->f_flags & O_EXCL) 1446 if (file->f_flags & O_EXCL)
1469 open_exclu = 1; 1447 mce_chrdev_open_exclu = 1;
1470 open_count++; 1448 mce_chrdev_open_count++;
1471 1449
1472 spin_unlock(&mce_state_lock); 1450 spin_unlock(&mce_chrdev_state_lock);
1473 1451
1474 return nonseekable_open(inode, file); 1452 return nonseekable_open(inode, file);
1475} 1453}
1476 1454
1477static int mce_release(struct inode *inode, struct file *file) 1455static int mce_chrdev_release(struct inode *inode, struct file *file)
1478{ 1456{
1479 spin_lock(&mce_state_lock); 1457 spin_lock(&mce_chrdev_state_lock);
1480 1458
1481 open_count--; 1459 mce_chrdev_open_count--;
1482 open_exclu = 0; 1460 mce_chrdev_open_exclu = 0;
1483 1461
1484 spin_unlock(&mce_state_lock); 1462 spin_unlock(&mce_chrdev_state_lock);
1485 1463
1486 return 0; 1464 return 0;
1487} 1465}
@@ -1530,8 +1508,8 @@ static int __mce_read_apei(char __user **ubuf, size_t usize)
1530 return 0; 1508 return 0;
1531} 1509}
1532 1510
1533static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, 1511static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
1534 loff_t *off) 1512 size_t usize, loff_t *off)
1535{ 1513{
1536 char __user *buf = ubuf; 1514 char __user *buf = ubuf;
1537 unsigned long *cpu_tsc; 1515 unsigned long *cpu_tsc;
@@ -1542,7 +1520,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
1542 if (!cpu_tsc) 1520 if (!cpu_tsc)
1543 return -ENOMEM; 1521 return -ENOMEM;
1544 1522
1545 mutex_lock(&mce_read_mutex); 1523 mutex_lock(&mce_chrdev_read_mutex);
1546 1524
1547 if (!mce_apei_read_done) { 1525 if (!mce_apei_read_done) {
1548 err = __mce_read_apei(&buf, usize); 1526 err = __mce_read_apei(&buf, usize);
@@ -1562,19 +1540,18 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
1562 do { 1540 do {
1563 for (i = prev; i < next; i++) { 1541 for (i = prev; i < next; i++) {
1564 unsigned long start = jiffies; 1542 unsigned long start = jiffies;
1543 struct mce *m = &mcelog.entry[i];
1565 1544
1566 while (!mcelog.entry[i].finished) { 1545 while (!m->finished) {
1567 if (time_after_eq(jiffies, start + 2)) { 1546 if (time_after_eq(jiffies, start + 2)) {
1568 memset(mcelog.entry + i, 0, 1547 memset(m, 0, sizeof(*m));
1569 sizeof(struct mce));
1570 goto timeout; 1548 goto timeout;
1571 } 1549 }
1572 cpu_relax(); 1550 cpu_relax();
1573 } 1551 }
1574 smp_rmb(); 1552 smp_rmb();
1575 err |= copy_to_user(buf, mcelog.entry + i, 1553 err |= copy_to_user(buf, m, sizeof(*m));
1576 sizeof(struct mce)); 1554 buf += sizeof(*m);
1577 buf += sizeof(struct mce);
1578timeout: 1555timeout:
1579 ; 1556 ;
1580 } 1557 }
@@ -1594,13 +1571,13 @@ timeout:
1594 on_each_cpu(collect_tscs, cpu_tsc, 1); 1571 on_each_cpu(collect_tscs, cpu_tsc, 1);
1595 1572
1596 for (i = next; i < MCE_LOG_LEN; i++) { 1573 for (i = next; i < MCE_LOG_LEN; i++) {
1597 if (mcelog.entry[i].finished && 1574 struct mce *m = &mcelog.entry[i];
1598 mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) { 1575
1599 err |= copy_to_user(buf, mcelog.entry+i, 1576 if (m->finished && m->tsc < cpu_tsc[m->cpu]) {
1600 sizeof(struct mce)); 1577 err |= copy_to_user(buf, m, sizeof(*m));
1601 smp_rmb(); 1578 smp_rmb();
1602 buf += sizeof(struct mce); 1579 buf += sizeof(*m);
1603 memset(&mcelog.entry[i], 0, sizeof(struct mce)); 1580 memset(m, 0, sizeof(*m));
1604 } 1581 }
1605 } 1582 }
1606 1583
@@ -1608,15 +1585,15 @@ timeout:
1608 err = -EFAULT; 1585 err = -EFAULT;
1609 1586
1610out: 1587out:
1611 mutex_unlock(&mce_read_mutex); 1588 mutex_unlock(&mce_chrdev_read_mutex);
1612 kfree(cpu_tsc); 1589 kfree(cpu_tsc);
1613 1590
1614 return err ? err : buf - ubuf; 1591 return err ? err : buf - ubuf;
1615} 1592}
1616 1593
1617static unsigned int mce_poll(struct file *file, poll_table *wait) 1594static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait)
1618{ 1595{
1619 poll_wait(file, &mce_wait, wait); 1596 poll_wait(file, &mce_chrdev_wait, wait);
1620 if (rcu_access_index(mcelog.next)) 1597 if (rcu_access_index(mcelog.next))
1621 return POLLIN | POLLRDNORM; 1598 return POLLIN | POLLRDNORM;
1622 if (!mce_apei_read_done && apei_check_mce()) 1599 if (!mce_apei_read_done && apei_check_mce())
@@ -1624,7 +1601,8 @@ static unsigned int mce_poll(struct file *file, poll_table *wait)
1624 return 0; 1601 return 0;
1625} 1602}
1626 1603
1627static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg) 1604static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
1605 unsigned long arg)
1628{ 1606{
1629 int __user *p = (int __user *)arg; 1607 int __user *p = (int __user *)arg;
1630 1608
@@ -1652,16 +1630,16 @@ static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
1652 1630
1653/* Modified in mce-inject.c, so not static or const */ 1631/* Modified in mce-inject.c, so not static or const */
1654struct file_operations mce_chrdev_ops = { 1632struct file_operations mce_chrdev_ops = {
1655 .open = mce_open, 1633 .open = mce_chrdev_open,
1656 .release = mce_release, 1634 .release = mce_chrdev_release,
1657 .read = mce_read, 1635 .read = mce_chrdev_read,
1658 .poll = mce_poll, 1636 .poll = mce_chrdev_poll,
1659 .unlocked_ioctl = mce_ioctl, 1637 .unlocked_ioctl = mce_chrdev_ioctl,
1660 .llseek = no_llseek, 1638 .llseek = no_llseek,
1661}; 1639};
1662EXPORT_SYMBOL_GPL(mce_chrdev_ops); 1640EXPORT_SYMBOL_GPL(mce_chrdev_ops);
1663 1641
1664static struct miscdevice mce_log_device = { 1642static struct miscdevice mce_chrdev_device = {
1665 MISC_MCELOG_MINOR, 1643 MISC_MCELOG_MINOR,
1666 "mcelog", 1644 "mcelog",
1667 &mce_chrdev_ops, 1645 &mce_chrdev_ops,
@@ -1719,7 +1697,7 @@ int __init mcheck_init(void)
1719} 1697}
1720 1698
1721/* 1699/*
1722 * Sysfs support 1700 * mce_syscore: PM support
1723 */ 1701 */
1724 1702
1725/* 1703/*
@@ -1739,12 +1717,12 @@ static int mce_disable_error_reporting(void)
1739 return 0; 1717 return 0;
1740} 1718}
1741 1719
1742static int mce_suspend(void) 1720static int mce_syscore_suspend(void)
1743{ 1721{
1744 return mce_disable_error_reporting(); 1722 return mce_disable_error_reporting();
1745} 1723}
1746 1724
1747static void mce_shutdown(void) 1725static void mce_syscore_shutdown(void)
1748{ 1726{
1749 mce_disable_error_reporting(); 1727 mce_disable_error_reporting();
1750} 1728}
@@ -1754,18 +1732,22 @@ static void mce_shutdown(void)
1754 * Only one CPU is active at this time, the others get re-added later using 1732 * Only one CPU is active at this time, the others get re-added later using
1755 * CPU hotplug: 1733 * CPU hotplug:
1756 */ 1734 */
1757static void mce_resume(void) 1735static void mce_syscore_resume(void)
1758{ 1736{
1759 __mcheck_cpu_init_generic(); 1737 __mcheck_cpu_init_generic();
1760 __mcheck_cpu_init_vendor(__this_cpu_ptr(&cpu_info)); 1738 __mcheck_cpu_init_vendor(__this_cpu_ptr(&cpu_info));
1761} 1739}
1762 1740
1763static struct syscore_ops mce_syscore_ops = { 1741static struct syscore_ops mce_syscore_ops = {
1764 .suspend = mce_suspend, 1742 .suspend = mce_syscore_suspend,
1765 .shutdown = mce_shutdown, 1743 .shutdown = mce_syscore_shutdown,
1766 .resume = mce_resume, 1744 .resume = mce_syscore_resume,
1767}; 1745};
1768 1746
1747/*
1748 * mce_sysdev: Sysfs support
1749 */
1750
1769static void mce_cpu_restart(void *data) 1751static void mce_cpu_restart(void *data)
1770{ 1752{
1771 del_timer_sync(&__get_cpu_var(mce_timer)); 1753 del_timer_sync(&__get_cpu_var(mce_timer));
@@ -1801,11 +1783,11 @@ static void mce_enable_ce(void *all)
1801 __mcheck_cpu_init_timer(); 1783 __mcheck_cpu_init_timer();
1802} 1784}
1803 1785
1804static struct sysdev_class mce_sysclass = { 1786static struct sysdev_class mce_sysdev_class = {
1805 .name = "machinecheck", 1787 .name = "machinecheck",
1806}; 1788};
1807 1789
1808DEFINE_PER_CPU(struct sys_device, mce_dev); 1790DEFINE_PER_CPU(struct sys_device, mce_sysdev);
1809 1791
1810__cpuinitdata 1792__cpuinitdata
1811void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); 1793void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
@@ -1934,7 +1916,7 @@ static struct sysdev_ext_attribute attr_cmci_disabled = {
1934 &mce_cmci_disabled 1916 &mce_cmci_disabled
1935}; 1917};
1936 1918
1937static struct sysdev_attribute *mce_attrs[] = { 1919static struct sysdev_attribute *mce_sysdev_attrs[] = {
1938 &attr_tolerant.attr, 1920 &attr_tolerant.attr,
1939 &attr_check_interval.attr, 1921 &attr_check_interval.attr,
1940 &attr_trigger, 1922 &attr_trigger,
@@ -1945,66 +1927,67 @@ static struct sysdev_attribute *mce_attrs[] = {
1945 NULL 1927 NULL
1946}; 1928};
1947 1929
1948static cpumask_var_t mce_dev_initialized; 1930static cpumask_var_t mce_sysdev_initialized;
1949 1931
1950/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */ 1932/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */
1951static __cpuinit int mce_create_device(unsigned int cpu) 1933static __cpuinit int mce_sysdev_create(unsigned int cpu)
1952{ 1934{
1935 struct sys_device *sysdev = &per_cpu(mce_sysdev, cpu);
1953 int err; 1936 int err;
1954 int i, j; 1937 int i, j;
1955 1938
1956 if (!mce_available(&boot_cpu_data)) 1939 if (!mce_available(&boot_cpu_data))
1957 return -EIO; 1940 return -EIO;
1958 1941
1959 memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject)); 1942 memset(&sysdev->kobj, 0, sizeof(struct kobject));
1960 per_cpu(mce_dev, cpu).id = cpu; 1943 sysdev->id = cpu;
1961 per_cpu(mce_dev, cpu).cls = &mce_sysclass; 1944 sysdev->cls = &mce_sysdev_class;
1962 1945
1963 err = sysdev_register(&per_cpu(mce_dev, cpu)); 1946 err = sysdev_register(sysdev);
1964 if (err) 1947 if (err)
1965 return err; 1948 return err;
1966 1949
1967 for (i = 0; mce_attrs[i]; i++) { 1950 for (i = 0; mce_sysdev_attrs[i]; i++) {
1968 err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); 1951 err = sysdev_create_file(sysdev, mce_sysdev_attrs[i]);
1969 if (err) 1952 if (err)
1970 goto error; 1953 goto error;
1971 } 1954 }
1972 for (j = 0; j < banks; j++) { 1955 for (j = 0; j < banks; j++) {
1973 err = sysdev_create_file(&per_cpu(mce_dev, cpu), 1956 err = sysdev_create_file(sysdev, &mce_banks[j].attr);
1974 &mce_banks[j].attr);
1975 if (err) 1957 if (err)
1976 goto error2; 1958 goto error2;
1977 } 1959 }
1978 cpumask_set_cpu(cpu, mce_dev_initialized); 1960 cpumask_set_cpu(cpu, mce_sysdev_initialized);
1979 1961
1980 return 0; 1962 return 0;
1981error2: 1963error2:
1982 while (--j >= 0) 1964 while (--j >= 0)
1983 sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[j].attr); 1965 sysdev_remove_file(sysdev, &mce_banks[j].attr);
1984error: 1966error:
1985 while (--i >= 0) 1967 while (--i >= 0)
1986 sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); 1968 sysdev_remove_file(sysdev, mce_sysdev_attrs[i]);
1987 1969
1988 sysdev_unregister(&per_cpu(mce_dev, cpu)); 1970 sysdev_unregister(sysdev);
1989 1971
1990 return err; 1972 return err;
1991} 1973}
1992 1974
1993static __cpuinit void mce_remove_device(unsigned int cpu) 1975static __cpuinit void mce_sysdev_remove(unsigned int cpu)
1994{ 1976{
1977 struct sys_device *sysdev = &per_cpu(mce_sysdev, cpu);
1995 int i; 1978 int i;
1996 1979
1997 if (!cpumask_test_cpu(cpu, mce_dev_initialized)) 1980 if (!cpumask_test_cpu(cpu, mce_sysdev_initialized))
1998 return; 1981 return;
1999 1982
2000 for (i = 0; mce_attrs[i]; i++) 1983 for (i = 0; mce_sysdev_attrs[i]; i++)
2001 sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); 1984 sysdev_remove_file(sysdev, mce_sysdev_attrs[i]);
2002 1985
2003 for (i = 0; i < banks; i++) 1986 for (i = 0; i < banks; i++)
2004 sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr); 1987 sysdev_remove_file(sysdev, &mce_banks[i].attr);
2005 1988
2006 sysdev_unregister(&per_cpu(mce_dev, cpu)); 1989 sysdev_unregister(sysdev);
2007 cpumask_clear_cpu(cpu, mce_dev_initialized); 1990 cpumask_clear_cpu(cpu, mce_sysdev_initialized);
2008} 1991}
2009 1992
2010/* Make sure there are no machine checks on offlined CPUs. */ 1993/* Make sure there are no machine checks on offlined CPUs. */
@@ -2054,7 +2037,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
2054 switch (action) { 2037 switch (action) {
2055 case CPU_ONLINE: 2038 case CPU_ONLINE:
2056 case CPU_ONLINE_FROZEN: 2039 case CPU_ONLINE_FROZEN:
2057 mce_create_device(cpu); 2040 mce_sysdev_create(cpu);
2058 if (threshold_cpu_callback) 2041 if (threshold_cpu_callback)
2059 threshold_cpu_callback(action, cpu); 2042 threshold_cpu_callback(action, cpu);
2060 break; 2043 break;
@@ -2062,7 +2045,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
2062 case CPU_DEAD_FROZEN: 2045 case CPU_DEAD_FROZEN:
2063 if (threshold_cpu_callback) 2046 if (threshold_cpu_callback)
2064 threshold_cpu_callback(action, cpu); 2047 threshold_cpu_callback(action, cpu);
2065 mce_remove_device(cpu); 2048 mce_sysdev_remove(cpu);
2066 break; 2049 break;
2067 case CPU_DOWN_PREPARE: 2050 case CPU_DOWN_PREPARE:
2068 case CPU_DOWN_PREPARE_FROZEN: 2051 case CPU_DOWN_PREPARE_FROZEN:
@@ -2116,27 +2099,28 @@ static __init int mcheck_init_device(void)
2116 if (!mce_available(&boot_cpu_data)) 2099 if (!mce_available(&boot_cpu_data))
2117 return -EIO; 2100 return -EIO;
2118 2101
2119 zalloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL); 2102 zalloc_cpumask_var(&mce_sysdev_initialized, GFP_KERNEL);
2120 2103
2121 mce_init_banks(); 2104 mce_init_banks();
2122 2105
2123 err = sysdev_class_register(&mce_sysclass); 2106 err = sysdev_class_register(&mce_sysdev_class);
2124 if (err) 2107 if (err)
2125 return err; 2108 return err;
2126 2109
2127 for_each_online_cpu(i) { 2110 for_each_online_cpu(i) {
2128 err = mce_create_device(i); 2111 err = mce_sysdev_create(i);
2129 if (err) 2112 if (err)
2130 return err; 2113 return err;
2131 } 2114 }
2132 2115
2133 register_syscore_ops(&mce_syscore_ops); 2116 register_syscore_ops(&mce_syscore_ops);
2134 register_hotcpu_notifier(&mce_cpu_notifier); 2117 register_hotcpu_notifier(&mce_cpu_notifier);
2135 misc_register(&mce_log_device); 2118
2119 /* register character device /dev/mcelog */
2120 misc_register(&mce_chrdev_device);
2136 2121
2137 return err; 2122 return err;
2138} 2123}
2139
2140device_initcall(mcheck_init_device); 2124device_initcall(mcheck_init_device);
2141 2125
2142/* 2126/*
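
The functional core of the mce.c change is the switch from a hand-rolled self-IPI (smp_mce_self_interrupt() raised via apic->send_IPI_self()) to the generic irq_work machinery: work queued from machine-check context runs later in a context where wakeups and work scheduling are legal, and the old !cpu_has_apic special case disappears. The rest of the diff is the mce_chrdev_*/mce_sysdev_*/mce_syscore_* renames plus the mce_gather_info() consolidation. A minimal sketch of the irq_work pattern being adopted, with illustrative names:

#include <linux/irq_work.h>
#include <linux/percpu.h>

/* Callback runs later with interrupts enabled; wakeups etc. are safe here. */
static void example_irq_work_cb(struct irq_work *work)
{
	/* e.g. wake up /dev/mcelog readers, schedule process-context work */
}

static DEFINE_PER_CPU(struct irq_work, example_irq_work);

static void example_setup(void)			/* per-CPU init time */
{
	init_irq_work(&__get_cpu_var(example_irq_work), example_irq_work_cb);
}

static void example_raise(void)			/* from NMI/#MC context */
{
	/* queueing is NMI-safe; the callback is deferred */
	irq_work_queue(&__get_cpu_var(example_irq_work));
}
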
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index bb0adad3514..f5474218cff 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -548,7 +548,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
548 if (!b) 548 if (!b)
549 goto out; 549 goto out;
550 550
551 err = sysfs_create_link(&per_cpu(mce_dev, cpu).kobj, 551 err = sysfs_create_link(&per_cpu(mce_sysdev, cpu).kobj,
552 b->kobj, name); 552 b->kobj, name);
553 if (err) 553 if (err)
554 goto out; 554 goto out;
@@ -571,7 +571,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
571 goto out; 571 goto out;
572 } 572 }
573 573
574 b->kobj = kobject_create_and_add(name, &per_cpu(mce_dev, cpu).kobj); 574 b->kobj = kobject_create_and_add(name, &per_cpu(mce_sysdev, cpu).kobj);
575 if (!b->kobj) 575 if (!b->kobj)
576 goto out_free; 576 goto out_free;
577 577
@@ -591,7 +591,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
591 if (i == cpu) 591 if (i == cpu)
592 continue; 592 continue;
593 593
594 err = sysfs_create_link(&per_cpu(mce_dev, i).kobj, 594 err = sysfs_create_link(&per_cpu(mce_sysdev, i).kobj,
595 b->kobj, name); 595 b->kobj, name);
596 if (err) 596 if (err)
597 goto out; 597 goto out;
@@ -669,7 +669,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
669#ifdef CONFIG_SMP 669#ifdef CONFIG_SMP
670 /* sibling symlink */ 670 /* sibling symlink */
671 if (shared_bank[bank] && b->blocks->cpu != cpu) { 671 if (shared_bank[bank] && b->blocks->cpu != cpu) {
672 sysfs_remove_link(&per_cpu(mce_dev, cpu).kobj, name); 672 sysfs_remove_link(&per_cpu(mce_sysdev, cpu).kobj, name);
673 per_cpu(threshold_banks, cpu)[bank] = NULL; 673 per_cpu(threshold_banks, cpu)[bank] = NULL;
674 674
675 return; 675 return;
@@ -681,7 +681,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
681 if (i == cpu) 681 if (i == cpu)
682 continue; 682 continue;
683 683
684 sysfs_remove_link(&per_cpu(mce_dev, i).kobj, name); 684 sysfs_remove_link(&per_cpu(mce_sysdev, i).kobj, name);
685 per_cpu(threshold_banks, i)[bank] = NULL; 685 per_cpu(threshold_banks, i)[bank] = NULL;
686 } 686 }
687 687
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 929739a653d..6b96110bb0c 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -79,7 +79,6 @@ void set_mtrr_ops(const struct mtrr_ops *ops)
79static int have_wrcomb(void) 79static int have_wrcomb(void)
80{ 80{
81 struct pci_dev *dev; 81 struct pci_dev *dev;
82 u8 rev;
83 82
84 dev = pci_get_class(PCI_CLASS_BRIDGE_HOST << 8, NULL); 83 dev = pci_get_class(PCI_CLASS_BRIDGE_HOST << 8, NULL);
85 if (dev != NULL) { 84 if (dev != NULL) {
@@ -89,13 +88,11 @@ static int have_wrcomb(void)
89 * chipsets to be tagged 88 * chipsets to be tagged
90 */ 89 */
91 if (dev->vendor == PCI_VENDOR_ID_SERVERWORKS && 90 if (dev->vendor == PCI_VENDOR_ID_SERVERWORKS &&
92 dev->device == PCI_DEVICE_ID_SERVERWORKS_LE) { 91 dev->device == PCI_DEVICE_ID_SERVERWORKS_LE &&
93 pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev); 92 dev->revision <= 5) {
94 if (rev <= 5) { 93 pr_info("mtrr: Serverworks LE rev < 6 detected. Write-combining disabled.\n");
95 pr_info("mtrr: Serverworks LE rev < 6 detected. Write-combining disabled.\n"); 94 pci_dev_put(dev);
96 pci_dev_put(dev); 95 return 0;
97 return 0;
98 }
99 } 96 }
100 /* 97 /*
101 * Intel 450NX errata # 23. Non ascending cacheline evictions to 98 * Intel 450NX errata # 23. Non ascending cacheline evictions to
@@ -137,56 +134,42 @@ static void __init init_table(void)
137} 134}
138 135
139struct set_mtrr_data { 136struct set_mtrr_data {
140 atomic_t count;
141 atomic_t gate;
142 unsigned long smp_base; 137 unsigned long smp_base;
143 unsigned long smp_size; 138 unsigned long smp_size;
144 unsigned int smp_reg; 139 unsigned int smp_reg;
145 mtrr_type smp_type; 140 mtrr_type smp_type;
146}; 141};
147 142
148static DEFINE_PER_CPU(struct cpu_stop_work, mtrr_work);
149
150/** 143/**
151 * mtrr_work_handler - Synchronisation handler. Executed by "other" CPUs. 144 * mtrr_rendezvous_handler - Work done in the synchronization handler. Executed
145 * by all the CPUs.
152 * @info: pointer to mtrr configuration data 146 * @info: pointer to mtrr configuration data
153 * 147 *
154 * Returns nothing. 148 * Returns nothing.
155 */ 149 */
156static int mtrr_work_handler(void *info) 150static int mtrr_rendezvous_handler(void *info)
157{ 151{
158#ifdef CONFIG_SMP
159 struct set_mtrr_data *data = info; 152 struct set_mtrr_data *data = info;
160 unsigned long flags;
161 153
162 atomic_dec(&data->count); 154 /*
163 while (!atomic_read(&data->gate)) 155 * We use this same function to initialize the mtrrs during boot,
164 cpu_relax(); 156 * resume, runtime cpu online and on an explicit request to set a
165 157 * specific MTRR.
166 local_irq_save(flags); 158 *
167 159 * During boot or suspend, the state of the boot cpu's mtrrs has been
168 atomic_dec(&data->count); 160 * saved, and we want to replicate that across all the cpus that come
169 while (atomic_read(&data->gate)) 161 * online (either at the end of boot or resume or during a runtime cpu
170 cpu_relax(); 162 * online). If we're doing that, @reg is set to something special and on
171 163 * all the cpu's we do mtrr_if->set_all() (On the logical cpu that
172 /* The master has cleared me to execute */ 164 * started the boot/resume sequence, this might be a duplicate
165 * set_all()).
166 */
173 if (data->smp_reg != ~0U) { 167 if (data->smp_reg != ~0U) {
174 mtrr_if->set(data->smp_reg, data->smp_base, 168 mtrr_if->set(data->smp_reg, data->smp_base,
175 data->smp_size, data->smp_type); 169 data->smp_size, data->smp_type);
176 } else if (mtrr_aps_delayed_init) { 170 } else if (mtrr_aps_delayed_init || !cpu_online(smp_processor_id())) {
177 /*
178 * Initialize the MTRRs inaddition to the synchronisation.
179 */
180 mtrr_if->set_all(); 171 mtrr_if->set_all();
181 } 172 }
182
183 atomic_dec(&data->count);
184 while (!atomic_read(&data->gate))
185 cpu_relax();
186
187 atomic_dec(&data->count);
188 local_irq_restore(flags);
189#endif
190 return 0; 173 return 0;
191} 174}
192 175
@@ -223,20 +206,11 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2)
223 * 14. Wait for buddies to catch up 206 * 14. Wait for buddies to catch up
224 * 15. Enable interrupts. 207 * 15. Enable interrupts.
225 * 208 *
226 * What does that mean for us? Well, first we set data.count to the number 209 * What does that mean for us? Well, stop_machine() will ensure that
227 * of CPUs. As each CPU announces that it started the rendezvous handler by 210 * the rendezvous handler is started on each CPU. And in lockstep they
228 * decrementing the count, We reset data.count and set the data.gate flag 211 * do the state transition of disabling interrupts, updating MTRR's
229 * allowing all the cpu's to proceed with the work. As each cpu disables 212 * (the CPU vendors may each do it differently, so we call mtrr_if->set()
230 * interrupts, it'll decrement data.count once. We wait until it hits 0 and 213 * callback and let them take care of it.) and enabling interrupts.
231 * proceed. We clear the data.gate flag and reset data.count. Meanwhile, they
232 * are waiting for that flag to be cleared. Once it's cleared, each
233 * CPU goes through the transition of updating MTRRs.
234 * The CPU vendors may each do it differently,
235 * so we call mtrr_if->set() callback and let them take care of it.
236 * When they're done, they again decrement data->count and wait for data.gate
237 * to be set.
238 * When we finish, we wait for data.count to hit 0 and toggle the data.gate flag
239 * Everyone then enables interrupts and we all continue on.
240 * 214 *
241 * Note that the mechanism is the same for UP systems, too; all the SMP stuff 215 * Note that the mechanism is the same for UP systems, too; all the SMP stuff
242 * becomes nops. 216 * becomes nops.
@@ -244,92 +218,26 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2)
244static void 218static void
245set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type) 219set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type)
246{ 220{
247 struct set_mtrr_data data; 221 struct set_mtrr_data data = { .smp_reg = reg,
248 unsigned long flags; 222 .smp_base = base,
249 int cpu; 223 .smp_size = size,
224 .smp_type = type
225 };
250 226
251 preempt_disable(); 227 stop_machine(mtrr_rendezvous_handler, &data, cpu_online_mask);
252 228}
253 data.smp_reg = reg;
254 data.smp_base = base;
255 data.smp_size = size;
256 data.smp_type = type;
257 atomic_set(&data.count, num_booting_cpus() - 1);
258
259 /* Make sure data.count is visible before unleashing other CPUs */
260 smp_wmb();
261 atomic_set(&data.gate, 0);
262
263 /* Start the ball rolling on other CPUs */
264 for_each_online_cpu(cpu) {
265 struct cpu_stop_work *work = &per_cpu(mtrr_work, cpu);
266
267 if (cpu == smp_processor_id())
268 continue;
269
270 stop_one_cpu_nowait(cpu, mtrr_work_handler, &data, work);
271 }
272
273
274 while (atomic_read(&data.count))
275 cpu_relax();
276
277 /* Ok, reset count and toggle gate */
278 atomic_set(&data.count, num_booting_cpus() - 1);
279 smp_wmb();
280 atomic_set(&data.gate, 1);
281
282 local_irq_save(flags);
283
284 while (atomic_read(&data.count))
285 cpu_relax();
286
287 /* Ok, reset count and toggle gate */
288 atomic_set(&data.count, num_booting_cpus() - 1);
289 smp_wmb();
290 atomic_set(&data.gate, 0);
291
292 /* Do our MTRR business */
293
294 /*
295 * HACK!
296 *
297 * We use this same function to initialize the mtrrs during boot,
298 * resume, runtime cpu online and on an explicit request to set a
299 * specific MTRR.
300 *
301 * During boot or suspend, the state of the boot cpu's mtrrs has been
302 * saved, and we want to replicate that across all the cpus that come
303 * online (either at the end of boot or resume or during a runtime cpu
304 * online). If we're doing that, @reg is set to something special and on
305 * this cpu we still do mtrr_if->set_all(). During boot/resume, this
306 * is unnecessary if at this point we are still on the cpu that started
307 * the boot/resume sequence. But there is no guarantee that we are still
308 * on the same cpu. So we do mtrr_if->set_all() on this cpu aswell to be
309 * sure that we are in sync with everyone else.
310 */
311 if (reg != ~0U)
312 mtrr_if->set(reg, base, size, type);
313 else
314 mtrr_if->set_all();
315
316 /* Wait for the others */
317 while (atomic_read(&data.count))
318 cpu_relax();
319
320 atomic_set(&data.count, num_booting_cpus() - 1);
321 smp_wmb();
322 atomic_set(&data.gate, 1);
323
324 /*
325 * Wait here for everyone to have seen the gate change
326 * So we're the last ones to touch 'data'
327 */
328 while (atomic_read(&data.count))
329 cpu_relax();
330 229
331 local_irq_restore(flags); 230static void set_mtrr_from_inactive_cpu(unsigned int reg, unsigned long base,
332 preempt_enable(); 231 unsigned long size, mtrr_type type)
232{
233 struct set_mtrr_data data = { .smp_reg = reg,
234 .smp_base = base,
235 .smp_size = size,
236 .smp_type = type
237 };
238
239 stop_machine_from_inactive_cpu(mtrr_rendezvous_handler, &data,
240 cpu_callout_mask);
333} 241}
334 242
335/** 243/**
@@ -783,7 +691,7 @@ void mtrr_ap_init(void)
783 * 2. cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug 691 * 2. cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug
784 * lock to prevent mtrr entry changes 692 * lock to prevent mtrr entry changes
785 */ 693 */
786 set_mtrr(~0U, 0, 0, 0); 694 set_mtrr_from_inactive_cpu(~0U, 0, 0, 0);
787} 695}
788 696
789/** 697/**
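
The mtrr/main.c rewrite drops the home-grown count/gate rendezvous and instead runs one handler on every CPU through stop_machine(), which already gives the "all CPUs in lockstep with interrupts off" behaviour the remaining comment describes; the new set_mtrr_from_inactive_cpu() path uses stop_machine_from_inactive_cpu() for mtrr_ap_init(), where the calling CPU is not yet fully online. A hedged sketch of the basic stop_machine() usage pattern (illustrative, not the patch's code):

#include <linux/kernel.h>
#include <linux/smp.h>
#include <linux/cpumask.h>
#include <linux/stop_machine.h>

struct example_mtrr_args {
	unsigned int reg;
	/* base, size and type would follow, as in struct set_mtrr_data */
};

/* Runs on every CPU in the mask, in lockstep, with interrupts disabled. */
static int example_rendezvous(void *info)
{
	struct example_mtrr_args *args = info;

	/* program this CPU's MTRR state from *args here */
	pr_debug("would program MTRR reg %u on CPU %d\n",
		 args->reg, smp_processor_id());
	return 0;
}

static void example_set_on_all_cpus(unsigned int reg)
{
	struct example_mtrr_args args = { .reg = reg };

	/* returns once every online CPU has executed the handler */
	stop_machine(example_rendezvous, &args, cpu_online_mask);
}
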
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 3a0338b4b17..cfa62ec090e 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -22,7 +22,6 @@
22#include <linux/sched.h> 22#include <linux/sched.h>
23#include <linux/uaccess.h> 23#include <linux/uaccess.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/highmem.h>
26#include <linux/cpu.h> 25#include <linux/cpu.h>
27#include <linux/bitops.h> 26#include <linux/bitops.h>
28 27
@@ -45,38 +44,27 @@ do { \
45#endif 44#endif
46 45
47/* 46/*
48 * best effort, GUP based copy_from_user() that assumes IRQ or NMI context 47 * | NHM/WSM | SNB |
48 * register -------------------------------
49 * | HT | no HT | HT | no HT |
50 *-----------------------------------------
51 * offcore | core | core | cpu | core |
52 * lbr_sel | core | core | cpu | core |
53 * ld_lat | cpu | core | cpu | core |
54 *-----------------------------------------
55 *
56 * Given that there is a small number of shared regs,
57 * we can pre-allocate their slot in the per-cpu
58 * per-core reg tables.
49 */ 59 */
50static unsigned long 60enum extra_reg_type {
51copy_from_user_nmi(void *to, const void __user *from, unsigned long n) 61 EXTRA_REG_NONE = -1, /* not used */
52{
53 unsigned long offset, addr = (unsigned long)from;
54 unsigned long size, len = 0;
55 struct page *page;
56 void *map;
57 int ret;
58
59 do {
60 ret = __get_user_pages_fast(addr, 1, 0, &page);
61 if (!ret)
62 break;
63
64 offset = addr & (PAGE_SIZE - 1);
65 size = min(PAGE_SIZE - offset, n - len);
66
67 map = kmap_atomic(page);
68 memcpy(to, map+offset, size);
69 kunmap_atomic(map);
70 put_page(page);
71 62
72 len += size; 63 EXTRA_REG_RSP_0 = 0, /* offcore_response_0 */
73 to += size; 64 EXTRA_REG_RSP_1 = 1, /* offcore_response_1 */
74 addr += size;
75 65
76 } while (len < n); 66 EXTRA_REG_MAX /* number of entries needed */
77 67};
78 return len;
79}
80 68
81struct event_constraint { 69struct event_constraint {
82 union { 70 union {
@@ -132,11 +120,10 @@ struct cpu_hw_events {
132 struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES]; 120 struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES];
133 121
134 /* 122 /*
135 * Intel percore register state. 123 * manage shared (per-core, per-cpu) registers
136 * Coordinate shared resources between HT threads. 124 * used on Intel NHM/WSM/SNB
137 */ 125 */
138 int percore_used; /* Used by this CPU? */ 126 struct intel_shared_regs *shared_regs;
139 struct intel_percore *per_core;
140 127
141 /* 128 /*
142 * AMD specific bits 129 * AMD specific bits
@@ -187,26 +174,45 @@ struct cpu_hw_events {
187 for ((e) = (c); (e)->weight; (e)++) 174 for ((e) = (c); (e)->weight; (e)++)
188 175
189/* 176/*
177 * Per register state.
178 */
179struct er_account {
180 raw_spinlock_t lock; /* per-core: protect structure */
181 u64 config; /* extra MSR config */
182 u64 reg; /* extra MSR number */
183 atomic_t ref; /* reference count */
184};
185
186/*
190 * Extra registers for specific events. 187 * Extra registers for specific events.
188 *
191 * Some events need large masks and require external MSRs. 189 * Some events need large masks and require external MSRs.
192 * Define a mapping to these extra registers. 190 * Those extra MSRs end up being shared for all events on
191 * a PMU and sometimes between PMU of sibling HT threads.
192 * In either case, the kernel needs to handle conflicting
193 * accesses to those extra, shared, regs. The data structure
194 * to manage those registers is stored in cpu_hw_event.
193 */ 195 */
194struct extra_reg { 196struct extra_reg {
195 unsigned int event; 197 unsigned int event;
196 unsigned int msr; 198 unsigned int msr;
197 u64 config_mask; 199 u64 config_mask;
198 u64 valid_mask; 200 u64 valid_mask;
201 int idx; /* per_xxx->regs[] reg index */
199}; 202};
200 203
201#define EVENT_EXTRA_REG(e, ms, m, vm) { \ 204#define EVENT_EXTRA_REG(e, ms, m, vm, i) { \
202 .event = (e), \ 205 .event = (e), \
203 .msr = (ms), \ 206 .msr = (ms), \
204 .config_mask = (m), \ 207 .config_mask = (m), \
205 .valid_mask = (vm), \ 208 .valid_mask = (vm), \
209 .idx = EXTRA_REG_##i \
206 } 210 }
207#define INTEL_EVENT_EXTRA_REG(event, msr, vm) \ 211
208 EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm) 212#define INTEL_EVENT_EXTRA_REG(event, msr, vm, idx) \
209#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0) 213 EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm, idx)
214
215#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0, RSP_0)
210 216
211union perf_capabilities { 217union perf_capabilities {
212 struct { 218 struct {
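
The struct er_account introduced above is the arbitration record for one shared extra MSR: a lock, the currently programmed config, and a reference count, allocated per core or per CPU as the scope table at the top of this hunk describes. A hedged sketch of the sharing rule those fields support; the real constraint code lands in perf_event_intel.c, this only shows the idea:

static bool example_claim_extra_reg(struct er_account *era, u64 config)
{
	bool ok = false;

	raw_spin_lock(&era->lock);
	/* first user programs the MSR; later users must want the same config */
	if (!atomic_read(&era->ref) || era->config == config) {
		era->config = config;
		atomic_inc(&era->ref);
		ok = true;
	}
	raw_spin_unlock(&era->lock);

	return ok;
}
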
@@ -252,7 +258,6 @@ struct x86_pmu {
252 void (*put_event_constraints)(struct cpu_hw_events *cpuc, 258 void (*put_event_constraints)(struct cpu_hw_events *cpuc,
253 struct perf_event *event); 259 struct perf_event *event);
254 struct event_constraint *event_constraints; 260 struct event_constraint *event_constraints;
255 struct event_constraint *percore_constraints;
256 void (*quirks)(void); 261 void (*quirks)(void);
257 int perfctr_second_write; 262 int perfctr_second_write;
258 263
@@ -286,8 +291,12 @@ struct x86_pmu {
286 * Extra registers for events 291 * Extra registers for events
287 */ 292 */
288 struct extra_reg *extra_regs; 293 struct extra_reg *extra_regs;
294 unsigned int er_flags;
289}; 295};
290 296
297#define ERF_NO_HT_SHARING 1
298#define ERF_HAS_RSP_1 2
299
291static struct x86_pmu x86_pmu __read_mostly; 300static struct x86_pmu x86_pmu __read_mostly;
292 301
293static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { 302static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
@@ -393,10 +402,10 @@ static inline unsigned int x86_pmu_event_addr(int index)
393 */ 402 */
394static int x86_pmu_extra_regs(u64 config, struct perf_event *event) 403static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
395{ 404{
405 struct hw_perf_event_extra *reg;
396 struct extra_reg *er; 406 struct extra_reg *er;
397 407
398 event->hw.extra_reg = 0; 408 reg = &event->hw.extra_reg;
399 event->hw.extra_config = 0;
400 409
401 if (!x86_pmu.extra_regs) 410 if (!x86_pmu.extra_regs)
402 return 0; 411 return 0;
@@ -406,8 +415,10 @@ static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
406 continue; 415 continue;
407 if (event->attr.config1 & ~er->valid_mask) 416 if (event->attr.config1 & ~er->valid_mask)
408 return -EINVAL; 417 return -EINVAL;
409 event->hw.extra_reg = er->msr; 418
410 event->hw.extra_config = event->attr.config1; 419 reg->idx = er->idx;
420 reg->config = event->attr.config1;
421 reg->reg = er->msr;
411 break; 422 break;
412 } 423 }
413 return 0; 424 return 0;
@@ -706,6 +717,9 @@ static int __x86_pmu_event_init(struct perf_event *event)
706 event->hw.last_cpu = -1; 717 event->hw.last_cpu = -1;
707 event->hw.last_tag = ~0ULL; 718 event->hw.last_tag = ~0ULL;
708 719
720 /* mark unused */
721 event->hw.extra_reg.idx = EXTRA_REG_NONE;
722
709 return x86_pmu.hw_config(event); 723 return x86_pmu.hw_config(event);
710} 724}
711 725
@@ -747,8 +761,8 @@ static void x86_pmu_disable(struct pmu *pmu)
747static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc, 761static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
748 u64 enable_mask) 762 u64 enable_mask)
749{ 763{
750 if (hwc->extra_reg) 764 if (hwc->extra_reg.reg)
751 wrmsrl(hwc->extra_reg, hwc->extra_config); 765 wrmsrl(hwc->extra_reg.reg, hwc->extra_reg.config);
752 wrmsrl(hwc->config_base, hwc->config | enable_mask); 766 wrmsrl(hwc->config_base, hwc->config | enable_mask);
753} 767}
754 768
@@ -1332,7 +1346,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
1332 if (!x86_perf_event_set_period(event)) 1346 if (!x86_perf_event_set_period(event))
1333 continue; 1347 continue;
1334 1348
1335 if (perf_event_overflow(event, 1, &data, regs)) 1349 if (perf_event_overflow(event, &data, regs))
1336 x86_pmu_stop(event, 0); 1350 x86_pmu_stop(event, 0);
1337 } 1351 }
1338 1352
@@ -1637,6 +1651,40 @@ static int x86_pmu_commit_txn(struct pmu *pmu)
1637 perf_pmu_enable(pmu); 1651 perf_pmu_enable(pmu);
1638 return 0; 1652 return 0;
1639} 1653}
1654/*
1655 * a fake_cpuc is used to validate event groups. Due to
1656 * the extra reg logic, we need to also allocate a fake
1657 * per_core and per_cpu structure. Otherwise, group events
1658 * using extra reg may conflict without the kernel being
1659 * able to catch this when the last event gets added to
1660 * the group.
1661 */
1662static void free_fake_cpuc(struct cpu_hw_events *cpuc)
1663{
1664 kfree(cpuc->shared_regs);
1665 kfree(cpuc);
1666}
1667
1668static struct cpu_hw_events *allocate_fake_cpuc(void)
1669{
1670 struct cpu_hw_events *cpuc;
1671 int cpu = raw_smp_processor_id();
1672
1673 cpuc = kzalloc(sizeof(*cpuc), GFP_KERNEL);
1674 if (!cpuc)
1675 return ERR_PTR(-ENOMEM);
1676
1677	/* only needed if we have extra_regs */
1678 if (x86_pmu.extra_regs) {
1679 cpuc->shared_regs = allocate_shared_regs(cpu);
1680 if (!cpuc->shared_regs)
1681 goto error;
1682 }
1683 return cpuc;
1684error:
1685 free_fake_cpuc(cpuc);
1686 return ERR_PTR(-ENOMEM);
1687}
1640 1688
1641/* 1689/*
1642 * validate that we can schedule this event 1690 * validate that we can schedule this event
@@ -1647,9 +1695,9 @@ static int validate_event(struct perf_event *event)
1647 struct event_constraint *c; 1695 struct event_constraint *c;
1648 int ret = 0; 1696 int ret = 0;
1649 1697
1650 fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO); 1698 fake_cpuc = allocate_fake_cpuc();
1651 if (!fake_cpuc) 1699 if (IS_ERR(fake_cpuc))
1652 return -ENOMEM; 1700 return PTR_ERR(fake_cpuc);
1653 1701
1654 c = x86_pmu.get_event_constraints(fake_cpuc, event); 1702 c = x86_pmu.get_event_constraints(fake_cpuc, event);
1655 1703
@@ -1659,7 +1707,7 @@ static int validate_event(struct perf_event *event)
1659 if (x86_pmu.put_event_constraints) 1707 if (x86_pmu.put_event_constraints)
1660 x86_pmu.put_event_constraints(fake_cpuc, event); 1708 x86_pmu.put_event_constraints(fake_cpuc, event);
1661 1709
1662 kfree(fake_cpuc); 1710 free_fake_cpuc(fake_cpuc);
1663 1711
1664 return ret; 1712 return ret;
1665} 1713}
@@ -1679,36 +1727,32 @@ static int validate_group(struct perf_event *event)
1679{ 1727{
1680 struct perf_event *leader = event->group_leader; 1728 struct perf_event *leader = event->group_leader;
1681 struct cpu_hw_events *fake_cpuc; 1729 struct cpu_hw_events *fake_cpuc;
1682 int ret, n; 1730 int ret = -ENOSPC, n;
1683
1684 ret = -ENOMEM;
1685 fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO);
1686 if (!fake_cpuc)
1687 goto out;
1688 1731
1732 fake_cpuc = allocate_fake_cpuc();
1733 if (IS_ERR(fake_cpuc))
1734 return PTR_ERR(fake_cpuc);
1689 /* 1735 /*
1690 * the event is not yet connected with its 1736 * the event is not yet connected with its
1691 * siblings therefore we must first collect 1737 * siblings therefore we must first collect
1692 * existing siblings, then add the new event 1738 * existing siblings, then add the new event
1693 * before we can simulate the scheduling 1739 * before we can simulate the scheduling
1694 */ 1740 */
1695 ret = -ENOSPC;
1696 n = collect_events(fake_cpuc, leader, true); 1741 n = collect_events(fake_cpuc, leader, true);
1697 if (n < 0) 1742 if (n < 0)
1698 goto out_free; 1743 goto out;
1699 1744
1700 fake_cpuc->n_events = n; 1745 fake_cpuc->n_events = n;
1701 n = collect_events(fake_cpuc, event, false); 1746 n = collect_events(fake_cpuc, event, false);
1702 if (n < 0) 1747 if (n < 0)
1703 goto out_free; 1748 goto out;
1704 1749
1705 fake_cpuc->n_events = n; 1750 fake_cpuc->n_events = n;
1706 1751
1707 ret = x86_pmu.schedule_events(fake_cpuc, n, NULL); 1752 ret = x86_pmu.schedule_events(fake_cpuc, n, NULL);
1708 1753
1709out_free:
1710 kfree(fake_cpuc);
1711out: 1754out:
1755 free_fake_cpuc(fake_cpuc);
1712 return ret; 1756 return ret;
1713} 1757}
1714 1758
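validate_group() now runs against a fake cpuc that also carries shared_regs, so conflicts between extra-reg users inside one group are caught at perf_event_open() time instead of silently clobbering an MSR. A rough user-space illustration of the situation it guards against, assuming OFFCORE_RESPONSE_0 raw events as above (the config1 values are placeholders); depending on the CPU the kernel may reject the second event or transparently move it to MSR_OFFCORE_RSP_1:

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>

static int open_offcore(uint64_t rsp_bits, int group_fd)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size     = sizeof(attr);
	attr.type     = PERF_TYPE_RAW;
	attr.config   = 0x01b7;      /* OFFCORE_RESPONSE_0 */
	attr.config1  = rsp_bits;    /* placeholder MSR payload */
	attr.disabled = (group_fd == -1);

	return syscall(__NR_perf_event_open, &attr, 0, -1, group_fd, 0);
}

int main(void)
{
	/* two group members asking for different MSR_OFFCORE_RSP_0 values */
	int leader = open_offcore(0x0001, -1);
	int second = open_offcore(0x0002, leader);

	printf("leader=%d second=%d\n", leader, second);
	if (second >= 0)
		close(second);
	if (leader >= 0)
		close(leader);
	return 0;
}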
@@ -1856,6 +1900,9 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
1856 1900
1857 perf_callchain_store(entry, regs->ip); 1901 perf_callchain_store(entry, regs->ip);
1858 1902
1903 if (!current->mm)
1904 return;
1905
1859 if (perf_callchain_user32(regs, entry)) 1906 if (perf_callchain_user32(regs, entry))
1860 return; 1907 return;
1861 1908
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index fe29c1d2219..941caa2e449 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -89,6 +89,20 @@ static __initconst const u64 amd_hw_cache_event_ids
89 [ C(RESULT_MISS) ] = -1, 89 [ C(RESULT_MISS) ] = -1,
90 }, 90 },
91 }, 91 },
92 [ C(NODE) ] = {
93 [ C(OP_READ) ] = {
94 [ C(RESULT_ACCESS) ] = 0xb8e9, /* CPU Request to Memory, l+r */
95 [ C(RESULT_MISS) ] = 0x98e9, /* CPU Request to Memory, r */
96 },
97 [ C(OP_WRITE) ] = {
98 [ C(RESULT_ACCESS) ] = -1,
99 [ C(RESULT_MISS) ] = -1,
100 },
101 [ C(OP_PREFETCH) ] = {
102 [ C(RESULT_ACCESS) ] = -1,
103 [ C(RESULT_MISS) ] = -1,
104 },
105 },
92}; 106};
93 107
94/* 108/*
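These entries back the new generic NODE cache events with the AMD northbridge "CPU Request to Memory" events (0xb8e9 counts local plus remote, 0x98e9 remote only). A minimal user-space sketch of requesting the generic event, assuming headers from a kernel that already carries the NODE addition:

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	struct perf_event_attr attr;
	long long count = 0;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size   = sizeof(attr);
	attr.type   = PERF_TYPE_HW_CACHE;
	attr.config = PERF_COUNT_HW_CACHE_NODE |
		      (PERF_COUNT_HW_CACHE_OP_READ << 8) |
		      (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16);

	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}

	/* ... run the workload of interest here ... */

	read(fd, &count, sizeof(count));
	printf("node accesses: %lld\n", count);
	close(fd);
	return 0;
}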
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 41178c826c4..f88af2c2a56 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1,25 +1,15 @@
1#ifdef CONFIG_CPU_SUP_INTEL 1#ifdef CONFIG_CPU_SUP_INTEL
2 2
3#define MAX_EXTRA_REGS 2
4
5/*
6 * Per register state.
7 */
8struct er_account {
9 int ref; /* reference count */
10 unsigned int extra_reg; /* extra MSR number */
11 u64 extra_config; /* extra MSR config */
12};
13
14/* 3/*
15 * Per core state 4 * Per core/cpu state
16 * This used to coordinate shared registers for HT threads. 5 *
6 * Used to coordinate shared registers between HT threads or
7 * among events on a single PMU.
17 */ 8 */
18struct intel_percore { 9struct intel_shared_regs {
19 raw_spinlock_t lock; /* protect structure */ 10 struct er_account regs[EXTRA_REG_MAX];
20 struct er_account regs[MAX_EXTRA_REGS]; 11 int refcnt; /* per-core: #HT threads */
21 int refcnt; /* number of threads */ 12 unsigned core_id; /* per-core: core id */
22 unsigned core_id;
23}; 13};
24 14
25/* 15/*
@@ -88,16 +78,10 @@ static struct event_constraint intel_nehalem_event_constraints[] __read_mostly =
88 78
89static struct extra_reg intel_nehalem_extra_regs[] __read_mostly = 79static struct extra_reg intel_nehalem_extra_regs[] __read_mostly =
90{ 80{
91 INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff), 81 INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0),
92 EVENT_EXTRA_END 82 EVENT_EXTRA_END
93}; 83};
94 84
95static struct event_constraint intel_nehalem_percore_constraints[] __read_mostly =
96{
97 INTEL_EVENT_CONSTRAINT(0xb7, 0),
98 EVENT_CONSTRAINT_END
99};
100
101static struct event_constraint intel_westmere_event_constraints[] __read_mostly = 85static struct event_constraint intel_westmere_event_constraints[] __read_mostly =
102{ 86{
103 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ 87 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
@@ -116,8 +100,6 @@ static struct event_constraint intel_snb_event_constraints[] __read_mostly =
116 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ 100 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
117 /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ 101 /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
118 INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.PENDING */ 102 INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.PENDING */
119 INTEL_EVENT_CONSTRAINT(0xb7, 0x1), /* OFF_CORE_RESPONSE_0 */
120 INTEL_EVENT_CONSTRAINT(0xbb, 0x8), /* OFF_CORE_RESPONSE_1 */
121 INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */ 103 INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
122 INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */ 104 INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
123 EVENT_CONSTRAINT_END 105 EVENT_CONSTRAINT_END
@@ -125,15 +107,13 @@ static struct event_constraint intel_snb_event_constraints[] __read_mostly =
125 107
126static struct extra_reg intel_westmere_extra_regs[] __read_mostly = 108static struct extra_reg intel_westmere_extra_regs[] __read_mostly =
127{ 109{
128 INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff), 110 INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0),
129 INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0xffff), 111 INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0xffff, RSP_1),
130 EVENT_EXTRA_END 112 EVENT_EXTRA_END
131}; 113};
132 114
133static struct event_constraint intel_westmere_percore_constraints[] __read_mostly = 115static struct event_constraint intel_v1_event_constraints[] __read_mostly =
134{ 116{
135 INTEL_EVENT_CONSTRAINT(0xb7, 0),
136 INTEL_EVENT_CONSTRAINT(0xbb, 0),
137 EVENT_CONSTRAINT_END 117 EVENT_CONSTRAINT_END
138}; 118};
139 119
@@ -145,6 +125,12 @@ static struct event_constraint intel_gen_event_constraints[] __read_mostly =
145 EVENT_CONSTRAINT_END 125 EVENT_CONSTRAINT_END
146}; 126};
147 127
128static struct extra_reg intel_snb_extra_regs[] __read_mostly = {
129 INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0x3fffffffffull, RSP_0),
130 INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0x3fffffffffull, RSP_1),
131 EVENT_EXTRA_END
132};
133
148static u64 intel_pmu_event_map(int hw_event) 134static u64 intel_pmu_event_map(int hw_event)
149{ 135{
150 return intel_perfmon_event_map[hw_event]; 136 return intel_perfmon_event_map[hw_event];
@@ -245,6 +231,21 @@ static __initconst const u64 snb_hw_cache_event_ids
245 [ C(RESULT_MISS) ] = -1, 231 [ C(RESULT_MISS) ] = -1,
246 }, 232 },
247 }, 233 },
234 [ C(NODE) ] = {
235 [ C(OP_READ) ] = {
236 [ C(RESULT_ACCESS) ] = -1,
237 [ C(RESULT_MISS) ] = -1,
238 },
239 [ C(OP_WRITE) ] = {
240 [ C(RESULT_ACCESS) ] = -1,
241 [ C(RESULT_MISS) ] = -1,
242 },
243 [ C(OP_PREFETCH) ] = {
244 [ C(RESULT_ACCESS) ] = -1,
245 [ C(RESULT_MISS) ] = -1,
246 },
247 },
248
248}; 249};
249 250
250static __initconst const u64 westmere_hw_cache_event_ids 251static __initconst const u64 westmere_hw_cache_event_ids
@@ -346,6 +347,20 @@ static __initconst const u64 westmere_hw_cache_event_ids
346 [ C(RESULT_MISS) ] = -1, 347 [ C(RESULT_MISS) ] = -1,
347 }, 348 },
348 }, 349 },
350 [ C(NODE) ] = {
351 [ C(OP_READ) ] = {
352 [ C(RESULT_ACCESS) ] = 0x01b7,
353 [ C(RESULT_MISS) ] = 0x01b7,
354 },
355 [ C(OP_WRITE) ] = {
356 [ C(RESULT_ACCESS) ] = 0x01b7,
357 [ C(RESULT_MISS) ] = 0x01b7,
358 },
359 [ C(OP_PREFETCH) ] = {
360 [ C(RESULT_ACCESS) ] = 0x01b7,
361 [ C(RESULT_MISS) ] = 0x01b7,
362 },
363 },
349}; 364};
350 365
351/* 366/*
@@ -398,7 +413,21 @@ static __initconst const u64 nehalem_hw_cache_extra_regs
398 [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_L3_ACCESS, 413 [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_L3_ACCESS,
399 [ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_L3_MISS, 414 [ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_L3_MISS,
400 }, 415 },
401 } 416 },
417 [ C(NODE) ] = {
418 [ C(OP_READ) ] = {
419 [ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_ALL_DRAM,
420 [ C(RESULT_MISS) ] = NHM_DMND_READ|NHM_REMOTE_DRAM,
421 },
422 [ C(OP_WRITE) ] = {
423 [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_ALL_DRAM,
424 [ C(RESULT_MISS) ] = NHM_DMND_WRITE|NHM_REMOTE_DRAM,
425 },
426 [ C(OP_PREFETCH) ] = {
427 [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_ALL_DRAM,
428 [ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_REMOTE_DRAM,
429 },
430 },
402}; 431};
403 432
404static __initconst const u64 nehalem_hw_cache_event_ids 433static __initconst const u64 nehalem_hw_cache_event_ids
@@ -500,6 +529,20 @@ static __initconst const u64 nehalem_hw_cache_event_ids
500 [ C(RESULT_MISS) ] = -1, 529 [ C(RESULT_MISS) ] = -1,
501 }, 530 },
502 }, 531 },
532 [ C(NODE) ] = {
533 [ C(OP_READ) ] = {
534 [ C(RESULT_ACCESS) ] = 0x01b7,
535 [ C(RESULT_MISS) ] = 0x01b7,
536 },
537 [ C(OP_WRITE) ] = {
538 [ C(RESULT_ACCESS) ] = 0x01b7,
539 [ C(RESULT_MISS) ] = 0x01b7,
540 },
541 [ C(OP_PREFETCH) ] = {
542 [ C(RESULT_ACCESS) ] = 0x01b7,
543 [ C(RESULT_MISS) ] = 0x01b7,
544 },
545 },
503}; 546};
504 547
505static __initconst const u64 core2_hw_cache_event_ids 548static __initconst const u64 core2_hw_cache_event_ids
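On Nehalem/Westmere the NODE entries all resolve to 0x01b7 (OFFCORE_RESPONSE_0), so the actual local-versus-remote selection comes from the matching nehalem_hw_cache_extra_regs entry, which ends up in config1 and from there in MSR_OFFCORE_RSP_0 via the extra-reg machinery above. A stand-alone toy model (not kernel code) of that two-table combination; the bit values are placeholders, the real ones are the NHM_* definitions earlier in this file:

#include <stdint.h>
#include <stdio.h>

/* Placeholder bit values for illustration only; the real definitions
 * (NHM_DMND_READ, NHM_REMOTE_DRAM, ...) live in perf_event_intel.c. */
#define TOY_OFFCORE_RESPONSE	0x01b7ULL	/* event 0xb7, umask 0x01 */
#define TOY_DMND_READ		(1ULL << 0)
#define TOY_REMOTE_DRAM		(1ULL << 13)

struct toy_attr {
	uint64_t config;	/* event select + umask */
	uint64_t config1;	/* extra (offcore response) MSR payload */
};

/*
 * Rough model of how a generic NODE read-miss is resolved on NHM/WSM:
 * hw_cache_event_ids supplies the OFFCORE_RESPONSE event code, while
 * hw_cache_extra_regs supplies the matching MSR_OFFCORE_RSP_0 payload.
 */
static void toy_setup_node_read_miss(struct toy_attr *attr)
{
	attr->config  = TOY_OFFCORE_RESPONSE;
	attr->config1 = TOY_DMND_READ | TOY_REMOTE_DRAM;
}

int main(void)
{
	struct toy_attr attr;

	toy_setup_node_read_miss(&attr);
	printf("config=%#llx config1=%#llx\n",
	       (unsigned long long)attr.config,
	       (unsigned long long)attr.config1);
	return 0;
}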
@@ -1003,7 +1046,7 @@ again:
1003 1046
1004 data.period = event->hw.last_period; 1047 data.period = event->hw.last_period;
1005 1048
1006 if (perf_event_overflow(event, 1, &data, regs)) 1049 if (perf_event_overflow(event, &data, regs))
1007 x86_pmu_stop(event, 0); 1050 x86_pmu_stop(event, 0);
1008 } 1051 }
1009 1052
@@ -1037,65 +1080,121 @@ intel_bts_constraints(struct perf_event *event)
1037 return NULL; 1080 return NULL;
1038} 1081}
1039 1082
1083static bool intel_try_alt_er(struct perf_event *event, int orig_idx)
1084{
1085 if (!(x86_pmu.er_flags & ERF_HAS_RSP_1))
1086 return false;
1087
1088 if (event->hw.extra_reg.idx == EXTRA_REG_RSP_0) {
1089 event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
1090 event->hw.config |= 0x01bb;
1091 event->hw.extra_reg.idx = EXTRA_REG_RSP_1;
1092 event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1;
1093 } else if (event->hw.extra_reg.idx == EXTRA_REG_RSP_1) {
1094 event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
1095 event->hw.config |= 0x01b7;
1096 event->hw.extra_reg.idx = EXTRA_REG_RSP_0;
1097 event->hw.extra_reg.reg = MSR_OFFCORE_RSP_0;
1098 }
1099
1100 if (event->hw.extra_reg.idx == orig_idx)
1101 return false;
1102
1103 return true;
1104}
1105
1106/*
1107 * manage allocation of shared extra msr for certain events
1108 *
1109 * sharing can be:
1110 * per-cpu: to be shared between the various events on a single PMU
1111 * per-core: per-cpu + shared by HT threads
1112 */
1040static struct event_constraint * 1113static struct event_constraint *
1041intel_percore_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) 1114__intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc,
1115 struct perf_event *event)
1042{ 1116{
1043 struct hw_perf_event *hwc = &event->hw; 1117 struct event_constraint *c = &emptyconstraint;
1044 unsigned int e = hwc->config & ARCH_PERFMON_EVENTSEL_EVENT; 1118 struct hw_perf_event_extra *reg = &event->hw.extra_reg;
1045 struct event_constraint *c;
1046 struct intel_percore *pc;
1047 struct er_account *era; 1119 struct er_account *era;
1048 int i; 1120 unsigned long flags;
1049 int free_slot; 1121 int orig_idx = reg->idx;
1050 int found;
1051 1122
1052 if (!x86_pmu.percore_constraints || hwc->extra_alloc) 1123 /* already allocated shared msr */
1053 return NULL; 1124 if (reg->alloc)
1125 return &unconstrained;
1054 1126
1055 for (c = x86_pmu.percore_constraints; c->cmask; c++) { 1127again:
1056 if (e != c->code) 1128 era = &cpuc->shared_regs->regs[reg->idx];
1057 continue; 1129 /*
1130	 * we use raw_spin_lock_irqsave() to avoid lockdep issues when
1131 * passing a fake cpuc
1132 */
1133 raw_spin_lock_irqsave(&era->lock, flags);
1134
1135 if (!atomic_read(&era->ref) || era->config == reg->config) {
1136
1137 /* lock in msr value */
1138 era->config = reg->config;
1139 era->reg = reg->reg;
1140
1141 /* one more user */
1142 atomic_inc(&era->ref);
1143
1144 /* no need to reallocate during incremental event scheduling */
1145 reg->alloc = 1;
1058 1146
1059 /* 1147 /*
1060 * Allocate resource per core. 1148 * All events using extra_reg are unconstrained.
1149 * Avoids calling x86_get_event_constraints()
1150 *
1151 * Must revisit if extra_reg controlling events
1152 * ever have constraints. Worst case we go through
1153 * the regular event constraint table.
1061 */ 1154 */
1062 pc = cpuc->per_core; 1155 c = &unconstrained;
1063 if (!pc) 1156 } else if (intel_try_alt_er(event, orig_idx)) {
1064 break; 1157 raw_spin_unlock(&era->lock);
1065 c = &emptyconstraint; 1158 goto again;
1066 raw_spin_lock(&pc->lock);
1067 free_slot = -1;
1068 found = 0;
1069 for (i = 0; i < MAX_EXTRA_REGS; i++) {
1070 era = &pc->regs[i];
1071 if (era->ref > 0 && hwc->extra_reg == era->extra_reg) {
1072 /* Allow sharing same config */
1073 if (hwc->extra_config == era->extra_config) {
1074 era->ref++;
1075 cpuc->percore_used = 1;
1076 hwc->extra_alloc = 1;
1077 c = NULL;
1078 }
1079 /* else conflict */
1080 found = 1;
1081 break;
1082 } else if (era->ref == 0 && free_slot == -1)
1083 free_slot = i;
1084 }
1085 if (!found && free_slot != -1) {
1086 era = &pc->regs[free_slot];
1087 era->ref = 1;
1088 era->extra_reg = hwc->extra_reg;
1089 era->extra_config = hwc->extra_config;
1090 cpuc->percore_used = 1;
1091 hwc->extra_alloc = 1;
1092 c = NULL;
1093 }
1094 raw_spin_unlock(&pc->lock);
1095 return c;
1096 } 1159 }
1160 raw_spin_unlock_irqrestore(&era->lock, flags);
1097 1161
1098 return NULL; 1162 return c;
1163}
1164
1165static void
1166__intel_shared_reg_put_constraints(struct cpu_hw_events *cpuc,
1167 struct hw_perf_event_extra *reg)
1168{
1169 struct er_account *era;
1170
1171 /*
1172 * only put constraint if extra reg was actually
1173	 * allocated. Also takes care of events which do
1174 * not use an extra shared reg
1175 */
1176 if (!reg->alloc)
1177 return;
1178
1179 era = &cpuc->shared_regs->regs[reg->idx];
1180
1181 /* one fewer user */
1182 atomic_dec(&era->ref);
1183
1184 /* allocate again next time */
1185 reg->alloc = 0;
1186}
1187
1188static struct event_constraint *
1189intel_shared_regs_constraints(struct cpu_hw_events *cpuc,
1190 struct perf_event *event)
1191{
1192 struct event_constraint *c = NULL;
1193
1194 if (event->hw.extra_reg.idx != EXTRA_REG_NONE)
1195 c = __intel_shared_reg_get_constraints(cpuc, event);
1196
1197 return c;
1099} 1198}
1100 1199
1101static struct event_constraint * 1200static struct event_constraint *
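The sharing rule implemented by __intel_shared_reg_get_constraints() and __intel_shared_reg_put_constraints() boils down to: an extra MSR may be claimed if nobody uses it yet, or if the new user wants exactly the value that is already programmed; anything else is a conflict (possibly retried on the alternate RSP MSR). A stand-alone, single-threaded model of that rule, leaving out the spinlock/atomics and the MSR_OFFCORE_RSP_1 fallback:

#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

/* Stand-alone model, not kernel code. */
struct toy_er_account {
	int      ref;      /* number of events using this MSR */
	uint64_t config;   /* value currently programmed */
};

static bool toy_get_shared_reg(struct toy_er_account *era, uint64_t config)
{
	if (era->ref == 0 || era->config == config) {
		era->config = config;   /* lock in the MSR value */
		era->ref++;             /* one more user */
		return true;            /* event is schedulable */
	}
	return false;                   /* conflicting value: reject */
}

static void toy_put_shared_reg(struct toy_er_account *era)
{
	era->ref--;                     /* one fewer user */
}

int main(void)
{
	struct toy_er_account era = { 0, 0 };

	printf("%d\n", toy_get_shared_reg(&era, 0x10001)); /* 1: free slot */
	printf("%d\n", toy_get_shared_reg(&era, 0x10001)); /* 1: same value shared */
	printf("%d\n", toy_get_shared_reg(&era, 0x20002)); /* 0: conflict */
	toy_put_shared_reg(&era);
	toy_put_shared_reg(&era);
	printf("%d\n", toy_get_shared_reg(&era, 0x20002)); /* 1: slot is free again */
	return 0;
}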
@@ -1111,49 +1210,28 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event
1111 if (c) 1210 if (c)
1112 return c; 1211 return c;
1113 1212
1114 c = intel_percore_constraints(cpuc, event); 1213 c = intel_shared_regs_constraints(cpuc, event);
1115 if (c) 1214 if (c)
1116 return c; 1215 return c;
1117 1216
1118 return x86_get_event_constraints(cpuc, event); 1217 return x86_get_event_constraints(cpuc, event);
1119} 1218}
1120 1219
1121static void intel_put_event_constraints(struct cpu_hw_events *cpuc, 1220static void
1221intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc,
1122 struct perf_event *event) 1222 struct perf_event *event)
1123{ 1223{
1124 struct extra_reg *er; 1224 struct hw_perf_event_extra *reg;
1125 struct intel_percore *pc;
1126 struct er_account *era;
1127 struct hw_perf_event *hwc = &event->hw;
1128 int i, allref;
1129 1225
1130 if (!cpuc->percore_used) 1226 reg = &event->hw.extra_reg;
1131 return; 1227 if (reg->idx != EXTRA_REG_NONE)
1132 1228 __intel_shared_reg_put_constraints(cpuc, reg);
1133 for (er = x86_pmu.extra_regs; er->msr; er++) { 1229}
1134 if (er->event != (hwc->config & er->config_mask))
1135 continue;
1136 1230
1137 pc = cpuc->per_core; 1231static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
1138 raw_spin_lock(&pc->lock); 1232 struct perf_event *event)
1139 for (i = 0; i < MAX_EXTRA_REGS; i++) { 1233{
1140 era = &pc->regs[i]; 1234 intel_put_shared_regs_event_constraints(cpuc, event);
1141 if (era->ref > 0 &&
1142 era->extra_config == hwc->extra_config &&
1143 era->extra_reg == er->msr) {
1144 era->ref--;
1145 hwc->extra_alloc = 0;
1146 break;
1147 }
1148 }
1149 allref = 0;
1150 for (i = 0; i < MAX_EXTRA_REGS; i++)
1151 allref += pc->regs[i].ref;
1152 if (allref == 0)
1153 cpuc->percore_used = 0;
1154 raw_spin_unlock(&pc->lock);
1155 break;
1156 }
1157} 1235}
1158 1236
1159static int intel_pmu_hw_config(struct perf_event *event) 1237static int intel_pmu_hw_config(struct perf_event *event)
@@ -1231,20 +1309,36 @@ static __initconst const struct x86_pmu core_pmu = {
1231 .event_constraints = intel_core_event_constraints, 1309 .event_constraints = intel_core_event_constraints,
1232}; 1310};
1233 1311
1312static struct intel_shared_regs *allocate_shared_regs(int cpu)
1313{
1314 struct intel_shared_regs *regs;
1315 int i;
1316
1317 regs = kzalloc_node(sizeof(struct intel_shared_regs),
1318 GFP_KERNEL, cpu_to_node(cpu));
1319 if (regs) {
1320 /*
1321 * initialize the locks to keep lockdep happy
1322 */
1323 for (i = 0; i < EXTRA_REG_MAX; i++)
1324 raw_spin_lock_init(&regs->regs[i].lock);
1325
1326 regs->core_id = -1;
1327 }
1328 return regs;
1329}
1330
1234static int intel_pmu_cpu_prepare(int cpu) 1331static int intel_pmu_cpu_prepare(int cpu)
1235{ 1332{
1236 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); 1333 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
1237 1334
1238 if (!cpu_has_ht_siblings()) 1335 if (!x86_pmu.extra_regs)
1239 return NOTIFY_OK; 1336 return NOTIFY_OK;
1240 1337
1241 cpuc->per_core = kzalloc_node(sizeof(struct intel_percore), 1338 cpuc->shared_regs = allocate_shared_regs(cpu);
1242 GFP_KERNEL, cpu_to_node(cpu)); 1339 if (!cpuc->shared_regs)
1243 if (!cpuc->per_core)
1244 return NOTIFY_BAD; 1340 return NOTIFY_BAD;
1245 1341
1246 raw_spin_lock_init(&cpuc->per_core->lock);
1247 cpuc->per_core->core_id = -1;
1248 return NOTIFY_OK; 1342 return NOTIFY_OK;
1249} 1343}
1250 1344
@@ -1260,32 +1354,34 @@ static void intel_pmu_cpu_starting(int cpu)
1260 */ 1354 */
1261 intel_pmu_lbr_reset(); 1355 intel_pmu_lbr_reset();
1262 1356
1263 if (!cpu_has_ht_siblings()) 1357 if (!cpuc->shared_regs || (x86_pmu.er_flags & ERF_NO_HT_SHARING))
1264 return; 1358 return;
1265 1359
1266 for_each_cpu(i, topology_thread_cpumask(cpu)) { 1360 for_each_cpu(i, topology_thread_cpumask(cpu)) {
1267 struct intel_percore *pc = per_cpu(cpu_hw_events, i).per_core; 1361 struct intel_shared_regs *pc;
1268 1362
1363 pc = per_cpu(cpu_hw_events, i).shared_regs;
1269 if (pc && pc->core_id == core_id) { 1364 if (pc && pc->core_id == core_id) {
1270 kfree(cpuc->per_core); 1365 kfree(cpuc->shared_regs);
1271 cpuc->per_core = pc; 1366 cpuc->shared_regs = pc;
1272 break; 1367 break;
1273 } 1368 }
1274 } 1369 }
1275 1370
1276 cpuc->per_core->core_id = core_id; 1371 cpuc->shared_regs->core_id = core_id;
1277 cpuc->per_core->refcnt++; 1372 cpuc->shared_regs->refcnt++;
1278} 1373}
1279 1374
1280static void intel_pmu_cpu_dying(int cpu) 1375static void intel_pmu_cpu_dying(int cpu)
1281{ 1376{
1282 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); 1377 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
1283 struct intel_percore *pc = cpuc->per_core; 1378 struct intel_shared_regs *pc;
1284 1379
1380 pc = cpuc->shared_regs;
1285 if (pc) { 1381 if (pc) {
1286 if (pc->core_id == -1 || --pc->refcnt == 0) 1382 if (pc->core_id == -1 || --pc->refcnt == 0)
1287 kfree(pc); 1383 kfree(pc);
1288 cpuc->per_core = NULL; 1384 cpuc->shared_regs = NULL;
1289 } 1385 }
1290 1386
1291 fini_debug_store_on_cpu(cpu); 1387 fini_debug_store_on_cpu(cpu);
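Together, intel_pmu_cpu_starting() and intel_pmu_cpu_dying() make HT siblings share one intel_shared_regs instance: the second sibling frees its own allocation and adopts the first one's, and the structure is only freed when the last user goes away. A stand-alone model (not kernel code) of that adopt-or-own pattern for a two-thread core, with the core_id == -1 corner case left out:

#include <stdlib.h>
#include <stdio.h>

struct toy_shared {
	int core_id;
	int refcnt;
};

static struct toy_shared *cpu_shared[2];	/* per-"cpu" pointer */

static void toy_cpu_starting(int cpu, int core_id)
{
	int sibling = cpu ^ 1;			/* toy topology: cpus 0 and 1 share a core */

	cpu_shared[cpu] = calloc(1, sizeof(*cpu_shared[cpu]));
	cpu_shared[cpu]->core_id = -1;

	if (cpu_shared[sibling] && cpu_shared[sibling]->core_id == core_id) {
		free(cpu_shared[cpu]);			/* drop our copy ... */
		cpu_shared[cpu] = cpu_shared[sibling];	/* ... and adopt the sibling's */
	}
	cpu_shared[cpu]->core_id = core_id;
	cpu_shared[cpu]->refcnt++;
}

static void toy_cpu_dying(int cpu)
{
	struct toy_shared *pc = cpu_shared[cpu];

	if (pc && --pc->refcnt == 0)
		free(pc);				/* last user frees */
	cpu_shared[cpu] = NULL;
}

int main(void)
{
	toy_cpu_starting(0, 7);
	toy_cpu_starting(1, 7);
	printf("shared: %d, refcnt: %d\n",
	       cpu_shared[0] == cpu_shared[1], cpu_shared[0]->refcnt);
	toy_cpu_dying(0);
	toy_cpu_dying(1);
	return 0;
}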
@@ -1436,7 +1532,6 @@ static __init int intel_pmu_init(void)
1436 1532
1437 x86_pmu.event_constraints = intel_nehalem_event_constraints; 1533 x86_pmu.event_constraints = intel_nehalem_event_constraints;
1438 x86_pmu.pebs_constraints = intel_nehalem_pebs_event_constraints; 1534 x86_pmu.pebs_constraints = intel_nehalem_pebs_event_constraints;
1439 x86_pmu.percore_constraints = intel_nehalem_percore_constraints;
1440 x86_pmu.enable_all = intel_pmu_nhm_enable_all; 1535 x86_pmu.enable_all = intel_pmu_nhm_enable_all;
1441 x86_pmu.extra_regs = intel_nehalem_extra_regs; 1536 x86_pmu.extra_regs = intel_nehalem_extra_regs;
1442 1537
@@ -1481,10 +1576,10 @@ static __init int intel_pmu_init(void)
1481 intel_pmu_lbr_init_nhm(); 1576 intel_pmu_lbr_init_nhm();
1482 1577
1483 x86_pmu.event_constraints = intel_westmere_event_constraints; 1578 x86_pmu.event_constraints = intel_westmere_event_constraints;
1484 x86_pmu.percore_constraints = intel_westmere_percore_constraints;
1485 x86_pmu.enable_all = intel_pmu_nhm_enable_all; 1579 x86_pmu.enable_all = intel_pmu_nhm_enable_all;
1486 x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints; 1580 x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints;
1487 x86_pmu.extra_regs = intel_westmere_extra_regs; 1581 x86_pmu.extra_regs = intel_westmere_extra_regs;
1582 x86_pmu.er_flags |= ERF_HAS_RSP_1;
1488 1583
1489 /* UOPS_ISSUED.STALLED_CYCLES */ 1584 /* UOPS_ISSUED.STALLED_CYCLES */
1490 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e; 1585 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e;
@@ -1495,6 +1590,7 @@ static __init int intel_pmu_init(void)
1495 break; 1590 break;
1496 1591
1497 case 42: /* SandyBridge */ 1592 case 42: /* SandyBridge */
1593	case 45: /* SandyBridge, "Romley-EP" */
1498 memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, 1594 memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
1499 sizeof(hw_cache_event_ids)); 1595 sizeof(hw_cache_event_ids));
1500 1596
@@ -1502,6 +1598,10 @@ static __init int intel_pmu_init(void)
1502 1598
1503 x86_pmu.event_constraints = intel_snb_event_constraints; 1599 x86_pmu.event_constraints = intel_snb_event_constraints;
1504 x86_pmu.pebs_constraints = intel_snb_pebs_events; 1600 x86_pmu.pebs_constraints = intel_snb_pebs_events;
1601 x86_pmu.extra_regs = intel_snb_extra_regs;
1602 /* all extra regs are per-cpu when HT is on */
1603 x86_pmu.er_flags |= ERF_HAS_RSP_1;
1604 x86_pmu.er_flags |= ERF_NO_HT_SHARING;
1505 1605
1506 /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */ 1606 /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */
1507 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e; 1607 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e;
@@ -1512,11 +1612,19 @@ static __init int intel_pmu_init(void)
1512 break; 1612 break;
1513 1613
1514 default: 1614 default:
1515 /* 1615 switch (x86_pmu.version) {
1516 * default constraints for v2 and up 1616 case 1:
1517 */ 1617 x86_pmu.event_constraints = intel_v1_event_constraints;
1518 x86_pmu.event_constraints = intel_gen_event_constraints; 1618 pr_cont("generic architected perfmon v1, ");
1519 pr_cont("generic architected perfmon, "); 1619 break;
1620 default:
1621 /*
1622 * default constraints for v2 and up
1623 */
1624 x86_pmu.event_constraints = intel_gen_event_constraints;
1625 pr_cont("generic architected perfmon, ");
1626 break;
1627 }
1520 } 1628 }
1521 return 0; 1629 return 0;
1522} 1630}
@@ -1528,4 +1636,8 @@ static int intel_pmu_init(void)
1528 return 0; 1636 return 0;
1529} 1637}
1530 1638
1639static struct intel_shared_regs *allocate_shared_regs(int cpu)
1640{
1641 return NULL;
1642}
1531#endif /* CONFIG_CPU_SUP_INTEL */ 1643#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index bab491b8ee2..3213c52db76 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -340,7 +340,7 @@ static int intel_pmu_drain_bts_buffer(void)
340 */ 340 */
341 perf_prepare_sample(&header, &data, event, &regs); 341 perf_prepare_sample(&header, &data, event, &regs);
342 342
343 if (perf_output_begin(&handle, event, header.size * (top - at), 1, 1)) 343 if (perf_output_begin(&handle, event, header.size * (top - at)))
344 return 1; 344 return 1;
345 345
346 for (; at < top; at++) { 346 for (; at < top; at++) {
@@ -508,6 +508,7 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
508 unsigned long from = cpuc->lbr_entries[0].from; 508 unsigned long from = cpuc->lbr_entries[0].from;
509 unsigned long old_to, to = cpuc->lbr_entries[0].to; 509 unsigned long old_to, to = cpuc->lbr_entries[0].to;
510 unsigned long ip = regs->ip; 510 unsigned long ip = regs->ip;
511 int is_64bit = 0;
511 512
512 /* 513 /*
513 * We don't need to fixup if the PEBS assist is fault like 514 * We don't need to fixup if the PEBS assist is fault like
@@ -559,7 +560,10 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
559 } else 560 } else
560 kaddr = (void *)to; 561 kaddr = (void *)to;
561 562
562 kernel_insn_init(&insn, kaddr); 563#ifdef CONFIG_X86_64
564 is_64bit = kernel_ip(to) || !test_thread_flag(TIF_IA32);
565#endif
566 insn_init(&insn, kaddr, is_64bit);
563 insn_get_length(&insn); 567 insn_get_length(&insn);
564 to += insn.length; 568 to += insn.length;
565 } while (to < ip); 569 } while (to < ip);
@@ -616,7 +620,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
616 else 620 else
617 regs.flags &= ~PERF_EFLAGS_EXACT; 621 regs.flags &= ~PERF_EFLAGS_EXACT;
618 622
619 if (perf_event_overflow(event, 1, &data, &regs)) 623 if (perf_event_overflow(event, &data, &regs))
620 x86_pmu_stop(event, 0); 624 x86_pmu_stop(event, 0);
621} 625}
622 626
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index ead584fb6a7..7809d2bcb20 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -554,13 +554,102 @@ static __initconst const u64 p4_hw_cache_event_ids
554 [ C(RESULT_MISS) ] = -1, 554 [ C(RESULT_MISS) ] = -1,
555 }, 555 },
556 }, 556 },
557 [ C(NODE) ] = {
558 [ C(OP_READ) ] = {
559 [ C(RESULT_ACCESS) ] = -1,
560 [ C(RESULT_MISS) ] = -1,
561 },
562 [ C(OP_WRITE) ] = {
563 [ C(RESULT_ACCESS) ] = -1,
564 [ C(RESULT_MISS) ] = -1,
565 },
566 [ C(OP_PREFETCH) ] = {
567 [ C(RESULT_ACCESS) ] = -1,
568 [ C(RESULT_MISS) ] = -1,
569 },
570 },
557}; 571};
558 572
573/*
574 * Because of Netburst being quite restricted in how many
575 * identical events may run simultaneously, we introduce event aliases,
576 * ie the different events which have the same functionality but
577 * utilize non-intersected resources (ESCR/CCCR/counter registers).
578 *
579 * This allow us to relax restrictions a bit and run two or more
580 * identical events together.
581 *
582 * Never set any custom internal bits such as P4_CONFIG_HT,
583 * P4_CONFIG_ALIASABLE or bits for P4_PEBS_METRIC, they are
584 * either up to date automatically or not applicable at all.
585 */
586struct p4_event_alias {
587 u64 original;
588 u64 alternative;
589} p4_event_aliases[] = {
590 {
591 /*
592 * Non-halted cycles can be substituted with non-sleeping cycles (see
593 * Intel SDM Vol3b for details). We need this alias to be able
594 * to run nmi-watchdog and 'perf top' (or any other user space tool
595 * which is interested in running PERF_COUNT_HW_CPU_CYCLES)
596 * simultaneously.
597 */
598 .original =
599 p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS) |
600 P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)),
601 .alternative =
602 p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_EXECUTION_EVENT) |
603 P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0)|
604 P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1)|
605 P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2)|
606 P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3)|
607 P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0) |
608 P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1) |
609 P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2) |
610 P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3))|
611 p4_config_pack_cccr(P4_CCCR_THRESHOLD(15) | P4_CCCR_COMPLEMENT |
612 P4_CCCR_COMPARE),
613 },
614};
615
616static u64 p4_get_alias_event(u64 config)
617{
618 u64 config_match;
619 int i;
620
621 /*
622	 * Only events carrying the special mark are allowed, so
623	 * we can be sure the config didn't come in as a malformed
624	 * RAW event.
625 */
626 if (!(config & P4_CONFIG_ALIASABLE))
627 return 0;
628
629 config_match = config & P4_CONFIG_EVENT_ALIAS_MASK;
630
631 for (i = 0; i < ARRAY_SIZE(p4_event_aliases); i++) {
632 if (config_match == p4_event_aliases[i].original) {
633 config_match = p4_event_aliases[i].alternative;
634 break;
635 } else if (config_match == p4_event_aliases[i].alternative) {
636 config_match = p4_event_aliases[i].original;
637 break;
638 }
639 }
640
641 if (i >= ARRAY_SIZE(p4_event_aliases))
642 return 0;
643
644 return config_match | (config & P4_CONFIG_EVENT_ALIAS_IMMUTABLE_BITS);
645}
646
559static u64 p4_general_events[PERF_COUNT_HW_MAX] = { 647static u64 p4_general_events[PERF_COUNT_HW_MAX] = {
560 /* non-halted CPU clocks */ 648 /* non-halted CPU clocks */
561 [PERF_COUNT_HW_CPU_CYCLES] = 649 [PERF_COUNT_HW_CPU_CYCLES] =
562 p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS) | 650 p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS) |
563 P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)), 651 P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)) |
652 P4_CONFIG_ALIASABLE,
564 653
565 /* 654 /*
566 * retired instructions 655 * retired instructions
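p4_get_alias_event() only ever swaps between the two halves of a p4_event_alias pair, comparing on the maskable bits and carrying the immutable bits (HT and similar) over unchanged. A stand-alone model (not kernel code) of that lookup with placeholder masks and table values; the real masks are P4_CONFIG_EVENT_ALIAS_MASK and P4_CONFIG_EVENT_ALIAS_IMMUTABLE_BITS:

#include <stdint.h>
#include <stdio.h>

/* Placeholder layout for illustration only. */
#define TOY_ALIASABLE		(1ULL << 40)	/* "this event has an alias" marker */
#define TOY_IMMUTABLE_BITS	(0xffULL << 32)	/* e.g. HT bits, carried over unchanged */
#define TOY_EVENT_MASK		0xffffffffULL	/* bits compared against the alias table */

struct toy_alias {
	uint64_t original;
	uint64_t alternative;
};

static const struct toy_alias toy_aliases[] = {
	{ .original = 0x1111, .alternative = 0x2222 },
};

static uint64_t toy_get_alias_event(uint64_t config)
{
	uint64_t match;
	unsigned int i;

	if (!(config & TOY_ALIASABLE))
		return 0;			/* no special mark, no alias */

	match = config & TOY_EVENT_MASK;

	for (i = 0; i < sizeof(toy_aliases) / sizeof(toy_aliases[0]); i++) {
		if (match == toy_aliases[i].original)
			return toy_aliases[i].alternative | (config & TOY_IMMUTABLE_BITS);
		if (match == toy_aliases[i].alternative)
			return toy_aliases[i].original | (config & TOY_IMMUTABLE_BITS);
	}
	return 0;
}

int main(void)
{
	uint64_t cfg = 0x1111 | TOY_ALIASABLE | (0x3ULL << 32);

	printf("%#llx -> %#llx\n", (unsigned long long)cfg,
	       (unsigned long long)toy_get_alias_event(cfg));
	return 0;
}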
@@ -945,7 +1034,7 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
945 1034
946 if (!x86_perf_event_set_period(event)) 1035 if (!x86_perf_event_set_period(event))
947 continue; 1036 continue;
948 if (perf_event_overflow(event, 1, &data, regs)) 1037 if (perf_event_overflow(event, &data, regs))
949 x86_pmu_stop(event, 0); 1038 x86_pmu_stop(event, 0);
950 } 1039 }
951 1040
@@ -1120,6 +1209,8 @@ static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign
1120 struct p4_event_bind *bind; 1209 struct p4_event_bind *bind;
1121 unsigned int i, thread, num; 1210 unsigned int i, thread, num;
1122 int cntr_idx, escr_idx; 1211 int cntr_idx, escr_idx;
1212 u64 config_alias;
1213 int pass;
1123 1214
1124 bitmap_zero(used_mask, X86_PMC_IDX_MAX); 1215 bitmap_zero(used_mask, X86_PMC_IDX_MAX);
1125 bitmap_zero(escr_mask, P4_ESCR_MSR_TABLE_SIZE); 1216 bitmap_zero(escr_mask, P4_ESCR_MSR_TABLE_SIZE);
@@ -1128,6 +1219,17 @@ static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign
1128 1219
1129 hwc = &cpuc->event_list[i]->hw; 1220 hwc = &cpuc->event_list[i]->hw;
1130 thread = p4_ht_thread(cpu); 1221 thread = p4_ht_thread(cpu);
1222 pass = 0;
1223
1224again:
1225 /*
1226	 * It's possible to loop forever, bouncing between the
1227	 * original and the alternative event, if both are
1228	 * scheduled already.
1229 */
1230 if (pass > 2)
1231 goto done;
1232
1131 bind = p4_config_get_bind(hwc->config); 1233 bind = p4_config_get_bind(hwc->config);
1132 escr_idx = p4_get_escr_idx(bind->escr_msr[thread]); 1234 escr_idx = p4_get_escr_idx(bind->escr_msr[thread]);
1133 if (unlikely(escr_idx == -1)) 1235 if (unlikely(escr_idx == -1))
@@ -1141,8 +1243,17 @@ static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign
1141 } 1243 }
1142 1244
1143 cntr_idx = p4_next_cntr(thread, used_mask, bind); 1245 cntr_idx = p4_next_cntr(thread, used_mask, bind);
1144 if (cntr_idx == -1 || test_bit(escr_idx, escr_mask)) 1246 if (cntr_idx == -1 || test_bit(escr_idx, escr_mask)) {
1145 goto done; 1247 /*
1248 * Check whether an event alias is still available.
1249 */
1250 config_alias = p4_get_alias_event(hwc->config);
1251 if (!config_alias)
1252 goto done;
1253 hwc->config = config_alias;
1254 pass++;
1255 goto again;
1256 }
1146 1257
1147 p4_pmu_swap_config_ts(hwc, cpu); 1258 p4_pmu_swap_config_ts(hwc, cpu);
1148 if (assign) 1259 if (assign)