Diffstat (limited to 'arch/x86/kernel/cpu')
-rw-r--r--  arch/x86/kernel/cpu/bugs.c                 |    4
-rw-r--r--  arch/x86/kernel/cpu/common.c               |    2
-rw-r--r--  arch/x86/kernel/cpu/hypervisor.c           |    4
-rw-r--r--  arch/x86/kernel/cpu/intel.c                |   18
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-severity.c  |  152
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.c           |  288
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_amd.c       |   10
-rw-r--r--  arch/x86/kernel/cpu/mtrr/main.c            |  184
-rw-r--r--  arch/x86/kernel/cpu/perf_event.c           |  171
-rw-r--r--  arch/x86/kernel/cpu/perf_event_amd.c       |   14
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel.c     |  386
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel_ds.c  |   10
-rw-r--r--  arch/x86/kernel/cpu/perf_event_p4.c        |  119
13 files changed, 804 insertions(+), 558 deletions(-)
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 525514cf33c..46674fbb62b 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -62,6 +62,8 @@ static void __init check_fpu(void)
62 return; 62 return;
63 } 63 }
64 64
65 kernel_fpu_begin();
66
65 /* 67 /*
66 * trap_init() enabled FXSR and company _before_ testing for FP 68 * trap_init() enabled FXSR and company _before_ testing for FP
67 * problems here. 69 * problems here.
@@ -80,6 +82,8 @@ static void __init check_fpu(void)
80 : "=m" (*&fdiv_bug) 82 : "=m" (*&fdiv_bug)
81 : "m" (*&x), "m" (*&y)); 83 : "m" (*&x), "m" (*&y));
82 84
85 kernel_fpu_end();
86
83 boot_cpu_data.fdiv_bug = fdiv_bug; 87 boot_cpu_data.fdiv_bug = fdiv_bug;
84 if (boot_cpu_data.fdiv_bug) 88 if (boot_cpu_data.fdiv_bug)
85 printk(KERN_WARNING "Hmm, FPU with FDIV bug.\n"); 89 printk(KERN_WARNING "Hmm, FPU with FDIV bug.\n");
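
The bugs.c change itself is small: the FDIV-bug probe now runs inside a kernel_fpu_begin()/kernel_fpu_end() pair, since the probe executes x87 instructions directly and in-kernel FPU use needs explicit bracketing. A minimal sketch of that bracketing pattern, with illustrative names that are not part of the patch:

#include <linux/init.h>
#include <asm/i387.h>		/* kernel_fpu_begin()/kernel_fpu_end() in this kernel era */

/*
 * Minimal sketch of the pattern the hunk above adds (illustrative only):
 * any in-kernel x87/SSE use sits between kernel_fpu_begin() and
 * kernel_fpu_end(), so the current task's FPU state is saved/restored
 * and preemption stays disabled while the kernel owns the FPU.
 */
static void __init fpu_probe_sketch(void)
{
	kernel_fpu_begin();	/* save FPU context, disable preemption */

	/* issue x87/SSE instructions here, e.g. via inline asm */

	kernel_fpu_end();	/* restore FPU context, re-enable preemption */
}
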
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 22a073d7fbf..62184390a60 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -21,7 +21,7 @@
21#include <linux/topology.h> 21#include <linux/topology.h>
22#include <linux/cpumask.h> 22#include <linux/cpumask.h>
23#include <asm/pgtable.h> 23#include <asm/pgtable.h>
24#include <asm/atomic.h> 24#include <linux/atomic.h>
25#include <asm/proto.h> 25#include <asm/proto.h>
26#include <asm/setup.h> 26#include <asm/setup.h>
27#include <asm/apic.h> 27#include <asm/apic.h>
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index 8095f8611f8..755f64fb074 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -32,11 +32,11 @@
32 */ 32 */
33static const __initconst struct hypervisor_x86 * const hypervisors[] = 33static const __initconst struct hypervisor_x86 * const hypervisors[] =
34{ 34{
35 &x86_hyper_vmware,
36 &x86_hyper_ms_hyperv,
37#ifdef CONFIG_XEN_PVHVM 35#ifdef CONFIG_XEN_PVHVM
38 &x86_hyper_xen_hvm, 36 &x86_hyper_xen_hvm,
39#endif 37#endif
38 &x86_hyper_vmware,
39 &x86_hyper_ms_hyperv,
40}; 40};
41 41
42const struct hypervisor_x86 *x86_hyper; 42const struct hypervisor_x86 *x86_hyper;
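
The hypervisor.c hunk only reorders the hypervisors[] table so that, with CONFIG_XEN_PVHVM enabled, the Xen HVM entry is probed before the generic VMware and Hyper-V entries. The order matters because detection walks the array and stops at the first ->detect() hit; a rough, illustrative sketch of that loop, relying on the hypervisors[] and x86_hyper symbols shown above (the real loop lives further down in the same file):

static void detect_hypervisor_vendor_sketch(void)
{
	const struct hypervisor_x86 *h, * const *p;

	/* the first entry whose ->detect() reports a hit wins */
	for (p = hypervisors; p < hypervisors + ARRAY_SIZE(hypervisors); p++) {
		h = *p;
		if (h->detect()) {
			x86_hyper = h;
			printk(KERN_INFO "Hypervisor detected: %s\n", h->name);
			break;
		}
	}
}
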
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 1edf5ba4fb2..ed6086eedf1 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -456,6 +456,24 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
456 456
457 if (cpu_has(c, X86_FEATURE_VMX)) 457 if (cpu_has(c, X86_FEATURE_VMX))
458 detect_vmx_virtcap(c); 458 detect_vmx_virtcap(c);
459
460 /*
461 * Initialize MSR_IA32_ENERGY_PERF_BIAS if BIOS did not.
462 * x86_energy_perf_policy(8) is available to change it at run-time
463 */
464 if (cpu_has(c, X86_FEATURE_EPB)) {
465 u64 epb;
466
467 rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
468 if ((epb & 0xF) == ENERGY_PERF_BIAS_PERFORMANCE) {
469 printk_once(KERN_WARNING "ENERGY_PERF_BIAS:"
470 " Set to 'normal', was 'performance'\n"
471 "ENERGY_PERF_BIAS: View and update with"
472 " x86_energy_perf_policy(8)\n");
473 epb = (epb & ~0xF) | ENERGY_PERF_BIAS_NORMAL;
474 wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
475 }
476 }
459} 477}
460 478
461#ifdef CONFIG_X86_32 479#ifdef CONFIG_X86_32
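
The new intel.c block sanitizes MSR_IA32_ENERGY_PERF_BIAS when the CPU advertises X86_FEATURE_EPB: the low four bits hold the energy/performance hint (0 is the strongest performance bias, 15 the strongest powersave bias), and firmware that never initializes the MSR leaves it at 0, so the kernel nudges it to the "normal" value once and points the admin at x86_energy_perf_policy(8). As a hedged illustration (not part of the patch), changing the hint later boils down to a masked read-modify-write of the same MSR, using the same rdmsrl()/wrmsrl() helpers as the hunk above:

/*
 * Illustrative sketch only: adjust the EPB hint at run time.
 * hint is 0..15; 0 = maximum performance bias, 15 = maximum powersave.
 */
static void epb_set_hint_sketch(u8 hint)
{
	u64 epb;

	rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
	epb = (epb & ~0xFULL) | (hint & 0xF);
	wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
}
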
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 1e8d66c1336..7395d5f4272 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -43,61 +43,105 @@ static struct severity {
43 unsigned char covered; 43 unsigned char covered;
44 char *msg; 44 char *msg;
45} severities[] = { 45} severities[] = {
46#define KERNEL .context = IN_KERNEL 46#define MCESEV(s, m, c...) { .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c }
47#define USER .context = IN_USER 47#define KERNEL .context = IN_KERNEL
48#define SER .ser = SER_REQUIRED 48#define USER .context = IN_USER
49#define NOSER .ser = NO_SER 49#define SER .ser = SER_REQUIRED
50#define SEV(s) .sev = MCE_ ## s ## _SEVERITY 50#define NOSER .ser = NO_SER
51#define BITCLR(x, s, m, r...) { .mask = x, .result = 0, SEV(s), .msg = m, ## r } 51#define BITCLR(x) .mask = x, .result = 0
52#define BITSET(x, s, m, r...) { .mask = x, .result = x, SEV(s), .msg = m, ## r } 52#define BITSET(x) .mask = x, .result = x
53#define MCGMASK(x, res, s, m, r...) \ 53#define MCGMASK(x, y) .mcgmask = x, .mcgres = y
54 { .mcgmask = x, .mcgres = res, SEV(s), .msg = m, ## r } 54#define MASK(x, y) .mask = x, .result = y
55#define MASK(x, y, s, m, r...) \
56 { .mask = x, .result = y, SEV(s), .msg = m, ## r }
57#define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S) 55#define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S)
58#define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR) 56#define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR)
59#define MCACOD 0xffff 57#define MCACOD 0xffff
60 58
61 BITCLR(MCI_STATUS_VAL, NO, "Invalid"), 59 MCESEV(
62 BITCLR(MCI_STATUS_EN, NO, "Not enabled"), 60 NO, "Invalid",
63 BITSET(MCI_STATUS_PCC, PANIC, "Processor context corrupt"), 61 BITCLR(MCI_STATUS_VAL)
62 ),
63 MCESEV(
64 NO, "Not enabled",
65 BITCLR(MCI_STATUS_EN)
66 ),
67 MCESEV(
68 PANIC, "Processor context corrupt",
69 BITSET(MCI_STATUS_PCC)
70 ),
64 /* When MCIP is not set something is very confused */ 71 /* When MCIP is not set something is very confused */
65 MCGMASK(MCG_STATUS_MCIP, 0, PANIC, "MCIP not set in MCA handler"), 72 MCESEV(
73 PANIC, "MCIP not set in MCA handler",
74 MCGMASK(MCG_STATUS_MCIP, 0)
75 ),
66 /* Neither return not error IP -- no chance to recover -> PANIC */ 76 /* Neither return not error IP -- no chance to recover -> PANIC */
67 MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0, PANIC, 77 MCESEV(
68 "Neither restart nor error IP"), 78 PANIC, "Neither restart nor error IP",
69 MCGMASK(MCG_STATUS_RIPV, 0, PANIC, "In kernel and no restart IP", 79 MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0)
70 KERNEL), 80 ),
71 BITCLR(MCI_STATUS_UC, KEEP, "Corrected error", NOSER), 81 MCESEV(
72 MASK(MCI_STATUS_OVER|MCI_STATUS_UC|MCI_STATUS_EN, MCI_STATUS_UC, SOME, 82 PANIC, "In kernel and no restart IP",
73 "Spurious not enabled", SER), 83 KERNEL, MCGMASK(MCG_STATUS_RIPV, 0)
84 ),
85 MCESEV(
86 KEEP, "Corrected error",
87 NOSER, BITCLR(MCI_STATUS_UC)
88 ),
74 89
75 /* ignore OVER for UCNA */ 90 /* ignore OVER for UCNA */
76 MASK(MCI_UC_SAR, MCI_STATUS_UC, KEEP, 91 MCESEV(
77 "Uncorrected no action required", SER), 92 KEEP, "Uncorrected no action required",
78 MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR, PANIC, 93 SER, MASK(MCI_UC_SAR, MCI_STATUS_UC)
79 "Illegal combination (UCNA with AR=1)", SER), 94 ),
80 MASK(MCI_STATUS_S, 0, KEEP, "Non signalled machine check", SER), 95 MCESEV(
96 PANIC, "Illegal combination (UCNA with AR=1)",
97 SER,
98 MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR)
99 ),
100 MCESEV(
101 KEEP, "Non signalled machine check",
102 SER, BITCLR(MCI_STATUS_S)
103 ),
81 104
82 /* AR add known MCACODs here */ 105 /* AR add known MCACODs here */
83 MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_SAR, PANIC, 106 MCESEV(
84 "Action required with lost events", SER), 107 PANIC, "Action required with lost events",
85 MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_SAR, PANIC, 108 SER, BITSET(MCI_STATUS_OVER|MCI_UC_SAR)
86 "Action required; unknown MCACOD", SER), 109 ),
110 MCESEV(
111 PANIC, "Action required: unknown MCACOD",
112 SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR)
113 ),
87 114
88 /* known AO MCACODs: */ 115 /* known AO MCACODs: */
89 MASK(MCI_UC_SAR|MCI_STATUS_OVER|0xfff0, MCI_UC_S|0xc0, AO, 116 MCESEV(
90 "Action optional: memory scrubbing error", SER), 117 AO, "Action optional: memory scrubbing error",
91 MASK(MCI_UC_SAR|MCI_STATUS_OVER|MCACOD, MCI_UC_S|0x17a, AO, 118 SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|0xfff0, MCI_UC_S|0x00c0)
92 "Action optional: last level cache writeback error", SER), 119 ),
93 120 MCESEV(
94 MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S, SOME, 121 AO, "Action optional: last level cache writeback error",
95 "Action optional unknown MCACOD", SER), 122 SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_S|0x017a)
96 MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S|MCI_STATUS_OVER, SOME, 123 ),
97 "Action optional with lost events", SER), 124 MCESEV(
98 BITSET(MCI_STATUS_UC|MCI_STATUS_OVER, PANIC, "Overflowed uncorrected"), 125 SOME, "Action optional: unknown MCACOD",
99 BITSET(MCI_STATUS_UC, UC, "Uncorrected"), 126 SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S)
100 BITSET(0, SOME, "No match") /* always matches. keep at end */ 127 ),
128 MCESEV(
129 SOME, "Action optional with lost events",
130 SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_S)
131 ),
132
133 MCESEV(
134 PANIC, "Overflowed uncorrected",
135 BITSET(MCI_STATUS_OVER|MCI_STATUS_UC)
136 ),
137 MCESEV(
138 UC, "Uncorrected",
139 BITSET(MCI_STATUS_UC)
140 ),
141 MCESEV(
142 SOME, "No match",
143 BITSET(0)
144 ) /* always matches. keep at end */
101}; 145};
102 146
103/* 147/*
@@ -112,15 +156,15 @@ static int error_context(struct mce *m)
112 return IN_KERNEL; 156 return IN_KERNEL;
113} 157}
114 158
115int mce_severity(struct mce *a, int tolerant, char **msg) 159int mce_severity(struct mce *m, int tolerant, char **msg)
116{ 160{
117 enum context ctx = error_context(a); 161 enum context ctx = error_context(m);
118 struct severity *s; 162 struct severity *s;
119 163
120 for (s = severities;; s++) { 164 for (s = severities;; s++) {
121 if ((a->status & s->mask) != s->result) 165 if ((m->status & s->mask) != s->result)
122 continue; 166 continue;
123 if ((a->mcgstatus & s->mcgmask) != s->mcgres) 167 if ((m->mcgstatus & s->mcgmask) != s->mcgres)
124 continue; 168 continue;
125 if (s->ser == SER_REQUIRED && !mce_ser) 169 if (s->ser == SER_REQUIRED && !mce_ser)
126 continue; 170 continue;
@@ -197,15 +241,15 @@ static const struct file_operations severities_coverage_fops = {
197 241
198static int __init severities_debugfs_init(void) 242static int __init severities_debugfs_init(void)
199{ 243{
200 struct dentry *dmce = NULL, *fseverities_coverage = NULL; 244 struct dentry *dmce, *fsev;
201 245
202 dmce = mce_get_debugfs_dir(); 246 dmce = mce_get_debugfs_dir();
203 if (dmce == NULL) 247 if (!dmce)
204 goto err_out; 248 goto err_out;
205 fseverities_coverage = debugfs_create_file("severities-coverage", 249
206 0444, dmce, NULL, 250 fsev = debugfs_create_file("severities-coverage", 0444, dmce, NULL,
207 &severities_coverage_fops); 251 &severities_coverage_fops);
208 if (fseverities_coverage == NULL) 252 if (!fsev)
209 goto err_out; 253 goto err_out;
210 254
211 return 0; 255 return 0;
@@ -214,4 +258,4 @@ err_out:
214 return -ENOMEM; 258 return -ENOMEM;
215} 259}
216late_initcall(severities_debugfs_init); 260late_initcall(severities_debugfs_init);
217#endif 261#endif /* CONFIG_DEBUG_FS */
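
The mce-severity.c rework is mechanical: instead of several macros that each emit a complete table entry, a single MCESEV() wrapper takes the severity, the message and any number of matcher fragments, so each rule reads as one block. As a reading aid, this is roughly what one entry from the table above expands to under the new #defines (reconstructed from the macros shown in the hunk, not extra code in the patch):

	/* MCESEV(
	 *	PANIC, "In kernel and no restart IP",
	 *	KERNEL, MCGMASK(MCG_STATUS_RIPV, 0)
	 * ) expands to approximately: */
	{
		.sev     = MCE_PANIC_SEVERITY,
		.msg     = "In kernel and no restart IP",
		.context = IN_KERNEL,
		.mcgmask = MCG_STATUS_RIPV,
		.mcgres  = 0,
	},
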
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index ff1ae9b6464..08363b04212 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -10,7 +10,6 @@
10#include <linux/thread_info.h> 10#include <linux/thread_info.h>
11#include <linux/capability.h> 11#include <linux/capability.h>
12#include <linux/miscdevice.h> 12#include <linux/miscdevice.h>
13#include <linux/interrupt.h>
14#include <linux/ratelimit.h> 13#include <linux/ratelimit.h>
15#include <linux/kallsyms.h> 14#include <linux/kallsyms.h>
16#include <linux/rcupdate.h> 15#include <linux/rcupdate.h>
@@ -38,23 +37,20 @@
38#include <linux/mm.h> 37#include <linux/mm.h>
39#include <linux/debugfs.h> 38#include <linux/debugfs.h>
40#include <linux/edac_mce.h> 39#include <linux/edac_mce.h>
40#include <linux/irq_work.h>
41 41
42#include <asm/processor.h> 42#include <asm/processor.h>
43#include <asm/hw_irq.h>
44#include <asm/apic.h>
45#include <asm/idle.h>
46#include <asm/ipi.h>
47#include <asm/mce.h> 43#include <asm/mce.h>
48#include <asm/msr.h> 44#include <asm/msr.h>
49 45
50#include "mce-internal.h" 46#include "mce-internal.h"
51 47
52static DEFINE_MUTEX(mce_read_mutex); 48static DEFINE_MUTEX(mce_chrdev_read_mutex);
53 49
54#define rcu_dereference_check_mce(p) \ 50#define rcu_dereference_check_mce(p) \
55 rcu_dereference_index_check((p), \ 51 rcu_dereference_index_check((p), \
56 rcu_read_lock_sched_held() || \ 52 rcu_read_lock_sched_held() || \
57 lockdep_is_held(&mce_read_mutex)) 53 lockdep_is_held(&mce_chrdev_read_mutex))
58 54
59#define CREATE_TRACE_POINTS 55#define CREATE_TRACE_POINTS
60#include <trace/events/mce.h> 56#include <trace/events/mce.h>
@@ -94,7 +90,8 @@ static unsigned long mce_need_notify;
94static char mce_helper[128]; 90static char mce_helper[128];
95static char *mce_helper_argv[2] = { mce_helper, NULL }; 91static char *mce_helper_argv[2] = { mce_helper, NULL };
96 92
97static DECLARE_WAIT_QUEUE_HEAD(mce_wait); 93static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
94
98static DEFINE_PER_CPU(struct mce, mces_seen); 95static DEFINE_PER_CPU(struct mce, mces_seen);
99static int cpu_missing; 96static int cpu_missing;
100 97
@@ -373,6 +370,31 @@ static void mce_wrmsrl(u32 msr, u64 v)
373} 370}
374 371
375/* 372/*
373 * Collect all global (w.r.t. this processor) status about this machine
374 * check into our "mce" struct so that we can use it later to assess
375 * the severity of the problem as we read per-bank specific details.
376 */
377static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
378{
379 mce_setup(m);
380
381 m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
382 if (regs) {
383 /*
384 * Get the address of the instruction at the time of
385 * the machine check error.
386 */
387 if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
388 m->ip = regs->ip;
389 m->cs = regs->cs;
390 }
391 /* Use accurate RIP reporting if available. */
392 if (rip_msr)
393 m->ip = mce_rdmsrl(rip_msr);
394 }
395}
396
397/*
376 * Simple lockless ring to communicate PFNs from the exception handler with the 398 * Simple lockless ring to communicate PFNs from the exception handler with the
377 * process context work function. This is vastly simplified because there's 399 * process context work function. This is vastly simplified because there's
378 * only a single reader and a single writer. 400 * only a single reader and a single writer.
@@ -443,40 +465,13 @@ static void mce_schedule_work(void)
443 } 465 }
444} 466}
445 467
446/* 468DEFINE_PER_CPU(struct irq_work, mce_irq_work);
447 * Get the address of the instruction at the time of the machine check
448 * error.
449 */
450static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
451{
452
453 if (regs && (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV))) {
454 m->ip = regs->ip;
455 m->cs = regs->cs;
456 } else {
457 m->ip = 0;
458 m->cs = 0;
459 }
460 if (rip_msr)
461 m->ip = mce_rdmsrl(rip_msr);
462}
463 469
464#ifdef CONFIG_X86_LOCAL_APIC 470static void mce_irq_work_cb(struct irq_work *entry)
465/*
466 * Called after interrupts have been reenabled again
467 * when a MCE happened during an interrupts off region
468 * in the kernel.
469 */
470asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs)
471{ 471{
472 ack_APIC_irq();
473 exit_idle();
474 irq_enter();
475 mce_notify_irq(); 472 mce_notify_irq();
476 mce_schedule_work(); 473 mce_schedule_work();
477 irq_exit();
478} 474}
479#endif
480 475
481static void mce_report_event(struct pt_regs *regs) 476static void mce_report_event(struct pt_regs *regs)
482{ 477{
@@ -492,29 +487,7 @@ static void mce_report_event(struct pt_regs *regs)
492 return; 487 return;
493 } 488 }
494 489
495#ifdef CONFIG_X86_LOCAL_APIC 490 irq_work_queue(&__get_cpu_var(mce_irq_work));
496 /*
497 * Without APIC do not notify. The event will be picked
498 * up eventually.
499 */
500 if (!cpu_has_apic)
501 return;
502
503 /*
504 * When interrupts are disabled we cannot use
505 * kernel services safely. Trigger an self interrupt
506 * through the APIC to instead do the notification
507 * after interrupts are reenabled again.
508 */
509 apic->send_IPI_self(MCE_SELF_VECTOR);
510
511 /*
512 * Wait for idle afterwards again so that we don't leave the
513 * APIC in a non idle state because the normal APIC writes
514 * cannot exclude us.
515 */
516 apic_wait_icr_idle();
517#endif
518} 491}
519 492
520DEFINE_PER_CPU(unsigned, mce_poll_count); 493DEFINE_PER_CPU(unsigned, mce_poll_count);
@@ -541,9 +514,8 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
541 514
542 percpu_inc(mce_poll_count); 515 percpu_inc(mce_poll_count);
543 516
544 mce_setup(&m); 517 mce_gather_info(&m, NULL);
545 518
546 m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
547 for (i = 0; i < banks; i++) { 519 for (i = 0; i < banks; i++) {
548 if (!mce_banks[i].ctl || !test_bit(i, *b)) 520 if (!mce_banks[i].ctl || !test_bit(i, *b))
549 continue; 521 continue;
@@ -879,9 +851,9 @@ static int mce_usable_address(struct mce *m)
879{ 851{
880 if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV)) 852 if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
881 return 0; 853 return 0;
882 if ((m->misc & 0x3f) > PAGE_SHIFT) 854 if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
883 return 0; 855 return 0;
884 if (((m->misc >> 6) & 7) != MCM_ADDR_PHYS) 856 if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
885 return 0; 857 return 0;
886 return 1; 858 return 1;
887} 859}
@@ -942,9 +914,8 @@ void do_machine_check(struct pt_regs *regs, long error_code)
942 if (!banks) 914 if (!banks)
943 goto out; 915 goto out;
944 916
945 mce_setup(&m); 917 mce_gather_info(&m, regs);
946 918
947 m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
948 final = &__get_cpu_var(mces_seen); 919 final = &__get_cpu_var(mces_seen);
949 *final = m; 920 *final = m;
950 921
@@ -1028,7 +999,6 @@ void do_machine_check(struct pt_regs *regs, long error_code)
1028 if (severity == MCE_AO_SEVERITY && mce_usable_address(&m)) 999 if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
1029 mce_ring_add(m.addr >> PAGE_SHIFT); 1000 mce_ring_add(m.addr >> PAGE_SHIFT);
1030 1001
1031 mce_get_rip(&m, regs);
1032 mce_log(&m); 1002 mce_log(&m);
1033 1003
1034 if (severity > worst) { 1004 if (severity > worst) {
@@ -1190,7 +1160,8 @@ int mce_notify_irq(void)
1190 clear_thread_flag(TIF_MCE_NOTIFY); 1160 clear_thread_flag(TIF_MCE_NOTIFY);
1191 1161
1192 if (test_and_clear_bit(0, &mce_need_notify)) { 1162 if (test_and_clear_bit(0, &mce_need_notify)) {
1193 wake_up_interruptible(&mce_wait); 1163 /* wake processes polling /dev/mcelog */
1164 wake_up_interruptible(&mce_chrdev_wait);
1194 1165
1195 /* 1166 /*
1196 * There is no risk of missing notifications because 1167 * There is no risk of missing notifications because
@@ -1363,18 +1334,23 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
1363 return 0; 1334 return 0;
1364} 1335}
1365 1336
1366static void __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) 1337static int __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
1367{ 1338{
1368 if (c->x86 != 5) 1339 if (c->x86 != 5)
1369 return; 1340 return 0;
1341
1370 switch (c->x86_vendor) { 1342 switch (c->x86_vendor) {
1371 case X86_VENDOR_INTEL: 1343 case X86_VENDOR_INTEL:
1372 intel_p5_mcheck_init(c); 1344 intel_p5_mcheck_init(c);
1345 return 1;
1373 break; 1346 break;
1374 case X86_VENDOR_CENTAUR: 1347 case X86_VENDOR_CENTAUR:
1375 winchip_mcheck_init(c); 1348 winchip_mcheck_init(c);
1349 return 1;
1376 break; 1350 break;
1377 } 1351 }
1352
1353 return 0;
1378} 1354}
1379 1355
1380static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) 1356static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
@@ -1428,7 +1404,8 @@ void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c)
1428 if (mce_disabled) 1404 if (mce_disabled)
1429 return; 1405 return;
1430 1406
1431 __mcheck_cpu_ancient_init(c); 1407 if (__mcheck_cpu_ancient_init(c))
1408 return;
1432 1409
1433 if (!mce_available(c)) 1410 if (!mce_available(c))
1434 return; 1411 return;
@@ -1444,44 +1421,45 @@ void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c)
1444 __mcheck_cpu_init_vendor(c); 1421 __mcheck_cpu_init_vendor(c);
1445 __mcheck_cpu_init_timer(); 1422 __mcheck_cpu_init_timer();
1446 INIT_WORK(&__get_cpu_var(mce_work), mce_process_work); 1423 INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
1447 1424 init_irq_work(&__get_cpu_var(mce_irq_work), &mce_irq_work_cb);
1448} 1425}
1449 1426
1450/* 1427/*
1451 * Character device to read and clear the MCE log. 1428 * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log.
1452 */ 1429 */
1453 1430
1454static DEFINE_SPINLOCK(mce_state_lock); 1431static DEFINE_SPINLOCK(mce_chrdev_state_lock);
1455static int open_count; /* #times opened */ 1432static int mce_chrdev_open_count; /* #times opened */
1456static int open_exclu; /* already open exclusive? */ 1433static int mce_chrdev_open_exclu; /* already open exclusive? */
1457 1434
1458static int mce_open(struct inode *inode, struct file *file) 1435static int mce_chrdev_open(struct inode *inode, struct file *file)
1459{ 1436{
1460 spin_lock(&mce_state_lock); 1437 spin_lock(&mce_chrdev_state_lock);
1461 1438
1462 if (open_exclu || (open_count && (file->f_flags & O_EXCL))) { 1439 if (mce_chrdev_open_exclu ||
1463 spin_unlock(&mce_state_lock); 1440 (mce_chrdev_open_count && (file->f_flags & O_EXCL))) {
1441 spin_unlock(&mce_chrdev_state_lock);
1464 1442
1465 return -EBUSY; 1443 return -EBUSY;
1466 } 1444 }
1467 1445
1468 if (file->f_flags & O_EXCL) 1446 if (file->f_flags & O_EXCL)
1469 open_exclu = 1; 1447 mce_chrdev_open_exclu = 1;
1470 open_count++; 1448 mce_chrdev_open_count++;
1471 1449
1472 spin_unlock(&mce_state_lock); 1450 spin_unlock(&mce_chrdev_state_lock);
1473 1451
1474 return nonseekable_open(inode, file); 1452 return nonseekable_open(inode, file);
1475} 1453}
1476 1454
1477static int mce_release(struct inode *inode, struct file *file) 1455static int mce_chrdev_release(struct inode *inode, struct file *file)
1478{ 1456{
1479 spin_lock(&mce_state_lock); 1457 spin_lock(&mce_chrdev_state_lock);
1480 1458
1481 open_count--; 1459 mce_chrdev_open_count--;
1482 open_exclu = 0; 1460 mce_chrdev_open_exclu = 0;
1483 1461
1484 spin_unlock(&mce_state_lock); 1462 spin_unlock(&mce_chrdev_state_lock);
1485 1463
1486 return 0; 1464 return 0;
1487} 1465}
@@ -1530,8 +1508,8 @@ static int __mce_read_apei(char __user **ubuf, size_t usize)
1530 return 0; 1508 return 0;
1531} 1509}
1532 1510
1533static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, 1511static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
1534 loff_t *off) 1512 size_t usize, loff_t *off)
1535{ 1513{
1536 char __user *buf = ubuf; 1514 char __user *buf = ubuf;
1537 unsigned long *cpu_tsc; 1515 unsigned long *cpu_tsc;
@@ -1542,7 +1520,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
1542 if (!cpu_tsc) 1520 if (!cpu_tsc)
1543 return -ENOMEM; 1521 return -ENOMEM;
1544 1522
1545 mutex_lock(&mce_read_mutex); 1523 mutex_lock(&mce_chrdev_read_mutex);
1546 1524
1547 if (!mce_apei_read_done) { 1525 if (!mce_apei_read_done) {
1548 err = __mce_read_apei(&buf, usize); 1526 err = __mce_read_apei(&buf, usize);
@@ -1562,19 +1540,18 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
1562 do { 1540 do {
1563 for (i = prev; i < next; i++) { 1541 for (i = prev; i < next; i++) {
1564 unsigned long start = jiffies; 1542 unsigned long start = jiffies;
1543 struct mce *m = &mcelog.entry[i];
1565 1544
1566 while (!mcelog.entry[i].finished) { 1545 while (!m->finished) {
1567 if (time_after_eq(jiffies, start + 2)) { 1546 if (time_after_eq(jiffies, start + 2)) {
1568 memset(mcelog.entry + i, 0, 1547 memset(m, 0, sizeof(*m));
1569 sizeof(struct mce));
1570 goto timeout; 1548 goto timeout;
1571 } 1549 }
1572 cpu_relax(); 1550 cpu_relax();
1573 } 1551 }
1574 smp_rmb(); 1552 smp_rmb();
1575 err |= copy_to_user(buf, mcelog.entry + i, 1553 err |= copy_to_user(buf, m, sizeof(*m));
1576 sizeof(struct mce)); 1554 buf += sizeof(*m);
1577 buf += sizeof(struct mce);
1578timeout: 1555timeout:
1579 ; 1556 ;
1580 } 1557 }
@@ -1594,13 +1571,13 @@ timeout:
1594 on_each_cpu(collect_tscs, cpu_tsc, 1); 1571 on_each_cpu(collect_tscs, cpu_tsc, 1);
1595 1572
1596 for (i = next; i < MCE_LOG_LEN; i++) { 1573 for (i = next; i < MCE_LOG_LEN; i++) {
1597 if (mcelog.entry[i].finished && 1574 struct mce *m = &mcelog.entry[i];
1598 mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) { 1575
1599 err |= copy_to_user(buf, mcelog.entry+i, 1576 if (m->finished && m->tsc < cpu_tsc[m->cpu]) {
1600 sizeof(struct mce)); 1577 err |= copy_to_user(buf, m, sizeof(*m));
1601 smp_rmb(); 1578 smp_rmb();
1602 buf += sizeof(struct mce); 1579 buf += sizeof(*m);
1603 memset(&mcelog.entry[i], 0, sizeof(struct mce)); 1580 memset(m, 0, sizeof(*m));
1604 } 1581 }
1605 } 1582 }
1606 1583
@@ -1608,15 +1585,15 @@ timeout:
1608 err = -EFAULT; 1585 err = -EFAULT;
1609 1586
1610out: 1587out:
1611 mutex_unlock(&mce_read_mutex); 1588 mutex_unlock(&mce_chrdev_read_mutex);
1612 kfree(cpu_tsc); 1589 kfree(cpu_tsc);
1613 1590
1614 return err ? err : buf - ubuf; 1591 return err ? err : buf - ubuf;
1615} 1592}
1616 1593
1617static unsigned int mce_poll(struct file *file, poll_table *wait) 1594static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait)
1618{ 1595{
1619 poll_wait(file, &mce_wait, wait); 1596 poll_wait(file, &mce_chrdev_wait, wait);
1620 if (rcu_access_index(mcelog.next)) 1597 if (rcu_access_index(mcelog.next))
1621 return POLLIN | POLLRDNORM; 1598 return POLLIN | POLLRDNORM;
1622 if (!mce_apei_read_done && apei_check_mce()) 1599 if (!mce_apei_read_done && apei_check_mce())
@@ -1624,7 +1601,8 @@ static unsigned int mce_poll(struct file *file, poll_table *wait)
1624 return 0; 1601 return 0;
1625} 1602}
1626 1603
1627static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg) 1604static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
1605 unsigned long arg)
1628{ 1606{
1629 int __user *p = (int __user *)arg; 1607 int __user *p = (int __user *)arg;
1630 1608
@@ -1652,16 +1630,16 @@ static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
1652 1630
1653/* Modified in mce-inject.c, so not static or const */ 1631/* Modified in mce-inject.c, so not static or const */
1654struct file_operations mce_chrdev_ops = { 1632struct file_operations mce_chrdev_ops = {
1655 .open = mce_open, 1633 .open = mce_chrdev_open,
1656 .release = mce_release, 1634 .release = mce_chrdev_release,
1657 .read = mce_read, 1635 .read = mce_chrdev_read,
1658 .poll = mce_poll, 1636 .poll = mce_chrdev_poll,
1659 .unlocked_ioctl = mce_ioctl, 1637 .unlocked_ioctl = mce_chrdev_ioctl,
1660 .llseek = no_llseek, 1638 .llseek = no_llseek,
1661}; 1639};
1662EXPORT_SYMBOL_GPL(mce_chrdev_ops); 1640EXPORT_SYMBOL_GPL(mce_chrdev_ops);
1663 1641
1664static struct miscdevice mce_log_device = { 1642static struct miscdevice mce_chrdev_device = {
1665 MISC_MCELOG_MINOR, 1643 MISC_MCELOG_MINOR,
1666 "mcelog", 1644 "mcelog",
1667 &mce_chrdev_ops, 1645 &mce_chrdev_ops,
@@ -1719,7 +1697,7 @@ int __init mcheck_init(void)
1719} 1697}
1720 1698
1721/* 1699/*
1722 * Sysfs support 1700 * mce_syscore: PM support
1723 */ 1701 */
1724 1702
1725/* 1703/*
@@ -1739,12 +1717,12 @@ static int mce_disable_error_reporting(void)
1739 return 0; 1717 return 0;
1740} 1718}
1741 1719
1742static int mce_suspend(void) 1720static int mce_syscore_suspend(void)
1743{ 1721{
1744 return mce_disable_error_reporting(); 1722 return mce_disable_error_reporting();
1745} 1723}
1746 1724
1747static void mce_shutdown(void) 1725static void mce_syscore_shutdown(void)
1748{ 1726{
1749 mce_disable_error_reporting(); 1727 mce_disable_error_reporting();
1750} 1728}
@@ -1754,18 +1732,22 @@ static void mce_shutdown(void)
1754 * Only one CPU is active at this time, the others get re-added later using 1732 * Only one CPU is active at this time, the others get re-added later using
1755 * CPU hotplug: 1733 * CPU hotplug:
1756 */ 1734 */
1757static void mce_resume(void) 1735static void mce_syscore_resume(void)
1758{ 1736{
1759 __mcheck_cpu_init_generic(); 1737 __mcheck_cpu_init_generic();
1760 __mcheck_cpu_init_vendor(__this_cpu_ptr(&cpu_info)); 1738 __mcheck_cpu_init_vendor(__this_cpu_ptr(&cpu_info));
1761} 1739}
1762 1740
1763static struct syscore_ops mce_syscore_ops = { 1741static struct syscore_ops mce_syscore_ops = {
1764 .suspend = mce_suspend, 1742 .suspend = mce_syscore_suspend,
1765 .shutdown = mce_shutdown, 1743 .shutdown = mce_syscore_shutdown,
1766 .resume = mce_resume, 1744 .resume = mce_syscore_resume,
1767}; 1745};
1768 1746
1747/*
1748 * mce_sysdev: Sysfs support
1749 */
1750
1769static void mce_cpu_restart(void *data) 1751static void mce_cpu_restart(void *data)
1770{ 1752{
1771 del_timer_sync(&__get_cpu_var(mce_timer)); 1753 del_timer_sync(&__get_cpu_var(mce_timer));
@@ -1801,11 +1783,11 @@ static void mce_enable_ce(void *all)
1801 __mcheck_cpu_init_timer(); 1783 __mcheck_cpu_init_timer();
1802} 1784}
1803 1785
1804static struct sysdev_class mce_sysclass = { 1786static struct sysdev_class mce_sysdev_class = {
1805 .name = "machinecheck", 1787 .name = "machinecheck",
1806}; 1788};
1807 1789
1808DEFINE_PER_CPU(struct sys_device, mce_dev); 1790DEFINE_PER_CPU(struct sys_device, mce_sysdev);
1809 1791
1810__cpuinitdata 1792__cpuinitdata
1811void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); 1793void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
@@ -1934,7 +1916,7 @@ static struct sysdev_ext_attribute attr_cmci_disabled = {
1934 &mce_cmci_disabled 1916 &mce_cmci_disabled
1935}; 1917};
1936 1918
1937static struct sysdev_attribute *mce_attrs[] = { 1919static struct sysdev_attribute *mce_sysdev_attrs[] = {
1938 &attr_tolerant.attr, 1920 &attr_tolerant.attr,
1939 &attr_check_interval.attr, 1921 &attr_check_interval.attr,
1940 &attr_trigger, 1922 &attr_trigger,
@@ -1945,66 +1927,67 @@ static struct sysdev_attribute *mce_attrs[] = {
1945 NULL 1927 NULL
1946}; 1928};
1947 1929
1948static cpumask_var_t mce_dev_initialized; 1930static cpumask_var_t mce_sysdev_initialized;
1949 1931
1950/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */ 1932/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */
1951static __cpuinit int mce_create_device(unsigned int cpu) 1933static __cpuinit int mce_sysdev_create(unsigned int cpu)
1952{ 1934{
1935 struct sys_device *sysdev = &per_cpu(mce_sysdev, cpu);
1953 int err; 1936 int err;
1954 int i, j; 1937 int i, j;
1955 1938
1956 if (!mce_available(&boot_cpu_data)) 1939 if (!mce_available(&boot_cpu_data))
1957 return -EIO; 1940 return -EIO;
1958 1941
1959 memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject)); 1942 memset(&sysdev->kobj, 0, sizeof(struct kobject));
1960 per_cpu(mce_dev, cpu).id = cpu; 1943 sysdev->id = cpu;
1961 per_cpu(mce_dev, cpu).cls = &mce_sysclass; 1944 sysdev->cls = &mce_sysdev_class;
1962 1945
1963 err = sysdev_register(&per_cpu(mce_dev, cpu)); 1946 err = sysdev_register(sysdev);
1964 if (err) 1947 if (err)
1965 return err; 1948 return err;
1966 1949
1967 for (i = 0; mce_attrs[i]; i++) { 1950 for (i = 0; mce_sysdev_attrs[i]; i++) {
1968 err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); 1951 err = sysdev_create_file(sysdev, mce_sysdev_attrs[i]);
1969 if (err) 1952 if (err)
1970 goto error; 1953 goto error;
1971 } 1954 }
1972 for (j = 0; j < banks; j++) { 1955 for (j = 0; j < banks; j++) {
1973 err = sysdev_create_file(&per_cpu(mce_dev, cpu), 1956 err = sysdev_create_file(sysdev, &mce_banks[j].attr);
1974 &mce_banks[j].attr);
1975 if (err) 1957 if (err)
1976 goto error2; 1958 goto error2;
1977 } 1959 }
1978 cpumask_set_cpu(cpu, mce_dev_initialized); 1960 cpumask_set_cpu(cpu, mce_sysdev_initialized);
1979 1961
1980 return 0; 1962 return 0;
1981error2: 1963error2:
1982 while (--j >= 0) 1964 while (--j >= 0)
1983 sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[j].attr); 1965 sysdev_remove_file(sysdev, &mce_banks[j].attr);
1984error: 1966error:
1985 while (--i >= 0) 1967 while (--i >= 0)
1986 sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); 1968 sysdev_remove_file(sysdev, mce_sysdev_attrs[i]);
1987 1969
1988 sysdev_unregister(&per_cpu(mce_dev, cpu)); 1970 sysdev_unregister(sysdev);
1989 1971
1990 return err; 1972 return err;
1991} 1973}
1992 1974
1993static __cpuinit void mce_remove_device(unsigned int cpu) 1975static __cpuinit void mce_sysdev_remove(unsigned int cpu)
1994{ 1976{
1977 struct sys_device *sysdev = &per_cpu(mce_sysdev, cpu);
1995 int i; 1978 int i;
1996 1979
1997 if (!cpumask_test_cpu(cpu, mce_dev_initialized)) 1980 if (!cpumask_test_cpu(cpu, mce_sysdev_initialized))
1998 return; 1981 return;
1999 1982
2000 for (i = 0; mce_attrs[i]; i++) 1983 for (i = 0; mce_sysdev_attrs[i]; i++)
2001 sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); 1984 sysdev_remove_file(sysdev, mce_sysdev_attrs[i]);
2002 1985
2003 for (i = 0; i < banks; i++) 1986 for (i = 0; i < banks; i++)
2004 sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr); 1987 sysdev_remove_file(sysdev, &mce_banks[i].attr);
2005 1988
2006 sysdev_unregister(&per_cpu(mce_dev, cpu)); 1989 sysdev_unregister(sysdev);
2007 cpumask_clear_cpu(cpu, mce_dev_initialized); 1990 cpumask_clear_cpu(cpu, mce_sysdev_initialized);
2008} 1991}
2009 1992
2010/* Make sure there are no machine checks on offlined CPUs. */ 1993/* Make sure there are no machine checks on offlined CPUs. */
@@ -2054,7 +2037,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
2054 switch (action) { 2037 switch (action) {
2055 case CPU_ONLINE: 2038 case CPU_ONLINE:
2056 case CPU_ONLINE_FROZEN: 2039 case CPU_ONLINE_FROZEN:
2057 mce_create_device(cpu); 2040 mce_sysdev_create(cpu);
2058 if (threshold_cpu_callback) 2041 if (threshold_cpu_callback)
2059 threshold_cpu_callback(action, cpu); 2042 threshold_cpu_callback(action, cpu);
2060 break; 2043 break;
@@ -2062,7 +2045,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
2062 case CPU_DEAD_FROZEN: 2045 case CPU_DEAD_FROZEN:
2063 if (threshold_cpu_callback) 2046 if (threshold_cpu_callback)
2064 threshold_cpu_callback(action, cpu); 2047 threshold_cpu_callback(action, cpu);
2065 mce_remove_device(cpu); 2048 mce_sysdev_remove(cpu);
2066 break; 2049 break;
2067 case CPU_DOWN_PREPARE: 2050 case CPU_DOWN_PREPARE:
2068 case CPU_DOWN_PREPARE_FROZEN: 2051 case CPU_DOWN_PREPARE_FROZEN:
@@ -2116,27 +2099,28 @@ static __init int mcheck_init_device(void)
2116 if (!mce_available(&boot_cpu_data)) 2099 if (!mce_available(&boot_cpu_data))
2117 return -EIO; 2100 return -EIO;
2118 2101
2119 zalloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL); 2102 zalloc_cpumask_var(&mce_sysdev_initialized, GFP_KERNEL);
2120 2103
2121 mce_init_banks(); 2104 mce_init_banks();
2122 2105
2123 err = sysdev_class_register(&mce_sysclass); 2106 err = sysdev_class_register(&mce_sysdev_class);
2124 if (err) 2107 if (err)
2125 return err; 2108 return err;
2126 2109
2127 for_each_online_cpu(i) { 2110 for_each_online_cpu(i) {
2128 err = mce_create_device(i); 2111 err = mce_sysdev_create(i);
2129 if (err) 2112 if (err)
2130 return err; 2113 return err;
2131 } 2114 }
2132 2115
2133 register_syscore_ops(&mce_syscore_ops); 2116 register_syscore_ops(&mce_syscore_ops);
2134 register_hotcpu_notifier(&mce_cpu_notifier); 2117 register_hotcpu_notifier(&mce_cpu_notifier);
2135 misc_register(&mce_log_device); 2118
2119 /* register character device /dev/mcelog */
2120 misc_register(&mce_chrdev_device);
2136 2121
2137 return err; 2122 return err;
2138} 2123}
2139
2140device_initcall(mcheck_init_device); 2124device_initcall(mcheck_init_device);
2141 2125
2142/* 2126/*
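
The functional core of the mce.c change is the switch from a hand-rolled self-IPI (smp_mce_self_interrupt() raised via apic->send_IPI_self()) to the generic irq_work machinery: work queued from machine-check context runs later in a context where wakeups and work scheduling are legal, and the old !cpu_has_apic special case disappears. The rest of the diff is the mce_chrdev_*/mce_sysdev_*/mce_syscore_* renames plus the mce_gather_info() consolidation. A minimal sketch of the irq_work pattern being adopted, with illustrative names:

#include <linux/irq_work.h>
#include <linux/percpu.h>

/* Callback runs later with interrupts enabled; wakeups etc. are safe here. */
static void example_irq_work_cb(struct irq_work *work)
{
	/* e.g. wake up /dev/mcelog readers, schedule process-context work */
}

static DEFINE_PER_CPU(struct irq_work, example_irq_work);

static void example_setup(void)			/* per-CPU init time */
{
	init_irq_work(&__get_cpu_var(example_irq_work), example_irq_work_cb);
}

static void example_raise(void)			/* from NMI/#MC context */
{
	/* queueing is NMI-safe; the callback is deferred */
	irq_work_queue(&__get_cpu_var(example_irq_work));
}
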
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index bb0adad3514..f5474218cff 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -548,7 +548,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
548 if (!b) 548 if (!b)
549 goto out; 549 goto out;
550 550
551 err = sysfs_create_link(&per_cpu(mce_dev, cpu).kobj, 551 err = sysfs_create_link(&per_cpu(mce_sysdev, cpu).kobj,
552 b->kobj, name); 552 b->kobj, name);
553 if (err) 553 if (err)
554 goto out; 554 goto out;
@@ -571,7 +571,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
571 goto out; 571 goto out;
572 } 572 }
573 573
574 b->kobj = kobject_create_and_add(name, &per_cpu(mce_dev, cpu).kobj); 574 b->kobj = kobject_create_and_add(name, &per_cpu(mce_sysdev, cpu).kobj);
575 if (!b->kobj) 575 if (!b->kobj)
576 goto out_free; 576 goto out_free;
577 577
@@ -591,7 +591,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
591 if (i == cpu) 591 if (i == cpu)
592 continue; 592 continue;
593 593
594 err = sysfs_create_link(&per_cpu(mce_dev, i).kobj, 594 err = sysfs_create_link(&per_cpu(mce_sysdev, i).kobj,
595 b->kobj, name); 595 b->kobj, name);
596 if (err) 596 if (err)
597 goto out; 597 goto out;
@@ -669,7 +669,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
669#ifdef CONFIG_SMP 669#ifdef CONFIG_SMP
670 /* sibling symlink */ 670 /* sibling symlink */
671 if (shared_bank[bank] && b->blocks->cpu != cpu) { 671 if (shared_bank[bank] && b->blocks->cpu != cpu) {
672 sysfs_remove_link(&per_cpu(mce_dev, cpu).kobj, name); 672 sysfs_remove_link(&per_cpu(mce_sysdev, cpu).kobj, name);
673 per_cpu(threshold_banks, cpu)[bank] = NULL; 673 per_cpu(threshold_banks, cpu)[bank] = NULL;
674 674
675 return; 675 return;
@@ -681,7 +681,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
681 if (i == cpu) 681 if (i == cpu)
682 continue; 682 continue;
683 683
684 sysfs_remove_link(&per_cpu(mce_dev, i).kobj, name); 684 sysfs_remove_link(&per_cpu(mce_sysdev, i).kobj, name);
685 per_cpu(threshold_banks, i)[bank] = NULL; 685 per_cpu(threshold_banks, i)[bank] = NULL;
686 } 686 }
687 687
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 929739a653d..6b96110bb0c 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -79,7 +79,6 @@ void set_mtrr_ops(const struct mtrr_ops *ops)
79static int have_wrcomb(void) 79static int have_wrcomb(void)
80{ 80{
81 struct pci_dev *dev; 81 struct pci_dev *dev;
82 u8 rev;
83 82
84 dev = pci_get_class(PCI_CLASS_BRIDGE_HOST << 8, NULL); 83 dev = pci_get_class(PCI_CLASS_BRIDGE_HOST << 8, NULL);
85 if (dev != NULL) { 84 if (dev != NULL) {
@@ -89,13 +88,11 @@ static int have_wrcomb(void)
89 * chipsets to be tagged 88 * chipsets to be tagged
90 */ 89 */
91 if (dev->vendor == PCI_VENDOR_ID_SERVERWORKS && 90 if (dev->vendor == PCI_VENDOR_ID_SERVERWORKS &&
92 dev->device == PCI_DEVICE_ID_SERVERWORKS_LE) { 91 dev->device == PCI_DEVICE_ID_SERVERWORKS_LE &&
93 pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev); 92 dev->revision <= 5) {
94 if (rev <= 5) { 93 pr_info("mtrr: Serverworks LE rev < 6 detected. Write-combining disabled.\n");
95 pr_info("mtrr: Serverworks LE rev < 6 detected. Write-combining disabled.\n"); 94 pci_dev_put(dev);
96 pci_dev_put(dev); 95 return 0;
97 return 0;
98 }
99 } 96 }
100 /* 97 /*
101 * Intel 450NX errata # 23. Non ascending cacheline evictions to 98 * Intel 450NX errata # 23. Non ascending cacheline evictions to
@@ -137,56 +134,42 @@ static void __init init_table(void)
137} 134}
138 135
139struct set_mtrr_data { 136struct set_mtrr_data {
140 atomic_t count;
141 atomic_t gate;
142 unsigned long smp_base; 137 unsigned long smp_base;
143 unsigned long smp_size; 138 unsigned long smp_size;
144 unsigned int smp_reg; 139 unsigned int smp_reg;
145 mtrr_type smp_type; 140 mtrr_type smp_type;
146}; 141};
147 142
148static DEFINE_PER_CPU(struct cpu_stop_work, mtrr_work);
149
150/** 143/**
151 * mtrr_work_handler - Synchronisation handler. Executed by "other" CPUs. 144 * mtrr_rendezvous_handler - Work done in the synchronization handler. Executed
145 * by all the CPUs.
152 * @info: pointer to mtrr configuration data 146 * @info: pointer to mtrr configuration data
153 * 147 *
154 * Returns nothing. 148 * Returns nothing.
155 */ 149 */
156static int mtrr_work_handler(void *info) 150static int mtrr_rendezvous_handler(void *info)
157{ 151{
158#ifdef CONFIG_SMP
159 struct set_mtrr_data *data = info; 152 struct set_mtrr_data *data = info;
160 unsigned long flags;
161 153
162 atomic_dec(&data->count); 154 /*
163 while (!atomic_read(&data->gate)) 155 * We use this same function to initialize the mtrrs during boot,
164 cpu_relax(); 156 * resume, runtime cpu online and on an explicit request to set a
165 157 * specific MTRR.
166 local_irq_save(flags); 158 *
167 159 * During boot or suspend, the state of the boot cpu's mtrrs has been
168 atomic_dec(&data->count); 160 * saved, and we want to replicate that across all the cpus that come
169 while (atomic_read(&data->gate)) 161 * online (either at the end of boot or resume or during a runtime cpu
170 cpu_relax(); 162 * online). If we're doing that, @reg is set to something special and on
171 163 * all the cpu's we do mtrr_if->set_all() (On the logical cpu that
172 /* The master has cleared me to execute */ 164 * started the boot/resume sequence, this might be a duplicate
165 * set_all()).
166 */
173 if (data->smp_reg != ~0U) { 167 if (data->smp_reg != ~0U) {
174 mtrr_if->set(data->smp_reg, data->smp_base, 168 mtrr_if->set(data->smp_reg, data->smp_base,
175 data->smp_size, data->smp_type); 169 data->smp_size, data->smp_type);
176 } else if (mtrr_aps_delayed_init) { 170 } else if (mtrr_aps_delayed_init || !cpu_online(smp_processor_id())) {
177 /*
178 * Initialize the MTRRs inaddition to the synchronisation.
179 */
180 mtrr_if->set_all(); 171 mtrr_if->set_all();
181 } 172 }
182
183 atomic_dec(&data->count);
184 while (!atomic_read(&data->gate))
185 cpu_relax();
186
187 atomic_dec(&data->count);
188 local_irq_restore(flags);
189#endif
190 return 0; 173 return 0;
191} 174}
192 175
@@ -223,20 +206,11 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2)
223 * 14. Wait for buddies to catch up 206 * 14. Wait for buddies to catch up
224 * 15. Enable interrupts. 207 * 15. Enable interrupts.
225 * 208 *
226 * What does that mean for us? Well, first we set data.count to the number 209 * What does that mean for us? Well, stop_machine() will ensure that
227 * of CPUs. As each CPU announces that it started the rendezvous handler by 210 * the rendezvous handler is started on each CPU. And in lockstep they
228 * decrementing the count, We reset data.count and set the data.gate flag 211 * do the state transition of disabling interrupts, updating MTRR's
229 * allowing all the cpu's to proceed with the work. As each cpu disables 212 * (the CPU vendors may each do it differently, so we call mtrr_if->set()
230 * interrupts, it'll decrement data.count once. We wait until it hits 0 and 213 * callback and let them take care of it.) and enabling interrupts.
231 * proceed. We clear the data.gate flag and reset data.count. Meanwhile, they
232 * are waiting for that flag to be cleared. Once it's cleared, each
233 * CPU goes through the transition of updating MTRRs.
234 * The CPU vendors may each do it differently,
235 * so we call mtrr_if->set() callback and let them take care of it.
236 * When they're done, they again decrement data->count and wait for data.gate
237 * to be set.
238 * When we finish, we wait for data.count to hit 0 and toggle the data.gate flag
239 * Everyone then enables interrupts and we all continue on.
240 * 214 *
241 * Note that the mechanism is the same for UP systems, too; all the SMP stuff 215 * Note that the mechanism is the same for UP systems, too; all the SMP stuff
242 * becomes nops. 216 * becomes nops.
@@ -244,92 +218,26 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2)
244static void 218static void
245set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type) 219set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type)
246{ 220{
247 struct set_mtrr_data data; 221 struct set_mtrr_data data = { .smp_reg = reg,
248 unsigned long flags; 222 .smp_base = base,
249 int cpu; 223 .smp_size = size,
224 .smp_type = type
225 };
250 226
251 preempt_disable(); 227 stop_machine(mtrr_rendezvous_handler, &data, cpu_online_mask);
252 228}
253 data.smp_reg = reg;
254 data.smp_base = base;
255 data.smp_size = size;
256 data.smp_type = type;
257 atomic_set(&data.count, num_booting_cpus() - 1);
258
259 /* Make sure data.count is visible before unleashing other CPUs */
260 smp_wmb();
261 atomic_set(&data.gate, 0);
262
263 /* Start the ball rolling on other CPUs */
264 for_each_online_cpu(cpu) {
265 struct cpu_stop_work *work = &per_cpu(mtrr_work, cpu);
266
267 if (cpu == smp_processor_id())
268 continue;
269
270 stop_one_cpu_nowait(cpu, mtrr_work_handler, &data, work);
271 }
272
273
274 while (atomic_read(&data.count))
275 cpu_relax();
276
277 /* Ok, reset count and toggle gate */
278 atomic_set(&data.count, num_booting_cpus() - 1);
279 smp_wmb();
280 atomic_set(&data.gate, 1);
281
282 local_irq_save(flags);
283
284 while (atomic_read(&data.count))
285 cpu_relax();
286
287 /* Ok, reset count and toggle gate */
288 atomic_set(&data.count, num_booting_cpus() - 1);
289 smp_wmb();
290 atomic_set(&data.gate, 0);
291
292 /* Do our MTRR business */
293
294 /*
295 * HACK!
296 *
297 * We use this same function to initialize the mtrrs during boot,
298 * resume, runtime cpu online and on an explicit request to set a
299 * specific MTRR.
300 *
301 * During boot or suspend, the state of the boot cpu's mtrrs has been
302 * saved, and we want to replicate that across all the cpus that come
303 * online (either at the end of boot or resume or during a runtime cpu
304 * online). If we're doing that, @reg is set to something special and on
305 * this cpu we still do mtrr_if->set_all(). During boot/resume, this
306 * is unnecessary if at this point we are still on the cpu that started
307 * the boot/resume sequence. But there is no guarantee that we are still
308 * on the same cpu. So we do mtrr_if->set_all() on this cpu aswell to be
309 * sure that we are in sync with everyone else.
310 */
311 if (reg != ~0U)
312 mtrr_if->set(reg, base, size, type);
313 else
314 mtrr_if->set_all();
315
316 /* Wait for the others */
317 while (atomic_read(&data.count))
318 cpu_relax();
319
320 atomic_set(&data.count, num_booting_cpus() - 1);
321 smp_wmb();
322 atomic_set(&data.gate, 1);
323
324 /*
325 * Wait here for everyone to have seen the gate change
326 * So we're the last ones to touch 'data'
327 */
328 while (atomic_read(&data.count))
329 cpu_relax();
330 229
331 local_irq_restore(flags); 230static void set_mtrr_from_inactive_cpu(unsigned int reg, unsigned long base,
332 preempt_enable(); 231 unsigned long size, mtrr_type type)
232{
233 struct set_mtrr_data data = { .smp_reg = reg,
234 .smp_base = base,
235 .smp_size = size,
236 .smp_type = type
237 };
238
239 stop_machine_from_inactive_cpu(mtrr_rendezvous_handler, &data,
240 cpu_callout_mask);
333} 241}
334 242
335/** 243/**
@@ -783,7 +691,7 @@ void mtrr_ap_init(void)
783 * 2. cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug 691 * 2. cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug
784 * lock to prevent mtrr entry changes 692 * lock to prevent mtrr entry changes
785 */ 693 */
786 set_mtrr(~0U, 0, 0, 0); 694 set_mtrr_from_inactive_cpu(~0U, 0, 0, 0);
787} 695}
788 696
789/** 697/**
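
The mtrr/main.c rewrite drops the home-grown count/gate rendezvous and instead runs one handler on every CPU through stop_machine(), which already gives the "all CPUs in lockstep with interrupts off" behaviour the remaining comment describes; the new set_mtrr_from_inactive_cpu() path uses stop_machine_from_inactive_cpu() for mtrr_ap_init(), where the calling CPU is not yet fully online. A hedged sketch of the basic stop_machine() usage pattern (illustrative, not the patch's code):

#include <linux/kernel.h>
#include <linux/smp.h>
#include <linux/cpumask.h>
#include <linux/stop_machine.h>

struct example_mtrr_args {
	unsigned int reg;
	/* base, size and type would follow, as in struct set_mtrr_data */
};

/* Runs on every CPU in the mask, in lockstep, with interrupts disabled. */
static int example_rendezvous(void *info)
{
	struct example_mtrr_args *args = info;

	/* program this CPU's MTRR state from *args here */
	pr_debug("would program MTRR reg %u on CPU %d\n",
		 args->reg, smp_processor_id());
	return 0;
}

static void example_set_on_all_cpus(unsigned int reg)
{
	struct example_mtrr_args args = { .reg = reg };

	/* returns once every online CPU has executed the handler */
	stop_machine(example_rendezvous, &args, cpu_online_mask);
}
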
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 3a0338b4b17..cfa62ec090e 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -22,7 +22,6 @@
22#include <linux/sched.h> 22#include <linux/sched.h>
23#include <linux/uaccess.h> 23#include <linux/uaccess.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/highmem.h>
26#include <linux/cpu.h> 25#include <linux/cpu.h>
27#include <linux/bitops.h> 26#include <linux/bitops.h>
28 27
@@ -45,38 +44,27 @@ do { \
45#endif 44#endif
46 45
47/* 46/*
48 * best effort, GUP based copy_from_user() that assumes IRQ or NMI context 47 * | NHM/WSM | SNB |
48 * register -------------------------------
49 * | HT | no HT | HT | no HT |
50 *-----------------------------------------
51 * offcore | core | core | cpu | core |
52 * lbr_sel | core | core | cpu | core |
53 * ld_lat | cpu | core | cpu | core |
54 *-----------------------------------------
55 *
56 * Given that there is a small number of shared regs,
57 * we can pre-allocate their slot in the per-cpu
58 * per-core reg tables.
49 */ 59 */
50static unsigned long 60enum extra_reg_type {
51copy_from_user_nmi(void *to, const void __user *from, unsigned long n) 61 EXTRA_REG_NONE = -1, /* not used */
52{
53 unsigned long offset, addr = (unsigned long)from;
54 unsigned long size, len = 0;
55 struct page *page;
56 void *map;
57 int ret;
58
59 do {
60 ret = __get_user_pages_fast(addr, 1, 0, &page);
61 if (!ret)
62 break;
63
64 offset = addr & (PAGE_SIZE - 1);
65 size = min(PAGE_SIZE - offset, n - len);
66
67 map = kmap_atomic(page);
68 memcpy(to, map+offset, size);
69 kunmap_atomic(map);
70 put_page(page);
71 62
72 len += size; 63 EXTRA_REG_RSP_0 = 0, /* offcore_response_0 */
73 to += size; 64 EXTRA_REG_RSP_1 = 1, /* offcore_response_1 */
74 addr += size;
75 65
76 } while (len < n); 66 EXTRA_REG_MAX /* number of entries needed */
77 67};
78 return len;
79}
80 68
81struct event_constraint { 69struct event_constraint {
82 union { 70 union {
@@ -132,11 +120,10 @@ struct cpu_hw_events {
132 struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES]; 120 struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES];
133 121
134 /* 122 /*
135 * Intel percore register state. 123 * manage shared (per-core, per-cpu) registers
136 * Coordinate shared resources between HT threads. 124 * used on Intel NHM/WSM/SNB
137 */ 125 */
138 int percore_used; /* Used by this CPU? */ 126 struct intel_shared_regs *shared_regs;
139 struct intel_percore *per_core;
140 127
141 /* 128 /*
142 * AMD specific bits 129 * AMD specific bits
@@ -187,26 +174,45 @@ struct cpu_hw_events {
187 for ((e) = (c); (e)->weight; (e)++) 174 for ((e) = (c); (e)->weight; (e)++)
188 175
189/* 176/*
177 * Per register state.
178 */
179struct er_account {
180 raw_spinlock_t lock; /* per-core: protect structure */
181 u64 config; /* extra MSR config */
182 u64 reg; /* extra MSR number */
183 atomic_t ref; /* reference count */
184};
185
186/*
190 * Extra registers for specific events. 187 * Extra registers for specific events.
188 *
191 * Some events need large masks and require external MSRs. 189 * Some events need large masks and require external MSRs.
192 * Define a mapping to these extra registers. 190 * Those extra MSRs end up being shared for all events on
191 * a PMU and sometimes between PMU of sibling HT threads.
192 * In either case, the kernel needs to handle conflicting
193 * accesses to those extra, shared, regs. The data structure
194 * to manage those registers is stored in cpu_hw_event.
193 */ 195 */
194struct extra_reg { 196struct extra_reg {
195 unsigned int event; 197 unsigned int event;
196 unsigned int msr; 198 unsigned int msr;
197 u64 config_mask; 199 u64 config_mask;
198 u64 valid_mask; 200 u64 valid_mask;
201 int idx; /* per_xxx->regs[] reg index */
199}; 202};
200 203
201#define EVENT_EXTRA_REG(e, ms, m, vm) { \ 204#define EVENT_EXTRA_REG(e, ms, m, vm, i) { \
202 .event = (e), \ 205 .event = (e), \
203 .msr = (ms), \ 206 .msr = (ms), \
204 .config_mask = (m), \ 207 .config_mask = (m), \
205 .valid_mask = (vm), \ 208 .valid_mask = (vm), \
209 .idx = EXTRA_REG_##i \
206 } 210 }
207#define INTEL_EVENT_EXTRA_REG(event, msr, vm) \ 211
208 EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm) 212#define INTEL_EVENT_EXTRA_REG(event, msr, vm, idx) \
209#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0) 213 EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm, idx)
214
215#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0, RSP_0)
210 216
211union perf_capabilities { 217union perf_capabilities {
212 struct { 218 struct {
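
The struct er_account introduced above is the arbitration record for one shared extra MSR: a lock, the currently programmed config, and a reference count, allocated per core or per CPU as the scope table at the top of this hunk describes. A hedged sketch of the sharing rule those fields support; the real constraint code lands in perf_event_intel.c, this only shows the idea:

static bool example_claim_extra_reg(struct er_account *era, u64 config)
{
	bool ok = false;

	raw_spin_lock(&era->lock);
	/* first user programs the MSR; later users must want the same config */
	if (!atomic_read(&era->ref) || era->config == config) {
		era->config = config;
		atomic_inc(&era->ref);
		ok = true;
	}
	raw_spin_unlock(&era->lock);

	return ok;
}
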
@@ -252,7 +258,6 @@ struct x86_pmu {
252 void (*put_event_constraints)(struct cpu_hw_events *cpuc, 258 void (*put_event_constraints)(struct cpu_hw_events *cpuc,
253 struct perf_event *event); 259 struct perf_event *event);
254 struct event_constraint *event_constraints; 260 struct event_constraint *event_constraints;
255 struct event_constraint *percore_constraints;
256 void (*quirks)(void); 261 void (*quirks)(void);
257 int perfctr_second_write; 262 int perfctr_second_write;
258 263
@@ -286,8 +291,12 @@ struct x86_pmu {
286 * Extra registers for events 291 * Extra registers for events
287 */ 292 */
288 struct extra_reg *extra_regs; 293 struct extra_reg *extra_regs;
294 unsigned int er_flags;
289}; 295};
290 296
297#define ERF_NO_HT_SHARING 1
298#define ERF_HAS_RSP_1 2
299
291static struct x86_pmu x86_pmu __read_mostly; 300static struct x86_pmu x86_pmu __read_mostly;
292 301
293static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { 302static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
@@ -393,10 +402,10 @@ static inline unsigned int x86_pmu_event_addr(int index)
393 */ 402 */
394static int x86_pmu_extra_regs(u64 config, struct perf_event *event) 403static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
395{ 404{
405 struct hw_perf_event_extra *reg;
396 struct extra_reg *er; 406 struct extra_reg *er;
397 407
398 event->hw.extra_reg = 0; 408 reg = &event->hw.extra_reg;
399 event->hw.extra_config = 0;
400 409
401 if (!x86_pmu.extra_regs) 410 if (!x86_pmu.extra_regs)
402 return 0; 411 return 0;
@@ -406,8 +415,10 @@ static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
406 continue; 415 continue;
407 if (event->attr.config1 & ~er->valid_mask) 416 if (event->attr.config1 & ~er->valid_mask)
408 return -EINVAL; 417 return -EINVAL;
409 event->hw.extra_reg = er->msr; 418
410 event->hw.extra_config = event->attr.config1; 419 reg->idx = er->idx;
420 reg->config = event->attr.config1;
421 reg->reg = er->msr;
411 break; 422 break;
412 } 423 }
413 return 0; 424 return 0;
@@ -706,6 +717,9 @@ static int __x86_pmu_event_init(struct perf_event *event)
706 event->hw.last_cpu = -1; 717 event->hw.last_cpu = -1;
707 event->hw.last_tag = ~0ULL; 718 event->hw.last_tag = ~0ULL;
708 719
720 /* mark unused */
721 event->hw.extra_reg.idx = EXTRA_REG_NONE;
722
709 return x86_pmu.hw_config(event); 723 return x86_pmu.hw_config(event);
710} 724}
711 725
@@ -747,8 +761,8 @@ static void x86_pmu_disable(struct pmu *pmu)
747static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc, 761static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
748 u64 enable_mask) 762 u64 enable_mask)
749{ 763{
750 if (hwc->extra_reg) 764 if (hwc->extra_reg.reg)
751 wrmsrl(hwc->extra_reg, hwc->extra_config); 765 wrmsrl(hwc->extra_reg.reg, hwc->extra_reg.config);
752 wrmsrl(hwc->config_base, hwc->config | enable_mask); 766 wrmsrl(hwc->config_base, hwc->config | enable_mask);
753} 767}
754 768
@@ -1332,7 +1346,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
1332 if (!x86_perf_event_set_period(event)) 1346 if (!x86_perf_event_set_period(event))
1333 continue; 1347 continue;
1334 1348
1335 if (perf_event_overflow(event, 1, &data, regs)) 1349 if (perf_event_overflow(event, &data, regs))
1336 x86_pmu_stop(event, 0); 1350 x86_pmu_stop(event, 0);
1337 } 1351 }
1338 1352
@@ -1637,6 +1651,40 @@ static int x86_pmu_commit_txn(struct pmu *pmu)
1637 perf_pmu_enable(pmu); 1651 perf_pmu_enable(pmu);
1638 return 0; 1652 return 0;
1639} 1653}
1654/*
1655 * a fake_cpuc is used to validate event groups. Due to
1656 * the extra reg logic, we need to also allocate a fake
1657 * per_core and per_cpu structure. Otherwise, group events
1658 * using extra reg may conflict without the kernel being
1659 * able to catch this when the last event gets added to
1660 * the group.
1661 */
1662static void free_fake_cpuc(struct cpu_hw_events *cpuc)
1663{
1664 kfree(cpuc->shared_regs);
1665 kfree(cpuc);
1666}
1667
1668static struct cpu_hw_events *allocate_fake_cpuc(void)
1669{
1670 struct cpu_hw_events *cpuc;
1671 int cpu = raw_smp_processor_id();
1672
1673 cpuc = kzalloc(sizeof(*cpuc), GFP_KERNEL);
1674 if (!cpuc)
1675 return ERR_PTR(-ENOMEM);
1676
1677	/* only needed if we have extra_regs */
1678 if (x86_pmu.extra_regs) {
1679 cpuc->shared_regs = allocate_shared_regs(cpu);
1680 if (!cpuc->shared_regs)
1681 goto error;
1682 }
1683 return cpuc;
1684error:
1685 free_fake_cpuc(cpuc);
1686 return ERR_PTR(-ENOMEM);
1687}
1640 1688
1641/* 1689/*
1642 * validate that we can schedule this event 1690 * validate that we can schedule this event
@@ -1647,9 +1695,9 @@ static int validate_event(struct perf_event *event)
1647 struct event_constraint *c; 1695 struct event_constraint *c;
1648 int ret = 0; 1696 int ret = 0;
1649 1697
1650 fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO); 1698 fake_cpuc = allocate_fake_cpuc();
1651 if (!fake_cpuc) 1699 if (IS_ERR(fake_cpuc))
1652 return -ENOMEM; 1700 return PTR_ERR(fake_cpuc);
1653 1701
1654 c = x86_pmu.get_event_constraints(fake_cpuc, event); 1702 c = x86_pmu.get_event_constraints(fake_cpuc, event);
1655 1703
@@ -1659,7 +1707,7 @@ static int validate_event(struct perf_event *event)
1659 if (x86_pmu.put_event_constraints) 1707 if (x86_pmu.put_event_constraints)
1660 x86_pmu.put_event_constraints(fake_cpuc, event); 1708 x86_pmu.put_event_constraints(fake_cpuc, event);
1661 1709
1662 kfree(fake_cpuc); 1710 free_fake_cpuc(fake_cpuc);
1663 1711
1664 return ret; 1712 return ret;
1665} 1713}
@@ -1679,36 +1727,32 @@ static int validate_group(struct perf_event *event)
1679{ 1727{
1680 struct perf_event *leader = event->group_leader; 1728 struct perf_event *leader = event->group_leader;
1681 struct cpu_hw_events *fake_cpuc; 1729 struct cpu_hw_events *fake_cpuc;
1682 int ret, n; 1730 int ret = -ENOSPC, n;
1683
1684 ret = -ENOMEM;
1685 fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO);
1686 if (!fake_cpuc)
1687 goto out;
1688 1731
1732 fake_cpuc = allocate_fake_cpuc();
1733 if (IS_ERR(fake_cpuc))
1734 return PTR_ERR(fake_cpuc);
1689 /* 1735 /*
1690 * the event is not yet connected with its 1736 * the event is not yet connected with its
1691 * siblings therefore we must first collect 1737 * siblings therefore we must first collect
1692 * existing siblings, then add the new event 1738 * existing siblings, then add the new event
1693 * before we can simulate the scheduling 1739 * before we can simulate the scheduling
1694 */ 1740 */
1695 ret = -ENOSPC;
1696 n = collect_events(fake_cpuc, leader, true); 1741 n = collect_events(fake_cpuc, leader, true);
1697 if (n < 0) 1742 if (n < 0)
1698 goto out_free; 1743 goto out;
1699 1744
1700 fake_cpuc->n_events = n; 1745 fake_cpuc->n_events = n;
1701 n = collect_events(fake_cpuc, event, false); 1746 n = collect_events(fake_cpuc, event, false);
1702 if (n < 0) 1747 if (n < 0)
1703 goto out_free; 1748 goto out;
1704 1749
1705 fake_cpuc->n_events = n; 1750 fake_cpuc->n_events = n;
1706 1751
1707 ret = x86_pmu.schedule_events(fake_cpuc, n, NULL); 1752 ret = x86_pmu.schedule_events(fake_cpuc, n, NULL);
1708 1753
1709out_free:
1710 kfree(fake_cpuc);
1711out: 1754out:
1755 free_fake_cpuc(fake_cpuc);
1712 return ret; 1756 return ret;
1713} 1757}
1714 1758
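validate_group() now runs against a fake cpuc that also carries shared_regs, so conflicts between extra-reg users inside one group are caught at perf_event_open() time instead of silently clobbering an MSR. A rough user-space illustration of the situation it guards against, assuming OFFCORE_RESPONSE_0 raw events as above (the config1 values are placeholders); depending on the CPU the kernel may reject the second event or transparently move it to MSR_OFFCORE_RSP_1:

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>

static int open_offcore(uint64_t rsp_bits, int group_fd)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size     = sizeof(attr);
	attr.type     = PERF_TYPE_RAW;
	attr.config   = 0x01b7;      /* OFFCORE_RESPONSE_0 */
	attr.config1  = rsp_bits;    /* placeholder MSR payload */
	attr.disabled = (group_fd == -1);

	return syscall(__NR_perf_event_open, &attr, 0, -1, group_fd, 0);
}

int main(void)
{
	/* two group members asking for different MSR_OFFCORE_RSP_0 values */
	int leader = open_offcore(0x0001, -1);
	int second = open_offcore(0x0002, leader);

	printf("leader=%d second=%d\n", leader, second);
	if (second >= 0)
		close(second);
	if (leader >= 0)
		close(leader);
	return 0;
}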
@@ -1856,6 +1900,9 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
1856 1900
1857 perf_callchain_store(entry, regs->ip); 1901 perf_callchain_store(entry, regs->ip);
1858 1902
1903 if (!current->mm)
1904 return;
1905
1859 if (perf_callchain_user32(regs, entry)) 1906 if (perf_callchain_user32(regs, entry))
1860 return; 1907 return;
1861 1908
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index fe29c1d2219..941caa2e449 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -89,6 +89,20 @@ static __initconst const u64 amd_hw_cache_event_ids
89 [ C(RESULT_MISS) ] = -1, 89 [ C(RESULT_MISS) ] = -1,
90 }, 90 },
91 }, 91 },
92 [ C(NODE) ] = {
93 [ C(OP_READ) ] = {
94 [ C(RESULT_ACCESS) ] = 0xb8e9, /* CPU Request to Memory, l+r */
95 [ C(RESULT_MISS) ] = 0x98e9, /* CPU Request to Memory, r */
96 },
97 [ C(OP_WRITE) ] = {
98 [ C(RESULT_ACCESS) ] = -1,
99 [ C(RESULT_MISS) ] = -1,
100 },
101 [ C(OP_PREFETCH) ] = {
102 [ C(RESULT_ACCESS) ] = -1,
103 [ C(RESULT_MISS) ] = -1,
104 },
105 },
92}; 106};
93 107
94/* 108/*
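These entries back the new generic NODE cache events with the AMD northbridge "CPU Request to Memory" events (0xb8e9 counts local plus remote, 0x98e9 remote only). A minimal user-space sketch of requesting the generic event, assuming headers from a kernel that already carries the NODE addition:

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	struct perf_event_attr attr;
	long long count = 0;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size   = sizeof(attr);
	attr.type   = PERF_TYPE_HW_CACHE;
	attr.config = PERF_COUNT_HW_CACHE_NODE |
		      (PERF_COUNT_HW_CACHE_OP_READ << 8) |
		      (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16);

	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}

	/* ... run the workload of interest here ... */

	read(fd, &count, sizeof(count));
	printf("node accesses: %lld\n", count);
	close(fd);
	return 0;
}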
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 41178c826c4..f88af2c2a56 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1,25 +1,15 @@
1#ifdef CONFIG_CPU_SUP_INTEL 1#ifdef CONFIG_CPU_SUP_INTEL
2 2
3#define MAX_EXTRA_REGS 2
4
5/*
6 * Per register state.
7 */
8struct er_account {
9 int ref; /* reference count */
10 unsigned int extra_reg; /* extra MSR number */
11 u64 extra_config; /* extra MSR config */
12};
13
14/* 3/*
15 * Per core state 4 * Per core/cpu state
16 * This used to coordinate shared registers for HT threads. 5 *
6 * Used to coordinate shared registers between HT threads or
7 * among events on a single PMU.
17 */ 8 */
18struct intel_percore { 9struct intel_shared_regs {
19 raw_spinlock_t lock; /* protect structure */ 10 struct er_account regs[EXTRA_REG_MAX];
20 struct er_account regs[MAX_EXTRA_REGS]; 11 int refcnt; /* per-core: #HT threads */
21 int refcnt; /* number of threads */ 12 unsigned core_id; /* per-core: core id */
22 unsigned core_id;
23}; 13};
24 14
25/* 15/*
@@ -88,16 +78,10 @@ static struct event_constraint intel_nehalem_event_constraints[] __read_mostly =
88 78
89static struct extra_reg intel_nehalem_extra_regs[] __read_mostly = 79static struct extra_reg intel_nehalem_extra_regs[] __read_mostly =
90{ 80{
91 INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff), 81 INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0),
92 EVENT_EXTRA_END 82 EVENT_EXTRA_END
93}; 83};
94 84
95static struct event_constraint intel_nehalem_percore_constraints[] __read_mostly =
96{
97 INTEL_EVENT_CONSTRAINT(0xb7, 0),
98 EVENT_CONSTRAINT_END
99};
100
101static struct event_constraint intel_westmere_event_constraints[] __read_mostly = 85static struct event_constraint intel_westmere_event_constraints[] __read_mostly =
102{ 86{
103 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ 87 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
@@ -116,8 +100,6 @@ static struct event_constraint intel_snb_event_constraints[] __read_mostly =
116 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ 100 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
117 /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ 101 /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
118 INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.PENDING */ 102 INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.PENDING */
119 INTEL_EVENT_CONSTRAINT(0xb7, 0x1), /* OFF_CORE_RESPONSE_0 */
120 INTEL_EVENT_CONSTRAINT(0xbb, 0x8), /* OFF_CORE_RESPONSE_1 */
121 INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */ 103 INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
122 INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */ 104 INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
123 EVENT_CONSTRAINT_END 105 EVENT_CONSTRAINT_END
@@ -125,15 +107,13 @@ static struct event_constraint intel_snb_event_constraints[] __read_mostly =
125 107
126static struct extra_reg intel_westmere_extra_regs[] __read_mostly = 108static struct extra_reg intel_westmere_extra_regs[] __read_mostly =
127{ 109{
128 INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff), 110 INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0),
129 INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0xffff), 111 INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0xffff, RSP_1),
130 EVENT_EXTRA_END 112 EVENT_EXTRA_END
131}; 113};
132 114
133static struct event_constraint intel_westmere_percore_constraints[] __read_mostly = 115static struct event_constraint intel_v1_event_constraints[] __read_mostly =
134{ 116{
135 INTEL_EVENT_CONSTRAINT(0xb7, 0),
136 INTEL_EVENT_CONSTRAINT(0xbb, 0),
137 EVENT_CONSTRAINT_END 117 EVENT_CONSTRAINT_END
138}; 118};
139 119
@@ -145,6 +125,12 @@ static struct event_constraint intel_gen_event_constraints[] __read_mostly =
145 EVENT_CONSTRAINT_END 125 EVENT_CONSTRAINT_END
146}; 126};
147 127
128static struct extra_reg intel_snb_extra_regs[] __read_mostly = {
129 INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0x3fffffffffull, RSP_0),
130 INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0x3fffffffffull, RSP_1),
131 EVENT_EXTRA_END
132};
133
148static u64 intel_pmu_event_map(int hw_event) 134static u64 intel_pmu_event_map(int hw_event)
149{ 135{
150 return intel_perfmon_event_map[hw_event]; 136 return intel_perfmon_event_map[hw_event];
@@ -245,6 +231,21 @@ static __initconst const u64 snb_hw_cache_event_ids
245 [ C(RESULT_MISS) ] = -1, 231 [ C(RESULT_MISS) ] = -1,
246 }, 232 },
247 }, 233 },
234 [ C(NODE) ] = {
235 [ C(OP_READ) ] = {
236 [ C(RESULT_ACCESS) ] = -1,
237 [ C(RESULT_MISS) ] = -1,
238 },
239 [ C(OP_WRITE) ] = {
240 [ C(RESULT_ACCESS) ] = -1,
241 [ C(RESULT_MISS) ] = -1,
242 },
243 [ C(OP_PREFETCH) ] = {
244 [ C(RESULT_ACCESS) ] = -1,
245 [ C(RESULT_MISS) ] = -1,
246 },
247 },
248
248}; 249};
249 250
250static __initconst const u64 westmere_hw_cache_event_ids 251static __initconst const u64 westmere_hw_cache_event_ids
@@ -346,6 +347,20 @@ static __initconst const u64 westmere_hw_cache_event_ids
346 [ C(RESULT_MISS) ] = -1, 347 [ C(RESULT_MISS) ] = -1,
347 }, 348 },
348 }, 349 },
350 [ C(NODE) ] = {
351 [ C(OP_READ) ] = {
352 [ C(RESULT_ACCESS) ] = 0x01b7,
353 [ C(RESULT_MISS) ] = 0x01b7,
354 },
355 [ C(OP_WRITE) ] = {
356 [ C(RESULT_ACCESS) ] = 0x01b7,
357 [ C(RESULT_MISS) ] = 0x01b7,
358 },
359 [ C(OP_PREFETCH) ] = {
360 [ C(RESULT_ACCESS) ] = 0x01b7,
361 [ C(RESULT_MISS) ] = 0x01b7,
362 },
363 },
349}; 364};
350 365
351/* 366/*
@@ -398,7 +413,21 @@ static __initconst const u64 nehalem_hw_cache_extra_regs
398 [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_L3_ACCESS, 413 [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_L3_ACCESS,
399 [ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_L3_MISS, 414 [ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_L3_MISS,
400 }, 415 },
401 } 416 },
417 [ C(NODE) ] = {
418 [ C(OP_READ) ] = {
419 [ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_ALL_DRAM,
420 [ C(RESULT_MISS) ] = NHM_DMND_READ|NHM_REMOTE_DRAM,
421 },
422 [ C(OP_WRITE) ] = {
423 [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_ALL_DRAM,
424 [ C(RESULT_MISS) ] = NHM_DMND_WRITE|NHM_REMOTE_DRAM,
425 },
426 [ C(OP_PREFETCH) ] = {
427 [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_ALL_DRAM,
428 [ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_REMOTE_DRAM,
429 },
430 },
402}; 431};
403 432
404static __initconst const u64 nehalem_hw_cache_event_ids 433static __initconst const u64 nehalem_hw_cache_event_ids
@@ -500,6 +529,20 @@ static __initconst const u64 nehalem_hw_cache_event_ids
500 [ C(RESULT_MISS) ] = -1, 529 [ C(RESULT_MISS) ] = -1,
501 }, 530 },
502 }, 531 },
532 [ C(NODE) ] = {
533 [ C(OP_READ) ] = {
534 [ C(RESULT_ACCESS) ] = 0x01b7,
535 [ C(RESULT_MISS) ] = 0x01b7,
536 },
537 [ C(OP_WRITE) ] = {
538 [ C(RESULT_ACCESS) ] = 0x01b7,
539 [ C(RESULT_MISS) ] = 0x01b7,
540 },
541 [ C(OP_PREFETCH) ] = {
542 [ C(RESULT_ACCESS) ] = 0x01b7,
543 [ C(RESULT_MISS) ] = 0x01b7,
544 },
545 },
503}; 546};
504 547
505static __initconst const u64 core2_hw_cache_event_ids 548static __initconst const u64 core2_hw_cache_event_ids
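On Nehalem/Westmere the NODE entries all resolve to 0x01b7 (OFFCORE_RESPONSE_0), so the actual local-versus-remote selection comes from the matching nehalem_hw_cache_extra_regs entry, which ends up in config1 and from there in MSR_OFFCORE_RSP_0 via the extra-reg machinery above. A stand-alone toy model (not kernel code) of that two-table combination; the bit values are placeholders, the real ones are the NHM_* definitions earlier in this file:

#include <stdint.h>
#include <stdio.h>

/* Placeholder bit values for illustration only; the real definitions
 * (NHM_DMND_READ, NHM_REMOTE_DRAM, ...) live in perf_event_intel.c. */
#define TOY_OFFCORE_RESPONSE	0x01b7ULL	/* event 0xb7, umask 0x01 */
#define TOY_DMND_READ		(1ULL << 0)
#define TOY_REMOTE_DRAM		(1ULL << 13)

struct toy_attr {
	uint64_t config;	/* event select + umask */
	uint64_t config1;	/* extra (offcore response) MSR payload */
};

/*
 * Rough model of how a generic NODE read-miss is resolved on NHM/WSM:
 * hw_cache_event_ids supplies the OFFCORE_RESPONSE event code, while
 * hw_cache_extra_regs supplies the matching MSR_OFFCORE_RSP_0 payload.
 */
static void toy_setup_node_read_miss(struct toy_attr *attr)
{
	attr->config  = TOY_OFFCORE_RESPONSE;
	attr->config1 = TOY_DMND_READ | TOY_REMOTE_DRAM;
}

int main(void)
{
	struct toy_attr attr;

	toy_setup_node_read_miss(&attr);
	printf("config=%#llx config1=%#llx\n",
	       (unsigned long long)attr.config,
	       (unsigned long long)attr.config1);
	return 0;
}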
@@ -1003,7 +1046,7 @@ again:
1003 1046
1004 data.period = event->hw.last_period; 1047 data.period = event->hw.last_period;
1005 1048
1006 if (perf_event_overflow(event, 1, &data, regs)) 1049 if (perf_event_overflow(event, &data, regs))
1007 x86_pmu_stop(event, 0); 1050 x86_pmu_stop(event, 0);
1008 } 1051 }
1009 1052
@@ -1037,65 +1080,121 @@ intel_bts_constraints(struct perf_event *event)
1037 return NULL; 1080 return NULL;
1038} 1081}
1039 1082
1083static bool intel_try_alt_er(struct perf_event *event, int orig_idx)
1084{
1085 if (!(x86_pmu.er_flags & ERF_HAS_RSP_1))
1086 return false;
1087
1088 if (event->hw.extra_reg.idx == EXTRA_REG_RSP_0) {
1089 event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
1090 event->hw.config |= 0x01bb;
1091 event->hw.extra_reg.idx = EXTRA_REG_RSP_1;
1092 event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1;
1093 } else if (event->hw.extra_reg.idx == EXTRA_REG_RSP_1) {
1094 event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
1095 event->hw.config |= 0x01b7;
1096 event->hw.extra_reg.idx = EXTRA_REG_RSP_0;
1097 event->hw.extra_reg.reg = MSR_OFFCORE_RSP_0;
1098 }
1099
1100 if (event->hw.extra_reg.idx == orig_idx)
1101 return false;
1102
1103 return true;
1104}
1105
1106/*
1107 * manage allocation of shared extra msr for certain events
1108 *
1109 * sharing can be:
1110 * per-cpu: to be shared between the various events on a single PMU
1111 * per-core: per-cpu + shared by HT threads
1112 */
1040static struct event_constraint * 1113static struct event_constraint *
1041intel_percore_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) 1114__intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc,
1115 struct perf_event *event)
1042{ 1116{
1043 struct hw_perf_event *hwc = &event->hw; 1117 struct event_constraint *c = &emptyconstraint;
1044 unsigned int e = hwc->config & ARCH_PERFMON_EVENTSEL_EVENT; 1118 struct hw_perf_event_extra *reg = &event->hw.extra_reg;
1045 struct event_constraint *c;
1046 struct intel_percore *pc;
1047 struct er_account *era; 1119 struct er_account *era;
1048 int i; 1120 unsigned long flags;
1049 int free_slot; 1121 int orig_idx = reg->idx;
1050 int found;
1051 1122
1052 if (!x86_pmu.percore_constraints || hwc->extra_alloc) 1123 /* already allocated shared msr */
1053 return NULL; 1124 if (reg->alloc)
1125 return &unconstrained;
1054 1126
1055 for (c = x86_pmu.percore_constraints; c->cmask; c++) { 1127again:
1056 if (e != c->code) 1128 era = &cpuc->shared_regs->regs[reg->idx];
1057 continue; 1129 /*
1130	 * we use raw_spin_lock_irqsave() to avoid lockdep issues when
1131 * passing a fake cpuc
1132 */
1133 raw_spin_lock_irqsave(&era->lock, flags);
1134
1135 if (!atomic_read(&era->ref) || era->config == reg->config) {
1136
1137 /* lock in msr value */
1138 era->config = reg->config;
1139 era->reg = reg->reg;
1140
1141 /* one more user */
1142 atomic_inc(&era->ref);
1143
1144 /* no need to reallocate during incremental event scheduling */
1145 reg->alloc = 1;
1058 1146
1059 /* 1147 /*
1060 * Allocate resource per core. 1148 * All events using extra_reg are unconstrained.
1149 * Avoids calling x86_get_event_constraints()
1150 *
1151 * Must revisit if extra_reg controlling events
1152 * ever have constraints. Worst case we go through
1153 * the regular event constraint table.
1061 */ 1154 */
1062 pc = cpuc->per_core; 1155 c = &unconstrained;
1063 if (!pc) 1156 } else if (intel_try_alt_er(event, orig_idx)) {
1064 break; 1157 raw_spin_unlock(&era->lock);
1065 c = &emptyconstraint; 1158 goto again;
1066 raw_spin_lock(&pc->lock);
1067 free_slot = -1;
1068 found = 0;
1069 for (i = 0; i < MAX_EXTRA_REGS; i++) {
1070 era = &pc->regs[i];
1071 if (era->ref > 0 && hwc->extra_reg == era->extra_reg) {
1072 /* Allow sharing same config */
1073 if (hwc->extra_config == era->extra_config) {
1074 era->ref++;
1075 cpuc->percore_used = 1;
1076 hwc->extra_alloc = 1;
1077 c = NULL;
1078 }
1079 /* else conflict */
1080 found = 1;
1081 break;
1082 } else if (era->ref == 0 && free_slot == -1)
1083 free_slot = i;
1084 }
1085 if (!found && free_slot != -1) {
1086 era = &pc->regs[free_slot];
1087 era->ref = 1;
1088 era->extra_reg = hwc->extra_reg;
1089 era->extra_config = hwc->extra_config;
1090 cpuc->percore_used = 1;
1091 hwc->extra_alloc = 1;
1092 c = NULL;
1093 }
1094 raw_spin_unlock(&pc->lock);
1095 return c;
1096 } 1159 }
1160 raw_spin_unlock_irqrestore(&era->lock, flags);
1097 1161
1098 return NULL; 1162 return c;
1163}
1164
1165static void
1166__intel_shared_reg_put_constraints(struct cpu_hw_events *cpuc,
1167 struct hw_perf_event_extra *reg)
1168{
1169 struct er_account *era;
1170
1171 /*
1172 * only put constraint if extra reg was actually
1173	 * allocated. Also takes care of events which do
1174 * not use an extra shared reg
1175 */
1176 if (!reg->alloc)
1177 return;
1178
1179 era = &cpuc->shared_regs->regs[reg->idx];
1180
1181 /* one fewer user */
1182 atomic_dec(&era->ref);
1183
1184 /* allocate again next time */
1185 reg->alloc = 0;
1186}
1187
1188static struct event_constraint *
1189intel_shared_regs_constraints(struct cpu_hw_events *cpuc,
1190 struct perf_event *event)
1191{
1192 struct event_constraint *c = NULL;
1193
1194 if (event->hw.extra_reg.idx != EXTRA_REG_NONE)
1195 c = __intel_shared_reg_get_constraints(cpuc, event);
1196
1197 return c;
1099} 1198}
1100 1199
1101static struct event_constraint * 1200static struct event_constraint *
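The sharing rule implemented by __intel_shared_reg_get_constraints() and __intel_shared_reg_put_constraints() boils down to: an extra MSR may be claimed if nobody uses it yet, or if the new user wants exactly the value that is already programmed; anything else is a conflict (possibly retried on the alternate RSP MSR). A stand-alone, single-threaded model of that rule, leaving out the spinlock/atomics and the MSR_OFFCORE_RSP_1 fallback:

#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

/* Stand-alone model, not kernel code. */
struct toy_er_account {
	int      ref;      /* number of events using this MSR */
	uint64_t config;   /* value currently programmed */
};

static bool toy_get_shared_reg(struct toy_er_account *era, uint64_t config)
{
	if (era->ref == 0 || era->config == config) {
		era->config = config;   /* lock in the MSR value */
		era->ref++;             /* one more user */
		return true;            /* event is schedulable */
	}
	return false;                   /* conflicting value: reject */
}

static void toy_put_shared_reg(struct toy_er_account *era)
{
	era->ref--;                     /* one fewer user */
}

int main(void)
{
	struct toy_er_account era = { 0, 0 };

	printf("%d\n", toy_get_shared_reg(&era, 0x10001)); /* 1: free slot */
	printf("%d\n", toy_get_shared_reg(&era, 0x10001)); /* 1: same value shared */
	printf("%d\n", toy_get_shared_reg(&era, 0x20002)); /* 0: conflict */
	toy_put_shared_reg(&era);
	toy_put_shared_reg(&era);
	printf("%d\n", toy_get_shared_reg(&era, 0x20002)); /* 1: slot is free again */
	return 0;
}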
@@ -1111,49 +1210,28 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event
1111 if (c) 1210 if (c)
1112 return c; 1211 return c;
1113 1212
1114 c = intel_percore_constraints(cpuc, event); 1213 c = intel_shared_regs_constraints(cpuc, event);
1115 if (c) 1214 if (c)
1116 return c; 1215 return c;
1117 1216
1118 return x86_get_event_constraints(cpuc, event); 1217 return x86_get_event_constraints(cpuc, event);
1119} 1218}
1120 1219
1121static void intel_put_event_constraints(struct cpu_hw_events *cpuc, 1220static void
1221intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc,
1122 struct perf_event *event) 1222 struct perf_event *event)
1123{ 1223{
1124 struct extra_reg *er; 1224 struct hw_perf_event_extra *reg;
1125 struct intel_percore *pc;
1126 struct er_account *era;
1127 struct hw_perf_event *hwc = &event->hw;
1128 int i, allref;
1129 1225
1130 if (!cpuc->percore_used) 1226 reg = &event->hw.extra_reg;
1131 return; 1227 if (reg->idx != EXTRA_REG_NONE)
1132 1228 __intel_shared_reg_put_constraints(cpuc, reg);
1133 for (er = x86_pmu.extra_regs; er->msr; er++) { 1229}
1134 if (er->event != (hwc->config & er->config_mask))
1135 continue;
1136 1230
1137 pc = cpuc->per_core; 1231static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
1138 raw_spin_lock(&pc->lock); 1232 struct perf_event *event)
1139 for (i = 0; i < MAX_EXTRA_REGS; i++) { 1233{
1140 era = &pc->regs[i]; 1234 intel_put_shared_regs_event_constraints(cpuc, event);
1141 if (era->ref > 0 &&
1142 era->extra_config == hwc->extra_config &&
1143 era->extra_reg == er->msr) {
1144 era->ref--;
1145 hwc->extra_alloc = 0;
1146 break;
1147 }
1148 }
1149 allref = 0;
1150 for (i = 0; i < MAX_EXTRA_REGS; i++)
1151 allref += pc->regs[i].ref;
1152 if (allref == 0)
1153 cpuc->percore_used = 0;
1154 raw_spin_unlock(&pc->lock);
1155 break;
1156 }
1157} 1235}
1158 1236
1159static int intel_pmu_hw_config(struct perf_event *event) 1237static int intel_pmu_hw_config(struct perf_event *event)
@@ -1231,20 +1309,36 @@ static __initconst const struct x86_pmu core_pmu = {
1231 .event_constraints = intel_core_event_constraints, 1309 .event_constraints = intel_core_event_constraints,
1232}; 1310};
1233 1311
1312static struct intel_shared_regs *allocate_shared_regs(int cpu)
1313{
1314 struct intel_shared_regs *regs;
1315 int i;
1316
1317 regs = kzalloc_node(sizeof(struct intel_shared_regs),
1318 GFP_KERNEL, cpu_to_node(cpu));
1319 if (regs) {
1320 /*
1321 * initialize the locks to keep lockdep happy
1322 */
1323 for (i = 0; i < EXTRA_REG_MAX; i++)
1324 raw_spin_lock_init(&regs->regs[i].lock);
1325
1326 regs->core_id = -1;
1327 }
1328 return regs;
1329}
1330
1234static int intel_pmu_cpu_prepare(int cpu) 1331static int intel_pmu_cpu_prepare(int cpu)
1235{ 1332{
1236 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); 1333 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
1237 1334
1238 if (!cpu_has_ht_siblings()) 1335 if (!x86_pmu.extra_regs)
1239 return NOTIFY_OK; 1336 return NOTIFY_OK;
1240 1337
1241 cpuc->per_core = kzalloc_node(sizeof(struct intel_percore), 1338 cpuc->shared_regs = allocate_shared_regs(cpu);
1242 GFP_KERNEL, cpu_to_node(cpu)); 1339 if (!cpuc->shared_regs)
1243 if (!cpuc->per_core)
1244 return NOTIFY_BAD; 1340 return NOTIFY_BAD;
1245 1341
1246 raw_spin_lock_init(&cpuc->per_core->lock);
1247 cpuc->per_core->core_id = -1;
1248 return NOTIFY_OK; 1342 return NOTIFY_OK;
1249} 1343}
1250 1344
@@ -1260,32 +1354,34 @@ static void intel_pmu_cpu_starting(int cpu)
1260 */ 1354 */
1261 intel_pmu_lbr_reset(); 1355 intel_pmu_lbr_reset();
1262 1356
1263 if (!cpu_has_ht_siblings()) 1357 if (!cpuc->shared_regs || (x86_pmu.er_flags & ERF_NO_HT_SHARING))
1264 return; 1358 return;
1265 1359
1266 for_each_cpu(i, topology_thread_cpumask(cpu)) { 1360 for_each_cpu(i, topology_thread_cpumask(cpu)) {
1267 struct intel_percore *pc = per_cpu(cpu_hw_events, i).per_core; 1361 struct intel_shared_regs *pc;
1268 1362
1363 pc = per_cpu(cpu_hw_events, i).shared_regs;
1269 if (pc && pc->core_id == core_id) { 1364 if (pc && pc->core_id == core_id) {
1270 kfree(cpuc->per_core); 1365 kfree(cpuc->shared_regs);
1271 cpuc->per_core = pc; 1366 cpuc->shared_regs = pc;
1272 break; 1367 break;
1273 } 1368 }
1274 } 1369 }
1275 1370
1276 cpuc->per_core->core_id = core_id; 1371 cpuc->shared_regs->core_id = core_id;
1277 cpuc->per_core->refcnt++; 1372 cpuc->shared_regs->refcnt++;
1278} 1373}
1279 1374
1280static void intel_pmu_cpu_dying(int cpu) 1375static void intel_pmu_cpu_dying(int cpu)
1281{ 1376{
1282 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); 1377 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
1283 struct intel_percore *pc = cpuc->per_core; 1378 struct intel_shared_regs *pc;
1284 1379
1380 pc = cpuc->shared_regs;
1285 if (pc) { 1381 if (pc) {
1286 if (pc->core_id == -1 || --pc->refcnt == 0) 1382 if (pc->core_id == -1 || --pc->refcnt == 0)
1287 kfree(pc); 1383 kfree(pc);
1288 cpuc->per_core = NULL; 1384 cpuc->shared_regs = NULL;
1289 } 1385 }
1290 1386
1291 fini_debug_store_on_cpu(cpu); 1387 fini_debug_store_on_cpu(cpu);
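Together, intel_pmu_cpu_starting() and intel_pmu_cpu_dying() make HT siblings share one intel_shared_regs instance: the second sibling frees its own allocation and adopts the first one's, and the structure is only freed when the last user goes away. A stand-alone model (not kernel code) of that adopt-or-own pattern for a two-thread core, with the core_id == -1 corner case left out:

#include <stdlib.h>
#include <stdio.h>

struct toy_shared {
	int core_id;
	int refcnt;
};

static struct toy_shared *cpu_shared[2];	/* per-"cpu" pointer */

static void toy_cpu_starting(int cpu, int core_id)
{
	int sibling = cpu ^ 1;			/* toy topology: cpus 0 and 1 share a core */

	cpu_shared[cpu] = calloc(1, sizeof(*cpu_shared[cpu]));
	cpu_shared[cpu]->core_id = -1;

	if (cpu_shared[sibling] && cpu_shared[sibling]->core_id == core_id) {
		free(cpu_shared[cpu]);			/* drop our copy ... */
		cpu_shared[cpu] = cpu_shared[sibling];	/* ... and adopt the sibling's */
	}
	cpu_shared[cpu]->core_id = core_id;
	cpu_shared[cpu]->refcnt++;
}

static void toy_cpu_dying(int cpu)
{
	struct toy_shared *pc = cpu_shared[cpu];

	if (pc && --pc->refcnt == 0)
		free(pc);				/* last user frees */
	cpu_shared[cpu] = NULL;
}

int main(void)
{
	toy_cpu_starting(0, 7);
	toy_cpu_starting(1, 7);
	printf("shared: %d, refcnt: %d\n",
	       cpu_shared[0] == cpu_shared[1], cpu_shared[0]->refcnt);
	toy_cpu_dying(0);
	toy_cpu_dying(1);
	return 0;
}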
@@ -1436,7 +1532,6 @@ static __init int intel_pmu_init(void)
1436 1532
1437 x86_pmu.event_constraints = intel_nehalem_event_constraints; 1533 x86_pmu.event_constraints = intel_nehalem_event_constraints;
1438 x86_pmu.pebs_constraints = intel_nehalem_pebs_event_constraints; 1534 x86_pmu.pebs_constraints = intel_nehalem_pebs_event_constraints;
1439 x86_pmu.percore_constraints = intel_nehalem_percore_constraints;
1440 x86_pmu.enable_all = intel_pmu_nhm_enable_all; 1535 x86_pmu.enable_all = intel_pmu_nhm_enable_all;
1441 x86_pmu.extra_regs = intel_nehalem_extra_regs; 1536 x86_pmu.extra_regs = intel_nehalem_extra_regs;
1442 1537
@@ -1481,10 +1576,10 @@ static __init int intel_pmu_init(void)
1481 intel_pmu_lbr_init_nhm(); 1576 intel_pmu_lbr_init_nhm();
1482 1577
1483 x86_pmu.event_constraints = intel_westmere_event_constraints; 1578 x86_pmu.event_constraints = intel_westmere_event_constraints;
1484 x86_pmu.percore_constraints = intel_westmere_percore_constraints;
1485 x86_pmu.enable_all = intel_pmu_nhm_enable_all; 1579 x86_pmu.enable_all = intel_pmu_nhm_enable_all;
1486 x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints; 1580 x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints;
1487 x86_pmu.extra_regs = intel_westmere_extra_regs; 1581 x86_pmu.extra_regs = intel_westmere_extra_regs;
1582 x86_pmu.er_flags |= ERF_HAS_RSP_1;
1488 1583
1489 /* UOPS_ISSUED.STALLED_CYCLES */ 1584 /* UOPS_ISSUED.STALLED_CYCLES */
1490 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e; 1585 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e;
@@ -1495,6 +1590,7 @@ static __init int intel_pmu_init(void)
1495 break; 1590 break;
1496 1591
1497 case 42: /* SandyBridge */ 1592 case 42: /* SandyBridge */
1593	case 45: /* SandyBridge, "Romley-EP" */
1498 memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, 1594 memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
1499 sizeof(hw_cache_event_ids)); 1595 sizeof(hw_cache_event_ids));
1500 1596
@@ -1502,6 +1598,10 @@ static __init int intel_pmu_init(void)
1502 1598
1503 x86_pmu.event_constraints = intel_snb_event_constraints; 1599 x86_pmu.event_constraints = intel_snb_event_constraints;
1504 x86_pmu.pebs_constraints = intel_snb_pebs_events; 1600 x86_pmu.pebs_constraints = intel_snb_pebs_events;
1601 x86_pmu.extra_regs = intel_snb_extra_regs;
1602 /* all extra regs are per-cpu when HT is on */
1603 x86_pmu.er_flags |= ERF_HAS_RSP_1;
1604 x86_pmu.er_flags |= ERF_NO_HT_SHARING;
1505 1605
1506 /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */ 1606 /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */
1507 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e; 1607 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e;
@@ -1512,11 +1612,19 @@ static __init int intel_pmu_init(void)
1512 break; 1612 break;
1513 1613
1514 default: 1614 default:
1515 /* 1615 switch (x86_pmu.version) {
1516 * default constraints for v2 and up 1616 case 1:
1517 */ 1617 x86_pmu.event_constraints = intel_v1_event_constraints;
1518 x86_pmu.event_constraints = intel_gen_event_constraints; 1618 pr_cont("generic architected perfmon v1, ");
1519 pr_cont("generic architected perfmon, "); 1619 break;
1620 default:
1621 /*
1622 * default constraints for v2 and up
1623 */
1624 x86_pmu.event_constraints = intel_gen_event_constraints;
1625 pr_cont("generic architected perfmon, ");
1626 break;
1627 }
1520 } 1628 }
1521 return 0; 1629 return 0;
1522} 1630}
@@ -1528,4 +1636,8 @@ static int intel_pmu_init(void)
1528 return 0; 1636 return 0;
1529} 1637}
1530 1638
1639static struct intel_shared_regs *allocate_shared_regs(int cpu)
1640{
1641 return NULL;
1642}
1531#endif /* CONFIG_CPU_SUP_INTEL */ 1643#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index bab491b8ee2..3213c52db76 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -340,7 +340,7 @@ static int intel_pmu_drain_bts_buffer(void)
340 */ 340 */
341 perf_prepare_sample(&header, &data, event, &regs); 341 perf_prepare_sample(&header, &data, event, &regs);
342 342
343 if (perf_output_begin(&handle, event, header.size * (top - at), 1, 1)) 343 if (perf_output_begin(&handle, event, header.size * (top - at)))
344 return 1; 344 return 1;
345 345
346 for (; at < top; at++) { 346 for (; at < top; at++) {
@@ -508,6 +508,7 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
508 unsigned long from = cpuc->lbr_entries[0].from; 508 unsigned long from = cpuc->lbr_entries[0].from;
509 unsigned long old_to, to = cpuc->lbr_entries[0].to; 509 unsigned long old_to, to = cpuc->lbr_entries[0].to;
510 unsigned long ip = regs->ip; 510 unsigned long ip = regs->ip;
511 int is_64bit = 0;
511 512
512 /* 513 /*
513 * We don't need to fixup if the PEBS assist is fault like 514 * We don't need to fixup if the PEBS assist is fault like
@@ -559,7 +560,10 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
559 } else 560 } else
560 kaddr = (void *)to; 561 kaddr = (void *)to;
561 562
562 kernel_insn_init(&insn, kaddr); 563#ifdef CONFIG_X86_64
564 is_64bit = kernel_ip(to) || !test_thread_flag(TIF_IA32);
565#endif
566 insn_init(&insn, kaddr, is_64bit);
563 insn_get_length(&insn); 567 insn_get_length(&insn);
564 to += insn.length; 568 to += insn.length;
565 } while (to < ip); 569 } while (to < ip);
@@ -616,7 +620,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
616 else 620 else
617 regs.flags &= ~PERF_EFLAGS_EXACT; 621 regs.flags &= ~PERF_EFLAGS_EXACT;
618 622
619 if (perf_event_overflow(event, 1, &data, &regs)) 623 if (perf_event_overflow(event, &data, &regs))
620 x86_pmu_stop(event, 0); 624 x86_pmu_stop(event, 0);
621} 625}
622 626
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index ead584fb6a7..7809d2bcb20 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -554,13 +554,102 @@ static __initconst const u64 p4_hw_cache_event_ids
554 [ C(RESULT_MISS) ] = -1, 554 [ C(RESULT_MISS) ] = -1,
555 }, 555 },
556 }, 556 },
557 [ C(NODE) ] = {
558 [ C(OP_READ) ] = {
559 [ C(RESULT_ACCESS) ] = -1,
560 [ C(RESULT_MISS) ] = -1,
561 },
562 [ C(OP_WRITE) ] = {
563 [ C(RESULT_ACCESS) ] = -1,
564 [ C(RESULT_MISS) ] = -1,
565 },
566 [ C(OP_PREFETCH) ] = {
567 [ C(RESULT_ACCESS) ] = -1,
568 [ C(RESULT_MISS) ] = -1,
569 },
570 },
557}; 571};
558 572
573/*
574 * Because of Netburst being quite restricted in how many
575 * identical events may run simultaneously, we introduce event aliases,
576 * ie the different events which have the same functionality but
577 * utilize non-intersected resources (ESCR/CCCR/counter registers).
578 *
579 * This allow us to relax restrictions a bit and run two or more
580 * identical events together.
581 *
582 * Never set any custom internal bits such as P4_CONFIG_HT,
583 * P4_CONFIG_ALIASABLE or bits for P4_PEBS_METRIC, they are
584 * either up to date automatically or not applicable at all.
585 */
586struct p4_event_alias {
587 u64 original;
588 u64 alternative;
589} p4_event_aliases[] = {
590 {
591 /*
592 * Non-halted cycles can be substituted with non-sleeping cycles (see
593 * Intel SDM Vol3b for details). We need this alias to be able
594 * to run nmi-watchdog and 'perf top' (or any other user space tool
595 * which is interested in running PERF_COUNT_HW_CPU_CYCLES)
596 * simultaneously.
597 */
598 .original =
599 p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS) |
600 P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)),
601 .alternative =
602 p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_EXECUTION_EVENT) |
603 P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0)|
604 P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1)|
605 P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2)|
606 P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3)|
607 P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0) |
608 P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1) |
609 P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2) |
610 P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3))|
611 p4_config_pack_cccr(P4_CCCR_THRESHOLD(15) | P4_CCCR_COMPLEMENT |
612 P4_CCCR_COMPARE),
613 },
614};
615
616static u64 p4_get_alias_event(u64 config)
617{
618 u64 config_match;
619 int i;
620
621 /*
622	 * Only events carrying the special mark are allowed, so
623	 * we can be sure the config didn't come in as a malformed
624	 * RAW event.
625 */
626 if (!(config & P4_CONFIG_ALIASABLE))
627 return 0;
628
629 config_match = config & P4_CONFIG_EVENT_ALIAS_MASK;
630
631 for (i = 0; i < ARRAY_SIZE(p4_event_aliases); i++) {
632 if (config_match == p4_event_aliases[i].original) {
633 config_match = p4_event_aliases[i].alternative;
634 break;
635 } else if (config_match == p4_event_aliases[i].alternative) {
636 config_match = p4_event_aliases[i].original;
637 break;
638 }
639 }
640
641 if (i >= ARRAY_SIZE(p4_event_aliases))
642 return 0;
643
644 return config_match | (config & P4_CONFIG_EVENT_ALIAS_IMMUTABLE_BITS);
645}
646
559static u64 p4_general_events[PERF_COUNT_HW_MAX] = { 647static u64 p4_general_events[PERF_COUNT_HW_MAX] = {
560 /* non-halted CPU clocks */ 648 /* non-halted CPU clocks */
561 [PERF_COUNT_HW_CPU_CYCLES] = 649 [PERF_COUNT_HW_CPU_CYCLES] =
562 p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS) | 650 p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS) |
563 P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)), 651 P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)) |
652 P4_CONFIG_ALIASABLE,
564 653
565 /* 654 /*
566 * retired instructions 655 * retired instructions
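p4_get_alias_event() only ever swaps between the two halves of a p4_event_alias pair, comparing on the maskable bits and carrying the immutable bits (HT and similar) over unchanged. A stand-alone model (not kernel code) of that lookup with placeholder masks and table values; the real masks are P4_CONFIG_EVENT_ALIAS_MASK and P4_CONFIG_EVENT_ALIAS_IMMUTABLE_BITS:

#include <stdint.h>
#include <stdio.h>

/* Placeholder layout for illustration only. */
#define TOY_ALIASABLE		(1ULL << 40)	/* "this event has an alias" marker */
#define TOY_IMMUTABLE_BITS	(0xffULL << 32)	/* e.g. HT bits, carried over unchanged */
#define TOY_EVENT_MASK		0xffffffffULL	/* bits compared against the alias table */

struct toy_alias {
	uint64_t original;
	uint64_t alternative;
};

static const struct toy_alias toy_aliases[] = {
	{ .original = 0x1111, .alternative = 0x2222 },
};

static uint64_t toy_get_alias_event(uint64_t config)
{
	uint64_t match;
	unsigned int i;

	if (!(config & TOY_ALIASABLE))
		return 0;			/* no special mark, no alias */

	match = config & TOY_EVENT_MASK;

	for (i = 0; i < sizeof(toy_aliases) / sizeof(toy_aliases[0]); i++) {
		if (match == toy_aliases[i].original)
			return toy_aliases[i].alternative | (config & TOY_IMMUTABLE_BITS);
		if (match == toy_aliases[i].alternative)
			return toy_aliases[i].original | (config & TOY_IMMUTABLE_BITS);
	}
	return 0;
}

int main(void)
{
	uint64_t cfg = 0x1111 | TOY_ALIASABLE | (0x3ULL << 32);

	printf("%#llx -> %#llx\n", (unsigned long long)cfg,
	       (unsigned long long)toy_get_alias_event(cfg));
	return 0;
}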
@@ -945,7 +1034,7 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
945 1034
946 if (!x86_perf_event_set_period(event)) 1035 if (!x86_perf_event_set_period(event))
947 continue; 1036 continue;
948 if (perf_event_overflow(event, 1, &data, regs)) 1037 if (perf_event_overflow(event, &data, regs))
949 x86_pmu_stop(event, 0); 1038 x86_pmu_stop(event, 0);
950 } 1039 }
951 1040
@@ -1120,6 +1209,8 @@ static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign
1120 struct p4_event_bind *bind; 1209 struct p4_event_bind *bind;
1121 unsigned int i, thread, num; 1210 unsigned int i, thread, num;
1122 int cntr_idx, escr_idx; 1211 int cntr_idx, escr_idx;
1212 u64 config_alias;
1213 int pass;
1123 1214
1124 bitmap_zero(used_mask, X86_PMC_IDX_MAX); 1215 bitmap_zero(used_mask, X86_PMC_IDX_MAX);
1125 bitmap_zero(escr_mask, P4_ESCR_MSR_TABLE_SIZE); 1216 bitmap_zero(escr_mask, P4_ESCR_MSR_TABLE_SIZE);
@@ -1128,6 +1219,17 @@ static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign
1128 1219
1129 hwc = &cpuc->event_list[i]->hw; 1220 hwc = &cpuc->event_list[i]->hw;
1130 thread = p4_ht_thread(cpu); 1221 thread = p4_ht_thread(cpu);
1222 pass = 0;
1223
1224again:
1225 /*
1226	 * It's possible to loop forever, bouncing between the
1227	 * original and the alternative event, if both are
1228	 * scheduled already.
1229 */
1230 if (pass > 2)
1231 goto done;
1232
1131 bind = p4_config_get_bind(hwc->config); 1233 bind = p4_config_get_bind(hwc->config);
1132 escr_idx = p4_get_escr_idx(bind->escr_msr[thread]); 1234 escr_idx = p4_get_escr_idx(bind->escr_msr[thread]);
1133 if (unlikely(escr_idx == -1)) 1235 if (unlikely(escr_idx == -1))
@@ -1141,8 +1243,17 @@ static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign
1141 } 1243 }
1142 1244
1143 cntr_idx = p4_next_cntr(thread, used_mask, bind); 1245 cntr_idx = p4_next_cntr(thread, used_mask, bind);
1144 if (cntr_idx == -1 || test_bit(escr_idx, escr_mask)) 1246 if (cntr_idx == -1 || test_bit(escr_idx, escr_mask)) {
1145 goto done; 1247 /*
1248 * Check whether an event alias is still available.
1249 */
1250 config_alias = p4_get_alias_event(hwc->config);
1251 if (!config_alias)
1252 goto done;
1253 hwc->config = config_alias;
1254 pass++;
1255 goto again;
1256 }
1146 1257
1147 p4_pmu_swap_config_ts(hwc, cpu); 1258 p4_pmu_swap_config_ts(hwc, cpu);
1148 if (assign) 1259 if (assign)