path: root/arch/x86/kernel/cpu
author    Andrea Bastoni <bastoni@cs.unc.edu>    2010-10-23 01:01:49 -0400
committer Andrea Bastoni <bastoni@cs.unc.edu>    2010-10-23 01:01:49 -0400
commit    3dd41424090a0ca3a660218d06afe6ff4441bad3 (patch)
tree      511ef1bb1799027fc5aad574adce49120ecadd87 /arch/x86/kernel/cpu
parent    5c5456402d467969b217d7fdd6670f8c8600f5a8 (diff)
parent    f6f94e2ab1b33f0082ac22d71f66385a60d8157f (diff)
Merge commit 'v2.6.36' into wip-merge-2.6.36
Conflicts:
	Makefile
	arch/x86/include/asm/unistd_32.h
	arch/x86/kernel/syscall_table_32.S
	kernel/sched.c
	kernel/time/tick-sched.c

Relevant API and function changes (resolved in this commit):
- (API) .enqueue_task() (enqueue_task_litmus), .dequeue_task() (dequeue_task_litmus) [litmus/sched_litmus.c]
- (API) .select_task_rq() (select_task_rq_litmus) [litmus/sched_litmus.c]
- (API) sysrq_dump_trace_buffer() and sysrq_handle_kill_rt_tasks() [litmus/sched_trace.c]
- struct kfifo internal buffer name changed (buffer -> buf) [litmus/sched_trace.c]
- add_wait_queue_exclusive_locked -> __add_wait_queue_tail_exclusive [litmus/fmlp.c]
- updated syscall numbers for both x86_32 and x86_64
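As an example of the flavor of adaptation this resolution required, here is a minimal sketch of the wait-queue helper change noted above for litmus/fmlp.c (the sem->wait queue head and the wait entry are placeholders for whatever the call site uses); both calls mark the waiter exclusive and append it at the tail of the queue while the caller already holds the queue lock:

    /* up to 2.6.35 */
    add_wait_queue_exclusive_locked(&sem->wait, &wait);

    /* 2.6.36 and later: use the low-level tail variant directly */
    __add_wait_queue_tail_exclusive(&sem->wait, &wait);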
Diffstat (limited to 'arch/x86/kernel/cpu')
-rw-r--r--  arch/x86/kernel/cpu/Makefile | 6
-rw-r--r--  arch/x86/kernel/cpu/amd.c | 77
-rw-r--r--  arch/x86/kernel/cpu/bugs.c | 2
-rw-r--r--  arch/x86/kernel/cpu/cmpxchg.c | 72
-rw-r--r--  arch/x86/kernel/cpu/common.c | 70
-rw-r--r--  arch/x86/kernel/cpu/cpu.h | 1
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/Makefile | 4
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 53
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/gx-suspmod.c | 11
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/longhaul.c | 6
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/longhaul.h | 26
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/longrun.c | 6
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/mperf.c | 51
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/mperf.h | 9
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/p4-clockmod.c | 7
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c | 25
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/powernow-k7.c | 8
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 188
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/powernow-k8.h | 2
-rw-r--r--  arch/x86/kernel/cpu/hypervisor.c | 55
-rw-r--r--  arch/x86/kernel/cpu/intel.c | 9
-rw-r--r--  arch/x86/kernel/cpu/intel_cacheinfo.c | 263
-rw-r--r--  arch/x86/kernel/cpu/mcheck/Makefile | 2
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-apei.c | 138
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-internal.h | 23
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.c | 128
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_amd.c | 13
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_intel.c | 9
-rw-r--r--  arch/x86/kernel/cpu/mcheck/therm_throt.c | 216
-rw-r--r--  arch/x86/kernel/cpu/mshyperv.c | 56
-rw-r--r--  arch/x86/kernel/cpu/mtrr/cleanup.c | 6
-rw-r--r--  arch/x86/kernel/cpu/mtrr/generic.c | 3
-rw-r--r--  arch/x86/kernel/cpu/mtrr/main.c | 56
-rw-r--r--  arch/x86/kernel/cpu/perf_event.c | 940
-rw-r--r--  arch/x86/kernel/cpu/perf_event_amd.c | 50
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel.c | 416
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel_ds.c | 641
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel_lbr.c | 218
-rw-r--r--  arch/x86/kernel/cpu/perf_event_p4.c | 951
-rw-r--r--  arch/x86/kernel/cpu/perf_event_p6.c | 31
-rw-r--r--  arch/x86/kernel/cpu/scattered.c | 64
-rw-r--r--  arch/x86/kernel/cpu/topology.c (renamed from arch/x86/kernel/cpu/addon_cpuid_features.c) | 56
-rw-r--r--  arch/x86/kernel/cpu/vmware.c | 47
43 files changed, 3861 insertions, 1154 deletions
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index c202b62f3671..3f0ebe429a01 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -12,11 +12,11 @@ endif
 nostackp := $(call cc-option, -fno-stack-protector)
 CFLAGS_common.o := $(nostackp)
 
-obj-y := intel_cacheinfo.o addon_cpuid_features.o
+obj-y := intel_cacheinfo.o scattered.o topology.o
 obj-y += proc.o capflags.o powerflags.o common.o
-obj-y += vmware.o hypervisor.o sched.o
+obj-y += vmware.o hypervisor.o sched.o mshyperv.o
 
-obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o
+obj-$(CONFIG_X86_32) += bugs.o
 obj-$(CONFIG_X86_64) += bugs_64.o
 
 obj-$(CONFIG_CPU_SUP_INTEL) += intel.o
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index e485825130d2..ba5f62f45f01 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -466,7 +466,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
 		}
 
 	}
-	if (c->x86 == 0x10 || c->x86 == 0x11)
+	if (c->x86 >= 0x10)
 		set_cpu_cap(c, X86_FEATURE_REP_GOOD);
 
 	/* get apicid instead of initial apic id from cpuid */
@@ -529,7 +529,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
 		num_cache_leaves = 3;
 	}
 
-	if (c->x86 >= 0xf && c->x86 <= 0x11)
+	if (c->x86 >= 0xf)
 		set_cpu_cap(c, X86_FEATURE_K8);
 
 	if (cpu_has_xmm2) {
@@ -546,7 +546,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
 		fam10h_check_enable_mmcfg();
 	}
 
-	if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
+	if (c == &boot_cpu_data && c->x86 >= 0xf) {
 		unsigned long long tseg;
 
 		/*
@@ -609,3 +609,74 @@ static const struct cpu_dev __cpuinitconst amd_cpu_dev = {
 };
 
 cpu_dev_register(amd_cpu_dev);
+
+/*
+ * AMD errata checking
+ *
+ * Errata are defined as arrays of ints using the AMD_LEGACY_ERRATUM() or
+ * AMD_OSVW_ERRATUM() macros. The latter is intended for newer errata that
+ * have an OSVW id assigned, which it takes as first argument. Both take a
+ * variable number of family-specific model-stepping ranges created by
+ * AMD_MODEL_RANGE(). Each erratum also has to be declared as extern const
+ * int[] in arch/x86/include/asm/processor.h.
+ *
+ * Example:
+ *
+ * const int amd_erratum_319[] =
+ *	AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0x4, 0x2),
+ *			   AMD_MODEL_RANGE(0x10, 0x8, 0x0, 0x8, 0x0),
+ *			   AMD_MODEL_RANGE(0x10, 0x9, 0x0, 0x9, 0x0));
+ */
+
+const int amd_erratum_400[] =
+	AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0xf, 0x41, 0x2, 0xff, 0xf),
+			    AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf));
+EXPORT_SYMBOL_GPL(amd_erratum_400);
+
+const int amd_erratum_383[] =
+	AMD_OSVW_ERRATUM(3, AMD_MODEL_RANGE(0x10, 0, 0, 0xff, 0xf));
+EXPORT_SYMBOL_GPL(amd_erratum_383);
+
+bool cpu_has_amd_erratum(const int *erratum)
+{
+	struct cpuinfo_x86 *cpu = &current_cpu_data;
+	int osvw_id = *erratum++;
+	u32 range;
+	u32 ms;
+
+	/*
+	 * If called early enough that current_cpu_data hasn't been initialized
+	 * yet, fall back to boot_cpu_data.
+	 */
+	if (cpu->x86 == 0)
+		cpu = &boot_cpu_data;
+
+	if (cpu->x86_vendor != X86_VENDOR_AMD)
+		return false;
+
+	if (osvw_id >= 0 && osvw_id < 65536 &&
+	    cpu_has(cpu, X86_FEATURE_OSVW)) {
+		u64 osvw_len;
+
+		rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, osvw_len);
+		if (osvw_id < osvw_len) {
+			u64 osvw_bits;
+
+			rdmsrl(MSR_AMD64_OSVW_STATUS + (osvw_id >> 6),
+			       osvw_bits);
+			return osvw_bits & (1ULL << (osvw_id & 0x3f));
+		}
+	}
+
+	/* OSVW unavailable or ID unknown, match family-model-stepping range */
+	ms = (cpu->x86_model << 4) | cpu->x86_mask;
+	while ((range = *erratum++))
+		if ((cpu->x86 == AMD_MODEL_RANGE_FAMILY(range)) &&
+		    (ms >= AMD_MODEL_RANGE_START(range)) &&
+		    (ms <= AMD_MODEL_RANGE_END(range)))
+			return true;
+
+	return false;
+}
+
+EXPORT_SYMBOL_GPL(cpu_has_amd_erratum);
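The cpu_has_amd_erratum() helper added above is meant to gate erratum workarounds elsewhere in the x86 code. A minimal usage sketch (the surrounding function is made up for illustration; the erratum table is the amd_erratum_400 array exported above, declared extern in asm/processor.h):

    #include <asm/processor.h>

    static void apply_c1e_workaround_if_needed(void)
    {
    	/* amd_erratum_400 covers the C1E/APIC-timer erratum */
    	if (cpu_has_amd_erratum(amd_erratum_400))
    		pr_info("enabling erratum 400 workaround\n");
    }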
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 01a265212395..c39576cb3018 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -86,7 +86,7 @@ static void __init check_fpu(void)
 
 static void __init check_hlt(void)
 {
-	if (paravirt_enabled())
+	if (boot_cpu_data.x86 >= 5 || paravirt_enabled())
 		return;
 
 	printk(KERN_INFO "Checking 'hlt' instruction... ");
diff --git a/arch/x86/kernel/cpu/cmpxchg.c b/arch/x86/kernel/cpu/cmpxchg.c
deleted file mode 100644
index 2056ccf572cc..000000000000
--- a/arch/x86/kernel/cpu/cmpxchg.c
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * cmpxchg*() fallbacks for CPU not supporting these instructions
- */
-
-#include <linux/kernel.h>
-#include <linux/smp.h>
-#include <linux/module.h>
-
-#ifndef CONFIG_X86_CMPXCHG
-unsigned long cmpxchg_386_u8(volatile void *ptr, u8 old, u8 new)
-{
-	u8 prev;
-	unsigned long flags;
-
-	/* Poor man's cmpxchg for 386. Unsuitable for SMP */
-	local_irq_save(flags);
-	prev = *(u8 *)ptr;
-	if (prev == old)
-		*(u8 *)ptr = new;
-	local_irq_restore(flags);
-	return prev;
-}
-EXPORT_SYMBOL(cmpxchg_386_u8);
-
-unsigned long cmpxchg_386_u16(volatile void *ptr, u16 old, u16 new)
-{
-	u16 prev;
-	unsigned long flags;
-
-	/* Poor man's cmpxchg for 386. Unsuitable for SMP */
-	local_irq_save(flags);
-	prev = *(u16 *)ptr;
-	if (prev == old)
-		*(u16 *)ptr = new;
-	local_irq_restore(flags);
-	return prev;
-}
-EXPORT_SYMBOL(cmpxchg_386_u16);
-
-unsigned long cmpxchg_386_u32(volatile void *ptr, u32 old, u32 new)
-{
-	u32 prev;
-	unsigned long flags;
-
-	/* Poor man's cmpxchg for 386. Unsuitable for SMP */
-	local_irq_save(flags);
-	prev = *(u32 *)ptr;
-	if (prev == old)
-		*(u32 *)ptr = new;
-	local_irq_restore(flags);
-	return prev;
-}
-EXPORT_SYMBOL(cmpxchg_386_u32);
-#endif
-
-#ifndef CONFIG_X86_CMPXCHG64
-unsigned long long cmpxchg_486_u64(volatile void *ptr, u64 old, u64 new)
-{
-	u64 prev;
-	unsigned long flags;
-
-	/* Poor man's cmpxchg8b for 386 and 486. Unsuitable for SMP */
-	local_irq_save(flags);
-	prev = *(u64 *)ptr;
-	if (prev == old)
-		*(u64 *)ptr = new;
-	local_irq_restore(flags);
-	return prev;
-}
-EXPORT_SYMBOL(cmpxchg_486_u64);
-#endif
-
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 4868e4a951ee..f2f9ac7da25c 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -140,10 +140,18 @@ EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
140static int __init x86_xsave_setup(char *s) 140static int __init x86_xsave_setup(char *s)
141{ 141{
142 setup_clear_cpu_cap(X86_FEATURE_XSAVE); 142 setup_clear_cpu_cap(X86_FEATURE_XSAVE);
143 setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
143 return 1; 144 return 1;
144} 145}
145__setup("noxsave", x86_xsave_setup); 146__setup("noxsave", x86_xsave_setup);
146 147
148static int __init x86_xsaveopt_setup(char *s)
149{
150 setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
151 return 1;
152}
153__setup("noxsaveopt", x86_xsaveopt_setup);
154
147#ifdef CONFIG_X86_32 155#ifdef CONFIG_X86_32
148static int cachesize_override __cpuinitdata = -1; 156static int cachesize_override __cpuinitdata = -1;
149static int disable_x86_serial_nr __cpuinitdata = 1; 157static int disable_x86_serial_nr __cpuinitdata = 1;
@@ -537,7 +545,7 @@ void __cpuinit cpu_detect(struct cpuinfo_x86 *c)
537 } 545 }
538} 546}
539 547
540static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c) 548void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
541{ 549{
542 u32 tfms, xlvl; 550 u32 tfms, xlvl;
543 u32 ebx; 551 u32 ebx;
@@ -551,6 +559,16 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
551 c->x86_capability[4] = excap; 559 c->x86_capability[4] = excap;
552 } 560 }
553 561
562 /* Additional Intel-defined flags: level 0x00000007 */
563 if (c->cpuid_level >= 0x00000007) {
564 u32 eax, ebx, ecx, edx;
565
566 cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx);
567
568 if (eax > 0)
569 c->x86_capability[9] = ebx;
570 }
571
554 /* AMD-defined flags: level 0x80000001 */ 572 /* AMD-defined flags: level 0x80000001 */
555 xlvl = cpuid_eax(0x80000000); 573 xlvl = cpuid_eax(0x80000000);
556 c->extended_cpuid_level = xlvl; 574 c->extended_cpuid_level = xlvl;
@@ -576,6 +594,7 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
576 if (c->extended_cpuid_level >= 0x80000007) 594 if (c->extended_cpuid_level >= 0x80000007)
577 c->x86_power = cpuid_edx(0x80000007); 595 c->x86_power = cpuid_edx(0x80000007);
578 596
597 init_scattered_cpuid_features(c);
579} 598}
580 599
581static void __cpuinit identify_cpu_without_cpuid(struct cpuinfo_x86 *c) 600static void __cpuinit identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
@@ -731,7 +750,6 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
731 750
732 get_model_name(c); /* Default name */ 751 get_model_name(c); /* Default name */
733 752
734 init_scattered_cpuid_features(c);
735 detect_nopl(c); 753 detect_nopl(c);
736} 754}
737 755
@@ -1084,6 +1102,20 @@ static void clear_all_debug_regs(void)
1084 } 1102 }
1085} 1103}
1086 1104
1105#ifdef CONFIG_KGDB
1106/*
1107 * Restore debug regs if using kgdbwait and you have a kernel debugger
1108 * connection established.
1109 */
1110static void dbg_restore_debug_regs(void)
1111{
1112 if (unlikely(kgdb_connected && arch_kgdb_ops.correct_hw_break))
1113 arch_kgdb_ops.correct_hw_break();
1114}
1115#else /* ! CONFIG_KGDB */
1116#define dbg_restore_debug_regs()
1117#endif /* ! CONFIG_KGDB */
1118
1087/* 1119/*
1088 * cpu_init() initializes state that is per-CPU. Some data is already 1120 * cpu_init() initializes state that is per-CPU. Some data is already
1089 * initialized (naturally) in the bootstrap process, such as the GDT 1121 * initialized (naturally) in the bootstrap process, such as the GDT
@@ -1107,9 +1139,9 @@ void __cpuinit cpu_init(void)
1107 oist = &per_cpu(orig_ist, cpu); 1139 oist = &per_cpu(orig_ist, cpu);
1108 1140
1109#ifdef CONFIG_NUMA 1141#ifdef CONFIG_NUMA
1110 if (cpu != 0 && percpu_read(node_number) == 0 && 1142 if (cpu != 0 && percpu_read(numa_node) == 0 &&
1111 cpu_to_node(cpu) != NUMA_NO_NODE) 1143 early_cpu_to_node(cpu) != NUMA_NO_NODE)
1112 percpu_write(node_number, cpu_to_node(cpu)); 1144 set_numa_node(early_cpu_to_node(cpu));
1113#endif 1145#endif
1114 1146
1115 me = current; 1147 me = current;
@@ -1174,20 +1206,11 @@ void __cpuinit cpu_init(void)
1174 load_TR_desc(); 1206 load_TR_desc();
1175 load_LDT(&init_mm.context); 1207 load_LDT(&init_mm.context);
1176 1208
1177#ifdef CONFIG_KGDB 1209 clear_all_debug_regs();
1178 /* 1210 dbg_restore_debug_regs();
1179 * If the kgdb is connected no debug regs should be altered. This
1180 * is only applicable when KGDB and a KGDB I/O module are built
1181 * into the kernel and you are using early debugging with
1182 * kgdbwait. KGDB will control the kernel HW breakpoint registers.
1183 */
1184 if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
1185 arch_kgdb_ops.correct_hw_break();
1186 else
1187#endif
1188 clear_all_debug_regs();
1189 1211
1190 fpu_init(); 1212 fpu_init();
1213 xsave_init();
1191 1214
1192 raw_local_save_flags(kernel_eflags); 1215 raw_local_save_flags(kernel_eflags);
1193 1216
@@ -1239,23 +1262,16 @@ void __cpuinit cpu_init(void)
1239#endif 1262#endif
1240 1263
1241 clear_all_debug_regs(); 1264 clear_all_debug_regs();
1265 dbg_restore_debug_regs();
1242 1266
1243 /* 1267 /*
1244 * Force FPU initialization: 1268 * Force FPU initialization:
1245 */ 1269 */
1246 if (cpu_has_xsave) 1270 current_thread_info()->status = 0;
1247 current_thread_info()->status = TS_XSAVE;
1248 else
1249 current_thread_info()->status = 0;
1250 clear_used_math(); 1271 clear_used_math();
1251 mxcsr_feature_mask_init(); 1272 mxcsr_feature_mask_init();
1252 1273
1253 /* 1274 fpu_init();
1254 * Boot processor to setup the FP and extended state context info.
1255 */
1256 if (smp_processor_id() == boot_cpu_id)
1257 init_thread_xstate();
1258
1259 xsave_init(); 1275 xsave_init();
1260} 1276}
1261#endif 1277#endif
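The new leaf-0x7 probe added to get_cpu_cap() above captures the structured extended feature flags (CPUID.(EAX=7,ECX=0):EBX) into capability word 9. For reference, a rough user-space analogue of that probe (illustrative only, not kernel code; builds with gcc on x86-64):

    #include <stdio.h>

    static void cpuid_count(unsigned leaf, unsigned sub,
                            unsigned *a, unsigned *b, unsigned *c, unsigned *d)
    {
    	asm volatile("cpuid"
    		     : "=a"(*a), "=b"(*b), "=c"(*c), "=d"(*d)
    		     : "0"(leaf), "2"(sub));
    }

    int main(void)
    {
    	unsigned a, b, c, d;

    	cpuid_count(0, 0, &a, &b, &c, &d);	/* EAX = highest standard leaf */
    	if (a >= 7) {
    		cpuid_count(7, 0, &a, &b, &c, &d);
    		printf("CPUID.7.0:EBX = 0x%08x\n", b);	/* the new capability word */
    	}
    	return 0;
    }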
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 3624e8a0f71b..f668bb1f7d43 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -33,5 +33,6 @@ extern const struct cpu_dev *const __x86_cpu_dev_start[],
 			    *const __x86_cpu_dev_end[];
 
 extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c);
+extern void get_cpu_cap(struct cpuinfo_x86 *c);
 
 #endif
diff --git a/arch/x86/kernel/cpu/cpufreq/Makefile b/arch/x86/kernel/cpu/cpufreq/Makefile
index 1840c0a5170b..bd54bf67e6fb 100644
--- a/arch/x86/kernel/cpu/cpufreq/Makefile
+++ b/arch/x86/kernel/cpu/cpufreq/Makefile
@@ -2,8 +2,8 @@
 # K8 systems. ACPI is preferred to all other hardware-specific drivers.
 # speedstep-* is preferred over p4-clockmod.
 
-obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o
-obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o
+obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o mperf.o
+obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o mperf.o
 obj-$(CONFIG_X86_PCC_CPUFREQ) += pcc-cpufreq.o
 obj-$(CONFIG_X86_POWERNOW_K6) += powernow-k6.o
 obj-$(CONFIG_X86_POWERNOW_K7) += powernow-k7.o
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 459168083b77..cd8da247dda1 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -34,7 +34,6 @@
34#include <linux/compiler.h> 34#include <linux/compiler.h>
35#include <linux/dmi.h> 35#include <linux/dmi.h>
36#include <linux/slab.h> 36#include <linux/slab.h>
37#include <trace/events/power.h>
38 37
39#include <linux/acpi.h> 38#include <linux/acpi.h>
40#include <linux/io.h> 39#include <linux/io.h>
@@ -46,6 +45,7 @@
46#include <asm/msr.h> 45#include <asm/msr.h>
47#include <asm/processor.h> 46#include <asm/processor.h>
48#include <asm/cpufeature.h> 47#include <asm/cpufeature.h>
48#include "mperf.h"
49 49
50#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ 50#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
51 "acpi-cpufreq", msg) 51 "acpi-cpufreq", msg)
@@ -71,10 +71,8 @@ struct acpi_cpufreq_data {
71 71
72static DEFINE_PER_CPU(struct acpi_cpufreq_data *, acfreq_data); 72static DEFINE_PER_CPU(struct acpi_cpufreq_data *, acfreq_data);
73 73
74static DEFINE_PER_CPU(struct aperfmperf, acfreq_old_perf);
75
76/* acpi_perf_data is a pointer to percpu data. */ 74/* acpi_perf_data is a pointer to percpu data. */
77static struct acpi_processor_performance *acpi_perf_data; 75static struct acpi_processor_performance __percpu *acpi_perf_data;
78 76
79static struct cpufreq_driver acpi_cpufreq_driver; 77static struct cpufreq_driver acpi_cpufreq_driver;
80 78
@@ -240,45 +238,6 @@ static u32 get_cur_val(const struct cpumask *mask)
240 return cmd.val; 238 return cmd.val;
241} 239}
242 240
243/* Called via smp_call_function_single(), on the target CPU */
244static void read_measured_perf_ctrs(void *_cur)
245{
246 struct aperfmperf *am = _cur;
247
248 get_aperfmperf(am);
249}
250
251/*
252 * Return the measured active (C0) frequency on this CPU since last call
253 * to this function.
254 * Input: cpu number
255 * Return: Average CPU frequency in terms of max frequency (zero on error)
256 *
257 * We use IA32_MPERF and IA32_APERF MSRs to get the measured performance
258 * over a period of time, while CPU is in C0 state.
259 * IA32_MPERF counts at the rate of max advertised frequency
260 * IA32_APERF counts at the rate of actual CPU frequency
261 * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and
262 * no meaning should be associated with absolute values of these MSRs.
263 */
264static unsigned int get_measured_perf(struct cpufreq_policy *policy,
265 unsigned int cpu)
266{
267 struct aperfmperf perf;
268 unsigned long ratio;
269 unsigned int retval;
270
271 if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1))
272 return 0;
273
274 ratio = calc_aperfmperf_ratio(&per_cpu(acfreq_old_perf, cpu), &perf);
275 per_cpu(acfreq_old_perf, cpu) = perf;
276
277 retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT;
278
279 return retval;
280}
281
282static unsigned int get_cur_freq_on_cpu(unsigned int cpu) 241static unsigned int get_cur_freq_on_cpu(unsigned int cpu)
283{ 242{
284 struct acpi_cpufreq_data *data = per_cpu(acfreq_data, cpu); 243 struct acpi_cpufreq_data *data = per_cpu(acfreq_data, cpu);
@@ -364,8 +323,6 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
364 } 323 }
365 } 324 }
366 325
367 trace_power_frequency(POWER_PSTATE, data->freq_table[next_state].frequency);
368
369 switch (data->cpu_feature) { 326 switch (data->cpu_feature) {
370 case SYSTEM_INTEL_MSR_CAPABLE: 327 case SYSTEM_INTEL_MSR_CAPABLE:
371 cmd.type = SYSTEM_INTEL_MSR_CAPABLE; 328 cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
@@ -391,7 +348,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
391 348
392 freqs.old = perf->states[perf->state].core_frequency * 1000; 349 freqs.old = perf->states[perf->state].core_frequency * 1000;
393 freqs.new = data->freq_table[next_state].frequency; 350 freqs.new = data->freq_table[next_state].frequency;
394 for_each_cpu(i, cmd.mask) { 351 for_each_cpu(i, policy->cpus) {
395 freqs.cpu = i; 352 freqs.cpu = i;
396 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 353 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
397 } 354 }
@@ -407,7 +364,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
407 } 364 }
408 } 365 }
409 366
410 for_each_cpu(i, cmd.mask) { 367 for_each_cpu(i, policy->cpus) {
411 freqs.cpu = i; 368 freqs.cpu = i;
412 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 369 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
413 } 370 }
@@ -702,7 +659,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
702 659
703 /* Check for APERF/MPERF support in hardware */ 660 /* Check for APERF/MPERF support in hardware */
704 if (cpu_has(c, X86_FEATURE_APERFMPERF)) 661 if (cpu_has(c, X86_FEATURE_APERFMPERF))
705 acpi_cpufreq_driver.getavg = get_measured_perf; 662 acpi_cpufreq_driver.getavg = cpufreq_get_measured_perf;
706 663
707 dprintk("CPU%u - ACPI performance management activated.\n", cpu); 664 dprintk("CPU%u - ACPI performance management activated.\n", cpu);
708 for (i = 0; i < perf->state_count; i++) 665 for (i = 0; i < perf->state_count; i++)
diff --git a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
index 16e3483be9e3..32974cf84232 100644
--- a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
+++ b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
@@ -169,12 +169,9 @@ static int gx_freq_mult[16] = {
169 * Low Level chipset interface * 169 * Low Level chipset interface *
170 ****************************************************************/ 170 ****************************************************************/
171static struct pci_device_id gx_chipset_tbl[] __initdata = { 171static struct pci_device_id gx_chipset_tbl[] __initdata = {
172 { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, 172 { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY), },
173 PCI_ANY_ID, PCI_ANY_ID }, 173 { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5520), },
174 { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5520, 174 { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5510), },
175 PCI_ANY_ID, PCI_ANY_ID },
176 { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5510,
177 PCI_ANY_ID, PCI_ANY_ID },
178 { 0, }, 175 { 0, },
179}; 176};
180 177
@@ -199,7 +196,7 @@ static __init struct pci_dev *gx_detect_chipset(void)
199 } 196 }
200 197
201 /* detect which companion chip is used */ 198 /* detect which companion chip is used */
202 while ((gx_pci = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, gx_pci)) != NULL) { 199 for_each_pci_dev(gx_pci) {
203 if ((pci_match_id(gx_chipset_tbl, gx_pci)) != NULL) 200 if ((pci_match_id(gx_chipset_tbl, gx_pci)) != NULL)
204 return gx_pci; 201 return gx_pci;
205 } 202 }
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c
index 7e7eea4f8261..03162dac6271 100644
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.c
+++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c
@@ -426,7 +426,7 @@ static int guess_fsb(int mult)
426} 426}
427 427
428 428
429static int __init longhaul_get_ranges(void) 429static int __cpuinit longhaul_get_ranges(void)
430{ 430{
431 unsigned int i, j, k = 0; 431 unsigned int i, j, k = 0;
432 unsigned int ratio; 432 unsigned int ratio;
@@ -530,7 +530,7 @@ static int __init longhaul_get_ranges(void)
530} 530}
531 531
532 532
533static void __init longhaul_setup_voltagescaling(void) 533static void __cpuinit longhaul_setup_voltagescaling(void)
534{ 534{
535 union msr_longhaul longhaul; 535 union msr_longhaul longhaul;
536 struct mV_pos minvid, maxvid, vid; 536 struct mV_pos minvid, maxvid, vid;
@@ -784,7 +784,7 @@ static int longhaul_setup_southbridge(void)
784 return 0; 784 return 0;
785} 785}
786 786
787static int __init longhaul_cpu_init(struct cpufreq_policy *policy) 787static int __cpuinit longhaul_cpu_init(struct cpufreq_policy *policy)
788{ 788{
789 struct cpuinfo_x86 *c = &cpu_data(0); 789 struct cpuinfo_x86 *c = &cpu_data(0);
790 char *cpuname = NULL; 790 char *cpuname = NULL;
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.h b/arch/x86/kernel/cpu/cpufreq/longhaul.h
index e2360a469f79..cbf48fbca881 100644
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.h
+++ b/arch/x86/kernel/cpu/cpufreq/longhaul.h
@@ -56,7 +56,7 @@ union msr_longhaul {
56/* 56/*
57 * VIA C3 Samuel 1 & Samuel 2 (stepping 0) 57 * VIA C3 Samuel 1 & Samuel 2 (stepping 0)
58 */ 58 */
59static const int __initdata samuel1_mults[16] = { 59static const int __cpuinitdata samuel1_mults[16] = {
60 -1, /* 0000 -> RESERVED */ 60 -1, /* 0000 -> RESERVED */
61 30, /* 0001 -> 3.0x */ 61 30, /* 0001 -> 3.0x */
62 40, /* 0010 -> 4.0x */ 62 40, /* 0010 -> 4.0x */
@@ -75,7 +75,7 @@ static const int __initdata samuel1_mults[16] = {
75 -1, /* 1111 -> RESERVED */ 75 -1, /* 1111 -> RESERVED */
76}; 76};
77 77
78static const int __initdata samuel1_eblcr[16] = { 78static const int __cpuinitdata samuel1_eblcr[16] = {
79 50, /* 0000 -> RESERVED */ 79 50, /* 0000 -> RESERVED */
80 30, /* 0001 -> 3.0x */ 80 30, /* 0001 -> 3.0x */
81 40, /* 0010 -> 4.0x */ 81 40, /* 0010 -> 4.0x */
@@ -97,7 +97,7 @@ static const int __initdata samuel1_eblcr[16] = {
97/* 97/*
98 * VIA C3 Samuel2 Stepping 1->15 98 * VIA C3 Samuel2 Stepping 1->15
99 */ 99 */
100static const int __initdata samuel2_eblcr[16] = { 100static const int __cpuinitdata samuel2_eblcr[16] = {
101 50, /* 0000 -> 5.0x */ 101 50, /* 0000 -> 5.0x */
102 30, /* 0001 -> 3.0x */ 102 30, /* 0001 -> 3.0x */
103 40, /* 0010 -> 4.0x */ 103 40, /* 0010 -> 4.0x */
@@ -119,7 +119,7 @@ static const int __initdata samuel2_eblcr[16] = {
119/* 119/*
120 * VIA C3 Ezra 120 * VIA C3 Ezra
121 */ 121 */
122static const int __initdata ezra_mults[16] = { 122static const int __cpuinitdata ezra_mults[16] = {
123 100, /* 0000 -> 10.0x */ 123 100, /* 0000 -> 10.0x */
124 30, /* 0001 -> 3.0x */ 124 30, /* 0001 -> 3.0x */
125 40, /* 0010 -> 4.0x */ 125 40, /* 0010 -> 4.0x */
@@ -138,7 +138,7 @@ static const int __initdata ezra_mults[16] = {
138 120, /* 1111 -> 12.0x */ 138 120, /* 1111 -> 12.0x */
139}; 139};
140 140
141static const int __initdata ezra_eblcr[16] = { 141static const int __cpuinitdata ezra_eblcr[16] = {
142 50, /* 0000 -> 5.0x */ 142 50, /* 0000 -> 5.0x */
143 30, /* 0001 -> 3.0x */ 143 30, /* 0001 -> 3.0x */
144 40, /* 0010 -> 4.0x */ 144 40, /* 0010 -> 4.0x */
@@ -160,7 +160,7 @@ static const int __initdata ezra_eblcr[16] = {
160/* 160/*
161 * VIA C3 (Ezra-T) [C5M]. 161 * VIA C3 (Ezra-T) [C5M].
162 */ 162 */
163static const int __initdata ezrat_mults[32] = { 163static const int __cpuinitdata ezrat_mults[32] = {
164 100, /* 0000 -> 10.0x */ 164 100, /* 0000 -> 10.0x */
165 30, /* 0001 -> 3.0x */ 165 30, /* 0001 -> 3.0x */
166 40, /* 0010 -> 4.0x */ 166 40, /* 0010 -> 4.0x */
@@ -196,7 +196,7 @@ static const int __initdata ezrat_mults[32] = {
196 -1, /* 1111 -> RESERVED (12.0x) */ 196 -1, /* 1111 -> RESERVED (12.0x) */
197}; 197};
198 198
199static const int __initdata ezrat_eblcr[32] = { 199static const int __cpuinitdata ezrat_eblcr[32] = {
200 50, /* 0000 -> 5.0x */ 200 50, /* 0000 -> 5.0x */
201 30, /* 0001 -> 3.0x */ 201 30, /* 0001 -> 3.0x */
202 40, /* 0010 -> 4.0x */ 202 40, /* 0010 -> 4.0x */
@@ -235,7 +235,7 @@ static const int __initdata ezrat_eblcr[32] = {
235/* 235/*
236 * VIA C3 Nehemiah */ 236 * VIA C3 Nehemiah */
237 237
238static const int __initdata nehemiah_mults[32] = { 238static const int __cpuinitdata nehemiah_mults[32] = {
239 100, /* 0000 -> 10.0x */ 239 100, /* 0000 -> 10.0x */
240 -1, /* 0001 -> 16.0x */ 240 -1, /* 0001 -> 16.0x */
241 40, /* 0010 -> 4.0x */ 241 40, /* 0010 -> 4.0x */
@@ -270,7 +270,7 @@ static const int __initdata nehemiah_mults[32] = {
270 -1, /* 1111 -> 12.0x */ 270 -1, /* 1111 -> 12.0x */
271}; 271};
272 272
273static const int __initdata nehemiah_eblcr[32] = { 273static const int __cpuinitdata nehemiah_eblcr[32] = {
274 50, /* 0000 -> 5.0x */ 274 50, /* 0000 -> 5.0x */
275 160, /* 0001 -> 16.0x */ 275 160, /* 0001 -> 16.0x */
276 40, /* 0010 -> 4.0x */ 276 40, /* 0010 -> 4.0x */
@@ -315,7 +315,7 @@ struct mV_pos {
315 unsigned short pos; 315 unsigned short pos;
316}; 316};
317 317
318static const struct mV_pos __initdata vrm85_mV[32] = { 318static const struct mV_pos __cpuinitdata vrm85_mV[32] = {
319 {1250, 8}, {1200, 6}, {1150, 4}, {1100, 2}, 319 {1250, 8}, {1200, 6}, {1150, 4}, {1100, 2},
320 {1050, 0}, {1800, 30}, {1750, 28}, {1700, 26}, 320 {1050, 0}, {1800, 30}, {1750, 28}, {1700, 26},
321 {1650, 24}, {1600, 22}, {1550, 20}, {1500, 18}, 321 {1650, 24}, {1600, 22}, {1550, 20}, {1500, 18},
@@ -326,14 +326,14 @@ static const struct mV_pos __initdata vrm85_mV[32] = {
326 {1475, 17}, {1425, 15}, {1375, 13}, {1325, 11} 326 {1475, 17}, {1425, 15}, {1375, 13}, {1325, 11}
327}; 327};
328 328
329static const unsigned char __initdata mV_vrm85[32] = { 329static const unsigned char __cpuinitdata mV_vrm85[32] = {
330 0x04, 0x14, 0x03, 0x13, 0x02, 0x12, 0x01, 0x11, 330 0x04, 0x14, 0x03, 0x13, 0x02, 0x12, 0x01, 0x11,
331 0x00, 0x10, 0x0f, 0x1f, 0x0e, 0x1e, 0x0d, 0x1d, 331 0x00, 0x10, 0x0f, 0x1f, 0x0e, 0x1e, 0x0d, 0x1d,
332 0x0c, 0x1c, 0x0b, 0x1b, 0x0a, 0x1a, 0x09, 0x19, 332 0x0c, 0x1c, 0x0b, 0x1b, 0x0a, 0x1a, 0x09, 0x19,
333 0x08, 0x18, 0x07, 0x17, 0x06, 0x16, 0x05, 0x15 333 0x08, 0x18, 0x07, 0x17, 0x06, 0x16, 0x05, 0x15
334}; 334};
335 335
336static const struct mV_pos __initdata mobilevrm_mV[32] = { 336static const struct mV_pos __cpuinitdata mobilevrm_mV[32] = {
337 {1750, 31}, {1700, 30}, {1650, 29}, {1600, 28}, 337 {1750, 31}, {1700, 30}, {1650, 29}, {1600, 28},
338 {1550, 27}, {1500, 26}, {1450, 25}, {1400, 24}, 338 {1550, 27}, {1500, 26}, {1450, 25}, {1400, 24},
339 {1350, 23}, {1300, 22}, {1250, 21}, {1200, 20}, 339 {1350, 23}, {1300, 22}, {1250, 21}, {1200, 20},
@@ -344,7 +344,7 @@ static const struct mV_pos __initdata mobilevrm_mV[32] = {
344 {675, 3}, {650, 2}, {625, 1}, {600, 0} 344 {675, 3}, {650, 2}, {625, 1}, {600, 0}
345}; 345};
346 346
347static const unsigned char __initdata mV_mobilevrm[32] = { 347static const unsigned char __cpuinitdata mV_mobilevrm[32] = {
348 0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18, 348 0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18,
349 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 349 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10,
350 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08, 350 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08,
diff --git a/arch/x86/kernel/cpu/cpufreq/longrun.c b/arch/x86/kernel/cpu/cpufreq/longrun.c
index e7b559d74c52..fc09f142d94d 100644
--- a/arch/x86/kernel/cpu/cpufreq/longrun.c
+++ b/arch/x86/kernel/cpu/cpufreq/longrun.c
@@ -165,8 +165,8 @@ static unsigned int longrun_get(unsigned int cpu)
165 * TMTA rules: 165 * TMTA rules:
166 * performance_pctg = (target_freq - low_freq)/(high_freq - low_freq) 166 * performance_pctg = (target_freq - low_freq)/(high_freq - low_freq)
167 */ 167 */
168static unsigned int __init longrun_determine_freqs(unsigned int *low_freq, 168static unsigned int __cpuinit longrun_determine_freqs(unsigned int *low_freq,
169 unsigned int *high_freq) 169 unsigned int *high_freq)
170{ 170{
171 u32 msr_lo, msr_hi; 171 u32 msr_lo, msr_hi;
172 u32 save_lo, save_hi; 172 u32 save_lo, save_hi;
@@ -258,7 +258,7 @@ static unsigned int __init longrun_determine_freqs(unsigned int *low_freq,
258} 258}
259 259
260 260
261static int __init longrun_cpu_init(struct cpufreq_policy *policy) 261static int __cpuinit longrun_cpu_init(struct cpufreq_policy *policy)
262{ 262{
263 int result = 0; 263 int result = 0;
264 264
diff --git a/arch/x86/kernel/cpu/cpufreq/mperf.c b/arch/x86/kernel/cpu/cpufreq/mperf.c
new file mode 100644
index 000000000000..911e193018ae
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/mperf.c
@@ -0,0 +1,51 @@
+#include <linux/kernel.h>
+#include <linux/smp.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/cpufreq.h>
+#include <linux/slab.h>
+
+#include "mperf.h"
+
+static DEFINE_PER_CPU(struct aperfmperf, acfreq_old_perf);
+
+/* Called via smp_call_function_single(), on the target CPU */
+static void read_measured_perf_ctrs(void *_cur)
+{
+	struct aperfmperf *am = _cur;
+
+	get_aperfmperf(am);
+}
+
+/*
+ * Return the measured active (C0) frequency on this CPU since last call
+ * to this function.
+ * Input: cpu number
+ * Return: Average CPU frequency in terms of max frequency (zero on error)
+ *
+ * We use IA32_MPERF and IA32_APERF MSRs to get the measured performance
+ * over a period of time, while CPU is in C0 state.
+ * IA32_MPERF counts at the rate of max advertised frequency
+ * IA32_APERF counts at the rate of actual CPU frequency
+ * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and
+ * no meaning should be associated with absolute values of these MSRs.
+ */
+unsigned int cpufreq_get_measured_perf(struct cpufreq_policy *policy,
+					unsigned int cpu)
+{
+	struct aperfmperf perf;
+	unsigned long ratio;
+	unsigned int retval;
+
+	if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1))
+		return 0;
+
+	ratio = calc_aperfmperf_ratio(&per_cpu(acfreq_old_perf, cpu), &perf);
+	per_cpu(acfreq_old_perf, cpu) = perf;
+
+	retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT;
+
+	return retval;
+}
+EXPORT_SYMBOL_GPL(cpufreq_get_measured_perf);
+MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/cpu/cpufreq/mperf.h b/arch/x86/kernel/cpu/cpufreq/mperf.h
new file mode 100644
index 000000000000..5dbf2950dc22
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/mperf.h
@@ -0,0 +1,9 @@
+/*
+ *  (c) 2010 Advanced Micro Devices, Inc.
+ *  Your use of this code is subject to the terms and conditions of the
+ *  GNU general public license version 2. See "COPYING" or
+ *  http://www.gnu.org/licenses/gpl.html
+ */
+
+unsigned int cpufreq_get_measured_perf(struct cpufreq_policy *policy,
+					unsigned int cpu);
diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
index 7b8a8ba67b07..bd1cac747f67 100644
--- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
+++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
@@ -178,13 +178,8 @@ static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c)
 		}
 	}
 
-	if (c->x86 != 0xF) {
-		if (!cpu_has(c, X86_FEATURE_EST))
-			printk(KERN_WARNING PFX "Unknown CPU. "
-				"Please send an e-mail to "
-				"<cpufreq@vger.kernel.org>\n");
+	if (c->x86 != 0xF)
 		return 0;
-	}
 
 	/* on P-4s, the TSC runs with constant frequency independent whether
 	 * throttling is active or not. */
diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
index ce7cde713e71..4f6f679f2799 100644
--- a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
@@ -110,7 +110,7 @@ struct pcc_cpu {
110 u32 output_offset; 110 u32 output_offset;
111}; 111};
112 112
113static struct pcc_cpu *pcc_cpu_info; 113static struct pcc_cpu __percpu *pcc_cpu_info;
114 114
115static int pcc_cpufreq_verify(struct cpufreq_policy *policy) 115static int pcc_cpufreq_verify(struct cpufreq_policy *policy)
116{ 116{
@@ -397,13 +397,17 @@ static int __init pcc_cpufreq_probe(void)
397 struct pcc_memory_resource *mem_resource; 397 struct pcc_memory_resource *mem_resource;
398 struct pcc_register_resource *reg_resource; 398 struct pcc_register_resource *reg_resource;
399 union acpi_object *out_obj, *member; 399 union acpi_object *out_obj, *member;
400 acpi_handle handle, osc_handle; 400 acpi_handle handle, osc_handle, pcch_handle;
401 int ret = 0; 401 int ret = 0;
402 402
403 status = acpi_get_handle(NULL, "\\_SB", &handle); 403 status = acpi_get_handle(NULL, "\\_SB", &handle);
404 if (ACPI_FAILURE(status)) 404 if (ACPI_FAILURE(status))
405 return -ENODEV; 405 return -ENODEV;
406 406
407 status = acpi_get_handle(handle, "PCCH", &pcch_handle);
408 if (ACPI_FAILURE(status))
409 return -ENODEV;
410
407 status = acpi_get_handle(handle, "_OSC", &osc_handle); 411 status = acpi_get_handle(handle, "_OSC", &osc_handle);
408 if (ACPI_SUCCESS(status)) { 412 if (ACPI_SUCCESS(status)) {
409 ret = pcc_cpufreq_do_osc(&osc_handle); 413 ret = pcc_cpufreq_do_osc(&osc_handle);
@@ -543,13 +547,13 @@ static int pcc_cpufreq_cpu_init(struct cpufreq_policy *policy)
543 547
544 if (!pcch_virt_addr) { 548 if (!pcch_virt_addr) {
545 result = -1; 549 result = -1;
546 goto pcch_null; 550 goto out;
547 } 551 }
548 552
549 result = pcc_get_offset(cpu); 553 result = pcc_get_offset(cpu);
550 if (result) { 554 if (result) {
551 dprintk("init: PCCP evaluation failed\n"); 555 dprintk("init: PCCP evaluation failed\n");
552 goto free; 556 goto out;
553 } 557 }
554 558
555 policy->max = policy->cpuinfo.max_freq = 559 policy->max = policy->cpuinfo.max_freq =
@@ -558,14 +562,15 @@ static int pcc_cpufreq_cpu_init(struct cpufreq_policy *policy)
558 ioread32(&pcch_hdr->minimum_frequency) * 1000; 562 ioread32(&pcch_hdr->minimum_frequency) * 1000;
559 policy->cur = pcc_get_freq(cpu); 563 policy->cur = pcc_get_freq(cpu);
560 564
565 if (!policy->cur) {
566 dprintk("init: Unable to get current CPU frequency\n");
567 result = -EINVAL;
568 goto out;
569 }
570
561 dprintk("init: policy->max is %d, policy->min is %d\n", 571 dprintk("init: policy->max is %d, policy->min is %d\n",
562 policy->max, policy->min); 572 policy->max, policy->min);
563 573out:
564 return 0;
565free:
566 pcc_clear_mapping();
567 free_percpu(pcc_cpu_info);
568pcch_null:
569 return result; 574 return result;
570} 575}
571 576
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
index 9a97116f89e5..4a45fd6e41ba 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
@@ -569,7 +569,7 @@ static int powernow_verify(struct cpufreq_policy *policy)
569 * We will then get the same kind of behaviour already tested under 569 * We will then get the same kind of behaviour already tested under
570 * the "well-known" other OS. 570 * the "well-known" other OS.
571 */ 571 */
572static int __init fixup_sgtc(void) 572static int __cpuinit fixup_sgtc(void)
573{ 573{
574 unsigned int sgtc; 574 unsigned int sgtc;
575 unsigned int m; 575 unsigned int m;
@@ -603,7 +603,7 @@ static unsigned int powernow_get(unsigned int cpu)
603} 603}
604 604
605 605
606static int __init acer_cpufreq_pst(const struct dmi_system_id *d) 606static int __cpuinit acer_cpufreq_pst(const struct dmi_system_id *d)
607{ 607{
608 printk(KERN_WARNING PFX 608 printk(KERN_WARNING PFX
609 "%s laptop with broken PST tables in BIOS detected.\n", 609 "%s laptop with broken PST tables in BIOS detected.\n",
@@ -621,7 +621,7 @@ static int __init acer_cpufreq_pst(const struct dmi_system_id *d)
621 * A BIOS update is all that can save them. 621 * A BIOS update is all that can save them.
622 * Mention this, and disable cpufreq. 622 * Mention this, and disable cpufreq.
623 */ 623 */
624static struct dmi_system_id __initdata powernow_dmi_table[] = { 624static struct dmi_system_id __cpuinitdata powernow_dmi_table[] = {
625 { 625 {
626 .callback = acer_cpufreq_pst, 626 .callback = acer_cpufreq_pst,
627 .ident = "Acer Aspire", 627 .ident = "Acer Aspire",
@@ -633,7 +633,7 @@ static struct dmi_system_id __initdata powernow_dmi_table[] = {
633 { } 633 { }
634}; 634};
635 635
636static int __init powernow_cpu_init(struct cpufreq_policy *policy) 636static int __cpuinit powernow_cpu_init(struct cpufreq_policy *policy)
637{ 637{
638 union msr_fidvidstatus fidvidstatus; 638 union msr_fidvidstatus fidvidstatus;
639 int result; 639 int result;
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index b6215b9798e2..491977baf6c0 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -1,6 +1,5 @@
1
2/* 1/*
3 * (c) 2003-2006 Advanced Micro Devices, Inc. 2 * (c) 2003-2010 Advanced Micro Devices, Inc.
4 * Your use of this code is subject to the terms and conditions of the 3 * Your use of this code is subject to the terms and conditions of the
5 * GNU general public license version 2. See "COPYING" or 4 * GNU general public license version 2. See "COPYING" or
6 * http://www.gnu.org/licenses/gpl.html 5 * http://www.gnu.org/licenses/gpl.html
@@ -10,7 +9,7 @@
10 * Based on the powernow-k7.c module written by Dave Jones. 9 * Based on the powernow-k7.c module written by Dave Jones.
11 * (C) 2003 Dave Jones on behalf of SuSE Labs 10 * (C) 2003 Dave Jones on behalf of SuSE Labs
12 * (C) 2004 Dominik Brodowski <linux@brodo.de> 11 * (C) 2004 Dominik Brodowski <linux@brodo.de>
13 * (C) 2004 Pavel Machek <pavel@suse.cz> 12 * (C) 2004 Pavel Machek <pavel@ucw.cz>
14 * Licensed under the terms of the GNU GPL License version 2. 13 * Licensed under the terms of the GNU GPL License version 2.
15 * Based upon datasheets & sample CPUs kindly provided by AMD. 14 * Based upon datasheets & sample CPUs kindly provided by AMD.
16 * 15 *
@@ -46,6 +45,7 @@
46#define PFX "powernow-k8: " 45#define PFX "powernow-k8: "
47#define VERSION "version 2.20.00" 46#define VERSION "version 2.20.00"
48#include "powernow-k8.h" 47#include "powernow-k8.h"
48#include "mperf.h"
49 49
50/* serialize freq changes */ 50/* serialize freq changes */
51static DEFINE_MUTEX(fidvid_mutex); 51static DEFINE_MUTEX(fidvid_mutex);
@@ -54,6 +54,12 @@ static DEFINE_PER_CPU(struct powernow_k8_data *, powernow_data);
54 54
55static int cpu_family = CPU_OPTERON; 55static int cpu_family = CPU_OPTERON;
56 56
57/* core performance boost */
58static bool cpb_capable, cpb_enabled;
59static struct msr __percpu *msrs;
60
61static struct cpufreq_driver cpufreq_amd64_driver;
62
57#ifndef CONFIG_SMP 63#ifndef CONFIG_SMP
58static inline const struct cpumask *cpu_core_mask(int cpu) 64static inline const struct cpumask *cpu_core_mask(int cpu)
59{ 65{
@@ -800,6 +806,8 @@ static int find_psb_table(struct powernow_k8_data *data)
800 * www.amd.com 806 * www.amd.com
801 */ 807 */
802 printk(KERN_ERR FW_BUG PFX "No PSB or ACPI _PSS objects\n"); 808 printk(KERN_ERR FW_BUG PFX "No PSB or ACPI _PSS objects\n");
809 printk(KERN_ERR PFX "Make sure that your BIOS is up to date"
810 " and Cool'N'Quiet support is enabled in BIOS setup\n");
803 return -ENODEV; 811 return -ENODEV;
804} 812}
805 813
@@ -904,8 +912,8 @@ static int fill_powernow_table_pstate(struct powernow_k8_data *data,
904{ 912{
905 int i; 913 int i;
906 u32 hi = 0, lo = 0; 914 u32 hi = 0, lo = 0;
907 rdmsr(MSR_PSTATE_CUR_LIMIT, hi, lo); 915 rdmsr(MSR_PSTATE_CUR_LIMIT, lo, hi);
908 data->max_hw_pstate = (hi & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT; 916 data->max_hw_pstate = (lo & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT;
909 917
910 for (i = 0; i < data->acpi_data.state_count; i++) { 918 for (i = 0; i < data->acpi_data.state_count; i++) {
911 u32 index; 919 u32 index;
@@ -1017,13 +1025,12 @@ static int get_transition_latency(struct powernow_k8_data *data)
1017 } 1025 }
1018 if (max_latency == 0) { 1026 if (max_latency == 0) {
1019 /* 1027 /*
1020 * Fam 11h always returns 0 as transition latency. 1028 * Fam 11h and later may return 0 as transition latency. This
1021 * This is intended and means "very fast". While cpufreq core 1029 * is intended and means "very fast". While cpufreq core and
1022 * and governors currently can handle that gracefully, better 1030 * governors currently can handle that gracefully, better set it
1023 * set it to 1 to avoid problems in the future. 1031 * to 1 to avoid problems in the future.
1024 * For all others it's a BIOS bug.
1025 */ 1032 */
1026 if (boot_cpu_data.x86 != 0x11) 1033 if (boot_cpu_data.x86 < 0x11)
1027 printk(KERN_ERR FW_WARN PFX "Invalid zero transition " 1034 printk(KERN_ERR FW_WARN PFX "Invalid zero transition "
1028 "latency\n"); 1035 "latency\n");
1029 max_latency = 1; 1036 max_latency = 1;
@@ -1249,6 +1256,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1249 struct powernow_k8_data *data; 1256 struct powernow_k8_data *data;
1250 struct init_on_cpu init_on_cpu; 1257 struct init_on_cpu init_on_cpu;
1251 int rc; 1258 int rc;
1259 struct cpuinfo_x86 *c = &cpu_data(pol->cpu);
1252 1260
1253 if (!cpu_online(pol->cpu)) 1261 if (!cpu_online(pol->cpu))
1254 return -ENODEV; 1262 return -ENODEV;
@@ -1323,6 +1331,10 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1323 return -EINVAL; 1331 return -EINVAL;
1324 } 1332 }
1325 1333
1334 /* Check for APERF/MPERF support in hardware */
1335 if (cpu_has(c, X86_FEATURE_APERFMPERF))
1336 cpufreq_amd64_driver.getavg = cpufreq_get_measured_perf;
1337
1326 cpufreq_frequency_table_get_attr(data->powernow_table, pol->cpu); 1338 cpufreq_frequency_table_get_attr(data->powernow_table, pol->cpu);
1327 1339
1328 if (cpu_family == CPU_HW_PSTATE) 1340 if (cpu_family == CPU_HW_PSTATE)
@@ -1394,8 +1406,77 @@ out:
1394 return khz; 1406 return khz;
1395} 1407}
1396 1408
1409static void _cpb_toggle_msrs(bool t)
1410{
1411 int cpu;
1412
1413 get_online_cpus();
1414
1415 rdmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs);
1416
1417 for_each_cpu(cpu, cpu_online_mask) {
1418 struct msr *reg = per_cpu_ptr(msrs, cpu);
1419 if (t)
1420 reg->l &= ~BIT(25);
1421 else
1422 reg->l |= BIT(25);
1423 }
1424 wrmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs);
1425
1426 put_online_cpus();
1427}
1428
1429/*
1430 * Switch on/off core performance boosting.
1431 *
1432 * 0=disable
1433 * 1=enable.
1434 */
1435static void cpb_toggle(bool t)
1436{
1437 if (!cpb_capable)
1438 return;
1439
1440 if (t && !cpb_enabled) {
1441 cpb_enabled = true;
1442 _cpb_toggle_msrs(t);
1443 printk(KERN_INFO PFX "Core Boosting enabled.\n");
1444 } else if (!t && cpb_enabled) {
1445 cpb_enabled = false;
1446 _cpb_toggle_msrs(t);
1447 printk(KERN_INFO PFX "Core Boosting disabled.\n");
1448 }
1449}
1450
1451static ssize_t store_cpb(struct cpufreq_policy *policy, const char *buf,
1452 size_t count)
1453{
1454 int ret = -EINVAL;
1455 unsigned long val = 0;
1456
1457 ret = strict_strtoul(buf, 10, &val);
1458 if (!ret && (val == 0 || val == 1) && cpb_capable)
1459 cpb_toggle(val);
1460 else
1461 return -EINVAL;
1462
1463 return count;
1464}
1465
1466static ssize_t show_cpb(struct cpufreq_policy *policy, char *buf)
1467{
1468 return sprintf(buf, "%u\n", cpb_enabled);
1469}
1470
1471#define define_one_rw(_name) \
1472static struct freq_attr _name = \
1473__ATTR(_name, 0644, show_##_name, store_##_name)
1474
1475define_one_rw(cpb);
1476
1397static struct freq_attr *powernow_k8_attr[] = { 1477static struct freq_attr *powernow_k8_attr[] = {
1398 &cpufreq_freq_attr_scaling_available_freqs, 1478 &cpufreq_freq_attr_scaling_available_freqs,
1479 &cpb,
1399 NULL, 1480 NULL,
1400}; 1481};
1401 1482
@@ -1411,10 +1492,51 @@ static struct cpufreq_driver cpufreq_amd64_driver = {
1411 .attr = powernow_k8_attr, 1492 .attr = powernow_k8_attr,
1412}; 1493};
1413 1494
1495/*
1496 * Clear the boost-disable flag on the CPU_DOWN path so that this cpu
1497 * cannot block the remaining ones from boosting. On the CPU_UP path we
1498 * simply keep the boost-disable flag in sync with the current global
1499 * state.
1500 */
1501static int cpb_notify(struct notifier_block *nb, unsigned long action,
1502 void *hcpu)
1503{
1504 unsigned cpu = (long)hcpu;
1505 u32 lo, hi;
1506
1507 switch (action) {
1508 case CPU_UP_PREPARE:
1509 case CPU_UP_PREPARE_FROZEN:
1510
1511 if (!cpb_enabled) {
1512 rdmsr_on_cpu(cpu, MSR_K7_HWCR, &lo, &hi);
1513 lo |= BIT(25);
1514 wrmsr_on_cpu(cpu, MSR_K7_HWCR, lo, hi);
1515 }
1516 break;
1517
1518 case CPU_DOWN_PREPARE:
1519 case CPU_DOWN_PREPARE_FROZEN:
1520 rdmsr_on_cpu(cpu, MSR_K7_HWCR, &lo, &hi);
1521 lo &= ~BIT(25);
1522 wrmsr_on_cpu(cpu, MSR_K7_HWCR, lo, hi);
1523 break;
1524
1525 default:
1526 break;
1527 }
1528
1529 return NOTIFY_OK;
1530}
1531
1532static struct notifier_block cpb_nb = {
1533 .notifier_call = cpb_notify,
1534};
1535
1414/* driver entry point for init */ 1536/* driver entry point for init */
1415static int __cpuinit powernowk8_init(void) 1537static int __cpuinit powernowk8_init(void)
1416{ 1538{
1417 unsigned int i, supported_cpus = 0; 1539 unsigned int i, supported_cpus = 0, cpu;
1418 1540
1419 for_each_online_cpu(i) { 1541 for_each_online_cpu(i) {
1420 int rc; 1542 int rc;
@@ -1423,15 +1545,36 @@ static int __cpuinit powernowk8_init(void)
1423 supported_cpus++; 1545 supported_cpus++;
1424 } 1546 }
1425 1547
1426 if (supported_cpus == num_online_cpus()) { 1548 if (supported_cpus != num_online_cpus())
1427 printk(KERN_INFO PFX "Found %d %s " 1549 return -ENODEV;
1428 "processors (%d cpu cores) (" VERSION ")\n", 1550
1429 num_online_nodes(), 1551 printk(KERN_INFO PFX "Found %d %s (%d cpu cores) (" VERSION ")\n",
1430 boot_cpu_data.x86_model_id, supported_cpus); 1552 num_online_nodes(), boot_cpu_data.x86_model_id, supported_cpus);
1431 return cpufreq_register_driver(&cpufreq_amd64_driver); 1553
1554 if (boot_cpu_has(X86_FEATURE_CPB)) {
1555
1556 cpb_capable = true;
1557
1558 register_cpu_notifier(&cpb_nb);
1559
1560 msrs = msrs_alloc();
1561 if (!msrs) {
1562 printk(KERN_ERR "%s: Error allocating msrs!\n", __func__);
1563 return -ENOMEM;
1564 }
1565
1566 rdmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs);
1567
1568 for_each_cpu(cpu, cpu_online_mask) {
1569 struct msr *reg = per_cpu_ptr(msrs, cpu);
1570 cpb_enabled |= !(!!(reg->l & BIT(25)));
1571 }
1572
1573 printk(KERN_INFO PFX "Core Performance Boosting: %s.\n",
1574 (cpb_enabled ? "on" : "off"));
1432 } 1575 }
1433 1576
1434 return -ENODEV; 1577 return cpufreq_register_driver(&cpufreq_amd64_driver);
1435} 1578}
1436 1579
1437/* driver entry point for term */ 1580/* driver entry point for term */
@@ -1439,6 +1582,13 @@ static void __exit powernowk8_exit(void)
1439{ 1582{
1440 dprintk("exit\n"); 1583 dprintk("exit\n");
1441 1584
1585 if (boot_cpu_has(X86_FEATURE_CPB)) {
1586 msrs_free(msrs);
1587 msrs = NULL;
1588
1589 unregister_cpu_notifier(&cpb_nb);
1590 }
1591
1442 cpufreq_unregister_driver(&cpufreq_amd64_driver); 1592 cpufreq_unregister_driver(&cpufreq_amd64_driver);
1443} 1593}
1444 1594
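The core performance boost (CPB) support added above keys off bit 25 of MSR C001_0015 (MSR_K7_HWCR): when the bit is set, boosting is disabled on that core, and the new per-policy "cpb" cpufreq attribute simply flips it on every online core. A condensed sketch of that mapping, mirroring _cpb_toggle_msrs() for a single core (the BOOST_DIS name exists only for this sketch):

    #define BOOST_DIS	BIT(25)		/* MSR_K7_HWCR: 1 = core boost disabled */

    rdmsrl(MSR_K7_HWCR, val);
    if (enable)
    	val &= ~BOOST_DIS;
    else
    	val |= BOOST_DIS;
    wrmsrl(MSR_K7_HWCR, val);

Once the driver registers, the attribute should appear as the "cpb" file under each policy's cpufreq sysfs directory, taking 0 or 1.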
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
index 02ce824073cb..df3529b1c02d 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
@@ -5,7 +5,6 @@
 * http://www.gnu.org/licenses/gpl.html
 */
 
-
 enum pstate {
 	HW_PSTATE_INVALID = 0xff,
 	HW_PSTATE_0 = 0,
@@ -55,7 +54,6 @@ struct powernow_k8_data {
 	struct cpumask *available_cores;
 };
 
-
 /* processor's cpuid instruction support */
 #define CPUID_PROCESSOR_SIGNATURE	1	/* function 1 */
 #define CPUID_XFAM			0x0ff00000	/* extended family */
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index 08be922de33a..8095f8611f8a 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -21,37 +21,58 @@
21 * 21 *
22 */ 22 */
23 23
24#include <linux/module.h>
24#include <asm/processor.h> 25#include <asm/processor.h>
25#include <asm/vmware.h>
26#include <asm/hypervisor.h> 26#include <asm/hypervisor.h>
27 27
28static inline void __cpuinit 28/*
29detect_hypervisor_vendor(struct cpuinfo_x86 *c) 29 * Hypervisor detect order. This is specified explicitly here because
30 * some hypervisors might implement compatibility modes for other
31 * hypervisors and therefore need to be detected in specific sequence.
32 */
33static const __initconst struct hypervisor_x86 * const hypervisors[] =
30{ 34{
31 if (vmware_platform()) 35 &x86_hyper_vmware,
32 c->x86_hyper_vendor = X86_HYPER_VENDOR_VMWARE; 36 &x86_hyper_ms_hyperv,
33 else 37#ifdef CONFIG_XEN_PVHVM
34 c->x86_hyper_vendor = X86_HYPER_VENDOR_NONE; 38 &x86_hyper_xen_hvm,
35} 39#endif
40};
36 41
37static inline void __cpuinit 42const struct hypervisor_x86 *x86_hyper;
38hypervisor_set_feature_bits(struct cpuinfo_x86 *c) 43EXPORT_SYMBOL(x86_hyper);
44
45static inline void __init
46detect_hypervisor_vendor(void)
39{ 47{
40 if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE) { 48 const struct hypervisor_x86 *h, * const *p;
41 vmware_set_feature_bits(c); 49
42 return; 50 for (p = hypervisors; p < hypervisors + ARRAY_SIZE(hypervisors); p++) {
51 h = *p;
52 if (h->detect()) {
53 x86_hyper = h;
54 printk(KERN_INFO "Hypervisor detected: %s\n", h->name);
55 break;
56 }
43 } 57 }
44} 58}
45 59
46void __cpuinit init_hypervisor(struct cpuinfo_x86 *c) 60void __cpuinit init_hypervisor(struct cpuinfo_x86 *c)
47{ 61{
48 detect_hypervisor_vendor(c); 62 if (x86_hyper && x86_hyper->set_cpu_features)
49 hypervisor_set_feature_bits(c); 63 x86_hyper->set_cpu_features(c);
50} 64}
51 65
52void __init init_hypervisor_platform(void) 66void __init init_hypervisor_platform(void)
53{ 67{
68
69 detect_hypervisor_vendor();
70
71 if (!x86_hyper)
72 return;
73
54 init_hypervisor(&boot_cpu_data); 74 init_hypervisor(&boot_cpu_data);
55 if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE) 75
56 vmware_platform_setup(); 76 if (x86_hyper->init_platform)
77 x86_hyper->init_platform();
57} 78}
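With the table-driven detection above, each supported hypervisor supplies a struct hypervisor_x86 descriptor (a name, a detect() callback, and optional set_cpu_features/init_platform hooks) and is listed in hypervisors[] in priority order. A sketch of what such a descriptor looks like; the "example" entry is hypothetical and only the field names visible in the hunk above are assumed:

    static bool __init example_hv_detect(void)
    {
    	/* typically a CPUID-leaf or DMI probe */
    	return false;
    }

    const struct hypervisor_x86 x86_hyper_example = {
    	.name	= "Example Hypervisor",
    	.detect	= example_hv_detect,
    	/* .set_cpu_features and .init_platform are optional */
    };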
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 1366c7cfd483..b4389441efbb 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -12,7 +12,6 @@
12#include <asm/processor.h> 12#include <asm/processor.h>
13#include <asm/pgtable.h> 13#include <asm/pgtable.h>
14#include <asm/msr.h> 14#include <asm/msr.h>
15#include <asm/ds.h>
16#include <asm/bugs.h> 15#include <asm/bugs.h>
17#include <asm/cpu.h> 16#include <asm/cpu.h>
18 17
@@ -40,6 +39,7 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
40 misc_enable &= ~MSR_IA32_MISC_ENABLE_LIMIT_CPUID; 39 misc_enable &= ~MSR_IA32_MISC_ENABLE_LIMIT_CPUID;
41 wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable); 40 wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
42 c->cpuid_level = cpuid_eax(0); 41 c->cpuid_level = cpuid_eax(0);
42 get_cpu_cap(c);
43 } 43 }
44 } 44 }
45 45
@@ -373,12 +373,6 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
373 set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); 373 set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
374 } 374 }
375 375
376 if (c->cpuid_level > 6) {
377 unsigned ecx = cpuid_ecx(6);
378 if (ecx & 0x01)
379 set_cpu_cap(c, X86_FEATURE_APERFMPERF);
380 }
381
382 if (cpu_has_xmm2) 376 if (cpu_has_xmm2)
383 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); 377 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
384 if (cpu_has_ds) { 378 if (cpu_has_ds) {
@@ -388,7 +382,6 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
388 set_cpu_cap(c, X86_FEATURE_BTS); 382 set_cpu_cap(c, X86_FEATURE_BTS);
389 if (!(l1 & (1<<12))) 383 if (!(l1 & (1<<12)))
390 set_cpu_cap(c, X86_FEATURE_PEBS); 384 set_cpu_cap(c, X86_FEATURE_PEBS);
391 ds_init_intel(c);
392 } 385 }
393 386
394 if (c->x86 == 6 && c->x86_model == 29 && cpu_has_clflush) 387 if (c->x86 == 6 && c->x86_model == 29 && cpu_has_clflush)
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 94d8e475744c..3fec7d9bfd62 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -148,13 +148,19 @@ union _cpuid4_leaf_ecx {
148 u32 full; 148 u32 full;
149}; 149};
150 150
151struct amd_l3_cache {
152 struct pci_dev *dev;
153 bool can_disable;
154 unsigned indices;
155 u8 subcaches[4];
156};
157
151struct _cpuid4_info { 158struct _cpuid4_info {
152 union _cpuid4_leaf_eax eax; 159 union _cpuid4_leaf_eax eax;
153 union _cpuid4_leaf_ebx ebx; 160 union _cpuid4_leaf_ebx ebx;
154 union _cpuid4_leaf_ecx ecx; 161 union _cpuid4_leaf_ecx ecx;
155 unsigned long size; 162 unsigned long size;
156 bool can_disable; 163 struct amd_l3_cache *l3;
157 unsigned int l3_indices;
158 DECLARE_BITMAP(shared_cpu_map, NR_CPUS); 164 DECLARE_BITMAP(shared_cpu_map, NR_CPUS);
159}; 165};
160 166
@@ -164,8 +170,7 @@ struct _cpuid4_info_regs {
164 union _cpuid4_leaf_ebx ebx; 170 union _cpuid4_leaf_ebx ebx;
165 union _cpuid4_leaf_ecx ecx; 171 union _cpuid4_leaf_ecx ecx;
166 unsigned long size; 172 unsigned long size;
167 bool can_disable; 173 struct amd_l3_cache *l3;
168 unsigned int l3_indices;
169}; 174};
170 175
171unsigned short num_cache_leaves; 176unsigned short num_cache_leaves;
@@ -302,124 +307,246 @@ struct _cache_attr {
302}; 307};
303 308
304#ifdef CONFIG_CPU_SUP_AMD 309#ifdef CONFIG_CPU_SUP_AMD
305static unsigned int __cpuinit amd_calc_l3_indices(void) 310
311/*
312 * L3 cache descriptors
313 */
314static struct amd_l3_cache **__cpuinitdata l3_caches;
315
316static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3)
306{ 317{
307 /*
308 * We're called over smp_call_function_single() and therefore
309 * are on the correct cpu.
310 */
311 int cpu = smp_processor_id();
312 int node = cpu_to_node(cpu);
313 struct pci_dev *dev = node_to_k8_nb_misc(node);
314 unsigned int sc0, sc1, sc2, sc3; 318 unsigned int sc0, sc1, sc2, sc3;
315 u32 val = 0; 319 u32 val = 0;
316 320
317 pci_read_config_dword(dev, 0x1C4, &val); 321 pci_read_config_dword(l3->dev, 0x1C4, &val);
318 322
319 /* calculate subcache sizes */ 323 /* calculate subcache sizes */
320 sc0 = !(val & BIT(0)); 324 l3->subcaches[0] = sc0 = !(val & BIT(0));
321 sc1 = !(val & BIT(4)); 325 l3->subcaches[1] = sc1 = !(val & BIT(4));
322 sc2 = !(val & BIT(8)) + !(val & BIT(9)); 326 l3->subcaches[2] = sc2 = !(val & BIT(8)) + !(val & BIT(9));
323 sc3 = !(val & BIT(12)) + !(val & BIT(13)); 327 l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13));
324 328
325 return (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1; 329 l3->indices = (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1;
326} 330}
327 331
328static void __cpuinit 332static struct amd_l3_cache * __cpuinit amd_init_l3_cache(int node)
329amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
330{ 333{
331 if (index < 3) 334 struct amd_l3_cache *l3;
335 struct pci_dev *dev = node_to_k8_nb_misc(node);
336
337 l3 = kzalloc(sizeof(struct amd_l3_cache), GFP_ATOMIC);
338 if (!l3) {
339 printk(KERN_WARNING "Error allocating L3 struct\n");
340 return NULL;
341 }
342
343 l3->dev = dev;
344
345 amd_calc_l3_indices(l3);
346
347 return l3;
348}
349
350static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf,
351 int index)
352{
353 int node;
354
355 if (boot_cpu_data.x86 != 0x10)
332 return; 356 return;
333 357
334 if (boot_cpu_data.x86 == 0x11) 358 if (index < 3)
335 return; 359 return;
336 360
337 /* see errata #382 and #388 */ 361 /* see errata #382 and #388 */
338 if ((boot_cpu_data.x86 == 0x10) && 362 if (boot_cpu_data.x86_model < 0x8)
339 ((boot_cpu_data.x86_model < 0x8) ||
340 (boot_cpu_data.x86_mask < 0x1)))
341 return; 363 return;
342 364
365 if ((boot_cpu_data.x86_model == 0x8 ||
366 boot_cpu_data.x86_model == 0x9)
367 &&
368 boot_cpu_data.x86_mask < 0x1)
369 return;
370
343 /* not in virtualized environments */ 371 /* not in virtualized environments */
344 if (num_k8_northbridges == 0) 372 if (num_k8_northbridges == 0)
345 return; 373 return;
346 374
347 this_leaf->can_disable = true; 375 /*
348 this_leaf->l3_indices = amd_calc_l3_indices(); 376 * Strictly speaking, the amount in @size below is leaked since it is
377 * never freed but this is done only on shutdown so it doesn't matter.
378 */
379 if (!l3_caches) {
380 int size = num_k8_northbridges * sizeof(struct amd_l3_cache *);
381
382 l3_caches = kzalloc(size, GFP_ATOMIC);
383 if (!l3_caches)
384 return;
385 }
386
387 node = amd_get_nb_id(smp_processor_id());
388
389 if (!l3_caches[node]) {
390 l3_caches[node] = amd_init_l3_cache(node);
391 l3_caches[node]->can_disable = true;
392 }
393
394 WARN_ON(!l3_caches[node]);
395
396 this_leaf->l3 = l3_caches[node];
349} 397}
350 398
351static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, 399/*
352 unsigned int index) 400 * check whether a slot used for disabling an L3 index is occupied.
401 * @l3: L3 cache descriptor
402 * @slot: slot number (0..1)
403 *
 404 * @returns: the disabled index if the slot is in use, or a negative value if it is free.
405 */
406int amd_get_l3_disable_slot(struct amd_l3_cache *l3, unsigned slot)
353{ 407{
354 int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
355 int node = amd_get_nb_id(cpu);
356 struct pci_dev *dev = node_to_k8_nb_misc(node);
357 unsigned int reg = 0; 408 unsigned int reg = 0;
358 409
359 if (!this_leaf->can_disable) 410 pci_read_config_dword(l3->dev, 0x1BC + slot * 4, &reg);
360 return -EINVAL; 411
412 /* check whether this slot is activated already */
413 if (reg & (3UL << 30))
414 return reg & 0xfff;
415
416 return -1;
417}
418
419static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
420 unsigned int slot)
421{
422 int index;
361 423
362 if (!dev) 424 if (!this_leaf->l3 || !this_leaf->l3->can_disable)
363 return -EINVAL; 425 return -EINVAL;
364 426
365 pci_read_config_dword(dev, 0x1BC + index * 4, &reg); 427 index = amd_get_l3_disable_slot(this_leaf->l3, slot);
366 return sprintf(buf, "0x%08x\n", reg); 428 if (index >= 0)
429 return sprintf(buf, "%d\n", index);
430
431 return sprintf(buf, "FREE\n");
367} 432}
368 433
369#define SHOW_CACHE_DISABLE(index) \ 434#define SHOW_CACHE_DISABLE(slot) \
370static ssize_t \ 435static ssize_t \
371show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf) \ 436show_cache_disable_##slot(struct _cpuid4_info *this_leaf, char *buf) \
372{ \ 437{ \
373 return show_cache_disable(this_leaf, buf, index); \ 438 return show_cache_disable(this_leaf, buf, slot); \
374} 439}
375SHOW_CACHE_DISABLE(0) 440SHOW_CACHE_DISABLE(0)
376SHOW_CACHE_DISABLE(1) 441SHOW_CACHE_DISABLE(1)
377 442
378static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, 443static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu,
379 const char *buf, size_t count, unsigned int index) 444 unsigned slot, unsigned long idx)
380{ 445{
381 int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); 446 int i;
382 int node = amd_get_nb_id(cpu); 447
383 struct pci_dev *dev = node_to_k8_nb_misc(node); 448 idx |= BIT(30);
384 unsigned long val = 0; 449
450 /*
451 * disable index in all 4 subcaches
452 */
453 for (i = 0; i < 4; i++) {
454 u32 reg = idx | (i << 20);
455
456 if (!l3->subcaches[i])
457 continue;
458
459 pci_write_config_dword(l3->dev, 0x1BC + slot * 4, reg);
460
461 /*
462 * We need to WBINVD on a core on the node containing the L3
 463 * cache whose indices we disable, therefore a simple wbinvd()
464 * is not sufficient.
465 */
466 wbinvd_on_cpu(cpu);
467
468 reg |= BIT(31);
469 pci_write_config_dword(l3->dev, 0x1BC + slot * 4, reg);
470 }
471}
472
473/*
474 * disable a L3 cache index by using a disable-slot
475 *
476 * @l3: L3 cache descriptor
477 * @cpu: A CPU on the node containing the L3 cache
478 * @slot: slot number (0..1)
479 * @index: index to disable
480 *
481 * @return: 0 on success, error status on failure
482 */
483int amd_set_l3_disable_slot(struct amd_l3_cache *l3, int cpu, unsigned slot,
484 unsigned long index)
485{
486 int ret = 0;
385 487
386#define SUBCACHE_MASK (3UL << 20) 488#define SUBCACHE_MASK (3UL << 20)
387#define SUBCACHE_INDEX 0xfff 489#define SUBCACHE_INDEX 0xfff
388 490
389 if (!this_leaf->can_disable) 491 /*
492 * check whether this slot is already used or
493 * the index is already disabled
494 */
495 ret = amd_get_l3_disable_slot(l3, slot);
496 if (ret >= 0)
390 return -EINVAL; 497 return -EINVAL;
391 498
499 /*
500 * check whether the other slot has disabled the
501 * same index already
502 */
503 if (index == amd_get_l3_disable_slot(l3, !slot))
504 return -EINVAL;
505
506 /* do not allow writes outside of allowed bits */
507 if ((index & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) ||
508 ((index & SUBCACHE_INDEX) > l3->indices))
509 return -EINVAL;
510
511 amd_l3_disable_index(l3, cpu, slot, index);
512
513 return 0;
514}
515
516static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
517 const char *buf, size_t count,
518 unsigned int slot)
519{
520 unsigned long val = 0;
521 int cpu, err = 0;
522
392 if (!capable(CAP_SYS_ADMIN)) 523 if (!capable(CAP_SYS_ADMIN))
393 return -EPERM; 524 return -EPERM;
394 525
395 if (!dev) 526 if (!this_leaf->l3 || !this_leaf->l3->can_disable)
396 return -EINVAL; 527 return -EINVAL;
397 528
398 if (strict_strtoul(buf, 10, &val) < 0) 529 cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
399 return -EINVAL;
400 530
401 /* do not allow writes outside of allowed bits */ 531 if (strict_strtoul(buf, 10, &val) < 0)
402 if ((val & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) ||
403 ((val & SUBCACHE_INDEX) > this_leaf->l3_indices))
404 return -EINVAL; 532 return -EINVAL;
405 533
406 val |= BIT(30); 534 err = amd_set_l3_disable_slot(this_leaf->l3, cpu, slot, val);
407 pci_write_config_dword(dev, 0x1BC + index * 4, val); 535 if (err) {
408 /* 536 if (err == -EEXIST)
409 * We need to WBINVD on a core on the node containing the L3 cache which 537 printk(KERN_WARNING "L3 disable slot %d in use!\n",
410 * indices we disable therefore a simple wbinvd() is not sufficient. 538 slot);
411 */ 539 return err;
412 wbinvd_on_cpu(cpu); 540 }
413 pci_write_config_dword(dev, 0x1BC + index * 4, val | BIT(31));
414 return count; 541 return count;
415} 542}
416 543
417#define STORE_CACHE_DISABLE(index) \ 544#define STORE_CACHE_DISABLE(slot) \
418static ssize_t \ 545static ssize_t \
419store_cache_disable_##index(struct _cpuid4_info *this_leaf, \ 546store_cache_disable_##slot(struct _cpuid4_info *this_leaf, \
420 const char *buf, size_t count) \ 547 const char *buf, size_t count) \
421{ \ 548{ \
422 return store_cache_disable(this_leaf, buf, count, index); \ 549 return store_cache_disable(this_leaf, buf, count, slot); \
423} 550}
424STORE_CACHE_DISABLE(0) 551STORE_CACHE_DISABLE(0)
425STORE_CACHE_DISABLE(1) 552STORE_CACHE_DISABLE(1)
@@ -431,7 +558,7 @@ static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644,
431 558
432#else /* CONFIG_CPU_SUP_AMD */ 559#else /* CONFIG_CPU_SUP_AMD */
433static void __cpuinit 560static void __cpuinit
434amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) 561amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, int index)
435{ 562{
436}; 563};
437#endif /* CONFIG_CPU_SUP_AMD */ 564#endif /* CONFIG_CPU_SUP_AMD */
@@ -447,8 +574,7 @@ __cpuinit cpuid4_cache_lookup_regs(int index,
447 574
448 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { 575 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
449 amd_cpuid4(index, &eax, &ebx, &ecx); 576 amd_cpuid4(index, &eax, &ebx, &ecx);
450 if (boot_cpu_data.x86 >= 0x10) 577 amd_check_l3_disable(this_leaf, index);
451 amd_check_l3_disable(index, this_leaf);
452 } else { 578 } else {
453 cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); 579 cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx);
454 } 580 }
@@ -722,6 +848,7 @@ static void __cpuinit free_cache_attributes(unsigned int cpu)
722 for (i = 0; i < num_cache_leaves; i++) 848 for (i = 0; i < num_cache_leaves; i++)
723 cache_remove_shared_cpu_map(cpu, i); 849 cache_remove_shared_cpu_map(cpu, i);
724 850
851 kfree(per_cpu(ici_cpuid4_info, cpu)->l3);
725 kfree(per_cpu(ici_cpuid4_info, cpu)); 852 kfree(per_cpu(ici_cpuid4_info, cpu));
726 per_cpu(ici_cpuid4_info, cpu) = NULL; 853 per_cpu(ici_cpuid4_info, cpu) = NULL;
727} 854}
@@ -1006,7 +1133,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
1006 1133
1007 this_leaf = CPUID4_INFO_IDX(cpu, i); 1134 this_leaf = CPUID4_INFO_IDX(cpu, i);
1008 1135
1009 if (this_leaf->can_disable) 1136 if (this_leaf->l3 && this_leaf->l3->can_disable)
1010 ktype_cache.default_attrs = default_l3_attrs; 1137 ktype_cache.default_attrs = default_l3_attrs;
1011 else 1138 else
1012 ktype_cache.default_attrs = default_attrs; 1139 ktype_cache.default_attrs = default_attrs;
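The new amd_get_l3_disable_slot()/amd_set_l3_disable_slot() helpers above revolve around one register per disable slot at offset 0x1BC + slot*4: bits 30/31 mark the slot as active, bits 0-11 carry the disabled index, and bits 20-21 select the subcache. A small stand-alone sketch of that encoding (plain variables instead of PCI config accesses, names illustrative):

#include <stdint.h>
#include <stdio.h>

#define SUBCACHE_MASK   (3UL << 20)
#define SUBCACHE_INDEX  0xfffUL

/* Return the disabled index if the slot register says "in use", else -1. */
static long l3_slot_index(uint32_t reg)
{
	if (reg & (3UL << 30))
		return reg & SUBCACHE_INDEX;
	return -1;
}

/* Build the value written to the slot register for one subcache. */
static uint32_t l3_disable_value(unsigned long index, unsigned subcache)
{
	return (uint32_t)(index | ((unsigned long)subcache << 20) | (1UL << 30));
}

int main(void)
{
	uint32_t reg = l3_disable_value(0x42, 1);

	printf("slot value 0x%08x disables index %ld\n", reg, l3_slot_index(reg));
	return 0;
}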
diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile
index 4ac6d48fe11b..bb34b03af252 100644
--- a/arch/x86/kernel/cpu/mcheck/Makefile
+++ b/arch/x86/kernel/cpu/mcheck/Makefile
@@ -7,3 +7,5 @@ obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
7obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o 7obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o
8 8
9obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o 9obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o
10
11obj-$(CONFIG_ACPI_APEI) += mce-apei.o
diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c
new file mode 100644
index 000000000000..8209472b27a5
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c
@@ -0,0 +1,138 @@
1/*
2 * Bridge between MCE and APEI
3 *
 4 * On some machines, corrected memory errors are reported via the APEI
 5 * generic hardware error source (GHES) instead of a corrected Machine
 6 * Check. These corrected memory errors can be reported to user space
 7 * through /dev/mcelog by faking a corrected Machine Check, so that
 8 * the erroneous memory page can be offlined by /sbin/mcelog once the
 9 * error count for that page exceeds the threshold.
10 *
11 * For fatal MCE, save MCE record into persistent storage via ERST, so
12 * that the MCE record can be logged after reboot via ERST.
13 *
14 * Copyright 2010 Intel Corp.
15 * Author: Huang Ying <ying.huang@intel.com>
16 *
17 * This program is free software; you can redistribute it and/or
18 * modify it under the terms of the GNU General Public License version
19 * 2 as published by the Free Software Foundation.
20 *
21 * This program is distributed in the hope that it will be useful,
22 * but WITHOUT ANY WARRANTY; without even the implied warranty of
23 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24 * GNU General Public License for more details.
25 *
26 * You should have received a copy of the GNU General Public License
27 * along with this program; if not, write to the Free Software
28 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
29 */
30
31#include <linux/kernel.h>
32#include <linux/acpi.h>
33#include <linux/cper.h>
34#include <acpi/apei.h>
35#include <asm/mce.h>
36
37#include "mce-internal.h"
38
39void apei_mce_report_mem_error(int corrected, struct cper_sec_mem_err *mem_err)
40{
41 struct mce m;
42
43 /* Only corrected MC is reported */
44 if (!corrected)
45 return;
46
47 mce_setup(&m);
48 m.bank = 1;
49 /* Fake a memory read corrected error with unknown channel */
50 m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | 0x9f;
51 m.addr = mem_err->physical_addr;
52 mce_log(&m);
53 mce_notify_irq();
54}
55EXPORT_SYMBOL_GPL(apei_mce_report_mem_error);
56
57#define CPER_CREATOR_MCE \
58 UUID_LE(0x75a574e3, 0x5052, 0x4b29, 0x8a, 0x8e, 0xbe, 0x2c, \
59 0x64, 0x90, 0xb8, 0x9d)
60#define CPER_SECTION_TYPE_MCE \
61 UUID_LE(0xfe08ffbe, 0x95e4, 0x4be7, 0xbc, 0x73, 0x40, 0x96, \
62 0x04, 0x4a, 0x38, 0xfc)
63
64/*
 65 * The CPER specification (UEFI specification 2.3, appendix N) requires
 66 * these records to be byte-packed.
67 */
68struct cper_mce_record {
69 struct cper_record_header hdr;
70 struct cper_section_descriptor sec_hdr;
71 struct mce mce;
72} __packed;
73
74int apei_write_mce(struct mce *m)
75{
76 struct cper_mce_record rcd;
77
78 memset(&rcd, 0, sizeof(rcd));
79 memcpy(rcd.hdr.signature, CPER_SIG_RECORD, CPER_SIG_SIZE);
80 rcd.hdr.revision = CPER_RECORD_REV;
81 rcd.hdr.signature_end = CPER_SIG_END;
82 rcd.hdr.section_count = 1;
83 rcd.hdr.error_severity = CPER_SEV_FATAL;
84 /* timestamp, platform_id, partition_id are all invalid */
85 rcd.hdr.validation_bits = 0;
86 rcd.hdr.record_length = sizeof(rcd);
87 rcd.hdr.creator_id = CPER_CREATOR_MCE;
88 rcd.hdr.notification_type = CPER_NOTIFY_MCE;
89 rcd.hdr.record_id = cper_next_record_id();
90 rcd.hdr.flags = CPER_HW_ERROR_FLAGS_PREVERR;
91
92 rcd.sec_hdr.section_offset = (void *)&rcd.mce - (void *)&rcd;
93 rcd.sec_hdr.section_length = sizeof(rcd.mce);
94 rcd.sec_hdr.revision = CPER_SEC_REV;
 95 /* fru_id and fru_text are invalid */
96 rcd.sec_hdr.validation_bits = 0;
97 rcd.sec_hdr.flags = CPER_SEC_PRIMARY;
98 rcd.sec_hdr.section_type = CPER_SECTION_TYPE_MCE;
99 rcd.sec_hdr.section_severity = CPER_SEV_FATAL;
100
101 memcpy(&rcd.mce, m, sizeof(*m));
102
103 return erst_write(&rcd.hdr);
104}
105
106ssize_t apei_read_mce(struct mce *m, u64 *record_id)
107{
108 struct cper_mce_record rcd;
109 ssize_t len;
110
111 len = erst_read_next(&rcd.hdr, sizeof(rcd));
112 if (len <= 0)
113 return len;
 114 /* Cannot skip other records in ERST storage without clearing them */
115 else if (len != sizeof(rcd) ||
116 uuid_le_cmp(rcd.hdr.creator_id, CPER_CREATOR_MCE)) {
117 if (printk_ratelimit())
118 pr_warning(
119 "MCE-APEI: Can not skip the unknown record in ERST");
120 return -EIO;
121 }
122
123 memcpy(m, &rcd.mce, sizeof(*m));
124 *record_id = rcd.hdr.record_id;
125
126 return sizeof(*m);
127}
128
 129/* Check whether there are records in ERST */
130int apei_check_mce(void)
131{
132 return erst_get_record_count();
133}
134
135int apei_clear_mce(u64 record_id)
136{
137 return erst_clear(record_id);
138}
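apei_write_mce() above wraps a struct mce in a CPER record: a record header, one section descriptor, then the raw payload, all byte-packed. A stripped-down sketch of that layering (a hypothetical field subset only, not the full CPER layout):

#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <stdio.h>

struct cper_hdr {              /* hypothetical subset of the CPER record header */
	char     signature[4];
	uint16_t section_count;
	uint32_t record_length;
} __attribute__((packed));

struct cper_sec {              /* hypothetical subset of a section descriptor */
	uint32_t section_offset;
	uint32_t section_length;
} __attribute__((packed));

struct mce_payload {           /* stand-in for struct mce */
	uint64_t status;
	uint64_t addr;
} __attribute__((packed));

struct cper_mce_record {
	struct cper_hdr    hdr;
	struct cper_sec    sec_hdr;
	struct mce_payload mce;
} __attribute__((packed));

int main(void)
{
	struct cper_mce_record rcd;
	struct mce_payload m = { .status = 0x9f, .addr = 0x1000 };

	memset(&rcd, 0, sizeof(rcd));
	memcpy(rcd.hdr.signature, "CPER", 4);
	rcd.hdr.section_count  = 1;
	rcd.hdr.record_length  = sizeof(rcd);
	rcd.sec_hdr.section_offset = offsetof(struct cper_mce_record, mce);
	rcd.sec_hdr.section_length = sizeof(rcd.mce);
	rcd.mce = m;

	printf("record %u bytes, payload at offset %u\n",
	       (unsigned)rcd.hdr.record_length, (unsigned)rcd.sec_hdr.section_offset);
	return 0;
}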
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index 32996f9fab67..fefcc69ee8b5 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -28,3 +28,26 @@ extern int mce_ser;
28 28
29extern struct mce_bank *mce_banks; 29extern struct mce_bank *mce_banks;
30 30
31#ifdef CONFIG_ACPI_APEI
32int apei_write_mce(struct mce *m);
33ssize_t apei_read_mce(struct mce *m, u64 *record_id);
34int apei_check_mce(void);
35int apei_clear_mce(u64 record_id);
36#else
37static inline int apei_write_mce(struct mce *m)
38{
39 return -EINVAL;
40}
41static inline ssize_t apei_read_mce(struct mce *m, u64 *record_id)
42{
43 return 0;
44}
45static inline int apei_check_mce(void)
46{
47 return 0;
48}
49static inline int apei_clear_mce(u64 record_id)
50{
51 return -EINVAL;
52}
53#endif
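The mce-internal.h hunk above follows the usual compile-out idiom: when CONFIG_ACPI_APEI is off, the same function names resolve to static inline stubs returning errors or zero, so callers in mce.c need no #ifdefs of their own. A minimal sketch of that pattern (hypothetical config symbol and function name):

#include <stdio.h>

/* #define CONFIG_MY_FEATURE 1 */          /* flip to use a real implementation */

#ifdef CONFIG_MY_FEATURE
int my_feature_write(int value);           /* real implementation elsewhere */
#else
static inline int my_feature_write(int value)
{
	(void)value;
	return -1;                          /* stands in for -EINVAL */
}
#endif

int main(void)
{
	/* the call site compiles the same way whether the feature is built in or not */
	printf("my_feature_write() -> %d\n", my_feature_write(42));
	return 0;
}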
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 8a6f0afa767e..ed41562909fe 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -36,6 +36,7 @@
36#include <linux/fs.h> 36#include <linux/fs.h>
37#include <linux/mm.h> 37#include <linux/mm.h>
38#include <linux/debugfs.h> 38#include <linux/debugfs.h>
39#include <linux/edac_mce.h>
39 40
40#include <asm/processor.h> 41#include <asm/processor.h>
41#include <asm/hw_irq.h> 42#include <asm/hw_irq.h>
@@ -50,7 +51,7 @@
50static DEFINE_MUTEX(mce_read_mutex); 51static DEFINE_MUTEX(mce_read_mutex);
51 52
52#define rcu_dereference_check_mce(p) \ 53#define rcu_dereference_check_mce(p) \
53 rcu_dereference_check((p), \ 54 rcu_dereference_index_check((p), \
54 rcu_read_lock_sched_held() || \ 55 rcu_read_lock_sched_held() || \
55 lockdep_is_held(&mce_read_mutex)) 56 lockdep_is_held(&mce_read_mutex))
56 57
@@ -106,8 +107,8 @@ EXPORT_SYMBOL_GPL(x86_mce_decoder_chain);
106static int default_decode_mce(struct notifier_block *nb, unsigned long val, 107static int default_decode_mce(struct notifier_block *nb, unsigned long val,
107 void *data) 108 void *data)
108{ 109{
109 pr_emerg("No human readable MCE decoding support on this CPU type.\n"); 110 pr_emerg(HW_ERR "No human readable MCE decoding support on this CPU type.\n");
110 pr_emerg("Run the message through 'mcelog --ascii' to decode.\n"); 111 pr_emerg(HW_ERR "Run the message through 'mcelog --ascii' to decode.\n");
111 112
112 return NOTIFY_STOP; 113 return NOTIFY_STOP;
113} 114}
@@ -169,6 +170,15 @@ void mce_log(struct mce *mce)
169 entry = rcu_dereference_check_mce(mcelog.next); 170 entry = rcu_dereference_check_mce(mcelog.next);
170 for (;;) { 171 for (;;) {
171 /* 172 /*
173 * If edac_mce is enabled, it will check the error type
174 * and will process it, if it is a known error.
175 * Otherwise, the error will be sent through mcelog
176 * interface
177 */
178 if (edac_mce_parse(mce))
179 return;
180
181 /*
172 * When the buffer fills up discard new entries. 182 * When the buffer fills up discard new entries.
173 * Assume that the earlier errors are the more 183 * Assume that the earlier errors are the more
174 * interesting ones: 184 * interesting ones:
@@ -201,11 +211,11 @@ void mce_log(struct mce *mce)
201 211
202static void print_mce(struct mce *m) 212static void print_mce(struct mce *m)
203{ 213{
204 pr_emerg("CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", 214 pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n",
205 m->extcpu, m->mcgstatus, m->bank, m->status); 215 m->extcpu, m->mcgstatus, m->bank, m->status);
206 216
207 if (m->ip) { 217 if (m->ip) {
208 pr_emerg("RIP%s %02x:<%016Lx> ", 218 pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
209 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", 219 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
210 m->cs, m->ip); 220 m->cs, m->ip);
211 221
@@ -214,14 +224,14 @@ static void print_mce(struct mce *m)
214 pr_cont("\n"); 224 pr_cont("\n");
215 } 225 }
216 226
217 pr_emerg("TSC %llx ", m->tsc); 227 pr_emerg(HW_ERR "TSC %llx ", m->tsc);
218 if (m->addr) 228 if (m->addr)
219 pr_cont("ADDR %llx ", m->addr); 229 pr_cont("ADDR %llx ", m->addr);
220 if (m->misc) 230 if (m->misc)
221 pr_cont("MISC %llx ", m->misc); 231 pr_cont("MISC %llx ", m->misc);
222 232
223 pr_cont("\n"); 233 pr_cont("\n");
224 pr_emerg("PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", 234 pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
225 m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid); 235 m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid);
226 236
227 /* 237 /*
@@ -231,16 +241,6 @@ static void print_mce(struct mce *m)
231 atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); 241 atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
232} 242}
233 243
234static void print_mce_head(void)
235{
236 pr_emerg("\nHARDWARE ERROR\n");
237}
238
239static void print_mce_tail(void)
240{
241 pr_emerg("This is not a software problem!\n");
242}
243
244#define PANIC_TIMEOUT 5 /* 5 seconds */ 244#define PANIC_TIMEOUT 5 /* 5 seconds */
245 245
246static atomic_t mce_paniced; 246static atomic_t mce_paniced;
@@ -264,7 +264,7 @@ static void wait_for_panic(void)
264 264
265static void mce_panic(char *msg, struct mce *final, char *exp) 265static void mce_panic(char *msg, struct mce *final, char *exp)
266{ 266{
267 int i; 267 int i, apei_err = 0;
268 268
269 if (!fake_panic) { 269 if (!fake_panic) {
270 /* 270 /*
@@ -281,14 +281,16 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
281 if (atomic_inc_return(&mce_fake_paniced) > 1) 281 if (atomic_inc_return(&mce_fake_paniced) > 1)
282 return; 282 return;
283 } 283 }
284 print_mce_head();
285 /* First print corrected ones that are still unlogged */ 284 /* First print corrected ones that are still unlogged */
286 for (i = 0; i < MCE_LOG_LEN; i++) { 285 for (i = 0; i < MCE_LOG_LEN; i++) {
287 struct mce *m = &mcelog.entry[i]; 286 struct mce *m = &mcelog.entry[i];
288 if (!(m->status & MCI_STATUS_VAL)) 287 if (!(m->status & MCI_STATUS_VAL))
289 continue; 288 continue;
290 if (!(m->status & MCI_STATUS_UC)) 289 if (!(m->status & MCI_STATUS_UC)) {
291 print_mce(m); 290 print_mce(m);
291 if (!apei_err)
292 apei_err = apei_write_mce(m);
293 }
292 } 294 }
293 /* Now print uncorrected but with the final one last */ 295 /* Now print uncorrected but with the final one last */
294 for (i = 0; i < MCE_LOG_LEN; i++) { 296 for (i = 0; i < MCE_LOG_LEN; i++) {
@@ -297,22 +299,27 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
297 continue; 299 continue;
298 if (!(m->status & MCI_STATUS_UC)) 300 if (!(m->status & MCI_STATUS_UC))
299 continue; 301 continue;
300 if (!final || memcmp(m, final, sizeof(struct mce))) 302 if (!final || memcmp(m, final, sizeof(struct mce))) {
301 print_mce(m); 303 print_mce(m);
304 if (!apei_err)
305 apei_err = apei_write_mce(m);
306 }
302 } 307 }
303 if (final) 308 if (final) {
304 print_mce(final); 309 print_mce(final);
310 if (!apei_err)
311 apei_err = apei_write_mce(final);
312 }
305 if (cpu_missing) 313 if (cpu_missing)
306 printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n"); 314 pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
307 print_mce_tail();
308 if (exp) 315 if (exp)
309 printk(KERN_EMERG "Machine check: %s\n", exp); 316 pr_emerg(HW_ERR "Machine check: %s\n", exp);
310 if (!fake_panic) { 317 if (!fake_panic) {
311 if (panic_timeout == 0) 318 if (panic_timeout == 0)
312 panic_timeout = mce_panic_timeout; 319 panic_timeout = mce_panic_timeout;
313 panic(msg); 320 panic(msg);
314 } else 321 } else
315 printk(KERN_EMERG "Fake kernel panic: %s\n", msg); 322 pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
316} 323}
317 324
318/* Support code for software error injection */ 325/* Support code for software error injection */
@@ -539,7 +546,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
539 struct mce m; 546 struct mce m;
540 int i; 547 int i;
541 548
542 __get_cpu_var(mce_poll_count)++; 549 percpu_inc(mce_poll_count);
543 550
544 mce_setup(&m); 551 mce_setup(&m);
545 552
@@ -581,6 +588,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
581 */ 588 */
582 if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) { 589 if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) {
583 mce_log(&m); 590 mce_log(&m);
591 atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, &m);
584 add_taint(TAINT_MACHINE_CHECK); 592 add_taint(TAINT_MACHINE_CHECK);
585 } 593 }
586 594
@@ -934,7 +942,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
934 942
935 atomic_inc(&mce_entry); 943 atomic_inc(&mce_entry);
936 944
937 __get_cpu_var(mce_exception_count)++; 945 percpu_inc(mce_exception_count);
938 946
939 if (notify_die(DIE_NMI, "machine check", regs, error_code, 947 if (notify_die(DIE_NMI, "machine check", regs, error_code,
940 18, SIGKILL) == NOTIFY_STOP) 948 18, SIGKILL) == NOTIFY_STOP)
@@ -1201,7 +1209,7 @@ int mce_notify_irq(void)
1201 schedule_work(&mce_trigger_work); 1209 schedule_work(&mce_trigger_work);
1202 1210
1203 if (__ratelimit(&ratelimit)) 1211 if (__ratelimit(&ratelimit))
1204 printk(KERN_INFO "Machine check events logged\n"); 1212 pr_info(HW_ERR "Machine check events logged\n");
1205 1213
1206 return 1; 1214 return 1;
1207 } 1215 }
@@ -1493,6 +1501,43 @@ static void collect_tscs(void *data)
1493 rdtscll(cpu_tsc[smp_processor_id()]); 1501 rdtscll(cpu_tsc[smp_processor_id()]);
1494} 1502}
1495 1503
1504static int mce_apei_read_done;
1505
 1506/* Collect MCE records of the previous boot from persistent storage via APEI ERST. */
1507static int __mce_read_apei(char __user **ubuf, size_t usize)
1508{
1509 int rc;
1510 u64 record_id;
1511 struct mce m;
1512
1513 if (usize < sizeof(struct mce))
1514 return -EINVAL;
1515
1516 rc = apei_read_mce(&m, &record_id);
1517 /* Error or no more MCE record */
1518 if (rc <= 0) {
1519 mce_apei_read_done = 1;
1520 return rc;
1521 }
1522 rc = -EFAULT;
1523 if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
1524 return rc;
1525 /*
 1526 * Ideally we would clear the record only after it has been
 1527 * flushed to disk or sent over the network by /sbin/mcelog,
 1528 * but we have no interface to support that now,
1529 * so just clear it to avoid duplication.
1530 */
1531 rc = apei_clear_mce(record_id);
1532 if (rc) {
1533 mce_apei_read_done = 1;
1534 return rc;
1535 }
1536 *ubuf += sizeof(struct mce);
1537
1538 return 0;
1539}
1540
1496static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, 1541static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
1497 loff_t *off) 1542 loff_t *off)
1498{ 1543{
@@ -1506,15 +1551,19 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
1506 return -ENOMEM; 1551 return -ENOMEM;
1507 1552
1508 mutex_lock(&mce_read_mutex); 1553 mutex_lock(&mce_read_mutex);
1554
1555 if (!mce_apei_read_done) {
1556 err = __mce_read_apei(&buf, usize);
1557 if (err || buf != ubuf)
1558 goto out;
1559 }
1560
1509 next = rcu_dereference_check_mce(mcelog.next); 1561 next = rcu_dereference_check_mce(mcelog.next);
1510 1562
1511 /* Only supports full reads right now */ 1563 /* Only supports full reads right now */
1512 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) { 1564 err = -EINVAL;
1513 mutex_unlock(&mce_read_mutex); 1565 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
1514 kfree(cpu_tsc); 1566 goto out;
1515
1516 return -EINVAL;
1517 }
1518 1567
1519 err = 0; 1568 err = 0;
1520 prev = 0; 1569 prev = 0;
@@ -1562,10 +1611,15 @@ timeout:
1562 memset(&mcelog.entry[i], 0, sizeof(struct mce)); 1611 memset(&mcelog.entry[i], 0, sizeof(struct mce));
1563 } 1612 }
1564 } 1613 }
1614
1615 if (err)
1616 err = -EFAULT;
1617
1618out:
1565 mutex_unlock(&mce_read_mutex); 1619 mutex_unlock(&mce_read_mutex);
1566 kfree(cpu_tsc); 1620 kfree(cpu_tsc);
1567 1621
1568 return err ? -EFAULT : buf - ubuf; 1622 return err ? err : buf - ubuf;
1569} 1623}
1570 1624
1571static unsigned int mce_poll(struct file *file, poll_table *wait) 1625static unsigned int mce_poll(struct file *file, poll_table *wait)
@@ -1573,6 +1627,8 @@ static unsigned int mce_poll(struct file *file, poll_table *wait)
1573 poll_wait(file, &mce_wait, wait); 1627 poll_wait(file, &mce_wait, wait);
1574 if (rcu_dereference_check_mce(mcelog.next)) 1628 if (rcu_dereference_check_mce(mcelog.next))
1575 return POLLIN | POLLRDNORM; 1629 return POLLIN | POLLRDNORM;
1630 if (!mce_apei_read_done && apei_check_mce())
1631 return POLLIN | POLLRDNORM;
1576 return 0; 1632 return 0;
1577} 1633}
1578 1634
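The mce_read() changes above give persistent APEI/ERST records priority: on each read, records left over from a previous boot are copied out first, and only when that source is exhausted (mce_apei_read_done) does the call fall through to the in-memory mcelog buffer. A simplified sketch of that two-source read flow (arrays instead of ERST and copy_to_user, names illustrative):

#include <stdio.h>
#include <string.h>

#define REC_SIZE 16

static char persistent[2][REC_SIZE] = { "boot0-record-A", "boot0-record-B" };
static char live[2][REC_SIZE]       = { "live-record-1",  "live-record-2"  };
static int persistent_done;

/* Copy one persistent record, or report that the source is exhausted. */
static int read_persistent(int idx, char *dst)
{
	if (idx >= 2) {
		persistent_done = 1;
		return 0;
	}
	memcpy(dst, persistent[idx], REC_SIZE);
	return 1;
}

int main(void)
{
	char buf[REC_SIZE];
	int i;

	/* drain the persistent store first, as __mce_read_apei() does */
	for (i = 0; !persistent_done; i++) {
		if (!read_persistent(i, buf))
			break;
		printf("persistent: %s\n", buf);
	}

	/* then fall through to the in-memory log */
	for (i = 0; i < 2; i++)
		printf("live: %s\n", live[i]);

	return 0;
}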
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 224392d8fe8c..39aaee5c1ab2 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -141,6 +141,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
141 address = (low & MASK_BLKPTR_LO) >> 21; 141 address = (low & MASK_BLKPTR_LO) >> 21;
142 if (!address) 142 if (!address)
143 break; 143 break;
144
144 address += MCG_XBLK_ADDR; 145 address += MCG_XBLK_ADDR;
145 } else 146 } else
146 ++address; 147 ++address;
@@ -148,12 +149,8 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
148 if (rdmsr_safe(address, &low, &high)) 149 if (rdmsr_safe(address, &low, &high))
149 break; 150 break;
150 151
151 if (!(high & MASK_VALID_HI)) { 152 if (!(high & MASK_VALID_HI))
152 if (block) 153 continue;
153 continue;
154 else
155 break;
156 }
157 154
158 if (!(high & MASK_CNTP_HI) || 155 if (!(high & MASK_CNTP_HI) ||
159 (high & MASK_LOCKED_HI)) 156 (high & MASK_LOCKED_HI))
@@ -530,7 +527,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
530 err = -ENOMEM; 527 err = -ENOMEM;
531 goto out; 528 goto out;
532 } 529 }
533 if (!alloc_cpumask_var(&b->cpus, GFP_KERNEL)) { 530 if (!zalloc_cpumask_var(&b->cpus, GFP_KERNEL)) {
534 kfree(b); 531 kfree(b);
535 err = -ENOMEM; 532 err = -ENOMEM;
536 goto out; 533 goto out;
@@ -543,7 +540,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
543#ifndef CONFIG_SMP 540#ifndef CONFIG_SMP
544 cpumask_setall(b->cpus); 541 cpumask_setall(b->cpus);
545#else 542#else
546 cpumask_copy(b->cpus, c->llc_shared_map); 543 cpumask_set_cpu(cpu, b->cpus);
547#endif 544#endif
548 545
549 per_cpu(threshold_banks, cpu)[bank] = b; 546 per_cpu(threshold_banks, cpu)[bank] = b;
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index 62b48e40920a..6fcd0936194f 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -95,19 +95,20 @@ static void cmci_discover(int banks, int boot)
95 rdmsrl(MSR_IA32_MCx_CTL2(i), val); 95 rdmsrl(MSR_IA32_MCx_CTL2(i), val);
96 96
97 /* Already owned by someone else? */ 97 /* Already owned by someone else? */
98 if (val & CMCI_EN) { 98 if (val & MCI_CTL2_CMCI_EN) {
99 if (test_and_clear_bit(i, owned) && !boot) 99 if (test_and_clear_bit(i, owned) && !boot)
100 print_update("SHD", &hdr, i); 100 print_update("SHD", &hdr, i);
101 __clear_bit(i, __get_cpu_var(mce_poll_banks)); 101 __clear_bit(i, __get_cpu_var(mce_poll_banks));
102 continue; 102 continue;
103 } 103 }
104 104
105 val |= CMCI_EN | CMCI_THRESHOLD; 105 val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
106 val |= MCI_CTL2_CMCI_EN | CMCI_THRESHOLD;
106 wrmsrl(MSR_IA32_MCx_CTL2(i), val); 107 wrmsrl(MSR_IA32_MCx_CTL2(i), val);
107 rdmsrl(MSR_IA32_MCx_CTL2(i), val); 108 rdmsrl(MSR_IA32_MCx_CTL2(i), val);
108 109
109 /* Did the enable bit stick? -- the bank supports CMCI */ 110 /* Did the enable bit stick? -- the bank supports CMCI */
110 if (val & CMCI_EN) { 111 if (val & MCI_CTL2_CMCI_EN) {
111 if (!test_and_set_bit(i, owned) && !boot) 112 if (!test_and_set_bit(i, owned) && !boot)
112 print_update("CMCI", &hdr, i); 113 print_update("CMCI", &hdr, i);
113 __clear_bit(i, __get_cpu_var(mce_poll_banks)); 114 __clear_bit(i, __get_cpu_var(mce_poll_banks));
@@ -155,7 +156,7 @@ void cmci_clear(void)
155 continue; 156 continue;
156 /* Disable CMCI */ 157 /* Disable CMCI */
157 rdmsrl(MSR_IA32_MCx_CTL2(i), val); 158 rdmsrl(MSR_IA32_MCx_CTL2(i), val);
158 val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK); 159 val &= ~(MCI_CTL2_CMCI_EN|MCI_CTL2_CMCI_THRESHOLD_MASK);
159 wrmsrl(MSR_IA32_MCx_CTL2(i), val); 160 wrmsrl(MSR_IA32_MCx_CTL2(i), val);
160 __clear_bit(i, __get_cpu_var(mce_banks_owned)); 161 __clear_bit(i, __get_cpu_var(mce_banks_owned));
161 } 162 }
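The cmci_discover() hunk above keeps the classic write-then-read-back capability probe: set MCI_CTL2_CMCI_EN, read the MSR again, and treat the bank as CMCI-capable only if the enable bit stuck. A tiny sketch of that probe pattern against a simulated register (no real MSR access, names illustrative):

#include <stdint.h>
#include <stdio.h>

#define CTL2_CMCI_EN   (1ULL << 30)

/* Simulated bank register: a CMCI-capable bank latches the enable bit,
 * a bank without CMCI support silently drops it. */
static uint64_t bank_reg;
static int bank_supports_cmci = 1;

static void wrmsr_sim(uint64_t val)
{
	bank_reg = bank_supports_cmci ? val : (val & ~CTL2_CMCI_EN);
}

static uint64_t rdmsr_sim(void)
{
	return bank_reg;
}

int main(void)
{
	uint64_t val = rdmsr_sim();

	val |= CTL2_CMCI_EN;
	wrmsr_sim(val);
	val = rdmsr_sim();

	/* did the enable bit stick? -- the bank supports CMCI */
	printf("CMCI %s\n", (val & CTL2_CMCI_EN) ? "supported" : "not supported");
	return 0;
}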
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 81c499eceb21..169d8804a9f8 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -34,15 +34,25 @@
34/* How long to wait between reporting thermal events */ 34/* How long to wait between reporting thermal events */
35#define CHECK_INTERVAL (300 * HZ) 35#define CHECK_INTERVAL (300 * HZ)
36 36
37#define THERMAL_THROTTLING_EVENT 0
38#define POWER_LIMIT_EVENT 1
39
37/* 40/*
38 * Current thermal throttling state: 41 * Current thermal event state:
39 */ 42 */
40struct thermal_state { 43struct _thermal_state {
41 bool is_throttled; 44 bool new_event;
42 45 int event;
43 u64 next_check; 46 u64 next_check;
44 unsigned long throttle_count; 47 unsigned long count;
45 unsigned long last_throttle_count; 48 unsigned long last_count;
49};
50
51struct thermal_state {
52 struct _thermal_state core_throttle;
53 struct _thermal_state core_power_limit;
54 struct _thermal_state package_throttle;
55 struct _thermal_state package_power_limit;
46}; 56};
47 57
48static DEFINE_PER_CPU(struct thermal_state, thermal_state); 58static DEFINE_PER_CPU(struct thermal_state, thermal_state);
@@ -53,11 +63,13 @@ static u32 lvtthmr_init __read_mostly;
53 63
54#ifdef CONFIG_SYSFS 64#ifdef CONFIG_SYSFS
55#define define_therm_throt_sysdev_one_ro(_name) \ 65#define define_therm_throt_sysdev_one_ro(_name) \
56 static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL) 66 static SYSDEV_ATTR(_name, 0444, \
67 therm_throt_sysdev_show_##_name, \
68 NULL) \
57 69
58#define define_therm_throt_sysdev_show_func(name) \ 70#define define_therm_throt_sysdev_show_func(event, name) \
59 \ 71 \
60static ssize_t therm_throt_sysdev_show_##name( \ 72static ssize_t therm_throt_sysdev_show_##event##_##name( \
61 struct sys_device *dev, \ 73 struct sys_device *dev, \
62 struct sysdev_attribute *attr, \ 74 struct sysdev_attribute *attr, \
63 char *buf) \ 75 char *buf) \
@@ -66,30 +78,42 @@ static ssize_t therm_throt_sysdev_show_##name( \
66 ssize_t ret; \ 78 ssize_t ret; \
67 \ 79 \
68 preempt_disable(); /* CPU hotplug */ \ 80 preempt_disable(); /* CPU hotplug */ \
69 if (cpu_online(cpu)) \ 81 if (cpu_online(cpu)) { \
70 ret = sprintf(buf, "%lu\n", \ 82 ret = sprintf(buf, "%lu\n", \
71 per_cpu(thermal_state, cpu).name); \ 83 per_cpu(thermal_state, cpu).event.name); \
72 else \ 84 } else \
73 ret = 0; \ 85 ret = 0; \
74 preempt_enable(); \ 86 preempt_enable(); \
75 \ 87 \
76 return ret; \ 88 return ret; \
77} 89}
78 90
79define_therm_throt_sysdev_show_func(throttle_count); 91define_therm_throt_sysdev_show_func(core_throttle, count);
80define_therm_throt_sysdev_one_ro(throttle_count); 92define_therm_throt_sysdev_one_ro(core_throttle_count);
93
94define_therm_throt_sysdev_show_func(core_power_limit, count);
95define_therm_throt_sysdev_one_ro(core_power_limit_count);
96
97define_therm_throt_sysdev_show_func(package_throttle, count);
98define_therm_throt_sysdev_one_ro(package_throttle_count);
99
100define_therm_throt_sysdev_show_func(package_power_limit, count);
101define_therm_throt_sysdev_one_ro(package_power_limit_count);
81 102
82static struct attribute *thermal_throttle_attrs[] = { 103static struct attribute *thermal_throttle_attrs[] = {
83 &attr_throttle_count.attr, 104 &attr_core_throttle_count.attr,
84 NULL 105 NULL
85}; 106};
86 107
87static struct attribute_group thermal_throttle_attr_group = { 108static struct attribute_group thermal_attr_group = {
88 .attrs = thermal_throttle_attrs, 109 .attrs = thermal_throttle_attrs,
89 .name = "thermal_throttle" 110 .name = "thermal_throttle"
90}; 111};
91#endif /* CONFIG_SYSFS */ 112#endif /* CONFIG_SYSFS */
92 113
114#define CORE_LEVEL 0
115#define PACKAGE_LEVEL 1
116
93/*** 117/***
94 * therm_throt_process - Process thermal throttling event from interrupt 118 * therm_throt_process - Process thermal throttling event from interrupt
95 * @curr: Whether the condition is current or not (boolean), since the 119 * @curr: Whether the condition is current or not (boolean), since the
@@ -106,39 +130,70 @@ static struct attribute_group thermal_throttle_attr_group = {
106 * 1 : Event should be logged further, and a message has been 130 * 1 : Event should be logged further, and a message has been
107 * printed to the syslog. 131 * printed to the syslog.
108 */ 132 */
109static int therm_throt_process(bool is_throttled) 133static int therm_throt_process(bool new_event, int event, int level)
110{ 134{
111 struct thermal_state *state; 135 struct _thermal_state *state;
112 unsigned int this_cpu; 136 unsigned int this_cpu = smp_processor_id();
113 bool was_throttled; 137 bool old_event;
114 u64 now; 138 u64 now;
139 struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu);
115 140
116 this_cpu = smp_processor_id();
117 now = get_jiffies_64(); 141 now = get_jiffies_64();
118 state = &per_cpu(thermal_state, this_cpu); 142 if (level == CORE_LEVEL) {
143 if (event == THERMAL_THROTTLING_EVENT)
144 state = &pstate->core_throttle;
145 else if (event == POWER_LIMIT_EVENT)
146 state = &pstate->core_power_limit;
147 else
148 return 0;
149 } else if (level == PACKAGE_LEVEL) {
150 if (event == THERMAL_THROTTLING_EVENT)
151 state = &pstate->package_throttle;
152 else if (event == POWER_LIMIT_EVENT)
153 state = &pstate->package_power_limit;
154 else
155 return 0;
156 } else
157 return 0;
119 158
120 was_throttled = state->is_throttled; 159 old_event = state->new_event;
121 state->is_throttled = is_throttled; 160 state->new_event = new_event;
122 161
123 if (is_throttled) 162 if (new_event)
124 state->throttle_count++; 163 state->count++;
125 164
126 if (time_before64(now, state->next_check) && 165 if (time_before64(now, state->next_check) &&
127 state->throttle_count != state->last_throttle_count) 166 state->count != state->last_count)
128 return 0; 167 return 0;
129 168
130 state->next_check = now + CHECK_INTERVAL; 169 state->next_check = now + CHECK_INTERVAL;
131 state->last_throttle_count = state->throttle_count; 170 state->last_count = state->count;
132 171
133 /* if we just entered the thermal event */ 172 /* if we just entered the thermal event */
134 if (is_throttled) { 173 if (new_event) {
135 printk(KERN_CRIT "CPU%d: Temperature above threshold, cpu clock throttled (total events = %lu)\n", this_cpu, state->throttle_count); 174 if (event == THERMAL_THROTTLING_EVENT)
175 printk(KERN_CRIT "CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n",
176 this_cpu,
177 level == CORE_LEVEL ? "Core" : "Package",
178 state->count);
179 else
180 printk(KERN_CRIT "CPU%d: %s power limit notification (total events = %lu)\n",
181 this_cpu,
182 level == CORE_LEVEL ? "Core" : "Package",
183 state->count);
136 184
137 add_taint(TAINT_MACHINE_CHECK); 185 add_taint(TAINT_MACHINE_CHECK);
138 return 1; 186 return 1;
139 } 187 }
140 if (was_throttled) { 188 if (old_event) {
141 printk(KERN_INFO "CPU%d: Temperature/speed normal\n", this_cpu); 189 if (event == THERMAL_THROTTLING_EVENT)
190 printk(KERN_INFO "CPU%d: %s temperature/speed normal\n",
191 this_cpu,
192 level == CORE_LEVEL ? "Core" : "Package");
193 else
194 printk(KERN_INFO "CPU%d: %s power limit normal\n",
195 this_cpu,
196 level == CORE_LEVEL ? "Core" : "Package");
142 return 1; 197 return 1;
143 } 198 }
144 199
@@ -147,15 +202,36 @@ static int therm_throt_process(bool is_throttled)
147 202
148#ifdef CONFIG_SYSFS 203#ifdef CONFIG_SYSFS
149/* Add/Remove thermal_throttle interface for CPU device: */ 204/* Add/Remove thermal_throttle interface for CPU device: */
150static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev) 205static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev,
206 unsigned int cpu)
151{ 207{
152 return sysfs_create_group(&sys_dev->kobj, 208 int err;
153 &thermal_throttle_attr_group); 209 struct cpuinfo_x86 *c = &cpu_data(cpu);
210
211 err = sysfs_create_group(&sys_dev->kobj, &thermal_attr_group);
212 if (err)
213 return err;
214
215 if (cpu_has(c, X86_FEATURE_PLN))
216 err = sysfs_add_file_to_group(&sys_dev->kobj,
217 &attr_core_power_limit_count.attr,
218 thermal_attr_group.name);
219 if (cpu_has(c, X86_FEATURE_PTS)) {
220 err = sysfs_add_file_to_group(&sys_dev->kobj,
221 &attr_package_throttle_count.attr,
222 thermal_attr_group.name);
223 if (cpu_has(c, X86_FEATURE_PLN))
224 err = sysfs_add_file_to_group(&sys_dev->kobj,
225 &attr_package_power_limit_count.attr,
226 thermal_attr_group.name);
227 }
228
229 return err;
154} 230}
155 231
156static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev) 232static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev)
157{ 233{
158 sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group); 234 sysfs_remove_group(&sys_dev->kobj, &thermal_attr_group);
159} 235}
160 236
161/* Mutex protecting device creation against CPU hotplug: */ 237/* Mutex protecting device creation against CPU hotplug: */
@@ -177,7 +253,7 @@ thermal_throttle_cpu_callback(struct notifier_block *nfb,
177 case CPU_UP_PREPARE: 253 case CPU_UP_PREPARE:
178 case CPU_UP_PREPARE_FROZEN: 254 case CPU_UP_PREPARE_FROZEN:
179 mutex_lock(&therm_cpu_lock); 255 mutex_lock(&therm_cpu_lock);
180 err = thermal_throttle_add_dev(sys_dev); 256 err = thermal_throttle_add_dev(sys_dev, cpu);
181 mutex_unlock(&therm_cpu_lock); 257 mutex_unlock(&therm_cpu_lock);
182 WARN_ON(err); 258 WARN_ON(err);
183 break; 259 break;
@@ -190,7 +266,7 @@ thermal_throttle_cpu_callback(struct notifier_block *nfb,
190 mutex_unlock(&therm_cpu_lock); 266 mutex_unlock(&therm_cpu_lock);
191 break; 267 break;
192 } 268 }
193 return err ? NOTIFY_BAD : NOTIFY_OK; 269 return notifier_from_errno(err);
194} 270}
195 271
196static struct notifier_block thermal_throttle_cpu_notifier __cpuinitdata = 272static struct notifier_block thermal_throttle_cpu_notifier __cpuinitdata =
@@ -213,7 +289,7 @@ static __init int thermal_throttle_init_device(void)
213#endif 289#endif
214 /* connect live CPUs to sysfs */ 290 /* connect live CPUs to sysfs */
215 for_each_online_cpu(cpu) { 291 for_each_online_cpu(cpu) {
216 err = thermal_throttle_add_dev(get_cpu_sysdev(cpu)); 292 err = thermal_throttle_add_dev(get_cpu_sysdev(cpu), cpu);
217 WARN_ON(err); 293 WARN_ON(err);
218 } 294 }
219#ifdef CONFIG_HOTPLUG_CPU 295#ifdef CONFIG_HOTPLUG_CPU
@@ -226,14 +302,50 @@ device_initcall(thermal_throttle_init_device);
226 302
227#endif /* CONFIG_SYSFS */ 303#endif /* CONFIG_SYSFS */
228 304
305/*
 306 * Use the two most significant bits to tell the MCE log which thermal
 307 * event type this is.
 308 * This is a temporary solution and may be changed in the future with new
 309 * MCE log infrastructure.
310 */
311#define CORE_THROTTLED (0)
312#define CORE_POWER_LIMIT ((__u64)1 << 62)
313#define PACKAGE_THROTTLED ((__u64)2 << 62)
314#define PACKAGE_POWER_LIMIT ((__u64)3 << 62)
315
229/* Thermal transition interrupt handler */ 316/* Thermal transition interrupt handler */
230static void intel_thermal_interrupt(void) 317static void intel_thermal_interrupt(void)
231{ 318{
232 __u64 msr_val; 319 __u64 msr_val;
320 struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
233 321
234 rdmsrl(MSR_IA32_THERM_STATUS, msr_val); 322 rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
235 if (therm_throt_process((msr_val & THERM_STATUS_PROCHOT) != 0)) 323
236 mce_log_therm_throt_event(msr_val); 324 if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT,
325 THERMAL_THROTTLING_EVENT,
326 CORE_LEVEL) != 0)
327 mce_log_therm_throt_event(CORE_THROTTLED | msr_val);
328
329 if (cpu_has(c, X86_FEATURE_PLN))
330 if (therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT,
331 POWER_LIMIT_EVENT,
332 CORE_LEVEL) != 0)
333 mce_log_therm_throt_event(CORE_POWER_LIMIT | msr_val);
334
335 if (cpu_has(c, X86_FEATURE_PTS)) {
336 rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val);
337 if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT,
338 THERMAL_THROTTLING_EVENT,
339 PACKAGE_LEVEL) != 0)
340 mce_log_therm_throt_event(PACKAGE_THROTTLED | msr_val);
341 if (cpu_has(c, X86_FEATURE_PLN))
342 if (therm_throt_process(msr_val &
343 PACKAGE_THERM_STATUS_POWER_LIMIT,
344 POWER_LIMIT_EVENT,
345 PACKAGE_LEVEL) != 0)
346 mce_log_therm_throt_event(PACKAGE_POWER_LIMIT
347 | msr_val);
348 }
237} 349}
238 350
239static void unexpected_thermal_interrupt(void) 351static void unexpected_thermal_interrupt(void)
@@ -335,8 +447,26 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
335 apic_write(APIC_LVTTHMR, h); 447 apic_write(APIC_LVTTHMR, h);
336 448
337 rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); 449 rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
338 wrmsr(MSR_IA32_THERM_INTERRUPT, 450 if (cpu_has(c, X86_FEATURE_PLN))
339 l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h); 451 wrmsr(MSR_IA32_THERM_INTERRUPT,
452 l | (THERM_INT_LOW_ENABLE
453 | THERM_INT_HIGH_ENABLE | THERM_INT_PLN_ENABLE), h);
454 else
455 wrmsr(MSR_IA32_THERM_INTERRUPT,
456 l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h);
457
458 if (cpu_has(c, X86_FEATURE_PTS)) {
459 rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
460 if (cpu_has(c, X86_FEATURE_PLN))
461 wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
462 l | (PACKAGE_THERM_INT_LOW_ENABLE
463 | PACKAGE_THERM_INT_HIGH_ENABLE
464 | PACKAGE_THERM_INT_PLN_ENABLE), h);
465 else
466 wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
467 l | (PACKAGE_THERM_INT_LOW_ENABLE
468 | PACKAGE_THERM_INT_HIGH_ENABLE), h);
469 }
340 470
341 smp_thermal_vector = intel_thermal_interrupt; 471 smp_thermal_vector = intel_thermal_interrupt;
342 472
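The new CORE_THROTTLED/CORE_POWER_LIMIT/PACKAGE_* constants above tag the value handed to mce_log_therm_throt_event() by stealing its two most significant bits for the event type, leaving the MSR status bits below. A short sketch of encoding and decoding such a tag (illustrative only; the real status bits come from MSR_IA32_THERM_STATUS):

#include <stdint.h>
#include <stdio.h>

#define EVENT_SHIFT          62
#define EVENT_MASK           (3ULL << EVENT_SHIFT)
#define CORE_THROTTLED       (0ULL << EVENT_SHIFT)
#define CORE_POWER_LIMIT     (1ULL << EVENT_SHIFT)
#define PACKAGE_THROTTLED    (2ULL << EVENT_SHIFT)
#define PACKAGE_POWER_LIMIT  (3ULL << EVENT_SHIFT)

static const char *event_name(uint64_t logged)
{
	switch (logged & EVENT_MASK) {
	case CORE_POWER_LIMIT:    return "core power limit";
	case PACKAGE_THROTTLED:   return "package throttled";
	case PACKAGE_POWER_LIMIT: return "package power limit";
	default:                  return "core throttled";
	}
}

int main(void)
{
	uint64_t msr_val = 0x88;                         /* pretend status bits */
	uint64_t logged  = PACKAGE_POWER_LIMIT | msr_val;

	printf("%s, status bits 0x%llx\n", event_name(logged),
	       (unsigned long long)(logged & ~EVENT_MASK));
	return 0;
}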
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
new file mode 100644
index 000000000000..d944bf6c50e9
--- /dev/null
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -0,0 +1,56 @@
1/*
2 * HyperV Detection code.
3 *
4 * Copyright (C) 2010, Novell, Inc.
5 * Author : K. Y. Srinivasan <ksrinivasan@novell.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; version 2 of the License.
10 *
11 */
12
13#include <linux/types.h>
14#include <linux/module.h>
15#include <asm/processor.h>
16#include <asm/hypervisor.h>
17#include <asm/hyperv.h>
18#include <asm/mshyperv.h>
19
20struct ms_hyperv_info ms_hyperv;
21EXPORT_SYMBOL_GPL(ms_hyperv);
22
23static bool __init ms_hyperv_platform(void)
24{
25 u32 eax;
26 u32 hyp_signature[3];
27
28 if (!boot_cpu_has(X86_FEATURE_HYPERVISOR))
29 return false;
30
31 cpuid(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS,
32 &eax, &hyp_signature[0], &hyp_signature[1], &hyp_signature[2]);
33
34 return eax >= HYPERV_CPUID_MIN &&
35 eax <= HYPERV_CPUID_MAX &&
36 !memcmp("Microsoft Hv", hyp_signature, 12);
37}
38
39static void __init ms_hyperv_init_platform(void)
40{
41 /*
42 * Extract the features and hints
43 */
44 ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES);
45 ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO);
46
47 printk(KERN_INFO "HyperV: features 0x%x, hints 0x%x\n",
48 ms_hyperv.features, ms_hyperv.hints);
49}
50
51const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
52 .name = "Microsoft HyperV",
53 .detect = ms_hyperv_platform,
54 .init_platform = ms_hyperv_init_platform,
55};
56EXPORT_SYMBOL(x86_hyper_ms_hyperv);
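ms_hyperv_platform() above identifies Hyper-V by comparing the 12-byte vendor string returned in EBX/ECX/EDX of the hypervisor vendor CPUID leaf against "Microsoft Hv", plus a range check on the maximum leaf in EAX. A user-space sketch of that three-register signature compare (CPUID faked, and the MIN/MAX leaf values are assumptions, not taken from the headers):

#include <stdio.h>
#include <string.h>
#include <stdint.h>

/* Pretend output of the hypervisor vendor CPUID leaf:
 * eax = max hypervisor leaf, the signature fills three 32-bit words. */
static void fake_cpuid(uint32_t *eax, uint32_t sig[3])
{
	*eax = 0x40000005;
	memcpy(sig, "Microsoft Hv", 12);
}

static int hyperv_platform(void)
{
	uint32_t eax;
	uint32_t hyp_signature[3];

	fake_cpuid(&eax, hyp_signature);

	return eax >= 0x40000005 &&        /* assumed HYPERV_CPUID_MIN */
	       eax <= 0x4000ffff &&        /* assumed HYPERV_CPUID_MAX */
	       !memcmp("Microsoft Hv", hyp_signature, 12);
}

int main(void)
{
	printf("Hyper-V %sdetected\n", hyperv_platform() ? "" : "not ");
	return 0;
}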
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index 06130b52f012..c5f59d071425 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -632,9 +632,9 @@ static void __init mtrr_print_out_one_result(int i)
632 unsigned long gran_base, chunk_base, lose_base; 632 unsigned long gran_base, chunk_base, lose_base;
633 char gran_factor, chunk_factor, lose_factor; 633 char gran_factor, chunk_factor, lose_factor;
634 634
635 gran_base = to_size_factor(result[i].gran_sizek, &gran_factor), 635 gran_base = to_size_factor(result[i].gran_sizek, &gran_factor);
636 chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor), 636 chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor);
637 lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor), 637 lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor);
638 638
639 pr_info("%sgran_size: %ld%c \tchunk_size: %ld%c \t", 639 pr_info("%sgran_size: %ld%c \tchunk_size: %ld%c \t",
640 result[i].bad ? "*BAD*" : " ", 640 result[i].bad ? "*BAD*" : " ",
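The mtrr_print_out_one_result() fix above is purely about trailing comma operators: the three assignments were previously chained with `,` instead of `;`, which is legal C but turns them into a single expression statement and is easy to misread. A tiny illustration of the difference:

#include <stdio.h>

int main(void)
{
	int a, b, c;

	/* comma operator: one expression statement, evaluated left to right */
	a = 1, b = 2, c = 3;

	/* equivalent, but clearer, as three separate statements */
	a = 1;
	b = 2;
	c = 3;

	printf("%d %d %d\n", a, b, c);
	return 0;
}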
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index fd31a441c61c..7d28d7d03885 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -433,13 +433,12 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
433{ 433{
434 unsigned int mask_lo, mask_hi, base_lo, base_hi; 434 unsigned int mask_lo, mask_hi, base_lo, base_hi;
435 unsigned int tmp, hi; 435 unsigned int tmp, hi;
436 int cpu;
437 436
438 /* 437 /*
439 * get_mtrr doesn't need to update mtrr_state, also it could be called 438 * get_mtrr doesn't need to update mtrr_state, also it could be called
440 * from any cpu, so try to print it out directly. 439 * from any cpu, so try to print it out directly.
441 */ 440 */
442 cpu = get_cpu(); 441 get_cpu();
443 442
444 rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi); 443 rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi);
445 444
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 79556bd9b602..01c0f3ee6cc3 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -35,6 +35,7 @@
35 35
36#include <linux/types.h> /* FIXME: kvm_para.h needs this */ 36#include <linux/types.h> /* FIXME: kvm_para.h needs this */
37 37
38#include <linux/stop_machine.h>
38#include <linux/kvm_para.h> 39#include <linux/kvm_para.h>
39#include <linux/uaccess.h> 40#include <linux/uaccess.h>
40#include <linux/module.h> 41#include <linux/module.h>
@@ -143,22 +144,28 @@ struct set_mtrr_data {
143 mtrr_type smp_type; 144 mtrr_type smp_type;
144}; 145};
145 146
147static DEFINE_PER_CPU(struct cpu_stop_work, mtrr_work);
148
146/** 149/**
147 * ipi_handler - Synchronisation handler. Executed by "other" CPUs. 150 * mtrr_work_handler - Synchronisation handler. Executed by "other" CPUs.
148 * @info: pointer to mtrr configuration data 151 * @info: pointer to mtrr configuration data
149 * 152 *
150 * Returns nothing. 153 * Returns nothing.
151 */ 154 */
152static void ipi_handler(void *info) 155static int mtrr_work_handler(void *info)
153{ 156{
154#ifdef CONFIG_SMP 157#ifdef CONFIG_SMP
155 struct set_mtrr_data *data = info; 158 struct set_mtrr_data *data = info;
156 unsigned long flags; 159 unsigned long flags;
157 160
161 atomic_dec(&data->count);
162 while (!atomic_read(&data->gate))
163 cpu_relax();
164
158 local_irq_save(flags); 165 local_irq_save(flags);
159 166
160 atomic_dec(&data->count); 167 atomic_dec(&data->count);
161 while (!atomic_read(&data->gate)) 168 while (atomic_read(&data->gate))
162 cpu_relax(); 169 cpu_relax();
163 170
164 /* The master has cleared me to execute */ 171 /* The master has cleared me to execute */
@@ -173,12 +180,13 @@ static void ipi_handler(void *info)
173 } 180 }
174 181
175 atomic_dec(&data->count); 182 atomic_dec(&data->count);
176 while (atomic_read(&data->gate)) 183 while (!atomic_read(&data->gate))
177 cpu_relax(); 184 cpu_relax();
178 185
179 atomic_dec(&data->count); 186 atomic_dec(&data->count);
180 local_irq_restore(flags); 187 local_irq_restore(flags);
181#endif 188#endif
189 return 0;
182} 190}
183 191
184static inline int types_compatible(mtrr_type type1, mtrr_type type2) 192static inline int types_compatible(mtrr_type type1, mtrr_type type2)
@@ -198,7 +206,7 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2)
198 * 206 *
199 * This is kinda tricky, but fortunately, Intel spelled it out for us cleanly: 207 * This is kinda tricky, but fortunately, Intel spelled it out for us cleanly:
200 * 208 *
201 * 1. Send IPI to do the following: 209 * 1. Queue work to do the following on all processors:
202 * 2. Disable Interrupts 210 * 2. Disable Interrupts
203 * 3. Wait for all procs to do so 211 * 3. Wait for all procs to do so
204 * 4. Enter no-fill cache mode 212 * 4. Enter no-fill cache mode
@@ -215,14 +223,17 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2)
215 * 15. Enable interrupts. 223 * 15. Enable interrupts.
216 * 224 *
217 * What does that mean for us? Well, first we set data.count to the number 225 * What does that mean for us? Well, first we set data.count to the number
218 * of CPUs. As each CPU disables interrupts, it'll decrement it once. We wait 226 * of CPUs. As each CPU announces that it started the rendezvous handler by
 219 * until it hits 0 and proceed. We set the data.gate flag and reset data.count. 227 * decrementing the count, we reset data.count and set the data.gate flag
 220 * Meanwhile, they are waiting for that flag to be set. Once it's set, each 228 * allowing all the CPUs to proceed with the work. As each CPU disables
229 * interrupts, it'll decrement data.count once. We wait until it hits 0 and
230 * proceed. We clear the data.gate flag and reset data.count. Meanwhile, they
231 * are waiting for that flag to be cleared. Once it's cleared, each
221 * CPU goes through the transition of updating MTRRs. 232 * CPU goes through the transition of updating MTRRs.
222 * The CPU vendors may each do it differently, 233 * The CPU vendors may each do it differently,
223 * so we call mtrr_if->set() callback and let them take care of it. 234 * so we call mtrr_if->set() callback and let them take care of it.
224 * When they're done, they again decrement data->count and wait for data.gate 235 * When they're done, they again decrement data->count and wait for data.gate
225 * to be reset. 236 * to be set.
226 * When we finish, we wait for data.count to hit 0 and toggle the data.gate flag 237 * When we finish, we wait for data.count to hit 0 and toggle the data.gate flag
227 * Everyone then enables interrupts and we all continue on. 238 * Everyone then enables interrupts and we all continue on.
228 * 239 *
@@ -234,6 +245,9 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
234{ 245{
235 struct set_mtrr_data data; 246 struct set_mtrr_data data;
236 unsigned long flags; 247 unsigned long flags;
248 int cpu;
249
250 preempt_disable();
237 251
238 data.smp_reg = reg; 252 data.smp_reg = reg;
239 data.smp_base = base; 253 data.smp_base = base;
@@ -246,10 +260,15 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
246 atomic_set(&data.gate, 0); 260 atomic_set(&data.gate, 0);
247 261
248 /* Start the ball rolling on other CPUs */ 262 /* Start the ball rolling on other CPUs */
249 if (smp_call_function(ipi_handler, &data, 0) != 0) 263 for_each_online_cpu(cpu) {
250 panic("mtrr: timed out waiting for other CPUs\n"); 264 struct cpu_stop_work *work = &per_cpu(mtrr_work, cpu);
265
266 if (cpu == smp_processor_id())
267 continue;
268
269 stop_one_cpu_nowait(cpu, mtrr_work_handler, &data, work);
270 }
251 271
252 local_irq_save(flags);
253 272
254 while (atomic_read(&data.count)) 273 while (atomic_read(&data.count))
255 cpu_relax(); 274 cpu_relax();
@@ -259,6 +278,16 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
259 smp_wmb(); 278 smp_wmb();
260 atomic_set(&data.gate, 1); 279 atomic_set(&data.gate, 1);
261 280
281 local_irq_save(flags);
282
283 while (atomic_read(&data.count))
284 cpu_relax();
285
286 /* Ok, reset count and toggle gate */
287 atomic_set(&data.count, num_booting_cpus() - 1);
288 smp_wmb();
289 atomic_set(&data.gate, 0);
290
262 /* Do our MTRR business */ 291 /* Do our MTRR business */
263 292
264 /* 293 /*
@@ -279,7 +308,7 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
279 308
280 atomic_set(&data.count, num_booting_cpus() - 1); 309 atomic_set(&data.count, num_booting_cpus() - 1);
281 smp_wmb(); 310 smp_wmb();
282 atomic_set(&data.gate, 0); 311 atomic_set(&data.gate, 1);
283 312
284 /* 313 /*
285 * Wait here for everyone to have seen the gate change 314 * Wait here for everyone to have seen the gate change
@@ -289,6 +318,7 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
289 cpu_relax(); 318 cpu_relax();
290 319
291 local_irq_restore(flags); 320 local_irq_restore(flags);
321 preempt_enable();
292} 322}
293 323
294/** 324/**
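
The rendezvous that the set_mtrr() comment above describes (count the CPUs in, open the gate, count them through each phase) is easier to see in isolation. Below is a minimal userspace sketch of the same count/gate handshake using C11 atomics and pthreads; the names (rendezvous.c, worker, wait_and_toggle, NWORKERS) are invented for the example, and the kernel code additionally disables interrupts and enters no-fill cache mode, which is omitted here.

/* rendezvous.c - userspace illustration of the count/gate handshake.
 * Build: cc -pthread rendezvous.c
 * Everything here is an example; it is not the kernel implementation. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define NWORKERS 4

static atomic_int count;
static atomic_int gate;

/* Worker side: announce arrival by decrementing count, then spin until the
 * coordinator toggles the gate before moving to the next phase. */
static void *worker(void *arg)
{
    long id = (long)arg;

    atomic_fetch_sub(&count, 1);        /* "I have entered the handler" */
    while (!atomic_load(&gate))         /* wait for phase 1 (gate set) */
        ;

    atomic_fetch_sub(&count, 1);        /* "interrupts are now disabled" */
    while (atomic_load(&gate))          /* wait for phase 2 (gate cleared) */
        ;

    printf("worker %ld: would update MTRRs here\n", id);

    atomic_fetch_sub(&count, 1);        /* "I am done" */
    while (!atomic_load(&gate))         /* wait for the final release */
        ;
    return NULL;
}

/* Coordinator side: for each phase, wait for count to hit zero, re-arm it,
 * then toggle the gate so every worker moves on together. */
static void wait_and_toggle(int new_gate)
{
    while (atomic_load(&count))
        ;
    atomic_store(&count, NWORKERS);
    atomic_store(&gate, new_gate);
}

int main(void)
{
    pthread_t tid[NWORKERS];
    long i;

    atomic_store(&count, NWORKERS);
    atomic_store(&gate, 0);
    for (i = 0; i < NWORKERS; i++)
        pthread_create(&tid[i], NULL, worker, (void *)i);

    wait_and_toggle(1);   /* everyone has entered the handler */
    wait_and_toggle(0);   /* everyone has "disabled interrupts" */
    /* the coordinator would update its own MTRRs here */
    wait_and_toggle(1);   /* everyone is done, release them */

    for (i = 0; i < NWORKERS; i++)
        pthread_join(tid[i], NULL);
    return 0;
}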
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index db5bdc8addf8..03a5b0385ad6 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -31,46 +31,51 @@
31#include <asm/nmi.h> 31#include <asm/nmi.h>
32#include <asm/compat.h> 32#include <asm/compat.h>
33 33
34static u64 perf_event_mask __read_mostly; 34#if 0
35#undef wrmsrl
36#define wrmsrl(msr, val) \
37do { \
38 trace_printk("wrmsrl(%lx, %lx)\n", (unsigned long)(msr),\
39 (unsigned long)(val)); \
40 native_write_msr((msr), (u32)((u64)(val)), \
41 (u32)((u64)(val) >> 32)); \
42} while (0)
43#endif
35 44
36/* The maximal number of PEBS events: */ 45/*
37#define MAX_PEBS_EVENTS 4 46 * best effort, GUP based copy_from_user() that assumes IRQ or NMI context
47 */
48static unsigned long
49copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
50{
51 unsigned long offset, addr = (unsigned long)from;
52 int type = in_nmi() ? KM_NMI : KM_IRQ0;
53 unsigned long size, len = 0;
54 struct page *page;
55 void *map;
56 int ret;
38 57
39/* The size of a BTS record in bytes: */ 58 do {
40#define BTS_RECORD_SIZE 24 59 ret = __get_user_pages_fast(addr, 1, 0, &page);
60 if (!ret)
61 break;
41 62
42/* The size of a per-cpu BTS buffer in bytes: */ 63 offset = addr & (PAGE_SIZE - 1);
43#define BTS_BUFFER_SIZE (BTS_RECORD_SIZE * 2048) 64 size = min(PAGE_SIZE - offset, n - len);
44 65
45/* The BTS overflow threshold in bytes from the end of the buffer: */ 66 map = kmap_atomic(page, type);
46#define BTS_OVFL_TH (BTS_RECORD_SIZE * 128) 67 memcpy(to, map+offset, size);
68 kunmap_atomic(map, type);
69 put_page(page);
47 70
71 len += size;
72 to += size;
73 addr += size;
48 74
49/* 75 } while (len < n);
50 * Bits in the debugctlmsr controlling branch tracing.
51 */
52#define X86_DEBUGCTL_TR (1 << 6)
53#define X86_DEBUGCTL_BTS (1 << 7)
54#define X86_DEBUGCTL_BTINT (1 << 8)
55#define X86_DEBUGCTL_BTS_OFF_OS (1 << 9)
56#define X86_DEBUGCTL_BTS_OFF_USR (1 << 10)
57 76
58/* 77 return len;
59 * A debug store configuration. 78}
60 *
61 * We only support architectures that use 64bit fields.
62 */
63struct debug_store {
64 u64 bts_buffer_base;
65 u64 bts_index;
66 u64 bts_absolute_maximum;
67 u64 bts_interrupt_threshold;
68 u64 pebs_buffer_base;
69 u64 pebs_index;
70 u64 pebs_absolute_maximum;
71 u64 pebs_interrupt_threshold;
72 u64 pebs_event_reset[MAX_PEBS_EVENTS];
73};
74 79
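
copy_from_user_nmi() above walks the user mapping with __get_user_pages_fast() and kmap_atomic() so it can run with page faults disabled. Its callers in this file are the user-space callchain walkers; a trimmed sketch of that usage pattern (a frame-pointer walk of the user stack) follows. struct stack_frame and walk_user_stack() are simplified stand-ins for the real copy_stack_frame()/perf_callchain_user() pair, so treat this as an outline of the call pattern rather than the patch's code.

/* Sketch: how the user callchain code consumes copy_from_user_nmi().
 * Simplified; kernel context assumed. */
struct stack_frame {
    const void __user *next_frame;      /* saved frame pointer */
    unsigned long      return_address;
};

static void walk_user_stack(struct pt_regs *regs,
                            struct perf_callchain_entry *entry)
{
    const struct stack_frame __user *fp;

    fp = (const struct stack_frame __user *)regs->bp;

    while (entry->nr < PERF_MAX_STACK_DEPTH) {
        struct stack_frame frame;

        /* Best effort: bail out as soon as a frame cannot be copied. */
        if (copy_from_user_nmi(&frame, fp, sizeof(frame)) != sizeof(frame))
            break;
        if ((unsigned long)fp < regs->sp)   /* frame pointers must grow up */
            break;

        callchain_store(entry, frame.return_address);
        fp = frame.next_frame;
    }
}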
75struct event_constraint { 80struct event_constraint {
76 union { 81 union {
@@ -89,18 +94,43 @@ struct amd_nb {
89 struct event_constraint event_constraints[X86_PMC_IDX_MAX]; 94 struct event_constraint event_constraints[X86_PMC_IDX_MAX];
90}; 95};
91 96
97#define MAX_LBR_ENTRIES 16
98
92struct cpu_hw_events { 99struct cpu_hw_events {
100 /*
101 * Generic x86 PMC bits
102 */
93 struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */ 103 struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */
94 unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; 104 unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
95 unsigned long interrupts; 105 unsigned long running[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
96 int enabled; 106 int enabled;
97 struct debug_store *ds;
98 107
99 int n_events; 108 int n_events;
100 int n_added; 109 int n_added;
110 int n_txn;
101 int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */ 111 int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
102 u64 tags[X86_PMC_IDX_MAX]; 112 u64 tags[X86_PMC_IDX_MAX];
103 struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */ 113 struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */
114
115 unsigned int group_flag;
116
117 /*
118 * Intel DebugStore bits
119 */
120 struct debug_store *ds;
121 u64 pebs_enabled;
122
123 /*
124 * Intel LBR bits
125 */
126 int lbr_users;
127 void *lbr_context;
128 struct perf_branch_stack lbr_stack;
129 struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES];
130
131 /*
132 * AMD specific bits
133 */
104 struct amd_nb *amd_nb; 134 struct amd_nb *amd_nb;
105}; 135};
106 136
@@ -114,44 +144,75 @@ struct cpu_hw_events {
114#define EVENT_CONSTRAINT(c, n, m) \ 144#define EVENT_CONSTRAINT(c, n, m) \
115 __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n)) 145 __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n))
116 146
147/*
148 * Constraint on the Event code.
149 */
117#define INTEL_EVENT_CONSTRAINT(c, n) \ 150#define INTEL_EVENT_CONSTRAINT(c, n) \
118 EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVTSEL_MASK) 151 EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT)
119 152
153/*
154 * Constraint on the Event code + UMask + fixed-mask
155 *
156 * filter mask to validate fixed counter events.
157 * the following filters disqualify for fixed counters:
158 * - inv
159 * - edge
160 * - cnt-mask
161 * The other filters are supported by fixed counters.
162 * The any-thread option is supported starting with v3.
163 */
120#define FIXED_EVENT_CONSTRAINT(c, n) \ 164#define FIXED_EVENT_CONSTRAINT(c, n) \
121 EVENT_CONSTRAINT(c, (1ULL << (32+n)), INTEL_ARCH_FIXED_MASK) 165 EVENT_CONSTRAINT(c, (1ULL << (32+n)), X86_RAW_EVENT_MASK)
166
167/*
168 * Constraint on the Event code + UMask
169 */
170#define PEBS_EVENT_CONSTRAINT(c, n) \
171 EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK)
122 172
123#define EVENT_CONSTRAINT_END \ 173#define EVENT_CONSTRAINT_END \
124 EVENT_CONSTRAINT(0, 0, 0) 174 EVENT_CONSTRAINT(0, 0, 0)
125 175
126#define for_each_event_constraint(e, c) \ 176#define for_each_event_constraint(e, c) \
127 for ((e) = (c); (e)->cmask; (e)++) 177 for ((e) = (c); (e)->weight; (e)++)
178
179union perf_capabilities {
180 struct {
181 u64 lbr_format : 6;
182 u64 pebs_trap : 1;
183 u64 pebs_arch_reg : 1;
184 u64 pebs_format : 4;
185 u64 smm_freeze : 1;
186 };
187 u64 capabilities;
188};
128 189
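
The perf_capabilities union maps the bits of the IA32_PERF_CAPABILITIES MSR (LBR format, PEBS trap/format, SMM freeze) so later code can test fields such as x86_pmu.intel_cap.pebs_trap directly. The Intel init path fills it in roughly as sketched below; the real enumeration lives in perf_event_intel.c and is not part of this hunk, so the exact guard may differ.

/* Sketch of how intel_cap is populated during PMU init (see
 * intel_pmu_init() in perf_event_intel.c for the real code). */
if (boot_cpu_has(X86_FEATURE_PDCM)) {
    u64 capabilities;

    rdmsrl(MSR_IA32_PERF_CAPABILITIES, capabilities);
    x86_pmu.intel_cap.capabilities = capabilities;
}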
129/* 190/*
130 * struct x86_pmu - generic x86 pmu 191 * struct x86_pmu - generic x86 pmu
131 */ 192 */
132struct x86_pmu { 193struct x86_pmu {
194 /*
195 * Generic x86 PMC bits
196 */
133 const char *name; 197 const char *name;
134 int version; 198 int version;
135 int (*handle_irq)(struct pt_regs *); 199 int (*handle_irq)(struct pt_regs *);
136 void (*disable_all)(void); 200 void (*disable_all)(void);
137 void (*enable_all)(void); 201 void (*enable_all)(int added);
138 void (*enable)(struct perf_event *); 202 void (*enable)(struct perf_event *);
139 void (*disable)(struct perf_event *); 203 void (*disable)(struct perf_event *);
204 int (*hw_config)(struct perf_event *event);
205 int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign);
140 unsigned eventsel; 206 unsigned eventsel;
141 unsigned perfctr; 207 unsigned perfctr;
142 u64 (*event_map)(int); 208 u64 (*event_map)(int);
143 u64 (*raw_event)(u64);
144 int max_events; 209 int max_events;
145 int num_events; 210 int num_counters;
146 int num_events_fixed; 211 int num_counters_fixed;
147 int event_bits; 212 int cntval_bits;
148 u64 event_mask; 213 u64 cntval_mask;
149 int apic; 214 int apic;
150 u64 max_period; 215 u64 max_period;
151 u64 intel_ctrl;
152 void (*enable_bts)(u64 config);
153 void (*disable_bts)(void);
154
155 struct event_constraint * 216 struct event_constraint *
156 (*get_event_constraints)(struct cpu_hw_events *cpuc, 217 (*get_event_constraints)(struct cpu_hw_events *cpuc,
157 struct perf_event *event); 218 struct perf_event *event);
@@ -159,11 +220,33 @@ struct x86_pmu {
159 void (*put_event_constraints)(struct cpu_hw_events *cpuc, 220 void (*put_event_constraints)(struct cpu_hw_events *cpuc,
160 struct perf_event *event); 221 struct perf_event *event);
161 struct event_constraint *event_constraints; 222 struct event_constraint *event_constraints;
223 void (*quirks)(void);
224 int perfctr_second_write;
162 225
163 int (*cpu_prepare)(int cpu); 226 int (*cpu_prepare)(int cpu);
164 void (*cpu_starting)(int cpu); 227 void (*cpu_starting)(int cpu);
165 void (*cpu_dying)(int cpu); 228 void (*cpu_dying)(int cpu);
166 void (*cpu_dead)(int cpu); 229 void (*cpu_dead)(int cpu);
230
231 /*
232 * Intel Arch Perfmon v2+
233 */
234 u64 intel_ctrl;
235 union perf_capabilities intel_cap;
236
237 /*
238 * Intel DebugStore bits
239 */
240 int bts, pebs;
241 int pebs_record_size;
242 void (*drain_pebs)(struct pt_regs *regs);
243 struct event_constraint *pebs_constraints;
244
245 /*
246 * Intel LBR
247 */
248 unsigned long lbr_tos, lbr_from, lbr_to; /* MSR base regs */
249 int lbr_nr; /* hardware stack size */
167}; 250};
168 251
169static struct x86_pmu x86_pmu __read_mostly; 252static struct x86_pmu x86_pmu __read_mostly;
@@ -198,7 +281,7 @@ static u64
198x86_perf_event_update(struct perf_event *event) 281x86_perf_event_update(struct perf_event *event)
199{ 282{
200 struct hw_perf_event *hwc = &event->hw; 283 struct hw_perf_event *hwc = &event->hw;
201 int shift = 64 - x86_pmu.event_bits; 284 int shift = 64 - x86_pmu.cntval_bits;
202 u64 prev_raw_count, new_raw_count; 285 u64 prev_raw_count, new_raw_count;
203 int idx = hwc->idx; 286 int idx = hwc->idx;
204 s64 delta; 287 s64 delta;
@@ -214,10 +297,10 @@ x86_perf_event_update(struct perf_event *event)
214 * count to the generic event atomically: 297 * count to the generic event atomically:
215 */ 298 */
216again: 299again:
217 prev_raw_count = atomic64_read(&hwc->prev_count); 300 prev_raw_count = local64_read(&hwc->prev_count);
218 rdmsrl(hwc->event_base + idx, new_raw_count); 301 rdmsrl(hwc->event_base + idx, new_raw_count);
219 302
220 if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count, 303 if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
221 new_raw_count) != prev_raw_count) 304 new_raw_count) != prev_raw_count)
222 goto again; 305 goto again;
223 306
@@ -232,8 +315,8 @@ again:
232 delta = (new_raw_count << shift) - (prev_raw_count << shift); 315 delta = (new_raw_count << shift) - (prev_raw_count << shift);
233 delta >>= shift; 316 delta >>= shift;
234 317
235 atomic64_add(delta, &event->count); 318 local64_add(delta, &event->count);
236 atomic64_sub(delta, &hwc->period_left); 319 local64_sub(delta, &hwc->period_left);
237 320
238 return new_raw_count; 321 return new_raw_count;
239} 322}
@@ -241,33 +324,32 @@ again:
241static atomic_t active_events; 324static atomic_t active_events;
242static DEFINE_MUTEX(pmc_reserve_mutex); 325static DEFINE_MUTEX(pmc_reserve_mutex);
243 326
327#ifdef CONFIG_X86_LOCAL_APIC
328
244static bool reserve_pmc_hardware(void) 329static bool reserve_pmc_hardware(void)
245{ 330{
246#ifdef CONFIG_X86_LOCAL_APIC
247 int i; 331 int i;
248 332
249 if (nmi_watchdog == NMI_LOCAL_APIC) 333 if (nmi_watchdog == NMI_LOCAL_APIC)
250 disable_lapic_nmi_watchdog(); 334 disable_lapic_nmi_watchdog();
251 335
252 for (i = 0; i < x86_pmu.num_events; i++) { 336 for (i = 0; i < x86_pmu.num_counters; i++) {
253 if (!reserve_perfctr_nmi(x86_pmu.perfctr + i)) 337 if (!reserve_perfctr_nmi(x86_pmu.perfctr + i))
254 goto perfctr_fail; 338 goto perfctr_fail;
255 } 339 }
256 340
257 for (i = 0; i < x86_pmu.num_events; i++) { 341 for (i = 0; i < x86_pmu.num_counters; i++) {
258 if (!reserve_evntsel_nmi(x86_pmu.eventsel + i)) 342 if (!reserve_evntsel_nmi(x86_pmu.eventsel + i))
259 goto eventsel_fail; 343 goto eventsel_fail;
260 } 344 }
261#endif
262 345
263 return true; 346 return true;
264 347
265#ifdef CONFIG_X86_LOCAL_APIC
266eventsel_fail: 348eventsel_fail:
267 for (i--; i >= 0; i--) 349 for (i--; i >= 0; i--)
268 release_evntsel_nmi(x86_pmu.eventsel + i); 350 release_evntsel_nmi(x86_pmu.eventsel + i);
269 351
270 i = x86_pmu.num_events; 352 i = x86_pmu.num_counters;
271 353
272perfctr_fail: 354perfctr_fail:
273 for (i--; i >= 0; i--) 355 for (i--; i >= 0; i--)
@@ -277,128 +359,36 @@ perfctr_fail:
277 enable_lapic_nmi_watchdog(); 359 enable_lapic_nmi_watchdog();
278 360
279 return false; 361 return false;
280#endif
281} 362}
282 363
283static void release_pmc_hardware(void) 364static void release_pmc_hardware(void)
284{ 365{
285#ifdef CONFIG_X86_LOCAL_APIC
286 int i; 366 int i;
287 367
288 for (i = 0; i < x86_pmu.num_events; i++) { 368 for (i = 0; i < x86_pmu.num_counters; i++) {
289 release_perfctr_nmi(x86_pmu.perfctr + i); 369 release_perfctr_nmi(x86_pmu.perfctr + i);
290 release_evntsel_nmi(x86_pmu.eventsel + i); 370 release_evntsel_nmi(x86_pmu.eventsel + i);
291 } 371 }
292 372
293 if (nmi_watchdog == NMI_LOCAL_APIC) 373 if (nmi_watchdog == NMI_LOCAL_APIC)
294 enable_lapic_nmi_watchdog(); 374 enable_lapic_nmi_watchdog();
295#endif
296} 375}
297 376
298static inline bool bts_available(void) 377#else
299{
300 return x86_pmu.enable_bts != NULL;
301}
302
303static void init_debug_store_on_cpu(int cpu)
304{
305 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
306
307 if (!ds)
308 return;
309
310 wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA,
311 (u32)((u64)(unsigned long)ds),
312 (u32)((u64)(unsigned long)ds >> 32));
313}
314
315static void fini_debug_store_on_cpu(int cpu)
316{
317 if (!per_cpu(cpu_hw_events, cpu).ds)
318 return;
319
320 wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
321}
322
323static void release_bts_hardware(void)
324{
325 int cpu;
326
327 if (!bts_available())
328 return;
329
330 get_online_cpus();
331
332 for_each_online_cpu(cpu)
333 fini_debug_store_on_cpu(cpu);
334
335 for_each_possible_cpu(cpu) {
336 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
337
338 if (!ds)
339 continue;
340
341 per_cpu(cpu_hw_events, cpu).ds = NULL;
342
343 kfree((void *)(unsigned long)ds->bts_buffer_base);
344 kfree(ds);
345 }
346
347 put_online_cpus();
348}
349
350static int reserve_bts_hardware(void)
351{
352 int cpu, err = 0;
353
354 if (!bts_available())
355 return 0;
356
357 get_online_cpus();
358
359 for_each_possible_cpu(cpu) {
360 struct debug_store *ds;
361 void *buffer;
362
363 err = -ENOMEM;
364 buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL);
365 if (unlikely(!buffer))
366 break;
367
368 ds = kzalloc(sizeof(*ds), GFP_KERNEL);
369 if (unlikely(!ds)) {
370 kfree(buffer);
371 break;
372 }
373 378
374 ds->bts_buffer_base = (u64)(unsigned long)buffer; 379static bool reserve_pmc_hardware(void) { return true; }
375 ds->bts_index = ds->bts_buffer_base; 380static void release_pmc_hardware(void) {}
376 ds->bts_absolute_maximum =
377 ds->bts_buffer_base + BTS_BUFFER_SIZE;
378 ds->bts_interrupt_threshold =
379 ds->bts_absolute_maximum - BTS_OVFL_TH;
380 381
381 per_cpu(cpu_hw_events, cpu).ds = ds; 382#endif
382 err = 0;
383 }
384
385 if (err)
386 release_bts_hardware();
387 else {
388 for_each_online_cpu(cpu)
389 init_debug_store_on_cpu(cpu);
390 }
391
392 put_online_cpus();
393 383
394 return err; 384static int reserve_ds_buffers(void);
395} 385static void release_ds_buffers(void);
396 386
397static void hw_perf_event_destroy(struct perf_event *event) 387static void hw_perf_event_destroy(struct perf_event *event)
398{ 388{
399 if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) { 389 if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) {
400 release_pmc_hardware(); 390 release_pmc_hardware();
401 release_bts_hardware(); 391 release_ds_buffers();
402 mutex_unlock(&pmc_reserve_mutex); 392 mutex_unlock(&pmc_reserve_mutex);
403 } 393 }
404} 394}
@@ -441,59 +431,16 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr)
441 return 0; 431 return 0;
442} 432}
443 433
444/* 434static int x86_setup_perfctr(struct perf_event *event)
445 * Setup the hardware configuration for a given attr_type
446 */
447static int __hw_perf_event_init(struct perf_event *event)
448{ 435{
449 struct perf_event_attr *attr = &event->attr; 436 struct perf_event_attr *attr = &event->attr;
450 struct hw_perf_event *hwc = &event->hw; 437 struct hw_perf_event *hwc = &event->hw;
451 u64 config; 438 u64 config;
452 int err;
453
454 if (!x86_pmu_initialized())
455 return -ENODEV;
456
457 err = 0;
458 if (!atomic_inc_not_zero(&active_events)) {
459 mutex_lock(&pmc_reserve_mutex);
460 if (atomic_read(&active_events) == 0) {
461 if (!reserve_pmc_hardware())
462 err = -EBUSY;
463 else
464 err = reserve_bts_hardware();
465 }
466 if (!err)
467 atomic_inc(&active_events);
468 mutex_unlock(&pmc_reserve_mutex);
469 }
470 if (err)
471 return err;
472
473 event->destroy = hw_perf_event_destroy;
474
475 /*
476 * Generate PMC IRQs:
477 * (keep 'enabled' bit clear for now)
478 */
479 hwc->config = ARCH_PERFMON_EVENTSEL_INT;
480
481 hwc->idx = -1;
482 hwc->last_cpu = -1;
483 hwc->last_tag = ~0ULL;
484
485 /*
486 * Count user and OS events unless requested not to.
487 */
488 if (!attr->exclude_user)
489 hwc->config |= ARCH_PERFMON_EVENTSEL_USR;
490 if (!attr->exclude_kernel)
491 hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
492 439
493 if (!hwc->sample_period) { 440 if (!hwc->sample_period) {
494 hwc->sample_period = x86_pmu.max_period; 441 hwc->sample_period = x86_pmu.max_period;
495 hwc->last_period = hwc->sample_period; 442 hwc->last_period = hwc->sample_period;
496 atomic64_set(&hwc->period_left, hwc->sample_period); 443 local64_set(&hwc->period_left, hwc->sample_period);
497 } else { 444 } else {
498 /* 445 /*
499 * If we have a PMU initialized but no APIC 446 * If we have a PMU initialized but no APIC
@@ -505,16 +452,8 @@ static int __hw_perf_event_init(struct perf_event *event)
505 return -EOPNOTSUPP; 452 return -EOPNOTSUPP;
506 } 453 }
507 454
508 /* 455 if (attr->type == PERF_TYPE_RAW)
509 * Raw hw_event type provide the config in the hw_event structure
510 */
511 if (attr->type == PERF_TYPE_RAW) {
512 hwc->config |= x86_pmu.raw_event(attr->config);
513 if ((hwc->config & ARCH_PERFMON_EVENTSEL_ANY) &&
514 perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
515 return -EACCES;
516 return 0; 456 return 0;
517 }
518 457
519 if (attr->type == PERF_TYPE_HW_CACHE) 458 if (attr->type == PERF_TYPE_HW_CACHE)
520 return set_ext_hw_attr(hwc, attr); 459 return set_ext_hw_attr(hwc, attr);
@@ -539,11 +478,11 @@ static int __hw_perf_event_init(struct perf_event *event)
539 if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) && 478 if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) &&
540 (hwc->sample_period == 1)) { 479 (hwc->sample_period == 1)) {
541 /* BTS is not supported by this architecture. */ 480 /* BTS is not supported by this architecture. */
542 if (!bts_available()) 481 if (!x86_pmu.bts)
543 return -EOPNOTSUPP; 482 return -EOPNOTSUPP;
544 483
545 /* BTS is currently only allowed for user-mode. */ 484 /* BTS is currently only allowed for user-mode. */
546 if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) 485 if (!attr->exclude_kernel)
547 return -EOPNOTSUPP; 486 return -EOPNOTSUPP;
548 } 487 }
549 488
@@ -552,12 +491,87 @@ static int __hw_perf_event_init(struct perf_event *event)
552 return 0; 491 return 0;
553} 492}
554 493
494static int x86_pmu_hw_config(struct perf_event *event)
495{
496 if (event->attr.precise_ip) {
497 int precise = 0;
498
499 /* Support for constant skid */
500 if (x86_pmu.pebs)
501 precise++;
502
503 /* Support for IP fixup */
504 if (x86_pmu.lbr_nr)
505 precise++;
506
507 if (event->attr.precise_ip > precise)
508 return -EOPNOTSUPP;
509 }
510
511 /*
512 * Generate PMC IRQs:
513 * (keep 'enabled' bit clear for now)
514 */
515 event->hw.config = ARCH_PERFMON_EVENTSEL_INT;
516
517 /*
518 * Count user and OS events unless requested not to
519 */
520 if (!event->attr.exclude_user)
521 event->hw.config |= ARCH_PERFMON_EVENTSEL_USR;
522 if (!event->attr.exclude_kernel)
523 event->hw.config |= ARCH_PERFMON_EVENTSEL_OS;
524
525 if (event->attr.type == PERF_TYPE_RAW)
526 event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;
527
528 return x86_setup_perfctr(event);
529}
530
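
x86_pmu_hw_config() above is where a request for precise sampling is validated: precise_ip == 1 needs PEBS (constant skid), and precise_ip == 2 additionally needs LBR so the DS code can fix up the reported IP. From user space the request is just the precise_ip field of perf_event_attr; a minimal example that would exercise this path is sketched below (hypothetical file name, error handling trimmed).

/* precise.c - request a precise cycles event; needs a PEBS-capable CPU.
 * Build: cc precise.c */
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>
#include <stdio.h>

int main(void)
{
    struct perf_event_attr attr;
    int fd;

    memset(&attr, 0, sizeof(attr));
    attr.size = sizeof(attr);
    attr.type = PERF_TYPE_HARDWARE;
    attr.config = PERF_COUNT_HW_CPU_CYCLES;
    attr.sample_period = 100000;
    attr.precise_ip = 2;     /* 2: zero-skid IP, needs PEBS plus LBR fixup */

    fd = syscall(__NR_perf_event_open, &attr, 0 /* self */, -1 /* any cpu */,
                 -1 /* no group */, 0);
    if (fd < 0)
        perror("perf_event_open");  /* EOPNOTSUPP if precise_ip > supported */
    else
        close(fd);
    return 0;
}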
531/*
532 * Setup the hardware configuration for a given attr_type
533 */
534static int __hw_perf_event_init(struct perf_event *event)
535{
536 int err;
537
538 if (!x86_pmu_initialized())
539 return -ENODEV;
540
541 err = 0;
542 if (!atomic_inc_not_zero(&active_events)) {
543 mutex_lock(&pmc_reserve_mutex);
544 if (atomic_read(&active_events) == 0) {
545 if (!reserve_pmc_hardware())
546 err = -EBUSY;
547 else {
548 err = reserve_ds_buffers();
549 if (err)
550 release_pmc_hardware();
551 }
552 }
553 if (!err)
554 atomic_inc(&active_events);
555 mutex_unlock(&pmc_reserve_mutex);
556 }
557 if (err)
558 return err;
559
560 event->destroy = hw_perf_event_destroy;
561
562 event->hw.idx = -1;
563 event->hw.last_cpu = -1;
564 event->hw.last_tag = ~0ULL;
565
566 return x86_pmu.hw_config(event);
567}
568
555static void x86_pmu_disable_all(void) 569static void x86_pmu_disable_all(void)
556{ 570{
557 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 571 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
558 int idx; 572 int idx;
559 573
560 for (idx = 0; idx < x86_pmu.num_events; idx++) { 574 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
561 u64 val; 575 u64 val;
562 576
563 if (!test_bit(idx, cpuc->active_mask)) 577 if (!test_bit(idx, cpuc->active_mask))
@@ -587,12 +601,12 @@ void hw_perf_disable(void)
587 x86_pmu.disable_all(); 601 x86_pmu.disable_all();
588} 602}
589 603
590static void x86_pmu_enable_all(void) 604static void x86_pmu_enable_all(int added)
591{ 605{
592 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 606 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
593 int idx; 607 int idx;
594 608
595 for (idx = 0; idx < x86_pmu.num_events; idx++) { 609 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
596 struct perf_event *event = cpuc->events[idx]; 610 struct perf_event *event = cpuc->events[idx];
597 u64 val; 611 u64 val;
598 612
@@ -667,14 +681,14 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
667 * assign events to counters starting with most 681 * assign events to counters starting with most
668 * constrained events. 682 * constrained events.
669 */ 683 */
670 wmax = x86_pmu.num_events; 684 wmax = x86_pmu.num_counters;
671 685
672 /* 686 /*
673 * when fixed event counters are present, 687 * when fixed event counters are present,
674 * wmax is incremented by 1 to account 688 * wmax is incremented by 1 to account
675 * for one more choice 689 * for one more choice
676 */ 690 */
677 if (x86_pmu.num_events_fixed) 691 if (x86_pmu.num_counters_fixed)
678 wmax++; 692 wmax++;
679 693
680 for (w = 1, num = n; num && w <= wmax; w++) { 694 for (w = 1, num = n; num && w <= wmax; w++) {
@@ -724,7 +738,7 @@ static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader,
724 struct perf_event *event; 738 struct perf_event *event;
725 int n, max_count; 739 int n, max_count;
726 740
727 max_count = x86_pmu.num_events + x86_pmu.num_events_fixed; 741 max_count = x86_pmu.num_counters + x86_pmu.num_counters_fixed;
728 742
729 /* current number of events already accepted */ 743 /* current number of events already accepted */
730 n = cpuc->n_events; 744 n = cpuc->n_events;
@@ -795,7 +809,7 @@ void hw_perf_enable(void)
795 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 809 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
796 struct perf_event *event; 810 struct perf_event *event;
797 struct hw_perf_event *hwc; 811 struct hw_perf_event *hwc;
798 int i; 812 int i, added = cpuc->n_added;
799 813
800 if (!x86_pmu_initialized()) 814 if (!x86_pmu_initialized())
801 return; 815 return;
@@ -847,19 +861,20 @@ void hw_perf_enable(void)
847 cpuc->enabled = 1; 861 cpuc->enabled = 1;
848 barrier(); 862 barrier();
849 863
850 x86_pmu.enable_all(); 864 x86_pmu.enable_all(added);
851} 865}
852 866
853static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc) 867static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
868 u64 enable_mask)
854{ 869{
855 (void)checking_wrmsrl(hwc->config_base + hwc->idx, 870 wrmsrl(hwc->config_base + hwc->idx, hwc->config | enable_mask);
856 hwc->config | ARCH_PERFMON_EVENTSEL_ENABLE);
857} 871}
858 872
859static inline void x86_pmu_disable_event(struct perf_event *event) 873static inline void x86_pmu_disable_event(struct perf_event *event)
860{ 874{
861 struct hw_perf_event *hwc = &event->hw; 875 struct hw_perf_event *hwc = &event->hw;
862 (void)checking_wrmsrl(hwc->config_base + hwc->idx, hwc->config); 876
877 wrmsrl(hwc->config_base + hwc->idx, hwc->config);
863} 878}
864 879
865static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); 880static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
@@ -872,9 +887,9 @@ static int
872x86_perf_event_set_period(struct perf_event *event) 887x86_perf_event_set_period(struct perf_event *event)
873{ 888{
874 struct hw_perf_event *hwc = &event->hw; 889 struct hw_perf_event *hwc = &event->hw;
875 s64 left = atomic64_read(&hwc->period_left); 890 s64 left = local64_read(&hwc->period_left);
876 s64 period = hwc->sample_period; 891 s64 period = hwc->sample_period;
877 int err, ret = 0, idx = hwc->idx; 892 int ret = 0, idx = hwc->idx;
878 893
879 if (idx == X86_PMC_IDX_FIXED_BTS) 894 if (idx == X86_PMC_IDX_FIXED_BTS)
880 return 0; 895 return 0;
@@ -884,14 +899,14 @@ x86_perf_event_set_period(struct perf_event *event)
884 */ 899 */
885 if (unlikely(left <= -period)) { 900 if (unlikely(left <= -period)) {
886 left = period; 901 left = period;
887 atomic64_set(&hwc->period_left, left); 902 local64_set(&hwc->period_left, left);
888 hwc->last_period = period; 903 hwc->last_period = period;
889 ret = 1; 904 ret = 1;
890 } 905 }
891 906
892 if (unlikely(left <= 0)) { 907 if (unlikely(left <= 0)) {
893 left += period; 908 left += period;
894 atomic64_set(&hwc->period_left, left); 909 local64_set(&hwc->period_left, left);
895 hwc->last_period = period; 910 hwc->last_period = period;
896 ret = 1; 911 ret = 1;
897 } 912 }
@@ -910,10 +925,19 @@ x86_perf_event_set_period(struct perf_event *event)
910 * The hw event starts counting from this event offset, 925 * The hw event starts counting from this event offset,
911 * mark it to be able to extract future deltas: 926 * mark it to be able to extract future deltas:
912 */ 927 */
913 atomic64_set(&hwc->prev_count, (u64)-left); 928 local64_set(&hwc->prev_count, (u64)-left);
914 929
915 err = checking_wrmsrl(hwc->event_base + idx, 930 wrmsrl(hwc->event_base + idx, (u64)(-left) & x86_pmu.cntval_mask);
916 (u64)(-left) & x86_pmu.event_mask); 931
932 /*
 933	 * Due to an erratum on certain CPUs we need
934 * a second write to be sure the register
935 * is updated properly
936 */
937 if (x86_pmu.perfctr_second_write) {
938 wrmsrl(hwc->event_base + idx,
939 (u64)(-left) & x86_pmu.cntval_mask);
940 }
917 941
918 perf_event_update_userpage(event); 942 perf_event_update_userpage(event);
919 943
@@ -924,7 +948,8 @@ static void x86_pmu_enable_event(struct perf_event *event)
924{ 948{
925 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 949 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
926 if (cpuc->enabled) 950 if (cpuc->enabled)
927 __x86_pmu_enable_event(&event->hw); 951 __x86_pmu_enable_event(&event->hw,
952 ARCH_PERFMON_EVENTSEL_ENABLE);
928} 953}
929 954
930/* 955/*
@@ -950,7 +975,15 @@ static int x86_pmu_enable(struct perf_event *event)
950 if (n < 0) 975 if (n < 0)
951 return n; 976 return n;
952 977
953 ret = x86_schedule_events(cpuc, n, assign); 978 /*
 979	 * If a group events scheduling transaction was started,
 980	 * skip the schedulability test here; it will be performed
 981	 * at commit time (->commit_txn) as a whole
982 */
983 if (cpuc->group_flag & PERF_EVENT_TXN)
984 goto out;
985
986 ret = x86_pmu.schedule_events(cpuc, n, assign);
954 if (ret) 987 if (ret)
955 return ret; 988 return ret;
956 /* 989 /*
@@ -959,8 +992,10 @@ static int x86_pmu_enable(struct perf_event *event)
959 */ 992 */
960 memcpy(cpuc->assign, assign, n*sizeof(int)); 993 memcpy(cpuc->assign, assign, n*sizeof(int));
961 994
995out:
962 cpuc->n_events = n; 996 cpuc->n_events = n;
963 cpuc->n_added += n - n0; 997 cpuc->n_added += n - n0;
998 cpuc->n_txn += n - n0;
964 999
965 return 0; 1000 return 0;
966} 1001}
@@ -976,6 +1011,7 @@ static int x86_pmu_start(struct perf_event *event)
976 x86_perf_event_set_period(event); 1011 x86_perf_event_set_period(event);
977 cpuc->events[idx] = event; 1012 cpuc->events[idx] = event;
978 __set_bit(idx, cpuc->active_mask); 1013 __set_bit(idx, cpuc->active_mask);
1014 __set_bit(idx, cpuc->running);
979 x86_pmu.enable(event); 1015 x86_pmu.enable(event);
980 perf_event_update_userpage(event); 1016 perf_event_update_userpage(event);
981 1017
@@ -991,11 +1027,12 @@ static void x86_pmu_unthrottle(struct perf_event *event)
991void perf_event_print_debug(void) 1027void perf_event_print_debug(void)
992{ 1028{
993 u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; 1029 u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
1030 u64 pebs;
994 struct cpu_hw_events *cpuc; 1031 struct cpu_hw_events *cpuc;
995 unsigned long flags; 1032 unsigned long flags;
996 int cpu, idx; 1033 int cpu, idx;
997 1034
998 if (!x86_pmu.num_events) 1035 if (!x86_pmu.num_counters)
999 return; 1036 return;
1000 1037
1001 local_irq_save(flags); 1038 local_irq_save(flags);
@@ -1008,16 +1045,18 @@ void perf_event_print_debug(void)
1008 rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); 1045 rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
1009 rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); 1046 rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
1010 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed); 1047 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
1048 rdmsrl(MSR_IA32_PEBS_ENABLE, pebs);
1011 1049
1012 pr_info("\n"); 1050 pr_info("\n");
1013 pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl); 1051 pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl);
1014 pr_info("CPU#%d: status: %016llx\n", cpu, status); 1052 pr_info("CPU#%d: status: %016llx\n", cpu, status);
1015 pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow); 1053 pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow);
1016 pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed); 1054 pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed);
1055 pr_info("CPU#%d: pebs: %016llx\n", cpu, pebs);
1017 } 1056 }
1018 pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask); 1057 pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask);
1019 1058
1020 for (idx = 0; idx < x86_pmu.num_events; idx++) { 1059 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1021 rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); 1060 rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
1022 rdmsrl(x86_pmu.perfctr + idx, pmc_count); 1061 rdmsrl(x86_pmu.perfctr + idx, pmc_count);
1023 1062
@@ -1030,7 +1069,7 @@ void perf_event_print_debug(void)
1030 pr_info("CPU#%d: gen-PMC%d left: %016llx\n", 1069 pr_info("CPU#%d: gen-PMC%d left: %016llx\n",
1031 cpu, idx, prev_left); 1070 cpu, idx, prev_left);
1032 } 1071 }
1033 for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) { 1072 for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
1034 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count); 1073 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
1035 1074
1036 pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", 1075 pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
@@ -1064,6 +1103,14 @@ static void x86_pmu_disable(struct perf_event *event)
1064 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 1103 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1065 int i; 1104 int i;
1066 1105
1106 /*
1107 * If we're called during a txn, we don't need to do anything.
1108 * The events never got scheduled and ->cancel_txn will truncate
1109 * the event_list.
1110 */
1111 if (cpuc->group_flag & PERF_EVENT_TXN)
1112 return;
1113
1067 x86_pmu_stop(event); 1114 x86_pmu_stop(event);
1068 1115
1069 for (i = 0; i < cpuc->n_events; i++) { 1116 for (i = 0; i < cpuc->n_events; i++) {
@@ -1095,21 +1142,29 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
1095 1142
1096 cpuc = &__get_cpu_var(cpu_hw_events); 1143 cpuc = &__get_cpu_var(cpu_hw_events);
1097 1144
1098 for (idx = 0; idx < x86_pmu.num_events; idx++) { 1145 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1099 if (!test_bit(idx, cpuc->active_mask)) 1146 if (!test_bit(idx, cpuc->active_mask)) {
1147 /*
 1148			/*
			 * Though we deactivated the counter, some CPUs
 1149			 * might still deliver spurious interrupts that are
 1150			 * still in flight. Catch them:
1151 */
1152 if (__test_and_clear_bit(idx, cpuc->running))
1153 handled++;
1100 continue; 1154 continue;
1155 }
1101 1156
1102 event = cpuc->events[idx]; 1157 event = cpuc->events[idx];
1103 hwc = &event->hw; 1158 hwc = &event->hw;
1104 1159
1105 val = x86_perf_event_update(event); 1160 val = x86_perf_event_update(event);
1106 if (val & (1ULL << (x86_pmu.event_bits - 1))) 1161 if (val & (1ULL << (x86_pmu.cntval_bits - 1)))
1107 continue; 1162 continue;
1108 1163
1109 /* 1164 /*
1110 * event overflow 1165 * event overflow
1111 */ 1166 */
1112 handled = 1; 1167 handled++;
1113 data.period = event->hw.last_period; 1168 data.period = event->hw.last_period;
1114 1169
1115 if (!x86_perf_event_set_period(event)) 1170 if (!x86_perf_event_set_period(event))
@@ -1146,7 +1201,6 @@ void set_perf_event_pending(void)
1146 1201
1147void perf_events_lapic_init(void) 1202void perf_events_lapic_init(void)
1148{ 1203{
1149#ifdef CONFIG_X86_LOCAL_APIC
1150 if (!x86_pmu.apic || !x86_pmu_initialized()) 1204 if (!x86_pmu.apic || !x86_pmu_initialized())
1151 return; 1205 return;
1152 1206
@@ -1154,15 +1208,22 @@ void perf_events_lapic_init(void)
1154 * Always use NMI for PMU 1208 * Always use NMI for PMU
1155 */ 1209 */
1156 apic_write(APIC_LVTPC, APIC_DM_NMI); 1210 apic_write(APIC_LVTPC, APIC_DM_NMI);
1157#endif
1158} 1211}
1159 1212
1213struct pmu_nmi_state {
1214 unsigned int marked;
1215 int handled;
1216};
1217
1218static DEFINE_PER_CPU(struct pmu_nmi_state, pmu_nmi);
1219
1160static int __kprobes 1220static int __kprobes
1161perf_event_nmi_handler(struct notifier_block *self, 1221perf_event_nmi_handler(struct notifier_block *self,
1162 unsigned long cmd, void *__args) 1222 unsigned long cmd, void *__args)
1163{ 1223{
1164 struct die_args *args = __args; 1224 struct die_args *args = __args;
1165 struct pt_regs *regs; 1225 unsigned int this_nmi;
1226 int handled;
1166 1227
1167 if (!atomic_read(&active_events)) 1228 if (!atomic_read(&active_events))
1168 return NOTIFY_DONE; 1229 return NOTIFY_DONE;
@@ -1171,24 +1232,47 @@ perf_event_nmi_handler(struct notifier_block *self,
1171 case DIE_NMI: 1232 case DIE_NMI:
1172 case DIE_NMI_IPI: 1233 case DIE_NMI_IPI:
1173 break; 1234 break;
1174 1235 case DIE_NMIUNKNOWN:
1236 this_nmi = percpu_read(irq_stat.__nmi_count);
1237 if (this_nmi != __get_cpu_var(pmu_nmi).marked)
1238 /* let the kernel handle the unknown nmi */
1239 return NOTIFY_DONE;
1240 /*
1241 * This one is a PMU back-to-back nmi. Two events
1242 * trigger 'simultaneously' raising two back-to-back
1243 * NMIs. If the first NMI handles both, the latter
1244 * will be empty and daze the CPU. So, we drop it to
1245 * avoid false-positive 'unknown nmi' messages.
1246 */
1247 return NOTIFY_STOP;
1175 default: 1248 default:
1176 return NOTIFY_DONE; 1249 return NOTIFY_DONE;
1177 } 1250 }
1178 1251
1179 regs = args->regs;
1180
1181#ifdef CONFIG_X86_LOCAL_APIC
1182 apic_write(APIC_LVTPC, APIC_DM_NMI); 1252 apic_write(APIC_LVTPC, APIC_DM_NMI);
1183#endif 1253
1184 /* 1254 handled = x86_pmu.handle_irq(args->regs);
1185 * Can't rely on the handled return value to say it was our NMI, two 1255 if (!handled)
1186 * events could trigger 'simultaneously' raising two back-to-back NMIs. 1256 return NOTIFY_DONE;
1187 * 1257
1188 * If the first NMI handles both, the latter will be empty and daze 1258 this_nmi = percpu_read(irq_stat.__nmi_count);
1189 * the CPU. 1259 if ((handled > 1) ||
1190 */ 1260 /* the next nmi could be a back-to-back nmi */
1191 x86_pmu.handle_irq(regs); 1261 ((__get_cpu_var(pmu_nmi).marked == this_nmi) &&
1262 (__get_cpu_var(pmu_nmi).handled > 1))) {
1263 /*
1264 * We could have two subsequent back-to-back nmis: The
1265 * first handles more than one counter, the 2nd
1266 * handles only one counter and the 3rd handles no
1267 * counter.
1268 *
1269 * This is the 2nd nmi because the previous was
1270 * handling more than one counter. We will mark the
1271 * next (3rd) and then drop it if unhandled.
1272 */
1273 __get_cpu_var(pmu_nmi).marked = this_nmi + 1;
1274 __get_cpu_var(pmu_nmi).handled = handled;
1275 }
1192 1276
1193 return NOTIFY_STOP; 1277 return NOTIFY_STOP;
1194} 1278}
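
The marked/handled bookkeeping above is dense, so here is the same policy restated as two small pure functions. This is only an illustration with made-up names (b2b_state, swallow_unknown_nmi, predict_back_to_back), not code from the patch: an unknown NMI is swallowed exactly when it is the one that a previous PMU NMI predicted could arrive empty.

/* Illustration of the back-to-back NMI policy as pure functions.
 * The fields mirror the per-cpu pmu_nmi state. */
struct b2b_state {
    unsigned int marked;  /* nmi count for which an empty NMI is expected */
    int handled;          /* counters handled by the NMI that set 'marked' */
};

/* For DIE_NMIUNKNOWN: return 1 to drop (swallow) the unknown NMI. */
static int swallow_unknown_nmi(const struct b2b_state *s, unsigned int this_nmi)
{
    return this_nmi == s->marked;
}

/* After a handled PMU NMI: predict whether the *next* NMI may be an empty
 * back-to-back one and should therefore be swallowed if unknown. */
static void predict_back_to_back(struct b2b_state *s, unsigned int this_nmi,
                                 int handled)
{
    if (handled > 1 || (s->marked == this_nmi && s->handled > 1)) {
        s->marked  = this_nmi + 1;
        s->handled = handled;
    }
}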
@@ -1217,118 +1301,11 @@ x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
1217 return &unconstrained; 1301 return &unconstrained;
1218} 1302}
1219 1303
1220static int x86_event_sched_in(struct perf_event *event,
1221 struct perf_cpu_context *cpuctx)
1222{
1223 int ret = 0;
1224
1225 event->state = PERF_EVENT_STATE_ACTIVE;
1226 event->oncpu = smp_processor_id();
1227 event->tstamp_running += event->ctx->time - event->tstamp_stopped;
1228
1229 if (!is_x86_event(event))
1230 ret = event->pmu->enable(event);
1231
1232 if (!ret && !is_software_event(event))
1233 cpuctx->active_oncpu++;
1234
1235 if (!ret && event->attr.exclusive)
1236 cpuctx->exclusive = 1;
1237
1238 return ret;
1239}
1240
1241static void x86_event_sched_out(struct perf_event *event,
1242 struct perf_cpu_context *cpuctx)
1243{
1244 event->state = PERF_EVENT_STATE_INACTIVE;
1245 event->oncpu = -1;
1246
1247 if (!is_x86_event(event))
1248 event->pmu->disable(event);
1249
1250 event->tstamp_running -= event->ctx->time - event->tstamp_stopped;
1251
1252 if (!is_software_event(event))
1253 cpuctx->active_oncpu--;
1254
1255 if (event->attr.exclusive || !cpuctx->active_oncpu)
1256 cpuctx->exclusive = 0;
1257}
1258
1259/*
1260 * Called to enable a whole group of events.
1261 * Returns 1 if the group was enabled, or -EAGAIN if it could not be.
1262 * Assumes the caller has disabled interrupts and has
1263 * frozen the PMU with hw_perf_save_disable.
1264 *
1265 * called with PMU disabled. If successful and return value 1,
1266 * then guaranteed to call perf_enable() and hw_perf_enable()
1267 */
1268int hw_perf_group_sched_in(struct perf_event *leader,
1269 struct perf_cpu_context *cpuctx,
1270 struct perf_event_context *ctx)
1271{
1272 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1273 struct perf_event *sub;
1274 int assign[X86_PMC_IDX_MAX];
1275 int n0, n1, ret;
1276
1277 /* n0 = total number of events */
1278 n0 = collect_events(cpuc, leader, true);
1279 if (n0 < 0)
1280 return n0;
1281
1282 ret = x86_schedule_events(cpuc, n0, assign);
1283 if (ret)
1284 return ret;
1285
1286 ret = x86_event_sched_in(leader, cpuctx);
1287 if (ret)
1288 return ret;
1289
1290 n1 = 1;
1291 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
1292 if (sub->state > PERF_EVENT_STATE_OFF) {
1293 ret = x86_event_sched_in(sub, cpuctx);
1294 if (ret)
1295 goto undo;
1296 ++n1;
1297 }
1298 }
1299 /*
1300 * copy new assignment, now we know it is possible
1301 * will be used by hw_perf_enable()
1302 */
1303 memcpy(cpuc->assign, assign, n0*sizeof(int));
1304
1305 cpuc->n_events = n0;
1306 cpuc->n_added += n1;
1307 ctx->nr_active += n1;
1308
1309 /*
1310 * 1 means successful and events are active
1311 * This is not quite true because we defer
1312 * actual activation until hw_perf_enable() but
1313 * this way we* ensure caller won't try to enable
1314 * individual events
1315 */
1316 return 1;
1317undo:
1318 x86_event_sched_out(leader, cpuctx);
1319 n0 = 1;
1320 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
1321 if (sub->state == PERF_EVENT_STATE_ACTIVE) {
1322 x86_event_sched_out(sub, cpuctx);
1323 if (++n0 == n1)
1324 break;
1325 }
1326 }
1327 return ret;
1328}
1329
1330#include "perf_event_amd.c" 1304#include "perf_event_amd.c"
1331#include "perf_event_p6.c" 1305#include "perf_event_p6.c"
1306#include "perf_event_p4.c"
1307#include "perf_event_intel_lbr.c"
1308#include "perf_event_intel_ds.c"
1332#include "perf_event_intel.c" 1309#include "perf_event_intel.c"
1333 1310
1334static int __cpuinit 1311static int __cpuinit
@@ -1402,48 +1379,50 @@ void __init init_hw_perf_events(void)
1402 1379
1403 pr_cont("%s PMU driver.\n", x86_pmu.name); 1380 pr_cont("%s PMU driver.\n", x86_pmu.name);
1404 1381
1405 if (x86_pmu.num_events > X86_PMC_MAX_GENERIC) { 1382 if (x86_pmu.quirks)
1383 x86_pmu.quirks();
1384
1385 if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {
1406 WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", 1386 WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
1407 x86_pmu.num_events, X86_PMC_MAX_GENERIC); 1387 x86_pmu.num_counters, X86_PMC_MAX_GENERIC);
1408 x86_pmu.num_events = X86_PMC_MAX_GENERIC; 1388 x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
1409 } 1389 }
1410 perf_event_mask = (1 << x86_pmu.num_events) - 1; 1390 x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
1411 perf_max_events = x86_pmu.num_events; 1391 perf_max_events = x86_pmu.num_counters;
1412 1392
1413 if (x86_pmu.num_events_fixed > X86_PMC_MAX_FIXED) { 1393 if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
1414 WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!", 1394 WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
1415 x86_pmu.num_events_fixed, X86_PMC_MAX_FIXED); 1395 x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED);
1416 x86_pmu.num_events_fixed = X86_PMC_MAX_FIXED; 1396 x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED;
1417 } 1397 }
1418 1398
1419 perf_event_mask |= 1399 x86_pmu.intel_ctrl |=
1420 ((1LL << x86_pmu.num_events_fixed)-1) << X86_PMC_IDX_FIXED; 1400 ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED;
1421 x86_pmu.intel_ctrl = perf_event_mask;
1422 1401
1423 perf_events_lapic_init(); 1402 perf_events_lapic_init();
1424 register_die_notifier(&perf_event_nmi_notifier); 1403 register_die_notifier(&perf_event_nmi_notifier);
1425 1404
1426 unconstrained = (struct event_constraint) 1405 unconstrained = (struct event_constraint)
1427 __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_events) - 1, 1406 __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1,
1428 0, x86_pmu.num_events); 1407 0, x86_pmu.num_counters);
1429 1408
1430 if (x86_pmu.event_constraints) { 1409 if (x86_pmu.event_constraints) {
1431 for_each_event_constraint(c, x86_pmu.event_constraints) { 1410 for_each_event_constraint(c, x86_pmu.event_constraints) {
1432 if (c->cmask != INTEL_ARCH_FIXED_MASK) 1411 if (c->cmask != X86_RAW_EVENT_MASK)
1433 continue; 1412 continue;
1434 1413
1435 c->idxmsk64 |= (1ULL << x86_pmu.num_events) - 1; 1414 c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
1436 c->weight += x86_pmu.num_events; 1415 c->weight += x86_pmu.num_counters;
1437 } 1416 }
1438 } 1417 }
1439 1418
1440 pr_info("... version: %d\n", x86_pmu.version); 1419 pr_info("... version: %d\n", x86_pmu.version);
1441 pr_info("... bit width: %d\n", x86_pmu.event_bits); 1420 pr_info("... bit width: %d\n", x86_pmu.cntval_bits);
1442 pr_info("... generic registers: %d\n", x86_pmu.num_events); 1421 pr_info("... generic registers: %d\n", x86_pmu.num_counters);
1443 pr_info("... value mask: %016Lx\n", x86_pmu.event_mask); 1422 pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask);
1444 pr_info("... max period: %016Lx\n", x86_pmu.max_period); 1423 pr_info("... max period: %016Lx\n", x86_pmu.max_period);
1445 pr_info("... fixed-purpose events: %d\n", x86_pmu.num_events_fixed); 1424 pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed);
1446 pr_info("... event mask: %016Lx\n", perf_event_mask); 1425 pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl);
1447 1426
1448 perf_cpu_notifier(x86_pmu_notifier); 1427 perf_cpu_notifier(x86_pmu_notifier);
1449} 1428}
@@ -1453,6 +1432,67 @@ static inline void x86_pmu_read(struct perf_event *event)
1453 x86_perf_event_update(event); 1432 x86_perf_event_update(event);
1454} 1433}
1455 1434
1435/*
1436 * Start group events scheduling transaction
1437 * Set the flag to make pmu::enable() not perform the
 1438 * schedulability test; it will be performed at commit time
1439 */
1440static void x86_pmu_start_txn(const struct pmu *pmu)
1441{
1442 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1443
1444 cpuc->group_flag |= PERF_EVENT_TXN;
1445 cpuc->n_txn = 0;
1446}
1447
1448/*
1449 * Stop group events scheduling transaction
1450 * Clear the flag and pmu::enable() will perform the
1451 * schedulability test.
1452 */
1453static void x86_pmu_cancel_txn(const struct pmu *pmu)
1454{
1455 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1456
1457 cpuc->group_flag &= ~PERF_EVENT_TXN;
1458 /*
1459 * Truncate the collected events.
1460 */
1461 cpuc->n_added -= cpuc->n_txn;
1462 cpuc->n_events -= cpuc->n_txn;
1463}
1464
1465/*
1466 * Commit group events scheduling transaction
1467 * Perform the group schedulability test as a whole
 1468 * Return 0 on success
1469 */
1470static int x86_pmu_commit_txn(const struct pmu *pmu)
1471{
1472 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1473 int assign[X86_PMC_IDX_MAX];
1474 int n, ret;
1475
1476 n = cpuc->n_events;
1477
1478 if (!x86_pmu_initialized())
1479 return -EAGAIN;
1480
1481 ret = x86_pmu.schedule_events(cpuc, n, assign);
1482 if (ret)
1483 return ret;
1484
1485 /*
1486 * copy new assignment, now we know it is possible
1487 * will be used by hw_perf_enable()
1488 */
1489 memcpy(cpuc->assign, assign, n*sizeof(int));
1490
1491 cpuc->group_flag &= ~PERF_EVENT_TXN;
1492
1493 return 0;
1494}
1495
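
These three hooks replace the open-coded hw_perf_group_sched_in() that this patch removes further down: the core layer now brackets group scheduling in a transaction and lets the PMU defer the schedulability test to commit time. The pattern a caller follows looks roughly like the sketch below; it is condensed from the shape of group_sched_in() in kernel/perf_event.c, and event_sched_in() here is a simplified stand-in for the real helper.

/* Sketch: how the generic layer drives the transaction hooks to schedule a
 * whole group atomically.  Error handling is condensed. */
static int sched_in_group(const struct pmu *pmu, struct perf_event *leader)
{
    struct perf_event *sub;

    pmu->start_txn(pmu);              /* defer the schedulability test */

    if (event_sched_in(leader))
        goto fail;
    list_for_each_entry(sub, &leader->sibling_list, group_entry) {
        if (event_sched_in(sub))
            goto fail;
    }

    if (!pmu->commit_txn(pmu))        /* test the whole group at once */
        return 0;
fail:
    /* undo whatever did get scheduled, then drop the collected events */
    pmu->cancel_txn(pmu);
    return -EAGAIN;
}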
1456static const struct pmu pmu = { 1496static const struct pmu pmu = {
1457 .enable = x86_pmu_enable, 1497 .enable = x86_pmu_enable,
1458 .disable = x86_pmu_disable, 1498 .disable = x86_pmu_disable,
@@ -1460,9 +1500,38 @@ static const struct pmu pmu = {
1460 .stop = x86_pmu_stop, 1500 .stop = x86_pmu_stop,
1461 .read = x86_pmu_read, 1501 .read = x86_pmu_read,
1462 .unthrottle = x86_pmu_unthrottle, 1502 .unthrottle = x86_pmu_unthrottle,
1503 .start_txn = x86_pmu_start_txn,
1504 .cancel_txn = x86_pmu_cancel_txn,
1505 .commit_txn = x86_pmu_commit_txn,
1463}; 1506};
1464 1507
1465/* 1508/*
1509 * validate that we can schedule this event
1510 */
1511static int validate_event(struct perf_event *event)
1512{
1513 struct cpu_hw_events *fake_cpuc;
1514 struct event_constraint *c;
1515 int ret = 0;
1516
1517 fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO);
1518 if (!fake_cpuc)
1519 return -ENOMEM;
1520
1521 c = x86_pmu.get_event_constraints(fake_cpuc, event);
1522
1523 if (!c || !c->weight)
1524 ret = -ENOSPC;
1525
1526 if (x86_pmu.put_event_constraints)
1527 x86_pmu.put_event_constraints(fake_cpuc, event);
1528
1529 kfree(fake_cpuc);
1530
1531 return ret;
1532}
1533
1534/*
1466 * validate a single event group 1535 * validate a single event group
1467 * 1536 *
1468 * validation include: 1537 * validation include:
@@ -1502,7 +1571,7 @@ static int validate_group(struct perf_event *event)
1502 1571
1503 fake_cpuc->n_events = n; 1572 fake_cpuc->n_events = n;
1504 1573
1505 ret = x86_schedule_events(fake_cpuc, n, NULL); 1574 ret = x86_pmu.schedule_events(fake_cpuc, n, NULL);
1506 1575
1507out_free: 1576out_free:
1508 kfree(fake_cpuc); 1577 kfree(fake_cpuc);
@@ -1527,6 +1596,8 @@ const struct pmu *hw_perf_event_init(struct perf_event *event)
1527 1596
1528 if (event->group_leader != event) 1597 if (event->group_leader != event)
1529 err = validate_group(event); 1598 err = validate_group(event);
1599 else
1600 err = validate_event(event);
1530 1601
1531 event->pmu = tmp; 1602 event->pmu = tmp;
1532 } 1603 }
@@ -1574,8 +1645,7 @@ static void backtrace_address(void *data, unsigned long addr, int reliable)
1574{ 1645{
1575 struct perf_callchain_entry *entry = data; 1646 struct perf_callchain_entry *entry = data;
1576 1647
1577 if (reliable) 1648 callchain_store(entry, addr);
1578 callchain_store(entry, addr);
1579} 1649}
1580 1650
1581static const struct stacktrace_ops backtrace_ops = { 1651static const struct stacktrace_ops backtrace_ops = {
@@ -1586,8 +1656,6 @@ static const struct stacktrace_ops backtrace_ops = {
1586 .walk_stack = print_context_stack_bp, 1656 .walk_stack = print_context_stack_bp,
1587}; 1657};
1588 1658
1589#include "../dumpstack.h"
1590
1591static void 1659static void
1592perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) 1660perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
1593{ 1661{
@@ -1597,41 +1665,6 @@ perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
1597 dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry); 1665 dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry);
1598} 1666}
1599 1667
1600/*
1601 * best effort, GUP based copy_from_user() that assumes IRQ or NMI context
1602 */
1603static unsigned long
1604copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
1605{
1606 unsigned long offset, addr = (unsigned long)from;
1607 int type = in_nmi() ? KM_NMI : KM_IRQ0;
1608 unsigned long size, len = 0;
1609 struct page *page;
1610 void *map;
1611 int ret;
1612
1613 do {
1614 ret = __get_user_pages_fast(addr, 1, 0, &page);
1615 if (!ret)
1616 break;
1617
1618 offset = addr & (PAGE_SIZE - 1);
1619 size = min(PAGE_SIZE - offset, n - len);
1620
1621 map = kmap_atomic(page, type);
1622 memcpy(to, map+offset, size);
1623 kunmap_atomic(map, type);
1624 put_page(page);
1625
1626 len += size;
1627 to += size;
1628 addr += size;
1629
1630 } while (len < n);
1631
1632 return len;
1633}
1634
1635#ifdef CONFIG_COMPAT 1668#ifdef CONFIG_COMPAT
1636static inline int 1669static inline int
1637perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) 1670perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
@@ -1727,6 +1760,11 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
1727{ 1760{
1728 struct perf_callchain_entry *entry; 1761 struct perf_callchain_entry *entry;
1729 1762
1763 if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
1764 /* TODO: We don't support guest os callchain now */
1765 return NULL;
1766 }
1767
1730 if (in_nmi()) 1768 if (in_nmi())
1731 entry = &__get_cpu_var(pmc_nmi_entry); 1769 entry = &__get_cpu_var(pmc_nmi_entry);
1732 else 1770 else
@@ -1739,14 +1777,36 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
1739 return entry; 1777 return entry;
1740} 1778}
1741 1779
1742void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip) 1780unsigned long perf_instruction_pointer(struct pt_regs *regs)
1743{ 1781{
1744 regs->ip = ip; 1782 unsigned long ip;
1745 /* 1783
1746 * perf_arch_fetch_caller_regs adds another call, we need to increment 1784 if (perf_guest_cbs && perf_guest_cbs->is_in_guest())
1747 * the skip level 1785 ip = perf_guest_cbs->get_guest_ip();
1748 */ 1786 else
1749 regs->bp = rewind_frame_pointer(skip + 1); 1787 ip = instruction_pointer(regs);
1750 regs->cs = __KERNEL_CS; 1788
1751 local_save_flags(regs->flags); 1789 return ip;
1790}
1791
1792unsigned long perf_misc_flags(struct pt_regs *regs)
1793{
1794 int misc = 0;
1795
1796 if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
1797 if (perf_guest_cbs->is_user_mode())
1798 misc |= PERF_RECORD_MISC_GUEST_USER;
1799 else
1800 misc |= PERF_RECORD_MISC_GUEST_KERNEL;
1801 } else {
1802 if (user_mode(regs))
1803 misc |= PERF_RECORD_MISC_USER;
1804 else
1805 misc |= PERF_RECORD_MISC_KERNEL;
1806 }
1807
1808 if (regs->flags & PERF_EFLAGS_EXACT)
1809 misc |= PERF_RECORD_MISC_EXACT_IP;
1810
1811 return misc;
1752} 1812}
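
perf_instruction_pointer() and perf_misc_flags() consult perf_guest_cbs so that samples taken while a vcpu is running are attributed to guest user/kernel context. The callbacks are registered by the hypervisor side through perf_register_guest_info_callbacks(); the sketch below shows the shape of such a provider with placeholder predicates (this_cpu_running_a_vcpu(), guest_cpl() and guest_rip() are stubs invented for the example), while KVM's real implementation lives in arch/x86/kvm/x86.c.

/* Sketch: provider side of perf_guest_cbs. */
#include <linux/init.h>
#include <linux/perf_event.h>

/* Placeholder predicates; a real hypervisor fills these in. */
static int this_cpu_running_a_vcpu(void) { return 0; }
static int guest_cpl(void)               { return 3; }
static unsigned long guest_rip(void)     { return 0; }

static int my_is_in_guest(void)
{
    return this_cpu_running_a_vcpu();
}

static int my_is_user_mode(void)
{
    return guest_cpl() == 3;          /* CPL 3 means guest user mode */
}

static unsigned long my_get_guest_ip(void)
{
    return guest_rip();
}

static struct perf_guest_info_callbacks my_guest_cbs = {
    .is_in_guest  = my_is_in_guest,
    .is_user_mode = my_is_user_mode,
    .get_guest_ip = my_get_guest_ip,
};

static int __init my_guest_perf_init(void)
{
    return perf_register_guest_info_callbacks(&my_guest_cbs);
}

static void __exit my_guest_perf_exit(void)
{
    perf_unregister_guest_info_callbacks(&my_guest_cbs);
}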
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index db6f7d4056e1..c2897b7b4a3b 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -2,7 +2,7 @@
2 2
3static DEFINE_RAW_SPINLOCK(amd_nb_lock); 3static DEFINE_RAW_SPINLOCK(amd_nb_lock);
4 4
5static __initconst u64 amd_hw_cache_event_ids 5static __initconst const u64 amd_hw_cache_event_ids
6 [PERF_COUNT_HW_CACHE_MAX] 6 [PERF_COUNT_HW_CACHE_MAX]
7 [PERF_COUNT_HW_CACHE_OP_MAX] 7 [PERF_COUNT_HW_CACHE_OP_MAX]
8 [PERF_COUNT_HW_CACHE_RESULT_MAX] = 8 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -102,8 +102,8 @@ static const u64 amd_perfmon_event_map[] =
102 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, 102 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
103 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080, 103 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080,
104 [PERF_COUNT_HW_CACHE_MISSES] = 0x0081, 104 [PERF_COUNT_HW_CACHE_MISSES] = 0x0081,
105 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, 105 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2,
106 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, 106 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3,
107}; 107};
108 108
109static u64 amd_pmu_event_map(int hw_event) 109static u64 amd_pmu_event_map(int hw_event)
@@ -111,22 +111,19 @@ static u64 amd_pmu_event_map(int hw_event)
111 return amd_perfmon_event_map[hw_event]; 111 return amd_perfmon_event_map[hw_event];
112} 112}
113 113
114static u64 amd_pmu_raw_event(u64 hw_event) 114static int amd_pmu_hw_config(struct perf_event *event)
115{ 115{
116#define K7_EVNTSEL_EVENT_MASK 0xF000000FFULL 116 int ret = x86_pmu_hw_config(event);
117#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL 117
118#define K7_EVNTSEL_EDGE_MASK 0x000040000ULL 118 if (ret)
119#define K7_EVNTSEL_INV_MASK 0x000800000ULL 119 return ret;
120#define K7_EVNTSEL_REG_MASK 0x0FF000000ULL 120
121 121 if (event->attr.type != PERF_TYPE_RAW)
122#define K7_EVNTSEL_MASK \ 122 return 0;
123 (K7_EVNTSEL_EVENT_MASK | \ 123
124 K7_EVNTSEL_UNIT_MASK | \ 124 event->hw.config |= event->attr.config & AMD64_RAW_EVENT_MASK;
125 K7_EVNTSEL_EDGE_MASK | \ 125
126 K7_EVNTSEL_INV_MASK | \ 126 return 0;
127 K7_EVNTSEL_REG_MASK)
128
129 return hw_event & K7_EVNTSEL_MASK;
130} 127}
131 128
132/* 129/*
@@ -165,7 +162,7 @@ static void amd_put_event_constraints(struct cpu_hw_events *cpuc,
165 * be removed on one CPU at a time AND PMU is disabled 162 * be removed on one CPU at a time AND PMU is disabled
166 * when we come here 163 * when we come here
167 */ 164 */
168 for (i = 0; i < x86_pmu.num_events; i++) { 165 for (i = 0; i < x86_pmu.num_counters; i++) {
169 if (nb->owners[i] == event) { 166 if (nb->owners[i] == event) {
170 cmpxchg(nb->owners+i, event, NULL); 167 cmpxchg(nb->owners+i, event, NULL);
171 break; 168 break;
@@ -215,7 +212,7 @@ amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
215 struct hw_perf_event *hwc = &event->hw; 212 struct hw_perf_event *hwc = &event->hw;
216 struct amd_nb *nb = cpuc->amd_nb; 213 struct amd_nb *nb = cpuc->amd_nb;
217 struct perf_event *old = NULL; 214 struct perf_event *old = NULL;
218 int max = x86_pmu.num_events; 215 int max = x86_pmu.num_counters;
219 int i, j, k = -1; 216 int i, j, k = -1;
220 217
221 /* 218 /*
@@ -293,7 +290,7 @@ static struct amd_nb *amd_alloc_nb(int cpu, int nb_id)
293 /* 290 /*
294 * initialize all possible NB constraints 291 * initialize all possible NB constraints
295 */ 292 */
296 for (i = 0; i < x86_pmu.num_events; i++) { 293 for (i = 0; i < x86_pmu.num_counters; i++) {
297 __set_bit(i, nb->event_constraints[i].idxmsk); 294 __set_bit(i, nb->event_constraints[i].idxmsk);
298 nb->event_constraints[i].weight = 1; 295 nb->event_constraints[i].weight = 1;
299 } 296 }
@@ -371,21 +368,22 @@ static void amd_pmu_cpu_dead(int cpu)
371 raw_spin_unlock(&amd_nb_lock); 368 raw_spin_unlock(&amd_nb_lock);
372} 369}
373 370
374static __initconst struct x86_pmu amd_pmu = { 371static __initconst const struct x86_pmu amd_pmu = {
375 .name = "AMD", 372 .name = "AMD",
376 .handle_irq = x86_pmu_handle_irq, 373 .handle_irq = x86_pmu_handle_irq,
377 .disable_all = x86_pmu_disable_all, 374 .disable_all = x86_pmu_disable_all,
378 .enable_all = x86_pmu_enable_all, 375 .enable_all = x86_pmu_enable_all,
379 .enable = x86_pmu_enable_event, 376 .enable = x86_pmu_enable_event,
380 .disable = x86_pmu_disable_event, 377 .disable = x86_pmu_disable_event,
378 .hw_config = amd_pmu_hw_config,
379 .schedule_events = x86_schedule_events,
381 .eventsel = MSR_K7_EVNTSEL0, 380 .eventsel = MSR_K7_EVNTSEL0,
382 .perfctr = MSR_K7_PERFCTR0, 381 .perfctr = MSR_K7_PERFCTR0,
383 .event_map = amd_pmu_event_map, 382 .event_map = amd_pmu_event_map,
384 .raw_event = amd_pmu_raw_event,
385 .max_events = ARRAY_SIZE(amd_perfmon_event_map), 383 .max_events = ARRAY_SIZE(amd_perfmon_event_map),
386 .num_events = 4, 384 .num_counters = 4,
387 .event_bits = 48, 385 .cntval_bits = 48,
388 .event_mask = (1ULL << 48) - 1, 386 .cntval_mask = (1ULL << 48) - 1,
389 .apic = 1, 387 .apic = 1,
390 /* use highest bit to detect overflow */ 388 /* use highest bit to detect overflow */
391 .max_period = (1ULL << 47) - 1, 389 .max_period = (1ULL << 47) - 1,
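The hunk above replaces amd_pmu_raw_event() and its K7_EVNTSEL_* masks with an amd_pmu_hw_config() callback that filters raw configs through AMD64_RAW_EVENT_MASK. As a rough illustration of the path this takes, the sketch below opens a PERF_TYPE_RAW event from user space; the raw encoding 0x0076 (assumed to be CPU_CLK_UNHALTED on AMD) and the perf_event_open() wrapper are assumptions for the example, not part of this commit.

/*
 * Illustrative sketch only: a PERF_TYPE_RAW event whose config the new
 * amd_pmu_hw_config() path would mask with AMD64_RAW_EVENT_MASK. The
 * event encoding 0x0076 and this wrapper are assumptions, not kernel code.
 */
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>

static long perf_event_open(struct perf_event_attr *attr, pid_t pid,
			    int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
	struct perf_event_attr attr;
	long fd;

	memset(&attr, 0, sizeof(attr));
	attr.size     = sizeof(attr);
	attr.type     = PERF_TYPE_RAW;	/* raw events are routed to .hw_config */
	attr.config   = 0x0076;		/* assumed: CPU_CLK_UNHALTED (event 0x76) */
	attr.disabled = 1;

	fd = perf_event_open(&attr, 0, -1, -1, 0);	/* current task, any CPU */
	if (fd < 0)
		perror("perf_event_open");
	else
		close(fd);
	return 0;
}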
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 9c794ac87837..ee05c90012d2 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -72,6 +72,7 @@ static struct event_constraint intel_westmere_event_constraints[] =
72 INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */ 72 INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
73 INTEL_EVENT_CONSTRAINT(0x60, 0x1), /* OFFCORE_REQUESTS_OUTSTANDING */ 73 INTEL_EVENT_CONSTRAINT(0x60, 0x1), /* OFFCORE_REQUESTS_OUTSTANDING */
74 INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */ 74 INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */
75 INTEL_EVENT_CONSTRAINT(0xb3, 0x1), /* SNOOPQ_REQUEST_OUTSTANDING */
75 EVENT_CONSTRAINT_END 76 EVENT_CONSTRAINT_END
76}; 77};
77 78
@@ -88,7 +89,7 @@ static u64 intel_pmu_event_map(int hw_event)
88 return intel_perfmon_event_map[hw_event]; 89 return intel_perfmon_event_map[hw_event];
89} 90}
90 91
91static __initconst u64 westmere_hw_cache_event_ids 92static __initconst const u64 westmere_hw_cache_event_ids
92 [PERF_COUNT_HW_CACHE_MAX] 93 [PERF_COUNT_HW_CACHE_MAX]
93 [PERF_COUNT_HW_CACHE_OP_MAX] 94 [PERF_COUNT_HW_CACHE_OP_MAX]
94 [PERF_COUNT_HW_CACHE_RESULT_MAX] = 95 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -179,7 +180,7 @@ static __initconst u64 westmere_hw_cache_event_ids
179 }, 180 },
180}; 181};
181 182
182static __initconst u64 nehalem_hw_cache_event_ids 183static __initconst const u64 nehalem_hw_cache_event_ids
183 [PERF_COUNT_HW_CACHE_MAX] 184 [PERF_COUNT_HW_CACHE_MAX]
184 [PERF_COUNT_HW_CACHE_OP_MAX] 185 [PERF_COUNT_HW_CACHE_OP_MAX]
185 [PERF_COUNT_HW_CACHE_RESULT_MAX] = 186 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -270,7 +271,7 @@ static __initconst u64 nehalem_hw_cache_event_ids
270 }, 271 },
271}; 272};
272 273
273static __initconst u64 core2_hw_cache_event_ids 274static __initconst const u64 core2_hw_cache_event_ids
274 [PERF_COUNT_HW_CACHE_MAX] 275 [PERF_COUNT_HW_CACHE_MAX]
275 [PERF_COUNT_HW_CACHE_OP_MAX] 276 [PERF_COUNT_HW_CACHE_OP_MAX]
276 [PERF_COUNT_HW_CACHE_RESULT_MAX] = 277 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -361,7 +362,7 @@ static __initconst u64 core2_hw_cache_event_ids
361 }, 362 },
362}; 363};
363 364
364static __initconst u64 atom_hw_cache_event_ids 365static __initconst const u64 atom_hw_cache_event_ids
365 [PERF_COUNT_HW_CACHE_MAX] 366 [PERF_COUNT_HW_CACHE_MAX]
366 [PERF_COUNT_HW_CACHE_OP_MAX] 367 [PERF_COUNT_HW_CACHE_OP_MAX]
367 [PERF_COUNT_HW_CACHE_RESULT_MAX] = 368 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -452,60 +453,6 @@ static __initconst u64 atom_hw_cache_event_ids
452 }, 453 },
453}; 454};
454 455
455static u64 intel_pmu_raw_event(u64 hw_event)
456{
457#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL
458#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL
459#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL
460#define CORE_EVNTSEL_INV_MASK 0x00800000ULL
461#define CORE_EVNTSEL_REG_MASK 0xFF000000ULL
462
463#define CORE_EVNTSEL_MASK \
464 (INTEL_ARCH_EVTSEL_MASK | \
465 INTEL_ARCH_UNIT_MASK | \
466 INTEL_ARCH_EDGE_MASK | \
467 INTEL_ARCH_INV_MASK | \
468 INTEL_ARCH_CNT_MASK)
469
470 return hw_event & CORE_EVNTSEL_MASK;
471}
472
473static void intel_pmu_enable_bts(u64 config)
474{
475 unsigned long debugctlmsr;
476
477 debugctlmsr = get_debugctlmsr();
478
479 debugctlmsr |= X86_DEBUGCTL_TR;
480 debugctlmsr |= X86_DEBUGCTL_BTS;
481 debugctlmsr |= X86_DEBUGCTL_BTINT;
482
483 if (!(config & ARCH_PERFMON_EVENTSEL_OS))
484 debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS;
485
486 if (!(config & ARCH_PERFMON_EVENTSEL_USR))
487 debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR;
488
489 update_debugctlmsr(debugctlmsr);
490}
491
492static void intel_pmu_disable_bts(void)
493{
494 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
495 unsigned long debugctlmsr;
496
497 if (!cpuc->ds)
498 return;
499
500 debugctlmsr = get_debugctlmsr();
501
502 debugctlmsr &=
503 ~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT |
504 X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR);
505
506 update_debugctlmsr(debugctlmsr);
507}
508
509static void intel_pmu_disable_all(void) 456static void intel_pmu_disable_all(void)
510{ 457{
511 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 458 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
@@ -514,12 +461,17 @@ static void intel_pmu_disable_all(void)
514 461
515 if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) 462 if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask))
516 intel_pmu_disable_bts(); 463 intel_pmu_disable_bts();
464
465 intel_pmu_pebs_disable_all();
466 intel_pmu_lbr_disable_all();
517} 467}
518 468
519static void intel_pmu_enable_all(void) 469static void intel_pmu_enable_all(int added)
520{ 470{
521 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 471 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
522 472
473 intel_pmu_pebs_enable_all();
474 intel_pmu_lbr_enable_all();
523 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); 475 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
524 476
525 if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { 477 if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
@@ -533,6 +485,87 @@ static void intel_pmu_enable_all(void)
533 } 485 }
534} 486}
535 487
488/*
489 * Workaround for:
490 * Intel Errata AAK100 (model 26)
491 * Intel Errata AAP53 (model 30)
492 * Intel Errata BD53 (model 44)
493 *
494 * The official story:
495 * These chips need to be 'reset' when adding counters by programming the
496 * magic three (non-counting) events 0x4300B5, 0x4300D2, and 0x4300B1 either
497 * in sequence on the same PMC or on different PMCs.
498 *
499 * In practice it appears some of these events do in fact count, and
500 * we need to program all 4 events.
501 */
502static void intel_pmu_nhm_workaround(void)
503{
504 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
505 static const unsigned long nhm_magic[4] = {
506 0x4300B5,
507 0x4300D2,
508 0x4300B1,
509 0x4300B1
510 };
511 struct perf_event *event;
512 int i;
513
514 /*
515	 * The errata requires the following steps:
516	 * 1) Clear MSR_IA32_PEBS_ENABLE and MSR_CORE_PERF_GLOBAL_CTRL;
517	 * 2) Configure 4 PERFEVTSELx with the magic events and clear
518	 *    the corresponding PMCx;
519	 * 3) Set bits 0-3 of MSR_CORE_PERF_GLOBAL_CTRL;
520	 * 4) Clear MSR_CORE_PERF_GLOBAL_CTRL;
521	 * 5) Clear 4 pairs of PERFEVTSELx and PMCx;
522 */
523
524 /*
525 * The real steps we choose are a little different from above.
526 * A) To reduce MSR operations, we don't run step 1) as they
527 * are already cleared before this function is called;
528 * B) Call x86_perf_event_update to save PMCx before configuring
529 * PERFEVTSELx with magic number;
530	 * C) For step 5), we clear a PERFEVTSELx only when it is
531	 *    not currently in use.
532 * D) Call x86_perf_event_set_period to restore PMCx;
533 */
534
535 /* We always operate 4 pairs of PERF Counters */
536 for (i = 0; i < 4; i++) {
537 event = cpuc->events[i];
538 if (event)
539 x86_perf_event_update(event);
540 }
541
542 for (i = 0; i < 4; i++) {
543 wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, nhm_magic[i]);
544 wrmsrl(MSR_ARCH_PERFMON_PERFCTR0 + i, 0x0);
545 }
546
547 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0xf);
548 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x0);
549
550 for (i = 0; i < 4; i++) {
551 event = cpuc->events[i];
552
553 if (event) {
554 x86_perf_event_set_period(event);
555 __x86_pmu_enable_event(&event->hw,
556 ARCH_PERFMON_EVENTSEL_ENABLE);
557 } else
558 wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, 0x0);
559 }
560}
561
562static void intel_pmu_nhm_enable_all(int added)
563{
564 if (added)
565 intel_pmu_nhm_workaround();
566 intel_pmu_enable_all(added);
567}
568
536static inline u64 intel_pmu_get_status(void) 569static inline u64 intel_pmu_get_status(void)
537{ 570{
538 u64 status; 571 u64 status;
@@ -547,8 +580,7 @@ static inline void intel_pmu_ack_status(u64 ack)
547 wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); 580 wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
548} 581}
549 582
550static inline void 583static void intel_pmu_disable_fixed(struct hw_perf_event *hwc)
551intel_pmu_disable_fixed(struct hw_perf_event *hwc)
552{ 584{
553 int idx = hwc->idx - X86_PMC_IDX_FIXED; 585 int idx = hwc->idx - X86_PMC_IDX_FIXED;
554 u64 ctrl_val, mask; 586 u64 ctrl_val, mask;
@@ -557,71 +589,10 @@ intel_pmu_disable_fixed(struct hw_perf_event *hwc)
557 589
558 rdmsrl(hwc->config_base, ctrl_val); 590 rdmsrl(hwc->config_base, ctrl_val);
559 ctrl_val &= ~mask; 591 ctrl_val &= ~mask;
560 (void)checking_wrmsrl(hwc->config_base, ctrl_val); 592 wrmsrl(hwc->config_base, ctrl_val);
561}
562
563static void intel_pmu_drain_bts_buffer(void)
564{
565 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
566 struct debug_store *ds = cpuc->ds;
567 struct bts_record {
568 u64 from;
569 u64 to;
570 u64 flags;
571 };
572 struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS];
573 struct bts_record *at, *top;
574 struct perf_output_handle handle;
575 struct perf_event_header header;
576 struct perf_sample_data data;
577 struct pt_regs regs;
578
579 if (!event)
580 return;
581
582 if (!ds)
583 return;
584
585 at = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
586 top = (struct bts_record *)(unsigned long)ds->bts_index;
587
588 if (top <= at)
589 return;
590
591 ds->bts_index = ds->bts_buffer_base;
592
593 perf_sample_data_init(&data, 0);
594
595 data.period = event->hw.last_period;
596 regs.ip = 0;
597
598 /*
599 * Prepare a generic sample, i.e. fill in the invariant fields.
600 * We will overwrite the from and to address before we output
601 * the sample.
602 */
603 perf_prepare_sample(&header, &data, event, &regs);
604
605 if (perf_output_begin(&handle, event,
606 header.size * (top - at), 1, 1))
607 return;
608
609 for (; at < top; at++) {
610 data.ip = at->from;
611 data.addr = at->to;
612
613 perf_output_sample(&handle, &header, &data, event);
614 }
615
616 perf_output_end(&handle);
617
618 /* There's new data available. */
619 event->hw.interrupts++;
620 event->pending_kill = POLL_IN;
621} 593}
622 594
623static inline void 595static void intel_pmu_disable_event(struct perf_event *event)
624intel_pmu_disable_event(struct perf_event *event)
625{ 596{
626 struct hw_perf_event *hwc = &event->hw; 597 struct hw_perf_event *hwc = &event->hw;
627 598
@@ -637,14 +608,15 @@ intel_pmu_disable_event(struct perf_event *event)
637 } 608 }
638 609
639 x86_pmu_disable_event(event); 610 x86_pmu_disable_event(event);
611
612 if (unlikely(event->attr.precise_ip))
613 intel_pmu_pebs_disable(event);
640} 614}
641 615
642static inline void 616static void intel_pmu_enable_fixed(struct hw_perf_event *hwc)
643intel_pmu_enable_fixed(struct hw_perf_event *hwc)
644{ 617{
645 int idx = hwc->idx - X86_PMC_IDX_FIXED; 618 int idx = hwc->idx - X86_PMC_IDX_FIXED;
646 u64 ctrl_val, bits, mask; 619 u64 ctrl_val, bits, mask;
647 int err;
648 620
649 /* 621 /*
650 * Enable IRQ generation (0x8), 622 * Enable IRQ generation (0x8),
@@ -669,7 +641,7 @@ intel_pmu_enable_fixed(struct hw_perf_event *hwc)
669 rdmsrl(hwc->config_base, ctrl_val); 641 rdmsrl(hwc->config_base, ctrl_val);
670 ctrl_val &= ~mask; 642 ctrl_val &= ~mask;
671 ctrl_val |= bits; 643 ctrl_val |= bits;
672 err = checking_wrmsrl(hwc->config_base, ctrl_val); 644 wrmsrl(hwc->config_base, ctrl_val);
673} 645}
674 646
675static void intel_pmu_enable_event(struct perf_event *event) 647static void intel_pmu_enable_event(struct perf_event *event)
@@ -689,7 +661,10 @@ static void intel_pmu_enable_event(struct perf_event *event)
689 return; 661 return;
690 } 662 }
691 663
692 __x86_pmu_enable_event(hwc); 664 if (unlikely(event->attr.precise_ip))
665 intel_pmu_pebs_enable(event);
666
667 __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
693} 668}
694 669
695/* 670/*
@@ -708,20 +683,20 @@ static void intel_pmu_reset(void)
708 unsigned long flags; 683 unsigned long flags;
709 int idx; 684 int idx;
710 685
711 if (!x86_pmu.num_events) 686 if (!x86_pmu.num_counters)
712 return; 687 return;
713 688
714 local_irq_save(flags); 689 local_irq_save(flags);
715 690
716 printk("clearing PMU state on CPU#%d\n", smp_processor_id()); 691 printk("clearing PMU state on CPU#%d\n", smp_processor_id());
717 692
718 for (idx = 0; idx < x86_pmu.num_events; idx++) { 693 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
719 checking_wrmsrl(x86_pmu.eventsel + idx, 0ull); 694 checking_wrmsrl(x86_pmu.eventsel + idx, 0ull);
720 checking_wrmsrl(x86_pmu.perfctr + idx, 0ull); 695 checking_wrmsrl(x86_pmu.perfctr + idx, 0ull);
721 } 696 }
722 for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) { 697 for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++)
723 checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); 698 checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
724 } 699
725 if (ds) 700 if (ds)
726 ds->bts_index = ds->bts_buffer_base; 701 ds->bts_index = ds->bts_buffer_base;
727 702
@@ -737,7 +712,8 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
737 struct perf_sample_data data; 712 struct perf_sample_data data;
738 struct cpu_hw_events *cpuc; 713 struct cpu_hw_events *cpuc;
739 int bit, loops; 714 int bit, loops;
740 u64 ack, status; 715 u64 status;
716 int handled = 0;
741 717
742 perf_sample_data_init(&data, 0); 718 perf_sample_data_init(&data, 0);
743 719
@@ -747,12 +723,13 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
747 intel_pmu_drain_bts_buffer(); 723 intel_pmu_drain_bts_buffer();
748 status = intel_pmu_get_status(); 724 status = intel_pmu_get_status();
749 if (!status) { 725 if (!status) {
750 intel_pmu_enable_all(); 726 intel_pmu_enable_all(0);
751 return 0; 727 return 0;
752 } 728 }
753 729
754 loops = 0; 730 loops = 0;
755again: 731again:
732 intel_pmu_ack_status(status);
756 if (++loops > 100) { 733 if (++loops > 100) {
757 WARN_ONCE(1, "perfevents: irq loop stuck!\n"); 734 WARN_ONCE(1, "perfevents: irq loop stuck!\n");
758 perf_event_print_debug(); 735 perf_event_print_debug();
@@ -761,10 +738,22 @@ again:
761 } 738 }
762 739
763 inc_irq_stat(apic_perf_irqs); 740 inc_irq_stat(apic_perf_irqs);
764 ack = status; 741
742 intel_pmu_lbr_read();
743
744 /*
745 * PEBS overflow sets bit 62 in the global status register
746 */
747 if (__test_and_clear_bit(62, (unsigned long *)&status)) {
748 handled++;
749 x86_pmu.drain_pebs(regs);
750 }
751
765 for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { 752 for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
766 struct perf_event *event = cpuc->events[bit]; 753 struct perf_event *event = cpuc->events[bit];
767 754
755 handled++;
756
768 if (!test_bit(bit, cpuc->active_mask)) 757 if (!test_bit(bit, cpuc->active_mask))
769 continue; 758 continue;
770 759
@@ -777,8 +766,6 @@ again:
777 x86_pmu_stop(event); 766 x86_pmu_stop(event);
778 } 767 }
779 768
780 intel_pmu_ack_status(ack);
781
782 /* 769 /*
783 * Repeat if there is more work to be done: 770 * Repeat if there is more work to be done:
784 */ 771 */
@@ -787,26 +774,22 @@ again:
787 goto again; 774 goto again;
788 775
789done: 776done:
790 intel_pmu_enable_all(); 777 intel_pmu_enable_all(0);
791 return 1; 778 return handled;
792} 779}
793 780
794static struct event_constraint bts_constraint =
795 EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_FIXED_BTS, 0);
796
797static struct event_constraint * 781static struct event_constraint *
798intel_special_constraints(struct perf_event *event) 782intel_bts_constraints(struct perf_event *event)
799{ 783{
800 unsigned int hw_event; 784 struct hw_perf_event *hwc = &event->hw;
801 785 unsigned int hw_event, bts_event;
802 hw_event = event->hw.config & INTEL_ARCH_EVENT_MASK;
803 786
804 if (unlikely((hw_event == 787 hw_event = hwc->config & INTEL_ARCH_EVENT_MASK;
805 x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) && 788 bts_event = x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS);
806 (event->hw.sample_period == 1))) {
807 789
790 if (unlikely(hw_event == bts_event && hwc->sample_period == 1))
808 return &bts_constraint; 791 return &bts_constraint;
809 } 792
810 return NULL; 793 return NULL;
811} 794}
812 795
@@ -815,24 +798,53 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event
815{ 798{
816 struct event_constraint *c; 799 struct event_constraint *c;
817 800
818 c = intel_special_constraints(event); 801 c = intel_bts_constraints(event);
802 if (c)
803 return c;
804
805 c = intel_pebs_constraints(event);
819 if (c) 806 if (c)
820 return c; 807 return c;
821 808
822 return x86_get_event_constraints(cpuc, event); 809 return x86_get_event_constraints(cpuc, event);
823} 810}
824 811
825static __initconst struct x86_pmu core_pmu = { 812static int intel_pmu_hw_config(struct perf_event *event)
813{
814 int ret = x86_pmu_hw_config(event);
815
816 if (ret)
817 return ret;
818
819 if (event->attr.type != PERF_TYPE_RAW)
820 return 0;
821
822 if (!(event->attr.config & ARCH_PERFMON_EVENTSEL_ANY))
823 return 0;
824
825 if (x86_pmu.version < 3)
826 return -EINVAL;
827
828 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
829 return -EACCES;
830
831 event->hw.config |= ARCH_PERFMON_EVENTSEL_ANY;
832
833 return 0;
834}
835
836static __initconst const struct x86_pmu core_pmu = {
826 .name = "core", 837 .name = "core",
827 .handle_irq = x86_pmu_handle_irq, 838 .handle_irq = x86_pmu_handle_irq,
828 .disable_all = x86_pmu_disable_all, 839 .disable_all = x86_pmu_disable_all,
829 .enable_all = x86_pmu_enable_all, 840 .enable_all = x86_pmu_enable_all,
830 .enable = x86_pmu_enable_event, 841 .enable = x86_pmu_enable_event,
831 .disable = x86_pmu_disable_event, 842 .disable = x86_pmu_disable_event,
843 .hw_config = x86_pmu_hw_config,
844 .schedule_events = x86_schedule_events,
832 .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, 845 .eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
833 .perfctr = MSR_ARCH_PERFMON_PERFCTR0, 846 .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
834 .event_map = intel_pmu_event_map, 847 .event_map = intel_pmu_event_map,
835 .raw_event = intel_pmu_raw_event,
836 .max_events = ARRAY_SIZE(intel_perfmon_event_map), 848 .max_events = ARRAY_SIZE(intel_perfmon_event_map),
837 .apic = 1, 849 .apic = 1,
838 /* 850 /*
@@ -845,17 +857,32 @@ static __initconst struct x86_pmu core_pmu = {
845 .event_constraints = intel_core_event_constraints, 857 .event_constraints = intel_core_event_constraints,
846}; 858};
847 859
848static __initconst struct x86_pmu intel_pmu = { 860static void intel_pmu_cpu_starting(int cpu)
861{
862 init_debug_store_on_cpu(cpu);
863 /*
864 * Deal with CPUs that don't clear their LBRs on power-up.
865 */
866 intel_pmu_lbr_reset();
867}
868
869static void intel_pmu_cpu_dying(int cpu)
870{
871 fini_debug_store_on_cpu(cpu);
872}
873
874static __initconst const struct x86_pmu intel_pmu = {
849 .name = "Intel", 875 .name = "Intel",
850 .handle_irq = intel_pmu_handle_irq, 876 .handle_irq = intel_pmu_handle_irq,
851 .disable_all = intel_pmu_disable_all, 877 .disable_all = intel_pmu_disable_all,
852 .enable_all = intel_pmu_enable_all, 878 .enable_all = intel_pmu_enable_all,
853 .enable = intel_pmu_enable_event, 879 .enable = intel_pmu_enable_event,
854 .disable = intel_pmu_disable_event, 880 .disable = intel_pmu_disable_event,
881 .hw_config = intel_pmu_hw_config,
882 .schedule_events = x86_schedule_events,
855 .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, 883 .eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
856 .perfctr = MSR_ARCH_PERFMON_PERFCTR0, 884 .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
857 .event_map = intel_pmu_event_map, 885 .event_map = intel_pmu_event_map,
858 .raw_event = intel_pmu_raw_event,
859 .max_events = ARRAY_SIZE(intel_perfmon_event_map), 886 .max_events = ARRAY_SIZE(intel_perfmon_event_map),
860 .apic = 1, 887 .apic = 1,
861 /* 888 /*
@@ -864,14 +891,38 @@ static __initconst struct x86_pmu intel_pmu = {
864 * the generic event period: 891 * the generic event period:
865 */ 892 */
866 .max_period = (1ULL << 31) - 1, 893 .max_period = (1ULL << 31) - 1,
867 .enable_bts = intel_pmu_enable_bts,
868 .disable_bts = intel_pmu_disable_bts,
869 .get_event_constraints = intel_get_event_constraints, 894 .get_event_constraints = intel_get_event_constraints,
870 895
871 .cpu_starting = init_debug_store_on_cpu, 896 .cpu_starting = intel_pmu_cpu_starting,
872 .cpu_dying = fini_debug_store_on_cpu, 897 .cpu_dying = intel_pmu_cpu_dying,
873}; 898};
874 899
900static void intel_clovertown_quirks(void)
901{
902 /*
903 * PEBS is unreliable due to:
904 *
905 * AJ67 - PEBS may experience CPL leaks
906 * AJ68 - PEBS PMI may be delayed by one event
907 * AJ69 - GLOBAL_STATUS[62] will only be set when DEBUGCTL[12]
908 * AJ106 - FREEZE_LBRS_ON_PMI doesn't work in combination with PEBS
909 *
910 * AJ67 could be worked around by restricting the OS/USR flags.
911 * AJ69 could be worked around by setting PMU_FREEZE_ON_PMI.
912 *
913 * AJ106 could possibly be worked around by not allowing LBR
914 * usage from PEBS, including the fixup.
915 * AJ68 could possibly be worked around by always programming
916 * a pebs_event_reset[0] value and coping with the lost events.
917 *
918 * But taken together it might just make sense to not enable PEBS on
919 * these chips.
920 */
921 printk(KERN_WARNING "PEBS disabled due to CPU errata.\n");
922 x86_pmu.pebs = 0;
923 x86_pmu.pebs_constraints = NULL;
924}
925
875static __init int intel_pmu_init(void) 926static __init int intel_pmu_init(void)
876{ 927{
877 union cpuid10_edx edx; 928 union cpuid10_edx edx;
@@ -881,12 +932,13 @@ static __init int intel_pmu_init(void)
881 int version; 932 int version;
882 933
883 if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { 934 if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
884 /* check for P6 processor family */ 935 switch (boot_cpu_data.x86) {
885 if (boot_cpu_data.x86 == 6) { 936 case 0x6:
886 return p6_pmu_init(); 937 return p6_pmu_init();
887 } else { 938 case 0xf:
939 return p4_pmu_init();
940 }
888 return -ENODEV; 941 return -ENODEV;
889 }
890 } 942 }
891 943
892 /* 944 /*
@@ -904,16 +956,28 @@ static __init int intel_pmu_init(void)
904 x86_pmu = intel_pmu; 956 x86_pmu = intel_pmu;
905 957
906 x86_pmu.version = version; 958 x86_pmu.version = version;
907 x86_pmu.num_events = eax.split.num_events; 959 x86_pmu.num_counters = eax.split.num_counters;
908 x86_pmu.event_bits = eax.split.bit_width; 960 x86_pmu.cntval_bits = eax.split.bit_width;
909 x86_pmu.event_mask = (1ULL << eax.split.bit_width) - 1; 961 x86_pmu.cntval_mask = (1ULL << eax.split.bit_width) - 1;
910 962
911 /* 963 /*
912 * Quirk: v2 perfmon does not report fixed-purpose events, so 964 * Quirk: v2 perfmon does not report fixed-purpose events, so
913 * assume at least 3 events: 965 * assume at least 3 events:
914 */ 966 */
915 if (version > 1) 967 if (version > 1)
916 x86_pmu.num_events_fixed = max((int)edx.split.num_events_fixed, 3); 968 x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3);
969
970 /*
971 * v2 and above have a perf capabilities MSR
972 */
973 if (version > 1) {
974 u64 capabilities;
975
976 rdmsrl(MSR_IA32_PERF_CAPABILITIES, capabilities);
977 x86_pmu.intel_cap.capabilities = capabilities;
978 }
979
980 intel_ds_init();
917 981
918 /* 982 /*
919 * Install the hw-cache-events table: 983 * Install the hw-cache-events table:
@@ -924,12 +988,15 @@ static __init int intel_pmu_init(void)
924 break; 988 break;
925 989
926 case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */ 990 case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
991 x86_pmu.quirks = intel_clovertown_quirks;
927 case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */ 992 case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
928 case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */ 993 case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
929 case 29: /* six-core 45 nm xeon "Dunnington" */ 994 case 29: /* six-core 45 nm xeon "Dunnington" */
930 memcpy(hw_cache_event_ids, core2_hw_cache_event_ids, 995 memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
931 sizeof(hw_cache_event_ids)); 996 sizeof(hw_cache_event_ids));
932 997
998 intel_pmu_lbr_init_core();
999
933 x86_pmu.event_constraints = intel_core2_event_constraints; 1000 x86_pmu.event_constraints = intel_core2_event_constraints;
934 pr_cont("Core2 events, "); 1001 pr_cont("Core2 events, ");
935 break; 1002 break;
@@ -940,13 +1007,19 @@ static __init int intel_pmu_init(void)
940 memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, 1007 memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
941 sizeof(hw_cache_event_ids)); 1008 sizeof(hw_cache_event_ids));
942 1009
1010 intel_pmu_lbr_init_nhm();
1011
943 x86_pmu.event_constraints = intel_nehalem_event_constraints; 1012 x86_pmu.event_constraints = intel_nehalem_event_constraints;
944 pr_cont("Nehalem/Corei7 events, "); 1013 x86_pmu.enable_all = intel_pmu_nhm_enable_all;
1014 pr_cont("Nehalem events, ");
945 break; 1015 break;
1016
946 case 28: /* Atom */ 1017 case 28: /* Atom */
947 memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, 1018 memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
948 sizeof(hw_cache_event_ids)); 1019 sizeof(hw_cache_event_ids));
949 1020
1021 intel_pmu_lbr_init_atom();
1022
950 x86_pmu.event_constraints = intel_gen_event_constraints; 1023 x86_pmu.event_constraints = intel_gen_event_constraints;
951 pr_cont("Atom events, "); 1024 pr_cont("Atom events, ");
952 break; 1025 break;
@@ -956,7 +1029,10 @@ static __init int intel_pmu_init(void)
956 memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids, 1029 memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids,
957 sizeof(hw_cache_event_ids)); 1030 sizeof(hw_cache_event_ids));
958 1031
1032 intel_pmu_lbr_init_nhm();
1033
959 x86_pmu.event_constraints = intel_westmere_event_constraints; 1034 x86_pmu.event_constraints = intel_westmere_event_constraints;
1035 x86_pmu.enable_all = intel_pmu_nhm_enable_all;
960 pr_cont("Westmere events, "); 1036 pr_cont("Westmere events, ");
961 break; 1037 break;
962 1038
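The Intel changes above route event->attr.precise_ip into the new PEBS enable/disable hooks and the constraint lookup. A minimal user-space sketch of how that attribute would be requested is shown below; the cycles event, the sample period, and precise_ip = 2 are illustrative assumptions, roughly what 'perf record -e cycles:pp' asks for.

/*
 * Illustrative sketch only: requesting a precise (PEBS) sample with the
 * precise_ip attribute wired up by this diff. precise_ip >= 2 is what makes
 * the kernel attempt the LBR-based fixup for an exact IP. Values assumed.
 */
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	struct perf_event_attr attr = {
		.size          = sizeof(attr),
		.type          = PERF_TYPE_HARDWARE,
		.config        = PERF_COUNT_HW_CPU_CYCLES,
		.sample_period = 100000,
		.sample_type   = PERF_SAMPLE_IP,
		.precise_ip    = 2,	/* 0 = arbitrary skid ... 3 = must be exact */
		.disabled      = 1,
	};
	long fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);

	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}
	close(fd);
	return 0;
}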
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
new file mode 100644
index 000000000000..18018d1311cd
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -0,0 +1,641 @@
1#ifdef CONFIG_CPU_SUP_INTEL
2
3/* The maximal number of PEBS events: */
4#define MAX_PEBS_EVENTS 4
5
6/* The size of a BTS record in bytes: */
7#define BTS_RECORD_SIZE 24
8
9#define BTS_BUFFER_SIZE (PAGE_SIZE << 4)
10#define PEBS_BUFFER_SIZE PAGE_SIZE
11
12/*
13 * pebs_record_32 for p4 and core not supported
14
15struct pebs_record_32 {
16 u32 flags, ip;
17 u32 ax, bc, cx, dx;
18 u32 si, di, bp, sp;
19};
20
21 */
22
23struct pebs_record_core {
24 u64 flags, ip;
25 u64 ax, bx, cx, dx;
26 u64 si, di, bp, sp;
27 u64 r8, r9, r10, r11;
28 u64 r12, r13, r14, r15;
29};
30
31struct pebs_record_nhm {
32 u64 flags, ip;
33 u64 ax, bx, cx, dx;
34 u64 si, di, bp, sp;
35 u64 r8, r9, r10, r11;
36 u64 r12, r13, r14, r15;
37 u64 status, dla, dse, lat;
38};
39
40/*
41 * A debug store configuration.
42 *
43 * We only support architectures that use 64bit fields.
44 */
45struct debug_store {
46 u64 bts_buffer_base;
47 u64 bts_index;
48 u64 bts_absolute_maximum;
49 u64 bts_interrupt_threshold;
50 u64 pebs_buffer_base;
51 u64 pebs_index;
52 u64 pebs_absolute_maximum;
53 u64 pebs_interrupt_threshold;
54 u64 pebs_event_reset[MAX_PEBS_EVENTS];
55};
56
57static void init_debug_store_on_cpu(int cpu)
58{
59 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
60
61 if (!ds)
62 return;
63
64 wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA,
65 (u32)((u64)(unsigned long)ds),
66 (u32)((u64)(unsigned long)ds >> 32));
67}
68
69static void fini_debug_store_on_cpu(int cpu)
70{
71 if (!per_cpu(cpu_hw_events, cpu).ds)
72 return;
73
74 wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
75}
76
77static void release_ds_buffers(void)
78{
79 int cpu;
80
81 if (!x86_pmu.bts && !x86_pmu.pebs)
82 return;
83
84 get_online_cpus();
85
86 for_each_online_cpu(cpu)
87 fini_debug_store_on_cpu(cpu);
88
89 for_each_possible_cpu(cpu) {
90 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
91
92 if (!ds)
93 continue;
94
95 per_cpu(cpu_hw_events, cpu).ds = NULL;
96
97 kfree((void *)(unsigned long)ds->pebs_buffer_base);
98 kfree((void *)(unsigned long)ds->bts_buffer_base);
99 kfree(ds);
100 }
101
102 put_online_cpus();
103}
104
105static int reserve_ds_buffers(void)
106{
107 int cpu, err = 0;
108
109 if (!x86_pmu.bts && !x86_pmu.pebs)
110 return 0;
111
112 get_online_cpus();
113
114 for_each_possible_cpu(cpu) {
115 struct debug_store *ds;
116 void *buffer;
117 int max, thresh;
118
119 err = -ENOMEM;
120 ds = kzalloc(sizeof(*ds), GFP_KERNEL);
121 if (unlikely(!ds))
122 break;
123 per_cpu(cpu_hw_events, cpu).ds = ds;
124
125 if (x86_pmu.bts) {
126 buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL);
127 if (unlikely(!buffer))
128 break;
129
130 max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;
131 thresh = max / 16;
132
133 ds->bts_buffer_base = (u64)(unsigned long)buffer;
134 ds->bts_index = ds->bts_buffer_base;
135 ds->bts_absolute_maximum = ds->bts_buffer_base +
136 max * BTS_RECORD_SIZE;
137 ds->bts_interrupt_threshold = ds->bts_absolute_maximum -
138 thresh * BTS_RECORD_SIZE;
139 }
140
141 if (x86_pmu.pebs) {
142 buffer = kzalloc(PEBS_BUFFER_SIZE, GFP_KERNEL);
143 if (unlikely(!buffer))
144 break;
145
146 max = PEBS_BUFFER_SIZE / x86_pmu.pebs_record_size;
147
148 ds->pebs_buffer_base = (u64)(unsigned long)buffer;
149 ds->pebs_index = ds->pebs_buffer_base;
150 ds->pebs_absolute_maximum = ds->pebs_buffer_base +
151 max * x86_pmu.pebs_record_size;
152 /*
153 * Always use single record PEBS
154 */
155 ds->pebs_interrupt_threshold = ds->pebs_buffer_base +
156 x86_pmu.pebs_record_size;
157 }
158
159 err = 0;
160 }
161
162 if (err)
163 release_ds_buffers();
164 else {
165 for_each_online_cpu(cpu)
166 init_debug_store_on_cpu(cpu);
167 }
168
169 put_online_cpus();
170
171 return err;
172}
173
174/*
175 * BTS
176 */
177
178static struct event_constraint bts_constraint =
179 EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_FIXED_BTS, 0);
180
181static void intel_pmu_enable_bts(u64 config)
182{
183 unsigned long debugctlmsr;
184
185 debugctlmsr = get_debugctlmsr();
186
187 debugctlmsr |= DEBUGCTLMSR_TR;
188 debugctlmsr |= DEBUGCTLMSR_BTS;
189 debugctlmsr |= DEBUGCTLMSR_BTINT;
190
191 if (!(config & ARCH_PERFMON_EVENTSEL_OS))
192 debugctlmsr |= DEBUGCTLMSR_BTS_OFF_OS;
193
194 if (!(config & ARCH_PERFMON_EVENTSEL_USR))
195 debugctlmsr |= DEBUGCTLMSR_BTS_OFF_USR;
196
197 update_debugctlmsr(debugctlmsr);
198}
199
200static void intel_pmu_disable_bts(void)
201{
202 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
203 unsigned long debugctlmsr;
204
205 if (!cpuc->ds)
206 return;
207
208 debugctlmsr = get_debugctlmsr();
209
210 debugctlmsr &=
211 ~(DEBUGCTLMSR_TR | DEBUGCTLMSR_BTS | DEBUGCTLMSR_BTINT |
212 DEBUGCTLMSR_BTS_OFF_OS | DEBUGCTLMSR_BTS_OFF_USR);
213
214 update_debugctlmsr(debugctlmsr);
215}
216
217static void intel_pmu_drain_bts_buffer(void)
218{
219 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
220 struct debug_store *ds = cpuc->ds;
221 struct bts_record {
222 u64 from;
223 u64 to;
224 u64 flags;
225 };
226 struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS];
227 struct bts_record *at, *top;
228 struct perf_output_handle handle;
229 struct perf_event_header header;
230 struct perf_sample_data data;
231 struct pt_regs regs;
232
233 if (!event)
234 return;
235
236 if (!ds)
237 return;
238
239 at = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
240 top = (struct bts_record *)(unsigned long)ds->bts_index;
241
242 if (top <= at)
243 return;
244
245 ds->bts_index = ds->bts_buffer_base;
246
247 perf_sample_data_init(&data, 0);
248 data.period = event->hw.last_period;
249 regs.ip = 0;
250
251 /*
252 * Prepare a generic sample, i.e. fill in the invariant fields.
253 * We will overwrite the from and to address before we output
254 * the sample.
255 */
256 perf_prepare_sample(&header, &data, event, &regs);
257
258 if (perf_output_begin(&handle, event, header.size * (top - at), 1, 1))
259 return;
260
261 for (; at < top; at++) {
262 data.ip = at->from;
263 data.addr = at->to;
264
265 perf_output_sample(&handle, &header, &data, event);
266 }
267
268 perf_output_end(&handle);
269
270 /* There's new data available. */
271 event->hw.interrupts++;
272 event->pending_kill = POLL_IN;
273}
274
275/*
276 * PEBS
277 */
278
279static struct event_constraint intel_core_pebs_events[] = {
280 PEBS_EVENT_CONSTRAINT(0x00c0, 0x1), /* INSTR_RETIRED.ANY */
281 PEBS_EVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */
282 PEBS_EVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */
283	PEBS_EVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETIRED.ANY */
284 PEBS_EVENT_CONSTRAINT(0x01cb, 0x1), /* MEM_LOAD_RETIRED.L1D_MISS */
285 PEBS_EVENT_CONSTRAINT(0x02cb, 0x1), /* MEM_LOAD_RETIRED.L1D_LINE_MISS */
286 PEBS_EVENT_CONSTRAINT(0x04cb, 0x1), /* MEM_LOAD_RETIRED.L2_MISS */
287 PEBS_EVENT_CONSTRAINT(0x08cb, 0x1), /* MEM_LOAD_RETIRED.L2_LINE_MISS */
288 PEBS_EVENT_CONSTRAINT(0x10cb, 0x1), /* MEM_LOAD_RETIRED.DTLB_MISS */
289 EVENT_CONSTRAINT_END
290};
291
292static struct event_constraint intel_nehalem_pebs_events[] = {
293 PEBS_EVENT_CONSTRAINT(0x00c0, 0xf), /* INSTR_RETIRED.ANY */
294 PEBS_EVENT_CONSTRAINT(0xfec1, 0xf), /* X87_OPS_RETIRED.ANY */
295 PEBS_EVENT_CONSTRAINT(0x00c5, 0xf), /* BR_INST_RETIRED.MISPRED */
296	PEBS_EVENT_CONSTRAINT(0x1fc7, 0xf), /* SIMD_INST_RETIRED.ANY */
297 PEBS_EVENT_CONSTRAINT(0x01cb, 0xf), /* MEM_LOAD_RETIRED.L1D_MISS */
298 PEBS_EVENT_CONSTRAINT(0x02cb, 0xf), /* MEM_LOAD_RETIRED.L1D_LINE_MISS */
299 PEBS_EVENT_CONSTRAINT(0x04cb, 0xf), /* MEM_LOAD_RETIRED.L2_MISS */
300 PEBS_EVENT_CONSTRAINT(0x08cb, 0xf), /* MEM_LOAD_RETIRED.L2_LINE_MISS */
301 PEBS_EVENT_CONSTRAINT(0x10cb, 0xf), /* MEM_LOAD_RETIRED.DTLB_MISS */
302 EVENT_CONSTRAINT_END
303};
304
305static struct event_constraint *
306intel_pebs_constraints(struct perf_event *event)
307{
308 struct event_constraint *c;
309
310 if (!event->attr.precise_ip)
311 return NULL;
312
313 if (x86_pmu.pebs_constraints) {
314 for_each_event_constraint(c, x86_pmu.pebs_constraints) {
315 if ((event->hw.config & c->cmask) == c->code)
316 return c;
317 }
318 }
319
320 return &emptyconstraint;
321}
322
323static void intel_pmu_pebs_enable(struct perf_event *event)
324{
325 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
326 struct hw_perf_event *hwc = &event->hw;
327
328 hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;
329
330 cpuc->pebs_enabled |= 1ULL << hwc->idx;
331 WARN_ON_ONCE(cpuc->enabled);
332
333 if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1)
334 intel_pmu_lbr_enable(event);
335}
336
337static void intel_pmu_pebs_disable(struct perf_event *event)
338{
339 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
340 struct hw_perf_event *hwc = &event->hw;
341
342 cpuc->pebs_enabled &= ~(1ULL << hwc->idx);
343 if (cpuc->enabled)
344 wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
345
346 hwc->config |= ARCH_PERFMON_EVENTSEL_INT;
347
348 if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1)
349 intel_pmu_lbr_disable(event);
350}
351
352static void intel_pmu_pebs_enable_all(void)
353{
354 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
355
356 if (cpuc->pebs_enabled)
357 wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
358}
359
360static void intel_pmu_pebs_disable_all(void)
361{
362 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
363
364 if (cpuc->pebs_enabled)
365 wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
366}
367
368#include <asm/insn.h>
369
370static inline bool kernel_ip(unsigned long ip)
371{
372#ifdef CONFIG_X86_32
373 return ip > PAGE_OFFSET;
374#else
375 return (long)ip < 0;
376#endif
377}
378
379static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
380{
381 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
382 unsigned long from = cpuc->lbr_entries[0].from;
383 unsigned long old_to, to = cpuc->lbr_entries[0].to;
384 unsigned long ip = regs->ip;
385
386 /*
387	 * We don't need a fixup if the PEBS assist is fault-like
388 */
389 if (!x86_pmu.intel_cap.pebs_trap)
390 return 1;
391
392 /*
393 * No LBR entry, no basic block, no rewinding
394 */
395 if (!cpuc->lbr_stack.nr || !from || !to)
396 return 0;
397
398 /*
399 * Basic blocks should never cross user/kernel boundaries
400 */
401 if (kernel_ip(ip) != kernel_ip(to))
402 return 0;
403
404 /*
405 * unsigned math, either ip is before the start (impossible) or
406 * the basic block is larger than 1 page (sanity)
407 */
408 if ((ip - to) > PAGE_SIZE)
409 return 0;
410
411 /*
412 * We sampled a branch insn, rewind using the LBR stack
413 */
414 if (ip == to) {
415 regs->ip = from;
416 return 1;
417 }
418
419 do {
420 struct insn insn;
421 u8 buf[MAX_INSN_SIZE];
422 void *kaddr;
423
424 old_to = to;
425 if (!kernel_ip(ip)) {
426 int bytes, size = MAX_INSN_SIZE;
427
428 bytes = copy_from_user_nmi(buf, (void __user *)to, size);
429 if (bytes != size)
430 return 0;
431
432 kaddr = buf;
433 } else
434 kaddr = (void *)to;
435
436 kernel_insn_init(&insn, kaddr);
437 insn_get_length(&insn);
438 to += insn.length;
439 } while (to < ip);
440
441 if (to == ip) {
442 regs->ip = old_to;
443 return 1;
444 }
445
446 /*
447 * Even though we decoded the basic block, the instruction stream
448 * never matched the given IP, either the TO or the IP got corrupted.
449 */
450 return 0;
451}
452
453static int intel_pmu_save_and_restart(struct perf_event *event);
454
455static void __intel_pmu_pebs_event(struct perf_event *event,
456 struct pt_regs *iregs, void *__pebs)
457{
458 /*
459 * We cast to pebs_record_core since that is a subset of
460 * both formats and we don't use the other fields in this
461 * routine.
462 */
463 struct pebs_record_core *pebs = __pebs;
464 struct perf_sample_data data;
465 struct pt_regs regs;
466
467 if (!intel_pmu_save_and_restart(event))
468 return;
469
470 perf_sample_data_init(&data, 0);
471 data.period = event->hw.last_period;
472
473 /*
474 * We use the interrupt regs as a base because the PEBS record
475 * does not contain a full regs set, specifically it seems to
476 * lack segment descriptors, which get used by things like
477 * user_mode().
478 *
479 * In the simple case fix up only the IP and BP,SP regs, for
480 * PERF_SAMPLE_IP and PERF_SAMPLE_CALLCHAIN to function properly.
481 * A possible PERF_SAMPLE_REGS will have to transfer all regs.
482 */
483 regs = *iregs;
484 regs.ip = pebs->ip;
485 regs.bp = pebs->bp;
486 regs.sp = pebs->sp;
487
488 if (event->attr.precise_ip > 1 && intel_pmu_pebs_fixup_ip(&regs))
489 regs.flags |= PERF_EFLAGS_EXACT;
490 else
491 regs.flags &= ~PERF_EFLAGS_EXACT;
492
493 if (perf_event_overflow(event, 1, &data, &regs))
494 x86_pmu_stop(event);
495}
496
497static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
498{
499 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
500 struct debug_store *ds = cpuc->ds;
501 struct perf_event *event = cpuc->events[0]; /* PMC0 only */
502 struct pebs_record_core *at, *top;
503 int n;
504
505 if (!ds || !x86_pmu.pebs)
506 return;
507
508 at = (struct pebs_record_core *)(unsigned long)ds->pebs_buffer_base;
509 top = (struct pebs_record_core *)(unsigned long)ds->pebs_index;
510
511 /*
512 * Whatever else happens, drain the thing
513 */
514 ds->pebs_index = ds->pebs_buffer_base;
515
516 if (!test_bit(0, cpuc->active_mask))
517 return;
518
519 WARN_ON_ONCE(!event);
520
521 if (!event->attr.precise_ip)
522 return;
523
524 n = top - at;
525 if (n <= 0)
526 return;
527
528 /*
529 * Should not happen, we program the threshold at 1 and do not
530 * set a reset value.
531 */
532 WARN_ON_ONCE(n > 1);
533 at += n - 1;
534
535 __intel_pmu_pebs_event(event, iregs, at);
536}
537
538static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
539{
540 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
541 struct debug_store *ds = cpuc->ds;
542 struct pebs_record_nhm *at, *top;
543 struct perf_event *event = NULL;
544 u64 status = 0;
545 int bit, n;
546
547 if (!ds || !x86_pmu.pebs)
548 return;
549
550 at = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base;
551 top = (struct pebs_record_nhm *)(unsigned long)ds->pebs_index;
552
553 ds->pebs_index = ds->pebs_buffer_base;
554
555 n = top - at;
556 if (n <= 0)
557 return;
558
559 /*
560 * Should not happen, we program the threshold at 1 and do not
561 * set a reset value.
562 */
563 WARN_ON_ONCE(n > MAX_PEBS_EVENTS);
564
565 for ( ; at < top; at++) {
566 for_each_set_bit(bit, (unsigned long *)&at->status, MAX_PEBS_EVENTS) {
567 event = cpuc->events[bit];
568 if (!test_bit(bit, cpuc->active_mask))
569 continue;
570
571 WARN_ON_ONCE(!event);
572
573 if (!event->attr.precise_ip)
574 continue;
575
576 if (__test_and_set_bit(bit, (unsigned long *)&status))
577 continue;
578
579 break;
580 }
581
582 if (!event || bit >= MAX_PEBS_EVENTS)
583 continue;
584
585 __intel_pmu_pebs_event(event, iregs, at);
586 }
587}
588
589/*
590 * BTS, PEBS probe and setup
591 */
592
593static void intel_ds_init(void)
594{
595 /*
596 * No support for 32bit formats
597 */
598 if (!boot_cpu_has(X86_FEATURE_DTES64))
599 return;
600
601 x86_pmu.bts = boot_cpu_has(X86_FEATURE_BTS);
602 x86_pmu.pebs = boot_cpu_has(X86_FEATURE_PEBS);
603 if (x86_pmu.pebs) {
604 char pebs_type = x86_pmu.intel_cap.pebs_trap ? '+' : '-';
605 int format = x86_pmu.intel_cap.pebs_format;
606
607 switch (format) {
608 case 0:
609 printk(KERN_CONT "PEBS fmt0%c, ", pebs_type);
610 x86_pmu.pebs_record_size = sizeof(struct pebs_record_core);
611 x86_pmu.drain_pebs = intel_pmu_drain_pebs_core;
612 x86_pmu.pebs_constraints = intel_core_pebs_events;
613 break;
614
615 case 1:
616 printk(KERN_CONT "PEBS fmt1%c, ", pebs_type);
617 x86_pmu.pebs_record_size = sizeof(struct pebs_record_nhm);
618 x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
619 x86_pmu.pebs_constraints = intel_nehalem_pebs_events;
620 break;
621
622 default:
623 printk(KERN_CONT "no PEBS fmt%d%c, ", format, pebs_type);
624 x86_pmu.pebs = 0;
625 break;
626 }
627 }
628}
629
630#else /* CONFIG_CPU_SUP_INTEL */
631
632static int reserve_ds_buffers(void)
633{
634 return 0;
635}
636
637static void release_ds_buffers(void)
638{
639}
640
641#endif /* CONFIG_CPU_SUP_INTEL */
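As a back-of-envelope check of the buffer geometry that reserve_ds_buffers() sets up above (64 KiB of BTS space with 24-byte records, one page of PEBS space, and a PEBS threshold of a single record), the sketch below redoes the arithmetic in user space; PAGE_SIZE = 4096 and the 176-byte Nehalem record size are assumptions for the calculation.

/*
 * Back-of-envelope check of the DS buffer geometry from reserve_ds_buffers().
 * PAGE_SIZE = 4096 and the 176-byte pebs_record_nhm size (22 u64 fields) are
 * assumptions for this sketch; the kernel uses the real values.
 */
#include <stdio.h>

#define PAGE_SIZE	 4096UL
#define BTS_RECORD_SIZE	 24UL
#define BTS_BUFFER_SIZE	 (PAGE_SIZE << 4)	/* 64 KiB */
#define PEBS_BUFFER_SIZE PAGE_SIZE		/* one page */

int main(void)
{
	unsigned long bts_max    = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;	/* 2730 */
	unsigned long bts_thresh = bts_max / 16;			/* 170 */
	unsigned long nhm_record = 22 * 8;				/* 176 bytes */
	unsigned long pebs_max   = PEBS_BUFFER_SIZE / nhm_record;	/* 23 */

	printf("BTS: %lu records fit, interrupt %lu records before the end\n",
	       bts_max, bts_thresh);
	printf("PEBS: %lu records fit, but the threshold is one record\n",
	       pebs_max);
	return 0;
}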
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
new file mode 100644
index 000000000000..d202c1bece1a
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -0,0 +1,218 @@
1#ifdef CONFIG_CPU_SUP_INTEL
2
3enum {
4 LBR_FORMAT_32 = 0x00,
5 LBR_FORMAT_LIP = 0x01,
6 LBR_FORMAT_EIP = 0x02,
7 LBR_FORMAT_EIP_FLAGS = 0x03,
8};
9
10/*
11 * We only support LBR implementations that have FREEZE_LBRS_ON_PMI,
12 * otherwise it becomes nearly impossible to get a reliable stack.
13 */
14
15static void __intel_pmu_lbr_enable(void)
16{
17 u64 debugctl;
18
19 rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
20 debugctl |= (DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
21 wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
22}
23
24static void __intel_pmu_lbr_disable(void)
25{
26 u64 debugctl;
27
28 rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
29 debugctl &= ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
30 wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
31}
32
33static void intel_pmu_lbr_reset_32(void)
34{
35 int i;
36
37 for (i = 0; i < x86_pmu.lbr_nr; i++)
38 wrmsrl(x86_pmu.lbr_from + i, 0);
39}
40
41static void intel_pmu_lbr_reset_64(void)
42{
43 int i;
44
45 for (i = 0; i < x86_pmu.lbr_nr; i++) {
46 wrmsrl(x86_pmu.lbr_from + i, 0);
47 wrmsrl(x86_pmu.lbr_to + i, 0);
48 }
49}
50
51static void intel_pmu_lbr_reset(void)
52{
53 if (!x86_pmu.lbr_nr)
54 return;
55
56 if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32)
57 intel_pmu_lbr_reset_32();
58 else
59 intel_pmu_lbr_reset_64();
60}
61
62static void intel_pmu_lbr_enable(struct perf_event *event)
63{
64 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
65
66 if (!x86_pmu.lbr_nr)
67 return;
68
69 WARN_ON_ONCE(cpuc->enabled);
70
71 /*
72 * Reset the LBR stack if we changed task context to
73 * avoid data leaks.
74 */
75
76 if (event->ctx->task && cpuc->lbr_context != event->ctx) {
77 intel_pmu_lbr_reset();
78 cpuc->lbr_context = event->ctx;
79 }
80
81 cpuc->lbr_users++;
82}
83
84static void intel_pmu_lbr_disable(struct perf_event *event)
85{
86 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
87
88 if (!x86_pmu.lbr_nr)
89 return;
90
91 cpuc->lbr_users--;
92 WARN_ON_ONCE(cpuc->lbr_users < 0);
93
94 if (cpuc->enabled && !cpuc->lbr_users)
95 __intel_pmu_lbr_disable();
96}
97
98static void intel_pmu_lbr_enable_all(void)
99{
100 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
101
102 if (cpuc->lbr_users)
103 __intel_pmu_lbr_enable();
104}
105
106static void intel_pmu_lbr_disable_all(void)
107{
108 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
109
110 if (cpuc->lbr_users)
111 __intel_pmu_lbr_disable();
112}
113
114static inline u64 intel_pmu_lbr_tos(void)
115{
116 u64 tos;
117
118 rdmsrl(x86_pmu.lbr_tos, tos);
119
120 return tos;
121}
122
123static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
124{
125 unsigned long mask = x86_pmu.lbr_nr - 1;
126 u64 tos = intel_pmu_lbr_tos();
127 int i;
128
129 for (i = 0; i < x86_pmu.lbr_nr; i++) {
130 unsigned long lbr_idx = (tos - i) & mask;
131 union {
132 struct {
133 u32 from;
134 u32 to;
135 };
136 u64 lbr;
137 } msr_lastbranch;
138
139 rdmsrl(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr);
140
141 cpuc->lbr_entries[i].from = msr_lastbranch.from;
142 cpuc->lbr_entries[i].to = msr_lastbranch.to;
143 cpuc->lbr_entries[i].flags = 0;
144 }
145 cpuc->lbr_stack.nr = i;
146}
147
148#define LBR_FROM_FLAG_MISPRED (1ULL << 63)
149
150/*
151 * Due to lack of segmentation in Linux the effective address (offset)
152 * is the same as the linear address, allowing us to merge the LIP and EIP
153 * LBR formats.
154 */
155static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
156{
157 unsigned long mask = x86_pmu.lbr_nr - 1;
158 int lbr_format = x86_pmu.intel_cap.lbr_format;
159 u64 tos = intel_pmu_lbr_tos();
160 int i;
161
162 for (i = 0; i < x86_pmu.lbr_nr; i++) {
163 unsigned long lbr_idx = (tos - i) & mask;
164 u64 from, to, flags = 0;
165
166 rdmsrl(x86_pmu.lbr_from + lbr_idx, from);
167 rdmsrl(x86_pmu.lbr_to + lbr_idx, to);
168
169 if (lbr_format == LBR_FORMAT_EIP_FLAGS) {
170 flags = !!(from & LBR_FROM_FLAG_MISPRED);
171 from = (u64)((((s64)from) << 1) >> 1);
172 }
173
174 cpuc->lbr_entries[i].from = from;
175 cpuc->lbr_entries[i].to = to;
176 cpuc->lbr_entries[i].flags = flags;
177 }
178 cpuc->lbr_stack.nr = i;
179}
180
181static void intel_pmu_lbr_read(void)
182{
183 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
184
185 if (!cpuc->lbr_users)
186 return;
187
188 if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32)
189 intel_pmu_lbr_read_32(cpuc);
190 else
191 intel_pmu_lbr_read_64(cpuc);
192}
193
194static void intel_pmu_lbr_init_core(void)
195{
196 x86_pmu.lbr_nr = 4;
197 x86_pmu.lbr_tos = 0x01c9;
198 x86_pmu.lbr_from = 0x40;
199 x86_pmu.lbr_to = 0x60;
200}
201
202static void intel_pmu_lbr_init_nhm(void)
203{
204 x86_pmu.lbr_nr = 16;
205 x86_pmu.lbr_tos = 0x01c9;
206 x86_pmu.lbr_from = 0x680;
207 x86_pmu.lbr_to = 0x6c0;
208}
209
210static void intel_pmu_lbr_init_atom(void)
211{
212 x86_pmu.lbr_nr = 8;
213 x86_pmu.lbr_tos = 0x01c9;
214 x86_pmu.lbr_from = 0x40;
215 x86_pmu.lbr_to = 0x60;
216}
217
218#endif /* CONFIG_CPU_SUP_INTEL */
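intel_pmu_lbr_read_64() above strips the mispredict flag out of bit 63 of LBR_FROM with a shift-left/arithmetic-shift-right pair and walks the LBR ring backwards from the TOS with (tos - i) & mask. The standalone sketch below replays both operations on an invented sample value; the user-space address and the 16-entry, TOS = 3 ring are assumptions.

/*
 * Standalone sketch of the two LBR_FROM manipulations above. The sample
 * value (a user-space address with the mispredict bit set) and the ring
 * parameters are invented for illustration.
 */
#include <stdio.h>
#include <stdint.h>

#define LBR_FROM_FLAG_MISPRED	(1ULL << 63)

int main(void)
{
	uint64_t from  = LBR_FROM_FLAG_MISPRED | 0x0000000000400123ULL;
	uint64_t flags = !!(from & LBR_FROM_FLAG_MISPRED);
	unsigned int nr = 16, mask = nr - 1, tos = 3, i;

	/* Same trick as intel_pmu_lbr_read_64(): drop bit 63 and replace it
	 * with a copy of bit 62, so kernel addresses stay sign-extended. */
	from = (uint64_t)(((int64_t)from << 1) >> 1);

	printf("mispredicted=%llu from=%#llx\n",
	       (unsigned long long)flags, (unsigned long long)from);

	/* Ring walk: newest entry at TOS, older entries wrap around. */
	for (i = 0; i < 4; i++)
		printf("entry %u -> LBR MSR index %u\n", i, (tos - i) & mask);

	return 0;
}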
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
new file mode 100644
index 000000000000..249015173992
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -0,0 +1,951 @@
1/*
2 * Netburst Performance Events (P4, old Xeon)
3 *
4 * Copyright (C) 2010 Parallels, Inc., Cyrill Gorcunov <gorcunov@openvz.org>
5 * Copyright (C) 2010 Intel Corporation, Lin Ming <ming.m.lin@intel.com>
6 *
7 * For licencing details see kernel-base/COPYING
8 */
9
10#ifdef CONFIG_CPU_SUP_INTEL
11
12#include <asm/perf_event_p4.h>
13
14#define P4_CNTR_LIMIT 3
15/*
16 * array indices: 0,1 - HT threads, used on an HT-enabled CPU
17 */
18struct p4_event_bind {
19 unsigned int opcode; /* Event code and ESCR selector */
20 unsigned int escr_msr[2]; /* ESCR MSR for this event */
21	char cntr[2][P4_CNTR_LIMIT];		/* counter index (offset), -1 on absence */
22};
23
24struct p4_pebs_bind {
25 unsigned int metric_pebs;
26 unsigned int metric_vert;
27};
28
29/* it sets P4_PEBS_ENABLE_UOP_TAG as well */
30#define P4_GEN_PEBS_BIND(name, pebs, vert) \
31 [P4_PEBS_METRIC__##name] = { \
32 .metric_pebs = pebs | P4_PEBS_ENABLE_UOP_TAG, \
33 .metric_vert = vert, \
34 }
35
36/*
37 * note we have P4_PEBS_ENABLE_UOP_TAG always set here
38 *
39 * it's needed for mapping P4_PEBS_CONFIG_METRIC_MASK bits of
40 * event configuration to find out which values are to be
41 * written into MSR_IA32_PEBS_ENABLE and MSR_P4_PEBS_MATRIX_VERT
42 * registers
43 */
44static struct p4_pebs_bind p4_pebs_bind_map[] = {
45 P4_GEN_PEBS_BIND(1stl_cache_load_miss_retired, 0x0000001, 0x0000001),
46 P4_GEN_PEBS_BIND(2ndl_cache_load_miss_retired, 0x0000002, 0x0000001),
47 P4_GEN_PEBS_BIND(dtlb_load_miss_retired, 0x0000004, 0x0000001),
48 P4_GEN_PEBS_BIND(dtlb_store_miss_retired, 0x0000004, 0x0000002),
49 P4_GEN_PEBS_BIND(dtlb_all_miss_retired, 0x0000004, 0x0000003),
50 P4_GEN_PEBS_BIND(tagged_mispred_branch, 0x0018000, 0x0000010),
51 P4_GEN_PEBS_BIND(mob_load_replay_retired, 0x0000200, 0x0000001),
52 P4_GEN_PEBS_BIND(split_load_retired, 0x0000400, 0x0000001),
53 P4_GEN_PEBS_BIND(split_store_retired, 0x0000400, 0x0000002),
54};
55
56/*
57 * Note that we don't use CCCR1 here; there is an
58 * exception for P4_BSQ_ALLOCATION, but we have
59 * no workaround for it.
60 *
61 * Consider this binding as the resources a particular
62 * event may borrow; it doesn't contain EventMask,
63 * Tags and friends -- they are left to the caller
64 */
65static struct p4_event_bind p4_event_bind_map[] = {
66 [P4_EVENT_TC_DELIVER_MODE] = {
67 .opcode = P4_OPCODE(P4_EVENT_TC_DELIVER_MODE),
68 .escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 },
69 .cntr = { {4, 5, -1}, {6, 7, -1} },
70 },
71 [P4_EVENT_BPU_FETCH_REQUEST] = {
72 .opcode = P4_OPCODE(P4_EVENT_BPU_FETCH_REQUEST),
73 .escr_msr = { MSR_P4_BPU_ESCR0, MSR_P4_BPU_ESCR1 },
74 .cntr = { {0, -1, -1}, {2, -1, -1} },
75 },
76 [P4_EVENT_ITLB_REFERENCE] = {
77 .opcode = P4_OPCODE(P4_EVENT_ITLB_REFERENCE),
78 .escr_msr = { MSR_P4_ITLB_ESCR0, MSR_P4_ITLB_ESCR1 },
79 .cntr = { {0, -1, -1}, {2, -1, -1} },
80 },
81 [P4_EVENT_MEMORY_CANCEL] = {
82 .opcode = P4_OPCODE(P4_EVENT_MEMORY_CANCEL),
83 .escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 },
84 .cntr = { {8, 9, -1}, {10, 11, -1} },
85 },
86 [P4_EVENT_MEMORY_COMPLETE] = {
87 .opcode = P4_OPCODE(P4_EVENT_MEMORY_COMPLETE),
88 .escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 },
89 .cntr = { {8, 9, -1}, {10, 11, -1} },
90 },
91 [P4_EVENT_LOAD_PORT_REPLAY] = {
92 .opcode = P4_OPCODE(P4_EVENT_LOAD_PORT_REPLAY),
93 .escr_msr = { MSR_P4_SAAT_ESCR0, MSR_P4_SAAT_ESCR1 },
94 .cntr = { {8, 9, -1}, {10, 11, -1} },
95 },
96 [P4_EVENT_STORE_PORT_REPLAY] = {
97 .opcode = P4_OPCODE(P4_EVENT_STORE_PORT_REPLAY),
98 .escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 },
99 .cntr = { {8, 9, -1}, {10, 11, -1} },
100 },
101 [P4_EVENT_MOB_LOAD_REPLAY] = {
102 .opcode = P4_OPCODE(P4_EVENT_MOB_LOAD_REPLAY),
103 .escr_msr = { MSR_P4_MOB_ESCR0, MSR_P4_MOB_ESCR1 },
104 .cntr = { {0, -1, -1}, {2, -1, -1} },
105 },
106 [P4_EVENT_PAGE_WALK_TYPE] = {
107 .opcode = P4_OPCODE(P4_EVENT_PAGE_WALK_TYPE),
108 .escr_msr = { MSR_P4_PMH_ESCR0, MSR_P4_PMH_ESCR1 },
109 .cntr = { {0, -1, -1}, {2, -1, -1} },
110 },
111 [P4_EVENT_BSQ_CACHE_REFERENCE] = {
112 .opcode = P4_OPCODE(P4_EVENT_BSQ_CACHE_REFERENCE),
113 .escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1 },
114 .cntr = { {0, -1, -1}, {2, -1, -1} },
115 },
116 [P4_EVENT_IOQ_ALLOCATION] = {
117 .opcode = P4_OPCODE(P4_EVENT_IOQ_ALLOCATION),
118 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
119 .cntr = { {0, -1, -1}, {2, -1, -1} },
120 },
121 [P4_EVENT_IOQ_ACTIVE_ENTRIES] = { /* shared ESCR */
122 .opcode = P4_OPCODE(P4_EVENT_IOQ_ACTIVE_ENTRIES),
123 .escr_msr = { MSR_P4_FSB_ESCR1, MSR_P4_FSB_ESCR1 },
124 .cntr = { {2, -1, -1}, {3, -1, -1} },
125 },
126 [P4_EVENT_FSB_DATA_ACTIVITY] = {
127 .opcode = P4_OPCODE(P4_EVENT_FSB_DATA_ACTIVITY),
128 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
129 .cntr = { {0, -1, -1}, {2, -1, -1} },
130 },
131 [P4_EVENT_BSQ_ALLOCATION] = { /* shared ESCR, broken CCCR1 */
132 .opcode = P4_OPCODE(P4_EVENT_BSQ_ALLOCATION),
133 .escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR0 },
134 .cntr = { {0, -1, -1}, {1, -1, -1} },
135 },
136 [P4_EVENT_BSQ_ACTIVE_ENTRIES] = { /* shared ESCR */
137 .opcode = P4_OPCODE(P4_EVENT_BSQ_ACTIVE_ENTRIES),
138 .escr_msr = { MSR_P4_BSU_ESCR1 , MSR_P4_BSU_ESCR1 },
139 .cntr = { {2, -1, -1}, {3, -1, -1} },
140 },
141 [P4_EVENT_SSE_INPUT_ASSIST] = {
142 .opcode = P4_OPCODE(P4_EVENT_SSE_INPUT_ASSIST),
143 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
144 .cntr = { {8, 9, -1}, {10, 11, -1} },
145 },
146 [P4_EVENT_PACKED_SP_UOP] = {
147 .opcode = P4_OPCODE(P4_EVENT_PACKED_SP_UOP),
148 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
149 .cntr = { {8, 9, -1}, {10, 11, -1} },
150 },
151 [P4_EVENT_PACKED_DP_UOP] = {
152 .opcode = P4_OPCODE(P4_EVENT_PACKED_DP_UOP),
153 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
154 .cntr = { {8, 9, -1}, {10, 11, -1} },
155 },
156 [P4_EVENT_SCALAR_SP_UOP] = {
157 .opcode = P4_OPCODE(P4_EVENT_SCALAR_SP_UOP),
158 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
159 .cntr = { {8, 9, -1}, {10, 11, -1} },
160 },
161 [P4_EVENT_SCALAR_DP_UOP] = {
162 .opcode = P4_OPCODE(P4_EVENT_SCALAR_DP_UOP),
163 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
164 .cntr = { {8, 9, -1}, {10, 11, -1} },
165 },
166 [P4_EVENT_64BIT_MMX_UOP] = {
167 .opcode = P4_OPCODE(P4_EVENT_64BIT_MMX_UOP),
168 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
169 .cntr = { {8, 9, -1}, {10, 11, -1} },
170 },
171 [P4_EVENT_128BIT_MMX_UOP] = {
172 .opcode = P4_OPCODE(P4_EVENT_128BIT_MMX_UOP),
173 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
174 .cntr = { {8, 9, -1}, {10, 11, -1} },
175 },
176 [P4_EVENT_X87_FP_UOP] = {
177 .opcode = P4_OPCODE(P4_EVENT_X87_FP_UOP),
178 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
179 .cntr = { {8, 9, -1}, {10, 11, -1} },
180 },
181 [P4_EVENT_TC_MISC] = {
182 .opcode = P4_OPCODE(P4_EVENT_TC_MISC),
183 .escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 },
184 .cntr = { {4, 5, -1}, {6, 7, -1} },
185 },
186 [P4_EVENT_GLOBAL_POWER_EVENTS] = {
187 .opcode = P4_OPCODE(P4_EVENT_GLOBAL_POWER_EVENTS),
188 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
189 .cntr = { {0, -1, -1}, {2, -1, -1} },
190 },
191 [P4_EVENT_TC_MS_XFER] = {
192 .opcode = P4_OPCODE(P4_EVENT_TC_MS_XFER),
193 .escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 },
194 .cntr = { {4, 5, -1}, {6, 7, -1} },
195 },
196 [P4_EVENT_UOP_QUEUE_WRITES] = {
197 .opcode = P4_OPCODE(P4_EVENT_UOP_QUEUE_WRITES),
198 .escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 },
199 .cntr = { {4, 5, -1}, {6, 7, -1} },
200 },
201 [P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE] = {
202 .opcode = P4_OPCODE(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE),
203 .escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR0 },
204 .cntr = { {4, 5, -1}, {6, 7, -1} },
205 },
206 [P4_EVENT_RETIRED_BRANCH_TYPE] = {
207 .opcode = P4_OPCODE(P4_EVENT_RETIRED_BRANCH_TYPE),
208 .escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR1 },
209 .cntr = { {4, 5, -1}, {6, 7, -1} },
210 },
211 [P4_EVENT_RESOURCE_STALL] = {
212 .opcode = P4_OPCODE(P4_EVENT_RESOURCE_STALL),
213 .escr_msr = { MSR_P4_ALF_ESCR0, MSR_P4_ALF_ESCR1 },
214 .cntr = { {12, 13, 16}, {14, 15, 17} },
215 },
216 [P4_EVENT_WC_BUFFER] = {
217 .opcode = P4_OPCODE(P4_EVENT_WC_BUFFER),
218 .escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 },
219 .cntr = { {8, 9, -1}, {10, 11, -1} },
220 },
221 [P4_EVENT_B2B_CYCLES] = {
222 .opcode = P4_OPCODE(P4_EVENT_B2B_CYCLES),
223 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
224 .cntr = { {0, -1, -1}, {2, -1, -1} },
225 },
226 [P4_EVENT_BNR] = {
227 .opcode = P4_OPCODE(P4_EVENT_BNR),
228 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
229 .cntr = { {0, -1, -1}, {2, -1, -1} },
230 },
231 [P4_EVENT_SNOOP] = {
232 .opcode = P4_OPCODE(P4_EVENT_SNOOP),
233 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
234 .cntr = { {0, -1, -1}, {2, -1, -1} },
235 },
236 [P4_EVENT_RESPONSE] = {
237 .opcode = P4_OPCODE(P4_EVENT_RESPONSE),
238 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
239 .cntr = { {0, -1, -1}, {2, -1, -1} },
240 },
241 [P4_EVENT_FRONT_END_EVENT] = {
242 .opcode = P4_OPCODE(P4_EVENT_FRONT_END_EVENT),
243 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
244 .cntr = { {12, 13, 16}, {14, 15, 17} },
245 },
246 [P4_EVENT_EXECUTION_EVENT] = {
247 .opcode = P4_OPCODE(P4_EVENT_EXECUTION_EVENT),
248 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
249 .cntr = { {12, 13, 16}, {14, 15, 17} },
250 },
251 [P4_EVENT_REPLAY_EVENT] = {
252 .opcode = P4_OPCODE(P4_EVENT_REPLAY_EVENT),
253 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
254 .cntr = { {12, 13, 16}, {14, 15, 17} },
255 },
256 [P4_EVENT_INSTR_RETIRED] = {
257 .opcode = P4_OPCODE(P4_EVENT_INSTR_RETIRED),
258 .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
259 .cntr = { {12, 13, 16}, {14, 15, 17} },
260 },
261 [P4_EVENT_UOPS_RETIRED] = {
262 .opcode = P4_OPCODE(P4_EVENT_UOPS_RETIRED),
263 .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
264 .cntr = { {12, 13, 16}, {14, 15, 17} },
265 },
266 [P4_EVENT_UOP_TYPE] = {
267 .opcode = P4_OPCODE(P4_EVENT_UOP_TYPE),
268 .escr_msr = { MSR_P4_RAT_ESCR0, MSR_P4_RAT_ESCR1 },
269 .cntr = { {12, 13, 16}, {14, 15, 17} },
270 },
271 [P4_EVENT_BRANCH_RETIRED] = {
272 .opcode = P4_OPCODE(P4_EVENT_BRANCH_RETIRED),
273 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
274 .cntr = { {12, 13, 16}, {14, 15, 17} },
275 },
276 [P4_EVENT_MISPRED_BRANCH_RETIRED] = {
277 .opcode = P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED),
278 .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
279 .cntr = { {12, 13, 16}, {14, 15, 17} },
280 },
281 [P4_EVENT_X87_ASSIST] = {
282 .opcode = P4_OPCODE(P4_EVENT_X87_ASSIST),
283 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
284 .cntr = { {12, 13, 16}, {14, 15, 17} },
285 },
286 [P4_EVENT_MACHINE_CLEAR] = {
287 .opcode = P4_OPCODE(P4_EVENT_MACHINE_CLEAR),
288 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
289 .cntr = { {12, 13, 16}, {14, 15, 17} },
290 },
291 [P4_EVENT_INSTR_COMPLETED] = {
292 .opcode = P4_OPCODE(P4_EVENT_INSTR_COMPLETED),
293 .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
294 .cntr = { {12, 13, 16}, {14, 15, 17} },
295 },
296};
297
298#define P4_GEN_CACHE_EVENT(event, bit, metric) \
299 p4_config_pack_escr(P4_ESCR_EVENT(event) | \
300 P4_ESCR_EMASK_BIT(event, bit)) | \
301 p4_config_pack_cccr(metric | \
302 P4_CCCR_ESEL(P4_OPCODE_ESEL(P4_OPCODE(event))))
303
304static __initconst const u64 p4_hw_cache_event_ids
305 [PERF_COUNT_HW_CACHE_MAX]
306 [PERF_COUNT_HW_CACHE_OP_MAX]
307 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
308{
309 [ C(L1D ) ] = {
310 [ C(OP_READ) ] = {
311 [ C(RESULT_ACCESS) ] = 0x0,
312 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
313 P4_PEBS_METRIC__1stl_cache_load_miss_retired),
314 },
315 },
316 [ C(LL ) ] = {
317 [ C(OP_READ) ] = {
318 [ C(RESULT_ACCESS) ] = 0x0,
319 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
320 P4_PEBS_METRIC__2ndl_cache_load_miss_retired),
321 },
322},
323 [ C(DTLB) ] = {
324 [ C(OP_READ) ] = {
325 [ C(RESULT_ACCESS) ] = 0x0,
326 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
327 P4_PEBS_METRIC__dtlb_load_miss_retired),
328 },
329 [ C(OP_WRITE) ] = {
330 [ C(RESULT_ACCESS) ] = 0x0,
331 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
332 P4_PEBS_METRIC__dtlb_store_miss_retired),
333 },
334 },
335 [ C(ITLB) ] = {
336 [ C(OP_READ) ] = {
337 [ C(RESULT_ACCESS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, HIT,
338 P4_PEBS_METRIC__none),
339 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, MISS,
340 P4_PEBS_METRIC__none),
341 },
342 [ C(OP_WRITE) ] = {
343 [ C(RESULT_ACCESS) ] = -1,
344 [ C(RESULT_MISS) ] = -1,
345 },
346 [ C(OP_PREFETCH) ] = {
347 [ C(RESULT_ACCESS) ] = -1,
348 [ C(RESULT_MISS) ] = -1,
349 },
350 },
351};
352
353static u64 p4_general_events[PERF_COUNT_HW_MAX] = {
354 /* non-halted CPU clocks */
355 [PERF_COUNT_HW_CPU_CYCLES] =
356 p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS) |
357 P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)),
358
359 /*
360 * retired instructions
361 * for the sake of simplicity we don't use the FSB tagging
362 */
363 [PERF_COUNT_HW_INSTRUCTIONS] =
364 p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_INSTR_RETIRED) |
365 P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSNTAG) |
366 P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSNTAG)),
367
368 /* cache hits */
369 [PERF_COUNT_HW_CACHE_REFERENCES] =
370 p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_BSQ_CACHE_REFERENCE) |
371 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITS) |
372 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITE) |
373 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITM) |
374 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITS) |
375 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITE) |
376 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITM)),
377
378 /* cache misses */
379 [PERF_COUNT_HW_CACHE_MISSES] =
380 p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_BSQ_CACHE_REFERENCE) |
381 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_MISS) |
382 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_MISS) |
383 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, WR_2ndL_MISS)),
384
385 /* branch instructions retired */
386 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] =
387 p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_RETIRED_BRANCH_TYPE) |
388 P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CONDITIONAL) |
389 P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CALL) |
390 P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, RETURN) |
391 P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, INDIRECT)),
392
393 /* mispredicted branches retired */
394 [PERF_COUNT_HW_BRANCH_MISSES] =
395 p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_MISPRED_BRANCH_RETIRED) |
396 P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS)),
397
398 /* bus ready clocks (cpu is driving #DRDY_DRV\#DRDY_OWN): */
399 [PERF_COUNT_HW_BUS_CYCLES] =
400 p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_FSB_DATA_ACTIVITY) |
401 P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV) |
402 P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN)) |
403 p4_config_pack_cccr(P4_CCCR_EDGE | P4_CCCR_COMPARE),
404};
405
406static struct p4_event_bind *p4_config_get_bind(u64 config)
407{
408 unsigned int evnt = p4_config_unpack_event(config);
409 struct p4_event_bind *bind = NULL;
410
411 if (evnt < ARRAY_SIZE(p4_event_bind_map))
412 bind = &p4_event_bind_map[evnt];
413
414 return bind;
415}
416
417static u64 p4_pmu_event_map(int hw_event)
418{
419 struct p4_event_bind *bind;
420 unsigned int esel;
421 u64 config;
422
423 config = p4_general_events[hw_event];
424 bind = p4_config_get_bind(config);
425 esel = P4_OPCODE_ESEL(bind->opcode);
426 config |= p4_config_pack_cccr(P4_CCCR_ESEL(esel));
427
428 return config;
429}
430
431static int p4_validate_raw_event(struct perf_event *event)
432{
433 unsigned int v;
434
435 /* user data may have an out-of-bounds event index */
436 v = p4_config_unpack_event(event->attr.config);
437 if (v >= ARRAY_SIZE(p4_event_bind_map)) {
438 pr_warning("P4 PMU: Unknown event code: %d\n", v);
439 return -EINVAL;
440 }
441
442 /*
443 * it may have some screwed PEBS bits
444 */
445 if (p4_config_pebs_has(event->attr.config, P4_PEBS_CONFIG_ENABLE)) {
446 pr_warning("P4 PMU: PEBS are not supported yet\n");
447 return -EINVAL;
448 }
449 v = p4_config_unpack_metric(event->attr.config);
450 if (v >= ARRAY_SIZE(p4_pebs_bind_map)) {
451 pr_warning("P4 PMU: Unknown metric code: %d\n", v);
452 return -EINVAL;
453 }
454
455 return 0;
456}
457
458static int p4_hw_config(struct perf_event *event)
459{
460 int cpu = get_cpu();
461 int rc = 0;
462 u32 escr, cccr;
463
464 /*
465 * the reason we grab the cpu this early is that if we get scheduled
466 * for the first time on the same cpu -- we will not need to swap thread-
467 * specific flags in the config (and will save some cpu cycles)
468 */
469
470 cccr = p4_default_cccr_conf(cpu);
471 escr = p4_default_escr_conf(cpu, event->attr.exclude_kernel,
472 event->attr.exclude_user);
473 event->hw.config = p4_config_pack_escr(escr) |
474 p4_config_pack_cccr(cccr);
475
476 if (p4_ht_active() && p4_ht_thread(cpu))
477 event->hw.config = p4_set_ht_bit(event->hw.config);
478
479 if (event->attr.type == PERF_TYPE_RAW) {
480
481 rc = p4_validate_raw_event(event);
482 if (rc)
483 goto out;
484
485 /*
486 * We don't control raw events, so it's up to the caller
487 * to pass sane values (and we don't count the thread number
488 * on an HT machine, but allow HT-compatible specifics to be
489 * passed on)
490 *
491 * Note that for RAW events we allow the user to use P4_CCCR_RESERVED
492 * bits since we keep additional info here (for cache events, etc.)
493 *
494 * XXX: HT wide things should check perf_paranoid_cpu() &&
495 * CAP_SYS_ADMIN
496 */
497 event->hw.config |= event->attr.config &
498 (p4_config_pack_escr(P4_ESCR_MASK_HT) |
499 p4_config_pack_cccr(P4_CCCR_MASK_HT | P4_CCCR_RESERVED));
500
501 event->hw.config &= ~P4_CCCR_FORCE_OVF;
502 }
503
504 rc = x86_setup_perfctr(event);
505out:
506 put_cpu();
507 return rc;
508}
509
510static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc)
511{
512 int overflow = 0;
513 u32 low, high;
514
515 rdmsr(hwc->config_base + hwc->idx, low, high);
516
517 /* we need to check high bit for unflagged overflows */
518 if ((low & P4_CCCR_OVF) || !(high & (1 << 31))) {
519 overflow = 1;
520 (void)checking_wrmsrl(hwc->config_base + hwc->idx,
521 ((u64)low) & ~P4_CCCR_OVF);
522 }
523
524 return overflow;
525}
526
527static void p4_pmu_disable_pebs(void)
528{
529 /*
530 * FIXME
531 *
532 * It's still allowed that two threads set up the same cache
533 * events, so we can't simply clear metrics until we know
534 * no one is depending on us; we need some kind of counter
535 * for "ReplayEvent" users.
536 *
537 * What is more complex -- RAW events: if a user (for some
538 * reason) passes a cache event metric with an improper
539 * event opcode -- it's fine from the hardware point of view
540 * but complete nonsense as far as the "meaning" of such an action goes.
541 *
542 * So for the moment let's leave metrics turned on forever -- it's
543 * OK for now but needs to be revisited!
544 *
545 * (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)0);
546 * (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)0);
547 */
548}
549
550static inline void p4_pmu_disable_event(struct perf_event *event)
551{
552 struct hw_perf_event *hwc = &event->hw;
553
554 /*
555 * If the event gets disabled while the counter is in an overflowed
556 * state we need to clear P4_CCCR_OVF, otherwise the interrupt gets
557 * asserted again and again
558 */
559 (void)checking_wrmsrl(hwc->config_base + hwc->idx,
560 (u64)(p4_config_unpack_cccr(hwc->config)) &
561 ~P4_CCCR_ENABLE & ~P4_CCCR_OVF & ~P4_CCCR_RESERVED);
562}
563
564static void p4_pmu_disable_all(void)
565{
566 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
567 int idx;
568
569 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
570 struct perf_event *event = cpuc->events[idx];
571 if (!test_bit(idx, cpuc->active_mask))
572 continue;
573 p4_pmu_disable_event(event);
574 }
575
576 p4_pmu_disable_pebs();
577}
578
579/* configuration must be valid */
580static void p4_pmu_enable_pebs(u64 config)
581{
582 struct p4_pebs_bind *bind;
583 unsigned int idx;
584
585 BUILD_BUG_ON(P4_PEBS_METRIC__max > P4_PEBS_CONFIG_METRIC_MASK);
586
587 idx = p4_config_unpack_metric(config);
588 if (idx == P4_PEBS_METRIC__none)
589 return;
590
591 bind = &p4_pebs_bind_map[idx];
592
593 (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)bind->metric_pebs);
594 (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)bind->metric_vert);
595}
596
597static void p4_pmu_enable_event(struct perf_event *event)
598{
599 struct hw_perf_event *hwc = &event->hw;
600 int thread = p4_ht_config_thread(hwc->config);
601 u64 escr_conf = p4_config_unpack_escr(p4_clear_ht_bit(hwc->config));
602 unsigned int idx = p4_config_unpack_event(hwc->config);
603 struct p4_event_bind *bind;
604 u64 escr_addr, cccr;
605
606 bind = &p4_event_bind_map[idx];
607 escr_addr = (u64)bind->escr_msr[thread];
608
609 /*
610 * - we don't support cascaded counters yet
611 * - and counter 1 is broken (erratum)
612 */
613 WARN_ON_ONCE(p4_is_event_cascaded(hwc->config));
614 WARN_ON_ONCE(hwc->idx == 1);
615
616 /* we need a real Event value */
617 escr_conf &= ~P4_ESCR_EVENT_MASK;
618 escr_conf |= P4_ESCR_EVENT(P4_OPCODE_EVNT(bind->opcode));
619
620 cccr = p4_config_unpack_cccr(hwc->config);
621
622 /*
623 * it could be a cache event, so we need to write the metrics
624 * into additional MSRs
625 */
626 p4_pmu_enable_pebs(hwc->config);
627
628 (void)checking_wrmsrl(escr_addr, escr_conf);
629 (void)checking_wrmsrl(hwc->config_base + hwc->idx,
630 (cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE);
631}
632
633static void p4_pmu_enable_all(int added)
634{
635 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
636 int idx;
637
638 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
639 struct perf_event *event = cpuc->events[idx];
640 if (!test_bit(idx, cpuc->active_mask))
641 continue;
642 p4_pmu_enable_event(event);
643 }
644}
645
646static int p4_pmu_handle_irq(struct pt_regs *regs)
647{
648 struct perf_sample_data data;
649 struct cpu_hw_events *cpuc;
650 struct perf_event *event;
651 struct hw_perf_event *hwc;
652 int idx, handled = 0;
653 u64 val;
654
655 data.addr = 0;
656 data.raw = NULL;
657
658 cpuc = &__get_cpu_var(cpu_hw_events);
659
660 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
661 int overflow;
662
663 if (!test_bit(idx, cpuc->active_mask)) {
664 /* catch in-flight IRQs */
665 if (__test_and_clear_bit(idx, cpuc->running))
666 handled++;
667 continue;
668 }
669
670 event = cpuc->events[idx];
671 hwc = &event->hw;
672
673 WARN_ON_ONCE(hwc->idx != idx);
674
675 /* it might be an unflagged overflow */
676 overflow = p4_pmu_clear_cccr_ovf(hwc);
677
678 val = x86_perf_event_update(event);
679 if (!overflow && (val & (1ULL << (x86_pmu.cntval_bits - 1))))
680 continue;
681
682 handled += overflow;
683
684 /* event overflow for sure */
685 data.period = event->hw.last_period;
686
687 if (!x86_perf_event_set_period(event))
688 continue;
689 if (perf_event_overflow(event, 1, &data, regs))
690 p4_pmu_disable_event(event);
691 }
692
693 if (handled) {
694 /* p4 quirk: unmask it again */
695 apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED);
696 inc_irq_stat(apic_perf_irqs);
697 }
698
699 return handled;
700}
701
702/*
703 * swap thread-specific fields according to the thread
704 * we are going to run on
705 */
706static void p4_pmu_swap_config_ts(struct hw_perf_event *hwc, int cpu)
707{
708 u32 escr, cccr;
709
710 /*
711 * we are either lucky and continue on the same cpu, or there is no HT support
712 */
713 if (!p4_should_swap_ts(hwc->config, cpu))
714 return;
715
716 /*
717 * the event has been migrated from another logical
718 * cpu, so we need to swap thread-specific flags
719 */
720
721 escr = p4_config_unpack_escr(hwc->config);
722 cccr = p4_config_unpack_cccr(hwc->config);
723
724 if (p4_ht_thread(cpu)) {
725 cccr &= ~P4_CCCR_OVF_PMI_T0;
726 cccr |= P4_CCCR_OVF_PMI_T1;
727 if (escr & P4_ESCR_T0_OS) {
728 escr &= ~P4_ESCR_T0_OS;
729 escr |= P4_ESCR_T1_OS;
730 }
731 if (escr & P4_ESCR_T0_USR) {
732 escr &= ~P4_ESCR_T0_USR;
733 escr |= P4_ESCR_T1_USR;
734 }
735 hwc->config = p4_config_pack_escr(escr);
736 hwc->config |= p4_config_pack_cccr(cccr);
737 hwc->config |= P4_CONFIG_HT;
738 } else {
739 cccr &= ~P4_CCCR_OVF_PMI_T1;
740 cccr |= P4_CCCR_OVF_PMI_T0;
741 if (escr & P4_ESCR_T1_OS) {
742 escr &= ~P4_ESCR_T1_OS;
743 escr |= P4_ESCR_T0_OS;
744 }
745 if (escr & P4_ESCR_T1_USR) {
746 escr &= ~P4_ESCR_T1_USR;
747 escr |= P4_ESCR_T0_USR;
748 }
749 hwc->config = p4_config_pack_escr(escr);
750 hwc->config |= p4_config_pack_cccr(cccr);
751 hwc->config &= ~P4_CONFIG_HT;
752 }
753}
754
755/*
756 * ESCR address hashing is tricky: the ESCR MSRs are not sequential
757 * in the address space but all start from MSR_P4_BSU_ESCR0 (0x03a0), and
758 * the low bytes of their addresses fall in the range [0xa0, 0xe1],
759 *
760 * so we end up with a hashtable that is ~70% filled
761 */
762
763#define P4_ESCR_MSR_BASE 0x000003a0
764#define P4_ESCR_MSR_MAX 0x000003e1
765#define P4_ESCR_MSR_TABLE_SIZE (P4_ESCR_MSR_MAX - P4_ESCR_MSR_BASE + 1)
766#define P4_ESCR_MSR_IDX(msr) (msr - P4_ESCR_MSR_BASE)
767#define P4_ESCR_MSR_TABLE_ENTRY(msr) [P4_ESCR_MSR_IDX(msr)] = msr
768
769static const unsigned int p4_escr_table[P4_ESCR_MSR_TABLE_SIZE] = {
770 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ALF_ESCR0),
771 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ALF_ESCR1),
772 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BPU_ESCR0),
773 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BPU_ESCR1),
774 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BSU_ESCR0),
775 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BSU_ESCR1),
776 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR0),
777 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR1),
778 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR2),
779 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR3),
780 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR4),
781 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR5),
782 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_DAC_ESCR0),
783 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_DAC_ESCR1),
784 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FIRM_ESCR0),
785 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FIRM_ESCR1),
786 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FLAME_ESCR0),
787 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FLAME_ESCR1),
788 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FSB_ESCR0),
789 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FSB_ESCR1),
790 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IQ_ESCR0),
791 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IQ_ESCR1),
792 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IS_ESCR0),
793 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IS_ESCR1),
794 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ITLB_ESCR0),
795 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ITLB_ESCR1),
796 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IX_ESCR0),
797 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IX_ESCR1),
798 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MOB_ESCR0),
799 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MOB_ESCR1),
800 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MS_ESCR0),
801 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MS_ESCR1),
802 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_PMH_ESCR0),
803 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_PMH_ESCR1),
804 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_RAT_ESCR0),
805 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_RAT_ESCR1),
806 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SAAT_ESCR0),
807 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SAAT_ESCR1),
808 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SSU_ESCR0),
809 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SSU_ESCR1),
810 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TBPU_ESCR0),
811 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TBPU_ESCR1),
812 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TC_ESCR0),
813 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TC_ESCR1),
814 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_U2L_ESCR0),
815 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_U2L_ESCR1),
816};
817
818static int p4_get_escr_idx(unsigned int addr)
819{
820 unsigned int idx = P4_ESCR_MSR_IDX(addr);
821
822 if (unlikely(idx >= P4_ESCR_MSR_TABLE_SIZE ||
823 !p4_escr_table[idx] ||
824 p4_escr_table[idx] != addr)) {
825 WARN_ONCE(1, "P4 PMU: Wrong address passed: %x\n", addr);
826 return -1;
827 }
828
829 return idx;
830}
831
832static int p4_next_cntr(int thread, unsigned long *used_mask,
833 struct p4_event_bind *bind)
834{
835 int i, j;
836
837 for (i = 0; i < P4_CNTR_LIMIT; i++) {
838 j = bind->cntr[thread][i];
839 if (j != -1 && !test_bit(j, used_mask))
840 return j;
841 }
842
843 return -1;
844}
845
846static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
847{
848 unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
849 unsigned long escr_mask[BITS_TO_LONGS(P4_ESCR_MSR_TABLE_SIZE)];
850 int cpu = smp_processor_id();
851 struct hw_perf_event *hwc;
852 struct p4_event_bind *bind;
853 unsigned int i, thread, num;
854 int cntr_idx, escr_idx;
855
856 bitmap_zero(used_mask, X86_PMC_IDX_MAX);
857 bitmap_zero(escr_mask, P4_ESCR_MSR_TABLE_SIZE);
858
859 for (i = 0, num = n; i < n; i++, num--) {
860
861 hwc = &cpuc->event_list[i]->hw;
862 thread = p4_ht_thread(cpu);
863 bind = p4_config_get_bind(hwc->config);
864 escr_idx = p4_get_escr_idx(bind->escr_msr[thread]);
865 if (unlikely(escr_idx == -1))
866 goto done;
867
868 if (hwc->idx != -1 && !p4_should_swap_ts(hwc->config, cpu)) {
869 cntr_idx = hwc->idx;
870 if (assign)
871 assign[i] = hwc->idx;
872 goto reserve;
873 }
874
875 cntr_idx = p4_next_cntr(thread, used_mask, bind);
876 if (cntr_idx == -1 || test_bit(escr_idx, escr_mask))
877 goto done;
878
879 p4_pmu_swap_config_ts(hwc, cpu);
880 if (assign)
881 assign[i] = cntr_idx;
882reserve:
883 set_bit(cntr_idx, used_mask);
884 set_bit(escr_idx, escr_mask);
885 }
886
887done:
888 return num ? -ENOSPC : 0;
889}
890
891static __initconst const struct x86_pmu p4_pmu = {
892 .name = "Netburst P4/Xeon",
893 .handle_irq = p4_pmu_handle_irq,
894 .disable_all = p4_pmu_disable_all,
895 .enable_all = p4_pmu_enable_all,
896 .enable = p4_pmu_enable_event,
897 .disable = p4_pmu_disable_event,
898 .eventsel = MSR_P4_BPU_CCCR0,
899 .perfctr = MSR_P4_BPU_PERFCTR0,
900 .event_map = p4_pmu_event_map,
901 .max_events = ARRAY_SIZE(p4_general_events),
902 .get_event_constraints = x86_get_event_constraints,
903 /*
904 * If HT is disabled we may need to use all
905 * ARCH_P4_MAX_CCCR counters simultaneously,
906 * though leave it restricted for the moment
907 * assuming HT is on
908 */
909 .num_counters = ARCH_P4_MAX_CCCR,
910 .apic = 1,
911 .cntval_bits = 40,
912 .cntval_mask = (1ULL << 40) - 1,
913 .max_period = (1ULL << 39) - 1,
914 .hw_config = p4_hw_config,
915 .schedule_events = p4_pmu_schedule_events,
916 /*
917 * This handles erratum N15 in Intel doc 249199-029:
918 * the counter may not be updated correctly on write,
919 * so we need a second write operation to do the trick
920 * (the official workaround didn't work)
921 *
922 * this idea is taken from the OProfile code
923 */
924 .perfctr_second_write = 1,
925};
926
927static __init int p4_pmu_init(void)
928{
929 unsigned int low, high;
930
931 /* If we get stripped -- indexing fails */
932 BUILD_BUG_ON(ARCH_P4_MAX_CCCR > X86_PMC_MAX_GENERIC);
933
934 rdmsr(MSR_IA32_MISC_ENABLE, low, high);
935 if (!(low & (1 << 7))) {
936 pr_cont("unsupported Netburst CPU model %d ",
937 boot_cpu_data.x86_model);
938 return -ENODEV;
939 }
940
941 memcpy(hw_cache_event_ids, p4_hw_cache_event_ids,
942 sizeof(hw_cache_event_ids));
943
944 pr_cont("Netburst events, ");
945
946 x86_pmu = p4_pmu;
947
948 return 0;
949}
950
951#endif /* CONFIG_CPU_SUP_INTEL */
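The P4 code above keeps a single u64 config per event and repeatedly packs and unpacks two register images out of it with p4_config_pack_escr()/p4_config_pack_cccr() and their unpack counterparts, which live in the perf_event_p4.h header rather than in this file. As a rough, stand-alone illustration of that packing idea only -- the real bit layout is whatever perf_event_p4.h defines -- a user-space sketch might look like this, assuming for the example that the ESCR image sits in the upper 32 bits and the CCCR image in the lower 32 bits:

/* Illustrative only -- mirrors the pack/unpack helpers used above,
 * under an assumed layout: ESCR image in bits 63..32, CCCR image
 * in bits 31..0 (the real layout is defined in perf_event_p4.h). */
#include <stdint.h>
#include <stdio.h>

static inline uint64_t demo_pack_escr(uint32_t escr)   { return (uint64_t)escr << 32; }
static inline uint64_t demo_pack_cccr(uint32_t cccr)   { return (uint64_t)cccr; }
static inline uint32_t demo_unpack_escr(uint64_t cfg)  { return (uint32_t)(cfg >> 32); }
static inline uint32_t demo_unpack_cccr(uint64_t cfg)  { return (uint32_t)cfg; }

int main(void)
{
	/* hypothetical register images */
	uint32_t escr = 0x0000ff00;	/* event mask / OS / USR bits */
	uint32_t cccr = 0x00039000;	/* ESEL, enable, compare bits  */
	uint64_t config = demo_pack_escr(escr) | demo_pack_cccr(cccr);

	printf("config=%#llx escr=%#x cccr=%#x\n",
	       (unsigned long long)config,
	       (unsigned int)demo_unpack_escr(config),
	       (unsigned int)demo_unpack_cccr(config));
	return 0;
}

The HT flag and the PEBS metric index used elsewhere in the file ride along in that same u64 in the same spirit.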
diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c
index a330485d14da..34ba07be2cda 100644
--- a/arch/x86/kernel/cpu/perf_event_p6.c
+++ b/arch/x86/kernel/cpu/perf_event_p6.c
@@ -27,24 +27,6 @@ static u64 p6_pmu_event_map(int hw_event)
  */
 #define P6_NOP_EVENT 0x0000002EULL
 
-static u64 p6_pmu_raw_event(u64 hw_event)
-{
-#define P6_EVNTSEL_EVENT_MASK 0x000000FFULL
-#define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL
-#define P6_EVNTSEL_EDGE_MASK 0x00040000ULL
-#define P6_EVNTSEL_INV_MASK 0x00800000ULL
-#define P6_EVNTSEL_REG_MASK 0xFF000000ULL
-
-#define P6_EVNTSEL_MASK \
- (P6_EVNTSEL_EVENT_MASK | \
- P6_EVNTSEL_UNIT_MASK | \
- P6_EVNTSEL_EDGE_MASK | \
- P6_EVNTSEL_INV_MASK | \
- P6_EVNTSEL_REG_MASK)
-
- return hw_event & P6_EVNTSEL_MASK;
-}
-
 static struct event_constraint p6_event_constraints[] =
 {
  INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */
@@ -66,7 +48,7 @@ static void p6_pmu_disable_all(void)
  wrmsrl(MSR_P6_EVNTSEL0, val);
 }
 
-static void p6_pmu_enable_all(void)
+static void p6_pmu_enable_all(int added)
 {
  unsigned long val;
 
@@ -102,22 +84,23 @@ static void p6_pmu_enable_event(struct perf_event *event)
  (void)checking_wrmsrl(hwc->config_base + hwc->idx, val);
 }
 
-static __initconst struct x86_pmu p6_pmu = {
+static __initconst const struct x86_pmu p6_pmu = {
  .name = "p6",
  .handle_irq = x86_pmu_handle_irq,
  .disable_all = p6_pmu_disable_all,
  .enable_all = p6_pmu_enable_all,
  .enable = p6_pmu_enable_event,
  .disable = p6_pmu_disable_event,
+ .hw_config = x86_pmu_hw_config,
+ .schedule_events = x86_schedule_events,
  .eventsel = MSR_P6_EVNTSEL0,
  .perfctr = MSR_P6_PERFCTR0,
  .event_map = p6_pmu_event_map,
- .raw_event = p6_pmu_raw_event,
  .max_events = ARRAY_SIZE(p6_perfmon_event_map),
  .apic = 1,
  .max_period = (1ULL << 31) - 1,
  .version = 0,
- .num_events = 2,
+ .num_counters = 2,
  /*
  * Events have 40 bits implemented. However they are designed such
  * that bits [32-39] are sign extensions of bit 31. As such the
@@ -125,8 +108,8 @@ static __initconst struct x86_pmu p6_pmu = {
  *
  * See IA-32 Intel Architecture Software developer manual Vol 3B
  */
- .event_bits = 32,
- .event_mask = (1ULL << 32) - 1,
+ .cntval_bits = 32,
+ .cntval_mask = (1ULL << 32) - 1,
  .get_event_constraints = x86_get_event_constraints,
  .event_constraints = p6_event_constraints,
 };
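The renamed .cntval_bits/.cntval_mask fields (formerly .event_bits/.event_mask) describe how many counter bits are actually implemented; the comment above notes that P6 implements 40 bits but bits 32-39 only sign-extend bit 31, and .cntval_bits is set to the effective width of 32. The generic perf code copes with such narrow counters by shifting samples up to bit 63 before subtracting, so wrap-around and the sign-extended bits cancel out. The snippet below is a minimal stand-alone sketch of that width-handling trick, not code from this patch:

/* Minimal sketch of delta computation for a counter that only
 * implements `width` low bits (not taken from this patch). */
#include <stdint.h>
#include <stdio.h>

static uint64_t counter_delta(uint64_t prev, uint64_t now, int width)
{
	int shift = 64 - width;
	/* shift both samples up so that wrap-around and any
	 * sign-extended junk above `width` bits cancel out */
	int64_t delta = (int64_t)(now << shift) - (int64_t)(prev << shift);
	return (uint64_t)(delta >> shift);
}

int main(void)
{
	/* 32-bit counter wrapped from 0xfffffff0 to 0x00000010 */
	printf("delta=%llu\n",
	       (unsigned long long)counter_delta(0xfffffff0ULL, 0x10ULL, 32));
	return 0;
}

With width 32, a counter that wrapped from 0xfffffff0 to 0x10 yields a delta of 32, as expected.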
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
new file mode 100644
index 000000000000..d49079515122
--- /dev/null
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -0,0 +1,64 @@
1/*
2 * Routines to identify additional cpu features that are scattered in
3 * cpuid space.
4 */
5#include <linux/cpu.h>
6
7#include <asm/pat.h>
8#include <asm/processor.h>
9
10#include <asm/apic.h>
11
12struct cpuid_bit {
13 u16 feature;
14 u8 reg;
15 u8 bit;
16 u32 level;
17 u32 sub_leaf;
18};
19
20enum cpuid_regs {
21 CR_EAX = 0,
22 CR_ECX,
23 CR_EDX,
24 CR_EBX
25};
26
27void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
28{
29 u32 max_level;
30 u32 regs[4];
31 const struct cpuid_bit *cb;
32
33 static const struct cpuid_bit __cpuinitconst cpuid_bits[] = {
34 { X86_FEATURE_DTS, CR_EAX, 0, 0x00000006, 0 },
35 { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006, 0 },
36 { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006, 0 },
37 { X86_FEATURE_PLN, CR_EAX, 4, 0x00000006, 0 },
38 { X86_FEATURE_PTS, CR_EAX, 6, 0x00000006, 0 },
39 { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 },
40 { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 },
41 { X86_FEATURE_XSAVEOPT, CR_EAX, 0, 0x0000000d, 1 },
42 { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 },
43 { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a, 0 },
44 { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 },
45 { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 },
46 { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a, 0 },
47 { 0, 0, 0, 0, 0 }
48 };
49
50 for (cb = cpuid_bits; cb->feature; cb++) {
51
52 /* Verify that the level is valid */
53 max_level = cpuid_eax(cb->level & 0xffff0000);
54 if (max_level < cb->level ||
55 max_level > (cb->level | 0xffff))
56 continue;
57
58 cpuid_count(cb->level, cb->sub_leaf, &regs[CR_EAX],
59 &regs[CR_EBX], &regs[CR_ECX], &regs[CR_EDX]);
60
61 if (regs[cb->reg] & (1 << cb->bit))
62 set_cpu_cap(c, cb->feature);
63 }
64}
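The new scattered.c replaces open-coded feature probing with a table walk: each cpuid_bit entry names the feature flag, the CPUID level and sub-leaf to query, and the register/bit to test, and a level is only trusted when it lies within the maximum leaf the CPU reports for that range. The user-space sketch below applies the same validate-then-test pattern to one entry from the table above (APERFMPERF, leaf 0x6, ECX bit 0); it relies on GCC's <cpuid.h> helpers, which is an assumption about the toolchain rather than anything this patch uses:

/* User-space sketch of the validate-then-test pattern used by
 * init_scattered_cpuid_features(); build with gcc on x86. */
#include <cpuid.h>
#include <stdio.h>

int main(void)
{
	unsigned int level = 0x00000006;	/* thermal/power leaf */
	unsigned int eax, ebx, ecx, edx, max_level;

	/* verify the leaf is within the range the CPU reports */
	max_level = __get_cpuid_max(level & 0xffff0000, NULL);
	if (max_level < level || max_level > (level | 0xffff))
		return 1;

	__cpuid_count(level, 0, eax, ebx, ecx, edx);

	/* APERFMPERF is advertised in ECX bit 0 of leaf 6 */
	printf("APERFMPERF: %s\n", (ecx & 1) ? "yes" : "no");
	return 0;
}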
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/topology.c
index 97ad79cdf688..4397e987a1cf 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/topology.c
@@ -1,60 +1,14 @@
 /*
- * Routines to indentify additional cpu features that are scattered in
- * cpuid space.
+ * Check for extended topology enumeration cpuid leaf 0xb and if it
+ * exists, use it for populating initial_apicid and cpu topology
+ * detection.
  */
-#include <linux/cpu.h>
 
+#include <linux/cpu.h>
+#include <asm/apic.h>
 #include <asm/pat.h>
 #include <asm/processor.h>
 
-#include <asm/apic.h>
-
-struct cpuid_bit {
- u16 feature;
- u8 reg;
- u8 bit;
- u32 level;
-};
-
-enum cpuid_regs {
- CR_EAX = 0,
- CR_ECX,
- CR_EDX,
- CR_EBX
-};
-
-void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
-{
- u32 max_level;
- u32 regs[4];
- const struct cpuid_bit *cb;
-
- static const struct cpuid_bit __cpuinitconst cpuid_bits[] = {
- { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 },
- { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006 },
- { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a },
- { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a },
- { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a },
- { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a },
- { 0, 0, 0, 0 }
- };
-
- for (cb = cpuid_bits; cb->feature; cb++) {
-
- /* Verify that the level is valid */
- max_level = cpuid_eax(cb->level & 0xffff0000);
- if (max_level < cb->level ||
- max_level > (cb->level | 0xffff))
- continue;
-
- cpuid(cb->level, &regs[CR_EAX], &regs[CR_EBX],
- &regs[CR_ECX], &regs[CR_EDX]);
-
- if (regs[cb->reg] & (1 << cb->bit))
- set_cpu_cap(c, cb->feature);
- }
-}
-
 /* leaf 0xb SMT level */
 #define SMT_LEVEL 0
 
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index dfdb4dba2320..227b0448960d 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -24,8 +24,8 @@
 #include <linux/dmi.h>
 #include <linux/module.h>
 #include <asm/div64.h>
-#include <asm/vmware.h>
 #include <asm/x86_init.h>
+#include <asm/hypervisor.h>
 
 #define CPUID_VMWARE_INFO_LEAF 0x40000000
 #define VMWARE_HYPERVISOR_MAGIC 0x564D5868
@@ -51,7 +51,7 @@ static inline int __vmware_platform(void)
 
 static unsigned long vmware_get_tsc_khz(void)
 {
- uint64_t tsc_hz;
+ uint64_t tsc_hz, lpj;
  uint32_t eax, ebx, ecx, edx;
 
  VMWARE_PORT(GETHZ, eax, ebx, ecx, edx);
@@ -62,10 +62,17 @@ static unsigned long vmware_get_tsc_khz(void)
  printk(KERN_INFO "TSC freq read from hypervisor : %lu.%03lu MHz\n",
  (unsigned long) tsc_hz / 1000,
  (unsigned long) tsc_hz % 1000);
+
+ if (!preset_lpj) {
+ lpj = ((u64)tsc_hz * 1000);
+ do_div(lpj, HZ);
+ preset_lpj = lpj;
+ }
+
  return tsc_hz;
 }
 
-void __init vmware_platform_setup(void)
+static void __init vmware_platform_setup(void)
 {
  uint32_t eax, ebx, ecx, edx;
 
@@ -83,26 +90,22 @@ void __init vmware_platform_setup(void)
  * serial key should be enough, as this will always have a VMware
  * specific string when running under VMware hypervisor.
  */
-int vmware_platform(void)
+static bool __init vmware_platform(void)
 {
  if (cpu_has_hypervisor) {
- unsigned int eax, ebx, ecx, edx;
- char hyper_vendor_id[13];
+ unsigned int eax;
+ unsigned int hyper_vendor_id[3];
 
- cpuid(CPUID_VMWARE_INFO_LEAF, &eax, &ebx, &ecx, &edx);
- memcpy(hyper_vendor_id + 0, &ebx, 4);
- memcpy(hyper_vendor_id + 4, &ecx, 4);
- memcpy(hyper_vendor_id + 8, &edx, 4);
- hyper_vendor_id[12] = '\0';
- if (!strcmp(hyper_vendor_id, "VMwareVMware"))
- return 1;
+ cpuid(CPUID_VMWARE_INFO_LEAF, &eax, &hyper_vendor_id[0],
+ &hyper_vendor_id[1], &hyper_vendor_id[2]);
+ if (!memcmp(hyper_vendor_id, "VMwareVMware", 12))
+ return true;
  } else if (dmi_available && dmi_name_in_serial("VMware") &&
  __vmware_platform())
- return 1;
+ return true;
 
- return 0;
+ return false;
 }
-EXPORT_SYMBOL(vmware_platform);
 
 /*
  * VMware hypervisor takes care of exporting a reliable TSC to the guest.
@@ -116,8 +119,16 @@ EXPORT_SYMBOL(vmware_platform);
  * so that the kernel could just trust the hypervisor with providing a
  * reliable virtual TSC that is suitable for timekeeping.
  */
-void __cpuinit vmware_set_feature_bits(struct cpuinfo_x86 *c)
+static void __cpuinit vmware_set_cpu_features(struct cpuinfo_x86 *c)
 {
  set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
  set_cpu_cap(c, X86_FEATURE_TSC_RELIABLE);
 }
+
+const __refconst struct hypervisor_x86 x86_hyper_vmware = {
+ .name = "VMware",
+ .detect = vmware_platform,
+ .set_cpu_features = vmware_set_cpu_features,
+ .init_platform = vmware_platform_setup,
+};
+EXPORT_SYMBOL(x86_hyper_vmware);
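The reworked vmware_platform() above reads the hypervisor CPUID leaf 0x40000000 straight into a three-word array and memcmp()s the twelve vendor-id bytes from EBX/ECX/EDX against "VMwareVMware", instead of assembling a NUL-terminated string. The sketch below does the same check from user space; it uses GCC's <cpuid.h> __cpuid macro (an assumption about the toolchain, not part of the patch) and first gates on the hypervisor-present bit, much as cpu_has_hypervisor does in the kernel:

/* User-space sketch of the vendor-id comparison done by vmware_platform(). */
#include <cpuid.h>
#include <stdio.h>
#include <string.h>

#define CPUID_VMWARE_INFO_LEAF 0x40000000

int main(void)
{
	unsigned int eax, ebx, ecx, edx;
	unsigned int vendor[3];

	/* hypervisor-present bit: CPUID.1 ECX[31] */
	__cpuid(1, eax, ebx, ecx, edx);
	if (!(ecx & (1u << 31))) {
		puts("no hypervisor");
		return 0;
	}

	/* vendor id comes back in EBX, ECX, EDX of the 0x40000000 leaf */
	__cpuid(CPUID_VMWARE_INFO_LEAF, eax, vendor[0], vendor[1], vendor[2]);
	puts(!memcmp(vendor, "VMwareVMware", 12) ? "VMware" : "other hypervisor");
	return 0;
}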