Diffstat (limited to 'arch/x86/kernel/cpu')
-rw-r--r--  arch/x86/kernel/cpu/Makefile | 6
-rw-r--r--  arch/x86/kernel/cpu/amd.c | 77
-rw-r--r--  arch/x86/kernel/cpu/bugs.c | 2
-rw-r--r--  arch/x86/kernel/cpu/cmpxchg.c | 72
-rw-r--r--  arch/x86/kernel/cpu/common.c | 70
-rw-r--r--  arch/x86/kernel/cpu/cpu.h | 1
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/Kconfig | 14
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/Makefile | 5
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 54
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/elanfreq.c | 1
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/gx-suspmod.c | 12
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/longhaul.c | 6
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/longhaul.h | 26
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/longrun.c | 7
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/mperf.c | 51
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/mperf.h | 9
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/p4-clockmod.c | 8
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c | 626
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/powernow-k6.c | 1
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/powernow-k7.c | 8
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 200
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/powernow-k8.h | 2
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c | 1
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/speedstep-ich.c | 1
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/speedstep-lib.c | 1
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/speedstep-smi.c | 1
-rw-r--r--  arch/x86/kernel/cpu/hypervisor.c | 55
-rw-r--r--  arch/x86/kernel/cpu/intel.c | 33
-rw-r--r--  arch/x86/kernel/cpu/intel_cacheinfo.c | 477
-rw-r--r--  arch/x86/kernel/cpu/mcheck/Makefile | 2
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-apei.c | 138
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-inject.c | 1
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-internal.h | 23
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.c | 141
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_amd.c | 16
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_intel.c | 14
-rw-r--r--  arch/x86/kernel/cpu/mcheck/therm_throt.c | 216
-rw-r--r--  arch/x86/kernel/cpu/mshyperv.c | 56
-rw-r--r--  arch/x86/kernel/cpu/mtrr/Makefile | 2
-rw-r--r--  arch/x86/kernel/cpu/mtrr/amd.c | 2
-rw-r--r--  arch/x86/kernel/cpu/mtrr/centaur.c | 2
-rw-r--r--  arch/x86/kernel/cpu/mtrr/cleanup.c | 214
-rw-r--r--  arch/x86/kernel/cpu/mtrr/cyrix.c | 2
-rw-r--r--  arch/x86/kernel/cpu/mtrr/generic.c | 14
-rw-r--r--  arch/x86/kernel/cpu/mtrr/if.c | 1
-rw-r--r--  arch/x86/kernel/cpu/mtrr/main.c | 63
-rw-r--r--  arch/x86/kernel/cpu/mtrr/mtrr.h | 6
-rw-r--r--  arch/x86/kernel/cpu/mtrr/state.c | 94
-rw-r--r--  arch/x86/kernel/cpu/perf_event.c | 2585
-rw-r--r--  arch/x86/kernel/cpu/perf_event_amd.c | 420
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel.c | 1056
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel_ds.c | 641
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel_lbr.c | 218
-rw-r--r--  arch/x86/kernel/cpu/perf_event_p4.c | 951
-rw-r--r--  arch/x86/kernel/cpu/perf_event_p6.c | 142
-rw-r--r--  arch/x86/kernel/cpu/perfctr-watchdog.c | 13
-rw-r--r--  arch/x86/kernel/cpu/scattered.c | 64
-rw-r--r--  arch/x86/kernel/cpu/topology.c (renamed from arch/x86/kernel/cpu/addon_cpuid_features.c) | 52
-rw-r--r--  arch/x86/kernel/cpu/vmware.c | 47
59 files changed, 6527 insertions, 2496 deletions
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index c202b62f3671..3f0ebe429a01 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -12,11 +12,11 @@ endif
12nostackp := $(call cc-option, -fno-stack-protector) 12nostackp := $(call cc-option, -fno-stack-protector)
13CFLAGS_common.o := $(nostackp) 13CFLAGS_common.o := $(nostackp)
14 14
15obj-y := intel_cacheinfo.o addon_cpuid_features.o 15obj-y := intel_cacheinfo.o scattered.o topology.o
16obj-y += proc.o capflags.o powerflags.o common.o 16obj-y += proc.o capflags.o powerflags.o common.o
17obj-y += vmware.o hypervisor.o sched.o 17obj-y += vmware.o hypervisor.o sched.o mshyperv.o
18 18
19obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o 19obj-$(CONFIG_X86_32) += bugs.o
20obj-$(CONFIG_X86_64) += bugs_64.o 20obj-$(CONFIG_X86_64) += bugs_64.o
21 21
22obj-$(CONFIG_CPU_SUP_INTEL) += intel.o 22obj-$(CONFIG_CPU_SUP_INTEL) += intel.o
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index e485825130d2..ba5f62f45f01 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -466,7 +466,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
466 } 466 }
467 467
468 } 468 }
469 if (c->x86 == 0x10 || c->x86 == 0x11) 469 if (c->x86 >= 0x10)
470 set_cpu_cap(c, X86_FEATURE_REP_GOOD); 470 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
471 471
472 /* get apicid instead of initial apic id from cpuid */ 472 /* get apicid instead of initial apic id from cpuid */
@@ -529,7 +529,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
529 num_cache_leaves = 3; 529 num_cache_leaves = 3;
530 } 530 }
531 531
532 if (c->x86 >= 0xf && c->x86 <= 0x11) 532 if (c->x86 >= 0xf)
533 set_cpu_cap(c, X86_FEATURE_K8); 533 set_cpu_cap(c, X86_FEATURE_K8);
534 534
535 if (cpu_has_xmm2) { 535 if (cpu_has_xmm2) {
@@ -546,7 +546,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
546 fam10h_check_enable_mmcfg(); 546 fam10h_check_enable_mmcfg();
547 } 547 }
548 548
549 if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) { 549 if (c == &boot_cpu_data && c->x86 >= 0xf) {
550 unsigned long long tseg; 550 unsigned long long tseg;
551 551
552 /* 552 /*
@@ -609,3 +609,74 @@ static const struct cpu_dev __cpuinitconst amd_cpu_dev = {
609}; 609};
610 610
611cpu_dev_register(amd_cpu_dev); 611cpu_dev_register(amd_cpu_dev);
612
613/*
614 * AMD errata checking
615 *
616 * Errata are defined as arrays of ints using the AMD_LEGACY_ERRATUM() or
617 * AMD_OSVW_ERRATUM() macros. The latter is intended for newer errata that
618 * have an OSVW id assigned, which it takes as first argument. Both take a
619 * variable number of family-specific model-stepping ranges created by
620 * AMD_MODEL_RANGE(). Each erratum also has to be declared as extern const
621 * int[] in arch/x86/include/asm/processor.h.
622 *
623 * Example:
624 *
625 * const int amd_erratum_319[] =
626 * AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0x4, 0x2),
627 * AMD_MODEL_RANGE(0x10, 0x8, 0x0, 0x8, 0x0),
628 * AMD_MODEL_RANGE(0x10, 0x9, 0x0, 0x9, 0x0));
629 */
630
631const int amd_erratum_400[] =
632 AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0xf, 0x41, 0x2, 0xff, 0xf),
633 AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf));
634EXPORT_SYMBOL_GPL(amd_erratum_400);
635
636const int amd_erratum_383[] =
637 AMD_OSVW_ERRATUM(3, AMD_MODEL_RANGE(0x10, 0, 0, 0xff, 0xf));
638EXPORT_SYMBOL_GPL(amd_erratum_383);
639
640bool cpu_has_amd_erratum(const int *erratum)
641{
642 struct cpuinfo_x86 *cpu = &current_cpu_data;
643 int osvw_id = *erratum++;
644 u32 range;
645 u32 ms;
646
647 /*
648 * If called early enough that current_cpu_data hasn't been initialized
649 * yet, fall back to boot_cpu_data.
650 */
651 if (cpu->x86 == 0)
652 cpu = &boot_cpu_data;
653
654 if (cpu->x86_vendor != X86_VENDOR_AMD)
655 return false;
656
657 if (osvw_id >= 0 && osvw_id < 65536 &&
658 cpu_has(cpu, X86_FEATURE_OSVW)) {
659 u64 osvw_len;
660
661 rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, osvw_len);
662 if (osvw_id < osvw_len) {
663 u64 osvw_bits;
664
665 rdmsrl(MSR_AMD64_OSVW_STATUS + (osvw_id >> 6),
666 osvw_bits);
667 return osvw_bits & (1ULL << (osvw_id & 0x3f));
668 }
669 }
670
671 /* OSVW unavailable or ID unknown, match family-model-stepping range */
672 ms = (cpu->x86_model << 4) | cpu->x86_mask;
673 while ((range = *erratum++))
674 if ((cpu->x86 == AMD_MODEL_RANGE_FAMILY(range)) &&
675 (ms >= AMD_MODEL_RANGE_START(range)) &&
676 (ms <= AMD_MODEL_RANGE_END(range)))
677 return true;
678
679 return false;
680}
681
682EXPORT_SYMBOL_GPL(cpu_has_amd_erratum);
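Note (not part of the patch): a minimal sketch of how a caller is expected to consult the erratum descriptors added above. The check is done at runtime, so one kernel image can carry workarounds that only fire on affected parts; the function name and the workaround body below are hypothetical placeholders.

	/* Hypothetical caller, using only symbols introduced by the hunk above. */
	static void apply_erratum_400_workaround(void)
	{
		if (!cpu_has_amd_erratum(amd_erratum_400))
			return;		/* CPU not affected, nothing to do */

		/* ... erratum 400 (C1E idle) workaround would go here ... */
	}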
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 01a265212395..c39576cb3018 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -86,7 +86,7 @@ static void __init check_fpu(void)
86 86
87static void __init check_hlt(void) 87static void __init check_hlt(void)
88{ 88{
89 if (paravirt_enabled()) 89 if (boot_cpu_data.x86 >= 5 || paravirt_enabled())
90 return; 90 return;
91 91
92 printk(KERN_INFO "Checking 'hlt' instruction... "); 92 printk(KERN_INFO "Checking 'hlt' instruction... ");
diff --git a/arch/x86/kernel/cpu/cmpxchg.c b/arch/x86/kernel/cpu/cmpxchg.c
deleted file mode 100644
index 2056ccf572cc..000000000000
--- a/arch/x86/kernel/cpu/cmpxchg.c
+++ /dev/null
@@ -1,72 +0,0 @@
1/*
2 * cmpxchg*() fallbacks for CPU not supporting these instructions
3 */
4
5#include <linux/kernel.h>
6#include <linux/smp.h>
7#include <linux/module.h>
8
9#ifndef CONFIG_X86_CMPXCHG
10unsigned long cmpxchg_386_u8(volatile void *ptr, u8 old, u8 new)
11{
12 u8 prev;
13 unsigned long flags;
14
15 /* Poor man's cmpxchg for 386. Unsuitable for SMP */
16 local_irq_save(flags);
17 prev = *(u8 *)ptr;
18 if (prev == old)
19 *(u8 *)ptr = new;
20 local_irq_restore(flags);
21 return prev;
22}
23EXPORT_SYMBOL(cmpxchg_386_u8);
24
25unsigned long cmpxchg_386_u16(volatile void *ptr, u16 old, u16 new)
26{
27 u16 prev;
28 unsigned long flags;
29
30 /* Poor man's cmpxchg for 386. Unsuitable for SMP */
31 local_irq_save(flags);
32 prev = *(u16 *)ptr;
33 if (prev == old)
34 *(u16 *)ptr = new;
35 local_irq_restore(flags);
36 return prev;
37}
38EXPORT_SYMBOL(cmpxchg_386_u16);
39
40unsigned long cmpxchg_386_u32(volatile void *ptr, u32 old, u32 new)
41{
42 u32 prev;
43 unsigned long flags;
44
45 /* Poor man's cmpxchg for 386. Unsuitable for SMP */
46 local_irq_save(flags);
47 prev = *(u32 *)ptr;
48 if (prev == old)
49 *(u32 *)ptr = new;
50 local_irq_restore(flags);
51 return prev;
52}
53EXPORT_SYMBOL(cmpxchg_386_u32);
54#endif
55
56#ifndef CONFIG_X86_CMPXCHG64
57unsigned long long cmpxchg_486_u64(volatile void *ptr, u64 old, u64 new)
58{
59 u64 prev;
60 unsigned long flags;
61
62 /* Poor man's cmpxchg8b for 386 and 486. Unsuitable for SMP */
63 local_irq_save(flags);
64 prev = *(u64 *)ptr;
65 if (prev == old)
66 *(u64 *)ptr = new;
67 local_irq_restore(flags);
68 return prev;
69}
70EXPORT_SYMBOL(cmpxchg_486_u64);
71#endif
72
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 4868e4a951ee..f2f9ac7da25c 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -140,10 +140,18 @@ EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
140static int __init x86_xsave_setup(char *s) 140static int __init x86_xsave_setup(char *s)
141{ 141{
142 setup_clear_cpu_cap(X86_FEATURE_XSAVE); 142 setup_clear_cpu_cap(X86_FEATURE_XSAVE);
143 setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
143 return 1; 144 return 1;
144} 145}
145__setup("noxsave", x86_xsave_setup); 146__setup("noxsave", x86_xsave_setup);
146 147
148static int __init x86_xsaveopt_setup(char *s)
149{
150 setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
151 return 1;
152}
153__setup("noxsaveopt", x86_xsaveopt_setup);
154
147#ifdef CONFIG_X86_32 155#ifdef CONFIG_X86_32
148static int cachesize_override __cpuinitdata = -1; 156static int cachesize_override __cpuinitdata = -1;
149static int disable_x86_serial_nr __cpuinitdata = 1; 157static int disable_x86_serial_nr __cpuinitdata = 1;
@@ -537,7 +545,7 @@ void __cpuinit cpu_detect(struct cpuinfo_x86 *c)
537 } 545 }
538} 546}
539 547
540static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c) 548void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
541{ 549{
542 u32 tfms, xlvl; 550 u32 tfms, xlvl;
543 u32 ebx; 551 u32 ebx;
@@ -551,6 +559,16 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
551 c->x86_capability[4] = excap; 559 c->x86_capability[4] = excap;
552 } 560 }
553 561
562 /* Additional Intel-defined flags: level 0x00000007 */
563 if (c->cpuid_level >= 0x00000007) {
564 u32 eax, ebx, ecx, edx;
565
566 cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx);
567
568 if (eax > 0)
569 c->x86_capability[9] = ebx;
570 }
571
554 /* AMD-defined flags: level 0x80000001 */ 572 /* AMD-defined flags: level 0x80000001 */
555 xlvl = cpuid_eax(0x80000000); 573 xlvl = cpuid_eax(0x80000000);
556 c->extended_cpuid_level = xlvl; 574 c->extended_cpuid_level = xlvl;
@@ -576,6 +594,7 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
576 if (c->extended_cpuid_level >= 0x80000007) 594 if (c->extended_cpuid_level >= 0x80000007)
577 c->x86_power = cpuid_edx(0x80000007); 595 c->x86_power = cpuid_edx(0x80000007);
578 596
597 init_scattered_cpuid_features(c);
579} 598}
580 599
581static void __cpuinit identify_cpu_without_cpuid(struct cpuinfo_x86 *c) 600static void __cpuinit identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
@@ -731,7 +750,6 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
731 750
732 get_model_name(c); /* Default name */ 751 get_model_name(c); /* Default name */
733 752
734 init_scattered_cpuid_features(c);
735 detect_nopl(c); 753 detect_nopl(c);
736} 754}
737 755
@@ -1084,6 +1102,20 @@ static void clear_all_debug_regs(void)
1084 } 1102 }
1085} 1103}
1086 1104
1105#ifdef CONFIG_KGDB
1106/*
1107 * Restore debug regs if using kgdbwait and you have a kernel debugger
1108 * connection established.
1109 */
1110static void dbg_restore_debug_regs(void)
1111{
1112 if (unlikely(kgdb_connected && arch_kgdb_ops.correct_hw_break))
1113 arch_kgdb_ops.correct_hw_break();
1114}
1115#else /* ! CONFIG_KGDB */
1116#define dbg_restore_debug_regs()
1117#endif /* ! CONFIG_KGDB */
1118
1087/* 1119/*
1088 * cpu_init() initializes state that is per-CPU. Some data is already 1120 * cpu_init() initializes state that is per-CPU. Some data is already
1089 * initialized (naturally) in the bootstrap process, such as the GDT 1121 * initialized (naturally) in the bootstrap process, such as the GDT
@@ -1107,9 +1139,9 @@ void __cpuinit cpu_init(void)
1107 oist = &per_cpu(orig_ist, cpu); 1139 oist = &per_cpu(orig_ist, cpu);
1108 1140
1109#ifdef CONFIG_NUMA 1141#ifdef CONFIG_NUMA
1110 if (cpu != 0 && percpu_read(node_number) == 0 && 1142 if (cpu != 0 && percpu_read(numa_node) == 0 &&
1111 cpu_to_node(cpu) != NUMA_NO_NODE) 1143 early_cpu_to_node(cpu) != NUMA_NO_NODE)
1112 percpu_write(node_number, cpu_to_node(cpu)); 1144 set_numa_node(early_cpu_to_node(cpu));
1113#endif 1145#endif
1114 1146
1115 me = current; 1147 me = current;
@@ -1174,20 +1206,11 @@ void __cpuinit cpu_init(void)
1174 load_TR_desc(); 1206 load_TR_desc();
1175 load_LDT(&init_mm.context); 1207 load_LDT(&init_mm.context);
1176 1208
1177#ifdef CONFIG_KGDB 1209 clear_all_debug_regs();
1178 /* 1210 dbg_restore_debug_regs();
1179 * If the kgdb is connected no debug regs should be altered. This
1180 * is only applicable when KGDB and a KGDB I/O module are built
1181 * into the kernel and you are using early debugging with
1182 * kgdbwait. KGDB will control the kernel HW breakpoint registers.
1183 */
1184 if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
1185 arch_kgdb_ops.correct_hw_break();
1186 else
1187#endif
1188 clear_all_debug_regs();
1189 1211
1190 fpu_init(); 1212 fpu_init();
1213 xsave_init();
1191 1214
1192 raw_local_save_flags(kernel_eflags); 1215 raw_local_save_flags(kernel_eflags);
1193 1216
@@ -1239,23 +1262,16 @@ void __cpuinit cpu_init(void)
1239#endif 1262#endif
1240 1263
1241 clear_all_debug_regs(); 1264 clear_all_debug_regs();
1265 dbg_restore_debug_regs();
1242 1266
1243 /* 1267 /*
1244 * Force FPU initialization: 1268 * Force FPU initialization:
1245 */ 1269 */
1246 if (cpu_has_xsave) 1270 current_thread_info()->status = 0;
1247 current_thread_info()->status = TS_XSAVE;
1248 else
1249 current_thread_info()->status = 0;
1250 clear_used_math(); 1271 clear_used_math();
1251 mxcsr_feature_mask_init(); 1272 mxcsr_feature_mask_init();
1252 1273
1253 /* 1274 fpu_init();
1254 * Boot processor to setup the FP and extended state context info.
1255 */
1256 if (smp_processor_id() == boot_cpu_id)
1257 init_thread_xstate();
1258
1259 xsave_init(); 1275 xsave_init();
1260} 1276}
1261#endif 1277#endif
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 3624e8a0f71b..f668bb1f7d43 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -33,5 +33,6 @@ extern const struct cpu_dev *const __x86_cpu_dev_start[],
33 *const __x86_cpu_dev_end[]; 33 *const __x86_cpu_dev_end[];
34 34
35extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c); 35extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c);
36extern void get_cpu_cap(struct cpuinfo_x86 *c);
36 37
37#endif 38#endif
diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig
index f138c6c389b9..870e6cc6ad28 100644
--- a/arch/x86/kernel/cpu/cpufreq/Kconfig
+++ b/arch/x86/kernel/cpu/cpufreq/Kconfig
@@ -10,6 +10,20 @@ if CPU_FREQ
10 10
11comment "CPUFreq processor drivers" 11comment "CPUFreq processor drivers"
12 12
13config X86_PCC_CPUFREQ
14 tristate "Processor Clocking Control interface driver"
15 depends on ACPI && ACPI_PROCESSOR
16 help
17 This driver adds support for the PCC interface.
18
19 For details, take a look at:
20 <file:Documentation/cpu-freq/pcc-cpufreq.txt>.
21
22 To compile this driver as a module, choose M here: the
23 module will be called pcc-cpufreq.
24
25 If in doubt, say N.
26
13config X86_ACPI_CPUFREQ 27config X86_ACPI_CPUFREQ
14 tristate "ACPI Processor P-States driver" 28 tristate "ACPI Processor P-States driver"
15 select CPU_FREQ_TABLE 29 select CPU_FREQ_TABLE
diff --git a/arch/x86/kernel/cpu/cpufreq/Makefile b/arch/x86/kernel/cpu/cpufreq/Makefile
index 509296df294d..bd54bf67e6fb 100644
--- a/arch/x86/kernel/cpu/cpufreq/Makefile
+++ b/arch/x86/kernel/cpu/cpufreq/Makefile
@@ -2,8 +2,9 @@
2# K8 systems. ACPI is preferred to all other hardware-specific drivers. 2# K8 systems. ACPI is preferred to all other hardware-specific drivers.
3# speedstep-* is preferred over p4-clockmod. 3# speedstep-* is preferred over p4-clockmod.
4 4
5obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o 5obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o mperf.o
6obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o 6obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o mperf.o
7obj-$(CONFIG_X86_PCC_CPUFREQ) += pcc-cpufreq.o
7obj-$(CONFIG_X86_POWERNOW_K6) += powernow-k6.o 8obj-$(CONFIG_X86_POWERNOW_K6) += powernow-k6.o
8obj-$(CONFIG_X86_POWERNOW_K7) += powernow-k7.o 9obj-$(CONFIG_X86_POWERNOW_K7) += powernow-k7.o
9obj-$(CONFIG_X86_LONGHAUL) += longhaul.o 10obj-$(CONFIG_X86_LONGHAUL) += longhaul.o
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 1b1920fa7c80..cd8da247dda1 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -33,7 +33,7 @@
33#include <linux/cpufreq.h> 33#include <linux/cpufreq.h>
34#include <linux/compiler.h> 34#include <linux/compiler.h>
35#include <linux/dmi.h> 35#include <linux/dmi.h>
36#include <trace/events/power.h> 36#include <linux/slab.h>
37 37
38#include <linux/acpi.h> 38#include <linux/acpi.h>
39#include <linux/io.h> 39#include <linux/io.h>
@@ -45,6 +45,7 @@
45#include <asm/msr.h> 45#include <asm/msr.h>
46#include <asm/processor.h> 46#include <asm/processor.h>
47#include <asm/cpufeature.h> 47#include <asm/cpufeature.h>
48#include "mperf.h"
48 49
49#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ 50#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
50 "acpi-cpufreq", msg) 51 "acpi-cpufreq", msg)
@@ -70,10 +71,8 @@ struct acpi_cpufreq_data {
70 71
71static DEFINE_PER_CPU(struct acpi_cpufreq_data *, acfreq_data); 72static DEFINE_PER_CPU(struct acpi_cpufreq_data *, acfreq_data);
72 73
73static DEFINE_PER_CPU(struct aperfmperf, acfreq_old_perf);
74
75/* acpi_perf_data is a pointer to percpu data. */ 74/* acpi_perf_data is a pointer to percpu data. */
76static struct acpi_processor_performance *acpi_perf_data; 75static struct acpi_processor_performance __percpu *acpi_perf_data;
77 76
78static struct cpufreq_driver acpi_cpufreq_driver; 77static struct cpufreq_driver acpi_cpufreq_driver;
79 78
@@ -239,45 +238,6 @@ static u32 get_cur_val(const struct cpumask *mask)
239 return cmd.val; 238 return cmd.val;
240} 239}
241 240
242/* Called via smp_call_function_single(), on the target CPU */
243static void read_measured_perf_ctrs(void *_cur)
244{
245 struct aperfmperf *am = _cur;
246
247 get_aperfmperf(am);
248}
249
250/*
251 * Return the measured active (C0) frequency on this CPU since last call
252 * to this function.
253 * Input: cpu number
254 * Return: Average CPU frequency in terms of max frequency (zero on error)
255 *
256 * We use IA32_MPERF and IA32_APERF MSRs to get the measured performance
257 * over a period of time, while CPU is in C0 state.
258 * IA32_MPERF counts at the rate of max advertised frequency
259 * IA32_APERF counts at the rate of actual CPU frequency
260 * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and
261 * no meaning should be associated with absolute values of these MSRs.
262 */
263static unsigned int get_measured_perf(struct cpufreq_policy *policy,
264 unsigned int cpu)
265{
266 struct aperfmperf perf;
267 unsigned long ratio;
268 unsigned int retval;
269
270 if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1))
271 return 0;
272
273 ratio = calc_aperfmperf_ratio(&per_cpu(acfreq_old_perf, cpu), &perf);
274 per_cpu(acfreq_old_perf, cpu) = perf;
275
276 retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT;
277
278 return retval;
279}
280
281static unsigned int get_cur_freq_on_cpu(unsigned int cpu) 241static unsigned int get_cur_freq_on_cpu(unsigned int cpu)
282{ 242{
283 struct acpi_cpufreq_data *data = per_cpu(acfreq_data, cpu); 243 struct acpi_cpufreq_data *data = per_cpu(acfreq_data, cpu);
@@ -363,8 +323,6 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
363 } 323 }
364 } 324 }
365 325
366 trace_power_frequency(POWER_PSTATE, data->freq_table[next_state].frequency);
367
368 switch (data->cpu_feature) { 326 switch (data->cpu_feature) {
369 case SYSTEM_INTEL_MSR_CAPABLE: 327 case SYSTEM_INTEL_MSR_CAPABLE:
370 cmd.type = SYSTEM_INTEL_MSR_CAPABLE; 328 cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
@@ -390,7 +348,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
390 348
391 freqs.old = perf->states[perf->state].core_frequency * 1000; 349 freqs.old = perf->states[perf->state].core_frequency * 1000;
392 freqs.new = data->freq_table[next_state].frequency; 350 freqs.new = data->freq_table[next_state].frequency;
393 for_each_cpu(i, cmd.mask) { 351 for_each_cpu(i, policy->cpus) {
394 freqs.cpu = i; 352 freqs.cpu = i;
395 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 353 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
396 } 354 }
@@ -406,7 +364,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
406 } 364 }
407 } 365 }
408 366
409 for_each_cpu(i, cmd.mask) { 367 for_each_cpu(i, policy->cpus) {
410 freqs.cpu = i; 368 freqs.cpu = i;
411 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 369 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
412 } 370 }
@@ -701,7 +659,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
701 659
702 /* Check for APERF/MPERF support in hardware */ 660 /* Check for APERF/MPERF support in hardware */
703 if (cpu_has(c, X86_FEATURE_APERFMPERF)) 661 if (cpu_has(c, X86_FEATURE_APERFMPERF))
704 acpi_cpufreq_driver.getavg = get_measured_perf; 662 acpi_cpufreq_driver.getavg = cpufreq_get_measured_perf;
705 663
706 dprintk("CPU%u - ACPI performance management activated.\n", cpu); 664 dprintk("CPU%u - ACPI performance management activated.\n", cpu);
707 for (i = 0; i < perf->state_count; i++) 665 for (i = 0; i < perf->state_count; i++)
diff --git a/arch/x86/kernel/cpu/cpufreq/elanfreq.c b/arch/x86/kernel/cpu/cpufreq/elanfreq.c
index 006b278b0d5d..c587db472a75 100644
--- a/arch/x86/kernel/cpu/cpufreq/elanfreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/elanfreq.c
@@ -20,7 +20,6 @@
20#include <linux/module.h> 20#include <linux/module.h>
21#include <linux/init.h> 21#include <linux/init.h>
22 22
23#include <linux/slab.h>
24#include <linux/delay.h> 23#include <linux/delay.h>
25#include <linux/cpufreq.h> 24#include <linux/cpufreq.h>
26 25
diff --git a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
index ac27ec2264d5..32974cf84232 100644
--- a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
+++ b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
@@ -80,6 +80,7 @@
80#include <linux/cpufreq.h> 80#include <linux/cpufreq.h>
81#include <linux/pci.h> 81#include <linux/pci.h>
82#include <linux/errno.h> 82#include <linux/errno.h>
83#include <linux/slab.h>
83 84
84#include <asm/processor-cyrix.h> 85#include <asm/processor-cyrix.h>
85 86
@@ -168,12 +169,9 @@ static int gx_freq_mult[16] = {
168 * Low Level chipset interface * 169 * Low Level chipset interface *
169 ****************************************************************/ 170 ****************************************************************/
170static struct pci_device_id gx_chipset_tbl[] __initdata = { 171static struct pci_device_id gx_chipset_tbl[] __initdata = {
171 { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, 172 { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY), },
172 PCI_ANY_ID, PCI_ANY_ID }, 173 { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5520), },
173 { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5520, 174 { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5510), },
174 PCI_ANY_ID, PCI_ANY_ID },
175 { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5510,
176 PCI_ANY_ID, PCI_ANY_ID },
177 { 0, }, 175 { 0, },
178}; 176};
179 177
@@ -198,7 +196,7 @@ static __init struct pci_dev *gx_detect_chipset(void)
198 } 196 }
199 197
200 /* detect which companion chip is used */ 198 /* detect which companion chip is used */
201 while ((gx_pci = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, gx_pci)) != NULL) { 199 for_each_pci_dev(gx_pci) {
202 if ((pci_match_id(gx_chipset_tbl, gx_pci)) != NULL) 200 if ((pci_match_id(gx_chipset_tbl, gx_pci)) != NULL)
203 return gx_pci; 201 return gx_pci;
204 } 202 }
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c
index 7e7eea4f8261..03162dac6271 100644
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.c
+++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c
@@ -426,7 +426,7 @@ static int guess_fsb(int mult)
426} 426}
427 427
428 428
429static int __init longhaul_get_ranges(void) 429static int __cpuinit longhaul_get_ranges(void)
430{ 430{
431 unsigned int i, j, k = 0; 431 unsigned int i, j, k = 0;
432 unsigned int ratio; 432 unsigned int ratio;
@@ -530,7 +530,7 @@ static int __init longhaul_get_ranges(void)
530} 530}
531 531
532 532
533static void __init longhaul_setup_voltagescaling(void) 533static void __cpuinit longhaul_setup_voltagescaling(void)
534{ 534{
535 union msr_longhaul longhaul; 535 union msr_longhaul longhaul;
536 struct mV_pos minvid, maxvid, vid; 536 struct mV_pos minvid, maxvid, vid;
@@ -784,7 +784,7 @@ static int longhaul_setup_southbridge(void)
784 return 0; 784 return 0;
785} 785}
786 786
787static int __init longhaul_cpu_init(struct cpufreq_policy *policy) 787static int __cpuinit longhaul_cpu_init(struct cpufreq_policy *policy)
788{ 788{
789 struct cpuinfo_x86 *c = &cpu_data(0); 789 struct cpuinfo_x86 *c = &cpu_data(0);
790 char *cpuname = NULL; 790 char *cpuname = NULL;
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.h b/arch/x86/kernel/cpu/cpufreq/longhaul.h
index e2360a469f79..cbf48fbca881 100644
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.h
+++ b/arch/x86/kernel/cpu/cpufreq/longhaul.h
@@ -56,7 +56,7 @@ union msr_longhaul {
56/* 56/*
57 * VIA C3 Samuel 1 & Samuel 2 (stepping 0) 57 * VIA C3 Samuel 1 & Samuel 2 (stepping 0)
58 */ 58 */
59static const int __initdata samuel1_mults[16] = { 59static const int __cpuinitdata samuel1_mults[16] = {
60 -1, /* 0000 -> RESERVED */ 60 -1, /* 0000 -> RESERVED */
61 30, /* 0001 -> 3.0x */ 61 30, /* 0001 -> 3.0x */
62 40, /* 0010 -> 4.0x */ 62 40, /* 0010 -> 4.0x */
@@ -75,7 +75,7 @@ static const int __initdata samuel1_mults[16] = {
75 -1, /* 1111 -> RESERVED */ 75 -1, /* 1111 -> RESERVED */
76}; 76};
77 77
78static const int __initdata samuel1_eblcr[16] = { 78static const int __cpuinitdata samuel1_eblcr[16] = {
79 50, /* 0000 -> RESERVED */ 79 50, /* 0000 -> RESERVED */
80 30, /* 0001 -> 3.0x */ 80 30, /* 0001 -> 3.0x */
81 40, /* 0010 -> 4.0x */ 81 40, /* 0010 -> 4.0x */
@@ -97,7 +97,7 @@ static const int __initdata samuel1_eblcr[16] = {
97/* 97/*
98 * VIA C3 Samuel2 Stepping 1->15 98 * VIA C3 Samuel2 Stepping 1->15
99 */ 99 */
100static const int __initdata samuel2_eblcr[16] = { 100static const int __cpuinitdata samuel2_eblcr[16] = {
101 50, /* 0000 -> 5.0x */ 101 50, /* 0000 -> 5.0x */
102 30, /* 0001 -> 3.0x */ 102 30, /* 0001 -> 3.0x */
103 40, /* 0010 -> 4.0x */ 103 40, /* 0010 -> 4.0x */
@@ -119,7 +119,7 @@ static const int __initdata samuel2_eblcr[16] = {
119/* 119/*
120 * VIA C3 Ezra 120 * VIA C3 Ezra
121 */ 121 */
122static const int __initdata ezra_mults[16] = { 122static const int __cpuinitdata ezra_mults[16] = {
123 100, /* 0000 -> 10.0x */ 123 100, /* 0000 -> 10.0x */
124 30, /* 0001 -> 3.0x */ 124 30, /* 0001 -> 3.0x */
125 40, /* 0010 -> 4.0x */ 125 40, /* 0010 -> 4.0x */
@@ -138,7 +138,7 @@ static const int __initdata ezra_mults[16] = {
138 120, /* 1111 -> 12.0x */ 138 120, /* 1111 -> 12.0x */
139}; 139};
140 140
141static const int __initdata ezra_eblcr[16] = { 141static const int __cpuinitdata ezra_eblcr[16] = {
142 50, /* 0000 -> 5.0x */ 142 50, /* 0000 -> 5.0x */
143 30, /* 0001 -> 3.0x */ 143 30, /* 0001 -> 3.0x */
144 40, /* 0010 -> 4.0x */ 144 40, /* 0010 -> 4.0x */
@@ -160,7 +160,7 @@ static const int __initdata ezra_eblcr[16] = {
160/* 160/*
161 * VIA C3 (Ezra-T) [C5M]. 161 * VIA C3 (Ezra-T) [C5M].
162 */ 162 */
163static const int __initdata ezrat_mults[32] = { 163static const int __cpuinitdata ezrat_mults[32] = {
164 100, /* 0000 -> 10.0x */ 164 100, /* 0000 -> 10.0x */
165 30, /* 0001 -> 3.0x */ 165 30, /* 0001 -> 3.0x */
166 40, /* 0010 -> 4.0x */ 166 40, /* 0010 -> 4.0x */
@@ -196,7 +196,7 @@ static const int __initdata ezrat_mults[32] = {
196 -1, /* 1111 -> RESERVED (12.0x) */ 196 -1, /* 1111 -> RESERVED (12.0x) */
197}; 197};
198 198
199static const int __initdata ezrat_eblcr[32] = { 199static const int __cpuinitdata ezrat_eblcr[32] = {
200 50, /* 0000 -> 5.0x */ 200 50, /* 0000 -> 5.0x */
201 30, /* 0001 -> 3.0x */ 201 30, /* 0001 -> 3.0x */
202 40, /* 0010 -> 4.0x */ 202 40, /* 0010 -> 4.0x */
@@ -235,7 +235,7 @@ static const int __initdata ezrat_eblcr[32] = {
235/* 235/*
236 * VIA C3 Nehemiah */ 236 * VIA C3 Nehemiah */
237 237
238static const int __initdata nehemiah_mults[32] = { 238static const int __cpuinitdata nehemiah_mults[32] = {
239 100, /* 0000 -> 10.0x */ 239 100, /* 0000 -> 10.0x */
240 -1, /* 0001 -> 16.0x */ 240 -1, /* 0001 -> 16.0x */
241 40, /* 0010 -> 4.0x */ 241 40, /* 0010 -> 4.0x */
@@ -270,7 +270,7 @@ static const int __initdata nehemiah_mults[32] = {
270 -1, /* 1111 -> 12.0x */ 270 -1, /* 1111 -> 12.0x */
271}; 271};
272 272
273static const int __initdata nehemiah_eblcr[32] = { 273static const int __cpuinitdata nehemiah_eblcr[32] = {
274 50, /* 0000 -> 5.0x */ 274 50, /* 0000 -> 5.0x */
275 160, /* 0001 -> 16.0x */ 275 160, /* 0001 -> 16.0x */
276 40, /* 0010 -> 4.0x */ 276 40, /* 0010 -> 4.0x */
@@ -315,7 +315,7 @@ struct mV_pos {
315 unsigned short pos; 315 unsigned short pos;
316}; 316};
317 317
318static const struct mV_pos __initdata vrm85_mV[32] = { 318static const struct mV_pos __cpuinitdata vrm85_mV[32] = {
319 {1250, 8}, {1200, 6}, {1150, 4}, {1100, 2}, 319 {1250, 8}, {1200, 6}, {1150, 4}, {1100, 2},
320 {1050, 0}, {1800, 30}, {1750, 28}, {1700, 26}, 320 {1050, 0}, {1800, 30}, {1750, 28}, {1700, 26},
321 {1650, 24}, {1600, 22}, {1550, 20}, {1500, 18}, 321 {1650, 24}, {1600, 22}, {1550, 20}, {1500, 18},
@@ -326,14 +326,14 @@ static const struct mV_pos __initdata vrm85_mV[32] = {
326 {1475, 17}, {1425, 15}, {1375, 13}, {1325, 11} 326 {1475, 17}, {1425, 15}, {1375, 13}, {1325, 11}
327}; 327};
328 328
329static const unsigned char __initdata mV_vrm85[32] = { 329static const unsigned char __cpuinitdata mV_vrm85[32] = {
330 0x04, 0x14, 0x03, 0x13, 0x02, 0x12, 0x01, 0x11, 330 0x04, 0x14, 0x03, 0x13, 0x02, 0x12, 0x01, 0x11,
331 0x00, 0x10, 0x0f, 0x1f, 0x0e, 0x1e, 0x0d, 0x1d, 331 0x00, 0x10, 0x0f, 0x1f, 0x0e, 0x1e, 0x0d, 0x1d,
332 0x0c, 0x1c, 0x0b, 0x1b, 0x0a, 0x1a, 0x09, 0x19, 332 0x0c, 0x1c, 0x0b, 0x1b, 0x0a, 0x1a, 0x09, 0x19,
333 0x08, 0x18, 0x07, 0x17, 0x06, 0x16, 0x05, 0x15 333 0x08, 0x18, 0x07, 0x17, 0x06, 0x16, 0x05, 0x15
334}; 334};
335 335
336static const struct mV_pos __initdata mobilevrm_mV[32] = { 336static const struct mV_pos __cpuinitdata mobilevrm_mV[32] = {
337 {1750, 31}, {1700, 30}, {1650, 29}, {1600, 28}, 337 {1750, 31}, {1700, 30}, {1650, 29}, {1600, 28},
338 {1550, 27}, {1500, 26}, {1450, 25}, {1400, 24}, 338 {1550, 27}, {1500, 26}, {1450, 25}, {1400, 24},
339 {1350, 23}, {1300, 22}, {1250, 21}, {1200, 20}, 339 {1350, 23}, {1300, 22}, {1250, 21}, {1200, 20},
@@ -344,7 +344,7 @@ static const struct mV_pos __initdata mobilevrm_mV[32] = {
344 {675, 3}, {650, 2}, {625, 1}, {600, 0} 344 {675, 3}, {650, 2}, {625, 1}, {600, 0}
345}; 345};
346 346
347static const unsigned char __initdata mV_mobilevrm[32] = { 347static const unsigned char __cpuinitdata mV_mobilevrm[32] = {
348 0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18, 348 0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18,
349 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 349 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10,
350 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08, 350 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08,
diff --git a/arch/x86/kernel/cpu/cpufreq/longrun.c b/arch/x86/kernel/cpu/cpufreq/longrun.c
index da5f70fcb766..fc09f142d94d 100644
--- a/arch/x86/kernel/cpu/cpufreq/longrun.c
+++ b/arch/x86/kernel/cpu/cpufreq/longrun.c
@@ -9,7 +9,6 @@
9#include <linux/kernel.h> 9#include <linux/kernel.h>
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/init.h> 11#include <linux/init.h>
12#include <linux/slab.h>
13#include <linux/cpufreq.h> 12#include <linux/cpufreq.h>
14#include <linux/timex.h> 13#include <linux/timex.h>
15 14
@@ -166,8 +165,8 @@ static unsigned int longrun_get(unsigned int cpu)
166 * TMTA rules: 165 * TMTA rules:
167 * performance_pctg = (target_freq - low_freq)/(high_freq - low_freq) 166 * performance_pctg = (target_freq - low_freq)/(high_freq - low_freq)
168 */ 167 */
169static unsigned int __init longrun_determine_freqs(unsigned int *low_freq, 168static unsigned int __cpuinit longrun_determine_freqs(unsigned int *low_freq,
170 unsigned int *high_freq) 169 unsigned int *high_freq)
171{ 170{
172 u32 msr_lo, msr_hi; 171 u32 msr_lo, msr_hi;
173 u32 save_lo, save_hi; 172 u32 save_lo, save_hi;
@@ -259,7 +258,7 @@ static unsigned int __init longrun_determine_freqs(unsigned int *low_freq,
259} 258}
260 259
261 260
262static int __init longrun_cpu_init(struct cpufreq_policy *policy) 261static int __cpuinit longrun_cpu_init(struct cpufreq_policy *policy)
263{ 262{
264 int result = 0; 263 int result = 0;
265 264
diff --git a/arch/x86/kernel/cpu/cpufreq/mperf.c b/arch/x86/kernel/cpu/cpufreq/mperf.c
new file mode 100644
index 000000000000..911e193018ae
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/mperf.c
@@ -0,0 +1,51 @@
1#include <linux/kernel.h>
2#include <linux/smp.h>
3#include <linux/module.h>
4#include <linux/init.h>
5#include <linux/cpufreq.h>
6#include <linux/slab.h>
7
8#include "mperf.h"
9
10static DEFINE_PER_CPU(struct aperfmperf, acfreq_old_perf);
11
12/* Called via smp_call_function_single(), on the target CPU */
13static void read_measured_perf_ctrs(void *_cur)
14{
15 struct aperfmperf *am = _cur;
16
17 get_aperfmperf(am);
18}
19
20/*
21 * Return the measured active (C0) frequency on this CPU since last call
22 * to this function.
23 * Input: cpu number
24 * Return: Average CPU frequency in terms of max frequency (zero on error)
25 *
26 * We use IA32_MPERF and IA32_APERF MSRs to get the measured performance
27 * over a period of time, while CPU is in C0 state.
28 * IA32_MPERF counts at the rate of max advertised frequency
29 * IA32_APERF counts at the rate of actual CPU frequency
30 * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and
31 * no meaning should be associated with absolute values of these MSRs.
32 */
33unsigned int cpufreq_get_measured_perf(struct cpufreq_policy *policy,
34 unsigned int cpu)
35{
36 struct aperfmperf perf;
37 unsigned long ratio;
38 unsigned int retval;
39
40 if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1))
41 return 0;
42
43 ratio = calc_aperfmperf_ratio(&per_cpu(acfreq_old_perf, cpu), &perf);
44 per_cpu(acfreq_old_perf, cpu) = perf;
45
46 retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT;
47
48 return retval;
49}
50EXPORT_SYMBOL_GPL(cpufreq_get_measured_perf);
51MODULE_LICENSE("GPL");
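Note (not part of the patch): a minimal sketch of how a cpufreq driver hooks this shared helper up, mirroring the acpi-cpufreq change earlier in this diff, which assigns cpufreq_get_measured_perf to ->getavg when the CPU advertises APERF/MPERF. The driver and function names are placeholders.

	#include <linux/cpufreq.h>
	#include <asm/cpufeature.h>
	#include "mperf.h"

	static struct cpufreq_driver my_cpufreq_driver;	/* placeholder driver */

	static void my_driver_check_aperfmperf(struct cpuinfo_x86 *c)
	{
		/* Report average C0 frequency via APERF/MPERF when available. */
		if (cpu_has(c, X86_FEATURE_APERFMPERF))
			my_cpufreq_driver.getavg = cpufreq_get_measured_perf;
	}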
diff --git a/arch/x86/kernel/cpu/cpufreq/mperf.h b/arch/x86/kernel/cpu/cpufreq/mperf.h
new file mode 100644
index 000000000000..5dbf2950dc22
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/mperf.h
@@ -0,0 +1,9 @@
1/*
2 * (c) 2010 Advanced Micro Devices, Inc.
3 * Your use of this code is subject to the terms and conditions of the
4 * GNU general public license version 2. See "COPYING" or
5 * http://www.gnu.org/licenses/gpl.html
6 */
7
8unsigned int cpufreq_get_measured_perf(struct cpufreq_policy *policy,
9 unsigned int cpu);
diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
index 869615193720..bd1cac747f67 100644
--- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
+++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
@@ -25,7 +25,6 @@
25#include <linux/init.h> 25#include <linux/init.h>
26#include <linux/smp.h> 26#include <linux/smp.h>
27#include <linux/cpufreq.h> 27#include <linux/cpufreq.h>
28#include <linux/slab.h>
29#include <linux/cpumask.h> 28#include <linux/cpumask.h>
30#include <linux/timex.h> 29#include <linux/timex.h>
31 30
@@ -179,13 +178,8 @@ static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c)
179 } 178 }
180 } 179 }
181 180
182 if (c->x86 != 0xF) { 181 if (c->x86 != 0xF)
183 if (!cpu_has(c, X86_FEATURE_EST))
184 printk(KERN_WARNING PFX "Unknown CPU. "
185 "Please send an e-mail to "
186 "<cpufreq@vger.kernel.org>\n");
187 return 0; 182 return 0;
188 }
189 183
190 /* on P-4s, the TSC runs with constant frequency independent whether 184 /* on P-4s, the TSC runs with constant frequency independent whether
191 * throttling is active or not. */ 185 * throttling is active or not. */
diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
new file mode 100644
index 000000000000..4f6f679f2799
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
@@ -0,0 +1,626 @@
1/*
2 * pcc-cpufreq.c - Processor Clocking Control firmware cpufreq interface
3 *
4 * Copyright (C) 2009 Red Hat, Matthew Garrett <mjg@redhat.com>
5 * Copyright (C) 2009 Hewlett-Packard Development Company, L.P.
6 * Nagananda Chumbalkar <nagananda.chumbalkar@hp.com>
7 *
8 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; version 2 of the License.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or NON
17 * INFRINGEMENT. See the GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write to the Free Software Foundation, Inc.,
21 * 675 Mass Ave, Cambridge, MA 02139, USA.
22 *
23 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
24 */
25
26#include <linux/kernel.h>
27#include <linux/module.h>
28#include <linux/init.h>
29#include <linux/smp.h>
30#include <linux/sched.h>
31#include <linux/cpufreq.h>
32#include <linux/compiler.h>
33#include <linux/slab.h>
34
35#include <linux/acpi.h>
36#include <linux/io.h>
37#include <linux/spinlock.h>
38#include <linux/uaccess.h>
39
40#include <acpi/processor.h>
41
42#define PCC_VERSION "1.00.00"
43#define POLL_LOOPS 300
44
45#define CMD_COMPLETE 0x1
46#define CMD_GET_FREQ 0x0
47#define CMD_SET_FREQ 0x1
48
49#define BUF_SZ 4
50
51#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
52 "pcc-cpufreq", msg)
53
54struct pcc_register_resource {
55 u8 descriptor;
56 u16 length;
57 u8 space_id;
58 u8 bit_width;
59 u8 bit_offset;
60 u8 access_size;
61 u64 address;
62} __attribute__ ((packed));
63
64struct pcc_memory_resource {
65 u8 descriptor;
66 u16 length;
67 u8 space_id;
68 u8 resource_usage;
69 u8 type_specific;
70 u64 granularity;
71 u64 minimum;
72 u64 maximum;
73 u64 translation_offset;
74 u64 address_length;
75} __attribute__ ((packed));
76
77static struct cpufreq_driver pcc_cpufreq_driver;
78
79struct pcc_header {
80 u32 signature;
81 u16 length;
82 u8 major;
83 u8 minor;
84 u32 features;
85 u16 command;
86 u16 status;
87 u32 latency;
88 u32 minimum_time;
89 u32 maximum_time;
90 u32 nominal;
91 u32 throttled_frequency;
92 u32 minimum_frequency;
93};
94
95static void __iomem *pcch_virt_addr;
96static struct pcc_header __iomem *pcch_hdr;
97
98static DEFINE_SPINLOCK(pcc_lock);
99
100static struct acpi_generic_address doorbell;
101
102static u64 doorbell_preserve;
103static u64 doorbell_write;
104
105static u8 OSC_UUID[16] = {0x63, 0x9B, 0x2C, 0x9F, 0x70, 0x91, 0x49, 0x1f,
106 0xBB, 0x4F, 0xA5, 0x98, 0x2F, 0xA1, 0xB5, 0x46};
107
108struct pcc_cpu {
109 u32 input_offset;
110 u32 output_offset;
111};
112
113static struct pcc_cpu __percpu *pcc_cpu_info;
114
115static int pcc_cpufreq_verify(struct cpufreq_policy *policy)
116{
117 cpufreq_verify_within_limits(policy, policy->cpuinfo.min_freq,
118 policy->cpuinfo.max_freq);
119 return 0;
120}
121
122static inline void pcc_cmd(void)
123{
124 u64 doorbell_value;
125 int i;
126
127 acpi_read(&doorbell_value, &doorbell);
128 acpi_write((doorbell_value & doorbell_preserve) | doorbell_write,
129 &doorbell);
130
131 for (i = 0; i < POLL_LOOPS; i++) {
132 if (ioread16(&pcch_hdr->status) & CMD_COMPLETE)
133 break;
134 }
135}
136
137static inline void pcc_clear_mapping(void)
138{
139 if (pcch_virt_addr)
140 iounmap(pcch_virt_addr);
141 pcch_virt_addr = NULL;
142}
143
144static unsigned int pcc_get_freq(unsigned int cpu)
145{
146 struct pcc_cpu *pcc_cpu_data;
147 unsigned int curr_freq;
148 unsigned int freq_limit;
149 u16 status;
150 u32 input_buffer;
151 u32 output_buffer;
152
153 spin_lock(&pcc_lock);
154
155 dprintk("get: get_freq for CPU %d\n", cpu);
156 pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu);
157
158 input_buffer = 0x1;
159 iowrite32(input_buffer,
160 (pcch_virt_addr + pcc_cpu_data->input_offset));
161 iowrite16(CMD_GET_FREQ, &pcch_hdr->command);
162
163 pcc_cmd();
164
165 output_buffer =
166 ioread32(pcch_virt_addr + pcc_cpu_data->output_offset);
167
168 /* Clear the input buffer - we are done with the current command */
169 memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ);
170
171 status = ioread16(&pcch_hdr->status);
172 if (status != CMD_COMPLETE) {
173 dprintk("get: FAILED: for CPU %d, status is %d\n",
174 cpu, status);
175 goto cmd_incomplete;
176 }
177 iowrite16(0, &pcch_hdr->status);
178 curr_freq = (((ioread32(&pcch_hdr->nominal) * (output_buffer & 0xff))
179 / 100) * 1000);
180
181 dprintk("get: SUCCESS: (virtual) output_offset for cpu %d is "
182 "0x%x, contains a value of: 0x%x. Speed is: %d MHz\n",
183 cpu, (pcch_virt_addr + pcc_cpu_data->output_offset),
184 output_buffer, curr_freq);
185
186 freq_limit = (output_buffer >> 8) & 0xff;
187 if (freq_limit != 0xff) {
188 dprintk("get: frequency for cpu %d is being temporarily"
189 " capped at %d\n", cpu, curr_freq);
190 }
191
192 spin_unlock(&pcc_lock);
193 return curr_freq;
194
195cmd_incomplete:
196 iowrite16(0, &pcch_hdr->status);
197 spin_unlock(&pcc_lock);
198 return -EINVAL;
199}
200
201static int pcc_cpufreq_target(struct cpufreq_policy *policy,
202 unsigned int target_freq,
203 unsigned int relation)
204{
205 struct pcc_cpu *pcc_cpu_data;
206 struct cpufreq_freqs freqs;
207 u16 status;
208 u32 input_buffer;
209 int cpu;
210
211 spin_lock(&pcc_lock);
212 cpu = policy->cpu;
213 pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu);
214
215 dprintk("target: CPU %d should go to target freq: %d "
216 "(virtual) input_offset is 0x%x\n",
217 cpu, target_freq,
218 (pcch_virt_addr + pcc_cpu_data->input_offset));
219
220 freqs.new = target_freq;
221 freqs.cpu = cpu;
222 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
223
224 input_buffer = 0x1 | (((target_freq * 100)
225 / (ioread32(&pcch_hdr->nominal) * 1000)) << 8);
226 iowrite32(input_buffer,
227 (pcch_virt_addr + pcc_cpu_data->input_offset));
228 iowrite16(CMD_SET_FREQ, &pcch_hdr->command);
229
230 pcc_cmd();
231
232 /* Clear the input buffer - we are done with the current command */
233 memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ);
234
235 status = ioread16(&pcch_hdr->status);
236 if (status != CMD_COMPLETE) {
237 dprintk("target: FAILED for cpu %d, with status: 0x%x\n",
238 cpu, status);
239 goto cmd_incomplete;
240 }
241 iowrite16(0, &pcch_hdr->status);
242
243 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
244 dprintk("target: was SUCCESSFUL for cpu %d\n", cpu);
245 spin_unlock(&pcc_lock);
246
247 return 0;
248
249cmd_incomplete:
250 iowrite16(0, &pcch_hdr->status);
251 spin_unlock(&pcc_lock);
252 return -EINVAL;
253}
254
255static int pcc_get_offset(int cpu)
256{
257 acpi_status status;
258 struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL};
259 union acpi_object *pccp, *offset;
260 struct pcc_cpu *pcc_cpu_data;
261 struct acpi_processor *pr;
262 int ret = 0;
263
264 pr = per_cpu(processors, cpu);
265 pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu);
266
267 status = acpi_evaluate_object(pr->handle, "PCCP", NULL, &buffer);
268 if (ACPI_FAILURE(status))
269 return -ENODEV;
270
271 pccp = buffer.pointer;
272 if (!pccp || pccp->type != ACPI_TYPE_PACKAGE) {
273 ret = -ENODEV;
274 goto out_free;
275 };
276
277 offset = &(pccp->package.elements[0]);
278 if (!offset || offset->type != ACPI_TYPE_INTEGER) {
279 ret = -ENODEV;
280 goto out_free;
281 }
282
283 pcc_cpu_data->input_offset = offset->integer.value;
284
285 offset = &(pccp->package.elements[1]);
286 if (!offset || offset->type != ACPI_TYPE_INTEGER) {
287 ret = -ENODEV;
288 goto out_free;
289 }
290
291 pcc_cpu_data->output_offset = offset->integer.value;
292
293 memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ);
294 memset_io((pcch_virt_addr + pcc_cpu_data->output_offset), 0, BUF_SZ);
295
296 dprintk("pcc_get_offset: for CPU %d: pcc_cpu_data "
297 "input_offset: 0x%x, pcc_cpu_data output_offset: 0x%x\n",
298 cpu, pcc_cpu_data->input_offset, pcc_cpu_data->output_offset);
299out_free:
300 kfree(buffer.pointer);
301 return ret;
302}
303
304static int __init pcc_cpufreq_do_osc(acpi_handle *handle)
305{
306 acpi_status status;
307 struct acpi_object_list input;
308 struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL};
309 union acpi_object in_params[4];
310 union acpi_object *out_obj;
311 u32 capabilities[2];
312 u32 errors;
313 u32 supported;
314 int ret = 0;
315
316 input.count = 4;
317 input.pointer = in_params;
318 input.count = 4;
319 input.pointer = in_params;
320 in_params[0].type = ACPI_TYPE_BUFFER;
321 in_params[0].buffer.length = 16;
322 in_params[0].buffer.pointer = OSC_UUID;
323 in_params[1].type = ACPI_TYPE_INTEGER;
324 in_params[1].integer.value = 1;
325 in_params[2].type = ACPI_TYPE_INTEGER;
326 in_params[2].integer.value = 2;
327 in_params[3].type = ACPI_TYPE_BUFFER;
328 in_params[3].buffer.length = 8;
329 in_params[3].buffer.pointer = (u8 *)&capabilities;
330
331 capabilities[0] = OSC_QUERY_ENABLE;
332 capabilities[1] = 0x1;
333
334 status = acpi_evaluate_object(*handle, "_OSC", &input, &output);
335 if (ACPI_FAILURE(status))
336 return -ENODEV;
337
338 if (!output.length)
339 return -ENODEV;
340
341 out_obj = output.pointer;
342 if (out_obj->type != ACPI_TYPE_BUFFER) {
343 ret = -ENODEV;
344 goto out_free;
345 }
346
347 errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0);
348 if (errors) {
349 ret = -ENODEV;
350 goto out_free;
351 }
352
353 supported = *((u32 *)(out_obj->buffer.pointer + 4));
354 if (!(supported & 0x1)) {
355 ret = -ENODEV;
356 goto out_free;
357 }
358
359 kfree(output.pointer);
360 capabilities[0] = 0x0;
361 capabilities[1] = 0x1;
362
363 status = acpi_evaluate_object(*handle, "_OSC", &input, &output);
364 if (ACPI_FAILURE(status))
365 return -ENODEV;
366
367 if (!output.length)
368 return -ENODEV;
369
370 out_obj = output.pointer;
371 if (out_obj->type != ACPI_TYPE_BUFFER) {
372 ret = -ENODEV;
373 goto out_free;
374 }
375
376 errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0);
377 if (errors) {
378 ret = -ENODEV;
379 goto out_free;
380 }
381
382 supported = *((u32 *)(out_obj->buffer.pointer + 4));
383 if (!(supported & 0x1)) {
384 ret = -ENODEV;
385 goto out_free;
386 }
387
388out_free:
389 kfree(output.pointer);
390 return ret;
391}
392
393static int __init pcc_cpufreq_probe(void)
394{
395 acpi_status status;
396 struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL};
397 struct pcc_memory_resource *mem_resource;
398 struct pcc_register_resource *reg_resource;
399 union acpi_object *out_obj, *member;
400 acpi_handle handle, osc_handle, pcch_handle;
401 int ret = 0;
402
403 status = acpi_get_handle(NULL, "\\_SB", &handle);
404 if (ACPI_FAILURE(status))
405 return -ENODEV;
406
407 status = acpi_get_handle(handle, "PCCH", &pcch_handle);
408 if (ACPI_FAILURE(status))
409 return -ENODEV;
410
411 status = acpi_get_handle(handle, "_OSC", &osc_handle);
412 if (ACPI_SUCCESS(status)) {
413 ret = pcc_cpufreq_do_osc(&osc_handle);
414 if (ret)
415 dprintk("probe: _OSC evaluation did not succeed\n");
416 /* Firmware's use of _OSC is optional */
417 ret = 0;
418 }
419
420 status = acpi_evaluate_object(handle, "PCCH", NULL, &output);
421 if (ACPI_FAILURE(status))
422 return -ENODEV;
423
424 out_obj = output.pointer;
425 if (out_obj->type != ACPI_TYPE_PACKAGE) {
426 ret = -ENODEV;
427 goto out_free;
428 }
429
430 member = &out_obj->package.elements[0];
431 if (member->type != ACPI_TYPE_BUFFER) {
432 ret = -ENODEV;
433 goto out_free;
434 }
435
436 mem_resource = (struct pcc_memory_resource *)member->buffer.pointer;
437
438 dprintk("probe: mem_resource descriptor: 0x%x,"
439 " length: %d, space_id: %d, resource_usage: %d,"
440 " type_specific: %d, granularity: 0x%llx,"
441 " minimum: 0x%llx, maximum: 0x%llx,"
442 " translation_offset: 0x%llx, address_length: 0x%llx\n",
443 mem_resource->descriptor, mem_resource->length,
444 mem_resource->space_id, mem_resource->resource_usage,
445 mem_resource->type_specific, mem_resource->granularity,
446 mem_resource->minimum, mem_resource->maximum,
447 mem_resource->translation_offset,
448 mem_resource->address_length);
449
450 if (mem_resource->space_id != ACPI_ADR_SPACE_SYSTEM_MEMORY) {
451 ret = -ENODEV;
452 goto out_free;
453 }
454
455 pcch_virt_addr = ioremap_nocache(mem_resource->minimum,
456 mem_resource->address_length);
457 if (pcch_virt_addr == NULL) {
458 dprintk("probe: could not map shared mem region\n");
459 goto out_free;
460 }
461 pcch_hdr = pcch_virt_addr;
462
463 dprintk("probe: PCCH header (virtual) addr: 0x%p\n", pcch_hdr);
464 dprintk("probe: PCCH header is at physical address: 0x%llx,"
465 " signature: 0x%x, length: %d bytes, major: %d, minor: %d,"
466 " supported features: 0x%x, command field: 0x%x,"
467 " status field: 0x%x, nominal latency: %d us\n",
468 mem_resource->minimum, ioread32(&pcch_hdr->signature),
469 ioread16(&pcch_hdr->length), ioread8(&pcch_hdr->major),
470 ioread8(&pcch_hdr->minor), ioread32(&pcch_hdr->features),
471 ioread16(&pcch_hdr->command), ioread16(&pcch_hdr->status),
472 ioread32(&pcch_hdr->latency));
473
474 dprintk("probe: min time between commands: %d us,"
475 " max time between commands: %d us,"
476 " nominal CPU frequency: %d MHz,"
477 " minimum CPU frequency: %d MHz,"
478 " minimum CPU frequency without throttling: %d MHz\n",
479 ioread32(&pcch_hdr->minimum_time),
480 ioread32(&pcch_hdr->maximum_time),
481 ioread32(&pcch_hdr->nominal),
482 ioread32(&pcch_hdr->throttled_frequency),
483 ioread32(&pcch_hdr->minimum_frequency));
484
485 member = &out_obj->package.elements[1];
486 if (member->type != ACPI_TYPE_BUFFER) {
487 ret = -ENODEV;
488 goto pcch_free;
489 }
490
491 reg_resource = (struct pcc_register_resource *)member->buffer.pointer;
492
493 doorbell.space_id = reg_resource->space_id;
494 doorbell.bit_width = reg_resource->bit_width;
495 doorbell.bit_offset = reg_resource->bit_offset;
496 doorbell.access_width = 64;
497 doorbell.address = reg_resource->address;
498
499 dprintk("probe: doorbell: space_id is %d, bit_width is %d, "
500 "bit_offset is %d, access_width is %d, address is 0x%llx\n",
501 doorbell.space_id, doorbell.bit_width, doorbell.bit_offset,
502 doorbell.access_width, reg_resource->address);
503
504 member = &out_obj->package.elements[2];
505 if (member->type != ACPI_TYPE_INTEGER) {
506 ret = -ENODEV;
507 goto pcch_free;
508 }
509
510 doorbell_preserve = member->integer.value;
511
512 member = &out_obj->package.elements[3];
513 if (member->type != ACPI_TYPE_INTEGER) {
514 ret = -ENODEV;
515 goto pcch_free;
516 }
517
518 doorbell_write = member->integer.value;
519
520 dprintk("probe: doorbell_preserve: 0x%llx,"
521 " doorbell_write: 0x%llx\n",
522 doorbell_preserve, doorbell_write);
523
524 pcc_cpu_info = alloc_percpu(struct pcc_cpu);
525 if (!pcc_cpu_info) {
526 ret = -ENOMEM;
527 goto pcch_free;
528 }
529
530 printk(KERN_DEBUG "pcc-cpufreq: (v%s) driver loaded with frequency"
531 " limits: %d MHz, %d MHz\n", PCC_VERSION,
532 ioread32(&pcch_hdr->minimum_frequency),
533 ioread32(&pcch_hdr->nominal));
534 kfree(output.pointer);
535 return ret;
536pcch_free:
537 pcc_clear_mapping();
538out_free:
539 kfree(output.pointer);
540 return ret;
541}
542
543static int pcc_cpufreq_cpu_init(struct cpufreq_policy *policy)
544{
545 unsigned int cpu = policy->cpu;
546 unsigned int result = 0;
547
548 if (!pcch_virt_addr) {
549 result = -1;
550 goto out;
551 }
552
553 result = pcc_get_offset(cpu);
554 if (result) {
555 dprintk("init: PCCP evaluation failed\n");
556 goto out;
557 }
558
559 policy->max = policy->cpuinfo.max_freq =
560 ioread32(&pcch_hdr->nominal) * 1000;
561 policy->min = policy->cpuinfo.min_freq =
562 ioread32(&pcch_hdr->minimum_frequency) * 1000;
563 policy->cur = pcc_get_freq(cpu);
564
565 if (!policy->cur) {
566 dprintk("init: Unable to get current CPU frequency\n");
567 result = -EINVAL;
568 goto out;
569 }
570
571 dprintk("init: policy->max is %d, policy->min is %d\n",
572 policy->max, policy->min);
573out:
574 return result;
575}
576
577static int pcc_cpufreq_cpu_exit(struct cpufreq_policy *policy)
578{
579 return 0;
580}
581
582static struct cpufreq_driver pcc_cpufreq_driver = {
583 .flags = CPUFREQ_CONST_LOOPS,
584 .get = pcc_get_freq,
585 .verify = pcc_cpufreq_verify,
586 .target = pcc_cpufreq_target,
587 .init = pcc_cpufreq_cpu_init,
588 .exit = pcc_cpufreq_cpu_exit,
589 .name = "pcc-cpufreq",
590 .owner = THIS_MODULE,
591};
592
593static int __init pcc_cpufreq_init(void)
594{
595 int ret;
596
597 if (acpi_disabled)
598 return 0;
599
600 ret = pcc_cpufreq_probe();
601 if (ret) {
602 dprintk("pcc_cpufreq_init: PCCH evaluation failed\n");
603 return ret;
604 }
605
606 ret = cpufreq_register_driver(&pcc_cpufreq_driver);
607
608 return ret;
609}
610
611static void __exit pcc_cpufreq_exit(void)
612{
613 cpufreq_unregister_driver(&pcc_cpufreq_driver);
614
615 pcc_clear_mapping();
616
617 free_percpu(pcc_cpu_info);
618}
619
620MODULE_AUTHOR("Matthew Garrett, Naga Chumbalkar");
621MODULE_VERSION(PCC_VERSION);
622MODULE_DESCRIPTION("Processor Clocking Control interface driver");
623MODULE_LICENSE("GPL");
624
625late_initcall(pcc_cpufreq_init);
626module_exit(pcc_cpufreq_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
index cb01dac267d3..b3379d6a5c57 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
@@ -13,7 +13,6 @@
13#include <linux/init.h> 13#include <linux/init.h>
14#include <linux/cpufreq.h> 14#include <linux/cpufreq.h>
15#include <linux/ioport.h> 15#include <linux/ioport.h>
16#include <linux/slab.h>
17#include <linux/timex.h> 16#include <linux/timex.h>
18#include <linux/io.h> 17#include <linux/io.h>
19 18
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
index 9a97116f89e5..4a45fd6e41ba 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
@@ -569,7 +569,7 @@ static int powernow_verify(struct cpufreq_policy *policy)
569 * We will then get the same kind of behaviour already tested under 569 * We will then get the same kind of behaviour already tested under
570 * the "well-known" other OS. 570 * the "well-known" other OS.
571 */ 571 */
572static int __init fixup_sgtc(void) 572static int __cpuinit fixup_sgtc(void)
573{ 573{
574 unsigned int sgtc; 574 unsigned int sgtc;
575 unsigned int m; 575 unsigned int m;
@@ -603,7 +603,7 @@ static unsigned int powernow_get(unsigned int cpu)
603} 603}
604 604
605 605
606static int __init acer_cpufreq_pst(const struct dmi_system_id *d) 606static int __cpuinit acer_cpufreq_pst(const struct dmi_system_id *d)
607{ 607{
608 printk(KERN_WARNING PFX 608 printk(KERN_WARNING PFX
609 "%s laptop with broken PST tables in BIOS detected.\n", 609 "%s laptop with broken PST tables in BIOS detected.\n",
@@ -621,7 +621,7 @@ static int __init acer_cpufreq_pst(const struct dmi_system_id *d)
621 * A BIOS update is all that can save them. 621 * A BIOS update is all that can save them.
622 * Mention this, and disable cpufreq. 622 * Mention this, and disable cpufreq.
623 */ 623 */
624static struct dmi_system_id __initdata powernow_dmi_table[] = { 624static struct dmi_system_id __cpuinitdata powernow_dmi_table[] = {
625 { 625 {
626 .callback = acer_cpufreq_pst, 626 .callback = acer_cpufreq_pst,
627 .ident = "Acer Aspire", 627 .ident = "Acer Aspire",
@@ -633,7 +633,7 @@ static struct dmi_system_id __initdata powernow_dmi_table[] = {
633 { } 633 { }
634}; 634};
635 635
636static int __init powernow_cpu_init(struct cpufreq_policy *policy) 636static int __cpuinit powernow_cpu_init(struct cpufreq_policy *policy)
637{ 637{
638 union msr_fidvidstatus fidvidstatus; 638 union msr_fidvidstatus fidvidstatus;
639 int result; 639 int result;
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index f125e5c551c0..491977baf6c0 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -1,6 +1,5 @@
1
2/* 1/*
3 * (c) 2003-2006 Advanced Micro Devices, Inc. 2 * (c) 2003-2010 Advanced Micro Devices, Inc.
4 * Your use of this code is subject to the terms and conditions of the 3 * Your use of this code is subject to the terms and conditions of the
5 * GNU general public license version 2. See "COPYING" or 4 * GNU general public license version 2. See "COPYING" or
6 * http://www.gnu.org/licenses/gpl.html 5 * http://www.gnu.org/licenses/gpl.html
@@ -10,7 +9,7 @@
10 * Based on the powernow-k7.c module written by Dave Jones. 9 * Based on the powernow-k7.c module written by Dave Jones.
11 * (C) 2003 Dave Jones on behalf of SuSE Labs 10 * (C) 2003 Dave Jones on behalf of SuSE Labs
12 * (C) 2004 Dominik Brodowski <linux@brodo.de> 11 * (C) 2004 Dominik Brodowski <linux@brodo.de>
13 * (C) 2004 Pavel Machek <pavel@suse.cz> 12 * (C) 2004 Pavel Machek <pavel@ucw.cz>
14 * Licensed under the terms of the GNU GPL License version 2. 13 * Licensed under the terms of the GNU GPL License version 2.
15 * Based upon datasheets & sample CPUs kindly provided by AMD. 14 * Based upon datasheets & sample CPUs kindly provided by AMD.
16 * 15 *
@@ -46,6 +45,7 @@
46#define PFX "powernow-k8: " 45#define PFX "powernow-k8: "
47#define VERSION "version 2.20.00" 46#define VERSION "version 2.20.00"
48#include "powernow-k8.h" 47#include "powernow-k8.h"
48#include "mperf.h"
49 49
50/* serialize freq changes */ 50/* serialize freq changes */
51static DEFINE_MUTEX(fidvid_mutex); 51static DEFINE_MUTEX(fidvid_mutex);
@@ -54,6 +54,12 @@ static DEFINE_PER_CPU(struct powernow_k8_data *, powernow_data);
54 54
55static int cpu_family = CPU_OPTERON; 55static int cpu_family = CPU_OPTERON;
56 56
57/* core performance boost */
58static bool cpb_capable, cpb_enabled;
59static struct msr __percpu *msrs;
60
61static struct cpufreq_driver cpufreq_amd64_driver;
62
57#ifndef CONFIG_SMP 63#ifndef CONFIG_SMP
58static inline const struct cpumask *cpu_core_mask(int cpu) 64static inline const struct cpumask *cpu_core_mask(int cpu)
59{ 65{
@@ -800,13 +806,15 @@ static int find_psb_table(struct powernow_k8_data *data)
800 * www.amd.com 806 * www.amd.com
801 */ 807 */
802 printk(KERN_ERR FW_BUG PFX "No PSB or ACPI _PSS objects\n"); 808 printk(KERN_ERR FW_BUG PFX "No PSB or ACPI _PSS objects\n");
809 printk(KERN_ERR PFX "Make sure that your BIOS is up to date"
810 " and Cool'N'Quiet support is enabled in BIOS setup\n");
803 return -ENODEV; 811 return -ENODEV;
804} 812}
805 813
806static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, 814static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data,
807 unsigned int index) 815 unsigned int index)
808{ 816{
809 acpi_integer control; 817 u64 control;
810 818
811 if (!data->acpi_data.state_count || (cpu_family == CPU_HW_PSTATE)) 819 if (!data->acpi_data.state_count || (cpu_family == CPU_HW_PSTATE))
812 return; 820 return;
@@ -824,7 +832,7 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
824{ 832{
825 struct cpufreq_frequency_table *powernow_table; 833 struct cpufreq_frequency_table *powernow_table;
826 int ret_val = -ENODEV; 834 int ret_val = -ENODEV;
827 acpi_integer control, status; 835 u64 control, status;
828 836
829 if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) { 837 if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) {
830 dprintk("register performance failed: bad ACPI data\n"); 838 dprintk("register performance failed: bad ACPI data\n");
@@ -904,8 +912,8 @@ static int fill_powernow_table_pstate(struct powernow_k8_data *data,
904{ 912{
905 int i; 913 int i;
906 u32 hi = 0, lo = 0; 914 u32 hi = 0, lo = 0;
907 rdmsr(MSR_PSTATE_CUR_LIMIT, hi, lo); 915 rdmsr(MSR_PSTATE_CUR_LIMIT, lo, hi);
908 data->max_hw_pstate = (hi & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT; 916 data->max_hw_pstate = (lo & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT;
909 917
910 for (i = 0; i < data->acpi_data.state_count; i++) { 918 for (i = 0; i < data->acpi_data.state_count; i++) {
911 u32 index; 919 u32 index;
@@ -929,7 +937,8 @@ static int fill_powernow_table_pstate(struct powernow_k8_data *data,
929 powernow_table[i].index = index; 937 powernow_table[i].index = index;
930 938
931 /* Frequency may be rounded for these */ 939 /* Frequency may be rounded for these */
932 if (boot_cpu_data.x86 == 0x10 || boot_cpu_data.x86 == 0x11) { 940 if ((boot_cpu_data.x86 == 0x10 && boot_cpu_data.x86_model < 10)
941 || boot_cpu_data.x86 == 0x11) {
933 powernow_table[i].frequency = 942 powernow_table[i].frequency =
934 freq_from_fid_did(lo & 0x3f, (lo >> 6) & 7); 943 freq_from_fid_did(lo & 0x3f, (lo >> 6) & 7);
935 } else 944 } else
@@ -948,7 +957,7 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data,
948 u32 fid; 957 u32 fid;
949 u32 vid; 958 u32 vid;
950 u32 freq, index; 959 u32 freq, index;
951 acpi_integer status, control; 960 u64 status, control;
952 961
953 if (data->exttype) { 962 if (data->exttype) {
954 status = data->acpi_data.states[i].status; 963 status = data->acpi_data.states[i].status;
@@ -1016,13 +1025,12 @@ static int get_transition_latency(struct powernow_k8_data *data)
1016 } 1025 }
1017 if (max_latency == 0) { 1026 if (max_latency == 0) {
1018 /* 1027 /*
1019 * Fam 11h always returns 0 as transition latency. 1028 * Fam 11h and later may return 0 as transition latency. This
1020 * This is intended and means "very fast". While cpufreq core 1029 * is intended and means "very fast". While cpufreq core and
1021 * and governors currently can handle that gracefully, better 1030 * governors currently can handle that gracefully, better set it
1022 * set it to 1 to avoid problems in the future. 1031 * to 1 to avoid problems in the future.
1023 * For all others it's a BIOS bug.
1024 */ 1032 */
1025 if (boot_cpu_data.x86 != 0x11) 1033 if (boot_cpu_data.x86 < 0x11)
1026 printk(KERN_ERR FW_WARN PFX "Invalid zero transition " 1034 printk(KERN_ERR FW_WARN PFX "Invalid zero transition "
1027 "latency\n"); 1035 "latency\n");
1028 max_latency = 1; 1036 max_latency = 1;
@@ -1248,6 +1256,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1248 struct powernow_k8_data *data; 1256 struct powernow_k8_data *data;
1249 struct init_on_cpu init_on_cpu; 1257 struct init_on_cpu init_on_cpu;
1250 int rc; 1258 int rc;
1259 struct cpuinfo_x86 *c = &cpu_data(pol->cpu);
1251 1260
1252 if (!cpu_online(pol->cpu)) 1261 if (!cpu_online(pol->cpu))
1253 return -ENODEV; 1262 return -ENODEV;
@@ -1322,6 +1331,10 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1322 return -EINVAL; 1331 return -EINVAL;
1323 } 1332 }
1324 1333
1334 /* Check for APERF/MPERF support in hardware */
1335 if (cpu_has(c, X86_FEATURE_APERFMPERF))
1336 cpufreq_amd64_driver.getavg = cpufreq_get_measured_perf;
1337
1325 cpufreq_frequency_table_get_attr(data->powernow_table, pol->cpu); 1338 cpufreq_frequency_table_get_attr(data->powernow_table, pol->cpu);
1326 1339
1327 if (cpu_family == CPU_HW_PSTATE) 1340 if (cpu_family == CPU_HW_PSTATE)
@@ -1356,6 +1369,7 @@ static int __devexit powernowk8_cpu_exit(struct cpufreq_policy *pol)
1356 1369
1357 kfree(data->powernow_table); 1370 kfree(data->powernow_table);
1358 kfree(data); 1371 kfree(data);
1372 per_cpu(powernow_data, pol->cpu) = NULL;
1359 1373
1360 return 0; 1374 return 0;
1361} 1375}
@@ -1375,7 +1389,7 @@ static unsigned int powernowk8_get(unsigned int cpu)
1375 int err; 1389 int err;
1376 1390
1377 if (!data) 1391 if (!data)
1378 return -EINVAL; 1392 return 0;
1379 1393
1380 smp_call_function_single(cpu, query_values_on_cpu, &err, true); 1394 smp_call_function_single(cpu, query_values_on_cpu, &err, true);
1381 if (err) 1395 if (err)
@@ -1392,8 +1406,77 @@ out:
1392 return khz; 1406 return khz;
1393} 1407}
1394 1408
1409static void _cpb_toggle_msrs(bool t)
1410{
1411 int cpu;
1412
1413 get_online_cpus();
1414
1415 rdmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs);
1416
1417 for_each_cpu(cpu, cpu_online_mask) {
1418 struct msr *reg = per_cpu_ptr(msrs, cpu);
1419 if (t)
1420 reg->l &= ~BIT(25);
1421 else
1422 reg->l |= BIT(25);
1423 }
1424 wrmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs);
1425
1426 put_online_cpus();
1427}
1428
1429/*
1430 * Switch on/off core performance boosting.
1431 *
1432 * 0=disable
1433 * 1=enable.
1434 */
1435static void cpb_toggle(bool t)
1436{
1437 if (!cpb_capable)
1438 return;
1439
1440 if (t && !cpb_enabled) {
1441 cpb_enabled = true;
1442 _cpb_toggle_msrs(t);
1443 printk(KERN_INFO PFX "Core Boosting enabled.\n");
1444 } else if (!t && cpb_enabled) {
1445 cpb_enabled = false;
1446 _cpb_toggle_msrs(t);
1447 printk(KERN_INFO PFX "Core Boosting disabled.\n");
1448 }
1449}
1450
1451static ssize_t store_cpb(struct cpufreq_policy *policy, const char *buf,
1452 size_t count)
1453{
1454 int ret = -EINVAL;
1455 unsigned long val = 0;
1456
1457 ret = strict_strtoul(buf, 10, &val);
1458 if (!ret && (val == 0 || val == 1) && cpb_capable)
1459 cpb_toggle(val);
1460 else
1461 return -EINVAL;
1462
1463 return count;
1464}
1465
1466static ssize_t show_cpb(struct cpufreq_policy *policy, char *buf)
1467{
1468 return sprintf(buf, "%u\n", cpb_enabled);
1469}
1470
1471#define define_one_rw(_name) \
1472static struct freq_attr _name = \
1473__ATTR(_name, 0644, show_##_name, store_##_name)
1474
1475define_one_rw(cpb);
1476
1395static struct freq_attr *powernow_k8_attr[] = { 1477static struct freq_attr *powernow_k8_attr[] = {
1396 &cpufreq_freq_attr_scaling_available_freqs, 1478 &cpufreq_freq_attr_scaling_available_freqs,
1479 &cpb,
1397 NULL, 1480 NULL,
1398}; 1481};
1399 1482
@@ -1409,10 +1492,51 @@ static struct cpufreq_driver cpufreq_amd64_driver = {
1409 .attr = powernow_k8_attr, 1492 .attr = powernow_k8_attr,
1410}; 1493};
1411 1494
1495/*
1496 * Clear the boost-disable flag on the CPU_DOWN path so that this cpu
1497 * cannot block the remaining ones from boosting. On the CPU_UP path we
1498 * simply keep the boost-disable flag in sync with the current global
1499 * state.
1500 */
1501static int cpb_notify(struct notifier_block *nb, unsigned long action,
1502 void *hcpu)
1503{
1504 unsigned cpu = (long)hcpu;
1505 u32 lo, hi;
1506
1507 switch (action) {
1508 case CPU_UP_PREPARE:
1509 case CPU_UP_PREPARE_FROZEN:
1510
1511 if (!cpb_enabled) {
1512 rdmsr_on_cpu(cpu, MSR_K7_HWCR, &lo, &hi);
1513 lo |= BIT(25);
1514 wrmsr_on_cpu(cpu, MSR_K7_HWCR, lo, hi);
1515 }
1516 break;
1517
1518 case CPU_DOWN_PREPARE:
1519 case CPU_DOWN_PREPARE_FROZEN:
1520 rdmsr_on_cpu(cpu, MSR_K7_HWCR, &lo, &hi);
1521 lo &= ~BIT(25);
1522 wrmsr_on_cpu(cpu, MSR_K7_HWCR, lo, hi);
1523 break;
1524
1525 default:
1526 break;
1527 }
1528
1529 return NOTIFY_OK;
1530}
1531
1532static struct notifier_block cpb_nb = {
1533 .notifier_call = cpb_notify,
1534};
1535
1412/* driver entry point for init */ 1536/* driver entry point for init */
1413static int __cpuinit powernowk8_init(void) 1537static int __cpuinit powernowk8_init(void)
1414{ 1538{
1415 unsigned int i, supported_cpus = 0; 1539 unsigned int i, supported_cpus = 0, cpu;
1416 1540
1417 for_each_online_cpu(i) { 1541 for_each_online_cpu(i) {
1418 int rc; 1542 int rc;
@@ -1421,15 +1545,36 @@ static int __cpuinit powernowk8_init(void)
1421 supported_cpus++; 1545 supported_cpus++;
1422 } 1546 }
1423 1547
1424 if (supported_cpus == num_online_cpus()) { 1548 if (supported_cpus != num_online_cpus())
1425 printk(KERN_INFO PFX "Found %d %s " 1549 return -ENODEV;
1426 "processors (%d cpu cores) (" VERSION ")\n", 1550
1427 num_online_nodes(), 1551 printk(KERN_INFO PFX "Found %d %s (%d cpu cores) (" VERSION ")\n",
1428 boot_cpu_data.x86_model_id, supported_cpus); 1552 num_online_nodes(), boot_cpu_data.x86_model_id, supported_cpus);
1429 return cpufreq_register_driver(&cpufreq_amd64_driver); 1553
1554 if (boot_cpu_has(X86_FEATURE_CPB)) {
1555
1556 cpb_capable = true;
1557
1558 register_cpu_notifier(&cpb_nb);
1559
1560 msrs = msrs_alloc();
1561 if (!msrs) {
1562 printk(KERN_ERR "%s: Error allocating msrs!\n", __func__);
1563 return -ENOMEM;
1564 }
1565
1566 rdmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs);
1567
1568 for_each_cpu(cpu, cpu_online_mask) {
1569 struct msr *reg = per_cpu_ptr(msrs, cpu);
1570 cpb_enabled |= !(!!(reg->l & BIT(25)));
1571 }
1572
1573 printk(KERN_INFO PFX "Core Performance Boosting: %s.\n",
1574 (cpb_enabled ? "on" : "off"));
1430 } 1575 }
1431 1576
1432 return -ENODEV; 1577 return cpufreq_register_driver(&cpufreq_amd64_driver);
1433} 1578}
1434 1579
1435/* driver entry point for term */ 1580/* driver entry point for term */
@@ -1437,6 +1582,13 @@ static void __exit powernowk8_exit(void)
1437{ 1582{
1438 dprintk("exit\n"); 1583 dprintk("exit\n");
1439 1584
1585 if (boot_cpu_has(X86_FEATURE_CPB)) {
1586 msrs_free(msrs);
1587 msrs = NULL;
1588
1589 unregister_cpu_notifier(&cpb_nb);
1590 }
1591
1440 cpufreq_unregister_driver(&cpufreq_amd64_driver); 1592 cpufreq_unregister_driver(&cpufreq_amd64_driver);
1441} 1593}
1442 1594
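The core-performance-boost (CPB) support added to powernow-k8 above works by toggling the boost-disable bit, BIT(25), in MSR_K7_HWCR on every online CPU, keeping hotplugged CPUs in sync through cpb_notify(), and exposing a read/write "cpb" attribute next to the other cpufreq sysfs files. Below is a minimal user-space sketch for inspecting that bit; it assumes the standard msr character device (/dev/cpu/N/msr, where the read offset is the MSR number) and the MSR_K7_HWCR index 0xc0010015 from msr-index.h, and it needs root plus the msr driver loaded.

/*
 * Illustrative only: read MSR_K7_HWCR (0xc0010015) on CPU 0 through the
 * msr character device and report whether the boost-disable bit (25) is
 * set -- the same bit that _cpb_toggle_msrs()/cpb_notify() above keep in
 * sync across CPUs. Requires root and msr.ko.
 */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

#define MSR_K7_HWCR	0xc0010015
#define CPB_DISABLE_BIT	(1ULL << 25)

int main(void)
{
	uint64_t hwcr;
	int fd = open("/dev/cpu/0/msr", O_RDONLY);

	if (fd < 0) {
		perror("open /dev/cpu/0/msr");
		return 1;
	}
	if (pread(fd, &hwcr, sizeof(hwcr), MSR_K7_HWCR) != sizeof(hwcr)) {
		perror("pread");
		close(fd);
		return 1;
	}
	close(fd);

	printf("HWCR = %#018llx, core boost %s\n",
	       (unsigned long long)hwcr,
	       (hwcr & CPB_DISABLE_BIT) ? "disabled" : "enabled");
	return 0;
}

With this patch applied, the same state should also be reachable through the new sysfs file, e.g. "echo 0 > /sys/devices/system/cpu/cpu0/cpufreq/cpb" (path per the standard cpufreq sysfs layout), which lands in store_cpb() above.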
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
index 02ce824073cb..df3529b1c02d 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
@@ -5,7 +5,6 @@
5 * http://www.gnu.org/licenses/gpl.html 5 * http://www.gnu.org/licenses/gpl.html
6 */ 6 */
7 7
8
9enum pstate { 8enum pstate {
10 HW_PSTATE_INVALID = 0xff, 9 HW_PSTATE_INVALID = 0xff,
11 HW_PSTATE_0 = 0, 10 HW_PSTATE_0 = 0,
@@ -55,7 +54,6 @@ struct powernow_k8_data {
55 struct cpumask *available_cores; 54 struct cpumask *available_cores;
56}; 55};
57 56
58
59/* processor's cpuid instruction support */ 57/* processor's cpuid instruction support */
60#define CPUID_PROCESSOR_SIGNATURE 1 /* function 1 */ 58#define CPUID_PROCESSOR_SIGNATURE 1 /* function 1 */
61#define CPUID_XFAM 0x0ff00000 /* extended family */ 59#define CPUID_XFAM 0x0ff00000 /* extended family */
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
index 8d672ef162ce..9b1ff37de46a 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
@@ -20,6 +20,7 @@
20#include <linux/sched.h> /* current */ 20#include <linux/sched.h> /* current */
21#include <linux/delay.h> 21#include <linux/delay.h>
22#include <linux/compiler.h> 22#include <linux/compiler.h>
23#include <linux/gfp.h>
23 24
24#include <asm/msr.h> 25#include <asm/msr.h>
25#include <asm/processor.h> 26#include <asm/processor.h>
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
index 2ce8e0b5cc54..561758e95180 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
@@ -23,7 +23,6 @@
23#include <linux/init.h> 23#include <linux/init.h>
24#include <linux/cpufreq.h> 24#include <linux/cpufreq.h>
25#include <linux/pci.h> 25#include <linux/pci.h>
26#include <linux/slab.h>
27#include <linux/sched.h> 26#include <linux/sched.h>
28 27
29#include "speedstep-lib.h" 28#include "speedstep-lib.h"
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
index ad0083abfa23..a94ec6be69fa 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
@@ -13,7 +13,6 @@
13#include <linux/moduleparam.h> 13#include <linux/moduleparam.h>
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/cpufreq.h> 15#include <linux/cpufreq.h>
16#include <linux/slab.h>
17 16
18#include <asm/msr.h> 17#include <asm/msr.h>
19#include <asm/tsc.h> 18#include <asm/tsc.h>
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
index 04d73c114e49..8abd869baabf 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
@@ -17,7 +17,6 @@
17#include <linux/moduleparam.h> 17#include <linux/moduleparam.h>
18#include <linux/init.h> 18#include <linux/init.h>
19#include <linux/cpufreq.h> 19#include <linux/cpufreq.h>
20#include <linux/slab.h>
21#include <linux/delay.h> 20#include <linux/delay.h>
22#include <linux/io.h> 21#include <linux/io.h>
23#include <asm/ist.h> 22#include <asm/ist.h>
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index 08be922de33a..8095f8611f8a 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -21,37 +21,58 @@
21 * 21 *
22 */ 22 */
23 23
24#include <linux/module.h>
24#include <asm/processor.h> 25#include <asm/processor.h>
25#include <asm/vmware.h>
26#include <asm/hypervisor.h> 26#include <asm/hypervisor.h>
27 27
28static inline void __cpuinit 28/*
29detect_hypervisor_vendor(struct cpuinfo_x86 *c) 29 * Hypervisor detect order. This is specified explicitly here because
30 * some hypervisors might implement compatibility modes for other
31 * hypervisors and therefore need to be detected in specific sequence.
32 */
33static const __initconst struct hypervisor_x86 * const hypervisors[] =
30{ 34{
31 if (vmware_platform()) 35 &x86_hyper_vmware,
32 c->x86_hyper_vendor = X86_HYPER_VENDOR_VMWARE; 36 &x86_hyper_ms_hyperv,
33 else 37#ifdef CONFIG_XEN_PVHVM
34 c->x86_hyper_vendor = X86_HYPER_VENDOR_NONE; 38 &x86_hyper_xen_hvm,
35} 39#endif
40};
36 41
37static inline void __cpuinit 42const struct hypervisor_x86 *x86_hyper;
38hypervisor_set_feature_bits(struct cpuinfo_x86 *c) 43EXPORT_SYMBOL(x86_hyper);
44
45static inline void __init
46detect_hypervisor_vendor(void)
39{ 47{
40 if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE) { 48 const struct hypervisor_x86 *h, * const *p;
41 vmware_set_feature_bits(c); 49
42 return; 50 for (p = hypervisors; p < hypervisors + ARRAY_SIZE(hypervisors); p++) {
51 h = *p;
52 if (h->detect()) {
53 x86_hyper = h;
54 printk(KERN_INFO "Hypervisor detected: %s\n", h->name);
55 break;
56 }
43 } 57 }
44} 58}
45 59
46void __cpuinit init_hypervisor(struct cpuinfo_x86 *c) 60void __cpuinit init_hypervisor(struct cpuinfo_x86 *c)
47{ 61{
48 detect_hypervisor_vendor(c); 62 if (x86_hyper && x86_hyper->set_cpu_features)
49 hypervisor_set_feature_bits(c); 63 x86_hyper->set_cpu_features(c);
50} 64}
51 65
52void __init init_hypervisor_platform(void) 66void __init init_hypervisor_platform(void)
53{ 67{
68
69 detect_hypervisor_vendor();
70
71 if (!x86_hyper)
72 return;
73
54 init_hypervisor(&boot_cpu_data); 74 init_hypervisor(&boot_cpu_data);
55 if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE) 75
56 vmware_platform_setup(); 76 if (x86_hyper->init_platform)
77 x86_hyper->init_platform();
57} 78}
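The hypervisor.c rework replaces per-vendor special cases with a fixed-order table of detectors: the first ->detect() callback that reports a match wins, which is why the new comment stresses that the array order matters when one hypervisor emulates another. The standalone sketch below reproduces only that first-match loop; the detect_* functions are dummies standing in for the real CPUID/signature probes, and the return values are made up for the example.

/*
 * Illustrative only: the first-match-wins loop from
 * detect_hypervisor_vendor() above, reduced to a standalone program.
 * The detect() callbacks are dummies; array order is detection priority.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct hyper {
	const char *name;
	bool (*detect)(void);
};

static bool detect_vmware(void)  { return false; }
static bool detect_hyperv(void)  { return false; }
static bool detect_xen_hvm(void) { return true; }	/* pretend we run on Xen */

static const struct hyper hypervisors[] = {
	{ "VMware",     detect_vmware  },
	{ "MS Hyper-V", detect_hyperv  },
	{ "Xen HVM",    detect_xen_hvm },
};

int main(void)
{
	const struct hyper *found = NULL;
	size_t i;

	for (i = 0; i < sizeof(hypervisors) / sizeof(hypervisors[0]); i++) {
		if (hypervisors[i].detect()) {
			found = &hypervisors[i];
			break;
		}
	}

	printf("Hypervisor detected: %s\n", found ? found->name : "none");
	return 0;
}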
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 879666f4d871..b4389441efbb 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -12,7 +12,6 @@
12#include <asm/processor.h> 12#include <asm/processor.h>
13#include <asm/pgtable.h> 13#include <asm/pgtable.h>
14#include <asm/msr.h> 14#include <asm/msr.h>
15#include <asm/ds.h>
16#include <asm/bugs.h> 15#include <asm/bugs.h>
17#include <asm/cpu.h> 16#include <asm/cpu.h>
18 17
@@ -40,6 +39,7 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
40 misc_enable &= ~MSR_IA32_MISC_ENABLE_LIMIT_CPUID; 39 misc_enable &= ~MSR_IA32_MISC_ENABLE_LIMIT_CPUID;
41 wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable); 40 wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
42 c->cpuid_level = cpuid_eax(0); 41 c->cpuid_level = cpuid_eax(0);
42 get_cpu_cap(c);
43 } 43 }
44 } 44 }
45 45
@@ -47,6 +47,27 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
47 (c->x86 == 0x6 && c->x86_model >= 0x0e)) 47 (c->x86 == 0x6 && c->x86_model >= 0x0e))
48 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); 48 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
49 49
50 /*
51 * Atom erratum AAE44/AAF40/AAG38/AAH41:
52 *
53 * A race condition between speculative fetches and invalidating
54 * a large page. This is worked around in microcode, but we
55 * need the microcode to have already been loaded... so if it is
56 * not, recommend a BIOS update and disable large pages.
57 */
58 if (c->x86 == 6 && c->x86_model == 0x1c && c->x86_mask <= 2) {
59 u32 ucode, junk;
60
61 wrmsr(MSR_IA32_UCODE_REV, 0, 0);
62 sync_core();
63 rdmsr(MSR_IA32_UCODE_REV, junk, ucode);
64
65 if (ucode < 0x20e) {
66 printk(KERN_WARNING "Atom PSE erratum detected, BIOS microcode update recommended\n");
67 clear_cpu_cap(c, X86_FEATURE_PSE);
68 }
69 }
70
50#ifdef CONFIG_X86_64 71#ifdef CONFIG_X86_64
51 set_cpu_cap(c, X86_FEATURE_SYSENTER32); 72 set_cpu_cap(c, X86_FEATURE_SYSENTER32);
52#else 73#else
@@ -70,7 +91,8 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
70 if (c->x86_power & (1 << 8)) { 91 if (c->x86_power & (1 << 8)) {
71 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); 92 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
72 set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); 93 set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
73 sched_clock_stable = 1; 94 if (!check_tsc_unstable())
95 sched_clock_stable = 1;
74 } 96 }
75 97
76 /* 98 /*
@@ -351,12 +373,6 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
351 set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); 373 set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
352 } 374 }
353 375
354 if (c->cpuid_level > 6) {
355 unsigned ecx = cpuid_ecx(6);
356 if (ecx & 0x01)
357 set_cpu_cap(c, X86_FEATURE_APERFMPERF);
358 }
359
360 if (cpu_has_xmm2) 376 if (cpu_has_xmm2)
361 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); 377 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
362 if (cpu_has_ds) { 378 if (cpu_has_ds) {
@@ -366,7 +382,6 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
366 set_cpu_cap(c, X86_FEATURE_BTS); 382 set_cpu_cap(c, X86_FEATURE_BTS);
367 if (!(l1 & (1<<12))) 383 if (!(l1 & (1<<12)))
368 set_cpu_cap(c, X86_FEATURE_PEBS); 384 set_cpu_cap(c, X86_FEATURE_PEBS);
369 ds_init_intel(c);
370 } 385 }
371 386
372 if (c->x86 == 6 && c->x86_model == 29 && cpu_has_clflush) 387 if (c->x86 == 6 && c->x86_model == 29 && cpu_has_clflush)
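The Atom erratum hunk in intel.c gates its PSE workaround on an exact family/model, an early stepping, and a microcode revision below 0x20e read back from MSR_IA32_UCODE_REV. The sketch below restates that predicate as a pure function exercised with made-up revision values; only the constants visible in the hunk are used, and struct cpu_id is a stand-in for cpuinfo_x86, not a kernel type.

/*
 * Illustrative only: the Atom AAE44/AAF40/AAG38/AAH41 test from
 * early_init_intel() above as a pure function. Family 6, model 0x1c,
 * stepping <= 2 and microcode < 0x20e trigger the PSE workaround.
 */
#include <stdbool.h>
#include <stdio.h>

struct cpu_id {
	unsigned family, model, stepping;
	unsigned ucode_rev;		/* as read from MSR_IA32_UCODE_REV */
};

static bool atom_pse_erratum(const struct cpu_id *c)
{
	return c->family == 6 && c->model == 0x1c && c->stepping <= 2 &&
	       c->ucode_rev < 0x20e;
}

int main(void)
{
	struct cpu_id old_ucode = { 6, 0x1c, 2, 0x208 };	/* hypothetical */
	struct cpu_id new_ucode = { 6, 0x1c, 2, 0x20e };

	printf("old microcode:     %s large pages\n",
	       atom_pse_erratum(&old_ucode) ? "disable" : "keep");
	printf("updated microcode: %s large pages\n",
	       atom_pse_erratum(&new_ucode) ? "disable" : "keep");
	return 0;
}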
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index fc6c8ef92dcc..898c2f4eab88 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -18,6 +18,7 @@
18#include <asm/processor.h> 18#include <asm/processor.h>
19#include <linux/smp.h> 19#include <linux/smp.h>
20#include <asm/k8.h> 20#include <asm/k8.h>
21#include <asm/smp.h>
21 22
22#define LVL_1_INST 1 23#define LVL_1_INST 1
23#define LVL_1_DATA 2 24#define LVL_1_DATA 2
@@ -31,6 +32,8 @@ struct _cache_table {
31 short size; 32 short size;
32}; 33};
33 34
35#define MB(x) ((x) * 1024)
36
34/* All the cache descriptor types we care about (no TLB or 37/* All the cache descriptor types we care about (no TLB or
35 trace cache entries) */ 38 trace cache entries) */
36 39
@@ -44,9 +47,9 @@ static const struct _cache_table __cpuinitconst cache_table[] =
44 { 0x0d, LVL_1_DATA, 16 }, /* 4-way set assoc, 64 byte line size */ 47 { 0x0d, LVL_1_DATA, 16 }, /* 4-way set assoc, 64 byte line size */
45 { 0x21, LVL_2, 256 }, /* 8-way set assoc, 64 byte line size */ 48 { 0x21, LVL_2, 256 }, /* 8-way set assoc, 64 byte line size */
46 { 0x22, LVL_3, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */ 49 { 0x22, LVL_3, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */
47 { 0x23, LVL_3, 1024 }, /* 8-way set assoc, sectored cache, 64 byte line size */ 50 { 0x23, LVL_3, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */
48 { 0x25, LVL_3, 2048 }, /* 8-way set assoc, sectored cache, 64 byte line size */ 51 { 0x25, LVL_3, MB(2) }, /* 8-way set assoc, sectored cache, 64 byte line size */
49 { 0x29, LVL_3, 4096 }, /* 8-way set assoc, sectored cache, 64 byte line size */ 52 { 0x29, LVL_3, MB(4) }, /* 8-way set assoc, sectored cache, 64 byte line size */
50 { 0x2c, LVL_1_DATA, 32 }, /* 8-way set assoc, 64 byte line size */ 53 { 0x2c, LVL_1_DATA, 32 }, /* 8-way set assoc, 64 byte line size */
51 { 0x30, LVL_1_INST, 32 }, /* 8-way set assoc, 64 byte line size */ 54 { 0x30, LVL_1_INST, 32 }, /* 8-way set assoc, 64 byte line size */
52 { 0x39, LVL_2, 128 }, /* 4-way set assoc, sectored cache, 64 byte line size */ 55 { 0x39, LVL_2, 128 }, /* 4-way set assoc, sectored cache, 64 byte line size */
@@ -59,16 +62,16 @@ static const struct _cache_table __cpuinitconst cache_table[] =
59 { 0x41, LVL_2, 128 }, /* 4-way set assoc, 32 byte line size */ 62 { 0x41, LVL_2, 128 }, /* 4-way set assoc, 32 byte line size */
60 { 0x42, LVL_2, 256 }, /* 4-way set assoc, 32 byte line size */ 63 { 0x42, LVL_2, 256 }, /* 4-way set assoc, 32 byte line size */
61 { 0x43, LVL_2, 512 }, /* 4-way set assoc, 32 byte line size */ 64 { 0x43, LVL_2, 512 }, /* 4-way set assoc, 32 byte line size */
62 { 0x44, LVL_2, 1024 }, /* 4-way set assoc, 32 byte line size */ 65 { 0x44, LVL_2, MB(1) }, /* 4-way set assoc, 32 byte line size */
63 { 0x45, LVL_2, 2048 }, /* 4-way set assoc, 32 byte line size */ 66 { 0x45, LVL_2, MB(2) }, /* 4-way set assoc, 32 byte line size */
64 { 0x46, LVL_3, 4096 }, /* 4-way set assoc, 64 byte line size */ 67 { 0x46, LVL_3, MB(4) }, /* 4-way set assoc, 64 byte line size */
65 { 0x47, LVL_3, 8192 }, /* 8-way set assoc, 64 byte line size */ 68 { 0x47, LVL_3, MB(8) }, /* 8-way set assoc, 64 byte line size */
66 { 0x49, LVL_3, 4096 }, /* 16-way set assoc, 64 byte line size */ 69 { 0x49, LVL_3, MB(4) }, /* 16-way set assoc, 64 byte line size */
67 { 0x4a, LVL_3, 6144 }, /* 12-way set assoc, 64 byte line size */ 70 { 0x4a, LVL_3, MB(6) }, /* 12-way set assoc, 64 byte line size */
68 { 0x4b, LVL_3, 8192 }, /* 16-way set assoc, 64 byte line size */ 71 { 0x4b, LVL_3, MB(8) }, /* 16-way set assoc, 64 byte line size */
69 { 0x4c, LVL_3, 12288 }, /* 12-way set assoc, 64 byte line size */ 72 { 0x4c, LVL_3, MB(12) }, /* 12-way set assoc, 64 byte line size */
70 { 0x4d, LVL_3, 16384 }, /* 16-way set assoc, 64 byte line size */ 73 { 0x4d, LVL_3, MB(16) }, /* 16-way set assoc, 64 byte line size */
71 { 0x4e, LVL_2, 6144 }, /* 24-way set assoc, 64 byte line size */ 74 { 0x4e, LVL_2, MB(6) }, /* 24-way set assoc, 64 byte line size */
72 { 0x60, LVL_1_DATA, 16 }, /* 8-way set assoc, sectored cache, 64 byte line size */ 75 { 0x60, LVL_1_DATA, 16 }, /* 8-way set assoc, sectored cache, 64 byte line size */
73 { 0x66, LVL_1_DATA, 8 }, /* 4-way set assoc, sectored cache, 64 byte line size */ 76 { 0x66, LVL_1_DATA, 8 }, /* 4-way set assoc, sectored cache, 64 byte line size */
74 { 0x67, LVL_1_DATA, 16 }, /* 4-way set assoc, sectored cache, 64 byte line size */ 77 { 0x67, LVL_1_DATA, 16 }, /* 4-way set assoc, sectored cache, 64 byte line size */
@@ -77,34 +80,34 @@ static const struct _cache_table __cpuinitconst cache_table[] =
77 { 0x71, LVL_TRACE, 16 }, /* 8-way set assoc */ 80 { 0x71, LVL_TRACE, 16 }, /* 8-way set assoc */
78 { 0x72, LVL_TRACE, 32 }, /* 8-way set assoc */ 81 { 0x72, LVL_TRACE, 32 }, /* 8-way set assoc */
79 { 0x73, LVL_TRACE, 64 }, /* 8-way set assoc */ 82 { 0x73, LVL_TRACE, 64 }, /* 8-way set assoc */
80 { 0x78, LVL_2, 1024 }, /* 4-way set assoc, 64 byte line size */ 83 { 0x78, LVL_2, MB(1) }, /* 4-way set assoc, 64 byte line size */
81 { 0x79, LVL_2, 128 }, /* 8-way set assoc, sectored cache, 64 byte line size */ 84 { 0x79, LVL_2, 128 }, /* 8-way set assoc, sectored cache, 64 byte line size */
82 { 0x7a, LVL_2, 256 }, /* 8-way set assoc, sectored cache, 64 byte line size */ 85 { 0x7a, LVL_2, 256 }, /* 8-way set assoc, sectored cache, 64 byte line size */
83 { 0x7b, LVL_2, 512 }, /* 8-way set assoc, sectored cache, 64 byte line size */ 86 { 0x7b, LVL_2, 512 }, /* 8-way set assoc, sectored cache, 64 byte line size */
84 { 0x7c, LVL_2, 1024 }, /* 8-way set assoc, sectored cache, 64 byte line size */ 87 { 0x7c, LVL_2, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */
85 { 0x7d, LVL_2, 2048 }, /* 8-way set assoc, 64 byte line size */ 88 { 0x7d, LVL_2, MB(2) }, /* 8-way set assoc, 64 byte line size */
86 { 0x7f, LVL_2, 512 }, /* 2-way set assoc, 64 byte line size */ 89 { 0x7f, LVL_2, 512 }, /* 2-way set assoc, 64 byte line size */
87 { 0x82, LVL_2, 256 }, /* 8-way set assoc, 32 byte line size */ 90 { 0x82, LVL_2, 256 }, /* 8-way set assoc, 32 byte line size */
88 { 0x83, LVL_2, 512 }, /* 8-way set assoc, 32 byte line size */ 91 { 0x83, LVL_2, 512 }, /* 8-way set assoc, 32 byte line size */
89 { 0x84, LVL_2, 1024 }, /* 8-way set assoc, 32 byte line size */ 92 { 0x84, LVL_2, MB(1) }, /* 8-way set assoc, 32 byte line size */
90 { 0x85, LVL_2, 2048 }, /* 8-way set assoc, 32 byte line size */ 93 { 0x85, LVL_2, MB(2) }, /* 8-way set assoc, 32 byte line size */
91 { 0x86, LVL_2, 512 }, /* 4-way set assoc, 64 byte line size */ 94 { 0x86, LVL_2, 512 }, /* 4-way set assoc, 64 byte line size */
92 { 0x87, LVL_2, 1024 }, /* 8-way set assoc, 64 byte line size */ 95 { 0x87, LVL_2, MB(1) }, /* 8-way set assoc, 64 byte line size */
93 { 0xd0, LVL_3, 512 }, /* 4-way set assoc, 64 byte line size */ 96 { 0xd0, LVL_3, 512 }, /* 4-way set assoc, 64 byte line size */
94 { 0xd1, LVL_3, 1024 }, /* 4-way set assoc, 64 byte line size */ 97 { 0xd1, LVL_3, MB(1) }, /* 4-way set assoc, 64 byte line size */
95 { 0xd2, LVL_3, 2048 }, /* 4-way set assoc, 64 byte line size */ 98 { 0xd2, LVL_3, MB(2) }, /* 4-way set assoc, 64 byte line size */
96 { 0xd6, LVL_3, 1024 }, /* 8-way set assoc, 64 byte line size */ 99 { 0xd6, LVL_3, MB(1) }, /* 8-way set assoc, 64 byte line size */
97 { 0xd7, LVL_3, 2048 }, /* 8-way set assoc, 64 byte line size */ 100 { 0xd7, LVL_3, MB(2) }, /* 8-way set assoc, 64 byte line size */
98 { 0xd8, LVL_3, 4096 }, /* 12-way set assoc, 64 byte line size */ 101 { 0xd8, LVL_3, MB(4) }, /* 12-way set assoc, 64 byte line size */
99 { 0xdc, LVL_3, 2048 }, /* 12-way set assoc, 64 byte line size */ 102 { 0xdc, LVL_3, MB(2) }, /* 12-way set assoc, 64 byte line size */
100 { 0xdd, LVL_3, 4096 }, /* 12-way set assoc, 64 byte line size */ 103 { 0xdd, LVL_3, MB(4) }, /* 12-way set assoc, 64 byte line size */
101 { 0xde, LVL_3, 8192 }, /* 12-way set assoc, 64 byte line size */ 104 { 0xde, LVL_3, MB(8) }, /* 12-way set assoc, 64 byte line size */
102 { 0xe2, LVL_3, 2048 }, /* 16-way set assoc, 64 byte line size */ 105 { 0xe2, LVL_3, MB(2) }, /* 16-way set assoc, 64 byte line size */
103 { 0xe3, LVL_3, 4096 }, /* 16-way set assoc, 64 byte line size */ 106 { 0xe3, LVL_3, MB(4) }, /* 16-way set assoc, 64 byte line size */
104 { 0xe4, LVL_3, 8192 }, /* 16-way set assoc, 64 byte line size */ 107 { 0xe4, LVL_3, MB(8) }, /* 16-way set assoc, 64 byte line size */
105 { 0xea, LVL_3, 12288 }, /* 24-way set assoc, 64 byte line size */ 108 { 0xea, LVL_3, MB(12) }, /* 24-way set assoc, 64 byte line size */
106 { 0xeb, LVL_3, 18432 }, /* 24-way set assoc, 64 byte line size */ 109 { 0xeb, LVL_3, MB(18) }, /* 24-way set assoc, 64 byte line size */
107 { 0xec, LVL_3, 24576 }, /* 24-way set assoc, 64 byte line size */ 110 { 0xec, LVL_3, MB(24) }, /* 24-way set assoc, 64 byte line size */
108 { 0x00, 0, 0} 111 { 0x00, 0, 0}
109}; 112};
110 113
@@ -145,12 +148,19 @@ union _cpuid4_leaf_ecx {
145 u32 full; 148 u32 full;
146}; 149};
147 150
151struct amd_l3_cache {
152 struct pci_dev *dev;
153 bool can_disable;
154 unsigned indices;
155 u8 subcaches[4];
156};
157
148struct _cpuid4_info { 158struct _cpuid4_info {
149 union _cpuid4_leaf_eax eax; 159 union _cpuid4_leaf_eax eax;
150 union _cpuid4_leaf_ebx ebx; 160 union _cpuid4_leaf_ebx ebx;
151 union _cpuid4_leaf_ecx ecx; 161 union _cpuid4_leaf_ecx ecx;
152 unsigned long size; 162 unsigned long size;
153 unsigned long can_disable; 163 struct amd_l3_cache *l3;
154 DECLARE_BITMAP(shared_cpu_map, NR_CPUS); 164 DECLARE_BITMAP(shared_cpu_map, NR_CPUS);
155}; 165};
156 166
@@ -160,7 +170,7 @@ struct _cpuid4_info_regs {
160 union _cpuid4_leaf_ebx ebx; 170 union _cpuid4_leaf_ebx ebx;
161 union _cpuid4_leaf_ecx ecx; 171 union _cpuid4_leaf_ecx ecx;
162 unsigned long size; 172 unsigned long size;
163 unsigned long can_disable; 173 struct amd_l3_cache *l3;
164}; 174};
165 175
166unsigned short num_cache_leaves; 176unsigned short num_cache_leaves;
@@ -290,22 +300,269 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
290 (ebx->split.ways_of_associativity + 1) - 1; 300 (ebx->split.ways_of_associativity + 1) - 1;
291} 301}
292 302
293static void __cpuinit 303struct _cache_attr {
294amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) 304 struct attribute attr;
305 ssize_t (*show)(struct _cpuid4_info *, char *);
306 ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count);
307};
308
309#ifdef CONFIG_CPU_SUP_AMD
310
311/*
312 * L3 cache descriptors
313 */
314static struct amd_l3_cache **__cpuinitdata l3_caches;
315
316static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3)
317{
318 unsigned int sc0, sc1, sc2, sc3;
319 u32 val = 0;
320
321 pci_read_config_dword(l3->dev, 0x1C4, &val);
322
323 /* calculate subcache sizes */
324 l3->subcaches[0] = sc0 = !(val & BIT(0));
325 l3->subcaches[1] = sc1 = !(val & BIT(4));
326 l3->subcaches[2] = sc2 = !(val & BIT(8)) + !(val & BIT(9));
327 l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13));
328
329 l3->indices = (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1;
330}
331
332static struct amd_l3_cache * __cpuinit amd_init_l3_cache(int node)
333{
334 struct amd_l3_cache *l3;
335 struct pci_dev *dev = node_to_k8_nb_misc(node);
336
337 l3 = kzalloc(sizeof(struct amd_l3_cache), GFP_ATOMIC);
338 if (!l3) {
339 printk(KERN_WARNING "Error allocating L3 struct\n");
340 return NULL;
341 }
342
343 l3->dev = dev;
344
345 amd_calc_l3_indices(l3);
346
347 return l3;
348}
349
350static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf,
351 int index)
295{ 352{
353 int node;
354
355 if (boot_cpu_data.x86 != 0x10)
356 return;
357
296 if (index < 3) 358 if (index < 3)
297 return; 359 return;
298 360
299 if (boot_cpu_data.x86 == 0x11) 361 /* see errata #382 and #388 */
362 if (boot_cpu_data.x86_model < 0x8)
300 return; 363 return;
301 364
302 /* see erratum #382 */ 365 if ((boot_cpu_data.x86_model == 0x8 ||
303 if ((boot_cpu_data.x86 == 0x10) && (boot_cpu_data.x86_model < 0x8)) 366 boot_cpu_data.x86_model == 0x9)
367 &&
368 boot_cpu_data.x86_mask < 0x1)
369 return;
370
371 /* not in virtualized environments */
372 if (num_k8_northbridges == 0)
304 return; 373 return;
305 374
306 this_leaf->can_disable = 1; 375 /*
376 * Strictly speaking, the amount in @size below is leaked since it is
377 * never freed but this is done only on shutdown so it doesn't matter.
378 */
379 if (!l3_caches) {
380 int size = num_k8_northbridges * sizeof(struct amd_l3_cache *);
381
382 l3_caches = kzalloc(size, GFP_ATOMIC);
383 if (!l3_caches)
384 return;
385 }
386
387 node = amd_get_nb_id(smp_processor_id());
388
389 if (!l3_caches[node]) {
390 l3_caches[node] = amd_init_l3_cache(node);
391 l3_caches[node]->can_disable = true;
392 }
393
394 WARN_ON(!l3_caches[node]);
395
396 this_leaf->l3 = l3_caches[node];
397}
398
399/*
400 * check whether a slot used for disabling an L3 index is occupied.
401 * @l3: L3 cache descriptor
402 * @slot: slot number (0..1)
403 *
404 * @returns: the disabled index if used or negative value if slot free.
405 */
406int amd_get_l3_disable_slot(struct amd_l3_cache *l3, unsigned slot)
407{
408 unsigned int reg = 0;
409
410 pci_read_config_dword(l3->dev, 0x1BC + slot * 4, &reg);
411
412 /* check whether this slot is activated already */
413 if (reg & (3UL << 30))
414 return reg & 0xfff;
415
416 return -1;
417}
418
419static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
420 unsigned int slot)
421{
422 int index;
423
424 if (!this_leaf->l3 || !this_leaf->l3->can_disable)
425 return -EINVAL;
426
427 index = amd_get_l3_disable_slot(this_leaf->l3, slot);
428 if (index >= 0)
429 return sprintf(buf, "%d\n", index);
430
431 return sprintf(buf, "FREE\n");
432}
433
434#define SHOW_CACHE_DISABLE(slot) \
435static ssize_t \
436show_cache_disable_##slot(struct _cpuid4_info *this_leaf, char *buf) \
437{ \
438 return show_cache_disable(this_leaf, buf, slot); \
439}
440SHOW_CACHE_DISABLE(0)
441SHOW_CACHE_DISABLE(1)
442
443static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu,
444 unsigned slot, unsigned long idx)
445{
446 int i;
447
448 idx |= BIT(30);
449
450 /*
451 * disable index in all 4 subcaches
452 */
453 for (i = 0; i < 4; i++) {
454 u32 reg = idx | (i << 20);
455
456 if (!l3->subcaches[i])
457 continue;
458
459 pci_write_config_dword(l3->dev, 0x1BC + slot * 4, reg);
460
461 /*
462 * We need to WBINVD on a core on the node containing the L3
463 * cache which indices we disable therefore a simple wbinvd()
464 * is not sufficient.
465 */
466 wbinvd_on_cpu(cpu);
467
468 reg |= BIT(31);
469 pci_write_config_dword(l3->dev, 0x1BC + slot * 4, reg);
470 }
307} 471}
308 472
473/*
474 * disable a L3 cache index by using a disable-slot
475 *
476 * @l3: L3 cache descriptor
477 * @cpu: A CPU on the node containing the L3 cache
478 * @slot: slot number (0..1)
479 * @index: index to disable
480 *
481 * @return: 0 on success, error status on failure
482 */
483int amd_set_l3_disable_slot(struct amd_l3_cache *l3, int cpu, unsigned slot,
484 unsigned long index)
485{
486 int ret = 0;
487
488#define SUBCACHE_MASK (3UL << 20)
489#define SUBCACHE_INDEX 0xfff
490
491 /*
492 * check whether this slot is already used or
493 * the index is already disabled
494 */
495 ret = amd_get_l3_disable_slot(l3, slot);
496 if (ret >= 0)
497 return -EINVAL;
498
499 /*
500 * check whether the other slot has disabled the
501 * same index already
502 */
503 if (index == amd_get_l3_disable_slot(l3, !slot))
504 return -EINVAL;
505
506 /* do not allow writes outside of allowed bits */
507 if ((index & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) ||
508 ((index & SUBCACHE_INDEX) > l3->indices))
509 return -EINVAL;
510
511 amd_l3_disable_index(l3, cpu, slot, index);
512
513 return 0;
514}
515
516static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
517 const char *buf, size_t count,
518 unsigned int slot)
519{
520 unsigned long val = 0;
521 int cpu, err = 0;
522
523 if (!capable(CAP_SYS_ADMIN))
524 return -EPERM;
525
526 if (!this_leaf->l3 || !this_leaf->l3->can_disable)
527 return -EINVAL;
528
529 cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
530
531 if (strict_strtoul(buf, 10, &val) < 0)
532 return -EINVAL;
533
534 err = amd_set_l3_disable_slot(this_leaf->l3, cpu, slot, val);
535 if (err) {
536 if (err == -EEXIST)
537 printk(KERN_WARNING "L3 disable slot %d in use!\n",
538 slot);
539 return err;
540 }
541 return count;
542}
543
544#define STORE_CACHE_DISABLE(slot) \
545static ssize_t \
546store_cache_disable_##slot(struct _cpuid4_info *this_leaf, \
547 const char *buf, size_t count) \
548{ \
549 return store_cache_disable(this_leaf, buf, count, slot); \
550}
551STORE_CACHE_DISABLE(0)
552STORE_CACHE_DISABLE(1)
553
554static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644,
555 show_cache_disable_0, store_cache_disable_0);
556static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644,
557 show_cache_disable_1, store_cache_disable_1);
558
559#else /* CONFIG_CPU_SUP_AMD */
560static void __cpuinit
561amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, int index)
562{
563};
564#endif /* CONFIG_CPU_SUP_AMD */
565
309static int 566static int
310__cpuinit cpuid4_cache_lookup_regs(int index, 567__cpuinit cpuid4_cache_lookup_regs(int index,
311 struct _cpuid4_info_regs *this_leaf) 568 struct _cpuid4_info_regs *this_leaf)
@@ -317,8 +574,7 @@ __cpuinit cpuid4_cache_lookup_regs(int index,
317 574
318 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { 575 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
319 amd_cpuid4(index, &eax, &ebx, &ecx); 576 amd_cpuid4(index, &eax, &ebx, &ecx);
320 if (boot_cpu_data.x86 >= 0x10) 577 amd_check_l3_disable(this_leaf, index);
321 amd_check_l3_disable(index, this_leaf);
322 } else { 578 } else {
323 cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); 579 cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx);
324 } 580 }
@@ -575,6 +831,7 @@ static void __cpuinit free_cache_attributes(unsigned int cpu)
575 for (i = 0; i < num_cache_leaves; i++) 831 for (i = 0; i < num_cache_leaves; i++)
576 cache_remove_shared_cpu_map(cpu, i); 832 cache_remove_shared_cpu_map(cpu, i);
577 833
834 kfree(per_cpu(ici_cpuid4_info, cpu)->l3);
578 kfree(per_cpu(ici_cpuid4_info, cpu)); 835 kfree(per_cpu(ici_cpuid4_info, cpu));
579 per_cpu(ici_cpuid4_info, cpu) = NULL; 836 per_cpu(ici_cpuid4_info, cpu) = NULL;
580} 837}
@@ -711,82 +968,6 @@ static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf)
711#define to_object(k) container_of(k, struct _index_kobject, kobj) 968#define to_object(k) container_of(k, struct _index_kobject, kobj)
712#define to_attr(a) container_of(a, struct _cache_attr, attr) 969#define to_attr(a) container_of(a, struct _cache_attr, attr)
713 970
714static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
715 unsigned int index)
716{
717 int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
718 int node = cpu_to_node(cpu);
719 struct pci_dev *dev = node_to_k8_nb_misc(node);
720 unsigned int reg = 0;
721
722 if (!this_leaf->can_disable)
723 return -EINVAL;
724
725 if (!dev)
726 return -EINVAL;
727
728 pci_read_config_dword(dev, 0x1BC + index * 4, &reg);
729 return sprintf(buf, "%x\n", reg);
730}
731
732#define SHOW_CACHE_DISABLE(index) \
733static ssize_t \
734show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf) \
735{ \
736 return show_cache_disable(this_leaf, buf, index); \
737}
738SHOW_CACHE_DISABLE(0)
739SHOW_CACHE_DISABLE(1)
740
741static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
742 const char *buf, size_t count, unsigned int index)
743{
744 int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
745 int node = cpu_to_node(cpu);
746 struct pci_dev *dev = node_to_k8_nb_misc(node);
747 unsigned long val = 0;
748 unsigned int scrubber = 0;
749
750 if (!this_leaf->can_disable)
751 return -EINVAL;
752
753 if (!capable(CAP_SYS_ADMIN))
754 return -EPERM;
755
756 if (!dev)
757 return -EINVAL;
758
759 if (strict_strtoul(buf, 10, &val) < 0)
760 return -EINVAL;
761
762 val |= 0xc0000000;
763
764 pci_read_config_dword(dev, 0x58, &scrubber);
765 scrubber &= ~0x1f000000;
766 pci_write_config_dword(dev, 0x58, scrubber);
767
768 pci_write_config_dword(dev, 0x1BC + index * 4, val & ~0x40000000);
769 wbinvd();
770 pci_write_config_dword(dev, 0x1BC + index * 4, val);
771 return count;
772}
773
774#define STORE_CACHE_DISABLE(index) \
775static ssize_t \
776store_cache_disable_##index(struct _cpuid4_info *this_leaf, \
777 const char *buf, size_t count) \
778{ \
779 return store_cache_disable(this_leaf, buf, count, index); \
780}
781STORE_CACHE_DISABLE(0)
782STORE_CACHE_DISABLE(1)
783
784struct _cache_attr {
785 struct attribute attr;
786 ssize_t (*show)(struct _cpuid4_info *, char *);
787 ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count);
788};
789
790#define define_one_ro(_name) \ 971#define define_one_ro(_name) \
791static struct _cache_attr _name = \ 972static struct _cache_attr _name = \
792 __ATTR(_name, 0444, show_##_name, NULL) 973 __ATTR(_name, 0444, show_##_name, NULL)
@@ -801,23 +982,28 @@ define_one_ro(size);
801define_one_ro(shared_cpu_map); 982define_one_ro(shared_cpu_map);
802define_one_ro(shared_cpu_list); 983define_one_ro(shared_cpu_list);
803 984
804static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644, 985#define DEFAULT_SYSFS_CACHE_ATTRS \
805 show_cache_disable_0, store_cache_disable_0); 986 &type.attr, \
806static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, 987 &level.attr, \
807 show_cache_disable_1, store_cache_disable_1); 988 &coherency_line_size.attr, \
989 &physical_line_partition.attr, \
990 &ways_of_associativity.attr, \
991 &number_of_sets.attr, \
992 &size.attr, \
993 &shared_cpu_map.attr, \
994 &shared_cpu_list.attr
808 995
809static struct attribute *default_attrs[] = { 996static struct attribute *default_attrs[] = {
810 &type.attr, 997 DEFAULT_SYSFS_CACHE_ATTRS,
811 &level.attr, 998 NULL
812 &coherency_line_size.attr, 999};
813 &physical_line_partition.attr, 1000
814 &ways_of_associativity.attr, 1001static struct attribute *default_l3_attrs[] = {
815 &number_of_sets.attr, 1002 DEFAULT_SYSFS_CACHE_ATTRS,
816 &size.attr, 1003#ifdef CONFIG_CPU_SUP_AMD
817 &shared_cpu_map.attr,
818 &shared_cpu_list.attr,
819 &cache_disable_0.attr, 1004 &cache_disable_0.attr,
820 &cache_disable_1.attr, 1005 &cache_disable_1.attr,
1006#endif
821 NULL 1007 NULL
822}; 1008};
823 1009
@@ -848,7 +1034,7 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr,
848 return ret; 1034 return ret;
849} 1035}
850 1036
851static struct sysfs_ops sysfs_ops = { 1037static const struct sysfs_ops sysfs_ops = {
852 .show = show, 1038 .show = show,
853 .store = store, 1039 .store = store,
854}; 1040};
@@ -908,6 +1094,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
908 unsigned int cpu = sys_dev->id; 1094 unsigned int cpu = sys_dev->id;
909 unsigned long i, j; 1095 unsigned long i, j;
910 struct _index_kobject *this_object; 1096 struct _index_kobject *this_object;
1097 struct _cpuid4_info *this_leaf;
911 int retval; 1098 int retval;
912 1099
913 retval = cpuid4_cache_sysfs_init(cpu); 1100 retval = cpuid4_cache_sysfs_init(cpu);
@@ -926,6 +1113,14 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
926 this_object = INDEX_KOBJECT_PTR(cpu, i); 1113 this_object = INDEX_KOBJECT_PTR(cpu, i);
927 this_object->cpu = cpu; 1114 this_object->cpu = cpu;
928 this_object->index = i; 1115 this_object->index = i;
1116
1117 this_leaf = CPUID4_INFO_IDX(cpu, i);
1118
1119 if (this_leaf->l3 && this_leaf->l3->can_disable)
1120 ktype_cache.default_attrs = default_l3_attrs;
1121 else
1122 ktype_cache.default_attrs = default_attrs;
1123
929 retval = kobject_init_and_add(&(this_object->kobj), 1124 retval = kobject_init_and_add(&(this_object->kobj),
930 &ktype_cache, 1125 &ktype_cache,
931 per_cpu(ici_cache_kobject, cpu), 1126 per_cpu(ici_cache_kobject, cpu),
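The new AMD L3 code above derives how many cache indices may be disabled from a handful of bits in northbridge register 0x1C4 that mark subcaches as disabled, then exposes two disable slots as the cache_disable_0/1 sysfs attributes. The sketch below replays the arithmetic of amd_calc_l3_indices() on a made-up register value (0, i.e. all subcaches present) so the resulting index range is easy to see; it copies the bit positions straight from the hunk and makes no claim beyond that.

/*
 * Illustrative only: the subcache-size arithmetic from
 * amd_calc_l3_indices() above, applied to a hypothetical value of the
 * northbridge register at 0x1C4. A clear bit means the corresponding
 * subcache portion is present; the usable disable-index range comes
 * from the largest subcache.
 */
#include <stdio.h>

#define BIT(n) (1U << (n))

static unsigned max_u(unsigned a, unsigned b) { return a > b ? a : b; }

int main(void)
{
	unsigned val = 0;		/* hypothetical: all subcaches enabled */
	unsigned sc0, sc1, sc2, sc3, indices;

	sc0 = !(val & BIT(0));
	sc1 = !(val & BIT(4));
	sc2 = !(val & BIT(8))  + !(val & BIT(9));
	sc3 = !(val & BIT(12)) + !(val & BIT(13));

	indices = (max_u(max_u(max_u(sc0, sc1), sc2), sc3) << 10) - 1;

	printf("subcaches: %u %u %u %u, disable index range: 0..%u\n",
	       sc0, sc1, sc2, sc3, indices);
	return 0;
}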
diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile
index 4ac6d48fe11b..bb34b03af252 100644
--- a/arch/x86/kernel/cpu/mcheck/Makefile
+++ b/arch/x86/kernel/cpu/mcheck/Makefile
@@ -7,3 +7,5 @@ obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
7obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o 7obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o
8 8
9obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o 9obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o
10
11obj-$(CONFIG_ACPI_APEI) += mce-apei.o
diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c
new file mode 100644
index 000000000000..8209472b27a5
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c
@@ -0,0 +1,138 @@
1/*
2 * Bridge between MCE and APEI
3 *
4 * On some machine, corrected memory errors are reported via APEI
5 * generic hardware error source (GHES) instead of corrected Machine
6 * Check. These corrected memory errors can be reported to user space
7 * through /dev/mcelog via faking a corrected Machine Check, so that
8 * the error memory page can be offlined by /sbin/mcelog if the error
9 * count for one page is beyond the threshold.
10 *
11 * For fatal MCE, save MCE record into persistent storage via ERST, so
12 * that the MCE record can be logged after reboot via ERST.
13 *
14 * Copyright 2010 Intel Corp.
15 * Author: Huang Ying <ying.huang@intel.com>
16 *
17 * This program is free software; you can redistribute it and/or
18 * modify it under the terms of the GNU General Public License version
19 * 2 as published by the Free Software Foundation.
20 *
21 * This program is distributed in the hope that it will be useful,
22 * but WITHOUT ANY WARRANTY; without even the implied warranty of
23 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24 * GNU General Public License for more details.
25 *
26 * You should have received a copy of the GNU General Public License
27 * along with this program; if not, write to the Free Software
28 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
29 */
30
31#include <linux/kernel.h>
32#include <linux/acpi.h>
33#include <linux/cper.h>
34#include <acpi/apei.h>
35#include <asm/mce.h>
36
37#include "mce-internal.h"
38
39void apei_mce_report_mem_error(int corrected, struct cper_sec_mem_err *mem_err)
40{
41 struct mce m;
42
43 /* Only corrected MC is reported */
44 if (!corrected)
45 return;
46
47 mce_setup(&m);
48 m.bank = 1;
49 /* Fake a memory read corrected error with unknown channel */
50 m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | 0x9f;
51 m.addr = mem_err->physical_addr;
52 mce_log(&m);
53 mce_notify_irq();
54}
55EXPORT_SYMBOL_GPL(apei_mce_report_mem_error);
56
57#define CPER_CREATOR_MCE \
58 UUID_LE(0x75a574e3, 0x5052, 0x4b29, 0x8a, 0x8e, 0xbe, 0x2c, \
59 0x64, 0x90, 0xb8, 0x9d)
60#define CPER_SECTION_TYPE_MCE \
61 UUID_LE(0xfe08ffbe, 0x95e4, 0x4be7, 0xbc, 0x73, 0x40, 0x96, \
62 0x04, 0x4a, 0x38, 0xfc)
63
64/*
65 * CPER specification (in UEFI specification 2.3 appendix N) requires
66 * byte-packed.
67 */
68struct cper_mce_record {
69 struct cper_record_header hdr;
70 struct cper_section_descriptor sec_hdr;
71 struct mce mce;
72} __packed;
73
74int apei_write_mce(struct mce *m)
75{
76 struct cper_mce_record rcd;
77
78 memset(&rcd, 0, sizeof(rcd));
79 memcpy(rcd.hdr.signature, CPER_SIG_RECORD, CPER_SIG_SIZE);
80 rcd.hdr.revision = CPER_RECORD_REV;
81 rcd.hdr.signature_end = CPER_SIG_END;
82 rcd.hdr.section_count = 1;
83 rcd.hdr.error_severity = CPER_SEV_FATAL;
84 /* timestamp, platform_id, partition_id are all invalid */
85 rcd.hdr.validation_bits = 0;
86 rcd.hdr.record_length = sizeof(rcd);
87 rcd.hdr.creator_id = CPER_CREATOR_MCE;
88 rcd.hdr.notification_type = CPER_NOTIFY_MCE;
89 rcd.hdr.record_id = cper_next_record_id();
90 rcd.hdr.flags = CPER_HW_ERROR_FLAGS_PREVERR;
91
92 rcd.sec_hdr.section_offset = (void *)&rcd.mce - (void *)&rcd;
93 rcd.sec_hdr.section_length = sizeof(rcd.mce);
94 rcd.sec_hdr.revision = CPER_SEC_REV;
95 /* fru_id and fru_text is invalid */
96 rcd.sec_hdr.validation_bits = 0;
97 rcd.sec_hdr.flags = CPER_SEC_PRIMARY;
98 rcd.sec_hdr.section_type = CPER_SECTION_TYPE_MCE;
99 rcd.sec_hdr.section_severity = CPER_SEV_FATAL;
100
101 memcpy(&rcd.mce, m, sizeof(*m));
102
103 return erst_write(&rcd.hdr);
104}
105
106ssize_t apei_read_mce(struct mce *m, u64 *record_id)
107{
108 struct cper_mce_record rcd;
109 ssize_t len;
110
111 len = erst_read_next(&rcd.hdr, sizeof(rcd));
112 if (len <= 0)
113 return len;
114 /* Can not skip other records in storage via ERST unless clear them */
115 else if (len != sizeof(rcd) ||
116 uuid_le_cmp(rcd.hdr.creator_id, CPER_CREATOR_MCE)) {
117 if (printk_ratelimit())
118 pr_warning(
119 "MCE-APEI: Can not skip the unknown record in ERST");
120 return -EIO;
121 }
122
123 memcpy(m, &rcd.mce, sizeof(*m));
124 *record_id = rcd.hdr.record_id;
125
126 return sizeof(*m);
127}
128
129/* Check whether there is record in ERST */
130int apei_check_mce(void)
131{
132 return erst_get_record_count();
133}
134
135int apei_clear_mce(u64 record_id)
136{
137 return erst_clear(record_id);
138}
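mce-apei.c fakes a corrected machine check so that GHES-reported memory errors can flow through /dev/mcelog: the status word is assembled from architectural MCi_STATUS flag bits plus the compound error code 0x9f, which the in-code comment glosses as a memory read with unknown channel. The sketch below merely composes and decomposes that constant; the bit positions used (VAL=63, EN=60, ADDRV=58) follow the architectural MCA status layout and are an assumption of this sketch, not something spelled out in the patch.

/*
 * Illustrative only: how the MCi_STATUS value built by
 * apei_mce_report_mem_error() above decomposes. Bit positions are the
 * architectural MCA status flags (assumed here): VAL=63, EN=60, ADDRV=58.
 * The low 16 bits hold the compound error code 0x9f from the hunk.
 */
#include <stdint.h>
#include <stdio.h>

#define MCI_STATUS_VAL   (1ULL << 63)
#define MCI_STATUS_EN    (1ULL << 60)
#define MCI_STATUS_ADDRV (1ULL << 58)

int main(void)
{
	uint64_t status = MCI_STATUS_VAL | MCI_STATUS_EN |
			  MCI_STATUS_ADDRV | 0x9f;

	printf("faked MCi_STATUS = %#018llx\n", (unsigned long long)status);
	printf("  valid=%d enabled=%d addr_valid=%d mcacod=%#llx\n",
	       !!(status & MCI_STATUS_VAL),
	       !!(status & MCI_STATUS_EN),
	       !!(status & MCI_STATUS_ADDRV),
	       (unsigned long long)(status & 0xffff));
	return 0;
}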
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
index 73734baa50f2..e7dbde7bfedb 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-inject.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -22,6 +22,7 @@
22#include <linux/kdebug.h> 22#include <linux/kdebug.h>
23#include <linux/cpu.h> 23#include <linux/cpu.h>
24#include <linux/sched.h> 24#include <linux/sched.h>
25#include <linux/gfp.h>
25#include <asm/mce.h> 26#include <asm/mce.h>
26#include <asm/apic.h> 27#include <asm/apic.h>
27 28
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index 32996f9fab67..fefcc69ee8b5 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -28,3 +28,26 @@ extern int mce_ser;
28 28
29extern struct mce_bank *mce_banks; 29extern struct mce_bank *mce_banks;
30 30
31#ifdef CONFIG_ACPI_APEI
32int apei_write_mce(struct mce *m);
33ssize_t apei_read_mce(struct mce *m, u64 *record_id);
34int apei_check_mce(void);
35int apei_clear_mce(u64 record_id);
36#else
37static inline int apei_write_mce(struct mce *m)
38{
39 return -EINVAL;
40}
41static inline ssize_t apei_read_mce(struct mce *m, u64 *record_id)
42{
43 return 0;
44}
45static inline int apei_check_mce(void)
46{
47 return 0;
48}
49static inline int apei_clear_mce(u64 record_id)
50{
51 return -EINVAL;
52}
53#endif
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index a8aacd4b513c..ed41562909fe 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -26,6 +26,7 @@
26#include <linux/sched.h> 26#include <linux/sched.h>
27#include <linux/sysfs.h> 27#include <linux/sysfs.h>
28#include <linux/types.h> 28#include <linux/types.h>
29#include <linux/slab.h>
29#include <linux/init.h> 30#include <linux/init.h>
30#include <linux/kmod.h> 31#include <linux/kmod.h>
31#include <linux/poll.h> 32#include <linux/poll.h>
@@ -35,6 +36,7 @@
35#include <linux/fs.h> 36#include <linux/fs.h>
36#include <linux/mm.h> 37#include <linux/mm.h>
37#include <linux/debugfs.h> 38#include <linux/debugfs.h>
39#include <linux/edac_mce.h>
38 40
39#include <asm/processor.h> 41#include <asm/processor.h>
40#include <asm/hw_irq.h> 42#include <asm/hw_irq.h>
@@ -46,6 +48,13 @@
46 48
47#include "mce-internal.h" 49#include "mce-internal.h"
48 50
51static DEFINE_MUTEX(mce_read_mutex);
52
53#define rcu_dereference_check_mce(p) \
54 rcu_dereference_index_check((p), \
55 rcu_read_lock_sched_held() || \
56 lockdep_is_held(&mce_read_mutex))
57
49#define CREATE_TRACE_POINTS 58#define CREATE_TRACE_POINTS
50#include <trace/events/mce.h> 59#include <trace/events/mce.h>
51 60
@@ -98,8 +107,8 @@ EXPORT_SYMBOL_GPL(x86_mce_decoder_chain);
98static int default_decode_mce(struct notifier_block *nb, unsigned long val, 107static int default_decode_mce(struct notifier_block *nb, unsigned long val,
99 void *data) 108 void *data)
100{ 109{
101 pr_emerg("No human readable MCE decoding support on this CPU type.\n"); 110 pr_emerg(HW_ERR "No human readable MCE decoding support on this CPU type.\n");
102 pr_emerg("Run the message through 'mcelog --ascii' to decode.\n"); 111 pr_emerg(HW_ERR "Run the message through 'mcelog --ascii' to decode.\n");
103 112
104 return NOTIFY_STOP; 113 return NOTIFY_STOP;
105} 114}
@@ -158,9 +167,18 @@ void mce_log(struct mce *mce)
158 mce->finished = 0; 167 mce->finished = 0;
159 wmb(); 168 wmb();
160 for (;;) { 169 for (;;) {
161 entry = rcu_dereference(mcelog.next); 170 entry = rcu_dereference_check_mce(mcelog.next);
162 for (;;) { 171 for (;;) {
163 /* 172 /*
173 * If edac_mce is enabled, it will check the error type
174 * and will process it, if it is a known error.
175 * Otherwise, the error will be sent through mcelog
176 * interface
177 */
178 if (edac_mce_parse(mce))
179 return;
180
181 /*
164 * When the buffer fills up discard new entries. 182 * When the buffer fills up discard new entries.
165 * Assume that the earlier errors are the more 183 * Assume that the earlier errors are the more
166 * interesting ones: 184 * interesting ones:
@@ -193,11 +211,11 @@ void mce_log(struct mce *mce)
193 211
194static void print_mce(struct mce *m) 212static void print_mce(struct mce *m)
195{ 213{
196 pr_emerg("CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", 214 pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n",
197 m->extcpu, m->mcgstatus, m->bank, m->status); 215 m->extcpu, m->mcgstatus, m->bank, m->status);
198 216
199 if (m->ip) { 217 if (m->ip) {
200 pr_emerg("RIP%s %02x:<%016Lx> ", 218 pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
201 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", 219 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
202 m->cs, m->ip); 220 m->cs, m->ip);
203 221
@@ -206,14 +224,14 @@ static void print_mce(struct mce *m)
206 pr_cont("\n"); 224 pr_cont("\n");
207 } 225 }
208 226
209 pr_emerg("TSC %llx ", m->tsc); 227 pr_emerg(HW_ERR "TSC %llx ", m->tsc);
210 if (m->addr) 228 if (m->addr)
211 pr_cont("ADDR %llx ", m->addr); 229 pr_cont("ADDR %llx ", m->addr);
212 if (m->misc) 230 if (m->misc)
213 pr_cont("MISC %llx ", m->misc); 231 pr_cont("MISC %llx ", m->misc);
214 232
215 pr_cont("\n"); 233 pr_cont("\n");
216 pr_emerg("PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", 234 pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
217 m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid); 235 m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid);
218 236
219 /* 237 /*
@@ -223,16 +241,6 @@ static void print_mce(struct mce *m)
223 atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); 241 atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
224} 242}
225 243
226static void print_mce_head(void)
227{
228 pr_emerg("\nHARDWARE ERROR\n");
229}
230
231static void print_mce_tail(void)
232{
233 pr_emerg("This is not a software problem!\n");
234}
235
236#define PANIC_TIMEOUT 5 /* 5 seconds */ 244#define PANIC_TIMEOUT 5 /* 5 seconds */
237 245
238static atomic_t mce_paniced; 246static atomic_t mce_paniced;
@@ -256,7 +264,7 @@ static void wait_for_panic(void)
256 264
257static void mce_panic(char *msg, struct mce *final, char *exp) 265static void mce_panic(char *msg, struct mce *final, char *exp)
258{ 266{
259 int i; 267 int i, apei_err = 0;
260 268
261 if (!fake_panic) { 269 if (!fake_panic) {
262 /* 270 /*
@@ -273,14 +281,16 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
273 if (atomic_inc_return(&mce_fake_paniced) > 1) 281 if (atomic_inc_return(&mce_fake_paniced) > 1)
274 return; 282 return;
275 } 283 }
276 print_mce_head();
277 /* First print corrected ones that are still unlogged */ 284 /* First print corrected ones that are still unlogged */
278 for (i = 0; i < MCE_LOG_LEN; i++) { 285 for (i = 0; i < MCE_LOG_LEN; i++) {
279 struct mce *m = &mcelog.entry[i]; 286 struct mce *m = &mcelog.entry[i];
280 if (!(m->status & MCI_STATUS_VAL)) 287 if (!(m->status & MCI_STATUS_VAL))
281 continue; 288 continue;
282 if (!(m->status & MCI_STATUS_UC)) 289 if (!(m->status & MCI_STATUS_UC)) {
283 print_mce(m); 290 print_mce(m);
291 if (!apei_err)
292 apei_err = apei_write_mce(m);
293 }
284 } 294 }
285 /* Now print uncorrected but with the final one last */ 295 /* Now print uncorrected but with the final one last */
286 for (i = 0; i < MCE_LOG_LEN; i++) { 296 for (i = 0; i < MCE_LOG_LEN; i++) {
@@ -289,22 +299,27 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
289 continue; 299 continue;
290 if (!(m->status & MCI_STATUS_UC)) 300 if (!(m->status & MCI_STATUS_UC))
291 continue; 301 continue;
292 if (!final || memcmp(m, final, sizeof(struct mce))) 302 if (!final || memcmp(m, final, sizeof(struct mce))) {
293 print_mce(m); 303 print_mce(m);
304 if (!apei_err)
305 apei_err = apei_write_mce(m);
306 }
294 } 307 }
295 if (final) 308 if (final) {
296 print_mce(final); 309 print_mce(final);
310 if (!apei_err)
311 apei_err = apei_write_mce(final);
312 }
297 if (cpu_missing) 313 if (cpu_missing)
298 printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n"); 314 pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
299 print_mce_tail();
300 if (exp) 315 if (exp)
301 printk(KERN_EMERG "Machine check: %s\n", exp); 316 pr_emerg(HW_ERR "Machine check: %s\n", exp);
302 if (!fake_panic) { 317 if (!fake_panic) {
303 if (panic_timeout == 0) 318 if (panic_timeout == 0)
304 panic_timeout = mce_panic_timeout; 319 panic_timeout = mce_panic_timeout;
305 panic(msg); 320 panic(msg);
306 } else 321 } else
307 printk(KERN_EMERG "Fake kernel panic: %s\n", msg); 322 pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
308} 323}
309 324
310/* Support code for software error injection */ 325/* Support code for software error injection */
@@ -531,7 +546,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
531 struct mce m; 546 struct mce m;
532 int i; 547 int i;
533 548
534 __get_cpu_var(mce_poll_count)++; 549 percpu_inc(mce_poll_count);
535 550
536 mce_setup(&m); 551 mce_setup(&m);
537 552
@@ -573,6 +588,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
573 */ 588 */
574 if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) { 589 if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) {
575 mce_log(&m); 590 mce_log(&m);
591 atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, &m);
576 add_taint(TAINT_MACHINE_CHECK); 592 add_taint(TAINT_MACHINE_CHECK);
577 } 593 }
578 594
@@ -926,7 +942,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
926 942
927 atomic_inc(&mce_entry); 943 atomic_inc(&mce_entry);
928 944
929 __get_cpu_var(mce_exception_count)++; 945 percpu_inc(mce_exception_count);
930 946
931 if (notify_die(DIE_NMI, "machine check", regs, error_code, 947 if (notify_die(DIE_NMI, "machine check", regs, error_code,
932 18, SIGKILL) == NOTIFY_STOP) 948 18, SIGKILL) == NOTIFY_STOP)
@@ -1193,7 +1209,7 @@ int mce_notify_irq(void)
1193 schedule_work(&mce_trigger_work); 1209 schedule_work(&mce_trigger_work);
1194 1210
1195 if (__ratelimit(&ratelimit)) 1211 if (__ratelimit(&ratelimit))
1196 printk(KERN_INFO "Machine check events logged\n"); 1212 pr_info(HW_ERR "Machine check events logged\n");
1197 1213
1198 return 1; 1214 return 1;
1199 } 1215 }
@@ -1485,7 +1501,42 @@ static void collect_tscs(void *data)
1485 rdtscll(cpu_tsc[smp_processor_id()]); 1501 rdtscll(cpu_tsc[smp_processor_id()]);
1486} 1502}
1487 1503
1488static DEFINE_MUTEX(mce_read_mutex); 1504static int mce_apei_read_done;
1505
1506/* Collect MCE record of previous boot in persistent storage via APEI ERST. */
1507static int __mce_read_apei(char __user **ubuf, size_t usize)
1508{
1509 int rc;
1510 u64 record_id;
1511 struct mce m;
1512
1513 if (usize < sizeof(struct mce))
1514 return -EINVAL;
1515
1516 rc = apei_read_mce(&m, &record_id);
 1517 /* Error or no more MCE records */
1518 if (rc <= 0) {
1519 mce_apei_read_done = 1;
1520 return rc;
1521 }
1522 rc = -EFAULT;
1523 if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
1524 return rc;
1525 /*
 1526 * Ideally the record would be cleared only after it has
 1527 * been flushed to disk or sent over the network by
 1528 * /sbin/mcelog, but there is no interface for that yet,
 1529 * so just clear it now to avoid duplication.
1530 */
1531 rc = apei_clear_mce(record_id);
1532 if (rc) {
1533 mce_apei_read_done = 1;
1534 return rc;
1535 }
1536 *ubuf += sizeof(struct mce);
1537
1538 return 0;
1539}
1489 1540
1490static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, 1541static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
1491 loff_t *off) 1542 loff_t *off)
@@ -1500,16 +1551,20 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
1500 return -ENOMEM; 1551 return -ENOMEM;
1501 1552
1502 mutex_lock(&mce_read_mutex); 1553 mutex_lock(&mce_read_mutex);
1503 next = rcu_dereference(mcelog.next);
1504
1505 /* Only supports full reads right now */
1506 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
1507 mutex_unlock(&mce_read_mutex);
1508 kfree(cpu_tsc);
1509 1554
1510 return -EINVAL; 1555 if (!mce_apei_read_done) {
1556 err = __mce_read_apei(&buf, usize);
1557 if (err || buf != ubuf)
1558 goto out;
1511 } 1559 }
1512 1560
1561 next = rcu_dereference_check_mce(mcelog.next);
1562
1563 /* Only supports full reads right now */
1564 err = -EINVAL;
1565 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
1566 goto out;
1567
1513 err = 0; 1568 err = 0;
1514 prev = 0; 1569 prev = 0;
1515 do { 1570 do {
@@ -1556,16 +1611,23 @@ timeout:
1556 memset(&mcelog.entry[i], 0, sizeof(struct mce)); 1611 memset(&mcelog.entry[i], 0, sizeof(struct mce));
1557 } 1612 }
1558 } 1613 }
1614
1615 if (err)
1616 err = -EFAULT;
1617
1618out:
1559 mutex_unlock(&mce_read_mutex); 1619 mutex_unlock(&mce_read_mutex);
1560 kfree(cpu_tsc); 1620 kfree(cpu_tsc);
1561 1621
1562 return err ? -EFAULT : buf - ubuf; 1622 return err ? err : buf - ubuf;
1563} 1623}
1564 1624
1565static unsigned int mce_poll(struct file *file, poll_table *wait) 1625static unsigned int mce_poll(struct file *file, poll_table *wait)
1566{ 1626{
1567 poll_wait(file, &mce_wait, wait); 1627 poll_wait(file, &mce_wait, wait);
1568 if (rcu_dereference(mcelog.next)) 1628 if (rcu_dereference_check_mce(mcelog.next))
1629 return POLLIN | POLLRDNORM;
1630 if (!mce_apei_read_done && apei_check_mce())
1569 return POLLIN | POLLRDNORM; 1631 return POLLIN | POLLRDNORM;
1570 return 0; 1632 return 0;
1571} 1633}
@@ -2044,6 +2106,7 @@ static __init void mce_init_banks(void)
2044 struct mce_bank *b = &mce_banks[i]; 2106 struct mce_bank *b = &mce_banks[i];
2045 struct sysdev_attribute *a = &b->attr; 2107 struct sysdev_attribute *a = &b->attr;
2046 2108
2109 sysfs_attr_init(&a->attr);
2047 a->attr.name = b->attrname; 2110 a->attr.name = b->attrname;
2048 snprintf(b->attrname, ATTR_LEN, "bank%d", i); 2111 snprintf(b->attrname, ATTR_LEN, "bank%d", i);
2049 2112
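The mce_read() rework above splits the device read into two phases: records preserved across the previous boot in APEI/ERST are returned first, one per read() and never mixed with live records, and only once that source reports it is exhausted does the regular in-memory mcelog buffer get copied out. A stand-alone model of that control flow might look as follows; the record contents, helper names and the two tiny arrays are invented for illustration and this is not the kernel code.

/* mce_read_order.c - model of the "previous-boot records first" read order. */
#include <stdio.h>

struct record { char text[32]; };

/* Stand-ins for the APEI/ERST backlog and the in-memory mcelog ring. */
static struct record persistent[] = { {"boot-1: bank 4 UC"}, {"boot-1: bank 2 CE"} };
static struct record ring[]       = { {"bank 0 CE"}, {"bank 5 CE"} };
static size_t persistent_next, ring_next;
static int apei_read_done;

/* Mirrors __mce_read_apei(): 1 = copied one record, 0 = nothing left. */
static int read_one_persistent(struct record *out)
{
	if (persistent_next >= sizeof(persistent) / sizeof(persistent[0]))
		return 0;
	*out = persistent[persistent_next++];
	return 1;
}

static size_t model_read(struct record *buf, size_t max)
{
	size_t n = 0;

	if (!max)
		return 0;

	/* Phase 1: one previous-boot record per call, until the source is empty. */
	if (!apei_read_done) {
		if (read_one_persistent(&buf[0]) > 0)
			return 1;       /* like "buf != ubuf": return early */
		apei_read_done = 1;     /* nothing left: fall through        */
	}

	/* Phase 2: the regular in-memory log. */
	while (n < max && ring_next < sizeof(ring) / sizeof(ring[0]))
		buf[n++] = ring[ring_next++];
	return n;
}

int main(void)
{
	struct record buf[8];
	size_t i, n;
	int call = 0;

	/* Each call models one read() on /dev/mcelog. */
	while ((n = model_read(buf, 8)) > 0) {
		call++;
		for (i = 0; i < n; i++)
			printf("read %d: %s\n", call, buf[i].text);
	}
	return 0;
}

The early return after a successful persistent read mirrors the buf != ubuf test in the patch, so a single read never interleaves previous-boot and current-boot records.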
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 83a3d1f4efca..39aaee5c1ab2 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -21,6 +21,7 @@
21#include <linux/errno.h> 21#include <linux/errno.h>
22#include <linux/sched.h> 22#include <linux/sched.h>
23#include <linux/sysfs.h> 23#include <linux/sysfs.h>
24#include <linux/slab.h>
24#include <linux/init.h> 25#include <linux/init.h>
25#include <linux/cpu.h> 26#include <linux/cpu.h>
26#include <linux/smp.h> 27#include <linux/smp.h>
@@ -140,6 +141,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
140 address = (low & MASK_BLKPTR_LO) >> 21; 141 address = (low & MASK_BLKPTR_LO) >> 21;
141 if (!address) 142 if (!address)
142 break; 143 break;
144
143 address += MCG_XBLK_ADDR; 145 address += MCG_XBLK_ADDR;
144 } else 146 } else
145 ++address; 147 ++address;
@@ -147,12 +149,8 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
147 if (rdmsr_safe(address, &low, &high)) 149 if (rdmsr_safe(address, &low, &high))
148 break; 150 break;
149 151
150 if (!(high & MASK_VALID_HI)) { 152 if (!(high & MASK_VALID_HI))
151 if (block) 153 continue;
152 continue;
153 else
154 break;
155 }
156 154
157 if (!(high & MASK_CNTP_HI) || 155 if (!(high & MASK_CNTP_HI) ||
158 (high & MASK_LOCKED_HI)) 156 (high & MASK_LOCKED_HI))
@@ -388,7 +386,7 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr,
388 return ret; 386 return ret;
389} 387}
390 388
391static struct sysfs_ops threshold_ops = { 389static const struct sysfs_ops threshold_ops = {
392 .show = show, 390 .show = show,
393 .store = store, 391 .store = store,
394}; 392};
@@ -529,7 +527,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
529 err = -ENOMEM; 527 err = -ENOMEM;
530 goto out; 528 goto out;
531 } 529 }
532 if (!alloc_cpumask_var(&b->cpus, GFP_KERNEL)) { 530 if (!zalloc_cpumask_var(&b->cpus, GFP_KERNEL)) {
533 kfree(b); 531 kfree(b);
534 err = -ENOMEM; 532 err = -ENOMEM;
535 goto out; 533 goto out;
@@ -542,7 +540,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
542#ifndef CONFIG_SMP 540#ifndef CONFIG_SMP
543 cpumask_setall(b->cpus); 541 cpumask_setall(b->cpus);
544#else 542#else
545 cpumask_copy(b->cpus, c->llc_shared_map); 543 cpumask_set_cpu(cpu, b->cpus);
546#endif 544#endif
547 545
548 per_cpu(threshold_banks, cpu)[bank] = b; 546 per_cpu(threshold_banks, cpu)[bank] = b;
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index 7c785634af2b..6fcd0936194f 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -5,6 +5,7 @@
5 * Author: Andi Kleen 5 * Author: Andi Kleen
6 */ 6 */
7 7
8#include <linux/gfp.h>
8#include <linux/init.h> 9#include <linux/init.h>
9#include <linux/interrupt.h> 10#include <linux/interrupt.h>
10#include <linux/percpu.h> 11#include <linux/percpu.h>
@@ -94,20 +95,21 @@ static void cmci_discover(int banks, int boot)
94 rdmsrl(MSR_IA32_MCx_CTL2(i), val); 95 rdmsrl(MSR_IA32_MCx_CTL2(i), val);
95 96
96 /* Already owned by someone else? */ 97 /* Already owned by someone else? */
97 if (val & CMCI_EN) { 98 if (val & MCI_CTL2_CMCI_EN) {
98 if (test_and_clear_bit(i, owned) || boot) 99 if (test_and_clear_bit(i, owned) && !boot)
99 print_update("SHD", &hdr, i); 100 print_update("SHD", &hdr, i);
100 __clear_bit(i, __get_cpu_var(mce_poll_banks)); 101 __clear_bit(i, __get_cpu_var(mce_poll_banks));
101 continue; 102 continue;
102 } 103 }
103 104
104 val |= CMCI_EN | CMCI_THRESHOLD; 105 val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
106 val |= MCI_CTL2_CMCI_EN | CMCI_THRESHOLD;
105 wrmsrl(MSR_IA32_MCx_CTL2(i), val); 107 wrmsrl(MSR_IA32_MCx_CTL2(i), val);
106 rdmsrl(MSR_IA32_MCx_CTL2(i), val); 108 rdmsrl(MSR_IA32_MCx_CTL2(i), val);
107 109
108 /* Did the enable bit stick? -- the bank supports CMCI */ 110 /* Did the enable bit stick? -- the bank supports CMCI */
109 if (val & CMCI_EN) { 111 if (val & MCI_CTL2_CMCI_EN) {
110 if (!test_and_set_bit(i, owned) || boot) 112 if (!test_and_set_bit(i, owned) && !boot)
111 print_update("CMCI", &hdr, i); 113 print_update("CMCI", &hdr, i);
112 __clear_bit(i, __get_cpu_var(mce_poll_banks)); 114 __clear_bit(i, __get_cpu_var(mce_poll_banks));
113 } else { 115 } else {
@@ -154,7 +156,7 @@ void cmci_clear(void)
154 continue; 156 continue;
155 /* Disable CMCI */ 157 /* Disable CMCI */
156 rdmsrl(MSR_IA32_MCx_CTL2(i), val); 158 rdmsrl(MSR_IA32_MCx_CTL2(i), val);
157 val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK); 159 val &= ~(MCI_CTL2_CMCI_EN|MCI_CTL2_CMCI_THRESHOLD_MASK);
158 wrmsrl(MSR_IA32_MCx_CTL2(i), val); 160 wrmsrl(MSR_IA32_MCx_CTL2(i), val);
159 __clear_bit(i, __get_cpu_var(mce_banks_owned)); 161 __clear_bit(i, __get_cpu_var(mce_banks_owned));
160 } 162 }
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 81c499eceb21..169d8804a9f8 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -34,15 +34,25 @@
34/* How long to wait between reporting thermal events */ 34/* How long to wait between reporting thermal events */
35#define CHECK_INTERVAL (300 * HZ) 35#define CHECK_INTERVAL (300 * HZ)
36 36
37#define THERMAL_THROTTLING_EVENT 0
38#define POWER_LIMIT_EVENT 1
39
37/* 40/*
38 * Current thermal throttling state: 41 * Current thermal event state:
39 */ 42 */
40struct thermal_state { 43struct _thermal_state {
41 bool is_throttled; 44 bool new_event;
42 45 int event;
43 u64 next_check; 46 u64 next_check;
44 unsigned long throttle_count; 47 unsigned long count;
45 unsigned long last_throttle_count; 48 unsigned long last_count;
49};
50
51struct thermal_state {
52 struct _thermal_state core_throttle;
53 struct _thermal_state core_power_limit;
54 struct _thermal_state package_throttle;
55 struct _thermal_state package_power_limit;
46}; 56};
47 57
48static DEFINE_PER_CPU(struct thermal_state, thermal_state); 58static DEFINE_PER_CPU(struct thermal_state, thermal_state);
@@ -53,11 +63,13 @@ static u32 lvtthmr_init __read_mostly;
53 63
54#ifdef CONFIG_SYSFS 64#ifdef CONFIG_SYSFS
55#define define_therm_throt_sysdev_one_ro(_name) \ 65#define define_therm_throt_sysdev_one_ro(_name) \
56 static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL) 66 static SYSDEV_ATTR(_name, 0444, \
67 therm_throt_sysdev_show_##_name, \
68 NULL) \
57 69
58#define define_therm_throt_sysdev_show_func(name) \ 70#define define_therm_throt_sysdev_show_func(event, name) \
59 \ 71 \
60static ssize_t therm_throt_sysdev_show_##name( \ 72static ssize_t therm_throt_sysdev_show_##event##_##name( \
61 struct sys_device *dev, \ 73 struct sys_device *dev, \
62 struct sysdev_attribute *attr, \ 74 struct sysdev_attribute *attr, \
63 char *buf) \ 75 char *buf) \
@@ -66,30 +78,42 @@ static ssize_t therm_throt_sysdev_show_##name( \
66 ssize_t ret; \ 78 ssize_t ret; \
67 \ 79 \
68 preempt_disable(); /* CPU hotplug */ \ 80 preempt_disable(); /* CPU hotplug */ \
69 if (cpu_online(cpu)) \ 81 if (cpu_online(cpu)) { \
70 ret = sprintf(buf, "%lu\n", \ 82 ret = sprintf(buf, "%lu\n", \
71 per_cpu(thermal_state, cpu).name); \ 83 per_cpu(thermal_state, cpu).event.name); \
72 else \ 84 } else \
73 ret = 0; \ 85 ret = 0; \
74 preempt_enable(); \ 86 preempt_enable(); \
75 \ 87 \
76 return ret; \ 88 return ret; \
77} 89}
78 90
79define_therm_throt_sysdev_show_func(throttle_count); 91define_therm_throt_sysdev_show_func(core_throttle, count);
80define_therm_throt_sysdev_one_ro(throttle_count); 92define_therm_throt_sysdev_one_ro(core_throttle_count);
93
94define_therm_throt_sysdev_show_func(core_power_limit, count);
95define_therm_throt_sysdev_one_ro(core_power_limit_count);
96
97define_therm_throt_sysdev_show_func(package_throttle, count);
98define_therm_throt_sysdev_one_ro(package_throttle_count);
99
100define_therm_throt_sysdev_show_func(package_power_limit, count);
101define_therm_throt_sysdev_one_ro(package_power_limit_count);
81 102
82static struct attribute *thermal_throttle_attrs[] = { 103static struct attribute *thermal_throttle_attrs[] = {
83 &attr_throttle_count.attr, 104 &attr_core_throttle_count.attr,
84 NULL 105 NULL
85}; 106};
86 107
87static struct attribute_group thermal_throttle_attr_group = { 108static struct attribute_group thermal_attr_group = {
88 .attrs = thermal_throttle_attrs, 109 .attrs = thermal_throttle_attrs,
89 .name = "thermal_throttle" 110 .name = "thermal_throttle"
90}; 111};
91#endif /* CONFIG_SYSFS */ 112#endif /* CONFIG_SYSFS */
92 113
114#define CORE_LEVEL 0
115#define PACKAGE_LEVEL 1
116
93/*** 117/***
94 * therm_throt_process - Process thermal throttling event from interrupt 118 * therm_throt_process - Process thermal throttling event from interrupt
95 * @curr: Whether the condition is current or not (boolean), since the 119 * @curr: Whether the condition is current or not (boolean), since the
@@ -106,39 +130,70 @@ static struct attribute_group thermal_throttle_attr_group = {
106 * 1 : Event should be logged further, and a message has been 130 * 1 : Event should be logged further, and a message has been
107 * printed to the syslog. 131 * printed to the syslog.
108 */ 132 */
109static int therm_throt_process(bool is_throttled) 133static int therm_throt_process(bool new_event, int event, int level)
110{ 134{
111 struct thermal_state *state; 135 struct _thermal_state *state;
112 unsigned int this_cpu; 136 unsigned int this_cpu = smp_processor_id();
113 bool was_throttled; 137 bool old_event;
114 u64 now; 138 u64 now;
139 struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu);
115 140
116 this_cpu = smp_processor_id();
117 now = get_jiffies_64(); 141 now = get_jiffies_64();
118 state = &per_cpu(thermal_state, this_cpu); 142 if (level == CORE_LEVEL) {
143 if (event == THERMAL_THROTTLING_EVENT)
144 state = &pstate->core_throttle;
145 else if (event == POWER_LIMIT_EVENT)
146 state = &pstate->core_power_limit;
147 else
148 return 0;
149 } else if (level == PACKAGE_LEVEL) {
150 if (event == THERMAL_THROTTLING_EVENT)
151 state = &pstate->package_throttle;
152 else if (event == POWER_LIMIT_EVENT)
153 state = &pstate->package_power_limit;
154 else
155 return 0;
156 } else
157 return 0;
119 158
120 was_throttled = state->is_throttled; 159 old_event = state->new_event;
121 state->is_throttled = is_throttled; 160 state->new_event = new_event;
122 161
123 if (is_throttled) 162 if (new_event)
124 state->throttle_count++; 163 state->count++;
125 164
126 if (time_before64(now, state->next_check) && 165 if (time_before64(now, state->next_check) &&
127 state->throttle_count != state->last_throttle_count) 166 state->count != state->last_count)
128 return 0; 167 return 0;
129 168
130 state->next_check = now + CHECK_INTERVAL; 169 state->next_check = now + CHECK_INTERVAL;
131 state->last_throttle_count = state->throttle_count; 170 state->last_count = state->count;
132 171
133 /* if we just entered the thermal event */ 172 /* if we just entered the thermal event */
134 if (is_throttled) { 173 if (new_event) {
135 printk(KERN_CRIT "CPU%d: Temperature above threshold, cpu clock throttled (total events = %lu)\n", this_cpu, state->throttle_count); 174 if (event == THERMAL_THROTTLING_EVENT)
175 printk(KERN_CRIT "CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n",
176 this_cpu,
177 level == CORE_LEVEL ? "Core" : "Package",
178 state->count);
179 else
180 printk(KERN_CRIT "CPU%d: %s power limit notification (total events = %lu)\n",
181 this_cpu,
182 level == CORE_LEVEL ? "Core" : "Package",
183 state->count);
136 184
137 add_taint(TAINT_MACHINE_CHECK); 185 add_taint(TAINT_MACHINE_CHECK);
138 return 1; 186 return 1;
139 } 187 }
140 if (was_throttled) { 188 if (old_event) {
141 printk(KERN_INFO "CPU%d: Temperature/speed normal\n", this_cpu); 189 if (event == THERMAL_THROTTLING_EVENT)
190 printk(KERN_INFO "CPU%d: %s temperature/speed normal\n",
191 this_cpu,
192 level == CORE_LEVEL ? "Core" : "Package");
193 else
194 printk(KERN_INFO "CPU%d: %s power limit normal\n",
195 this_cpu,
196 level == CORE_LEVEL ? "Core" : "Package");
142 return 1; 197 return 1;
143 } 198 }
144 199
@@ -147,15 +202,36 @@ static int therm_throt_process(bool is_throttled)
147 202
148#ifdef CONFIG_SYSFS 203#ifdef CONFIG_SYSFS
149/* Add/Remove thermal_throttle interface for CPU device: */ 204/* Add/Remove thermal_throttle interface for CPU device: */
150static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev) 205static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev,
206 unsigned int cpu)
151{ 207{
152 return sysfs_create_group(&sys_dev->kobj, 208 int err;
153 &thermal_throttle_attr_group); 209 struct cpuinfo_x86 *c = &cpu_data(cpu);
210
211 err = sysfs_create_group(&sys_dev->kobj, &thermal_attr_group);
212 if (err)
213 return err;
214
215 if (cpu_has(c, X86_FEATURE_PLN))
216 err = sysfs_add_file_to_group(&sys_dev->kobj,
217 &attr_core_power_limit_count.attr,
218 thermal_attr_group.name);
219 if (cpu_has(c, X86_FEATURE_PTS)) {
220 err = sysfs_add_file_to_group(&sys_dev->kobj,
221 &attr_package_throttle_count.attr,
222 thermal_attr_group.name);
223 if (cpu_has(c, X86_FEATURE_PLN))
224 err = sysfs_add_file_to_group(&sys_dev->kobj,
225 &attr_package_power_limit_count.attr,
226 thermal_attr_group.name);
227 }
228
229 return err;
154} 230}
155 231
156static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev) 232static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev)
157{ 233{
158 sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group); 234 sysfs_remove_group(&sys_dev->kobj, &thermal_attr_group);
159} 235}
160 236
161/* Mutex protecting device creation against CPU hotplug: */ 237/* Mutex protecting device creation against CPU hotplug: */
@@ -177,7 +253,7 @@ thermal_throttle_cpu_callback(struct notifier_block *nfb,
177 case CPU_UP_PREPARE: 253 case CPU_UP_PREPARE:
178 case CPU_UP_PREPARE_FROZEN: 254 case CPU_UP_PREPARE_FROZEN:
179 mutex_lock(&therm_cpu_lock); 255 mutex_lock(&therm_cpu_lock);
180 err = thermal_throttle_add_dev(sys_dev); 256 err = thermal_throttle_add_dev(sys_dev, cpu);
181 mutex_unlock(&therm_cpu_lock); 257 mutex_unlock(&therm_cpu_lock);
182 WARN_ON(err); 258 WARN_ON(err);
183 break; 259 break;
@@ -190,7 +266,7 @@ thermal_throttle_cpu_callback(struct notifier_block *nfb,
190 mutex_unlock(&therm_cpu_lock); 266 mutex_unlock(&therm_cpu_lock);
191 break; 267 break;
192 } 268 }
193 return err ? NOTIFY_BAD : NOTIFY_OK; 269 return notifier_from_errno(err);
194} 270}
195 271
196static struct notifier_block thermal_throttle_cpu_notifier __cpuinitdata = 272static struct notifier_block thermal_throttle_cpu_notifier __cpuinitdata =
@@ -213,7 +289,7 @@ static __init int thermal_throttle_init_device(void)
213#endif 289#endif
214 /* connect live CPUs to sysfs */ 290 /* connect live CPUs to sysfs */
215 for_each_online_cpu(cpu) { 291 for_each_online_cpu(cpu) {
216 err = thermal_throttle_add_dev(get_cpu_sysdev(cpu)); 292 err = thermal_throttle_add_dev(get_cpu_sysdev(cpu), cpu);
217 WARN_ON(err); 293 WARN_ON(err);
218 } 294 }
219#ifdef CONFIG_HOTPLUG_CPU 295#ifdef CONFIG_HOTPLUG_CPU
@@ -226,14 +302,50 @@ device_initcall(thermal_throttle_init_device);
226 302
227#endif /* CONFIG_SYSFS */ 303#endif /* CONFIG_SYSFS */
228 304
305/*
 306 * Set the two most significant bits to tell the MCE log which thermal
 307 * event type this is.
 308 * This is a temporary solution and may change in the future along with
 309 * the mce log infrastructure.
310 */
311#define CORE_THROTTLED (0)
312#define CORE_POWER_LIMIT ((__u64)1 << 62)
313#define PACKAGE_THROTTLED ((__u64)2 << 62)
314#define PACKAGE_POWER_LIMIT ((__u64)3 << 62)
315
229/* Thermal transition interrupt handler */ 316/* Thermal transition interrupt handler */
230static void intel_thermal_interrupt(void) 317static void intel_thermal_interrupt(void)
231{ 318{
232 __u64 msr_val; 319 __u64 msr_val;
320 struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
233 321
234 rdmsrl(MSR_IA32_THERM_STATUS, msr_val); 322 rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
235 if (therm_throt_process((msr_val & THERM_STATUS_PROCHOT) != 0)) 323
236 mce_log_therm_throt_event(msr_val); 324 if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT,
325 THERMAL_THROTTLING_EVENT,
326 CORE_LEVEL) != 0)
327 mce_log_therm_throt_event(CORE_THROTTLED | msr_val);
328
329 if (cpu_has(c, X86_FEATURE_PLN))
330 if (therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT,
331 POWER_LIMIT_EVENT,
332 CORE_LEVEL) != 0)
333 mce_log_therm_throt_event(CORE_POWER_LIMIT | msr_val);
334
335 if (cpu_has(c, X86_FEATURE_PTS)) {
336 rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val);
337 if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT,
338 THERMAL_THROTTLING_EVENT,
339 PACKAGE_LEVEL) != 0)
340 mce_log_therm_throt_event(PACKAGE_THROTTLED | msr_val);
341 if (cpu_has(c, X86_FEATURE_PLN))
342 if (therm_throt_process(msr_val &
343 PACKAGE_THERM_STATUS_POWER_LIMIT,
344 POWER_LIMIT_EVENT,
345 PACKAGE_LEVEL) != 0)
346 mce_log_therm_throt_event(PACKAGE_POWER_LIMIT
347 | msr_val);
348 }
237} 349}
238 350
239static void unexpected_thermal_interrupt(void) 351static void unexpected_thermal_interrupt(void)
@@ -335,8 +447,26 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
335 apic_write(APIC_LVTTHMR, h); 447 apic_write(APIC_LVTTHMR, h);
336 448
337 rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); 449 rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
338 wrmsr(MSR_IA32_THERM_INTERRUPT, 450 if (cpu_has(c, X86_FEATURE_PLN))
339 l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h); 451 wrmsr(MSR_IA32_THERM_INTERRUPT,
452 l | (THERM_INT_LOW_ENABLE
453 | THERM_INT_HIGH_ENABLE | THERM_INT_PLN_ENABLE), h);
454 else
455 wrmsr(MSR_IA32_THERM_INTERRUPT,
456 l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h);
457
458 if (cpu_has(c, X86_FEATURE_PTS)) {
459 rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
460 if (cpu_has(c, X86_FEATURE_PLN))
461 wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
462 l | (PACKAGE_THERM_INT_LOW_ENABLE
463 | PACKAGE_THERM_INT_HIGH_ENABLE
464 | PACKAGE_THERM_INT_PLN_ENABLE), h);
465 else
466 wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
467 l | (PACKAGE_THERM_INT_LOW_ENABLE
468 | PACKAGE_THERM_INT_HIGH_ENABLE), h);
469 }
340 470
341 smp_thermal_vector = intel_thermal_interrupt; 471 smp_thermal_vector = intel_thermal_interrupt;
342 472
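The comment above the CORE_THROTTLED/CORE_POWER_LIMIT/PACKAGE_* defines notes that the event type is tagged in the two most significant bits of the value handed to mce_log_therm_throt_event(). A hypothetical consumer of those logged values could split the tag from the original THERM_STATUS bits like this; the decoder and the sample values below are not part of the patch.

/* decode_therm_event.c - recover the 2-bit event tag from a logged value. */
#include <stdint.h>
#include <stdio.h>

static const char *const therm_event_name[4] = {
	"core throttled",        /* CORE_THROTTLED      = 0ULL << 62 */
	"core power limit",      /* CORE_POWER_LIMIT    = 1ULL << 62 */
	"package throttled",     /* PACKAGE_THROTTLED   = 2ULL << 62 */
	"package power limit",   /* PACKAGE_POWER_LIMIT = 3ULL << 62 */
};

static void decode(uint64_t logged)
{
	unsigned int tag = logged >> 62;            /* two most significant bits  */
	uint64_t     msr = logged & ~(3ULL << 62);  /* original THERM_STATUS bits */

	printf("%-20s status=%#llx\n", therm_event_name[tag],
	       (unsigned long long)msr);
}

int main(void)
{
	decode(0x88000000ULL);               /* tag 0: core throttled      */
	decode((3ULL << 62) | 0x400ULL);     /* tag 3: package power limit */
	return 0;
}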
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
new file mode 100644
index 000000000000..d944bf6c50e9
--- /dev/null
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -0,0 +1,56 @@
1/*
2 * HyperV Detection code.
3 *
4 * Copyright (C) 2010, Novell, Inc.
5 * Author : K. Y. Srinivasan <ksrinivasan@novell.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; version 2 of the License.
10 *
11 */
12
13#include <linux/types.h>
14#include <linux/module.h>
15#include <asm/processor.h>
16#include <asm/hypervisor.h>
17#include <asm/hyperv.h>
18#include <asm/mshyperv.h>
19
20struct ms_hyperv_info ms_hyperv;
21EXPORT_SYMBOL_GPL(ms_hyperv);
22
23static bool __init ms_hyperv_platform(void)
24{
25 u32 eax;
26 u32 hyp_signature[3];
27
28 if (!boot_cpu_has(X86_FEATURE_HYPERVISOR))
29 return false;
30
31 cpuid(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS,
32 &eax, &hyp_signature[0], &hyp_signature[1], &hyp_signature[2]);
33
34 return eax >= HYPERV_CPUID_MIN &&
35 eax <= HYPERV_CPUID_MAX &&
36 !memcmp("Microsoft Hv", hyp_signature, 12);
37}
38
39static void __init ms_hyperv_init_platform(void)
40{
41 /*
42 * Extract the features and hints
43 */
44 ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES);
45 ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO);
46
47 printk(KERN_INFO "HyperV: features 0x%x, hints 0x%x\n",
48 ms_hyperv.features, ms_hyperv.hints);
49}
50
51const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
52 .name = "Microsoft HyperV",
53 .detect = ms_hyperv_platform,
54 .init_platform = ms_hyperv_init_platform,
55};
56EXPORT_SYMBOL(x86_hyper_ms_hyperv);
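ms_hyperv_platform() keys off two things: the CPUID "running under a hypervisor" bit and the vendor signature returned by leaf 0x40000000. The same check can be reproduced from user space with the <cpuid.h> helpers shipped by GCC and Clang; this is a rough analogue for experimentation, not the kernel detection path, and it assumes an x86 toolchain that provides that header.

/* hv_detect.c - user-space sketch of the Hyper-V signature check. */
#include <cpuid.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx, sig[3];

	/* CPUID.1:ECX[31] is the "running under a hypervisor" bit. */
	__cpuid(1, eax, ebx, ecx, edx);
	if (!(ecx & (1u << 31))) {
		puts("no hypervisor present");
		return 0;
	}

	/* Leaf 0x40000000: EAX = max hypervisor leaf, EBX/ECX/EDX = vendor ID. */
	__cpuid(0x40000000, eax, sig[0], sig[1], sig[2]);

	if (!memcmp(sig, "Microsoft Hv", 12))
		printf("Hyper-V signature found, max leaf %#x\n", eax);
	else
		printf("hypervisor present, but not Hyper-V (%.12s)\n",
		       (const char *)sig);
	return 0;
}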
diff --git a/arch/x86/kernel/cpu/mtrr/Makefile b/arch/x86/kernel/cpu/mtrr/Makefile
index f4361b56f8e9..ad9e5ed81181 100644
--- a/arch/x86/kernel/cpu/mtrr/Makefile
+++ b/arch/x86/kernel/cpu/mtrr/Makefile
@@ -1,3 +1,3 @@
1obj-y := main.o if.o generic.o state.o cleanup.o 1obj-y := main.o if.o generic.o cleanup.o
2obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o 2obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o
3 3
diff --git a/arch/x86/kernel/cpu/mtrr/amd.c b/arch/x86/kernel/cpu/mtrr/amd.c
index 33af14110dfd..92ba9cd31c9a 100644
--- a/arch/x86/kernel/cpu/mtrr/amd.c
+++ b/arch/x86/kernel/cpu/mtrr/amd.c
@@ -108,7 +108,7 @@ amd_validate_add_page(unsigned long base, unsigned long size, unsigned int type)
108 return 0; 108 return 0;
109} 109}
110 110
111static struct mtrr_ops amd_mtrr_ops = { 111static const struct mtrr_ops amd_mtrr_ops = {
112 .vendor = X86_VENDOR_AMD, 112 .vendor = X86_VENDOR_AMD,
113 .set = amd_set_mtrr, 113 .set = amd_set_mtrr,
114 .get = amd_get_mtrr, 114 .get = amd_get_mtrr,
diff --git a/arch/x86/kernel/cpu/mtrr/centaur.c b/arch/x86/kernel/cpu/mtrr/centaur.c
index de89f14eff3a..316fe3e60a97 100644
--- a/arch/x86/kernel/cpu/mtrr/centaur.c
+++ b/arch/x86/kernel/cpu/mtrr/centaur.c
@@ -110,7 +110,7 @@ centaur_validate_add_page(unsigned long base, unsigned long size, unsigned int t
110 return 0; 110 return 0;
111} 111}
112 112
113static struct mtrr_ops centaur_mtrr_ops = { 113static const struct mtrr_ops centaur_mtrr_ops = {
114 .vendor = X86_VENDOR_CENTAUR, 114 .vendor = X86_VENDOR_CENTAUR,
115 .set = centaur_set_mcr, 115 .set = centaur_set_mcr,
116 .get = centaur_get_mcr, 116 .get = centaur_get_mcr,
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index 09b1698e0466..c5f59d071425 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -22,10 +22,10 @@
22#include <linux/pci.h> 22#include <linux/pci.h>
23#include <linux/smp.h> 23#include <linux/smp.h>
24#include <linux/cpu.h> 24#include <linux/cpu.h>
25#include <linux/sort.h>
26#include <linux/mutex.h> 25#include <linux/mutex.h>
27#include <linux/uaccess.h> 26#include <linux/uaccess.h>
28#include <linux/kvm_para.h> 27#include <linux/kvm_para.h>
28#include <linux/range.h>
29 29
30#include <asm/processor.h> 30#include <asm/processor.h>
31#include <asm/e820.h> 31#include <asm/e820.h>
@@ -34,11 +34,6 @@
34 34
35#include "mtrr.h" 35#include "mtrr.h"
36 36
37struct res_range {
38 unsigned long start;
39 unsigned long end;
40};
41
42struct var_mtrr_range_state { 37struct var_mtrr_range_state {
43 unsigned long base_pfn; 38 unsigned long base_pfn;
44 unsigned long size_pfn; 39 unsigned long size_pfn;
@@ -56,7 +51,7 @@ struct var_mtrr_state {
56/* Should be related to MTRR_VAR_RANGES nums */ 51/* Should be related to MTRR_VAR_RANGES nums */
57#define RANGE_NUM 256 52#define RANGE_NUM 256
58 53
59static struct res_range __initdata range[RANGE_NUM]; 54static struct range __initdata range[RANGE_NUM];
60static int __initdata nr_range; 55static int __initdata nr_range;
61 56
62static struct var_mtrr_range_state __initdata range_state[RANGE_NUM]; 57static struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
@@ -64,152 +59,11 @@ static struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
64static int __initdata debug_print; 59static int __initdata debug_print;
65#define Dprintk(x...) do { if (debug_print) printk(KERN_DEBUG x); } while (0) 60#define Dprintk(x...) do { if (debug_print) printk(KERN_DEBUG x); } while (0)
66 61
67
68static int __init
69add_range(struct res_range *range, int nr_range,
70 unsigned long start, unsigned long end)
71{
72 /* Out of slots: */
73 if (nr_range >= RANGE_NUM)
74 return nr_range;
75
76 range[nr_range].start = start;
77 range[nr_range].end = end;
78
79 nr_range++;
80
81 return nr_range;
82}
83
84static int __init
85add_range_with_merge(struct res_range *range, int nr_range,
86 unsigned long start, unsigned long end)
87{
88 int i;
89
90 /* Try to merge it with old one: */
91 for (i = 0; i < nr_range; i++) {
92 unsigned long final_start, final_end;
93 unsigned long common_start, common_end;
94
95 if (!range[i].end)
96 continue;
97
98 common_start = max(range[i].start, start);
99 common_end = min(range[i].end, end);
100 if (common_start > common_end + 1)
101 continue;
102
103 final_start = min(range[i].start, start);
104 final_end = max(range[i].end, end);
105
106 range[i].start = final_start;
107 range[i].end = final_end;
108 return nr_range;
109 }
110
111 /* Need to add it: */
112 return add_range(range, nr_range, start, end);
113}
114
115static void __init
116subtract_range(struct res_range *range, unsigned long start, unsigned long end)
117{
118 int i, j;
119
120 for (j = 0; j < RANGE_NUM; j++) {
121 if (!range[j].end)
122 continue;
123
124 if (start <= range[j].start && end >= range[j].end) {
125 range[j].start = 0;
126 range[j].end = 0;
127 continue;
128 }
129
130 if (start <= range[j].start && end < range[j].end &&
131 range[j].start < end + 1) {
132 range[j].start = end + 1;
133 continue;
134 }
135
136
137 if (start > range[j].start && end >= range[j].end &&
138 range[j].end > start - 1) {
139 range[j].end = start - 1;
140 continue;
141 }
142
143 if (start > range[j].start && end < range[j].end) {
144 /* Find the new spare: */
145 for (i = 0; i < RANGE_NUM; i++) {
146 if (range[i].end == 0)
147 break;
148 }
149 if (i < RANGE_NUM) {
150 range[i].end = range[j].end;
151 range[i].start = end + 1;
152 } else {
153 printk(KERN_ERR "run of slot in ranges\n");
154 }
155 range[j].end = start - 1;
156 continue;
157 }
158 }
159}
160
161static int __init cmp_range(const void *x1, const void *x2)
162{
163 const struct res_range *r1 = x1;
164 const struct res_range *r2 = x2;
165 long start1, start2;
166
167 start1 = r1->start;
168 start2 = r2->start;
169
170 return start1 - start2;
171}
172
173static int __init clean_sort_range(struct res_range *range, int az)
174{
175 int i, j, k = az - 1, nr_range = 0;
176
177 for (i = 0; i < k; i++) {
178 if (range[i].end)
179 continue;
180 for (j = k; j > i; j--) {
181 if (range[j].end) {
182 k = j;
183 break;
184 }
185 }
186 if (j == i)
187 break;
188 range[i].start = range[k].start;
189 range[i].end = range[k].end;
190 range[k].start = 0;
191 range[k].end = 0;
192 k--;
193 }
194 /* count it */
195 for (i = 0; i < az; i++) {
196 if (!range[i].end) {
197 nr_range = i;
198 break;
199 }
200 }
201
202 /* sort them */
203 sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
204
205 return nr_range;
206}
207
208#define BIOS_BUG_MSG KERN_WARNING \ 62#define BIOS_BUG_MSG KERN_WARNING \
209 "WARNING: BIOS bug: VAR MTRR %d contains strange UC entry under 1M, check with your system vendor!\n" 63 "WARNING: BIOS bug: VAR MTRR %d contains strange UC entry under 1M, check with your system vendor!\n"
210 64
211static int __init 65static int __init
212x86_get_mtrr_mem_range(struct res_range *range, int nr_range, 66x86_get_mtrr_mem_range(struct range *range, int nr_range,
213 unsigned long extra_remove_base, 67 unsigned long extra_remove_base,
214 unsigned long extra_remove_size) 68 unsigned long extra_remove_size)
215{ 69{
@@ -223,14 +77,14 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
223 continue; 77 continue;
224 base = range_state[i].base_pfn; 78 base = range_state[i].base_pfn;
225 size = range_state[i].size_pfn; 79 size = range_state[i].size_pfn;
226 nr_range = add_range_with_merge(range, nr_range, base, 80 nr_range = add_range_with_merge(range, RANGE_NUM, nr_range,
227 base + size - 1); 81 base, base + size);
228 } 82 }
229 if (debug_print) { 83 if (debug_print) {
230 printk(KERN_DEBUG "After WB checking\n"); 84 printk(KERN_DEBUG "After WB checking\n");
231 for (i = 0; i < nr_range; i++) 85 for (i = 0; i < nr_range; i++)
232 printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", 86 printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n",
233 range[i].start, range[i].end + 1); 87 range[i].start, range[i].end);
234 } 88 }
235 89
236 /* Take out UC ranges: */ 90 /* Take out UC ranges: */
@@ -252,19 +106,19 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
252 size -= (1<<(20-PAGE_SHIFT)) - base; 106 size -= (1<<(20-PAGE_SHIFT)) - base;
253 base = 1<<(20-PAGE_SHIFT); 107 base = 1<<(20-PAGE_SHIFT);
254 } 108 }
255 subtract_range(range, base, base + size - 1); 109 subtract_range(range, RANGE_NUM, base, base + size);
256 } 110 }
257 if (extra_remove_size) 111 if (extra_remove_size)
258 subtract_range(range, extra_remove_base, 112 subtract_range(range, RANGE_NUM, extra_remove_base,
259 extra_remove_base + extra_remove_size - 1); 113 extra_remove_base + extra_remove_size);
260 114
261 if (debug_print) { 115 if (debug_print) {
262 printk(KERN_DEBUG "After UC checking\n"); 116 printk(KERN_DEBUG "After UC checking\n");
263 for (i = 0; i < RANGE_NUM; i++) { 117 for (i = 0; i < RANGE_NUM; i++) {
264 if (!range[i].end) 118 if (!range[i].end)
265 continue; 119 continue;
266 printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", 120 printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n",
267 range[i].start, range[i].end + 1); 121 range[i].start, range[i].end);
268 } 122 }
269 } 123 }
270 124
@@ -273,26 +127,22 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
273 if (debug_print) { 127 if (debug_print) {
274 printk(KERN_DEBUG "After sorting\n"); 128 printk(KERN_DEBUG "After sorting\n");
275 for (i = 0; i < nr_range; i++) 129 for (i = 0; i < nr_range; i++)
276 printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", 130 printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n",
277 range[i].start, range[i].end + 1); 131 range[i].start, range[i].end);
278 } 132 }
279 133
280 /* clear those is not used */
281 for (i = nr_range; i < RANGE_NUM; i++)
282 memset(&range[i], 0, sizeof(range[i]));
283
284 return nr_range; 134 return nr_range;
285} 135}
286 136
287#ifdef CONFIG_MTRR_SANITIZER 137#ifdef CONFIG_MTRR_SANITIZER
288 138
289static unsigned long __init sum_ranges(struct res_range *range, int nr_range) 139static unsigned long __init sum_ranges(struct range *range, int nr_range)
290{ 140{
291 unsigned long sum = 0; 141 unsigned long sum = 0;
292 int i; 142 int i;
293 143
294 for (i = 0; i < nr_range; i++) 144 for (i = 0; i < nr_range; i++)
295 sum += range[i].end + 1 - range[i].start; 145 sum += range[i].end - range[i].start;
296 146
297 return sum; 147 return sum;
298} 148}
@@ -621,7 +471,7 @@ static int __init parse_mtrr_spare_reg(char *arg)
621early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg); 471early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg);
622 472
623static int __init 473static int __init
624x86_setup_var_mtrrs(struct res_range *range, int nr_range, 474x86_setup_var_mtrrs(struct range *range, int nr_range,
625 u64 chunk_size, u64 gran_size) 475 u64 chunk_size, u64 gran_size)
626{ 476{
627 struct var_mtrr_state var_state; 477 struct var_mtrr_state var_state;
@@ -639,7 +489,7 @@ x86_setup_var_mtrrs(struct res_range *range, int nr_range,
639 /* Write the range: */ 489 /* Write the range: */
640 for (i = 0; i < nr_range; i++) { 490 for (i = 0; i < nr_range; i++) {
641 set_var_mtrr_range(&var_state, range[i].start, 491 set_var_mtrr_range(&var_state, range[i].start,
642 range[i].end - range[i].start + 1); 492 range[i].end - range[i].start);
643 } 493 }
644 494
645 /* Write the last range: */ 495 /* Write the last range: */
@@ -742,7 +592,7 @@ mtrr_calc_range_state(u64 chunk_size, u64 gran_size,
742 unsigned long x_remove_base, 592 unsigned long x_remove_base,
743 unsigned long x_remove_size, int i) 593 unsigned long x_remove_size, int i)
744{ 594{
745 static struct res_range range_new[RANGE_NUM]; 595 static struct range range_new[RANGE_NUM];
746 unsigned long range_sums_new; 596 unsigned long range_sums_new;
747 static int nr_range_new; 597 static int nr_range_new;
748 int num_reg; 598 int num_reg;
@@ -782,9 +632,9 @@ static void __init mtrr_print_out_one_result(int i)
782 unsigned long gran_base, chunk_base, lose_base; 632 unsigned long gran_base, chunk_base, lose_base;
783 char gran_factor, chunk_factor, lose_factor; 633 char gran_factor, chunk_factor, lose_factor;
784 634
785 gran_base = to_size_factor(result[i].gran_sizek, &gran_factor), 635 gran_base = to_size_factor(result[i].gran_sizek, &gran_factor);
786 chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor), 636 chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor);
787 lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor), 637 lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor);
788 638
789 pr_info("%sgran_size: %ld%c \tchunk_size: %ld%c \t", 639 pr_info("%sgran_size: %ld%c \tchunk_size: %ld%c \t",
790 result[i].bad ? "*BAD*" : " ", 640 result[i].bad ? "*BAD*" : " ",
@@ -869,10 +719,10 @@ int __init mtrr_cleanup(unsigned address_bits)
869 * [0, 1M) should always be covered by var mtrr with WB 719 * [0, 1M) should always be covered by var mtrr with WB
870 * and fixed mtrrs should take effect before var mtrr for it: 720 * and fixed mtrrs should take effect before var mtrr for it:
871 */ 721 */
872 nr_range = add_range_with_merge(range, nr_range, 0, 722 nr_range = add_range_with_merge(range, RANGE_NUM, nr_range, 0,
873 (1ULL<<(20 - PAGE_SHIFT)) - 1); 723 1ULL<<(20 - PAGE_SHIFT));
874 /* Sort the ranges: */ 724 /* Sort the ranges: */
875 sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); 725 sort_range(range, nr_range);
876 726
877 range_sums = sum_ranges(range, nr_range); 727 range_sums = sum_ranges(range, nr_range);
878 printk(KERN_INFO "total RAM covered: %ldM\n", 728 printk(KERN_INFO "total RAM covered: %ldM\n",
@@ -1089,9 +939,9 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
1089 nr_range = 0; 939 nr_range = 0;
1090 if (mtrr_tom2) { 940 if (mtrr_tom2) {
1091 range[nr_range].start = (1ULL<<(32 - PAGE_SHIFT)); 941 range[nr_range].start = (1ULL<<(32 - PAGE_SHIFT));
1092 range[nr_range].end = (mtrr_tom2 >> PAGE_SHIFT) - 1; 942 range[nr_range].end = mtrr_tom2 >> PAGE_SHIFT;
1093 if (highest_pfn < range[nr_range].end + 1) 943 if (highest_pfn < range[nr_range].end)
1094 highest_pfn = range[nr_range].end + 1; 944 highest_pfn = range[nr_range].end;
1095 nr_range++; 945 nr_range++;
1096 } 946 }
1097 nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0); 947 nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0);
@@ -1103,15 +953,15 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
1103 953
1104 /* Check the holes: */ 954 /* Check the holes: */
1105 for (i = 0; i < nr_range - 1; i++) { 955 for (i = 0; i < nr_range - 1; i++) {
1106 if (range[i].end + 1 < range[i+1].start) 956 if (range[i].end < range[i+1].start)
1107 total_trim_size += real_trim_memory(range[i].end + 1, 957 total_trim_size += real_trim_memory(range[i].end,
1108 range[i+1].start); 958 range[i+1].start);
1109 } 959 }
1110 960
1111 /* Check the top: */ 961 /* Check the top: */
1112 i = nr_range - 1; 962 i = nr_range - 1;
1113 if (range[i].end + 1 < end_pfn) 963 if (range[i].end < end_pfn)
1114 total_trim_size += real_trim_memory(range[i].end + 1, 964 total_trim_size += real_trim_memory(range[i].end,
1115 end_pfn); 965 end_pfn);
1116 966
1117 if (total_trim_size) { 967 if (total_trim_size) {
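Most of the cleanup.c churn above comes from replacing the private, inclusive-end struct res_range with the generic struct range, which uses half-open [start, end) intervals; that is why the "+ 1"/"- 1" adjustments disappear from sum_ranges(), the hole checks and the PFN printouts. A tiny sketch of the half-open convention, with names local to this example:

/* half_open.c - span and adjacency arithmetic for [start, end) ranges. */
#include <stdio.h>

struct half_open { unsigned long start, end; };       /* [start, end) in PFNs */

static unsigned long span(struct half_open r)
{
	return r.end - r.start;                       /* no "+ 1" needed */
}

static int adjacent(struct half_open a, struct half_open b)
{
	return a.end == b.start;                      /* no "+ 1" needed */
}

int main(void)
{
	/* With 4K pages, [0, 1M) is 256 PFNs; the next range starts at PFN 256. */
	struct half_open low  = { 0, 1UL << (20 - 12) };
	struct half_open next = { 1UL << (20 - 12), 1UL << (32 - 12) };

	printf("low spans %lu pages\n", span(low));
	printf("low and next adjacent? %d\n", adjacent(low, next));
	return 0;
}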
diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c
index 228d982ce09c..68a3343e5798 100644
--- a/arch/x86/kernel/cpu/mtrr/cyrix.c
+++ b/arch/x86/kernel/cpu/mtrr/cyrix.c
@@ -265,7 +265,7 @@ static void cyrix_set_all(void)
265 post_set(); 265 post_set();
266} 266}
267 267
268static struct mtrr_ops cyrix_mtrr_ops = { 268static const struct mtrr_ops cyrix_mtrr_ops = {
269 .vendor = X86_VENDOR_CYRIX, 269 .vendor = X86_VENDOR_CYRIX,
270 .set_all = cyrix_set_all, 270 .set_all = cyrix_set_all,
271 .set = cyrix_set_arr, 271 .set = cyrix_set_arr,
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 55da0c5f68dd..7d28d7d03885 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -6,7 +6,6 @@
6 6
7#include <linux/module.h> 7#include <linux/module.h>
8#include <linux/init.h> 8#include <linux/init.h>
9#include <linux/slab.h>
10#include <linux/io.h> 9#include <linux/io.h>
11#include <linux/mm.h> 10#include <linux/mm.h>
12 11
@@ -434,13 +433,12 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
434{ 433{
435 unsigned int mask_lo, mask_hi, base_lo, base_hi; 434 unsigned int mask_lo, mask_hi, base_lo, base_hi;
436 unsigned int tmp, hi; 435 unsigned int tmp, hi;
437 int cpu;
438 436
439 /* 437 /*
440 * get_mtrr doesn't need to update mtrr_state, also it could be called 438 * get_mtrr doesn't need to update mtrr_state, also it could be called
441 * from any cpu, so try to print it out directly. 439 * from any cpu, so try to print it out directly.
442 */ 440 */
443 cpu = get_cpu(); 441 get_cpu();
444 442
445 rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi); 443 rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi);
446 444
@@ -464,7 +462,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
464 tmp |= ~((1<<(hi - 1)) - 1); 462 tmp |= ~((1<<(hi - 1)) - 1);
465 463
466 if (tmp != mask_lo) { 464 if (tmp != mask_lo) {
467 WARN_ONCE(1, KERN_INFO "mtrr: your BIOS has set up an incorrect mask, fixing it up.\n"); 465 printk(KERN_WARNING "mtrr: your BIOS has configured an incorrect mask, fixing it.\n");
468 mask_lo = tmp; 466 mask_lo = tmp;
469 } 467 }
470 } 468 }
@@ -570,7 +568,7 @@ static unsigned long set_mtrr_state(void)
570 568
571 569
572static unsigned long cr4; 570static unsigned long cr4;
573static DEFINE_SPINLOCK(set_atomicity_lock); 571static DEFINE_RAW_SPINLOCK(set_atomicity_lock);
574 572
575/* 573/*
576 * Since we are disabling the cache don't allow any interrupts, 574 * Since we are disabling the cache don't allow any interrupts,
@@ -590,7 +588,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
590 * changes to the way the kernel boots 588 * changes to the way the kernel boots
591 */ 589 */
592 590
593 spin_lock(&set_atomicity_lock); 591 raw_spin_lock(&set_atomicity_lock);
594 592
595 /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */ 593 /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */
596 cr0 = read_cr0() | X86_CR0_CD; 594 cr0 = read_cr0() | X86_CR0_CD;
@@ -627,7 +625,7 @@ static void post_set(void) __releases(set_atomicity_lock)
627 /* Restore value of CR4 */ 625 /* Restore value of CR4 */
628 if (cpu_has_pge) 626 if (cpu_has_pge)
629 write_cr4(cr4); 627 write_cr4(cr4);
630 spin_unlock(&set_atomicity_lock); 628 raw_spin_unlock(&set_atomicity_lock);
631} 629}
632 630
633static void generic_set_all(void) 631static void generic_set_all(void)
@@ -752,7 +750,7 @@ int positive_have_wrcomb(void)
752/* 750/*
753 * Generic structure... 751 * Generic structure...
754 */ 752 */
755struct mtrr_ops generic_mtrr_ops = { 753const struct mtrr_ops generic_mtrr_ops = {
756 .use_intel_if = 1, 754 .use_intel_if = 1,
757 .set_all = generic_set_all, 755 .set_all = generic_set_all,
758 .get = generic_get_mtrr, 756 .get = generic_get_mtrr,
diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c
index e006e56f699c..79289632cb27 100644
--- a/arch/x86/kernel/cpu/mtrr/if.c
+++ b/arch/x86/kernel/cpu/mtrr/if.c
@@ -5,6 +5,7 @@
5#include <linux/module.h> 5#include <linux/module.h>
6#include <linux/ctype.h> 6#include <linux/ctype.h>
7#include <linux/string.h> 7#include <linux/string.h>
8#include <linux/slab.h>
8#include <linux/init.h> 9#include <linux/init.h>
9 10
10#define LINE_SIZE 80 11#define LINE_SIZE 80
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 84e83de54575..01c0f3ee6cc3 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -35,6 +35,7 @@
35 35
36#include <linux/types.h> /* FIXME: kvm_para.h needs this */ 36#include <linux/types.h> /* FIXME: kvm_para.h needs this */
37 37
38#include <linux/stop_machine.h>
38#include <linux/kvm_para.h> 39#include <linux/kvm_para.h>
39#include <linux/uaccess.h> 40#include <linux/uaccess.h>
40#include <linux/module.h> 41#include <linux/module.h>
@@ -60,14 +61,14 @@ static DEFINE_MUTEX(mtrr_mutex);
60u64 size_or_mask, size_and_mask; 61u64 size_or_mask, size_and_mask;
61static bool mtrr_aps_delayed_init; 62static bool mtrr_aps_delayed_init;
62 63
63static struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM]; 64static const struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM];
64 65
65struct mtrr_ops *mtrr_if; 66const struct mtrr_ops *mtrr_if;
66 67
67static void set_mtrr(unsigned int reg, unsigned long base, 68static void set_mtrr(unsigned int reg, unsigned long base,
68 unsigned long size, mtrr_type type); 69 unsigned long size, mtrr_type type);
69 70
70void set_mtrr_ops(struct mtrr_ops *ops) 71void set_mtrr_ops(const struct mtrr_ops *ops)
71{ 72{
72 if (ops->vendor && ops->vendor < X86_VENDOR_NUM) 73 if (ops->vendor && ops->vendor < X86_VENDOR_NUM)
73 mtrr_ops[ops->vendor] = ops; 74 mtrr_ops[ops->vendor] = ops;
@@ -143,21 +144,28 @@ struct set_mtrr_data {
143 mtrr_type smp_type; 144 mtrr_type smp_type;
144}; 145};
145 146
147static DEFINE_PER_CPU(struct cpu_stop_work, mtrr_work);
148
146/** 149/**
147 * ipi_handler - Synchronisation handler. Executed by "other" CPUs. 150 * mtrr_work_handler - Synchronisation handler. Executed by "other" CPUs.
151 * @info: pointer to mtrr configuration data
148 * 152 *
149 * Returns nothing. 153 * Returns nothing.
150 */ 154 */
151static void ipi_handler(void *info) 155static int mtrr_work_handler(void *info)
152{ 156{
153#ifdef CONFIG_SMP 157#ifdef CONFIG_SMP
154 struct set_mtrr_data *data = info; 158 struct set_mtrr_data *data = info;
155 unsigned long flags; 159 unsigned long flags;
156 160
161 atomic_dec(&data->count);
162 while (!atomic_read(&data->gate))
163 cpu_relax();
164
157 local_irq_save(flags); 165 local_irq_save(flags);
158 166
159 atomic_dec(&data->count); 167 atomic_dec(&data->count);
160 while (!atomic_read(&data->gate)) 168 while (atomic_read(&data->gate))
161 cpu_relax(); 169 cpu_relax();
162 170
163 /* The master has cleared me to execute */ 171 /* The master has cleared me to execute */
@@ -172,12 +180,13 @@ static void ipi_handler(void *info)
172 } 180 }
173 181
174 atomic_dec(&data->count); 182 atomic_dec(&data->count);
175 while (atomic_read(&data->gate)) 183 while (!atomic_read(&data->gate))
176 cpu_relax(); 184 cpu_relax();
177 185
178 atomic_dec(&data->count); 186 atomic_dec(&data->count);
179 local_irq_restore(flags); 187 local_irq_restore(flags);
180#endif 188#endif
189 return 0;
181} 190}
182 191
183static inline int types_compatible(mtrr_type type1, mtrr_type type2) 192static inline int types_compatible(mtrr_type type1, mtrr_type type2)
@@ -197,7 +206,7 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2)
197 * 206 *
198 * This is kinda tricky, but fortunately, Intel spelled it out for us cleanly: 207 * This is kinda tricky, but fortunately, Intel spelled it out for us cleanly:
199 * 208 *
200 * 1. Send IPI to do the following: 209 * 1. Queue work to do the following on all processors:
201 * 2. Disable Interrupts 210 * 2. Disable Interrupts
202 * 3. Wait for all procs to do so 211 * 3. Wait for all procs to do so
203 * 4. Enter no-fill cache mode 212 * 4. Enter no-fill cache mode
@@ -214,14 +223,17 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2)
214 * 15. Enable interrupts. 223 * 15. Enable interrupts.
215 * 224 *
216 * What does that mean for us? Well, first we set data.count to the number 225 * What does that mean for us? Well, first we set data.count to the number
217 * of CPUs. As each CPU disables interrupts, it'll decrement it once. We wait 226 * of CPUs. As each CPU announces that it started the rendezvous handler by
 218 * until it hits 0 and proceed. We set the data.gate flag and reset data.count. 227 * decrementing the count, we reset data.count and set the data.gate flag,
 219 * Meanwhile, they are waiting for that flag to be set. Once it's set, each 228 * allowing all the CPUs to proceed with the work. As each CPU disables
229 * interrupts, it'll decrement data.count once. We wait until it hits 0 and
230 * proceed. We clear the data.gate flag and reset data.count. Meanwhile, they
231 * are waiting for that flag to be cleared. Once it's cleared, each
220 * CPU goes through the transition of updating MTRRs. 232 * CPU goes through the transition of updating MTRRs.
221 * The CPU vendors may each do it differently, 233 * The CPU vendors may each do it differently,
222 * so we call mtrr_if->set() callback and let them take care of it. 234 * so we call mtrr_if->set() callback and let them take care of it.
223 * When they're done, they again decrement data->count and wait for data.gate 235 * When they're done, they again decrement data->count and wait for data.gate
224 * to be reset. 236 * to be set.
225 * When we finish, we wait for data.count to hit 0 and toggle the data.gate flag 237 * When we finish, we wait for data.count to hit 0 and toggle the data.gate flag
226 * Everyone then enables interrupts and we all continue on. 238 * Everyone then enables interrupts and we all continue on.
227 * 239 *
@@ -233,6 +245,9 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
233{ 245{
234 struct set_mtrr_data data; 246 struct set_mtrr_data data;
235 unsigned long flags; 247 unsigned long flags;
248 int cpu;
249
250 preempt_disable();
236 251
237 data.smp_reg = reg; 252 data.smp_reg = reg;
238 data.smp_base = base; 253 data.smp_base = base;
@@ -245,10 +260,15 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
245 atomic_set(&data.gate, 0); 260 atomic_set(&data.gate, 0);
246 261
247 /* Start the ball rolling on other CPUs */ 262 /* Start the ball rolling on other CPUs */
248 if (smp_call_function(ipi_handler, &data, 0) != 0) 263 for_each_online_cpu(cpu) {
249 panic("mtrr: timed out waiting for other CPUs\n"); 264 struct cpu_stop_work *work = &per_cpu(mtrr_work, cpu);
265
266 if (cpu == smp_processor_id())
267 continue;
268
269 stop_one_cpu_nowait(cpu, mtrr_work_handler, &data, work);
270 }
250 271
251 local_irq_save(flags);
252 272
253 while (atomic_read(&data.count)) 273 while (atomic_read(&data.count))
254 cpu_relax(); 274 cpu_relax();
@@ -258,6 +278,16 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
258 smp_wmb(); 278 smp_wmb();
259 atomic_set(&data.gate, 1); 279 atomic_set(&data.gate, 1);
260 280
281 local_irq_save(flags);
282
283 while (atomic_read(&data.count))
284 cpu_relax();
285
286 /* Ok, reset count and toggle gate */
287 atomic_set(&data.count, num_booting_cpus() - 1);
288 smp_wmb();
289 atomic_set(&data.gate, 0);
290
261 /* Do our MTRR business */ 291 /* Do our MTRR business */
262 292
263 /* 293 /*
@@ -278,7 +308,7 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
278 308
279 atomic_set(&data.count, num_booting_cpus() - 1); 309 atomic_set(&data.count, num_booting_cpus() - 1);
280 smp_wmb(); 310 smp_wmb();
281 atomic_set(&data.gate, 0); 311 atomic_set(&data.gate, 1);
282 312
283 /* 313 /*
284 * Wait here for everyone to have seen the gate change 314 * Wait here for everyone to have seen the gate change
@@ -288,6 +318,7 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
288 cpu_relax(); 318 cpu_relax();
289 319
290 local_irq_restore(flags); 320 local_irq_restore(flags);
321 preempt_enable();
291} 322}
292 323
293/** 324/**
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index a501dee9a87a..df5e41f31a27 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -32,7 +32,7 @@ extern int generic_get_free_region(unsigned long base, unsigned long size,
32extern int generic_validate_add_page(unsigned long base, unsigned long size, 32extern int generic_validate_add_page(unsigned long base, unsigned long size,
33 unsigned int type); 33 unsigned int type);
34 34
35extern struct mtrr_ops generic_mtrr_ops; 35extern const struct mtrr_ops generic_mtrr_ops;
36 36
37extern int positive_have_wrcomb(void); 37extern int positive_have_wrcomb(void);
38 38
@@ -53,10 +53,10 @@ void fill_mtrr_var_range(unsigned int index,
53 u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi); 53 u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi);
54void get_mtrr_state(void); 54void get_mtrr_state(void);
55 55
56extern void set_mtrr_ops(struct mtrr_ops *ops); 56extern void set_mtrr_ops(const struct mtrr_ops *ops);
57 57
58extern u64 size_or_mask, size_and_mask; 58extern u64 size_or_mask, size_and_mask;
59extern struct mtrr_ops *mtrr_if; 59extern const struct mtrr_ops *mtrr_if;
60 60
61#define is_cpu(vnd) (mtrr_if && mtrr_if->vendor == X86_VENDOR_##vnd) 61#define is_cpu(vnd) (mtrr_if && mtrr_if->vendor == X86_VENDOR_##vnd)
62#define use_intel() (mtrr_if && mtrr_if->use_intel_if == 1) 62#define use_intel() (mtrr_if && mtrr_if->use_intel_if == 1)
diff --git a/arch/x86/kernel/cpu/mtrr/state.c b/arch/x86/kernel/cpu/mtrr/state.c
deleted file mode 100644
index dfc80b4e6b0d..000000000000
--- a/arch/x86/kernel/cpu/mtrr/state.c
+++ /dev/null
@@ -1,94 +0,0 @@
1#include <linux/init.h>
2#include <linux/io.h>
3#include <linux/mm.h>
4
5#include <asm/processor-cyrix.h>
6#include <asm/processor-flags.h>
7#include <asm/mtrr.h>
8#include <asm/msr.h>
9
10#include "mtrr.h"
11
12/* Put the processor into a state where MTRRs can be safely set */
13void set_mtrr_prepare_save(struct set_mtrr_context *ctxt)
14{
15 unsigned int cr0;
16
17 /* Disable interrupts locally */
18 local_irq_save(ctxt->flags);
19
20 if (use_intel() || is_cpu(CYRIX)) {
21
22 /* Save value of CR4 and clear Page Global Enable (bit 7) */
23 if (cpu_has_pge) {
24 ctxt->cr4val = read_cr4();
25 write_cr4(ctxt->cr4val & ~X86_CR4_PGE);
26 }
27
28 /*
29 * Disable and flush caches. Note that wbinvd flushes the TLBs
30 * as a side-effect
31 */
32 cr0 = read_cr0() | X86_CR0_CD;
33 wbinvd();
34 write_cr0(cr0);
35 wbinvd();
36
37 if (use_intel()) {
38 /* Save MTRR state */
39 rdmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi);
40 } else {
41 /*
42 * Cyrix ARRs -
43 * everything else were excluded at the top
44 */
45 ctxt->ccr3 = getCx86(CX86_CCR3);
46 }
47 }
48}
49
50void set_mtrr_cache_disable(struct set_mtrr_context *ctxt)
51{
52 if (use_intel()) {
53 /* Disable MTRRs, and set the default type to uncached */
54 mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo & 0xf300UL,
55 ctxt->deftype_hi);
56 } else {
57 if (is_cpu(CYRIX)) {
58 /* Cyrix ARRs - everything else were excluded at the top */
59 setCx86(CX86_CCR3, (ctxt->ccr3 & 0x0f) | 0x10);
60 }
61 }
62}
63
64/* Restore the processor after a set_mtrr_prepare */
65void set_mtrr_done(struct set_mtrr_context *ctxt)
66{
67 if (use_intel() || is_cpu(CYRIX)) {
68
69 /* Flush caches and TLBs */
70 wbinvd();
71
72 /* Restore MTRRdefType */
73 if (use_intel()) {
74 /* Intel (P6) standard MTRRs */
75 mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo,
76 ctxt->deftype_hi);
77 } else {
78 /*
79 * Cyrix ARRs -
80 * everything else was excluded at the top
81 */
82 setCx86(CX86_CCR3, ctxt->ccr3);
83 }
84
85 /* Enable caches */
86 write_cr0(read_cr0() & 0xbfffffff);
87
88 /* Restore value of CR4 */
89 if (cpu_has_pge)
90 write_cr4(ctxt->cr4val);
91 }
92 /* Re-enable interrupts locally (if enabled previously) */
93 local_irq_restore(ctxt->flags);
94}
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 8c1c07073ccc..03a5b0385ad6 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -7,6 +7,7 @@
7 * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter 7 * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
8 * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> 8 * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
9 * Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com> 9 * Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
10 * Copyright (C) 2009 Google, Inc., Stephane Eranian
10 * 11 *
11 * For licencing details see kernel-base/COPYING 12 * For licencing details see kernel-base/COPYING
12 */ 13 */
@@ -20,215 +21,241 @@
20#include <linux/kdebug.h> 21#include <linux/kdebug.h>
21#include <linux/sched.h> 22#include <linux/sched.h>
22#include <linux/uaccess.h> 23#include <linux/uaccess.h>
24#include <linux/slab.h>
23#include <linux/highmem.h> 25#include <linux/highmem.h>
24#include <linux/cpu.h> 26#include <linux/cpu.h>
27#include <linux/bitops.h>
25 28
26#include <asm/apic.h> 29#include <asm/apic.h>
27#include <asm/stacktrace.h> 30#include <asm/stacktrace.h>
28#include <asm/nmi.h> 31#include <asm/nmi.h>
32#include <asm/compat.h>
33
34#if 0
35#undef wrmsrl
36#define wrmsrl(msr, val) \
37do { \
38 trace_printk("wrmsrl(%lx, %lx)\n", (unsigned long)(msr),\
39 (unsigned long)(val)); \
40 native_write_msr((msr), (u32)((u64)(val)), \
41 (u32)((u64)(val) >> 32)); \
42} while (0)
43#endif
29 44
30static u64 perf_event_mask __read_mostly; 45/*
46 * best effort, GUP based copy_from_user() that assumes IRQ or NMI context
47 */
48static unsigned long
49copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
50{
51 unsigned long offset, addr = (unsigned long)from;
52 int type = in_nmi() ? KM_NMI : KM_IRQ0;
53 unsigned long size, len = 0;
54 struct page *page;
55 void *map;
56 int ret;
31 57
32/* The maximal number of PEBS events: */ 58 do {
33#define MAX_PEBS_EVENTS 4 59 ret = __get_user_pages_fast(addr, 1, 0, &page);
60 if (!ret)
61 break;
34 62
35/* The size of a BTS record in bytes: */ 63 offset = addr & (PAGE_SIZE - 1);
36#define BTS_RECORD_SIZE 24 64 size = min(PAGE_SIZE - offset, n - len);
37 65
38/* The size of a per-cpu BTS buffer in bytes: */ 66 map = kmap_atomic(page, type);
39#define BTS_BUFFER_SIZE (BTS_RECORD_SIZE * 2048) 67 memcpy(to, map+offset, size);
68 kunmap_atomic(map, type);
69 put_page(page);
70
71 len += size;
72 to += size;
73 addr += size;
40 74
41/* The BTS overflow threshold in bytes from the end of the buffer: */ 75 } while (len < n);
42#define BTS_OVFL_TH (BTS_RECORD_SIZE * 128)
43 76
77 return len;
78}
44 79
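
The per-page chunking in copy_from_user_nmi() above is plain address arithmetic: take the offset of addr inside its page, copy at most PAGE_SIZE - offset bytes, advance, repeat until n bytes are done. A standalone sketch of that loop (copy_in_page_chunks() is a made-up name, and the memcpy() stands in for the __get_user_pages_fast()/kmap_atomic() pair since everything lives in one address space here):

#include <stddef.h>
#include <string.h>

#define PAGE_SIZE 4096UL

/* Copy n bytes in page-sized chunks, mirroring copy_from_user_nmi(). */
static size_t copy_in_page_chunks(void *to, const void *from, size_t n)
{
	unsigned long addr = (unsigned long)from;
	size_t len = 0;

	while (len < n) {
		unsigned long offset = addr & (PAGE_SIZE - 1);
		size_t size = PAGE_SIZE - offset;

		if (size > n - len)
			size = n - len;

		/* in the kernel: pin the page, kmap_atomic(), then copy */
		memcpy((char *)to + len, (const char *)addr, size);

		len  += size;
		addr += size;
	}
	return len;
}

int main(void)
{
	static char src[10000], dst[10000];

	memset(src, 'x', sizeof(src));
	return copy_in_page_chunks(dst, src, sizeof(src)) == sizeof(src) ? 0 : 1;
}
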
45/* 80struct event_constraint {
46 * Bits in the debugctlmsr controlling branch tracing. 81 union {
47 */ 82 unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
48#define X86_DEBUGCTL_TR (1 << 6) 83 u64 idxmsk64;
49#define X86_DEBUGCTL_BTS (1 << 7) 84 };
50#define X86_DEBUGCTL_BTINT (1 << 8) 85 u64 code;
51#define X86_DEBUGCTL_BTS_OFF_OS (1 << 9) 86 u64 cmask;
52#define X86_DEBUGCTL_BTS_OFF_USR (1 << 10) 87 int weight;
88};
53 89
54/* 90struct amd_nb {
55 * A debug store configuration. 91 int nb_id; /* NorthBridge id */
56 * 92 int refcnt; /* reference count */
57 * We only support architectures that use 64bit fields. 93 struct perf_event *owners[X86_PMC_IDX_MAX];
58 */ 94 struct event_constraint event_constraints[X86_PMC_IDX_MAX];
59struct debug_store {
60 u64 bts_buffer_base;
61 u64 bts_index;
62 u64 bts_absolute_maximum;
63 u64 bts_interrupt_threshold;
64 u64 pebs_buffer_base;
65 u64 pebs_index;
66 u64 pebs_absolute_maximum;
67 u64 pebs_interrupt_threshold;
68 u64 pebs_event_reset[MAX_PEBS_EVENTS];
69}; 95};
70 96
97#define MAX_LBR_ENTRIES 16
98
71struct cpu_hw_events { 99struct cpu_hw_events {
72 struct perf_event *events[X86_PMC_IDX_MAX]; 100 /*
73 unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; 101 * Generic x86 PMC bits
102 */
103 struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */
74 unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; 104 unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
75 unsigned long interrupts; 105 unsigned long running[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
76 int enabled; 106 int enabled;
107
108 int n_events;
109 int n_added;
110 int n_txn;
111 int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
112 u64 tags[X86_PMC_IDX_MAX];
113 struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */
114
115 unsigned int group_flag;
116
117 /*
118 * Intel DebugStore bits
119 */
77 struct debug_store *ds; 120 struct debug_store *ds;
78}; 121 u64 pebs_enabled;
79 122
80struct event_constraint { 123 /*
81 unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; 124 * Intel LBR bits
82 int code; 125 */
126 int lbr_users;
127 void *lbr_context;
128 struct perf_branch_stack lbr_stack;
129 struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES];
130
131 /*
132 * AMD specific bits
133 */
134 struct amd_nb *amd_nb;
83}; 135};
84 136
85#define EVENT_CONSTRAINT(c, m) { .code = (c), .idxmsk[0] = (m) } 137#define __EVENT_CONSTRAINT(c, n, m, w) {\
86#define EVENT_CONSTRAINT_END { .code = 0, .idxmsk[0] = 0 } 138 { .idxmsk64 = (n) }, \
139 .code = (c), \
140 .cmask = (m), \
141 .weight = (w), \
142}
87 143
88#define for_each_event_constraint(e, c) \ 144#define EVENT_CONSTRAINT(c, n, m) \
89 for ((e) = (c); (e)->idxmsk[0]; (e)++) 145 __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n))
90 146
147/*
148 * Constraint on the Event code.
149 */
150#define INTEL_EVENT_CONSTRAINT(c, n) \
151 EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT)
152
153/*
154 * Constraint on the Event code + UMask + fixed-mask
155 *
156 * filter mask to validate fixed counter events.
157 * the following filters disqualify for fixed counters:
158 * - inv
159 * - edge
160 * - cnt-mask
161 * The other filters are supported by fixed counters.
162 * The any-thread option is supported starting with v3.
163 */
164#define FIXED_EVENT_CONSTRAINT(c, n) \
165 EVENT_CONSTRAINT(c, (1ULL << (32+n)), X86_RAW_EVENT_MASK)
166
167/*
168 * Constraint on the Event code + UMask
169 */
170#define PEBS_EVENT_CONSTRAINT(c, n) \
171 EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK)
172
173#define EVENT_CONSTRAINT_END \
174 EVENT_CONSTRAINT(0, 0, 0)
175
176#define for_each_event_constraint(e, c) \
177 for ((e) = (c); (e)->weight; (e)++)
178
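
To make the new encoding concrete, here is a hypothetical constraint written with the macros above, shown next to roughly what it expands to (a kernel-context sketch, not a standalone program; the 0x12 event code and counter mask are made up):

/*
 * Hypothetical use of the new macros: an event code 0x12 that may only
 * run on counters 0 and 1.
 *
 *	INTEL_EVENT_CONSTRAINT(0x12, 0x3)
 *
 * expands, via EVENT_CONSTRAINT and __EVENT_CONSTRAINT, to roughly:
 */
static struct event_constraint example = {
	{ .idxmsk64 = 0x3 },			/* counters 0 and 1 */
	.code	= 0x12,
	.cmask	= ARCH_PERFMON_EVENTSEL_EVENT,	/* match on event code only */
	.weight	= 2,				/* HWEIGHT(0x3) */
};

A constraint is then selected by testing (hwc->config & c->cmask) == c->code, so cmask chooses which bits of the event configuration have to match, and weight feeds the scheduler below.
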
179union perf_capabilities {
180 struct {
181 u64 lbr_format : 6;
182 u64 pebs_trap : 1;
183 u64 pebs_arch_reg : 1;
184 u64 pebs_format : 4;
185 u64 smm_freeze : 1;
186 };
187 u64 capabilities;
188};
91 189
92/* 190/*
93 * struct x86_pmu - generic x86 pmu 191 * struct x86_pmu - generic x86 pmu
94 */ 192 */
95struct x86_pmu { 193struct x86_pmu {
194 /*
195 * Generic x86 PMC bits
196 */
96 const char *name; 197 const char *name;
97 int version; 198 int version;
98 int (*handle_irq)(struct pt_regs *); 199 int (*handle_irq)(struct pt_regs *);
99 void (*disable_all)(void); 200 void (*disable_all)(void);
100 void (*enable_all)(void); 201 void (*enable_all)(int added);
101 void (*enable)(struct hw_perf_event *, int); 202 void (*enable)(struct perf_event *);
102 void (*disable)(struct hw_perf_event *, int); 203 void (*disable)(struct perf_event *);
204 int (*hw_config)(struct perf_event *event);
205 int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign);
103 unsigned eventsel; 206 unsigned eventsel;
104 unsigned perfctr; 207 unsigned perfctr;
105 u64 (*event_map)(int); 208 u64 (*event_map)(int);
106 u64 (*raw_event)(u64);
107 int max_events; 209 int max_events;
108 int num_events; 210 int num_counters;
109 int num_events_fixed; 211 int num_counters_fixed;
110 int event_bits; 212 int cntval_bits;
111 u64 event_mask; 213 u64 cntval_mask;
112 int apic; 214 int apic;
113 u64 max_period; 215 u64 max_period;
114 u64 intel_ctrl; 216 struct event_constraint *
115 void (*enable_bts)(u64 config); 217 (*get_event_constraints)(struct cpu_hw_events *cpuc,
116 void (*disable_bts)(void); 218 struct perf_event *event);
117 int (*get_event_idx)(struct cpu_hw_events *cpuc,
118 struct hw_perf_event *hwc);
119};
120 219
121static struct x86_pmu x86_pmu __read_mostly; 220 void (*put_event_constraints)(struct cpu_hw_events *cpuc,
221 struct perf_event *event);
222 struct event_constraint *event_constraints;
223 void (*quirks)(void);
224 int perfctr_second_write;
122 225
123static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { 226 int (*cpu_prepare)(int cpu);
124 .enabled = 1, 227 void (*cpu_starting)(int cpu);
125}; 228 void (*cpu_dying)(int cpu);
126 229 void (*cpu_dead)(int cpu);
127static const struct event_constraint *event_constraints;
128
129/*
130 * Not sure about some of these
131 */
132static const u64 p6_perfmon_event_map[] =
133{
134 [PERF_COUNT_HW_CPU_CYCLES] = 0x0079,
135 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
136 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e,
137 [PERF_COUNT_HW_CACHE_MISSES] = 0x012e,
138 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
139 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
140 [PERF_COUNT_HW_BUS_CYCLES] = 0x0062,
141};
142
143static u64 p6_pmu_event_map(int hw_event)
144{
145 return p6_perfmon_event_map[hw_event];
146}
147 230
148/* 231 /*
149 * Event setting that is specified not to count anything. 232 * Intel Arch Perfmon v2+
150 * We use this to effectively disable a counter. 233 */
151 * 234 u64 intel_ctrl;
152 * L2_RQSTS with 0 MESI unit mask. 235 union perf_capabilities intel_cap;
153 */
154#define P6_NOP_EVENT 0x0000002EULL
155
156static u64 p6_pmu_raw_event(u64 hw_event)
157{
158#define P6_EVNTSEL_EVENT_MASK 0x000000FFULL
159#define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL
160#define P6_EVNTSEL_EDGE_MASK 0x00040000ULL
161#define P6_EVNTSEL_INV_MASK 0x00800000ULL
162#define P6_EVNTSEL_REG_MASK 0xFF000000ULL
163
164#define P6_EVNTSEL_MASK \
165 (P6_EVNTSEL_EVENT_MASK | \
166 P6_EVNTSEL_UNIT_MASK | \
167 P6_EVNTSEL_EDGE_MASK | \
168 P6_EVNTSEL_INV_MASK | \
169 P6_EVNTSEL_REG_MASK)
170
171 return hw_event & P6_EVNTSEL_MASK;
172}
173 236
174static const struct event_constraint intel_p6_event_constraints[] = 237 /*
175{ 238 * Intel DebugStore bits
176 EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */ 239 */
177 EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ 240 int bts, pebs;
178 EVENT_CONSTRAINT(0x11, 0x1), /* FP_ASSIST */ 241 int pebs_record_size;
179 EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ 242 void (*drain_pebs)(struct pt_regs *regs);
180 EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ 243 struct event_constraint *pebs_constraints;
181 EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
182 EVENT_CONSTRAINT_END
183};
184 244
185/* 245 /*
186 * Intel PerfMon v3. Used on Core2 and later. 246 * Intel LBR
187 */ 247 */
188static const u64 intel_perfmon_event_map[] = 248 unsigned long lbr_tos, lbr_from, lbr_to; /* MSR base regs */
189{ 249 int lbr_nr; /* hardware stack size */
190 [PERF_COUNT_HW_CPU_CYCLES] = 0x003c,
191 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
192 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e,
193 [PERF_COUNT_HW_CACHE_MISSES] = 0x412e,
194 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
195 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
196 [PERF_COUNT_HW_BUS_CYCLES] = 0x013c,
197}; 250};
198 251
199static const struct event_constraint intel_core_event_constraints[] = 252static struct x86_pmu x86_pmu __read_mostly;
200{
201 EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
202 EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
203 EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
204 EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
205 EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
206 EVENT_CONSTRAINT(0x18, 0x1), /* IDLE_DURING_DIV */
207 EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */
208 EVENT_CONSTRAINT(0xa1, 0x1), /* RS_UOPS_DISPATCH_CYCLES */
209 EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED */
210 EVENT_CONSTRAINT_END
211};
212 253
213static const struct event_constraint intel_nehalem_event_constraints[] = 254static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
214{ 255 .enabled = 1,
215 EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */
216 EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */
217 EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */
218 EVENT_CONSTRAINT(0x43, 0x3), /* L1D_ALL_REF */
219 EVENT_CONSTRAINT(0x4e, 0x3), /* L1D_PREFETCH */
220 EVENT_CONSTRAINT(0x4c, 0x3), /* LOAD_HIT_PRE */
221 EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
222 EVENT_CONSTRAINT(0x52, 0x3), /* L1D_CACHE_PREFETCH_LOCK_FB_HIT */
223 EVENT_CONSTRAINT(0x53, 0x3), /* L1D_CACHE_LOCK_FB_HIT */
224 EVENT_CONSTRAINT(0xc5, 0x3), /* CACHE_LOCK_CYCLES */
225 EVENT_CONSTRAINT_END
226}; 256};
227 257
228static u64 intel_pmu_event_map(int hw_event) 258static int x86_perf_event_set_period(struct perf_event *event);
229{
230 return intel_perfmon_event_map[hw_event];
231}
232 259
233/* 260/*
234 * Generalized hw caching related hw_event table, filled 261 * Generalized hw caching related hw_event table, filled
@@ -245,435 +272,18 @@ static u64 __read_mostly hw_cache_event_ids
245 [PERF_COUNT_HW_CACHE_OP_MAX] 272 [PERF_COUNT_HW_CACHE_OP_MAX]
246 [PERF_COUNT_HW_CACHE_RESULT_MAX]; 273 [PERF_COUNT_HW_CACHE_RESULT_MAX];
247 274
248static __initconst u64 nehalem_hw_cache_event_ids
249 [PERF_COUNT_HW_CACHE_MAX]
250 [PERF_COUNT_HW_CACHE_OP_MAX]
251 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
252{
253 [ C(L1D) ] = {
254 [ C(OP_READ) ] = {
255 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */
256 [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */
257 },
258 [ C(OP_WRITE) ] = {
259 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */
260 [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */
261 },
262 [ C(OP_PREFETCH) ] = {
263 [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */
264 [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */
265 },
266 },
267 [ C(L1I ) ] = {
268 [ C(OP_READ) ] = {
269 [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
270 [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
271 },
272 [ C(OP_WRITE) ] = {
273 [ C(RESULT_ACCESS) ] = -1,
274 [ C(RESULT_MISS) ] = -1,
275 },
276 [ C(OP_PREFETCH) ] = {
277 [ C(RESULT_ACCESS) ] = 0x0,
278 [ C(RESULT_MISS) ] = 0x0,
279 },
280 },
281 [ C(LL ) ] = {
282 [ C(OP_READ) ] = {
283 [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */
284 [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */
285 },
286 [ C(OP_WRITE) ] = {
287 [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */
288 [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */
289 },
290 [ C(OP_PREFETCH) ] = {
291 [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */
292 [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */
293 },
294 },
295 [ C(DTLB) ] = {
296 [ C(OP_READ) ] = {
297 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
298 [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */
299 },
300 [ C(OP_WRITE) ] = {
301 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
302 [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */
303 },
304 [ C(OP_PREFETCH) ] = {
305 [ C(RESULT_ACCESS) ] = 0x0,
306 [ C(RESULT_MISS) ] = 0x0,
307 },
308 },
309 [ C(ITLB) ] = {
310 [ C(OP_READ) ] = {
311 [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */
312 [ C(RESULT_MISS) ] = 0x20c8, /* ITLB_MISS_RETIRED */
313 },
314 [ C(OP_WRITE) ] = {
315 [ C(RESULT_ACCESS) ] = -1,
316 [ C(RESULT_MISS) ] = -1,
317 },
318 [ C(OP_PREFETCH) ] = {
319 [ C(RESULT_ACCESS) ] = -1,
320 [ C(RESULT_MISS) ] = -1,
321 },
322 },
323 [ C(BPU ) ] = {
324 [ C(OP_READ) ] = {
325 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
326 [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */
327 },
328 [ C(OP_WRITE) ] = {
329 [ C(RESULT_ACCESS) ] = -1,
330 [ C(RESULT_MISS) ] = -1,
331 },
332 [ C(OP_PREFETCH) ] = {
333 [ C(RESULT_ACCESS) ] = -1,
334 [ C(RESULT_MISS) ] = -1,
335 },
336 },
337};
338
339static __initconst u64 core2_hw_cache_event_ids
340 [PERF_COUNT_HW_CACHE_MAX]
341 [PERF_COUNT_HW_CACHE_OP_MAX]
342 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
343{
344 [ C(L1D) ] = {
345 [ C(OP_READ) ] = {
346 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */
347 [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */
348 },
349 [ C(OP_WRITE) ] = {
350 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */
351 [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */
352 },
353 [ C(OP_PREFETCH) ] = {
354 [ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS */
355 [ C(RESULT_MISS) ] = 0,
356 },
357 },
358 [ C(L1I ) ] = {
359 [ C(OP_READ) ] = {
360 [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */
361 [ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */
362 },
363 [ C(OP_WRITE) ] = {
364 [ C(RESULT_ACCESS) ] = -1,
365 [ C(RESULT_MISS) ] = -1,
366 },
367 [ C(OP_PREFETCH) ] = {
368 [ C(RESULT_ACCESS) ] = 0,
369 [ C(RESULT_MISS) ] = 0,
370 },
371 },
372 [ C(LL ) ] = {
373 [ C(OP_READ) ] = {
374 [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
375 [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
376 },
377 [ C(OP_WRITE) ] = {
378 [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */
379 [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */
380 },
381 [ C(OP_PREFETCH) ] = {
382 [ C(RESULT_ACCESS) ] = 0,
383 [ C(RESULT_MISS) ] = 0,
384 },
385 },
386 [ C(DTLB) ] = {
387 [ C(OP_READ) ] = {
388 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
389 [ C(RESULT_MISS) ] = 0x0208, /* DTLB_MISSES.MISS_LD */
390 },
391 [ C(OP_WRITE) ] = {
392 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
393 [ C(RESULT_MISS) ] = 0x0808, /* DTLB_MISSES.MISS_ST */
394 },
395 [ C(OP_PREFETCH) ] = {
396 [ C(RESULT_ACCESS) ] = 0,
397 [ C(RESULT_MISS) ] = 0,
398 },
399 },
400 [ C(ITLB) ] = {
401 [ C(OP_READ) ] = {
402 [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
403 [ C(RESULT_MISS) ] = 0x1282, /* ITLBMISSES */
404 },
405 [ C(OP_WRITE) ] = {
406 [ C(RESULT_ACCESS) ] = -1,
407 [ C(RESULT_MISS) ] = -1,
408 },
409 [ C(OP_PREFETCH) ] = {
410 [ C(RESULT_ACCESS) ] = -1,
411 [ C(RESULT_MISS) ] = -1,
412 },
413 },
414 [ C(BPU ) ] = {
415 [ C(OP_READ) ] = {
416 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
417 [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
418 },
419 [ C(OP_WRITE) ] = {
420 [ C(RESULT_ACCESS) ] = -1,
421 [ C(RESULT_MISS) ] = -1,
422 },
423 [ C(OP_PREFETCH) ] = {
424 [ C(RESULT_ACCESS) ] = -1,
425 [ C(RESULT_MISS) ] = -1,
426 },
427 },
428};
429
430static __initconst u64 atom_hw_cache_event_ids
431 [PERF_COUNT_HW_CACHE_MAX]
432 [PERF_COUNT_HW_CACHE_OP_MAX]
433 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
434{
435 [ C(L1D) ] = {
436 [ C(OP_READ) ] = {
437 [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD */
438 [ C(RESULT_MISS) ] = 0,
439 },
440 [ C(OP_WRITE) ] = {
441 [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST */
442 [ C(RESULT_MISS) ] = 0,
443 },
444 [ C(OP_PREFETCH) ] = {
445 [ C(RESULT_ACCESS) ] = 0x0,
446 [ C(RESULT_MISS) ] = 0,
447 },
448 },
449 [ C(L1I ) ] = {
450 [ C(OP_READ) ] = {
451 [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
452 [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
453 },
454 [ C(OP_WRITE) ] = {
455 [ C(RESULT_ACCESS) ] = -1,
456 [ C(RESULT_MISS) ] = -1,
457 },
458 [ C(OP_PREFETCH) ] = {
459 [ C(RESULT_ACCESS) ] = 0,
460 [ C(RESULT_MISS) ] = 0,
461 },
462 },
463 [ C(LL ) ] = {
464 [ C(OP_READ) ] = {
465 [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
466 [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
467 },
468 [ C(OP_WRITE) ] = {
469 [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */
470 [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */
471 },
472 [ C(OP_PREFETCH) ] = {
473 [ C(RESULT_ACCESS) ] = 0,
474 [ C(RESULT_MISS) ] = 0,
475 },
476 },
477 [ C(DTLB) ] = {
478 [ C(OP_READ) ] = {
479 [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI (alias) */
480 [ C(RESULT_MISS) ] = 0x0508, /* DTLB_MISSES.MISS_LD */
481 },
482 [ C(OP_WRITE) ] = {
483 [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI (alias) */
484 [ C(RESULT_MISS) ] = 0x0608, /* DTLB_MISSES.MISS_ST */
485 },
486 [ C(OP_PREFETCH) ] = {
487 [ C(RESULT_ACCESS) ] = 0,
488 [ C(RESULT_MISS) ] = 0,
489 },
490 },
491 [ C(ITLB) ] = {
492 [ C(OP_READ) ] = {
493 [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
494 [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */
495 },
496 [ C(OP_WRITE) ] = {
497 [ C(RESULT_ACCESS) ] = -1,
498 [ C(RESULT_MISS) ] = -1,
499 },
500 [ C(OP_PREFETCH) ] = {
501 [ C(RESULT_ACCESS) ] = -1,
502 [ C(RESULT_MISS) ] = -1,
503 },
504 },
505 [ C(BPU ) ] = {
506 [ C(OP_READ) ] = {
507 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
508 [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
509 },
510 [ C(OP_WRITE) ] = {
511 [ C(RESULT_ACCESS) ] = -1,
512 [ C(RESULT_MISS) ] = -1,
513 },
514 [ C(OP_PREFETCH) ] = {
515 [ C(RESULT_ACCESS) ] = -1,
516 [ C(RESULT_MISS) ] = -1,
517 },
518 },
519};
520
521static u64 intel_pmu_raw_event(u64 hw_event)
522{
523#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL
524#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL
525#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL
526#define CORE_EVNTSEL_INV_MASK 0x00800000ULL
527#define CORE_EVNTSEL_REG_MASK 0xFF000000ULL
528
529#define CORE_EVNTSEL_MASK \
530 (CORE_EVNTSEL_EVENT_MASK | \
531 CORE_EVNTSEL_UNIT_MASK | \
532 CORE_EVNTSEL_EDGE_MASK | \
533 CORE_EVNTSEL_INV_MASK | \
534 CORE_EVNTSEL_REG_MASK)
535
536 return hw_event & CORE_EVNTSEL_MASK;
537}
538
539static __initconst u64 amd_hw_cache_event_ids
540 [PERF_COUNT_HW_CACHE_MAX]
541 [PERF_COUNT_HW_CACHE_OP_MAX]
542 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
543{
544 [ C(L1D) ] = {
545 [ C(OP_READ) ] = {
546 [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */
547 [ C(RESULT_MISS) ] = 0x0041, /* Data Cache Misses */
548 },
549 [ C(OP_WRITE) ] = {
550 [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */
551 [ C(RESULT_MISS) ] = 0,
552 },
553 [ C(OP_PREFETCH) ] = {
554 [ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts */
555 [ C(RESULT_MISS) ] = 0x0167, /* Data Prefetcher :cancelled */
556 },
557 },
558 [ C(L1I ) ] = {
559 [ C(OP_READ) ] = {
560 [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches */
561 [ C(RESULT_MISS) ] = 0x0081, /* Instruction cache misses */
562 },
563 [ C(OP_WRITE) ] = {
564 [ C(RESULT_ACCESS) ] = -1,
565 [ C(RESULT_MISS) ] = -1,
566 },
567 [ C(OP_PREFETCH) ] = {
568 [ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */
569 [ C(RESULT_MISS) ] = 0,
570 },
571 },
572 [ C(LL ) ] = {
573 [ C(OP_READ) ] = {
574 [ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */
575 [ C(RESULT_MISS) ] = 0x037E, /* L2 Cache Misses : IC+DC */
576 },
577 [ C(OP_WRITE) ] = {
578 [ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback */
579 [ C(RESULT_MISS) ] = 0,
580 },
581 [ C(OP_PREFETCH) ] = {
582 [ C(RESULT_ACCESS) ] = 0,
583 [ C(RESULT_MISS) ] = 0,
584 },
585 },
586 [ C(DTLB) ] = {
587 [ C(OP_READ) ] = {
588 [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */
589 [ C(RESULT_MISS) ] = 0x0046, /* L1 DTLB and L2 DLTB Miss */
590 },
591 [ C(OP_WRITE) ] = {
592 [ C(RESULT_ACCESS) ] = 0,
593 [ C(RESULT_MISS) ] = 0,
594 },
595 [ C(OP_PREFETCH) ] = {
596 [ C(RESULT_ACCESS) ] = 0,
597 [ C(RESULT_MISS) ] = 0,
598 },
599 },
600 [ C(ITLB) ] = {
601 [ C(OP_READ) ] = {
602 [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes */
603 [ C(RESULT_MISS) ] = 0x0085, /* Instr. fetch ITLB misses */
604 },
605 [ C(OP_WRITE) ] = {
606 [ C(RESULT_ACCESS) ] = -1,
607 [ C(RESULT_MISS) ] = -1,
608 },
609 [ C(OP_PREFETCH) ] = {
610 [ C(RESULT_ACCESS) ] = -1,
611 [ C(RESULT_MISS) ] = -1,
612 },
613 },
614 [ C(BPU ) ] = {
615 [ C(OP_READ) ] = {
616 [ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr. */
617 [ C(RESULT_MISS) ] = 0x00c3, /* Retired Mispredicted BI */
618 },
619 [ C(OP_WRITE) ] = {
620 [ C(RESULT_ACCESS) ] = -1,
621 [ C(RESULT_MISS) ] = -1,
622 },
623 [ C(OP_PREFETCH) ] = {
624 [ C(RESULT_ACCESS) ] = -1,
625 [ C(RESULT_MISS) ] = -1,
626 },
627 },
628};
629
630/*
631 * AMD Performance Monitor K7 and later.
632 */
633static const u64 amd_perfmon_event_map[] =
634{
635 [PERF_COUNT_HW_CPU_CYCLES] = 0x0076,
636 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
637 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080,
638 [PERF_COUNT_HW_CACHE_MISSES] = 0x0081,
639 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
640 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
641};
642
643static u64 amd_pmu_event_map(int hw_event)
644{
645 return amd_perfmon_event_map[hw_event];
646}
647
648static u64 amd_pmu_raw_event(u64 hw_event)
649{
650#define K7_EVNTSEL_EVENT_MASK 0x7000000FFULL
651#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL
652#define K7_EVNTSEL_EDGE_MASK 0x000040000ULL
653#define K7_EVNTSEL_INV_MASK 0x000800000ULL
654#define K7_EVNTSEL_REG_MASK 0x0FF000000ULL
655
656#define K7_EVNTSEL_MASK \
657 (K7_EVNTSEL_EVENT_MASK | \
658 K7_EVNTSEL_UNIT_MASK | \
659 K7_EVNTSEL_EDGE_MASK | \
660 K7_EVNTSEL_INV_MASK | \
661 K7_EVNTSEL_REG_MASK)
662
663 return hw_event & K7_EVNTSEL_MASK;
664}
665
666/* 275/*
667 * Propagate event elapsed time into the generic event. 276 * Propagate event elapsed time into the generic event.
668 * Can only be executed on the CPU where the event is active. 277 * Can only be executed on the CPU where the event is active.
669 * Returns the delta events processed. 278 * Returns the delta events processed.
670 */ 279 */
671static u64 280static u64
672x86_perf_event_update(struct perf_event *event, 281x86_perf_event_update(struct perf_event *event)
673 struct hw_perf_event *hwc, int idx)
674{ 282{
675 int shift = 64 - x86_pmu.event_bits; 283 struct hw_perf_event *hwc = &event->hw;
284 int shift = 64 - x86_pmu.cntval_bits;
676 u64 prev_raw_count, new_raw_count; 285 u64 prev_raw_count, new_raw_count;
286 int idx = hwc->idx;
677 s64 delta; 287 s64 delta;
678 288
679 if (idx == X86_PMC_IDX_FIXED_BTS) 289 if (idx == X86_PMC_IDX_FIXED_BTS)
@@ -687,10 +297,10 @@ x86_perf_event_update(struct perf_event *event,
687 * count to the generic event atomically: 297 * count to the generic event atomically:
688 */ 298 */
689again: 299again:
690 prev_raw_count = atomic64_read(&hwc->prev_count); 300 prev_raw_count = local64_read(&hwc->prev_count);
691 rdmsrl(hwc->event_base + idx, new_raw_count); 301 rdmsrl(hwc->event_base + idx, new_raw_count);
692 302
693 if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count, 303 if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
694 new_raw_count) != prev_raw_count) 304 new_raw_count) != prev_raw_count)
695 goto again; 305 goto again;
696 306
@@ -705,8 +315,8 @@ again:
705 delta = (new_raw_count << shift) - (prev_raw_count << shift); 315 delta = (new_raw_count << shift) - (prev_raw_count << shift);
706 delta >>= shift; 316 delta >>= shift;
707 317
708 atomic64_add(delta, &event->count); 318 local64_add(delta, &event->count);
709 atomic64_sub(delta, &hwc->period_left); 319 local64_sub(delta, &hwc->period_left);
710 320
711 return new_raw_count; 321 return new_raw_count;
712} 322}
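
The shift pair in x86_perf_event_update() is what makes the subtraction come out right for counters narrower than 64 bits: both raw reads are shifted up by 64 - cntval_bits so a wrap is computed in the counter's own width, then the result is shifted back down. A standalone illustration (counter_delta() and the 40-bit width are arbitrary choices; it assumes the usual two's-complement, sign-preserving right shift the kernel also relies on):

#include <stdint.h>
#include <stdio.h>

/* Width-correct delta between two raw reads of an N-bit counter. */
static int64_t counter_delta(uint64_t prev, uint64_t new, int cntval_bits)
{
	int shift = 64 - cntval_bits;
	int64_t delta;

	delta = ((int64_t)(new << shift)) - ((int64_t)(prev << shift));
	delta >>= shift;	/* arithmetic shift restores the scale */
	return delta;
}

int main(void)
{
	/* a 40-bit counter that wrapped from three-below-the-top to 5 */
	uint64_t prev = (1ULL << 40) - 3;
	uint64_t new  = 5;

	/* prints 8: the counter advanced by eight events across the wrap */
	printf("delta = %lld\n", (long long)counter_delta(prev, new, 40));
	return 0;
}
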
@@ -714,33 +324,32 @@ again:
714static atomic_t active_events; 324static atomic_t active_events;
715static DEFINE_MUTEX(pmc_reserve_mutex); 325static DEFINE_MUTEX(pmc_reserve_mutex);
716 326
327#ifdef CONFIG_X86_LOCAL_APIC
328
717static bool reserve_pmc_hardware(void) 329static bool reserve_pmc_hardware(void)
718{ 330{
719#ifdef CONFIG_X86_LOCAL_APIC
720 int i; 331 int i;
721 332
722 if (nmi_watchdog == NMI_LOCAL_APIC) 333 if (nmi_watchdog == NMI_LOCAL_APIC)
723 disable_lapic_nmi_watchdog(); 334 disable_lapic_nmi_watchdog();
724 335
725 for (i = 0; i < x86_pmu.num_events; i++) { 336 for (i = 0; i < x86_pmu.num_counters; i++) {
726 if (!reserve_perfctr_nmi(x86_pmu.perfctr + i)) 337 if (!reserve_perfctr_nmi(x86_pmu.perfctr + i))
727 goto perfctr_fail; 338 goto perfctr_fail;
728 } 339 }
729 340
730 for (i = 0; i < x86_pmu.num_events; i++) { 341 for (i = 0; i < x86_pmu.num_counters; i++) {
731 if (!reserve_evntsel_nmi(x86_pmu.eventsel + i)) 342 if (!reserve_evntsel_nmi(x86_pmu.eventsel + i))
732 goto eventsel_fail; 343 goto eventsel_fail;
733 } 344 }
734#endif
735 345
736 return true; 346 return true;
737 347
738#ifdef CONFIG_X86_LOCAL_APIC
739eventsel_fail: 348eventsel_fail:
740 for (i--; i >= 0; i--) 349 for (i--; i >= 0; i--)
741 release_evntsel_nmi(x86_pmu.eventsel + i); 350 release_evntsel_nmi(x86_pmu.eventsel + i);
742 351
743 i = x86_pmu.num_events; 352 i = x86_pmu.num_counters;
744 353
745perfctr_fail: 354perfctr_fail:
746 for (i--; i >= 0; i--) 355 for (i--; i >= 0; i--)
@@ -750,128 +359,36 @@ perfctr_fail:
750 enable_lapic_nmi_watchdog(); 359 enable_lapic_nmi_watchdog();
751 360
752 return false; 361 return false;
753#endif
754} 362}
755 363
756static void release_pmc_hardware(void) 364static void release_pmc_hardware(void)
757{ 365{
758#ifdef CONFIG_X86_LOCAL_APIC
759 int i; 366 int i;
760 367
761 for (i = 0; i < x86_pmu.num_events; i++) { 368 for (i = 0; i < x86_pmu.num_counters; i++) {
762 release_perfctr_nmi(x86_pmu.perfctr + i); 369 release_perfctr_nmi(x86_pmu.perfctr + i);
763 release_evntsel_nmi(x86_pmu.eventsel + i); 370 release_evntsel_nmi(x86_pmu.eventsel + i);
764 } 371 }
765 372
766 if (nmi_watchdog == NMI_LOCAL_APIC) 373 if (nmi_watchdog == NMI_LOCAL_APIC)
767 enable_lapic_nmi_watchdog(); 374 enable_lapic_nmi_watchdog();
768#endif
769}
770
771static inline bool bts_available(void)
772{
773 return x86_pmu.enable_bts != NULL;
774}
775
776static inline void init_debug_store_on_cpu(int cpu)
777{
778 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
779
780 if (!ds)
781 return;
782
783 wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA,
784 (u32)((u64)(unsigned long)ds),
785 (u32)((u64)(unsigned long)ds >> 32));
786}
787
788static inline void fini_debug_store_on_cpu(int cpu)
789{
790 if (!per_cpu(cpu_hw_events, cpu).ds)
791 return;
792
793 wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
794} 375}
795 376
796static void release_bts_hardware(void) 377#else
797{
798 int cpu;
799
800 if (!bts_available())
801 return;
802
803 get_online_cpus();
804
805 for_each_online_cpu(cpu)
806 fini_debug_store_on_cpu(cpu);
807 378
808 for_each_possible_cpu(cpu) { 379static bool reserve_pmc_hardware(void) { return true; }
809 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; 380static void release_pmc_hardware(void) {}
810
811 if (!ds)
812 continue;
813
814 per_cpu(cpu_hw_events, cpu).ds = NULL;
815
816 kfree((void *)(unsigned long)ds->bts_buffer_base);
817 kfree(ds);
818 }
819
820 put_online_cpus();
821}
822
823static int reserve_bts_hardware(void)
824{
825 int cpu, err = 0;
826
827 if (!bts_available())
828 return 0;
829
830 get_online_cpus();
831
832 for_each_possible_cpu(cpu) {
833 struct debug_store *ds;
834 void *buffer;
835
836 err = -ENOMEM;
837 buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL);
838 if (unlikely(!buffer))
839 break;
840
841 ds = kzalloc(sizeof(*ds), GFP_KERNEL);
842 if (unlikely(!ds)) {
843 kfree(buffer);
844 break;
845 }
846 381
847 ds->bts_buffer_base = (u64)(unsigned long)buffer; 382#endif
848 ds->bts_index = ds->bts_buffer_base;
849 ds->bts_absolute_maximum =
850 ds->bts_buffer_base + BTS_BUFFER_SIZE;
851 ds->bts_interrupt_threshold =
852 ds->bts_absolute_maximum - BTS_OVFL_TH;
853
854 per_cpu(cpu_hw_events, cpu).ds = ds;
855 err = 0;
856 }
857
858 if (err)
859 release_bts_hardware();
860 else {
861 for_each_online_cpu(cpu)
862 init_debug_store_on_cpu(cpu);
863 }
864 383
865 put_online_cpus(); 384static int reserve_ds_buffers(void);
866 385static void release_ds_buffers(void);
867 return err;
868}
869 386
870static void hw_perf_event_destroy(struct perf_event *event) 387static void hw_perf_event_destroy(struct perf_event *event)
871{ 388{
872 if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) { 389 if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) {
873 release_pmc_hardware(); 390 release_pmc_hardware();
874 release_bts_hardware(); 391 release_ds_buffers();
875 mutex_unlock(&pmc_reserve_mutex); 392 mutex_unlock(&pmc_reserve_mutex);
876 } 393 }
877} 394}
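
The reserve/release hunk above also moves the CONFIG_X86_LOCAL_APIC conditionals out of the function bodies: the real implementations now sit under one #ifdef and the #else branch supplies one-line stubs, so callers never need their own conditionals. The same shape in miniature, with a made-up CONFIG_FOO option (bool comes from <stdbool.h> here; kernel code gets it from <linux/types.h>):

#include <stdbool.h>

#ifdef CONFIG_FOO

static bool reserve_foo_hardware(void)
{
	/* real reservation work lives here */
	return true;
}

static void release_foo_hardware(void)
{
	/* real release work lives here */
}

#else	/* !CONFIG_FOO */

static bool reserve_foo_hardware(void) { return true; }
static void release_foo_hardware(void) { }

#endif
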
@@ -914,93 +431,16 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr)
914 return 0; 431 return 0;
915} 432}
916 433
917static void intel_pmu_enable_bts(u64 config) 434static int x86_setup_perfctr(struct perf_event *event)
918{
919 unsigned long debugctlmsr;
920
921 debugctlmsr = get_debugctlmsr();
922
923 debugctlmsr |= X86_DEBUGCTL_TR;
924 debugctlmsr |= X86_DEBUGCTL_BTS;
925 debugctlmsr |= X86_DEBUGCTL_BTINT;
926
927 if (!(config & ARCH_PERFMON_EVENTSEL_OS))
928 debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS;
929
930 if (!(config & ARCH_PERFMON_EVENTSEL_USR))
931 debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR;
932
933 update_debugctlmsr(debugctlmsr);
934}
935
936static void intel_pmu_disable_bts(void)
937{
938 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
939 unsigned long debugctlmsr;
940
941 if (!cpuc->ds)
942 return;
943
944 debugctlmsr = get_debugctlmsr();
945
946 debugctlmsr &=
947 ~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT |
948 X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR);
949
950 update_debugctlmsr(debugctlmsr);
951}
952
953/*
954 * Setup the hardware configuration for a given attr_type
955 */
956static int __hw_perf_event_init(struct perf_event *event)
957{ 435{
958 struct perf_event_attr *attr = &event->attr; 436 struct perf_event_attr *attr = &event->attr;
959 struct hw_perf_event *hwc = &event->hw; 437 struct hw_perf_event *hwc = &event->hw;
960 u64 config; 438 u64 config;
961 int err;
962
963 if (!x86_pmu_initialized())
964 return -ENODEV;
965
966 err = 0;
967 if (!atomic_inc_not_zero(&active_events)) {
968 mutex_lock(&pmc_reserve_mutex);
969 if (atomic_read(&active_events) == 0) {
970 if (!reserve_pmc_hardware())
971 err = -EBUSY;
972 else
973 err = reserve_bts_hardware();
974 }
975 if (!err)
976 atomic_inc(&active_events);
977 mutex_unlock(&pmc_reserve_mutex);
978 }
979 if (err)
980 return err;
981
982 event->destroy = hw_perf_event_destroy;
983
984 /*
985 * Generate PMC IRQs:
986 * (keep 'enabled' bit clear for now)
987 */
988 hwc->config = ARCH_PERFMON_EVENTSEL_INT;
989
990 hwc->idx = -1;
991
992 /*
993 * Count user and OS events unless requested not to.
994 */
995 if (!attr->exclude_user)
996 hwc->config |= ARCH_PERFMON_EVENTSEL_USR;
997 if (!attr->exclude_kernel)
998 hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
999 439
1000 if (!hwc->sample_period) { 440 if (!hwc->sample_period) {
1001 hwc->sample_period = x86_pmu.max_period; 441 hwc->sample_period = x86_pmu.max_period;
1002 hwc->last_period = hwc->sample_period; 442 hwc->last_period = hwc->sample_period;
1003 atomic64_set(&hwc->period_left, hwc->sample_period); 443 local64_set(&hwc->period_left, hwc->sample_period);
1004 } else { 444 } else {
1005 /* 445 /*
1006 * If we have a PMU initialized but no APIC 446 * If we have a PMU initialized but no APIC
@@ -1012,13 +452,8 @@ static int __hw_perf_event_init(struct perf_event *event)
1012 return -EOPNOTSUPP; 452 return -EOPNOTSUPP;
1013 } 453 }
1014 454
1015 /* 455 if (attr->type == PERF_TYPE_RAW)
1016 * Raw hw_event type provide the config in the hw_event structure
1017 */
1018 if (attr->type == PERF_TYPE_RAW) {
1019 hwc->config |= x86_pmu.raw_event(attr->config);
1020 return 0; 456 return 0;
1021 }
1022 457
1023 if (attr->type == PERF_TYPE_HW_CACHE) 458 if (attr->type == PERF_TYPE_HW_CACHE)
1024 return set_ext_hw_attr(hwc, attr); 459 return set_ext_hw_attr(hwc, attr);
@@ -1043,11 +478,11 @@ static int __hw_perf_event_init(struct perf_event *event)
1043 if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) && 478 if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) &&
1044 (hwc->sample_period == 1)) { 479 (hwc->sample_period == 1)) {
1045 /* BTS is not supported by this architecture. */ 480 /* BTS is not supported by this architecture. */
1046 if (!bts_available()) 481 if (!x86_pmu.bts)
1047 return -EOPNOTSUPP; 482 return -EOPNOTSUPP;
1048 483
1049 /* BTS is currently only allowed for user-mode. */ 484 /* BTS is currently only allowed for user-mode. */
1050 if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) 485 if (!attr->exclude_kernel)
1051 return -EOPNOTSUPP; 486 return -EOPNOTSUPP;
1052 } 487 }
1053 488
@@ -1056,127 +491,122 @@ static int __hw_perf_event_init(struct perf_event *event)
1056 return 0; 491 return 0;
1057} 492}
1058 493
1059static void p6_pmu_disable_all(void) 494static int x86_pmu_hw_config(struct perf_event *event)
1060{ 495{
1061 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 496 if (event->attr.precise_ip) {
1062 u64 val; 497 int precise = 0;
1063 498
1064 if (!cpuc->enabled) 499 /* Support for constant skid */
1065 return; 500 if (x86_pmu.pebs)
501 precise++;
1066 502
1067 cpuc->enabled = 0; 503 /* Support for IP fixup */
1068 barrier(); 504 if (x86_pmu.lbr_nr)
505 precise++;
506
507 if (event->attr.precise_ip > precise)
508 return -EOPNOTSUPP;
509 }
510
511 /*
512 * Generate PMC IRQs:
513 * (keep 'enabled' bit clear for now)
514 */
515 event->hw.config = ARCH_PERFMON_EVENTSEL_INT;
516
517 /*
518 * Count user and OS events unless requested not to
519 */
520 if (!event->attr.exclude_user)
521 event->hw.config |= ARCH_PERFMON_EVENTSEL_USR;
522 if (!event->attr.exclude_kernel)
523 event->hw.config |= ARCH_PERFMON_EVENTSEL_OS;
524
525 if (event->attr.type == PERF_TYPE_RAW)
526 event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;
1069 527
1070 /* p6 only has one enable register */ 528 return x86_setup_perfctr(event);
1071 rdmsrl(MSR_P6_EVNTSEL0, val);
1072 val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
1073 wrmsrl(MSR_P6_EVNTSEL0, val);
1074} 529}
1075 530
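
x86_pmu_hw_config() above caps attr.precise_ip at the number of precision levels the hardware can back up: PEBS alone gives constant-skid samples, PEBS plus LBR allows IP fixup as well. From user space the knob is the precise_ip field of perf_event_attr; a minimal, hedged example of opening such an event with perf_event_open(2) (the event choice and period are illustrative, and no sampling buffer is set up):

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <string.h>
#include <unistd.h>
#include <stdio.h>

int main(void)
{
	struct perf_event_attr attr;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;
	attr.sample_period = 100000;
	attr.precise_ip = 1;	/* constant skid; 2 would also need IP fixup */
	attr.exclude_kernel = 1;

	fd = syscall(__NR_perf_event_open, &attr, 0 /* self */, -1 /* any cpu */,
		     -1 /* no group */, 0);
	if (fd < 0) {
		perror("perf_event_open (precise_ip may exceed hw support)");
		return 1;
	}
	close(fd);
	return 0;
}

If precise_ip asks for more precision than the PMU offers, the open fails with EOPNOTSUPP, matching the check in the hunk above.
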
1076static void intel_pmu_disable_all(void) 531/*
532 * Setup the hardware configuration for a given attr_type
533 */
534static int __hw_perf_event_init(struct perf_event *event)
1077{ 535{
1078 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 536 int err;
1079 537
1080 if (!cpuc->enabled) 538 if (!x86_pmu_initialized())
1081 return; 539 return -ENODEV;
1082 540
1083 cpuc->enabled = 0; 541 err = 0;
1084 barrier(); 542 if (!atomic_inc_not_zero(&active_events)) {
543 mutex_lock(&pmc_reserve_mutex);
544 if (atomic_read(&active_events) == 0) {
545 if (!reserve_pmc_hardware())
546 err = -EBUSY;
547 else {
548 err = reserve_ds_buffers();
549 if (err)
550 release_pmc_hardware();
551 }
552 }
553 if (!err)
554 atomic_inc(&active_events);
555 mutex_unlock(&pmc_reserve_mutex);
556 }
557 if (err)
558 return err;
1085 559
1086 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); 560 event->destroy = hw_perf_event_destroy;
1087 561
1088 if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) 562 event->hw.idx = -1;
1089 intel_pmu_disable_bts(); 563 event->hw.last_cpu = -1;
564 event->hw.last_tag = ~0ULL;
565
566 return x86_pmu.hw_config(event);
1090} 567}
1091 568
1092static void amd_pmu_disable_all(void) 569static void x86_pmu_disable_all(void)
1093{ 570{
1094 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 571 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1095 int idx; 572 int idx;
1096 573
1097 if (!cpuc->enabled) 574 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1098 return;
1099
1100 cpuc->enabled = 0;
1101 /*
1102 * ensure we write the disable before we start disabling the
1103 * events proper, so that amd_pmu_enable_event() does the
1104 * right thing.
1105 */
1106 barrier();
1107
1108 for (idx = 0; idx < x86_pmu.num_events; idx++) {
1109 u64 val; 575 u64 val;
1110 576
1111 if (!test_bit(idx, cpuc->active_mask)) 577 if (!test_bit(idx, cpuc->active_mask))
1112 continue; 578 continue;
1113 rdmsrl(MSR_K7_EVNTSEL0 + idx, val); 579 rdmsrl(x86_pmu.eventsel + idx, val);
1114 if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE)) 580 if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE))
1115 continue; 581 continue;
1116 val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; 582 val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
1117 wrmsrl(MSR_K7_EVNTSEL0 + idx, val); 583 wrmsrl(x86_pmu.eventsel + idx, val);
1118 } 584 }
1119} 585}
1120 586
1121void hw_perf_disable(void) 587void hw_perf_disable(void)
1122{ 588{
1123 if (!x86_pmu_initialized())
1124 return;
1125 return x86_pmu.disable_all();
1126}
1127
1128static void p6_pmu_enable_all(void)
1129{
1130 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 589 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1131 unsigned long val;
1132 590
1133 if (cpuc->enabled) 591 if (!x86_pmu_initialized())
1134 return; 592 return;
1135 593
1136 cpuc->enabled = 1; 594 if (!cpuc->enabled)
1137 barrier();
1138
1139 /* p6 only has one enable register */
1140 rdmsrl(MSR_P6_EVNTSEL0, val);
1141 val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
1142 wrmsrl(MSR_P6_EVNTSEL0, val);
1143}
1144
1145static void intel_pmu_enable_all(void)
1146{
1147 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1148
1149 if (cpuc->enabled)
1150 return; 595 return;
1151 596
1152 cpuc->enabled = 1; 597 cpuc->n_added = 0;
598 cpuc->enabled = 0;
1153 barrier(); 599 barrier();
1154 600
1155 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); 601 x86_pmu.disable_all();
1156
1157 if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
1158 struct perf_event *event =
1159 cpuc->events[X86_PMC_IDX_FIXED_BTS];
1160
1161 if (WARN_ON_ONCE(!event))
1162 return;
1163
1164 intel_pmu_enable_bts(event->hw.config);
1165 }
1166} 602}
1167 603
1168static void amd_pmu_enable_all(void) 604static void x86_pmu_enable_all(int added)
1169{ 605{
1170 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 606 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1171 int idx; 607 int idx;
1172 608
1173 if (cpuc->enabled) 609 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1174 return;
1175
1176 cpuc->enabled = 1;
1177 barrier();
1178
1179 for (idx = 0; idx < x86_pmu.num_events; idx++) {
1180 struct perf_event *event = cpuc->events[idx]; 610 struct perf_event *event = cpuc->events[idx];
1181 u64 val; 611 u64 val;
1182 612
@@ -1184,88 +614,267 @@ static void amd_pmu_enable_all(void)
1184 continue; 614 continue;
1185 615
1186 val = event->hw.config; 616 val = event->hw.config;
1187 val |= ARCH_PERFMON_EVENTSEL0_ENABLE; 617 val |= ARCH_PERFMON_EVENTSEL_ENABLE;
1188 wrmsrl(MSR_K7_EVNTSEL0 + idx, val); 618 wrmsrl(x86_pmu.eventsel + idx, val);
1189 } 619 }
1190} 620}
1191 621
1192void hw_perf_enable(void) 622static const struct pmu pmu;
623
624static inline int is_x86_event(struct perf_event *event)
1193{ 625{
1194 if (!x86_pmu_initialized()) 626 return event->pmu == &pmu;
1195 return;
1196 x86_pmu.enable_all();
1197} 627}
1198 628
1199static inline u64 intel_pmu_get_status(void) 629static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
1200{ 630{
1201 u64 status; 631 struct event_constraint *c, *constraints[X86_PMC_IDX_MAX];
632 unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
633 int i, j, w, wmax, num = 0;
634 struct hw_perf_event *hwc;
1202 635
1203 rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); 636 bitmap_zero(used_mask, X86_PMC_IDX_MAX);
1204 637
1205 return status; 638 for (i = 0; i < n; i++) {
1206} 639 c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]);
640 constraints[i] = c;
641 }
1207 642
1208static inline void intel_pmu_ack_status(u64 ack) 643 /*
1209{ 644 * fastpath, try to reuse previous register
1210 wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); 645 */
1211} 646 for (i = 0; i < n; i++) {
647 hwc = &cpuc->event_list[i]->hw;
648 c = constraints[i];
1212 649
1213static inline void x86_pmu_enable_event(struct hw_perf_event *hwc, int idx) 650 /* never assigned */
1214{ 651 if (hwc->idx == -1)
1215 (void)checking_wrmsrl(hwc->config_base + idx, 652 break;
1216 hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE);
1217}
1218 653
1219static inline void x86_pmu_disable_event(struct hw_perf_event *hwc, int idx) 654 /* constraint still honored */
1220{ 655 if (!test_bit(hwc->idx, c->idxmsk))
1221 (void)checking_wrmsrl(hwc->config_base + idx, hwc->config); 656 break;
657
658 /* not already used */
659 if (test_bit(hwc->idx, used_mask))
660 break;
661
662 __set_bit(hwc->idx, used_mask);
663 if (assign)
664 assign[i] = hwc->idx;
665 }
666 if (i == n)
667 goto done;
668
669 /*
670 * begin slow path
671 */
672
673 bitmap_zero(used_mask, X86_PMC_IDX_MAX);
674
675 /*
676 * weight = number of possible counters
677 *
678 * 1 = most constrained, only works on one counter
679 * wmax = least constrained, works on any counter
680 *
681 * assign events to counters starting with most
682 * constrained events.
683 */
684 wmax = x86_pmu.num_counters;
685
686 /*
687 * when fixed event counters are present,
688 * wmax is incremented by 1 to account
689 * for one more choice
690 */
691 if (x86_pmu.num_counters_fixed)
692 wmax++;
693
694 for (w = 1, num = n; num && w <= wmax; w++) {
695 /* for each event */
696 for (i = 0; num && i < n; i++) {
697 c = constraints[i];
698 hwc = &cpuc->event_list[i]->hw;
699
700 if (c->weight != w)
701 continue;
702
703 for_each_set_bit(j, c->idxmsk, X86_PMC_IDX_MAX) {
704 if (!test_bit(j, used_mask))
705 break;
706 }
707
708 if (j == X86_PMC_IDX_MAX)
709 break;
710
711 __set_bit(j, used_mask);
712
713 if (assign)
714 assign[i] = j;
715 num--;
716 }
717 }
718done:
719 /*
720 * scheduling failed or is just a simulation,
721 * free resources if necessary
722 */
723 if (!assign || num) {
724 for (i = 0; i < n; i++) {
725 if (x86_pmu.put_event_constraints)
726 x86_pmu.put_event_constraints(cpuc, cpuc->event_list[i]);
727 }
728 }
729 return num ? -ENOSPC : 0;
1222} 730}
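
The slow path of x86_schedule_events() above is a greedy pass over the events ordered by weight: events that fit on fewer counters get placed first, so flexible events cannot squat on the only counter a constrained event could use. A toy standalone version of that inner loop (it omits the fastpath reuse of previous assignments and the extra wmax slot for fixed counters; masks and counts are made up):

#include <stdio.h>

#define NCOUNTERS	4
#define NEVENTS		3

static int popcount(unsigned m)
{
	int w = 0;

	for (; m; m &= m - 1)
		w++;
	return w;
}

int main(void)
{
	unsigned idxmsk[NEVENTS] = { 0x3, 0x1, 0xf };	/* allowed counters */
	int assign[NEVENTS];
	unsigned used = 0;
	int w, i, j, left = NEVENTS;

	/* place events in order of increasing weight (most constrained first) */
	for (w = 1; left && w <= NCOUNTERS; w++) {
		for (i = 0; left && i < NEVENTS; i++) {
			if (popcount(idxmsk[i]) != w)
				continue;
			for (j = 0; j < NCOUNTERS; j++)
				if ((idxmsk[i] & (1u << j)) && !(used & (1u << j)))
					break;
			if (j == NCOUNTERS)
				return 1;	/* scheduling failed */
			used |= 1u << j;
			assign[i] = j;
			left--;
		}
	}
	if (left)
		return 1;

	for (i = 0; i < NEVENTS; i++)
		printf("event %d -> counter %d\n", i, assign[i]);
	return 0;
}

With these masks the single-counter event lands on counter 0 first, the two-counter event takes counter 1, and the unconstrained event falls back to counter 2.
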
1223 731
1224static inline void 732/*
1225intel_pmu_disable_fixed(struct hw_perf_event *hwc, int __idx) 733 * dogrp: true if must collect siblings events (group)
734 * returns total number of events and error code
735 */
736static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp)
1226{ 737{
1227 int idx = __idx - X86_PMC_IDX_FIXED; 738 struct perf_event *event;
1228 u64 ctrl_val, mask; 739 int n, max_count;
1229 740
1230 mask = 0xfULL << (idx * 4); 741 max_count = x86_pmu.num_counters + x86_pmu.num_counters_fixed;
1231 742
1232 rdmsrl(hwc->config_base, ctrl_val); 743 /* current number of events already accepted */
1233 ctrl_val &= ~mask; 744 n = cpuc->n_events;
1234 (void)checking_wrmsrl(hwc->config_base, ctrl_val); 745
746 if (is_x86_event(leader)) {
747 if (n >= max_count)
748 return -ENOSPC;
749 cpuc->event_list[n] = leader;
750 n++;
751 }
752 if (!dogrp)
753 return n;
754
755 list_for_each_entry(event, &leader->sibling_list, group_entry) {
756 if (!is_x86_event(event) ||
757 event->state <= PERF_EVENT_STATE_OFF)
758 continue;
759
760 if (n >= max_count)
761 return -ENOSPC;
762
763 cpuc->event_list[n] = event;
764 n++;
765 }
766 return n;
1235} 767}
1236 768
1237static inline void 769static inline void x86_assign_hw_event(struct perf_event *event,
1238p6_pmu_disable_event(struct hw_perf_event *hwc, int idx) 770 struct cpu_hw_events *cpuc, int i)
1239{ 771{
1240 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 772 struct hw_perf_event *hwc = &event->hw;
1241 u64 val = P6_NOP_EVENT;
1242 773
1243 if (cpuc->enabled) 774 hwc->idx = cpuc->assign[i];
1244 val |= ARCH_PERFMON_EVENTSEL0_ENABLE; 775 hwc->last_cpu = smp_processor_id();
776 hwc->last_tag = ++cpuc->tags[i];
1245 777
1246 (void)checking_wrmsrl(hwc->config_base + idx, val); 778 if (hwc->idx == X86_PMC_IDX_FIXED_BTS) {
779 hwc->config_base = 0;
780 hwc->event_base = 0;
781 } else if (hwc->idx >= X86_PMC_IDX_FIXED) {
782 hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
783 /*
784 * We set it so that event_base + idx in wrmsr/rdmsr maps to
785 * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
786 */
787 hwc->event_base =
788 MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
789 } else {
790 hwc->config_base = x86_pmu.eventsel;
791 hwc->event_base = x86_pmu.perfctr;
792 }
1247} 793}
1248 794
1249static inline void 795static inline int match_prev_assignment(struct hw_perf_event *hwc,
1250intel_pmu_disable_event(struct hw_perf_event *hwc, int idx) 796 struct cpu_hw_events *cpuc,
797 int i)
1251{ 798{
1252 if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { 799 return hwc->idx == cpuc->assign[i] &&
1253 intel_pmu_disable_bts(); 800 hwc->last_cpu == smp_processor_id() &&
801 hwc->last_tag == cpuc->tags[i];
802}
803
804static int x86_pmu_start(struct perf_event *event);
805static void x86_pmu_stop(struct perf_event *event);
806
807void hw_perf_enable(void)
808{
809 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
810 struct perf_event *event;
811 struct hw_perf_event *hwc;
812 int i, added = cpuc->n_added;
813
814 if (!x86_pmu_initialized())
1254 return; 815 return;
1255 }
1256 816
1257 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { 817 if (cpuc->enabled)
1258 intel_pmu_disable_fixed(hwc, idx);
1259 return; 818 return;
819
820 if (cpuc->n_added) {
821 int n_running = cpuc->n_events - cpuc->n_added;
822 /*
823 * apply assignment obtained either from
824 * hw_perf_group_sched_in() or x86_pmu_enable()
825 *
826 * step1: save events moving to new counters
827 * step2: reprogram moved events into new counters
828 */
829 for (i = 0; i < n_running; i++) {
830 event = cpuc->event_list[i];
831 hwc = &event->hw;
832
833 /*
834 * we can avoid reprogramming counter if:
835 * - assigned same counter as last time
836 * - running on same CPU as last time
837 * - no other event has used the counter since
838 */
839 if (hwc->idx == -1 ||
840 match_prev_assignment(hwc, cpuc, i))
841 continue;
842
843 x86_pmu_stop(event);
844 }
845
846 for (i = 0; i < cpuc->n_events; i++) {
847 event = cpuc->event_list[i];
848 hwc = &event->hw;
849
850 if (!match_prev_assignment(hwc, cpuc, i))
851 x86_assign_hw_event(event, cpuc, i);
852 else if (i < n_running)
853 continue;
854
855 x86_pmu_start(event);
856 }
857 cpuc->n_added = 0;
858 perf_events_lapic_init();
1260 } 859 }
1261 860
1262 x86_pmu_disable_event(hwc, idx); 861 cpuc->enabled = 1;
862 barrier();
863
864 x86_pmu.enable_all(added);
1263} 865}
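
The two passes in hw_perf_enable() above matter because rescheduling can rotate events between counters: if an event were restarted on its new counter before the event currently occupying it had been stopped, it would clobber live state. A toy illustration of the stop-movers-then-start ordering (the arrays and the {0,1,2} -> {1,2,0} rotation are made up):

#include <stdio.h>

#define NEV 3

int main(void)
{
	int old_assign[NEV] = { 0, 1, 2 };	/* counters currently in use */
	int new_assign[NEV] = { 1, 2, 0 };	/* counters after rescheduling */
	int running[NEV]    = { 1, 1, 1 };
	int i;

	/* pass 1: stop every event whose counter changed */
	for (i = 0; i < NEV; i++) {
		if (old_assign[i] != new_assign[i]) {
			running[i] = 0;
			printf("stop event %d (counter %d)\n", i, old_assign[i]);
		}
	}

	/* pass 2: only now is it safe to claim the new counters */
	for (i = 0; i < NEV; i++) {
		if (!running[i]) {
			running[i] = 1;
			printf("start event %d on counter %d\n", i, new_assign[i]);
		}
	}
	return 0;
}

Collapsing this into a single stop-and-restart pass would, in the rotation above, restart event 0 on counter 1 while event 1 is still running there.
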
1264 866
1265static inline void 867static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
1266amd_pmu_disable_event(struct hw_perf_event *hwc, int idx) 868 u64 enable_mask)
1267{ 869{
1268 x86_pmu_disable_event(hwc, idx); 870 wrmsrl(hwc->config_base + hwc->idx, hwc->config | enable_mask);
871}
872
873static inline void x86_pmu_disable_event(struct perf_event *event)
874{
875 struct hw_perf_event *hwc = &event->hw;
876
877 wrmsrl(hwc->config_base + hwc->idx, hwc->config);
1269} 878}
1270 879
1271static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); 880static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
@@ -1275,12 +884,12 @@ static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
1275 * To be called with the event disabled in hw: 884 * To be called with the event disabled in hw:
1276 */ 885 */
1277static int 886static int
1278x86_perf_event_set_period(struct perf_event *event, 887x86_perf_event_set_period(struct perf_event *event)
1279 struct hw_perf_event *hwc, int idx)
1280{ 888{
1281 s64 left = atomic64_read(&hwc->period_left); 889 struct hw_perf_event *hwc = &event->hw;
890 s64 left = local64_read(&hwc->period_left);
1282 s64 period = hwc->sample_period; 891 s64 period = hwc->sample_period;
1283 int err, ret = 0; 892 int ret = 0, idx = hwc->idx;
1284 893
1285 if (idx == X86_PMC_IDX_FIXED_BTS) 894 if (idx == X86_PMC_IDX_FIXED_BTS)
1286 return 0; 895 return 0;
@@ -1290,14 +899,14 @@ x86_perf_event_set_period(struct perf_event *event,
1290 */ 899 */
1291 if (unlikely(left <= -period)) { 900 if (unlikely(left <= -period)) {
1292 left = period; 901 left = period;
1293 atomic64_set(&hwc->period_left, left); 902 local64_set(&hwc->period_left, left);
1294 hwc->last_period = period; 903 hwc->last_period = period;
1295 ret = 1; 904 ret = 1;
1296 } 905 }
1297 906
1298 if (unlikely(left <= 0)) { 907 if (unlikely(left <= 0)) {
1299 left += period; 908 left += period;
1300 atomic64_set(&hwc->period_left, left); 909 local64_set(&hwc->period_left, left);
1301 hwc->last_period = period; 910 hwc->last_period = period;
1302 ret = 1; 911 ret = 1;
1303 } 912 }
@@ -1316,229 +925,94 @@ x86_perf_event_set_period(struct perf_event *event,
1316 * The hw event starts counting from this event offset, 925 * The hw event starts counting from this event offset,
1317 * mark it to be able to extra future deltas: 926 * mark it to be able to extra future deltas:
1318 */ 927 */
1319 atomic64_set(&hwc->prev_count, (u64)-left); 928 local64_set(&hwc->prev_count, (u64)-left);
1320
1321 err = checking_wrmsrl(hwc->event_base + idx,
1322 (u64)(-left) & x86_pmu.event_mask);
1323 929
1324 perf_event_update_userpage(event); 930 wrmsrl(hwc->event_base + idx, (u64)(-left) & x86_pmu.cntval_mask);
1325
1326 return ret;
1327}
1328
1329static inline void
1330intel_pmu_enable_fixed(struct hw_perf_event *hwc, int __idx)
1331{
1332 int idx = __idx - X86_PMC_IDX_FIXED;
1333 u64 ctrl_val, bits, mask;
1334 int err;
1335 931
1336 /* 932 /*
1337 * Enable IRQ generation (0x8), 933 * Due to an erratum on certain CPUs we need
1338 * and enable ring-3 counting (0x2) and ring-0 counting (0x1) 934 * a second write to be sure the register
1339 * if requested: 935 * is updated properly
1340 */ 936 */
1341 bits = 0x8ULL; 937 if (x86_pmu.perfctr_second_write) {
1342 if (hwc->config & ARCH_PERFMON_EVENTSEL_USR) 938 wrmsrl(hwc->event_base + idx,
1343 bits |= 0x2; 939 (u64)(-left) & x86_pmu.cntval_mask);
1344 if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
1345 bits |= 0x1;
1346
1347 /*
1348 * ANY bit is supported in v3 and up
1349 */
1350 if (x86_pmu.version > 2 && hwc->config & ARCH_PERFMON_EVENTSEL_ANY)
1351 bits |= 0x4;
1352
1353 bits <<= (idx * 4);
1354 mask = 0xfULL << (idx * 4);
1355
1356 rdmsrl(hwc->config_base, ctrl_val);
1357 ctrl_val &= ~mask;
1358 ctrl_val |= bits;
1359 err = checking_wrmsrl(hwc->config_base, ctrl_val);
1360}
1361
1362static void p6_pmu_enable_event(struct hw_perf_event *hwc, int idx)
1363{
1364 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1365 u64 val;
1366
1367 val = hwc->config;
1368 if (cpuc->enabled)
1369 val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
1370
1371 (void)checking_wrmsrl(hwc->config_base + idx, val);
1372}
1373
1374
1375static void intel_pmu_enable_event(struct hw_perf_event *hwc, int idx)
1376{
1377 if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) {
1378 if (!__get_cpu_var(cpu_hw_events).enabled)
1379 return;
1380
1381 intel_pmu_enable_bts(hwc->config);
1382 return;
1383 } 940 }
1384 941
1385 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { 942 perf_event_update_userpage(event);
1386 intel_pmu_enable_fixed(hwc, idx);
1387 return;
1388 }
1389 943
1390 x86_pmu_enable_event(hwc, idx); 944 return ret;
1391} 945}
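
The -left programming in x86_perf_event_set_period() above is easiest to see with concrete numbers: the counter is loaded with the two's complement of the remaining period, truncated to the counter width, so it wraps and raises a PMI after exactly 'left' increments. A minimal user-space sketch of that arithmetic, assuming a 48-bit counter and a made-up 100000-event period:

/*
 * Illustrative sketch, not part of the patch: why writing -left into the
 * counter makes it overflow after exactly 'left' events.  The 48-bit width
 * and the period value are assumptions chosen for the example.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const int      cntval_bits = 48;
	const uint64_t cntval_mask = (1ULL << cntval_bits) - 1;
	int64_t        left        = 100000;	/* remaining sample period */

	/* same expression as wrmsrl(..., (u64)(-left) & x86_pmu.cntval_mask) */
	uint64_t programmed = (uint64_t)(-left) & cntval_mask;

	/* the counter counts up and wraps (raising a PMI) after this many events */
	uint64_t events_to_overflow = cntval_mask + 1 - programmed;

	printf("programmed value:      %#llx\n", (unsigned long long)programmed);
	printf("events until overflow: %llu\n", (unsigned long long)events_to_overflow);
	return 0;
}
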
1392 946
1393static void amd_pmu_enable_event(struct hw_perf_event *hwc, int idx) 947static void x86_pmu_enable_event(struct perf_event *event)
1394{ 948{
1395 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 949 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1396
1397 if (cpuc->enabled) 950 if (cpuc->enabled)
1398 x86_pmu_enable_event(hwc, idx); 951 __x86_pmu_enable_event(&event->hw,
1399} 952 ARCH_PERFMON_EVENTSEL_ENABLE);
1400
1401static int fixed_mode_idx(struct hw_perf_event *hwc)
1402{
1403 unsigned int hw_event;
1404
1405 hw_event = hwc->config & ARCH_PERFMON_EVENT_MASK;
1406
1407 if (unlikely((hw_event ==
1408 x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) &&
1409 (hwc->sample_period == 1)))
1410 return X86_PMC_IDX_FIXED_BTS;
1411
1412 if (!x86_pmu.num_events_fixed)
1413 return -1;
1414
1415 /*
1416 * fixed counters do not take all possible filters
1417 */
1418 if (hwc->config & ARCH_PERFMON_EVENT_FILTER_MASK)
1419 return -1;
1420
1421 if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS)))
1422 return X86_PMC_IDX_FIXED_INSTRUCTIONS;
1423 if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES)))
1424 return X86_PMC_IDX_FIXED_CPU_CYCLES;
1425 if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES)))
1426 return X86_PMC_IDX_FIXED_BUS_CYCLES;
1427
1428 return -1;
1429} 953}
1430 954
1431/* 955/*
1432 * generic counter allocator: get next free counter 956 * activate a single event
1433 */ 957 *
1434static int 958 * The event is added to the group of enabled events
1435gen_get_event_idx(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc) 959 * but only if it can be scheduled with existing events.
1436{ 960 *
1437 int idx; 961 * Called with PMU disabled. If successful and return value 1,
1438 962 * then guaranteed to call perf_enable() and hw_perf_enable()
1439 idx = find_first_zero_bit(cpuc->used_mask, x86_pmu.num_events);
1440 return idx == x86_pmu.num_events ? -1 : idx;
1441}
1442
1443/*
1444 * intel-specific counter allocator: check event constraints
1445 */ 963 */
1446static int 964static int x86_pmu_enable(struct perf_event *event)
1447intel_get_event_idx(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc)
1448{ 965{
1449 const struct event_constraint *event_constraint; 966 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1450 int i, code; 967 struct hw_perf_event *hwc;
1451 968 int assign[X86_PMC_IDX_MAX];
1452 if (!event_constraints) 969 int n, n0, ret;
1453 goto skip;
1454 970
1455 code = hwc->config & CORE_EVNTSEL_EVENT_MASK; 971 hwc = &event->hw;
1456 972
1457 for_each_event_constraint(event_constraint, event_constraints) { 973 n0 = cpuc->n_events;
1458 if (code == event_constraint->code) { 974 n = collect_events(cpuc, event, false);
1459 for_each_bit(i, event_constraint->idxmsk, X86_PMC_IDX_MAX) { 975 if (n < 0)
1460 if (!test_and_set_bit(i, cpuc->used_mask)) 976 return n;
1461 return i;
1462 }
1463 return -1;
1464 }
1465 }
1466skip:
1467 return gen_get_event_idx(cpuc, hwc);
1468}
1469 977
1470static int 978 /*
1471x86_schedule_event(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc) 979 * If group events scheduling transaction was started,
1472{ 980 * skip the schedulability test here, it will be performed
1473 int idx; 981 * at commit time (->commit_txn) as a whole
982 */
983 if (cpuc->group_flag & PERF_EVENT_TXN)
984 goto out;
1474 985
1475 idx = fixed_mode_idx(hwc); 986 ret = x86_pmu.schedule_events(cpuc, n, assign);
1476 if (idx == X86_PMC_IDX_FIXED_BTS) { 987 if (ret)
1477 /* BTS is already occupied. */ 988 return ret;
1478 if (test_and_set_bit(idx, cpuc->used_mask)) 989 /*
1479 return -EAGAIN; 990 * copy the new assignment; now we know it is possible it
991 * will be used by hw_perf_enable()
992 */
993 memcpy(cpuc->assign, assign, n*sizeof(int));
1480 994
1481 hwc->config_base = 0; 995out:
1482 hwc->event_base = 0; 996 cpuc->n_events = n;
1483 hwc->idx = idx; 997 cpuc->n_added += n - n0;
1484 } else if (idx >= 0) { 998 cpuc->n_txn += n - n0;
1485 /*
1486 * Try to get the fixed event, if that is already taken
1487 * then try to get a generic event:
1488 */
1489 if (test_and_set_bit(idx, cpuc->used_mask))
1490 goto try_generic;
1491 999
1492 hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; 1000 return 0;
1493 /*
1494 * We set it so that event_base + idx in wrmsr/rdmsr maps to
1495 * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
1496 */
1497 hwc->event_base =
1498 MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
1499 hwc->idx = idx;
1500 } else {
1501 idx = hwc->idx;
1502 /* Try to get the previous generic event again */
1503 if (idx == -1 || test_and_set_bit(idx, cpuc->used_mask)) {
1504try_generic:
1505 idx = x86_pmu.get_event_idx(cpuc, hwc);
1506 if (idx == -1)
1507 return -EAGAIN;
1508
1509 set_bit(idx, cpuc->used_mask);
1510 hwc->idx = idx;
1511 }
1512 hwc->config_base = x86_pmu.eventsel;
1513 hwc->event_base = x86_pmu.perfctr;
1514 }
1515
1516 return idx;
1517} 1001}
1518 1002
1519/* 1003static int x86_pmu_start(struct perf_event *event)
1520 * Find a PMC slot for the freshly enabled / scheduled in event:
1521 */
1522static int x86_pmu_enable(struct perf_event *event)
1523{ 1004{
1524 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 1005 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1525 struct hw_perf_event *hwc = &event->hw; 1006 int idx = event->hw.idx;
1526 int idx;
1527 1007
1528 idx = x86_schedule_event(cpuc, hwc); 1008 if (idx == -1)
1529 if (idx < 0) 1009 return -EAGAIN;
1530 return idx;
1531
1532 perf_events_lapic_init();
1533
1534 x86_pmu.disable(hwc, idx);
1535 1010
1011 x86_perf_event_set_period(event);
1536 cpuc->events[idx] = event; 1012 cpuc->events[idx] = event;
1537 set_bit(idx, cpuc->active_mask); 1013 __set_bit(idx, cpuc->active_mask);
1538 1014 __set_bit(idx, cpuc->running);
1539 x86_perf_event_set_period(event, hwc, idx); 1015 x86_pmu.enable(event);
1540 x86_pmu.enable(hwc, idx);
1541
1542 perf_event_update_userpage(event); 1016 perf_event_update_userpage(event);
1543 1017
1544 return 0; 1018 return 0;
@@ -1546,24 +1020,19 @@ static int x86_pmu_enable(struct perf_event *event)
1546 1020
1547static void x86_pmu_unthrottle(struct perf_event *event) 1021static void x86_pmu_unthrottle(struct perf_event *event)
1548{ 1022{
1549 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 1023 int ret = x86_pmu_start(event);
1550 struct hw_perf_event *hwc = &event->hw; 1024 WARN_ON_ONCE(ret);
1551
1552 if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX ||
1553 cpuc->events[hwc->idx] != event))
1554 return;
1555
1556 x86_pmu.enable(hwc, hwc->idx);
1557} 1025}
1558 1026
1559void perf_event_print_debug(void) 1027void perf_event_print_debug(void)
1560{ 1028{
1561 u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; 1029 u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
1030 u64 pebs;
1562 struct cpu_hw_events *cpuc; 1031 struct cpu_hw_events *cpuc;
1563 unsigned long flags; 1032 unsigned long flags;
1564 int cpu, idx; 1033 int cpu, idx;
1565 1034
1566 if (!x86_pmu.num_events) 1035 if (!x86_pmu.num_counters)
1567 return; 1036 return;
1568 1037
1569 local_irq_save(flags); 1038 local_irq_save(flags);
@@ -1576,16 +1045,18 @@ void perf_event_print_debug(void)
1576 rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); 1045 rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
1577 rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); 1046 rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
1578 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed); 1047 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
1048 rdmsrl(MSR_IA32_PEBS_ENABLE, pebs);
1579 1049
1580 pr_info("\n"); 1050 pr_info("\n");
1581 pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl); 1051 pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl);
1582 pr_info("CPU#%d: status: %016llx\n", cpu, status); 1052 pr_info("CPU#%d: status: %016llx\n", cpu, status);
1583 pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow); 1053 pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow);
1584 pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed); 1054 pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed);
1055 pr_info("CPU#%d: pebs: %016llx\n", cpu, pebs);
1585 } 1056 }
1586 pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used_mask); 1057 pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask);
1587 1058
1588 for (idx = 0; idx < x86_pmu.num_events; idx++) { 1059 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1589 rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); 1060 rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
1590 rdmsrl(x86_pmu.perfctr + idx, pmc_count); 1061 rdmsrl(x86_pmu.perfctr + idx, pmc_count);
1591 1062
@@ -1598,7 +1069,7 @@ void perf_event_print_debug(void)
1598 pr_info("CPU#%d: gen-PMC%d left: %016llx\n", 1069 pr_info("CPU#%d: gen-PMC%d left: %016llx\n",
1599 cpu, idx, prev_left); 1070 cpu, idx, prev_left);
1600 } 1071 }
1601 for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) { 1072 for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
1602 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count); 1073 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
1603 1074
1604 pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", 1075 pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
@@ -1607,257 +1078,58 @@ void perf_event_print_debug(void)
1607 local_irq_restore(flags); 1078 local_irq_restore(flags);
1608} 1079}
1609 1080
1610static void intel_pmu_drain_bts_buffer(struct cpu_hw_events *cpuc) 1081static void x86_pmu_stop(struct perf_event *event)
1611{
1612 struct debug_store *ds = cpuc->ds;
1613 struct bts_record {
1614 u64 from;
1615 u64 to;
1616 u64 flags;
1617 };
1618 struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS];
1619 struct bts_record *at, *top;
1620 struct perf_output_handle handle;
1621 struct perf_event_header header;
1622 struct perf_sample_data data;
1623 struct pt_regs regs;
1624
1625 if (!event)
1626 return;
1627
1628 if (!ds)
1629 return;
1630
1631 at = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
1632 top = (struct bts_record *)(unsigned long)ds->bts_index;
1633
1634 if (top <= at)
1635 return;
1636
1637 ds->bts_index = ds->bts_buffer_base;
1638
1639
1640 data.period = event->hw.last_period;
1641 data.addr = 0;
1642 data.raw = NULL;
1643 regs.ip = 0;
1644
1645 /*
1646 * Prepare a generic sample, i.e. fill in the invariant fields.
1647 * We will overwrite the from and to address before we output
1648 * the sample.
1649 */
1650 perf_prepare_sample(&header, &data, event, &regs);
1651
1652 if (perf_output_begin(&handle, event,
1653 header.size * (top - at), 1, 1))
1654 return;
1655
1656 for (; at < top; at++) {
1657 data.ip = at->from;
1658 data.addr = at->to;
1659
1660 perf_output_sample(&handle, &header, &data, event);
1661 }
1662
1663 perf_output_end(&handle);
1664
1665 /* There's new data available. */
1666 event->hw.interrupts++;
1667 event->pending_kill = POLL_IN;
1668}
1669
1670static void x86_pmu_disable(struct perf_event *event)
1671{ 1082{
1672 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 1083 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1673 struct hw_perf_event *hwc = &event->hw; 1084 struct hw_perf_event *hwc = &event->hw;
1674 int idx = hwc->idx; 1085 int idx = hwc->idx;
1675 1086
1676 /* 1087 if (!__test_and_clear_bit(idx, cpuc->active_mask))
1677 * Must be done before we disable, otherwise the nmi handler 1088 return;
1678 * could reenable again:
1679 */
1680 clear_bit(idx, cpuc->active_mask);
1681 x86_pmu.disable(hwc, idx);
1682 1089
1683 /* 1090 x86_pmu.disable(event);
1684 * Make sure the cleared pointer becomes visible before we
1685 * (potentially) free the event:
1686 */
1687 barrier();
1688 1091
1689 /* 1092 /*
1690 * Drain the remaining delta count out of a event 1093 * Drain the remaining delta count out of a event
1691 * that we are disabling: 1094 * that we are disabling:
1692 */ 1095 */
1693 x86_perf_event_update(event, hwc, idx); 1096 x86_perf_event_update(event);
1694
1695 /* Drain the remaining BTS records. */
1696 if (unlikely(idx == X86_PMC_IDX_FIXED_BTS))
1697 intel_pmu_drain_bts_buffer(cpuc);
1698 1097
1699 cpuc->events[idx] = NULL; 1098 cpuc->events[idx] = NULL;
1700 clear_bit(idx, cpuc->used_mask);
1701
1702 perf_event_update_userpage(event);
1703}
1704
1705/*
1706 * Save and restart an expired event. Called by NMI contexts,
1707 * so it has to be careful about preempting normal event ops:
1708 */
1709static int intel_pmu_save_and_restart(struct perf_event *event)
1710{
1711 struct hw_perf_event *hwc = &event->hw;
1712 int idx = hwc->idx;
1713 int ret;
1714
1715 x86_perf_event_update(event, hwc, idx);
1716 ret = x86_perf_event_set_period(event, hwc, idx);
1717
1718 if (event->state == PERF_EVENT_STATE_ACTIVE)
1719 intel_pmu_enable_event(hwc, idx);
1720
1721 return ret;
1722} 1099}
1723 1100
1724static void intel_pmu_reset(void) 1101static void x86_pmu_disable(struct perf_event *event)
1725{ 1102{
1726 struct debug_store *ds = __get_cpu_var(cpu_hw_events).ds; 1103 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1727 unsigned long flags; 1104 int i;
1728 int idx;
1729 1105
1730 if (!x86_pmu.num_events) 1106 /*
1107 * If we're called during a txn, we don't need to do anything.
1108 * The events never got scheduled and ->cancel_txn will truncate
1109 * the event_list.
1110 */
1111 if (cpuc->group_flag & PERF_EVENT_TXN)
1731 return; 1112 return;
1732 1113
1733 local_irq_save(flags); 1114 x86_pmu_stop(event);
1734
1735 printk("clearing PMU state on CPU#%d\n", smp_processor_id());
1736 1115
1737 for (idx = 0; idx < x86_pmu.num_events; idx++) { 1116 for (i = 0; i < cpuc->n_events; i++) {
1738 checking_wrmsrl(x86_pmu.eventsel + idx, 0ull); 1117 if (event == cpuc->event_list[i]) {
1739 checking_wrmsrl(x86_pmu.perfctr + idx, 0ull);
1740 }
1741 for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) {
1742 checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
1743 }
1744 if (ds)
1745 ds->bts_index = ds->bts_buffer_base;
1746
1747 local_irq_restore(flags);
1748}
1749 1118
1750static int p6_pmu_handle_irq(struct pt_regs *regs) 1119 if (x86_pmu.put_event_constraints)
1751{ 1120 x86_pmu.put_event_constraints(cpuc, event);
1752 struct perf_sample_data data;
1753 struct cpu_hw_events *cpuc;
1754 struct perf_event *event;
1755 struct hw_perf_event *hwc;
1756 int idx, handled = 0;
1757 u64 val;
1758 1121
1759 data.addr = 0; 1122 while (++i < cpuc->n_events)
1760 data.raw = NULL; 1123 cpuc->event_list[i-1] = cpuc->event_list[i];
1761 1124
1762 cpuc = &__get_cpu_var(cpu_hw_events); 1125 --cpuc->n_events;
1763 1126 break;
1764 for (idx = 0; idx < x86_pmu.num_events; idx++) { 1127 }
1765 if (!test_bit(idx, cpuc->active_mask))
1766 continue;
1767
1768 event = cpuc->events[idx];
1769 hwc = &event->hw;
1770
1771 val = x86_perf_event_update(event, hwc, idx);
1772 if (val & (1ULL << (x86_pmu.event_bits - 1)))
1773 continue;
1774
1775 /*
1776 * event overflow
1777 */
1778 handled = 1;
1779 data.period = event->hw.last_period;
1780
1781 if (!x86_perf_event_set_period(event, hwc, idx))
1782 continue;
1783
1784 if (perf_event_overflow(event, 1, &data, regs))
1785 p6_pmu_disable_event(hwc, idx);
1786 }
1787
1788 if (handled)
1789 inc_irq_stat(apic_perf_irqs);
1790
1791 return handled;
1792}
1793
1794/*
1795 * This handler is triggered by the local APIC, so the APIC IRQ handling
1796 * rules apply:
1797 */
1798static int intel_pmu_handle_irq(struct pt_regs *regs)
1799{
1800 struct perf_sample_data data;
1801 struct cpu_hw_events *cpuc;
1802 int bit, loops;
1803 u64 ack, status;
1804
1805 data.addr = 0;
1806 data.raw = NULL;
1807
1808 cpuc = &__get_cpu_var(cpu_hw_events);
1809
1810 perf_disable();
1811 intel_pmu_drain_bts_buffer(cpuc);
1812 status = intel_pmu_get_status();
1813 if (!status) {
1814 perf_enable();
1815 return 0;
1816 }
1817
1818 loops = 0;
1819again:
1820 if (++loops > 100) {
1821 WARN_ONCE(1, "perfevents: irq loop stuck!\n");
1822 perf_event_print_debug();
1823 intel_pmu_reset();
1824 perf_enable();
1825 return 1;
1826 }
1827
1828 inc_irq_stat(apic_perf_irqs);
1829 ack = status;
1830 for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
1831 struct perf_event *event = cpuc->events[bit];
1832
1833 clear_bit(bit, (unsigned long *) &status);
1834 if (!test_bit(bit, cpuc->active_mask))
1835 continue;
1836
1837 if (!intel_pmu_save_and_restart(event))
1838 continue;
1839
1840 data.period = event->hw.last_period;
1841
1842 if (perf_event_overflow(event, 1, &data, regs))
1843 intel_pmu_disable_event(&event->hw, bit);
1844 } 1128 }
1845 1129 perf_event_update_userpage(event);
1846 intel_pmu_ack_status(ack);
1847
1848 /*
1849 * Repeat if there is more work to be done:
1850 */
1851 status = intel_pmu_get_status();
1852 if (status)
1853 goto again;
1854
1855 perf_enable();
1856
1857 return 1;
1858} 1130}
1859 1131
1860static int amd_pmu_handle_irq(struct pt_regs *regs) 1132static int x86_pmu_handle_irq(struct pt_regs *regs)
1861{ 1133{
1862 struct perf_sample_data data; 1134 struct perf_sample_data data;
1863 struct cpu_hw_events *cpuc; 1135 struct cpu_hw_events *cpuc;
@@ -1866,33 +1138,40 @@ static int amd_pmu_handle_irq(struct pt_regs *regs)
1866 int idx, handled = 0; 1138 int idx, handled = 0;
1867 u64 val; 1139 u64 val;
1868 1140
1869 data.addr = 0; 1141 perf_sample_data_init(&data, 0);
1870 data.raw = NULL;
1871 1142
1872 cpuc = &__get_cpu_var(cpu_hw_events); 1143 cpuc = &__get_cpu_var(cpu_hw_events);
1873 1144
1874 for (idx = 0; idx < x86_pmu.num_events; idx++) { 1145 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1875 if (!test_bit(idx, cpuc->active_mask)) 1146 if (!test_bit(idx, cpuc->active_mask)) {
1147 /*
1148 * Though we deactivated the counter, some CPUs
1149 * might still deliver spurious interrupts that were
1150 * already in flight. Catch them:
1151 */
1152 if (__test_and_clear_bit(idx, cpuc->running))
1153 handled++;
1876 continue; 1154 continue;
1155 }
1877 1156
1878 event = cpuc->events[idx]; 1157 event = cpuc->events[idx];
1879 hwc = &event->hw; 1158 hwc = &event->hw;
1880 1159
1881 val = x86_perf_event_update(event, hwc, idx); 1160 val = x86_perf_event_update(event);
1882 if (val & (1ULL << (x86_pmu.event_bits - 1))) 1161 if (val & (1ULL << (x86_pmu.cntval_bits - 1)))
1883 continue; 1162 continue;
1884 1163
1885 /* 1164 /*
1886 * event overflow 1165 * event overflow
1887 */ 1166 */
1888 handled = 1; 1167 handled++;
1889 data.period = event->hw.last_period; 1168 data.period = event->hw.last_period;
1890 1169
1891 if (!x86_perf_event_set_period(event, hwc, idx)) 1170 if (!x86_perf_event_set_period(event))
1892 continue; 1171 continue;
1893 1172
1894 if (perf_event_overflow(event, 1, &data, regs)) 1173 if (perf_event_overflow(event, 1, &data, regs))
1895 amd_pmu_disable_event(hwc, idx); 1174 x86_pmu_stop(event);
1896 } 1175 }
1897 1176
1898 if (handled) 1177 if (handled)
@@ -1922,7 +1201,6 @@ void set_perf_event_pending(void)
1922 1201
1923void perf_events_lapic_init(void) 1202void perf_events_lapic_init(void)
1924{ 1203{
1925#ifdef CONFIG_X86_LOCAL_APIC
1926 if (!x86_pmu.apic || !x86_pmu_initialized()) 1204 if (!x86_pmu.apic || !x86_pmu_initialized())
1927 return; 1205 return;
1928 1206
@@ -1930,15 +1208,22 @@ void perf_events_lapic_init(void)
1930 * Always use NMI for PMU 1208 * Always use NMI for PMU
1931 */ 1209 */
1932 apic_write(APIC_LVTPC, APIC_DM_NMI); 1210 apic_write(APIC_LVTPC, APIC_DM_NMI);
1933#endif
1934} 1211}
1935 1212
1213struct pmu_nmi_state {
1214 unsigned int marked;
1215 int handled;
1216};
1217
1218static DEFINE_PER_CPU(struct pmu_nmi_state, pmu_nmi);
1219
1936static int __kprobes 1220static int __kprobes
1937perf_event_nmi_handler(struct notifier_block *self, 1221perf_event_nmi_handler(struct notifier_block *self,
1938 unsigned long cmd, void *__args) 1222 unsigned long cmd, void *__args)
1939{ 1223{
1940 struct die_args *args = __args; 1224 struct die_args *args = __args;
1941 struct pt_regs *regs; 1225 unsigned int this_nmi;
1226 int handled;
1942 1227
1943 if (!atomic_read(&active_events)) 1228 if (!atomic_read(&active_events))
1944 return NOTIFY_DONE; 1229 return NOTIFY_DONE;
@@ -1947,24 +1232,47 @@ perf_event_nmi_handler(struct notifier_block *self,
1947 case DIE_NMI: 1232 case DIE_NMI:
1948 case DIE_NMI_IPI: 1233 case DIE_NMI_IPI:
1949 break; 1234 break;
1950 1235 case DIE_NMIUNKNOWN:
1236 this_nmi = percpu_read(irq_stat.__nmi_count);
1237 if (this_nmi != __get_cpu_var(pmu_nmi).marked)
1238 /* let the kernel handle the unknown nmi */
1239 return NOTIFY_DONE;
1240 /*
1241 * This one is a PMU back-to-back nmi. Two events
1242 * trigger 'simultaneously' raising two back-to-back
1243 * NMIs. If the first NMI handles both, the latter
1244 * will be empty and daze the CPU. So, we drop it to
1245 * avoid false-positive 'unknown nmi' messages.
1246 */
1247 return NOTIFY_STOP;
1951 default: 1248 default:
1952 return NOTIFY_DONE; 1249 return NOTIFY_DONE;
1953 } 1250 }
1954 1251
1955 regs = args->regs;
1956
1957#ifdef CONFIG_X86_LOCAL_APIC
1958 apic_write(APIC_LVTPC, APIC_DM_NMI); 1252 apic_write(APIC_LVTPC, APIC_DM_NMI);
1959#endif 1253
1960 /* 1254 handled = x86_pmu.handle_irq(args->regs);
1961 * Can't rely on the handled return value to say it was our NMI, two 1255 if (!handled)
1962 * events could trigger 'simultaneously' raising two back-to-back NMIs. 1256 return NOTIFY_DONE;
1963 * 1257
1964 * If the first NMI handles both, the latter will be empty and daze 1258 this_nmi = percpu_read(irq_stat.__nmi_count);
1965 * the CPU. 1259 if ((handled > 1) ||
1966 */ 1260 /* the next nmi could be a back-to-back nmi */
1967 x86_pmu.handle_irq(regs); 1261 ((__get_cpu_var(pmu_nmi).marked == this_nmi) &&
1262 (__get_cpu_var(pmu_nmi).handled > 1))) {
1263 /*
1264 * We could have two subsequent back-to-back nmis: The
1265 * first handles more than one counter, the 2nd
1266 * handles only one counter and the 3rd handles no
1267 * counter.
1268 *
1269 * This is the 2nd nmi because the previous was
1270 * handling more than one counter. We will mark the
1271 * next (3rd) and then drop it if unhandled.
1272 */
1273 __get_cpu_var(pmu_nmi).marked = this_nmi + 1;
1274 __get_cpu_var(pmu_nmi).handled = handled;
1275 }
1968 1276
1969 return NOTIFY_STOP; 1277 return NOTIFY_STOP;
1970} 1278}
@@ -1975,193 +1283,64 @@ static __read_mostly struct notifier_block perf_event_nmi_notifier = {
1975 .priority = 1 1283 .priority = 1
1976}; 1284};
1977 1285
1978static __initconst struct x86_pmu p6_pmu = { 1286static struct event_constraint unconstrained;
1979 .name = "p6", 1287static struct event_constraint emptyconstraint;
1980 .handle_irq = p6_pmu_handle_irq,
1981 .disable_all = p6_pmu_disable_all,
1982 .enable_all = p6_pmu_enable_all,
1983 .enable = p6_pmu_enable_event,
1984 .disable = p6_pmu_disable_event,
1985 .eventsel = MSR_P6_EVNTSEL0,
1986 .perfctr = MSR_P6_PERFCTR0,
1987 .event_map = p6_pmu_event_map,
1988 .raw_event = p6_pmu_raw_event,
1989 .max_events = ARRAY_SIZE(p6_perfmon_event_map),
1990 .apic = 1,
1991 .max_period = (1ULL << 31) - 1,
1992 .version = 0,
1993 .num_events = 2,
1994 /*
1995 * Events have 40 bits implemented. However they are designed such
1996 * that bits [32-39] are sign extensions of bit 31. As such the
1997 * effective width of a event for P6-like PMU is 32 bits only.
1998 *
1999 * See IA-32 Intel Architecture Software developer manual Vol 3B
2000 */
2001 .event_bits = 32,
2002 .event_mask = (1ULL << 32) - 1,
2003 .get_event_idx = intel_get_event_idx,
2004};
2005 1288
2006static __initconst struct x86_pmu intel_pmu = { 1289static struct event_constraint *
2007 .name = "Intel", 1290x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
2008 .handle_irq = intel_pmu_handle_irq,
2009 .disable_all = intel_pmu_disable_all,
2010 .enable_all = intel_pmu_enable_all,
2011 .enable = intel_pmu_enable_event,
2012 .disable = intel_pmu_disable_event,
2013 .eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
2014 .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
2015 .event_map = intel_pmu_event_map,
2016 .raw_event = intel_pmu_raw_event,
2017 .max_events = ARRAY_SIZE(intel_perfmon_event_map),
2018 .apic = 1,
2019 /*
2020 * Intel PMCs cannot be accessed sanely above 32 bit width,
2021 * so we install an artificial 1<<31 period regardless of
2022 * the generic event period:
2023 */
2024 .max_period = (1ULL << 31) - 1,
2025 .enable_bts = intel_pmu_enable_bts,
2026 .disable_bts = intel_pmu_disable_bts,
2027 .get_event_idx = intel_get_event_idx,
2028};
2029
2030static __initconst struct x86_pmu amd_pmu = {
2031 .name = "AMD",
2032 .handle_irq = amd_pmu_handle_irq,
2033 .disable_all = amd_pmu_disable_all,
2034 .enable_all = amd_pmu_enable_all,
2035 .enable = amd_pmu_enable_event,
2036 .disable = amd_pmu_disable_event,
2037 .eventsel = MSR_K7_EVNTSEL0,
2038 .perfctr = MSR_K7_PERFCTR0,
2039 .event_map = amd_pmu_event_map,
2040 .raw_event = amd_pmu_raw_event,
2041 .max_events = ARRAY_SIZE(amd_perfmon_event_map),
2042 .num_events = 4,
2043 .event_bits = 48,
2044 .event_mask = (1ULL << 48) - 1,
2045 .apic = 1,
2046 /* use highest bit to detect overflow */
2047 .max_period = (1ULL << 47) - 1,
2048 .get_event_idx = gen_get_event_idx,
2049};
2050
2051static __init int p6_pmu_init(void)
2052{ 1291{
2053 switch (boot_cpu_data.x86_model) { 1292 struct event_constraint *c;
2054 case 1:
2055 case 3: /* Pentium Pro */
2056 case 5:
2057 case 6: /* Pentium II */
2058 case 7:
2059 case 8:
2060 case 11: /* Pentium III */
2061 event_constraints = intel_p6_event_constraints;
2062 break;
2063 case 9:
2064 case 13:
2065 /* Pentium M */
2066 event_constraints = intel_p6_event_constraints;
2067 break;
2068 default:
2069 pr_cont("unsupported p6 CPU model %d ",
2070 boot_cpu_data.x86_model);
2071 return -ENODEV;
2072 }
2073 1293
2074 x86_pmu = p6_pmu; 1294 if (x86_pmu.event_constraints) {
1295 for_each_event_constraint(c, x86_pmu.event_constraints) {
1296 if ((event->hw.config & c->cmask) == c->code)
1297 return c;
1298 }
1299 }
2075 1300
2076 return 0; 1301 return &unconstrained;
2077} 1302}
2078 1303
2079static __init int intel_pmu_init(void) 1304#include "perf_event_amd.c"
2080{ 1305#include "perf_event_p6.c"
2081 union cpuid10_edx edx; 1306#include "perf_event_p4.c"
2082 union cpuid10_eax eax; 1307#include "perf_event_intel_lbr.c"
2083 unsigned int unused; 1308#include "perf_event_intel_ds.c"
2084 unsigned int ebx; 1309#include "perf_event_intel.c"
2085 int version;
2086
2087 if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
2088 /* check for P6 processor family */
2089 if (boot_cpu_data.x86 == 6) {
2090 return p6_pmu_init();
2091 } else {
2092 return -ENODEV;
2093 }
2094 }
2095
2096 /*
2097 * Check whether the Architectural PerfMon supports
2098 * Branch Misses Retired hw_event or not.
2099 */
2100 cpuid(10, &eax.full, &ebx, &unused, &edx.full);
2101 if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
2102 return -ENODEV;
2103 1310
2104 version = eax.split.version_id; 1311static int __cpuinit
2105 if (version < 2) 1312x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
2106 return -ENODEV; 1313{
1314 unsigned int cpu = (long)hcpu;
1315 int ret = NOTIFY_OK;
2107 1316
2108 x86_pmu = intel_pmu; 1317 switch (action & ~CPU_TASKS_FROZEN) {
2109 x86_pmu.version = version; 1318 case CPU_UP_PREPARE:
2110 x86_pmu.num_events = eax.split.num_events; 1319 if (x86_pmu.cpu_prepare)
2111 x86_pmu.event_bits = eax.split.bit_width; 1320 ret = x86_pmu.cpu_prepare(cpu);
2112 x86_pmu.event_mask = (1ULL << eax.split.bit_width) - 1; 1321 break;
2113 1322
2114 /* 1323 case CPU_STARTING:
2115 * Quirk: v2 perfmon does not report fixed-purpose events, so 1324 if (x86_pmu.cpu_starting)
2116 * assume at least 3 events: 1325 x86_pmu.cpu_starting(cpu);
2117 */ 1326 break;
2118 x86_pmu.num_events_fixed = max((int)edx.split.num_events_fixed, 3);
2119 1327
2120 /* 1328 case CPU_DYING:
2121 * Install the hw-cache-events table: 1329 if (x86_pmu.cpu_dying)
2122 */ 1330 x86_pmu.cpu_dying(cpu);
2123 switch (boot_cpu_data.x86_model) {
2124 case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
2125 case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
2126 case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
2127 case 29: /* six-core 45 nm xeon "Dunnington" */
2128 memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
2129 sizeof(hw_cache_event_ids));
2130
2131 pr_cont("Core2 events, ");
2132 event_constraints = intel_core_event_constraints;
2133 break; 1331 break;
2134 default:
2135 case 26:
2136 memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
2137 sizeof(hw_cache_event_ids));
2138 1332
2139 event_constraints = intel_nehalem_event_constraints; 1333 case CPU_UP_CANCELED:
2140 pr_cont("Nehalem/Corei7 events, "); 1334 case CPU_DEAD:
1335 if (x86_pmu.cpu_dead)
1336 x86_pmu.cpu_dead(cpu);
2141 break; 1337 break;
2142 case 28:
2143 memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
2144 sizeof(hw_cache_event_ids));
2145 1338
2146 pr_cont("Atom events, "); 1339 default:
2147 break; 1340 break;
2148 } 1341 }
2149 return 0;
2150}
2151 1342
2152static __init int amd_pmu_init(void) 1343 return ret;
2153{
2154 /* Performance-monitoring supported from K7 and later: */
2155 if (boot_cpu_data.x86 < 6)
2156 return -ENODEV;
2157
2158 x86_pmu = amd_pmu;
2159
2160 /* Events are common for all AMDs */
2161 memcpy(hw_cache_event_ids, amd_hw_cache_event_ids,
2162 sizeof(hw_cache_event_ids));
2163
2164 return 0;
2165} 1344}
2166 1345
2167static void __init pmu_check_apic(void) 1346static void __init pmu_check_apic(void)
@@ -2176,6 +1355,7 @@ static void __init pmu_check_apic(void)
2176 1355
2177void __init init_hw_perf_events(void) 1356void __init init_hw_perf_events(void)
2178{ 1357{
1358 struct event_constraint *c;
2179 int err; 1359 int err;
2180 1360
2181 pr_info("Performance Events: "); 1361 pr_info("Performance Events: ");
@@ -2199,88 +1379,227 @@ void __init init_hw_perf_events(void)
2199 1379
2200 pr_cont("%s PMU driver.\n", x86_pmu.name); 1380 pr_cont("%s PMU driver.\n", x86_pmu.name);
2201 1381
2202 if (x86_pmu.num_events > X86_PMC_MAX_GENERIC) { 1382 if (x86_pmu.quirks)
1383 x86_pmu.quirks();
1384
1385 if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {
2203 WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", 1386 WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
2204 x86_pmu.num_events, X86_PMC_MAX_GENERIC); 1387 x86_pmu.num_counters, X86_PMC_MAX_GENERIC);
2205 x86_pmu.num_events = X86_PMC_MAX_GENERIC; 1388 x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
2206 } 1389 }
2207 perf_event_mask = (1 << x86_pmu.num_events) - 1; 1390 x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
2208 perf_max_events = x86_pmu.num_events; 1391 perf_max_events = x86_pmu.num_counters;
2209 1392
2210 if (x86_pmu.num_events_fixed > X86_PMC_MAX_FIXED) { 1393 if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
2211 WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!", 1394 WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
2212 x86_pmu.num_events_fixed, X86_PMC_MAX_FIXED); 1395 x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED);
2213 x86_pmu.num_events_fixed = X86_PMC_MAX_FIXED; 1396 x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED;
2214 } 1397 }
2215 1398
2216 perf_event_mask |= 1399 x86_pmu.intel_ctrl |=
2217 ((1LL << x86_pmu.num_events_fixed)-1) << X86_PMC_IDX_FIXED; 1400 ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED;
2218 x86_pmu.intel_ctrl = perf_event_mask;
2219 1401
2220 perf_events_lapic_init(); 1402 perf_events_lapic_init();
2221 register_die_notifier(&perf_event_nmi_notifier); 1403 register_die_notifier(&perf_event_nmi_notifier);
2222 1404
1405 unconstrained = (struct event_constraint)
1406 __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1,
1407 0, x86_pmu.num_counters);
1408
1409 if (x86_pmu.event_constraints) {
1410 for_each_event_constraint(c, x86_pmu.event_constraints) {
1411 if (c->cmask != X86_RAW_EVENT_MASK)
1412 continue;
1413
1414 c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
1415 c->weight += x86_pmu.num_counters;
1416 }
1417 }
1418
2223 pr_info("... version: %d\n", x86_pmu.version); 1419 pr_info("... version: %d\n", x86_pmu.version);
2224 pr_info("... bit width: %d\n", x86_pmu.event_bits); 1420 pr_info("... bit width: %d\n", x86_pmu.cntval_bits);
2225 pr_info("... generic registers: %d\n", x86_pmu.num_events); 1421 pr_info("... generic registers: %d\n", x86_pmu.num_counters);
2226 pr_info("... value mask: %016Lx\n", x86_pmu.event_mask); 1422 pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask);
2227 pr_info("... max period: %016Lx\n", x86_pmu.max_period); 1423 pr_info("... max period: %016Lx\n", x86_pmu.max_period);
2228 pr_info("... fixed-purpose events: %d\n", x86_pmu.num_events_fixed); 1424 pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed);
2229 pr_info("... event mask: %016Lx\n", perf_event_mask); 1425 pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl);
1426
1427 perf_cpu_notifier(x86_pmu_notifier);
2230} 1428}
2231 1429
2232static inline void x86_pmu_read(struct perf_event *event) 1430static inline void x86_pmu_read(struct perf_event *event)
2233{ 1431{
2234 x86_perf_event_update(event, &event->hw, event->hw.idx); 1432 x86_perf_event_update(event);
1433}
1434
1435/*
1436 * Start group events scheduling transaction
1437 * Set the flag to make pmu::enable() not perform the
1438 * schedulability test, it will be performed at commit time
1439 */
1440static void x86_pmu_start_txn(const struct pmu *pmu)
1441{
1442 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1443
1444 cpuc->group_flag |= PERF_EVENT_TXN;
1445 cpuc->n_txn = 0;
1446}
1447
1448/*
1449 * Stop group events scheduling transaction
1450 * Clear the flag and pmu::enable() will perform the
1451 * schedulability test.
1452 */
1453static void x86_pmu_cancel_txn(const struct pmu *pmu)
1454{
1455 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1456
1457 cpuc->group_flag &= ~PERF_EVENT_TXN;
1458 /*
1459 * Truncate the collected events.
1460 */
1461 cpuc->n_added -= cpuc->n_txn;
1462 cpuc->n_events -= cpuc->n_txn;
1463}
1464
1465/*
1466 * Commit group events scheduling transaction
1467 * Perform the group schedulability test as a whole
1468 * Return 0 on success
1469 */
1470static int x86_pmu_commit_txn(const struct pmu *pmu)
1471{
1472 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1473 int assign[X86_PMC_IDX_MAX];
1474 int n, ret;
1475
1476 n = cpuc->n_events;
1477
1478 if (!x86_pmu_initialized())
1479 return -EAGAIN;
1480
1481 ret = x86_pmu.schedule_events(cpuc, n, assign);
1482 if (ret)
1483 return ret;
1484
1485 /*
1486 * copy the new assignment; now we know it is possible it
1487 * will be used by hw_perf_enable()
1488 */
1489 memcpy(cpuc->assign, assign, n*sizeof(int));
1490
1491 cpuc->group_flag &= ~PERF_EVENT_TXN;
1492
1493 return 0;
2235} 1494}
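
Taken together, the three transaction hooks let the core collect a whole group before any schedulability test runs: start_txn sets PERF_EVENT_TXN so each subsequent enable only collects the event, commit_txn then tests the group as a whole, and cancel_txn discards what was collected. A hedged sketch of the caller-side sequence follows; the helper and its arguments are hypothetical, only the four hooks come from the pmu structure defined just below.

/*
 * Hypothetical caller-side sketch of the transaction protocol; only the
 * ->enable/->start_txn/->commit_txn/->cancel_txn hooks are real, the
 * helper itself is illustrative.
 */
static int sketch_group_sched_in(const struct pmu *pmu,
				 struct perf_event **event, int nr)
{
	int i;

	pmu->start_txn(pmu);			/* sets PERF_EVENT_TXN */

	for (i = 0; i < nr; i++) {
		if (pmu->enable(event[i]))	/* collect only, no test yet */
			goto fail;
	}

	if (!pmu->commit_txn(pmu))		/* schedulability test for the group */
		return 0;
fail:
	pmu->cancel_txn(pmu);			/* truncate the collected events */
	return -EAGAIN;
}
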
2236 1495
2237static const struct pmu pmu = { 1496static const struct pmu pmu = {
2238 .enable = x86_pmu_enable, 1497 .enable = x86_pmu_enable,
2239 .disable = x86_pmu_disable, 1498 .disable = x86_pmu_disable,
1499 .start = x86_pmu_start,
1500 .stop = x86_pmu_stop,
2240 .read = x86_pmu_read, 1501 .read = x86_pmu_read,
2241 .unthrottle = x86_pmu_unthrottle, 1502 .unthrottle = x86_pmu_unthrottle,
1503 .start_txn = x86_pmu_start_txn,
1504 .cancel_txn = x86_pmu_cancel_txn,
1505 .commit_txn = x86_pmu_commit_txn,
2242}; 1506};
2243 1507
2244static int 1508/*
2245validate_event(struct cpu_hw_events *cpuc, struct perf_event *event) 1509 * validate that we can schedule this event
1510 */
1511static int validate_event(struct perf_event *event)
2246{ 1512{
2247 struct hw_perf_event fake_event = event->hw; 1513 struct cpu_hw_events *fake_cpuc;
1514 struct event_constraint *c;
1515 int ret = 0;
2248 1516
2249 if (event->pmu && event->pmu != &pmu) 1517 fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO);
2250 return 0; 1518 if (!fake_cpuc)
1519 return -ENOMEM;
1520
1521 c = x86_pmu.get_event_constraints(fake_cpuc, event);
1522
1523 if (!c || !c->weight)
1524 ret = -ENOSPC;
1525
1526 if (x86_pmu.put_event_constraints)
1527 x86_pmu.put_event_constraints(fake_cpuc, event);
1528
1529 kfree(fake_cpuc);
2251 1530
2252 return x86_schedule_event(cpuc, &fake_event) >= 0; 1531 return ret;
2253} 1532}
2254 1533
1534/*
1535 * validate a single event group
1536 *
1537 * validation include:
1538 * - check events are compatible which each other
1539 * - events do not compete for the same counter
1540 * - number of events <= number of counters
1541 *
1542 * validation ensures the group can be loaded onto the
1543 * PMU if it was the only group available.
1544 */
2255static int validate_group(struct perf_event *event) 1545static int validate_group(struct perf_event *event)
2256{ 1546{
2257 struct perf_event *sibling, *leader = event->group_leader; 1547 struct perf_event *leader = event->group_leader;
2258 struct cpu_hw_events fake_pmu; 1548 struct cpu_hw_events *fake_cpuc;
1549 int ret, n;
2259 1550
2260 memset(&fake_pmu, 0, sizeof(fake_pmu)); 1551 ret = -ENOMEM;
1552 fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO);
1553 if (!fake_cpuc)
1554 goto out;
2261 1555
2262 if (!validate_event(&fake_pmu, leader)) 1556 /*
2263 return -ENOSPC; 1557 * the event is not yet connected with its
1558 * siblings therefore we must first collect
1559 * existing siblings, then add the new event
1560 * before we can simulate the scheduling
1561 */
1562 ret = -ENOSPC;
1563 n = collect_events(fake_cpuc, leader, true);
1564 if (n < 0)
1565 goto out_free;
2264 1566
2265 list_for_each_entry(sibling, &leader->sibling_list, group_entry) { 1567 fake_cpuc->n_events = n;
2266 if (!validate_event(&fake_pmu, sibling)) 1568 n = collect_events(fake_cpuc, event, false);
2267 return -ENOSPC; 1569 if (n < 0)
2268 } 1570 goto out_free;
2269 1571
2270 if (!validate_event(&fake_pmu, event)) 1572 fake_cpuc->n_events = n;
2271 return -ENOSPC;
2272 1573
2273 return 0; 1574 ret = x86_pmu.schedule_events(fake_cpuc, n, NULL);
1575
1576out_free:
1577 kfree(fake_cpuc);
1578out:
1579 return ret;
2274} 1580}
2275 1581
2276const struct pmu *hw_perf_event_init(struct perf_event *event) 1582const struct pmu *hw_perf_event_init(struct perf_event *event)
2277{ 1583{
1584 const struct pmu *tmp;
2278 int err; 1585 int err;
2279 1586
2280 err = __hw_perf_event_init(event); 1587 err = __hw_perf_event_init(event);
2281 if (!err) { 1588 if (!err) {
1589 /*
1590 * we temporarily connect event to its pmu
1591 * such that validate_group() can classify
1592 * it as an x86 event using is_x86_event()
1593 */
1594 tmp = event->pmu;
1595 event->pmu = &pmu;
1596
2282 if (event->group_leader != event) 1597 if (event->group_leader != event)
2283 err = validate_group(event); 1598 err = validate_group(event);
1599 else
1600 err = validate_event(event);
1601
1602 event->pmu = tmp;
2284 } 1603 }
2285 if (err) { 1604 if (err) {
2286 if (event->destroy) 1605 if (event->destroy)
@@ -2304,7 +1623,6 @@ void callchain_store(struct perf_callchain_entry *entry, u64 ip)
2304 1623
2305static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry); 1624static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry);
2306static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry); 1625static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry);
2307static DEFINE_PER_CPU(int, in_ignored_frame);
2308 1626
2309 1627
2310static void 1628static void
@@ -2320,10 +1638,6 @@ static void backtrace_warning(void *data, char *msg)
2320 1638
2321static int backtrace_stack(void *data, char *name) 1639static int backtrace_stack(void *data, char *name)
2322{ 1640{
2323 per_cpu(in_ignored_frame, smp_processor_id()) =
2324 x86_is_stack_id(NMI_STACK, name) ||
2325 x86_is_stack_id(DEBUG_STACK, name);
2326
2327 return 0; 1641 return 0;
2328} 1642}
2329 1643
@@ -2331,11 +1645,7 @@ static void backtrace_address(void *data, unsigned long addr, int reliable)
2331{ 1645{
2332 struct perf_callchain_entry *entry = data; 1646 struct perf_callchain_entry *entry = data;
2333 1647
2334 if (per_cpu(in_ignored_frame, smp_processor_id())) 1648 callchain_store(entry, addr);
2335 return;
2336
2337 if (reliable)
2338 callchain_store(entry, addr);
2339} 1649}
2340 1650
2341static const struct stacktrace_ops backtrace_ops = { 1651static const struct stacktrace_ops backtrace_ops = {
@@ -2346,8 +1656,6 @@ static const struct stacktrace_ops backtrace_ops = {
2346 .walk_stack = print_context_stack_bp, 1656 .walk_stack = print_context_stack_bp,
2347}; 1657};
2348 1658
2349#include "../dumpstack.h"
2350
2351static void 1659static void
2352perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) 1660perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
2353{ 1661{
@@ -2357,49 +1665,42 @@ perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
2357 dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry); 1665 dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry);
2358} 1666}
2359 1667
2360/* 1668#ifdef CONFIG_COMPAT
2361 * best effort, GUP based copy_from_user() that assumes IRQ or NMI context 1669static inline int
2362 */ 1670perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
2363static unsigned long
2364copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
2365{ 1671{
2366 unsigned long offset, addr = (unsigned long)from; 1672 /* 32-bit process in 64-bit kernel. */
2367 int type = in_nmi() ? KM_NMI : KM_IRQ0; 1673 struct stack_frame_ia32 frame;
2368 unsigned long size, len = 0; 1674 const void __user *fp;
2369 struct page *page;
2370 void *map;
2371 int ret;
2372
2373 do {
2374 ret = __get_user_pages_fast(addr, 1, 0, &page);
2375 if (!ret)
2376 break;
2377 1675
2378 offset = addr & (PAGE_SIZE - 1); 1676 if (!test_thread_flag(TIF_IA32))
2379 size = min(PAGE_SIZE - offset, n - len); 1677 return 0;
2380 1678
2381 map = kmap_atomic(page, type); 1679 fp = compat_ptr(regs->bp);
2382 memcpy(to, map+offset, size); 1680 while (entry->nr < PERF_MAX_STACK_DEPTH) {
2383 kunmap_atomic(map, type); 1681 unsigned long bytes;
2384 put_page(page); 1682 frame.next_frame = 0;
1683 frame.return_address = 0;
2385 1684
2386 len += size; 1685 bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
2387 to += size; 1686 if (bytes != sizeof(frame))
2388 addr += size; 1687 break;
2389 1688
2390 } while (len < n); 1689 if (fp < compat_ptr(regs->sp))
1690 break;
2391 1691
2392 return len; 1692 callchain_store(entry, frame.return_address);
1693 fp = compat_ptr(frame.next_frame);
1694 }
1695 return 1;
2393} 1696}
2394 1697#else
2395static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) 1698static inline int
1699perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
2396{ 1700{
2397 unsigned long bytes; 1701 return 0;
2398
2399 bytes = copy_from_user_nmi(frame, fp, sizeof(*frame));
2400
2401 return bytes == sizeof(*frame);
2402} 1702}
1703#endif
2403 1704
2404static void 1705static void
2405perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) 1706perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
@@ -2415,11 +1716,16 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
2415 callchain_store(entry, PERF_CONTEXT_USER); 1716 callchain_store(entry, PERF_CONTEXT_USER);
2416 callchain_store(entry, regs->ip); 1717 callchain_store(entry, regs->ip);
2417 1718
1719 if (perf_callchain_user32(regs, entry))
1720 return;
1721
2418 while (entry->nr < PERF_MAX_STACK_DEPTH) { 1722 while (entry->nr < PERF_MAX_STACK_DEPTH) {
1723 unsigned long bytes;
2419 frame.next_frame = NULL; 1724 frame.next_frame = NULL;
2420 frame.return_address = 0; 1725 frame.return_address = 0;
2421 1726
2422 if (!copy_stack_frame(fp, &frame)) 1727 bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
1728 if (bytes != sizeof(frame))
2423 break; 1729 break;
2424 1730
2425 if ((unsigned long)fp < regs->sp) 1731 if ((unsigned long)fp < regs->sp)
@@ -2440,9 +1746,6 @@ perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry)
2440 1746
2441 is_user = user_mode(regs); 1747 is_user = user_mode(regs);
2442 1748
2443 if (!current || current->pid == 0)
2444 return;
2445
2446 if (is_user && current->state != TASK_RUNNING) 1749 if (is_user && current->state != TASK_RUNNING)
2447 return; 1750 return;
2448 1751
@@ -2457,6 +1760,11 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2457{ 1760{
2458 struct perf_callchain_entry *entry; 1761 struct perf_callchain_entry *entry;
2459 1762
1763 if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
1764 /* TODO: We don't support guest OS callchains yet */
1765 return NULL;
1766 }
1767
2460 if (in_nmi()) 1768 if (in_nmi())
2461 entry = &__get_cpu_var(pmc_nmi_entry); 1769 entry = &__get_cpu_var(pmc_nmi_entry);
2462 else 1770 else
@@ -2469,7 +1777,36 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2469 return entry; 1777 return entry;
2470} 1778}
2471 1779
2472void hw_perf_event_setup_online(int cpu) 1780unsigned long perf_instruction_pointer(struct pt_regs *regs)
2473{ 1781{
2474 init_debug_store_on_cpu(cpu); 1782 unsigned long ip;
1783
1784 if (perf_guest_cbs && perf_guest_cbs->is_in_guest())
1785 ip = perf_guest_cbs->get_guest_ip();
1786 else
1787 ip = instruction_pointer(regs);
1788
1789 return ip;
1790}
1791
1792unsigned long perf_misc_flags(struct pt_regs *regs)
1793{
1794 int misc = 0;
1795
1796 if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
1797 if (perf_guest_cbs->is_user_mode())
1798 misc |= PERF_RECORD_MISC_GUEST_USER;
1799 else
1800 misc |= PERF_RECORD_MISC_GUEST_KERNEL;
1801 } else {
1802 if (user_mode(regs))
1803 misc |= PERF_RECORD_MISC_USER;
1804 else
1805 misc |= PERF_RECORD_MISC_KERNEL;
1806 }
1807
1808 if (regs->flags & PERF_EFLAGS_EXACT)
1809 misc |= PERF_RECORD_MISC_EXACT_IP;
1810
1811 return misc;
2475} 1812}
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
new file mode 100644
index 000000000000..c2897b7b4a3b
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -0,0 +1,420 @@
1#ifdef CONFIG_CPU_SUP_AMD
2
3static DEFINE_RAW_SPINLOCK(amd_nb_lock);
4
5static __initconst const u64 amd_hw_cache_event_ids
6 [PERF_COUNT_HW_CACHE_MAX]
7 [PERF_COUNT_HW_CACHE_OP_MAX]
8 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
9{
10 [ C(L1D) ] = {
11 [ C(OP_READ) ] = {
12 [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */
13 [ C(RESULT_MISS) ] = 0x0041, /* Data Cache Misses */
14 },
15 [ C(OP_WRITE) ] = {
16 [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */
17 [ C(RESULT_MISS) ] = 0,
18 },
19 [ C(OP_PREFETCH) ] = {
20 [ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts */
21 [ C(RESULT_MISS) ] = 0x0167, /* Data Prefetcher :cancelled */
22 },
23 },
24 [ C(L1I ) ] = {
25 [ C(OP_READ) ] = {
26 [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches */
27 [ C(RESULT_MISS) ] = 0x0081, /* Instruction cache misses */
28 },
29 [ C(OP_WRITE) ] = {
30 [ C(RESULT_ACCESS) ] = -1,
31 [ C(RESULT_MISS) ] = -1,
32 },
33 [ C(OP_PREFETCH) ] = {
34 [ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */
35 [ C(RESULT_MISS) ] = 0,
36 },
37 },
38 [ C(LL ) ] = {
39 [ C(OP_READ) ] = {
40 [ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */
41 [ C(RESULT_MISS) ] = 0x037E, /* L2 Cache Misses : IC+DC */
42 },
43 [ C(OP_WRITE) ] = {
44 [ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback */
45 [ C(RESULT_MISS) ] = 0,
46 },
47 [ C(OP_PREFETCH) ] = {
48 [ C(RESULT_ACCESS) ] = 0,
49 [ C(RESULT_MISS) ] = 0,
50 },
51 },
52 [ C(DTLB) ] = {
53 [ C(OP_READ) ] = {
54 [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */
55 [ C(RESULT_MISS) ] = 0x0046, /* L1 DTLB and L2 DTLB Miss */
56 },
57 [ C(OP_WRITE) ] = {
58 [ C(RESULT_ACCESS) ] = 0,
59 [ C(RESULT_MISS) ] = 0,
60 },
61 [ C(OP_PREFETCH) ] = {
62 [ C(RESULT_ACCESS) ] = 0,
63 [ C(RESULT_MISS) ] = 0,
64 },
65 },
66 [ C(ITLB) ] = {
67 [ C(OP_READ) ] = {
68 [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fetches */
69 [ C(RESULT_MISS) ] = 0x0085, /* Instr. fetch ITLB misses */
70 },
71 [ C(OP_WRITE) ] = {
72 [ C(RESULT_ACCESS) ] = -1,
73 [ C(RESULT_MISS) ] = -1,
74 },
75 [ C(OP_PREFETCH) ] = {
76 [ C(RESULT_ACCESS) ] = -1,
77 [ C(RESULT_MISS) ] = -1,
78 },
79 },
80 [ C(BPU ) ] = {
81 [ C(OP_READ) ] = {
82 [ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr. */
83 [ C(RESULT_MISS) ] = 0x00c3, /* Retired Mispredicted BI */
84 },
85 [ C(OP_WRITE) ] = {
86 [ C(RESULT_ACCESS) ] = -1,
87 [ C(RESULT_MISS) ] = -1,
88 },
89 [ C(OP_PREFETCH) ] = {
90 [ C(RESULT_ACCESS) ] = -1,
91 [ C(RESULT_MISS) ] = -1,
92 },
93 },
94};
95
96/*
97 * AMD Performance Monitor K7 and later.
98 */
99static const u64 amd_perfmon_event_map[] =
100{
101 [PERF_COUNT_HW_CPU_CYCLES] = 0x0076,
102 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
103 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080,
104 [PERF_COUNT_HW_CACHE_MISSES] = 0x0081,
105 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2,
106 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3,
107};
108
109static u64 amd_pmu_event_map(int hw_event)
110{
111 return amd_perfmon_event_map[hw_event];
112}
113
114static int amd_pmu_hw_config(struct perf_event *event)
115{
116 int ret = x86_pmu_hw_config(event);
117
118 if (ret)
119 return ret;
120
121 if (event->attr.type != PERF_TYPE_RAW)
122 return 0;
123
124 event->hw.config |= event->attr.config & AMD64_RAW_EVENT_MASK;
125
126 return 0;
127}
128
129/*
130 * AMD64 events are detected based on their event codes.
131 */
132static inline int amd_is_nb_event(struct hw_perf_event *hwc)
133{
134 return (hwc->config & 0xe0) == 0xe0;
135}
136
137static inline int amd_has_nb(struct cpu_hw_events *cpuc)
138{
139 struct amd_nb *nb = cpuc->amd_nb;
140
141 return nb && nb->nb_id != -1;
142}
143
144static void amd_put_event_constraints(struct cpu_hw_events *cpuc,
145 struct perf_event *event)
146{
147 struct hw_perf_event *hwc = &event->hw;
148 struct amd_nb *nb = cpuc->amd_nb;
149 int i;
150
151 /*
152 * only care about NB events
153 */
154 if (!(amd_has_nb(cpuc) && amd_is_nb_event(hwc)))
155 return;
156
157 /*
158 * need to scan whole list because event may not have
159 * been assigned during scheduling
160 *
161 * no race condition possible because event can only
162 * be removed on one CPU at a time AND PMU is disabled
163 * when we come here
164 */
165 for (i = 0; i < x86_pmu.num_counters; i++) {
166 if (nb->owners[i] == event) {
167 cmpxchg(nb->owners+i, event, NULL);
168 break;
169 }
170 }
171}
172
173 /*
174 * AMD64 NorthBridge events need special treatment because
175 * counter access needs to be synchronized across all cores
176 * of a package. Refer to BKDG section 3.12
177 *
178 * NB events are events measuring L3 cache, HyperTransport
179 * traffic. They are identified by an event code >= 0xe00.
180 * They measure events on the NorthBridge, which is shared
181 * by all cores on a package. NB events are counted on a
182 * shared set of counters. When a NB event is programmed
183 * in a counter, the data actually comes from a shared
184 * counter. Thus, access to those counters needs to be
185 * synchronized.
186 *
187 * We implement the synchronization such that no two cores
188 * can be measuring NB events using the same counters. Thus,
189 * we maintain a per-NB allocation table. The available slot
190 * is propagated using the event_constraint structure.
191 *
192 * We provide only one choice for each NB event based on
193 * the fact that only NB events have restrictions. Consequently,
194 * if a counter is available, there is a guarantee the NB event
195 * will be assigned to it. If no slot is available, an empty
196 * constraint is returned and scheduling will eventually fail
197 * for this event.
198 *
199 * Note that all cores attached to the same NB compete for the same
200 * counters to host NB events, this is why we use atomic ops. Some
201 * multi-chip CPUs may have more than one NB.
202 *
203 * Given that resources are allocated (cmpxchg), they must be
204 * eventually freed for others to use. This is accomplished by
205 * calling amd_put_event_constraints().
206 *
207 * Non NB events are not impacted by this restriction.
208 */
209static struct event_constraint *
210amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
211{
212 struct hw_perf_event *hwc = &event->hw;
213 struct amd_nb *nb = cpuc->amd_nb;
214 struct perf_event *old = NULL;
215 int max = x86_pmu.num_counters;
216 int i, j, k = -1;
217
218 /*
219 * if not NB event or no NB, then no constraints
220 */
221 if (!(amd_has_nb(cpuc) && amd_is_nb_event(hwc)))
222 return &unconstrained;
223
224 /*
225 * detect if already present, if so reuse
226 *
227 * cannot merge with actual allocation
228 * because of possible holes
229 *
230 * event can already be present yet not assigned (in hwc->idx)
231 * because of successive calls to x86_schedule_events() from
232 * hw_perf_group_sched_in() without hw_perf_enable()
233 */
234 for (i = 0; i < max; i++) {
235 /*
236 * keep track of first free slot
237 */
238 if (k == -1 && !nb->owners[i])
239 k = i;
240
241 /* already present, reuse */
242 if (nb->owners[i] == event)
243 goto done;
244 }
245 /*
246 * not present, so grab a new slot
247 * starting either at:
248 */
249 if (hwc->idx != -1) {
250 /* previous assignment */
251 i = hwc->idx;
252 } else if (k != -1) {
253 /* start from free slot found */
254 i = k;
255 } else {
256 /*
257 * event not found, no slot found in
258 * first pass, try again from the
259 * beginning
260 */
261 i = 0;
262 }
263 j = i;
264 do {
265 old = cmpxchg(nb->owners+i, NULL, event);
266 if (!old)
267 break;
268 if (++i == max)
269 i = 0;
270 } while (i != j);
271done:
272 if (!old)
273 return &nb->event_constraints[i];
274
275 return &emptyconstraint;
276}
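
The allocation scheme described above boils down to a lock-free claim/release pair on the shared nb->owners[] table. A condensed illustration with hypothetical helper names (the real code open-codes these cmpxchg() calls in amd_get_event_constraints() and amd_put_event_constraints()):

/*
 * Hypothetical helpers condensing the ownership protocol used above;
 * not part of the patch.
 */
static inline bool nb_claim_slot(struct amd_nb *nb, int i,
				 struct perf_event *event)
{
	/* take the slot only if it is currently free */
	return cmpxchg(nb->owners + i, NULL, event) == NULL;
}

static inline void nb_release_slot(struct amd_nb *nb, int i,
				   struct perf_event *event)
{
	/* give the slot back only if we still own it */
	cmpxchg(nb->owners + i, event, NULL);
}
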
277
278static struct amd_nb *amd_alloc_nb(int cpu, int nb_id)
279{
280 struct amd_nb *nb;
281 int i;
282
283 nb = kmalloc(sizeof(struct amd_nb), GFP_KERNEL);
284 if (!nb)
285 return NULL;
286
287 memset(nb, 0, sizeof(*nb));
288 nb->nb_id = nb_id;
289
290 /*
291 * initialize all possible NB constraints
292 */
293 for (i = 0; i < x86_pmu.num_counters; i++) {
294 __set_bit(i, nb->event_constraints[i].idxmsk);
295 nb->event_constraints[i].weight = 1;
296 }
297 return nb;
298}
299
300static int amd_pmu_cpu_prepare(int cpu)
301{
302 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
303
304 WARN_ON_ONCE(cpuc->amd_nb);
305
306 if (boot_cpu_data.x86_max_cores < 2)
307 return NOTIFY_OK;
308
309 cpuc->amd_nb = amd_alloc_nb(cpu, -1);
310 if (!cpuc->amd_nb)
311 return NOTIFY_BAD;
312
313 return NOTIFY_OK;
314}
315
316static void amd_pmu_cpu_starting(int cpu)
317{
318 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
319 struct amd_nb *nb;
320 int i, nb_id;
321
322 if (boot_cpu_data.x86_max_cores < 2)
323 return;
324
325 nb_id = amd_get_nb_id(cpu);
326 WARN_ON_ONCE(nb_id == BAD_APICID);
327
328 raw_spin_lock(&amd_nb_lock);
329
330 for_each_online_cpu(i) {
331 nb = per_cpu(cpu_hw_events, i).amd_nb;
332 if (WARN_ON_ONCE(!nb))
333 continue;
334
335 if (nb->nb_id == nb_id) {
336 kfree(cpuc->amd_nb);
337 cpuc->amd_nb = nb;
338 break;
339 }
340 }
341
342 cpuc->amd_nb->nb_id = nb_id;
343 cpuc->amd_nb->refcnt++;
344
345 raw_spin_unlock(&amd_nb_lock);
346}
347
348static void amd_pmu_cpu_dead(int cpu)
349{
350 struct cpu_hw_events *cpuhw;
351
352 if (boot_cpu_data.x86_max_cores < 2)
353 return;
354
355 cpuhw = &per_cpu(cpu_hw_events, cpu);
356
357 raw_spin_lock(&amd_nb_lock);
358
359 if (cpuhw->amd_nb) {
360 struct amd_nb *nb = cpuhw->amd_nb;
361
362 if (nb->nb_id == -1 || --nb->refcnt == 0)
363 kfree(nb);
364
365 cpuhw->amd_nb = NULL;
366 }
367
368 raw_spin_unlock(&amd_nb_lock);
369}
370
371static __initconst const struct x86_pmu amd_pmu = {
372 .name = "AMD",
373 .handle_irq = x86_pmu_handle_irq,
374 .disable_all = x86_pmu_disable_all,
375 .enable_all = x86_pmu_enable_all,
376 .enable = x86_pmu_enable_event,
377 .disable = x86_pmu_disable_event,
378 .hw_config = amd_pmu_hw_config,
379 .schedule_events = x86_schedule_events,
380 .eventsel = MSR_K7_EVNTSEL0,
381 .perfctr = MSR_K7_PERFCTR0,
382 .event_map = amd_pmu_event_map,
383 .max_events = ARRAY_SIZE(amd_perfmon_event_map),
384 .num_counters = 4,
385 .cntval_bits = 48,
386 .cntval_mask = (1ULL << 48) - 1,
387 .apic = 1,
388 /* use highest bit to detect overflow */
389 .max_period = (1ULL << 47) - 1,
390 .get_event_constraints = amd_get_event_constraints,
391 .put_event_constraints = amd_put_event_constraints,
392
393 .cpu_prepare = amd_pmu_cpu_prepare,
394 .cpu_starting = amd_pmu_cpu_starting,
395 .cpu_dead = amd_pmu_cpu_dead,
396};
397
398static __init int amd_pmu_init(void)
399{
400 /* Performance-monitoring supported from K7 and later: */
401 if (boot_cpu_data.x86 < 6)
402 return -ENODEV;
403
404 x86_pmu = amd_pmu;
405
406 /* Events are common to all AMD CPUs */
407 memcpy(hw_cache_event_ids, amd_hw_cache_event_ids,
408 sizeof(hw_cache_event_ids));
409
410 return 0;
411}
412
413#else /* CONFIG_CPU_SUP_AMD */
414
415static int amd_pmu_init(void)
416{
417 return 0;
418}
419
420#endif
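Annotation: the NB-counter bookkeeping in amd_get_event_constraints() above boils down to a lock-free, round-robin claim over a small owners[] array. Below is a minimal userspace sketch of that pattern, not kernel code; claim_slot(), NUM_SLOTS and the GCC __sync compare-and-swap builtin are choices made for this illustration only.

/* Standalone sketch of the cmpxchg slot-claiming loop; not kernel code. */
#include <stdio.h>
#include <stddef.h>

#define NUM_SLOTS 4			/* hypothetical stand-in for x86_pmu.num_counters */

static void *owners[NUM_SLOTS];		/* stand-in for nb->owners[] */

/* Claim a slot for 'event', starting the search at 'hint' and wrapping around. */
static int claim_slot(void *event, int hint)
{
	int i = (hint >= 0 && hint < NUM_SLOTS) ? hint : 0;
	int start = i;

	do {
		/* install 'event' atomically, but only if the slot is still empty */
		void *old = __sync_val_compare_and_swap(&owners[i], NULL, event);
		if (old == NULL || old == event)
			return i;	/* claimed, or already ours: reuse */
		if (++i == NUM_SLOTS)
			i = 0;
	} while (i != start);

	return -1;			/* every slot taken: the emptyconstraint case */
}

int main(void)
{
	int ev1, ev2, ev3;

	printf("ev1 -> slot %d\n", claim_slot(&ev1, 2));	/* gets slot 2 */
	printf("ev2 -> slot %d\n", claim_slot(&ev2, 2));	/* slot 2 busy, moves on to 3 */
	printf("ev3 -> slot %d\n", claim_slot(&ev3, 2));	/* wraps around to slot 0 */
	return 0;
}

As in the kernel loop, a slot that already holds this event is simply reused, and a completely full table maps to the empty constraint, which makes scheduling fail for that event.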
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
new file mode 100644
index 000000000000..ee05c90012d2
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -0,0 +1,1056 @@
1#ifdef CONFIG_CPU_SUP_INTEL
2
3/*
4 * Intel PerfMon, used on Core and later.
5 */
6static const u64 intel_perfmon_event_map[] =
7{
8 [PERF_COUNT_HW_CPU_CYCLES] = 0x003c,
9 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
10 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e,
11 [PERF_COUNT_HW_CACHE_MISSES] = 0x412e,
12 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
13 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
14 [PERF_COUNT_HW_BUS_CYCLES] = 0x013c,
15};
16
17static struct event_constraint intel_core_event_constraints[] =
18{
19 INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
20 INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
21 INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
22 INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
23 INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */
24 INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FP_COMP_INSTR_RET */
25 EVENT_CONSTRAINT_END
26};
27
28static struct event_constraint intel_core2_event_constraints[] =
29{
30 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
31 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
32 /*
33 * Core2 has Fixed Counter 2 listed as CPU_CLK_UNHALTED.REF and event
34 * 0x013c as CPU_CLK_UNHALTED.BUS and specifies there is a fixed
35 * ratio between these counters.
36 */
37 /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
38 INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
39 INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
40 INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
41 INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
42 INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
43 INTEL_EVENT_CONSTRAINT(0x18, 0x1), /* IDLE_DURING_DIV */
44 INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */
45 INTEL_EVENT_CONSTRAINT(0xa1, 0x1), /* RS_UOPS_DISPATCH_CYCLES */
46 INTEL_EVENT_CONSTRAINT(0xc9, 0x1), /* ITLB_MISS_RETIRED (T30-9) */
47 INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED */
48 EVENT_CONSTRAINT_END
49};
50
51static struct event_constraint intel_nehalem_event_constraints[] =
52{
53 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
54 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
55 /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
56 INTEL_EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */
57 INTEL_EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */
58 INTEL_EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */
59 INTEL_EVENT_CONSTRAINT(0x43, 0x3), /* L1D_ALL_REF */
60 INTEL_EVENT_CONSTRAINT(0x48, 0x3), /* L1D_PEND_MISS */
61 INTEL_EVENT_CONSTRAINT(0x4e, 0x3), /* L1D_PREFETCH */
62 INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
63 INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */
64 EVENT_CONSTRAINT_END
65};
66
67static struct event_constraint intel_westmere_event_constraints[] =
68{
69 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
70 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
71 /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
72 INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
73 INTEL_EVENT_CONSTRAINT(0x60, 0x1), /* OFFCORE_REQUESTS_OUTSTANDING */
74 INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */
75 INTEL_EVENT_CONSTRAINT(0xb3, 0x1), /* SNOOPQ_REQUEST_OUTSTANDING */
76 EVENT_CONSTRAINT_END
77};
78
79static struct event_constraint intel_gen_event_constraints[] =
80{
81 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
82 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
83 /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
84 EVENT_CONSTRAINT_END
85};
86
87static u64 intel_pmu_event_map(int hw_event)
88{
89 return intel_perfmon_event_map[hw_event];
90}
91
92static __initconst const u64 westmere_hw_cache_event_ids
93 [PERF_COUNT_HW_CACHE_MAX]
94 [PERF_COUNT_HW_CACHE_OP_MAX]
95 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
96{
97 [ C(L1D) ] = {
98 [ C(OP_READ) ] = {
99 [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */
100 [ C(RESULT_MISS) ] = 0x0151, /* L1D.REPL */
101 },
102 [ C(OP_WRITE) ] = {
103 [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETIRED.STORES */
104 [ C(RESULT_MISS) ] = 0x0251, /* L1D.M_REPL */
105 },
106 [ C(OP_PREFETCH) ] = {
107 [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */
108 [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */
109 },
110 },
111 [ C(L1I ) ] = {
112 [ C(OP_READ) ] = {
113 [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
114 [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
115 },
116 [ C(OP_WRITE) ] = {
117 [ C(RESULT_ACCESS) ] = -1,
118 [ C(RESULT_MISS) ] = -1,
119 },
120 [ C(OP_PREFETCH) ] = {
121 [ C(RESULT_ACCESS) ] = 0x0,
122 [ C(RESULT_MISS) ] = 0x0,
123 },
124 },
125 [ C(LL ) ] = {
126 [ C(OP_READ) ] = {
127 [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */
128 [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */
129 },
130 [ C(OP_WRITE) ] = {
131 [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */
132 [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */
133 },
134 [ C(OP_PREFETCH) ] = {
135 [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */
136 [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */
137 },
138 },
139 [ C(DTLB) ] = {
140 [ C(OP_READ) ] = {
141 [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */
142 [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */
143 },
144 [ C(OP_WRITE) ] = {
145 [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETIRED.STORES */
146 [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */
147 },
148 [ C(OP_PREFETCH) ] = {
149 [ C(RESULT_ACCESS) ] = 0x0,
150 [ C(RESULT_MISS) ] = 0x0,
151 },
152 },
153 [ C(ITLB) ] = {
154 [ C(OP_READ) ] = {
155 [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */
156 [ C(RESULT_MISS) ] = 0x0185, /* ITLB_MISSES.ANY */
157 },
158 [ C(OP_WRITE) ] = {
159 [ C(RESULT_ACCESS) ] = -1,
160 [ C(RESULT_MISS) ] = -1,
161 },
162 [ C(OP_PREFETCH) ] = {
163 [ C(RESULT_ACCESS) ] = -1,
164 [ C(RESULT_MISS) ] = -1,
165 },
166 },
167 [ C(BPU ) ] = {
168 [ C(OP_READ) ] = {
169 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
170 [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */
171 },
172 [ C(OP_WRITE) ] = {
173 [ C(RESULT_ACCESS) ] = -1,
174 [ C(RESULT_MISS) ] = -1,
175 },
176 [ C(OP_PREFETCH) ] = {
177 [ C(RESULT_ACCESS) ] = -1,
178 [ C(RESULT_MISS) ] = -1,
179 },
180 },
181};
182
183static __initconst const u64 nehalem_hw_cache_event_ids
184 [PERF_COUNT_HW_CACHE_MAX]
185 [PERF_COUNT_HW_CACHE_OP_MAX]
186 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
187{
188 [ C(L1D) ] = {
189 [ C(OP_READ) ] = {
190 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */
191 [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */
192 },
193 [ C(OP_WRITE) ] = {
194 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */
195 [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */
196 },
197 [ C(OP_PREFETCH) ] = {
198 [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */
199 [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */
200 },
201 },
202 [ C(L1I ) ] = {
203 [ C(OP_READ) ] = {
204 [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
205 [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
206 },
207 [ C(OP_WRITE) ] = {
208 [ C(RESULT_ACCESS) ] = -1,
209 [ C(RESULT_MISS) ] = -1,
210 },
211 [ C(OP_PREFETCH) ] = {
212 [ C(RESULT_ACCESS) ] = 0x0,
213 [ C(RESULT_MISS) ] = 0x0,
214 },
215 },
216 [ C(LL ) ] = {
217 [ C(OP_READ) ] = {
218 [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */
219 [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */
220 },
221 [ C(OP_WRITE) ] = {
222 [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */
223 [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */
224 },
225 [ C(OP_PREFETCH) ] = {
226 [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */
227 [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */
228 },
229 },
230 [ C(DTLB) ] = {
231 [ C(OP_READ) ] = {
232 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
233 [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */
234 },
235 [ C(OP_WRITE) ] = {
236 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
237 [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */
238 },
239 [ C(OP_PREFETCH) ] = {
240 [ C(RESULT_ACCESS) ] = 0x0,
241 [ C(RESULT_MISS) ] = 0x0,
242 },
243 },
244 [ C(ITLB) ] = {
245 [ C(OP_READ) ] = {
246 [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */
247 [ C(RESULT_MISS) ] = 0x20c8, /* ITLB_MISS_RETIRED */
248 },
249 [ C(OP_WRITE) ] = {
250 [ C(RESULT_ACCESS) ] = -1,
251 [ C(RESULT_MISS) ] = -1,
252 },
253 [ C(OP_PREFETCH) ] = {
254 [ C(RESULT_ACCESS) ] = -1,
255 [ C(RESULT_MISS) ] = -1,
256 },
257 },
258 [ C(BPU ) ] = {
259 [ C(OP_READ) ] = {
260 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
261 [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */
262 },
263 [ C(OP_WRITE) ] = {
264 [ C(RESULT_ACCESS) ] = -1,
265 [ C(RESULT_MISS) ] = -1,
266 },
267 [ C(OP_PREFETCH) ] = {
268 [ C(RESULT_ACCESS) ] = -1,
269 [ C(RESULT_MISS) ] = -1,
270 },
271 },
272};
273
274static __initconst const u64 core2_hw_cache_event_ids
275 [PERF_COUNT_HW_CACHE_MAX]
276 [PERF_COUNT_HW_CACHE_OP_MAX]
277 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
278{
279 [ C(L1D) ] = {
280 [ C(OP_READ) ] = {
281 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */
282 [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */
283 },
284 [ C(OP_WRITE) ] = {
285 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */
286 [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */
287 },
288 [ C(OP_PREFETCH) ] = {
289 [ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS */
290 [ C(RESULT_MISS) ] = 0,
291 },
292 },
293 [ C(L1I ) ] = {
294 [ C(OP_READ) ] = {
295 [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */
296 [ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */
297 },
298 [ C(OP_WRITE) ] = {
299 [ C(RESULT_ACCESS) ] = -1,
300 [ C(RESULT_MISS) ] = -1,
301 },
302 [ C(OP_PREFETCH) ] = {
303 [ C(RESULT_ACCESS) ] = 0,
304 [ C(RESULT_MISS) ] = 0,
305 },
306 },
307 [ C(LL ) ] = {
308 [ C(OP_READ) ] = {
309 [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
310 [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
311 },
312 [ C(OP_WRITE) ] = {
313 [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */
314 [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */
315 },
316 [ C(OP_PREFETCH) ] = {
317 [ C(RESULT_ACCESS) ] = 0,
318 [ C(RESULT_MISS) ] = 0,
319 },
320 },
321 [ C(DTLB) ] = {
322 [ C(OP_READ) ] = {
323 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
324 [ C(RESULT_MISS) ] = 0x0208, /* DTLB_MISSES.MISS_LD */
325 },
326 [ C(OP_WRITE) ] = {
327 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
328 [ C(RESULT_MISS) ] = 0x0808, /* DTLB_MISSES.MISS_ST */
329 },
330 [ C(OP_PREFETCH) ] = {
331 [ C(RESULT_ACCESS) ] = 0,
332 [ C(RESULT_MISS) ] = 0,
333 },
334 },
335 [ C(ITLB) ] = {
336 [ C(OP_READ) ] = {
337 [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
338 [ C(RESULT_MISS) ] = 0x1282, /* ITLBMISSES */
339 },
340 [ C(OP_WRITE) ] = {
341 [ C(RESULT_ACCESS) ] = -1,
342 [ C(RESULT_MISS) ] = -1,
343 },
344 [ C(OP_PREFETCH) ] = {
345 [ C(RESULT_ACCESS) ] = -1,
346 [ C(RESULT_MISS) ] = -1,
347 },
348 },
349 [ C(BPU ) ] = {
350 [ C(OP_READ) ] = {
351 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
352 [ C(RESULT_MISS) ] = 0x00c5, /* BR_INST_RETIRED.MISPRED */
353 },
354 [ C(OP_WRITE) ] = {
355 [ C(RESULT_ACCESS) ] = -1,
356 [ C(RESULT_MISS) ] = -1,
357 },
358 [ C(OP_PREFETCH) ] = {
359 [ C(RESULT_ACCESS) ] = -1,
360 [ C(RESULT_MISS) ] = -1,
361 },
362 },
363};
364
365static __initconst const u64 atom_hw_cache_event_ids
366 [PERF_COUNT_HW_CACHE_MAX]
367 [PERF_COUNT_HW_CACHE_OP_MAX]
368 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
369{
370 [ C(L1D) ] = {
371 [ C(OP_READ) ] = {
372 [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD */
373 [ C(RESULT_MISS) ] = 0,
374 },
375 [ C(OP_WRITE) ] = {
376 [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST */
377 [ C(RESULT_MISS) ] = 0,
378 },
379 [ C(OP_PREFETCH) ] = {
380 [ C(RESULT_ACCESS) ] = 0x0,
381 [ C(RESULT_MISS) ] = 0,
382 },
383 },
384 [ C(L1I ) ] = {
385 [ C(OP_READ) ] = {
386 [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
387 [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
388 },
389 [ C(OP_WRITE) ] = {
390 [ C(RESULT_ACCESS) ] = -1,
391 [ C(RESULT_MISS) ] = -1,
392 },
393 [ C(OP_PREFETCH) ] = {
394 [ C(RESULT_ACCESS) ] = 0,
395 [ C(RESULT_MISS) ] = 0,
396 },
397 },
398 [ C(LL ) ] = {
399 [ C(OP_READ) ] = {
400 [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
401 [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
402 },
403 [ C(OP_WRITE) ] = {
404 [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */
405 [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */
406 },
407 [ C(OP_PREFETCH) ] = {
408 [ C(RESULT_ACCESS) ] = 0,
409 [ C(RESULT_MISS) ] = 0,
410 },
411 },
412 [ C(DTLB) ] = {
413 [ C(OP_READ) ] = {
414 [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI (alias) */
415 [ C(RESULT_MISS) ] = 0x0508, /* DTLB_MISSES.MISS_LD */
416 },
417 [ C(OP_WRITE) ] = {
418 [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI (alias) */
419 [ C(RESULT_MISS) ] = 0x0608, /* DTLB_MISSES.MISS_ST */
420 },
421 [ C(OP_PREFETCH) ] = {
422 [ C(RESULT_ACCESS) ] = 0,
423 [ C(RESULT_MISS) ] = 0,
424 },
425 },
426 [ C(ITLB) ] = {
427 [ C(OP_READ) ] = {
428 [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
429 [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */
430 },
431 [ C(OP_WRITE) ] = {
432 [ C(RESULT_ACCESS) ] = -1,
433 [ C(RESULT_MISS) ] = -1,
434 },
435 [ C(OP_PREFETCH) ] = {
436 [ C(RESULT_ACCESS) ] = -1,
437 [ C(RESULT_MISS) ] = -1,
438 },
439 },
440 [ C(BPU ) ] = {
441 [ C(OP_READ) ] = {
442 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
443 [ C(RESULT_MISS) ] = 0x00c5, /* BR_INST_RETIRED.MISPRED */
444 },
445 [ C(OP_WRITE) ] = {
446 [ C(RESULT_ACCESS) ] = -1,
447 [ C(RESULT_MISS) ] = -1,
448 },
449 [ C(OP_PREFETCH) ] = {
450 [ C(RESULT_ACCESS) ] = -1,
451 [ C(RESULT_MISS) ] = -1,
452 },
453 },
454};
455
456static void intel_pmu_disable_all(void)
457{
458 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
459
460 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
461
462 if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask))
463 intel_pmu_disable_bts();
464
465 intel_pmu_pebs_disable_all();
466 intel_pmu_lbr_disable_all();
467}
468
469static void intel_pmu_enable_all(int added)
470{
471 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
472
473 intel_pmu_pebs_enable_all();
474 intel_pmu_lbr_enable_all();
475 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
476
477 if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
478 struct perf_event *event =
479 cpuc->events[X86_PMC_IDX_FIXED_BTS];
480
481 if (WARN_ON_ONCE(!event))
482 return;
483
484 intel_pmu_enable_bts(event->hw.config);
485 }
486}
487
488/*
489 * Workaround for:
490 * Intel Errata AAK100 (model 26)
491 * Intel Errata AAP53 (model 30)
492 * Intel Errata BD53 (model 44)
493 *
494 * The official story:
495 * These chips need to be 'reset' when adding counters by programming the
496 * magic three (non-counting) events 0x4300B5, 0x4300D2, and 0x4300B1 either
497 * in sequence on the same PMC or on different PMCs.
498 *
499 * In practice it appears some of these events do in fact count, and
500 * we need to program all 4 events.
501 */
502static void intel_pmu_nhm_workaround(void)
503{
504 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
505 static const unsigned long nhm_magic[4] = {
506 0x4300B5,
507 0x4300D2,
508 0x4300B1,
509 0x4300B1
510 };
511 struct perf_event *event;
512 int i;
513
514 /*
515 * The errata require the following steps:
516 * 1) Clear MSR_IA32_PEBS_ENABLE and MSR_CORE_PERF_GLOBAL_CTRL;
517 * 2) Configure 4 PERFEVTSELx with the magic events and clear
518 * the corresponding PMCx;
519 * 3) set bit0~bit3 of MSR_CORE_PERF_GLOBAL_CTRL;
520 * 4) Clear MSR_CORE_PERF_GLOBAL_CTRL;
521 * 5) Clear 4 pairs of PERFEVTSELx and PMCx;
522 */
523
524 /*
525 * The real steps we choose are a little different from above.
526 * A) To reduce MSR operations, we don't run step 1) since the MSRs
527 * are already cleared before this function is called;
528 * B) Call x86_perf_event_update to save PMCx before configuring
529 * PERFEVTSELx with the magic number;
530 * C) For step 5), we clear a PERFEVTSELx only when it is not
531 * currently in use;
532 * D) Call x86_perf_event_set_period to restore PMCx;
533 */
534
535 /* We always operate 4 pairs of PERF Counters */
536 for (i = 0; i < 4; i++) {
537 event = cpuc->events[i];
538 if (event)
539 x86_perf_event_update(event);
540 }
541
542 for (i = 0; i < 4; i++) {
543 wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, nhm_magic[i]);
544 wrmsrl(MSR_ARCH_PERFMON_PERFCTR0 + i, 0x0);
545 }
546
547 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0xf);
548 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x0);
549
550 for (i = 0; i < 4; i++) {
551 event = cpuc->events[i];
552
553 if (event) {
554 x86_perf_event_set_period(event);
555 __x86_pmu_enable_event(&event->hw,
556 ARCH_PERFMON_EVENTSEL_ENABLE);
557 } else
558 wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, 0x0);
559 }
560}
561
562static void intel_pmu_nhm_enable_all(int added)
563{
564 if (added)
565 intel_pmu_nhm_workaround();
566 intel_pmu_enable_all(added);
567}
568
569static inline u64 intel_pmu_get_status(void)
570{
571 u64 status;
572
573 rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
574
575 return status;
576}
577
578static inline void intel_pmu_ack_status(u64 ack)
579{
580 wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
581}
582
583static void intel_pmu_disable_fixed(struct hw_perf_event *hwc)
584{
585 int idx = hwc->idx - X86_PMC_IDX_FIXED;
586 u64 ctrl_val, mask;
587
588 mask = 0xfULL << (idx * 4);
589
590 rdmsrl(hwc->config_base, ctrl_val);
591 ctrl_val &= ~mask;
592 wrmsrl(hwc->config_base, ctrl_val);
593}
594
595static void intel_pmu_disable_event(struct perf_event *event)
596{
597 struct hw_perf_event *hwc = &event->hw;
598
599 if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) {
600 intel_pmu_disable_bts();
601 intel_pmu_drain_bts_buffer();
602 return;
603 }
604
605 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
606 intel_pmu_disable_fixed(hwc);
607 return;
608 }
609
610 x86_pmu_disable_event(event);
611
612 if (unlikely(event->attr.precise_ip))
613 intel_pmu_pebs_disable(event);
614}
615
616static void intel_pmu_enable_fixed(struct hw_perf_event *hwc)
617{
618 int idx = hwc->idx - X86_PMC_IDX_FIXED;
619 u64 ctrl_val, bits, mask;
620
621 /*
622 * Enable IRQ generation (0x8),
623 * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
624 * if requested:
625 */
626 bits = 0x8ULL;
627 if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
628 bits |= 0x2;
629 if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
630 bits |= 0x1;
631
632 /*
633 * ANY bit is supported in v3 and up
634 */
635 if (x86_pmu.version > 2 && hwc->config & ARCH_PERFMON_EVENTSEL_ANY)
636 bits |= 0x4;
637
638 bits <<= (idx * 4);
639 mask = 0xfULL << (idx * 4);
640
641 rdmsrl(hwc->config_base, ctrl_val);
642 ctrl_val &= ~mask;
643 ctrl_val |= bits;
644 wrmsrl(hwc->config_base, ctrl_val);
645}
646
647static void intel_pmu_enable_event(struct perf_event *event)
648{
649 struct hw_perf_event *hwc = &event->hw;
650
651 if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) {
652 if (!__get_cpu_var(cpu_hw_events).enabled)
653 return;
654
655 intel_pmu_enable_bts(hwc->config);
656 return;
657 }
658
659 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
660 intel_pmu_enable_fixed(hwc);
661 return;
662 }
663
664 if (unlikely(event->attr.precise_ip))
665 intel_pmu_pebs_enable(event);
666
667 __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
668}
669
670/*
671 * Save and restart an expired event. Called by NMI contexts,
672 * so it has to be careful about preempting normal event ops:
673 */
674static int intel_pmu_save_and_restart(struct perf_event *event)
675{
676 x86_perf_event_update(event);
677 return x86_perf_event_set_period(event);
678}
679
680static void intel_pmu_reset(void)
681{
682 struct debug_store *ds = __get_cpu_var(cpu_hw_events).ds;
683 unsigned long flags;
684 int idx;
685
686 if (!x86_pmu.num_counters)
687 return;
688
689 local_irq_save(flags);
690
691 printk("clearing PMU state on CPU#%d\n", smp_processor_id());
692
693 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
694 checking_wrmsrl(x86_pmu.eventsel + idx, 0ull);
695 checking_wrmsrl(x86_pmu.perfctr + idx, 0ull);
696 }
697 for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++)
698 checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
699
700 if (ds)
701 ds->bts_index = ds->bts_buffer_base;
702
703 local_irq_restore(flags);
704}
705
706/*
707 * This handler is triggered by the local APIC, so the APIC IRQ handling
708 * rules apply:
709 */
710static int intel_pmu_handle_irq(struct pt_regs *regs)
711{
712 struct perf_sample_data data;
713 struct cpu_hw_events *cpuc;
714 int bit, loops;
715 u64 status;
716 int handled = 0;
717
718 perf_sample_data_init(&data, 0);
719
720 cpuc = &__get_cpu_var(cpu_hw_events);
721
722 intel_pmu_disable_all();
723 intel_pmu_drain_bts_buffer();
724 status = intel_pmu_get_status();
725 if (!status) {
726 intel_pmu_enable_all(0);
727 return 0;
728 }
729
730 loops = 0;
731again:
732 intel_pmu_ack_status(status);
733 if (++loops > 100) {
734 WARN_ONCE(1, "perfevents: irq loop stuck!\n");
735 perf_event_print_debug();
736 intel_pmu_reset();
737 goto done;
738 }
739
740 inc_irq_stat(apic_perf_irqs);
741
742 intel_pmu_lbr_read();
743
744 /*
745 * PEBS overflow sets bit 62 in the global status register
746 */
747 if (__test_and_clear_bit(62, (unsigned long *)&status)) {
748 handled++;
749 x86_pmu.drain_pebs(regs);
750 }
751
752 for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
753 struct perf_event *event = cpuc->events[bit];
754
755 handled++;
756
757 if (!test_bit(bit, cpuc->active_mask))
758 continue;
759
760 if (!intel_pmu_save_and_restart(event))
761 continue;
762
763 data.period = event->hw.last_period;
764
765 if (perf_event_overflow(event, 1, &data, regs))
766 x86_pmu_stop(event);
767 }
768
769 /*
770 * Repeat if there is more work to be done:
771 */
772 status = intel_pmu_get_status();
773 if (status)
774 goto again;
775
776done:
777 intel_pmu_enable_all(0);
778 return handled;
779}
780
781static struct event_constraint *
782intel_bts_constraints(struct perf_event *event)
783{
784 struct hw_perf_event *hwc = &event->hw;
785 unsigned int hw_event, bts_event;
786
787 hw_event = hwc->config & INTEL_ARCH_EVENT_MASK;
788 bts_event = x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS);
789
790 if (unlikely(hw_event == bts_event && hwc->sample_period == 1))
791 return &bts_constraint;
792
793 return NULL;
794}
795
796static struct event_constraint *
797intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
798{
799 struct event_constraint *c;
800
801 c = intel_bts_constraints(event);
802 if (c)
803 return c;
804
805 c = intel_pebs_constraints(event);
806 if (c)
807 return c;
808
809 return x86_get_event_constraints(cpuc, event);
810}
811
812static int intel_pmu_hw_config(struct perf_event *event)
813{
814 int ret = x86_pmu_hw_config(event);
815
816 if (ret)
817 return ret;
818
819 if (event->attr.type != PERF_TYPE_RAW)
820 return 0;
821
822 if (!(event->attr.config & ARCH_PERFMON_EVENTSEL_ANY))
823 return 0;
824
825 if (x86_pmu.version < 3)
826 return -EINVAL;
827
828 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
829 return -EACCES;
830
831 event->hw.config |= ARCH_PERFMON_EVENTSEL_ANY;
832
833 return 0;
834}
835
836static __initconst const struct x86_pmu core_pmu = {
837 .name = "core",
838 .handle_irq = x86_pmu_handle_irq,
839 .disable_all = x86_pmu_disable_all,
840 .enable_all = x86_pmu_enable_all,
841 .enable = x86_pmu_enable_event,
842 .disable = x86_pmu_disable_event,
843 .hw_config = x86_pmu_hw_config,
844 .schedule_events = x86_schedule_events,
845 .eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
846 .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
847 .event_map = intel_pmu_event_map,
848 .max_events = ARRAY_SIZE(intel_perfmon_event_map),
849 .apic = 1,
850 /*
851 * Intel PMCs cannot be accessed sanely above 32 bit width,
852 * so we install an artificial 1<<31 period regardless of
853 * the generic event period:
854 */
855 .max_period = (1ULL << 31) - 1,
856 .get_event_constraints = intel_get_event_constraints,
857 .event_constraints = intel_core_event_constraints,
858};
859
860static void intel_pmu_cpu_starting(int cpu)
861{
862 init_debug_store_on_cpu(cpu);
863 /*
864 * Deal with CPUs that don't clear their LBRs on power-up.
865 */
866 intel_pmu_lbr_reset();
867}
868
869static void intel_pmu_cpu_dying(int cpu)
870{
871 fini_debug_store_on_cpu(cpu);
872}
873
874static __initconst const struct x86_pmu intel_pmu = {
875 .name = "Intel",
876 .handle_irq = intel_pmu_handle_irq,
877 .disable_all = intel_pmu_disable_all,
878 .enable_all = intel_pmu_enable_all,
879 .enable = intel_pmu_enable_event,
880 .disable = intel_pmu_disable_event,
881 .hw_config = intel_pmu_hw_config,
882 .schedule_events = x86_schedule_events,
883 .eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
884 .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
885 .event_map = intel_pmu_event_map,
886 .max_events = ARRAY_SIZE(intel_perfmon_event_map),
887 .apic = 1,
888 /*
889 * Intel PMCs cannot be accessed sanely above 32 bit width,
890 * so we install an artificial 1<<31 period regardless of
891 * the generic event period:
892 */
893 .max_period = (1ULL << 31) - 1,
894 .get_event_constraints = intel_get_event_constraints,
895
896 .cpu_starting = intel_pmu_cpu_starting,
897 .cpu_dying = intel_pmu_cpu_dying,
898};
899
900static void intel_clovertown_quirks(void)
901{
902 /*
903 * PEBS is unreliable due to:
904 *
905 * AJ67 - PEBS may experience CPL leaks
906 * AJ68 - PEBS PMI may be delayed by one event
907 * AJ69 - GLOBAL_STATUS[62] will only be set when DEBUGCTL[12] is set
908 * AJ106 - FREEZE_LBRS_ON_PMI doesn't work in combination with PEBS
909 *
910 * AJ67 could be worked around by restricting the OS/USR flags.
911 * AJ69 could be worked around by setting PMU_FREEZE_ON_PMI.
912 *
913 * AJ106 could possibly be worked around by not allowing LBR
914 * usage from PEBS, including the fixup.
915 * AJ68 could possibly be worked around by always programming
916 * a pebs_event_reset[0] value and coping with the lost events.
917 *
918 * But taken together it might just make sense to not enable PEBS on
919 * these chips.
920 */
921 printk(KERN_WARNING "PEBS disabled due to CPU errata.\n");
922 x86_pmu.pebs = 0;
923 x86_pmu.pebs_constraints = NULL;
924}
925
926static __init int intel_pmu_init(void)
927{
928 union cpuid10_edx edx;
929 union cpuid10_eax eax;
930 unsigned int unused;
931 unsigned int ebx;
932 int version;
933
934 if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
935 switch (boot_cpu_data.x86) {
936 case 0x6:
937 return p6_pmu_init();
938 case 0xf:
939 return p4_pmu_init();
940 }
941 return -ENODEV;
942 }
943
944 /*
945 * Check whether the Architectural PerfMon supports
946 * Branch Misses Retired hw_event or not.
947 */
948 cpuid(10, &eax.full, &ebx, &unused, &edx.full);
949 if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
950 return -ENODEV;
951
952 version = eax.split.version_id;
953 if (version < 2)
954 x86_pmu = core_pmu;
955 else
956 x86_pmu = intel_pmu;
957
958 x86_pmu.version = version;
959 x86_pmu.num_counters = eax.split.num_counters;
960 x86_pmu.cntval_bits = eax.split.bit_width;
961 x86_pmu.cntval_mask = (1ULL << eax.split.bit_width) - 1;
962
963 /*
964 * Quirk: v2 perfmon does not report fixed-purpose events, so
965 * assume at least 3 events:
966 */
967 if (version > 1)
968 x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3);
969
970 /*
971 * v2 and above have a perf capabilities MSR
972 */
973 if (version > 1) {
974 u64 capabilities;
975
976 rdmsrl(MSR_IA32_PERF_CAPABILITIES, capabilities);
977 x86_pmu.intel_cap.capabilities = capabilities;
978 }
979
980 intel_ds_init();
981
982 /*
983 * Install the hw-cache-events table:
984 */
985 switch (boot_cpu_data.x86_model) {
986 case 14: /* 65 nm core solo/duo, "Yonah" */
987 pr_cont("Core events, ");
988 break;
989
990 case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
991 x86_pmu.quirks = intel_clovertown_quirks;
992 case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
993 case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
994 case 29: /* six-core 45 nm xeon "Dunnington" */
995 memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
996 sizeof(hw_cache_event_ids));
997
998 intel_pmu_lbr_init_core();
999
1000 x86_pmu.event_constraints = intel_core2_event_constraints;
1001 pr_cont("Core2 events, ");
1002 break;
1003
1004 case 26: /* 45 nm nehalem, "Bloomfield" */
1005 case 30: /* 45 nm nehalem, "Lynnfield" */
1006 case 46: /* 45 nm nehalem-ex, "Beckton" */
1007 memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
1008 sizeof(hw_cache_event_ids));
1009
1010 intel_pmu_lbr_init_nhm();
1011
1012 x86_pmu.event_constraints = intel_nehalem_event_constraints;
1013 x86_pmu.enable_all = intel_pmu_nhm_enable_all;
1014 pr_cont("Nehalem events, ");
1015 break;
1016
1017 case 28: /* Atom */
1018 memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
1019 sizeof(hw_cache_event_ids));
1020
1021 intel_pmu_lbr_init_atom();
1022
1023 x86_pmu.event_constraints = intel_gen_event_constraints;
1024 pr_cont("Atom events, ");
1025 break;
1026
1027 case 37: /* 32 nm nehalem, "Clarkdale" */
1028 case 44: /* 32 nm nehalem, "Gulftown" */
1029 memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids,
1030 sizeof(hw_cache_event_ids));
1031
1032 intel_pmu_lbr_init_nhm();
1033
1034 x86_pmu.event_constraints = intel_westmere_event_constraints;
1035 x86_pmu.enable_all = intel_pmu_nhm_enable_all;
1036 pr_cont("Westmere events, ");
1037 break;
1038
1039 default:
1040 /*
1041 * default constraints for v2 and up
1042 */
1043 x86_pmu.event_constraints = intel_gen_event_constraints;
1044 pr_cont("generic architected perfmon, ");
1045 }
1046 return 0;
1047}
1048
1049#else /* CONFIG_CPU_SUP_INTEL */
1050
1051static int intel_pmu_init(void)
1052{
1053 return 0;
1054}
1055
1056#endif /* CONFIG_CPU_SUP_INTEL */
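Annotation: the per-counter nibble that intel_pmu_enable_fixed() above writes into MSR_ARCH_PERFMON_FIXED_CTR_CTRL can be reproduced in isolation. The following is a small userspace sketch based on the bit meanings described in that function; fixed_ctrl_bits() is a hypothetical helper invented for this illustration, not a kernel function.

/* Worked example of the MSR_ARCH_PERFMON_FIXED_CTR_CTRL nibble layout; not kernel code. */
#include <stdio.h>
#include <stdint.h>

/* Compose the 4-bit control field for fixed counter 'idx'. */
static uint64_t fixed_ctrl_bits(int idx, int usr, int os, int any)
{
	uint64_t bits = 0x8;		/* bit 3: enable PMI on overflow */

	if (usr)
		bits |= 0x2;		/* bit 1: count in ring 3 */
	if (os)
		bits |= 0x1;		/* bit 0: count in ring 0 */
	if (any)
		bits |= 0x4;		/* bit 2: ANY-thread, perfmon v3 and up */

	return bits << (idx * 4);	/* each fixed counter owns one nibble */
}

int main(void)
{
	/* fixed counter 1 (CPU_CLK_UNHALTED.CORE), user + kernel counting */
	printf("ctrl |= %#llx\n",
	       (unsigned long long)fixed_ctrl_bits(1, 1, 1, 0));
	return 0;
}

The resulting 0xb0 ORed into the control MSR arms fixed counter 1 for user and kernel counting with a PMI on overflow, mirroring the ctrl_val read-modify-write done in the kernel function.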
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
new file mode 100644
index 000000000000..18018d1311cd
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -0,0 +1,641 @@
1#ifdef CONFIG_CPU_SUP_INTEL
2
3/* The maximal number of PEBS events: */
4#define MAX_PEBS_EVENTS 4
5
6/* The size of a BTS record in bytes: */
7#define BTS_RECORD_SIZE 24
8
9#define BTS_BUFFER_SIZE (PAGE_SIZE << 4)
10#define PEBS_BUFFER_SIZE PAGE_SIZE
11
12/*
13 * pebs_record_32 for p4 and core not supported
14
15struct pebs_record_32 {
16 u32 flags, ip;
17 u32 ax, bc, cx, dx;
18 u32 si, di, bp, sp;
19};
20
21 */
22
23struct pebs_record_core {
24 u64 flags, ip;
25 u64 ax, bx, cx, dx;
26 u64 si, di, bp, sp;
27 u64 r8, r9, r10, r11;
28 u64 r12, r13, r14, r15;
29};
30
31struct pebs_record_nhm {
32 u64 flags, ip;
33 u64 ax, bx, cx, dx;
34 u64 si, di, bp, sp;
35 u64 r8, r9, r10, r11;
36 u64 r12, r13, r14, r15;
37 u64 status, dla, dse, lat;
38};
39
40/*
41 * A debug store configuration.
42 *
43 * We only support architectures that use 64bit fields.
44 */
45struct debug_store {
46 u64 bts_buffer_base;
47 u64 bts_index;
48 u64 bts_absolute_maximum;
49 u64 bts_interrupt_threshold;
50 u64 pebs_buffer_base;
51 u64 pebs_index;
52 u64 pebs_absolute_maximum;
53 u64 pebs_interrupt_threshold;
54 u64 pebs_event_reset[MAX_PEBS_EVENTS];
55};
56
57static void init_debug_store_on_cpu(int cpu)
58{
59 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
60
61 if (!ds)
62 return;
63
64 wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA,
65 (u32)((u64)(unsigned long)ds),
66 (u32)((u64)(unsigned long)ds >> 32));
67}
68
69static void fini_debug_store_on_cpu(int cpu)
70{
71 if (!per_cpu(cpu_hw_events, cpu).ds)
72 return;
73
74 wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
75}
76
77static void release_ds_buffers(void)
78{
79 int cpu;
80
81 if (!x86_pmu.bts && !x86_pmu.pebs)
82 return;
83
84 get_online_cpus();
85
86 for_each_online_cpu(cpu)
87 fini_debug_store_on_cpu(cpu);
88
89 for_each_possible_cpu(cpu) {
90 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
91
92 if (!ds)
93 continue;
94
95 per_cpu(cpu_hw_events, cpu).ds = NULL;
96
97 kfree((void *)(unsigned long)ds->pebs_buffer_base);
98 kfree((void *)(unsigned long)ds->bts_buffer_base);
99 kfree(ds);
100 }
101
102 put_online_cpus();
103}
104
105static int reserve_ds_buffers(void)
106{
107 int cpu, err = 0;
108
109 if (!x86_pmu.bts && !x86_pmu.pebs)
110 return 0;
111
112 get_online_cpus();
113
114 for_each_possible_cpu(cpu) {
115 struct debug_store *ds;
116 void *buffer;
117 int max, thresh;
118
119 err = -ENOMEM;
120 ds = kzalloc(sizeof(*ds), GFP_KERNEL);
121 if (unlikely(!ds))
122 break;
123 per_cpu(cpu_hw_events, cpu).ds = ds;
124
125 if (x86_pmu.bts) {
126 buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL);
127 if (unlikely(!buffer))
128 break;
129
130 max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;
131 thresh = max / 16;
132
133 ds->bts_buffer_base = (u64)(unsigned long)buffer;
134 ds->bts_index = ds->bts_buffer_base;
135 ds->bts_absolute_maximum = ds->bts_buffer_base +
136 max * BTS_RECORD_SIZE;
137 ds->bts_interrupt_threshold = ds->bts_absolute_maximum -
138 thresh * BTS_RECORD_SIZE;
139 }
140
141 if (x86_pmu.pebs) {
142 buffer = kzalloc(PEBS_BUFFER_SIZE, GFP_KERNEL);
143 if (unlikely(!buffer))
144 break;
145
146 max = PEBS_BUFFER_SIZE / x86_pmu.pebs_record_size;
147
148 ds->pebs_buffer_base = (u64)(unsigned long)buffer;
149 ds->pebs_index = ds->pebs_buffer_base;
150 ds->pebs_absolute_maximum = ds->pebs_buffer_base +
151 max * x86_pmu.pebs_record_size;
152 /*
153 * Always use single record PEBS
154 */
155 ds->pebs_interrupt_threshold = ds->pebs_buffer_base +
156 x86_pmu.pebs_record_size;
157 }
158
159 err = 0;
160 }
161
162 if (err)
163 release_ds_buffers();
164 else {
165 for_each_online_cpu(cpu)
166 init_debug_store_on_cpu(cpu);
167 }
168
169 put_online_cpus();
170
171 return err;
172}
173
174/*
175 * BTS
176 */
177
178static struct event_constraint bts_constraint =
179 EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_FIXED_BTS, 0);
180
181static void intel_pmu_enable_bts(u64 config)
182{
183 unsigned long debugctlmsr;
184
185 debugctlmsr = get_debugctlmsr();
186
187 debugctlmsr |= DEBUGCTLMSR_TR;
188 debugctlmsr |= DEBUGCTLMSR_BTS;
189 debugctlmsr |= DEBUGCTLMSR_BTINT;
190
191 if (!(config & ARCH_PERFMON_EVENTSEL_OS))
192 debugctlmsr |= DEBUGCTLMSR_BTS_OFF_OS;
193
194 if (!(config & ARCH_PERFMON_EVENTSEL_USR))
195 debugctlmsr |= DEBUGCTLMSR_BTS_OFF_USR;
196
197 update_debugctlmsr(debugctlmsr);
198}
199
200static void intel_pmu_disable_bts(void)
201{
202 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
203 unsigned long debugctlmsr;
204
205 if (!cpuc->ds)
206 return;
207
208 debugctlmsr = get_debugctlmsr();
209
210 debugctlmsr &=
211 ~(DEBUGCTLMSR_TR | DEBUGCTLMSR_BTS | DEBUGCTLMSR_BTINT |
212 DEBUGCTLMSR_BTS_OFF_OS | DEBUGCTLMSR_BTS_OFF_USR);
213
214 update_debugctlmsr(debugctlmsr);
215}
216
217static void intel_pmu_drain_bts_buffer(void)
218{
219 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
220 struct debug_store *ds = cpuc->ds;
221 struct bts_record {
222 u64 from;
223 u64 to;
224 u64 flags;
225 };
226 struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS];
227 struct bts_record *at, *top;
228 struct perf_output_handle handle;
229 struct perf_event_header header;
230 struct perf_sample_data data;
231 struct pt_regs regs;
232
233 if (!event)
234 return;
235
236 if (!ds)
237 return;
238
239 at = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
240 top = (struct bts_record *)(unsigned long)ds->bts_index;
241
242 if (top <= at)
243 return;
244
245 ds->bts_index = ds->bts_buffer_base;
246
247 perf_sample_data_init(&data, 0);
248 data.period = event->hw.last_period;
249 regs.ip = 0;
250
251 /*
252 * Prepare a generic sample, i.e. fill in the invariant fields.
253 * We will overwrite the from and to address before we output
254 * the sample.
255 */
256 perf_prepare_sample(&header, &data, event, &regs);
257
258 if (perf_output_begin(&handle, event, header.size * (top - at), 1, 1))
259 return;
260
261 for (; at < top; at++) {
262 data.ip = at->from;
263 data.addr = at->to;
264
265 perf_output_sample(&handle, &header, &data, event);
266 }
267
268 perf_output_end(&handle);
269
270 /* There's new data available. */
271 event->hw.interrupts++;
272 event->pending_kill = POLL_IN;
273}
274
275/*
276 * PEBS
277 */
278
279static struct event_constraint intel_core_pebs_events[] = {
280 PEBS_EVENT_CONSTRAINT(0x00c0, 0x1), /* INSTR_RETIRED.ANY */
281 PEBS_EVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */
282 PEBS_EVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */
283 PEBS_EVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETIRED.ANY */
284 PEBS_EVENT_CONSTRAINT(0x01cb, 0x1), /* MEM_LOAD_RETIRED.L1D_MISS */
285 PEBS_EVENT_CONSTRAINT(0x02cb, 0x1), /* MEM_LOAD_RETIRED.L1D_LINE_MISS */
286 PEBS_EVENT_CONSTRAINT(0x04cb, 0x1), /* MEM_LOAD_RETIRED.L2_MISS */
287 PEBS_EVENT_CONSTRAINT(0x08cb, 0x1), /* MEM_LOAD_RETIRED.L2_LINE_MISS */
288 PEBS_EVENT_CONSTRAINT(0x10cb, 0x1), /* MEM_LOAD_RETIRED.DTLB_MISS */
289 EVENT_CONSTRAINT_END
290};
291
292static struct event_constraint intel_nehalem_pebs_events[] = {
293 PEBS_EVENT_CONSTRAINT(0x00c0, 0xf), /* INSTR_RETIRED.ANY */
294 PEBS_EVENT_CONSTRAINT(0xfec1, 0xf), /* X87_OPS_RETIRED.ANY */
295 PEBS_EVENT_CONSTRAINT(0x00c5, 0xf), /* BR_INST_RETIRED.MISPRED */
296 PEBS_EVENT_CONSTRAINT(0x1fc7, 0xf), /* SIMD_INST_RETIRED.ANY */
297 PEBS_EVENT_CONSTRAINT(0x01cb, 0xf), /* MEM_LOAD_RETIRED.L1D_MISS */
298 PEBS_EVENT_CONSTRAINT(0x02cb, 0xf), /* MEM_LOAD_RETIRED.L1D_LINE_MISS */
299 PEBS_EVENT_CONSTRAINT(0x04cb, 0xf), /* MEM_LOAD_RETIRED.L2_MISS */
300 PEBS_EVENT_CONSTRAINT(0x08cb, 0xf), /* MEM_LOAD_RETIRED.L2_LINE_MISS */
301 PEBS_EVENT_CONSTRAINT(0x10cb, 0xf), /* MEM_LOAD_RETIRED.DTLB_MISS */
302 EVENT_CONSTRAINT_END
303};
304
305static struct event_constraint *
306intel_pebs_constraints(struct perf_event *event)
307{
308 struct event_constraint *c;
309
310 if (!event->attr.precise_ip)
311 return NULL;
312
313 if (x86_pmu.pebs_constraints) {
314 for_each_event_constraint(c, x86_pmu.pebs_constraints) {
315 if ((event->hw.config & c->cmask) == c->code)
316 return c;
317 }
318 }
319
320 return &emptyconstraint;
321}
322
323static void intel_pmu_pebs_enable(struct perf_event *event)
324{
325 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
326 struct hw_perf_event *hwc = &event->hw;
327
328 hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;
329
330 cpuc->pebs_enabled |= 1ULL << hwc->idx;
331 WARN_ON_ONCE(cpuc->enabled);
332
333 if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1)
334 intel_pmu_lbr_enable(event);
335}
336
337static void intel_pmu_pebs_disable(struct perf_event *event)
338{
339 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
340 struct hw_perf_event *hwc = &event->hw;
341
342 cpuc->pebs_enabled &= ~(1ULL << hwc->idx);
343 if (cpuc->enabled)
344 wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
345
346 hwc->config |= ARCH_PERFMON_EVENTSEL_INT;
347
348 if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1)
349 intel_pmu_lbr_disable(event);
350}
351
352static void intel_pmu_pebs_enable_all(void)
353{
354 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
355
356 if (cpuc->pebs_enabled)
357 wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
358}
359
360static void intel_pmu_pebs_disable_all(void)
361{
362 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
363
364 if (cpuc->pebs_enabled)
365 wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
366}
367
368#include <asm/insn.h>
369
370static inline bool kernel_ip(unsigned long ip)
371{
372#ifdef CONFIG_X86_32
373 return ip > PAGE_OFFSET;
374#else
375 return (long)ip < 0;
376#endif
377}
378
379static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
380{
381 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
382 unsigned long from = cpuc->lbr_entries[0].from;
383 unsigned long old_to, to = cpuc->lbr_entries[0].to;
384 unsigned long ip = regs->ip;
385
386 /*
387 * We don't need to fix up if the PEBS assist is fault-like
388 */
389 if (!x86_pmu.intel_cap.pebs_trap)
390 return 1;
391
392 /*
393 * No LBR entry, no basic block, no rewinding
394 */
395 if (!cpuc->lbr_stack.nr || !from || !to)
396 return 0;
397
398 /*
399 * Basic blocks should never cross user/kernel boundaries
400 */
401 if (kernel_ip(ip) != kernel_ip(to))
402 return 0;
403
404 /*
405 * unsigned math: either ip is before the start (impossible) or
406 * the basic block is larger than 1 page (sanity)
407 */
408 if ((ip - to) > PAGE_SIZE)
409 return 0;
410
411 /*
412 * We sampled a branch insn, rewind using the LBR stack
413 */
414 if (ip == to) {
415 regs->ip = from;
416 return 1;
417 }
418
419 do {
420 struct insn insn;
421 u8 buf[MAX_INSN_SIZE];
422 void *kaddr;
423
424 old_to = to;
425 if (!kernel_ip(ip)) {
426 int bytes, size = MAX_INSN_SIZE;
427
428 bytes = copy_from_user_nmi(buf, (void __user *)to, size);
429 if (bytes != size)
430 return 0;
431
432 kaddr = buf;
433 } else
434 kaddr = (void *)to;
435
436 kernel_insn_init(&insn, kaddr);
437 insn_get_length(&insn);
438 to += insn.length;
439 } while (to < ip);
440
441 if (to == ip) {
442 regs->ip = old_to;
443 return 1;
444 }
445
446 /*
447 * Even though we decoded the basic block, the instruction stream
448 * never matched the given IP; either the TO or the IP got corrupted.
449 */
450 return 0;
451}
452
453static int intel_pmu_save_and_restart(struct perf_event *event);
454
455static void __intel_pmu_pebs_event(struct perf_event *event,
456 struct pt_regs *iregs, void *__pebs)
457{
458 /*
459 * We cast to pebs_record_core since that is a subset of
460 * both formats and we don't use the other fields in this
461 * routine.
462 */
463 struct pebs_record_core *pebs = __pebs;
464 struct perf_sample_data data;
465 struct pt_regs regs;
466
467 if (!intel_pmu_save_and_restart(event))
468 return;
469
470 perf_sample_data_init(&data, 0);
471 data.period = event->hw.last_period;
472
473 /*
474 * We use the interrupt regs as a base because the PEBS record
475 * does not contain a full regs set; specifically, it seems to
476 * lack segment descriptors, which get used by things like
477 * user_mode().
478 *
479 * In the simple case fix up only the IP and BP,SP regs, for
480 * PERF_SAMPLE_IP and PERF_SAMPLE_CALLCHAIN to function properly.
481 * A possible PERF_SAMPLE_REGS will have to transfer all regs.
482 */
483 regs = *iregs;
484 regs.ip = pebs->ip;
485 regs.bp = pebs->bp;
486 regs.sp = pebs->sp;
487
488 if (event->attr.precise_ip > 1 && intel_pmu_pebs_fixup_ip(&regs))
489 regs.flags |= PERF_EFLAGS_EXACT;
490 else
491 regs.flags &= ~PERF_EFLAGS_EXACT;
492
493 if (perf_event_overflow(event, 1, &data, &regs))
494 x86_pmu_stop(event);
495}
496
497static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
498{
499 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
500 struct debug_store *ds = cpuc->ds;
501 struct perf_event *event = cpuc->events[0]; /* PMC0 only */
502 struct pebs_record_core *at, *top;
503 int n;
504
505 if (!ds || !x86_pmu.pebs)
506 return;
507
508 at = (struct pebs_record_core *)(unsigned long)ds->pebs_buffer_base;
509 top = (struct pebs_record_core *)(unsigned long)ds->pebs_index;
510
511 /*
512 * Whatever else happens, drain the thing
513 */
514 ds->pebs_index = ds->pebs_buffer_base;
515
516 if (!test_bit(0, cpuc->active_mask))
517 return;
518
519 WARN_ON_ONCE(!event);
520
521 if (!event->attr.precise_ip)
522 return;
523
524 n = top - at;
525 if (n <= 0)
526 return;
527
528 /*
529 * Should not happen, we program the threshold at 1 and do not
530 * set a reset value.
531 */
532 WARN_ON_ONCE(n > 1);
533 at += n - 1;
534
535 __intel_pmu_pebs_event(event, iregs, at);
536}
537
538static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
539{
540 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
541 struct debug_store *ds = cpuc->ds;
542 struct pebs_record_nhm *at, *top;
543 struct perf_event *event = NULL;
544 u64 status = 0;
545 int bit, n;
546
547 if (!ds || !x86_pmu.pebs)
548 return;
549
550 at = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base;
551 top = (struct pebs_record_nhm *)(unsigned long)ds->pebs_index;
552
553 ds->pebs_index = ds->pebs_buffer_base;
554
555 n = top - at;
556 if (n <= 0)
557 return;
558
559 /*
560 * Should not happen, we program the threshold at 1 and do not
561 * set a reset value.
562 */
563 WARN_ON_ONCE(n > MAX_PEBS_EVENTS);
564
565 for ( ; at < top; at++) {
566 for_each_set_bit(bit, (unsigned long *)&at->status, MAX_PEBS_EVENTS) {
567 event = cpuc->events[bit];
568 if (!test_bit(bit, cpuc->active_mask))
569 continue;
570
571 WARN_ON_ONCE(!event);
572
573 if (!event->attr.precise_ip)
574 continue;
575
576 if (__test_and_set_bit(bit, (unsigned long *)&status))
577 continue;
578
579 break;
580 }
581
582 if (!event || bit >= MAX_PEBS_EVENTS)
583 continue;
584
585 __intel_pmu_pebs_event(event, iregs, at);
586 }
587}
588
589/*
590 * BTS, PEBS probe and setup
591 */
592
593static void intel_ds_init(void)
594{
595 /*
596 * No support for 32bit formats
597 */
598 if (!boot_cpu_has(X86_FEATURE_DTES64))
599 return;
600
601 x86_pmu.bts = boot_cpu_has(X86_FEATURE_BTS);
602 x86_pmu.pebs = boot_cpu_has(X86_FEATURE_PEBS);
603 if (x86_pmu.pebs) {
604 char pebs_type = x86_pmu.intel_cap.pebs_trap ? '+' : '-';
605 int format = x86_pmu.intel_cap.pebs_format;
606
607 switch (format) {
608 case 0:
609 printk(KERN_CONT "PEBS fmt0%c, ", pebs_type);
610 x86_pmu.pebs_record_size = sizeof(struct pebs_record_core);
611 x86_pmu.drain_pebs = intel_pmu_drain_pebs_core;
612 x86_pmu.pebs_constraints = intel_core_pebs_events;
613 break;
614
615 case 1:
616 printk(KERN_CONT "PEBS fmt1%c, ", pebs_type);
617 x86_pmu.pebs_record_size = sizeof(struct pebs_record_nhm);
618 x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
619 x86_pmu.pebs_constraints = intel_nehalem_pebs_events;
620 break;
621
622 default:
623 printk(KERN_CONT "no PEBS fmt%d%c, ", format, pebs_type);
624 x86_pmu.pebs = 0;
625 break;
626 }
627 }
628}
629
630#else /* CONFIG_CPU_SUP_INTEL */
631
632static int reserve_ds_buffers(void)
633{
634 return 0;
635}
636
637static void release_ds_buffers(void)
638{
639}
640
641#endif /* CONFIG_CPU_SUP_INTEL */
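Annotation: the buffer arithmetic in reserve_ds_buffers() above is worth spelling out with the constants defined at the top of the file. A rough userspace sketch follows; PEBS_RECORD_NHM_SIZE assumes the 22 u64 fields of struct pebs_record_nhm (176 bytes on a 64-bit build), and ds_layout() is a name made up for this illustration.

/* Sketch of the BTS/PEBS buffer sizing done in reserve_ds_buffers(); not kernel code. */
#include <stdio.h>

#define PAGE_SIZE		4096UL
#define BTS_RECORD_SIZE		24UL
#define BTS_BUFFER_SIZE		(PAGE_SIZE << 4)	/* 64 KiB */
#define PEBS_BUFFER_SIZE	PAGE_SIZE
#define PEBS_RECORD_NHM_SIZE	176UL			/* assumed sizeof(struct pebs_record_nhm) */

static void ds_layout(void)
{
	unsigned long bts_max    = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;
	unsigned long bts_thresh = bts_max - bts_max / 16;	/* interrupt a bit before the end */
	unsigned long pebs_max   = PEBS_BUFFER_SIZE / PEBS_RECORD_NHM_SIZE;

	printf("BTS:  %lu records fit, interrupt threshold at record %lu\n",
	       bts_max, bts_thresh);
	/* PEBS threshold stays at one record, so every sample raises a PMI */
	printf("PEBS: %lu records fit, threshold left at 1 record\n", pebs_max);
}

int main(void)
{
	ds_layout();
	return 0;
}

The single-record PEBS threshold matches the "Always use single record PEBS" comment in the kernel code; the BTS threshold leaves max/16 records of slack so the buffer can be drained before it overflows.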
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
new file mode 100644
index 000000000000..d202c1bece1a
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -0,0 +1,218 @@
1#ifdef CONFIG_CPU_SUP_INTEL
2
3enum {
4 LBR_FORMAT_32 = 0x00,
5 LBR_FORMAT_LIP = 0x01,
6 LBR_FORMAT_EIP = 0x02,
7 LBR_FORMAT_EIP_FLAGS = 0x03,
8};
9
10/*
11 * We only support LBR implementations that have FREEZE_LBRS_ON_PMI;
12 * otherwise it becomes nearly impossible to get a reliable stack.
13 */
14
15static void __intel_pmu_lbr_enable(void)
16{
17 u64 debugctl;
18
19 rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
20 debugctl |= (DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
21 wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
22}
23
24static void __intel_pmu_lbr_disable(void)
25{
26 u64 debugctl;
27
28 rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
29 debugctl &= ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
30 wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
31}
32
33static void intel_pmu_lbr_reset_32(void)
34{
35 int i;
36
37 for (i = 0; i < x86_pmu.lbr_nr; i++)
38 wrmsrl(x86_pmu.lbr_from + i, 0);
39}
40
41static void intel_pmu_lbr_reset_64(void)
42{
43 int i;
44
45 for (i = 0; i < x86_pmu.lbr_nr; i++) {
46 wrmsrl(x86_pmu.lbr_from + i, 0);
47 wrmsrl(x86_pmu.lbr_to + i, 0);
48 }
49}
50
51static void intel_pmu_lbr_reset(void)
52{
53 if (!x86_pmu.lbr_nr)
54 return;
55
56 if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32)
57 intel_pmu_lbr_reset_32();
58 else
59 intel_pmu_lbr_reset_64();
60}
61
62static void intel_pmu_lbr_enable(struct perf_event *event)
63{
64 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
65
66 if (!x86_pmu.lbr_nr)
67 return;
68
69 WARN_ON_ONCE(cpuc->enabled);
70
71 /*
72 * Reset the LBR stack if we changed task context to
73 * avoid data leaks.
74 */
75
76 if (event->ctx->task && cpuc->lbr_context != event->ctx) {
77 intel_pmu_lbr_reset();
78 cpuc->lbr_context = event->ctx;
79 }
80
81 cpuc->lbr_users++;
82}
83
84static void intel_pmu_lbr_disable(struct perf_event *event)
85{
86 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
87
88 if (!x86_pmu.lbr_nr)
89 return;
90
91 cpuc->lbr_users--;
92 WARN_ON_ONCE(cpuc->lbr_users < 0);
93
94 if (cpuc->enabled && !cpuc->lbr_users)
95 __intel_pmu_lbr_disable();
96}
97
98static void intel_pmu_lbr_enable_all(void)
99{
100 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
101
102 if (cpuc->lbr_users)
103 __intel_pmu_lbr_enable();
104}
105
106static void intel_pmu_lbr_disable_all(void)
107{
108 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
109
110 if (cpuc->lbr_users)
111 __intel_pmu_lbr_disable();
112}
113
114static inline u64 intel_pmu_lbr_tos(void)
115{
116 u64 tos;
117
118 rdmsrl(x86_pmu.lbr_tos, tos);
119
120 return tos;
121}
122
123static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
124{
125 unsigned long mask = x86_pmu.lbr_nr - 1;
126 u64 tos = intel_pmu_lbr_tos();
127 int i;
128
129 for (i = 0; i < x86_pmu.lbr_nr; i++) {
130 unsigned long lbr_idx = (tos - i) & mask;
131 union {
132 struct {
133 u32 from;
134 u32 to;
135 };
136 u64 lbr;
137 } msr_lastbranch;
138
139 rdmsrl(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr);
140
141 cpuc->lbr_entries[i].from = msr_lastbranch.from;
142 cpuc->lbr_entries[i].to = msr_lastbranch.to;
143 cpuc->lbr_entries[i].flags = 0;
144 }
145 cpuc->lbr_stack.nr = i;
146}
147
148#define LBR_FROM_FLAG_MISPRED (1ULL << 63)
149
150/*
151 * Due to lack of segmentation in Linux the effective address (offset)
152 * is the same as the linear address, allowing us to merge the LIP and EIP
153 * LBR formats.
154 */
155static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
156{
157 unsigned long mask = x86_pmu.lbr_nr - 1;
158 int lbr_format = x86_pmu.intel_cap.lbr_format;
159 u64 tos = intel_pmu_lbr_tos();
160 int i;
161
162 for (i = 0; i < x86_pmu.lbr_nr; i++) {
163 unsigned long lbr_idx = (tos - i) & mask;
164 u64 from, to, flags = 0;
165
166 rdmsrl(x86_pmu.lbr_from + lbr_idx, from);
167 rdmsrl(x86_pmu.lbr_to + lbr_idx, to);
168
169 if (lbr_format == LBR_FORMAT_EIP_FLAGS) {
170 flags = !!(from & LBR_FROM_FLAG_MISPRED);
171 from = (u64)((((s64)from) << 1) >> 1);
172 }
173
174 cpuc->lbr_entries[i].from = from;
175 cpuc->lbr_entries[i].to = to;
176 cpuc->lbr_entries[i].flags = flags;
177 }
178 cpuc->lbr_stack.nr = i;
179}
180
181static void intel_pmu_lbr_read(void)
182{
183 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
184
185 if (!cpuc->lbr_users)
186 return;
187
188 if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32)
189 intel_pmu_lbr_read_32(cpuc);
190 else
191 intel_pmu_lbr_read_64(cpuc);
192}
193
194static void intel_pmu_lbr_init_core(void)
195{
196 x86_pmu.lbr_nr = 4;
197 x86_pmu.lbr_tos = 0x01c9;
198 x86_pmu.lbr_from = 0x40;
199 x86_pmu.lbr_to = 0x60;
200}
201
202static void intel_pmu_lbr_init_nhm(void)
203{
204 x86_pmu.lbr_nr = 16;
205 x86_pmu.lbr_tos = 0x01c9;
206 x86_pmu.lbr_from = 0x680;
207 x86_pmu.lbr_to = 0x6c0;
208}
209
210static void intel_pmu_lbr_init_atom(void)
211{
212 x86_pmu.lbr_nr = 8;
213 x86_pmu.lbr_tos = 0x01c9;
214 x86_pmu.lbr_from = 0x40;
215 x86_pmu.lbr_to = 0x60;
216}
217
218#endif /* CONFIG_CPU_SUP_INTEL */
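Annotation: two details of intel_pmu_lbr_read_64() above are easy to miss: the ring is walked backwards from the top-of-stack with a power-of-two mask, and the mispredict flag is stripped from bit 63 of the FROM value. A userspace sketch of both follows; LBR_NR and decode_from() are names invented for this illustration, and the flag handling is written out explicitly rather than with the kernel's shift pair.

/* Sketch of the LBR ring walk and FROM-address decoding; not kernel code. */
#include <stdio.h>
#include <stdint.h>

#define LBR_NR			16	/* Nehalem: 16 LBR entries */
#define LBR_FROM_FLAG_MISPRED	(1ULL << 63)

static void decode_from(uint64_t from, uint64_t *addr, int *mispred)
{
	*mispred = !!(from & LBR_FROM_FLAG_MISPRED);
	/*
	 * Equivalent to the kernel's ((s64)from << 1) >> 1 trick: drop the
	 * flag in bit 63 and sign-extend bit 62 back over it, so canonical
	 * kernel addresses come out intact.
	 */
	*addr = from & ~LBR_FROM_FLAG_MISPRED;
	if (*addr & (1ULL << 62))
		*addr |= LBR_FROM_FLAG_MISPRED;
}

int main(void)
{
	uint64_t tos = 5;		/* pretend top-of-stack read from lbr_tos */
	unsigned long mask = LBR_NR - 1;
	uint64_t addr;
	int i, mispred;

	/* most recent branch first, older entries by walking backwards */
	for (i = 0; i < 3; i++)
		printf("entry %d reads MSR index %lu\n",
		       i, (unsigned long)((tos - i) & mask));

	decode_from((1ULL << 63) | 0x401000ULL, &addr, &mispred);
	printf("user branch:   from=%#llx mispred=%d\n",
	       (unsigned long long)addr, mispred);

	decode_from(0x7fffffff81000000ULL, &addr, &mispred);
	printf("kernel branch: from=%#llx mispred=%d\n",
	       (unsigned long long)addr, mispred);
	return 0;
}

The second decode shows why the sign extension matters: a canonical kernel address stored with a clear flag bit still comes back as 0xffffffff81000000.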
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
new file mode 100644
index 000000000000..249015173992
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -0,0 +1,951 @@
1/*
2 * Netburst Performance Events (P4, old Xeon)
3 *
4 * Copyright (C) 2010 Parallels, Inc., Cyrill Gorcunov <gorcunov@openvz.org>
5 * Copyright (C) 2010 Intel Corporation, Lin Ming <ming.m.lin@intel.com>
6 *
7 * For licencing details see kernel-base/COPYING
8 */
9
10#ifdef CONFIG_CPU_SUP_INTEL
11
12#include <asm/perf_event_p4.h>
13
14#define P4_CNTR_LIMIT 3
15/*
16 * array indices: 0,1 - HT threads, used with HT enabled cpu
17 */
18struct p4_event_bind {
19 unsigned int opcode; /* Event code and ESCR selector */
20 unsigned int escr_msr[2]; /* ESCR MSR for this event */
21 char cntr[2][P4_CNTR_LIMIT]; /* counter index (offset), -1 on absence */
22};
23
24struct p4_pebs_bind {
25 unsigned int metric_pebs;
26 unsigned int metric_vert;
27};
28
29/* it sets P4_PEBS_ENABLE_UOP_TAG as well */
30#define P4_GEN_PEBS_BIND(name, pebs, vert) \
31 [P4_PEBS_METRIC__##name] = { \
32 .metric_pebs = pebs | P4_PEBS_ENABLE_UOP_TAG, \
33 .metric_vert = vert, \
34 }
35
36/*
37 * note we have P4_PEBS_ENABLE_UOP_TAG always set here
38 *
39 * it's needed for mapping P4_PEBS_CONFIG_METRIC_MASK bits of
40 * event configuration to find out which values are to be
41 * written into MSR_IA32_PEBS_ENABLE and MSR_P4_PEBS_MATRIX_VERT
42 * registers
43 */
44static struct p4_pebs_bind p4_pebs_bind_map[] = {
45 P4_GEN_PEBS_BIND(1stl_cache_load_miss_retired, 0x0000001, 0x0000001),
46 P4_GEN_PEBS_BIND(2ndl_cache_load_miss_retired, 0x0000002, 0x0000001),
47 P4_GEN_PEBS_BIND(dtlb_load_miss_retired, 0x0000004, 0x0000001),
48 P4_GEN_PEBS_BIND(dtlb_store_miss_retired, 0x0000004, 0x0000002),
49 P4_GEN_PEBS_BIND(dtlb_all_miss_retired, 0x0000004, 0x0000003),
50 P4_GEN_PEBS_BIND(tagged_mispred_branch, 0x0018000, 0x0000010),
51 P4_GEN_PEBS_BIND(mob_load_replay_retired, 0x0000200, 0x0000001),
52 P4_GEN_PEBS_BIND(split_load_retired, 0x0000400, 0x0000001),
53 P4_GEN_PEBS_BIND(split_store_retired, 0x0000400, 0x0000002),
54};
55
56/*
57 * Note that we don't use CCCR1 here; there is an
58 * exception for P4_BSQ_ALLOCATION, but we have no
59 * workaround for it.
60 *
61 * Consider this binding as the resources a particular
62 * event may borrow; it doesn't contain EventMask,
63 * Tags and friends -- they are left to the caller
64 */
65static struct p4_event_bind p4_event_bind_map[] = {
66 [P4_EVENT_TC_DELIVER_MODE] = {
67 .opcode = P4_OPCODE(P4_EVENT_TC_DELIVER_MODE),
68 .escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 },
69 .cntr = { {4, 5, -1}, {6, 7, -1} },
70 },
71 [P4_EVENT_BPU_FETCH_REQUEST] = {
72 .opcode = P4_OPCODE(P4_EVENT_BPU_FETCH_REQUEST),
73 .escr_msr = { MSR_P4_BPU_ESCR0, MSR_P4_BPU_ESCR1 },
74 .cntr = { {0, -1, -1}, {2, -1, -1} },
75 },
76 [P4_EVENT_ITLB_REFERENCE] = {
77 .opcode = P4_OPCODE(P4_EVENT_ITLB_REFERENCE),
78 .escr_msr = { MSR_P4_ITLB_ESCR0, MSR_P4_ITLB_ESCR1 },
79 .cntr = { {0, -1, -1}, {2, -1, -1} },
80 },
81 [P4_EVENT_MEMORY_CANCEL] = {
82 .opcode = P4_OPCODE(P4_EVENT_MEMORY_CANCEL),
83 .escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 },
84 .cntr = { {8, 9, -1}, {10, 11, -1} },
85 },
86 [P4_EVENT_MEMORY_COMPLETE] = {
87 .opcode = P4_OPCODE(P4_EVENT_MEMORY_COMPLETE),
88 .escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 },
89 .cntr = { {8, 9, -1}, {10, 11, -1} },
90 },
91 [P4_EVENT_LOAD_PORT_REPLAY] = {
92 .opcode = P4_OPCODE(P4_EVENT_LOAD_PORT_REPLAY),
93 .escr_msr = { MSR_P4_SAAT_ESCR0, MSR_P4_SAAT_ESCR1 },
94 .cntr = { {8, 9, -1}, {10, 11, -1} },
95 },
96 [P4_EVENT_STORE_PORT_REPLAY] = {
97 .opcode = P4_OPCODE(P4_EVENT_STORE_PORT_REPLAY),
98 .escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 },
99 .cntr = { {8, 9, -1}, {10, 11, -1} },
100 },
101 [P4_EVENT_MOB_LOAD_REPLAY] = {
102 .opcode = P4_OPCODE(P4_EVENT_MOB_LOAD_REPLAY),
103 .escr_msr = { MSR_P4_MOB_ESCR0, MSR_P4_MOB_ESCR1 },
104 .cntr = { {0, -1, -1}, {2, -1, -1} },
105 },
106 [P4_EVENT_PAGE_WALK_TYPE] = {
107 .opcode = P4_OPCODE(P4_EVENT_PAGE_WALK_TYPE),
108 .escr_msr = { MSR_P4_PMH_ESCR0, MSR_P4_PMH_ESCR1 },
109 .cntr = { {0, -1, -1}, {2, -1, -1} },
110 },
111 [P4_EVENT_BSQ_CACHE_REFERENCE] = {
112 .opcode = P4_OPCODE(P4_EVENT_BSQ_CACHE_REFERENCE),
113 .escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1 },
114 .cntr = { {0, -1, -1}, {2, -1, -1} },
115 },
116 [P4_EVENT_IOQ_ALLOCATION] = {
117 .opcode = P4_OPCODE(P4_EVENT_IOQ_ALLOCATION),
118 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
119 .cntr = { {0, -1, -1}, {2, -1, -1} },
120 },
121 [P4_EVENT_IOQ_ACTIVE_ENTRIES] = { /* shared ESCR */
122 .opcode = P4_OPCODE(P4_EVENT_IOQ_ACTIVE_ENTRIES),
123 .escr_msr = { MSR_P4_FSB_ESCR1, MSR_P4_FSB_ESCR1 },
124 .cntr = { {2, -1, -1}, {3, -1, -1} },
125 },
126 [P4_EVENT_FSB_DATA_ACTIVITY] = {
127 .opcode = P4_OPCODE(P4_EVENT_FSB_DATA_ACTIVITY),
128 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
129 .cntr = { {0, -1, -1}, {2, -1, -1} },
130 },
131 [P4_EVENT_BSQ_ALLOCATION] = { /* shared ESCR, broken CCCR1 */
132 .opcode = P4_OPCODE(P4_EVENT_BSQ_ALLOCATION),
133 .escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR0 },
134 .cntr = { {0, -1, -1}, {1, -1, -1} },
135 },
136 [P4_EVENT_BSQ_ACTIVE_ENTRIES] = { /* shared ESCR */
137 .opcode = P4_OPCODE(P4_EVENT_BSQ_ACTIVE_ENTRIES),
138 .escr_msr = { MSR_P4_BSU_ESCR1 , MSR_P4_BSU_ESCR1 },
139 .cntr = { {2, -1, -1}, {3, -1, -1} },
140 },
141 [P4_EVENT_SSE_INPUT_ASSIST] = {
142 .opcode = P4_OPCODE(P4_EVENT_SSE_INPUT_ASSIST),
143 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
144 .cntr = { {8, 9, -1}, {10, 11, -1} },
145 },
146 [P4_EVENT_PACKED_SP_UOP] = {
147 .opcode = P4_OPCODE(P4_EVENT_PACKED_SP_UOP),
148 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
149 .cntr = { {8, 9, -1}, {10, 11, -1} },
150 },
151 [P4_EVENT_PACKED_DP_UOP] = {
152 .opcode = P4_OPCODE(P4_EVENT_PACKED_DP_UOP),
153 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
154 .cntr = { {8, 9, -1}, {10, 11, -1} },
155 },
156 [P4_EVENT_SCALAR_SP_UOP] = {
157 .opcode = P4_OPCODE(P4_EVENT_SCALAR_SP_UOP),
158 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
159 .cntr = { {8, 9, -1}, {10, 11, -1} },
160 },
161 [P4_EVENT_SCALAR_DP_UOP] = {
162 .opcode = P4_OPCODE(P4_EVENT_SCALAR_DP_UOP),
163 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
164 .cntr = { {8, 9, -1}, {10, 11, -1} },
165 },
166 [P4_EVENT_64BIT_MMX_UOP] = {
167 .opcode = P4_OPCODE(P4_EVENT_64BIT_MMX_UOP),
168 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
169 .cntr = { {8, 9, -1}, {10, 11, -1} },
170 },
171 [P4_EVENT_128BIT_MMX_UOP] = {
172 .opcode = P4_OPCODE(P4_EVENT_128BIT_MMX_UOP),
173 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
174 .cntr = { {8, 9, -1}, {10, 11, -1} },
175 },
176 [P4_EVENT_X87_FP_UOP] = {
177 .opcode = P4_OPCODE(P4_EVENT_X87_FP_UOP),
178 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
179 .cntr = { {8, 9, -1}, {10, 11, -1} },
180 },
181 [P4_EVENT_TC_MISC] = {
182 .opcode = P4_OPCODE(P4_EVENT_TC_MISC),
183 .escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 },
184 .cntr = { {4, 5, -1}, {6, 7, -1} },
185 },
186 [P4_EVENT_GLOBAL_POWER_EVENTS] = {
187 .opcode = P4_OPCODE(P4_EVENT_GLOBAL_POWER_EVENTS),
188 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
189 .cntr = { {0, -1, -1}, {2, -1, -1} },
190 },
191 [P4_EVENT_TC_MS_XFER] = {
192 .opcode = P4_OPCODE(P4_EVENT_TC_MS_XFER),
193 .escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 },
194 .cntr = { {4, 5, -1}, {6, 7, -1} },
195 },
196 [P4_EVENT_UOP_QUEUE_WRITES] = {
197 .opcode = P4_OPCODE(P4_EVENT_UOP_QUEUE_WRITES),
198 .escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 },
199 .cntr = { {4, 5, -1}, {6, 7, -1} },
200 },
201 [P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE] = {
202 .opcode = P4_OPCODE(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE),
203 .escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR0 },
204 .cntr = { {4, 5, -1}, {6, 7, -1} },
205 },
206 [P4_EVENT_RETIRED_BRANCH_TYPE] = {
207 .opcode = P4_OPCODE(P4_EVENT_RETIRED_BRANCH_TYPE),
208 .escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR1 },
209 .cntr = { {4, 5, -1}, {6, 7, -1} },
210 },
211 [P4_EVENT_RESOURCE_STALL] = {
212 .opcode = P4_OPCODE(P4_EVENT_RESOURCE_STALL),
213 .escr_msr = { MSR_P4_ALF_ESCR0, MSR_P4_ALF_ESCR1 },
214 .cntr = { {12, 13, 16}, {14, 15, 17} },
215 },
216 [P4_EVENT_WC_BUFFER] = {
217 .opcode = P4_OPCODE(P4_EVENT_WC_BUFFER),
218 .escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 },
219 .cntr = { {8, 9, -1}, {10, 11, -1} },
220 },
221 [P4_EVENT_B2B_CYCLES] = {
222 .opcode = P4_OPCODE(P4_EVENT_B2B_CYCLES),
223 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
224 .cntr = { {0, -1, -1}, {2, -1, -1} },
225 },
226 [P4_EVENT_BNR] = {
227 .opcode = P4_OPCODE(P4_EVENT_BNR),
228 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
229 .cntr = { {0, -1, -1}, {2, -1, -1} },
230 },
231 [P4_EVENT_SNOOP] = {
232 .opcode = P4_OPCODE(P4_EVENT_SNOOP),
233 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
234 .cntr = { {0, -1, -1}, {2, -1, -1} },
235 },
236 [P4_EVENT_RESPONSE] = {
237 .opcode = P4_OPCODE(P4_EVENT_RESPONSE),
238 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
239 .cntr = { {0, -1, -1}, {2, -1, -1} },
240 },
241 [P4_EVENT_FRONT_END_EVENT] = {
242 .opcode = P4_OPCODE(P4_EVENT_FRONT_END_EVENT),
243 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
244 .cntr = { {12, 13, 16}, {14, 15, 17} },
245 },
246 [P4_EVENT_EXECUTION_EVENT] = {
247 .opcode = P4_OPCODE(P4_EVENT_EXECUTION_EVENT),
248 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
249 .cntr = { {12, 13, 16}, {14, 15, 17} },
250 },
251 [P4_EVENT_REPLAY_EVENT] = {
252 .opcode = P4_OPCODE(P4_EVENT_REPLAY_EVENT),
253 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
254 .cntr = { {12, 13, 16}, {14, 15, 17} },
255 },
256 [P4_EVENT_INSTR_RETIRED] = {
257 .opcode = P4_OPCODE(P4_EVENT_INSTR_RETIRED),
258 .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
259 .cntr = { {12, 13, 16}, {14, 15, 17} },
260 },
261 [P4_EVENT_UOPS_RETIRED] = {
262 .opcode = P4_OPCODE(P4_EVENT_UOPS_RETIRED),
263 .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
264 .cntr = { {12, 13, 16}, {14, 15, 17} },
265 },
266 [P4_EVENT_UOP_TYPE] = {
267 .opcode = P4_OPCODE(P4_EVENT_UOP_TYPE),
268 .escr_msr = { MSR_P4_RAT_ESCR0, MSR_P4_RAT_ESCR1 },
269 .cntr = { {12, 13, 16}, {14, 15, 17} },
270 },
271 [P4_EVENT_BRANCH_RETIRED] = {
272 .opcode = P4_OPCODE(P4_EVENT_BRANCH_RETIRED),
273 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
274 .cntr = { {12, 13, 16}, {14, 15, 17} },
275 },
276 [P4_EVENT_MISPRED_BRANCH_RETIRED] = {
277 .opcode = P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED),
278 .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
279 .cntr = { {12, 13, 16}, {14, 15, 17} },
280 },
281 [P4_EVENT_X87_ASSIST] = {
282 .opcode = P4_OPCODE(P4_EVENT_X87_ASSIST),
283 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
284 .cntr = { {12, 13, 16}, {14, 15, 17} },
285 },
286 [P4_EVENT_MACHINE_CLEAR] = {
287 .opcode = P4_OPCODE(P4_EVENT_MACHINE_CLEAR),
288 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
289 .cntr = { {12, 13, 16}, {14, 15, 17} },
290 },
291 [P4_EVENT_INSTR_COMPLETED] = {
292 .opcode = P4_OPCODE(P4_EVENT_INSTR_COMPLETED),
293 .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
294 .cntr = { {12, 13, 16}, {14, 15, 17} },
295 },
296};
297
298#define P4_GEN_CACHE_EVENT(event, bit, metric) \
299 p4_config_pack_escr(P4_ESCR_EVENT(event) | \
300 P4_ESCR_EMASK_BIT(event, bit)) | \
301 p4_config_pack_cccr(metric | \
302 P4_CCCR_ESEL(P4_OPCODE_ESEL(P4_OPCODE(event))))
303
304static __initconst const u64 p4_hw_cache_event_ids
305 [PERF_COUNT_HW_CACHE_MAX]
306 [PERF_COUNT_HW_CACHE_OP_MAX]
307 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
308{
309 [ C(L1D ) ] = {
310 [ C(OP_READ) ] = {
311 [ C(RESULT_ACCESS) ] = 0x0,
312 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
313 P4_PEBS_METRIC__1stl_cache_load_miss_retired),
314 },
315 },
316 [ C(LL ) ] = {
317 [ C(OP_READ) ] = {
318 [ C(RESULT_ACCESS) ] = 0x0,
319 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
320 P4_PEBS_METRIC__2ndl_cache_load_miss_retired),
321 },
322},
323 [ C(DTLB) ] = {
324 [ C(OP_READ) ] = {
325 [ C(RESULT_ACCESS) ] = 0x0,
326 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
327 P4_PEBS_METRIC__dtlb_load_miss_retired),
328 },
329 [ C(OP_WRITE) ] = {
330 [ C(RESULT_ACCESS) ] = 0x0,
331 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
332 P4_PEBS_METRIC__dtlb_store_miss_retired),
333 },
334 },
335 [ C(ITLB) ] = {
336 [ C(OP_READ) ] = {
337 [ C(RESULT_ACCESS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, HIT,
338 P4_PEBS_METRIC__none),
339 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, MISS,
340 P4_PEBS_METRIC__none),
341 },
342 [ C(OP_WRITE) ] = {
343 [ C(RESULT_ACCESS) ] = -1,
344 [ C(RESULT_MISS) ] = -1,
345 },
346 [ C(OP_PREFETCH) ] = {
347 [ C(RESULT_ACCESS) ] = -1,
348 [ C(RESULT_MISS) ] = -1,
349 },
350 },
351};
352
353static u64 p4_general_events[PERF_COUNT_HW_MAX] = {
354 /* non-halted CPU clocks */
355 [PERF_COUNT_HW_CPU_CYCLES] =
356 p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS) |
357 P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)),
358
359 /*
360 * retired instructions
361	 * for the sake of simplicity we don't use the FSB tagging
362 */
363 [PERF_COUNT_HW_INSTRUCTIONS] =
364 p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_INSTR_RETIRED) |
365 P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSNTAG) |
366 P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSNTAG)),
367
368 /* cache hits */
369 [PERF_COUNT_HW_CACHE_REFERENCES] =
370 p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_BSQ_CACHE_REFERENCE) |
371 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITS) |
372 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITE) |
373 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITM) |
374 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITS) |
375 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITE) |
376 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITM)),
377
378 /* cache misses */
379 [PERF_COUNT_HW_CACHE_MISSES] =
380 p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_BSQ_CACHE_REFERENCE) |
381 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_MISS) |
382 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_MISS) |
383 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, WR_2ndL_MISS)),
384
385 /* branch instructions retired */
386 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] =
387 p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_RETIRED_BRANCH_TYPE) |
388 P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CONDITIONAL) |
389 P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CALL) |
390 P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, RETURN) |
391 P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, INDIRECT)),
392
393 /* mispredicted branches retired */
394 [PERF_COUNT_HW_BRANCH_MISSES] =
395 p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_MISPRED_BRANCH_RETIRED) |
396 P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS)),
397
398	/* bus ready clocks (cpu is driving #DRDY_DRV/#DRDY_OWN): */
399 [PERF_COUNT_HW_BUS_CYCLES] =
400 p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_FSB_DATA_ACTIVITY) |
401 P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV) |
402 P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN)) |
403 p4_config_pack_cccr(P4_CCCR_EDGE | P4_CCCR_COMPARE),
404};
405
406static struct p4_event_bind *p4_config_get_bind(u64 config)
407{
408 unsigned int evnt = p4_config_unpack_event(config);
409 struct p4_event_bind *bind = NULL;
410
411 if (evnt < ARRAY_SIZE(p4_event_bind_map))
412 bind = &p4_event_bind_map[evnt];
413
414 return bind;
415}
416
417static u64 p4_pmu_event_map(int hw_event)
418{
419 struct p4_event_bind *bind;
420 unsigned int esel;
421 u64 config;
422
423 config = p4_general_events[hw_event];
424 bind = p4_config_get_bind(config);
425 esel = P4_OPCODE_ESEL(bind->opcode);
426 config |= p4_config_pack_cccr(P4_CCCR_ESEL(esel));
427
428 return config;
429}
430
431static int p4_validate_raw_event(struct perf_event *event)
432{
433 unsigned int v;
434
435	/* user data may have an out-of-bounds event index */
436 v = p4_config_unpack_event(event->attr.config);
437 if (v >= ARRAY_SIZE(p4_event_bind_map)) {
438 pr_warning("P4 PMU: Unknown event code: %d\n", v);
439 return -EINVAL;
440 }
441
442 /*
443 * it may have some screwed PEBS bits
444 */
445 if (p4_config_pebs_has(event->attr.config, P4_PEBS_CONFIG_ENABLE)) {
446 pr_warning("P4 PMU: PEBS are not supported yet\n");
447 return -EINVAL;
448 }
449 v = p4_config_unpack_metric(event->attr.config);
450 if (v >= ARRAY_SIZE(p4_pebs_bind_map)) {
451 pr_warning("P4 PMU: Unknown metric code: %d\n", v);
452 return -EINVAL;
453 }
454
455 return 0;
456}
457
458static int p4_hw_config(struct perf_event *event)
459{
460 int cpu = get_cpu();
461 int rc = 0;
462 u32 escr, cccr;
463
464 /*
465	 * the reason we grab the cpu this early is that if we get scheduled
466	 * for the first time on the same cpu, we will not need to swap the
467	 * thread specific flags in config (and will save some cpu cycles)
468 */
469
470 cccr = p4_default_cccr_conf(cpu);
471 escr = p4_default_escr_conf(cpu, event->attr.exclude_kernel,
472 event->attr.exclude_user);
473 event->hw.config = p4_config_pack_escr(escr) |
474 p4_config_pack_cccr(cccr);
475
476 if (p4_ht_active() && p4_ht_thread(cpu))
477 event->hw.config = p4_set_ht_bit(event->hw.config);
478
479 if (event->attr.type == PERF_TYPE_RAW) {
480
481 rc = p4_validate_raw_event(event);
482 if (rc)
483 goto out;
484
485 /*
486 * We don't control raw events so it's up to the caller
487 * to pass sane values (and we don't count the thread number
488 * on HT machine but allow HT-compatible specifics to be
489 * passed on)
490 *
491 * Note that for RAW events we allow user to use P4_CCCR_RESERVED
492 * bits since we keep additional info here (for cache events and etc)
493 *
494 * XXX: HT wide things should check perf_paranoid_cpu() &&
495 * CAP_SYS_ADMIN
496 */
497 event->hw.config |= event->attr.config &
498 (p4_config_pack_escr(P4_ESCR_MASK_HT) |
499 p4_config_pack_cccr(P4_CCCR_MASK_HT | P4_CCCR_RESERVED));
500
501 event->hw.config &= ~P4_CCCR_FORCE_OVF;
502 }
503
504 rc = x86_setup_perfctr(event);
505out:
506 put_cpu();
507 return rc;
508}
509
510static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc)
511{
512 int overflow = 0;
513 u32 low, high;
514
515 rdmsr(hwc->config_base + hwc->idx, low, high);
516
517 /* we need to check high bit for unflagged overflows */
518 if ((low & P4_CCCR_OVF) || !(high & (1 << 31))) {
519 overflow = 1;
520 (void)checking_wrmsrl(hwc->config_base + hwc->idx,
521 ((u64)low) & ~P4_CCCR_OVF);
522 }
523
524 return overflow;
525}
526
527static void p4_pmu_disable_pebs(void)
528{
529 /*
530 * FIXME
531 *
532	 * It's still allowed that two threads set up the same cache
533	 * events, so we can't simply clear the metrics until we know
534	 * no one is depending on us; we would need some kind of counter
535	 * for "ReplayEvent" users.
536	 *
537	 * What is more complex -- RAW events: if a user (for some
538	 * reason) passes some cache event metric with an improper
539	 * event opcode, it's fine from the hardware point of view
540	 * but complete nonsense as far as the "meaning" of such an action goes.
541	 *
542	 * So for the moment leave the metrics turned on forever -- it's
543	 * ok for now but needs to be revisited!
544 *
545 * (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)0);
546 * (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)0);
547 */
548}
549
550static inline void p4_pmu_disable_event(struct perf_event *event)
551{
552 struct hw_perf_event *hwc = &event->hw;
553
554 /*
555	 * If the event gets disabled while the counter is in the overflowed
556	 * state, we need to clear P4_CCCR_OVF, otherwise the interrupt gets
557 * asserted again and again
558 */
559 (void)checking_wrmsrl(hwc->config_base + hwc->idx,
560 (u64)(p4_config_unpack_cccr(hwc->config)) &
561 ~P4_CCCR_ENABLE & ~P4_CCCR_OVF & ~P4_CCCR_RESERVED);
562}
563
564static void p4_pmu_disable_all(void)
565{
566 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
567 int idx;
568
569 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
570 struct perf_event *event = cpuc->events[idx];
571 if (!test_bit(idx, cpuc->active_mask))
572 continue;
573 p4_pmu_disable_event(event);
574 }
575
576 p4_pmu_disable_pebs();
577}
578
579/* configuration must be valid */
580static void p4_pmu_enable_pebs(u64 config)
581{
582 struct p4_pebs_bind *bind;
583 unsigned int idx;
584
585 BUILD_BUG_ON(P4_PEBS_METRIC__max > P4_PEBS_CONFIG_METRIC_MASK);
586
587 idx = p4_config_unpack_metric(config);
588 if (idx == P4_PEBS_METRIC__none)
589 return;
590
591 bind = &p4_pebs_bind_map[idx];
592
593 (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)bind->metric_pebs);
594 (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)bind->metric_vert);
595}
596
597static void p4_pmu_enable_event(struct perf_event *event)
598{
599 struct hw_perf_event *hwc = &event->hw;
600 int thread = p4_ht_config_thread(hwc->config);
601 u64 escr_conf = p4_config_unpack_escr(p4_clear_ht_bit(hwc->config));
602 unsigned int idx = p4_config_unpack_event(hwc->config);
603 struct p4_event_bind *bind;
604 u64 escr_addr, cccr;
605
606 bind = &p4_event_bind_map[idx];
607 escr_addr = (u64)bind->escr_msr[thread];
608
609 /*
610	 * - we don't support cascaded counters yet
611 * - and counter 1 is broken (erratum)
612 */
613 WARN_ON_ONCE(p4_is_event_cascaded(hwc->config));
614 WARN_ON_ONCE(hwc->idx == 1);
615
616 /* we need a real Event value */
617 escr_conf &= ~P4_ESCR_EVENT_MASK;
618 escr_conf |= P4_ESCR_EVENT(P4_OPCODE_EVNT(bind->opcode));
619
620 cccr = p4_config_unpack_cccr(hwc->config);
621
622 /*
623	 * it could be a cache event, so we need to write the metrics
624 * into additional MSRs
625 */
626 p4_pmu_enable_pebs(hwc->config);
627
628 (void)checking_wrmsrl(escr_addr, escr_conf);
629 (void)checking_wrmsrl(hwc->config_base + hwc->idx,
630 (cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE);
631}
632
633static void p4_pmu_enable_all(int added)
634{
635 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
636 int idx;
637
638 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
639 struct perf_event *event = cpuc->events[idx];
640 if (!test_bit(idx, cpuc->active_mask))
641 continue;
642 p4_pmu_enable_event(event);
643 }
644}
645
646static int p4_pmu_handle_irq(struct pt_regs *regs)
647{
648 struct perf_sample_data data;
649 struct cpu_hw_events *cpuc;
650 struct perf_event *event;
651 struct hw_perf_event *hwc;
652 int idx, handled = 0;
653 u64 val;
654
655 data.addr = 0;
656 data.raw = NULL;
657
658 cpuc = &__get_cpu_var(cpu_hw_events);
659
660 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
661 int overflow;
662
663 if (!test_bit(idx, cpuc->active_mask)) {
664 /* catch in-flight IRQs */
665 if (__test_and_clear_bit(idx, cpuc->running))
666 handled++;
667 continue;
668 }
669
670 event = cpuc->events[idx];
671 hwc = &event->hw;
672
673 WARN_ON_ONCE(hwc->idx != idx);
674
675		/* it might be an unflagged overflow */
676 overflow = p4_pmu_clear_cccr_ovf(hwc);
677
678 val = x86_perf_event_update(event);
679 if (!overflow && (val & (1ULL << (x86_pmu.cntval_bits - 1))))
680 continue;
681
682 handled += overflow;
683
684 /* event overflow for sure */
685 data.period = event->hw.last_period;
686
687 if (!x86_perf_event_set_period(event))
688 continue;
689 if (perf_event_overflow(event, 1, &data, regs))
690 p4_pmu_disable_event(event);
691 }
692
693 if (handled) {
694 /* p4 quirk: unmask it again */
695 apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED);
696 inc_irq_stat(apic_perf_irqs);
697 }
698
699 return handled;
700}
701
702/*
703 * swap thread specific fields according to a thread
704 * we are going to run on
705 */
706static void p4_pmu_swap_config_ts(struct hw_perf_event *hwc, int cpu)
707{
708 u32 escr, cccr;
709
710 /*
711	 * either we are lucky and continue on the same cpu, or there is no HT support
712 */
713 if (!p4_should_swap_ts(hwc->config, cpu))
714 return;
715
716 /*
717	 * the event is migrated from another logical
718 * cpu, so we need to swap thread specific flags
719 */
720
721 escr = p4_config_unpack_escr(hwc->config);
722 cccr = p4_config_unpack_cccr(hwc->config);
723
724 if (p4_ht_thread(cpu)) {
725 cccr &= ~P4_CCCR_OVF_PMI_T0;
726 cccr |= P4_CCCR_OVF_PMI_T1;
727 if (escr & P4_ESCR_T0_OS) {
728 escr &= ~P4_ESCR_T0_OS;
729 escr |= P4_ESCR_T1_OS;
730 }
731 if (escr & P4_ESCR_T0_USR) {
732 escr &= ~P4_ESCR_T0_USR;
733 escr |= P4_ESCR_T1_USR;
734 }
735 hwc->config = p4_config_pack_escr(escr);
736 hwc->config |= p4_config_pack_cccr(cccr);
737 hwc->config |= P4_CONFIG_HT;
738 } else {
739 cccr &= ~P4_CCCR_OVF_PMI_T1;
740 cccr |= P4_CCCR_OVF_PMI_T0;
741 if (escr & P4_ESCR_T1_OS) {
742 escr &= ~P4_ESCR_T1_OS;
743 escr |= P4_ESCR_T0_OS;
744 }
745 if (escr & P4_ESCR_T1_USR) {
746 escr &= ~P4_ESCR_T1_USR;
747 escr |= P4_ESCR_T0_USR;
748 }
749 hwc->config = p4_config_pack_escr(escr);
750 hwc->config |= p4_config_pack_cccr(cccr);
751 hwc->config &= ~P4_CONFIG_HT;
752 }
753}
754
755/*
756 * ESCR address hashing is tricky: ESCRs are not sequential
757 * in memory, but they all start from MSR_P4_BSU_ESCR0 (0x03a0) and
758 * every ESCR address falls within the range [0x3a0, 0x3e1]
759 *
760 * so we end up with a ~70% filled hash table
761 */
762
763#define P4_ESCR_MSR_BASE 0x000003a0
764#define P4_ESCR_MSR_MAX 0x000003e1
765#define P4_ESCR_MSR_TABLE_SIZE (P4_ESCR_MSR_MAX - P4_ESCR_MSR_BASE + 1)
766#define P4_ESCR_MSR_IDX(msr) (msr - P4_ESCR_MSR_BASE)
767#define P4_ESCR_MSR_TABLE_ENTRY(msr) [P4_ESCR_MSR_IDX(msr)] = msr
768
769static const unsigned int p4_escr_table[P4_ESCR_MSR_TABLE_SIZE] = {
770 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ALF_ESCR0),
771 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ALF_ESCR1),
772 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BPU_ESCR0),
773 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BPU_ESCR1),
774 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BSU_ESCR0),
775 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BSU_ESCR1),
776 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR0),
777 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR1),
778 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR2),
779 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR3),
780 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR4),
781 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR5),
782 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_DAC_ESCR0),
783 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_DAC_ESCR1),
784 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FIRM_ESCR0),
785 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FIRM_ESCR1),
786 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FLAME_ESCR0),
787 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FLAME_ESCR1),
788 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FSB_ESCR0),
789 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FSB_ESCR1),
790 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IQ_ESCR0),
791 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IQ_ESCR1),
792 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IS_ESCR0),
793 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IS_ESCR1),
794 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ITLB_ESCR0),
795 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ITLB_ESCR1),
796 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IX_ESCR0),
797 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IX_ESCR1),
798 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MOB_ESCR0),
799 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MOB_ESCR1),
800 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MS_ESCR0),
801 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MS_ESCR1),
802 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_PMH_ESCR0),
803 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_PMH_ESCR1),
804 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_RAT_ESCR0),
805 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_RAT_ESCR1),
806 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SAAT_ESCR0),
807 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SAAT_ESCR1),
808 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SSU_ESCR0),
809 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SSU_ESCR1),
810 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TBPU_ESCR0),
811 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TBPU_ESCR1),
812 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TC_ESCR0),
813 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TC_ESCR1),
814 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_U2L_ESCR0),
815 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_U2L_ESCR1),
816};
817
818static int p4_get_escr_idx(unsigned int addr)
819{
820 unsigned int idx = P4_ESCR_MSR_IDX(addr);
821
822 if (unlikely(idx >= P4_ESCR_MSR_TABLE_SIZE ||
823 !p4_escr_table[idx] ||
824 p4_escr_table[idx] != addr)) {
825 WARN_ONCE(1, "P4 PMU: Wrong address passed: %x\n", addr);
826 return -1;
827 }
828
829 return idx;
830}
831
832static int p4_next_cntr(int thread, unsigned long *used_mask,
833 struct p4_event_bind *bind)
834{
835 int i, j;
836
837 for (i = 0; i < P4_CNTR_LIMIT; i++) {
838 j = bind->cntr[thread][i];
839 if (j != -1 && !test_bit(j, used_mask))
840 return j;
841 }
842
843 return -1;
844}
845
846static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
847{
848 unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
849 unsigned long escr_mask[BITS_TO_LONGS(P4_ESCR_MSR_TABLE_SIZE)];
850 int cpu = smp_processor_id();
851 struct hw_perf_event *hwc;
852 struct p4_event_bind *bind;
853 unsigned int i, thread, num;
854 int cntr_idx, escr_idx;
855
856 bitmap_zero(used_mask, X86_PMC_IDX_MAX);
857 bitmap_zero(escr_mask, P4_ESCR_MSR_TABLE_SIZE);
858
859 for (i = 0, num = n; i < n; i++, num--) {
860
861 hwc = &cpuc->event_list[i]->hw;
862 thread = p4_ht_thread(cpu);
863 bind = p4_config_get_bind(hwc->config);
864 escr_idx = p4_get_escr_idx(bind->escr_msr[thread]);
865 if (unlikely(escr_idx == -1))
866 goto done;
867
868 if (hwc->idx != -1 && !p4_should_swap_ts(hwc->config, cpu)) {
869 cntr_idx = hwc->idx;
870 if (assign)
871 assign[i] = hwc->idx;
872 goto reserve;
873 }
874
875 cntr_idx = p4_next_cntr(thread, used_mask, bind);
876 if (cntr_idx == -1 || test_bit(escr_idx, escr_mask))
877 goto done;
878
879 p4_pmu_swap_config_ts(hwc, cpu);
880 if (assign)
881 assign[i] = cntr_idx;
882reserve:
883 set_bit(cntr_idx, used_mask);
884 set_bit(escr_idx, escr_mask);
885 }
886
887done:
888 return num ? -ENOSPC : 0;
889}
890
891static __initconst const struct x86_pmu p4_pmu = {
892 .name = "Netburst P4/Xeon",
893 .handle_irq = p4_pmu_handle_irq,
894 .disable_all = p4_pmu_disable_all,
895 .enable_all = p4_pmu_enable_all,
896 .enable = p4_pmu_enable_event,
897 .disable = p4_pmu_disable_event,
898 .eventsel = MSR_P4_BPU_CCCR0,
899 .perfctr = MSR_P4_BPU_PERFCTR0,
900 .event_map = p4_pmu_event_map,
901 .max_events = ARRAY_SIZE(p4_general_events),
902 .get_event_constraints = x86_get_event_constraints,
903 /*
904	 * If HT is disabled we may need to use all
905	 * ARCH_P4_MAX_CCCR counters simultaneously,
906	 * though for the moment leave it restricted, assuming
907	 * HT is on
908 */
909 .num_counters = ARCH_P4_MAX_CCCR,
910 .apic = 1,
911 .cntval_bits = 40,
912 .cntval_mask = (1ULL << 40) - 1,
913 .max_period = (1ULL << 39) - 1,
914 .hw_config = p4_hw_config,
915 .schedule_events = p4_pmu_schedule_events,
916 /*
917	 * This handles erratum N15 in intel doc 249199-029:
918	 * the counter may not be updated correctly on a write,
919	 * so we need a second write operation to do the trick
920	 * (the official workaround didn't work)
921	 *
922	 * this idea is taken from the OProfile code
923 */
924 .perfctr_second_write = 1,
925};
926
927static __init int p4_pmu_init(void)
928{
929 unsigned int low, high;
930
931	/* If we get stripped -- indexing fails */
932 BUILD_BUG_ON(ARCH_P4_MAX_CCCR > X86_PMC_MAX_GENERIC);
933
934 rdmsr(MSR_IA32_MISC_ENABLE, low, high);
935 if (!(low & (1 << 7))) {
936 pr_cont("unsupported Netburst CPU model %d ",
937 boot_cpu_data.x86_model);
938 return -ENODEV;
939 }
940
941 memcpy(hw_cache_event_ids, p4_hw_cache_event_ids,
942 sizeof(hw_cache_event_ids));
943
944 pr_cont("Netburst events, ");
945
946 x86_pmu = p4_pmu;
947
948 return 0;
949}
950
951#endif /* CONFIG_CPU_SUP_INTEL */
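
The file above leans heavily on the p4_config_pack_escr()/p4_config_pack_cccr() helpers from <asm/perf_event_p4.h>, which keep an ESCR image and a CCCR image inside one u64 config word. The standalone sketch below only illustrates that packing idea; the demo_* names, the example register values, and the choice of which half holds the ESCR are assumptions made here, not the header's actual layout.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical demo helpers: ESCR image in the upper 32 bits,
 * CCCR image in the lower 32 bits of one 64-bit config word. */
static inline uint64_t demo_pack_escr(uint32_t escr)  { return (uint64_t)escr << 32; }
static inline uint64_t demo_pack_cccr(uint32_t cccr)  { return (uint64_t)cccr; }
static inline uint32_t demo_unpack_escr(uint64_t cfg) { return (uint32_t)(cfg >> 32); }
static inline uint32_t demo_unpack_cccr(uint64_t cfg) { return (uint32_t)(cfg & 0xffffffffu); }

int main(void)
{
	/* arbitrary example register images */
	uint64_t config = demo_pack_escr(0x0003b000) | demo_pack_cccr(0x00039000);

	printf("escr=%08x cccr=%08x\n",
	       demo_unpack_escr(config), demo_unpack_cccr(config));
	return 0;
}
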
diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c
new file mode 100644
index 000000000000..34ba07be2cda
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_p6.c
@@ -0,0 +1,142 @@
1#ifdef CONFIG_CPU_SUP_INTEL
2
3/*
4 * Not sure about some of these
5 */
6static const u64 p6_perfmon_event_map[] =
7{
8 [PERF_COUNT_HW_CPU_CYCLES] = 0x0079,
9 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
10 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e,
11 [PERF_COUNT_HW_CACHE_MISSES] = 0x012e,
12 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
13 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
14 [PERF_COUNT_HW_BUS_CYCLES] = 0x0062,
15};
16
17static u64 p6_pmu_event_map(int hw_event)
18{
19 return p6_perfmon_event_map[hw_event];
20}
21
22/*
23 * Event setting that is specified not to count anything.
24 * We use this to effectively disable a counter.
25 *
26 * L2_RQSTS with 0 MESI unit mask.
27 */
28#define P6_NOP_EVENT 0x0000002EULL
29
30static struct event_constraint p6_event_constraints[] =
31{
32 INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */
33 INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
34 INTEL_EVENT_CONSTRAINT(0x11, 0x1), /* FP_ASSIST */
35 INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
36 INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
37 INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
38 EVENT_CONSTRAINT_END
39};
40
41static void p6_pmu_disable_all(void)
42{
43 u64 val;
44
45 /* p6 only has one enable register */
46 rdmsrl(MSR_P6_EVNTSEL0, val);
47 val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
48 wrmsrl(MSR_P6_EVNTSEL0, val);
49}
50
51static void p6_pmu_enable_all(int added)
52{
53 unsigned long val;
54
55 /* p6 only has one enable register */
56 rdmsrl(MSR_P6_EVNTSEL0, val);
57 val |= ARCH_PERFMON_EVENTSEL_ENABLE;
58 wrmsrl(MSR_P6_EVNTSEL0, val);
59}
60
61static inline void
62p6_pmu_disable_event(struct perf_event *event)
63{
64 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
65 struct hw_perf_event *hwc = &event->hw;
66 u64 val = P6_NOP_EVENT;
67
68 if (cpuc->enabled)
69 val |= ARCH_PERFMON_EVENTSEL_ENABLE;
70
71 (void)checking_wrmsrl(hwc->config_base + hwc->idx, val);
72}
73
74static void p6_pmu_enable_event(struct perf_event *event)
75{
76 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
77 struct hw_perf_event *hwc = &event->hw;
78 u64 val;
79
80 val = hwc->config;
81 if (cpuc->enabled)
82 val |= ARCH_PERFMON_EVENTSEL_ENABLE;
83
84 (void)checking_wrmsrl(hwc->config_base + hwc->idx, val);
85}
86
87static __initconst const struct x86_pmu p6_pmu = {
88 .name = "p6",
89 .handle_irq = x86_pmu_handle_irq,
90 .disable_all = p6_pmu_disable_all,
91 .enable_all = p6_pmu_enable_all,
92 .enable = p6_pmu_enable_event,
93 .disable = p6_pmu_disable_event,
94 .hw_config = x86_pmu_hw_config,
95 .schedule_events = x86_schedule_events,
96 .eventsel = MSR_P6_EVNTSEL0,
97 .perfctr = MSR_P6_PERFCTR0,
98 .event_map = p6_pmu_event_map,
99 .max_events = ARRAY_SIZE(p6_perfmon_event_map),
100 .apic = 1,
101 .max_period = (1ULL << 31) - 1,
102 .version = 0,
103 .num_counters = 2,
104 /*
105 * Events have 40 bits implemented. However they are designed such
106 * that bits [32-39] are sign extensions of bit 31. As such the
107	 * effective width of an event for a P6-like PMU is 32 bits only.
108 *
109 * See IA-32 Intel Architecture Software developer manual Vol 3B
110 */
111 .cntval_bits = 32,
112 .cntval_mask = (1ULL << 32) - 1,
113 .get_event_constraints = x86_get_event_constraints,
114 .event_constraints = p6_event_constraints,
115};
116
117static __init int p6_pmu_init(void)
118{
119 switch (boot_cpu_data.x86_model) {
120 case 1:
121 case 3: /* Pentium Pro */
122 case 5:
123 case 6: /* Pentium II */
124 case 7:
125 case 8:
126 case 11: /* Pentium III */
127 case 9:
128 case 13:
129 /* Pentium M */
130 break;
131 default:
132 pr_cont("unsupported p6 CPU model %d ",
133 boot_cpu_data.x86_model);
134 return -ENODEV;
135 }
136
137 x86_pmu = p6_pmu;
138
139 return 0;
140}
141
142#endif /* CONFIG_CPU_SUP_INTEL */
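
The cntval_bits/cntval_mask comment above notes that on P6-style counters bits [32..39] mirror bit 31, so the counter effectively behaves as a sign-extended 32-bit value. The short standalone sketch below (not kernel code, just an illustration of that note) models the mirroring to show the resulting raw values.

#include <stdint.h>
#include <stdio.h>

/* Model of the note above: replicate bit 31 of the low 32 bits into
 * bits 32..39 and clamp to the 40-bit counter width. */
static uint64_t p6_style_raw(uint32_t low32)
{
	uint64_t v = low32;

	if (low32 & 0x80000000u)
		v |= 0xffULL << 32;		/* bits 32..39 mirror bit 31 */
	return v & ((1ULL << 40) - 1);		/* counter is 40 bits wide */
}

int main(void)
{
	printf("%010llx\n", (unsigned long long)p6_style_raw(0x7fffffffu)); /* 007fffffff */
	printf("%010llx\n", (unsigned long long)p6_style_raw(0x80000000u)); /* ff80000000 */
	return 0;
}
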
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index 898df9719afb..fb329e9f8494 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -115,17 +115,6 @@ int avail_to_resrv_perfctr_nmi_bit(unsigned int counter)
115 115
116 return !test_bit(counter, perfctr_nmi_owner); 116 return !test_bit(counter, perfctr_nmi_owner);
117} 117}
118
119/* checks the an msr for availability */
120int avail_to_resrv_perfctr_nmi(unsigned int msr)
121{
122 unsigned int counter;
123
124 counter = nmi_perfctr_msr_to_bit(msr);
125 BUG_ON(counter > NMI_MAX_COUNTER_BITS);
126
127 return !test_bit(counter, perfctr_nmi_owner);
128}
129EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit); 118EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit);
130 119
131int reserve_perfctr_nmi(unsigned int msr) 120int reserve_perfctr_nmi(unsigned int msr)
@@ -691,7 +680,7 @@ static int setup_intel_arch_watchdog(unsigned nmi_hz)
691 cpu_nmi_set_wd_enabled(); 680 cpu_nmi_set_wd_enabled();
692 681
693 apic_write(APIC_LVTPC, APIC_DM_NMI); 682 apic_write(APIC_LVTPC, APIC_DM_NMI);
694 evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE; 683 evntsel |= ARCH_PERFMON_EVENTSEL_ENABLE;
695 wrmsr(evntsel_msr, evntsel, 0); 684 wrmsr(evntsel_msr, evntsel, 0);
696 intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1); 685 intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1);
697 return 1; 686 return 1;
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
new file mode 100644
index 000000000000..d49079515122
--- /dev/null
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -0,0 +1,64 @@
1/*
 2 * Routines to identify additional cpu features that are scattered in
3 * cpuid space.
4 */
5#include <linux/cpu.h>
6
7#include <asm/pat.h>
8#include <asm/processor.h>
9
10#include <asm/apic.h>
11
12struct cpuid_bit {
13 u16 feature;
14 u8 reg;
15 u8 bit;
16 u32 level;
17 u32 sub_leaf;
18};
19
20enum cpuid_regs {
21 CR_EAX = 0,
22 CR_ECX,
23 CR_EDX,
24 CR_EBX
25};
26
27void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
28{
29 u32 max_level;
30 u32 regs[4];
31 const struct cpuid_bit *cb;
32
33 static const struct cpuid_bit __cpuinitconst cpuid_bits[] = {
34 { X86_FEATURE_DTS, CR_EAX, 0, 0x00000006, 0 },
35 { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006, 0 },
36 { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006, 0 },
37 { X86_FEATURE_PLN, CR_EAX, 4, 0x00000006, 0 },
38 { X86_FEATURE_PTS, CR_EAX, 6, 0x00000006, 0 },
39 { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 },
40 { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 },
41 { X86_FEATURE_XSAVEOPT, CR_EAX, 0, 0x0000000d, 1 },
42 { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 },
43 { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a, 0 },
44 { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 },
45 { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 },
46 { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a, 0 },
47 { 0, 0, 0, 0, 0 }
48 };
49
50 for (cb = cpuid_bits; cb->feature; cb++) {
51
52 /* Verify that the level is valid */
53 max_level = cpuid_eax(cb->level & 0xffff0000);
54 if (max_level < cb->level ||
55 max_level > (cb->level | 0xffff))
56 continue;
57
58 cpuid_count(cb->level, cb->sub_leaf, &regs[CR_EAX],
59 &regs[CR_EBX], &regs[CR_ECX], &regs[CR_EDX]);
60
61 if (regs[cb->reg] & (1 << cb->bit))
62 set_cpu_cap(c, cb->feature);
63 }
64}
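
The new scattered.c walks a table of (leaf, sub-leaf, register, bit) tuples and sets the feature bit when the CPUID bit is present. The user-space sketch below mirrors that pattern for a single row, the ARAT entry { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006, 0 }. It assumes GCC/Clang's <cpuid.h> helper __get_cpuid_count(); it is an illustration of the lookup pattern, not the kernel routine.

#include <stdio.h>
#include <cpuid.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* leaf 0x6, sub-leaf 0, register EAX, bit 2 -- same tuple as the
	 * ARAT row in the kernel table above */
	if (!__get_cpuid_count(0x00000006, 0, &eax, &ebx, &ecx, &edx)) {
		puts("CPUID leaf 0x6 not supported");
		return 1;
	}
	printf("ARAT: %s\n", (eax & (1u << 2)) ? "yes" : "no");
	return 0;
}
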
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/topology.c
index 468489b57aae..4397e987a1cf 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/topology.c
@@ -1,56 +1,14 @@
1/* 1/*
2 * Routines to indentify additional cpu features that are scattered in 2 * Check for extended topology enumeration cpuid leaf 0xb and if it
3 * cpuid space. 3 * exists, use it for populating initial_apicid and cpu topology
4 * detection.
4 */ 5 */
5#include <linux/cpu.h>
6 6
7#include <linux/cpu.h>
8#include <asm/apic.h>
7#include <asm/pat.h> 9#include <asm/pat.h>
8#include <asm/processor.h> 10#include <asm/processor.h>
9 11
10#include <asm/apic.h>
11
12struct cpuid_bit {
13 u16 feature;
14 u8 reg;
15 u8 bit;
16 u32 level;
17};
18
19enum cpuid_regs {
20 CR_EAX = 0,
21 CR_ECX,
22 CR_EDX,
23 CR_EBX
24};
25
26void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
27{
28 u32 max_level;
29 u32 regs[4];
30 const struct cpuid_bit *cb;
31
32 static const struct cpuid_bit __cpuinitconst cpuid_bits[] = {
33 { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 },
34 { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006 },
35 { 0, 0, 0, 0 }
36 };
37
38 for (cb = cpuid_bits; cb->feature; cb++) {
39
40 /* Verify that the level is valid */
41 max_level = cpuid_eax(cb->level & 0xffff0000);
42 if (max_level < cb->level ||
43 max_level > (cb->level | 0xffff))
44 continue;
45
46 cpuid(cb->level, &regs[CR_EAX], &regs[CR_EBX],
47 &regs[CR_ECX], &regs[CR_EDX]);
48
49 if (regs[cb->reg] & (1 << cb->bit))
50 set_cpu_cap(c, cb->feature);
51 }
52}
53
54/* leaf 0xb SMT level */ 12/* leaf 0xb SMT level */
55#define SMT_LEVEL 0 13#define SMT_LEVEL 0
56 14
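
The renamed topology.c keeps only the extended topology enumeration (CPUID leaf 0xb); SMT_LEVEL 0 refers to the first sub-leaf. The rough user-space sketch below shows the kind of data that sub-leaf provides (EAX[4:0] as the APIC-ID shift for the SMT level, EBX[15:0] as the logical processor count at that level); it assumes <cpuid.h>'s __get_cpuid_count() and is an illustration only, not the kernel's own topology detection code.

#include <stdio.h>
#include <cpuid.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (!__get_cpuid_count(0x0000000b, 0, &eax, &ebx, &ecx, &edx) || !ebx) {
		puts("extended topology leaf 0xb not available");
		return 1;
	}
	printf("SMT level: apicid shift=%u, logical cpus=%u\n",
	       eax & 0x1f, ebx & 0xffff);
	return 0;
}
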
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 1cbed97b59cf..227b0448960d 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -22,9 +22,10 @@
22 */ 22 */
23 23
24#include <linux/dmi.h> 24#include <linux/dmi.h>
25#include <linux/module.h>
25#include <asm/div64.h> 26#include <asm/div64.h>
26#include <asm/vmware.h>
27#include <asm/x86_init.h> 27#include <asm/x86_init.h>
28#include <asm/hypervisor.h>
28 29
29#define CPUID_VMWARE_INFO_LEAF 0x40000000 30#define CPUID_VMWARE_INFO_LEAF 0x40000000
30#define VMWARE_HYPERVISOR_MAGIC 0x564D5868 31#define VMWARE_HYPERVISOR_MAGIC 0x564D5868
@@ -50,7 +51,7 @@ static inline int __vmware_platform(void)
50 51
51static unsigned long vmware_get_tsc_khz(void) 52static unsigned long vmware_get_tsc_khz(void)
52{ 53{
53 uint64_t tsc_hz; 54 uint64_t tsc_hz, lpj;
54 uint32_t eax, ebx, ecx, edx; 55 uint32_t eax, ebx, ecx, edx;
55 56
56 VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); 57 VMWARE_PORT(GETHZ, eax, ebx, ecx, edx);
@@ -61,10 +62,17 @@ static unsigned long vmware_get_tsc_khz(void)
61 printk(KERN_INFO "TSC freq read from hypervisor : %lu.%03lu MHz\n", 62 printk(KERN_INFO "TSC freq read from hypervisor : %lu.%03lu MHz\n",
62 (unsigned long) tsc_hz / 1000, 63 (unsigned long) tsc_hz / 1000,
63 (unsigned long) tsc_hz % 1000); 64 (unsigned long) tsc_hz % 1000);
65
66 if (!preset_lpj) {
67 lpj = ((u64)tsc_hz * 1000);
68 do_div(lpj, HZ);
69 preset_lpj = lpj;
70 }
71
64 return tsc_hz; 72 return tsc_hz;
65} 73}
66 74
67void __init vmware_platform_setup(void) 75static void __init vmware_platform_setup(void)
68{ 76{
69 uint32_t eax, ebx, ecx, edx; 77 uint32_t eax, ebx, ecx, edx;
70 78
@@ -82,24 +90,21 @@ void __init vmware_platform_setup(void)
82 * serial key should be enough, as this will always have a VMware 90 * serial key should be enough, as this will always have a VMware
83 * specific string when running under VMware hypervisor. 91 * specific string when running under VMware hypervisor.
84 */ 92 */
85int vmware_platform(void) 93static bool __init vmware_platform(void)
86{ 94{
87 if (cpu_has_hypervisor) { 95 if (cpu_has_hypervisor) {
88 unsigned int eax, ebx, ecx, edx; 96 unsigned int eax;
89 char hyper_vendor_id[13]; 97 unsigned int hyper_vendor_id[3];
90 98
91 cpuid(CPUID_VMWARE_INFO_LEAF, &eax, &ebx, &ecx, &edx); 99 cpuid(CPUID_VMWARE_INFO_LEAF, &eax, &hyper_vendor_id[0],
92 memcpy(hyper_vendor_id + 0, &ebx, 4); 100 &hyper_vendor_id[1], &hyper_vendor_id[2]);
93 memcpy(hyper_vendor_id + 4, &ecx, 4); 101 if (!memcmp(hyper_vendor_id, "VMwareVMware", 12))
94 memcpy(hyper_vendor_id + 8, &edx, 4); 102 return true;
95 hyper_vendor_id[12] = '\0';
96 if (!strcmp(hyper_vendor_id, "VMwareVMware"))
97 return 1;
98 } else if (dmi_available && dmi_name_in_serial("VMware") && 103 } else if (dmi_available && dmi_name_in_serial("VMware") &&
99 __vmware_platform()) 104 __vmware_platform())
100 return 1; 105 return true;
101 106
102 return 0; 107 return false;
103} 108}
104 109
105/* 110/*
@@ -114,8 +119,16 @@ int vmware_platform(void)
114 * so that the kernel could just trust the hypervisor with providing a 119 * so that the kernel could just trust the hypervisor with providing a
115 * reliable virtual TSC that is suitable for timekeeping. 120 * reliable virtual TSC that is suitable for timekeeping.
116 */ 121 */
117void __cpuinit vmware_set_feature_bits(struct cpuinfo_x86 *c) 122static void __cpuinit vmware_set_cpu_features(struct cpuinfo_x86 *c)
118{ 123{
119 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); 124 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
120 set_cpu_cap(c, X86_FEATURE_TSC_RELIABLE); 125 set_cpu_cap(c, X86_FEATURE_TSC_RELIABLE);
121} 126}
127
128const __refconst struct hypervisor_x86 x86_hyper_vmware = {
129 .name = "VMware",
130 .detect = vmware_platform,
131 .set_cpu_features = vmware_set_cpu_features,
132 .init_platform = vmware_platform_setup,
133};
134EXPORT_SYMBOL(x86_hyper_vmware);
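
The reworked vmware_platform() above checks the hypervisor CPUID bit and then compares the 12-byte signature returned by leaf 0x40000000 in EBX/ECX/EDX against "VMwareVMware". The user-space sketch below performs the same two steps as an illustration; it assumes <cpuid.h>'s __get_cpuid()/__cpuid() helpers and omits the DMI fallback the kernel code also uses.

#include <stdio.h>
#include <string.h>
#include <cpuid.h>

int main(void)
{
	unsigned int eax, sig[3];

	if (!__get_cpuid(1, &eax, &sig[0], &sig[1], &sig[2]))
		return 1;
	if (!(sig[1] & (1u << 31))) {		/* CPUID.1:ECX bit 31 -- hypervisor present */
		puts("no hypervisor CPUID bit");
		return 0;
	}

	__cpuid(0x40000000, eax, sig[0], sig[1], sig[2]);	/* vendor signature in EBX/ECX/EDX */
	printf("hypervisor signature: %.12s\n", (const char *)sig);
	printf("VMware: %s\n", !memcmp(sig, "VMwareVMware", 12) ? "yes" : "no");
	return 0;
}
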