aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--arch/x86/kernel/Makefile2
-rw-r--r--arch/x86/kernel/acpi/boot.c133
-rw-r--r--arch/x86/kernel/alternative.c47
-rw-r--r--arch/x86/kernel/amd_iommu.c197
-rw-r--r--arch/x86/kernel/amd_iommu_init.c6
-rw-r--r--arch/x86/kernel/apic/es7000_32.c19
-rw-r--r--arch/x86/kernel/apic/io_apic.c99
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c3
-rw-r--r--arch/x86/kernel/apm_32.c4
-rw-r--r--arch/x86/kernel/cpu/Makefile2
-rw-r--r--arch/x86/kernel/cpu/addon_cpuid_features.c14
-rw-r--r--arch/x86/kernel/cpu/bugs.c2
-rw-r--r--arch/x86/kernel/cpu/common.c5
-rw-r--r--arch/x86/kernel/cpu/cpufreq/Makefile4
-rw-r--r--arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c44
-rw-r--r--arch/x86/kernel/cpu/cpufreq/mperf.c51
-rw-r--r--arch/x86/kernel/cpu/cpufreq/mperf.h9
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.c169
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.h2
-rw-r--r--arch/x86/kernel/cpu/hypervisor.c52
-rw-r--r--arch/x86/kernel/cpu/intel.c8
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c181
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c4
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c55
-rw-r--r--arch/x86/kernel/cpu/perf_event.c815
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd.c46
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel.c357
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_ds.c641
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_lbr.c218
-rw-r--r--arch/x86/kernel/cpu/perf_event_p4.c857
-rw-r--r--arch/x86/kernel/cpu/perf_event_p6.c31
-rw-r--r--arch/x86/kernel/cpu/vmware.c38
-rw-r--r--arch/x86/kernel/ds.c1437
-rw-r--r--arch/x86/kernel/ds_selftest.c408
-rw-r--r--arch/x86/kernel/ds_selftest.h15
-rw-r--r--arch/x86/kernel/dumpstack.c5
-rw-r--r--arch/x86/kernel/entry_32.S19
-rw-r--r--arch/x86/kernel/hw_breakpoint.c41
-rw-r--r--arch/x86/kernel/i387.c107
-rw-r--r--arch/x86/kernel/i8253.c14
-rw-r--r--arch/x86/kernel/irqinit.c2
-rw-r--r--arch/x86/kernel/kprobes.c16
-rw-r--r--arch/x86/kernel/microcode_core.c4
-rw-r--r--arch/x86/kernel/microcode_intel.c22
-rw-r--r--arch/x86/kernel/mpparse.c25
-rw-r--r--arch/x86/kernel/mrst.c5
-rw-r--r--arch/x86/kernel/process.c50
-rw-r--r--arch/x86/kernel/process_32.c10
-rw-r--r--arch/x86/kernel/process_64.c10
-rw-r--r--arch/x86/kernel/ptrace.c384
-rw-r--r--arch/x86/kernel/sfi.c4
-rw-r--r--arch/x86/kernel/step.c46
-rw-r--r--arch/x86/kernel/tboot.c20
-rw-r--r--arch/x86/kernel/tlb_uv.c1280
-rw-r--r--arch/x86/kernel/traps.c177
-rw-r--r--arch/x86/kernel/uv_irq.c12
-rw-r--r--arch/x86/kernel/x8664_ksyms_64.c1
-rw-r--r--arch/x86/kernel/xsave.c8
58 files changed, 4257 insertions, 3980 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 4c58352209e0..e77b22083721 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -47,8 +47,6 @@ obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o
47obj-y += process.o 47obj-y += process.o
48obj-y += i387.o xsave.o 48obj-y += i387.o xsave.o
49obj-y += ptrace.o 49obj-y += ptrace.o
50obj-$(CONFIG_X86_DS) += ds.o
51obj-$(CONFIG_X86_DS_SELFTEST) += ds_selftest.o
52obj-$(CONFIG_X86_32) += tls.o 50obj-$(CONFIG_X86_32) += tls.o
53obj-$(CONFIG_IA32_EMULATION) += tls.o 51obj-$(CONFIG_IA32_EMULATION) += tls.o
54obj-y += step.o 52obj-y += step.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index cd40aba6aa95..9a5ed58f09dc 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -94,6 +94,53 @@ enum acpi_irq_model_id acpi_irq_model = ACPI_IRQ_MODEL_PIC;
94 94
95 95
96/* 96/*
97 * ISA irqs by default are the first 16 gsis but can be
98 * any gsi as specified by an interrupt source override.
99 */
100static u32 isa_irq_to_gsi[NR_IRQS_LEGACY] __read_mostly = {
101 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
102};
103
104static unsigned int gsi_to_irq(unsigned int gsi)
105{
106 unsigned int irq = gsi + NR_IRQS_LEGACY;
107 unsigned int i;
108
109 for (i = 0; i < NR_IRQS_LEGACY; i++) {
110 if (isa_irq_to_gsi[i] == gsi) {
111 return i;
112 }
113 }
114
115 /* Provide an identity mapping of gsi == irq
116 * except on truly weird platforms that have
117 * non isa irqs in the first 16 gsis.
118 */
119 if (gsi >= NR_IRQS_LEGACY)
120 irq = gsi;
121 else
122 irq = gsi_end + 1 + gsi;
123
124 return irq;
125}
126
127static u32 irq_to_gsi(int irq)
128{
129 unsigned int gsi;
130
131 if (irq < NR_IRQS_LEGACY)
132 gsi = isa_irq_to_gsi[irq];
133 else if (irq <= gsi_end)
134 gsi = irq;
135 else if (irq <= (gsi_end + NR_IRQS_LEGACY))
136 gsi = irq - gsi_end;
137 else
138 gsi = 0xffffffff;
139
140 return gsi;
141}
142
143/*
97 * Temporarily use the virtual area starting from FIX_IO_APIC_BASE_END, 144 * Temporarily use the virtual area starting from FIX_IO_APIC_BASE_END,
98 * to map the target physical address. The problem is that set_fixmap() 145 * to map the target physical address. The problem is that set_fixmap()
99 * provides a single page, and it is possible that the page is not 146 * provides a single page, and it is possible that the page is not
@@ -313,7 +360,7 @@ acpi_parse_ioapic(struct acpi_subtable_header * header, const unsigned long end)
313/* 360/*
314 * Parse Interrupt Source Override for the ACPI SCI 361 * Parse Interrupt Source Override for the ACPI SCI
315 */ 362 */
316static void __init acpi_sci_ioapic_setup(u32 gsi, u16 polarity, u16 trigger) 363static void __init acpi_sci_ioapic_setup(u8 bus_irq, u16 polarity, u16 trigger, u32 gsi)
317{ 364{
318 if (trigger == 0) /* compatible SCI trigger is level */ 365 if (trigger == 0) /* compatible SCI trigger is level */
319 trigger = 3; 366 trigger = 3;
@@ -333,7 +380,7 @@ static void __init acpi_sci_ioapic_setup(u32 gsi, u16 polarity, u16 trigger)
333 * If GSI is < 16, this will update its flags, 380 * If GSI is < 16, this will update its flags,
334 * else it will create a new mp_irqs[] entry. 381 * else it will create a new mp_irqs[] entry.
335 */ 382 */
336 mp_override_legacy_irq(gsi, polarity, trigger, gsi); 383 mp_override_legacy_irq(bus_irq, polarity, trigger, gsi);
337 384
338 /* 385 /*
339 * stash over-ride to indicate we've been here 386 * stash over-ride to indicate we've been here
@@ -357,9 +404,10 @@ acpi_parse_int_src_ovr(struct acpi_subtable_header * header,
357 acpi_table_print_madt_entry(header); 404 acpi_table_print_madt_entry(header);
358 405
359 if (intsrc->source_irq == acpi_gbl_FADT.sci_interrupt) { 406 if (intsrc->source_irq == acpi_gbl_FADT.sci_interrupt) {
360 acpi_sci_ioapic_setup(intsrc->global_irq, 407 acpi_sci_ioapic_setup(intsrc->source_irq,
361 intsrc->inti_flags & ACPI_MADT_POLARITY_MASK, 408 intsrc->inti_flags & ACPI_MADT_POLARITY_MASK,
362 (intsrc->inti_flags & ACPI_MADT_TRIGGER_MASK) >> 2); 409 (intsrc->inti_flags & ACPI_MADT_TRIGGER_MASK) >> 2,
410 intsrc->global_irq);
363 return 0; 411 return 0;
364 } 412 }
365 413
@@ -448,7 +496,7 @@ void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger)
448 496
449int acpi_gsi_to_irq(u32 gsi, unsigned int *irq) 497int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
450{ 498{
451 *irq = gsi; 499 *irq = gsi_to_irq(gsi);
452 500
453#ifdef CONFIG_X86_IO_APIC 501#ifdef CONFIG_X86_IO_APIC
454 if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC) 502 if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC)
@@ -458,6 +506,14 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
458 return 0; 506 return 0;
459} 507}
460 508
509int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi)
510{
511 if (isa_irq >= 16)
512 return -1;
513 *gsi = irq_to_gsi(isa_irq);
514 return 0;
515}
516
461/* 517/*
462 * success: return IRQ number (>=0) 518 * success: return IRQ number (>=0)
463 * failure: return < 0 519 * failure: return < 0
@@ -482,7 +538,7 @@ int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
482 plat_gsi = mp_register_gsi(dev, gsi, trigger, polarity); 538 plat_gsi = mp_register_gsi(dev, gsi, trigger, polarity);
483 } 539 }
484#endif 540#endif
485 irq = plat_gsi; 541 irq = gsi_to_irq(plat_gsi);
486 542
487 return irq; 543 return irq;
488} 544}
@@ -867,29 +923,6 @@ static int __init acpi_parse_madt_lapic_entries(void)
867extern int es7000_plat; 923extern int es7000_plat;
868#endif 924#endif
869 925
870int __init acpi_probe_gsi(void)
871{
872 int idx;
873 int gsi;
874 int max_gsi = 0;
875
876 if (acpi_disabled)
877 return 0;
878
879 if (!acpi_ioapic)
880 return 0;
881
882 max_gsi = 0;
883 for (idx = 0; idx < nr_ioapics; idx++) {
884 gsi = mp_gsi_routing[idx].gsi_end;
885
886 if (gsi > max_gsi)
887 max_gsi = gsi;
888 }
889
890 return max_gsi + 1;
891}
892
893static void assign_to_mp_irq(struct mpc_intsrc *m, 926static void assign_to_mp_irq(struct mpc_intsrc *m,
894 struct mpc_intsrc *mp_irq) 927 struct mpc_intsrc *mp_irq)
895{ 928{
@@ -947,13 +980,13 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
947 mp_irq.dstirq = pin; /* INTIN# */ 980 mp_irq.dstirq = pin; /* INTIN# */
948 981
949 save_mp_irq(&mp_irq); 982 save_mp_irq(&mp_irq);
983
984 isa_irq_to_gsi[bus_irq] = gsi;
950} 985}
951 986
952void __init mp_config_acpi_legacy_irqs(void) 987void __init mp_config_acpi_legacy_irqs(void)
953{ 988{
954 int i; 989 int i;
955 int ioapic;
956 unsigned int dstapic;
957 struct mpc_intsrc mp_irq; 990 struct mpc_intsrc mp_irq;
958 991
959#if defined (CONFIG_MCA) || defined (CONFIG_EISA) 992#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
@@ -974,19 +1007,27 @@ void __init mp_config_acpi_legacy_irqs(void)
974#endif 1007#endif
975 1008
976 /* 1009 /*
977 * Locate the IOAPIC that manages the ISA IRQs (0-15).
978 */
979 ioapic = mp_find_ioapic(0);
980 if (ioapic < 0)
981 return;
982 dstapic = mp_ioapics[ioapic].apicid;
983
984 /*
985 * Use the default configuration for the IRQs 0-15. Unless 1010 * Use the default configuration for the IRQs 0-15. Unless
986 * overridden by (MADT) interrupt source override entries. 1011 * overridden by (MADT) interrupt source override entries.
987 */ 1012 */
988 for (i = 0; i < 16; i++) { 1013 for (i = 0; i < 16; i++) {
1014 int ioapic, pin;
1015 unsigned int dstapic;
989 int idx; 1016 int idx;
1017 u32 gsi;
1018
1019 /* Locate the gsi that irq i maps to. */
1020 if (acpi_isa_irq_to_gsi(i, &gsi))
1021 continue;
1022
1023 /*
1024 * Locate the IOAPIC that manages the ISA IRQ.
1025 */
1026 ioapic = mp_find_ioapic(gsi);
1027 if (ioapic < 0)
1028 continue;
1029 pin = mp_find_ioapic_pin(ioapic, gsi);
1030 dstapic = mp_ioapics[ioapic].apicid;
990 1031
991 for (idx = 0; idx < mp_irq_entries; idx++) { 1032 for (idx = 0; idx < mp_irq_entries; idx++) {
992 struct mpc_intsrc *irq = mp_irqs + idx; 1033 struct mpc_intsrc *irq = mp_irqs + idx;
@@ -996,7 +1037,7 @@ void __init mp_config_acpi_legacy_irqs(void)
996 break; 1037 break;
997 1038
998 /* Do we already have a mapping for this IOAPIC pin */ 1039 /* Do we already have a mapping for this IOAPIC pin */
999 if (irq->dstapic == dstapic && irq->dstirq == i) 1040 if (irq->dstapic == dstapic && irq->dstirq == pin)
1000 break; 1041 break;
1001 } 1042 }
1002 1043
@@ -1011,7 +1052,7 @@ void __init mp_config_acpi_legacy_irqs(void)
1011 mp_irq.dstapic = dstapic; 1052 mp_irq.dstapic = dstapic;
1012 mp_irq.irqtype = mp_INT; 1053 mp_irq.irqtype = mp_INT;
1013 mp_irq.srcbusirq = i; /* Identity mapped */ 1054 mp_irq.srcbusirq = i; /* Identity mapped */
1014 mp_irq.dstirq = i; 1055 mp_irq.dstirq = pin;
1015 1056
1016 save_mp_irq(&mp_irq); 1057 save_mp_irq(&mp_irq);
1017 } 1058 }
@@ -1076,11 +1117,6 @@ int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
1076 1117
1077 ioapic_pin = mp_find_ioapic_pin(ioapic, gsi); 1118 ioapic_pin = mp_find_ioapic_pin(ioapic, gsi);
1078 1119
1079#ifdef CONFIG_X86_32
1080 if (ioapic_renumber_irq)
1081 gsi = ioapic_renumber_irq(ioapic, gsi);
1082#endif
1083
1084 if (ioapic_pin > MP_MAX_IOAPIC_PIN) { 1120 if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
1085 printk(KERN_ERR "Invalid reference to IOAPIC pin " 1121 printk(KERN_ERR "Invalid reference to IOAPIC pin "
1086 "%d-%d\n", mp_ioapics[ioapic].apicid, 1122 "%d-%d\n", mp_ioapics[ioapic].apicid,
@@ -1094,7 +1130,7 @@ int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
1094 set_io_apic_irq_attr(&irq_attr, ioapic, ioapic_pin, 1130 set_io_apic_irq_attr(&irq_attr, ioapic, ioapic_pin,
1095 trigger == ACPI_EDGE_SENSITIVE ? 0 : 1, 1131 trigger == ACPI_EDGE_SENSITIVE ? 0 : 1,
1096 polarity == ACPI_ACTIVE_HIGH ? 0 : 1); 1132 polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
1097 io_apic_set_pci_routing(dev, gsi, &irq_attr); 1133 io_apic_set_pci_routing(dev, gsi_to_irq(gsi), &irq_attr);
1098 1134
1099 return gsi; 1135 return gsi;
1100} 1136}
@@ -1154,7 +1190,8 @@ static int __init acpi_parse_madt_ioapic_entries(void)
1154 * pretend we got one so we can set the SCI flags. 1190 * pretend we got one so we can set the SCI flags.
1155 */ 1191 */
1156 if (!acpi_sci_override_gsi) 1192 if (!acpi_sci_override_gsi)
1157 acpi_sci_ioapic_setup(acpi_gbl_FADT.sci_interrupt, 0, 0); 1193 acpi_sci_ioapic_setup(acpi_gbl_FADT.sci_interrupt, 0, 0,
1194 acpi_gbl_FADT.sci_interrupt);
1158 1195
1159 /* Fill in identity legacy mappings where no override */ 1196 /* Fill in identity legacy mappings where no override */
1160 mp_config_acpi_legacy_irqs(); 1197 mp_config_acpi_legacy_irqs();
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 1a160d5d44d0..70237732a6c7 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -194,7 +194,7 @@ static void __init_or_module add_nops(void *insns, unsigned int len)
194} 194}
195 195
196extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; 196extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
197extern u8 *__smp_locks[], *__smp_locks_end[]; 197extern s32 __smp_locks[], __smp_locks_end[];
198static void *text_poke_early(void *addr, const void *opcode, size_t len); 198static void *text_poke_early(void *addr, const void *opcode, size_t len);
199 199
200/* Replace instructions with better alternatives for this CPU type. 200/* Replace instructions with better alternatives for this CPU type.
@@ -235,37 +235,41 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
235 235
236#ifdef CONFIG_SMP 236#ifdef CONFIG_SMP
237 237
238static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end) 238static void alternatives_smp_lock(const s32 *start, const s32 *end,
239 u8 *text, u8 *text_end)
239{ 240{
240 u8 **ptr; 241 const s32 *poff;
241 242
242 mutex_lock(&text_mutex); 243 mutex_lock(&text_mutex);
243 for (ptr = start; ptr < end; ptr++) { 244 for (poff = start; poff < end; poff++) {
244 if (*ptr < text) 245 u8 *ptr = (u8 *)poff + *poff;
245 continue; 246
246 if (*ptr > text_end) 247 if (!*poff || ptr < text || ptr >= text_end)
247 continue; 248 continue;
248 /* turn DS segment override prefix into lock prefix */ 249 /* turn DS segment override prefix into lock prefix */
249 text_poke(*ptr, ((unsigned char []){0xf0}), 1); 250 if (*ptr == 0x3e)
251 text_poke(ptr, ((unsigned char []){0xf0}), 1);
250 }; 252 };
251 mutex_unlock(&text_mutex); 253 mutex_unlock(&text_mutex);
252} 254}
253 255
254static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end) 256static void alternatives_smp_unlock(const s32 *start, const s32 *end,
257 u8 *text, u8 *text_end)
255{ 258{
256 u8 **ptr; 259 const s32 *poff;
257 260
258 if (noreplace_smp) 261 if (noreplace_smp)
259 return; 262 return;
260 263
261 mutex_lock(&text_mutex); 264 mutex_lock(&text_mutex);
262 for (ptr = start; ptr < end; ptr++) { 265 for (poff = start; poff < end; poff++) {
263 if (*ptr < text) 266 u8 *ptr = (u8 *)poff + *poff;
264 continue; 267
265 if (*ptr > text_end) 268 if (!*poff || ptr < text || ptr >= text_end)
266 continue; 269 continue;
267 /* turn lock prefix into DS segment override prefix */ 270 /* turn lock prefix into DS segment override prefix */
268 text_poke(*ptr, ((unsigned char []){0x3E}), 1); 271 if (*ptr == 0xf0)
272 text_poke(ptr, ((unsigned char []){0x3E}), 1);
269 }; 273 };
270 mutex_unlock(&text_mutex); 274 mutex_unlock(&text_mutex);
271} 275}
@@ -276,8 +280,8 @@ struct smp_alt_module {
276 char *name; 280 char *name;
277 281
278 /* ptrs to lock prefixes */ 282 /* ptrs to lock prefixes */
279 u8 **locks; 283 const s32 *locks;
280 u8 **locks_end; 284 const s32 *locks_end;
281 285
282 /* .text segment, needed to avoid patching init code ;) */ 286 /* .text segment, needed to avoid patching init code ;) */
283 u8 *text; 287 u8 *text;
@@ -398,16 +402,19 @@ void alternatives_smp_switch(int smp)
398int alternatives_text_reserved(void *start, void *end) 402int alternatives_text_reserved(void *start, void *end)
399{ 403{
400 struct smp_alt_module *mod; 404 struct smp_alt_module *mod;
401 u8 **ptr; 405 const s32 *poff;
402 u8 *text_start = start; 406 u8 *text_start = start;
403 u8 *text_end = end; 407 u8 *text_end = end;
404 408
405 list_for_each_entry(mod, &smp_alt_modules, next) { 409 list_for_each_entry(mod, &smp_alt_modules, next) {
406 if (mod->text > text_end || mod->text_end < text_start) 410 if (mod->text > text_end || mod->text_end < text_start)
407 continue; 411 continue;
408 for (ptr = mod->locks; ptr < mod->locks_end; ptr++) 412 for (poff = mod->locks; poff < mod->locks_end; poff++) {
409 if (text_start <= *ptr && text_end >= *ptr) 413 const u8 *ptr = (const u8 *)poff + *poff;
414
415 if (text_start <= ptr && text_end > ptr)
410 return 1; 416 return 1;
417 }
411 } 418 }
412 419
413 return 0; 420 return 0;
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index f854d89b7edf..fa5a1474cd18 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -731,18 +731,22 @@ static bool increase_address_space(struct protection_domain *domain,
731 731
732static u64 *alloc_pte(struct protection_domain *domain, 732static u64 *alloc_pte(struct protection_domain *domain,
733 unsigned long address, 733 unsigned long address,
734 int end_lvl, 734 unsigned long page_size,
735 u64 **pte_page, 735 u64 **pte_page,
736 gfp_t gfp) 736 gfp_t gfp)
737{ 737{
738 int level, end_lvl;
738 u64 *pte, *page; 739 u64 *pte, *page;
739 int level; 740
741 BUG_ON(!is_power_of_2(page_size));
740 742
741 while (address > PM_LEVEL_SIZE(domain->mode)) 743 while (address > PM_LEVEL_SIZE(domain->mode))
742 increase_address_space(domain, gfp); 744 increase_address_space(domain, gfp);
743 745
744 level = domain->mode - 1; 746 level = domain->mode - 1;
745 pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)]; 747 pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
748 address = PAGE_SIZE_ALIGN(address, page_size);
749 end_lvl = PAGE_SIZE_LEVEL(page_size);
746 750
747 while (level > end_lvl) { 751 while (level > end_lvl) {
748 if (!IOMMU_PTE_PRESENT(*pte)) { 752 if (!IOMMU_PTE_PRESENT(*pte)) {
@@ -752,6 +756,10 @@ static u64 *alloc_pte(struct protection_domain *domain,
752 *pte = PM_LEVEL_PDE(level, virt_to_phys(page)); 756 *pte = PM_LEVEL_PDE(level, virt_to_phys(page));
753 } 757 }
754 758
759 /* No level skipping support yet */
760 if (PM_PTE_LEVEL(*pte) != level)
761 return NULL;
762
755 level -= 1; 763 level -= 1;
756 764
757 pte = IOMMU_PTE_PAGE(*pte); 765 pte = IOMMU_PTE_PAGE(*pte);
@@ -769,28 +777,47 @@ static u64 *alloc_pte(struct protection_domain *domain,
769 * This function checks if there is a PTE for a given dma address. If 777 * This function checks if there is a PTE for a given dma address. If
770 * there is one, it returns the pointer to it. 778 * there is one, it returns the pointer to it.
771 */ 779 */
772static u64 *fetch_pte(struct protection_domain *domain, 780static u64 *fetch_pte(struct protection_domain *domain, unsigned long address)
773 unsigned long address, int map_size)
774{ 781{
775 int level; 782 int level;
776 u64 *pte; 783 u64 *pte;
777 784
778 level = domain->mode - 1; 785 if (address > PM_LEVEL_SIZE(domain->mode))
779 pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)]; 786 return NULL;
787
788 level = domain->mode - 1;
789 pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
780 790
781 while (level > map_size) { 791 while (level > 0) {
792
793 /* Not Present */
782 if (!IOMMU_PTE_PRESENT(*pte)) 794 if (!IOMMU_PTE_PRESENT(*pte))
783 return NULL; 795 return NULL;
784 796
797 /* Large PTE */
798 if (PM_PTE_LEVEL(*pte) == 0x07) {
799 unsigned long pte_mask, __pte;
800
801 /*
802 * If we have a series of large PTEs, make
803 * sure to return a pointer to the first one.
804 */
805 pte_mask = PTE_PAGE_SIZE(*pte);
806 pte_mask = ~((PAGE_SIZE_PTE_COUNT(pte_mask) << 3) - 1);
807 __pte = ((unsigned long)pte) & pte_mask;
808
809 return (u64 *)__pte;
810 }
811
812 /* No level skipping support yet */
813 if (PM_PTE_LEVEL(*pte) != level)
814 return NULL;
815
785 level -= 1; 816 level -= 1;
786 817
818 /* Walk to the next level */
787 pte = IOMMU_PTE_PAGE(*pte); 819 pte = IOMMU_PTE_PAGE(*pte);
788 pte = &pte[PM_LEVEL_INDEX(level, address)]; 820 pte = &pte[PM_LEVEL_INDEX(level, address)];
789
790 if ((PM_PTE_LEVEL(*pte) == 0) && level != map_size) {
791 pte = NULL;
792 break;
793 }
794 } 821 }
795 822
796 return pte; 823 return pte;
@@ -807,44 +834,84 @@ static int iommu_map_page(struct protection_domain *dom,
807 unsigned long bus_addr, 834 unsigned long bus_addr,
808 unsigned long phys_addr, 835 unsigned long phys_addr,
809 int prot, 836 int prot,
810 int map_size) 837 unsigned long page_size)
811{ 838{
812 u64 __pte, *pte; 839 u64 __pte, *pte;
813 840 int i, count;
814 bus_addr = PAGE_ALIGN(bus_addr);
815 phys_addr = PAGE_ALIGN(phys_addr);
816
817 BUG_ON(!PM_ALIGNED(map_size, bus_addr));
818 BUG_ON(!PM_ALIGNED(map_size, phys_addr));
819 841
820 if (!(prot & IOMMU_PROT_MASK)) 842 if (!(prot & IOMMU_PROT_MASK))
821 return -EINVAL; 843 return -EINVAL;
822 844
823 pte = alloc_pte(dom, bus_addr, map_size, NULL, GFP_KERNEL); 845 bus_addr = PAGE_ALIGN(bus_addr);
846 phys_addr = PAGE_ALIGN(phys_addr);
847 count = PAGE_SIZE_PTE_COUNT(page_size);
848 pte = alloc_pte(dom, bus_addr, page_size, NULL, GFP_KERNEL);
849
850 for (i = 0; i < count; ++i)
851 if (IOMMU_PTE_PRESENT(pte[i]))
852 return -EBUSY;
824 853
825 if (IOMMU_PTE_PRESENT(*pte)) 854 if (page_size > PAGE_SIZE) {
826 return -EBUSY; 855 __pte = PAGE_SIZE_PTE(phys_addr, page_size);
856 __pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_P | IOMMU_PTE_FC;
857 } else
858 __pte = phys_addr | IOMMU_PTE_P | IOMMU_PTE_FC;
827 859
828 __pte = phys_addr | IOMMU_PTE_P;
829 if (prot & IOMMU_PROT_IR) 860 if (prot & IOMMU_PROT_IR)
830 __pte |= IOMMU_PTE_IR; 861 __pte |= IOMMU_PTE_IR;
831 if (prot & IOMMU_PROT_IW) 862 if (prot & IOMMU_PROT_IW)
832 __pte |= IOMMU_PTE_IW; 863 __pte |= IOMMU_PTE_IW;
833 864
834 *pte = __pte; 865 for (i = 0; i < count; ++i)
866 pte[i] = __pte;
835 867
836 update_domain(dom); 868 update_domain(dom);
837 869
838 return 0; 870 return 0;
839} 871}
840 872
841static void iommu_unmap_page(struct protection_domain *dom, 873static unsigned long iommu_unmap_page(struct protection_domain *dom,
842 unsigned long bus_addr, int map_size) 874 unsigned long bus_addr,
875 unsigned long page_size)
843{ 876{
844 u64 *pte = fetch_pte(dom, bus_addr, map_size); 877 unsigned long long unmap_size, unmapped;
878 u64 *pte;
879
880 BUG_ON(!is_power_of_2(page_size));
881
882 unmapped = 0;
845 883
846 if (pte) 884 while (unmapped < page_size) {
847 *pte = 0; 885
886 pte = fetch_pte(dom, bus_addr);
887
888 if (!pte) {
889 /*
890 * No PTE for this address
891 * move forward in 4kb steps
892 */
893 unmap_size = PAGE_SIZE;
894 } else if (PM_PTE_LEVEL(*pte) == 0) {
895 /* 4kb PTE found for this address */
896 unmap_size = PAGE_SIZE;
897 *pte = 0ULL;
898 } else {
899 int count, i;
900
901 /* Large PTE found which maps this address */
902 unmap_size = PTE_PAGE_SIZE(*pte);
903 count = PAGE_SIZE_PTE_COUNT(unmap_size);
904 for (i = 0; i < count; i++)
905 pte[i] = 0ULL;
906 }
907
908 bus_addr = (bus_addr & ~(unmap_size - 1)) + unmap_size;
909 unmapped += unmap_size;
910 }
911
912 BUG_ON(!is_power_of_2(unmapped));
913
914 return unmapped;
848} 915}
849 916
850/* 917/*
@@ -878,7 +945,7 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
878 for (addr = e->address_start; addr < e->address_end; 945 for (addr = e->address_start; addr < e->address_end;
879 addr += PAGE_SIZE) { 946 addr += PAGE_SIZE) {
880 ret = iommu_map_page(&dma_dom->domain, addr, addr, e->prot, 947 ret = iommu_map_page(&dma_dom->domain, addr, addr, e->prot,
881 PM_MAP_4k); 948 PAGE_SIZE);
882 if (ret) 949 if (ret)
883 return ret; 950 return ret;
884 /* 951 /*
@@ -1006,7 +1073,7 @@ static int alloc_new_range(struct dma_ops_domain *dma_dom,
1006 u64 *pte, *pte_page; 1073 u64 *pte, *pte_page;
1007 1074
1008 for (i = 0; i < num_ptes; ++i) { 1075 for (i = 0; i < num_ptes; ++i) {
1009 pte = alloc_pte(&dma_dom->domain, address, PM_MAP_4k, 1076 pte = alloc_pte(&dma_dom->domain, address, PAGE_SIZE,
1010 &pte_page, gfp); 1077 &pte_page, gfp);
1011 if (!pte) 1078 if (!pte)
1012 goto out_free; 1079 goto out_free;
@@ -1042,7 +1109,7 @@ static int alloc_new_range(struct dma_ops_domain *dma_dom,
1042 for (i = dma_dom->aperture[index]->offset; 1109 for (i = dma_dom->aperture[index]->offset;
1043 i < dma_dom->aperture_size; 1110 i < dma_dom->aperture_size;
1044 i += PAGE_SIZE) { 1111 i += PAGE_SIZE) {
1045 u64 *pte = fetch_pte(&dma_dom->domain, i, PM_MAP_4k); 1112 u64 *pte = fetch_pte(&dma_dom->domain, i);
1046 if (!pte || !IOMMU_PTE_PRESENT(*pte)) 1113 if (!pte || !IOMMU_PTE_PRESENT(*pte))
1047 continue; 1114 continue;
1048 1115
@@ -1712,7 +1779,7 @@ static u64* dma_ops_get_pte(struct dma_ops_domain *dom,
1712 1779
1713 pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)]; 1780 pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)];
1714 if (!pte) { 1781 if (!pte) {
1715 pte = alloc_pte(&dom->domain, address, PM_MAP_4k, &pte_page, 1782 pte = alloc_pte(&dom->domain, address, PAGE_SIZE, &pte_page,
1716 GFP_ATOMIC); 1783 GFP_ATOMIC);
1717 aperture->pte_pages[APERTURE_PAGE_INDEX(address)] = pte_page; 1784 aperture->pte_pages[APERTURE_PAGE_INDEX(address)] = pte_page;
1718 } else 1785 } else
@@ -2439,12 +2506,11 @@ static int amd_iommu_attach_device(struct iommu_domain *dom,
2439 return ret; 2506 return ret;
2440} 2507}
2441 2508
2442static int amd_iommu_map_range(struct iommu_domain *dom, 2509static int amd_iommu_map(struct iommu_domain *dom, unsigned long iova,
2443 unsigned long iova, phys_addr_t paddr, 2510 phys_addr_t paddr, int gfp_order, int iommu_prot)
2444 size_t size, int iommu_prot)
2445{ 2511{
2512 unsigned long page_size = 0x1000UL << gfp_order;
2446 struct protection_domain *domain = dom->priv; 2513 struct protection_domain *domain = dom->priv;
2447 unsigned long i, npages = iommu_num_pages(paddr, size, PAGE_SIZE);
2448 int prot = 0; 2514 int prot = 0;
2449 int ret; 2515 int ret;
2450 2516
@@ -2453,61 +2519,50 @@ static int amd_iommu_map_range(struct iommu_domain *dom,
2453 if (iommu_prot & IOMMU_WRITE) 2519 if (iommu_prot & IOMMU_WRITE)
2454 prot |= IOMMU_PROT_IW; 2520 prot |= IOMMU_PROT_IW;
2455 2521
2456 iova &= PAGE_MASK;
2457 paddr &= PAGE_MASK;
2458
2459 mutex_lock(&domain->api_lock); 2522 mutex_lock(&domain->api_lock);
2460 2523 ret = iommu_map_page(domain, iova, paddr, prot, page_size);
2461 for (i = 0; i < npages; ++i) {
2462 ret = iommu_map_page(domain, iova, paddr, prot, PM_MAP_4k);
2463 if (ret)
2464 return ret;
2465
2466 iova += PAGE_SIZE;
2467 paddr += PAGE_SIZE;
2468 }
2469
2470 mutex_unlock(&domain->api_lock); 2524 mutex_unlock(&domain->api_lock);
2471 2525
2472 return 0; 2526 return ret;
2473} 2527}
2474 2528
2475static void amd_iommu_unmap_range(struct iommu_domain *dom, 2529static int amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova,
2476 unsigned long iova, size_t size) 2530 int gfp_order)
2477{ 2531{
2478
2479 struct protection_domain *domain = dom->priv; 2532 struct protection_domain *domain = dom->priv;
2480 unsigned long i, npages = iommu_num_pages(iova, size, PAGE_SIZE); 2533 unsigned long page_size, unmap_size;
2481 2534
2482 iova &= PAGE_MASK; 2535 page_size = 0x1000UL << gfp_order;
2483 2536
2484 mutex_lock(&domain->api_lock); 2537 mutex_lock(&domain->api_lock);
2485 2538 unmap_size = iommu_unmap_page(domain, iova, page_size);
2486 for (i = 0; i < npages; ++i) { 2539 mutex_unlock(&domain->api_lock);
2487 iommu_unmap_page(domain, iova, PM_MAP_4k);
2488 iova += PAGE_SIZE;
2489 }
2490 2540
2491 iommu_flush_tlb_pde(domain); 2541 iommu_flush_tlb_pde(domain);
2492 2542
2493 mutex_unlock(&domain->api_lock); 2543 return get_order(unmap_size);
2494} 2544}
2495 2545
2496static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom, 2546static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
2497 unsigned long iova) 2547 unsigned long iova)
2498{ 2548{
2499 struct protection_domain *domain = dom->priv; 2549 struct protection_domain *domain = dom->priv;
2500 unsigned long offset = iova & ~PAGE_MASK; 2550 unsigned long offset_mask;
2501 phys_addr_t paddr; 2551 phys_addr_t paddr;
2502 u64 *pte; 2552 u64 *pte, __pte;
2503 2553
2504 pte = fetch_pte(domain, iova, PM_MAP_4k); 2554 pte = fetch_pte(domain, iova);
2505 2555
2506 if (!pte || !IOMMU_PTE_PRESENT(*pte)) 2556 if (!pte || !IOMMU_PTE_PRESENT(*pte))
2507 return 0; 2557 return 0;
2508 2558
2509 paddr = *pte & IOMMU_PAGE_MASK; 2559 if (PM_PTE_LEVEL(*pte) == 0)
2510 paddr |= offset; 2560 offset_mask = PAGE_SIZE - 1;
2561 else
2562 offset_mask = PTE_PAGE_SIZE(*pte) - 1;
2563
2564 __pte = *pte & PM_ADDR_MASK;
2565 paddr = (__pte & ~offset_mask) | (iova & offset_mask);
2511 2566
2512 return paddr; 2567 return paddr;
2513} 2568}
@@ -2523,8 +2578,8 @@ static struct iommu_ops amd_iommu_ops = {
2523 .domain_destroy = amd_iommu_domain_destroy, 2578 .domain_destroy = amd_iommu_domain_destroy,
2524 .attach_dev = amd_iommu_attach_device, 2579 .attach_dev = amd_iommu_attach_device,
2525 .detach_dev = amd_iommu_detach_device, 2580 .detach_dev = amd_iommu_detach_device,
2526 .map = amd_iommu_map_range, 2581 .map = amd_iommu_map,
2527 .unmap = amd_iommu_unmap_range, 2582 .unmap = amd_iommu_unmap,
2528 .iova_to_phys = amd_iommu_iova_to_phys, 2583 .iova_to_phys = amd_iommu_iova_to_phys,
2529 .domain_has_cap = amd_iommu_domain_has_cap, 2584 .domain_has_cap = amd_iommu_domain_has_cap,
2530}; 2585};
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 6360abf993d4..3bacb4d0844c 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -120,6 +120,7 @@ struct ivmd_header {
120bool amd_iommu_dump; 120bool amd_iommu_dump;
121 121
122static int __initdata amd_iommu_detected; 122static int __initdata amd_iommu_detected;
123static bool __initdata amd_iommu_disabled;
123 124
124u16 amd_iommu_last_bdf; /* largest PCI device id we have 125u16 amd_iommu_last_bdf; /* largest PCI device id we have
125 to handle */ 126 to handle */
@@ -1372,6 +1373,9 @@ void __init amd_iommu_detect(void)
1372 if (no_iommu || (iommu_detected && !gart_iommu_aperture)) 1373 if (no_iommu || (iommu_detected && !gart_iommu_aperture))
1373 return; 1374 return;
1374 1375
1376 if (amd_iommu_disabled)
1377 return;
1378
1375 if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) { 1379 if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) {
1376 iommu_detected = 1; 1380 iommu_detected = 1;
1377 amd_iommu_detected = 1; 1381 amd_iommu_detected = 1;
@@ -1401,6 +1405,8 @@ static int __init parse_amd_iommu_options(char *str)
1401 for (; *str; ++str) { 1405 for (; *str; ++str) {
1402 if (strncmp(str, "fullflush", 9) == 0) 1406 if (strncmp(str, "fullflush", 9) == 0)
1403 amd_iommu_unmap_flush = true; 1407 amd_iommu_unmap_flush = true;
1408 if (strncmp(str, "off", 3) == 0)
1409 amd_iommu_disabled = true;
1404 } 1410 }
1405 1411
1406 return 1; 1412 return 1;
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 03ba1b895f5e..425e53a87feb 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -131,24 +131,6 @@ int es7000_plat;
131 131
132static unsigned int base; 132static unsigned int base;
133 133
134static int
135es7000_rename_gsi(int ioapic, int gsi)
136{
137 if (es7000_plat == ES7000_ZORRO)
138 return gsi;
139
140 if (!base) {
141 int i;
142 for (i = 0; i < nr_ioapics; i++)
143 base += nr_ioapic_registers[i];
144 }
145
146 if (!ioapic && (gsi < 16))
147 gsi += base;
148
149 return gsi;
150}
151
152static int __cpuinit wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip) 134static int __cpuinit wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip)
153{ 135{
154 unsigned long vect = 0, psaival = 0; 136 unsigned long vect = 0, psaival = 0;
@@ -190,7 +172,6 @@ static void setup_unisys(void)
190 es7000_plat = ES7000_ZORRO; 172 es7000_plat = ES7000_ZORRO;
191 else 173 else
192 es7000_plat = ES7000_CLASSIC; 174 es7000_plat = ES7000_CLASSIC;
193 ioapic_renumber_irq = es7000_rename_gsi;
194} 175}
195 176
196/* 177/*
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index eb2789c3f721..33f3563a2a52 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -89,6 +89,9 @@ int nr_ioapics;
89/* IO APIC gsi routing info */ 89/* IO APIC gsi routing info */
90struct mp_ioapic_gsi mp_gsi_routing[MAX_IO_APICS]; 90struct mp_ioapic_gsi mp_gsi_routing[MAX_IO_APICS];
91 91
92/* The last gsi number used */
93u32 gsi_end;
94
92/* MP IRQ source entries */ 95/* MP IRQ source entries */
93struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES]; 96struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES];
94 97
@@ -1013,10 +1016,9 @@ static inline int irq_trigger(int idx)
1013 return MPBIOS_trigger(idx); 1016 return MPBIOS_trigger(idx);
1014} 1017}
1015 1018
1016int (*ioapic_renumber_irq)(int ioapic, int irq);
1017static int pin_2_irq(int idx, int apic, int pin) 1019static int pin_2_irq(int idx, int apic, int pin)
1018{ 1020{
1019 int irq, i; 1021 int irq;
1020 int bus = mp_irqs[idx].srcbus; 1022 int bus = mp_irqs[idx].srcbus;
1021 1023
1022 /* 1024 /*
@@ -1028,18 +1030,12 @@ static int pin_2_irq(int idx, int apic, int pin)
1028 if (test_bit(bus, mp_bus_not_pci)) { 1030 if (test_bit(bus, mp_bus_not_pci)) {
1029 irq = mp_irqs[idx].srcbusirq; 1031 irq = mp_irqs[idx].srcbusirq;
1030 } else { 1032 } else {
1031 /* 1033 u32 gsi = mp_gsi_routing[apic].gsi_base + pin;
1032 * PCI IRQs are mapped in order 1034
1033 */ 1035 if (gsi >= NR_IRQS_LEGACY)
1034 i = irq = 0; 1036 irq = gsi;
1035 while (i < apic) 1037 else
1036 irq += nr_ioapic_registers[i++]; 1038 irq = gsi_end + 1 + gsi;
1037 irq += pin;
1038 /*
1039 * For MPS mode, so far only needed by ES7000 platform
1040 */
1041 if (ioapic_renumber_irq)
1042 irq = ioapic_renumber_irq(apic, irq);
1043 } 1039 }
1044 1040
1045#ifdef CONFIG_X86_32 1041#ifdef CONFIG_X86_32
@@ -1950,20 +1946,8 @@ static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
1950 1946
1951void __init enable_IO_APIC(void) 1947void __init enable_IO_APIC(void)
1952{ 1948{
1953 union IO_APIC_reg_01 reg_01;
1954 int i8259_apic, i8259_pin; 1949 int i8259_apic, i8259_pin;
1955 int apic; 1950 int apic;
1956 unsigned long flags;
1957
1958 /*
1959 * The number of IO-APIC IRQ registers (== #pins):
1960 */
1961 for (apic = 0; apic < nr_ioapics; apic++) {
1962 raw_spin_lock_irqsave(&ioapic_lock, flags);
1963 reg_01.raw = io_apic_read(apic, 1);
1964 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
1965 nr_ioapic_registers[apic] = reg_01.bits.entries+1;
1966 }
1967 1951
1968 if (!legacy_pic->nr_legacy_irqs) 1952 if (!legacy_pic->nr_legacy_irqs)
1969 return; 1953 return;
@@ -3858,27 +3842,20 @@ int __init io_apic_get_redir_entries (int ioapic)
3858 reg_01.raw = io_apic_read(ioapic, 1); 3842 reg_01.raw = io_apic_read(ioapic, 1);
3859 raw_spin_unlock_irqrestore(&ioapic_lock, flags); 3843 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
3860 3844
3861 return reg_01.bits.entries; 3845 /* The register returns the maximum index redir index
3846 * supported, which is one less than the total number of redir
3847 * entries.
3848 */
3849 return reg_01.bits.entries + 1;
3862} 3850}
3863 3851
3864void __init probe_nr_irqs_gsi(void) 3852void __init probe_nr_irqs_gsi(void)
3865{ 3853{
3866 int nr = 0; 3854 int nr;
3867 3855
3868 nr = acpi_probe_gsi(); 3856 nr = gsi_end + 1 + NR_IRQS_LEGACY;
3869 if (nr > nr_irqs_gsi) { 3857 if (nr > nr_irqs_gsi)
3870 nr_irqs_gsi = nr; 3858 nr_irqs_gsi = nr;
3871 } else {
3872 /* for acpi=off or acpi is not compiled in */
3873 int idx;
3874
3875 nr = 0;
3876 for (idx = 0; idx < nr_ioapics; idx++)
3877 nr += io_apic_get_redir_entries(idx) + 1;
3878
3879 if (nr > nr_irqs_gsi)
3880 nr_irqs_gsi = nr;
3881 }
3882 3859
3883 printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi); 3860 printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi);
3884} 3861}
@@ -4085,22 +4062,27 @@ int __init io_apic_get_version(int ioapic)
4085 return reg_01.bits.version; 4062 return reg_01.bits.version;
4086} 4063}
4087 4064
4088int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) 4065int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity)
4089{ 4066{
4090 int i; 4067 int ioapic, pin, idx;
4091 4068
4092 if (skip_ioapic_setup) 4069 if (skip_ioapic_setup)
4093 return -1; 4070 return -1;
4094 4071
4095 for (i = 0; i < mp_irq_entries; i++) 4072 ioapic = mp_find_ioapic(gsi);
4096 if (mp_irqs[i].irqtype == mp_INT && 4073 if (ioapic < 0)
4097 mp_irqs[i].srcbusirq == bus_irq)
4098 break;
4099 if (i >= mp_irq_entries)
4100 return -1; 4074 return -1;
4101 4075
4102 *trigger = irq_trigger(i); 4076 pin = mp_find_ioapic_pin(ioapic, gsi);
4103 *polarity = irq_polarity(i); 4077 if (pin < 0)
4078 return -1;
4079
4080 idx = find_irq_entry(ioapic, pin, mp_INT);
4081 if (idx < 0)
4082 return -1;
4083
4084 *trigger = irq_trigger(idx);
4085 *polarity = irq_polarity(idx);
4104 return 0; 4086 return 0;
4105} 4087}
4106 4088
@@ -4241,7 +4223,7 @@ void __init ioapic_insert_resources(void)
4241 } 4223 }
4242} 4224}
4243 4225
4244int mp_find_ioapic(int gsi) 4226int mp_find_ioapic(u32 gsi)
4245{ 4227{
4246 int i = 0; 4228 int i = 0;
4247 4229
@@ -4256,7 +4238,7 @@ int mp_find_ioapic(int gsi)
4256 return -1; 4238 return -1;
4257} 4239}
4258 4240
4259int mp_find_ioapic_pin(int ioapic, int gsi) 4241int mp_find_ioapic_pin(int ioapic, u32 gsi)
4260{ 4242{
4261 if (WARN_ON(ioapic == -1)) 4243 if (WARN_ON(ioapic == -1))
4262 return -1; 4244 return -1;
@@ -4284,6 +4266,7 @@ static int bad_ioapic(unsigned long address)
4284void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) 4266void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
4285{ 4267{
4286 int idx = 0; 4268 int idx = 0;
4269 int entries;
4287 4270
4288 if (bad_ioapic(address)) 4271 if (bad_ioapic(address))
4289 return; 4272 return;
@@ -4302,9 +4285,17 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
4302 * Build basic GSI lookup table to facilitate gsi->io_apic lookups 4285 * Build basic GSI lookup table to facilitate gsi->io_apic lookups
4303 * and to prevent reprogramming of IOAPIC pins (PCI GSIs). 4286 * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
4304 */ 4287 */
4288 entries = io_apic_get_redir_entries(idx);
4305 mp_gsi_routing[idx].gsi_base = gsi_base; 4289 mp_gsi_routing[idx].gsi_base = gsi_base;
4306 mp_gsi_routing[idx].gsi_end = gsi_base + 4290 mp_gsi_routing[idx].gsi_end = gsi_base + entries - 1;
4307 io_apic_get_redir_entries(idx); 4291
4292 /*
4293 * The number of IO-APIC IRQ registers (== #pins):
4294 */
4295 nr_ioapic_registers[idx] = entries;
4296
4297 if (mp_gsi_routing[idx].gsi_end > gsi_end)
4298 gsi_end = mp_gsi_routing[idx].gsi_end;
4308 4299
4309 printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, " 4300 printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
4310 "GSI %d-%d\n", idx, mp_ioapics[idx].apicid, 4301 "GSI %d-%d\n", idx, mp_ioapics[idx].apicid,
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index c085d52dbaf2..e46f98f36e31 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -735,9 +735,6 @@ void __init uv_system_init(void)
735 uv_node_to_blade[nid] = blade; 735 uv_node_to_blade[nid] = blade;
736 uv_cpu_to_blade[cpu] = blade; 736 uv_cpu_to_blade[cpu] = blade;
737 max_pnode = max(pnode, max_pnode); 737 max_pnode = max(pnode, max_pnode);
738
739 printk(KERN_DEBUG "UV: cpu %d, apicid 0x%x, pnode %d, nid %d, lcpu %d, blade %d\n",
740 cpu, apicid, pnode, nid, lcpu, blade);
741 } 738 }
742 739
743 /* Add blade/pnode info for nodes without cpus */ 740 /* Add blade/pnode info for nodes without cpus */
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 031aa887b0eb..c4f9182ca3ac 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -1224,7 +1224,7 @@ static void reinit_timer(void)
1224#ifdef INIT_TIMER_AFTER_SUSPEND 1224#ifdef INIT_TIMER_AFTER_SUSPEND
1225 unsigned long flags; 1225 unsigned long flags;
1226 1226
1227 spin_lock_irqsave(&i8253_lock, flags); 1227 raw_spin_lock_irqsave(&i8253_lock, flags);
1228 /* set the clock to HZ */ 1228 /* set the clock to HZ */
1229 outb_pit(0x34, PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */ 1229 outb_pit(0x34, PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */
1230 udelay(10); 1230 udelay(10);
@@ -1232,7 +1232,7 @@ static void reinit_timer(void)
1232 udelay(10); 1232 udelay(10);
1233 outb_pit(LATCH >> 8, PIT_CH0); /* MSB */ 1233 outb_pit(LATCH >> 8, PIT_CH0); /* MSB */
1234 udelay(10); 1234 udelay(10);
1235 spin_unlock_irqrestore(&i8253_lock, flags); 1235 raw_spin_unlock_irqrestore(&i8253_lock, flags);
1236#endif 1236#endif
1237} 1237}
1238 1238
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index c202b62f3671..3a785da34b6f 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -14,7 +14,7 @@ CFLAGS_common.o := $(nostackp)
14 14
15obj-y := intel_cacheinfo.o addon_cpuid_features.o 15obj-y := intel_cacheinfo.o addon_cpuid_features.o
16obj-y += proc.o capflags.o powerflags.o common.o 16obj-y += proc.o capflags.o powerflags.o common.o
17obj-y += vmware.o hypervisor.o sched.o 17obj-y += vmware.o hypervisor.o sched.o mshyperv.o
18 18
19obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o 19obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o
20obj-$(CONFIG_X86_64) += bugs_64.o 20obj-$(CONFIG_X86_64) += bugs_64.o
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index 97ad79cdf688..10fa5684a662 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -30,12 +30,14 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
30 const struct cpuid_bit *cb; 30 const struct cpuid_bit *cb;
31 31
32 static const struct cpuid_bit __cpuinitconst cpuid_bits[] = { 32 static const struct cpuid_bit __cpuinitconst cpuid_bits[] = {
33 { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 }, 33 { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 },
34 { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006 }, 34 { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006 },
35 { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a }, 35 { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006 },
36 { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a }, 36 { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007 },
37 { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a }, 37 { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a },
38 { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a }, 38 { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a },
39 { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a },
40 { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a },
39 { 0, 0, 0, 0 } 41 { 0, 0, 0, 0 }
40 }; 42 };
41 43
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 01a265212395..c39576cb3018 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -86,7 +86,7 @@ static void __init check_fpu(void)
86 86
87static void __init check_hlt(void) 87static void __init check_hlt(void)
88{ 88{
89 if (paravirt_enabled()) 89 if (boot_cpu_data.x86 >= 5 || paravirt_enabled())
90 return; 90 return;
91 91
92 printk(KERN_INFO "Checking 'hlt' instruction... "); 92 printk(KERN_INFO "Checking 'hlt' instruction... ");
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 4868e4a951ee..c1c00d0b1692 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1243,10 +1243,7 @@ void __cpuinit cpu_init(void)
1243 /* 1243 /*
1244 * Force FPU initialization: 1244 * Force FPU initialization:
1245 */ 1245 */
1246 if (cpu_has_xsave) 1246 current_thread_info()->status = 0;
1247 current_thread_info()->status = TS_XSAVE;
1248 else
1249 current_thread_info()->status = 0;
1250 clear_used_math(); 1247 clear_used_math();
1251 mxcsr_feature_mask_init(); 1248 mxcsr_feature_mask_init();
1252 1249
diff --git a/arch/x86/kernel/cpu/cpufreq/Makefile b/arch/x86/kernel/cpu/cpufreq/Makefile
index 1840c0a5170b..bd54bf67e6fb 100644
--- a/arch/x86/kernel/cpu/cpufreq/Makefile
+++ b/arch/x86/kernel/cpu/cpufreq/Makefile
@@ -2,8 +2,8 @@
2# K8 systems. ACPI is preferred to all other hardware-specific drivers. 2# K8 systems. ACPI is preferred to all other hardware-specific drivers.
3# speedstep-* is preferred over p4-clockmod. 3# speedstep-* is preferred over p4-clockmod.
4 4
5obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o 5obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o mperf.o
6obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o 6obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o mperf.o
7obj-$(CONFIG_X86_PCC_CPUFREQ) += pcc-cpufreq.o 7obj-$(CONFIG_X86_PCC_CPUFREQ) += pcc-cpufreq.o
8obj-$(CONFIG_X86_POWERNOW_K6) += powernow-k6.o 8obj-$(CONFIG_X86_POWERNOW_K6) += powernow-k6.o
9obj-$(CONFIG_X86_POWERNOW_K7) += powernow-k7.o 9obj-$(CONFIG_X86_POWERNOW_K7) += powernow-k7.o
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 459168083b77..1d3cddaa40ee 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -46,6 +46,7 @@
46#include <asm/msr.h> 46#include <asm/msr.h>
47#include <asm/processor.h> 47#include <asm/processor.h>
48#include <asm/cpufeature.h> 48#include <asm/cpufeature.h>
49#include "mperf.h"
49 50
50#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ 51#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
51 "acpi-cpufreq", msg) 52 "acpi-cpufreq", msg)
@@ -71,8 +72,6 @@ struct acpi_cpufreq_data {
71 72
72static DEFINE_PER_CPU(struct acpi_cpufreq_data *, acfreq_data); 73static DEFINE_PER_CPU(struct acpi_cpufreq_data *, acfreq_data);
73 74
74static DEFINE_PER_CPU(struct aperfmperf, acfreq_old_perf);
75
76/* acpi_perf_data is a pointer to percpu data. */ 75/* acpi_perf_data is a pointer to percpu data. */
77static struct acpi_processor_performance *acpi_perf_data; 76static struct acpi_processor_performance *acpi_perf_data;
78 77
@@ -240,45 +239,6 @@ static u32 get_cur_val(const struct cpumask *mask)
240 return cmd.val; 239 return cmd.val;
241} 240}
242 241
243/* Called via smp_call_function_single(), on the target CPU */
244static void read_measured_perf_ctrs(void *_cur)
245{
246 struct aperfmperf *am = _cur;
247
248 get_aperfmperf(am);
249}
250
251/*
252 * Return the measured active (C0) frequency on this CPU since last call
253 * to this function.
254 * Input: cpu number
255 * Return: Average CPU frequency in terms of max frequency (zero on error)
256 *
257 * We use IA32_MPERF and IA32_APERF MSRs to get the measured performance
258 * over a period of time, while CPU is in C0 state.
259 * IA32_MPERF counts at the rate of max advertised frequency
260 * IA32_APERF counts at the rate of actual CPU frequency
261 * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and
262 * no meaning should be associated with absolute values of these MSRs.
263 */
264static unsigned int get_measured_perf(struct cpufreq_policy *policy,
265 unsigned int cpu)
266{
267 struct aperfmperf perf;
268 unsigned long ratio;
269 unsigned int retval;
270
271 if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1))
272 return 0;
273
274 ratio = calc_aperfmperf_ratio(&per_cpu(acfreq_old_perf, cpu), &perf);
275 per_cpu(acfreq_old_perf, cpu) = perf;
276
277 retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT;
278
279 return retval;
280}
281
282static unsigned int get_cur_freq_on_cpu(unsigned int cpu) 242static unsigned int get_cur_freq_on_cpu(unsigned int cpu)
283{ 243{
284 struct acpi_cpufreq_data *data = per_cpu(acfreq_data, cpu); 244 struct acpi_cpufreq_data *data = per_cpu(acfreq_data, cpu);
@@ -702,7 +662,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
702 662
703 /* Check for APERF/MPERF support in hardware */ 663 /* Check for APERF/MPERF support in hardware */
704 if (cpu_has(c, X86_FEATURE_APERFMPERF)) 664 if (cpu_has(c, X86_FEATURE_APERFMPERF))
705 acpi_cpufreq_driver.getavg = get_measured_perf; 665 acpi_cpufreq_driver.getavg = cpufreq_get_measured_perf;
706 666
707 dprintk("CPU%u - ACPI performance management activated.\n", cpu); 667 dprintk("CPU%u - ACPI performance management activated.\n", cpu);
708 for (i = 0; i < perf->state_count; i++) 668 for (i = 0; i < perf->state_count; i++)
diff --git a/arch/x86/kernel/cpu/cpufreq/mperf.c b/arch/x86/kernel/cpu/cpufreq/mperf.c
new file mode 100644
index 000000000000..911e193018ae
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/mperf.c
@@ -0,0 +1,51 @@
1#include <linux/kernel.h>
2#include <linux/smp.h>
3#include <linux/module.h>
4#include <linux/init.h>
5#include <linux/cpufreq.h>
6#include <linux/slab.h>
7
8#include "mperf.h"
9
10static DEFINE_PER_CPU(struct aperfmperf, acfreq_old_perf);
11
12/* Called via smp_call_function_single(), on the target CPU */
13static void read_measured_perf_ctrs(void *_cur)
14{
15 struct aperfmperf *am = _cur;
16
17 get_aperfmperf(am);
18}
19
20/*
21 * Return the measured active (C0) frequency on this CPU since last call
22 * to this function.
23 * Input: cpu number
24 * Return: Average CPU frequency in terms of max frequency (zero on error)
25 *
26 * We use IA32_MPERF and IA32_APERF MSRs to get the measured performance
27 * over a period of time, while CPU is in C0 state.
28 * IA32_MPERF counts at the rate of max advertised frequency
29 * IA32_APERF counts at the rate of actual CPU frequency
30 * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and
31 * no meaning should be associated with absolute values of these MSRs.
32 */
33unsigned int cpufreq_get_measured_perf(struct cpufreq_policy *policy,
34 unsigned int cpu)
35{
36 struct aperfmperf perf;
37 unsigned long ratio;
38 unsigned int retval;
39
40 if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1))
41 return 0;
42
43 ratio = calc_aperfmperf_ratio(&per_cpu(acfreq_old_perf, cpu), &perf);
44 per_cpu(acfreq_old_perf, cpu) = perf;
45
46 retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT;
47
48 return retval;
49}
50EXPORT_SYMBOL_GPL(cpufreq_get_measured_perf);
51MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/cpu/cpufreq/mperf.h b/arch/x86/kernel/cpu/cpufreq/mperf.h
new file mode 100644
index 000000000000..5dbf2950dc22
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/mperf.h
@@ -0,0 +1,9 @@
1/*
2 * (c) 2010 Advanced Micro Devices, Inc.
3 * Your use of this code is subject to the terms and conditions of the
4 * GNU general public license version 2. See "COPYING" or
5 * http://www.gnu.org/licenses/gpl.html
6 */
7
8unsigned int cpufreq_get_measured_perf(struct cpufreq_policy *policy,
9 unsigned int cpu);
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index b6215b9798e2..6f3dc8fbbfdc 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -1,6 +1,5 @@
1
2/* 1/*
3 * (c) 2003-2006 Advanced Micro Devices, Inc. 2 * (c) 2003-2010 Advanced Micro Devices, Inc.
4 * Your use of this code is subject to the terms and conditions of the 3 * Your use of this code is subject to the terms and conditions of the
5 * GNU general public license version 2. See "COPYING" or 4 * GNU general public license version 2. See "COPYING" or
6 * http://www.gnu.org/licenses/gpl.html 5 * http://www.gnu.org/licenses/gpl.html
@@ -46,6 +45,7 @@
46#define PFX "powernow-k8: " 45#define PFX "powernow-k8: "
47#define VERSION "version 2.20.00" 46#define VERSION "version 2.20.00"
48#include "powernow-k8.h" 47#include "powernow-k8.h"
48#include "mperf.h"
49 49
50/* serialize freq changes */ 50/* serialize freq changes */
51static DEFINE_MUTEX(fidvid_mutex); 51static DEFINE_MUTEX(fidvid_mutex);
@@ -54,6 +54,12 @@ static DEFINE_PER_CPU(struct powernow_k8_data *, powernow_data);
54 54
55static int cpu_family = CPU_OPTERON; 55static int cpu_family = CPU_OPTERON;
56 56
57/* core performance boost */
58static bool cpb_capable, cpb_enabled;
59static struct msr __percpu *msrs;
60
61static struct cpufreq_driver cpufreq_amd64_driver;
62
57#ifndef CONFIG_SMP 63#ifndef CONFIG_SMP
58static inline const struct cpumask *cpu_core_mask(int cpu) 64static inline const struct cpumask *cpu_core_mask(int cpu)
59{ 65{
@@ -1249,6 +1255,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1249 struct powernow_k8_data *data; 1255 struct powernow_k8_data *data;
1250 struct init_on_cpu init_on_cpu; 1256 struct init_on_cpu init_on_cpu;
1251 int rc; 1257 int rc;
1258 struct cpuinfo_x86 *c = &cpu_data(pol->cpu);
1252 1259
1253 if (!cpu_online(pol->cpu)) 1260 if (!cpu_online(pol->cpu))
1254 return -ENODEV; 1261 return -ENODEV;
@@ -1323,6 +1330,10 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1323 return -EINVAL; 1330 return -EINVAL;
1324 } 1331 }
1325 1332
1333 /* Check for APERF/MPERF support in hardware */
1334 if (cpu_has(c, X86_FEATURE_APERFMPERF))
1335 cpufreq_amd64_driver.getavg = cpufreq_get_measured_perf;
1336
1326 cpufreq_frequency_table_get_attr(data->powernow_table, pol->cpu); 1337 cpufreq_frequency_table_get_attr(data->powernow_table, pol->cpu);
1327 1338
1328 if (cpu_family == CPU_HW_PSTATE) 1339 if (cpu_family == CPU_HW_PSTATE)
@@ -1394,8 +1405,77 @@ out:
1394 return khz; 1405 return khz;
1395} 1406}
1396 1407
1408static void _cpb_toggle_msrs(bool t)
1409{
1410 int cpu;
1411
1412 get_online_cpus();
1413
1414 rdmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs);
1415
1416 for_each_cpu(cpu, cpu_online_mask) {
1417 struct msr *reg = per_cpu_ptr(msrs, cpu);
1418 if (t)
1419 reg->l &= ~BIT(25);
1420 else
1421 reg->l |= BIT(25);
1422 }
1423 wrmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs);
1424
1425 put_online_cpus();
1426}
1427
1428/*
1429 * Switch on/off core performance boosting.
1430 *
1431 * 0=disable
1432 * 1=enable.
1433 */
1434static void cpb_toggle(bool t)
1435{
1436 if (!cpb_capable)
1437 return;
1438
1439 if (t && !cpb_enabled) {
1440 cpb_enabled = true;
1441 _cpb_toggle_msrs(t);
1442 printk(KERN_INFO PFX "Core Boosting enabled.\n");
1443 } else if (!t && cpb_enabled) {
1444 cpb_enabled = false;
1445 _cpb_toggle_msrs(t);
1446 printk(KERN_INFO PFX "Core Boosting disabled.\n");
1447 }
1448}
1449
1450static ssize_t store_cpb(struct cpufreq_policy *policy, const char *buf,
1451 size_t count)
1452{
1453 int ret = -EINVAL;
1454 unsigned long val = 0;
1455
1456 ret = strict_strtoul(buf, 10, &val);
1457 if (!ret && (val == 0 || val == 1) && cpb_capable)
1458 cpb_toggle(val);
1459 else
1460 return -EINVAL;
1461
1462 return count;
1463}
1464
1465static ssize_t show_cpb(struct cpufreq_policy *policy, char *buf)
1466{
1467 return sprintf(buf, "%u\n", cpb_enabled);
1468}
1469
1470#define define_one_rw(_name) \
1471static struct freq_attr _name = \
1472__ATTR(_name, 0644, show_##_name, store_##_name)
1473
1474define_one_rw(cpb);
1475
1397static struct freq_attr *powernow_k8_attr[] = { 1476static struct freq_attr *powernow_k8_attr[] = {
1398 &cpufreq_freq_attr_scaling_available_freqs, 1477 &cpufreq_freq_attr_scaling_available_freqs,
1478 &cpb,
1399 NULL, 1479 NULL,
1400}; 1480};
1401 1481
@@ -1411,10 +1491,51 @@ static struct cpufreq_driver cpufreq_amd64_driver = {
1411 .attr = powernow_k8_attr, 1491 .attr = powernow_k8_attr,
1412}; 1492};
1413 1493
1494/*
1495 * Clear the boost-disable flag on the CPU_DOWN path so that this cpu
1496 * cannot block the remaining ones from boosting. On the CPU_UP path we
1497 * simply keep the boost-disable flag in sync with the current global
1498 * state.
1499 */
1500static int __cpuinit cpb_notify(struct notifier_block *nb, unsigned long action,
1501 void *hcpu)
1502{
1503 unsigned cpu = (long)hcpu;
1504 u32 lo, hi;
1505
1506 switch (action) {
1507 case CPU_UP_PREPARE:
1508 case CPU_UP_PREPARE_FROZEN:
1509
1510 if (!cpb_enabled) {
1511 rdmsr_on_cpu(cpu, MSR_K7_HWCR, &lo, &hi);
1512 lo |= BIT(25);
1513 wrmsr_on_cpu(cpu, MSR_K7_HWCR, lo, hi);
1514 }
1515 break;
1516
1517 case CPU_DOWN_PREPARE:
1518 case CPU_DOWN_PREPARE_FROZEN:
1519 rdmsr_on_cpu(cpu, MSR_K7_HWCR, &lo, &hi);
1520 lo &= ~BIT(25);
1521 wrmsr_on_cpu(cpu, MSR_K7_HWCR, lo, hi);
1522 break;
1523
1524 default:
1525 break;
1526 }
1527
1528 return NOTIFY_OK;
1529}
1530
1531static struct notifier_block __cpuinitdata cpb_nb = {
1532 .notifier_call = cpb_notify,
1533};
1534
1414/* driver entry point for init */ 1535/* driver entry point for init */
1415static int __cpuinit powernowk8_init(void) 1536static int __cpuinit powernowk8_init(void)
1416{ 1537{
1417 unsigned int i, supported_cpus = 0; 1538 unsigned int i, supported_cpus = 0, cpu;
1418 1539
1419 for_each_online_cpu(i) { 1540 for_each_online_cpu(i) {
1420 int rc; 1541 int rc;
@@ -1423,15 +1544,36 @@ static int __cpuinit powernowk8_init(void)
1423 supported_cpus++; 1544 supported_cpus++;
1424 } 1545 }
1425 1546
1426 if (supported_cpus == num_online_cpus()) { 1547 if (supported_cpus != num_online_cpus())
1427 printk(KERN_INFO PFX "Found %d %s " 1548 return -ENODEV;
1428 "processors (%d cpu cores) (" VERSION ")\n", 1549
1429 num_online_nodes(), 1550 printk(KERN_INFO PFX "Found %d %s (%d cpu cores) (" VERSION ")\n",
1430 boot_cpu_data.x86_model_id, supported_cpus); 1551 num_online_nodes(), boot_cpu_data.x86_model_id, supported_cpus);
1431 return cpufreq_register_driver(&cpufreq_amd64_driver); 1552
1553 if (boot_cpu_has(X86_FEATURE_CPB)) {
1554
1555 cpb_capable = true;
1556
1557 register_cpu_notifier(&cpb_nb);
1558
1559 msrs = msrs_alloc();
1560 if (!msrs) {
1561 printk(KERN_ERR "%s: Error allocating msrs!\n", __func__);
1562 return -ENOMEM;
1563 }
1564
1565 rdmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs);
1566
1567 for_each_cpu(cpu, cpu_online_mask) {
1568 struct msr *reg = per_cpu_ptr(msrs, cpu);
1569 cpb_enabled |= !(!!(reg->l & BIT(25)));
1570 }
1571
1572 printk(KERN_INFO PFX "Core Performance Boosting: %s.\n",
1573 (cpb_enabled ? "on" : "off"));
1432 } 1574 }
1433 1575
1434 return -ENODEV; 1576 return cpufreq_register_driver(&cpufreq_amd64_driver);
1435} 1577}
1436 1578
1437/* driver entry point for term */ 1579/* driver entry point for term */
@@ -1439,6 +1581,13 @@ static void __exit powernowk8_exit(void)
1439{ 1581{
1440 dprintk("exit\n"); 1582 dprintk("exit\n");
1441 1583
1584 if (boot_cpu_has(X86_FEATURE_CPB)) {
1585 msrs_free(msrs);
1586 msrs = NULL;
1587
1588 unregister_cpu_notifier(&cpb_nb);
1589 }
1590
1442 cpufreq_unregister_driver(&cpufreq_amd64_driver); 1591 cpufreq_unregister_driver(&cpufreq_amd64_driver);
1443} 1592}
1444 1593
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
index 02ce824073cb..df3529b1c02d 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
@@ -5,7 +5,6 @@
5 * http://www.gnu.org/licenses/gpl.html 5 * http://www.gnu.org/licenses/gpl.html
6 */ 6 */
7 7
8
9enum pstate { 8enum pstate {
10 HW_PSTATE_INVALID = 0xff, 9 HW_PSTATE_INVALID = 0xff,
11 HW_PSTATE_0 = 0, 10 HW_PSTATE_0 = 0,
@@ -55,7 +54,6 @@ struct powernow_k8_data {
55 struct cpumask *available_cores; 54 struct cpumask *available_cores;
56}; 55};
57 56
58
59/* processor's cpuid instruction support */ 57/* processor's cpuid instruction support */
60#define CPUID_PROCESSOR_SIGNATURE 1 /* function 1 */ 58#define CPUID_PROCESSOR_SIGNATURE 1 /* function 1 */
61#define CPUID_XFAM 0x0ff00000 /* extended family */ 59#define CPUID_XFAM 0x0ff00000 /* extended family */
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index 08be922de33a..dd531cc56a8f 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -21,37 +21,55 @@
21 * 21 *
22 */ 22 */
23 23
24#include <linux/module.h>
24#include <asm/processor.h> 25#include <asm/processor.h>
25#include <asm/vmware.h>
26#include <asm/hypervisor.h> 26#include <asm/hypervisor.h>
27 27
28static inline void __cpuinit 28/*
29detect_hypervisor_vendor(struct cpuinfo_x86 *c) 29 * Hypervisor detect order. This is specified explicitly here because
30 * some hypervisors might implement compatibility modes for other
31 * hypervisors and therefore need to be detected in specific sequence.
32 */
33static const __initconst struct hypervisor_x86 * const hypervisors[] =
30{ 34{
31 if (vmware_platform()) 35 &x86_hyper_vmware,
32 c->x86_hyper_vendor = X86_HYPER_VENDOR_VMWARE; 36 &x86_hyper_ms_hyperv,
33 else 37};
34 c->x86_hyper_vendor = X86_HYPER_VENDOR_NONE;
35}
36 38
37static inline void __cpuinit 39const struct hypervisor_x86 *x86_hyper;
38hypervisor_set_feature_bits(struct cpuinfo_x86 *c) 40EXPORT_SYMBOL(x86_hyper);
41
42static inline void __init
43detect_hypervisor_vendor(void)
39{ 44{
40 if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE) { 45 const struct hypervisor_x86 *h, * const *p;
41 vmware_set_feature_bits(c); 46
42 return; 47 for (p = hypervisors; p < hypervisors + ARRAY_SIZE(hypervisors); p++) {
48 h = *p;
49 if (h->detect()) {
50 x86_hyper = h;
51 printk(KERN_INFO "Hypervisor detected: %s\n", h->name);
52 break;
53 }
43 } 54 }
44} 55}
45 56
46void __cpuinit init_hypervisor(struct cpuinfo_x86 *c) 57void __cpuinit init_hypervisor(struct cpuinfo_x86 *c)
47{ 58{
48 detect_hypervisor_vendor(c); 59 if (x86_hyper && x86_hyper->set_cpu_features)
49 hypervisor_set_feature_bits(c); 60 x86_hyper->set_cpu_features(c);
50} 61}
51 62
52void __init init_hypervisor_platform(void) 63void __init init_hypervisor_platform(void)
53{ 64{
65
66 detect_hypervisor_vendor();
67
68 if (!x86_hyper)
69 return;
70
54 init_hypervisor(&boot_cpu_data); 71 init_hypervisor(&boot_cpu_data);
55 if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE) 72
56 vmware_platform_setup(); 73 if (x86_hyper->init_platform)
74 x86_hyper->init_platform();
57} 75}
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 1366c7cfd483..85f69cdeae10 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -12,7 +12,6 @@
12#include <asm/processor.h> 12#include <asm/processor.h>
13#include <asm/pgtable.h> 13#include <asm/pgtable.h>
14#include <asm/msr.h> 14#include <asm/msr.h>
15#include <asm/ds.h>
16#include <asm/bugs.h> 15#include <asm/bugs.h>
17#include <asm/cpu.h> 16#include <asm/cpu.h>
18 17
@@ -373,12 +372,6 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
373 set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); 372 set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
374 } 373 }
375 374
376 if (c->cpuid_level > 6) {
377 unsigned ecx = cpuid_ecx(6);
378 if (ecx & 0x01)
379 set_cpu_cap(c, X86_FEATURE_APERFMPERF);
380 }
381
382 if (cpu_has_xmm2) 375 if (cpu_has_xmm2)
383 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); 376 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
384 if (cpu_has_ds) { 377 if (cpu_has_ds) {
@@ -388,7 +381,6 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
388 set_cpu_cap(c, X86_FEATURE_BTS); 381 set_cpu_cap(c, X86_FEATURE_BTS);
389 if (!(l1 & (1<<12))) 382 if (!(l1 & (1<<12)))
390 set_cpu_cap(c, X86_FEATURE_PEBS); 383 set_cpu_cap(c, X86_FEATURE_PEBS);
391 ds_init_intel(c);
392 } 384 }
393 385
394 if (c->x86 == 6 && c->x86_model == 29 && cpu_has_clflush) 386 if (c->x86 == 6 && c->x86_model == 29 && cpu_has_clflush)
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index b3eeb66c0a51..33eae2062cf5 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -148,13 +148,19 @@ union _cpuid4_leaf_ecx {
148 u32 full; 148 u32 full;
149}; 149};
150 150
151struct amd_l3_cache {
152 struct pci_dev *dev;
153 bool can_disable;
154 unsigned indices;
155 u8 subcaches[4];
156};
157
151struct _cpuid4_info { 158struct _cpuid4_info {
152 union _cpuid4_leaf_eax eax; 159 union _cpuid4_leaf_eax eax;
153 union _cpuid4_leaf_ebx ebx; 160 union _cpuid4_leaf_ebx ebx;
154 union _cpuid4_leaf_ecx ecx; 161 union _cpuid4_leaf_ecx ecx;
155 unsigned long size; 162 unsigned long size;
156 bool can_disable; 163 struct amd_l3_cache *l3;
157 unsigned int l3_indices;
158 DECLARE_BITMAP(shared_cpu_map, NR_CPUS); 164 DECLARE_BITMAP(shared_cpu_map, NR_CPUS);
159}; 165};
160 166
@@ -164,8 +170,7 @@ struct _cpuid4_info_regs {
164 union _cpuid4_leaf_ebx ebx; 170 union _cpuid4_leaf_ebx ebx;
165 union _cpuid4_leaf_ecx ecx; 171 union _cpuid4_leaf_ecx ecx;
166 unsigned long size; 172 unsigned long size;
167 bool can_disable; 173 struct amd_l3_cache *l3;
168 unsigned int l3_indices;
169}; 174};
170 175
171unsigned short num_cache_leaves; 176unsigned short num_cache_leaves;
@@ -302,87 +307,163 @@ struct _cache_attr {
302}; 307};
303 308
304#ifdef CONFIG_CPU_SUP_AMD 309#ifdef CONFIG_CPU_SUP_AMD
305static unsigned int __cpuinit amd_calc_l3_indices(void) 310
311/*
312 * L3 cache descriptors
313 */
314static struct amd_l3_cache **__cpuinitdata l3_caches;
315
316static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3)
306{ 317{
307 /*
308 * We're called over smp_call_function_single() and therefore
309 * are on the correct cpu.
310 */
311 int cpu = smp_processor_id();
312 int node = cpu_to_node(cpu);
313 struct pci_dev *dev = node_to_k8_nb_misc(node);
314 unsigned int sc0, sc1, sc2, sc3; 318 unsigned int sc0, sc1, sc2, sc3;
315 u32 val = 0; 319 u32 val = 0;
316 320
317 pci_read_config_dword(dev, 0x1C4, &val); 321 pci_read_config_dword(l3->dev, 0x1C4, &val);
318 322
319 /* calculate subcache sizes */ 323 /* calculate subcache sizes */
320 sc0 = !(val & BIT(0)); 324 l3->subcaches[0] = sc0 = !(val & BIT(0));
321 sc1 = !(val & BIT(4)); 325 l3->subcaches[1] = sc1 = !(val & BIT(4));
322 sc2 = !(val & BIT(8)) + !(val & BIT(9)); 326 l3->subcaches[2] = sc2 = !(val & BIT(8)) + !(val & BIT(9));
323 sc3 = !(val & BIT(12)) + !(val & BIT(13)); 327 l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13));
324 328
325 return (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1; 329 l3->indices = (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1;
330}
331
332static struct amd_l3_cache * __cpuinit amd_init_l3_cache(int node)
333{
334 struct amd_l3_cache *l3;
335 struct pci_dev *dev = node_to_k8_nb_misc(node);
336
337 l3 = kzalloc(sizeof(struct amd_l3_cache), GFP_ATOMIC);
338 if (!l3) {
339 printk(KERN_WARNING "Error allocating L3 struct\n");
340 return NULL;
341 }
342
343 l3->dev = dev;
344
345 amd_calc_l3_indices(l3);
346
347 return l3;
326} 348}
327 349
328static void __cpuinit 350static void __cpuinit
329amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) 351amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
330{ 352{
331 if (index < 3) 353 int node;
354
355 if (boot_cpu_data.x86 != 0x10)
332 return; 356 return;
333 357
334 if (boot_cpu_data.x86 == 0x11) 358 if (index < 3)
335 return; 359 return;
336 360
337 /* see errata #382 and #388 */ 361 /* see errata #382 and #388 */
338 if ((boot_cpu_data.x86 == 0x10) && 362 if (boot_cpu_data.x86_model < 0x8)
339 ((boot_cpu_data.x86_model < 0x8) || 363 return;
340 (boot_cpu_data.x86_mask < 0x1))) 364
365 if ((boot_cpu_data.x86_model == 0x8 ||
366 boot_cpu_data.x86_model == 0x9)
367 &&
368 boot_cpu_data.x86_mask < 0x1)
369 return;
370
371 /* not in virtualized environments */
372 if (num_k8_northbridges == 0)
341 return; 373 return;
342 374
343 this_leaf->can_disable = true; 375 /*
344 this_leaf->l3_indices = amd_calc_l3_indices(); 376 * Strictly speaking, the amount in @size below is leaked since it is
377 * never freed but this is done only on shutdown so it doesn't matter.
378 */
379 if (!l3_caches) {
380 int size = num_k8_northbridges * sizeof(struct amd_l3_cache *);
381
382 l3_caches = kzalloc(size, GFP_ATOMIC);
383 if (!l3_caches)
384 return;
385 }
386
387 node = amd_get_nb_id(smp_processor_id());
388
389 if (!l3_caches[node]) {
390 l3_caches[node] = amd_init_l3_cache(node);
391 l3_caches[node]->can_disable = true;
392 }
393
394 WARN_ON(!l3_caches[node]);
395
396 this_leaf->l3 = l3_caches[node];
345} 397}
346 398
347static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, 399static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
348 unsigned int index) 400 unsigned int slot)
349{ 401{
350 int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); 402 struct pci_dev *dev = this_leaf->l3->dev;
351 int node = amd_get_nb_id(cpu);
352 struct pci_dev *dev = node_to_k8_nb_misc(node);
353 unsigned int reg = 0; 403 unsigned int reg = 0;
354 404
355 if (!this_leaf->can_disable) 405 if (!this_leaf->l3 || !this_leaf->l3->can_disable)
356 return -EINVAL; 406 return -EINVAL;
357 407
358 if (!dev) 408 if (!dev)
359 return -EINVAL; 409 return -EINVAL;
360 410
361 pci_read_config_dword(dev, 0x1BC + index * 4, &reg); 411 pci_read_config_dword(dev, 0x1BC + slot * 4, &reg);
362 return sprintf(buf, "0x%08x\n", reg); 412 return sprintf(buf, "0x%08x\n", reg);
363} 413}
364 414
365#define SHOW_CACHE_DISABLE(index) \ 415#define SHOW_CACHE_DISABLE(slot) \
366static ssize_t \ 416static ssize_t \
367show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf) \ 417show_cache_disable_##slot(struct _cpuid4_info *this_leaf, char *buf) \
368{ \ 418{ \
369 return show_cache_disable(this_leaf, buf, index); \ 419 return show_cache_disable(this_leaf, buf, slot); \
370} 420}
371SHOW_CACHE_DISABLE(0) 421SHOW_CACHE_DISABLE(0)
372SHOW_CACHE_DISABLE(1) 422SHOW_CACHE_DISABLE(1)
373 423
424static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu,
425 unsigned slot, unsigned long idx)
426{
427 int i;
428
429 idx |= BIT(30);
430
431 /*
432 * disable index in all 4 subcaches
433 */
434 for (i = 0; i < 4; i++) {
435 u32 reg = idx | (i << 20);
436
437 if (!l3->subcaches[i])
438 continue;
439
440 pci_write_config_dword(l3->dev, 0x1BC + slot * 4, reg);
441
442 /*
443 * We need to WBINVD on a core on the node containing the L3
444 * cache which indices we disable therefore a simple wbinvd()
445 * is not sufficient.
446 */
447 wbinvd_on_cpu(cpu);
448
449 reg |= BIT(31);
450 pci_write_config_dword(l3->dev, 0x1BC + slot * 4, reg);
451 }
452}
453
454
374static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, 455static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
375 const char *buf, size_t count, unsigned int index) 456 const char *buf, size_t count,
457 unsigned int slot)
376{ 458{
459 struct pci_dev *dev = this_leaf->l3->dev;
377 int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); 460 int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
378 int node = amd_get_nb_id(cpu);
379 struct pci_dev *dev = node_to_k8_nb_misc(node);
380 unsigned long val = 0; 461 unsigned long val = 0;
381 462
382#define SUBCACHE_MASK (3UL << 20) 463#define SUBCACHE_MASK (3UL << 20)
383#define SUBCACHE_INDEX 0xfff 464#define SUBCACHE_INDEX 0xfff
384 465
385 if (!this_leaf->can_disable) 466 if (!this_leaf->l3 || !this_leaf->l3->can_disable)
386 return -EINVAL; 467 return -EINVAL;
387 468
388 if (!capable(CAP_SYS_ADMIN)) 469 if (!capable(CAP_SYS_ADMIN))
@@ -396,26 +477,20 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
396 477
397 /* do not allow writes outside of allowed bits */ 478 /* do not allow writes outside of allowed bits */
398 if ((val & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) || 479 if ((val & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) ||
399 ((val & SUBCACHE_INDEX) > this_leaf->l3_indices)) 480 ((val & SUBCACHE_INDEX) > this_leaf->l3->indices))
400 return -EINVAL; 481 return -EINVAL;
401 482
402 val |= BIT(30); 483 amd_l3_disable_index(this_leaf->l3, cpu, slot, val);
403 pci_write_config_dword(dev, 0x1BC + index * 4, val); 484
404 /*
405 * We need to WBINVD on a core on the node containing the L3 cache which
406 * indices we disable therefore a simple wbinvd() is not sufficient.
407 */
408 wbinvd_on_cpu(cpu);
409 pci_write_config_dword(dev, 0x1BC + index * 4, val | BIT(31));
410 return count; 485 return count;
411} 486}
412 487
413#define STORE_CACHE_DISABLE(index) \ 488#define STORE_CACHE_DISABLE(slot) \
414static ssize_t \ 489static ssize_t \
415store_cache_disable_##index(struct _cpuid4_info *this_leaf, \ 490store_cache_disable_##slot(struct _cpuid4_info *this_leaf, \
416 const char *buf, size_t count) \ 491 const char *buf, size_t count) \
417{ \ 492{ \
418 return store_cache_disable(this_leaf, buf, count, index); \ 493 return store_cache_disable(this_leaf, buf, count, slot); \
419} 494}
420STORE_CACHE_DISABLE(0) 495STORE_CACHE_DISABLE(0)
421STORE_CACHE_DISABLE(1) 496STORE_CACHE_DISABLE(1)
@@ -443,8 +518,7 @@ __cpuinit cpuid4_cache_lookup_regs(int index,
443 518
444 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { 519 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
445 amd_cpuid4(index, &eax, &ebx, &ecx); 520 amd_cpuid4(index, &eax, &ebx, &ecx);
446 if (boot_cpu_data.x86 >= 0x10) 521 amd_check_l3_disable(index, this_leaf);
447 amd_check_l3_disable(index, this_leaf);
448 } else { 522 } else {
449 cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); 523 cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx);
450 } 524 }
@@ -701,6 +775,7 @@ static void __cpuinit free_cache_attributes(unsigned int cpu)
701 for (i = 0; i < num_cache_leaves; i++) 775 for (i = 0; i < num_cache_leaves; i++)
702 cache_remove_shared_cpu_map(cpu, i); 776 cache_remove_shared_cpu_map(cpu, i);
703 777
778 kfree(per_cpu(ici_cpuid4_info, cpu)->l3);
704 kfree(per_cpu(ici_cpuid4_info, cpu)); 779 kfree(per_cpu(ici_cpuid4_info, cpu));
705 per_cpu(ici_cpuid4_info, cpu) = NULL; 780 per_cpu(ici_cpuid4_info, cpu) = NULL;
706} 781}
@@ -985,7 +1060,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
985 1060
986 this_leaf = CPUID4_INFO_IDX(cpu, i); 1061 this_leaf = CPUID4_INFO_IDX(cpu, i);
987 1062
988 if (this_leaf->can_disable) 1063 if (this_leaf->l3 && this_leaf->l3->can_disable)
989 ktype_cache.default_attrs = default_l3_attrs; 1064 ktype_cache.default_attrs = default_l3_attrs;
990 else 1065 else
991 ktype_cache.default_attrs = default_attrs; 1066 ktype_cache.default_attrs = default_attrs;
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 8a6f0afa767e..7a355ddcc64b 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -539,7 +539,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
539 struct mce m; 539 struct mce m;
540 int i; 540 int i;
541 541
542 __get_cpu_var(mce_poll_count)++; 542 percpu_inc(mce_poll_count);
543 543
544 mce_setup(&m); 544 mce_setup(&m);
545 545
@@ -934,7 +934,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
934 934
935 atomic_inc(&mce_entry); 935 atomic_inc(&mce_entry);
936 936
937 __get_cpu_var(mce_exception_count)++; 937 percpu_inc(mce_exception_count);
938 938
939 if (notify_die(DIE_NMI, "machine check", regs, error_code, 939 if (notify_die(DIE_NMI, "machine check", regs, error_code,
940 18, SIGKILL) == NOTIFY_STOP) 940 18, SIGKILL) == NOTIFY_STOP)
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
new file mode 100644
index 000000000000..16f41bbe46b6
--- /dev/null
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -0,0 +1,55 @@
1/*
2 * HyperV Detection code.
3 *
4 * Copyright (C) 2010, Novell, Inc.
5 * Author : K. Y. Srinivasan <ksrinivasan@novell.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; version 2 of the License.
10 *
11 */
12
13#include <linux/types.h>
14#include <linux/module.h>
15#include <asm/processor.h>
16#include <asm/hypervisor.h>
17#include <asm/hyperv.h>
18#include <asm/mshyperv.h>
19
20struct ms_hyperv_info ms_hyperv;
21
22static bool __init ms_hyperv_platform(void)
23{
24 u32 eax;
25 u32 hyp_signature[3];
26
27 if (!boot_cpu_has(X86_FEATURE_HYPERVISOR))
28 return false;
29
30 cpuid(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS,
31 &eax, &hyp_signature[0], &hyp_signature[1], &hyp_signature[2]);
32
33 return eax >= HYPERV_CPUID_MIN &&
34 eax <= HYPERV_CPUID_MAX &&
35 !memcmp("Microsoft Hv", hyp_signature, 12);
36}
37
38static void __init ms_hyperv_init_platform(void)
39{
40 /*
41 * Extract the features and hints
42 */
43 ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES);
44 ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO);
45
46 printk(KERN_INFO "HyperV: features 0x%x, hints 0x%x\n",
47 ms_hyperv.features, ms_hyperv.hints);
48}
49
50const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
51 .name = "Microsoft HyperV",
52 .detect = ms_hyperv_platform,
53 .init_platform = ms_hyperv_init_platform,
54};
55EXPORT_SYMBOL(x86_hyper_ms_hyperv);
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index db5bdc8addf8..fd4db0db3708 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -31,46 +31,51 @@
31#include <asm/nmi.h> 31#include <asm/nmi.h>
32#include <asm/compat.h> 32#include <asm/compat.h>
33 33
34static u64 perf_event_mask __read_mostly; 34#if 0
35#undef wrmsrl
36#define wrmsrl(msr, val) \
37do { \
38 trace_printk("wrmsrl(%lx, %lx)\n", (unsigned long)(msr),\
39 (unsigned long)(val)); \
40 native_write_msr((msr), (u32)((u64)(val)), \
41 (u32)((u64)(val) >> 32)); \
42} while (0)
43#endif
35 44
36/* The maximal number of PEBS events: */ 45/*
37#define MAX_PEBS_EVENTS 4 46 * best effort, GUP based copy_from_user() that assumes IRQ or NMI context
47 */
48static unsigned long
49copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
50{
51 unsigned long offset, addr = (unsigned long)from;
52 int type = in_nmi() ? KM_NMI : KM_IRQ0;
53 unsigned long size, len = 0;
54 struct page *page;
55 void *map;
56 int ret;
38 57
39/* The size of a BTS record in bytes: */ 58 do {
40#define BTS_RECORD_SIZE 24 59 ret = __get_user_pages_fast(addr, 1, 0, &page);
60 if (!ret)
61 break;
41 62
42/* The size of a per-cpu BTS buffer in bytes: */ 63 offset = addr & (PAGE_SIZE - 1);
43#define BTS_BUFFER_SIZE (BTS_RECORD_SIZE * 2048) 64 size = min(PAGE_SIZE - offset, n - len);
44 65
45/* The BTS overflow threshold in bytes from the end of the buffer: */ 66 map = kmap_atomic(page, type);
46#define BTS_OVFL_TH (BTS_RECORD_SIZE * 128) 67 memcpy(to, map+offset, size);
68 kunmap_atomic(map, type);
69 put_page(page);
47 70
71 len += size;
72 to += size;
73 addr += size;
48 74
49/* 75 } while (len < n);
50 * Bits in the debugctlmsr controlling branch tracing.
51 */
52#define X86_DEBUGCTL_TR (1 << 6)
53#define X86_DEBUGCTL_BTS (1 << 7)
54#define X86_DEBUGCTL_BTINT (1 << 8)
55#define X86_DEBUGCTL_BTS_OFF_OS (1 << 9)
56#define X86_DEBUGCTL_BTS_OFF_USR (1 << 10)
57 76
58/* 77 return len;
59 * A debug store configuration. 78}
60 *
61 * We only support architectures that use 64bit fields.
62 */
63struct debug_store {
64 u64 bts_buffer_base;
65 u64 bts_index;
66 u64 bts_absolute_maximum;
67 u64 bts_interrupt_threshold;
68 u64 pebs_buffer_base;
69 u64 pebs_index;
70 u64 pebs_absolute_maximum;
71 u64 pebs_interrupt_threshold;
72 u64 pebs_event_reset[MAX_PEBS_EVENTS];
73};
74 79
75struct event_constraint { 80struct event_constraint {
76 union { 81 union {
@@ -89,18 +94,41 @@ struct amd_nb {
89 struct event_constraint event_constraints[X86_PMC_IDX_MAX]; 94 struct event_constraint event_constraints[X86_PMC_IDX_MAX];
90}; 95};
91 96
97#define MAX_LBR_ENTRIES 16
98
92struct cpu_hw_events { 99struct cpu_hw_events {
100 /*
101 * Generic x86 PMC bits
102 */
93 struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */ 103 struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */
94 unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; 104 unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
95 unsigned long interrupts;
96 int enabled; 105 int enabled;
97 struct debug_store *ds;
98 106
99 int n_events; 107 int n_events;
100 int n_added; 108 int n_added;
101 int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */ 109 int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
102 u64 tags[X86_PMC_IDX_MAX]; 110 u64 tags[X86_PMC_IDX_MAX];
103 struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */ 111 struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */
112
113 unsigned int group_flag;
114
115 /*
116 * Intel DebugStore bits
117 */
118 struct debug_store *ds;
119 u64 pebs_enabled;
120
121 /*
122 * Intel LBR bits
123 */
124 int lbr_users;
125 void *lbr_context;
126 struct perf_branch_stack lbr_stack;
127 struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES];
128
129 /*
130 * AMD specific bits
131 */
104 struct amd_nb *amd_nb; 132 struct amd_nb *amd_nb;
105}; 133};
106 134
@@ -114,44 +142,75 @@ struct cpu_hw_events {
114#define EVENT_CONSTRAINT(c, n, m) \ 142#define EVENT_CONSTRAINT(c, n, m) \
115 __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n)) 143 __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n))
116 144
145/*
146 * Constraint on the Event code.
147 */
117#define INTEL_EVENT_CONSTRAINT(c, n) \ 148#define INTEL_EVENT_CONSTRAINT(c, n) \
118 EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVTSEL_MASK) 149 EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT)
119 150
151/*
152 * Constraint on the Event code + UMask + fixed-mask
153 *
154 * filter mask to validate fixed counter events.
155 * the following filters disqualify for fixed counters:
156 * - inv
157 * - edge
158 * - cnt-mask
159 * The other filters are supported by fixed counters.
160 * The any-thread option is supported starting with v3.
161 */
120#define FIXED_EVENT_CONSTRAINT(c, n) \ 162#define FIXED_EVENT_CONSTRAINT(c, n) \
121 EVENT_CONSTRAINT(c, (1ULL << (32+n)), INTEL_ARCH_FIXED_MASK) 163 EVENT_CONSTRAINT(c, (1ULL << (32+n)), X86_RAW_EVENT_MASK)
164
165/*
166 * Constraint on the Event code + UMask
167 */
168#define PEBS_EVENT_CONSTRAINT(c, n) \
169 EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK)
122 170
123#define EVENT_CONSTRAINT_END \ 171#define EVENT_CONSTRAINT_END \
124 EVENT_CONSTRAINT(0, 0, 0) 172 EVENT_CONSTRAINT(0, 0, 0)
125 173
126#define for_each_event_constraint(e, c) \ 174#define for_each_event_constraint(e, c) \
127 for ((e) = (c); (e)->cmask; (e)++) 175 for ((e) = (c); (e)->weight; (e)++)
176
177union perf_capabilities {
178 struct {
179 u64 lbr_format : 6;
180 u64 pebs_trap : 1;
181 u64 pebs_arch_reg : 1;
182 u64 pebs_format : 4;
183 u64 smm_freeze : 1;
184 };
185 u64 capabilities;
186};
128 187
129/* 188/*
130 * struct x86_pmu - generic x86 pmu 189 * struct x86_pmu - generic x86 pmu
131 */ 190 */
132struct x86_pmu { 191struct x86_pmu {
192 /*
193 * Generic x86 PMC bits
194 */
133 const char *name; 195 const char *name;
134 int version; 196 int version;
135 int (*handle_irq)(struct pt_regs *); 197 int (*handle_irq)(struct pt_regs *);
136 void (*disable_all)(void); 198 void (*disable_all)(void);
137 void (*enable_all)(void); 199 void (*enable_all)(int added);
138 void (*enable)(struct perf_event *); 200 void (*enable)(struct perf_event *);
139 void (*disable)(struct perf_event *); 201 void (*disable)(struct perf_event *);
202 int (*hw_config)(struct perf_event *event);
203 int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign);
140 unsigned eventsel; 204 unsigned eventsel;
141 unsigned perfctr; 205 unsigned perfctr;
142 u64 (*event_map)(int); 206 u64 (*event_map)(int);
143 u64 (*raw_event)(u64);
144 int max_events; 207 int max_events;
145 int num_events; 208 int num_counters;
146 int num_events_fixed; 209 int num_counters_fixed;
147 int event_bits; 210 int cntval_bits;
148 u64 event_mask; 211 u64 cntval_mask;
149 int apic; 212 int apic;
150 u64 max_period; 213 u64 max_period;
151 u64 intel_ctrl;
152 void (*enable_bts)(u64 config);
153 void (*disable_bts)(void);
154
155 struct event_constraint * 214 struct event_constraint *
156 (*get_event_constraints)(struct cpu_hw_events *cpuc, 215 (*get_event_constraints)(struct cpu_hw_events *cpuc,
157 struct perf_event *event); 216 struct perf_event *event);
@@ -159,11 +218,32 @@ struct x86_pmu {
159 void (*put_event_constraints)(struct cpu_hw_events *cpuc, 218 void (*put_event_constraints)(struct cpu_hw_events *cpuc,
160 struct perf_event *event); 219 struct perf_event *event);
161 struct event_constraint *event_constraints; 220 struct event_constraint *event_constraints;
221 void (*quirks)(void);
162 222
163 int (*cpu_prepare)(int cpu); 223 int (*cpu_prepare)(int cpu);
164 void (*cpu_starting)(int cpu); 224 void (*cpu_starting)(int cpu);
165 void (*cpu_dying)(int cpu); 225 void (*cpu_dying)(int cpu);
166 void (*cpu_dead)(int cpu); 226 void (*cpu_dead)(int cpu);
227
228 /*
229 * Intel Arch Perfmon v2+
230 */
231 u64 intel_ctrl;
232 union perf_capabilities intel_cap;
233
234 /*
235 * Intel DebugStore bits
236 */
237 int bts, pebs;
238 int pebs_record_size;
239 void (*drain_pebs)(struct pt_regs *regs);
240 struct event_constraint *pebs_constraints;
241
242 /*
243 * Intel LBR
244 */
245 unsigned long lbr_tos, lbr_from, lbr_to; /* MSR base regs */
246 int lbr_nr; /* hardware stack size */
167}; 247};
168 248
169static struct x86_pmu x86_pmu __read_mostly; 249static struct x86_pmu x86_pmu __read_mostly;
@@ -198,7 +278,7 @@ static u64
198x86_perf_event_update(struct perf_event *event) 278x86_perf_event_update(struct perf_event *event)
199{ 279{
200 struct hw_perf_event *hwc = &event->hw; 280 struct hw_perf_event *hwc = &event->hw;
201 int shift = 64 - x86_pmu.event_bits; 281 int shift = 64 - x86_pmu.cntval_bits;
202 u64 prev_raw_count, new_raw_count; 282 u64 prev_raw_count, new_raw_count;
203 int idx = hwc->idx; 283 int idx = hwc->idx;
204 s64 delta; 284 s64 delta;
@@ -241,33 +321,32 @@ again:
241static atomic_t active_events; 321static atomic_t active_events;
242static DEFINE_MUTEX(pmc_reserve_mutex); 322static DEFINE_MUTEX(pmc_reserve_mutex);
243 323
324#ifdef CONFIG_X86_LOCAL_APIC
325
244static bool reserve_pmc_hardware(void) 326static bool reserve_pmc_hardware(void)
245{ 327{
246#ifdef CONFIG_X86_LOCAL_APIC
247 int i; 328 int i;
248 329
249 if (nmi_watchdog == NMI_LOCAL_APIC) 330 if (nmi_watchdog == NMI_LOCAL_APIC)
250 disable_lapic_nmi_watchdog(); 331 disable_lapic_nmi_watchdog();
251 332
252 for (i = 0; i < x86_pmu.num_events; i++) { 333 for (i = 0; i < x86_pmu.num_counters; i++) {
253 if (!reserve_perfctr_nmi(x86_pmu.perfctr + i)) 334 if (!reserve_perfctr_nmi(x86_pmu.perfctr + i))
254 goto perfctr_fail; 335 goto perfctr_fail;
255 } 336 }
256 337
257 for (i = 0; i < x86_pmu.num_events; i++) { 338 for (i = 0; i < x86_pmu.num_counters; i++) {
258 if (!reserve_evntsel_nmi(x86_pmu.eventsel + i)) 339 if (!reserve_evntsel_nmi(x86_pmu.eventsel + i))
259 goto eventsel_fail; 340 goto eventsel_fail;
260 } 341 }
261#endif
262 342
263 return true; 343 return true;
264 344
265#ifdef CONFIG_X86_LOCAL_APIC
266eventsel_fail: 345eventsel_fail:
267 for (i--; i >= 0; i--) 346 for (i--; i >= 0; i--)
268 release_evntsel_nmi(x86_pmu.eventsel + i); 347 release_evntsel_nmi(x86_pmu.eventsel + i);
269 348
270 i = x86_pmu.num_events; 349 i = x86_pmu.num_counters;
271 350
272perfctr_fail: 351perfctr_fail:
273 for (i--; i >= 0; i--) 352 for (i--; i >= 0; i--)
@@ -277,128 +356,36 @@ perfctr_fail:
277 enable_lapic_nmi_watchdog(); 356 enable_lapic_nmi_watchdog();
278 357
279 return false; 358 return false;
280#endif
281} 359}
282 360
283static void release_pmc_hardware(void) 361static void release_pmc_hardware(void)
284{ 362{
285#ifdef CONFIG_X86_LOCAL_APIC
286 int i; 363 int i;
287 364
288 for (i = 0; i < x86_pmu.num_events; i++) { 365 for (i = 0; i < x86_pmu.num_counters; i++) {
289 release_perfctr_nmi(x86_pmu.perfctr + i); 366 release_perfctr_nmi(x86_pmu.perfctr + i);
290 release_evntsel_nmi(x86_pmu.eventsel + i); 367 release_evntsel_nmi(x86_pmu.eventsel + i);
291 } 368 }
292 369
293 if (nmi_watchdog == NMI_LOCAL_APIC) 370 if (nmi_watchdog == NMI_LOCAL_APIC)
294 enable_lapic_nmi_watchdog(); 371 enable_lapic_nmi_watchdog();
295#endif
296}
297
298static inline bool bts_available(void)
299{
300 return x86_pmu.enable_bts != NULL;
301} 372}
302 373
303static void init_debug_store_on_cpu(int cpu) 374#else
304{
305 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
306
307 if (!ds)
308 return;
309
310 wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA,
311 (u32)((u64)(unsigned long)ds),
312 (u32)((u64)(unsigned long)ds >> 32));
313}
314
315static void fini_debug_store_on_cpu(int cpu)
316{
317 if (!per_cpu(cpu_hw_events, cpu).ds)
318 return;
319
320 wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
321}
322
323static void release_bts_hardware(void)
324{
325 int cpu;
326
327 if (!bts_available())
328 return;
329
330 get_online_cpus();
331
332 for_each_online_cpu(cpu)
333 fini_debug_store_on_cpu(cpu);
334
335 for_each_possible_cpu(cpu) {
336 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
337
338 if (!ds)
339 continue;
340
341 per_cpu(cpu_hw_events, cpu).ds = NULL;
342
343 kfree((void *)(unsigned long)ds->bts_buffer_base);
344 kfree(ds);
345 }
346
347 put_online_cpus();
348}
349
350static int reserve_bts_hardware(void)
351{
352 int cpu, err = 0;
353
354 if (!bts_available())
355 return 0;
356
357 get_online_cpus();
358
359 for_each_possible_cpu(cpu) {
360 struct debug_store *ds;
361 void *buffer;
362
363 err = -ENOMEM;
364 buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL);
365 if (unlikely(!buffer))
366 break;
367
368 ds = kzalloc(sizeof(*ds), GFP_KERNEL);
369 if (unlikely(!ds)) {
370 kfree(buffer);
371 break;
372 }
373
374 ds->bts_buffer_base = (u64)(unsigned long)buffer;
375 ds->bts_index = ds->bts_buffer_base;
376 ds->bts_absolute_maximum =
377 ds->bts_buffer_base + BTS_BUFFER_SIZE;
378 ds->bts_interrupt_threshold =
379 ds->bts_absolute_maximum - BTS_OVFL_TH;
380
381 per_cpu(cpu_hw_events, cpu).ds = ds;
382 err = 0;
383 }
384 375
385 if (err) 376static bool reserve_pmc_hardware(void) { return true; }
386 release_bts_hardware(); 377static void release_pmc_hardware(void) {}
387 else {
388 for_each_online_cpu(cpu)
389 init_debug_store_on_cpu(cpu);
390 }
391 378
392 put_online_cpus(); 379#endif
393 380
394 return err; 381static int reserve_ds_buffers(void);
395} 382static void release_ds_buffers(void);
396 383
397static void hw_perf_event_destroy(struct perf_event *event) 384static void hw_perf_event_destroy(struct perf_event *event)
398{ 385{
399 if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) { 386 if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) {
400 release_pmc_hardware(); 387 release_pmc_hardware();
401 release_bts_hardware(); 388 release_ds_buffers();
402 mutex_unlock(&pmc_reserve_mutex); 389 mutex_unlock(&pmc_reserve_mutex);
403 } 390 }
404} 391}
@@ -441,54 +428,11 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr)
441 return 0; 428 return 0;
442} 429}
443 430
444/* 431static int x86_setup_perfctr(struct perf_event *event)
445 * Setup the hardware configuration for a given attr_type
446 */
447static int __hw_perf_event_init(struct perf_event *event)
448{ 432{
449 struct perf_event_attr *attr = &event->attr; 433 struct perf_event_attr *attr = &event->attr;
450 struct hw_perf_event *hwc = &event->hw; 434 struct hw_perf_event *hwc = &event->hw;
451 u64 config; 435 u64 config;
452 int err;
453
454 if (!x86_pmu_initialized())
455 return -ENODEV;
456
457 err = 0;
458 if (!atomic_inc_not_zero(&active_events)) {
459 mutex_lock(&pmc_reserve_mutex);
460 if (atomic_read(&active_events) == 0) {
461 if (!reserve_pmc_hardware())
462 err = -EBUSY;
463 else
464 err = reserve_bts_hardware();
465 }
466 if (!err)
467 atomic_inc(&active_events);
468 mutex_unlock(&pmc_reserve_mutex);
469 }
470 if (err)
471 return err;
472
473 event->destroy = hw_perf_event_destroy;
474
475 /*
476 * Generate PMC IRQs:
477 * (keep 'enabled' bit clear for now)
478 */
479 hwc->config = ARCH_PERFMON_EVENTSEL_INT;
480
481 hwc->idx = -1;
482 hwc->last_cpu = -1;
483 hwc->last_tag = ~0ULL;
484
485 /*
486 * Count user and OS events unless requested not to.
487 */
488 if (!attr->exclude_user)
489 hwc->config |= ARCH_PERFMON_EVENTSEL_USR;
490 if (!attr->exclude_kernel)
491 hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
492 436
493 if (!hwc->sample_period) { 437 if (!hwc->sample_period) {
494 hwc->sample_period = x86_pmu.max_period; 438 hwc->sample_period = x86_pmu.max_period;
@@ -505,16 +449,8 @@ static int __hw_perf_event_init(struct perf_event *event)
505 return -EOPNOTSUPP; 449 return -EOPNOTSUPP;
506 } 450 }
507 451
508 /* 452 if (attr->type == PERF_TYPE_RAW)
509 * Raw hw_event type provide the config in the hw_event structure
510 */
511 if (attr->type == PERF_TYPE_RAW) {
512 hwc->config |= x86_pmu.raw_event(attr->config);
513 if ((hwc->config & ARCH_PERFMON_EVENTSEL_ANY) &&
514 perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
515 return -EACCES;
516 return 0; 453 return 0;
517 }
518 454
519 if (attr->type == PERF_TYPE_HW_CACHE) 455 if (attr->type == PERF_TYPE_HW_CACHE)
520 return set_ext_hw_attr(hwc, attr); 456 return set_ext_hw_attr(hwc, attr);
@@ -539,11 +475,11 @@ static int __hw_perf_event_init(struct perf_event *event)
539 if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) && 475 if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) &&
540 (hwc->sample_period == 1)) { 476 (hwc->sample_period == 1)) {
541 /* BTS is not supported by this architecture. */ 477 /* BTS is not supported by this architecture. */
542 if (!bts_available()) 478 if (!x86_pmu.bts)
543 return -EOPNOTSUPP; 479 return -EOPNOTSUPP;
544 480
545 /* BTS is currently only allowed for user-mode. */ 481 /* BTS is currently only allowed for user-mode. */
546 if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) 482 if (!attr->exclude_kernel)
547 return -EOPNOTSUPP; 483 return -EOPNOTSUPP;
548 } 484 }
549 485
@@ -552,12 +488,87 @@ static int __hw_perf_event_init(struct perf_event *event)
552 return 0; 488 return 0;
553} 489}
554 490
491static int x86_pmu_hw_config(struct perf_event *event)
492{
493 if (event->attr.precise_ip) {
494 int precise = 0;
495
496 /* Support for constant skid */
497 if (x86_pmu.pebs)
498 precise++;
499
500 /* Support for IP fixup */
501 if (x86_pmu.lbr_nr)
502 precise++;
503
504 if (event->attr.precise_ip > precise)
505 return -EOPNOTSUPP;
506 }
507
508 /*
509 * Generate PMC IRQs:
510 * (keep 'enabled' bit clear for now)
511 */
512 event->hw.config = ARCH_PERFMON_EVENTSEL_INT;
513
514 /*
515 * Count user and OS events unless requested not to
516 */
517 if (!event->attr.exclude_user)
518 event->hw.config |= ARCH_PERFMON_EVENTSEL_USR;
519 if (!event->attr.exclude_kernel)
520 event->hw.config |= ARCH_PERFMON_EVENTSEL_OS;
521
522 if (event->attr.type == PERF_TYPE_RAW)
523 event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;
524
525 return x86_setup_perfctr(event);
526}
527
528/*
529 * Setup the hardware configuration for a given attr_type
530 */
531static int __hw_perf_event_init(struct perf_event *event)
532{
533 int err;
534
535 if (!x86_pmu_initialized())
536 return -ENODEV;
537
538 err = 0;
539 if (!atomic_inc_not_zero(&active_events)) {
540 mutex_lock(&pmc_reserve_mutex);
541 if (atomic_read(&active_events) == 0) {
542 if (!reserve_pmc_hardware())
543 err = -EBUSY;
544 else {
545 err = reserve_ds_buffers();
546 if (err)
547 release_pmc_hardware();
548 }
549 }
550 if (!err)
551 atomic_inc(&active_events);
552 mutex_unlock(&pmc_reserve_mutex);
553 }
554 if (err)
555 return err;
556
557 event->destroy = hw_perf_event_destroy;
558
559 event->hw.idx = -1;
560 event->hw.last_cpu = -1;
561 event->hw.last_tag = ~0ULL;
562
563 return x86_pmu.hw_config(event);
564}
565
555static void x86_pmu_disable_all(void) 566static void x86_pmu_disable_all(void)
556{ 567{
557 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 568 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
558 int idx; 569 int idx;
559 570
560 for (idx = 0; idx < x86_pmu.num_events; idx++) { 571 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
561 u64 val; 572 u64 val;
562 573
563 if (!test_bit(idx, cpuc->active_mask)) 574 if (!test_bit(idx, cpuc->active_mask))
@@ -587,12 +598,12 @@ void hw_perf_disable(void)
587 x86_pmu.disable_all(); 598 x86_pmu.disable_all();
588} 599}
589 600
590static void x86_pmu_enable_all(void) 601static void x86_pmu_enable_all(int added)
591{ 602{
592 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 603 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
593 int idx; 604 int idx;
594 605
595 for (idx = 0; idx < x86_pmu.num_events; idx++) { 606 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
596 struct perf_event *event = cpuc->events[idx]; 607 struct perf_event *event = cpuc->events[idx];
597 u64 val; 608 u64 val;
598 609
@@ -667,14 +678,14 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
667 * assign events to counters starting with most 678 * assign events to counters starting with most
668 * constrained events. 679 * constrained events.
669 */ 680 */
670 wmax = x86_pmu.num_events; 681 wmax = x86_pmu.num_counters;
671 682
672 /* 683 /*
673 * when fixed event counters are present, 684 * when fixed event counters are present,
674 * wmax is incremented by 1 to account 685 * wmax is incremented by 1 to account
675 * for one more choice 686 * for one more choice
676 */ 687 */
677 if (x86_pmu.num_events_fixed) 688 if (x86_pmu.num_counters_fixed)
678 wmax++; 689 wmax++;
679 690
680 for (w = 1, num = n; num && w <= wmax; w++) { 691 for (w = 1, num = n; num && w <= wmax; w++) {
@@ -724,7 +735,7 @@ static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader,
724 struct perf_event *event; 735 struct perf_event *event;
725 int n, max_count; 736 int n, max_count;
726 737
727 max_count = x86_pmu.num_events + x86_pmu.num_events_fixed; 738 max_count = x86_pmu.num_counters + x86_pmu.num_counters_fixed;
728 739
729 /* current number of events already accepted */ 740 /* current number of events already accepted */
730 n = cpuc->n_events; 741 n = cpuc->n_events;
@@ -795,7 +806,7 @@ void hw_perf_enable(void)
795 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 806 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
796 struct perf_event *event; 807 struct perf_event *event;
797 struct hw_perf_event *hwc; 808 struct hw_perf_event *hwc;
798 int i; 809 int i, added = cpuc->n_added;
799 810
800 if (!x86_pmu_initialized()) 811 if (!x86_pmu_initialized())
801 return; 812 return;
@@ -847,19 +858,20 @@ void hw_perf_enable(void)
847 cpuc->enabled = 1; 858 cpuc->enabled = 1;
848 barrier(); 859 barrier();
849 860
850 x86_pmu.enable_all(); 861 x86_pmu.enable_all(added);
851} 862}
852 863
853static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc) 864static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
865 u64 enable_mask)
854{ 866{
855 (void)checking_wrmsrl(hwc->config_base + hwc->idx, 867 wrmsrl(hwc->config_base + hwc->idx, hwc->config | enable_mask);
856 hwc->config | ARCH_PERFMON_EVENTSEL_ENABLE);
857} 868}
858 869
859static inline void x86_pmu_disable_event(struct perf_event *event) 870static inline void x86_pmu_disable_event(struct perf_event *event)
860{ 871{
861 struct hw_perf_event *hwc = &event->hw; 872 struct hw_perf_event *hwc = &event->hw;
862 (void)checking_wrmsrl(hwc->config_base + hwc->idx, hwc->config); 873
874 wrmsrl(hwc->config_base + hwc->idx, hwc->config);
863} 875}
864 876
865static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); 877static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
@@ -874,7 +886,7 @@ x86_perf_event_set_period(struct perf_event *event)
874 struct hw_perf_event *hwc = &event->hw; 886 struct hw_perf_event *hwc = &event->hw;
875 s64 left = atomic64_read(&hwc->period_left); 887 s64 left = atomic64_read(&hwc->period_left);
876 s64 period = hwc->sample_period; 888 s64 period = hwc->sample_period;
877 int err, ret = 0, idx = hwc->idx; 889 int ret = 0, idx = hwc->idx;
878 890
879 if (idx == X86_PMC_IDX_FIXED_BTS) 891 if (idx == X86_PMC_IDX_FIXED_BTS)
880 return 0; 892 return 0;
@@ -912,8 +924,8 @@ x86_perf_event_set_period(struct perf_event *event)
912 */ 924 */
913 atomic64_set(&hwc->prev_count, (u64)-left); 925 atomic64_set(&hwc->prev_count, (u64)-left);
914 926
915 err = checking_wrmsrl(hwc->event_base + idx, 927 wrmsrl(hwc->event_base + idx,
916 (u64)(-left) & x86_pmu.event_mask); 928 (u64)(-left) & x86_pmu.cntval_mask);
917 929
918 perf_event_update_userpage(event); 930 perf_event_update_userpage(event);
919 931
@@ -924,7 +936,8 @@ static void x86_pmu_enable_event(struct perf_event *event)
924{ 936{
925 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 937 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
926 if (cpuc->enabled) 938 if (cpuc->enabled)
927 __x86_pmu_enable_event(&event->hw); 939 __x86_pmu_enable_event(&event->hw,
940 ARCH_PERFMON_EVENTSEL_ENABLE);
928} 941}
929 942
930/* 943/*
@@ -950,7 +963,15 @@ static int x86_pmu_enable(struct perf_event *event)
950 if (n < 0) 963 if (n < 0)
951 return n; 964 return n;
952 965
953 ret = x86_schedule_events(cpuc, n, assign); 966 /*
967 * If group events scheduling transaction was started,
968 * skip the schedulability test here, it will be peformed
969 * at commit time(->commit_txn) as a whole
970 */
971 if (cpuc->group_flag & PERF_EVENT_TXN_STARTED)
972 goto out;
973
974 ret = x86_pmu.schedule_events(cpuc, n, assign);
954 if (ret) 975 if (ret)
955 return ret; 976 return ret;
956 /* 977 /*
@@ -959,6 +980,7 @@ static int x86_pmu_enable(struct perf_event *event)
959 */ 980 */
960 memcpy(cpuc->assign, assign, n*sizeof(int)); 981 memcpy(cpuc->assign, assign, n*sizeof(int));
961 982
983out:
962 cpuc->n_events = n; 984 cpuc->n_events = n;
963 cpuc->n_added += n - n0; 985 cpuc->n_added += n - n0;
964 986
@@ -991,11 +1013,12 @@ static void x86_pmu_unthrottle(struct perf_event *event)
991void perf_event_print_debug(void) 1013void perf_event_print_debug(void)
992{ 1014{
993 u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; 1015 u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
1016 u64 pebs;
994 struct cpu_hw_events *cpuc; 1017 struct cpu_hw_events *cpuc;
995 unsigned long flags; 1018 unsigned long flags;
996 int cpu, idx; 1019 int cpu, idx;
997 1020
998 if (!x86_pmu.num_events) 1021 if (!x86_pmu.num_counters)
999 return; 1022 return;
1000 1023
1001 local_irq_save(flags); 1024 local_irq_save(flags);
@@ -1008,16 +1031,18 @@ void perf_event_print_debug(void)
1008 rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); 1031 rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
1009 rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); 1032 rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
1010 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed); 1033 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
1034 rdmsrl(MSR_IA32_PEBS_ENABLE, pebs);
1011 1035
1012 pr_info("\n"); 1036 pr_info("\n");
1013 pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl); 1037 pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl);
1014 pr_info("CPU#%d: status: %016llx\n", cpu, status); 1038 pr_info("CPU#%d: status: %016llx\n", cpu, status);
1015 pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow); 1039 pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow);
1016 pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed); 1040 pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed);
1041 pr_info("CPU#%d: pebs: %016llx\n", cpu, pebs);
1017 } 1042 }
1018 pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask); 1043 pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask);
1019 1044
1020 for (idx = 0; idx < x86_pmu.num_events; idx++) { 1045 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1021 rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); 1046 rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
1022 rdmsrl(x86_pmu.perfctr + idx, pmc_count); 1047 rdmsrl(x86_pmu.perfctr + idx, pmc_count);
1023 1048
@@ -1030,7 +1055,7 @@ void perf_event_print_debug(void)
1030 pr_info("CPU#%d: gen-PMC%d left: %016llx\n", 1055 pr_info("CPU#%d: gen-PMC%d left: %016llx\n",
1031 cpu, idx, prev_left); 1056 cpu, idx, prev_left);
1032 } 1057 }
1033 for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) { 1058 for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
1034 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count); 1059 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
1035 1060
1036 pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", 1061 pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
@@ -1095,7 +1120,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
1095 1120
1096 cpuc = &__get_cpu_var(cpu_hw_events); 1121 cpuc = &__get_cpu_var(cpu_hw_events);
1097 1122
1098 for (idx = 0; idx < x86_pmu.num_events; idx++) { 1123 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1099 if (!test_bit(idx, cpuc->active_mask)) 1124 if (!test_bit(idx, cpuc->active_mask))
1100 continue; 1125 continue;
1101 1126
@@ -1103,7 +1128,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
1103 hwc = &event->hw; 1128 hwc = &event->hw;
1104 1129
1105 val = x86_perf_event_update(event); 1130 val = x86_perf_event_update(event);
1106 if (val & (1ULL << (x86_pmu.event_bits - 1))) 1131 if (val & (1ULL << (x86_pmu.cntval_bits - 1)))
1107 continue; 1132 continue;
1108 1133
1109 /* 1134 /*
@@ -1146,7 +1171,6 @@ void set_perf_event_pending(void)
1146 1171
1147void perf_events_lapic_init(void) 1172void perf_events_lapic_init(void)
1148{ 1173{
1149#ifdef CONFIG_X86_LOCAL_APIC
1150 if (!x86_pmu.apic || !x86_pmu_initialized()) 1174 if (!x86_pmu.apic || !x86_pmu_initialized())
1151 return; 1175 return;
1152 1176
@@ -1154,7 +1178,6 @@ void perf_events_lapic_init(void)
1154 * Always use NMI for PMU 1178 * Always use NMI for PMU
1155 */ 1179 */
1156 apic_write(APIC_LVTPC, APIC_DM_NMI); 1180 apic_write(APIC_LVTPC, APIC_DM_NMI);
1157#endif
1158} 1181}
1159 1182
1160static int __kprobes 1183static int __kprobes
@@ -1178,9 +1201,7 @@ perf_event_nmi_handler(struct notifier_block *self,
1178 1201
1179 regs = args->regs; 1202 regs = args->regs;
1180 1203
1181#ifdef CONFIG_X86_LOCAL_APIC
1182 apic_write(APIC_LVTPC, APIC_DM_NMI); 1204 apic_write(APIC_LVTPC, APIC_DM_NMI);
1183#endif
1184 /* 1205 /*
1185 * Can't rely on the handled return value to say it was our NMI, two 1206 * Can't rely on the handled return value to say it was our NMI, two
1186 * events could trigger 'simultaneously' raising two back-to-back NMIs. 1207 * events could trigger 'simultaneously' raising two back-to-back NMIs.
@@ -1217,118 +1238,11 @@ x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
1217 return &unconstrained; 1238 return &unconstrained;
1218} 1239}
1219 1240
1220static int x86_event_sched_in(struct perf_event *event,
1221 struct perf_cpu_context *cpuctx)
1222{
1223 int ret = 0;
1224
1225 event->state = PERF_EVENT_STATE_ACTIVE;
1226 event->oncpu = smp_processor_id();
1227 event->tstamp_running += event->ctx->time - event->tstamp_stopped;
1228
1229 if (!is_x86_event(event))
1230 ret = event->pmu->enable(event);
1231
1232 if (!ret && !is_software_event(event))
1233 cpuctx->active_oncpu++;
1234
1235 if (!ret && event->attr.exclusive)
1236 cpuctx->exclusive = 1;
1237
1238 return ret;
1239}
1240
1241static void x86_event_sched_out(struct perf_event *event,
1242 struct perf_cpu_context *cpuctx)
1243{
1244 event->state = PERF_EVENT_STATE_INACTIVE;
1245 event->oncpu = -1;
1246
1247 if (!is_x86_event(event))
1248 event->pmu->disable(event);
1249
1250 event->tstamp_running -= event->ctx->time - event->tstamp_stopped;
1251
1252 if (!is_software_event(event))
1253 cpuctx->active_oncpu--;
1254
1255 if (event->attr.exclusive || !cpuctx->active_oncpu)
1256 cpuctx->exclusive = 0;
1257}
1258
1259/*
1260 * Called to enable a whole group of events.
1261 * Returns 1 if the group was enabled, or -EAGAIN if it could not be.
1262 * Assumes the caller has disabled interrupts and has
1263 * frozen the PMU with hw_perf_save_disable.
1264 *
1265 * called with PMU disabled. If successful and return value 1,
1266 * then guaranteed to call perf_enable() and hw_perf_enable()
1267 */
1268int hw_perf_group_sched_in(struct perf_event *leader,
1269 struct perf_cpu_context *cpuctx,
1270 struct perf_event_context *ctx)
1271{
1272 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1273 struct perf_event *sub;
1274 int assign[X86_PMC_IDX_MAX];
1275 int n0, n1, ret;
1276
1277 /* n0 = total number of events */
1278 n0 = collect_events(cpuc, leader, true);
1279 if (n0 < 0)
1280 return n0;
1281
1282 ret = x86_schedule_events(cpuc, n0, assign);
1283 if (ret)
1284 return ret;
1285
1286 ret = x86_event_sched_in(leader, cpuctx);
1287 if (ret)
1288 return ret;
1289
1290 n1 = 1;
1291 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
1292 if (sub->state > PERF_EVENT_STATE_OFF) {
1293 ret = x86_event_sched_in(sub, cpuctx);
1294 if (ret)
1295 goto undo;
1296 ++n1;
1297 }
1298 }
1299 /*
1300 * copy new assignment, now we know it is possible
1301 * will be used by hw_perf_enable()
1302 */
1303 memcpy(cpuc->assign, assign, n0*sizeof(int));
1304
1305 cpuc->n_events = n0;
1306 cpuc->n_added += n1;
1307 ctx->nr_active += n1;
1308
1309 /*
1310 * 1 means successful and events are active
1311 * This is not quite true because we defer
1312 * actual activation until hw_perf_enable() but
1313 * this way we* ensure caller won't try to enable
1314 * individual events
1315 */
1316 return 1;
1317undo:
1318 x86_event_sched_out(leader, cpuctx);
1319 n0 = 1;
1320 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
1321 if (sub->state == PERF_EVENT_STATE_ACTIVE) {
1322 x86_event_sched_out(sub, cpuctx);
1323 if (++n0 == n1)
1324 break;
1325 }
1326 }
1327 return ret;
1328}
1329
1330#include "perf_event_amd.c" 1241#include "perf_event_amd.c"
1331#include "perf_event_p6.c" 1242#include "perf_event_p6.c"
1243#include "perf_event_p4.c"
1244#include "perf_event_intel_lbr.c"
1245#include "perf_event_intel_ds.c"
1332#include "perf_event_intel.c" 1246#include "perf_event_intel.c"
1333 1247
1334static int __cpuinit 1248static int __cpuinit
@@ -1402,48 +1316,50 @@ void __init init_hw_perf_events(void)
1402 1316
1403 pr_cont("%s PMU driver.\n", x86_pmu.name); 1317 pr_cont("%s PMU driver.\n", x86_pmu.name);
1404 1318
1405 if (x86_pmu.num_events > X86_PMC_MAX_GENERIC) { 1319 if (x86_pmu.quirks)
1320 x86_pmu.quirks();
1321
1322 if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {
1406 WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", 1323 WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
1407 x86_pmu.num_events, X86_PMC_MAX_GENERIC); 1324 x86_pmu.num_counters, X86_PMC_MAX_GENERIC);
1408 x86_pmu.num_events = X86_PMC_MAX_GENERIC; 1325 x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
1409 } 1326 }
1410 perf_event_mask = (1 << x86_pmu.num_events) - 1; 1327 x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
1411 perf_max_events = x86_pmu.num_events; 1328 perf_max_events = x86_pmu.num_counters;
1412 1329
1413 if (x86_pmu.num_events_fixed > X86_PMC_MAX_FIXED) { 1330 if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
1414 WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!", 1331 WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
1415 x86_pmu.num_events_fixed, X86_PMC_MAX_FIXED); 1332 x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED);
1416 x86_pmu.num_events_fixed = X86_PMC_MAX_FIXED; 1333 x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED;
1417 } 1334 }
1418 1335
1419 perf_event_mask |= 1336 x86_pmu.intel_ctrl |=
1420 ((1LL << x86_pmu.num_events_fixed)-1) << X86_PMC_IDX_FIXED; 1337 ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED;
1421 x86_pmu.intel_ctrl = perf_event_mask;
1422 1338
1423 perf_events_lapic_init(); 1339 perf_events_lapic_init();
1424 register_die_notifier(&perf_event_nmi_notifier); 1340 register_die_notifier(&perf_event_nmi_notifier);
1425 1341
1426 unconstrained = (struct event_constraint) 1342 unconstrained = (struct event_constraint)
1427 __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_events) - 1, 1343 __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1,
1428 0, x86_pmu.num_events); 1344 0, x86_pmu.num_counters);
1429 1345
1430 if (x86_pmu.event_constraints) { 1346 if (x86_pmu.event_constraints) {
1431 for_each_event_constraint(c, x86_pmu.event_constraints) { 1347 for_each_event_constraint(c, x86_pmu.event_constraints) {
1432 if (c->cmask != INTEL_ARCH_FIXED_MASK) 1348 if (c->cmask != X86_RAW_EVENT_MASK)
1433 continue; 1349 continue;
1434 1350
1435 c->idxmsk64 |= (1ULL << x86_pmu.num_events) - 1; 1351 c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
1436 c->weight += x86_pmu.num_events; 1352 c->weight += x86_pmu.num_counters;
1437 } 1353 }
1438 } 1354 }
1439 1355
1440 pr_info("... version: %d\n", x86_pmu.version); 1356 pr_info("... version: %d\n", x86_pmu.version);
1441 pr_info("... bit width: %d\n", x86_pmu.event_bits); 1357 pr_info("... bit width: %d\n", x86_pmu.cntval_bits);
1442 pr_info("... generic registers: %d\n", x86_pmu.num_events); 1358 pr_info("... generic registers: %d\n", x86_pmu.num_counters);
1443 pr_info("... value mask: %016Lx\n", x86_pmu.event_mask); 1359 pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask);
1444 pr_info("... max period: %016Lx\n", x86_pmu.max_period); 1360 pr_info("... max period: %016Lx\n", x86_pmu.max_period);
1445 pr_info("... fixed-purpose events: %d\n", x86_pmu.num_events_fixed); 1361 pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed);
1446 pr_info("... event mask: %016Lx\n", perf_event_mask); 1362 pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl);
1447 1363
1448 perf_cpu_notifier(x86_pmu_notifier); 1364 perf_cpu_notifier(x86_pmu_notifier);
1449} 1365}
@@ -1453,6 +1369,59 @@ static inline void x86_pmu_read(struct perf_event *event)
1453 x86_perf_event_update(event); 1369 x86_perf_event_update(event);
1454} 1370}
1455 1371
1372/*
1373 * Start group events scheduling transaction
1374 * Set the flag to make pmu::enable() not perform the
1375 * schedulability test, it will be performed at commit time
1376 */
1377static void x86_pmu_start_txn(const struct pmu *pmu)
1378{
1379 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1380
1381 cpuc->group_flag |= PERF_EVENT_TXN_STARTED;
1382}
1383
1384/*
1385 * Stop group events scheduling transaction
1386 * Clear the flag and pmu::enable() will perform the
1387 * schedulability test.
1388 */
1389static void x86_pmu_cancel_txn(const struct pmu *pmu)
1390{
1391 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1392
1393 cpuc->group_flag &= ~PERF_EVENT_TXN_STARTED;
1394}
1395
1396/*
1397 * Commit group events scheduling transaction
1398 * Perform the group schedulability test as a whole
1399 * Return 0 if success
1400 */
1401static int x86_pmu_commit_txn(const struct pmu *pmu)
1402{
1403 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1404 int assign[X86_PMC_IDX_MAX];
1405 int n, ret;
1406
1407 n = cpuc->n_events;
1408
1409 if (!x86_pmu_initialized())
1410 return -EAGAIN;
1411
1412 ret = x86_pmu.schedule_events(cpuc, n, assign);
1413 if (ret)
1414 return ret;
1415
1416 /*
1417 * copy new assignment, now we know it is possible
1418 * will be used by hw_perf_enable()
1419 */
1420 memcpy(cpuc->assign, assign, n*sizeof(int));
1421
1422 return 0;
1423}
1424
1456static const struct pmu pmu = { 1425static const struct pmu pmu = {
1457 .enable = x86_pmu_enable, 1426 .enable = x86_pmu_enable,
1458 .disable = x86_pmu_disable, 1427 .disable = x86_pmu_disable,
@@ -1460,9 +1429,38 @@ static const struct pmu pmu = {
1460 .stop = x86_pmu_stop, 1429 .stop = x86_pmu_stop,
1461 .read = x86_pmu_read, 1430 .read = x86_pmu_read,
1462 .unthrottle = x86_pmu_unthrottle, 1431 .unthrottle = x86_pmu_unthrottle,
1432 .start_txn = x86_pmu_start_txn,
1433 .cancel_txn = x86_pmu_cancel_txn,
1434 .commit_txn = x86_pmu_commit_txn,
1463}; 1435};
1464 1436
1465/* 1437/*
1438 * validate that we can schedule this event
1439 */
1440static int validate_event(struct perf_event *event)
1441{
1442 struct cpu_hw_events *fake_cpuc;
1443 struct event_constraint *c;
1444 int ret = 0;
1445
1446 fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO);
1447 if (!fake_cpuc)
1448 return -ENOMEM;
1449
1450 c = x86_pmu.get_event_constraints(fake_cpuc, event);
1451
1452 if (!c || !c->weight)
1453 ret = -ENOSPC;
1454
1455 if (x86_pmu.put_event_constraints)
1456 x86_pmu.put_event_constraints(fake_cpuc, event);
1457
1458 kfree(fake_cpuc);
1459
1460 return ret;
1461}
1462
1463/*
1466 * validate a single event group 1464 * validate a single event group
1467 * 1465 *
1468 * validation include: 1466 * validation include:
@@ -1502,7 +1500,7 @@ static int validate_group(struct perf_event *event)
1502 1500
1503 fake_cpuc->n_events = n; 1501 fake_cpuc->n_events = n;
1504 1502
1505 ret = x86_schedule_events(fake_cpuc, n, NULL); 1503 ret = x86_pmu.schedule_events(fake_cpuc, n, NULL);
1506 1504
1507out_free: 1505out_free:
1508 kfree(fake_cpuc); 1506 kfree(fake_cpuc);
@@ -1527,6 +1525,8 @@ const struct pmu *hw_perf_event_init(struct perf_event *event)
1527 1525
1528 if (event->group_leader != event) 1526 if (event->group_leader != event)
1529 err = validate_group(event); 1527 err = validate_group(event);
1528 else
1529 err = validate_event(event);
1530 1530
1531 event->pmu = tmp; 1531 event->pmu = tmp;
1532 } 1532 }
@@ -1574,8 +1574,7 @@ static void backtrace_address(void *data, unsigned long addr, int reliable)
1574{ 1574{
1575 struct perf_callchain_entry *entry = data; 1575 struct perf_callchain_entry *entry = data;
1576 1576
1577 if (reliable) 1577 callchain_store(entry, addr);
1578 callchain_store(entry, addr);
1579} 1578}
1580 1579
1581static const struct stacktrace_ops backtrace_ops = { 1580static const struct stacktrace_ops backtrace_ops = {
@@ -1597,41 +1596,6 @@ perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
1597 dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry); 1596 dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry);
1598} 1597}
1599 1598
1600/*
1601 * best effort, GUP based copy_from_user() that assumes IRQ or NMI context
1602 */
1603static unsigned long
1604copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
1605{
1606 unsigned long offset, addr = (unsigned long)from;
1607 int type = in_nmi() ? KM_NMI : KM_IRQ0;
1608 unsigned long size, len = 0;
1609 struct page *page;
1610 void *map;
1611 int ret;
1612
1613 do {
1614 ret = __get_user_pages_fast(addr, 1, 0, &page);
1615 if (!ret)
1616 break;
1617
1618 offset = addr & (PAGE_SIZE - 1);
1619 size = min(PAGE_SIZE - offset, n - len);
1620
1621 map = kmap_atomic(page, type);
1622 memcpy(to, map+offset, size);
1623 kunmap_atomic(map, type);
1624 put_page(page);
1625
1626 len += size;
1627 to += size;
1628 addr += size;
1629
1630 } while (len < n);
1631
1632 return len;
1633}
1634
1635#ifdef CONFIG_COMPAT 1599#ifdef CONFIG_COMPAT
1636static inline int 1600static inline int
1637perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) 1601perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
@@ -1727,6 +1691,11 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
1727{ 1691{
1728 struct perf_callchain_entry *entry; 1692 struct perf_callchain_entry *entry;
1729 1693
1694 if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
1695 /* TODO: We don't support guest os callchain now */
1696 return NULL;
1697 }
1698
1730 if (in_nmi()) 1699 if (in_nmi())
1731 entry = &__get_cpu_var(pmc_nmi_entry); 1700 entry = &__get_cpu_var(pmc_nmi_entry);
1732 else 1701 else
@@ -1750,3 +1719,37 @@ void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int ski
1750 regs->cs = __KERNEL_CS; 1719 regs->cs = __KERNEL_CS;
1751 local_save_flags(regs->flags); 1720 local_save_flags(regs->flags);
1752} 1721}
1722
1723unsigned long perf_instruction_pointer(struct pt_regs *regs)
1724{
1725 unsigned long ip;
1726
1727 if (perf_guest_cbs && perf_guest_cbs->is_in_guest())
1728 ip = perf_guest_cbs->get_guest_ip();
1729 else
1730 ip = instruction_pointer(regs);
1731
1732 return ip;
1733}
1734
1735unsigned long perf_misc_flags(struct pt_regs *regs)
1736{
1737 int misc = 0;
1738
1739 if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
1740 if (perf_guest_cbs->is_user_mode())
1741 misc |= PERF_RECORD_MISC_GUEST_USER;
1742 else
1743 misc |= PERF_RECORD_MISC_GUEST_KERNEL;
1744 } else {
1745 if (user_mode(regs))
1746 misc |= PERF_RECORD_MISC_USER;
1747 else
1748 misc |= PERF_RECORD_MISC_KERNEL;
1749 }
1750
1751 if (regs->flags & PERF_EFLAGS_EXACT)
1752 misc |= PERF_RECORD_MISC_EXACT_IP;
1753
1754 return misc;
1755}
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index db6f7d4056e1..611df11ba15e 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -2,7 +2,7 @@
2 2
3static DEFINE_RAW_SPINLOCK(amd_nb_lock); 3static DEFINE_RAW_SPINLOCK(amd_nb_lock);
4 4
5static __initconst u64 amd_hw_cache_event_ids 5static __initconst const u64 amd_hw_cache_event_ids
6 [PERF_COUNT_HW_CACHE_MAX] 6 [PERF_COUNT_HW_CACHE_MAX]
7 [PERF_COUNT_HW_CACHE_OP_MAX] 7 [PERF_COUNT_HW_CACHE_OP_MAX]
8 [PERF_COUNT_HW_CACHE_RESULT_MAX] = 8 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -111,22 +111,19 @@ static u64 amd_pmu_event_map(int hw_event)
111 return amd_perfmon_event_map[hw_event]; 111 return amd_perfmon_event_map[hw_event];
112} 112}
113 113
114static u64 amd_pmu_raw_event(u64 hw_event) 114static int amd_pmu_hw_config(struct perf_event *event)
115{ 115{
116#define K7_EVNTSEL_EVENT_MASK 0xF000000FFULL 116 int ret = x86_pmu_hw_config(event);
117#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL 117
118#define K7_EVNTSEL_EDGE_MASK 0x000040000ULL 118 if (ret)
119#define K7_EVNTSEL_INV_MASK 0x000800000ULL 119 return ret;
120#define K7_EVNTSEL_REG_MASK 0x0FF000000ULL 120
121 121 if (event->attr.type != PERF_TYPE_RAW)
122#define K7_EVNTSEL_MASK \ 122 return 0;
123 (K7_EVNTSEL_EVENT_MASK | \ 123
124 K7_EVNTSEL_UNIT_MASK | \ 124 event->hw.config |= event->attr.config & AMD64_RAW_EVENT_MASK;
125 K7_EVNTSEL_EDGE_MASK | \ 125
126 K7_EVNTSEL_INV_MASK | \ 126 return 0;
127 K7_EVNTSEL_REG_MASK)
128
129 return hw_event & K7_EVNTSEL_MASK;
130} 127}
131 128
132/* 129/*
@@ -165,7 +162,7 @@ static void amd_put_event_constraints(struct cpu_hw_events *cpuc,
165 * be removed on one CPU at a time AND PMU is disabled 162 * be removed on one CPU at a time AND PMU is disabled
166 * when we come here 163 * when we come here
167 */ 164 */
168 for (i = 0; i < x86_pmu.num_events; i++) { 165 for (i = 0; i < x86_pmu.num_counters; i++) {
169 if (nb->owners[i] == event) { 166 if (nb->owners[i] == event) {
170 cmpxchg(nb->owners+i, event, NULL); 167 cmpxchg(nb->owners+i, event, NULL);
171 break; 168 break;
@@ -215,7 +212,7 @@ amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
215 struct hw_perf_event *hwc = &event->hw; 212 struct hw_perf_event *hwc = &event->hw;
216 struct amd_nb *nb = cpuc->amd_nb; 213 struct amd_nb *nb = cpuc->amd_nb;
217 struct perf_event *old = NULL; 214 struct perf_event *old = NULL;
218 int max = x86_pmu.num_events; 215 int max = x86_pmu.num_counters;
219 int i, j, k = -1; 216 int i, j, k = -1;
220 217
221 /* 218 /*
@@ -293,7 +290,7 @@ static struct amd_nb *amd_alloc_nb(int cpu, int nb_id)
293 /* 290 /*
294 * initialize all possible NB constraints 291 * initialize all possible NB constraints
295 */ 292 */
296 for (i = 0; i < x86_pmu.num_events; i++) { 293 for (i = 0; i < x86_pmu.num_counters; i++) {
297 __set_bit(i, nb->event_constraints[i].idxmsk); 294 __set_bit(i, nb->event_constraints[i].idxmsk);
298 nb->event_constraints[i].weight = 1; 295 nb->event_constraints[i].weight = 1;
299 } 296 }
@@ -371,21 +368,22 @@ static void amd_pmu_cpu_dead(int cpu)
371 raw_spin_unlock(&amd_nb_lock); 368 raw_spin_unlock(&amd_nb_lock);
372} 369}
373 370
374static __initconst struct x86_pmu amd_pmu = { 371static __initconst const struct x86_pmu amd_pmu = {
375 .name = "AMD", 372 .name = "AMD",
376 .handle_irq = x86_pmu_handle_irq, 373 .handle_irq = x86_pmu_handle_irq,
377 .disable_all = x86_pmu_disable_all, 374 .disable_all = x86_pmu_disable_all,
378 .enable_all = x86_pmu_enable_all, 375 .enable_all = x86_pmu_enable_all,
379 .enable = x86_pmu_enable_event, 376 .enable = x86_pmu_enable_event,
380 .disable = x86_pmu_disable_event, 377 .disable = x86_pmu_disable_event,
378 .hw_config = amd_pmu_hw_config,
379 .schedule_events = x86_schedule_events,
381 .eventsel = MSR_K7_EVNTSEL0, 380 .eventsel = MSR_K7_EVNTSEL0,
382 .perfctr = MSR_K7_PERFCTR0, 381 .perfctr = MSR_K7_PERFCTR0,
383 .event_map = amd_pmu_event_map, 382 .event_map = amd_pmu_event_map,
384 .raw_event = amd_pmu_raw_event,
385 .max_events = ARRAY_SIZE(amd_perfmon_event_map), 383 .max_events = ARRAY_SIZE(amd_perfmon_event_map),
386 .num_events = 4, 384 .num_counters = 4,
387 .event_bits = 48, 385 .cntval_bits = 48,
388 .event_mask = (1ULL << 48) - 1, 386 .cntval_mask = (1ULL << 48) - 1,
389 .apic = 1, 387 .apic = 1,
390 /* use highest bit to detect overflow */ 388 /* use highest bit to detect overflow */
391 .max_period = (1ULL << 47) - 1, 389 .max_period = (1ULL << 47) - 1,
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 9c794ac87837..fdbc652d3feb 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -88,7 +88,7 @@ static u64 intel_pmu_event_map(int hw_event)
88 return intel_perfmon_event_map[hw_event]; 88 return intel_perfmon_event_map[hw_event];
89} 89}
90 90
91static __initconst u64 westmere_hw_cache_event_ids 91static __initconst const u64 westmere_hw_cache_event_ids
92 [PERF_COUNT_HW_CACHE_MAX] 92 [PERF_COUNT_HW_CACHE_MAX]
93 [PERF_COUNT_HW_CACHE_OP_MAX] 93 [PERF_COUNT_HW_CACHE_OP_MAX]
94 [PERF_COUNT_HW_CACHE_RESULT_MAX] = 94 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -179,7 +179,7 @@ static __initconst u64 westmere_hw_cache_event_ids
179 }, 179 },
180}; 180};
181 181
182static __initconst u64 nehalem_hw_cache_event_ids 182static __initconst const u64 nehalem_hw_cache_event_ids
183 [PERF_COUNT_HW_CACHE_MAX] 183 [PERF_COUNT_HW_CACHE_MAX]
184 [PERF_COUNT_HW_CACHE_OP_MAX] 184 [PERF_COUNT_HW_CACHE_OP_MAX]
185 [PERF_COUNT_HW_CACHE_RESULT_MAX] = 185 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -270,7 +270,7 @@ static __initconst u64 nehalem_hw_cache_event_ids
270 }, 270 },
271}; 271};
272 272
273static __initconst u64 core2_hw_cache_event_ids 273static __initconst const u64 core2_hw_cache_event_ids
274 [PERF_COUNT_HW_CACHE_MAX] 274 [PERF_COUNT_HW_CACHE_MAX]
275 [PERF_COUNT_HW_CACHE_OP_MAX] 275 [PERF_COUNT_HW_CACHE_OP_MAX]
276 [PERF_COUNT_HW_CACHE_RESULT_MAX] = 276 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -361,7 +361,7 @@ static __initconst u64 core2_hw_cache_event_ids
361 }, 361 },
362}; 362};
363 363
364static __initconst u64 atom_hw_cache_event_ids 364static __initconst const u64 atom_hw_cache_event_ids
365 [PERF_COUNT_HW_CACHE_MAX] 365 [PERF_COUNT_HW_CACHE_MAX]
366 [PERF_COUNT_HW_CACHE_OP_MAX] 366 [PERF_COUNT_HW_CACHE_OP_MAX]
367 [PERF_COUNT_HW_CACHE_RESULT_MAX] = 367 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -452,60 +452,6 @@ static __initconst u64 atom_hw_cache_event_ids
452 }, 452 },
453}; 453};
454 454
455static u64 intel_pmu_raw_event(u64 hw_event)
456{
457#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL
458#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL
459#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL
460#define CORE_EVNTSEL_INV_MASK 0x00800000ULL
461#define CORE_EVNTSEL_REG_MASK 0xFF000000ULL
462
463#define CORE_EVNTSEL_MASK \
464 (INTEL_ARCH_EVTSEL_MASK | \
465 INTEL_ARCH_UNIT_MASK | \
466 INTEL_ARCH_EDGE_MASK | \
467 INTEL_ARCH_INV_MASK | \
468 INTEL_ARCH_CNT_MASK)
469
470 return hw_event & CORE_EVNTSEL_MASK;
471}
472
473static void intel_pmu_enable_bts(u64 config)
474{
475 unsigned long debugctlmsr;
476
477 debugctlmsr = get_debugctlmsr();
478
479 debugctlmsr |= X86_DEBUGCTL_TR;
480 debugctlmsr |= X86_DEBUGCTL_BTS;
481 debugctlmsr |= X86_DEBUGCTL_BTINT;
482
483 if (!(config & ARCH_PERFMON_EVENTSEL_OS))
484 debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS;
485
486 if (!(config & ARCH_PERFMON_EVENTSEL_USR))
487 debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR;
488
489 update_debugctlmsr(debugctlmsr);
490}
491
492static void intel_pmu_disable_bts(void)
493{
494 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
495 unsigned long debugctlmsr;
496
497 if (!cpuc->ds)
498 return;
499
500 debugctlmsr = get_debugctlmsr();
501
502 debugctlmsr &=
503 ~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT |
504 X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR);
505
506 update_debugctlmsr(debugctlmsr);
507}
508
509static void intel_pmu_disable_all(void) 455static void intel_pmu_disable_all(void)
510{ 456{
511 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 457 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
@@ -514,12 +460,17 @@ static void intel_pmu_disable_all(void)
514 460
515 if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) 461 if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask))
516 intel_pmu_disable_bts(); 462 intel_pmu_disable_bts();
463
464 intel_pmu_pebs_disable_all();
465 intel_pmu_lbr_disable_all();
517} 466}
518 467
519static void intel_pmu_enable_all(void) 468static void intel_pmu_enable_all(int added)
520{ 469{
521 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 470 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
522 471
472 intel_pmu_pebs_enable_all();
473 intel_pmu_lbr_enable_all();
523 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); 474 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
524 475
525 if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { 476 if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
@@ -533,6 +484,42 @@ static void intel_pmu_enable_all(void)
533 } 484 }
534} 485}
535 486
487/*
488 * Workaround for:
489 * Intel Errata AAK100 (model 26)
490 * Intel Errata AAP53 (model 30)
491 * Intel Errata BD53 (model 44)
492 *
493 * These chips need to be 'reset' when adding counters by programming
494 * the magic three (non counting) events 0x4300D2, 0x4300B1 and 0x4300B5
495 * either in sequence on the same PMC or on different PMCs.
496 */
497static void intel_pmu_nhm_enable_all(int added)
498{
499 if (added) {
500 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
501 int i;
502
503 wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + 0, 0x4300D2);
504 wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + 1, 0x4300B1);
505 wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + 2, 0x4300B5);
506
507 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x3);
508 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x0);
509
510 for (i = 0; i < 3; i++) {
511 struct perf_event *event = cpuc->events[i];
512
513 if (!event)
514 continue;
515
516 __x86_pmu_enable_event(&event->hw,
517 ARCH_PERFMON_EVENTSEL_ENABLE);
518 }
519 }
520 intel_pmu_enable_all(added);
521}
522
536static inline u64 intel_pmu_get_status(void) 523static inline u64 intel_pmu_get_status(void)
537{ 524{
538 u64 status; 525 u64 status;
@@ -547,8 +534,7 @@ static inline void intel_pmu_ack_status(u64 ack)
547 wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); 534 wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
548} 535}
549 536
550static inline void 537static void intel_pmu_disable_fixed(struct hw_perf_event *hwc)
551intel_pmu_disable_fixed(struct hw_perf_event *hwc)
552{ 538{
553 int idx = hwc->idx - X86_PMC_IDX_FIXED; 539 int idx = hwc->idx - X86_PMC_IDX_FIXED;
554 u64 ctrl_val, mask; 540 u64 ctrl_val, mask;
@@ -557,71 +543,10 @@ intel_pmu_disable_fixed(struct hw_perf_event *hwc)
557 543
558 rdmsrl(hwc->config_base, ctrl_val); 544 rdmsrl(hwc->config_base, ctrl_val);
559 ctrl_val &= ~mask; 545 ctrl_val &= ~mask;
560 (void)checking_wrmsrl(hwc->config_base, ctrl_val); 546 wrmsrl(hwc->config_base, ctrl_val);
561}
562
563static void intel_pmu_drain_bts_buffer(void)
564{
565 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
566 struct debug_store *ds = cpuc->ds;
567 struct bts_record {
568 u64 from;
569 u64 to;
570 u64 flags;
571 };
572 struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS];
573 struct bts_record *at, *top;
574 struct perf_output_handle handle;
575 struct perf_event_header header;
576 struct perf_sample_data data;
577 struct pt_regs regs;
578
579 if (!event)
580 return;
581
582 if (!ds)
583 return;
584
585 at = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
586 top = (struct bts_record *)(unsigned long)ds->bts_index;
587
588 if (top <= at)
589 return;
590
591 ds->bts_index = ds->bts_buffer_base;
592
593 perf_sample_data_init(&data, 0);
594
595 data.period = event->hw.last_period;
596 regs.ip = 0;
597
598 /*
599 * Prepare a generic sample, i.e. fill in the invariant fields.
600 * We will overwrite the from and to address before we output
601 * the sample.
602 */
603 perf_prepare_sample(&header, &data, event, &regs);
604
605 if (perf_output_begin(&handle, event,
606 header.size * (top - at), 1, 1))
607 return;
608
609 for (; at < top; at++) {
610 data.ip = at->from;
611 data.addr = at->to;
612
613 perf_output_sample(&handle, &header, &data, event);
614 }
615
616 perf_output_end(&handle);
617
618 /* There's new data available. */
619 event->hw.interrupts++;
620 event->pending_kill = POLL_IN;
621} 547}
622 548
623static inline void 549static void intel_pmu_disable_event(struct perf_event *event)
624intel_pmu_disable_event(struct perf_event *event)
625{ 550{
626 struct hw_perf_event *hwc = &event->hw; 551 struct hw_perf_event *hwc = &event->hw;
627 552
@@ -637,14 +562,15 @@ intel_pmu_disable_event(struct perf_event *event)
637 } 562 }
638 563
639 x86_pmu_disable_event(event); 564 x86_pmu_disable_event(event);
565
566 if (unlikely(event->attr.precise_ip))
567 intel_pmu_pebs_disable(event);
640} 568}
641 569
642static inline void 570static void intel_pmu_enable_fixed(struct hw_perf_event *hwc)
643intel_pmu_enable_fixed(struct hw_perf_event *hwc)
644{ 571{
645 int idx = hwc->idx - X86_PMC_IDX_FIXED; 572 int idx = hwc->idx - X86_PMC_IDX_FIXED;
646 u64 ctrl_val, bits, mask; 573 u64 ctrl_val, bits, mask;
647 int err;
648 574
649 /* 575 /*
650 * Enable IRQ generation (0x8), 576 * Enable IRQ generation (0x8),
@@ -669,7 +595,7 @@ intel_pmu_enable_fixed(struct hw_perf_event *hwc)
669 rdmsrl(hwc->config_base, ctrl_val); 595 rdmsrl(hwc->config_base, ctrl_val);
670 ctrl_val &= ~mask; 596 ctrl_val &= ~mask;
671 ctrl_val |= bits; 597 ctrl_val |= bits;
672 err = checking_wrmsrl(hwc->config_base, ctrl_val); 598 wrmsrl(hwc->config_base, ctrl_val);
673} 599}
674 600
675static void intel_pmu_enable_event(struct perf_event *event) 601static void intel_pmu_enable_event(struct perf_event *event)
@@ -689,7 +615,10 @@ static void intel_pmu_enable_event(struct perf_event *event)
689 return; 615 return;
690 } 616 }
691 617
692 __x86_pmu_enable_event(hwc); 618 if (unlikely(event->attr.precise_ip))
619 intel_pmu_pebs_enable(event);
620
621 __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
693} 622}
694 623
695/* 624/*
@@ -708,20 +637,20 @@ static void intel_pmu_reset(void)
708 unsigned long flags; 637 unsigned long flags;
709 int idx; 638 int idx;
710 639
711 if (!x86_pmu.num_events) 640 if (!x86_pmu.num_counters)
712 return; 641 return;
713 642
714 local_irq_save(flags); 643 local_irq_save(flags);
715 644
716 printk("clearing PMU state on CPU#%d\n", smp_processor_id()); 645 printk("clearing PMU state on CPU#%d\n", smp_processor_id());
717 646
718 for (idx = 0; idx < x86_pmu.num_events; idx++) { 647 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
719 checking_wrmsrl(x86_pmu.eventsel + idx, 0ull); 648 checking_wrmsrl(x86_pmu.eventsel + idx, 0ull);
720 checking_wrmsrl(x86_pmu.perfctr + idx, 0ull); 649 checking_wrmsrl(x86_pmu.perfctr + idx, 0ull);
721 } 650 }
722 for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) { 651 for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++)
723 checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); 652 checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
724 } 653
725 if (ds) 654 if (ds)
726 ds->bts_index = ds->bts_buffer_base; 655 ds->bts_index = ds->bts_buffer_base;
727 656
@@ -747,7 +676,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
747 intel_pmu_drain_bts_buffer(); 676 intel_pmu_drain_bts_buffer();
748 status = intel_pmu_get_status(); 677 status = intel_pmu_get_status();
749 if (!status) { 678 if (!status) {
750 intel_pmu_enable_all(); 679 intel_pmu_enable_all(0);
751 return 0; 680 return 0;
752 } 681 }
753 682
@@ -762,6 +691,15 @@ again:
762 691
763 inc_irq_stat(apic_perf_irqs); 692 inc_irq_stat(apic_perf_irqs);
764 ack = status; 693 ack = status;
694
695 intel_pmu_lbr_read();
696
697 /*
698 * PEBS overflow sets bit 62 in the global status register
699 */
700 if (__test_and_clear_bit(62, (unsigned long *)&status))
701 x86_pmu.drain_pebs(regs);
702
765 for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { 703 for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
766 struct perf_event *event = cpuc->events[bit]; 704 struct perf_event *event = cpuc->events[bit];
767 705
@@ -787,26 +725,22 @@ again:
787 goto again; 725 goto again;
788 726
789done: 727done:
790 intel_pmu_enable_all(); 728 intel_pmu_enable_all(0);
791 return 1; 729 return 1;
792} 730}
793 731
794static struct event_constraint bts_constraint =
795 EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_FIXED_BTS, 0);
796
797static struct event_constraint * 732static struct event_constraint *
798intel_special_constraints(struct perf_event *event) 733intel_bts_constraints(struct perf_event *event)
799{ 734{
800 unsigned int hw_event; 735 struct hw_perf_event *hwc = &event->hw;
801 736 unsigned int hw_event, bts_event;
802 hw_event = event->hw.config & INTEL_ARCH_EVENT_MASK;
803 737
804 if (unlikely((hw_event == 738 hw_event = hwc->config & INTEL_ARCH_EVENT_MASK;
805 x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) && 739 bts_event = x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS);
806 (event->hw.sample_period == 1))) {
807 740
741 if (unlikely(hw_event == bts_event && hwc->sample_period == 1))
808 return &bts_constraint; 742 return &bts_constraint;
809 } 743
810 return NULL; 744 return NULL;
811} 745}
812 746
@@ -815,24 +749,53 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event
815{ 749{
816 struct event_constraint *c; 750 struct event_constraint *c;
817 751
818 c = intel_special_constraints(event); 752 c = intel_bts_constraints(event);
753 if (c)
754 return c;
755
756 c = intel_pebs_constraints(event);
819 if (c) 757 if (c)
820 return c; 758 return c;
821 759
822 return x86_get_event_constraints(cpuc, event); 760 return x86_get_event_constraints(cpuc, event);
823} 761}
824 762
825static __initconst struct x86_pmu core_pmu = { 763static int intel_pmu_hw_config(struct perf_event *event)
764{
765 int ret = x86_pmu_hw_config(event);
766
767 if (ret)
768 return ret;
769
770 if (event->attr.type != PERF_TYPE_RAW)
771 return 0;
772
773 if (!(event->attr.config & ARCH_PERFMON_EVENTSEL_ANY))
774 return 0;
775
776 if (x86_pmu.version < 3)
777 return -EINVAL;
778
779 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
780 return -EACCES;
781
782 event->hw.config |= ARCH_PERFMON_EVENTSEL_ANY;
783
784 return 0;
785}
786
787static __initconst const struct x86_pmu core_pmu = {
826 .name = "core", 788 .name = "core",
827 .handle_irq = x86_pmu_handle_irq, 789 .handle_irq = x86_pmu_handle_irq,
828 .disable_all = x86_pmu_disable_all, 790 .disable_all = x86_pmu_disable_all,
829 .enable_all = x86_pmu_enable_all, 791 .enable_all = x86_pmu_enable_all,
830 .enable = x86_pmu_enable_event, 792 .enable = x86_pmu_enable_event,
831 .disable = x86_pmu_disable_event, 793 .disable = x86_pmu_disable_event,
794 .hw_config = x86_pmu_hw_config,
795 .schedule_events = x86_schedule_events,
832 .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, 796 .eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
833 .perfctr = MSR_ARCH_PERFMON_PERFCTR0, 797 .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
834 .event_map = intel_pmu_event_map, 798 .event_map = intel_pmu_event_map,
835 .raw_event = intel_pmu_raw_event,
836 .max_events = ARRAY_SIZE(intel_perfmon_event_map), 799 .max_events = ARRAY_SIZE(intel_perfmon_event_map),
837 .apic = 1, 800 .apic = 1,
838 /* 801 /*
@@ -845,17 +808,32 @@ static __initconst struct x86_pmu core_pmu = {
845 .event_constraints = intel_core_event_constraints, 808 .event_constraints = intel_core_event_constraints,
846}; 809};
847 810
848static __initconst struct x86_pmu intel_pmu = { 811static void intel_pmu_cpu_starting(int cpu)
812{
813 init_debug_store_on_cpu(cpu);
814 /*
815 * Deal with CPUs that don't clear their LBRs on power-up.
816 */
817 intel_pmu_lbr_reset();
818}
819
820static void intel_pmu_cpu_dying(int cpu)
821{
822 fini_debug_store_on_cpu(cpu);
823}
824
825static __initconst const struct x86_pmu intel_pmu = {
849 .name = "Intel", 826 .name = "Intel",
850 .handle_irq = intel_pmu_handle_irq, 827 .handle_irq = intel_pmu_handle_irq,
851 .disable_all = intel_pmu_disable_all, 828 .disable_all = intel_pmu_disable_all,
852 .enable_all = intel_pmu_enable_all, 829 .enable_all = intel_pmu_enable_all,
853 .enable = intel_pmu_enable_event, 830 .enable = intel_pmu_enable_event,
854 .disable = intel_pmu_disable_event, 831 .disable = intel_pmu_disable_event,
832 .hw_config = intel_pmu_hw_config,
833 .schedule_events = x86_schedule_events,
855 .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, 834 .eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
856 .perfctr = MSR_ARCH_PERFMON_PERFCTR0, 835 .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
857 .event_map = intel_pmu_event_map, 836 .event_map = intel_pmu_event_map,
858 .raw_event = intel_pmu_raw_event,
859 .max_events = ARRAY_SIZE(intel_perfmon_event_map), 837 .max_events = ARRAY_SIZE(intel_perfmon_event_map),
860 .apic = 1, 838 .apic = 1,
861 /* 839 /*
@@ -864,14 +842,38 @@ static __initconst struct x86_pmu intel_pmu = {
864 * the generic event period: 842 * the generic event period:
865 */ 843 */
866 .max_period = (1ULL << 31) - 1, 844 .max_period = (1ULL << 31) - 1,
867 .enable_bts = intel_pmu_enable_bts,
868 .disable_bts = intel_pmu_disable_bts,
869 .get_event_constraints = intel_get_event_constraints, 845 .get_event_constraints = intel_get_event_constraints,
870 846
871 .cpu_starting = init_debug_store_on_cpu, 847 .cpu_starting = intel_pmu_cpu_starting,
872 .cpu_dying = fini_debug_store_on_cpu, 848 .cpu_dying = intel_pmu_cpu_dying,
873}; 849};
874 850
851static void intel_clovertown_quirks(void)
852{
853 /*
854 * PEBS is unreliable due to:
855 *
856 * AJ67 - PEBS may experience CPL leaks
857 * AJ68 - PEBS PMI may be delayed by one event
858 * AJ69 - GLOBAL_STATUS[62] will only be set when DEBUGCTL[12]
859 * AJ106 - FREEZE_LBRS_ON_PMI doesn't work in combination with PEBS
860 *
861 * AJ67 could be worked around by restricting the OS/USR flags.
862 * AJ69 could be worked around by setting PMU_FREEZE_ON_PMI.
863 *
864 * AJ106 could possibly be worked around by not allowing LBR
865 * usage from PEBS, including the fixup.
866 * AJ68 could possibly be worked around by always programming
867 * a pebs_event_reset[0] value and coping with the lost events.
868 *
869 * But taken together it might just make sense to not enable PEBS on
870 * these chips.
871 */
872 printk(KERN_WARNING "PEBS disabled due to CPU errata.\n");
873 x86_pmu.pebs = 0;
874 x86_pmu.pebs_constraints = NULL;
875}
876
875static __init int intel_pmu_init(void) 877static __init int intel_pmu_init(void)
876{ 878{
877 union cpuid10_edx edx; 879 union cpuid10_edx edx;
@@ -881,12 +883,13 @@ static __init int intel_pmu_init(void)
881 int version; 883 int version;
882 884
883 if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { 885 if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
884 /* check for P6 processor family */ 886 switch (boot_cpu_data.x86) {
885 if (boot_cpu_data.x86 == 6) { 887 case 0x6:
886 return p6_pmu_init(); 888 return p6_pmu_init();
887 } else { 889 case 0xf:
890 return p4_pmu_init();
891 }
888 return -ENODEV; 892 return -ENODEV;
889 }
890 } 893 }
891 894
892 /* 895 /*
@@ -904,16 +907,28 @@ static __init int intel_pmu_init(void)
904 x86_pmu = intel_pmu; 907 x86_pmu = intel_pmu;
905 908
906 x86_pmu.version = version; 909 x86_pmu.version = version;
907 x86_pmu.num_events = eax.split.num_events; 910 x86_pmu.num_counters = eax.split.num_counters;
908 x86_pmu.event_bits = eax.split.bit_width; 911 x86_pmu.cntval_bits = eax.split.bit_width;
909 x86_pmu.event_mask = (1ULL << eax.split.bit_width) - 1; 912 x86_pmu.cntval_mask = (1ULL << eax.split.bit_width) - 1;
910 913
911 /* 914 /*
912 * Quirk: v2 perfmon does not report fixed-purpose events, so 915 * Quirk: v2 perfmon does not report fixed-purpose events, so
913 * assume at least 3 events: 916 * assume at least 3 events:
914 */ 917 */
915 if (version > 1) 918 if (version > 1)
916 x86_pmu.num_events_fixed = max((int)edx.split.num_events_fixed, 3); 919 x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3);
920
921 /*
922 * v2 and above have a perf capabilities MSR
923 */
924 if (version > 1) {
925 u64 capabilities;
926
927 rdmsrl(MSR_IA32_PERF_CAPABILITIES, capabilities);
928 x86_pmu.intel_cap.capabilities = capabilities;
929 }
930
931 intel_ds_init();
917 932
918 /* 933 /*
919 * Install the hw-cache-events table: 934 * Install the hw-cache-events table:
@@ -924,12 +939,15 @@ static __init int intel_pmu_init(void)
924 break; 939 break;
925 940
926 case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */ 941 case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
942 x86_pmu.quirks = intel_clovertown_quirks;
927 case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */ 943 case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
928 case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */ 944 case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
929 case 29: /* six-core 45 nm xeon "Dunnington" */ 945 case 29: /* six-core 45 nm xeon "Dunnington" */
930 memcpy(hw_cache_event_ids, core2_hw_cache_event_ids, 946 memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
931 sizeof(hw_cache_event_ids)); 947 sizeof(hw_cache_event_ids));
932 948
949 intel_pmu_lbr_init_core();
950
933 x86_pmu.event_constraints = intel_core2_event_constraints; 951 x86_pmu.event_constraints = intel_core2_event_constraints;
934 pr_cont("Core2 events, "); 952 pr_cont("Core2 events, ");
935 break; 953 break;
@@ -940,13 +958,19 @@ static __init int intel_pmu_init(void)
940 memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, 958 memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
941 sizeof(hw_cache_event_ids)); 959 sizeof(hw_cache_event_ids));
942 960
961 intel_pmu_lbr_init_nhm();
962
943 x86_pmu.event_constraints = intel_nehalem_event_constraints; 963 x86_pmu.event_constraints = intel_nehalem_event_constraints;
944 pr_cont("Nehalem/Corei7 events, "); 964 x86_pmu.enable_all = intel_pmu_nhm_enable_all;
965 pr_cont("Nehalem events, ");
945 break; 966 break;
967
946 case 28: /* Atom */ 968 case 28: /* Atom */
947 memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, 969 memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
948 sizeof(hw_cache_event_ids)); 970 sizeof(hw_cache_event_ids));
949 971
972 intel_pmu_lbr_init_atom();
973
950 x86_pmu.event_constraints = intel_gen_event_constraints; 974 x86_pmu.event_constraints = intel_gen_event_constraints;
951 pr_cont("Atom events, "); 975 pr_cont("Atom events, ");
952 break; 976 break;
@@ -956,7 +980,10 @@ static __init int intel_pmu_init(void)
956 memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids, 980 memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids,
957 sizeof(hw_cache_event_ids)); 981 sizeof(hw_cache_event_ids));
958 982
983 intel_pmu_lbr_init_nhm();
984
959 x86_pmu.event_constraints = intel_westmere_event_constraints; 985 x86_pmu.event_constraints = intel_westmere_event_constraints;
986 x86_pmu.enable_all = intel_pmu_nhm_enable_all;
960 pr_cont("Westmere events, "); 987 pr_cont("Westmere events, ");
961 break; 988 break;
962 989
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
new file mode 100644
index 000000000000..18018d1311cd
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -0,0 +1,641 @@
1#ifdef CONFIG_CPU_SUP_INTEL
2
3/* The maximal number of PEBS events: */
4#define MAX_PEBS_EVENTS 4
5
6/* The size of a BTS record in bytes: */
7#define BTS_RECORD_SIZE 24
8
9#define BTS_BUFFER_SIZE (PAGE_SIZE << 4)
10#define PEBS_BUFFER_SIZE PAGE_SIZE
11
12/*
13 * pebs_record_32 for p4 and core not supported
14
15struct pebs_record_32 {
16 u32 flags, ip;
17 u32 ax, bc, cx, dx;
18 u32 si, di, bp, sp;
19};
20
21 */
22
23struct pebs_record_core {
24 u64 flags, ip;
25 u64 ax, bx, cx, dx;
26 u64 si, di, bp, sp;
27 u64 r8, r9, r10, r11;
28 u64 r12, r13, r14, r15;
29};
30
31struct pebs_record_nhm {
32 u64 flags, ip;
33 u64 ax, bx, cx, dx;
34 u64 si, di, bp, sp;
35 u64 r8, r9, r10, r11;
36 u64 r12, r13, r14, r15;
37 u64 status, dla, dse, lat;
38};
39
40/*
41 * A debug store configuration.
42 *
43 * We only support architectures that use 64bit fields.
44 */
45struct debug_store {
46 u64 bts_buffer_base;
47 u64 bts_index;
48 u64 bts_absolute_maximum;
49 u64 bts_interrupt_threshold;
50 u64 pebs_buffer_base;
51 u64 pebs_index;
52 u64 pebs_absolute_maximum;
53 u64 pebs_interrupt_threshold;
54 u64 pebs_event_reset[MAX_PEBS_EVENTS];
55};
56
57static void init_debug_store_on_cpu(int cpu)
58{
59 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
60
61 if (!ds)
62 return;
63
64 wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA,
65 (u32)((u64)(unsigned long)ds),
66 (u32)((u64)(unsigned long)ds >> 32));
67}
68
69static void fini_debug_store_on_cpu(int cpu)
70{
71 if (!per_cpu(cpu_hw_events, cpu).ds)
72 return;
73
74 wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
75}
76
77static void release_ds_buffers(void)
78{
79 int cpu;
80
81 if (!x86_pmu.bts && !x86_pmu.pebs)
82 return;
83
84 get_online_cpus();
85
86 for_each_online_cpu(cpu)
87 fini_debug_store_on_cpu(cpu);
88
89 for_each_possible_cpu(cpu) {
90 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
91
92 if (!ds)
93 continue;
94
95 per_cpu(cpu_hw_events, cpu).ds = NULL;
96
97 kfree((void *)(unsigned long)ds->pebs_buffer_base);
98 kfree((void *)(unsigned long)ds->bts_buffer_base);
99 kfree(ds);
100 }
101
102 put_online_cpus();
103}
104
105static int reserve_ds_buffers(void)
106{
107 int cpu, err = 0;
108
109 if (!x86_pmu.bts && !x86_pmu.pebs)
110 return 0;
111
112 get_online_cpus();
113
114 for_each_possible_cpu(cpu) {
115 struct debug_store *ds;
116 void *buffer;
117 int max, thresh;
118
119 err = -ENOMEM;
120 ds = kzalloc(sizeof(*ds), GFP_KERNEL);
121 if (unlikely(!ds))
122 break;
123 per_cpu(cpu_hw_events, cpu).ds = ds;
124
125 if (x86_pmu.bts) {
126 buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL);
127 if (unlikely(!buffer))
128 break;
129
130 max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;
131 thresh = max / 16;
132
133 ds->bts_buffer_base = (u64)(unsigned long)buffer;
134 ds->bts_index = ds->bts_buffer_base;
135 ds->bts_absolute_maximum = ds->bts_buffer_base +
136 max * BTS_RECORD_SIZE;
137 ds->bts_interrupt_threshold = ds->bts_absolute_maximum -
138 thresh * BTS_RECORD_SIZE;
139 }
140
141 if (x86_pmu.pebs) {
142 buffer = kzalloc(PEBS_BUFFER_SIZE, GFP_KERNEL);
143 if (unlikely(!buffer))
144 break;
145
146 max = PEBS_BUFFER_SIZE / x86_pmu.pebs_record_size;
147
148 ds->pebs_buffer_base = (u64)(unsigned long)buffer;
149 ds->pebs_index = ds->pebs_buffer_base;
150 ds->pebs_absolute_maximum = ds->pebs_buffer_base +
151 max * x86_pmu.pebs_record_size;
152 /*
153 * Always use single record PEBS
154 */
155 ds->pebs_interrupt_threshold = ds->pebs_buffer_base +
156 x86_pmu.pebs_record_size;
157 }
158
159 err = 0;
160 }
161
162 if (err)
163 release_ds_buffers();
164 else {
165 for_each_online_cpu(cpu)
166 init_debug_store_on_cpu(cpu);
167 }
168
169 put_online_cpus();
170
171 return err;
172}
173
174/*
175 * BTS
176 */
177
178static struct event_constraint bts_constraint =
179 EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_FIXED_BTS, 0);
180
181static void intel_pmu_enable_bts(u64 config)
182{
183 unsigned long debugctlmsr;
184
185 debugctlmsr = get_debugctlmsr();
186
187 debugctlmsr |= DEBUGCTLMSR_TR;
188 debugctlmsr |= DEBUGCTLMSR_BTS;
189 debugctlmsr |= DEBUGCTLMSR_BTINT;
190
191 if (!(config & ARCH_PERFMON_EVENTSEL_OS))
192 debugctlmsr |= DEBUGCTLMSR_BTS_OFF_OS;
193
194 if (!(config & ARCH_PERFMON_EVENTSEL_USR))
195 debugctlmsr |= DEBUGCTLMSR_BTS_OFF_USR;
196
197 update_debugctlmsr(debugctlmsr);
198}
199
200static void intel_pmu_disable_bts(void)
201{
202 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
203 unsigned long debugctlmsr;
204
205 if (!cpuc->ds)
206 return;
207
208 debugctlmsr = get_debugctlmsr();
209
210 debugctlmsr &=
211 ~(DEBUGCTLMSR_TR | DEBUGCTLMSR_BTS | DEBUGCTLMSR_BTINT |
212 DEBUGCTLMSR_BTS_OFF_OS | DEBUGCTLMSR_BTS_OFF_USR);
213
214 update_debugctlmsr(debugctlmsr);
215}
216
217static void intel_pmu_drain_bts_buffer(void)
218{
219 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
220 struct debug_store *ds = cpuc->ds;
221 struct bts_record {
222 u64 from;
223 u64 to;
224 u64 flags;
225 };
226 struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS];
227 struct bts_record *at, *top;
228 struct perf_output_handle handle;
229 struct perf_event_header header;
230 struct perf_sample_data data;
231 struct pt_regs regs;
232
233 if (!event)
234 return;
235
236 if (!ds)
237 return;
238
239 at = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
240 top = (struct bts_record *)(unsigned long)ds->bts_index;
241
242 if (top <= at)
243 return;
244
245 ds->bts_index = ds->bts_buffer_base;
246
247 perf_sample_data_init(&data, 0);
248 data.period = event->hw.last_period;
249 regs.ip = 0;
250
251 /*
252 * Prepare a generic sample, i.e. fill in the invariant fields.
253 * We will overwrite the from and to address before we output
254 * the sample.
255 */
256 perf_prepare_sample(&header, &data, event, &regs);
257
258 if (perf_output_begin(&handle, event, header.size * (top - at), 1, 1))
259 return;
260
261 for (; at < top; at++) {
262 data.ip = at->from;
263 data.addr = at->to;
264
265 perf_output_sample(&handle, &header, &data, event);
266 }
267
268 perf_output_end(&handle);
269
270 /* There's new data available. */
271 event->hw.interrupts++;
272 event->pending_kill = POLL_IN;
273}
274
275/*
276 * PEBS
277 */
278
279static struct event_constraint intel_core_pebs_events[] = {
280 PEBS_EVENT_CONSTRAINT(0x00c0, 0x1), /* INSTR_RETIRED.ANY */
281 PEBS_EVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */
282 PEBS_EVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */
283 PEBS_EVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETURED.ANY */
284 PEBS_EVENT_CONSTRAINT(0x01cb, 0x1), /* MEM_LOAD_RETIRED.L1D_MISS */
285 PEBS_EVENT_CONSTRAINT(0x02cb, 0x1), /* MEM_LOAD_RETIRED.L1D_LINE_MISS */
286 PEBS_EVENT_CONSTRAINT(0x04cb, 0x1), /* MEM_LOAD_RETIRED.L2_MISS */
287 PEBS_EVENT_CONSTRAINT(0x08cb, 0x1), /* MEM_LOAD_RETIRED.L2_LINE_MISS */
288 PEBS_EVENT_CONSTRAINT(0x10cb, 0x1), /* MEM_LOAD_RETIRED.DTLB_MISS */
289 EVENT_CONSTRAINT_END
290};
291
292static struct event_constraint intel_nehalem_pebs_events[] = {
293 PEBS_EVENT_CONSTRAINT(0x00c0, 0xf), /* INSTR_RETIRED.ANY */
294 PEBS_EVENT_CONSTRAINT(0xfec1, 0xf), /* X87_OPS_RETIRED.ANY */
295 PEBS_EVENT_CONSTRAINT(0x00c5, 0xf), /* BR_INST_RETIRED.MISPRED */
296 PEBS_EVENT_CONSTRAINT(0x1fc7, 0xf), /* SIMD_INST_RETURED.ANY */
297 PEBS_EVENT_CONSTRAINT(0x01cb, 0xf), /* MEM_LOAD_RETIRED.L1D_MISS */
298 PEBS_EVENT_CONSTRAINT(0x02cb, 0xf), /* MEM_LOAD_RETIRED.L1D_LINE_MISS */
299 PEBS_EVENT_CONSTRAINT(0x04cb, 0xf), /* MEM_LOAD_RETIRED.L2_MISS */
300 PEBS_EVENT_CONSTRAINT(0x08cb, 0xf), /* MEM_LOAD_RETIRED.L2_LINE_MISS */
301 PEBS_EVENT_CONSTRAINT(0x10cb, 0xf), /* MEM_LOAD_RETIRED.DTLB_MISS */
302 EVENT_CONSTRAINT_END
303};
304
305static struct event_constraint *
306intel_pebs_constraints(struct perf_event *event)
307{
308 struct event_constraint *c;
309
310 if (!event->attr.precise_ip)
311 return NULL;
312
313 if (x86_pmu.pebs_constraints) {
314 for_each_event_constraint(c, x86_pmu.pebs_constraints) {
315 if ((event->hw.config & c->cmask) == c->code)
316 return c;
317 }
318 }
319
320 return &emptyconstraint;
321}
322
323static void intel_pmu_pebs_enable(struct perf_event *event)
324{
325 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
326 struct hw_perf_event *hwc = &event->hw;
327
328 hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;
329
330 cpuc->pebs_enabled |= 1ULL << hwc->idx;
331 WARN_ON_ONCE(cpuc->enabled);
332
333 if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1)
334 intel_pmu_lbr_enable(event);
335}
336
337static void intel_pmu_pebs_disable(struct perf_event *event)
338{
339 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
340 struct hw_perf_event *hwc = &event->hw;
341
342 cpuc->pebs_enabled &= ~(1ULL << hwc->idx);
343 if (cpuc->enabled)
344 wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
345
346 hwc->config |= ARCH_PERFMON_EVENTSEL_INT;
347
348 if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1)
349 intel_pmu_lbr_disable(event);
350}
351
352static void intel_pmu_pebs_enable_all(void)
353{
354 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
355
356 if (cpuc->pebs_enabled)
357 wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
358}
359
360static void intel_pmu_pebs_disable_all(void)
361{
362 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
363
364 if (cpuc->pebs_enabled)
365 wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
366}
367
368#include <asm/insn.h>
369
370static inline bool kernel_ip(unsigned long ip)
371{
372#ifdef CONFIG_X86_32
373 return ip > PAGE_OFFSET;
374#else
375 return (long)ip < 0;
376#endif
377}
378
379static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
380{
381 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
382 unsigned long from = cpuc->lbr_entries[0].from;
383 unsigned long old_to, to = cpuc->lbr_entries[0].to;
384 unsigned long ip = regs->ip;
385
386 /*
387 * We don't need to fixup if the PEBS assist is fault like
388 */
389 if (!x86_pmu.intel_cap.pebs_trap)
390 return 1;
391
392 /*
393 * No LBR entry, no basic block, no rewinding
394 */
395 if (!cpuc->lbr_stack.nr || !from || !to)
396 return 0;
397
398 /*
399 * Basic blocks should never cross user/kernel boundaries
400 */
401 if (kernel_ip(ip) != kernel_ip(to))
402 return 0;
403
404 /*
405 * unsigned math, either ip is before the start (impossible) or
406 * the basic block is larger than 1 page (sanity)
407 */
408 if ((ip - to) > PAGE_SIZE)
409 return 0;
410
411 /*
412 * We sampled a branch insn, rewind using the LBR stack
413 */
414 if (ip == to) {
415 regs->ip = from;
416 return 1;
417 }
418
419 do {
420 struct insn insn;
421 u8 buf[MAX_INSN_SIZE];
422 void *kaddr;
423
424 old_to = to;
425 if (!kernel_ip(ip)) {
426 int bytes, size = MAX_INSN_SIZE;
427
428 bytes = copy_from_user_nmi(buf, (void __user *)to, size);
429 if (bytes != size)
430 return 0;
431
432 kaddr = buf;
433 } else
434 kaddr = (void *)to;
435
436 kernel_insn_init(&insn, kaddr);
437 insn_get_length(&insn);
438 to += insn.length;
439 } while (to < ip);
440
441 if (to == ip) {
442 regs->ip = old_to;
443 return 1;
444 }
445
446 /*
447 * Even though we decoded the basic block, the instruction stream
448 * never matched the given IP, either the TO or the IP got corrupted.
449 */
450 return 0;
451}
452
453static int intel_pmu_save_and_restart(struct perf_event *event);
454
455static void __intel_pmu_pebs_event(struct perf_event *event,
456 struct pt_regs *iregs, void *__pebs)
457{
458 /*
459 * We cast to pebs_record_core since that is a subset of
460 * both formats and we don't use the other fields in this
461 * routine.
462 */
463 struct pebs_record_core *pebs = __pebs;
464 struct perf_sample_data data;
465 struct pt_regs regs;
466
467 if (!intel_pmu_save_and_restart(event))
468 return;
469
470 perf_sample_data_init(&data, 0);
471 data.period = event->hw.last_period;
472
473 /*
474 * We use the interrupt regs as a base because the PEBS record
475 * does not contain a full regs set, specifically it seems to
476 * lack segment descriptors, which get used by things like
477 * user_mode().
478 *
479 * In the simple case fix up only the IP and BP,SP regs, for
480 * PERF_SAMPLE_IP and PERF_SAMPLE_CALLCHAIN to function properly.
481 * A possible PERF_SAMPLE_REGS will have to transfer all regs.
482 */
483 regs = *iregs;
484 regs.ip = pebs->ip;
485 regs.bp = pebs->bp;
486 regs.sp = pebs->sp;
487
488 if (event->attr.precise_ip > 1 && intel_pmu_pebs_fixup_ip(&regs))
489 regs.flags |= PERF_EFLAGS_EXACT;
490 else
491 regs.flags &= ~PERF_EFLAGS_EXACT;
492
493 if (perf_event_overflow(event, 1, &data, &regs))
494 x86_pmu_stop(event);
495}
496
497static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
498{
499 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
500 struct debug_store *ds = cpuc->ds;
501 struct perf_event *event = cpuc->events[0]; /* PMC0 only */
502 struct pebs_record_core *at, *top;
503 int n;
504
505 if (!ds || !x86_pmu.pebs)
506 return;
507
508 at = (struct pebs_record_core *)(unsigned long)ds->pebs_buffer_base;
509 top = (struct pebs_record_core *)(unsigned long)ds->pebs_index;
510
511 /*
512 * Whatever else happens, drain the thing
513 */
514 ds->pebs_index = ds->pebs_buffer_base;
515
516 if (!test_bit(0, cpuc->active_mask))
517 return;
518
519 WARN_ON_ONCE(!event);
520
521 if (!event->attr.precise_ip)
522 return;
523
524 n = top - at;
525 if (n <= 0)
526 return;
527
528 /*
529 * Should not happen, we program the threshold at 1 and do not
530 * set a reset value.
531 */
532 WARN_ON_ONCE(n > 1);
533 at += n - 1;
534
535 __intel_pmu_pebs_event(event, iregs, at);
536}
537
538static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
539{
540 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
541 struct debug_store *ds = cpuc->ds;
542 struct pebs_record_nhm *at, *top;
543 struct perf_event *event = NULL;
544 u64 status = 0;
545 int bit, n;
546
547 if (!ds || !x86_pmu.pebs)
548 return;
549
550 at = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base;
551 top = (struct pebs_record_nhm *)(unsigned long)ds->pebs_index;
552
553 ds->pebs_index = ds->pebs_buffer_base;
554
555 n = top - at;
556 if (n <= 0)
557 return;
558
559 /*
560 * Should not happen, we program the threshold at 1 and do not
561 * set a reset value.
562 */
563 WARN_ON_ONCE(n > MAX_PEBS_EVENTS);
564
565 for ( ; at < top; at++) {
566 for_each_set_bit(bit, (unsigned long *)&at->status, MAX_PEBS_EVENTS) {
567 event = cpuc->events[bit];
568 if (!test_bit(bit, cpuc->active_mask))
569 continue;
570
571 WARN_ON_ONCE(!event);
572
573 if (!event->attr.precise_ip)
574 continue;
575
576 if (__test_and_set_bit(bit, (unsigned long *)&status))
577 continue;
578
579 break;
580 }
581
582 if (!event || bit >= MAX_PEBS_EVENTS)
583 continue;
584
585 __intel_pmu_pebs_event(event, iregs, at);
586 }
587}
588
589/*
590 * BTS, PEBS probe and setup
591 */
592
593static void intel_ds_init(void)
594{
595 /*
596 * No support for 32bit formats
597 */
598 if (!boot_cpu_has(X86_FEATURE_DTES64))
599 return;
600
601 x86_pmu.bts = boot_cpu_has(X86_FEATURE_BTS);
602 x86_pmu.pebs = boot_cpu_has(X86_FEATURE_PEBS);
603 if (x86_pmu.pebs) {
604 char pebs_type = x86_pmu.intel_cap.pebs_trap ? '+' : '-';
605 int format = x86_pmu.intel_cap.pebs_format;
606
607 switch (format) {
608 case 0:
609 printk(KERN_CONT "PEBS fmt0%c, ", pebs_type);
610 x86_pmu.pebs_record_size = sizeof(struct pebs_record_core);
611 x86_pmu.drain_pebs = intel_pmu_drain_pebs_core;
612 x86_pmu.pebs_constraints = intel_core_pebs_events;
613 break;
614
615 case 1:
616 printk(KERN_CONT "PEBS fmt1%c, ", pebs_type);
617 x86_pmu.pebs_record_size = sizeof(struct pebs_record_nhm);
618 x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
619 x86_pmu.pebs_constraints = intel_nehalem_pebs_events;
620 break;
621
622 default:
623 printk(KERN_CONT "no PEBS fmt%d%c, ", format, pebs_type);
624 x86_pmu.pebs = 0;
625 break;
626 }
627 }
628}
629
630#else /* CONFIG_CPU_SUP_INTEL */
631
632static int reserve_ds_buffers(void)
633{
634 return 0;
635}
636
637static void release_ds_buffers(void)
638{
639}
640
641#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
new file mode 100644
index 000000000000..d202c1bece1a
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -0,0 +1,218 @@
1#ifdef CONFIG_CPU_SUP_INTEL
2
3enum {
4 LBR_FORMAT_32 = 0x00,
5 LBR_FORMAT_LIP = 0x01,
6 LBR_FORMAT_EIP = 0x02,
7 LBR_FORMAT_EIP_FLAGS = 0x03,
8};
9
10/*
11 * We only support LBR implementations that have FREEZE_LBRS_ON_PMI
12 * otherwise it becomes near impossible to get a reliable stack.
13 */
14
15static void __intel_pmu_lbr_enable(void)
16{
17 u64 debugctl;
18
19 rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
20 debugctl |= (DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
21 wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
22}
23
24static void __intel_pmu_lbr_disable(void)
25{
26 u64 debugctl;
27
28 rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
29 debugctl &= ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
30 wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
31}
32
33static void intel_pmu_lbr_reset_32(void)
34{
35 int i;
36
37 for (i = 0; i < x86_pmu.lbr_nr; i++)
38 wrmsrl(x86_pmu.lbr_from + i, 0);
39}
40
41static void intel_pmu_lbr_reset_64(void)
42{
43 int i;
44
45 for (i = 0; i < x86_pmu.lbr_nr; i++) {
46 wrmsrl(x86_pmu.lbr_from + i, 0);
47 wrmsrl(x86_pmu.lbr_to + i, 0);
48 }
49}
50
51static void intel_pmu_lbr_reset(void)
52{
53 if (!x86_pmu.lbr_nr)
54 return;
55
56 if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32)
57 intel_pmu_lbr_reset_32();
58 else
59 intel_pmu_lbr_reset_64();
60}
61
62static void intel_pmu_lbr_enable(struct perf_event *event)
63{
64 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
65
66 if (!x86_pmu.lbr_nr)
67 return;
68
69 WARN_ON_ONCE(cpuc->enabled);
70
71 /*
72 * Reset the LBR stack if we changed task context to
73 * avoid data leaks.
74 */
75
76 if (event->ctx->task && cpuc->lbr_context != event->ctx) {
77 intel_pmu_lbr_reset();
78 cpuc->lbr_context = event->ctx;
79 }
80
81 cpuc->lbr_users++;
82}
83
84static void intel_pmu_lbr_disable(struct perf_event *event)
85{
86 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
87
88 if (!x86_pmu.lbr_nr)
89 return;
90
91 cpuc->lbr_users--;
92 WARN_ON_ONCE(cpuc->lbr_users < 0);
93
94 if (cpuc->enabled && !cpuc->lbr_users)
95 __intel_pmu_lbr_disable();
96}
97
98static void intel_pmu_lbr_enable_all(void)
99{
100 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
101
102 if (cpuc->lbr_users)
103 __intel_pmu_lbr_enable();
104}
105
106static void intel_pmu_lbr_disable_all(void)
107{
108 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
109
110 if (cpuc->lbr_users)
111 __intel_pmu_lbr_disable();
112}
113
114static inline u64 intel_pmu_lbr_tos(void)
115{
116 u64 tos;
117
118 rdmsrl(x86_pmu.lbr_tos, tos);
119
120 return tos;
121}
122
123static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
124{
125 unsigned long mask = x86_pmu.lbr_nr - 1;
126 u64 tos = intel_pmu_lbr_tos();
127 int i;
128
129 for (i = 0; i < x86_pmu.lbr_nr; i++) {
130 unsigned long lbr_idx = (tos - i) & mask;
131 union {
132 struct {
133 u32 from;
134 u32 to;
135 };
136 u64 lbr;
137 } msr_lastbranch;
138
139 rdmsrl(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr);
140
141 cpuc->lbr_entries[i].from = msr_lastbranch.from;
142 cpuc->lbr_entries[i].to = msr_lastbranch.to;
143 cpuc->lbr_entries[i].flags = 0;
144 }
145 cpuc->lbr_stack.nr = i;
146}
147
148#define LBR_FROM_FLAG_MISPRED (1ULL << 63)
149
150/*
151 * Due to lack of segmentation in Linux the effective address (offset)
152 * is the same as the linear address, allowing us to merge the LIP and EIP
153 * LBR formats.
154 */
155static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
156{
157 unsigned long mask = x86_pmu.lbr_nr - 1;
158 int lbr_format = x86_pmu.intel_cap.lbr_format;
159 u64 tos = intel_pmu_lbr_tos();
160 int i;
161
162 for (i = 0; i < x86_pmu.lbr_nr; i++) {
163 unsigned long lbr_idx = (tos - i) & mask;
164 u64 from, to, flags = 0;
165
166 rdmsrl(x86_pmu.lbr_from + lbr_idx, from);
167 rdmsrl(x86_pmu.lbr_to + lbr_idx, to);
168
169 if (lbr_format == LBR_FORMAT_EIP_FLAGS) {
170 flags = !!(from & LBR_FROM_FLAG_MISPRED);
171 from = (u64)((((s64)from) << 1) >> 1);
172 }
173
174 cpuc->lbr_entries[i].from = from;
175 cpuc->lbr_entries[i].to = to;
176 cpuc->lbr_entries[i].flags = flags;
177 }
178 cpuc->lbr_stack.nr = i;
179}
180
181static void intel_pmu_lbr_read(void)
182{
183 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
184
185 if (!cpuc->lbr_users)
186 return;
187
188 if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32)
189 intel_pmu_lbr_read_32(cpuc);
190 else
191 intel_pmu_lbr_read_64(cpuc);
192}
193
194static void intel_pmu_lbr_init_core(void)
195{
196 x86_pmu.lbr_nr = 4;
197 x86_pmu.lbr_tos = 0x01c9;
198 x86_pmu.lbr_from = 0x40;
199 x86_pmu.lbr_to = 0x60;
200}
201
202static void intel_pmu_lbr_init_nhm(void)
203{
204 x86_pmu.lbr_nr = 16;
205 x86_pmu.lbr_tos = 0x01c9;
206 x86_pmu.lbr_from = 0x680;
207 x86_pmu.lbr_to = 0x6c0;
208}
209
210static void intel_pmu_lbr_init_atom(void)
211{
212 x86_pmu.lbr_nr = 8;
213 x86_pmu.lbr_tos = 0x01c9;
214 x86_pmu.lbr_from = 0x40;
215 x86_pmu.lbr_to = 0x60;
216}
217
218#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
new file mode 100644
index 000000000000..424fc8de68e4
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -0,0 +1,857 @@
1/*
2 * Netburst Perfomance Events (P4, old Xeon)
3 *
4 * Copyright (C) 2010 Parallels, Inc., Cyrill Gorcunov <gorcunov@openvz.org>
5 * Copyright (C) 2010 Intel Corporation, Lin Ming <ming.m.lin@intel.com>
6 *
7 * For licencing details see kernel-base/COPYING
8 */
9
10#ifdef CONFIG_CPU_SUP_INTEL
11
12#include <asm/perf_event_p4.h>
13
14#define P4_CNTR_LIMIT 3
15/*
16 * array indices: 0,1 - HT threads, used with HT enabled cpu
17 */
18struct p4_event_bind {
19 unsigned int opcode; /* Event code and ESCR selector */
20 unsigned int escr_msr[2]; /* ESCR MSR for this event */
21 char cntr[2][P4_CNTR_LIMIT]; /* counter index (offset), -1 on abscence */
22};
23
24struct p4_cache_event_bind {
25 unsigned int metric_pebs;
26 unsigned int metric_vert;
27};
28
29#define P4_GEN_CACHE_EVENT_BIND(name) \
30 [P4_CACHE__##name] = { \
31 .metric_pebs = P4_PEBS__##name, \
32 .metric_vert = P4_VERT__##name, \
33 }
34
35static struct p4_cache_event_bind p4_cache_event_bind_map[] = {
36 P4_GEN_CACHE_EVENT_BIND(1stl_cache_load_miss_retired),
37 P4_GEN_CACHE_EVENT_BIND(2ndl_cache_load_miss_retired),
38 P4_GEN_CACHE_EVENT_BIND(dtlb_load_miss_retired),
39 P4_GEN_CACHE_EVENT_BIND(dtlb_store_miss_retired),
40};
41
42/*
43 * Note that we don't use CCCR1 here, there is an
44 * exception for P4_BSQ_ALLOCATION but we just have
45 * no workaround
46 *
47 * consider this binding as resources which particular
48 * event may borrow, it doesn't contain EventMask,
49 * Tags and friends -- they are left to a caller
50 */
51static struct p4_event_bind p4_event_bind_map[] = {
52 [P4_EVENT_TC_DELIVER_MODE] = {
53 .opcode = P4_OPCODE(P4_EVENT_TC_DELIVER_MODE),
54 .escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 },
55 .cntr = { {4, 5, -1}, {6, 7, -1} },
56 },
57 [P4_EVENT_BPU_FETCH_REQUEST] = {
58 .opcode = P4_OPCODE(P4_EVENT_BPU_FETCH_REQUEST),
59 .escr_msr = { MSR_P4_BPU_ESCR0, MSR_P4_BPU_ESCR1 },
60 .cntr = { {0, -1, -1}, {2, -1, -1} },
61 },
62 [P4_EVENT_ITLB_REFERENCE] = {
63 .opcode = P4_OPCODE(P4_EVENT_ITLB_REFERENCE),
64 .escr_msr = { MSR_P4_ITLB_ESCR0, MSR_P4_ITLB_ESCR1 },
65 .cntr = { {0, -1, -1}, {2, -1, -1} },
66 },
67 [P4_EVENT_MEMORY_CANCEL] = {
68 .opcode = P4_OPCODE(P4_EVENT_MEMORY_CANCEL),
69 .escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 },
70 .cntr = { {8, 9, -1}, {10, 11, -1} },
71 },
72 [P4_EVENT_MEMORY_COMPLETE] = {
73 .opcode = P4_OPCODE(P4_EVENT_MEMORY_COMPLETE),
74 .escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 },
75 .cntr = { {8, 9, -1}, {10, 11, -1} },
76 },
77 [P4_EVENT_LOAD_PORT_REPLAY] = {
78 .opcode = P4_OPCODE(P4_EVENT_LOAD_PORT_REPLAY),
79 .escr_msr = { MSR_P4_SAAT_ESCR0, MSR_P4_SAAT_ESCR1 },
80 .cntr = { {8, 9, -1}, {10, 11, -1} },
81 },
82 [P4_EVENT_STORE_PORT_REPLAY] = {
83 .opcode = P4_OPCODE(P4_EVENT_STORE_PORT_REPLAY),
84 .escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 },
85 .cntr = { {8, 9, -1}, {10, 11, -1} },
86 },
87 [P4_EVENT_MOB_LOAD_REPLAY] = {
88 .opcode = P4_OPCODE(P4_EVENT_MOB_LOAD_REPLAY),
89 .escr_msr = { MSR_P4_MOB_ESCR0, MSR_P4_MOB_ESCR1 },
90 .cntr = { {0, -1, -1}, {2, -1, -1} },
91 },
92 [P4_EVENT_PAGE_WALK_TYPE] = {
93 .opcode = P4_OPCODE(P4_EVENT_PAGE_WALK_TYPE),
94 .escr_msr = { MSR_P4_PMH_ESCR0, MSR_P4_PMH_ESCR1 },
95 .cntr = { {0, -1, -1}, {2, -1, -1} },
96 },
97 [P4_EVENT_BSQ_CACHE_REFERENCE] = {
98 .opcode = P4_OPCODE(P4_EVENT_BSQ_CACHE_REFERENCE),
99 .escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1 },
100 .cntr = { {0, -1, -1}, {2, -1, -1} },
101 },
102 [P4_EVENT_IOQ_ALLOCATION] = {
103 .opcode = P4_OPCODE(P4_EVENT_IOQ_ALLOCATION),
104 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
105 .cntr = { {0, -1, -1}, {2, -1, -1} },
106 },
107 [P4_EVENT_IOQ_ACTIVE_ENTRIES] = { /* shared ESCR */
108 .opcode = P4_OPCODE(P4_EVENT_IOQ_ACTIVE_ENTRIES),
109 .escr_msr = { MSR_P4_FSB_ESCR1, MSR_P4_FSB_ESCR1 },
110 .cntr = { {2, -1, -1}, {3, -1, -1} },
111 },
112 [P4_EVENT_FSB_DATA_ACTIVITY] = {
113 .opcode = P4_OPCODE(P4_EVENT_FSB_DATA_ACTIVITY),
114 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
115 .cntr = { {0, -1, -1}, {2, -1, -1} },
116 },
117 [P4_EVENT_BSQ_ALLOCATION] = { /* shared ESCR, broken CCCR1 */
118 .opcode = P4_OPCODE(P4_EVENT_BSQ_ALLOCATION),
119 .escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR0 },
120 .cntr = { {0, -1, -1}, {1, -1, -1} },
121 },
122 [P4_EVENT_BSQ_ACTIVE_ENTRIES] = { /* shared ESCR */
123 .opcode = P4_OPCODE(P4_EVENT_BSQ_ACTIVE_ENTRIES),
124 .escr_msr = { MSR_P4_BSU_ESCR1 , MSR_P4_BSU_ESCR1 },
125 .cntr = { {2, -1, -1}, {3, -1, -1} },
126 },
127 [P4_EVENT_SSE_INPUT_ASSIST] = {
128 .opcode = P4_OPCODE(P4_EVENT_SSE_INPUT_ASSIST),
129 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
130 .cntr = { {8, 9, -1}, {10, 11, -1} },
131 },
132 [P4_EVENT_PACKED_SP_UOP] = {
133 .opcode = P4_OPCODE(P4_EVENT_PACKED_SP_UOP),
134 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
135 .cntr = { {8, 9, -1}, {10, 11, -1} },
136 },
137 [P4_EVENT_PACKED_DP_UOP] = {
138 .opcode = P4_OPCODE(P4_EVENT_PACKED_DP_UOP),
139 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
140 .cntr = { {8, 9, -1}, {10, 11, -1} },
141 },
142 [P4_EVENT_SCALAR_SP_UOP] = {
143 .opcode = P4_OPCODE(P4_EVENT_SCALAR_SP_UOP),
144 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
145 .cntr = { {8, 9, -1}, {10, 11, -1} },
146 },
147 [P4_EVENT_SCALAR_DP_UOP] = {
148 .opcode = P4_OPCODE(P4_EVENT_SCALAR_DP_UOP),
149 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
150 .cntr = { {8, 9, -1}, {10, 11, -1} },
151 },
152 [P4_EVENT_64BIT_MMX_UOP] = {
153 .opcode = P4_OPCODE(P4_EVENT_64BIT_MMX_UOP),
154 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
155 .cntr = { {8, 9, -1}, {10, 11, -1} },
156 },
157 [P4_EVENT_128BIT_MMX_UOP] = {
158 .opcode = P4_OPCODE(P4_EVENT_128BIT_MMX_UOP),
159 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
160 .cntr = { {8, 9, -1}, {10, 11, -1} },
161 },
162 [P4_EVENT_X87_FP_UOP] = {
163 .opcode = P4_OPCODE(P4_EVENT_X87_FP_UOP),
164 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
165 .cntr = { {8, 9, -1}, {10, 11, -1} },
166 },
167 [P4_EVENT_TC_MISC] = {
168 .opcode = P4_OPCODE(P4_EVENT_TC_MISC),
169 .escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 },
170 .cntr = { {4, 5, -1}, {6, 7, -1} },
171 },
172 [P4_EVENT_GLOBAL_POWER_EVENTS] = {
173 .opcode = P4_OPCODE(P4_EVENT_GLOBAL_POWER_EVENTS),
174 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
175 .cntr = { {0, -1, -1}, {2, -1, -1} },
176 },
177 [P4_EVENT_TC_MS_XFER] = {
178 .opcode = P4_OPCODE(P4_EVENT_TC_MS_XFER),
179 .escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 },
180 .cntr = { {4, 5, -1}, {6, 7, -1} },
181 },
182 [P4_EVENT_UOP_QUEUE_WRITES] = {
183 .opcode = P4_OPCODE(P4_EVENT_UOP_QUEUE_WRITES),
184 .escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 },
185 .cntr = { {4, 5, -1}, {6, 7, -1} },
186 },
187 [P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE] = {
188 .opcode = P4_OPCODE(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE),
189 .escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR0 },
190 .cntr = { {4, 5, -1}, {6, 7, -1} },
191 },
192 [P4_EVENT_RETIRED_BRANCH_TYPE] = {
193 .opcode = P4_OPCODE(P4_EVENT_RETIRED_BRANCH_TYPE),
194 .escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR1 },
195 .cntr = { {4, 5, -1}, {6, 7, -1} },
196 },
197 [P4_EVENT_RESOURCE_STALL] = {
198 .opcode = P4_OPCODE(P4_EVENT_RESOURCE_STALL),
199 .escr_msr = { MSR_P4_ALF_ESCR0, MSR_P4_ALF_ESCR1 },
200 .cntr = { {12, 13, 16}, {14, 15, 17} },
201 },
202 [P4_EVENT_WC_BUFFER] = {
203 .opcode = P4_OPCODE(P4_EVENT_WC_BUFFER),
204 .escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 },
205 .cntr = { {8, 9, -1}, {10, 11, -1} },
206 },
207 [P4_EVENT_B2B_CYCLES] = {
208 .opcode = P4_OPCODE(P4_EVENT_B2B_CYCLES),
209 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
210 .cntr = { {0, -1, -1}, {2, -1, -1} },
211 },
212 [P4_EVENT_BNR] = {
213 .opcode = P4_OPCODE(P4_EVENT_BNR),
214 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
215 .cntr = { {0, -1, -1}, {2, -1, -1} },
216 },
217 [P4_EVENT_SNOOP] = {
218 .opcode = P4_OPCODE(P4_EVENT_SNOOP),
219 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
220 .cntr = { {0, -1, -1}, {2, -1, -1} },
221 },
222 [P4_EVENT_RESPONSE] = {
223 .opcode = P4_OPCODE(P4_EVENT_RESPONSE),
224 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
225 .cntr = { {0, -1, -1}, {2, -1, -1} },
226 },
227 [P4_EVENT_FRONT_END_EVENT] = {
228 .opcode = P4_OPCODE(P4_EVENT_FRONT_END_EVENT),
229 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
230 .cntr = { {12, 13, 16}, {14, 15, 17} },
231 },
232 [P4_EVENT_EXECUTION_EVENT] = {
233 .opcode = P4_OPCODE(P4_EVENT_EXECUTION_EVENT),
234 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
235 .cntr = { {12, 13, 16}, {14, 15, 17} },
236 },
237 [P4_EVENT_REPLAY_EVENT] = {
238 .opcode = P4_OPCODE(P4_EVENT_REPLAY_EVENT),
239 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
240 .cntr = { {12, 13, 16}, {14, 15, 17} },
241 },
242 [P4_EVENT_INSTR_RETIRED] = {
243 .opcode = P4_OPCODE(P4_EVENT_INSTR_RETIRED),
244 .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
245 .cntr = { {12, 13, 16}, {14, 15, 17} },
246 },
247 [P4_EVENT_UOPS_RETIRED] = {
248 .opcode = P4_OPCODE(P4_EVENT_UOPS_RETIRED),
249 .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
250 .cntr = { {12, 13, 16}, {14, 15, 17} },
251 },
252 [P4_EVENT_UOP_TYPE] = {
253 .opcode = P4_OPCODE(P4_EVENT_UOP_TYPE),
254 .escr_msr = { MSR_P4_RAT_ESCR0, MSR_P4_RAT_ESCR1 },
255 .cntr = { {12, 13, 16}, {14, 15, 17} },
256 },
257 [P4_EVENT_BRANCH_RETIRED] = {
258 .opcode = P4_OPCODE(P4_EVENT_BRANCH_RETIRED),
259 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
260 .cntr = { {12, 13, 16}, {14, 15, 17} },
261 },
262 [P4_EVENT_MISPRED_BRANCH_RETIRED] = {
263 .opcode = P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED),
264 .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
265 .cntr = { {12, 13, 16}, {14, 15, 17} },
266 },
267 [P4_EVENT_X87_ASSIST] = {
268 .opcode = P4_OPCODE(P4_EVENT_X87_ASSIST),
269 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
270 .cntr = { {12, 13, 16}, {14, 15, 17} },
271 },
272 [P4_EVENT_MACHINE_CLEAR] = {
273 .opcode = P4_OPCODE(P4_EVENT_MACHINE_CLEAR),
274 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
275 .cntr = { {12, 13, 16}, {14, 15, 17} },
276 },
277 [P4_EVENT_INSTR_COMPLETED] = {
278 .opcode = P4_OPCODE(P4_EVENT_INSTR_COMPLETED),
279 .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
280 .cntr = { {12, 13, 16}, {14, 15, 17} },
281 },
282};
283
284#define P4_GEN_CACHE_EVENT(event, bit, cache_event) \
285 p4_config_pack_escr(P4_ESCR_EVENT(event) | \
286 P4_ESCR_EMASK_BIT(event, bit)) | \
287 p4_config_pack_cccr(cache_event | \
288 P4_CCCR_ESEL(P4_OPCODE_ESEL(P4_OPCODE(event))))
289
290static __initconst const u64 p4_hw_cache_event_ids
291 [PERF_COUNT_HW_CACHE_MAX]
292 [PERF_COUNT_HW_CACHE_OP_MAX]
293 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
294{
295 [ C(L1D ) ] = {
296 [ C(OP_READ) ] = {
297 [ C(RESULT_ACCESS) ] = 0x0,
298 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
299 P4_CACHE__1stl_cache_load_miss_retired),
300 },
301 },
302 [ C(LL ) ] = {
303 [ C(OP_READ) ] = {
304 [ C(RESULT_ACCESS) ] = 0x0,
305 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
306 P4_CACHE__2ndl_cache_load_miss_retired),
307 },
308},
309 [ C(DTLB) ] = {
310 [ C(OP_READ) ] = {
311 [ C(RESULT_ACCESS) ] = 0x0,
312 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
313 P4_CACHE__dtlb_load_miss_retired),
314 },
315 [ C(OP_WRITE) ] = {
316 [ C(RESULT_ACCESS) ] = 0x0,
317 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
318 P4_CACHE__dtlb_store_miss_retired),
319 },
320 },
321 [ C(ITLB) ] = {
322 [ C(OP_READ) ] = {
323 [ C(RESULT_ACCESS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, HIT,
324 P4_CACHE__itlb_reference_hit),
325 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, MISS,
326 P4_CACHE__itlb_reference_miss),
327 },
328 [ C(OP_WRITE) ] = {
329 [ C(RESULT_ACCESS) ] = -1,
330 [ C(RESULT_MISS) ] = -1,
331 },
332 [ C(OP_PREFETCH) ] = {
333 [ C(RESULT_ACCESS) ] = -1,
334 [ C(RESULT_MISS) ] = -1,
335 },
336 },
337};
338
339static u64 p4_general_events[PERF_COUNT_HW_MAX] = {
340 /* non-halted CPU clocks */
341 [PERF_COUNT_HW_CPU_CYCLES] =
342 p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS) |
343 P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)),
344
345 /*
346 * retired instructions
347 * in a sake of simplicity we don't use the FSB tagging
348 */
349 [PERF_COUNT_HW_INSTRUCTIONS] =
350 p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_INSTR_RETIRED) |
351 P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSNTAG) |
352 P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSNTAG)),
353
354 /* cache hits */
355 [PERF_COUNT_HW_CACHE_REFERENCES] =
356 p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_BSQ_CACHE_REFERENCE) |
357 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITS) |
358 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITE) |
359 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITM) |
360 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITS) |
361 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITE) |
362 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITM)),
363
364 /* cache misses */
365 [PERF_COUNT_HW_CACHE_MISSES] =
366 p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_BSQ_CACHE_REFERENCE) |
367 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_MISS) |
368 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_MISS) |
369 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, WR_2ndL_MISS)),
370
371 /* branch instructions retired */
372 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] =
373 p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_RETIRED_BRANCH_TYPE) |
374 P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CONDITIONAL) |
375 P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CALL) |
376 P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, RETURN) |
377 P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, INDIRECT)),
378
379 /* mispredicted branches retired */
380 [PERF_COUNT_HW_BRANCH_MISSES] =
381 p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_MISPRED_BRANCH_RETIRED) |
382 P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS)),
383
384 /* bus ready clocks (cpu is driving #DRDY_DRV\#DRDY_OWN): */
385 [PERF_COUNT_HW_BUS_CYCLES] =
386 p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_FSB_DATA_ACTIVITY) |
387 P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV) |
388 P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN)) |
389 p4_config_pack_cccr(P4_CCCR_EDGE | P4_CCCR_COMPARE),
390};
391
392static struct p4_event_bind *p4_config_get_bind(u64 config)
393{
394 unsigned int evnt = p4_config_unpack_event(config);
395 struct p4_event_bind *bind = NULL;
396
397 if (evnt < ARRAY_SIZE(p4_event_bind_map))
398 bind = &p4_event_bind_map[evnt];
399
400 return bind;
401}
402
403static u64 p4_pmu_event_map(int hw_event)
404{
405 struct p4_event_bind *bind;
406 unsigned int esel;
407 u64 config;
408
409 config = p4_general_events[hw_event];
410 bind = p4_config_get_bind(config);
411 esel = P4_OPCODE_ESEL(bind->opcode);
412 config |= p4_config_pack_cccr(P4_CCCR_ESEL(esel));
413
414 return config;
415}
416
417static int p4_hw_config(struct perf_event *event)
418{
419 int cpu = get_cpu();
420 int rc = 0;
421 unsigned int evnt;
422 u32 escr, cccr;
423
424 /*
425 * the reason we use cpu that early is that: if we get scheduled
426 * first time on the same cpu -- we will not need swap thread
427 * specific flags in config (and will save some cpu cycles)
428 */
429
430 cccr = p4_default_cccr_conf(cpu);
431 escr = p4_default_escr_conf(cpu, event->attr.exclude_kernel,
432 event->attr.exclude_user);
433 event->hw.config = p4_config_pack_escr(escr) |
434 p4_config_pack_cccr(cccr);
435
436 if (p4_ht_active() && p4_ht_thread(cpu))
437 event->hw.config = p4_set_ht_bit(event->hw.config);
438
439 if (event->attr.type == PERF_TYPE_RAW) {
440
441 /* user data may have out-of-bound event index */
442 evnt = p4_config_unpack_event(event->attr.config);
443 if (evnt >= ARRAY_SIZE(p4_event_bind_map)) {
444 rc = -EINVAL;
445 goto out;
446 }
447
448 /*
449 * We don't control raw events so it's up to the caller
450 * to pass sane values (and we don't count the thread number
451 * on HT machine but allow HT-compatible specifics to be
452 * passed on)
453 *
454 * XXX: HT wide things should check perf_paranoid_cpu() &&
455 * CAP_SYS_ADMIN
456 */
457 event->hw.config |= event->attr.config &
458 (p4_config_pack_escr(P4_ESCR_MASK_HT) |
459 p4_config_pack_cccr(P4_CCCR_MASK_HT));
460 }
461
462 rc = x86_setup_perfctr(event);
463out:
464 put_cpu();
465 return rc;
466}
467
468static inline void p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc)
469{
470 unsigned long dummy;
471
472 rdmsrl(hwc->config_base + hwc->idx, dummy);
473 if (dummy & P4_CCCR_OVF) {
474 (void)checking_wrmsrl(hwc->config_base + hwc->idx,
475 ((u64)dummy) & ~P4_CCCR_OVF);
476 }
477}
478
479static inline void p4_pmu_disable_event(struct perf_event *event)
480{
481 struct hw_perf_event *hwc = &event->hw;
482
483 /*
484 * If event gets disabled while counter is in overflowed
485 * state we need to clear P4_CCCR_OVF, otherwise interrupt get
486 * asserted again and again
487 */
488 (void)checking_wrmsrl(hwc->config_base + hwc->idx,
489 (u64)(p4_config_unpack_cccr(hwc->config)) &
490 ~P4_CCCR_ENABLE & ~P4_CCCR_OVF & ~P4_CCCR_RESERVED);
491}
492
493static void p4_pmu_disable_all(void)
494{
495 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
496 int idx;
497
498 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
499 struct perf_event *event = cpuc->events[idx];
500 if (!test_bit(idx, cpuc->active_mask))
501 continue;
502 p4_pmu_disable_event(event);
503 }
504}
505
506static void p4_pmu_enable_event(struct perf_event *event)
507{
508 struct hw_perf_event *hwc = &event->hw;
509 int thread = p4_ht_config_thread(hwc->config);
510 u64 escr_conf = p4_config_unpack_escr(p4_clear_ht_bit(hwc->config));
511 unsigned int idx = p4_config_unpack_event(hwc->config);
512 unsigned int idx_cache = p4_config_unpack_cache_event(hwc->config);
513 struct p4_event_bind *bind;
514 struct p4_cache_event_bind *bind_cache;
515 u64 escr_addr, cccr;
516
517 bind = &p4_event_bind_map[idx];
518 escr_addr = (u64)bind->escr_msr[thread];
519
520 /*
521 * - we dont support cascaded counters yet
522 * - and counter 1 is broken (erratum)
523 */
524 WARN_ON_ONCE(p4_is_event_cascaded(hwc->config));
525 WARN_ON_ONCE(hwc->idx == 1);
526
527 /* we need a real Event value */
528 escr_conf &= ~P4_ESCR_EVENT_MASK;
529 escr_conf |= P4_ESCR_EVENT(P4_OPCODE_EVNT(bind->opcode));
530
531 cccr = p4_config_unpack_cccr(hwc->config);
532
533 /*
534 * it could be Cache event so that we need to
535 * set metrics into additional MSRs
536 */
537 BUILD_BUG_ON(P4_CACHE__MAX > P4_CCCR_CACHE_OPS_MASK);
538 if (idx_cache > P4_CACHE__NONE &&
539 idx_cache < ARRAY_SIZE(p4_cache_event_bind_map)) {
540 bind_cache = &p4_cache_event_bind_map[idx_cache];
541 (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)bind_cache->metric_pebs);
542 (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)bind_cache->metric_vert);
543 }
544
545 (void)checking_wrmsrl(escr_addr, escr_conf);
546 (void)checking_wrmsrl(hwc->config_base + hwc->idx,
547 (cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE);
548}
549
550static void p4_pmu_enable_all(int added)
551{
552 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
553 int idx;
554
555 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
556 struct perf_event *event = cpuc->events[idx];
557 if (!test_bit(idx, cpuc->active_mask))
558 continue;
559 p4_pmu_enable_event(event);
560 }
561}
562
563static int p4_pmu_handle_irq(struct pt_regs *regs)
564{
565 struct perf_sample_data data;
566 struct cpu_hw_events *cpuc;
567 struct perf_event *event;
568 struct hw_perf_event *hwc;
569 int idx, handled = 0;
570 u64 val;
571
572 data.addr = 0;
573 data.raw = NULL;
574
575 cpuc = &__get_cpu_var(cpu_hw_events);
576
577 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
578
579 if (!test_bit(idx, cpuc->active_mask))
580 continue;
581
582 event = cpuc->events[idx];
583 hwc = &event->hw;
584
585 WARN_ON_ONCE(hwc->idx != idx);
586
587 /*
588 * FIXME: Redundant call, actually not needed
589 * but just to check if we're screwed
590 */
591 p4_pmu_clear_cccr_ovf(hwc);
592
593 val = x86_perf_event_update(event);
594 if (val & (1ULL << (x86_pmu.cntval_bits - 1)))
595 continue;
596
597 /*
598 * event overflow
599 */
600 handled = 1;
601 data.period = event->hw.last_period;
602
603 if (!x86_perf_event_set_period(event))
604 continue;
605 if (perf_event_overflow(event, 1, &data, regs))
606 p4_pmu_disable_event(event);
607 }
608
609 if (handled) {
610 /* p4 quirk: unmask it again */
611 apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED);
612 inc_irq_stat(apic_perf_irqs);
613 }
614
615 return handled;
616}
617
618/*
619 * swap thread specific fields according to a thread
620 * we are going to run on
621 */
622static void p4_pmu_swap_config_ts(struct hw_perf_event *hwc, int cpu)
623{
624 u32 escr, cccr;
625
626 /*
627 * we either lucky and continue on same cpu or no HT support
628 */
629 if (!p4_should_swap_ts(hwc->config, cpu))
630 return;
631
632 /*
633 * the event is migrated from an another logical
634 * cpu, so we need to swap thread specific flags
635 */
636
637 escr = p4_config_unpack_escr(hwc->config);
638 cccr = p4_config_unpack_cccr(hwc->config);
639
640 if (p4_ht_thread(cpu)) {
641 cccr &= ~P4_CCCR_OVF_PMI_T0;
642 cccr |= P4_CCCR_OVF_PMI_T1;
643 if (escr & P4_ESCR_T0_OS) {
644 escr &= ~P4_ESCR_T0_OS;
645 escr |= P4_ESCR_T1_OS;
646 }
647 if (escr & P4_ESCR_T0_USR) {
648 escr &= ~P4_ESCR_T0_USR;
649 escr |= P4_ESCR_T1_USR;
650 }
651 hwc->config = p4_config_pack_escr(escr);
652 hwc->config |= p4_config_pack_cccr(cccr);
653 hwc->config |= P4_CONFIG_HT;
654 } else {
655 cccr &= ~P4_CCCR_OVF_PMI_T1;
656 cccr |= P4_CCCR_OVF_PMI_T0;
657 if (escr & P4_ESCR_T1_OS) {
658 escr &= ~P4_ESCR_T1_OS;
659 escr |= P4_ESCR_T0_OS;
660 }
661 if (escr & P4_ESCR_T1_USR) {
662 escr &= ~P4_ESCR_T1_USR;
663 escr |= P4_ESCR_T0_USR;
664 }
665 hwc->config = p4_config_pack_escr(escr);
666 hwc->config |= p4_config_pack_cccr(cccr);
667 hwc->config &= ~P4_CONFIG_HT;
668 }
669}
670
671/*
672 * ESCR address hashing is tricky, ESCRs are not sequential
673 * in memory but all starts from MSR_P4_BSU_ESCR0 (0x03e0) and
674 * the metric between any ESCRs is laid in range [0xa0,0xe1]
675 *
676 * so we make ~70% filled hashtable
677 */
678
679#define P4_ESCR_MSR_BASE 0x000003a0
680#define P4_ESCR_MSR_MAX 0x000003e1
681#define P4_ESCR_MSR_TABLE_SIZE (P4_ESCR_MSR_MAX - P4_ESCR_MSR_BASE + 1)
682#define P4_ESCR_MSR_IDX(msr) (msr - P4_ESCR_MSR_BASE)
683#define P4_ESCR_MSR_TABLE_ENTRY(msr) [P4_ESCR_MSR_IDX(msr)] = msr
684
685static const unsigned int p4_escr_table[P4_ESCR_MSR_TABLE_SIZE] = {
686 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ALF_ESCR0),
687 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ALF_ESCR1),
688 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BPU_ESCR0),
689 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BPU_ESCR1),
690 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BSU_ESCR0),
691 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BSU_ESCR1),
692 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR0),
693 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR1),
694 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR2),
695 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR3),
696 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR4),
697 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR5),
698 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_DAC_ESCR0),
699 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_DAC_ESCR1),
700 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FIRM_ESCR0),
701 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FIRM_ESCR1),
702 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FLAME_ESCR0),
703 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FLAME_ESCR1),
704 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FSB_ESCR0),
705 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FSB_ESCR1),
706 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IQ_ESCR0),
707 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IQ_ESCR1),
708 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IS_ESCR0),
709 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IS_ESCR1),
710 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ITLB_ESCR0),
711 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ITLB_ESCR1),
712 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IX_ESCR0),
713 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IX_ESCR1),
714 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MOB_ESCR0),
715 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MOB_ESCR1),
716 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MS_ESCR0),
717 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MS_ESCR1),
718 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_PMH_ESCR0),
719 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_PMH_ESCR1),
720 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_RAT_ESCR0),
721 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_RAT_ESCR1),
722 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SAAT_ESCR0),
723 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SAAT_ESCR1),
724 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SSU_ESCR0),
725 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SSU_ESCR1),
726 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TBPU_ESCR0),
727 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TBPU_ESCR1),
728 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TC_ESCR0),
729 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TC_ESCR1),
730 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_U2L_ESCR0),
731 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_U2L_ESCR1),
732};
733
734static int p4_get_escr_idx(unsigned int addr)
735{
736 unsigned int idx = P4_ESCR_MSR_IDX(addr);
737
738 if (unlikely(idx >= P4_ESCR_MSR_TABLE_SIZE ||
739 !p4_escr_table[idx])) {
740 WARN_ONCE(1, "P4 PMU: Wrong address passed: %x\n", addr);
741 return -1;
742 }
743
744 return idx;
745}
746
747static int p4_next_cntr(int thread, unsigned long *used_mask,
748 struct p4_event_bind *bind)
749{
750 int i, j;
751
752 for (i = 0; i < P4_CNTR_LIMIT; i++) {
753 j = bind->cntr[thread][i];
754 if (j != -1 && !test_bit(j, used_mask))
755 return j;
756 }
757
758 return -1;
759}
760
761static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
762{
763 unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
764 unsigned long escr_mask[BITS_TO_LONGS(P4_ESCR_MSR_TABLE_SIZE)];
765 int cpu = raw_smp_processor_id();
766 struct hw_perf_event *hwc;
767 struct p4_event_bind *bind;
768 unsigned int i, thread, num;
769 int cntr_idx, escr_idx;
770
771 bitmap_zero(used_mask, X86_PMC_IDX_MAX);
772 bitmap_zero(escr_mask, P4_ESCR_MSR_TABLE_SIZE);
773
774 for (i = 0, num = n; i < n; i++, num--) {
775
776 hwc = &cpuc->event_list[i]->hw;
777 thread = p4_ht_thread(cpu);
778 bind = p4_config_get_bind(hwc->config);
779 escr_idx = p4_get_escr_idx(bind->escr_msr[thread]);
780 if (unlikely(escr_idx == -1))
781 goto done;
782
783 if (hwc->idx != -1 && !p4_should_swap_ts(hwc->config, cpu)) {
784 cntr_idx = hwc->idx;
785 if (assign)
786 assign[i] = hwc->idx;
787 goto reserve;
788 }
789
790 cntr_idx = p4_next_cntr(thread, used_mask, bind);
791 if (cntr_idx == -1 || test_bit(escr_idx, escr_mask))
792 goto done;
793
794 p4_pmu_swap_config_ts(hwc, cpu);
795 if (assign)
796 assign[i] = cntr_idx;
797reserve:
798 set_bit(cntr_idx, used_mask);
799 set_bit(escr_idx, escr_mask);
800 }
801
802done:
803 return num ? -ENOSPC : 0;
804}
805
806static __initconst const struct x86_pmu p4_pmu = {
807 .name = "Netburst P4/Xeon",
808 .handle_irq = p4_pmu_handle_irq,
809 .disable_all = p4_pmu_disable_all,
810 .enable_all = p4_pmu_enable_all,
811 .enable = p4_pmu_enable_event,
812 .disable = p4_pmu_disable_event,
813 .eventsel = MSR_P4_BPU_CCCR0,
814 .perfctr = MSR_P4_BPU_PERFCTR0,
815 .event_map = p4_pmu_event_map,
816 .max_events = ARRAY_SIZE(p4_general_events),
817 .get_event_constraints = x86_get_event_constraints,
818 /*
819 * IF HT disabled we may need to use all
820 * ARCH_P4_MAX_CCCR counters simulaneously
821 * though leave it restricted at moment assuming
822 * HT is on
823 */
824 .num_counters = ARCH_P4_MAX_CCCR,
825 .apic = 1,
826 .cntval_bits = 40,
827 .cntval_mask = (1ULL << 40) - 1,
828 .max_period = (1ULL << 39) - 1,
829 .hw_config = p4_hw_config,
830 .schedule_events = p4_pmu_schedule_events,
831};
832
833static __init int p4_pmu_init(void)
834{
835 unsigned int low, high;
836
837 /* If we get stripped -- indexig fails */
838 BUILD_BUG_ON(ARCH_P4_MAX_CCCR > X86_PMC_MAX_GENERIC);
839
840 rdmsr(MSR_IA32_MISC_ENABLE, low, high);
841 if (!(low & (1 << 7))) {
842 pr_cont("unsupported Netburst CPU model %d ",
843 boot_cpu_data.x86_model);
844 return -ENODEV;
845 }
846
847 memcpy(hw_cache_event_ids, p4_hw_cache_event_ids,
848 sizeof(hw_cache_event_ids));
849
850 pr_cont("Netburst events, ");
851
852 x86_pmu = p4_pmu;
853
854 return 0;
855}
856
857#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c
index a330485d14da..34ba07be2cda 100644
--- a/arch/x86/kernel/cpu/perf_event_p6.c
+++ b/arch/x86/kernel/cpu/perf_event_p6.c
@@ -27,24 +27,6 @@ static u64 p6_pmu_event_map(int hw_event)
27 */ 27 */
28#define P6_NOP_EVENT 0x0000002EULL 28#define P6_NOP_EVENT 0x0000002EULL
29 29
30static u64 p6_pmu_raw_event(u64 hw_event)
31{
32#define P6_EVNTSEL_EVENT_MASK 0x000000FFULL
33#define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL
34#define P6_EVNTSEL_EDGE_MASK 0x00040000ULL
35#define P6_EVNTSEL_INV_MASK 0x00800000ULL
36#define P6_EVNTSEL_REG_MASK 0xFF000000ULL
37
38#define P6_EVNTSEL_MASK \
39 (P6_EVNTSEL_EVENT_MASK | \
40 P6_EVNTSEL_UNIT_MASK | \
41 P6_EVNTSEL_EDGE_MASK | \
42 P6_EVNTSEL_INV_MASK | \
43 P6_EVNTSEL_REG_MASK)
44
45 return hw_event & P6_EVNTSEL_MASK;
46}
47
48static struct event_constraint p6_event_constraints[] = 30static struct event_constraint p6_event_constraints[] =
49{ 31{
50 INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */ 32 INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */
@@ -66,7 +48,7 @@ static void p6_pmu_disable_all(void)
66 wrmsrl(MSR_P6_EVNTSEL0, val); 48 wrmsrl(MSR_P6_EVNTSEL0, val);
67} 49}
68 50
69static void p6_pmu_enable_all(void) 51static void p6_pmu_enable_all(int added)
70{ 52{
71 unsigned long val; 53 unsigned long val;
72 54
@@ -102,22 +84,23 @@ static void p6_pmu_enable_event(struct perf_event *event)
102 (void)checking_wrmsrl(hwc->config_base + hwc->idx, val); 84 (void)checking_wrmsrl(hwc->config_base + hwc->idx, val);
103} 85}
104 86
105static __initconst struct x86_pmu p6_pmu = { 87static __initconst const struct x86_pmu p6_pmu = {
106 .name = "p6", 88 .name = "p6",
107 .handle_irq = x86_pmu_handle_irq, 89 .handle_irq = x86_pmu_handle_irq,
108 .disable_all = p6_pmu_disable_all, 90 .disable_all = p6_pmu_disable_all,
109 .enable_all = p6_pmu_enable_all, 91 .enable_all = p6_pmu_enable_all,
110 .enable = p6_pmu_enable_event, 92 .enable = p6_pmu_enable_event,
111 .disable = p6_pmu_disable_event, 93 .disable = p6_pmu_disable_event,
94 .hw_config = x86_pmu_hw_config,
95 .schedule_events = x86_schedule_events,
112 .eventsel = MSR_P6_EVNTSEL0, 96 .eventsel = MSR_P6_EVNTSEL0,
113 .perfctr = MSR_P6_PERFCTR0, 97 .perfctr = MSR_P6_PERFCTR0,
114 .event_map = p6_pmu_event_map, 98 .event_map = p6_pmu_event_map,
115 .raw_event = p6_pmu_raw_event,
116 .max_events = ARRAY_SIZE(p6_perfmon_event_map), 99 .max_events = ARRAY_SIZE(p6_perfmon_event_map),
117 .apic = 1, 100 .apic = 1,
118 .max_period = (1ULL << 31) - 1, 101 .max_period = (1ULL << 31) - 1,
119 .version = 0, 102 .version = 0,
120 .num_events = 2, 103 .num_counters = 2,
121 /* 104 /*
122 * Events have 40 bits implemented. However they are designed such 105 * Events have 40 bits implemented. However they are designed such
123 * that bits [32-39] are sign extensions of bit 31. As such the 106 * that bits [32-39] are sign extensions of bit 31. As such the
@@ -125,8 +108,8 @@ static __initconst struct x86_pmu p6_pmu = {
125 * 108 *
126 * See IA-32 Intel Architecture Software developer manual Vol 3B 109 * See IA-32 Intel Architecture Software developer manual Vol 3B
127 */ 110 */
128 .event_bits = 32, 111 .cntval_bits = 32,
129 .event_mask = (1ULL << 32) - 1, 112 .cntval_mask = (1ULL << 32) - 1,
130 .get_event_constraints = x86_get_event_constraints, 113 .get_event_constraints = x86_get_event_constraints,
131 .event_constraints = p6_event_constraints, 114 .event_constraints = p6_event_constraints,
132}; 115};
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index dfdb4dba2320..b9d1ff588445 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -24,8 +24,8 @@
24#include <linux/dmi.h> 24#include <linux/dmi.h>
25#include <linux/module.h> 25#include <linux/module.h>
26#include <asm/div64.h> 26#include <asm/div64.h>
27#include <asm/vmware.h>
28#include <asm/x86_init.h> 27#include <asm/x86_init.h>
28#include <asm/hypervisor.h>
29 29
30#define CPUID_VMWARE_INFO_LEAF 0x40000000 30#define CPUID_VMWARE_INFO_LEAF 0x40000000
31#define VMWARE_HYPERVISOR_MAGIC 0x564D5868 31#define VMWARE_HYPERVISOR_MAGIC 0x564D5868
@@ -65,7 +65,7 @@ static unsigned long vmware_get_tsc_khz(void)
65 return tsc_hz; 65 return tsc_hz;
66} 66}
67 67
68void __init vmware_platform_setup(void) 68static void __init vmware_platform_setup(void)
69{ 69{
70 uint32_t eax, ebx, ecx, edx; 70 uint32_t eax, ebx, ecx, edx;
71 71
@@ -83,26 +83,22 @@ void __init vmware_platform_setup(void)
83 * serial key should be enough, as this will always have a VMware 83 * serial key should be enough, as this will always have a VMware
84 * specific string when running under VMware hypervisor. 84 * specific string when running under VMware hypervisor.
85 */ 85 */
86int vmware_platform(void) 86static bool __init vmware_platform(void)
87{ 87{
88 if (cpu_has_hypervisor) { 88 if (cpu_has_hypervisor) {
89 unsigned int eax, ebx, ecx, edx; 89 unsigned int eax;
90 char hyper_vendor_id[13]; 90 unsigned int hyper_vendor_id[3];
91 91
92 cpuid(CPUID_VMWARE_INFO_LEAF, &eax, &ebx, &ecx, &edx); 92 cpuid(CPUID_VMWARE_INFO_LEAF, &eax, &hyper_vendor_id[0],
93 memcpy(hyper_vendor_id + 0, &ebx, 4); 93 &hyper_vendor_id[1], &hyper_vendor_id[2]);
94 memcpy(hyper_vendor_id + 4, &ecx, 4); 94 if (!memcmp(hyper_vendor_id, "VMwareVMware", 12))
95 memcpy(hyper_vendor_id + 8, &edx, 4); 95 return true;
96 hyper_vendor_id[12] = '\0';
97 if (!strcmp(hyper_vendor_id, "VMwareVMware"))
98 return 1;
99 } else if (dmi_available && dmi_name_in_serial("VMware") && 96 } else if (dmi_available && dmi_name_in_serial("VMware") &&
100 __vmware_platform()) 97 __vmware_platform())
101 return 1; 98 return true;
102 99
103 return 0; 100 return false;
104} 101}
105EXPORT_SYMBOL(vmware_platform);
106 102
107/* 103/*
108 * VMware hypervisor takes care of exporting a reliable TSC to the guest. 104 * VMware hypervisor takes care of exporting a reliable TSC to the guest.
@@ -116,8 +112,16 @@ EXPORT_SYMBOL(vmware_platform);
116 * so that the kernel could just trust the hypervisor with providing a 112 * so that the kernel could just trust the hypervisor with providing a
117 * reliable virtual TSC that is suitable for timekeeping. 113 * reliable virtual TSC that is suitable for timekeeping.
118 */ 114 */
119void __cpuinit vmware_set_feature_bits(struct cpuinfo_x86 *c) 115static void __cpuinit vmware_set_cpu_features(struct cpuinfo_x86 *c)
120{ 116{
121 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); 117 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
122 set_cpu_cap(c, X86_FEATURE_TSC_RELIABLE); 118 set_cpu_cap(c, X86_FEATURE_TSC_RELIABLE);
123} 119}
120
121const __refconst struct hypervisor_x86 x86_hyper_vmware = {
122 .name = "VMware",
123 .detect = vmware_platform,
124 .set_cpu_features = vmware_set_cpu_features,
125 .init_platform = vmware_platform_setup,
126};
127EXPORT_SYMBOL(x86_hyper_vmware);
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
deleted file mode 100644
index 1c47390dd0e5..000000000000
--- a/arch/x86/kernel/ds.c
+++ /dev/null
@@ -1,1437 +0,0 @@
1/*
2 * Debug Store support
3 *
4 * This provides a low-level interface to the hardware's Debug Store
5 * feature that is used for branch trace store (BTS) and
6 * precise-event based sampling (PEBS).
7 *
8 * It manages:
9 * - DS and BTS hardware configuration
10 * - buffer overflow handling (to be done)
11 * - buffer access
12 *
13 * It does not do:
14 * - security checking (is the caller allowed to trace the task)
15 * - buffer allocation (memory accounting)
16 *
17 *
18 * Copyright (C) 2007-2009 Intel Corporation.
19 * Markus Metzger <markus.t.metzger@intel.com>, 2007-2009
20 */
21
22#include <linux/kernel.h>
23#include <linux/string.h>
24#include <linux/errno.h>
25#include <linux/sched.h>
26#include <linux/slab.h>
27#include <linux/mm.h>
28#include <linux/trace_clock.h>
29
30#include <asm/ds.h>
31
32#include "ds_selftest.h"
33
34/*
35 * The configuration for a particular DS hardware implementation:
36 */
37struct ds_configuration {
38 /* The name of the configuration: */
39 const char *name;
40
41 /* The size of pointer-typed fields in DS, BTS, and PEBS: */
42 unsigned char sizeof_ptr_field;
43
44 /* The size of a BTS/PEBS record in bytes: */
45 unsigned char sizeof_rec[2];
46
47 /* The number of pebs counter reset values in the DS structure. */
48 unsigned char nr_counter_reset;
49
50 /* Control bit-masks indexed by enum ds_feature: */
51 unsigned long ctl[dsf_ctl_max];
52};
53static struct ds_configuration ds_cfg __read_mostly;
54
55
56/* Maximal size of a DS configuration: */
57#define MAX_SIZEOF_DS 0x80
58
59/* Maximal size of a BTS record: */
60#define MAX_SIZEOF_BTS (3 * 8)
61
62/* BTS and PEBS buffer alignment: */
63#define DS_ALIGNMENT (1 << 3)
64
65/* Number of buffer pointers in DS: */
66#define NUM_DS_PTR_FIELDS 8
67
68/* Size of a pebs reset value in DS: */
69#define PEBS_RESET_FIELD_SIZE 8
70
71/* Mask of control bits in the DS MSR register: */
72#define BTS_CONTROL \
73 ( ds_cfg.ctl[dsf_bts] | \
74 ds_cfg.ctl[dsf_bts_kernel] | \
75 ds_cfg.ctl[dsf_bts_user] | \
76 ds_cfg.ctl[dsf_bts_overflow] )
77
78/*
79 * A BTS or PEBS tracer.
80 *
81 * This holds the configuration of the tracer and serves as a handle
82 * to identify tracers.
83 */
84struct ds_tracer {
85 /* The DS context (partially) owned by this tracer. */
86 struct ds_context *context;
87 /* The buffer provided on ds_request() and its size in bytes. */
88 void *buffer;
89 size_t size;
90};
91
92struct bts_tracer {
93 /* The common DS part: */
94 struct ds_tracer ds;
95
96 /* The trace including the DS configuration: */
97 struct bts_trace trace;
98
99 /* Buffer overflow notification function: */
100 bts_ovfl_callback_t ovfl;
101
102 /* Active flags affecting trace collection. */
103 unsigned int flags;
104};
105
106struct pebs_tracer {
107 /* The common DS part: */
108 struct ds_tracer ds;
109
110 /* The trace including the DS configuration: */
111 struct pebs_trace trace;
112
113 /* Buffer overflow notification function: */
114 pebs_ovfl_callback_t ovfl;
115};
116
117/*
118 * Debug Store (DS) save area configuration (see Intel64 and IA32
119 * Architectures Software Developer's Manual, section 18.5)
120 *
121 * The DS configuration consists of the following fields; different
122 * architetures vary in the size of those fields.
123 *
124 * - double-word aligned base linear address of the BTS buffer
125 * - write pointer into the BTS buffer
126 * - end linear address of the BTS buffer (one byte beyond the end of
127 * the buffer)
128 * - interrupt pointer into BTS buffer
129 * (interrupt occurs when write pointer passes interrupt pointer)
130 * - double-word aligned base linear address of the PEBS buffer
131 * - write pointer into the PEBS buffer
132 * - end linear address of the PEBS buffer (one byte beyond the end of
133 * the buffer)
134 * - interrupt pointer into PEBS buffer
135 * (interrupt occurs when write pointer passes interrupt pointer)
136 * - value to which counter is reset following counter overflow
137 *
138 * Later architectures use 64bit pointers throughout, whereas earlier
139 * architectures use 32bit pointers in 32bit mode.
140 *
141 *
142 * We compute the base address for the first 8 fields based on:
143 * - the field size stored in the DS configuration
144 * - the relative field position
145 * - an offset giving the start of the respective region
146 *
147 * This offset is further used to index various arrays holding
148 * information for BTS and PEBS at the respective index.
149 *
150 * On later 32bit processors, we only access the lower 32bit of the
151 * 64bit pointer fields. The upper halves will be zeroed out.
152 */
153
154enum ds_field {
155 ds_buffer_base = 0,
156 ds_index,
157 ds_absolute_maximum,
158 ds_interrupt_threshold,
159};
160
161enum ds_qualifier {
162 ds_bts = 0,
163 ds_pebs
164};
165
166static inline unsigned long
167ds_get(const unsigned char *base, enum ds_qualifier qual, enum ds_field field)
168{
169 base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual)));
170 return *(unsigned long *)base;
171}
172
173static inline void
174ds_set(unsigned char *base, enum ds_qualifier qual, enum ds_field field,
175 unsigned long value)
176{
177 base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual)));
178 (*(unsigned long *)base) = value;
179}
180
181
182/*
183 * Locking is done only for allocating BTS or PEBS resources.
184 */
185static DEFINE_SPINLOCK(ds_lock);
186
187/*
188 * We either support (system-wide) per-cpu or per-thread allocation.
189 * We distinguish the two based on the task_struct pointer, where a
190 * NULL pointer indicates per-cpu allocation for the current cpu.
191 *
192 * Allocations are use-counted. As soon as resources are allocated,
193 * further allocations must be of the same type (per-cpu or
194 * per-thread). We model this by counting allocations (i.e. the number
195 * of tracers of a certain type) for one type negatively:
196 * =0 no tracers
197 * >0 number of per-thread tracers
198 * <0 number of per-cpu tracers
199 *
200 * Tracers essentially gives the number of ds contexts for a certain
201 * type of allocation.
202 */
203static atomic_t tracers = ATOMIC_INIT(0);
204
205static inline int get_tracer(struct task_struct *task)
206{
207 int error;
208
209 spin_lock_irq(&ds_lock);
210
211 if (task) {
212 error = -EPERM;
213 if (atomic_read(&tracers) < 0)
214 goto out;
215 atomic_inc(&tracers);
216 } else {
217 error = -EPERM;
218 if (atomic_read(&tracers) > 0)
219 goto out;
220 atomic_dec(&tracers);
221 }
222
223 error = 0;
224out:
225 spin_unlock_irq(&ds_lock);
226 return error;
227}
228
229static inline void put_tracer(struct task_struct *task)
230{
231 if (task)
232 atomic_dec(&tracers);
233 else
234 atomic_inc(&tracers);
235}
236
237/*
238 * The DS context is either attached to a thread or to a cpu:
239 * - in the former case, the thread_struct contains a pointer to the
240 * attached context.
241 * - in the latter case, we use a static array of per-cpu context
242 * pointers.
243 *
244 * Contexts are use-counted. They are allocated on first access and
245 * deallocated when the last user puts the context.
246 */
247struct ds_context {
248 /* The DS configuration; goes into MSR_IA32_DS_AREA: */
249 unsigned char ds[MAX_SIZEOF_DS];
250
251 /* The owner of the BTS and PEBS configuration, respectively: */
252 struct bts_tracer *bts_master;
253 struct pebs_tracer *pebs_master;
254
255 /* Use count: */
256 unsigned long count;
257
258 /* Pointer to the context pointer field: */
259 struct ds_context **this;
260
261 /* The traced task; NULL for cpu tracing: */
262 struct task_struct *task;
263
264 /* The traced cpu; only valid if task is NULL: */
265 int cpu;
266};
267
268static DEFINE_PER_CPU(struct ds_context *, cpu_ds_context);
269
270
271static struct ds_context *ds_get_context(struct task_struct *task, int cpu)
272{
273 struct ds_context **p_context =
274 (task ? &task->thread.ds_ctx : &per_cpu(cpu_ds_context, cpu));
275 struct ds_context *context = NULL;
276 struct ds_context *new_context = NULL;
277
278 /* Chances are small that we already have a context. */
279 new_context = kzalloc(sizeof(*new_context), GFP_KERNEL);
280 if (!new_context)
281 return NULL;
282
283 spin_lock_irq(&ds_lock);
284
285 context = *p_context;
286 if (likely(!context)) {
287 context = new_context;
288
289 context->this = p_context;
290 context->task = task;
291 context->cpu = cpu;
292 context->count = 0;
293
294 *p_context = context;
295 }
296
297 context->count++;
298
299 spin_unlock_irq(&ds_lock);
300
301 if (context != new_context)
302 kfree(new_context);
303
304 return context;
305}
306
307static void ds_put_context(struct ds_context *context)
308{
309 struct task_struct *task;
310 unsigned long irq;
311
312 if (!context)
313 return;
314
315 spin_lock_irqsave(&ds_lock, irq);
316
317 if (--context->count) {
318 spin_unlock_irqrestore(&ds_lock, irq);
319 return;
320 }
321
322 *(context->this) = NULL;
323
324 task = context->task;
325
326 if (task)
327 clear_tsk_thread_flag(task, TIF_DS_AREA_MSR);
328
329 /*
330 * We leave the (now dangling) pointer to the DS configuration in
331 * the DS_AREA msr. This is as good or as bad as replacing it with
332 * NULL - the hardware would crash if we enabled tracing.
333 *
334 * This saves us some problems with having to write an msr on a
335 * different cpu while preventing others from doing the same for the
336 * next context for that same cpu.
337 */
338
339 spin_unlock_irqrestore(&ds_lock, irq);
340
341 /* The context might still be in use for context switching. */
342 if (task && (task != current))
343 wait_task_context_switch(task);
344
345 kfree(context);
346}
347
348static void ds_install_ds_area(struct ds_context *context)
349{
350 unsigned long ds;
351
352 ds = (unsigned long)context->ds;
353
354 /*
355 * There is a race between the bts master and the pebs master.
356 *
357 * The thread/cpu access is synchronized via get/put_cpu() for
358 * task tracing and via wrmsr_on_cpu for cpu tracing.
359 *
360 * If bts and pebs are collected for the same task or same cpu,
361 * the same confiuration is written twice.
362 */
363 if (context->task) {
364 get_cpu();
365 if (context->task == current)
366 wrmsrl(MSR_IA32_DS_AREA, ds);
367 set_tsk_thread_flag(context->task, TIF_DS_AREA_MSR);
368 put_cpu();
369 } else
370 wrmsr_on_cpu(context->cpu, MSR_IA32_DS_AREA,
371 (u32)((u64)ds), (u32)((u64)ds >> 32));
372}
373
374/*
375 * Call the tracer's callback on a buffer overflow.
376 *
377 * context: the ds context
378 * qual: the buffer type
379 */
380static void ds_overflow(struct ds_context *context, enum ds_qualifier qual)
381{
382 switch (qual) {
383 case ds_bts:
384 if (context->bts_master &&
385 context->bts_master->ovfl)
386 context->bts_master->ovfl(context->bts_master);
387 break;
388 case ds_pebs:
389 if (context->pebs_master &&
390 context->pebs_master->ovfl)
391 context->pebs_master->ovfl(context->pebs_master);
392 break;
393 }
394}
395
396
397/*
398 * Write raw data into the BTS or PEBS buffer.
399 *
400 * The remainder of any partially written record is zeroed out.
401 *
402 * context: the DS context
403 * qual: the buffer type
404 * record: the data to write
405 * size: the size of the data
406 */
407static int ds_write(struct ds_context *context, enum ds_qualifier qual,
408 const void *record, size_t size)
409{
410 int bytes_written = 0;
411
412 if (!record)
413 return -EINVAL;
414
415 while (size) {
416 unsigned long base, index, end, write_end, int_th;
417 unsigned long write_size, adj_write_size;
418
419 /*
420 * Write as much as possible without producing an
421 * overflow interrupt.
422 *
423 * Interrupt_threshold must either be
424 * - bigger than absolute_maximum or
425 * - point to a record between buffer_base and absolute_maximum
426 *
427 * Index points to a valid record.
428 */
429 base = ds_get(context->ds, qual, ds_buffer_base);
430 index = ds_get(context->ds, qual, ds_index);
431 end = ds_get(context->ds, qual, ds_absolute_maximum);
432 int_th = ds_get(context->ds, qual, ds_interrupt_threshold);
433
434 write_end = min(end, int_th);
435
436 /*
437 * If we are already beyond the interrupt threshold,
438 * we fill the entire buffer.
439 */
440 if (write_end <= index)
441 write_end = end;
442
443 if (write_end <= index)
444 break;
445
446 write_size = min((unsigned long) size, write_end - index);
447 memcpy((void *)index, record, write_size);
448
449 record = (const char *)record + write_size;
450 size -= write_size;
451 bytes_written += write_size;
452
453 adj_write_size = write_size / ds_cfg.sizeof_rec[qual];
454 adj_write_size *= ds_cfg.sizeof_rec[qual];
455
456 /* Zero out trailing bytes. */
457 memset((char *)index + write_size, 0,
458 adj_write_size - write_size);
459 index += adj_write_size;
460
461 if (index >= end)
462 index = base;
463 ds_set(context->ds, qual, ds_index, index);
464
465 if (index >= int_th)
466 ds_overflow(context, qual);
467 }
468
469 return bytes_written;
470}
471
472
473/*
474 * Branch Trace Store (BTS) uses the following format. Different
475 * architectures vary in the size of those fields.
476 * - source linear address
477 * - destination linear address
478 * - flags
479 *
480 * Later architectures use 64bit pointers throughout, whereas earlier
481 * architectures use 32bit pointers in 32bit mode.
482 *
483 * We compute the base address for the fields based on:
484 * - the field size stored in the DS configuration
485 * - the relative field position
486 *
487 * In order to store additional information in the BTS buffer, we use
488 * a special source address to indicate that the record requires
489 * special interpretation.
490 *
491 * Netburst indicated via a bit in the flags field whether the branch
492 * was predicted; this is ignored.
493 *
494 * We use two levels of abstraction:
495 * - the raw data level defined here
496 * - an arch-independent level defined in ds.h
497 */
498
499enum bts_field {
500 bts_from,
501 bts_to,
502 bts_flags,
503
504 bts_qual = bts_from,
505 bts_clock = bts_to,
506 bts_pid = bts_flags,
507
508 bts_qual_mask = (bts_qual_max - 1),
509 bts_escape = ((unsigned long)-1 & ~bts_qual_mask)
510};
511
512static inline unsigned long bts_get(const char *base, unsigned long field)
513{
514 base += (ds_cfg.sizeof_ptr_field * field);
515 return *(unsigned long *)base;
516}
517
518static inline void bts_set(char *base, unsigned long field, unsigned long val)
519{
520 base += (ds_cfg.sizeof_ptr_field * field);
521 (*(unsigned long *)base) = val;
522}
523
524
525/*
526 * The raw BTS data is architecture dependent.
527 *
528 * For higher-level users, we give an arch-independent view.
529 * - ds.h defines struct bts_struct
530 * - bts_read translates one raw bts record into a bts_struct
531 * - bts_write translates one bts_struct into the raw format and
532 * writes it into the top of the parameter tracer's buffer.
533 *
534 * return: bytes read/written on success; -Eerrno, otherwise
535 */
536static int
537bts_read(struct bts_tracer *tracer, const void *at, struct bts_struct *out)
538{
539 if (!tracer)
540 return -EINVAL;
541
542 if (at < tracer->trace.ds.begin)
543 return -EINVAL;
544
545 if (tracer->trace.ds.end < (at + tracer->trace.ds.size))
546 return -EINVAL;
547
548 memset(out, 0, sizeof(*out));
549 if ((bts_get(at, bts_qual) & ~bts_qual_mask) == bts_escape) {
550 out->qualifier = (bts_get(at, bts_qual) & bts_qual_mask);
551 out->variant.event.clock = bts_get(at, bts_clock);
552 out->variant.event.pid = bts_get(at, bts_pid);
553 } else {
554 out->qualifier = bts_branch;
555 out->variant.lbr.from = bts_get(at, bts_from);
556 out->variant.lbr.to = bts_get(at, bts_to);
557
558 if (!out->variant.lbr.from && !out->variant.lbr.to)
559 out->qualifier = bts_invalid;
560 }
561
562 return ds_cfg.sizeof_rec[ds_bts];
563}
564
565static int bts_write(struct bts_tracer *tracer, const struct bts_struct *in)
566{
567 unsigned char raw[MAX_SIZEOF_BTS];
568
569 if (!tracer)
570 return -EINVAL;
571
572 if (MAX_SIZEOF_BTS < ds_cfg.sizeof_rec[ds_bts])
573 return -EOVERFLOW;
574
575 switch (in->qualifier) {
576 case bts_invalid:
577 bts_set(raw, bts_from, 0);
578 bts_set(raw, bts_to, 0);
579 bts_set(raw, bts_flags, 0);
580 break;
581 case bts_branch:
582 bts_set(raw, bts_from, in->variant.lbr.from);
583 bts_set(raw, bts_to, in->variant.lbr.to);
584 bts_set(raw, bts_flags, 0);
585 break;
586 case bts_task_arrives:
587 case bts_task_departs:
588 bts_set(raw, bts_qual, (bts_escape | in->qualifier));
589 bts_set(raw, bts_clock, in->variant.event.clock);
590 bts_set(raw, bts_pid, in->variant.event.pid);
591 break;
592 default:
593 return -EINVAL;
594 }
595
596 return ds_write(tracer->ds.context, ds_bts, raw,
597 ds_cfg.sizeof_rec[ds_bts]);
598}
599
600
601static void ds_write_config(struct ds_context *context,
602 struct ds_trace *cfg, enum ds_qualifier qual)
603{
604 unsigned char *ds = context->ds;
605
606 ds_set(ds, qual, ds_buffer_base, (unsigned long)cfg->begin);
607 ds_set(ds, qual, ds_index, (unsigned long)cfg->top);
608 ds_set(ds, qual, ds_absolute_maximum, (unsigned long)cfg->end);
609 ds_set(ds, qual, ds_interrupt_threshold, (unsigned long)cfg->ith);
610}
611
612static void ds_read_config(struct ds_context *context,
613 struct ds_trace *cfg, enum ds_qualifier qual)
614{
615 unsigned char *ds = context->ds;
616
617 cfg->begin = (void *)ds_get(ds, qual, ds_buffer_base);
618 cfg->top = (void *)ds_get(ds, qual, ds_index);
619 cfg->end = (void *)ds_get(ds, qual, ds_absolute_maximum);
620 cfg->ith = (void *)ds_get(ds, qual, ds_interrupt_threshold);
621}
622
623static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual,
624 void *base, size_t size, size_t ith,
625 unsigned int flags) {
626 unsigned long buffer, adj;
627
628 /*
629 * Adjust the buffer address and size to meet alignment
630 * constraints:
631 * - buffer is double-word aligned
632 * - size is multiple of record size
633 *
634 * We checked the size at the very beginning; we have enough
635 * space to do the adjustment.
636 */
637 buffer = (unsigned long)base;
638
639 adj = ALIGN(buffer, DS_ALIGNMENT) - buffer;
640 buffer += adj;
641 size -= adj;
642
643 trace->n = size / ds_cfg.sizeof_rec[qual];
644 trace->size = ds_cfg.sizeof_rec[qual];
645
646 size = (trace->n * trace->size);
647
648 trace->begin = (void *)buffer;
649 trace->top = trace->begin;
650 trace->end = (void *)(buffer + size);
651 /*
652 * The value for 'no threshold' is -1, which will set the
653 * threshold outside of the buffer, just like we want it.
654 */
655 ith *= ds_cfg.sizeof_rec[qual];
656 trace->ith = (void *)(buffer + size - ith);
657
658 trace->flags = flags;
659}
660
661
662static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace,
663 enum ds_qualifier qual, struct task_struct *task,
664 int cpu, void *base, size_t size, size_t th)
665{
666 struct ds_context *context;
667 int error;
668 size_t req_size;
669
670 error = -EOPNOTSUPP;
671 if (!ds_cfg.sizeof_rec[qual])
672 goto out;
673
674 error = -EINVAL;
675 if (!base)
676 goto out;
677
678 req_size = ds_cfg.sizeof_rec[qual];
679 /* We might need space for alignment adjustments. */
680 if (!IS_ALIGNED((unsigned long)base, DS_ALIGNMENT))
681 req_size += DS_ALIGNMENT;
682
683 error = -EINVAL;
684 if (size < req_size)
685 goto out;
686
687 if (th != (size_t)-1) {
688 th *= ds_cfg.sizeof_rec[qual];
689
690 error = -EINVAL;
691 if (size <= th)
692 goto out;
693 }
694
695 tracer->buffer = base;
696 tracer->size = size;
697
698 error = -ENOMEM;
699 context = ds_get_context(task, cpu);
700 if (!context)
701 goto out;
702 tracer->context = context;
703
704 /*
705 * Defer any tracer-specific initialization work for the context until
706 * context ownership has been clarified.
707 */
708
709 error = 0;
710 out:
711 return error;
712}
713
714static struct bts_tracer *ds_request_bts(struct task_struct *task, int cpu,
715 void *base, size_t size,
716 bts_ovfl_callback_t ovfl, size_t th,
717 unsigned int flags)
718{
719 struct bts_tracer *tracer;
720 int error;
721
722 /* Buffer overflow notification is not yet implemented. */
723 error = -EOPNOTSUPP;
724 if (ovfl)
725 goto out;
726
727 error = get_tracer(task);
728 if (error < 0)
729 goto out;
730
731 error = -ENOMEM;
732 tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
733 if (!tracer)
734 goto out_put_tracer;
735 tracer->ovfl = ovfl;
736
737 /* Do some more error checking and acquire a tracing context. */
738 error = ds_request(&tracer->ds, &tracer->trace.ds,
739 ds_bts, task, cpu, base, size, th);
740 if (error < 0)
741 goto out_tracer;
742
743 /* Claim the bts part of the tracing context we acquired above. */
744 spin_lock_irq(&ds_lock);
745
746 error = -EPERM;
747 if (tracer->ds.context->bts_master)
748 goto out_unlock;
749 tracer->ds.context->bts_master = tracer;
750
751 spin_unlock_irq(&ds_lock);
752
753 /*
754 * Now that we own the bts part of the context, let's complete the
755 * initialization for that part.
756 */
757 ds_init_ds_trace(&tracer->trace.ds, ds_bts, base, size, th, flags);
758 ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
759 ds_install_ds_area(tracer->ds.context);
760
761 tracer->trace.read = bts_read;
762 tracer->trace.write = bts_write;
763
764 /* Start tracing. */
765 ds_resume_bts(tracer);
766
767 return tracer;
768
769 out_unlock:
770 spin_unlock_irq(&ds_lock);
771 ds_put_context(tracer->ds.context);
772 out_tracer:
773 kfree(tracer);
774 out_put_tracer:
775 put_tracer(task);
776 out:
777 return ERR_PTR(error);
778}
779
780struct bts_tracer *ds_request_bts_task(struct task_struct *task,
781 void *base, size_t size,
782 bts_ovfl_callback_t ovfl,
783 size_t th, unsigned int flags)
784{
785 return ds_request_bts(task, 0, base, size, ovfl, th, flags);
786}
787
788struct bts_tracer *ds_request_bts_cpu(int cpu, void *base, size_t size,
789 bts_ovfl_callback_t ovfl,
790 size_t th, unsigned int flags)
791{
792 return ds_request_bts(NULL, cpu, base, size, ovfl, th, flags);
793}
794
795static struct pebs_tracer *ds_request_pebs(struct task_struct *task, int cpu,
796 void *base, size_t size,
797 pebs_ovfl_callback_t ovfl, size_t th,
798 unsigned int flags)
799{
800 struct pebs_tracer *tracer;
801 int error;
802
803 /* Buffer overflow notification is not yet implemented. */
804 error = -EOPNOTSUPP;
805 if (ovfl)
806 goto out;
807
808 error = get_tracer(task);
809 if (error < 0)
810 goto out;
811
812 error = -ENOMEM;
813 tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
814 if (!tracer)
815 goto out_put_tracer;
816 tracer->ovfl = ovfl;
817
818 /* Do some more error checking and acquire a tracing context. */
819 error = ds_request(&tracer->ds, &tracer->trace.ds,
820 ds_pebs, task, cpu, base, size, th);
821 if (error < 0)
822 goto out_tracer;
823
824 /* Claim the pebs part of the tracing context we acquired above. */
825 spin_lock_irq(&ds_lock);
826
827 error = -EPERM;
828 if (tracer->ds.context->pebs_master)
829 goto out_unlock;
830 tracer->ds.context->pebs_master = tracer;
831
832 spin_unlock_irq(&ds_lock);
833
834 /*
835 * Now that we own the pebs part of the context, let's complete the
836 * initialization for that part.
837 */
838 ds_init_ds_trace(&tracer->trace.ds, ds_pebs, base, size, th, flags);
839 ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_pebs);
840 ds_install_ds_area(tracer->ds.context);
841
842 /* Start tracing. */
843 ds_resume_pebs(tracer);
844
845 return tracer;
846
847 out_unlock:
848 spin_unlock_irq(&ds_lock);
849 ds_put_context(tracer->ds.context);
850 out_tracer:
851 kfree(tracer);
852 out_put_tracer:
853 put_tracer(task);
854 out:
855 return ERR_PTR(error);
856}
857
858struct pebs_tracer *ds_request_pebs_task(struct task_struct *task,
859 void *base, size_t size,
860 pebs_ovfl_callback_t ovfl,
861 size_t th, unsigned int flags)
862{
863 return ds_request_pebs(task, 0, base, size, ovfl, th, flags);
864}
865
866struct pebs_tracer *ds_request_pebs_cpu(int cpu, void *base, size_t size,
867 pebs_ovfl_callback_t ovfl,
868 size_t th, unsigned int flags)
869{
870 return ds_request_pebs(NULL, cpu, base, size, ovfl, th, flags);
871}
872
873static void ds_free_bts(struct bts_tracer *tracer)
874{
875 struct task_struct *task;
876
877 task = tracer->ds.context->task;
878
879 WARN_ON_ONCE(tracer->ds.context->bts_master != tracer);
880 tracer->ds.context->bts_master = NULL;
881
882 /* Make sure tracing stopped and the tracer is not in use. */
883 if (task && (task != current))
884 wait_task_context_switch(task);
885
886 ds_put_context(tracer->ds.context);
887 put_tracer(task);
888
889 kfree(tracer);
890}
891
892void ds_release_bts(struct bts_tracer *tracer)
893{
894 might_sleep();
895
896 if (!tracer)
897 return;
898
899 ds_suspend_bts(tracer);
900 ds_free_bts(tracer);
901}
902
903int ds_release_bts_noirq(struct bts_tracer *tracer)
904{
905 struct task_struct *task;
906 unsigned long irq;
907 int error;
908
909 if (!tracer)
910 return 0;
911
912 task = tracer->ds.context->task;
913
914 local_irq_save(irq);
915
916 error = -EPERM;
917 if (!task &&
918 (tracer->ds.context->cpu != smp_processor_id()))
919 goto out;
920
921 error = -EPERM;
922 if (task && (task != current))
923 goto out;
924
925 ds_suspend_bts_noirq(tracer);
926 ds_free_bts(tracer);
927
928 error = 0;
929 out:
930 local_irq_restore(irq);
931 return error;
932}
933
934static void update_task_debugctlmsr(struct task_struct *task,
935 unsigned long debugctlmsr)
936{
937 task->thread.debugctlmsr = debugctlmsr;
938
939 get_cpu();
940 if (task == current)
941 update_debugctlmsr(debugctlmsr);
942 put_cpu();
943}
944
945void ds_suspend_bts(struct bts_tracer *tracer)
946{
947 struct task_struct *task;
948 unsigned long debugctlmsr;
949 int cpu;
950
951 if (!tracer)
952 return;
953
954 tracer->flags = 0;
955
956 task = tracer->ds.context->task;
957 cpu = tracer->ds.context->cpu;
958
959 WARN_ON(!task && irqs_disabled());
960
961 debugctlmsr = (task ?
962 task->thread.debugctlmsr :
963 get_debugctlmsr_on_cpu(cpu));
964 debugctlmsr &= ~BTS_CONTROL;
965
966 if (task)
967 update_task_debugctlmsr(task, debugctlmsr);
968 else
969 update_debugctlmsr_on_cpu(cpu, debugctlmsr);
970}
971
972int ds_suspend_bts_noirq(struct bts_tracer *tracer)
973{
974 struct task_struct *task;
975 unsigned long debugctlmsr, irq;
976 int cpu, error = 0;
977
978 if (!tracer)
979 return 0;
980
981 tracer->flags = 0;
982
983 task = tracer->ds.context->task;
984 cpu = tracer->ds.context->cpu;
985
986 local_irq_save(irq);
987
988 error = -EPERM;
989 if (!task && (cpu != smp_processor_id()))
990 goto out;
991
992 debugctlmsr = (task ?
993 task->thread.debugctlmsr :
994 get_debugctlmsr());
995 debugctlmsr &= ~BTS_CONTROL;
996
997 if (task)
998 update_task_debugctlmsr(task, debugctlmsr);
999 else
1000 update_debugctlmsr(debugctlmsr);
1001
1002 error = 0;
1003 out:
1004 local_irq_restore(irq);
1005 return error;
1006}
1007
1008static unsigned long ds_bts_control(struct bts_tracer *tracer)
1009{
1010 unsigned long control;
1011
1012 control = ds_cfg.ctl[dsf_bts];
1013 if (!(tracer->trace.ds.flags & BTS_KERNEL))
1014 control |= ds_cfg.ctl[dsf_bts_kernel];
1015 if (!(tracer->trace.ds.flags & BTS_USER))
1016 control |= ds_cfg.ctl[dsf_bts_user];
1017
1018 return control;
1019}
1020
1021void ds_resume_bts(struct bts_tracer *tracer)
1022{
1023 struct task_struct *task;
1024 unsigned long debugctlmsr;
1025 int cpu;
1026
1027 if (!tracer)
1028 return;
1029
1030 tracer->flags = tracer->trace.ds.flags;
1031
1032 task = tracer->ds.context->task;
1033 cpu = tracer->ds.context->cpu;
1034
1035 WARN_ON(!task && irqs_disabled());
1036
1037 debugctlmsr = (task ?
1038 task->thread.debugctlmsr :
1039 get_debugctlmsr_on_cpu(cpu));
1040 debugctlmsr |= ds_bts_control(tracer);
1041
1042 if (task)
1043 update_task_debugctlmsr(task, debugctlmsr);
1044 else
1045 update_debugctlmsr_on_cpu(cpu, debugctlmsr);
1046}
1047
1048int ds_resume_bts_noirq(struct bts_tracer *tracer)
1049{
1050 struct task_struct *task;
1051 unsigned long debugctlmsr, irq;
1052 int cpu, error = 0;
1053
1054 if (!tracer)
1055 return 0;
1056
1057 tracer->flags = tracer->trace.ds.flags;
1058
1059 task = tracer->ds.context->task;
1060 cpu = tracer->ds.context->cpu;
1061
1062 local_irq_save(irq);
1063
1064 error = -EPERM;
1065 if (!task && (cpu != smp_processor_id()))
1066 goto out;
1067
1068 debugctlmsr = (task ?
1069 task->thread.debugctlmsr :
1070 get_debugctlmsr());
1071 debugctlmsr |= ds_bts_control(tracer);
1072
1073 if (task)
1074 update_task_debugctlmsr(task, debugctlmsr);
1075 else
1076 update_debugctlmsr(debugctlmsr);
1077
1078 error = 0;
1079 out:
1080 local_irq_restore(irq);
1081 return error;
1082}
1083
1084static void ds_free_pebs(struct pebs_tracer *tracer)
1085{
1086 struct task_struct *task;
1087
1088 task = tracer->ds.context->task;
1089
1090 WARN_ON_ONCE(tracer->ds.context->pebs_master != tracer);
1091 tracer->ds.context->pebs_master = NULL;
1092
1093 ds_put_context(tracer->ds.context);
1094 put_tracer(task);
1095
1096 kfree(tracer);
1097}
1098
1099void ds_release_pebs(struct pebs_tracer *tracer)
1100{
1101 might_sleep();
1102
1103 if (!tracer)
1104 return;
1105
1106 ds_suspend_pebs(tracer);
1107 ds_free_pebs(tracer);
1108}
1109
1110int ds_release_pebs_noirq(struct pebs_tracer *tracer)
1111{
1112 struct task_struct *task;
1113 unsigned long irq;
1114 int error;
1115
1116 if (!tracer)
1117 return 0;
1118
1119 task = tracer->ds.context->task;
1120
1121 local_irq_save(irq);
1122
1123 error = -EPERM;
1124 if (!task &&
1125 (tracer->ds.context->cpu != smp_processor_id()))
1126 goto out;
1127
1128 error = -EPERM;
1129 if (task && (task != current))
1130 goto out;
1131
1132 ds_suspend_pebs_noirq(tracer);
1133 ds_free_pebs(tracer);
1134
1135 error = 0;
1136 out:
1137 local_irq_restore(irq);
1138 return error;
1139}
1140
1141void ds_suspend_pebs(struct pebs_tracer *tracer)
1142{
1143
1144}
1145
1146int ds_suspend_pebs_noirq(struct pebs_tracer *tracer)
1147{
1148 return 0;
1149}
1150
1151void ds_resume_pebs(struct pebs_tracer *tracer)
1152{
1153
1154}
1155
1156int ds_resume_pebs_noirq(struct pebs_tracer *tracer)
1157{
1158 return 0;
1159}
1160
1161const struct bts_trace *ds_read_bts(struct bts_tracer *tracer)
1162{
1163 if (!tracer)
1164 return NULL;
1165
1166 ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
1167 return &tracer->trace;
1168}
1169
1170const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer)
1171{
1172 if (!tracer)
1173 return NULL;
1174
1175 ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_pebs);
1176
1177 tracer->trace.counters = ds_cfg.nr_counter_reset;
1178 memcpy(tracer->trace.counter_reset,
1179 tracer->ds.context->ds +
1180 (NUM_DS_PTR_FIELDS * ds_cfg.sizeof_ptr_field),
1181 ds_cfg.nr_counter_reset * PEBS_RESET_FIELD_SIZE);
1182
1183 return &tracer->trace;
1184}
1185
1186int ds_reset_bts(struct bts_tracer *tracer)
1187{
1188 if (!tracer)
1189 return -EINVAL;
1190
1191 tracer->trace.ds.top = tracer->trace.ds.begin;
1192
1193 ds_set(tracer->ds.context->ds, ds_bts, ds_index,
1194 (unsigned long)tracer->trace.ds.top);
1195
1196 return 0;
1197}
1198
1199int ds_reset_pebs(struct pebs_tracer *tracer)
1200{
1201 if (!tracer)
1202 return -EINVAL;
1203
1204 tracer->trace.ds.top = tracer->trace.ds.begin;
1205
1206 ds_set(tracer->ds.context->ds, ds_pebs, ds_index,
1207 (unsigned long)tracer->trace.ds.top);
1208
1209 return 0;
1210}
1211
1212int ds_set_pebs_reset(struct pebs_tracer *tracer,
1213 unsigned int counter, u64 value)
1214{
1215 if (!tracer)
1216 return -EINVAL;
1217
1218 if (ds_cfg.nr_counter_reset < counter)
1219 return -EINVAL;
1220
1221 *(u64 *)(tracer->ds.context->ds +
1222 (NUM_DS_PTR_FIELDS * ds_cfg.sizeof_ptr_field) +
1223 (counter * PEBS_RESET_FIELD_SIZE)) = value;
1224
1225 return 0;
1226}
1227
1228static const struct ds_configuration ds_cfg_netburst = {
1229 .name = "Netburst",
1230 .ctl[dsf_bts] = (1 << 2) | (1 << 3),
1231 .ctl[dsf_bts_kernel] = (1 << 5),
1232 .ctl[dsf_bts_user] = (1 << 6),
1233 .nr_counter_reset = 1,
1234};
1235static const struct ds_configuration ds_cfg_pentium_m = {
1236 .name = "Pentium M",
1237 .ctl[dsf_bts] = (1 << 6) | (1 << 7),
1238 .nr_counter_reset = 1,
1239};
1240static const struct ds_configuration ds_cfg_core2_atom = {
1241 .name = "Core 2/Atom",
1242 .ctl[dsf_bts] = (1 << 6) | (1 << 7),
1243 .ctl[dsf_bts_kernel] = (1 << 9),
1244 .ctl[dsf_bts_user] = (1 << 10),
1245 .nr_counter_reset = 1,
1246};
1247static const struct ds_configuration ds_cfg_core_i7 = {
1248 .name = "Core i7",
1249 .ctl[dsf_bts] = (1 << 6) | (1 << 7),
1250 .ctl[dsf_bts_kernel] = (1 << 9),
1251 .ctl[dsf_bts_user] = (1 << 10),
1252 .nr_counter_reset = 4,
1253};
1254
1255static void
1256ds_configure(const struct ds_configuration *cfg,
1257 struct cpuinfo_x86 *cpu)
1258{
1259 unsigned long nr_pebs_fields = 0;
1260
1261 printk(KERN_INFO "[ds] using %s configuration\n", cfg->name);
1262
1263#ifdef __i386__
1264 nr_pebs_fields = 10;
1265#else
1266 nr_pebs_fields = 18;
1267#endif
1268
1269 /*
1270 * Starting with version 2, architectural performance
1271 * monitoring supports a format specifier.
1272 */
1273 if ((cpuid_eax(0xa) & 0xff) > 1) {
1274 unsigned long perf_capabilities, format;
1275
1276 rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_capabilities);
1277
1278 format = (perf_capabilities >> 8) & 0xf;
1279
1280 switch (format) {
1281 case 0:
1282 nr_pebs_fields = 18;
1283 break;
1284 case 1:
1285 nr_pebs_fields = 22;
1286 break;
1287 default:
1288 printk(KERN_INFO
1289 "[ds] unknown PEBS format: %lu\n", format);
1290 nr_pebs_fields = 0;
1291 break;
1292 }
1293 }
1294
1295 memset(&ds_cfg, 0, sizeof(ds_cfg));
1296 ds_cfg = *cfg;
1297
1298 ds_cfg.sizeof_ptr_field =
1299 (cpu_has(cpu, X86_FEATURE_DTES64) ? 8 : 4);
1300
1301 ds_cfg.sizeof_rec[ds_bts] = ds_cfg.sizeof_ptr_field * 3;
1302 ds_cfg.sizeof_rec[ds_pebs] = ds_cfg.sizeof_ptr_field * nr_pebs_fields;
1303
1304 if (!cpu_has(cpu, X86_FEATURE_BTS)) {
1305 ds_cfg.sizeof_rec[ds_bts] = 0;
1306 printk(KERN_INFO "[ds] bts not available\n");
1307 }
1308 if (!cpu_has(cpu, X86_FEATURE_PEBS)) {
1309 ds_cfg.sizeof_rec[ds_pebs] = 0;
1310 printk(KERN_INFO "[ds] pebs not available\n");
1311 }
1312
1313 printk(KERN_INFO "[ds] sizes: address: %u bit, ",
1314 8 * ds_cfg.sizeof_ptr_field);
1315 printk("bts/pebs record: %u/%u bytes\n",
1316 ds_cfg.sizeof_rec[ds_bts], ds_cfg.sizeof_rec[ds_pebs]);
1317
1318 WARN_ON_ONCE(MAX_PEBS_COUNTERS < ds_cfg.nr_counter_reset);
1319}
1320
1321void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
1322{
1323 /* Only configure the first cpu. Others are identical. */
1324 if (ds_cfg.name)
1325 return;
1326
1327 switch (c->x86) {
1328 case 0x6:
1329 switch (c->x86_model) {
1330 case 0x9:
1331 case 0xd: /* Pentium M */
1332 ds_configure(&ds_cfg_pentium_m, c);
1333 break;
1334 case 0xf:
1335 case 0x17: /* Core2 */
1336 case 0x1c: /* Atom */
1337 ds_configure(&ds_cfg_core2_atom, c);
1338 break;
1339 case 0x1a: /* Core i7 */
1340 ds_configure(&ds_cfg_core_i7, c);
1341 break;
1342 default:
1343 /* Sorry, don't know about them. */
1344 break;
1345 }
1346 break;
1347 case 0xf:
1348 switch (c->x86_model) {
1349 case 0x0:
1350 case 0x1:
1351 case 0x2: /* Netburst */
1352 ds_configure(&ds_cfg_netburst, c);
1353 break;
1354 default:
1355 /* Sorry, don't know about them. */
1356 break;
1357 }
1358 break;
1359 default:
1360 /* Sorry, don't know about them. */
1361 break;
1362 }
1363}
1364
1365static inline void ds_take_timestamp(struct ds_context *context,
1366 enum bts_qualifier qualifier,
1367 struct task_struct *task)
1368{
1369 struct bts_tracer *tracer = context->bts_master;
1370 struct bts_struct ts;
1371
1372 /* Prevent compilers from reading the tracer pointer twice. */
1373 barrier();
1374
1375 if (!tracer || !(tracer->flags & BTS_TIMESTAMPS))
1376 return;
1377
1378 memset(&ts, 0, sizeof(ts));
1379 ts.qualifier = qualifier;
1380 ts.variant.event.clock = trace_clock_global();
1381 ts.variant.event.pid = task->pid;
1382
1383 bts_write(tracer, &ts);
1384}
1385
1386/*
1387 * Change the DS configuration from tracing prev to tracing next.
1388 */
1389void ds_switch_to(struct task_struct *prev, struct task_struct *next)
1390{
1391 struct ds_context *prev_ctx = prev->thread.ds_ctx;
1392 struct ds_context *next_ctx = next->thread.ds_ctx;
1393 unsigned long debugctlmsr = next->thread.debugctlmsr;
1394
1395 /* Make sure all data is read before we start. */
1396 barrier();
1397
1398 if (prev_ctx) {
1399 update_debugctlmsr(0);
1400
1401 ds_take_timestamp(prev_ctx, bts_task_departs, prev);
1402 }
1403
1404 if (next_ctx) {
1405 ds_take_timestamp(next_ctx, bts_task_arrives, next);
1406
1407 wrmsrl(MSR_IA32_DS_AREA, (unsigned long)next_ctx->ds);
1408 }
1409
1410 update_debugctlmsr(debugctlmsr);
1411}
1412
1413static __init int ds_selftest(void)
1414{
1415 if (ds_cfg.sizeof_rec[ds_bts]) {
1416 int error;
1417
1418 error = ds_selftest_bts();
1419 if (error) {
1420 WARN(1, "[ds] selftest failed. disabling bts.\n");
1421 ds_cfg.sizeof_rec[ds_bts] = 0;
1422 }
1423 }
1424
1425 if (ds_cfg.sizeof_rec[ds_pebs]) {
1426 int error;
1427
1428 error = ds_selftest_pebs();
1429 if (error) {
1430 WARN(1, "[ds] selftest failed. disabling pebs.\n");
1431 ds_cfg.sizeof_rec[ds_pebs] = 0;
1432 }
1433 }
1434
1435 return 0;
1436}
1437device_initcall(ds_selftest);
diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c
deleted file mode 100644
index 6bc7c199ab99..000000000000
--- a/arch/x86/kernel/ds_selftest.c
+++ /dev/null
@@ -1,408 +0,0 @@
1/*
2 * Debug Store support - selftest
3 *
4 *
5 * Copyright (C) 2009 Intel Corporation.
6 * Markus Metzger <markus.t.metzger@intel.com>, 2009
7 */
8
9#include "ds_selftest.h"
10
11#include <linux/kernel.h>
12#include <linux/string.h>
13#include <linux/smp.h>
14#include <linux/cpu.h>
15
16#include <asm/ds.h>
17
18
19#define BUFFER_SIZE 521 /* Intentionally chose an odd size. */
20#define SMALL_BUFFER_SIZE 24 /* A single bts entry. */
21
22struct ds_selftest_bts_conf {
23 struct bts_tracer *tracer;
24 int error;
25 int (*suspend)(struct bts_tracer *);
26 int (*resume)(struct bts_tracer *);
27};
28
29static int ds_selftest_bts_consistency(const struct bts_trace *trace)
30{
31 int error = 0;
32
33 if (!trace) {
34 printk(KERN_CONT "failed to access trace...");
35 /* Bail out. Other tests are pointless. */
36 return -1;
37 }
38
39 if (!trace->read) {
40 printk(KERN_CONT "bts read not available...");
41 error = -1;
42 }
43
44 /* Do some sanity checks on the trace configuration. */
45 if (!trace->ds.n) {
46 printk(KERN_CONT "empty bts buffer...");
47 error = -1;
48 }
49 if (!trace->ds.size) {
50 printk(KERN_CONT "bad bts trace setup...");
51 error = -1;
52 }
53 if (trace->ds.end !=
54 (char *)trace->ds.begin + (trace->ds.n * trace->ds.size)) {
55 printk(KERN_CONT "bad bts buffer setup...");
56 error = -1;
57 }
58 /*
59 * We allow top in [begin; end], since its not clear when the
60 * overflow adjustment happens: after the increment or before the
61 * write.
62 */
63 if ((trace->ds.top < trace->ds.begin) ||
64 (trace->ds.end < trace->ds.top)) {
65 printk(KERN_CONT "bts top out of bounds...");
66 error = -1;
67 }
68
69 return error;
70}
71
72static int ds_selftest_bts_read(struct bts_tracer *tracer,
73 const struct bts_trace *trace,
74 const void *from, const void *to)
75{
76 const unsigned char *at;
77
78 /*
79 * Check a few things which do not belong to this test.
80 * They should be covered by other tests.
81 */
82 if (!trace)
83 return -1;
84
85 if (!trace->read)
86 return -1;
87
88 if (to < from)
89 return -1;
90
91 if (from < trace->ds.begin)
92 return -1;
93
94 if (trace->ds.end < to)
95 return -1;
96
97 if (!trace->ds.size)
98 return -1;
99
100 /* Now to the test itself. */
101 for (at = from; (void *)at < to; at += trace->ds.size) {
102 struct bts_struct bts;
103 unsigned long index;
104 int error;
105
106 if (((void *)at - trace->ds.begin) % trace->ds.size) {
107 printk(KERN_CONT
108 "read from non-integer index...");
109 return -1;
110 }
111 index = ((void *)at - trace->ds.begin) / trace->ds.size;
112
113 memset(&bts, 0, sizeof(bts));
114 error = trace->read(tracer, at, &bts);
115 if (error < 0) {
116 printk(KERN_CONT
117 "error reading bts trace at [%lu] (0x%p)...",
118 index, at);
119 return error;
120 }
121
122 switch (bts.qualifier) {
123 case BTS_BRANCH:
124 break;
125 default:
126 printk(KERN_CONT
127 "unexpected bts entry %llu at [%lu] (0x%p)...",
128 bts.qualifier, index, at);
129 return -1;
130 }
131 }
132
133 return 0;
134}
135
136static void ds_selftest_bts_cpu(void *arg)
137{
138 struct ds_selftest_bts_conf *conf = arg;
139 const struct bts_trace *trace;
140 void *top;
141
142 if (IS_ERR(conf->tracer)) {
143 conf->error = PTR_ERR(conf->tracer);
144 conf->tracer = NULL;
145
146 printk(KERN_CONT
147 "initialization failed (err: %d)...", conf->error);
148 return;
149 }
150
151 /* We should meanwhile have enough trace. */
152 conf->error = conf->suspend(conf->tracer);
153 if (conf->error < 0)
154 return;
155
156 /* Let's see if we can access the trace. */
157 trace = ds_read_bts(conf->tracer);
158
159 conf->error = ds_selftest_bts_consistency(trace);
160 if (conf->error < 0)
161 return;
162
163 /* If everything went well, we should have a few trace entries. */
164 if (trace->ds.top == trace->ds.begin) {
165 /*
166 * It is possible but highly unlikely that we got a
167 * buffer overflow and end up at exactly the same
168 * position we started from.
169 * Let's issue a warning, but continue.
170 */
171 printk(KERN_CONT "no trace/overflow...");
172 }
173
174 /* Let's try to read the trace we collected. */
175 conf->error =
176 ds_selftest_bts_read(conf->tracer, trace,
177 trace->ds.begin, trace->ds.top);
178 if (conf->error < 0)
179 return;
180
181 /*
182 * Let's read the trace again.
183 * Since we suspended tracing, we should get the same result.
184 */
185 top = trace->ds.top;
186
187 trace = ds_read_bts(conf->tracer);
188 conf->error = ds_selftest_bts_consistency(trace);
189 if (conf->error < 0)
190 return;
191
192 if (top != trace->ds.top) {
193 printk(KERN_CONT "suspend not working...");
194 conf->error = -1;
195 return;
196 }
197
198 /* Let's collect some more trace - see if resume is working. */
199 conf->error = conf->resume(conf->tracer);
200 if (conf->error < 0)
201 return;
202
203 conf->error = conf->suspend(conf->tracer);
204 if (conf->error < 0)
205 return;
206
207 trace = ds_read_bts(conf->tracer);
208
209 conf->error = ds_selftest_bts_consistency(trace);
210 if (conf->error < 0)
211 return;
212
213 if (trace->ds.top == top) {
214 /*
215 * It is possible but highly unlikely that we got a
216 * buffer overflow and end up at exactly the same
217 * position we started from.
218 * Let's issue a warning and check the full trace.
219 */
220 printk(KERN_CONT
221 "no resume progress/overflow...");
222
223 conf->error =
224 ds_selftest_bts_read(conf->tracer, trace,
225 trace->ds.begin, trace->ds.end);
226 } else if (trace->ds.top < top) {
227 /*
228 * We had a buffer overflow - the entire buffer should
229 * contain trace records.
230 */
231 conf->error =
232 ds_selftest_bts_read(conf->tracer, trace,
233 trace->ds.begin, trace->ds.end);
234 } else {
235 /*
236 * It is quite likely that the buffer did not overflow.
237 * Let's just check the delta trace.
238 */
239 conf->error =
240 ds_selftest_bts_read(conf->tracer, trace, top,
241 trace->ds.top);
242 }
243 if (conf->error < 0)
244 return;
245
246 conf->error = 0;
247}
248
249static int ds_suspend_bts_wrap(struct bts_tracer *tracer)
250{
251 ds_suspend_bts(tracer);
252 return 0;
253}
254
255static int ds_resume_bts_wrap(struct bts_tracer *tracer)
256{
257 ds_resume_bts(tracer);
258 return 0;
259}
260
261static void ds_release_bts_noirq_wrap(void *tracer)
262{
263 (void)ds_release_bts_noirq(tracer);
264}
265
266static int ds_selftest_bts_bad_release_noirq(int cpu,
267 struct bts_tracer *tracer)
268{
269 int error = -EPERM;
270
271 /* Try to release the tracer on the wrong cpu. */
272 get_cpu();
273 if (cpu != smp_processor_id()) {
274 error = ds_release_bts_noirq(tracer);
275 if (error != -EPERM)
276 printk(KERN_CONT "release on wrong cpu...");
277 }
278 put_cpu();
279
280 return error ? 0 : -1;
281}
282
283static int ds_selftest_bts_bad_request_cpu(int cpu, void *buffer)
284{
285 struct bts_tracer *tracer;
286 int error;
287
288 /* Try to request cpu tracing while task tracing is active. */
289 tracer = ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE, NULL,
290 (size_t)-1, BTS_KERNEL);
291 error = PTR_ERR(tracer);
292 if (!IS_ERR(tracer)) {
293 ds_release_bts(tracer);
294 error = 0;
295 }
296
297 if (error != -EPERM)
298 printk(KERN_CONT "cpu/task tracing overlap...");
299
300 return error ? 0 : -1;
301}
302
303static int ds_selftest_bts_bad_request_task(void *buffer)
304{
305 struct bts_tracer *tracer;
306 int error;
307
308 /* Try to request cpu tracing while task tracing is active. */
309 tracer = ds_request_bts_task(current, buffer, BUFFER_SIZE, NULL,
310 (size_t)-1, BTS_KERNEL);
311 error = PTR_ERR(tracer);
312 if (!IS_ERR(tracer)) {
313 error = 0;
314 ds_release_bts(tracer);
315 }
316
317 if (error != -EPERM)
318 printk(KERN_CONT "task/cpu tracing overlap...");
319
320 return error ? 0 : -1;
321}
322
323int ds_selftest_bts(void)
324{
325 struct ds_selftest_bts_conf conf;
326 unsigned char buffer[BUFFER_SIZE], *small_buffer;
327 unsigned long irq;
328 int cpu;
329
330 printk(KERN_INFO "[ds] bts selftest...");
331 conf.error = 0;
332
333 small_buffer = (unsigned char *)ALIGN((unsigned long)buffer, 8) + 8;
334
335 get_online_cpus();
336 for_each_online_cpu(cpu) {
337 conf.suspend = ds_suspend_bts_wrap;
338 conf.resume = ds_resume_bts_wrap;
339 conf.tracer =
340 ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE,
341 NULL, (size_t)-1, BTS_KERNEL);
342 ds_selftest_bts_cpu(&conf);
343 if (conf.error >= 0)
344 conf.error = ds_selftest_bts_bad_request_task(buffer);
345 ds_release_bts(conf.tracer);
346 if (conf.error < 0)
347 goto out;
348
349 conf.suspend = ds_suspend_bts_noirq;
350 conf.resume = ds_resume_bts_noirq;
351 conf.tracer =
352 ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE,
353 NULL, (size_t)-1, BTS_KERNEL);
354 smp_call_function_single(cpu, ds_selftest_bts_cpu, &conf, 1);
355 if (conf.error >= 0) {
356 conf.error =
357 ds_selftest_bts_bad_release_noirq(cpu,
358 conf.tracer);
359 /* We must not release the tracer twice. */
360 if (conf.error < 0)
361 conf.tracer = NULL;
362 }
363 if (conf.error >= 0)
364 conf.error = ds_selftest_bts_bad_request_task(buffer);
365 smp_call_function_single(cpu, ds_release_bts_noirq_wrap,
366 conf.tracer, 1);
367 if (conf.error < 0)
368 goto out;
369 }
370
371 conf.suspend = ds_suspend_bts_wrap;
372 conf.resume = ds_resume_bts_wrap;
373 conf.tracer =
374 ds_request_bts_task(current, buffer, BUFFER_SIZE,
375 NULL, (size_t)-1, BTS_KERNEL);
376 ds_selftest_bts_cpu(&conf);
377 if (conf.error >= 0)
378 conf.error = ds_selftest_bts_bad_request_cpu(0, buffer);
379 ds_release_bts(conf.tracer);
380 if (conf.error < 0)
381 goto out;
382
383 conf.suspend = ds_suspend_bts_noirq;
384 conf.resume = ds_resume_bts_noirq;
385 conf.tracer =
386 ds_request_bts_task(current, small_buffer, SMALL_BUFFER_SIZE,
387 NULL, (size_t)-1, BTS_KERNEL);
388 local_irq_save(irq);
389 ds_selftest_bts_cpu(&conf);
390 if (conf.error >= 0)
391 conf.error = ds_selftest_bts_bad_request_cpu(0, buffer);
392 ds_release_bts_noirq(conf.tracer);
393 local_irq_restore(irq);
394 if (conf.error < 0)
395 goto out;
396
397 conf.error = 0;
398 out:
399 put_online_cpus();
400 printk(KERN_CONT "%s.\n", (conf.error ? "failed" : "passed"));
401
402 return conf.error;
403}
404
405int ds_selftest_pebs(void)
406{
407 return 0;
408}
diff --git a/arch/x86/kernel/ds_selftest.h b/arch/x86/kernel/ds_selftest.h
deleted file mode 100644
index 2ba8745c6663..000000000000
--- a/arch/x86/kernel/ds_selftest.h
+++ /dev/null
@@ -1,15 +0,0 @@
1/*
2 * Debug Store support - selftest
3 *
4 *
5 * Copyright (C) 2009 Intel Corporation.
6 * Markus Metzger <markus.t.metzger@intel.com>, 2009
7 */
8
9#ifdef CONFIG_X86_DS_SELFTEST
10extern int ds_selftest_bts(void);
11extern int ds_selftest_pebs(void);
12#else
13static inline int ds_selftest_bts(void) { return 0; }
14static inline int ds_selftest_pebs(void) { return 0; }
15#endif
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 6d817554780a..c89a386930b7 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -224,11 +224,6 @@ unsigned __kprobes long oops_begin(void)
224 int cpu; 224 int cpu;
225 unsigned long flags; 225 unsigned long flags;
226 226
227 /* notify the hw-branch tracer so it may disable tracing and
228 add the last trace to the trace buffer -
229 the earlier this happens, the more useful the trace. */
230 trace_hw_branch_oops();
231
232 oops_enter(); 227 oops_enter();
233 228
234 /* racy, but better than risking deadlock. */ 229 /* racy, but better than risking deadlock. */
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 44a8e0dc6737..cd49141cf153 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -53,6 +53,7 @@
53#include <asm/processor-flags.h> 53#include <asm/processor-flags.h>
54#include <asm/ftrace.h> 54#include <asm/ftrace.h>
55#include <asm/irq_vectors.h> 55#include <asm/irq_vectors.h>
56#include <asm/cpufeature.h>
56 57
57/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ 58/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
58#include <linux/elf-em.h> 59#include <linux/elf-em.h>
@@ -905,7 +906,25 @@ ENTRY(simd_coprocessor_error)
905 RING0_INT_FRAME 906 RING0_INT_FRAME
906 pushl $0 907 pushl $0
907 CFI_ADJUST_CFA_OFFSET 4 908 CFI_ADJUST_CFA_OFFSET 4
909#ifdef CONFIG_X86_INVD_BUG
910 /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */
911661: pushl $do_general_protection
912662:
913.section .altinstructions,"a"
914 .balign 4
915 .long 661b
916 .long 663f
917 .byte X86_FEATURE_XMM
918 .byte 662b-661b
919 .byte 664f-663f
920.previous
921.section .altinstr_replacement,"ax"
922663: pushl $do_simd_coprocessor_error
923664:
924.previous
925#else
908 pushl $do_simd_coprocessor_error 926 pushl $do_simd_coprocessor_error
927#endif
909 CFI_ADJUST_CFA_OFFSET 4 928 CFI_ADJUST_CFA_OFFSET 4
910 jmp error_code 929 jmp error_code
911 CFI_ENDPROC 930 CFI_ENDPROC
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index d6cc065f519f..a8f1b803d2fd 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -189,25 +189,16 @@ static int get_hbp_len(u8 hbp_len)
189} 189}
190 190
191/* 191/*
192 * Check for virtual address in user space.
193 */
194int arch_check_va_in_userspace(unsigned long va, u8 hbp_len)
195{
196 unsigned int len;
197
198 len = get_hbp_len(hbp_len);
199
200 return (va <= TASK_SIZE - len);
201}
202
203/*
204 * Check for virtual address in kernel space. 192 * Check for virtual address in kernel space.
205 */ 193 */
206static int arch_check_va_in_kernelspace(unsigned long va, u8 hbp_len) 194int arch_check_bp_in_kernelspace(struct perf_event *bp)
207{ 195{
208 unsigned int len; 196 unsigned int len;
197 unsigned long va;
198 struct arch_hw_breakpoint *info = counter_arch_bp(bp);
209 199
210 len = get_hbp_len(hbp_len); 200 va = info->address;
201 len = get_hbp_len(info->len);
211 202
212 return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE); 203 return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE);
213} 204}
@@ -300,8 +291,7 @@ static int arch_build_bp_info(struct perf_event *bp)
300/* 291/*
301 * Validate the arch-specific HW Breakpoint register settings 292 * Validate the arch-specific HW Breakpoint register settings
302 */ 293 */
303int arch_validate_hwbkpt_settings(struct perf_event *bp, 294int arch_validate_hwbkpt_settings(struct perf_event *bp)
304 struct task_struct *tsk)
305{ 295{
306 struct arch_hw_breakpoint *info = counter_arch_bp(bp); 296 struct arch_hw_breakpoint *info = counter_arch_bp(bp);
307 unsigned int align; 297 unsigned int align;
@@ -314,16 +304,6 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp,
314 304
315 ret = -EINVAL; 305 ret = -EINVAL;
316 306
317 if (info->type == X86_BREAKPOINT_EXECUTE)
318 /*
319 * Ptrace-refactoring code
320 * For now, we'll allow instruction breakpoint only for user-space
321 * addresses
322 */
323 if ((!arch_check_va_in_userspace(info->address, info->len)) &&
324 info->len != X86_BREAKPOINT_EXECUTE)
325 return ret;
326
327 switch (info->len) { 307 switch (info->len) {
328 case X86_BREAKPOINT_LEN_1: 308 case X86_BREAKPOINT_LEN_1:
329 align = 0; 309 align = 0;
@@ -350,15 +330,6 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp,
350 if (info->address & align) 330 if (info->address & align)
351 return -EINVAL; 331 return -EINVAL;
352 332
353 /* Check that the virtual address is in the proper range */
354 if (tsk) {
355 if (!arch_check_va_in_userspace(info->address, info->len))
356 return -EFAULT;
357 } else {
358 if (!arch_check_va_in_kernelspace(info->address, info->len))
359 return -EFAULT;
360 }
361
362 return 0; 333 return 0;
363} 334}
364 335
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 54c31c285488..86cef6b32253 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -102,65 +102,62 @@ void __cpuinit fpu_init(void)
102 102
103 mxcsr_feature_mask_init(); 103 mxcsr_feature_mask_init();
104 /* clean state in init */ 104 /* clean state in init */
105 if (cpu_has_xsave) 105 current_thread_info()->status = 0;
106 current_thread_info()->status = TS_XSAVE;
107 else
108 current_thread_info()->status = 0;
109 clear_used_math(); 106 clear_used_math();
110} 107}
111#endif /* CONFIG_X86_64 */ 108#endif /* CONFIG_X86_64 */
112 109
113/* 110static void fpu_finit(struct fpu *fpu)
114 * The _current_ task is using the FPU for the first time
115 * so initialize it and set the mxcsr to its default
116 * value at reset if we support XMM instructions and then
117 * remeber the current task has used the FPU.
118 */
119int init_fpu(struct task_struct *tsk)
120{ 111{
121 if (tsk_used_math(tsk)) {
122 if (HAVE_HWFP && tsk == current)
123 unlazy_fpu(tsk);
124 return 0;
125 }
126
127 /*
128 * Memory allocation at the first usage of the FPU and other state.
129 */
130 if (!tsk->thread.xstate) {
131 tsk->thread.xstate = kmem_cache_alloc(task_xstate_cachep,
132 GFP_KERNEL);
133 if (!tsk->thread.xstate)
134 return -ENOMEM;
135 }
136
137#ifdef CONFIG_X86_32 112#ifdef CONFIG_X86_32
138 if (!HAVE_HWFP) { 113 if (!HAVE_HWFP) {
139 memset(tsk->thread.xstate, 0, xstate_size); 114 finit_soft_fpu(&fpu->state->soft);
140 finit_task(tsk); 115 return;
141 set_stopped_child_used_math(tsk);
142 return 0;
143 } 116 }
144#endif 117#endif
145 118
146 if (cpu_has_fxsr) { 119 if (cpu_has_fxsr) {
147 struct i387_fxsave_struct *fx = &tsk->thread.xstate->fxsave; 120 struct i387_fxsave_struct *fx = &fpu->state->fxsave;
148 121
149 memset(fx, 0, xstate_size); 122 memset(fx, 0, xstate_size);
150 fx->cwd = 0x37f; 123 fx->cwd = 0x37f;
151 if (cpu_has_xmm) 124 if (cpu_has_xmm)
152 fx->mxcsr = MXCSR_DEFAULT; 125 fx->mxcsr = MXCSR_DEFAULT;
153 } else { 126 } else {
154 struct i387_fsave_struct *fp = &tsk->thread.xstate->fsave; 127 struct i387_fsave_struct *fp = &fpu->state->fsave;
155 memset(fp, 0, xstate_size); 128 memset(fp, 0, xstate_size);
156 fp->cwd = 0xffff037fu; 129 fp->cwd = 0xffff037fu;
157 fp->swd = 0xffff0000u; 130 fp->swd = 0xffff0000u;
158 fp->twd = 0xffffffffu; 131 fp->twd = 0xffffffffu;
159 fp->fos = 0xffff0000u; 132 fp->fos = 0xffff0000u;
160 } 133 }
134}
135
136/*
137 * The _current_ task is using the FPU for the first time
138 * so initialize it and set the mxcsr to its default
139 * value at reset if we support XMM instructions and then
140 * remeber the current task has used the FPU.
141 */
142int init_fpu(struct task_struct *tsk)
143{
144 int ret;
145
146 if (tsk_used_math(tsk)) {
147 if (HAVE_HWFP && tsk == current)
148 unlazy_fpu(tsk);
149 return 0;
150 }
151
161 /* 152 /*
162 * Only the device not available exception or ptrace can call init_fpu. 153 * Memory allocation at the first usage of the FPU and other state.
163 */ 154 */
155 ret = fpu_alloc(&tsk->thread.fpu);
156 if (ret)
157 return ret;
158
159 fpu_finit(&tsk->thread.fpu);
160
164 set_stopped_child_used_math(tsk); 161 set_stopped_child_used_math(tsk);
165 return 0; 162 return 0;
166} 163}
@@ -194,7 +191,7 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset,
194 return ret; 191 return ret;
195 192
196 return user_regset_copyout(&pos, &count, &kbuf, &ubuf, 193 return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
197 &target->thread.xstate->fxsave, 0, -1); 194 &target->thread.fpu.state->fxsave, 0, -1);
198} 195}
199 196
200int xfpregs_set(struct task_struct *target, const struct user_regset *regset, 197int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
@@ -211,19 +208,19 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
211 return ret; 208 return ret;
212 209
213 ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, 210 ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
214 &target->thread.xstate->fxsave, 0, -1); 211 &target->thread.fpu.state->fxsave, 0, -1);
215 212
216 /* 213 /*
217 * mxcsr reserved bits must be masked to zero for security reasons. 214 * mxcsr reserved bits must be masked to zero for security reasons.
218 */ 215 */
219 target->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask; 216 target->thread.fpu.state->fxsave.mxcsr &= mxcsr_feature_mask;
220 217
221 /* 218 /*
222 * update the header bits in the xsave header, indicating the 219 * update the header bits in the xsave header, indicating the
223 * presence of FP and SSE state. 220 * presence of FP and SSE state.
224 */ 221 */
225 if (cpu_has_xsave) 222 if (cpu_has_xsave)
226 target->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE; 223 target->thread.fpu.state->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE;
227 224
228 return ret; 225 return ret;
229} 226}
@@ -246,14 +243,14 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset,
246 * memory layout in the thread struct, so that we can copy the entire 243 * memory layout in the thread struct, so that we can copy the entire
247 * xstateregs to the user using one user_regset_copyout(). 244 * xstateregs to the user using one user_regset_copyout().
248 */ 245 */
249 memcpy(&target->thread.xstate->fxsave.sw_reserved, 246 memcpy(&target->thread.fpu.state->fxsave.sw_reserved,
250 xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes)); 247 xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes));
251 248
252 /* 249 /*
253 * Copy the xstate memory layout. 250 * Copy the xstate memory layout.
254 */ 251 */
255 ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, 252 ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
256 &target->thread.xstate->xsave, 0, -1); 253 &target->thread.fpu.state->xsave, 0, -1);
257 return ret; 254 return ret;
258} 255}
259 256
@@ -272,14 +269,14 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
272 return ret; 269 return ret;
273 270
274 ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, 271 ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
275 &target->thread.xstate->xsave, 0, -1); 272 &target->thread.fpu.state->xsave, 0, -1);
276 273
277 /* 274 /*
278 * mxcsr reserved bits must be masked to zero for security reasons. 275 * mxcsr reserved bits must be masked to zero for security reasons.
279 */ 276 */
280 target->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask; 277 target->thread.fpu.state->fxsave.mxcsr &= mxcsr_feature_mask;
281 278
282 xsave_hdr = &target->thread.xstate->xsave.xsave_hdr; 279 xsave_hdr = &target->thread.fpu.state->xsave.xsave_hdr;
283 280
284 xsave_hdr->xstate_bv &= pcntxt_mask; 281 xsave_hdr->xstate_bv &= pcntxt_mask;
285 /* 282 /*
@@ -365,7 +362,7 @@ static inline u32 twd_fxsr_to_i387(struct i387_fxsave_struct *fxsave)
365static void 362static void
366convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk) 363convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk)
367{ 364{
368 struct i387_fxsave_struct *fxsave = &tsk->thread.xstate->fxsave; 365 struct i387_fxsave_struct *fxsave = &tsk->thread.fpu.state->fxsave;
369 struct _fpreg *to = (struct _fpreg *) &env->st_space[0]; 366 struct _fpreg *to = (struct _fpreg *) &env->st_space[0];
370 struct _fpxreg *from = (struct _fpxreg *) &fxsave->st_space[0]; 367 struct _fpxreg *from = (struct _fpxreg *) &fxsave->st_space[0];
371 int i; 368 int i;
@@ -405,7 +402,7 @@ static void convert_to_fxsr(struct task_struct *tsk,
405 const struct user_i387_ia32_struct *env) 402 const struct user_i387_ia32_struct *env)
406 403
407{ 404{
408 struct i387_fxsave_struct *fxsave = &tsk->thread.xstate->fxsave; 405 struct i387_fxsave_struct *fxsave = &tsk->thread.fpu.state->fxsave;
409 struct _fpreg *from = (struct _fpreg *) &env->st_space[0]; 406 struct _fpreg *from = (struct _fpreg *) &env->st_space[0];
410 struct _fpxreg *to = (struct _fpxreg *) &fxsave->st_space[0]; 407 struct _fpxreg *to = (struct _fpxreg *) &fxsave->st_space[0];
411 int i; 408 int i;
@@ -445,7 +442,7 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset,
445 442
446 if (!cpu_has_fxsr) { 443 if (!cpu_has_fxsr) {
447 return user_regset_copyout(&pos, &count, &kbuf, &ubuf, 444 return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
448 &target->thread.xstate->fsave, 0, 445 &target->thread.fpu.state->fsave, 0,
449 -1); 446 -1);
450 } 447 }
451 448
@@ -475,7 +472,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
475 472
476 if (!cpu_has_fxsr) { 473 if (!cpu_has_fxsr) {
477 return user_regset_copyin(&pos, &count, &kbuf, &ubuf, 474 return user_regset_copyin(&pos, &count, &kbuf, &ubuf,
478 &target->thread.xstate->fsave, 0, -1); 475 &target->thread.fpu.state->fsave, 0, -1);
479 } 476 }
480 477
481 if (pos > 0 || count < sizeof(env)) 478 if (pos > 0 || count < sizeof(env))
@@ -490,7 +487,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
490 * presence of FP. 487 * presence of FP.
491 */ 488 */
492 if (cpu_has_xsave) 489 if (cpu_has_xsave)
493 target->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FP; 490 target->thread.fpu.state->xsave.xsave_hdr.xstate_bv |= XSTATE_FP;
494 return ret; 491 return ret;
495} 492}
496 493
@@ -501,7 +498,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
501static inline int save_i387_fsave(struct _fpstate_ia32 __user *buf) 498static inline int save_i387_fsave(struct _fpstate_ia32 __user *buf)
502{ 499{
503 struct task_struct *tsk = current; 500 struct task_struct *tsk = current;
504 struct i387_fsave_struct *fp = &tsk->thread.xstate->fsave; 501 struct i387_fsave_struct *fp = &tsk->thread.fpu.state->fsave;
505 502
506 fp->status = fp->swd; 503 fp->status = fp->swd;
507 if (__copy_to_user(buf, fp, sizeof(struct i387_fsave_struct))) 504 if (__copy_to_user(buf, fp, sizeof(struct i387_fsave_struct)))
@@ -512,7 +509,7 @@ static inline int save_i387_fsave(struct _fpstate_ia32 __user *buf)
512static int save_i387_fxsave(struct _fpstate_ia32 __user *buf) 509static int save_i387_fxsave(struct _fpstate_ia32 __user *buf)
513{ 510{
514 struct task_struct *tsk = current; 511 struct task_struct *tsk = current;
515 struct i387_fxsave_struct *fx = &tsk->thread.xstate->fxsave; 512 struct i387_fxsave_struct *fx = &tsk->thread.fpu.state->fxsave;
516 struct user_i387_ia32_struct env; 513 struct user_i387_ia32_struct env;
517 int err = 0; 514 int err = 0;
518 515
@@ -547,7 +544,7 @@ static int save_i387_xsave(void __user *buf)
547 * header as well as change any contents in the memory layout. 544 * header as well as change any contents in the memory layout.
548 * xrestore as part of sigreturn will capture all the changes. 545 * xrestore as part of sigreturn will capture all the changes.
549 */ 546 */
550 tsk->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE; 547 tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE;
551 548
552 if (save_i387_fxsave(fx) < 0) 549 if (save_i387_fxsave(fx) < 0)
553 return -1; 550 return -1;
@@ -599,7 +596,7 @@ static inline int restore_i387_fsave(struct _fpstate_ia32 __user *buf)
599{ 596{
600 struct task_struct *tsk = current; 597 struct task_struct *tsk = current;
601 598
602 return __copy_from_user(&tsk->thread.xstate->fsave, buf, 599 return __copy_from_user(&tsk->thread.fpu.state->fsave, buf,
603 sizeof(struct i387_fsave_struct)); 600 sizeof(struct i387_fsave_struct));
604} 601}
605 602
@@ -610,10 +607,10 @@ static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf,
610 struct user_i387_ia32_struct env; 607 struct user_i387_ia32_struct env;
611 int err; 608 int err;
612 609
613 err = __copy_from_user(&tsk->thread.xstate->fxsave, &buf->_fxsr_env[0], 610 err = __copy_from_user(&tsk->thread.fpu.state->fxsave, &buf->_fxsr_env[0],
614 size); 611 size);
615 /* mxcsr reserved bits must be masked to zero for security reasons */ 612 /* mxcsr reserved bits must be masked to zero for security reasons */
616 tsk->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask; 613 tsk->thread.fpu.state->fxsave.mxcsr &= mxcsr_feature_mask;
617 if (err || __copy_from_user(&env, buf, sizeof(env))) 614 if (err || __copy_from_user(&env, buf, sizeof(env)))
618 return 1; 615 return 1;
619 convert_to_fxsr(tsk, &env); 616 convert_to_fxsr(tsk, &env);
@@ -629,7 +626,7 @@ static int restore_i387_xsave(void __user *buf)
629 struct i387_fxsave_struct __user *fx = 626 struct i387_fxsave_struct __user *fx =
630 (struct i387_fxsave_struct __user *) &fx_user->_fxsr_env[0]; 627 (struct i387_fxsave_struct __user *) &fx_user->_fxsr_env[0];
631 struct xsave_hdr_struct *xsave_hdr = 628 struct xsave_hdr_struct *xsave_hdr =
632 &current->thread.xstate->xsave.xsave_hdr; 629 &current->thread.fpu.state->xsave.xsave_hdr;
633 u64 mask; 630 u64 mask;
634 int err; 631 int err;
635 632
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index 23c167925a5c..2dfd31597443 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -16,7 +16,7 @@
16#include <asm/hpet.h> 16#include <asm/hpet.h>
17#include <asm/smp.h> 17#include <asm/smp.h>
18 18
19DEFINE_SPINLOCK(i8253_lock); 19DEFINE_RAW_SPINLOCK(i8253_lock);
20EXPORT_SYMBOL(i8253_lock); 20EXPORT_SYMBOL(i8253_lock);
21 21
22/* 22/*
@@ -33,7 +33,7 @@ struct clock_event_device *global_clock_event;
33static void init_pit_timer(enum clock_event_mode mode, 33static void init_pit_timer(enum clock_event_mode mode,
34 struct clock_event_device *evt) 34 struct clock_event_device *evt)
35{ 35{
36 spin_lock(&i8253_lock); 36 raw_spin_lock(&i8253_lock);
37 37
38 switch (mode) { 38 switch (mode) {
39 case CLOCK_EVT_MODE_PERIODIC: 39 case CLOCK_EVT_MODE_PERIODIC:
@@ -62,7 +62,7 @@ static void init_pit_timer(enum clock_event_mode mode,
62 /* Nothing to do here */ 62 /* Nothing to do here */
63 break; 63 break;
64 } 64 }
65 spin_unlock(&i8253_lock); 65 raw_spin_unlock(&i8253_lock);
66} 66}
67 67
68/* 68/*
@@ -72,10 +72,10 @@ static void init_pit_timer(enum clock_event_mode mode,
72 */ 72 */
73static int pit_next_event(unsigned long delta, struct clock_event_device *evt) 73static int pit_next_event(unsigned long delta, struct clock_event_device *evt)
74{ 74{
75 spin_lock(&i8253_lock); 75 raw_spin_lock(&i8253_lock);
76 outb_pit(delta & 0xff , PIT_CH0); /* LSB */ 76 outb_pit(delta & 0xff , PIT_CH0); /* LSB */
77 outb_pit(delta >> 8 , PIT_CH0); /* MSB */ 77 outb_pit(delta >> 8 , PIT_CH0); /* MSB */
78 spin_unlock(&i8253_lock); 78 raw_spin_unlock(&i8253_lock);
79 79
80 return 0; 80 return 0;
81} 81}
@@ -130,7 +130,7 @@ static cycle_t pit_read(struct clocksource *cs)
130 int count; 130 int count;
131 u32 jifs; 131 u32 jifs;
132 132
133 spin_lock_irqsave(&i8253_lock, flags); 133 raw_spin_lock_irqsave(&i8253_lock, flags);
134 /* 134 /*
135 * Although our caller may have the read side of xtime_lock, 135 * Although our caller may have the read side of xtime_lock,
136 * this is now a seqlock, and we are cheating in this routine 136 * this is now a seqlock, and we are cheating in this routine
@@ -176,7 +176,7 @@ static cycle_t pit_read(struct clocksource *cs)
176 old_count = count; 176 old_count = count;
177 old_jifs = jifs; 177 old_jifs = jifs;
178 178
179 spin_unlock_irqrestore(&i8253_lock, flags); 179 raw_spin_unlock_irqrestore(&i8253_lock, flags);
180 180
181 count = (LATCH - 1) - count; 181 count = (LATCH - 1) - count;
182 182
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 0ed2d300cd46..990ae7cfc578 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -60,7 +60,7 @@ static irqreturn_t math_error_irq(int cpl, void *dev_id)
60 outb(0, 0xF0); 60 outb(0, 0xF0);
61 if (ignore_fpu_irq || !boot_cpu_data.hard_math) 61 if (ignore_fpu_irq || !boot_cpu_data.hard_math)
62 return IRQ_NONE; 62 return IRQ_NONE;
63 math_error((void __user *)get_irq_regs()->ip); 63 math_error(get_irq_regs(), 0, 16);
64 return IRQ_HANDLED; 64 return IRQ_HANDLED;
65} 65}
66 66
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 1658efdfb4e5..345a4b1fe144 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -422,14 +422,22 @@ static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
422 422
423static void __kprobes clear_btf(void) 423static void __kprobes clear_btf(void)
424{ 424{
425 if (test_thread_flag(TIF_DEBUGCTLMSR)) 425 if (test_thread_flag(TIF_BLOCKSTEP)) {
426 update_debugctlmsr(0); 426 unsigned long debugctl = get_debugctlmsr();
427
428 debugctl &= ~DEBUGCTLMSR_BTF;
429 update_debugctlmsr(debugctl);
430 }
427} 431}
428 432
429static void __kprobes restore_btf(void) 433static void __kprobes restore_btf(void)
430{ 434{
431 if (test_thread_flag(TIF_DEBUGCTLMSR)) 435 if (test_thread_flag(TIF_BLOCKSTEP)) {
432 update_debugctlmsr(current->thread.debugctlmsr); 436 unsigned long debugctl = get_debugctlmsr();
437
438 debugctl |= DEBUGCTLMSR_BTF;
439 update_debugctlmsr(debugctl);
440 }
433} 441}
434 442
435void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, 443void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index cceb5bc3c3c2..2cd8c544e41a 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -201,9 +201,9 @@ static int do_microcode_update(const void __user *buf, size_t size)
201 return error; 201 return error;
202} 202}
203 203
204static int microcode_open(struct inode *unused1, struct file *unused2) 204static int microcode_open(struct inode *inode, struct file *file)
205{ 205{
206 return capable(CAP_SYS_RAWIO) ? 0 : -EPERM; 206 return capable(CAP_SYS_RAWIO) ? nonseekable_open(inode, file) : -EPERM;
207} 207}
208 208
209static ssize_t microcode_write(struct file *file, const char __user *buf, 209static ssize_t microcode_write(struct file *file, const char __user *buf,
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
index 85a343e28937..356170262a93 100644
--- a/arch/x86/kernel/microcode_intel.c
+++ b/arch/x86/kernel/microcode_intel.c
@@ -343,10 +343,11 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
343 int (*get_ucode_data)(void *, const void *, size_t)) 343 int (*get_ucode_data)(void *, const void *, size_t))
344{ 344{
345 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 345 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
346 u8 *ucode_ptr = data, *new_mc = NULL, *mc; 346 u8 *ucode_ptr = data, *new_mc = NULL, *mc = NULL;
347 int new_rev = uci->cpu_sig.rev; 347 int new_rev = uci->cpu_sig.rev;
348 unsigned int leftover = size; 348 unsigned int leftover = size;
349 enum ucode_state state = UCODE_OK; 349 enum ucode_state state = UCODE_OK;
350 unsigned int curr_mc_size = 0;
350 351
351 while (leftover) { 352 while (leftover) {
352 struct microcode_header_intel mc_header; 353 struct microcode_header_intel mc_header;
@@ -361,9 +362,15 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
361 break; 362 break;
362 } 363 }
363 364
364 mc = vmalloc(mc_size); 365 /* For performance reasons, reuse mc area when possible */
365 if (!mc) 366 if (!mc || mc_size > curr_mc_size) {
366 break; 367 if (mc)
368 vfree(mc);
369 mc = vmalloc(mc_size);
370 if (!mc)
371 break;
372 curr_mc_size = mc_size;
373 }
367 374
368 if (get_ucode_data(mc, ucode_ptr, mc_size) || 375 if (get_ucode_data(mc, ucode_ptr, mc_size) ||
369 microcode_sanity_check(mc) < 0) { 376 microcode_sanity_check(mc) < 0) {
@@ -376,13 +383,16 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
376 vfree(new_mc); 383 vfree(new_mc);
377 new_rev = mc_header.rev; 384 new_rev = mc_header.rev;
378 new_mc = mc; 385 new_mc = mc;
379 } else 386 mc = NULL; /* trigger new vmalloc */
380 vfree(mc); 387 }
381 388
382 ucode_ptr += mc_size; 389 ucode_ptr += mc_size;
383 leftover -= mc_size; 390 leftover -= mc_size;
384 } 391 }
385 392
393 if (mc)
394 vfree(mc);
395
386 if (leftover) { 396 if (leftover) {
387 if (new_mc) 397 if (new_mc)
388 vfree(new_mc); 398 vfree(new_mc);
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index e81030f71a8f..5ae5d2426edf 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -115,21 +115,6 @@ static void __init MP_bus_info(struct mpc_bus *m)
115 printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str); 115 printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
116} 116}
117 117
118static int bad_ioapic(unsigned long address)
119{
120 if (nr_ioapics >= MAX_IO_APICS) {
121 printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
122 "(found %d)\n", MAX_IO_APICS, nr_ioapics);
123 panic("Recompile kernel with bigger MAX_IO_APICS!\n");
124 }
125 if (!address) {
126 printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
127 " found in table, skipping!\n");
128 return 1;
129 }
130 return 0;
131}
132
133static void __init MP_ioapic_info(struct mpc_ioapic *m) 118static void __init MP_ioapic_info(struct mpc_ioapic *m)
134{ 119{
135 if (!(m->flags & MPC_APIC_USABLE)) 120 if (!(m->flags & MPC_APIC_USABLE))
@@ -138,15 +123,7 @@ static void __init MP_ioapic_info(struct mpc_ioapic *m)
138 printk(KERN_INFO "I/O APIC #%d Version %d at 0x%X.\n", 123 printk(KERN_INFO "I/O APIC #%d Version %d at 0x%X.\n",
139 m->apicid, m->apicver, m->apicaddr); 124 m->apicid, m->apicver, m->apicaddr);
140 125
141 if (bad_ioapic(m->apicaddr)) 126 mp_register_ioapic(m->apicid, m->apicaddr, gsi_end + 1);
142 return;
143
144 mp_ioapics[nr_ioapics].apicaddr = m->apicaddr;
145 mp_ioapics[nr_ioapics].apicid = m->apicid;
146 mp_ioapics[nr_ioapics].type = m->type;
147 mp_ioapics[nr_ioapics].apicver = m->apicver;
148 mp_ioapics[nr_ioapics].flags = m->flags;
149 nr_ioapics++;
150} 127}
151 128
152static void print_MP_intsrc_info(struct mpc_intsrc *m) 129static void print_MP_intsrc_info(struct mpc_intsrc *m)
diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c
index 0aad8670858e..e796448f0eb5 100644
--- a/arch/x86/kernel/mrst.c
+++ b/arch/x86/kernel/mrst.c
@@ -237,4 +237,9 @@ void __init x86_mrst_early_setup(void)
237 x86_init.pci.fixup_irqs = x86_init_noop; 237 x86_init.pci.fixup_irqs = x86_init_noop;
238 238
239 legacy_pic = &null_legacy_pic; 239 legacy_pic = &null_legacy_pic;
240
241 /* Avoid searching for BIOS MP tables */
242 x86_init.mpparse.find_smp_config = x86_init_noop;
243 x86_init.mpparse.get_smp_config = x86_init_uint_noop;
244
240} 245}
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 28ad9f4d8b94..e7e35219b32f 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -20,7 +20,6 @@
20#include <asm/idle.h> 20#include <asm/idle.h>
21#include <asm/uaccess.h> 21#include <asm/uaccess.h>
22#include <asm/i387.h> 22#include <asm/i387.h>
23#include <asm/ds.h>
24#include <asm/debugreg.h> 23#include <asm/debugreg.h>
25 24
26unsigned long idle_halt; 25unsigned long idle_halt;
@@ -32,26 +31,22 @@ struct kmem_cache *task_xstate_cachep;
32 31
33int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) 32int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
34{ 33{
34 int ret;
35
35 *dst = *src; 36 *dst = *src;
36 if (src->thread.xstate) { 37 if (fpu_allocated(&src->thread.fpu)) {
37 dst->thread.xstate = kmem_cache_alloc(task_xstate_cachep, 38 memset(&dst->thread.fpu, 0, sizeof(dst->thread.fpu));
38 GFP_KERNEL); 39 ret = fpu_alloc(&dst->thread.fpu);
39 if (!dst->thread.xstate) 40 if (ret)
40 return -ENOMEM; 41 return ret;
41 WARN_ON((unsigned long)dst->thread.xstate & 15); 42 fpu_copy(&dst->thread.fpu, &src->thread.fpu);
42 memcpy(dst->thread.xstate, src->thread.xstate, xstate_size);
43 } 43 }
44 return 0; 44 return 0;
45} 45}
46 46
47void free_thread_xstate(struct task_struct *tsk) 47void free_thread_xstate(struct task_struct *tsk)
48{ 48{
49 if (tsk->thread.xstate) { 49 fpu_free(&tsk->thread.fpu);
50 kmem_cache_free(task_xstate_cachep, tsk->thread.xstate);
51 tsk->thread.xstate = NULL;
52 }
53
54 WARN(tsk->thread.ds_ctx, "leaking DS context\n");
55} 50}
56 51
57void free_thread_info(struct thread_info *ti) 52void free_thread_info(struct thread_info *ti)
@@ -198,11 +193,16 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
198 prev = &prev_p->thread; 193 prev = &prev_p->thread;
199 next = &next_p->thread; 194 next = &next_p->thread;
200 195
201 if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) || 196 if (test_tsk_thread_flag(prev_p, TIF_BLOCKSTEP) ^
202 test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR)) 197 test_tsk_thread_flag(next_p, TIF_BLOCKSTEP)) {
203 ds_switch_to(prev_p, next_p); 198 unsigned long debugctl = get_debugctlmsr();
204 else if (next->debugctlmsr != prev->debugctlmsr) 199
205 update_debugctlmsr(next->debugctlmsr); 200 debugctl &= ~DEBUGCTLMSR_BTF;
201 if (test_tsk_thread_flag(next_p, TIF_BLOCKSTEP))
202 debugctl |= DEBUGCTLMSR_BTF;
203
204 update_debugctlmsr(debugctl);
205 }
206 206
207 if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^ 207 if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
208 test_tsk_thread_flag(next_p, TIF_NOTSC)) { 208 test_tsk_thread_flag(next_p, TIF_NOTSC)) {
@@ -546,11 +546,13 @@ static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
546 * check OSVW bit for CPUs that are not affected 546 * check OSVW bit for CPUs that are not affected
547 * by erratum #400 547 * by erratum #400
548 */ 548 */
549 rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, val); 549 if (cpu_has(c, X86_FEATURE_OSVW)) {
550 if (val >= 2) { 550 rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, val);
551 rdmsrl(MSR_AMD64_OSVW_STATUS, val); 551 if (val >= 2) {
552 if (!(val & BIT(1))) 552 rdmsrl(MSR_AMD64_OSVW_STATUS, val);
553 goto no_c1e_idle; 553 if (!(val & BIT(1)))
554 goto no_c1e_idle;
555 }
554 } 556 }
555 return 1; 557 return 1;
556 } 558 }
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index f6c62667e30c..8d128783af47 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -55,7 +55,6 @@
55#include <asm/cpu.h> 55#include <asm/cpu.h>
56#include <asm/idle.h> 56#include <asm/idle.h>
57#include <asm/syscalls.h> 57#include <asm/syscalls.h>
58#include <asm/ds.h>
59#include <asm/debugreg.h> 58#include <asm/debugreg.h>
60 59
61asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); 60asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
@@ -238,13 +237,6 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
238 kfree(p->thread.io_bitmap_ptr); 237 kfree(p->thread.io_bitmap_ptr);
239 p->thread.io_bitmap_max = 0; 238 p->thread.io_bitmap_max = 0;
240 } 239 }
241
242 clear_tsk_thread_flag(p, TIF_DS_AREA_MSR);
243 p->thread.ds_ctx = NULL;
244
245 clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
246 p->thread.debugctlmsr = 0;
247
248 return err; 240 return err;
249} 241}
250 242
@@ -317,7 +309,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
317 309
318 /* we're going to use this soon, after a few expensive things */ 310 /* we're going to use this soon, after a few expensive things */
319 if (preload_fpu) 311 if (preload_fpu)
320 prefetch(next->xstate); 312 prefetch(next->fpu.state);
321 313
322 /* 314 /*
323 * Reload esp0. 315 * Reload esp0.
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 17cb3295cbf7..3c2422a99f1f 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -49,7 +49,6 @@
49#include <asm/ia32.h> 49#include <asm/ia32.h>
50#include <asm/idle.h> 50#include <asm/idle.h>
51#include <asm/syscalls.h> 51#include <asm/syscalls.h>
52#include <asm/ds.h>
53#include <asm/debugreg.h> 52#include <asm/debugreg.h>
54 53
55asmlinkage extern void ret_from_fork(void); 54asmlinkage extern void ret_from_fork(void);
@@ -313,13 +312,6 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
313 if (err) 312 if (err)
314 goto out; 313 goto out;
315 } 314 }
316
317 clear_tsk_thread_flag(p, TIF_DS_AREA_MSR);
318 p->thread.ds_ctx = NULL;
319
320 clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
321 p->thread.debugctlmsr = 0;
322
323 err = 0; 315 err = 0;
324out: 316out:
325 if (err && p->thread.io_bitmap_ptr) { 317 if (err && p->thread.io_bitmap_ptr) {
@@ -396,7 +388,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
396 388
397 /* we're going to use this soon, after a few expensive things */ 389 /* we're going to use this soon, after a few expensive things */
398 if (preload_fpu) 390 if (preload_fpu)
399 prefetch(next->xstate); 391 prefetch(next->fpu.state);
400 392
401 /* 393 /*
402 * Reload esp0, LDT and the page table pointer: 394 * Reload esp0, LDT and the page table pointer:
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 2e9b55027b7e..70c4872cd8aa 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -2,9 +2,6 @@
2/* 2/*
3 * Pentium III FXSR, SSE support 3 * Pentium III FXSR, SSE support
4 * Gareth Hughes <gareth@valinux.com>, May 2000 4 * Gareth Hughes <gareth@valinux.com>, May 2000
5 *
6 * BTS tracing
7 * Markus Metzger <markus.t.metzger@intel.com>, Dec 2007
8 */ 5 */
9 6
10#include <linux/kernel.h> 7#include <linux/kernel.h>
@@ -22,7 +19,6 @@
22#include <linux/audit.h> 19#include <linux/audit.h>
23#include <linux/seccomp.h> 20#include <linux/seccomp.h>
24#include <linux/signal.h> 21#include <linux/signal.h>
25#include <linux/workqueue.h>
26#include <linux/perf_event.h> 22#include <linux/perf_event.h>
27#include <linux/hw_breakpoint.h> 23#include <linux/hw_breakpoint.h>
28 24
@@ -36,7 +32,6 @@
36#include <asm/desc.h> 32#include <asm/desc.h>
37#include <asm/prctl.h> 33#include <asm/prctl.h>
38#include <asm/proto.h> 34#include <asm/proto.h>
39#include <asm/ds.h>
40#include <asm/hw_breakpoint.h> 35#include <asm/hw_breakpoint.h>
41 36
42#include "tls.h" 37#include "tls.h"
@@ -693,7 +688,7 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
693 struct perf_event_attr attr; 688 struct perf_event_attr attr;
694 689
695 if (!t->ptrace_bps[nr]) { 690 if (!t->ptrace_bps[nr]) {
696 hw_breakpoint_init(&attr); 691 ptrace_breakpoint_init(&attr);
697 /* 692 /*
698 * Put stub len and type to register (reserve) an inactive but 693 * Put stub len and type to register (reserve) an inactive but
699 * correct bp 694 * correct bp
@@ -789,342 +784,6 @@ static int ioperm_get(struct task_struct *target,
789 0, IO_BITMAP_BYTES); 784 0, IO_BITMAP_BYTES);
790} 785}
791 786
792#ifdef CONFIG_X86_PTRACE_BTS
793/*
794 * A branch trace store context.
795 *
796 * Contexts may only be installed by ptrace_bts_config() and only for
797 * ptraced tasks.
798 *
799 * Contexts are destroyed when the tracee is detached from the tracer.
800 * The actual destruction work requires interrupts enabled, so the
801 * work is deferred and will be scheduled during __ptrace_unlink().
802 *
803 * Contexts hold an additional task_struct reference on the traced
804 * task, as well as a reference on the tracer's mm.
805 *
806 * Ptrace already holds a task_struct for the duration of ptrace operations,
807 * but since destruction is deferred, it may be executed after both
808 * tracer and tracee exited.
809 */
810struct bts_context {
811 /* The branch trace handle. */
812 struct bts_tracer *tracer;
813
814 /* The buffer used to store the branch trace and its size. */
815 void *buffer;
816 unsigned int size;
817
818 /* The mm that paid for the above buffer. */
819 struct mm_struct *mm;
820
821 /* The task this context belongs to. */
822 struct task_struct *task;
823
824 /* The signal to send on a bts buffer overflow. */
825 unsigned int bts_ovfl_signal;
826
827 /* The work struct to destroy a context. */
828 struct work_struct work;
829};
830
831static int alloc_bts_buffer(struct bts_context *context, unsigned int size)
832{
833 void *buffer = NULL;
834 int err = -ENOMEM;
835
836 err = account_locked_memory(current->mm, current->signal->rlim, size);
837 if (err < 0)
838 return err;
839
840 buffer = kzalloc(size, GFP_KERNEL);
841 if (!buffer)
842 goto out_refund;
843
844 context->buffer = buffer;
845 context->size = size;
846 context->mm = get_task_mm(current);
847
848 return 0;
849
850 out_refund:
851 refund_locked_memory(current->mm, size);
852 return err;
853}
854
855static inline void free_bts_buffer(struct bts_context *context)
856{
857 if (!context->buffer)
858 return;
859
860 kfree(context->buffer);
861 context->buffer = NULL;
862
863 refund_locked_memory(context->mm, context->size);
864 context->size = 0;
865
866 mmput(context->mm);
867 context->mm = NULL;
868}
869
870static void free_bts_context_work(struct work_struct *w)
871{
872 struct bts_context *context;
873
874 context = container_of(w, struct bts_context, work);
875
876 ds_release_bts(context->tracer);
877 put_task_struct(context->task);
878 free_bts_buffer(context);
879 kfree(context);
880}
881
882static inline void free_bts_context(struct bts_context *context)
883{
884 INIT_WORK(&context->work, free_bts_context_work);
885 schedule_work(&context->work);
886}
887
888static inline struct bts_context *alloc_bts_context(struct task_struct *task)
889{
890 struct bts_context *context = kzalloc(sizeof(*context), GFP_KERNEL);
891 if (context) {
892 context->task = task;
893 task->bts = context;
894
895 get_task_struct(task);
896 }
897
898 return context;
899}
900
901static int ptrace_bts_read_record(struct task_struct *child, size_t index,
902 struct bts_struct __user *out)
903{
904 struct bts_context *context;
905 const struct bts_trace *trace;
906 struct bts_struct bts;
907 const unsigned char *at;
908 int error;
909
910 context = child->bts;
911 if (!context)
912 return -ESRCH;
913
914 trace = ds_read_bts(context->tracer);
915 if (!trace)
916 return -ESRCH;
917
918 at = trace->ds.top - ((index + 1) * trace->ds.size);
919 if ((void *)at < trace->ds.begin)
920 at += (trace->ds.n * trace->ds.size);
921
922 if (!trace->read)
923 return -EOPNOTSUPP;
924
925 error = trace->read(context->tracer, at, &bts);
926 if (error < 0)
927 return error;
928
929 if (copy_to_user(out, &bts, sizeof(bts)))
930 return -EFAULT;
931
932 return sizeof(bts);
933}
934
935static int ptrace_bts_drain(struct task_struct *child,
936 long size,
937 struct bts_struct __user *out)
938{
939 struct bts_context *context;
940 const struct bts_trace *trace;
941 const unsigned char *at;
942 int error, drained = 0;
943
944 context = child->bts;
945 if (!context)
946 return -ESRCH;
947
948 trace = ds_read_bts(context->tracer);
949 if (!trace)
950 return -ESRCH;
951
952 if (!trace->read)
953 return -EOPNOTSUPP;
954
955 if (size < (trace->ds.top - trace->ds.begin))
956 return -EIO;
957
958 for (at = trace->ds.begin; (void *)at < trace->ds.top;
959 out++, drained++, at += trace->ds.size) {
960 struct bts_struct bts;
961
962 error = trace->read(context->tracer, at, &bts);
963 if (error < 0)
964 return error;
965
966 if (copy_to_user(out, &bts, sizeof(bts)))
967 return -EFAULT;
968 }
969
970 memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size);
971
972 error = ds_reset_bts(context->tracer);
973 if (error < 0)
974 return error;
975
976 return drained;
977}
978
979static int ptrace_bts_config(struct task_struct *child,
980 long cfg_size,
981 const struct ptrace_bts_config __user *ucfg)
982{
983 struct bts_context *context;
984 struct ptrace_bts_config cfg;
985 unsigned int flags = 0;
986
987 if (cfg_size < sizeof(cfg))
988 return -EIO;
989
990 if (copy_from_user(&cfg, ucfg, sizeof(cfg)))
991 return -EFAULT;
992
993 context = child->bts;
994 if (!context)
995 context = alloc_bts_context(child);
996 if (!context)
997 return -ENOMEM;
998
999 if (cfg.flags & PTRACE_BTS_O_SIGNAL) {
1000 if (!cfg.signal)
1001 return -EINVAL;
1002
1003 return -EOPNOTSUPP;
1004 context->bts_ovfl_signal = cfg.signal;
1005 }
1006
1007 ds_release_bts(context->tracer);
1008 context->tracer = NULL;
1009
1010 if ((cfg.flags & PTRACE_BTS_O_ALLOC) && (cfg.size != context->size)) {
1011 int err;
1012
1013 free_bts_buffer(context);
1014 if (!cfg.size)
1015 return 0;
1016
1017 err = alloc_bts_buffer(context, cfg.size);
1018 if (err < 0)
1019 return err;
1020 }
1021
1022 if (cfg.flags & PTRACE_BTS_O_TRACE)
1023 flags |= BTS_USER;
1024
1025 if (cfg.flags & PTRACE_BTS_O_SCHED)
1026 flags |= BTS_TIMESTAMPS;
1027
1028 context->tracer =
1029 ds_request_bts_task(child, context->buffer, context->size,
1030 NULL, (size_t)-1, flags);
1031 if (unlikely(IS_ERR(context->tracer))) {
1032 int error = PTR_ERR(context->tracer);
1033
1034 free_bts_buffer(context);
1035 context->tracer = NULL;
1036 return error;
1037 }
1038
1039 return sizeof(cfg);
1040}
1041
1042static int ptrace_bts_status(struct task_struct *child,
1043 long cfg_size,
1044 struct ptrace_bts_config __user *ucfg)
1045{
1046 struct bts_context *context;
1047 const struct bts_trace *trace;
1048 struct ptrace_bts_config cfg;
1049
1050 context = child->bts;
1051 if (!context)
1052 return -ESRCH;
1053
1054 if (cfg_size < sizeof(cfg))
1055 return -EIO;
1056
1057 trace = ds_read_bts(context->tracer);
1058 if (!trace)
1059 return -ESRCH;
1060
1061 memset(&cfg, 0, sizeof(cfg));
1062 cfg.size = trace->ds.end - trace->ds.begin;
1063 cfg.signal = context->bts_ovfl_signal;
1064 cfg.bts_size = sizeof(struct bts_struct);
1065
1066 if (cfg.signal)
1067 cfg.flags |= PTRACE_BTS_O_SIGNAL;
1068
1069 if (trace->ds.flags & BTS_USER)
1070 cfg.flags |= PTRACE_BTS_O_TRACE;
1071
1072 if (trace->ds.flags & BTS_TIMESTAMPS)
1073 cfg.flags |= PTRACE_BTS_O_SCHED;
1074
1075 if (copy_to_user(ucfg, &cfg, sizeof(cfg)))
1076 return -EFAULT;
1077
1078 return sizeof(cfg);
1079}
1080
1081static int ptrace_bts_clear(struct task_struct *child)
1082{
1083 struct bts_context *context;
1084 const struct bts_trace *trace;
1085
1086 context = child->bts;
1087 if (!context)
1088 return -ESRCH;
1089
1090 trace = ds_read_bts(context->tracer);
1091 if (!trace)
1092 return -ESRCH;
1093
1094 memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size);
1095
1096 return ds_reset_bts(context->tracer);
1097}
1098
1099static int ptrace_bts_size(struct task_struct *child)
1100{
1101 struct bts_context *context;
1102 const struct bts_trace *trace;
1103
1104 context = child->bts;
1105 if (!context)
1106 return -ESRCH;
1107
1108 trace = ds_read_bts(context->tracer);
1109 if (!trace)
1110 return -ESRCH;
1111
1112 return (trace->ds.top - trace->ds.begin) / trace->ds.size;
1113}
1114
1115/*
1116 * Called from __ptrace_unlink() after the child has been moved back
1117 * to its original parent.
1118 */
1119void ptrace_bts_untrace(struct task_struct *child)
1120{
1121 if (unlikely(child->bts)) {
1122 free_bts_context(child->bts);
1123 child->bts = NULL;
1124 }
1125}
1126#endif /* CONFIG_X86_PTRACE_BTS */
1127
1128/* 787/*
1129 * Called by kernel/ptrace.c when detaching.. 788 * Called by kernel/ptrace.c when detaching..
1130 * 789 *
@@ -1252,39 +911,6 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
1252 break; 911 break;
1253#endif 912#endif
1254 913
1255 /*
1256 * These bits need more cooking - not enabled yet:
1257 */
1258#ifdef CONFIG_X86_PTRACE_BTS
1259 case PTRACE_BTS_CONFIG:
1260 ret = ptrace_bts_config
1261 (child, data, (struct ptrace_bts_config __user *)addr);
1262 break;
1263
1264 case PTRACE_BTS_STATUS:
1265 ret = ptrace_bts_status
1266 (child, data, (struct ptrace_bts_config __user *)addr);
1267 break;
1268
1269 case PTRACE_BTS_SIZE:
1270 ret = ptrace_bts_size(child);
1271 break;
1272
1273 case PTRACE_BTS_GET:
1274 ret = ptrace_bts_read_record
1275 (child, data, (struct bts_struct __user *) addr);
1276 break;
1277
1278 case PTRACE_BTS_CLEAR:
1279 ret = ptrace_bts_clear(child);
1280 break;
1281
1282 case PTRACE_BTS_DRAIN:
1283 ret = ptrace_bts_drain
1284 (child, data, (struct bts_struct __user *) addr);
1285 break;
1286#endif /* CONFIG_X86_PTRACE_BTS */
1287
1288 default: 914 default:
1289 ret = ptrace_request(child, request, addr, data); 915 ret = ptrace_request(child, request, addr, data);
1290 break; 916 break;
@@ -1544,14 +1170,6 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
1544 1170
1545 case PTRACE_GET_THREAD_AREA: 1171 case PTRACE_GET_THREAD_AREA:
1546 case PTRACE_SET_THREAD_AREA: 1172 case PTRACE_SET_THREAD_AREA:
1547#ifdef CONFIG_X86_PTRACE_BTS
1548 case PTRACE_BTS_CONFIG:
1549 case PTRACE_BTS_STATUS:
1550 case PTRACE_BTS_SIZE:
1551 case PTRACE_BTS_GET:
1552 case PTRACE_BTS_CLEAR:
1553 case PTRACE_BTS_DRAIN:
1554#endif /* CONFIG_X86_PTRACE_BTS */
1555 return arch_ptrace(child, request, addr, data); 1173 return arch_ptrace(child, request, addr, data);
1556 1174
1557 default: 1175 default:
diff --git a/arch/x86/kernel/sfi.c b/arch/x86/kernel/sfi.c
index 34e099382651..7ded57896c0a 100644
--- a/arch/x86/kernel/sfi.c
+++ b/arch/x86/kernel/sfi.c
@@ -81,7 +81,6 @@ static int __init sfi_parse_cpus(struct sfi_table_header *table)
81#endif /* CONFIG_X86_LOCAL_APIC */ 81#endif /* CONFIG_X86_LOCAL_APIC */
82 82
83#ifdef CONFIG_X86_IO_APIC 83#ifdef CONFIG_X86_IO_APIC
84static u32 gsi_base;
85 84
86static int __init sfi_parse_ioapic(struct sfi_table_header *table) 85static int __init sfi_parse_ioapic(struct sfi_table_header *table)
87{ 86{
@@ -94,8 +93,7 @@ static int __init sfi_parse_ioapic(struct sfi_table_header *table)
94 pentry = (struct sfi_apic_table_entry *)sb->pentry; 93 pentry = (struct sfi_apic_table_entry *)sb->pentry;
95 94
96 for (i = 0; i < num; i++) { 95 for (i = 0; i < num; i++) {
97 mp_register_ioapic(i, pentry->phys_addr, gsi_base); 96 mp_register_ioapic(i, pentry->phys_addr, gsi_end + 1);
98 gsi_base += io_apic_get_redir_entries(i);
99 pentry++; 97 pentry++;
100 } 98 }
101 99
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index 3149032ff107..58de45ee08b6 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -158,22 +158,6 @@ static int enable_single_step(struct task_struct *child)
158} 158}
159 159
160/* 160/*
161 * Install this value in MSR_IA32_DEBUGCTLMSR whenever child is running.
162 */
163static void write_debugctlmsr(struct task_struct *child, unsigned long val)
164{
165 if (child->thread.debugctlmsr == val)
166 return;
167
168 child->thread.debugctlmsr = val;
169
170 if (child != current)
171 return;
172
173 update_debugctlmsr(val);
174}
175
176/*
177 * Enable single or block step. 161 * Enable single or block step.
178 */ 162 */
179static void enable_step(struct task_struct *child, bool block) 163static void enable_step(struct task_struct *child, bool block)
@@ -186,15 +170,17 @@ static void enable_step(struct task_struct *child, bool block)
186 * that uses user-mode single stepping itself. 170 * that uses user-mode single stepping itself.
187 */ 171 */
188 if (enable_single_step(child) && block) { 172 if (enable_single_step(child) && block) {
189 set_tsk_thread_flag(child, TIF_DEBUGCTLMSR); 173 unsigned long debugctl = get_debugctlmsr();
190 write_debugctlmsr(child, 174
191 child->thread.debugctlmsr | DEBUGCTLMSR_BTF); 175 debugctl |= DEBUGCTLMSR_BTF;
192 } else { 176 update_debugctlmsr(debugctl);
193 write_debugctlmsr(child, 177 set_tsk_thread_flag(child, TIF_BLOCKSTEP);
194 child->thread.debugctlmsr & ~DEBUGCTLMSR_BTF); 178 } else if (test_tsk_thread_flag(child, TIF_BLOCKSTEP)) {
195 179 unsigned long debugctl = get_debugctlmsr();
196 if (!child->thread.debugctlmsr) 180
197 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); 181 debugctl &= ~DEBUGCTLMSR_BTF;
182 update_debugctlmsr(debugctl);
183 clear_tsk_thread_flag(child, TIF_BLOCKSTEP);
198 } 184 }
199} 185}
200 186
@@ -213,11 +199,13 @@ void user_disable_single_step(struct task_struct *child)
213 /* 199 /*
214 * Make sure block stepping (BTF) is disabled. 200 * Make sure block stepping (BTF) is disabled.
215 */ 201 */
216 write_debugctlmsr(child, 202 if (test_tsk_thread_flag(child, TIF_BLOCKSTEP)) {
217 child->thread.debugctlmsr & ~DEBUGCTLMSR_BTF); 203 unsigned long debugctl = get_debugctlmsr();
218 204
219 if (!child->thread.debugctlmsr) 205 debugctl &= ~DEBUGCTLMSR_BTF;
220 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); 206 update_debugctlmsr(debugctl);
207 clear_tsk_thread_flag(child, TIF_BLOCKSTEP);
208 }
221 209
222 /* Always clear TIF_SINGLESTEP... */ 210 /* Always clear TIF_SINGLESTEP... */
223 clear_tsk_thread_flag(child, TIF_SINGLESTEP); 211 clear_tsk_thread_flag(child, TIF_SINGLESTEP);
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index 86c9f91b48ae..cc2c60474fd0 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -175,6 +175,9 @@ static void add_mac_region(phys_addr_t start, unsigned long size)
175 struct tboot_mac_region *mr; 175 struct tboot_mac_region *mr;
176 phys_addr_t end = start + size; 176 phys_addr_t end = start + size;
177 177
178 if (tboot->num_mac_regions >= MAX_TB_MAC_REGIONS)
179 panic("tboot: Too many MAC regions\n");
180
178 if (start && size) { 181 if (start && size) {
179 mr = &tboot->mac_regions[tboot->num_mac_regions++]; 182 mr = &tboot->mac_regions[tboot->num_mac_regions++];
180 mr->start = round_down(start, PAGE_SIZE); 183 mr->start = round_down(start, PAGE_SIZE);
@@ -184,18 +187,17 @@ static void add_mac_region(phys_addr_t start, unsigned long size)
184 187
185static int tboot_setup_sleep(void) 188static int tboot_setup_sleep(void)
186{ 189{
190 int i;
191
187 tboot->num_mac_regions = 0; 192 tboot->num_mac_regions = 0;
188 193
189 /* S3 resume code */ 194 for (i = 0; i < e820.nr_map; i++) {
190 add_mac_region(acpi_wakeup_address, WAKEUP_SIZE); 195 if ((e820.map[i].type != E820_RAM)
196 && (e820.map[i].type != E820_RESERVED_KERN))
197 continue;
191 198
192#ifdef CONFIG_X86_TRAMPOLINE 199 add_mac_region(e820.map[i].addr, e820.map[i].size);
193 /* AP trampoline code */ 200 }
194 add_mac_region(virt_to_phys(trampoline_base), TRAMPOLINE_SIZE);
195#endif
196
197 /* kernel code + data + bss */
198 add_mac_region(virt_to_phys(_text), _end - _text);
199 201
200 tboot->acpi_sinfo.kernel_s3_resume_vector = acpi_wakeup_address; 202 tboot->acpi_sinfo.kernel_s3_resume_vector = acpi_wakeup_address;
201 203
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index 17b03dd3a6b5..7fea555929e2 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * SGI UltraViolet TLB flush routines. 2 * SGI UltraViolet TLB flush routines.
3 * 3 *
4 * (c) 2008 Cliff Wickman <cpw@sgi.com>, SGI. 4 * (c) 2008-2010 Cliff Wickman <cpw@sgi.com>, SGI.
5 * 5 *
6 * This code is released under the GNU General Public License version 2 or 6 * This code is released under the GNU General Public License version 2 or
7 * later. 7 * later.
@@ -20,42 +20,67 @@
20#include <asm/idle.h> 20#include <asm/idle.h>
21#include <asm/tsc.h> 21#include <asm/tsc.h>
22#include <asm/irq_vectors.h> 22#include <asm/irq_vectors.h>
23#include <asm/timer.h>
23 24
24static struct bau_control **uv_bau_table_bases __read_mostly; 25struct msg_desc {
25static int uv_bau_retry_limit __read_mostly; 26 struct bau_payload_queue_entry *msg;
27 int msg_slot;
28 int sw_ack_slot;
29 struct bau_payload_queue_entry *va_queue_first;
30 struct bau_payload_queue_entry *va_queue_last;
31};
26 32
27/* base pnode in this partition */ 33#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD 0x000000000bUL
28static int uv_partition_base_pnode __read_mostly; 34
35static int uv_bau_max_concurrent __read_mostly;
36
37static int nobau;
38static int __init setup_nobau(char *arg)
39{
40 nobau = 1;
41 return 0;
42}
43early_param("nobau", setup_nobau);
29 44
30static unsigned long uv_mmask __read_mostly; 45/* base pnode in this partition */
46static int uv_partition_base_pnode __read_mostly;
47/* position of pnode (which is nasid>>1): */
48static int uv_nshift __read_mostly;
49static unsigned long uv_mmask __read_mostly;
31 50
32static DEFINE_PER_CPU(struct ptc_stats, ptcstats); 51static DEFINE_PER_CPU(struct ptc_stats, ptcstats);
33static DEFINE_PER_CPU(struct bau_control, bau_control); 52static DEFINE_PER_CPU(struct bau_control, bau_control);
53static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask);
54
55struct reset_args {
56 int sender;
57};
34 58
35/* 59/*
36 * Determine the first node on a blade. 60 * Determine the first node on a uvhub. 'Nodes' are used for kernel
61 * memory allocation.
37 */ 62 */
38static int __init blade_to_first_node(int blade) 63static int __init uvhub_to_first_node(int uvhub)
39{ 64{
40 int node, b; 65 int node, b;
41 66
42 for_each_online_node(node) { 67 for_each_online_node(node) {
43 b = uv_node_to_blade_id(node); 68 b = uv_node_to_blade_id(node);
44 if (blade == b) 69 if (uvhub == b)
45 return node; 70 return node;
46 } 71 }
47 return -1; /* shouldn't happen */ 72 return -1;
48} 73}
49 74
50/* 75/*
51 * Determine the apicid of the first cpu on a blade. 76 * Determine the apicid of the first cpu on a uvhub.
52 */ 77 */
53static int __init blade_to_first_apicid(int blade) 78static int __init uvhub_to_first_apicid(int uvhub)
54{ 79{
55 int cpu; 80 int cpu;
56 81
57 for_each_present_cpu(cpu) 82 for_each_present_cpu(cpu)
58 if (blade == uv_cpu_to_blade_id(cpu)) 83 if (uvhub == uv_cpu_to_blade_id(cpu))
59 return per_cpu(x86_cpu_to_apicid, cpu); 84 return per_cpu(x86_cpu_to_apicid, cpu);
60 return -1; 85 return -1;
61} 86}
@@ -68,195 +93,459 @@ static int __init blade_to_first_apicid(int blade)
68 * clear of the Timeout bit (as well) will free the resource. No reply will 93 * clear of the Timeout bit (as well) will free the resource. No reply will
69 * be sent (the hardware will only do one reply per message). 94 * be sent (the hardware will only do one reply per message).
70 */ 95 */
71static void uv_reply_to_message(int resource, 96static inline void uv_reply_to_message(struct msg_desc *mdp,
72 struct bau_payload_queue_entry *msg, 97 struct bau_control *bcp)
73 struct bau_msg_status *msp)
74{ 98{
75 unsigned long dw; 99 unsigned long dw;
100 struct bau_payload_queue_entry *msg;
76 101
77 dw = (1 << (resource + UV_SW_ACK_NPENDING)) | (1 << resource); 102 msg = mdp->msg;
103 if (!msg->canceled) {
104 dw = (msg->sw_ack_vector << UV_SW_ACK_NPENDING) |
105 msg->sw_ack_vector;
106 uv_write_local_mmr(
107 UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, dw);
108 }
78 msg->replied_to = 1; 109 msg->replied_to = 1;
79 msg->sw_ack_vector = 0; 110 msg->sw_ack_vector = 0;
80 if (msp)
81 msp->seen_by.bits = 0;
82 uv_write_local_mmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, dw);
83} 111}
84 112
85/* 113/*
86 * Do all the things a cpu should do for a TLB shootdown message. 114 * Process the receipt of a RETRY message
87 * Other cpu's may come here at the same time for this message.
88 */ 115 */
89static void uv_bau_process_message(struct bau_payload_queue_entry *msg, 116static inline void uv_bau_process_retry_msg(struct msg_desc *mdp,
90 int msg_slot, int sw_ack_slot) 117 struct bau_control *bcp)
91{ 118{
92 unsigned long this_cpu_mask; 119 int i;
93 struct bau_msg_status *msp; 120 int cancel_count = 0;
94 int cpu; 121 int slot2;
122 unsigned long msg_res;
123 unsigned long mmr = 0;
124 struct bau_payload_queue_entry *msg;
125 struct bau_payload_queue_entry *msg2;
126 struct ptc_stats *stat;
95 127
96 msp = __get_cpu_var(bau_control).msg_statuses + msg_slot; 128 msg = mdp->msg;
97 cpu = uv_blade_processor_id(); 129 stat = &per_cpu(ptcstats, bcp->cpu);
98 msg->number_of_cpus = 130 stat->d_retries++;
99 uv_blade_nr_online_cpus(uv_node_to_blade_id(numa_node_id())); 131 /*
100 this_cpu_mask = 1UL << cpu; 132 * cancel any message from msg+1 to the retry itself
101 if (msp->seen_by.bits & this_cpu_mask) 133 */
102 return; 134 for (msg2 = msg+1, i = 0; i < DEST_Q_SIZE; msg2++, i++) {
103 atomic_or_long(&msp->seen_by.bits, this_cpu_mask); 135 if (msg2 > mdp->va_queue_last)
136 msg2 = mdp->va_queue_first;
137 if (msg2 == msg)
138 break;
139
140 /* same conditions for cancellation as uv_do_reset */
141 if ((msg2->replied_to == 0) && (msg2->canceled == 0) &&
142 (msg2->sw_ack_vector) && ((msg2->sw_ack_vector &
143 msg->sw_ack_vector) == 0) &&
144 (msg2->sending_cpu == msg->sending_cpu) &&
145 (msg2->msg_type != MSG_NOOP)) {
146 slot2 = msg2 - mdp->va_queue_first;
147 mmr = uv_read_local_mmr
148 (UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
149 msg_res = ((msg2->sw_ack_vector << 8) |
150 msg2->sw_ack_vector);
151 /*
152 * This is a message retry; clear the resources held
153 * by the previous message only if they timed out.
154 * If it has not timed out we have an unexpected
155 * situation to report.
156 */
157 if (mmr & (msg_res << 8)) {
158 /*
159 * is the resource timed out?
160 * make everyone ignore the cancelled message.
161 */
162 msg2->canceled = 1;
163 stat->d_canceled++;
164 cancel_count++;
165 uv_write_local_mmr(
166 UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS,
167 (msg_res << 8) | msg_res);
168 } else
169 printk(KERN_INFO "note bau retry: no effect\n");
170 }
171 }
172 if (!cancel_count)
173 stat->d_nocanceled++;
174}
104 175
105 if (msg->replied_to == 1) 176/*
106 return; 177 * Do all the things a cpu should do for a TLB shootdown message.
178 * Other cpu's may come here at the same time for this message.
179 */
180static void uv_bau_process_message(struct msg_desc *mdp,
181 struct bau_control *bcp)
182{
183 int msg_ack_count;
184 short socket_ack_count = 0;
185 struct ptc_stats *stat;
186 struct bau_payload_queue_entry *msg;
187 struct bau_control *smaster = bcp->socket_master;
107 188
189 /*
190 * This must be a normal message, or retry of a normal message
191 */
192 msg = mdp->msg;
193 stat = &per_cpu(ptcstats, bcp->cpu);
108 if (msg->address == TLB_FLUSH_ALL) { 194 if (msg->address == TLB_FLUSH_ALL) {
109 local_flush_tlb(); 195 local_flush_tlb();
110 __get_cpu_var(ptcstats).alltlb++; 196 stat->d_alltlb++;
111 } else { 197 } else {
112 __flush_tlb_one(msg->address); 198 __flush_tlb_one(msg->address);
113 __get_cpu_var(ptcstats).onetlb++; 199 stat->d_onetlb++;
114 } 200 }
201 stat->d_requestee++;
202
203 /*
204 * One cpu on each uvhub has the additional job on a RETRY
205 * of releasing the resource held by the message that is
206 * being retried. That message is identified by sending
207 * cpu number.
208 */
209 if (msg->msg_type == MSG_RETRY && bcp == bcp->uvhub_master)
210 uv_bau_process_retry_msg(mdp, bcp);
115 211
116 __get_cpu_var(ptcstats).requestee++; 212 /*
213 * This is a sw_ack message, so we have to reply to it.
214 * Count each responding cpu on the socket. This avoids
215 * pinging the count's cache line back and forth between
216 * the sockets.
217 */
218 socket_ack_count = atomic_add_short_return(1, (struct atomic_short *)
219 &smaster->socket_acknowledge_count[mdp->msg_slot]);
220 if (socket_ack_count == bcp->cpus_in_socket) {
221 /*
222 * Both sockets dump their completed count total into
223 * the message's count.
224 */
225 smaster->socket_acknowledge_count[mdp->msg_slot] = 0;
226 msg_ack_count = atomic_add_short_return(socket_ack_count,
227 (struct atomic_short *)&msg->acknowledge_count);
228
229 if (msg_ack_count == bcp->cpus_in_uvhub) {
230 /*
231 * All cpus in uvhub saw it; reply
232 */
233 uv_reply_to_message(mdp, bcp);
234 }
235 }
117 236
118 atomic_inc_short(&msg->acknowledge_count); 237 return;
119 if (msg->number_of_cpus == msg->acknowledge_count)
120 uv_reply_to_message(sw_ack_slot, msg, msp);
121} 238}
122 239
123/* 240/*
124 * Examine the payload queue on one distribution node to see 241 * Determine the first cpu on a uvhub.
125 * which messages have not been seen, and which cpu(s) have not seen them. 242 */
243static int uvhub_to_first_cpu(int uvhub)
244{
245 int cpu;
246 for_each_present_cpu(cpu)
247 if (uvhub == uv_cpu_to_blade_id(cpu))
248 return cpu;
249 return -1;
250}
251
252/*
253 * Last resort when we get a large number of destination timeouts is
254 * to clear resources held by a given cpu.
255 * Do this with IPI so that all messages in the BAU message queue
256 * can be identified by their nonzero sw_ack_vector field.
126 * 257 *
127 * Returns the number of cpu's that have not responded. 258 * This is entered for a single cpu on the uvhub.
259 * The sender want's this uvhub to free a specific message's
260 * sw_ack resources.
128 */ 261 */
129static int uv_examine_destination(struct bau_control *bau_tablesp, int sender) 262static void
263uv_do_reset(void *ptr)
130{ 264{
131 struct bau_payload_queue_entry *msg;
132 struct bau_msg_status *msp;
133 int count = 0;
134 int i; 265 int i;
135 int j; 266 int slot;
267 int count = 0;
268 unsigned long mmr;
269 unsigned long msg_res;
270 struct bau_control *bcp;
271 struct reset_args *rap;
272 struct bau_payload_queue_entry *msg;
273 struct ptc_stats *stat;
136 274
137 for (msg = bau_tablesp->va_queue_first, i = 0; i < DEST_Q_SIZE; 275 bcp = &per_cpu(bau_control, smp_processor_id());
138 msg++, i++) { 276 rap = (struct reset_args *)ptr;
139 if ((msg->sending_cpu == sender) && (!msg->replied_to)) { 277 stat = &per_cpu(ptcstats, bcp->cpu);
140 msp = bau_tablesp->msg_statuses + i; 278 stat->d_resets++;
141 printk(KERN_DEBUG 279
142 "blade %d: address:%#lx %d of %d, not cpu(s): ", 280 /*
143 i, msg->address, msg->acknowledge_count, 281 * We're looking for the given sender, and
144 msg->number_of_cpus); 282 * will free its sw_ack resource.
145 for (j = 0; j < msg->number_of_cpus; j++) { 283 * If all cpu's finally responded after the timeout, its
146 if (!((1L << j) & msp->seen_by.bits)) { 284 * message 'replied_to' was set.
147 count++; 285 */
148 printk("%d ", j); 286 for (msg = bcp->va_queue_first, i = 0; i < DEST_Q_SIZE; msg++, i++) {
149 } 287 /* uv_do_reset: same conditions for cancellation as
288 uv_bau_process_retry_msg() */
289 if ((msg->replied_to == 0) &&
290 (msg->canceled == 0) &&
291 (msg->sending_cpu == rap->sender) &&
292 (msg->sw_ack_vector) &&
293 (msg->msg_type != MSG_NOOP)) {
294 /*
295 * make everyone else ignore this message
296 */
297 msg->canceled = 1;
298 slot = msg - bcp->va_queue_first;
299 count++;
300 /*
301 * only reset the resource if it is still pending
302 */
303 mmr = uv_read_local_mmr
304 (UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
305 msg_res = ((msg->sw_ack_vector << 8) |
306 msg->sw_ack_vector);
307 if (mmr & msg_res) {
308 stat->d_rcanceled++;
309 uv_write_local_mmr(
310 UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS,
311 msg_res);
150 } 312 }
151 printk("\n");
152 } 313 }
153 } 314 }
154 return count; 315 return;
155} 316}
156 317
157/* 318/*
158 * Examine the payload queue on all the distribution nodes to see 319 * Use IPI to get all target uvhubs to release resources held by
159 * which messages have not been seen, and which cpu(s) have not seen them. 320 * a given sending cpu number.
160 *
161 * Returns the number of cpu's that have not responded.
162 */ 321 */
163static int uv_examine_destinations(struct bau_target_nodemask *distribution) 322static void uv_reset_with_ipi(struct bau_target_uvhubmask *distribution,
323 int sender)
164{ 324{
165 int sender; 325 int uvhub;
166 int i; 326 int cpu;
167 int count = 0; 327 cpumask_t mask;
328 struct reset_args reset_args;
329
330 reset_args.sender = sender;
168 331
169 sender = smp_processor_id(); 332 cpus_clear(mask);
170 for (i = 0; i < sizeof(struct bau_target_nodemask) * BITSPERBYTE; i++) { 333 /* find a single cpu for each uvhub in this distribution mask */
171 if (!bau_node_isset(i, distribution)) 334 for (uvhub = 0;
335 uvhub < sizeof(struct bau_target_uvhubmask) * BITSPERBYTE;
336 uvhub++) {
337 if (!bau_uvhub_isset(uvhub, distribution))
172 continue; 338 continue;
173 count += uv_examine_destination(uv_bau_table_bases[i], sender); 339 /* find a cpu for this uvhub */
340 cpu = uvhub_to_first_cpu(uvhub);
341 cpu_set(cpu, mask);
174 } 342 }
175 return count; 343 /* IPI all cpus; Preemption is already disabled */
344 smp_call_function_many(&mask, uv_do_reset, (void *)&reset_args, 1);
345 return;
346}
347
348static inline unsigned long
349cycles_2_us(unsigned long long cyc)
350{
351 unsigned long long ns;
352 unsigned long us;
353 ns = (cyc * per_cpu(cyc2ns, smp_processor_id()))
354 >> CYC2NS_SCALE_FACTOR;
355 us = ns / 1000;
356 return us;
176} 357}
177 358
178/* 359/*
179 * wait for completion of a broadcast message 360 * wait for all cpus on this hub to finish their sends and go quiet
180 * 361 * leaves uvhub_quiesce set so that no new broadcasts are started by
181 * return COMPLETE, RETRY or GIVEUP 362 * bau_flush_send_and_wait()
363 */
364static inline void
365quiesce_local_uvhub(struct bau_control *hmaster)
366{
367 atomic_add_short_return(1, (struct atomic_short *)
368 &hmaster->uvhub_quiesce);
369}
370
371/*
372 * mark this quiet-requestor as done
373 */
374static inline void
375end_uvhub_quiesce(struct bau_control *hmaster)
376{
377 atomic_add_short_return(-1, (struct atomic_short *)
378 &hmaster->uvhub_quiesce);
379}
380
381/*
382 * Wait for completion of a broadcast software ack message
383 * return COMPLETE, RETRY(PLUGGED or TIMEOUT) or GIVEUP
182 */ 384 */
183static int uv_wait_completion(struct bau_desc *bau_desc, 385static int uv_wait_completion(struct bau_desc *bau_desc,
184 unsigned long mmr_offset, int right_shift) 386 unsigned long mmr_offset, int right_shift, int this_cpu,
387 struct bau_control *bcp, struct bau_control *smaster, long try)
185{ 388{
186 int exams = 0; 389 int relaxes = 0;
187 long destination_timeouts = 0;
188 long source_timeouts = 0;
189 unsigned long descriptor_status; 390 unsigned long descriptor_status;
391 unsigned long mmr;
392 unsigned long mask;
393 cycles_t ttime;
394 cycles_t timeout_time;
395 struct ptc_stats *stat = &per_cpu(ptcstats, this_cpu);
396 struct bau_control *hmaster;
397
398 hmaster = bcp->uvhub_master;
399 timeout_time = get_cycles() + bcp->timeout_interval;
190 400
401 /* spin on the status MMR, waiting for it to go idle */
191 while ((descriptor_status = (((unsigned long) 402 while ((descriptor_status = (((unsigned long)
192 uv_read_local_mmr(mmr_offset) >> 403 uv_read_local_mmr(mmr_offset) >>
193 right_shift) & UV_ACT_STATUS_MASK)) != 404 right_shift) & UV_ACT_STATUS_MASK)) !=
194 DESC_STATUS_IDLE) { 405 DESC_STATUS_IDLE) {
195 if (descriptor_status == DESC_STATUS_SOURCE_TIMEOUT) {
196 source_timeouts++;
197 if (source_timeouts > SOURCE_TIMEOUT_LIMIT)
198 source_timeouts = 0;
199 __get_cpu_var(ptcstats).s_retry++;
200 return FLUSH_RETRY;
201 }
202 /* 406 /*
203 * spin here looking for progress at the destinations 407 * Our software ack messages may be blocked because there are
408 * no swack resources available. As long as none of them
409 * has timed out hardware will NACK our message and its
410 * state will stay IDLE.
204 */ 411 */
205 if (descriptor_status == DESC_STATUS_DESTINATION_TIMEOUT) { 412 if (descriptor_status == DESC_STATUS_SOURCE_TIMEOUT) {
206 destination_timeouts++; 413 stat->s_stimeout++;
207 if (destination_timeouts > DESTINATION_TIMEOUT_LIMIT) { 414 return FLUSH_GIVEUP;
208 /* 415 } else if (descriptor_status ==
209 * returns number of cpus not responding 416 DESC_STATUS_DESTINATION_TIMEOUT) {
210 */ 417 stat->s_dtimeout++;
211 if (uv_examine_destinations 418 ttime = get_cycles();
212 (&bau_desc->distribution) == 0) { 419
213 __get_cpu_var(ptcstats).d_retry++; 420 /*
214 return FLUSH_RETRY; 421 * Our retries may be blocked by all destination
215 } 422 * swack resources being consumed, and a timeout
216 exams++; 423 * pending. In that case hardware returns the
217 if (exams >= uv_bau_retry_limit) { 424 * ERROR that looks like a destination timeout.
218 printk(KERN_DEBUG 425 */
219 "uv_flush_tlb_others"); 426 if (cycles_2_us(ttime - bcp->send_message) < BIOS_TO) {
220 printk("giving up on cpu %d\n", 427 bcp->conseccompletes = 0;
221 smp_processor_id()); 428 return FLUSH_RETRY_PLUGGED;
429 }
430
431 bcp->conseccompletes = 0;
432 return FLUSH_RETRY_TIMEOUT;
433 } else {
434 /*
435 * descriptor_status is still BUSY
436 */
437 cpu_relax();
438 relaxes++;
439 if (relaxes >= 10000) {
440 relaxes = 0;
441 if (get_cycles() > timeout_time) {
442 quiesce_local_uvhub(hmaster);
443
444 /* single-thread the register change */
445 spin_lock(&hmaster->masks_lock);
446 mmr = uv_read_local_mmr(mmr_offset);
447 mask = 0UL;
448 mask |= (3UL < right_shift);
449 mask = ~mask;
450 mmr &= mask;
451 uv_write_local_mmr(mmr_offset, mmr);
452 spin_unlock(&hmaster->masks_lock);
453 end_uvhub_quiesce(hmaster);
454 stat->s_busy++;
222 return FLUSH_GIVEUP; 455 return FLUSH_GIVEUP;
223 } 456 }
224 /*
225 * delays can hang the simulator
226 udelay(1000);
227 */
228 destination_timeouts = 0;
229 } 457 }
230 } 458 }
231 cpu_relax();
232 } 459 }
460 bcp->conseccompletes++;
233 return FLUSH_COMPLETE; 461 return FLUSH_COMPLETE;
234} 462}
235 463
464static inline cycles_t
465sec_2_cycles(unsigned long sec)
466{
467 unsigned long ns;
468 cycles_t cyc;
469
470 ns = sec * 1000000000;
471 cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id()));
472 return cyc;
473}
474
475/*
476 * conditionally add 1 to *v, unless *v is >= u
477 * return 0 if we cannot add 1 to *v because it is >= u
478 * return 1 if we can add 1 to *v because it is < u
479 * the add is atomic
480 *
481 * This is close to atomic_add_unless(), but this allows the 'u' value
482 * to be lowered below the current 'v'. atomic_add_unless can only stop
483 * on equal.
484 */
485static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u)
486{
487 spin_lock(lock);
488 if (atomic_read(v) >= u) {
489 spin_unlock(lock);
490 return 0;
491 }
492 atomic_inc(v);
493 spin_unlock(lock);
494 return 1;
495}
496
236/** 497/**
237 * uv_flush_send_and_wait 498 * uv_flush_send_and_wait
238 * 499 *
239 * Send a broadcast and wait for a broadcast message to complete. 500 * Send a broadcast and wait for it to complete.
240 * 501 *
241 * The flush_mask contains the cpus the broadcast was sent to. 502 * The flush_mask contains the cpus the broadcast is to be sent to, plus
503 * cpus that are on the local uvhub.
242 * 504 *
243 * Returns NULL if all remote flushing was done. The mask is zeroed. 505 * Returns NULL if all flushing represented in the mask was done. The mask
506 * is zeroed.
244 * Returns @flush_mask if some remote flushing remains to be done. The 507 * Returns @flush_mask if some remote flushing remains to be done. The
245 * mask will have some bits still set. 508 * mask will have some bits still set, representing any cpus on the local
509 * uvhub (not current cpu) and any on remote uvhubs if the broadcast failed.
246 */ 510 */
247const struct cpumask *uv_flush_send_and_wait(int cpu, int this_pnode, 511const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc,
248 struct bau_desc *bau_desc, 512 struct cpumask *flush_mask,
249 struct cpumask *flush_mask) 513 struct bau_control *bcp)
250{ 514{
251 int completion_status = 0;
252 int right_shift; 515 int right_shift;
253 int tries = 0; 516 int uvhub;
254 int pnode;
255 int bit; 517 int bit;
518 int completion_status = 0;
519 int seq_number = 0;
520 long try = 0;
521 int cpu = bcp->uvhub_cpu;
522 int this_cpu = bcp->cpu;
523 int this_uvhub = bcp->uvhub;
256 unsigned long mmr_offset; 524 unsigned long mmr_offset;
257 unsigned long index; 525 unsigned long index;
258 cycles_t time1; 526 cycles_t time1;
259 cycles_t time2; 527 cycles_t time2;
528 struct ptc_stats *stat = &per_cpu(ptcstats, bcp->cpu);
529 struct bau_control *smaster = bcp->socket_master;
530 struct bau_control *hmaster = bcp->uvhub_master;
531
532 /*
533 * Spin here while there are hmaster->max_concurrent or more active
534 * descriptors. This is the per-uvhub 'throttle'.
535 */
536 if (!atomic_inc_unless_ge(&hmaster->uvhub_lock,
537 &hmaster->active_descriptor_count,
538 hmaster->max_concurrent)) {
539 stat->s_throttles++;
540 do {
541 cpu_relax();
542 } while (!atomic_inc_unless_ge(&hmaster->uvhub_lock,
543 &hmaster->active_descriptor_count,
544 hmaster->max_concurrent));
545 }
546
547 while (hmaster->uvhub_quiesce)
548 cpu_relax();
260 549
261 if (cpu < UV_CPUS_PER_ACT_STATUS) { 550 if (cpu < UV_CPUS_PER_ACT_STATUS) {
262 mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0; 551 mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0;
@@ -268,24 +557,108 @@ const struct cpumask *uv_flush_send_and_wait(int cpu, int this_pnode,
268 } 557 }
269 time1 = get_cycles(); 558 time1 = get_cycles();
270 do { 559 do {
271 tries++; 560 /*
561 * Every message from any given cpu gets a unique message
562 * sequence number. But retries use that same number.
563 * Our message may have timed out at the destination because
564 * all sw-ack resources are in use and there is a timeout
565 * pending there. In that case, our last send never got
566 * placed into the queue and we need to persist until it
567 * does.
568 *
569 * Make any retry a type MSG_RETRY so that the destination will
570 * free any resource held by a previous message from this cpu.
571 */
572 if (try == 0) {
573 /* use message type set by the caller the first time */
574 seq_number = bcp->message_number++;
575 } else {
576 /* use RETRY type on all the rest; same sequence */
577 bau_desc->header.msg_type = MSG_RETRY;
578 stat->s_retry_messages++;
579 }
580 bau_desc->header.sequence = seq_number;
272 index = (1UL << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) | 581 index = (1UL << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) |
273 cpu; 582 bcp->uvhub_cpu;
583 bcp->send_message = get_cycles();
584
274 uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index); 585 uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index);
586
587 try++;
275 completion_status = uv_wait_completion(bau_desc, mmr_offset, 588 completion_status = uv_wait_completion(bau_desc, mmr_offset,
276 right_shift); 589 right_shift, this_cpu, bcp, smaster, try);
277 } while (completion_status == FLUSH_RETRY); 590
591 if (completion_status == FLUSH_RETRY_PLUGGED) {
592 /*
593 * Our retries may be blocked by all destination swack
594 * resources being consumed, and a timeout pending. In
595 * that case hardware immediately returns the ERROR
596 * that looks like a destination timeout.
597 */
598 udelay(TIMEOUT_DELAY);
599 bcp->plugged_tries++;
600 if (bcp->plugged_tries >= PLUGSB4RESET) {
601 bcp->plugged_tries = 0;
602 quiesce_local_uvhub(hmaster);
603 spin_lock(&hmaster->queue_lock);
604 uv_reset_with_ipi(&bau_desc->distribution,
605 this_cpu);
606 spin_unlock(&hmaster->queue_lock);
607 end_uvhub_quiesce(hmaster);
608 bcp->ipi_attempts++;
609 stat->s_resets_plug++;
610 }
611 } else if (completion_status == FLUSH_RETRY_TIMEOUT) {
612 hmaster->max_concurrent = 1;
613 bcp->timeout_tries++;
614 udelay(TIMEOUT_DELAY);
615 if (bcp->timeout_tries >= TIMEOUTSB4RESET) {
616 bcp->timeout_tries = 0;
617 quiesce_local_uvhub(hmaster);
618 spin_lock(&hmaster->queue_lock);
619 uv_reset_with_ipi(&bau_desc->distribution,
620 this_cpu);
621 spin_unlock(&hmaster->queue_lock);
622 end_uvhub_quiesce(hmaster);
623 bcp->ipi_attempts++;
624 stat->s_resets_timeout++;
625 }
626 }
627 if (bcp->ipi_attempts >= 3) {
628 bcp->ipi_attempts = 0;
629 completion_status = FLUSH_GIVEUP;
630 break;
631 }
632 cpu_relax();
633 } while ((completion_status == FLUSH_RETRY_PLUGGED) ||
634 (completion_status == FLUSH_RETRY_TIMEOUT));
278 time2 = get_cycles(); 635 time2 = get_cycles();
279 __get_cpu_var(ptcstats).sflush += (time2 - time1);
280 if (tries > 1)
281 __get_cpu_var(ptcstats).retriesok++;
282 636
283 if (completion_status == FLUSH_GIVEUP) { 637 if ((completion_status == FLUSH_COMPLETE) && (bcp->conseccompletes > 5)
638 && (hmaster->max_concurrent < hmaster->max_concurrent_constant))
639 hmaster->max_concurrent++;
640
641 /*
642 * hold any cpu not timing out here; no other cpu currently held by
643 * the 'throttle' should enter the activation code
644 */
645 while (hmaster->uvhub_quiesce)
646 cpu_relax();
647 atomic_dec(&hmaster->active_descriptor_count);
648
649 /* guard against cycles wrap */
650 if (time2 > time1)
651 stat->s_time += (time2 - time1);
652 else
653 stat->s_requestor--; /* don't count this one */
654 if (completion_status == FLUSH_COMPLETE && try > 1)
655 stat->s_retriesok++;
656 else if (completion_status == FLUSH_GIVEUP) {
284 /* 657 /*
285 * Cause the caller to do an IPI-style TLB shootdown on 658 * Cause the caller to do an IPI-style TLB shootdown on
286 * the cpu's, all of which are still in the mask. 659 * the target cpu's, all of which are still in the mask.
287 */ 660 */
288 __get_cpu_var(ptcstats).ptc_i++; 661 stat->s_giveup++;
289 return flush_mask; 662 return flush_mask;
290 } 663 }
291 664
@@ -294,18 +667,17 @@ const struct cpumask *uv_flush_send_and_wait(int cpu, int this_pnode,
294 * use the IPI method of shootdown on them. 667 * use the IPI method of shootdown on them.
295 */ 668 */
296 for_each_cpu(bit, flush_mask) { 669 for_each_cpu(bit, flush_mask) {
297 pnode = uv_cpu_to_pnode(bit); 670 uvhub = uv_cpu_to_blade_id(bit);
298 if (pnode == this_pnode) 671 if (uvhub == this_uvhub)
299 continue; 672 continue;
300 cpumask_clear_cpu(bit, flush_mask); 673 cpumask_clear_cpu(bit, flush_mask);
301 } 674 }
302 if (!cpumask_empty(flush_mask)) 675 if (!cpumask_empty(flush_mask))
303 return flush_mask; 676 return flush_mask;
677
304 return NULL; 678 return NULL;
305} 679}
306 680
307static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask);
308
309/** 681/**
310 * uv_flush_tlb_others - globally purge translation cache of a virtual 682 * uv_flush_tlb_others - globally purge translation cache of a virtual
311 * address or all TLB's 683 * address or all TLB's
@@ -322,8 +694,8 @@ static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask);
322 * The caller has derived the cpumask from the mm_struct. This function 694 * The caller has derived the cpumask from the mm_struct. This function
323 * is called only if there are bits set in the mask. (e.g. flush_tlb_page()) 695 * is called only if there are bits set in the mask. (e.g. flush_tlb_page())
324 * 696 *
325 * The cpumask is converted into a nodemask of the nodes containing 697 * The cpumask is converted into a uvhubmask of the uvhubs containing
326 * the cpus. 698 * those cpus.
327 * 699 *
328 * Note that this function should be called with preemption disabled. 700 * Note that this function should be called with preemption disabled.
329 * 701 *
@@ -335,52 +707,82 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
335 struct mm_struct *mm, 707 struct mm_struct *mm,
336 unsigned long va, unsigned int cpu) 708 unsigned long va, unsigned int cpu)
337{ 709{
338 struct cpumask *flush_mask = __get_cpu_var(uv_flush_tlb_mask); 710 int remotes;
339 int i; 711 int tcpu;
340 int bit; 712 int uvhub;
341 int pnode;
342 int uv_cpu;
343 int this_pnode;
344 int locals = 0; 713 int locals = 0;
345 struct bau_desc *bau_desc; 714 struct bau_desc *bau_desc;
715 struct cpumask *flush_mask;
716 struct ptc_stats *stat;
717 struct bau_control *bcp;
346 718
347 cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu)); 719 if (nobau)
720 return cpumask;
348 721
349 uv_cpu = uv_blade_processor_id(); 722 bcp = &per_cpu(bau_control, cpu);
350 this_pnode = uv_hub_info->pnode; 723 /*
351 bau_desc = __get_cpu_var(bau_control).descriptor_base; 724 * Each sending cpu has a per-cpu mask which it fills from the caller's
352 bau_desc += UV_ITEMS_PER_DESCRIPTOR * uv_cpu; 725 * cpu mask. Only remote cpus are converted to uvhubs and copied.
726 */
727 flush_mask = (struct cpumask *)per_cpu(uv_flush_tlb_mask, cpu);
728 /*
729 * copy cpumask to flush_mask, removing current cpu
730 * (current cpu should already have been flushed by the caller and
731 * should never be returned if we return flush_mask)
732 */
733 cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu));
734 if (cpu_isset(cpu, *cpumask))
735 locals++; /* current cpu was targeted */
353 736
354 bau_nodes_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE); 737 bau_desc = bcp->descriptor_base;
738 bau_desc += UV_ITEMS_PER_DESCRIPTOR * bcp->uvhub_cpu;
355 739
356 i = 0; 740 bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
357 for_each_cpu(bit, flush_mask) { 741 remotes = 0;
358 pnode = uv_cpu_to_pnode(bit); 742 for_each_cpu(tcpu, flush_mask) {
359 BUG_ON(pnode > (UV_DISTRIBUTION_SIZE - 1)); 743 uvhub = uv_cpu_to_blade_id(tcpu);
360 if (pnode == this_pnode) { 744 if (uvhub == bcp->uvhub) {
361 locals++; 745 locals++;
362 continue; 746 continue;
363 } 747 }
364 bau_node_set(pnode - uv_partition_base_pnode, 748 bau_uvhub_set(uvhub, &bau_desc->distribution);
365 &bau_desc->distribution); 749 remotes++;
366 i++;
367 } 750 }
368 if (i == 0) { 751 if (remotes == 0) {
369 /* 752 /*
370 * no off_node flushing; return status for local node 753 * No off_hub flushing; return status for local hub.
754 * Return the caller's mask if all were local (the current
755 * cpu may be in that mask).
371 */ 756 */
372 if (locals) 757 if (locals)
373 return flush_mask; 758 return cpumask;
374 else 759 else
375 return NULL; 760 return NULL;
376 } 761 }
377 __get_cpu_var(ptcstats).requestor++; 762 stat = &per_cpu(ptcstats, cpu);
378 __get_cpu_var(ptcstats).ntargeted += i; 763 stat->s_requestor++;
764 stat->s_ntargcpu += remotes;
765 remotes = bau_uvhub_weight(&bau_desc->distribution);
766 stat->s_ntarguvhub += remotes;
767 if (remotes >= 16)
768 stat->s_ntarguvhub16++;
769 else if (remotes >= 8)
770 stat->s_ntarguvhub8++;
771 else if (remotes >= 4)
772 stat->s_ntarguvhub4++;
773 else if (remotes >= 2)
774 stat->s_ntarguvhub2++;
775 else
776 stat->s_ntarguvhub1++;
379 777
380 bau_desc->payload.address = va; 778 bau_desc->payload.address = va;
381 bau_desc->payload.sending_cpu = cpu; 779 bau_desc->payload.sending_cpu = cpu;
382 780
383 return uv_flush_send_and_wait(uv_cpu, this_pnode, bau_desc, flush_mask); 781 /*
782 * uv_flush_send_and_wait returns null if all cpu's were messaged, or
783 * the adjusted flush_mask if any cpu's were not messaged.
784 */
785 return uv_flush_send_and_wait(bau_desc, flush_mask, bcp);
384} 786}
385 787
386/* 788/*
@@ -389,87 +791,70 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
389 * 791 *
390 * We received a broadcast assist message. 792 * We received a broadcast assist message.
391 * 793 *
392 * Interrupts may have been disabled; this interrupt could represent 794 * Interrupts are disabled; this interrupt could represent
393 * the receipt of several messages. 795 * the receipt of several messages.
394 * 796 *
395 * All cores/threads on this node get this interrupt. 797 * All cores/threads on this hub get this interrupt.
396 * The last one to see it does the s/w ack. 798 * The last one to see it does the software ack.
397 * (the resource will not be freed until noninterruptable cpus see this 799 * (the resource will not be freed until noninterruptable cpus see this
398 * interrupt; hardware will timeout the s/w ack and reply ERROR) 800 * interrupt; hardware may timeout the s/w ack and reply ERROR)
399 */ 801 */
400void uv_bau_message_interrupt(struct pt_regs *regs) 802void uv_bau_message_interrupt(struct pt_regs *regs)
401{ 803{
402 struct bau_payload_queue_entry *va_queue_first;
403 struct bau_payload_queue_entry *va_queue_last;
404 struct bau_payload_queue_entry *msg;
405 struct pt_regs *old_regs = set_irq_regs(regs);
406 cycles_t time1;
407 cycles_t time2;
408 int msg_slot;
409 int sw_ack_slot;
410 int fw;
411 int count = 0; 804 int count = 0;
412 unsigned long local_pnode; 805 cycles_t time_start;
413 806 struct bau_payload_queue_entry *msg;
414 ack_APIC_irq(); 807 struct bau_control *bcp;
415 exit_idle(); 808 struct ptc_stats *stat;
416 irq_enter(); 809 struct msg_desc msgdesc;
417 810
418 time1 = get_cycles(); 811 time_start = get_cycles();
419 812 bcp = &per_cpu(bau_control, smp_processor_id());
420 local_pnode = uv_blade_to_pnode(uv_numa_blade_id()); 813 stat = &per_cpu(ptcstats, smp_processor_id());
421 814 msgdesc.va_queue_first = bcp->va_queue_first;
422 va_queue_first = __get_cpu_var(bau_control).va_queue_first; 815 msgdesc.va_queue_last = bcp->va_queue_last;
423 va_queue_last = __get_cpu_var(bau_control).va_queue_last; 816 msg = bcp->bau_msg_head;
424
425 msg = __get_cpu_var(bau_control).bau_msg_head;
426 while (msg->sw_ack_vector) { 817 while (msg->sw_ack_vector) {
427 count++; 818 count++;
428 fw = msg->sw_ack_vector; 819 msgdesc.msg_slot = msg - msgdesc.va_queue_first;
429 msg_slot = msg - va_queue_first; 820 msgdesc.sw_ack_slot = ffs(msg->sw_ack_vector) - 1;
430 sw_ack_slot = ffs(fw) - 1; 821 msgdesc.msg = msg;
431 822 uv_bau_process_message(&msgdesc, bcp);
432 uv_bau_process_message(msg, msg_slot, sw_ack_slot);
433
434 msg++; 823 msg++;
435 if (msg > va_queue_last) 824 if (msg > msgdesc.va_queue_last)
436 msg = va_queue_first; 825 msg = msgdesc.va_queue_first;
437 __get_cpu_var(bau_control).bau_msg_head = msg; 826 bcp->bau_msg_head = msg;
438 } 827 }
828 stat->d_time += (get_cycles() - time_start);
439 if (!count) 829 if (!count)
440 __get_cpu_var(ptcstats).nomsg++; 830 stat->d_nomsg++;
441 else if (count > 1) 831 else if (count > 1)
442 __get_cpu_var(ptcstats).multmsg++; 832 stat->d_multmsg++;
443 833 ack_APIC_irq();
444 time2 = get_cycles();
445 __get_cpu_var(ptcstats).dflush += (time2 - time1);
446
447 irq_exit();
448 set_irq_regs(old_regs);
449} 834}
450 835
451/* 836/*
452 * uv_enable_timeouts 837 * uv_enable_timeouts
453 * 838 *
454 * Each target blade (i.e. blades that have cpu's) needs to have 839 * Each target uvhub (i.e. a uvhub that has no cpu's) needs to have
455 * shootdown message timeouts enabled. The timeout does not cause 840 * shootdown message timeouts enabled. The timeout does not cause
456 * an interrupt, but causes an error message to be returned to 841 * an interrupt, but causes an error message to be returned to
457 * the sender. 842 * the sender.
458 */ 843 */
459static void uv_enable_timeouts(void) 844static void uv_enable_timeouts(void)
460{ 845{
461 int blade; 846 int uvhub;
462 int nblades; 847 int nuvhubs;
463 int pnode; 848 int pnode;
464 unsigned long mmr_image; 849 unsigned long mmr_image;
465 850
466 nblades = uv_num_possible_blades(); 851 nuvhubs = uv_num_possible_blades();
467 852
468 for (blade = 0; blade < nblades; blade++) { 853 for (uvhub = 0; uvhub < nuvhubs; uvhub++) {
469 if (!uv_blade_nr_possible_cpus(blade)) 854 if (!uv_blade_nr_possible_cpus(uvhub))
470 continue; 855 continue;
471 856
472 pnode = uv_blade_to_pnode(blade); 857 pnode = uv_blade_to_pnode(uvhub);
473 mmr_image = 858 mmr_image =
474 uv_read_global_mmr64(pnode, UVH_LB_BAU_MISC_CONTROL); 859 uv_read_global_mmr64(pnode, UVH_LB_BAU_MISC_CONTROL);
475 /* 860 /*
@@ -479,16 +864,16 @@ static void uv_enable_timeouts(void)
479 * To program the period, the SOFT_ACK_MODE must be off. 864 * To program the period, the SOFT_ACK_MODE must be off.
480 */ 865 */
481 mmr_image &= ~((unsigned long)1 << 866 mmr_image &= ~((unsigned long)1 <<
482 UV_ENABLE_INTD_SOFT_ACK_MODE_SHIFT); 867 UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT);
483 uv_write_global_mmr64 868 uv_write_global_mmr64
484 (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image); 869 (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
485 /* 870 /*
486 * Set the 4-bit period. 871 * Set the 4-bit period.
487 */ 872 */
488 mmr_image &= ~((unsigned long)0xf << 873 mmr_image &= ~((unsigned long)0xf <<
489 UV_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHIFT); 874 UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT);
490 mmr_image |= (UV_INTD_SOFT_ACK_TIMEOUT_PERIOD << 875 mmr_image |= (UV_INTD_SOFT_ACK_TIMEOUT_PERIOD <<
491 UV_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHIFT); 876 UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT);
492 uv_write_global_mmr64 877 uv_write_global_mmr64
493 (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image); 878 (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
494 /* 879 /*
@@ -497,7 +882,7 @@ static void uv_enable_timeouts(void)
497 * indicated in bits 2:0 (7 causes all of them to timeout). 882 * indicated in bits 2:0 (7 causes all of them to timeout).
498 */ 883 */
499 mmr_image |= ((unsigned long)1 << 884 mmr_image |= ((unsigned long)1 <<
500 UV_ENABLE_INTD_SOFT_ACK_MODE_SHIFT); 885 UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT);
501 uv_write_global_mmr64 886 uv_write_global_mmr64
502 (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image); 887 (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
503 } 888 }
@@ -522,9 +907,20 @@ static void uv_ptc_seq_stop(struct seq_file *file, void *data)
522{ 907{
523} 908}
524 909
910static inline unsigned long long
911millisec_2_cycles(unsigned long millisec)
912{
913 unsigned long ns;
914 unsigned long long cyc;
915
916 ns = millisec * 1000;
917 cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id()));
918 return cyc;
919}
920
525/* 921/*
526 * Display the statistics thru /proc 922 * Display the statistics thru /proc.
527 * data points to the cpu number 923 * 'data' points to the cpu number
528 */ 924 */
529static int uv_ptc_seq_show(struct seq_file *file, void *data) 925static int uv_ptc_seq_show(struct seq_file *file, void *data)
530{ 926{
@@ -535,78 +931,155 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data)
535 931
536 if (!cpu) { 932 if (!cpu) {
537 seq_printf(file, 933 seq_printf(file,
538 "# cpu requestor requestee one all sretry dretry ptc_i "); 934 "# cpu sent stime numuvhubs numuvhubs16 numuvhubs8 ");
539 seq_printf(file, 935 seq_printf(file,
540 "sw_ack sflush dflush sok dnomsg dmult starget\n"); 936 "numuvhubs4 numuvhubs2 numuvhubs1 numcpus dto ");
937 seq_printf(file,
938 "retries rok resetp resett giveup sto bz throt ");
939 seq_printf(file,
940 "sw_ack recv rtime all ");
941 seq_printf(file,
942 "one mult none retry canc nocan reset rcan\n");
541 } 943 }
542 if (cpu < num_possible_cpus() && cpu_online(cpu)) { 944 if (cpu < num_possible_cpus() && cpu_online(cpu)) {
543 stat = &per_cpu(ptcstats, cpu); 945 stat = &per_cpu(ptcstats, cpu);
544 seq_printf(file, "cpu %d %ld %ld %ld %ld %ld %ld %ld ", 946 /* source side statistics */
545 cpu, stat->requestor, 947 seq_printf(file,
546 stat->requestee, stat->onetlb, stat->alltlb, 948 "cpu %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ",
547 stat->s_retry, stat->d_retry, stat->ptc_i); 949 cpu, stat->s_requestor, cycles_2_us(stat->s_time),
548 seq_printf(file, "%lx %ld %ld %ld %ld %ld %ld\n", 950 stat->s_ntarguvhub, stat->s_ntarguvhub16,
951 stat->s_ntarguvhub8, stat->s_ntarguvhub4,
952 stat->s_ntarguvhub2, stat->s_ntarguvhub1,
953 stat->s_ntargcpu, stat->s_dtimeout);
954 seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld ",
955 stat->s_retry_messages, stat->s_retriesok,
956 stat->s_resets_plug, stat->s_resets_timeout,
957 stat->s_giveup, stat->s_stimeout,
958 stat->s_busy, stat->s_throttles);
959 /* destination side statistics */
960 seq_printf(file,
961 "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld\n",
549 uv_read_global_mmr64(uv_cpu_to_pnode(cpu), 962 uv_read_global_mmr64(uv_cpu_to_pnode(cpu),
550 UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE), 963 UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE),
551 stat->sflush, stat->dflush, 964 stat->d_requestee, cycles_2_us(stat->d_time),
552 stat->retriesok, stat->nomsg, 965 stat->d_alltlb, stat->d_onetlb, stat->d_multmsg,
553 stat->multmsg, stat->ntargeted); 966 stat->d_nomsg, stat->d_retries, stat->d_canceled,
967 stat->d_nocanceled, stat->d_resets,
968 stat->d_rcanceled);
554 } 969 }
555 970
556 return 0; 971 return 0;
557} 972}
558 973
559/* 974/*
975 * -1: resetf the statistics
560 * 0: display meaning of the statistics 976 * 0: display meaning of the statistics
561 * >0: retry limit 977 * >0: maximum concurrent active descriptors per uvhub (throttle)
562 */ 978 */
563static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user, 979static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user,
564 size_t count, loff_t *data) 980 size_t count, loff_t *data)
565{ 981{
566 long newmode; 982 int cpu;
983 long input_arg;
567 char optstr[64]; 984 char optstr[64];
985 struct ptc_stats *stat;
986 struct bau_control *bcp;
568 987
569 if (count == 0 || count > sizeof(optstr)) 988 if (count == 0 || count > sizeof(optstr))
570 return -EINVAL; 989 return -EINVAL;
571 if (copy_from_user(optstr, user, count)) 990 if (copy_from_user(optstr, user, count))
572 return -EFAULT; 991 return -EFAULT;
573 optstr[count - 1] = '\0'; 992 optstr[count - 1] = '\0';
574 if (strict_strtoul(optstr, 10, &newmode) < 0) { 993 if (strict_strtol(optstr, 10, &input_arg) < 0) {
575 printk(KERN_DEBUG "%s is invalid\n", optstr); 994 printk(KERN_DEBUG "%s is invalid\n", optstr);
576 return -EINVAL; 995 return -EINVAL;
577 } 996 }
578 997
579 if (newmode == 0) { 998 if (input_arg == 0) {
580 printk(KERN_DEBUG "# cpu: cpu number\n"); 999 printk(KERN_DEBUG "# cpu: cpu number\n");
1000 printk(KERN_DEBUG "Sender statistics:\n");
1001 printk(KERN_DEBUG
1002 "sent: number of shootdown messages sent\n");
1003 printk(KERN_DEBUG
1004 "stime: time spent sending messages\n");
1005 printk(KERN_DEBUG
1006 "numuvhubs: number of hubs targeted with shootdown\n");
1007 printk(KERN_DEBUG
1008 "numuvhubs16: number times 16 or more hubs targeted\n");
1009 printk(KERN_DEBUG
1010 "numuvhubs8: number times 8 or more hubs targeted\n");
1011 printk(KERN_DEBUG
1012 "numuvhubs4: number times 4 or more hubs targeted\n");
1013 printk(KERN_DEBUG
1014 "numuvhubs2: number times 2 or more hubs targeted\n");
1015 printk(KERN_DEBUG
1016 "numuvhubs1: number times 1 hub targeted\n");
1017 printk(KERN_DEBUG
1018 "numcpus: number of cpus targeted with shootdown\n");
1019 printk(KERN_DEBUG
1020 "dto: number of destination timeouts\n");
1021 printk(KERN_DEBUG
1022 "retries: destination timeout retries sent\n");
1023 printk(KERN_DEBUG
1024 "rok: : destination timeouts successfully retried\n");
1025 printk(KERN_DEBUG
1026 "resetp: ipi-style resource resets for plugs\n");
1027 printk(KERN_DEBUG
1028 "resett: ipi-style resource resets for timeouts\n");
1029 printk(KERN_DEBUG
1030 "giveup: fall-backs to ipi-style shootdowns\n");
1031 printk(KERN_DEBUG
1032 "sto: number of source timeouts\n");
1033 printk(KERN_DEBUG
1034 "bz: number of stay-busy's\n");
1035 printk(KERN_DEBUG
1036 "throt: number times spun in throttle\n");
1037 printk(KERN_DEBUG "Destination side statistics:\n");
581 printk(KERN_DEBUG 1038 printk(KERN_DEBUG
582 "requestor: times this cpu was the flush requestor\n"); 1039 "sw_ack: image of UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE\n");
583 printk(KERN_DEBUG 1040 printk(KERN_DEBUG
584 "requestee: times this cpu was requested to flush its TLBs\n"); 1041 "recv: shootdown messages received\n");
585 printk(KERN_DEBUG 1042 printk(KERN_DEBUG
586 "one: times requested to flush a single address\n"); 1043 "rtime: time spent processing messages\n");
587 printk(KERN_DEBUG 1044 printk(KERN_DEBUG
588 "all: times requested to flush all TLB's\n"); 1045 "all: shootdown all-tlb messages\n");
589 printk(KERN_DEBUG 1046 printk(KERN_DEBUG
590 "sretry: number of retries of source-side timeouts\n"); 1047 "one: shootdown one-tlb messages\n");
591 printk(KERN_DEBUG 1048 printk(KERN_DEBUG
592 "dretry: number of retries of destination-side timeouts\n"); 1049 "mult: interrupts that found multiple messages\n");
593 printk(KERN_DEBUG 1050 printk(KERN_DEBUG
594 "ptc_i: times UV fell through to IPI-style flushes\n"); 1051 "none: interrupts that found no messages\n");
595 printk(KERN_DEBUG 1052 printk(KERN_DEBUG
596 "sw_ack: image of UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE\n"); 1053 "retry: number of retry messages processed\n");
597 printk(KERN_DEBUG 1054 printk(KERN_DEBUG
598 "sflush_us: cycles spent in uv_flush_tlb_others()\n"); 1055 "canc: number messages canceled by retries\n");
599 printk(KERN_DEBUG 1056 printk(KERN_DEBUG
600 "dflush_us: cycles spent in handling flush requests\n"); 1057 "nocan: number retries that found nothing to cancel\n");
601 printk(KERN_DEBUG "sok: successes on retry\n");
602 printk(KERN_DEBUG "dnomsg: interrupts with no message\n");
603 printk(KERN_DEBUG 1058 printk(KERN_DEBUG
604 "dmult: interrupts with multiple messages\n"); 1059 "reset: number of ipi-style reset requests processed\n");
605 printk(KERN_DEBUG "starget: nodes targeted\n"); 1060 printk(KERN_DEBUG
1061 "rcan: number messages canceled by reset requests\n");
1062 } else if (input_arg == -1) {
1063 for_each_present_cpu(cpu) {
1064 stat = &per_cpu(ptcstats, cpu);
1065 memset(stat, 0, sizeof(struct ptc_stats));
1066 }
606 } else { 1067 } else {
607 uv_bau_retry_limit = newmode; 1068 uv_bau_max_concurrent = input_arg;
608 printk(KERN_DEBUG "timeout retry limit:%d\n", 1069 bcp = &per_cpu(bau_control, smp_processor_id());
609 uv_bau_retry_limit); 1070 if (uv_bau_max_concurrent < 1 ||
1071 uv_bau_max_concurrent > bcp->cpus_in_uvhub) {
1072 printk(KERN_DEBUG
1073 "Error: BAU max concurrent %d; %d is invalid\n",
1074 bcp->max_concurrent, uv_bau_max_concurrent);
1075 return -EINVAL;
1076 }
1077 printk(KERN_DEBUG "Set BAU max concurrent:%d\n",
1078 uv_bau_max_concurrent);
1079 for_each_present_cpu(cpu) {
1080 bcp = &per_cpu(bau_control, cpu);
1081 bcp->max_concurrent = uv_bau_max_concurrent;
1082 }
610 } 1083 }
611 1084
612 return count; 1085 return count;
@@ -650,79 +1123,30 @@ static int __init uv_ptc_init(void)
650} 1123}
651 1124
652/* 1125/*
653 * begin the initialization of the per-blade control structures
654 */
655static struct bau_control * __init uv_table_bases_init(int blade, int node)
656{
657 int i;
658 struct bau_msg_status *msp;
659 struct bau_control *bau_tabp;
660
661 bau_tabp =
662 kmalloc_node(sizeof(struct bau_control), GFP_KERNEL, node);
663 BUG_ON(!bau_tabp);
664
665 bau_tabp->msg_statuses =
666 kmalloc_node(sizeof(struct bau_msg_status) *
667 DEST_Q_SIZE, GFP_KERNEL, node);
668 BUG_ON(!bau_tabp->msg_statuses);
669
670 for (i = 0, msp = bau_tabp->msg_statuses; i < DEST_Q_SIZE; i++, msp++)
671 bau_cpubits_clear(&msp->seen_by, (int)
672 uv_blade_nr_possible_cpus(blade));
673
674 uv_bau_table_bases[blade] = bau_tabp;
675
676 return bau_tabp;
677}
678
679/*
680 * finish the initialization of the per-blade control structures
681 */
682static void __init
683uv_table_bases_finish(int blade,
684 struct bau_control *bau_tablesp,
685 struct bau_desc *adp)
686{
687 struct bau_control *bcp;
688 int cpu;
689
690 for_each_present_cpu(cpu) {
691 if (blade != uv_cpu_to_blade_id(cpu))
692 continue;
693
694 bcp = (struct bau_control *)&per_cpu(bau_control, cpu);
695 bcp->bau_msg_head = bau_tablesp->va_queue_first;
696 bcp->va_queue_first = bau_tablesp->va_queue_first;
697 bcp->va_queue_last = bau_tablesp->va_queue_last;
698 bcp->msg_statuses = bau_tablesp->msg_statuses;
699 bcp->descriptor_base = adp;
700 }
701}
702
703/*
704 * initialize the sending side's sending buffers 1126 * initialize the sending side's sending buffers
705 */ 1127 */
706static struct bau_desc * __init 1128static void
707uv_activation_descriptor_init(int node, int pnode) 1129uv_activation_descriptor_init(int node, int pnode)
708{ 1130{
709 int i; 1131 int i;
1132 int cpu;
710 unsigned long pa; 1133 unsigned long pa;
711 unsigned long m; 1134 unsigned long m;
712 unsigned long n; 1135 unsigned long n;
713 struct bau_desc *adp; 1136 struct bau_desc *bau_desc;
714 struct bau_desc *ad2; 1137 struct bau_desc *bd2;
1138 struct bau_control *bcp;
715 1139
716 /* 1140 /*
717 * each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR) 1141 * each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR)
718 * per cpu; and up to 32 (UV_ADP_SIZE) cpu's per blade 1142 * per cpu; and up to 32 (UV_ADP_SIZE) cpu's per uvhub
719 */ 1143 */
720 adp = (struct bau_desc *)kmalloc_node(sizeof(struct bau_desc)* 1144 bau_desc = (struct bau_desc *)kmalloc_node(sizeof(struct bau_desc)*
721 UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node); 1145 UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node);
722 BUG_ON(!adp); 1146 BUG_ON(!bau_desc);
723 1147
724 pa = uv_gpa(adp); /* need the real nasid*/ 1148 pa = uv_gpa(bau_desc); /* need the real nasid*/
725 n = uv_gpa_to_pnode(pa); 1149 n = pa >> uv_nshift;
726 m = pa & uv_mmask; 1150 m = pa & uv_mmask;
727 1151
728 uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE, 1152 uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE,
@@ -731,96 +1155,188 @@ uv_activation_descriptor_init(int node, int pnode)
731 /* 1155 /*
732 * initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each 1156 * initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each
733 * cpu even though we only use the first one; one descriptor can 1157 * cpu even though we only use the first one; one descriptor can
734 * describe a broadcast to 256 nodes. 1158 * describe a broadcast to 256 uv hubs.
735 */ 1159 */
736 for (i = 0, ad2 = adp; i < (UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR); 1160 for (i = 0, bd2 = bau_desc; i < (UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR);
737 i++, ad2++) { 1161 i++, bd2++) {
738 memset(ad2, 0, sizeof(struct bau_desc)); 1162 memset(bd2, 0, sizeof(struct bau_desc));
739 ad2->header.sw_ack_flag = 1; 1163 bd2->header.sw_ack_flag = 1;
740 /* 1164 /*
741 * base_dest_nodeid is the first node in the partition, so 1165 * base_dest_nodeid is the nasid (pnode<<1) of the first uvhub
742 * the bit map will indicate partition-relative node numbers. 1166 * in the partition. The bit map will indicate uvhub numbers,
743 * note that base_dest_nodeid is actually a nasid. 1167 * which are 0-N in a partition. Pnodes are unique system-wide.
744 */ 1168 */
745 ad2->header.base_dest_nodeid = uv_partition_base_pnode << 1; 1169 bd2->header.base_dest_nodeid = uv_partition_base_pnode << 1;
746 ad2->header.dest_subnodeid = 0x10; /* the LB */ 1170 bd2->header.dest_subnodeid = 0x10; /* the LB */
747 ad2->header.command = UV_NET_ENDPOINT_INTD; 1171 bd2->header.command = UV_NET_ENDPOINT_INTD;
748 ad2->header.int_both = 1; 1172 bd2->header.int_both = 1;
749 /* 1173 /*
750 * all others need to be set to zero: 1174 * all others need to be set to zero:
751 * fairness chaining multilevel count replied_to 1175 * fairness chaining multilevel count replied_to
752 */ 1176 */
753 } 1177 }
754 return adp; 1178 for_each_present_cpu(cpu) {
1179 if (pnode != uv_blade_to_pnode(uv_cpu_to_blade_id(cpu)))
1180 continue;
1181 bcp = &per_cpu(bau_control, cpu);
1182 bcp->descriptor_base = bau_desc;
1183 }
755} 1184}
756 1185
757/* 1186/*
758 * initialize the destination side's receiving buffers 1187 * initialize the destination side's receiving buffers
1188 * entered for each uvhub in the partition
1189 * - node is first node (kernel memory notion) on the uvhub
1190 * - pnode is the uvhub's physical identifier
759 */ 1191 */
760static struct bau_payload_queue_entry * __init 1192static void
761uv_payload_queue_init(int node, int pnode, struct bau_control *bau_tablesp) 1193uv_payload_queue_init(int node, int pnode)
762{ 1194{
763 struct bau_payload_queue_entry *pqp;
764 unsigned long pa;
765 int pn; 1195 int pn;
1196 int cpu;
766 char *cp; 1197 char *cp;
1198 unsigned long pa;
1199 struct bau_payload_queue_entry *pqp;
1200 struct bau_payload_queue_entry *pqp_malloc;
1201 struct bau_control *bcp;
767 1202
768 pqp = (struct bau_payload_queue_entry *) kmalloc_node( 1203 pqp = (struct bau_payload_queue_entry *) kmalloc_node(
769 (DEST_Q_SIZE + 1) * sizeof(struct bau_payload_queue_entry), 1204 (DEST_Q_SIZE + 1) * sizeof(struct bau_payload_queue_entry),
770 GFP_KERNEL, node); 1205 GFP_KERNEL, node);
771 BUG_ON(!pqp); 1206 BUG_ON(!pqp);
1207 pqp_malloc = pqp;
772 1208
773 cp = (char *)pqp + 31; 1209 cp = (char *)pqp + 31;
774 pqp = (struct bau_payload_queue_entry *)(((unsigned long)cp >> 5) << 5); 1210 pqp = (struct bau_payload_queue_entry *)(((unsigned long)cp >> 5) << 5);
775 bau_tablesp->va_queue_first = pqp; 1211
1212 for_each_present_cpu(cpu) {
1213 if (pnode != uv_cpu_to_pnode(cpu))
1214 continue;
1215 /* for every cpu on this pnode: */
1216 bcp = &per_cpu(bau_control, cpu);
1217 bcp->va_queue_first = pqp;
1218 bcp->bau_msg_head = pqp;
1219 bcp->va_queue_last = pqp + (DEST_Q_SIZE - 1);
1220 }
776 /* 1221 /*
777 * need the pnode of where the memory was really allocated 1222 * need the pnode of where the memory was really allocated
778 */ 1223 */
779 pa = uv_gpa(pqp); 1224 pa = uv_gpa(pqp);
780 pn = uv_gpa_to_pnode(pa); 1225 pn = pa >> uv_nshift;
781 uv_write_global_mmr64(pnode, 1226 uv_write_global_mmr64(pnode,
782 UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST, 1227 UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST,
783 ((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) | 1228 ((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) |
784 uv_physnodeaddr(pqp)); 1229 uv_physnodeaddr(pqp));
785 uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL, 1230 uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL,
786 uv_physnodeaddr(pqp)); 1231 uv_physnodeaddr(pqp));
787 bau_tablesp->va_queue_last = pqp + (DEST_Q_SIZE - 1);
788 uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST, 1232 uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST,
789 (unsigned long) 1233 (unsigned long)
790 uv_physnodeaddr(bau_tablesp->va_queue_last)); 1234 uv_physnodeaddr(pqp + (DEST_Q_SIZE - 1)));
1235 /* in effect, all msg_type's are set to MSG_NOOP */
791 memset(pqp, 0, sizeof(struct bau_payload_queue_entry) * DEST_Q_SIZE); 1236 memset(pqp, 0, sizeof(struct bau_payload_queue_entry) * DEST_Q_SIZE);
792
793 return pqp;
794} 1237}
795 1238
796/* 1239/*
797 * Initialization of each UV blade's structures 1240 * Initialization of each UV hub's structures
798 */ 1241 */
799static int __init uv_init_blade(int blade) 1242static void __init uv_init_uvhub(int uvhub, int vector)
800{ 1243{
801 int node; 1244 int node;
802 int pnode; 1245 int pnode;
803 unsigned long pa;
804 unsigned long apicid; 1246 unsigned long apicid;
805 struct bau_desc *adp; 1247
806 struct bau_payload_queue_entry *pqp; 1248 node = uvhub_to_first_node(uvhub);
807 struct bau_control *bau_tablesp; 1249 pnode = uv_blade_to_pnode(uvhub);
808 1250 uv_activation_descriptor_init(node, pnode);
809 node = blade_to_first_node(blade); 1251 uv_payload_queue_init(node, pnode);
810 bau_tablesp = uv_table_bases_init(blade, node);
811 pnode = uv_blade_to_pnode(blade);
812 adp = uv_activation_descriptor_init(node, pnode);
813 pqp = uv_payload_queue_init(node, pnode, bau_tablesp);
814 uv_table_bases_finish(blade, bau_tablesp, adp);
815 /* 1252 /*
816 * the below initialization can't be in firmware because the 1253 * the below initialization can't be in firmware because the
817 * messaging IRQ will be determined by the OS 1254 * messaging IRQ will be determined by the OS
818 */ 1255 */
819 apicid = blade_to_first_apicid(blade); 1256 apicid = uvhub_to_first_apicid(uvhub);
820 pa = uv_read_global_mmr64(pnode, UVH_BAU_DATA_CONFIG);
821 uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG, 1257 uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG,
822 ((apicid << 32) | UV_BAU_MESSAGE)); 1258 ((apicid << 32) | vector));
823 return 0; 1259}
1260
1261/*
1262 * initialize the bau_control structure for each cpu
1263 */
1264static void uv_init_per_cpu(int nuvhubs)
1265{
1266 int i, j, k;
1267 int cpu;
1268 int pnode;
1269 int uvhub;
1270 short socket = 0;
1271 struct bau_control *bcp;
1272 struct uvhub_desc *bdp;
1273 struct socket_desc *sdp;
1274 struct bau_control *hmaster = NULL;
1275 struct bau_control *smaster = NULL;
1276 struct socket_desc {
1277 short num_cpus;
1278 short cpu_number[16];
1279 };
1280 struct uvhub_desc {
1281 short num_sockets;
1282 short num_cpus;
1283 short uvhub;
1284 short pnode;
1285 struct socket_desc socket[2];
1286 };
1287 struct uvhub_desc *uvhub_descs;
1288
1289 uvhub_descs = (struct uvhub_desc *)
1290 kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL);
1291 memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc));
1292 for_each_present_cpu(cpu) {
1293 bcp = &per_cpu(bau_control, cpu);
1294 memset(bcp, 0, sizeof(struct bau_control));
1295 spin_lock_init(&bcp->masks_lock);
1296 bcp->max_concurrent = uv_bau_max_concurrent;
1297 pnode = uv_cpu_hub_info(cpu)->pnode;
1298 uvhub = uv_cpu_hub_info(cpu)->numa_blade_id;
1299 bdp = &uvhub_descs[uvhub];
1300 bdp->num_cpus++;
1301 bdp->uvhub = uvhub;
1302 bdp->pnode = pnode;
1303 /* time interval to catch a hardware stay-busy bug */
1304 bcp->timeout_interval = millisec_2_cycles(3);
1305 /* kludge: assume uv_hub.h is constant */
1306 socket = (cpu_physical_id(cpu)>>5)&1;
1307 if (socket >= bdp->num_sockets)
1308 bdp->num_sockets = socket+1;
1309 sdp = &bdp->socket[socket];
1310 sdp->cpu_number[sdp->num_cpus] = cpu;
1311 sdp->num_cpus++;
1312 }
1313 socket = 0;
1314 for_each_possible_blade(uvhub) {
1315 bdp = &uvhub_descs[uvhub];
1316 for (i = 0; i < bdp->num_sockets; i++) {
1317 sdp = &bdp->socket[i];
1318 for (j = 0; j < sdp->num_cpus; j++) {
1319 cpu = sdp->cpu_number[j];
1320 bcp = &per_cpu(bau_control, cpu);
1321 bcp->cpu = cpu;
1322 if (j == 0) {
1323 smaster = bcp;
1324 if (i == 0)
1325 hmaster = bcp;
1326 }
1327 bcp->cpus_in_uvhub = bdp->num_cpus;
1328 bcp->cpus_in_socket = sdp->num_cpus;
1329 bcp->socket_master = smaster;
1330 bcp->uvhub_master = hmaster;
1331 for (k = 0; k < DEST_Q_SIZE; k++)
1332 bcp->socket_acknowledge_count[k] = 0;
1333 bcp->uvhub_cpu =
1334 uv_cpu_hub_info(cpu)->blade_processor_id;
1335 }
1336 socket++;
1337 }
1338 }
1339 kfree(uvhub_descs);
824} 1340}
825 1341
826/* 1342/*
@@ -828,38 +1344,54 @@ static int __init uv_init_blade(int blade)
828 */ 1344 */
829static int __init uv_bau_init(void) 1345static int __init uv_bau_init(void)
830{ 1346{
831 int blade; 1347 int uvhub;
832 int nblades; 1348 int pnode;
1349 int nuvhubs;
833 int cur_cpu; 1350 int cur_cpu;
1351 int vector;
1352 unsigned long mmr;
834 1353
835 if (!is_uv_system()) 1354 if (!is_uv_system())
836 return 0; 1355 return 0;
837 1356
1357 if (nobau)
1358 return 0;
1359
838 for_each_possible_cpu(cur_cpu) 1360 for_each_possible_cpu(cur_cpu)
839 zalloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu), 1361 zalloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu),
840 GFP_KERNEL, cpu_to_node(cur_cpu)); 1362 GFP_KERNEL, cpu_to_node(cur_cpu));
841 1363
842 uv_bau_retry_limit = 1; 1364 uv_bau_max_concurrent = MAX_BAU_CONCURRENT;
1365 uv_nshift = uv_hub_info->m_val;
843 uv_mmask = (1UL << uv_hub_info->m_val) - 1; 1366 uv_mmask = (1UL << uv_hub_info->m_val) - 1;
844 nblades = uv_num_possible_blades(); 1367 nuvhubs = uv_num_possible_blades();
845 1368
846 uv_bau_table_bases = (struct bau_control **) 1369 uv_init_per_cpu(nuvhubs);
847 kmalloc(nblades * sizeof(struct bau_control *), GFP_KERNEL);
848 BUG_ON(!uv_bau_table_bases);
849 1370
850 uv_partition_base_pnode = 0x7fffffff; 1371 uv_partition_base_pnode = 0x7fffffff;
851 for (blade = 0; blade < nblades; blade++) 1372 for (uvhub = 0; uvhub < nuvhubs; uvhub++)
852 if (uv_blade_nr_possible_cpus(blade) && 1373 if (uv_blade_nr_possible_cpus(uvhub) &&
853 (uv_blade_to_pnode(blade) < uv_partition_base_pnode)) 1374 (uv_blade_to_pnode(uvhub) < uv_partition_base_pnode))
854 uv_partition_base_pnode = uv_blade_to_pnode(blade); 1375 uv_partition_base_pnode = uv_blade_to_pnode(uvhub);
855 for (blade = 0; blade < nblades; blade++) 1376
856 if (uv_blade_nr_possible_cpus(blade)) 1377 vector = UV_BAU_MESSAGE;
857 uv_init_blade(blade); 1378 for_each_possible_blade(uvhub)
858 1379 if (uv_blade_nr_possible_cpus(uvhub))
859 alloc_intr_gate(UV_BAU_MESSAGE, uv_bau_message_intr1); 1380 uv_init_uvhub(uvhub, vector);
1381
860 uv_enable_timeouts(); 1382 uv_enable_timeouts();
1383 alloc_intr_gate(vector, uv_bau_message_intr1);
1384
1385 for_each_possible_blade(uvhub) {
1386 pnode = uv_blade_to_pnode(uvhub);
1387 /* INIT the bau */
1388 uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_ACTIVATION_CONTROL,
1389 ((unsigned long)1 << 63));
1390 mmr = 1; /* should be 1 to broadcast to both sockets */
1391 uv_write_global_mmr64(pnode, UVH_BAU_DATA_BROADCAST, mmr);
1392 }
861 1393
862 return 0; 1394 return 0;
863} 1395}
864__initcall(uv_bau_init); 1396core_initcall(uv_bau_init);
865__initcall(uv_ptc_init); 1397core_initcall(uv_ptc_init);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 1168e4454188..02cfb9b8f5b1 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -108,15 +108,6 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
108 dec_preempt_count(); 108 dec_preempt_count();
109} 109}
110 110
111#ifdef CONFIG_X86_32
112static inline void
113die_if_kernel(const char *str, struct pt_regs *regs, long err)
114{
115 if (!user_mode_vm(regs))
116 die(str, regs, err);
117}
118#endif
119
120static void __kprobes 111static void __kprobes
121do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, 112do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
122 long error_code, siginfo_t *info) 113 long error_code, siginfo_t *info)
@@ -543,11 +534,11 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
543 534
544 /* DR6 may or may not be cleared by the CPU */ 535 /* DR6 may or may not be cleared by the CPU */
545 set_debugreg(0, 6); 536 set_debugreg(0, 6);
537
546 /* 538 /*
547 * The processor cleared BTF, so don't mark that we need it set. 539 * The processor cleared BTF, so don't mark that we need it set.
548 */ 540 */
549 clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR); 541 clear_tsk_thread_flag(tsk, TIF_BLOCKSTEP);
550 tsk->thread.debugctlmsr = 0;
551 542
552 /* Store the virtualized DR6 value */ 543 /* Store the virtualized DR6 value */
553 tsk->thread.debugreg6 = dr6; 544 tsk->thread.debugreg6 = dr6;
@@ -585,55 +576,67 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
585 return; 576 return;
586} 577}
587 578
588#ifdef CONFIG_X86_64
589static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
590{
591 if (fixup_exception(regs))
592 return 1;
593
594 notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE);
595 /* Illegal floating point operation in the kernel */
596 current->thread.trap_no = trapnr;
597 die(str, regs, 0);
598 return 0;
599}
600#endif
601
602/* 579/*
603 * Note that we play around with the 'TS' bit in an attempt to get 580 * Note that we play around with the 'TS' bit in an attempt to get
604 * the correct behaviour even in the presence of the asynchronous 581 * the correct behaviour even in the presence of the asynchronous
605 * IRQ13 behaviour 582 * IRQ13 behaviour
606 */ 583 */
607void math_error(void __user *ip) 584void math_error(struct pt_regs *regs, int error_code, int trapnr)
608{ 585{
609 struct task_struct *task; 586 struct task_struct *task = current;
610 siginfo_t info; 587 siginfo_t info;
611 unsigned short cwd, swd, err; 588 unsigned short err;
589 char *str = (trapnr == 16) ? "fpu exception" : "simd exception";
590
591 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, SIGFPE) == NOTIFY_STOP)
592 return;
593 conditional_sti(regs);
594
595 if (!user_mode_vm(regs))
596 {
597 if (!fixup_exception(regs)) {
598 task->thread.error_code = error_code;
599 task->thread.trap_no = trapnr;
600 die(str, regs, error_code);
601 }
602 return;
603 }
612 604
613 /* 605 /*
614 * Save the info for the exception handler and clear the error. 606 * Save the info for the exception handler and clear the error.
615 */ 607 */
616 task = current;
617 save_init_fpu(task); 608 save_init_fpu(task);
618 task->thread.trap_no = 16; 609 task->thread.trap_no = trapnr;
619 task->thread.error_code = 0; 610 task->thread.error_code = error_code;
620 info.si_signo = SIGFPE; 611 info.si_signo = SIGFPE;
621 info.si_errno = 0; 612 info.si_errno = 0;
622 info.si_addr = ip; 613 info.si_addr = (void __user *)regs->ip;
623 /* 614 if (trapnr == 16) {
624 * (~cwd & swd) will mask out exceptions that are not set to unmasked 615 unsigned short cwd, swd;
625 * status. 0x3f is the exception bits in these regs, 0x200 is the 616 /*
626 * C1 reg you need in case of a stack fault, 0x040 is the stack 617 * (~cwd & swd) will mask out exceptions that are not set to unmasked
627 * fault bit. We should only be taking one exception at a time, 618 * status. 0x3f is the exception bits in these regs, 0x200 is the
628 * so if this combination doesn't produce any single exception, 619 * C1 reg you need in case of a stack fault, 0x040 is the stack
629 * then we have a bad program that isn't synchronizing its FPU usage 620 * fault bit. We should only be taking one exception at a time,
630 * and it will suffer the consequences since we won't be able to 621 * so if this combination doesn't produce any single exception,
631 * fully reproduce the context of the exception 622 * then we have a bad program that isn't synchronizing its FPU usage
632 */ 623 * and it will suffer the consequences since we won't be able to
633 cwd = get_fpu_cwd(task); 624 * fully reproduce the context of the exception
634 swd = get_fpu_swd(task); 625 */
626 cwd = get_fpu_cwd(task);
627 swd = get_fpu_swd(task);
635 628
636 err = swd & ~cwd; 629 err = swd & ~cwd;
630 } else {
631 /*
632 * The SIMD FPU exceptions are handled a little differently, as there
633 * is only a single status/control register. Thus, to determine which
634 * unmasked exception was caught we must mask the exception mask bits
635 * at 0x1f80, and then use these to mask the exception bits at 0x3f.
636 */
637 unsigned short mxcsr = get_fpu_mxcsr(task);
638 err = ~(mxcsr >> 7) & mxcsr;
639 }
637 640
638 if (err & 0x001) { /* Invalid op */ 641 if (err & 0x001) { /* Invalid op */
639 /* 642 /*
@@ -662,97 +665,17 @@ void math_error(void __user *ip)
662 665
663dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code) 666dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
664{ 667{
665 conditional_sti(regs);
666
667#ifdef CONFIG_X86_32 668#ifdef CONFIG_X86_32
668 ignore_fpu_irq = 1; 669 ignore_fpu_irq = 1;
669#else
670 if (!user_mode(regs) &&
671 kernel_math_error(regs, "kernel x87 math error", 16))
672 return;
673#endif 670#endif
674 671
675 math_error((void __user *)regs->ip); 672 math_error(regs, error_code, 16);
676}
677
678static void simd_math_error(void __user *ip)
679{
680 struct task_struct *task;
681 siginfo_t info;
682 unsigned short mxcsr;
683
684 /*
685 * Save the info for the exception handler and clear the error.
686 */
687 task = current;
688 save_init_fpu(task);
689 task->thread.trap_no = 19;
690 task->thread.error_code = 0;
691 info.si_signo = SIGFPE;
692 info.si_errno = 0;
693 info.si_code = __SI_FAULT;
694 info.si_addr = ip;
695 /*
696 * The SIMD FPU exceptions are handled a little differently, as there
697 * is only a single status/control register. Thus, to determine which
698 * unmasked exception was caught we must mask the exception mask bits
699 * at 0x1f80, and then use these to mask the exception bits at 0x3f.
700 */
701 mxcsr = get_fpu_mxcsr(task);
702 switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
703 case 0x000:
704 default:
705 break;
706 case 0x001: /* Invalid Op */
707 info.si_code = FPE_FLTINV;
708 break;
709 case 0x002: /* Denormalize */
710 case 0x010: /* Underflow */
711 info.si_code = FPE_FLTUND;
712 break;
713 case 0x004: /* Zero Divide */
714 info.si_code = FPE_FLTDIV;
715 break;
716 case 0x008: /* Overflow */
717 info.si_code = FPE_FLTOVF;
718 break;
719 case 0x020: /* Precision */
720 info.si_code = FPE_FLTRES;
721 break;
722 }
723 force_sig_info(SIGFPE, &info, task);
724} 673}
725 674
726dotraplinkage void 675dotraplinkage void
727do_simd_coprocessor_error(struct pt_regs *regs, long error_code) 676do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
728{ 677{
729 conditional_sti(regs); 678 math_error(regs, error_code, 19);
730
731#ifdef CONFIG_X86_32
732 if (cpu_has_xmm) {
733 /* Handle SIMD FPU exceptions on PIII+ processors. */
734 ignore_fpu_irq = 1;
735 simd_math_error((void __user *)regs->ip);
736 return;
737 }
738 /*
739 * Handle strange cache flush from user space exception
740 * in all other cases. This is undocumented behaviour.
741 */
742 if (regs->flags & X86_VM_MASK) {
743 handle_vm86_fault((struct kernel_vm86_regs *)regs, error_code);
744 return;
745 }
746 current->thread.trap_no = 19;
747 current->thread.error_code = error_code;
748 die_if_kernel("cache flush denied", regs, error_code);
749 force_sig(SIGSEGV, current);
750#else
751 if (!user_mode(regs) &&
752 kernel_math_error(regs, "kernel simd math error", 19))
753 return;
754 simd_math_error((void __user *)regs->ip);
755#endif
756} 679}
757 680
758dotraplinkage void 681dotraplinkage void
diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/kernel/uv_irq.c
index 1d40336b030a..1132129db792 100644
--- a/arch/x86/kernel/uv_irq.c
+++ b/arch/x86/kernel/uv_irq.c
@@ -44,7 +44,7 @@ static void uv_ack_apic(unsigned int irq)
44 ack_APIC_irq(); 44 ack_APIC_irq();
45} 45}
46 46
47struct irq_chip uv_irq_chip = { 47static struct irq_chip uv_irq_chip = {
48 .name = "UV-CORE", 48 .name = "UV-CORE",
49 .startup = uv_noop_ret, 49 .startup = uv_noop_ret,
50 .shutdown = uv_noop, 50 .shutdown = uv_noop,
@@ -141,7 +141,7 @@ int uv_irq_2_mmr_info(int irq, unsigned long *offset, int *pnode)
141 */ 141 */
142static int 142static int
143arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, 143arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
144 unsigned long mmr_offset, int restrict) 144 unsigned long mmr_offset, int limit)
145{ 145{
146 const struct cpumask *eligible_cpu = cpumask_of(cpu); 146 const struct cpumask *eligible_cpu = cpumask_of(cpu);
147 struct irq_desc *desc = irq_to_desc(irq); 147 struct irq_desc *desc = irq_to_desc(irq);
@@ -160,7 +160,7 @@ arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
160 if (err != 0) 160 if (err != 0)
161 return err; 161 return err;
162 162
163 if (restrict == UV_AFFINITY_CPU) 163 if (limit == UV_AFFINITY_CPU)
164 desc->status |= IRQ_NO_BALANCING; 164 desc->status |= IRQ_NO_BALANCING;
165 else 165 else
166 desc->status |= IRQ_MOVE_PCNTXT; 166 desc->status |= IRQ_MOVE_PCNTXT;
@@ -214,7 +214,7 @@ static int uv_set_irq_affinity(unsigned int irq, const struct cpumask *mask)
214 unsigned long mmr_value; 214 unsigned long mmr_value;
215 struct uv_IO_APIC_route_entry *entry; 215 struct uv_IO_APIC_route_entry *entry;
216 unsigned long mmr_offset; 216 unsigned long mmr_offset;
217 unsigned mmr_pnode; 217 int mmr_pnode;
218 218
219 if (set_desc_affinity(desc, mask, &dest)) 219 if (set_desc_affinity(desc, mask, &dest))
220 return -1; 220 return -1;
@@ -248,7 +248,7 @@ static int uv_set_irq_affinity(unsigned int irq, const struct cpumask *mask)
248 * interrupt is raised. 248 * interrupt is raised.
249 */ 249 */
250int uv_setup_irq(char *irq_name, int cpu, int mmr_blade, 250int uv_setup_irq(char *irq_name, int cpu, int mmr_blade,
251 unsigned long mmr_offset, int restrict) 251 unsigned long mmr_offset, int limit)
252{ 252{
253 int irq, ret; 253 int irq, ret;
254 254
@@ -258,7 +258,7 @@ int uv_setup_irq(char *irq_name, int cpu, int mmr_blade,
258 return -EBUSY; 258 return -EBUSY;
259 259
260 ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset, 260 ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset,
261 restrict); 261 limit);
262 if (ret == irq) 262 if (ret == irq)
263 uv_set_irq_2_mmr_info(irq, mmr_offset, mmr_blade); 263 uv_set_irq_2_mmr_info(irq, mmr_offset, mmr_blade);
264 else 264 else
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 693920b22496..1b950d151e58 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -54,7 +54,6 @@ EXPORT_SYMBOL(memcpy);
54EXPORT_SYMBOL(__memcpy); 54EXPORT_SYMBOL(__memcpy);
55 55
56EXPORT_SYMBOL(empty_zero_page); 56EXPORT_SYMBOL(empty_zero_page);
57EXPORT_SYMBOL(init_level4_pgt);
58#ifndef CONFIG_PARAVIRT 57#ifndef CONFIG_PARAVIRT
59EXPORT_SYMBOL(native_load_gs_index); 58EXPORT_SYMBOL(native_load_gs_index);
60#endif 59#endif
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 782c3a362ec6..37e68fc5e24a 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -99,7 +99,7 @@ int save_i387_xstate(void __user *buf)
99 if (err) 99 if (err)
100 return err; 100 return err;
101 101
102 if (task_thread_info(tsk)->status & TS_XSAVE) 102 if (use_xsave())
103 err = xsave_user(buf); 103 err = xsave_user(buf);
104 else 104 else
105 err = fxsave_user(buf); 105 err = fxsave_user(buf);
@@ -109,14 +109,14 @@ int save_i387_xstate(void __user *buf)
109 task_thread_info(tsk)->status &= ~TS_USEDFPU; 109 task_thread_info(tsk)->status &= ~TS_USEDFPU;
110 stts(); 110 stts();
111 } else { 111 } else {
112 if (__copy_to_user(buf, &tsk->thread.xstate->fxsave, 112 if (__copy_to_user(buf, &tsk->thread.fpu.state->fxsave,
113 xstate_size)) 113 xstate_size))
114 return -1; 114 return -1;
115 } 115 }
116 116
117 clear_used_math(); /* trigger finit */ 117 clear_used_math(); /* trigger finit */
118 118
119 if (task_thread_info(tsk)->status & TS_XSAVE) { 119 if (use_xsave()) {
120 struct _fpstate __user *fx = buf; 120 struct _fpstate __user *fx = buf;
121 struct _xstate __user *x = buf; 121 struct _xstate __user *x = buf;
122 u64 xstate_bv; 122 u64 xstate_bv;
@@ -225,7 +225,7 @@ int restore_i387_xstate(void __user *buf)
225 clts(); 225 clts();
226 task_thread_info(current)->status |= TS_USEDFPU; 226 task_thread_info(current)->status |= TS_USEDFPU;
227 } 227 }
228 if (task_thread_info(tsk)->status & TS_XSAVE) 228 if (use_xsave())
229 err = restore_user_xstate(buf); 229 err = restore_user_xstate(buf);
230 else 230 else
231 err = fxrstor_checking((__force struct i387_fxsave_struct *) 231 err = fxrstor_checking((__force struct i387_fxsave_struct *)