Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--  arch/x86/kernel/Makefile | 3
-rw-r--r--  arch/x86/kernel/acpi/boot.c | 156
-rw-r--r--  arch/x86/kernel/acpi/cstate.c | 9
-rw-r--r--  arch/x86/kernel/acpi/realmode/wakeup.S | 2
-rw-r--r--  arch/x86/kernel/acpi/sleep.c | 13
-rw-r--r--  arch/x86/kernel/acpi/wakeup_32.S | 2
-rw-r--r--  arch/x86/kernel/alternative.c | 48
-rw-r--r--  arch/x86/kernel/amd_iommu.c | 221
-rw-r--r--  arch/x86/kernel/amd_iommu_init.c | 26
-rw-r--r--  arch/x86/kernel/apb_timer.c | 37
-rw-r--r--  arch/x86/kernel/aperture_64.c | 4
-rw-r--r--  arch/x86/kernel/apic/Makefile | 7
-rw-r--r--  arch/x86/kernel/apic/apic.c | 45
-rw-r--r--  arch/x86/kernel/apic/es7000_32.c | 20
-rw-r--r--  arch/x86/kernel/apic/hw_nmi.c | 107
-rw-r--r--  arch/x86/kernel/apic/io_apic.c | 101
-rw-r--r--  arch/x86/kernel/apic/nmi.c | 7
-rw-r--r--  arch/x86/kernel/apic/x2apic_uv_x.c | 3
-rw-r--r--  arch/x86/kernel/apm_32.c | 6
-rw-r--r--  arch/x86/kernel/cpu/Makefile | 6
-rw-r--r--  arch/x86/kernel/cpu/amd.c | 77
-rw-r--r--  arch/x86/kernel/cpu/bugs.c | 2
-rw-r--r--  arch/x86/kernel/cpu/cmpxchg.c | 72
-rw-r--r--  arch/x86/kernel/cpu/common.c | 68
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/Makefile | 4
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 51
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/gx-suspmod.c | 11
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/longhaul.c | 6
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/longhaul.h | 26
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/longrun.c | 6
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/mperf.c | 51
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/mperf.h | 9
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/p4-clockmod.c | 7
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c | 41
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/powernow-k7.c | 8
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 188
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/powernow-k8.h | 2
-rw-r--r--  arch/x86/kernel/cpu/hypervisor.c | 55
-rw-r--r--  arch/x86/kernel/cpu/intel.c | 8
-rw-r--r--  arch/x86/kernel/cpu/intel_cacheinfo.c | 263
-rw-r--r--  arch/x86/kernel/cpu/mcheck/Makefile | 2
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-apei.c | 138
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-internal.h | 23
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.c | 128
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_intel.c | 9
-rw-r--r--  arch/x86/kernel/cpu/mcheck/therm_throt.c | 208
-rw-r--r--  arch/x86/kernel/cpu/mshyperv.c | 56
-rw-r--r--  arch/x86/kernel/cpu/mtrr/cleanup.c | 6
-rw-r--r--  arch/x86/kernel/cpu/mtrr/generic.c | 3
-rw-r--r--  arch/x86/kernel/cpu/mtrr/main.c | 56
-rw-r--r--  arch/x86/kernel/cpu/perf_event.c | 869
-rw-r--r--  arch/x86/kernel/cpu/perf_event_amd.c | 50
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel.c | 358
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel_ds.c | 641
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel_lbr.c | 218
-rw-r--r--  arch/x86/kernel/cpu/perf_event_p4.c | 942
-rw-r--r--  arch/x86/kernel/cpu/perf_event_p6.c | 31
-rw-r--r--  arch/x86/kernel/cpu/scattered.c | 63
-rw-r--r--  arch/x86/kernel/cpu/topology.c (renamed from arch/x86/kernel/cpu/addon_cpuid_features.c) | 56
-rw-r--r--  arch/x86/kernel/cpu/vmware.c | 47
-rw-r--r--  arch/x86/kernel/cpuid.c | 2
-rw-r--r--  arch/x86/kernel/ds.c | 1437
-rw-r--r--  arch/x86/kernel/ds_selftest.c | 408
-rw-r--r--  arch/x86/kernel/ds_selftest.h | 15
-rw-r--r--  arch/x86/kernel/dumpstack.c | 6
-rw-r--r--  arch/x86/kernel/dumpstack.h | 56
-rw-r--r--  arch/x86/kernel/dumpstack_32.c | 2
-rw-r--r--  arch/x86/kernel/dumpstack_64.c | 1
-rw-r--r--  arch/x86/kernel/e820.c | 2
-rw-r--r--  arch/x86/kernel/early-quirks.c | 18
-rw-r--r--  arch/x86/kernel/early_printk.c | 8
-rw-r--r--  arch/x86/kernel/entry_32.S | 33
-rw-r--r--  arch/x86/kernel/entry_64.S | 13
-rw-r--r--  arch/x86/kernel/head32.c | 2
-rw-r--r--  arch/x86/kernel/head_32.S | 6
-rw-r--r--  arch/x86/kernel/head_64.S | 5
-rw-r--r--  arch/x86/kernel/hpet.c | 44
-rw-r--r--  arch/x86/kernel/hw_breakpoint.c | 92
-rw-r--r--  arch/x86/kernel/i387.c | 143
-rw-r--r--  arch/x86/kernel/i8253.c | 14
-rw-r--r--  arch/x86/kernel/i8259.c | 25
-rw-r--r--  arch/x86/kernel/init_task.c | 2
-rw-r--r--  arch/x86/kernel/irqinit.c | 2
-rw-r--r--  arch/x86/kernel/kgdb.c | 297
-rw-r--r--  arch/x86/kernel/kprobes.c | 51
-rw-r--r--  arch/x86/kernel/kvmclock.c | 56
-rw-r--r--  arch/x86/kernel/microcode_core.c | 5
-rw-r--r--  arch/x86/kernel/microcode_intel.c | 22
-rw-r--r--  arch/x86/kernel/mpparse.c | 25
-rw-r--r--  arch/x86/kernel/mrst.c | 117
-rw-r--r--  arch/x86/kernel/msr.c | 2
-rw-r--r--  arch/x86/kernel/olpc.c | 20
-rw-r--r--  arch/x86/kernel/olpc_ofw.c | 106
-rw-r--r--  arch/x86/kernel/pci-calgary_64.c | 17
-rw-r--r--  arch/x86/kernel/pci-swiotlb.c | 2
-rw-r--r--  arch/x86/kernel/process.c | 92
-rw-r--r--  arch/x86/kernel/process_32.c | 14
-rw-r--r--  arch/x86/kernel/process_64.c | 15
-rw-r--r--  arch/x86/kernel/ptrace.c | 384
-rw-r--r--  arch/x86/kernel/pvclock.c | 37
-rw-r--r--  arch/x86/kernel/quirks.c | 3
-rw-r--r--  arch/x86/kernel/reboot.c | 8
-rw-r--r--  arch/x86/kernel/setup.c | 18
-rw-r--r--  arch/x86/kernel/setup_percpu.c | 25
-rw-r--r--  arch/x86/kernel/sfi.c | 4
-rw-r--r--  arch/x86/kernel/smpboot.c | 43
-rw-r--r--  arch/x86/kernel/stacktrace.c | 31
-rw-r--r--  arch/x86/kernel/step.c | 46
-rw-r--r--  arch/x86/kernel/syscall_table_32.S | 3
-rw-r--r--  arch/x86/kernel/tboot.c | 21
-rw-r--r--  arch/x86/kernel/tlb_uv.c | 1280
-rw-r--r--  arch/x86/kernel/traps.c | 215
-rw-r--r--  arch/x86/kernel/tsc.c | 5
-rw-r--r--  arch/x86/kernel/uv_irq.c | 12
-rw-r--r--  arch/x86/kernel/verify_cpu_64.S | 3
-rw-r--r--  arch/x86/kernel/vmlinux.lds.S | 4
-rw-r--r--  arch/x86/kernel/vsyscall_64.c | 17
-rw-r--r--  arch/x86/kernel/x8664_ksyms_64.c | 1
-rw-r--r--  arch/x86/kernel/x86_init.c | 7
-rw-r--r--  arch/x86/kernel/xsave.c | 203
120 files changed, 6334 insertions, 4895 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 4c58352209e0..0925676266bd 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -47,8 +47,6 @@ obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o
 obj-y				+= process.o
 obj-y				+= i387.o xsave.o
 obj-y				+= ptrace.o
-obj-$(CONFIG_X86_DS)		+= ds.o
-obj-$(CONFIG_X86_DS_SELFTEST)	+= ds_selftest.o
 obj-$(CONFIG_X86_32)		+= tls.o
 obj-$(CONFIG_IA32_EMULATION)	+= tls.o
 obj-y				+= step.o
@@ -106,6 +104,7 @@ obj-$(CONFIG_SCx200) += scx200.o
 scx200-y			+= scx200_32.o
 
 obj-$(CONFIG_OLPC)		+= olpc.o
+obj-$(CONFIG_OLPC_OPENFIRMWARE)	+= olpc_ofw.o
 obj-$(CONFIG_X86_MRST)		+= mrst.o
 
 microcode-y			:= microcode_core.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index cd40aba6aa95..c05872aa3ce0 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -63,7 +63,6 @@ EXPORT_SYMBOL(acpi_disabled);
 int acpi_noirq;				/* skip ACPI IRQ initialization */
 int acpi_pci_disabled;		/* skip ACPI PCI scan and IRQ initialization */
 EXPORT_SYMBOL(acpi_pci_disabled);
-int acpi_ht __initdata = 1;	/* enable HT */
 
 int acpi_lapic;
 int acpi_ioapic;
@@ -94,6 +93,53 @@ enum acpi_irq_model_id acpi_irq_model = ACPI_IRQ_MODEL_PIC;
 
 
 /*
+ * ISA irqs by default are the first 16 gsis but can be
+ * any gsi as specified by an interrupt source override.
+ */
+static u32 isa_irq_to_gsi[NR_IRQS_LEGACY] __read_mostly = {
+	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+};
+
+static unsigned int gsi_to_irq(unsigned int gsi)
+{
+	unsigned int irq = gsi + NR_IRQS_LEGACY;
+	unsigned int i;
+
+	for (i = 0; i < NR_IRQS_LEGACY; i++) {
+		if (isa_irq_to_gsi[i] == gsi) {
+			return i;
+		}
+	}
+
+	/* Provide an identity mapping of gsi == irq
+	 * except on truly weird platforms that have
+	 * non isa irqs in the first 16 gsis.
+	 */
+	if (gsi >= NR_IRQS_LEGACY)
+		irq = gsi;
+	else
+		irq = gsi_top + gsi;
+
+	return irq;
+}
+
+static u32 irq_to_gsi(int irq)
+{
+	unsigned int gsi;
+
+	if (irq < NR_IRQS_LEGACY)
+		gsi = isa_irq_to_gsi[irq];
+	else if (irq < gsi_top)
+		gsi = irq;
+	else if (irq < (gsi_top + NR_IRQS_LEGACY))
+		gsi = irq - gsi_top;
+	else
+		gsi = 0xffffffff;
+
+	return gsi;
+}
+
+/*
  * Temporarily use the virtual area starting from FIX_IO_APIC_BASE_END,
  * to map the target physical address. The problem is that set_fixmap()
  * provides a single page, and it is possible that the page is not
@@ -313,7 +359,7 @@ acpi_parse_ioapic(struct acpi_subtable_header * header, const unsigned long end)
 /*
  * Parse Interrupt Source Override for the ACPI SCI
  */
-static void __init acpi_sci_ioapic_setup(u32 gsi, u16 polarity, u16 trigger)
+static void __init acpi_sci_ioapic_setup(u8 bus_irq, u16 polarity, u16 trigger, u32 gsi)
 {
 	if (trigger == 0)	/* compatible SCI trigger is level */
 		trigger = 3;
@@ -333,7 +379,7 @@ static void __init acpi_sci_ioapic_setup(u32 gsi, u16 polarity, u16 trigger)
 	 * If GSI is < 16, this will update its flags,
 	 * else it will create a new mp_irqs[] entry.
 	 */
-	mp_override_legacy_irq(gsi, polarity, trigger, gsi);
+	mp_override_legacy_irq(bus_irq, polarity, trigger, gsi);
 
 	/*
 	 * stash over-ride to indicate we've been here
@@ -357,9 +403,10 @@ acpi_parse_int_src_ovr(struct acpi_subtable_header * header,
 	acpi_table_print_madt_entry(header);
 
 	if (intsrc->source_irq == acpi_gbl_FADT.sci_interrupt) {
-		acpi_sci_ioapic_setup(intsrc->global_irq,
+		acpi_sci_ioapic_setup(intsrc->source_irq,
 				      intsrc->inti_flags & ACPI_MADT_POLARITY_MASK,
-				      (intsrc->inti_flags & ACPI_MADT_TRIGGER_MASK) >> 2);
+				      (intsrc->inti_flags & ACPI_MADT_TRIGGER_MASK) >> 2,
+				      intsrc->global_irq);
 		return 0;
 	}
 
@@ -448,7 +495,7 @@ void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger)
 
 int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
 {
-	*irq = gsi;
+	*irq = gsi_to_irq(gsi);
 
 #ifdef CONFIG_X86_IO_APIC
 	if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC)
@@ -458,6 +505,14 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
 	return 0;
 }
 
+int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi)
+{
+	if (isa_irq >= 16)
+		return -1;
+	*gsi = irq_to_gsi(isa_irq);
+	return 0;
+}
+
 /*
  * success: return IRQ number (>=0)
  * failure: return < 0
@@ -482,7 +537,7 @@ int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
 		plat_gsi = mp_register_gsi(dev, gsi, trigger, polarity);
 	}
 #endif
-	irq = plat_gsi;
+	irq = gsi_to_irq(plat_gsi);
 
 	return irq;
 }
@@ -867,29 +922,6 @@ static int __init acpi_parse_madt_lapic_entries(void)
 extern int es7000_plat;
 #endif
 
-int __init acpi_probe_gsi(void)
-{
-	int idx;
-	int gsi;
-	int max_gsi = 0;
-
-	if (acpi_disabled)
-		return 0;
-
-	if (!acpi_ioapic)
-		return 0;
-
-	max_gsi = 0;
-	for (idx = 0; idx < nr_ioapics; idx++) {
-		gsi = mp_gsi_routing[idx].gsi_end;
-
-		if (gsi > max_gsi)
-			max_gsi = gsi;
-	}
-
-	return max_gsi + 1;
-}
-
 static void assign_to_mp_irq(struct mpc_intsrc *m,
 			     struct mpc_intsrc *mp_irq)
 {
@@ -947,13 +979,13 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
 	mp_irq.dstirq = pin;	/* INTIN# */
 
 	save_mp_irq(&mp_irq);
+
+	isa_irq_to_gsi[bus_irq] = gsi;
 }
 
 void __init mp_config_acpi_legacy_irqs(void)
 {
 	int i;
-	int ioapic;
-	unsigned int dstapic;
 	struct mpc_intsrc mp_irq;
 
 #if defined (CONFIG_MCA) || defined (CONFIG_EISA)
@@ -974,19 +1006,27 @@ void __init mp_config_acpi_legacy_irqs(void)
 #endif
 
 	/*
-	 * Locate the IOAPIC that manages the ISA IRQs (0-15).
-	 */
-	ioapic = mp_find_ioapic(0);
-	if (ioapic < 0)
-		return;
-	dstapic = mp_ioapics[ioapic].apicid;
-
-	/*
 	 * Use the default configuration for the IRQs 0-15.  Unless
 	 * overridden by (MADT) interrupt source override entries.
 	 */
 	for (i = 0; i < 16; i++) {
+		int ioapic, pin;
+		unsigned int dstapic;
 		int idx;
+		u32 gsi;
+
+		/* Locate the gsi that irq i maps to. */
+		if (acpi_isa_irq_to_gsi(i, &gsi))
+			continue;
+
+		/*
+		 * Locate the IOAPIC that manages the ISA IRQ.
+		 */
+		ioapic = mp_find_ioapic(gsi);
+		if (ioapic < 0)
+			continue;
+		pin = mp_find_ioapic_pin(ioapic, gsi);
+		dstapic = mp_ioapics[ioapic].apicid;
 
 		for (idx = 0; idx < mp_irq_entries; idx++) {
 			struct mpc_intsrc *irq = mp_irqs + idx;
@@ -996,7 +1036,7 @@ void __init mp_config_acpi_legacy_irqs(void)
 				break;
 
 			/* Do we already have a mapping for this IOAPIC pin */
-			if (irq->dstapic == dstapic && irq->dstirq == i)
+			if (irq->dstapic == dstapic && irq->dstirq == pin)
 				break;
 		}
 
@@ -1011,7 +1051,7 @@ void __init mp_config_acpi_legacy_irqs(void)
 		mp_irq.dstapic = dstapic;
 		mp_irq.irqtype = mp_INT;
 		mp_irq.srcbusirq = i; /* Identity mapped */
-		mp_irq.dstirq = i;
+		mp_irq.dstirq = pin;
 
 		save_mp_irq(&mp_irq);
 	}
@@ -1076,11 +1116,6 @@ int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
 
 	ioapic_pin = mp_find_ioapic_pin(ioapic, gsi);
 
-#ifdef CONFIG_X86_32
-	if (ioapic_renumber_irq)
-		gsi = ioapic_renumber_irq(ioapic, gsi);
-#endif
-
 	if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
 		printk(KERN_ERR "Invalid reference to IOAPIC pin "
 		       "%d-%d\n", mp_ioapics[ioapic].apicid,
@@ -1094,7 +1129,7 @@ int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
 	set_io_apic_irq_attr(&irq_attr, ioapic, ioapic_pin,
 			     trigger == ACPI_EDGE_SENSITIVE ? 0 : 1,
 			     polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
-	io_apic_set_pci_routing(dev, gsi, &irq_attr);
+	io_apic_set_pci_routing(dev, gsi_to_irq(gsi), &irq_attr);
 
 	return gsi;
 }
@@ -1154,7 +1189,8 @@ static int __init acpi_parse_madt_ioapic_entries(void)
 	 * pretend we got one so we can set the SCI flags.
 	 */
 	if (!acpi_sci_override_gsi)
-		acpi_sci_ioapic_setup(acpi_gbl_FADT.sci_interrupt, 0, 0);
+		acpi_sci_ioapic_setup(acpi_gbl_FADT.sci_interrupt, 0, 0,
+				      acpi_gbl_FADT.sci_interrupt);
 
 	/* Fill in identity legacy mappings where no override */
 	mp_config_acpi_legacy_irqs();
@@ -1464,9 +1500,8 @@ void __init acpi_boot_table_init(void)
 
 	/*
 	 * If acpi_disabled, bail out
-	 * One exception: acpi=ht continues far enough to enumerate LAPICs
 	 */
-	if (acpi_disabled && !acpi_ht)
+	if (acpi_disabled)
 		return;
 
 	/*
@@ -1497,9 +1532,8 @@ int __init early_acpi_boot_init(void)
 {
 	/*
 	 * If acpi_disabled, bail out
-	 * One exception: acpi=ht continues far enough to enumerate LAPICs
 	 */
-	if (acpi_disabled && !acpi_ht)
+	if (acpi_disabled)
 		return 1;
 
 	/*
@@ -1517,9 +1551,8 @@ int __init acpi_boot_init(void)
 
 	/*
	 * If acpi_disabled, bail out
-	 * One exception: acpi=ht continues far enough to enumerate LAPICs
 	 */
-	if (acpi_disabled && !acpi_ht)
+	if (acpi_disabled)
 		return 1;
 
 	acpi_table_parse(ACPI_SIG_BOOT, acpi_parse_sbf);
@@ -1554,21 +1587,12 @@ static int __init parse_acpi(char *arg)
 	/* acpi=force to over-ride black-list */
 	else if (strcmp(arg, "force") == 0) {
 		acpi_force = 1;
-		acpi_ht = 1;
 		acpi_disabled = 0;
 	}
 	/* acpi=strict disables out-of-spec workarounds */
 	else if (strcmp(arg, "strict") == 0) {
 		acpi_strict = 1;
 	}
-	/* Limit ACPI just to boot-time to enable HT */
-	else if (strcmp(arg, "ht") == 0) {
-		if (!acpi_force) {
-			printk(KERN_WARNING "acpi=ht will be removed in Linux-2.6.35\n");
-			disable_acpi();
-		}
-		acpi_ht = 1;
-	}
 	/* acpi=rsdt use RSDT instead of XSDT */
 	else if (strcmp(arg, "rsdt") == 0) {
 		acpi_rsdt_forced = 1;
@@ -1576,6 +1600,10 @@ static int __init parse_acpi(char *arg)
 	/* "acpi=noirq" disables ACPI interrupt routing */
 	else if (strcmp(arg, "noirq") == 0) {
 		acpi_noirq_set();
+	}
+	/* "acpi=copy_dsdt" copys DSDT */
+	else if (strcmp(arg, "copy_dsdt") == 0) {
+		acpi_gbl_copy_dsdt_locally = 1;
 	} else {
 		/* Core will printk when we return error. */
 		return -EINVAL;
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index 2e837f5080fe..fb7a5f052e2b 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -145,6 +145,15 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu,
 		percpu_entry->states[cx->index].eax = cx->address;
 		percpu_entry->states[cx->index].ecx = MWAIT_ECX_INTERRUPT_BREAK;
 	}
+
+	/*
+	 * For _CST FFH on Intel, if GAS.access_size bit 1 is cleared,
+	 * then we should skip checking BM_STS for this C-state.
+	 * ref: "Intel Processor Vendor-Specific ACPI Interface Specification"
+	 */
+	if ((c->x86_vendor == X86_VENDOR_INTEL) && !(reg->access_size & 0x2))
+		cx->bm_sts_skip = 1;
+
 	return retval;
 }
 EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe);
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.S b/arch/x86/kernel/acpi/realmode/wakeup.S
index 580b4e296010..28595d6df47c 100644
--- a/arch/x86/kernel/acpi/realmode/wakeup.S
+++ b/arch/x86/kernel/acpi/realmode/wakeup.S
@@ -104,7 +104,7 @@ _start:
 	movl	%eax, %ecx
 	orl	%edx, %ecx
 	jz	1f
-	movl	$0xc0000080, %ecx
+	movl	$MSR_EFER, %ecx
 	wrmsr
 1:
 
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index f9961034e557..33cec152070d 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -2,7 +2,7 @@
  * sleep.c - x86-specific ACPI sleep support.
  *
  *  Copyright (C) 2001-2003 Patrick Mochel
- *  Copyright (C) 2001-2003 Pavel Machek <pavel@suse.cz>
+ *  Copyright (C) 2001-2003 Pavel Machek <pavel@ucw.cz>
  */
 
 #include <linux/acpi.h>
@@ -157,13 +157,16 @@ static int __init acpi_sleep_setup(char *str)
 #ifdef CONFIG_HIBERNATION
 	if (strncmp(str, "s4_nohwsig", 10) == 0)
 		acpi_no_s4_hw_signature();
-	if (strncmp(str, "s4_nonvs", 8) == 0)
-		acpi_s4_no_nvs();
+	if (strncmp(str, "s4_nonvs", 8) == 0) {
+		pr_warning("ACPI: acpi_sleep=s4_nonvs is deprecated, "
+				"please use acpi_sleep=nonvs instead");
+		acpi_nvs_nosave();
+	}
 #endif
+	if (strncmp(str, "nonvs", 5) == 0)
+		acpi_nvs_nosave();
 	if (strncmp(str, "old_ordering", 12) == 0)
 		acpi_old_suspend_ordering();
-	if (strncmp(str, "sci_force_enable", 16) == 0)
-		acpi_set_sci_en_on_resume();
 	str = strchr(str, ',');
 	if (str != NULL)
 		str += strspn(str, ", \t");
diff --git a/arch/x86/kernel/acpi/wakeup_32.S b/arch/x86/kernel/acpi/wakeup_32.S
index 8ded418b0593..13ab720573e3 100644
--- a/arch/x86/kernel/acpi/wakeup_32.S
+++ b/arch/x86/kernel/acpi/wakeup_32.S
@@ -1,4 +1,4 @@
-	.section .text.page_aligned
+	.section .text..page_aligned
 #include <linux/linkage.h>
 #include <asm/segment.h>
 #include <asm/page_types.h>
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 1a160d5d44d0..f65ab8b014c4 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -194,7 +194,7 @@ static void __init_or_module add_nops(void *insns, unsigned int len)
 }
 
 extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
-extern u8 *__smp_locks[], *__smp_locks_end[];
+extern s32 __smp_locks[], __smp_locks_end[];
 static void *text_poke_early(void *addr, const void *opcode, size_t len);
 
 /* Replace instructions with better alternatives for this CPU type.
@@ -214,6 +214,7 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
 		u8 *instr = a->instr;
 		BUG_ON(a->replacementlen > a->instrlen);
 		BUG_ON(a->instrlen > sizeof(insnbuf));
+		BUG_ON(a->cpuid >= NCAPINTS*32);
 		if (!boot_cpu_has(a->cpuid))
 			continue;
 #ifdef CONFIG_X86_64
@@ -235,37 +236,41 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
 
 #ifdef CONFIG_SMP
 
-static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end)
+static void alternatives_smp_lock(const s32 *start, const s32 *end,
+				  u8 *text, u8 *text_end)
 {
-	u8 **ptr;
+	const s32 *poff;
 
 	mutex_lock(&text_mutex);
-	for (ptr = start; ptr < end; ptr++) {
-		if (*ptr < text)
-			continue;
-		if (*ptr > text_end)
+	for (poff = start; poff < end; poff++) {
+		u8 *ptr = (u8 *)poff + *poff;
+
+		if (!*poff || ptr < text || ptr >= text_end)
 			continue;
 		/* turn DS segment override prefix into lock prefix */
-		text_poke(*ptr, ((unsigned char []){0xf0}), 1);
+		if (*ptr == 0x3e)
+			text_poke(ptr, ((unsigned char []){0xf0}), 1);
 	};
 	mutex_unlock(&text_mutex);
 }
 
-static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end)
+static void alternatives_smp_unlock(const s32 *start, const s32 *end,
+				    u8 *text, u8 *text_end)
 {
-	u8 **ptr;
+	const s32 *poff;
 
 	if (noreplace_smp)
 		return;
 
 	mutex_lock(&text_mutex);
-	for (ptr = start; ptr < end; ptr++) {
-		if (*ptr < text)
-			continue;
-		if (*ptr > text_end)
+	for (poff = start; poff < end; poff++) {
+		u8 *ptr = (u8 *)poff + *poff;
+
+		if (!*poff || ptr < text || ptr >= text_end)
 			continue;
 		/* turn lock prefix into DS segment override prefix */
-		text_poke(*ptr, ((unsigned char []){0x3E}), 1);
+		if (*ptr == 0xf0)
+			text_poke(ptr, ((unsigned char []){0x3E}), 1);
 	};
 	mutex_unlock(&text_mutex);
 }
@@ -276,8 +281,8 @@ struct smp_alt_module {
 	char *name;
 
 	/* ptrs to lock prefixes */
-	u8 **locks;
-	u8 **locks_end;
+	const s32 *locks;
+	const s32 *locks_end;
 
 	/* .text segment, needed to avoid patching init code ;) */
 	u8 *text;
@@ -398,16 +403,19 @@ void alternatives_smp_switch(int smp)
 int alternatives_text_reserved(void *start, void *end)
 {
 	struct smp_alt_module *mod;
-	u8 **ptr;
+	const s32 *poff;
 	u8 *text_start = start;
 	u8 *text_end = end;
 
 	list_for_each_entry(mod, &smp_alt_modules, next) {
 		if (mod->text > text_end || mod->text_end < text_start)
 			continue;
-		for (ptr = mod->locks; ptr < mod->locks_end; ptr++)
-			if (text_start <= *ptr && text_end >= *ptr)
+		for (poff = mod->locks; poff < mod->locks_end; poff++) {
+			const u8 *ptr = (const u8 *)poff + *poff;
+
+			if (text_start <= ptr && text_end > ptr)
 				return 1;
+		}
 	}
 
 	return 0;
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index f854d89b7edf..fa044e1e30a2 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -731,18 +731,22 @@ static bool increase_address_space(struct protection_domain *domain,
 
 static u64 *alloc_pte(struct protection_domain *domain,
 		      unsigned long address,
-		      int end_lvl,
+		      unsigned long page_size,
 		      u64 **pte_page,
 		      gfp_t gfp)
 {
+	int level, end_lvl;
 	u64 *pte, *page;
-	int level;
+
+	BUG_ON(!is_power_of_2(page_size));
 
 	while (address > PM_LEVEL_SIZE(domain->mode))
 		increase_address_space(domain, gfp);
 
 	level = domain->mode - 1;
 	pte   = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
+	address = PAGE_SIZE_ALIGN(address, page_size);
+	end_lvl = PAGE_SIZE_LEVEL(page_size);
 
 	while (level > end_lvl) {
 		if (!IOMMU_PTE_PRESENT(*pte)) {
@@ -752,6 +756,10 @@ static u64 *alloc_pte(struct protection_domain *domain,
 			*pte = PM_LEVEL_PDE(level, virt_to_phys(page));
 		}
 
+		/* No level skipping support yet */
+		if (PM_PTE_LEVEL(*pte) != level)
+			return NULL;
+
 		level -= 1;
 
 		pte = IOMMU_PTE_PAGE(*pte);
@@ -769,28 +777,47 @@ static u64 *alloc_pte(struct protection_domain *domain,
  * This function checks if there is a PTE for a given dma address. If
  * there is one, it returns the pointer to it.
  */
-static u64 *fetch_pte(struct protection_domain *domain,
-		      unsigned long address, int map_size)
+static u64 *fetch_pte(struct protection_domain *domain, unsigned long address)
 {
 	int level;
 	u64 *pte;
 
-	level = domain->mode - 1;
-	pte   = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
+	if (address > PM_LEVEL_SIZE(domain->mode))
+		return NULL;
 
-	while (level > map_size) {
+	level = domain->mode - 1;
+	pte   = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
+
+	while (level > 0) {
+
+		/* Not Present */
 		if (!IOMMU_PTE_PRESENT(*pte))
 			return NULL;
 
+		/* Large PTE */
+		if (PM_PTE_LEVEL(*pte) == 0x07) {
+			unsigned long pte_mask, __pte;
+
+			/*
+			 * If we have a series of large PTEs, make
+			 * sure to return a pointer to the first one.
+			 */
+			pte_mask = PTE_PAGE_SIZE(*pte);
+			pte_mask = ~((PAGE_SIZE_PTE_COUNT(pte_mask) << 3) - 1);
+			__pte    = ((unsigned long)pte) & pte_mask;
+
+			return (u64 *)__pte;
+		}
+
+		/* No level skipping support yet */
+		if (PM_PTE_LEVEL(*pte) != level)
+			return NULL;
+
 		level -= 1;
 
+		/* Walk to the next level */
 		pte = IOMMU_PTE_PAGE(*pte);
 		pte = &pte[PM_LEVEL_INDEX(level, address)];
-
-		if ((PM_PTE_LEVEL(*pte) == 0) && level != map_size) {
-			pte = NULL;
-			break;
-		}
 	}
 
 	return pte;
@@ -807,44 +834,84 @@ static int iommu_map_page(struct protection_domain *dom,
 			  unsigned long bus_addr,
 			  unsigned long phys_addr,
 			  int prot,
-			  int map_size)
+			  unsigned long page_size)
 {
 	u64 __pte, *pte;
-
-	bus_addr  = PAGE_ALIGN(bus_addr);
-	phys_addr = PAGE_ALIGN(phys_addr);
-
-	BUG_ON(!PM_ALIGNED(map_size, bus_addr));
-	BUG_ON(!PM_ALIGNED(map_size, phys_addr));
+	int i, count;
 
 	if (!(prot & IOMMU_PROT_MASK))
 		return -EINVAL;
 
-	pte = alloc_pte(dom, bus_addr, map_size, NULL, GFP_KERNEL);
+	bus_addr  = PAGE_ALIGN(bus_addr);
+	phys_addr = PAGE_ALIGN(phys_addr);
+	count     = PAGE_SIZE_PTE_COUNT(page_size);
+	pte       = alloc_pte(dom, bus_addr, page_size, NULL, GFP_KERNEL);
+
+	for (i = 0; i < count; ++i)
+		if (IOMMU_PTE_PRESENT(pte[i]))
+			return -EBUSY;
 
-	if (IOMMU_PTE_PRESENT(*pte))
-		return -EBUSY;
+	if (page_size > PAGE_SIZE) {
+		__pte = PAGE_SIZE_PTE(phys_addr, page_size);
+		__pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_P | IOMMU_PTE_FC;
+	} else
+		__pte = phys_addr | IOMMU_PTE_P | IOMMU_PTE_FC;
 
-	__pte = phys_addr | IOMMU_PTE_P;
 	if (prot & IOMMU_PROT_IR)
 		__pte |= IOMMU_PTE_IR;
 	if (prot & IOMMU_PROT_IW)
 		__pte |= IOMMU_PTE_IW;
 
-	*pte = __pte;
+	for (i = 0; i < count; ++i)
+		pte[i] = __pte;
 
 	update_domain(dom);
 
 	return 0;
 }
 
-static void iommu_unmap_page(struct protection_domain *dom,
-			     unsigned long bus_addr, int map_size)
+static unsigned long iommu_unmap_page(struct protection_domain *dom,
+				      unsigned long bus_addr,
+				      unsigned long page_size)
 {
-	u64 *pte = fetch_pte(dom, bus_addr, map_size);
+	unsigned long long unmap_size, unmapped;
+	u64 *pte;
+
+	BUG_ON(!is_power_of_2(page_size));
+
+	unmapped = 0;
+
+	while (unmapped < page_size) {
+
+		pte = fetch_pte(dom, bus_addr);
+
+		if (!pte) {
+			/*
+			 * No PTE for this address
+			 * move forward in 4kb steps
+			 */
+			unmap_size = PAGE_SIZE;
+		} else if (PM_PTE_LEVEL(*pte) == 0) {
+			/* 4kb PTE found for this address */
+			unmap_size = PAGE_SIZE;
+			*pte       = 0ULL;
+		} else {
+			int count, i;
+
+			/* Large PTE found which maps this address */
+			unmap_size = PTE_PAGE_SIZE(*pte);
+			count      = PAGE_SIZE_PTE_COUNT(unmap_size);
+			for (i = 0; i < count; i++)
+				pte[i] = 0ULL;
+		}
 
-	if (pte)
-		*pte = 0;
+		bus_addr  = (bus_addr & ~(unmap_size - 1)) + unmap_size;
+		unmapped += unmap_size;
+	}
+
+	BUG_ON(!is_power_of_2(unmapped));
+
+	return unmapped;
 }
 
 /*
@@ -878,7 +945,7 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
 	for (addr = e->address_start; addr < e->address_end;
 	     addr += PAGE_SIZE) {
 		ret = iommu_map_page(&dma_dom->domain, addr, addr, e->prot,
-				     PM_MAP_4k);
+				     PAGE_SIZE);
 		if (ret)
 			return ret;
 		/*
@@ -1006,7 +1073,7 @@ static int alloc_new_range(struct dma_ops_domain *dma_dom,
 		u64 *pte, *pte_page;
 
 		for (i = 0; i < num_ptes; ++i) {
-			pte = alloc_pte(&dma_dom->domain, address, PM_MAP_4k,
+			pte = alloc_pte(&dma_dom->domain, address, PAGE_SIZE,
 					&pte_page, gfp);
 			if (!pte)
 				goto out_free;
@@ -1042,7 +1109,7 @@ static int alloc_new_range(struct dma_ops_domain *dma_dom,
 	for (i = dma_dom->aperture[index]->offset;
 	     i < dma_dom->aperture_size;
 	     i += PAGE_SIZE) {
-		u64 *pte = fetch_pte(&dma_dom->domain, i, PM_MAP_4k);
+		u64 *pte = fetch_pte(&dma_dom->domain, i);
 		if (!pte || !IOMMU_PTE_PRESENT(*pte))
 			continue;
 
@@ -1420,6 +1487,7 @@ static int __attach_device(struct device *dev,
 			   struct protection_domain *domain)
 {
 	struct iommu_dev_data *dev_data, *alias_data;
+	int ret;
 
 	dev_data = get_dev_data(dev);
 	alias_data = get_dev_data(dev_data->alias);
@@ -1431,13 +1499,14 @@ static int __attach_device(struct device *dev,
 	spin_lock(&domain->lock);
 
 	/* Some sanity checks */
+	ret = -EBUSY;
 	if (alias_data->domain != NULL &&
 	    alias_data->domain != domain)
-		return -EBUSY;
+		goto out_unlock;
 
 	if (dev_data->domain != NULL &&
 	    dev_data->domain != domain)
-		return -EBUSY;
+		goto out_unlock;
 
 	/* Do real assignment */
 	if (dev_data->alias != dev) {
@@ -1453,10 +1522,14 @@ static int __attach_device(struct device *dev,
 
 	atomic_inc(&dev_data->bind);
 
+	ret = 0;
+
+out_unlock:
+
 	/* ready */
 	spin_unlock(&domain->lock);
 
-	return 0;
+	return ret;
 }
 
 /*
@@ -1712,7 +1785,7 @@ static u64* dma_ops_get_pte(struct dma_ops_domain *dom,
 
 	pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)];
 	if (!pte) {
-		pte = alloc_pte(&dom->domain, address, PM_MAP_4k, &pte_page,
+		pte = alloc_pte(&dom->domain, address, PAGE_SIZE, &pte_page,
 				GFP_ATOMIC);
 		aperture->pte_pages[APERTURE_PAGE_INDEX(address)] = pte_page;
 	} else
@@ -2257,10 +2330,6 @@ int __init amd_iommu_init_dma_ops(void)
 
 	iommu_detected = 1;
 	swiotlb = 0;
-#ifdef CONFIG_GART_IOMMU
-	gart_iommu_aperture_disabled = 1;
-	gart_iommu_aperture = 0;
-#endif
 
 	/* Make the driver finally visible to the drivers */
 	dma_ops = &amd_iommu_dma_ops;
@@ -2439,12 +2508,11 @@ static int amd_iommu_attach_device(struct iommu_domain *dom,
 	return ret;
 }
 
-static int amd_iommu_map_range(struct iommu_domain *dom,
-			       unsigned long iova, phys_addr_t paddr,
-			       size_t size, int iommu_prot)
+static int amd_iommu_map(struct iommu_domain *dom, unsigned long iova,
+			 phys_addr_t paddr, int gfp_order, int iommu_prot)
 {
+	unsigned long page_size = 0x1000UL << gfp_order;
 	struct protection_domain *domain = dom->priv;
-	unsigned long i,  npages = iommu_num_pages(paddr, size, PAGE_SIZE);
 	int prot = 0;
 	int ret;
 
@@ -2453,61 +2521,50 @@ static int amd_iommu_map_range(struct iommu_domain *dom,
 	if (iommu_prot & IOMMU_WRITE)
 		prot |= IOMMU_PROT_IW;
 
-	iova  &= PAGE_MASK;
-	paddr &= PAGE_MASK;
-
 	mutex_lock(&domain->api_lock);
-
-	for (i = 0; i < npages; ++i) {
-		ret = iommu_map_page(domain, iova, paddr, prot, PM_MAP_4k);
-		if (ret)
-			return ret;
-
-		iova  += PAGE_SIZE;
-		paddr += PAGE_SIZE;
-	}
-
+	ret = iommu_map_page(domain, iova, paddr, prot, page_size);
 	mutex_unlock(&domain->api_lock);
 
-	return 0;
+	return ret;
 }
 
-static void amd_iommu_unmap_range(struct iommu_domain *dom,
-				  unsigned long iova, size_t size)
+static int amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova,
+			   int gfp_order)
 {
-
 	struct protection_domain *domain = dom->priv;
-	unsigned long i,  npages = iommu_num_pages(iova, size, PAGE_SIZE);
+	unsigned long page_size, unmap_size;
 
-	iova  &= PAGE_MASK;
+	page_size  = 0x1000UL << gfp_order;
 
 	mutex_lock(&domain->api_lock);
-
-	for (i = 0; i < npages; ++i) {
-		iommu_unmap_page(domain, iova, PM_MAP_4k);
-		iova  += PAGE_SIZE;
-	}
+	unmap_size = iommu_unmap_page(domain, iova, page_size);
+	mutex_unlock(&domain->api_lock);
 
 	iommu_flush_tlb_pde(domain);
 
-	mutex_unlock(&domain->api_lock);
+	return get_order(unmap_size);
 }
 
 static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
					  unsigned long iova)
 {
 	struct protection_domain *domain = dom->priv;
-	unsigned long offset = iova & ~PAGE_MASK;
+	unsigned long offset_mask;
 	phys_addr_t paddr;
-	u64 *pte;
+	u64 *pte, __pte;
 
-	pte = fetch_pte(domain, iova, PM_MAP_4k);
+	pte = fetch_pte(domain, iova);
 
 	if (!pte || !IOMMU_PTE_PRESENT(*pte))
 		return 0;
 
-	paddr  = *pte & IOMMU_PAGE_MASK;
-	paddr |= offset;
+	if (PM_PTE_LEVEL(*pte) == 0)
+		offset_mask = PAGE_SIZE - 1;
+	else
+		offset_mask = PTE_PAGE_SIZE(*pte) - 1;
+
+	__pte = *pte & PM_ADDR_MASK;
+	paddr = (__pte & ~offset_mask) | (iova & offset_mask);
 
 	return paddr;
 }
@@ -2515,6 +2572,11 @@ static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
 static int amd_iommu_domain_has_cap(struct iommu_domain *domain,
				    unsigned long cap)
 {
+	switch (cap) {
+	case IOMMU_CAP_CACHE_COHERENCY:
+		return 1;
+	}
+
 	return 0;
 }
 
@@ -2523,8 +2585,8 @@ static struct iommu_ops amd_iommu_ops = {
 	.domain_destroy = amd_iommu_domain_destroy,
 	.attach_dev = amd_iommu_attach_device,
 	.detach_dev = amd_iommu_detach_device,
-	.map = amd_iommu_map_range,
-	.unmap = amd_iommu_unmap_range,
+	.map = amd_iommu_map,
+	.unmap = amd_iommu_unmap,
 	.iova_to_phys = amd_iommu_iova_to_phys,
 	.domain_has_cap = amd_iommu_domain_has_cap,
 };
@@ -2552,8 +2614,7 @@ int __init amd_iommu_init_passthrough(void)
 
 	pt_domain->mode |= PAGE_MODE_NONE;
 
-	while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
-
+	for_each_pci_dev(dev) {
 		if (!check_device(&dev->dev))
 			continue;
 
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 6360abf993d4..3cc63e2b8dd4 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -120,6 +120,7 @@ struct ivmd_header {
 bool amd_iommu_dump;
 
 static int __initdata amd_iommu_detected;
+static bool __initdata amd_iommu_disabled;
 
 u16 amd_iommu_last_bdf;			/* largest PCI device id we have
					   to handle */
@@ -286,8 +287,12 @@ static u8 * __init iommu_map_mmio_space(u64 address)
 {
 	u8 *ret;
 
-	if (!request_mem_region(address, MMIO_REGION_LENGTH, "amd_iommu"))
+	if (!request_mem_region(address, MMIO_REGION_LENGTH, "amd_iommu")) {
+		pr_err("AMD-Vi: Can not reserve memory region %llx for mmio\n",
+		       address);
+		pr_err("AMD-Vi: This is a BIOS bug. Please contact your hardware vendor\n");
 		return NULL;
+	}
 
 	ret = ioremap_nocache(address, MMIO_REGION_LENGTH);
 	if (ret != NULL)
@@ -1313,7 +1318,7 @@ static int __init amd_iommu_init(void)
 	ret = amd_iommu_init_dma_ops();
 
 	if (ret)
-		goto free;
+		goto free_disable;
 
 	amd_iommu_init_api();
 
@@ -1331,9 +1336,10 @@ static int __init amd_iommu_init(void)
 out:
 	return ret;
 
-free:
+free_disable:
 	disable_iommus();
 
+free:
 	amd_iommu_uninit_devices();
 
 	free_pages((unsigned long)amd_iommu_pd_alloc_bitmap,
@@ -1352,6 +1358,15 @@ free:
 
 	free_unity_maps();
 
+#ifdef CONFIG_GART_IOMMU
+	/*
+	 * We failed to initialize the AMD IOMMU - try fallback to GART
+	 * if possible.
+	 */
+	gart_iommu_init();
+
+#endif
+
 	goto out;
 }
 
@@ -1372,6 +1387,9 @@ void __init amd_iommu_detect(void)
 	if (no_iommu || (iommu_detected && !gart_iommu_aperture))
 		return;
 
+	if (amd_iommu_disabled)
+		return;
+
 	if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) {
 		iommu_detected = 1;
 		amd_iommu_detected = 1;
@@ -1401,6 +1419,8 @@ static int __init parse_amd_iommu_options(char *str)
 	for (; *str; ++str) {
 		if (strncmp(str, "fullflush", 9) == 0)
 			amd_iommu_unmap_flush = true;
+		if (strncmp(str, "off", 3) == 0)
+			amd_iommu_disabled = true;
 	}
 
 	return 1;
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index a35347501d36..8dd77800ff5d 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -43,10 +43,11 @@
 
 #include <asm/fixmap.h>
 #include <asm/apb_timer.h>
+#include <asm/mrst.h>
 
 #define APBT_MASK			CLOCKSOURCE_MASK(32)
 #define APBT_SHIFT			22
-#define APBT_CLOCKEVENT_RATING		150
+#define APBT_CLOCKEVENT_RATING		110
 #define APBT_CLOCKSOURCE_RATING		250
 #define APBT_MIN_DELTA_USEC		200
 
@@ -83,8 +84,6 @@ struct apbt_dev {
 	char name[10];
 };
 
-int disable_apbt_percpu __cpuinitdata;
-
 static DEFINE_PER_CPU(struct apbt_dev, cpu_apbt_dev);
 
 #ifdef CONFIG_SMP
@@ -195,29 +194,6 @@ static struct clock_event_device apbt_clockevent = {
 };
 
 /*
- * if user does not want to use per CPU apb timer, just give it a lower rating
- * than local apic timer and skip the late per cpu timer init.
- */
-static inline int __init setup_x86_mrst_timer(char *arg)
-{
-	if (!arg)
-		return -EINVAL;
-
-	if (strcmp("apbt_only", arg) == 0)
-		disable_apbt_percpu = 0;
-	else if (strcmp("lapic_and_apbt", arg) == 0)
-		disable_apbt_percpu = 1;
-	else {
-		pr_warning("X86 MRST timer option %s not recognised"
-			   " use x86_mrst_timer=apbt_only or lapic_and_apbt\n",
-			   arg);
-		return -EINVAL;
-	}
-	return 0;
-}
-__setup("x86_mrst_timer=", setup_x86_mrst_timer);
-
-/*
  * start count down from 0xffff_ffff. this is done by toggling the enable bit
  * then load initial load count to ~0.
  */
@@ -335,7 +311,7 @@ static int __init apbt_clockevent_register(void)
 	adev->num = smp_processor_id();
 	memcpy(&adev->evt, &apbt_clockevent, sizeof(struct clock_event_device));
 
-	if (disable_apbt_percpu) {
+	if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) {
 		apbt_clockevent.rating = APBT_CLOCKEVENT_RATING - 100;
 		global_clock_event = &adev->evt;
 		printk(KERN_DEBUG "%s clockevent registered as global\n",
@@ -429,7 +405,8 @@ static int apbt_cpuhp_notify(struct notifier_block *n,
 
 static __init int apbt_late_init(void)
 {
-	if (disable_apbt_percpu || !apb_timer_block_enabled)
+	if (mrst_timer_options == MRST_TIMER_LAPIC_APBT ||
+		!apb_timer_block_enabled)
 		return 0;
 	/* This notifier should be called after workqueue is ready */
 	hotcpu_notifier(apbt_cpuhp_notify, -20);
@@ -450,6 +427,8 @@ static void apbt_set_mode(enum clock_event_mode mode,
 	int timer_num;
 	struct apbt_dev *adev = EVT_TO_APBT_DEV(evt);
 
+	BUG_ON(!apbt_virt_address);
+
 	timer_num = adev->num;
 	pr_debug("%s CPU %d timer %d mode=%d\n",
		 __func__, first_cpu(*evt->cpumask), timer_num, mode);
@@ -676,7 +655,7 @@ void __init apbt_time_init(void)
 	}
 #ifdef CONFIG_SMP
 	/* kernel cmdline disable apb timer, so we will use lapic timers */
-	if (disable_apbt_percpu) {
+	if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) {
 		printk(KERN_INFO "apbt: disabled per cpu timer\n");
 		return;
 	}
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index b5d8b0bcf235..a2e0caf26e17 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -280,7 +280,7 @@ void __init early_gart_iommu_check(void)
 	 * or BIOS forget to put that in reserved.
 	 * try to update e820 to make that region as reserved.
 	 */
-	u32 agp_aper_base = 0, agp_aper_order = 0;
+	u32 agp_aper_order = 0;
 	int i, fix, slot, valid_agp = 0;
 	u32 ctl;
 	u32 aper_size = 0, aper_order = 0, last_aper_order = 0;
@@ -291,7 +291,7 @@ void __init early_gart_iommu_check(void)
 		return;
 
 	/* This is mostly duplicate of iommu_hole_init */
-	agp_aper_base = search_agp_bridge(&agp_aper_order, &valid_agp);
+	search_agp_bridge(&agp_aper_order, &valid_agp);
 
 	fix = 0;
 	for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
index 565c1bfc507d..910f20b457c4 100644
--- a/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@ -2,7 +2,12 @@
 # Makefile for local APIC drivers and for the IO-APIC code
 #
 
-obj-$(CONFIG_X86_LOCAL_APIC)	+= apic.o apic_noop.o probe_$(BITS).o ipi.o nmi.o
+obj-$(CONFIG_X86_LOCAL_APIC)	+= apic.o apic_noop.o probe_$(BITS).o ipi.o
+ifneq ($(CONFIG_HARDLOCKUP_DETECTOR),y)
+obj-$(CONFIG_X86_LOCAL_APIC)	+= nmi.o
+endif
+obj-$(CONFIG_HARDLOCKUP_DETECTOR)	+= hw_nmi.o
+
 obj-$(CONFIG_X86_IO_APIC)	+= io_apic.o
 obj-$(CONFIG_SMP)		+= ipi.o
 
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index e5a4a1e01618..980508c79082 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -51,6 +51,7 @@
 #include <asm/smp.h>
 #include <asm/mce.h>
 #include <asm/kvm_para.h>
+#include <asm/tsc.h>
 
 unsigned int num_processors;
 
@@ -459,7 +460,7 @@ static void lapic_timer_broadcast(const struct cpumask *mask)
 }
 
 /*
- * Setup the local APIC timer for this CPU. Copy the initilized values
+ * Setup the local APIC timer for this CPU. Copy the initialized values
  * of the boot CPU and register the clock event in the framework.
  */
 static void __cpuinit setup_APIC_timer(void)
@@ -920,7 +921,7 @@ void disable_local_APIC(void)
 	unsigned int value;
 
 	/* APIC hasn't been mapped yet */
-	if (!apic_phys)
+	if (!x2apic_mode && !apic_phys)
 		return;
 
 	clear_local_APIC();
@@ -1151,8 +1152,13 @@ static void __cpuinit lapic_setup_esr(void)
  */
 void __cpuinit setup_local_APIC(void)
 {
-	unsigned int value;
-	int i, j;
+	unsigned int value, queued;
+	int i, j, acked = 0;
+	unsigned long long tsc = 0, ntsc;
+	long long max_loops = cpu_khz;
+
+	if (cpu_has_tsc)
+		rdtscll(tsc);
 
 	if (disable_apic) {
 		arch_disable_smp_support();
@@ -1204,13 +1210,32 @@ void __cpuinit setup_local_APIC(void)
 	 * the interrupt. Hence a vector might get locked. It was noticed
 	 * for timer irq (vector 0x31). Issue an extra EOI to clear ISR.
 	 */
-	for (i = APIC_ISR_NR - 1; i >= 0; i--) {
-		value = apic_read(APIC_ISR + i*0x10);
-		for (j = 31; j >= 0; j--) {
-			if (value & (1<<j))
-				ack_APIC_irq();
+	do {
+		queued = 0;
+		for (i = APIC_ISR_NR - 1; i >= 0; i--)
+			queued |= apic_read(APIC_IRR + i*0x10);
+
+		for (i = APIC_ISR_NR - 1; i >= 0; i--) {
+			value = apic_read(APIC_ISR + i*0x10);
+			for (j = 31; j >= 0; j--) {
+				if (value & (1<<j)) {
+					ack_APIC_irq();
+					acked++;
+				}
+			}
 		}
-	}
+		if (acked > 256) {
+			printk(KERN_ERR "LAPIC pending interrupts after %d EOI\n",
+			       acked);
+			break;
+		}
+		if (cpu_has_tsc) {
+			rdtscll(ntsc);
+			max_loops = (cpu_khz << 10) - (ntsc - tsc);
+		} else
+			max_loops--;
+	} while (queued && max_loops > 0);
+	WARN_ON(max_loops <= 0);
 
 	/*
 	 * Now that we are all set up, enable the APIC
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 03ba1b895f5e..8593582d8022 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -129,25 +129,6 @@ int es7000_plat;
  * GSI override for ES7000 platforms.
  */
 
-static unsigned int base;
-
-static int
-es7000_rename_gsi(int ioapic, int gsi)
-{
-	if (es7000_plat == ES7000_ZORRO)
-		return gsi;
-
-	if (!base) {
-		int i;
-		for (i = 0; i < nr_ioapics; i++)
-			base += nr_ioapic_registers[i];
-	}
-
-	if (!ioapic && (gsi < 16))
-		gsi += base;
-
-	return gsi;
-}
 
 static int __cpuinit wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip)
 {
@@ -190,7 +171,6 @@ static void setup_unisys(void)
 		es7000_plat = ES7000_ZORRO;
 	else
 		es7000_plat = ES7000_CLASSIC;
-	ioapic_renumber_irq = es7000_rename_gsi;
 }
 
 /*
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
new file mode 100644
index 000000000000..cefd6942f0e9
--- /dev/null
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -0,0 +1,107 @@
1/*
2 * HW NMI watchdog support
3 *
4 * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc.
5 *
6 * Arch specific calls to support NMI watchdog
7 *
8 * Bits copied from original nmi.c file
9 *
10 */
11#include <asm/apic.h>
12
13#include <linux/cpumask.h>
14#include <linux/kdebug.h>
15#include <linux/notifier.h>
16#include <linux/kprobes.h>
17#include <linux/nmi.h>
18#include <linux/module.h>
19
20/* For reliability, we're prepared to waste bits here. */
21static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
22
23u64 hw_nmi_get_sample_period(void)
24{
25 return (u64)(cpu_khz) * 1000 * 60;
26}
27
28#ifdef ARCH_HAS_NMI_WATCHDOG
29void arch_trigger_all_cpu_backtrace(void)
30{
31 int i;
32
33 cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask);
34
35 printk(KERN_INFO "sending NMI to all CPUs:\n");
36 apic->send_IPI_all(NMI_VECTOR);
37
38 /* Wait for up to 10 seconds for all CPUs to do the backtrace */
39 for (i = 0; i < 10 * 1000; i++) {
40 if (cpumask_empty(to_cpumask(backtrace_mask)))
41 break;
42 mdelay(1);
43 }
44}
45
46static int __kprobes
47arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self,
48 unsigned long cmd, void *__args)
49{
50 struct die_args *args = __args;
51 struct pt_regs *regs;
52 int cpu = smp_processor_id();
53
54 switch (cmd) {
55 case DIE_NMI:
56 case DIE_NMI_IPI:
57 break;
58
59 default:
60 return NOTIFY_DONE;
61 }
62
63 regs = args->regs;
64
65 if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
66 static arch_spinlock_t lock = __ARCH_SPIN_LOCK_UNLOCKED;
67
68 arch_spin_lock(&lock);
69 printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
70 show_regs(regs);
71 dump_stack();
72 arch_spin_unlock(&lock);
73 cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
74 return NOTIFY_STOP;
75 }
76
77 return NOTIFY_DONE;
78}
79
80static __read_mostly struct notifier_block backtrace_notifier = {
81 .notifier_call = arch_trigger_all_cpu_backtrace_handler,
82 .next = NULL,
83 .priority = 1
84};
85
86static int __init register_trigger_all_cpu_backtrace(void)
87{
88 register_die_notifier(&backtrace_notifier);
89 return 0;
90}
91early_initcall(register_trigger_all_cpu_backtrace);
92#endif
93
94/* STUB calls to mimic old nmi_watchdog behaviour */
95#if defined(CONFIG_X86_LOCAL_APIC)
96unsigned int nmi_watchdog = NMI_NONE;
97EXPORT_SYMBOL(nmi_watchdog);
98void acpi_nmi_enable(void) { return; }
99void acpi_nmi_disable(void) { return; }
100#endif
101atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */
102EXPORT_SYMBOL(nmi_active);
103int unknown_nmi_panic;
104void cpu_nmi_set_wd_enabled(void) { return; }
105void stop_apic_nmi_watchdog(void *unused) { return; }
106void setup_apic_nmi_watchdog(void *unused) { return; }
107int __init check_nmi_watchdog(void) { return 0; }
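The new hw_nmi.c wires the "dump all CPU backtraces" path into the die-notifier chain: the trigger copies cpu_online_mask into backtrace_mask, IPIs every online CPU with NMI_VECTOR, and polls for up to ten seconds while each CPU's NMI handler prints its registers and stack under a spinlock and clears its own mask bit. hw_nmi_get_sample_period() returns cpu_khz * 1000 * 60, i.e. the cycle count of roughly 60 seconds (about 1.2e11 for a 2 GHz part). A hedged sketch of a caller; the trigger function comes from this file, the calling context is illustrative only and assumes the declaration is reachable through <linux/nmi.h>:

#include <linux/nmi.h>

/* Hypothetical debugging helper, not part of the patch. */
static void dump_all_cpu_stacks(void)
{
        /*
         * Sends NMI_VECTOR to every online CPU; each one prints its
         * registers + stack from the NMI die-notifier and clears its
         * bit in backtrace_mask. Returns after all CPUs respond or
         * after the ~10 s timeout in arch_trigger_all_cpu_backtrace().
         */
        arch_trigger_all_cpu_backtrace();
}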
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index eb2789c3f721..4dc0084ec1b1 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -89,6 +89,9 @@ int nr_ioapics;
89/* IO APIC gsi routing info */ 89/* IO APIC gsi routing info */
90struct mp_ioapic_gsi mp_gsi_routing[MAX_IO_APICS]; 90struct mp_ioapic_gsi mp_gsi_routing[MAX_IO_APICS];
91 91
92/* The one past the highest gsi number used */
93u32 gsi_top;
94
92/* MP IRQ source entries */ 95/* MP IRQ source entries */
93struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES]; 96struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES];
94 97
@@ -1013,10 +1016,9 @@ static inline int irq_trigger(int idx)
1013 return MPBIOS_trigger(idx); 1016 return MPBIOS_trigger(idx);
1014} 1017}
1015 1018
1016int (*ioapic_renumber_irq)(int ioapic, int irq);
1017static int pin_2_irq(int idx, int apic, int pin) 1019static int pin_2_irq(int idx, int apic, int pin)
1018{ 1020{
1019 int irq, i; 1021 int irq;
1020 int bus = mp_irqs[idx].srcbus; 1022 int bus = mp_irqs[idx].srcbus;
1021 1023
1022 /* 1024 /*
@@ -1028,18 +1030,12 @@ static int pin_2_irq(int idx, int apic, int pin)
1028 if (test_bit(bus, mp_bus_not_pci)) { 1030 if (test_bit(bus, mp_bus_not_pci)) {
1029 irq = mp_irqs[idx].srcbusirq; 1031 irq = mp_irqs[idx].srcbusirq;
1030 } else { 1032 } else {
1031 /* 1033 u32 gsi = mp_gsi_routing[apic].gsi_base + pin;
1032 * PCI IRQs are mapped in order 1034
1033 */ 1035 if (gsi >= NR_IRQS_LEGACY)
1034 i = irq = 0; 1036 irq = gsi;
1035 while (i < apic) 1037 else
1036 irq += nr_ioapic_registers[i++]; 1038 irq = gsi_top + gsi;
1037 irq += pin;
1038 /*
1039 * For MPS mode, so far only needed by ES7000 platform
1040 */
1041 if (ioapic_renumber_irq)
1042 irq = ioapic_renumber_irq(apic, irq);
1043 } 1039 }
1044 1040
1045#ifdef CONFIG_X86_32 1041#ifdef CONFIG_X86_32
@@ -1950,20 +1946,8 @@ static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
1950 1946
1951void __init enable_IO_APIC(void) 1947void __init enable_IO_APIC(void)
1952{ 1948{
1953 union IO_APIC_reg_01 reg_01;
1954 int i8259_apic, i8259_pin; 1949 int i8259_apic, i8259_pin;
1955 int apic; 1950 int apic;
1956 unsigned long flags;
1957
1958 /*
1959 * The number of IO-APIC IRQ registers (== #pins):
1960 */
1961 for (apic = 0; apic < nr_ioapics; apic++) {
1962 raw_spin_lock_irqsave(&ioapic_lock, flags);
1963 reg_01.raw = io_apic_read(apic, 1);
1964 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
1965 nr_ioapic_registers[apic] = reg_01.bits.entries+1;
1966 }
1967 1951
1968 if (!legacy_pic->nr_legacy_irqs) 1952 if (!legacy_pic->nr_legacy_irqs)
1969 return; 1953 return;
@@ -3413,7 +3397,7 @@ static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
3413 3397
3414 cfg = desc->chip_data; 3398 cfg = desc->chip_data;
3415 3399
3416 read_msi_msg_desc(desc, &msg); 3400 get_cached_msi_msg_desc(desc, &msg);
3417 3401
3418 msg.data &= ~MSI_DATA_VECTOR_MASK; 3402 msg.data &= ~MSI_DATA_VECTOR_MASK;
3419 msg.data |= MSI_DATA_VECTOR(cfg->vector); 3403 msg.data |= MSI_DATA_VECTOR(cfg->vector);
@@ -3858,27 +3842,20 @@ int __init io_apic_get_redir_entries (int ioapic)
3858 reg_01.raw = io_apic_read(ioapic, 1); 3842 reg_01.raw = io_apic_read(ioapic, 1);
3859 raw_spin_unlock_irqrestore(&ioapic_lock, flags); 3843 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
3860 3844
3861 return reg_01.bits.entries; 3845 /* The register returns the maximum index redir index
3846 * supported, which is one less than the total number of redir
3847 * entries.
3848 */
3849 return reg_01.bits.entries + 1;
3862} 3850}
3863 3851
3864void __init probe_nr_irqs_gsi(void) 3852void __init probe_nr_irqs_gsi(void)
3865{ 3853{
3866 int nr = 0; 3854 int nr;
3867 3855
3868 nr = acpi_probe_gsi(); 3856 nr = gsi_top + NR_IRQS_LEGACY;
3869 if (nr > nr_irqs_gsi) { 3857 if (nr > nr_irqs_gsi)
3870 nr_irqs_gsi = nr; 3858 nr_irqs_gsi = nr;
3871 } else {
3872 /* for acpi=off or acpi is not compiled in */
3873 int idx;
3874
3875 nr = 0;
3876 for (idx = 0; idx < nr_ioapics; idx++)
3877 nr += io_apic_get_redir_entries(idx) + 1;
3878
3879 if (nr > nr_irqs_gsi)
3880 nr_irqs_gsi = nr;
3881 }
3882 3859
3883 printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi); 3860 printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi);
3884} 3861}
@@ -4085,22 +4062,27 @@ int __init io_apic_get_version(int ioapic)
4085 return reg_01.bits.version; 4062 return reg_01.bits.version;
4086} 4063}
4087 4064
4088int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) 4065int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity)
4089{ 4066{
4090 int i; 4067 int ioapic, pin, idx;
4091 4068
4092 if (skip_ioapic_setup) 4069 if (skip_ioapic_setup)
4093 return -1; 4070 return -1;
4094 4071
4095 for (i = 0; i < mp_irq_entries; i++) 4072 ioapic = mp_find_ioapic(gsi);
4096 if (mp_irqs[i].irqtype == mp_INT && 4073 if (ioapic < 0)
4097 mp_irqs[i].srcbusirq == bus_irq)
4098 break;
4099 if (i >= mp_irq_entries)
4100 return -1; 4074 return -1;
4101 4075
4102 *trigger = irq_trigger(i); 4076 pin = mp_find_ioapic_pin(ioapic, gsi);
4103 *polarity = irq_polarity(i); 4077 if (pin < 0)
4078 return -1;
4079
4080 idx = find_irq_entry(ioapic, pin, mp_INT);
4081 if (idx < 0)
4082 return -1;
4083
4084 *trigger = irq_trigger(idx);
4085 *polarity = irq_polarity(idx);
4104 return 0; 4086 return 0;
4105} 4087}
4106 4088
@@ -4241,7 +4223,7 @@ void __init ioapic_insert_resources(void)
4241 } 4223 }
4242} 4224}
4243 4225
4244int mp_find_ioapic(int gsi) 4226int mp_find_ioapic(u32 gsi)
4245{ 4227{
4246 int i = 0; 4228 int i = 0;
4247 4229
@@ -4256,7 +4238,7 @@ int mp_find_ioapic(int gsi)
4256 return -1; 4238 return -1;
4257} 4239}
4258 4240
4259int mp_find_ioapic_pin(int ioapic, int gsi) 4241int mp_find_ioapic_pin(int ioapic, u32 gsi)
4260{ 4242{
4261 if (WARN_ON(ioapic == -1)) 4243 if (WARN_ON(ioapic == -1))
4262 return -1; 4244 return -1;
@@ -4284,6 +4266,7 @@ static int bad_ioapic(unsigned long address)
4284void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) 4266void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
4285{ 4267{
4286 int idx = 0; 4268 int idx = 0;
4269 int entries;
4287 4270
4288 if (bad_ioapic(address)) 4271 if (bad_ioapic(address))
4289 return; 4272 return;
@@ -4302,9 +4285,17 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
4302 * Build basic GSI lookup table to facilitate gsi->io_apic lookups 4285 * Build basic GSI lookup table to facilitate gsi->io_apic lookups
4303 * and to prevent reprogramming of IOAPIC pins (PCI GSIs). 4286 * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
4304 */ 4287 */
4288 entries = io_apic_get_redir_entries(idx);
4305 mp_gsi_routing[idx].gsi_base = gsi_base; 4289 mp_gsi_routing[idx].gsi_base = gsi_base;
4306 mp_gsi_routing[idx].gsi_end = gsi_base + 4290 mp_gsi_routing[idx].gsi_end = gsi_base + entries - 1;
4307 io_apic_get_redir_entries(idx); 4291
4292 /*
4293 * The number of IO-APIC IRQ registers (== #pins):
4294 */
4295 nr_ioapic_registers[idx] = entries;
4296
4297 if (mp_gsi_routing[idx].gsi_end >= gsi_top)
4298 gsi_top = mp_gsi_routing[idx].gsi_end + 1;
4308 4299
4309 printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, " 4300 printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
4310 "GSI %d-%d\n", idx, mp_ioapics[idx].apicid, 4301 "GSI %d-%d\n", idx, mp_ioapics[idx].apicid,
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
index 1edaf15c0b8e..a43f71cb30f8 100644
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -401,13 +401,6 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
401 int cpu = smp_processor_id(); 401 int cpu = smp_processor_id();
402 int rc = 0; 402 int rc = 0;
403 403
404 /* check for other users first */
405 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
406 == NOTIFY_STOP) {
407 rc = 1;
408 touched = 1;
409 }
410
411 sum = get_timer_irqs(cpu); 404 sum = get_timer_irqs(cpu);
412 405
413 if (__get_cpu_var(nmi_touch)) { 406 if (__get_cpu_var(nmi_touch)) {
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index c085d52dbaf2..e46f98f36e31 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -735,9 +735,6 @@ void __init uv_system_init(void)
735 uv_node_to_blade[nid] = blade; 735 uv_node_to_blade[nid] = blade;
736 uv_cpu_to_blade[cpu] = blade; 736 uv_cpu_to_blade[cpu] = blade;
737 max_pnode = max(pnode, max_pnode); 737 max_pnode = max(pnode, max_pnode);
738
739 printk(KERN_DEBUG "UV: cpu %d, apicid 0x%x, pnode %d, nid %d, lcpu %d, blade %d\n",
740 cpu, apicid, pnode, nid, lcpu, blade);
741 } 738 }
742 739
743 /* Add blade/pnode info for nodes without cpus */ 740 /* Add blade/pnode info for nodes without cpus */
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 031aa887b0eb..4c9c67bf09b7 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -140,7 +140,7 @@
140 * is now the way life works). 140 * is now the way life works).
141 * Fix thinko in suspend() (wrong return). 141 * Fix thinko in suspend() (wrong return).
142 * Notify drivers on critical suspend. 142 * Notify drivers on critical suspend.
143 * Make kapmd absorb more idle time (Pavel Machek <pavel@suse.cz> 143 * Make kapmd absorb more idle time (Pavel Machek <pavel@ucw.cz>
144 * modified by sfr). 144 * modified by sfr).
145 * Disable interrupts while we are suspended (Andy Henroid 145 * Disable interrupts while we are suspended (Andy Henroid
146 * <andy_henroid@yahoo.com> fixed by sfr). 146 * <andy_henroid@yahoo.com> fixed by sfr).
@@ -1224,7 +1224,7 @@ static void reinit_timer(void)
1224#ifdef INIT_TIMER_AFTER_SUSPEND 1224#ifdef INIT_TIMER_AFTER_SUSPEND
1225 unsigned long flags; 1225 unsigned long flags;
1226 1226
1227 spin_lock_irqsave(&i8253_lock, flags); 1227 raw_spin_lock_irqsave(&i8253_lock, flags);
1228 /* set the clock to HZ */ 1228 /* set the clock to HZ */
1229 outb_pit(0x34, PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */ 1229 outb_pit(0x34, PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */
1230 udelay(10); 1230 udelay(10);
@@ -1232,7 +1232,7 @@ static void reinit_timer(void)
1232 udelay(10); 1232 udelay(10);
1233 outb_pit(LATCH >> 8, PIT_CH0); /* MSB */ 1233 outb_pit(LATCH >> 8, PIT_CH0); /* MSB */
1234 udelay(10); 1234 udelay(10);
1235 spin_unlock_irqrestore(&i8253_lock, flags); 1235 raw_spin_unlock_irqrestore(&i8253_lock, flags);
1236#endif 1236#endif
1237} 1237}
1238 1238
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index c202b62f3671..3f0ebe429a01 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -12,11 +12,11 @@ endif
12nostackp := $(call cc-option, -fno-stack-protector) 12nostackp := $(call cc-option, -fno-stack-protector)
13CFLAGS_common.o := $(nostackp) 13CFLAGS_common.o := $(nostackp)
14 14
15obj-y := intel_cacheinfo.o addon_cpuid_features.o 15obj-y := intel_cacheinfo.o scattered.o topology.o
16obj-y += proc.o capflags.o powerflags.o common.o 16obj-y += proc.o capflags.o powerflags.o common.o
17obj-y += vmware.o hypervisor.o sched.o 17obj-y += vmware.o hypervisor.o sched.o mshyperv.o
18 18
19obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o 19obj-$(CONFIG_X86_32) += bugs.o
20obj-$(CONFIG_X86_64) += bugs_64.o 20obj-$(CONFIG_X86_64) += bugs_64.o
21 21
22obj-$(CONFIG_CPU_SUP_INTEL) += intel.o 22obj-$(CONFIG_CPU_SUP_INTEL) += intel.o
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index e485825130d2..60a57b13082d 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -466,7 +466,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
466 } 466 }
467 467
468 } 468 }
469 if (c->x86 == 0x10 || c->x86 == 0x11) 469 if (c->x86 >= 0x10)
470 set_cpu_cap(c, X86_FEATURE_REP_GOOD); 470 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
471 471
472 /* get apicid instead of initial apic id from cpuid */ 472 /* get apicid instead of initial apic id from cpuid */
@@ -529,7 +529,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
529 num_cache_leaves = 3; 529 num_cache_leaves = 3;
530 } 530 }
531 531
532 if (c->x86 >= 0xf && c->x86 <= 0x11) 532 if (c->x86 >= 0xf)
533 set_cpu_cap(c, X86_FEATURE_K8); 533 set_cpu_cap(c, X86_FEATURE_K8);
534 534
535 if (cpu_has_xmm2) { 535 if (cpu_has_xmm2) {
@@ -546,7 +546,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
546 fam10h_check_enable_mmcfg(); 546 fam10h_check_enable_mmcfg();
547 } 547 }
548 548
549 if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) { 549 if (c == &boot_cpu_data && c->x86 >= 0xf) {
550 unsigned long long tseg; 550 unsigned long long tseg;
551 551
552 /* 552 /*
@@ -609,3 +609,74 @@ static const struct cpu_dev __cpuinitconst amd_cpu_dev = {
609}; 609};
610 610
611cpu_dev_register(amd_cpu_dev); 611cpu_dev_register(amd_cpu_dev);
612
613/*
614 * AMD errata checking
615 *
616 * Errata are defined as arrays of ints using the AMD_LEGACY_ERRATUM() or
617 * AMD_OSVW_ERRATUM() macros. The latter is intended for newer errata that
618 * have an OSVW id assigned, which it takes as first argument. Both take a
619 * variable number of family-specific model-stepping ranges created by
620 * AMD_MODEL_RANGE(). Each erratum also has to be declared as extern const
621 * int[] in arch/x86/include/asm/processor.h.
622 *
623 * Example:
624 *
625 * const int amd_erratum_319[] =
626 * AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0x4, 0x2),
627 * AMD_MODEL_RANGE(0x10, 0x8, 0x0, 0x8, 0x0),
628 * AMD_MODEL_RANGE(0x10, 0x9, 0x0, 0x9, 0x0));
629 */
630
631const int amd_erratum_400[] =
632 AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0xf, 0x41, 0x2, 0xff, 0xf),
633 AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf));
634EXPORT_SYMBOL_GPL(amd_erratum_400);
635
636const int amd_erratum_383[] =
637 AMD_OSVW_ERRATUM(3, AMD_MODEL_RANGE(0x10, 0, 0, 0xff, 0xf));
638EXPORT_SYMBOL_GPL(amd_erratum_383);
639
640bool cpu_has_amd_erratum(const int *erratum)
641{
642 struct cpuinfo_x86 *cpu = &current_cpu_data;
643 int osvw_id = *erratum++;
644 u32 range;
645 u32 ms;
646
647 /*
648 * If called early enough that current_cpu_data hasn't been initialized
649 * yet, fall back to boot_cpu_data.
650 */
651 if (cpu->x86 == 0)
652 cpu = &boot_cpu_data;
653
654 if (cpu->x86_vendor != X86_VENDOR_AMD)
655 return false;
656
657 if (osvw_id >= 0 && osvw_id < 65536 &&
658 cpu_has(cpu, X86_FEATURE_OSVW)) {
659 u64 osvw_len;
660
661 rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, osvw_len);
662 if (osvw_id < osvw_len) {
663 u64 osvw_bits;
664
665 rdmsrl(MSR_AMD64_OSVW_STATUS + (osvw_id >> 6),
666 osvw_bits);
667 return osvw_bits & (1ULL << (osvw_id & 0x3f));
668 }
669 }
670
671 /* OSVW unavailable or ID unknown, match family-model-stepping range */
672 ms = (cpu->x86_model << 8) | cpu->x86_mask;
673 while ((range = *erratum++))
674 if ((cpu->x86 == AMD_MODEL_RANGE_FAMILY(range)) &&
675 (ms >= AMD_MODEL_RANGE_START(range)) &&
676 (ms <= AMD_MODEL_RANGE_END(range)))
677 return true;
678
679 return false;
680}
681
682EXPORT_SYMBOL_GPL(cpu_has_amd_erratum);
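cpu_has_amd_erratum() consults the OSVW MSRs first when the CPU advertises OSVW and the erratum has an id, and only falls back to the family/model/stepping ranges encoded by AMD_MODEL_RANGE(). A hedged usage sketch: the erratum symbol comes from the patch above, the call site and the action taken are illustrative only:

/* Hypothetical caller: apply a workaround only on affected parts. */
if (cpu_has_amd_erratum(amd_erratum_400))
        printk(KERN_INFO "amd: erratum 400 present, enabling workaround\n");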
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 01a265212395..c39576cb3018 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -86,7 +86,7 @@ static void __init check_fpu(void)
86 86
87static void __init check_hlt(void) 87static void __init check_hlt(void)
88{ 88{
89 if (paravirt_enabled()) 89 if (boot_cpu_data.x86 >= 5 || paravirt_enabled())
90 return; 90 return;
91 91
92 printk(KERN_INFO "Checking 'hlt' instruction... "); 92 printk(KERN_INFO "Checking 'hlt' instruction... ");
diff --git a/arch/x86/kernel/cpu/cmpxchg.c b/arch/x86/kernel/cpu/cmpxchg.c
deleted file mode 100644
index 2056ccf572cc..000000000000
--- a/arch/x86/kernel/cpu/cmpxchg.c
+++ /dev/null
@@ -1,72 +0,0 @@
1/*
2 * cmpxchg*() fallbacks for CPU not supporting these instructions
3 */
4
5#include <linux/kernel.h>
6#include <linux/smp.h>
7#include <linux/module.h>
8
9#ifndef CONFIG_X86_CMPXCHG
10unsigned long cmpxchg_386_u8(volatile void *ptr, u8 old, u8 new)
11{
12 u8 prev;
13 unsigned long flags;
14
15 /* Poor man's cmpxchg for 386. Unsuitable for SMP */
16 local_irq_save(flags);
17 prev = *(u8 *)ptr;
18 if (prev == old)
19 *(u8 *)ptr = new;
20 local_irq_restore(flags);
21 return prev;
22}
23EXPORT_SYMBOL(cmpxchg_386_u8);
24
25unsigned long cmpxchg_386_u16(volatile void *ptr, u16 old, u16 new)
26{
27 u16 prev;
28 unsigned long flags;
29
30 /* Poor man's cmpxchg for 386. Unsuitable for SMP */
31 local_irq_save(flags);
32 prev = *(u16 *)ptr;
33 if (prev == old)
34 *(u16 *)ptr = new;
35 local_irq_restore(flags);
36 return prev;
37}
38EXPORT_SYMBOL(cmpxchg_386_u16);
39
40unsigned long cmpxchg_386_u32(volatile void *ptr, u32 old, u32 new)
41{
42 u32 prev;
43 unsigned long flags;
44
45 /* Poor man's cmpxchg for 386. Unsuitable for SMP */
46 local_irq_save(flags);
47 prev = *(u32 *)ptr;
48 if (prev == old)
49 *(u32 *)ptr = new;
50 local_irq_restore(flags);
51 return prev;
52}
53EXPORT_SYMBOL(cmpxchg_386_u32);
54#endif
55
56#ifndef CONFIG_X86_CMPXCHG64
57unsigned long long cmpxchg_486_u64(volatile void *ptr, u64 old, u64 new)
58{
59 u64 prev;
60 unsigned long flags;
61
62 /* Poor man's cmpxchg8b for 386 and 486. Unsuitable for SMP */
63 local_irq_save(flags);
64 prev = *(u64 *)ptr;
65 if (prev == old)
66 *(u64 *)ptr = new;
67 local_irq_restore(flags);
68 return prev;
69}
70EXPORT_SYMBOL(cmpxchg_486_u64);
71#endif
72
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 4868e4a951ee..490dac63c2d2 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -140,10 +140,18 @@ EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
140static int __init x86_xsave_setup(char *s) 140static int __init x86_xsave_setup(char *s)
141{ 141{
142 setup_clear_cpu_cap(X86_FEATURE_XSAVE); 142 setup_clear_cpu_cap(X86_FEATURE_XSAVE);
143 setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
143 return 1; 144 return 1;
144} 145}
145__setup("noxsave", x86_xsave_setup); 146__setup("noxsave", x86_xsave_setup);
146 147
148static int __init x86_xsaveopt_setup(char *s)
149{
150 setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
151 return 1;
152}
153__setup("noxsaveopt", x86_xsaveopt_setup);
154
147#ifdef CONFIG_X86_32 155#ifdef CONFIG_X86_32
148static int cachesize_override __cpuinitdata = -1; 156static int cachesize_override __cpuinitdata = -1;
149static int disable_x86_serial_nr __cpuinitdata = 1; 157static int disable_x86_serial_nr __cpuinitdata = 1;
@@ -551,6 +559,16 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
551 c->x86_capability[4] = excap; 559 c->x86_capability[4] = excap;
552 } 560 }
553 561
562 /* Additional Intel-defined flags: level 0x00000007 */
563 if (c->cpuid_level >= 0x00000007) {
564 u32 eax, ebx, ecx, edx;
565
566 cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx);
567
568 if (eax > 0)
569 c->x86_capability[9] = ebx;
570 }
571
554 /* AMD-defined flags: level 0x80000001 */ 572 /* AMD-defined flags: level 0x80000001 */
555 xlvl = cpuid_eax(0x80000000); 573 xlvl = cpuid_eax(0x80000000);
556 c->extended_cpuid_level = xlvl; 574 c->extended_cpuid_level = xlvl;
@@ -576,6 +594,7 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
576 if (c->extended_cpuid_level >= 0x80000007) 594 if (c->extended_cpuid_level >= 0x80000007)
577 c->x86_power = cpuid_edx(0x80000007); 595 c->x86_power = cpuid_edx(0x80000007);
578 596
597 init_scattered_cpuid_features(c);
579} 598}
580 599
581static void __cpuinit identify_cpu_without_cpuid(struct cpuinfo_x86 *c) 600static void __cpuinit identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
@@ -731,7 +750,6 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
731 750
732 get_model_name(c); /* Default name */ 751 get_model_name(c); /* Default name */
733 752
734 init_scattered_cpuid_features(c);
735 detect_nopl(c); 753 detect_nopl(c);
736} 754}
737 755
@@ -1084,6 +1102,20 @@ static void clear_all_debug_regs(void)
1084 } 1102 }
1085} 1103}
1086 1104
1105#ifdef CONFIG_KGDB
1106/*
1107 * Restore debug regs if using kgdbwait and you have a kernel debugger
1108 * connection established.
1109 */
1110static void dbg_restore_debug_regs(void)
1111{
1112 if (unlikely(kgdb_connected && arch_kgdb_ops.correct_hw_break))
1113 arch_kgdb_ops.correct_hw_break();
1114}
1115#else /* ! CONFIG_KGDB */
1116#define dbg_restore_debug_regs()
1117#endif /* ! CONFIG_KGDB */
1118
1087/* 1119/*
1088 * cpu_init() initializes state that is per-CPU. Some data is already 1120 * cpu_init() initializes state that is per-CPU. Some data is already
1089 * initialized (naturally) in the bootstrap process, such as the GDT 1121 * initialized (naturally) in the bootstrap process, such as the GDT
@@ -1107,9 +1139,9 @@ void __cpuinit cpu_init(void)
1107 oist = &per_cpu(orig_ist, cpu); 1139 oist = &per_cpu(orig_ist, cpu);
1108 1140
1109#ifdef CONFIG_NUMA 1141#ifdef CONFIG_NUMA
1110 if (cpu != 0 && percpu_read(node_number) == 0 && 1142 if (cpu != 0 && percpu_read(numa_node) == 0 &&
1111 cpu_to_node(cpu) != NUMA_NO_NODE) 1143 early_cpu_to_node(cpu) != NUMA_NO_NODE)
1112 percpu_write(node_number, cpu_to_node(cpu)); 1144 set_numa_node(early_cpu_to_node(cpu));
1113#endif 1145#endif
1114 1146
1115 me = current; 1147 me = current;
@@ -1174,20 +1206,11 @@ void __cpuinit cpu_init(void)
1174 load_TR_desc(); 1206 load_TR_desc();
1175 load_LDT(&init_mm.context); 1207 load_LDT(&init_mm.context);
1176 1208
1177#ifdef CONFIG_KGDB 1209 clear_all_debug_regs();
1178 /* 1210 dbg_restore_debug_regs();
1179 * If the kgdb is connected no debug regs should be altered. This
1180 * is only applicable when KGDB and a KGDB I/O module are built
1181 * into the kernel and you are using early debugging with
1182 * kgdbwait. KGDB will control the kernel HW breakpoint registers.
1183 */
1184 if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
1185 arch_kgdb_ops.correct_hw_break();
1186 else
1187#endif
1188 clear_all_debug_regs();
1189 1211
1190 fpu_init(); 1212 fpu_init();
1213 xsave_init();
1191 1214
1192 raw_local_save_flags(kernel_eflags); 1215 raw_local_save_flags(kernel_eflags);
1193 1216
@@ -1239,23 +1262,16 @@ void __cpuinit cpu_init(void)
1239#endif 1262#endif
1240 1263
1241 clear_all_debug_regs(); 1264 clear_all_debug_regs();
1265 dbg_restore_debug_regs();
1242 1266
1243 /* 1267 /*
1244 * Force FPU initialization: 1268 * Force FPU initialization:
1245 */ 1269 */
1246 if (cpu_has_xsave) 1270 current_thread_info()->status = 0;
1247 current_thread_info()->status = TS_XSAVE;
1248 else
1249 current_thread_info()->status = 0;
1250 clear_used_math(); 1271 clear_used_math();
1251 mxcsr_feature_mask_init(); 1272 mxcsr_feature_mask_init();
1252 1273
1253 /* 1274 fpu_init();
1254 * Boot processor to setup the FP and extended state context info.
1255 */
1256 if (smp_processor_id() == boot_cpu_id)
1257 init_thread_xstate();
1258
1259 xsave_init(); 1275 xsave_init();
1260} 1276}
1261#endif 1277#endif
diff --git a/arch/x86/kernel/cpu/cpufreq/Makefile b/arch/x86/kernel/cpu/cpufreq/Makefile
index 1840c0a5170b..bd54bf67e6fb 100644
--- a/arch/x86/kernel/cpu/cpufreq/Makefile
+++ b/arch/x86/kernel/cpu/cpufreq/Makefile
@@ -2,8 +2,8 @@
2# K8 systems. ACPI is preferred to all other hardware-specific drivers. 2# K8 systems. ACPI is preferred to all other hardware-specific drivers.
3# speedstep-* is preferred over p4-clockmod. 3# speedstep-* is preferred over p4-clockmod.
4 4
5obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o 5obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o mperf.o
6obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o 6obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o mperf.o
7obj-$(CONFIG_X86_PCC_CPUFREQ) += pcc-cpufreq.o 7obj-$(CONFIG_X86_PCC_CPUFREQ) += pcc-cpufreq.o
8obj-$(CONFIG_X86_POWERNOW_K6) += powernow-k6.o 8obj-$(CONFIG_X86_POWERNOW_K6) += powernow-k6.o
9obj-$(CONFIG_X86_POWERNOW_K7) += powernow-k7.o 9obj-$(CONFIG_X86_POWERNOW_K7) += powernow-k7.o
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 459168083b77..246cd3afbb5f 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -34,7 +34,6 @@
34#include <linux/compiler.h> 34#include <linux/compiler.h>
35#include <linux/dmi.h> 35#include <linux/dmi.h>
36#include <linux/slab.h> 36#include <linux/slab.h>
37#include <trace/events/power.h>
38 37
39#include <linux/acpi.h> 38#include <linux/acpi.h>
40#include <linux/io.h> 39#include <linux/io.h>
@@ -46,6 +45,7 @@
46#include <asm/msr.h> 45#include <asm/msr.h>
47#include <asm/processor.h> 46#include <asm/processor.h>
48#include <asm/cpufeature.h> 47#include <asm/cpufeature.h>
48#include "mperf.h"
49 49
50#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ 50#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
51 "acpi-cpufreq", msg) 51 "acpi-cpufreq", msg)
@@ -71,8 +71,6 @@ struct acpi_cpufreq_data {
71 71
72static DEFINE_PER_CPU(struct acpi_cpufreq_data *, acfreq_data); 72static DEFINE_PER_CPU(struct acpi_cpufreq_data *, acfreq_data);
73 73
74static DEFINE_PER_CPU(struct aperfmperf, acfreq_old_perf);
75
76/* acpi_perf_data is a pointer to percpu data. */ 74/* acpi_perf_data is a pointer to percpu data. */
77static struct acpi_processor_performance *acpi_perf_data; 75static struct acpi_processor_performance *acpi_perf_data;
78 76
@@ -240,45 +238,6 @@ static u32 get_cur_val(const struct cpumask *mask)
240 return cmd.val; 238 return cmd.val;
241} 239}
242 240
243/* Called via smp_call_function_single(), on the target CPU */
244static void read_measured_perf_ctrs(void *_cur)
245{
246 struct aperfmperf *am = _cur;
247
248 get_aperfmperf(am);
249}
250
251/*
252 * Return the measured active (C0) frequency on this CPU since last call
253 * to this function.
254 * Input: cpu number
255 * Return: Average CPU frequency in terms of max frequency (zero on error)
256 *
257 * We use IA32_MPERF and IA32_APERF MSRs to get the measured performance
258 * over a period of time, while CPU is in C0 state.
259 * IA32_MPERF counts at the rate of max advertised frequency
260 * IA32_APERF counts at the rate of actual CPU frequency
261 * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and
262 * no meaning should be associated with absolute values of these MSRs.
263 */
264static unsigned int get_measured_perf(struct cpufreq_policy *policy,
265 unsigned int cpu)
266{
267 struct aperfmperf perf;
268 unsigned long ratio;
269 unsigned int retval;
270
271 if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1))
272 return 0;
273
274 ratio = calc_aperfmperf_ratio(&per_cpu(acfreq_old_perf, cpu), &perf);
275 per_cpu(acfreq_old_perf, cpu) = perf;
276
277 retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT;
278
279 return retval;
280}
281
282static unsigned int get_cur_freq_on_cpu(unsigned int cpu) 241static unsigned int get_cur_freq_on_cpu(unsigned int cpu)
283{ 242{
284 struct acpi_cpufreq_data *data = per_cpu(acfreq_data, cpu); 243 struct acpi_cpufreq_data *data = per_cpu(acfreq_data, cpu);
@@ -364,8 +323,6 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
364 } 323 }
365 } 324 }
366 325
367 trace_power_frequency(POWER_PSTATE, data->freq_table[next_state].frequency);
368
369 switch (data->cpu_feature) { 326 switch (data->cpu_feature) {
370 case SYSTEM_INTEL_MSR_CAPABLE: 327 case SYSTEM_INTEL_MSR_CAPABLE:
371 cmd.type = SYSTEM_INTEL_MSR_CAPABLE; 328 cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
@@ -391,7 +348,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
391 348
392 freqs.old = perf->states[perf->state].core_frequency * 1000; 349 freqs.old = perf->states[perf->state].core_frequency * 1000;
393 freqs.new = data->freq_table[next_state].frequency; 350 freqs.new = data->freq_table[next_state].frequency;
394 for_each_cpu(i, cmd.mask) { 351 for_each_cpu(i, policy->cpus) {
395 freqs.cpu = i; 352 freqs.cpu = i;
396 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 353 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
397 } 354 }
@@ -407,7 +364,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
407 } 364 }
408 } 365 }
409 366
410 for_each_cpu(i, cmd.mask) { 367 for_each_cpu(i, policy->cpus) {
411 freqs.cpu = i; 368 freqs.cpu = i;
412 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 369 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
413 } 370 }
@@ -702,7 +659,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
702 659
703 /* Check for APERF/MPERF support in hardware */ 660 /* Check for APERF/MPERF support in hardware */
704 if (cpu_has(c, X86_FEATURE_APERFMPERF)) 661 if (cpu_has(c, X86_FEATURE_APERFMPERF))
705 acpi_cpufreq_driver.getavg = get_measured_perf; 662 acpi_cpufreq_driver.getavg = cpufreq_get_measured_perf;
706 663
707 dprintk("CPU%u - ACPI performance management activated.\n", cpu); 664 dprintk("CPU%u - ACPI performance management activated.\n", cpu);
708 for (i = 0; i < perf->state_count; i++) 665 for (i = 0; i < perf->state_count; i++)
diff --git a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
index 16e3483be9e3..32974cf84232 100644
--- a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
+++ b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
@@ -169,12 +169,9 @@ static int gx_freq_mult[16] = {
169 * Low Level chipset interface * 169 * Low Level chipset interface *
170 ****************************************************************/ 170 ****************************************************************/
171static struct pci_device_id gx_chipset_tbl[] __initdata = { 171static struct pci_device_id gx_chipset_tbl[] __initdata = {
172 { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, 172 { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY), },
173 PCI_ANY_ID, PCI_ANY_ID }, 173 { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5520), },
174 { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5520, 174 { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5510), },
175 PCI_ANY_ID, PCI_ANY_ID },
176 { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5510,
177 PCI_ANY_ID, PCI_ANY_ID },
178 { 0, }, 175 { 0, },
179}; 176};
180 177
@@ -199,7 +196,7 @@ static __init struct pci_dev *gx_detect_chipset(void)
199 } 196 }
200 197
201 /* detect which companion chip is used */ 198 /* detect which companion chip is used */
202 while ((gx_pci = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, gx_pci)) != NULL) { 199 for_each_pci_dev(gx_pci) {
203 if ((pci_match_id(gx_chipset_tbl, gx_pci)) != NULL) 200 if ((pci_match_id(gx_chipset_tbl, gx_pci)) != NULL)
204 return gx_pci; 201 return gx_pci;
205 } 202 }
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c
index 7e7eea4f8261..03162dac6271 100644
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.c
+++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c
@@ -426,7 +426,7 @@ static int guess_fsb(int mult)
426} 426}
427 427
428 428
429static int __init longhaul_get_ranges(void) 429static int __cpuinit longhaul_get_ranges(void)
430{ 430{
431 unsigned int i, j, k = 0; 431 unsigned int i, j, k = 0;
432 unsigned int ratio; 432 unsigned int ratio;
@@ -530,7 +530,7 @@ static int __init longhaul_get_ranges(void)
530} 530}
531 531
532 532
533static void __init longhaul_setup_voltagescaling(void) 533static void __cpuinit longhaul_setup_voltagescaling(void)
534{ 534{
535 union msr_longhaul longhaul; 535 union msr_longhaul longhaul;
536 struct mV_pos minvid, maxvid, vid; 536 struct mV_pos minvid, maxvid, vid;
@@ -784,7 +784,7 @@ static int longhaul_setup_southbridge(void)
784 return 0; 784 return 0;
785} 785}
786 786
787static int __init longhaul_cpu_init(struct cpufreq_policy *policy) 787static int __cpuinit longhaul_cpu_init(struct cpufreq_policy *policy)
788{ 788{
789 struct cpuinfo_x86 *c = &cpu_data(0); 789 struct cpuinfo_x86 *c = &cpu_data(0);
790 char *cpuname = NULL; 790 char *cpuname = NULL;
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.h b/arch/x86/kernel/cpu/cpufreq/longhaul.h
index e2360a469f79..cbf48fbca881 100644
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.h
+++ b/arch/x86/kernel/cpu/cpufreq/longhaul.h
@@ -56,7 +56,7 @@ union msr_longhaul {
56/* 56/*
57 * VIA C3 Samuel 1 & Samuel 2 (stepping 0) 57 * VIA C3 Samuel 1 & Samuel 2 (stepping 0)
58 */ 58 */
59static const int __initdata samuel1_mults[16] = { 59static const int __cpuinitdata samuel1_mults[16] = {
60 -1, /* 0000 -> RESERVED */ 60 -1, /* 0000 -> RESERVED */
61 30, /* 0001 -> 3.0x */ 61 30, /* 0001 -> 3.0x */
62 40, /* 0010 -> 4.0x */ 62 40, /* 0010 -> 4.0x */
@@ -75,7 +75,7 @@ static const int __initdata samuel1_mults[16] = {
75 -1, /* 1111 -> RESERVED */ 75 -1, /* 1111 -> RESERVED */
76}; 76};
77 77
78static const int __initdata samuel1_eblcr[16] = { 78static const int __cpuinitdata samuel1_eblcr[16] = {
79 50, /* 0000 -> RESERVED */ 79 50, /* 0000 -> RESERVED */
80 30, /* 0001 -> 3.0x */ 80 30, /* 0001 -> 3.0x */
81 40, /* 0010 -> 4.0x */ 81 40, /* 0010 -> 4.0x */
@@ -97,7 +97,7 @@ static const int __initdata samuel1_eblcr[16] = {
97/* 97/*
98 * VIA C3 Samuel2 Stepping 1->15 98 * VIA C3 Samuel2 Stepping 1->15
99 */ 99 */
100static const int __initdata samuel2_eblcr[16] = { 100static const int __cpuinitdata samuel2_eblcr[16] = {
101 50, /* 0000 -> 5.0x */ 101 50, /* 0000 -> 5.0x */
102 30, /* 0001 -> 3.0x */ 102 30, /* 0001 -> 3.0x */
103 40, /* 0010 -> 4.0x */ 103 40, /* 0010 -> 4.0x */
@@ -119,7 +119,7 @@ static const int __initdata samuel2_eblcr[16] = {
119/* 119/*
120 * VIA C3 Ezra 120 * VIA C3 Ezra
121 */ 121 */
122static const int __initdata ezra_mults[16] = { 122static const int __cpuinitdata ezra_mults[16] = {
123 100, /* 0000 -> 10.0x */ 123 100, /* 0000 -> 10.0x */
124 30, /* 0001 -> 3.0x */ 124 30, /* 0001 -> 3.0x */
125 40, /* 0010 -> 4.0x */ 125 40, /* 0010 -> 4.0x */
@@ -138,7 +138,7 @@ static const int __initdata ezra_mults[16] = {
138 120, /* 1111 -> 12.0x */ 138 120, /* 1111 -> 12.0x */
139}; 139};
140 140
141static const int __initdata ezra_eblcr[16] = { 141static const int __cpuinitdata ezra_eblcr[16] = {
142 50, /* 0000 -> 5.0x */ 142 50, /* 0000 -> 5.0x */
143 30, /* 0001 -> 3.0x */ 143 30, /* 0001 -> 3.0x */
144 40, /* 0010 -> 4.0x */ 144 40, /* 0010 -> 4.0x */
@@ -160,7 +160,7 @@ static const int __initdata ezra_eblcr[16] = {
160/* 160/*
161 * VIA C3 (Ezra-T) [C5M]. 161 * VIA C3 (Ezra-T) [C5M].
162 */ 162 */
163static const int __initdata ezrat_mults[32] = { 163static const int __cpuinitdata ezrat_mults[32] = {
164 100, /* 0000 -> 10.0x */ 164 100, /* 0000 -> 10.0x */
165 30, /* 0001 -> 3.0x */ 165 30, /* 0001 -> 3.0x */
166 40, /* 0010 -> 4.0x */ 166 40, /* 0010 -> 4.0x */
@@ -196,7 +196,7 @@ static const int __initdata ezrat_mults[32] = {
196 -1, /* 1111 -> RESERVED (12.0x) */ 196 -1, /* 1111 -> RESERVED (12.0x) */
197}; 197};
198 198
199static const int __initdata ezrat_eblcr[32] = { 199static const int __cpuinitdata ezrat_eblcr[32] = {
200 50, /* 0000 -> 5.0x */ 200 50, /* 0000 -> 5.0x */
201 30, /* 0001 -> 3.0x */ 201 30, /* 0001 -> 3.0x */
202 40, /* 0010 -> 4.0x */ 202 40, /* 0010 -> 4.0x */
@@ -235,7 +235,7 @@ static const int __initdata ezrat_eblcr[32] = {
235/* 235/*
236 * VIA C3 Nehemiah */ 236 * VIA C3 Nehemiah */
237 237
238static const int __initdata nehemiah_mults[32] = { 238static const int __cpuinitdata nehemiah_mults[32] = {
239 100, /* 0000 -> 10.0x */ 239 100, /* 0000 -> 10.0x */
240 -1, /* 0001 -> 16.0x */ 240 -1, /* 0001 -> 16.0x */
241 40, /* 0010 -> 4.0x */ 241 40, /* 0010 -> 4.0x */
@@ -270,7 +270,7 @@ static const int __initdata nehemiah_mults[32] = {
270 -1, /* 1111 -> 12.0x */ 270 -1, /* 1111 -> 12.0x */
271}; 271};
272 272
273static const int __initdata nehemiah_eblcr[32] = { 273static const int __cpuinitdata nehemiah_eblcr[32] = {
274 50, /* 0000 -> 5.0x */ 274 50, /* 0000 -> 5.0x */
275 160, /* 0001 -> 16.0x */ 275 160, /* 0001 -> 16.0x */
276 40, /* 0010 -> 4.0x */ 276 40, /* 0010 -> 4.0x */
@@ -315,7 +315,7 @@ struct mV_pos {
315 unsigned short pos; 315 unsigned short pos;
316}; 316};
317 317
318static const struct mV_pos __initdata vrm85_mV[32] = { 318static const struct mV_pos __cpuinitdata vrm85_mV[32] = {
319 {1250, 8}, {1200, 6}, {1150, 4}, {1100, 2}, 319 {1250, 8}, {1200, 6}, {1150, 4}, {1100, 2},
320 {1050, 0}, {1800, 30}, {1750, 28}, {1700, 26}, 320 {1050, 0}, {1800, 30}, {1750, 28}, {1700, 26},
321 {1650, 24}, {1600, 22}, {1550, 20}, {1500, 18}, 321 {1650, 24}, {1600, 22}, {1550, 20}, {1500, 18},
@@ -326,14 +326,14 @@ static const struct mV_pos __initdata vrm85_mV[32] = {
326 {1475, 17}, {1425, 15}, {1375, 13}, {1325, 11} 326 {1475, 17}, {1425, 15}, {1375, 13}, {1325, 11}
327}; 327};
328 328
329static const unsigned char __initdata mV_vrm85[32] = { 329static const unsigned char __cpuinitdata mV_vrm85[32] = {
330 0x04, 0x14, 0x03, 0x13, 0x02, 0x12, 0x01, 0x11, 330 0x04, 0x14, 0x03, 0x13, 0x02, 0x12, 0x01, 0x11,
331 0x00, 0x10, 0x0f, 0x1f, 0x0e, 0x1e, 0x0d, 0x1d, 331 0x00, 0x10, 0x0f, 0x1f, 0x0e, 0x1e, 0x0d, 0x1d,
332 0x0c, 0x1c, 0x0b, 0x1b, 0x0a, 0x1a, 0x09, 0x19, 332 0x0c, 0x1c, 0x0b, 0x1b, 0x0a, 0x1a, 0x09, 0x19,
333 0x08, 0x18, 0x07, 0x17, 0x06, 0x16, 0x05, 0x15 333 0x08, 0x18, 0x07, 0x17, 0x06, 0x16, 0x05, 0x15
334}; 334};
335 335
336static const struct mV_pos __initdata mobilevrm_mV[32] = { 336static const struct mV_pos __cpuinitdata mobilevrm_mV[32] = {
337 {1750, 31}, {1700, 30}, {1650, 29}, {1600, 28}, 337 {1750, 31}, {1700, 30}, {1650, 29}, {1600, 28},
338 {1550, 27}, {1500, 26}, {1450, 25}, {1400, 24}, 338 {1550, 27}, {1500, 26}, {1450, 25}, {1400, 24},
339 {1350, 23}, {1300, 22}, {1250, 21}, {1200, 20}, 339 {1350, 23}, {1300, 22}, {1250, 21}, {1200, 20},
@@ -344,7 +344,7 @@ static const struct mV_pos __initdata mobilevrm_mV[32] = {
344 {675, 3}, {650, 2}, {625, 1}, {600, 0} 344 {675, 3}, {650, 2}, {625, 1}, {600, 0}
345}; 345};
346 346
347static const unsigned char __initdata mV_mobilevrm[32] = { 347static const unsigned char __cpuinitdata mV_mobilevrm[32] = {
348 0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18, 348 0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18,
349 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 349 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10,
350 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08, 350 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08,
diff --git a/arch/x86/kernel/cpu/cpufreq/longrun.c b/arch/x86/kernel/cpu/cpufreq/longrun.c
index e7b559d74c52..fc09f142d94d 100644
--- a/arch/x86/kernel/cpu/cpufreq/longrun.c
+++ b/arch/x86/kernel/cpu/cpufreq/longrun.c
@@ -165,8 +165,8 @@ static unsigned int longrun_get(unsigned int cpu)
165 * TMTA rules: 165 * TMTA rules:
166 * performance_pctg = (target_freq - low_freq)/(high_freq - low_freq) 166 * performance_pctg = (target_freq - low_freq)/(high_freq - low_freq)
167 */ 167 */
168static unsigned int __init longrun_determine_freqs(unsigned int *low_freq, 168static unsigned int __cpuinit longrun_determine_freqs(unsigned int *low_freq,
169 unsigned int *high_freq) 169 unsigned int *high_freq)
170{ 170{
171 u32 msr_lo, msr_hi; 171 u32 msr_lo, msr_hi;
172 u32 save_lo, save_hi; 172 u32 save_lo, save_hi;
@@ -258,7 +258,7 @@ static unsigned int __init longrun_determine_freqs(unsigned int *low_freq,
258} 258}
259 259
260 260
261static int __init longrun_cpu_init(struct cpufreq_policy *policy) 261static int __cpuinit longrun_cpu_init(struct cpufreq_policy *policy)
262{ 262{
263 int result = 0; 263 int result = 0;
264 264
diff --git a/arch/x86/kernel/cpu/cpufreq/mperf.c b/arch/x86/kernel/cpu/cpufreq/mperf.c
new file mode 100644
index 000000000000..911e193018ae
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/mperf.c
@@ -0,0 +1,51 @@
1#include <linux/kernel.h>
2#include <linux/smp.h>
3#include <linux/module.h>
4#include <linux/init.h>
5#include <linux/cpufreq.h>
6#include <linux/slab.h>
7
8#include "mperf.h"
9
10static DEFINE_PER_CPU(struct aperfmperf, acfreq_old_perf);
11
12/* Called via smp_call_function_single(), on the target CPU */
13static void read_measured_perf_ctrs(void *_cur)
14{
15 struct aperfmperf *am = _cur;
16
17 get_aperfmperf(am);
18}
19
20/*
21 * Return the measured active (C0) frequency on this CPU since last call
22 * to this function.
23 * Input: cpu number
24 * Return: Average CPU frequency in terms of max frequency (zero on error)
25 *
26 * We use IA32_MPERF and IA32_APERF MSRs to get the measured performance
27 * over a period of time, while CPU is in C0 state.
28 * IA32_MPERF counts at the rate of max advertised frequency
29 * IA32_APERF counts at the rate of actual CPU frequency
30 * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and
31 * no meaning should be associated with absolute values of these MSRs.
32 */
33unsigned int cpufreq_get_measured_perf(struct cpufreq_policy *policy,
34 unsigned int cpu)
35{
36 struct aperfmperf perf;
37 unsigned long ratio;
38 unsigned int retval;
39
40 if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1))
41 return 0;
42
43 ratio = calc_aperfmperf_ratio(&per_cpu(acfreq_old_perf, cpu), &perf);
44 per_cpu(acfreq_old_perf, cpu) = perf;
45
46 retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT;
47
48 return retval;
49}
50EXPORT_SYMBOL_GPL(cpufreq_get_measured_perf);
51MODULE_LICENSE("GPL");
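The helper factored out here does the APERF/MPERF sampling the comment describes: both MSRs are snapshotted on the target CPU, the delta ratio approximates the fraction of maximum frequency actually sustained while in C0, and the result is scaled by policy->cpuinfo.max_freq. As a worked example with illustrative numbers, a delta of APERF = 1.5e9 against MPERF = 2.0e9 gives a ratio of 0.75, so a 2,666,000 kHz part reports roughly 2,000,000 kHz. A minimal sketch of the scaling step, assuming calc_aperfmperf_ratio() returns the delta ratio as a fixed-point value with APERFMPERF_SHIFT fractional bits:

unsigned long ratio = calc_aperfmperf_ratio(&per_cpu(acfreq_old_perf, cpu), &perf);
unsigned int  khz   = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT;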
diff --git a/arch/x86/kernel/cpu/cpufreq/mperf.h b/arch/x86/kernel/cpu/cpufreq/mperf.h
new file mode 100644
index 000000000000..5dbf2950dc22
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/mperf.h
@@ -0,0 +1,9 @@
1/*
2 * (c) 2010 Advanced Micro Devices, Inc.
3 * Your use of this code is subject to the terms and conditions of the
4 * GNU general public license version 2. See "COPYING" or
5 * http://www.gnu.org/licenses/gpl.html
6 */
7
8unsigned int cpufreq_get_measured_perf(struct cpufreq_policy *policy,
9 unsigned int cpu);
diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
index 7b8a8ba67b07..bd1cac747f67 100644
--- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
+++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
@@ -178,13 +178,8 @@ static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c)
178 } 178 }
179 } 179 }
180 180
181 if (c->x86 != 0xF) { 181 if (c->x86 != 0xF)
182 if (!cpu_has(c, X86_FEATURE_EST))
183 printk(KERN_WARNING PFX "Unknown CPU. "
184 "Please send an e-mail to "
185 "<cpufreq@vger.kernel.org>\n");
186 return 0; 182 return 0;
187 }
188 183
189 /* on P-4s, the TSC runs with constant frequency independent whether 184 /* on P-4s, the TSC runs with constant frequency independent whether
190 * throttling is active or not. */ 185 * throttling is active or not. */
diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
index ce7cde713e71..a36de5bbb622 100644
--- a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
@@ -368,22 +368,16 @@ static int __init pcc_cpufreq_do_osc(acpi_handle *handle)
368 return -ENODEV; 368 return -ENODEV;
369 369
370 out_obj = output.pointer; 370 out_obj = output.pointer;
371 if (out_obj->type != ACPI_TYPE_BUFFER) { 371 if (out_obj->type != ACPI_TYPE_BUFFER)
372 ret = -ENODEV; 372 return -ENODEV;
373 goto out_free;
374 }
375 373
376 errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0); 374 errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0);
377 if (errors) { 375 if (errors)
378 ret = -ENODEV; 376 return -ENODEV;
379 goto out_free;
380 }
381 377
382 supported = *((u32 *)(out_obj->buffer.pointer + 4)); 378 supported = *((u32 *)(out_obj->buffer.pointer + 4));
383 if (!(supported & 0x1)) { 379 if (!(supported & 0x1))
384 ret = -ENODEV; 380 return -ENODEV;
385 goto out_free;
386 }
387 381
388out_free: 382out_free:
389 kfree(output.pointer); 383 kfree(output.pointer);
@@ -397,13 +391,17 @@ static int __init pcc_cpufreq_probe(void)
397 struct pcc_memory_resource *mem_resource; 391 struct pcc_memory_resource *mem_resource;
398 struct pcc_register_resource *reg_resource; 392 struct pcc_register_resource *reg_resource;
399 union acpi_object *out_obj, *member; 393 union acpi_object *out_obj, *member;
400 acpi_handle handle, osc_handle; 394 acpi_handle handle, osc_handle, pcch_handle;
401 int ret = 0; 395 int ret = 0;
402 396
403 status = acpi_get_handle(NULL, "\\_SB", &handle); 397 status = acpi_get_handle(NULL, "\\_SB", &handle);
404 if (ACPI_FAILURE(status)) 398 if (ACPI_FAILURE(status))
405 return -ENODEV; 399 return -ENODEV;
406 400
401 status = acpi_get_handle(handle, "PCCH", &pcch_handle);
402 if (ACPI_FAILURE(status))
403 return -ENODEV;
404
407 status = acpi_get_handle(handle, "_OSC", &osc_handle); 405 status = acpi_get_handle(handle, "_OSC", &osc_handle);
408 if (ACPI_SUCCESS(status)) { 406 if (ACPI_SUCCESS(status)) {
409 ret = pcc_cpufreq_do_osc(&osc_handle); 407 ret = pcc_cpufreq_do_osc(&osc_handle);
@@ -543,13 +541,13 @@ static int pcc_cpufreq_cpu_init(struct cpufreq_policy *policy)
543 541
544 if (!pcch_virt_addr) { 542 if (!pcch_virt_addr) {
545 result = -1; 543 result = -1;
546 goto pcch_null; 544 goto out;
547 } 545 }
548 546
549 result = pcc_get_offset(cpu); 547 result = pcc_get_offset(cpu);
550 if (result) { 548 if (result) {
551 dprintk("init: PCCP evaluation failed\n"); 549 dprintk("init: PCCP evaluation failed\n");
552 goto free; 550 goto out;
553 } 551 }
554 552
555 policy->max = policy->cpuinfo.max_freq = 553 policy->max = policy->cpuinfo.max_freq =
@@ -558,14 +556,15 @@ static int pcc_cpufreq_cpu_init(struct cpufreq_policy *policy)
558 ioread32(&pcch_hdr->minimum_frequency) * 1000; 556 ioread32(&pcch_hdr->minimum_frequency) * 1000;
559 policy->cur = pcc_get_freq(cpu); 557 policy->cur = pcc_get_freq(cpu);
560 558
559 if (!policy->cur) {
560 dprintk("init: Unable to get current CPU frequency\n");
561 result = -EINVAL;
562 goto out;
563 }
564
561 dprintk("init: policy->max is %d, policy->min is %d\n", 565 dprintk("init: policy->max is %d, policy->min is %d\n",
562 policy->max, policy->min); 566 policy->max, policy->min);
563 567out:
564 return 0;
565free:
566 pcc_clear_mapping();
567 free_percpu(pcc_cpu_info);
568pcch_null:
569 return result; 568 return result;
570} 569}
571 570
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
index 9a97116f89e5..4a45fd6e41ba 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
@@ -569,7 +569,7 @@ static int powernow_verify(struct cpufreq_policy *policy)
569 * We will then get the same kind of behaviour already tested under 569 * We will then get the same kind of behaviour already tested under
570 * the "well-known" other OS. 570 * the "well-known" other OS.
571 */ 571 */
572static int __init fixup_sgtc(void) 572static int __cpuinit fixup_sgtc(void)
573{ 573{
574 unsigned int sgtc; 574 unsigned int sgtc;
575 unsigned int m; 575 unsigned int m;
@@ -603,7 +603,7 @@ static unsigned int powernow_get(unsigned int cpu)
603} 603}
604 604
605 605
606static int __init acer_cpufreq_pst(const struct dmi_system_id *d) 606static int __cpuinit acer_cpufreq_pst(const struct dmi_system_id *d)
607{ 607{
608 printk(KERN_WARNING PFX 608 printk(KERN_WARNING PFX
609 "%s laptop with broken PST tables in BIOS detected.\n", 609 "%s laptop with broken PST tables in BIOS detected.\n",
@@ -621,7 +621,7 @@ static int __init acer_cpufreq_pst(const struct dmi_system_id *d)
621 * A BIOS update is all that can save them. 621 * A BIOS update is all that can save them.
622 * Mention this, and disable cpufreq. 622 * Mention this, and disable cpufreq.
623 */ 623 */
624static struct dmi_system_id __initdata powernow_dmi_table[] = { 624static struct dmi_system_id __cpuinitdata powernow_dmi_table[] = {
625 { 625 {
626 .callback = acer_cpufreq_pst, 626 .callback = acer_cpufreq_pst,
627 .ident = "Acer Aspire", 627 .ident = "Acer Aspire",
@@ -633,7 +633,7 @@ static struct dmi_system_id __initdata powernow_dmi_table[] = {
633 { } 633 { }
634}; 634};
635 635
636static int __init powernow_cpu_init(struct cpufreq_policy *policy) 636static int __cpuinit powernow_cpu_init(struct cpufreq_policy *policy)
637{ 637{
638 union msr_fidvidstatus fidvidstatus; 638 union msr_fidvidstatus fidvidstatus;
639 int result; 639 int result;
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index b6215b9798e2..491977baf6c0 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -1,6 +1,5 @@
1
2/* 1/*
3 * (c) 2003-2006 Advanced Micro Devices, Inc. 2 * (c) 2003-2010 Advanced Micro Devices, Inc.
4 * Your use of this code is subject to the terms and conditions of the 3 * Your use of this code is subject to the terms and conditions of the
5 * GNU general public license version 2. See "COPYING" or 4 * GNU general public license version 2. See "COPYING" or
6 * http://www.gnu.org/licenses/gpl.html 5 * http://www.gnu.org/licenses/gpl.html
@@ -10,7 +9,7 @@
10 * Based on the powernow-k7.c module written by Dave Jones. 9 * Based on the powernow-k7.c module written by Dave Jones.
11 * (C) 2003 Dave Jones on behalf of SuSE Labs 10 * (C) 2003 Dave Jones on behalf of SuSE Labs
12 * (C) 2004 Dominik Brodowski <linux@brodo.de> 11 * (C) 2004 Dominik Brodowski <linux@brodo.de>
13 * (C) 2004 Pavel Machek <pavel@suse.cz> 12 * (C) 2004 Pavel Machek <pavel@ucw.cz>
14 * Licensed under the terms of the GNU GPL License version 2. 13 * Licensed under the terms of the GNU GPL License version 2.
15 * Based upon datasheets & sample CPUs kindly provided by AMD. 14 * Based upon datasheets & sample CPUs kindly provided by AMD.
16 * 15 *
@@ -46,6 +45,7 @@
46#define PFX "powernow-k8: " 45#define PFX "powernow-k8: "
47#define VERSION "version 2.20.00" 46#define VERSION "version 2.20.00"
48#include "powernow-k8.h" 47#include "powernow-k8.h"
48#include "mperf.h"
49 49
50/* serialize freq changes */ 50/* serialize freq changes */
51static DEFINE_MUTEX(fidvid_mutex); 51static DEFINE_MUTEX(fidvid_mutex);
@@ -54,6 +54,12 @@ static DEFINE_PER_CPU(struct powernow_k8_data *, powernow_data);
54 54
55static int cpu_family = CPU_OPTERON; 55static int cpu_family = CPU_OPTERON;
56 56
57/* core performance boost */
58static bool cpb_capable, cpb_enabled;
59static struct msr __percpu *msrs;
60
61static struct cpufreq_driver cpufreq_amd64_driver;
62
57#ifndef CONFIG_SMP 63#ifndef CONFIG_SMP
58static inline const struct cpumask *cpu_core_mask(int cpu) 64static inline const struct cpumask *cpu_core_mask(int cpu)
59{ 65{
@@ -800,6 +806,8 @@ static int find_psb_table(struct powernow_k8_data *data)
800 * www.amd.com 806 * www.amd.com
801 */ 807 */
802 printk(KERN_ERR FW_BUG PFX "No PSB or ACPI _PSS objects\n"); 808 printk(KERN_ERR FW_BUG PFX "No PSB or ACPI _PSS objects\n");
809 printk(KERN_ERR PFX "Make sure that your BIOS is up to date"
810 " and Cool'N'Quiet support is enabled in BIOS setup\n");
803 return -ENODEV; 811 return -ENODEV;
804} 812}
805 813
@@ -904,8 +912,8 @@ static int fill_powernow_table_pstate(struct powernow_k8_data *data,
904{ 912{
905 int i; 913 int i;
906 u32 hi = 0, lo = 0; 914 u32 hi = 0, lo = 0;
907 rdmsr(MSR_PSTATE_CUR_LIMIT, hi, lo); 915 rdmsr(MSR_PSTATE_CUR_LIMIT, lo, hi);
908 data->max_hw_pstate = (hi & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT; 916 data->max_hw_pstate = (lo & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT;
909 917
910 for (i = 0; i < data->acpi_data.state_count; i++) { 918 for (i = 0; i < data->acpi_data.state_count; i++) {
911 u32 index; 919 u32 index;
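The hunk above corrects a swapped argument pair: the Linux rdmsr() macro is rdmsr(msr, low, high), so EAX lands in the second argument and EDX in the third, and the HW_PSTATE_MAX field of MSR_PSTATE_CUR_LIMIT lives in the low half. The limit therefore has to be masked out of lo, not hi. A minimal sketch of the corrected read, field names from the patch:

u32 lo = 0, hi = 0;

rdmsr(MSR_PSTATE_CUR_LIMIT, lo, hi);    /* rdmsr(msr, low, high) */
data->max_hw_pstate = (lo & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT;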
@@ -1017,13 +1025,12 @@ static int get_transition_latency(struct powernow_k8_data *data)
1017 } 1025 }
1018 if (max_latency == 0) { 1026 if (max_latency == 0) {
1019 /* 1027 /*
1020 * Fam 11h always returns 0 as transition latency. 1028 * Fam 11h and later may return 0 as transition latency. This
1021 * This is intended and means "very fast". While cpufreq core 1029 * is intended and means "very fast". While cpufreq core and
1022 * and governors currently can handle that gracefully, better 1030 * governors currently can handle that gracefully, better set it
1023 * set it to 1 to avoid problems in the future. 1031 * to 1 to avoid problems in the future.
1024 * For all others it's a BIOS bug.
1025 */ 1032 */
1026 if (boot_cpu_data.x86 != 0x11) 1033 if (boot_cpu_data.x86 < 0x11)
1027 printk(KERN_ERR FW_WARN PFX "Invalid zero transition " 1034 printk(KERN_ERR FW_WARN PFX "Invalid zero transition "
1028 "latency\n"); 1035 "latency\n");
1029 max_latency = 1; 1036 max_latency = 1;
@@ -1249,6 +1256,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1249 struct powernow_k8_data *data; 1256 struct powernow_k8_data *data;
1250 struct init_on_cpu init_on_cpu; 1257 struct init_on_cpu init_on_cpu;
1251 int rc; 1258 int rc;
1259 struct cpuinfo_x86 *c = &cpu_data(pol->cpu);
1252 1260
1253 if (!cpu_online(pol->cpu)) 1261 if (!cpu_online(pol->cpu))
1254 return -ENODEV; 1262 return -ENODEV;
@@ -1323,6 +1331,10 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1323 return -EINVAL; 1331 return -EINVAL;
1324 } 1332 }
1325 1333
1334 /* Check for APERF/MPERF support in hardware */
1335 if (cpu_has(c, X86_FEATURE_APERFMPERF))
1336 cpufreq_amd64_driver.getavg = cpufreq_get_measured_perf;
1337
1326 cpufreq_frequency_table_get_attr(data->powernow_table, pol->cpu); 1338 cpufreq_frequency_table_get_attr(data->powernow_table, pol->cpu);
1327 1339
1328 if (cpu_family == CPU_HW_PSTATE) 1340 if (cpu_family == CPU_HW_PSTATE)
@@ -1394,8 +1406,77 @@ out:
1394 return khz; 1406 return khz;
1395} 1407}
1396 1408
1409static void _cpb_toggle_msrs(bool t)
1410{
1411 int cpu;
1412
1413 get_online_cpus();
1414
1415 rdmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs);
1416
1417 for_each_cpu(cpu, cpu_online_mask) {
1418 struct msr *reg = per_cpu_ptr(msrs, cpu);
1419 if (t)
1420 reg->l &= ~BIT(25);
1421 else
1422 reg->l |= BIT(25);
1423 }
1424 wrmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs);
1425
1426 put_online_cpus();
1427}
1428
1429/*
1430 * Switch on/off core performance boosting.
1431 *
1432 * 0=disable
1433 * 1=enable.
1434 */
1435static void cpb_toggle(bool t)
1436{
1437 if (!cpb_capable)
1438 return;
1439
1440 if (t && !cpb_enabled) {
1441 cpb_enabled = true;
1442 _cpb_toggle_msrs(t);
1443 printk(KERN_INFO PFX "Core Boosting enabled.\n");
1444 } else if (!t && cpb_enabled) {
1445 cpb_enabled = false;
1446 _cpb_toggle_msrs(t);
1447 printk(KERN_INFO PFX "Core Boosting disabled.\n");
1448 }
1449}
1450
1451static ssize_t store_cpb(struct cpufreq_policy *policy, const char *buf,
1452 size_t count)
1453{
1454 int ret = -EINVAL;
1455 unsigned long val = 0;
1456
1457 ret = strict_strtoul(buf, 10, &val);
1458 if (!ret && (val == 0 || val == 1) && cpb_capable)
1459 cpb_toggle(val);
1460 else
1461 return -EINVAL;
1462
1463 return count;
1464}
1465
1466static ssize_t show_cpb(struct cpufreq_policy *policy, char *buf)
1467{
1468 return sprintf(buf, "%u\n", cpb_enabled);
1469}
1470
1471#define define_one_rw(_name) \
1472static struct freq_attr _name = \
1473__ATTR(_name, 0644, show_##_name, store_##_name)
1474
1475define_one_rw(cpb);
1476
1397static struct freq_attr *powernow_k8_attr[] = { 1477static struct freq_attr *powernow_k8_attr[] = {
1398 &cpufreq_freq_attr_scaling_available_freqs, 1478 &cpufreq_freq_attr_scaling_available_freqs,
1479 &cpb,
1399 NULL, 1480 NULL,
1400}; 1481};
1401 1482
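For reference, the define_one_rw() helper used above is a thin wrapper around __ATTR(); a sketch of what define_one_rw(cpb) expands to (modulo stringification details):

static struct freq_attr cpb = {
	.attr	= { .name = "cpb", .mode = 0644 },
	.show	= show_cpb,
	.store	= store_cpb,
};

The resulting attribute is what powernow_k8_attr[] exports, so a per-policy "cpb" file shows up alongside scaling_available_freqs.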
@@ -1411,10 +1492,51 @@ static struct cpufreq_driver cpufreq_amd64_driver = {
1411 .attr = powernow_k8_attr, 1492 .attr = powernow_k8_attr,
1412}; 1493};
1413 1494
1495/*
1496 * Clear the boost-disable flag on the CPU_DOWN path so that this cpu
1497 * cannot block the remaining ones from boosting. On the CPU_UP path we
1498 * simply keep the boost-disable flag in sync with the current global
1499 * state.
1500 */
1501static int cpb_notify(struct notifier_block *nb, unsigned long action,
1502 void *hcpu)
1503{
1504 unsigned cpu = (long)hcpu;
1505 u32 lo, hi;
1506
1507 switch (action) {
1508 case CPU_UP_PREPARE:
1509 case CPU_UP_PREPARE_FROZEN:
1510
1511 if (!cpb_enabled) {
1512 rdmsr_on_cpu(cpu, MSR_K7_HWCR, &lo, &hi);
1513 lo |= BIT(25);
1514 wrmsr_on_cpu(cpu, MSR_K7_HWCR, lo, hi);
1515 }
1516 break;
1517
1518 case CPU_DOWN_PREPARE:
1519 case CPU_DOWN_PREPARE_FROZEN:
1520 rdmsr_on_cpu(cpu, MSR_K7_HWCR, &lo, &hi);
1521 lo &= ~BIT(25);
1522 wrmsr_on_cpu(cpu, MSR_K7_HWCR, lo, hi);
1523 break;
1524
1525 default:
1526 break;
1527 }
1528
1529 return NOTIFY_OK;
1530}
1531
1532static struct notifier_block cpb_nb = {
1533 .notifier_call = cpb_notify,
1534};
1535
1414/* driver entry point for init */ 1536/* driver entry point for init */
1415static int __cpuinit powernowk8_init(void) 1537static int __cpuinit powernowk8_init(void)
1416{ 1538{
1417 unsigned int i, supported_cpus = 0; 1539 unsigned int i, supported_cpus = 0, cpu;
1418 1540
1419 for_each_online_cpu(i) { 1541 for_each_online_cpu(i) {
1420 int rc; 1542 int rc;
@@ -1423,15 +1545,36 @@ static int __cpuinit powernowk8_init(void)
1423 supported_cpus++; 1545 supported_cpus++;
1424 } 1546 }
1425 1547
1426 if (supported_cpus == num_online_cpus()) { 1548 if (supported_cpus != num_online_cpus())
1427 printk(KERN_INFO PFX "Found %d %s " 1549 return -ENODEV;
1428 "processors (%d cpu cores) (" VERSION ")\n", 1550
1429 num_online_nodes(), 1551 printk(KERN_INFO PFX "Found %d %s (%d cpu cores) (" VERSION ")\n",
1430 boot_cpu_data.x86_model_id, supported_cpus); 1552 num_online_nodes(), boot_cpu_data.x86_model_id, supported_cpus);
1431 return cpufreq_register_driver(&cpufreq_amd64_driver); 1553
1554 if (boot_cpu_has(X86_FEATURE_CPB)) {
1555
1556 cpb_capable = true;
1557
1558 register_cpu_notifier(&cpb_nb);
1559
1560 msrs = msrs_alloc();
1561 if (!msrs) {
1562 printk(KERN_ERR "%s: Error allocating msrs!\n", __func__);
1563 return -ENOMEM;
1564 }
1565
1566 rdmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs);
1567
1568 for_each_cpu(cpu, cpu_online_mask) {
1569 struct msr *reg = per_cpu_ptr(msrs, cpu);
1570 cpb_enabled |= !(!!(reg->l & BIT(25)));
1571 }
1572
1573 printk(KERN_INFO PFX "Core Performance Boosting: %s.\n",
1574 (cpb_enabled ? "on" : "off"));
1432 } 1575 }
1433 1576
1434 return -ENODEV; 1577 return cpufreq_register_driver(&cpufreq_amd64_driver);
1435} 1578}
1436 1579
1437/* driver entry point for term */ 1580/* driver entry point for term */
@@ -1439,6 +1582,13 @@ static void __exit powernowk8_exit(void)
1439{ 1582{
1440 dprintk("exit\n"); 1583 dprintk("exit\n");
1441 1584
1585 if (boot_cpu_has(X86_FEATURE_CPB)) {
1586 msrs_free(msrs);
1587 msrs = NULL;
1588
1589 unregister_cpu_notifier(&cpb_nb);
1590 }
1591
1442 cpufreq_unregister_driver(&cpufreq_amd64_driver); 1592 cpufreq_unregister_driver(&cpufreq_amd64_driver);
1443} 1593}
1444 1594
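As an aside, the new "cpb" attribute can be flipped from user space like any other cpufreq policy file. A minimal user-space sketch, assuming the usual cpufreq sysfs layout (the cpu0 path below is illustrative):

#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
	/* "1" enables Core Performance Boost, "0" disables it on all cores */
	const char *val = (argc > 1 && argv[1][0] == '0') ? "0" : "1";
	FILE *f = fopen("/sys/devices/system/cpu/cpu0/cpufreq/cpb", "w");

	if (!f) {
		perror("fopen");
		return EXIT_FAILURE;
	}
	fprintf(f, "%s\n", val);
	return fclose(f) ? EXIT_FAILURE : EXIT_SUCCESS;
}

store_cpb() accepts only 0 or 1 and rejects anything else with -EINVAL, so no further validation is needed on the caller's side.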
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
index 02ce824073cb..df3529b1c02d 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
@@ -5,7 +5,6 @@
5 * http://www.gnu.org/licenses/gpl.html 5 * http://www.gnu.org/licenses/gpl.html
6 */ 6 */
7 7
8
9enum pstate { 8enum pstate {
10 HW_PSTATE_INVALID = 0xff, 9 HW_PSTATE_INVALID = 0xff,
11 HW_PSTATE_0 = 0, 10 HW_PSTATE_0 = 0,
@@ -55,7 +54,6 @@ struct powernow_k8_data {
55 struct cpumask *available_cores; 54 struct cpumask *available_cores;
56}; 55};
57 56
58
59/* processor's cpuid instruction support */ 57/* processor's cpuid instruction support */
60#define CPUID_PROCESSOR_SIGNATURE 1 /* function 1 */ 58#define CPUID_PROCESSOR_SIGNATURE 1 /* function 1 */
61#define CPUID_XFAM 0x0ff00000 /* extended family */ 59#define CPUID_XFAM 0x0ff00000 /* extended family */
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index 08be922de33a..8095f8611f8a 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -21,37 +21,58 @@
21 * 21 *
22 */ 22 */
23 23
24#include <linux/module.h>
24#include <asm/processor.h> 25#include <asm/processor.h>
25#include <asm/vmware.h>
26#include <asm/hypervisor.h> 26#include <asm/hypervisor.h>
27 27
28static inline void __cpuinit 28/*
29detect_hypervisor_vendor(struct cpuinfo_x86 *c) 29 * Hypervisor detect order. This is specified explicitly here because
30 * some hypervisors might implement compatibility modes for other
 31 * hypervisors and therefore need to be detected in a specific sequence.
32 */
33static const __initconst struct hypervisor_x86 * const hypervisors[] =
30{ 34{
31 if (vmware_platform()) 35 &x86_hyper_vmware,
32 c->x86_hyper_vendor = X86_HYPER_VENDOR_VMWARE; 36 &x86_hyper_ms_hyperv,
33 else 37#ifdef CONFIG_XEN_PVHVM
34 c->x86_hyper_vendor = X86_HYPER_VENDOR_NONE; 38 &x86_hyper_xen_hvm,
35} 39#endif
40};
36 41
37static inline void __cpuinit 42const struct hypervisor_x86 *x86_hyper;
38hypervisor_set_feature_bits(struct cpuinfo_x86 *c) 43EXPORT_SYMBOL(x86_hyper);
44
45static inline void __init
46detect_hypervisor_vendor(void)
39{ 47{
40 if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE) { 48 const struct hypervisor_x86 *h, * const *p;
41 vmware_set_feature_bits(c); 49
42 return; 50 for (p = hypervisors; p < hypervisors + ARRAY_SIZE(hypervisors); p++) {
51 h = *p;
52 if (h->detect()) {
53 x86_hyper = h;
54 printk(KERN_INFO "Hypervisor detected: %s\n", h->name);
55 break;
56 }
43 } 57 }
44} 58}
45 59
46void __cpuinit init_hypervisor(struct cpuinfo_x86 *c) 60void __cpuinit init_hypervisor(struct cpuinfo_x86 *c)
47{ 61{
48 detect_hypervisor_vendor(c); 62 if (x86_hyper && x86_hyper->set_cpu_features)
49 hypervisor_set_feature_bits(c); 63 x86_hyper->set_cpu_features(c);
50} 64}
51 65
52void __init init_hypervisor_platform(void) 66void __init init_hypervisor_platform(void)
53{ 67{
68
69 detect_hypervisor_vendor();
70
71 if (!x86_hyper)
72 return;
73
54 init_hypervisor(&boot_cpu_data); 74 init_hypervisor(&boot_cpu_data);
55 if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE) 75
56 vmware_platform_setup(); 76 if (x86_hyper->init_platform)
77 x86_hyper->init_platform();
57} 78}
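The rework above replaces the VMware-specific checks with a table walk over struct hypervisor_x86 descriptors. A hedged sketch of what a hypothetical additional entry would look like (all names below are illustrative; only the callback layout follows the patch):

static bool __init example_hv_detect(void)
{
	/* probe a CPUID leaf, an MSR or an I/O port; true means "running on it" */
	return false;
}

static void __init example_hv_init_platform(void)
{
	/* one-shot platform setup, called from init_hypervisor_platform() */
}

static void __cpuinit example_hv_set_cpu_features(struct cpuinfo_x86 *c)
{
	/* per-CPU feature fixups, called from init_hypervisor() */
}

const __refconst struct hypervisor_x86 x86_hyper_example = {
	.name			= "Example HV",
	.detect			= example_hv_detect,
	.init_platform		= example_hv_init_platform,
	.set_cpu_features	= example_hv_set_cpu_features,
};

Detection order matters, so a new entry would also have to be added to the hypervisors[] array at the appropriate position.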
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 1366c7cfd483..85f69cdeae10 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -12,7 +12,6 @@
12#include <asm/processor.h> 12#include <asm/processor.h>
13#include <asm/pgtable.h> 13#include <asm/pgtable.h>
14#include <asm/msr.h> 14#include <asm/msr.h>
15#include <asm/ds.h>
16#include <asm/bugs.h> 15#include <asm/bugs.h>
17#include <asm/cpu.h> 16#include <asm/cpu.h>
18 17
@@ -373,12 +372,6 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
373 set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); 372 set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
374 } 373 }
375 374
376 if (c->cpuid_level > 6) {
377 unsigned ecx = cpuid_ecx(6);
378 if (ecx & 0x01)
379 set_cpu_cap(c, X86_FEATURE_APERFMPERF);
380 }
381
382 if (cpu_has_xmm2) 375 if (cpu_has_xmm2)
383 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); 376 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
384 if (cpu_has_ds) { 377 if (cpu_has_ds) {
@@ -388,7 +381,6 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
388 set_cpu_cap(c, X86_FEATURE_BTS); 381 set_cpu_cap(c, X86_FEATURE_BTS);
389 if (!(l1 & (1<<12))) 382 if (!(l1 & (1<<12)))
390 set_cpu_cap(c, X86_FEATURE_PEBS); 383 set_cpu_cap(c, X86_FEATURE_PEBS);
391 ds_init_intel(c);
392 } 384 }
393 385
394 if (c->x86 == 6 && c->x86_model == 29 && cpu_has_clflush) 386 if (c->x86 == 6 && c->x86_model == 29 && cpu_has_clflush)
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 95962a93f99a..898c2f4eab88 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -148,13 +148,19 @@ union _cpuid4_leaf_ecx {
148 u32 full; 148 u32 full;
149}; 149};
150 150
151struct amd_l3_cache {
152 struct pci_dev *dev;
153 bool can_disable;
154 unsigned indices;
155 u8 subcaches[4];
156};
157
151struct _cpuid4_info { 158struct _cpuid4_info {
152 union _cpuid4_leaf_eax eax; 159 union _cpuid4_leaf_eax eax;
153 union _cpuid4_leaf_ebx ebx; 160 union _cpuid4_leaf_ebx ebx;
154 union _cpuid4_leaf_ecx ecx; 161 union _cpuid4_leaf_ecx ecx;
155 unsigned long size; 162 unsigned long size;
156 bool can_disable; 163 struct amd_l3_cache *l3;
157 unsigned int l3_indices;
158 DECLARE_BITMAP(shared_cpu_map, NR_CPUS); 164 DECLARE_BITMAP(shared_cpu_map, NR_CPUS);
159}; 165};
160 166
@@ -164,8 +170,7 @@ struct _cpuid4_info_regs {
164 union _cpuid4_leaf_ebx ebx; 170 union _cpuid4_leaf_ebx ebx;
165 union _cpuid4_leaf_ecx ecx; 171 union _cpuid4_leaf_ecx ecx;
166 unsigned long size; 172 unsigned long size;
167 bool can_disable; 173 struct amd_l3_cache *l3;
168 unsigned int l3_indices;
169}; 174};
170 175
171unsigned short num_cache_leaves; 176unsigned short num_cache_leaves;
@@ -302,124 +307,246 @@ struct _cache_attr {
302}; 307};
303 308
304#ifdef CONFIG_CPU_SUP_AMD 309#ifdef CONFIG_CPU_SUP_AMD
305static unsigned int __cpuinit amd_calc_l3_indices(void) 310
311/*
312 * L3 cache descriptors
313 */
314static struct amd_l3_cache **__cpuinitdata l3_caches;
315
316static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3)
306{ 317{
307 /*
308 * We're called over smp_call_function_single() and therefore
309 * are on the correct cpu.
310 */
311 int cpu = smp_processor_id();
312 int node = cpu_to_node(cpu);
313 struct pci_dev *dev = node_to_k8_nb_misc(node);
314 unsigned int sc0, sc1, sc2, sc3; 318 unsigned int sc0, sc1, sc2, sc3;
315 u32 val = 0; 319 u32 val = 0;
316 320
317 pci_read_config_dword(dev, 0x1C4, &val); 321 pci_read_config_dword(l3->dev, 0x1C4, &val);
318 322
319 /* calculate subcache sizes */ 323 /* calculate subcache sizes */
320 sc0 = !(val & BIT(0)); 324 l3->subcaches[0] = sc0 = !(val & BIT(0));
321 sc1 = !(val & BIT(4)); 325 l3->subcaches[1] = sc1 = !(val & BIT(4));
322 sc2 = !(val & BIT(8)) + !(val & BIT(9)); 326 l3->subcaches[2] = sc2 = !(val & BIT(8)) + !(val & BIT(9));
323 sc3 = !(val & BIT(12)) + !(val & BIT(13)); 327 l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13));
324 328
325 return (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1; 329 l3->indices = (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1;
326} 330}
327 331
328static void __cpuinit 332static struct amd_l3_cache * __cpuinit amd_init_l3_cache(int node)
329amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
330{ 333{
331 if (index < 3) 334 struct amd_l3_cache *l3;
335 struct pci_dev *dev = node_to_k8_nb_misc(node);
336
337 l3 = kzalloc(sizeof(struct amd_l3_cache), GFP_ATOMIC);
338 if (!l3) {
339 printk(KERN_WARNING "Error allocating L3 struct\n");
340 return NULL;
341 }
342
343 l3->dev = dev;
344
345 amd_calc_l3_indices(l3);
346
347 return l3;
348}
349
350static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf,
351 int index)
352{
353 int node;
354
355 if (boot_cpu_data.x86 != 0x10)
332 return; 356 return;
333 357
334 if (boot_cpu_data.x86 == 0x11) 358 if (index < 3)
335 return; 359 return;
336 360
337 /* see errata #382 and #388 */ 361 /* see errata #382 and #388 */
338 if ((boot_cpu_data.x86 == 0x10) && 362 if (boot_cpu_data.x86_model < 0x8)
339 ((boot_cpu_data.x86_model < 0x8) ||
340 (boot_cpu_data.x86_mask < 0x1)))
341 return; 363 return;
342 364
365 if ((boot_cpu_data.x86_model == 0x8 ||
366 boot_cpu_data.x86_model == 0x9)
367 &&
368 boot_cpu_data.x86_mask < 0x1)
369 return;
370
343 /* not in virtualized environments */ 371 /* not in virtualized environments */
344 if (num_k8_northbridges == 0) 372 if (num_k8_northbridges == 0)
345 return; 373 return;
346 374
347 this_leaf->can_disable = true; 375 /*
348 this_leaf->l3_indices = amd_calc_l3_indices(); 376 * Strictly speaking, the amount in @size below is leaked since it is
377 * never freed but this is done only on shutdown so it doesn't matter.
378 */
379 if (!l3_caches) {
380 int size = num_k8_northbridges * sizeof(struct amd_l3_cache *);
381
382 l3_caches = kzalloc(size, GFP_ATOMIC);
383 if (!l3_caches)
384 return;
385 }
386
387 node = amd_get_nb_id(smp_processor_id());
388
389 if (!l3_caches[node]) {
390 l3_caches[node] = amd_init_l3_cache(node);
391 l3_caches[node]->can_disable = true;
392 }
393
394 WARN_ON(!l3_caches[node]);
395
396 this_leaf->l3 = l3_caches[node];
349} 397}
350 398
351static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, 399/*
352 unsigned int index) 400 * check whether a slot used for disabling an L3 index is occupied.
401 * @l3: L3 cache descriptor
402 * @slot: slot number (0..1)
403 *
 404 * @returns: the disabled index if used or a negative value if the slot is free.
405 */
406int amd_get_l3_disable_slot(struct amd_l3_cache *l3, unsigned slot)
353{ 407{
354 int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
355 int node = amd_get_nb_id(cpu);
356 struct pci_dev *dev = node_to_k8_nb_misc(node);
357 unsigned int reg = 0; 408 unsigned int reg = 0;
358 409
359 if (!this_leaf->can_disable) 410 pci_read_config_dword(l3->dev, 0x1BC + slot * 4, &reg);
360 return -EINVAL; 411
412 /* check whether this slot is activated already */
413 if (reg & (3UL << 30))
414 return reg & 0xfff;
361 415
362 if (!dev) 416 return -1;
417}
418
419static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
420 unsigned int slot)
421{
422 int index;
423
424 if (!this_leaf->l3 || !this_leaf->l3->can_disable)
363 return -EINVAL; 425 return -EINVAL;
364 426
365 pci_read_config_dword(dev, 0x1BC + index * 4, &reg); 427 index = amd_get_l3_disable_slot(this_leaf->l3, slot);
366 return sprintf(buf, "0x%08x\n", reg); 428 if (index >= 0)
429 return sprintf(buf, "%d\n", index);
430
431 return sprintf(buf, "FREE\n");
367} 432}
368 433
369#define SHOW_CACHE_DISABLE(index) \ 434#define SHOW_CACHE_DISABLE(slot) \
370static ssize_t \ 435static ssize_t \
371show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf) \ 436show_cache_disable_##slot(struct _cpuid4_info *this_leaf, char *buf) \
372{ \ 437{ \
373 return show_cache_disable(this_leaf, buf, index); \ 438 return show_cache_disable(this_leaf, buf, slot); \
374} 439}
375SHOW_CACHE_DISABLE(0) 440SHOW_CACHE_DISABLE(0)
376SHOW_CACHE_DISABLE(1) 441SHOW_CACHE_DISABLE(1)
377 442
378static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, 443static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu,
379 const char *buf, size_t count, unsigned int index) 444 unsigned slot, unsigned long idx)
380{ 445{
381 int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); 446 int i;
382 int node = amd_get_nb_id(cpu); 447
383 struct pci_dev *dev = node_to_k8_nb_misc(node); 448 idx |= BIT(30);
384 unsigned long val = 0; 449
450 /*
451 * disable index in all 4 subcaches
452 */
453 for (i = 0; i < 4; i++) {
454 u32 reg = idx | (i << 20);
455
456 if (!l3->subcaches[i])
457 continue;
458
459 pci_write_config_dword(l3->dev, 0x1BC + slot * 4, reg);
460
461 /*
 462 * We need to WBINVD on a core on the node containing the L3
 463 * cache whose indices we are disabling; a simple wbinvd()
 464 * is therefore not sufficient.
465 */
466 wbinvd_on_cpu(cpu);
467
468 reg |= BIT(31);
469 pci_write_config_dword(l3->dev, 0x1BC + slot * 4, reg);
470 }
471}
472
473/*
 474 * disable an L3 cache index by using a disable-slot
475 *
476 * @l3: L3 cache descriptor
477 * @cpu: A CPU on the node containing the L3 cache
478 * @slot: slot number (0..1)
479 * @index: index to disable
480 *
481 * @return: 0 on success, error status on failure
482 */
483int amd_set_l3_disable_slot(struct amd_l3_cache *l3, int cpu, unsigned slot,
484 unsigned long index)
485{
486 int ret = 0;
385 487
386#define SUBCACHE_MASK (3UL << 20) 488#define SUBCACHE_MASK (3UL << 20)
387#define SUBCACHE_INDEX 0xfff 489#define SUBCACHE_INDEX 0xfff
388 490
389 if (!this_leaf->can_disable) 491 /*
492 * check whether this slot is already used or
493 * the index is already disabled
494 */
495 ret = amd_get_l3_disable_slot(l3, slot);
496 if (ret >= 0)
390 return -EINVAL; 497 return -EINVAL;
391 498
499 /*
500 * check whether the other slot has disabled the
501 * same index already
502 */
503 if (index == amd_get_l3_disable_slot(l3, !slot))
504 return -EINVAL;
505
506 /* do not allow writes outside of allowed bits */
507 if ((index & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) ||
508 ((index & SUBCACHE_INDEX) > l3->indices))
509 return -EINVAL;
510
511 amd_l3_disable_index(l3, cpu, slot, index);
512
513 return 0;
514}
515
516static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
517 const char *buf, size_t count,
518 unsigned int slot)
519{
520 unsigned long val = 0;
521 int cpu, err = 0;
522
392 if (!capable(CAP_SYS_ADMIN)) 523 if (!capable(CAP_SYS_ADMIN))
393 return -EPERM; 524 return -EPERM;
394 525
395 if (!dev) 526 if (!this_leaf->l3 || !this_leaf->l3->can_disable)
396 return -EINVAL; 527 return -EINVAL;
397 528
398 if (strict_strtoul(buf, 10, &val) < 0) 529 cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
399 return -EINVAL;
400 530
401 /* do not allow writes outside of allowed bits */ 531 if (strict_strtoul(buf, 10, &val) < 0)
402 if ((val & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) ||
403 ((val & SUBCACHE_INDEX) > this_leaf->l3_indices))
404 return -EINVAL; 532 return -EINVAL;
405 533
406 val |= BIT(30); 534 err = amd_set_l3_disable_slot(this_leaf->l3, cpu, slot, val);
407 pci_write_config_dword(dev, 0x1BC + index * 4, val); 535 if (err) {
408 /* 536 if (err == -EEXIST)
409 * We need to WBINVD on a core on the node containing the L3 cache which 537 printk(KERN_WARNING "L3 disable slot %d in use!\n",
410 * indices we disable therefore a simple wbinvd() is not sufficient. 538 slot);
411 */ 539 return err;
412 wbinvd_on_cpu(cpu); 540 }
413 pci_write_config_dword(dev, 0x1BC + index * 4, val | BIT(31));
414 return count; 541 return count;
415} 542}
416 543
417#define STORE_CACHE_DISABLE(index) \ 544#define STORE_CACHE_DISABLE(slot) \
418static ssize_t \ 545static ssize_t \
419store_cache_disable_##index(struct _cpuid4_info *this_leaf, \ 546store_cache_disable_##slot(struct _cpuid4_info *this_leaf, \
420 const char *buf, size_t count) \ 547 const char *buf, size_t count) \
421{ \ 548{ \
422 return store_cache_disable(this_leaf, buf, count, index); \ 549 return store_cache_disable(this_leaf, buf, count, slot); \
423} 550}
424STORE_CACHE_DISABLE(0) 551STORE_CACHE_DISABLE(0)
425STORE_CACHE_DISABLE(1) 552STORE_CACHE_DISABLE(1)
@@ -431,7 +558,7 @@ static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644,
431 558
432#else /* CONFIG_CPU_SUP_AMD */ 559#else /* CONFIG_CPU_SUP_AMD */
433static void __cpuinit 560static void __cpuinit
434amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) 561amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, int index)
435{ 562{
436}; 563};
437#endif /* CONFIG_CPU_SUP_AMD */ 564#endif /* CONFIG_CPU_SUP_AMD */
@@ -447,8 +574,7 @@ __cpuinit cpuid4_cache_lookup_regs(int index,
447 574
448 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { 575 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
449 amd_cpuid4(index, &eax, &ebx, &ecx); 576 amd_cpuid4(index, &eax, &ebx, &ecx);
450 if (boot_cpu_data.x86 >= 0x10) 577 amd_check_l3_disable(this_leaf, index);
451 amd_check_l3_disable(index, this_leaf);
452 } else { 578 } else {
453 cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); 579 cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx);
454 } 580 }
@@ -705,6 +831,7 @@ static void __cpuinit free_cache_attributes(unsigned int cpu)
705 for (i = 0; i < num_cache_leaves; i++) 831 for (i = 0; i < num_cache_leaves; i++)
706 cache_remove_shared_cpu_map(cpu, i); 832 cache_remove_shared_cpu_map(cpu, i);
707 833
834 kfree(per_cpu(ici_cpuid4_info, cpu)->l3);
708 kfree(per_cpu(ici_cpuid4_info, cpu)); 835 kfree(per_cpu(ici_cpuid4_info, cpu));
709 per_cpu(ici_cpuid4_info, cpu) = NULL; 836 per_cpu(ici_cpuid4_info, cpu) = NULL;
710} 837}
@@ -989,7 +1116,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
989 1116
990 this_leaf = CPUID4_INFO_IDX(cpu, i); 1117 this_leaf = CPUID4_INFO_IDX(cpu, i);
991 1118
992 if (this_leaf->can_disable) 1119 if (this_leaf->l3 && this_leaf->l3->can_disable)
993 ktype_cache.default_attrs = default_l3_attrs; 1120 ktype_cache.default_attrs = default_l3_attrs;
994 else 1121 else
995 ktype_cache.default_attrs = default_attrs; 1122 ktype_cache.default_attrs = default_attrs;
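With the per-node L3 descriptor in place, the existing cache_disable_0/1 files keep working but now report either the disabled index or "FREE". A user-space sketch, assuming index3 is the shared L3 on a family 0x10 box (path and index value are illustrative):

#include <stdio.h>

#define SLOT0 "/sys/devices/system/cpu/cpu0/cache/index3/cache_disable_0"

int main(void)
{
	char line[64];
	FILE *f = fopen(SLOT0, "w");

	if (!f) {
		perror(SLOT0);
		return 1;
	}
	fprintf(f, "12\n");		/* disable L3 index 12 via slot 0 */
	fclose(f);

	f = fopen(SLOT0, "r");
	if (f && fgets(line, sizeof(line), f))
		printf("slot 0 now holds: %s", line);	/* "12" or "FREE" */
	if (f)
		fclose(f);
	return 0;
}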
diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile
index 4ac6d48fe11b..bb34b03af252 100644
--- a/arch/x86/kernel/cpu/mcheck/Makefile
+++ b/arch/x86/kernel/cpu/mcheck/Makefile
@@ -7,3 +7,5 @@ obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
7obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o 7obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o
8 8
9obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o 9obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o
10
11obj-$(CONFIG_ACPI_APEI) += mce-apei.o
diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c
new file mode 100644
index 000000000000..745b54f9be89
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c
@@ -0,0 +1,138 @@
1/*
2 * Bridge between MCE and APEI
3 *
 4 * On some machines, corrected memory errors are reported via the APEI
 5 * generic hardware error source (GHES) instead of a corrected Machine
 6 * Check. These corrected memory errors can be reported to user space
 7 * through /dev/mcelog by faking a corrected Machine Check, so that
 8 * the affected memory page can be offlined by /sbin/mcelog once the error
 9 * count for that page exceeds the threshold.
10 *
11 * For fatal MCE, save MCE record into persistent storage via ERST, so
12 * that the MCE record can be logged after reboot via ERST.
13 *
14 * Copyright 2010 Intel Corp.
15 * Author: Huang Ying <ying.huang@intel.com>
16 *
17 * This program is free software; you can redistribute it and/or
18 * modify it under the terms of the GNU General Public License version
19 * 2 as published by the Free Software Foundation.
20 *
21 * This program is distributed in the hope that it will be useful,
22 * but WITHOUT ANY WARRANTY; without even the implied warranty of
23 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24 * GNU General Public License for more details.
25 *
26 * You should have received a copy of the GNU General Public License
27 * along with this program; if not, write to the Free Software
28 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
29 */
30
31#include <linux/kernel.h>
32#include <linux/acpi.h>
33#include <linux/cper.h>
34#include <acpi/apei.h>
35#include <asm/mce.h>
36
37#include "mce-internal.h"
38
39void apei_mce_report_mem_error(int corrected, struct cper_sec_mem_err *mem_err)
40{
41 struct mce m;
42
43 /* Only corrected MC is reported */
44 if (!corrected)
45 return;
46
47 mce_setup(&m);
48 m.bank = 1;
49 /* Fake a memory read corrected error with unknown channel */
50 m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | 0x9f;
51 m.addr = mem_err->physical_addr;
52 mce_log(&m);
53 mce_notify_irq();
54}
55EXPORT_SYMBOL_GPL(apei_mce_report_mem_error);
56
57#define CPER_CREATOR_MCE \
58 UUID_LE(0x75a574e3, 0x5052, 0x4b29, 0x8a, 0x8e, 0xbe, 0x2c, \
59 0x64, 0x90, 0xb8, 0x9d)
60#define CPER_SECTION_TYPE_MCE \
61 UUID_LE(0xfe08ffbe, 0x95e4, 0x4be7, 0xbc, 0x73, 0x40, 0x96, \
62 0x04, 0x4a, 0x38, 0xfc)
63
64/*
 65 * The CPER specification (UEFI specification 2.3, appendix N) requires
 66 * the record to be byte-packed.
67 */
68struct cper_mce_record {
69 struct cper_record_header hdr;
70 struct cper_section_descriptor sec_hdr;
71 struct mce mce;
72} __packed;
73
74int apei_write_mce(struct mce *m)
75{
76 struct cper_mce_record rcd;
77
78 memset(&rcd, 0, sizeof(rcd));
79 memcpy(rcd.hdr.signature, CPER_SIG_RECORD, CPER_SIG_SIZE);
80 rcd.hdr.revision = CPER_RECORD_REV;
81 rcd.hdr.signature_end = CPER_SIG_END;
82 rcd.hdr.section_count = 1;
83 rcd.hdr.error_severity = CPER_SER_FATAL;
84 /* timestamp, platform_id, partition_id are all invalid */
85 rcd.hdr.validation_bits = 0;
86 rcd.hdr.record_length = sizeof(rcd);
87 rcd.hdr.creator_id = CPER_CREATOR_MCE;
88 rcd.hdr.notification_type = CPER_NOTIFY_MCE;
89 rcd.hdr.record_id = cper_next_record_id();
90 rcd.hdr.flags = CPER_HW_ERROR_FLAGS_PREVERR;
91
92 rcd.sec_hdr.section_offset = (void *)&rcd.mce - (void *)&rcd;
93 rcd.sec_hdr.section_length = sizeof(rcd.mce);
94 rcd.sec_hdr.revision = CPER_SEC_REV;
 95 /* fru_id and fru_text are invalid */
96 rcd.sec_hdr.validation_bits = 0;
97 rcd.sec_hdr.flags = CPER_SEC_PRIMARY;
98 rcd.sec_hdr.section_type = CPER_SECTION_TYPE_MCE;
99 rcd.sec_hdr.section_severity = CPER_SER_FATAL;
100
101 memcpy(&rcd.mce, m, sizeof(*m));
102
103 return erst_write(&rcd.hdr);
104}
105
106ssize_t apei_read_mce(struct mce *m, u64 *record_id)
107{
108 struct cper_mce_record rcd;
109 ssize_t len;
110
111 len = erst_read_next(&rcd.hdr, sizeof(rcd));
112 if (len <= 0)
113 return len;
 114 /* Cannot skip other records in ERST storage unless we clear them */
115 else if (len != sizeof(rcd) ||
116 uuid_le_cmp(rcd.hdr.creator_id, CPER_CREATOR_MCE)) {
117 if (printk_ratelimit())
118 pr_warning(
119 "MCE-APEI: Can not skip the unknown record in ERST");
120 return -EIO;
121 }
122
123 memcpy(m, &rcd.mce, sizeof(*m));
124 *record_id = rcd.hdr.record_id;
125
126 return sizeof(*m);
127}
128
129/* Check whether there is record in ERST */
130int apei_check_mce(void)
131{
132 return erst_get_record_count();
133}
134
135int apei_clear_mce(u64 record_id)
136{
137 return erst_clear(record_id);
138}
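Taken together, the new helpers let a consumer drain whatever MCE records survived the previous boot in ERST. A kernel-side sketch, under the assumption that it runs in the same context as the mcelog reader (the real consumer is __mce_read_apei(), added to mce.c below):

static void drain_persisted_mce(void)
{
	struct mce m;
	u64 record_id;

	if (apei_check_mce() <= 0)	/* nothing persisted from a prior boot */
		return;

	for (;;) {
		ssize_t len = apei_read_mce(&m, &record_id);

		if (len <= 0)		/* error or no more records */
			break;
		mce_log(&m);		/* feed it into the regular ring buffer */
		apei_clear_mce(record_id);
	}
}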
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index 32996f9fab67..fefcc69ee8b5 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -28,3 +28,26 @@ extern int mce_ser;
28 28
29extern struct mce_bank *mce_banks; 29extern struct mce_bank *mce_banks;
30 30
31#ifdef CONFIG_ACPI_APEI
32int apei_write_mce(struct mce *m);
33ssize_t apei_read_mce(struct mce *m, u64 *record_id);
34int apei_check_mce(void);
35int apei_clear_mce(u64 record_id);
36#else
37static inline int apei_write_mce(struct mce *m)
38{
39 return -EINVAL;
40}
41static inline ssize_t apei_read_mce(struct mce *m, u64 *record_id)
42{
43 return 0;
44}
45static inline int apei_check_mce(void)
46{
47 return 0;
48}
49static inline int apei_clear_mce(u64 record_id)
50{
51 return -EINVAL;
52}
53#endif
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 8a6f0afa767e..ed41562909fe 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -36,6 +36,7 @@
36#include <linux/fs.h> 36#include <linux/fs.h>
37#include <linux/mm.h> 37#include <linux/mm.h>
38#include <linux/debugfs.h> 38#include <linux/debugfs.h>
39#include <linux/edac_mce.h>
39 40
40#include <asm/processor.h> 41#include <asm/processor.h>
41#include <asm/hw_irq.h> 42#include <asm/hw_irq.h>
@@ -50,7 +51,7 @@
50static DEFINE_MUTEX(mce_read_mutex); 51static DEFINE_MUTEX(mce_read_mutex);
51 52
52#define rcu_dereference_check_mce(p) \ 53#define rcu_dereference_check_mce(p) \
53 rcu_dereference_check((p), \ 54 rcu_dereference_index_check((p), \
54 rcu_read_lock_sched_held() || \ 55 rcu_read_lock_sched_held() || \
55 lockdep_is_held(&mce_read_mutex)) 56 lockdep_is_held(&mce_read_mutex))
56 57
@@ -106,8 +107,8 @@ EXPORT_SYMBOL_GPL(x86_mce_decoder_chain);
106static int default_decode_mce(struct notifier_block *nb, unsigned long val, 107static int default_decode_mce(struct notifier_block *nb, unsigned long val,
107 void *data) 108 void *data)
108{ 109{
109 pr_emerg("No human readable MCE decoding support on this CPU type.\n"); 110 pr_emerg(HW_ERR "No human readable MCE decoding support on this CPU type.\n");
110 pr_emerg("Run the message through 'mcelog --ascii' to decode.\n"); 111 pr_emerg(HW_ERR "Run the message through 'mcelog --ascii' to decode.\n");
111 112
112 return NOTIFY_STOP; 113 return NOTIFY_STOP;
113} 114}
@@ -169,6 +170,15 @@ void mce_log(struct mce *mce)
169 entry = rcu_dereference_check_mce(mcelog.next); 170 entry = rcu_dereference_check_mce(mcelog.next);
170 for (;;) { 171 for (;;) {
171 /* 172 /*
173 * If edac_mce is enabled, it will check the error type
 174 * and will process it if it is a known error.
 175 * Otherwise, the error will be sent through the mcelog
 176 * interface.
177 */
178 if (edac_mce_parse(mce))
179 return;
180
181 /*
172 * When the buffer fills up discard new entries. 182 * When the buffer fills up discard new entries.
173 * Assume that the earlier errors are the more 183 * Assume that the earlier errors are the more
174 * interesting ones: 184 * interesting ones:
@@ -201,11 +211,11 @@ void mce_log(struct mce *mce)
201 211
202static void print_mce(struct mce *m) 212static void print_mce(struct mce *m)
203{ 213{
204 pr_emerg("CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", 214 pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n",
205 m->extcpu, m->mcgstatus, m->bank, m->status); 215 m->extcpu, m->mcgstatus, m->bank, m->status);
206 216
207 if (m->ip) { 217 if (m->ip) {
208 pr_emerg("RIP%s %02x:<%016Lx> ", 218 pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
209 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", 219 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
210 m->cs, m->ip); 220 m->cs, m->ip);
211 221
@@ -214,14 +224,14 @@ static void print_mce(struct mce *m)
214 pr_cont("\n"); 224 pr_cont("\n");
215 } 225 }
216 226
217 pr_emerg("TSC %llx ", m->tsc); 227 pr_emerg(HW_ERR "TSC %llx ", m->tsc);
218 if (m->addr) 228 if (m->addr)
219 pr_cont("ADDR %llx ", m->addr); 229 pr_cont("ADDR %llx ", m->addr);
220 if (m->misc) 230 if (m->misc)
221 pr_cont("MISC %llx ", m->misc); 231 pr_cont("MISC %llx ", m->misc);
222 232
223 pr_cont("\n"); 233 pr_cont("\n");
224 pr_emerg("PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", 234 pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
225 m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid); 235 m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid);
226 236
227 /* 237 /*
@@ -231,16 +241,6 @@ static void print_mce(struct mce *m)
231 atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); 241 atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
232} 242}
233 243
234static void print_mce_head(void)
235{
236 pr_emerg("\nHARDWARE ERROR\n");
237}
238
239static void print_mce_tail(void)
240{
241 pr_emerg("This is not a software problem!\n");
242}
243
244#define PANIC_TIMEOUT 5 /* 5 seconds */ 244#define PANIC_TIMEOUT 5 /* 5 seconds */
245 245
246static atomic_t mce_paniced; 246static atomic_t mce_paniced;
@@ -264,7 +264,7 @@ static void wait_for_panic(void)
264 264
265static void mce_panic(char *msg, struct mce *final, char *exp) 265static void mce_panic(char *msg, struct mce *final, char *exp)
266{ 266{
267 int i; 267 int i, apei_err = 0;
268 268
269 if (!fake_panic) { 269 if (!fake_panic) {
270 /* 270 /*
@@ -281,14 +281,16 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
281 if (atomic_inc_return(&mce_fake_paniced) > 1) 281 if (atomic_inc_return(&mce_fake_paniced) > 1)
282 return; 282 return;
283 } 283 }
284 print_mce_head();
285 /* First print corrected ones that are still unlogged */ 284 /* First print corrected ones that are still unlogged */
286 for (i = 0; i < MCE_LOG_LEN; i++) { 285 for (i = 0; i < MCE_LOG_LEN; i++) {
287 struct mce *m = &mcelog.entry[i]; 286 struct mce *m = &mcelog.entry[i];
288 if (!(m->status & MCI_STATUS_VAL)) 287 if (!(m->status & MCI_STATUS_VAL))
289 continue; 288 continue;
290 if (!(m->status & MCI_STATUS_UC)) 289 if (!(m->status & MCI_STATUS_UC)) {
291 print_mce(m); 290 print_mce(m);
291 if (!apei_err)
292 apei_err = apei_write_mce(m);
293 }
292 } 294 }
293 /* Now print uncorrected but with the final one last */ 295 /* Now print uncorrected but with the final one last */
294 for (i = 0; i < MCE_LOG_LEN; i++) { 296 for (i = 0; i < MCE_LOG_LEN; i++) {
@@ -297,22 +299,27 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
297 continue; 299 continue;
298 if (!(m->status & MCI_STATUS_UC)) 300 if (!(m->status & MCI_STATUS_UC))
299 continue; 301 continue;
300 if (!final || memcmp(m, final, sizeof(struct mce))) 302 if (!final || memcmp(m, final, sizeof(struct mce))) {
301 print_mce(m); 303 print_mce(m);
304 if (!apei_err)
305 apei_err = apei_write_mce(m);
306 }
302 } 307 }
303 if (final) 308 if (final) {
304 print_mce(final); 309 print_mce(final);
310 if (!apei_err)
311 apei_err = apei_write_mce(final);
312 }
305 if (cpu_missing) 313 if (cpu_missing)
306 printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n"); 314 pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
307 print_mce_tail();
308 if (exp) 315 if (exp)
309 printk(KERN_EMERG "Machine check: %s\n", exp); 316 pr_emerg(HW_ERR "Machine check: %s\n", exp);
310 if (!fake_panic) { 317 if (!fake_panic) {
311 if (panic_timeout == 0) 318 if (panic_timeout == 0)
312 panic_timeout = mce_panic_timeout; 319 panic_timeout = mce_panic_timeout;
313 panic(msg); 320 panic(msg);
314 } else 321 } else
315 printk(KERN_EMERG "Fake kernel panic: %s\n", msg); 322 pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
316} 323}
317 324
318/* Support code for software error injection */ 325/* Support code for software error injection */
@@ -539,7 +546,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
539 struct mce m; 546 struct mce m;
540 int i; 547 int i;
541 548
542 __get_cpu_var(mce_poll_count)++; 549 percpu_inc(mce_poll_count);
543 550
544 mce_setup(&m); 551 mce_setup(&m);
545 552
@@ -581,6 +588,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
581 */ 588 */
582 if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) { 589 if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) {
583 mce_log(&m); 590 mce_log(&m);
591 atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, &m);
584 add_taint(TAINT_MACHINE_CHECK); 592 add_taint(TAINT_MACHINE_CHECK);
585 } 593 }
586 594
@@ -934,7 +942,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
934 942
935 atomic_inc(&mce_entry); 943 atomic_inc(&mce_entry);
936 944
937 __get_cpu_var(mce_exception_count)++; 945 percpu_inc(mce_exception_count);
938 946
939 if (notify_die(DIE_NMI, "machine check", regs, error_code, 947 if (notify_die(DIE_NMI, "machine check", regs, error_code,
940 18, SIGKILL) == NOTIFY_STOP) 948 18, SIGKILL) == NOTIFY_STOP)
@@ -1201,7 +1209,7 @@ int mce_notify_irq(void)
1201 schedule_work(&mce_trigger_work); 1209 schedule_work(&mce_trigger_work);
1202 1210
1203 if (__ratelimit(&ratelimit)) 1211 if (__ratelimit(&ratelimit))
1204 printk(KERN_INFO "Machine check events logged\n"); 1212 pr_info(HW_ERR "Machine check events logged\n");
1205 1213
1206 return 1; 1214 return 1;
1207 } 1215 }
@@ -1493,6 +1501,43 @@ static void collect_tscs(void *data)
1493 rdtscll(cpu_tsc[smp_processor_id()]); 1501 rdtscll(cpu_tsc[smp_processor_id()]);
1494} 1502}
1495 1503
1504static int mce_apei_read_done;
1505
1506/* Collect MCE record of previous boot in persistent storage via APEI ERST. */
1507static int __mce_read_apei(char __user **ubuf, size_t usize)
1508{
1509 int rc;
1510 u64 record_id;
1511 struct mce m;
1512
1513 if (usize < sizeof(struct mce))
1514 return -EINVAL;
1515
1516 rc = apei_read_mce(&m, &record_id);
1517 /* Error or no more MCE record */
1518 if (rc <= 0) {
1519 mce_apei_read_done = 1;
1520 return rc;
1521 }
1522 rc = -EFAULT;
1523 if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
1524 return rc;
1525 /*
 1526 * Ideally the record should only be cleared after it has
 1527 * been flushed to disk or sent over the network by
 1528 * /sbin/mcelog, but we have no interface to support that now,
 1529 * so just clear it here to avoid duplication.
1530 */
1531 rc = apei_clear_mce(record_id);
1532 if (rc) {
1533 mce_apei_read_done = 1;
1534 return rc;
1535 }
1536 *ubuf += sizeof(struct mce);
1537
1538 return 0;
1539}
1540
1496static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, 1541static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
1497 loff_t *off) 1542 loff_t *off)
1498{ 1543{
@@ -1506,15 +1551,19 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
1506 return -ENOMEM; 1551 return -ENOMEM;
1507 1552
1508 mutex_lock(&mce_read_mutex); 1553 mutex_lock(&mce_read_mutex);
1554
1555 if (!mce_apei_read_done) {
1556 err = __mce_read_apei(&buf, usize);
1557 if (err || buf != ubuf)
1558 goto out;
1559 }
1560
1509 next = rcu_dereference_check_mce(mcelog.next); 1561 next = rcu_dereference_check_mce(mcelog.next);
1510 1562
1511 /* Only supports full reads right now */ 1563 /* Only supports full reads right now */
1512 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) { 1564 err = -EINVAL;
1513 mutex_unlock(&mce_read_mutex); 1565 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
1514 kfree(cpu_tsc); 1566 goto out;
1515
1516 return -EINVAL;
1517 }
1518 1567
1519 err = 0; 1568 err = 0;
1520 prev = 0; 1569 prev = 0;
@@ -1562,10 +1611,15 @@ timeout:
1562 memset(&mcelog.entry[i], 0, sizeof(struct mce)); 1611 memset(&mcelog.entry[i], 0, sizeof(struct mce));
1563 } 1612 }
1564 } 1613 }
1614
1615 if (err)
1616 err = -EFAULT;
1617
1618out:
1565 mutex_unlock(&mce_read_mutex); 1619 mutex_unlock(&mce_read_mutex);
1566 kfree(cpu_tsc); 1620 kfree(cpu_tsc);
1567 1621
1568 return err ? -EFAULT : buf - ubuf; 1622 return err ? err : buf - ubuf;
1569} 1623}
1570 1624
1571static unsigned int mce_poll(struct file *file, poll_table *wait) 1625static unsigned int mce_poll(struct file *file, poll_table *wait)
@@ -1573,6 +1627,8 @@ static unsigned int mce_poll(struct file *file, poll_table *wait)
1573 poll_wait(file, &mce_wait, wait); 1627 poll_wait(file, &mce_wait, wait);
1574 if (rcu_dereference_check_mce(mcelog.next)) 1628 if (rcu_dereference_check_mce(mcelog.next))
1575 return POLLIN | POLLRDNORM; 1629 return POLLIN | POLLRDNORM;
1630 if (!mce_apei_read_done && apei_check_mce())
1631 return POLLIN | POLLRDNORM;
1576 return 0; 1632 return 0;
1577} 1633}
1578 1634
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index 62b48e40920a..6fcd0936194f 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -95,19 +95,20 @@ static void cmci_discover(int banks, int boot)
95 rdmsrl(MSR_IA32_MCx_CTL2(i), val); 95 rdmsrl(MSR_IA32_MCx_CTL2(i), val);
96 96
97 /* Already owned by someone else? */ 97 /* Already owned by someone else? */
98 if (val & CMCI_EN) { 98 if (val & MCI_CTL2_CMCI_EN) {
99 if (test_and_clear_bit(i, owned) && !boot) 99 if (test_and_clear_bit(i, owned) && !boot)
100 print_update("SHD", &hdr, i); 100 print_update("SHD", &hdr, i);
101 __clear_bit(i, __get_cpu_var(mce_poll_banks)); 101 __clear_bit(i, __get_cpu_var(mce_poll_banks));
102 continue; 102 continue;
103 } 103 }
104 104
105 val |= CMCI_EN | CMCI_THRESHOLD; 105 val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
106 val |= MCI_CTL2_CMCI_EN | CMCI_THRESHOLD;
106 wrmsrl(MSR_IA32_MCx_CTL2(i), val); 107 wrmsrl(MSR_IA32_MCx_CTL2(i), val);
107 rdmsrl(MSR_IA32_MCx_CTL2(i), val); 108 rdmsrl(MSR_IA32_MCx_CTL2(i), val);
108 109
109 /* Did the enable bit stick? -- the bank supports CMCI */ 110 /* Did the enable bit stick? -- the bank supports CMCI */
110 if (val & CMCI_EN) { 111 if (val & MCI_CTL2_CMCI_EN) {
111 if (!test_and_set_bit(i, owned) && !boot) 112 if (!test_and_set_bit(i, owned) && !boot)
112 print_update("CMCI", &hdr, i); 113 print_update("CMCI", &hdr, i);
113 __clear_bit(i, __get_cpu_var(mce_poll_banks)); 114 __clear_bit(i, __get_cpu_var(mce_poll_banks));
@@ -155,7 +156,7 @@ void cmci_clear(void)
155 continue; 156 continue;
156 /* Disable CMCI */ 157 /* Disable CMCI */
157 rdmsrl(MSR_IA32_MCx_CTL2(i), val); 158 rdmsrl(MSR_IA32_MCx_CTL2(i), val);
158 val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK); 159 val &= ~(MCI_CTL2_CMCI_EN|MCI_CTL2_CMCI_THRESHOLD_MASK);
159 wrmsrl(MSR_IA32_MCx_CTL2(i), val); 160 wrmsrl(MSR_IA32_MCx_CTL2(i), val);
160 __clear_bit(i, __get_cpu_var(mce_banks_owned)); 161 __clear_bit(i, __get_cpu_var(mce_banks_owned));
161 } 162 }
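The point of clearing MCI_CTL2_CMCI_THRESHOLD_MASK in cmci_discover() is that the threshold field may already hold firmware-programmed bits; OR-ing CMCI_THRESHOLD on top of those would yield a bogus value. A sketch of the intended read-modify-write pattern ("bank" stands for the loop index i in the real code):

	u64 val;

	rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
	val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;	/* drop any stale threshold bits */
	val |= MCI_CTL2_CMCI_EN | CMCI_THRESHOLD;
	wrmsrl(MSR_IA32_MCx_CTL2(bank), val);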
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 81c499eceb21..c2a8b26d4fea 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -34,15 +34,25 @@
34/* How long to wait between reporting thermal events */ 34/* How long to wait between reporting thermal events */
35#define CHECK_INTERVAL (300 * HZ) 35#define CHECK_INTERVAL (300 * HZ)
36 36
37#define THERMAL_THROTTLING_EVENT 0
38#define POWER_LIMIT_EVENT 1
39
37/* 40/*
38 * Current thermal throttling state: 41 * Current thermal event state:
39 */ 42 */
40struct thermal_state { 43struct _thermal_state {
41 bool is_throttled; 44 bool new_event;
42 45 int event;
43 u64 next_check; 46 u64 next_check;
44 unsigned long throttle_count; 47 unsigned long count;
45 unsigned long last_throttle_count; 48 unsigned long last_count;
49};
50
51struct thermal_state {
52 struct _thermal_state core_throttle;
53 struct _thermal_state core_power_limit;
54 struct _thermal_state package_throttle;
55 struct _thermal_state package_power_limit;
46}; 56};
47 57
48static DEFINE_PER_CPU(struct thermal_state, thermal_state); 58static DEFINE_PER_CPU(struct thermal_state, thermal_state);
@@ -53,11 +63,13 @@ static u32 lvtthmr_init __read_mostly;
53 63
54#ifdef CONFIG_SYSFS 64#ifdef CONFIG_SYSFS
55#define define_therm_throt_sysdev_one_ro(_name) \ 65#define define_therm_throt_sysdev_one_ro(_name) \
56 static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL) 66 static SYSDEV_ATTR(_name, 0444, \
67 therm_throt_sysdev_show_##_name, \
68 NULL) \
57 69
58#define define_therm_throt_sysdev_show_func(name) \ 70#define define_therm_throt_sysdev_show_func(event, name) \
59 \ 71 \
60static ssize_t therm_throt_sysdev_show_##name( \ 72static ssize_t therm_throt_sysdev_show_##event##_##name( \
61 struct sys_device *dev, \ 73 struct sys_device *dev, \
62 struct sysdev_attribute *attr, \ 74 struct sysdev_attribute *attr, \
63 char *buf) \ 75 char *buf) \
@@ -66,30 +78,42 @@ static ssize_t therm_throt_sysdev_show_##name( \
66 ssize_t ret; \ 78 ssize_t ret; \
67 \ 79 \
68 preempt_disable(); /* CPU hotplug */ \ 80 preempt_disable(); /* CPU hotplug */ \
69 if (cpu_online(cpu)) \ 81 if (cpu_online(cpu)) { \
70 ret = sprintf(buf, "%lu\n", \ 82 ret = sprintf(buf, "%lu\n", \
71 per_cpu(thermal_state, cpu).name); \ 83 per_cpu(thermal_state, cpu).event.name); \
72 else \ 84 } else \
73 ret = 0; \ 85 ret = 0; \
74 preempt_enable(); \ 86 preempt_enable(); \
75 \ 87 \
76 return ret; \ 88 return ret; \
77} 89}
78 90
79define_therm_throt_sysdev_show_func(throttle_count); 91define_therm_throt_sysdev_show_func(core_throttle, count);
80define_therm_throt_sysdev_one_ro(throttle_count); 92define_therm_throt_sysdev_one_ro(core_throttle_count);
93
94define_therm_throt_sysdev_show_func(core_power_limit, count);
95define_therm_throt_sysdev_one_ro(core_power_limit_count);
96
97define_therm_throt_sysdev_show_func(package_throttle, count);
98define_therm_throt_sysdev_one_ro(package_throttle_count);
99
100define_therm_throt_sysdev_show_func(package_power_limit, count);
101define_therm_throt_sysdev_one_ro(package_power_limit_count);
81 102
82static struct attribute *thermal_throttle_attrs[] = { 103static struct attribute *thermal_throttle_attrs[] = {
83 &attr_throttle_count.attr, 104 &attr_core_throttle_count.attr,
84 NULL 105 NULL
85}; 106};
86 107
87static struct attribute_group thermal_throttle_attr_group = { 108static struct attribute_group thermal_attr_group = {
88 .attrs = thermal_throttle_attrs, 109 .attrs = thermal_throttle_attrs,
89 .name = "thermal_throttle" 110 .name = "thermal_throttle"
90}; 111};
91#endif /* CONFIG_SYSFS */ 112#endif /* CONFIG_SYSFS */
92 113
114#define CORE_LEVEL 0
115#define PACKAGE_LEVEL 1
116
93/*** 117/***
94 * therm_throt_process - Process thermal throttling event from interrupt 118 * therm_throt_process - Process thermal throttling event from interrupt
95 * @curr: Whether the condition is current or not (boolean), since the 119 * @curr: Whether the condition is current or not (boolean), since the
@@ -106,39 +130,70 @@ static struct attribute_group thermal_throttle_attr_group = {
106 * 1 : Event should be logged further, and a message has been 130 * 1 : Event should be logged further, and a message has been
107 * printed to the syslog. 131 * printed to the syslog.
108 */ 132 */
109static int therm_throt_process(bool is_throttled) 133static int therm_throt_process(bool new_event, int event, int level)
110{ 134{
111 struct thermal_state *state; 135 struct _thermal_state *state;
112 unsigned int this_cpu; 136 unsigned int this_cpu = smp_processor_id();
113 bool was_throttled; 137 bool old_event;
114 u64 now; 138 u64 now;
139 struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu);
115 140
116 this_cpu = smp_processor_id();
117 now = get_jiffies_64(); 141 now = get_jiffies_64();
118 state = &per_cpu(thermal_state, this_cpu); 142 if (level == CORE_LEVEL) {
143 if (event == THERMAL_THROTTLING_EVENT)
144 state = &pstate->core_throttle;
145 else if (event == POWER_LIMIT_EVENT)
146 state = &pstate->core_power_limit;
147 else
148 return 0;
149 } else if (level == PACKAGE_LEVEL) {
150 if (event == THERMAL_THROTTLING_EVENT)
151 state = &pstate->package_throttle;
152 else if (event == POWER_LIMIT_EVENT)
153 state = &pstate->package_power_limit;
154 else
155 return 0;
156 } else
157 return 0;
119 158
120 was_throttled = state->is_throttled; 159 old_event = state->new_event;
121 state->is_throttled = is_throttled; 160 state->new_event = new_event;
122 161
123 if (is_throttled) 162 if (new_event)
124 state->throttle_count++; 163 state->count++;
125 164
126 if (time_before64(now, state->next_check) && 165 if (time_before64(now, state->next_check) &&
127 state->throttle_count != state->last_throttle_count) 166 state->count != state->last_count)
128 return 0; 167 return 0;
129 168
130 state->next_check = now + CHECK_INTERVAL; 169 state->next_check = now + CHECK_INTERVAL;
131 state->last_throttle_count = state->throttle_count; 170 state->last_count = state->count;
132 171
133 /* if we just entered the thermal event */ 172 /* if we just entered the thermal event */
134 if (is_throttled) { 173 if (new_event) {
135 printk(KERN_CRIT "CPU%d: Temperature above threshold, cpu clock throttled (total events = %lu)\n", this_cpu, state->throttle_count); 174 if (event == THERMAL_THROTTLING_EVENT)
175 printk(KERN_CRIT "CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n",
176 this_cpu,
177 level == CORE_LEVEL ? "Core" : "Package",
178 state->count);
179 else
180 printk(KERN_CRIT "CPU%d: %s power limit notification (total events = %lu)\n",
181 this_cpu,
182 level == CORE_LEVEL ? "Core" : "Package",
183 state->count);
136 184
137 add_taint(TAINT_MACHINE_CHECK); 185 add_taint(TAINT_MACHINE_CHECK);
138 return 1; 186 return 1;
139 } 187 }
140 if (was_throttled) { 188 if (old_event) {
141 printk(KERN_INFO "CPU%d: Temperature/speed normal\n", this_cpu); 189 if (event == THERMAL_THROTTLING_EVENT)
190 printk(KERN_INFO "CPU%d: %s temperature/speed normal\n",
191 this_cpu,
192 level == CORE_LEVEL ? "Core" : "Package");
193 else
194 printk(KERN_INFO "CPU%d: %s power limit normal\n",
195 this_cpu,
196 level == CORE_LEVEL ? "Core" : "Package");
142 return 1; 197 return 1;
143 } 198 }
144 199
@@ -149,13 +204,32 @@ static int therm_throt_process(bool is_throttled)
149/* Add/Remove thermal_throttle interface for CPU device: */ 204/* Add/Remove thermal_throttle interface for CPU device: */
150static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev) 205static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev)
151{ 206{
152 return sysfs_create_group(&sys_dev->kobj, 207 int err;
153 &thermal_throttle_attr_group); 208 struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
209
210 err = sysfs_create_group(&sys_dev->kobj, &thermal_attr_group);
211 if (err)
212 return err;
213
214 if (cpu_has(c, X86_FEATURE_PLN))
215 err = sysfs_add_file_to_group(&sys_dev->kobj,
216 &attr_core_power_limit_count.attr,
217 thermal_attr_group.name);
218 if (cpu_has(c, X86_FEATURE_PTS))
219 err = sysfs_add_file_to_group(&sys_dev->kobj,
220 &attr_package_throttle_count.attr,
221 thermal_attr_group.name);
222 if (cpu_has(c, X86_FEATURE_PLN))
223 err = sysfs_add_file_to_group(&sys_dev->kobj,
224 &attr_package_power_limit_count.attr,
225 thermal_attr_group.name);
226
227 return err;
154} 228}
155 229
156static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev) 230static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev)
157{ 231{
158 sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group); 232 sysfs_remove_group(&sys_dev->kobj, &thermal_attr_group);
159} 233}
160 234
161/* Mutex protecting device creation against CPU hotplug: */ 235/* Mutex protecting device creation against CPU hotplug: */
@@ -190,7 +264,7 @@ thermal_throttle_cpu_callback(struct notifier_block *nfb,
190 mutex_unlock(&therm_cpu_lock); 264 mutex_unlock(&therm_cpu_lock);
191 break; 265 break;
192 } 266 }
193 return err ? NOTIFY_BAD : NOTIFY_OK; 267 return notifier_from_errno(err);
194} 268}
195 269
196static struct notifier_block thermal_throttle_cpu_notifier __cpuinitdata = 270static struct notifier_block thermal_throttle_cpu_notifier __cpuinitdata =
@@ -226,14 +300,50 @@ device_initcall(thermal_throttle_init_device);
226 300
227#endif /* CONFIG_SYSFS */ 301#endif /* CONFIG_SYSFS */
228 302
303/*
 304 * Use the two most significant bits to tell the mce log which thermal
 305 * event type this is.
 306 * This is a temporary solution and may change in the future once the
 307 * mce log infrastructure is reworked.
308 */
309#define CORE_THROTTLED (0)
310#define CORE_POWER_LIMIT ((__u64)1 << 62)
311#define PACKAGE_THROTTLED ((__u64)2 << 62)
312#define PACKAGE_POWER_LIMIT ((__u64)3 << 62)
313
229/* Thermal transition interrupt handler */ 314/* Thermal transition interrupt handler */
230static void intel_thermal_interrupt(void) 315static void intel_thermal_interrupt(void)
231{ 316{
232 __u64 msr_val; 317 __u64 msr_val;
318 struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
233 319
234 rdmsrl(MSR_IA32_THERM_STATUS, msr_val); 320 rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
235 if (therm_throt_process((msr_val & THERM_STATUS_PROCHOT) != 0)) 321
236 mce_log_therm_throt_event(msr_val); 322 if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT,
323 THERMAL_THROTTLING_EVENT,
324 CORE_LEVEL) != 0)
325 mce_log_therm_throt_event(CORE_THROTTLED | msr_val);
326
327 if (cpu_has(c, X86_FEATURE_PLN))
328 if (therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT,
329 POWER_LIMIT_EVENT,
330 CORE_LEVEL) != 0)
331 mce_log_therm_throt_event(CORE_POWER_LIMIT | msr_val);
332
333 if (cpu_has(c, X86_FEATURE_PTS)) {
334 rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val);
335 if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT,
336 THERMAL_THROTTLING_EVENT,
337 PACKAGE_LEVEL) != 0)
338 mce_log_therm_throt_event(PACKAGE_THROTTLED | msr_val);
339 if (cpu_has(c, X86_FEATURE_PLN))
340 if (therm_throt_process(msr_val &
341 PACKAGE_THERM_STATUS_POWER_LIMIT,
342 POWER_LIMIT_EVENT,
343 PACKAGE_LEVEL) != 0)
344 mce_log_therm_throt_event(PACKAGE_POWER_LIMIT
345 | msr_val);
346 }
237} 347}
238 348
239static void unexpected_thermal_interrupt(void) 349static void unexpected_thermal_interrupt(void)
@@ -335,8 +445,26 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
335 apic_write(APIC_LVTTHMR, h); 445 apic_write(APIC_LVTTHMR, h);
336 446
337 rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); 447 rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
338 wrmsr(MSR_IA32_THERM_INTERRUPT, 448 if (cpu_has(c, X86_FEATURE_PLN))
339 l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h); 449 wrmsr(MSR_IA32_THERM_INTERRUPT,
450 l | (THERM_INT_LOW_ENABLE
451 | THERM_INT_HIGH_ENABLE | THERM_INT_PLN_ENABLE), h);
452 else
453 wrmsr(MSR_IA32_THERM_INTERRUPT,
454 l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h);
455
456 if (cpu_has(c, X86_FEATURE_PTS)) {
457 rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
458 if (cpu_has(c, X86_FEATURE_PLN))
459 wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
460 l | (PACKAGE_THERM_INT_LOW_ENABLE
461 | PACKAGE_THERM_INT_HIGH_ENABLE
462 | PACKAGE_THERM_INT_PLN_ENABLE), h);
463 else
464 wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
465 l | (PACKAGE_THERM_INT_LOW_ENABLE
466 | PACKAGE_THERM_INT_HIGH_ENABLE), h);
467 }
340 468
341 smp_thermal_vector = intel_thermal_interrupt; 469 smp_thermal_vector = intel_thermal_interrupt;
342 470
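Because the four event tags live in the two most significant bits of the value handed to mce_log_therm_throt_event(), a consumer can recover the event type with a simple shift. A hypothetical decoder (illustrative only, not part of the patch):

static const char *thermal_event_name(__u64 logged_status)
{
	switch (logged_status >> 62) {
	case 0: return "core throttled";
	case 1: return "core power limit";
	case 2: return "package throttled";
	case 3: return "package power limit";
	}
	return "unknown";	/* unreachable: two bits give exactly four cases */
}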
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
new file mode 100644
index 000000000000..d944bf6c50e9
--- /dev/null
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -0,0 +1,56 @@
1/*
2 * HyperV Detection code.
3 *
4 * Copyright (C) 2010, Novell, Inc.
5 * Author : K. Y. Srinivasan <ksrinivasan@novell.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; version 2 of the License.
10 *
11 */
12
13#include <linux/types.h>
14#include <linux/module.h>
15#include <asm/processor.h>
16#include <asm/hypervisor.h>
17#include <asm/hyperv.h>
18#include <asm/mshyperv.h>
19
20struct ms_hyperv_info ms_hyperv;
21EXPORT_SYMBOL_GPL(ms_hyperv);
22
23static bool __init ms_hyperv_platform(void)
24{
25 u32 eax;
26 u32 hyp_signature[3];
27
28 if (!boot_cpu_has(X86_FEATURE_HYPERVISOR))
29 return false;
30
31 cpuid(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS,
32 &eax, &hyp_signature[0], &hyp_signature[1], &hyp_signature[2]);
33
34 return eax >= HYPERV_CPUID_MIN &&
35 eax <= HYPERV_CPUID_MAX &&
36 !memcmp("Microsoft Hv", hyp_signature, 12);
37}
38
39static void __init ms_hyperv_init_platform(void)
40{
41 /*
42 * Extract the features and hints
43 */
44 ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES);
45 ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO);
46
47 printk(KERN_INFO "HyperV: features 0x%x, hints 0x%x\n",
48 ms_hyperv.features, ms_hyperv.hints);
49}
50
51const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
52 .name = "Microsoft HyperV",
53 .detect = ms_hyperv_platform,
54 .init_platform = ms_hyperv_init_platform,
55};
56EXPORT_SYMBOL(x86_hyper_ms_hyperv);
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index 06130b52f012..c5f59d071425 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -632,9 +632,9 @@ static void __init mtrr_print_out_one_result(int i)
632 unsigned long gran_base, chunk_base, lose_base; 632 unsigned long gran_base, chunk_base, lose_base;
633 char gran_factor, chunk_factor, lose_factor; 633 char gran_factor, chunk_factor, lose_factor;
634 634
635 gran_base = to_size_factor(result[i].gran_sizek, &gran_factor), 635 gran_base = to_size_factor(result[i].gran_sizek, &gran_factor);
636 chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor), 636 chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor);
637 lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor), 637 lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor);
638 638
639 pr_info("%sgran_size: %ld%c \tchunk_size: %ld%c \t", 639 pr_info("%sgran_size: %ld%c \tchunk_size: %ld%c \t",
640 result[i].bad ? "*BAD*" : " ", 640 result[i].bad ? "*BAD*" : " ",
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index fd31a441c61c..7d28d7d03885 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -433,13 +433,12 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
433{ 433{
434 unsigned int mask_lo, mask_hi, base_lo, base_hi; 434 unsigned int mask_lo, mask_hi, base_lo, base_hi;
435 unsigned int tmp, hi; 435 unsigned int tmp, hi;
436 int cpu;
437 436
438 /* 437 /*
439 * get_mtrr doesn't need to update mtrr_state, also it could be called 438 * get_mtrr doesn't need to update mtrr_state, also it could be called
440 * from any cpu, so try to print it out directly. 439 * from any cpu, so try to print it out directly.
441 */ 440 */
442 cpu = get_cpu(); 441 get_cpu();
443 442
444 rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi); 443 rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi);
445 444
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 79556bd9b602..01c0f3ee6cc3 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -35,6 +35,7 @@
35 35
36#include <linux/types.h> /* FIXME: kvm_para.h needs this */ 36#include <linux/types.h> /* FIXME: kvm_para.h needs this */
37 37
38#include <linux/stop_machine.h>
38#include <linux/kvm_para.h> 39#include <linux/kvm_para.h>
39#include <linux/uaccess.h> 40#include <linux/uaccess.h>
40#include <linux/module.h> 41#include <linux/module.h>
@@ -143,22 +144,28 @@ struct set_mtrr_data {
143 mtrr_type smp_type; 144 mtrr_type smp_type;
144}; 145};
145 146
147static DEFINE_PER_CPU(struct cpu_stop_work, mtrr_work);
148
146/** 149/**
147 * ipi_handler - Synchronisation handler. Executed by "other" CPUs. 150 * mtrr_work_handler - Synchronisation handler. Executed by "other" CPUs.
148 * @info: pointer to mtrr configuration data 151 * @info: pointer to mtrr configuration data
149 * 152 *
150 * Returns nothing. 153 * Returns nothing.
151 */ 154 */
152static void ipi_handler(void *info) 155static int mtrr_work_handler(void *info)
153{ 156{
154#ifdef CONFIG_SMP 157#ifdef CONFIG_SMP
155 struct set_mtrr_data *data = info; 158 struct set_mtrr_data *data = info;
156 unsigned long flags; 159 unsigned long flags;
157 160
161 atomic_dec(&data->count);
162 while (!atomic_read(&data->gate))
163 cpu_relax();
164
158 local_irq_save(flags); 165 local_irq_save(flags);
159 166
160 atomic_dec(&data->count); 167 atomic_dec(&data->count);
161 while (!atomic_read(&data->gate)) 168 while (atomic_read(&data->gate))
162 cpu_relax(); 169 cpu_relax();
163 170
164 /* The master has cleared me to execute */ 171 /* The master has cleared me to execute */
@@ -173,12 +180,13 @@ static void ipi_handler(void *info)
173 } 180 }
174 181
175 atomic_dec(&data->count); 182 atomic_dec(&data->count);
176 while (atomic_read(&data->gate)) 183 while (!atomic_read(&data->gate))
177 cpu_relax(); 184 cpu_relax();
178 185
179 atomic_dec(&data->count); 186 atomic_dec(&data->count);
180 local_irq_restore(flags); 187 local_irq_restore(flags);
181#endif 188#endif
189 return 0;
182} 190}
183 191
184static inline int types_compatible(mtrr_type type1, mtrr_type type2) 192static inline int types_compatible(mtrr_type type1, mtrr_type type2)
@@ -198,7 +206,7 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2)
198 * 206 *
199 * This is kinda tricky, but fortunately, Intel spelled it out for us cleanly: 207 * This is kinda tricky, but fortunately, Intel spelled it out for us cleanly:
200 * 208 *
201 * 1. Send IPI to do the following: 209 * 1. Queue work to do the following on all processors:
202 * 2. Disable Interrupts 210 * 2. Disable Interrupts
203 * 3. Wait for all procs to do so 211 * 3. Wait for all procs to do so
204 * 4. Enter no-fill cache mode 212 * 4. Enter no-fill cache mode
@@ -215,14 +223,17 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2)
215 * 15. Enable interrupts. 223 * 15. Enable interrupts.
216 * 224 *
217 * What does that mean for us? Well, first we set data.count to the number 225 * What does that mean for us? Well, first we set data.count to the number
218 * of CPUs. As each CPU disables interrupts, it'll decrement it once. We wait 226 * of CPUs. As each CPU announces that it started the rendezvous handler by
219 * until it hits 0 and proceed. We set the data.gate flag and reset data.count. 227 * decrementing the count, We reset data.count and set the data.gate flag
220 * Meanwhile, they are waiting for that flag to be set. Once it's set, each 228 * allowing all the cpu's to proceed with the work. As each cpu disables
229 * interrupts, it'll decrement data.count once. We wait until it hits 0 and
230 * proceed. We clear the data.gate flag and reset data.count. Meanwhile, they
231 * are waiting for that flag to be cleared. Once it's cleared, each
221 * CPU goes through the transition of updating MTRRs. 232 * CPU goes through the transition of updating MTRRs.
222 * The CPU vendors may each do it differently, 233 * The CPU vendors may each do it differently,
223 * so we call mtrr_if->set() callback and let them take care of it. 234 * so we call mtrr_if->set() callback and let them take care of it.
224 * When they're done, they again decrement data->count and wait for data.gate 235 * When they're done, they again decrement data->count and wait for data.gate
225 * to be reset. 236 * to be set.
226 * When we finish, we wait for data.count to hit 0 and toggle the data.gate flag 237 * When we finish, we wait for data.count to hit 0 and toggle the data.gate flag
227 * Everyone then enables interrupts and we all continue on. 238 * Everyone then enables interrupts and we all continue on.
228 * 239 *
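
The count/gate handshake described above is just an arrival counter plus a flag that the initiating CPU flips once everyone has checked in. A stand-alone model of one such cycle, written with plain C11 atomics and pthreads (illustrative only, deliberately not the kernel's atomic_t API):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define NCPUS 4

static atomic_int count;
static atomic_int gate;

static void *worker(void *arg)
{
        /* announce arrival, then spin until the master opens the gate */
        atomic_fetch_sub(&count, 1);
        while (!atomic_load(&gate))
                ;                       /* cpu_relax() in the kernel */

        /* ... the MTRR update would happen here ... */

        /* announce completion, then wait for the gate to close again */
        atomic_fetch_sub(&count, 1);
        while (atomic_load(&gate))
                ;
        return NULL;
}

int main(void)
{
        pthread_t tid[NCPUS - 1];
        int i;

        atomic_store(&count, NCPUS - 1);
        atomic_store(&gate, 0);
        for (i = 0; i < NCPUS - 1; i++)
                pthread_create(&tid[i], NULL, worker, NULL);

        /* wait for all arrivals, re-arm the counter, open the gate */
        while (atomic_load(&count))
                ;
        atomic_store(&count, NCPUS - 1);
        atomic_store(&gate, 1);

        /* wait for all completions, then close the gate */
        while (atomic_load(&count))
                ;
        atomic_store(&gate, 0);

        for (i = 0; i < NCPUS - 1; i++)
                pthread_join(tid[i], NULL);
        puts("rendezvous complete");
        return 0;
}
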
@@ -234,6 +245,9 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
234{ 245{
235 struct set_mtrr_data data; 246 struct set_mtrr_data data;
236 unsigned long flags; 247 unsigned long flags;
248 int cpu;
249
250 preempt_disable();
237 251
238 data.smp_reg = reg; 252 data.smp_reg = reg;
239 data.smp_base = base; 253 data.smp_base = base;
@@ -246,10 +260,15 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
246 atomic_set(&data.gate, 0); 260 atomic_set(&data.gate, 0);
247 261
248 /* Start the ball rolling on other CPUs */ 262 /* Start the ball rolling on other CPUs */
249 if (smp_call_function(ipi_handler, &data, 0) != 0) 263 for_each_online_cpu(cpu) {
250 panic("mtrr: timed out waiting for other CPUs\n"); 264 struct cpu_stop_work *work = &per_cpu(mtrr_work, cpu);
265
266 if (cpu == smp_processor_id())
267 continue;
268
269 stop_one_cpu_nowait(cpu, mtrr_work_handler, &data, work);
270 }
251 271
252 local_irq_save(flags);
253 272
254 while (atomic_read(&data.count)) 273 while (atomic_read(&data.count))
255 cpu_relax(); 274 cpu_relax();
@@ -259,6 +278,16 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
259 smp_wmb(); 278 smp_wmb();
260 atomic_set(&data.gate, 1); 279 atomic_set(&data.gate, 1);
261 280
281 local_irq_save(flags);
282
283 while (atomic_read(&data.count))
284 cpu_relax();
285
286 /* Ok, reset count and toggle gate */
287 atomic_set(&data.count, num_booting_cpus() - 1);
288 smp_wmb();
289 atomic_set(&data.gate, 0);
290
262 /* Do our MTRR business */ 291 /* Do our MTRR business */
263 292
264 /* 293 /*
@@ -279,7 +308,7 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
279 308
280 atomic_set(&data.count, num_booting_cpus() - 1); 309 atomic_set(&data.count, num_booting_cpus() - 1);
281 smp_wmb(); 310 smp_wmb();
282 atomic_set(&data.gate, 0); 311 atomic_set(&data.gate, 1);
283 312
284 /* 313 /*
285 * Wait here for everyone to have seen the gate change 314 * Wait here for everyone to have seen the gate change
@@ -289,6 +318,7 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
289 cpu_relax(); 318 cpu_relax();
290 319
291 local_irq_restore(flags); 320 local_irq_restore(flags);
321 preempt_enable();
292} 322}
293 323
294/** 324/**
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index db5bdc8addf8..f2da20fda02d 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -31,46 +31,51 @@
31#include <asm/nmi.h> 31#include <asm/nmi.h>
32#include <asm/compat.h> 32#include <asm/compat.h>
33 33
34static u64 perf_event_mask __read_mostly; 34#if 0
35#undef wrmsrl
36#define wrmsrl(msr, val) \
37do { \
38 trace_printk("wrmsrl(%lx, %lx)\n", (unsigned long)(msr),\
39 (unsigned long)(val)); \
40 native_write_msr((msr), (u32)((u64)(val)), \
41 (u32)((u64)(val) >> 32)); \
42} while (0)
43#endif
35 44
36/* The maximal number of PEBS events: */ 45/*
37#define MAX_PEBS_EVENTS 4 46 * best effort, GUP based copy_from_user() that assumes IRQ or NMI context
47 */
48static unsigned long
49copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
50{
51 unsigned long offset, addr = (unsigned long)from;
52 int type = in_nmi() ? KM_NMI : KM_IRQ0;
53 unsigned long size, len = 0;
54 struct page *page;
55 void *map;
56 int ret;
38 57
39/* The size of a BTS record in bytes: */ 58 do {
40#define BTS_RECORD_SIZE 24 59 ret = __get_user_pages_fast(addr, 1, 0, &page);
60 if (!ret)
61 break;
41 62
42/* The size of a per-cpu BTS buffer in bytes: */ 63 offset = addr & (PAGE_SIZE - 1);
43#define BTS_BUFFER_SIZE (BTS_RECORD_SIZE * 2048) 64 size = min(PAGE_SIZE - offset, n - len);
44 65
45/* The BTS overflow threshold in bytes from the end of the buffer: */ 66 map = kmap_atomic(page, type);
46#define BTS_OVFL_TH (BTS_RECORD_SIZE * 128) 67 memcpy(to, map+offset, size);
68 kunmap_atomic(map, type);
69 put_page(page);
47 70
71 len += size;
72 to += size;
73 addr += size;
48 74
49/* 75 } while (len < n);
50 * Bits in the debugctlmsr controlling branch tracing.
51 */
52#define X86_DEBUGCTL_TR (1 << 6)
53#define X86_DEBUGCTL_BTS (1 << 7)
54#define X86_DEBUGCTL_BTINT (1 << 8)
55#define X86_DEBUGCTL_BTS_OFF_OS (1 << 9)
56#define X86_DEBUGCTL_BTS_OFF_USR (1 << 10)
57 76
58/* 77 return len;
59 * A debug store configuration. 78}
60 *
61 * We only support architectures that use 64bit fields.
62 */
63struct debug_store {
64 u64 bts_buffer_base;
65 u64 bts_index;
66 u64 bts_absolute_maximum;
67 u64 bts_interrupt_threshold;
68 u64 pebs_buffer_base;
69 u64 pebs_index;
70 u64 pebs_absolute_maximum;
71 u64 pebs_interrupt_threshold;
72 u64 pebs_event_reset[MAX_PEBS_EVENTS];
73};
74 79
75struct event_constraint { 80struct event_constraint {
76 union { 81 union {
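
perf_callchain_user() is the intended consumer of copy_from_user_nmi() above: it follows the user frame-pointer chain, copying one saved frame at a time and bailing out as soon as a page is not resident. A rough sketch of that loop (frame layout simplified, validity checks omitted):

struct stack_frame {
        const void __user       *next_fp;              /* saved frame pointer */
        unsigned long           return_address;
};

const void __user *fp = (const void __user *)regs->bp;

while (entry->nr < PERF_MAX_STACK_DEPTH) {
        struct stack_frame frame;

        if (copy_from_user_nmi(&frame, fp, sizeof(frame)) != sizeof(frame))
                break;                  /* page not present: stop the walk */

        callchain_store(entry, frame.return_address);
        fp = frame.next_fp;             /* follow the chain */
}
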
@@ -89,18 +94,42 @@ struct amd_nb {
89 struct event_constraint event_constraints[X86_PMC_IDX_MAX]; 94 struct event_constraint event_constraints[X86_PMC_IDX_MAX];
90}; 95};
91 96
97#define MAX_LBR_ENTRIES 16
98
92struct cpu_hw_events { 99struct cpu_hw_events {
100 /*
101 * Generic x86 PMC bits
102 */
93 struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */ 103 struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */
94 unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; 104 unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
95 unsigned long interrupts;
96 int enabled; 105 int enabled;
97 struct debug_store *ds;
98 106
99 int n_events; 107 int n_events;
100 int n_added; 108 int n_added;
109 int n_txn;
101 int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */ 110 int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
102 u64 tags[X86_PMC_IDX_MAX]; 111 u64 tags[X86_PMC_IDX_MAX];
103 struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */ 112 struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */
113
114 unsigned int group_flag;
115
116 /*
117 * Intel DebugStore bits
118 */
119 struct debug_store *ds;
120 u64 pebs_enabled;
121
122 /*
123 * Intel LBR bits
124 */
125 int lbr_users;
126 void *lbr_context;
127 struct perf_branch_stack lbr_stack;
128 struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES];
129
130 /*
131 * AMD specific bits
132 */
104 struct amd_nb *amd_nb; 133 struct amd_nb *amd_nb;
105}; 134};
106 135
@@ -114,44 +143,75 @@ struct cpu_hw_events {
114#define EVENT_CONSTRAINT(c, n, m) \ 143#define EVENT_CONSTRAINT(c, n, m) \
115 __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n)) 144 __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n))
116 145
146/*
147 * Constraint on the Event code.
148 */
117#define INTEL_EVENT_CONSTRAINT(c, n) \ 149#define INTEL_EVENT_CONSTRAINT(c, n) \
118 EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVTSEL_MASK) 150 EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT)
119 151
152/*
153 * Constraint on the Event code + UMask + fixed-mask
154 *
155 * filter mask to validate fixed counter events.
156 * the following filters disqualify for fixed counters:
157 * - inv
158 * - edge
159 * - cnt-mask
160 * The other filters are supported by fixed counters.
161 * The any-thread option is supported starting with v3.
162 */
120#define FIXED_EVENT_CONSTRAINT(c, n) \ 163#define FIXED_EVENT_CONSTRAINT(c, n) \
121 EVENT_CONSTRAINT(c, (1ULL << (32+n)), INTEL_ARCH_FIXED_MASK) 164 EVENT_CONSTRAINT(c, (1ULL << (32+n)), X86_RAW_EVENT_MASK)
165
166/*
167 * Constraint on the Event code + UMask
168 */
169#define PEBS_EVENT_CONSTRAINT(c, n) \
170 EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK)
122 171
123#define EVENT_CONSTRAINT_END \ 172#define EVENT_CONSTRAINT_END \
124 EVENT_CONSTRAINT(0, 0, 0) 173 EVENT_CONSTRAINT(0, 0, 0)
125 174
126#define for_each_event_constraint(e, c) \ 175#define for_each_event_constraint(e, c) \
127 for ((e) = (c); (e)->cmask; (e)++) 176 for ((e) = (c); (e)->weight; (e)++)
177
178union perf_capabilities {
179 struct {
180 u64 lbr_format : 6;
181 u64 pebs_trap : 1;
182 u64 pebs_arch_reg : 1;
183 u64 pebs_format : 4;
184 u64 smm_freeze : 1;
185 };
186 u64 capabilities;
187};
128 188
129/* 189/*
130 * struct x86_pmu - generic x86 pmu 190 * struct x86_pmu - generic x86 pmu
131 */ 191 */
132struct x86_pmu { 192struct x86_pmu {
193 /*
194 * Generic x86 PMC bits
195 */
133 const char *name; 196 const char *name;
134 int version; 197 int version;
135 int (*handle_irq)(struct pt_regs *); 198 int (*handle_irq)(struct pt_regs *);
136 void (*disable_all)(void); 199 void (*disable_all)(void);
137 void (*enable_all)(void); 200 void (*enable_all)(int added);
138 void (*enable)(struct perf_event *); 201 void (*enable)(struct perf_event *);
139 void (*disable)(struct perf_event *); 202 void (*disable)(struct perf_event *);
203 int (*hw_config)(struct perf_event *event);
204 int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign);
140 unsigned eventsel; 205 unsigned eventsel;
141 unsigned perfctr; 206 unsigned perfctr;
142 u64 (*event_map)(int); 207 u64 (*event_map)(int);
143 u64 (*raw_event)(u64);
144 int max_events; 208 int max_events;
145 int num_events; 209 int num_counters;
146 int num_events_fixed; 210 int num_counters_fixed;
147 int event_bits; 211 int cntval_bits;
148 u64 event_mask; 212 u64 cntval_mask;
149 int apic; 213 int apic;
150 u64 max_period; 214 u64 max_period;
151 u64 intel_ctrl;
152 void (*enable_bts)(u64 config);
153 void (*disable_bts)(void);
154
155 struct event_constraint * 215 struct event_constraint *
156 (*get_event_constraints)(struct cpu_hw_events *cpuc, 216 (*get_event_constraints)(struct cpu_hw_events *cpuc,
157 struct perf_event *event); 217 struct perf_event *event);
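
The constraint macros above are consumed by the model-specific files included at the bottom of perf_event.c, which build per-CPU-model tables from them. An illustrative (not model-accurate) table might look like this, with fixed-counter events pinned to fixed counters 0/1 and one event restricted to generic counter 2:

static struct event_constraint example_event_constraints[] =
{
        FIXED_EVENT_CONSTRAINT(0x00c0, 0),      /* INST_RETIRED.ANY -> fixed counter 0 */
        FIXED_EVENT_CONSTRAINT(0x003c, 1),      /* CPU_CLK_UNHALTED.CORE -> fixed counter 1 */
        INTEL_EVENT_CONSTRAINT(0x48, 0x4),      /* hypothetical event 0x48 -> counter 2 only */
        EVENT_CONSTRAINT_END
};
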
@@ -159,11 +219,33 @@ struct x86_pmu {
159 void (*put_event_constraints)(struct cpu_hw_events *cpuc, 219 void (*put_event_constraints)(struct cpu_hw_events *cpuc,
160 struct perf_event *event); 220 struct perf_event *event);
161 struct event_constraint *event_constraints; 221 struct event_constraint *event_constraints;
222 void (*quirks)(void);
223 int perfctr_second_write;
162 224
163 int (*cpu_prepare)(int cpu); 225 int (*cpu_prepare)(int cpu);
164 void (*cpu_starting)(int cpu); 226 void (*cpu_starting)(int cpu);
165 void (*cpu_dying)(int cpu); 227 void (*cpu_dying)(int cpu);
166 void (*cpu_dead)(int cpu); 228 void (*cpu_dead)(int cpu);
229
230 /*
231 * Intel Arch Perfmon v2+
232 */
233 u64 intel_ctrl;
234 union perf_capabilities intel_cap;
235
236 /*
237 * Intel DebugStore bits
238 */
239 int bts, pebs;
240 int pebs_record_size;
241 void (*drain_pebs)(struct pt_regs *regs);
242 struct event_constraint *pebs_constraints;
243
244 /*
245 * Intel LBR
246 */
247 unsigned long lbr_tos, lbr_from, lbr_to; /* MSR base regs */
248 int lbr_nr; /* hardware stack size */
167}; 249};
168 250
169static struct x86_pmu x86_pmu __read_mostly; 251static struct x86_pmu x86_pmu __read_mostly;
@@ -198,7 +280,7 @@ static u64
198x86_perf_event_update(struct perf_event *event) 280x86_perf_event_update(struct perf_event *event)
199{ 281{
200 struct hw_perf_event *hwc = &event->hw; 282 struct hw_perf_event *hwc = &event->hw;
201 int shift = 64 - x86_pmu.event_bits; 283 int shift = 64 - x86_pmu.cntval_bits;
202 u64 prev_raw_count, new_raw_count; 284 u64 prev_raw_count, new_raw_count;
203 int idx = hwc->idx; 285 int idx = hwc->idx;
204 s64 delta; 286 s64 delta;
@@ -214,10 +296,10 @@ x86_perf_event_update(struct perf_event *event)
214 * count to the generic event atomically: 296 * count to the generic event atomically:
215 */ 297 */
216again: 298again:
217 prev_raw_count = atomic64_read(&hwc->prev_count); 299 prev_raw_count = local64_read(&hwc->prev_count);
218 rdmsrl(hwc->event_base + idx, new_raw_count); 300 rdmsrl(hwc->event_base + idx, new_raw_count);
219 301
220 if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count, 302 if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
221 new_raw_count) != prev_raw_count) 303 new_raw_count) != prev_raw_count)
222 goto again; 304 goto again;
223 305
@@ -232,8 +314,8 @@ again:
232 delta = (new_raw_count << shift) - (prev_raw_count << shift); 314 delta = (new_raw_count << shift) - (prev_raw_count << shift);
233 delta >>= shift; 315 delta >>= shift;
234 316
235 atomic64_add(delta, &event->count); 317 local64_add(delta, &event->count);
236 atomic64_sub(delta, &hwc->period_left); 318 local64_sub(delta, &hwc->period_left);
237 319
238 return new_raw_count; 320 return new_raw_count;
239} 321}
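
The shift pair in x86_perf_event_update() is what confines the delta to the cntval_bits-wide hardware counter while still handling wrap-around: shifting both readings up by (64 - cntval_bits), subtracting, and shifting back down discards the unimplemented high bits. A tiny stand-alone illustration, assuming 48-bit counters (values hypothetical):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        int shift = 64 - 48;                        /* cntval_bits == 48 */
        uint64_t prev_raw = 0xffffffffffffULL;      /* counter just before wrapping */
        uint64_t new_raw  = 0x000000000005ULL;      /* value read after the wrap */

        int64_t delta = (int64_t)((new_raw << shift) - (prev_raw << shift));
        delta >>= shift;

        printf("delta = %lld\n", (long long)delta); /* prints 6 despite the wrap */
        return 0;
}
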
@@ -241,33 +323,32 @@ again:
241static atomic_t active_events; 323static atomic_t active_events;
242static DEFINE_MUTEX(pmc_reserve_mutex); 324static DEFINE_MUTEX(pmc_reserve_mutex);
243 325
326#ifdef CONFIG_X86_LOCAL_APIC
327
244static bool reserve_pmc_hardware(void) 328static bool reserve_pmc_hardware(void)
245{ 329{
246#ifdef CONFIG_X86_LOCAL_APIC
247 int i; 330 int i;
248 331
249 if (nmi_watchdog == NMI_LOCAL_APIC) 332 if (nmi_watchdog == NMI_LOCAL_APIC)
250 disable_lapic_nmi_watchdog(); 333 disable_lapic_nmi_watchdog();
251 334
252 for (i = 0; i < x86_pmu.num_events; i++) { 335 for (i = 0; i < x86_pmu.num_counters; i++) {
253 if (!reserve_perfctr_nmi(x86_pmu.perfctr + i)) 336 if (!reserve_perfctr_nmi(x86_pmu.perfctr + i))
254 goto perfctr_fail; 337 goto perfctr_fail;
255 } 338 }
256 339
257 for (i = 0; i < x86_pmu.num_events; i++) { 340 for (i = 0; i < x86_pmu.num_counters; i++) {
258 if (!reserve_evntsel_nmi(x86_pmu.eventsel + i)) 341 if (!reserve_evntsel_nmi(x86_pmu.eventsel + i))
259 goto eventsel_fail; 342 goto eventsel_fail;
260 } 343 }
261#endif
262 344
263 return true; 345 return true;
264 346
265#ifdef CONFIG_X86_LOCAL_APIC
266eventsel_fail: 347eventsel_fail:
267 for (i--; i >= 0; i--) 348 for (i--; i >= 0; i--)
268 release_evntsel_nmi(x86_pmu.eventsel + i); 349 release_evntsel_nmi(x86_pmu.eventsel + i);
269 350
270 i = x86_pmu.num_events; 351 i = x86_pmu.num_counters;
271 352
272perfctr_fail: 353perfctr_fail:
273 for (i--; i >= 0; i--) 354 for (i--; i >= 0; i--)
@@ -277,128 +358,36 @@ perfctr_fail:
277 enable_lapic_nmi_watchdog(); 358 enable_lapic_nmi_watchdog();
278 359
279 return false; 360 return false;
280#endif
281} 361}
282 362
283static void release_pmc_hardware(void) 363static void release_pmc_hardware(void)
284{ 364{
285#ifdef CONFIG_X86_LOCAL_APIC
286 int i; 365 int i;
287 366
288 for (i = 0; i < x86_pmu.num_events; i++) { 367 for (i = 0; i < x86_pmu.num_counters; i++) {
289 release_perfctr_nmi(x86_pmu.perfctr + i); 368 release_perfctr_nmi(x86_pmu.perfctr + i);
290 release_evntsel_nmi(x86_pmu.eventsel + i); 369 release_evntsel_nmi(x86_pmu.eventsel + i);
291 } 370 }
292 371
293 if (nmi_watchdog == NMI_LOCAL_APIC) 372 if (nmi_watchdog == NMI_LOCAL_APIC)
294 enable_lapic_nmi_watchdog(); 373 enable_lapic_nmi_watchdog();
295#endif
296}
297
298static inline bool bts_available(void)
299{
300 return x86_pmu.enable_bts != NULL;
301}
302
303static void init_debug_store_on_cpu(int cpu)
304{
305 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
306
307 if (!ds)
308 return;
309
310 wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA,
311 (u32)((u64)(unsigned long)ds),
312 (u32)((u64)(unsigned long)ds >> 32));
313}
314
315static void fini_debug_store_on_cpu(int cpu)
316{
317 if (!per_cpu(cpu_hw_events, cpu).ds)
318 return;
319
320 wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
321}
322
323static void release_bts_hardware(void)
324{
325 int cpu;
326
327 if (!bts_available())
328 return;
329
330 get_online_cpus();
331
332 for_each_online_cpu(cpu)
333 fini_debug_store_on_cpu(cpu);
334
335 for_each_possible_cpu(cpu) {
336 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
337
338 if (!ds)
339 continue;
340
341 per_cpu(cpu_hw_events, cpu).ds = NULL;
342
343 kfree((void *)(unsigned long)ds->bts_buffer_base);
344 kfree(ds);
345 }
346
347 put_online_cpus();
348} 374}
349 375
350static int reserve_bts_hardware(void) 376#else
351{
352 int cpu, err = 0;
353
354 if (!bts_available())
355 return 0;
356
357 get_online_cpus();
358
359 for_each_possible_cpu(cpu) {
360 struct debug_store *ds;
361 void *buffer;
362
363 err = -ENOMEM;
364 buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL);
365 if (unlikely(!buffer))
366 break;
367
368 ds = kzalloc(sizeof(*ds), GFP_KERNEL);
369 if (unlikely(!ds)) {
370 kfree(buffer);
371 break;
372 }
373
374 ds->bts_buffer_base = (u64)(unsigned long)buffer;
375 ds->bts_index = ds->bts_buffer_base;
376 ds->bts_absolute_maximum =
377 ds->bts_buffer_base + BTS_BUFFER_SIZE;
378 ds->bts_interrupt_threshold =
379 ds->bts_absolute_maximum - BTS_OVFL_TH;
380 377
381 per_cpu(cpu_hw_events, cpu).ds = ds; 378static bool reserve_pmc_hardware(void) { return true; }
382 err = 0; 379static void release_pmc_hardware(void) {}
383 }
384 380
385 if (err) 381#endif
386 release_bts_hardware();
387 else {
388 for_each_online_cpu(cpu)
389 init_debug_store_on_cpu(cpu);
390 }
391
392 put_online_cpus();
393 382
394 return err; 383static int reserve_ds_buffers(void);
395} 384static void release_ds_buffers(void);
396 385
397static void hw_perf_event_destroy(struct perf_event *event) 386static void hw_perf_event_destroy(struct perf_event *event)
398{ 387{
399 if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) { 388 if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) {
400 release_pmc_hardware(); 389 release_pmc_hardware();
401 release_bts_hardware(); 390 release_ds_buffers();
402 mutex_unlock(&pmc_reserve_mutex); 391 mutex_unlock(&pmc_reserve_mutex);
403 } 392 }
404} 393}
@@ -441,59 +430,16 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr)
441 return 0; 430 return 0;
442} 431}
443 432
444/* 433static int x86_setup_perfctr(struct perf_event *event)
445 * Setup the hardware configuration for a given attr_type
446 */
447static int __hw_perf_event_init(struct perf_event *event)
448{ 434{
449 struct perf_event_attr *attr = &event->attr; 435 struct perf_event_attr *attr = &event->attr;
450 struct hw_perf_event *hwc = &event->hw; 436 struct hw_perf_event *hwc = &event->hw;
451 u64 config; 437 u64 config;
452 int err;
453
454 if (!x86_pmu_initialized())
455 return -ENODEV;
456
457 err = 0;
458 if (!atomic_inc_not_zero(&active_events)) {
459 mutex_lock(&pmc_reserve_mutex);
460 if (atomic_read(&active_events) == 0) {
461 if (!reserve_pmc_hardware())
462 err = -EBUSY;
463 else
464 err = reserve_bts_hardware();
465 }
466 if (!err)
467 atomic_inc(&active_events);
468 mutex_unlock(&pmc_reserve_mutex);
469 }
470 if (err)
471 return err;
472
473 event->destroy = hw_perf_event_destroy;
474
475 /*
476 * Generate PMC IRQs:
477 * (keep 'enabled' bit clear for now)
478 */
479 hwc->config = ARCH_PERFMON_EVENTSEL_INT;
480
481 hwc->idx = -1;
482 hwc->last_cpu = -1;
483 hwc->last_tag = ~0ULL;
484
485 /*
486 * Count user and OS events unless requested not to.
487 */
488 if (!attr->exclude_user)
489 hwc->config |= ARCH_PERFMON_EVENTSEL_USR;
490 if (!attr->exclude_kernel)
491 hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
492 438
493 if (!hwc->sample_period) { 439 if (!hwc->sample_period) {
494 hwc->sample_period = x86_pmu.max_period; 440 hwc->sample_period = x86_pmu.max_period;
495 hwc->last_period = hwc->sample_period; 441 hwc->last_period = hwc->sample_period;
496 atomic64_set(&hwc->period_left, hwc->sample_period); 442 local64_set(&hwc->period_left, hwc->sample_period);
497 } else { 443 } else {
498 /* 444 /*
499 * If we have a PMU initialized but no APIC 445 * If we have a PMU initialized but no APIC
@@ -505,16 +451,8 @@ static int __hw_perf_event_init(struct perf_event *event)
505 return -EOPNOTSUPP; 451 return -EOPNOTSUPP;
506 } 452 }
507 453
508 /* 454 if (attr->type == PERF_TYPE_RAW)
509 * Raw hw_event type provide the config in the hw_event structure
510 */
511 if (attr->type == PERF_TYPE_RAW) {
512 hwc->config |= x86_pmu.raw_event(attr->config);
513 if ((hwc->config & ARCH_PERFMON_EVENTSEL_ANY) &&
514 perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
515 return -EACCES;
516 return 0; 455 return 0;
517 }
518 456
519 if (attr->type == PERF_TYPE_HW_CACHE) 457 if (attr->type == PERF_TYPE_HW_CACHE)
520 return set_ext_hw_attr(hwc, attr); 458 return set_ext_hw_attr(hwc, attr);
@@ -539,11 +477,11 @@ static int __hw_perf_event_init(struct perf_event *event)
539 if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) && 477 if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) &&
540 (hwc->sample_period == 1)) { 478 (hwc->sample_period == 1)) {
541 /* BTS is not supported by this architecture. */ 479 /* BTS is not supported by this architecture. */
542 if (!bts_available()) 480 if (!x86_pmu.bts)
543 return -EOPNOTSUPP; 481 return -EOPNOTSUPP;
544 482
545 /* BTS is currently only allowed for user-mode. */ 483 /* BTS is currently only allowed for user-mode. */
546 if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) 484 if (!attr->exclude_kernel)
547 return -EOPNOTSUPP; 485 return -EOPNOTSUPP;
548 } 486 }
549 487
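
The branch-tracing special case above is selected purely by the event attributes: a hardware branch-instructions event with a sample period of 1 and kernel counting excluded. From user space such a request would look roughly like this fragment (opened with perf_event_open(2); remaining fields left at their defaults):

#include <linux/perf_event.h>

struct perf_event_attr attr = {
        .size           = sizeof(attr),
        .type           = PERF_TYPE_HARDWARE,
        .config         = PERF_COUNT_HW_BRANCH_INSTRUCTIONS,
        .sample_period  = 1,    /* one sample per branch: routed to BTS */
        .exclude_kernel = 1,    /* required here, BTS is user-mode only */
};
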
@@ -552,12 +490,87 @@ static int __hw_perf_event_init(struct perf_event *event)
552 return 0; 490 return 0;
553} 491}
554 492
493static int x86_pmu_hw_config(struct perf_event *event)
494{
495 if (event->attr.precise_ip) {
496 int precise = 0;
497
498 /* Support for constant skid */
499 if (x86_pmu.pebs)
500 precise++;
501
502 /* Support for IP fixup */
503 if (x86_pmu.lbr_nr)
504 precise++;
505
506 if (event->attr.precise_ip > precise)
507 return -EOPNOTSUPP;
508 }
509
510 /*
511 * Generate PMC IRQs:
512 * (keep 'enabled' bit clear for now)
513 */
514 event->hw.config = ARCH_PERFMON_EVENTSEL_INT;
515
516 /*
517 * Count user and OS events unless requested not to
518 */
519 if (!event->attr.exclude_user)
520 event->hw.config |= ARCH_PERFMON_EVENTSEL_USR;
521 if (!event->attr.exclude_kernel)
522 event->hw.config |= ARCH_PERFMON_EVENTSEL_OS;
523
524 if (event->attr.type == PERF_TYPE_RAW)
525 event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;
526
527 return x86_setup_perfctr(event);
528}
529
530/*
531 * Setup the hardware configuration for a given attr_type
532 */
533static int __hw_perf_event_init(struct perf_event *event)
534{
535 int err;
536
537 if (!x86_pmu_initialized())
538 return -ENODEV;
539
540 err = 0;
541 if (!atomic_inc_not_zero(&active_events)) {
542 mutex_lock(&pmc_reserve_mutex);
543 if (atomic_read(&active_events) == 0) {
544 if (!reserve_pmc_hardware())
545 err = -EBUSY;
546 else {
547 err = reserve_ds_buffers();
548 if (err)
549 release_pmc_hardware();
550 }
551 }
552 if (!err)
553 atomic_inc(&active_events);
554 mutex_unlock(&pmc_reserve_mutex);
555 }
556 if (err)
557 return err;
558
559 event->destroy = hw_perf_event_destroy;
560
561 event->hw.idx = -1;
562 event->hw.last_cpu = -1;
563 event->hw.last_tag = ~0ULL;
564
565 return x86_pmu.hw_config(event);
566}
567
555static void x86_pmu_disable_all(void) 568static void x86_pmu_disable_all(void)
556{ 569{
557 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 570 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
558 int idx; 571 int idx;
559 572
560 for (idx = 0; idx < x86_pmu.num_events; idx++) { 573 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
561 u64 val; 574 u64 val;
562 575
563 if (!test_bit(idx, cpuc->active_mask)) 576 if (!test_bit(idx, cpuc->active_mask))
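
The precise_ip check in x86_pmu_hw_config() above gates the new "precise event" levels: level 1 only needs PEBS (constant skid), level 2 additionally needs the LBR stack so the exact IP can be reconstructed. From user space this is requested through perf_event_attr.precise_ip; a sketch of such a request (fragment only, remaining fields defaulted):

#include <linux/perf_event.h>

struct perf_event_attr attr = {
        .size       = sizeof(attr),
        .type       = PERF_TYPE_HARDWARE,
        .config     = PERF_COUNT_HW_CPU_CYCLES,
        .precise_ip = 2,        /* needs PEBS + LBR; the open fails with EOPNOTSUPP otherwise */
};
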
@@ -587,12 +600,12 @@ void hw_perf_disable(void)
587 x86_pmu.disable_all(); 600 x86_pmu.disable_all();
588} 601}
589 602
590static void x86_pmu_enable_all(void) 603static void x86_pmu_enable_all(int added)
591{ 604{
592 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 605 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
593 int idx; 606 int idx;
594 607
595 for (idx = 0; idx < x86_pmu.num_events; idx++) { 608 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
596 struct perf_event *event = cpuc->events[idx]; 609 struct perf_event *event = cpuc->events[idx];
597 u64 val; 610 u64 val;
598 611
@@ -667,14 +680,14 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
667 * assign events to counters starting with most 680 * assign events to counters starting with most
668 * constrained events. 681 * constrained events.
669 */ 682 */
670 wmax = x86_pmu.num_events; 683 wmax = x86_pmu.num_counters;
671 684
672 /* 685 /*
673 * when fixed event counters are present, 686 * when fixed event counters are present,
674 * wmax is incremented by 1 to account 687 * wmax is incremented by 1 to account
675 * for one more choice 688 * for one more choice
676 */ 689 */
677 if (x86_pmu.num_events_fixed) 690 if (x86_pmu.num_counters_fixed)
678 wmax++; 691 wmax++;
679 692
680 for (w = 1, num = n; num && w <= wmax; w++) { 693 for (w = 1, num = n; num && w <= wmax; w++) {
@@ -724,7 +737,7 @@ static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader,
724 struct perf_event *event; 737 struct perf_event *event;
725 int n, max_count; 738 int n, max_count;
726 739
727 max_count = x86_pmu.num_events + x86_pmu.num_events_fixed; 740 max_count = x86_pmu.num_counters + x86_pmu.num_counters_fixed;
728 741
729 /* current number of events already accepted */ 742 /* current number of events already accepted */
730 n = cpuc->n_events; 743 n = cpuc->n_events;
@@ -795,7 +808,7 @@ void hw_perf_enable(void)
795 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 808 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
796 struct perf_event *event; 809 struct perf_event *event;
797 struct hw_perf_event *hwc; 810 struct hw_perf_event *hwc;
798 int i; 811 int i, added = cpuc->n_added;
799 812
800 if (!x86_pmu_initialized()) 813 if (!x86_pmu_initialized())
801 return; 814 return;
@@ -847,19 +860,20 @@ void hw_perf_enable(void)
847 cpuc->enabled = 1; 860 cpuc->enabled = 1;
848 barrier(); 861 barrier();
849 862
850 x86_pmu.enable_all(); 863 x86_pmu.enable_all(added);
851} 864}
852 865
853static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc) 866static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
867 u64 enable_mask)
854{ 868{
855 (void)checking_wrmsrl(hwc->config_base + hwc->idx, 869 wrmsrl(hwc->config_base + hwc->idx, hwc->config | enable_mask);
856 hwc->config | ARCH_PERFMON_EVENTSEL_ENABLE);
857} 870}
858 871
859static inline void x86_pmu_disable_event(struct perf_event *event) 872static inline void x86_pmu_disable_event(struct perf_event *event)
860{ 873{
861 struct hw_perf_event *hwc = &event->hw; 874 struct hw_perf_event *hwc = &event->hw;
862 (void)checking_wrmsrl(hwc->config_base + hwc->idx, hwc->config); 875
876 wrmsrl(hwc->config_base + hwc->idx, hwc->config);
863} 877}
864 878
865static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); 879static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
@@ -872,9 +886,9 @@ static int
872x86_perf_event_set_period(struct perf_event *event) 886x86_perf_event_set_period(struct perf_event *event)
873{ 887{
874 struct hw_perf_event *hwc = &event->hw; 888 struct hw_perf_event *hwc = &event->hw;
875 s64 left = atomic64_read(&hwc->period_left); 889 s64 left = local64_read(&hwc->period_left);
876 s64 period = hwc->sample_period; 890 s64 period = hwc->sample_period;
877 int err, ret = 0, idx = hwc->idx; 891 int ret = 0, idx = hwc->idx;
878 892
879 if (idx == X86_PMC_IDX_FIXED_BTS) 893 if (idx == X86_PMC_IDX_FIXED_BTS)
880 return 0; 894 return 0;
@@ -884,14 +898,14 @@ x86_perf_event_set_period(struct perf_event *event)
884 */ 898 */
885 if (unlikely(left <= -period)) { 899 if (unlikely(left <= -period)) {
886 left = period; 900 left = period;
887 atomic64_set(&hwc->period_left, left); 901 local64_set(&hwc->period_left, left);
888 hwc->last_period = period; 902 hwc->last_period = period;
889 ret = 1; 903 ret = 1;
890 } 904 }
891 905
892 if (unlikely(left <= 0)) { 906 if (unlikely(left <= 0)) {
893 left += period; 907 left += period;
894 atomic64_set(&hwc->period_left, left); 908 local64_set(&hwc->period_left, left);
895 hwc->last_period = period; 909 hwc->last_period = period;
896 ret = 1; 910 ret = 1;
897 } 911 }
@@ -910,10 +924,19 @@ x86_perf_event_set_period(struct perf_event *event)
910 * The hw event starts counting from this event offset, 924 * The hw event starts counting from this event offset,
 911 * mark it to be able to extract future deltas: 925 * mark it to be able to extract future deltas:
912 */ 926 */
913 atomic64_set(&hwc->prev_count, (u64)-left); 927 local64_set(&hwc->prev_count, (u64)-left);
928
929 wrmsrl(hwc->event_base + idx, (u64)(-left) & x86_pmu.cntval_mask);
914 930
915 err = checking_wrmsrl(hwc->event_base + idx, 931 /*
 916 (u64)(-left) & x86_pmu.event_mask); 932 * Due to an erratum on certain CPUs we need
933 * a second write to be sure the register
934 * is updated properly
935 */
936 if (x86_pmu.perfctr_second_write) {
937 wrmsrl(hwc->event_base + idx,
938 (u64)(-left) & x86_pmu.cntval_mask);
939 }
917 940
918 perf_event_update_userpage(event); 941 perf_event_update_userpage(event);
919 942
@@ -924,7 +947,8 @@ static void x86_pmu_enable_event(struct perf_event *event)
924{ 947{
925 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 948 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
926 if (cpuc->enabled) 949 if (cpuc->enabled)
927 __x86_pmu_enable_event(&event->hw); 950 __x86_pmu_enable_event(&event->hw,
951 ARCH_PERFMON_EVENTSEL_ENABLE);
928} 952}
929 953
930/* 954/*
@@ -950,7 +974,15 @@ static int x86_pmu_enable(struct perf_event *event)
950 if (n < 0) 974 if (n < 0)
951 return n; 975 return n;
952 976
953 ret = x86_schedule_events(cpuc, n, assign); 977 /*
978 * If group events scheduling transaction was started,
 979 * skip the schedulability test here; it will be performed
 980 * at commit time (->commit_txn) as a whole
981 */
982 if (cpuc->group_flag & PERF_EVENT_TXN)
983 goto out;
984
985 ret = x86_pmu.schedule_events(cpuc, n, assign);
954 if (ret) 986 if (ret)
955 return ret; 987 return ret;
956 /* 988 /*
@@ -959,8 +991,10 @@ static int x86_pmu_enable(struct perf_event *event)
959 */ 991 */
960 memcpy(cpuc->assign, assign, n*sizeof(int)); 992 memcpy(cpuc->assign, assign, n*sizeof(int));
961 993
994out:
962 cpuc->n_events = n; 995 cpuc->n_events = n;
963 cpuc->n_added += n - n0; 996 cpuc->n_added += n - n0;
997 cpuc->n_txn += n - n0;
964 998
965 return 0; 999 return 0;
966} 1000}
@@ -991,11 +1025,12 @@ static void x86_pmu_unthrottle(struct perf_event *event)
991void perf_event_print_debug(void) 1025void perf_event_print_debug(void)
992{ 1026{
993 u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; 1027 u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
1028 u64 pebs;
994 struct cpu_hw_events *cpuc; 1029 struct cpu_hw_events *cpuc;
995 unsigned long flags; 1030 unsigned long flags;
996 int cpu, idx; 1031 int cpu, idx;
997 1032
998 if (!x86_pmu.num_events) 1033 if (!x86_pmu.num_counters)
999 return; 1034 return;
1000 1035
1001 local_irq_save(flags); 1036 local_irq_save(flags);
@@ -1008,16 +1043,18 @@ void perf_event_print_debug(void)
1008 rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); 1043 rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
1009 rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); 1044 rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
1010 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed); 1045 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
1046 rdmsrl(MSR_IA32_PEBS_ENABLE, pebs);
1011 1047
1012 pr_info("\n"); 1048 pr_info("\n");
1013 pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl); 1049 pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl);
1014 pr_info("CPU#%d: status: %016llx\n", cpu, status); 1050 pr_info("CPU#%d: status: %016llx\n", cpu, status);
1015 pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow); 1051 pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow);
1016 pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed); 1052 pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed);
1053 pr_info("CPU#%d: pebs: %016llx\n", cpu, pebs);
1017 } 1054 }
1018 pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask); 1055 pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask);
1019 1056
1020 for (idx = 0; idx < x86_pmu.num_events; idx++) { 1057 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1021 rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); 1058 rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
1022 rdmsrl(x86_pmu.perfctr + idx, pmc_count); 1059 rdmsrl(x86_pmu.perfctr + idx, pmc_count);
1023 1060
@@ -1030,7 +1067,7 @@ void perf_event_print_debug(void)
1030 pr_info("CPU#%d: gen-PMC%d left: %016llx\n", 1067 pr_info("CPU#%d: gen-PMC%d left: %016llx\n",
1031 cpu, idx, prev_left); 1068 cpu, idx, prev_left);
1032 } 1069 }
1033 for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) { 1070 for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
1034 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count); 1071 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
1035 1072
1036 pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", 1073 pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
@@ -1064,6 +1101,14 @@ static void x86_pmu_disable(struct perf_event *event)
1064 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 1101 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1065 int i; 1102 int i;
1066 1103
1104 /*
1105 * If we're called during a txn, we don't need to do anything.
1106 * The events never got scheduled and ->cancel_txn will truncate
1107 * the event_list.
1108 */
1109 if (cpuc->group_flag & PERF_EVENT_TXN)
1110 return;
1111
1067 x86_pmu_stop(event); 1112 x86_pmu_stop(event);
1068 1113
1069 for (i = 0; i < cpuc->n_events; i++) { 1114 for (i = 0; i < cpuc->n_events; i++) {
@@ -1095,7 +1140,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
1095 1140
1096 cpuc = &__get_cpu_var(cpu_hw_events); 1141 cpuc = &__get_cpu_var(cpu_hw_events);
1097 1142
1098 for (idx = 0; idx < x86_pmu.num_events; idx++) { 1143 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1099 if (!test_bit(idx, cpuc->active_mask)) 1144 if (!test_bit(idx, cpuc->active_mask))
1100 continue; 1145 continue;
1101 1146
@@ -1103,7 +1148,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
1103 hwc = &event->hw; 1148 hwc = &event->hw;
1104 1149
1105 val = x86_perf_event_update(event); 1150 val = x86_perf_event_update(event);
1106 if (val & (1ULL << (x86_pmu.event_bits - 1))) 1151 if (val & (1ULL << (x86_pmu.cntval_bits - 1)))
1107 continue; 1152 continue;
1108 1153
1109 /* 1154 /*
@@ -1146,7 +1191,6 @@ void set_perf_event_pending(void)
1146 1191
1147void perf_events_lapic_init(void) 1192void perf_events_lapic_init(void)
1148{ 1193{
1149#ifdef CONFIG_X86_LOCAL_APIC
1150 if (!x86_pmu.apic || !x86_pmu_initialized()) 1194 if (!x86_pmu.apic || !x86_pmu_initialized())
1151 return; 1195 return;
1152 1196
@@ -1154,7 +1198,6 @@ void perf_events_lapic_init(void)
1154 * Always use NMI for PMU 1198 * Always use NMI for PMU
1155 */ 1199 */
1156 apic_write(APIC_LVTPC, APIC_DM_NMI); 1200 apic_write(APIC_LVTPC, APIC_DM_NMI);
1157#endif
1158} 1201}
1159 1202
1160static int __kprobes 1203static int __kprobes
@@ -1178,9 +1221,7 @@ perf_event_nmi_handler(struct notifier_block *self,
1178 1221
1179 regs = args->regs; 1222 regs = args->regs;
1180 1223
1181#ifdef CONFIG_X86_LOCAL_APIC
1182 apic_write(APIC_LVTPC, APIC_DM_NMI); 1224 apic_write(APIC_LVTPC, APIC_DM_NMI);
1183#endif
1184 /* 1225 /*
1185 * Can't rely on the handled return value to say it was our NMI, two 1226 * Can't rely on the handled return value to say it was our NMI, two
1186 * events could trigger 'simultaneously' raising two back-to-back NMIs. 1227 * events could trigger 'simultaneously' raising two back-to-back NMIs.
@@ -1217,118 +1258,11 @@ x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
1217 return &unconstrained; 1258 return &unconstrained;
1218} 1259}
1219 1260
1220static int x86_event_sched_in(struct perf_event *event,
1221 struct perf_cpu_context *cpuctx)
1222{
1223 int ret = 0;
1224
1225 event->state = PERF_EVENT_STATE_ACTIVE;
1226 event->oncpu = smp_processor_id();
1227 event->tstamp_running += event->ctx->time - event->tstamp_stopped;
1228
1229 if (!is_x86_event(event))
1230 ret = event->pmu->enable(event);
1231
1232 if (!ret && !is_software_event(event))
1233 cpuctx->active_oncpu++;
1234
1235 if (!ret && event->attr.exclusive)
1236 cpuctx->exclusive = 1;
1237
1238 return ret;
1239}
1240
1241static void x86_event_sched_out(struct perf_event *event,
1242 struct perf_cpu_context *cpuctx)
1243{
1244 event->state = PERF_EVENT_STATE_INACTIVE;
1245 event->oncpu = -1;
1246
1247 if (!is_x86_event(event))
1248 event->pmu->disable(event);
1249
1250 event->tstamp_running -= event->ctx->time - event->tstamp_stopped;
1251
1252 if (!is_software_event(event))
1253 cpuctx->active_oncpu--;
1254
1255 if (event->attr.exclusive || !cpuctx->active_oncpu)
1256 cpuctx->exclusive = 0;
1257}
1258
1259/*
1260 * Called to enable a whole group of events.
1261 * Returns 1 if the group was enabled, or -EAGAIN if it could not be.
1262 * Assumes the caller has disabled interrupts and has
1263 * frozen the PMU with hw_perf_save_disable.
1264 *
1265 * called with PMU disabled. If successful and return value 1,
1266 * then guaranteed to call perf_enable() and hw_perf_enable()
1267 */
1268int hw_perf_group_sched_in(struct perf_event *leader,
1269 struct perf_cpu_context *cpuctx,
1270 struct perf_event_context *ctx)
1271{
1272 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1273 struct perf_event *sub;
1274 int assign[X86_PMC_IDX_MAX];
1275 int n0, n1, ret;
1276
1277 /* n0 = total number of events */
1278 n0 = collect_events(cpuc, leader, true);
1279 if (n0 < 0)
1280 return n0;
1281
1282 ret = x86_schedule_events(cpuc, n0, assign);
1283 if (ret)
1284 return ret;
1285
1286 ret = x86_event_sched_in(leader, cpuctx);
1287 if (ret)
1288 return ret;
1289
1290 n1 = 1;
1291 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
1292 if (sub->state > PERF_EVENT_STATE_OFF) {
1293 ret = x86_event_sched_in(sub, cpuctx);
1294 if (ret)
1295 goto undo;
1296 ++n1;
1297 }
1298 }
1299 /*
1300 * copy new assignment, now we know it is possible
1301 * will be used by hw_perf_enable()
1302 */
1303 memcpy(cpuc->assign, assign, n0*sizeof(int));
1304
1305 cpuc->n_events = n0;
1306 cpuc->n_added += n1;
1307 ctx->nr_active += n1;
1308
1309 /*
1310 * 1 means successful and events are active
1311 * This is not quite true because we defer
1312 * actual activation until hw_perf_enable() but
1313 * this way we* ensure caller won't try to enable
1314 * individual events
1315 */
1316 return 1;
1317undo:
1318 x86_event_sched_out(leader, cpuctx);
1319 n0 = 1;
1320 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
1321 if (sub->state == PERF_EVENT_STATE_ACTIVE) {
1322 x86_event_sched_out(sub, cpuctx);
1323 if (++n0 == n1)
1324 break;
1325 }
1326 }
1327 return ret;
1328}
1329
1330#include "perf_event_amd.c" 1261#include "perf_event_amd.c"
1331#include "perf_event_p6.c" 1262#include "perf_event_p6.c"
1263#include "perf_event_p4.c"
1264#include "perf_event_intel_lbr.c"
1265#include "perf_event_intel_ds.c"
1332#include "perf_event_intel.c" 1266#include "perf_event_intel.c"
1333 1267
1334static int __cpuinit 1268static int __cpuinit
@@ -1402,48 +1336,50 @@ void __init init_hw_perf_events(void)
1402 1336
1403 pr_cont("%s PMU driver.\n", x86_pmu.name); 1337 pr_cont("%s PMU driver.\n", x86_pmu.name);
1404 1338
1405 if (x86_pmu.num_events > X86_PMC_MAX_GENERIC) { 1339 if (x86_pmu.quirks)
1340 x86_pmu.quirks();
1341
1342 if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {
1406 WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", 1343 WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
1407 x86_pmu.num_events, X86_PMC_MAX_GENERIC); 1344 x86_pmu.num_counters, X86_PMC_MAX_GENERIC);
1408 x86_pmu.num_events = X86_PMC_MAX_GENERIC; 1345 x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
1409 } 1346 }
1410 perf_event_mask = (1 << x86_pmu.num_events) - 1; 1347 x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
1411 perf_max_events = x86_pmu.num_events; 1348 perf_max_events = x86_pmu.num_counters;
1412 1349
1413 if (x86_pmu.num_events_fixed > X86_PMC_MAX_FIXED) { 1350 if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
1414 WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!", 1351 WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
1415 x86_pmu.num_events_fixed, X86_PMC_MAX_FIXED); 1352 x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED);
1416 x86_pmu.num_events_fixed = X86_PMC_MAX_FIXED; 1353 x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED;
1417 } 1354 }
1418 1355
1419 perf_event_mask |= 1356 x86_pmu.intel_ctrl |=
1420 ((1LL << x86_pmu.num_events_fixed)-1) << X86_PMC_IDX_FIXED; 1357 ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED;
1421 x86_pmu.intel_ctrl = perf_event_mask;
1422 1358
1423 perf_events_lapic_init(); 1359 perf_events_lapic_init();
1424 register_die_notifier(&perf_event_nmi_notifier); 1360 register_die_notifier(&perf_event_nmi_notifier);
1425 1361
1426 unconstrained = (struct event_constraint) 1362 unconstrained = (struct event_constraint)
1427 __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_events) - 1, 1363 __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1,
1428 0, x86_pmu.num_events); 1364 0, x86_pmu.num_counters);
1429 1365
1430 if (x86_pmu.event_constraints) { 1366 if (x86_pmu.event_constraints) {
1431 for_each_event_constraint(c, x86_pmu.event_constraints) { 1367 for_each_event_constraint(c, x86_pmu.event_constraints) {
1432 if (c->cmask != INTEL_ARCH_FIXED_MASK) 1368 if (c->cmask != X86_RAW_EVENT_MASK)
1433 continue; 1369 continue;
1434 1370
1435 c->idxmsk64 |= (1ULL << x86_pmu.num_events) - 1; 1371 c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
1436 c->weight += x86_pmu.num_events; 1372 c->weight += x86_pmu.num_counters;
1437 } 1373 }
1438 } 1374 }
1439 1375
1440 pr_info("... version: %d\n", x86_pmu.version); 1376 pr_info("... version: %d\n", x86_pmu.version);
1441 pr_info("... bit width: %d\n", x86_pmu.event_bits); 1377 pr_info("... bit width: %d\n", x86_pmu.cntval_bits);
1442 pr_info("... generic registers: %d\n", x86_pmu.num_events); 1378 pr_info("... generic registers: %d\n", x86_pmu.num_counters);
1443 pr_info("... value mask: %016Lx\n", x86_pmu.event_mask); 1379 pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask);
1444 pr_info("... max period: %016Lx\n", x86_pmu.max_period); 1380 pr_info("... max period: %016Lx\n", x86_pmu.max_period);
1445 pr_info("... fixed-purpose events: %d\n", x86_pmu.num_events_fixed); 1381 pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed);
1446 pr_info("... event mask: %016Lx\n", perf_event_mask); 1382 pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl);
1447 1383
1448 perf_cpu_notifier(x86_pmu_notifier); 1384 perf_cpu_notifier(x86_pmu_notifier);
1449} 1385}
@@ -1453,6 +1389,67 @@ static inline void x86_pmu_read(struct perf_event *event)
1453 x86_perf_event_update(event); 1389 x86_perf_event_update(event);
1454} 1390}
1455 1391
1392/*
1393 * Start group events scheduling transaction
1394 * Set the flag to make pmu::enable() not perform the
1395 * schedulability test, it will be performed at commit time
1396 */
1397static void x86_pmu_start_txn(const struct pmu *pmu)
1398{
1399 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1400
1401 cpuc->group_flag |= PERF_EVENT_TXN;
1402 cpuc->n_txn = 0;
1403}
1404
1405/*
1406 * Stop group events scheduling transaction
1407 * Clear the flag and pmu::enable() will perform the
1408 * schedulability test.
1409 */
1410static void x86_pmu_cancel_txn(const struct pmu *pmu)
1411{
1412 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1413
1414 cpuc->group_flag &= ~PERF_EVENT_TXN;
1415 /*
1416 * Truncate the collected events.
1417 */
1418 cpuc->n_added -= cpuc->n_txn;
1419 cpuc->n_events -= cpuc->n_txn;
1420}
1421
1422/*
1423 * Commit group events scheduling transaction
1424 * Perform the group schedulability test as a whole
1425 * Return 0 if success
1426 */
1427static int x86_pmu_commit_txn(const struct pmu *pmu)
1428{
1429 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1430 int assign[X86_PMC_IDX_MAX];
1431 int n, ret;
1432
1433 n = cpuc->n_events;
1434
1435 if (!x86_pmu_initialized())
1436 return -EAGAIN;
1437
1438 ret = x86_pmu.schedule_events(cpuc, n, assign);
1439 if (ret)
1440 return ret;
1441
1442 /*
 1443 * copy the new assignment; now that we know it is possible,
 1444 * it will be used by hw_perf_enable()
1445 */
1446 memcpy(cpuc->assign, assign, n*sizeof(int));
1447
1448 cpuc->group_flag &= ~PERF_EVENT_TXN;
1449
1450 return 0;
1451}
1452
1456static const struct pmu pmu = { 1453static const struct pmu pmu = {
1457 .enable = x86_pmu_enable, 1454 .enable = x86_pmu_enable,
1458 .disable = x86_pmu_disable, 1455 .disable = x86_pmu_disable,
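
The three txn hooks above are meant to be driven by the generic layer when it schedules a whole group: open a transaction, feed in the leader and all siblings with the per-event test skipped, then let commit_txn() run x86_pmu.schedule_events() once for the complete set, falling back to cancel_txn() on failure. A simplified sketch of that calling sequence (add_event() is a stand-in for the real helpers in kernel/perf_event.c):

pmu->start_txn(pmu);                    /* set PERF_EVENT_TXN, defer the test */

if (add_event(leader))                  /* pmu->enable() skips schedule_events() */
        goto group_error;
list_for_each_entry(sub, &leader->sibling_list, group_entry) {
        if (add_event(sub))
                goto group_error;
}

if (!pmu->commit_txn(pmu))              /* one schedulability test for the group */
        return 0;

group_error:
pmu->cancel_txn(pmu);                   /* n_events/n_added rolled back by n_txn */
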
@@ -1460,9 +1457,38 @@ static const struct pmu pmu = {
1460 .stop = x86_pmu_stop, 1457 .stop = x86_pmu_stop,
1461 .read = x86_pmu_read, 1458 .read = x86_pmu_read,
1462 .unthrottle = x86_pmu_unthrottle, 1459 .unthrottle = x86_pmu_unthrottle,
1460 .start_txn = x86_pmu_start_txn,
1461 .cancel_txn = x86_pmu_cancel_txn,
1462 .commit_txn = x86_pmu_commit_txn,
1463}; 1463};
1464 1464
1465/* 1465/*
1466 * validate that we can schedule this event
1467 */
1468static int validate_event(struct perf_event *event)
1469{
1470 struct cpu_hw_events *fake_cpuc;
1471 struct event_constraint *c;
1472 int ret = 0;
1473
1474 fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO);
1475 if (!fake_cpuc)
1476 return -ENOMEM;
1477
1478 c = x86_pmu.get_event_constraints(fake_cpuc, event);
1479
1480 if (!c || !c->weight)
1481 ret = -ENOSPC;
1482
1483 if (x86_pmu.put_event_constraints)
1484 x86_pmu.put_event_constraints(fake_cpuc, event);
1485
1486 kfree(fake_cpuc);
1487
1488 return ret;
1489}
1490
1491/*
1466 * validate a single event group 1492 * validate a single event group
1467 * 1493 *
1468 * validation include: 1494 * validation include:
@@ -1502,7 +1528,7 @@ static int validate_group(struct perf_event *event)
1502 1528
1503 fake_cpuc->n_events = n; 1529 fake_cpuc->n_events = n;
1504 1530
1505 ret = x86_schedule_events(fake_cpuc, n, NULL); 1531 ret = x86_pmu.schedule_events(fake_cpuc, n, NULL);
1506 1532
1507out_free: 1533out_free:
1508 kfree(fake_cpuc); 1534 kfree(fake_cpuc);
@@ -1527,6 +1553,8 @@ const struct pmu *hw_perf_event_init(struct perf_event *event)
1527 1553
1528 if (event->group_leader != event) 1554 if (event->group_leader != event)
1529 err = validate_group(event); 1555 err = validate_group(event);
1556 else
1557 err = validate_event(event);
1530 1558
1531 event->pmu = tmp; 1559 event->pmu = tmp;
1532 } 1560 }
@@ -1574,8 +1602,7 @@ static void backtrace_address(void *data, unsigned long addr, int reliable)
1574{ 1602{
1575 struct perf_callchain_entry *entry = data; 1603 struct perf_callchain_entry *entry = data;
1576 1604
1577 if (reliable) 1605 callchain_store(entry, addr);
1578 callchain_store(entry, addr);
1579} 1606}
1580 1607
1581static const struct stacktrace_ops backtrace_ops = { 1608static const struct stacktrace_ops backtrace_ops = {
@@ -1586,8 +1613,6 @@ static const struct stacktrace_ops backtrace_ops = {
1586 .walk_stack = print_context_stack_bp, 1613 .walk_stack = print_context_stack_bp,
1587}; 1614};
1588 1615
1589#include "../dumpstack.h"
1590
1591static void 1616static void
1592perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) 1617perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
1593{ 1618{
@@ -1597,41 +1622,6 @@ perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
1597 dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry); 1622 dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry);
1598} 1623}
1599 1624
1600/*
1601 * best effort, GUP based copy_from_user() that assumes IRQ or NMI context
1602 */
1603static unsigned long
1604copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
1605{
1606 unsigned long offset, addr = (unsigned long)from;
1607 int type = in_nmi() ? KM_NMI : KM_IRQ0;
1608 unsigned long size, len = 0;
1609 struct page *page;
1610 void *map;
1611 int ret;
1612
1613 do {
1614 ret = __get_user_pages_fast(addr, 1, 0, &page);
1615 if (!ret)
1616 break;
1617
1618 offset = addr & (PAGE_SIZE - 1);
1619 size = min(PAGE_SIZE - offset, n - len);
1620
1621 map = kmap_atomic(page, type);
1622 memcpy(to, map+offset, size);
1623 kunmap_atomic(map, type);
1624 put_page(page);
1625
1626 len += size;
1627 to += size;
1628 addr += size;
1629
1630 } while (len < n);
1631
1632 return len;
1633}
1634
1635#ifdef CONFIG_COMPAT 1625#ifdef CONFIG_COMPAT
1636static inline int 1626static inline int
1637perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) 1627perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
@@ -1727,6 +1717,11 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
1727{ 1717{
1728 struct perf_callchain_entry *entry; 1718 struct perf_callchain_entry *entry;
1729 1719
1720 if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
1721 /* TODO: We don't support guest os callchain now */
1722 return NULL;
1723 }
1724
1730 if (in_nmi()) 1725 if (in_nmi())
1731 entry = &__get_cpu_var(pmc_nmi_entry); 1726 entry = &__get_cpu_var(pmc_nmi_entry);
1732 else 1727 else
@@ -1739,14 +1734,36 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
1739 return entry; 1734 return entry;
1740} 1735}
1741 1736
1742void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip) 1737unsigned long perf_instruction_pointer(struct pt_regs *regs)
1743{ 1738{
1744 regs->ip = ip; 1739 unsigned long ip;
1745 /* 1740
1746 * perf_arch_fetch_caller_regs adds another call, we need to increment 1741 if (perf_guest_cbs && perf_guest_cbs->is_in_guest())
1747 * the skip level 1742 ip = perf_guest_cbs->get_guest_ip();
1748 */ 1743 else
1749 regs->bp = rewind_frame_pointer(skip + 1); 1744 ip = instruction_pointer(regs);
1750 regs->cs = __KERNEL_CS; 1745
1751 local_save_flags(regs->flags); 1746 return ip;
1747}
1748
1749unsigned long perf_misc_flags(struct pt_regs *regs)
1750{
1751 int misc = 0;
1752
1753 if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
1754 if (perf_guest_cbs->is_user_mode())
1755 misc |= PERF_RECORD_MISC_GUEST_USER;
1756 else
1757 misc |= PERF_RECORD_MISC_GUEST_KERNEL;
1758 } else {
1759 if (user_mode(regs))
1760 misc |= PERF_RECORD_MISC_USER;
1761 else
1762 misc |= PERF_RECORD_MISC_KERNEL;
1763 }
1764
1765 if (regs->flags & PERF_EFLAGS_EXACT)
1766 misc |= PERF_RECORD_MISC_EXACT_IP;
1767
1768 return misc;
1752} 1769}
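
The perf_guest_cbs hooks consulted in perf_instruction_pointer() and perf_misc_flags() are supplied by the hypervisor side. Roughly how a hypervisor module such as KVM would wire them up (the callback bodies below are placeholders, not the real implementations):

#include <linux/perf_event.h>

static int example_is_in_guest(void)
{
        return 0;       /* placeholder: 1 while a vCPU is executing guest code */
}

static int example_is_user_mode(void)
{
        return 0;       /* placeholder: 1 if the guest was at CPL 3 */
}

static unsigned long example_get_guest_ip(void)
{
        return 0;       /* placeholder: the interrupted guest instruction pointer */
}

static struct perf_guest_info_callbacks example_guest_cbs = {
        .is_in_guest    = example_is_in_guest,
        .is_user_mode   = example_is_user_mode,
        .get_guest_ip   = example_get_guest_ip,
};

/* at module init: */
        perf_register_guest_info_callbacks(&example_guest_cbs);
/* at module exit: */
        perf_unregister_guest_info_callbacks(&example_guest_cbs);
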
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index db6f7d4056e1..c2897b7b4a3b 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -2,7 +2,7 @@
2 2
3static DEFINE_RAW_SPINLOCK(amd_nb_lock); 3static DEFINE_RAW_SPINLOCK(amd_nb_lock);
4 4
5static __initconst u64 amd_hw_cache_event_ids 5static __initconst const u64 amd_hw_cache_event_ids
6 [PERF_COUNT_HW_CACHE_MAX] 6 [PERF_COUNT_HW_CACHE_MAX]
7 [PERF_COUNT_HW_CACHE_OP_MAX] 7 [PERF_COUNT_HW_CACHE_OP_MAX]
8 [PERF_COUNT_HW_CACHE_RESULT_MAX] = 8 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -102,8 +102,8 @@ static const u64 amd_perfmon_event_map[] =
102 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, 102 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
103 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080, 103 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080,
104 [PERF_COUNT_HW_CACHE_MISSES] = 0x0081, 104 [PERF_COUNT_HW_CACHE_MISSES] = 0x0081,
105 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, 105 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2,
106 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, 106 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3,
107}; 107};
108 108
109static u64 amd_pmu_event_map(int hw_event) 109static u64 amd_pmu_event_map(int hw_event)
@@ -111,22 +111,19 @@ static u64 amd_pmu_event_map(int hw_event)
111 return amd_perfmon_event_map[hw_event]; 111 return amd_perfmon_event_map[hw_event];
112} 112}
113 113
114static u64 amd_pmu_raw_event(u64 hw_event) 114static int amd_pmu_hw_config(struct perf_event *event)
115{ 115{
116#define K7_EVNTSEL_EVENT_MASK 0xF000000FFULL 116 int ret = x86_pmu_hw_config(event);
117#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL 117
118#define K7_EVNTSEL_EDGE_MASK 0x000040000ULL 118 if (ret)
119#define K7_EVNTSEL_INV_MASK 0x000800000ULL 119 return ret;
120#define K7_EVNTSEL_REG_MASK 0x0FF000000ULL 120
121 121 if (event->attr.type != PERF_TYPE_RAW)
122#define K7_EVNTSEL_MASK \ 122 return 0;
123 (K7_EVNTSEL_EVENT_MASK | \ 123
124 K7_EVNTSEL_UNIT_MASK | \ 124 event->hw.config |= event->attr.config & AMD64_RAW_EVENT_MASK;
125 K7_EVNTSEL_EDGE_MASK | \ 125
126 K7_EVNTSEL_INV_MASK | \ 126 return 0;
127 K7_EVNTSEL_REG_MASK)
128
129 return hw_event & K7_EVNTSEL_MASK;
130} 127}
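With amd_pmu_raw_event() gone, a PERF_TYPE_RAW request now keeps whatever attr.config bits survive AMD64_RAW_EVENT_MASK. For illustration only, the userspace side of such a request might look like the snippet below; the chosen event select 0x76 (CPU clocks not halted) is just an example.

	#include <linux/perf_event.h>
	#include <string.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	/* Open a raw AMD event on the calling thread, any CPU. */
	static int open_raw_amd_event(void)
	{
		struct perf_event_attr attr;

		memset(&attr, 0, sizeof(attr));
		attr.size   = sizeof(attr);
		attr.type   = PERF_TYPE_RAW;
		attr.config = 0x0076;	/* event select 0x76, unit mask 0 */

		return syscall(__NR_perf_event_open, &attr,
			       0 /* self */, -1 /* any cpu */,
			       -1 /* no group */, 0 /* no flags */);
	}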
131 128
132/* 129/*
@@ -165,7 +162,7 @@ static void amd_put_event_constraints(struct cpu_hw_events *cpuc,
165 * be removed on one CPU at a time AND PMU is disabled 162 * be removed on one CPU at a time AND PMU is disabled
166 * when we come here 163 * when we come here
167 */ 164 */
168 for (i = 0; i < x86_pmu.num_events; i++) { 165 for (i = 0; i < x86_pmu.num_counters; i++) {
169 if (nb->owners[i] == event) { 166 if (nb->owners[i] == event) {
170 cmpxchg(nb->owners+i, event, NULL); 167 cmpxchg(nb->owners+i, event, NULL);
171 break; 168 break;
@@ -215,7 +212,7 @@ amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
215 struct hw_perf_event *hwc = &event->hw; 212 struct hw_perf_event *hwc = &event->hw;
216 struct amd_nb *nb = cpuc->amd_nb; 213 struct amd_nb *nb = cpuc->amd_nb;
217 struct perf_event *old = NULL; 214 struct perf_event *old = NULL;
218 int max = x86_pmu.num_events; 215 int max = x86_pmu.num_counters;
219 int i, j, k = -1; 216 int i, j, k = -1;
220 217
221 /* 218 /*
@@ -293,7 +290,7 @@ static struct amd_nb *amd_alloc_nb(int cpu, int nb_id)
293 /* 290 /*
294 * initialize all possible NB constraints 291 * initialize all possible NB constraints
295 */ 292 */
296 for (i = 0; i < x86_pmu.num_events; i++) { 293 for (i = 0; i < x86_pmu.num_counters; i++) {
297 __set_bit(i, nb->event_constraints[i].idxmsk); 294 __set_bit(i, nb->event_constraints[i].idxmsk);
298 nb->event_constraints[i].weight = 1; 295 nb->event_constraints[i].weight = 1;
299 } 296 }
@@ -371,21 +368,22 @@ static void amd_pmu_cpu_dead(int cpu)
371 raw_spin_unlock(&amd_nb_lock); 368 raw_spin_unlock(&amd_nb_lock);
372} 369}
373 370
374static __initconst struct x86_pmu amd_pmu = { 371static __initconst const struct x86_pmu amd_pmu = {
375 .name = "AMD", 372 .name = "AMD",
376 .handle_irq = x86_pmu_handle_irq, 373 .handle_irq = x86_pmu_handle_irq,
377 .disable_all = x86_pmu_disable_all, 374 .disable_all = x86_pmu_disable_all,
378 .enable_all = x86_pmu_enable_all, 375 .enable_all = x86_pmu_enable_all,
379 .enable = x86_pmu_enable_event, 376 .enable = x86_pmu_enable_event,
380 .disable = x86_pmu_disable_event, 377 .disable = x86_pmu_disable_event,
378 .hw_config = amd_pmu_hw_config,
379 .schedule_events = x86_schedule_events,
381 .eventsel = MSR_K7_EVNTSEL0, 380 .eventsel = MSR_K7_EVNTSEL0,
382 .perfctr = MSR_K7_PERFCTR0, 381 .perfctr = MSR_K7_PERFCTR0,
383 .event_map = amd_pmu_event_map, 382 .event_map = amd_pmu_event_map,
384 .raw_event = amd_pmu_raw_event,
385 .max_events = ARRAY_SIZE(amd_perfmon_event_map), 383 .max_events = ARRAY_SIZE(amd_perfmon_event_map),
386 .num_events = 4, 384 .num_counters = 4,
387 .event_bits = 48, 385 .cntval_bits = 48,
388 .event_mask = (1ULL << 48) - 1, 386 .cntval_mask = (1ULL << 48) - 1,
389 .apic = 1, 387 .apic = 1,
390 /* use highest bit to detect overflow */ 388 /* use highest bit to detect overflow */
391 .max_period = (1ULL << 47) - 1, 389 .max_period = (1ULL << 47) - 1,
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 9c794ac87837..214ac860ebe0 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -72,6 +72,7 @@ static struct event_constraint intel_westmere_event_constraints[] =
72 INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */ 72 INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
73 INTEL_EVENT_CONSTRAINT(0x60, 0x1), /* OFFCORE_REQUESTS_OUTSTANDING */ 73 INTEL_EVENT_CONSTRAINT(0x60, 0x1), /* OFFCORE_REQUESTS_OUTSTANDING */
74 INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */ 74 INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */
75 INTEL_EVENT_CONSTRAINT(0xb3, 0x1), /* SNOOPQ_REQUEST_OUTSTANDING */
75 EVENT_CONSTRAINT_END 76 EVENT_CONSTRAINT_END
76}; 77};
77 78
@@ -88,7 +89,7 @@ static u64 intel_pmu_event_map(int hw_event)
88 return intel_perfmon_event_map[hw_event]; 89 return intel_perfmon_event_map[hw_event];
89} 90}
90 91
91static __initconst u64 westmere_hw_cache_event_ids 92static __initconst const u64 westmere_hw_cache_event_ids
92 [PERF_COUNT_HW_CACHE_MAX] 93 [PERF_COUNT_HW_CACHE_MAX]
93 [PERF_COUNT_HW_CACHE_OP_MAX] 94 [PERF_COUNT_HW_CACHE_OP_MAX]
94 [PERF_COUNT_HW_CACHE_RESULT_MAX] = 95 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -179,7 +180,7 @@ static __initconst u64 westmere_hw_cache_event_ids
179 }, 180 },
180}; 181};
181 182
182static __initconst u64 nehalem_hw_cache_event_ids 183static __initconst const u64 nehalem_hw_cache_event_ids
183 [PERF_COUNT_HW_CACHE_MAX] 184 [PERF_COUNT_HW_CACHE_MAX]
184 [PERF_COUNT_HW_CACHE_OP_MAX] 185 [PERF_COUNT_HW_CACHE_OP_MAX]
185 [PERF_COUNT_HW_CACHE_RESULT_MAX] = 186 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -270,7 +271,7 @@ static __initconst u64 nehalem_hw_cache_event_ids
270 }, 271 },
271}; 272};
272 273
273static __initconst u64 core2_hw_cache_event_ids 274static __initconst const u64 core2_hw_cache_event_ids
274 [PERF_COUNT_HW_CACHE_MAX] 275 [PERF_COUNT_HW_CACHE_MAX]
275 [PERF_COUNT_HW_CACHE_OP_MAX] 276 [PERF_COUNT_HW_CACHE_OP_MAX]
276 [PERF_COUNT_HW_CACHE_RESULT_MAX] = 277 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -361,7 +362,7 @@ static __initconst u64 core2_hw_cache_event_ids
361 }, 362 },
362}; 363};
363 364
364static __initconst u64 atom_hw_cache_event_ids 365static __initconst const u64 atom_hw_cache_event_ids
365 [PERF_COUNT_HW_CACHE_MAX] 366 [PERF_COUNT_HW_CACHE_MAX]
366 [PERF_COUNT_HW_CACHE_OP_MAX] 367 [PERF_COUNT_HW_CACHE_OP_MAX]
367 [PERF_COUNT_HW_CACHE_RESULT_MAX] = 368 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -452,60 +453,6 @@ static __initconst u64 atom_hw_cache_event_ids
452 }, 453 },
453}; 454};
454 455
455static u64 intel_pmu_raw_event(u64 hw_event)
456{
457#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL
458#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL
459#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL
460#define CORE_EVNTSEL_INV_MASK 0x00800000ULL
461#define CORE_EVNTSEL_REG_MASK 0xFF000000ULL
462
463#define CORE_EVNTSEL_MASK \
464 (INTEL_ARCH_EVTSEL_MASK | \
465 INTEL_ARCH_UNIT_MASK | \
466 INTEL_ARCH_EDGE_MASK | \
467 INTEL_ARCH_INV_MASK | \
468 INTEL_ARCH_CNT_MASK)
469
470 return hw_event & CORE_EVNTSEL_MASK;
471}
472
473static void intel_pmu_enable_bts(u64 config)
474{
475 unsigned long debugctlmsr;
476
477 debugctlmsr = get_debugctlmsr();
478
479 debugctlmsr |= X86_DEBUGCTL_TR;
480 debugctlmsr |= X86_DEBUGCTL_BTS;
481 debugctlmsr |= X86_DEBUGCTL_BTINT;
482
483 if (!(config & ARCH_PERFMON_EVENTSEL_OS))
484 debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS;
485
486 if (!(config & ARCH_PERFMON_EVENTSEL_USR))
487 debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR;
488
489 update_debugctlmsr(debugctlmsr);
490}
491
492static void intel_pmu_disable_bts(void)
493{
494 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
495 unsigned long debugctlmsr;
496
497 if (!cpuc->ds)
498 return;
499
500 debugctlmsr = get_debugctlmsr();
501
502 debugctlmsr &=
503 ~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT |
504 X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR);
505
506 update_debugctlmsr(debugctlmsr);
507}
508
509static void intel_pmu_disable_all(void) 456static void intel_pmu_disable_all(void)
510{ 457{
511 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 458 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
@@ -514,12 +461,17 @@ static void intel_pmu_disable_all(void)
514 461
515 if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) 462 if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask))
516 intel_pmu_disable_bts(); 463 intel_pmu_disable_bts();
464
465 intel_pmu_pebs_disable_all();
466 intel_pmu_lbr_disable_all();
517} 467}
518 468
519static void intel_pmu_enable_all(void) 469static void intel_pmu_enable_all(int added)
520{ 470{
521 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 471 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
522 472
473 intel_pmu_pebs_enable_all();
474 intel_pmu_lbr_enable_all();
523 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); 475 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
524 476
525 if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { 477 if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
@@ -533,6 +485,42 @@ static void intel_pmu_enable_all(void)
533 } 485 }
534} 486}
535 487
488/*
489 * Workaround for:
490 * Intel Errata AAK100 (model 26)
491 * Intel Errata AAP53 (model 30)
492 * Intel Errata BD53 (model 44)
493 *
494 * These chips need to be 'reset' when adding counters by programming
 495 * the magic three (non-counting) events 0x4300D2, 0x4300B1 and 0x4300B5
496 * either in sequence on the same PMC or on different PMCs.
497 */
498static void intel_pmu_nhm_enable_all(int added)
499{
500 if (added) {
501 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
502 int i;
503
504 wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + 0, 0x4300D2);
505 wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + 1, 0x4300B1);
506 wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + 2, 0x4300B5);
507
508 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x3);
509 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x0);
510
511 for (i = 0; i < 3; i++) {
512 struct perf_event *event = cpuc->events[i];
513
514 if (!event)
515 continue;
516
517 __x86_pmu_enable_event(&event->hw,
518 ARCH_PERFMON_EVENTSEL_ENABLE);
519 }
520 }
521 intel_pmu_enable_all(added);
522}
523
536static inline u64 intel_pmu_get_status(void) 524static inline u64 intel_pmu_get_status(void)
537{ 525{
538 u64 status; 526 u64 status;
@@ -547,8 +535,7 @@ static inline void intel_pmu_ack_status(u64 ack)
547 wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); 535 wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
548} 536}
549 537
550static inline void 538static void intel_pmu_disable_fixed(struct hw_perf_event *hwc)
551intel_pmu_disable_fixed(struct hw_perf_event *hwc)
552{ 539{
553 int idx = hwc->idx - X86_PMC_IDX_FIXED; 540 int idx = hwc->idx - X86_PMC_IDX_FIXED;
554 u64 ctrl_val, mask; 541 u64 ctrl_val, mask;
@@ -557,71 +544,10 @@ intel_pmu_disable_fixed(struct hw_perf_event *hwc)
557 544
558 rdmsrl(hwc->config_base, ctrl_val); 545 rdmsrl(hwc->config_base, ctrl_val);
559 ctrl_val &= ~mask; 546 ctrl_val &= ~mask;
560 (void)checking_wrmsrl(hwc->config_base, ctrl_val); 547 wrmsrl(hwc->config_base, ctrl_val);
561}
562
563static void intel_pmu_drain_bts_buffer(void)
564{
565 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
566 struct debug_store *ds = cpuc->ds;
567 struct bts_record {
568 u64 from;
569 u64 to;
570 u64 flags;
571 };
572 struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS];
573 struct bts_record *at, *top;
574 struct perf_output_handle handle;
575 struct perf_event_header header;
576 struct perf_sample_data data;
577 struct pt_regs regs;
578
579 if (!event)
580 return;
581
582 if (!ds)
583 return;
584
585 at = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
586 top = (struct bts_record *)(unsigned long)ds->bts_index;
587
588 if (top <= at)
589 return;
590
591 ds->bts_index = ds->bts_buffer_base;
592
593 perf_sample_data_init(&data, 0);
594
595 data.period = event->hw.last_period;
596 regs.ip = 0;
597
598 /*
599 * Prepare a generic sample, i.e. fill in the invariant fields.
600 * We will overwrite the from and to address before we output
601 * the sample.
602 */
603 perf_prepare_sample(&header, &data, event, &regs);
604
605 if (perf_output_begin(&handle, event,
606 header.size * (top - at), 1, 1))
607 return;
608
609 for (; at < top; at++) {
610 data.ip = at->from;
611 data.addr = at->to;
612
613 perf_output_sample(&handle, &header, &data, event);
614 }
615
616 perf_output_end(&handle);
617
618 /* There's new data available. */
619 event->hw.interrupts++;
620 event->pending_kill = POLL_IN;
621} 548}
622 549
623static inline void 550static void intel_pmu_disable_event(struct perf_event *event)
624intel_pmu_disable_event(struct perf_event *event)
625{ 551{
626 struct hw_perf_event *hwc = &event->hw; 552 struct hw_perf_event *hwc = &event->hw;
627 553
@@ -637,14 +563,15 @@ intel_pmu_disable_event(struct perf_event *event)
637 } 563 }
638 564
639 x86_pmu_disable_event(event); 565 x86_pmu_disable_event(event);
566
567 if (unlikely(event->attr.precise_ip))
568 intel_pmu_pebs_disable(event);
640} 569}
641 570
642static inline void 571static void intel_pmu_enable_fixed(struct hw_perf_event *hwc)
643intel_pmu_enable_fixed(struct hw_perf_event *hwc)
644{ 572{
645 int idx = hwc->idx - X86_PMC_IDX_FIXED; 573 int idx = hwc->idx - X86_PMC_IDX_FIXED;
646 u64 ctrl_val, bits, mask; 574 u64 ctrl_val, bits, mask;
647 int err;
648 575
649 /* 576 /*
650 * Enable IRQ generation (0x8), 577 * Enable IRQ generation (0x8),
@@ -669,7 +596,7 @@ intel_pmu_enable_fixed(struct hw_perf_event *hwc)
669 rdmsrl(hwc->config_base, ctrl_val); 596 rdmsrl(hwc->config_base, ctrl_val);
670 ctrl_val &= ~mask; 597 ctrl_val &= ~mask;
671 ctrl_val |= bits; 598 ctrl_val |= bits;
672 err = checking_wrmsrl(hwc->config_base, ctrl_val); 599 wrmsrl(hwc->config_base, ctrl_val);
673} 600}
674 601
675static void intel_pmu_enable_event(struct perf_event *event) 602static void intel_pmu_enable_event(struct perf_event *event)
@@ -689,7 +616,10 @@ static void intel_pmu_enable_event(struct perf_event *event)
689 return; 616 return;
690 } 617 }
691 618
692 __x86_pmu_enable_event(hwc); 619 if (unlikely(event->attr.precise_ip))
620 intel_pmu_pebs_enable(event);
621
622 __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
693} 623}
694 624
695/* 625/*
@@ -708,20 +638,20 @@ static void intel_pmu_reset(void)
708 unsigned long flags; 638 unsigned long flags;
709 int idx; 639 int idx;
710 640
711 if (!x86_pmu.num_events) 641 if (!x86_pmu.num_counters)
712 return; 642 return;
713 643
714 local_irq_save(flags); 644 local_irq_save(flags);
715 645
716 printk("clearing PMU state on CPU#%d\n", smp_processor_id()); 646 printk("clearing PMU state on CPU#%d\n", smp_processor_id());
717 647
718 for (idx = 0; idx < x86_pmu.num_events; idx++) { 648 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
719 checking_wrmsrl(x86_pmu.eventsel + idx, 0ull); 649 checking_wrmsrl(x86_pmu.eventsel + idx, 0ull);
720 checking_wrmsrl(x86_pmu.perfctr + idx, 0ull); 650 checking_wrmsrl(x86_pmu.perfctr + idx, 0ull);
721 } 651 }
722 for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) { 652 for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++)
723 checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); 653 checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
724 } 654
725 if (ds) 655 if (ds)
726 ds->bts_index = ds->bts_buffer_base; 656 ds->bts_index = ds->bts_buffer_base;
727 657
@@ -747,7 +677,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
747 intel_pmu_drain_bts_buffer(); 677 intel_pmu_drain_bts_buffer();
748 status = intel_pmu_get_status(); 678 status = intel_pmu_get_status();
749 if (!status) { 679 if (!status) {
750 intel_pmu_enable_all(); 680 intel_pmu_enable_all(0);
751 return 0; 681 return 0;
752 } 682 }
753 683
@@ -762,6 +692,15 @@ again:
762 692
763 inc_irq_stat(apic_perf_irqs); 693 inc_irq_stat(apic_perf_irqs);
764 ack = status; 694 ack = status;
695
696 intel_pmu_lbr_read();
697
698 /*
699 * PEBS overflow sets bit 62 in the global status register
700 */
701 if (__test_and_clear_bit(62, (unsigned long *)&status))
702 x86_pmu.drain_pebs(regs);
703
765 for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { 704 for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
766 struct perf_event *event = cpuc->events[bit]; 705 struct perf_event *event = cpuc->events[bit];
767 706
@@ -787,26 +726,22 @@ again:
787 goto again; 726 goto again;
788 727
789done: 728done:
790 intel_pmu_enable_all(); 729 intel_pmu_enable_all(0);
791 return 1; 730 return 1;
792} 731}
793 732
794static struct event_constraint bts_constraint =
795 EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_FIXED_BTS, 0);
796
797static struct event_constraint * 733static struct event_constraint *
798intel_special_constraints(struct perf_event *event) 734intel_bts_constraints(struct perf_event *event)
799{ 735{
800 unsigned int hw_event; 736 struct hw_perf_event *hwc = &event->hw;
801 737 unsigned int hw_event, bts_event;
802 hw_event = event->hw.config & INTEL_ARCH_EVENT_MASK;
803 738
804 if (unlikely((hw_event == 739 hw_event = hwc->config & INTEL_ARCH_EVENT_MASK;
805 x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) && 740 bts_event = x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS);
806 (event->hw.sample_period == 1))) {
807 741
742 if (unlikely(hw_event == bts_event && hwc->sample_period == 1))
808 return &bts_constraint; 743 return &bts_constraint;
809 } 744
810 return NULL; 745 return NULL;
811} 746}
812 747
@@ -815,24 +750,53 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event
815{ 750{
816 struct event_constraint *c; 751 struct event_constraint *c;
817 752
818 c = intel_special_constraints(event); 753 c = intel_bts_constraints(event);
754 if (c)
755 return c;
756
757 c = intel_pebs_constraints(event);
819 if (c) 758 if (c)
820 return c; 759 return c;
821 760
822 return x86_get_event_constraints(cpuc, event); 761 return x86_get_event_constraints(cpuc, event);
823} 762}
824 763
825static __initconst struct x86_pmu core_pmu = { 764static int intel_pmu_hw_config(struct perf_event *event)
765{
766 int ret = x86_pmu_hw_config(event);
767
768 if (ret)
769 return ret;
770
771 if (event->attr.type != PERF_TYPE_RAW)
772 return 0;
773
774 if (!(event->attr.config & ARCH_PERFMON_EVENTSEL_ANY))
775 return 0;
776
777 if (x86_pmu.version < 3)
778 return -EINVAL;
779
780 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
781 return -EACCES;
782
783 event->hw.config |= ARCH_PERFMON_EVENTSEL_ANY;
784
785 return 0;
786}
787
788static __initconst const struct x86_pmu core_pmu = {
826 .name = "core", 789 .name = "core",
827 .handle_irq = x86_pmu_handle_irq, 790 .handle_irq = x86_pmu_handle_irq,
828 .disable_all = x86_pmu_disable_all, 791 .disable_all = x86_pmu_disable_all,
829 .enable_all = x86_pmu_enable_all, 792 .enable_all = x86_pmu_enable_all,
830 .enable = x86_pmu_enable_event, 793 .enable = x86_pmu_enable_event,
831 .disable = x86_pmu_disable_event, 794 .disable = x86_pmu_disable_event,
795 .hw_config = x86_pmu_hw_config,
796 .schedule_events = x86_schedule_events,
832 .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, 797 .eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
833 .perfctr = MSR_ARCH_PERFMON_PERFCTR0, 798 .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
834 .event_map = intel_pmu_event_map, 799 .event_map = intel_pmu_event_map,
835 .raw_event = intel_pmu_raw_event,
836 .max_events = ARRAY_SIZE(intel_perfmon_event_map), 800 .max_events = ARRAY_SIZE(intel_perfmon_event_map),
837 .apic = 1, 801 .apic = 1,
838 /* 802 /*
@@ -845,17 +809,32 @@ static __initconst struct x86_pmu core_pmu = {
845 .event_constraints = intel_core_event_constraints, 809 .event_constraints = intel_core_event_constraints,
846}; 810};
847 811
848static __initconst struct x86_pmu intel_pmu = { 812static void intel_pmu_cpu_starting(int cpu)
813{
814 init_debug_store_on_cpu(cpu);
815 /*
816 * Deal with CPUs that don't clear their LBRs on power-up.
817 */
818 intel_pmu_lbr_reset();
819}
820
821static void intel_pmu_cpu_dying(int cpu)
822{
823 fini_debug_store_on_cpu(cpu);
824}
825
826static __initconst const struct x86_pmu intel_pmu = {
849 .name = "Intel", 827 .name = "Intel",
850 .handle_irq = intel_pmu_handle_irq, 828 .handle_irq = intel_pmu_handle_irq,
851 .disable_all = intel_pmu_disable_all, 829 .disable_all = intel_pmu_disable_all,
852 .enable_all = intel_pmu_enable_all, 830 .enable_all = intel_pmu_enable_all,
853 .enable = intel_pmu_enable_event, 831 .enable = intel_pmu_enable_event,
854 .disable = intel_pmu_disable_event, 832 .disable = intel_pmu_disable_event,
833 .hw_config = intel_pmu_hw_config,
834 .schedule_events = x86_schedule_events,
855 .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, 835 .eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
856 .perfctr = MSR_ARCH_PERFMON_PERFCTR0, 836 .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
857 .event_map = intel_pmu_event_map, 837 .event_map = intel_pmu_event_map,
858 .raw_event = intel_pmu_raw_event,
859 .max_events = ARRAY_SIZE(intel_perfmon_event_map), 838 .max_events = ARRAY_SIZE(intel_perfmon_event_map),
860 .apic = 1, 839 .apic = 1,
861 /* 840 /*
@@ -864,14 +843,38 @@ static __initconst struct x86_pmu intel_pmu = {
864 * the generic event period: 843 * the generic event period:
865 */ 844 */
866 .max_period = (1ULL << 31) - 1, 845 .max_period = (1ULL << 31) - 1,
867 .enable_bts = intel_pmu_enable_bts,
868 .disable_bts = intel_pmu_disable_bts,
869 .get_event_constraints = intel_get_event_constraints, 846 .get_event_constraints = intel_get_event_constraints,
870 847
871 .cpu_starting = init_debug_store_on_cpu, 848 .cpu_starting = intel_pmu_cpu_starting,
872 .cpu_dying = fini_debug_store_on_cpu, 849 .cpu_dying = intel_pmu_cpu_dying,
873}; 850};
874 851
852static void intel_clovertown_quirks(void)
853{
854 /*
855 * PEBS is unreliable due to:
856 *
857 * AJ67 - PEBS may experience CPL leaks
858 * AJ68 - PEBS PMI may be delayed by one event
 859 * AJ69 - GLOBAL_STATUS[62] will only be set when DEBUGCTL[12] is set
860 * AJ106 - FREEZE_LBRS_ON_PMI doesn't work in combination with PEBS
861 *
862 * AJ67 could be worked around by restricting the OS/USR flags.
863 * AJ69 could be worked around by setting PMU_FREEZE_ON_PMI.
864 *
865 * AJ106 could possibly be worked around by not allowing LBR
866 * usage from PEBS, including the fixup.
867 * AJ68 could possibly be worked around by always programming
868 * a pebs_event_reset[0] value and coping with the lost events.
869 *
870 * But taken together it might just make sense to not enable PEBS on
871 * these chips.
872 */
873 printk(KERN_WARNING "PEBS disabled due to CPU errata.\n");
874 x86_pmu.pebs = 0;
875 x86_pmu.pebs_constraints = NULL;
876}
877
875static __init int intel_pmu_init(void) 878static __init int intel_pmu_init(void)
876{ 879{
877 union cpuid10_edx edx; 880 union cpuid10_edx edx;
@@ -881,12 +884,13 @@ static __init int intel_pmu_init(void)
881 int version; 884 int version;
882 885
883 if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { 886 if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
884 /* check for P6 processor family */ 887 switch (boot_cpu_data.x86) {
885 if (boot_cpu_data.x86 == 6) { 888 case 0x6:
886 return p6_pmu_init(); 889 return p6_pmu_init();
887 } else { 890 case 0xf:
891 return p4_pmu_init();
892 }
888 return -ENODEV; 893 return -ENODEV;
889 }
890 } 894 }
891 895
892 /* 896 /*
@@ -904,16 +908,28 @@ static __init int intel_pmu_init(void)
904 x86_pmu = intel_pmu; 908 x86_pmu = intel_pmu;
905 909
906 x86_pmu.version = version; 910 x86_pmu.version = version;
907 x86_pmu.num_events = eax.split.num_events; 911 x86_pmu.num_counters = eax.split.num_counters;
908 x86_pmu.event_bits = eax.split.bit_width; 912 x86_pmu.cntval_bits = eax.split.bit_width;
909 x86_pmu.event_mask = (1ULL << eax.split.bit_width) - 1; 913 x86_pmu.cntval_mask = (1ULL << eax.split.bit_width) - 1;
910 914
911 /* 915 /*
912 * Quirk: v2 perfmon does not report fixed-purpose events, so 916 * Quirk: v2 perfmon does not report fixed-purpose events, so
913 * assume at least 3 events: 917 * assume at least 3 events:
914 */ 918 */
915 if (version > 1) 919 if (version > 1)
916 x86_pmu.num_events_fixed = max((int)edx.split.num_events_fixed, 3); 920 x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3);
921
922 /*
923 * v2 and above have a perf capabilities MSR
924 */
925 if (version > 1) {
926 u64 capabilities;
927
928 rdmsrl(MSR_IA32_PERF_CAPABILITIES, capabilities);
929 x86_pmu.intel_cap.capabilities = capabilities;
930 }
931
932 intel_ds_init();
917 933
918 /* 934 /*
919 * Install the hw-cache-events table: 935 * Install the hw-cache-events table:
@@ -924,12 +940,15 @@ static __init int intel_pmu_init(void)
924 break; 940 break;
925 941
926 case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */ 942 case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
943 x86_pmu.quirks = intel_clovertown_quirks;
927 case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */ 944 case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
928 case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */ 945 case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
929 case 29: /* six-core 45 nm xeon "Dunnington" */ 946 case 29: /* six-core 45 nm xeon "Dunnington" */
930 memcpy(hw_cache_event_ids, core2_hw_cache_event_ids, 947 memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
931 sizeof(hw_cache_event_ids)); 948 sizeof(hw_cache_event_ids));
932 949
950 intel_pmu_lbr_init_core();
951
933 x86_pmu.event_constraints = intel_core2_event_constraints; 952 x86_pmu.event_constraints = intel_core2_event_constraints;
934 pr_cont("Core2 events, "); 953 pr_cont("Core2 events, ");
935 break; 954 break;
@@ -940,13 +959,19 @@ static __init int intel_pmu_init(void)
940 memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, 959 memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
941 sizeof(hw_cache_event_ids)); 960 sizeof(hw_cache_event_ids));
942 961
962 intel_pmu_lbr_init_nhm();
963
943 x86_pmu.event_constraints = intel_nehalem_event_constraints; 964 x86_pmu.event_constraints = intel_nehalem_event_constraints;
944 pr_cont("Nehalem/Corei7 events, "); 965 x86_pmu.enable_all = intel_pmu_nhm_enable_all;
966 pr_cont("Nehalem events, ");
945 break; 967 break;
968
946 case 28: /* Atom */ 969 case 28: /* Atom */
947 memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, 970 memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
948 sizeof(hw_cache_event_ids)); 971 sizeof(hw_cache_event_ids));
949 972
973 intel_pmu_lbr_init_atom();
974
950 x86_pmu.event_constraints = intel_gen_event_constraints; 975 x86_pmu.event_constraints = intel_gen_event_constraints;
951 pr_cont("Atom events, "); 976 pr_cont("Atom events, ");
952 break; 977 break;
@@ -956,7 +981,10 @@ static __init int intel_pmu_init(void)
956 memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids, 981 memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids,
957 sizeof(hw_cache_event_ids)); 982 sizeof(hw_cache_event_ids));
958 983
984 intel_pmu_lbr_init_nhm();
985
959 x86_pmu.event_constraints = intel_westmere_event_constraints; 986 x86_pmu.event_constraints = intel_westmere_event_constraints;
987 x86_pmu.enable_all = intel_pmu_nhm_enable_all;
960 pr_cont("Westmere events, "); 988 pr_cont("Westmere events, ");
961 break; 989 break;
962 990
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
new file mode 100644
index 000000000000..18018d1311cd
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -0,0 +1,641 @@
1#ifdef CONFIG_CPU_SUP_INTEL
2
3/* The maximal number of PEBS events: */
4#define MAX_PEBS_EVENTS 4
5
6/* The size of a BTS record in bytes: */
7#define BTS_RECORD_SIZE 24
8
9#define BTS_BUFFER_SIZE (PAGE_SIZE << 4)
10#define PEBS_BUFFER_SIZE PAGE_SIZE
11
12/*
13 * pebs_record_32 for p4 and core not supported
14
15struct pebs_record_32 {
16 u32 flags, ip;
 17	u32 ax, bx, cx, dx;
18 u32 si, di, bp, sp;
19};
20
21 */
22
23struct pebs_record_core {
24 u64 flags, ip;
25 u64 ax, bx, cx, dx;
26 u64 si, di, bp, sp;
27 u64 r8, r9, r10, r11;
28 u64 r12, r13, r14, r15;
29};
30
31struct pebs_record_nhm {
32 u64 flags, ip;
33 u64 ax, bx, cx, dx;
34 u64 si, di, bp, sp;
35 u64 r8, r9, r10, r11;
36 u64 r12, r13, r14, r15;
37 u64 status, dla, dse, lat;
38};
39
40/*
41 * A debug store configuration.
42 *
43 * We only support architectures that use 64bit fields.
44 */
45struct debug_store {
46 u64 bts_buffer_base;
47 u64 bts_index;
48 u64 bts_absolute_maximum;
49 u64 bts_interrupt_threshold;
50 u64 pebs_buffer_base;
51 u64 pebs_index;
52 u64 pebs_absolute_maximum;
53 u64 pebs_interrupt_threshold;
54 u64 pebs_event_reset[MAX_PEBS_EVENTS];
55};
56
57static void init_debug_store_on_cpu(int cpu)
58{
59 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
60
61 if (!ds)
62 return;
63
64 wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA,
65 (u32)((u64)(unsigned long)ds),
66 (u32)((u64)(unsigned long)ds >> 32));
67}
68
69static void fini_debug_store_on_cpu(int cpu)
70{
71 if (!per_cpu(cpu_hw_events, cpu).ds)
72 return;
73
74 wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
75}
76
77static void release_ds_buffers(void)
78{
79 int cpu;
80
81 if (!x86_pmu.bts && !x86_pmu.pebs)
82 return;
83
84 get_online_cpus();
85
86 for_each_online_cpu(cpu)
87 fini_debug_store_on_cpu(cpu);
88
89 for_each_possible_cpu(cpu) {
90 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
91
92 if (!ds)
93 continue;
94
95 per_cpu(cpu_hw_events, cpu).ds = NULL;
96
97 kfree((void *)(unsigned long)ds->pebs_buffer_base);
98 kfree((void *)(unsigned long)ds->bts_buffer_base);
99 kfree(ds);
100 }
101
102 put_online_cpus();
103}
104
105static int reserve_ds_buffers(void)
106{
107 int cpu, err = 0;
108
109 if (!x86_pmu.bts && !x86_pmu.pebs)
110 return 0;
111
112 get_online_cpus();
113
114 for_each_possible_cpu(cpu) {
115 struct debug_store *ds;
116 void *buffer;
117 int max, thresh;
118
119 err = -ENOMEM;
120 ds = kzalloc(sizeof(*ds), GFP_KERNEL);
121 if (unlikely(!ds))
122 break;
123 per_cpu(cpu_hw_events, cpu).ds = ds;
124
125 if (x86_pmu.bts) {
126 buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL);
127 if (unlikely(!buffer))
128 break;
129
130 max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;
131 thresh = max / 16;
132
133 ds->bts_buffer_base = (u64)(unsigned long)buffer;
134 ds->bts_index = ds->bts_buffer_base;
135 ds->bts_absolute_maximum = ds->bts_buffer_base +
136 max * BTS_RECORD_SIZE;
137 ds->bts_interrupt_threshold = ds->bts_absolute_maximum -
138 thresh * BTS_RECORD_SIZE;
139 }
140
141 if (x86_pmu.pebs) {
142 buffer = kzalloc(PEBS_BUFFER_SIZE, GFP_KERNEL);
143 if (unlikely(!buffer))
144 break;
145
146 max = PEBS_BUFFER_SIZE / x86_pmu.pebs_record_size;
147
148 ds->pebs_buffer_base = (u64)(unsigned long)buffer;
149 ds->pebs_index = ds->pebs_buffer_base;
150 ds->pebs_absolute_maximum = ds->pebs_buffer_base +
151 max * x86_pmu.pebs_record_size;
152 /*
153 * Always use single record PEBS
154 */
155 ds->pebs_interrupt_threshold = ds->pebs_buffer_base +
156 x86_pmu.pebs_record_size;
157 }
158
159 err = 0;
160 }
161
162 if (err)
163 release_ds_buffers();
164 else {
165 for_each_online_cpu(cpu)
166 init_debug_store_on_cpu(cpu);
167 }
168
169 put_online_cpus();
170
171 return err;
172}
173
174/*
175 * BTS
176 */
177
178static struct event_constraint bts_constraint =
179 EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_FIXED_BTS, 0);
180
181static void intel_pmu_enable_bts(u64 config)
182{
183 unsigned long debugctlmsr;
184
185 debugctlmsr = get_debugctlmsr();
186
187 debugctlmsr |= DEBUGCTLMSR_TR;
188 debugctlmsr |= DEBUGCTLMSR_BTS;
189 debugctlmsr |= DEBUGCTLMSR_BTINT;
190
191 if (!(config & ARCH_PERFMON_EVENTSEL_OS))
192 debugctlmsr |= DEBUGCTLMSR_BTS_OFF_OS;
193
194 if (!(config & ARCH_PERFMON_EVENTSEL_USR))
195 debugctlmsr |= DEBUGCTLMSR_BTS_OFF_USR;
196
197 update_debugctlmsr(debugctlmsr);
198}
199
200static void intel_pmu_disable_bts(void)
201{
202 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
203 unsigned long debugctlmsr;
204
205 if (!cpuc->ds)
206 return;
207
208 debugctlmsr = get_debugctlmsr();
209
210 debugctlmsr &=
211 ~(DEBUGCTLMSR_TR | DEBUGCTLMSR_BTS | DEBUGCTLMSR_BTINT |
212 DEBUGCTLMSR_BTS_OFF_OS | DEBUGCTLMSR_BTS_OFF_USR);
213
214 update_debugctlmsr(debugctlmsr);
215}
216
217static void intel_pmu_drain_bts_buffer(void)
218{
219 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
220 struct debug_store *ds = cpuc->ds;
221 struct bts_record {
222 u64 from;
223 u64 to;
224 u64 flags;
225 };
226 struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS];
227 struct bts_record *at, *top;
228 struct perf_output_handle handle;
229 struct perf_event_header header;
230 struct perf_sample_data data;
231 struct pt_regs regs;
232
233 if (!event)
234 return;
235
236 if (!ds)
237 return;
238
239 at = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
240 top = (struct bts_record *)(unsigned long)ds->bts_index;
241
242 if (top <= at)
243 return;
244
245 ds->bts_index = ds->bts_buffer_base;
246
247 perf_sample_data_init(&data, 0);
248 data.period = event->hw.last_period;
249 regs.ip = 0;
250
251 /*
252 * Prepare a generic sample, i.e. fill in the invariant fields.
253 * We will overwrite the from and to address before we output
254 * the sample.
255 */
256 perf_prepare_sample(&header, &data, event, &regs);
257
258 if (perf_output_begin(&handle, event, header.size * (top - at), 1, 1))
259 return;
260
261 for (; at < top; at++) {
262 data.ip = at->from;
263 data.addr = at->to;
264
265 perf_output_sample(&handle, &header, &data, event);
266 }
267
268 perf_output_end(&handle);
269
270 /* There's new data available. */
271 event->hw.interrupts++;
272 event->pending_kill = POLL_IN;
273}
274
275/*
276 * PEBS
277 */
278
279static struct event_constraint intel_core_pebs_events[] = {
280 PEBS_EVENT_CONSTRAINT(0x00c0, 0x1), /* INSTR_RETIRED.ANY */
281 PEBS_EVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */
282 PEBS_EVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */
 283	PEBS_EVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETIRED.ANY */
284 PEBS_EVENT_CONSTRAINT(0x01cb, 0x1), /* MEM_LOAD_RETIRED.L1D_MISS */
285 PEBS_EVENT_CONSTRAINT(0x02cb, 0x1), /* MEM_LOAD_RETIRED.L1D_LINE_MISS */
286 PEBS_EVENT_CONSTRAINT(0x04cb, 0x1), /* MEM_LOAD_RETIRED.L2_MISS */
287 PEBS_EVENT_CONSTRAINT(0x08cb, 0x1), /* MEM_LOAD_RETIRED.L2_LINE_MISS */
288 PEBS_EVENT_CONSTRAINT(0x10cb, 0x1), /* MEM_LOAD_RETIRED.DTLB_MISS */
289 EVENT_CONSTRAINT_END
290};
291
292static struct event_constraint intel_nehalem_pebs_events[] = {
293 PEBS_EVENT_CONSTRAINT(0x00c0, 0xf), /* INSTR_RETIRED.ANY */
294 PEBS_EVENT_CONSTRAINT(0xfec1, 0xf), /* X87_OPS_RETIRED.ANY */
295 PEBS_EVENT_CONSTRAINT(0x00c5, 0xf), /* BR_INST_RETIRED.MISPRED */
296 PEBS_EVENT_CONSTRAINT(0x1fc7, 0xf), /* SIMD_INST_RETURED.ANY */
297 PEBS_EVENT_CONSTRAINT(0x01cb, 0xf), /* MEM_LOAD_RETIRED.L1D_MISS */
298 PEBS_EVENT_CONSTRAINT(0x02cb, 0xf), /* MEM_LOAD_RETIRED.L1D_LINE_MISS */
299 PEBS_EVENT_CONSTRAINT(0x04cb, 0xf), /* MEM_LOAD_RETIRED.L2_MISS */
300 PEBS_EVENT_CONSTRAINT(0x08cb, 0xf), /* MEM_LOAD_RETIRED.L2_LINE_MISS */
301 PEBS_EVENT_CONSTRAINT(0x10cb, 0xf), /* MEM_LOAD_RETIRED.DTLB_MISS */
302 EVENT_CONSTRAINT_END
303};
304
305static struct event_constraint *
306intel_pebs_constraints(struct perf_event *event)
307{
308 struct event_constraint *c;
309
310 if (!event->attr.precise_ip)
311 return NULL;
312
313 if (x86_pmu.pebs_constraints) {
314 for_each_event_constraint(c, x86_pmu.pebs_constraints) {
315 if ((event->hw.config & c->cmask) == c->code)
316 return c;
317 }
318 }
319
320 return &emptyconstraint;
321}
322
323static void intel_pmu_pebs_enable(struct perf_event *event)
324{
325 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
326 struct hw_perf_event *hwc = &event->hw;
327
328 hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;
329
330 cpuc->pebs_enabled |= 1ULL << hwc->idx;
331 WARN_ON_ONCE(cpuc->enabled);
332
333 if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1)
334 intel_pmu_lbr_enable(event);
335}
336
337static void intel_pmu_pebs_disable(struct perf_event *event)
338{
339 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
340 struct hw_perf_event *hwc = &event->hw;
341
342 cpuc->pebs_enabled &= ~(1ULL << hwc->idx);
343 if (cpuc->enabled)
344 wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
345
346 hwc->config |= ARCH_PERFMON_EVENTSEL_INT;
347
348 if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1)
349 intel_pmu_lbr_disable(event);
350}
351
352static void intel_pmu_pebs_enable_all(void)
353{
354 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
355
356 if (cpuc->pebs_enabled)
357 wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
358}
359
360static void intel_pmu_pebs_disable_all(void)
361{
362 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
363
364 if (cpuc->pebs_enabled)
365 wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
366}
367
368#include <asm/insn.h>
369
370static inline bool kernel_ip(unsigned long ip)
371{
372#ifdef CONFIG_X86_32
373 return ip > PAGE_OFFSET;
374#else
375 return (long)ip < 0;
376#endif
377}
378
379static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
380{
381 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
382 unsigned long from = cpuc->lbr_entries[0].from;
383 unsigned long old_to, to = cpuc->lbr_entries[0].to;
384 unsigned long ip = regs->ip;
385
386 /*
 387	 * We don't need to fix up if the PEBS assist is fault-like
388 */
389 if (!x86_pmu.intel_cap.pebs_trap)
390 return 1;
391
392 /*
393 * No LBR entry, no basic block, no rewinding
394 */
395 if (!cpuc->lbr_stack.nr || !from || !to)
396 return 0;
397
398 /*
399 * Basic blocks should never cross user/kernel boundaries
400 */
401 if (kernel_ip(ip) != kernel_ip(to))
402 return 0;
403
404 /*
405 * unsigned math, either ip is before the start (impossible) or
406 * the basic block is larger than 1 page (sanity)
407 */
408 if ((ip - to) > PAGE_SIZE)
409 return 0;
410
411 /*
412 * We sampled a branch insn, rewind using the LBR stack
413 */
414 if (ip == to) {
415 regs->ip = from;
416 return 1;
417 }
418
419 do {
420 struct insn insn;
421 u8 buf[MAX_INSN_SIZE];
422 void *kaddr;
423
424 old_to = to;
425 if (!kernel_ip(ip)) {
426 int bytes, size = MAX_INSN_SIZE;
427
428 bytes = copy_from_user_nmi(buf, (void __user *)to, size);
429 if (bytes != size)
430 return 0;
431
432 kaddr = buf;
433 } else
434 kaddr = (void *)to;
435
436 kernel_insn_init(&insn, kaddr);
437 insn_get_length(&insn);
438 to += insn.length;
439 } while (to < ip);
440
441 if (to == ip) {
442 regs->ip = old_to;
443 return 1;
444 }
445
446 /*
447 * Even though we decoded the basic block, the instruction stream
448 * never matched the given IP, either the TO or the IP got corrupted.
449 */
450 return 0;
451}
452
453static int intel_pmu_save_and_restart(struct perf_event *event);
454
455static void __intel_pmu_pebs_event(struct perf_event *event,
456 struct pt_regs *iregs, void *__pebs)
457{
458 /*
459 * We cast to pebs_record_core since that is a subset of
460 * both formats and we don't use the other fields in this
461 * routine.
462 */
463 struct pebs_record_core *pebs = __pebs;
464 struct perf_sample_data data;
465 struct pt_regs regs;
466
467 if (!intel_pmu_save_and_restart(event))
468 return;
469
470 perf_sample_data_init(&data, 0);
471 data.period = event->hw.last_period;
472
473 /*
474 * We use the interrupt regs as a base because the PEBS record
475 * does not contain a full regs set, specifically it seems to
476 * lack segment descriptors, which get used by things like
477 * user_mode().
478 *
479 * In the simple case fix up only the IP and BP,SP regs, for
480 * PERF_SAMPLE_IP and PERF_SAMPLE_CALLCHAIN to function properly.
481 * A possible PERF_SAMPLE_REGS will have to transfer all regs.
482 */
483 regs = *iregs;
484 regs.ip = pebs->ip;
485 regs.bp = pebs->bp;
486 regs.sp = pebs->sp;
487
488 if (event->attr.precise_ip > 1 && intel_pmu_pebs_fixup_ip(&regs))
489 regs.flags |= PERF_EFLAGS_EXACT;
490 else
491 regs.flags &= ~PERF_EFLAGS_EXACT;
492
493 if (perf_event_overflow(event, 1, &data, &regs))
494 x86_pmu_stop(event);
495}
496
497static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
498{
499 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
500 struct debug_store *ds = cpuc->ds;
501 struct perf_event *event = cpuc->events[0]; /* PMC0 only */
502 struct pebs_record_core *at, *top;
503 int n;
504
505 if (!ds || !x86_pmu.pebs)
506 return;
507
508 at = (struct pebs_record_core *)(unsigned long)ds->pebs_buffer_base;
509 top = (struct pebs_record_core *)(unsigned long)ds->pebs_index;
510
511 /*
512 * Whatever else happens, drain the thing
513 */
514 ds->pebs_index = ds->pebs_buffer_base;
515
516 if (!test_bit(0, cpuc->active_mask))
517 return;
518
519 WARN_ON_ONCE(!event);
520
521 if (!event->attr.precise_ip)
522 return;
523
524 n = top - at;
525 if (n <= 0)
526 return;
527
528 /*
529 * Should not happen, we program the threshold at 1 and do not
530 * set a reset value.
531 */
532 WARN_ON_ONCE(n > 1);
533 at += n - 1;
534
535 __intel_pmu_pebs_event(event, iregs, at);
536}
537
538static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
539{
540 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
541 struct debug_store *ds = cpuc->ds;
542 struct pebs_record_nhm *at, *top;
543 struct perf_event *event = NULL;
544 u64 status = 0;
545 int bit, n;
546
547 if (!ds || !x86_pmu.pebs)
548 return;
549
550 at = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base;
551 top = (struct pebs_record_nhm *)(unsigned long)ds->pebs_index;
552
553 ds->pebs_index = ds->pebs_buffer_base;
554
555 n = top - at;
556 if (n <= 0)
557 return;
558
559 /*
560 * Should not happen, we program the threshold at 1 and do not
561 * set a reset value.
562 */
563 WARN_ON_ONCE(n > MAX_PEBS_EVENTS);
564
565 for ( ; at < top; at++) {
566 for_each_set_bit(bit, (unsigned long *)&at->status, MAX_PEBS_EVENTS) {
567 event = cpuc->events[bit];
568 if (!test_bit(bit, cpuc->active_mask))
569 continue;
570
571 WARN_ON_ONCE(!event);
572
573 if (!event->attr.precise_ip)
574 continue;
575
576 if (__test_and_set_bit(bit, (unsigned long *)&status))
577 continue;
578
579 break;
580 }
581
582 if (!event || bit >= MAX_PEBS_EVENTS)
583 continue;
584
585 __intel_pmu_pebs_event(event, iregs, at);
586 }
587}
588
589/*
590 * BTS, PEBS probe and setup
591 */
592
593static void intel_ds_init(void)
594{
595 /*
596 * No support for 32bit formats
597 */
598 if (!boot_cpu_has(X86_FEATURE_DTES64))
599 return;
600
601 x86_pmu.bts = boot_cpu_has(X86_FEATURE_BTS);
602 x86_pmu.pebs = boot_cpu_has(X86_FEATURE_PEBS);
603 if (x86_pmu.pebs) {
604 char pebs_type = x86_pmu.intel_cap.pebs_trap ? '+' : '-';
605 int format = x86_pmu.intel_cap.pebs_format;
606
607 switch (format) {
608 case 0:
609 printk(KERN_CONT "PEBS fmt0%c, ", pebs_type);
610 x86_pmu.pebs_record_size = sizeof(struct pebs_record_core);
611 x86_pmu.drain_pebs = intel_pmu_drain_pebs_core;
612 x86_pmu.pebs_constraints = intel_core_pebs_events;
613 break;
614
615 case 1:
616 printk(KERN_CONT "PEBS fmt1%c, ", pebs_type);
617 x86_pmu.pebs_record_size = sizeof(struct pebs_record_nhm);
618 x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
619 x86_pmu.pebs_constraints = intel_nehalem_pebs_events;
620 break;
621
622 default:
623 printk(KERN_CONT "no PEBS fmt%d%c, ", format, pebs_type);
624 x86_pmu.pebs = 0;
625 break;
626 }
627 }
628}
629
630#else /* CONFIG_CPU_SUP_INTEL */
631
632static int reserve_ds_buffers(void)
633{
634 return 0;
635}
636
637static void release_ds_buffers(void)
638{
639}
640
641#endif /* CONFIG_CPU_SUP_INTEL */
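From userspace the PEBS path above is selected through perf_event_attr.precise_ip: 0 allows arbitrary skid, 1 asks for constant skid (plain PEBS, where the reported IP trails the sampled instruction), and values above 1 request zero skid, which is what enables the LBR-based fixup in __intel_pmu_pebs_event(). A reduced, illustrative request (error handling and mmap/ring-buffer setup omitted):

	#include <linux/perf_event.h>
	#include <string.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	/* Sample retired instructions on the calling thread with zero-skid IPs. */
	static int open_precise_instructions(void)
	{
		struct perf_event_attr attr;

		memset(&attr, 0, sizeof(attr));
		attr.size          = sizeof(attr);
		attr.type          = PERF_TYPE_HARDWARE;
		attr.config        = PERF_COUNT_HW_INSTRUCTIONS;	/* event 0xc0, PEBS-capable */
		attr.sample_period = 100000;
		attr.sample_type   = PERF_SAMPLE_IP;
		attr.precise_ip    = 2;	/* request zero skid; open fails if PEBS is unavailable */

		return syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	}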
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
new file mode 100644
index 000000000000..d202c1bece1a
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -0,0 +1,218 @@
1#ifdef CONFIG_CPU_SUP_INTEL
2
3enum {
4 LBR_FORMAT_32 = 0x00,
5 LBR_FORMAT_LIP = 0x01,
6 LBR_FORMAT_EIP = 0x02,
7 LBR_FORMAT_EIP_FLAGS = 0x03,
8};
9
10/*
 11 * We only support LBR implementations that have FREEZE_LBRS_ON_PMI;
 12 * otherwise it becomes nearly impossible to get a reliable stack.
13 */
14
15static void __intel_pmu_lbr_enable(void)
16{
17 u64 debugctl;
18
19 rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
20 debugctl |= (DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
21 wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
22}
23
24static void __intel_pmu_lbr_disable(void)
25{
26 u64 debugctl;
27
28 rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
29 debugctl &= ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
30 wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
31}
32
33static void intel_pmu_lbr_reset_32(void)
34{
35 int i;
36
37 for (i = 0; i < x86_pmu.lbr_nr; i++)
38 wrmsrl(x86_pmu.lbr_from + i, 0);
39}
40
41static void intel_pmu_lbr_reset_64(void)
42{
43 int i;
44
45 for (i = 0; i < x86_pmu.lbr_nr; i++) {
46 wrmsrl(x86_pmu.lbr_from + i, 0);
47 wrmsrl(x86_pmu.lbr_to + i, 0);
48 }
49}
50
51static void intel_pmu_lbr_reset(void)
52{
53 if (!x86_pmu.lbr_nr)
54 return;
55
56 if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32)
57 intel_pmu_lbr_reset_32();
58 else
59 intel_pmu_lbr_reset_64();
60}
61
62static void intel_pmu_lbr_enable(struct perf_event *event)
63{
64 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
65
66 if (!x86_pmu.lbr_nr)
67 return;
68
69 WARN_ON_ONCE(cpuc->enabled);
70
71 /*
72 * Reset the LBR stack if we changed task context to
73 * avoid data leaks.
74 */
75
76 if (event->ctx->task && cpuc->lbr_context != event->ctx) {
77 intel_pmu_lbr_reset();
78 cpuc->lbr_context = event->ctx;
79 }
80
81 cpuc->lbr_users++;
82}
83
84static void intel_pmu_lbr_disable(struct perf_event *event)
85{
86 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
87
88 if (!x86_pmu.lbr_nr)
89 return;
90
91 cpuc->lbr_users--;
92 WARN_ON_ONCE(cpuc->lbr_users < 0);
93
94 if (cpuc->enabled && !cpuc->lbr_users)
95 __intel_pmu_lbr_disable();
96}
97
98static void intel_pmu_lbr_enable_all(void)
99{
100 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
101
102 if (cpuc->lbr_users)
103 __intel_pmu_lbr_enable();
104}
105
106static void intel_pmu_lbr_disable_all(void)
107{
108 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
109
110 if (cpuc->lbr_users)
111 __intel_pmu_lbr_disable();
112}
113
114static inline u64 intel_pmu_lbr_tos(void)
115{
116 u64 tos;
117
118 rdmsrl(x86_pmu.lbr_tos, tos);
119
120 return tos;
121}
122
123static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
124{
125 unsigned long mask = x86_pmu.lbr_nr - 1;
126 u64 tos = intel_pmu_lbr_tos();
127 int i;
128
129 for (i = 0; i < x86_pmu.lbr_nr; i++) {
130 unsigned long lbr_idx = (tos - i) & mask;
131 union {
132 struct {
133 u32 from;
134 u32 to;
135 };
136 u64 lbr;
137 } msr_lastbranch;
138
139 rdmsrl(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr);
140
141 cpuc->lbr_entries[i].from = msr_lastbranch.from;
142 cpuc->lbr_entries[i].to = msr_lastbranch.to;
143 cpuc->lbr_entries[i].flags = 0;
144 }
145 cpuc->lbr_stack.nr = i;
146}
147
148#define LBR_FROM_FLAG_MISPRED (1ULL << 63)
149
150/*
151 * Due to lack of segmentation in Linux the effective address (offset)
152 * is the same as the linear address, allowing us to merge the LIP and EIP
153 * LBR formats.
154 */
155static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
156{
157 unsigned long mask = x86_pmu.lbr_nr - 1;
158 int lbr_format = x86_pmu.intel_cap.lbr_format;
159 u64 tos = intel_pmu_lbr_tos();
160 int i;
161
162 for (i = 0; i < x86_pmu.lbr_nr; i++) {
163 unsigned long lbr_idx = (tos - i) & mask;
164 u64 from, to, flags = 0;
165
166 rdmsrl(x86_pmu.lbr_from + lbr_idx, from);
167 rdmsrl(x86_pmu.lbr_to + lbr_idx, to);
168
169 if (lbr_format == LBR_FORMAT_EIP_FLAGS) {
170 flags = !!(from & LBR_FROM_FLAG_MISPRED);
171 from = (u64)((((s64)from) << 1) >> 1);
172 }
173
174 cpuc->lbr_entries[i].from = from;
175 cpuc->lbr_entries[i].to = to;
176 cpuc->lbr_entries[i].flags = flags;
177 }
178 cpuc->lbr_stack.nr = i;
179}
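In LBR_FORMAT_EIP_FLAGS the top bit of the raw FROM value carries the misprediction flag; the shift pair above copies bit 62 back into bit 63, which drops the flag and, for addresses in the canonical kernel half, restores the original value. A standalone illustration (plain C, assuming an arithmetic right shift on signed values, as gcc on x86 provides):

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		/* Raw MSR value: flag bit (63) clear, branch source 0xffffffff81012345. */
		uint64_t raw   = 0x7fffffff81012345ULL;
		uint64_t flags = !!(raw & (1ULL << 63));		/* 0: branch was predicted */
		uint64_t from  = (uint64_t)((int64_t)(raw << 1) >> 1);	/* sign-extend bit 62 */

		printf("flags=%llu from=%#llx\n",
		       (unsigned long long)flags, (unsigned long long)from);
		/* prints: flags=0 from=0xffffffff81012345 */
		return 0;
	}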
180
181static void intel_pmu_lbr_read(void)
182{
183 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
184
185 if (!cpuc->lbr_users)
186 return;
187
188 if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32)
189 intel_pmu_lbr_read_32(cpuc);
190 else
191 intel_pmu_lbr_read_64(cpuc);
192}
193
194static void intel_pmu_lbr_init_core(void)
195{
196 x86_pmu.lbr_nr = 4;
197 x86_pmu.lbr_tos = 0x01c9;
198 x86_pmu.lbr_from = 0x40;
199 x86_pmu.lbr_to = 0x60;
200}
201
202static void intel_pmu_lbr_init_nhm(void)
203{
204 x86_pmu.lbr_nr = 16;
205 x86_pmu.lbr_tos = 0x01c9;
206 x86_pmu.lbr_from = 0x680;
207 x86_pmu.lbr_to = 0x6c0;
208}
209
210static void intel_pmu_lbr_init_atom(void)
211{
212 x86_pmu.lbr_nr = 8;
213 x86_pmu.lbr_tos = 0x01c9;
214 x86_pmu.lbr_from = 0x40;
215 x86_pmu.lbr_to = 0x60;
216}
217
218#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
new file mode 100644
index 000000000000..107711bf0ee8
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -0,0 +1,942 @@
1/*
 2 * Netburst Performance Events (P4, old Xeon)
3 *
4 * Copyright (C) 2010 Parallels, Inc., Cyrill Gorcunov <gorcunov@openvz.org>
5 * Copyright (C) 2010 Intel Corporation, Lin Ming <ming.m.lin@intel.com>
6 *
7 * For licencing details see kernel-base/COPYING
8 */
9
10#ifdef CONFIG_CPU_SUP_INTEL
11
12#include <asm/perf_event_p4.h>
13
14#define P4_CNTR_LIMIT 3
15/*
16 * array indices: 0,1 - HT threads, used with HT enabled cpu
17 */
18struct p4_event_bind {
19 unsigned int opcode; /* Event code and ESCR selector */
20 unsigned int escr_msr[2]; /* ESCR MSR for this event */
 21	char cntr[2][P4_CNTR_LIMIT]; /* counter index (offset), -1 on absence */
22};
23
24struct p4_pebs_bind {
25 unsigned int metric_pebs;
26 unsigned int metric_vert;
27};
28
29/* it sets P4_PEBS_ENABLE_UOP_TAG as well */
30#define P4_GEN_PEBS_BIND(name, pebs, vert) \
31 [P4_PEBS_METRIC__##name] = { \
32 .metric_pebs = pebs | P4_PEBS_ENABLE_UOP_TAG, \
33 .metric_vert = vert, \
34 }
35
36/*
37 * note we have P4_PEBS_ENABLE_UOP_TAG always set here
38 *
39 * it's needed for mapping P4_PEBS_CONFIG_METRIC_MASK bits of
40 * event configuration to find out which values are to be
41 * written into MSR_IA32_PEBS_ENABLE and MSR_P4_PEBS_MATRIX_VERT
 42 * registers
43 */
44static struct p4_pebs_bind p4_pebs_bind_map[] = {
45 P4_GEN_PEBS_BIND(1stl_cache_load_miss_retired, 0x0000001, 0x0000001),
46 P4_GEN_PEBS_BIND(2ndl_cache_load_miss_retired, 0x0000002, 0x0000001),
47 P4_GEN_PEBS_BIND(dtlb_load_miss_retired, 0x0000004, 0x0000001),
48 P4_GEN_PEBS_BIND(dtlb_store_miss_retired, 0x0000004, 0x0000002),
49 P4_GEN_PEBS_BIND(dtlb_all_miss_retired, 0x0000004, 0x0000003),
50 P4_GEN_PEBS_BIND(tagged_mispred_branch, 0x0018000, 0x0000010),
51 P4_GEN_PEBS_BIND(mob_load_replay_retired, 0x0000200, 0x0000001),
52 P4_GEN_PEBS_BIND(split_load_retired, 0x0000400, 0x0000001),
53 P4_GEN_PEBS_BIND(split_store_retired, 0x0000400, 0x0000002),
54};
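Each p4_pebs_bind_map entry pairs the value destined for MSR_IA32_PEBS_ENABLE with the matching bit for MSR_P4_PEBS_MATRIX_VERT. A minimal sketch of how one entry might be consumed when a replay-metric event is enabled; the helper name and the way the metric index is obtained are illustrative, not taken from this file.

	static void apply_p4_pebs_bind(unsigned int metric_idx)
	{
		struct p4_pebs_bind *bind;

		if (metric_idx >= ARRAY_SIZE(p4_pebs_bind_map))
			return;

		bind = &p4_pebs_bind_map[metric_idx];

		/* Select the replay metric (P4_PEBS_ENABLE_UOP_TAG is already folded in)... */
		wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)bind->metric_pebs);
		/* ...and the matching vertical-matrix bit. */
		wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)bind->metric_vert);
	}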
55
56/*
57 * Note that we don't use CCCR1 here, there is an
58 * exception for P4_BSQ_ALLOCATION but we just have
59 * no workaround
60 *
61 * consider this binding as resources which particular
62 * event may borrow, it doesn't contain EventMask,
63 * Tags and friends -- they are left to a caller
64 */
65static struct p4_event_bind p4_event_bind_map[] = {
66 [P4_EVENT_TC_DELIVER_MODE] = {
67 .opcode = P4_OPCODE(P4_EVENT_TC_DELIVER_MODE),
68 .escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 },
69 .cntr = { {4, 5, -1}, {6, 7, -1} },
70 },
71 [P4_EVENT_BPU_FETCH_REQUEST] = {
72 .opcode = P4_OPCODE(P4_EVENT_BPU_FETCH_REQUEST),
73 .escr_msr = { MSR_P4_BPU_ESCR0, MSR_P4_BPU_ESCR1 },
74 .cntr = { {0, -1, -1}, {2, -1, -1} },
75 },
76 [P4_EVENT_ITLB_REFERENCE] = {
77 .opcode = P4_OPCODE(P4_EVENT_ITLB_REFERENCE),
78 .escr_msr = { MSR_P4_ITLB_ESCR0, MSR_P4_ITLB_ESCR1 },
79 .cntr = { {0, -1, -1}, {2, -1, -1} },
80 },
81 [P4_EVENT_MEMORY_CANCEL] = {
82 .opcode = P4_OPCODE(P4_EVENT_MEMORY_CANCEL),
83 .escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 },
84 .cntr = { {8, 9, -1}, {10, 11, -1} },
85 },
86 [P4_EVENT_MEMORY_COMPLETE] = {
87 .opcode = P4_OPCODE(P4_EVENT_MEMORY_COMPLETE),
88 .escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 },
89 .cntr = { {8, 9, -1}, {10, 11, -1} },
90 },
91 [P4_EVENT_LOAD_PORT_REPLAY] = {
92 .opcode = P4_OPCODE(P4_EVENT_LOAD_PORT_REPLAY),
93 .escr_msr = { MSR_P4_SAAT_ESCR0, MSR_P4_SAAT_ESCR1 },
94 .cntr = { {8, 9, -1}, {10, 11, -1} },
95 },
96 [P4_EVENT_STORE_PORT_REPLAY] = {
97 .opcode = P4_OPCODE(P4_EVENT_STORE_PORT_REPLAY),
98 .escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 },
99 .cntr = { {8, 9, -1}, {10, 11, -1} },
100 },
101 [P4_EVENT_MOB_LOAD_REPLAY] = {
102 .opcode = P4_OPCODE(P4_EVENT_MOB_LOAD_REPLAY),
103 .escr_msr = { MSR_P4_MOB_ESCR0, MSR_P4_MOB_ESCR1 },
104 .cntr = { {0, -1, -1}, {2, -1, -1} },
105 },
106 [P4_EVENT_PAGE_WALK_TYPE] = {
107 .opcode = P4_OPCODE(P4_EVENT_PAGE_WALK_TYPE),
108 .escr_msr = { MSR_P4_PMH_ESCR0, MSR_P4_PMH_ESCR1 },
109 .cntr = { {0, -1, -1}, {2, -1, -1} },
110 },
111 [P4_EVENT_BSQ_CACHE_REFERENCE] = {
112 .opcode = P4_OPCODE(P4_EVENT_BSQ_CACHE_REFERENCE),
113 .escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1 },
114 .cntr = { {0, -1, -1}, {2, -1, -1} },
115 },
116 [P4_EVENT_IOQ_ALLOCATION] = {
117 .opcode = P4_OPCODE(P4_EVENT_IOQ_ALLOCATION),
118 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
119 .cntr = { {0, -1, -1}, {2, -1, -1} },
120 },
121 [P4_EVENT_IOQ_ACTIVE_ENTRIES] = { /* shared ESCR */
122 .opcode = P4_OPCODE(P4_EVENT_IOQ_ACTIVE_ENTRIES),
123 .escr_msr = { MSR_P4_FSB_ESCR1, MSR_P4_FSB_ESCR1 },
124 .cntr = { {2, -1, -1}, {3, -1, -1} },
125 },
126 [P4_EVENT_FSB_DATA_ACTIVITY] = {
127 .opcode = P4_OPCODE(P4_EVENT_FSB_DATA_ACTIVITY),
128 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
129 .cntr = { {0, -1, -1}, {2, -1, -1} },
130 },
131 [P4_EVENT_BSQ_ALLOCATION] = { /* shared ESCR, broken CCCR1 */
132 .opcode = P4_OPCODE(P4_EVENT_BSQ_ALLOCATION),
133 .escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR0 },
134 .cntr = { {0, -1, -1}, {1, -1, -1} },
135 },
136 [P4_EVENT_BSQ_ACTIVE_ENTRIES] = { /* shared ESCR */
137 .opcode = P4_OPCODE(P4_EVENT_BSQ_ACTIVE_ENTRIES),
138 .escr_msr = { MSR_P4_BSU_ESCR1 , MSR_P4_BSU_ESCR1 },
139 .cntr = { {2, -1, -1}, {3, -1, -1} },
140 },
141 [P4_EVENT_SSE_INPUT_ASSIST] = {
142 .opcode = P4_OPCODE(P4_EVENT_SSE_INPUT_ASSIST),
143 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
144 .cntr = { {8, 9, -1}, {10, 11, -1} },
145 },
146 [P4_EVENT_PACKED_SP_UOP] = {
147 .opcode = P4_OPCODE(P4_EVENT_PACKED_SP_UOP),
148 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
149 .cntr = { {8, 9, -1}, {10, 11, -1} },
150 },
151 [P4_EVENT_PACKED_DP_UOP] = {
152 .opcode = P4_OPCODE(P4_EVENT_PACKED_DP_UOP),
153 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
154 .cntr = { {8, 9, -1}, {10, 11, -1} },
155 },
156 [P4_EVENT_SCALAR_SP_UOP] = {
157 .opcode = P4_OPCODE(P4_EVENT_SCALAR_SP_UOP),
158 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
159 .cntr = { {8, 9, -1}, {10, 11, -1} },
160 },
161 [P4_EVENT_SCALAR_DP_UOP] = {
162 .opcode = P4_OPCODE(P4_EVENT_SCALAR_DP_UOP),
163 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
164 .cntr = { {8, 9, -1}, {10, 11, -1} },
165 },
166 [P4_EVENT_64BIT_MMX_UOP] = {
167 .opcode = P4_OPCODE(P4_EVENT_64BIT_MMX_UOP),
168 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
169 .cntr = { {8, 9, -1}, {10, 11, -1} },
170 },
171 [P4_EVENT_128BIT_MMX_UOP] = {
172 .opcode = P4_OPCODE(P4_EVENT_128BIT_MMX_UOP),
173 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
174 .cntr = { {8, 9, -1}, {10, 11, -1} },
175 },
176 [P4_EVENT_X87_FP_UOP] = {
177 .opcode = P4_OPCODE(P4_EVENT_X87_FP_UOP),
178 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
179 .cntr = { {8, 9, -1}, {10, 11, -1} },
180 },
181 [P4_EVENT_TC_MISC] = {
182 .opcode = P4_OPCODE(P4_EVENT_TC_MISC),
183 .escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 },
184 .cntr = { {4, 5, -1}, {6, 7, -1} },
185 },
186 [P4_EVENT_GLOBAL_POWER_EVENTS] = {
187 .opcode = P4_OPCODE(P4_EVENT_GLOBAL_POWER_EVENTS),
188 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
189 .cntr = { {0, -1, -1}, {2, -1, -1} },
190 },
191 [P4_EVENT_TC_MS_XFER] = {
192 .opcode = P4_OPCODE(P4_EVENT_TC_MS_XFER),
193 .escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 },
194 .cntr = { {4, 5, -1}, {6, 7, -1} },
195 },
196 [P4_EVENT_UOP_QUEUE_WRITES] = {
197 .opcode = P4_OPCODE(P4_EVENT_UOP_QUEUE_WRITES),
198 .escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 },
199 .cntr = { {4, 5, -1}, {6, 7, -1} },
200 },
201 [P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE] = {
202 .opcode = P4_OPCODE(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE),
203 .escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR0 },
204 .cntr = { {4, 5, -1}, {6, 7, -1} },
205 },
206 [P4_EVENT_RETIRED_BRANCH_TYPE] = {
207 .opcode = P4_OPCODE(P4_EVENT_RETIRED_BRANCH_TYPE),
208 .escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR1 },
209 .cntr = { {4, 5, -1}, {6, 7, -1} },
210 },
211 [P4_EVENT_RESOURCE_STALL] = {
212 .opcode = P4_OPCODE(P4_EVENT_RESOURCE_STALL),
213 .escr_msr = { MSR_P4_ALF_ESCR0, MSR_P4_ALF_ESCR1 },
214 .cntr = { {12, 13, 16}, {14, 15, 17} },
215 },
216 [P4_EVENT_WC_BUFFER] = {
217 .opcode = P4_OPCODE(P4_EVENT_WC_BUFFER),
218 .escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 },
219 .cntr = { {8, 9, -1}, {10, 11, -1} },
220 },
221 [P4_EVENT_B2B_CYCLES] = {
222 .opcode = P4_OPCODE(P4_EVENT_B2B_CYCLES),
223 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
224 .cntr = { {0, -1, -1}, {2, -1, -1} },
225 },
226 [P4_EVENT_BNR] = {
227 .opcode = P4_OPCODE(P4_EVENT_BNR),
228 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
229 .cntr = { {0, -1, -1}, {2, -1, -1} },
230 },
231 [P4_EVENT_SNOOP] = {
232 .opcode = P4_OPCODE(P4_EVENT_SNOOP),
233 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
234 .cntr = { {0, -1, -1}, {2, -1, -1} },
235 },
236 [P4_EVENT_RESPONSE] = {
237 .opcode = P4_OPCODE(P4_EVENT_RESPONSE),
238 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
239 .cntr = { {0, -1, -1}, {2, -1, -1} },
240 },
241 [P4_EVENT_FRONT_END_EVENT] = {
242 .opcode = P4_OPCODE(P4_EVENT_FRONT_END_EVENT),
243 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
244 .cntr = { {12, 13, 16}, {14, 15, 17} },
245 },
246 [P4_EVENT_EXECUTION_EVENT] = {
247 .opcode = P4_OPCODE(P4_EVENT_EXECUTION_EVENT),
248 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
249 .cntr = { {12, 13, 16}, {14, 15, 17} },
250 },
251 [P4_EVENT_REPLAY_EVENT] = {
252 .opcode = P4_OPCODE(P4_EVENT_REPLAY_EVENT),
253 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
254 .cntr = { {12, 13, 16}, {14, 15, 17} },
255 },
256 [P4_EVENT_INSTR_RETIRED] = {
257 .opcode = P4_OPCODE(P4_EVENT_INSTR_RETIRED),
258 .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
259 .cntr = { {12, 13, 16}, {14, 15, 17} },
260 },
261 [P4_EVENT_UOPS_RETIRED] = {
262 .opcode = P4_OPCODE(P4_EVENT_UOPS_RETIRED),
263 .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
264 .cntr = { {12, 13, 16}, {14, 15, 17} },
265 },
266 [P4_EVENT_UOP_TYPE] = {
267 .opcode = P4_OPCODE(P4_EVENT_UOP_TYPE),
268 .escr_msr = { MSR_P4_RAT_ESCR0, MSR_P4_RAT_ESCR1 },
269 .cntr = { {12, 13, 16}, {14, 15, 17} },
270 },
271 [P4_EVENT_BRANCH_RETIRED] = {
272 .opcode = P4_OPCODE(P4_EVENT_BRANCH_RETIRED),
273 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
274 .cntr = { {12, 13, 16}, {14, 15, 17} },
275 },
276 [P4_EVENT_MISPRED_BRANCH_RETIRED] = {
277 .opcode = P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED),
278 .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
279 .cntr = { {12, 13, 16}, {14, 15, 17} },
280 },
281 [P4_EVENT_X87_ASSIST] = {
282 .opcode = P4_OPCODE(P4_EVENT_X87_ASSIST),
283 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
284 .cntr = { {12, 13, 16}, {14, 15, 17} },
285 },
286 [P4_EVENT_MACHINE_CLEAR] = {
287 .opcode = P4_OPCODE(P4_EVENT_MACHINE_CLEAR),
288 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
289 .cntr = { {12, 13, 16}, {14, 15, 17} },
290 },
291 [P4_EVENT_INSTR_COMPLETED] = {
292 .opcode = P4_OPCODE(P4_EVENT_INSTR_COMPLETED),
293 .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
294 .cntr = { {12, 13, 16}, {14, 15, 17} },
295 },
296};
297
298#define P4_GEN_CACHE_EVENT(event, bit, metric) \
299 p4_config_pack_escr(P4_ESCR_EVENT(event) | \
300 P4_ESCR_EMASK_BIT(event, bit)) | \
301 p4_config_pack_cccr(metric | \
302 P4_CCCR_ESEL(P4_OPCODE_ESEL(P4_OPCODE(event))))
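/*
 * Roughly speaking, the macro above packs a single u64 config out of two
 * halves: the ESCR half carries the event opcode plus the selected
 * event-mask bit, while the CCCR half carries the PEBS metric and the
 * ESCR select (ESEL) recovered from the opcode.  For illustration only
 * (ev/bit/metric are placeholders), an entry expands to something like:
 *
 *	p4_config_pack_escr(P4_ESCR_EVENT(ev) | P4_ESCR_EMASK_BIT(ev, bit)) |
 *	p4_config_pack_cccr(metric | P4_CCCR_ESEL(P4_OPCODE_ESEL(P4_OPCODE(ev))))
 */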
303
304static __initconst const u64 p4_hw_cache_event_ids
305 [PERF_COUNT_HW_CACHE_MAX]
306 [PERF_COUNT_HW_CACHE_OP_MAX]
307 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
308{
309 [ C(L1D ) ] = {
310 [ C(OP_READ) ] = {
311 [ C(RESULT_ACCESS) ] = 0x0,
312 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
313 P4_PEBS_METRIC__1stl_cache_load_miss_retired),
314 },
315 },
316 [ C(LL ) ] = {
317 [ C(OP_READ) ] = {
318 [ C(RESULT_ACCESS) ] = 0x0,
319 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
320 P4_PEBS_METRIC__2ndl_cache_load_miss_retired),
321 },
322},
323 [ C(DTLB) ] = {
324 [ C(OP_READ) ] = {
325 [ C(RESULT_ACCESS) ] = 0x0,
326 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
327 P4_PEBS_METRIC__dtlb_load_miss_retired),
328 },
329 [ C(OP_WRITE) ] = {
330 [ C(RESULT_ACCESS) ] = 0x0,
331 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
332 P4_PEBS_METRIC__dtlb_store_miss_retired),
333 },
334 },
335 [ C(ITLB) ] = {
336 [ C(OP_READ) ] = {
337 [ C(RESULT_ACCESS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, HIT,
338 P4_PEBS_METRIC__none),
339 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, MISS,
340 P4_PEBS_METRIC__none),
341 },
342 [ C(OP_WRITE) ] = {
343 [ C(RESULT_ACCESS) ] = -1,
344 [ C(RESULT_MISS) ] = -1,
345 },
346 [ C(OP_PREFETCH) ] = {
347 [ C(RESULT_ACCESS) ] = -1,
348 [ C(RESULT_MISS) ] = -1,
349 },
350 },
351};
352
353static u64 p4_general_events[PERF_COUNT_HW_MAX] = {
354 /* non-halted CPU clocks */
355 [PERF_COUNT_HW_CPU_CYCLES] =
356 p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS) |
357 P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)),
358
359 /*
360 * retired instructions
 361	 * for the sake of simplicity we don't use the FSB tagging
362 */
363 [PERF_COUNT_HW_INSTRUCTIONS] =
364 p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_INSTR_RETIRED) |
365 P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSNTAG) |
366 P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSNTAG)),
367
368 /* cache hits */
369 [PERF_COUNT_HW_CACHE_REFERENCES] =
370 p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_BSQ_CACHE_REFERENCE) |
371 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITS) |
372 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITE) |
373 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITM) |
374 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITS) |
375 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITE) |
376 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITM)),
377
378 /* cache misses */
379 [PERF_COUNT_HW_CACHE_MISSES] =
380 p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_BSQ_CACHE_REFERENCE) |
381 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_MISS) |
382 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_MISS) |
383 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, WR_2ndL_MISS)),
384
385 /* branch instructions retired */
386 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] =
387 p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_RETIRED_BRANCH_TYPE) |
388 P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CONDITIONAL) |
389 P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CALL) |
390 P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, RETURN) |
391 P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, INDIRECT)),
392
393 /* mispredicted branches retired */
394 [PERF_COUNT_HW_BRANCH_MISSES] =
395 p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_MISPRED_BRANCH_RETIRED) |
396 P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS)),
397
 398	/* bus ready clocks (cpu is driving #DRDY_DRV/#DRDY_OWN): */
399 [PERF_COUNT_HW_BUS_CYCLES] =
400 p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_FSB_DATA_ACTIVITY) |
401 P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV) |
402 P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN)) |
403 p4_config_pack_cccr(P4_CCCR_EDGE | P4_CCCR_COMPARE),
404};
405
406static struct p4_event_bind *p4_config_get_bind(u64 config)
407{
408 unsigned int evnt = p4_config_unpack_event(config);
409 struct p4_event_bind *bind = NULL;
410
411 if (evnt < ARRAY_SIZE(p4_event_bind_map))
412 bind = &p4_event_bind_map[evnt];
413
414 return bind;
415}
416
417static u64 p4_pmu_event_map(int hw_event)
418{
419 struct p4_event_bind *bind;
420 unsigned int esel;
421 u64 config;
422
423 config = p4_general_events[hw_event];
424 bind = p4_config_get_bind(config);
425 esel = P4_OPCODE_ESEL(bind->opcode);
426 config |= p4_config_pack_cccr(P4_CCCR_ESEL(esel));
427
428 return config;
429}
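/*
 * Taken together: for a generic perf event the packed ESCR config comes
 * straight from p4_general_events[], and p4_pmu_event_map() then ORs in
 * the CCCR's ESCR-select field derived from the event's bind entry, so
 * the resulting config names both the event and the ESCR the counter
 * has to listen to.
 */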
430
431static int p4_validate_raw_event(struct perf_event *event)
432{
433 unsigned int v;
434
 435	/* user data may have an out-of-bounds event index */
436 v = p4_config_unpack_event(event->attr.config);
437 if (v >= ARRAY_SIZE(p4_event_bind_map)) {
438 pr_warning("P4 PMU: Unknown event code: %d\n", v);
439 return -EINVAL;
440 }
441
442 /*
 443	 * it may have some bogus PEBS bits set
444 */
445 if (p4_config_pebs_has(event->attr.config, P4_PEBS_CONFIG_ENABLE)) {
446 pr_warning("P4 PMU: PEBS are not supported yet\n");
447 return -EINVAL;
448 }
449 v = p4_config_unpack_metric(event->attr.config);
450 if (v >= ARRAY_SIZE(p4_pebs_bind_map)) {
451 pr_warning("P4 PMU: Unknown metric code: %d\n", v);
452 return -EINVAL;
453 }
454
455 return 0;
456}
457
458static int p4_hw_config(struct perf_event *event)
459{
460 int cpu = get_cpu();
461 int rc = 0;
462 u32 escr, cccr;
463
464 /*
 465	 * the reason we grab the cpu this early is that if we first get
 466	 * scheduled on the same cpu, we will not need to swap the thread-
 467	 * specific flags in the config (and will save some cpu cycles)
468 */
469
470 cccr = p4_default_cccr_conf(cpu);
471 escr = p4_default_escr_conf(cpu, event->attr.exclude_kernel,
472 event->attr.exclude_user);
473 event->hw.config = p4_config_pack_escr(escr) |
474 p4_config_pack_cccr(cccr);
475
476 if (p4_ht_active() && p4_ht_thread(cpu))
477 event->hw.config = p4_set_ht_bit(event->hw.config);
478
479 if (event->attr.type == PERF_TYPE_RAW) {
480
481 rc = p4_validate_raw_event(event);
482 if (rc)
483 goto out;
484
485 /*
 486	 * We don't control raw events, so it's up to the caller
 487	 * to pass sane values (we don't count the thread number
 488	 * on an HT machine, but we do allow HT-compatible specifics
 489	 * to be passed on)
 490	 *
 491	 * Note that for RAW events we allow the user to use the
 492	 * P4_CCCR_RESERVED bits, since we keep additional info there
 493	 * (for cache events etc.)
 494	 *
 495	 * XXX: HT-wide things should check perf_paranoid_cpu() && CAP_SYS_ADMIN
496 */
497 event->hw.config |= event->attr.config &
498 (p4_config_pack_escr(P4_ESCR_MASK_HT) |
499 p4_config_pack_cccr(P4_CCCR_MASK_HT | P4_CCCR_RESERVED));
500 }
501
502 rc = x86_setup_perfctr(event);
503out:
504 put_cpu();
505 return rc;
506}
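/*
 * In short (a rough summary of the flow above): p4_hw_config() starts
 * from the default ESCR/CCCR configuration for this cpu's HT thread,
 * marks the config with the HT bit when running on the second logical
 * cpu, and for PERF_TYPE_RAW events validates the user's value and then
 * merges in only the HT-compatible ESCR/CCCR bits (plus P4_CCCR_RESERVED,
 * which carries the extra cache-event info) before the generic
 * x86_setup_perfctr() finishes the job.
 */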
507
508static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc)
509{
510 int overflow = 0;
511 u32 low, high;
512
513 rdmsr(hwc->config_base + hwc->idx, low, high);
514
 515	/* we need to check the high bit for unflagged overflows */
516 if ((low & P4_CCCR_OVF) || !(high & (1 << 31))) {
517 overflow = 1;
518 (void)checking_wrmsrl(hwc->config_base + hwc->idx,
519 ((u64)low) & ~P4_CCCR_OVF);
520 }
521
522 return overflow;
523}
524
525static void p4_pmu_disable_pebs(void)
526{
527 /*
528 * FIXME
529 *
 530	 * It's still allowed for two threads to set up the same cache
 531	 * events, so we can't simply clear the metrics until we know
 532	 * no one is depending on us; we would need some kind of counter
 533	 * for "ReplayEvent" users.
 534	 *
 535	 * What is more complex is RAW events: if the user (for some
 536	 * reason) passes a cache event metric with an improper
 537	 * event opcode, it's fine from the hardware's point of view
 538	 * but complete nonsense as far as the meaning of the action goes.
 539	 *
 540	 * So for the moment leave the metrics turned on forever -- it's
 541	 * OK for now but needs to be revisited!
542 *
543 * (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)0);
544 * (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)0);
545 */
546}
547
548static inline void p4_pmu_disable_event(struct perf_event *event)
549{
550 struct hw_perf_event *hwc = &event->hw;
551
552 /*
 553	 * If the event gets disabled while the counter is in the overflowed
 554	 * state we need to clear P4_CCCR_OVF, otherwise the interrupt gets
555 * asserted again and again
556 */
557 (void)checking_wrmsrl(hwc->config_base + hwc->idx,
558 (u64)(p4_config_unpack_cccr(hwc->config)) &
559 ~P4_CCCR_ENABLE & ~P4_CCCR_OVF & ~P4_CCCR_RESERVED);
560}
561
562static void p4_pmu_disable_all(void)
563{
564 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
565 int idx;
566
567 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
568 struct perf_event *event = cpuc->events[idx];
569 if (!test_bit(idx, cpuc->active_mask))
570 continue;
571 p4_pmu_disable_event(event);
572 }
573
574 p4_pmu_disable_pebs();
575}
576
577/* configuration must be valid */
578static void p4_pmu_enable_pebs(u64 config)
579{
580 struct p4_pebs_bind *bind;
581 unsigned int idx;
582
583 BUILD_BUG_ON(P4_PEBS_METRIC__max > P4_PEBS_CONFIG_METRIC_MASK);
584
585 idx = p4_config_unpack_metric(config);
586 if (idx == P4_PEBS_METRIC__none)
587 return;
588
589 bind = &p4_pebs_bind_map[idx];
590
591 (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)bind->metric_pebs);
592 (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)bind->metric_vert);
593}
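/*
 * A quick sketch of what the metric programming above means: a non-zero
 * metric index in the config selects a row of p4_pebs_bind_map, and that
 * row supplies the two values written to MSR_IA32_PEBS_ENABLE and
 * MSR_P4_PEBS_MATRIX_VERT.  This is what the cache events generated via
 * P4_GEN_CACHE_EVENT rely on; full PEBS sampling itself is still
 * rejected in p4_validate_raw_event().
 */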
594
595static void p4_pmu_enable_event(struct perf_event *event)
596{
597 struct hw_perf_event *hwc = &event->hw;
598 int thread = p4_ht_config_thread(hwc->config);
599 u64 escr_conf = p4_config_unpack_escr(p4_clear_ht_bit(hwc->config));
600 unsigned int idx = p4_config_unpack_event(hwc->config);
601 struct p4_event_bind *bind;
602 u64 escr_addr, cccr;
603
604 bind = &p4_event_bind_map[idx];
605 escr_addr = (u64)bind->escr_msr[thread];
606
607 /*
 608	 * - we don't support cascaded counters yet
609 * - and counter 1 is broken (erratum)
610 */
611 WARN_ON_ONCE(p4_is_event_cascaded(hwc->config));
612 WARN_ON_ONCE(hwc->idx == 1);
613
614 /* we need a real Event value */
615 escr_conf &= ~P4_ESCR_EVENT_MASK;
616 escr_conf |= P4_ESCR_EVENT(P4_OPCODE_EVNT(bind->opcode));
617
618 cccr = p4_config_unpack_cccr(hwc->config);
619
620 /*
 621	 * it could be a cache event, so we may need to write the metrics
622 * into additional MSRs
623 */
624 p4_pmu_enable_pebs(hwc->config);
625
626 (void)checking_wrmsrl(escr_addr, escr_conf);
627 (void)checking_wrmsrl(hwc->config_base + hwc->idx,
628 (cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE);
629}
630
631static void p4_pmu_enable_all(int added)
632{
633 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
634 int idx;
635
636 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
637 struct perf_event *event = cpuc->events[idx];
638 if (!test_bit(idx, cpuc->active_mask))
639 continue;
640 p4_pmu_enable_event(event);
641 }
642}
643
644static int p4_pmu_handle_irq(struct pt_regs *regs)
645{
646 struct perf_sample_data data;
647 struct cpu_hw_events *cpuc;
648 struct perf_event *event;
649 struct hw_perf_event *hwc;
650 int idx, handled = 0;
651 u64 val;
652
653 data.addr = 0;
654 data.raw = NULL;
655
656 cpuc = &__get_cpu_var(cpu_hw_events);
657
658 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
659
660 if (!test_bit(idx, cpuc->active_mask))
661 continue;
662
663 event = cpuc->events[idx];
664 hwc = &event->hw;
665
666 WARN_ON_ONCE(hwc->idx != idx);
667
 668		/* it might be an unflagged overflow */
669 handled = p4_pmu_clear_cccr_ovf(hwc);
670
671 val = x86_perf_event_update(event);
672 if (!handled && (val & (1ULL << (x86_pmu.cntval_bits - 1))))
673 continue;
674
675 /* event overflow for sure */
676 data.period = event->hw.last_period;
677
678 if (!x86_perf_event_set_period(event))
679 continue;
680 if (perf_event_overflow(event, 1, &data, regs))
681 p4_pmu_disable_event(event);
682 }
683
684 if (handled) {
685 /* p4 quirk: unmask it again */
686 apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED);
687 inc_irq_stat(apic_perf_irqs);
688 }
689
690 return handled;
691}
692
693/*
 694 * swap the thread-specific fields according to the thread
695 * we are going to run on
696 */
697static void p4_pmu_swap_config_ts(struct hw_perf_event *hwc, int cpu)
698{
699 u32 escr, cccr;
700
701 /*
 702	 * either we are lucky and continue on the same cpu, or there is no HT support
703 */
704 if (!p4_should_swap_ts(hwc->config, cpu))
705 return;
706
707 /*
 708	 * the event has migrated from another logical
 709	 * cpu, so we need to swap the thread-specific flags
710 */
711
712 escr = p4_config_unpack_escr(hwc->config);
713 cccr = p4_config_unpack_cccr(hwc->config);
714
715 if (p4_ht_thread(cpu)) {
716 cccr &= ~P4_CCCR_OVF_PMI_T0;
717 cccr |= P4_CCCR_OVF_PMI_T1;
718 if (escr & P4_ESCR_T0_OS) {
719 escr &= ~P4_ESCR_T0_OS;
720 escr |= P4_ESCR_T1_OS;
721 }
722 if (escr & P4_ESCR_T0_USR) {
723 escr &= ~P4_ESCR_T0_USR;
724 escr |= P4_ESCR_T1_USR;
725 }
726 hwc->config = p4_config_pack_escr(escr);
727 hwc->config |= p4_config_pack_cccr(cccr);
728 hwc->config |= P4_CONFIG_HT;
729 } else {
730 cccr &= ~P4_CCCR_OVF_PMI_T1;
731 cccr |= P4_CCCR_OVF_PMI_T0;
732 if (escr & P4_ESCR_T1_OS) {
733 escr &= ~P4_ESCR_T1_OS;
734 escr |= P4_ESCR_T0_OS;
735 }
736 if (escr & P4_ESCR_T1_USR) {
737 escr &= ~P4_ESCR_T1_USR;
738 escr |= P4_ESCR_T0_USR;
739 }
740 hwc->config = p4_config_pack_escr(escr);
741 hwc->config |= p4_config_pack_cccr(cccr);
742 hwc->config &= ~P4_CONFIG_HT;
743 }
744}
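/*
 * Concrete example of the swap above: an event that was programmed for
 * logical cpu 0 (P4_CCCR_OVF_PMI_T0 set, P4_ESCR_T0_OS/P4_ESCR_T0_USR
 * selecting the privilege filter) and then migrates to the HT sibling
 * gets those bits rewritten to their T1 counterparts, so both the PMI
 * routing and the OS/USR filtering keep following the thread the event
 * actually runs on.
 */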
745
746/*
 747 * ESCR address hashing is tricky: the ESCRs are not sequential
 748 * in MSR space, but they all start at MSR_P4_BSU_ESCR0 (0x03a0)
 749 * and every ESCR address lies within the range [0x3a0, 0x3e1],
 750 *
 751 * so a direct-mapped table over that range is ~70% filled
752 */
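/*
 * For illustration: P4_ESCR_MSR_IDX() below is just (msr - 0x3a0), so an
 * ESCR at, say, 0x3c5 lands in slot 0x25.  The table spans 0x42 (66)
 * slots, of which 46 are populated by real ESCRs -- roughly the 70%
 * fill mentioned above.
 */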
753
754#define P4_ESCR_MSR_BASE 0x000003a0
755#define P4_ESCR_MSR_MAX 0x000003e1
756#define P4_ESCR_MSR_TABLE_SIZE (P4_ESCR_MSR_MAX - P4_ESCR_MSR_BASE + 1)
757#define P4_ESCR_MSR_IDX(msr) (msr - P4_ESCR_MSR_BASE)
758#define P4_ESCR_MSR_TABLE_ENTRY(msr) [P4_ESCR_MSR_IDX(msr)] = msr
759
760static const unsigned int p4_escr_table[P4_ESCR_MSR_TABLE_SIZE] = {
761 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ALF_ESCR0),
762 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ALF_ESCR1),
763 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BPU_ESCR0),
764 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BPU_ESCR1),
765 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BSU_ESCR0),
766 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BSU_ESCR1),
767 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR0),
768 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR1),
769 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR2),
770 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR3),
771 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR4),
772 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR5),
773 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_DAC_ESCR0),
774 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_DAC_ESCR1),
775 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FIRM_ESCR0),
776 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FIRM_ESCR1),
777 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FLAME_ESCR0),
778 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FLAME_ESCR1),
779 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FSB_ESCR0),
780 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FSB_ESCR1),
781 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IQ_ESCR0),
782 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IQ_ESCR1),
783 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IS_ESCR0),
784 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IS_ESCR1),
785 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ITLB_ESCR0),
786 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ITLB_ESCR1),
787 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IX_ESCR0),
788 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IX_ESCR1),
789 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MOB_ESCR0),
790 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MOB_ESCR1),
791 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MS_ESCR0),
792 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MS_ESCR1),
793 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_PMH_ESCR0),
794 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_PMH_ESCR1),
795 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_RAT_ESCR0),
796 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_RAT_ESCR1),
797 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SAAT_ESCR0),
798 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SAAT_ESCR1),
799 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SSU_ESCR0),
800 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SSU_ESCR1),
801 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TBPU_ESCR0),
802 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TBPU_ESCR1),
803 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TC_ESCR0),
804 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TC_ESCR1),
805 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_U2L_ESCR0),
806 P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_U2L_ESCR1),
807};
808
809static int p4_get_escr_idx(unsigned int addr)
810{
811 unsigned int idx = P4_ESCR_MSR_IDX(addr);
812
813 if (unlikely(idx >= P4_ESCR_MSR_TABLE_SIZE ||
814 !p4_escr_table[idx] ||
815 p4_escr_table[idx] != addr)) {
816 WARN_ONCE(1, "P4 PMU: Wrong address passed: %x\n", addr);
817 return -1;
818 }
819
820 return idx;
821}
822
823static int p4_next_cntr(int thread, unsigned long *used_mask,
824 struct p4_event_bind *bind)
825{
826 int i, j;
827
828 for (i = 0; i < P4_CNTR_LIMIT; i++) {
829 j = bind->cntr[thread][i];
830 if (j != -1 && !test_bit(j, used_mask))
831 return j;
832 }
833
834 return -1;
835}
836
837static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
838{
839 unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
840 unsigned long escr_mask[BITS_TO_LONGS(P4_ESCR_MSR_TABLE_SIZE)];
841 int cpu = smp_processor_id();
842 struct hw_perf_event *hwc;
843 struct p4_event_bind *bind;
844 unsigned int i, thread, num;
845 int cntr_idx, escr_idx;
846
847 bitmap_zero(used_mask, X86_PMC_IDX_MAX);
848 bitmap_zero(escr_mask, P4_ESCR_MSR_TABLE_SIZE);
849
850 for (i = 0, num = n; i < n; i++, num--) {
851
852 hwc = &cpuc->event_list[i]->hw;
853 thread = p4_ht_thread(cpu);
854 bind = p4_config_get_bind(hwc->config);
855 escr_idx = p4_get_escr_idx(bind->escr_msr[thread]);
856 if (unlikely(escr_idx == -1))
857 goto done;
858
859 if (hwc->idx != -1 && !p4_should_swap_ts(hwc->config, cpu)) {
860 cntr_idx = hwc->idx;
861 if (assign)
862 assign[i] = hwc->idx;
863 goto reserve;
864 }
865
866 cntr_idx = p4_next_cntr(thread, used_mask, bind);
867 if (cntr_idx == -1 || test_bit(escr_idx, escr_mask))
868 goto done;
869
870 p4_pmu_swap_config_ts(hwc, cpu);
871 if (assign)
872 assign[i] = cntr_idx;
873reserve:
874 set_bit(cntr_idx, used_mask);
875 set_bit(escr_idx, escr_mask);
876 }
877
878done:
879 return num ? -ENOSPC : 0;
880}
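/*
 * The constraint being enforced above: every event needs both a free
 * counter taken from the per-thread candidate list in its bind entry
 * *and* the ESCR for that thread; events that already own a counter and
 * need no HT swap simply keep it.  If either resource is already claimed
 * by an earlier event in the list, scheduling stops and the remaining
 * (num) events make the whole group fail with -ENOSPC.
 */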
881
882static __initconst const struct x86_pmu p4_pmu = {
883 .name = "Netburst P4/Xeon",
884 .handle_irq = p4_pmu_handle_irq,
885 .disable_all = p4_pmu_disable_all,
886 .enable_all = p4_pmu_enable_all,
887 .enable = p4_pmu_enable_event,
888 .disable = p4_pmu_disable_event,
889 .eventsel = MSR_P4_BPU_CCCR0,
890 .perfctr = MSR_P4_BPU_PERFCTR0,
891 .event_map = p4_pmu_event_map,
892 .max_events = ARRAY_SIZE(p4_general_events),
893 .get_event_constraints = x86_get_event_constraints,
894 /*
 895	 * If HT is disabled we may need to use all
 896	 * ARCH_P4_MAX_CCCR counters simultaneously,
 897	 * though leave it restricted for the moment,
 898	 * assuming HT is on
899 */
900 .num_counters = ARCH_P4_MAX_CCCR,
901 .apic = 1,
902 .cntval_bits = 40,
903 .cntval_mask = (1ULL << 40) - 1,
904 .max_period = (1ULL << 39) - 1,
905 .hw_config = p4_hw_config,
906 .schedule_events = p4_pmu_schedule_events,
907 /*
 908	 * This handles erratum N15 in Intel doc 249199-029:
 909	 * the counter may not be updated correctly on a write,
 910	 * so we need a second write operation to do the trick
 911	 * (the official workaround didn't work)
 912	 *
 913	 * the second-write idea is taken from the OProfile code
914 */
915 .perfctr_second_write = 1,
916};
917
918static __init int p4_pmu_init(void)
919{
920 unsigned int low, high;
921
 922	/* If we get stripped -- indexing fails */
923 BUILD_BUG_ON(ARCH_P4_MAX_CCCR > X86_PMC_MAX_GENERIC);
924
925 rdmsr(MSR_IA32_MISC_ENABLE, low, high);
926 if (!(low & (1 << 7))) {
927 pr_cont("unsupported Netburst CPU model %d ",
928 boot_cpu_data.x86_model);
929 return -ENODEV;
930 }
931
932 memcpy(hw_cache_event_ids, p4_hw_cache_event_ids,
933 sizeof(hw_cache_event_ids));
934
935 pr_cont("Netburst events, ");
936
937 x86_pmu = p4_pmu;
938
939 return 0;
940}
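/*
 * Note on the MISC_ENABLE test above: bit 7 of MSR_IA32_MISC_ENABLE is
 * (as far as this driver is concerned) the "performance monitoring
 * available" flag on Netburst; if it is clear we report the model as
 * unsupported and bail out with -ENODEV rather than touching any of the
 * CCCR/ESCR machinery.
 */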
941
942#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c
index a330485d14da..34ba07be2cda 100644
--- a/arch/x86/kernel/cpu/perf_event_p6.c
+++ b/arch/x86/kernel/cpu/perf_event_p6.c
@@ -27,24 +27,6 @@ static u64 p6_pmu_event_map(int hw_event)
27 */ 27 */
28#define P6_NOP_EVENT 0x0000002EULL 28#define P6_NOP_EVENT 0x0000002EULL
29 29
30static u64 p6_pmu_raw_event(u64 hw_event)
31{
32#define P6_EVNTSEL_EVENT_MASK 0x000000FFULL
33#define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL
34#define P6_EVNTSEL_EDGE_MASK 0x00040000ULL
35#define P6_EVNTSEL_INV_MASK 0x00800000ULL
36#define P6_EVNTSEL_REG_MASK 0xFF000000ULL
37
38#define P6_EVNTSEL_MASK \
39 (P6_EVNTSEL_EVENT_MASK | \
40 P6_EVNTSEL_UNIT_MASK | \
41 P6_EVNTSEL_EDGE_MASK | \
42 P6_EVNTSEL_INV_MASK | \
43 P6_EVNTSEL_REG_MASK)
44
45 return hw_event & P6_EVNTSEL_MASK;
46}
47
48static struct event_constraint p6_event_constraints[] = 30static struct event_constraint p6_event_constraints[] =
49{ 31{
50 INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */ 32 INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */
@@ -66,7 +48,7 @@ static void p6_pmu_disable_all(void)
66 wrmsrl(MSR_P6_EVNTSEL0, val); 48 wrmsrl(MSR_P6_EVNTSEL0, val);
67} 49}
68 50
69static void p6_pmu_enable_all(void) 51static void p6_pmu_enable_all(int added)
70{ 52{
71 unsigned long val; 53 unsigned long val;
72 54
@@ -102,22 +84,23 @@ static void p6_pmu_enable_event(struct perf_event *event)
102 (void)checking_wrmsrl(hwc->config_base + hwc->idx, val); 84 (void)checking_wrmsrl(hwc->config_base + hwc->idx, val);
103} 85}
104 86
105static __initconst struct x86_pmu p6_pmu = { 87static __initconst const struct x86_pmu p6_pmu = {
106 .name = "p6", 88 .name = "p6",
107 .handle_irq = x86_pmu_handle_irq, 89 .handle_irq = x86_pmu_handle_irq,
108 .disable_all = p6_pmu_disable_all, 90 .disable_all = p6_pmu_disable_all,
109 .enable_all = p6_pmu_enable_all, 91 .enable_all = p6_pmu_enable_all,
110 .enable = p6_pmu_enable_event, 92 .enable = p6_pmu_enable_event,
111 .disable = p6_pmu_disable_event, 93 .disable = p6_pmu_disable_event,
94 .hw_config = x86_pmu_hw_config,
95 .schedule_events = x86_schedule_events,
112 .eventsel = MSR_P6_EVNTSEL0, 96 .eventsel = MSR_P6_EVNTSEL0,
113 .perfctr = MSR_P6_PERFCTR0, 97 .perfctr = MSR_P6_PERFCTR0,
114 .event_map = p6_pmu_event_map, 98 .event_map = p6_pmu_event_map,
115 .raw_event = p6_pmu_raw_event,
116 .max_events = ARRAY_SIZE(p6_perfmon_event_map), 99 .max_events = ARRAY_SIZE(p6_perfmon_event_map),
117 .apic = 1, 100 .apic = 1,
118 .max_period = (1ULL << 31) - 1, 101 .max_period = (1ULL << 31) - 1,
119 .version = 0, 102 .version = 0,
120 .num_events = 2, 103 .num_counters = 2,
121 /* 104 /*
122 * Events have 40 bits implemented. However they are designed such 105 * Events have 40 bits implemented. However they are designed such
123 * that bits [32-39] are sign extensions of bit 31. As such the 106 * that bits [32-39] are sign extensions of bit 31. As such the
@@ -125,8 +108,8 @@ static __initconst struct x86_pmu p6_pmu = {
125 * 108 *
126 * See IA-32 Intel Architecture Software developer manual Vol 3B 109 * See IA-32 Intel Architecture Software developer manual Vol 3B
127 */ 110 */
128 .event_bits = 32, 111 .cntval_bits = 32,
129 .event_mask = (1ULL << 32) - 1, 112 .cntval_mask = (1ULL << 32) - 1,
130 .get_event_constraints = x86_get_event_constraints, 113 .get_event_constraints = x86_get_event_constraints,
131 .event_constraints = p6_event_constraints, 114 .event_constraints = p6_event_constraints,
132}; 115};
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
new file mode 100644
index 000000000000..34b4dad6f0b8
--- /dev/null
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -0,0 +1,63 @@
1/*
 2 * Routines to identify additional cpu features that are scattered in
3 * cpuid space.
4 */
5#include <linux/cpu.h>
6
7#include <asm/pat.h>
8#include <asm/processor.h>
9
10#include <asm/apic.h>
11
12struct cpuid_bit {
13 u16 feature;
14 u8 reg;
15 u8 bit;
16 u32 level;
17 u32 sub_leaf;
18};
19
20enum cpuid_regs {
21 CR_EAX = 0,
22 CR_ECX,
23 CR_EDX,
24 CR_EBX
25};
26
27void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
28{
29 u32 max_level;
30 u32 regs[4];
31 const struct cpuid_bit *cb;
32
33 static const struct cpuid_bit __cpuinitconst cpuid_bits[] = {
34 { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006, 0 },
35 { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006, 0 },
36 { X86_FEATURE_PLN, CR_EAX, 4, 0x00000006, 0 },
37 { X86_FEATURE_PTS, CR_EAX, 6, 0x00000006, 0 },
38 { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 },
39 { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 },
40 { X86_FEATURE_XSAVEOPT, CR_EAX, 0, 0x0000000d, 1 },
41 { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 },
42 { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a, 0 },
43 { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 },
44 { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 },
45 { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a, 0 },
46 { 0, 0, 0, 0, 0 }
47 };
48
49 for (cb = cpuid_bits; cb->feature; cb++) {
50
51 /* Verify that the level is valid */
52 max_level = cpuid_eax(cb->level & 0xffff0000);
53 if (max_level < cb->level ||
54 max_level > (cb->level | 0xffff))
55 continue;
56
57 cpuid_count(cb->level, cb->sub_leaf, &regs[CR_EAX],
58 &regs[CR_EBX], &regs[CR_ECX], &regs[CR_EDX]);
59
60 if (regs[cb->reg] & (1 << cb->bit))
61 set_cpu_cap(c, cb->feature);
62 }
63}
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/topology.c
index 97ad79cdf688..4397e987a1cf 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/topology.c
@@ -1,60 +1,14 @@
1/* 1/*
2 * Routines to indentify additional cpu features that are scattered in 2 * Check for extended topology enumeration cpuid leaf 0xb and if it
3 * cpuid space. 3 * exists, use it for populating initial_apicid and cpu topology
4 * detection.
4 */ 5 */
5#include <linux/cpu.h>
6 6
7#include <linux/cpu.h>
8#include <asm/apic.h>
7#include <asm/pat.h> 9#include <asm/pat.h>
8#include <asm/processor.h> 10#include <asm/processor.h>
9 11
10#include <asm/apic.h>
11
12struct cpuid_bit {
13 u16 feature;
14 u8 reg;
15 u8 bit;
16 u32 level;
17};
18
19enum cpuid_regs {
20 CR_EAX = 0,
21 CR_ECX,
22 CR_EDX,
23 CR_EBX
24};
25
26void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
27{
28 u32 max_level;
29 u32 regs[4];
30 const struct cpuid_bit *cb;
31
32 static const struct cpuid_bit __cpuinitconst cpuid_bits[] = {
33 { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 },
34 { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006 },
35 { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a },
36 { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a },
37 { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a },
38 { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a },
39 { 0, 0, 0, 0 }
40 };
41
42 for (cb = cpuid_bits; cb->feature; cb++) {
43
44 /* Verify that the level is valid */
45 max_level = cpuid_eax(cb->level & 0xffff0000);
46 if (max_level < cb->level ||
47 max_level > (cb->level | 0xffff))
48 continue;
49
50 cpuid(cb->level, &regs[CR_EAX], &regs[CR_EBX],
51 &regs[CR_ECX], &regs[CR_EDX]);
52
53 if (regs[cb->reg] & (1 << cb->bit))
54 set_cpu_cap(c, cb->feature);
55 }
56}
57
58/* leaf 0xb SMT level */ 12/* leaf 0xb SMT level */
59#define SMT_LEVEL 0 13#define SMT_LEVEL 0
60 14
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index dfdb4dba2320..227b0448960d 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -24,8 +24,8 @@
24#include <linux/dmi.h> 24#include <linux/dmi.h>
25#include <linux/module.h> 25#include <linux/module.h>
26#include <asm/div64.h> 26#include <asm/div64.h>
27#include <asm/vmware.h>
28#include <asm/x86_init.h> 27#include <asm/x86_init.h>
28#include <asm/hypervisor.h>
29 29
30#define CPUID_VMWARE_INFO_LEAF 0x40000000 30#define CPUID_VMWARE_INFO_LEAF 0x40000000
31#define VMWARE_HYPERVISOR_MAGIC 0x564D5868 31#define VMWARE_HYPERVISOR_MAGIC 0x564D5868
@@ -51,7 +51,7 @@ static inline int __vmware_platform(void)
51 51
52static unsigned long vmware_get_tsc_khz(void) 52static unsigned long vmware_get_tsc_khz(void)
53{ 53{
54 uint64_t tsc_hz; 54 uint64_t tsc_hz, lpj;
55 uint32_t eax, ebx, ecx, edx; 55 uint32_t eax, ebx, ecx, edx;
56 56
57 VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); 57 VMWARE_PORT(GETHZ, eax, ebx, ecx, edx);
@@ -62,10 +62,17 @@ static unsigned long vmware_get_tsc_khz(void)
62 printk(KERN_INFO "TSC freq read from hypervisor : %lu.%03lu MHz\n", 62 printk(KERN_INFO "TSC freq read from hypervisor : %lu.%03lu MHz\n",
63 (unsigned long) tsc_hz / 1000, 63 (unsigned long) tsc_hz / 1000,
64 (unsigned long) tsc_hz % 1000); 64 (unsigned long) tsc_hz % 1000);
65
66 if (!preset_lpj) {
67 lpj = ((u64)tsc_hz * 1000);
68 do_div(lpj, HZ);
69 preset_lpj = lpj;
70 }
71
65 return tsc_hz; 72 return tsc_hz;
66} 73}
67 74
68void __init vmware_platform_setup(void) 75static void __init vmware_platform_setup(void)
69{ 76{
70 uint32_t eax, ebx, ecx, edx; 77 uint32_t eax, ebx, ecx, edx;
71 78
@@ -83,26 +90,22 @@ void __init vmware_platform_setup(void)
83 * serial key should be enough, as this will always have a VMware 90 * serial key should be enough, as this will always have a VMware
84 * specific string when running under VMware hypervisor. 91 * specific string when running under VMware hypervisor.
85 */ 92 */
86int vmware_platform(void) 93static bool __init vmware_platform(void)
87{ 94{
88 if (cpu_has_hypervisor) { 95 if (cpu_has_hypervisor) {
89 unsigned int eax, ebx, ecx, edx; 96 unsigned int eax;
90 char hyper_vendor_id[13]; 97 unsigned int hyper_vendor_id[3];
91 98
92 cpuid(CPUID_VMWARE_INFO_LEAF, &eax, &ebx, &ecx, &edx); 99 cpuid(CPUID_VMWARE_INFO_LEAF, &eax, &hyper_vendor_id[0],
93 memcpy(hyper_vendor_id + 0, &ebx, 4); 100 &hyper_vendor_id[1], &hyper_vendor_id[2]);
94 memcpy(hyper_vendor_id + 4, &ecx, 4); 101 if (!memcmp(hyper_vendor_id, "VMwareVMware", 12))
95 memcpy(hyper_vendor_id + 8, &edx, 4); 102 return true;
96 hyper_vendor_id[12] = '\0';
97 if (!strcmp(hyper_vendor_id, "VMwareVMware"))
98 return 1;
99 } else if (dmi_available && dmi_name_in_serial("VMware") && 103 } else if (dmi_available && dmi_name_in_serial("VMware") &&
100 __vmware_platform()) 104 __vmware_platform())
101 return 1; 105 return true;
102 106
103 return 0; 107 return false;
104} 108}
105EXPORT_SYMBOL(vmware_platform);
106 109
107/* 110/*
108 * VMware hypervisor takes care of exporting a reliable TSC to the guest. 111 * VMware hypervisor takes care of exporting a reliable TSC to the guest.
@@ -116,8 +119,16 @@ EXPORT_SYMBOL(vmware_platform);
116 * so that the kernel could just trust the hypervisor with providing a 119 * so that the kernel could just trust the hypervisor with providing a
117 * reliable virtual TSC that is suitable for timekeeping. 120 * reliable virtual TSC that is suitable for timekeeping.
118 */ 121 */
119void __cpuinit vmware_set_feature_bits(struct cpuinfo_x86 *c) 122static void __cpuinit vmware_set_cpu_features(struct cpuinfo_x86 *c)
120{ 123{
121 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); 124 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
122 set_cpu_cap(c, X86_FEATURE_TSC_RELIABLE); 125 set_cpu_cap(c, X86_FEATURE_TSC_RELIABLE);
123} 126}
127
128const __refconst struct hypervisor_x86 x86_hyper_vmware = {
129 .name = "VMware",
130 .detect = vmware_platform,
131 .set_cpu_features = vmware_set_cpu_features,
132 .init_platform = vmware_platform_setup,
133};
134EXPORT_SYMBOL(x86_hyper_vmware);
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 8b862d5900fe..1b7b31ab7d86 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -170,7 +170,7 @@ static int __cpuinit cpuid_class_cpu_callback(struct notifier_block *nfb,
170 cpuid_device_destroy(cpu); 170 cpuid_device_destroy(cpu);
171 break; 171 break;
172 } 172 }
173 return err ? NOTIFY_BAD : NOTIFY_OK; 173 return notifier_from_errno(err);
174} 174}
175 175
176static struct notifier_block __refdata cpuid_class_cpu_notifier = 176static struct notifier_block __refdata cpuid_class_cpu_notifier =
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
deleted file mode 100644
index 1c47390dd0e5..000000000000
--- a/arch/x86/kernel/ds.c
+++ /dev/null
@@ -1,1437 +0,0 @@
1/*
2 * Debug Store support
3 *
4 * This provides a low-level interface to the hardware's Debug Store
5 * feature that is used for branch trace store (BTS) and
6 * precise-event based sampling (PEBS).
7 *
8 * It manages:
9 * - DS and BTS hardware configuration
10 * - buffer overflow handling (to be done)
11 * - buffer access
12 *
13 * It does not do:
14 * - security checking (is the caller allowed to trace the task)
15 * - buffer allocation (memory accounting)
16 *
17 *
18 * Copyright (C) 2007-2009 Intel Corporation.
19 * Markus Metzger <markus.t.metzger@intel.com>, 2007-2009
20 */
21
22#include <linux/kernel.h>
23#include <linux/string.h>
24#include <linux/errno.h>
25#include <linux/sched.h>
26#include <linux/slab.h>
27#include <linux/mm.h>
28#include <linux/trace_clock.h>
29
30#include <asm/ds.h>
31
32#include "ds_selftest.h"
33
34/*
35 * The configuration for a particular DS hardware implementation:
36 */
37struct ds_configuration {
38 /* The name of the configuration: */
39 const char *name;
40
41 /* The size of pointer-typed fields in DS, BTS, and PEBS: */
42 unsigned char sizeof_ptr_field;
43
44 /* The size of a BTS/PEBS record in bytes: */
45 unsigned char sizeof_rec[2];
46
47 /* The number of pebs counter reset values in the DS structure. */
48 unsigned char nr_counter_reset;
49
50 /* Control bit-masks indexed by enum ds_feature: */
51 unsigned long ctl[dsf_ctl_max];
52};
53static struct ds_configuration ds_cfg __read_mostly;
54
55
56/* Maximal size of a DS configuration: */
57#define MAX_SIZEOF_DS 0x80
58
59/* Maximal size of a BTS record: */
60#define MAX_SIZEOF_BTS (3 * 8)
61
62/* BTS and PEBS buffer alignment: */
63#define DS_ALIGNMENT (1 << 3)
64
65/* Number of buffer pointers in DS: */
66#define NUM_DS_PTR_FIELDS 8
67
68/* Size of a pebs reset value in DS: */
69#define PEBS_RESET_FIELD_SIZE 8
70
71/* Mask of control bits in the DS MSR register: */
72#define BTS_CONTROL \
73 ( ds_cfg.ctl[dsf_bts] | \
74 ds_cfg.ctl[dsf_bts_kernel] | \
75 ds_cfg.ctl[dsf_bts_user] | \
76 ds_cfg.ctl[dsf_bts_overflow] )
77
78/*
79 * A BTS or PEBS tracer.
80 *
81 * This holds the configuration of the tracer and serves as a handle
82 * to identify tracers.
83 */
84struct ds_tracer {
85 /* The DS context (partially) owned by this tracer. */
86 struct ds_context *context;
87 /* The buffer provided on ds_request() and its size in bytes. */
88 void *buffer;
89 size_t size;
90};
91
92struct bts_tracer {
93 /* The common DS part: */
94 struct ds_tracer ds;
95
96 /* The trace including the DS configuration: */
97 struct bts_trace trace;
98
99 /* Buffer overflow notification function: */
100 bts_ovfl_callback_t ovfl;
101
102 /* Active flags affecting trace collection. */
103 unsigned int flags;
104};
105
106struct pebs_tracer {
107 /* The common DS part: */
108 struct ds_tracer ds;
109
110 /* The trace including the DS configuration: */
111 struct pebs_trace trace;
112
113 /* Buffer overflow notification function: */
114 pebs_ovfl_callback_t ovfl;
115};
116
117/*
118 * Debug Store (DS) save area configuration (see Intel64 and IA32
119 * Architectures Software Developer's Manual, section 18.5)
120 *
121 * The DS configuration consists of the following fields; different
 122 * architectures vary in the size of those fields.
123 *
124 * - double-word aligned base linear address of the BTS buffer
125 * - write pointer into the BTS buffer
126 * - end linear address of the BTS buffer (one byte beyond the end of
127 * the buffer)
128 * - interrupt pointer into BTS buffer
129 * (interrupt occurs when write pointer passes interrupt pointer)
130 * - double-word aligned base linear address of the PEBS buffer
131 * - write pointer into the PEBS buffer
132 * - end linear address of the PEBS buffer (one byte beyond the end of
133 * the buffer)
134 * - interrupt pointer into PEBS buffer
135 * (interrupt occurs when write pointer passes interrupt pointer)
136 * - value to which counter is reset following counter overflow
137 *
138 * Later architectures use 64bit pointers throughout, whereas earlier
139 * architectures use 32bit pointers in 32bit mode.
140 *
141 *
142 * We compute the base address for the first 8 fields based on:
143 * - the field size stored in the DS configuration
144 * - the relative field position
145 * - an offset giving the start of the respective region
146 *
147 * This offset is further used to index various arrays holding
148 * information for BTS and PEBS at the respective index.
149 *
150 * On later 32bit processors, we only access the lower 32bit of the
151 * 64bit pointer fields. The upper halves will be zeroed out.
152 */
153
154enum ds_field {
155 ds_buffer_base = 0,
156 ds_index,
157 ds_absolute_maximum,
158 ds_interrupt_threshold,
159};
160
161enum ds_qualifier {
162 ds_bts = 0,
163 ds_pebs
164};
165
166static inline unsigned long
167ds_get(const unsigned char *base, enum ds_qualifier qual, enum ds_field field)
168{
169 base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual)));
170 return *(unsigned long *)base;
171}
172
173static inline void
174ds_set(unsigned char *base, enum ds_qualifier qual, enum ds_field field,
175 unsigned long value)
176{
177 base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual)));
178 (*(unsigned long *)base) = value;
179}
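/*
 * Worked example of the addressing used by ds_get()/ds_set(), assuming a
 * 64-bit layout (sizeof_ptr_field == 8): the PEBS index field, i.e.
 * qual == ds_pebs (1) and field == ds_index (1), lives at offset
 * 8 * (1 + 4 * 1) == 40 bytes into the DS area; a 32-bit layout with
 * 4-byte fields would put it at offset 20.
 */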
180
181
182/*
183 * Locking is done only for allocating BTS or PEBS resources.
184 */
185static DEFINE_SPINLOCK(ds_lock);
186
187/*
188 * We either support (system-wide) per-cpu or per-thread allocation.
189 * We distinguish the two based on the task_struct pointer, where a
190 * NULL pointer indicates per-cpu allocation for the current cpu.
191 *
192 * Allocations are use-counted. As soon as resources are allocated,
193 * further allocations must be of the same type (per-cpu or
194 * per-thread). We model this by counting allocations (i.e. the number
195 * of tracers of a certain type) for one type negatively:
196 * =0 no tracers
197 * >0 number of per-thread tracers
198 * <0 number of per-cpu tracers
199 *
 200 * The tracer count essentially gives the number of ds contexts for a
 201 * certain type of allocation.
202 */
203static atomic_t tracers = ATOMIC_INIT(0);
204
205static inline int get_tracer(struct task_struct *task)
206{
207 int error;
208
209 spin_lock_irq(&ds_lock);
210
211 if (task) {
212 error = -EPERM;
213 if (atomic_read(&tracers) < 0)
214 goto out;
215 atomic_inc(&tracers);
216 } else {
217 error = -EPERM;
218 if (atomic_read(&tracers) > 0)
219 goto out;
220 atomic_dec(&tracers);
221 }
222
223 error = 0;
224out:
225 spin_unlock_irq(&ds_lock);
226 return error;
227}
228
229static inline void put_tracer(struct task_struct *task)
230{
231 if (task)
232 atomic_dec(&tracers);
233 else
234 atomic_inc(&tracers);
235}
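/*
 * Illustration of the counting scheme described above: three per-thread
 * tracers leave 'tracers' at +3, two per-cpu tracers leave it at -2, and
 * while the count is non-zero a request of the other kind is refused
 * with -EPERM in get_tracer().
 */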
236
237/*
238 * The DS context is either attached to a thread or to a cpu:
239 * - in the former case, the thread_struct contains a pointer to the
240 * attached context.
241 * - in the latter case, we use a static array of per-cpu context
242 * pointers.
243 *
244 * Contexts are use-counted. They are allocated on first access and
245 * deallocated when the last user puts the context.
246 */
247struct ds_context {
248 /* The DS configuration; goes into MSR_IA32_DS_AREA: */
249 unsigned char ds[MAX_SIZEOF_DS];
250
251 /* The owner of the BTS and PEBS configuration, respectively: */
252 struct bts_tracer *bts_master;
253 struct pebs_tracer *pebs_master;
254
255 /* Use count: */
256 unsigned long count;
257
258 /* Pointer to the context pointer field: */
259 struct ds_context **this;
260
261 /* The traced task; NULL for cpu tracing: */
262 struct task_struct *task;
263
264 /* The traced cpu; only valid if task is NULL: */
265 int cpu;
266};
267
268static DEFINE_PER_CPU(struct ds_context *, cpu_ds_context);
269
270
271static struct ds_context *ds_get_context(struct task_struct *task, int cpu)
272{
273 struct ds_context **p_context =
274 (task ? &task->thread.ds_ctx : &per_cpu(cpu_ds_context, cpu));
275 struct ds_context *context = NULL;
276 struct ds_context *new_context = NULL;
277
278 /* Chances are small that we already have a context. */
279 new_context = kzalloc(sizeof(*new_context), GFP_KERNEL);
280 if (!new_context)
281 return NULL;
282
283 spin_lock_irq(&ds_lock);
284
285 context = *p_context;
286 if (likely(!context)) {
287 context = new_context;
288
289 context->this = p_context;
290 context->task = task;
291 context->cpu = cpu;
292 context->count = 0;
293
294 *p_context = context;
295 }
296
297 context->count++;
298
299 spin_unlock_irq(&ds_lock);
300
301 if (context != new_context)
302 kfree(new_context);
303
304 return context;
305}
306
307static void ds_put_context(struct ds_context *context)
308{
309 struct task_struct *task;
310 unsigned long irq;
311
312 if (!context)
313 return;
314
315 spin_lock_irqsave(&ds_lock, irq);
316
317 if (--context->count) {
318 spin_unlock_irqrestore(&ds_lock, irq);
319 return;
320 }
321
322 *(context->this) = NULL;
323
324 task = context->task;
325
326 if (task)
327 clear_tsk_thread_flag(task, TIF_DS_AREA_MSR);
328
329 /*
330 * We leave the (now dangling) pointer to the DS configuration in
331 * the DS_AREA msr. This is as good or as bad as replacing it with
332 * NULL - the hardware would crash if we enabled tracing.
333 *
334 * This saves us some problems with having to write an msr on a
335 * different cpu while preventing others from doing the same for the
336 * next context for that same cpu.
337 */
338
339 spin_unlock_irqrestore(&ds_lock, irq);
340
341 /* The context might still be in use for context switching. */
342 if (task && (task != current))
343 wait_task_context_switch(task);
344
345 kfree(context);
346}
347
348static void ds_install_ds_area(struct ds_context *context)
349{
350 unsigned long ds;
351
352 ds = (unsigned long)context->ds;
353
354 /*
355 * There is a race between the bts master and the pebs master.
356 *
357 * The thread/cpu access is synchronized via get/put_cpu() for
358 * task tracing and via wrmsr_on_cpu for cpu tracing.
359 *
360 * If bts and pebs are collected for the same task or same cpu,
 361	 * the same configuration is written twice.
362 */
363 if (context->task) {
364 get_cpu();
365 if (context->task == current)
366 wrmsrl(MSR_IA32_DS_AREA, ds);
367 set_tsk_thread_flag(context->task, TIF_DS_AREA_MSR);
368 put_cpu();
369 } else
370 wrmsr_on_cpu(context->cpu, MSR_IA32_DS_AREA,
371 (u32)((u64)ds), (u32)((u64)ds >> 32));
372}
373
374/*
375 * Call the tracer's callback on a buffer overflow.
376 *
377 * context: the ds context
378 * qual: the buffer type
379 */
380static void ds_overflow(struct ds_context *context, enum ds_qualifier qual)
381{
382 switch (qual) {
383 case ds_bts:
384 if (context->bts_master &&
385 context->bts_master->ovfl)
386 context->bts_master->ovfl(context->bts_master);
387 break;
388 case ds_pebs:
389 if (context->pebs_master &&
390 context->pebs_master->ovfl)
391 context->pebs_master->ovfl(context->pebs_master);
392 break;
393 }
394}
395
396
397/*
398 * Write raw data into the BTS or PEBS buffer.
399 *
400 * The remainder of any partially written record is zeroed out.
401 *
402 * context: the DS context
403 * qual: the buffer type
404 * record: the data to write
405 * size: the size of the data
406 */
407static int ds_write(struct ds_context *context, enum ds_qualifier qual,
408 const void *record, size_t size)
409{
410 int bytes_written = 0;
411
412 if (!record)
413 return -EINVAL;
414
415 while (size) {
416 unsigned long base, index, end, write_end, int_th;
417 unsigned long write_size, adj_write_size;
418
419 /*
420 * Write as much as possible without producing an
421 * overflow interrupt.
422 *
423 * Interrupt_threshold must either be
424 * - bigger than absolute_maximum or
425 * - point to a record between buffer_base and absolute_maximum
426 *
427 * Index points to a valid record.
428 */
429 base = ds_get(context->ds, qual, ds_buffer_base);
430 index = ds_get(context->ds, qual, ds_index);
431 end = ds_get(context->ds, qual, ds_absolute_maximum);
432 int_th = ds_get(context->ds, qual, ds_interrupt_threshold);
433
434 write_end = min(end, int_th);
435
436 /*
437 * If we are already beyond the interrupt threshold,
438 * we fill the entire buffer.
439 */
440 if (write_end <= index)
441 write_end = end;
442
443 if (write_end <= index)
444 break;
445
446 write_size = min((unsigned long) size, write_end - index);
447 memcpy((void *)index, record, write_size);
448
449 record = (const char *)record + write_size;
450 size -= write_size;
451 bytes_written += write_size;
452
453 adj_write_size = write_size / ds_cfg.sizeof_rec[qual];
454 adj_write_size *= ds_cfg.sizeof_rec[qual];
455
456 /* Zero out trailing bytes. */
457 memset((char *)index + write_size, 0,
458 adj_write_size - write_size);
459 index += adj_write_size;
460
461 if (index >= end)
462 index = base;
463 ds_set(context->ds, qual, ds_index, index);
464
465 if (index >= int_th)
466 ds_overflow(context, qual);
467 }
468
469 return bytes_written;
470}
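/*
 * Summary of the write path above: each iteration copies as much as fits
 * below min(absolute_maximum, interrupt_threshold), zero-pads any
 * partially written record up to the record size, wraps the index back
 * to buffer_base at the end of the buffer, and fires the overflow
 * callback once the index passes the interrupt threshold.
 */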
471
472
473/*
474 * Branch Trace Store (BTS) uses the following format. Different
475 * architectures vary in the size of those fields.
476 * - source linear address
477 * - destination linear address
478 * - flags
479 *
480 * Later architectures use 64bit pointers throughout, whereas earlier
481 * architectures use 32bit pointers in 32bit mode.
482 *
483 * We compute the base address for the fields based on:
484 * - the field size stored in the DS configuration
485 * - the relative field position
486 *
487 * In order to store additional information in the BTS buffer, we use
488 * a special source address to indicate that the record requires
489 * special interpretation.
490 *
491 * Netburst indicated via a bit in the flags field whether the branch
492 * was predicted; this is ignored.
493 *
494 * We use two levels of abstraction:
495 * - the raw data level defined here
496 * - an arch-independent level defined in ds.h
497 */
498
499enum bts_field {
500 bts_from,
501 bts_to,
502 bts_flags,
503
504 bts_qual = bts_from,
505 bts_clock = bts_to,
506 bts_pid = bts_flags,
507
508 bts_qual_mask = (bts_qual_max - 1),
509 bts_escape = ((unsigned long)-1 & ~bts_qual_mask)
510};
511
512static inline unsigned long bts_get(const char *base, unsigned long field)
513{
514 base += (ds_cfg.sizeof_ptr_field * field);
515 return *(unsigned long *)base;
516}
517
518static inline void bts_set(char *base, unsigned long field, unsigned long val)
519{
520 base += (ds_cfg.sizeof_ptr_field * field);
521 (*(unsigned long *)base) = val;
522}
523
524
525/*
526 * The raw BTS data is architecture dependent.
527 *
528 * For higher-level users, we give an arch-independent view.
529 * - ds.h defines struct bts_struct
530 * - bts_read translates one raw bts record into a bts_struct
531 * - bts_write translates one bts_struct into the raw format and
532 * writes it into the top of the parameter tracer's buffer.
533 *
534 * return: bytes read/written on success; -Eerrno, otherwise
535 */
536static int
537bts_read(struct bts_tracer *tracer, const void *at, struct bts_struct *out)
538{
539 if (!tracer)
540 return -EINVAL;
541
542 if (at < tracer->trace.ds.begin)
543 return -EINVAL;
544
545 if (tracer->trace.ds.end < (at + tracer->trace.ds.size))
546 return -EINVAL;
547
548 memset(out, 0, sizeof(*out));
549 if ((bts_get(at, bts_qual) & ~bts_qual_mask) == bts_escape) {
550 out->qualifier = (bts_get(at, bts_qual) & bts_qual_mask);
551 out->variant.event.clock = bts_get(at, bts_clock);
552 out->variant.event.pid = bts_get(at, bts_pid);
553 } else {
554 out->qualifier = bts_branch;
555 out->variant.lbr.from = bts_get(at, bts_from);
556 out->variant.lbr.to = bts_get(at, bts_to);
557
558 if (!out->variant.lbr.from && !out->variant.lbr.to)
559 out->qualifier = bts_invalid;
560 }
561
562 return ds_cfg.sizeof_rec[ds_bts];
563}
564
565static int bts_write(struct bts_tracer *tracer, const struct bts_struct *in)
566{
567 unsigned char raw[MAX_SIZEOF_BTS];
568
569 if (!tracer)
570 return -EINVAL;
571
572 if (MAX_SIZEOF_BTS < ds_cfg.sizeof_rec[ds_bts])
573 return -EOVERFLOW;
574
575 switch (in->qualifier) {
576 case bts_invalid:
577 bts_set(raw, bts_from, 0);
578 bts_set(raw, bts_to, 0);
579 bts_set(raw, bts_flags, 0);
580 break;
581 case bts_branch:
582 bts_set(raw, bts_from, in->variant.lbr.from);
583 bts_set(raw, bts_to, in->variant.lbr.to);
584 bts_set(raw, bts_flags, 0);
585 break;
586 case bts_task_arrives:
587 case bts_task_departs:
588 bts_set(raw, bts_qual, (bts_escape | in->qualifier));
589 bts_set(raw, bts_clock, in->variant.event.clock);
590 bts_set(raw, bts_pid, in->variant.event.pid);
591 break;
592 default:
593 return -EINVAL;
594 }
595
596 return ds_write(tracer->ds.context, ds_bts, raw,
597 ds_cfg.sizeof_rec[ds_bts]);
598}
599
600
601static void ds_write_config(struct ds_context *context,
602 struct ds_trace *cfg, enum ds_qualifier qual)
603{
604 unsigned char *ds = context->ds;
605
606 ds_set(ds, qual, ds_buffer_base, (unsigned long)cfg->begin);
607 ds_set(ds, qual, ds_index, (unsigned long)cfg->top);
608 ds_set(ds, qual, ds_absolute_maximum, (unsigned long)cfg->end);
609 ds_set(ds, qual, ds_interrupt_threshold, (unsigned long)cfg->ith);
610}
611
612static void ds_read_config(struct ds_context *context,
613 struct ds_trace *cfg, enum ds_qualifier qual)
614{
615 unsigned char *ds = context->ds;
616
617 cfg->begin = (void *)ds_get(ds, qual, ds_buffer_base);
618 cfg->top = (void *)ds_get(ds, qual, ds_index);
619 cfg->end = (void *)ds_get(ds, qual, ds_absolute_maximum);
620 cfg->ith = (void *)ds_get(ds, qual, ds_interrupt_threshold);
621}
622
623static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual,
624 void *base, size_t size, size_t ith,
625 unsigned int flags) {
626 unsigned long buffer, adj;
627
628 /*
629 * Adjust the buffer address and size to meet alignment
630 * constraints:
631 * - buffer is double-word aligned
632 * - size is multiple of record size
633 *
634 * We checked the size at the very beginning; we have enough
635 * space to do the adjustment.
636 */
637 buffer = (unsigned long)base;
638
639 adj = ALIGN(buffer, DS_ALIGNMENT) - buffer;
640 buffer += adj;
641 size -= adj;
642
643 trace->n = size / ds_cfg.sizeof_rec[qual];
644 trace->size = ds_cfg.sizeof_rec[qual];
645
646 size = (trace->n * trace->size);
647
648 trace->begin = (void *)buffer;
649 trace->top = trace->begin;
650 trace->end = (void *)(buffer + size);
651 /*
652 * The value for 'no threshold' is -1, which will set the
653 * threshold outside of the buffer, just like we want it.
654 */
655 ith *= ds_cfg.sizeof_rec[qual];
656 trace->ith = (void *)(buffer + size - ith);
657
658 trace->flags = flags;
659}
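/*
 * Small worked example of the sizing above, assuming 24-byte BTS records
 * (MAX_SIZEOF_BTS): a 1000-byte buffer starting 4 bytes off the 8-byte
 * DS_ALIGNMENT gets adj == 4, leaving 996 usable bytes, which yields
 * n == 41 records (984 bytes); the threshold 'ith' is then counted in
 * records back from the end of that trimmed region.
 */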
660
661
662static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace,
663 enum ds_qualifier qual, struct task_struct *task,
664 int cpu, void *base, size_t size, size_t th)
665{
666 struct ds_context *context;
667 int error;
668 size_t req_size;
669
670 error = -EOPNOTSUPP;
671 if (!ds_cfg.sizeof_rec[qual])
672 goto out;
673
674 error = -EINVAL;
675 if (!base)
676 goto out;
677
678 req_size = ds_cfg.sizeof_rec[qual];
679 /* We might need space for alignment adjustments. */
680 if (!IS_ALIGNED((unsigned long)base, DS_ALIGNMENT))
681 req_size += DS_ALIGNMENT;
682
683 error = -EINVAL;
684 if (size < req_size)
685 goto out;
686
687 if (th != (size_t)-1) {
688 th *= ds_cfg.sizeof_rec[qual];
689
690 error = -EINVAL;
691 if (size <= th)
692 goto out;
693 }
694
695 tracer->buffer = base;
696 tracer->size = size;
697
698 error = -ENOMEM;
699 context = ds_get_context(task, cpu);
700 if (!context)
701 goto out;
702 tracer->context = context;
703
704 /*
705 * Defer any tracer-specific initialization work for the context until
706 * context ownership has been clarified.
707 */
708
709 error = 0;
710 out:
711 return error;
712}
713
714static struct bts_tracer *ds_request_bts(struct task_struct *task, int cpu,
715 void *base, size_t size,
716 bts_ovfl_callback_t ovfl, size_t th,
717 unsigned int flags)
718{
719 struct bts_tracer *tracer;
720 int error;
721
722 /* Buffer overflow notification is not yet implemented. */
723 error = -EOPNOTSUPP;
724 if (ovfl)
725 goto out;
726
727 error = get_tracer(task);
728 if (error < 0)
729 goto out;
730
731 error = -ENOMEM;
732 tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
733 if (!tracer)
734 goto out_put_tracer;
735 tracer->ovfl = ovfl;
736
737 /* Do some more error checking and acquire a tracing context. */
738 error = ds_request(&tracer->ds, &tracer->trace.ds,
739 ds_bts, task, cpu, base, size, th);
740 if (error < 0)
741 goto out_tracer;
742
743 /* Claim the bts part of the tracing context we acquired above. */
744 spin_lock_irq(&ds_lock);
745
746 error = -EPERM;
747 if (tracer->ds.context->bts_master)
748 goto out_unlock;
749 tracer->ds.context->bts_master = tracer;
750
751 spin_unlock_irq(&ds_lock);
752
753 /*
754 * Now that we own the bts part of the context, let's complete the
755 * initialization for that part.
756 */
757 ds_init_ds_trace(&tracer->trace.ds, ds_bts, base, size, th, flags);
758 ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
759 ds_install_ds_area(tracer->ds.context);
760
761 tracer->trace.read = bts_read;
762 tracer->trace.write = bts_write;
763
764 /* Start tracing. */
765 ds_resume_bts(tracer);
766
767 return tracer;
768
769 out_unlock:
770 spin_unlock_irq(&ds_lock);
771 ds_put_context(tracer->ds.context);
772 out_tracer:
773 kfree(tracer);
774 out_put_tracer:
775 put_tracer(task);
776 out:
777 return ERR_PTR(error);
778}
779
780struct bts_tracer *ds_request_bts_task(struct task_struct *task,
781 void *base, size_t size,
782 bts_ovfl_callback_t ovfl,
783 size_t th, unsigned int flags)
784{
785 return ds_request_bts(task, 0, base, size, ovfl, th, flags);
786}
787
788struct bts_tracer *ds_request_bts_cpu(int cpu, void *base, size_t size,
789 bts_ovfl_callback_t ovfl,
790 size_t th, unsigned int flags)
791{
792 return ds_request_bts(NULL, cpu, base, size, ovfl, th, flags);
793}
794
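For orientation, a minimal sketch of how a client would drive the task-tracing variant of the BTS interface above (the same calls the selftest deleted further down exercises); the buffer size and the error handling here are assumptions, not taken from this file:

static int bts_usage_sketch(void)
{
	unsigned char buffer[1024];
	struct bts_tracer *tracer;
	const struct bts_trace *trace;

	/* No overflow callback (not implemented), no interrupt threshold. */
	tracer = ds_request_bts_task(current, buffer, sizeof(buffer),
				     NULL, (size_t)-1, BTS_KERNEL);
	if (IS_ERR(tracer))
		return PTR_ERR(tracer);

	/* ... let the traced task run and collect branch records ... */

	ds_suspend_bts(tracer);

	/* begin/top/end in trace->ds describe the collected records. */
	trace = ds_read_bts(tracer);
	/* Walk [trace->ds.begin, trace->ds.top) via trace->read(). */

	ds_release_bts(tracer);
	return 0;
}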
795static struct pebs_tracer *ds_request_pebs(struct task_struct *task, int cpu,
796 void *base, size_t size,
797 pebs_ovfl_callback_t ovfl, size_t th,
798 unsigned int flags)
799{
800 struct pebs_tracer *tracer;
801 int error;
802
803 /* Buffer overflow notification is not yet implemented. */
804 error = -EOPNOTSUPP;
805 if (ovfl)
806 goto out;
807
808 error = get_tracer(task);
809 if (error < 0)
810 goto out;
811
812 error = -ENOMEM;
813 tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
814 if (!tracer)
815 goto out_put_tracer;
816 tracer->ovfl = ovfl;
817
818 /* Do some more error checking and acquire a tracing context. */
819 error = ds_request(&tracer->ds, &tracer->trace.ds,
820 ds_pebs, task, cpu, base, size, th);
821 if (error < 0)
822 goto out_tracer;
823
824 /* Claim the pebs part of the tracing context we acquired above. */
825 spin_lock_irq(&ds_lock);
826
827 error = -EPERM;
828 if (tracer->ds.context->pebs_master)
829 goto out_unlock;
830 tracer->ds.context->pebs_master = tracer;
831
832 spin_unlock_irq(&ds_lock);
833
834 /*
835 * Now that we own the pebs part of the context, let's complete the
836 * initialization for that part.
837 */
838 ds_init_ds_trace(&tracer->trace.ds, ds_pebs, base, size, th, flags);
839 ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_pebs);
840 ds_install_ds_area(tracer->ds.context);
841
842 /* Start tracing. */
843 ds_resume_pebs(tracer);
844
845 return tracer;
846
847 out_unlock:
848 spin_unlock_irq(&ds_lock);
849 ds_put_context(tracer->ds.context);
850 out_tracer:
851 kfree(tracer);
852 out_put_tracer:
853 put_tracer(task);
854 out:
855 return ERR_PTR(error);
856}
857
858struct pebs_tracer *ds_request_pebs_task(struct task_struct *task,
859 void *base, size_t size,
860 pebs_ovfl_callback_t ovfl,
861 size_t th, unsigned int flags)
862{
863 return ds_request_pebs(task, 0, base, size, ovfl, th, flags);
864}
865
866struct pebs_tracer *ds_request_pebs_cpu(int cpu, void *base, size_t size,
867 pebs_ovfl_callback_t ovfl,
868 size_t th, unsigned int flags)
869{
870 return ds_request_pebs(NULL, cpu, base, size, ovfl, th, flags);
871}
872
873static void ds_free_bts(struct bts_tracer *tracer)
874{
875 struct task_struct *task;
876
877 task = tracer->ds.context->task;
878
879 WARN_ON_ONCE(tracer->ds.context->bts_master != tracer);
880 tracer->ds.context->bts_master = NULL;
881
882 /* Make sure tracing stopped and the tracer is not in use. */
883 if (task && (task != current))
884 wait_task_context_switch(task);
885
886 ds_put_context(tracer->ds.context);
887 put_tracer(task);
888
889 kfree(tracer);
890}
891
892void ds_release_bts(struct bts_tracer *tracer)
893{
894 might_sleep();
895
896 if (!tracer)
897 return;
898
899 ds_suspend_bts(tracer);
900 ds_free_bts(tracer);
901}
902
903int ds_release_bts_noirq(struct bts_tracer *tracer)
904{
905 struct task_struct *task;
906 unsigned long irq;
907 int error;
908
909 if (!tracer)
910 return 0;
911
912 task = tracer->ds.context->task;
913
914 local_irq_save(irq);
915
916 error = -EPERM;
917 if (!task &&
918 (tracer->ds.context->cpu != smp_processor_id()))
919 goto out;
920
921 error = -EPERM;
922 if (task && (task != current))
923 goto out;
924
925 ds_suspend_bts_noirq(tracer);
926 ds_free_bts(tracer);
927
928 error = 0;
929 out:
930 local_irq_restore(irq);
931 return error;
932}
933
934static void update_task_debugctlmsr(struct task_struct *task,
935 unsigned long debugctlmsr)
936{
937 task->thread.debugctlmsr = debugctlmsr;
938
939 get_cpu();
940 if (task == current)
941 update_debugctlmsr(debugctlmsr);
942 put_cpu();
943}
944
945void ds_suspend_bts(struct bts_tracer *tracer)
946{
947 struct task_struct *task;
948 unsigned long debugctlmsr;
949 int cpu;
950
951 if (!tracer)
952 return;
953
954 tracer->flags = 0;
955
956 task = tracer->ds.context->task;
957 cpu = tracer->ds.context->cpu;
958
959 WARN_ON(!task && irqs_disabled());
960
961 debugctlmsr = (task ?
962 task->thread.debugctlmsr :
963 get_debugctlmsr_on_cpu(cpu));
964 debugctlmsr &= ~BTS_CONTROL;
965
966 if (task)
967 update_task_debugctlmsr(task, debugctlmsr);
968 else
969 update_debugctlmsr_on_cpu(cpu, debugctlmsr);
970}
971
972int ds_suspend_bts_noirq(struct bts_tracer *tracer)
973{
974 struct task_struct *task;
975 unsigned long debugctlmsr, irq;
976 int cpu, error = 0;
977
978 if (!tracer)
979 return 0;
980
981 tracer->flags = 0;
982
983 task = tracer->ds.context->task;
984 cpu = tracer->ds.context->cpu;
985
986 local_irq_save(irq);
987
988 error = -EPERM;
989 if (!task && (cpu != smp_processor_id()))
990 goto out;
991
992 debugctlmsr = (task ?
993 task->thread.debugctlmsr :
994 get_debugctlmsr());
995 debugctlmsr &= ~BTS_CONTROL;
996
997 if (task)
998 update_task_debugctlmsr(task, debugctlmsr);
999 else
1000 update_debugctlmsr(debugctlmsr);
1001
1002 error = 0;
1003 out:
1004 local_irq_restore(irq);
1005 return error;
1006}
1007
1008static unsigned long ds_bts_control(struct bts_tracer *tracer)
1009{
1010 unsigned long control;
1011
1012 control = ds_cfg.ctl[dsf_bts];
1013 if (!(tracer->trace.ds.flags & BTS_KERNEL))
1014 control |= ds_cfg.ctl[dsf_bts_kernel];
1015 if (!(tracer->trace.ds.flags & BTS_USER))
1016 control |= ds_cfg.ctl[dsf_bts_user];
1017
1018 return control;
1019}
1020
1021void ds_resume_bts(struct bts_tracer *tracer)
1022{
1023 struct task_struct *task;
1024 unsigned long debugctlmsr;
1025 int cpu;
1026
1027 if (!tracer)
1028 return;
1029
1030 tracer->flags = tracer->trace.ds.flags;
1031
1032 task = tracer->ds.context->task;
1033 cpu = tracer->ds.context->cpu;
1034
1035 WARN_ON(!task && irqs_disabled());
1036
1037 debugctlmsr = (task ?
1038 task->thread.debugctlmsr :
1039 get_debugctlmsr_on_cpu(cpu));
1040 debugctlmsr |= ds_bts_control(tracer);
1041
1042 if (task)
1043 update_task_debugctlmsr(task, debugctlmsr);
1044 else
1045 update_debugctlmsr_on_cpu(cpu, debugctlmsr);
1046}
1047
1048int ds_resume_bts_noirq(struct bts_tracer *tracer)
1049{
1050 struct task_struct *task;
1051 unsigned long debugctlmsr, irq;
1052 int cpu, error = 0;
1053
1054 if (!tracer)
1055 return 0;
1056
1057 tracer->flags = tracer->trace.ds.flags;
1058
1059 task = tracer->ds.context->task;
1060 cpu = tracer->ds.context->cpu;
1061
1062 local_irq_save(irq);
1063
1064 error = -EPERM;
1065 if (!task && (cpu != smp_processor_id()))
1066 goto out;
1067
1068 debugctlmsr = (task ?
1069 task->thread.debugctlmsr :
1070 get_debugctlmsr());
1071 debugctlmsr |= ds_bts_control(tracer);
1072
1073 if (task)
1074 update_task_debugctlmsr(task, debugctlmsr);
1075 else
1076 update_debugctlmsr(debugctlmsr);
1077
1078 error = 0;
1079 out:
1080 local_irq_restore(irq);
1081 return error;
1082}
1083
1084static void ds_free_pebs(struct pebs_tracer *tracer)
1085{
1086 struct task_struct *task;
1087
1088 task = tracer->ds.context->task;
1089
1090 WARN_ON_ONCE(tracer->ds.context->pebs_master != tracer);
1091 tracer->ds.context->pebs_master = NULL;
1092
1093 ds_put_context(tracer->ds.context);
1094 put_tracer(task);
1095
1096 kfree(tracer);
1097}
1098
1099void ds_release_pebs(struct pebs_tracer *tracer)
1100{
1101 might_sleep();
1102
1103 if (!tracer)
1104 return;
1105
1106 ds_suspend_pebs(tracer);
1107 ds_free_pebs(tracer);
1108}
1109
1110int ds_release_pebs_noirq(struct pebs_tracer *tracer)
1111{
1112 struct task_struct *task;
1113 unsigned long irq;
1114 int error;
1115
1116 if (!tracer)
1117 return 0;
1118
1119 task = tracer->ds.context->task;
1120
1121 local_irq_save(irq);
1122
1123 error = -EPERM;
1124 if (!task &&
1125 (tracer->ds.context->cpu != smp_processor_id()))
1126 goto out;
1127
1128 error = -EPERM;
1129 if (task && (task != current))
1130 goto out;
1131
1132 ds_suspend_pebs_noirq(tracer);
1133 ds_free_pebs(tracer);
1134
1135 error = 0;
1136 out:
1137 local_irq_restore(irq);
1138 return error;
1139}
1140
1141void ds_suspend_pebs(struct pebs_tracer *tracer)
1142{
1143
1144}
1145
1146int ds_suspend_pebs_noirq(struct pebs_tracer *tracer)
1147{
1148 return 0;
1149}
1150
1151void ds_resume_pebs(struct pebs_tracer *tracer)
1152{
1153
1154}
1155
1156int ds_resume_pebs_noirq(struct pebs_tracer *tracer)
1157{
1158 return 0;
1159}
1160
1161const struct bts_trace *ds_read_bts(struct bts_tracer *tracer)
1162{
1163 if (!tracer)
1164 return NULL;
1165
1166 ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
1167 return &tracer->trace;
1168}
1169
1170const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer)
1171{
1172 if (!tracer)
1173 return NULL;
1174
1175 ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_pebs);
1176
1177 tracer->trace.counters = ds_cfg.nr_counter_reset;
1178 memcpy(tracer->trace.counter_reset,
1179 tracer->ds.context->ds +
1180 (NUM_DS_PTR_FIELDS * ds_cfg.sizeof_ptr_field),
1181 ds_cfg.nr_counter_reset * PEBS_RESET_FIELD_SIZE);
1182
1183 return &tracer->trace;
1184}
1185
1186int ds_reset_bts(struct bts_tracer *tracer)
1187{
1188 if (!tracer)
1189 return -EINVAL;
1190
1191 tracer->trace.ds.top = tracer->trace.ds.begin;
1192
1193 ds_set(tracer->ds.context->ds, ds_bts, ds_index,
1194 (unsigned long)tracer->trace.ds.top);
1195
1196 return 0;
1197}
1198
1199int ds_reset_pebs(struct pebs_tracer *tracer)
1200{
1201 if (!tracer)
1202 return -EINVAL;
1203
1204 tracer->trace.ds.top = tracer->trace.ds.begin;
1205
1206 ds_set(tracer->ds.context->ds, ds_pebs, ds_index,
1207 (unsigned long)tracer->trace.ds.top);
1208
1209 return 0;
1210}
1211
1212int ds_set_pebs_reset(struct pebs_tracer *tracer,
1213 unsigned int counter, u64 value)
1214{
1215 if (!tracer)
1216 return -EINVAL;
1217
1218 if (ds_cfg.nr_counter_reset < counter)
1219 return -EINVAL;
1220
1221 *(u64 *)(tracer->ds.context->ds +
1222 (NUM_DS_PTR_FIELDS * ds_cfg.sizeof_ptr_field) +
1223 (counter * PEBS_RESET_FIELD_SIZE)) = value;
1224
1225 return 0;
1226}
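The offset arithmetic above and in ds_read_pebs() implies the following DS-area layout; the concrete value of NUM_DS_PTR_FIELDS is defined earlier in ds.c and is assumed here, not shown in this excerpt:

/*
 * Layout sketch (assuming a DTES64-capable CPU, i.e. 8-byte fields):
 *
 *   offset 0:
 *     NUM_DS_PTR_FIELDS * sizeof_ptr_field bytes of management fields
 *     (BTS and PEBS buffer base / index / absolute maximum / interrupt
 *     threshold pointers)
 *   then nr_counter_reset slots of PEBS_RESET_FIELD_SIZE bytes each,
 *     one per counter, which is exactly the slot ds_set_pebs_reset()
 *     writes:
 *
 *   reset_of(counter) = ds + NUM_DS_PTR_FIELDS * sizeof_ptr_field
 *                          + counter * PEBS_RESET_FIELD_SIZE
 */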
1227
1228static const struct ds_configuration ds_cfg_netburst = {
1229 .name = "Netburst",
1230 .ctl[dsf_bts] = (1 << 2) | (1 << 3),
1231 .ctl[dsf_bts_kernel] = (1 << 5),
1232 .ctl[dsf_bts_user] = (1 << 6),
1233 .nr_counter_reset = 1,
1234};
1235static const struct ds_configuration ds_cfg_pentium_m = {
1236 .name = "Pentium M",
1237 .ctl[dsf_bts] = (1 << 6) | (1 << 7),
1238 .nr_counter_reset = 1,
1239};
1240static const struct ds_configuration ds_cfg_core2_atom = {
1241 .name = "Core 2/Atom",
1242 .ctl[dsf_bts] = (1 << 6) | (1 << 7),
1243 .ctl[dsf_bts_kernel] = (1 << 9),
1244 .ctl[dsf_bts_user] = (1 << 10),
1245 .nr_counter_reset = 1,
1246};
1247static const struct ds_configuration ds_cfg_core_i7 = {
1248 .name = "Core i7",
1249 .ctl[dsf_bts] = (1 << 6) | (1 << 7),
1250 .ctl[dsf_bts_kernel] = (1 << 9),
1251 .ctl[dsf_bts_user] = (1 << 10),
1252 .nr_counter_reset = 4,
1253};
1254
1255static void
1256ds_configure(const struct ds_configuration *cfg,
1257 struct cpuinfo_x86 *cpu)
1258{
1259 unsigned long nr_pebs_fields = 0;
1260
1261 printk(KERN_INFO "[ds] using %s configuration\n", cfg->name);
1262
1263#ifdef __i386__
1264 nr_pebs_fields = 10;
1265#else
1266 nr_pebs_fields = 18;
1267#endif
1268
1269 /*
1270 * Starting with version 2, architectural performance
1271 * monitoring supports a format specifier.
1272 */
1273 if ((cpuid_eax(0xa) & 0xff) > 1) {
1274 unsigned long perf_capabilities, format;
1275
1276 rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_capabilities);
1277
1278 format = (perf_capabilities >> 8) & 0xf;
1279
1280 switch (format) {
1281 case 0:
1282 nr_pebs_fields = 18;
1283 break;
1284 case 1:
1285 nr_pebs_fields = 22;
1286 break;
1287 default:
1288 printk(KERN_INFO
1289 "[ds] unknown PEBS format: %lu\n", format);
1290 nr_pebs_fields = 0;
1291 break;
1292 }
1293 }
1294
1295 memset(&ds_cfg, 0, sizeof(ds_cfg));
1296 ds_cfg = *cfg;
1297
1298 ds_cfg.sizeof_ptr_field =
1299 (cpu_has(cpu, X86_FEATURE_DTES64) ? 8 : 4);
1300
1301 ds_cfg.sizeof_rec[ds_bts] = ds_cfg.sizeof_ptr_field * 3;
1302 ds_cfg.sizeof_rec[ds_pebs] = ds_cfg.sizeof_ptr_field * nr_pebs_fields;
1303
1304 if (!cpu_has(cpu, X86_FEATURE_BTS)) {
1305 ds_cfg.sizeof_rec[ds_bts] = 0;
1306 printk(KERN_INFO "[ds] bts not available\n");
1307 }
1308 if (!cpu_has(cpu, X86_FEATURE_PEBS)) {
1309 ds_cfg.sizeof_rec[ds_pebs] = 0;
1310 printk(KERN_INFO "[ds] pebs not available\n");
1311 }
1312
1313 printk(KERN_INFO "[ds] sizes: address: %u bit, ",
1314 8 * ds_cfg.sizeof_ptr_field);
1315 printk("bts/pebs record: %u/%u bytes\n",
1316 ds_cfg.sizeof_rec[ds_bts], ds_cfg.sizeof_rec[ds_pebs]);
1317
1318 WARN_ON_ONCE(MAX_PEBS_COUNTERS < ds_cfg.nr_counter_reset);
1319}
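As a worked example of the sizing above (illustrative, not part of this file), take a 64-bit CPU with X86_FEATURE_DTES64 and PEBS format 0:

/*
 * sizeof_ptr_field    = 8
 * sizeof_rec[ds_bts]  = 3 * 8  = 24 bytes  (branch from, branch to, flags)
 * sizeof_rec[ds_pebs] = 18 * 8 = 144 bytes
 *
 * With PEBS format 1 the record grows to 22 * 8 = 176 bytes. The 24-byte
 * BTS record size is also why the selftest below uses 24 as its
 * "single bts entry" buffer size.
 */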
1320
1321void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
1322{
1323 /* Only configure the first cpu. Others are identical. */
1324 if (ds_cfg.name)
1325 return;
1326
1327 switch (c->x86) {
1328 case 0x6:
1329 switch (c->x86_model) {
1330 case 0x9:
1331 case 0xd: /* Pentium M */
1332 ds_configure(&ds_cfg_pentium_m, c);
1333 break;
1334 case 0xf:
1335 case 0x17: /* Core2 */
1336 case 0x1c: /* Atom */
1337 ds_configure(&ds_cfg_core2_atom, c);
1338 break;
1339 case 0x1a: /* Core i7 */
1340 ds_configure(&ds_cfg_core_i7, c);
1341 break;
1342 default:
1343 /* Sorry, don't know about them. */
1344 break;
1345 }
1346 break;
1347 case 0xf:
1348 switch (c->x86_model) {
1349 case 0x0:
1350 case 0x1:
1351 case 0x2: /* Netburst */
1352 ds_configure(&ds_cfg_netburst, c);
1353 break;
1354 default:
1355 /* Sorry, don't know about them. */
1356 break;
1357 }
1358 break;
1359 default:
1360 /* Sorry, don't know about them. */
1361 break;
1362 }
1363}
1364
1365static inline void ds_take_timestamp(struct ds_context *context,
1366 enum bts_qualifier qualifier,
1367 struct task_struct *task)
1368{
1369 struct bts_tracer *tracer = context->bts_master;
1370 struct bts_struct ts;
1371
1372 /* Prevent compilers from reading the tracer pointer twice. */
1373 barrier();
1374
1375 if (!tracer || !(tracer->flags & BTS_TIMESTAMPS))
1376 return;
1377
1378 memset(&ts, 0, sizeof(ts));
1379 ts.qualifier = qualifier;
1380 ts.variant.event.clock = trace_clock_global();
1381 ts.variant.event.pid = task->pid;
1382
1383 bts_write(tracer, &ts);
1384}
1385
1386/*
1387 * Change the DS configuration from tracing prev to tracing next.
1388 */
1389void ds_switch_to(struct task_struct *prev, struct task_struct *next)
1390{
1391 struct ds_context *prev_ctx = prev->thread.ds_ctx;
1392 struct ds_context *next_ctx = next->thread.ds_ctx;
1393 unsigned long debugctlmsr = next->thread.debugctlmsr;
1394
1395 /* Make sure all data is read before we start. */
1396 barrier();
1397
1398 if (prev_ctx) {
1399 update_debugctlmsr(0);
1400
1401 ds_take_timestamp(prev_ctx, bts_task_departs, prev);
1402 }
1403
1404 if (next_ctx) {
1405 ds_take_timestamp(next_ctx, bts_task_arrives, next);
1406
1407 wrmsrl(MSR_IA32_DS_AREA, (unsigned long)next_ctx->ds);
1408 }
1409
1410 update_debugctlmsr(debugctlmsr);
1411}
1412
1413static __init int ds_selftest(void)
1414{
1415 if (ds_cfg.sizeof_rec[ds_bts]) {
1416 int error;
1417
1418 error = ds_selftest_bts();
1419 if (error) {
1420 WARN(1, "[ds] selftest failed. disabling bts.\n");
1421 ds_cfg.sizeof_rec[ds_bts] = 0;
1422 }
1423 }
1424
1425 if (ds_cfg.sizeof_rec[ds_pebs]) {
1426 int error;
1427
1428 error = ds_selftest_pebs();
1429 if (error) {
1430 WARN(1, "[ds] selftest failed. disabling pebs.\n");
1431 ds_cfg.sizeof_rec[ds_pebs] = 0;
1432 }
1433 }
1434
1435 return 0;
1436}
1437device_initcall(ds_selftest);
diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c
deleted file mode 100644
index 6bc7c199ab99..000000000000
--- a/arch/x86/kernel/ds_selftest.c
+++ /dev/null
@@ -1,408 +0,0 @@
1/*
2 * Debug Store support - selftest
3 *
4 *
5 * Copyright (C) 2009 Intel Corporation.
6 * Markus Metzger <markus.t.metzger@intel.com>, 2009
7 */
8
9#include "ds_selftest.h"
10
11#include <linux/kernel.h>
12#include <linux/string.h>
13#include <linux/smp.h>
14#include <linux/cpu.h>
15
16#include <asm/ds.h>
17
18
19#define BUFFER_SIZE 521 /* Intentionally chose an odd size. */
20#define SMALL_BUFFER_SIZE 24 /* A single bts entry. */
21
22struct ds_selftest_bts_conf {
23 struct bts_tracer *tracer;
24 int error;
25 int (*suspend)(struct bts_tracer *);
26 int (*resume)(struct bts_tracer *);
27};
28
29static int ds_selftest_bts_consistency(const struct bts_trace *trace)
30{
31 int error = 0;
32
33 if (!trace) {
34 printk(KERN_CONT "failed to access trace...");
35 /* Bail out. Other tests are pointless. */
36 return -1;
37 }
38
39 if (!trace->read) {
40 printk(KERN_CONT "bts read not available...");
41 error = -1;
42 }
43
44 /* Do some sanity checks on the trace configuration. */
45 if (!trace->ds.n) {
46 printk(KERN_CONT "empty bts buffer...");
47 error = -1;
48 }
49 if (!trace->ds.size) {
50 printk(KERN_CONT "bad bts trace setup...");
51 error = -1;
52 }
53 if (trace->ds.end !=
54 (char *)trace->ds.begin + (trace->ds.n * trace->ds.size)) {
55 printk(KERN_CONT "bad bts buffer setup...");
56 error = -1;
57 }
58 /*
 59 * We allow top in [begin; end], since it's not clear when the
60 * overflow adjustment happens: after the increment or before the
61 * write.
62 */
63 if ((trace->ds.top < trace->ds.begin) ||
64 (trace->ds.end < trace->ds.top)) {
65 printk(KERN_CONT "bts top out of bounds...");
66 error = -1;
67 }
68
69 return error;
70}
71
72static int ds_selftest_bts_read(struct bts_tracer *tracer,
73 const struct bts_trace *trace,
74 const void *from, const void *to)
75{
76 const unsigned char *at;
77
78 /*
79 * Check a few things which do not belong to this test.
80 * They should be covered by other tests.
81 */
82 if (!trace)
83 return -1;
84
85 if (!trace->read)
86 return -1;
87
88 if (to < from)
89 return -1;
90
91 if (from < trace->ds.begin)
92 return -1;
93
94 if (trace->ds.end < to)
95 return -1;
96
97 if (!trace->ds.size)
98 return -1;
99
100 /* Now to the test itself. */
101 for (at = from; (void *)at < to; at += trace->ds.size) {
102 struct bts_struct bts;
103 unsigned long index;
104 int error;
105
106 if (((void *)at - trace->ds.begin) % trace->ds.size) {
107 printk(KERN_CONT
108 "read from non-integer index...");
109 return -1;
110 }
111 index = ((void *)at - trace->ds.begin) / trace->ds.size;
112
113 memset(&bts, 0, sizeof(bts));
114 error = trace->read(tracer, at, &bts);
115 if (error < 0) {
116 printk(KERN_CONT
117 "error reading bts trace at [%lu] (0x%p)...",
118 index, at);
119 return error;
120 }
121
122 switch (bts.qualifier) {
123 case BTS_BRANCH:
124 break;
125 default:
126 printk(KERN_CONT
127 "unexpected bts entry %llu at [%lu] (0x%p)...",
128 bts.qualifier, index, at);
129 return -1;
130 }
131 }
132
133 return 0;
134}
135
136static void ds_selftest_bts_cpu(void *arg)
137{
138 struct ds_selftest_bts_conf *conf = arg;
139 const struct bts_trace *trace;
140 void *top;
141
142 if (IS_ERR(conf->tracer)) {
143 conf->error = PTR_ERR(conf->tracer);
144 conf->tracer = NULL;
145
146 printk(KERN_CONT
147 "initialization failed (err: %d)...", conf->error);
148 return;
149 }
150
151 /* We should meanwhile have enough trace. */
152 conf->error = conf->suspend(conf->tracer);
153 if (conf->error < 0)
154 return;
155
156 /* Let's see if we can access the trace. */
157 trace = ds_read_bts(conf->tracer);
158
159 conf->error = ds_selftest_bts_consistency(trace);
160 if (conf->error < 0)
161 return;
162
163 /* If everything went well, we should have a few trace entries. */
164 if (trace->ds.top == trace->ds.begin) {
165 /*
166 * It is possible but highly unlikely that we got a
167 * buffer overflow and end up at exactly the same
168 * position we started from.
169 * Let's issue a warning, but continue.
170 */
171 printk(KERN_CONT "no trace/overflow...");
172 }
173
174 /* Let's try to read the trace we collected. */
175 conf->error =
176 ds_selftest_bts_read(conf->tracer, trace,
177 trace->ds.begin, trace->ds.top);
178 if (conf->error < 0)
179 return;
180
181 /*
182 * Let's read the trace again.
183 * Since we suspended tracing, we should get the same result.
184 */
185 top = trace->ds.top;
186
187 trace = ds_read_bts(conf->tracer);
188 conf->error = ds_selftest_bts_consistency(trace);
189 if (conf->error < 0)
190 return;
191
192 if (top != trace->ds.top) {
193 printk(KERN_CONT "suspend not working...");
194 conf->error = -1;
195 return;
196 }
197
198 /* Let's collect some more trace - see if resume is working. */
199 conf->error = conf->resume(conf->tracer);
200 if (conf->error < 0)
201 return;
202
203 conf->error = conf->suspend(conf->tracer);
204 if (conf->error < 0)
205 return;
206
207 trace = ds_read_bts(conf->tracer);
208
209 conf->error = ds_selftest_bts_consistency(trace);
210 if (conf->error < 0)
211 return;
212
213 if (trace->ds.top == top) {
214 /*
215 * It is possible but highly unlikely that we got a
216 * buffer overflow and end up at exactly the same
217 * position we started from.
218 * Let's issue a warning and check the full trace.
219 */
220 printk(KERN_CONT
221 "no resume progress/overflow...");
222
223 conf->error =
224 ds_selftest_bts_read(conf->tracer, trace,
225 trace->ds.begin, trace->ds.end);
226 } else if (trace->ds.top < top) {
227 /*
228 * We had a buffer overflow - the entire buffer should
229 * contain trace records.
230 */
231 conf->error =
232 ds_selftest_bts_read(conf->tracer, trace,
233 trace->ds.begin, trace->ds.end);
234 } else {
235 /*
236 * It is quite likely that the buffer did not overflow.
237 * Let's just check the delta trace.
238 */
239 conf->error =
240 ds_selftest_bts_read(conf->tracer, trace, top,
241 trace->ds.top);
242 }
243 if (conf->error < 0)
244 return;
245
246 conf->error = 0;
247}
248
249static int ds_suspend_bts_wrap(struct bts_tracer *tracer)
250{
251 ds_suspend_bts(tracer);
252 return 0;
253}
254
255static int ds_resume_bts_wrap(struct bts_tracer *tracer)
256{
257 ds_resume_bts(tracer);
258 return 0;
259}
260
261static void ds_release_bts_noirq_wrap(void *tracer)
262{
263 (void)ds_release_bts_noirq(tracer);
264}
265
266static int ds_selftest_bts_bad_release_noirq(int cpu,
267 struct bts_tracer *tracer)
268{
269 int error = -EPERM;
270
271 /* Try to release the tracer on the wrong cpu. */
272 get_cpu();
273 if (cpu != smp_processor_id()) {
274 error = ds_release_bts_noirq(tracer);
275 if (error != -EPERM)
276 printk(KERN_CONT "release on wrong cpu...");
277 }
278 put_cpu();
279
280 return error ? 0 : -1;
281}
282
283static int ds_selftest_bts_bad_request_cpu(int cpu, void *buffer)
284{
285 struct bts_tracer *tracer;
286 int error;
287
288 /* Try to request cpu tracing while task tracing is active. */
289 tracer = ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE, NULL,
290 (size_t)-1, BTS_KERNEL);
291 error = PTR_ERR(tracer);
292 if (!IS_ERR(tracer)) {
293 ds_release_bts(tracer);
294 error = 0;
295 }
296
297 if (error != -EPERM)
298 printk(KERN_CONT "cpu/task tracing overlap...");
299
300 return error ? 0 : -1;
301}
302
303static int ds_selftest_bts_bad_request_task(void *buffer)
304{
305 struct bts_tracer *tracer;
306 int error;
307
 308 /* Try to request task tracing while cpu tracing is active. */
309 tracer = ds_request_bts_task(current, buffer, BUFFER_SIZE, NULL,
310 (size_t)-1, BTS_KERNEL);
311 error = PTR_ERR(tracer);
312 if (!IS_ERR(tracer)) {
313 error = 0;
314 ds_release_bts(tracer);
315 }
316
317 if (error != -EPERM)
318 printk(KERN_CONT "task/cpu tracing overlap...");
319
320 return error ? 0 : -1;
321}
322
323int ds_selftest_bts(void)
324{
325 struct ds_selftest_bts_conf conf;
326 unsigned char buffer[BUFFER_SIZE], *small_buffer;
327 unsigned long irq;
328 int cpu;
329
330 printk(KERN_INFO "[ds] bts selftest...");
331 conf.error = 0;
332
333 small_buffer = (unsigned char *)ALIGN((unsigned long)buffer, 8) + 8;
334
335 get_online_cpus();
336 for_each_online_cpu(cpu) {
337 conf.suspend = ds_suspend_bts_wrap;
338 conf.resume = ds_resume_bts_wrap;
339 conf.tracer =
340 ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE,
341 NULL, (size_t)-1, BTS_KERNEL);
342 ds_selftest_bts_cpu(&conf);
343 if (conf.error >= 0)
344 conf.error = ds_selftest_bts_bad_request_task(buffer);
345 ds_release_bts(conf.tracer);
346 if (conf.error < 0)
347 goto out;
348
349 conf.suspend = ds_suspend_bts_noirq;
350 conf.resume = ds_resume_bts_noirq;
351 conf.tracer =
352 ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE,
353 NULL, (size_t)-1, BTS_KERNEL);
354 smp_call_function_single(cpu, ds_selftest_bts_cpu, &conf, 1);
355 if (conf.error >= 0) {
356 conf.error =
357 ds_selftest_bts_bad_release_noirq(cpu,
358 conf.tracer);
359 /* We must not release the tracer twice. */
360 if (conf.error < 0)
361 conf.tracer = NULL;
362 }
363 if (conf.error >= 0)
364 conf.error = ds_selftest_bts_bad_request_task(buffer);
365 smp_call_function_single(cpu, ds_release_bts_noirq_wrap,
366 conf.tracer, 1);
367 if (conf.error < 0)
368 goto out;
369 }
370
371 conf.suspend = ds_suspend_bts_wrap;
372 conf.resume = ds_resume_bts_wrap;
373 conf.tracer =
374 ds_request_bts_task(current, buffer, BUFFER_SIZE,
375 NULL, (size_t)-1, BTS_KERNEL);
376 ds_selftest_bts_cpu(&conf);
377 if (conf.error >= 0)
378 conf.error = ds_selftest_bts_bad_request_cpu(0, buffer);
379 ds_release_bts(conf.tracer);
380 if (conf.error < 0)
381 goto out;
382
383 conf.suspend = ds_suspend_bts_noirq;
384 conf.resume = ds_resume_bts_noirq;
385 conf.tracer =
386 ds_request_bts_task(current, small_buffer, SMALL_BUFFER_SIZE,
387 NULL, (size_t)-1, BTS_KERNEL);
388 local_irq_save(irq);
389 ds_selftest_bts_cpu(&conf);
390 if (conf.error >= 0)
391 conf.error = ds_selftest_bts_bad_request_cpu(0, buffer);
392 ds_release_bts_noirq(conf.tracer);
393 local_irq_restore(irq);
394 if (conf.error < 0)
395 goto out;
396
397 conf.error = 0;
398 out:
399 put_online_cpus();
400 printk(KERN_CONT "%s.\n", (conf.error ? "failed" : "passed"));
401
402 return conf.error;
403}
404
405int ds_selftest_pebs(void)
406{
407 return 0;
408}
diff --git a/arch/x86/kernel/ds_selftest.h b/arch/x86/kernel/ds_selftest.h
deleted file mode 100644
index 2ba8745c6663..000000000000
--- a/arch/x86/kernel/ds_selftest.h
+++ /dev/null
@@ -1,15 +0,0 @@
1/*
2 * Debug Store support - selftest
3 *
4 *
5 * Copyright (C) 2009 Intel Corporation.
6 * Markus Metzger <markus.t.metzger@intel.com>, 2009
7 */
8
9#ifdef CONFIG_X86_DS_SELFTEST
10extern int ds_selftest_bts(void);
11extern int ds_selftest_pebs(void);
12#else
13static inline int ds_selftest_bts(void) { return 0; }
14static inline int ds_selftest_pebs(void) { return 0; }
15#endif
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 6d817554780a..6e8752c1bd52 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -18,7 +18,6 @@
18 18
19#include <asm/stacktrace.h> 19#include <asm/stacktrace.h>
20 20
21#include "dumpstack.h"
22 21
23int panic_on_unrecovered_nmi; 22int panic_on_unrecovered_nmi;
24int panic_on_io_nmi; 23int panic_on_io_nmi;
@@ -224,11 +223,6 @@ unsigned __kprobes long oops_begin(void)
224 int cpu; 223 int cpu;
225 unsigned long flags; 224 unsigned long flags;
226 225
227 /* notify the hw-branch tracer so it may disable tracing and
228 add the last trace to the trace buffer -
229 the earlier this happens, the more useful the trace. */
230 trace_hw_branch_oops();
231
232 oops_enter(); 226 oops_enter();
233 227
234 /* racy, but better than risking deadlock. */ 228 /* racy, but better than risking deadlock. */
diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h
deleted file mode 100644
index e1a93be4fd44..000000000000
--- a/arch/x86/kernel/dumpstack.h
+++ /dev/null
@@ -1,56 +0,0 @@
1/*
2 * Copyright (C) 1991, 1992 Linus Torvalds
3 * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
4 */
5
6#ifndef DUMPSTACK_H
7#define DUMPSTACK_H
8
9#ifdef CONFIG_X86_32
10#define STACKSLOTS_PER_LINE 8
11#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :)
12#else
13#define STACKSLOTS_PER_LINE 4
14#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
15#endif
16
17#include <linux/uaccess.h>
18
19extern void
20show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
21 unsigned long *stack, unsigned long bp, char *log_lvl);
22
23extern void
24show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
25 unsigned long *sp, unsigned long bp, char *log_lvl);
26
27extern unsigned int code_bytes;
28
29/* The form of the top of the frame on the stack */
30struct stack_frame {
31 struct stack_frame *next_frame;
32 unsigned long return_address;
33};
34
35struct stack_frame_ia32 {
36 u32 next_frame;
37 u32 return_address;
38};
39
40static inline unsigned long rewind_frame_pointer(int n)
41{
42 struct stack_frame *frame;
43
44 get_bp(frame);
45
46#ifdef CONFIG_FRAME_POINTER
47 while (n--) {
48 if (probe_kernel_address(&frame->next_frame, frame))
49 break;
50 }
51#endif
52
53 return (unsigned long)frame;
54}
55
56#endif /* DUMPSTACK_H */
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index 11540a189d93..0f6376ffa2d9 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -16,8 +16,6 @@
16 16
17#include <asm/stacktrace.h> 17#include <asm/stacktrace.h>
18 18
19#include "dumpstack.h"
20
21 19
22void dump_trace(struct task_struct *task, struct pt_regs *regs, 20void dump_trace(struct task_struct *task, struct pt_regs *regs,
23 unsigned long *stack, unsigned long bp, 21 unsigned long *stack, unsigned long bp,
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 272c9f1f05f3..57a21f11c791 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -16,7 +16,6 @@
16 16
17#include <asm/stacktrace.h> 17#include <asm/stacktrace.h>
18 18
19#include "dumpstack.h"
20 19
21#define N_EXCEPTION_STACKS_END \ 20#define N_EXCEPTION_STACKS_END \
22 (N_EXCEPTION_STACKS + DEBUG_STKSZ/EXCEPTION_STKSZ - 2) 21 (N_EXCEPTION_STACKS + DEBUG_STKSZ/EXCEPTION_STKSZ - 2)
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 7bca3c6a02fb..0d6fc71bedb1 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -729,7 +729,7 @@ static int __init e820_mark_nvs_memory(void)
729 struct e820entry *ei = &e820.map[i]; 729 struct e820entry *ei = &e820.map[i];
730 730
731 if (ei->type == E820_NVS) 731 if (ei->type == E820_NVS)
732 hibernate_nvs_register(ei->addr, ei->size); 732 suspend_nvs_register(ei->addr, ei->size);
733 } 733 }
734 734
735 return 0; 735 return 0;
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index ebdb85cf2686..e5cc7e82e60d 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -18,6 +18,7 @@
18#include <asm/apic.h> 18#include <asm/apic.h>
19#include <asm/iommu.h> 19#include <asm/iommu.h>
20#include <asm/gart.h> 20#include <asm/gart.h>
21#include <asm/hpet.h>
21 22
22static void __init fix_hypertransport_config(int num, int slot, int func) 23static void __init fix_hypertransport_config(int num, int slot, int func)
23{ 24{
@@ -191,6 +192,21 @@ static void __init ati_bugs_contd(int num, int slot, int func)
191} 192}
192#endif 193#endif
193 194
195/*
196 * Force the read back of the CMP register in hpet_next_event()
197 * to work around the problem that the CMP register write seems to be
198 * delayed. See hpet_next_event() for details.
199 *
200 * We do this on all SMBUS incarnations for now until we have more
201 * information about the affected chipsets.
202 */
203static void __init ati_hpet_bugs(int num, int slot, int func)
204{
205#ifdef CONFIG_HPET_TIMER
206 hpet_readback_cmp = 1;
207#endif
208}
209
194#define QFLAG_APPLY_ONCE 0x1 210#define QFLAG_APPLY_ONCE 0x1
195#define QFLAG_APPLIED 0x2 211#define QFLAG_APPLIED 0x2
196#define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED) 212#define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED)
@@ -220,6 +236,8 @@ static struct chipset early_qrk[] __initdata = {
220 PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs }, 236 PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs },
221 { PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS, 237 { PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS,
222 PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs_contd }, 238 PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs_contd },
239 { PCI_VENDOR_ID_ATI, PCI_ANY_ID,
240 PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_hpet_bugs },
223 {} 241 {}
224}; 242};
225 243
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index b9c830c12b4a..fa99bae75ace 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -41,6 +41,14 @@ static void early_vga_write(struct console *con, const char *str, unsigned n)
41 writew(0x720, VGABASE + 2*(max_xpos*j + i)); 41 writew(0x720, VGABASE + 2*(max_xpos*j + i));
42 current_ypos = max_ypos-1; 42 current_ypos = max_ypos-1;
43 } 43 }
44#ifdef CONFIG_KGDB_KDB
45 if (c == '\b') {
46 if (current_xpos > 0)
47 current_xpos--;
48 } else if (c == '\r') {
49 current_xpos = 0;
50 } else
51#endif
44 if (c == '\n') { 52 if (c == '\n') {
45 current_xpos = 0; 53 current_xpos = 0;
46 current_ypos++; 54 current_ypos++;
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 44a8e0dc6737..227d00920d2f 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -53,6 +53,7 @@
53#include <asm/processor-flags.h> 53#include <asm/processor-flags.h>
54#include <asm/ftrace.h> 54#include <asm/ftrace.h>
55#include <asm/irq_vectors.h> 55#include <asm/irq_vectors.h>
56#include <asm/cpufeature.h>
56 57
57/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ 58/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
58#include <linux/elf-em.h> 59#include <linux/elf-em.h>
@@ -610,14 +611,14 @@ ldt_ss:
610 * compensating for the offset by changing to the ESPFIX segment with 611 * compensating for the offset by changing to the ESPFIX segment with
611 * a base address that matches for the difference. 612 * a base address that matches for the difference.
612 */ 613 */
614#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8)
613 mov %esp, %edx /* load kernel esp */ 615 mov %esp, %edx /* load kernel esp */
614 mov PT_OLDESP(%esp), %eax /* load userspace esp */ 616 mov PT_OLDESP(%esp), %eax /* load userspace esp */
615 mov %dx, %ax /* eax: new kernel esp */ 617 mov %dx, %ax /* eax: new kernel esp */
616 sub %eax, %edx /* offset (low word is 0) */ 618 sub %eax, %edx /* offset (low word is 0) */
617 PER_CPU(gdt_page, %ebx)
618 shr $16, %edx 619 shr $16, %edx
619 mov %dl, GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx) /* bits 16..23 */ 620 mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */
620 mov %dh, GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx) /* bits 24..31 */ 621 mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */
621 pushl $__ESPFIX_SS 622 pushl $__ESPFIX_SS
622 CFI_ADJUST_CFA_OFFSET 4 623 CFI_ADJUST_CFA_OFFSET 4
623 push %eax /* new kernel esp */ 624 push %eax /* new kernel esp */
@@ -790,9 +791,8 @@ ptregs_clone:
790 * normal stack and adjusts ESP with the matching offset. 791 * normal stack and adjusts ESP with the matching offset.
791 */ 792 */
792 /* fixup the stack */ 793 /* fixup the stack */
793 PER_CPU(gdt_page, %ebx) 794 mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */
794 mov GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx), %al /* bits 16..23 */ 795 mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */
795 mov GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx), %ah /* bits 24..31 */
796 shl $16, %eax 796 shl $16, %eax
797 addl %esp, %eax /* the adjusted stack pointer */ 797 addl %esp, %eax /* the adjusted stack pointer */
798 pushl $__KERNEL_DS 798 pushl $__KERNEL_DS
@@ -905,7 +905,25 @@ ENTRY(simd_coprocessor_error)
905 RING0_INT_FRAME 905 RING0_INT_FRAME
906 pushl $0 906 pushl $0
907 CFI_ADJUST_CFA_OFFSET 4 907 CFI_ADJUST_CFA_OFFSET 4
908#ifdef CONFIG_X86_INVD_BUG
909 /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */
910661: pushl $do_general_protection
911662:
912.section .altinstructions,"a"
913 .balign 4
914 .long 661b
915 .long 663f
916 .word X86_FEATURE_XMM
917 .byte 662b-661b
918 .byte 664f-663f
919.previous
920.section .altinstr_replacement,"ax"
921663: pushl $do_simd_coprocessor_error
922664:
923.previous
924#else
908 pushl $do_simd_coprocessor_error 925 pushl $do_simd_coprocessor_error
926#endif
909 CFI_ADJUST_CFA_OFFSET 4 927 CFI_ADJUST_CFA_OFFSET 4
910 jmp error_code 928 jmp error_code
911 CFI_ENDPROC 929 CFI_ENDPROC
@@ -1147,6 +1165,9 @@ ENTRY(xen_failsafe_callback)
1147.previous 1165.previous
1148ENDPROC(xen_failsafe_callback) 1166ENDPROC(xen_failsafe_callback)
1149 1167
1168BUILD_INTERRUPT3(xen_hvm_callback_vector, XEN_HVM_EVTCHN_CALLBACK,
1169 xen_evtchn_do_upcall)
1170
1150#endif /* CONFIG_XEN */ 1171#endif /* CONFIG_XEN */
1151 1172
1152#ifdef CONFIG_FUNCTION_TRACER 1173#ifdef CONFIG_FUNCTION_TRACER
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 0697ff139837..c5ea5cdbe7b3 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -571,8 +571,8 @@ auditsys:
571 * masked off. 571 * masked off.
572 */ 572 */
573sysret_audit: 573sysret_audit:
574 movq %rax,%rsi /* second arg, syscall return value */ 574 movq RAX-ARGOFFSET(%rsp),%rsi /* second arg, syscall return value */
575 cmpq $0,%rax /* is it < 0? */ 575 cmpq $0,%rsi /* is it < 0? */
576 setl %al /* 1 if so, 0 if not */ 576 setl %al /* 1 if so, 0 if not */
577 movzbl %al,%edi /* zero-extend that into %edi */ 577 movzbl %al,%edi /* zero-extend that into %edi */
578 inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ 578 inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
@@ -1065,6 +1065,7 @@ ENTRY(\sym)
1065END(\sym) 1065END(\sym)
1066.endm 1066.endm
1067 1067
1068#define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8)
1068.macro paranoidzeroentry_ist sym do_sym ist 1069.macro paranoidzeroentry_ist sym do_sym ist
1069ENTRY(\sym) 1070ENTRY(\sym)
1070 INTR_FRAME 1071 INTR_FRAME
@@ -1076,10 +1077,9 @@ ENTRY(\sym)
1076 TRACE_IRQS_OFF 1077 TRACE_IRQS_OFF
1077 movq %rsp,%rdi /* pt_regs pointer */ 1078 movq %rsp,%rdi /* pt_regs pointer */
1078 xorl %esi,%esi /* no error code */ 1079 xorl %esi,%esi /* no error code */
1079 PER_CPU(init_tss, %r12) 1080 subq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist)
1080 subq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%r12)
1081 call \do_sym 1081 call \do_sym
1082 addq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%r12) 1082 addq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist)
1083 jmp paranoid_exit /* %ebx: no swapgs flag */ 1083 jmp paranoid_exit /* %ebx: no swapgs flag */
1084 CFI_ENDPROC 1084 CFI_ENDPROC
1085END(\sym) 1085END(\sym)
@@ -1329,6 +1329,9 @@ ENTRY(xen_failsafe_callback)
1329 CFI_ENDPROC 1329 CFI_ENDPROC
1330END(xen_failsafe_callback) 1330END(xen_failsafe_callback)
1331 1331
1332apicinterrupt XEN_HVM_EVTCHN_CALLBACK \
1333 xen_hvm_callback_vector xen_evtchn_do_upcall
1334
1332#endif /* CONFIG_XEN */ 1335#endif /* CONFIG_XEN */
1333 1336
1334/* 1337/*
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index b2e246037392..784360c0625c 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -20,7 +20,7 @@
20 20
21static void __init i386_default_early_setup(void) 21static void __init i386_default_early_setup(void)
22{ 22{
23 /* Initilize 32bit specific setup functions */ 23 /* Initialize 32bit specific setup functions */
24 x86_init.resources.probe_roms = probe_roms; 24 x86_init.resources.probe_roms = probe_roms;
25 x86_init.resources.reserve_resources = i386_reserve_resources; 25 x86_init.resources.reserve_resources = i386_reserve_resources;
26 x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc; 26 x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc;
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 37c3d4b17d85..ff4c453e13f3 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -131,6 +131,12 @@ ENTRY(startup_32)
131 movsl 131 movsl
1321: 1321:
133 133
134#ifdef CONFIG_OLPC_OPENFIRMWARE
135 /* save OFW's pgdir table for later use when calling into OFW */
136 movl %cr3, %eax
137 movl %eax, pa(olpc_ofw_pgd)
138#endif
139
134#ifdef CONFIG_PARAVIRT 140#ifdef CONFIG_PARAVIRT
 135 /* This can only trip for a broken bootloader... */ 141 /* This can only trip for a broken bootloader... */
136 cmpw $0x207, pa(boot_params + BP_version) 142 cmpw $0x207, pa(boot_params + BP_version)
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 3d1e6f16b7a6..239046bd447f 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -234,9 +234,8 @@ ENTRY(secondary_startup_64)
234 * init data section till per cpu areas are set up. 234 * init data section till per cpu areas are set up.
235 */ 235 */
236 movl $MSR_GS_BASE,%ecx 236 movl $MSR_GS_BASE,%ecx
237 movq initial_gs(%rip),%rax 237 movl initial_gs(%rip),%eax
238 movq %rax,%rdx 238 movl initial_gs+4(%rip),%edx
239 shrq $32,%rdx
240 wrmsr 239 wrmsr
241 240
242 /* esi is pointer to real mode structure with interesting info. 241 /* esi is pointer to real mode structure with interesting info.
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 23b4ecdffa9b..33dbcc4ec5ff 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -16,7 +16,6 @@
16#include <asm/hpet.h> 16#include <asm/hpet.h>
17 17
18#define HPET_MASK CLOCKSOURCE_MASK(32) 18#define HPET_MASK CLOCKSOURCE_MASK(32)
19#define HPET_SHIFT 22
20 19
21/* FSEC = 10^-15 20/* FSEC = 10^-15
22 NSEC = 10^-9 */ 21 NSEC = 10^-9 */
@@ -36,6 +35,7 @@
36unsigned long hpet_address; 35unsigned long hpet_address;
37u8 hpet_blockid; /* OS timer block num */ 36u8 hpet_blockid; /* OS timer block num */
38u8 hpet_msi_disable; 37u8 hpet_msi_disable;
38u8 hpet_readback_cmp;
39 39
40#ifdef CONFIG_PCI_MSI 40#ifdef CONFIG_PCI_MSI
41static unsigned long hpet_num_timers; 41static unsigned long hpet_num_timers;
@@ -395,19 +395,23 @@ static int hpet_next_event(unsigned long delta,
395 * at that point and we would wait for the next hpet interrupt 395 * at that point and we would wait for the next hpet interrupt
396 * forever. We found out that reading the CMP register back 396 * forever. We found out that reading the CMP register back
397 * forces the transfer so we can rely on the comparison with 397 * forces the transfer so we can rely on the comparison with
398 * the counter register below. If the read back from the 398 * the counter register below.
399 * compare register does not match the value we programmed 399 *
400 * then we might have a real hardware problem. We can not do 400 * That works fine on those ATI chipsets, but on newer Intel
401 * much about it here, but at least alert the user/admin with 401 * chipsets (ICH9...) this triggers due to an erratum: Reading
402 * a prominent warning. 402 * the comparator immediately following a write is returning
403 * An erratum on some chipsets (ICH9,..), results in comparator read 403 * the old value.
404 * immediately following a write returning old value. Workaround 404 *
405 * for this is to read this value second time, when first 405 * We restrict the read back to the affected ATI chipsets (set
406 * read returns old value. 406 * by quirks) and also run it with hpet=verbose for debugging
407 * purposes.
407 */ 408 */
408 if (unlikely((u32)hpet_readl(HPET_Tn_CMP(timer)) != cnt)) { 409 if (hpet_readback_cmp || hpet_verbose) {
409 WARN_ONCE(hpet_readl(HPET_Tn_CMP(timer)) != cnt, 410 u32 cmp = hpet_readl(HPET_Tn_CMP(timer));
410 KERN_WARNING "hpet: compare register read back failed.\n"); 411
412 if (cmp != cnt)
413 printk_once(KERN_WARNING
414 "hpet: compare register read back failed.\n");
411 } 415 }
412 416
413 return (s32)(hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0; 417 return (s32)(hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0;
@@ -782,7 +786,6 @@ static struct clocksource clocksource_hpet = {
782 .rating = 250, 786 .rating = 250,
783 .read = read_hpet, 787 .read = read_hpet,
784 .mask = HPET_MASK, 788 .mask = HPET_MASK,
785 .shift = HPET_SHIFT,
786 .flags = CLOCK_SOURCE_IS_CONTINUOUS, 789 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
787 .resume = hpet_resume_counter, 790 .resume = hpet_resume_counter,
788#ifdef CONFIG_X86_64 791#ifdef CONFIG_X86_64
@@ -793,6 +796,7 @@ static struct clocksource clocksource_hpet = {
793static int hpet_clocksource_register(void) 796static int hpet_clocksource_register(void)
794{ 797{
795 u64 start, now; 798 u64 start, now;
799 u64 hpet_freq;
796 cycle_t t1; 800 cycle_t t1;
797 801
798 /* Start the counter */ 802 /* Start the counter */
@@ -827,9 +831,15 @@ static int hpet_clocksource_register(void)
827 * mult = (hpet_period * 2^shift)/10^6 831 * mult = (hpet_period * 2^shift)/10^6
828 * mult = (hpet_period << shift)/FSEC_PER_NSEC 832 * mult = (hpet_period << shift)/FSEC_PER_NSEC
829 */ 833 */
830 clocksource_hpet.mult = div_sc(hpet_period, FSEC_PER_NSEC, HPET_SHIFT);
831 834
832 clocksource_register(&clocksource_hpet); 835 /* Need to convert hpet_period (fsec/cyc) to cyc/sec:
836 *
837 * cyc/sec = FSEC_PER_SEC/hpet_period(fsec/cyc)
838 * cyc/sec = (FSEC_PER_NSEC * NSEC_PER_SEC)/hpet_period
839 */
840 hpet_freq = FSEC_PER_NSEC * NSEC_PER_SEC;
841 do_div(hpet_freq, hpet_period);
842 clocksource_register_hz(&clocksource_hpet, (u32)hpet_freq);
833 843
834 return 0; 844 return 0;
835} 845}
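The replacement computes the clocksource frequency directly from the HPET period instead of hard-coding a shift. A quick sanity check with a typical period value (illustrative numbers, not from this patch):

/*
 * A common HPET runs at 14.31818 MHz, which the hardware reports as a
 * period of roughly 69841279 fs per tick:
 *
 *   hpet_freq = FSEC_PER_NSEC * NSEC_PER_SEC = 10^6 * 10^9 = 10^15 fs/s
 *   hpet_freq / hpet_period = 10^15 / 69841279 ~= 14318180 Hz
 *
 * clocksource_register_hz() then derives mult/shift from that frequency,
 * replacing the fixed HPET_SHIFT/div_sc() computation removed above.
 */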
@@ -959,7 +969,7 @@ fs_initcall(hpet_late_init);
959 969
960void hpet_disable(void) 970void hpet_disable(void)
961{ 971{
962 if (is_hpet_capable()) { 972 if (is_hpet_capable() && hpet_virt_address) {
963 unsigned int cfg = hpet_readl(HPET_CFG); 973 unsigned int cfg = hpet_readl(HPET_CFG);
964 974
965 if (hpet_legacy_int_enabled) { 975 if (hpet_legacy_int_enabled) {
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index d6cc065f519f..a474ec37c32f 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -189,25 +189,16 @@ static int get_hbp_len(u8 hbp_len)
189} 189}
190 190
191/* 191/*
192 * Check for virtual address in user space.
193 */
194int arch_check_va_in_userspace(unsigned long va, u8 hbp_len)
195{
196 unsigned int len;
197
198 len = get_hbp_len(hbp_len);
199
200 return (va <= TASK_SIZE - len);
201}
202
203/*
204 * Check for virtual address in kernel space. 192 * Check for virtual address in kernel space.
205 */ 193 */
206static int arch_check_va_in_kernelspace(unsigned long va, u8 hbp_len) 194int arch_check_bp_in_kernelspace(struct perf_event *bp)
207{ 195{
208 unsigned int len; 196 unsigned int len;
197 unsigned long va;
198 struct arch_hw_breakpoint *info = counter_arch_bp(bp);
209 199
210 len = get_hbp_len(hbp_len); 200 va = info->address;
201 len = get_hbp_len(info->len);
211 202
212 return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE); 203 return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE);
213} 204}
@@ -217,6 +208,9 @@ int arch_bp_generic_fields(int x86_len, int x86_type,
217{ 208{
218 /* Len */ 209 /* Len */
219 switch (x86_len) { 210 switch (x86_len) {
211 case X86_BREAKPOINT_LEN_X:
212 *gen_len = sizeof(long);
213 break;
220 case X86_BREAKPOINT_LEN_1: 214 case X86_BREAKPOINT_LEN_1:
221 *gen_len = HW_BREAKPOINT_LEN_1; 215 *gen_len = HW_BREAKPOINT_LEN_1;
222 break; 216 break;
@@ -260,6 +254,29 @@ static int arch_build_bp_info(struct perf_event *bp)
260 254
261 info->address = bp->attr.bp_addr; 255 info->address = bp->attr.bp_addr;
262 256
257 /* Type */
258 switch (bp->attr.bp_type) {
259 case HW_BREAKPOINT_W:
260 info->type = X86_BREAKPOINT_WRITE;
261 break;
262 case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
263 info->type = X86_BREAKPOINT_RW;
264 break;
265 case HW_BREAKPOINT_X:
266 info->type = X86_BREAKPOINT_EXECUTE;
267 /*
268 * x86 inst breakpoints need to have a specific undefined len.
269 * But we still need to check userspace is not trying to setup
270 * an unsupported length, to get a range breakpoint for example.
271 */
272 if (bp->attr.bp_len == sizeof(long)) {
273 info->len = X86_BREAKPOINT_LEN_X;
274 return 0;
275 }
276 default:
277 return -EINVAL;
278 }
279
263 /* Len */ 280 /* Len */
264 switch (bp->attr.bp_len) { 281 switch (bp->attr.bp_len) {
265 case HW_BREAKPOINT_LEN_1: 282 case HW_BREAKPOINT_LEN_1:
@@ -280,28 +297,12 @@ static int arch_build_bp_info(struct perf_event *bp)
280 return -EINVAL; 297 return -EINVAL;
281 } 298 }
282 299
283 /* Type */
284 switch (bp->attr.bp_type) {
285 case HW_BREAKPOINT_W:
286 info->type = X86_BREAKPOINT_WRITE;
287 break;
288 case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
289 info->type = X86_BREAKPOINT_RW;
290 break;
291 case HW_BREAKPOINT_X:
292 info->type = X86_BREAKPOINT_EXECUTE;
293 break;
294 default:
295 return -EINVAL;
296 }
297
298 return 0; 300 return 0;
299} 301}
300/* 302/*
301 * Validate the arch-specific HW Breakpoint register settings 303 * Validate the arch-specific HW Breakpoint register settings
302 */ 304 */
303int arch_validate_hwbkpt_settings(struct perf_event *bp, 305int arch_validate_hwbkpt_settings(struct perf_event *bp)
304 struct task_struct *tsk)
305{ 306{
306 struct arch_hw_breakpoint *info = counter_arch_bp(bp); 307 struct arch_hw_breakpoint *info = counter_arch_bp(bp);
307 unsigned int align; 308 unsigned int align;
@@ -314,17 +315,10 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp,
314 315
315 ret = -EINVAL; 316 ret = -EINVAL;
316 317
317 if (info->type == X86_BREAKPOINT_EXECUTE)
318 /*
319 * Ptrace-refactoring code
320 * For now, we'll allow instruction breakpoint only for user-space
321 * addresses
322 */
323 if ((!arch_check_va_in_userspace(info->address, info->len)) &&
324 info->len != X86_BREAKPOINT_EXECUTE)
325 return ret;
326
327 switch (info->len) { 318 switch (info->len) {
319 case X86_BREAKPOINT_LEN_X:
320 align = sizeof(long) -1;
321 break;
328 case X86_BREAKPOINT_LEN_1: 322 case X86_BREAKPOINT_LEN_1:
329 align = 0; 323 align = 0;
330 break; 324 break;
@@ -350,15 +344,6 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp,
350 if (info->address & align) 344 if (info->address & align)
351 return -EINVAL; 345 return -EINVAL;
352 346
353 /* Check that the virtual address is in the proper range */
354 if (tsk) {
355 if (!arch_check_va_in_userspace(info->address, info->len))
356 return -EFAULT;
357 } else {
358 if (!arch_check_va_in_kernelspace(info->address, info->len))
359 return -EFAULT;
360 }
361
362 return 0; 347 return 0;
363} 348}
364 349
@@ -495,6 +480,13 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args)
495 480
496 perf_bp_event(bp, args->regs); 481 perf_bp_event(bp, args->regs);
497 482
483 /*
484 * Set up resume flag to avoid breakpoint recursion when
 485 * returning to the origin.
486 */
487 if (bp->hw.info.type == X86_BREAKPOINT_EXECUTE)
488 args->regs->flags |= X86_EFLAGS_RF;
489
498 rcu_read_unlock(); 490 rcu_read_unlock();
499 } 491 }
500 /* 492 /*
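With the reordered type/length checks above, an execute breakpoint is requested with the otherwise unused length sizeof(long), which arch_build_bp_info() maps to X86_BREAKPOINT_LEN_X. A hedged sketch of the perf attribute a caller would fill in (the watched symbol is a placeholder, not from this patch):

	struct perf_event_attr attr = {
		.type    = PERF_TYPE_BREAKPOINT,
		.bp_type = HW_BREAKPOINT_X,
		.bp_addr = (unsigned long)some_kernel_function, /* placeholder */
		.bp_len  = sizeof(long), /* becomes X86_BREAKPOINT_LEN_X */
	};

The attribute is then handed to the usual breakpoint registration helpers; any other bp_len for an execute breakpoint is rejected with -EINVAL by the switch above.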
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 54c31c285488..1f11f5ce668f 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -59,18 +59,18 @@ void __cpuinit mxcsr_feature_mask_init(void)
59 stts(); 59 stts();
60} 60}
61 61
62void __cpuinit init_thread_xstate(void) 62static void __cpuinit init_thread_xstate(void)
63{ 63{
64 /*
 65 * Note that xstate_size might be overwritten later during
66 * xsave_init().
67 */
68
64 if (!HAVE_HWFP) { 69 if (!HAVE_HWFP) {
65 xstate_size = sizeof(struct i387_soft_struct); 70 xstate_size = sizeof(struct i387_soft_struct);
66 return; 71 return;
67 } 72 }
68 73
69 if (cpu_has_xsave) {
70 xsave_cntxt_init();
71 return;
72 }
73
74 if (cpu_has_fxsr) 74 if (cpu_has_fxsr)
75 xstate_size = sizeof(struct i387_fxsave_struct); 75 xstate_size = sizeof(struct i387_fxsave_struct);
76#ifdef CONFIG_X86_32 76#ifdef CONFIG_X86_32
@@ -84,6 +84,7 @@ void __cpuinit init_thread_xstate(void)
84 * Called at bootup to set up the initial FPU state that is later cloned 84 * Called at bootup to set up the initial FPU state that is later cloned
85 * into all processes. 85 * into all processes.
86 */ 86 */
87
87void __cpuinit fpu_init(void) 88void __cpuinit fpu_init(void)
88{ 89{
89 unsigned long oldcr0 = read_cr0(); 90 unsigned long oldcr0 = read_cr0();
@@ -93,74 +94,77 @@ void __cpuinit fpu_init(void)
93 94
94 write_cr0(oldcr0 & ~(X86_CR0_TS|X86_CR0_EM)); /* clear TS and EM */ 95 write_cr0(oldcr0 & ~(X86_CR0_TS|X86_CR0_EM)); /* clear TS and EM */
95 96
96 /*
97 * Boot processor to setup the FP and extended state context info.
98 */
99 if (!smp_processor_id()) 97 if (!smp_processor_id())
100 init_thread_xstate(); 98 init_thread_xstate();
101 xsave_init();
102 99
103 mxcsr_feature_mask_init(); 100 mxcsr_feature_mask_init();
104 /* clean state in init */ 101 /* clean state in init */
105 if (cpu_has_xsave) 102 current_thread_info()->status = 0;
106 current_thread_info()->status = TS_XSAVE;
107 else
108 current_thread_info()->status = 0;
109 clear_used_math(); 103 clear_used_math();
110} 104}
111#endif /* CONFIG_X86_64 */
112 105
113/* 106#else /* CONFIG_X86_64 */
114 * The _current_ task is using the FPU for the first time 107
115 * so initialize it and set the mxcsr to its default 108void __cpuinit fpu_init(void)
116 * value at reset if we support XMM instructions and then
 117 * remember the current task has used the FPU.
118 */
119int init_fpu(struct task_struct *tsk)
120{ 109{
121 if (tsk_used_math(tsk)) { 110 if (!smp_processor_id())
122 if (HAVE_HWFP && tsk == current) 111 init_thread_xstate();
123 unlazy_fpu(tsk); 112}
124 return 0;
125 }
126 113
127 /* 114#endif /* CONFIG_X86_32 */
128 * Memory allocation at the first usage of the FPU and other state.
129 */
130 if (!tsk->thread.xstate) {
131 tsk->thread.xstate = kmem_cache_alloc(task_xstate_cachep,
132 GFP_KERNEL);
133 if (!tsk->thread.xstate)
134 return -ENOMEM;
135 }
136 115
116void fpu_finit(struct fpu *fpu)
117{
137#ifdef CONFIG_X86_32 118#ifdef CONFIG_X86_32
138 if (!HAVE_HWFP) { 119 if (!HAVE_HWFP) {
139 memset(tsk->thread.xstate, 0, xstate_size); 120 finit_soft_fpu(&fpu->state->soft);
140 finit_task(tsk); 121 return;
141 set_stopped_child_used_math(tsk);
142 return 0;
143 } 122 }
144#endif 123#endif
145 124
146 if (cpu_has_fxsr) { 125 if (cpu_has_fxsr) {
147 struct i387_fxsave_struct *fx = &tsk->thread.xstate->fxsave; 126 struct i387_fxsave_struct *fx = &fpu->state->fxsave;
148 127
149 memset(fx, 0, xstate_size); 128 memset(fx, 0, xstate_size);
150 fx->cwd = 0x37f; 129 fx->cwd = 0x37f;
151 if (cpu_has_xmm) 130 if (cpu_has_xmm)
152 fx->mxcsr = MXCSR_DEFAULT; 131 fx->mxcsr = MXCSR_DEFAULT;
153 } else { 132 } else {
154 struct i387_fsave_struct *fp = &tsk->thread.xstate->fsave; 133 struct i387_fsave_struct *fp = &fpu->state->fsave;
155 memset(fp, 0, xstate_size); 134 memset(fp, 0, xstate_size);
156 fp->cwd = 0xffff037fu; 135 fp->cwd = 0xffff037fu;
157 fp->swd = 0xffff0000u; 136 fp->swd = 0xffff0000u;
158 fp->twd = 0xffffffffu; 137 fp->twd = 0xffffffffu;
159 fp->fos = 0xffff0000u; 138 fp->fos = 0xffff0000u;
160 } 139 }
140}
141EXPORT_SYMBOL_GPL(fpu_finit);
142
143/*
144 * The _current_ task is using the FPU for the first time
145 * so initialize it and set the mxcsr to its default
146 * value at reset if we support XMM instructions and then
147 * remeber the current task has used the FPU.
148 */
149int init_fpu(struct task_struct *tsk)
150{
151 int ret;
152
153 if (tsk_used_math(tsk)) {
154 if (HAVE_HWFP && tsk == current)
155 unlazy_fpu(tsk);
156 return 0;
157 }
158
161 /* 159 /*
162 * Only the device not available exception or ptrace can call init_fpu. 160 * Memory allocation at the first usage of the FPU and other state.
163 */ 161 */
162 ret = fpu_alloc(&tsk->thread.fpu);
163 if (ret)
164 return ret;
165
166 fpu_finit(&tsk->thread.fpu);
167
164 set_stopped_child_used_math(tsk); 168 set_stopped_child_used_math(tsk);
165 return 0; 169 return 0;
166} 170}
@@ -193,8 +197,10 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset,
193 if (ret) 197 if (ret)
194 return ret; 198 return ret;
195 199
200 sanitize_i387_state(target);
201
196 return user_regset_copyout(&pos, &count, &kbuf, &ubuf, 202 return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
197 &target->thread.xstate->fxsave, 0, -1); 203 &target->thread.fpu.state->fxsave, 0, -1);
198} 204}
199 205
200int xfpregs_set(struct task_struct *target, const struct user_regset *regset, 206int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
@@ -210,20 +216,22 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
210 if (ret) 216 if (ret)
211 return ret; 217 return ret;
212 218
219 sanitize_i387_state(target);
220
213 ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, 221 ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
214 &target->thread.xstate->fxsave, 0, -1); 222 &target->thread.fpu.state->fxsave, 0, -1);
215 223
216 /* 224 /*
217 * mxcsr reserved bits must be masked to zero for security reasons. 225 * mxcsr reserved bits must be masked to zero for security reasons.
218 */ 226 */
219 target->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask; 227 target->thread.fpu.state->fxsave.mxcsr &= mxcsr_feature_mask;
220 228
221 /* 229 /*
222 * update the header bits in the xsave header, indicating the 230 * update the header bits in the xsave header, indicating the
223 * presence of FP and SSE state. 231 * presence of FP and SSE state.
224 */ 232 */
225 if (cpu_has_xsave) 233 if (cpu_has_xsave)
226 target->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE; 234 target->thread.fpu.state->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE;
227 235
228 return ret; 236 return ret;
229} 237}
@@ -246,14 +254,14 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset,
246 * memory layout in the thread struct, so that we can copy the entire 254 * memory layout in the thread struct, so that we can copy the entire
247 * xstateregs to the user using one user_regset_copyout(). 255 * xstateregs to the user using one user_regset_copyout().
248 */ 256 */
249 memcpy(&target->thread.xstate->fxsave.sw_reserved, 257 memcpy(&target->thread.fpu.state->fxsave.sw_reserved,
250 xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes)); 258 xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes));
251 259
252 /* 260 /*
253 * Copy the xstate memory layout. 261 * Copy the xstate memory layout.
254 */ 262 */
255 ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, 263 ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
256 &target->thread.xstate->xsave, 0, -1); 264 &target->thread.fpu.state->xsave, 0, -1);
257 return ret; 265 return ret;
258} 266}
259 267
@@ -272,14 +280,14 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
272 return ret; 280 return ret;
273 281
274 ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, 282 ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
275 &target->thread.xstate->xsave, 0, -1); 283 &target->thread.fpu.state->xsave, 0, -1);
276 284
277 /* 285 /*
278 * mxcsr reserved bits must be masked to zero for security reasons. 286 * mxcsr reserved bits must be masked to zero for security reasons.
279 */ 287 */
280 target->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask; 288 target->thread.fpu.state->fxsave.mxcsr &= mxcsr_feature_mask;
281 289
282 xsave_hdr = &target->thread.xstate->xsave.xsave_hdr; 290 xsave_hdr = &target->thread.fpu.state->xsave.xsave_hdr;
283 291
284 xsave_hdr->xstate_bv &= pcntxt_mask; 292 xsave_hdr->xstate_bv &= pcntxt_mask;
285 /* 293 /*
@@ -365,7 +373,7 @@ static inline u32 twd_fxsr_to_i387(struct i387_fxsave_struct *fxsave)
365static void 373static void
366convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk) 374convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk)
367{ 375{
368 struct i387_fxsave_struct *fxsave = &tsk->thread.xstate->fxsave; 376 struct i387_fxsave_struct *fxsave = &tsk->thread.fpu.state->fxsave;
369 struct _fpreg *to = (struct _fpreg *) &env->st_space[0]; 377 struct _fpreg *to = (struct _fpreg *) &env->st_space[0];
370 struct _fpxreg *from = (struct _fpxreg *) &fxsave->st_space[0]; 378 struct _fpxreg *from = (struct _fpxreg *) &fxsave->st_space[0];
371 int i; 379 int i;
@@ -405,7 +413,7 @@ static void convert_to_fxsr(struct task_struct *tsk,
405 const struct user_i387_ia32_struct *env) 413 const struct user_i387_ia32_struct *env)
406 414
407{ 415{
408 struct i387_fxsave_struct *fxsave = &tsk->thread.xstate->fxsave; 416 struct i387_fxsave_struct *fxsave = &tsk->thread.fpu.state->fxsave;
409 struct _fpreg *from = (struct _fpreg *) &env->st_space[0]; 417 struct _fpreg *from = (struct _fpreg *) &env->st_space[0];
410 struct _fpxreg *to = (struct _fpxreg *) &fxsave->st_space[0]; 418 struct _fpxreg *to = (struct _fpxreg *) &fxsave->st_space[0];
411 int i; 419 int i;
@@ -445,10 +453,12 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset,
445 453
446 if (!cpu_has_fxsr) { 454 if (!cpu_has_fxsr) {
447 return user_regset_copyout(&pos, &count, &kbuf, &ubuf, 455 return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
448 &target->thread.xstate->fsave, 0, 456 &target->thread.fpu.state->fsave, 0,
449 -1); 457 -1);
450 } 458 }
451 459
460 sanitize_i387_state(target);
461
452 if (kbuf && pos == 0 && count == sizeof(env)) { 462 if (kbuf && pos == 0 && count == sizeof(env)) {
453 convert_from_fxsr(kbuf, target); 463 convert_from_fxsr(kbuf, target);
454 return 0; 464 return 0;
@@ -470,12 +480,14 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
470 if (ret) 480 if (ret)
471 return ret; 481 return ret;
472 482
483 sanitize_i387_state(target);
484
473 if (!HAVE_HWFP) 485 if (!HAVE_HWFP)
474 return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf); 486 return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf);
475 487
476 if (!cpu_has_fxsr) { 488 if (!cpu_has_fxsr) {
477 return user_regset_copyin(&pos, &count, &kbuf, &ubuf, 489 return user_regset_copyin(&pos, &count, &kbuf, &ubuf,
478 &target->thread.xstate->fsave, 0, -1); 490 &target->thread.fpu.state->fsave, 0, -1);
479 } 491 }
480 492
481 if (pos > 0 || count < sizeof(env)) 493 if (pos > 0 || count < sizeof(env))
@@ -490,7 +502,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
490 * presence of FP. 502 * presence of FP.
491 */ 503 */
492 if (cpu_has_xsave) 504 if (cpu_has_xsave)
493 target->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FP; 505 target->thread.fpu.state->xsave.xsave_hdr.xstate_bv |= XSTATE_FP;
494 return ret; 506 return ret;
495} 507}
496 508
@@ -501,7 +513,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
501static inline int save_i387_fsave(struct _fpstate_ia32 __user *buf) 513static inline int save_i387_fsave(struct _fpstate_ia32 __user *buf)
502{ 514{
503 struct task_struct *tsk = current; 515 struct task_struct *tsk = current;
504 struct i387_fsave_struct *fp = &tsk->thread.xstate->fsave; 516 struct i387_fsave_struct *fp = &tsk->thread.fpu.state->fsave;
505 517
506 fp->status = fp->swd; 518 fp->status = fp->swd;
507 if (__copy_to_user(buf, fp, sizeof(struct i387_fsave_struct))) 519 if (__copy_to_user(buf, fp, sizeof(struct i387_fsave_struct)))
@@ -512,7 +524,7 @@ static inline int save_i387_fsave(struct _fpstate_ia32 __user *buf)
512static int save_i387_fxsave(struct _fpstate_ia32 __user *buf) 524static int save_i387_fxsave(struct _fpstate_ia32 __user *buf)
513{ 525{
514 struct task_struct *tsk = current; 526 struct task_struct *tsk = current;
515 struct i387_fxsave_struct *fx = &tsk->thread.xstate->fxsave; 527 struct i387_fxsave_struct *fx = &tsk->thread.fpu.state->fxsave;
516 struct user_i387_ia32_struct env; 528 struct user_i387_ia32_struct env;
517 int err = 0; 529 int err = 0;
518 530
@@ -536,6 +548,9 @@ static int save_i387_xsave(void __user *buf)
536 struct _fpstate_ia32 __user *fx = buf; 548 struct _fpstate_ia32 __user *fx = buf;
537 int err = 0; 549 int err = 0;
538 550
551
552 sanitize_i387_state(tsk);
553
539 /* 554 /*
 540 * For legacy compatibility, we always set FP/SSE bits in the bit 555
541 * vector while saving the state to the user context. 556 * vector while saving the state to the user context.
@@ -547,7 +562,7 @@ static int save_i387_xsave(void __user *buf)
547 * header as well as change any contents in the memory layout. 562 * header as well as change any contents in the memory layout.
548 * xrestore as part of sigreturn will capture all the changes. 563 * xrestore as part of sigreturn will capture all the changes.
549 */ 564 */
550 tsk->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE; 565 tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE;
551 566
552 if (save_i387_fxsave(fx) < 0) 567 if (save_i387_fxsave(fx) < 0)
553 return -1; 568 return -1;
@@ -599,7 +614,7 @@ static inline int restore_i387_fsave(struct _fpstate_ia32 __user *buf)
599{ 614{
600 struct task_struct *tsk = current; 615 struct task_struct *tsk = current;
601 616
602 return __copy_from_user(&tsk->thread.xstate->fsave, buf, 617 return __copy_from_user(&tsk->thread.fpu.state->fsave, buf,
603 sizeof(struct i387_fsave_struct)); 618 sizeof(struct i387_fsave_struct));
604} 619}
605 620
@@ -610,10 +625,10 @@ static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf,
610 struct user_i387_ia32_struct env; 625 struct user_i387_ia32_struct env;
611 int err; 626 int err;
612 627
613 err = __copy_from_user(&tsk->thread.xstate->fxsave, &buf->_fxsr_env[0], 628 err = __copy_from_user(&tsk->thread.fpu.state->fxsave, &buf->_fxsr_env[0],
614 size); 629 size);
615 /* mxcsr reserved bits must be masked to zero for security reasons */ 630 /* mxcsr reserved bits must be masked to zero for security reasons */
616 tsk->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask; 631 tsk->thread.fpu.state->fxsave.mxcsr &= mxcsr_feature_mask;
617 if (err || __copy_from_user(&env, buf, sizeof(env))) 632 if (err || __copy_from_user(&env, buf, sizeof(env)))
618 return 1; 633 return 1;
619 convert_to_fxsr(tsk, &env); 634 convert_to_fxsr(tsk, &env);
@@ -629,7 +644,7 @@ static int restore_i387_xsave(void __user *buf)
629 struct i387_fxsave_struct __user *fx = 644 struct i387_fxsave_struct __user *fx =
630 (struct i387_fxsave_struct __user *) &fx_user->_fxsr_env[0]; 645 (struct i387_fxsave_struct __user *) &fx_user->_fxsr_env[0];
631 struct xsave_hdr_struct *xsave_hdr = 646 struct xsave_hdr_struct *xsave_hdr =
632 &current->thread.xstate->xsave.xsave_hdr; 647 &current->thread.fpu.state->xsave.xsave_hdr;
633 u64 mask; 648 u64 mask;
634 int err; 649 int err;
635 650
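For readers following the i387.c refactor above: the old init_fpu() allocated and initialized the xstate area inline, while the new code splits this into fpu_alloc() plus fpu_finit(). A condensed sketch of the resulting flow, using only names that appear in the hunks above (the tsk == current unlazy_fpu() fast path is omitted, so treat this as an illustration rather than the exact kernel function):

/* Sketch: lazy, first-use FPU state setup for a task */
static int lazy_fpu_setup(struct task_struct *tsk)
{
	int ret;

	if (tsk_used_math(tsk))
		return 0;			/* state already allocated and valid */

	ret = fpu_alloc(&tsk->thread.fpu);	/* may fail with -ENOMEM */
	if (ret)
		return ret;

	fpu_finit(&tsk->thread.fpu);		/* reset FCW and, with XMM, MXCSR */
	set_stopped_child_used_math(tsk);	/* mark the FPU state as in use */
	return 0;
}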
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index 23c167925a5c..2dfd31597443 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -16,7 +16,7 @@
16#include <asm/hpet.h> 16#include <asm/hpet.h>
17#include <asm/smp.h> 17#include <asm/smp.h>
18 18
19DEFINE_SPINLOCK(i8253_lock); 19DEFINE_RAW_SPINLOCK(i8253_lock);
20EXPORT_SYMBOL(i8253_lock); 20EXPORT_SYMBOL(i8253_lock);
21 21
22/* 22/*
@@ -33,7 +33,7 @@ struct clock_event_device *global_clock_event;
33static void init_pit_timer(enum clock_event_mode mode, 33static void init_pit_timer(enum clock_event_mode mode,
34 struct clock_event_device *evt) 34 struct clock_event_device *evt)
35{ 35{
36 spin_lock(&i8253_lock); 36 raw_spin_lock(&i8253_lock);
37 37
38 switch (mode) { 38 switch (mode) {
39 case CLOCK_EVT_MODE_PERIODIC: 39 case CLOCK_EVT_MODE_PERIODIC:
@@ -62,7 +62,7 @@ static void init_pit_timer(enum clock_event_mode mode,
62 /* Nothing to do here */ 62 /* Nothing to do here */
63 break; 63 break;
64 } 64 }
65 spin_unlock(&i8253_lock); 65 raw_spin_unlock(&i8253_lock);
66} 66}
67 67
68/* 68/*
@@ -72,10 +72,10 @@ static void init_pit_timer(enum clock_event_mode mode,
72 */ 72 */
73static int pit_next_event(unsigned long delta, struct clock_event_device *evt) 73static int pit_next_event(unsigned long delta, struct clock_event_device *evt)
74{ 74{
75 spin_lock(&i8253_lock); 75 raw_spin_lock(&i8253_lock);
76 outb_pit(delta & 0xff , PIT_CH0); /* LSB */ 76 outb_pit(delta & 0xff , PIT_CH0); /* LSB */
77 outb_pit(delta >> 8 , PIT_CH0); /* MSB */ 77 outb_pit(delta >> 8 , PIT_CH0); /* MSB */
78 spin_unlock(&i8253_lock); 78 raw_spin_unlock(&i8253_lock);
79 79
80 return 0; 80 return 0;
81} 81}
@@ -130,7 +130,7 @@ static cycle_t pit_read(struct clocksource *cs)
130 int count; 130 int count;
131 u32 jifs; 131 u32 jifs;
132 132
133 spin_lock_irqsave(&i8253_lock, flags); 133 raw_spin_lock_irqsave(&i8253_lock, flags);
134 /* 134 /*
135 * Although our caller may have the read side of xtime_lock, 135 * Although our caller may have the read side of xtime_lock,
136 * this is now a seqlock, and we are cheating in this routine 136 * this is now a seqlock, and we are cheating in this routine
@@ -176,7 +176,7 @@ static cycle_t pit_read(struct clocksource *cs)
176 old_count = count; 176 old_count = count;
177 old_jifs = jifs; 177 old_jifs = jifs;
178 178
179 spin_unlock_irqrestore(&i8253_lock, flags); 179 raw_spin_unlock_irqrestore(&i8253_lock, flags);
180 180
181 count = (LATCH - 1) - count; 181 count = (LATCH - 1) - count;
182 182
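The i8253 hunks above convert i8253_lock to a raw spinlock: the lock is taken from clockevent and clocksource paths that must never sleep, and a raw_spinlock_t keeps spinning even on preempt-rt configurations where ordinary spinlocks become sleeping locks. A minimal sketch of the pattern with a hypothetical device lock:

#include <linux/types.h>
#include <linux/spinlock.h>

static DEFINE_RAW_SPINLOCK(hw_timer_lock);	/* stays a spinning lock everywhere */

static void hw_timer_program(u8 lsb, u8 msb)
{
	unsigned long flags;

	/* irqsave variant: safe from any context, IRQs masked while held */
	raw_spin_lock_irqsave(&hw_timer_lock, flags);
	/* ... program the counter registers here ... */
	raw_spin_unlock_irqrestore(&hw_timer_lock, flags);
}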
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index 7c9f02c130f3..cafa7c80ac95 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -276,16 +276,6 @@ static struct sys_device device_i8259A = {
276 .cls = &i8259_sysdev_class, 276 .cls = &i8259_sysdev_class,
277}; 277};
278 278
279static int __init i8259A_init_sysfs(void)
280{
281 int error = sysdev_class_register(&i8259_sysdev_class);
282 if (!error)
283 error = sysdev_register(&device_i8259A);
284 return error;
285}
286
287device_initcall(i8259A_init_sysfs);
288
289static void mask_8259A(void) 279static void mask_8259A(void)
290{ 280{
291 unsigned long flags; 281 unsigned long flags;
@@ -407,3 +397,18 @@ struct legacy_pic default_legacy_pic = {
407}; 397};
408 398
409struct legacy_pic *legacy_pic = &default_legacy_pic; 399struct legacy_pic *legacy_pic = &default_legacy_pic;
400
401static int __init i8259A_init_sysfs(void)
402{
403 int error;
404
405 if (legacy_pic != &default_legacy_pic)
406 return 0;
407
408 error = sysdev_class_register(&i8259_sysdev_class);
409 if (!error)
410 error = sysdev_register(&device_i8259A);
411 return error;
412}
413
414device_initcall(i8259A_init_sysfs);
diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c
index 3a54dcb9cd0e..43e9ccf44947 100644
--- a/arch/x86/kernel/init_task.c
+++ b/arch/x86/kernel/init_task.c
@@ -34,7 +34,7 @@ EXPORT_SYMBOL(init_task);
34/* 34/*
35 * per-CPU TSS segments. Threads are completely 'soft' on Linux, 35 * per-CPU TSS segments. Threads are completely 'soft' on Linux,
36 * no more per-task TSS's. The TSS size is kept cacheline-aligned 36 * no more per-task TSS's. The TSS size is kept cacheline-aligned
37 * so they are allowed to end up in the .data.cacheline_aligned 37 * so they are allowed to end up in the .data..cacheline_aligned
38 * section. Since TSS's are completely CPU-local, we want them 38 * section. Since TSS's are completely CPU-local, we want them
39 * on exact cacheline boundaries, to eliminate cacheline ping-pong. 39 * on exact cacheline boundaries, to eliminate cacheline ping-pong.
40 */ 40 */
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 0ed2d300cd46..990ae7cfc578 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -60,7 +60,7 @@ static irqreturn_t math_error_irq(int cpl, void *dev_id)
60 outb(0, 0xF0); 60 outb(0, 0xF0);
61 if (ignore_fpu_irq || !boot_cpu_data.hard_math) 61 if (ignore_fpu_irq || !boot_cpu_data.hard_math)
62 return IRQ_NONE; 62 return IRQ_NONE;
63 math_error((void __user *)get_irq_regs()->ip); 63 math_error(get_irq_regs(), 0, 16);
64 return IRQ_HANDLED; 64 return IRQ_HANDLED;
65} 65}
66 66
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index b2258ca91003..ef10940e1af0 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -47,69 +47,96 @@
47#include <asm/debugreg.h> 47#include <asm/debugreg.h>
48#include <asm/apicdef.h> 48#include <asm/apicdef.h>
49#include <asm/system.h> 49#include <asm/system.h>
50
51#include <asm/apic.h> 50#include <asm/apic.h>
52 51
53/* 52struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] =
54 * Put the error code here just in case the user cares:
55 */
56static int gdb_x86errcode;
57
58/*
59 * Likewise, the vector number here (since GDB only gets the signal
60 * number through the usual means, and that's not very specific):
61 */
62static int gdb_x86vector = -1;
63
64/**
65 * pt_regs_to_gdb_regs - Convert ptrace regs to GDB regs
66 * @gdb_regs: A pointer to hold the registers in the order GDB wants.
67 * @regs: The &struct pt_regs of the current process.
68 *
69 * Convert the pt_regs in @regs into the format for registers that
70 * GDB expects, stored in @gdb_regs.
71 */
72void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
73{ 53{
74#ifndef CONFIG_X86_32 54#ifdef CONFIG_X86_32
75 u32 *gdb_regs32 = (u32 *)gdb_regs; 55 { "ax", 4, offsetof(struct pt_regs, ax) },
56 { "cx", 4, offsetof(struct pt_regs, cx) },
57 { "dx", 4, offsetof(struct pt_regs, dx) },
58 { "bx", 4, offsetof(struct pt_regs, bx) },
59 { "sp", 4, offsetof(struct pt_regs, sp) },
60 { "bp", 4, offsetof(struct pt_regs, bp) },
61 { "si", 4, offsetof(struct pt_regs, si) },
62 { "di", 4, offsetof(struct pt_regs, di) },
63 { "ip", 4, offsetof(struct pt_regs, ip) },
64 { "flags", 4, offsetof(struct pt_regs, flags) },
65 { "cs", 4, offsetof(struct pt_regs, cs) },
66 { "ss", 4, offsetof(struct pt_regs, ss) },
67 { "ds", 4, offsetof(struct pt_regs, ds) },
68 { "es", 4, offsetof(struct pt_regs, es) },
69 { "fs", 4, -1 },
70 { "gs", 4, -1 },
71#else
72 { "ax", 8, offsetof(struct pt_regs, ax) },
73 { "bx", 8, offsetof(struct pt_regs, bx) },
74 { "cx", 8, offsetof(struct pt_regs, cx) },
75 { "dx", 8, offsetof(struct pt_regs, dx) },
76 { "si", 8, offsetof(struct pt_regs, dx) },
77 { "di", 8, offsetof(struct pt_regs, di) },
78 { "bp", 8, offsetof(struct pt_regs, bp) },
79 { "sp", 8, offsetof(struct pt_regs, sp) },
80 { "r8", 8, offsetof(struct pt_regs, r8) },
81 { "r9", 8, offsetof(struct pt_regs, r9) },
82 { "r10", 8, offsetof(struct pt_regs, r10) },
83 { "r11", 8, offsetof(struct pt_regs, r11) },
84 { "r12", 8, offsetof(struct pt_regs, r12) },
85 { "r13", 8, offsetof(struct pt_regs, r13) },
86 { "r14", 8, offsetof(struct pt_regs, r14) },
87 { "r15", 8, offsetof(struct pt_regs, r15) },
88 { "ip", 8, offsetof(struct pt_regs, ip) },
89 { "flags", 4, offsetof(struct pt_regs, flags) },
90 { "cs", 4, offsetof(struct pt_regs, cs) },
91 { "ss", 4, offsetof(struct pt_regs, ss) },
76#endif 92#endif
77 gdb_regs[GDB_AX] = regs->ax; 93};
78 gdb_regs[GDB_BX] = regs->bx; 94
79 gdb_regs[GDB_CX] = regs->cx; 95int dbg_set_reg(int regno, void *mem, struct pt_regs *regs)
80 gdb_regs[GDB_DX] = regs->dx; 96{
81 gdb_regs[GDB_SI] = regs->si; 97 if (
82 gdb_regs[GDB_DI] = regs->di;
83 gdb_regs[GDB_BP] = regs->bp;
84 gdb_regs[GDB_PC] = regs->ip;
85#ifdef CONFIG_X86_32 98#ifdef CONFIG_X86_32
86 gdb_regs[GDB_PS] = regs->flags; 99 regno == GDB_SS || regno == GDB_FS || regno == GDB_GS ||
87 gdb_regs[GDB_DS] = regs->ds; 100#endif
88 gdb_regs[GDB_ES] = regs->es; 101 regno == GDB_SP || regno == GDB_ORIG_AX)
89 gdb_regs[GDB_CS] = regs->cs; 102 return 0;
90 gdb_regs[GDB_FS] = 0xFFFF; 103
91 gdb_regs[GDB_GS] = 0xFFFF; 104 if (dbg_reg_def[regno].offset != -1)
92 if (user_mode_vm(regs)) { 105 memcpy((void *)regs + dbg_reg_def[regno].offset, mem,
93 gdb_regs[GDB_SS] = regs->ss; 106 dbg_reg_def[regno].size);
94 gdb_regs[GDB_SP] = regs->sp; 107 return 0;
95 } else { 108}
96 gdb_regs[GDB_SS] = __KERNEL_DS; 109
97 gdb_regs[GDB_SP] = kernel_stack_pointer(regs); 110char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs)
111{
112 if (regno == GDB_ORIG_AX) {
113 memcpy(mem, &regs->orig_ax, sizeof(regs->orig_ax));
114 return "orig_ax";
98 } 115 }
99#else 116 if (regno >= DBG_MAX_REG_NUM || regno < 0)
100 gdb_regs[GDB_R8] = regs->r8; 117 return NULL;
101 gdb_regs[GDB_R9] = regs->r9; 118
102 gdb_regs[GDB_R10] = regs->r10; 119 if (dbg_reg_def[regno].offset != -1)
103 gdb_regs[GDB_R11] = regs->r11; 120 memcpy(mem, (void *)regs + dbg_reg_def[regno].offset,
104 gdb_regs[GDB_R12] = regs->r12; 121 dbg_reg_def[regno].size);
105 gdb_regs[GDB_R13] = regs->r13; 122
106 gdb_regs[GDB_R14] = regs->r14; 123 switch (regno) {
107 gdb_regs[GDB_R15] = regs->r15; 124#ifdef CONFIG_X86_32
108 gdb_regs32[GDB_PS] = regs->flags; 125 case GDB_SS:
109 gdb_regs32[GDB_CS] = regs->cs; 126 if (!user_mode_vm(regs))
110 gdb_regs32[GDB_SS] = regs->ss; 127 *(unsigned long *)mem = __KERNEL_DS;
111 gdb_regs[GDB_SP] = kernel_stack_pointer(regs); 128 break;
129 case GDB_SP:
130 if (!user_mode_vm(regs))
131 *(unsigned long *)mem = kernel_stack_pointer(regs);
132 break;
133 case GDB_GS:
134 case GDB_FS:
135 *(unsigned long *)mem = 0xFFFF;
136 break;
112#endif 137#endif
138 }
139 return dbg_reg_def[regno].name;
113} 140}
114 141
115/** 142/**
@@ -162,66 +189,35 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
162 gdb_regs[GDB_SP] = p->thread.sp; 189 gdb_regs[GDB_SP] = p->thread.sp;
163} 190}
164 191
165/**
166 * gdb_regs_to_pt_regs - Convert GDB regs to ptrace regs.
167 * @gdb_regs: A pointer to hold the registers we've received from GDB.
168 * @regs: A pointer to a &struct pt_regs to hold these values in.
169 *
170 * Convert the GDB regs in @gdb_regs into the pt_regs, and store them
171 * in @regs.
172 */
173void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
174{
175#ifndef CONFIG_X86_32
176 u32 *gdb_regs32 = (u32 *)gdb_regs;
177#endif
178 regs->ax = gdb_regs[GDB_AX];
179 regs->bx = gdb_regs[GDB_BX];
180 regs->cx = gdb_regs[GDB_CX];
181 regs->dx = gdb_regs[GDB_DX];
182 regs->si = gdb_regs[GDB_SI];
183 regs->di = gdb_regs[GDB_DI];
184 regs->bp = gdb_regs[GDB_BP];
185 regs->ip = gdb_regs[GDB_PC];
186#ifdef CONFIG_X86_32
187 regs->flags = gdb_regs[GDB_PS];
188 regs->ds = gdb_regs[GDB_DS];
189 regs->es = gdb_regs[GDB_ES];
190 regs->cs = gdb_regs[GDB_CS];
191#else
192 regs->r8 = gdb_regs[GDB_R8];
193 regs->r9 = gdb_regs[GDB_R9];
194 regs->r10 = gdb_regs[GDB_R10];
195 regs->r11 = gdb_regs[GDB_R11];
196 regs->r12 = gdb_regs[GDB_R12];
197 regs->r13 = gdb_regs[GDB_R13];
198 regs->r14 = gdb_regs[GDB_R14];
199 regs->r15 = gdb_regs[GDB_R15];
200 regs->flags = gdb_regs32[GDB_PS];
201 regs->cs = gdb_regs32[GDB_CS];
202 regs->ss = gdb_regs32[GDB_SS];
203#endif
204}
205
206static struct hw_breakpoint { 192static struct hw_breakpoint {
207 unsigned enabled; 193 unsigned enabled;
208 unsigned long addr; 194 unsigned long addr;
209 int len; 195 int len;
210 int type; 196 int type;
211 struct perf_event **pev; 197 struct perf_event **pev;
212} breakinfo[4]; 198} breakinfo[HBP_NUM];
199
200static unsigned long early_dr7;
213 201
214static void kgdb_correct_hw_break(void) 202static void kgdb_correct_hw_break(void)
215{ 203{
216 int breakno; 204 int breakno;
217 205
218 for (breakno = 0; breakno < 4; breakno++) { 206 for (breakno = 0; breakno < HBP_NUM; breakno++) {
219 struct perf_event *bp; 207 struct perf_event *bp;
220 struct arch_hw_breakpoint *info; 208 struct arch_hw_breakpoint *info;
221 int val; 209 int val;
222 int cpu = raw_smp_processor_id(); 210 int cpu = raw_smp_processor_id();
223 if (!breakinfo[breakno].enabled) 211 if (!breakinfo[breakno].enabled)
224 continue; 212 continue;
213 if (dbg_is_early) {
214 set_debugreg(breakinfo[breakno].addr, breakno);
215 early_dr7 |= encode_dr7(breakno,
216 breakinfo[breakno].len,
217 breakinfo[breakno].type);
218 set_debugreg(early_dr7, 7);
219 continue;
220 }
225 bp = *per_cpu_ptr(breakinfo[breakno].pev, cpu); 221 bp = *per_cpu_ptr(breakinfo[breakno].pev, cpu);
226 info = counter_arch_bp(bp); 222 info = counter_arch_bp(bp);
227 if (bp->attr.disabled != 1) 223 if (bp->attr.disabled != 1)
@@ -236,7 +232,8 @@ static void kgdb_correct_hw_break(void)
236 if (!val) 232 if (!val)
237 bp->attr.disabled = 0; 233 bp->attr.disabled = 0;
238 } 234 }
239 hw_breakpoint_restore(); 235 if (!dbg_is_early)
236 hw_breakpoint_restore();
240} 237}
241 238
242static int hw_break_reserve_slot(int breakno) 239static int hw_break_reserve_slot(int breakno)
@@ -245,6 +242,9 @@ static int hw_break_reserve_slot(int breakno)
245 int cnt = 0; 242 int cnt = 0;
246 struct perf_event **pevent; 243 struct perf_event **pevent;
247 244
245 if (dbg_is_early)
246 return 0;
247
248 for_each_online_cpu(cpu) { 248 for_each_online_cpu(cpu) {
249 cnt++; 249 cnt++;
250 pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu); 250 pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu);
@@ -270,6 +270,9 @@ static int hw_break_release_slot(int breakno)
270 struct perf_event **pevent; 270 struct perf_event **pevent;
271 int cpu; 271 int cpu;
272 272
273 if (dbg_is_early)
274 return 0;
275
273 for_each_online_cpu(cpu) { 276 for_each_online_cpu(cpu) {
274 pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu); 277 pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu);
275 if (dbg_release_bp_slot(*pevent)) 278 if (dbg_release_bp_slot(*pevent))
@@ -287,10 +290,10 @@ kgdb_remove_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
287{ 290{
288 int i; 291 int i;
289 292
290 for (i = 0; i < 4; i++) 293 for (i = 0; i < HBP_NUM; i++)
291 if (breakinfo[i].addr == addr && breakinfo[i].enabled) 294 if (breakinfo[i].addr == addr && breakinfo[i].enabled)
292 break; 295 break;
293 if (i == 4) 296 if (i == HBP_NUM)
294 return -1; 297 return -1;
295 298
296 if (hw_break_release_slot(i)) { 299 if (hw_break_release_slot(i)) {
@@ -308,13 +311,17 @@ static void kgdb_remove_all_hw_break(void)
308 int cpu = raw_smp_processor_id(); 311 int cpu = raw_smp_processor_id();
309 struct perf_event *bp; 312 struct perf_event *bp;
310 313
311 for (i = 0; i < 4; i++) { 314 for (i = 0; i < HBP_NUM; i++) {
312 if (!breakinfo[i].enabled) 315 if (!breakinfo[i].enabled)
313 continue; 316 continue;
314 bp = *per_cpu_ptr(breakinfo[i].pev, cpu); 317 bp = *per_cpu_ptr(breakinfo[i].pev, cpu);
315 if (bp->attr.disabled == 1) 318 if (bp->attr.disabled == 1)
316 continue; 319 continue;
317 arch_uninstall_hw_breakpoint(bp); 320 if (dbg_is_early)
321 early_dr7 &= ~encode_dr7(i, breakinfo[i].len,
322 breakinfo[i].type);
323 else
324 arch_uninstall_hw_breakpoint(bp);
318 bp->attr.disabled = 1; 325 bp->attr.disabled = 1;
319 } 326 }
320} 327}
@@ -324,10 +331,10 @@ kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
324{ 331{
325 int i; 332 int i;
326 333
327 for (i = 0; i < 4; i++) 334 for (i = 0; i < HBP_NUM; i++)
328 if (!breakinfo[i].enabled) 335 if (!breakinfo[i].enabled)
329 break; 336 break;
330 if (i == 4) 337 if (i == HBP_NUM)
331 return -1; 338 return -1;
332 339
333 switch (bptype) { 340 switch (bptype) {
@@ -388,9 +395,14 @@ void kgdb_disable_hw_debug(struct pt_regs *regs)
388 395
389 /* Disable hardware debugging while we are in kgdb: */ 396 /* Disable hardware debugging while we are in kgdb: */
390 set_debugreg(0UL, 7); 397 set_debugreg(0UL, 7);
391 for (i = 0; i < 4; i++) { 398 for (i = 0; i < HBP_NUM; i++) {
392 if (!breakinfo[i].enabled) 399 if (!breakinfo[i].enabled)
393 continue; 400 continue;
401 if (dbg_is_early) {
402 early_dr7 &= ~encode_dr7(i, breakinfo[i].len,
403 breakinfo[i].type);
404 continue;
405 }
394 bp = *per_cpu_ptr(breakinfo[i].pev, cpu); 406 bp = *per_cpu_ptr(breakinfo[i].pev, cpu);
395 if (bp->attr.disabled == 1) 407 if (bp->attr.disabled == 1)
396 continue; 408 continue;
@@ -399,23 +411,6 @@ void kgdb_disable_hw_debug(struct pt_regs *regs)
399 } 411 }
400} 412}
401 413
402/**
403 * kgdb_post_primary_code - Save error vector/code numbers.
404 * @regs: Original pt_regs.
405 * @e_vector: Original error vector.
406 * @err_code: Original error code.
407 *
408 * This is needed on architectures which support SMP and KGDB.
409 * This function is called after all the slave cpus have been put
 410 * to a known spin state and the primary CPU has control over KGDB.
411 */
412void kgdb_post_primary_code(struct pt_regs *regs, int e_vector, int err_code)
413{
414 /* primary processor is completely in the debugger */
415 gdb_x86vector = e_vector;
416 gdb_x86errcode = err_code;
417}
418
419#ifdef CONFIG_SMP 414#ifdef CONFIG_SMP
420/** 415/**
421 * kgdb_roundup_cpus - Get other CPUs into a holding pattern 416 * kgdb_roundup_cpus - Get other CPUs into a holding pattern
@@ -461,7 +456,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
461{ 456{
462 unsigned long addr; 457 unsigned long addr;
463 char *ptr; 458 char *ptr;
464 int newPC;
465 459
466 switch (remcomInBuffer[0]) { 460 switch (remcomInBuffer[0]) {
467 case 'c': 461 case 'c':
@@ -472,8 +466,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
472 linux_regs->ip = addr; 466 linux_regs->ip = addr;
473 case 'D': 467 case 'D':
474 case 'k': 468 case 'k':
475 newPC = linux_regs->ip;
476
477 /* clear the trace bit */ 469 /* clear the trace bit */
478 linux_regs->flags &= ~X86_EFLAGS_TF; 470 linux_regs->flags &= ~X86_EFLAGS_TF;
479 atomic_set(&kgdb_cpu_doing_single_step, -1); 471 atomic_set(&kgdb_cpu_doing_single_step, -1);
@@ -567,7 +559,7 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd)
567 return NOTIFY_DONE; 559 return NOTIFY_DONE;
568 } 560 }
569 561
570 if (kgdb_handle_exception(args->trapnr, args->signr, args->err, regs)) 562 if (kgdb_handle_exception(args->trapnr, args->signr, cmd, regs))
571 return NOTIFY_DONE; 563 return NOTIFY_DONE;
572 564
573 /* Must touch watchdog before return to normal operation */ 565 /* Must touch watchdog before return to normal operation */
@@ -575,6 +567,24 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd)
575 return NOTIFY_STOP; 567 return NOTIFY_STOP;
576} 568}
577 569
570int kgdb_ll_trap(int cmd, const char *str,
571 struct pt_regs *regs, long err, int trap, int sig)
572{
573 struct die_args args = {
574 .regs = regs,
575 .str = str,
576 .err = err,
577 .trapnr = trap,
578 .signr = sig,
579
580 };
581
582 if (!kgdb_io_module_registered)
583 return NOTIFY_DONE;
584
585 return __kgdb_notify(&args, cmd);
586}
587
578static int 588static int
579kgdb_notify(struct notifier_block *self, unsigned long cmd, void *ptr) 589kgdb_notify(struct notifier_block *self, unsigned long cmd, void *ptr)
580{ 590{
@@ -605,14 +615,21 @@ static struct notifier_block kgdb_notifier = {
605 */ 615 */
606int kgdb_arch_init(void) 616int kgdb_arch_init(void)
607{ 617{
618 return register_die_notifier(&kgdb_notifier);
619}
620
621static void kgdb_hw_overflow_handler(struct perf_event *event, int nmi,
622 struct perf_sample_data *data, struct pt_regs *regs)
623{
624 kgdb_ll_trap(DIE_DEBUG, "debug", regs, 0, 0, SIGTRAP);
625}
626
627void kgdb_arch_late(void)
628{
608 int i, cpu; 629 int i, cpu;
609 int ret;
610 struct perf_event_attr attr; 630 struct perf_event_attr attr;
611 struct perf_event **pevent; 631 struct perf_event **pevent;
612 632
613 ret = register_die_notifier(&kgdb_notifier);
614 if (ret != 0)
615 return ret;
616 /* 633 /*
 617 * Pre-allocate the hw breakpoint structures in the non-atomic 634
 618 * portion of kgdb because this operation requires mutexes to 635
@@ -623,24 +640,27 @@ int kgdb_arch_init(void)
623 attr.bp_len = HW_BREAKPOINT_LEN_1; 640 attr.bp_len = HW_BREAKPOINT_LEN_1;
624 attr.bp_type = HW_BREAKPOINT_W; 641 attr.bp_type = HW_BREAKPOINT_W;
625 attr.disabled = 1; 642 attr.disabled = 1;
626 for (i = 0; i < 4; i++) { 643 for (i = 0; i < HBP_NUM; i++) {
644 if (breakinfo[i].pev)
645 continue;
627 breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL); 646 breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL);
628 if (IS_ERR(breakinfo[i].pev)) { 647 if (IS_ERR(breakinfo[i].pev)) {
629 printk(KERN_ERR "kgdb: Could not allocate hw breakpoints\n"); 648 printk(KERN_ERR "kgdb: Could not allocate hw"
649 "breakpoints\nDisabling the kernel debugger\n");
630 breakinfo[i].pev = NULL; 650 breakinfo[i].pev = NULL;
631 kgdb_arch_exit(); 651 kgdb_arch_exit();
632 return -1; 652 return;
633 } 653 }
634 for_each_online_cpu(cpu) { 654 for_each_online_cpu(cpu) {
635 pevent = per_cpu_ptr(breakinfo[i].pev, cpu); 655 pevent = per_cpu_ptr(breakinfo[i].pev, cpu);
636 pevent[0]->hw.sample_period = 1; 656 pevent[0]->hw.sample_period = 1;
657 pevent[0]->overflow_handler = kgdb_hw_overflow_handler;
637 if (pevent[0]->destroy != NULL) { 658 if (pevent[0]->destroy != NULL) {
638 pevent[0]->destroy = NULL; 659 pevent[0]->destroy = NULL;
639 release_bp_slot(*pevent); 660 release_bp_slot(*pevent);
640 } 661 }
641 } 662 }
642 } 663 }
643 return ret;
644} 664}
645 665
646/** 666/**
@@ -690,6 +710,11 @@ unsigned long kgdb_arch_pc(int exception, struct pt_regs *regs)
690 return instruction_pointer(regs); 710 return instruction_pointer(regs);
691} 711}
692 712
713void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long ip)
714{
715 regs->ip = ip;
716}
717
693struct kgdb_arch arch_kgdb_ops = { 718struct kgdb_arch arch_kgdb_ops = {
694 /* Breakpoint instruction: */ 719 /* Breakpoint instruction: */
695 .gdb_bpt_instr = { 0xcc }, 720 .gdb_bpt_instr = { 0xcc },
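The dbg_is_early paths above program DR7 by hand through encode_dr7() because the perf-backed breakpoint slots are not usable that early in boot. As a rough illustration of what such an encoding has to produce, here is a sketch based on the architectural DR7 layout (not the kernel's actual helper): slot n is enabled by bit 2n, and its type and length live in the nibble starting at bit 16 + 4n.

/* Sketch: build a DR7 value that enables breakpoint slot n (0..3).
 * type: 0 = execute, 1 = write, 3 = read/write
 * len:  0 = 1 byte, 1 = 2 bytes, 3 = 4 bytes, 2 = 8 bytes (64-bit only)
 */
static unsigned long dr7_encode(int n, unsigned int len, unsigned int type)
{
	unsigned long dr7 = 0;

	dr7 |= 1UL << (2 * n);				/* local enable bit Ln */
	dr7 |= (unsigned long)type << (16 + 4 * n);	/* R/Wn field */
	dr7 |= (unsigned long)len  << (18 + 4 * n);	/* LENn field */

	return dr7;
}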
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 1658efdfb4e5..1bfb6cf4dd55 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -126,16 +126,22 @@ static void __kprobes synthesize_reljump(void *from, void *to)
126} 126}
127 127
128/* 128/*
129 * Check for the REX prefix which can only exist on X86_64 129 * Skip the prefixes of the instruction.
130 * X86_32 always returns 0
131 */ 130 */
132static int __kprobes is_REX_prefix(kprobe_opcode_t *insn) 131static kprobe_opcode_t *__kprobes skip_prefixes(kprobe_opcode_t *insn)
133{ 132{
133 insn_attr_t attr;
134
135 attr = inat_get_opcode_attribute((insn_byte_t)*insn);
136 while (inat_is_legacy_prefix(attr)) {
137 insn++;
138 attr = inat_get_opcode_attribute((insn_byte_t)*insn);
139 }
134#ifdef CONFIG_X86_64 140#ifdef CONFIG_X86_64
135 if ((*insn & 0xf0) == 0x40) 141 if (inat_is_rex_prefix(attr))
136 return 1; 142 insn++;
137#endif 143#endif
138 return 0; 144 return insn;
139} 145}
140 146
141/* 147/*
@@ -272,6 +278,9 @@ static int __kprobes can_probe(unsigned long paddr)
272 */ 278 */
273static int __kprobes is_IF_modifier(kprobe_opcode_t *insn) 279static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
274{ 280{
281 /* Skip prefixes */
282 insn = skip_prefixes(insn);
283
275 switch (*insn) { 284 switch (*insn) {
276 case 0xfa: /* cli */ 285 case 0xfa: /* cli */
277 case 0xfb: /* sti */ 286 case 0xfb: /* sti */
@@ -280,13 +289,6 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
280 return 1; 289 return 1;
281 } 290 }
282 291
283 /*
284 * on X86_64, 0x40-0x4f are REX prefixes so we need to look
285 * at the next byte instead.. but of course not recurse infinitely
286 */
287 if (is_REX_prefix(insn))
288 return is_IF_modifier(++insn);
289
290 return 0; 292 return 0;
291} 293}
292 294
@@ -422,14 +424,22 @@ static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
422 424
423static void __kprobes clear_btf(void) 425static void __kprobes clear_btf(void)
424{ 426{
425 if (test_thread_flag(TIF_DEBUGCTLMSR)) 427 if (test_thread_flag(TIF_BLOCKSTEP)) {
426 update_debugctlmsr(0); 428 unsigned long debugctl = get_debugctlmsr();
429
430 debugctl &= ~DEBUGCTLMSR_BTF;
431 update_debugctlmsr(debugctl);
432 }
427} 433}
428 434
429static void __kprobes restore_btf(void) 435static void __kprobes restore_btf(void)
430{ 436{
431 if (test_thread_flag(TIF_DEBUGCTLMSR)) 437 if (test_thread_flag(TIF_BLOCKSTEP)) {
432 update_debugctlmsr(current->thread.debugctlmsr); 438 unsigned long debugctl = get_debugctlmsr();
439
440 debugctl |= DEBUGCTLMSR_BTF;
441 update_debugctlmsr(debugctl);
442 }
433} 443}
434 444
435void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, 445void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
@@ -632,8 +642,8 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
632 /* Skip cs, ip, orig_ax and gs. */ \ 642 /* Skip cs, ip, orig_ax and gs. */ \
633 " subl $16, %esp\n" \ 643 " subl $16, %esp\n" \
634 " pushl %fs\n" \ 644 " pushl %fs\n" \
635 " pushl %ds\n" \
636 " pushl %es\n" \ 645 " pushl %es\n" \
646 " pushl %ds\n" \
637 " pushl %eax\n" \ 647 " pushl %eax\n" \
638 " pushl %ebp\n" \ 648 " pushl %ebp\n" \
639 " pushl %edi\n" \ 649 " pushl %edi\n" \
@@ -795,9 +805,8 @@ static void __kprobes resume_execution(struct kprobe *p,
795 unsigned long orig_ip = (unsigned long)p->addr; 805 unsigned long orig_ip = (unsigned long)p->addr;
796 kprobe_opcode_t *insn = p->ainsn.insn; 806 kprobe_opcode_t *insn = p->ainsn.insn;
797 807
798 /*skip the REX prefix*/ 808 /* Skip prefixes */
799 if (is_REX_prefix(insn)) 809 insn = skip_prefixes(insn);
800 insn++;
801 810
802 regs->flags &= ~X86_EFLAGS_TF; 811 regs->flags &= ~X86_EFLAGS_TF;
803 switch (*insn) { 812 switch (*insn) {
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index feaeb0d3aa4f..eb9b76c716c2 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -29,6 +29,8 @@
29#define KVM_SCALE 22 29#define KVM_SCALE 22
30 30
31static int kvmclock = 1; 31static int kvmclock = 1;
32static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME;
33static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK;
32 34
33static int parse_no_kvmclock(char *arg) 35static int parse_no_kvmclock(char *arg)
34{ 36{
@@ -54,7 +56,8 @@ static unsigned long kvm_get_wallclock(void)
54 56
55 low = (int)__pa_symbol(&wall_clock); 57 low = (int)__pa_symbol(&wall_clock);
56 high = ((u64)__pa_symbol(&wall_clock) >> 32); 58 high = ((u64)__pa_symbol(&wall_clock) >> 32);
57 native_write_msr(MSR_KVM_WALL_CLOCK, low, high); 59
60 native_write_msr(msr_kvm_wall_clock, low, high);
58 61
59 vcpu_time = &get_cpu_var(hv_clock); 62 vcpu_time = &get_cpu_var(hv_clock);
60 pvclock_read_wallclock(&wall_clock, vcpu_time, &ts); 63 pvclock_read_wallclock(&wall_clock, vcpu_time, &ts);
@@ -130,7 +133,8 @@ static int kvm_register_clock(char *txt)
130 high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32); 133 high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32);
131 printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n", 134 printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n",
132 cpu, high, low, txt); 135 cpu, high, low, txt);
133 return native_write_msr_safe(MSR_KVM_SYSTEM_TIME, low, high); 136
137 return native_write_msr_safe(msr_kvm_system_time, low, high);
134} 138}
135 139
136#ifdef CONFIG_X86_LOCAL_APIC 140#ifdef CONFIG_X86_LOCAL_APIC
@@ -165,14 +169,14 @@ static void __init kvm_smp_prepare_boot_cpu(void)
165#ifdef CONFIG_KEXEC 169#ifdef CONFIG_KEXEC
166static void kvm_crash_shutdown(struct pt_regs *regs) 170static void kvm_crash_shutdown(struct pt_regs *regs)
167{ 171{
168 native_write_msr_safe(MSR_KVM_SYSTEM_TIME, 0, 0); 172 native_write_msr(msr_kvm_system_time, 0, 0);
169 native_machine_crash_shutdown(regs); 173 native_machine_crash_shutdown(regs);
170} 174}
171#endif 175#endif
172 176
173static void kvm_shutdown(void) 177static void kvm_shutdown(void)
174{ 178{
175 native_write_msr_safe(MSR_KVM_SYSTEM_TIME, 0, 0); 179 native_write_msr(msr_kvm_system_time, 0, 0);
176 native_machine_shutdown(); 180 native_machine_shutdown();
177} 181}
178 182
@@ -181,27 +185,37 @@ void __init kvmclock_init(void)
181 if (!kvm_para_available()) 185 if (!kvm_para_available())
182 return; 186 return;
183 187
184 if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) { 188 if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE2)) {
185 if (kvm_register_clock("boot clock")) 189 msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW;
186 return; 190 msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK_NEW;
187 pv_time_ops.sched_clock = kvm_clock_read; 191 } else if (!(kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)))
188 x86_platform.calibrate_tsc = kvm_get_tsc_khz; 192 return;
189 x86_platform.get_wallclock = kvm_get_wallclock; 193
190 x86_platform.set_wallclock = kvm_set_wallclock; 194 printk(KERN_INFO "kvm-clock: Using msrs %x and %x",
195 msr_kvm_system_time, msr_kvm_wall_clock);
196
197 if (kvm_register_clock("boot clock"))
198 return;
199 pv_time_ops.sched_clock = kvm_clock_read;
200 x86_platform.calibrate_tsc = kvm_get_tsc_khz;
201 x86_platform.get_wallclock = kvm_get_wallclock;
202 x86_platform.set_wallclock = kvm_set_wallclock;
191#ifdef CONFIG_X86_LOCAL_APIC 203#ifdef CONFIG_X86_LOCAL_APIC
192 x86_cpuinit.setup_percpu_clockev = 204 x86_cpuinit.setup_percpu_clockev =
193 kvm_setup_secondary_clock; 205 kvm_setup_secondary_clock;
194#endif 206#endif
195#ifdef CONFIG_SMP 207#ifdef CONFIG_SMP
196 smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; 208 smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
197#endif 209#endif
198 machine_ops.shutdown = kvm_shutdown; 210 machine_ops.shutdown = kvm_shutdown;
199#ifdef CONFIG_KEXEC 211#ifdef CONFIG_KEXEC
200 machine_ops.crash_shutdown = kvm_crash_shutdown; 212 machine_ops.crash_shutdown = kvm_crash_shutdown;
201#endif 213#endif
202 kvm_get_preset_lpj(); 214 kvm_get_preset_lpj();
203 clocksource_register(&kvm_clock); 215 clocksource_register(&kvm_clock);
204 pv_info.paravirt_enabled = 1; 216 pv_info.paravirt_enabled = 1;
205 pv_info.name = "KVM"; 217 pv_info.name = "KVM";
206 } 218
219 if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
220 pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
207} 221}
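The kvmclock hunks above keep the legacy MSR numbers as the default and switch to the new pair only when the host advertises KVM_FEATURE_CLOCKSOURCE2. A condensed sketch of just that detection step (illustrative only; the real kvmclock_init() also registers the clocksource and the platform hooks shown above, and the kvmclock = 0 line here is merely a stand-in for its early return):

/* Sketch: choose the host-supported MSR pair, newest interface first */
static void kvmclock_pick_msrs(void)
{
	if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE2)) {
		msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW;
		msr_kvm_wall_clock  = MSR_KVM_WALL_CLOCK_NEW;
	} else if (!kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) {
		/* neither interface is present: leave kvmclock disabled */
		kvmclock = 0;
	}
	/* otherwise keep the legacy MSR_KVM_SYSTEM_TIME/MSR_KVM_WALL_CLOCK pair */
}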
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index cceb5bc3c3c2..fa6551d36c10 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -201,9 +201,9 @@ static int do_microcode_update(const void __user *buf, size_t size)
201 return error; 201 return error;
202} 202}
203 203
204static int microcode_open(struct inode *unused1, struct file *unused2) 204static int microcode_open(struct inode *inode, struct file *file)
205{ 205{
206 return capable(CAP_SYS_RAWIO) ? 0 : -EPERM; 206 return capable(CAP_SYS_RAWIO) ? nonseekable_open(inode, file) : -EPERM;
207} 207}
208 208
209static ssize_t microcode_write(struct file *file, const char __user *buf, 209static ssize_t microcode_write(struct file *file, const char __user *buf,
@@ -260,6 +260,7 @@ static void microcode_dev_exit(void)
260} 260}
261 261
262MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); 262MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
263MODULE_ALIAS("devname:cpu/microcode");
263#else 264#else
264#define microcode_dev_init() 0 265#define microcode_dev_init() 0
265#define microcode_dev_exit() do { } while (0) 266#define microcode_dev_exit() do { } while (0)
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
index 85a343e28937..356170262a93 100644
--- a/arch/x86/kernel/microcode_intel.c
+++ b/arch/x86/kernel/microcode_intel.c
@@ -343,10 +343,11 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
343 int (*get_ucode_data)(void *, const void *, size_t)) 343 int (*get_ucode_data)(void *, const void *, size_t))
344{ 344{
345 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 345 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
346 u8 *ucode_ptr = data, *new_mc = NULL, *mc; 346 u8 *ucode_ptr = data, *new_mc = NULL, *mc = NULL;
347 int new_rev = uci->cpu_sig.rev; 347 int new_rev = uci->cpu_sig.rev;
348 unsigned int leftover = size; 348 unsigned int leftover = size;
349 enum ucode_state state = UCODE_OK; 349 enum ucode_state state = UCODE_OK;
350 unsigned int curr_mc_size = 0;
350 351
351 while (leftover) { 352 while (leftover) {
352 struct microcode_header_intel mc_header; 353 struct microcode_header_intel mc_header;
@@ -361,9 +362,15 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
361 break; 362 break;
362 } 363 }
363 364
364 mc = vmalloc(mc_size); 365 /* For performance reasons, reuse mc area when possible */
365 if (!mc) 366 if (!mc || mc_size > curr_mc_size) {
366 break; 367 if (mc)
368 vfree(mc);
369 mc = vmalloc(mc_size);
370 if (!mc)
371 break;
372 curr_mc_size = mc_size;
373 }
367 374
368 if (get_ucode_data(mc, ucode_ptr, mc_size) || 375 if (get_ucode_data(mc, ucode_ptr, mc_size) ||
369 microcode_sanity_check(mc) < 0) { 376 microcode_sanity_check(mc) < 0) {
@@ -376,13 +383,16 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
376 vfree(new_mc); 383 vfree(new_mc);
377 new_rev = mc_header.rev; 384 new_rev = mc_header.rev;
378 new_mc = mc; 385 new_mc = mc;
379 } else 386 mc = NULL; /* trigger new vmalloc */
380 vfree(mc); 387 }
381 388
382 ucode_ptr += mc_size; 389 ucode_ptr += mc_size;
383 leftover -= mc_size; 390 leftover -= mc_size;
384 } 391 }
385 392
393 if (mc)
394 vfree(mc);
395
386 if (leftover) { 396 if (leftover) {
387 if (new_mc) 397 if (new_mc)
388 vfree(new_mc); 398 vfree(new_mc);
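The generic_load_microcode() change above is a small buffer-reuse optimisation: one vmalloc() area survives across loop iterations and is reallocated only when a larger record arrives. Stripped of the microcode-specific details, the pattern looks roughly like this (have_more_records() and next_record_size() are placeholders, not real kernel functions):

/* Sketch: grow-only scratch buffer reused across variable-sized records */
void *buf = NULL;
size_t buf_size = 0;

while (have_more_records()) {
	size_t need = next_record_size();

	if (!buf || need > buf_size) {		/* only reallocate when too small */
		if (buf)
			vfree(buf);
		buf = vmalloc(need);
		if (!buf)
			break;
		buf_size = need;
	}
	/* ... copy the next record into buf and validate it ... */
}

if (buf)
	vfree(buf);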
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index e81030f71a8f..d86dbf7e54be 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -115,21 +115,6 @@ static void __init MP_bus_info(struct mpc_bus *m)
115 printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str); 115 printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
116} 116}
117 117
118static int bad_ioapic(unsigned long address)
119{
120 if (nr_ioapics >= MAX_IO_APICS) {
121 printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
122 "(found %d)\n", MAX_IO_APICS, nr_ioapics);
123 panic("Recompile kernel with bigger MAX_IO_APICS!\n");
124 }
125 if (!address) {
126 printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
127 " found in table, skipping!\n");
128 return 1;
129 }
130 return 0;
131}
132
133static void __init MP_ioapic_info(struct mpc_ioapic *m) 118static void __init MP_ioapic_info(struct mpc_ioapic *m)
134{ 119{
135 if (!(m->flags & MPC_APIC_USABLE)) 120 if (!(m->flags & MPC_APIC_USABLE))
@@ -138,15 +123,7 @@ static void __init MP_ioapic_info(struct mpc_ioapic *m)
138 printk(KERN_INFO "I/O APIC #%d Version %d at 0x%X.\n", 123 printk(KERN_INFO "I/O APIC #%d Version %d at 0x%X.\n",
139 m->apicid, m->apicver, m->apicaddr); 124 m->apicid, m->apicver, m->apicaddr);
140 125
141 if (bad_ioapic(m->apicaddr)) 126 mp_register_ioapic(m->apicid, m->apicaddr, gsi_top);
142 return;
143
144 mp_ioapics[nr_ioapics].apicaddr = m->apicaddr;
145 mp_ioapics[nr_ioapics].apicid = m->apicid;
146 mp_ioapics[nr_ioapics].type = m->type;
147 mp_ioapics[nr_ioapics].apicver = m->apicver;
148 mp_ioapics[nr_ioapics].flags = m->flags;
149 nr_ioapics++;
150} 127}
151 128
152static void print_MP_intsrc_info(struct mpc_intsrc *m) 129static void print_MP_intsrc_info(struct mpc_intsrc *m)
diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c
index 0aad8670858e..79ae68154e87 100644
--- a/arch/x86/kernel/mrst.c
+++ b/arch/x86/kernel/mrst.c
@@ -25,8 +25,34 @@
25#include <asm/i8259.h> 25#include <asm/i8259.h>
26#include <asm/apb_timer.h> 26#include <asm/apb_timer.h>
27 27
28/*
 29 * The clockevent devices on Moorestown/Medfield can be the APBT or the LAPIC
 30 * clock; the cmdline option x86_mrst_timer can be used to override the
 31 * configuration and prefer one or the other.
 32 * At runtime, there are basically three timer configurations:
 33 * 1. per-cpu APBT clock only
 34 * 2. per-cpu always-on LAPIC clocks only (Penwell/Medfield only)
 35 * 3. per-cpu LAPIC clock (C3STOP) and one APBT clock, with broadcast.
 36 *
 37 * By default (without a cmdline option), platform code first detects the cpu
 38 * type to see whether we are on Lincroft or Penwell, then sets up the LAPIC
 39 * and/or APBT clocks accordingly.
 40 * I.e. by default, Medfield uses configuration #2 and Moorestown uses #1.
 41 * Config #3 is supported but not recommended on Medfield.
42 *
43 * rating and feature summary:
44 * lapic (with C3STOP) --------- 100
45 * apbt (always-on) ------------ 110
 46 * lapic (always-on, ARAT) ----- 150
47 */
48
49__cpuinitdata enum mrst_timer_options mrst_timer_options;
50
28static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM]; 51static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM];
29static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM]; 52static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM];
53enum mrst_cpu_type __mrst_cpu_chip;
54EXPORT_SYMBOL_GPL(__mrst_cpu_chip);
55
30int sfi_mtimer_num; 56int sfi_mtimer_num;
31 57
32struct sfi_rtc_table_entry sfi_mrtc_array[SFI_MRTC_MAX]; 58struct sfi_rtc_table_entry sfi_mrtc_array[SFI_MRTC_MAX];
@@ -167,18 +193,6 @@ int __init sfi_parse_mrtc(struct sfi_table_header *table)
167 return 0; 193 return 0;
168} 194}
169 195
170/*
171 * the secondary clock in Moorestown can be APBT or LAPIC clock, default to
172 * APBT but cmdline option can also override it.
173 */
174static void __cpuinit mrst_setup_secondary_clock(void)
175{
176 /* restore default lapic clock if disabled by cmdline */
177 if (disable_apbt_percpu)
178 return setup_secondary_APIC_clock();
179 apbt_setup_secondary_clock();
180}
181
182static unsigned long __init mrst_calibrate_tsc(void) 196static unsigned long __init mrst_calibrate_tsc(void)
183{ 197{
184 unsigned long flags, fast_calibrate; 198 unsigned long flags, fast_calibrate;
@@ -195,6 +209,21 @@ static unsigned long __init mrst_calibrate_tsc(void)
195 209
196void __init mrst_time_init(void) 210void __init mrst_time_init(void)
197{ 211{
212 switch (mrst_timer_options) {
213 case MRST_TIMER_APBT_ONLY:
214 break;
215 case MRST_TIMER_LAPIC_APBT:
216 x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock;
217 x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock;
218 break;
219 default:
220 if (!boot_cpu_has(X86_FEATURE_ARAT))
221 break;
222 x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock;
223 x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock;
224 return;
225 }
226 /* we need at least one APB timer */
198 sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr); 227 sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr);
199 pre_init_apic_IRQ0(); 228 pre_init_apic_IRQ0();
200 apbt_time_init(); 229 apbt_time_init();
@@ -205,16 +234,27 @@ void __init mrst_rtc_init(void)
205 sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc); 234 sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc);
206} 235}
207 236
208/* 237void __cpuinit mrst_arch_setup(void)
209 * if we use per cpu apb timer, the bootclock already setup. if we use lapic
210 * timer and one apbt timer for broadcast, we need to set up lapic boot clock.
211 */
212static void __init mrst_setup_boot_clock(void)
213{ 238{
214 pr_info("%s: per cpu apbt flag %d \n", __func__, disable_apbt_percpu); 239 if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x27)
215 if (disable_apbt_percpu) 240 __mrst_cpu_chip = MRST_CPU_CHIP_PENWELL;
216 setup_boot_APIC_clock(); 241 else if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x26)
217}; 242 __mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT;
243 else {
244 pr_err("Unknown Moorestown CPU (%d:%d), default to Lincroft\n",
245 boot_cpu_data.x86, boot_cpu_data.x86_model);
246 __mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT;
247 }
248 pr_debug("Moorestown CPU %s identified\n",
249 (__mrst_cpu_chip == MRST_CPU_CHIP_LINCROFT) ?
250 "Lincroft" : "Penwell");
251}
252
253/* MID systems don't have i8042 controller */
254static int mrst_i8042_detect(void)
255{
256 return 0;
257}
218 258
219/* 259/*
220 * Moorestown specific x86_init function overrides and early setup 260 * Moorestown specific x86_init function overrides and early setup
@@ -226,15 +266,46 @@ void __init x86_mrst_early_setup(void)
226 x86_init.resources.reserve_resources = x86_init_noop; 266 x86_init.resources.reserve_resources = x86_init_noop;
227 267
228 x86_init.timers.timer_init = mrst_time_init; 268 x86_init.timers.timer_init = mrst_time_init;
229 x86_init.timers.setup_percpu_clockev = mrst_setup_boot_clock; 269 x86_init.timers.setup_percpu_clockev = x86_init_noop;
230 270
231 x86_init.irqs.pre_vector_init = x86_init_noop; 271 x86_init.irqs.pre_vector_init = x86_init_noop;
232 272
233 x86_cpuinit.setup_percpu_clockev = mrst_setup_secondary_clock; 273 x86_init.oem.arch_setup = mrst_arch_setup;
274
275 x86_cpuinit.setup_percpu_clockev = apbt_setup_secondary_clock;
234 276
235 x86_platform.calibrate_tsc = mrst_calibrate_tsc; 277 x86_platform.calibrate_tsc = mrst_calibrate_tsc;
278 x86_platform.i8042_detect = mrst_i8042_detect;
236 x86_init.pci.init = pci_mrst_init; 279 x86_init.pci.init = pci_mrst_init;
237 x86_init.pci.fixup_irqs = x86_init_noop; 280 x86_init.pci.fixup_irqs = x86_init_noop;
238 281
239 legacy_pic = &null_legacy_pic; 282 legacy_pic = &null_legacy_pic;
283
284 /* Avoid searching for BIOS MP tables */
285 x86_init.mpparse.find_smp_config = x86_init_noop;
286 x86_init.mpparse.get_smp_config = x86_init_uint_noop;
287
288}
289
290/*
291 * if user does not want to use per CPU apb timer, just give it a lower rating
292 * than local apic timer and skip the late per cpu timer init.
293 */
294static inline int __init setup_x86_mrst_timer(char *arg)
295{
296 if (!arg)
297 return -EINVAL;
298
299 if (strcmp("apbt_only", arg) == 0)
300 mrst_timer_options = MRST_TIMER_APBT_ONLY;
301 else if (strcmp("lapic_and_apbt", arg) == 0)
302 mrst_timer_options = MRST_TIMER_LAPIC_APBT;
303 else {
304 pr_warning("X86 MRST timer option %s not recognised"
305 " use x86_mrst_timer=apbt_only or lapic_and_apbt\n",
306 arg);
307 return -EINVAL;
308 }
309 return 0;
240} 310}
311__setup("x86_mrst_timer=", setup_x86_mrst_timer);
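The timer-configuration comment and setup_x86_mrst_timer() above boil down to a boot-time switch on the kernel command line, for example:

    x86_mrst_timer=apbt_only         (configuration #1: per-cpu APB timers only)
    x86_mrst_timer=lapic_and_apbt    (configuration #3: LAPIC clockevents plus one APB timer for broadcast)

With no option given, mrst_time_init() prefers the always-on LAPIC (ARAT) clock when the CPU advertises it and otherwise falls back to the APB timer path.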
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 4d4468e9f47c..7bf2dc4c8f70 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -230,7 +230,7 @@ static int __cpuinit msr_class_cpu_callback(struct notifier_block *nfb,
230 msr_device_destroy(cpu); 230 msr_device_destroy(cpu);
231 break; 231 break;
232 } 232 }
233 return err ? NOTIFY_BAD : NOTIFY_OK; 233 return notifier_from_errno(err);
234} 234}
235 235
236static struct notifier_block __refdata msr_class_cpu_notifier = { 236static struct notifier_block __refdata msr_class_cpu_notifier = {
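The msr.c hunk swaps the open-coded err ? NOTIFY_BAD : NOTIFY_OK for notifier_from_errno(), which keeps the errno value inside the notifier return code instead of discarding it. Behaviourally it is roughly the following (a sketch, not necessarily the exact kernel implementation):

/* Sketch: map 0 to NOTIFY_OK and a negative errno to a "stop" value that
 * still encodes the error, so notifier_to_errno() can recover it later. */
static int errno_to_notify(int err)
{
	if (err)
		return NOTIFY_STOP_MASK | (NOTIFY_BAD - err);

	return NOTIFY_OK;
}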
diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c
index 8297160c41b3..0e0cdde519be 100644
--- a/arch/x86/kernel/olpc.c
+++ b/arch/x86/kernel/olpc.c
@@ -21,10 +21,7 @@
21#include <asm/geode.h> 21#include <asm/geode.h>
22#include <asm/setup.h> 22#include <asm/setup.h>
23#include <asm/olpc.h> 23#include <asm/olpc.h>
24 24#include <asm/olpc_ofw.h>
25#ifdef CONFIG_OPEN_FIRMWARE
26#include <asm/ofw.h>
27#endif
28 25
29struct olpc_platform_t olpc_platform_info; 26struct olpc_platform_t olpc_platform_info;
30EXPORT_SYMBOL_GPL(olpc_platform_info); 27EXPORT_SYMBOL_GPL(olpc_platform_info);
@@ -145,7 +142,7 @@ restart:
145 * The OBF flag will sometimes misbehave due to what we believe 142 * The OBF flag will sometimes misbehave due to what we believe
146 * is a hardware quirk.. 143 * is a hardware quirk..
147 */ 144 */
148 printk(KERN_DEBUG "olpc-ec: running cmd 0x%x\n", cmd); 145 pr_devel("olpc-ec: running cmd 0x%x\n", cmd);
149 outb(cmd, 0x6c); 146 outb(cmd, 0x6c);
150 147
151 if (wait_on_ibf(0x6c, 0)) { 148 if (wait_on_ibf(0x6c, 0)) {
@@ -162,8 +159,7 @@ restart:
162 " EC accept data!\n"); 159 " EC accept data!\n");
163 goto err; 160 goto err;
164 } 161 }
165 printk(KERN_DEBUG "olpc-ec: sending cmd arg 0x%x\n", 162 pr_devel("olpc-ec: sending cmd arg 0x%x\n", inbuf[i]);
166 inbuf[i]);
167 outb(inbuf[i], 0x68); 163 outb(inbuf[i], 0x68);
168 } 164 }
169 } 165 }
@@ -176,8 +172,7 @@ restart:
176 goto restart; 172 goto restart;
177 } 173 }
178 outbuf[i] = inb(0x68); 174 outbuf[i] = inb(0x68);
179 printk(KERN_DEBUG "olpc-ec: received 0x%x\n", 175 pr_devel("olpc-ec: received 0x%x\n", outbuf[i]);
180 outbuf[i]);
181 } 176 }
182 } 177 }
183 178
@@ -188,14 +183,15 @@ err:
188} 183}
189EXPORT_SYMBOL_GPL(olpc_ec_cmd); 184EXPORT_SYMBOL_GPL(olpc_ec_cmd);
190 185
191#ifdef CONFIG_OPEN_FIRMWARE 186#ifdef CONFIG_OLPC_OPENFIRMWARE
192static void __init platform_detect(void) 187static void __init platform_detect(void)
193{ 188{
194 size_t propsize; 189 size_t propsize;
195 __be32 rev; 190 __be32 rev;
191 const void *args[] = { NULL, "board-revision-int", &rev, (void *)4 };
192 void *res[] = { &propsize };
196 193
197 if (ofw("getprop", 4, 1, NULL, "board-revision-int", &rev, 4, 194 if (olpc_ofw("getprop", args, res) || propsize != 4) {
198 &propsize) || propsize != 4) {
199 printk(KERN_ERR "ofw: getprop call failed!\n"); 195 printk(KERN_ERR "ofw: getprop call failed!\n");
200 rev = cpu_to_be32(0); 196 rev = cpu_to_be32(0);
201 } 197 }
diff --git a/arch/x86/kernel/olpc_ofw.c b/arch/x86/kernel/olpc_ofw.c
new file mode 100644
index 000000000000..3218aa71ab5e
--- /dev/null
+++ b/arch/x86/kernel/olpc_ofw.c
@@ -0,0 +1,106 @@
1#include <linux/kernel.h>
2#include <linux/module.h>
3#include <linux/init.h>
4#include <asm/page.h>
5#include <asm/setup.h>
6#include <asm/io.h>
7#include <asm/pgtable.h>
8#include <asm/olpc_ofw.h>
9
10/* address of OFW callback interface; will be NULL if OFW isn't found */
11static int (*olpc_ofw_cif)(int *);
12
13/* page dir entry containing OFW's pgdir table; filled in by head_32.S */
14u32 olpc_ofw_pgd __initdata;
15
16static DEFINE_SPINLOCK(ofw_lock);
17
18#define MAXARGS 10
19
20void __init setup_olpc_ofw_pgd(void)
21{
22 pgd_t *base, *ofw_pde;
23
24 if (!olpc_ofw_cif)
25 return;
26
27 /* fetch OFW's PDE */
28 base = early_ioremap(olpc_ofw_pgd, sizeof(olpc_ofw_pgd) * PTRS_PER_PGD);
29 if (!base) {
30 printk(KERN_ERR "failed to remap OFW's pgd - disabling OFW!\n");
31 olpc_ofw_cif = NULL;
32 return;
33 }
34 ofw_pde = &base[OLPC_OFW_PDE_NR];
35
36 /* install OFW's PDE permanently into the kernel's pgtable */
37 set_pgd(&swapper_pg_dir[OLPC_OFW_PDE_NR], *ofw_pde);
 38 	/* implicit optimization barrier here due to the non-inlined function return */
39
40 early_iounmap(base, sizeof(olpc_ofw_pgd) * PTRS_PER_PGD);
41}
42
43int __olpc_ofw(const char *name, int nr_args, const void **args, int nr_res,
44 void **res)
45{
46 int ofw_args[MAXARGS + 3];
47 unsigned long flags;
48 int ret, i, *p;
49
50 BUG_ON(nr_args + nr_res > MAXARGS);
51
52 if (!olpc_ofw_cif)
53 return -EIO;
54
55 ofw_args[0] = (int)name;
56 ofw_args[1] = nr_args;
57 ofw_args[2] = nr_res;
58
59 p = &ofw_args[3];
60 for (i = 0; i < nr_args; i++, p++)
61 *p = (int)args[i];
62
63 /* call into ofw */
64 spin_lock_irqsave(&ofw_lock, flags);
65 ret = olpc_ofw_cif(ofw_args);
66 spin_unlock_irqrestore(&ofw_lock, flags);
67
68 if (!ret) {
69 for (i = 0; i < nr_res; i++, p++)
70 *((int *)res[i]) = *p;
71 }
72
73 return ret;
74}
75EXPORT_SYMBOL_GPL(__olpc_ofw);
76
77/* OFW cif _should_ be above this address */
78#define OFW_MIN 0xff000000
79
80/* OFW starts on a 1MB boundary */
81#define OFW_BOUND (1<<20)
82
83void __init olpc_ofw_detect(void)
84{
85 struct olpc_ofw_header *hdr = &boot_params.olpc_ofw_header;
86 unsigned long start;
87
88 /* ensure OFW booted us by checking for "OFW " string */
89 if (hdr->ofw_magic != OLPC_OFW_SIG)
90 return;
91
92 olpc_ofw_cif = (int (*)(int *))hdr->cif_handler;
93
94 if ((unsigned long)olpc_ofw_cif < OFW_MIN) {
95 printk(KERN_ERR "OFW detected, but cif has invalid address 0x%lx - disabling.\n",
96 (unsigned long)olpc_ofw_cif);
97 olpc_ofw_cif = NULL;
98 return;
99 }
100
101 /* determine where OFW starts in memory */
102 start = round_down((unsigned long)olpc_ofw_cif, OFW_BOUND);
103 printk(KERN_INFO "OFW detected in memory, cif @ 0x%lx (reserving top %ldMB)\n",
104 (unsigned long)olpc_ofw_cif, (-start) >> 20);
105 reserve_top_address(-start);
106}
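
For orientation, the calling convention above packs everything into one int array: slot 0 holds the service name, slots 1 and 2 the argument and result counts, the arguments follow, and the results are read back from the slots just past the arguments once the CIF returns. A minimal caller sketch (illustrative only; it assumes the olpc_ofw() wrapper used in olpc.c simply passes the array sizes through to __olpc_ofw(), and the helper name is made up):

	/* Fetch a 4-byte integer property the same way platform_detect() does. */
	static int __init example_read_board_rev(__be32 *rev)
	{
		size_t propsize;
		const void *args[] = { NULL, "board-revision-int", rev, (void *)4 };
		void *res[] = { &propsize };

		/* 4 arguments in, 1 result out; olpc_ofw("getprop", args, res)
		 * in olpc.c is assumed to expand to exactly this call. */
		if (__olpc_ofw("getprop", 4, args, 1, res) || propsize != 4)
			return -EIO;

		return 0;
	}
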
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index fb99f7edb341..078d4ec1a9d9 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -103,11 +103,16 @@ int use_calgary __read_mostly = 0;
103#define PMR_SOFTSTOPFAULT 0x40000000 103#define PMR_SOFTSTOPFAULT 0x40000000
104#define PMR_HARDSTOP 0x20000000 104#define PMR_HARDSTOP 0x20000000
105 105
106#define MAX_NUM_OF_PHBS 8 /* how many PHBs in total? */ 106/*
107#define MAX_NUM_CHASSIS 8 /* max number of chassis */ 107 * The maximum PHB bus number.
108/* MAX_PHB_BUS_NUM is the maximal possible dev->bus->number */ 108 * x3950M2 (rare): 8 chassis, 48 PHBs per chassis = 384
109#define MAX_PHB_BUS_NUM (MAX_NUM_OF_PHBS * MAX_NUM_CHASSIS * 2) 109 * x3950M2: 4 chassis, 48 PHBs per chassis = 192
110#define PHBS_PER_CALGARY 4 110 * x3950 (PCIE): 8 chassis, 32 PHBs per chassis = 256
111 * x3950 (PCIX): 8 chassis, 16 PHBs per chassis = 128
112 */
113#define MAX_PHB_BUS_NUM 256
114
115#define PHBS_PER_CALGARY 4
111 116
112/* register offsets in Calgary's internal register space */ 117/* register offsets in Calgary's internal register space */
113static const unsigned long tar_offsets[] = { 118static const unsigned long tar_offsets[] = {
@@ -1051,8 +1056,6 @@ static int __init calgary_init_one(struct pci_dev *dev)
1051 struct iommu_table *tbl; 1056 struct iommu_table *tbl;
1052 int ret; 1057 int ret;
1053 1058
1054 BUG_ON(dev->bus->number >= MAX_PHB_BUS_NUM);
1055
1056 bbar = busno_to_bbar(dev->bus->number); 1059 bbar = busno_to_bbar(dev->bus->number);
1057 ret = calgary_setup_tar(dev, bbar); 1060 ret = calgary_setup_tar(dev, bbar);
1058 if (ret) 1061 if (ret)
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index 7d2829dde20e..a5bc528d4328 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -31,8 +31,6 @@ static struct dma_map_ops swiotlb_dma_ops = {
31 .free_coherent = swiotlb_free_coherent, 31 .free_coherent = swiotlb_free_coherent,
32 .sync_single_for_cpu = swiotlb_sync_single_for_cpu, 32 .sync_single_for_cpu = swiotlb_sync_single_for_cpu,
33 .sync_single_for_device = swiotlb_sync_single_for_device, 33 .sync_single_for_device = swiotlb_sync_single_for_device,
34 .sync_single_range_for_cpu = swiotlb_sync_single_range_for_cpu,
35 .sync_single_range_for_device = swiotlb_sync_single_range_for_device,
36 .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu, 34 .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu,
37 .sync_sg_for_device = swiotlb_sync_sg_for_device, 35 .sync_sg_for_device = swiotlb_sync_sg_for_device,
38 .map_sg = swiotlb_map_sg_attrs, 36 .map_sg = swiotlb_map_sg_attrs,
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 0415c3ef91b5..d401f1d2d06e 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -20,7 +20,6 @@
20#include <asm/idle.h> 20#include <asm/idle.h>
21#include <asm/uaccess.h> 21#include <asm/uaccess.h>
22#include <asm/i387.h> 22#include <asm/i387.h>
23#include <asm/ds.h>
24#include <asm/debugreg.h> 23#include <asm/debugreg.h>
25 24
26unsigned long idle_halt; 25unsigned long idle_halt;
@@ -29,29 +28,26 @@ unsigned long idle_nomwait;
29EXPORT_SYMBOL(idle_nomwait); 28EXPORT_SYMBOL(idle_nomwait);
30 29
31struct kmem_cache *task_xstate_cachep; 30struct kmem_cache *task_xstate_cachep;
31EXPORT_SYMBOL_GPL(task_xstate_cachep);
32 32
33int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) 33int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
34{ 34{
35 int ret;
36
35 *dst = *src; 37 *dst = *src;
36 if (src->thread.xstate) { 38 if (fpu_allocated(&src->thread.fpu)) {
37 dst->thread.xstate = kmem_cache_alloc(task_xstate_cachep, 39 memset(&dst->thread.fpu, 0, sizeof(dst->thread.fpu));
38 GFP_KERNEL); 40 ret = fpu_alloc(&dst->thread.fpu);
39 if (!dst->thread.xstate) 41 if (ret)
40 return -ENOMEM; 42 return ret;
41 WARN_ON((unsigned long)dst->thread.xstate & 15); 43 fpu_copy(&dst->thread.fpu, &src->thread.fpu);
42 memcpy(dst->thread.xstate, src->thread.xstate, xstate_size);
43 } 44 }
44 return 0; 45 return 0;
45} 46}
46 47
47void free_thread_xstate(struct task_struct *tsk) 48void free_thread_xstate(struct task_struct *tsk)
48{ 49{
49 if (tsk->thread.xstate) { 50 fpu_free(&tsk->thread.fpu);
50 kmem_cache_free(task_xstate_cachep, tsk->thread.xstate);
51 tsk->thread.xstate = NULL;
52 }
53
54 WARN(tsk->thread.ds_ctx, "leaking DS context\n");
55} 51}
56 52
57void free_thread_info(struct thread_info *ti) 53void free_thread_info(struct thread_info *ti)
@@ -198,11 +194,16 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
198 prev = &prev_p->thread; 194 prev = &prev_p->thread;
199 next = &next_p->thread; 195 next = &next_p->thread;
200 196
201 if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) || 197 if (test_tsk_thread_flag(prev_p, TIF_BLOCKSTEP) ^
202 test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR)) 198 test_tsk_thread_flag(next_p, TIF_BLOCKSTEP)) {
203 ds_switch_to(prev_p, next_p); 199 unsigned long debugctl = get_debugctlmsr();
204 else if (next->debugctlmsr != prev->debugctlmsr) 200
205 update_debugctlmsr(next->debugctlmsr); 201 debugctl &= ~DEBUGCTLMSR_BTF;
202 if (test_tsk_thread_flag(next_p, TIF_BLOCKSTEP))
203 debugctl |= DEBUGCTLMSR_BTF;
204
205 update_debugctlmsr(debugctl);
206 }
206 207
207 if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^ 208 if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
208 test_tsk_thread_flag(next_p, TIF_NOTSC)) { 209 test_tsk_thread_flag(next_p, TIF_NOTSC)) {
@@ -371,7 +372,7 @@ static inline int hlt_use_halt(void)
371void default_idle(void) 372void default_idle(void)
372{ 373{
373 if (hlt_use_halt()) { 374 if (hlt_use_halt()) {
374 trace_power_start(POWER_CSTATE, 1); 375 trace_power_start(POWER_CSTATE, 1, smp_processor_id());
375 current_thread_info()->status &= ~TS_POLLING; 376 current_thread_info()->status &= ~TS_POLLING;
376 /* 377 /*
377 * TS_POLLING-cleared state must be visible before we 378 * TS_POLLING-cleared state must be visible before we
@@ -441,7 +442,7 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
441 */ 442 */
442void mwait_idle_with_hints(unsigned long ax, unsigned long cx) 443void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
443{ 444{
444 trace_power_start(POWER_CSTATE, (ax>>4)+1); 445 trace_power_start(POWER_CSTATE, (ax>>4)+1, smp_processor_id());
445 if (!need_resched()) { 446 if (!need_resched()) {
446 if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) 447 if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))
447 clflush((void *)&current_thread_info()->flags); 448 clflush((void *)&current_thread_info()->flags);
@@ -457,7 +458,7 @@ void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
457static void mwait_idle(void) 458static void mwait_idle(void)
458{ 459{
459 if (!need_resched()) { 460 if (!need_resched()) {
460 trace_power_start(POWER_CSTATE, 1); 461 trace_power_start(POWER_CSTATE, 1, smp_processor_id());
461 if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) 462 if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))
462 clflush((void *)&current_thread_info()->flags); 463 clflush((void *)&current_thread_info()->flags);
463 464
@@ -478,7 +479,7 @@ static void mwait_idle(void)
478 */ 479 */
479static void poll_idle(void) 480static void poll_idle(void)
480{ 481{
481 trace_power_start(POWER_CSTATE, 0); 482 trace_power_start(POWER_CSTATE, 0, smp_processor_id());
482 local_irq_enable(); 483 local_irq_enable();
483 while (!need_resched()) 484 while (!need_resched())
484 cpu_relax(); 485 cpu_relax();
@@ -525,44 +526,10 @@ static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
525 return (edx & MWAIT_EDX_C1); 526 return (edx & MWAIT_EDX_C1);
526} 527}
527 528
528/* 529bool c1e_detected;
529 * Check for AMD CPUs, where APIC timer interrupt does not wake up CPU from C1e. 530EXPORT_SYMBOL(c1e_detected);
530 * For more information see
531 * - Erratum #400 for NPT family 0xf and family 0x10 CPUs
532 * - Erratum #365 for family 0x11 (not affected because C1e not in use)
533 */
534static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
535{
536 u64 val;
537 if (c->x86_vendor != X86_VENDOR_AMD)
538 goto no_c1e_idle;
539
540 /* Family 0x0f models < rev F do not have C1E */
541 if (c->x86 == 0x0F && c->x86_model >= 0x40)
542 return 1;
543
544 if (c->x86 == 0x10) {
545 /*
546 * check OSVW bit for CPUs that are not affected
547 * by erratum #400
548 */
549 if (cpu_has(c, X86_FEATURE_OSVW)) {
550 rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, val);
551 if (val >= 2) {
552 rdmsrl(MSR_AMD64_OSVW_STATUS, val);
553 if (!(val & BIT(1)))
554 goto no_c1e_idle;
555 }
556 }
557 return 1;
558 }
559
560no_c1e_idle:
561 return 0;
562}
563 531
564static cpumask_var_t c1e_mask; 532static cpumask_var_t c1e_mask;
565static int c1e_detected;
566 533
567void c1e_remove_cpu(int cpu) 534void c1e_remove_cpu(int cpu)
568{ 535{
@@ -584,12 +551,12 @@ static void c1e_idle(void)
584 u32 lo, hi; 551 u32 lo, hi;
585 552
586 rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); 553 rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
554
587 if (lo & K8_INTP_C1E_ACTIVE_MASK) { 555 if (lo & K8_INTP_C1E_ACTIVE_MASK) {
588 c1e_detected = 1; 556 c1e_detected = true;
589 if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) 557 if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
590 mark_tsc_unstable("TSC halt in AMD C1E"); 558 mark_tsc_unstable("TSC halt in AMD C1E");
591 printk(KERN_INFO "System has AMD C1E enabled\n"); 559 printk(KERN_INFO "System has AMD C1E enabled\n");
592 set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E);
593 } 560 }
594 } 561 }
595 562
@@ -638,7 +605,8 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
638 */ 605 */
639 printk(KERN_INFO "using mwait in idle threads.\n"); 606 printk(KERN_INFO "using mwait in idle threads.\n");
640 pm_idle = mwait_idle; 607 pm_idle = mwait_idle;
641 } else if (check_c1e_idle(c)) { 608 } else if (cpu_has_amd_erratum(amd_erratum_400)) {
609 /* E400: APIC timer interrupt does not wake up CPU from C1e */
642 printk(KERN_INFO "using C1E aware idle routine\n"); 610 printk(KERN_INFO "using C1E aware idle routine\n");
643 pm_idle = c1e_idle; 611 pm_idle = c1e_idle;
644 } else 612 } else
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index f6c62667e30c..96586c3cbbbf 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -55,9 +55,10 @@
55#include <asm/cpu.h> 55#include <asm/cpu.h>
56#include <asm/idle.h> 56#include <asm/idle.h>
57#include <asm/syscalls.h> 57#include <asm/syscalls.h>
58#include <asm/ds.h>
59#include <asm/debugreg.h> 58#include <asm/debugreg.h>
60 59
60#include <trace/events/power.h>
61
61asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); 62asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
62 63
63/* 64/*
@@ -112,6 +113,8 @@ void cpu_idle(void)
112 stop_critical_timings(); 113 stop_critical_timings();
113 pm_idle(); 114 pm_idle();
114 start_critical_timings(); 115 start_critical_timings();
116
117 trace_power_end(smp_processor_id());
115 } 118 }
116 tick_nohz_restart_sched_tick(); 119 tick_nohz_restart_sched_tick();
117 preempt_enable_no_resched(); 120 preempt_enable_no_resched();
@@ -238,13 +241,6 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
238 kfree(p->thread.io_bitmap_ptr); 241 kfree(p->thread.io_bitmap_ptr);
239 p->thread.io_bitmap_max = 0; 242 p->thread.io_bitmap_max = 0;
240 } 243 }
241
242 clear_tsk_thread_flag(p, TIF_DS_AREA_MSR);
243 p->thread.ds_ctx = NULL;
244
245 clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
246 p->thread.debugctlmsr = 0;
247
248 return err; 244 return err;
249} 245}
250 246
@@ -317,7 +313,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
317 313
318 /* we're going to use this soon, after a few expensive things */ 314 /* we're going to use this soon, after a few expensive things */
319 if (preload_fpu) 315 if (preload_fpu)
320 prefetch(next->xstate); 316 prefetch(next->fpu.state);
321 317
322 /* 318 /*
323 * Reload esp0. 319 * Reload esp0.
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 17cb3295cbf7..3d9ea531ddd1 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -49,9 +49,10 @@
49#include <asm/ia32.h> 49#include <asm/ia32.h>
50#include <asm/idle.h> 50#include <asm/idle.h>
51#include <asm/syscalls.h> 51#include <asm/syscalls.h>
52#include <asm/ds.h>
53#include <asm/debugreg.h> 52#include <asm/debugreg.h>
54 53
54#include <trace/events/power.h>
55
55asmlinkage extern void ret_from_fork(void); 56asmlinkage extern void ret_from_fork(void);
56 57
57DEFINE_PER_CPU(unsigned long, old_rsp); 58DEFINE_PER_CPU(unsigned long, old_rsp);
@@ -139,6 +140,9 @@ void cpu_idle(void)
139 stop_critical_timings(); 140 stop_critical_timings();
140 pm_idle(); 141 pm_idle();
141 start_critical_timings(); 142 start_critical_timings();
143
144 trace_power_end(smp_processor_id());
145
142 /* In many cases the interrupt that ended idle 146 /* In many cases the interrupt that ended idle
143 has already called exit_idle. But some idle 147 has already called exit_idle. But some idle
144 loops can be woken up without interrupt. */ 148 loops can be woken up without interrupt. */
@@ -313,13 +317,6 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
313 if (err) 317 if (err)
314 goto out; 318 goto out;
315 } 319 }
316
317 clear_tsk_thread_flag(p, TIF_DS_AREA_MSR);
318 p->thread.ds_ctx = NULL;
319
320 clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
321 p->thread.debugctlmsr = 0;
322
323 err = 0; 320 err = 0;
324out: 321out:
325 if (err && p->thread.io_bitmap_ptr) { 322 if (err && p->thread.io_bitmap_ptr) {
@@ -396,7 +393,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
396 393
397 /* we're going to use this soon, after a few expensive things */ 394 /* we're going to use this soon, after a few expensive things */
398 if (preload_fpu) 395 if (preload_fpu)
399 prefetch(next->xstate); 396 prefetch(next->fpu.state);
400 397
401 /* 398 /*
402 * Reload esp0, LDT and the page table pointer: 399 * Reload esp0, LDT and the page table pointer:
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 2e9b55027b7e..70c4872cd8aa 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -2,9 +2,6 @@
2/* 2/*
3 * Pentium III FXSR, SSE support 3 * Pentium III FXSR, SSE support
4 * Gareth Hughes <gareth@valinux.com>, May 2000 4 * Gareth Hughes <gareth@valinux.com>, May 2000
5 *
6 * BTS tracing
7 * Markus Metzger <markus.t.metzger@intel.com>, Dec 2007
8 */ 5 */
9 6
10#include <linux/kernel.h> 7#include <linux/kernel.h>
@@ -22,7 +19,6 @@
22#include <linux/audit.h> 19#include <linux/audit.h>
23#include <linux/seccomp.h> 20#include <linux/seccomp.h>
24#include <linux/signal.h> 21#include <linux/signal.h>
25#include <linux/workqueue.h>
26#include <linux/perf_event.h> 22#include <linux/perf_event.h>
27#include <linux/hw_breakpoint.h> 23#include <linux/hw_breakpoint.h>
28 24
@@ -36,7 +32,6 @@
36#include <asm/desc.h> 32#include <asm/desc.h>
37#include <asm/prctl.h> 33#include <asm/prctl.h>
38#include <asm/proto.h> 34#include <asm/proto.h>
39#include <asm/ds.h>
40#include <asm/hw_breakpoint.h> 35#include <asm/hw_breakpoint.h>
41 36
42#include "tls.h" 37#include "tls.h"
@@ -693,7 +688,7 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
693 struct perf_event_attr attr; 688 struct perf_event_attr attr;
694 689
695 if (!t->ptrace_bps[nr]) { 690 if (!t->ptrace_bps[nr]) {
696 hw_breakpoint_init(&attr); 691 ptrace_breakpoint_init(&attr);
697 /* 692 /*
698 * Put stub len and type to register (reserve) an inactive but 693 * Put stub len and type to register (reserve) an inactive but
699 * correct bp 694 * correct bp
@@ -789,342 +784,6 @@ static int ioperm_get(struct task_struct *target,
789 0, IO_BITMAP_BYTES); 784 0, IO_BITMAP_BYTES);
790} 785}
791 786
792#ifdef CONFIG_X86_PTRACE_BTS
793/*
794 * A branch trace store context.
795 *
796 * Contexts may only be installed by ptrace_bts_config() and only for
797 * ptraced tasks.
798 *
799 * Contexts are destroyed when the tracee is detached from the tracer.
800 * The actual destruction work requires interrupts enabled, so the
801 * work is deferred and will be scheduled during __ptrace_unlink().
802 *
803 * Contexts hold an additional task_struct reference on the traced
804 * task, as well as a reference on the tracer's mm.
805 *
806 * Ptrace already holds a task_struct for the duration of ptrace operations,
807 * but since destruction is deferred, it may be executed after both
808 * tracer and tracee exited.
809 */
810struct bts_context {
811 /* The branch trace handle. */
812 struct bts_tracer *tracer;
813
814 /* The buffer used to store the branch trace and its size. */
815 void *buffer;
816 unsigned int size;
817
818 /* The mm that paid for the above buffer. */
819 struct mm_struct *mm;
820
821 /* The task this context belongs to. */
822 struct task_struct *task;
823
824 /* The signal to send on a bts buffer overflow. */
825 unsigned int bts_ovfl_signal;
826
827 /* The work struct to destroy a context. */
828 struct work_struct work;
829};
830
831static int alloc_bts_buffer(struct bts_context *context, unsigned int size)
832{
833 void *buffer = NULL;
834 int err = -ENOMEM;
835
836 err = account_locked_memory(current->mm, current->signal->rlim, size);
837 if (err < 0)
838 return err;
839
840 buffer = kzalloc(size, GFP_KERNEL);
841 if (!buffer)
842 goto out_refund;
843
844 context->buffer = buffer;
845 context->size = size;
846 context->mm = get_task_mm(current);
847
848 return 0;
849
850 out_refund:
851 refund_locked_memory(current->mm, size);
852 return err;
853}
854
855static inline void free_bts_buffer(struct bts_context *context)
856{
857 if (!context->buffer)
858 return;
859
860 kfree(context->buffer);
861 context->buffer = NULL;
862
863 refund_locked_memory(context->mm, context->size);
864 context->size = 0;
865
866 mmput(context->mm);
867 context->mm = NULL;
868}
869
870static void free_bts_context_work(struct work_struct *w)
871{
872 struct bts_context *context;
873
874 context = container_of(w, struct bts_context, work);
875
876 ds_release_bts(context->tracer);
877 put_task_struct(context->task);
878 free_bts_buffer(context);
879 kfree(context);
880}
881
882static inline void free_bts_context(struct bts_context *context)
883{
884 INIT_WORK(&context->work, free_bts_context_work);
885 schedule_work(&context->work);
886}
887
888static inline struct bts_context *alloc_bts_context(struct task_struct *task)
889{
890 struct bts_context *context = kzalloc(sizeof(*context), GFP_KERNEL);
891 if (context) {
892 context->task = task;
893 task->bts = context;
894
895 get_task_struct(task);
896 }
897
898 return context;
899}
900
901static int ptrace_bts_read_record(struct task_struct *child, size_t index,
902 struct bts_struct __user *out)
903{
904 struct bts_context *context;
905 const struct bts_trace *trace;
906 struct bts_struct bts;
907 const unsigned char *at;
908 int error;
909
910 context = child->bts;
911 if (!context)
912 return -ESRCH;
913
914 trace = ds_read_bts(context->tracer);
915 if (!trace)
916 return -ESRCH;
917
918 at = trace->ds.top - ((index + 1) * trace->ds.size);
919 if ((void *)at < trace->ds.begin)
920 at += (trace->ds.n * trace->ds.size);
921
922 if (!trace->read)
923 return -EOPNOTSUPP;
924
925 error = trace->read(context->tracer, at, &bts);
926 if (error < 0)
927 return error;
928
929 if (copy_to_user(out, &bts, sizeof(bts)))
930 return -EFAULT;
931
932 return sizeof(bts);
933}
934
935static int ptrace_bts_drain(struct task_struct *child,
936 long size,
937 struct bts_struct __user *out)
938{
939 struct bts_context *context;
940 const struct bts_trace *trace;
941 const unsigned char *at;
942 int error, drained = 0;
943
944 context = child->bts;
945 if (!context)
946 return -ESRCH;
947
948 trace = ds_read_bts(context->tracer);
949 if (!trace)
950 return -ESRCH;
951
952 if (!trace->read)
953 return -EOPNOTSUPP;
954
955 if (size < (trace->ds.top - trace->ds.begin))
956 return -EIO;
957
958 for (at = trace->ds.begin; (void *)at < trace->ds.top;
959 out++, drained++, at += trace->ds.size) {
960 struct bts_struct bts;
961
962 error = trace->read(context->tracer, at, &bts);
963 if (error < 0)
964 return error;
965
966 if (copy_to_user(out, &bts, sizeof(bts)))
967 return -EFAULT;
968 }
969
970 memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size);
971
972 error = ds_reset_bts(context->tracer);
973 if (error < 0)
974 return error;
975
976 return drained;
977}
978
979static int ptrace_bts_config(struct task_struct *child,
980 long cfg_size,
981 const struct ptrace_bts_config __user *ucfg)
982{
983 struct bts_context *context;
984 struct ptrace_bts_config cfg;
985 unsigned int flags = 0;
986
987 if (cfg_size < sizeof(cfg))
988 return -EIO;
989
990 if (copy_from_user(&cfg, ucfg, sizeof(cfg)))
991 return -EFAULT;
992
993 context = child->bts;
994 if (!context)
995 context = alloc_bts_context(child);
996 if (!context)
997 return -ENOMEM;
998
999 if (cfg.flags & PTRACE_BTS_O_SIGNAL) {
1000 if (!cfg.signal)
1001 return -EINVAL;
1002
1003 return -EOPNOTSUPP;
1004 context->bts_ovfl_signal = cfg.signal;
1005 }
1006
1007 ds_release_bts(context->tracer);
1008 context->tracer = NULL;
1009
1010 if ((cfg.flags & PTRACE_BTS_O_ALLOC) && (cfg.size != context->size)) {
1011 int err;
1012
1013 free_bts_buffer(context);
1014 if (!cfg.size)
1015 return 0;
1016
1017 err = alloc_bts_buffer(context, cfg.size);
1018 if (err < 0)
1019 return err;
1020 }
1021
1022 if (cfg.flags & PTRACE_BTS_O_TRACE)
1023 flags |= BTS_USER;
1024
1025 if (cfg.flags & PTRACE_BTS_O_SCHED)
1026 flags |= BTS_TIMESTAMPS;
1027
1028 context->tracer =
1029 ds_request_bts_task(child, context->buffer, context->size,
1030 NULL, (size_t)-1, flags);
1031 if (unlikely(IS_ERR(context->tracer))) {
1032 int error = PTR_ERR(context->tracer);
1033
1034 free_bts_buffer(context);
1035 context->tracer = NULL;
1036 return error;
1037 }
1038
1039 return sizeof(cfg);
1040}
1041
1042static int ptrace_bts_status(struct task_struct *child,
1043 long cfg_size,
1044 struct ptrace_bts_config __user *ucfg)
1045{
1046 struct bts_context *context;
1047 const struct bts_trace *trace;
1048 struct ptrace_bts_config cfg;
1049
1050 context = child->bts;
1051 if (!context)
1052 return -ESRCH;
1053
1054 if (cfg_size < sizeof(cfg))
1055 return -EIO;
1056
1057 trace = ds_read_bts(context->tracer);
1058 if (!trace)
1059 return -ESRCH;
1060
1061 memset(&cfg, 0, sizeof(cfg));
1062 cfg.size = trace->ds.end - trace->ds.begin;
1063 cfg.signal = context->bts_ovfl_signal;
1064 cfg.bts_size = sizeof(struct bts_struct);
1065
1066 if (cfg.signal)
1067 cfg.flags |= PTRACE_BTS_O_SIGNAL;
1068
1069 if (trace->ds.flags & BTS_USER)
1070 cfg.flags |= PTRACE_BTS_O_TRACE;
1071
1072 if (trace->ds.flags & BTS_TIMESTAMPS)
1073 cfg.flags |= PTRACE_BTS_O_SCHED;
1074
1075 if (copy_to_user(ucfg, &cfg, sizeof(cfg)))
1076 return -EFAULT;
1077
1078 return sizeof(cfg);
1079}
1080
1081static int ptrace_bts_clear(struct task_struct *child)
1082{
1083 struct bts_context *context;
1084 const struct bts_trace *trace;
1085
1086 context = child->bts;
1087 if (!context)
1088 return -ESRCH;
1089
1090 trace = ds_read_bts(context->tracer);
1091 if (!trace)
1092 return -ESRCH;
1093
1094 memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size);
1095
1096 return ds_reset_bts(context->tracer);
1097}
1098
1099static int ptrace_bts_size(struct task_struct *child)
1100{
1101 struct bts_context *context;
1102 const struct bts_trace *trace;
1103
1104 context = child->bts;
1105 if (!context)
1106 return -ESRCH;
1107
1108 trace = ds_read_bts(context->tracer);
1109 if (!trace)
1110 return -ESRCH;
1111
1112 return (trace->ds.top - trace->ds.begin) / trace->ds.size;
1113}
1114
1115/*
1116 * Called from __ptrace_unlink() after the child has been moved back
1117 * to its original parent.
1118 */
1119void ptrace_bts_untrace(struct task_struct *child)
1120{
1121 if (unlikely(child->bts)) {
1122 free_bts_context(child->bts);
1123 child->bts = NULL;
1124 }
1125}
1126#endif /* CONFIG_X86_PTRACE_BTS */
1127
1128/* 787/*
1129 * Called by kernel/ptrace.c when detaching.. 788 * Called by kernel/ptrace.c when detaching..
1130 * 789 *
@@ -1252,39 +911,6 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
1252 break; 911 break;
1253#endif 912#endif
1254 913
1255 /*
1256 * These bits need more cooking - not enabled yet:
1257 */
1258#ifdef CONFIG_X86_PTRACE_BTS
1259 case PTRACE_BTS_CONFIG:
1260 ret = ptrace_bts_config
1261 (child, data, (struct ptrace_bts_config __user *)addr);
1262 break;
1263
1264 case PTRACE_BTS_STATUS:
1265 ret = ptrace_bts_status
1266 (child, data, (struct ptrace_bts_config __user *)addr);
1267 break;
1268
1269 case PTRACE_BTS_SIZE:
1270 ret = ptrace_bts_size(child);
1271 break;
1272
1273 case PTRACE_BTS_GET:
1274 ret = ptrace_bts_read_record
1275 (child, data, (struct bts_struct __user *) addr);
1276 break;
1277
1278 case PTRACE_BTS_CLEAR:
1279 ret = ptrace_bts_clear(child);
1280 break;
1281
1282 case PTRACE_BTS_DRAIN:
1283 ret = ptrace_bts_drain
1284 (child, data, (struct bts_struct __user *) addr);
1285 break;
1286#endif /* CONFIG_X86_PTRACE_BTS */
1287
1288 default: 914 default:
1289 ret = ptrace_request(child, request, addr, data); 915 ret = ptrace_request(child, request, addr, data);
1290 break; 916 break;
@@ -1544,14 +1170,6 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
1544 1170
1545 case PTRACE_GET_THREAD_AREA: 1171 case PTRACE_GET_THREAD_AREA:
1546 case PTRACE_SET_THREAD_AREA: 1172 case PTRACE_SET_THREAD_AREA:
1547#ifdef CONFIG_X86_PTRACE_BTS
1548 case PTRACE_BTS_CONFIG:
1549 case PTRACE_BTS_STATUS:
1550 case PTRACE_BTS_SIZE:
1551 case PTRACE_BTS_GET:
1552 case PTRACE_BTS_CLEAR:
1553 case PTRACE_BTS_DRAIN:
1554#endif /* CONFIG_X86_PTRACE_BTS */
1555 return arch_ptrace(child, request, addr, data); 1173 return arch_ptrace(child, request, addr, data);
1556 1174
1557 default: 1175 default:
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 03801f2f761f..239427ca02af 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -31,8 +31,16 @@ struct pvclock_shadow_time {
31 u32 tsc_to_nsec_mul; 31 u32 tsc_to_nsec_mul;
32 int tsc_shift; 32 int tsc_shift;
33 u32 version; 33 u32 version;
34 u8 flags;
34}; 35};
35 36
37static u8 valid_flags __read_mostly = 0;
38
39void pvclock_set_flags(u8 flags)
40{
41 valid_flags = flags;
42}
43
36/* 44/*
37 * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, 45 * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
38 * yielding a 64-bit result. 46 * yielding a 64-bit result.
@@ -91,6 +99,7 @@ static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst,
91 dst->system_timestamp = src->system_time; 99 dst->system_timestamp = src->system_time;
92 dst->tsc_to_nsec_mul = src->tsc_to_system_mul; 100 dst->tsc_to_nsec_mul = src->tsc_to_system_mul;
93 dst->tsc_shift = src->tsc_shift; 101 dst->tsc_shift = src->tsc_shift;
102 dst->flags = src->flags;
94 rmb(); /* test version after fetching data */ 103 rmb(); /* test version after fetching data */
95 } while ((src->version & 1) || (dst->version != src->version)); 104 } while ((src->version & 1) || (dst->version != src->version));
96 105
@@ -109,11 +118,14 @@ unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src)
109 return pv_tsc_khz; 118 return pv_tsc_khz;
110} 119}
111 120
121static atomic64_t last_value = ATOMIC64_INIT(0);
122
112cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) 123cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
113{ 124{
114 struct pvclock_shadow_time shadow; 125 struct pvclock_shadow_time shadow;
115 unsigned version; 126 unsigned version;
116 cycle_t ret, offset; 127 cycle_t ret, offset;
128 u64 last;
117 129
118 do { 130 do {
119 version = pvclock_get_time_values(&shadow, src); 131 version = pvclock_get_time_values(&shadow, src);
@@ -123,6 +135,31 @@ cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
123 barrier(); 135 barrier();
124 } while (version != src->version); 136 } while (version != src->version);
125 137
138 if ((valid_flags & PVCLOCK_TSC_STABLE_BIT) &&
139 (shadow.flags & PVCLOCK_TSC_STABLE_BIT))
140 return ret;
141
142 /*
143 * Assumption here is that last_value, a global accumulator, always goes
144 * forward. If we are less than that, we should not be much smaller.
 145	 * We assume there is an error margin we're inside, and then the correction
146 * does not sacrifice accuracy.
147 *
148 * For reads: global may have changed between test and return,
 149	 * but this means someone else updated the clock at a later time.
150 * We just need to make sure we are not seeing a backwards event.
151 *
152 * For updates: last_value = ret is not enough, since two vcpus could be
153 * updating at the same time, and one of them could be slightly behind,
 154	 * making the assumption that last_value always goes forward fail to hold.
155 */
156 last = atomic64_read(&last_value);
157 do {
158 if (ret < last)
159 return last;
160 last = atomic64_cmpxchg(&last_value, last, ret);
161 } while (unlikely(last != ret));
162
126 return ret; 163 return ret;
127} 164}
128 165
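
A concrete trace of the clamp added above (illustrative numbers, not from the patch): suppose vcpu A computes ret = 1000 while vcpu B concurrently computes ret = 998, and A's cmpxchg lands first, so last_value becomes 1000. B's cmpxchg then fails, it re-reads last_value as 1000 on the retry, sees ret < last, and returns 1000 instead of 998; had B won the race instead, A's larger value would simply overwrite 998 on its next iteration. Either way no caller ever observes the paravirt clock stepping backwards, at the cost of one extra (failing) cmpxchg iteration after a successful update.
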
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 12e9feaa2f7a..939b9e98245f 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -495,6 +495,9 @@ void force_hpet_resume(void)
495/* 495/*
496 * HPET MSI on some boards (ATI SB700/SB800) has side effect on 496 * HPET MSI on some boards (ATI SB700/SB800) has side effect on
497 * floppy DMA. Disable HPET MSI on such platforms. 497 * floppy DMA. Disable HPET MSI on such platforms.
498 * See erratum #27 (Misinterpreted MSI Requests May Result in
499 * Corrupted LPC DMA Data) in AMD Publication #46837,
500 * "SB700 Family Product Errata", Rev. 1.0, March 2010.
498 */ 501 */
499static void force_disable_hpet_msi(struct pci_dev *unused) 502static void force_disable_hpet_msi(struct pci_dev *unused)
500{ 503{
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 8e1aac86b50c..e3af342fe83a 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -228,6 +228,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
228 DMI_MATCH(DMI_PRODUCT_NAME, "Precision WorkStation T5400"), 228 DMI_MATCH(DMI_PRODUCT_NAME, "Precision WorkStation T5400"),
229 }, 229 },
230 }, 230 },
231 { /* Handle problems with rebooting on Dell T7400's */
232 .callback = set_bios_reboot,
233 .ident = "Dell Precision T7400",
234 .matches = {
235 DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
236 DMI_MATCH(DMI_PRODUCT_NAME, "Precision WorkStation T7400"),
237 },
238 },
231 { /* Handle problems with rebooting on HP laptops */ 239 { /* Handle problems with rebooting on HP laptops */
232 .callback = set_bios_reboot, 240 .callback = set_bios_reboot,
233 .ident = "HP Compaq Laptop", 241 .ident = "HP Compaq Laptop",
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index c4851eff57b3..b008e7883207 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -102,6 +102,7 @@
102 102
103#include <asm/paravirt.h> 103#include <asm/paravirt.h>
104#include <asm/hypervisor.h> 104#include <asm/hypervisor.h>
105#include <asm/olpc_ofw.h>
105 106
106#include <asm/percpu.h> 107#include <asm/percpu.h>
107#include <asm/topology.h> 108#include <asm/topology.h>
@@ -676,6 +677,17 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
676 DMI_MATCH(DMI_BOARD_NAME, "DG45FC"), 677 DMI_MATCH(DMI_BOARD_NAME, "DG45FC"),
677 }, 678 },
678 }, 679 },
680 /*
681 * The Dell Inspiron Mini 1012 has DMI_BIOS_VENDOR = "Dell Inc.", so
682 * match on the product name.
683 */
684 {
685 .callback = dmi_low_memory_corruption,
686 .ident = "Phoenix BIOS",
687 .matches = {
688 DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 1012"),
689 },
690 },
679#endif 691#endif
680 {} 692 {}
681}; 693};
@@ -725,9 +737,15 @@ void __init setup_arch(char **cmdline_p)
725 /* VMI may relocate the fixmap; do this before touching ioremap area */ 737 /* VMI may relocate the fixmap; do this before touching ioremap area */
726 vmi_init(); 738 vmi_init();
727 739
740 /* OFW also may relocate the fixmap */
741 olpc_ofw_detect();
742
743 early_trap_init();
728 early_cpu_init(); 744 early_cpu_init();
729 early_ioremap_init(); 745 early_ioremap_init();
730 746
747 setup_olpc_ofw_pgd();
748
731 ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev); 749 ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
732 screen_info = boot_params.screen_info; 750 screen_info = boot_params.screen_info;
733 edid_info = boot_params.edid_info; 751 edid_info = boot_params.edid_info;
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index ef6370b00e70..a60df9ae6454 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -21,12 +21,6 @@
21#include <asm/cpu.h> 21#include <asm/cpu.h>
22#include <asm/stackprotector.h> 22#include <asm/stackprotector.h>
23 23
24#ifdef CONFIG_DEBUG_PER_CPU_MAPS
25# define DBG(fmt, ...) pr_dbg(fmt, ##__VA_ARGS__)
26#else
27# define DBG(fmt, ...) do { if (0) pr_dbg(fmt, ##__VA_ARGS__); } while (0)
28#endif
29
30DEFINE_PER_CPU(int, cpu_number); 24DEFINE_PER_CPU(int, cpu_number);
31EXPORT_PER_CPU_SYMBOL(cpu_number); 25EXPORT_PER_CPU_SYMBOL(cpu_number);
32 26
@@ -244,10 +238,19 @@ void __init setup_per_cpu_areas(void)
244#ifdef CONFIG_NUMA 238#ifdef CONFIG_NUMA
245 per_cpu(x86_cpu_to_node_map, cpu) = 239 per_cpu(x86_cpu_to_node_map, cpu) =
246 early_per_cpu_map(x86_cpu_to_node_map, cpu); 240 early_per_cpu_map(x86_cpu_to_node_map, cpu);
241 /*
242 * Ensure that the boot cpu numa_node is correct when the boot
243 * cpu is on a node that doesn't have memory installed.
244 * Also cpu_up() will call cpu_to_node() for APs when
245 * MEMORY_HOTPLUG is defined, before per_cpu(numa_node) is set
246 * up later with c_init aka intel_init/amd_init.
247 * So set them all (boot cpu and all APs).
248 */
249 set_cpu_numa_node(cpu, early_cpu_to_node(cpu));
247#endif 250#endif
248#endif 251#endif
249 /* 252 /*
250 * Up to this point, the boot CPU has been using .data.init 253 * Up to this point, the boot CPU has been using .init.data
251 * area. Reload any changed state for the boot CPU. 254 * area. Reload any changed state for the boot CPU.
252 */ 255 */
253 if (cpu == boot_cpu_id) 256 if (cpu == boot_cpu_id)
@@ -263,14 +266,6 @@ void __init setup_per_cpu_areas(void)
263 early_per_cpu_ptr(x86_cpu_to_node_map) = NULL; 266 early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
264#endif 267#endif
265 268
266#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA)
267 /*
268 * make sure boot cpu node_number is right, when boot cpu is on the
269 * node that doesn't have mem installed
270 */
271 per_cpu(node_number, boot_cpu_id) = cpu_to_node(boot_cpu_id);
272#endif
273
274 /* Setup node to cpumask map */ 269 /* Setup node to cpumask map */
275 setup_node_to_cpumask_map(); 270 setup_node_to_cpumask_map();
276 271
diff --git a/arch/x86/kernel/sfi.c b/arch/x86/kernel/sfi.c
index 34e099382651..cb22acf3ed09 100644
--- a/arch/x86/kernel/sfi.c
+++ b/arch/x86/kernel/sfi.c
@@ -81,7 +81,6 @@ static int __init sfi_parse_cpus(struct sfi_table_header *table)
81#endif /* CONFIG_X86_LOCAL_APIC */ 81#endif /* CONFIG_X86_LOCAL_APIC */
82 82
83#ifdef CONFIG_X86_IO_APIC 83#ifdef CONFIG_X86_IO_APIC
84static u32 gsi_base;
85 84
86static int __init sfi_parse_ioapic(struct sfi_table_header *table) 85static int __init sfi_parse_ioapic(struct sfi_table_header *table)
87{ 86{
@@ -94,8 +93,7 @@ static int __init sfi_parse_ioapic(struct sfi_table_header *table)
94 pentry = (struct sfi_apic_table_entry *)sb->pentry; 93 pentry = (struct sfi_apic_table_entry *)sb->pentry;
95 94
96 for (i = 0; i < num; i++) { 95 for (i = 0; i < num; i++) {
97 mp_register_ioapic(i, pentry->phys_addr, gsi_base); 96 mp_register_ioapic(i, pentry->phys_addr, gsi_top);
98 gsi_base += io_apic_get_redir_entries(i);
99 pentry++; 97 pentry++;
100 } 98 }
101 99
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 763d815e27a0..a5e928b0cb5f 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -686,7 +686,7 @@ static void __cpuinit do_fork_idle(struct work_struct *work)
686static void __cpuinit announce_cpu(int cpu, int apicid) 686static void __cpuinit announce_cpu(int cpu, int apicid)
687{ 687{
688 static int current_node = -1; 688 static int current_node = -1;
689 int node = cpu_to_node(cpu); 689 int node = early_cpu_to_node(cpu);
690 690
691 if (system_state == SYSTEM_BOOTING) { 691 if (system_state == SYSTEM_BOOTING) {
692 if (node != current_node) { 692 if (node != current_node) {
@@ -735,12 +735,8 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
735 goto do_rest; 735 goto do_rest;
736 } 736 }
737 737
738 if (!keventd_up() || current_is_keventd()) 738 schedule_work(&c_idle.work);
739 c_idle.work.func(&c_idle.work); 739 wait_for_completion(&c_idle.done);
740 else {
741 schedule_work(&c_idle.work);
742 wait_for_completion(&c_idle.done);
743 }
744 740
745 if (IS_ERR(c_idle.idle)) { 741 if (IS_ERR(c_idle.idle)) {
746 printk("failed fork for CPU %d\n", cpu); 742 printk("failed fork for CPU %d\n", cpu);
@@ -816,6 +812,13 @@ do_rest:
816 if (cpumask_test_cpu(cpu, cpu_callin_mask)) 812 if (cpumask_test_cpu(cpu, cpu_callin_mask))
817 break; /* It has booted */ 813 break; /* It has booted */
818 udelay(100); 814 udelay(100);
815 /*
816 * Allow other tasks to run while we wait for the
817 * AP to come online. This also gives a chance
 818	 * for the MTRR work (triggered by the AP coming online)
819 * to be completed in the stop machine context.
820 */
821 schedule();
819 } 822 }
820 823
821 if (cpumask_test_cpu(cpu, cpu_callin_mask)) 824 if (cpumask_test_cpu(cpu, cpu_callin_mask))
@@ -1215,9 +1218,17 @@ __init void prefill_possible_map(void)
1215 if (!num_processors) 1218 if (!num_processors)
1216 num_processors = 1; 1219 num_processors = 1;
1217 1220
1218 if (setup_possible_cpus == -1) 1221 i = setup_max_cpus ?: 1;
1219 possible = num_processors + disabled_cpus; 1222 if (setup_possible_cpus == -1) {
1220 else 1223 possible = num_processors;
1224#ifdef CONFIG_HOTPLUG_CPU
1225 if (setup_max_cpus)
1226 possible += disabled_cpus;
1227#else
1228 if (possible > i)
1229 possible = i;
1230#endif
1231 } else
1221 possible = setup_possible_cpus; 1232 possible = setup_possible_cpus;
1222 1233
1223 total_cpus = max_t(int, possible, num_processors + disabled_cpus); 1234 total_cpus = max_t(int, possible, num_processors + disabled_cpus);
@@ -1230,11 +1241,23 @@ __init void prefill_possible_map(void)
1230 possible = nr_cpu_ids; 1241 possible = nr_cpu_ids;
1231 } 1242 }
1232 1243
1244#ifdef CONFIG_HOTPLUG_CPU
1245 if (!setup_max_cpus)
1246#endif
1247 if (possible > i) {
1248 printk(KERN_WARNING
1249 "%d Processors exceeds max_cpus limit of %u\n",
1250 possible, setup_max_cpus);
1251 possible = i;
1252 }
1253
1233 printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n", 1254 printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n",
1234 possible, max_t(int, possible - num_processors, 0)); 1255 possible, max_t(int, possible - num_processors, 0));
1235 1256
1236 for (i = 0; i < possible; i++) 1257 for (i = 0; i < possible; i++)
1237 set_cpu_possible(i, true); 1258 set_cpu_possible(i, true);
1259 for (; i < NR_CPUS; i++)
1260 set_cpu_possible(i, false);
1238 1261
1239 nr_cpu_ids = possible; 1262 nr_cpu_ids = possible;
1240} 1263}
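
Worked example for the new clamp above (hypothetical numbers): on a CONFIG_HOTPLUG_CPU kernel booted with maxcpus=0 on a box whose tables report num_processors = 4 and disabled_cpus = 2, i becomes 1 and possible starts at 4 (the disabled CPUs are not added because setup_max_cpus is 0), total_cpus is max(4, 6) = 6, and the final check fires the "Processors exceeds max_cpus limit" warning and clamps possible to 1, so only the boot CPU is marked possible while the remaining NR_CPUS bits are explicitly cleared by the new trailing loop.
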
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index 922eefbb3f6c..b53c525368a7 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -23,11 +23,16 @@ static int save_stack_stack(void *data, char *name)
23 return 0; 23 return 0;
24} 24}
25 25
26static void save_stack_address(void *data, unsigned long addr, int reliable) 26static void
27__save_stack_address(void *data, unsigned long addr, bool reliable, bool nosched)
27{ 28{
28 struct stack_trace *trace = data; 29 struct stack_trace *trace = data;
30#ifdef CONFIG_FRAME_POINTER
29 if (!reliable) 31 if (!reliable)
30 return; 32 return;
33#endif
34 if (nosched && in_sched_functions(addr))
35 return;
31 if (trace->skip > 0) { 36 if (trace->skip > 0) {
32 trace->skip--; 37 trace->skip--;
33 return; 38 return;
@@ -36,20 +41,15 @@ static void save_stack_address(void *data, unsigned long addr, int reliable)
36 trace->entries[trace->nr_entries++] = addr; 41 trace->entries[trace->nr_entries++] = addr;
37} 42}
38 43
44static void save_stack_address(void *data, unsigned long addr, int reliable)
45{
46 return __save_stack_address(data, addr, reliable, false);
47}
48
39static void 49static void
40save_stack_address_nosched(void *data, unsigned long addr, int reliable) 50save_stack_address_nosched(void *data, unsigned long addr, int reliable)
41{ 51{
42 struct stack_trace *trace = (struct stack_trace *)data; 52 return __save_stack_address(data, addr, reliable, true);
43 if (!reliable)
44 return;
45 if (in_sched_functions(addr))
46 return;
47 if (trace->skip > 0) {
48 trace->skip--;
49 return;
50 }
51 if (trace->nr_entries < trace->max_entries)
52 trace->entries[trace->nr_entries++] = addr;
53} 53}
54 54
55static const struct stacktrace_ops save_stack_ops = { 55static const struct stacktrace_ops save_stack_ops = {
@@ -96,12 +96,13 @@ EXPORT_SYMBOL_GPL(save_stack_trace_tsk);
96 96
97/* Userspace stacktrace - based on kernel/trace/trace_sysprof.c */ 97/* Userspace stacktrace - based on kernel/trace/trace_sysprof.c */
98 98
99struct stack_frame { 99struct stack_frame_user {
100 const void __user *next_fp; 100 const void __user *next_fp;
101 unsigned long ret_addr; 101 unsigned long ret_addr;
102}; 102};
103 103
104static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) 104static int
105copy_stack_frame(const void __user *fp, struct stack_frame_user *frame)
105{ 106{
106 int ret; 107 int ret;
107 108
@@ -126,7 +127,7 @@ static inline void __save_stack_trace_user(struct stack_trace *trace)
126 trace->entries[trace->nr_entries++] = regs->ip; 127 trace->entries[trace->nr_entries++] = regs->ip;
127 128
128 while (trace->nr_entries < trace->max_entries) { 129 while (trace->nr_entries < trace->max_entries) {
129 struct stack_frame frame; 130 struct stack_frame_user frame;
130 131
131 frame.next_fp = NULL; 132 frame.next_fp = NULL;
132 frame.ret_addr = 0; 133 frame.ret_addr = 0;
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index 3149032ff107..58de45ee08b6 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -158,22 +158,6 @@ static int enable_single_step(struct task_struct *child)
158} 158}
159 159
160/* 160/*
161 * Install this value in MSR_IA32_DEBUGCTLMSR whenever child is running.
162 */
163static void write_debugctlmsr(struct task_struct *child, unsigned long val)
164{
165 if (child->thread.debugctlmsr == val)
166 return;
167
168 child->thread.debugctlmsr = val;
169
170 if (child != current)
171 return;
172
173 update_debugctlmsr(val);
174}
175
176/*
177 * Enable single or block step. 161 * Enable single or block step.
178 */ 162 */
179static void enable_step(struct task_struct *child, bool block) 163static void enable_step(struct task_struct *child, bool block)
@@ -186,15 +170,17 @@ static void enable_step(struct task_struct *child, bool block)
186 * that uses user-mode single stepping itself. 170 * that uses user-mode single stepping itself.
187 */ 171 */
188 if (enable_single_step(child) && block) { 172 if (enable_single_step(child) && block) {
189 set_tsk_thread_flag(child, TIF_DEBUGCTLMSR); 173 unsigned long debugctl = get_debugctlmsr();
190 write_debugctlmsr(child, 174
191 child->thread.debugctlmsr | DEBUGCTLMSR_BTF); 175 debugctl |= DEBUGCTLMSR_BTF;
192 } else { 176 update_debugctlmsr(debugctl);
193 write_debugctlmsr(child, 177 set_tsk_thread_flag(child, TIF_BLOCKSTEP);
194 child->thread.debugctlmsr & ~DEBUGCTLMSR_BTF); 178 } else if (test_tsk_thread_flag(child, TIF_BLOCKSTEP)) {
195 179 unsigned long debugctl = get_debugctlmsr();
196 if (!child->thread.debugctlmsr) 180
197 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); 181 debugctl &= ~DEBUGCTLMSR_BTF;
182 update_debugctlmsr(debugctl);
183 clear_tsk_thread_flag(child, TIF_BLOCKSTEP);
198 } 184 }
199} 185}
200 186
@@ -213,11 +199,13 @@ void user_disable_single_step(struct task_struct *child)
213 /* 199 /*
214 * Make sure block stepping (BTF) is disabled. 200 * Make sure block stepping (BTF) is disabled.
215 */ 201 */
216 write_debugctlmsr(child, 202 if (test_tsk_thread_flag(child, TIF_BLOCKSTEP)) {
217 child->thread.debugctlmsr & ~DEBUGCTLMSR_BTF); 203 unsigned long debugctl = get_debugctlmsr();
218 204
219 if (!child->thread.debugctlmsr) 205 debugctl &= ~DEBUGCTLMSR_BTF;
220 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); 206 update_debugctlmsr(debugctl);
207 clear_tsk_thread_flag(child, TIF_BLOCKSTEP);
208 }
221 209
222 /* Always clear TIF_SINGLESTEP... */ 210 /* Always clear TIF_SINGLESTEP... */
223 clear_tsk_thread_flag(child, TIF_SINGLESTEP); 211 clear_tsk_thread_flag(child, TIF_SINGLESTEP);
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index 8b3729341216..b35786dc9b8f 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -337,3 +337,6 @@ ENTRY(sys_call_table)
337 .long sys_rt_tgsigqueueinfo /* 335 */ 337 .long sys_rt_tgsigqueueinfo /* 335 */
338 .long sys_perf_event_open 338 .long sys_perf_event_open
339 .long sys_recvmmsg 339 .long sys_recvmmsg
340 .long sys_fanotify_init
341 .long sys_fanotify_mark
342 .long sys_prlimit64 /* 340 */
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index 86c9f91b48ae..c2f1b26141e2 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -46,6 +46,7 @@
46 46
47/* Global pointer to shared data; NULL means no measured launch. */ 47/* Global pointer to shared data; NULL means no measured launch. */
48struct tboot *tboot __read_mostly; 48struct tboot *tboot __read_mostly;
49EXPORT_SYMBOL(tboot);
49 50
50/* timeout for APs (in secs) to enter wait-for-SIPI state during shutdown */ 51/* timeout for APs (in secs) to enter wait-for-SIPI state during shutdown */
51#define AP_WAIT_TIMEOUT 1 52#define AP_WAIT_TIMEOUT 1
@@ -175,6 +176,9 @@ static void add_mac_region(phys_addr_t start, unsigned long size)
175 struct tboot_mac_region *mr; 176 struct tboot_mac_region *mr;
176 phys_addr_t end = start + size; 177 phys_addr_t end = start + size;
177 178
179 if (tboot->num_mac_regions >= MAX_TB_MAC_REGIONS)
180 panic("tboot: Too many MAC regions\n");
181
178 if (start && size) { 182 if (start && size) {
179 mr = &tboot->mac_regions[tboot->num_mac_regions++]; 183 mr = &tboot->mac_regions[tboot->num_mac_regions++];
180 mr->start = round_down(start, PAGE_SIZE); 184 mr->start = round_down(start, PAGE_SIZE);
@@ -184,18 +188,17 @@ static void add_mac_region(phys_addr_t start, unsigned long size)
184 188
185static int tboot_setup_sleep(void) 189static int tboot_setup_sleep(void)
186{ 190{
191 int i;
192
187 tboot->num_mac_regions = 0; 193 tboot->num_mac_regions = 0;
188 194
189 /* S3 resume code */ 195 for (i = 0; i < e820.nr_map; i++) {
190 add_mac_region(acpi_wakeup_address, WAKEUP_SIZE); 196 if ((e820.map[i].type != E820_RAM)
197 && (e820.map[i].type != E820_RESERVED_KERN))
198 continue;
191 199
192#ifdef CONFIG_X86_TRAMPOLINE 200 add_mac_region(e820.map[i].addr, e820.map[i].size);
193 /* AP trampoline code */ 201 }
194 add_mac_region(virt_to_phys(trampoline_base), TRAMPOLINE_SIZE);
195#endif
196
197 /* kernel code + data + bss */
198 add_mac_region(virt_to_phys(_text), _end - _text);
199 202
200 tboot->acpi_sinfo.kernel_s3_resume_vector = acpi_wakeup_address; 203 tboot->acpi_sinfo.kernel_s3_resume_vector = acpi_wakeup_address;
201 204
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index 17b03dd3a6b5..7fea555929e2 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * SGI UltraViolet TLB flush routines. 2 * SGI UltraViolet TLB flush routines.
3 * 3 *
4 * (c) 2008 Cliff Wickman <cpw@sgi.com>, SGI. 4 * (c) 2008-2010 Cliff Wickman <cpw@sgi.com>, SGI.
5 * 5 *
6 * This code is released under the GNU General Public License version 2 or 6 * This code is released under the GNU General Public License version 2 or
7 * later. 7 * later.
@@ -20,42 +20,67 @@
20#include <asm/idle.h> 20#include <asm/idle.h>
21#include <asm/tsc.h> 21#include <asm/tsc.h>
22#include <asm/irq_vectors.h> 22#include <asm/irq_vectors.h>
23#include <asm/timer.h>
23 24
24static struct bau_control **uv_bau_table_bases __read_mostly; 25struct msg_desc {
25static int uv_bau_retry_limit __read_mostly; 26 struct bau_payload_queue_entry *msg;
27 int msg_slot;
28 int sw_ack_slot;
29 struct bau_payload_queue_entry *va_queue_first;
30 struct bau_payload_queue_entry *va_queue_last;
31};
26 32
27/* base pnode in this partition */ 33#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD 0x000000000bUL
28static int uv_partition_base_pnode __read_mostly; 34
35static int uv_bau_max_concurrent __read_mostly;
36
37static int nobau;
38static int __init setup_nobau(char *arg)
39{
40 nobau = 1;
41 return 0;
42}
43early_param("nobau", setup_nobau);
29 44
30static unsigned long uv_mmask __read_mostly; 45/* base pnode in this partition */
46static int uv_partition_base_pnode __read_mostly;
47/* position of pnode (which is nasid>>1): */
48static int uv_nshift __read_mostly;
49static unsigned long uv_mmask __read_mostly;
31 50
32static DEFINE_PER_CPU(struct ptc_stats, ptcstats); 51static DEFINE_PER_CPU(struct ptc_stats, ptcstats);
33static DEFINE_PER_CPU(struct bau_control, bau_control); 52static DEFINE_PER_CPU(struct bau_control, bau_control);
53static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask);
54
55struct reset_args {
56 int sender;
57};
34 58
35/* 59/*
36 * Determine the first node on a blade. 60 * Determine the first node on a uvhub. 'Nodes' are used for kernel
61 * memory allocation.
37 */ 62 */
38static int __init blade_to_first_node(int blade) 63static int __init uvhub_to_first_node(int uvhub)
39{ 64{
40 int node, b; 65 int node, b;
41 66
42 for_each_online_node(node) { 67 for_each_online_node(node) {
43 b = uv_node_to_blade_id(node); 68 b = uv_node_to_blade_id(node);
44 if (blade == b) 69 if (uvhub == b)
45 return node; 70 return node;
46 } 71 }
47 return -1; /* shouldn't happen */ 72 return -1;
48} 73}
49 74
50/* 75/*
51 * Determine the apicid of the first cpu on a blade. 76 * Determine the apicid of the first cpu on a uvhub.
52 */ 77 */
53static int __init blade_to_first_apicid(int blade) 78static int __init uvhub_to_first_apicid(int uvhub)
54{ 79{
55 int cpu; 80 int cpu;
56 81
57 for_each_present_cpu(cpu) 82 for_each_present_cpu(cpu)
58 if (blade == uv_cpu_to_blade_id(cpu)) 83 if (uvhub == uv_cpu_to_blade_id(cpu))
59 return per_cpu(x86_cpu_to_apicid, cpu); 84 return per_cpu(x86_cpu_to_apicid, cpu);
60 return -1; 85 return -1;
61} 86}
@@ -68,195 +93,459 @@ static int __init blade_to_first_apicid(int blade)
68 * clear of the Timeout bit (as well) will free the resource. No reply will 93 * clear of the Timeout bit (as well) will free the resource. No reply will
69 * be sent (the hardware will only do one reply per message). 94 * be sent (the hardware will only do one reply per message).
70 */ 95 */
71static void uv_reply_to_message(int resource, 96static inline void uv_reply_to_message(struct msg_desc *mdp,
72 struct bau_payload_queue_entry *msg, 97 struct bau_control *bcp)
73 struct bau_msg_status *msp)
74{ 98{
75 unsigned long dw; 99 unsigned long dw;
100 struct bau_payload_queue_entry *msg;
76 101
77 dw = (1 << (resource + UV_SW_ACK_NPENDING)) | (1 << resource); 102 msg = mdp->msg;
103 if (!msg->canceled) {
104 dw = (msg->sw_ack_vector << UV_SW_ACK_NPENDING) |
105 msg->sw_ack_vector;
106 uv_write_local_mmr(
107 UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, dw);
108 }
78 msg->replied_to = 1; 109 msg->replied_to = 1;
79 msg->sw_ack_vector = 0; 110 msg->sw_ack_vector = 0;
80 if (msp)
81 msp->seen_by.bits = 0;
82 uv_write_local_mmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, dw);
83} 111}
84 112
85/* 113/*
86 * Do all the things a cpu should do for a TLB shootdown message. 114 * Process the receipt of a RETRY message
87 * Other cpu's may come here at the same time for this message.
88 */ 115 */
89static void uv_bau_process_message(struct bau_payload_queue_entry *msg, 116static inline void uv_bau_process_retry_msg(struct msg_desc *mdp,
90 int msg_slot, int sw_ack_slot) 117 struct bau_control *bcp)
91{ 118{
92 unsigned long this_cpu_mask; 119 int i;
93 struct bau_msg_status *msp; 120 int cancel_count = 0;
94 int cpu; 121 int slot2;
122 unsigned long msg_res;
123 unsigned long mmr = 0;
124 struct bau_payload_queue_entry *msg;
125 struct bau_payload_queue_entry *msg2;
126 struct ptc_stats *stat;
95 127
96 msp = __get_cpu_var(bau_control).msg_statuses + msg_slot; 128 msg = mdp->msg;
97 cpu = uv_blade_processor_id(); 129 stat = &per_cpu(ptcstats, bcp->cpu);
98 msg->number_of_cpus = 130 stat->d_retries++;
99 uv_blade_nr_online_cpus(uv_node_to_blade_id(numa_node_id())); 131 /*
100 this_cpu_mask = 1UL << cpu; 132 * cancel any message from msg+1 to the retry itself
101 if (msp->seen_by.bits & this_cpu_mask) 133 */
102 return; 134 for (msg2 = msg+1, i = 0; i < DEST_Q_SIZE; msg2++, i++) {
103 atomic_or_long(&msp->seen_by.bits, this_cpu_mask); 135 if (msg2 > mdp->va_queue_last)
136 msg2 = mdp->va_queue_first;
137 if (msg2 == msg)
138 break;
139
140 /* same conditions for cancellation as uv_do_reset */
141 if ((msg2->replied_to == 0) && (msg2->canceled == 0) &&
142 (msg2->sw_ack_vector) && ((msg2->sw_ack_vector &
143 msg->sw_ack_vector) == 0) &&
144 (msg2->sending_cpu == msg->sending_cpu) &&
145 (msg2->msg_type != MSG_NOOP)) {
146 slot2 = msg2 - mdp->va_queue_first;
147 mmr = uv_read_local_mmr
148 (UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
149 msg_res = ((msg2->sw_ack_vector << 8) |
150 msg2->sw_ack_vector);
151 /*
152 * This is a message retry; clear the resources held
153 * by the previous message only if they timed out.
154 * If it has not timed out we have an unexpected
155 * situation to report.
156 */
157 if (mmr & (msg_res << 8)) {
158 /*
159 * is the resource timed out?
160 * make everyone ignore the cancelled message.
161 */
162 msg2->canceled = 1;
163 stat->d_canceled++;
164 cancel_count++;
165 uv_write_local_mmr(
166 UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS,
167 (msg_res << 8) | msg_res);
168 } else
169 printk(KERN_INFO "note bau retry: no effect\n");
170 }
171 }
172 if (!cancel_count)
173 stat->d_nocanceled++;
174}
104 175
105 if (msg->replied_to == 1) 176/*
106 return; 177 * Do all the things a cpu should do for a TLB shootdown message.
178 * Other cpu's may come here at the same time for this message.
179 */
180static void uv_bau_process_message(struct msg_desc *mdp,
181 struct bau_control *bcp)
182{
183 int msg_ack_count;
184 short socket_ack_count = 0;
185 struct ptc_stats *stat;
186 struct bau_payload_queue_entry *msg;
187 struct bau_control *smaster = bcp->socket_master;
107 188
189 /*
190 * This must be a normal message, or retry of a normal message
191 */
192 msg = mdp->msg;
193 stat = &per_cpu(ptcstats, bcp->cpu);
108 if (msg->address == TLB_FLUSH_ALL) { 194 if (msg->address == TLB_FLUSH_ALL) {
109 local_flush_tlb(); 195 local_flush_tlb();
110 __get_cpu_var(ptcstats).alltlb++; 196 stat->d_alltlb++;
111 } else { 197 } else {
112 __flush_tlb_one(msg->address); 198 __flush_tlb_one(msg->address);
113 __get_cpu_var(ptcstats).onetlb++; 199 stat->d_onetlb++;
114 } 200 }
201 stat->d_requestee++;
202
203 /*
204 * One cpu on each uvhub has the additional job on a RETRY
205 * of releasing the resource held by the message that is
206 * being retried. That message is identified by sending
207 * cpu number.
208 */
209 if (msg->msg_type == MSG_RETRY && bcp == bcp->uvhub_master)
210 uv_bau_process_retry_msg(mdp, bcp);
115 211
116 __get_cpu_var(ptcstats).requestee++; 212 /*
213 * This is a sw_ack message, so we have to reply to it.
214 * Count each responding cpu on the socket. This avoids
215 * pinging the count's cache line back and forth between
216 * the sockets.
217 */
218 socket_ack_count = atomic_add_short_return(1, (struct atomic_short *)
219 &smaster->socket_acknowledge_count[mdp->msg_slot]);
220 if (socket_ack_count == bcp->cpus_in_socket) {
221 /*
222 * Both sockets dump their completed count total into
223 * the message's count.
224 */
225 smaster->socket_acknowledge_count[mdp->msg_slot] = 0;
226 msg_ack_count = atomic_add_short_return(socket_ack_count,
227 (struct atomic_short *)&msg->acknowledge_count);
228
229 if (msg_ack_count == bcp->cpus_in_uvhub) {
230 /*
231 * All cpus in uvhub saw it; reply
232 */
233 uv_reply_to_message(mdp, bcp);
234 }
235 }
117 236
118 atomic_inc_short(&msg->acknowledge_count); 237 return;
119 if (msg->number_of_cpus == msg->acknowledge_count)
120 uv_reply_to_message(sw_ack_slot, msg, msp);
121} 238}
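
The acknowledge logic above counts in two levels: each cpu increments a per-socket counter held by the socket master, and only the last cpu on a socket folds that total into the message's count, so the message's cache line is not bounced between sockets on every ack. A standalone sketch of the scheme with made-up sizes (2 sockets of 4 cpus), run sequentially for illustration:

#include <stdio.h>

#define SOCKETS		2
#define CPUS_PER_SOCKET	4
#define CPUS_IN_UVHUB	(SOCKETS * CPUS_PER_SOCKET)

int main(void)
{
	int socket_ack[SOCKETS] = { 0, 0 };
	int msg_ack = 0;
	int s, c;

	for (s = 0; s < SOCKETS; s++) {
		for (c = 0; c < CPUS_PER_SOCKET; c++) {
			int sa = ++socket_ack[s];	/* per-socket add */

			if (sa == CPUS_PER_SOCKET) {
				socket_ack[s] = 0;
				msg_ack += sa;		/* fold into message */
				if (msg_ack == CPUS_IN_UVHUB)
					printf("all %d cpus saw it; reply\n",
					       msg_ack);
			}
		}
	}
	return 0;
}

In the real code both adds are atomic_add_short_return() calls; the sketch only shows why the last-cpu-per-socket test keeps most of the traffic socket-local.
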
122 239
123/* 240/*
124 * Examine the payload queue on one distribution node to see 241 * Determine the first cpu on a uvhub.
125 * which messages have not been seen, and which cpu(s) have not seen them. 242 */
243static int uvhub_to_first_cpu(int uvhub)
244{
245 int cpu;
246 for_each_present_cpu(cpu)
247 if (uvhub == uv_cpu_to_blade_id(cpu))
248 return cpu;
249 return -1;
250}
251
252/*
253 * Last resort when we get a large number of destination timeouts is
254 * to clear resources held by a given cpu.
255 * Do this with IPI so that all messages in the BAU message queue
256 * can be identified by their nonzero sw_ack_vector field.
126 * 257 *
127 * Returns the number of cpu's that have not responded. 258 * This is entered for a single cpu on the uvhub.
259 * The sender want's this uvhub to free a specific message's
260 * sw_ack resources.
128 */ 261 */
129static int uv_examine_destination(struct bau_control *bau_tablesp, int sender) 262static void
263uv_do_reset(void *ptr)
130{ 264{
131 struct bau_payload_queue_entry *msg;
132 struct bau_msg_status *msp;
133 int count = 0;
134 int i; 265 int i;
135 int j; 266 int slot;
267 int count = 0;
268 unsigned long mmr;
269 unsigned long msg_res;
270 struct bau_control *bcp;
271 struct reset_args *rap;
272 struct bau_payload_queue_entry *msg;
273 struct ptc_stats *stat;
136 274
137 for (msg = bau_tablesp->va_queue_first, i = 0; i < DEST_Q_SIZE; 275 bcp = &per_cpu(bau_control, smp_processor_id());
138 msg++, i++) { 276 rap = (struct reset_args *)ptr;
139 if ((msg->sending_cpu == sender) && (!msg->replied_to)) { 277 stat = &per_cpu(ptcstats, bcp->cpu);
140 msp = bau_tablesp->msg_statuses + i; 278 stat->d_resets++;
141 printk(KERN_DEBUG 279
142 "blade %d: address:%#lx %d of %d, not cpu(s): ", 280 /*
143 i, msg->address, msg->acknowledge_count, 281 * We're looking for the given sender, and
144 msg->number_of_cpus); 282 * will free its sw_ack resource.
145 for (j = 0; j < msg->number_of_cpus; j++) { 283 * If all cpu's finally responded after the timeout, its
146 if (!((1L << j) & msp->seen_by.bits)) { 284 * message 'replied_to' was set.
147 count++; 285 */
148 printk("%d ", j); 286 for (msg = bcp->va_queue_first, i = 0; i < DEST_Q_SIZE; msg++, i++) {
149 } 287 /* uv_do_reset: same conditions for cancellation as
288 uv_bau_process_retry_msg() */
289 if ((msg->replied_to == 0) &&
290 (msg->canceled == 0) &&
291 (msg->sending_cpu == rap->sender) &&
292 (msg->sw_ack_vector) &&
293 (msg->msg_type != MSG_NOOP)) {
294 /*
295 * make everyone else ignore this message
296 */
297 msg->canceled = 1;
298 slot = msg - bcp->va_queue_first;
299 count++;
300 /*
301 * only reset the resource if it is still pending
302 */
303 mmr = uv_read_local_mmr
304 (UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
305 msg_res = ((msg->sw_ack_vector << 8) |
306 msg->sw_ack_vector);
307 if (mmr & msg_res) {
308 stat->d_rcanceled++;
309 uv_write_local_mmr(
310 UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS,
311 msg_res);
150 } 312 }
151 printk("\n");
152 } 313 }
153 } 314 }
154 return count; 315 return;
155} 316}
156 317
157/* 318/*
158 * Examine the payload queue on all the distribution nodes to see 319 * Use IPI to get all target uvhubs to release resources held by
159 * which messages have not been seen, and which cpu(s) have not seen them. 320 * a given sending cpu number.
160 *
161 * Returns the number of cpu's that have not responded.
162 */ 321 */
163static int uv_examine_destinations(struct bau_target_nodemask *distribution) 322static void uv_reset_with_ipi(struct bau_target_uvhubmask *distribution,
323 int sender)
164{ 324{
165 int sender; 325 int uvhub;
166 int i; 326 int cpu;
167 int count = 0; 327 cpumask_t mask;
328 struct reset_args reset_args;
329
330 reset_args.sender = sender;
168 331
169 sender = smp_processor_id(); 332 cpus_clear(mask);
170 for (i = 0; i < sizeof(struct bau_target_nodemask) * BITSPERBYTE; i++) { 333 /* find a single cpu for each uvhub in this distribution mask */
171 if (!bau_node_isset(i, distribution)) 334 for (uvhub = 0;
335 uvhub < sizeof(struct bau_target_uvhubmask) * BITSPERBYTE;
336 uvhub++) {
337 if (!bau_uvhub_isset(uvhub, distribution))
172 continue; 338 continue;
173 count += uv_examine_destination(uv_bau_table_bases[i], sender); 339 /* find a cpu for this uvhub */
340 cpu = uvhub_to_first_cpu(uvhub);
341 cpu_set(cpu, mask);
174 } 342 }
175 return count; 343 /* IPI all cpus; Preemption is already disabled */
344 smp_call_function_many(&mask, uv_do_reset, (void *)&reset_args, 1);
345 return;
346}
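
uv_reset_with_ipi() above IPIs exactly one cpu per targeted uvhub, found with uvhub_to_first_cpu(). A standalone sketch (invented cpu-to-uvhub mapping, not kernel code) of building that IPI mask from a distribution bitmap:

#include <stdio.h>

#define NCPUS 8
#define NHUBS 4

int main(void)
{
	int cpu_to_hub[NCPUS] = { 0, 0, 1, 1, 2, 2, 3, 3 };
	unsigned int dist = 0x0a;		/* uvhubs 1 and 3 targeted */
	unsigned int ipi_mask = 0;
	int hub, cpu;

	for (hub = 0; hub < NHUBS; hub++) {
		if (!(dist & (1u << hub)))
			continue;
		for (cpu = 0; cpu < NCPUS; cpu++)	/* first cpu on hub */
			if (cpu_to_hub[cpu] == hub) {
				ipi_mask |= 1u << cpu;
				break;
			}
	}
	printf("ipi mask 0x%x\n", ipi_mask);	/* 0x44: cpus 2 and 6 */
	return 0;
}
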
347
348static inline unsigned long
349cycles_2_us(unsigned long long cyc)
350{
351 unsigned long long ns;
352 unsigned long us;
353 ns = (cyc * per_cpu(cyc2ns, smp_processor_id()))
354 >> CYC2NS_SCALE_FACTOR;
355 us = ns / 1000;
356 return us;
176} 357}
177 358
178/* 359/*
179 * wait for completion of a broadcast message 360 * wait for all cpus on this hub to finish their sends and go quiet
180 * 361 * leaves uvhub_quiesce set so that no new broadcasts are started by
181 * return COMPLETE, RETRY or GIVEUP 362 * bau_flush_send_and_wait()
363 */
364static inline void
365quiesce_local_uvhub(struct bau_control *hmaster)
366{
367 atomic_add_short_return(1, (struct atomic_short *)
368 &hmaster->uvhub_quiesce);
369}
370
371/*
372 * mark this quiet-requestor as done
373 */
374static inline void
375end_uvhub_quiesce(struct bau_control *hmaster)
376{
377 atomic_add_short_return(-1, (struct atomic_short *)
378 &hmaster->uvhub_quiesce);
379}
380
381/*
382 * Wait for completion of a broadcast software ack message
383 * return COMPLETE, RETRY(PLUGGED or TIMEOUT) or GIVEUP
182 */ 384 */
183static int uv_wait_completion(struct bau_desc *bau_desc, 385static int uv_wait_completion(struct bau_desc *bau_desc,
184 unsigned long mmr_offset, int right_shift) 386 unsigned long mmr_offset, int right_shift, int this_cpu,
387 struct bau_control *bcp, struct bau_control *smaster, long try)
185{ 388{
186 int exams = 0; 389 int relaxes = 0;
187 long destination_timeouts = 0;
188 long source_timeouts = 0;
189 unsigned long descriptor_status; 390 unsigned long descriptor_status;
391 unsigned long mmr;
392 unsigned long mask;
393 cycles_t ttime;
394 cycles_t timeout_time;
395 struct ptc_stats *stat = &per_cpu(ptcstats, this_cpu);
396 struct bau_control *hmaster;
397
398 hmaster = bcp->uvhub_master;
399 timeout_time = get_cycles() + bcp->timeout_interval;
190 400
401 /* spin on the status MMR, waiting for it to go idle */
191 while ((descriptor_status = (((unsigned long) 402 while ((descriptor_status = (((unsigned long)
192 uv_read_local_mmr(mmr_offset) >> 403 uv_read_local_mmr(mmr_offset) >>
193 right_shift) & UV_ACT_STATUS_MASK)) != 404 right_shift) & UV_ACT_STATUS_MASK)) !=
194 DESC_STATUS_IDLE) { 405 DESC_STATUS_IDLE) {
195 if (descriptor_status == DESC_STATUS_SOURCE_TIMEOUT) {
196 source_timeouts++;
197 if (source_timeouts > SOURCE_TIMEOUT_LIMIT)
198 source_timeouts = 0;
199 __get_cpu_var(ptcstats).s_retry++;
200 return FLUSH_RETRY;
201 }
202 /* 406 /*
203 * spin here looking for progress at the destinations 407 * Our software ack messages may be blocked because there are
408 * no swack resources available. As long as none of them
409 * has timed out hardware will NACK our message and its
410 * state will stay IDLE.
204 */ 411 */
205 if (descriptor_status == DESC_STATUS_DESTINATION_TIMEOUT) { 412 if (descriptor_status == DESC_STATUS_SOURCE_TIMEOUT) {
206 destination_timeouts++; 413 stat->s_stimeout++;
207 if (destination_timeouts > DESTINATION_TIMEOUT_LIMIT) { 414 return FLUSH_GIVEUP;
208 /* 415 } else if (descriptor_status ==
209 * returns number of cpus not responding 416 DESC_STATUS_DESTINATION_TIMEOUT) {
210 */ 417 stat->s_dtimeout++;
211 if (uv_examine_destinations 418 ttime = get_cycles();
212 (&bau_desc->distribution) == 0) { 419
213 __get_cpu_var(ptcstats).d_retry++; 420 /*
214 return FLUSH_RETRY; 421 * Our retries may be blocked by all destination
215 } 422 * swack resources being consumed, and a timeout
216 exams++; 423 * pending. In that case hardware returns the
217 if (exams >= uv_bau_retry_limit) { 424 * ERROR that looks like a destination timeout.
218 printk(KERN_DEBUG 425 */
219 "uv_flush_tlb_others"); 426 if (cycles_2_us(ttime - bcp->send_message) < BIOS_TO) {
220 printk("giving up on cpu %d\n", 427 bcp->conseccompletes = 0;
221 smp_processor_id()); 428 return FLUSH_RETRY_PLUGGED;
429 }
430
431 bcp->conseccompletes = 0;
432 return FLUSH_RETRY_TIMEOUT;
433 } else {
434 /*
435 * descriptor_status is still BUSY
436 */
437 cpu_relax();
438 relaxes++;
439 if (relaxes >= 10000) {
440 relaxes = 0;
441 if (get_cycles() > timeout_time) {
442 quiesce_local_uvhub(hmaster);
443
444 /* single-thread the register change */
445 spin_lock(&hmaster->masks_lock);
446 mmr = uv_read_local_mmr(mmr_offset);
447 mask = 0UL;
 448 mask |= (3UL << right_shift);
449 mask = ~mask;
450 mmr &= mask;
451 uv_write_local_mmr(mmr_offset, mmr);
452 spin_unlock(&hmaster->masks_lock);
453 end_uvhub_quiesce(hmaster);
454 stat->s_busy++;
222 return FLUSH_GIVEUP; 455 return FLUSH_GIVEUP;
223 } 456 }
224 /*
225 * delays can hang the simulator
226 udelay(1000);
227 */
228 destination_timeouts = 0;
229 } 457 }
230 } 458 }
231 cpu_relax();
232 } 459 }
460 bcp->conseccompletes++;
233 return FLUSH_COMPLETE; 461 return FLUSH_COMPLETE;
234} 462}
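
uv_wait_completion() above maps the hardware status to one of four outcomes: IDLE means complete, a source timeout gives up, a destination timeout counts as "plugged" if it arrives sooner than the destination's timeout period (BIOS_TO) and as a real timeout otherwise, and a descriptor stuck BUSY past timeout_interval also gives up. A standalone sketch of just that decision, with stand-in status values and an assumed BIOS_TO of 700 microseconds (the real constant is not shown in this hunk):

#include <stdio.h>

enum status { ST_IDLE, ST_SOURCE_TIMEOUT, ST_DESTINATION_TIMEOUT, ST_BUSY };
enum result { FL_COMPLETE, FL_GIVEUP, FL_RETRY_PLUGGED, FL_RETRY_TIMEOUT };

#define BIOS_TO_US 700	/* assumed destination timeout period, in us */

static enum result classify(enum status s, unsigned long us_since_send)
{
	switch (s) {
	case ST_IDLE:
		return FL_COMPLETE;
	case ST_SOURCE_TIMEOUT:
		return FL_GIVEUP;
	case ST_DESTINATION_TIMEOUT:
		/* an "early" error means we were plugged, not timed out */
		return us_since_send < BIOS_TO_US ?
			FL_RETRY_PLUGGED : FL_RETRY_TIMEOUT;
	default:
		return FL_GIVEUP;	/* BUSY for longer than the interval */
	}
}

int main(void)
{
	printf("%d %d %d\n",
	       classify(ST_DESTINATION_TIMEOUT, 10),	/* 2: plugged */
	       classify(ST_DESTINATION_TIMEOUT, 5000),	/* 3: timeout */
	       classify(ST_IDLE, 0));			/* 0: complete */
	return 0;
}
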
235 463
464static inline cycles_t
465sec_2_cycles(unsigned long sec)
466{
467 unsigned long ns;
468 cycles_t cyc;
469
470 ns = sec * 1000000000;
471 cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id()));
472 return cyc;
473}
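
cycles_2_us() and sec_2_cycles() are the two directions of the same fixed-point conversion through the per-cpu cyc2ns multiplier. A standalone sketch of the arithmetic, assuming CYC2NS_SCALE_FACTOR is 10 and a 2 GHz cpu (cyc2ns = 0.5 ns/cycle * 2^10 = 512):

#include <stdio.h>

#define CYC2NS_SCALE_FACTOR 10	/* assumed scale shift */

int main(void)
{
	unsigned long long cyc2ns = 512;	/* 2 GHz cpu */
	unsigned long long cyc = 2000000ULL;	/* 2e6 cycles */
	unsigned long long ns = (cyc * cyc2ns) >> CYC2NS_SCALE_FACTOR;
	unsigned long long us = ns / 1000;
	unsigned long long cyc_per_sec =
		(1000000000ULL << CYC2NS_SCALE_FACTOR) / cyc2ns;

	printf("%llu cycles ~ %llu us\n", cyc, us);		/* ~1000 us */
	printf("1 second ~ %llu cycles\n", cyc_per_sec);	/* ~2e9 */
	return 0;
}
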
474
475/*
476 * conditionally add 1 to *v, unless *v is >= u
477 * return 0 if we cannot add 1 to *v because it is >= u
478 * return 1 if we can add 1 to *v because it is < u
479 * the add is atomic
480 *
481 * This is close to atomic_add_unless(), but this allows the 'u' value
482 * to be lowered below the current 'v'. atomic_add_unless can only stop
483 * on equal.
484 */
485static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u)
486{
487 spin_lock(lock);
488 if (atomic_read(v) >= u) {
489 spin_unlock(lock);
490 return 0;
491 }
492 atomic_inc(v);
493 spin_unlock(lock);
494 return 1;
495}
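
The comment above explains why atomic_add_unless() is not enough: the throttle limit can be lowered below the current count (uv_flush_send_and_wait() drops max_concurrent to 1 after a timeout), and the increment must still be refused. A standalone sketch of that semantic, with the locking left out:

#include <stdio.h>

static int inc_unless_ge(int *v, int u)
{
	if (*v >= u)
		return 0;	/* refused: at or over the limit */
	(*v)++;
	return 1;
}

int main(void)
{
	int active = 0;
	int limit = 3;

	while (inc_unless_ge(&active, limit))
		;
	printf("throttled at %d\n", active);	/* 3 */

	limit = 1;	/* limit lowered below the current count */
	printf("inc allowed? %d\n", inc_unless_ge(&active, limit));	/* 0 */
	return 0;
}
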
496
236/** 497/**
237 * uv_flush_send_and_wait 498 * uv_flush_send_and_wait
238 * 499 *
239 * Send a broadcast and wait for a broadcast message to complete. 500 * Send a broadcast and wait for it to complete.
240 * 501 *
241 * The flush_mask contains the cpus the broadcast was sent to. 502 * The flush_mask contains the cpus the broadcast is to be sent to, plus
503 * cpus that are on the local uvhub.
242 * 504 *
243 * Returns NULL if all remote flushing was done. The mask is zeroed. 505 * Returns NULL if all flushing represented in the mask was done. The mask
506 * is zeroed.
244 * Returns @flush_mask if some remote flushing remains to be done. The 507 * Returns @flush_mask if some remote flushing remains to be done. The
245 * mask will have some bits still set. 508 * mask will have some bits still set, representing any cpus on the local
509 * uvhub (not current cpu) and any on remote uvhubs if the broadcast failed.
246 */ 510 */
247const struct cpumask *uv_flush_send_and_wait(int cpu, int this_pnode, 511const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc,
248 struct bau_desc *bau_desc, 512 struct cpumask *flush_mask,
249 struct cpumask *flush_mask) 513 struct bau_control *bcp)
250{ 514{
251 int completion_status = 0;
252 int right_shift; 515 int right_shift;
253 int tries = 0; 516 int uvhub;
254 int pnode;
255 int bit; 517 int bit;
518 int completion_status = 0;
519 int seq_number = 0;
520 long try = 0;
521 int cpu = bcp->uvhub_cpu;
522 int this_cpu = bcp->cpu;
523 int this_uvhub = bcp->uvhub;
256 unsigned long mmr_offset; 524 unsigned long mmr_offset;
257 unsigned long index; 525 unsigned long index;
258 cycles_t time1; 526 cycles_t time1;
259 cycles_t time2; 527 cycles_t time2;
528 struct ptc_stats *stat = &per_cpu(ptcstats, bcp->cpu);
529 struct bau_control *smaster = bcp->socket_master;
530 struct bau_control *hmaster = bcp->uvhub_master;
531
532 /*
533 * Spin here while there are hmaster->max_concurrent or more active
534 * descriptors. This is the per-uvhub 'throttle'.
535 */
536 if (!atomic_inc_unless_ge(&hmaster->uvhub_lock,
537 &hmaster->active_descriptor_count,
538 hmaster->max_concurrent)) {
539 stat->s_throttles++;
540 do {
541 cpu_relax();
542 } while (!atomic_inc_unless_ge(&hmaster->uvhub_lock,
543 &hmaster->active_descriptor_count,
544 hmaster->max_concurrent));
545 }
546
547 while (hmaster->uvhub_quiesce)
548 cpu_relax();
260 549
261 if (cpu < UV_CPUS_PER_ACT_STATUS) { 550 if (cpu < UV_CPUS_PER_ACT_STATUS) {
262 mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0; 551 mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0;
@@ -268,24 +557,108 @@ const struct cpumask *uv_flush_send_and_wait(int cpu, int this_pnode,
268 } 557 }
269 time1 = get_cycles(); 558 time1 = get_cycles();
270 do { 559 do {
271 tries++; 560 /*
561 * Every message from any given cpu gets a unique message
562 * sequence number. But retries use that same number.
563 * Our message may have timed out at the destination because
564 * all sw-ack resources are in use and there is a timeout
565 * pending there. In that case, our last send never got
566 * placed into the queue and we need to persist until it
567 * does.
568 *
569 * Make any retry a type MSG_RETRY so that the destination will
570 * free any resource held by a previous message from this cpu.
571 */
572 if (try == 0) {
573 /* use message type set by the caller the first time */
574 seq_number = bcp->message_number++;
575 } else {
576 /* use RETRY type on all the rest; same sequence */
577 bau_desc->header.msg_type = MSG_RETRY;
578 stat->s_retry_messages++;
579 }
580 bau_desc->header.sequence = seq_number;
272 index = (1UL << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) | 581 index = (1UL << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) |
273 cpu; 582 bcp->uvhub_cpu;
583 bcp->send_message = get_cycles();
584
274 uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index); 585 uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index);
586
587 try++;
275 completion_status = uv_wait_completion(bau_desc, mmr_offset, 588 completion_status = uv_wait_completion(bau_desc, mmr_offset,
276 right_shift); 589 right_shift, this_cpu, bcp, smaster, try);
277 } while (completion_status == FLUSH_RETRY); 590
591 if (completion_status == FLUSH_RETRY_PLUGGED) {
592 /*
593 * Our retries may be blocked by all destination swack
594 * resources being consumed, and a timeout pending. In
595 * that case hardware immediately returns the ERROR
596 * that looks like a destination timeout.
597 */
598 udelay(TIMEOUT_DELAY);
599 bcp->plugged_tries++;
600 if (bcp->plugged_tries >= PLUGSB4RESET) {
601 bcp->plugged_tries = 0;
602 quiesce_local_uvhub(hmaster);
603 spin_lock(&hmaster->queue_lock);
604 uv_reset_with_ipi(&bau_desc->distribution,
605 this_cpu);
606 spin_unlock(&hmaster->queue_lock);
607 end_uvhub_quiesce(hmaster);
608 bcp->ipi_attempts++;
609 stat->s_resets_plug++;
610 }
611 } else if (completion_status == FLUSH_RETRY_TIMEOUT) {
612 hmaster->max_concurrent = 1;
613 bcp->timeout_tries++;
614 udelay(TIMEOUT_DELAY);
615 if (bcp->timeout_tries >= TIMEOUTSB4RESET) {
616 bcp->timeout_tries = 0;
617 quiesce_local_uvhub(hmaster);
618 spin_lock(&hmaster->queue_lock);
619 uv_reset_with_ipi(&bau_desc->distribution,
620 this_cpu);
621 spin_unlock(&hmaster->queue_lock);
622 end_uvhub_quiesce(hmaster);
623 bcp->ipi_attempts++;
624 stat->s_resets_timeout++;
625 }
626 }
627 if (bcp->ipi_attempts >= 3) {
628 bcp->ipi_attempts = 0;
629 completion_status = FLUSH_GIVEUP;
630 break;
631 }
632 cpu_relax();
633 } while ((completion_status == FLUSH_RETRY_PLUGGED) ||
634 (completion_status == FLUSH_RETRY_TIMEOUT));
278 time2 = get_cycles(); 635 time2 = get_cycles();
279 __get_cpu_var(ptcstats).sflush += (time2 - time1);
280 if (tries > 1)
281 __get_cpu_var(ptcstats).retriesok++;
282 636
283 if (completion_status == FLUSH_GIVEUP) { 637 if ((completion_status == FLUSH_COMPLETE) && (bcp->conseccompletes > 5)
638 && (hmaster->max_concurrent < hmaster->max_concurrent_constant))
639 hmaster->max_concurrent++;
640
641 /*
642 * hold any cpu not timing out here; no other cpu currently held by
643 * the 'throttle' should enter the activation code
644 */
645 while (hmaster->uvhub_quiesce)
646 cpu_relax();
647 atomic_dec(&hmaster->active_descriptor_count);
648
649 /* guard against cycles wrap */
650 if (time2 > time1)
651 stat->s_time += (time2 - time1);
652 else
653 stat->s_requestor--; /* don't count this one */
654 if (completion_status == FLUSH_COMPLETE && try > 1)
655 stat->s_retriesok++;
656 else if (completion_status == FLUSH_GIVEUP) {
284 /* 657 /*
285 * Cause the caller to do an IPI-style TLB shootdown on 658 * Cause the caller to do an IPI-style TLB shootdown on
286 * the cpu's, all of which are still in the mask. 659 * the target cpu's, all of which are still in the mask.
287 */ 660 */
288 __get_cpu_var(ptcstats).ptc_i++; 661 stat->s_giveup++;
289 return flush_mask; 662 return flush_mask;
290 } 663 }
291 664
@@ -294,18 +667,17 @@ const struct cpumask *uv_flush_send_and_wait(int cpu, int this_pnode,
294 * use the IPI method of shootdown on them. 667 * use the IPI method of shootdown on them.
295 */ 668 */
296 for_each_cpu(bit, flush_mask) { 669 for_each_cpu(bit, flush_mask) {
297 pnode = uv_cpu_to_pnode(bit); 670 uvhub = uv_cpu_to_blade_id(bit);
298 if (pnode == this_pnode) 671 if (uvhub == this_uvhub)
299 continue; 672 continue;
300 cpumask_clear_cpu(bit, flush_mask); 673 cpumask_clear_cpu(bit, flush_mask);
301 } 674 }
302 if (!cpumask_empty(flush_mask)) 675 if (!cpumask_empty(flush_mask))
303 return flush_mask; 676 return flush_mask;
677
304 return NULL; 678 return NULL;
305} 679}
306 680
307static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask);
308
309/** 681/**
310 * uv_flush_tlb_others - globally purge translation cache of a virtual 682 * uv_flush_tlb_others - globally purge translation cache of a virtual
311 * address or all TLB's 683 * address or all TLB's
@@ -322,8 +694,8 @@ static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask);
322 * The caller has derived the cpumask from the mm_struct. This function 694 * The caller has derived the cpumask from the mm_struct. This function
323 * is called only if there are bits set in the mask. (e.g. flush_tlb_page()) 695 * is called only if there are bits set in the mask. (e.g. flush_tlb_page())
324 * 696 *
325 * The cpumask is converted into a nodemask of the nodes containing 697 * The cpumask is converted into a uvhubmask of the uvhubs containing
326 * the cpus. 698 * those cpus.
327 * 699 *
328 * Note that this function should be called with preemption disabled. 700 * Note that this function should be called with preemption disabled.
329 * 701 *
@@ -335,52 +707,82 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
335 struct mm_struct *mm, 707 struct mm_struct *mm,
336 unsigned long va, unsigned int cpu) 708 unsigned long va, unsigned int cpu)
337{ 709{
338 struct cpumask *flush_mask = __get_cpu_var(uv_flush_tlb_mask); 710 int remotes;
339 int i; 711 int tcpu;
340 int bit; 712 int uvhub;
341 int pnode;
342 int uv_cpu;
343 int this_pnode;
344 int locals = 0; 713 int locals = 0;
345 struct bau_desc *bau_desc; 714 struct bau_desc *bau_desc;
715 struct cpumask *flush_mask;
716 struct ptc_stats *stat;
717 struct bau_control *bcp;
346 718
347 cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu)); 719 if (nobau)
720 return cpumask;
348 721
349 uv_cpu = uv_blade_processor_id(); 722 bcp = &per_cpu(bau_control, cpu);
350 this_pnode = uv_hub_info->pnode; 723 /*
351 bau_desc = __get_cpu_var(bau_control).descriptor_base; 724 * Each sending cpu has a per-cpu mask which it fills from the caller's
352 bau_desc += UV_ITEMS_PER_DESCRIPTOR * uv_cpu; 725 * cpu mask. Only remote cpus are converted to uvhubs and copied.
726 */
727 flush_mask = (struct cpumask *)per_cpu(uv_flush_tlb_mask, cpu);
728 /*
729 * copy cpumask to flush_mask, removing current cpu
730 * (current cpu should already have been flushed by the caller and
731 * should never be returned if we return flush_mask)
732 */
733 cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu));
734 if (cpu_isset(cpu, *cpumask))
735 locals++; /* current cpu was targeted */
353 736
354 bau_nodes_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE); 737 bau_desc = bcp->descriptor_base;
738 bau_desc += UV_ITEMS_PER_DESCRIPTOR * bcp->uvhub_cpu;
355 739
356 i = 0; 740 bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
357 for_each_cpu(bit, flush_mask) { 741 remotes = 0;
358 pnode = uv_cpu_to_pnode(bit); 742 for_each_cpu(tcpu, flush_mask) {
359 BUG_ON(pnode > (UV_DISTRIBUTION_SIZE - 1)); 743 uvhub = uv_cpu_to_blade_id(tcpu);
360 if (pnode == this_pnode) { 744 if (uvhub == bcp->uvhub) {
361 locals++; 745 locals++;
362 continue; 746 continue;
363 } 747 }
364 bau_node_set(pnode - uv_partition_base_pnode, 748 bau_uvhub_set(uvhub, &bau_desc->distribution);
365 &bau_desc->distribution); 749 remotes++;
366 i++;
367 } 750 }
368 if (i == 0) { 751 if (remotes == 0) {
369 /* 752 /*
370 * no off_node flushing; return status for local node 753 * No off_hub flushing; return status for local hub.
754 * Return the caller's mask if all were local (the current
755 * cpu may be in that mask).
371 */ 756 */
372 if (locals) 757 if (locals)
373 return flush_mask; 758 return cpumask;
374 else 759 else
375 return NULL; 760 return NULL;
376 } 761 }
377 __get_cpu_var(ptcstats).requestor++; 762 stat = &per_cpu(ptcstats, cpu);
378 __get_cpu_var(ptcstats).ntargeted += i; 763 stat->s_requestor++;
764 stat->s_ntargcpu += remotes;
765 remotes = bau_uvhub_weight(&bau_desc->distribution);
766 stat->s_ntarguvhub += remotes;
767 if (remotes >= 16)
768 stat->s_ntarguvhub16++;
769 else if (remotes >= 8)
770 stat->s_ntarguvhub8++;
771 else if (remotes >= 4)
772 stat->s_ntarguvhub4++;
773 else if (remotes >= 2)
774 stat->s_ntarguvhub2++;
775 else
776 stat->s_ntarguvhub1++;
379 777
380 bau_desc->payload.address = va; 778 bau_desc->payload.address = va;
381 bau_desc->payload.sending_cpu = cpu; 779 bau_desc->payload.sending_cpu = cpu;
382 780
383 return uv_flush_send_and_wait(uv_cpu, this_pnode, bau_desc, flush_mask); 781 /*
782 * uv_flush_send_and_wait returns null if all cpu's were messaged, or
783 * the adjusted flush_mask if any cpu's were not messaged.
784 */
785 return uv_flush_send_and_wait(bau_desc, flush_mask, bcp);
384} 786}
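
uv_flush_tlb_others() above splits the caller's cpumask into cpus on the sending uvhub (counted as locals, flushed by IPI if the broadcast is skipped or fails) and a per-uvhub distribution bitmap for the broadcast. A standalone sketch of that split with an invented cpu-to-uvhub mapping:

#include <stdio.h>

#define NCPUS 8

int main(void)
{
	int cpu_to_hub[NCPUS] = { 0, 0, 1, 1, 2, 2, 3, 3 };
	int target[NCPUS]     = { 0, 1, 1, 0, 1, 1, 0, 1 };	/* caller's mask */
	int this_cpu = 1;
	int this_hub = cpu_to_hub[1];
	unsigned int dist = 0;			/* per-uvhub bitmap */
	int locals = 0, remotes = 0, cpu;

	if (target[this_cpu])
		locals++;			/* current cpu was targeted */
	for (cpu = 0; cpu < NCPUS; cpu++) {
		if (!target[cpu] || cpu == this_cpu)
			continue;
		if (cpu_to_hub[cpu] == this_hub) {
			locals++;
			continue;
		}
		dist |= 1u << cpu_to_hub[cpu];
		remotes++;
	}
	printf("locals=%d remotes=%d dist=0x%x\n", locals, remotes, dist);
	/* prints: locals=1 remotes=4 dist=0xe */
	return 0;
}
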
385 787
386/* 788/*
@@ -389,87 +791,70 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
389 * 791 *
390 * We received a broadcast assist message. 792 * We received a broadcast assist message.
391 * 793 *
392 * Interrupts may have been disabled; this interrupt could represent 794 * Interrupts are disabled; this interrupt could represent
393 * the receipt of several messages. 795 * the receipt of several messages.
394 * 796 *
395 * All cores/threads on this node get this interrupt. 797 * All cores/threads on this hub get this interrupt.
396 * The last one to see it does the s/w ack. 798 * The last one to see it does the software ack.
397 * (the resource will not be freed until noninterruptable cpus see this 799 * (the resource will not be freed until noninterruptable cpus see this
398 * interrupt; hardware will timeout the s/w ack and reply ERROR) 800 * interrupt; hardware may timeout the s/w ack and reply ERROR)
399 */ 801 */
400void uv_bau_message_interrupt(struct pt_regs *regs) 802void uv_bau_message_interrupt(struct pt_regs *regs)
401{ 803{
402 struct bau_payload_queue_entry *va_queue_first;
403 struct bau_payload_queue_entry *va_queue_last;
404 struct bau_payload_queue_entry *msg;
405 struct pt_regs *old_regs = set_irq_regs(regs);
406 cycles_t time1;
407 cycles_t time2;
408 int msg_slot;
409 int sw_ack_slot;
410 int fw;
411 int count = 0; 804 int count = 0;
412 unsigned long local_pnode; 805 cycles_t time_start;
413 806 struct bau_payload_queue_entry *msg;
414 ack_APIC_irq(); 807 struct bau_control *bcp;
415 exit_idle(); 808 struct ptc_stats *stat;
416 irq_enter(); 809 struct msg_desc msgdesc;
417 810
418 time1 = get_cycles(); 811 time_start = get_cycles();
419 812 bcp = &per_cpu(bau_control, smp_processor_id());
420 local_pnode = uv_blade_to_pnode(uv_numa_blade_id()); 813 stat = &per_cpu(ptcstats, smp_processor_id());
421 814 msgdesc.va_queue_first = bcp->va_queue_first;
422 va_queue_first = __get_cpu_var(bau_control).va_queue_first; 815 msgdesc.va_queue_last = bcp->va_queue_last;
423 va_queue_last = __get_cpu_var(bau_control).va_queue_last; 816 msg = bcp->bau_msg_head;
424
425 msg = __get_cpu_var(bau_control).bau_msg_head;
426 while (msg->sw_ack_vector) { 817 while (msg->sw_ack_vector) {
427 count++; 818 count++;
428 fw = msg->sw_ack_vector; 819 msgdesc.msg_slot = msg - msgdesc.va_queue_first;
429 msg_slot = msg - va_queue_first; 820 msgdesc.sw_ack_slot = ffs(msg->sw_ack_vector) - 1;
430 sw_ack_slot = ffs(fw) - 1; 821 msgdesc.msg = msg;
431 822 uv_bau_process_message(&msgdesc, bcp);
432 uv_bau_process_message(msg, msg_slot, sw_ack_slot);
433
434 msg++; 823 msg++;
435 if (msg > va_queue_last) 824 if (msg > msgdesc.va_queue_last)
436 msg = va_queue_first; 825 msg = msgdesc.va_queue_first;
437 __get_cpu_var(bau_control).bau_msg_head = msg; 826 bcp->bau_msg_head = msg;
438 } 827 }
828 stat->d_time += (get_cycles() - time_start);
439 if (!count) 829 if (!count)
440 __get_cpu_var(ptcstats).nomsg++; 830 stat->d_nomsg++;
441 else if (count > 1) 831 else if (count > 1)
442 __get_cpu_var(ptcstats).multmsg++; 832 stat->d_multmsg++;
443 833 ack_APIC_irq();
444 time2 = get_cycles();
445 __get_cpu_var(ptcstats).dflush += (time2 - time1);
446
447 irq_exit();
448 set_irq_regs(old_regs);
449} 834}
450 835
451/* 836/*
452 * uv_enable_timeouts 837 * uv_enable_timeouts
453 * 838 *
 454 * Each target blade (i.e. blades that have cpu's) needs to have 839 * Each target uvhub (i.e. a uvhub that has cpu's) needs to have
455 * shootdown message timeouts enabled. The timeout does not cause 840 * shootdown message timeouts enabled. The timeout does not cause
456 * an interrupt, but causes an error message to be returned to 841 * an interrupt, but causes an error message to be returned to
457 * the sender. 842 * the sender.
458 */ 843 */
459static void uv_enable_timeouts(void) 844static void uv_enable_timeouts(void)
460{ 845{
461 int blade; 846 int uvhub;
462 int nblades; 847 int nuvhubs;
463 int pnode; 848 int pnode;
464 unsigned long mmr_image; 849 unsigned long mmr_image;
465 850
466 nblades = uv_num_possible_blades(); 851 nuvhubs = uv_num_possible_blades();
467 852
468 for (blade = 0; blade < nblades; blade++) { 853 for (uvhub = 0; uvhub < nuvhubs; uvhub++) {
469 if (!uv_blade_nr_possible_cpus(blade)) 854 if (!uv_blade_nr_possible_cpus(uvhub))
470 continue; 855 continue;
471 856
472 pnode = uv_blade_to_pnode(blade); 857 pnode = uv_blade_to_pnode(uvhub);
473 mmr_image = 858 mmr_image =
474 uv_read_global_mmr64(pnode, UVH_LB_BAU_MISC_CONTROL); 859 uv_read_global_mmr64(pnode, UVH_LB_BAU_MISC_CONTROL);
475 /* 860 /*
@@ -479,16 +864,16 @@ static void uv_enable_timeouts(void)
479 * To program the period, the SOFT_ACK_MODE must be off. 864 * To program the period, the SOFT_ACK_MODE must be off.
480 */ 865 */
481 mmr_image &= ~((unsigned long)1 << 866 mmr_image &= ~((unsigned long)1 <<
482 UV_ENABLE_INTD_SOFT_ACK_MODE_SHIFT); 867 UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT);
483 uv_write_global_mmr64 868 uv_write_global_mmr64
484 (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image); 869 (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
485 /* 870 /*
486 * Set the 4-bit period. 871 * Set the 4-bit period.
487 */ 872 */
488 mmr_image &= ~((unsigned long)0xf << 873 mmr_image &= ~((unsigned long)0xf <<
489 UV_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHIFT); 874 UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT);
490 mmr_image |= (UV_INTD_SOFT_ACK_TIMEOUT_PERIOD << 875 mmr_image |= (UV_INTD_SOFT_ACK_TIMEOUT_PERIOD <<
491 UV_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHIFT); 876 UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT);
492 uv_write_global_mmr64 877 uv_write_global_mmr64
493 (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image); 878 (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
494 /* 879 /*
@@ -497,7 +882,7 @@ static void uv_enable_timeouts(void)
497 * indicated in bits 2:0 (7 causes all of them to timeout). 882 * indicated in bits 2:0 (7 causes all of them to timeout).
498 */ 883 */
499 mmr_image |= ((unsigned long)1 << 884 mmr_image |= ((unsigned long)1 <<
500 UV_ENABLE_INTD_SOFT_ACK_MODE_SHIFT); 885 UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT);
501 uv_write_global_mmr64 886 uv_write_global_mmr64
502 (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image); 887 (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
503 } 888 }
@@ -522,9 +907,20 @@ static void uv_ptc_seq_stop(struct seq_file *file, void *data)
522{ 907{
523} 908}
524 909
910static inline unsigned long long
911millisec_2_cycles(unsigned long millisec)
912{
913 unsigned long ns;
914 unsigned long long cyc;
915
 916 ns = millisec * 1000000;
917 cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id()));
918 return cyc;
919}
920
525/* 921/*
526 * Display the statistics thru /proc 922 * Display the statistics thru /proc.
527 * data points to the cpu number 923 * 'data' points to the cpu number
528 */ 924 */
529static int uv_ptc_seq_show(struct seq_file *file, void *data) 925static int uv_ptc_seq_show(struct seq_file *file, void *data)
530{ 926{
@@ -535,78 +931,155 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data)
535 931
536 if (!cpu) { 932 if (!cpu) {
537 seq_printf(file, 933 seq_printf(file,
538 "# cpu requestor requestee one all sretry dretry ptc_i "); 934 "# cpu sent stime numuvhubs numuvhubs16 numuvhubs8 ");
539 seq_printf(file, 935 seq_printf(file,
540 "sw_ack sflush dflush sok dnomsg dmult starget\n"); 936 "numuvhubs4 numuvhubs2 numuvhubs1 numcpus dto ");
937 seq_printf(file,
938 "retries rok resetp resett giveup sto bz throt ");
939 seq_printf(file,
940 "sw_ack recv rtime all ");
941 seq_printf(file,
942 "one mult none retry canc nocan reset rcan\n");
541 } 943 }
542 if (cpu < num_possible_cpus() && cpu_online(cpu)) { 944 if (cpu < num_possible_cpus() && cpu_online(cpu)) {
543 stat = &per_cpu(ptcstats, cpu); 945 stat = &per_cpu(ptcstats, cpu);
544 seq_printf(file, "cpu %d %ld %ld %ld %ld %ld %ld %ld ", 946 /* source side statistics */
545 cpu, stat->requestor, 947 seq_printf(file,
546 stat->requestee, stat->onetlb, stat->alltlb, 948 "cpu %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ",
547 stat->s_retry, stat->d_retry, stat->ptc_i); 949 cpu, stat->s_requestor, cycles_2_us(stat->s_time),
548 seq_printf(file, "%lx %ld %ld %ld %ld %ld %ld\n", 950 stat->s_ntarguvhub, stat->s_ntarguvhub16,
951 stat->s_ntarguvhub8, stat->s_ntarguvhub4,
952 stat->s_ntarguvhub2, stat->s_ntarguvhub1,
953 stat->s_ntargcpu, stat->s_dtimeout);
954 seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld ",
955 stat->s_retry_messages, stat->s_retriesok,
956 stat->s_resets_plug, stat->s_resets_timeout,
957 stat->s_giveup, stat->s_stimeout,
958 stat->s_busy, stat->s_throttles);
959 /* destination side statistics */
960 seq_printf(file,
961 "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld\n",
549 uv_read_global_mmr64(uv_cpu_to_pnode(cpu), 962 uv_read_global_mmr64(uv_cpu_to_pnode(cpu),
550 UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE), 963 UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE),
551 stat->sflush, stat->dflush, 964 stat->d_requestee, cycles_2_us(stat->d_time),
552 stat->retriesok, stat->nomsg, 965 stat->d_alltlb, stat->d_onetlb, stat->d_multmsg,
553 stat->multmsg, stat->ntargeted); 966 stat->d_nomsg, stat->d_retries, stat->d_canceled,
967 stat->d_nocanceled, stat->d_resets,
968 stat->d_rcanceled);
554 } 969 }
555 970
556 return 0; 971 return 0;
557} 972}
558 973
559/* 974/*
 975 * -1: reset the statistics
560 * 0: display meaning of the statistics 976 * 0: display meaning of the statistics
561 * >0: retry limit 977 * >0: maximum concurrent active descriptors per uvhub (throttle)
562 */ 978 */
563static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user, 979static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user,
564 size_t count, loff_t *data) 980 size_t count, loff_t *data)
565{ 981{
566 long newmode; 982 int cpu;
983 long input_arg;
567 char optstr[64]; 984 char optstr[64];
985 struct ptc_stats *stat;
986 struct bau_control *bcp;
568 987
569 if (count == 0 || count > sizeof(optstr)) 988 if (count == 0 || count > sizeof(optstr))
570 return -EINVAL; 989 return -EINVAL;
571 if (copy_from_user(optstr, user, count)) 990 if (copy_from_user(optstr, user, count))
572 return -EFAULT; 991 return -EFAULT;
573 optstr[count - 1] = '\0'; 992 optstr[count - 1] = '\0';
574 if (strict_strtoul(optstr, 10, &newmode) < 0) { 993 if (strict_strtol(optstr, 10, &input_arg) < 0) {
575 printk(KERN_DEBUG "%s is invalid\n", optstr); 994 printk(KERN_DEBUG "%s is invalid\n", optstr);
576 return -EINVAL; 995 return -EINVAL;
577 } 996 }
578 997
579 if (newmode == 0) { 998 if (input_arg == 0) {
580 printk(KERN_DEBUG "# cpu: cpu number\n"); 999 printk(KERN_DEBUG "# cpu: cpu number\n");
1000 printk(KERN_DEBUG "Sender statistics:\n");
1001 printk(KERN_DEBUG
1002 "sent: number of shootdown messages sent\n");
1003 printk(KERN_DEBUG
1004 "stime: time spent sending messages\n");
1005 printk(KERN_DEBUG
1006 "numuvhubs: number of hubs targeted with shootdown\n");
1007 printk(KERN_DEBUG
1008 "numuvhubs16: number times 16 or more hubs targeted\n");
1009 printk(KERN_DEBUG
1010 "numuvhubs8: number times 8 or more hubs targeted\n");
1011 printk(KERN_DEBUG
1012 "numuvhubs4: number times 4 or more hubs targeted\n");
1013 printk(KERN_DEBUG
1014 "numuvhubs2: number times 2 or more hubs targeted\n");
1015 printk(KERN_DEBUG
1016 "numuvhubs1: number times 1 hub targeted\n");
1017 printk(KERN_DEBUG
1018 "numcpus: number of cpus targeted with shootdown\n");
1019 printk(KERN_DEBUG
1020 "dto: number of destination timeouts\n");
1021 printk(KERN_DEBUG
1022 "retries: destination timeout retries sent\n");
1023 printk(KERN_DEBUG
 1024 "rok: destination timeouts successfully retried\n");
1025 printk(KERN_DEBUG
1026 "resetp: ipi-style resource resets for plugs\n");
1027 printk(KERN_DEBUG
1028 "resett: ipi-style resource resets for timeouts\n");
1029 printk(KERN_DEBUG
1030 "giveup: fall-backs to ipi-style shootdowns\n");
1031 printk(KERN_DEBUG
1032 "sto: number of source timeouts\n");
1033 printk(KERN_DEBUG
1034 "bz: number of stay-busy's\n");
1035 printk(KERN_DEBUG
1036 "throt: number times spun in throttle\n");
1037 printk(KERN_DEBUG "Destination side statistics:\n");
581 printk(KERN_DEBUG 1038 printk(KERN_DEBUG
582 "requestor: times this cpu was the flush requestor\n"); 1039 "sw_ack: image of UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE\n");
583 printk(KERN_DEBUG 1040 printk(KERN_DEBUG
584 "requestee: times this cpu was requested to flush its TLBs\n"); 1041 "recv: shootdown messages received\n");
585 printk(KERN_DEBUG 1042 printk(KERN_DEBUG
586 "one: times requested to flush a single address\n"); 1043 "rtime: time spent processing messages\n");
587 printk(KERN_DEBUG 1044 printk(KERN_DEBUG
588 "all: times requested to flush all TLB's\n"); 1045 "all: shootdown all-tlb messages\n");
589 printk(KERN_DEBUG 1046 printk(KERN_DEBUG
590 "sretry: number of retries of source-side timeouts\n"); 1047 "one: shootdown one-tlb messages\n");
591 printk(KERN_DEBUG 1048 printk(KERN_DEBUG
592 "dretry: number of retries of destination-side timeouts\n"); 1049 "mult: interrupts that found multiple messages\n");
593 printk(KERN_DEBUG 1050 printk(KERN_DEBUG
594 "ptc_i: times UV fell through to IPI-style flushes\n"); 1051 "none: interrupts that found no messages\n");
595 printk(KERN_DEBUG 1052 printk(KERN_DEBUG
596 "sw_ack: image of UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE\n"); 1053 "retry: number of retry messages processed\n");
597 printk(KERN_DEBUG 1054 printk(KERN_DEBUG
598 "sflush_us: cycles spent in uv_flush_tlb_others()\n"); 1055 "canc: number messages canceled by retries\n");
599 printk(KERN_DEBUG 1056 printk(KERN_DEBUG
600 "dflush_us: cycles spent in handling flush requests\n"); 1057 "nocan: number retries that found nothing to cancel\n");
601 printk(KERN_DEBUG "sok: successes on retry\n");
602 printk(KERN_DEBUG "dnomsg: interrupts with no message\n");
603 printk(KERN_DEBUG 1058 printk(KERN_DEBUG
604 "dmult: interrupts with multiple messages\n"); 1059 "reset: number of ipi-style reset requests processed\n");
605 printk(KERN_DEBUG "starget: nodes targeted\n"); 1060 printk(KERN_DEBUG
1061 "rcan: number messages canceled by reset requests\n");
1062 } else if (input_arg == -1) {
1063 for_each_present_cpu(cpu) {
1064 stat = &per_cpu(ptcstats, cpu);
1065 memset(stat, 0, sizeof(struct ptc_stats));
1066 }
606 } else { 1067 } else {
607 uv_bau_retry_limit = newmode; 1068 uv_bau_max_concurrent = input_arg;
608 printk(KERN_DEBUG "timeout retry limit:%d\n", 1069 bcp = &per_cpu(bau_control, smp_processor_id());
609 uv_bau_retry_limit); 1070 if (uv_bau_max_concurrent < 1 ||
1071 uv_bau_max_concurrent > bcp->cpus_in_uvhub) {
1072 printk(KERN_DEBUG
1073 "Error: BAU max concurrent %d; %d is invalid\n",
1074 bcp->max_concurrent, uv_bau_max_concurrent);
1075 return -EINVAL;
1076 }
1077 printk(KERN_DEBUG "Set BAU max concurrent:%d\n",
1078 uv_bau_max_concurrent);
1079 for_each_present_cpu(cpu) {
1080 bcp = &per_cpu(bau_control, cpu);
1081 bcp->max_concurrent = uv_bau_max_concurrent;
1082 }
610 } 1083 }
611 1084
612 return count; 1085 return count;
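
The write handler above accepts -1 to reset the per-cpu statistics, 0 to print the legend, and a positive value to set the per-uvhub concurrency throttle. A minimal userspace sketch of driving it; the proc path is assumed to be /proc/sgi_uv/ptc_statistics, since the file registration itself is not shown in this hunk:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sgi_uv/ptc_statistics", "w");

	if (!f)
		return 1;
	fputs("-1\n", f);	/* reset all per-cpu statistics */
	/* fputs("0\n", f);	   print the legend to the kernel log */
	/* fputs("16\n", f);	   set max concurrent descriptors per uvhub */
	fclose(f);
	return 0;
}
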
@@ -650,79 +1123,30 @@ static int __init uv_ptc_init(void)
650} 1123}
651 1124
652/* 1125/*
653 * begin the initialization of the per-blade control structures
654 */
655static struct bau_control * __init uv_table_bases_init(int blade, int node)
656{
657 int i;
658 struct bau_msg_status *msp;
659 struct bau_control *bau_tabp;
660
661 bau_tabp =
662 kmalloc_node(sizeof(struct bau_control), GFP_KERNEL, node);
663 BUG_ON(!bau_tabp);
664
665 bau_tabp->msg_statuses =
666 kmalloc_node(sizeof(struct bau_msg_status) *
667 DEST_Q_SIZE, GFP_KERNEL, node);
668 BUG_ON(!bau_tabp->msg_statuses);
669
670 for (i = 0, msp = bau_tabp->msg_statuses; i < DEST_Q_SIZE; i++, msp++)
671 bau_cpubits_clear(&msp->seen_by, (int)
672 uv_blade_nr_possible_cpus(blade));
673
674 uv_bau_table_bases[blade] = bau_tabp;
675
676 return bau_tabp;
677}
678
679/*
680 * finish the initialization of the per-blade control structures
681 */
682static void __init
683uv_table_bases_finish(int blade,
684 struct bau_control *bau_tablesp,
685 struct bau_desc *adp)
686{
687 struct bau_control *bcp;
688 int cpu;
689
690 for_each_present_cpu(cpu) {
691 if (blade != uv_cpu_to_blade_id(cpu))
692 continue;
693
694 bcp = (struct bau_control *)&per_cpu(bau_control, cpu);
695 bcp->bau_msg_head = bau_tablesp->va_queue_first;
696 bcp->va_queue_first = bau_tablesp->va_queue_first;
697 bcp->va_queue_last = bau_tablesp->va_queue_last;
698 bcp->msg_statuses = bau_tablesp->msg_statuses;
699 bcp->descriptor_base = adp;
700 }
701}
702
703/*
704 * initialize the sending side's sending buffers 1126 * initialize the sending side's sending buffers
705 */ 1127 */
706static struct bau_desc * __init 1128static void
707uv_activation_descriptor_init(int node, int pnode) 1129uv_activation_descriptor_init(int node, int pnode)
708{ 1130{
709 int i; 1131 int i;
1132 int cpu;
710 unsigned long pa; 1133 unsigned long pa;
711 unsigned long m; 1134 unsigned long m;
712 unsigned long n; 1135 unsigned long n;
713 struct bau_desc *adp; 1136 struct bau_desc *bau_desc;
714 struct bau_desc *ad2; 1137 struct bau_desc *bd2;
1138 struct bau_control *bcp;
715 1139
716 /* 1140 /*
717 * each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR) 1141 * each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR)
718 * per cpu; and up to 32 (UV_ADP_SIZE) cpu's per blade 1142 * per cpu; and up to 32 (UV_ADP_SIZE) cpu's per uvhub
719 */ 1143 */
720 adp = (struct bau_desc *)kmalloc_node(sizeof(struct bau_desc)* 1144 bau_desc = (struct bau_desc *)kmalloc_node(sizeof(struct bau_desc)*
721 UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node); 1145 UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node);
722 BUG_ON(!adp); 1146 BUG_ON(!bau_desc);
723 1147
724 pa = uv_gpa(adp); /* need the real nasid*/ 1148 pa = uv_gpa(bau_desc); /* need the real nasid*/
725 n = uv_gpa_to_pnode(pa); 1149 n = pa >> uv_nshift;
726 m = pa & uv_mmask; 1150 m = pa & uv_mmask;
727 1151
728 uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE, 1152 uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE,
@@ -731,96 +1155,188 @@ uv_activation_descriptor_init(int node, int pnode)
731 /* 1155 /*
732 * initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each 1156 * initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each
733 * cpu even though we only use the first one; one descriptor can 1157 * cpu even though we only use the first one; one descriptor can
734 * describe a broadcast to 256 nodes. 1158 * describe a broadcast to 256 uv hubs.
735 */ 1159 */
736 for (i = 0, ad2 = adp; i < (UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR); 1160 for (i = 0, bd2 = bau_desc; i < (UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR);
737 i++, ad2++) { 1161 i++, bd2++) {
738 memset(ad2, 0, sizeof(struct bau_desc)); 1162 memset(bd2, 0, sizeof(struct bau_desc));
739 ad2->header.sw_ack_flag = 1; 1163 bd2->header.sw_ack_flag = 1;
740 /* 1164 /*
741 * base_dest_nodeid is the first node in the partition, so 1165 * base_dest_nodeid is the nasid (pnode<<1) of the first uvhub
742 * the bit map will indicate partition-relative node numbers. 1166 * in the partition. The bit map will indicate uvhub numbers,
743 * note that base_dest_nodeid is actually a nasid. 1167 * which are 0-N in a partition. Pnodes are unique system-wide.
744 */ 1168 */
745 ad2->header.base_dest_nodeid = uv_partition_base_pnode << 1; 1169 bd2->header.base_dest_nodeid = uv_partition_base_pnode << 1;
746 ad2->header.dest_subnodeid = 0x10; /* the LB */ 1170 bd2->header.dest_subnodeid = 0x10; /* the LB */
747 ad2->header.command = UV_NET_ENDPOINT_INTD; 1171 bd2->header.command = UV_NET_ENDPOINT_INTD;
748 ad2->header.int_both = 1; 1172 bd2->header.int_both = 1;
749 /* 1173 /*
750 * all others need to be set to zero: 1174 * all others need to be set to zero:
751 * fairness chaining multilevel count replied_to 1175 * fairness chaining multilevel count replied_to
752 */ 1176 */
753 } 1177 }
754 return adp; 1178 for_each_present_cpu(cpu) {
1179 if (pnode != uv_blade_to_pnode(uv_cpu_to_blade_id(cpu)))
1180 continue;
1181 bcp = &per_cpu(bau_control, cpu);
1182 bcp->descriptor_base = bau_desc;
1183 }
755} 1184}
756 1185
757/* 1186/*
758 * initialize the destination side's receiving buffers 1187 * initialize the destination side's receiving buffers
1188 * entered for each uvhub in the partition
1189 * - node is first node (kernel memory notion) on the uvhub
1190 * - pnode is the uvhub's physical identifier
759 */ 1191 */
760static struct bau_payload_queue_entry * __init 1192static void
761uv_payload_queue_init(int node, int pnode, struct bau_control *bau_tablesp) 1193uv_payload_queue_init(int node, int pnode)
762{ 1194{
763 struct bau_payload_queue_entry *pqp;
764 unsigned long pa;
765 int pn; 1195 int pn;
1196 int cpu;
766 char *cp; 1197 char *cp;
1198 unsigned long pa;
1199 struct bau_payload_queue_entry *pqp;
1200 struct bau_payload_queue_entry *pqp_malloc;
1201 struct bau_control *bcp;
767 1202
768 pqp = (struct bau_payload_queue_entry *) kmalloc_node( 1203 pqp = (struct bau_payload_queue_entry *) kmalloc_node(
769 (DEST_Q_SIZE + 1) * sizeof(struct bau_payload_queue_entry), 1204 (DEST_Q_SIZE + 1) * sizeof(struct bau_payload_queue_entry),
770 GFP_KERNEL, node); 1205 GFP_KERNEL, node);
771 BUG_ON(!pqp); 1206 BUG_ON(!pqp);
1207 pqp_malloc = pqp;
772 1208
773 cp = (char *)pqp + 31; 1209 cp = (char *)pqp + 31;
774 pqp = (struct bau_payload_queue_entry *)(((unsigned long)cp >> 5) << 5); 1210 pqp = (struct bau_payload_queue_entry *)(((unsigned long)cp >> 5) << 5);
775 bau_tablesp->va_queue_first = pqp; 1211
1212 for_each_present_cpu(cpu) {
1213 if (pnode != uv_cpu_to_pnode(cpu))
1214 continue;
1215 /* for every cpu on this pnode: */
1216 bcp = &per_cpu(bau_control, cpu);
1217 bcp->va_queue_first = pqp;
1218 bcp->bau_msg_head = pqp;
1219 bcp->va_queue_last = pqp + (DEST_Q_SIZE - 1);
1220 }
776 /* 1221 /*
777 * need the pnode of where the memory was really allocated 1222 * need the pnode of where the memory was really allocated
778 */ 1223 */
779 pa = uv_gpa(pqp); 1224 pa = uv_gpa(pqp);
780 pn = uv_gpa_to_pnode(pa); 1225 pn = pa >> uv_nshift;
781 uv_write_global_mmr64(pnode, 1226 uv_write_global_mmr64(pnode,
782 UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST, 1227 UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST,
783 ((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) | 1228 ((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) |
784 uv_physnodeaddr(pqp)); 1229 uv_physnodeaddr(pqp));
785 uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL, 1230 uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL,
786 uv_physnodeaddr(pqp)); 1231 uv_physnodeaddr(pqp));
787 bau_tablesp->va_queue_last = pqp + (DEST_Q_SIZE - 1);
788 uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST, 1232 uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST,
789 (unsigned long) 1233 (unsigned long)
790 uv_physnodeaddr(bau_tablesp->va_queue_last)); 1234 uv_physnodeaddr(pqp + (DEST_Q_SIZE - 1)));
1235 /* in effect, all msg_type's are set to MSG_NOOP */
791 memset(pqp, 0, sizeof(struct bau_payload_queue_entry) * DEST_Q_SIZE); 1236 memset(pqp, 0, sizeof(struct bau_payload_queue_entry) * DEST_Q_SIZE);
792
793 return pqp;
794} 1237}
795 1238
796/* 1239/*
797 * Initialization of each UV blade's structures 1240 * Initialization of each UV hub's structures
798 */ 1241 */
799static int __init uv_init_blade(int blade) 1242static void __init uv_init_uvhub(int uvhub, int vector)
800{ 1243{
801 int node; 1244 int node;
802 int pnode; 1245 int pnode;
803 unsigned long pa;
804 unsigned long apicid; 1246 unsigned long apicid;
805 struct bau_desc *adp; 1247
806 struct bau_payload_queue_entry *pqp; 1248 node = uvhub_to_first_node(uvhub);
807 struct bau_control *bau_tablesp; 1249 pnode = uv_blade_to_pnode(uvhub);
808 1250 uv_activation_descriptor_init(node, pnode);
809 node = blade_to_first_node(blade); 1251 uv_payload_queue_init(node, pnode);
810 bau_tablesp = uv_table_bases_init(blade, node);
811 pnode = uv_blade_to_pnode(blade);
812 adp = uv_activation_descriptor_init(node, pnode);
813 pqp = uv_payload_queue_init(node, pnode, bau_tablesp);
814 uv_table_bases_finish(blade, bau_tablesp, adp);
815 /* 1252 /*
816 * the below initialization can't be in firmware because the 1253 * the below initialization can't be in firmware because the
817 * messaging IRQ will be determined by the OS 1254 * messaging IRQ will be determined by the OS
818 */ 1255 */
819 apicid = blade_to_first_apicid(blade); 1256 apicid = uvhub_to_first_apicid(uvhub);
820 pa = uv_read_global_mmr64(pnode, UVH_BAU_DATA_CONFIG);
821 uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG, 1257 uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG,
822 ((apicid << 32) | UV_BAU_MESSAGE)); 1258 ((apicid << 32) | vector));
823 return 0; 1259}
1260
1261/*
1262 * initialize the bau_control structure for each cpu
1263 */
1264static void uv_init_per_cpu(int nuvhubs)
1265{
1266 int i, j, k;
1267 int cpu;
1268 int pnode;
1269 int uvhub;
1270 short socket = 0;
1271 struct bau_control *bcp;
1272 struct uvhub_desc *bdp;
1273 struct socket_desc *sdp;
1274 struct bau_control *hmaster = NULL;
1275 struct bau_control *smaster = NULL;
1276 struct socket_desc {
1277 short num_cpus;
1278 short cpu_number[16];
1279 };
1280 struct uvhub_desc {
1281 short num_sockets;
1282 short num_cpus;
1283 short uvhub;
1284 short pnode;
1285 struct socket_desc socket[2];
1286 };
1287 struct uvhub_desc *uvhub_descs;
1288
1289 uvhub_descs = (struct uvhub_desc *)
1290 kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL);
1291 memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc));
1292 for_each_present_cpu(cpu) {
1293 bcp = &per_cpu(bau_control, cpu);
1294 memset(bcp, 0, sizeof(struct bau_control));
1295 spin_lock_init(&bcp->masks_lock);
1296 bcp->max_concurrent = uv_bau_max_concurrent;
1297 pnode = uv_cpu_hub_info(cpu)->pnode;
1298 uvhub = uv_cpu_hub_info(cpu)->numa_blade_id;
1299 bdp = &uvhub_descs[uvhub];
1300 bdp->num_cpus++;
1301 bdp->uvhub = uvhub;
1302 bdp->pnode = pnode;
1303 /* time interval to catch a hardware stay-busy bug */
1304 bcp->timeout_interval = millisec_2_cycles(3);
1305 /* kludge: assume uv_hub.h is constant */
1306 socket = (cpu_physical_id(cpu)>>5)&1;
1307 if (socket >= bdp->num_sockets)
1308 bdp->num_sockets = socket+1;
1309 sdp = &bdp->socket[socket];
1310 sdp->cpu_number[sdp->num_cpus] = cpu;
1311 sdp->num_cpus++;
1312 }
1313 socket = 0;
1314 for_each_possible_blade(uvhub) {
1315 bdp = &uvhub_descs[uvhub];
1316 for (i = 0; i < bdp->num_sockets; i++) {
1317 sdp = &bdp->socket[i];
1318 for (j = 0; j < sdp->num_cpus; j++) {
1319 cpu = sdp->cpu_number[j];
1320 bcp = &per_cpu(bau_control, cpu);
1321 bcp->cpu = cpu;
1322 if (j == 0) {
1323 smaster = bcp;
1324 if (i == 0)
1325 hmaster = bcp;
1326 }
1327 bcp->cpus_in_uvhub = bdp->num_cpus;
1328 bcp->cpus_in_socket = sdp->num_cpus;
1329 bcp->socket_master = smaster;
1330 bcp->uvhub_master = hmaster;
1331 for (k = 0; k < DEST_Q_SIZE; k++)
1332 bcp->socket_acknowledge_count[k] = 0;
1333 bcp->uvhub_cpu =
1334 uv_cpu_hub_info(cpu)->blade_processor_id;
1335 }
1336 socket++;
1337 }
1338 }
1339 kfree(uvhub_descs);
824} 1340}
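
uv_init_per_cpu() above derives the socket number from the cpu's physical APIC id with the admitted kludge (cpu_physical_id(cpu)>>5)&1. A standalone sketch of that mapping; treating bit 5 as the socket bit is an assumption specific to the hardware this patch targets:

#include <stdio.h>

static int apicid_to_socket(unsigned int apicid)
{
	return (apicid >> 5) & 1;	/* two sockets per uvhub: 0 or 1 */
}

int main(void)
{
	printf("%d %d\n", apicid_to_socket(0x07), apicid_to_socket(0x27));
	/* prints: 0 1 */
	return 0;
}
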
825 1341
826/* 1342/*
@@ -828,38 +1344,54 @@ static int __init uv_init_blade(int blade)
828 */ 1344 */
829static int __init uv_bau_init(void) 1345static int __init uv_bau_init(void)
830{ 1346{
831 int blade; 1347 int uvhub;
832 int nblades; 1348 int pnode;
1349 int nuvhubs;
833 int cur_cpu; 1350 int cur_cpu;
1351 int vector;
1352 unsigned long mmr;
834 1353
835 if (!is_uv_system()) 1354 if (!is_uv_system())
836 return 0; 1355 return 0;
837 1356
1357 if (nobau)
1358 return 0;
1359
838 for_each_possible_cpu(cur_cpu) 1360 for_each_possible_cpu(cur_cpu)
839 zalloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu), 1361 zalloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu),
840 GFP_KERNEL, cpu_to_node(cur_cpu)); 1362 GFP_KERNEL, cpu_to_node(cur_cpu));
841 1363
842 uv_bau_retry_limit = 1; 1364 uv_bau_max_concurrent = MAX_BAU_CONCURRENT;
1365 uv_nshift = uv_hub_info->m_val;
843 uv_mmask = (1UL << uv_hub_info->m_val) - 1; 1366 uv_mmask = (1UL << uv_hub_info->m_val) - 1;
844 nblades = uv_num_possible_blades(); 1367 nuvhubs = uv_num_possible_blades();
845 1368
846 uv_bau_table_bases = (struct bau_control **) 1369 uv_init_per_cpu(nuvhubs);
847 kmalloc(nblades * sizeof(struct bau_control *), GFP_KERNEL);
848 BUG_ON(!uv_bau_table_bases);
849 1370
850 uv_partition_base_pnode = 0x7fffffff; 1371 uv_partition_base_pnode = 0x7fffffff;
851 for (blade = 0; blade < nblades; blade++) 1372 for (uvhub = 0; uvhub < nuvhubs; uvhub++)
852 if (uv_blade_nr_possible_cpus(blade) && 1373 if (uv_blade_nr_possible_cpus(uvhub) &&
853 (uv_blade_to_pnode(blade) < uv_partition_base_pnode)) 1374 (uv_blade_to_pnode(uvhub) < uv_partition_base_pnode))
854 uv_partition_base_pnode = uv_blade_to_pnode(blade); 1375 uv_partition_base_pnode = uv_blade_to_pnode(uvhub);
855 for (blade = 0; blade < nblades; blade++) 1376
856 if (uv_blade_nr_possible_cpus(blade)) 1377 vector = UV_BAU_MESSAGE;
857 uv_init_blade(blade); 1378 for_each_possible_blade(uvhub)
858 1379 if (uv_blade_nr_possible_cpus(uvhub))
859 alloc_intr_gate(UV_BAU_MESSAGE, uv_bau_message_intr1); 1380 uv_init_uvhub(uvhub, vector);
1381
860 uv_enable_timeouts(); 1382 uv_enable_timeouts();
1383 alloc_intr_gate(vector, uv_bau_message_intr1);
1384
1385 for_each_possible_blade(uvhub) {
1386 pnode = uv_blade_to_pnode(uvhub);
1387 /* INIT the bau */
1388 uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_ACTIVATION_CONTROL,
1389 ((unsigned long)1 << 63));
1390 mmr = 1; /* should be 1 to broadcast to both sockets */
1391 uv_write_global_mmr64(pnode, UVH_BAU_DATA_BROADCAST, mmr);
1392 }
861 1393
862 return 0; 1394 return 0;
863} 1395}
864__initcall(uv_bau_init); 1396core_initcall(uv_bau_init);
865__initcall(uv_ptc_init); 1397core_initcall(uv_ptc_init);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 1168e4454188..60788dee0f8a 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -15,6 +15,7 @@
15#include <linux/kprobes.h> 15#include <linux/kprobes.h>
16#include <linux/uaccess.h> 16#include <linux/uaccess.h>
17#include <linux/kdebug.h> 17#include <linux/kdebug.h>
18#include <linux/kgdb.h>
18#include <linux/kernel.h> 19#include <linux/kernel.h>
19#include <linux/module.h> 20#include <linux/module.h>
20#include <linux/ptrace.h> 21#include <linux/ptrace.h>
@@ -108,15 +109,6 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
108 dec_preempt_count(); 109 dec_preempt_count();
109} 110}
110 111
111#ifdef CONFIG_X86_32
112static inline void
113die_if_kernel(const char *str, struct pt_regs *regs, long err)
114{
115 if (!user_mode_vm(regs))
116 die(str, regs, err);
117}
118#endif
119
120static void __kprobes 112static void __kprobes
121do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, 113do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
122 long error_code, siginfo_t *info) 114 long error_code, siginfo_t *info)
@@ -400,7 +392,13 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
400 if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT) 392 if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
401 == NOTIFY_STOP) 393 == NOTIFY_STOP)
402 return; 394 return;
395
403#ifdef CONFIG_X86_LOCAL_APIC 396#ifdef CONFIG_X86_LOCAL_APIC
397 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
398 == NOTIFY_STOP)
399 return;
400
401#ifndef CONFIG_LOCKUP_DETECTOR
404 /* 402 /*
405 * Ok, so this is none of the documented NMI sources, 403 * Ok, so this is none of the documented NMI sources,
406 * so it must be the NMI watchdog. 404 * so it must be the NMI watchdog.
@@ -408,6 +406,7 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
408 if (nmi_watchdog_tick(regs, reason)) 406 if (nmi_watchdog_tick(regs, reason))
409 return; 407 return;
410 if (!do_nmi_callback(regs, cpu)) 408 if (!do_nmi_callback(regs, cpu))
409#endif /* !CONFIG_LOCKUP_DETECTOR */
411 unknown_nmi_error(reason, regs); 410 unknown_nmi_error(reason, regs);
412#else 411#else
413 unknown_nmi_error(reason, regs); 412 unknown_nmi_error(reason, regs);
@@ -460,6 +459,11 @@ void restart_nmi(void)
460/* May run on IST stack. */ 459/* May run on IST stack. */
461dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) 460dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code)
462{ 461{
462#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
463 if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
464 == NOTIFY_STOP)
465 return;
466#endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */
463#ifdef CONFIG_KPROBES 467#ifdef CONFIG_KPROBES
464 if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) 468 if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
465 == NOTIFY_STOP) 469 == NOTIFY_STOP)
@@ -529,6 +533,7 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
529dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) 533dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
530{ 534{
531 struct task_struct *tsk = current; 535 struct task_struct *tsk = current;
536 int user_icebp = 0;
532 unsigned long dr6; 537 unsigned long dr6;
533 int si_code; 538 int si_code;
534 539
@@ -537,17 +542,25 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
537 /* Filter out all the reserved bits which are preset to 1 */ 542 /* Filter out all the reserved bits which are preset to 1 */
538 dr6 &= ~DR6_RESERVED; 543 dr6 &= ~DR6_RESERVED;
539 544
545 /*
 546 * If dr6 gives us no reason for the origin of this trap,
547 * then it's very likely the result of an icebp/int01 trap.
548 * User wants a sigtrap for that.
549 */
550 if (!dr6 && user_mode(regs))
551 user_icebp = 1;
552
540 /* Catch kmemcheck conditions first of all! */ 553 /* Catch kmemcheck conditions first of all! */
541 if ((dr6 & DR_STEP) && kmemcheck_trap(regs)) 554 if ((dr6 & DR_STEP) && kmemcheck_trap(regs))
542 return; 555 return;
543 556
544 /* DR6 may or may not be cleared by the CPU */ 557 /* DR6 may or may not be cleared by the CPU */
545 set_debugreg(0, 6); 558 set_debugreg(0, 6);
559
546 /* 560 /*
547 * The processor cleared BTF, so don't mark that we need it set. 561 * The processor cleared BTF, so don't mark that we need it set.
548 */ 562 */
549 clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR); 563 clear_tsk_thread_flag(tsk, TIF_BLOCKSTEP);
550 tsk->thread.debugctlmsr = 0;
551 564
552 /* Store the virtualized DR6 value */ 565 /* Store the virtualized DR6 value */
553 tsk->thread.debugreg6 = dr6; 566 tsk->thread.debugreg6 = dr6;
@@ -578,62 +591,74 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
578 regs->flags &= ~X86_EFLAGS_TF; 591 regs->flags &= ~X86_EFLAGS_TF;
579 } 592 }
580 si_code = get_si_code(tsk->thread.debugreg6); 593 si_code = get_si_code(tsk->thread.debugreg6);
581 if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS)) 594 if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp)
582 send_sigtrap(tsk, regs, error_code, si_code); 595 send_sigtrap(tsk, regs, error_code, si_code);
583 preempt_conditional_cli(regs); 596 preempt_conditional_cli(regs);
584 597
585 return; 598 return;
586} 599}
587 600
588#ifdef CONFIG_X86_64
589static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
590{
591 if (fixup_exception(regs))
592 return 1;
593
594 notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE);
595 /* Illegal floating point operation in the kernel */
596 current->thread.trap_no = trapnr;
597 die(str, regs, 0);
598 return 0;
599}
600#endif
601
602/* 601/*
603 * Note that we play around with the 'TS' bit in an attempt to get 602 * Note that we play around with the 'TS' bit in an attempt to get
604 * the correct behaviour even in the presence of the asynchronous 603 * the correct behaviour even in the presence of the asynchronous
605 * IRQ13 behaviour 604 * IRQ13 behaviour
606 */ 605 */
607void math_error(void __user *ip) 606void math_error(struct pt_regs *regs, int error_code, int trapnr)
608{ 607{
609 struct task_struct *task; 608 struct task_struct *task = current;
610 siginfo_t info; 609 siginfo_t info;
611 unsigned short cwd, swd, err; 610 unsigned short err;
611 char *str = (trapnr == 16) ? "fpu exception" : "simd exception";
612
613 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, SIGFPE) == NOTIFY_STOP)
614 return;
615 conditional_sti(regs);
616
617 if (!user_mode_vm(regs))
618 {
619 if (!fixup_exception(regs)) {
620 task->thread.error_code = error_code;
621 task->thread.trap_no = trapnr;
622 die(str, regs, error_code);
623 }
624 return;
625 }
612 626
613 /* 627 /*
614 * Save the info for the exception handler and clear the error. 628 * Save the info for the exception handler and clear the error.
615 */ 629 */
616 task = current;
617 save_init_fpu(task); 630 save_init_fpu(task);
618 task->thread.trap_no = 16; 631 task->thread.trap_no = trapnr;
619 task->thread.error_code = 0; 632 task->thread.error_code = error_code;
620 info.si_signo = SIGFPE; 633 info.si_signo = SIGFPE;
621 info.si_errno = 0; 634 info.si_errno = 0;
622 info.si_addr = ip; 635 info.si_addr = (void __user *)regs->ip;
623 /* 636 if (trapnr == 16) {
624 * (~cwd & swd) will mask out exceptions that are not set to unmasked 637 unsigned short cwd, swd;
625 * status. 0x3f is the exception bits in these regs, 0x200 is the 638 /*
626 * C1 reg you need in case of a stack fault, 0x040 is the stack 639 * (~cwd & swd) will mask out exceptions that are not set to unmasked
627 * fault bit. We should only be taking one exception at a time, 640 * status. 0x3f is the exception bits in these regs, 0x200 is the
628 * so if this combination doesn't produce any single exception, 641 * C1 reg you need in case of a stack fault, 0x040 is the stack
629 * then we have a bad program that isn't synchronizing its FPU usage 642 * fault bit. We should only be taking one exception at a time,
630 * and it will suffer the consequences since we won't be able to 643 * so if this combination doesn't produce any single exception,
631 * fully reproduce the context of the exception 644 * then we have a bad program that isn't synchronizing its FPU usage
632 */ 645 * and it will suffer the consequences since we won't be able to
633 cwd = get_fpu_cwd(task); 646 * fully reproduce the context of the exception
634 swd = get_fpu_swd(task); 647 */
648 cwd = get_fpu_cwd(task);
649 swd = get_fpu_swd(task);
635 650
636 err = swd & ~cwd; 651 err = swd & ~cwd;
652 } else {
653 /*
654 * The SIMD FPU exceptions are handled a little differently, as there
655 * is only a single status/control register. Thus, to determine which
656 * unmasked exception was caught we must mask the exception mask bits
657 * at 0x1f80, and then use these to mask the exception bits at 0x3f.
658 */
659 unsigned short mxcsr = get_fpu_mxcsr(task);
660 err = ~(mxcsr >> 7) & mxcsr;
661 }
637 662
638 if (err & 0x001) { /* Invalid op */ 663 if (err & 0x001) { /* Invalid op */
639 /* 664 /*
@@ -662,97 +687,17 @@ void math_error(void __user *ip)
662 687
663dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code) 688dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
664{ 689{
665 conditional_sti(regs);
666
667#ifdef CONFIG_X86_32 690#ifdef CONFIG_X86_32
668 ignore_fpu_irq = 1; 691 ignore_fpu_irq = 1;
669#else
670 if (!user_mode(regs) &&
671 kernel_math_error(regs, "kernel x87 math error", 16))
672 return;
673#endif 692#endif
674 693
675 math_error((void __user *)regs->ip); 694 math_error(regs, error_code, 16);
676}
677
678static void simd_math_error(void __user *ip)
679{
680 struct task_struct *task;
681 siginfo_t info;
682 unsigned short mxcsr;
683
684 /*
685 * Save the info for the exception handler and clear the error.
686 */
687 task = current;
688 save_init_fpu(task);
689 task->thread.trap_no = 19;
690 task->thread.error_code = 0;
691 info.si_signo = SIGFPE;
692 info.si_errno = 0;
693 info.si_code = __SI_FAULT;
694 info.si_addr = ip;
695 /*
696 * The SIMD FPU exceptions are handled a little differently, as there
697 * is only a single status/control register. Thus, to determine which
698 * unmasked exception was caught we must mask the exception mask bits
699 * at 0x1f80, and then use these to mask the exception bits at 0x3f.
700 */
701 mxcsr = get_fpu_mxcsr(task);
702 switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
703 case 0x000:
704 default:
705 break;
706 case 0x001: /* Invalid Op */
707 info.si_code = FPE_FLTINV;
708 break;
709 case 0x002: /* Denormalize */
710 case 0x010: /* Underflow */
711 info.si_code = FPE_FLTUND;
712 break;
713 case 0x004: /* Zero Divide */
714 info.si_code = FPE_FLTDIV;
715 break;
716 case 0x008: /* Overflow */
717 info.si_code = FPE_FLTOVF;
718 break;
719 case 0x020: /* Precision */
720 info.si_code = FPE_FLTRES;
721 break;
722 }
723 force_sig_info(SIGFPE, &info, task);
724} 695}
725 696
726dotraplinkage void 697dotraplinkage void
727do_simd_coprocessor_error(struct pt_regs *regs, long error_code) 698do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
728{ 699{
729 conditional_sti(regs); 700 math_error(regs, error_code, 19);
730
731#ifdef CONFIG_X86_32
732 if (cpu_has_xmm) {
733 /* Handle SIMD FPU exceptions on PIII+ processors. */
734 ignore_fpu_irq = 1;
735 simd_math_error((void __user *)regs->ip);
736 return;
737 }
738 /*
739 * Handle strange cache flush from user space exception
740 * in all other cases. This is undocumented behaviour.
741 */
742 if (regs->flags & X86_VM_MASK) {
743 handle_vm86_fault((struct kernel_vm86_regs *)regs, error_code);
744 return;
745 }
746 current->thread.trap_no = 19;
747 current->thread.error_code = error_code;
748 die_if_kernel("cache flush denied", regs, error_code);
749 force_sig(SIGSEGV, current);
750#else
751 if (!user_mode(regs) &&
752 kernel_math_error(regs, "kernel simd math error", 19))
753 return;
754 simd_math_error((void __user *)regs->ip);
755#endif
756} 701}
757 702
758dotraplinkage void 703dotraplinkage void
@@ -879,6 +824,16 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
879} 824}
880#endif 825#endif
881 826
827/* Set of traps needed for early debugging. */
828void __init early_trap_init(void)
829{
830 set_intr_gate_ist(1, &debug, DEBUG_STACK);
831 /* int3 can be called from all */
832 set_system_intr_gate_ist(3, &int3, DEBUG_STACK);
833 set_intr_gate(14, &page_fault);
834 load_idt(&idt_descr);
835}
836
882void __init trap_init(void) 837void __init trap_init(void)
883{ 838{
884 int i; 839 int i;
@@ -892,10 +847,7 @@ void __init trap_init(void)
892#endif 847#endif
893 848
894 set_intr_gate(0, &divide_error); 849 set_intr_gate(0, &divide_error);
895 set_intr_gate_ist(1, &debug, DEBUG_STACK);
896 set_intr_gate_ist(2, &nmi, NMI_STACK); 850 set_intr_gate_ist(2, &nmi, NMI_STACK);
897 /* int3 can be called from all */
898 set_system_intr_gate_ist(3, &int3, DEBUG_STACK);
899 /* int4 can be called from all */ 851 /* int4 can be called from all */
900 set_system_intr_gate(4, &overflow); 852 set_system_intr_gate(4, &overflow);
901 set_intr_gate(5, &bounds); 853 set_intr_gate(5, &bounds);
@@ -911,7 +863,6 @@ void __init trap_init(void)
911 set_intr_gate(11, &segment_not_present); 863 set_intr_gate(11, &segment_not_present);
912 set_intr_gate_ist(12, &stack_segment, STACKFAULT_STACK); 864 set_intr_gate_ist(12, &stack_segment, STACKFAULT_STACK);
913 set_intr_gate(13, &general_protection); 865 set_intr_gate(13, &general_protection);
914 set_intr_gate(14, &page_fault);
915 set_intr_gate(15, &spurious_interrupt_bug); 866 set_intr_gate(15, &spurious_interrupt_bug);
916 set_intr_gate(16, &coprocessor_error); 867 set_intr_gate(16, &coprocessor_error);
917 set_intr_gate(17, &alignment_check); 868 set_intr_gate(17, &alignment_check);
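
The consolidated math_error() above computes a pending-exception mask either from the x87 control/status words (err = swd & ~cwd, trap 16) or from MXCSR (err = ~(mxcsr >> 7) & mxcsr, trap 19) and then picks a SIGFPE si_code from it. A standalone sketch of that selection, reusing the bit meanings shown in the removed simd_math_error() switch (the helper name and the if-ordering are illustrative, not kernel code):

	#include <signal.h>

	static int fpe_si_code(unsigned short err)
	{
		if (err & 0x001)		/* invalid operation */
			return FPE_FLTINV;
		if (err & (0x002 | 0x010))	/* denormal or underflow */
			return FPE_FLTUND;
		if (err & 0x004)		/* divide by zero */
			return FPE_FLTDIV;
		if (err & 0x008)		/* overflow */
			return FPE_FLTOVF;
		if (err & 0x020)		/* precision */
			return FPE_FLTRES;
		return 0;	/* no single cause; the kernel falls back to __SI_FAULT here */
	}
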
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 9faf91ae1841..ce8e50239332 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -751,7 +751,6 @@ static struct clocksource clocksource_tsc = {
751 .read = read_tsc, 751 .read = read_tsc,
752 .resume = resume_tsc, 752 .resume = resume_tsc,
753 .mask = CLOCKSOURCE_MASK(64), 753 .mask = CLOCKSOURCE_MASK(64),
754 .shift = 22,
755 .flags = CLOCK_SOURCE_IS_CONTINUOUS | 754 .flags = CLOCK_SOURCE_IS_CONTINUOUS |
756 CLOCK_SOURCE_MUST_VERIFY, 755 CLOCK_SOURCE_MUST_VERIFY,
757#ifdef CONFIG_X86_64 756#ifdef CONFIG_X86_64
@@ -845,8 +844,6 @@ __cpuinit int unsynchronized_tsc(void)
845 844
846static void __init init_tsc_clocksource(void) 845static void __init init_tsc_clocksource(void)
847{ 846{
848 clocksource_tsc.mult = clocksource_khz2mult(tsc_khz,
849 clocksource_tsc.shift);
850 if (tsc_clocksource_reliable) 847 if (tsc_clocksource_reliable)
851 clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; 848 clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
852 /* lower the rating if we already know its unstable: */ 849 /* lower the rating if we already know its unstable: */
@@ -854,7 +851,7 @@ static void __init init_tsc_clocksource(void)
854 clocksource_tsc.rating = 0; 851 clocksource_tsc.rating = 0;
855 clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS; 852 clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS;
856 } 853 }
857 clocksource_register(&clocksource_tsc); 854 clocksource_register_khz(&clocksource_tsc, tsc_khz);
858} 855}
859 856
860#ifdef CONFIG_X86_64 857#ifdef CONFIG_X86_64
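
The tsc.c hunks drop the hard-coded .shift = 22 and the explicit clocksource_khz2mult() call; clocksource_register_khz() now derives mult/shift from the frequency at registration time. A sketch of the same pattern for a hypothetical clocksource (all example_* names and the 14318 kHz rate are illustrative):

	#include <linux/clocksource.h>
	#include <linux/init.h>

	static cycle_t example_read(struct clocksource *cs)
	{
		return 0;	/* would read the hardware counter */
	}

	static struct clocksource example_cs = {
		.name	= "example",
		.rating	= 300,
		.read	= example_read,
		.mask	= CLOCKSOURCE_MASK(64),
		.flags	= CLOCK_SOURCE_IS_CONTINUOUS,
	};

	static int __init example_cs_init(void)
	{
		/* mult/shift are computed by the core from the kHz rate */
		return clocksource_register_khz(&example_cs, 14318);
	}
	device_initcall(example_cs_init);
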
diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/kernel/uv_irq.c
index 1d40336b030a..1132129db792 100644
--- a/arch/x86/kernel/uv_irq.c
+++ b/arch/x86/kernel/uv_irq.c
@@ -44,7 +44,7 @@ static void uv_ack_apic(unsigned int irq)
44 ack_APIC_irq(); 44 ack_APIC_irq();
45} 45}
46 46
47struct irq_chip uv_irq_chip = { 47static struct irq_chip uv_irq_chip = {
48 .name = "UV-CORE", 48 .name = "UV-CORE",
49 .startup = uv_noop_ret, 49 .startup = uv_noop_ret,
50 .shutdown = uv_noop, 50 .shutdown = uv_noop,
@@ -141,7 +141,7 @@ int uv_irq_2_mmr_info(int irq, unsigned long *offset, int *pnode)
141 */ 141 */
142static int 142static int
143arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, 143arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
144 unsigned long mmr_offset, int restrict) 144 unsigned long mmr_offset, int limit)
145{ 145{
146 const struct cpumask *eligible_cpu = cpumask_of(cpu); 146 const struct cpumask *eligible_cpu = cpumask_of(cpu);
147 struct irq_desc *desc = irq_to_desc(irq); 147 struct irq_desc *desc = irq_to_desc(irq);
@@ -160,7 +160,7 @@ arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
160 if (err != 0) 160 if (err != 0)
161 return err; 161 return err;
162 162
163 if (restrict == UV_AFFINITY_CPU) 163 if (limit == UV_AFFINITY_CPU)
164 desc->status |= IRQ_NO_BALANCING; 164 desc->status |= IRQ_NO_BALANCING;
165 else 165 else
166 desc->status |= IRQ_MOVE_PCNTXT; 166 desc->status |= IRQ_MOVE_PCNTXT;
@@ -214,7 +214,7 @@ static int uv_set_irq_affinity(unsigned int irq, const struct cpumask *mask)
214 unsigned long mmr_value; 214 unsigned long mmr_value;
215 struct uv_IO_APIC_route_entry *entry; 215 struct uv_IO_APIC_route_entry *entry;
216 unsigned long mmr_offset; 216 unsigned long mmr_offset;
217 unsigned mmr_pnode; 217 int mmr_pnode;
218 218
219 if (set_desc_affinity(desc, mask, &dest)) 219 if (set_desc_affinity(desc, mask, &dest))
220 return -1; 220 return -1;
@@ -248,7 +248,7 @@ static int uv_set_irq_affinity(unsigned int irq, const struct cpumask *mask)
248 * interrupt is raised. 248 * interrupt is raised.
249 */ 249 */
250int uv_setup_irq(char *irq_name, int cpu, int mmr_blade, 250int uv_setup_irq(char *irq_name, int cpu, int mmr_blade,
251 unsigned long mmr_offset, int restrict) 251 unsigned long mmr_offset, int limit)
252{ 252{
253 int irq, ret; 253 int irq, ret;
254 254
@@ -258,7 +258,7 @@ int uv_setup_irq(char *irq_name, int cpu, int mmr_blade,
258 return -EBUSY; 258 return -EBUSY;
259 259
260 ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset, 260 ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset,
261 restrict); 261 limit);
262 if (ret == irq) 262 if (ret == irq)
263 uv_set_irq_2_mmr_info(irq, mmr_offset, mmr_blade); 263 uv_set_irq_2_mmr_info(irq, mmr_offset, mmr_blade);
264 else 264 else
diff --git a/arch/x86/kernel/verify_cpu_64.S b/arch/x86/kernel/verify_cpu_64.S
index 45b6f8a975a1..56a8c2a867d9 100644
--- a/arch/x86/kernel/verify_cpu_64.S
+++ b/arch/x86/kernel/verify_cpu_64.S
@@ -31,6 +31,7 @@
31 */ 31 */
32 32
33#include <asm/cpufeature.h> 33#include <asm/cpufeature.h>
34#include <asm/msr-index.h>
34 35
35verify_cpu: 36verify_cpu:
36 pushfl # Save caller passed flags 37 pushfl # Save caller passed flags
@@ -88,7 +89,7 @@ verify_cpu_sse_test:
88 je verify_cpu_sse_ok 89 je verify_cpu_sse_ok
89 test %di,%di 90 test %di,%di
90 jz verify_cpu_no_longmode # only try to force SSE on AMD 91 jz verify_cpu_no_longmode # only try to force SSE on AMD
91 movl $0xc0010015,%ecx # HWCR 92 movl $MSR_K7_HWCR,%ecx
92 rdmsr 93 rdmsr
93 btr $15,%eax # enable SSE 94 btr $15,%eax # enable SSE
94 wrmsr 95 wrmsr
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 2cc249718c46..d0bb52296fa3 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -97,7 +97,7 @@ SECTIONS
97 HEAD_TEXT 97 HEAD_TEXT
98#ifdef CONFIG_X86_32 98#ifdef CONFIG_X86_32
99 . = ALIGN(PAGE_SIZE); 99 . = ALIGN(PAGE_SIZE);
100 *(.text.page_aligned) 100 *(.text..page_aligned)
101#endif 101#endif
102 . = ALIGN(8); 102 . = ALIGN(8);
103 _stext = .; 103 _stext = .;
@@ -305,7 +305,7 @@ SECTIONS
305 . = ALIGN(PAGE_SIZE); 305 . = ALIGN(PAGE_SIZE);
306 .bss : AT(ADDR(.bss) - LOAD_OFFSET) { 306 .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
307 __bss_start = .; 307 __bss_start = .;
308 *(.bss.page_aligned) 308 *(.bss..page_aligned)
309 *(.bss) 309 *(.bss)
310 . = ALIGN(4); 310 . = ALIGN(4);
311 __bss_stop = .; 311 __bss_stop = .;
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 1c0c6ab9c60f..dcbb28c4b694 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -73,8 +73,8 @@ void update_vsyscall_tz(void)
73 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); 73 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
74} 74}
75 75
76void update_vsyscall(struct timespec *wall_time, struct clocksource *clock, 76void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
77 u32 mult) 77 struct clocksource *clock, u32 mult)
78{ 78{
79 unsigned long flags; 79 unsigned long flags;
80 80
@@ -87,7 +87,7 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock,
87 vsyscall_gtod_data.clock.shift = clock->shift; 87 vsyscall_gtod_data.clock.shift = clock->shift;
88 vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; 88 vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
89 vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; 89 vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
90 vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic; 90 vsyscall_gtod_data.wall_to_monotonic = *wtm;
91 vsyscall_gtod_data.wall_time_coarse = __current_kernel_time(); 91 vsyscall_gtod_data.wall_time_coarse = __current_kernel_time();
92 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); 92 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
93} 93}
@@ -169,13 +169,18 @@ int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)
169 * unlikely */ 169 * unlikely */
170time_t __vsyscall(1) vtime(time_t *t) 170time_t __vsyscall(1) vtime(time_t *t)
171{ 171{
172 struct timeval tv; 172 unsigned seq;
173 time_t result; 173 time_t result;
174 if (unlikely(!__vsyscall_gtod_data.sysctl_enabled)) 174 if (unlikely(!__vsyscall_gtod_data.sysctl_enabled))
175 return time_syscall(t); 175 return time_syscall(t);
176 176
177 vgettimeofday(&tv, NULL); 177 do {
178 result = tv.tv_sec; 178 seq = read_seqbegin(&__vsyscall_gtod_data.lock);
179
180 result = __vsyscall_gtod_data.wall_time_sec;
181
182 } while (read_seqretry(&__vsyscall_gtod_data.lock, seq));
183
179 if (t) 184 if (t)
180 *t = result; 185 *t = result;
181 return result; 186 return result;
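
vtime() now follows the standard seqlock read-side pattern against __vsyscall_gtod_data.lock: snapshot the value, then retry if read_seqretry() reports that a writer ran in between. A generic sketch of that reader loop (the example_* names are illustrative):

	#include <linux/seqlock.h>
	#include <linux/time.h>

	static DEFINE_SEQLOCK(example_lock);
	static time_t example_sec;

	static time_t read_example_sec(void)
	{
		unsigned seq;
		time_t sec;

		do {
			seq = read_seqbegin(&example_lock);	/* open read-side section */
			sec = example_sec;			/* copy out the protected value */
		} while (read_seqretry(&example_lock, seq));	/* a writer interleaved: retry */

		return sec;
	}
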
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 693920b22496..1b950d151e58 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -54,7 +54,6 @@ EXPORT_SYMBOL(memcpy);
54EXPORT_SYMBOL(__memcpy); 54EXPORT_SYMBOL(__memcpy);
55 55
56EXPORT_SYMBOL(empty_zero_page); 56EXPORT_SYMBOL(empty_zero_page);
57EXPORT_SYMBOL(init_level4_pgt);
58#ifndef CONFIG_PARAVIRT 57#ifndef CONFIG_PARAVIRT
59EXPORT_SYMBOL(native_load_gs_index); 58EXPORT_SYMBOL(native_load_gs_index);
60#endif 59#endif
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 61a1e8c7e19f..cd6da6bf3eca 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -5,6 +5,7 @@
5 */ 5 */
6#include <linux/init.h> 6#include <linux/init.h>
7#include <linux/ioport.h> 7#include <linux/ioport.h>
8#include <linux/module.h>
8 9
9#include <asm/bios_ebda.h> 10#include <asm/bios_ebda.h>
10#include <asm/paravirt.h> 11#include <asm/paravirt.h>
@@ -85,6 +86,7 @@ struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = {
85}; 86};
86 87
87static void default_nmi_init(void) { }; 88static void default_nmi_init(void) { };
89static int default_i8042_detect(void) { return 1; };
88 90
89struct x86_platform_ops x86_platform = { 91struct x86_platform_ops x86_platform = {
90 .calibrate_tsc = native_calibrate_tsc, 92 .calibrate_tsc = native_calibrate_tsc,
@@ -92,5 +94,8 @@ struct x86_platform_ops x86_platform = {
92 .set_wallclock = mach_set_rtc_mmss, 94 .set_wallclock = mach_set_rtc_mmss,
93 .iommu_shutdown = iommu_shutdown_noop, 95 .iommu_shutdown = iommu_shutdown_noop,
94 .is_untracked_pat_range = is_ISA_range, 96 .is_untracked_pat_range = is_ISA_range,
95 .nmi_init = default_nmi_init 97 .nmi_init = default_nmi_init,
98 .i8042_detect = default_i8042_detect
96}; 99};
100
101EXPORT_SYMBOL_GPL(x86_platform);
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 782c3a362ec6..9c253bd65e24 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -16,11 +16,88 @@
16 */ 16 */
17u64 pcntxt_mask; 17u64 pcntxt_mask;
18 18
19/*
20 * Represents init state for the supported extended state.
21 */
22static struct xsave_struct *init_xstate_buf;
23
19struct _fpx_sw_bytes fx_sw_reserved; 24struct _fpx_sw_bytes fx_sw_reserved;
20#ifdef CONFIG_IA32_EMULATION 25#ifdef CONFIG_IA32_EMULATION
21struct _fpx_sw_bytes fx_sw_reserved_ia32; 26struct _fpx_sw_bytes fx_sw_reserved_ia32;
22#endif 27#endif
23 28
29static unsigned int *xstate_offsets, *xstate_sizes, xstate_features;
30
31/*
 32 * If a processor implementation discerns that a processor state component is
33 * in its initialized state it may modify the corresponding bit in the
 34 * xsave_hdr.xstate_bv as '0', without modifying the corresponding memory
35 * layout in the case of xsaveopt. While presenting the xstate information to
36 * the user, we always ensure that the memory layout of a feature will be in
37 * the init state if the corresponding header bit is zero. This is to ensure
38 * that the user doesn't see some stale state in the memory layout during
39 * signal handling, debugging etc.
40 */
41void __sanitize_i387_state(struct task_struct *tsk)
42{
43 u64 xstate_bv;
44 int feature_bit = 0x2;
45 struct i387_fxsave_struct *fx = &tsk->thread.fpu.state->fxsave;
46
47 if (!fx)
48 return;
49
50 BUG_ON(task_thread_info(tsk)->status & TS_USEDFPU);
51
52 xstate_bv = tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv;
53
54 /*
55 * None of the feature bits are in init state. So nothing else
 56 * to do for us, as the memory layout is up to date.
57 */
58 if ((xstate_bv & pcntxt_mask) == pcntxt_mask)
59 return;
60
61 /*
62 * FP is in init state
63 */
64 if (!(xstate_bv & XSTATE_FP)) {
65 fx->cwd = 0x37f;
66 fx->swd = 0;
67 fx->twd = 0;
68 fx->fop = 0;
69 fx->rip = 0;
70 fx->rdp = 0;
71 memset(&fx->st_space[0], 0, 128);
72 }
73
74 /*
75 * SSE is in init state
76 */
77 if (!(xstate_bv & XSTATE_SSE))
78 memset(&fx->xmm_space[0], 0, 256);
79
80 xstate_bv = (pcntxt_mask & ~xstate_bv) >> 2;
81
82 /*
83 * Update all the other memory layouts for which the corresponding
84 * header bit is in the init state.
85 */
86 while (xstate_bv) {
87 if (xstate_bv & 0x1) {
88 int offset = xstate_offsets[feature_bit];
89 int size = xstate_sizes[feature_bit];
90
91 memcpy(((void *) fx) + offset,
92 ((void *) init_xstate_buf) + offset,
93 size);
94 }
95
96 xstate_bv >>= 1;
97 feature_bit++;
98 }
99}
100
24/* 101/*
25 * Check for the presence of extended state information in the 102 * Check for the presence of extended state information in the
26 * user fpstate pointer in the sigcontext. 103 * user fpstate pointer in the sigcontext.
@@ -36,15 +113,14 @@ int check_for_xstate(struct i387_fxsave_struct __user *buf,
36 113
37 err = __copy_from_user(fx_sw_user, &buf->sw_reserved[0], 114 err = __copy_from_user(fx_sw_user, &buf->sw_reserved[0],
38 sizeof(struct _fpx_sw_bytes)); 115 sizeof(struct _fpx_sw_bytes));
39
40 if (err) 116 if (err)
41 return err; 117 return -EFAULT;
42 118
43 /* 119 /*
44 * First Magic check failed. 120 * First Magic check failed.
45 */ 121 */
46 if (fx_sw_user->magic1 != FP_XSTATE_MAGIC1) 122 if (fx_sw_user->magic1 != FP_XSTATE_MAGIC1)
47 return -1; 123 return -EINVAL;
48 124
49 /* 125 /*
50 * Check for error scenarios. 126 * Check for error scenarios.
@@ -52,19 +128,21 @@ int check_for_xstate(struct i387_fxsave_struct __user *buf,
52 if (fx_sw_user->xstate_size < min_xstate_size || 128 if (fx_sw_user->xstate_size < min_xstate_size ||
53 fx_sw_user->xstate_size > xstate_size || 129 fx_sw_user->xstate_size > xstate_size ||
54 fx_sw_user->xstate_size > fx_sw_user->extended_size) 130 fx_sw_user->xstate_size > fx_sw_user->extended_size)
55 return -1; 131 return -EINVAL;
56 132
57 err = __get_user(magic2, (__u32 *) (((void *)fpstate) + 133 err = __get_user(magic2, (__u32 *) (((void *)fpstate) +
58 fx_sw_user->extended_size - 134 fx_sw_user->extended_size -
59 FP_XSTATE_MAGIC2_SIZE)); 135 FP_XSTATE_MAGIC2_SIZE));
136 if (err)
137 return err;
60 /* 138 /*
61 * Check for the presence of second magic word at the end of memory 139 * Check for the presence of second magic word at the end of memory
62 * layout. This detects the case where the user just copied the legacy 140 * layout. This detects the case where the user just copied the legacy
63 * fpstate layout with out copying the extended state information 141 * fpstate layout with out copying the extended state information
64 * in the memory layout. 142 * in the memory layout.
65 */ 143 */
66 if (err || magic2 != FP_XSTATE_MAGIC2) 144 if (magic2 != FP_XSTATE_MAGIC2)
67 return -1; 145 return -EFAULT;
68 146
69 return 0; 147 return 0;
70} 148}
@@ -91,15 +169,7 @@ int save_i387_xstate(void __user *buf)
91 return 0; 169 return 0;
92 170
93 if (task_thread_info(tsk)->status & TS_USEDFPU) { 171 if (task_thread_info(tsk)->status & TS_USEDFPU) {
94 /* 172 if (use_xsave())
95 * Start with clearing the user buffer. This will present a
96 * clean context for the bytes not touched by the fxsave/xsave.
97 */
98 err = __clear_user(buf, sig_xstate_size);
99 if (err)
100 return err;
101
102 if (task_thread_info(tsk)->status & TS_XSAVE)
103 err = xsave_user(buf); 173 err = xsave_user(buf);
104 else 174 else
105 err = fxsave_user(buf); 175 err = fxsave_user(buf);
@@ -109,14 +179,15 @@ int save_i387_xstate(void __user *buf)
109 task_thread_info(tsk)->status &= ~TS_USEDFPU; 179 task_thread_info(tsk)->status &= ~TS_USEDFPU;
110 stts(); 180 stts();
111 } else { 181 } else {
112 if (__copy_to_user(buf, &tsk->thread.xstate->fxsave, 182 sanitize_i387_state(tsk);
183 if (__copy_to_user(buf, &tsk->thread.fpu.state->fxsave,
113 xstate_size)) 184 xstate_size))
114 return -1; 185 return -1;
115 } 186 }
116 187
117 clear_used_math(); /* trigger finit */ 188 clear_used_math(); /* trigger finit */
118 189
119 if (task_thread_info(tsk)->status & TS_XSAVE) { 190 if (use_xsave()) {
120 struct _fpstate __user *fx = buf; 191 struct _fpstate __user *fx = buf;
121 struct _xstate __user *x = buf; 192 struct _xstate __user *x = buf;
122 u64 xstate_bv; 193 u64 xstate_bv;
@@ -184,8 +255,8 @@ static int restore_user_xstate(void __user *buf)
184 * init the state skipped by the user. 255 * init the state skipped by the user.
185 */ 256 */
186 mask = pcntxt_mask & ~mask; 257 mask = pcntxt_mask & ~mask;
187 258 if (unlikely(mask))
188 xrstor_state(init_xstate_buf, mask); 259 xrstor_state(init_xstate_buf, mask);
189 260
190 return 0; 261 return 0;
191 262
@@ -225,7 +296,7 @@ int restore_i387_xstate(void __user *buf)
225 clts(); 296 clts();
226 task_thread_info(current)->status |= TS_USEDFPU; 297 task_thread_info(current)->status |= TS_USEDFPU;
227 } 298 }
228 if (task_thread_info(tsk)->status & TS_XSAVE) 299 if (use_xsave())
229 err = restore_user_xstate(buf); 300 err = restore_user_xstate(buf);
230 else 301 else
231 err = fxrstor_checking((__force struct i387_fxsave_struct *) 302 err = fxrstor_checking((__force struct i387_fxsave_struct *)
@@ -274,11 +345,6 @@ static void prepare_fx_sw_frame(void)
274#endif 345#endif
275} 346}
276 347
277/*
278 * Represents init state for the supported extended state.
279 */
280struct xsave_struct *init_xstate_buf;
281
282#ifdef CONFIG_X86_64 348#ifdef CONFIG_X86_64
283unsigned int sig_xstate_size = sizeof(struct _fpstate); 349unsigned int sig_xstate_size = sizeof(struct _fpstate);
284#endif 350#endif
@@ -286,37 +352,77 @@ unsigned int sig_xstate_size = sizeof(struct _fpstate);
286/* 352/*
287 * Enable the extended processor state save/restore feature 353 * Enable the extended processor state save/restore feature
288 */ 354 */
289void __cpuinit xsave_init(void) 355static inline void xstate_enable(void)
290{ 356{
291 if (!cpu_has_xsave)
292 return;
293
294 set_in_cr4(X86_CR4_OSXSAVE); 357 set_in_cr4(X86_CR4_OSXSAVE);
295
296 /*
297 * Enable all the features that the HW is capable of
298 * and the Linux kernel is aware of.
299 */
300 xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask); 358 xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask);
301} 359}
302 360
303/* 361/*
362 * Record the offsets and sizes of different state managed by the xsave
363 * memory layout.
364 */
365static void __init setup_xstate_features(void)
366{
367 int eax, ebx, ecx, edx, leaf = 0x2;
368
369 xstate_features = fls64(pcntxt_mask);
370 xstate_offsets = alloc_bootmem(xstate_features * sizeof(int));
371 xstate_sizes = alloc_bootmem(xstate_features * sizeof(int));
372
373 do {
374 cpuid_count(XSTATE_CPUID, leaf, &eax, &ebx, &ecx, &edx);
375
376 if (eax == 0)
377 break;
378
379 xstate_offsets[leaf] = ebx;
380 xstate_sizes[leaf] = eax;
381
382 leaf++;
383 } while (1);
384}
385
386/*
304 * setup the xstate image representing the init state 387 * setup the xstate image representing the init state
305 */ 388 */
306static void __init setup_xstate_init(void) 389static void __init setup_xstate_init(void)
307{ 390{
391 setup_xstate_features();
392
393 /*
394 * Setup init_xstate_buf to represent the init state of
395 * all the features managed by the xsave
396 */
308 init_xstate_buf = alloc_bootmem(xstate_size); 397 init_xstate_buf = alloc_bootmem(xstate_size);
309 init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT; 398 init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT;
399
400 clts();
401 /*
402 * Init all the features state with header_bv being 0x0
403 */
404 xrstor_state(init_xstate_buf, -1);
405 /*
406 * Dump the init state again. This is to identify the init state
 407 * of any feature which is not represented by all zeros.
408 */
409 xsave_state(init_xstate_buf, -1);
410 stts();
310} 411}
311 412
312/* 413/*
313 * Enable and initialize the xsave feature. 414 * Enable and initialize the xsave feature.
314 */ 415 */
315void __ref xsave_cntxt_init(void) 416static void __init xstate_enable_boot_cpu(void)
316{ 417{
317 unsigned int eax, ebx, ecx, edx; 418 unsigned int eax, ebx, ecx, edx;
318 419
319 cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx); 420 if (boot_cpu_data.cpuid_level < XSTATE_CPUID) {
421 WARN(1, KERN_ERR "XSTATE_CPUID missing\n");
422 return;
423 }
424
425 cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
320 pcntxt_mask = eax + ((u64)edx << 32); 426 pcntxt_mask = eax + ((u64)edx << 32);
321 427
322 if ((pcntxt_mask & XSTATE_FPSSE) != XSTATE_FPSSE) { 428 if ((pcntxt_mask & XSTATE_FPSSE) != XSTATE_FPSSE) {
@@ -329,12 +435,13 @@ void __ref xsave_cntxt_init(void)
329 * Support only the state known to OS. 435 * Support only the state known to OS.
330 */ 436 */
331 pcntxt_mask = pcntxt_mask & XCNTXT_MASK; 437 pcntxt_mask = pcntxt_mask & XCNTXT_MASK;
332 xsave_init(); 438
439 xstate_enable();
333 440
334 /* 441 /*
335 * Recompute the context size for enabled features 442 * Recompute the context size for enabled features
336 */ 443 */
337 cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx); 444 cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
338 xstate_size = ebx; 445 xstate_size = ebx;
339 446
340 update_regset_xstate_info(xstate_size, pcntxt_mask); 447 update_regset_xstate_info(xstate_size, pcntxt_mask);
@@ -346,3 +453,23 @@ void __ref xsave_cntxt_init(void)
346 "cntxt size 0x%x\n", 453 "cntxt size 0x%x\n",
347 pcntxt_mask, xstate_size); 454 pcntxt_mask, xstate_size);
348} 455}
456
457/*
458 * For the very first instance, this calls xstate_enable_boot_cpu();
459 * for all subsequent instances, this calls xstate_enable().
460 *
461 * This is somewhat obfuscated due to the lack of powerful enough
462 * overrides for the section checks.
463 */
464void __cpuinit xsave_init(void)
465{
466 static __refdata void (*next_func)(void) = xstate_enable_boot_cpu;
467 void (*this_func)(void);
468
469 if (!cpu_has_xsave)
470 return;
471
472 this_func = next_func;
473 next_func = xstate_enable;
474 this_func();
475}
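
The closing hunk's xsave_init() is a call-once dispatcher: a static function pointer starts at xstate_enable_boot_cpu() and is flipped to xstate_enable() after the first invocation, which (as the comment notes) works around the lack of finer-grained section-mismatch overrides. A stripped-down sketch of the idiom with illustrative names:

	static void fast_path(void)
	{
		/* work every CPU needs (xstate_enable() in the patch) */
	}

	static void boot_cpu_setup(void)
	{
		/* one-time discovery/allocation (xstate_enable_boot_cpu() in the patch) */
		fast_path();
	}

	static void example_init(void)
	{
		static void (*next_func)(void) = boot_cpu_setup;
		void (*this_func)(void);

		this_func = next_func;	/* what this particular call runs */
		next_func = fast_path;	/* all later calls skip the one-time setup */
		this_func();
	}
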