aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/kernel
diff options
context:
space:
mode:
author: Ingo Molnar <mingo@elte.hu> 2009-06-17 06:52:15 -0400
committer: Ingo Molnar <mingo@elte.hu> 2009-06-17 06:56:49 -0400
commit: eadb8a091b27a840de7450f84ecff5ef13476424 (patch)
tree: 58c3782d40def63baa8167f3d31e3048cb4c7660 /arch/x86/kernel
parent: 73874005cd8800440be4299bd095387fff4b90ac (diff)
parent: 65795efbd380a832ae508b04dba8f8e53f0b84d9 (diff)
Merge branch 'linus' into tracing/hw-breakpoints
Conflicts:
	arch/x86/Kconfig
	arch/x86/kernel/traps.c
	arch/x86/power/cpu.c
	arch/x86/power/cpu_32.c
	kernel/Makefile

Semantic conflict:
	arch/x86/kernel/hw_breakpoint.c

Merge reason: Resolve the conflicts, move from put_cpu_no_sched() to
              put_cpu() in arch/x86/kernel/hw_breakpoint.c.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--arch/x86/kernel/Makefile7
-rw-r--r--arch/x86/kernel/acpi/boot.c156
-rw-r--r--arch/x86/kernel/acpi/realmode/Makefile2
-rw-r--r--arch/x86/kernel/acpi/realmode/bioscall.S1
-rw-r--r--arch/x86/kernel/acpi/realmode/regs.c1
-rw-r--r--arch/x86/kernel/acpi/sleep.c2
-rw-r--r--arch/x86/kernel/amd_iommu.c500
-rw-r--r--arch/x86/kernel/amd_iommu_init.c273
-rw-r--r--arch/x86/kernel/apic/apic.c318
-rw-r--r--arch/x86/kernel/apic/apic_flat_64.c4
-rw-r--r--arch/x86/kernel/apic/es7000_32.c10
-rw-r--r--arch/x86/kernel/apic/io_apic.c908
-rw-r--r--arch/x86/kernel/apic/nmi.c4
-rw-r--r--arch/x86/kernel/apic/probe_32.c1
-rw-r--r--arch/x86/kernel/apic/probe_64.c2
-rw-r--r--arch/x86/kernel/apic/summit_32.c7
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c2
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c22
-rw-r--r--arch/x86/kernel/apm_32.c14
-rw-r--r--arch/x86/kernel/asm-offsets_32.c2
-rw-r--r--arch/x86/kernel/asm-offsets_64.c1
-rw-r--r--arch/x86/kernel/cpu/Makefile12
-rw-r--r--arch/x86/kernel/cpu/amd.c12
-rw-r--r--arch/x86/kernel/cpu/common.c43
-rw-r--r--arch/x86/kernel/cpu/cpu_debug.c431
-rw-r--r--arch/x86/kernel/cpu/cpufreq/Kconfig9
-rw-r--r--arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c12
-rw-r--r--arch/x86/kernel/cpu/cpufreq/p4-clockmod.c1
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k7.c4
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.c57
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c2
-rw-r--r--arch/x86/kernel/cpu/intel.c29
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c153
-rw-r--r--arch/x86/kernel/cpu/mcheck/Makefile10
-rw-r--r--arch/x86/kernel/cpu/mcheck/k7.c42
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-inject.c127
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-internal.h15
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-severity.c218
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c1964
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.h26
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_32.c76
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_64.c1187
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd_64.c203
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_intel.c74
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_intel_64.c66
-rw-r--r--arch/x86/kernel/cpu/mcheck/non-fatal.c57
-rw-r--r--arch/x86/kernel/cpu/mcheck/p4.c86
-rw-r--r--arch/x86/kernel/cpu/mcheck/p5.c48
-rw-r--r--arch/x86/kernel/cpu/mcheck/p6.c26
-rw-r--r--arch/x86/kernel/cpu/mcheck/therm_throt.c73
-rw-r--r--arch/x86/kernel/cpu/mcheck/threshold.c2
-rw-r--r--arch/x86/kernel/cpu/mcheck/winchip.c17
-rw-r--r--arch/x86/kernel/cpu/mtrr/cleanup.c4
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c30
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c2
-rw-r--r--arch/x86/kernel/cpu/mtrr/mtrr.h15
-rw-r--r--arch/x86/kernel/cpu/mtrr/state.c6
-rw-r--r--arch/x86/kernel/cpu/perf_counter.c1711
-rw-r--r--arch/x86/kernel/cpu/perfctr-watchdog.c4
-rw-r--r--arch/x86/kernel/cpuid.c6
-rw-r--r--arch/x86/kernel/dumpstack.h1
-rw-r--r--arch/x86/kernel/e820.c46
-rw-r--r--arch/x86/kernel/early-quirks.c2
-rw-r--r--arch/x86/kernel/entry_64.S21
-rw-r--r--arch/x86/kernel/ftrace.c2
-rw-r--r--arch/x86/kernel/head_32.S7
-rw-r--r--arch/x86/kernel/hw_breakpoint.c4
-rw-r--r--arch/x86/kernel/i8253.c1
-rw-r--r--arch/x86/kernel/init_task.c1
-rw-r--r--arch/x86/kernel/irq.c47
-rw-r--r--arch/x86/kernel/irqinit.c (renamed from arch/x86/kernel/irqinit_32.c)155
-rw-r--r--arch/x86/kernel/irqinit_64.c177
-rw-r--r--arch/x86/kernel/kgdb.c2
-rw-r--r--arch/x86/kernel/kvm.c6
-rw-r--r--arch/x86/kernel/microcode_amd.c70
-rw-r--r--arch/x86/kernel/microcode_core.c330
-rw-r--r--arch/x86/kernel/microcode_intel.c90
-rw-r--r--arch/x86/kernel/module.c (renamed from arch/x86/kernel/module_64.c)82
-rw-r--r--arch/x86/kernel/module_32.c152
-rw-r--r--arch/x86/kernel/mpparse.c34
-rw-r--r--arch/x86/kernel/msr.c6
-rw-r--r--arch/x86/kernel/paravirt.c58
-rw-r--r--arch/x86/kernel/pci-calgary_64.c54
-rw-r--r--arch/x86/kernel/pci-gart_64.c55
-rw-r--r--arch/x86/kernel/pci-swiotlb.c2
-rw-r--r--arch/x86/kernel/process.c17
-rw-r--r--arch/x86/kernel/process_32.c17
-rw-r--r--arch/x86/kernel/process_64.c17
-rw-r--r--arch/x86/kernel/quirks.c37
-rw-r--r--arch/x86/kernel/reboot.c17
-rw-r--r--arch/x86/kernel/setup.c55
-rw-r--r--arch/x86/kernel/setup_percpu.c12
-rw-r--r--arch/x86/kernel/signal.c7
-rw-r--r--arch/x86/kernel/smp.c51
-rw-r--r--arch/x86/kernel/smpboot.c24
-rw-r--r--arch/x86/kernel/stacktrace.c7
-rw-r--r--arch/x86/kernel/syscall_table_32.S2
-rw-r--r--arch/x86/kernel/tlb_uv.c17
-rw-r--r--arch/x86/kernel/traps.c31
-rw-r--r--arch/x86/kernel/tsc.c20
-rw-r--r--arch/x86/kernel/tsc_sync.c14
-rw-r--r--arch/x86/kernel/vm86_32.c13
-rw-r--r--arch/x86/kernel/vmi_32.c20
-rw-r--r--arch/x86/kernel/vmlinux.lds.S432
-rw-r--r--arch/x86/kernel/vmlinux_32.lds.S229
-rw-r--r--arch/x86/kernel/vmlinux_64.lds.S298
-rw-r--r--arch/x86/kernel/vsyscall_64.c8
107 files changed, 7329 insertions, 4518 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index cbc781829173..b67efd1cf59b 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -28,7 +28,7 @@ CFLAGS_paravirt.o := $(nostackp)
28obj-y := process_$(BITS).o signal.o entry_$(BITS).o 28obj-y := process_$(BITS).o signal.o entry_$(BITS).o
29obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o 29obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
30obj-y += time_$(BITS).o ioport.o ldt.o dumpstack.o 30obj-y += time_$(BITS).o ioport.o ldt.o dumpstack.o
31obj-y += setup.o i8259.o irqinit_$(BITS).o 31obj-y += setup.o i8259.o irqinit.o
32obj-$(CONFIG_X86_VISWS) += visws_quirks.o 32obj-$(CONFIG_X86_VISWS) += visws_quirks.o
33obj-$(CONFIG_X86_32) += probe_roms_32.o 33obj-$(CONFIG_X86_32) += probe_roms_32.o
34obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o 34obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o
@@ -73,7 +73,7 @@ obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o
73obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o 73obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o
74obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o 74obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
75obj-$(CONFIG_KPROBES) += kprobes.o 75obj-$(CONFIG_KPROBES) += kprobes.o
76obj-$(CONFIG_MODULES) += module_$(BITS).o 76obj-$(CONFIG_MODULES) += module.o
77obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o 77obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o
78obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o 78obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o
79obj-$(CONFIG_KGDB) += kgdb.o 79obj-$(CONFIG_KGDB) += kgdb.o
@@ -90,7 +90,8 @@ obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o
90obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o 90obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o
91obj-$(CONFIG_KVM_GUEST) += kvm.o 91obj-$(CONFIG_KVM_GUEST) += kvm.o
92obj-$(CONFIG_KVM_CLOCK) += kvmclock.o 92obj-$(CONFIG_KVM_CLOCK) += kvmclock.o
93obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o paravirt-spinlocks.o 93obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o
94obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o
94obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o 95obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o
95 96
96obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o 97obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 723989d7f802..631086159c53 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -33,6 +33,7 @@
33#include <linux/irq.h> 33#include <linux/irq.h>
34#include <linux/bootmem.h> 34#include <linux/bootmem.h>
35#include <linux/ioport.h> 35#include <linux/ioport.h>
36#include <linux/pci.h>
36 37
37#include <asm/pgtable.h> 38#include <asm/pgtable.h>
38#include <asm/io_apic.h> 39#include <asm/io_apic.h>
@@ -522,7 +523,7 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
522 * success: return IRQ number (>=0) 523 * success: return IRQ number (>=0)
523 * failure: return < 0 524 * failure: return < 0
524 */ 525 */
525int acpi_register_gsi(u32 gsi, int triggering, int polarity) 526int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
526{ 527{
527 unsigned int irq; 528 unsigned int irq;
528 unsigned int plat_gsi = gsi; 529 unsigned int plat_gsi = gsi;
@@ -532,14 +533,14 @@ int acpi_register_gsi(u32 gsi, int triggering, int polarity)
532 * Make sure all (legacy) PCI IRQs are set as level-triggered. 533 * Make sure all (legacy) PCI IRQs are set as level-triggered.
533 */ 534 */
534 if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) { 535 if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) {
535 if (triggering == ACPI_LEVEL_SENSITIVE) 536 if (trigger == ACPI_LEVEL_SENSITIVE)
536 eisa_set_level_irq(gsi); 537 eisa_set_level_irq(gsi);
537 } 538 }
538#endif 539#endif
539 540
540#ifdef CONFIG_X86_IO_APIC 541#ifdef CONFIG_X86_IO_APIC
541 if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC) { 542 if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC) {
542 plat_gsi = mp_register_gsi(gsi, triggering, polarity); 543 plat_gsi = mp_register_gsi(dev, gsi, trigger, polarity);
543 } 544 }
544#endif 545#endif
545 acpi_gsi_to_irq(plat_gsi, &irq); 546 acpi_gsi_to_irq(plat_gsi, &irq);
@@ -903,10 +904,8 @@ extern int es7000_plat;
903#endif 904#endif
904 905
905static struct { 906static struct {
906 int apic_id;
907 int gsi_base; 907 int gsi_base;
908 int gsi_end; 908 int gsi_end;
909 DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
910} mp_ioapic_routing[MAX_IO_APICS]; 909} mp_ioapic_routing[MAX_IO_APICS];
911 910
912int mp_find_ioapic(int gsi) 911int mp_find_ioapic(int gsi)
@@ -986,16 +985,12 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
986 985
987 set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); 986 set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
988 mp_ioapics[idx].apicid = uniq_ioapic_id(id); 987 mp_ioapics[idx].apicid = uniq_ioapic_id(id);
989#ifdef CONFIG_X86_32
990 mp_ioapics[idx].apicver = io_apic_get_version(idx); 988 mp_ioapics[idx].apicver = io_apic_get_version(idx);
991#else 989
992 mp_ioapics[idx].apicver = 0;
993#endif
994 /* 990 /*
995 * Build basic GSI lookup table to facilitate gsi->io_apic lookups 991 * Build basic GSI lookup table to facilitate gsi->io_apic lookups
996 * and to prevent reprogramming of IOAPIC pins (PCI GSIs). 992 * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
997 */ 993 */
998 mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].apicid;
999 mp_ioapic_routing[idx].gsi_base = gsi_base; 994 mp_ioapic_routing[idx].gsi_base = gsi_base;
1000 mp_ioapic_routing[idx].gsi_end = gsi_base + 995 mp_ioapic_routing[idx].gsi_end = gsi_base +
1001 io_apic_get_redir_entries(idx); 996 io_apic_get_redir_entries(idx);
@@ -1158,26 +1153,52 @@ void __init mp_config_acpi_legacy_irqs(void)
1158 } 1153 }
1159} 1154}
1160 1155
1161int mp_register_gsi(u32 gsi, int triggering, int polarity) 1156static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger,
1157 int polarity)
1162{ 1158{
1159#ifdef CONFIG_X86_MPPARSE
1160 struct mpc_intsrc mp_irq;
1161 struct pci_dev *pdev;
1162 unsigned char number;
1163 unsigned int devfn;
1163 int ioapic; 1164 int ioapic;
1164 int ioapic_pin; 1165 u8 pin;
1165#ifdef CONFIG_X86_32
1166#define MAX_GSI_NUM 4096
1167#define IRQ_COMPRESSION_START 64
1168 1166
1169 static int pci_irq = IRQ_COMPRESSION_START; 1167 if (!acpi_ioapic)
1170 /* 1168 return 0;
1171 * Mapping between Global System Interrupts, which 1169 if (!dev)
1172 * represent all possible interrupts, and IRQs 1170 return 0;
1173 * assigned to actual devices. 1171 if (dev->bus != &pci_bus_type)
1174 */ 1172 return 0;
1175 static int gsi_to_irq[MAX_GSI_NUM]; 1173
1176#else 1174 pdev = to_pci_dev(dev);
1175 number = pdev->bus->number;
1176 devfn = pdev->devfn;
1177 pin = pdev->pin;
1178 /* print the entry should happen on mptable identically */
1179 mp_irq.type = MP_INTSRC;
1180 mp_irq.irqtype = mp_INT;
1181 mp_irq.irqflag = (trigger == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) |
1182 (polarity == ACPI_ACTIVE_HIGH ? 1 : 3);
1183 mp_irq.srcbus = number;
1184 mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);
1185 ioapic = mp_find_ioapic(gsi);
1186 mp_irq.dstapic = mp_ioapics[ioapic].apicid;
1187 mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi);
1188
1189 save_mp_irq(&mp_irq);
1190#endif
1191 return 0;
1192}
1193
1194int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
1195{
1196 int ioapic;
1197 int ioapic_pin;
1198 struct io_apic_irq_attr irq_attr;
1177 1199
1178 if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) 1200 if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
1179 return gsi; 1201 return gsi;
1180#endif
1181 1202
1182 /* Don't set up the ACPI SCI because it's already set up */ 1203 /* Don't set up the ACPI SCI because it's already set up */
1183 if (acpi_gbl_FADT.sci_interrupt == gsi) 1204 if (acpi_gbl_FADT.sci_interrupt == gsi)
@@ -1196,93 +1217,22 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity)
1196 gsi = ioapic_renumber_irq(ioapic, gsi); 1217 gsi = ioapic_renumber_irq(ioapic, gsi);
1197#endif 1218#endif
1198 1219
1199 /*
1200 * Avoid pin reprogramming. PRTs typically include entries
1201 * with redundant pin->gsi mappings (but unique PCI devices);
1202 * we only program the IOAPIC on the first.
1203 */
1204 if (ioapic_pin > MP_MAX_IOAPIC_PIN) { 1220 if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
1205 printk(KERN_ERR "Invalid reference to IOAPIC pin " 1221 printk(KERN_ERR "Invalid reference to IOAPIC pin "
1206 "%d-%d\n", mp_ioapic_routing[ioapic].apic_id, 1222 "%d-%d\n", mp_ioapics[ioapic].apicid,
1207 ioapic_pin); 1223 ioapic_pin);
1208 return gsi; 1224 return gsi;
1209 } 1225 }
1210 if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) {
1211 pr_debug("Pin %d-%d already programmed\n",
1212 mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
1213#ifdef CONFIG_X86_32
1214 return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
1215#else
1216 return gsi;
1217#endif
1218 }
1219
1220 set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed);
1221#ifdef CONFIG_X86_32
1222 /*
1223 * For GSI >= 64, use IRQ compression
1224 */
1225 if ((gsi >= IRQ_COMPRESSION_START)
1226 && (triggering == ACPI_LEVEL_SENSITIVE)) {
1227 /*
1228 * For PCI devices assign IRQs in order, avoiding gaps
1229 * due to unused I/O APIC pins.
1230 */
1231 int irq = gsi;
1232 if (gsi < MAX_GSI_NUM) {
1233 /*
1234 * Retain the VIA chipset work-around (gsi > 15), but
1235 * avoid a problem where the 8254 timer (IRQ0) is setup
1236 * via an override (so it's not on pin 0 of the ioapic),
1237 * and at the same time, the pin 0 interrupt is a PCI
1238 * type. The gsi > 15 test could cause these two pins
1239 * to be shared as IRQ0, and they are not shareable.
1240 * So test for this condition, and if necessary, avoid
1241 * the pin collision.
1242 */
1243 gsi = pci_irq++;
1244 /*
1245 * Don't assign IRQ used by ACPI SCI
1246 */
1247 if (gsi == acpi_gbl_FADT.sci_interrupt)
1248 gsi = pci_irq++;
1249 gsi_to_irq[irq] = gsi;
1250 } else {
1251 printk(KERN_ERR "GSI %u is too high\n", gsi);
1252 return gsi;
1253 }
1254 }
1255#endif
1256 io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
1257 triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
1258 polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
1259 return gsi;
1260}
1261 1226
1262int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin, 1227 if (enable_update_mptable)
1263 u32 gsi, int triggering, int polarity) 1228 mp_config_acpi_gsi(dev, gsi, trigger, polarity);
1264{
1265#ifdef CONFIG_X86_MPPARSE
1266 struct mpc_intsrc mp_irq;
1267 int ioapic;
1268 1229
1269 if (!acpi_ioapic) 1230 set_io_apic_irq_attr(&irq_attr, ioapic, ioapic_pin,
1270 return 0; 1231 trigger == ACPI_EDGE_SENSITIVE ? 0 : 1,
1232 polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
1233 io_apic_set_pci_routing(dev, gsi, &irq_attr);
1271 1234
1272 /* print the entry should happen on mptable identically */ 1235 return gsi;
1273 mp_irq.type = MP_INTSRC;
1274 mp_irq.irqtype = mp_INT;
1275 mp_irq.irqflag = (triggering == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) |
1276 (polarity == ACPI_ACTIVE_HIGH ? 1 : 3);
1277 mp_irq.srcbus = number;
1278 mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);
1279 ioapic = mp_find_ioapic(gsi);
1280 mp_irq.dstapic = mp_ioapic_routing[ioapic].apic_id;
1281 mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi);
1282
1283 save_mp_irq(&mp_irq);
1284#endif
1285 return 0;
1286} 1236}
1287 1237
1288/* 1238/*
diff --git a/arch/x86/kernel/acpi/realmode/Makefile b/arch/x86/kernel/acpi/realmode/Makefile
index 1c31cc0e9def..167bc16ce0e5 100644
--- a/arch/x86/kernel/acpi/realmode/Makefile
+++ b/arch/x86/kernel/acpi/realmode/Makefile
@@ -9,7 +9,7 @@
9always := wakeup.bin 9always := wakeup.bin
10targets := wakeup.elf wakeup.lds 10targets := wakeup.elf wakeup.lds
11 11
12wakeup-y += wakeup.o wakemain.o video-mode.o copy.o 12wakeup-y += wakeup.o wakemain.o video-mode.o copy.o bioscall.o regs.o
13 13
14# The link order of the video-*.o modules can matter. In particular, 14# The link order of the video-*.o modules can matter. In particular,
15# video-vga.o *must* be listed first, followed by video-vesa.o. 15# video-vga.o *must* be listed first, followed by video-vesa.o.
diff --git a/arch/x86/kernel/acpi/realmode/bioscall.S b/arch/x86/kernel/acpi/realmode/bioscall.S
new file mode 100644
index 000000000000..f51eb0bb56ce
--- /dev/null
+++ b/arch/x86/kernel/acpi/realmode/bioscall.S
@@ -0,0 +1 @@
#include "../../../boot/bioscall.S"
diff --git a/arch/x86/kernel/acpi/realmode/regs.c b/arch/x86/kernel/acpi/realmode/regs.c
new file mode 100644
index 000000000000..6206033ba202
--- /dev/null
+++ b/arch/x86/kernel/acpi/realmode/regs.c
@@ -0,0 +1 @@
#include "../../../boot/regs.c"
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 7c243a2c5115..ca93638ba430 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -104,7 +104,7 @@ int acpi_save_state_mem(void)
104 initial_gs = per_cpu_offset(smp_processor_id()); 104 initial_gs = per_cpu_offset(smp_processor_id());
105#endif 105#endif
106 initial_code = (unsigned long)wakeup_long64; 106 initial_code = (unsigned long)wakeup_long64;
107 saved_magic = 0x123456789abcdef0; 107 saved_magic = 0x123456789abcdef0L;
108#endif /* CONFIG_64BIT */ 108#endif /* CONFIG_64BIT */
109 109
110 return 0; 110 return 0;
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index a97db99dad52..1c60554537c3 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -55,7 +55,16 @@ struct iommu_cmd {
55static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, 55static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
56 struct unity_map_entry *e); 56 struct unity_map_entry *e);
57static struct dma_ops_domain *find_protection_domain(u16 devid); 57static struct dma_ops_domain *find_protection_domain(u16 devid);
58static u64* alloc_pte(struct protection_domain *dom,
59 unsigned long address, u64
60 **pte_page, gfp_t gfp);
61static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
62 unsigned long start_page,
63 unsigned int pages);
58 64
65#ifndef BUS_NOTIFY_UNBOUND_DRIVER
66#define BUS_NOTIFY_UNBOUND_DRIVER 0x0005
67#endif
59 68
60#ifdef CONFIG_AMD_IOMMU_STATS 69#ifdef CONFIG_AMD_IOMMU_STATS
61 70
@@ -213,7 +222,7 @@ irqreturn_t amd_iommu_int_handler(int irq, void *data)
213{ 222{
214 struct amd_iommu *iommu; 223 struct amd_iommu *iommu;
215 224
216 list_for_each_entry(iommu, &amd_iommu_list, list) 225 for_each_iommu(iommu)
217 iommu_poll_events(iommu); 226 iommu_poll_events(iommu);
218 227
219 return IRQ_HANDLED; 228 return IRQ_HANDLED;
@@ -440,7 +449,7 @@ static void iommu_flush_domain(u16 domid)
440 __iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 449 __iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
441 domid, 1, 1); 450 domid, 1, 1);
442 451
443 list_for_each_entry(iommu, &amd_iommu_list, list) { 452 for_each_iommu(iommu) {
444 spin_lock_irqsave(&iommu->lock, flags); 453 spin_lock_irqsave(&iommu->lock, flags);
445 __iommu_queue_command(iommu, &cmd); 454 __iommu_queue_command(iommu, &cmd);
446 __iommu_completion_wait(iommu); 455 __iommu_completion_wait(iommu);
@@ -449,6 +458,35 @@ static void iommu_flush_domain(u16 domid)
449 } 458 }
450} 459}
451 460
461void amd_iommu_flush_all_domains(void)
462{
463 int i;
464
465 for (i = 1; i < MAX_DOMAIN_ID; ++i) {
466 if (!test_bit(i, amd_iommu_pd_alloc_bitmap))
467 continue;
468 iommu_flush_domain(i);
469 }
470}
471
472void amd_iommu_flush_all_devices(void)
473{
474 struct amd_iommu *iommu;
475 int i;
476
477 for (i = 0; i <= amd_iommu_last_bdf; ++i) {
478 if (amd_iommu_pd_table[i] == NULL)
479 continue;
480
481 iommu = amd_iommu_rlookup_table[i];
482 if (!iommu)
483 continue;
484
485 iommu_queue_inv_dev_entry(iommu, i);
486 iommu_completion_wait(iommu);
487 }
488}
489
452/**************************************************************************** 490/****************************************************************************
453 * 491 *
454 * The functions below are used to create the page table mappings for 492 * The functions below are used to create the page table mappings for
@@ -468,7 +506,7 @@ static int iommu_map_page(struct protection_domain *dom,
468 unsigned long phys_addr, 506 unsigned long phys_addr,
469 int prot) 507 int prot)
470{ 508{
471 u64 __pte, *pte, *page; 509 u64 __pte, *pte;
472 510
473 bus_addr = PAGE_ALIGN(bus_addr); 511 bus_addr = PAGE_ALIGN(bus_addr);
474 phys_addr = PAGE_ALIGN(phys_addr); 512 phys_addr = PAGE_ALIGN(phys_addr);
@@ -477,27 +515,7 @@ static int iommu_map_page(struct protection_domain *dom,
477 if (bus_addr > IOMMU_MAP_SIZE_L3 || !(prot & IOMMU_PROT_MASK)) 515 if (bus_addr > IOMMU_MAP_SIZE_L3 || !(prot & IOMMU_PROT_MASK))
478 return -EINVAL; 516 return -EINVAL;
479 517
480 pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(bus_addr)]; 518 pte = alloc_pte(dom, bus_addr, NULL, GFP_KERNEL);
481
482 if (!IOMMU_PTE_PRESENT(*pte)) {
483 page = (u64 *)get_zeroed_page(GFP_KERNEL);
484 if (!page)
485 return -ENOMEM;
486 *pte = IOMMU_L2_PDE(virt_to_phys(page));
487 }
488
489 pte = IOMMU_PTE_PAGE(*pte);
490 pte = &pte[IOMMU_PTE_L1_INDEX(bus_addr)];
491
492 if (!IOMMU_PTE_PRESENT(*pte)) {
493 page = (u64 *)get_zeroed_page(GFP_KERNEL);
494 if (!page)
495 return -ENOMEM;
496 *pte = IOMMU_L1_PDE(virt_to_phys(page));
497 }
498
499 pte = IOMMU_PTE_PAGE(*pte);
500 pte = &pte[IOMMU_PTE_L0_INDEX(bus_addr)];
501 519
502 if (IOMMU_PTE_PRESENT(*pte)) 520 if (IOMMU_PTE_PRESENT(*pte))
503 return -EBUSY; 521 return -EBUSY;
@@ -595,7 +613,8 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
595 * as allocated in the aperture 613 * as allocated in the aperture
596 */ 614 */
597 if (addr < dma_dom->aperture_size) 615 if (addr < dma_dom->aperture_size)
598 __set_bit(addr >> PAGE_SHIFT, dma_dom->bitmap); 616 __set_bit(addr >> PAGE_SHIFT,
617 dma_dom->aperture[0]->bitmap);
599 } 618 }
600 619
601 return 0; 620 return 0;
@@ -632,42 +651,191 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
632 ****************************************************************************/ 651 ****************************************************************************/
633 652
634/* 653/*
635 * The address allocator core function. 654 * The address allocator core functions.
636 * 655 *
637 * called with domain->lock held 656 * called with domain->lock held
638 */ 657 */
658
659/*
660 * This function checks if there is a PTE for a given dma address. If
661 * there is one, it returns the pointer to it.
662 */
663static u64* fetch_pte(struct protection_domain *domain,
664 unsigned long address)
665{
666 u64 *pte;
667
668 pte = &domain->pt_root[IOMMU_PTE_L2_INDEX(address)];
669
670 if (!IOMMU_PTE_PRESENT(*pte))
671 return NULL;
672
673 pte = IOMMU_PTE_PAGE(*pte);
674 pte = &pte[IOMMU_PTE_L1_INDEX(address)];
675
676 if (!IOMMU_PTE_PRESENT(*pte))
677 return NULL;
678
679 pte = IOMMU_PTE_PAGE(*pte);
680 pte = &pte[IOMMU_PTE_L0_INDEX(address)];
681
682 return pte;
683}
684
685/*
686 * This function is used to add a new aperture range to an existing
687 * aperture in case of dma_ops domain allocation or address allocation
688 * failure.
689 */
690static int alloc_new_range(struct amd_iommu *iommu,
691 struct dma_ops_domain *dma_dom,
692 bool populate, gfp_t gfp)
693{
694 int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT;
695 int i;
696
697#ifdef CONFIG_IOMMU_STRESS
698 populate = false;
699#endif
700
701 if (index >= APERTURE_MAX_RANGES)
702 return -ENOMEM;
703
704 dma_dom->aperture[index] = kzalloc(sizeof(struct aperture_range), gfp);
705 if (!dma_dom->aperture[index])
706 return -ENOMEM;
707
708 dma_dom->aperture[index]->bitmap = (void *)get_zeroed_page(gfp);
709 if (!dma_dom->aperture[index]->bitmap)
710 goto out_free;
711
712 dma_dom->aperture[index]->offset = dma_dom->aperture_size;
713
714 if (populate) {
715 unsigned long address = dma_dom->aperture_size;
716 int i, num_ptes = APERTURE_RANGE_PAGES / 512;
717 u64 *pte, *pte_page;
718
719 for (i = 0; i < num_ptes; ++i) {
720 pte = alloc_pte(&dma_dom->domain, address,
721 &pte_page, gfp);
722 if (!pte)
723 goto out_free;
724
725 dma_dom->aperture[index]->pte_pages[i] = pte_page;
726
727 address += APERTURE_RANGE_SIZE / 64;
728 }
729 }
730
731 dma_dom->aperture_size += APERTURE_RANGE_SIZE;
732
733 /* Intialize the exclusion range if necessary */
734 if (iommu->exclusion_start &&
735 iommu->exclusion_start >= dma_dom->aperture[index]->offset &&
736 iommu->exclusion_start < dma_dom->aperture_size) {
737 unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT;
738 int pages = iommu_num_pages(iommu->exclusion_start,
739 iommu->exclusion_length,
740 PAGE_SIZE);
741 dma_ops_reserve_addresses(dma_dom, startpage, pages);
742 }
743
744 /*
745 * Check for areas already mapped as present in the new aperture
746 * range and mark those pages as reserved in the allocator. Such
747 * mappings may already exist as a result of requested unity
748 * mappings for devices.
749 */
750 for (i = dma_dom->aperture[index]->offset;
751 i < dma_dom->aperture_size;
752 i += PAGE_SIZE) {
753 u64 *pte = fetch_pte(&dma_dom->domain, i);
754 if (!pte || !IOMMU_PTE_PRESENT(*pte))
755 continue;
756
757 dma_ops_reserve_addresses(dma_dom, i << PAGE_SHIFT, 1);
758 }
759
760 return 0;
761
762out_free:
763 free_page((unsigned long)dma_dom->aperture[index]->bitmap);
764
765 kfree(dma_dom->aperture[index]);
766 dma_dom->aperture[index] = NULL;
767
768 return -ENOMEM;
769}
770
771static unsigned long dma_ops_area_alloc(struct device *dev,
772 struct dma_ops_domain *dom,
773 unsigned int pages,
774 unsigned long align_mask,
775 u64 dma_mask,
776 unsigned long start)
777{
778 unsigned long next_bit = dom->next_address % APERTURE_RANGE_SIZE;
779 int max_index = dom->aperture_size >> APERTURE_RANGE_SHIFT;
780 int i = start >> APERTURE_RANGE_SHIFT;
781 unsigned long boundary_size;
782 unsigned long address = -1;
783 unsigned long limit;
784
785 next_bit >>= PAGE_SHIFT;
786
787 boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
788 PAGE_SIZE) >> PAGE_SHIFT;
789
790 for (;i < max_index; ++i) {
791 unsigned long offset = dom->aperture[i]->offset >> PAGE_SHIFT;
792
793 if (dom->aperture[i]->offset >= dma_mask)
794 break;
795
796 limit = iommu_device_max_index(APERTURE_RANGE_PAGES, offset,
797 dma_mask >> PAGE_SHIFT);
798
799 address = iommu_area_alloc(dom->aperture[i]->bitmap,
800 limit, next_bit, pages, 0,
801 boundary_size, align_mask);
802 if (address != -1) {
803 address = dom->aperture[i]->offset +
804 (address << PAGE_SHIFT);
805 dom->next_address = address + (pages << PAGE_SHIFT);
806 break;
807 }
808
809 next_bit = 0;
810 }
811
812 return address;
813}
814
639static unsigned long dma_ops_alloc_addresses(struct device *dev, 815static unsigned long dma_ops_alloc_addresses(struct device *dev,
640 struct dma_ops_domain *dom, 816 struct dma_ops_domain *dom,
641 unsigned int pages, 817 unsigned int pages,
642 unsigned long align_mask, 818 unsigned long align_mask,
643 u64 dma_mask) 819 u64 dma_mask)
644{ 820{
645 unsigned long limit;
646 unsigned long address; 821 unsigned long address;
647 unsigned long boundary_size;
648 822
649 boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, 823#ifdef CONFIG_IOMMU_STRESS
650 PAGE_SIZE) >> PAGE_SHIFT; 824 dom->next_address = 0;
651 limit = iommu_device_max_index(dom->aperture_size >> PAGE_SHIFT, 0, 825 dom->need_flush = true;
652 dma_mask >> PAGE_SHIFT); 826#endif
653 827
654 if (dom->next_bit >= limit) { 828 address = dma_ops_area_alloc(dev, dom, pages, align_mask,
655 dom->next_bit = 0; 829 dma_mask, dom->next_address);
656 dom->need_flush = true;
657 }
658 830
659 address = iommu_area_alloc(dom->bitmap, limit, dom->next_bit, pages,
660 0 , boundary_size, align_mask);
661 if (address == -1) { 831 if (address == -1) {
662 address = iommu_area_alloc(dom->bitmap, limit, 0, pages, 832 dom->next_address = 0;
663 0, boundary_size, align_mask); 833 address = dma_ops_area_alloc(dev, dom, pages, align_mask,
834 dma_mask, 0);
664 dom->need_flush = true; 835 dom->need_flush = true;
665 } 836 }
666 837
667 if (likely(address != -1)) { 838 if (unlikely(address == -1))
668 dom->next_bit = address + pages;
669 address <<= PAGE_SHIFT;
670 } else
671 address = bad_dma_address; 839 address = bad_dma_address;
672 840
673 WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size); 841 WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size);
@@ -684,11 +852,23 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom,
684 unsigned long address, 852 unsigned long address,
685 unsigned int pages) 853 unsigned int pages)
686{ 854{
687 address >>= PAGE_SHIFT; 855 unsigned i = address >> APERTURE_RANGE_SHIFT;
688 iommu_area_free(dom->bitmap, address, pages); 856 struct aperture_range *range = dom->aperture[i];
689 857
690 if (address >= dom->next_bit) 858 BUG_ON(i >= APERTURE_MAX_RANGES || range == NULL);
859
860#ifdef CONFIG_IOMMU_STRESS
861 if (i < 4)
862 return;
863#endif
864
865 if (address >= dom->next_address)
691 dom->need_flush = true; 866 dom->need_flush = true;
867
868 address = (address % APERTURE_RANGE_SIZE) >> PAGE_SHIFT;
869
870 iommu_area_free(range->bitmap, address, pages);
871
692} 872}
693 873
694/**************************************************************************** 874/****************************************************************************
@@ -736,12 +916,16 @@ static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
736 unsigned long start_page, 916 unsigned long start_page,
737 unsigned int pages) 917 unsigned int pages)
738{ 918{
739 unsigned int last_page = dom->aperture_size >> PAGE_SHIFT; 919 unsigned int i, last_page = dom->aperture_size >> PAGE_SHIFT;
740 920
741 if (start_page + pages > last_page) 921 if (start_page + pages > last_page)
742 pages = last_page - start_page; 922 pages = last_page - start_page;
743 923
744 iommu_area_reserve(dom->bitmap, start_page, pages); 924 for (i = start_page; i < start_page + pages; ++i) {
925 int index = i / APERTURE_RANGE_PAGES;
926 int page = i % APERTURE_RANGE_PAGES;
927 __set_bit(page, dom->aperture[index]->bitmap);
928 }
745} 929}
746 930
747static void free_pagetable(struct protection_domain *domain) 931static void free_pagetable(struct protection_domain *domain)
@@ -780,14 +964,19 @@ static void free_pagetable(struct protection_domain *domain)
780 */ 964 */
781static void dma_ops_domain_free(struct dma_ops_domain *dom) 965static void dma_ops_domain_free(struct dma_ops_domain *dom)
782{ 966{
967 int i;
968
783 if (!dom) 969 if (!dom)
784 return; 970 return;
785 971
786 free_pagetable(&dom->domain); 972 free_pagetable(&dom->domain);
787 973
788 kfree(dom->pte_pages); 974 for (i = 0; i < APERTURE_MAX_RANGES; ++i) {
789 975 if (!dom->aperture[i])
790 kfree(dom->bitmap); 976 continue;
977 free_page((unsigned long)dom->aperture[i]->bitmap);
978 kfree(dom->aperture[i]);
979 }
791 980
792 kfree(dom); 981 kfree(dom);
793} 982}
@@ -797,19 +986,9 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom)
797 * It also intializes the page table and the address allocator data 986 * It also intializes the page table and the address allocator data
798 * structures required for the dma_ops interface 987 * structures required for the dma_ops interface
799 */ 988 */
800static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, 989static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu)
801 unsigned order)
802{ 990{
803 struct dma_ops_domain *dma_dom; 991 struct dma_ops_domain *dma_dom;
804 unsigned i, num_pte_pages;
805 u64 *l2_pde;
806 u64 address;
807
808 /*
809 * Currently the DMA aperture must be between 32 MB and 1GB in size
810 */
811 if ((order < 25) || (order > 30))
812 return NULL;
813 992
814 dma_dom = kzalloc(sizeof(struct dma_ops_domain), GFP_KERNEL); 993 dma_dom = kzalloc(sizeof(struct dma_ops_domain), GFP_KERNEL);
815 if (!dma_dom) 994 if (!dma_dom)
@@ -826,55 +1005,20 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
826 dma_dom->domain.priv = dma_dom; 1005 dma_dom->domain.priv = dma_dom;
827 if (!dma_dom->domain.pt_root) 1006 if (!dma_dom->domain.pt_root)
828 goto free_dma_dom; 1007 goto free_dma_dom;
829 dma_dom->aperture_size = (1ULL << order);
830 dma_dom->bitmap = kzalloc(dma_dom->aperture_size / (PAGE_SIZE * 8),
831 GFP_KERNEL);
832 if (!dma_dom->bitmap)
833 goto free_dma_dom;
834 /*
835 * mark the first page as allocated so we never return 0 as
836 * a valid dma-address. So we can use 0 as error value
837 */
838 dma_dom->bitmap[0] = 1;
839 dma_dom->next_bit = 0;
840 1008
841 dma_dom->need_flush = false; 1009 dma_dom->need_flush = false;
842 dma_dom->target_dev = 0xffff; 1010 dma_dom->target_dev = 0xffff;
843 1011
844 /* Intialize the exclusion range if necessary */ 1012 if (alloc_new_range(iommu, dma_dom, true, GFP_KERNEL))
845 if (iommu->exclusion_start && 1013 goto free_dma_dom;
846 iommu->exclusion_start < dma_dom->aperture_size) {
847 unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT;
848 int pages = iommu_num_pages(iommu->exclusion_start,
849 iommu->exclusion_length,
850 PAGE_SIZE);
851 dma_ops_reserve_addresses(dma_dom, startpage, pages);
852 }
853 1014
854 /* 1015 /*
855 * At the last step, build the page tables so we don't need to 1016 * mark the first page as allocated so we never return 0 as
856 * allocate page table pages in the dma_ops mapping/unmapping 1017 * a valid dma-address. So we can use 0 as error value
857 * path.
858 */ 1018 */
859 num_pte_pages = dma_dom->aperture_size / (PAGE_SIZE * 512); 1019 dma_dom->aperture[0]->bitmap[0] = 1;
860 dma_dom->pte_pages = kzalloc(num_pte_pages * sizeof(void *), 1020 dma_dom->next_address = 0;
861 GFP_KERNEL);
862 if (!dma_dom->pte_pages)
863 goto free_dma_dom;
864
865 l2_pde = (u64 *)get_zeroed_page(GFP_KERNEL);
866 if (l2_pde == NULL)
867 goto free_dma_dom;
868 1021
869 dma_dom->domain.pt_root[0] = IOMMU_L2_PDE(virt_to_phys(l2_pde));
870
871 for (i = 0; i < num_pte_pages; ++i) {
872 dma_dom->pte_pages[i] = (u64 *)get_zeroed_page(GFP_KERNEL);
873 if (!dma_dom->pte_pages[i])
874 goto free_dma_dom;
875 address = virt_to_phys(dma_dom->pte_pages[i]);
876 l2_pde[i] = IOMMU_L1_PDE(address);
877 }
878 1022
879 return dma_dom; 1023 return dma_dom;
880 1024
@@ -983,7 +1127,6 @@ static int device_change_notifier(struct notifier_block *nb,
983 struct protection_domain *domain; 1127 struct protection_domain *domain;
984 struct dma_ops_domain *dma_domain; 1128 struct dma_ops_domain *dma_domain;
985 struct amd_iommu *iommu; 1129 struct amd_iommu *iommu;
986 int order = amd_iommu_aperture_order;
987 unsigned long flags; 1130 unsigned long flags;
988 1131
989 if (devid > amd_iommu_last_bdf) 1132 if (devid > amd_iommu_last_bdf)
@@ -1002,17 +1145,7 @@ static int device_change_notifier(struct notifier_block *nb,
1002 "to a non-dma-ops domain\n", dev_name(dev)); 1145 "to a non-dma-ops domain\n", dev_name(dev));
1003 1146
1004 switch (action) { 1147 switch (action) {
1005 case BUS_NOTIFY_BOUND_DRIVER: 1148 case BUS_NOTIFY_UNBOUND_DRIVER:
1006 if (domain)
1007 goto out;
1008 dma_domain = find_protection_domain(devid);
1009 if (!dma_domain)
1010 dma_domain = iommu->default_dom;
1011 attach_device(iommu, &dma_domain->domain, devid);
1012 printk(KERN_INFO "AMD IOMMU: Using protection domain %d for "
1013 "device %s\n", dma_domain->domain.id, dev_name(dev));
1014 break;
1015 case BUS_NOTIFY_UNBIND_DRIVER:
1016 if (!domain) 1149 if (!domain)
1017 goto out; 1150 goto out;
1018 detach_device(domain, devid); 1151 detach_device(domain, devid);
@@ -1022,7 +1155,7 @@ static int device_change_notifier(struct notifier_block *nb,
1022 dma_domain = find_protection_domain(devid); 1155 dma_domain = find_protection_domain(devid);
1023 if (dma_domain) 1156 if (dma_domain)
1024 goto out; 1157 goto out;
1025 dma_domain = dma_ops_domain_alloc(iommu, order); 1158 dma_domain = dma_ops_domain_alloc(iommu);
1026 if (!dma_domain) 1159 if (!dma_domain)
1027 goto out; 1160 goto out;
1028 dma_domain->target_dev = devid; 1161 dma_domain->target_dev = devid;
@@ -1133,8 +1266,8 @@ static int get_device_resources(struct device *dev,
1133 dma_dom = (*iommu)->default_dom; 1266 dma_dom = (*iommu)->default_dom;
1134 *domain = &dma_dom->domain; 1267 *domain = &dma_dom->domain;
1135 attach_device(*iommu, *domain, *bdf); 1268 attach_device(*iommu, *domain, *bdf);
1136 printk(KERN_INFO "AMD IOMMU: Using protection domain %d for " 1269 DUMP_printk("Using protection domain %d for device %s\n",
1137 "device %s\n", (*domain)->id, dev_name(dev)); 1270 (*domain)->id, dev_name(dev));
1138 } 1271 }
1139 1272
1140 if (domain_for_device(_bdf) == NULL) 1273 if (domain_for_device(_bdf) == NULL)
@@ -1144,6 +1277,66 @@ static int get_device_resources(struct device *dev,
1144} 1277}
1145 1278
1146/* 1279/*
1280 * If the pte_page is not yet allocated this function is called
1281 */
1282static u64* alloc_pte(struct protection_domain *dom,
1283 unsigned long address, u64 **pte_page, gfp_t gfp)
1284{
1285 u64 *pte, *page;
1286
1287 pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(address)];
1288
1289 if (!IOMMU_PTE_PRESENT(*pte)) {
1290 page = (u64 *)get_zeroed_page(gfp);
1291 if (!page)
1292 return NULL;
1293 *pte = IOMMU_L2_PDE(virt_to_phys(page));
1294 }
1295
1296 pte = IOMMU_PTE_PAGE(*pte);
1297 pte = &pte[IOMMU_PTE_L1_INDEX(address)];
1298
1299 if (!IOMMU_PTE_PRESENT(*pte)) {
1300 page = (u64 *)get_zeroed_page(gfp);
1301 if (!page)
1302 return NULL;
1303 *pte = IOMMU_L1_PDE(virt_to_phys(page));
1304 }
1305
1306 pte = IOMMU_PTE_PAGE(*pte);
1307
1308 if (pte_page)
1309 *pte_page = pte;
1310
1311 pte = &pte[IOMMU_PTE_L0_INDEX(address)];
1312
1313 return pte;
1314}
1315
1316/*
1317 * This function fetches the PTE for a given address in the aperture
1318 */
1319static u64* dma_ops_get_pte(struct dma_ops_domain *dom,
1320 unsigned long address)
1321{
1322 struct aperture_range *aperture;
1323 u64 *pte, *pte_page;
1324
1325 aperture = dom->aperture[APERTURE_RANGE_INDEX(address)];
1326 if (!aperture)
1327 return NULL;
1328
1329 pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)];
1330 if (!pte) {
1331 pte = alloc_pte(&dom->domain, address, &pte_page, GFP_ATOMIC);
1332 aperture->pte_pages[APERTURE_PAGE_INDEX(address)] = pte_page;
1333 } else
1334 pte += IOMMU_PTE_L0_INDEX(address);
1335
1336 return pte;
1337}
1338
1339/*
1147 * This is the generic map function. It maps one 4kb page at paddr to 1340 * This is the generic map function. It maps one 4kb page at paddr to
1148 * the given address in the DMA address space for the domain. 1341 * the given address in the DMA address space for the domain.
1149 */ 1342 */
@@ -1159,8 +1352,9 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
1159 1352
1160 paddr &= PAGE_MASK; 1353 paddr &= PAGE_MASK;
1161 1354
1162 pte = dom->pte_pages[IOMMU_PTE_L1_INDEX(address)]; 1355 pte = dma_ops_get_pte(dom, address);
1163 pte += IOMMU_PTE_L0_INDEX(address); 1356 if (!pte)
1357 return bad_dma_address;
1164 1358
1165 __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC; 1359 __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC;
1166 1360
@@ -1185,14 +1379,20 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu,
1185 struct dma_ops_domain *dom, 1379 struct dma_ops_domain *dom,
1186 unsigned long address) 1380 unsigned long address)
1187{ 1381{
1382 struct aperture_range *aperture;
1188 u64 *pte; 1383 u64 *pte;
1189 1384
1190 if (address >= dom->aperture_size) 1385 if (address >= dom->aperture_size)
1191 return; 1386 return;
1192 1387
1193 WARN_ON(address & ~PAGE_MASK || address >= dom->aperture_size); 1388 aperture = dom->aperture[APERTURE_RANGE_INDEX(address)];
1389 if (!aperture)
1390 return;
1391
1392 pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)];
1393 if (!pte)
1394 return;
1194 1395
1195 pte = dom->pte_pages[IOMMU_PTE_L1_INDEX(address)];
1196 pte += IOMMU_PTE_L0_INDEX(address); 1396 pte += IOMMU_PTE_L0_INDEX(address);
1197 1397
1198 WARN_ON(!*pte); 1398 WARN_ON(!*pte);
@@ -1216,7 +1416,7 @@ static dma_addr_t __map_single(struct device *dev,
1216 u64 dma_mask) 1416 u64 dma_mask)
1217{ 1417{
1218 dma_addr_t offset = paddr & ~PAGE_MASK; 1418 dma_addr_t offset = paddr & ~PAGE_MASK;
1219 dma_addr_t address, start; 1419 dma_addr_t address, start, ret;
1220 unsigned int pages; 1420 unsigned int pages;
1221 unsigned long align_mask = 0; 1421 unsigned long align_mask = 0;
1222 int i; 1422 int i;
@@ -1232,14 +1432,33 @@ static dma_addr_t __map_single(struct device *dev,
1232 if (align) 1432 if (align)
1233 align_mask = (1UL << get_order(size)) - 1; 1433 align_mask = (1UL << get_order(size)) - 1;
1234 1434
1435retry:
1235 address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask, 1436 address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask,
1236 dma_mask); 1437 dma_mask);
1237 if (unlikely(address == bad_dma_address)) 1438 if (unlikely(address == bad_dma_address)) {
1238 goto out; 1439 /*
1440 * setting next_address here will let the address
1441 * allocator only scan the new allocated range in the
1442 * first run. This is a small optimization.
1443 */
1444 dma_dom->next_address = dma_dom->aperture_size;
1445
1446 if (alloc_new_range(iommu, dma_dom, false, GFP_ATOMIC))
1447 goto out;
1448
1449 /*
1450 * aperture was sucessfully enlarged by 128 MB, try
1451 * allocation again
1452 */
1453 goto retry;
1454 }
1239 1455
1240 start = address; 1456 start = address;
1241 for (i = 0; i < pages; ++i) { 1457 for (i = 0; i < pages; ++i) {
1242 dma_ops_domain_map(iommu, dma_dom, start, paddr, dir); 1458 ret = dma_ops_domain_map(iommu, dma_dom, start, paddr, dir);
1459 if (ret == bad_dma_address)
1460 goto out_unmap;
1461
1243 paddr += PAGE_SIZE; 1462 paddr += PAGE_SIZE;
1244 start += PAGE_SIZE; 1463 start += PAGE_SIZE;
1245 } 1464 }
@@ -1255,6 +1474,17 @@ static dma_addr_t __map_single(struct device *dev,
1255 1474
1256out: 1475out:
1257 return address; 1476 return address;
1477
1478out_unmap:
1479
1480 for (--i; i >= 0; --i) {
1481 start -= PAGE_SIZE;
1482 dma_ops_domain_unmap(iommu, dma_dom, start);
1483 }
1484
1485 dma_ops_free_addresses(dma_dom, address, pages);
1486
1487 return bad_dma_address;
1258} 1488}
1259 1489
1260/* 1490/*
@@ -1537,8 +1767,10 @@ static void *alloc_coherent(struct device *dev, size_t size,
1537 *dma_addr = __map_single(dev, iommu, domain->priv, paddr, 1767 *dma_addr = __map_single(dev, iommu, domain->priv, paddr,
1538 size, DMA_BIDIRECTIONAL, true, dma_mask); 1768 size, DMA_BIDIRECTIONAL, true, dma_mask);
1539 1769
1540 if (*dma_addr == bad_dma_address) 1770 if (*dma_addr == bad_dma_address) {
1771 spin_unlock_irqrestore(&domain->lock, flags);
1541 goto out_free; 1772 goto out_free;
1773 }
1542 1774
1543 iommu_completion_wait(iommu); 1775 iommu_completion_wait(iommu);
1544 1776
@@ -1625,7 +1857,6 @@ static void prealloc_protection_domains(void)
1625 struct pci_dev *dev = NULL; 1857 struct pci_dev *dev = NULL;
1626 struct dma_ops_domain *dma_dom; 1858 struct dma_ops_domain *dma_dom;
1627 struct amd_iommu *iommu; 1859 struct amd_iommu *iommu;
1628 int order = amd_iommu_aperture_order;
1629 u16 devid; 1860 u16 devid;
1630 1861
1631 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { 1862 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
@@ -1638,7 +1869,7 @@ static void prealloc_protection_domains(void)
1638 iommu = amd_iommu_rlookup_table[devid]; 1869 iommu = amd_iommu_rlookup_table[devid];
1639 if (!iommu) 1870 if (!iommu)
1640 continue; 1871 continue;
1641 dma_dom = dma_ops_domain_alloc(iommu, order); 1872 dma_dom = dma_ops_domain_alloc(iommu);
1642 if (!dma_dom) 1873 if (!dma_dom)
1643 continue; 1874 continue;
1644 init_unity_mappings_for_device(dma_dom, devid); 1875 init_unity_mappings_for_device(dma_dom, devid);
@@ -1664,7 +1895,6 @@ static struct dma_map_ops amd_iommu_dma_ops = {
1664int __init amd_iommu_init_dma_ops(void) 1895int __init amd_iommu_init_dma_ops(void)
1665{ 1896{
1666 struct amd_iommu *iommu; 1897 struct amd_iommu *iommu;
1667 int order = amd_iommu_aperture_order;
1668 int ret; 1898 int ret;
1669 1899
1670 /* 1900 /*
@@ -1672,8 +1902,8 @@ int __init amd_iommu_init_dma_ops(void)
1672 * found in the system. Devices not assigned to any other 1902 * found in the system. Devices not assigned to any other
1673 * protection domain will be assigned to the default one. 1903 * protection domain will be assigned to the default one.
1674 */ 1904 */
1675 list_for_each_entry(iommu, &amd_iommu_list, list) { 1905 for_each_iommu(iommu) {
1676 iommu->default_dom = dma_ops_domain_alloc(iommu, order); 1906 iommu->default_dom = dma_ops_domain_alloc(iommu);
1677 if (iommu->default_dom == NULL) 1907 if (iommu->default_dom == NULL)
1678 return -ENOMEM; 1908 return -ENOMEM;
1679 iommu->default_dom->domain.flags |= PD_DEFAULT_MASK; 1909 iommu->default_dom->domain.flags |= PD_DEFAULT_MASK;
@@ -1710,7 +1940,7 @@ int __init amd_iommu_init_dma_ops(void)
1710 1940
1711free_domains: 1941free_domains:
1712 1942
1713 list_for_each_entry(iommu, &amd_iommu_list, list) { 1943 for_each_iommu(iommu) {
1714 if (iommu->default_dom) 1944 if (iommu->default_dom)
1715 dma_ops_domain_free(iommu->default_dom); 1945 dma_ops_domain_free(iommu->default_dom);
1716 } 1946 }
@@ -1842,7 +2072,7 @@ static int amd_iommu_attach_device(struct iommu_domain *dom,
1842 2072
1843 old_domain = domain_for_device(devid); 2073 old_domain = domain_for_device(devid);
1844 if (old_domain) 2074 if (old_domain)
1845 return -EBUSY; 2075 detach_device(old_domain, devid);
1846 2076
1847 attach_device(iommu, domain, devid); 2077 attach_device(iommu, domain, devid);
1848 2078
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 8c0be0902dac..238989ec077d 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -115,15 +115,21 @@ struct ivmd_header {
115 u64 range_length; 115 u64 range_length;
116} __attribute__((packed)); 116} __attribute__((packed));
117 117
118bool amd_iommu_dump;
119
118static int __initdata amd_iommu_detected; 120static int __initdata amd_iommu_detected;
119 121
120u16 amd_iommu_last_bdf; /* largest PCI device id we have 122u16 amd_iommu_last_bdf; /* largest PCI device id we have
121 to handle */ 123 to handle */
122LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings 124LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings
123 we find in ACPI */ 125 we find in ACPI */
124unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */ 126#ifdef CONFIG_IOMMU_STRESS
127bool amd_iommu_isolate = false;
128#else
125bool amd_iommu_isolate = true; /* if true, device isolation is 129bool amd_iommu_isolate = true; /* if true, device isolation is
126 enabled */ 130 enabled */
131#endif
132
127bool amd_iommu_unmap_flush; /* if true, flush on every unmap */ 133bool amd_iommu_unmap_flush; /* if true, flush on every unmap */
128 134
129LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the 135LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the
@@ -175,7 +181,7 @@ static inline void update_last_devid(u16 devid)
175static inline unsigned long tbl_size(int entry_size) 181static inline unsigned long tbl_size(int entry_size)
176{ 182{
177 unsigned shift = PAGE_SHIFT + 183 unsigned shift = PAGE_SHIFT +
178 get_order(amd_iommu_last_bdf * entry_size); 184 get_order(((int)amd_iommu_last_bdf + 1) * entry_size);
179 185
180 return 1UL << shift; 186 return 1UL << shift;
181} 187}
@@ -193,7 +199,7 @@ static inline unsigned long tbl_size(int entry_size)
193 * This function set the exclusion range in the IOMMU. DMA accesses to the 199 * This function set the exclusion range in the IOMMU. DMA accesses to the
194 * exclusion range are passed through untranslated 200 * exclusion range are passed through untranslated
195 */ 201 */
196static void __init iommu_set_exclusion_range(struct amd_iommu *iommu) 202static void iommu_set_exclusion_range(struct amd_iommu *iommu)
197{ 203{
198 u64 start = iommu->exclusion_start & PAGE_MASK; 204 u64 start = iommu->exclusion_start & PAGE_MASK;
199 u64 limit = (start + iommu->exclusion_length) & PAGE_MASK; 205 u64 limit = (start + iommu->exclusion_length) & PAGE_MASK;
@@ -225,7 +231,7 @@ static void __init iommu_set_device_table(struct amd_iommu *iommu)
225} 231}
226 232
227/* Generic functions to enable/disable certain features of the IOMMU. */ 233/* Generic functions to enable/disable certain features of the IOMMU. */
228static void __init iommu_feature_enable(struct amd_iommu *iommu, u8 bit) 234static void iommu_feature_enable(struct amd_iommu *iommu, u8 bit)
229{ 235{
230 u32 ctrl; 236 u32 ctrl;
231 237
@@ -244,7 +250,7 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
244} 250}
245 251
246/* Function to enable the hardware */ 252/* Function to enable the hardware */
247static void __init iommu_enable(struct amd_iommu *iommu) 253static void iommu_enable(struct amd_iommu *iommu)
248{ 254{
249 printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at %s cap 0x%hx\n", 255 printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at %s cap 0x%hx\n",
250 dev_name(&iommu->dev->dev), iommu->cap_ptr); 256 dev_name(&iommu->dev->dev), iommu->cap_ptr);
@@ -252,11 +258,9 @@ static void __init iommu_enable(struct amd_iommu *iommu)
252 iommu_feature_enable(iommu, CONTROL_IOMMU_EN); 258 iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
253} 259}
254 260
255/* Function to enable IOMMU event logging and event interrupts */ 261static void iommu_disable(struct amd_iommu *iommu)
256static void __init iommu_enable_event_logging(struct amd_iommu *iommu)
257{ 262{
258 iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN); 263 iommu_feature_disable(iommu, CONTROL_IOMMU_EN);
259 iommu_feature_enable(iommu, CONTROL_EVT_INT_EN);
260} 264}
261 265
262/* 266/*
@@ -413,25 +417,36 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
413{ 417{
414 u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, 418 u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
415 get_order(CMD_BUFFER_SIZE)); 419 get_order(CMD_BUFFER_SIZE));
416 u64 entry;
417 420
418 if (cmd_buf == NULL) 421 if (cmd_buf == NULL)
419 return NULL; 422 return NULL;
420 423
421 iommu->cmd_buf_size = CMD_BUFFER_SIZE; 424 iommu->cmd_buf_size = CMD_BUFFER_SIZE;
422 425
423 entry = (u64)virt_to_phys(cmd_buf); 426 return cmd_buf;
427}
428
429/*
430 * This function writes the command buffer address to the hardware and
431 * enables it.
432 */
433static void iommu_enable_command_buffer(struct amd_iommu *iommu)
434{
435 u64 entry;
436
437 BUG_ON(iommu->cmd_buf == NULL);
438
439 entry = (u64)virt_to_phys(iommu->cmd_buf);
424 entry |= MMIO_CMD_SIZE_512; 440 entry |= MMIO_CMD_SIZE_512;
441
425 memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET, 442 memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET,
426 &entry, sizeof(entry)); 443 &entry, sizeof(entry));
427 444
428 /* set head and tail to zero manually */ 445 /* set head and tail to zero manually */
429 writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET); 446 writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
430 writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); 447 writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
431 448
432 iommu_feature_enable(iommu, CONTROL_CMDBUF_EN); 449 iommu_feature_enable(iommu, CONTROL_CMDBUF_EN);
433
434 return cmd_buf;
435} 450}
436 451
437static void __init free_command_buffer(struct amd_iommu *iommu) 452static void __init free_command_buffer(struct amd_iommu *iommu)
@@ -443,20 +458,27 @@ static void __init free_command_buffer(struct amd_iommu *iommu)
443/* allocates the memory where the IOMMU will log its events to */ 458/* allocates the memory where the IOMMU will log its events to */
444static u8 * __init alloc_event_buffer(struct amd_iommu *iommu) 459static u8 * __init alloc_event_buffer(struct amd_iommu *iommu)
445{ 460{
446 u64 entry;
447 iommu->evt_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, 461 iommu->evt_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
448 get_order(EVT_BUFFER_SIZE)); 462 get_order(EVT_BUFFER_SIZE));
449 463
450 if (iommu->evt_buf == NULL) 464 if (iommu->evt_buf == NULL)
451 return NULL; 465 return NULL;
452 466
467 return iommu->evt_buf;
468}
469
470static void iommu_enable_event_buffer(struct amd_iommu *iommu)
471{
472 u64 entry;
473
474 BUG_ON(iommu->evt_buf == NULL);
475
453 entry = (u64)virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK; 476 entry = (u64)virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK;
477
454 memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET, 478 memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET,
455 &entry, sizeof(entry)); 479 &entry, sizeof(entry));
456 480
457 iommu->evt_buf_size = EVT_BUFFER_SIZE; 481 iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN);
458
459 return iommu->evt_buf;
460} 482}
461 483
462static void __init free_event_buffer(struct amd_iommu *iommu) 484static void __init free_event_buffer(struct amd_iommu *iommu)
@@ -596,32 +618,83 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
596 p += sizeof(struct ivhd_header); 618 p += sizeof(struct ivhd_header);
597 end += h->length; 619 end += h->length;
598 620
621
599 while (p < end) { 622 while (p < end) {
600 e = (struct ivhd_entry *)p; 623 e = (struct ivhd_entry *)p;
601 switch (e->type) { 624 switch (e->type) {
602 case IVHD_DEV_ALL: 625 case IVHD_DEV_ALL:
626
627 DUMP_printk(" DEV_ALL\t\t\t first devid: %02x:%02x.%x"
628 " last device %02x:%02x.%x flags: %02x\n",
629 PCI_BUS(iommu->first_device),
630 PCI_SLOT(iommu->first_device),
631 PCI_FUNC(iommu->first_device),
632 PCI_BUS(iommu->last_device),
633 PCI_SLOT(iommu->last_device),
634 PCI_FUNC(iommu->last_device),
635 e->flags);
636
603 for (dev_i = iommu->first_device; 637 for (dev_i = iommu->first_device;
604 dev_i <= iommu->last_device; ++dev_i) 638 dev_i <= iommu->last_device; ++dev_i)
605 set_dev_entry_from_acpi(iommu, dev_i, 639 set_dev_entry_from_acpi(iommu, dev_i,
606 e->flags, 0); 640 e->flags, 0);
607 break; 641 break;
608 case IVHD_DEV_SELECT: 642 case IVHD_DEV_SELECT:
643
644 DUMP_printk(" DEV_SELECT\t\t\t devid: %02x:%02x.%x "
645 "flags: %02x\n",
646 PCI_BUS(e->devid),
647 PCI_SLOT(e->devid),
648 PCI_FUNC(e->devid),
649 e->flags);
650
609 devid = e->devid; 651 devid = e->devid;
610 set_dev_entry_from_acpi(iommu, devid, e->flags, 0); 652 set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
611 break; 653 break;
612 case IVHD_DEV_SELECT_RANGE_START: 654 case IVHD_DEV_SELECT_RANGE_START:
655
656 DUMP_printk(" DEV_SELECT_RANGE_START\t "
657 "devid: %02x:%02x.%x flags: %02x\n",
658 PCI_BUS(e->devid),
659 PCI_SLOT(e->devid),
660 PCI_FUNC(e->devid),
661 e->flags);
662
613 devid_start = e->devid; 663 devid_start = e->devid;
614 flags = e->flags; 664 flags = e->flags;
615 ext_flags = 0; 665 ext_flags = 0;
616 alias = false; 666 alias = false;
617 break; 667 break;
618 case IVHD_DEV_ALIAS: 668 case IVHD_DEV_ALIAS:
669
670 DUMP_printk(" DEV_ALIAS\t\t\t devid: %02x:%02x.%x "
671 "flags: %02x devid_to: %02x:%02x.%x\n",
672 PCI_BUS(e->devid),
673 PCI_SLOT(e->devid),
674 PCI_FUNC(e->devid),
675 e->flags,
676 PCI_BUS(e->ext >> 8),
677 PCI_SLOT(e->ext >> 8),
678 PCI_FUNC(e->ext >> 8));
679
619 devid = e->devid; 680 devid = e->devid;
620 devid_to = e->ext >> 8; 681 devid_to = e->ext >> 8;
621 set_dev_entry_from_acpi(iommu, devid, e->flags, 0); 682 set_dev_entry_from_acpi(iommu, devid_to, e->flags, 0);
622 amd_iommu_alias_table[devid] = devid_to; 683 amd_iommu_alias_table[devid] = devid_to;
623 break; 684 break;
624 case IVHD_DEV_ALIAS_RANGE: 685 case IVHD_DEV_ALIAS_RANGE:
686
687 DUMP_printk(" DEV_ALIAS_RANGE\t\t "
688 "devid: %02x:%02x.%x flags: %02x "
689 "devid_to: %02x:%02x.%x\n",
690 PCI_BUS(e->devid),
691 PCI_SLOT(e->devid),
692 PCI_FUNC(e->devid),
693 e->flags,
694 PCI_BUS(e->ext >> 8),
695 PCI_SLOT(e->ext >> 8),
696 PCI_FUNC(e->ext >> 8));
697
625 devid_start = e->devid; 698 devid_start = e->devid;
626 flags = e->flags; 699 flags = e->flags;
627 devid_to = e->ext >> 8; 700 devid_to = e->ext >> 8;
@@ -629,17 +702,39 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
629 alias = true; 702 alias = true;
630 break; 703 break;
631 case IVHD_DEV_EXT_SELECT: 704 case IVHD_DEV_EXT_SELECT:
705
706 DUMP_printk(" DEV_EXT_SELECT\t\t devid: %02x:%02x.%x "
707 "flags: %02x ext: %08x\n",
708 PCI_BUS(e->devid),
709 PCI_SLOT(e->devid),
710 PCI_FUNC(e->devid),
711 e->flags, e->ext);
712
632 devid = e->devid; 713 devid = e->devid;
633 set_dev_entry_from_acpi(iommu, devid, e->flags, 714 set_dev_entry_from_acpi(iommu, devid, e->flags,
634 e->ext); 715 e->ext);
635 break; 716 break;
636 case IVHD_DEV_EXT_SELECT_RANGE: 717 case IVHD_DEV_EXT_SELECT_RANGE:
718
719 DUMP_printk(" DEV_EXT_SELECT_RANGE\t devid: "
720 "%02x:%02x.%x flags: %02x ext: %08x\n",
721 PCI_BUS(e->devid),
722 PCI_SLOT(e->devid),
723 PCI_FUNC(e->devid),
724 e->flags, e->ext);
725
637 devid_start = e->devid; 726 devid_start = e->devid;
638 flags = e->flags; 727 flags = e->flags;
639 ext_flags = e->ext; 728 ext_flags = e->ext;
640 alias = false; 729 alias = false;
641 break; 730 break;
642 case IVHD_DEV_RANGE_END: 731 case IVHD_DEV_RANGE_END:
732
733 DUMP_printk(" DEV_RANGE_END\t\t devid: %02x:%02x.%x\n",
734 PCI_BUS(e->devid),
735 PCI_SLOT(e->devid),
736 PCI_FUNC(e->devid));
737
643 devid = e->devid; 738 devid = e->devid;
644 for (dev_i = devid_start; dev_i <= devid; ++dev_i) { 739 for (dev_i = devid_start; dev_i <= devid; ++dev_i) {
645 if (alias) 740 if (alias)
@@ -679,7 +774,7 @@ static void __init free_iommu_all(void)
679{ 774{
680 struct amd_iommu *iommu, *next; 775 struct amd_iommu *iommu, *next;
681 776
682 list_for_each_entry_safe(iommu, next, &amd_iommu_list, list) { 777 for_each_iommu_safe(iommu, next) {
683 list_del(&iommu->list); 778 list_del(&iommu->list);
684 free_iommu_one(iommu); 779 free_iommu_one(iommu);
685 kfree(iommu); 780 kfree(iommu);
@@ -710,7 +805,6 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
710 if (!iommu->mmio_base) 805 if (!iommu->mmio_base)
711 return -ENOMEM; 806 return -ENOMEM;
712 807
713 iommu_set_device_table(iommu);
714 iommu->cmd_buf = alloc_command_buffer(iommu); 808 iommu->cmd_buf = alloc_command_buffer(iommu);
715 if (!iommu->cmd_buf) 809 if (!iommu->cmd_buf)
716 return -ENOMEM; 810 return -ENOMEM;
@@ -746,6 +840,15 @@ static int __init init_iommu_all(struct acpi_table_header *table)
746 h = (struct ivhd_header *)p; 840 h = (struct ivhd_header *)p;
747 switch (*p) { 841 switch (*p) {
748 case ACPI_IVHD_TYPE: 842 case ACPI_IVHD_TYPE:
843
844 DUMP_printk("IOMMU: device: %02x:%02x.%01x cap: %04x "
845 "seg: %d flags: %01x info %04x\n",
846 PCI_BUS(h->devid), PCI_SLOT(h->devid),
847 PCI_FUNC(h->devid), h->cap_ptr,
848 h->pci_seg, h->flags, h->info);
849 DUMP_printk(" mmio-addr: %016llx\n",
850 h->mmio_phys);
851
749 iommu = kzalloc(sizeof(struct amd_iommu), GFP_KERNEL); 852 iommu = kzalloc(sizeof(struct amd_iommu), GFP_KERNEL);
750 if (iommu == NULL) 853 if (iommu == NULL)
751 return -ENOMEM; 854 return -ENOMEM;
@@ -773,56 +876,9 @@ static int __init init_iommu_all(struct acpi_table_header *table)
773 * 876 *
774 ****************************************************************************/ 877 ****************************************************************************/
775 878
776static int __init iommu_setup_msix(struct amd_iommu *iommu)
777{
778 struct amd_iommu *curr;
779 struct msix_entry entries[32]; /* only 32 supported by AMD IOMMU */
780 int nvec = 0, i;
781
782 list_for_each_entry(curr, &amd_iommu_list, list) {
783 if (curr->dev == iommu->dev) {
784 entries[nvec].entry = curr->evt_msi_num;
785 entries[nvec].vector = 0;
786 curr->int_enabled = true;
787 nvec++;
788 }
789 }
790
791 if (pci_enable_msix(iommu->dev, entries, nvec)) {
792 pci_disable_msix(iommu->dev);
793 return 1;
794 }
795
796 for (i = 0; i < nvec; ++i) {
797 int r = request_irq(entries->vector, amd_iommu_int_handler,
798 IRQF_SAMPLE_RANDOM,
799 "AMD IOMMU",
800 NULL);
801 if (r)
802 goto out_free;
803 }
804
805 return 0;
806
807out_free:
808 for (i -= 1; i >= 0; --i)
809 free_irq(entries->vector, NULL);
810
811 pci_disable_msix(iommu->dev);
812
813 return 1;
814}
815
816static int __init iommu_setup_msi(struct amd_iommu *iommu) 879static int __init iommu_setup_msi(struct amd_iommu *iommu)
817{ 880{
818 int r; 881 int r;
819 struct amd_iommu *curr;
820
821 list_for_each_entry(curr, &amd_iommu_list, list) {
822 if (curr->dev == iommu->dev)
823 curr->int_enabled = true;
824 }
825
826 882
827 if (pci_enable_msi(iommu->dev)) 883 if (pci_enable_msi(iommu->dev))
828 return 1; 884 return 1;
@@ -837,17 +893,18 @@ static int __init iommu_setup_msi(struct amd_iommu *iommu)
837 return 1; 893 return 1;
838 } 894 }
839 895
896 iommu->int_enabled = true;
897 iommu_feature_enable(iommu, CONTROL_EVT_INT_EN);
898
840 return 0; 899 return 0;
841} 900}
842 901
843static int __init iommu_init_msi(struct amd_iommu *iommu) 902static int iommu_init_msi(struct amd_iommu *iommu)
844{ 903{
845 if (iommu->int_enabled) 904 if (iommu->int_enabled)
846 return 0; 905 return 0;
847 906
848 if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSIX)) 907 if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI))
849 return iommu_setup_msix(iommu);
850 else if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI))
851 return iommu_setup_msi(iommu); 908 return iommu_setup_msi(iommu);
852 909
853 return 1; 910 return 1;
@@ -899,6 +956,7 @@ static int __init init_exclusion_range(struct ivmd_header *m)
899static int __init init_unity_map_range(struct ivmd_header *m) 956static int __init init_unity_map_range(struct ivmd_header *m)
900{ 957{
901 struct unity_map_entry *e = 0; 958 struct unity_map_entry *e = 0;
959 char *s;
902 960
903 e = kzalloc(sizeof(*e), GFP_KERNEL); 961 e = kzalloc(sizeof(*e), GFP_KERNEL);
904 if (e == NULL) 962 if (e == NULL)
@@ -906,14 +964,19 @@ static int __init init_unity_map_range(struct ivmd_header *m)
906 964
907 switch (m->type) { 965 switch (m->type) {
908 default: 966 default:
967 kfree(e);
968 return 0;
909 case ACPI_IVMD_TYPE: 969 case ACPI_IVMD_TYPE:
970 s = "IVMD_TYPEi\t\t\t";
910 e->devid_start = e->devid_end = m->devid; 971 e->devid_start = e->devid_end = m->devid;
911 break; 972 break;
912 case ACPI_IVMD_TYPE_ALL: 973 case ACPI_IVMD_TYPE_ALL:
974 s = "IVMD_TYPE_ALL\t\t";
913 e->devid_start = 0; 975 e->devid_start = 0;
914 e->devid_end = amd_iommu_last_bdf; 976 e->devid_end = amd_iommu_last_bdf;
915 break; 977 break;
916 case ACPI_IVMD_TYPE_RANGE: 978 case ACPI_IVMD_TYPE_RANGE:
979 s = "IVMD_TYPE_RANGE\t\t";
917 e->devid_start = m->devid; 980 e->devid_start = m->devid;
918 e->devid_end = m->aux; 981 e->devid_end = m->aux;
919 break; 982 break;
@@ -922,6 +985,13 @@ static int __init init_unity_map_range(struct ivmd_header *m)
922 e->address_end = e->address_start + PAGE_ALIGN(m->range_length); 985 e->address_end = e->address_start + PAGE_ALIGN(m->range_length);
923 e->prot = m->flags >> 1; 986 e->prot = m->flags >> 1;
924 987
988 DUMP_printk("%s devid_start: %02x:%02x.%x devid_end: %02x:%02x.%x"
989 " range_start: %016llx range_end: %016llx flags: %x\n", s,
990 PCI_BUS(e->devid_start), PCI_SLOT(e->devid_start),
991 PCI_FUNC(e->devid_start), PCI_BUS(e->devid_end),
992 PCI_SLOT(e->devid_end), PCI_FUNC(e->devid_end),
993 e->address_start, e->address_end, m->flags);
994
925 list_add_tail(&e->list, &amd_iommu_unity_map); 995 list_add_tail(&e->list, &amd_iommu_unity_map);
926 996
927 return 0; 997 return 0;
@@ -967,18 +1037,28 @@ static void init_device_table(void)
967 * This function finally enables all IOMMUs found in the system after 1037 * This function finally enables all IOMMUs found in the system after
968 * they have been initialized 1038 * they have been initialized
969 */ 1039 */
970static void __init enable_iommus(void) 1040static void enable_iommus(void)
971{ 1041{
972 struct amd_iommu *iommu; 1042 struct amd_iommu *iommu;
973 1043
974 list_for_each_entry(iommu, &amd_iommu_list, list) { 1044 for_each_iommu(iommu) {
1045 iommu_set_device_table(iommu);
1046 iommu_enable_command_buffer(iommu);
1047 iommu_enable_event_buffer(iommu);
975 iommu_set_exclusion_range(iommu); 1048 iommu_set_exclusion_range(iommu);
976 iommu_init_msi(iommu); 1049 iommu_init_msi(iommu);
977 iommu_enable_event_logging(iommu);
978 iommu_enable(iommu); 1050 iommu_enable(iommu);
979 } 1051 }
980} 1052}
981 1053
1054static void disable_iommus(void)
1055{
1056 struct amd_iommu *iommu;
1057
1058 for_each_iommu(iommu)
1059 iommu_disable(iommu);
1060}
1061
982/* 1062/*
983 * Suspend/Resume support 1063 * Suspend/Resume support
984 * disable suspend until real resume implemented 1064 * disable suspend until real resume implemented
@@ -986,12 +1066,31 @@ static void __init enable_iommus(void)
986 1066
987static int amd_iommu_resume(struct sys_device *dev) 1067static int amd_iommu_resume(struct sys_device *dev)
988{ 1068{
1069 /*
1070 * Disable IOMMUs before reprogramming the hardware registers.
1071 * IOMMU is still enabled from the resume kernel.
1072 */
1073 disable_iommus();
1074
1075 /* re-load the hardware */
1076 enable_iommus();
1077
1078 /*
1079 * we have to flush after the IOMMUs are enabled because a
1080 * disabled IOMMU will never execute the commands we send
1081 */
1082 amd_iommu_flush_all_domains();
1083 amd_iommu_flush_all_devices();
1084
989 return 0; 1085 return 0;
990} 1086}
991 1087
992static int amd_iommu_suspend(struct sys_device *dev, pm_message_t state) 1088static int amd_iommu_suspend(struct sys_device *dev, pm_message_t state)
993{ 1089{
994 return -EINVAL; 1090 /* disable IOMMUs to go out of the way for BIOS */
1091 disable_iommus();
1092
1093 return 0;
995} 1094}
996 1095
997static struct sysdev_class amd_iommu_sysdev_class = { 1096static struct sysdev_class amd_iommu_sysdev_class = {
@@ -1137,9 +1236,6 @@ int __init amd_iommu_init(void)
1137 1236
1138 enable_iommus(); 1237 enable_iommus();
1139 1238
1140 printk(KERN_INFO "AMD IOMMU: aperture size is %d MB\n",
1141 (1 << (amd_iommu_aperture_order-20)));
1142
1143 printk(KERN_INFO "AMD IOMMU: device isolation "); 1239 printk(KERN_INFO "AMD IOMMU: device isolation ");
1144 if (amd_iommu_isolate) 1240 if (amd_iommu_isolate)
1145 printk("enabled\n"); 1241 printk("enabled\n");
@@ -1211,6 +1307,13 @@ void __init amd_iommu_detect(void)
1211 * 1307 *
1212 ****************************************************************************/ 1308 ****************************************************************************/
1213 1309
1310static int __init parse_amd_iommu_dump(char *str)
1311{
1312 amd_iommu_dump = true;
1313
1314 return 1;
1315}
1316
1214static int __init parse_amd_iommu_options(char *str) 1317static int __init parse_amd_iommu_options(char *str)
1215{ 1318{
1216 for (; *str; ++str) { 1319 for (; *str; ++str) {
@@ -1225,15 +1328,5 @@ static int __init parse_amd_iommu_options(char *str)
1225 return 1; 1328 return 1;
1226} 1329}
1227 1330
1228static int __init parse_amd_iommu_size_options(char *str) 1331__setup("amd_iommu_dump", parse_amd_iommu_dump);
1229{
1230 unsigned order = PAGE_SHIFT + get_order(memparse(str, &str));
1231
1232 if ((order > 24) && (order < 31))
1233 amd_iommu_aperture_order = order;
1234
1235 return 1;
1236}
1237
1238__setup("amd_iommu=", parse_amd_iommu_options); 1332__setup("amd_iommu=", parse_amd_iommu_options);
1239__setup("amd_iommu_size=", parse_amd_iommu_size_options);
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index f2870920f246..8c7c042ecad1 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -14,6 +14,7 @@
14 * Mikael Pettersson : PM converted to driver model. 14 * Mikael Pettersson : PM converted to driver model.
15 */ 15 */
16 16
17#include <linux/perf_counter.h>
17#include <linux/kernel_stat.h> 18#include <linux/kernel_stat.h>
18#include <linux/mc146818rtc.h> 19#include <linux/mc146818rtc.h>
19#include <linux/acpi_pmtmr.h> 20#include <linux/acpi_pmtmr.h>
@@ -34,6 +35,7 @@
34#include <linux/smp.h> 35#include <linux/smp.h>
35#include <linux/mm.h> 36#include <linux/mm.h>
36 37
38#include <asm/perf_counter.h>
37#include <asm/pgalloc.h> 39#include <asm/pgalloc.h>
38#include <asm/atomic.h> 40#include <asm/atomic.h>
39#include <asm/mpspec.h> 41#include <asm/mpspec.h>
@@ -98,6 +100,29 @@ early_param("lapic", parse_lapic);
98/* Local APIC was disabled by the BIOS and enabled by the kernel */ 100/* Local APIC was disabled by the BIOS and enabled by the kernel */
99static int enabled_via_apicbase; 101static int enabled_via_apicbase;
100 102
103/*
104 * Handle interrupt mode configuration register (IMCR).
105 * This register controls whether the interrupt signals
106 * that reach the BSP come from the master PIC or from the
107 * local APIC. Before entering Symmetric I/O Mode, either
108 * the BIOS or the operating system must switch out of
109 * PIC Mode by changing the IMCR.
110 */
111static inline void imcr_pic_to_apic(void)
112{
113 /* select IMCR register */
114 outb(0x70, 0x22);
115 /* NMI and 8259 INTR go through APIC */
116 outb(0x01, 0x23);
117}
118
119static inline void imcr_apic_to_pic(void)
120{
121 /* select IMCR register */
122 outb(0x70, 0x22);
123 /* NMI and 8259 INTR go directly to BSP */
124 outb(0x00, 0x23);
125}
101#endif 126#endif
102 127
103#ifdef CONFIG_X86_64 128#ifdef CONFIG_X86_64
@@ -111,13 +136,19 @@ static __init int setup_apicpmtimer(char *s)
111__setup("apicpmtimer", setup_apicpmtimer); 136__setup("apicpmtimer", setup_apicpmtimer);
112#endif 137#endif
113 138
139int x2apic_mode;
114#ifdef CONFIG_X86_X2APIC 140#ifdef CONFIG_X86_X2APIC
115int x2apic;
116/* x2apic enabled before OS handover */ 141/* x2apic enabled before OS handover */
117static int x2apic_preenabled; 142static int x2apic_preenabled;
118static int disable_x2apic; 143static int disable_x2apic;
119static __init int setup_nox2apic(char *str) 144static __init int setup_nox2apic(char *str)
120{ 145{
146 if (x2apic_enabled()) {
147 pr_warning("Bios already enabled x2apic, "
148 "can't enforce nox2apic");
149 return 0;
150 }
151
121 disable_x2apic = 1; 152 disable_x2apic = 1;
122 setup_clear_cpu_cap(X86_FEATURE_X2APIC); 153 setup_clear_cpu_cap(X86_FEATURE_X2APIC);
123 return 0; 154 return 0;
@@ -209,6 +240,31 @@ static int modern_apic(void)
209 return lapic_get_version() >= 0x14; 240 return lapic_get_version() >= 0x14;
210} 241}
211 242
243/*
244 * bare function to substitute write operation
245 * and it's _that_ fast :)
246 */
247static void native_apic_write_dummy(u32 reg, u32 v)
248{
249 WARN_ON_ONCE((cpu_has_apic || !disable_apic));
250}
251
252static u32 native_apic_read_dummy(u32 reg)
253{
254 WARN_ON_ONCE((cpu_has_apic && !disable_apic));
255 return 0;
256}
257
258/*
259 * right after this call apic->write/read doesn't do anything
260 * note that there is no restore operation it works one way
261 */
262void apic_disable(void)
263{
264 apic->read = native_apic_read_dummy;
265 apic->write = native_apic_write_dummy;
266}
267
212void native_apic_wait_icr_idle(void) 268void native_apic_wait_icr_idle(void)
213{ 269{
214 while (apic_read(APIC_ICR) & APIC_ICR_BUSY) 270 while (apic_read(APIC_ICR) & APIC_ICR_BUSY)
@@ -348,7 +404,7 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
348 404
349static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask) 405static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask)
350{ 406{
351 unsigned long reg = (lvt_off << 4) + APIC_EILVT0; 407 unsigned long reg = (lvt_off << 4) + APIC_EILVTn(0);
352 unsigned int v = (mask << 16) | (msg_type << 8) | vector; 408 unsigned int v = (mask << 16) | (msg_type << 8) | vector;
353 409
354 apic_write(reg, v); 410 apic_write(reg, v);
@@ -815,7 +871,7 @@ void clear_local_APIC(void)
815 u32 v; 871 u32 v;
816 872
817 /* APIC hasn't been mapped yet */ 873 /* APIC hasn't been mapped yet */
818 if (!x2apic && !apic_phys) 874 if (!x2apic_mode && !apic_phys)
819 return; 875 return;
820 876
821 maxlvt = lapic_get_maxlvt(); 877 maxlvt = lapic_get_maxlvt();
@@ -843,7 +899,7 @@ void clear_local_APIC(void)
843 } 899 }
844 900
845 /* lets not touch this if we didn't frob it */ 901 /* lets not touch this if we didn't frob it */
846#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL) 902#ifdef CONFIG_X86_THERMAL_VECTOR
847 if (maxlvt >= 5) { 903 if (maxlvt >= 5) {
848 v = apic_read(APIC_LVTTHMR); 904 v = apic_read(APIC_LVTTHMR);
849 apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED); 905 apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED);
@@ -1133,6 +1189,7 @@ void __cpuinit setup_local_APIC(void)
1133 apic_write(APIC_ESR, 0); 1189 apic_write(APIC_ESR, 0);
1134 } 1190 }
1135#endif 1191#endif
1192 perf_counters_lapic_init();
1136 1193
1137 preempt_disable(); 1194 preempt_disable();
1138 1195
@@ -1287,7 +1344,7 @@ void check_x2apic(void)
1287{ 1344{
1288 if (x2apic_enabled()) { 1345 if (x2apic_enabled()) {
1289 pr_info("x2apic enabled by BIOS, switching to x2apic ops\n"); 1346 pr_info("x2apic enabled by BIOS, switching to x2apic ops\n");
1290 x2apic_preenabled = x2apic = 1; 1347 x2apic_preenabled = x2apic_mode = 1;
1291 } 1348 }
1292} 1349}
1293 1350
@@ -1295,7 +1352,7 @@ void enable_x2apic(void)
1295{ 1352{
1296 int msr, msr2; 1353 int msr, msr2;
1297 1354
1298 if (!x2apic) 1355 if (!x2apic_mode)
1299 return; 1356 return;
1300 1357
1301 rdmsr(MSR_IA32_APICBASE, msr, msr2); 1358 rdmsr(MSR_IA32_APICBASE, msr, msr2);
@@ -1304,6 +1361,7 @@ void enable_x2apic(void)
1304 wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0); 1361 wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0);
1305 } 1362 }
1306} 1363}
1364#endif /* CONFIG_X86_X2APIC */
1307 1365
1308void __init enable_IR_x2apic(void) 1366void __init enable_IR_x2apic(void)
1309{ 1367{
@@ -1312,32 +1370,21 @@ void __init enable_IR_x2apic(void)
1312 unsigned long flags; 1370 unsigned long flags;
1313 struct IO_APIC_route_entry **ioapic_entries = NULL; 1371 struct IO_APIC_route_entry **ioapic_entries = NULL;
1314 1372
1315 if (!cpu_has_x2apic) 1373 ret = dmar_table_init();
1316 return; 1374 if (ret) {
1317 1375 pr_debug("dmar_table_init() failed with %d:\n", ret);
1318 if (!x2apic_preenabled && disable_x2apic) { 1376 goto ir_failed;
1319 pr_info("Skipped enabling x2apic and Interrupt-remapping "
1320 "because of nox2apic\n");
1321 return;
1322 } 1377 }
1323 1378
1324 if (x2apic_preenabled && disable_x2apic) 1379 if (!intr_remapping_supported()) {
1325 panic("Bios already enabled x2apic, can't enforce nox2apic"); 1380 pr_debug("intr-remapping not supported\n");
1326 1381 goto ir_failed;
1327 if (!x2apic_preenabled && skip_ioapic_setup) {
1328 pr_info("Skipped enabling x2apic and Interrupt-remapping "
1329 "because of skipping io-apic setup\n");
1330 return;
1331 } 1382 }
1332 1383
1333 ret = dmar_table_init();
1334 if (ret) {
1335 pr_info("dmar_table_init() failed with %d:\n", ret);
1336 1384
1337 if (x2apic_preenabled) 1385 if (!x2apic_preenabled && skip_ioapic_setup) {
1338 panic("x2apic enabled by bios. But IR enabling failed"); 1386 pr_info("Skipped enabling intr-remap because of skipping "
1339 else 1387 "io-apic setup\n");
1340 pr_info("Not enabling x2apic,Intr-remapping\n");
1341 return; 1388 return;
1342 } 1389 }
1343 1390
@@ -1357,19 +1404,16 @@ void __init enable_IR_x2apic(void)
1357 mask_IO_APIC_setup(ioapic_entries); 1404 mask_IO_APIC_setup(ioapic_entries);
1358 mask_8259A(); 1405 mask_8259A();
1359 1406
1360 ret = enable_intr_remapping(EIM_32BIT_APIC_ID); 1407 ret = enable_intr_remapping(x2apic_supported());
1361
1362 if (ret && x2apic_preenabled) {
1363 local_irq_restore(flags);
1364 panic("x2apic enabled by bios. But IR enabling failed");
1365 }
1366
1367 if (ret) 1408 if (ret)
1368 goto end_restore; 1409 goto end_restore;
1369 1410
1370 if (!x2apic) { 1411 pr_info("Enabled Interrupt-remapping\n");
1371 x2apic = 1; 1412
1413 if (x2apic_supported() && !x2apic_mode) {
1414 x2apic_mode = 1;
1372 enable_x2apic(); 1415 enable_x2apic();
1416 pr_info("Enabled x2apic\n");
1373 } 1417 }
1374 1418
1375end_restore: 1419end_restore:
@@ -1378,37 +1422,34 @@ end_restore:
1378 * IR enabling failed 1422 * IR enabling failed
1379 */ 1423 */
1380 restore_IO_APIC_setup(ioapic_entries); 1424 restore_IO_APIC_setup(ioapic_entries);
1381 else
1382 reinit_intr_remapped_IO_APIC(x2apic_preenabled, ioapic_entries);
1383 1425
1384 unmask_8259A(); 1426 unmask_8259A();
1385 local_irq_restore(flags); 1427 local_irq_restore(flags);
1386 1428
1387end: 1429end:
1388 if (!ret) {
1389 if (!x2apic_preenabled)
1390 pr_info("Enabled x2apic and interrupt-remapping\n");
1391 else
1392 pr_info("Enabled Interrupt-remapping\n");
1393 } else
1394 pr_err("Failed to enable Interrupt-remapping and x2apic\n");
1395 if (ioapic_entries) 1430 if (ioapic_entries)
1396 free_ioapic_entries(ioapic_entries); 1431 free_ioapic_entries(ioapic_entries);
1432
1433 if (!ret)
1434 return;
1435
1436ir_failed:
1437 if (x2apic_preenabled)
1438 panic("x2apic enabled by bios. But IR enabling failed");
1439 else if (cpu_has_x2apic)
1440 pr_info("Not enabling x2apic,Intr-remapping\n");
1397#else 1441#else
1398 if (!cpu_has_x2apic) 1442 if (!cpu_has_x2apic)
1399 return; 1443 return;
1400 1444
1401 if (x2apic_preenabled) 1445 if (x2apic_preenabled)
1402 panic("x2apic enabled prior OS handover," 1446 panic("x2apic enabled prior OS handover,"
1403 " enable CONFIG_INTR_REMAP"); 1447 " enable CONFIG_X86_X2APIC, CONFIG_INTR_REMAP");
1404
1405 pr_info("Enable CONFIG_INTR_REMAP for enabling intr-remapping "
1406 " and x2apic\n");
1407#endif 1448#endif
1408 1449
1409 return; 1450 return;
1410} 1451}
1411#endif /* CONFIG_X86_X2APIC */ 1452
1412 1453
1413#ifdef CONFIG_X86_64 1454#ifdef CONFIG_X86_64
1414/* 1455/*
@@ -1425,7 +1466,6 @@ static int __init detect_init_APIC(void)
1425 } 1466 }
1426 1467
1427 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; 1468 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
1428 boot_cpu_physical_apicid = 0;
1429 return 0; 1469 return 0;
1430} 1470}
1431#else 1471#else
@@ -1539,32 +1579,49 @@ void __init early_init_lapic_mapping(void)
1539 */ 1579 */
1540void __init init_apic_mappings(void) 1580void __init init_apic_mappings(void)
1541{ 1581{
1542 if (x2apic) { 1582 unsigned int new_apicid;
1583
1584 if (x2apic_mode) {
1543 boot_cpu_physical_apicid = read_apic_id(); 1585 boot_cpu_physical_apicid = read_apic_id();
1544 return; 1586 return;
1545 } 1587 }
1546 1588
1547 /* 1589 /* If no local APIC can be found return early */
1548 * If no local APIC can be found then set up a fake all
1549 * zeroes page to simulate the local APIC and another
1550 * one for the IO-APIC.
1551 */
1552 if (!smp_found_config && detect_init_APIC()) { 1590 if (!smp_found_config && detect_init_APIC()) {
1553 apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE); 1591 /* lets NOP'ify apic operations */
1554 apic_phys = __pa(apic_phys); 1592 pr_info("APIC: disable apic facility\n");
1555 } else 1593 apic_disable();
1594 } else {
1556 apic_phys = mp_lapic_addr; 1595 apic_phys = mp_lapic_addr;
1557 1596
1558 set_fixmap_nocache(FIX_APIC_BASE, apic_phys); 1597 /*
1559 apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n", 1598 * acpi lapic path already maps that address in
1560 APIC_BASE, apic_phys); 1599 * acpi_register_lapic_address()
1600 */
1601 if (!acpi_lapic)
1602 set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
1603
1604 apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n",
1605 APIC_BASE, apic_phys);
1606 }
1561 1607
1562 /* 1608 /*
1563 * Fetch the APIC ID of the BSP in case we have a 1609 * Fetch the APIC ID of the BSP in case we have a
1564 * default configuration (or the MP table is broken). 1610 * default configuration (or the MP table is broken).
1565 */ 1611 */
1566 if (boot_cpu_physical_apicid == -1U) 1612 new_apicid = read_apic_id();
1567 boot_cpu_physical_apicid = read_apic_id(); 1613 if (boot_cpu_physical_apicid != new_apicid) {
1614 boot_cpu_physical_apicid = new_apicid;
1615 /*
1616 * yeah -- we lie about apic_version
1617 * in case if apic was disabled via boot option
1618 * but it's not a problem for SMP compiled kernel
1619 * since smp_sanity_check is prepared for such a case
1620 * and disable smp mode
1621 */
1622 apic_version[new_apicid] =
1623 GET_APIC_VERSION(apic_read(APIC_LVR));
1624 }
1568} 1625}
1569 1626
1570/* 1627/*
@@ -1733,8 +1790,7 @@ void __init connect_bsp_APIC(void)
1733 */ 1790 */
1734 apic_printk(APIC_VERBOSE, "leaving PIC mode, " 1791 apic_printk(APIC_VERBOSE, "leaving PIC mode, "
1735 "enabling APIC mode.\n"); 1792 "enabling APIC mode.\n");
1736 outb(0x70, 0x22); 1793 imcr_pic_to_apic();
1737 outb(0x01, 0x23);
1738 } 1794 }
1739#endif 1795#endif
1740 if (apic->enable_apic_mode) 1796 if (apic->enable_apic_mode)
@@ -1762,8 +1818,7 @@ void disconnect_bsp_APIC(int virt_wire_setup)
1762 */ 1818 */
1763 apic_printk(APIC_VERBOSE, "disabling APIC mode, " 1819 apic_printk(APIC_VERBOSE, "disabling APIC mode, "
1764 "entering PIC mode.\n"); 1820 "entering PIC mode.\n");
1765 outb(0x70, 0x22); 1821 imcr_apic_to_pic();
1766 outb(0x00, 0x23);
1767 return; 1822 return;
1768 } 1823 }
1769#endif 1824#endif
@@ -1962,17 +2017,17 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state)
1962 apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); 2017 apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
1963 apic_pm_state.apic_tmict = apic_read(APIC_TMICT); 2018 apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
1964 apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); 2019 apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
1965#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL) 2020#ifdef CONFIG_X86_THERMAL_VECTOR
1966 if (maxlvt >= 5) 2021 if (maxlvt >= 5)
1967 apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); 2022 apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
1968#endif 2023#endif
1969 2024
1970 local_irq_save(flags); 2025 local_irq_save(flags);
1971 disable_local_APIC(); 2026 disable_local_APIC();
1972#ifdef CONFIG_INTR_REMAP 2027
1973 if (intr_remapping_enabled) 2028 if (intr_remapping_enabled)
1974 disable_intr_remapping(); 2029 disable_intr_remapping();
1975#endif 2030
1976 local_irq_restore(flags); 2031 local_irq_restore(flags);
1977 return 0; 2032 return 0;
1978} 2033}
@@ -1982,42 +2037,34 @@ static int lapic_resume(struct sys_device *dev)
1982 unsigned int l, h; 2037 unsigned int l, h;
1983 unsigned long flags; 2038 unsigned long flags;
1984 int maxlvt; 2039 int maxlvt;
1985 2040 int ret = 0;
1986#ifdef CONFIG_INTR_REMAP
1987 int ret;
1988 struct IO_APIC_route_entry **ioapic_entries = NULL; 2041 struct IO_APIC_route_entry **ioapic_entries = NULL;
1989 2042
1990 if (!apic_pm_state.active) 2043 if (!apic_pm_state.active)
1991 return 0; 2044 return 0;
1992 2045
1993 local_irq_save(flags); 2046 local_irq_save(flags);
1994 if (x2apic) { 2047 if (intr_remapping_enabled) {
1995 ioapic_entries = alloc_ioapic_entries(); 2048 ioapic_entries = alloc_ioapic_entries();
1996 if (!ioapic_entries) { 2049 if (!ioapic_entries) {
1997 WARN(1, "Alloc ioapic_entries in lapic resume failed."); 2050 WARN(1, "Alloc ioapic_entries in lapic resume failed.");
1998 return -ENOMEM; 2051 ret = -ENOMEM;
2052 goto restore;
1999 } 2053 }
2000 2054
2001 ret = save_IO_APIC_setup(ioapic_entries); 2055 ret = save_IO_APIC_setup(ioapic_entries);
2002 if (ret) { 2056 if (ret) {
2003 WARN(1, "Saving IO-APIC state failed: %d\n", ret); 2057 WARN(1, "Saving IO-APIC state failed: %d\n", ret);
2004 free_ioapic_entries(ioapic_entries); 2058 free_ioapic_entries(ioapic_entries);
2005 return ret; 2059 goto restore;
2006 } 2060 }
2007 2061
2008 mask_IO_APIC_setup(ioapic_entries); 2062 mask_IO_APIC_setup(ioapic_entries);
2009 mask_8259A(); 2063 mask_8259A();
2010 enable_x2apic();
2011 } 2064 }
2012#else
2013 if (!apic_pm_state.active)
2014 return 0;
2015 2065
2016 local_irq_save(flags); 2066 if (x2apic_mode)
2017 if (x2apic)
2018 enable_x2apic(); 2067 enable_x2apic();
2019#endif
2020
2021 else { 2068 else {
2022 /* 2069 /*
2023 * Make sure the APICBASE points to the right address 2070 * Make sure the APICBASE points to the right address
@@ -2055,21 +2102,16 @@ static int lapic_resume(struct sys_device *dev)
2055 apic_write(APIC_ESR, 0); 2102 apic_write(APIC_ESR, 0);
2056 apic_read(APIC_ESR); 2103 apic_read(APIC_ESR);
2057 2104
2058#ifdef CONFIG_INTR_REMAP 2105 if (intr_remapping_enabled) {
2059 if (intr_remapping_enabled) 2106 reenable_intr_remapping(x2apic_mode);
2060 reenable_intr_remapping(EIM_32BIT_APIC_ID);
2061
2062 if (x2apic) {
2063 unmask_8259A(); 2107 unmask_8259A();
2064 restore_IO_APIC_setup(ioapic_entries); 2108 restore_IO_APIC_setup(ioapic_entries);
2065 free_ioapic_entries(ioapic_entries); 2109 free_ioapic_entries(ioapic_entries);
2066 } 2110 }
2067#endif 2111restore:
2068
2069 local_irq_restore(flags); 2112 local_irq_restore(flags);
2070 2113
2071 2114 return ret;
2072 return 0;
2073} 2115}
2074 2116
2075/* 2117/*
@@ -2117,31 +2159,14 @@ static void apic_pm_activate(void) { }
2117#endif /* CONFIG_PM */ 2159#endif /* CONFIG_PM */
2118 2160
2119#ifdef CONFIG_X86_64 2161#ifdef CONFIG_X86_64
2120/* 2162
2121 * apic_is_clustered_box() -- Check if we can expect good TSC 2163static int __cpuinit apic_cluster_num(void)
2122 *
2123 * Thus far, the major user of this is IBM's Summit2 series:
2124 *
2125 * Clustered boxes may have unsynced TSC problems if they are
2126 * multi-chassis. Use available data to take a good guess.
2127 * If in doubt, go HPET.
2128 */
2129__cpuinit int apic_is_clustered_box(void)
2130{ 2164{
2131 int i, clusters, zeros; 2165 int i, clusters, zeros;
2132 unsigned id; 2166 unsigned id;
2133 u16 *bios_cpu_apicid; 2167 u16 *bios_cpu_apicid;
2134 DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS); 2168 DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS);
2135 2169
2136 /*
2137 * there is not this kind of box with AMD CPU yet.
2138 * Some AMD box with quadcore cpu and 8 sockets apicid
2139 * will be [4, 0x23] or [8, 0x27] could be thought to
2140 * vsmp box still need checking...
2141 */
2142 if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && !is_vsmp_box())
2143 return 0;
2144
2145 bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid); 2170 bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
2146 bitmap_zero(clustermap, NUM_APIC_CLUSTERS); 2171 bitmap_zero(clustermap, NUM_APIC_CLUSTERS);
2147 2172
@@ -2177,18 +2202,67 @@ __cpuinit int apic_is_clustered_box(void)
2177 ++zeros; 2202 ++zeros;
2178 } 2203 }
2179 2204
2180 /* ScaleMP vSMPowered boxes have one cluster per board and TSCs are 2205 return clusters;
2181 * not guaranteed to be synced between boards 2206}
2182 */ 2207
2183 if (is_vsmp_box() && clusters > 1) 2208static int __cpuinitdata multi_checked;
2209static int __cpuinitdata multi;
2210
2211static int __cpuinit set_multi(const struct dmi_system_id *d)
2212{
2213 if (multi)
2214 return 0;
2215 pr_info("APIC: %s detected, Multi Chassis\n", d->ident);
2216 multi = 1;
2217 return 0;
2218}
2219
2220static const __cpuinitconst struct dmi_system_id multi_dmi_table[] = {
2221 {
2222 .callback = set_multi,
2223 .ident = "IBM System Summit2",
2224 .matches = {
2225 DMI_MATCH(DMI_SYS_VENDOR, "IBM"),
2226 DMI_MATCH(DMI_PRODUCT_NAME, "Summit2"),
2227 },
2228 },
2229 {}
2230};
2231
2232static void __cpuinit dmi_check_multi(void)
2233{
2234 if (multi_checked)
2235 return;
2236
2237 dmi_check_system(multi_dmi_table);
2238 multi_checked = 1;
2239}
2240
2241/*
2242 * apic_is_clustered_box() -- Check if we can expect good TSC
2243 *
2244 * Thus far, the major user of this is IBM's Summit2 series:
2245 * Clustered boxes may have unsynced TSC problems if they are
2246 * multi-chassis.
2247 * Use DMI to check them
2248 */
2249__cpuinit int apic_is_clustered_box(void)
2250{
2251 dmi_check_multi();
2252 if (multi)
2184 return 1; 2253 return 1;
2185 2254
2255 if (!is_vsmp_box())
2256 return 0;
2257
2186 /* 2258 /*
2187 * If clusters > 2, then should be multi-chassis. 2259 * ScaleMP vSMPowered boxes have one cluster per board and TSCs are
2188 * May have to revisit this when multi-core + hyperthreaded CPUs come 2260 * not guaranteed to be synced between boards
2189 * out, but AFAIK this will work even for them.
2190 */ 2261 */
2191 return (clusters > 2); 2262 if (apic_cluster_num() > 1)
2263 return 1;
2264
2265 return 0;
2192} 2266}
2193#endif 2267#endif
2194 2268
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index 306e5e88fb6f..d0c99abc26c3 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -161,7 +161,7 @@ static int flat_apic_id_registered(void)
161 161
162static int flat_phys_pkg_id(int initial_apic_id, int index_msb) 162static int flat_phys_pkg_id(int initial_apic_id, int index_msb)
163{ 163{
164 return hard_smp_processor_id() >> index_msb; 164 return initial_apic_id >> index_msb;
165} 165}
166 166
167struct apic apic_flat = { 167struct apic apic_flat = {
@@ -235,7 +235,7 @@ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
235 * regardless of how many processors are present (x86_64 ES7000 235 * regardless of how many processors are present (x86_64 ES7000
236 * is an example). 236 * is an example).
237 */ 237 */
238 if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID && 238 if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID &&
239 (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) { 239 (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) {
240 printk(KERN_DEBUG "system APIC only can use physical flat"); 240 printk(KERN_DEBUG "system APIC only can use physical flat");
241 return 1; 241 return 1;
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 1c11b819f245..69328ac8de9c 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -145,7 +145,7 @@ es7000_rename_gsi(int ioapic, int gsi)
145 return gsi; 145 return gsi;
146} 146}
147 147
148static int wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip) 148static int __cpuinit wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip)
149{ 149{
150 unsigned long vect = 0, psaival = 0; 150 unsigned long vect = 0, psaival = 0;
151 151
@@ -254,7 +254,7 @@ static int parse_unisys_oem(char *oemptr)
254} 254}
255 255
256#ifdef CONFIG_ACPI 256#ifdef CONFIG_ACPI
257static int find_unisys_acpi_oem_table(unsigned long *oem_addr) 257static int __init find_unisys_acpi_oem_table(unsigned long *oem_addr)
258{ 258{
259 struct acpi_table_header *header = NULL; 259 struct acpi_table_header *header = NULL;
260 struct es7000_oem_table *table; 260 struct es7000_oem_table *table;
@@ -285,7 +285,7 @@ static int find_unisys_acpi_oem_table(unsigned long *oem_addr)
285 return 0; 285 return 0;
286} 286}
287 287
288static void unmap_unisys_acpi_oem_table(unsigned long oem_addr) 288static void __init unmap_unisys_acpi_oem_table(unsigned long oem_addr)
289{ 289{
290 if (!oem_addr) 290 if (!oem_addr)
291 return; 291 return;
@@ -306,7 +306,7 @@ static int es7000_check_dsdt(void)
306static int es7000_acpi_ret; 306static int es7000_acpi_ret;
307 307
308/* Hook from generic ACPI tables.c */ 308/* Hook from generic ACPI tables.c */
309static int es7000_acpi_madt_oem_check(char *oem_id, char *oem_table_id) 309static int __init es7000_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
310{ 310{
311 unsigned long oem_addr = 0; 311 unsigned long oem_addr = 0;
312 int check_dsdt; 312 int check_dsdt;
@@ -717,7 +717,7 @@ struct apic apic_es7000_cluster = {
717 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, 717 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
718}; 718};
719 719
720struct apic apic_es7000 = { 720struct apic __refdata apic_es7000 = {
721 721
722 .name = "es7000", 722 .name = "es7000",
723 .probe = probe_es7000, 723 .probe = probe_es7000,
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 30da617d18e4..ef8d9290c7ea 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -59,6 +59,7 @@
59#include <asm/setup.h> 59#include <asm/setup.h>
60#include <asm/irq_remapping.h> 60#include <asm/irq_remapping.h>
61#include <asm/hpet.h> 61#include <asm/hpet.h>
62#include <asm/hw_irq.h>
62#include <asm/uv/uv_hub.h> 63#include <asm/uv/uv_hub.h>
63#include <asm/uv/uv_irq.h> 64#include <asm/uv/uv_irq.h>
64 65
@@ -129,12 +130,9 @@ struct irq_pin_list {
129 struct irq_pin_list *next; 130 struct irq_pin_list *next;
130}; 131};
131 132
132static struct irq_pin_list *get_one_free_irq_2_pin(int cpu) 133static struct irq_pin_list *get_one_free_irq_2_pin(int node)
133{ 134{
134 struct irq_pin_list *pin; 135 struct irq_pin_list *pin;
135 int node;
136
137 node = cpu_to_node(cpu);
138 136
139 pin = kzalloc_node(sizeof(*pin), GFP_ATOMIC, node); 137 pin = kzalloc_node(sizeof(*pin), GFP_ATOMIC, node);
140 138
@@ -148,9 +146,6 @@ struct irq_cfg {
148 unsigned move_cleanup_count; 146 unsigned move_cleanup_count;
149 u8 vector; 147 u8 vector;
150 u8 move_in_progress : 1; 148 u8 move_in_progress : 1;
151#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
152 u8 move_desc_pending : 1;
153#endif
154}; 149};
155 150
156/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ 151/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
@@ -182,16 +177,18 @@ int __init arch_early_irq_init(void)
182 struct irq_cfg *cfg; 177 struct irq_cfg *cfg;
183 struct irq_desc *desc; 178 struct irq_desc *desc;
184 int count; 179 int count;
180 int node;
185 int i; 181 int i;
186 182
187 cfg = irq_cfgx; 183 cfg = irq_cfgx;
188 count = ARRAY_SIZE(irq_cfgx); 184 count = ARRAY_SIZE(irq_cfgx);
185 node= cpu_to_node(boot_cpu_id);
189 186
190 for (i = 0; i < count; i++) { 187 for (i = 0; i < count; i++) {
191 desc = irq_to_desc(i); 188 desc = irq_to_desc(i);
192 desc->chip_data = &cfg[i]; 189 desc->chip_data = &cfg[i];
193 alloc_bootmem_cpumask_var(&cfg[i].domain); 190 zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node);
194 alloc_bootmem_cpumask_var(&cfg[i].old_domain); 191 zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node);
195 if (i < NR_IRQS_LEGACY) 192 if (i < NR_IRQS_LEGACY)
196 cpumask_setall(cfg[i].domain); 193 cpumask_setall(cfg[i].domain);
197 } 194 }
@@ -212,12 +209,9 @@ static struct irq_cfg *irq_cfg(unsigned int irq)
212 return cfg; 209 return cfg;
213} 210}
214 211
215static struct irq_cfg *get_one_free_irq_cfg(int cpu) 212static struct irq_cfg *get_one_free_irq_cfg(int node)
216{ 213{
217 struct irq_cfg *cfg; 214 struct irq_cfg *cfg;
218 int node;
219
220 node = cpu_to_node(cpu);
221 215
222 cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node); 216 cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node);
223 if (cfg) { 217 if (cfg) {
@@ -238,13 +232,13 @@ static struct irq_cfg *get_one_free_irq_cfg(int cpu)
238 return cfg; 232 return cfg;
239} 233}
240 234
241int arch_init_chip_data(struct irq_desc *desc, int cpu) 235int arch_init_chip_data(struct irq_desc *desc, int node)
242{ 236{
243 struct irq_cfg *cfg; 237 struct irq_cfg *cfg;
244 238
245 cfg = desc->chip_data; 239 cfg = desc->chip_data;
246 if (!cfg) { 240 if (!cfg) {
247 desc->chip_data = get_one_free_irq_cfg(cpu); 241 desc->chip_data = get_one_free_irq_cfg(node);
248 if (!desc->chip_data) { 242 if (!desc->chip_data) {
249 printk(KERN_ERR "can not alloc irq_cfg\n"); 243 printk(KERN_ERR "can not alloc irq_cfg\n");
250 BUG_ON(1); 244 BUG_ON(1);
@@ -254,10 +248,9 @@ int arch_init_chip_data(struct irq_desc *desc, int cpu)
254 return 0; 248 return 0;
255} 249}
256 250
257#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC 251/* for move_irq_desc */
258
259static void 252static void
260init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu) 253init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int node)
261{ 254{
262 struct irq_pin_list *old_entry, *head, *tail, *entry; 255 struct irq_pin_list *old_entry, *head, *tail, *entry;
263 256
@@ -266,7 +259,7 @@ init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu)
266 if (!old_entry) 259 if (!old_entry)
267 return; 260 return;
268 261
269 entry = get_one_free_irq_2_pin(cpu); 262 entry = get_one_free_irq_2_pin(node);
270 if (!entry) 263 if (!entry)
271 return; 264 return;
272 265
@@ -276,7 +269,7 @@ init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu)
276 tail = entry; 269 tail = entry;
277 old_entry = old_entry->next; 270 old_entry = old_entry->next;
278 while (old_entry) { 271 while (old_entry) {
279 entry = get_one_free_irq_2_pin(cpu); 272 entry = get_one_free_irq_2_pin(node);
280 if (!entry) { 273 if (!entry) {
281 entry = head; 274 entry = head;
282 while (entry) { 275 while (entry) {
@@ -316,12 +309,12 @@ static void free_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg)
316} 309}
317 310
318void arch_init_copy_chip_data(struct irq_desc *old_desc, 311void arch_init_copy_chip_data(struct irq_desc *old_desc,
319 struct irq_desc *desc, int cpu) 312 struct irq_desc *desc, int node)
320{ 313{
321 struct irq_cfg *cfg; 314 struct irq_cfg *cfg;
322 struct irq_cfg *old_cfg; 315 struct irq_cfg *old_cfg;
323 316
324 cfg = get_one_free_irq_cfg(cpu); 317 cfg = get_one_free_irq_cfg(node);
325 318
326 if (!cfg) 319 if (!cfg)
327 return; 320 return;
@@ -332,7 +325,7 @@ void arch_init_copy_chip_data(struct irq_desc *old_desc,
332 325
333 memcpy(cfg, old_cfg, sizeof(struct irq_cfg)); 326 memcpy(cfg, old_cfg, sizeof(struct irq_cfg));
334 327
335 init_copy_irq_2_pin(old_cfg, cfg, cpu); 328 init_copy_irq_2_pin(old_cfg, cfg, node);
336} 329}
337 330
338static void free_irq_cfg(struct irq_cfg *old_cfg) 331static void free_irq_cfg(struct irq_cfg *old_cfg)
@@ -356,19 +349,7 @@ void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc)
356 old_desc->chip_data = NULL; 349 old_desc->chip_data = NULL;
357 } 350 }
358} 351}
359 352/* end for move_irq_desc */
360static void
361set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask)
362{
363 struct irq_cfg *cfg = desc->chip_data;
364
365 if (!cfg->move_in_progress) {
366 /* it means that domain is not changed */
367 if (!cpumask_intersects(desc->affinity, mask))
368 cfg->move_desc_pending = 1;
369 }
370}
371#endif
372 353
373#else 354#else
374static struct irq_cfg *irq_cfg(unsigned int irq) 355static struct irq_cfg *irq_cfg(unsigned int irq)
@@ -378,13 +359,6 @@ static struct irq_cfg *irq_cfg(unsigned int irq)
378 359
379#endif 360#endif
380 361
381#ifndef CONFIG_NUMA_MIGRATE_IRQ_DESC
382static inline void
383set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask)
384{
385}
386#endif
387
388struct io_apic { 362struct io_apic {
389 unsigned int index; 363 unsigned int index;
390 unsigned int unused[3]; 364 unsigned int unused[3];
@@ -518,132 +492,18 @@ static void ioapic_mask_entry(int apic, int pin)
518 spin_unlock_irqrestore(&ioapic_lock, flags); 492 spin_unlock_irqrestore(&ioapic_lock, flags);
519} 493}
520 494
521#ifdef CONFIG_SMP
522static void send_cleanup_vector(struct irq_cfg *cfg)
523{
524 cpumask_var_t cleanup_mask;
525
526 if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) {
527 unsigned int i;
528 cfg->move_cleanup_count = 0;
529 for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
530 cfg->move_cleanup_count++;
531 for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
532 apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR);
533 } else {
534 cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask);
535 cfg->move_cleanup_count = cpumask_weight(cleanup_mask);
536 apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
537 free_cpumask_var(cleanup_mask);
538 }
539 cfg->move_in_progress = 0;
540}
541
542static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)
543{
544 int apic, pin;
545 struct irq_pin_list *entry;
546 u8 vector = cfg->vector;
547
548 entry = cfg->irq_2_pin;
549 for (;;) {
550 unsigned int reg;
551
552 if (!entry)
553 break;
554
555 apic = entry->apic;
556 pin = entry->pin;
557 /*
558 * With interrupt-remapping, destination information comes
559 * from interrupt-remapping table entry.
560 */
561 if (!irq_remapped(irq))
562 io_apic_write(apic, 0x11 + pin*2, dest);
563 reg = io_apic_read(apic, 0x10 + pin*2);
564 reg &= ~IO_APIC_REDIR_VECTOR_MASK;
565 reg |= vector;
566 io_apic_modify(apic, 0x10 + pin*2, reg);
567 if (!entry->next)
568 break;
569 entry = entry->next;
570 }
571}
572
573static int
574assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask);
575
576/*
577 * Either sets desc->affinity to a valid value, and returns
578 * ->cpu_mask_to_apicid of that, or returns BAD_APICID and
579 * leaves desc->affinity untouched.
580 */
581static unsigned int
582set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask)
583{
584 struct irq_cfg *cfg;
585 unsigned int irq;
586
587 if (!cpumask_intersects(mask, cpu_online_mask))
588 return BAD_APICID;
589
590 irq = desc->irq;
591 cfg = desc->chip_data;
592 if (assign_irq_vector(irq, cfg, mask))
593 return BAD_APICID;
594
595 /* check that before desc->addinity get updated */
596 set_extra_move_desc(desc, mask);
597
598 cpumask_copy(desc->affinity, mask);
599
600 return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain);
601}
602
603static void
604set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
605{
606 struct irq_cfg *cfg;
607 unsigned long flags;
608 unsigned int dest;
609 unsigned int irq;
610
611 irq = desc->irq;
612 cfg = desc->chip_data;
613
614 spin_lock_irqsave(&ioapic_lock, flags);
615 dest = set_desc_affinity(desc, mask);
616 if (dest != BAD_APICID) {
617 /* Only the high 8 bits are valid. */
618 dest = SET_APIC_LOGICAL_ID(dest);
619 __target_IO_APIC_irq(irq, dest, cfg);
620 }
621 spin_unlock_irqrestore(&ioapic_lock, flags);
622}
623
624static void
625set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask)
626{
627 struct irq_desc *desc;
628
629 desc = irq_to_desc(irq);
630
631 set_ioapic_affinity_irq_desc(desc, mask);
632}
633#endif /* CONFIG_SMP */
634
635/* 495/*
636 * The common case is 1:1 IRQ<->pin mappings. Sometimes there are 496 * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
637 * shared ISA-space IRQs, so we have to support them. We are super 497 * shared ISA-space IRQs, so we have to support them. We are super
638 * fast in the common case, and fast for shared ISA-space IRQs. 498 * fast in the common case, and fast for shared ISA-space IRQs.
639 */ 499 */
640static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin) 500static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin)
641{ 501{
642 struct irq_pin_list *entry; 502 struct irq_pin_list *entry;
643 503
644 entry = cfg->irq_2_pin; 504 entry = cfg->irq_2_pin;
645 if (!entry) { 505 if (!entry) {
646 entry = get_one_free_irq_2_pin(cpu); 506 entry = get_one_free_irq_2_pin(node);
647 if (!entry) { 507 if (!entry) {
648 printk(KERN_ERR "can not alloc irq_2_pin to add %d - %d\n", 508 printk(KERN_ERR "can not alloc irq_2_pin to add %d - %d\n",
649 apic, pin); 509 apic, pin);
@@ -663,7 +523,7 @@ static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin)
663 entry = entry->next; 523 entry = entry->next;
664 } 524 }
665 525
666 entry->next = get_one_free_irq_2_pin(cpu); 526 entry->next = get_one_free_irq_2_pin(node);
667 entry = entry->next; 527 entry = entry->next;
668 entry->apic = apic; 528 entry->apic = apic;
669 entry->pin = pin; 529 entry->pin = pin;
@@ -672,7 +532,7 @@ static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin)
672/* 532/*
673 * Reroute an IRQ to a different pin. 533 * Reroute an IRQ to a different pin.
674 */ 534 */
675static void __init replace_pin_at_irq_cpu(struct irq_cfg *cfg, int cpu, 535static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node,
676 int oldapic, int oldpin, 536 int oldapic, int oldpin,
677 int newapic, int newpin) 537 int newapic, int newpin)
678{ 538{
@@ -692,7 +552,7 @@ static void __init replace_pin_at_irq_cpu(struct irq_cfg *cfg, int cpu,
692 552
693 /* why? call replace before add? */ 553 /* why? call replace before add? */
694 if (!replaced) 554 if (!replaced)
695 add_pin_to_irq_cpu(cfg, cpu, newapic, newpin); 555 add_pin_to_irq_node(cfg, node, newapic, newpin);
696} 556}
697 557
698static inline void io_apic_modify_irq(struct irq_cfg *cfg, 558static inline void io_apic_modify_irq(struct irq_cfg *cfg,
@@ -850,7 +710,6 @@ static int __init ioapic_pirq_setup(char *str)
850__setup("pirq=", ioapic_pirq_setup); 710__setup("pirq=", ioapic_pirq_setup);
851#endif /* CONFIG_X86_32 */ 711#endif /* CONFIG_X86_32 */
852 712
853#ifdef CONFIG_INTR_REMAP
854struct IO_APIC_route_entry **alloc_ioapic_entries(void) 713struct IO_APIC_route_entry **alloc_ioapic_entries(void)
855{ 714{
856 int apic; 715 int apic;
@@ -948,20 +807,6 @@ int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries)
948 return 0; 807 return 0;
949} 808}
950 809
951void reinit_intr_remapped_IO_APIC(int intr_remapping,
952 struct IO_APIC_route_entry **ioapic_entries)
953
954{
955 /*
956 * for now plain restore of previous settings.
957 * TBD: In the case of OS enabling interrupt-remapping,
958 * IO-APIC RTE's need to be setup to point to interrupt-remapping
959 * table entries. for now, do a plain restore, and wait for
960 * the setup_IO_APIC_irqs() to do proper initialization.
961 */
962 restore_IO_APIC_setup(ioapic_entries);
963}
964
965void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries) 810void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries)
966{ 811{
967 int apic; 812 int apic;
@@ -971,7 +816,6 @@ void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries)
971 816
972 kfree(ioapic_entries); 817 kfree(ioapic_entries);
973} 818}
974#endif
975 819
976/* 820/*
977 * Find the IRQ entry number of a certain pin. 821 * Find the IRQ entry number of a certain pin.
@@ -1032,54 +876,6 @@ static int __init find_isa_irq_apic(int irq, int type)
1032 return -1; 876 return -1;
1033} 877}
1034 878
1035/*
1036 * Find a specific PCI IRQ entry.
1037 * Not an __init, possibly needed by modules
1038 */
1039static int pin_2_irq(int idx, int apic, int pin);
1040
1041int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
1042{
1043 int apic, i, best_guess = -1;
1044
1045 apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
1046 bus, slot, pin);
1047 if (test_bit(bus, mp_bus_not_pci)) {
1048 apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
1049 return -1;
1050 }
1051 for (i = 0; i < mp_irq_entries; i++) {
1052 int lbus = mp_irqs[i].srcbus;
1053
1054 for (apic = 0; apic < nr_ioapics; apic++)
1055 if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic ||
1056 mp_irqs[i].dstapic == MP_APIC_ALL)
1057 break;
1058
1059 if (!test_bit(lbus, mp_bus_not_pci) &&
1060 !mp_irqs[i].irqtype &&
1061 (bus == lbus) &&
1062 (slot == ((mp_irqs[i].srcbusirq >> 2) & 0x1f))) {
1063 int irq = pin_2_irq(i, apic, mp_irqs[i].dstirq);
1064
1065 if (!(apic || IO_APIC_IRQ(irq)))
1066 continue;
1067
1068 if (pin == (mp_irqs[i].srcbusirq & 3))
1069 return irq;
1070 /*
1071 * Use the first all-but-pin matching entry as a
1072 * best-guess fuzzy result for broken mptables.
1073 */
1074 if (best_guess < 0)
1075 best_guess = irq;
1076 }
1077 }
1078 return best_guess;
1079}
1080
1081EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
1082
1083#if defined(CONFIG_EISA) || defined(CONFIG_MCA) 879#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
1084/* 880/*
1085 * EISA Edge/Level control register, ELCR 881 * EISA Edge/Level control register, ELCR
@@ -1298,6 +1094,64 @@ static int pin_2_irq(int idx, int apic, int pin)
1298 return irq; 1094 return irq;
1299} 1095}
1300 1096
1097/*
1098 * Find a specific PCI IRQ entry.
1099 * Not an __init, possibly needed by modules
1100 */
1101int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin,
1102 struct io_apic_irq_attr *irq_attr)
1103{
1104 int apic, i, best_guess = -1;
1105
1106 apic_printk(APIC_DEBUG,
1107 "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
1108 bus, slot, pin);
1109 if (test_bit(bus, mp_bus_not_pci)) {
1110 apic_printk(APIC_VERBOSE,
1111 "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
1112 return -1;
1113 }
1114 for (i = 0; i < mp_irq_entries; i++) {
1115 int lbus = mp_irqs[i].srcbus;
1116
1117 for (apic = 0; apic < nr_ioapics; apic++)
1118 if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic ||
1119 mp_irqs[i].dstapic == MP_APIC_ALL)
1120 break;
1121
1122 if (!test_bit(lbus, mp_bus_not_pci) &&
1123 !mp_irqs[i].irqtype &&
1124 (bus == lbus) &&
1125 (slot == ((mp_irqs[i].srcbusirq >> 2) & 0x1f))) {
1126 int irq = pin_2_irq(i, apic, mp_irqs[i].dstirq);
1127
1128 if (!(apic || IO_APIC_IRQ(irq)))
1129 continue;
1130
1131 if (pin == (mp_irqs[i].srcbusirq & 3)) {
1132 set_io_apic_irq_attr(irq_attr, apic,
1133 mp_irqs[i].dstirq,
1134 irq_trigger(i),
1135 irq_polarity(i));
1136 return irq;
1137 }
1138 /*
1139 * Use the first all-but-pin matching entry as a
1140 * best-guess fuzzy result for broken mptables.
1141 */
1142 if (best_guess < 0) {
1143 set_io_apic_irq_attr(irq_attr, apic,
1144 mp_irqs[i].dstirq,
1145 irq_trigger(i),
1146 irq_polarity(i));
1147 best_guess = irq;
1148 }
1149 }
1150 }
1151 return best_guess;
1152}
1153EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
1154
1301void lock_vector_lock(void) 1155void lock_vector_lock(void)
1302{ 1156{
1303 /* Used to the online set of cpus does not change 1157 /* Used to the online set of cpus does not change
@@ -1628,58 +1482,70 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq
1628 ioapic_write_entry(apic_id, pin, entry); 1482 ioapic_write_entry(apic_id, pin, entry);
1629} 1483}
1630 1484
1485static struct {
1486 DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
1487} mp_ioapic_routing[MAX_IO_APICS];
1488
1631static void __init setup_IO_APIC_irqs(void) 1489static void __init setup_IO_APIC_irqs(void)
1632{ 1490{
1633 int apic_id, pin, idx, irq; 1491 int apic_id = 0, pin, idx, irq;
1634 int notcon = 0; 1492 int notcon = 0;
1635 struct irq_desc *desc; 1493 struct irq_desc *desc;
1636 struct irq_cfg *cfg; 1494 struct irq_cfg *cfg;
1637 int cpu = boot_cpu_id; 1495 int node = cpu_to_node(boot_cpu_id);
1638 1496
1639 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); 1497 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
1640 1498
1641 for (apic_id = 0; apic_id < nr_ioapics; apic_id++) { 1499#ifdef CONFIG_ACPI
1642 for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) { 1500 if (!acpi_disabled && acpi_ioapic) {
1643 1501 apic_id = mp_find_ioapic(0);
1644 idx = find_irq_entry(apic_id, pin, mp_INT); 1502 if (apic_id < 0)
1645 if (idx == -1) { 1503 apic_id = 0;
1646 if (!notcon) { 1504 }
1647 notcon = 1; 1505#endif
1648 apic_printk(APIC_VERBOSE,
1649 KERN_DEBUG " %d-%d",
1650 mp_ioapics[apic_id].apicid, pin);
1651 } else
1652 apic_printk(APIC_VERBOSE, " %d-%d",
1653 mp_ioapics[apic_id].apicid, pin);
1654 continue;
1655 }
1656 if (notcon) {
1657 apic_printk(APIC_VERBOSE,
1658 " (apicid-pin) not connected\n");
1659 notcon = 0;
1660 }
1661 1506
1662 irq = pin_2_irq(idx, apic_id, pin); 1507 for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) {
1508 idx = find_irq_entry(apic_id, pin, mp_INT);
1509 if (idx == -1) {
1510 if (!notcon) {
1511 notcon = 1;
1512 apic_printk(APIC_VERBOSE,
1513 KERN_DEBUG " %d-%d",
1514 mp_ioapics[apic_id].apicid, pin);
1515 } else
1516 apic_printk(APIC_VERBOSE, " %d-%d",
1517 mp_ioapics[apic_id].apicid, pin);
1518 continue;
1519 }
1520 if (notcon) {
1521 apic_printk(APIC_VERBOSE,
1522 " (apicid-pin) not connected\n");
1523 notcon = 0;
1524 }
1663 1525
1664 /* 1526 irq = pin_2_irq(idx, apic_id, pin);
1665 * Skip the timer IRQ if there's a quirk handler
1666 * installed and if it returns 1:
1667 */
1668 if (apic->multi_timer_check &&
1669 apic->multi_timer_check(apic_id, irq))
1670 continue;
1671 1527
1672 desc = irq_to_desc_alloc_cpu(irq, cpu); 1528 /*
1673 if (!desc) { 1529 * Skip the timer IRQ if there's a quirk handler
1674 printk(KERN_INFO "can not get irq_desc for %d\n", irq); 1530 * installed and if it returns 1:
1675 continue; 1531 */
1676 } 1532 if (apic->multi_timer_check &&
1677 cfg = desc->chip_data; 1533 apic->multi_timer_check(apic_id, irq))
1678 add_pin_to_irq_cpu(cfg, cpu, apic_id, pin); 1534 continue;
1679 1535
1680 setup_IO_APIC_irq(apic_id, pin, irq, desc, 1536 desc = irq_to_desc_alloc_node(irq, node);
1681 irq_trigger(idx), irq_polarity(idx)); 1537 if (!desc) {
1538 printk(KERN_INFO "can not get irq_desc for %d\n", irq);
1539 continue;
1682 } 1540 }
1541 cfg = desc->chip_data;
1542 add_pin_to_irq_node(cfg, node, apic_id, pin);
1543 /*
1544 * don't mark it in pin_programmed, so later acpi could
1545 * set it correctly when irq < 16
1546 */
1547 setup_IO_APIC_irq(apic_id, pin, irq, desc,
1548 irq_trigger(idx), irq_polarity(idx));
1683 } 1549 }
1684 1550
1685 if (notcon) 1551 if (notcon)
@@ -1869,7 +1735,7 @@ __apicdebuginit(void) print_APIC_bitfield(int base)
1869 1735
1870__apicdebuginit(void) print_local_APIC(void *dummy) 1736__apicdebuginit(void) print_local_APIC(void *dummy)
1871{ 1737{
1872 unsigned int v, ver, maxlvt; 1738 unsigned int i, v, ver, maxlvt;
1873 u64 icr; 1739 u64 icr;
1874 1740
1875 if (apic_verbosity == APIC_QUIET) 1741 if (apic_verbosity == APIC_QUIET)
@@ -1957,6 +1823,18 @@ __apicdebuginit(void) print_local_APIC(void *dummy)
1957 printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v); 1823 printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v);
1958 v = apic_read(APIC_TDCR); 1824 v = apic_read(APIC_TDCR);
1959 printk(KERN_DEBUG "... APIC TDCR: %08x\n", v); 1825 printk(KERN_DEBUG "... APIC TDCR: %08x\n", v);
1826
1827 if (boot_cpu_has(X86_FEATURE_EXTAPIC)) {
1828 v = apic_read(APIC_EFEAT);
1829 maxlvt = (v >> 16) & 0xff;
1830 printk(KERN_DEBUG "... APIC EFEAT: %08x\n", v);
1831 v = apic_read(APIC_ECTRL);
1832 printk(KERN_DEBUG "... APIC ECTRL: %08x\n", v);
1833 for (i = 0; i < maxlvt; i++) {
1834 v = apic_read(APIC_EILVTn(i));
1835 printk(KERN_DEBUG "... APIC EILVT%d: %08x\n", i, v);
1836 }
1837 }
1960 printk("\n"); 1838 printk("\n");
1961} 1839}
1962 1840
@@ -2005,6 +1883,11 @@ __apicdebuginit(void) print_PIC(void)
2005__apicdebuginit(int) print_all_ICs(void) 1883__apicdebuginit(int) print_all_ICs(void)
2006{ 1884{
2007 print_PIC(); 1885 print_PIC();
1886
1887 /* don't print out if apic is not there */
1888 if (!cpu_has_apic || disable_apic)
1889 return 0;
1890
2008 print_all_local_APICs(); 1891 print_all_local_APICs();
2009 print_IO_APIC(); 1892 print_IO_APIC();
2010 1893
@@ -2360,6 +2243,118 @@ static int ioapic_retrigger_irq(unsigned int irq)
2360 */ 2243 */
2361 2244
2362#ifdef CONFIG_SMP 2245#ifdef CONFIG_SMP
2246static void send_cleanup_vector(struct irq_cfg *cfg)
2247{
2248 cpumask_var_t cleanup_mask;
2249
2250 if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) {
2251 unsigned int i;
2252 cfg->move_cleanup_count = 0;
2253 for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
2254 cfg->move_cleanup_count++;
2255 for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
2256 apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR);
2257 } else {
2258 cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask);
2259 cfg->move_cleanup_count = cpumask_weight(cleanup_mask);
2260 apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
2261 free_cpumask_var(cleanup_mask);
2262 }
2263 cfg->move_in_progress = 0;
2264}
2265
2266static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)
2267{
2268 int apic, pin;
2269 struct irq_pin_list *entry;
2270 u8 vector = cfg->vector;
2271
2272 entry = cfg->irq_2_pin;
2273 for (;;) {
2274 unsigned int reg;
2275
2276 if (!entry)
2277 break;
2278
2279 apic = entry->apic;
2280 pin = entry->pin;
2281 /*
2282 * With interrupt-remapping, destination information comes
2283 * from interrupt-remapping table entry.
2284 */
2285 if (!irq_remapped(irq))
2286 io_apic_write(apic, 0x11 + pin*2, dest);
2287 reg = io_apic_read(apic, 0x10 + pin*2);
2288 reg &= ~IO_APIC_REDIR_VECTOR_MASK;
2289 reg |= vector;
2290 io_apic_modify(apic, 0x10 + pin*2, reg);
2291 if (!entry->next)
2292 break;
2293 entry = entry->next;
2294 }
2295}
2296
2297static int
2298assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask);
2299
2300/*
2301 * Either sets desc->affinity to a valid value, and returns
2302 * ->cpu_mask_to_apicid of that, or returns BAD_APICID and
2303 * leaves desc->affinity untouched.
2304 */
2305static unsigned int
2306set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask)
2307{
2308 struct irq_cfg *cfg;
2309 unsigned int irq;
2310
2311 if (!cpumask_intersects(mask, cpu_online_mask))
2312 return BAD_APICID;
2313
2314 irq = desc->irq;
2315 cfg = desc->chip_data;
2316 if (assign_irq_vector(irq, cfg, mask))
2317 return BAD_APICID;
2318
2319 cpumask_copy(desc->affinity, mask);
2320
2321 return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain);
2322}
2323
2324static int
2325set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
2326{
2327 struct irq_cfg *cfg;
2328 unsigned long flags;
2329 unsigned int dest;
2330 unsigned int irq;
2331 int ret = -1;
2332
2333 irq = desc->irq;
2334 cfg = desc->chip_data;
2335
2336 spin_lock_irqsave(&ioapic_lock, flags);
2337 dest = set_desc_affinity(desc, mask);
2338 if (dest != BAD_APICID) {
2339 /* Only the high 8 bits are valid. */
2340 dest = SET_APIC_LOGICAL_ID(dest);
2341 __target_IO_APIC_irq(irq, dest, cfg);
2342 ret = 0;
2343 }
2344 spin_unlock_irqrestore(&ioapic_lock, flags);
2345
2346 return ret;
2347}
2348
2349static int
2350set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask)
2351{
2352 struct irq_desc *desc;
2353
2354 desc = irq_to_desc(irq);
2355
2356 return set_ioapic_affinity_irq_desc(desc, mask);
2357}
2363 2358
2364#ifdef CONFIG_INTR_REMAP 2359#ifdef CONFIG_INTR_REMAP
2365 2360
@@ -2374,26 +2369,25 @@ static int ioapic_retrigger_irq(unsigned int irq)
2374 * Real vector that is used for interrupting cpu will be coming from 2369 * Real vector that is used for interrupting cpu will be coming from
2375 * the interrupt-remapping table entry. 2370 * the interrupt-remapping table entry.
2376 */ 2371 */
2377static void 2372static int
2378migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask) 2373migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
2379{ 2374{
2380 struct irq_cfg *cfg; 2375 struct irq_cfg *cfg;
2381 struct irte irte; 2376 struct irte irte;
2382 unsigned int dest; 2377 unsigned int dest;
2383 unsigned int irq; 2378 unsigned int irq;
2379 int ret = -1;
2384 2380
2385 if (!cpumask_intersects(mask, cpu_online_mask)) 2381 if (!cpumask_intersects(mask, cpu_online_mask))
2386 return; 2382 return ret;
2387 2383
2388 irq = desc->irq; 2384 irq = desc->irq;
2389 if (get_irte(irq, &irte)) 2385 if (get_irte(irq, &irte))
2390 return; 2386 return ret;
2391 2387
2392 cfg = desc->chip_data; 2388 cfg = desc->chip_data;
2393 if (assign_irq_vector(irq, cfg, mask)) 2389 if (assign_irq_vector(irq, cfg, mask))
2394 return; 2390 return ret;
2395
2396 set_extra_move_desc(desc, mask);
2397 2391
2398 dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask); 2392 dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask);
2399 2393
@@ -2409,27 +2403,30 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
2409 send_cleanup_vector(cfg); 2403 send_cleanup_vector(cfg);
2410 2404
2411 cpumask_copy(desc->affinity, mask); 2405 cpumask_copy(desc->affinity, mask);
2406
2407 return 0;
2412} 2408}
2413 2409
2414/* 2410/*
2415 * Migrates the IRQ destination in the process context. 2411 * Migrates the IRQ destination in the process context.
2416 */ 2412 */
2417static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, 2413static int set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
2418 const struct cpumask *mask) 2414 const struct cpumask *mask)
2419{ 2415{
2420 migrate_ioapic_irq_desc(desc, mask); 2416 return migrate_ioapic_irq_desc(desc, mask);
2421} 2417}
2422static void set_ir_ioapic_affinity_irq(unsigned int irq, 2418static int set_ir_ioapic_affinity_irq(unsigned int irq,
2423 const struct cpumask *mask) 2419 const struct cpumask *mask)
2424{ 2420{
2425 struct irq_desc *desc = irq_to_desc(irq); 2421 struct irq_desc *desc = irq_to_desc(irq);
2426 2422
2427 set_ir_ioapic_affinity_irq_desc(desc, mask); 2423 return set_ir_ioapic_affinity_irq_desc(desc, mask);
2428} 2424}
2429#else 2425#else
2430static inline void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, 2426static inline int set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
2431 const struct cpumask *mask) 2427 const struct cpumask *mask)
2432{ 2428{
2429 return 0;
2433} 2430}
2434#endif 2431#endif
2435 2432
@@ -2491,86 +2488,19 @@ static void irq_complete_move(struct irq_desc **descp)
2491 struct irq_cfg *cfg = desc->chip_data; 2488 struct irq_cfg *cfg = desc->chip_data;
2492 unsigned vector, me; 2489 unsigned vector, me;
2493 2490
2494 if (likely(!cfg->move_in_progress)) { 2491 if (likely(!cfg->move_in_progress))
2495#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
2496 if (likely(!cfg->move_desc_pending))
2497 return;
2498
2499 /* domain has not changed, but affinity did */
2500 me = smp_processor_id();
2501 if (cpumask_test_cpu(me, desc->affinity)) {
2502 *descp = desc = move_irq_desc(desc, me);
2503 /* get the new one */
2504 cfg = desc->chip_data;
2505 cfg->move_desc_pending = 0;
2506 }
2507#endif
2508 return; 2492 return;
2509 }
2510 2493
2511 vector = ~get_irq_regs()->orig_ax; 2494 vector = ~get_irq_regs()->orig_ax;
2512 me = smp_processor_id(); 2495 me = smp_processor_id();
2513 2496
2514 if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) { 2497 if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
2515#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
2516 *descp = desc = move_irq_desc(desc, me);
2517 /* get the new one */
2518 cfg = desc->chip_data;
2519#endif
2520 send_cleanup_vector(cfg); 2498 send_cleanup_vector(cfg);
2521 }
2522} 2499}
2523#else 2500#else
2524static inline void irq_complete_move(struct irq_desc **descp) {} 2501static inline void irq_complete_move(struct irq_desc **descp) {}
2525#endif 2502#endif
2526 2503
2527static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
2528{
2529 int apic, pin;
2530 struct irq_pin_list *entry;
2531
2532 entry = cfg->irq_2_pin;
2533 for (;;) {
2534
2535 if (!entry)
2536 break;
2537
2538 apic = entry->apic;
2539 pin = entry->pin;
2540 io_apic_eoi(apic, pin);
2541 entry = entry->next;
2542 }
2543}
2544
2545static void
2546eoi_ioapic_irq(struct irq_desc *desc)
2547{
2548 struct irq_cfg *cfg;
2549 unsigned long flags;
2550 unsigned int irq;
2551
2552 irq = desc->irq;
2553 cfg = desc->chip_data;
2554
2555 spin_lock_irqsave(&ioapic_lock, flags);
2556 __eoi_ioapic_irq(irq, cfg);
2557 spin_unlock_irqrestore(&ioapic_lock, flags);
2558}
2559
2560#ifdef CONFIG_X86_X2APIC
2561static void ack_x2apic_level(unsigned int irq)
2562{
2563 struct irq_desc *desc = irq_to_desc(irq);
2564 ack_x2APIC_irq();
2565 eoi_ioapic_irq(desc);
2566}
2567
2568static void ack_x2apic_edge(unsigned int irq)
2569{
2570 ack_x2APIC_irq();
2571}
2572#endif
2573
2574static void ack_apic_edge(unsigned int irq) 2504static void ack_apic_edge(unsigned int irq)
2575{ 2505{
2576 struct irq_desc *desc = irq_to_desc(irq); 2506 struct irq_desc *desc = irq_to_desc(irq);
@@ -2634,9 +2564,6 @@ static void ack_apic_level(unsigned int irq)
2634 */ 2564 */
2635 ack_APIC_irq(); 2565 ack_APIC_irq();
2636 2566
2637 if (irq_remapped(irq))
2638 eoi_ioapic_irq(desc);
2639
2640 /* Now we can move and renable the irq */ 2567 /* Now we can move and renable the irq */
2641 if (unlikely(do_unmask_irq)) { 2568 if (unlikely(do_unmask_irq)) {
2642 /* Only migrate the irq if the ack has been received. 2569 /* Only migrate the irq if the ack has been received.
@@ -2683,22 +2610,50 @@ static void ack_apic_level(unsigned int irq)
2683} 2610}
2684 2611
2685#ifdef CONFIG_INTR_REMAP 2612#ifdef CONFIG_INTR_REMAP
2613static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
2614{
2615 int apic, pin;
2616 struct irq_pin_list *entry;
2617
2618 entry = cfg->irq_2_pin;
2619 for (;;) {
2620
2621 if (!entry)
2622 break;
2623
2624 apic = entry->apic;
2625 pin = entry->pin;
2626 io_apic_eoi(apic, pin);
2627 entry = entry->next;
2628 }
2629}
2630
2631static void
2632eoi_ioapic_irq(struct irq_desc *desc)
2633{
2634 struct irq_cfg *cfg;
2635 unsigned long flags;
2636 unsigned int irq;
2637
2638 irq = desc->irq;
2639 cfg = desc->chip_data;
2640
2641 spin_lock_irqsave(&ioapic_lock, flags);
2642 __eoi_ioapic_irq(irq, cfg);
2643 spin_unlock_irqrestore(&ioapic_lock, flags);
2644}
2645
2686static void ir_ack_apic_edge(unsigned int irq) 2646static void ir_ack_apic_edge(unsigned int irq)
2687{ 2647{
2688#ifdef CONFIG_X86_X2APIC 2648 ack_APIC_irq();
2689 if (x2apic_enabled())
2690 return ack_x2apic_edge(irq);
2691#endif
2692 return ack_apic_edge(irq);
2693} 2649}
2694 2650
2695static void ir_ack_apic_level(unsigned int irq) 2651static void ir_ack_apic_level(unsigned int irq)
2696{ 2652{
2697#ifdef CONFIG_X86_X2APIC 2653 struct irq_desc *desc = irq_to_desc(irq);
2698 if (x2apic_enabled()) 2654
2699 return ack_x2apic_level(irq); 2655 ack_APIC_irq();
2700#endif 2656 eoi_ioapic_irq(desc);
2701 return ack_apic_level(irq);
2702} 2657}
2703#endif /* CONFIG_INTR_REMAP */ 2658#endif /* CONFIG_INTR_REMAP */
2704 2659
@@ -2903,7 +2858,7 @@ static inline void __init check_timer(void)
2903{ 2858{
2904 struct irq_desc *desc = irq_to_desc(0); 2859 struct irq_desc *desc = irq_to_desc(0);
2905 struct irq_cfg *cfg = desc->chip_data; 2860 struct irq_cfg *cfg = desc->chip_data;
2906 int cpu = boot_cpu_id; 2861 int node = cpu_to_node(boot_cpu_id);
2907 int apic1, pin1, apic2, pin2; 2862 int apic1, pin1, apic2, pin2;
2908 unsigned long flags; 2863 unsigned long flags;
2909 int no_pin1 = 0; 2864 int no_pin1 = 0;
@@ -2969,7 +2924,7 @@ static inline void __init check_timer(void)
2969 * Ok, does IRQ0 through the IOAPIC work? 2924 * Ok, does IRQ0 through the IOAPIC work?
2970 */ 2925 */
2971 if (no_pin1) { 2926 if (no_pin1) {
2972 add_pin_to_irq_cpu(cfg, cpu, apic1, pin1); 2927 add_pin_to_irq_node(cfg, node, apic1, pin1);
2973 setup_timer_IRQ0_pin(apic1, pin1, cfg->vector); 2928 setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
2974 } else { 2929 } else {
2975 /* for edge trigger, setup_IO_APIC_irq already 2930 /* for edge trigger, setup_IO_APIC_irq already
@@ -3006,7 +2961,7 @@ static inline void __init check_timer(void)
3006 /* 2961 /*
3007 * legacy devices should be connected to IO APIC #0 2962 * legacy devices should be connected to IO APIC #0
3008 */ 2963 */
3009 replace_pin_at_irq_cpu(cfg, cpu, apic1, pin1, apic2, pin2); 2964 replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2);
3010 setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); 2965 setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
3011 enable_8259A_irq(0); 2966 enable_8259A_irq(0);
3012 if (timer_irq_works()) { 2967 if (timer_irq_works()) {
@@ -3218,14 +3173,13 @@ static int nr_irqs_gsi = NR_IRQS_LEGACY;
3218/* 3173/*
3219 * Dynamic irq allocate and deallocation 3174 * Dynamic irq allocate and deallocation
3220 */ 3175 */
3221unsigned int create_irq_nr(unsigned int irq_want) 3176unsigned int create_irq_nr(unsigned int irq_want, int node)
3222{ 3177{
3223 /* Allocate an unused irq */ 3178 /* Allocate an unused irq */
3224 unsigned int irq; 3179 unsigned int irq;
3225 unsigned int new; 3180 unsigned int new;
3226 unsigned long flags; 3181 unsigned long flags;
3227 struct irq_cfg *cfg_new = NULL; 3182 struct irq_cfg *cfg_new = NULL;
3228 int cpu = boot_cpu_id;
3229 struct irq_desc *desc_new = NULL; 3183 struct irq_desc *desc_new = NULL;
3230 3184
3231 irq = 0; 3185 irq = 0;
@@ -3234,7 +3188,7 @@ unsigned int create_irq_nr(unsigned int irq_want)
3234 3188
3235 spin_lock_irqsave(&vector_lock, flags); 3189 spin_lock_irqsave(&vector_lock, flags);
3236 for (new = irq_want; new < nr_irqs; new++) { 3190 for (new = irq_want; new < nr_irqs; new++) {
3237 desc_new = irq_to_desc_alloc_cpu(new, cpu); 3191 desc_new = irq_to_desc_alloc_node(new, node);
3238 if (!desc_new) { 3192 if (!desc_new) {
3239 printk(KERN_INFO "can not get irq_desc for %d\n", new); 3193 printk(KERN_INFO "can not get irq_desc for %d\n", new);
3240 continue; 3194 continue;
@@ -3243,6 +3197,9 @@ unsigned int create_irq_nr(unsigned int irq_want)
3243 3197
3244 if (cfg_new->vector != 0) 3198 if (cfg_new->vector != 0)
3245 continue; 3199 continue;
3200
3201 desc_new = move_irq_desc(desc_new, node);
3202
3246 if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0) 3203 if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0)
3247 irq = new; 3204 irq = new;
3248 break; 3205 break;
@@ -3260,11 +3217,12 @@ unsigned int create_irq_nr(unsigned int irq_want)
3260 3217
3261int create_irq(void) 3218int create_irq(void)
3262{ 3219{
3220 int node = cpu_to_node(boot_cpu_id);
3263 unsigned int irq_want; 3221 unsigned int irq_want;
3264 int irq; 3222 int irq;
3265 3223
3266 irq_want = nr_irqs_gsi; 3224 irq_want = nr_irqs_gsi;
3267 irq = create_irq_nr(irq_want); 3225 irq = create_irq_nr(irq_want, node);
3268 3226
3269 if (irq == 0) 3227 if (irq == 0)
3270 irq = -1; 3228 irq = -1;
@@ -3366,7 +3324,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
3366} 3324}
3367 3325
3368#ifdef CONFIG_SMP 3326#ifdef CONFIG_SMP
3369static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) 3327static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
3370{ 3328{
3371 struct irq_desc *desc = irq_to_desc(irq); 3329 struct irq_desc *desc = irq_to_desc(irq);
3372 struct irq_cfg *cfg; 3330 struct irq_cfg *cfg;
@@ -3375,7 +3333,7 @@ static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
3375 3333
3376 dest = set_desc_affinity(desc, mask); 3334 dest = set_desc_affinity(desc, mask);
3377 if (dest == BAD_APICID) 3335 if (dest == BAD_APICID)
3378 return; 3336 return -1;
3379 3337
3380 cfg = desc->chip_data; 3338 cfg = desc->chip_data;
3381 3339
@@ -3387,13 +3345,15 @@ static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
3387 msg.address_lo |= MSI_ADDR_DEST_ID(dest); 3345 msg.address_lo |= MSI_ADDR_DEST_ID(dest);
3388 3346
3389 write_msi_msg_desc(desc, &msg); 3347 write_msi_msg_desc(desc, &msg);
3348
3349 return 0;
3390} 3350}
3391#ifdef CONFIG_INTR_REMAP 3351#ifdef CONFIG_INTR_REMAP
3392/* 3352/*
3393 * Migrate the MSI irq to another cpumask. This migration is 3353 * Migrate the MSI irq to another cpumask. This migration is
3394 * done in the process context using interrupt-remapping hardware. 3354 * done in the process context using interrupt-remapping hardware.
3395 */ 3355 */
3396static void 3356static int
3397ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) 3357ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
3398{ 3358{
3399 struct irq_desc *desc = irq_to_desc(irq); 3359 struct irq_desc *desc = irq_to_desc(irq);
@@ -3402,11 +3362,11 @@ ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
3402 struct irte irte; 3362 struct irte irte;
3403 3363
3404 if (get_irte(irq, &irte)) 3364 if (get_irte(irq, &irte))
3405 return; 3365 return -1;
3406 3366
3407 dest = set_desc_affinity(desc, mask); 3367 dest = set_desc_affinity(desc, mask);
3408 if (dest == BAD_APICID) 3368 if (dest == BAD_APICID)
3409 return; 3369 return -1;
3410 3370
3411 irte.vector = cfg->vector; 3371 irte.vector = cfg->vector;
3412 irte.dest_id = IRTE_DEST(dest); 3372 irte.dest_id = IRTE_DEST(dest);
@@ -3423,6 +3383,8 @@ ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
3423 */ 3383 */
3424 if (cfg->move_in_progress) 3384 if (cfg->move_in_progress)
3425 send_cleanup_vector(cfg); 3385 send_cleanup_vector(cfg);
3386
3387 return 0;
3426} 3388}
3427 3389
3428#endif 3390#endif
@@ -3518,15 +3480,17 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
3518 unsigned int irq_want; 3480 unsigned int irq_want;
3519 struct intel_iommu *iommu = NULL; 3481 struct intel_iommu *iommu = NULL;
3520 int index = 0; 3482 int index = 0;
3483 int node;
3521 3484
3522 /* x86 doesn't support multiple MSI yet */ 3485 /* x86 doesn't support multiple MSI yet */
3523 if (type == PCI_CAP_ID_MSI && nvec > 1) 3486 if (type == PCI_CAP_ID_MSI && nvec > 1)
3524 return 1; 3487 return 1;
3525 3488
3489 node = dev_to_node(&dev->dev);
3526 irq_want = nr_irqs_gsi; 3490 irq_want = nr_irqs_gsi;
3527 sub_handle = 0; 3491 sub_handle = 0;
3528 list_for_each_entry(msidesc, &dev->msi_list, list) { 3492 list_for_each_entry(msidesc, &dev->msi_list, list) {
3529 irq = create_irq_nr(irq_want); 3493 irq = create_irq_nr(irq_want, node);
3530 if (irq == 0) 3494 if (irq == 0)
3531 return -1; 3495 return -1;
3532 irq_want = irq + 1; 3496 irq_want = irq + 1;
@@ -3576,7 +3540,7 @@ void arch_teardown_msi_irq(unsigned int irq)
3576 3540
3577#if defined (CONFIG_DMAR) || defined (CONFIG_INTR_REMAP) 3541#if defined (CONFIG_DMAR) || defined (CONFIG_INTR_REMAP)
3578#ifdef CONFIG_SMP 3542#ifdef CONFIG_SMP
3579static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) 3543static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3580{ 3544{
3581 struct irq_desc *desc = irq_to_desc(irq); 3545 struct irq_desc *desc = irq_to_desc(irq);
3582 struct irq_cfg *cfg; 3546 struct irq_cfg *cfg;
@@ -3585,7 +3549,7 @@ static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3585 3549
3586 dest = set_desc_affinity(desc, mask); 3550 dest = set_desc_affinity(desc, mask);
3587 if (dest == BAD_APICID) 3551 if (dest == BAD_APICID)
3588 return; 3552 return -1;
3589 3553
3590 cfg = desc->chip_data; 3554 cfg = desc->chip_data;
3591 3555
@@ -3597,6 +3561,8 @@ static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3597 msg.address_lo |= MSI_ADDR_DEST_ID(dest); 3561 msg.address_lo |= MSI_ADDR_DEST_ID(dest);
3598 3562
3599 dmar_msi_write(irq, &msg); 3563 dmar_msi_write(irq, &msg);
3564
3565 return 0;
3600} 3566}
3601 3567
3602#endif /* CONFIG_SMP */ 3568#endif /* CONFIG_SMP */
@@ -3630,7 +3596,7 @@ int arch_setup_dmar_msi(unsigned int irq)
3630#ifdef CONFIG_HPET_TIMER 3596#ifdef CONFIG_HPET_TIMER
3631 3597
3632#ifdef CONFIG_SMP 3598#ifdef CONFIG_SMP
3633static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask) 3599static int hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3634{ 3600{
3635 struct irq_desc *desc = irq_to_desc(irq); 3601 struct irq_desc *desc = irq_to_desc(irq);
3636 struct irq_cfg *cfg; 3602 struct irq_cfg *cfg;
@@ -3639,7 +3605,7 @@ static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3639 3605
3640 dest = set_desc_affinity(desc, mask); 3606 dest = set_desc_affinity(desc, mask);
3641 if (dest == BAD_APICID) 3607 if (dest == BAD_APICID)
3642 return; 3608 return -1;
3643 3609
3644 cfg = desc->chip_data; 3610 cfg = desc->chip_data;
3645 3611
@@ -3651,6 +3617,8 @@ static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3651 msg.address_lo |= MSI_ADDR_DEST_ID(dest); 3617 msg.address_lo |= MSI_ADDR_DEST_ID(dest);
3652 3618
3653 hpet_msi_write(irq, &msg); 3619 hpet_msi_write(irq, &msg);
3620
3621 return 0;
3654} 3622}
3655 3623
3656#endif /* CONFIG_SMP */ 3624#endif /* CONFIG_SMP */
@@ -3707,7 +3675,7 @@ static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector)
3707 write_ht_irq_msg(irq, &msg); 3675 write_ht_irq_msg(irq, &msg);
3708} 3676}
3709 3677
3710static void set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask) 3678static int set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask)
3711{ 3679{
3712 struct irq_desc *desc = irq_to_desc(irq); 3680 struct irq_desc *desc = irq_to_desc(irq);
3713 struct irq_cfg *cfg; 3681 struct irq_cfg *cfg;
@@ -3715,11 +3683,13 @@ static void set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask)
3715 3683
3716 dest = set_desc_affinity(desc, mask); 3684 dest = set_desc_affinity(desc, mask);
3717 if (dest == BAD_APICID) 3685 if (dest == BAD_APICID)
3718 return; 3686 return -1;
3719 3687
3720 cfg = desc->chip_data; 3688 cfg = desc->chip_data;
3721 3689
3722 target_ht_irq(irq, dest, cfg->vector); 3690 target_ht_irq(irq, dest, cfg->vector);
3691
3692 return 0;
3723} 3693}
3724 3694
3725#endif 3695#endif
@@ -3794,6 +3764,8 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
3794 unsigned long flags; 3764 unsigned long flags;
3795 int err; 3765 int err;
3796 3766
3767 BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
3768
3797 cfg = irq_cfg(irq); 3769 cfg = irq_cfg(irq);
3798 3770
3799 err = assign_irq_vector(irq, cfg, eligible_cpu); 3771 err = assign_irq_vector(irq, cfg, eligible_cpu);
@@ -3807,15 +3779,13 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
3807 3779
3808 mmr_value = 0; 3780 mmr_value = 0;
3809 entry = (struct uv_IO_APIC_route_entry *)&mmr_value; 3781 entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
3810 BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); 3782 entry->vector = cfg->vector;
3811 3783 entry->delivery_mode = apic->irq_delivery_mode;
3812 entry->vector = cfg->vector; 3784 entry->dest_mode = apic->irq_dest_mode;
3813 entry->delivery_mode = apic->irq_delivery_mode; 3785 entry->polarity = 0;
3814 entry->dest_mode = apic->irq_dest_mode; 3786 entry->trigger = 0;
3815 entry->polarity = 0; 3787 entry->mask = 0;
3816 entry->trigger = 0; 3788 entry->dest = apic->cpu_mask_to_apicid(eligible_cpu);
3817 entry->mask = 0;
3818 entry->dest = apic->cpu_mask_to_apicid(eligible_cpu);
3819 3789
3820 mmr_pnode = uv_blade_to_pnode(mmr_blade); 3790 mmr_pnode = uv_blade_to_pnode(mmr_blade);
3821 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); 3791 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
@@ -3833,10 +3803,10 @@ void arch_disable_uv_irq(int mmr_blade, unsigned long mmr_offset)
3833 struct uv_IO_APIC_route_entry *entry; 3803 struct uv_IO_APIC_route_entry *entry;
3834 int mmr_pnode; 3804 int mmr_pnode;
3835 3805
3806 BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
3807
3836 mmr_value = 0; 3808 mmr_value = 0;
3837 entry = (struct uv_IO_APIC_route_entry *)&mmr_value; 3809 entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
3838 BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
3839
3840 entry->mask = 1; 3810 entry->mask = 1;
3841 3811
3842 mmr_pnode = uv_blade_to_pnode(mmr_blade); 3812 mmr_pnode = uv_blade_to_pnode(mmr_blade);
@@ -3900,6 +3870,71 @@ int __init arch_probe_nr_irqs(void)
3900} 3870}
3901#endif 3871#endif
3902 3872
/*
 * Set up the IO-APIC routing for @irq using the ioapic, pin, trigger and
 * polarity carried in @irq_attr.
 *
 * @dev:      device the IRQ belongs to; may be NULL, in which case the node
 *            of the boot CPU is used for the descriptor allocation.
 * @irq:      IRQ number to route.
 * @irq_attr: source of the ioapic index, pin, trigger and polarity.
 *
 * Returns -EINVAL when IO_APIC_IRQ(irq) is false, otherwise 0.
 * NOTE(review): a failed irq_desc allocation is logged but also returns 0,
 * so callers cannot tell it apart from success -- confirm this is intended.
 */
3873static int __io_apic_set_pci_routing(struct device *dev, int irq,
3874 struct io_apic_irq_attr *irq_attr)
3875{
3876 struct irq_desc *desc;
3877 struct irq_cfg *cfg;
3878 int node;
3879 int ioapic, pin;
3880 int trigger, polarity;
3881
3882 ioapic = irq_attr->ioapic;
3883 if (!IO_APIC_IRQ(irq)) {
3884 apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
3885 ioapic);
3886 return -EINVAL;
3887 }
3888
/* Pick the node for the descriptor: the device's node, else the boot CPU's. */
3889 if (dev)
3890 node = dev_to_node(dev);
3891 else
3892 node = cpu_to_node(boot_cpu_id);
3893
3894 desc = irq_to_desc_alloc_node(irq, node);
3895 if (!desc) {
3896 printk(KERN_INFO "can not get irq_desc %d\n", irq);
3897 return 0;
3898 }
3899
3900 pin = irq_attr->ioapic_pin;
3901 trigger = irq_attr->trigger;
3902 polarity = irq_attr->polarity;
3903
3904 /*
3905 * IRQs < 16 are already in the irq_2_pin[] map
3906 */
3907 if (irq >= NR_IRQS_LEGACY) {
3908 cfg = desc->chip_data;
3909 add_pin_to_irq_node(cfg, node, ioapic, pin);
3910 }
3911
/* Hand the (ioapic, pin) pair and its trigger/polarity to the common setup path. */
3912 setup_IO_APIC_irq(ioapic, pin, irq, desc, trigger, polarity);
3913
3914 return 0;
3915}
3916
/*
 * Route @irq through the IO-APIC pin described by @irq_attr, at most once
 * per (ioapic, pin) pair.
 *
 * The per-ioapic pin_programmed bitmap records which pins have been seen.
 * If the pin's bit is already set, the call logs a debug message and
 * returns 0 without reprogramming. Otherwise the bit is set and the work
 * is delegated to __io_apic_set_pci_routing(), whose return value is
 * passed back to the caller.
 */
3917int io_apic_set_pci_routing(struct device *dev, int irq,
3918 struct io_apic_irq_attr *irq_attr)
3919{
3920 int ioapic, pin;
3921 /*
3922 * Avoid pin reprogramming. PRTs typically include entries
3923 * with redundant pin->gsi mappings (but unique PCI devices);
3924 * we only program the IOAPIC on the first.
3925 */
3926 ioapic = irq_attr->ioapic;
3927 pin = irq_attr->ioapic_pin;
3928 if (test_bit(pin, mp_ioapic_routing[ioapic].pin_programmed)) {
3929 pr_debug("Pin %d-%d already programmed\n",
3930 mp_ioapics[ioapic].apicid, pin);
3931 return 0;
3932 }
/*
 * NOTE(review): the bit is set before the routing attempt, so if the helper
 * below returns an error the pin stays marked as programmed and later calls
 * for it return 0 early -- confirm this is intended.
 */
3933 set_bit(pin, mp_ioapic_routing[ioapic].pin_programmed);
3934
3935 return __io_apic_set_pci_routing(dev, irq, irq_attr);
3936}
3937
3903/* -------------------------------------------------------------------------- 3938/* --------------------------------------------------------------------------
3904 ACPI-based IOAPIC Configuration 3939 ACPI-based IOAPIC Configuration
3905 -------------------------------------------------------------------------- */ 3940 -------------------------------------------------------------------------- */
@@ -3980,6 +4015,7 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
3980 4015
3981 return apic_id; 4016 return apic_id;
3982} 4017}
4018#endif
3983 4019
3984int __init io_apic_get_version(int ioapic) 4020int __init io_apic_get_version(int ioapic)
3985{ 4021{
@@ -3992,39 +4028,6 @@ int __init io_apic_get_version(int ioapic)
3992 4028
3993 return reg_01.bits.version; 4029 return reg_01.bits.version;
3994} 4030}
3995#endif
3996
3997int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity)
3998{
3999 struct irq_desc *desc;
4000 struct irq_cfg *cfg;
4001 int cpu = boot_cpu_id;
4002
4003 if (!IO_APIC_IRQ(irq)) {
4004 apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
4005 ioapic);
4006 return -EINVAL;
4007 }
4008
4009 desc = irq_to_desc_alloc_cpu(irq, cpu);
4010 if (!desc) {
4011 printk(KERN_INFO "can not get irq_desc %d\n", irq);
4012 return 0;
4013 }
4014
4015 /*
4016 * IRQs < 16 are already in the irq_2_pin[] map
4017 */
4018 if (irq >= NR_IRQS_LEGACY) {
4019 cfg = desc->chip_data;
4020 add_pin_to_irq_cpu(cfg, cpu, ioapic, pin);
4021 }
4022
4023 setup_IO_APIC_irq(ioapic, pin, irq, desc, triggering, polarity);
4024
4025 return 0;
4026}
4027
4028 4031
4029int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) 4032int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
4030{ 4033{
@@ -4055,51 +4058,44 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
4055#ifdef CONFIG_SMP 4058#ifdef CONFIG_SMP
4056void __init setup_ioapic_dest(void) 4059void __init setup_ioapic_dest(void)
4057{ 4060{
4058 int pin, ioapic, irq, irq_entry; 4061 int pin, ioapic = 0, irq, irq_entry;
4059 struct irq_desc *desc; 4062 struct irq_desc *desc;
4060 struct irq_cfg *cfg;
4061 const struct cpumask *mask; 4063 const struct cpumask *mask;
4062 4064
4063 if (skip_ioapic_setup == 1) 4065 if (skip_ioapic_setup == 1)
4064 return; 4066 return;
4065 4067
4066 for (ioapic = 0; ioapic < nr_ioapics; ioapic++) { 4068#ifdef CONFIG_ACPI
4067 for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { 4069 if (!acpi_disabled && acpi_ioapic) {
4068 irq_entry = find_irq_entry(ioapic, pin, mp_INT); 4070 ioapic = mp_find_ioapic(0);
4069 if (irq_entry == -1) 4071 if (ioapic < 0)
4070 continue; 4072 ioapic = 0;
4071 irq = pin_2_irq(irq_entry, ioapic, pin); 4073 }
4072 4074#endif
4073 /* setup_IO_APIC_irqs could fail to get vector for some device
4074 * when you have too many devices, because at that time only boot
4075 * cpu is online.
4076 */
4077 desc = irq_to_desc(irq);
4078 cfg = desc->chip_data;
4079 if (!cfg->vector) {
4080 setup_IO_APIC_irq(ioapic, pin, irq, desc,
4081 irq_trigger(irq_entry),
4082 irq_polarity(irq_entry));
4083 continue;
4084 4075
4085 } 4076 for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
4077 irq_entry = find_irq_entry(ioapic, pin, mp_INT);
4078 if (irq_entry == -1)
4079 continue;
4080 irq = pin_2_irq(irq_entry, ioapic, pin);
4086 4081
4087 /* 4082 desc = irq_to_desc(irq);
4088 * Honour affinities which have been set in early boot
4089 */
4090 if (desc->status &
4091 (IRQ_NO_BALANCING | IRQ_AFFINITY_SET))
4092 mask = desc->affinity;
4093 else
4094 mask = apic->target_cpus();
4095 4083
4096 if (intr_remapping_enabled) 4084 /*
4097 set_ir_ioapic_affinity_irq_desc(desc, mask); 4085 * Honour affinities which have been set in early boot
4098 else 4086 */
4099 set_ioapic_affinity_irq_desc(desc, mask); 4087 if (desc->status &
4100 } 4088 (IRQ_NO_BALANCING | IRQ_AFFINITY_SET))
4089 mask = desc->affinity;
4090 else
4091 mask = apic->target_cpus();
4101 4092
4093 if (intr_remapping_enabled)
4094 set_ir_ioapic_affinity_irq_desc(desc, mask);
4095 else
4096 set_ioapic_affinity_irq_desc(desc, mask);
4102 } 4097 }
4098
4103} 4099}
4104#endif 4100#endif
4105 4101
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
index ce4fbfa315a1..b3025b43b63a 100644
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -66,7 +66,7 @@ static inline unsigned int get_nmi_count(int cpu)
66 66
67static inline int mce_in_progress(void) 67static inline int mce_in_progress(void)
68{ 68{
69#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE) 69#if defined(CONFIG_X86_NEW_MCE)
70 return atomic_read(&mce_entry) > 0; 70 return atomic_read(&mce_entry) > 0;
71#endif 71#endif
72 return 0; 72 return 0;
@@ -104,7 +104,7 @@ static __init void nmi_cpu_busy(void *data)
104} 104}
105#endif 105#endif
106 106
107static void report_broken_nmi(int cpu, int *prev_nmi_count) 107static void report_broken_nmi(int cpu, unsigned int *prev_nmi_count)
108{ 108{
109 printk(KERN_CONT "\n"); 109 printk(KERN_CONT "\n");
110 110
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index 01eda2ac65e4..440a8bccd91a 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -160,7 +160,6 @@ extern struct apic apic_summit;
160extern struct apic apic_bigsmp; 160extern struct apic apic_bigsmp;
161extern struct apic apic_es7000; 161extern struct apic apic_es7000;
162extern struct apic apic_es7000_cluster; 162extern struct apic apic_es7000_cluster;
163extern struct apic apic_default;
164 163
165struct apic *apic = &apic_default; 164struct apic *apic = &apic_default;
166EXPORT_SYMBOL_GPL(apic); 165EXPORT_SYMBOL_GPL(apic);
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
index 1783652bb0e5..bc3e880f9b82 100644
--- a/arch/x86/kernel/apic/probe_64.c
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -50,7 +50,7 @@ static struct apic *apic_probe[] __initdata = {
50void __init default_setup_apic_routing(void) 50void __init default_setup_apic_routing(void)
51{ 51{
52#ifdef CONFIG_X86_X2APIC 52#ifdef CONFIG_X86_X2APIC
53 if (x2apic && (apic != &apic_x2apic_phys && 53 if (x2apic_mode && (apic != &apic_x2apic_phys &&
54#ifdef CONFIG_X86_UV 54#ifdef CONFIG_X86_UV
55 apic != &apic_x2apic_uv_x && 55 apic != &apic_x2apic_uv_x &&
56#endif 56#endif
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
index 9cfe1f415d81..344eee4ac0a4 100644
--- a/arch/x86/kernel/apic/summit_32.c
+++ b/arch/x86/kernel/apic/summit_32.c
@@ -173,13 +173,6 @@ static inline int is_WPEG(struct rio_detail *rio){
173 rio->type == LookOutAWPEG || rio->type == LookOutBWPEG); 173 rio->type == LookOutAWPEG || rio->type == LookOutBWPEG);
174} 174}
175 175
176
177/* In clustered mode, the high nibble of APIC ID is a cluster number.
178 * The low nibble is a 4-bit bitmap. */
179#define XAPIC_DEST_CPUS_SHIFT 4
180#define XAPIC_DEST_CPUS_MASK ((1u << XAPIC_DEST_CPUS_SHIFT) - 1)
181#define XAPIC_DEST_CLUSTER_MASK (XAPIC_DEST_CPUS_MASK << XAPIC_DEST_CPUS_SHIFT)
182
183#define SUMMIT_APIC_DFR_VALUE (APIC_DFR_CLUSTER) 176#define SUMMIT_APIC_DFR_VALUE (APIC_DFR_CLUSTER)
184 177
185static const struct cpumask *summit_target_cpus(void) 178static const struct cpumask *summit_target_cpus(void)
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 4a903e2f0d17..8e4cbb255c38 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -10,7 +10,7 @@
10#include <asm/apic.h> 10#include <asm/apic.h>
11#include <asm/ipi.h> 11#include <asm/ipi.h>
12 12
13DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid); 13static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
14 14
15static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) 15static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
16{ 16{
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 2bda69352976..096d19aea2f7 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -105,7 +105,7 @@ static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask)
105 cpumask_set_cpu(cpu, retmask); 105 cpumask_set_cpu(cpu, retmask);
106} 106}
107 107
108static int uv_wakeup_secondary(int phys_apicid, unsigned long start_rip) 108static int __cpuinit uv_wakeup_secondary(int phys_apicid, unsigned long start_rip)
109{ 109{
110#ifdef CONFIG_SMP 110#ifdef CONFIG_SMP
111 unsigned long val; 111 unsigned long val;
@@ -463,7 +463,7 @@ static void uv_heartbeat(unsigned long ignored)
463 uv_set_scir_bits(bits); 463 uv_set_scir_bits(bits);
464 464
465 /* enable next timer period */ 465 /* enable next timer period */
466 mod_timer(timer, jiffies + SCIR_CPU_HB_INTERVAL); 466 mod_timer_pinned(timer, jiffies + SCIR_CPU_HB_INTERVAL);
467} 467}
468 468
469static void __cpuinit uv_heartbeat_enable(int cpu) 469static void __cpuinit uv_heartbeat_enable(int cpu)
@@ -562,7 +562,7 @@ void __init uv_system_init(void)
562 union uvh_node_id_u node_id; 562 union uvh_node_id_u node_id;
563 unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size; 563 unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size;
564 int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val; 564 int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val;
565 int max_pnode = 0; 565 int gnode_extra, max_pnode = 0;
566 unsigned long mmr_base, present, paddr; 566 unsigned long mmr_base, present, paddr;
567 unsigned short pnode_mask; 567 unsigned short pnode_mask;
568 568
@@ -574,6 +574,13 @@ void __init uv_system_init(void)
574 mmr_base = 574 mmr_base =
575 uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) & 575 uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) &
576 ~UV_MMR_ENABLE; 576 ~UV_MMR_ENABLE;
577 pnode_mask = (1 << n_val) - 1;
578 node_id.v = uv_read_local_mmr(UVH_NODE_ID);
579 gnode_extra = (node_id.s.node_id & ~((1 << n_val) - 1)) >> 1;
580 gnode_upper = ((unsigned long)gnode_extra << m_val);
581 printk(KERN_DEBUG "UV: N %d, M %d, gnode_upper 0x%lx, gnode_extra 0x%x\n",
582 n_val, m_val, gnode_upper, gnode_extra);
583
577 printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base); 584 printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base);
578 585
579 for(i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++) 586 for(i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++)
@@ -583,15 +590,18 @@ void __init uv_system_init(void)
583 590
584 bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades(); 591 bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades();
585 uv_blade_info = kmalloc(bytes, GFP_KERNEL); 592 uv_blade_info = kmalloc(bytes, GFP_KERNEL);
593 BUG_ON(!uv_blade_info);
586 594
587 get_lowmem_redirect(&lowmem_redir_base, &lowmem_redir_size); 595 get_lowmem_redirect(&lowmem_redir_base, &lowmem_redir_size);
588 596
589 bytes = sizeof(uv_node_to_blade[0]) * num_possible_nodes(); 597 bytes = sizeof(uv_node_to_blade[0]) * num_possible_nodes();
590 uv_node_to_blade = kmalloc(bytes, GFP_KERNEL); 598 uv_node_to_blade = kmalloc(bytes, GFP_KERNEL);
599 BUG_ON(!uv_node_to_blade);
591 memset(uv_node_to_blade, 255, bytes); 600 memset(uv_node_to_blade, 255, bytes);
592 601
593 bytes = sizeof(uv_cpu_to_blade[0]) * num_possible_cpus(); 602 bytes = sizeof(uv_cpu_to_blade[0]) * num_possible_cpus();
594 uv_cpu_to_blade = kmalloc(bytes, GFP_KERNEL); 603 uv_cpu_to_blade = kmalloc(bytes, GFP_KERNEL);
604 BUG_ON(!uv_cpu_to_blade);
595 memset(uv_cpu_to_blade, 255, bytes); 605 memset(uv_cpu_to_blade, 255, bytes);
596 606
597 blade = 0; 607 blade = 0;
@@ -607,11 +617,6 @@ void __init uv_system_init(void)
607 } 617 }
608 } 618 }
609 619
610 pnode_mask = (1 << n_val) - 1;
611 node_id.v = uv_read_local_mmr(UVH_NODE_ID);
612 gnode_upper = (((unsigned long)node_id.s.node_id) &
613 ~((1 << n_val) - 1)) << m_val;
614
615 uv_bios_init(); 620 uv_bios_init();
616 uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, 621 uv_bios_get_sn_info(0, &uv_type, &sn_partition_id,
617 &sn_coherency_id, &sn_region_size); 622 &sn_coherency_id, &sn_region_size);
@@ -634,6 +639,7 @@ void __init uv_system_init(void)
634 uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask; 639 uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask;
635 uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1; 640 uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1;
636 uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper; 641 uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper;
642 uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra;
637 uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base; 643 uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base;
638 uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id; 644 uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id;
639 uv_cpu_hub_info(cpu)->scir.offset = SCIR_LOCAL_MMR_BASE + lcpu; 645 uv_cpu_hub_info(cpu)->scir.offset = SCIR_LOCAL_MMR_BASE + lcpu;
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 49e0939bac42..79302e9a33a4 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -1233,9 +1233,9 @@ static int suspend(int vetoable)
1233 int err; 1233 int err;
1234 struct apm_user *as; 1234 struct apm_user *as;
1235 1235
1236 device_suspend(PMSG_SUSPEND); 1236 dpm_suspend_start(PMSG_SUSPEND);
1237 1237
1238 device_power_down(PMSG_SUSPEND); 1238 dpm_suspend_noirq(PMSG_SUSPEND);
1239 1239
1240 local_irq_disable(); 1240 local_irq_disable();
1241 sysdev_suspend(PMSG_SUSPEND); 1241 sysdev_suspend(PMSG_SUSPEND);
@@ -1259,9 +1259,9 @@ static int suspend(int vetoable)
1259 sysdev_resume(); 1259 sysdev_resume();
1260 local_irq_enable(); 1260 local_irq_enable();
1261 1261
1262 device_power_up(PMSG_RESUME); 1262 dpm_resume_noirq(PMSG_RESUME);
1263 1263
1264 device_resume(PMSG_RESUME); 1264 dpm_resume_end(PMSG_RESUME);
1265 queue_event(APM_NORMAL_RESUME, NULL); 1265 queue_event(APM_NORMAL_RESUME, NULL);
1266 spin_lock(&user_list_lock); 1266 spin_lock(&user_list_lock);
1267 for (as = user_list; as != NULL; as = as->next) { 1267 for (as = user_list; as != NULL; as = as->next) {
@@ -1277,7 +1277,7 @@ static void standby(void)
1277{ 1277{
1278 int err; 1278 int err;
1279 1279
1280 device_power_down(PMSG_SUSPEND); 1280 dpm_suspend_noirq(PMSG_SUSPEND);
1281 1281
1282 local_irq_disable(); 1282 local_irq_disable();
1283 sysdev_suspend(PMSG_SUSPEND); 1283 sysdev_suspend(PMSG_SUSPEND);
@@ -1291,7 +1291,7 @@ static void standby(void)
1291 sysdev_resume(); 1291 sysdev_resume();
1292 local_irq_enable(); 1292 local_irq_enable();
1293 1293
1294 device_power_up(PMSG_RESUME); 1294 dpm_resume_noirq(PMSG_RESUME);
1295} 1295}
1296 1296
1297static apm_event_t get_event(void) 1297static apm_event_t get_event(void)
@@ -1376,7 +1376,7 @@ static void check_events(void)
1376 ignore_bounce = 1; 1376 ignore_bounce = 1;
1377 if ((event != APM_NORMAL_RESUME) 1377 if ((event != APM_NORMAL_RESUME)
1378 || (ignore_normal_resume == 0)) { 1378 || (ignore_normal_resume == 0)) {
1379 device_resume(PMSG_RESUME); 1379 dpm_resume_end(PMSG_RESUME);
1380 queue_event(event, NULL); 1380 queue_event(event, NULL);
1381 } 1381 }
1382 ignore_normal_resume = 0; 1382 ignore_normal_resume = 0;
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 5a6aa1c1162f..dfdbf6403895 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -126,6 +126,7 @@ void foo(void)
126#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE) 126#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE)
127 BLANK(); 127 BLANK();
128 OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); 128 OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
129 OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending);
129 OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir); 130 OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir);
130 131
131 BLANK(); 132 BLANK();
@@ -146,4 +147,5 @@ void foo(void)
146 OFFSET(BP_loadflags, boot_params, hdr.loadflags); 147 OFFSET(BP_loadflags, boot_params, hdr.loadflags);
147 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); 148 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
148 OFFSET(BP_version, boot_params, hdr.version); 149 OFFSET(BP_version, boot_params, hdr.version);
150 OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
149} 151}
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index e72f062fb4b5..898ecc47e129 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -125,6 +125,7 @@ int main(void)
125 OFFSET(BP_loadflags, boot_params, hdr.loadflags); 125 OFFSET(BP_loadflags, boot_params, hdr.loadflags);
126 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); 126 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
127 OFFSET(BP_version, boot_params, hdr.version); 127 OFFSET(BP_version, boot_params, hdr.version);
128 OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
128 129
129 BLANK(); 130 BLANK();
130 DEFINE(PAGE_SIZE_asm, PAGE_SIZE); 131 DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 4e242f9a06e4..3efcb2b96a15 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -1,5 +1,5 @@
1# 1#
2# Makefile for x86-compatible CPU details and quirks 2# Makefile for x86-compatible CPU details, features and quirks
3# 3#
4 4
5# Don't trace early stages of a secondary CPU boot 5# Don't trace early stages of a secondary CPU boot
@@ -23,11 +23,13 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o
23obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o 23obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o
24obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o 24obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o
25 25
26obj-$(CONFIG_X86_MCE) += mcheck/ 26obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
27obj-$(CONFIG_MTRR) += mtrr/
28obj-$(CONFIG_CPU_FREQ) += cpufreq/
29 27
30obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o 28obj-$(CONFIG_X86_MCE) += mcheck/
29obj-$(CONFIG_MTRR) += mtrr/
30obj-$(CONFIG_CPU_FREQ) += cpufreq/
31
32obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
31 33
32quiet_cmd_mkcapflags = MKCAP $@ 34quiet_cmd_mkcapflags = MKCAP $@
33 cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@ 35 cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 7e4a459daa64..e5b27d8f1b47 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -6,6 +6,7 @@
6#include <asm/processor.h> 6#include <asm/processor.h>
7#include <asm/apic.h> 7#include <asm/apic.h>
8#include <asm/cpu.h> 8#include <asm/cpu.h>
9#include <asm/pci-direct.h>
9 10
10#ifdef CONFIG_X86_64 11#ifdef CONFIG_X86_64
11# include <asm/numa_64.h> 12# include <asm/numa_64.h>
@@ -272,7 +273,7 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
272#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) 273#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
273 int cpu = smp_processor_id(); 274 int cpu = smp_processor_id();
274 int node; 275 int node;
275 unsigned apicid = hard_smp_processor_id(); 276 unsigned apicid = cpu_has_apic ? hard_smp_processor_id() : c->apicid;
276 277
277 node = c->phys_proc_id; 278 node = c->phys_proc_id;
278 if (apicid_to_node[apicid] != NUMA_NO_NODE) 279 if (apicid_to_node[apicid] != NUMA_NO_NODE)
@@ -351,6 +352,15 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
351 (c->x86_model == 8 && c->x86_mask >= 8)) 352 (c->x86_model == 8 && c->x86_mask >= 8))
352 set_cpu_cap(c, X86_FEATURE_K6_MTRR); 353 set_cpu_cap(c, X86_FEATURE_K6_MTRR);
353#endif 354#endif
355#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI)
356 /* check CPU config space for extended APIC ID */
357 if (c->x86 >= 0xf) {
358 unsigned int val;
359 val = read_pci_config(0, 24, 0, 0x68);
360 if ((val & ((1 << 17) | (1 << 18))) == ((1 << 17) | (1 << 18)))
361 set_cpu_cap(c, X86_FEATURE_EXTD_APICID);
362 }
363#endif
354} 364}
355 365
356static void __cpuinit init_amd(struct cpuinfo_x86 *c) 366static void __cpuinit init_amd(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index c1caefc82e62..9fa33886c0d7 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -13,6 +13,7 @@
13#include <linux/io.h> 13#include <linux/io.h>
14 14
15#include <asm/stackprotector.h> 15#include <asm/stackprotector.h>
16#include <asm/perf_counter.h>
16#include <asm/mmu_context.h> 17#include <asm/mmu_context.h>
17#include <asm/hypervisor.h> 18#include <asm/hypervisor.h>
18#include <asm/processor.h> 19#include <asm/processor.h>
@@ -114,6 +115,13 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
114} }; 115} };
115EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); 116EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
116 117
118static int __init x86_xsave_setup(char *s)
119{
120 setup_clear_cpu_cap(X86_FEATURE_XSAVE);
121 return 1;
122}
123__setup("noxsave", x86_xsave_setup);
124
117#ifdef CONFIG_X86_32 125#ifdef CONFIG_X86_32
118static int cachesize_override __cpuinitdata = -1; 126static int cachesize_override __cpuinitdata = -1;
119static int disable_x86_serial_nr __cpuinitdata = 1; 127static int disable_x86_serial_nr __cpuinitdata = 1;
@@ -292,7 +300,8 @@ static const char *__cpuinit table_lookup_model(struct cpuinfo_x86 *c)
292 return NULL; /* Not found */ 300 return NULL; /* Not found */
293} 301}
294 302
295__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata; 303__u32 cpu_caps_cleared[NCAPINTS] __cpuinitdata;
304__u32 cpu_caps_set[NCAPINTS] __cpuinitdata;
296 305
297void load_percpu_segment(int cpu) 306void load_percpu_segment(int cpu)
298{ 307{
@@ -478,7 +487,6 @@ out:
478static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) 487static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
479{ 488{
480 char *v = c->x86_vendor_id; 489 char *v = c->x86_vendor_id;
481 static int printed;
482 int i; 490 int i;
483 491
484 for (i = 0; i < X86_VENDOR_NUM; i++) { 492 for (i = 0; i < X86_VENDOR_NUM; i++) {
@@ -495,13 +503,9 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
495 } 503 }
496 } 504 }
497 505
498 if (!printed) { 506 printk_once(KERN_ERR
499 printed++; 507 "CPU: vendor_id '%s' unknown, using generic init.\n" \
500 printk(KERN_ERR 508 "CPU: Your system may be unstable.\n", v);
501 "CPU: vendor_id '%s' unknown, using generic init.\n", v);
502
503 printk(KERN_ERR "CPU: Your system may be unstable.\n");
504 }
505 509
506 c->x86_vendor = X86_VENDOR_UNKNOWN; 510 c->x86_vendor = X86_VENDOR_UNKNOWN;
507 this_cpu = &default_cpu; 511 this_cpu = &default_cpu;
@@ -761,6 +765,12 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
761 if (this_cpu->c_identify) 765 if (this_cpu->c_identify)
762 this_cpu->c_identify(c); 766 this_cpu->c_identify(c);
763 767
768 /* Clear/Set all flags overriden by options, after probe */
769 for (i = 0; i < NCAPINTS; i++) {
770 c->x86_capability[i] &= ~cpu_caps_cleared[i];
771 c->x86_capability[i] |= cpu_caps_set[i];
772 }
773
764#ifdef CONFIG_X86_64 774#ifdef CONFIG_X86_64
765 c->apicid = apic->phys_pkg_id(c->initial_apicid, 0); 775 c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
766#endif 776#endif
@@ -806,6 +816,16 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
806#endif 816#endif
807 817
808 init_hypervisor(c); 818 init_hypervisor(c);
819
820 /*
821 * Clear/Set all flags overriden by options, need do it
822 * before following smp all cpus cap AND.
823 */
824 for (i = 0; i < NCAPINTS; i++) {
825 c->x86_capability[i] &= ~cpu_caps_cleared[i];
826 c->x86_capability[i] |= cpu_caps_set[i];
827 }
828
809 /* 829 /*
810 * On SMP, boot_cpu_data holds the common feature set between 830 * On SMP, boot_cpu_data holds the common feature set between
811 * all CPUs; so make sure that we indicate which features are 831 * all CPUs; so make sure that we indicate which features are
@@ -818,10 +838,6 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
818 boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; 838 boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
819 } 839 }
820 840
821 /* Clear all flags overriden by options */
822 for (i = 0; i < NCAPINTS; i++)
823 c->x86_capability[i] &= ~cleared_cpu_caps[i];
824
825#ifdef CONFIG_X86_MCE 841#ifdef CONFIG_X86_MCE
826 /* Init Machine Check Exception if available. */ 842 /* Init Machine Check Exception if available. */
827 mcheck_init(c); 843 mcheck_init(c);
@@ -854,6 +870,7 @@ void __init identify_boot_cpu(void)
854#else 870#else
855 vgetcpu_set_mode(); 871 vgetcpu_set_mode();
856#endif 872#endif
873 init_hw_perf_counters();
857} 874}
858 875
859void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) 876void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c
index 46e29ab96c6a..6b2a52dd0403 100644
--- a/arch/x86/kernel/cpu/cpu_debug.c
+++ b/arch/x86/kernel/cpu/cpu_debug.c
@@ -32,9 +32,7 @@
32 32
33static DEFINE_PER_CPU(struct cpu_cpuX_base, cpu_arr[CPU_REG_ALL_BIT]); 33static DEFINE_PER_CPU(struct cpu_cpuX_base, cpu_arr[CPU_REG_ALL_BIT]);
34static DEFINE_PER_CPU(struct cpu_private *, priv_arr[MAX_CPU_FILES]); 34static DEFINE_PER_CPU(struct cpu_private *, priv_arr[MAX_CPU_FILES]);
35static DEFINE_PER_CPU(unsigned, cpu_modelflag);
36static DEFINE_PER_CPU(int, cpu_priv_count); 35static DEFINE_PER_CPU(int, cpu_priv_count);
37static DEFINE_PER_CPU(unsigned, cpu_model);
38 36
39static DEFINE_MUTEX(cpu_debug_lock); 37static DEFINE_MUTEX(cpu_debug_lock);
40 38
@@ -80,302 +78,102 @@ static struct cpu_file_base cpu_file[] = {
80 { "value", CPU_REG_ALL, 1 }, 78 { "value", CPU_REG_ALL, 1 },
81}; 79};
82 80
83/* Intel Registers Range */ 81/* CPU Registers Range */
84static struct cpu_debug_range cpu_intel_range[] = { 82static struct cpu_debug_range cpu_reg_range[] = {
85 { 0x00000000, 0x00000001, CPU_MC, CPU_INTEL_ALL }, 83 { 0x00000000, 0x00000001, CPU_MC, },
86 { 0x00000006, 0x00000007, CPU_MONITOR, CPU_CX_AT_XE }, 84 { 0x00000006, 0x00000007, CPU_MONITOR, },
87 { 0x00000010, 0x00000010, CPU_TIME, CPU_INTEL_ALL }, 85 { 0x00000010, 0x00000010, CPU_TIME, },
88 { 0x00000011, 0x00000013, CPU_PMC, CPU_INTEL_PENTIUM }, 86 { 0x00000011, 0x00000013, CPU_PMC, },
89 { 0x00000017, 0x00000017, CPU_PLATFORM, CPU_PX_CX_AT_XE }, 87 { 0x00000017, 0x00000017, CPU_PLATFORM, },
90 { 0x0000001B, 0x0000001B, CPU_APIC, CPU_P6_CX_AT_XE }, 88 { 0x0000001B, 0x0000001B, CPU_APIC, },
91 89 { 0x0000002A, 0x0000002B, CPU_POWERON, },
92 { 0x0000002A, 0x0000002A, CPU_POWERON, CPU_PX_CX_AT_XE }, 90 { 0x0000002C, 0x0000002C, CPU_FREQ, },
93 { 0x0000002B, 0x0000002B, CPU_POWERON, CPU_INTEL_XEON }, 91 { 0x0000003A, 0x0000003A, CPU_CONTROL, },
94 { 0x0000002C, 0x0000002C, CPU_FREQ, CPU_INTEL_XEON }, 92 { 0x00000040, 0x00000047, CPU_LBRANCH, },
95 { 0x0000003A, 0x0000003A, CPU_CONTROL, CPU_CX_AT_XE }, 93 { 0x00000060, 0x00000067, CPU_LBRANCH, },
96 94 { 0x00000079, 0x00000079, CPU_BIOS, },
97 { 0x00000040, 0x00000043, CPU_LBRANCH, CPU_PM_CX_AT_XE }, 95 { 0x00000088, 0x0000008A, CPU_CACHE, },
98 { 0x00000044, 0x00000047, CPU_LBRANCH, CPU_PM_CO_AT }, 96 { 0x0000008B, 0x0000008B, CPU_BIOS, },
99 { 0x00000060, 0x00000063, CPU_LBRANCH, CPU_C2_AT }, 97 { 0x0000009B, 0x0000009B, CPU_MONITOR, },
100 { 0x00000064, 0x00000067, CPU_LBRANCH, CPU_INTEL_ATOM }, 98 { 0x000000C1, 0x000000C4, CPU_PMC, },
101 99 { 0x000000CD, 0x000000CD, CPU_FREQ, },
102 { 0x00000079, 0x00000079, CPU_BIOS, CPU_P6_CX_AT_XE }, 100 { 0x000000E7, 0x000000E8, CPU_PERF, },
103 { 0x00000088, 0x0000008A, CPU_CACHE, CPU_INTEL_P6 }, 101 { 0x000000FE, 0x000000FE, CPU_MTRR, },
104 { 0x0000008B, 0x0000008B, CPU_BIOS, CPU_P6_CX_AT_XE }, 102
105 { 0x0000009B, 0x0000009B, CPU_MONITOR, CPU_INTEL_XEON }, 103 { 0x00000116, 0x0000011E, CPU_CACHE, },
106 104 { 0x00000174, 0x00000176, CPU_SYSENTER, },
107 { 0x000000C1, 0x000000C2, CPU_PMC, CPU_P6_CX_AT }, 105 { 0x00000179, 0x0000017B, CPU_MC, },
108 { 0x000000CD, 0x000000CD, CPU_FREQ, CPU_CX_AT }, 106 { 0x00000186, 0x00000189, CPU_PMC, },
109 { 0x000000E7, 0x000000E8, CPU_PERF, CPU_CX_AT }, 107 { 0x00000198, 0x00000199, CPU_PERF, },
110 { 0x000000FE, 0x000000FE, CPU_MTRR, CPU_P6_CX_XE }, 108 { 0x0000019A, 0x0000019A, CPU_TIME, },
111 109 { 0x0000019B, 0x0000019D, CPU_THERM, },
112 { 0x00000116, 0x00000116, CPU_CACHE, CPU_INTEL_P6 }, 110 { 0x000001A0, 0x000001A0, CPU_MISC, },
113 { 0x00000118, 0x00000118, CPU_CACHE, CPU_INTEL_P6 }, 111 { 0x000001C9, 0x000001C9, CPU_LBRANCH, },
114 { 0x00000119, 0x00000119, CPU_CACHE, CPU_INTEL_PX }, 112 { 0x000001D7, 0x000001D8, CPU_LBRANCH, },
115 { 0x0000011A, 0x0000011B, CPU_CACHE, CPU_INTEL_P6 }, 113 { 0x000001D9, 0x000001D9, CPU_DEBUG, },
116 { 0x0000011E, 0x0000011E, CPU_CACHE, CPU_PX_CX_AT }, 114 { 0x000001DA, 0x000001E0, CPU_LBRANCH, },
117 115
118 { 0x00000174, 0x00000176, CPU_SYSENTER, CPU_P6_CX_AT_XE }, 116 { 0x00000200, 0x0000020F, CPU_MTRR, },
119 { 0x00000179, 0x0000017A, CPU_MC, CPU_PX_CX_AT_XE }, 117 { 0x00000250, 0x00000250, CPU_MTRR, },
120 { 0x0000017B, 0x0000017B, CPU_MC, CPU_P6_XE }, 118 { 0x00000258, 0x00000259, CPU_MTRR, },
121 { 0x00000186, 0x00000187, CPU_PMC, CPU_P6_CX_AT }, 119 { 0x00000268, 0x0000026F, CPU_MTRR, },
122 { 0x00000198, 0x00000199, CPU_PERF, CPU_PM_CX_AT_XE }, 120 { 0x00000277, 0x00000277, CPU_PAT, },
123 { 0x0000019A, 0x0000019A, CPU_TIME, CPU_PM_CX_AT_XE }, 121 { 0x000002FF, 0x000002FF, CPU_MTRR, },
124 { 0x0000019B, 0x0000019D, CPU_THERM, CPU_PM_CX_AT_XE }, 122
125 { 0x000001A0, 0x000001A0, CPU_MISC, CPU_PM_CX_AT_XE }, 123 { 0x00000300, 0x00000311, CPU_PMC, },
126 124 { 0x00000345, 0x00000345, CPU_PMC, },
127 { 0x000001C9, 0x000001C9, CPU_LBRANCH, CPU_PM_CX_AT }, 125 { 0x00000360, 0x00000371, CPU_PMC, },
128 { 0x000001D7, 0x000001D8, CPU_LBRANCH, CPU_INTEL_XEON }, 126 { 0x0000038D, 0x00000390, CPU_PMC, },
129 { 0x000001D9, 0x000001D9, CPU_DEBUG, CPU_CX_AT_XE }, 127 { 0x000003A0, 0x000003BE, CPU_PMC, },
130 { 0x000001DA, 0x000001DA, CPU_LBRANCH, CPU_INTEL_XEON }, 128 { 0x000003C0, 0x000003CD, CPU_PMC, },
131 { 0x000001DB, 0x000001DB, CPU_LBRANCH, CPU_P6_XE }, 129 { 0x000003E0, 0x000003E1, CPU_PMC, },
132 { 0x000001DC, 0x000001DC, CPU_LBRANCH, CPU_INTEL_P6 }, 130 { 0x000003F0, 0x000003F2, CPU_PMC, },
133 { 0x000001DD, 0x000001DE, CPU_LBRANCH, CPU_PX_CX_AT_XE }, 131
134 { 0x000001E0, 0x000001E0, CPU_LBRANCH, CPU_INTEL_P6 }, 132 { 0x00000400, 0x00000417, CPU_MC, },
135 133 { 0x00000480, 0x0000048B, CPU_VMX, },
136 { 0x00000200, 0x0000020F, CPU_MTRR, CPU_P6_CX_XE }, 134
137 { 0x00000250, 0x00000250, CPU_MTRR, CPU_P6_CX_XE }, 135 { 0x00000600, 0x00000600, CPU_DEBUG, },
138 { 0x00000258, 0x00000259, CPU_MTRR, CPU_P6_CX_XE }, 136 { 0x00000680, 0x0000068F, CPU_LBRANCH, },
139 { 0x00000268, 0x0000026F, CPU_MTRR, CPU_P6_CX_XE }, 137 { 0x000006C0, 0x000006CF, CPU_LBRANCH, },
140 { 0x00000277, 0x00000277, CPU_PAT, CPU_C2_AT_XE }, 138
141 { 0x000002FF, 0x000002FF, CPU_MTRR, CPU_P6_CX_XE }, 139 { 0x000107CC, 0x000107D3, CPU_PMC, },
142 140
143 { 0x00000300, 0x00000308, CPU_PMC, CPU_INTEL_XEON }, 141 { 0xC0000080, 0xC0000080, CPU_FEATURES, },
144 { 0x00000309, 0x0000030B, CPU_PMC, CPU_C2_AT_XE }, 142 { 0xC0000081, 0xC0000084, CPU_CALL, },
145 { 0x0000030C, 0x00000311, CPU_PMC, CPU_INTEL_XEON }, 143 { 0xC0000100, 0xC0000102, CPU_BASE, },
146 { 0x00000345, 0x00000345, CPU_PMC, CPU_C2_AT }, 144 { 0xC0000103, 0xC0000103, CPU_TIME, },
147 { 0x00000360, 0x00000371, CPU_PMC, CPU_INTEL_XEON }, 145
148 { 0x0000038D, 0x00000390, CPU_PMC, CPU_C2_AT }, 146 { 0xC0010000, 0xC0010007, CPU_PMC, },
149 { 0x000003A0, 0x000003BE, CPU_PMC, CPU_INTEL_XEON }, 147 { 0xC0010010, 0xC0010010, CPU_CONF, },
150 { 0x000003C0, 0x000003CD, CPU_PMC, CPU_INTEL_XEON }, 148 { 0xC0010015, 0xC0010015, CPU_CONF, },
151 { 0x000003E0, 0x000003E1, CPU_PMC, CPU_INTEL_XEON }, 149 { 0xC0010016, 0xC001001A, CPU_MTRR, },
152 { 0x000003F0, 0x000003F0, CPU_PMC, CPU_INTEL_XEON }, 150 { 0xC001001D, 0xC001001D, CPU_MTRR, },
153 { 0x000003F1, 0x000003F1, CPU_PMC, CPU_C2_AT_XE }, 151 { 0xC001001F, 0xC001001F, CPU_CONF, },
154 { 0x000003F2, 0x000003F2, CPU_PMC, CPU_INTEL_XEON }, 152 { 0xC0010030, 0xC0010035, CPU_BIOS, },
155 153 { 0xC0010044, 0xC0010048, CPU_MC, },
156 { 0x00000400, 0x00000402, CPU_MC, CPU_PM_CX_AT_XE }, 154 { 0xC0010050, 0xC0010056, CPU_SMM, },
157 { 0x00000403, 0x00000403, CPU_MC, CPU_INTEL_XEON }, 155 { 0xC0010058, 0xC0010058, CPU_CONF, },
158 { 0x00000404, 0x00000406, CPU_MC, CPU_PM_CX_AT_XE }, 156 { 0xC0010060, 0xC0010060, CPU_CACHE, },
159 { 0x00000407, 0x00000407, CPU_MC, CPU_INTEL_XEON }, 157 { 0xC0010061, 0xC0010068, CPU_SMM, },
160 { 0x00000408, 0x0000040A, CPU_MC, CPU_PM_CX_AT_XE }, 158 { 0xC0010069, 0xC001006B, CPU_SMM, },
161 { 0x0000040B, 0x0000040B, CPU_MC, CPU_INTEL_XEON }, 159 { 0xC0010070, 0xC0010071, CPU_SMM, },
162 { 0x0000040C, 0x0000040E, CPU_MC, CPU_PM_CX_XE }, 160 { 0xC0010111, 0xC0010113, CPU_SMM, },
163 { 0x0000040F, 0x0000040F, CPU_MC, CPU_INTEL_XEON }, 161 { 0xC0010114, 0xC0010118, CPU_SVM, },
164 { 0x00000410, 0x00000412, CPU_MC, CPU_PM_CX_AT_XE }, 162 { 0xC0010140, 0xC0010141, CPU_OSVM, },
165 { 0x00000413, 0x00000417, CPU_MC, CPU_CX_AT_XE }, 163 { 0xC0011022, 0xC0011023, CPU_CONF, },
166 { 0x00000480, 0x0000048B, CPU_VMX, CPU_CX_AT_XE },
167
168 { 0x00000600, 0x00000600, CPU_DEBUG, CPU_PM_CX_AT_XE },
169 { 0x00000680, 0x0000068F, CPU_LBRANCH, CPU_INTEL_XEON },
170 { 0x000006C0, 0x000006CF, CPU_LBRANCH, CPU_INTEL_XEON },
171
172 { 0x000107CC, 0x000107D3, CPU_PMC, CPU_INTEL_XEON_MP },
173
174 { 0xC0000080, 0xC0000080, CPU_FEATURES, CPU_INTEL_XEON },
175 { 0xC0000081, 0xC0000082, CPU_CALL, CPU_INTEL_XEON },
176 { 0xC0000084, 0xC0000084, CPU_CALL, CPU_INTEL_XEON },
177 { 0xC0000100, 0xC0000102, CPU_BASE, CPU_INTEL_XEON },
178}; 164};
179 165
180/* AMD Registers Range */
181static struct cpu_debug_range cpu_amd_range[] = {
182 { 0x00000000, 0x00000001, CPU_MC, CPU_K10_PLUS, },
183 { 0x00000010, 0x00000010, CPU_TIME, CPU_K8_PLUS, },
184 { 0x0000001B, 0x0000001B, CPU_APIC, CPU_K8_PLUS, },
185 { 0x0000002A, 0x0000002A, CPU_POWERON, CPU_K7_PLUS },
186 { 0x0000008B, 0x0000008B, CPU_VER, CPU_K8_PLUS },
187 { 0x000000FE, 0x000000FE, CPU_MTRR, CPU_K8_PLUS, },
188
189 { 0x00000174, 0x00000176, CPU_SYSENTER, CPU_K8_PLUS, },
190 { 0x00000179, 0x0000017B, CPU_MC, CPU_K8_PLUS, },
191 { 0x000001D9, 0x000001D9, CPU_DEBUG, CPU_K8_PLUS, },
192 { 0x000001DB, 0x000001DE, CPU_LBRANCH, CPU_K8_PLUS, },
193
194 { 0x00000200, 0x0000020F, CPU_MTRR, CPU_K8_PLUS, },
195 { 0x00000250, 0x00000250, CPU_MTRR, CPU_K8_PLUS, },
196 { 0x00000258, 0x00000259, CPU_MTRR, CPU_K8_PLUS, },
197 { 0x00000268, 0x0000026F, CPU_MTRR, CPU_K8_PLUS, },
198 { 0x00000277, 0x00000277, CPU_PAT, CPU_K8_PLUS, },
199 { 0x000002FF, 0x000002FF, CPU_MTRR, CPU_K8_PLUS, },
200
201 { 0x00000400, 0x00000413, CPU_MC, CPU_K8_PLUS, },
202
203 { 0xC0000080, 0xC0000080, CPU_FEATURES, CPU_AMD_ALL, },
204 { 0xC0000081, 0xC0000084, CPU_CALL, CPU_K8_PLUS, },
205 { 0xC0000100, 0xC0000102, CPU_BASE, CPU_K8_PLUS, },
206 { 0xC0000103, 0xC0000103, CPU_TIME, CPU_K10_PLUS, },
207
208 { 0xC0010000, 0xC0010007, CPU_PMC, CPU_K8_PLUS, },
209 { 0xC0010010, 0xC0010010, CPU_CONF, CPU_K7_PLUS, },
210 { 0xC0010015, 0xC0010015, CPU_CONF, CPU_K7_PLUS, },
211 { 0xC0010016, 0xC001001A, CPU_MTRR, CPU_K8_PLUS, },
212 { 0xC001001D, 0xC001001D, CPU_MTRR, CPU_K8_PLUS, },
213 { 0xC001001F, 0xC001001F, CPU_CONF, CPU_K8_PLUS, },
214 { 0xC0010030, 0xC0010035, CPU_BIOS, CPU_K8_PLUS, },
215 { 0xC0010044, 0xC0010048, CPU_MC, CPU_K8_PLUS, },
216 { 0xC0010050, 0xC0010056, CPU_SMM, CPU_K0F_PLUS, },
217 { 0xC0010058, 0xC0010058, CPU_CONF, CPU_K10_PLUS, },
218 { 0xC0010060, 0xC0010060, CPU_CACHE, CPU_AMD_11, },
219 { 0xC0010061, 0xC0010068, CPU_SMM, CPU_K10_PLUS, },
220 { 0xC0010069, 0xC001006B, CPU_SMM, CPU_AMD_11, },
221 { 0xC0010070, 0xC0010071, CPU_SMM, CPU_K10_PLUS, },
222 { 0xC0010111, 0xC0010113, CPU_SMM, CPU_K8_PLUS, },
223 { 0xC0010114, 0xC0010118, CPU_SVM, CPU_K10_PLUS, },
224 { 0xC0010140, 0xC0010141, CPU_OSVM, CPU_K10_PLUS, },
225 { 0xC0011022, 0xC0011023, CPU_CONF, CPU_K10_PLUS, },
226};
227
228
229/* Intel */
230static int get_intel_modelflag(unsigned model)
231{
232 int flag;
233
234 switch (model) {
235 case 0x0501:
236 case 0x0502:
237 case 0x0504:
238 flag = CPU_INTEL_PENTIUM;
239 break;
240 case 0x0601:
241 case 0x0603:
242 case 0x0605:
243 case 0x0607:
244 case 0x0608:
245 case 0x060A:
246 case 0x060B:
247 flag = CPU_INTEL_P6;
248 break;
249 case 0x0609:
250 case 0x060D:
251 flag = CPU_INTEL_PENTIUM_M;
252 break;
253 case 0x060E:
254 flag = CPU_INTEL_CORE;
255 break;
256 case 0x060F:
257 case 0x0617:
258 flag = CPU_INTEL_CORE2;
259 break;
260 case 0x061C:
261 flag = CPU_INTEL_ATOM;
262 break;
263 case 0x0F00:
264 case 0x0F01:
265 case 0x0F02:
266 case 0x0F03:
267 case 0x0F04:
268 flag = CPU_INTEL_XEON_P4;
269 break;
270 case 0x0F06:
271 flag = CPU_INTEL_XEON_MP;
272 break;
273 default:
274 flag = CPU_NONE;
275 break;
276 }
277
278 return flag;
279}
280
281/* AMD */
282static int get_amd_modelflag(unsigned model)
283{
284 int flag;
285
286 switch (model >> 8) {
287 case 0x6:
288 flag = CPU_AMD_K6;
289 break;
290 case 0x7:
291 flag = CPU_AMD_K7;
292 break;
293 case 0x8:
294 flag = CPU_AMD_K8;
295 break;
296 case 0xf:
297 flag = CPU_AMD_0F;
298 break;
299 case 0x10:
300 flag = CPU_AMD_10;
301 break;
302 case 0x11:
303 flag = CPU_AMD_11;
304 break;
305 default:
306 flag = CPU_NONE;
307 break;
308 }
309
310 return flag;
311}
312
313static int get_cpu_modelflag(unsigned cpu)
314{
315 int flag;
316
317 flag = per_cpu(cpu_model, cpu);
318
319 switch (flag >> 16) {
320 case X86_VENDOR_INTEL:
321 flag = get_intel_modelflag(flag);
322 break;
323 case X86_VENDOR_AMD:
324 flag = get_amd_modelflag(flag & 0xffff);
325 break;
326 default:
327 flag = CPU_NONE;
328 break;
329 }
330
331 return flag;
332}
333
334static int get_cpu_range_count(unsigned cpu)
335{
336 int index;
337
338 switch (per_cpu(cpu_model, cpu) >> 16) {
339 case X86_VENDOR_INTEL:
340 index = ARRAY_SIZE(cpu_intel_range);
341 break;
342 case X86_VENDOR_AMD:
343 index = ARRAY_SIZE(cpu_amd_range);
344 break;
345 default:
346 index = 0;
347 break;
348 }
349
350 return index;
351}
352
353static int is_typeflag_valid(unsigned cpu, unsigned flag) 166static int is_typeflag_valid(unsigned cpu, unsigned flag)
354{ 167{
355 unsigned vendor, modelflag; 168 int i;
356 int i, index;
357 169
358 /* Standard Registers should be always valid */ 170 /* Standard Registers should be always valid */
359 if (flag >= CPU_TSS) 171 if (flag >= CPU_TSS)
360 return 1; 172 return 1;
361 173
362 modelflag = per_cpu(cpu_modelflag, cpu); 174 for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) {
363 vendor = per_cpu(cpu_model, cpu) >> 16; 175 if (cpu_reg_range[i].flag == flag)
364 index = get_cpu_range_count(cpu); 176 return 1;
365
366 for (i = 0; i < index; i++) {
367 switch (vendor) {
368 case X86_VENDOR_INTEL:
369 if ((cpu_intel_range[i].model & modelflag) &&
370 (cpu_intel_range[i].flag & flag))
371 return 1;
372 break;
373 case X86_VENDOR_AMD:
374 if ((cpu_amd_range[i].model & modelflag) &&
375 (cpu_amd_range[i].flag & flag))
376 return 1;
377 break;
378 }
379 } 177 }
380 178
381 /* Invalid */ 179 /* Invalid */
@@ -385,26 +183,11 @@ static int is_typeflag_valid(unsigned cpu, unsigned flag)
385static unsigned get_cpu_range(unsigned cpu, unsigned *min, unsigned *max, 183static unsigned get_cpu_range(unsigned cpu, unsigned *min, unsigned *max,
386 int index, unsigned flag) 184 int index, unsigned flag)
387{ 185{
388 unsigned modelflag; 186 if (cpu_reg_range[index].flag == flag) {
389 187 *min = cpu_reg_range[index].min;
390 modelflag = per_cpu(cpu_modelflag, cpu); 188 *max = cpu_reg_range[index].max;
391 *max = 0; 189 } else
392 switch (per_cpu(cpu_model, cpu) >> 16) { 190 *max = 0;
393 case X86_VENDOR_INTEL:
394 if ((cpu_intel_range[index].model & modelflag) &&
395 (cpu_intel_range[index].flag & flag)) {
396 *min = cpu_intel_range[index].min;
397 *max = cpu_intel_range[index].max;
398 }
399 break;
400 case X86_VENDOR_AMD:
401 if ((cpu_amd_range[index].model & modelflag) &&
402 (cpu_amd_range[index].flag & flag)) {
403 *min = cpu_amd_range[index].min;
404 *max = cpu_amd_range[index].max;
405 }
406 break;
407 }
408 191
409 return *max; 192 return *max;
410} 193}
@@ -434,7 +217,7 @@ static void print_msr(struct seq_file *seq, unsigned cpu, unsigned flag)
434 unsigned msr, msr_min, msr_max; 217 unsigned msr, msr_min, msr_max;
435 struct cpu_private *priv; 218 struct cpu_private *priv;
436 u32 low, high; 219 u32 low, high;
437 int i, range; 220 int i;
438 221
439 if (seq) { 222 if (seq) {
440 priv = seq->private; 223 priv = seq->private;
@@ -446,9 +229,7 @@ static void print_msr(struct seq_file *seq, unsigned cpu, unsigned flag)
446 } 229 }
447 } 230 }
448 231
449 range = get_cpu_range_count(cpu); 232 for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) {
450
451 for (i = 0; i < range; i++) {
452 if (!get_cpu_range(cpu, &msr_min, &msr_max, i, flag)) 233 if (!get_cpu_range(cpu, &msr_min, &msr_max, i, flag))
453 continue; 234 continue;
454 235
@@ -588,8 +369,20 @@ static void print_apic(void *arg)
588 seq_printf(seq, " TMICT\t\t: %08x\n", apic_read(APIC_TMICT)); 369 seq_printf(seq, " TMICT\t\t: %08x\n", apic_read(APIC_TMICT));
589 seq_printf(seq, " TMCCT\t\t: %08x\n", apic_read(APIC_TMCCT)); 370 seq_printf(seq, " TMCCT\t\t: %08x\n", apic_read(APIC_TMCCT));
590 seq_printf(seq, " TDCR\t\t: %08x\n", apic_read(APIC_TDCR)); 371 seq_printf(seq, " TDCR\t\t: %08x\n", apic_read(APIC_TDCR));
591#endif /* CONFIG_X86_LOCAL_APIC */ 372 if (boot_cpu_has(X86_FEATURE_EXTAPIC)) {
373 unsigned int i, v, maxeilvt;
374
375 v = apic_read(APIC_EFEAT);
376 maxeilvt = (v >> 16) & 0xff;
377 seq_printf(seq, " EFEAT\t\t: %08x\n", v);
378 seq_printf(seq, " ECTRL\t\t: %08x\n", apic_read(APIC_ECTRL));
592 379
380 for (i = 0; i < maxeilvt; i++) {
381 v = apic_read(APIC_EILVTn(i));
382 seq_printf(seq, " EILVT%d\t\t: %08x\n", i, v);
383 }
384 }
385#endif /* CONFIG_X86_LOCAL_APIC */
593 seq_printf(seq, "\n MSR\t:\n"); 386 seq_printf(seq, "\n MSR\t:\n");
594} 387}
595 388
@@ -788,13 +581,11 @@ static int cpu_init_msr(unsigned cpu, unsigned type, struct dentry *dentry)
788{ 581{
789 struct dentry *cpu_dentry = NULL; 582 struct dentry *cpu_dentry = NULL;
790 unsigned reg, reg_min, reg_max; 583 unsigned reg, reg_min, reg_max;
791 int i, range, err = 0; 584 int i, err = 0;
792 char reg_dir[12]; 585 char reg_dir[12];
793 u32 low, high; 586 u32 low, high;
794 587
795 range = get_cpu_range_count(cpu); 588 for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) {
796
797 for (i = 0; i < range; i++) {
798 if (!get_cpu_range(cpu, &reg_min, &reg_max, i, 589 if (!get_cpu_range(cpu, &reg_min, &reg_max, i,
799 cpu_base[type].flag)) 590 cpu_base[type].flag))
800 continue; 591 continue;
@@ -850,10 +641,6 @@ static int cpu_init_cpu(void)
850 cpui = &cpu_data(cpu); 641 cpui = &cpu_data(cpu);
851 if (!cpu_has(cpui, X86_FEATURE_MSR)) 642 if (!cpu_has(cpui, X86_FEATURE_MSR))
852 continue; 643 continue;
853 per_cpu(cpu_model, cpu) = ((cpui->x86_vendor << 16) |
854 (cpui->x86 << 8) |
855 (cpui->x86_model));
856 per_cpu(cpu_modelflag, cpu) = get_cpu_modelflag(cpu);
857 644
858 sprintf(cpu_dir, "cpu%d", cpu); 645 sprintf(cpu_dir, "cpu%d", cpu);
859 cpu_dentry = debugfs_create_dir(cpu_dir, cpu_debugfs_dir); 646 cpu_dentry = debugfs_create_dir(cpu_dir, cpu_debugfs_dir);
diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig
index 52c839875478..f138c6c389b9 100644
--- a/arch/x86/kernel/cpu/cpufreq/Kconfig
+++ b/arch/x86/kernel/cpu/cpufreq/Kconfig
@@ -220,11 +220,14 @@ config X86_LONGHAUL
220 If in doubt, say N. 220 If in doubt, say N.
221 221
222config X86_E_POWERSAVER 222config X86_E_POWERSAVER
223 tristate "VIA C7 Enhanced PowerSaver" 223 tristate "VIA C7 Enhanced PowerSaver (DANGEROUS)"
224 select CPU_FREQ_TABLE 224 select CPU_FREQ_TABLE
225 depends on X86_32 225 depends on X86_32 && EXPERIMENTAL
226 help 226 help
227 This adds the CPUFreq driver for VIA C7 processors. 227 This adds the CPUFreq driver for VIA C7 processors. However, this driver
228 does not have any safeguards to prevent operating the CPU out of spec
229 and is thus considered dangerous. Please use the regular ACPI cpufreq
230 driver, enabled by CONFIG_X86_ACPI_CPUFREQ.
228 231
229 If in doubt, say N. 232 If in doubt, say N.
230 233
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 208ecf6643df..ae9b503220ca 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -90,11 +90,7 @@ static int check_est_cpu(unsigned int cpuid)
90{ 90{
91 struct cpuinfo_x86 *cpu = &cpu_data(cpuid); 91 struct cpuinfo_x86 *cpu = &cpu_data(cpuid);
92 92
93 if (cpu->x86_vendor != X86_VENDOR_INTEL || 93 return cpu_has(cpu, X86_FEATURE_EST);
94 !cpu_has(cpu, X86_FEATURE_EST))
95 return 0;
96
97 return 1;
98} 94}
99 95
100static unsigned extract_io(u32 value, struct acpi_cpufreq_data *data) 96static unsigned extract_io(u32 value, struct acpi_cpufreq_data *data)
@@ -550,7 +546,7 @@ static int __init acpi_cpufreq_early_init(void)
550 return -ENOMEM; 546 return -ENOMEM;
551 } 547 }
552 for_each_possible_cpu(i) { 548 for_each_possible_cpu(i) {
553 if (!alloc_cpumask_var_node( 549 if (!zalloc_cpumask_var_node(
554 &per_cpu_ptr(acpi_perf_data, i)->shared_cpu_map, 550 &per_cpu_ptr(acpi_perf_data, i)->shared_cpu_map,
555 GFP_KERNEL, cpu_to_node(i))) { 551 GFP_KERNEL, cpu_to_node(i))) {
556 552
@@ -693,8 +689,8 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
693 if (perf->control_register.space_id == ACPI_ADR_SPACE_FIXED_HARDWARE && 689 if (perf->control_register.space_id == ACPI_ADR_SPACE_FIXED_HARDWARE &&
694 policy->cpuinfo.transition_latency > 20 * 1000) { 690 policy->cpuinfo.transition_latency > 20 * 1000) {
695 policy->cpuinfo.transition_latency = 20 * 1000; 691 policy->cpuinfo.transition_latency = 20 * 1000;
696 printk_once(KERN_INFO "Capping off P-state tranision" 692 printk_once(KERN_INFO
697 " latency at 20 uS\n"); 693 "P-state transition latency capped at 20 uS\n");
698 } 694 }
699 695
700 /* table init */ 696 /* table init */
diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
index 6ac55bd341ae..869615193720 100644
--- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
+++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
@@ -168,6 +168,7 @@ static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c)
168 case 0x0E: /* Core */ 168 case 0x0E: /* Core */
169 case 0x0F: /* Core Duo */ 169 case 0x0F: /* Core Duo */
170 case 0x16: /* Celeron Core */ 170 case 0x16: /* Celeron Core */
171 case 0x1C: /* Atom */
171 p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS; 172 p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS;
172 return speedstep_get_frequency(SPEEDSTEP_CPU_PCORE); 173 return speedstep_get_frequency(SPEEDSTEP_CPU_PCORE);
173 case 0x0D: /* Pentium M (Dothan) */ 174 case 0x0D: /* Pentium M (Dothan) */
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
index 3c28ccd49742..d47c775eb0ab 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
@@ -168,10 +168,12 @@ static int check_powernow(void)
168 return 1; 168 return 1;
169} 169}
170 170
171#ifdef CONFIG_X86_POWERNOW_K7_ACPI
171static void invalidate_entry(unsigned int entry) 172static void invalidate_entry(unsigned int entry)
172{ 173{
173 powernow_table[entry].frequency = CPUFREQ_ENTRY_INVALID; 174 powernow_table[entry].frequency = CPUFREQ_ENTRY_INVALID;
174} 175}
176#endif
175 177
176static int get_ranges(unsigned char *pst) 178static int get_ranges(unsigned char *pst)
177{ 179{
@@ -320,7 +322,7 @@ static int powernow_acpi_init(void)
320 goto err0; 322 goto err0;
321 } 323 }
322 324
323 if (!alloc_cpumask_var(&acpi_processor_perf->shared_cpu_map, 325 if (!zalloc_cpumask_var(&acpi_processor_perf->shared_cpu_map,
324 GFP_KERNEL)) { 326 GFP_KERNEL)) {
325 retval = -ENOMEM; 327 retval = -ENOMEM;
326 goto err05; 328 goto err05;
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 4709ead2db52..cf52215d9eb1 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -649,6 +649,20 @@ static void print_basics(struct powernow_k8_data *data)
649 data->batps); 649 data->batps);
650} 650}
651 651
652static u32 freq_from_fid_did(u32 fid, u32 did)
653{
654 u32 mhz = 0;
655
656 if (boot_cpu_data.x86 == 0x10)
657 mhz = (100 * (fid + 0x10)) >> did;
658 else if (boot_cpu_data.x86 == 0x11)
659 mhz = (100 * (fid + 8)) >> did;
660 else
661 BUG();
662
663 return mhz * 1000;
664}
665
652static int fill_powernow_table(struct powernow_k8_data *data, 666static int fill_powernow_table(struct powernow_k8_data *data,
653 struct pst_s *pst, u8 maxvid) 667 struct pst_s *pst, u8 maxvid)
654{ 668{
@@ -821,7 +835,7 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
821{ 835{
822 struct cpufreq_frequency_table *powernow_table; 836 struct cpufreq_frequency_table *powernow_table;
823 int ret_val = -ENODEV; 837 int ret_val = -ENODEV;
824 acpi_integer space_id; 838 acpi_integer control, status;
825 839
826 if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) { 840 if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) {
827 dprintk("register performance failed: bad ACPI data\n"); 841 dprintk("register performance failed: bad ACPI data\n");
@@ -834,12 +848,13 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
834 goto err_out; 848 goto err_out;
835 } 849 }
836 850
837 space_id = data->acpi_data.control_register.space_id; 851 control = data->acpi_data.control_register.space_id;
838 if ((space_id != ACPI_ADR_SPACE_FIXED_HARDWARE) || 852 status = data->acpi_data.status_register.space_id;
839 (space_id != ACPI_ADR_SPACE_FIXED_HARDWARE)) { 853
854 if ((control != ACPI_ADR_SPACE_FIXED_HARDWARE) ||
855 (status != ACPI_ADR_SPACE_FIXED_HARDWARE)) {
840 dprintk("Invalid control/status registers (%x - %x)\n", 856 dprintk("Invalid control/status registers (%x - %x)\n",
841 data->acpi_data.control_register.space_id, 857 control, status);
842 space_id);
843 goto err_out; 858 goto err_out;
844 } 859 }
845 860
@@ -872,7 +887,7 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
872 /* notify BIOS that we exist */ 887 /* notify BIOS that we exist */
873 acpi_processor_notify_smm(THIS_MODULE); 888 acpi_processor_notify_smm(THIS_MODULE);
874 889
875 if (!alloc_cpumask_var(&data->acpi_data.shared_cpu_map, GFP_KERNEL)) { 890 if (!zalloc_cpumask_var(&data->acpi_data.shared_cpu_map, GFP_KERNEL)) {
876 printk(KERN_ERR PFX 891 printk(KERN_ERR PFX
877 "unable to alloc powernow_k8_data cpumask\n"); 892 "unable to alloc powernow_k8_data cpumask\n");
878 ret_val = -ENOMEM; 893 ret_val = -ENOMEM;
@@ -923,8 +938,13 @@ static int fill_powernow_table_pstate(struct powernow_k8_data *data,
923 938
924 powernow_table[i].index = index; 939 powernow_table[i].index = index;
925 940
926 powernow_table[i].frequency = 941 /* Frequency may be rounded for these */
927 data->acpi_data.states[i].core_frequency * 1000; 942 if (boot_cpu_data.x86 == 0x10 || boot_cpu_data.x86 == 0x11) {
943 powernow_table[i].frequency =
944 freq_from_fid_did(lo & 0x3f, (lo >> 6) & 7);
945 } else
946 powernow_table[i].frequency =
947 data->acpi_data.states[i].core_frequency * 1000;
928 } 948 }
929 return 0; 949 return 0;
930} 950}
@@ -1215,13 +1235,16 @@ static int powernowk8_verify(struct cpufreq_policy *pol)
1215 return cpufreq_frequency_table_verify(pol, data->powernow_table); 1235 return cpufreq_frequency_table_verify(pol, data->powernow_table);
1216} 1236}
1217 1237
1238static const char ACPI_PSS_BIOS_BUG_MSG[] =
1239 KERN_ERR FW_BUG PFX "No compatible ACPI _PSS objects found.\n"
1240 KERN_ERR FW_BUG PFX "Try again with latest BIOS.\n";
1241
1218/* per CPU init entry point to the driver */ 1242/* per CPU init entry point to the driver */
1219static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) 1243static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1220{ 1244{
1221 struct powernow_k8_data *data; 1245 struct powernow_k8_data *data;
1222 cpumask_t oldmask; 1246 cpumask_t oldmask;
1223 int rc; 1247 int rc;
1224 static int print_once;
1225 1248
1226 if (!cpu_online(pol->cpu)) 1249 if (!cpu_online(pol->cpu))
1227 return -ENODEV; 1250 return -ENODEV;
@@ -1244,19 +1267,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1244 * an UP version, and is deprecated by AMD. 1267 * an UP version, and is deprecated by AMD.
1245 */ 1268 */
1246 if (num_online_cpus() != 1) { 1269 if (num_online_cpus() != 1) {
1247 /* 1270 printk_once(ACPI_PSS_BIOS_BUG_MSG);
1248 * Replace this one with print_once as soon as such a
1249 * thing gets introduced
1250 */
1251 if (!print_once) {
1252 WARN_ONCE(1, KERN_ERR FW_BUG PFX "Your BIOS "
1253 "does not provide ACPI _PSS objects "
1254 "in a way that Linux understands. "
1255 "Please report this to the Linux ACPI"
1256 " maintainers and complain to your "
1257 "BIOS vendor.\n");
1258 print_once++;
1259 }
1260 goto err_out; 1271 goto err_out;
1261 } 1272 }
1262 if (pol->cpu != 0) { 1273 if (pol->cpu != 0) {
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
index c9f1fdc02830..55c831ed71ce 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
@@ -471,7 +471,7 @@ static int centrino_target (struct cpufreq_policy *policy,
471 471
472 if (unlikely(!alloc_cpumask_var(&saved_mask, GFP_KERNEL))) 472 if (unlikely(!alloc_cpumask_var(&saved_mask, GFP_KERNEL)))
473 return -ENOMEM; 473 return -ENOMEM;
474 if (unlikely(!alloc_cpumask_var(&covered_cpus, GFP_KERNEL))) { 474 if (unlikely(!zalloc_cpumask_var(&covered_cpus, GFP_KERNEL))) {
475 free_cpumask_var(saved_mask); 475 free_cpumask_var(saved_mask);
476 return -ENOMEM; 476 return -ENOMEM;
477 } 477 }
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 7437fa133c02..3260ab044996 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -86,6 +86,29 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
86 */ 86 */
87 if (c->x86 == 6 && c->x86_model < 15) 87 if (c->x86 == 6 && c->x86_model < 15)
88 clear_cpu_cap(c, X86_FEATURE_PAT); 88 clear_cpu_cap(c, X86_FEATURE_PAT);
89
90#ifdef CONFIG_KMEMCHECK
91 /*
92 * P4s have a "fast strings" feature which causes single-
93 * stepping REP instructions to only generate a #DB on
94 * cache-line boundaries.
95 *
96 * Ingo Molnar reported a Pentium D (model 6) and a Xeon
97 * (model 2) with the same problem.
98 */
99 if (c->x86 == 15) {
100 u64 misc_enable;
101
102 rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
103
104 if (misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING) {
105 printk(KERN_INFO "kmemcheck: Disabling fast string operations\n");
106
107 misc_enable &= ~MSR_IA32_MISC_ENABLE_FAST_STRING;
108 wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
109 }
110 }
111#endif
89} 112}
90 113
91#ifdef CONFIG_X86_32 114#ifdef CONFIG_X86_32
@@ -229,12 +252,12 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
229} 252}
230#endif 253#endif
231 254
232static void __cpuinit srat_detect_node(void) 255static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
233{ 256{
234#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) 257#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
235 unsigned node; 258 unsigned node;
236 int cpu = smp_processor_id(); 259 int cpu = smp_processor_id();
237 int apicid = hard_smp_processor_id(); 260 int apicid = cpu_has_apic ? hard_smp_processor_id() : c->apicid;
238 261
239 /* Don't do the funky fallback heuristics the AMD version employs 262 /* Don't do the funky fallback heuristics the AMD version employs
240 for now. */ 263 for now. */
@@ -400,7 +423,7 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
400 } 423 }
401 424
402 /* Work around errata */ 425 /* Work around errata */
403 srat_detect_node(); 426 srat_detect_node(c);
404 427
405 if (cpu_has(c, X86_FEATURE_VMX)) 428 if (cpu_has(c, X86_FEATURE_VMX))
406 detect_vmx_virtcap(c); 429 detect_vmx_virtcap(c);
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 483eda96e102..789efe217e1a 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -17,6 +17,7 @@
17 17
18#include <asm/processor.h> 18#include <asm/processor.h>
19#include <asm/smp.h> 19#include <asm/smp.h>
20#include <asm/k8.h>
20 21
21#define LVL_1_INST 1 22#define LVL_1_INST 1
22#define LVL_1_DATA 2 23#define LVL_1_DATA 2
@@ -159,14 +160,6 @@ struct _cpuid4_info_regs {
159 unsigned long can_disable; 160 unsigned long can_disable;
160}; 161};
161 162
162#if defined(CONFIG_PCI) && defined(CONFIG_SYSFS)
163static struct pci_device_id k8_nb_id[] = {
164 { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) },
165 { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) },
166 {}
167};
168#endif
169
170unsigned short num_cache_leaves; 163unsigned short num_cache_leaves;
171 164
172/* AMD doesn't have CPUID4. Emulate it here to report the same 165/* AMD doesn't have CPUID4. Emulate it here to report the same
@@ -207,10 +200,17 @@ union l3_cache {
207}; 200};
208 201
209static const unsigned short __cpuinitconst assocs[] = { 202static const unsigned short __cpuinitconst assocs[] = {
210 [1] = 1, [2] = 2, [4] = 4, [6] = 8, 203 [1] = 1,
211 [8] = 16, [0xa] = 32, [0xb] = 48, 204 [2] = 2,
205 [4] = 4,
206 [6] = 8,
207 [8] = 16,
208 [0xa] = 32,
209 [0xb] = 48,
212 [0xc] = 64, 210 [0xc] = 64,
213 [0xf] = 0xffff // ?? 211 [0xd] = 96,
212 [0xe] = 128,
213 [0xf] = 0xffff /* fully associative - no way to show this currently */
214}; 214};
215 215
216static const unsigned char __cpuinitconst levels[] = { 1, 1, 2, 3 }; 216static const unsigned char __cpuinitconst levels[] = { 1, 1, 2, 3 };
@@ -271,7 +271,8 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
271 eax->split.type = types[leaf]; 271 eax->split.type = types[leaf];
272 eax->split.level = levels[leaf]; 272 eax->split.level = levels[leaf];
273 if (leaf == 3) 273 if (leaf == 3)
274 eax->split.num_threads_sharing = current_cpu_data.x86_max_cores - 1; 274 eax->split.num_threads_sharing =
275 current_cpu_data.x86_max_cores - 1;
275 else 276 else
276 eax->split.num_threads_sharing = 0; 277 eax->split.num_threads_sharing = 0;
277 eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1; 278 eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1;
@@ -291,6 +292,14 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
291{ 292{
292 if (index < 3) 293 if (index < 3)
293 return; 294 return;
295
296 if (boot_cpu_data.x86 == 0x11)
297 return;
298
299 /* see erratum #382 */
300 if ((boot_cpu_data.x86 == 0x10) && (boot_cpu_data.x86_model < 0x8))
301 return;
302
294 this_leaf->can_disable = 1; 303 this_leaf->can_disable = 1;
295} 304}
296 305
@@ -696,97 +705,75 @@ static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf)
696#define to_object(k) container_of(k, struct _index_kobject, kobj) 705#define to_object(k) container_of(k, struct _index_kobject, kobj)
697#define to_attr(a) container_of(a, struct _cache_attr, attr) 706#define to_attr(a) container_of(a, struct _cache_attr, attr)
698 707
699#ifdef CONFIG_PCI 708static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
700static struct pci_dev *get_k8_northbridge(int node) 709 unsigned int index)
701{
702 struct pci_dev *dev = NULL;
703 int i;
704
705 for (i = 0; i <= node; i++) {
706 do {
707 dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev);
708 if (!dev)
709 break;
710 } while (!pci_match_id(&k8_nb_id[0], dev));
711 if (!dev)
712 break;
713 }
714 return dev;
715}
716#else
717static struct pci_dev *get_k8_northbridge(int node)
718{
719 return NULL;
720}
721#endif
722
723static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf)
724{ 710{
725 const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map); 711 int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
726 int node = cpu_to_node(cpumask_first(mask)); 712 int node = cpu_to_node(cpu);
727 struct pci_dev *dev = NULL; 713 struct pci_dev *dev = node_to_k8_nb_misc(node);
728 ssize_t ret = 0; 714 unsigned int reg = 0;
729 int i;
730 715
731 if (!this_leaf->can_disable) 716 if (!this_leaf->can_disable)
732 return sprintf(buf, "Feature not enabled\n");
733
734 dev = get_k8_northbridge(node);
735 if (!dev) {
736 printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n");
737 return -EINVAL; 717 return -EINVAL;
738 }
739 718
740 for (i = 0; i < 2; i++) { 719 if (!dev)
741 unsigned int reg; 720 return -EINVAL;
742 721
743 pci_read_config_dword(dev, 0x1BC + i * 4, &reg); 722 pci_read_config_dword(dev, 0x1BC + index * 4, &reg);
723 return sprintf(buf, "%x\n", reg);
724}
744 725
745 ret += sprintf(buf, "%sEntry: %d\n", buf, i); 726#define SHOW_CACHE_DISABLE(index) \
746 ret += sprintf(buf, "%sReads: %s\tNew Entries: %s\n", 727static ssize_t \
747 buf, 728show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf) \
748 reg & 0x80000000 ? "Disabled" : "Allowed", 729{ \
749 reg & 0x40000000 ? "Disabled" : "Allowed"); 730 return show_cache_disable(this_leaf, buf, index); \
750 ret += sprintf(buf, "%sSubCache: %x\tIndex: %x\n",
751 buf, (reg & 0x30000) >> 16, reg & 0xfff);
752 }
753 return ret;
754} 731}
732SHOW_CACHE_DISABLE(0)
733SHOW_CACHE_DISABLE(1)
755 734
756static ssize_t 735static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
757store_cache_disable(struct _cpuid4_info *this_leaf, const char *buf, 736 const char *buf, size_t count, unsigned int index)
758 size_t count)
759{ 737{
760 const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map); 738 int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
761 int node = cpu_to_node(cpumask_first(mask)); 739 int node = cpu_to_node(cpu);
762 struct pci_dev *dev = NULL; 740 struct pci_dev *dev = node_to_k8_nb_misc(node);
763 unsigned int ret, index, val; 741 unsigned long val = 0;
742 unsigned int scrubber = 0;
764 743
765 if (!this_leaf->can_disable) 744 if (!this_leaf->can_disable)
766 return 0;
767
768 if (strlen(buf) > 15)
769 return -EINVAL; 745 return -EINVAL;
770 746
771 ret = sscanf(buf, "%x %x", &index, &val); 747 if (!capable(CAP_SYS_ADMIN))
772 if (ret != 2) 748 return -EPERM;
749
750 if (!dev)
773 return -EINVAL; 751 return -EINVAL;
774 if (index > 1) 752
753 if (strict_strtoul(buf, 10, &val) < 0)
775 return -EINVAL; 754 return -EINVAL;
776 755
777 val |= 0xc0000000; 756 val |= 0xc0000000;
778 dev = get_k8_northbridge(node); 757
779 if (!dev) { 758 pci_read_config_dword(dev, 0x58, &scrubber);
780 printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n"); 759 scrubber &= ~0x1f000000;
781 return -EINVAL; 760 pci_write_config_dword(dev, 0x58, scrubber);
782 }
783 761
784 pci_write_config_dword(dev, 0x1BC + index * 4, val & ~0x40000000); 762 pci_write_config_dword(dev, 0x1BC + index * 4, val & ~0x40000000);
785 wbinvd(); 763 wbinvd();
786 pci_write_config_dword(dev, 0x1BC + index * 4, val); 764 pci_write_config_dword(dev, 0x1BC + index * 4, val);
765 return count;
766}
787 767
788 return 1; 768#define STORE_CACHE_DISABLE(index) \
769static ssize_t \
770store_cache_disable_##index(struct _cpuid4_info *this_leaf, \
771 const char *buf, size_t count) \
772{ \
773 return store_cache_disable(this_leaf, buf, count, index); \
789} 774}
775STORE_CACHE_DISABLE(0)
776STORE_CACHE_DISABLE(1)
790 777
791struct _cache_attr { 778struct _cache_attr {
792 struct attribute attr; 779 struct attribute attr;
@@ -808,7 +795,10 @@ define_one_ro(size);
808define_one_ro(shared_cpu_map); 795define_one_ro(shared_cpu_map);
809define_one_ro(shared_cpu_list); 796define_one_ro(shared_cpu_list);
810 797
811static struct _cache_attr cache_disable = __ATTR(cache_disable, 0644, show_cache_disable, store_cache_disable); 798static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644,
799 show_cache_disable_0, store_cache_disable_0);
800static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644,
801 show_cache_disable_1, store_cache_disable_1);
812 802
813static struct attribute * default_attrs[] = { 803static struct attribute * default_attrs[] = {
814 &type.attr, 804 &type.attr,
@@ -820,7 +810,8 @@ static struct attribute * default_attrs[] = {
820 &size.attr, 810 &size.attr,
821 &shared_cpu_map.attr, 811 &shared_cpu_map.attr,
822 &shared_cpu_list.attr, 812 &shared_cpu_list.attr,
823 &cache_disable.attr, 813 &cache_disable_0.attr,
814 &cache_disable_1.attr,
824 NULL 815 NULL
825}; 816};
826 817
diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile
index b2f89829bbe8..45004faf67ea 100644
--- a/arch/x86/kernel/cpu/mcheck/Makefile
+++ b/arch/x86/kernel/cpu/mcheck/Makefile
@@ -1,7 +1,11 @@
1obj-y = mce_$(BITS).o therm_throt.o 1obj-y = mce.o therm_throt.o
2 2
3obj-$(CONFIG_X86_32) += k7.o p4.o p5.o p6.o winchip.o 3obj-$(CONFIG_X86_NEW_MCE) += mce-severity.o
4obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o 4obj-$(CONFIG_X86_OLD_MCE) += k7.o p4.o p6.o
5obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o
6obj-$(CONFIG_X86_MCE_P4THERMAL) += mce_intel.o
7obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o mce_intel.o
5obj-$(CONFIG_X86_MCE_AMD) += mce_amd_64.o 8obj-$(CONFIG_X86_MCE_AMD) += mce_amd_64.o
6obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o 9obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o
7obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o 10obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
11obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o
diff --git a/arch/x86/kernel/cpu/mcheck/k7.c b/arch/x86/kernel/cpu/mcheck/k7.c
index dd3af6e7b39a..89e510424152 100644
--- a/arch/x86/kernel/cpu/mcheck/k7.c
+++ b/arch/x86/kernel/cpu/mcheck/k7.c
@@ -2,11 +2,10 @@
2 * Athlon specific Machine Check Exception Reporting 2 * Athlon specific Machine Check Exception Reporting
3 * (C) Copyright 2002 Dave Jones <davej@redhat.com> 3 * (C) Copyright 2002 Dave Jones <davej@redhat.com>
4 */ 4 */
5
6#include <linux/init.h>
7#include <linux/types.h>
8#include <linux/kernel.h>
9#include <linux/interrupt.h> 5#include <linux/interrupt.h>
6#include <linux/kernel.h>
7#include <linux/types.h>
8#include <linux/init.h>
10#include <linux/smp.h> 9#include <linux/smp.h>
11 10
12#include <asm/processor.h> 11#include <asm/processor.h>
@@ -15,12 +14,12 @@
15 14
16#include "mce.h" 15#include "mce.h"
17 16
18/* Machine Check Handler For AMD Athlon/Duron */ 17/* Machine Check Handler For AMD Athlon/Duron: */
19static void k7_machine_check(struct pt_regs *regs, long error_code) 18static void k7_machine_check(struct pt_regs *regs, long error_code)
20{ 19{
21 int recover = 1;
22 u32 alow, ahigh, high, low; 20 u32 alow, ahigh, high, low;
23 u32 mcgstl, mcgsth; 21 u32 mcgstl, mcgsth;
22 int recover = 1;
24 int i; 23 int i;
25 24
26 rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); 25 rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
@@ -32,15 +31,19 @@ static void k7_machine_check(struct pt_regs *regs, long error_code)
32 31
33 for (i = 1; i < nr_mce_banks; i++) { 32 for (i = 1; i < nr_mce_banks; i++) {
34 rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); 33 rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high);
35 if (high&(1<<31)) { 34 if (high & (1<<31)) {
36 char misc[20]; 35 char misc[20];
37 char addr[24]; 36 char addr[24];
38 misc[0] = addr[0] = '\0'; 37
38 misc[0] = '\0';
39 addr[0] = '\0';
40
39 if (high & (1<<29)) 41 if (high & (1<<29))
40 recover |= 1; 42 recover |= 1;
41 if (high & (1<<25)) 43 if (high & (1<<25))
42 recover |= 2; 44 recover |= 2;
43 high &= ~(1<<31); 45 high &= ~(1<<31);
46
44 if (high & (1<<27)) { 47 if (high & (1<<27)) {
45 rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh); 48 rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh);
46 snprintf(misc, 20, "[%08x%08x]", ahigh, alow); 49 snprintf(misc, 20, "[%08x%08x]", ahigh, alow);
@@ -49,27 +52,31 @@ static void k7_machine_check(struct pt_regs *regs, long error_code)
49 rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh); 52 rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
50 snprintf(addr, 24, " at %08x%08x", ahigh, alow); 53 snprintf(addr, 24, " at %08x%08x", ahigh, alow);
51 } 54 }
55
52 printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n", 56 printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n",
53 smp_processor_id(), i, high, low, misc, addr); 57 smp_processor_id(), i, high, low, misc, addr);
54 /* Clear it */ 58
59 /* Clear it: */
55 wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL); 60 wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL);
56 /* Serialize */ 61 /* Serialize: */
57 wmb(); 62 wmb();
58 add_taint(TAINT_MACHINE_CHECK); 63 add_taint(TAINT_MACHINE_CHECK);
59 } 64 }
60 } 65 }
61 66
62 if (recover&2) 67 if (recover & 2)
63 panic("CPU context corrupt"); 68 panic("CPU context corrupt");
64 if (recover&1) 69 if (recover & 1)
65 panic("Unable to continue"); 70 panic("Unable to continue");
71
66 printk(KERN_EMERG "Attempting to continue.\n"); 72 printk(KERN_EMERG "Attempting to continue.\n");
73
67 mcgstl &= ~(1<<2); 74 mcgstl &= ~(1<<2);
68 wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); 75 wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
69} 76}
70 77
71 78
72/* AMD K7 machine check is Intel like */ 79/* AMD K7 machine check is Intel like: */
73void amd_mcheck_init(struct cpuinfo_x86 *c) 80void amd_mcheck_init(struct cpuinfo_x86 *c)
74{ 81{
75 u32 l, h; 82 u32 l, h;
@@ -79,21 +86,26 @@ void amd_mcheck_init(struct cpuinfo_x86 *c)
79 return; 86 return;
80 87
81 machine_check_vector = k7_machine_check; 88 machine_check_vector = k7_machine_check;
89 /* Make sure the vector pointer is visible before we enable MCEs: */
82 wmb(); 90 wmb();
83 91
84 printk(KERN_INFO "Intel machine check architecture supported.\n"); 92 printk(KERN_INFO "Intel machine check architecture supported.\n");
93
85 rdmsr(MSR_IA32_MCG_CAP, l, h); 94 rdmsr(MSR_IA32_MCG_CAP, l, h);
86 if (l & (1<<8)) /* Control register present ? */ 95 if (l & (1<<8)) /* Control register present ? */
87 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); 96 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
88 nr_mce_banks = l & 0xff; 97 nr_mce_banks = l & 0xff;
89 98
90 /* Clear status for MC index 0 separately, we don't touch CTL, 99 /*
91 * as some K7 Athlons cause spurious MCEs when its enabled. */ 100 * Clear status for MC index 0 separately, we don't touch CTL,
101 * as some K7 Athlons cause spurious MCEs when its enabled:
102 */
92 if (boot_cpu_data.x86 == 6) { 103 if (boot_cpu_data.x86 == 6) {
93 wrmsr(MSR_IA32_MC0_STATUS, 0x0, 0x0); 104 wrmsr(MSR_IA32_MC0_STATUS, 0x0, 0x0);
94 i = 1; 105 i = 1;
95 } else 106 } else
96 i = 0; 107 i = 0;
108
97 for (; i < nr_mce_banks; i++) { 109 for (; i < nr_mce_banks; i++) {
98 wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff); 110 wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
99 wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0); 111 wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
new file mode 100644
index 000000000000..a3a235a53f09
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -0,0 +1,127 @@
1/*
2 * Machine check injection support.
3 * Copyright 2008 Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License
7 * as published by the Free Software Foundation; version 2
8 * of the License.
9 *
10 * Authors:
11 * Andi Kleen
12 * Ying Huang
13 */
14#include <linux/uaccess.h>
15#include <linux/module.h>
16#include <linux/timer.h>
17#include <linux/kernel.h>
18#include <linux/string.h>
19#include <linux/fs.h>
20#include <linux/smp.h>
21#include <asm/mce.h>
22
23/* Update fake mce registers on current CPU. */
24static void inject_mce(struct mce *m)
25{
26 struct mce *i = &per_cpu(injectm, m->extcpu);
27
28 /* Make sure no one reads partially written injectm */
29 i->finished = 0;
30 mb();
31 m->finished = 0;
32 /* First set the fields after finished */
33 i->extcpu = m->extcpu;
34 mb();
35 /* Now write record in order, finished last (except above) */
36 memcpy(i, m, sizeof(struct mce));
37 /* Finally activate it */
38 mb();
39 i->finished = 1;
40}
41
42struct delayed_mce {
43 struct timer_list timer;
44 struct mce m;
45};
46
47/* Inject mce on current CPU */
48static void raise_mce(unsigned long data)
49{
50 struct delayed_mce *dm = (struct delayed_mce *)data;
51 struct mce *m = &dm->m;
52 int cpu = m->extcpu;
53
54 inject_mce(m);
55 if (m->status & MCI_STATUS_UC) {
56 struct pt_regs regs;
57 memset(&regs, 0, sizeof(struct pt_regs));
58 regs.ip = m->ip;
59 regs.cs = m->cs;
60 printk(KERN_INFO "Triggering MCE exception on CPU %d\n", cpu);
61 do_machine_check(&regs, 0);
62 printk(KERN_INFO "MCE exception done on CPU %d\n", cpu);
63 } else {
64 mce_banks_t b;
65 memset(&b, 0xff, sizeof(mce_banks_t));
66 printk(KERN_INFO "Starting machine check poll CPU %d\n", cpu);
67 machine_check_poll(0, &b);
68 mce_notify_irq();
69 printk(KERN_INFO "Finished machine check poll on CPU %d\n",
70 cpu);
71 }
72 kfree(dm);
73}
74
75/* Error injection interface */
76static ssize_t mce_write(struct file *filp, const char __user *ubuf,
77 size_t usize, loff_t *off)
78{
79 struct delayed_mce *dm;
80 struct mce m;
81
82 if (!capable(CAP_SYS_ADMIN))
83 return -EPERM;
84 /*
85 * There are some cases where real MSR reads could slip
86 * through.
87 */
88 if (!boot_cpu_has(X86_FEATURE_MCE) || !boot_cpu_has(X86_FEATURE_MCA))
89 return -EIO;
90
91 if ((unsigned long)usize > sizeof(struct mce))
92 usize = sizeof(struct mce);
93 if (copy_from_user(&m, ubuf, usize))
94 return -EFAULT;
95
96 if (m.extcpu >= num_possible_cpus() || !cpu_online(m.extcpu))
97 return -EINVAL;
98
99 dm = kmalloc(sizeof(struct delayed_mce), GFP_KERNEL);
100 if (!dm)
101 return -ENOMEM;
102
103 /*
104 * Need to give user space some time to set everything up,
105 * so do it a jiffy or two later everywhere.
106 * Should we use a hrtimer here for better synchronization?
107 */
108 memcpy(&dm->m, &m, sizeof(struct mce));
109 setup_timer(&dm->timer, raise_mce, (unsigned long)dm);
110 dm->timer.expires = jiffies + 2;
111 add_timer_on(&dm->timer, m.extcpu);
112 return usize;
113}
114
115static int inject_init(void)
116{
117 printk(KERN_INFO "Machine check injector initialized\n");
118 mce_chrdev_ops.write = mce_write;
119 return 0;
120}
121
122module_init(inject_init);
123/*
124 * Cannot tolerate unloading currently because we cannot
125 * guarantee all openers of mce_chrdev will get a reference to us.
126 */
127MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
new file mode 100644
index 000000000000..54dcb8ff12e5
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -0,0 +1,15 @@
1#include <asm/mce.h>
2
3enum severity_level {
4 MCE_NO_SEVERITY,
5 MCE_KEEP_SEVERITY,
6 MCE_SOME_SEVERITY,
7 MCE_AO_SEVERITY,
8 MCE_UC_SEVERITY,
9 MCE_AR_SEVERITY,
10 MCE_PANIC_SEVERITY,
11};
12
13int mce_severity(struct mce *a, int tolerant, char **msg);
14
15extern int mce_ser;
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
new file mode 100644
index 000000000000..ff0807f97056
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -0,0 +1,218 @@
1/*
2 * MCE grading rules.
3 * Copyright 2008, 2009 Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License
7 * as published by the Free Software Foundation; version 2
8 * of the License.
9 *
10 * Author: Andi Kleen
11 */
12#include <linux/kernel.h>
13#include <linux/seq_file.h>
14#include <linux/init.h>
15#include <linux/debugfs.h>
16#include <asm/mce.h>
17
18#include "mce-internal.h"
19
20/*
21 * Grade an mce by severity. In general the most severe ones are processed
22 * first. Since there are quite a lot of combinations, test the bits in a
23 * table-driven way. The rules are simply processed in order, first
24 * match wins.
25 *
26 * Note this is only used for machine check exceptions, the corrected
27 * errors use much simpler rules. The exceptions still check for the corrected
28 * errors, but only to leave them alone for the CMCI handler (except for
29 * panic situations)
30 */
31
32enum context { IN_KERNEL = 1, IN_USER = 2 };
33enum ser { SER_REQUIRED = 1, NO_SER = 2 };
34
35static struct severity {
36 u64 mask;
37 u64 result;
38 unsigned char sev;
39 unsigned char mcgmask;
40 unsigned char mcgres;
41 unsigned char ser;
42 unsigned char context;
43 unsigned char covered;
44 char *msg;
45} severities[] = {
46#define KERNEL .context = IN_KERNEL
47#define USER .context = IN_USER
48#define SER .ser = SER_REQUIRED
49#define NOSER .ser = NO_SER
50#define SEV(s) .sev = MCE_ ## s ## _SEVERITY
51#define BITCLR(x, s, m, r...) { .mask = x, .result = 0, SEV(s), .msg = m, ## r }
52#define BITSET(x, s, m, r...) { .mask = x, .result = x, SEV(s), .msg = m, ## r }
53#define MCGMASK(x, res, s, m, r...) \
54 { .mcgmask = x, .mcgres = res, SEV(s), .msg = m, ## r }
55#define MASK(x, y, s, m, r...) \
56 { .mask = x, .result = y, SEV(s), .msg = m, ## r }
57#define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S)
58#define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR)
59#define MCACOD 0xffff
60
61 BITCLR(MCI_STATUS_VAL, NO, "Invalid"),
62 BITCLR(MCI_STATUS_EN, NO, "Not enabled"),
63 BITSET(MCI_STATUS_PCC, PANIC, "Processor context corrupt"),
64 /* When MCIP is not set something is very confused */
65 MCGMASK(MCG_STATUS_MCIP, 0, PANIC, "MCIP not set in MCA handler"),
66 /* Neither return nor error IP -- no chance to recover -> PANIC */
67 MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0, PANIC,
68 "Neither restart nor error IP"),
69 MCGMASK(MCG_STATUS_RIPV, 0, PANIC, "In kernel and no restart IP",
70 KERNEL),
71 BITCLR(MCI_STATUS_UC, KEEP, "Corrected error", NOSER),
72 MASK(MCI_STATUS_OVER|MCI_STATUS_UC|MCI_STATUS_EN, MCI_STATUS_UC, SOME,
73 "Spurious not enabled", SER),
74
75 /* ignore OVER for UCNA */
76 MASK(MCI_UC_SAR, MCI_STATUS_UC, KEEP,
77 "Uncorrected no action required", SER),
78 MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR, PANIC,
79 "Illegal combination (UCNA with AR=1)", SER),
80 MASK(MCI_STATUS_S, 0, KEEP, "Non signalled machine check", SER),
81
82 /* AR add known MCACODs here */
83 MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_SAR, PANIC,
84 "Action required with lost events", SER),
85 MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_SAR, PANIC,
86 "Action required; unknown MCACOD", SER),
87
88 /* known AO MCACODs: */
89 MASK(MCI_UC_SAR|MCI_STATUS_OVER|0xfff0, MCI_UC_S|0xc0, AO,
90 "Action optional: memory scrubbing error", SER),
91 MASK(MCI_UC_SAR|MCI_STATUS_OVER|MCACOD, MCI_UC_S|0x17a, AO,
92 "Action optional: last level cache writeback error", SER),
93
94 MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S, SOME,
95 "Action optional unknown MCACOD", SER),
96 MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S|MCI_STATUS_OVER, SOME,
97 "Action optional with lost events", SER),
98 BITSET(MCI_STATUS_UC|MCI_STATUS_OVER, PANIC, "Overflowed uncorrected"),
99 BITSET(MCI_STATUS_UC, UC, "Uncorrected"),
100 BITSET(0, SOME, "No match") /* always matches. keep at end */
101};
102
103/*
104 * If the EIPV bit is set, it means the saved IP is the
105 * instruction which caused the MCE.
106 */
107static int error_context(struct mce *m)
108{
109 if (m->mcgstatus & MCG_STATUS_EIPV)
110 return (m->ip && (m->cs & 3) == 3) ? IN_USER : IN_KERNEL;
111 /* Unknown, assume kernel */
112 return IN_KERNEL;
113}
114
115int mce_severity(struct mce *a, int tolerant, char **msg)
116{
117 enum context ctx = error_context(a);
118 struct severity *s;
119
120 for (s = severities;; s++) {
121 if ((a->status & s->mask) != s->result)
122 continue;
123 if ((a->mcgstatus & s->mcgmask) != s->mcgres)
124 continue;
125 if (s->ser == SER_REQUIRED && !mce_ser)
126 continue;
127 if (s->ser == NO_SER && mce_ser)
128 continue;
129 if (s->context && ctx != s->context)
130 continue;
131 if (msg)
132 *msg = s->msg;
133 s->covered = 1;
134 if (s->sev >= MCE_UC_SEVERITY && ctx == IN_KERNEL) {
135 if (panic_on_oops || tolerant < 1)
136 return MCE_PANIC_SEVERITY;
137 }
138 return s->sev;
139 }
140}
141
142static void *s_start(struct seq_file *f, loff_t *pos)
143{
144 if (*pos >= ARRAY_SIZE(severities))
145 return NULL;
146 return &severities[*pos];
147}
148
149static void *s_next(struct seq_file *f, void *data, loff_t *pos)
150{
151 if (++(*pos) >= ARRAY_SIZE(severities))
152 return NULL;
153 return &severities[*pos];
154}
155
156static void s_stop(struct seq_file *f, void *data)
157{
158}
159
160static int s_show(struct seq_file *f, void *data)
161{
162 struct severity *ser = data;
163 seq_printf(f, "%d\t%s\n", ser->covered, ser->msg);
164 return 0;
165}
166
167static const struct seq_operations severities_seq_ops = {
168 .start = s_start,
169 .next = s_next,
170 .stop = s_stop,
171 .show = s_show,
172};
173
174static int severities_coverage_open(struct inode *inode, struct file *file)
175{
176 return seq_open(file, &severities_seq_ops);
177}
178
179static ssize_t severities_coverage_write(struct file *file,
180 const char __user *ubuf,
181 size_t count, loff_t *ppos)
182{
183 int i;
184 for (i = 0; i < ARRAY_SIZE(severities); i++)
185 severities[i].covered = 0;
186 return count;
187}
188
189static const struct file_operations severities_coverage_fops = {
190 .open = severities_coverage_open,
191 .release = seq_release,
192 .read = seq_read,
193 .write = severities_coverage_write,
194};
195
196static int __init severities_debugfs_init(void)
197{
198 struct dentry *dmce = NULL, *fseverities_coverage = NULL;
199
200 dmce = debugfs_create_dir("mce", NULL);
201 if (dmce == NULL)
202 goto err_out;
203 fseverities_coverage = debugfs_create_file("severities-coverage",
204 0444, dmce, NULL,
205 &severities_coverage_fops);
206 if (fseverities_coverage == NULL)
207 goto err_out;
208
209 return 0;
210
211err_out:
212 if (fseverities_coverage)
213 debugfs_remove(fseverities_coverage);
214 if (dmce)
215 debugfs_remove(dmce);
216 return -ENOMEM;
217}
218late_initcall(severities_debugfs_init);
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
new file mode 100644
index 000000000000..fabba15e4558
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -0,0 +1,1964 @@
1/*
2 * Machine check handler.
3 *
4 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
5 * Rest from unknown author(s).
6 * 2004 Andi Kleen. Rewrote most of it.
7 * Copyright 2008 Intel Corporation
8 * Author: Andi Kleen
9 */
10#include <linux/thread_info.h>
11#include <linux/capability.h>
12#include <linux/miscdevice.h>
13#include <linux/interrupt.h>
14#include <linux/ratelimit.h>
15#include <linux/kallsyms.h>
16#include <linux/rcupdate.h>
17#include <linux/kobject.h>
18#include <linux/uaccess.h>
19#include <linux/kdebug.h>
20#include <linux/kernel.h>
21#include <linux/percpu.h>
22#include <linux/string.h>
23#include <linux/sysdev.h>
24#include <linux/delay.h>
25#include <linux/ctype.h>
26#include <linux/sched.h>
27#include <linux/sysfs.h>
28#include <linux/types.h>
29#include <linux/init.h>
30#include <linux/kmod.h>
31#include <linux/poll.h>
32#include <linux/nmi.h>
33#include <linux/cpu.h>
34#include <linux/smp.h>
35#include <linux/fs.h>
36#include <linux/mm.h>
37
38#include <asm/processor.h>
39#include <asm/hw_irq.h>
40#include <asm/apic.h>
41#include <asm/idle.h>
42#include <asm/ipi.h>
43#include <asm/mce.h>
44#include <asm/msr.h>
45
46#include "mce-internal.h"
47#include "mce.h"
48
49/* Handle unconfigured int18 (should never happen) */
50static void unexpected_machine_check(struct pt_regs *regs, long error_code)
51{
52 printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
53 smp_processor_id());
54}
55
56/*
 * Machine check handler currently installed for this CPU setup.
 * Defaults to unexpected_machine_check(), which only logs the event.
 */
57void (*machine_check_vector)(struct pt_regs *, long error_code) =
58 unexpected_machine_check;
59
/* Non-zero disables machine check handling: mce_available() then returns 0. */
60int mce_disabled;
61
62#ifdef CONFIG_X86_NEW_MCE
63
64#define MISC_MCELOG_MINOR 227
65
/* Step, in ns, of the busy-wait loops used to synchronize the CPUs. */
66#define SPINUNIT 100 /* 100ns */
67
/* Incremented on entry to do_machine_check() and decremented on exit. */
68atomic_t mce_entry;
69
/* Per-CPU count of machine check exceptions, bumped in do_machine_check(). */
70DEFINE_PER_CPU(unsigned, mce_exception_count);
71
72/*
73 * Tolerant levels:
74 * 0: always panic on uncorrected errors, log corrected errors
75 * 1: panic or SIGBUS on uncorrected errors, log corrected errors
76 * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors
77 * 3: never panic or SIGBUS, log all errors (for testing only)
78 */
79static int tolerant = 1;
/* Number of MCA banks in use; set by mce_cap_init() from MCG_CAP. */
80static int banks;
/*
 * Per-bank value written to the bank's CTL MSR by mce_init(). The poll and
 * exception handlers skip banks whose entry is zero.
 */
81static u64 *bank;
/* Bit 0 is set by mce_log() and consumed by mce_notify_irq(). */
82static unsigned long notify_user;
/* MSR to read the RIP from, or 0 when none is used (see mce_cap_init()). */
83static int rip_msr;
/* Zero makes mce_init() pass MCP_DONTLOG when polling left-over errors. */
84static int mce_bootlog = -1;
/*
 * Time budget of the Monarch synchronization loops; mce_start()/mce_end()
 * scale it by NSEC_PER_USEC. Zero disables the synchronization.
 */
85static int monarch_timeout = -1;
/* Copied into panic_timeout by the MCE panic paths when that is still 0. */
86static int mce_panic_timeout;
/* Non-zero keeps machine_check_poll() from logging the events it finds. */
87static int mce_dont_log_ce;
88int mce_cmci_disabled;
89int mce_ignore_ce;
/* Set by mce_cap_init() when MCG_CAP advertises MCG_SER_P. */
90int mce_ser;
91
/* Path of the user mode helper run by mce_do_trigger(); empty means none. */
92static char trigger[128];
93static char *trigger_argv[2] = { trigger, NULL };
94
/* Bitmask of banks for which mce_init() must not write the CTL/STATUS MSRs. */
95static unsigned long dont_init_banks;
96
/* Woken by mce_notify_irq() when new events have been logged. */
97static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
/* Per-CPU record filled in by do_machine_check() and graded by mce_reign(). */
98static DEFINE_PER_CPU(struct mce, mces_seen);
/* Set by mce_timed_out() when waiting for the other CPUs timed out. */
99static int cpu_missing;
100
101
102/*
 * MCA banks polled by the periodic polling timer for corrected events.
 * Every bit starts out set.
 */
103DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
104 [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
105};
106
107static inline int skip_bank_init(int i)
108{
109 return i < BITS_PER_LONG && test_bit(i, &dont_init_banks);
110}
111
112static DEFINE_PER_CPU(struct work_struct, mce_work);
113
114/* Do initial initialization of a struct mce */
115void mce_setup(struct mce *m)
116{
117 memset(m, 0, sizeof(struct mce));
118 m->cpu = m->extcpu = smp_processor_id();
119 rdtscll(m->tsc);
120 /* We hope get_seconds stays lockless */
121 m->time = get_seconds();
122 m->cpuvendor = boot_cpu_data.x86_vendor;
123 m->cpuid = cpuid_eax(1);
124#ifdef CONFIG_SMP
125 m->socketid = cpu_data(m->extcpu).phys_proc_id;
126#endif
127 m->apicid = cpu_data(m->extcpu).initial_apicid;
128 rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
129}
130
131DEFINE_PER_CPU(struct mce, injectm);
132EXPORT_PER_CPU_SYMBOL_GPL(injectm);
133
134/*
135 * Lockless MCE logging infrastructure.
136 * This avoids deadlocks on printk locks without having to break locks. Also
137 * separate MCEs from kernel messages to avoid bogus bug reports.
138 */
139
/*
 * The global record buffer filled by mce_log(). The header advertises the
 * signature, the number of slots and the size of one record.
 */
140static struct mce_log mcelog = {
141 .signature = MCE_LOG_SIGNATURE,
142 .len = MCE_LOG_LEN,
143 .recordlen = sizeof(struct mce),
144};
145
/*
 * Append a copy of @mce to the global mcelog buffer without taking a lock.
 *
 * A free slot is claimed by advancing mcelog.next with cmpxchg(); the record
 * is then copied in and its 'finished' flag set last. When the buffer is
 * full the record is dropped and MCE_OVERFLOW is set in mcelog.flags.
 * On success @mce->finished is set to 1 and bit 0 of notify_user is raised.
 */
146void mce_log(struct mce *mce)
147{
148 unsigned next, entry;
149
150 mce->finished = 0;
151 wmb();
152 for (;;) {
153 entry = rcu_dereference(mcelog.next);
154 for (;;) {
155 /*
156 * When the buffer fills up discard new entries.
157 * Assume that the earlier errors are the more
158 * interesting ones:
159 */
160 if (entry >= MCE_LOG_LEN) {
161 set_bit(MCE_OVERFLOW,
162 (unsigned long *)&mcelog.flags);
163 return;
164 }
165 /* Old left over entry. Skip: */
166 if (mcelog.entry[entry].finished) {
167 entry++;
168 continue;
169 }
170 break;
171 }
172 smp_rmb();
173 next = entry + 1;
/* Claim slot 'entry'; if the cmpxchg lost a race, start over. */
174 if (cmpxchg(&mcelog.next, entry, next) == entry)
175 break;
176 }
177 memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
/* The copied record is marked finished only after the copy itself. */
178 wmb();
179 mcelog.entry[entry].finished = 1;
180 wmb();
181
182 mce->finished = 1;
183 set_bit(0, &notify_user);
184}
185
186static void print_mce(struct mce *m)
187{
188 printk(KERN_EMERG
189 "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
190 m->extcpu, m->mcgstatus, m->bank, m->status);
191 if (m->ip) {
192 printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
193 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
194 m->cs, m->ip);
195 if (m->cs == __KERNEL_CS)
196 print_symbol("{%s}", m->ip);
197 printk("\n");
198 }
199 printk(KERN_EMERG "TSC %llx ", m->tsc);
200 if (m->addr)
201 printk("ADDR %llx ", m->addr);
202 if (m->misc)
203 printk("MISC %llx ", m->misc);
204 printk("\n");
205 printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
206 m->cpuvendor, m->cpuid, m->time, m->socketid,
207 m->apicid);
208}
209
/* Print the banner that precedes a machine check dump (see mce_panic()). */
210static void print_mce_head(void)
211{
212 printk(KERN_EMERG "\n" KERN_EMERG "HARDWARE ERROR\n");
213}
214
/* Print the closing hint that follows a machine check dump (see mce_panic()). */
215static void print_mce_tail(void)
216{
217 printk(KERN_EMERG "This is not a software problem!\n"
218 KERN_EMERG "Run through mcelog --ascii to decode and contact your hardware vendor\n");
219}
220
221#define PANIC_TIMEOUT 5 /* 5 seconds */
222
223static atomic_t mce_paniced;
224
225/* Panic in progress. Enable interrupts and wait for final IPI */
226static void wait_for_panic(void)
227{
228 long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
229 preempt_disable();
230 local_irq_enable();
231 while (timeout-- > 0)
232 udelay(1);
233 if (panic_timeout == 0)
234 panic_timeout = mce_panic_timeout;
235 panic("Panicing machine check CPU died");
236}
237
238static void mce_panic(char *msg, struct mce *final, char *exp)
239{
240 int i;
241
242 /*
243 * Make sure only one CPU runs in machine check panic
244 */
245 if (atomic_add_return(1, &mce_paniced) > 1)
246 wait_for_panic();
247 barrier();
248
249 bust_spinlocks(1);
250 console_verbose();
251 print_mce_head();
252 /* First print corrected ones that are still unlogged */
253 for (i = 0; i < MCE_LOG_LEN; i++) {
254 struct mce *m = &mcelog.entry[i];
255 if (!(m->status & MCI_STATUS_VAL))
256 continue;
257 if (!(m->status & MCI_STATUS_UC))
258 print_mce(m);
259 }
260 /* Now print uncorrected but with the final one last */
261 for (i = 0; i < MCE_LOG_LEN; i++) {
262 struct mce *m = &mcelog.entry[i];
263 if (!(m->status & MCI_STATUS_VAL))
264 continue;
265 if (!(m->status & MCI_STATUS_UC))
266 continue;
267 if (!final || memcmp(m, final, sizeof(struct mce)))
268 print_mce(m);
269 }
270 if (final)
271 print_mce(final);
272 if (cpu_missing)
273 printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n");
274 print_mce_tail();
275 if (exp)
276 printk(KERN_EMERG "Machine check: %s\n", exp);
277 if (panic_timeout == 0)
278 panic_timeout = mce_panic_timeout;
279 panic(msg);
280}
281
282/* Support code for software error injection */
283
284static int msr_to_offset(u32 msr)
285{
286 unsigned bank = __get_cpu_var(injectm.bank);
287 if (msr == rip_msr)
288 return offsetof(struct mce, ip);
289 if (msr == MSR_IA32_MC0_STATUS + bank*4)
290 return offsetof(struct mce, status);
291 if (msr == MSR_IA32_MC0_ADDR + bank*4)
292 return offsetof(struct mce, addr);
293 if (msr == MSR_IA32_MC0_MISC + bank*4)
294 return offsetof(struct mce, misc);
295 if (msr == MSR_IA32_MCG_STATUS)
296 return offsetof(struct mce, mcgstatus);
297 return -1;
298}
299
300/* MSR access wrappers used for error injection */
301static u64 mce_rdmsrl(u32 msr)
302{
303 u64 v;
304 if (__get_cpu_var(injectm).finished) {
305 int offset = msr_to_offset(msr);
306 if (offset < 0)
307 return 0;
308 return *(u64 *)((char *)&__get_cpu_var(injectm) + offset);
309 }
310 rdmsrl(msr, v);
311 return v;
312}
313
314static void mce_wrmsrl(u32 msr, u64 v)
315{
316 if (__get_cpu_var(injectm).finished) {
317 int offset = msr_to_offset(msr);
318 if (offset >= 0)
319 *(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v;
320 return;
321 }
322 wrmsrl(msr, v);
323}
324
325/*
326 * Simple lockless ring to communicate PFNs from the exception handler with the
327 * process context work function. This is vastly simplified because there's
328 * only a single reader and a single writer.
329 */
330#define MCE_RING_SIZE 16 /* we use one entry less */
331
/*
 * Per-CPU ring of page frame numbers. mce_ring_add() stores at 'end',
 * mce_ring_get() consumes from 'start'; start == end means empty.
 */
332struct mce_ring {
/* Index of the next entry to be consumed. */
333 unsigned short start;
/* Index of the next free slot. */
334 unsigned short end;
335 unsigned long ring[MCE_RING_SIZE];
336};
337static DEFINE_PER_CPU(struct mce_ring, mce_ring);
338
339/* Runs with CPU affinity in workqueue */
340static int mce_ring_empty(void)
341{
342 struct mce_ring *r = &__get_cpu_var(mce_ring);
343
344 return r->start == r->end;
345}
346
347static int mce_ring_get(unsigned long *pfn)
348{
349 struct mce_ring *r;
350 int ret = 0;
351
352 *pfn = 0;
353 get_cpu();
354 r = &__get_cpu_var(mce_ring);
355 if (r->start == r->end)
356 goto out;
357 *pfn = r->ring[r->start];
358 r->start = (r->start + 1) % MCE_RING_SIZE;
359 ret = 1;
360out:
361 put_cpu();
362 return ret;
363}
364
365/* Always runs in MCE context with preempt off */
366static int mce_ring_add(unsigned long pfn)
367{
368 struct mce_ring *r = &__get_cpu_var(mce_ring);
369 unsigned next;
370
371 next = (r->end + 1) % MCE_RING_SIZE;
372 if (next == r->start)
373 return -1;
374 r->ring[r->end] = pfn;
375 wmb();
376 r->end = next;
377 return 0;
378}
379
380int mce_available(struct cpuinfo_x86 *c)
381{
382 if (mce_disabled)
383 return 0;
384 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
385}
386
387static void mce_schedule_work(void)
388{
389 if (!mce_ring_empty()) {
390 struct work_struct *work = &__get_cpu_var(mce_work);
391 if (!work_pending(work))
392 schedule_work(work);
393 }
394}
395
396/*
397 * Get the address of the instruction at the time of the machine check
398 * error.
399 */
400static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
401{
402
403 if (regs && (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV))) {
404 m->ip = regs->ip;
405 m->cs = regs->cs;
406 } else {
407 m->ip = 0;
408 m->cs = 0;
409 }
410 if (rip_msr)
411 m->ip = mce_rdmsrl(rip_msr);
412}
413
414#ifdef CONFIG_X86_LOCAL_APIC
415/*
416 * Called after interrupts have been reenabled again
417 * when a MCE happened during an interrupts off region
418 * in the kernel.
419 */
/*
 * Handler of the self-IPI sent by mce_report_event(): acknowledge the
 * interrupt, then perform the user notification and schedule the PFN work
 * that could not be done from the machine check handler itself.
 */
420asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs)
421{
422 ack_APIC_irq();
423 exit_idle();
424 irq_enter();
425 mce_notify_irq();
426 mce_schedule_work();
427 irq_exit();
428}
429#endif
430
/*
 * Get a logged event reported to user space.
 *
 * If the interrupted context (@regs) had interrupts enabled or was in
 * vm86 mode, notify and schedule the work directly. Otherwise defer both
 * to smp_mce_self_interrupt() by sending a self-IPI, provided a local
 * APIC is configured and present.
 */
431static void mce_report_event(struct pt_regs *regs)
432{
433 if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
434 mce_notify_irq();
435 /*
436 * Triggering the work queue here is just an insurance
437 * policy in case the syscall exit notify handler
438 * doesn't run soon enough or ends up running on the
439 * wrong CPU (can happen when audit sleeps)
440 */
441 mce_schedule_work();
442 return;
443 }
444
445#ifdef CONFIG_X86_LOCAL_APIC
446 /*
447 * Without APIC do not notify. The event will be picked
448 * up eventually.
449 */
450 if (!cpu_has_apic)
451 return;
452
453 /*
454 * When interrupts are disabled we cannot use
455 * kernel services safely. Trigger a self interrupt
456 * through the APIC to instead do the notification
457 * after interrupts are reenabled again.
458 */
459 apic->send_IPI_self(MCE_SELF_VECTOR);
460
461 /*
462 * Wait for idle afterwards again so that we don't leave the
463 * APIC in a non idle state because the normal APIC writes
464 * cannot exclude us.
465 */
466 apic_wait_icr_idle();
467#endif
468}
469
470DEFINE_PER_CPU(unsigned, mce_poll_count);
471
472/*
473 * Poll for corrected events or events that happened before reset.
474 * Those are just logged through /dev/mcelog.
475 *
476 * This is executed in standard interrupt context.
477 *
478 * Note: spec recommends to panic for fatal unsignalled
479 * errors here. However this would be quite problematic --
480 * we would need to reimplement the Monarch handling and
481 * it would mess up the exclusion between exception handler
482 * and poll hander -- * so we skip this for now.
483 * These cases should not happen anyways, or only when the CPU
484 * is already totally * confused. In this case it's likely it will
485 * not fully execute the machine check handler either.
486 */
487void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
488{
489 struct mce m;
490 int i;
491
492 __get_cpu_var(mce_poll_count)++;
493
494 mce_setup(&m);
495
496 m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
497 for (i = 0; i < banks; i++) {
498 if (!bank[i] || !test_bit(i, *b))
499 continue;
500
501 m.misc = 0;
502 m.addr = 0;
503 m.bank = i;
504 m.tsc = 0;
505
506 barrier();
507 m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
508 if (!(m.status & MCI_STATUS_VAL))
509 continue;
510
511 /*
512 * Uncorrected or signalled events are handled by the exception
513 * handler when it is enabled, so don't process those here.
514 *
515 * TBD do the same check for MCI_STATUS_EN here?
516 */
517 if (!(flags & MCP_UC) &&
518 (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)))
519 continue;
520
521 if (m.status & MCI_STATUS_MISCV)
522 m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4);
523 if (m.status & MCI_STATUS_ADDRV)
524 m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4);
525
526 if (!(flags & MCP_TIMESTAMP))
527 m.tsc = 0;
528 /*
529 * Don't get the IP here because it's unlikely to
530 * have anything to do with the actual error location.
531 */
532 if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) {
533 mce_log(&m);
534 add_taint(TAINT_MACHINE_CHECK);
535 }
536
537 /*
538 * Clear state for this bank.
539 */
540 mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
541 }
542
543 /*
544 * Don't clear MCG_STATUS here because it's only defined for
545 * exceptions.
546 */
547
548 sync_core();
549}
550EXPORT_SYMBOL_GPL(machine_check_poll);
551
552/*
553 * Do a quick check if any of the events requires a panic.
554 * This decides if we keep the events around or clear them.
555 */
556static int mce_no_way_out(struct mce *m, char **msg)
557{
558 int i;
559
560 for (i = 0; i < banks; i++) {
561 m->status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
562 if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
563 return 1;
564 }
565 return 0;
566}
567
568/*
569 * Variable to establish order between CPUs while scanning.
570 * Each CPU spins initially until mce_executing has reached its own
 * callin number (see mce_start()).
571 */
572static atomic_t mce_executing;
573
574/*
575 * Defines order of CPUs on entry. First CPU becomes Monarch.
576 */
577static atomic_t mce_callin;
578
579/*
580 * Check if a timeout waiting for other CPUs happened.
 *
 * @t: remaining time budget in ns; reduced by SPINUNIT on every call that
 *     does not time out (unless monarch_timeout is 0, which disables the
 *     timeout altogether).
 *
 * Returns 1 when the budget is used up (cpu_missing is set then, and with
 * tolerant < 1 the machine panics instead), 0 otherwise. Does not return
 * when another CPU has already started an MCE panic.
581 */
582static int mce_timed_out(u64 *t)
583{
584 /*
585 * The others already did panic for some reason.
586 * Bail out like in a timeout.
587 * rmb() so that a mce_paniced update done by another
588 * CPU is observed by the read below.
589 */
590 rmb();
591 if (atomic_read(&mce_paniced))
592 wait_for_panic();
593 if (!monarch_timeout)
594 goto out;
595 if ((s64)*t < SPINUNIT) {
596 /* CHECKME: Make panic default for 1 too? */
597 if (tolerant < 1)
598 mce_panic("Timeout synchronizing machine check over CPUs",
599 NULL, NULL);
600 cpu_missing = 1;
601 return 1;
602 }
603 *t -= SPINUNIT;
604out:
605 touch_nmi_watchdog();
606 return 0;
607}
608
609/*
610 * The Monarch's reign. The Monarch is the CPU who entered
611 * the machine check handler first. It waits for the others to
612 * raise the exception too and then grades them. When any
613 * error is fatal panic. Only then let the others continue.
614 *
615 * The other CPUs entering the MCE handler will be controlled by the
616 * Monarch. They are called Subjects.
617 *
618 * This way we prevent any potential data corruption in a unrecoverable case
619 * and also makes sure always all CPU's errors are examined.
620 *
621 * Also this detects the case of an machine check event coming from outer
622 * space (not detected by any CPUs) In this case some external agent wants
623 * us to shut down, so panic too.
624 *
625 * The other CPUs might still decide to panic if the handler happens
626 * in a unrecoverable place, but in this case the system is in a semi-stable
627 * state and won't corrupt anything by itself. It's ok to let the others
628 * continue for a bit first.
629 *
630 * All the spin loops have timeouts; when a timeout happens a CPU
631 * typically elects itself to be Monarch.
632 */
633static void mce_reign(void)
634{
635 int cpu;
636 struct mce *m = NULL;
637 int global_worst = 0;
638 char *msg = NULL;
639 char *nmsg = NULL;
640
641 /*
642 * This CPU is the Monarch and the other CPUs have run
643 * through their handlers.
644 * Grade the severity of the errors of all the CPUs.
645 */
646 for_each_possible_cpu(cpu) {
647 int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant,
648 &nmsg);
649 if (severity > global_worst) {
650 msg = nmsg;
651 global_worst = severity;
652 m = &per_cpu(mces_seen, cpu);
653 }
654 }
655
656 /*
657 * Cannot recover? Panic here then.
658 * This dumps all the mces in the log buffer and stops the
659 * other CPUs.
660 */
661 if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3)
662 mce_panic("Fatal Machine check", m, msg);
663
664 /*
665 * For UC somewhere we let the CPU who detects it handle it.
666 * Also must let continue the others, otherwise the handling
667 * CPU could deadlock on a lock.
668 */
669
670 /*
671 * No machine check event found. Must be some external
672 * source or one CPU is hung. Panic.
673 */
674 if (!m && tolerant < 3)
675 mce_panic("Machine check from unknown source", NULL, NULL);
676
677 /*
678 * Now clear all the mces_seen so that they don't reappear on
679 * the next mce.
680 */
681 for_each_possible_cpu(cpu)
682 memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
683}
684
685static atomic_t global_nwo;
686
687/*
688 * Start of Monarch synchronization. This waits until all CPUs have
689 * entered the exception handler and then determines if any of them
690 * saw a fatal event that requires panic. Then it executes them
691 * in the entry order.
692 * TBD double check parallel CPU hotunplug
693 */
/*
 * @no_way_out: this CPU's own verdict from mce_no_way_out()
 * @order:      in: this CPU's callin number (1 is the Monarch);
 *              out: set to -1 when synchronization is disabled or timed out
 *
 * Returns the global no_way_out count accumulated over all CPUs when the
 * synchronization worked, or the local @no_way_out when it did not.
 */
694static int mce_start(int no_way_out, int *order)
695{
696 int nwo;
697 int cpus = num_online_cpus();
698 u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;
699
/* monarch_timeout == 0: synchronization is switched off. */
700 if (!timeout) {
701 *order = -1;
702 return no_way_out;
703 }
704
705 atomic_add(no_way_out, &global_nwo);
706
707 /*
708 * Wait for everyone.
709 */
710 while (atomic_read(&mce_callin) != cpus) {
711 if (mce_timed_out(&timeout)) {
712 atomic_set(&global_nwo, 0);
713 *order = -1;
714 return no_way_out;
715 }
716 ndelay(SPINUNIT);
717 }
718
719 /*
720 * Cache the global no_way_out state.
721 */
722 nwo = atomic_read(&global_nwo);
723
724 /*
725 * Monarch starts executing now, the others wait.
726 */
727 if (*order == 1) {
728 atomic_set(&mce_executing, 1);
729 return nwo;
730 }
731
732 /*
733 * Now start the scanning loop one by one
734 * in the original callin order.
735 * This way when there are any shared banks it will
736 * be only seen by one CPU before cleared, avoiding duplicates.
737 */
738 while (atomic_read(&mce_executing) < *order) {
739 if (mce_timed_out(&timeout)) {
740 atomic_set(&global_nwo, 0);
741 *order = -1;
742 return no_way_out;
743 }
744 ndelay(SPINUNIT);
745 }
746 return nwo;
747}
748
749/*
750 * Synchronize between CPUs after main scanning loop.
751 * This invokes the bulk of the Monarch processing.
752 */
/*
 * @order: this CPU's callin number as left behind by mce_start()
 *         (1 is the Monarch, negative means synchronization failed)
 *
 * Returns 0 when the synchronization completed, -1 when it was disabled,
 * had already failed, or timed out here. Every path except a Subject that
 * saw the Monarch finish resets the global synchronization state.
 */
753static int mce_end(int order)
754{
755 int ret = -1;
756 u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;
757
758 if (!timeout)
759 goto reset;
760 if (order < 0)
761 goto reset;
762
763 /*
764 * Allow others to run.
765 */
766 atomic_inc(&mce_executing);
767
768 if (order == 1) {
769 /* CHECKME: Can this race with a parallel hotplug? */
770 int cpus = num_online_cpus();
771
772 /*
773 * Monarch: Wait for everyone to go through their scanning
774 * loops.
775 */
776 while (atomic_read(&mce_executing) <= cpus) {
777 if (mce_timed_out(&timeout))
778 goto reset;
779 ndelay(SPINUNIT);
780 }
781
782 mce_reign();
783 barrier();
784 ret = 0;
785 } else {
786 /*
787 * Subject: Wait for Monarch to finish.
788 */
789 while (atomic_read(&mce_executing) != 0) {
790 if (mce_timed_out(&timeout))
791 goto reset;
792 ndelay(SPINUNIT);
793 }
794
795 /*
796 * Don't reset anything. That's done by the Monarch.
797 */
798 return 0;
799 }
800
801 /*
802 * Reset all global state.
803 */
804reset:
805 atomic_set(&global_nwo, 0);
806 atomic_set(&mce_callin, 0);
807 barrier();
808
809 /*
810 * Let others run again.
811 */
812 atomic_set(&mce_executing, 0);
813 return ret;
814}
815
816/*
817 * Check if the address reported by the CPU is in a format we can parse.
818 * It would be possible to add code for most other cases, but all would
819 * be somewhat complicated (e.g. segment offset would require an instruction
820 * parser). So only support physical addresses upto page granuality for now.
821 */
822static int mce_usable_address(struct mce *m)
823{
824 if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
825 return 0;
826 if ((m->misc & 0x3f) > PAGE_SHIFT)
827 return 0;
828 if (((m->misc >> 6) & 7) != MCM_ADDR_PHYS)
829 return 0;
830 return 1;
831}
832
833static void mce_clear_state(unsigned long *toclear)
834{
835 int i;
836
837 for (i = 0; i < banks; i++) {
838 if (test_bit(i, toclear))
839 mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
840 }
841}
842
843/*
844 * The actual machine check handler. This only handles real
845 * exceptions when something got corrupted coming in through int 18.
846 *
847 * This is executed in NMI context not subject to normal locking rules. This
848 * implies that most kernel services cannot be safely used. Don't even
849 * think about putting a printk in there!
850 *
851 * On Intel systems this is entered on all CPUs in parallel through
852 * MCE broadcast. However some CPUs might be broken beyond repair,
853 * so be always careful when synchronizing with others.
854 */
/*
 * @regs:       register state at the time of the exception
 * @error_code: passed on to the die notifier chain
 *
 * Outline: grade all banks for a possible panic, synchronize with the other
 * CPUs (mce_start()), scan and log the banks, clear the handled ones,
 * finish the synchronization (mce_end()), then panic, send SIGBUS, or
 * merely notify user space depending on the outcome and on 'tolerant'.
 */
855void do_machine_check(struct pt_regs *regs, long error_code)
856{
857 struct mce m, *final;
858 int i;
859 int worst = 0;
860 int severity;
861 /*
862 * Establish sequential order between the CPUs entering the machine
863 * check handler.
864 */
865 int order;
866
867 /*
868 * If no_way_out gets set, there is no safe way to recover from this
869 * MCE. If tolerant is cranked up, we'll try anyway.
870 */
871 int no_way_out = 0;
872 /*
873 * If kill_it gets set, there might be a way to recover from this
874 * error.
875 */
876 int kill_it = 0;
877 DECLARE_BITMAP(toclear, MAX_NR_BANKS);
878 char *msg = "Unknown";
879
880 atomic_inc(&mce_entry);
881
882 __get_cpu_var(mce_exception_count)++;
883
884 if (notify_die(DIE_NMI, "machine check", regs, error_code,
885 18, SIGKILL) == NOTIFY_STOP)
886 goto out;
887 if (!banks)
888 goto out;
889
/* Callin number of this CPU; mce_start()/mce_end() treat 1 as the Monarch. */
890 order = atomic_add_return(1, &mce_callin);
891 mce_setup(&m);
892
893 m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
894 no_way_out = mce_no_way_out(&m, &msg);
895
/* Publish a first record for mce_reign(); replaced below by the worst one. */
896 final = &__get_cpu_var(mces_seen);
897 *final = m;
898
899 barrier();
900
901 /*
902 * When no restart IP must always kill or panic.
903 */
904 if (!(m.mcgstatus & MCG_STATUS_RIPV))
905 kill_it = 1;
906
907 /*
908 * Go through all the banks in exclusion of the other CPUs.
909 * This way we don't report duplicated events on shared banks
910 * because the first one to see it will clear it.
911 */
912 no_way_out = mce_start(no_way_out, &order);
913 for (i = 0; i < banks; i++) {
914 __clear_bit(i, toclear);
915 if (!bank[i])
916 continue;
917
918 m.misc = 0;
919 m.addr = 0;
920 m.bank = i;
921
922 m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
923 if ((m.status & MCI_STATUS_VAL) == 0)
924 continue;
925
926 /*
927 * Non uncorrected or non signaled errors are handled by
928 * machine_check_poll. Leave them alone, unless this panics.
929 */
930 if (!(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
931 !no_way_out)
932 continue;
933
934 /*
935 * Set taint even when machine check was not enabled.
936 */
937 add_taint(TAINT_MACHINE_CHECK);
938
939 severity = mce_severity(&m, tolerant, NULL);
940
941 /*
942 * When machine check was for corrected handler don't touch,
943 * unless we're panicing.
944 */
945 if (severity == MCE_KEEP_SEVERITY && !no_way_out)
946 continue;
947 __set_bit(i, toclear);
948 if (severity == MCE_NO_SEVERITY) {
949 /*
950 * Machine check event was not enabled. Clear, but
951 * ignore.
952 */
953 continue;
954 }
955
956 /*
957 * Kill on action required.
958 */
959 if (severity == MCE_AR_SEVERITY)
960 kill_it = 1;
961
962 if (m.status & MCI_STATUS_MISCV)
963 m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4);
964 if (m.status & MCI_STATUS_ADDRV)
965 m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4);
966
967 /*
968 * Action optional error. Queue address for later processing.
969 * When the ring overflows we just ignore the AO error.
970 * RED-PEN add some logging mechanism when
971 * usable_address or mce_add_ring fails.
972 * RED-PEN don't ignore overflow for tolerant == 0
973 */
974 if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
975 mce_ring_add(m.addr >> PAGE_SHIFT);
976
977 mce_get_rip(&m, regs);
978 mce_log(&m);
979
980 if (severity > worst) {
981 *final = m;
982 worst = severity;
983 }
984 }
985
/* With no_way_out set the bank state is left untouched for the panic path. */
986 if (!no_way_out)
987 mce_clear_state(toclear);
988
989 /*
990 * Do most of the synchronization with other CPUs.
991 * When there's any problem use only local no_way_out state.
992 */
993 if (mce_end(order) < 0)
994 no_way_out = worst >= MCE_PANIC_SEVERITY;
995
996 /*
997 * If we have decided that we just CAN'T continue, and the user
998 * has not set tolerant to an insane level, give up and die.
999 *
1000 * This is mainly used in the case when the system doesn't
1001 * support MCE broadcasting or it has been disabled.
1002 */
1003 if (no_way_out && tolerant < 3)
1004 mce_panic("Fatal machine check on current CPU", final, msg);
1005
1006 /*
1007 * If the error seems to be unrecoverable, something should be
1008 * done. Try to kill as little as possible. If we can kill just
1009 * one task, do that. If the user has set the tolerance very
1010 * high, don't try to do anything at all.
1011 */
1012
1013 if (kill_it && tolerant < 3)
1014 force_sig(SIGBUS, current);
1015
1016 /* notify userspace ASAP */
1017 set_thread_flag(TIF_MCE_NOTIFY);
1018
1019 if (worst > 0)
1020 mce_report_event(regs);
1021 mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
1022out:
1023 atomic_dec(&mce_entry);
1024 sync_core();
1025}
1026EXPORT_SYMBOL_GPL(do_machine_check);
1027
1028/*
 * Dummy to break dependency. Actual code is in mm/memory-failure.c.
 * Being weak, this fallback is only used when no other definition is
 * linked in; it merely logs that the failure at @pfn was ignored.
 */
1029void __attribute__((weak)) memory_failure(unsigned long pfn, int vector)
1030{
1031 printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn);
1032}
1033
1034/*
1035 * Called after mce notification in process context. This code
1036 * is allowed to sleep. Call the high level VM handler to process
1037 * any corrupted pages.
1038 * Assume that the work queue code only calls this one at a time
1039 * per CPU.
1040 * Note we don't disable preemption, so this code might run on the wrong
1041 * CPU. In this case the event is picked up by the scheduled work queue.
1042 * This is merely a fast path to expedite processing in some common
1043 * cases.
1044 */
1045void mce_notify_process(void)
1046{
1047 unsigned long pfn;
1048 mce_notify_irq();
1049 while (mce_ring_get(&pfn))
1050 memory_failure(pfn, MCE_VECTOR);
1051}
1052
/* Work queue entry point: @dummy is unused, all work is in mce_notify_process(). */
1053static void mce_process_work(struct work_struct *dummy)
1054{
1055 mce_notify_process();
1056}
1057
1058#ifdef CONFIG_X86_MCE_INTEL
1059/**
1060 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
1061 * @status: Event status information
1062 *
1063 * This function should be called by the thermal interrupt after the
1064 * event has been processed and the decision was made to log the event
1065 * further.
1066 *
1067 * The record is set up by mce_setup() on the calling CPU and logged
 * with the bank set to MCE_THERMAL_BANK.
 *
1068 * The status parameter will be saved to the 'status' field of 'struct mce'
1069 * and historically has been the register value of the
1070 * MSR_IA32_THERMAL_STATUS (Intel) msr.
1071 */
1072void mce_log_therm_throt_event(__u64 status)
1073{
1074 struct mce m;
1075
1076 mce_setup(&m);
1077 m.bank = MCE_THERMAL_BANK;
1078 m.status = status;
1079 mce_log(&m);
1080}
1081#endif /* CONFIG_X86_MCE_INTEL */
1082
1083/*
1084 * Periodic polling timer for "silent" machine check errors. If the
1085 * poller finds an MCE, poll 2x faster. When the poller finds no more
1086 * errors, poll 2x slower (up to check_interval seconds).
1087 */
1088static int check_interval = 5 * 60; /* 5 minutes */
1089
1090static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
1091static DEFINE_PER_CPU(struct timer_list, mce_timer);
1092
1093static void mcheck_timer(unsigned long data)
1094{
1095 struct timer_list *t = &per_cpu(mce_timer, data);
1096 int *n;
1097
1098 WARN_ON(smp_processor_id() != data);
1099
1100 if (mce_available(&current_cpu_data)) {
1101 machine_check_poll(MCP_TIMESTAMP,
1102 &__get_cpu_var(mce_poll_banks));
1103 }
1104
1105 /*
1106 * Alert userspace if needed. If we logged an MCE, reduce the
1107 * polling interval, otherwise increase the polling interval.
1108 */
1109 n = &__get_cpu_var(next_interval);
1110 if (mce_notify_irq())
1111 *n = max(*n/2, HZ/100);
1112 else
1113 *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
1114
1115 t->expires = jiffies + *n;
1116 add_timer(t);
1117}
1118
/*
 * Work function: start the user mode helper named in 'trigger' without
 * waiting for it (UMH_NO_WAIT). @work is unused.
 */
1119static void mce_do_trigger(struct work_struct *work)
1120{
1121 call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
1122}
1123
/* Scheduled by mce_notify_irq() when a trigger is configured. */
1124static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
1125
1126/*
1127 * Notify the user(s) about new machine check events.
1128 * Can be called from interrupt context, but not from machine check/NMI
1129 * context.
1130 */
1131int mce_notify_irq(void)
1132{
1133 /* Not more than two messages every minute */
1134 static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
1135
1136 clear_thread_flag(TIF_MCE_NOTIFY);
1137
1138 if (test_and_clear_bit(0, &notify_user)) {
1139 wake_up_interruptible(&mce_wait);
1140
1141 /*
1142 * There is no risk of missing notifications because
1143 * work_pending is always cleared before the function is
1144 * executed.
1145 */
1146 if (trigger[0] && !work_pending(&mce_trigger_work))
1147 schedule_work(&mce_trigger_work);
1148
1149 if (__ratelimit(&ratelimit))
1150 printk(KERN_INFO "Machine check events logged\n");
1151
1152 return 1;
1153 }
1154 return 0;
1155}
1156EXPORT_SYMBOL_GPL(mce_notify_irq);
1157
1158/*
1159 * Initialize Machine Checks for a CPU.
1160 */
1161static int mce_cap_init(void)
1162{
1163 unsigned b;
1164 u64 cap;
1165
1166 rdmsrl(MSR_IA32_MCG_CAP, cap);
1167
1168 b = cap & MCG_BANKCNT_MASK;
1169 printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b);
1170
1171 if (b > MAX_NR_BANKS) {
1172 printk(KERN_WARNING
1173 "MCE: Using only %u machine check banks out of %u\n",
1174 MAX_NR_BANKS, b);
1175 b = MAX_NR_BANKS;
1176 }
1177
1178 /* Don't support asymmetric configurations today */
1179 WARN_ON(banks != 0 && b != banks);
1180 banks = b;
1181 if (!bank) {
1182 bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
1183 if (!bank)
1184 return -ENOMEM;
1185 memset(bank, 0xff, banks * sizeof(u64));
1186 }
1187
1188 /* Use accurate RIP reporting if available. */
1189 if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
1190 rip_msr = MSR_IA32_MCG_EIP;
1191
1192 if (cap & MCG_SER_P)
1193 mce_ser = 1;
1194
1195 return 0;
1196}
1197
1198static void mce_init(void)
1199{
1200 mce_banks_t all_banks;
1201 u64 cap;
1202 int i;
1203
1204 /*
1205 * Log the machine checks left over from the previous reset.
1206 */
1207 bitmap_fill(all_banks, MAX_NR_BANKS);
1208 machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);
1209
1210 set_in_cr4(X86_CR4_MCE);
1211
1212 rdmsrl(MSR_IA32_MCG_CAP, cap);
1213 if (cap & MCG_CTL_P)
1214 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
1215
1216 for (i = 0; i < banks; i++) {
1217 if (skip_bank_init(i))
1218 continue;
1219 wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
1220 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
1221 }
1222}
1223
1224/* Add per CPU specific workarounds here */
1225static void mce_cpu_quirks(struct cpuinfo_x86 *c)
1226{
1227 /* This should be disabled by the BIOS, but isn't always */
1228 if (c->x86_vendor == X86_VENDOR_AMD) {
1229 if (c->x86 == 15 && banks > 4) {
1230 /*
1231 * disable GART TBL walk error reporting, which
1232 * trips off incorrectly with the IOMMU & 3ware
1233 * & Cerberus:
1234 */
1235 clear_bit(10, (unsigned long *)&bank[4]);
1236 }
1237 if (c->x86 <= 17 && mce_bootlog < 0) {
1238 /*
1239 * Lots of broken BIOS around that don't clear them
1240 * by default and leave crap in there. Don't log:
1241 */
1242 mce_bootlog = 0;
1243 }
1244 /*
1245 * Various K7s with broken bank 0 around. Always disable
1246 * by default.
1247 */
1248 if (c->x86 == 6)
1249 bank[0] = 0;
1250 }
1251
1252 if (c->x86_vendor == X86_VENDOR_INTEL) {
1253 /*
1254 * SDM documents that on family 6 bank 0 should not be written
1255 * because it aliases to another special BIOS controlled
1256 * register.
1257 * But it's not aliased anymore on model 0x1a+
1258 * Don't ignore bank 0 completely because there could be a
1259 * valid event later, merely don't write CTL0.
1260 */
1261
1262 if (c->x86 == 6 && c->x86_model < 0x1A)
1263 __set_bit(0, &dont_init_banks);
1264
1265 /*
1266 * All newer Intel systems support MCE broadcasting. Enable
1267 * synchronization with a one second timeout.
1268 */
1269 if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
1270 monarch_timeout < 0)
1271 monarch_timeout = USEC_PER_SEC;
1272 }
1273 if (monarch_timeout < 0)
1274 monarch_timeout = 0;
1275 if (mce_bootlog != 0)
1276 mce_panic_timeout = 30;
1277}
1278
1279static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c)
1280{
1281 if (c->x86 != 5)
1282 return;
1283 switch (c->x86_vendor) {
1284 case X86_VENDOR_INTEL:
1285 if (mce_p5_enabled())
1286 intel_p5_mcheck_init(c);
1287 break;
1288 case X86_VENDOR_CENTAUR:
1289 winchip_mcheck_init(c);
1290 break;
1291 }
1292}
1293
1294static void mce_cpu_features(struct cpuinfo_x86 *c)
1295{
1296 switch (c->x86_vendor) {
1297 case X86_VENDOR_INTEL:
1298 mce_intel_feature_init(c);
1299 break;
1300 case X86_VENDOR_AMD:
1301 mce_amd_feature_init(c);
1302 break;
1303 default:
1304 break;
1305 }
1306}
1307
1308static void mce_init_timer(void)
1309{
1310 struct timer_list *t = &__get_cpu_var(mce_timer);
1311 int *n = &__get_cpu_var(next_interval);
1312
1313 if (mce_ignore_ce)
1314 return;
1315
1316 *n = check_interval * HZ;
1317 if (!*n)
1318 return;
1319 setup_timer(t, mcheck_timer, smp_processor_id());
1320 t->expires = round_jiffies(jiffies + *n);
1321 add_timer(t);
1322}
1323
1324/*
1325 * Called for each booted CPU to set up machine checks.
1326 * Must be called with preempt off:
1327 */
1328void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
1329{
1330 if (mce_disabled)
1331 return;
1332
1333 mce_ancient_init(c);
1334
1335 if (!mce_available(c))
1336 return;
1337
1338 if (mce_cap_init() < 0) {
1339 mce_disabled = 1;
1340 return;
1341 }
1342 mce_cpu_quirks(c);
1343
1344 machine_check_vector = do_machine_check;
1345
1346 mce_init();
1347 mce_cpu_features(c);
1348 mce_init_timer();
1349 INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
1350}
1351
1352/*
1353 * Character device to read and clear the MCE log.
1354 */
1355
1356static DEFINE_SPINLOCK(mce_state_lock);
1357static int open_count; /* #times opened */
1358static int open_exclu; /* already open exclusive? */
1359
1360static int mce_open(struct inode *inode, struct file *file)
1361{
1362 spin_lock(&mce_state_lock);
1363
1364 if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
1365 spin_unlock(&mce_state_lock);
1366
1367 return -EBUSY;
1368 }
1369
1370 if (file->f_flags & O_EXCL)
1371 open_exclu = 1;
1372 open_count++;
1373
1374 spin_unlock(&mce_state_lock);
1375
1376 return nonseekable_open(inode, file);
1377}
1378
1379static int mce_release(struct inode *inode, struct file *file)
1380{
1381 spin_lock(&mce_state_lock);
1382
1383 open_count--;
1384 open_exclu = 0;
1385
1386 spin_unlock(&mce_state_lock);
1387
1388 return 0;
1389}
1390
/*
 * Cross-CPU callback: store the executing CPU's TSC into its own slot
 * of the unsigned long array passed in @data.
 */
static void collect_tscs(void *data)
{
	unsigned long *tsc_by_cpu = data;

	rdtscll(tsc_by_cpu[smp_processor_id()]);
}
1397
1398static DEFINE_MUTEX(mce_read_mutex);
1399
1400static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
1401 loff_t *off)
1402{
1403 char __user *buf = ubuf;
1404 unsigned long *cpu_tsc;
1405 unsigned prev, next;
1406 int i, err;
1407
1408 cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
1409 if (!cpu_tsc)
1410 return -ENOMEM;
1411
1412 mutex_lock(&mce_read_mutex);
1413 next = rcu_dereference(mcelog.next);
1414
1415 /* Only supports full reads right now */
1416 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
1417 mutex_unlock(&mce_read_mutex);
1418 kfree(cpu_tsc);
1419
1420 return -EINVAL;
1421 }
1422
1423 err = 0;
1424 prev = 0;
1425 do {
1426 for (i = prev; i < next; i++) {
1427 unsigned long start = jiffies;
1428
1429 while (!mcelog.entry[i].finished) {
1430 if (time_after_eq(jiffies, start + 2)) {
1431 memset(mcelog.entry + i, 0,
1432 sizeof(struct mce));
1433 goto timeout;
1434 }
1435 cpu_relax();
1436 }
1437 smp_rmb();
1438 err |= copy_to_user(buf, mcelog.entry + i,
1439 sizeof(struct mce));
1440 buf += sizeof(struct mce);
1441timeout:
1442 ;
1443 }
1444
1445 memset(mcelog.entry + prev, 0,
1446 (next - prev) * sizeof(struct mce));
1447 prev = next;
1448 next = cmpxchg(&mcelog.next, prev, 0);
1449 } while (next != prev);
1450
1451 synchronize_sched();
1452
1453 /*
1454 * Collect entries that were still getting written before the
1455 * synchronize.
1456 */
1457 on_each_cpu(collect_tscs, cpu_tsc, 1);
1458
1459 for (i = next; i < MCE_LOG_LEN; i++) {
1460 if (mcelog.entry[i].finished &&
1461 mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
1462 err |= copy_to_user(buf, mcelog.entry+i,
1463 sizeof(struct mce));
1464 smp_rmb();
1465 buf += sizeof(struct mce);
1466 memset(&mcelog.entry[i], 0, sizeof(struct mce));
1467 }
1468 }
1469 mutex_unlock(&mce_read_mutex);
1470 kfree(cpu_tsc);
1471
1472 return err ? -EFAULT : buf - ubuf;
1473}
1474
1475static unsigned int mce_poll(struct file *file, poll_table *wait)
1476{
1477 poll_wait(file, &mce_wait, wait);
1478 if (rcu_dereference(mcelog.next))
1479 return POLLIN | POLLRDNORM;
1480 return 0;
1481}
1482
1483static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
1484{
1485 int __user *p = (int __user *)arg;
1486
1487 if (!capable(CAP_SYS_ADMIN))
1488 return -EPERM;
1489
1490 switch (cmd) {
1491 case MCE_GET_RECORD_LEN:
1492 return put_user(sizeof(struct mce), p);
1493 case MCE_GET_LOG_LEN:
1494 return put_user(MCE_LOG_LEN, p);
1495 case MCE_GETCLEAR_FLAGS: {
1496 unsigned flags;
1497
1498 do {
1499 flags = mcelog.flags;
1500 } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
1501
1502 return put_user(flags, p);
1503 }
1504 default:
1505 return -ENOTTY;
1506 }
1507}
1508
1509/* Modified in mce-inject.c, so not static or const */
1510struct file_operations mce_chrdev_ops = {
1511 .open = mce_open,
1512 .release = mce_release,
1513 .read = mce_read,
1514 .poll = mce_poll,
1515 .unlocked_ioctl = mce_ioctl,
1516};
1517EXPORT_SYMBOL_GPL(mce_chrdev_ops);
1518
1519static struct miscdevice mce_log_device = {
1520 MISC_MCELOG_MINOR,
1521 "mcelog",
1522 &mce_chrdev_ops,
1523};
1524
1525/*
1526 * mce=off Disables machine check
1527 * mce=no_cmci Disables CMCI
1528 * mce=dont_log_ce Clears corrected events silently, no log created for CEs.
1529 * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
1530 * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
1531 * monarchtimeout is how long to wait for other CPUs on machine
1532 * check, or 0 to not wait
1533 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
1534 * mce=nobootlog Don't log MCEs from before booting.
1535 */
1536static int __init mcheck_enable(char *str)
1537{
1538 if (*str == 0)
1539 enable_p5_mce();
1540 if (*str == '=')
1541 str++;
1542 if (!strcmp(str, "off"))
1543 mce_disabled = 1;
1544 else if (!strcmp(str, "no_cmci"))
1545 mce_cmci_disabled = 1;
1546 else if (!strcmp(str, "dont_log_ce"))
1547 mce_dont_log_ce = 1;
1548 else if (!strcmp(str, "ignore_ce"))
1549 mce_ignore_ce = 1;
1550 else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
1551 mce_bootlog = (str[0] == 'b');
1552 else if (isdigit(str[0])) {
1553 get_option(&str, &tolerant);
1554 if (*str == ',') {
1555 ++str;
1556 get_option(&str, &monarch_timeout);
1557 }
1558 } else {
1559 printk(KERN_INFO "mce argument %s ignored. Please use /sys\n",
1560 str);
1561 return 0;
1562 }
1563 return 1;
1564}
1565__setup("mce", mcheck_enable);
1566
1567/*
1568 * Sysfs support
1569 */
1570
1571/*
1572 * Disable machine checks on suspend and shutdown. We can't really handle
1573 * them later.
1574 */
1575static int mce_disable(void)
1576{
1577 int i;
1578
1579 for (i = 0; i < banks; i++) {
1580 if (!skip_bank_init(i))
1581 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
1582 }
1583 return 0;
1584}
1585
1586static int mce_suspend(struct sys_device *dev, pm_message_t state)
1587{
1588 return mce_disable();
1589}
1590
/* sysdev class shutdown hook: turn the banks off via mce_disable(). */
static int mce_shutdown(struct sys_device *dev)
{
	int ret = mce_disable();

	return ret;
}
1595
1596/*
1597 * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
1598 * Only one CPU is active at this time, the others get re-added later using
1599 * CPU hotplug:
1600 */
1601static int mce_resume(struct sys_device *dev)
1602{
1603 mce_init();
1604 mce_cpu_features(&current_cpu_data);
1605
1606 return 0;
1607}
1608
1609static void mce_cpu_restart(void *data)
1610{
1611 del_timer_sync(&__get_cpu_var(mce_timer));
1612 if (mce_available(&current_cpu_data))
1613 mce_init();
1614 mce_init_timer();
1615}
1616
1617/* Reinit MCEs after user configuration changes */
1618static void mce_restart(void)
1619{
1620 on_each_cpu(mce_cpu_restart, NULL, 1);
1621}
1622
1623static struct sysdev_class mce_sysclass = {
1624 .suspend = mce_suspend,
1625 .shutdown = mce_shutdown,
1626 .resume = mce_resume,
1627 .name = "machinecheck",
1628};
1629
1630DEFINE_PER_CPU(struct sys_device, mce_dev);
1631
1632__cpuinitdata
1633void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
1634
1635static struct sysdev_attribute *bank_attrs;
1636
1637static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
1638 char *buf)
1639{
1640 u64 b = bank[attr - bank_attrs];
1641
1642 return sprintf(buf, "%llx\n", b);
1643}
1644
1645static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
1646 const char *buf, size_t size)
1647{
1648 u64 new;
1649
1650 if (strict_strtoull(buf, 0, &new) < 0)
1651 return -EINVAL;
1652
1653 bank[attr - bank_attrs] = new;
1654 mce_restart();
1655
1656 return size;
1657}
1658
1659static ssize_t
1660show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf)
1661{
1662 strcpy(buf, trigger);
1663 strcat(buf, "\n");
1664 return strlen(trigger) + 1;
1665}
1666
1667static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
1668 const char *buf, size_t siz)
1669{
1670 char *p;
1671 int len;
1672
1673 strncpy(trigger, buf, sizeof(trigger));
1674 trigger[sizeof(trigger)-1] = 0;
1675 len = strlen(trigger);
1676 p = strchr(trigger, '\n');
1677
1678 if (*p)
1679 *p = 0;
1680
1681 return len;
1682}
1683
1684static ssize_t store_int_with_restart(struct sys_device *s,
1685 struct sysdev_attribute *attr,
1686 const char *buf, size_t size)
1687{
1688 ssize_t ret = sysdev_store_int(s, attr, buf, size);
1689 mce_restart();
1690 return ret;
1691}
1692
1693static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
1694static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
1695static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout);
1696
1697static struct sysdev_ext_attribute attr_check_interval = {
1698 _SYSDEV_ATTR(check_interval, 0644, sysdev_show_int,
1699 store_int_with_restart),
1700 &check_interval
1701};
1702
1703static struct sysdev_attribute *mce_attrs[] = {
1704 &attr_tolerant.attr, &attr_check_interval.attr, &attr_trigger,
1705 &attr_monarch_timeout.attr,
1706 NULL
1707};
1708
1709static cpumask_var_t mce_dev_initialized;
1710
1711/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */
1712static __cpuinit int mce_create_device(unsigned int cpu)
1713{
1714 int err;
1715 int i;
1716
1717 if (!mce_available(&boot_cpu_data))
1718 return -EIO;
1719
1720 memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject));
1721 per_cpu(mce_dev, cpu).id = cpu;
1722 per_cpu(mce_dev, cpu).cls = &mce_sysclass;
1723
1724 err = sysdev_register(&per_cpu(mce_dev, cpu));
1725 if (err)
1726 return err;
1727
1728 for (i = 0; mce_attrs[i]; i++) {
1729 err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1730 if (err)
1731 goto error;
1732 }
1733 for (i = 0; i < banks; i++) {
1734 err = sysdev_create_file(&per_cpu(mce_dev, cpu),
1735 &bank_attrs[i]);
1736 if (err)
1737 goto error2;
1738 }
1739 cpumask_set_cpu(cpu, mce_dev_initialized);
1740
1741 return 0;
1742error2:
1743 while (--i >= 0)
1744 sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]);
1745error:
1746 while (--i >= 0)
1747 sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1748
1749 sysdev_unregister(&per_cpu(mce_dev, cpu));
1750
1751 return err;
1752}
1753
1754static __cpuinit void mce_remove_device(unsigned int cpu)
1755{
1756 int i;
1757
1758 if (!cpumask_test_cpu(cpu, mce_dev_initialized))
1759 return;
1760
1761 for (i = 0; mce_attrs[i]; i++)
1762 sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1763
1764 for (i = 0; i < banks; i++)
1765 sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]);
1766
1767 sysdev_unregister(&per_cpu(mce_dev, cpu));
1768 cpumask_clear_cpu(cpu, mce_dev_initialized);
1769}
1770
1771/* Make sure there are no machine checks on offlined CPUs. */
1772static void mce_disable_cpu(void *h)
1773{
1774 unsigned long action = *(unsigned long *)h;
1775 int i;
1776
1777 if (!mce_available(&current_cpu_data))
1778 return;
1779 if (!(action & CPU_TASKS_FROZEN))
1780 cmci_clear();
1781 for (i = 0; i < banks; i++) {
1782 if (!skip_bank_init(i))
1783 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
1784 }
1785}
1786
1787static void mce_reenable_cpu(void *h)
1788{
1789 unsigned long action = *(unsigned long *)h;
1790 int i;
1791
1792 if (!mce_available(&current_cpu_data))
1793 return;
1794
1795 if (!(action & CPU_TASKS_FROZEN))
1796 cmci_reenable();
1797 for (i = 0; i < banks; i++) {
1798 if (!skip_bank_init(i))
1799 wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
1800 }
1801}
1802
1803/* Get notified when a cpu comes on/off. Be hotplug friendly. */
1804static int __cpuinit
1805mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
1806{
1807 unsigned int cpu = (unsigned long)hcpu;
1808 struct timer_list *t = &per_cpu(mce_timer, cpu);
1809
1810 switch (action) {
1811 case CPU_ONLINE:
1812 case CPU_ONLINE_FROZEN:
1813 mce_create_device(cpu);
1814 if (threshold_cpu_callback)
1815 threshold_cpu_callback(action, cpu);
1816 break;
1817 case CPU_DEAD:
1818 case CPU_DEAD_FROZEN:
1819 if (threshold_cpu_callback)
1820 threshold_cpu_callback(action, cpu);
1821 mce_remove_device(cpu);
1822 break;
1823 case CPU_DOWN_PREPARE:
1824 case CPU_DOWN_PREPARE_FROZEN:
1825 del_timer_sync(t);
1826 smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
1827 break;
1828 case CPU_DOWN_FAILED:
1829 case CPU_DOWN_FAILED_FROZEN:
1830 t->expires = round_jiffies(jiffies +
1831 __get_cpu_var(next_interval));
1832 add_timer_on(t, cpu);
1833 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
1834 break;
1835 case CPU_POST_DEAD:
1836 /* intentionally ignoring frozen here */
1837 cmci_rediscover(cpu);
1838 break;
1839 }
1840 return NOTIFY_OK;
1841}
1842
1843static struct notifier_block mce_cpu_notifier __cpuinitdata = {
1844 .notifier_call = mce_cpu_callback,
1845};
1846
1847static __init int mce_init_banks(void)
1848{
1849 int i;
1850
1851 bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
1852 GFP_KERNEL);
1853 if (!bank_attrs)
1854 return -ENOMEM;
1855
1856 for (i = 0; i < banks; i++) {
1857 struct sysdev_attribute *a = &bank_attrs[i];
1858
1859 a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i);
1860 if (!a->attr.name)
1861 goto nomem;
1862
1863 a->attr.mode = 0644;
1864 a->show = show_bank;
1865 a->store = set_bank;
1866 }
1867 return 0;
1868
1869nomem:
1870 while (--i >= 0)
1871 kfree(bank_attrs[i].attr.name);
1872 kfree(bank_attrs);
1873 bank_attrs = NULL;
1874
1875 return -ENOMEM;
1876}
1877
1878static __init int mce_init_device(void)
1879{
1880 int err;
1881 int i = 0;
1882
1883 if (!mce_available(&boot_cpu_data))
1884 return -EIO;
1885
1886 alloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL);
1887
1888 err = mce_init_banks();
1889 if (err)
1890 return err;
1891
1892 err = sysdev_class_register(&mce_sysclass);
1893 if (err)
1894 return err;
1895
1896 for_each_online_cpu(i) {
1897 err = mce_create_device(i);
1898 if (err)
1899 return err;
1900 }
1901
1902 register_hotcpu_notifier(&mce_cpu_notifier);
1903 misc_register(&mce_log_device);
1904
1905 return err;
1906}
1907
1908device_initcall(mce_init_device);
1909
1910#else /* CONFIG_X86_OLD_MCE: */
1911
1912int nr_mce_banks;
1913EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */
1914
1915/* This has to be run for each processor */
1916void mcheck_init(struct cpuinfo_x86 *c)
1917{
1918 if (mce_disabled == 1)
1919 return;
1920
1921 switch (c->x86_vendor) {
1922 case X86_VENDOR_AMD:
1923 amd_mcheck_init(c);
1924 break;
1925
1926 case X86_VENDOR_INTEL:
1927 if (c->x86 == 5)
1928 intel_p5_mcheck_init(c);
1929 if (c->x86 == 6)
1930 intel_p6_mcheck_init(c);
1931 if (c->x86 == 15)
1932 intel_p4_mcheck_init(c);
1933 break;
1934
1935 case X86_VENDOR_CENTAUR:
1936 if (c->x86 == 5)
1937 winchip_mcheck_init(c);
1938 break;
1939
1940 default:
1941 break;
1942 }
1943 printk(KERN_INFO "mce: CPU supports %d MCE banks\n", nr_mce_banks);
1944}
1945
1946static int __init mcheck_enable(char *str)
1947{
1948 mce_disabled = -1;
1949 return 1;
1950}
1951
1952__setup("mce", mcheck_enable);
1953
1954#endif /* CONFIG_X86_OLD_MCE */
1955
1956/*
1957 * Old style boot options parsing. Only for compatibility.
1958 */
1959static int __init mcheck_disable(char *str)
1960{
1961 mce_disabled = 1;
1962 return 1;
1963}
1964__setup("nomce", mcheck_disable);
diff --git a/arch/x86/kernel/cpu/mcheck/mce.h b/arch/x86/kernel/cpu/mcheck/mce.h
index ae9f628838f1..84a552b458c8 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.h
+++ b/arch/x86/kernel/cpu/mcheck/mce.h
@@ -1,14 +1,38 @@
1#include <linux/init.h> 1#include <linux/init.h>
2#include <asm/mce.h> 2#include <asm/mce.h>
3 3
4#ifdef CONFIG_X86_OLD_MCE
4void amd_mcheck_init(struct cpuinfo_x86 *c); 5void amd_mcheck_init(struct cpuinfo_x86 *c);
5void intel_p4_mcheck_init(struct cpuinfo_x86 *c); 6void intel_p4_mcheck_init(struct cpuinfo_x86 *c);
6void intel_p5_mcheck_init(struct cpuinfo_x86 *c);
7void intel_p6_mcheck_init(struct cpuinfo_x86 *c); 7void intel_p6_mcheck_init(struct cpuinfo_x86 *c);
8#endif
9
10#ifdef CONFIG_X86_ANCIENT_MCE
11void intel_p5_mcheck_init(struct cpuinfo_x86 *c);
8void winchip_mcheck_init(struct cpuinfo_x86 *c); 12void winchip_mcheck_init(struct cpuinfo_x86 *c);
13extern int mce_p5_enable;
14static inline int mce_p5_enabled(void) { return mce_p5_enable; }
15static inline void enable_p5_mce(void) { mce_p5_enable = 1; }
16#else
17static inline void intel_p5_mcheck_init(struct cpuinfo_x86 *c) {}
18static inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {}
19static inline int mce_p5_enabled(void) { return 0; }
20static inline void enable_p5_mce(void) { }
21#endif
9 22
10/* Call the installed machine check handler for this CPU setup. */ 23/* Call the installed machine check handler for this CPU setup. */
11extern void (*machine_check_vector)(struct pt_regs *, long error_code); 24extern void (*machine_check_vector)(struct pt_regs *, long error_code);
12 25
26#ifdef CONFIG_X86_OLD_MCE
27
13extern int nr_mce_banks; 28extern int nr_mce_banks;
14 29
30void intel_set_thermal_handler(void);
31
32#else
33
34static inline void intel_set_thermal_handler(void) { }
35
36#endif
37
38void intel_init_thermal(struct cpuinfo_x86 *c);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_32.c b/arch/x86/kernel/cpu/mcheck/mce_32.c
deleted file mode 100644
index 3552119b091d..000000000000
--- a/arch/x86/kernel/cpu/mcheck/mce_32.c
+++ /dev/null
@@ -1,76 +0,0 @@
1/*
2 * mce.c - x86 Machine Check Exception Reporting
3 * (c) 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>, Dave Jones <davej@redhat.com>
4 */
5
6#include <linux/init.h>
7#include <linux/types.h>
8#include <linux/kernel.h>
9#include <linux/module.h>
10#include <linux/smp.h>
11#include <linux/thread_info.h>
12
13#include <asm/processor.h>
14#include <asm/system.h>
15#include <asm/mce.h>
16
17#include "mce.h"
18
19int mce_disabled;
20int nr_mce_banks;
21
22EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */
23
24/* Handle unconfigured int18 (should never happen) */
25static void unexpected_machine_check(struct pt_regs *regs, long error_code)
26{
27 printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", smp_processor_id());
28}
29
30/* Call the installed machine check handler for this CPU setup. */
31void (*machine_check_vector)(struct pt_regs *, long error_code) = unexpected_machine_check;
32
33/* This has to be run for each processor */
34void mcheck_init(struct cpuinfo_x86 *c)
35{
36 if (mce_disabled == 1)
37 return;
38
39 switch (c->x86_vendor) {
40 case X86_VENDOR_AMD:
41 amd_mcheck_init(c);
42 break;
43
44 case X86_VENDOR_INTEL:
45 if (c->x86 == 5)
46 intel_p5_mcheck_init(c);
47 if (c->x86 == 6)
48 intel_p6_mcheck_init(c);
49 if (c->x86 == 15)
50 intel_p4_mcheck_init(c);
51 break;
52
53 case X86_VENDOR_CENTAUR:
54 if (c->x86 == 5)
55 winchip_mcheck_init(c);
56 break;
57
58 default:
59 break;
60 }
61}
62
63static int __init mcheck_disable(char *str)
64{
65 mce_disabled = 1;
66 return 1;
67}
68
69static int __init mcheck_enable(char *str)
70{
71 mce_disabled = -1;
72 return 1;
73}
74
75__setup("nomce", mcheck_disable);
76__setup("mce", mcheck_enable);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
deleted file mode 100644
index 6fb0b359d2a5..000000000000
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ /dev/null
@@ -1,1187 +0,0 @@
1/*
2 * Machine check handler.
3 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
4 * Rest from unknown author(s).
5 * 2004 Andi Kleen. Rewrote most of it.
6 * Copyright 2008 Intel Corporation
7 * Author: Andi Kleen
8 */
9
10#include <linux/init.h>
11#include <linux/types.h>
12#include <linux/kernel.h>
13#include <linux/sched.h>
14#include <linux/smp_lock.h>
15#include <linux/string.h>
16#include <linux/rcupdate.h>
17#include <linux/kallsyms.h>
18#include <linux/sysdev.h>
19#include <linux/miscdevice.h>
20#include <linux/fs.h>
21#include <linux/capability.h>
22#include <linux/cpu.h>
23#include <linux/percpu.h>
24#include <linux/poll.h>
25#include <linux/thread_info.h>
26#include <linux/ctype.h>
27#include <linux/kmod.h>
28#include <linux/kdebug.h>
29#include <linux/kobject.h>
30#include <linux/sysfs.h>
31#include <linux/ratelimit.h>
32#include <asm/processor.h>
33#include <asm/msr.h>
34#include <asm/mce.h>
35#include <asm/uaccess.h>
36#include <asm/smp.h>
37#include <asm/idle.h>
38
39#define MISC_MCELOG_MINOR 227
40
41atomic_t mce_entry;
42
43static int mce_dont_init;
44
45/*
46 * Tolerant levels:
47 * 0: always panic on uncorrected errors, log corrected errors
48 * 1: panic or SIGBUS on uncorrected errors, log corrected errors
49 * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors
50 * 3: never panic or SIGBUS, log all errors (for testing only)
51 */
52static int tolerant = 1;
53static int banks;
54static u64 *bank;
55static unsigned long notify_user;
56static int rip_msr;
57static int mce_bootlog = -1;
58static atomic_t mce_events;
59
60static char trigger[128];
61static char *trigger_argv[2] = { trigger, NULL };
62
63static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
64
65/* MCA banks polled by the period polling timer for corrected events */
66DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
67 [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
68};
69
70/* Do initial initialization of a struct mce */
71void mce_setup(struct mce *m)
72{
73 memset(m, 0, sizeof(struct mce));
74 m->cpu = smp_processor_id();
75 rdtscll(m->tsc);
76}
77
78/*
79 * Lockless MCE logging infrastructure.
80 * This avoids deadlocks on printk locks without having to break locks. Also
81 * separate MCEs from kernel messages to avoid bogus bug reports.
82 */
83
84static struct mce_log mcelog = {
85 MCE_LOG_SIGNATURE,
86 MCE_LOG_LEN,
87};
88
89void mce_log(struct mce *mce)
90{
91 unsigned next, entry;
92 atomic_inc(&mce_events);
93 mce->finished = 0;
94 wmb();
95 for (;;) {
96 entry = rcu_dereference(mcelog.next);
97 for (;;) {
98 /* When the buffer fills up discard new entries. Assume
99 that the earlier errors are the more interesting. */
100 if (entry >= MCE_LOG_LEN) {
101 set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags);
102 return;
103 }
104 /* Old left over entry. Skip. */
105 if (mcelog.entry[entry].finished) {
106 entry++;
107 continue;
108 }
109 break;
110 }
111 smp_rmb();
112 next = entry + 1;
113 if (cmpxchg(&mcelog.next, entry, next) == entry)
114 break;
115 }
116 memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
117 wmb();
118 mcelog.entry[entry].finished = 1;
119 wmb();
120
121 set_bit(0, &notify_user);
122}
123
124static void print_mce(struct mce *m)
125{
126 printk(KERN_EMERG "\n"
127 KERN_EMERG "HARDWARE ERROR\n"
128 KERN_EMERG
129 "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
130 m->cpu, m->mcgstatus, m->bank, m->status);
131 if (m->ip) {
132 printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
133 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
134 m->cs, m->ip);
135 if (m->cs == __KERNEL_CS)
136 print_symbol("{%s}", m->ip);
137 printk("\n");
138 }
139 printk(KERN_EMERG "TSC %llx ", m->tsc);
140 if (m->addr)
141 printk("ADDR %llx ", m->addr);
142 if (m->misc)
143 printk("MISC %llx ", m->misc);
144 printk("\n");
145 printk(KERN_EMERG "This is not a software problem!\n");
146 printk(KERN_EMERG "Run through mcelog --ascii to decode "
147 "and contact your hardware vendor\n");
148}
149
150static void mce_panic(char *msg, struct mce *backup, unsigned long start)
151{
152 int i;
153
154 oops_begin();
155 for (i = 0; i < MCE_LOG_LEN; i++) {
156 unsigned long tsc = mcelog.entry[i].tsc;
157
158 if (time_before(tsc, start))
159 continue;
160 print_mce(&mcelog.entry[i]);
161 if (backup && mcelog.entry[i].tsc == backup->tsc)
162 backup = NULL;
163 }
164 if (backup)
165 print_mce(backup);
166 panic(msg);
167}
168
169int mce_available(struct cpuinfo_x86 *c)
170{
171 if (mce_dont_init)
172 return 0;
173 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
174}
175
176static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
177{
178 if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
179 m->ip = regs->ip;
180 m->cs = regs->cs;
181 } else {
182 m->ip = 0;
183 m->cs = 0;
184 }
185 if (rip_msr) {
186 /* Assume the RIP in the MSR is exact. Is this true? */
187 m->mcgstatus |= MCG_STATUS_EIPV;
188 rdmsrl(rip_msr, m->ip);
189 m->cs = 0;
190 }
191}
192
193/*
194 * Poll for corrected events or events that happened before reset.
195 * Those are just logged through /dev/mcelog.
196 *
197 * This is executed in standard interrupt context.
198 */
199void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
200{
201 struct mce m;
202 int i;
203
204 mce_setup(&m);
205
206 rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
207 for (i = 0; i < banks; i++) {
208 if (!bank[i] || !test_bit(i, *b))
209 continue;
210
211 m.misc = 0;
212 m.addr = 0;
213 m.bank = i;
214 m.tsc = 0;
215
216 barrier();
217 rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
218 if (!(m.status & MCI_STATUS_VAL))
219 continue;
220
221 /*
222 * Uncorrected events are handled by the exception handler
223 * when it is enabled. But when the exception is disabled log
224 * everything.
225 *
226 * TBD do the same check for MCI_STATUS_EN here?
227 */
228 if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC))
229 continue;
230
231 if (m.status & MCI_STATUS_MISCV)
232 rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
233 if (m.status & MCI_STATUS_ADDRV)
234 rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
235
236 if (!(flags & MCP_TIMESTAMP))
237 m.tsc = 0;
238 /*
239 * Don't get the IP here because it's unlikely to
240 * have anything to do with the actual error location.
241 */
242 if (!(flags & MCP_DONTLOG)) {
243 mce_log(&m);
244 add_taint(TAINT_MACHINE_CHECK);
245 }
246
247 /*
248 * Clear state for this bank.
249 */
250 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
251 }
252
253 /*
254 * Don't clear MCG_STATUS here because it's only defined for
255 * exceptions.
256 */
257}
258
259/*
260 * The actual machine check handler. This only handles real
261 * exceptions when something got corrupted coming in through int 18.
262 *
263 * This is executed in NMI context not subject to normal locking rules. This
264 * implies that most kernel services cannot be safely used. Don't even
265 * think about putting a printk in there!
266 */
267void do_machine_check(struct pt_regs * regs, long error_code)
268{
269 struct mce m, panicm;
270 u64 mcestart = 0;
271 int i;
272 int panicm_found = 0;
273 /*
274 * If no_way_out gets set, there is no safe way to recover from this
275 * MCE. If tolerant is cranked up, we'll try anyway.
276 */
277 int no_way_out = 0;
278 /*
279 * If kill_it gets set, there might be a way to recover from this
280 * error.
281 */
282 int kill_it = 0;
283 DECLARE_BITMAP(toclear, MAX_NR_BANKS);
284
285 atomic_inc(&mce_entry);
286
287 if (notify_die(DIE_NMI, "machine check", regs, error_code,
288 18, SIGKILL) == NOTIFY_STOP)
289 goto out2;
290 if (!banks)
291 goto out2;
292
293 mce_setup(&m);
294
295 rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
296 /* if the restart IP is not valid, we're done for */
297 if (!(m.mcgstatus & MCG_STATUS_RIPV))
298 no_way_out = 1;
299
300 rdtscll(mcestart);
301 barrier();
302
303 for (i = 0; i < banks; i++) {
304 __clear_bit(i, toclear);
305 if (!bank[i])
306 continue;
307
308 m.misc = 0;
309 m.addr = 0;
310 m.bank = i;
311
312 rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
313 if ((m.status & MCI_STATUS_VAL) == 0)
314 continue;
315
316 /*
317 * Non uncorrected errors are handled by machine_check_poll
318 * Leave them alone.
319 */
320 if ((m.status & MCI_STATUS_UC) == 0)
321 continue;
322
323 /*
324 * Set taint even when machine check was not enabled.
325 */
326 add_taint(TAINT_MACHINE_CHECK);
327
328 __set_bit(i, toclear);
329
330 if (m.status & MCI_STATUS_EN) {
331 /* if PCC was set, there's no way out */
332 no_way_out |= !!(m.status & MCI_STATUS_PCC);
333 /*
334 * If this error was uncorrectable and there was
335 * an overflow, we're in trouble. If no overflow,
336 * we might get away with just killing a task.
337 */
338 if (m.status & MCI_STATUS_UC) {
339 if (tolerant < 1 || m.status & MCI_STATUS_OVER)
340 no_way_out = 1;
341 kill_it = 1;
342 }
343 } else {
344 /*
345 * Machine check event was not enabled. Clear, but
346 * ignore.
347 */
348 continue;
349 }
350
351 if (m.status & MCI_STATUS_MISCV)
352 rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
353 if (m.status & MCI_STATUS_ADDRV)
354 rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
355
356 mce_get_rip(&m, regs);
357 mce_log(&m);
358
359 /* Did this bank cause the exception? */
360 /* Assume that the bank with uncorrectable errors did it,
361 and that there is only a single one. */
362 if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
363 panicm = m;
364 panicm_found = 1;
365 }
366 }
367
368 /* If we didn't find an uncorrectable error, pick
369 the last one (shouldn't happen, just being safe). */
370 if (!panicm_found)
371 panicm = m;
372
373 /*
374 * If we have decided that we just CAN'T continue, and the user
375 * has not set tolerant to an insane level, give up and die.
376 */
377 if (no_way_out && tolerant < 3)
378 mce_panic("Machine check", &panicm, mcestart);
379
380 /*
381 * If the error seems to be unrecoverable, something should be
382 * done. Try to kill as little as possible. If we can kill just
383 * one task, do that. If the user has set the tolerance very
384 * high, don't try to do anything at all.
385 */
386 if (kill_it && tolerant < 3) {
387 int user_space = 0;
388
389 /*
390 * If the EIPV bit is set, it means the saved IP is the
391 * instruction which caused the MCE.
392 */
393 if (m.mcgstatus & MCG_STATUS_EIPV)
394 user_space = panicm.ip && (panicm.cs & 3);
395
396 /*
397 * If we know that the error was in user space, send a
398 * SIGBUS. Otherwise, panic if tolerance is low.
399 *
400 * force_sig() takes an awful lot of locks and has a slight
401 * risk of deadlocking.
402 */
403 if (user_space) {
404 force_sig(SIGBUS, current);
405 } else if (panic_on_oops || tolerant < 2) {
406 mce_panic("Uncorrected machine check",
407 &panicm, mcestart);
408 }
409 }
410
411 /* notify userspace ASAP */
412 set_thread_flag(TIF_MCE_NOTIFY);
413
414 /* the last thing we do is clear state */
415 for (i = 0; i < banks; i++) {
416 if (test_bit(i, toclear))
417 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
418 }
419 wrmsrl(MSR_IA32_MCG_STATUS, 0);
420 out2:
421 atomic_dec(&mce_entry);
422}
423
424#ifdef CONFIG_X86_MCE_INTEL
425/***
426 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
427 * @cpu: The CPU on which the event occurred.
428 * @status: Event status information
429 *
430 * This function should be called by the thermal interrupt after the
431 * event has been processed and the decision was made to log the event
432 * further.
433 *
434 * The status parameter will be saved to the 'status' field of 'struct mce'
435 * and historically has been the register value of the
436 * MSR_IA32_THERMAL_STATUS (Intel) msr.
437 */
438void mce_log_therm_throt_event(__u64 status)
439{
440 struct mce m;
441
442 mce_setup(&m);
443 m.bank = MCE_THERMAL_BANK;
444 m.status = status;
445 mce_log(&m);
446}
447#endif /* CONFIG_X86_MCE_INTEL */
448
449/*
450 * Periodic polling timer for "silent" machine check errors. If the
451 * poller finds an MCE, poll 2x faster. When the poller finds no more
452 * errors, poll 2x slower (up to check_interval seconds).
453 */
454
455static int check_interval = 5 * 60; /* 5 minutes */
456static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
457static void mcheck_timer(unsigned long);
458static DEFINE_PER_CPU(struct timer_list, mce_timer);
459
460static void mcheck_timer(unsigned long data)
461{
462 struct timer_list *t = &per_cpu(mce_timer, data);
463 int *n;
464
465 WARN_ON(smp_processor_id() != data);
466
467 if (mce_available(&current_cpu_data))
468 machine_check_poll(MCP_TIMESTAMP,
469 &__get_cpu_var(mce_poll_banks));
470
471 /*
472 * Alert userspace if needed. If we logged an MCE, reduce the
473 * polling interval, otherwise increase the polling interval.
474 */
475 n = &__get_cpu_var(next_interval);
476 if (mce_notify_user()) {
477 *n = max(*n/2, HZ/100);
478 } else {
479 *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
480 }
481
482 t->expires = jiffies + *n;
483 add_timer(t);
484}
485
486static void mce_do_trigger(struct work_struct *work)
487{
488 call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
489}
490
491static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
492
493/*
494 * Notify the user(s) about new machine check events.
495 * Can be called from interrupt context, but not from machine check/NMI
496 * context.
497 */
498int mce_notify_user(void)
499{
500 /* Not more than two messages every minute */
501 static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
502
503 clear_thread_flag(TIF_MCE_NOTIFY);
504 if (test_and_clear_bit(0, &notify_user)) {
505 wake_up_interruptible(&mce_wait);
506
507 /*
508 * There is no risk of missing notifications because
509 * work_pending is always cleared before the function is
510 * executed.
511 */
512 if (trigger[0] && !work_pending(&mce_trigger_work))
513 schedule_work(&mce_trigger_work);
514
515 if (__ratelimit(&ratelimit))
516 printk(KERN_INFO "Machine check events logged\n");
517
518 return 1;
519 }
520 return 0;
521}
522
523/* see if the idle task needs to notify userspace */
524static int
525mce_idle_callback(struct notifier_block *nfb, unsigned long action, void *junk)
526{
527 /* IDLE_END should be safe - interrupts are back on */
528 if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY))
529 mce_notify_user();
530
531 return NOTIFY_OK;
532}
533
534static struct notifier_block mce_idle_notifier = {
535 .notifier_call = mce_idle_callback,
536};
537
538static __init int periodic_mcheck_init(void)
539{
540 idle_notifier_register(&mce_idle_notifier);
541 return 0;
542}
543__initcall(periodic_mcheck_init);
544
545/*
546 * Initialize Machine Checks for a CPU.
547 */
548static int mce_cap_init(void)
549{
550 u64 cap;
551 unsigned b;
552
553 rdmsrl(MSR_IA32_MCG_CAP, cap);
554 b = cap & 0xff;
555 if (b > MAX_NR_BANKS) {
556 printk(KERN_WARNING
557 "MCE: Using only %u machine check banks out of %u\n",
558 MAX_NR_BANKS, b);
559 b = MAX_NR_BANKS;
560 }
561
562 /* Don't support asymmetric configurations today */
563 WARN_ON(banks != 0 && b != banks);
564 banks = b;
565 if (!bank) {
566 bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
567 if (!bank)
568 return -ENOMEM;
569 memset(bank, 0xff, banks * sizeof(u64));
570 }
571
572 /* Use accurate RIP reporting if available. */
573 if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
574 rip_msr = MSR_IA32_MCG_EIP;
575
576 return 0;
577}
578
579static void mce_init(void *dummy)
580{
581 u64 cap;
582 int i;
583 mce_banks_t all_banks;
584
585 /*
586 * Log the machine checks left over from the previous reset.
587 */
588 bitmap_fill(all_banks, MAX_NR_BANKS);
589 machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);
590
591 set_in_cr4(X86_CR4_MCE);
592
593 rdmsrl(MSR_IA32_MCG_CAP, cap);
594 if (cap & MCG_CTL_P)
595 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
596
597 for (i = 0; i < banks; i++) {
598 wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
599 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
600 }
601}
602
603/* Add per CPU specific workarounds here */
604static void mce_cpu_quirks(struct cpuinfo_x86 *c)
605{
606 /* This should be disabled by the BIOS, but isn't always */
607 if (c->x86_vendor == X86_VENDOR_AMD) {
608 if (c->x86 == 15 && banks > 4)
609 /* disable GART TBL walk error reporting, which trips off
610 incorrectly with the IOMMU & 3ware & Cerberus. */
611 clear_bit(10, (unsigned long *)&bank[4]);
612 if(c->x86 <= 17 && mce_bootlog < 0)
613 /* Lots of broken BIOS around that don't clear them
614 by default and leave crap in there. Don't log. */
615 mce_bootlog = 0;
616 }
617
618}
619
620static void mce_cpu_features(struct cpuinfo_x86 *c)
621{
622 switch (c->x86_vendor) {
623 case X86_VENDOR_INTEL:
624 mce_intel_feature_init(c);
625 break;
626 case X86_VENDOR_AMD:
627 mce_amd_feature_init(c);
628 break;
629 default:
630 break;
631 }
632}
633
634static void mce_init_timer(void)
635{
636 struct timer_list *t = &__get_cpu_var(mce_timer);
637 int *n = &__get_cpu_var(next_interval);
638
639 *n = check_interval * HZ;
640 if (!*n)
641 return;
642 setup_timer(t, mcheck_timer, smp_processor_id());
643 t->expires = round_jiffies(jiffies + *n);
644 add_timer(t);
645}
646
647/*
648 * Called for each booted CPU to set up machine checks.
649 * Must be called with preempt off.
650 */
651void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
652{
653 if (!mce_available(c))
654 return;
655
656 if (mce_cap_init() < 0) {
657 mce_dont_init = 1;
658 return;
659 }
660 mce_cpu_quirks(c);
661
662 mce_init(NULL);
663 mce_cpu_features(c);
664 mce_init_timer();
665}
666
667/*
668 * Character device to read and clear the MCE log.
669 */
670
671static DEFINE_SPINLOCK(mce_state_lock);
672static int open_count; /* #times opened */
673static int open_exclu; /* already open exclusive? */
674
675static int mce_open(struct inode *inode, struct file *file)
676{
677 lock_kernel();
678 spin_lock(&mce_state_lock);
679
680 if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
681 spin_unlock(&mce_state_lock);
682 unlock_kernel();
683 return -EBUSY;
684 }
685
686 if (file->f_flags & O_EXCL)
687 open_exclu = 1;
688 open_count++;
689
690 spin_unlock(&mce_state_lock);
691 unlock_kernel();
692
693 return nonseekable_open(inode, file);
694}
695
696static int mce_release(struct inode *inode, struct file *file)
697{
698 spin_lock(&mce_state_lock);
699
700 open_count--;
701 open_exclu = 0;
702
703 spin_unlock(&mce_state_lock);
704
705 return 0;
706}
707
/*
 * Cross-call helper: store the executing CPU's TSC in the slot of the
 * array passed via @data that is indexed by that CPU's number.
 */
static void collect_tscs(void *data)
{
	unsigned long *tsc_by_cpu = data;

	rdtscll(tsc_by_cpu[smp_processor_id()]);
}
714
715static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
716 loff_t *off)
717{
718 unsigned long *cpu_tsc;
719 static DEFINE_MUTEX(mce_read_mutex);
720 unsigned prev, next;
721 char __user *buf = ubuf;
722 int i, err;
723
724 cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
725 if (!cpu_tsc)
726 return -ENOMEM;
727
728 mutex_lock(&mce_read_mutex);
729 next = rcu_dereference(mcelog.next);
730
731 /* Only supports full reads right now */
732 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
733 mutex_unlock(&mce_read_mutex);
734 kfree(cpu_tsc);
735 return -EINVAL;
736 }
737
738 err = 0;
739 prev = 0;
740 do {
741 for (i = prev; i < next; i++) {
742 unsigned long start = jiffies;
743
744 while (!mcelog.entry[i].finished) {
745 if (time_after_eq(jiffies, start + 2)) {
746 memset(mcelog.entry + i, 0,
747 sizeof(struct mce));
748 goto timeout;
749 }
750 cpu_relax();
751 }
752 smp_rmb();
753 err |= copy_to_user(buf, mcelog.entry + i,
754 sizeof(struct mce));
755 buf += sizeof(struct mce);
756timeout:
757 ;
758 }
759
760 memset(mcelog.entry + prev, 0,
761 (next - prev) * sizeof(struct mce));
762 prev = next;
763 next = cmpxchg(&mcelog.next, prev, 0);
764 } while (next != prev);
765
766 synchronize_sched();
767
768 /*
769 * Collect entries that were still getting written before the
770 * synchronize.
771 */
772 on_each_cpu(collect_tscs, cpu_tsc, 1);
773 for (i = next; i < MCE_LOG_LEN; i++) {
774 if (mcelog.entry[i].finished &&
775 mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
776 err |= copy_to_user(buf, mcelog.entry+i,
777 sizeof(struct mce));
778 smp_rmb();
779 buf += sizeof(struct mce);
780 memset(&mcelog.entry[i], 0, sizeof(struct mce));
781 }
782 }
783 mutex_unlock(&mce_read_mutex);
784 kfree(cpu_tsc);
785 return err ? -EFAULT : buf - ubuf;
786}
787
788static unsigned int mce_poll(struct file *file, poll_table *wait)
789{
790 poll_wait(file, &mce_wait, wait);
791 if (rcu_dereference(mcelog.next))
792 return POLLIN | POLLRDNORM;
793 return 0;
794}
795
796static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
797{
798 int __user *p = (int __user *)arg;
799
800 if (!capable(CAP_SYS_ADMIN))
801 return -EPERM;
802 switch (cmd) {
803 case MCE_GET_RECORD_LEN:
804 return put_user(sizeof(struct mce), p);
805 case MCE_GET_LOG_LEN:
806 return put_user(MCE_LOG_LEN, p);
807 case MCE_GETCLEAR_FLAGS: {
808 unsigned flags;
809
810 do {
811 flags = mcelog.flags;
812 } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
813 return put_user(flags, p);
814 }
815 default:
816 return -ENOTTY;
817 }
818}
819
820static const struct file_operations mce_chrdev_ops = {
821 .open = mce_open,
822 .release = mce_release,
823 .read = mce_read,
824 .poll = mce_poll,
825 .unlocked_ioctl = mce_ioctl,
826};
827
828static struct miscdevice mce_log_device = {
829 MISC_MCELOG_MINOR,
830 "mcelog",
831 &mce_chrdev_ops,
832};
833
834/*
835 * Old style boot options parsing. Only for compatibility.
836 */
837static int __init mcheck_disable(char *str)
838{
839 mce_dont_init = 1;
840 return 1;
841}
842
843/* mce=off disables machine check.
844 mce=TOLERANCELEVEL (number, see above)
845 mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
846 mce=nobootlog Don't log MCEs from before booting. */
847static int __init mcheck_enable(char *str)
848{
849 if (!strcmp(str, "off"))
850 mce_dont_init = 1;
851 else if (!strcmp(str, "bootlog") || !strcmp(str,"nobootlog"))
852 mce_bootlog = str[0] == 'b';
853 else if (isdigit(str[0]))
854 get_option(&str, &tolerant);
855 else
856 printk("mce= argument %s ignored. Please use /sys", str);
857 return 1;
858}
859
860__setup("nomce", mcheck_disable);
861__setup("mce=", mcheck_enable);
862
863/*
864 * Sysfs support
865 */
866
867/*
868 * Disable machine checks on suspend and shutdown. We can't really handle
869 * them later.
870 */
871static int mce_disable(void)
872{
873 int i;
874
875 for (i = 0; i < banks; i++)
876 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
877 return 0;
878}
879
880static int mce_suspend(struct sys_device *dev, pm_message_t state)
881{
882 return mce_disable();
883}
884
/* sysdev shutdown hook: turn machine check reporting off.  @dev is unused. */
static int mce_shutdown(struct sys_device *dev)
{
	int ret = mce_disable();

	return ret;
}
889
890/* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
891 Only one CPU is active at this time, the others get readded later using
892 CPU hotplug. */
893static int mce_resume(struct sys_device *dev)
894{
895 mce_init(NULL);
896 mce_cpu_features(&current_cpu_data);
897 return 0;
898}
899
900static void mce_cpu_restart(void *data)
901{
902 del_timer_sync(&__get_cpu_var(mce_timer));
903 if (mce_available(&current_cpu_data))
904 mce_init(NULL);
905 mce_init_timer();
906}
907
908/* Reinit MCEs after user configuration changes */
909static void mce_restart(void)
910{
911 on_each_cpu(mce_cpu_restart, NULL, 1);
912}
913
914static struct sysdev_class mce_sysclass = {
915 .suspend = mce_suspend,
916 .shutdown = mce_shutdown,
917 .resume = mce_resume,
918 .name = "machinecheck",
919};
920
921DEFINE_PER_CPU(struct sys_device, device_mce);
922void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinitdata;
923
/*
 * Why are there no generic functions for this?
 *
 * ACCESSOR(name, var, start) emits a sysdev attribute called 'name'
 * backed by the variable 'var':
 *   show_<name>  prints 'var' as hex followed by a newline;
 *   set_<name>   parses a number (any base accepted by simple_strtoul),
 *                stores it in 'var', evaluates the statement 'start',
 *                and returns the number of characters consumed, or
 *                -EINVAL if no digits were parsed.
 */
#define ACCESSOR(name, var, start)					\
	static ssize_t show_ ## name(struct sys_device *s,		\
				     struct sysdev_attribute *attr,	\
				     char *buf)				\
	{								\
		return sprintf(buf, "%lx\n", (unsigned long)var);	\
	}								\
	static ssize_t set_ ## name(struct sys_device *s,		\
				    struct sysdev_attribute *attr,	\
				    const char *buf, size_t siz)	\
	{								\
		char *end;						\
		unsigned long new = simple_strtoul(buf, &end, 0);	\
									\
		if (end == buf)						\
			return -EINVAL;					\
		var = new;						\
		start;							\
		return end-buf;						\
	}								\
	static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
942
943static struct sysdev_attribute *bank_attrs;
944
945static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
946 char *buf)
947{
948 u64 b = bank[attr - bank_attrs];
949 return sprintf(buf, "%llx\n", b);
950}
951
952static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
953 const char *buf, size_t siz)
954{
955 char *end;
956 u64 new = simple_strtoull(buf, &end, 0);
957 if (end == buf)
958 return -EINVAL;
959 bank[attr - bank_attrs] = new;
960 mce_restart();
961 return end-buf;
962}
963
964static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr,
965 char *buf)
966{
967 strcpy(buf, trigger);
968 strcat(buf, "\n");
969 return strlen(trigger) + 1;
970}
971
972static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
973 const char *buf,size_t siz)
974{
975 char *p;
976 int len;
977 strncpy(trigger, buf, sizeof(trigger));
978 trigger[sizeof(trigger)-1] = 0;
979 len = strlen(trigger);
980 p = strchr(trigger, '\n');
981 if (*p) *p = 0;
982 return len;
983}
984
985static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
986static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
987ACCESSOR(check_interval,check_interval,mce_restart())
988static struct sysdev_attribute *mce_attributes[] = {
989 &attr_tolerant.attr, &attr_check_interval, &attr_trigger,
990 NULL
991};
992
993static cpumask_var_t mce_device_initialized;
994
995/* Per cpu sysdev init. All of the cpus still share the same ctl bank */
996static __cpuinit int mce_create_device(unsigned int cpu)
997{
998 int err;
999 int i;
1000
1001 if (!mce_available(&boot_cpu_data))
1002 return -EIO;
1003
1004 memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject));
1005 per_cpu(device_mce,cpu).id = cpu;
1006 per_cpu(device_mce,cpu).cls = &mce_sysclass;
1007
1008 err = sysdev_register(&per_cpu(device_mce,cpu));
1009 if (err)
1010 return err;
1011
1012 for (i = 0; mce_attributes[i]; i++) {
1013 err = sysdev_create_file(&per_cpu(device_mce,cpu),
1014 mce_attributes[i]);
1015 if (err)
1016 goto error;
1017 }
1018 for (i = 0; i < banks; i++) {
1019 err = sysdev_create_file(&per_cpu(device_mce, cpu),
1020 &bank_attrs[i]);
1021 if (err)
1022 goto error2;
1023 }
1024 cpumask_set_cpu(cpu, mce_device_initialized);
1025
1026 return 0;
1027error2:
1028 while (--i >= 0) {
1029 sysdev_remove_file(&per_cpu(device_mce, cpu),
1030 &bank_attrs[i]);
1031 }
1032error:
1033 while (--i >= 0) {
1034 sysdev_remove_file(&per_cpu(device_mce,cpu),
1035 mce_attributes[i]);
1036 }
1037 sysdev_unregister(&per_cpu(device_mce,cpu));
1038
1039 return err;
1040}
1041
1042static __cpuinit void mce_remove_device(unsigned int cpu)
1043{
1044 int i;
1045
1046 if (!cpumask_test_cpu(cpu, mce_device_initialized))
1047 return;
1048
1049 for (i = 0; mce_attributes[i]; i++)
1050 sysdev_remove_file(&per_cpu(device_mce,cpu),
1051 mce_attributes[i]);
1052 for (i = 0; i < banks; i++)
1053 sysdev_remove_file(&per_cpu(device_mce, cpu),
1054 &bank_attrs[i]);
1055 sysdev_unregister(&per_cpu(device_mce,cpu));
1056 cpumask_clear_cpu(cpu, mce_device_initialized);
1057}
1058
1059/* Make sure there are no machine checks on offlined CPUs. */
1060static void mce_disable_cpu(void *h)
1061{
1062 int i;
1063 unsigned long action = *(unsigned long *)h;
1064
1065 if (!mce_available(&current_cpu_data))
1066 return;
1067 if (!(action & CPU_TASKS_FROZEN))
1068 cmci_clear();
1069 for (i = 0; i < banks; i++)
1070 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
1071}
1072
1073static void mce_reenable_cpu(void *h)
1074{
1075 int i;
1076 unsigned long action = *(unsigned long *)h;
1077
1078 if (!mce_available(&current_cpu_data))
1079 return;
1080 if (!(action & CPU_TASKS_FROZEN))
1081 cmci_reenable();
1082 for (i = 0; i < banks; i++)
1083 wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
1084}
1085
1086/* Get notified when a cpu comes on/off. Be hotplug friendly. */
1087static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
1088 unsigned long action, void *hcpu)
1089{
1090 unsigned int cpu = (unsigned long)hcpu;
1091 struct timer_list *t = &per_cpu(mce_timer, cpu);
1092
1093 switch (action) {
1094 case CPU_ONLINE:
1095 case CPU_ONLINE_FROZEN:
1096 mce_create_device(cpu);
1097 if (threshold_cpu_callback)
1098 threshold_cpu_callback(action, cpu);
1099 break;
1100 case CPU_DEAD:
1101 case CPU_DEAD_FROZEN:
1102 if (threshold_cpu_callback)
1103 threshold_cpu_callback(action, cpu);
1104 mce_remove_device(cpu);
1105 break;
1106 case CPU_DOWN_PREPARE:
1107 case CPU_DOWN_PREPARE_FROZEN:
1108 del_timer_sync(t);
1109 smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
1110 break;
1111 case CPU_DOWN_FAILED:
1112 case CPU_DOWN_FAILED_FROZEN:
1113 t->expires = round_jiffies(jiffies +
1114 __get_cpu_var(next_interval));
1115 add_timer_on(t, cpu);
1116 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
1117 break;
1118 case CPU_POST_DEAD:
1119 /* intentionally ignoring frozen here */
1120 cmci_rediscover(cpu);
1121 break;
1122 }
1123 return NOTIFY_OK;
1124}
1125
1126static struct notifier_block mce_cpu_notifier __cpuinitdata = {
1127 .notifier_call = mce_cpu_callback,
1128};
1129
1130static __init int mce_init_banks(void)
1131{
1132 int i;
1133
1134 bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
1135 GFP_KERNEL);
1136 if (!bank_attrs)
1137 return -ENOMEM;
1138
1139 for (i = 0; i < banks; i++) {
1140 struct sysdev_attribute *a = &bank_attrs[i];
1141 a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i);
1142 if (!a->attr.name)
1143 goto nomem;
1144 a->attr.mode = 0644;
1145 a->show = show_bank;
1146 a->store = set_bank;
1147 }
1148 return 0;
1149
1150nomem:
1151 while (--i >= 0)
1152 kfree(bank_attrs[i].attr.name);
1153 kfree(bank_attrs);
1154 bank_attrs = NULL;
1155 return -ENOMEM;
1156}
1157
1158static __init int mce_init_device(void)
1159{
1160 int err;
1161 int i = 0;
1162
1163 if (!mce_available(&boot_cpu_data))
1164 return -EIO;
1165
1166 alloc_cpumask_var(&mce_device_initialized, GFP_KERNEL);
1167
1168 err = mce_init_banks();
1169 if (err)
1170 return err;
1171
1172 err = sysdev_class_register(&mce_sysclass);
1173 if (err)
1174 return err;
1175
1176 for_each_online_cpu(i) {
1177 err = mce_create_device(i);
1178 if (err)
1179 return err;
1180 }
1181
1182 register_hotcpu_notifier(&mce_cpu_notifier);
1183 misc_register(&mce_log_device);
1184 return err;
1185}
1186
1187device_initcall(mce_init_device);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
index 56dde9c4bc96..ddae21620bda 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@ -13,22 +13,22 @@
13 * 13 *
14 * All MC4_MISCi registers are shared between multi-cores 14 * All MC4_MISCi registers are shared between multi-cores
15 */ 15 */
16
17#include <linux/cpu.h>
18#include <linux/errno.h>
19#include <linux/init.h>
20#include <linux/interrupt.h> 16#include <linux/interrupt.h>
21#include <linux/kobject.h>
22#include <linux/notifier.h> 17#include <linux/notifier.h>
23#include <linux/sched.h> 18#include <linux/kobject.h>
24#include <linux/smp.h> 19#include <linux/percpu.h>
25#include <linux/sysdev.h> 20#include <linux/sysdev.h>
21#include <linux/errno.h>
22#include <linux/sched.h>
26#include <linux/sysfs.h> 23#include <linux/sysfs.h>
24#include <linux/init.h>
25#include <linux/cpu.h>
26#include <linux/smp.h>
27
27#include <asm/apic.h> 28#include <asm/apic.h>
29#include <asm/idle.h>
28#include <asm/mce.h> 30#include <asm/mce.h>
29#include <asm/msr.h> 31#include <asm/msr.h>
30#include <asm/percpu.h>
31#include <asm/idle.h>
32 32
33#define PFX "mce_threshold: " 33#define PFX "mce_threshold: "
34#define VERSION "version 1.1.1" 34#define VERSION "version 1.1.1"
@@ -48,26 +48,26 @@
48#define MCG_XBLK_ADDR 0xC0000400 48#define MCG_XBLK_ADDR 0xC0000400
49 49
50struct threshold_block { 50struct threshold_block {
51 unsigned int block; 51 unsigned int block;
52 unsigned int bank; 52 unsigned int bank;
53 unsigned int cpu; 53 unsigned int cpu;
54 u32 address; 54 u32 address;
55 u16 interrupt_enable; 55 u16 interrupt_enable;
56 u16 threshold_limit; 56 u16 threshold_limit;
57 struct kobject kobj; 57 struct kobject kobj;
58 struct list_head miscj; 58 struct list_head miscj;
59}; 59};
60 60
61/* defaults used early on boot */ 61/* defaults used early on boot */
62static struct threshold_block threshold_defaults = { 62static struct threshold_block threshold_defaults = {
63 .interrupt_enable = 0, 63 .interrupt_enable = 0,
64 .threshold_limit = THRESHOLD_MAX, 64 .threshold_limit = THRESHOLD_MAX,
65}; 65};
66 66
67struct threshold_bank { 67struct threshold_bank {
68 struct kobject *kobj; 68 struct kobject *kobj;
69 struct threshold_block *blocks; 69 struct threshold_block *blocks;
70 cpumask_var_t cpus; 70 cpumask_var_t cpus;
71}; 71};
72static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]); 72static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]);
73 73
@@ -86,9 +86,9 @@ static void amd_threshold_interrupt(void);
86 */ 86 */
87 87
88struct thresh_restart { 88struct thresh_restart {
89 struct threshold_block *b; 89 struct threshold_block *b;
90 int reset; 90 int reset;
91 u16 old_limit; 91 u16 old_limit;
92}; 92};
93 93
94/* must be called with correct cpu affinity */ 94/* must be called with correct cpu affinity */
@@ -110,6 +110,7 @@ static void threshold_restart_bank(void *_tr)
110 } else if (tr->old_limit) { /* change limit w/o reset */ 110 } else if (tr->old_limit) { /* change limit w/o reset */
111 int new_count = (mci_misc_hi & THRESHOLD_MAX) + 111 int new_count = (mci_misc_hi & THRESHOLD_MAX) +
112 (tr->old_limit - tr->b->threshold_limit); 112 (tr->old_limit - tr->b->threshold_limit);
113
113 mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) | 114 mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) |
114 (new_count & THRESHOLD_MAX); 115 (new_count & THRESHOLD_MAX);
115 } 116 }
@@ -125,11 +126,11 @@ static void threshold_restart_bank(void *_tr)
125/* cpu init entry point, called from mce.c with preempt off */ 126/* cpu init entry point, called from mce.c with preempt off */
126void mce_amd_feature_init(struct cpuinfo_x86 *c) 127void mce_amd_feature_init(struct cpuinfo_x86 *c)
127{ 128{
128 unsigned int bank, block;
129 unsigned int cpu = smp_processor_id(); 129 unsigned int cpu = smp_processor_id();
130 u8 lvt_off;
131 u32 low = 0, high = 0, address = 0; 130 u32 low = 0, high = 0, address = 0;
131 unsigned int bank, block;
132 struct thresh_restart tr; 132 struct thresh_restart tr;
133 u8 lvt_off;
133 134
134 for (bank = 0; bank < NR_BANKS; ++bank) { 135 for (bank = 0; bank < NR_BANKS; ++bank) {
135 for (block = 0; block < NR_BLOCKS; ++block) { 136 for (block = 0; block < NR_BLOCKS; ++block) {
@@ -140,8 +141,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
140 if (!address) 141 if (!address)
141 break; 142 break;
142 address += MCG_XBLK_ADDR; 143 address += MCG_XBLK_ADDR;
143 } 144 } else
144 else
145 ++address; 145 ++address;
146 146
147 if (rdmsr_safe(address, &low, &high)) 147 if (rdmsr_safe(address, &low, &high))
@@ -193,9 +193,9 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
193 */ 193 */
194static void amd_threshold_interrupt(void) 194static void amd_threshold_interrupt(void)
195{ 195{
196 u32 low = 0, high = 0, address = 0;
196 unsigned int bank, block; 197 unsigned int bank, block;
197 struct mce m; 198 struct mce m;
198 u32 low = 0, high = 0, address = 0;
199 199
200 mce_setup(&m); 200 mce_setup(&m);
201 201
@@ -204,16 +204,16 @@ static void amd_threshold_interrupt(void)
204 if (!(per_cpu(bank_map, m.cpu) & (1 << bank))) 204 if (!(per_cpu(bank_map, m.cpu) & (1 << bank)))
205 continue; 205 continue;
206 for (block = 0; block < NR_BLOCKS; ++block) { 206 for (block = 0; block < NR_BLOCKS; ++block) {
207 if (block == 0) 207 if (block == 0) {
208 address = MSR_IA32_MC0_MISC + bank * 4; 208 address = MSR_IA32_MC0_MISC + bank * 4;
209 else if (block == 1) { 209 } else if (block == 1) {
210 address = (low & MASK_BLKPTR_LO) >> 21; 210 address = (low & MASK_BLKPTR_LO) >> 21;
211 if (!address) 211 if (!address)
212 break; 212 break;
213 address += MCG_XBLK_ADDR; 213 address += MCG_XBLK_ADDR;
214 } 214 } else {
215 else
216 ++address; 215 ++address;
216 }
217 217
218 if (rdmsr_safe(address, &low, &high)) 218 if (rdmsr_safe(address, &low, &high))
219 break; 219 break;
@@ -229,8 +229,10 @@ static void amd_threshold_interrupt(void)
229 (high & MASK_LOCKED_HI)) 229 (high & MASK_LOCKED_HI))
230 continue; 230 continue;
231 231
232 /* Log the machine check that caused the threshold 232 /*
233 event. */ 233 * Log the machine check that caused the threshold
234 * event.
235 */
234 machine_check_poll(MCP_TIMESTAMP, 236 machine_check_poll(MCP_TIMESTAMP,
235 &__get_cpu_var(mce_poll_banks)); 237 &__get_cpu_var(mce_poll_banks));
236 238
@@ -254,48 +256,52 @@ static void amd_threshold_interrupt(void)
254 256
255struct threshold_attr { 257struct threshold_attr {
256 struct attribute attr; 258 struct attribute attr;
257 ssize_t(*show) (struct threshold_block *, char *); 259 ssize_t (*show) (struct threshold_block *, char *);
258 ssize_t(*store) (struct threshold_block *, const char *, size_t count); 260 ssize_t (*store) (struct threshold_block *, const char *, size_t count);
259}; 261};
260 262
261#define SHOW_FIELDS(name) \ 263#define SHOW_FIELDS(name) \
262static ssize_t show_ ## name(struct threshold_block * b, char *buf) \ 264static ssize_t show_ ## name(struct threshold_block *b, char *buf) \
263{ \ 265{ \
264 return sprintf(buf, "%lx\n", (unsigned long) b->name); \ 266 return sprintf(buf, "%lx\n", (unsigned long) b->name); \
265} 267}
266SHOW_FIELDS(interrupt_enable) 268SHOW_FIELDS(interrupt_enable)
267SHOW_FIELDS(threshold_limit) 269SHOW_FIELDS(threshold_limit)
268 270
269static ssize_t store_interrupt_enable(struct threshold_block *b, 271static ssize_t
270 const char *buf, size_t count) 272store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size)
271{ 273{
272 char *end;
273 struct thresh_restart tr; 274 struct thresh_restart tr;
274 unsigned long new = simple_strtoul(buf, &end, 0); 275 unsigned long new;
275 if (end == buf) 276
277 if (strict_strtoul(buf, 0, &new) < 0)
276 return -EINVAL; 278 return -EINVAL;
279
277 b->interrupt_enable = !!new; 280 b->interrupt_enable = !!new;
278 281
279 tr.b = b; 282 tr.b = b;
280 tr.reset = 0; 283 tr.reset = 0;
281 tr.old_limit = 0; 284 tr.old_limit = 0;
285
282 smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); 286 smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
283 287
284 return end - buf; 288 return size;
285} 289}
286 290
287static ssize_t store_threshold_limit(struct threshold_block *b, 291static ssize_t
288 const char *buf, size_t count) 292store_threshold_limit(struct threshold_block *b, const char *buf, size_t size)
289{ 293{
290 char *end;
291 struct thresh_restart tr; 294 struct thresh_restart tr;
292 unsigned long new = simple_strtoul(buf, &end, 0); 295 unsigned long new;
293 if (end == buf) 296
297 if (strict_strtoul(buf, 0, &new) < 0)
294 return -EINVAL; 298 return -EINVAL;
299
295 if (new > THRESHOLD_MAX) 300 if (new > THRESHOLD_MAX)
296 new = THRESHOLD_MAX; 301 new = THRESHOLD_MAX;
297 if (new < 1) 302 if (new < 1)
298 new = 1; 303 new = 1;
304
299 tr.old_limit = b->threshold_limit; 305 tr.old_limit = b->threshold_limit;
300 b->threshold_limit = new; 306 b->threshold_limit = new;
301 tr.b = b; 307 tr.b = b;
@@ -303,12 +309,12 @@ static ssize_t store_threshold_limit(struct threshold_block *b,
303 309
304 smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); 310 smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
305 311
306 return end - buf; 312 return size;
307} 313}
308 314
309struct threshold_block_cross_cpu { 315struct threshold_block_cross_cpu {
310 struct threshold_block *tb; 316 struct threshold_block *tb;
311 long retval; 317 long retval;
312}; 318};
313 319
314static void local_error_count_handler(void *_tbcc) 320static void local_error_count_handler(void *_tbcc)
@@ -338,16 +344,13 @@ static ssize_t store_error_count(struct threshold_block *b,
338 return 1; 344 return 1;
339} 345}
340 346
341#define THRESHOLD_ATTR(_name,_mode,_show,_store) { \ 347#define RW_ATTR(val) \
342 .attr = {.name = __stringify(_name), .mode = _mode }, \ 348static struct threshold_attr val = { \
343 .show = _show, \ 349 .attr = {.name = __stringify(val), .mode = 0644 }, \
344 .store = _store, \ 350 .show = show_## val, \
351 .store = store_## val, \
345}; 352};
346 353
347#define RW_ATTR(name) \
348static struct threshold_attr name = \
349 THRESHOLD_ATTR(name, 0644, show_## name, store_## name)
350
351RW_ATTR(interrupt_enable); 354RW_ATTR(interrupt_enable);
352RW_ATTR(threshold_limit); 355RW_ATTR(threshold_limit);
353RW_ATTR(error_count); 356RW_ATTR(error_count);
@@ -359,15 +362,17 @@ static struct attribute *default_attrs[] = {
359 NULL 362 NULL
360}; 363};
361 364
362#define to_block(k) container_of(k, struct threshold_block, kobj) 365#define to_block(k) container_of(k, struct threshold_block, kobj)
363#define to_attr(a) container_of(a, struct threshold_attr, attr) 366#define to_attr(a) container_of(a, struct threshold_attr, attr)
364 367
365static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) 368static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
366{ 369{
367 struct threshold_block *b = to_block(kobj); 370 struct threshold_block *b = to_block(kobj);
368 struct threshold_attr *a = to_attr(attr); 371 struct threshold_attr *a = to_attr(attr);
369 ssize_t ret; 372 ssize_t ret;
373
370 ret = a->show ? a->show(b, buf) : -EIO; 374 ret = a->show ? a->show(b, buf) : -EIO;
375
371 return ret; 376 return ret;
372} 377}
373 378
@@ -377,18 +382,20 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr,
377 struct threshold_block *b = to_block(kobj); 382 struct threshold_block *b = to_block(kobj);
378 struct threshold_attr *a = to_attr(attr); 383 struct threshold_attr *a = to_attr(attr);
379 ssize_t ret; 384 ssize_t ret;
385
380 ret = a->store ? a->store(b, buf, count) : -EIO; 386 ret = a->store ? a->store(b, buf, count) : -EIO;
387
381 return ret; 388 return ret;
382} 389}
383 390
384static struct sysfs_ops threshold_ops = { 391static struct sysfs_ops threshold_ops = {
385 .show = show, 392 .show = show,
386 .store = store, 393 .store = store,
387}; 394};
388 395
389static struct kobj_type threshold_ktype = { 396static struct kobj_type threshold_ktype = {
390 .sysfs_ops = &threshold_ops, 397 .sysfs_ops = &threshold_ops,
391 .default_attrs = default_attrs, 398 .default_attrs = default_attrs,
392}; 399};
393 400
394static __cpuinit int allocate_threshold_blocks(unsigned int cpu, 401static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
@@ -396,9 +403,9 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
396 unsigned int block, 403 unsigned int block,
397 u32 address) 404 u32 address)
398{ 405{
399 int err;
400 u32 low, high;
401 struct threshold_block *b = NULL; 406 struct threshold_block *b = NULL;
407 u32 low, high;
408 int err;
402 409
403 if ((bank >= NR_BANKS) || (block >= NR_BLOCKS)) 410 if ((bank >= NR_BANKS) || (block >= NR_BLOCKS))
404 return 0; 411 return 0;
@@ -421,20 +428,21 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
421 if (!b) 428 if (!b)
422 return -ENOMEM; 429 return -ENOMEM;
423 430
424 b->block = block; 431 b->block = block;
425 b->bank = bank; 432 b->bank = bank;
426 b->cpu = cpu; 433 b->cpu = cpu;
427 b->address = address; 434 b->address = address;
428 b->interrupt_enable = 0; 435 b->interrupt_enable = 0;
429 b->threshold_limit = THRESHOLD_MAX; 436 b->threshold_limit = THRESHOLD_MAX;
430 437
431 INIT_LIST_HEAD(&b->miscj); 438 INIT_LIST_HEAD(&b->miscj);
432 439
433 if (per_cpu(threshold_banks, cpu)[bank]->blocks) 440 if (per_cpu(threshold_banks, cpu)[bank]->blocks) {
434 list_add(&b->miscj, 441 list_add(&b->miscj,
435 &per_cpu(threshold_banks, cpu)[bank]->blocks->miscj); 442 &per_cpu(threshold_banks, cpu)[bank]->blocks->miscj);
436 else 443 } else {
437 per_cpu(threshold_banks, cpu)[bank]->blocks = b; 444 per_cpu(threshold_banks, cpu)[bank]->blocks = b;
445 }
438 446
439 err = kobject_init_and_add(&b->kobj, &threshold_ktype, 447 err = kobject_init_and_add(&b->kobj, &threshold_ktype,
440 per_cpu(threshold_banks, cpu)[bank]->kobj, 448 per_cpu(threshold_banks, cpu)[bank]->kobj,
@@ -447,8 +455,9 @@ recurse:
447 if (!address) 455 if (!address)
448 return 0; 456 return 0;
449 address += MCG_XBLK_ADDR; 457 address += MCG_XBLK_ADDR;
450 } else 458 } else {
451 ++address; 459 ++address;
460 }
452 461
453 err = allocate_threshold_blocks(cpu, bank, ++block, address); 462 err = allocate_threshold_blocks(cpu, bank, ++block, address);
454 if (err) 463 if (err)
@@ -500,13 +509,14 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
500 if (!b) 509 if (!b)
501 goto out; 510 goto out;
502 511
503 err = sysfs_create_link(&per_cpu(device_mce, cpu).kobj, 512 err = sysfs_create_link(&per_cpu(mce_dev, cpu).kobj,
504 b->kobj, name); 513 b->kobj, name);
505 if (err) 514 if (err)
506 goto out; 515 goto out;
507 516
508 cpumask_copy(b->cpus, cpu_core_mask(cpu)); 517 cpumask_copy(b->cpus, cpu_core_mask(cpu));
509 per_cpu(threshold_banks, cpu)[bank] = b; 518 per_cpu(threshold_banks, cpu)[bank] = b;
519
510 goto out; 520 goto out;
511 } 521 }
512#endif 522#endif
@@ -522,7 +532,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
522 goto out; 532 goto out;
523 } 533 }
524 534
525 b->kobj = kobject_create_and_add(name, &per_cpu(device_mce, cpu).kobj); 535 b->kobj = kobject_create_and_add(name, &per_cpu(mce_dev, cpu).kobj);
526 if (!b->kobj) 536 if (!b->kobj)
527 goto out_free; 537 goto out_free;
528 538
@@ -542,7 +552,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
542 if (i == cpu) 552 if (i == cpu)
543 continue; 553 continue;
544 554
545 err = sysfs_create_link(&per_cpu(device_mce, i).kobj, 555 err = sysfs_create_link(&per_cpu(mce_dev, i).kobj,
546 b->kobj, name); 556 b->kobj, name);
547 if (err) 557 if (err)
548 goto out; 558 goto out;
@@ -605,15 +615,13 @@ static void deallocate_threshold_block(unsigned int cpu,
605 615
606static void threshold_remove_bank(unsigned int cpu, int bank) 616static void threshold_remove_bank(unsigned int cpu, int bank)
607{ 617{
608 int i = 0;
609 struct threshold_bank *b; 618 struct threshold_bank *b;
610 char name[32]; 619 char name[32];
620 int i = 0;
611 621
612 b = per_cpu(threshold_banks, cpu)[bank]; 622 b = per_cpu(threshold_banks, cpu)[bank];
613
614 if (!b) 623 if (!b)
615 return; 624 return;
616
617 if (!b->blocks) 625 if (!b->blocks)
618 goto free_out; 626 goto free_out;
619 627
@@ -622,8 +630,9 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
622#ifdef CONFIG_SMP 630#ifdef CONFIG_SMP
623 /* sibling symlink */ 631 /* sibling symlink */
624 if (shared_bank[bank] && b->blocks->cpu != cpu) { 632 if (shared_bank[bank] && b->blocks->cpu != cpu) {
625 sysfs_remove_link(&per_cpu(device_mce, cpu).kobj, name); 633 sysfs_remove_link(&per_cpu(mce_dev, cpu).kobj, name);
626 per_cpu(threshold_banks, cpu)[bank] = NULL; 634 per_cpu(threshold_banks, cpu)[bank] = NULL;
635
627 return; 636 return;
628 } 637 }
629#endif 638#endif
@@ -633,7 +642,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
633 if (i == cpu) 642 if (i == cpu)
634 continue; 643 continue;
635 644
636 sysfs_remove_link(&per_cpu(device_mce, i).kobj, name); 645 sysfs_remove_link(&per_cpu(mce_dev, i).kobj, name);
637 per_cpu(threshold_banks, i)[bank] = NULL; 646 per_cpu(threshold_banks, i)[bank] = NULL;
638 } 647 }
639 648
@@ -659,12 +668,9 @@ static void threshold_remove_device(unsigned int cpu)
659} 668}
660 669
661/* get notified when a cpu comes on/off */ 670/* get notified when a cpu comes on/off */
662static void __cpuinit amd_64_threshold_cpu_callback(unsigned long action, 671static void __cpuinit
663 unsigned int cpu) 672amd_64_threshold_cpu_callback(unsigned long action, unsigned int cpu)
664{ 673{
665 if (cpu >= NR_CPUS)
666 return;
667
668 switch (action) { 674 switch (action) {
669 case CPU_ONLINE: 675 case CPU_ONLINE:
670 case CPU_ONLINE_FROZEN: 676 case CPU_ONLINE_FROZEN:
@@ -686,11 +692,12 @@ static __init int threshold_init_device(void)
686 /* to hit CPUs online before the notifier is up */ 692 /* to hit CPUs online before the notifier is up */
687 for_each_online_cpu(lcpu) { 693 for_each_online_cpu(lcpu) {
688 int err = threshold_create_device(lcpu); 694 int err = threshold_create_device(lcpu);
695
689 if (err) 696 if (err)
690 return err; 697 return err;
691 } 698 }
692 threshold_cpu_callback = amd_64_threshold_cpu_callback; 699 threshold_cpu_callback = amd_64_threshold_cpu_callback;
700
693 return 0; 701 return 0;
694} 702}
695
696device_initcall(threshold_init_device); 703device_initcall(threshold_init_device);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
new file mode 100644
index 000000000000..2b011d2d8579
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -0,0 +1,74 @@
1/*
2 * Common code for Intel machine checks
3 */
4#include <linux/interrupt.h>
5#include <linux/kernel.h>
6#include <linux/types.h>
7#include <linux/init.h>
8#include <linux/smp.h>
9
10#include <asm/therm_throt.h>
11#include <asm/processor.h>
12#include <asm/system.h>
13#include <asm/apic.h>
14#include <asm/msr.h>
15
16#include "mce.h"
17
18void intel_init_thermal(struct cpuinfo_x86 *c)
19{
20 unsigned int cpu = smp_processor_id();
21 int tm2 = 0;
22 u32 l, h;
23
24 /* Thermal monitoring depends on ACPI and clock modulation*/
25 if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC))
26 return;
27
28 /*
29 * First check if its enabled already, in which case there might
30 * be some SMM goo which handles it, so we can't even put a handler
31 * since it might be delivered via SMI already:
32 */
33 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
34 h = apic_read(APIC_LVTTHMR);
35 if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
36 printk(KERN_DEBUG
37 "CPU%d: Thermal monitoring handled by SMI\n", cpu);
38 return;
39 }
40
41 if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2))
42 tm2 = 1;
43
44 /* Check whether a vector already exists */
45 if (h & APIC_VECTOR_MASK) {
46 printk(KERN_DEBUG
47 "CPU%d: Thermal LVT vector (%#x) already installed\n",
48 cpu, (h & APIC_VECTOR_MASK));
49 return;
50 }
51
52 /* We'll mask the thermal vector in the lapic till we're ready: */
53 h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED;
54 apic_write(APIC_LVTTHMR, h);
55
56 rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
57 wrmsr(MSR_IA32_THERM_INTERRUPT,
58 l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h);
59
60 intel_set_thermal_handler();
61
62 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
63 wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h);
64
65 /* Unmask the thermal vector: */
66 l = apic_read(APIC_LVTTHMR);
67 apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
68
69 printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n",
70 cpu, tm2 ? "TM2" : "TM1");
71
72 /* enable thermal throttle processing */
73 atomic_set(&therm_throt_en, 1);
74}
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
index cef3ee30744b..f2ef6952c400 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
@@ -15,7 +15,8 @@
15#include <asm/hw_irq.h> 15#include <asm/hw_irq.h>
16#include <asm/idle.h> 16#include <asm/idle.h>
17#include <asm/therm_throt.h> 17#include <asm/therm_throt.h>
18#include <asm/apic.h> 18
19#include "mce.h"
19 20
20asmlinkage void smp_thermal_interrupt(void) 21asmlinkage void smp_thermal_interrupt(void)
21{ 22{
@@ -27,67 +28,13 @@ asmlinkage void smp_thermal_interrupt(void)
27 irq_enter(); 28 irq_enter();
28 29
29 rdmsrl(MSR_IA32_THERM_STATUS, msr_val); 30 rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
30 if (therm_throt_process(msr_val & 1)) 31 if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT))
31 mce_log_therm_throt_event(msr_val); 32 mce_log_therm_throt_event(msr_val);
32 33
33 inc_irq_stat(irq_thermal_count); 34 inc_irq_stat(irq_thermal_count);
34 irq_exit(); 35 irq_exit();
35} 36}
36 37
37static void intel_init_thermal(struct cpuinfo_x86 *c)
38{
39 u32 l, h;
40 int tm2 = 0;
41 unsigned int cpu = smp_processor_id();
42
43 if (!cpu_has(c, X86_FEATURE_ACPI))
44 return;
45
46 if (!cpu_has(c, X86_FEATURE_ACC))
47 return;
48
49 /* first check if TM1 is already enabled by the BIOS, in which
50 * case there might be some SMM goo which handles it, so we can't even
51 * put a handler since it might be delivered via SMI already.
52 */
53 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
54 h = apic_read(APIC_LVTTHMR);
55 if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
56 printk(KERN_DEBUG
57 "CPU%d: Thermal monitoring handled by SMI\n", cpu);
58 return;
59 }
60
61 if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2))
62 tm2 = 1;
63
64 if (h & APIC_VECTOR_MASK) {
65 printk(KERN_DEBUG
66 "CPU%d: Thermal LVT vector (%#x) already "
67 "installed\n", cpu, (h & APIC_VECTOR_MASK));
68 return;
69 }
70
71 h = THERMAL_APIC_VECTOR;
72 h |= (APIC_DM_FIXED | APIC_LVT_MASKED);
73 apic_write(APIC_LVTTHMR, h);
74
75 rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
76 wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03, h);
77
78 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
79 wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h);
80
81 l = apic_read(APIC_LVTTHMR);
82 apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
83 printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n",
84 cpu, tm2 ? "TM2" : "TM1");
85
86 /* enable thermal throttle processing */
87 atomic_set(&therm_throt_en, 1);
88 return;
89}
90
91/* 38/*
92 * Support for Intel Correct Machine Check Interrupts. This allows 39 * Support for Intel Correct Machine Check Interrupts. This allows
93 * the CPU to raise an interrupt when a corrected machine check happened. 40 * the CPU to raise an interrupt when a corrected machine check happened.
@@ -109,6 +56,9 @@ static int cmci_supported(int *banks)
109{ 56{
110 u64 cap; 57 u64 cap;
111 58
59 if (mce_cmci_disabled || mce_ignore_ce)
60 return 0;
61
112 /* 62 /*
113 * Vendor check is not strictly needed, but the initial 63 * Vendor check is not strictly needed, but the initial
114 * initialization is vendor keyed and this 64 * initialization is vendor keyed and this
@@ -132,7 +82,7 @@ static int cmci_supported(int *banks)
132static void intel_threshold_interrupt(void) 82static void intel_threshold_interrupt(void)
133{ 83{
134 machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); 84 machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
135 mce_notify_user(); 85 mce_notify_irq();
136} 86}
137 87
138static void print_update(char *type, int *hdr, int num) 88static void print_update(char *type, int *hdr, int num)
@@ -248,7 +198,7 @@ void cmci_rediscover(int dying)
248 return; 198 return;
249 cpumask_copy(old, &current->cpus_allowed); 199 cpumask_copy(old, &current->cpus_allowed);
250 200
251 for_each_online_cpu (cpu) { 201 for_each_online_cpu(cpu) {
252 if (cpu == dying) 202 if (cpu == dying)
253 continue; 203 continue;
254 if (set_cpus_allowed_ptr(current, cpumask_of(cpu))) 204 if (set_cpus_allowed_ptr(current, cpumask_of(cpu)))
diff --git a/arch/x86/kernel/cpu/mcheck/non-fatal.c b/arch/x86/kernel/cpu/mcheck/non-fatal.c
index a74af128efc9..70b710420f74 100644
--- a/arch/x86/kernel/cpu/mcheck/non-fatal.c
+++ b/arch/x86/kernel/cpu/mcheck/non-fatal.c
@@ -6,15 +6,14 @@
6 * This file contains routines to check for non-fatal MCEs every 15s 6 * This file contains routines to check for non-fatal MCEs every 15s
7 * 7 *
8 */ 8 */
9
10#include <linux/init.h>
11#include <linux/types.h>
12#include <linux/kernel.h>
13#include <linux/jiffies.h>
14#include <linux/workqueue.h>
15#include <linux/interrupt.h> 9#include <linux/interrupt.h>
16#include <linux/smp.h> 10#include <linux/workqueue.h>
11#include <linux/jiffies.h>
12#include <linux/kernel.h>
17#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/types.h>
15#include <linux/init.h>
16#include <linux/smp.h>
18 17
19#include <asm/processor.h> 18#include <asm/processor.h>
20#include <asm/system.h> 19#include <asm/system.h>
@@ -22,9 +21,9 @@
22 21
23#include "mce.h" 22#include "mce.h"
24 23
25static int firstbank; 24static int firstbank;
26 25
27#define MCE_RATE 15*HZ /* timer rate is 15s */ 26#define MCE_RATE (15*HZ) /* timer rate is 15s */
28 27
29static void mce_checkregs(void *info) 28static void mce_checkregs(void *info)
30{ 29{
@@ -34,23 +33,24 @@ static void mce_checkregs(void *info)
34 for (i = firstbank; i < nr_mce_banks; i++) { 33 for (i = firstbank; i < nr_mce_banks; i++) {
35 rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); 34 rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high);
36 35
37 if (high & (1<<31)) { 36 if (!(high & (1<<31)))
38 printk(KERN_INFO "MCE: The hardware reports a non " 37 continue;
39 "fatal, correctable incident occurred on " 38
40 "CPU %d.\n", 39 printk(KERN_INFO "MCE: The hardware reports a non fatal, "
40 "correctable incident occurred on CPU %d.\n",
41 smp_processor_id()); 41 smp_processor_id());
42 printk(KERN_INFO "Bank %d: %08x%08x\n", i, high, low); 42
43 43 printk(KERN_INFO "Bank %d: %08x%08x\n", i, high, low);
44 /* 44
45 * Scrub the error so we don't pick it up in MCE_RATE 45 /*
46 * seconds time. 46 * Scrub the error so we don't pick it up in MCE_RATE
47 */ 47 * seconds time:
48 wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL); 48 */
49 49 wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL);
50 /* Serialize */ 50
51 wmb(); 51 /* Serialize: */
52 add_taint(TAINT_MACHINE_CHECK); 52 wmb();
53 } 53 add_taint(TAINT_MACHINE_CHECK);
54 } 54 }
55} 55}
56 56
@@ -77,16 +77,17 @@ static int __init init_nonfatal_mce_checker(void)
77 77
78 /* Some Athlons misbehave when we frob bank 0 */ 78 /* Some Athlons misbehave when we frob bank 0 */
79 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && 79 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
80 boot_cpu_data.x86 == 6) 80 boot_cpu_data.x86 == 6)
81 firstbank = 1; 81 firstbank = 1;
82 else 82 else
83 firstbank = 0; 83 firstbank = 0;
84 84
85 /* 85 /*
86 * Check for non-fatal errors every MCE_RATE s 86 * Check for non-fatal errors every MCE_RATE s
87 */ 87 */
88 schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE)); 88 schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE));
89 printk(KERN_INFO "Machine check exception polling timer started.\n"); 89 printk(KERN_INFO "Machine check exception polling timer started.\n");
90
90 return 0; 91 return 0;
91} 92}
92module_init(init_nonfatal_mce_checker); 93module_init(init_nonfatal_mce_checker);
diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c
index f53bdcbaf382..82cee108a2d3 100644
--- a/arch/x86/kernel/cpu/mcheck/p4.c
+++ b/arch/x86/kernel/cpu/mcheck/p4.c
@@ -2,18 +2,17 @@
2 * P4 specific Machine Check Exception Reporting 2 * P4 specific Machine Check Exception Reporting
3 */ 3 */
4 4
5#include <linux/init.h>
6#include <linux/types.h>
7#include <linux/kernel.h>
8#include <linux/interrupt.h> 5#include <linux/interrupt.h>
6#include <linux/kernel.h>
7#include <linux/types.h>
8#include <linux/init.h>
9#include <linux/smp.h> 9#include <linux/smp.h>
10 10
11#include <asm/therm_throt.h>
11#include <asm/processor.h> 12#include <asm/processor.h>
12#include <asm/system.h> 13#include <asm/system.h>
13#include <asm/msr.h>
14#include <asm/apic.h> 14#include <asm/apic.h>
15 15#include <asm/msr.h>
16#include <asm/therm_throt.h>
17 16
18#include "mce.h" 17#include "mce.h"
19 18
@@ -36,6 +35,7 @@ static int mce_num_extended_msrs;
36 35
37 36
38#ifdef CONFIG_X86_MCE_P4THERMAL 37#ifdef CONFIG_X86_MCE_P4THERMAL
38
39static void unexpected_thermal_interrupt(struct pt_regs *regs) 39static void unexpected_thermal_interrupt(struct pt_regs *regs)
40{ 40{
41 printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n", 41 printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n",
@@ -43,7 +43,7 @@ static void unexpected_thermal_interrupt(struct pt_regs *regs)
43 add_taint(TAINT_MACHINE_CHECK); 43 add_taint(TAINT_MACHINE_CHECK);
44} 44}
45 45
46/* P4/Xeon Thermal transition interrupt handler */ 46/* P4/Xeon Thermal transition interrupt handler: */
47static void intel_thermal_interrupt(struct pt_regs *regs) 47static void intel_thermal_interrupt(struct pt_regs *regs)
48{ 48{
49 __u64 msr_val; 49 __u64 msr_val;
@@ -51,11 +51,12 @@ static void intel_thermal_interrupt(struct pt_regs *regs)
51 ack_APIC_irq(); 51 ack_APIC_irq();
52 52
53 rdmsrl(MSR_IA32_THERM_STATUS, msr_val); 53 rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
54 therm_throt_process(msr_val & 0x1); 54 therm_throt_process(msr_val & THERM_STATUS_PROCHOT);
55} 55}
56 56
57/* Thermal interrupt handler for this CPU setup */ 57/* Thermal interrupt handler for this CPU setup: */
58static void (*vendor_thermal_interrupt)(struct pt_regs *regs) = unexpected_thermal_interrupt; 58static void (*vendor_thermal_interrupt)(struct pt_regs *regs) =
59 unexpected_thermal_interrupt;
59 60
60void smp_thermal_interrupt(struct pt_regs *regs) 61void smp_thermal_interrupt(struct pt_regs *regs)
61{ 62{
@@ -65,67 +66,15 @@ void smp_thermal_interrupt(struct pt_regs *regs)
65 irq_exit(); 66 irq_exit();
66} 67}
67 68
68/* P4/Xeon Thermal regulation detect and init */ 69void intel_set_thermal_handler(void)
69static void intel_init_thermal(struct cpuinfo_x86 *c)
70{ 70{
71 u32 l, h;
72 unsigned int cpu = smp_processor_id();
73
74 /* Thermal monitoring */
75 if (!cpu_has(c, X86_FEATURE_ACPI))
76 return; /* -ENODEV */
77
78 /* Clock modulation */
79 if (!cpu_has(c, X86_FEATURE_ACC))
80 return; /* -ENODEV */
81
82 /* first check if its enabled already, in which case there might
83 * be some SMM goo which handles it, so we can't even put a handler
84 * since it might be delivered via SMI already -zwanem.
85 */
86 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
87 h = apic_read(APIC_LVTTHMR);
88 if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
89 printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n",
90 cpu);
91 return; /* -EBUSY */
92 }
93
94 /* check whether a vector already exists, temporarily masked? */
95 if (h & APIC_VECTOR_MASK) {
96 printk(KERN_DEBUG "CPU%d: Thermal LVT vector (%#x) already "
97 "installed\n",
98 cpu, (h & APIC_VECTOR_MASK));
99 return; /* -EBUSY */
100 }
101
102 /* The temperature transition interrupt handler setup */
103 h = THERMAL_APIC_VECTOR; /* our delivery vector */
104 h |= (APIC_DM_FIXED | APIC_LVT_MASKED); /* we'll mask till we're ready */
105 apic_write(APIC_LVTTHMR, h);
106
107 rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
108 wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03 , h);
109
110 /* ok we're good to go... */
111 vendor_thermal_interrupt = intel_thermal_interrupt; 71 vendor_thermal_interrupt = intel_thermal_interrupt;
112
113 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
114 wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h);
115
116 l = apic_read(APIC_LVTTHMR);
117 apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
118 printk(KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu);
119
120 /* enable thermal throttle processing */
121 atomic_set(&therm_throt_en, 1);
122 return;
123} 72}
124#endif /* CONFIG_X86_MCE_P4THERMAL */
125 73
74#endif /* CONFIG_X86_MCE_P4THERMAL */
126 75
127/* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */ 76/* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */
128static inline void intel_get_extended_msrs(struct intel_mce_extended_msrs *r) 77static void intel_get_extended_msrs(struct intel_mce_extended_msrs *r)
129{ 78{
130 u32 h; 79 u32 h;
131 80
@@ -143,9 +92,9 @@ static inline void intel_get_extended_msrs(struct intel_mce_extended_msrs *r)
143 92
144static void intel_machine_check(struct pt_regs *regs, long error_code) 93static void intel_machine_check(struct pt_regs *regs, long error_code)
145{ 94{
146 int recover = 1;
147 u32 alow, ahigh, high, low; 95 u32 alow, ahigh, high, low;
148 u32 mcgstl, mcgsth; 96 u32 mcgstl, mcgsth;
97 int recover = 1;
149 int i; 98 int i;
150 99
151 rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); 100 rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
@@ -157,7 +106,9 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
157 106
158 if (mce_num_extended_msrs > 0) { 107 if (mce_num_extended_msrs > 0) {
159 struct intel_mce_extended_msrs dbg; 108 struct intel_mce_extended_msrs dbg;
109
160 intel_get_extended_msrs(&dbg); 110 intel_get_extended_msrs(&dbg);
111
161 printk(KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n" 112 printk(KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n"
162 "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n" 113 "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n"
163 "\tesi: %08x edi: %08x ebp: %08x esp: %08x\n", 114 "\tesi: %08x edi: %08x ebp: %08x esp: %08x\n",
@@ -171,6 +122,7 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
171 if (high & (1<<31)) { 122 if (high & (1<<31)) {
172 char misc[20]; 123 char misc[20];
173 char addr[24]; 124 char addr[24];
125
174 misc[0] = addr[0] = '\0'; 126 misc[0] = addr[0] = '\0';
175 if (high & (1<<29)) 127 if (high & (1<<29))
176 recover |= 1; 128 recover |= 1;
@@ -196,6 +148,7 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
196 panic("Unable to continue"); 148 panic("Unable to continue");
197 149
198 printk(KERN_EMERG "Attempting to continue.\n"); 150 printk(KERN_EMERG "Attempting to continue.\n");
151
199 /* 152 /*
200 * Do not clear the MSR_IA32_MCi_STATUS if the error is not 153 * Do not clear the MSR_IA32_MCi_STATUS if the error is not
201 * recoverable/continuable.This will allow BIOS to look at the MSRs 154 * recoverable/continuable.This will allow BIOS to look at the MSRs
@@ -217,7 +170,6 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
217 wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); 170 wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
218} 171}
219 172
220
221void intel_p4_mcheck_init(struct cpuinfo_x86 *c) 173void intel_p4_mcheck_init(struct cpuinfo_x86 *c)
222{ 174{
223 u32 l, h; 175 u32 l, h;
diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c
index c9f77ea69edc..015f481ab1b0 100644
--- a/arch/x86/kernel/cpu/mcheck/p5.c
+++ b/arch/x86/kernel/cpu/mcheck/p5.c
@@ -2,11 +2,10 @@
2 * P5 specific Machine Check Exception Reporting 2 * P5 specific Machine Check Exception Reporting
3 * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk> 3 * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>
4 */ 4 */
5
6#include <linux/init.h>
7#include <linux/types.h>
8#include <linux/kernel.h>
9#include <linux/interrupt.h> 5#include <linux/interrupt.h>
6#include <linux/kernel.h>
7#include <linux/types.h>
8#include <linux/init.h>
10#include <linux/smp.h> 9#include <linux/smp.h>
11 10
12#include <asm/processor.h> 11#include <asm/processor.h>
@@ -15,39 +14,58 @@
15 14
16#include "mce.h" 15#include "mce.h"
17 16
18/* Machine check handler for Pentium class Intel */ 17/* By default disabled */
18int mce_p5_enable;
19
20/* Machine check handler for Pentium class Intel CPUs: */
19static void pentium_machine_check(struct pt_regs *regs, long error_code) 21static void pentium_machine_check(struct pt_regs *regs, long error_code)
20{ 22{
21 u32 loaddr, hi, lotype; 23 u32 loaddr, hi, lotype;
24
22 rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi); 25 rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi);
23 rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi); 26 rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi);
24 printk(KERN_EMERG "CPU#%d: Machine Check Exception: 0x%8X (type 0x%8X).\n", smp_processor_id(), loaddr, lotype); 27
25 if (lotype&(1<<5)) 28 printk(KERN_EMERG
26 printk(KERN_EMERG "CPU#%d: Possible thermal failure (CPU on fire ?).\n", smp_processor_id()); 29 "CPU#%d: Machine Check Exception: 0x%8X (type 0x%8X).\n",
30 smp_processor_id(), loaddr, lotype);
31
32 if (lotype & (1<<5)) {
33 printk(KERN_EMERG
34 "CPU#%d: Possible thermal failure (CPU on fire ?).\n",
35 smp_processor_id());
36 }
37
27 add_taint(TAINT_MACHINE_CHECK); 38 add_taint(TAINT_MACHINE_CHECK);
28} 39}
29 40
30/* Set up machine check reporting for processors with Intel style MCE */ 41/* Set up machine check reporting for processors with Intel style MCE: */
31void intel_p5_mcheck_init(struct cpuinfo_x86 *c) 42void intel_p5_mcheck_init(struct cpuinfo_x86 *c)
32{ 43{
33 u32 l, h; 44 u32 l, h;
34 45
35 /*Check for MCE support */ 46 /* Check for MCE support: */
36 if (!cpu_has(c, X86_FEATURE_MCE)) 47 if (!cpu_has(c, X86_FEATURE_MCE))
37 return; 48 return;
38 49
39 /* Default P5 to off as its often misconnected */ 50#ifdef CONFIG_X86_OLD_MCE
51 /* Default P5 to off as its often misconnected: */
40 if (mce_disabled != -1) 52 if (mce_disabled != -1)
41 return; 53 return;
54#endif
55
42 machine_check_vector = pentium_machine_check; 56 machine_check_vector = pentium_machine_check;
57 /* Make sure the vector pointer is visible before we enable MCEs: */
43 wmb(); 58 wmb();
44 59
45 /* Read registers before enabling */ 60 /* Read registers before enabling: */
46 rdmsr(MSR_IA32_P5_MC_ADDR, l, h); 61 rdmsr(MSR_IA32_P5_MC_ADDR, l, h);
47 rdmsr(MSR_IA32_P5_MC_TYPE, l, h); 62 rdmsr(MSR_IA32_P5_MC_TYPE, l, h);
48 printk(KERN_INFO "Intel old style machine check architecture supported.\n"); 63 printk(KERN_INFO
64 "Intel old style machine check architecture supported.\n");
49 65
50 /* Enable MCE */ 66 /* Enable MCE: */
51 set_in_cr4(X86_CR4_MCE); 67 set_in_cr4(X86_CR4_MCE);
52 printk(KERN_INFO "Intel old style machine check reporting enabled on CPU#%d.\n", smp_processor_id()); 68 printk(KERN_INFO
69 "Intel old style machine check reporting enabled on CPU#%d.\n",
70 smp_processor_id());
53} 71}
diff --git a/arch/x86/kernel/cpu/mcheck/p6.c b/arch/x86/kernel/cpu/mcheck/p6.c
index 2ac52d7b434b..43c24e667457 100644
--- a/arch/x86/kernel/cpu/mcheck/p6.c
+++ b/arch/x86/kernel/cpu/mcheck/p6.c
@@ -2,11 +2,10 @@
2 * P6 specific Machine Check Exception Reporting 2 * P6 specific Machine Check Exception Reporting
3 * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk> 3 * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>
4 */ 4 */
5
6#include <linux/init.h>
7#include <linux/types.h>
8#include <linux/kernel.h>
9#include <linux/interrupt.h> 5#include <linux/interrupt.h>
6#include <linux/kernel.h>
7#include <linux/types.h>
8#include <linux/init.h>
10#include <linux/smp.h> 9#include <linux/smp.h>
11 10
12#include <asm/processor.h> 11#include <asm/processor.h>
@@ -18,9 +17,9 @@
18/* Machine Check Handler For PII/PIII */ 17/* Machine Check Handler For PII/PIII */
19static void intel_machine_check(struct pt_regs *regs, long error_code) 18static void intel_machine_check(struct pt_regs *regs, long error_code)
20{ 19{
21 int recover = 1;
22 u32 alow, ahigh, high, low; 20 u32 alow, ahigh, high, low;
23 u32 mcgstl, mcgsth; 21 u32 mcgstl, mcgsth;
22 int recover = 1;
24 int i; 23 int i;
25 24
26 rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); 25 rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
@@ -35,12 +34,16 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
35 if (high & (1<<31)) { 34 if (high & (1<<31)) {
36 char misc[20]; 35 char misc[20];
37 char addr[24]; 36 char addr[24];
38 misc[0] = addr[0] = '\0'; 37
38 misc[0] = '\0';
39 addr[0] = '\0';
40
39 if (high & (1<<29)) 41 if (high & (1<<29))
40 recover |= 1; 42 recover |= 1;
41 if (high & (1<<25)) 43 if (high & (1<<25))
42 recover |= 2; 44 recover |= 2;
43 high &= ~(1<<31); 45 high &= ~(1<<31);
46
44 if (high & (1<<27)) { 47 if (high & (1<<27)) {
45 rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh); 48 rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh);
46 snprintf(misc, 20, "[%08x%08x]", ahigh, alow); 49 snprintf(misc, 20, "[%08x%08x]", ahigh, alow);
@@ -49,6 +52,7 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
49 rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh); 52 rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
50 snprintf(addr, 24, " at %08x%08x", ahigh, alow); 53 snprintf(addr, 24, " at %08x%08x", ahigh, alow);
51 } 54 }
55
52 printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n", 56 printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n",
53 smp_processor_id(), i, high, low, misc, addr); 57 smp_processor_id(), i, high, low, misc, addr);
54 } 58 }
@@ -63,16 +67,17 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
63 /* 67 /*
64 * Do not clear the MSR_IA32_MCi_STATUS if the error is not 68 * Do not clear the MSR_IA32_MCi_STATUS if the error is not
65 * recoverable/continuable. This will allow BIOS to look at the MSRs 69 * recoverable/continuable. This will allow BIOS to look at the MSRs
66 * for errors if the OS could not log the error. 70 * for errors if the OS could not log the error:
67 */ 71 */
68 for (i = 0; i < nr_mce_banks; i++) { 72 for (i = 0; i < nr_mce_banks; i++) {
69 unsigned int msr; 73 unsigned int msr;
74
70 msr = MSR_IA32_MC0_STATUS+i*4; 75 msr = MSR_IA32_MC0_STATUS+i*4;
71 rdmsr(msr, low, high); 76 rdmsr(msr, low, high);
72 if (high & (1<<31)) { 77 if (high & (1<<31)) {
73 /* Clear it */ 78 /* Clear it: */
74 wrmsr(msr, 0UL, 0UL); 79 wrmsr(msr, 0UL, 0UL);
75 /* Serialize */ 80 /* Serialize: */
76 wmb(); 81 wmb();
77 add_taint(TAINT_MACHINE_CHECK); 82 add_taint(TAINT_MACHINE_CHECK);
78 } 83 }
@@ -81,7 +86,7 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
81 wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); 86 wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
82} 87}
83 88
84/* Set up machine check reporting for processors with Intel style MCE */ 89/* Set up machine check reporting for processors with Intel style MCE: */
85void intel_p6_mcheck_init(struct cpuinfo_x86 *c) 90void intel_p6_mcheck_init(struct cpuinfo_x86 *c)
86{ 91{
87 u32 l, h; 92 u32 l, h;
@@ -97,6 +102,7 @@ void intel_p6_mcheck_init(struct cpuinfo_x86 *c)
97 102
98 /* Ok machine check is available */ 103 /* Ok machine check is available */
99 machine_check_vector = intel_machine_check; 104 machine_check_vector = intel_machine_check;
105 /* Make sure the vector pointer is visible before we enable MCEs: */
100 wmb(); 106 wmb();
101 107
102 printk(KERN_INFO "Intel machine check architecture supported.\n"); 108 printk(KERN_INFO "Intel machine check architecture supported.\n");
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index d5ae2243f0b9..7b1ae2e20ba5 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -1,7 +1,7 @@
1/* 1/*
2 *
3 * Thermal throttle event support code (such as syslog messaging and rate 2 * Thermal throttle event support code (such as syslog messaging and rate
4 * limiting) that was factored out from x86_64 (mce_intel.c) and i386 (p4.c). 3 * limiting) that was factored out from x86_64 (mce_intel.c) and i386 (p4.c).
4 *
5 * This allows consistent reporting of CPU thermal throttle events. 5 * This allows consistent reporting of CPU thermal throttle events.
6 * 6 *
7 * Maintains a counter in /sys that keeps track of the number of thermal 7 * Maintains a counter in /sys that keeps track of the number of thermal
@@ -13,43 +13,43 @@
13 * Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c. 13 * Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c.
14 * Inspired by Ross Biro's and Al Borchers' counter code. 14 * Inspired by Ross Biro's and Al Borchers' counter code.
15 */ 15 */
16 16#include <linux/notifier.h>
17#include <linux/jiffies.h>
17#include <linux/percpu.h> 18#include <linux/percpu.h>
18#include <linux/sysdev.h> 19#include <linux/sysdev.h>
19#include <linux/cpu.h> 20#include <linux/cpu.h>
20#include <asm/cpu.h> 21
21#include <linux/notifier.h>
22#include <linux/jiffies.h>
23#include <asm/therm_throt.h> 22#include <asm/therm_throt.h>
24 23
25/* How long to wait between reporting thermal events */ 24/* How long to wait between reporting thermal events */
26#define CHECK_INTERVAL (300 * HZ) 25#define CHECK_INTERVAL (300 * HZ)
27 26
28static DEFINE_PER_CPU(__u64, next_check) = INITIAL_JIFFIES; 27static DEFINE_PER_CPU(__u64, next_check) = INITIAL_JIFFIES;
29static DEFINE_PER_CPU(unsigned long, thermal_throttle_count); 28static DEFINE_PER_CPU(unsigned long, thermal_throttle_count);
30atomic_t therm_throt_en = ATOMIC_INIT(0); 29
30atomic_t therm_throt_en = ATOMIC_INIT(0);
31 31
32#ifdef CONFIG_SYSFS 32#ifdef CONFIG_SYSFS
33#define define_therm_throt_sysdev_one_ro(_name) \ 33#define define_therm_throt_sysdev_one_ro(_name) \
34 static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL) 34 static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL)
35 35
36#define define_therm_throt_sysdev_show_func(name) \ 36#define define_therm_throt_sysdev_show_func(name) \
37static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \ 37static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \
38 struct sysdev_attribute *attr, \ 38 struct sysdev_attribute *attr, \
39 char *buf) \ 39 char *buf) \
40{ \ 40{ \
41 unsigned int cpu = dev->id; \ 41 unsigned int cpu = dev->id; \
42 ssize_t ret; \ 42 ssize_t ret; \
43 \ 43 \
44 preempt_disable(); /* CPU hotplug */ \ 44 preempt_disable(); /* CPU hotplug */ \
45 if (cpu_online(cpu)) \ 45 if (cpu_online(cpu)) \
46 ret = sprintf(buf, "%lu\n", \ 46 ret = sprintf(buf, "%lu\n", \
47 per_cpu(thermal_throttle_##name, cpu)); \ 47 per_cpu(thermal_throttle_##name, cpu)); \
48 else \ 48 else \
49 ret = 0; \ 49 ret = 0; \
50 preempt_enable(); \ 50 preempt_enable(); \
51 \ 51 \
52 return ret; \ 52 return ret; \
53} 53}
54 54
55define_therm_throt_sysdev_show_func(count); 55define_therm_throt_sysdev_show_func(count);
@@ -61,8 +61,8 @@ static struct attribute *thermal_throttle_attrs[] = {
61}; 61};
62 62
63static struct attribute_group thermal_throttle_attr_group = { 63static struct attribute_group thermal_throttle_attr_group = {
64 .attrs = thermal_throttle_attrs, 64 .attrs = thermal_throttle_attrs,
65 .name = "thermal_throttle" 65 .name = "thermal_throttle"
66}; 66};
67#endif /* CONFIG_SYSFS */ 67#endif /* CONFIG_SYSFS */
68 68
@@ -110,10 +110,11 @@ int therm_throt_process(int curr)
110} 110}
111 111
112#ifdef CONFIG_SYSFS 112#ifdef CONFIG_SYSFS
113/* Add/Remove thermal_throttle interface for CPU device */ 113/* Add/Remove thermal_throttle interface for CPU device: */
114static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev) 114static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev)
115{ 115{
116 return sysfs_create_group(&sys_dev->kobj, &thermal_throttle_attr_group); 116 return sysfs_create_group(&sys_dev->kobj,
117 &thermal_throttle_attr_group);
117} 118}
118 119
119static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev) 120static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev)
@@ -121,19 +122,21 @@ static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev)
121 sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group); 122 sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group);
122} 123}
123 124
124/* Mutex protecting device creation against CPU hotplug */ 125/* Mutex protecting device creation against CPU hotplug: */
125static DEFINE_MUTEX(therm_cpu_lock); 126static DEFINE_MUTEX(therm_cpu_lock);
126 127
127/* Get notified when a cpu comes on/off. Be hotplug friendly. */ 128/* Get notified when a cpu comes on/off. Be hotplug friendly. */
128static __cpuinit int thermal_throttle_cpu_callback(struct notifier_block *nfb, 129static __cpuinit int
129 unsigned long action, 130thermal_throttle_cpu_callback(struct notifier_block *nfb,
130 void *hcpu) 131 unsigned long action,
132 void *hcpu)
131{ 133{
132 unsigned int cpu = (unsigned long)hcpu; 134 unsigned int cpu = (unsigned long)hcpu;
133 struct sys_device *sys_dev; 135 struct sys_device *sys_dev;
134 int err = 0; 136 int err = 0;
135 137
136 sys_dev = get_cpu_sysdev(cpu); 138 sys_dev = get_cpu_sysdev(cpu);
139
137 switch (action) { 140 switch (action) {
138 case CPU_UP_PREPARE: 141 case CPU_UP_PREPARE:
139 case CPU_UP_PREPARE_FROZEN: 142 case CPU_UP_PREPARE_FROZEN:
diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c
index 23ee9e730f78..d746df2909c9 100644
--- a/arch/x86/kernel/cpu/mcheck/threshold.c
+++ b/arch/x86/kernel/cpu/mcheck/threshold.c
@@ -17,7 +17,7 @@ static void default_threshold_interrupt(void)
17 17
18void (*mce_threshold_vector)(void) = default_threshold_interrupt; 18void (*mce_threshold_vector)(void) = default_threshold_interrupt;
19 19
20asmlinkage void mce_threshold_interrupt(void) 20asmlinkage void smp_threshold_interrupt(void)
21{ 21{
22 exit_idle(); 22 exit_idle();
23 irq_enter(); 23 irq_enter();
diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c
index 2a043d89811d..81b02487090b 100644
--- a/arch/x86/kernel/cpu/mcheck/winchip.c
+++ b/arch/x86/kernel/cpu/mcheck/winchip.c
@@ -2,11 +2,10 @@
2 * IDT Winchip specific Machine Check Exception Reporting 2 * IDT Winchip specific Machine Check Exception Reporting
3 * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk> 3 * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>
4 */ 4 */
5
6#include <linux/init.h>
7#include <linux/types.h>
8#include <linux/kernel.h>
9#include <linux/interrupt.h> 5#include <linux/interrupt.h>
6#include <linux/kernel.h>
7#include <linux/types.h>
8#include <linux/init.h>
10 9
11#include <asm/processor.h> 10#include <asm/processor.h>
12#include <asm/system.h> 11#include <asm/system.h>
@@ -14,7 +13,7 @@
14 13
15#include "mce.h" 14#include "mce.h"
16 15
17/* Machine check handler for WinChip C6 */ 16/* Machine check handler for WinChip C6: */
18static void winchip_machine_check(struct pt_regs *regs, long error_code) 17static void winchip_machine_check(struct pt_regs *regs, long error_code)
19{ 18{
20 printk(KERN_EMERG "CPU0: Machine Check Exception.\n"); 19 printk(KERN_EMERG "CPU0: Machine Check Exception.\n");
@@ -25,12 +24,18 @@ static void winchip_machine_check(struct pt_regs *regs, long error_code)
25void winchip_mcheck_init(struct cpuinfo_x86 *c) 24void winchip_mcheck_init(struct cpuinfo_x86 *c)
26{ 25{
27 u32 lo, hi; 26 u32 lo, hi;
27
28 machine_check_vector = winchip_machine_check; 28 machine_check_vector = winchip_machine_check;
29 /* Make sure the vector pointer is visible before we enable MCEs: */
29 wmb(); 30 wmb();
31
30 rdmsr(MSR_IDT_FCR1, lo, hi); 32 rdmsr(MSR_IDT_FCR1, lo, hi);
31 lo |= (1<<2); /* Enable EIERRINT (int 18 MCE) */ 33 lo |= (1<<2); /* Enable EIERRINT (int 18 MCE) */
32 lo &= ~(1<<4); /* Enable MCE */ 34 lo &= ~(1<<4); /* Enable MCE */
33 wrmsr(MSR_IDT_FCR1, lo, hi); 35 wrmsr(MSR_IDT_FCR1, lo, hi);
36
34 set_in_cr4(X86_CR4_MCE); 37 set_in_cr4(X86_CR4_MCE);
35 printk(KERN_INFO "Winchip machine check reporting enabled on CPU#0.\n"); 38
39 printk(KERN_INFO
40 "Winchip machine check reporting enabled on CPU#0.\n");
36} 41}
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index ce0fe4b5c04f..1d584a18a50d 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -808,7 +808,7 @@ int __init mtrr_cleanup(unsigned address_bits)
808 808
809 if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1) 809 if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1)
810 return 0; 810 return 0;
811 rdmsr(MTRRdefType_MSR, def, dummy); 811 rdmsr(MSR_MTRRdefType, def, dummy);
812 def &= 0xff; 812 def &= 0xff;
813 if (def != MTRR_TYPE_UNCACHABLE) 813 if (def != MTRR_TYPE_UNCACHABLE)
814 return 0; 814 return 0;
@@ -1003,7 +1003,7 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
1003 */ 1003 */
1004 if (!is_cpu(INTEL) || disable_mtrr_trim) 1004 if (!is_cpu(INTEL) || disable_mtrr_trim)
1005 return 0; 1005 return 0;
1006 rdmsr(MTRRdefType_MSR, def, dummy); 1006 rdmsr(MSR_MTRRdefType, def, dummy);
1007 def &= 0xff; 1007 def &= 0xff;
1008 if (def != MTRR_TYPE_UNCACHABLE) 1008 if (def != MTRR_TYPE_UNCACHABLE)
1009 return 0; 1009 return 0;
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 0b776c09aff3..0543f69f0b27 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -20,9 +20,9 @@ struct fixed_range_block {
20}; 20};
21 21
22static struct fixed_range_block fixed_range_blocks[] = { 22static struct fixed_range_block fixed_range_blocks[] = {
23 { MTRRfix64K_00000_MSR, 1 }, /* one 64k MTRR */ 23 { MSR_MTRRfix64K_00000, 1 }, /* one 64k MTRR */
24 { MTRRfix16K_80000_MSR, 2 }, /* two 16k MTRRs */ 24 { MSR_MTRRfix16K_80000, 2 }, /* two 16k MTRRs */
25 { MTRRfix4K_C0000_MSR, 8 }, /* eight 4k MTRRs */ 25 { MSR_MTRRfix4K_C0000, 8 }, /* eight 4k MTRRs */
26 {} 26 {}
27}; 27};
28 28
@@ -194,12 +194,12 @@ get_fixed_ranges(mtrr_type * frs)
194 194
195 k8_check_syscfg_dram_mod_en(); 195 k8_check_syscfg_dram_mod_en();
196 196
197 rdmsr(MTRRfix64K_00000_MSR, p[0], p[1]); 197 rdmsr(MSR_MTRRfix64K_00000, p[0], p[1]);
198 198
199 for (i = 0; i < 2; i++) 199 for (i = 0; i < 2; i++)
200 rdmsr(MTRRfix16K_80000_MSR + i, p[2 + i * 2], p[3 + i * 2]); 200 rdmsr(MSR_MTRRfix16K_80000 + i, p[2 + i * 2], p[3 + i * 2]);
201 for (i = 0; i < 8; i++) 201 for (i = 0; i < 8; i++)
202 rdmsr(MTRRfix4K_C0000_MSR + i, p[6 + i * 2], p[7 + i * 2]); 202 rdmsr(MSR_MTRRfix4K_C0000 + i, p[6 + i * 2], p[7 + i * 2]);
203} 203}
204 204
205void mtrr_save_fixed_ranges(void *info) 205void mtrr_save_fixed_ranges(void *info)
@@ -275,7 +275,11 @@ static void __init print_mtrr_state(void)
275 } 275 }
276 printk(KERN_DEBUG "MTRR variable ranges %sabled:\n", 276 printk(KERN_DEBUG "MTRR variable ranges %sabled:\n",
277 mtrr_state.enabled & 2 ? "en" : "dis"); 277 mtrr_state.enabled & 2 ? "en" : "dis");
278 high_width = ((size_or_mask ? ffs(size_or_mask) - 1 : 32) - (32 - PAGE_SHIFT) + 3) / 4; 278 if (size_or_mask & 0xffffffffUL)
279 high_width = ffs(size_or_mask & 0xffffffffUL) - 1;
280 else
281 high_width = ffs(size_or_mask>>32) + 32 - 1;
282 high_width = (high_width - (32 - PAGE_SHIFT) + 3) / 4;
279 for (i = 0; i < num_var_ranges; ++i) { 283 for (i = 0; i < num_var_ranges; ++i) {
280 if (mtrr_state.var_ranges[i].mask_lo & (1 << 11)) 284 if (mtrr_state.var_ranges[i].mask_lo & (1 << 11))
281 printk(KERN_DEBUG " %u base %0*X%05X000 mask %0*X%05X000 %s\n", 285 printk(KERN_DEBUG " %u base %0*X%05X000 mask %0*X%05X000 %s\n",
@@ -306,7 +310,7 @@ void __init get_mtrr_state(void)
306 310
307 vrs = mtrr_state.var_ranges; 311 vrs = mtrr_state.var_ranges;
308 312
309 rdmsr(MTRRcap_MSR, lo, dummy); 313 rdmsr(MSR_MTRRcap, lo, dummy);
310 mtrr_state.have_fixed = (lo >> 8) & 1; 314 mtrr_state.have_fixed = (lo >> 8) & 1;
311 315
312 for (i = 0; i < num_var_ranges; i++) 316 for (i = 0; i < num_var_ranges; i++)
@@ -314,7 +318,7 @@ void __init get_mtrr_state(void)
314 if (mtrr_state.have_fixed) 318 if (mtrr_state.have_fixed)
315 get_fixed_ranges(mtrr_state.fixed_ranges); 319 get_fixed_ranges(mtrr_state.fixed_ranges);
316 320
317 rdmsr(MTRRdefType_MSR, lo, dummy); 321 rdmsr(MSR_MTRRdefType, lo, dummy);
318 mtrr_state.def_type = (lo & 0xff); 322 mtrr_state.def_type = (lo & 0xff);
319 mtrr_state.enabled = (lo & 0xc00) >> 10; 323 mtrr_state.enabled = (lo & 0xc00) >> 10;
320 324
@@ -579,10 +583,10 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
579 __flush_tlb(); 583 __flush_tlb();
580 584
581 /* Save MTRR state */ 585 /* Save MTRR state */
582 rdmsr(MTRRdefType_MSR, deftype_lo, deftype_hi); 586 rdmsr(MSR_MTRRdefType, deftype_lo, deftype_hi);
583 587
584 /* Disable MTRRs, and set the default type to uncached */ 588 /* Disable MTRRs, and set the default type to uncached */
585 mtrr_wrmsr(MTRRdefType_MSR, deftype_lo & ~0xcff, deftype_hi); 589 mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi);
586} 590}
587 591
588static void post_set(void) __releases(set_atomicity_lock) 592static void post_set(void) __releases(set_atomicity_lock)
@@ -591,7 +595,7 @@ static void post_set(void) __releases(set_atomicity_lock)
591 __flush_tlb(); 595 __flush_tlb();
592 596
593 /* Intel (P6) standard MTRRs */ 597 /* Intel (P6) standard MTRRs */
594 mtrr_wrmsr(MTRRdefType_MSR, deftype_lo, deftype_hi); 598 mtrr_wrmsr(MSR_MTRRdefType, deftype_lo, deftype_hi);
595 599
596 /* Enable caches */ 600 /* Enable caches */
597 write_cr0(read_cr0() & 0xbfffffff); 601 write_cr0(read_cr0() & 0xbfffffff);
@@ -703,7 +707,7 @@ int generic_validate_add_page(unsigned long base, unsigned long size, unsigned i
703static int generic_have_wrcomb(void) 707static int generic_have_wrcomb(void)
704{ 708{
705 unsigned long config, dummy; 709 unsigned long config, dummy;
706 rdmsr(MTRRcap_MSR, config, dummy); 710 rdmsr(MSR_MTRRcap, config, dummy);
707 return (config & (1 << 10)); 711 return (config & (1 << 10));
708} 712}
709 713
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 03cda01f57c7..8fc248b5aeaf 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -104,7 +104,7 @@ static void __init set_num_var_ranges(void)
104 unsigned long config = 0, dummy; 104 unsigned long config = 0, dummy;
105 105
106 if (use_intel()) { 106 if (use_intel()) {
107 rdmsr(MTRRcap_MSR, config, dummy); 107 rdmsr(MSR_MTRRcap, config, dummy);
108 } else if (is_cpu(AMD)) 108 } else if (is_cpu(AMD))
109 config = 2; 109 config = 2;
110 else if (is_cpu(CYRIX) || is_cpu(CENTAUR)) 110 else if (is_cpu(CYRIX) || is_cpu(CENTAUR))
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index 77f67f7b347a..7538b767f206 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -5,21 +5,6 @@
5#include <linux/types.h> 5#include <linux/types.h>
6#include <linux/stddef.h> 6#include <linux/stddef.h>
7 7
8#define MTRRcap_MSR 0x0fe
9#define MTRRdefType_MSR 0x2ff
10
11#define MTRRfix64K_00000_MSR 0x250
12#define MTRRfix16K_80000_MSR 0x258
13#define MTRRfix16K_A0000_MSR 0x259
14#define MTRRfix4K_C0000_MSR 0x268
15#define MTRRfix4K_C8000_MSR 0x269
16#define MTRRfix4K_D0000_MSR 0x26a
17#define MTRRfix4K_D8000_MSR 0x26b
18#define MTRRfix4K_E0000_MSR 0x26c
19#define MTRRfix4K_E8000_MSR 0x26d
20#define MTRRfix4K_F0000_MSR 0x26e
21#define MTRRfix4K_F8000_MSR 0x26f
22
23#define MTRR_CHANGE_MASK_FIXED 0x01 8#define MTRR_CHANGE_MASK_FIXED 0x01
24#define MTRR_CHANGE_MASK_VARIABLE 0x02 9#define MTRR_CHANGE_MASK_VARIABLE 0x02
25#define MTRR_CHANGE_MASK_DEFTYPE 0x04 10#define MTRR_CHANGE_MASK_DEFTYPE 0x04
diff --git a/arch/x86/kernel/cpu/mtrr/state.c b/arch/x86/kernel/cpu/mtrr/state.c
index 7f7e2753685b..1f5fb1588d1f 100644
--- a/arch/x86/kernel/cpu/mtrr/state.c
+++ b/arch/x86/kernel/cpu/mtrr/state.c
@@ -35,7 +35,7 @@ void set_mtrr_prepare_save(struct set_mtrr_context *ctxt)
35 35
36 if (use_intel()) 36 if (use_intel())
37 /* Save MTRR state */ 37 /* Save MTRR state */
38 rdmsr(MTRRdefType_MSR, ctxt->deftype_lo, ctxt->deftype_hi); 38 rdmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi);
39 else 39 else
40 /* Cyrix ARRs - everything else was excluded at the top */ 40 /* Cyrix ARRs - everything else was excluded at the top */
41 ctxt->ccr3 = getCx86(CX86_CCR3); 41 ctxt->ccr3 = getCx86(CX86_CCR3);
@@ -46,7 +46,7 @@ void set_mtrr_cache_disable(struct set_mtrr_context *ctxt)
46{ 46{
47 if (use_intel()) 47 if (use_intel())
48 /* Disable MTRRs, and set the default type to uncached */ 48 /* Disable MTRRs, and set the default type to uncached */
49 mtrr_wrmsr(MTRRdefType_MSR, ctxt->deftype_lo & 0xf300UL, 49 mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo & 0xf300UL,
50 ctxt->deftype_hi); 50 ctxt->deftype_hi);
51 else if (is_cpu(CYRIX)) 51 else if (is_cpu(CYRIX))
52 /* Cyrix ARRs - everything else was excluded at the top */ 52 /* Cyrix ARRs - everything else was excluded at the top */
@@ -64,7 +64,7 @@ void set_mtrr_done(struct set_mtrr_context *ctxt)
64 /* Restore MTRRdefType */ 64 /* Restore MTRRdefType */
65 if (use_intel()) 65 if (use_intel())
66 /* Intel (P6) standard MTRRs */ 66 /* Intel (P6) standard MTRRs */
67 mtrr_wrmsr(MTRRdefType_MSR, ctxt->deftype_lo, ctxt->deftype_hi); 67 mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi);
68 else 68 else
69 /* Cyrix ARRs - everything else was excluded at the top */ 69 /* Cyrix ARRs - everything else was excluded at the top */
70 setCx86(CX86_CCR3, ctxt->ccr3); 70 setCx86(CX86_CCR3, ctxt->ccr3);
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
new file mode 100644
index 000000000000..275bc142cd5d
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -0,0 +1,1711 @@
1/*
2 * Performance counter x86 architecture code
3 *
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2009 Jaswinder Singh Rajput
7 * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
8 * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
9 *
10 * For licencing details see kernel-base/COPYING
11 */
12
13#include <linux/perf_counter.h>
14#include <linux/capability.h>
15#include <linux/notifier.h>
16#include <linux/hardirq.h>
17#include <linux/kprobes.h>
18#include <linux/module.h>
19#include <linux/kdebug.h>
20#include <linux/sched.h>
21#include <linux/uaccess.h>
22
23#include <asm/apic.h>
24#include <asm/stacktrace.h>
25#include <asm/nmi.h>
26
27static u64 perf_counter_mask __read_mostly;
28
29struct cpu_hw_counters {
30 struct perf_counter *counters[X86_PMC_IDX_MAX];
31 unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
32 unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
33 unsigned long interrupts;
34 int enabled;
35};
36
37/*
38 * struct x86_pmu - generic x86 pmu
39 */
40struct x86_pmu {
41 const char *name;
42 int version;
43 int (*handle_irq)(struct pt_regs *);
44 void (*disable_all)(void);
45 void (*enable_all)(void);
46 void (*enable)(struct hw_perf_counter *, int);
47 void (*disable)(struct hw_perf_counter *, int);
48 unsigned eventsel;
49 unsigned perfctr;
50 u64 (*event_map)(int);
51 u64 (*raw_event)(u64);
52 int max_events;
53 int num_counters;
54 int num_counters_fixed;
55 int counter_bits;
56 u64 counter_mask;
57 u64 max_period;
58 u64 intel_ctrl;
59};
60
61static struct x86_pmu x86_pmu __read_mostly;
62
63static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = {
64 .enabled = 1,
65};
66
67/*
68 * Intel PerfMon v3. Used on Core2 and later.
69 */
70static const u64 intel_perfmon_event_map[] =
71{
72 [PERF_COUNT_HW_CPU_CYCLES] = 0x003c,
73 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
74 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e,
75 [PERF_COUNT_HW_CACHE_MISSES] = 0x412e,
76 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
77 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
78 [PERF_COUNT_HW_BUS_CYCLES] = 0x013c,
79};
80
81static u64 intel_pmu_event_map(int event)
82{
83 return intel_perfmon_event_map[event];
84}
85
86/*
87 * Generalized hw caching related event table, filled
88 * in on a per model basis. A value of 0 means
89 * 'not supported', -1 means 'event makes no sense on
90 * this CPU', any other value means the raw event
91 * ID.
92 */
93
94#define C(x) PERF_COUNT_HW_CACHE_##x
95
96static u64 __read_mostly hw_cache_event_ids
97 [PERF_COUNT_HW_CACHE_MAX]
98 [PERF_COUNT_HW_CACHE_OP_MAX]
99 [PERF_COUNT_HW_CACHE_RESULT_MAX];
100
101static const u64 nehalem_hw_cache_event_ids
102 [PERF_COUNT_HW_CACHE_MAX]
103 [PERF_COUNT_HW_CACHE_OP_MAX]
104 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
105{
106 [ C(L1D) ] = {
107 [ C(OP_READ) ] = {
108 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */
109 [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */
110 },
111 [ C(OP_WRITE) ] = {
112 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */
113 [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */
114 },
115 [ C(OP_PREFETCH) ] = {
116 [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */
117 [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */
118 },
119 },
120 [ C(L1I ) ] = {
121 [ C(OP_READ) ] = {
122 [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
123 [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
124 },
125 [ C(OP_WRITE) ] = {
126 [ C(RESULT_ACCESS) ] = -1,
127 [ C(RESULT_MISS) ] = -1,
128 },
129 [ C(OP_PREFETCH) ] = {
130 [ C(RESULT_ACCESS) ] = 0x0,
131 [ C(RESULT_MISS) ] = 0x0,
132 },
133 },
134 [ C(LL ) ] = {
135 [ C(OP_READ) ] = {
136 [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */
137 [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */
138 },
139 [ C(OP_WRITE) ] = {
140 [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */
141 [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */
142 },
143 [ C(OP_PREFETCH) ] = {
144 [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */
145 [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */
146 },
147 },
148 [ C(DTLB) ] = {
149 [ C(OP_READ) ] = {
150 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
151 [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */
152 },
153 [ C(OP_WRITE) ] = {
154 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
155 [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */
156 },
157 [ C(OP_PREFETCH) ] = {
158 [ C(RESULT_ACCESS) ] = 0x0,
159 [ C(RESULT_MISS) ] = 0x0,
160 },
161 },
162 [ C(ITLB) ] = {
163 [ C(OP_READ) ] = {
164 [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */
165 [ C(RESULT_MISS) ] = 0x20c8, /* ITLB_MISS_RETIRED */
166 },
167 [ C(OP_WRITE) ] = {
168 [ C(RESULT_ACCESS) ] = -1,
169 [ C(RESULT_MISS) ] = -1,
170 },
171 [ C(OP_PREFETCH) ] = {
172 [ C(RESULT_ACCESS) ] = -1,
173 [ C(RESULT_MISS) ] = -1,
174 },
175 },
176 [ C(BPU ) ] = {
177 [ C(OP_READ) ] = {
178 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
179 [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */
180 },
181 [ C(OP_WRITE) ] = {
182 [ C(RESULT_ACCESS) ] = -1,
183 [ C(RESULT_MISS) ] = -1,
184 },
185 [ C(OP_PREFETCH) ] = {
186 [ C(RESULT_ACCESS) ] = -1,
187 [ C(RESULT_MISS) ] = -1,
188 },
189 },
190};
191
192static const u64 core2_hw_cache_event_ids
193 [PERF_COUNT_HW_CACHE_MAX]
194 [PERF_COUNT_HW_CACHE_OP_MAX]
195 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
196{
197 [ C(L1D) ] = {
198 [ C(OP_READ) ] = {
199 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */
200 [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */
201 },
202 [ C(OP_WRITE) ] = {
203 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */
204 [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */
205 },
206 [ C(OP_PREFETCH) ] = {
207 [ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS */
208 [ C(RESULT_MISS) ] = 0,
209 },
210 },
211 [ C(L1I ) ] = {
212 [ C(OP_READ) ] = {
213 [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */
214 [ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */
215 },
216 [ C(OP_WRITE) ] = {
217 [ C(RESULT_ACCESS) ] = -1,
218 [ C(RESULT_MISS) ] = -1,
219 },
220 [ C(OP_PREFETCH) ] = {
221 [ C(RESULT_ACCESS) ] = 0,
222 [ C(RESULT_MISS) ] = 0,
223 },
224 },
225 [ C(LL ) ] = {
226 [ C(OP_READ) ] = {
227 [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
228 [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
229 },
230 [ C(OP_WRITE) ] = {
231 [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */
232 [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */
233 },
234 [ C(OP_PREFETCH) ] = {
235 [ C(RESULT_ACCESS) ] = 0,
236 [ C(RESULT_MISS) ] = 0,
237 },
238 },
239 [ C(DTLB) ] = {
240 [ C(OP_READ) ] = {
241 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
242 [ C(RESULT_MISS) ] = 0x0208, /* DTLB_MISSES.MISS_LD */
243 },
244 [ C(OP_WRITE) ] = {
245 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
246 [ C(RESULT_MISS) ] = 0x0808, /* DTLB_MISSES.MISS_ST */
247 },
248 [ C(OP_PREFETCH) ] = {
249 [ C(RESULT_ACCESS) ] = 0,
250 [ C(RESULT_MISS) ] = 0,
251 },
252 },
253 [ C(ITLB) ] = {
254 [ C(OP_READ) ] = {
255 [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
256 [ C(RESULT_MISS) ] = 0x1282, /* ITLBMISSES */
257 },
258 [ C(OP_WRITE) ] = {
259 [ C(RESULT_ACCESS) ] = -1,
260 [ C(RESULT_MISS) ] = -1,
261 },
262 [ C(OP_PREFETCH) ] = {
263 [ C(RESULT_ACCESS) ] = -1,
264 [ C(RESULT_MISS) ] = -1,
265 },
266 },
267 [ C(BPU ) ] = {
268 [ C(OP_READ) ] = {
269 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
270 [ C(RESULT_MISS) ] = 0x00c5, /* BR_INST_RETIRED.MISPRED */
271 },
272 [ C(OP_WRITE) ] = {
273 [ C(RESULT_ACCESS) ] = -1,
274 [ C(RESULT_MISS) ] = -1,
275 },
276 [ C(OP_PREFETCH) ] = {
277 [ C(RESULT_ACCESS) ] = -1,
278 [ C(RESULT_MISS) ] = -1,
279 },
280 },
281};
282
283static const u64 atom_hw_cache_event_ids
284 [PERF_COUNT_HW_CACHE_MAX]
285 [PERF_COUNT_HW_CACHE_OP_MAX]
286 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
287{
288 [ C(L1D) ] = {
289 [ C(OP_READ) ] = {
290 [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD */
291 [ C(RESULT_MISS) ] = 0,
292 },
293 [ C(OP_WRITE) ] = {
294 [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST */
295 [ C(RESULT_MISS) ] = 0,
296 },
297 [ C(OP_PREFETCH) ] = {
298 [ C(RESULT_ACCESS) ] = 0x0,
299 [ C(RESULT_MISS) ] = 0,
300 },
301 },
302 [ C(L1I ) ] = {
303 [ C(OP_READ) ] = {
304 [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
305 [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
306 },
307 [ C(OP_WRITE) ] = {
308 [ C(RESULT_ACCESS) ] = -1,
309 [ C(RESULT_MISS) ] = -1,
310 },
311 [ C(OP_PREFETCH) ] = {
312 [ C(RESULT_ACCESS) ] = 0,
313 [ C(RESULT_MISS) ] = 0,
314 },
315 },
316 [ C(LL ) ] = {
317 [ C(OP_READ) ] = {
318 [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
319 [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
320 },
321 [ C(OP_WRITE) ] = {
322 [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */
323 [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */
324 },
325 [ C(OP_PREFETCH) ] = {
326 [ C(RESULT_ACCESS) ] = 0,
327 [ C(RESULT_MISS) ] = 0,
328 },
329 },
330 [ C(DTLB) ] = {
331 [ C(OP_READ) ] = {
332 [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD (alias) */
333 [ C(RESULT_MISS) ] = 0x0508, /* DTLB_MISSES.MISS_LD */
334 },
335 [ C(OP_WRITE) ] = {
336 [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST (alias) */
337 [ C(RESULT_MISS) ] = 0x0608, /* DTLB_MISSES.MISS_ST */
338 },
339 [ C(OP_PREFETCH) ] = {
340 [ C(RESULT_ACCESS) ] = 0,
341 [ C(RESULT_MISS) ] = 0,
342 },
343 },
344 [ C(ITLB) ] = {
345 [ C(OP_READ) ] = {
346 [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
347 [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */
348 },
349 [ C(OP_WRITE) ] = {
350 [ C(RESULT_ACCESS) ] = -1,
351 [ C(RESULT_MISS) ] = -1,
352 },
353 [ C(OP_PREFETCH) ] = {
354 [ C(RESULT_ACCESS) ] = -1,
355 [ C(RESULT_MISS) ] = -1,
356 },
357 },
358 [ C(BPU ) ] = {
359 [ C(OP_READ) ] = {
360 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
361 [ C(RESULT_MISS) ] = 0x00c5, /* BR_INST_RETIRED.MISPRED */
362 },
363 [ C(OP_WRITE) ] = {
364 [ C(RESULT_ACCESS) ] = -1,
365 [ C(RESULT_MISS) ] = -1,
366 },
367 [ C(OP_PREFETCH) ] = {
368 [ C(RESULT_ACCESS) ] = -1,
369 [ C(RESULT_MISS) ] = -1,
370 },
371 },
372};
373
374static u64 intel_pmu_raw_event(u64 event)
375{
376#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL
377#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL
378#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL
379#define CORE_EVNTSEL_INV_MASK 0x00800000ULL
380#define CORE_EVNTSEL_COUNTER_MASK 0xFF000000ULL
381
382#define CORE_EVNTSEL_MASK \
383 (CORE_EVNTSEL_EVENT_MASK | \
384 CORE_EVNTSEL_UNIT_MASK | \
385 CORE_EVNTSEL_EDGE_MASK | \
386 CORE_EVNTSEL_INV_MASK | \
387 CORE_EVNTSEL_COUNTER_MASK)
388
389 return event & CORE_EVNTSEL_MASK;
390}
391
392static const u64 amd_0f_hw_cache_event_ids
393 [PERF_COUNT_HW_CACHE_MAX]
394 [PERF_COUNT_HW_CACHE_OP_MAX]
395 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
396{
397 [ C(L1D) ] = {
398 [ C(OP_READ) ] = {
399 [ C(RESULT_ACCESS) ] = 0,
400 [ C(RESULT_MISS) ] = 0,
401 },
402 [ C(OP_WRITE) ] = {
403 [ C(RESULT_ACCESS) ] = 0,
404 [ C(RESULT_MISS) ] = 0,
405 },
406 [ C(OP_PREFETCH) ] = {
407 [ C(RESULT_ACCESS) ] = 0,
408 [ C(RESULT_MISS) ] = 0,
409 },
410 },
411 [ C(L1I ) ] = {
412 [ C(OP_READ) ] = {
413 [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches */
414 [ C(RESULT_MISS) ] = 0x0081, /* Instruction cache misses */
415 },
416 [ C(OP_WRITE) ] = {
417 [ C(RESULT_ACCESS) ] = -1,
418 [ C(RESULT_MISS) ] = -1,
419 },
420 [ C(OP_PREFETCH) ] = {
421 [ C(RESULT_ACCESS) ] = 0,
422 [ C(RESULT_MISS) ] = 0,
423 },
424 },
425 [ C(LL ) ] = {
426 [ C(OP_READ) ] = {
427 [ C(RESULT_ACCESS) ] = 0,
428 [ C(RESULT_MISS) ] = 0,
429 },
430 [ C(OP_WRITE) ] = {
431 [ C(RESULT_ACCESS) ] = 0,
432 [ C(RESULT_MISS) ] = 0,
433 },
434 [ C(OP_PREFETCH) ] = {
435 [ C(RESULT_ACCESS) ] = 0,
436 [ C(RESULT_MISS) ] = 0,
437 },
438 },
439 [ C(DTLB) ] = {
440 [ C(OP_READ) ] = {
441 [ C(RESULT_ACCESS) ] = 0,
442 [ C(RESULT_MISS) ] = 0,
443 },
444 [ C(OP_WRITE) ] = {
445 [ C(RESULT_ACCESS) ] = 0,
446 [ C(RESULT_MISS) ] = 0,
447 },
448 [ C(OP_PREFETCH) ] = {
449 [ C(RESULT_ACCESS) ] = 0,
450 [ C(RESULT_MISS) ] = 0,
451 },
452 },
453 [ C(ITLB) ] = {
454 [ C(OP_READ) ] = {
455 [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes */
456 [ C(RESULT_MISS) ] = 0x0085, /* Instr. fetch ITLB misses */
457 },
458 [ C(OP_WRITE) ] = {
459 [ C(RESULT_ACCESS) ] = -1,
460 [ C(RESULT_MISS) ] = -1,
461 },
462 [ C(OP_PREFETCH) ] = {
463 [ C(RESULT_ACCESS) ] = -1,
464 [ C(RESULT_MISS) ] = -1,
465 },
466 },
467 [ C(BPU ) ] = {
468 [ C(OP_READ) ] = {
469 [ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr. */
470 [ C(RESULT_MISS) ] = 0x00c3, /* Retired Mispredicted BI */
471 },
472 [ C(OP_WRITE) ] = {
473 [ C(RESULT_ACCESS) ] = -1,
474 [ C(RESULT_MISS) ] = -1,
475 },
476 [ C(OP_PREFETCH) ] = {
477 [ C(RESULT_ACCESS) ] = -1,
478 [ C(RESULT_MISS) ] = -1,
479 },
480 },
481};
482
483/*
484 * AMD Performance Monitor K7 and later.
485 */
486static const u64 amd_perfmon_event_map[] =
487{
488 [PERF_COUNT_HW_CPU_CYCLES] = 0x0076,
489 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
490 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080,
491 [PERF_COUNT_HW_CACHE_MISSES] = 0x0081,
492 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
493 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
494};
495
496static u64 amd_pmu_event_map(int event)
497{
498 return amd_perfmon_event_map[event];
499}
500
501static u64 amd_pmu_raw_event(u64 event)
502{
503#define K7_EVNTSEL_EVENT_MASK 0x7000000FFULL
504#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL
505#define K7_EVNTSEL_EDGE_MASK 0x000040000ULL
506#define K7_EVNTSEL_INV_MASK 0x000800000ULL
507#define K7_EVNTSEL_COUNTER_MASK 0x0FF000000ULL
508
509#define K7_EVNTSEL_MASK \
510 (K7_EVNTSEL_EVENT_MASK | \
511 K7_EVNTSEL_UNIT_MASK | \
512 K7_EVNTSEL_EDGE_MASK | \
513 K7_EVNTSEL_INV_MASK | \
514 K7_EVNTSEL_COUNTER_MASK)
515
516 return event & K7_EVNTSEL_MASK;
517}
518
519/*
520 * Propagate counter elapsed time into the generic counter.
521 * Can only be executed on the CPU where the counter is active.
522 * Returns the delta events processed.
523 */
524static u64
525x86_perf_counter_update(struct perf_counter *counter,
526 struct hw_perf_counter *hwc, int idx)
527{
528 int shift = 64 - x86_pmu.counter_bits;
529 u64 prev_raw_count, new_raw_count;
530 s64 delta;
531
532 /*
533 * Careful: an NMI might modify the previous counter value.
534 *
535 * Our tactic to handle this is to first atomically read and
536 * exchange a new raw count - then add that new-prev delta
537 * count to the generic counter atomically:
538 */
539again:
540 prev_raw_count = atomic64_read(&hwc->prev_count);
541 rdmsrl(hwc->counter_base + idx, new_raw_count);
542
543 if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
544 new_raw_count) != prev_raw_count)
545 goto again;
546
547 /*
548 * Now we have the new raw value and have updated the prev
549 * timestamp already. We can now calculate the elapsed delta
550 * (counter-)time and add that to the generic counter.
551 *
552 * Careful, not all hw sign-extends above the physical width
553 * of the count.
554 */
555 delta = (new_raw_count << shift) - (prev_raw_count << shift);
556 delta >>= shift;
557
558 atomic64_add(delta, &counter->count);
559 atomic64_sub(delta, &hwc->period_left);
560
561 return new_raw_count;
562}
563
564static atomic_t active_counters;
565static DEFINE_MUTEX(pmc_reserve_mutex);
566
567static bool reserve_pmc_hardware(void)
568{
569 int i;
570
571 if (nmi_watchdog == NMI_LOCAL_APIC)
572 disable_lapic_nmi_watchdog();
573
574 for (i = 0; i < x86_pmu.num_counters; i++) {
575 if (!reserve_perfctr_nmi(x86_pmu.perfctr + i))
576 goto perfctr_fail;
577 }
578
579 for (i = 0; i < x86_pmu.num_counters; i++) {
580 if (!reserve_evntsel_nmi(x86_pmu.eventsel + i))
581 goto eventsel_fail;
582 }
583
584 return true;
585
586eventsel_fail:
587 for (i--; i >= 0; i--)
588 release_evntsel_nmi(x86_pmu.eventsel + i);
589
590 i = x86_pmu.num_counters;
591
592perfctr_fail:
593 for (i--; i >= 0; i--)
594 release_perfctr_nmi(x86_pmu.perfctr + i);
595
596 if (nmi_watchdog == NMI_LOCAL_APIC)
597 enable_lapic_nmi_watchdog();
598
599 return false;
600}
601
602static void release_pmc_hardware(void)
603{
604 int i;
605
606 for (i = 0; i < x86_pmu.num_counters; i++) {
607 release_perfctr_nmi(x86_pmu.perfctr + i);
608 release_evntsel_nmi(x86_pmu.eventsel + i);
609 }
610
611 if (nmi_watchdog == NMI_LOCAL_APIC)
612 enable_lapic_nmi_watchdog();
613}
614
615static void hw_perf_counter_destroy(struct perf_counter *counter)
616{
617 if (atomic_dec_and_mutex_lock(&active_counters, &pmc_reserve_mutex)) {
618 release_pmc_hardware();
619 mutex_unlock(&pmc_reserve_mutex);
620 }
621}
622
623static inline int x86_pmu_initialized(void)
624{
625 return x86_pmu.handle_irq != NULL;
626}
627
628static inline int
629set_ext_hw_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr)
630{
631 unsigned int cache_type, cache_op, cache_result;
632 u64 config, val;
633
634 config = attr->config;
635
636 cache_type = (config >> 0) & 0xff;
637 if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
638 return -EINVAL;
639
640 cache_op = (config >> 8) & 0xff;
641 if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
642 return -EINVAL;
643
644 cache_result = (config >> 16) & 0xff;
645 if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
646 return -EINVAL;
647
648 val = hw_cache_event_ids[cache_type][cache_op][cache_result];
649
650 if (val == 0)
651 return -ENOENT;
652
653 if (val == -1)
654 return -EINVAL;
655
656 hwc->config |= val;
657
658 return 0;
659}
660
661/*
662 * Setup the hardware configuration for a given attr_type
663 */
664static int __hw_perf_counter_init(struct perf_counter *counter)
665{
666 struct perf_counter_attr *attr = &counter->attr;
667 struct hw_perf_counter *hwc = &counter->hw;
668 int err;
669
670 if (!x86_pmu_initialized())
671 return -ENODEV;
672
673 err = 0;
674 if (!atomic_inc_not_zero(&active_counters)) {
675 mutex_lock(&pmc_reserve_mutex);
676 if (atomic_read(&active_counters) == 0 && !reserve_pmc_hardware())
677 err = -EBUSY;
678 else
679 atomic_inc(&active_counters);
680 mutex_unlock(&pmc_reserve_mutex);
681 }
682 if (err)
683 return err;
684
685 /*
686 * Generate PMC IRQs:
687 * (keep 'enabled' bit clear for now)
688 */
689 hwc->config = ARCH_PERFMON_EVENTSEL_INT;
690
691 /*
692 * Count user and OS events unless requested not to.
693 */
694 if (!attr->exclude_user)
695 hwc->config |= ARCH_PERFMON_EVENTSEL_USR;
696 if (!attr->exclude_kernel)
697 hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
698
699 if (!hwc->sample_period) {
700 hwc->sample_period = x86_pmu.max_period;
701 hwc->last_period = hwc->sample_period;
702 atomic64_set(&hwc->period_left, hwc->sample_period);
703 }
704
705 counter->destroy = hw_perf_counter_destroy;
706
707 /*
708 * Raw event type provide the config in the event structure
709 */
710 if (attr->type == PERF_TYPE_RAW) {
711 hwc->config |= x86_pmu.raw_event(attr->config);
712 return 0;
713 }
714
715 if (attr->type == PERF_TYPE_HW_CACHE)
716 return set_ext_hw_attr(hwc, attr);
717
718 if (attr->config >= x86_pmu.max_events)
719 return -EINVAL;
720 /*
721 * The generic map:
722 */
723 hwc->config |= x86_pmu.event_map(attr->config);
724
725 return 0;
726}
727
728static void intel_pmu_disable_all(void)
729{
730 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
731}
732
733static void amd_pmu_disable_all(void)
734{
735 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
736 int idx;
737
738 if (!cpuc->enabled)
739 return;
740
741 cpuc->enabled = 0;
742 /*
743 * ensure we write the disable before we start disabling the
744 * counters proper, so that amd_pmu_enable_counter() does the
745 * right thing.
746 */
747 barrier();
748
749 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
750 u64 val;
751
752 if (!test_bit(idx, cpuc->active_mask))
753 continue;
754 rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
755 if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE))
756 continue;
757 val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
758 wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
759 }
760}
761
762void hw_perf_disable(void)
763{
764 if (!x86_pmu_initialized())
765 return;
766 return x86_pmu.disable_all();
767}
768
769static void intel_pmu_enable_all(void)
770{
771 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
772}
773
774static void amd_pmu_enable_all(void)
775{
776 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
777 int idx;
778
779 if (cpuc->enabled)
780 return;
781
782 cpuc->enabled = 1;
783 barrier();
784
785 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
786 u64 val;
787
788 if (!test_bit(idx, cpuc->active_mask))
789 continue;
790 rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
791 if (val & ARCH_PERFMON_EVENTSEL0_ENABLE)
792 continue;
793 val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
794 wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
795 }
796}
797
798void hw_perf_enable(void)
799{
800 if (!x86_pmu_initialized())
801 return;
802 x86_pmu.enable_all();
803}
804
805static inline u64 intel_pmu_get_status(void)
806{
807 u64 status;
808
809 rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
810
811 return status;
812}
813
814static inline void intel_pmu_ack_status(u64 ack)
815{
816 wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
817}
818
819static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
820{
821 int err;
822 err = checking_wrmsrl(hwc->config_base + idx,
823 hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE);
824}
825
826static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
827{
828 int err;
829 err = checking_wrmsrl(hwc->config_base + idx,
830 hwc->config);
831}
832
833static inline void
834intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx)
835{
836 int idx = __idx - X86_PMC_IDX_FIXED;
837 u64 ctrl_val, mask;
838 int err;
839
840 mask = 0xfULL << (idx * 4);
841
842 rdmsrl(hwc->config_base, ctrl_val);
843 ctrl_val &= ~mask;
844 err = checking_wrmsrl(hwc->config_base, ctrl_val);
845}
846
847static inline void
848intel_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
849{
850 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
851 intel_pmu_disable_fixed(hwc, idx);
852 return;
853 }
854
855 x86_pmu_disable_counter(hwc, idx);
856}
857
/* AMD has only generic counters: forward to the common disable helper. */
static inline void
amd_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
{
	x86_pmu_disable_counter(hwc, idx);
	return;
}
863
864static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]);
865
866/*
867 * Set the next IRQ period, based on the hwc->period_left value.
868 * To be called with the counter disabled in hw:
869 */
870static int
871x86_perf_counter_set_period(struct perf_counter *counter,
872 struct hw_perf_counter *hwc, int idx)
873{
874 s64 left = atomic64_read(&hwc->period_left);
875 s64 period = hwc->sample_period;
876 int err, ret = 0;
877
878 /*
879 * If we are way outside a reasoable range then just skip forward:
880 */
881 if (unlikely(left <= -period)) {
882 left = period;
883 atomic64_set(&hwc->period_left, left);
884 hwc->last_period = period;
885 ret = 1;
886 }
887
888 if (unlikely(left <= 0)) {
889 left += period;
890 atomic64_set(&hwc->period_left, left);
891 hwc->last_period = period;
892 ret = 1;
893 }
894 /*
895 * Quirk: certain CPUs dont like it if just 1 event is left:
896 */
897 if (unlikely(left < 2))
898 left = 2;
899
900 if (left > x86_pmu.max_period)
901 left = x86_pmu.max_period;
902
903 per_cpu(prev_left[idx], smp_processor_id()) = left;
904
905 /*
906 * The hw counter starts counting from this counter offset,
907 * mark it to be able to extra future deltas:
908 */
909 atomic64_set(&hwc->prev_count, (u64)-left);
910
911 err = checking_wrmsrl(hwc->counter_base + idx,
912 (u64)(-left) & x86_pmu.counter_mask);
913
914 return ret;
915}
916
917static inline void
918intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx)
919{
920 int idx = __idx - X86_PMC_IDX_FIXED;
921 u64 ctrl_val, bits, mask;
922 int err;
923
924 /*
925 * Enable IRQ generation (0x8),
926 * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
927 * if requested:
928 */
929 bits = 0x8ULL;
930 if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
931 bits |= 0x2;
932 if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
933 bits |= 0x1;
934 bits <<= (idx * 4);
935 mask = 0xfULL << (idx * 4);
936
937 rdmsrl(hwc->config_base, ctrl_val);
938 ctrl_val &= ~mask;
939 ctrl_val |= bits;
940 err = checking_wrmsrl(hwc->config_base, ctrl_val);
941}
942
943static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
944{
945 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
946 intel_pmu_enable_fixed(hwc, idx);
947 return;
948 }
949
950 x86_pmu_enable_counter(hwc, idx);
951}
952
953static void amd_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
954{
955 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
956
957 if (cpuc->enabled)
958 x86_pmu_enable_counter(hwc, idx);
959 else
960 x86_pmu_disable_counter(hwc, idx);
961}
962
963static int
964fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
965{
966 unsigned int event;
967
968 if (!x86_pmu.num_counters_fixed)
969 return -1;
970
971 /*
972 * Quirk, IA32_FIXED_CTRs do not work on current Atom processors:
973 */
974 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
975 boot_cpu_data.x86_model == 28)
976 return -1;
977
978 event = hwc->config & ARCH_PERFMON_EVENT_MASK;
979
980 if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS)))
981 return X86_PMC_IDX_FIXED_INSTRUCTIONS;
982 if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES)))
983 return X86_PMC_IDX_FIXED_CPU_CYCLES;
984 if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES)))
985 return X86_PMC_IDX_FIXED_BUS_CYCLES;
986
987 return -1;
988}
989
990/*
991 * Find a PMC slot for the freshly enabled / scheduled in counter:
992 */
993static int x86_pmu_enable(struct perf_counter *counter)
994{
995 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
996 struct hw_perf_counter *hwc = &counter->hw;
997 int idx;
998
999 idx = fixed_mode_idx(counter, hwc);
1000 if (idx >= 0) {
1001 /*
1002 * Try to get the fixed counter, if that is already taken
1003 * then try to get a generic counter:
1004 */
1005 if (test_and_set_bit(idx, cpuc->used_mask))
1006 goto try_generic;
1007
1008 hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
1009 /*
1010 * We set it so that counter_base + idx in wrmsr/rdmsr maps to
1011 * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
1012 */
1013 hwc->counter_base =
1014 MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
1015 hwc->idx = idx;
1016 } else {
1017 idx = hwc->idx;
1018 /* Try to get the previous generic counter again */
1019 if (test_and_set_bit(idx, cpuc->used_mask)) {
1020try_generic:
1021 idx = find_first_zero_bit(cpuc->used_mask,
1022 x86_pmu.num_counters);
1023 if (idx == x86_pmu.num_counters)
1024 return -EAGAIN;
1025
1026 set_bit(idx, cpuc->used_mask);
1027 hwc->idx = idx;
1028 }
1029 hwc->config_base = x86_pmu.eventsel;
1030 hwc->counter_base = x86_pmu.perfctr;
1031 }
1032
1033 perf_counters_lapic_init();
1034
1035 x86_pmu.disable(hwc, idx);
1036
1037 cpuc->counters[idx] = counter;
1038 set_bit(idx, cpuc->active_mask);
1039
1040 x86_perf_counter_set_period(counter, hwc, idx);
1041 x86_pmu.enable(hwc, idx);
1042
1043 return 0;
1044}
1045
1046static void x86_pmu_unthrottle(struct perf_counter *counter)
1047{
1048 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
1049 struct hw_perf_counter *hwc = &counter->hw;
1050
1051 if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX ||
1052 cpuc->counters[hwc->idx] != counter))
1053 return;
1054
1055 x86_pmu.enable(hwc, hwc->idx);
1056}
1057
1058void perf_counter_print_debug(void)
1059{
1060 u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
1061 struct cpu_hw_counters *cpuc;
1062 unsigned long flags;
1063 int cpu, idx;
1064
1065 if (!x86_pmu.num_counters)
1066 return;
1067
1068 local_irq_save(flags);
1069
1070 cpu = smp_processor_id();
1071 cpuc = &per_cpu(cpu_hw_counters, cpu);
1072
1073 if (x86_pmu.version >= 2) {
1074 rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
1075 rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
1076 rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
1077 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
1078
1079 pr_info("\n");
1080 pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl);
1081 pr_info("CPU#%d: status: %016llx\n", cpu, status);
1082 pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow);
1083 pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed);
1084 }
1085 pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used_mask);
1086
1087 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1088 rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
1089 rdmsrl(x86_pmu.perfctr + idx, pmc_count);
1090
1091 prev_left = per_cpu(prev_left[idx], cpu);
1092
1093 pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n",
1094 cpu, idx, pmc_ctrl);
1095 pr_info("CPU#%d: gen-PMC%d count: %016llx\n",
1096 cpu, idx, pmc_count);
1097 pr_info("CPU#%d: gen-PMC%d left: %016llx\n",
1098 cpu, idx, prev_left);
1099 }
1100 for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
1101 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
1102
1103 pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
1104 cpu, idx, pmc_count);
1105 }
1106 local_irq_restore(flags);
1107}
1108
1109static void x86_pmu_disable(struct perf_counter *counter)
1110{
1111 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
1112 struct hw_perf_counter *hwc = &counter->hw;
1113 int idx = hwc->idx;
1114
1115 /*
1116 * Must be done before we disable, otherwise the nmi handler
1117 * could reenable again:
1118 */
1119 clear_bit(idx, cpuc->active_mask);
1120 x86_pmu.disable(hwc, idx);
1121
1122 /*
1123 * Make sure the cleared pointer becomes visible before we
1124 * (potentially) free the counter:
1125 */
1126 barrier();
1127
1128 /*
1129 * Drain the remaining delta count out of a counter
1130 * that we are disabling:
1131 */
1132 x86_perf_counter_update(counter, hwc, idx);
1133 cpuc->counters[idx] = NULL;
1134 clear_bit(idx, cpuc->used_mask);
1135}
1136
1137/*
1138 * Save and restart an expired counter. Called by NMI contexts,
1139 * so it has to be careful about preempting normal counter ops:
1140 */
1141static int intel_pmu_save_and_restart(struct perf_counter *counter)
1142{
1143 struct hw_perf_counter *hwc = &counter->hw;
1144 int idx = hwc->idx;
1145 int ret;
1146
1147 x86_perf_counter_update(counter, hwc, idx);
1148 ret = x86_perf_counter_set_period(counter, hwc, idx);
1149
1150 if (counter->state == PERF_COUNTER_STATE_ACTIVE)
1151 intel_pmu_enable_counter(hwc, idx);
1152
1153 return ret;
1154}
1155
1156static void intel_pmu_reset(void)
1157{
1158 unsigned long flags;
1159 int idx;
1160
1161 if (!x86_pmu.num_counters)
1162 return;
1163
1164 local_irq_save(flags);
1165
1166 printk("clearing PMU state on CPU#%d\n", smp_processor_id());
1167
1168 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1169 checking_wrmsrl(x86_pmu.eventsel + idx, 0ull);
1170 checking_wrmsrl(x86_pmu.perfctr + idx, 0ull);
1171 }
1172 for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
1173 checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
1174 }
1175
1176 local_irq_restore(flags);
1177}
1178
1179
1180/*
1181 * This handler is triggered by the local APIC, so the APIC IRQ handling
1182 * rules apply:
1183 */
1184static int intel_pmu_handle_irq(struct pt_regs *regs)
1185{
1186 struct perf_sample_data data;
1187 struct cpu_hw_counters *cpuc;
1188 int bit, cpu, loops;
1189 u64 ack, status;
1190
1191 data.regs = regs;
1192 data.addr = 0;
1193
1194 cpu = smp_processor_id();
1195 cpuc = &per_cpu(cpu_hw_counters, cpu);
1196
1197 perf_disable();
1198 status = intel_pmu_get_status();
1199 if (!status) {
1200 perf_enable();
1201 return 0;
1202 }
1203
1204 loops = 0;
1205again:
1206 if (++loops > 100) {
1207 WARN_ONCE(1, "perfcounters: irq loop stuck!\n");
1208 perf_counter_print_debug();
1209 intel_pmu_reset();
1210 perf_enable();
1211 return 1;
1212 }
1213
1214 inc_irq_stat(apic_perf_irqs);
1215 ack = status;
1216 for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
1217 struct perf_counter *counter = cpuc->counters[bit];
1218
1219 clear_bit(bit, (unsigned long *) &status);
1220 if (!test_bit(bit, cpuc->active_mask))
1221 continue;
1222
1223 if (!intel_pmu_save_and_restart(counter))
1224 continue;
1225
1226 if (perf_counter_overflow(counter, 1, &data))
1227 intel_pmu_disable_counter(&counter->hw, bit);
1228 }
1229
1230 intel_pmu_ack_status(ack);
1231
1232 /*
1233 * Repeat if there is more work to be done:
1234 */
1235 status = intel_pmu_get_status();
1236 if (status)
1237 goto again;
1238
1239 perf_enable();
1240
1241 return 1;
1242}
1243
1244static int amd_pmu_handle_irq(struct pt_regs *regs)
1245{
1246 struct perf_sample_data data;
1247 struct cpu_hw_counters *cpuc;
1248 struct perf_counter *counter;
1249 struct hw_perf_counter *hwc;
1250 int cpu, idx, handled = 0;
1251 u64 val;
1252
1253 data.regs = regs;
1254 data.addr = 0;
1255
1256 cpu = smp_processor_id();
1257 cpuc = &per_cpu(cpu_hw_counters, cpu);
1258
1259 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1260 if (!test_bit(idx, cpuc->active_mask))
1261 continue;
1262
1263 counter = cpuc->counters[idx];
1264 hwc = &counter->hw;
1265
1266 val = x86_perf_counter_update(counter, hwc, idx);
1267 if (val & (1ULL << (x86_pmu.counter_bits - 1)))
1268 continue;
1269
1270 /*
1271 * counter overflow
1272 */
1273 handled = 1;
1274 data.period = counter->hw.last_period;
1275
1276 if (!x86_perf_counter_set_period(counter, hwc, idx))
1277 continue;
1278
1279 if (perf_counter_overflow(counter, 1, &data))
1280 amd_pmu_disable_counter(hwc, idx);
1281 }
1282
1283 if (handled)
1284 inc_irq_stat(apic_perf_irqs);
1285
1286 return handled;
1287}
1288
1289void smp_perf_pending_interrupt(struct pt_regs *regs)
1290{
1291 irq_enter();
1292 ack_APIC_irq();
1293 inc_irq_stat(apic_pending_irqs);
1294 perf_counter_do_pending();
1295 irq_exit();
1296}
1297
1298void set_perf_counter_pending(void)
1299{
1300 apic->send_IPI_self(LOCAL_PENDING_VECTOR);
1301}
1302
1303void perf_counters_lapic_init(void)
1304{
1305 if (!x86_pmu_initialized())
1306 return;
1307
1308 /*
1309 * Always use NMI for PMU
1310 */
1311 apic_write(APIC_LVTPC, APIC_DM_NMI);
1312}
1313
1314static int __kprobes
1315perf_counter_nmi_handler(struct notifier_block *self,
1316 unsigned long cmd, void *__args)
1317{
1318 struct die_args *args = __args;
1319 struct pt_regs *regs;
1320
1321 if (!atomic_read(&active_counters))
1322 return NOTIFY_DONE;
1323
1324 switch (cmd) {
1325 case DIE_NMI:
1326 case DIE_NMI_IPI:
1327 break;
1328
1329 default:
1330 return NOTIFY_DONE;
1331 }
1332
1333 regs = args->regs;
1334
1335 apic_write(APIC_LVTPC, APIC_DM_NMI);
1336 /*
1337 * Can't rely on the handled return value to say it was our NMI, two
1338 * counters could trigger 'simultaneously' raising two back-to-back NMIs.
1339 *
1340 * If the first NMI handles both, the latter will be empty and daze
1341 * the CPU.
1342 */
1343 x86_pmu.handle_irq(regs);
1344
1345 return NOTIFY_STOP;
1346}
1347
1348static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
1349 .notifier_call = perf_counter_nmi_handler,
1350 .next = NULL,
1351 .priority = 1
1352};
1353
1354static struct x86_pmu intel_pmu = {
1355 .name = "Intel",
1356 .handle_irq = intel_pmu_handle_irq,
1357 .disable_all = intel_pmu_disable_all,
1358 .enable_all = intel_pmu_enable_all,
1359 .enable = intel_pmu_enable_counter,
1360 .disable = intel_pmu_disable_counter,
1361 .eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
1362 .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
1363 .event_map = intel_pmu_event_map,
1364 .raw_event = intel_pmu_raw_event,
1365 .max_events = ARRAY_SIZE(intel_perfmon_event_map),
1366 /*
1367 * Intel PMCs cannot be accessed sanely above 32 bit width,
1368 * so we install an artificial 1<<31 period regardless of
1369 * the generic counter period:
1370 */
1371 .max_period = (1ULL << 31) - 1,
1372};
1373
1374static struct x86_pmu amd_pmu = {
1375 .name = "AMD",
1376 .handle_irq = amd_pmu_handle_irq,
1377 .disable_all = amd_pmu_disable_all,
1378 .enable_all = amd_pmu_enable_all,
1379 .enable = amd_pmu_enable_counter,
1380 .disable = amd_pmu_disable_counter,
1381 .eventsel = MSR_K7_EVNTSEL0,
1382 .perfctr = MSR_K7_PERFCTR0,
1383 .event_map = amd_pmu_event_map,
1384 .raw_event = amd_pmu_raw_event,
1385 .max_events = ARRAY_SIZE(amd_perfmon_event_map),
1386 .num_counters = 4,
1387 .counter_bits = 48,
1388 .counter_mask = (1ULL << 48) - 1,
1389 /* use highest bit to detect overflow */
1390 .max_period = (1ULL << 47) - 1,
1391};
1392
1393static int intel_pmu_init(void)
1394{
1395 union cpuid10_edx edx;
1396 union cpuid10_eax eax;
1397 unsigned int unused;
1398 unsigned int ebx;
1399 int version;
1400
1401 if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
1402 return -ENODEV;
1403
1404 /*
1405 * Check whether the Architectural PerfMon supports
1406 * Branch Misses Retired Event or not.
1407 */
1408 cpuid(10, &eax.full, &ebx, &unused, &edx.full);
1409 if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
1410 return -ENODEV;
1411
1412 version = eax.split.version_id;
1413 if (version < 2)
1414 return -ENODEV;
1415
1416 x86_pmu = intel_pmu;
1417 x86_pmu.version = version;
1418 x86_pmu.num_counters = eax.split.num_counters;
1419 x86_pmu.counter_bits = eax.split.bit_width;
1420 x86_pmu.counter_mask = (1ULL << eax.split.bit_width) - 1;
1421
1422 /*
1423 * Quirk: v2 perfmon does not report fixed-purpose counters, so
1424 * assume at least 3 counters:
1425 */
1426 x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3);
1427
1428 rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
1429
1430 /*
1431 * Install the hw-cache-events table:
1432 */
1433 switch (boot_cpu_data.x86_model) {
1434 case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
1435 case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
1436 case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
1437 case 29: /* six-core 45 nm xeon "Dunnington" */
1438 memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
1439 sizeof(hw_cache_event_ids));
1440
1441 pr_cont("Core2 events, ");
1442 break;
1443 default:
1444 case 26:
1445 memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
1446 sizeof(hw_cache_event_ids));
1447
1448 pr_cont("Nehalem/Corei7 events, ");
1449 break;
1450 case 28:
1451 memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
1452 sizeof(hw_cache_event_ids));
1453
1454 pr_cont("Atom events, ");
1455 break;
1456 }
1457 return 0;
1458}
1459
1460static int amd_pmu_init(void)
1461{
1462 x86_pmu = amd_pmu;
1463
1464 switch (boot_cpu_data.x86) {
1465 case 0x0f:
1466 case 0x10:
1467 case 0x11:
1468 memcpy(hw_cache_event_ids, amd_0f_hw_cache_event_ids,
1469 sizeof(hw_cache_event_ids));
1470
1471 pr_cont("AMD Family 0f/10/11 events, ");
1472 break;
1473 }
1474 return 0;
1475}
1476
1477void __init init_hw_perf_counters(void)
1478{
1479 int err;
1480
1481 pr_info("Performance Counters: ");
1482
1483 switch (boot_cpu_data.x86_vendor) {
1484 case X86_VENDOR_INTEL:
1485 err = intel_pmu_init();
1486 break;
1487 case X86_VENDOR_AMD:
1488 err = amd_pmu_init();
1489 break;
1490 default:
1491 return;
1492 }
1493 if (err != 0) {
1494 pr_cont("no PMU driver, software counters only.\n");
1495 return;
1496 }
1497
1498 pr_cont("%s PMU driver.\n", x86_pmu.name);
1499
1500 if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {
1501 x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
1502 WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!",
1503 x86_pmu.num_counters, X86_PMC_MAX_GENERIC);
1504 }
1505 perf_counter_mask = (1 << x86_pmu.num_counters) - 1;
1506 perf_max_counters = x86_pmu.num_counters;
1507
1508 if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
1509 x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED;
1510 WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!",
1511 x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED);
1512 }
1513
1514 perf_counter_mask |=
1515 ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED;
1516
1517 perf_counters_lapic_init();
1518 register_die_notifier(&perf_counter_nmi_notifier);
1519
1520 pr_info("... version: %d\n", x86_pmu.version);
1521 pr_info("... bit width: %d\n", x86_pmu.counter_bits);
1522 pr_info("... generic counters: %d\n", x86_pmu.num_counters);
1523 pr_info("... value mask: %016Lx\n", x86_pmu.counter_mask);
1524 pr_info("... max period: %016Lx\n", x86_pmu.max_period);
1525 pr_info("... fixed-purpose counters: %d\n", x86_pmu.num_counters_fixed);
1526 pr_info("... counter mask: %016Lx\n", perf_counter_mask);
1527}
1528
1529static inline void x86_pmu_read(struct perf_counter *counter)
1530{
1531 x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
1532}
1533
1534static const struct pmu pmu = {
1535 .enable = x86_pmu_enable,
1536 .disable = x86_pmu_disable,
1537 .read = x86_pmu_read,
1538 .unthrottle = x86_pmu_unthrottle,
1539};
1540
1541const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
1542{
1543 int err;
1544
1545 err = __hw_perf_counter_init(counter);
1546 if (err)
1547 return ERR_PTR(err);
1548
1549 return &pmu;
1550}
1551
1552/*
1553 * callchain support
1554 */
1555
1556static inline
1557void callchain_store(struct perf_callchain_entry *entry, unsigned long ip)
1558{
1559 if (entry->nr < MAX_STACK_DEPTH)
1560 entry->ip[entry->nr++] = ip;
1561}
1562
1563static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry);
1564static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry);
1565
1566
/* stacktrace_ops ->warning_symbol hook: warnings are deliberately dropped. */
static void
backtrace_warning_symbol(void *data, char *msg, unsigned long symbol)
{
	return;
}
1572
/* stacktrace_ops ->warning hook: warnings are deliberately dropped. */
static void backtrace_warning(void *data, char *msg)
{
	return;
}
1577
/*
 * stacktrace_ops ->stack hook.  Always returns -1: IRQ stacks are not
 * bothered with for now.
 */
static int backtrace_stack(void *data, char *name)
{
	const int skip = -1;

	return skip;
}
1583
/*
 * stacktrace_ops ->address hook: record @addr in the callchain passed as
 * @data, but only when the unwinder flags the address as reliable.
 */
static void backtrace_address(void *data, unsigned long addr, int reliable)
{
	struct perf_callchain_entry *chain = data;

	if (!reliable)
		return;

	callchain_store(chain, addr);
}
1591
1592static const struct stacktrace_ops backtrace_ops = {
1593 .warning = backtrace_warning,
1594 .warning_symbol = backtrace_warning_symbol,
1595 .stack = backtrace_stack,
1596 .address = backtrace_address,
1597};
1598
1599static void
1600perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
1601{
1602 unsigned long bp;
1603 char *stack;
1604 int nr = entry->nr;
1605
1606 callchain_store(entry, instruction_pointer(regs));
1607
1608 stack = ((char *)regs + sizeof(struct pt_regs));
1609#ifdef CONFIG_FRAME_POINTER
1610 bp = frame_pointer(regs);
1611#else
1612 bp = 0;
1613#endif
1614
1615 dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, entry);
1616
1617 entry->kernel = entry->nr - nr;
1618}
1619
1620
1621struct stack_frame {
1622 const void __user *next_fp;
1623 unsigned long return_address;
1624};
1625
1626static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
1627{
1628 int ret;
1629
1630 if (!access_ok(VERIFY_READ, fp, sizeof(*frame)))
1631 return 0;
1632
1633 ret = 1;
1634 pagefault_disable();
1635 if (__copy_from_user_inatomic(frame, fp, sizeof(*frame)))
1636 ret = 0;
1637 pagefault_enable();
1638
1639 return ret;
1640}
1641
1642static void
1643perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
1644{
1645 struct stack_frame frame;
1646 const void __user *fp;
1647 int nr = entry->nr;
1648
1649 regs = (struct pt_regs *)current->thread.sp0 - 1;
1650 fp = (void __user *)regs->bp;
1651
1652 callchain_store(entry, regs->ip);
1653
1654 while (entry->nr < MAX_STACK_DEPTH) {
1655 frame.next_fp = NULL;
1656 frame.return_address = 0;
1657
1658 if (!copy_stack_frame(fp, &frame))
1659 break;
1660
1661 if ((unsigned long)fp < user_stack_pointer(regs))
1662 break;
1663
1664 callchain_store(entry, frame.return_address);
1665 fp = frame.next_fp;
1666 }
1667
1668 entry->user = entry->nr - nr;
1669}
1670
1671static void
1672perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry)
1673{
1674 int is_user;
1675
1676 if (!regs)
1677 return;
1678
1679 is_user = user_mode(regs);
1680
1681 if (!current || current->pid == 0)
1682 return;
1683
1684 if (is_user && current->state != TASK_RUNNING)
1685 return;
1686
1687 if (!is_user)
1688 perf_callchain_kernel(regs, entry);
1689
1690 if (current->mm)
1691 perf_callchain_user(regs, entry);
1692}
1693
1694struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
1695{
1696 struct perf_callchain_entry *entry;
1697
1698 if (in_nmi())
1699 entry = &__get_cpu_var(nmi_entry);
1700 else
1701 entry = &__get_cpu_var(irq_entry);
1702
1703 entry->nr = 0;
1704 entry->hv = 0;
1705 entry->kernel = 0;
1706 entry->user = 0;
1707
1708 perf_do_callchain(regs, entry);
1709
1710 return entry;
1711}
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index f6c70a164e32..d6f5b9fbde32 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -19,8 +19,8 @@
19#include <linux/nmi.h> 19#include <linux/nmi.h>
20#include <linux/kprobes.h> 20#include <linux/kprobes.h>
21 21
22#include <asm/genapic.h> 22#include <asm/apic.h>
23#include <asm/intel_arch_perfmon.h> 23#include <asm/perf_counter.h>
24 24
25struct nmi_watchdog_ctlblk { 25struct nmi_watchdog_ctlblk {
26 unsigned int cccr_msr; 26 unsigned int cccr_msr;
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 2ac1f0c2beb3..b07af8861244 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -182,6 +182,11 @@ static struct notifier_block __refdata cpuid_class_cpu_notifier =
182 .notifier_call = cpuid_class_cpu_callback, 182 .notifier_call = cpuid_class_cpu_callback,
183}; 183};
184 184
185static char *cpuid_nodename(struct device *dev)
186{
187 return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt));
188}
189
185static int __init cpuid_init(void) 190static int __init cpuid_init(void)
186{ 191{
187 int i, err = 0; 192 int i, err = 0;
@@ -198,6 +203,7 @@ static int __init cpuid_init(void)
198 err = PTR_ERR(cpuid_class); 203 err = PTR_ERR(cpuid_class);
199 goto out_chrdev; 204 goto out_chrdev;
200 } 205 }
206 cpuid_class->nodename = cpuid_nodename;
201 for_each_online_cpu(i) { 207 for_each_online_cpu(i) {
202 err = cpuid_device_create(i); 208 err = cpuid_device_create(i);
203 if (err != 0) 209 if (err != 0)
diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h
index da87590b8698..81086c227ab7 100644
--- a/arch/x86/kernel/dumpstack.h
+++ b/arch/x86/kernel/dumpstack.h
@@ -29,7 +29,6 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
29 unsigned long *sp, unsigned long bp, char *log_lvl); 29 unsigned long *sp, unsigned long bp, char *log_lvl);
30 30
31extern unsigned int code_bytes; 31extern unsigned int code_bytes;
32extern int kstack_depth_to_print;
33 32
34/* The form of the top of the frame on the stack */ 33/* The form of the top of the frame on the stack */
35struct stack_frame { 34struct stack_frame {
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 006281302925..7271fa33d791 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -617,7 +617,7 @@ __init int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize,
617 */ 617 */
618__init void e820_setup_gap(void) 618__init void e820_setup_gap(void)
619{ 619{
620 unsigned long gapstart, gapsize, round; 620 unsigned long gapstart, gapsize;
621 int found; 621 int found;
622 622
623 gapstart = 0x10000000; 623 gapstart = 0x10000000;
@@ -635,14 +635,9 @@ __init void e820_setup_gap(void)
635#endif 635#endif
636 636
637 /* 637 /*
638 * See how much we want to round up: start off with 638 * e820_reserve_resources_late protect stolen RAM already
639 * rounding to the next 1MB area.
640 */ 639 */
641 round = 0x100000; 640 pci_mem_start = gapstart;
642 while ((gapsize >> 4) > round)
643 round += round;
644 /* Fun with two's complement */
645 pci_mem_start = (gapstart + round) & -round;
646 641
647 printk(KERN_INFO 642 printk(KERN_INFO
648 "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n", 643 "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
@@ -1371,6 +1366,23 @@ void __init e820_reserve_resources(void)
1371 } 1366 }
1372} 1367}
1373 1368
1369/* How much should we pad RAM ending depending on where it is? */
1370static unsigned long ram_alignment(resource_size_t pos)
1371{
1372 unsigned long mb = pos >> 20;
1373
1374 /* To 64kB in the first megabyte */
1375 if (!mb)
1376 return 64*1024;
1377
1378 /* To 1MB in the first 16MB */
1379 if (mb < 16)
1380 return 1024*1024;
1381
1382 /* To 32MB for anything above that */
1383 return 32*1024*1024;
1384}
1385
1374void __init e820_reserve_resources_late(void) 1386void __init e820_reserve_resources_late(void)
1375{ 1387{
1376 int i; 1388 int i;
@@ -1382,6 +1394,24 @@ void __init e820_reserve_resources_late(void)
1382 insert_resource_expand_to_fit(&iomem_resource, res); 1394 insert_resource_expand_to_fit(&iomem_resource, res);
1383 res++; 1395 res++;
1384 } 1396 }
1397
1398 /*
1399 * Try to bump up RAM regions to reasonable boundaries to
1400 * avoid stolen RAM:
1401 */
1402 for (i = 0; i < e820.nr_map; i++) {
1403 struct e820entry *entry = &e820_saved.map[i];
1404 resource_size_t start, end;
1405
1406 if (entry->type != E820_RAM)
1407 continue;
1408 start = entry->addr + entry->size;
1409 end = round_up(start, ram_alignment(start));
1410 if (start == end)
1411 continue;
1412 reserve_region_with_split(&iomem_resource, start,
1413 end - 1, "RAM buffer");
1414 }
1385} 1415}
1386 1416
1387char *__init default_machine_specific_memory_setup(void) 1417char *__init default_machine_specific_memory_setup(void)
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index 76b8cd953dee..ebdb85cf2686 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -97,6 +97,7 @@ static void __init nvidia_bugs(int num, int slot, int func)
97} 97}
98 98
99#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC) 99#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC)
100#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC)
100static u32 __init ati_ixp4x0_rev(int num, int slot, int func) 101static u32 __init ati_ixp4x0_rev(int num, int slot, int func)
101{ 102{
102 u32 d; 103 u32 d;
@@ -114,6 +115,7 @@ static u32 __init ati_ixp4x0_rev(int num, int slot, int func)
114 d &= 0xff; 115 d &= 0xff;
115 return d; 116 return d;
116} 117}
118#endif
117 119
118static void __init ati_bugs(int num, int slot, int func) 120static void __init ati_bugs(int num, int slot, int func)
119{ 121{
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 987f91f0f755..de74f0a3e0ed 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -963,6 +963,8 @@ END(\sym)
963#ifdef CONFIG_SMP 963#ifdef CONFIG_SMP
964apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \ 964apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \
965 irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt 965 irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt
966apicinterrupt REBOOT_VECTOR \
967 reboot_interrupt smp_reboot_interrupt
966#endif 968#endif
967 969
968#ifdef CONFIG_X86_UV 970#ifdef CONFIG_X86_UV
@@ -994,10 +996,15 @@ apicinterrupt INVALIDATE_TLB_VECTOR_START+7 \
994#endif 996#endif
995 997
996apicinterrupt THRESHOLD_APIC_VECTOR \ 998apicinterrupt THRESHOLD_APIC_VECTOR \
997 threshold_interrupt mce_threshold_interrupt 999 threshold_interrupt smp_threshold_interrupt
998apicinterrupt THERMAL_APIC_VECTOR \ 1000apicinterrupt THERMAL_APIC_VECTOR \
999 thermal_interrupt smp_thermal_interrupt 1001 thermal_interrupt smp_thermal_interrupt
1000 1002
1003#ifdef CONFIG_X86_MCE
1004apicinterrupt MCE_SELF_VECTOR \
1005 mce_self_interrupt smp_mce_self_interrupt
1006#endif
1007
1001#ifdef CONFIG_SMP 1008#ifdef CONFIG_SMP
1002apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \ 1009apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \
1003 call_function_single_interrupt smp_call_function_single_interrupt 1010 call_function_single_interrupt smp_call_function_single_interrupt
@@ -1012,6 +1019,11 @@ apicinterrupt ERROR_APIC_VECTOR \
1012apicinterrupt SPURIOUS_APIC_VECTOR \ 1019apicinterrupt SPURIOUS_APIC_VECTOR \
1013 spurious_interrupt smp_spurious_interrupt 1020 spurious_interrupt smp_spurious_interrupt
1014 1021
1022#ifdef CONFIG_PERF_COUNTERS
1023apicinterrupt LOCAL_PENDING_VECTOR \
1024 perf_pending_interrupt smp_perf_pending_interrupt
1025#endif
1026
1015/* 1027/*
1016 * Exception entry points. 1028 * Exception entry points.
1017 */ 1029 */
@@ -1366,10 +1378,15 @@ END(xen_failsafe_callback)
1366paranoidzeroentry_ist debug do_debug DEBUG_STACK 1378paranoidzeroentry_ist debug do_debug DEBUG_STACK
1367paranoidzeroentry_ist int3 do_int3 DEBUG_STACK 1379paranoidzeroentry_ist int3 do_int3 DEBUG_STACK
1368paranoiderrorentry stack_segment do_stack_segment 1380paranoiderrorentry stack_segment do_stack_segment
1381#ifdef CONFIG_XEN
1382zeroentry xen_debug do_debug
1383zeroentry xen_int3 do_int3
1384errorentry xen_stack_segment do_stack_segment
1385#endif
1369errorentry general_protection do_general_protection 1386errorentry general_protection do_general_protection
1370errorentry page_fault do_page_fault 1387errorentry page_fault do_page_fault
1371#ifdef CONFIG_X86_MCE 1388#ifdef CONFIG_X86_MCE
1372paranoidzeroentry machine_check do_machine_check 1389paranoidzeroentry machine_check *machine_check_vector(%rip)
1373#endif 1390#endif
1374 1391
1375 /* 1392 /*
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 18dfa30795c9..b79c5533c421 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -442,7 +442,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr)
442 _ASM_EXTABLE(1b, 4b) 442 _ASM_EXTABLE(1b, 4b)
443 _ASM_EXTABLE(2b, 4b) 443 _ASM_EXTABLE(2b, 4b)
444 444
445 : [old] "=r" (old), [faulted] "=r" (faulted) 445 : [old] "=&r" (old), [faulted] "=r" (faulted)
446 : [parent] "r" (parent), [return_hooker] "r" (return_hooker) 446 : [parent] "r" (parent), [return_hooker] "r" (return_hooker)
447 : "memory" 447 : "memory"
448 ); 448 );
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 30683883e0cd..dc5ed4bdd88d 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -608,13 +608,6 @@ ignore_int:
608ENTRY(initial_code) 608ENTRY(initial_code)
609 .long i386_start_kernel 609 .long i386_start_kernel
610 610
611.section .text
612/*
613 * Real beginning of normal "text" segment
614 */
615ENTRY(stext)
616ENTRY(_stext)
617
618/* 611/*
619 * BSS section 612 * BSS section
620 */ 613 */
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index 69451473dbd2..51d959528b1d 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -91,7 +91,7 @@ void arch_update_kernel_hw_breakpoint(void *unused)
91 */ 91 */
92 kdr7 = temp_kdr7; 92 kdr7 = temp_kdr7;
93 set_debugreg(kdr7 | current->thread.debugreg7, 7); 93 set_debugreg(kdr7 | current->thread.debugreg7, 7);
94 put_cpu_no_resched(); 94 put_cpu();
95} 95}
96 96
97/* 97/*
@@ -374,7 +374,7 @@ int __kprobes hw_breakpoint_handler(struct die_args *args)
374 rc = NOTIFY_DONE; 374 rc = NOTIFY_DONE;
375 375
376 set_debugreg(dr7, 7); 376 set_debugreg(dr7, 7);
377 put_cpu_no_resched(); 377 put_cpu();
378 return rc; 378 return rc;
379} 379}
380 380
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index c2e0bb0890d4..5cf36c053ac4 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -7,6 +7,7 @@
7#include <linux/spinlock.h> 7#include <linux/spinlock.h>
8#include <linux/jiffies.h> 8#include <linux/jiffies.h>
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/timex.h>
10#include <linux/delay.h> 11#include <linux/delay.h>
11#include <linux/init.h> 12#include <linux/init.h>
12#include <linux/io.h> 13#include <linux/io.h>
diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c
index df3bf269beab..270ff83efc11 100644
--- a/arch/x86/kernel/init_task.c
+++ b/arch/x86/kernel/init_task.c
@@ -12,7 +12,6 @@
12 12
13static struct signal_struct init_signals = INIT_SIGNALS(init_signals); 13static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
14static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); 14static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
15struct mm_struct init_mm = INIT_MM(init_mm);
16 15
17/* 16/*
18 * Initial thread structure. 17 * Initial thread structure.
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index c3fe010d74c8..b0cdde6932f5 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -12,6 +12,8 @@
12#include <asm/io_apic.h> 12#include <asm/io_apic.h>
13#include <asm/irq.h> 13#include <asm/irq.h>
14#include <asm/idle.h> 14#include <asm/idle.h>
15#include <asm/mce.h>
16#include <asm/hw_irq.h>
15 17
16atomic_t irq_err_count; 18atomic_t irq_err_count;
17 19
@@ -24,9 +26,9 @@ void (*generic_interrupt_extension)(void) = NULL;
24 */ 26 */
25void ack_bad_irq(unsigned int irq) 27void ack_bad_irq(unsigned int irq)
26{ 28{
27 printk(KERN_ERR "unexpected IRQ trap at vector %02x\n", irq); 29 if (printk_ratelimit())
30 pr_err("unexpected IRQ trap at vector %02x\n", irq);
28 31
29#ifdef CONFIG_X86_LOCAL_APIC
30 /* 32 /*
31 * Currently unexpected vectors happen only on SMP and APIC. 33 * Currently unexpected vectors happen only on SMP and APIC.
32 * We _must_ ack these because every local APIC has only N 34 * We _must_ ack these because every local APIC has only N
@@ -36,9 +38,7 @@ void ack_bad_irq(unsigned int irq)
36 * completely. 38 * completely.
37 * But only ack when the APIC is enabled -AK 39 * But only ack when the APIC is enabled -AK
38 */ 40 */
39 if (cpu_has_apic) 41 ack_APIC_irq();
40 ack_APIC_irq();
41#endif
42} 42}
43 43
44#define irq_stats(x) (&per_cpu(irq_stat, x)) 44#define irq_stats(x) (&per_cpu(irq_stat, x))
@@ -63,6 +63,14 @@ static int show_other_interrupts(struct seq_file *p, int prec)
63 for_each_online_cpu(j) 63 for_each_online_cpu(j)
64 seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count); 64 seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
65 seq_printf(p, " Spurious interrupts\n"); 65 seq_printf(p, " Spurious interrupts\n");
66 seq_printf(p, "%*s: ", prec, "CNT");
67 for_each_online_cpu(j)
68 seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs);
69 seq_printf(p, " Performance counter interrupts\n");
70 seq_printf(p, "%*s: ", prec, "PND");
71 for_each_online_cpu(j)
72 seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs);
73 seq_printf(p, " Performance pending work\n");
66#endif 74#endif
67 if (generic_interrupt_extension) { 75 if (generic_interrupt_extension) {
68 seq_printf(p, "%*s: ", prec, "PLT"); 76 seq_printf(p, "%*s: ", prec, "PLT");
@@ -89,13 +97,23 @@ static int show_other_interrupts(struct seq_file *p, int prec)
89 for_each_online_cpu(j) 97 for_each_online_cpu(j)
90 seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count); 98 seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count);
91 seq_printf(p, " Thermal event interrupts\n"); 99 seq_printf(p, " Thermal event interrupts\n");
92# ifdef CONFIG_X86_64 100# ifdef CONFIG_X86_MCE_THRESHOLD
93 seq_printf(p, "%*s: ", prec, "THR"); 101 seq_printf(p, "%*s: ", prec, "THR");
94 for_each_online_cpu(j) 102 for_each_online_cpu(j)
95 seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count); 103 seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count);
96 seq_printf(p, " Threshold APIC interrupts\n"); 104 seq_printf(p, " Threshold APIC interrupts\n");
97# endif 105# endif
98#endif 106#endif
107#ifdef CONFIG_X86_NEW_MCE
108 seq_printf(p, "%*s: ", prec, "MCE");
109 for_each_online_cpu(j)
110 seq_printf(p, "%10u ", per_cpu(mce_exception_count, j));
111 seq_printf(p, " Machine check exceptions\n");
112 seq_printf(p, "%*s: ", prec, "MCP");
113 for_each_online_cpu(j)
114 seq_printf(p, "%10u ", per_cpu(mce_poll_count, j));
115 seq_printf(p, " Machine check polls\n");
116#endif
99 seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count)); 117 seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count));
100#if defined(CONFIG_X86_IO_APIC) 118#if defined(CONFIG_X86_IO_APIC)
101 seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count)); 119 seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count));
@@ -166,6 +184,8 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
166#ifdef CONFIG_X86_LOCAL_APIC 184#ifdef CONFIG_X86_LOCAL_APIC
167 sum += irq_stats(cpu)->apic_timer_irqs; 185 sum += irq_stats(cpu)->apic_timer_irqs;
168 sum += irq_stats(cpu)->irq_spurious_count; 186 sum += irq_stats(cpu)->irq_spurious_count;
187 sum += irq_stats(cpu)->apic_perf_irqs;
188 sum += irq_stats(cpu)->apic_pending_irqs;
169#endif 189#endif
170 if (generic_interrupt_extension) 190 if (generic_interrupt_extension)
171 sum += irq_stats(cpu)->generic_irqs; 191 sum += irq_stats(cpu)->generic_irqs;
@@ -176,9 +196,13 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
176#endif 196#endif
177#ifdef CONFIG_X86_MCE 197#ifdef CONFIG_X86_MCE
178 sum += irq_stats(cpu)->irq_thermal_count; 198 sum += irq_stats(cpu)->irq_thermal_count;
179# ifdef CONFIG_X86_64 199# ifdef CONFIG_X86_MCE_THRESHOLD
180 sum += irq_stats(cpu)->irq_threshold_count; 200 sum += irq_stats(cpu)->irq_threshold_count;
201# endif
181#endif 202#endif
203#ifdef CONFIG_X86_NEW_MCE
204 sum += per_cpu(mce_exception_count, cpu);
205 sum += per_cpu(mce_poll_count, cpu);
182#endif 206#endif
183 return sum; 207 return sum;
184} 208}
@@ -213,14 +237,11 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
213 irq = __get_cpu_var(vector_irq)[vector]; 237 irq = __get_cpu_var(vector_irq)[vector];
214 238
215 if (!handle_irq(irq, regs)) { 239 if (!handle_irq(irq, regs)) {
216#ifdef CONFIG_X86_64 240 ack_APIC_irq();
217 if (!disable_apic)
218 ack_APIC_irq();
219#endif
220 241
221 if (printk_ratelimit()) 242 if (printk_ratelimit())
222 printk(KERN_EMERG "%s: %d.%d No irq handler for vector (irq %d)\n", 243 pr_emerg("%s: %d.%d No irq handler for vector (irq %d)\n",
223 __func__, smp_processor_id(), vector, irq); 244 __func__, smp_processor_id(), vector, irq);
224 } 245 }
225 246
226 irq_exit(); 247 irq_exit();
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit.c
index 368b0a8836f9..696f0e475c2d 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit.c
@@ -1,20 +1,25 @@
1#include <linux/linkage.h>
1#include <linux/errno.h> 2#include <linux/errno.h>
2#include <linux/signal.h> 3#include <linux/signal.h>
3#include <linux/sched.h> 4#include <linux/sched.h>
4#include <linux/ioport.h> 5#include <linux/ioport.h>
5#include <linux/interrupt.h> 6#include <linux/interrupt.h>
7#include <linux/timex.h>
6#include <linux/slab.h> 8#include <linux/slab.h>
7#include <linux/random.h> 9#include <linux/random.h>
10#include <linux/kprobes.h>
8#include <linux/init.h> 11#include <linux/init.h>
9#include <linux/kernel_stat.h> 12#include <linux/kernel_stat.h>
10#include <linux/sysdev.h> 13#include <linux/sysdev.h>
11#include <linux/bitops.h> 14#include <linux/bitops.h>
15#include <linux/acpi.h>
12#include <linux/io.h> 16#include <linux/io.h>
13#include <linux/delay.h> 17#include <linux/delay.h>
14 18
15#include <asm/atomic.h> 19#include <asm/atomic.h>
16#include <asm/system.h> 20#include <asm/system.h>
17#include <asm/timer.h> 21#include <asm/timer.h>
22#include <asm/hw_irq.h>
18#include <asm/pgtable.h> 23#include <asm/pgtable.h>
19#include <asm/desc.h> 24#include <asm/desc.h>
20#include <asm/apic.h> 25#include <asm/apic.h>
@@ -22,7 +27,23 @@
22#include <asm/i8259.h> 27#include <asm/i8259.h>
23#include <asm/traps.h> 28#include <asm/traps.h>
24 29
30/*
31 * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
32 * (these are usually mapped to vectors 0x30-0x3f)
33 */
34
35/*
36 * The IO-APIC gives us many more interrupt sources. Most of these
37 * are unused but an SMP system is supposed to have enough memory ...
38 * sometimes (mostly wrt. hw bugs) we get corrupted vectors all
39 * across the spectrum, so we really want to be prepared to get all
40 * of these. Plus, more powerful systems might have more than 64
41 * IO-APIC registers.
42 *
43 * (these are usually mapped into the 0x30-0xff vector range)
44 */
25 45
46#ifdef CONFIG_X86_32
26/* 47/*
27 * Note that on a 486, we don't want to do a SIGFPE on an irq13 48 * Note that on a 486, we don't want to do a SIGFPE on an irq13
28 * as the irq is unreliable, and exception 16 works correctly 49 * as the irq is unreliable, and exception 16 works correctly
@@ -52,30 +73,7 @@ static struct irqaction fpu_irq = {
52 .handler = math_error_irq, 73 .handler = math_error_irq,
53 .name = "fpu", 74 .name = "fpu",
54}; 75};
55
56void __init init_ISA_irqs(void)
57{
58 int i;
59
60#ifdef CONFIG_X86_LOCAL_APIC
61 init_bsp_APIC();
62#endif 76#endif
63 init_8259A(0);
64
65 /*
66 * 16 old-style INTA-cycle interrupts:
67 */
68 for (i = 0; i < NR_IRQS_LEGACY; i++) {
69 struct irq_desc *desc = irq_to_desc(i);
70
71 desc->status = IRQ_DISABLED;
72 desc->action = NULL;
73 desc->depth = 1;
74
75 set_irq_chip_and_handler_name(i, &i8259A_chip,
76 handle_level_irq, "XT");
77 }
78}
79 77
80/* 78/*
81 * IRQ2 is cascade interrupt to second interrupt controller 79 * IRQ2 is cascade interrupt to second interrupt controller
@@ -118,29 +116,37 @@ int vector_used_by_percpu_irq(unsigned int vector)
118 return 0; 116 return 0;
119} 117}
120 118
121/* Overridden in paravirt.c */ 119static void __init init_ISA_irqs(void)
122void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
123
124void __init native_init_IRQ(void)
125{ 120{
126 int i; 121 int i;
127 122
128 /* Execute any quirks before the call gates are initialised: */ 123#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
129 x86_quirk_pre_intr_init(); 124 init_bsp_APIC();
125#endif
126 init_8259A(0);
130 127
131 /* 128 /*
132 * Cover the whole vector space, no vector can escape 129 * 16 old-style INTA-cycle interrupts:
133 * us. (some of these will be overridden and become
134 * 'special' SMP interrupts)
135 */ 130 */
136 for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { 131 for (i = 0; i < NR_IRQS_LEGACY; i++) {
137 /* SYSCALL_VECTOR was reserved in trap_init. */ 132 struct irq_desc *desc = irq_to_desc(i);
138 if (i != SYSCALL_VECTOR) 133
139 set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); 134 desc->status = IRQ_DISABLED;
135 desc->action = NULL;
136 desc->depth = 1;
137
138 set_irq_chip_and_handler_name(i, &i8259A_chip,
139 handle_level_irq, "XT");
140 } 140 }
141}
141 142
143/* Overridden in paravirt.c */
144void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
142 145
143#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP) 146static void __init smp_intr_init(void)
147{
148#ifdef CONFIG_SMP
149#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
144 /* 150 /*
145 * The reschedule interrupt is a CPU-to-CPU reschedule-helper 151 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
146 * IPI, driven by wakeup. 152 * IPI, driven by wakeup.
@@ -160,16 +166,35 @@ void __init native_init_IRQ(void)
160 /* IPI for generic function call */ 166 /* IPI for generic function call */
161 alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); 167 alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
162 168
163 /* IPI for single call function */ 169 /* IPI for generic single function call */
164 alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, 170 alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR,
165 call_function_single_interrupt); 171 call_function_single_interrupt);
166 172
167 /* Low priority IPI to cleanup after moving an irq */ 173 /* Low priority IPI to cleanup after moving an irq */
168 set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); 174 set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
169 set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); 175 set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
176
177 /* IPI used for rebooting/stopping */
178 alloc_intr_gate(REBOOT_VECTOR, reboot_interrupt);
170#endif 179#endif
180#endif /* CONFIG_SMP */
181}
182
183static void __init apic_intr_init(void)
184{
185 smp_intr_init();
171 186
172#ifdef CONFIG_X86_LOCAL_APIC 187#ifdef CONFIG_X86_THERMAL_VECTOR
188 alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
189#endif
190#ifdef CONFIG_X86_THRESHOLD
191 alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
192#endif
193#if defined(CONFIG_X86_NEW_MCE) && defined(CONFIG_X86_LOCAL_APIC)
194 alloc_intr_gate(MCE_SELF_VECTOR, mce_self_interrupt);
195#endif
196
197#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
173 /* self generated IPI for local APIC timer */ 198 /* self generated IPI for local APIC timer */
174 alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); 199 alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
175 200
@@ -179,16 +204,59 @@ void __init native_init_IRQ(void)
179 /* IPI vectors for APIC spurious and error interrupts */ 204 /* IPI vectors for APIC spurious and error interrupts */
180 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); 205 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
181 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); 206 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
207
208 /* Performance monitoring interrupts: */
209# ifdef CONFIG_PERF_COUNTERS
210 alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt);
211# endif
212
182#endif 213#endif
214}
183 215
184#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL) 216/**
185 /* thermal monitor LVT interrupt */ 217 * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors
186 alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); 218 *
219 * Description:
220 * Perform any necessary interrupt initialisation prior to setting up
221 * the "ordinary" interrupt call gates. For legacy reasons, the ISA
222 * interrupts should be initialised here if the machine emulates a PC
223 * in any way.
224 **/
225static void __init x86_quirk_pre_intr_init(void)
226{
227#ifdef CONFIG_X86_32
228 if (x86_quirks->arch_pre_intr_init) {
229 if (x86_quirks->arch_pre_intr_init())
230 return;
231 }
187#endif 232#endif
233 init_ISA_irqs();
234}
235
236void __init native_init_IRQ(void)
237{
238 int i;
239
240 /* Execute any quirks before the call gates are initialised: */
241 x86_quirk_pre_intr_init();
242
243 apic_intr_init();
244
245 /*
246 * Cover the whole vector space, no vector can escape
247 * us. (some of these will be overridden and become
248 * 'special' SMP interrupts)
249 */
250 for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
251 /* IA32_SYSCALL_VECTOR could be used in trap_init already. */
252 if (!test_bit(i, used_vectors))
253 set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
254 }
188 255
189 if (!acpi_ioapic) 256 if (!acpi_ioapic)
190 setup_irq(2, &irq2); 257 setup_irq(2, &irq2);
191 258
259#ifdef CONFIG_X86_32
192 /* 260 /*
193 * Call quirks after call gates are initialised (usually add in 261 * Call quirks after call gates are initialised (usually add in
194 * the architecture specific gates): 262 * the architecture specific gates):
@@ -203,4 +271,5 @@ void __init native_init_IRQ(void)
203 setup_irq(FPU_IRQ, &fpu_irq); 271 setup_irq(FPU_IRQ, &fpu_irq);
204 272
205 irq_ctx_init(smp_processor_id()); 273 irq_ctx_init(smp_processor_id());
274#endif
206} 275}
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
deleted file mode 100644
index 8cd10537fd46..000000000000
--- a/arch/x86/kernel/irqinit_64.c
+++ /dev/null
@@ -1,177 +0,0 @@
1#include <linux/linkage.h>
2#include <linux/errno.h>
3#include <linux/signal.h>
4#include <linux/sched.h>
5#include <linux/ioport.h>
6#include <linux/interrupt.h>
7#include <linux/timex.h>
8#include <linux/slab.h>
9#include <linux/random.h>
10#include <linux/init.h>
11#include <linux/kernel_stat.h>
12#include <linux/sysdev.h>
13#include <linux/bitops.h>
14#include <linux/acpi.h>
15#include <linux/io.h>
16#include <linux/delay.h>
17
18#include <asm/atomic.h>
19#include <asm/system.h>
20#include <asm/hw_irq.h>
21#include <asm/pgtable.h>
22#include <asm/desc.h>
23#include <asm/apic.h>
24#include <asm/i8259.h>
25
26/*
27 * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
28 * (these are usually mapped to vectors 0x30-0x3f)
29 */
30
31/*
32 * The IO-APIC gives us many more interrupt sources. Most of these
33 * are unused but an SMP system is supposed to have enough memory ...
34 * sometimes (mostly wrt. hw bugs) we get corrupted vectors all
35 * across the spectrum, so we really want to be prepared to get all
36 * of these. Plus, more powerful systems might have more than 64
37 * IO-APIC registers.
38 *
39 * (these are usually mapped into the 0x30-0xff vector range)
40 */
41
42/*
43 * IRQ2 is cascade interrupt to second interrupt controller
44 */
45
46static struct irqaction irq2 = {
47 .handler = no_action,
48 .name = "cascade",
49};
50DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
51 [0 ... IRQ0_VECTOR - 1] = -1,
52 [IRQ0_VECTOR] = 0,
53 [IRQ1_VECTOR] = 1,
54 [IRQ2_VECTOR] = 2,
55 [IRQ3_VECTOR] = 3,
56 [IRQ4_VECTOR] = 4,
57 [IRQ5_VECTOR] = 5,
58 [IRQ6_VECTOR] = 6,
59 [IRQ7_VECTOR] = 7,
60 [IRQ8_VECTOR] = 8,
61 [IRQ9_VECTOR] = 9,
62 [IRQ10_VECTOR] = 10,
63 [IRQ11_VECTOR] = 11,
64 [IRQ12_VECTOR] = 12,
65 [IRQ13_VECTOR] = 13,
66 [IRQ14_VECTOR] = 14,
67 [IRQ15_VECTOR] = 15,
68 [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
69};
70
71int vector_used_by_percpu_irq(unsigned int vector)
72{
73 int cpu;
74
75 for_each_online_cpu(cpu) {
76 if (per_cpu(vector_irq, cpu)[vector] != -1)
77 return 1;
78 }
79
80 return 0;
81}
82
83static void __init init_ISA_irqs(void)
84{
85 int i;
86
87 init_bsp_APIC();
88 init_8259A(0);
89
90 for (i = 0; i < NR_IRQS_LEGACY; i++) {
91 struct irq_desc *desc = irq_to_desc(i);
92
93 desc->status = IRQ_DISABLED;
94 desc->action = NULL;
95 desc->depth = 1;
96
97 /*
98 * 16 old-style INTA-cycle interrupts:
99 */
100 set_irq_chip_and_handler_name(i, &i8259A_chip,
101 handle_level_irq, "XT");
102 }
103}
104
105void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
106
107static void __init smp_intr_init(void)
108{
109#ifdef CONFIG_SMP
110 /*
111 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
112 * IPI, driven by wakeup.
113 */
114 alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
115
116 /* IPIs for invalidation */
117 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0);
118 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1);
119 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2);
120 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3);
121 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4);
122 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5);
123 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6);
124 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7);
125
126 /* IPI for generic function call */
127 alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
128
129 /* IPI for generic single function call */
130 alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR,
131 call_function_single_interrupt);
132
133 /* Low priority IPI to cleanup after moving an irq */
134 set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
135 set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
136#endif
137}
138
139static void __init apic_intr_init(void)
140{
141 smp_intr_init();
142
143 alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
144 alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
145
146 /* self generated IPI for local APIC timer */
147 alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
148
149 /* generic IPI for platform specific use */
150 alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt);
151
152 /* IPI vectors for APIC spurious and error interrupts */
153 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
154 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
155}
156
157void __init native_init_IRQ(void)
158{
159 int i;
160
161 init_ISA_irqs();
162 /*
163 * Cover the whole vector space, no vector can escape
164 * us. (some of these will be overridden and become
165 * 'special' SMP interrupts)
166 */
167 for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
168 int vector = FIRST_EXTERNAL_VECTOR + i;
169 if (vector != IA32_SYSCALL_VECTOR)
170 set_intr_gate(vector, interrupt[i]);
171 }
172
173 apic_intr_init();
174
175 if (!acpi_ioapic)
176 setup_irq(2, &irq2);
177}
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index f820b73c7f28..34e86b67550c 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -143,7 +143,7 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
143 gdb_regs32[GDB_PS] = *(unsigned long *)(p->thread.sp + 8); 143 gdb_regs32[GDB_PS] = *(unsigned long *)(p->thread.sp + 8);
144 gdb_regs32[GDB_CS] = __KERNEL_CS; 144 gdb_regs32[GDB_CS] = __KERNEL_CS;
145 gdb_regs32[GDB_SS] = __KERNEL_DS; 145 gdb_regs32[GDB_SS] = __KERNEL_DS;
146 gdb_regs[GDB_PC] = p->thread.ip; 146 gdb_regs[GDB_PC] = 0;
147 gdb_regs[GDB_R8] = 0; 147 gdb_regs[GDB_R8] = 0;
148 gdb_regs[GDB_R9] = 0; 148 gdb_regs[GDB_R9] = 0;
149 gdb_regs[GDB_R10] = 0; 149 gdb_regs[GDB_R10] = 0;
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 33019ddb56b4..a78ecad0c900 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -27,6 +27,7 @@
27#include <linux/mm.h> 27#include <linux/mm.h>
28#include <linux/highmem.h> 28#include <linux/highmem.h>
29#include <linux/hardirq.h> 29#include <linux/hardirq.h>
30#include <asm/timer.h>
30 31
31#define MMU_QUEUE_SIZE 1024 32#define MMU_QUEUE_SIZE 1024
32 33
@@ -195,7 +196,7 @@ static void kvm_leave_lazy_mmu(void)
195 struct kvm_para_state *state = kvm_para_state(); 196 struct kvm_para_state *state = kvm_para_state();
196 197
197 mmu_queue_flush(state); 198 mmu_queue_flush(state);
198 paravirt_leave_lazy(paravirt_get_lazy_mode()); 199 paravirt_leave_lazy_mmu();
199 state->mode = paravirt_get_lazy_mode(); 200 state->mode = paravirt_get_lazy_mode();
200} 201}
201 202
@@ -230,6 +231,9 @@ static void paravirt_ops_setup(void)
230 pv_mmu_ops.lazy_mode.enter = kvm_enter_lazy_mmu; 231 pv_mmu_ops.lazy_mode.enter = kvm_enter_lazy_mmu;
231 pv_mmu_ops.lazy_mode.leave = kvm_leave_lazy_mmu; 232 pv_mmu_ops.lazy_mode.leave = kvm_leave_lazy_mmu;
232 } 233 }
234#ifdef CONFIG_X86_IO_APIC
235 no_timer_check = 1;
236#endif
233} 237}
234 238
235void __init kvm_guest_init(void) 239void __init kvm_guest_init(void)
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index 453b5795a5c6..366baa179913 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -13,25 +13,13 @@
13 * Licensed under the terms of the GNU General Public 13 * Licensed under the terms of the GNU General Public
14 * License version 2. See file COPYING for details. 14 * License version 2. See file COPYING for details.
15 */ 15 */
16#include <linux/platform_device.h>
17#include <linux/capability.h>
18#include <linux/miscdevice.h>
19#include <linux/firmware.h> 16#include <linux/firmware.h>
20#include <linux/spinlock.h>
21#include <linux/cpumask.h>
22#include <linux/pci_ids.h> 17#include <linux/pci_ids.h>
23#include <linux/uaccess.h> 18#include <linux/uaccess.h>
24#include <linux/vmalloc.h> 19#include <linux/vmalloc.h>
25#include <linux/kernel.h> 20#include <linux/kernel.h>
26#include <linux/module.h> 21#include <linux/module.h>
27#include <linux/mutex.h>
28#include <linux/sched.h>
29#include <linux/init.h>
30#include <linux/slab.h>
31#include <linux/cpu.h>
32#include <linux/pci.h> 22#include <linux/pci.h>
33#include <linux/fs.h>
34#include <linux/mm.h>
35 23
36#include <asm/microcode.h> 24#include <asm/microcode.h>
37#include <asm/processor.h> 25#include <asm/processor.h>
@@ -79,9 +67,6 @@ struct microcode_amd {
79#define UCODE_CONTAINER_SECTION_HDR 8 67#define UCODE_CONTAINER_SECTION_HDR 8
80#define UCODE_CONTAINER_HEADER_SIZE 12 68#define UCODE_CONTAINER_HEADER_SIZE 12
81 69
82/* serialize access to the physical write */
83static DEFINE_SPINLOCK(microcode_update_lock);
84
85static struct equiv_cpu_entry *equiv_cpu_table; 70static struct equiv_cpu_entry *equiv_cpu_table;
86 71
87static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) 72static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
@@ -144,9 +129,8 @@ static int get_matching_microcode(int cpu, void *mc, int rev)
144 return 1; 129 return 1;
145} 130}
146 131
147static void apply_microcode_amd(int cpu) 132static int apply_microcode_amd(int cpu)
148{ 133{
149 unsigned long flags;
150 u32 rev, dummy; 134 u32 rev, dummy;
151 int cpu_num = raw_smp_processor_id(); 135 int cpu_num = raw_smp_processor_id();
152 struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; 136 struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
@@ -156,25 +140,25 @@ static void apply_microcode_amd(int cpu)
156 BUG_ON(cpu_num != cpu); 140 BUG_ON(cpu_num != cpu);
157 141
158 if (mc_amd == NULL) 142 if (mc_amd == NULL)
159 return; 143 return 0;
160 144
161 spin_lock_irqsave(&microcode_update_lock, flags);
162 wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code); 145 wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code);
163 /* get patch id after patching */ 146 /* get patch id after patching */
164 rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); 147 rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
165 spin_unlock_irqrestore(&microcode_update_lock, flags);
166 148
167 /* check current patch id and patch's id for match */ 149 /* check current patch id and patch's id for match */
168 if (rev != mc_amd->hdr.patch_id) { 150 if (rev != mc_amd->hdr.patch_id) {
169 printk(KERN_ERR "microcode: CPU%d: update failed " 151 printk(KERN_ERR "microcode: CPU%d: update failed "
170 "(for patch_level=0x%x)\n", cpu, mc_amd->hdr.patch_id); 152 "(for patch_level=0x%x)\n", cpu, mc_amd->hdr.patch_id);
171 return; 153 return -1;
172 } 154 }
173 155
174 printk(KERN_INFO "microcode: CPU%d: updated (new patch_level=0x%x)\n", 156 printk(KERN_INFO "microcode: CPU%d: updated (new patch_level=0x%x)\n",
175 cpu, rev); 157 cpu, rev);
176 158
177 uci->cpu_sig.rev = rev; 159 uci->cpu_sig.rev = rev;
160
161 return 0;
178} 162}
179 163
180static int get_ucode_data(void *to, const u8 *from, size_t n) 164static int get_ucode_data(void *to, const u8 *from, size_t n)
@@ -257,13 +241,12 @@ static int install_equiv_cpu_table(const u8 *buf)
257 241
258static void free_equiv_cpu_table(void) 242static void free_equiv_cpu_table(void)
259{ 243{
260 if (equiv_cpu_table) { 244 vfree(equiv_cpu_table);
261 vfree(equiv_cpu_table); 245 equiv_cpu_table = NULL;
262 equiv_cpu_table = NULL;
263 }
264} 246}
265 247
266static int generic_load_microcode(int cpu, const u8 *data, size_t size) 248static enum ucode_state
249generic_load_microcode(int cpu, const u8 *data, size_t size)
267{ 250{
268 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 251 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
269 const u8 *ucode_ptr = data; 252 const u8 *ucode_ptr = data;
@@ -272,12 +255,13 @@ static int generic_load_microcode(int cpu, const u8 *data, size_t size)
272 int new_rev = uci->cpu_sig.rev; 255 int new_rev = uci->cpu_sig.rev;
273 unsigned int leftover; 256 unsigned int leftover;
274 unsigned long offset; 257 unsigned long offset;
258 enum ucode_state state = UCODE_OK;
275 259
276 offset = install_equiv_cpu_table(ucode_ptr); 260 offset = install_equiv_cpu_table(ucode_ptr);
277 if (!offset) { 261 if (!offset) {
278 printk(KERN_ERR "microcode: failed to create " 262 printk(KERN_ERR "microcode: failed to create "
279 "equivalent cpu table\n"); 263 "equivalent cpu table\n");
280 return -EINVAL; 264 return UCODE_ERROR;
281 } 265 }
282 266
283 ucode_ptr += offset; 267 ucode_ptr += offset;
@@ -293,8 +277,7 @@ static int generic_load_microcode(int cpu, const u8 *data, size_t size)
293 277
294 mc_header = (struct microcode_header_amd *)mc; 278 mc_header = (struct microcode_header_amd *)mc;
295 if (get_matching_microcode(cpu, mc, new_rev)) { 279 if (get_matching_microcode(cpu, mc, new_rev)) {
296 if (new_mc) 280 vfree(new_mc);
297 vfree(new_mc);
298 new_rev = mc_header->patch_id; 281 new_rev = mc_header->patch_id;
299 new_mc = mc; 282 new_mc = mc;
300 } else 283 } else
@@ -306,34 +289,32 @@ static int generic_load_microcode(int cpu, const u8 *data, size_t size)
306 289
307 if (new_mc) { 290 if (new_mc) {
308 if (!leftover) { 291 if (!leftover) {
309 if (uci->mc) 292 vfree(uci->mc);
310 vfree(uci->mc);
311 uci->mc = new_mc; 293 uci->mc = new_mc;
312 pr_debug("microcode: CPU%d found a matching microcode " 294 pr_debug("microcode: CPU%d found a matching microcode "
313 "update with version 0x%x (current=0x%x)\n", 295 "update with version 0x%x (current=0x%x)\n",
314 cpu, new_rev, uci->cpu_sig.rev); 296 cpu, new_rev, uci->cpu_sig.rev);
315 } else 297 } else {
316 vfree(new_mc); 298 vfree(new_mc);
317 } 299 state = UCODE_ERROR;
300 }
301 } else
302 state = UCODE_NFOUND;
318 303
319 free_equiv_cpu_table(); 304 free_equiv_cpu_table();
320 305
321 return (int)leftover; 306 return state;
322} 307}
323 308
324static int request_microcode_fw(int cpu, struct device *device) 309static enum ucode_state request_microcode_fw(int cpu, struct device *device)
325{ 310{
326 const char *fw_name = "amd-ucode/microcode_amd.bin"; 311 const char *fw_name = "amd-ucode/microcode_amd.bin";
327 const struct firmware *firmware; 312 const struct firmware *firmware;
328 int ret; 313 enum ucode_state ret;
329
330 /* We should bind the task to the CPU */
331 BUG_ON(cpu != raw_smp_processor_id());
332 314
333 ret = request_firmware(&firmware, fw_name, device); 315 if (request_firmware(&firmware, fw_name, device)) {
334 if (ret) {
335 printk(KERN_ERR "microcode: failed to load file %s\n", fw_name); 316 printk(KERN_ERR "microcode: failed to load file %s\n", fw_name);
336 return ret; 317 return UCODE_NFOUND;
337 } 318 }
338 319
339 ret = generic_load_microcode(cpu, firmware->data, firmware->size); 320 ret = generic_load_microcode(cpu, firmware->data, firmware->size);
@@ -343,11 +324,12 @@ static int request_microcode_fw(int cpu, struct device *device)
343 return ret; 324 return ret;
344} 325}
345 326
346static int request_microcode_user(int cpu, const void __user *buf, size_t size) 327static enum ucode_state
328request_microcode_user(int cpu, const void __user *buf, size_t size)
347{ 329{
348 printk(KERN_INFO "microcode: AMD microcode update via " 330 printk(KERN_INFO "microcode: AMD microcode update via "
349 "/dev/cpu/microcode not supported\n"); 331 "/dev/cpu/microcode not supported\n");
350 return -1; 332 return UCODE_ERROR;
351} 333}
352 334
353static void microcode_fini_cpu_amd(int cpu) 335static void microcode_fini_cpu_amd(int cpu)
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index 98c470c069d1..9371448290ac 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -71,27 +71,18 @@
71 * Thanks to Stuart Swales for pointing out this bug. 71 * Thanks to Stuart Swales for pointing out this bug.
72 */ 72 */
73#include <linux/platform_device.h> 73#include <linux/platform_device.h>
74#include <linux/capability.h>
75#include <linux/miscdevice.h> 74#include <linux/miscdevice.h>
76#include <linux/firmware.h> 75#include <linux/capability.h>
77#include <linux/smp_lock.h> 76#include <linux/smp_lock.h>
78#include <linux/spinlock.h>
79#include <linux/cpumask.h>
80#include <linux/uaccess.h>
81#include <linux/vmalloc.h>
82#include <linux/kernel.h> 77#include <linux/kernel.h>
83#include <linux/module.h> 78#include <linux/module.h>
84#include <linux/mutex.h> 79#include <linux/mutex.h>
85#include <linux/sched.h>
86#include <linux/init.h>
87#include <linux/slab.h>
88#include <linux/cpu.h> 80#include <linux/cpu.h>
89#include <linux/fs.h> 81#include <linux/fs.h>
90#include <linux/mm.h> 82#include <linux/mm.h>
91 83
92#include <asm/microcode.h> 84#include <asm/microcode.h>
93#include <asm/processor.h> 85#include <asm/processor.h>
94#include <asm/msr.h>
95 86
96MODULE_DESCRIPTION("Microcode Update Driver"); 87MODULE_DESCRIPTION("Microcode Update Driver");
97MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>"); 88MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
@@ -101,36 +92,110 @@ MODULE_LICENSE("GPL");
101 92
102static struct microcode_ops *microcode_ops; 93static struct microcode_ops *microcode_ops;
103 94
104/* no concurrent ->write()s are allowed on /dev/cpu/microcode */ 95/*
96 * Synchronization.
97 *
98 * All non cpu-hotplug-callback call sites use:
99 *
100 * - microcode_mutex to synchronize with each other;
101 * - get/put_online_cpus() to synchronize with
102 * the cpu-hotplug-callback call sites.
103 *
104 * We guarantee that only a single cpu is being
105 * updated at any particular moment of time.
106 */
105static DEFINE_MUTEX(microcode_mutex); 107static DEFINE_MUTEX(microcode_mutex);
106 108
107struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; 109struct ucode_cpu_info ucode_cpu_info[NR_CPUS];
108EXPORT_SYMBOL_GPL(ucode_cpu_info); 110EXPORT_SYMBOL_GPL(ucode_cpu_info);
109 111
112/*
113 * Operations that are run on a target cpu:
114 */
115
/*
 * Argument bundle handed to collect_cpu_info_local():
 *   cpu_sig - signature buffer forwarded to microcode_ops->collect_cpu_info()
 *   err     - receives the return value of that call
 */
116struct cpu_info_ctx {
117 struct cpu_signature *cpu_sig;
118 int err;
119};
120
/*
 * collect_cpu_info_local - callback given to smp_call_function_single()
 * by collect_cpu_info_on_target().
 *
 * @arg: pointer to a struct cpu_info_ctx.
 *
 * Calls microcode_ops->collect_cpu_info() for the CPU reported by
 * smp_processor_id(), filling ctx->cpu_sig, and stores the return
 * value in ctx->err.
 */
121static void collect_cpu_info_local(void *arg)
122{
123 struct cpu_info_ctx *ctx = arg;
124
125 ctx->err = microcode_ops->collect_cpu_info(smp_processor_id(),
126 ctx->cpu_sig);
127}
128
/*
 * collect_cpu_info_on_target - collect the signature of @cpu by running
 * collect_cpu_info_local() through smp_call_function_single().
 *
 * @cpu:     cpu passed to smp_call_function_single()
 * @cpu_sig: buffer the callback fills in
 *
 * Returns the smp_call_function_single() result when it is non-zero,
 * otherwise the error code the callback left in ctx.err.
 */
129static int collect_cpu_info_on_target(int cpu, struct cpu_signature *cpu_sig)
130{
131 struct cpu_info_ctx ctx = { .cpu_sig = cpu_sig, .err = 0 };
132 int ret;
133
 /*
  * NOTE(review): the trailing 1 is presumably the "wait" flag, which
  * would keep the on-stack ctx valid until the callback has run;
  * confirm against smp_call_function_single().
  */
134 ret = smp_call_function_single(cpu, collect_cpu_info_local, &ctx, 1);
135 if (!ret)
136 ret = ctx.err;
137
138 return ret;
139}
140
/*
 * collect_cpu_info - reset and refill the ucode_cpu_info[] slot of @cpu.
 *
 * Zeroes the whole slot, collects the signature into uci->cpu_sig via
 * collect_cpu_info_on_target(), and sets uci->valid to 1 only when that
 * call returned 0.
 *
 * Returns the value returned by collect_cpu_info_on_target().
 */
141static int collect_cpu_info(int cpu)
142{
143 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
144 int ret;
145
 /* clears every field of the slot, so valid stays 0 on failure */
146 memset(uci, 0, sizeof(*uci));
147
148 ret = collect_cpu_info_on_target(cpu, &uci->cpu_sig);
149 if (!ret)
150 uci->valid = 1;
151
152 return ret;
153}
154
/*
 * Argument bundle handed to apply_microcode_local():
 *   err - receives the return value of microcode_ops->apply_microcode()
 */
155struct apply_microcode_ctx {
156 int err;
157};
158
/*
 * apply_microcode_local - callback given to smp_call_function_single()
 * by apply_microcode_on_target().
 *
 * @arg: pointer to a struct apply_microcode_ctx.
 *
 * Calls microcode_ops->apply_microcode() for the CPU reported by
 * smp_processor_id() and stores the return value in ctx->err.
 */
159static void apply_microcode_local(void *arg)
160{
161 struct apply_microcode_ctx *ctx = arg;
162
163 ctx->err = microcode_ops->apply_microcode(smp_processor_id());
164}
165
/*
 * apply_microcode_on_target - apply microcode on @cpu by running
 * apply_microcode_local() through smp_call_function_single().
 *
 * @cpu: cpu passed to smp_call_function_single()
 *
 * Returns the smp_call_function_single() result when it is non-zero,
 * otherwise the error code the callback left in ctx.err.
 */
166static int apply_microcode_on_target(int cpu)
167{
168 struct apply_microcode_ctx ctx = { .err = 0 };
169 int ret;
170
 /*
  * NOTE(review): the trailing 1 is presumably the "wait" flag, which
  * would keep the on-stack ctx valid until the callback has run;
  * confirm against smp_call_function_single().
  */
171 ret = smp_call_function_single(cpu, apply_microcode_local, &ctx, 1);
172 if (!ret)
173 ret = ctx.err;
174
175 return ret;
176}
177
110#ifdef CONFIG_MICROCODE_OLD_INTERFACE 178#ifdef CONFIG_MICROCODE_OLD_INTERFACE
111static int do_microcode_update(const void __user *buf, size_t size) 179static int do_microcode_update(const void __user *buf, size_t size)
112{ 180{
113 cpumask_t old;
114 int error = 0; 181 int error = 0;
115 int cpu; 182 int cpu;
116 183
117 old = current->cpus_allowed;
118
119 for_each_online_cpu(cpu) { 184 for_each_online_cpu(cpu) {
120 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 185 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
186 enum ucode_state ustate;
121 187
122 if (!uci->valid) 188 if (!uci->valid)
123 continue; 189 continue;
124 190
125 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); 191 ustate = microcode_ops->request_microcode_user(cpu, buf, size);
126 error = microcode_ops->request_microcode_user(cpu, buf, size); 192 if (ustate == UCODE_ERROR) {
127 if (error < 0) 193 error = -1;
128 goto out; 194 break;
129 if (!error) 195 } else if (ustate == UCODE_OK)
130 microcode_ops->apply_microcode(cpu); 196 apply_microcode_on_target(cpu);
131 } 197 }
132out: 198
133 set_cpus_allowed_ptr(current, &old);
134 return error; 199 return error;
135} 200}
136 201
@@ -143,19 +208,17 @@ static int microcode_open(struct inode *unused1, struct file *unused2)
143static ssize_t microcode_write(struct file *file, const char __user *buf, 208static ssize_t microcode_write(struct file *file, const char __user *buf,
144 size_t len, loff_t *ppos) 209 size_t len, loff_t *ppos)
145{ 210{
146 ssize_t ret; 211 ssize_t ret = -EINVAL;
147 212
148 if ((len >> PAGE_SHIFT) > num_physpages) { 213 if ((len >> PAGE_SHIFT) > num_physpages) {
149 printk(KERN_ERR "microcode: too much data (max %ld pages)\n", 214 pr_err("microcode: too much data (max %ld pages)\n", num_physpages);
150 num_physpages); 215 return ret;
151 return -EINVAL;
152 } 216 }
153 217
154 get_online_cpus(); 218 get_online_cpus();
155 mutex_lock(&microcode_mutex); 219 mutex_lock(&microcode_mutex);
156 220
157 ret = do_microcode_update(buf, len); 221 if (do_microcode_update(buf, len) == 0)
158 if (!ret)
159 ret = (ssize_t)len; 222 ret = (ssize_t)len;
160 223
161 mutex_unlock(&microcode_mutex); 224 mutex_unlock(&microcode_mutex);
@@ -165,15 +228,16 @@ static ssize_t microcode_write(struct file *file, const char __user *buf,
165} 228}
166 229
167static const struct file_operations microcode_fops = { 230static const struct file_operations microcode_fops = {
168 .owner = THIS_MODULE, 231 .owner = THIS_MODULE,
169 .write = microcode_write, 232 .write = microcode_write,
170 .open = microcode_open, 233 .open = microcode_open,
171}; 234};
172 235
173static struct miscdevice microcode_dev = { 236static struct miscdevice microcode_dev = {
174 .minor = MICROCODE_MINOR, 237 .minor = MICROCODE_MINOR,
175 .name = "microcode", 238 .name = "microcode",
176 .fops = &microcode_fops, 239 .devnode = "cpu/microcode",
240 .fops = &microcode_fops,
177}; 241};
178 242
179static int __init microcode_dev_init(void) 243static int __init microcode_dev_init(void)
@@ -182,9 +246,7 @@ static int __init microcode_dev_init(void)
182 246
183 error = misc_register(&microcode_dev); 247 error = misc_register(&microcode_dev);
184 if (error) { 248 if (error) {
185 printk(KERN_ERR 249 pr_err("microcode: can't misc_register on minor=%d\n", MICROCODE_MINOR);
186 "microcode: can't misc_register on minor=%d\n",
187 MICROCODE_MINOR);
188 return error; 250 return error;
189 } 251 }
190 252
@@ -205,42 +267,51 @@ MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
205/* fake device for request_firmware */ 267/* fake device for request_firmware */
206static struct platform_device *microcode_pdev; 268static struct platform_device *microcode_pdev;
207 269
208static long reload_for_cpu(void *unused) 270static int reload_for_cpu(int cpu)
209{ 271{
210 struct ucode_cpu_info *uci = ucode_cpu_info + smp_processor_id(); 272 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
211 int err = 0; 273 int err = 0;
212 274
213 mutex_lock(&microcode_mutex); 275 mutex_lock(&microcode_mutex);
214 if (uci->valid) { 276 if (uci->valid) {
215 err = microcode_ops->request_microcode_fw(smp_processor_id(), 277 enum ucode_state ustate;
216 &microcode_pdev->dev); 278
217 if (!err) 279 ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev);
218 microcode_ops->apply_microcode(smp_processor_id()); 280 if (ustate == UCODE_OK)
281 apply_microcode_on_target(cpu);
282 else
283 if (ustate == UCODE_ERROR)
284 err = -EINVAL;
219 } 285 }
220 mutex_unlock(&microcode_mutex); 286 mutex_unlock(&microcode_mutex);
287
221 return err; 288 return err;
222} 289}
223 290
224static ssize_t reload_store(struct sys_device *dev, 291static ssize_t reload_store(struct sys_device *dev,
225 struct sysdev_attribute *attr, 292 struct sysdev_attribute *attr,
226 const char *buf, size_t sz) 293 const char *buf, size_t size)
227{ 294{
228 char *end; 295 unsigned long val;
229 unsigned long val = simple_strtoul(buf, &end, 0);
230 int err = 0;
231 int cpu = dev->id; 296 int cpu = dev->id;
297 int ret = 0;
298 char *end;
232 299
300 val = simple_strtoul(buf, &end, 0);
233 if (end == buf) 301 if (end == buf)
234 return -EINVAL; 302 return -EINVAL;
303
235 if (val == 1) { 304 if (val == 1) {
236 get_online_cpus(); 305 get_online_cpus();
237 if (cpu_online(cpu)) 306 if (cpu_online(cpu))
238 err = work_on_cpu(cpu, reload_for_cpu, NULL); 307 ret = reload_for_cpu(cpu);
239 put_online_cpus(); 308 put_online_cpus();
240 } 309 }
241 if (err) 310
242 return err; 311 if (!ret)
243 return sz; 312 ret = size;
313
314 return ret;
244} 315}
245 316
246static ssize_t version_show(struct sys_device *dev, 317static ssize_t version_show(struct sys_device *dev,
@@ -271,11 +342,11 @@ static struct attribute *mc_default_attrs[] = {
271}; 342};
272 343
273static struct attribute_group mc_attr_group = { 344static struct attribute_group mc_attr_group = {
274 .attrs = mc_default_attrs, 345 .attrs = mc_default_attrs,
275 .name = "microcode", 346 .name = "microcode",
276}; 347};
277 348
278static void __microcode_fini_cpu(int cpu) 349static void microcode_fini_cpu(int cpu)
279{ 350{
280 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 351 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
281 352
@@ -283,103 +354,68 @@ static void __microcode_fini_cpu(int cpu)
283 uci->valid = 0; 354 uci->valid = 0;
284} 355}
285 356
286static void microcode_fini_cpu(int cpu) 357static enum ucode_state microcode_resume_cpu(int cpu)
287{
288 mutex_lock(&microcode_mutex);
289 __microcode_fini_cpu(cpu);
290 mutex_unlock(&microcode_mutex);
291}
292
293static void collect_cpu_info(int cpu)
294{ 358{
295 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 359 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
296 360
297 memset(uci, 0, sizeof(*uci)); 361 if (!uci->mc)
298 if (!microcode_ops->collect_cpu_info(cpu, &uci->cpu_sig)) 362 return UCODE_NFOUND;
299 uci->valid = 1; 363
364 pr_debug("microcode: CPU%d updated upon resume\n", cpu);
365 apply_microcode_on_target(cpu);
366
367 return UCODE_OK;
300} 368}
301 369
302static int microcode_resume_cpu(int cpu) 370static enum ucode_state microcode_init_cpu(int cpu)
303{ 371{
304 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 372 enum ucode_state ustate;
305 struct cpu_signature nsig;
306 373
307 pr_debug("microcode: CPU%d resumed\n", cpu); 374 if (collect_cpu_info(cpu))
375 return UCODE_ERROR;
308 376
309 if (!uci->mc) 377 /* --dimm. Trigger a delayed update? */
310 return 1; 378 if (system_state != SYSTEM_RUNNING)
379 return UCODE_NFOUND;
311 380
312 /* 381 ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev);
313 * Let's verify that the 'cached' ucode does belong
314 * to this cpu (a bit of paranoia):
315 */
316 if (microcode_ops->collect_cpu_info(cpu, &nsig)) {
317 __microcode_fini_cpu(cpu);
318 printk(KERN_ERR "failed to collect_cpu_info for resuming cpu #%d\n",
319 cpu);
320 return -1;
321 }
322 382
323 if ((nsig.sig != uci->cpu_sig.sig) || (nsig.pf != uci->cpu_sig.pf)) { 383 if (ustate == UCODE_OK) {
324 __microcode_fini_cpu(cpu); 384 pr_debug("microcode: CPU%d updated upon init\n", cpu);
325 printk(KERN_ERR "cached ucode doesn't match the resuming cpu #%d\n", 385 apply_microcode_on_target(cpu);
326 cpu);
327 /* Should we look for a new ucode here? */
328 return 1;
329 } 386 }
330 387
331 return 0; 388 return ustate;
332} 389}
333 390
334static long microcode_update_cpu(void *unused) 391static enum ucode_state microcode_update_cpu(int cpu)
335{ 392{
336 struct ucode_cpu_info *uci = ucode_cpu_info + smp_processor_id(); 393 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
337 int err = 0; 394 enum ucode_state ustate;
338 395
339 /* 396 if (uci->valid)
340 * Check if the system resume is in progress (uci->valid != NULL), 397 ustate = microcode_resume_cpu(cpu);
341 * otherwise just request a firmware: 398 else
342 */ 399 ustate = microcode_init_cpu(cpu);
343 if (uci->valid) {
344 err = microcode_resume_cpu(smp_processor_id());
345 } else {
346 collect_cpu_info(smp_processor_id());
347 if (uci->valid && system_state == SYSTEM_RUNNING)
348 err = microcode_ops->request_microcode_fw(
349 smp_processor_id(),
350 &microcode_pdev->dev);
351 }
352 if (!err)
353 microcode_ops->apply_microcode(smp_processor_id());
354 return err;
355}
356 400
357static int microcode_init_cpu(int cpu) 401 return ustate;
358{
359 int err;
360 mutex_lock(&microcode_mutex);
361 err = work_on_cpu(cpu, microcode_update_cpu, NULL);
362 mutex_unlock(&microcode_mutex);
363
364 return err;
365} 402}
366 403
367static int mc_sysdev_add(struct sys_device *sys_dev) 404static int mc_sysdev_add(struct sys_device *sys_dev)
368{ 405{
369 int err, cpu = sys_dev->id; 406 int err, cpu = sys_dev->id;
370 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
371 407
372 if (!cpu_online(cpu)) 408 if (!cpu_online(cpu))
373 return 0; 409 return 0;
374 410
375 pr_debug("microcode: CPU%d added\n", cpu); 411 pr_debug("microcode: CPU%d added\n", cpu);
376 memset(uci, 0, sizeof(*uci));
377 412
378 err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group); 413 err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group);
379 if (err) 414 if (err)
380 return err; 415 return err;
381 416
382 err = microcode_init_cpu(cpu); 417 if (microcode_init_cpu(cpu) == UCODE_ERROR)
418 err = -EINVAL;
383 419
384 return err; 420 return err;
385} 421}
@@ -400,19 +436,30 @@ static int mc_sysdev_remove(struct sys_device *sys_dev)
400static int mc_sysdev_resume(struct sys_device *dev) 436static int mc_sysdev_resume(struct sys_device *dev)
401{ 437{
402 int cpu = dev->id; 438 int cpu = dev->id;
439 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
403 440
404 if (!cpu_online(cpu)) 441 if (!cpu_online(cpu))
405 return 0; 442 return 0;
406 443
407 /* only CPU 0 will apply ucode here */ 444 /*
408 microcode_update_cpu(NULL); 445 * All non-bootup cpus are still disabled,
446 * so only CPU 0 will apply ucode here.
447 *
448 * Moreover, there can be no concurrent
449 * updates from any other places at this point.
450 */
451 WARN_ON(cpu != 0);
452
453 if (uci->valid && uci->mc)
454 microcode_ops->apply_microcode(cpu);
455
409 return 0; 456 return 0;
410} 457}
411 458
412static struct sysdev_driver mc_sysdev_driver = { 459static struct sysdev_driver mc_sysdev_driver = {
413 .add = mc_sysdev_add, 460 .add = mc_sysdev_add,
414 .remove = mc_sysdev_remove, 461 .remove = mc_sysdev_remove,
415 .resume = mc_sysdev_resume, 462 .resume = mc_sysdev_resume,
416}; 463};
417 464
418static __cpuinit int 465static __cpuinit int
@@ -425,15 +472,12 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
425 switch (action) { 472 switch (action) {
426 case CPU_ONLINE: 473 case CPU_ONLINE:
427 case CPU_ONLINE_FROZEN: 474 case CPU_ONLINE_FROZEN:
428 if (microcode_init_cpu(cpu)) 475 microcode_update_cpu(cpu);
429 printk(KERN_ERR "microcode: failed to init CPU%d\n",
430 cpu);
431 case CPU_DOWN_FAILED: 476 case CPU_DOWN_FAILED:
432 case CPU_DOWN_FAILED_FROZEN: 477 case CPU_DOWN_FAILED_FROZEN:
433 pr_debug("microcode: CPU%d added\n", cpu); 478 pr_debug("microcode: CPU%d added\n", cpu);
434 if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group)) 479 if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group))
435 printk(KERN_ERR "microcode: Failed to create the sysfs " 480 pr_err("microcode: Failed to create group for CPU%d\n", cpu);
436 "group for CPU%d\n", cpu);
437 break; 481 break;
438 case CPU_DOWN_PREPARE: 482 case CPU_DOWN_PREPARE:
439 case CPU_DOWN_PREPARE_FROZEN: 483 case CPU_DOWN_PREPARE_FROZEN:
@@ -465,13 +509,10 @@ static int __init microcode_init(void)
465 microcode_ops = init_amd_microcode(); 509 microcode_ops = init_amd_microcode();
466 510
467 if (!microcode_ops) { 511 if (!microcode_ops) {
468 printk(KERN_ERR "microcode: no support for this CPU vendor\n"); 512 pr_err("microcode: no support for this CPU vendor\n");
469 return -ENODEV; 513 return -ENODEV;
470 } 514 }
471 515
472 error = microcode_dev_init();
473 if (error)
474 return error;
475 microcode_pdev = platform_device_register_simple("microcode", -1, 516 microcode_pdev = platform_device_register_simple("microcode", -1,
476 NULL, 0); 517 NULL, 0);
477 if (IS_ERR(microcode_pdev)) { 518 if (IS_ERR(microcode_pdev)) {
@@ -480,23 +521,31 @@ static int __init microcode_init(void)
480 } 521 }
481 522
482 get_online_cpus(); 523 get_online_cpus();
524 mutex_lock(&microcode_mutex);
525
483 error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver); 526 error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver);
527
528 mutex_unlock(&microcode_mutex);
484 put_online_cpus(); 529 put_online_cpus();
530
485 if (error) { 531 if (error) {
486 microcode_dev_exit();
487 platform_device_unregister(microcode_pdev); 532 platform_device_unregister(microcode_pdev);
488 return error; 533 return error;
489 } 534 }
490 535
536 error = microcode_dev_init();
537 if (error)
538 return error;
539
491 register_hotcpu_notifier(&mc_cpu_notifier); 540 register_hotcpu_notifier(&mc_cpu_notifier);
492 541
493 printk(KERN_INFO 542 pr_info("Microcode Update Driver: v" MICROCODE_VERSION
494 "Microcode Update Driver: v" MICROCODE_VERSION
495 " <tigran@aivazian.fsnet.co.uk>," 543 " <tigran@aivazian.fsnet.co.uk>,"
496 " Peter Oruba\n"); 544 " Peter Oruba\n");
497 545
498 return 0; 546 return 0;
499} 547}
548module_init(microcode_init);
500 549
501static void __exit microcode_exit(void) 550static void __exit microcode_exit(void)
502{ 551{
@@ -505,16 +554,17 @@ static void __exit microcode_exit(void)
505 unregister_hotcpu_notifier(&mc_cpu_notifier); 554 unregister_hotcpu_notifier(&mc_cpu_notifier);
506 555
507 get_online_cpus(); 556 get_online_cpus();
557 mutex_lock(&microcode_mutex);
558
508 sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver); 559 sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver);
560
561 mutex_unlock(&microcode_mutex);
509 put_online_cpus(); 562 put_online_cpus();
510 563
511 platform_device_unregister(microcode_pdev); 564 platform_device_unregister(microcode_pdev);
512 565
513 microcode_ops = NULL; 566 microcode_ops = NULL;
514 567
515 printk(KERN_INFO 568 pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n");
516 "Microcode Update Driver: v" MICROCODE_VERSION " removed.\n");
517} 569}
518
519module_init(microcode_init);
520module_exit(microcode_exit); 570module_exit(microcode_exit);
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
index 149b9ec7c1ab..0d334ddd0a96 100644
--- a/arch/x86/kernel/microcode_intel.c
+++ b/arch/x86/kernel/microcode_intel.c
@@ -70,24 +70,11 @@
70 * Fix sigmatch() macro to handle old CPUs with pf == 0. 70 * Fix sigmatch() macro to handle old CPUs with pf == 0.
71 * Thanks to Stuart Swales for pointing out this bug. 71 * Thanks to Stuart Swales for pointing out this bug.
72 */ 72 */
73#include <linux/platform_device.h>
74#include <linux/capability.h>
75#include <linux/miscdevice.h>
76#include <linux/firmware.h> 73#include <linux/firmware.h>
77#include <linux/smp_lock.h>
78#include <linux/spinlock.h>
79#include <linux/cpumask.h>
80#include <linux/uaccess.h> 74#include <linux/uaccess.h>
81#include <linux/vmalloc.h>
82#include <linux/kernel.h> 75#include <linux/kernel.h>
83#include <linux/module.h> 76#include <linux/module.h>
84#include <linux/mutex.h> 77#include <linux/vmalloc.h>
85#include <linux/sched.h>
86#include <linux/init.h>
87#include <linux/slab.h>
88#include <linux/cpu.h>
89#include <linux/fs.h>
90#include <linux/mm.h>
91 78
92#include <asm/microcode.h> 79#include <asm/microcode.h>
93#include <asm/processor.h> 80#include <asm/processor.h>
@@ -150,13 +137,9 @@ struct extended_sigtable {
150 137
151#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE) 138#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE)
152 139
153/* serialize access to the physical write to MSR 0x79 */
154static DEFINE_SPINLOCK(microcode_update_lock);
155
156static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) 140static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
157{ 141{
158 struct cpuinfo_x86 *c = &cpu_data(cpu_num); 142 struct cpuinfo_x86 *c = &cpu_data(cpu_num);
159 unsigned long flags;
160 unsigned int val[2]; 143 unsigned int val[2];
161 144
162 memset(csig, 0, sizeof(*csig)); 145 memset(csig, 0, sizeof(*csig));
@@ -176,18 +159,14 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
176 csig->pf = 1 << ((val[1] >> 18) & 7); 159 csig->pf = 1 << ((val[1] >> 18) & 7);
177 } 160 }
178 161
179 /* serialize access to the physical write to MSR 0x79 */
180 spin_lock_irqsave(&microcode_update_lock, flags);
181
182 wrmsr(MSR_IA32_UCODE_REV, 0, 0); 162 wrmsr(MSR_IA32_UCODE_REV, 0, 0);
183 /* see notes above for revision 1.07. Apparent chip bug */ 163 /* see notes above for revision 1.07. Apparent chip bug */
184 sync_core(); 164 sync_core();
185 /* get the current revision from MSR 0x8B */ 165 /* get the current revision from MSR 0x8B */
186 rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev); 166 rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev);
187 spin_unlock_irqrestore(&microcode_update_lock, flags);
188 167
189 pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n", 168 printk(KERN_INFO "microcode: CPU%d sig=0x%x, pf=0x%x, revision=0x%x\n",
190 csig->sig, csig->pf, csig->rev); 169 cpu_num, csig->sig, csig->pf, csig->rev);
191 170
192 return 0; 171 return 0;
193} 172}
@@ -318,11 +297,10 @@ get_matching_microcode(struct cpu_signature *cpu_sig, void *mc, int rev)
318 return 0; 297 return 0;
319} 298}
320 299
321static void apply_microcode(int cpu) 300static int apply_microcode(int cpu)
322{ 301{
323 struct microcode_intel *mc_intel; 302 struct microcode_intel *mc_intel;
324 struct ucode_cpu_info *uci; 303 struct ucode_cpu_info *uci;
325 unsigned long flags;
326 unsigned int val[2]; 304 unsigned int val[2];
327 int cpu_num; 305 int cpu_num;
328 306
@@ -334,10 +312,7 @@ static void apply_microcode(int cpu)
334 BUG_ON(cpu_num != cpu); 312 BUG_ON(cpu_num != cpu);
335 313
336 if (mc_intel == NULL) 314 if (mc_intel == NULL)
337 return; 315 return 0;
338
339 /* serialize access to the physical write to MSR 0x79 */
340 spin_lock_irqsave(&microcode_update_lock, flags);
341 316
342 /* write microcode via MSR 0x79 */ 317 /* write microcode via MSR 0x79 */
343 wrmsr(MSR_IA32_UCODE_WRITE, 318 wrmsr(MSR_IA32_UCODE_WRITE,
@@ -351,30 +326,32 @@ static void apply_microcode(int cpu)
351 /* get the current revision from MSR 0x8B */ 326 /* get the current revision from MSR 0x8B */
352 rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); 327 rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
353 328
354 spin_unlock_irqrestore(&microcode_update_lock, flags);
355 if (val[1] != mc_intel->hdr.rev) { 329 if (val[1] != mc_intel->hdr.rev) {
356 printk(KERN_ERR "microcode: CPU%d update from revision " 330 printk(KERN_ERR "microcode: CPU%d update "
357 "0x%x to 0x%x failed\n", 331 "to revision 0x%x failed\n",
358 cpu_num, uci->cpu_sig.rev, val[1]); 332 cpu_num, mc_intel->hdr.rev);
359 return; 333 return -1;
360 } 334 }
361 printk(KERN_INFO "microcode: CPU%d updated from revision " 335 printk(KERN_INFO "microcode: CPU%d updated to revision "
362 "0x%x to 0x%x, date = %04x-%02x-%02x \n", 336 "0x%x, date = %04x-%02x-%02x \n",
363 cpu_num, uci->cpu_sig.rev, val[1], 337 cpu_num, val[1],
364 mc_intel->hdr.date & 0xffff, 338 mc_intel->hdr.date & 0xffff,
365 mc_intel->hdr.date >> 24, 339 mc_intel->hdr.date >> 24,
366 (mc_intel->hdr.date >> 16) & 0xff); 340 (mc_intel->hdr.date >> 16) & 0xff);
367 341
368 uci->cpu_sig.rev = val[1]; 342 uci->cpu_sig.rev = val[1];
343
344 return 0;
369} 345}
370 346
371static int generic_load_microcode(int cpu, void *data, size_t size, 347static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
372 int (*get_ucode_data)(void *, const void *, size_t)) 348 int (*get_ucode_data)(void *, const void *, size_t))
373{ 349{
374 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 350 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
375 u8 *ucode_ptr = data, *new_mc = NULL, *mc; 351 u8 *ucode_ptr = data, *new_mc = NULL, *mc;
376 int new_rev = uci->cpu_sig.rev; 352 int new_rev = uci->cpu_sig.rev;
377 unsigned int leftover = size; 353 unsigned int leftover = size;
354 enum ucode_state state = UCODE_OK;
378 355
379 while (leftover) { 356 while (leftover) {
380 struct microcode_header_intel mc_header; 357 struct microcode_header_intel mc_header;
@@ -412,11 +389,15 @@ static int generic_load_microcode(int cpu, void *data, size_t size,
412 leftover -= mc_size; 389 leftover -= mc_size;
413 } 390 }
414 391
415 if (!new_mc) 392 if (leftover) {
393 if (new_mc)
394 vfree(new_mc);
395 state = UCODE_ERROR;
416 goto out; 396 goto out;
397 }
417 398
418 if (leftover) { 399 if (!new_mc) {
419 vfree(new_mc); 400 state = UCODE_NFOUND;
420 goto out; 401 goto out;
421 } 402 }
422 403
@@ -427,9 +408,8 @@ static int generic_load_microcode(int cpu, void *data, size_t size,
427 pr_debug("microcode: CPU%d found a matching microcode update with" 408 pr_debug("microcode: CPU%d found a matching microcode update with"
428 " version 0x%x (current=0x%x)\n", 409 " version 0x%x (current=0x%x)\n",
429 cpu, new_rev, uci->cpu_sig.rev); 410 cpu, new_rev, uci->cpu_sig.rev);
430 411out:
431 out: 412 return state;
432 return (int)leftover;
433} 413}
434 414
435static int get_ucode_fw(void *to, const void *from, size_t n) 415static int get_ucode_fw(void *to, const void *from, size_t n)
@@ -438,21 +418,19 @@ static int get_ucode_fw(void *to, const void *from, size_t n)
438 return 0; 418 return 0;
439} 419}
440 420
441static int request_microcode_fw(int cpu, struct device *device) 421static enum ucode_state request_microcode_fw(int cpu, struct device *device)
442{ 422{
443 char name[30]; 423 char name[30];
444 struct cpuinfo_x86 *c = &cpu_data(cpu); 424 struct cpuinfo_x86 *c = &cpu_data(cpu);
445 const struct firmware *firmware; 425 const struct firmware *firmware;
446 int ret; 426 enum ucode_state ret;
447 427
448 /* We should bind the task to the CPU */
449 BUG_ON(cpu != raw_smp_processor_id());
450 sprintf(name, "intel-ucode/%02x-%02x-%02x", 428 sprintf(name, "intel-ucode/%02x-%02x-%02x",
451 c->x86, c->x86_model, c->x86_mask); 429 c->x86, c->x86_model, c->x86_mask);
452 ret = request_firmware(&firmware, name, device); 430
453 if (ret) { 431 if (request_firmware(&firmware, name, device)) {
454 pr_debug("microcode: data file %s load failed\n", name); 432 pr_debug("microcode: data file %s load failed\n", name);
455 return ret; 433 return UCODE_NFOUND;
456 } 434 }
457 435
458 ret = generic_load_microcode(cpu, (void *)firmware->data, 436 ret = generic_load_microcode(cpu, (void *)firmware->data,
@@ -468,11 +446,9 @@ static int get_ucode_user(void *to, const void *from, size_t n)
468 return copy_from_user(to, from, n); 446 return copy_from_user(to, from, n);
469} 447}
470 448
471static int request_microcode_user(int cpu, const void __user *buf, size_t size) 449static enum ucode_state
450request_microcode_user(int cpu, const void __user *buf, size_t size)
472{ 451{
473 /* We should bind the task to the CPU */
474 BUG_ON(cpu != raw_smp_processor_id());
475
476 return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user); 452 return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user);
477} 453}
478 454
diff --git a/arch/x86/kernel/module_64.c b/arch/x86/kernel/module.c
index c23880b90b5c..89f386f044e4 100644
--- a/arch/x86/kernel/module_64.c
+++ b/arch/x86/kernel/module.c
@@ -1,6 +1,5 @@
1/* Kernel module help for x86-64 1/* Kernel module help for x86.
2 Copyright (C) 2001 Rusty Russell. 2 Copyright (C) 2001 Rusty Russell.
3 Copyright (C) 2002,2003 Andi Kleen, SuSE Labs.
4 3
5 This program is free software; you can redistribute it and/or modify 4 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by 5 it under the terms of the GNU General Public License as published by
@@ -22,23 +21,18 @@
22#include <linux/fs.h> 21#include <linux/fs.h>
23#include <linux/string.h> 22#include <linux/string.h>
24#include <linux/kernel.h> 23#include <linux/kernel.h>
25#include <linux/mm.h>
26#include <linux/slab.h>
27#include <linux/bug.h> 24#include <linux/bug.h>
25#include <linux/mm.h>
28 26
29#include <asm/system.h> 27#include <asm/system.h>
30#include <asm/page.h> 28#include <asm/page.h>
31#include <asm/pgtable.h> 29#include <asm/pgtable.h>
32 30
31#if 0
32#define DEBUGP printk
33#else
33#define DEBUGP(fmt...) 34#define DEBUGP(fmt...)
34 35#endif
35#ifndef CONFIG_UML
36void module_free(struct module *mod, void *module_region)
37{
38 vfree(module_region);
39 /* FIXME: If module_region == mod->init_region, trim exception
40 table entries. */
41}
42 36
43void *module_alloc(unsigned long size) 37void *module_alloc(unsigned long size)
44{ 38{
@@ -54,9 +48,15 @@ void *module_alloc(unsigned long size)
54 if (!area) 48 if (!area)
55 return NULL; 49 return NULL;
56 50
57 return __vmalloc_area(area, GFP_KERNEL, PAGE_KERNEL_EXEC); 51 return __vmalloc_area(area, GFP_KERNEL | __GFP_HIGHMEM,
52 PAGE_KERNEL_EXEC);
53}
54
55/* Free memory returned from module_alloc */
56void module_free(struct module *mod, void *module_region)
57{
58 vfree(module_region);
58} 59}
59#endif
60 60
61/* We don't need anything special. */ 61/* We don't need anything special. */
62int module_frob_arch_sections(Elf_Ehdr *hdr, 62int module_frob_arch_sections(Elf_Ehdr *hdr,
@@ -67,6 +67,58 @@ int module_frob_arch_sections(Elf_Ehdr *hdr,
67 return 0; 67 return 0;
68} 68}
69 69
70#ifdef CONFIG_X86_32
71int apply_relocate(Elf32_Shdr *sechdrs,
72 const char *strtab,
73 unsigned int symindex,
74 unsigned int relsec,
75 struct module *me)
76{
77 unsigned int i;
78 Elf32_Rel *rel = (void *)sechdrs[relsec].sh_addr;
79 Elf32_Sym *sym;
80 uint32_t *location;
81
82 DEBUGP("Applying relocate section %u to %u\n", relsec,
83 sechdrs[relsec].sh_info);
84 for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
85 /* This is where to make the change */
86 location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr
87 + rel[i].r_offset;
88 /* This is the symbol it is referring to. Note that all
89 undefined symbols have been resolved. */
90 sym = (Elf32_Sym *)sechdrs[symindex].sh_addr
91 + ELF32_R_SYM(rel[i].r_info);
92
93 switch (ELF32_R_TYPE(rel[i].r_info)) {
94 case R_386_32:
95 /* We add the value into the location given */
96 *location += sym->st_value;
97 break;
98 case R_386_PC32:
99 /* Add the value, subtract its postition */
100 *location += sym->st_value - (uint32_t)location;
101 break;
102 default:
103 printk(KERN_ERR "module %s: Unknown relocation: %u\n",
104 me->name, ELF32_R_TYPE(rel[i].r_info));
105 return -ENOEXEC;
106 }
107 }
108 return 0;
109}
110
111int apply_relocate_add(Elf32_Shdr *sechdrs,
112 const char *strtab,
113 unsigned int symindex,
114 unsigned int relsec,
115 struct module *me)
116{
117 printk(KERN_ERR "module %s: ADD RELOCATION unsupported\n",
118 me->name);
119 return -ENOEXEC;
120}
121#else /*X86_64*/
70int apply_relocate_add(Elf64_Shdr *sechdrs, 122int apply_relocate_add(Elf64_Shdr *sechdrs,
71 const char *strtab, 123 const char *strtab,
72 unsigned int symindex, 124 unsigned int symindex,
@@ -147,6 +199,8 @@ int apply_relocate(Elf_Shdr *sechdrs,
147 return -ENOSYS; 199 return -ENOSYS;
148} 200}
149 201
202#endif
203
150int module_finalize(const Elf_Ehdr *hdr, 204int module_finalize(const Elf_Ehdr *hdr,
151 const Elf_Shdr *sechdrs, 205 const Elf_Shdr *sechdrs,
152 struct module *me) 206 struct module *me)
diff --git a/arch/x86/kernel/module_32.c b/arch/x86/kernel/module_32.c
deleted file mode 100644
index 0edd819050e7..000000000000
--- a/arch/x86/kernel/module_32.c
+++ /dev/null
@@ -1,152 +0,0 @@
1/* Kernel module help for i386.
2 Copyright (C) 2001 Rusty Russell.
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17*/
18#include <linux/moduleloader.h>
19#include <linux/elf.h>
20#include <linux/vmalloc.h>
21#include <linux/fs.h>
22#include <linux/string.h>
23#include <linux/kernel.h>
24#include <linux/bug.h>
25
26#if 0
27#define DEBUGP printk
28#else
29#define DEBUGP(fmt...)
30#endif
31
32void *module_alloc(unsigned long size)
33{
34 if (size == 0)
35 return NULL;
36 return vmalloc_exec(size);
37}
38
39
40/* Free memory returned from module_alloc */
41void module_free(struct module *mod, void *module_region)
42{
43 vfree(module_region);
44 /* FIXME: If module_region == mod->init_region, trim exception
45 table entries. */
46}
47
48/* We don't need anything special. */
49int module_frob_arch_sections(Elf_Ehdr *hdr,
50 Elf_Shdr *sechdrs,
51 char *secstrings,
52 struct module *mod)
53{
54 return 0;
55}
56
57int apply_relocate(Elf32_Shdr *sechdrs,
58 const char *strtab,
59 unsigned int symindex,
60 unsigned int relsec,
61 struct module *me)
62{
63 unsigned int i;
64 Elf32_Rel *rel = (void *)sechdrs[relsec].sh_addr;
65 Elf32_Sym *sym;
66 uint32_t *location;
67
68 DEBUGP("Applying relocate section %u to %u\n", relsec,
69 sechdrs[relsec].sh_info);
70 for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
71 /* This is where to make the change */
72 location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr
73 + rel[i].r_offset;
74 /* This is the symbol it is referring to. Note that all
75 undefined symbols have been resolved. */
76 sym = (Elf32_Sym *)sechdrs[symindex].sh_addr
77 + ELF32_R_SYM(rel[i].r_info);
78
79 switch (ELF32_R_TYPE(rel[i].r_info)) {
80 case R_386_32:
81 /* We add the value into the location given */
82 *location += sym->st_value;
83 break;
84 case R_386_PC32:
85 /* Add the value, subtract its postition */
86 *location += sym->st_value - (uint32_t)location;
87 break;
88 default:
89 printk(KERN_ERR "module %s: Unknown relocation: %u\n",
90 me->name, ELF32_R_TYPE(rel[i].r_info));
91 return -ENOEXEC;
92 }
93 }
94 return 0;
95}
96
97int apply_relocate_add(Elf32_Shdr *sechdrs,
98 const char *strtab,
99 unsigned int symindex,
100 unsigned int relsec,
101 struct module *me)
102{
103 printk(KERN_ERR "module %s: ADD RELOCATION unsupported\n",
104 me->name);
105 return -ENOEXEC;
106}
107
108int module_finalize(const Elf_Ehdr *hdr,
109 const Elf_Shdr *sechdrs,
110 struct module *me)
111{
112 const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL,
113 *para = NULL;
114 char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
115
116 for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
117 if (!strcmp(".text", secstrings + s->sh_name))
118 text = s;
119 if (!strcmp(".altinstructions", secstrings + s->sh_name))
120 alt = s;
121 if (!strcmp(".smp_locks", secstrings + s->sh_name))
122 locks = s;
123 if (!strcmp(".parainstructions", secstrings + s->sh_name))
124 para = s;
125 }
126
127 if (alt) {
128 /* patch .altinstructions */
129 void *aseg = (void *)alt->sh_addr;
130 apply_alternatives(aseg, aseg + alt->sh_size);
131 }
132 if (locks && text) {
133 void *lseg = (void *)locks->sh_addr;
134 void *tseg = (void *)text->sh_addr;
135 alternatives_smp_module_add(me, me->name,
136 lseg, lseg + locks->sh_size,
137 tseg, tseg + text->sh_size);
138 }
139
140 if (para) {
141 void *pseg = (void *)para->sh_addr;
142 apply_paravirt(pseg, pseg + para->sh_size);
143 }
144
145 return module_bug_finalize(hdr, sechdrs, me);
146}
147
148void module_arch_cleanup(struct module *mod)
149{
150 alternatives_smp_module_del(mod);
151 module_bug_cleanup(mod);
152}
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 70fd7e414c15..651c93b28862 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -17,6 +17,7 @@
17#include <linux/acpi.h> 17#include <linux/acpi.h>
18#include <linux/module.h> 18#include <linux/module.h>
19#include <linux/smp.h> 19#include <linux/smp.h>
20#include <linux/pci.h>
20 21
21#include <asm/mtrr.h> 22#include <asm/mtrr.h>
22#include <asm/mpspec.h> 23#include <asm/mpspec.h>
@@ -870,24 +871,17 @@ static
870inline void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {} 871inline void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {}
871#endif /* CONFIG_X86_IO_APIC */ 872#endif /* CONFIG_X86_IO_APIC */
872 873
873static int check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, 874static int
874 int count) 875check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count)
875{ 876{
876 if (!mpc_new_phys) { 877 int ret = 0;
877 pr_info("No spare slots, try to append...take your risk, " 878
878 "new mpc_length %x\n", count); 879 if (!mpc_new_phys || count <= mpc_new_length) {
879 } else { 880 WARN(1, "update_mptable: No spare slots (length: %x)\n", count);
880 if (count <= mpc_new_length) 881 return -1;
881 pr_info("No spare slots, try to append..., "
882 "new mpc_length %x\n", count);
883 else {
884 pr_err("mpc_new_length %lx is too small\n",
885 mpc_new_length);
886 return -1;
887 }
888 } 882 }
889 883
890 return 0; 884 return ret;
891} 885}
892 886
893static int __init replace_intsrc_all(struct mpc_table *mpc, 887static int __init replace_intsrc_all(struct mpc_table *mpc,
@@ -946,7 +940,7 @@ static int __init replace_intsrc_all(struct mpc_table *mpc,
946 } else { 940 } else {
947 struct mpc_intsrc *m = (struct mpc_intsrc *)mpt; 941 struct mpc_intsrc *m = (struct mpc_intsrc *)mpt;
948 count += sizeof(struct mpc_intsrc); 942 count += sizeof(struct mpc_intsrc);
949 if (!check_slot(mpc_new_phys, mpc_new_length, count)) 943 if (check_slot(mpc_new_phys, mpc_new_length, count) < 0)
950 goto out; 944 goto out;
951 assign_to_mpc_intsrc(&mp_irqs[i], m); 945 assign_to_mpc_intsrc(&mp_irqs[i], m);
952 mpc->length = count; 946 mpc->length = count;
@@ -963,11 +957,14 @@ out:
963 return 0; 957 return 0;
964} 958}
965 959
966static int __initdata enable_update_mptable; 960int enable_update_mptable;
967 961
968static int __init update_mptable_setup(char *str) 962static int __init update_mptable_setup(char *str)
969{ 963{
970 enable_update_mptable = 1; 964 enable_update_mptable = 1;
965#ifdef CONFIG_PCI
966 pci_routeirq = 1;
967#endif
971 return 0; 968 return 0;
972} 969}
973early_param("update_mptable", update_mptable_setup); 970early_param("update_mptable", update_mptable_setup);
@@ -980,6 +977,9 @@ static int __initdata alloc_mptable;
980static int __init parse_alloc_mptable_opt(char *p) 977static int __init parse_alloc_mptable_opt(char *p)
981{ 978{
982 enable_update_mptable = 1; 979 enable_update_mptable = 1;
980#ifdef CONFIG_PCI
981 pci_routeirq = 1;
982#endif
983 alloc_mptable = 1; 983 alloc_mptable = 1;
984 if (!p) 984 if (!p)
985 return 0; 985 return 0;
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 3cf3413ec626..98fd6cd4e3a4 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -196,6 +196,11 @@ static struct notifier_block __refdata msr_class_cpu_notifier = {
196 .notifier_call = msr_class_cpu_callback, 196 .notifier_call = msr_class_cpu_callback,
197}; 197};
198 198
199static char *msr_nodename(struct device *dev)
200{
201 return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt));
202}
203
199static int __init msr_init(void) 204static int __init msr_init(void)
200{ 205{
201 int i, err = 0; 206 int i, err = 0;
@@ -212,6 +217,7 @@ static int __init msr_init(void)
212 err = PTR_ERR(msr_class); 217 err = PTR_ERR(msr_class);
213 goto out_chrdev; 218 goto out_chrdev;
214 } 219 }
220 msr_class->nodename = msr_nodename;
215 for_each_online_cpu(i) { 221 for_each_online_cpu(i) {
216 err = msr_device_create(i); 222 err = msr_device_create(i);
217 if (err != 0) 223 if (err != 0)
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 8e45f4464880..70ec9b951d76 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -134,7 +134,9 @@ static void *get_call_destination(u8 type)
134 .pv_irq_ops = pv_irq_ops, 134 .pv_irq_ops = pv_irq_ops,
135 .pv_apic_ops = pv_apic_ops, 135 .pv_apic_ops = pv_apic_ops,
136 .pv_mmu_ops = pv_mmu_ops, 136 .pv_mmu_ops = pv_mmu_ops,
137#ifdef CONFIG_PARAVIRT_SPINLOCKS
137 .pv_lock_ops = pv_lock_ops, 138 .pv_lock_ops = pv_lock_ops,
139#endif
138 }; 140 };
139 return *((void **)&tmpl + type); 141 return *((void **)&tmpl + type);
140} 142}
@@ -246,18 +248,16 @@ static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LA
246 248
247static inline void enter_lazy(enum paravirt_lazy_mode mode) 249static inline void enter_lazy(enum paravirt_lazy_mode mode)
248{ 250{
249 BUG_ON(__get_cpu_var(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE); 251 BUG_ON(percpu_read(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE);
250 BUG_ON(preemptible());
251 252
252 __get_cpu_var(paravirt_lazy_mode) = mode; 253 percpu_write(paravirt_lazy_mode, mode);
253} 254}
254 255
255void paravirt_leave_lazy(enum paravirt_lazy_mode mode) 256static void leave_lazy(enum paravirt_lazy_mode mode)
256{ 257{
257 BUG_ON(__get_cpu_var(paravirt_lazy_mode) != mode); 258 BUG_ON(percpu_read(paravirt_lazy_mode) != mode);
258 BUG_ON(preemptible());
259 259
260 __get_cpu_var(paravirt_lazy_mode) = PARAVIRT_LAZY_NONE; 260 percpu_write(paravirt_lazy_mode, PARAVIRT_LAZY_NONE);
261} 261}
262 262
263void paravirt_enter_lazy_mmu(void) 263void paravirt_enter_lazy_mmu(void)
@@ -267,22 +267,36 @@ void paravirt_enter_lazy_mmu(void)
267 267
268void paravirt_leave_lazy_mmu(void) 268void paravirt_leave_lazy_mmu(void)
269{ 269{
270 paravirt_leave_lazy(PARAVIRT_LAZY_MMU); 270 leave_lazy(PARAVIRT_LAZY_MMU);
271} 271}
272 272
273void paravirt_enter_lazy_cpu(void) 273void paravirt_start_context_switch(struct task_struct *prev)
274{ 274{
275 BUG_ON(preemptible());
276
277 if (percpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) {
278 arch_leave_lazy_mmu_mode();
279 set_ti_thread_flag(task_thread_info(prev), TIF_LAZY_MMU_UPDATES);
280 }
275 enter_lazy(PARAVIRT_LAZY_CPU); 281 enter_lazy(PARAVIRT_LAZY_CPU);
276} 282}
277 283
278void paravirt_leave_lazy_cpu(void) 284void paravirt_end_context_switch(struct task_struct *next)
279{ 285{
280 paravirt_leave_lazy(PARAVIRT_LAZY_CPU); 286 BUG_ON(preemptible());
287
288 leave_lazy(PARAVIRT_LAZY_CPU);
289
290 if (test_and_clear_ti_thread_flag(task_thread_info(next), TIF_LAZY_MMU_UPDATES))
291 arch_enter_lazy_mmu_mode();
281} 292}
282 293
283enum paravirt_lazy_mode paravirt_get_lazy_mode(void) 294enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
284{ 295{
285 return __get_cpu_var(paravirt_lazy_mode); 296 if (in_interrupt())
297 return PARAVIRT_LAZY_NONE;
298
299 return percpu_read(paravirt_lazy_mode);
286} 300}
287 301
288void arch_flush_lazy_mmu_mode(void) 302void arch_flush_lazy_mmu_mode(void)
@@ -290,7 +304,6 @@ void arch_flush_lazy_mmu_mode(void)
290 preempt_disable(); 304 preempt_disable();
291 305
292 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) { 306 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
293 WARN_ON(preempt_count() == 1);
294 arch_leave_lazy_mmu_mode(); 307 arch_leave_lazy_mmu_mode();
295 arch_enter_lazy_mmu_mode(); 308 arch_enter_lazy_mmu_mode();
296 } 309 }
@@ -298,19 +311,6 @@ void arch_flush_lazy_mmu_mode(void)
298 preempt_enable(); 311 preempt_enable();
299} 312}
300 313
301void arch_flush_lazy_cpu_mode(void)
302{
303 preempt_disable();
304
305 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) {
306 WARN_ON(preempt_count() == 1);
307 arch_leave_lazy_cpu_mode();
308 arch_enter_lazy_cpu_mode();
309 }
310
311 preempt_enable();
312}
313
314struct pv_info pv_info = { 314struct pv_info pv_info = {
315 .name = "bare hardware", 315 .name = "bare hardware",
316 .paravirt_enabled = 0, 316 .paravirt_enabled = 0,
@@ -402,10 +402,8 @@ struct pv_cpu_ops pv_cpu_ops = {
402 .set_iopl_mask = native_set_iopl_mask, 402 .set_iopl_mask = native_set_iopl_mask,
403 .io_delay = native_io_delay, 403 .io_delay = native_io_delay,
404 404
405 .lazy_mode = { 405 .start_context_switch = paravirt_nop,
406 .enter = paravirt_nop, 406 .end_context_switch = paravirt_nop,
407 .leave = paravirt_nop,
408 },
409}; 407};
410 408
411struct pv_apic_ops pv_apic_ops = { 409struct pv_apic_ops pv_apic_ops = {
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 755c21e906f3..971a3bec47a8 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -186,37 +186,6 @@ static struct cal_chipset_ops calioc2_chip_ops = {
186 186
187static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, }; 187static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, };
188 188
189/* enable this to stress test the chip's TCE cache */
190#ifdef CONFIG_IOMMU_DEBUG
191static int debugging = 1;
192
193static inline unsigned long verify_bit_range(unsigned long* bitmap,
194 int expected, unsigned long start, unsigned long end)
195{
196 unsigned long idx = start;
197
198 BUG_ON(start >= end);
199
200 while (idx < end) {
201 if (!!test_bit(idx, bitmap) != expected)
202 return idx;
203 ++idx;
204 }
205
206 /* all bits have the expected value */
207 return ~0UL;
208}
209#else /* debugging is disabled */
210static int debugging;
211
212static inline unsigned long verify_bit_range(unsigned long* bitmap,
213 int expected, unsigned long start, unsigned long end)
214{
215 return ~0UL;
216}
217
218#endif /* CONFIG_IOMMU_DEBUG */
219
220static inline int translation_enabled(struct iommu_table *tbl) 189static inline int translation_enabled(struct iommu_table *tbl)
221{ 190{
222 /* only PHBs with translation enabled have an IOMMU table */ 191 /* only PHBs with translation enabled have an IOMMU table */
@@ -228,7 +197,6 @@ static void iommu_range_reserve(struct iommu_table *tbl,
228{ 197{
229 unsigned long index; 198 unsigned long index;
230 unsigned long end; 199 unsigned long end;
231 unsigned long badbit;
232 unsigned long flags; 200 unsigned long flags;
233 201
234 index = start_addr >> PAGE_SHIFT; 202 index = start_addr >> PAGE_SHIFT;
@@ -243,14 +211,6 @@ static void iommu_range_reserve(struct iommu_table *tbl,
243 211
244 spin_lock_irqsave(&tbl->it_lock, flags); 212 spin_lock_irqsave(&tbl->it_lock, flags);
245 213
246 badbit = verify_bit_range(tbl->it_map, 0, index, end);
247 if (badbit != ~0UL) {
248 if (printk_ratelimit())
249 printk(KERN_ERR "Calgary: entry already allocated at "
250 "0x%lx tbl %p dma 0x%lx npages %u\n",
251 badbit, tbl, start_addr, npages);
252 }
253
254 iommu_area_reserve(tbl->it_map, index, npages); 214 iommu_area_reserve(tbl->it_map, index, npages);
255 215
256 spin_unlock_irqrestore(&tbl->it_lock, flags); 216 spin_unlock_irqrestore(&tbl->it_lock, flags);
@@ -326,7 +286,6 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
326 unsigned int npages) 286 unsigned int npages)
327{ 287{
328 unsigned long entry; 288 unsigned long entry;
329 unsigned long badbit;
330 unsigned long badend; 289 unsigned long badend;
331 unsigned long flags; 290 unsigned long flags;
332 291
@@ -346,14 +305,6 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
346 305
347 spin_lock_irqsave(&tbl->it_lock, flags); 306 spin_lock_irqsave(&tbl->it_lock, flags);
348 307
349 badbit = verify_bit_range(tbl->it_map, 1, entry, entry + npages);
350 if (badbit != ~0UL) {
351 if (printk_ratelimit())
352 printk(KERN_ERR "Calgary: bit is off at 0x%lx "
353 "tbl %p dma 0x%Lx entry 0x%lx npages %u\n",
354 badbit, tbl, dma_addr, entry, npages);
355 }
356
357 iommu_area_free(tbl->it_map, entry, npages); 308 iommu_area_free(tbl->it_map, entry, npages);
358 309
359 spin_unlock_irqrestore(&tbl->it_lock, flags); 310 spin_unlock_irqrestore(&tbl->it_lock, flags);
@@ -1488,9 +1439,8 @@ void __init detect_calgary(void)
1488 iommu_detected = 1; 1439 iommu_detected = 1;
1489 calgary_detected = 1; 1440 calgary_detected = 1;
1490 printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected.\n"); 1441 printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected.\n");
1491 printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d, " 1442 printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d\n",
1492 "CONFIG_IOMMU_DEBUG is %s.\n", specified_table_size, 1443 specified_table_size);
1493 debugging ? "enabled" : "disabled");
1494 1444
1495 /* swiotlb for devices that aren't behind the Calgary. */ 1445 /* swiotlb for devices that aren't behind the Calgary. */
1496 if (max_pfn > MAX_DMA32_PFN) 1446 if (max_pfn > MAX_DMA32_PFN)
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index b284b58c035c..cfd9f9063896 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -144,48 +144,21 @@ static void flush_gart(void)
144} 144}
145 145
146#ifdef CONFIG_IOMMU_LEAK 146#ifdef CONFIG_IOMMU_LEAK
147
148#define SET_LEAK(x) \
149 do { \
150 if (iommu_leak_tab) \
151 iommu_leak_tab[x] = __builtin_return_address(0);\
152 } while (0)
153
154#define CLEAR_LEAK(x) \
155 do { \
156 if (iommu_leak_tab) \
157 iommu_leak_tab[x] = NULL; \
158 } while (0)
159
160/* Debugging aid for drivers that don't free their IOMMU tables */ 147/* Debugging aid for drivers that don't free their IOMMU tables */
161static void **iommu_leak_tab;
162static int leak_trace; 148static int leak_trace;
163static int iommu_leak_pages = 20; 149static int iommu_leak_pages = 20;
164 150
165static void dump_leak(void) 151static void dump_leak(void)
166{ 152{
167 int i;
168 static int dump; 153 static int dump;
169 154
170 if (dump || !iommu_leak_tab) 155 if (dump)
171 return; 156 return;
172 dump = 1; 157 dump = 1;
173 show_stack(NULL, NULL);
174 158
175 /* Very crude. dump some from the end of the table too */ 159 show_stack(NULL, NULL);
176 printk(KERN_DEBUG "Dumping %d pages from end of IOMMU:\n", 160 debug_dma_dump_mappings(NULL);
177 iommu_leak_pages);
178 for (i = 0; i < iommu_leak_pages; i += 2) {
179 printk(KERN_DEBUG "%lu: ", iommu_pages-i);
180 printk_address((unsigned long) iommu_leak_tab[iommu_pages-i],
181 0);
182 printk(KERN_CONT "%c", (i+1)%2 == 0 ? '\n' : ' ');
183 }
184 printk(KERN_DEBUG "\n");
185} 161}
186#else
187# define SET_LEAK(x)
188# define CLEAR_LEAK(x)
189#endif 162#endif
190 163
191static void iommu_full(struct device *dev, size_t size, int dir) 164static void iommu_full(struct device *dev, size_t size, int dir)
@@ -248,7 +221,6 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
248 221
249 for (i = 0; i < npages; i++) { 222 for (i = 0; i < npages; i++) {
250 iommu_gatt_base[iommu_page + i] = GPTE_ENCODE(phys_mem); 223 iommu_gatt_base[iommu_page + i] = GPTE_ENCODE(phys_mem);
251 SET_LEAK(iommu_page + i);
252 phys_mem += PAGE_SIZE; 224 phys_mem += PAGE_SIZE;
253 } 225 }
254 return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK); 226 return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK);
@@ -294,7 +266,6 @@ static void gart_unmap_page(struct device *dev, dma_addr_t dma_addr,
294 npages = iommu_num_pages(dma_addr, size, PAGE_SIZE); 266 npages = iommu_num_pages(dma_addr, size, PAGE_SIZE);
295 for (i = 0; i < npages; i++) { 267 for (i = 0; i < npages; i++) {
296 iommu_gatt_base[iommu_page + i] = gart_unmapped_entry; 268 iommu_gatt_base[iommu_page + i] = gart_unmapped_entry;
297 CLEAR_LEAK(iommu_page + i);
298 } 269 }
299 free_iommu(iommu_page, npages); 270 free_iommu(iommu_page, npages);
300} 271}
@@ -377,7 +348,6 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start,
377 pages = iommu_num_pages(s->offset, s->length, PAGE_SIZE); 348 pages = iommu_num_pages(s->offset, s->length, PAGE_SIZE);
378 while (pages--) { 349 while (pages--) {
379 iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); 350 iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr);
380 SET_LEAK(iommu_page);
381 addr += PAGE_SIZE; 351 addr += PAGE_SIZE;
382 iommu_page++; 352 iommu_page++;
383 } 353 }
@@ -688,8 +658,6 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
688 658
689 agp_gatt_table = gatt; 659 agp_gatt_table = gatt;
690 660
691 enable_gart_translations();
692
693 error = sysdev_class_register(&gart_sysdev_class); 661 error = sysdev_class_register(&gart_sysdev_class);
694 if (!error) 662 if (!error)
695 error = sysdev_register(&device_gart); 663 error = sysdev_register(&device_gart);
@@ -801,11 +769,12 @@ void __init gart_iommu_init(void)
801 769
802#ifdef CONFIG_IOMMU_LEAK 770#ifdef CONFIG_IOMMU_LEAK
803 if (leak_trace) { 771 if (leak_trace) {
804 iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, 772 int ret;
805 get_order(iommu_pages*sizeof(void *))); 773
806 if (!iommu_leak_tab) 774 ret = dma_debug_resize_entries(iommu_pages);
775 if (ret)
807 printk(KERN_DEBUG 776 printk(KERN_DEBUG
808 "PCI-DMA: Cannot allocate leak trace area\n"); 777 "PCI-DMA: Cannot trace all the entries\n");
809 } 778 }
810#endif 779#endif
811 780
@@ -845,6 +814,14 @@ void __init gart_iommu_init(void)
845 * the pages as Not-Present: 814 * the pages as Not-Present:
846 */ 815 */
847 wbinvd(); 816 wbinvd();
817
818 /*
819 * Now all caches are flushed and we can safely enable
820 * GART hardware. Doing it early leaves the possibility
821 * of stale cache entries that can lead to GART PTE
822 * errors.
823 */
824 enable_gart_translations();
848 825
849 /* 826 /*
850 * Try to workaround a bug (thanks to BenH): 827 * Try to workaround a bug (thanks to BenH):
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index 221a3853e268..a1712f2b50f1 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -28,7 +28,7 @@ dma_addr_t swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t paddr)
28 return paddr; 28 return paddr;
29} 29}
30 30
31phys_addr_t swiotlb_bus_to_phys(dma_addr_t baddr) 31phys_addr_t swiotlb_bus_to_phys(struct device *hwdev, dma_addr_t baddr)
32{ 32{
33 return baddr; 33 return baddr;
34} 34}
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 19a686c401b5..fc6e4b773fc4 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -8,9 +8,11 @@
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/pm.h> 9#include <linux/pm.h>
10#include <linux/clockchips.h> 10#include <linux/clockchips.h>
11#include <linux/random.h>
11#include <trace/power.h> 12#include <trace/power.h>
12#include <asm/system.h> 13#include <asm/system.h>
13#include <asm/apic.h> 14#include <asm/apic.h>
15#include <asm/syscalls.h>
14#include <asm/idle.h> 16#include <asm/idle.h>
15#include <asm/uaccess.h> 17#include <asm/uaccess.h>
16#include <asm/i387.h> 18#include <asm/i387.h>
@@ -65,7 +67,7 @@ void arch_task_cache_init(void)
65 task_xstate_cachep = 67 task_xstate_cachep =
66 kmem_cache_create("task_xstate", xstate_size, 68 kmem_cache_create("task_xstate", xstate_size,
67 __alignof__(union thread_xstate), 69 __alignof__(union thread_xstate),
68 SLAB_PANIC, NULL); 70 SLAB_PANIC | SLAB_NOTRACK, NULL);
69} 71}
70 72
71/* 73/*
@@ -604,3 +606,16 @@ static int __init idle_setup(char *str)
604} 606}
605early_param("idle", idle_setup); 607early_param("idle", idle_setup);
606 608
609unsigned long arch_align_stack(unsigned long sp)
610{
611 if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
612 sp -= get_random_int() % 8192;
613 return sp & ~0xf;
614}
615
616unsigned long arch_randomize_brk(struct mm_struct *mm)
617{
618 unsigned long range_end = mm->brk + 0x02000000;
619 return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
620}
621
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 297ffff2ffc2..00a8fe4c58bb 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -9,8 +9,6 @@
9 * This file handles the architecture-dependent parts of process handling.. 9 * This file handles the architecture-dependent parts of process handling..
10 */ 10 */
11 11
12#include <stdarg.h>
13
14#include <linux/stackprotector.h> 12#include <linux/stackprotector.h>
15#include <linux/cpu.h> 13#include <linux/cpu.h>
16#include <linux/errno.h> 14#include <linux/errno.h>
@@ -33,7 +31,6 @@
33#include <linux/module.h> 31#include <linux/module.h>
34#include <linux/kallsyms.h> 32#include <linux/kallsyms.h>
35#include <linux/ptrace.h> 33#include <linux/ptrace.h>
36#include <linux/random.h>
37#include <linux/personality.h> 34#include <linux/personality.h>
38#include <linux/tick.h> 35#include <linux/tick.h>
39#include <linux/percpu.h> 36#include <linux/percpu.h>
@@ -419,7 +416,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
419 * done before math_state_restore, so the TS bit is up 416 * done before math_state_restore, so the TS bit is up
420 * to date. 417 * to date.
421 */ 418 */
422 arch_leave_lazy_cpu_mode(); 419 arch_end_context_switch(next_p);
423 420
424 /* If the task has used fpu the last 5 timeslices, just do a full 421 /* If the task has used fpu the last 5 timeslices, just do a full
425 * restore of the math state immediately to avoid the trap; the 422 * restore of the math state immediately to avoid the trap; the
@@ -526,15 +523,3 @@ unsigned long get_wchan(struct task_struct *p)
526 return 0; 523 return 0;
527} 524}
528 525
529unsigned long arch_align_stack(unsigned long sp)
530{
531 if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
532 sp -= get_random_int() % 8192;
533 return sp & ~0xf;
534}
535
536unsigned long arch_randomize_brk(struct mm_struct *mm)
537{
538 unsigned long range_end = mm->brk + 0x02000000;
539 return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
540}
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index f7b276d4b3fb..89c46f1259d3 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -14,8 +14,6 @@
14 * This file handles the architecture-dependent parts of process handling.. 14 * This file handles the architecture-dependent parts of process handling..
15 */ 15 */
16 16
17#include <stdarg.h>
18
19#include <linux/stackprotector.h> 17#include <linux/stackprotector.h>
20#include <linux/cpu.h> 18#include <linux/cpu.h>
21#include <linux/errno.h> 19#include <linux/errno.h>
@@ -32,7 +30,6 @@
32#include <linux/delay.h> 30#include <linux/delay.h>
33#include <linux/module.h> 31#include <linux/module.h>
34#include <linux/ptrace.h> 32#include <linux/ptrace.h>
35#include <linux/random.h>
36#include <linux/notifier.h> 33#include <linux/notifier.h>
37#include <linux/kprobes.h> 34#include <linux/kprobes.h>
38#include <linux/kdebug.h> 35#include <linux/kdebug.h>
@@ -442,7 +439,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
442 * done before math_state_restore, so the TS bit is up 439 * done before math_state_restore, so the TS bit is up
443 * to date. 440 * to date.
444 */ 441 */
445 arch_leave_lazy_cpu_mode(); 442 arch_end_context_switch(next_p);
446 443
447 /* 444 /*
448 * Switch FS and GS. 445 * Switch FS and GS.
@@ -692,15 +689,3 @@ long sys_arch_prctl(int code, unsigned long addr)
692 return do_arch_prctl(current, code, addr); 689 return do_arch_prctl(current, code, addr);
693} 690}
694 691
695unsigned long arch_align_stack(unsigned long sp)
696{
697 if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
698 sp -= get_random_int() % 8192;
699 return sp & ~0xf;
700}
701
702unsigned long arch_randomize_brk(struct mm_struct *mm)
703{
704 unsigned long range_end = mm->brk + 0x02000000;
705 return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
706}
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 7563b31b4f03..af71d06624bf 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -491,5 +491,42 @@ void force_hpet_resume(void)
491 break; 491 break;
492 } 492 }
493} 493}
494#endif
495
496#if defined(CONFIG_PCI) && defined(CONFIG_NUMA)
497/* Set correct numa_node information for AMD NB functions */
498static void __init quirk_amd_nb_node(struct pci_dev *dev)
499{
500 struct pci_dev *nb_ht;
501 unsigned int devfn;
502 u32 val;
503
504 devfn = PCI_DEVFN(PCI_SLOT(dev->devfn), 0);
505 nb_ht = pci_get_slot(dev->bus, devfn);
506 if (!nb_ht)
507 return;
508
509 pci_read_config_dword(nb_ht, 0x60, &val);
510 set_dev_node(&dev->dev, val & 7);
511 pci_dev_put(dev);
512}
494 513
514DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB,
515 quirk_amd_nb_node);
516DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_ADDRMAP,
517 quirk_amd_nb_node);
518DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MEMCTL,
519 quirk_amd_nb_node);
520DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC,
521 quirk_amd_nb_node);
522DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_HT,
523 quirk_amd_nb_node);
524DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MAP,
525 quirk_amd_nb_node);
526DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_DRAM,
527 quirk_amd_nb_node);
528DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC,
529 quirk_amd_nb_node);
530DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_LINK,
531 quirk_amd_nb_node);
495#endif 532#endif
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 1340dad417f4..d2d1ce8170f0 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -192,6 +192,15 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
192 DMI_MATCH(DMI_BOARD_NAME, "0KP561"), 192 DMI_MATCH(DMI_BOARD_NAME, "0KP561"),
193 }, 193 },
194 }, 194 },
195 { /* Handle problems with rebooting on Dell Optiplex 360 with 0T656F */
196 .callback = set_bios_reboot,
197 .ident = "Dell OptiPlex 360",
198 .matches = {
199 DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
200 DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 360"),
201 DMI_MATCH(DMI_BOARD_NAME, "0T656F"),
202 },
203 },
195 { /* Handle problems with rebooting on Dell 2400's */ 204 { /* Handle problems with rebooting on Dell 2400's */
196 .callback = set_bios_reboot, 205 .callback = set_bios_reboot,
197 .ident = "Dell PowerEdge 2400", 206 .ident = "Dell PowerEdge 2400",
@@ -232,6 +241,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
232 DMI_MATCH(DMI_PRODUCT_NAME, "Dell DXP061"), 241 DMI_MATCH(DMI_PRODUCT_NAME, "Dell DXP061"),
233 }, 242 },
234 }, 243 },
244 { /* Handle problems with rebooting on Sony VGN-Z540N */
245 .callback = set_bios_reboot,
246 .ident = "Sony VGN-Z540N",
247 .matches = {
248 DMI_MATCH(DMI_SYS_VENDOR, "Sony Corporation"),
249 DMI_MATCH(DMI_PRODUCT_NAME, "VGN-Z540N"),
250 },
251 },
235 { } 252 { }
236}; 253};
237 254
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index b4158439bf63..be5ae80f897f 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -112,6 +112,14 @@
112#define ARCH_SETUP 112#define ARCH_SETUP
113#endif 113#endif
114 114
115/*
116 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
117 * The direct mapping extends to max_pfn_mapped, so that we can directly access
118 * apertures, ACPI and other tables without having to play with fixmaps.
119 */
120unsigned long max_low_pfn_mapped;
121unsigned long max_pfn_mapped;
122
115RESERVE_BRK(dmi_alloc, 65536); 123RESERVE_BRK(dmi_alloc, 65536);
116 124
117unsigned int boot_cpu_id __read_mostly; 125unsigned int boot_cpu_id __read_mostly;
@@ -214,8 +222,8 @@ unsigned long mmu_cr4_features;
214unsigned long mmu_cr4_features = X86_CR4_PAE; 222unsigned long mmu_cr4_features = X86_CR4_PAE;
215#endif 223#endif
216 224
217/* Boot loader ID as an integer, for the benefit of proc_dointvec */ 225/* Boot loader ID and version as integers, for the benefit of proc_dointvec */
218int bootloader_type; 226int bootloader_type, bootloader_version;
219 227
220/* 228/*
221 * Setup options 229 * Setup options
@@ -293,15 +301,13 @@ static void __init reserve_brk(void)
293 301
294#ifdef CONFIG_BLK_DEV_INITRD 302#ifdef CONFIG_BLK_DEV_INITRD
295 303
296#ifdef CONFIG_X86_32
297
298#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT) 304#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
299static void __init relocate_initrd(void) 305static void __init relocate_initrd(void)
300{ 306{
301 307
302 u64 ramdisk_image = boot_params.hdr.ramdisk_image; 308 u64 ramdisk_image = boot_params.hdr.ramdisk_image;
303 u64 ramdisk_size = boot_params.hdr.ramdisk_size; 309 u64 ramdisk_size = boot_params.hdr.ramdisk_size;
304 u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT; 310 u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT;
305 u64 ramdisk_here; 311 u64 ramdisk_here;
306 unsigned long slop, clen, mapaddr; 312 unsigned long slop, clen, mapaddr;
307 char *p, *q; 313 char *p, *q;
@@ -357,14 +363,13 @@ static void __init relocate_initrd(void)
357 ramdisk_image, ramdisk_image + ramdisk_size - 1, 363 ramdisk_image, ramdisk_image + ramdisk_size - 1,
358 ramdisk_here, ramdisk_here + ramdisk_size - 1); 364 ramdisk_here, ramdisk_here + ramdisk_size - 1);
359} 365}
360#endif
361 366
362static void __init reserve_initrd(void) 367static void __init reserve_initrd(void)
363{ 368{
364 u64 ramdisk_image = boot_params.hdr.ramdisk_image; 369 u64 ramdisk_image = boot_params.hdr.ramdisk_image;
365 u64 ramdisk_size = boot_params.hdr.ramdisk_size; 370 u64 ramdisk_size = boot_params.hdr.ramdisk_size;
366 u64 ramdisk_end = ramdisk_image + ramdisk_size; 371 u64 ramdisk_end = ramdisk_image + ramdisk_size;
367 u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT; 372 u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT;
368 373
369 if (!boot_params.hdr.type_of_loader || 374 if (!boot_params.hdr.type_of_loader ||
370 !ramdisk_image || !ramdisk_size) 375 !ramdisk_image || !ramdisk_size)
@@ -394,14 +399,8 @@ static void __init reserve_initrd(void)
394 return; 399 return;
395 } 400 }
396 401
397#ifdef CONFIG_X86_32
398 relocate_initrd(); 402 relocate_initrd();
399#else 403
400 printk(KERN_ERR "initrd extends beyond end of memory "
401 "(0x%08llx > 0x%08llx)\ndisabling initrd\n",
402 ramdisk_end, end_of_lowmem);
403 initrd_start = 0;
404#endif
405 free_early(ramdisk_image, ramdisk_end); 404 free_early(ramdisk_image, ramdisk_end);
406} 405}
407#else 406#else
@@ -706,6 +705,12 @@ void __init setup_arch(char **cmdline_p)
706#endif 705#endif
707 saved_video_mode = boot_params.hdr.vid_mode; 706 saved_video_mode = boot_params.hdr.vid_mode;
708 bootloader_type = boot_params.hdr.type_of_loader; 707 bootloader_type = boot_params.hdr.type_of_loader;
708 if ((bootloader_type >> 4) == 0xe) {
709 bootloader_type &= 0xf;
710 bootloader_type |= (boot_params.hdr.ext_loader_type+0x10) << 4;
711 }
712 bootloader_version = bootloader_type & 0xf;
713 bootloader_version |= boot_params.hdr.ext_loader_ver << 4;
709 714
710#ifdef CONFIG_BLK_DEV_RAM 715#ifdef CONFIG_BLK_DEV_RAM
711 rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK; 716 rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
@@ -854,12 +859,16 @@ void __init setup_arch(char **cmdline_p)
854 max_low_pfn = max_pfn; 859 max_low_pfn = max_pfn;
855 860
856 high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; 861 high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
862 max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT;
857#endif 863#endif
858 864
859#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION 865#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
860 setup_bios_corruption_check(); 866 setup_bios_corruption_check();
861#endif 867#endif
862 868
869 printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n",
870 max_pfn_mapped<<PAGE_SHIFT);
871
863 reserve_brk(); 872 reserve_brk();
864 873
865 /* max_pfn_mapped is updated here */ 874 /* max_pfn_mapped is updated here */
@@ -997,24 +1006,6 @@ void __init setup_arch(char **cmdline_p)
997#ifdef CONFIG_X86_32 1006#ifdef CONFIG_X86_32
998 1007
999/** 1008/**
1000 * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors
1001 *
1002 * Description:
1003 * Perform any necessary interrupt initialisation prior to setting up
1004 * the "ordinary" interrupt call gates. For legacy reasons, the ISA
1005 * interrupts should be initialised here if the machine emulates a PC
1006 * in any way.
1007 **/
1008void __init x86_quirk_pre_intr_init(void)
1009{
1010 if (x86_quirks->arch_pre_intr_init) {
1011 if (x86_quirks->arch_pre_intr_init())
1012 return;
1013 }
1014 init_ISA_irqs();
1015}
1016
1017/**
1018 * x86_quirk_intr_init - post gate setup interrupt initialisation 1009 * x86_quirk_intr_init - post gate setup interrupt initialisation
1019 * 1010 *
1020 * Description: 1011 * Description:
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 3a97a4cf1872..9c3f0823e6aa 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -160,8 +160,10 @@ static ssize_t __init setup_pcpu_remap(size_t static_size)
160 /* 160 /*
161 * If large page isn't supported, there's no benefit in doing 161 * If large page isn't supported, there's no benefit in doing
162 * this. Also, on non-NUMA, embedding is better. 162 * this. Also, on non-NUMA, embedding is better.
163 *
164 * NOTE: disabled for now.
163 */ 165 */
164 if (!cpu_has_pse || !pcpu_need_numa()) 166 if (true || !cpu_has_pse || !pcpu_need_numa())
165 return -EINVAL; 167 return -EINVAL;
166 168
167 /* 169 /*
@@ -423,6 +425,14 @@ void __init setup_per_cpu_areas(void)
423 early_per_cpu_ptr(x86_cpu_to_node_map) = NULL; 425 early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
424#endif 426#endif
425 427
428#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA)
429 /*
430 * make sure boot cpu node_number is right, when boot cpu is on the
431 * node that doesn't have mem installed
432 */
433 per_cpu(node_number, boot_cpu_id) = cpu_to_node(boot_cpu_id);
434#endif
435
426 /* Setup node to cpumask map */ 436 /* Setup node to cpumask map */
427 setup_node_to_cpumask_map(); 437 setup_node_to_cpumask_map();
428 438
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index f33d2e0ef095..0f89a4f20db2 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -6,7 +6,6 @@
6 * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes 6 * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes
7 * 2000-2002 x86-64 support by Andi Kleen 7 * 2000-2002 x86-64 support by Andi Kleen
8 */ 8 */
9
10#include <linux/sched.h> 9#include <linux/sched.h>
11#include <linux/mm.h> 10#include <linux/mm.h>
12#include <linux/smp.h> 11#include <linux/smp.h>
@@ -25,11 +24,11 @@
25#include <asm/ucontext.h> 24#include <asm/ucontext.h>
26#include <asm/i387.h> 25#include <asm/i387.h>
27#include <asm/vdso.h> 26#include <asm/vdso.h>
27#include <asm/mce.h>
28 28
29#ifdef CONFIG_X86_64 29#ifdef CONFIG_X86_64
30#include <asm/proto.h> 30#include <asm/proto.h>
31#include <asm/ia32_unistd.h> 31#include <asm/ia32_unistd.h>
32#include <asm/mce.h>
33#endif /* CONFIG_X86_64 */ 32#endif /* CONFIG_X86_64 */
34 33
35#include <asm/syscall.h> 34#include <asm/syscall.h>
@@ -848,10 +847,10 @@ static void do_signal(struct pt_regs *regs)
848void 847void
849do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) 848do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
850{ 849{
851#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE) 850#ifdef CONFIG_X86_NEW_MCE
852 /* notify userspace of pending MCEs */ 851 /* notify userspace of pending MCEs */
853 if (thread_info_flags & _TIF_MCE_NOTIFY) 852 if (thread_info_flags & _TIF_MCE_NOTIFY)
854 mce_notify_user(); 853 mce_notify_process();
855#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */ 854#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */
856 855
857 /* deal with pending signal delivery */ 856 /* deal with pending signal delivery */
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 13f33ea8ccaa..ec1de97600e7 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -150,14 +150,40 @@ void native_send_call_func_ipi(const struct cpumask *mask)
150 * this function calls the 'stop' function on all other CPUs in the system. 150 * this function calls the 'stop' function on all other CPUs in the system.
151 */ 151 */
152 152
153asmlinkage void smp_reboot_interrupt(void)
154{
155 ack_APIC_irq();
156 irq_enter();
157 stop_this_cpu(NULL);
158 irq_exit();
159}
160
153static void native_smp_send_stop(void) 161static void native_smp_send_stop(void)
154{ 162{
155 unsigned long flags; 163 unsigned long flags;
164 unsigned long wait;
156 165
157 if (reboot_force) 166 if (reboot_force)
158 return; 167 return;
159 168
160 smp_call_function(stop_this_cpu, NULL, 0); 169 /*
170 * Use an own vector here because smp_call_function
171 * does lots of things not suitable in a panic situation.
172 * On most systems we could also use an NMI here,
173 * but there are a few systems around where NMI
174 * is problematic so stay with an non NMI for now
175 * (this implies we cannot stop CPUs spinning with irq off
176 * currently)
177 */
178 if (num_online_cpus() > 1) {
179 apic->send_IPI_allbutself(REBOOT_VECTOR);
180
181 /* Don't wait longer than a second */
182 wait = USEC_PER_SEC;
183 while (num_online_cpus() > 1 && wait--)
184 udelay(1);
185 }
186
161 local_irq_save(flags); 187 local_irq_save(flags);
162 disable_local_APIC(); 188 disable_local_APIC();
163 local_irq_restore(flags); 189 local_irq_restore(flags);
@@ -172,6 +198,9 @@ void smp_reschedule_interrupt(struct pt_regs *regs)
172{ 198{
173 ack_APIC_irq(); 199 ack_APIC_irq();
174 inc_irq_stat(irq_resched_count); 200 inc_irq_stat(irq_resched_count);
201 /*
202 * KVM uses this interrupt to force a cpu out of guest mode
203 */
175} 204}
176 205
177void smp_call_function_interrupt(struct pt_regs *regs) 206void smp_call_function_interrupt(struct pt_regs *regs)
@@ -193,19 +222,19 @@ void smp_call_function_single_interrupt(struct pt_regs *regs)
193} 222}
194 223
195struct smp_ops smp_ops = { 224struct smp_ops smp_ops = {
196 .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu, 225 .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu,
197 .smp_prepare_cpus = native_smp_prepare_cpus, 226 .smp_prepare_cpus = native_smp_prepare_cpus,
198 .smp_cpus_done = native_smp_cpus_done, 227 .smp_cpus_done = native_smp_cpus_done,
199 228
200 .smp_send_stop = native_smp_send_stop, 229 .smp_send_stop = native_smp_send_stop,
201 .smp_send_reschedule = native_smp_send_reschedule, 230 .smp_send_reschedule = native_smp_send_reschedule,
202 231
203 .cpu_up = native_cpu_up, 232 .cpu_up = native_cpu_up,
204 .cpu_die = native_cpu_die, 233 .cpu_die = native_cpu_die,
205 .cpu_disable = native_cpu_disable, 234 .cpu_disable = native_cpu_disable,
206 .play_dead = native_play_dead, 235 .play_dead = native_play_dead,
207 236
208 .send_call_func_ipi = native_send_call_func_ipi, 237 .send_call_func_ipi = native_send_call_func_ipi,
209 .send_call_func_single_ipi = native_send_call_func_single_ipi, 238 .send_call_func_single_ipi = native_send_call_func_single_ipi,
210}; 239};
211EXPORT_SYMBOL_GPL(smp_ops); 240EXPORT_SYMBOL_GPL(smp_ops);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 2b2652d205c0..dee0f3d814af 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -506,7 +506,7 @@ void __inquire_remote_apic(int apicid)
506 * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this 506 * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this
507 * won't ... remember to clear down the APIC, etc later. 507 * won't ... remember to clear down the APIC, etc later.
508 */ 508 */
509int __devinit 509int __cpuinit
510wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip) 510wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip)
511{ 511{
512 unsigned long send_status, accept_status = 0; 512 unsigned long send_status, accept_status = 0;
@@ -540,7 +540,7 @@ wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip)
540 return (send_status | accept_status); 540 return (send_status | accept_status);
541} 541}
542 542
543int __devinit 543static int __cpuinit
544wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) 544wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
545{ 545{
546 unsigned long send_status, accept_status = 0; 546 unsigned long send_status, accept_status = 0;
@@ -824,10 +824,12 @@ do_rest:
824 /* mark "stuck" area as not stuck */ 824 /* mark "stuck" area as not stuck */
825 *((volatile unsigned long *)trampoline_base) = 0; 825 *((volatile unsigned long *)trampoline_base) = 0;
826 826
827 /* 827 if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {
828 * Cleanup possible dangling ends... 828 /*
829 */ 829 * Cleanup possible dangling ends...
830 smpboot_restore_warm_reset_vector(); 830 */
831 smpboot_restore_warm_reset_vector();
832 }
831 833
832 return boot_error; 834 return boot_error;
833} 835}
@@ -873,7 +875,7 @@ int __cpuinit native_cpu_up(unsigned int cpu)
873 875
874 err = do_boot_cpu(apicid, cpu); 876 err = do_boot_cpu(apicid, cpu);
875 877
876 zap_low_mappings(); 878 zap_low_mappings(false);
877 low_mappings = 0; 879 low_mappings = 0;
878#else 880#else
879 err = do_boot_cpu(apicid, cpu); 881 err = do_boot_cpu(apicid, cpu);
@@ -992,10 +994,12 @@ static int __init smp_sanity_check(unsigned max_cpus)
992 */ 994 */
993 if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid]) && 995 if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid]) &&
994 !cpu_has_apic) { 996 !cpu_has_apic) {
995 printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n", 997 if (!disable_apic) {
996 boot_cpu_physical_apicid); 998 pr_err("BIOS bug, local APIC #%d not detected!...\n",
997 printk(KERN_ERR "... forcing use of dummy APIC emulation." 999 boot_cpu_physical_apicid);
1000 pr_err("... forcing use of dummy APIC emulation."
998 "(tell your hw vendor)\n"); 1001 "(tell your hw vendor)\n");
1002 }
999 smpboot_clear_io_apic(); 1003 smpboot_clear_io_apic();
1000 arch_disable_smp_support(); 1004 arch_disable_smp_support();
1001 return -1; 1005 return -1;
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index 4aaf7e48394f..c3eb207181fe 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -77,6 +77,13 @@ void save_stack_trace(struct stack_trace *trace)
77} 77}
78EXPORT_SYMBOL_GPL(save_stack_trace); 78EXPORT_SYMBOL_GPL(save_stack_trace);
79 79
80void save_stack_trace_bp(struct stack_trace *trace, unsigned long bp)
81{
82 dump_trace(current, NULL, NULL, bp, &save_stack_ops, trace);
83 if (trace->nr_entries < trace->max_entries)
84 trace->entries[trace->nr_entries++] = ULONG_MAX;
85}
86
80void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) 87void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
81{ 88{
82 dump_trace(tsk, NULL, NULL, 0, &save_stack_ops_nosched, trace); 89 dump_trace(tsk, NULL, NULL, 0, &save_stack_ops_nosched, trace);
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index ff5c8736b491..d51321ddafda 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -334,3 +334,5 @@ ENTRY(sys_call_table)
334 .long sys_inotify_init1 334 .long sys_inotify_init1
335 .long sys_preadv 335 .long sys_preadv
336 .long sys_pwritev 336 .long sys_pwritev
337 .long sys_rt_tgsigqueueinfo /* 335 */
338 .long sys_perf_counter_open
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index ed0c33761e6d..124d40c575df 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -715,7 +715,12 @@ uv_activation_descriptor_init(int node, int pnode)
715 struct bau_desc *adp; 715 struct bau_desc *adp;
716 struct bau_desc *ad2; 716 struct bau_desc *ad2;
717 717
718 adp = (struct bau_desc *)kmalloc_node(16384, GFP_KERNEL, node); 718 /*
719 * each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR)
720 * per cpu; and up to 32 (UV_ADP_SIZE) cpu's per blade
721 */
722 adp = (struct bau_desc *)kmalloc_node(sizeof(struct bau_desc)*
723 UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node);
719 BUG_ON(!adp); 724 BUG_ON(!adp);
720 725
721 pa = uv_gpa(adp); /* need the real nasid*/ 726 pa = uv_gpa(adp); /* need the real nasid*/
@@ -729,7 +734,13 @@ uv_activation_descriptor_init(int node, int pnode)
729 (n << UV_DESC_BASE_PNODE_SHIFT | m)); 734 (n << UV_DESC_BASE_PNODE_SHIFT | m));
730 } 735 }
731 736
732 for (i = 0, ad2 = adp; i < UV_ACTIVATION_DESCRIPTOR_SIZE; i++, ad2++) { 737 /*
738 * initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each
739 * cpu even though we only use the first one; one descriptor can
740 * describe a broadcast to 256 nodes.
741 */
742 for (i = 0, ad2 = adp; i < (UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR);
743 i++, ad2++) {
733 memset(ad2, 0, sizeof(struct bau_desc)); 744 memset(ad2, 0, sizeof(struct bau_desc));
734 ad2->header.sw_ack_flag = 1; 745 ad2->header.sw_ack_flag = 1;
735 /* 746 /*
@@ -832,7 +843,7 @@ static int __init uv_bau_init(void)
832 return 0; 843 return 0;
833 844
834 for_each_possible_cpu(cur_cpu) 845 for_each_possible_cpu(cur_cpu)
835 alloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu), 846 zalloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu),
836 GFP_KERNEL, cpu_to_node(cur_cpu)); 847 GFP_KERNEL, cpu_to_node(cur_cpu));
837 848
838 uv_bau_retry_limit = 1; 849 uv_bau_retry_limit = 1;
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 124a4d5a95b2..286d64eba31b 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -45,6 +45,7 @@
45#include <linux/edac.h> 45#include <linux/edac.h>
46#endif 46#endif
47 47
48#include <asm/kmemcheck.h>
48#include <asm/stacktrace.h> 49#include <asm/stacktrace.h>
49#include <asm/processor.h> 50#include <asm/processor.h>
50#include <asm/debugreg.h> 51#include <asm/debugreg.h>
@@ -534,6 +535,10 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
534 535
535 get_debugreg(dr6, 6); 536 get_debugreg(dr6, 6);
536 537
538 /* Catch kmemcheck conditions first of all! */
539 if ((dr6 & DR_STEP) && kmemcheck_trap(regs))
540 return;
541
537 /* DR6 may or may not be cleared by the CPU */ 542 /* DR6 may or may not be cleared by the CPU */
538 set_debugreg(0, 6); 543 set_debugreg(0, 6);
539 /* 544 /*
@@ -777,15 +782,15 @@ unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp)
777 782
778 return new_kesp; 783 return new_kesp;
779} 784}
780#else 785#endif
786
781asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) 787asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void)
782{ 788{
783} 789}
784 790
785asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void) 791asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void)
786{ 792{
787} 793}
788#endif
789 794
790/* 795/*
791 * 'math_state_restore()' saves the current math information in the 796 * 'math_state_restore()' saves the current math information in the
@@ -818,9 +823,6 @@ asmlinkage void math_state_restore(void)
818 } 823 }
819 824
820 clts(); /* Allow maths ops (or we recurse) */ 825 clts(); /* Allow maths ops (or we recurse) */
821#ifdef CONFIG_X86_32
822 restore_fpu(tsk);
823#else
824 /* 826 /*
825 * Paranoid restore. send a SIGSEGV if we fail to restore the state. 827 * Paranoid restore. send a SIGSEGV if we fail to restore the state.
826 */ 828 */
@@ -829,7 +831,7 @@ asmlinkage void math_state_restore(void)
829 force_sig(SIGSEGV, tsk); 831 force_sig(SIGSEGV, tsk);
830 return; 832 return;
831 } 833 }
832#endif 834
833 thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ 835 thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */
834 tsk->fpu_counter++; 836 tsk->fpu_counter++;
835} 837}
@@ -924,8 +926,13 @@ void __init trap_init(void)
924#endif 926#endif
925 set_intr_gate(19, &simd_coprocessor_error); 927 set_intr_gate(19, &simd_coprocessor_error);
926 928
929 /* Reserve all the builtin and the syscall vector: */
930 for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
931 set_bit(i, used_vectors);
932
927#ifdef CONFIG_IA32_EMULATION 933#ifdef CONFIG_IA32_EMULATION
928 set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall); 934 set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall);
935 set_bit(IA32_SYSCALL_VECTOR, used_vectors);
929#endif 936#endif
930 937
931#ifdef CONFIG_X86_32 938#ifdef CONFIG_X86_32
@@ -942,17 +949,9 @@ void __init trap_init(void)
942 } 949 }
943 950
944 set_system_trap_gate(SYSCALL_VECTOR, &system_call); 951 set_system_trap_gate(SYSCALL_VECTOR, &system_call);
945#endif
946
947 /* Reserve all the builtin and the syscall vector: */
948 for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
949 set_bit(i, used_vectors);
950
951#ifdef CONFIG_X86_64
952 set_bit(IA32_SYSCALL_VECTOR, used_vectors);
953#else
954 set_bit(SYSCALL_VECTOR, used_vectors); 952 set_bit(SYSCALL_VECTOR, used_vectors);
955#endif 953#endif
954
956 /* 955 /*
957 * Should be a barrier for any external CPU state: 956 * Should be a barrier for any external CPU state:
958 */ 957 */
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index d57de05dc430..ae3180c506a6 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -9,6 +9,7 @@
9#include <linux/delay.h> 9#include <linux/delay.h>
10#include <linux/clocksource.h> 10#include <linux/clocksource.h>
11#include <linux/percpu.h> 11#include <linux/percpu.h>
12#include <linux/timex.h>
12 13
13#include <asm/hpet.h> 14#include <asm/hpet.h>
14#include <asm/timer.h> 15#include <asm/timer.h>
@@ -384,13 +385,13 @@ unsigned long native_calibrate_tsc(void)
384{ 385{
385 u64 tsc1, tsc2, delta, ref1, ref2; 386 u64 tsc1, tsc2, delta, ref1, ref2;
386 unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX; 387 unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX;
387 unsigned long flags, latch, ms, fast_calibrate, tsc_khz; 388 unsigned long flags, latch, ms, fast_calibrate, hv_tsc_khz;
388 int hpet = is_hpet_enabled(), i, loopmin; 389 int hpet = is_hpet_enabled(), i, loopmin;
389 390
390 tsc_khz = get_hypervisor_tsc_freq(); 391 hv_tsc_khz = get_hypervisor_tsc_freq();
391 if (tsc_khz) { 392 if (hv_tsc_khz) {
392 printk(KERN_INFO "TSC: Frequency read from the hypervisor\n"); 393 printk(KERN_INFO "TSC: Frequency read from the hypervisor\n");
393 return tsc_khz; 394 return hv_tsc_khz;
394 } 395 }
395 396
396 local_irq_save(flags); 397 local_irq_save(flags);
@@ -710,7 +711,16 @@ static cycle_t read_tsc(struct clocksource *cs)
710#ifdef CONFIG_X86_64 711#ifdef CONFIG_X86_64
711static cycle_t __vsyscall_fn vread_tsc(void) 712static cycle_t __vsyscall_fn vread_tsc(void)
712{ 713{
713 cycle_t ret = (cycle_t)vget_cycles(); 714 cycle_t ret;
715
716 /*
717 * Surround the RDTSC by barriers, to make sure it's not
718 * speculated to outside the seqlock critical section and
719 * does not cause time warps:
720 */
721 rdtsc_barrier();
722 ret = (cycle_t)vget_cycles();
723 rdtsc_barrier();
714 724
715 return ret >= __vsyscall_gtod_data.clock.cycle_last ? 725 return ret >= __vsyscall_gtod_data.clock.cycle_last ?
716 ret : __vsyscall_gtod_data.clock.cycle_last; 726 ret : __vsyscall_gtod_data.clock.cycle_last;
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index bf36328f6ef9..027b5b498993 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -34,6 +34,7 @@ static __cpuinitdata atomic_t stop_count;
34 * of a critical section, to be able to prove TSC time-warps: 34 * of a critical section, to be able to prove TSC time-warps:
35 */ 35 */
36static __cpuinitdata raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED; 36static __cpuinitdata raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED;
37
37static __cpuinitdata cycles_t last_tsc; 38static __cpuinitdata cycles_t last_tsc;
38static __cpuinitdata cycles_t max_warp; 39static __cpuinitdata cycles_t max_warp;
39static __cpuinitdata int nr_warps; 40static __cpuinitdata int nr_warps;
@@ -113,13 +114,12 @@ void __cpuinit check_tsc_sync_source(int cpu)
113 return; 114 return;
114 115
115 if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) { 116 if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) {
116 printk(KERN_INFO 117 pr_info("Skipping synchronization checks as TSC is reliable.\n");
117 "Skipping synchronization checks as TSC is reliable.\n");
118 return; 118 return;
119 } 119 }
120 120
121 printk(KERN_INFO "checking TSC synchronization [CPU#%d -> CPU#%d]:", 121 pr_info("checking TSC synchronization [CPU#%d -> CPU#%d]:",
122 smp_processor_id(), cpu); 122 smp_processor_id(), cpu);
123 123
124 /* 124 /*
125 * Reset it - in case this is a second bootup: 125 * Reset it - in case this is a second bootup:
@@ -143,8 +143,8 @@ void __cpuinit check_tsc_sync_source(int cpu)
143 143
144 if (nr_warps) { 144 if (nr_warps) {
145 printk("\n"); 145 printk("\n");
146 printk(KERN_WARNING "Measured %Ld cycles TSC warp between CPUs," 146 pr_warning("Measured %Ld cycles TSC warp between CPUs, "
147 " turning off TSC clock.\n", max_warp); 147 "turning off TSC clock.\n", max_warp);
148 mark_tsc_unstable("check_tsc_sync_source failed"); 148 mark_tsc_unstable("check_tsc_sync_source failed");
149 } else { 149 } else {
150 printk(" passed.\n"); 150 printk(" passed.\n");
@@ -195,5 +195,3 @@ void __cpuinit check_tsc_sync_target(void)
195 while (atomic_read(&stop_count) != cpus) 195 while (atomic_read(&stop_count) != cpus)
196 cpu_relax(); 196 cpu_relax();
197} 197}
198#undef NR_LOOPS
199
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index d7ac84e7fc1c..9c4e62539058 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -287,10 +287,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
287 info->regs.pt.ds = 0; 287 info->regs.pt.ds = 0;
288 info->regs.pt.es = 0; 288 info->regs.pt.es = 0;
289 info->regs.pt.fs = 0; 289 info->regs.pt.fs = 0;
290 290#ifndef CONFIG_X86_32_LAZY_GS
291/* we are clearing gs later just before "jmp resume_userspace", 291 info->regs.pt.gs = 0;
292 * because it is not saved/restored. 292#endif
293 */
294 293
295/* 294/*
296 * The flags register is also special: we cannot trust that the user 295 * The flags register is also special: we cannot trust that the user
@@ -318,9 +317,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
318 } 317 }
319 318
320/* 319/*
321 * Save old state, set default return value (%ax) to 0 320 * Save old state, set default return value (%ax) to 0 (VM86_SIGNAL)
322 */ 321 */
323 info->regs32->ax = 0; 322 info->regs32->ax = VM86_SIGNAL;
324 tsk->thread.saved_sp0 = tsk->thread.sp0; 323 tsk->thread.saved_sp0 = tsk->thread.sp0;
325 tsk->thread.saved_fs = info->regs32->fs; 324 tsk->thread.saved_fs = info->regs32->fs;
326 tsk->thread.saved_gs = get_user_gs(info->regs32); 325 tsk->thread.saved_gs = get_user_gs(info->regs32);
@@ -343,7 +342,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
343 __asm__ __volatile__( 342 __asm__ __volatile__(
344 "movl %0,%%esp\n\t" 343 "movl %0,%%esp\n\t"
345 "movl %1,%%ebp\n\t" 344 "movl %1,%%ebp\n\t"
345#ifdef CONFIG_X86_32_LAZY_GS
346 "mov %2, %%gs\n\t" 346 "mov %2, %%gs\n\t"
347#endif
347 "jmp resume_userspace" 348 "jmp resume_userspace"
348 : /* no outputs */ 349 : /* no outputs */
349 :"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0)); 350 :"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0));
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index 95deb9f2211e..b263423fbe2a 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -462,22 +462,28 @@ vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip,
462} 462}
463#endif 463#endif
464 464
465static void vmi_enter_lazy_cpu(void) 465static void vmi_start_context_switch(struct task_struct *prev)
466{ 466{
467 paravirt_enter_lazy_cpu(); 467 paravirt_start_context_switch(prev);
468 vmi_ops.set_lazy_mode(2); 468 vmi_ops.set_lazy_mode(2);
469} 469}
470 470
471static void vmi_end_context_switch(struct task_struct *next)
472{
473 vmi_ops.set_lazy_mode(0);
474 paravirt_end_context_switch(next);
475}
476
471static void vmi_enter_lazy_mmu(void) 477static void vmi_enter_lazy_mmu(void)
472{ 478{
473 paravirt_enter_lazy_mmu(); 479 paravirt_enter_lazy_mmu();
474 vmi_ops.set_lazy_mode(1); 480 vmi_ops.set_lazy_mode(1);
475} 481}
476 482
477static void vmi_leave_lazy(void) 483static void vmi_leave_lazy_mmu(void)
478{ 484{
479 paravirt_leave_lazy(paravirt_get_lazy_mode());
480 vmi_ops.set_lazy_mode(0); 485 vmi_ops.set_lazy_mode(0);
486 paravirt_leave_lazy_mmu();
481} 487}
482 488
483static inline int __init check_vmi_rom(struct vrom_header *rom) 489static inline int __init check_vmi_rom(struct vrom_header *rom)
@@ -711,14 +717,14 @@ static inline int __init activate_vmi(void)
711 para_fill(pv_cpu_ops.set_iopl_mask, SetIOPLMask); 717 para_fill(pv_cpu_ops.set_iopl_mask, SetIOPLMask);
712 para_fill(pv_cpu_ops.io_delay, IODelay); 718 para_fill(pv_cpu_ops.io_delay, IODelay);
713 719
714 para_wrap(pv_cpu_ops.lazy_mode.enter, vmi_enter_lazy_cpu, 720 para_wrap(pv_cpu_ops.start_context_switch, vmi_start_context_switch,
715 set_lazy_mode, SetLazyMode); 721 set_lazy_mode, SetLazyMode);
716 para_wrap(pv_cpu_ops.lazy_mode.leave, vmi_leave_lazy, 722 para_wrap(pv_cpu_ops.end_context_switch, vmi_end_context_switch,
717 set_lazy_mode, SetLazyMode); 723 set_lazy_mode, SetLazyMode);
718 724
719 para_wrap(pv_mmu_ops.lazy_mode.enter, vmi_enter_lazy_mmu, 725 para_wrap(pv_mmu_ops.lazy_mode.enter, vmi_enter_lazy_mmu,
720 set_lazy_mode, SetLazyMode); 726 set_lazy_mode, SetLazyMode);
721 para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy, 727 para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy_mmu,
722 set_lazy_mode, SetLazyMode); 728 set_lazy_mode, SetLazyMode);
723 729
724 /* user and kernel flush are just handled with different flags to FlushTLB */ 730 /* user and kernel flush are just handled with different flags to FlushTLB */
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 849ee611f013..367e87882041 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -1,5 +1,433 @@
1/*
2 * ld script for the x86 kernel
3 *
4 * Historic 32-bit version written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>
5 *
6 * Modernisation, unification and other changes and fixes:
7 * Copyright (C) 2007-2009 Sam Ravnborg <sam@ravnborg.org>
8 *
9 *
10 * Don't define absolute symbols until and unless you know that symbol
11 * value should remain constant even if kernel image is relocated
12 * at run time. Absolute symbols are not relocated. If symbol value should
13 * change if kernel is relocated, make the symbol section relative and
14 * put it inside the section definition.
15 */
16
1#ifdef CONFIG_X86_32 17#ifdef CONFIG_X86_32
2# include "vmlinux_32.lds.S" 18#define LOAD_OFFSET __PAGE_OFFSET
3#else 19#else
4# include "vmlinux_64.lds.S" 20#define LOAD_OFFSET __START_KERNEL_map
5#endif 21#endif
22
23#include <asm-generic/vmlinux.lds.h>
24#include <asm/asm-offsets.h>
25#include <asm/thread_info.h>
26#include <asm/page_types.h>
27#include <asm/cache.h>
28#include <asm/boot.h>
29
30#undef i386 /* in case the preprocessor is a 32bit one */
31
32OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT)
33
34#ifdef CONFIG_X86_32
35OUTPUT_ARCH(i386)
36ENTRY(phys_startup_32)
37jiffies = jiffies_64;
38#else
39OUTPUT_ARCH(i386:x86-64)
40ENTRY(phys_startup_64)
41jiffies_64 = jiffies;
42#endif
43
44PHDRS {
45 text PT_LOAD FLAGS(5); /* R_E */
46 data PT_LOAD FLAGS(7); /* RWE */
47#ifdef CONFIG_X86_64
48 user PT_LOAD FLAGS(7); /* RWE */
49 data.init PT_LOAD FLAGS(7); /* RWE */
50#ifdef CONFIG_SMP
51 percpu PT_LOAD FLAGS(7); /* RWE */
52#endif
53 data.init2 PT_LOAD FLAGS(7); /* RWE */
54#endif
55 note PT_NOTE FLAGS(0); /* ___ */
56}
57
58SECTIONS
59{
60#ifdef CONFIG_X86_32
61 . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR;
62 phys_startup_32 = startup_32 - LOAD_OFFSET;
63#else
64 . = __START_KERNEL;
65 phys_startup_64 = startup_64 - LOAD_OFFSET;
66#endif
67
68 /* Text and read-only data */
69
70 /* bootstrapping code */
71 .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) {
72 _text = .;
73 *(.text.head)
74 } :text = 0x9090
75
76 /* The rest of the text */
77 .text : AT(ADDR(.text) - LOAD_OFFSET) {
78#ifdef CONFIG_X86_32
79 /* not really needed, already page aligned */
80 . = ALIGN(PAGE_SIZE);
81 *(.text.page_aligned)
82#endif
83 . = ALIGN(8);
84 _stext = .;
85 TEXT_TEXT
86 SCHED_TEXT
87 LOCK_TEXT
88 KPROBES_TEXT
89 IRQENTRY_TEXT
90 *(.fixup)
91 *(.gnu.warning)
92 /* End of text section */
93 _etext = .;
94 } :text = 0x9090
95
96 NOTES :text :note
97
98 /* Exception table */
99 . = ALIGN(16);
100 __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
101 __start___ex_table = .;
102 *(__ex_table)
103 __stop___ex_table = .;
104 } :text = 0x9090
105
106 RODATA
107
108 /* Data */
109 . = ALIGN(PAGE_SIZE);
110 .data : AT(ADDR(.data) - LOAD_OFFSET) {
111 /* Start of data section */
112 _sdata = .;
113 DATA_DATA
114 CONSTRUCTORS
115
116#ifdef CONFIG_X86_64
117 /* End of data section */
118 _edata = .;
119#endif
120 } :data
121
122#ifdef CONFIG_X86_32
123 /* 32 bit has nosave before _edata */
124 . = ALIGN(PAGE_SIZE);
125 .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
126 __nosave_begin = .;
127 *(.data.nosave)
128 . = ALIGN(PAGE_SIZE);
129 __nosave_end = .;
130 }
131#endif
132
133 . = ALIGN(PAGE_SIZE);
134 .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
135 *(.data.page_aligned)
136 *(.data.idt)
137 }
138
139#ifdef CONFIG_X86_32
140 . = ALIGN(32);
141#else
142 . = ALIGN(PAGE_SIZE);
143 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
144#endif
145 .data.cacheline_aligned :
146 AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
147 *(.data.cacheline_aligned)
148 }
149
150 /* rarely changed data like cpu maps */
151#ifdef CONFIG_X86_32
152 . = ALIGN(32);
153#else
154 . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES);
155#endif
156 .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
157 *(.data.read_mostly)
158
159#ifdef CONFIG_X86_32
160 /* End of data section */
161 _edata = .;
162#endif
163 }
164
165#ifdef CONFIG_X86_64
166
167#define VSYSCALL_ADDR (-10*1024*1024)
168#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + \
169 SIZEOF(.data.read_mostly) + 4095) & ~(4095))
170#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + \
171 SIZEOF(.data.read_mostly) + 4095) & ~(4095))
172
173#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR)
174#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET)
175
176#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR)
177#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET)
178
179 . = VSYSCALL_ADDR;
180 .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) {
181 *(.vsyscall_0)
182 } :user
183
184 __vsyscall_0 = VSYSCALL_VIRT_ADDR;
185
186 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
187 .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) {
188 *(.vsyscall_fn)
189 }
190
191 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
192 .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) {
193 *(.vsyscall_gtod_data)
194 }
195
196 vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data);
197 .vsyscall_clock : AT(VLOAD(.vsyscall_clock)) {
198 *(.vsyscall_clock)
199 }
200 vsyscall_clock = VVIRT(.vsyscall_clock);
201
202
203 .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) {
204 *(.vsyscall_1)
205 }
206 .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) {
207 *(.vsyscall_2)
208 }
209
210 .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) {
211 *(.vgetcpu_mode)
212 }
213 vgetcpu_mode = VVIRT(.vgetcpu_mode);
214
215 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
216 .jiffies : AT(VLOAD(.jiffies)) {
217 *(.jiffies)
218 }
219 jiffies = VVIRT(.jiffies);
220
221 .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) {
222 *(.vsyscall_3)
223 }
224
225 . = VSYSCALL_VIRT_ADDR + PAGE_SIZE;
226
227#undef VSYSCALL_ADDR
228#undef VSYSCALL_PHYS_ADDR
229#undef VSYSCALL_VIRT_ADDR
230#undef VLOAD_OFFSET
231#undef VLOAD
232#undef VVIRT_OFFSET
233#undef VVIRT
234
235#endif /* CONFIG_X86_64 */
236
237 /* init_task */
238 . = ALIGN(THREAD_SIZE);
239 .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
240 *(.data.init_task)
241 }
242#ifdef CONFIG_X86_64
243 :data.init
244#endif
245
246 /*
247 * smp_locks might be freed after init
248 * start/end must be page aligned
249 */
250 . = ALIGN(PAGE_SIZE);
251 .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
252 __smp_locks = .;
253 *(.smp_locks)
254 __smp_locks_end = .;
255 . = ALIGN(PAGE_SIZE);
256 }
257
258 /* Init code and data - will be freed after init */
259 . = ALIGN(PAGE_SIZE);
260 .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
261 __init_begin = .; /* paired with __init_end */
262 _sinittext = .;
263 INIT_TEXT
264 _einittext = .;
265 }
266
267 .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) {
268 INIT_DATA
269 }
270
271 . = ALIGN(16);
272 .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
273 __setup_start = .;
274 *(.init.setup)
275 __setup_end = .;
276 }
277 .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
278 __initcall_start = .;
279 INITCALLS
280 __initcall_end = .;
281 }
282
283 .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
284 __con_initcall_start = .;
285 *(.con_initcall.init)
286 __con_initcall_end = .;
287 }
288
289 .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
290 __x86_cpu_dev_start = .;
291 *(.x86_cpu_dev.init)
292 __x86_cpu_dev_end = .;
293 }
294
295 SECURITY_INIT
296
297 . = ALIGN(8);
298 .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
299 __parainstructions = .;
300 *(.parainstructions)
301 __parainstructions_end = .;
302 }
303
304 . = ALIGN(8);
305 .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
306 __alt_instructions = .;
307 *(.altinstructions)
308 __alt_instructions_end = .;
309 }
310
311 .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
312 *(.altinstr_replacement)
313 }
314
315 /*
316 * .exit.text is discarded at runtime, not link time, to deal with
317 * references from .altinstructions and .eh_frame
318 */
319 .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) {
320 EXIT_TEXT
321 }
322
323 .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) {
324 EXIT_DATA
325 }
326
327#ifdef CONFIG_BLK_DEV_INITRD
328 . = ALIGN(PAGE_SIZE);
329 .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
330 __initramfs_start = .;
331 *(.init.ramfs)
332 __initramfs_end = .;
333 }
334#endif
335
336#if defined(CONFIG_X86_64) && defined(CONFIG_SMP)
337 /*
338 * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the
339 * output PHDR, so the next output section - __data_nosave - should
340 * start another segment, data.init2. Also, pda should be at the head of
341 * percpu area. Preallocate it and define the percpu offset symbol
342 * so that it can be accessed as a percpu variable.
343 */
344 . = ALIGN(PAGE_SIZE);
345 PERCPU_VADDR(0, :percpu)
346#else
347 PERCPU(PAGE_SIZE)
348#endif
349
350 . = ALIGN(PAGE_SIZE);
351
352 /* freed after init ends here */
353 .init.end : AT(ADDR(.init.end) - LOAD_OFFSET) {
354 __init_end = .;
355 }
356
357#ifdef CONFIG_X86_64
358 .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
359 . = ALIGN(PAGE_SIZE);
360 __nosave_begin = .;
361 *(.data.nosave)
362 . = ALIGN(PAGE_SIZE);
363 __nosave_end = .;
364 } :data.init2
365 /* use another section data.init2, see PERCPU_VADDR() above */
366#endif
367
368 /* BSS */
369 . = ALIGN(PAGE_SIZE);
370 .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
371 __bss_start = .;
372 *(.bss.page_aligned)
373 *(.bss)
374 . = ALIGN(4);
375 __bss_stop = .;
376 }
377
378 . = ALIGN(PAGE_SIZE);
379 .brk : AT(ADDR(.brk) - LOAD_OFFSET) {
380 __brk_base = .;
381 . += 64 * 1024; /* 64k alignment slop space */
382 *(.brk_reservation) /* areas brk users have reserved */
383 __brk_limit = .;
384 }
385
386 .end : AT(ADDR(.end) - LOAD_OFFSET) {
387 _end = .;
388 }
389
390 /* Sections to be discarded */
391 /DISCARD/ : {
392 *(.exitcall.exit)
393 *(.eh_frame)
394 *(.discard)
395 }
396
397 STABS_DEBUG
398 DWARF_DEBUG
399}
400
401
402#ifdef CONFIG_X86_32
403ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
404 "kernel image bigger than KERNEL_IMAGE_SIZE")
405#else
406/*
407 * Per-cpu symbols which need to be offset from __per_cpu_load
408 * for the boot processor.
409 */
410#define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load
411INIT_PER_CPU(gdt_page);
412INIT_PER_CPU(irq_stack_union);
413
414/*
415 * Build-time check on the image size:
416 */
417ASSERT((_end - _text <= KERNEL_IMAGE_SIZE),
418 "kernel image bigger than KERNEL_IMAGE_SIZE")
419
420#ifdef CONFIG_SMP
421ASSERT((per_cpu__irq_stack_union == 0),
422 "irq_stack_union is not at start of per-cpu area");
423#endif
424
425#endif /* CONFIG_X86_32 */
426
427#ifdef CONFIG_KEXEC
428#include <asm/kexec.h>
429
430ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
431 "kexec control code size is too big")
432#endif
433
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
deleted file mode 100644
index 62ad500d55f3..000000000000
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ /dev/null
@@ -1,229 +0,0 @@
1/* ld script to make i386 Linux kernel
2 * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>;
3 *
4 * Don't define absolute symbols until and unless you know that symbol
5 * value is should remain constant even if kernel image is relocated
6 * at run time. Absolute symbols are not relocated. If symbol value should
7 * change if kernel is relocated, make the symbol section relative and
8 * put it inside the section definition.
9 */
10
11#define LOAD_OFFSET __PAGE_OFFSET
12
13#include <asm-generic/vmlinux.lds.h>
14#include <asm/thread_info.h>
15#include <asm/page_types.h>
16#include <asm/cache.h>
17#include <asm/boot.h>
18
19OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
20OUTPUT_ARCH(i386)
21ENTRY(phys_startup_32)
22jiffies = jiffies_64;
23
24PHDRS {
25 text PT_LOAD FLAGS(5); /* R_E */
26 data PT_LOAD FLAGS(7); /* RWE */
27 note PT_NOTE FLAGS(0); /* ___ */
28}
29SECTIONS
30{
31 . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR;
32 phys_startup_32 = startup_32 - LOAD_OFFSET;
33
34 .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) {
35 _text = .; /* Text and read-only data */
36 *(.text.head)
37 } :text = 0x9090
38
39 /* read-only */
40 .text : AT(ADDR(.text) - LOAD_OFFSET) {
41 . = ALIGN(PAGE_SIZE); /* not really needed, already page aligned */
42 *(.text.page_aligned)
43 TEXT_TEXT
44 SCHED_TEXT
45 LOCK_TEXT
46 KPROBES_TEXT
47 IRQENTRY_TEXT
48 *(.fixup)
49 *(.gnu.warning)
50 _etext = .; /* End of text section */
51 } :text = 0x9090
52
53 NOTES :text :note
54
55 . = ALIGN(16); /* Exception table */
56 __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
57 __start___ex_table = .;
58 *(__ex_table)
59 __stop___ex_table = .;
60 } :text = 0x9090
61
62 RODATA
63
64 /* writeable */
65 . = ALIGN(PAGE_SIZE);
66 .data : AT(ADDR(.data) - LOAD_OFFSET) { /* Data */
67 DATA_DATA
68 CONSTRUCTORS
69 } :data
70
71 . = ALIGN(PAGE_SIZE);
72 .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
73 __nosave_begin = .;
74 *(.data.nosave)
75 . = ALIGN(PAGE_SIZE);
76 __nosave_end = .;
77 }
78
79 . = ALIGN(PAGE_SIZE);
80 .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
81 *(.data.page_aligned)
82 *(.data.idt)
83 }
84
85 . = ALIGN(32);
86 .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
87 *(.data.cacheline_aligned)
88 }
89
90 /* rarely changed data like cpu maps */
91 . = ALIGN(32);
92 .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
93 *(.data.read_mostly)
94 _edata = .; /* End of data section */
95 }
96
97 . = ALIGN(THREAD_SIZE); /* init_task */
98 .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
99 *(.data.init_task)
100 }
101
102 /* might get freed after init */
103 . = ALIGN(PAGE_SIZE);
104 .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
105 __smp_locks = .;
106 *(.smp_locks)
107 __smp_locks_end = .;
108 }
109 /* will be freed after init
110 * Following ALIGN() is required to make sure no other data falls on the
111 * same page where __smp_alt_end is pointing as that page might be freed
112 * after boot. Always make sure that ALIGN() directive is present after
113 * the section which contains __smp_alt_end.
114 */
115 . = ALIGN(PAGE_SIZE);
116
117 /* will be freed after init */
118 . = ALIGN(PAGE_SIZE); /* Init code and data */
119 .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
120 __init_begin = .;
121 _sinittext = .;
122 INIT_TEXT
123 _einittext = .;
124 }
125 .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) {
126 INIT_DATA
127 }
128 . = ALIGN(16);
129 .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
130 __setup_start = .;
131 *(.init.setup)
132 __setup_end = .;
133 }
134 .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
135 __initcall_start = .;
136 INITCALLS
137 __initcall_end = .;
138 }
139 .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
140 __con_initcall_start = .;
141 *(.con_initcall.init)
142 __con_initcall_end = .;
143 }
144 .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
145 __x86_cpu_dev_start = .;
146 *(.x86_cpu_dev.init)
147 __x86_cpu_dev_end = .;
148 }
149 SECURITY_INIT
150 . = ALIGN(4);
151 .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
152 __alt_instructions = .;
153 *(.altinstructions)
154 __alt_instructions_end = .;
155 }
156 .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
157 *(.altinstr_replacement)
158 }
159 . = ALIGN(4);
160 .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
161 __parainstructions = .;
162 *(.parainstructions)
163 __parainstructions_end = .;
164 }
165 /* .exit.text is discard at runtime, not link time, to deal with references
166 from .altinstructions and .eh_frame */
167 .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) {
168 EXIT_TEXT
169 }
170 .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) {
171 EXIT_DATA
172 }
173#if defined(CONFIG_BLK_DEV_INITRD)
174 . = ALIGN(PAGE_SIZE);
175 .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
176 __initramfs_start = .;
177 *(.init.ramfs)
178 __initramfs_end = .;
179 }
180#endif
181 PERCPU(PAGE_SIZE)
182 . = ALIGN(PAGE_SIZE);
183 /* freed after init ends here */
184
185 .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
186 __init_end = .;
187 __bss_start = .; /* BSS */
188 *(.bss.page_aligned)
189 *(.bss)
190 . = ALIGN(4);
191 __bss_stop = .;
192 }
193
194 .brk : AT(ADDR(.brk) - LOAD_OFFSET) {
195 . = ALIGN(PAGE_SIZE);
196 __brk_base = . ;
197 . += 64 * 1024 ; /* 64k alignment slop space */
198 *(.brk_reservation) /* areas brk users have reserved */
199 __brk_limit = . ;
200 }
201
202 .end : AT(ADDR(.end) - LOAD_OFFSET) {
203 _end = . ;
204 }
205
206 /* Sections to be discarded */
207 /DISCARD/ : {
208 *(.exitcall.exit)
209 *(.discard)
210 }
211
212 STABS_DEBUG
213
214 DWARF_DEBUG
215}
216
217/*
218 * Build-time check on the image size:
219 */
220ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
221 "kernel image bigger than KERNEL_IMAGE_SIZE")
222
223#ifdef CONFIG_KEXEC
224/* Link time checks */
225#include <asm/kexec.h>
226
227ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
228 "kexec control code size is too big")
229#endif
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
deleted file mode 100644
index c8742507b030..000000000000
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ /dev/null
@@ -1,298 +0,0 @@
1/* ld script to make x86-64 Linux kernel
2 * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>;
3 */
4
5#define LOAD_OFFSET __START_KERNEL_map
6
7#include <asm-generic/vmlinux.lds.h>
8#include <asm/asm-offsets.h>
9#include <asm/page_types.h>
10
11#undef i386 /* in case the preprocessor is a 32bit one */
12
13OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64")
14OUTPUT_ARCH(i386:x86-64)
15ENTRY(phys_startup_64)
16jiffies_64 = jiffies;
17PHDRS {
18 text PT_LOAD FLAGS(5); /* R_E */
19 data PT_LOAD FLAGS(7); /* RWE */
20 user PT_LOAD FLAGS(7); /* RWE */
21 data.init PT_LOAD FLAGS(7); /* RWE */
22#ifdef CONFIG_SMP
23 percpu PT_LOAD FLAGS(7); /* RWE */
24#endif
25 data.init2 PT_LOAD FLAGS(7); /* RWE */
26 note PT_NOTE FLAGS(0); /* ___ */
27}
28SECTIONS
29{
30 . = __START_KERNEL;
31 phys_startup_64 = startup_64 - LOAD_OFFSET;
32 .text : AT(ADDR(.text) - LOAD_OFFSET) {
33 _text = .; /* Text and read-only data */
34 /* First the code that has to be first for bootstrapping */
35 *(.text.head)
36 _stext = .;
37 /* Then the rest */
38 TEXT_TEXT
39 SCHED_TEXT
40 LOCK_TEXT
41 KPROBES_TEXT
42 IRQENTRY_TEXT
43 *(.fixup)
44 *(.gnu.warning)
45 _etext = .; /* End of text section */
46 } :text = 0x9090
47
48 NOTES :text :note
49
50 . = ALIGN(16); /* Exception table */
51 __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
52 __start___ex_table = .;
53 *(__ex_table)
54 __stop___ex_table = .;
55 } :text = 0x9090
56
57 RODATA
58
59 . = ALIGN(PAGE_SIZE); /* Align data segment to page size boundary */
60 /* Data */
61 .data : AT(ADDR(.data) - LOAD_OFFSET) {
62 DATA_DATA
63 CONSTRUCTORS
64 _edata = .; /* End of data section */
65 } :data
66
67
68 .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
69 . = ALIGN(PAGE_SIZE);
70 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
71 *(.data.cacheline_aligned)
72 }
73 . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES);
74 .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
75 *(.data.read_mostly)
76 }
77
78#define VSYSCALL_ADDR (-10*1024*1024)
79#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095))
80#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095))
81
82#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR)
83#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET)
84
85#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR)
86#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET)
87
88 . = VSYSCALL_ADDR;
89 .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { *(.vsyscall_0) } :user
90 __vsyscall_0 = VSYSCALL_VIRT_ADDR;
91
92 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
93 .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { *(.vsyscall_fn) }
94 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
95 .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data))
96 { *(.vsyscall_gtod_data) }
97 vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data);
98 .vsyscall_clock : AT(VLOAD(.vsyscall_clock))
99 { *(.vsyscall_clock) }
100 vsyscall_clock = VVIRT(.vsyscall_clock);
101
102
103 .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1))
104 { *(.vsyscall_1) }
105 .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2))
106 { *(.vsyscall_2) }
107
108 .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { *(.vgetcpu_mode) }
109 vgetcpu_mode = VVIRT(.vgetcpu_mode);
110
111 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
112 .jiffies : AT(VLOAD(.jiffies)) { *(.jiffies) }
113 jiffies = VVIRT(.jiffies);
114
115 .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3))
116 { *(.vsyscall_3) }
117
118 . = VSYSCALL_VIRT_ADDR + PAGE_SIZE;
119
120#undef VSYSCALL_ADDR
121#undef VSYSCALL_PHYS_ADDR
122#undef VSYSCALL_VIRT_ADDR
123#undef VLOAD_OFFSET
124#undef VLOAD
125#undef VVIRT_OFFSET
126#undef VVIRT
127
128 .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
129 . = ALIGN(THREAD_SIZE); /* init_task */
130 *(.data.init_task)
131 }:data.init
132
133 .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
134 . = ALIGN(PAGE_SIZE);
135 *(.data.page_aligned)
136 }
137
138 .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
139 /* might get freed after init */
140 . = ALIGN(PAGE_SIZE);
141 __smp_alt_begin = .;
142 __smp_locks = .;
143 *(.smp_locks)
144 __smp_locks_end = .;
145 . = ALIGN(PAGE_SIZE);
146 __smp_alt_end = .;
147 }
148
149 . = ALIGN(PAGE_SIZE); /* Init code and data */
150 __init_begin = .; /* paired with __init_end */
151 .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
152 _sinittext = .;
153 INIT_TEXT
154 _einittext = .;
155 }
156 .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) {
157 __initdata_begin = .;
158 INIT_DATA
159 __initdata_end = .;
160 }
161
162 .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
163 . = ALIGN(16);
164 __setup_start = .;
165 *(.init.setup)
166 __setup_end = .;
167 }
168 .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
169 __initcall_start = .;
170 INITCALLS
171 __initcall_end = .;
172 }
173 .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
174 __con_initcall_start = .;
175 *(.con_initcall.init)
176 __con_initcall_end = .;
177 }
178 .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
179 __x86_cpu_dev_start = .;
180 *(.x86_cpu_dev.init)
181 __x86_cpu_dev_end = .;
182 }
183 SECURITY_INIT
184
185 . = ALIGN(8);
186 .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
187 __parainstructions = .;
188 *(.parainstructions)
189 __parainstructions_end = .;
190 }
191
192 .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
193 . = ALIGN(8);
194 __alt_instructions = .;
195 *(.altinstructions)
196 __alt_instructions_end = .;
197 }
198 .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
199 *(.altinstr_replacement)
200 }
201 /* .exit.text is discard at runtime, not link time, to deal with references
202 from .altinstructions and .eh_frame */
203 .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) {
204 EXIT_TEXT
205 }
206 .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) {
207 EXIT_DATA
208 }
209
210#ifdef CONFIG_BLK_DEV_INITRD
211 . = ALIGN(PAGE_SIZE);
212 .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
213 __initramfs_start = .;
214 *(.init.ramfs)
215 __initramfs_end = .;
216 }
217#endif
218
219#ifdef CONFIG_SMP
220 /*
221 * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the
222 * output PHDR, so the next output section - __data_nosave - should
223 * start another section data.init2. Also, pda should be at the head of
224 * percpu area. Preallocate it and define the percpu offset symbol
225 * so that it can be accessed as a percpu variable.
226 */
227 . = ALIGN(PAGE_SIZE);
228 PERCPU_VADDR(0, :percpu)
229#else
230 PERCPU(PAGE_SIZE)
231#endif
232
233 . = ALIGN(PAGE_SIZE);
234 __init_end = .;
235
236 .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
237 . = ALIGN(PAGE_SIZE);
238 __nosave_begin = .;
239 *(.data.nosave)
240 . = ALIGN(PAGE_SIZE);
241 __nosave_end = .;
242 } :data.init2 /* use another section data.init2, see PERCPU_VADDR() above */
243
244 .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
245 . = ALIGN(PAGE_SIZE);
246 __bss_start = .; /* BSS */
247 *(.bss.page_aligned)
248 *(.bss)
249 __bss_stop = .;
250 }
251
252 .brk : AT(ADDR(.brk) - LOAD_OFFSET) {
253 . = ALIGN(PAGE_SIZE);
254 __brk_base = . ;
255 . += 64 * 1024 ; /* 64k alignment slop space */
256 *(.brk_reservation) /* areas brk users have reserved */
257 __brk_limit = . ;
258 }
259
260 _end = . ;
261
262 /* Sections to be discarded */
263 /DISCARD/ : {
264 *(.exitcall.exit)
265 *(.eh_frame)
266 *(.discard)
267 }
268
269 STABS_DEBUG
270
271 DWARF_DEBUG
272}
273
274 /*
275 * Per-cpu symbols which need to be offset from __per_cpu_load
276 * for the boot processor.
277 */
278#define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load
279INIT_PER_CPU(gdt_page);
280INIT_PER_CPU(irq_stack_union);
281
282/*
283 * Build-time check on the image size:
284 */
285ASSERT((_end - _text <= KERNEL_IMAGE_SIZE),
286 "kernel image bigger than KERNEL_IMAGE_SIZE")
287
288#ifdef CONFIG_SMP
289ASSERT((per_cpu__irq_stack_union == 0),
290 "irq_stack_union is not at start of per-cpu area");
291#endif
292
293#ifdef CONFIG_KEXEC
294#include <asm/kexec.h>
295
296ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
297 "kexec control code size is too big")
298#endif
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 44153afc9067..25ee06a80aad 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -132,15 +132,7 @@ static __always_inline void do_vgettimeofday(struct timeval * tv)
132 return; 132 return;
133 } 133 }
134 134
135 /*
136 * Surround the RDTSC by barriers, to make sure it's not
137 * speculated to outside the seqlock critical section and
138 * does not cause time warps:
139 */
140 rdtsc_barrier();
141 now = vread(); 135 now = vread();
142 rdtsc_barrier();
143
144 base = __vsyscall_gtod_data.clock.cycle_last; 136 base = __vsyscall_gtod_data.clock.cycle_last;
145 mask = __vsyscall_gtod_data.clock.mask; 137 mask = __vsyscall_gtod_data.clock.mask;
146 mult = __vsyscall_gtod_data.clock.mult; 138 mult = __vsyscall_gtod_data.clock.mult;