diff options
author | Ingo Molnar <mingo@elte.hu> | 2009-06-17 06:52:15 -0400 |
---|---|---|
committer | Ingo Molnar <mingo@elte.hu> | 2009-06-17 06:56:49 -0400 |
commit | eadb8a091b27a840de7450f84ecff5ef13476424 (patch) | |
tree | 58c3782d40def63baa8167f3d31e3048cb4c7660 /arch/x86/kernel | |
parent | 73874005cd8800440be4299bd095387fff4b90ac (diff) | |
parent | 65795efbd380a832ae508b04dba8f8e53f0b84d9 (diff) |
Merge branch 'linus' into tracing/hw-breakpoints
Conflicts:
arch/x86/Kconfig
arch/x86/kernel/traps.c
arch/x86/power/cpu.c
arch/x86/power/cpu_32.c
kernel/Makefile
Semantic conflict:
arch/x86/kernel/hw_breakpoint.c
Merge reason: Resolve the conflicts, move from put_cpu_no_sched() to
put_cpu() in arch/x86/kernel/hw_breakpoint.c.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'arch/x86/kernel')
107 files changed, 7329 insertions, 4518 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index cbc781829173..b67efd1cf59b 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile | |||
@@ -28,7 +28,7 @@ CFLAGS_paravirt.o := $(nostackp) | |||
28 | obj-y := process_$(BITS).o signal.o entry_$(BITS).o | 28 | obj-y := process_$(BITS).o signal.o entry_$(BITS).o |
29 | obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o | 29 | obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o |
30 | obj-y += time_$(BITS).o ioport.o ldt.o dumpstack.o | 30 | obj-y += time_$(BITS).o ioport.o ldt.o dumpstack.o |
31 | obj-y += setup.o i8259.o irqinit_$(BITS).o | 31 | obj-y += setup.o i8259.o irqinit.o |
32 | obj-$(CONFIG_X86_VISWS) += visws_quirks.o | 32 | obj-$(CONFIG_X86_VISWS) += visws_quirks.o |
33 | obj-$(CONFIG_X86_32) += probe_roms_32.o | 33 | obj-$(CONFIG_X86_32) += probe_roms_32.o |
34 | obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o | 34 | obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o |
@@ -73,7 +73,7 @@ obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o | |||
73 | obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o | 73 | obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o |
74 | obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o | 74 | obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o |
75 | obj-$(CONFIG_KPROBES) += kprobes.o | 75 | obj-$(CONFIG_KPROBES) += kprobes.o |
76 | obj-$(CONFIG_MODULES) += module_$(BITS).o | 76 | obj-$(CONFIG_MODULES) += module.o |
77 | obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o | 77 | obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o |
78 | obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o | 78 | obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o |
79 | obj-$(CONFIG_KGDB) += kgdb.o | 79 | obj-$(CONFIG_KGDB) += kgdb.o |
@@ -90,7 +90,8 @@ obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o | |||
90 | obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o | 90 | obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o |
91 | obj-$(CONFIG_KVM_GUEST) += kvm.o | 91 | obj-$(CONFIG_KVM_GUEST) += kvm.o |
92 | obj-$(CONFIG_KVM_CLOCK) += kvmclock.o | 92 | obj-$(CONFIG_KVM_CLOCK) += kvmclock.o |
93 | obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o paravirt-spinlocks.o | 93 | obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o |
94 | obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o | ||
94 | obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o | 95 | obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o |
95 | 96 | ||
96 | obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o | 97 | obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o |
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 723989d7f802..631086159c53 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c | |||
@@ -33,6 +33,7 @@ | |||
33 | #include <linux/irq.h> | 33 | #include <linux/irq.h> |
34 | #include <linux/bootmem.h> | 34 | #include <linux/bootmem.h> |
35 | #include <linux/ioport.h> | 35 | #include <linux/ioport.h> |
36 | #include <linux/pci.h> | ||
36 | 37 | ||
37 | #include <asm/pgtable.h> | 38 | #include <asm/pgtable.h> |
38 | #include <asm/io_apic.h> | 39 | #include <asm/io_apic.h> |
@@ -522,7 +523,7 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irq) | |||
522 | * success: return IRQ number (>=0) | 523 | * success: return IRQ number (>=0) |
523 | * failure: return < 0 | 524 | * failure: return < 0 |
524 | */ | 525 | */ |
525 | int acpi_register_gsi(u32 gsi, int triggering, int polarity) | 526 | int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity) |
526 | { | 527 | { |
527 | unsigned int irq; | 528 | unsigned int irq; |
528 | unsigned int plat_gsi = gsi; | 529 | unsigned int plat_gsi = gsi; |
@@ -532,14 +533,14 @@ int acpi_register_gsi(u32 gsi, int triggering, int polarity) | |||
532 | * Make sure all (legacy) PCI IRQs are set as level-triggered. | 533 | * Make sure all (legacy) PCI IRQs are set as level-triggered. |
533 | */ | 534 | */ |
534 | if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) { | 535 | if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) { |
535 | if (triggering == ACPI_LEVEL_SENSITIVE) | 536 | if (trigger == ACPI_LEVEL_SENSITIVE) |
536 | eisa_set_level_irq(gsi); | 537 | eisa_set_level_irq(gsi); |
537 | } | 538 | } |
538 | #endif | 539 | #endif |
539 | 540 | ||
540 | #ifdef CONFIG_X86_IO_APIC | 541 | #ifdef CONFIG_X86_IO_APIC |
541 | if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC) { | 542 | if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC) { |
542 | plat_gsi = mp_register_gsi(gsi, triggering, polarity); | 543 | plat_gsi = mp_register_gsi(dev, gsi, trigger, polarity); |
543 | } | 544 | } |
544 | #endif | 545 | #endif |
545 | acpi_gsi_to_irq(plat_gsi, &irq); | 546 | acpi_gsi_to_irq(plat_gsi, &irq); |
@@ -903,10 +904,8 @@ extern int es7000_plat; | |||
903 | #endif | 904 | #endif |
904 | 905 | ||
905 | static struct { | 906 | static struct { |
906 | int apic_id; | ||
907 | int gsi_base; | 907 | int gsi_base; |
908 | int gsi_end; | 908 | int gsi_end; |
909 | DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1); | ||
910 | } mp_ioapic_routing[MAX_IO_APICS]; | 909 | } mp_ioapic_routing[MAX_IO_APICS]; |
911 | 910 | ||
912 | int mp_find_ioapic(int gsi) | 911 | int mp_find_ioapic(int gsi) |
@@ -986,16 +985,12 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) | |||
986 | 985 | ||
987 | set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); | 986 | set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); |
988 | mp_ioapics[idx].apicid = uniq_ioapic_id(id); | 987 | mp_ioapics[idx].apicid = uniq_ioapic_id(id); |
989 | #ifdef CONFIG_X86_32 | ||
990 | mp_ioapics[idx].apicver = io_apic_get_version(idx); | 988 | mp_ioapics[idx].apicver = io_apic_get_version(idx); |
991 | #else | 989 | |
992 | mp_ioapics[idx].apicver = 0; | ||
993 | #endif | ||
994 | /* | 990 | /* |
995 | * Build basic GSI lookup table to facilitate gsi->io_apic lookups | 991 | * Build basic GSI lookup table to facilitate gsi->io_apic lookups |
996 | * and to prevent reprogramming of IOAPIC pins (PCI GSIs). | 992 | * and to prevent reprogramming of IOAPIC pins (PCI GSIs). |
997 | */ | 993 | */ |
998 | mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].apicid; | ||
999 | mp_ioapic_routing[idx].gsi_base = gsi_base; | 994 | mp_ioapic_routing[idx].gsi_base = gsi_base; |
1000 | mp_ioapic_routing[idx].gsi_end = gsi_base + | 995 | mp_ioapic_routing[idx].gsi_end = gsi_base + |
1001 | io_apic_get_redir_entries(idx); | 996 | io_apic_get_redir_entries(idx); |
@@ -1158,26 +1153,52 @@ void __init mp_config_acpi_legacy_irqs(void) | |||
1158 | } | 1153 | } |
1159 | } | 1154 | } |
1160 | 1155 | ||
1161 | int mp_register_gsi(u32 gsi, int triggering, int polarity) | 1156 | static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger, |
1157 | int polarity) | ||
1162 | { | 1158 | { |
1159 | #ifdef CONFIG_X86_MPPARSE | ||
1160 | struct mpc_intsrc mp_irq; | ||
1161 | struct pci_dev *pdev; | ||
1162 | unsigned char number; | ||
1163 | unsigned int devfn; | ||
1163 | int ioapic; | 1164 | int ioapic; |
1164 | int ioapic_pin; | 1165 | u8 pin; |
1165 | #ifdef CONFIG_X86_32 | ||
1166 | #define MAX_GSI_NUM 4096 | ||
1167 | #define IRQ_COMPRESSION_START 64 | ||
1168 | 1166 | ||
1169 | static int pci_irq = IRQ_COMPRESSION_START; | 1167 | if (!acpi_ioapic) |
1170 | /* | 1168 | return 0; |
1171 | * Mapping between Global System Interrupts, which | 1169 | if (!dev) |
1172 | * represent all possible interrupts, and IRQs | 1170 | return 0; |
1173 | * assigned to actual devices. | 1171 | if (dev->bus != &pci_bus_type) |
1174 | */ | 1172 | return 0; |
1175 | static int gsi_to_irq[MAX_GSI_NUM]; | 1173 | |
1176 | #else | 1174 | pdev = to_pci_dev(dev); |
1175 | number = pdev->bus->number; | ||
1176 | devfn = pdev->devfn; | ||
1177 | pin = pdev->pin; | ||
1178 | /* print the entry should happen on mptable identically */ | ||
1179 | mp_irq.type = MP_INTSRC; | ||
1180 | mp_irq.irqtype = mp_INT; | ||
1181 | mp_irq.irqflag = (trigger == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) | | ||
1182 | (polarity == ACPI_ACTIVE_HIGH ? 1 : 3); | ||
1183 | mp_irq.srcbus = number; | ||
1184 | mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3); | ||
1185 | ioapic = mp_find_ioapic(gsi); | ||
1186 | mp_irq.dstapic = mp_ioapics[ioapic].apicid; | ||
1187 | mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi); | ||
1188 | |||
1189 | save_mp_irq(&mp_irq); | ||
1190 | #endif | ||
1191 | return 0; | ||
1192 | } | ||
1193 | |||
1194 | int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity) | ||
1195 | { | ||
1196 | int ioapic; | ||
1197 | int ioapic_pin; | ||
1198 | struct io_apic_irq_attr irq_attr; | ||
1177 | 1199 | ||
1178 | if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) | 1200 | if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) |
1179 | return gsi; | 1201 | return gsi; |
1180 | #endif | ||
1181 | 1202 | ||
1182 | /* Don't set up the ACPI SCI because it's already set up */ | 1203 | /* Don't set up the ACPI SCI because it's already set up */ |
1183 | if (acpi_gbl_FADT.sci_interrupt == gsi) | 1204 | if (acpi_gbl_FADT.sci_interrupt == gsi) |
@@ -1196,93 +1217,22 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity) | |||
1196 | gsi = ioapic_renumber_irq(ioapic, gsi); | 1217 | gsi = ioapic_renumber_irq(ioapic, gsi); |
1197 | #endif | 1218 | #endif |
1198 | 1219 | ||
1199 | /* | ||
1200 | * Avoid pin reprogramming. PRTs typically include entries | ||
1201 | * with redundant pin->gsi mappings (but unique PCI devices); | ||
1202 | * we only program the IOAPIC on the first. | ||
1203 | */ | ||
1204 | if (ioapic_pin > MP_MAX_IOAPIC_PIN) { | 1220 | if (ioapic_pin > MP_MAX_IOAPIC_PIN) { |
1205 | printk(KERN_ERR "Invalid reference to IOAPIC pin " | 1221 | printk(KERN_ERR "Invalid reference to IOAPIC pin " |
1206 | "%d-%d\n", mp_ioapic_routing[ioapic].apic_id, | 1222 | "%d-%d\n", mp_ioapics[ioapic].apicid, |
1207 | ioapic_pin); | 1223 | ioapic_pin); |
1208 | return gsi; | 1224 | return gsi; |
1209 | } | 1225 | } |
1210 | if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) { | ||
1211 | pr_debug("Pin %d-%d already programmed\n", | ||
1212 | mp_ioapic_routing[ioapic].apic_id, ioapic_pin); | ||
1213 | #ifdef CONFIG_X86_32 | ||
1214 | return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]); | ||
1215 | #else | ||
1216 | return gsi; | ||
1217 | #endif | ||
1218 | } | ||
1219 | |||
1220 | set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed); | ||
1221 | #ifdef CONFIG_X86_32 | ||
1222 | /* | ||
1223 | * For GSI >= 64, use IRQ compression | ||
1224 | */ | ||
1225 | if ((gsi >= IRQ_COMPRESSION_START) | ||
1226 | && (triggering == ACPI_LEVEL_SENSITIVE)) { | ||
1227 | /* | ||
1228 | * For PCI devices assign IRQs in order, avoiding gaps | ||
1229 | * due to unused I/O APIC pins. | ||
1230 | */ | ||
1231 | int irq = gsi; | ||
1232 | if (gsi < MAX_GSI_NUM) { | ||
1233 | /* | ||
1234 | * Retain the VIA chipset work-around (gsi > 15), but | ||
1235 | * avoid a problem where the 8254 timer (IRQ0) is setup | ||
1236 | * via an override (so it's not on pin 0 of the ioapic), | ||
1237 | * and at the same time, the pin 0 interrupt is a PCI | ||
1238 | * type. The gsi > 15 test could cause these two pins | ||
1239 | * to be shared as IRQ0, and they are not shareable. | ||
1240 | * So test for this condition, and if necessary, avoid | ||
1241 | * the pin collision. | ||
1242 | */ | ||
1243 | gsi = pci_irq++; | ||
1244 | /* | ||
1245 | * Don't assign IRQ used by ACPI SCI | ||
1246 | */ | ||
1247 | if (gsi == acpi_gbl_FADT.sci_interrupt) | ||
1248 | gsi = pci_irq++; | ||
1249 | gsi_to_irq[irq] = gsi; | ||
1250 | } else { | ||
1251 | printk(KERN_ERR "GSI %u is too high\n", gsi); | ||
1252 | return gsi; | ||
1253 | } | ||
1254 | } | ||
1255 | #endif | ||
1256 | io_apic_set_pci_routing(ioapic, ioapic_pin, gsi, | ||
1257 | triggering == ACPI_EDGE_SENSITIVE ? 0 : 1, | ||
1258 | polarity == ACPI_ACTIVE_HIGH ? 0 : 1); | ||
1259 | return gsi; | ||
1260 | } | ||
1261 | 1226 | ||
1262 | int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin, | 1227 | if (enable_update_mptable) |
1263 | u32 gsi, int triggering, int polarity) | 1228 | mp_config_acpi_gsi(dev, gsi, trigger, polarity); |
1264 | { | ||
1265 | #ifdef CONFIG_X86_MPPARSE | ||
1266 | struct mpc_intsrc mp_irq; | ||
1267 | int ioapic; | ||
1268 | 1229 | ||
1269 | if (!acpi_ioapic) | 1230 | set_io_apic_irq_attr(&irq_attr, ioapic, ioapic_pin, |
1270 | return 0; | 1231 | trigger == ACPI_EDGE_SENSITIVE ? 0 : 1, |
1232 | polarity == ACPI_ACTIVE_HIGH ? 0 : 1); | ||
1233 | io_apic_set_pci_routing(dev, gsi, &irq_attr); | ||
1271 | 1234 | ||
1272 | /* print the entry should happen on mptable identically */ | 1235 | return gsi; |
1273 | mp_irq.type = MP_INTSRC; | ||
1274 | mp_irq.irqtype = mp_INT; | ||
1275 | mp_irq.irqflag = (triggering == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) | | ||
1276 | (polarity == ACPI_ACTIVE_HIGH ? 1 : 3); | ||
1277 | mp_irq.srcbus = number; | ||
1278 | mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3); | ||
1279 | ioapic = mp_find_ioapic(gsi); | ||
1280 | mp_irq.dstapic = mp_ioapic_routing[ioapic].apic_id; | ||
1281 | mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi); | ||
1282 | |||
1283 | save_mp_irq(&mp_irq); | ||
1284 | #endif | ||
1285 | return 0; | ||
1286 | } | 1236 | } |
1287 | 1237 | ||
1288 | /* | 1238 | /* |
diff --git a/arch/x86/kernel/acpi/realmode/Makefile b/arch/x86/kernel/acpi/realmode/Makefile index 1c31cc0e9def..167bc16ce0e5 100644 --- a/arch/x86/kernel/acpi/realmode/Makefile +++ b/arch/x86/kernel/acpi/realmode/Makefile | |||
@@ -9,7 +9,7 @@ | |||
9 | always := wakeup.bin | 9 | always := wakeup.bin |
10 | targets := wakeup.elf wakeup.lds | 10 | targets := wakeup.elf wakeup.lds |
11 | 11 | ||
12 | wakeup-y += wakeup.o wakemain.o video-mode.o copy.o | 12 | wakeup-y += wakeup.o wakemain.o video-mode.o copy.o bioscall.o regs.o |
13 | 13 | ||
14 | # The link order of the video-*.o modules can matter. In particular, | 14 | # The link order of the video-*.o modules can matter. In particular, |
15 | # video-vga.o *must* be listed first, followed by video-vesa.o. | 15 | # video-vga.o *must* be listed first, followed by video-vesa.o. |
diff --git a/arch/x86/kernel/acpi/realmode/bioscall.S b/arch/x86/kernel/acpi/realmode/bioscall.S new file mode 100644 index 000000000000..f51eb0bb56ce --- /dev/null +++ b/arch/x86/kernel/acpi/realmode/bioscall.S | |||
@@ -0,0 +1 @@ | |||
#include "../../../boot/bioscall.S" | |||
diff --git a/arch/x86/kernel/acpi/realmode/regs.c b/arch/x86/kernel/acpi/realmode/regs.c new file mode 100644 index 000000000000..6206033ba202 --- /dev/null +++ b/arch/x86/kernel/acpi/realmode/regs.c | |||
@@ -0,0 +1 @@ | |||
#include "../../../boot/regs.c" | |||
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 7c243a2c5115..ca93638ba430 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c | |||
@@ -104,7 +104,7 @@ int acpi_save_state_mem(void) | |||
104 | initial_gs = per_cpu_offset(smp_processor_id()); | 104 | initial_gs = per_cpu_offset(smp_processor_id()); |
105 | #endif | 105 | #endif |
106 | initial_code = (unsigned long)wakeup_long64; | 106 | initial_code = (unsigned long)wakeup_long64; |
107 | saved_magic = 0x123456789abcdef0; | 107 | saved_magic = 0x123456789abcdef0L; |
108 | #endif /* CONFIG_64BIT */ | 108 | #endif /* CONFIG_64BIT */ |
109 | 109 | ||
110 | return 0; | 110 | return 0; |
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index a97db99dad52..1c60554537c3 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c | |||
@@ -55,7 +55,16 @@ struct iommu_cmd { | |||
55 | static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, | 55 | static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, |
56 | struct unity_map_entry *e); | 56 | struct unity_map_entry *e); |
57 | static struct dma_ops_domain *find_protection_domain(u16 devid); | 57 | static struct dma_ops_domain *find_protection_domain(u16 devid); |
58 | static u64* alloc_pte(struct protection_domain *dom, | ||
59 | unsigned long address, u64 | ||
60 | **pte_page, gfp_t gfp); | ||
61 | static void dma_ops_reserve_addresses(struct dma_ops_domain *dom, | ||
62 | unsigned long start_page, | ||
63 | unsigned int pages); | ||
58 | 64 | ||
65 | #ifndef BUS_NOTIFY_UNBOUND_DRIVER | ||
66 | #define BUS_NOTIFY_UNBOUND_DRIVER 0x0005 | ||
67 | #endif | ||
59 | 68 | ||
60 | #ifdef CONFIG_AMD_IOMMU_STATS | 69 | #ifdef CONFIG_AMD_IOMMU_STATS |
61 | 70 | ||
@@ -213,7 +222,7 @@ irqreturn_t amd_iommu_int_handler(int irq, void *data) | |||
213 | { | 222 | { |
214 | struct amd_iommu *iommu; | 223 | struct amd_iommu *iommu; |
215 | 224 | ||
216 | list_for_each_entry(iommu, &amd_iommu_list, list) | 225 | for_each_iommu(iommu) |
217 | iommu_poll_events(iommu); | 226 | iommu_poll_events(iommu); |
218 | 227 | ||
219 | return IRQ_HANDLED; | 228 | return IRQ_HANDLED; |
@@ -440,7 +449,7 @@ static void iommu_flush_domain(u16 domid) | |||
440 | __iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, | 449 | __iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, |
441 | domid, 1, 1); | 450 | domid, 1, 1); |
442 | 451 | ||
443 | list_for_each_entry(iommu, &amd_iommu_list, list) { | 452 | for_each_iommu(iommu) { |
444 | spin_lock_irqsave(&iommu->lock, flags); | 453 | spin_lock_irqsave(&iommu->lock, flags); |
445 | __iommu_queue_command(iommu, &cmd); | 454 | __iommu_queue_command(iommu, &cmd); |
446 | __iommu_completion_wait(iommu); | 455 | __iommu_completion_wait(iommu); |
@@ -449,6 +458,35 @@ static void iommu_flush_domain(u16 domid) | |||
449 | } | 458 | } |
450 | } | 459 | } |
451 | 460 | ||
461 | void amd_iommu_flush_all_domains(void) | ||
462 | { | ||
463 | int i; | ||
464 | |||
465 | for (i = 1; i < MAX_DOMAIN_ID; ++i) { | ||
466 | if (!test_bit(i, amd_iommu_pd_alloc_bitmap)) | ||
467 | continue; | ||
468 | iommu_flush_domain(i); | ||
469 | } | ||
470 | } | ||
471 | |||
472 | void amd_iommu_flush_all_devices(void) | ||
473 | { | ||
474 | struct amd_iommu *iommu; | ||
475 | int i; | ||
476 | |||
477 | for (i = 0; i <= amd_iommu_last_bdf; ++i) { | ||
478 | if (amd_iommu_pd_table[i] == NULL) | ||
479 | continue; | ||
480 | |||
481 | iommu = amd_iommu_rlookup_table[i]; | ||
482 | if (!iommu) | ||
483 | continue; | ||
484 | |||
485 | iommu_queue_inv_dev_entry(iommu, i); | ||
486 | iommu_completion_wait(iommu); | ||
487 | } | ||
488 | } | ||
489 | |||
452 | /**************************************************************************** | 490 | /**************************************************************************** |
453 | * | 491 | * |
454 | * The functions below are used to create the page table mappings for | 492 | * The functions below are used to create the page table mappings for |
@@ -468,7 +506,7 @@ static int iommu_map_page(struct protection_domain *dom, | |||
468 | unsigned long phys_addr, | 506 | unsigned long phys_addr, |
469 | int prot) | 507 | int prot) |
470 | { | 508 | { |
471 | u64 __pte, *pte, *page; | 509 | u64 __pte, *pte; |
472 | 510 | ||
473 | bus_addr = PAGE_ALIGN(bus_addr); | 511 | bus_addr = PAGE_ALIGN(bus_addr); |
474 | phys_addr = PAGE_ALIGN(phys_addr); | 512 | phys_addr = PAGE_ALIGN(phys_addr); |
@@ -477,27 +515,7 @@ static int iommu_map_page(struct protection_domain *dom, | |||
477 | if (bus_addr > IOMMU_MAP_SIZE_L3 || !(prot & IOMMU_PROT_MASK)) | 515 | if (bus_addr > IOMMU_MAP_SIZE_L3 || !(prot & IOMMU_PROT_MASK)) |
478 | return -EINVAL; | 516 | return -EINVAL; |
479 | 517 | ||
480 | pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(bus_addr)]; | 518 | pte = alloc_pte(dom, bus_addr, NULL, GFP_KERNEL); |
481 | |||
482 | if (!IOMMU_PTE_PRESENT(*pte)) { | ||
483 | page = (u64 *)get_zeroed_page(GFP_KERNEL); | ||
484 | if (!page) | ||
485 | return -ENOMEM; | ||
486 | *pte = IOMMU_L2_PDE(virt_to_phys(page)); | ||
487 | } | ||
488 | |||
489 | pte = IOMMU_PTE_PAGE(*pte); | ||
490 | pte = &pte[IOMMU_PTE_L1_INDEX(bus_addr)]; | ||
491 | |||
492 | if (!IOMMU_PTE_PRESENT(*pte)) { | ||
493 | page = (u64 *)get_zeroed_page(GFP_KERNEL); | ||
494 | if (!page) | ||
495 | return -ENOMEM; | ||
496 | *pte = IOMMU_L1_PDE(virt_to_phys(page)); | ||
497 | } | ||
498 | |||
499 | pte = IOMMU_PTE_PAGE(*pte); | ||
500 | pte = &pte[IOMMU_PTE_L0_INDEX(bus_addr)]; | ||
501 | 519 | ||
502 | if (IOMMU_PTE_PRESENT(*pte)) | 520 | if (IOMMU_PTE_PRESENT(*pte)) |
503 | return -EBUSY; | 521 | return -EBUSY; |
@@ -595,7 +613,8 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, | |||
595 | * as allocated in the aperture | 613 | * as allocated in the aperture |
596 | */ | 614 | */ |
597 | if (addr < dma_dom->aperture_size) | 615 | if (addr < dma_dom->aperture_size) |
598 | __set_bit(addr >> PAGE_SHIFT, dma_dom->bitmap); | 616 | __set_bit(addr >> PAGE_SHIFT, |
617 | dma_dom->aperture[0]->bitmap); | ||
599 | } | 618 | } |
600 | 619 | ||
601 | return 0; | 620 | return 0; |
@@ -632,42 +651,191 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom, | |||
632 | ****************************************************************************/ | 651 | ****************************************************************************/ |
633 | 652 | ||
634 | /* | 653 | /* |
635 | * The address allocator core function. | 654 | * The address allocator core functions. |
636 | * | 655 | * |
637 | * called with domain->lock held | 656 | * called with domain->lock held |
638 | */ | 657 | */ |
658 | |||
659 | /* | ||
660 | * This function checks if there is a PTE for a given dma address. If | ||
661 | * there is one, it returns the pointer to it. | ||
662 | */ | ||
663 | static u64* fetch_pte(struct protection_domain *domain, | ||
664 | unsigned long address) | ||
665 | { | ||
666 | u64 *pte; | ||
667 | |||
668 | pte = &domain->pt_root[IOMMU_PTE_L2_INDEX(address)]; | ||
669 | |||
670 | if (!IOMMU_PTE_PRESENT(*pte)) | ||
671 | return NULL; | ||
672 | |||
673 | pte = IOMMU_PTE_PAGE(*pte); | ||
674 | pte = &pte[IOMMU_PTE_L1_INDEX(address)]; | ||
675 | |||
676 | if (!IOMMU_PTE_PRESENT(*pte)) | ||
677 | return NULL; | ||
678 | |||
679 | pte = IOMMU_PTE_PAGE(*pte); | ||
680 | pte = &pte[IOMMU_PTE_L0_INDEX(address)]; | ||
681 | |||
682 | return pte; | ||
683 | } | ||
684 | |||
685 | /* | ||
686 | * This function is used to add a new aperture range to an existing | ||
687 | * aperture in case of dma_ops domain allocation or address allocation | ||
688 | * failure. | ||
689 | */ | ||
690 | static int alloc_new_range(struct amd_iommu *iommu, | ||
691 | struct dma_ops_domain *dma_dom, | ||
692 | bool populate, gfp_t gfp) | ||
693 | { | ||
694 | int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT; | ||
695 | int i; | ||
696 | |||
697 | #ifdef CONFIG_IOMMU_STRESS | ||
698 | populate = false; | ||
699 | #endif | ||
700 | |||
701 | if (index >= APERTURE_MAX_RANGES) | ||
702 | return -ENOMEM; | ||
703 | |||
704 | dma_dom->aperture[index] = kzalloc(sizeof(struct aperture_range), gfp); | ||
705 | if (!dma_dom->aperture[index]) | ||
706 | return -ENOMEM; | ||
707 | |||
708 | dma_dom->aperture[index]->bitmap = (void *)get_zeroed_page(gfp); | ||
709 | if (!dma_dom->aperture[index]->bitmap) | ||
710 | goto out_free; | ||
711 | |||
712 | dma_dom->aperture[index]->offset = dma_dom->aperture_size; | ||
713 | |||
714 | if (populate) { | ||
715 | unsigned long address = dma_dom->aperture_size; | ||
716 | int i, num_ptes = APERTURE_RANGE_PAGES / 512; | ||
717 | u64 *pte, *pte_page; | ||
718 | |||
719 | for (i = 0; i < num_ptes; ++i) { | ||
720 | pte = alloc_pte(&dma_dom->domain, address, | ||
721 | &pte_page, gfp); | ||
722 | if (!pte) | ||
723 | goto out_free; | ||
724 | |||
725 | dma_dom->aperture[index]->pte_pages[i] = pte_page; | ||
726 | |||
727 | address += APERTURE_RANGE_SIZE / 64; | ||
728 | } | ||
729 | } | ||
730 | |||
731 | dma_dom->aperture_size += APERTURE_RANGE_SIZE; | ||
732 | |||
733 | /* Initialize the exclusion range if necessary */ | ||
734 | if (iommu->exclusion_start && | ||
735 | iommu->exclusion_start >= dma_dom->aperture[index]->offset && | ||
736 | iommu->exclusion_start < dma_dom->aperture_size) { | ||
737 | unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT; | ||
738 | int pages = iommu_num_pages(iommu->exclusion_start, | ||
739 | iommu->exclusion_length, | ||
740 | PAGE_SIZE); | ||
741 | dma_ops_reserve_addresses(dma_dom, startpage, pages); | ||
742 | } | ||
743 | |||
744 | /* | ||
745 | * Check for areas already mapped as present in the new aperture | ||
746 | * range and mark those pages as reserved in the allocator. Such | ||
747 | * mappings may already exist as a result of requested unity | ||
748 | * mappings for devices. | ||
749 | */ | ||
750 | for (i = dma_dom->aperture[index]->offset; | ||
751 | i < dma_dom->aperture_size; | ||
752 | i += PAGE_SIZE) { | ||
753 | u64 *pte = fetch_pte(&dma_dom->domain, i); | ||
754 | if (!pte || !IOMMU_PTE_PRESENT(*pte)) | ||
755 | continue; | ||
756 | |||
757 | dma_ops_reserve_addresses(dma_dom, i << PAGE_SHIFT, 1); | ||
758 | } | ||
759 | |||
760 | return 0; | ||
761 | |||
762 | out_free: | ||
763 | free_page((unsigned long)dma_dom->aperture[index]->bitmap); | ||
764 | |||
765 | kfree(dma_dom->aperture[index]); | ||
766 | dma_dom->aperture[index] = NULL; | ||
767 | |||
768 | return -ENOMEM; | ||
769 | } | ||
770 | |||
771 | static unsigned long dma_ops_area_alloc(struct device *dev, | ||
772 | struct dma_ops_domain *dom, | ||
773 | unsigned int pages, | ||
774 | unsigned long align_mask, | ||
775 | u64 dma_mask, | ||
776 | unsigned long start) | ||
777 | { | ||
778 | unsigned long next_bit = dom->next_address % APERTURE_RANGE_SIZE; | ||
779 | int max_index = dom->aperture_size >> APERTURE_RANGE_SHIFT; | ||
780 | int i = start >> APERTURE_RANGE_SHIFT; | ||
781 | unsigned long boundary_size; | ||
782 | unsigned long address = -1; | ||
783 | unsigned long limit; | ||
784 | |||
785 | next_bit >>= PAGE_SHIFT; | ||
786 | |||
787 | boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, | ||
788 | PAGE_SIZE) >> PAGE_SHIFT; | ||
789 | |||
790 | for (;i < max_index; ++i) { | ||
791 | unsigned long offset = dom->aperture[i]->offset >> PAGE_SHIFT; | ||
792 | |||
793 | if (dom->aperture[i]->offset >= dma_mask) | ||
794 | break; | ||
795 | |||
796 | limit = iommu_device_max_index(APERTURE_RANGE_PAGES, offset, | ||
797 | dma_mask >> PAGE_SHIFT); | ||
798 | |||
799 | address = iommu_area_alloc(dom->aperture[i]->bitmap, | ||
800 | limit, next_bit, pages, 0, | ||
801 | boundary_size, align_mask); | ||
802 | if (address != -1) { | ||
803 | address = dom->aperture[i]->offset + | ||
804 | (address << PAGE_SHIFT); | ||
805 | dom->next_address = address + (pages << PAGE_SHIFT); | ||
806 | break; | ||
807 | } | ||
808 | |||
809 | next_bit = 0; | ||
810 | } | ||
811 | |||
812 | return address; | ||
813 | } | ||
814 | |||
639 | static unsigned long dma_ops_alloc_addresses(struct device *dev, | 815 | static unsigned long dma_ops_alloc_addresses(struct device *dev, |
640 | struct dma_ops_domain *dom, | 816 | struct dma_ops_domain *dom, |
641 | unsigned int pages, | 817 | unsigned int pages, |
642 | unsigned long align_mask, | 818 | unsigned long align_mask, |
643 | u64 dma_mask) | 819 | u64 dma_mask) |
644 | { | 820 | { |
645 | unsigned long limit; | ||
646 | unsigned long address; | 821 | unsigned long address; |
647 | unsigned long boundary_size; | ||
648 | 822 | ||
649 | boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, | 823 | #ifdef CONFIG_IOMMU_STRESS |
650 | PAGE_SIZE) >> PAGE_SHIFT; | 824 | dom->next_address = 0; |
651 | limit = iommu_device_max_index(dom->aperture_size >> PAGE_SHIFT, 0, | 825 | dom->need_flush = true; |
652 | dma_mask >> PAGE_SHIFT); | 826 | #endif |
653 | 827 | ||
654 | if (dom->next_bit >= limit) { | 828 | address = dma_ops_area_alloc(dev, dom, pages, align_mask, |
655 | dom->next_bit = 0; | 829 | dma_mask, dom->next_address); |
656 | dom->need_flush = true; | ||
657 | } | ||
658 | 830 | ||
659 | address = iommu_area_alloc(dom->bitmap, limit, dom->next_bit, pages, | ||
660 | 0 , boundary_size, align_mask); | ||
661 | if (address == -1) { | 831 | if (address == -1) { |
662 | address = iommu_area_alloc(dom->bitmap, limit, 0, pages, | 832 | dom->next_address = 0; |
663 | 0, boundary_size, align_mask); | 833 | address = dma_ops_area_alloc(dev, dom, pages, align_mask, |
834 | dma_mask, 0); | ||
664 | dom->need_flush = true; | 835 | dom->need_flush = true; |
665 | } | 836 | } |
666 | 837 | ||
667 | if (likely(address != -1)) { | 838 | if (unlikely(address == -1)) |
668 | dom->next_bit = address + pages; | ||
669 | address <<= PAGE_SHIFT; | ||
670 | } else | ||
671 | address = bad_dma_address; | 839 | address = bad_dma_address; |
672 | 840 | ||
673 | WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size); | 841 | WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size); |
@@ -684,11 +852,23 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom, | |||
684 | unsigned long address, | 852 | unsigned long address, |
685 | unsigned int pages) | 853 | unsigned int pages) |
686 | { | 854 | { |
687 | address >>= PAGE_SHIFT; | 855 | unsigned i = address >> APERTURE_RANGE_SHIFT; |
688 | iommu_area_free(dom->bitmap, address, pages); | 856 | struct aperture_range *range = dom->aperture[i]; |
689 | 857 | ||
690 | if (address >= dom->next_bit) | 858 | BUG_ON(i >= APERTURE_MAX_RANGES || range == NULL); |
859 | |||
860 | #ifdef CONFIG_IOMMU_STRESS | ||
861 | if (i < 4) | ||
862 | return; | ||
863 | #endif | ||
864 | |||
865 | if (address >= dom->next_address) | ||
691 | dom->need_flush = true; | 866 | dom->need_flush = true; |
867 | |||
868 | address = (address % APERTURE_RANGE_SIZE) >> PAGE_SHIFT; | ||
869 | |||
870 | iommu_area_free(range->bitmap, address, pages); | ||
871 | |||
692 | } | 872 | } |
693 | 873 | ||
694 | /**************************************************************************** | 874 | /**************************************************************************** |
@@ -736,12 +916,16 @@ static void dma_ops_reserve_addresses(struct dma_ops_domain *dom, | |||
736 | unsigned long start_page, | 916 | unsigned long start_page, |
737 | unsigned int pages) | 917 | unsigned int pages) |
738 | { | 918 | { |
739 | unsigned int last_page = dom->aperture_size >> PAGE_SHIFT; | 919 | unsigned int i, last_page = dom->aperture_size >> PAGE_SHIFT; |
740 | 920 | ||
741 | if (start_page + pages > last_page) | 921 | if (start_page + pages > last_page) |
742 | pages = last_page - start_page; | 922 | pages = last_page - start_page; |
743 | 923 | ||
744 | iommu_area_reserve(dom->bitmap, start_page, pages); | 924 | for (i = start_page; i < start_page + pages; ++i) { |
925 | int index = i / APERTURE_RANGE_PAGES; | ||
926 | int page = i % APERTURE_RANGE_PAGES; | ||
927 | __set_bit(page, dom->aperture[index]->bitmap); | ||
928 | } | ||
745 | } | 929 | } |
746 | 930 | ||
747 | static void free_pagetable(struct protection_domain *domain) | 931 | static void free_pagetable(struct protection_domain *domain) |
@@ -780,14 +964,19 @@ static void free_pagetable(struct protection_domain *domain) | |||
780 | */ | 964 | */ |
781 | static void dma_ops_domain_free(struct dma_ops_domain *dom) | 965 | static void dma_ops_domain_free(struct dma_ops_domain *dom) |
782 | { | 966 | { |
967 | int i; | ||
968 | |||
783 | if (!dom) | 969 | if (!dom) |
784 | return; | 970 | return; |
785 | 971 | ||
786 | free_pagetable(&dom->domain); | 972 | free_pagetable(&dom->domain); |
787 | 973 | ||
788 | kfree(dom->pte_pages); | 974 | for (i = 0; i < APERTURE_MAX_RANGES; ++i) { |
789 | 975 | if (!dom->aperture[i]) | |
790 | kfree(dom->bitmap); | 976 | continue; |
977 | free_page((unsigned long)dom->aperture[i]->bitmap); | ||
978 | kfree(dom->aperture[i]); | ||
979 | } | ||
791 | 980 | ||
792 | kfree(dom); | 981 | kfree(dom); |
793 | } | 982 | } |
@@ -797,19 +986,9 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom) | |||
797 | * It also intializes the page table and the address allocator data | 986 | * It also intializes the page table and the address allocator data |
798 | * structures required for the dma_ops interface | 987 | * structures required for the dma_ops interface |
799 | */ | 988 | */ |
800 | static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, | 989 | static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu) |
801 | unsigned order) | ||
802 | { | 990 | { |
803 | struct dma_ops_domain *dma_dom; | 991 | struct dma_ops_domain *dma_dom; |
804 | unsigned i, num_pte_pages; | ||
805 | u64 *l2_pde; | ||
806 | u64 address; | ||
807 | |||
808 | /* | ||
809 | * Currently the DMA aperture must be between 32 MB and 1GB in size | ||
810 | */ | ||
811 | if ((order < 25) || (order > 30)) | ||
812 | return NULL; | ||
813 | 992 | ||
814 | dma_dom = kzalloc(sizeof(struct dma_ops_domain), GFP_KERNEL); | 993 | dma_dom = kzalloc(sizeof(struct dma_ops_domain), GFP_KERNEL); |
815 | if (!dma_dom) | 994 | if (!dma_dom) |
@@ -826,55 +1005,20 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, | |||
826 | dma_dom->domain.priv = dma_dom; | 1005 | dma_dom->domain.priv = dma_dom; |
827 | if (!dma_dom->domain.pt_root) | 1006 | if (!dma_dom->domain.pt_root) |
828 | goto free_dma_dom; | 1007 | goto free_dma_dom; |
829 | dma_dom->aperture_size = (1ULL << order); | ||
830 | dma_dom->bitmap = kzalloc(dma_dom->aperture_size / (PAGE_SIZE * 8), | ||
831 | GFP_KERNEL); | ||
832 | if (!dma_dom->bitmap) | ||
833 | goto free_dma_dom; | ||
834 | /* | ||
835 | * mark the first page as allocated so we never return 0 as | ||
836 | * a valid dma-address. So we can use 0 as error value | ||
837 | */ | ||
838 | dma_dom->bitmap[0] = 1; | ||
839 | dma_dom->next_bit = 0; | ||
840 | 1008 | ||
841 | dma_dom->need_flush = false; | 1009 | dma_dom->need_flush = false; |
842 | dma_dom->target_dev = 0xffff; | 1010 | dma_dom->target_dev = 0xffff; |
843 | 1011 | ||
844 | /* Intialize the exclusion range if necessary */ | 1012 | if (alloc_new_range(iommu, dma_dom, true, GFP_KERNEL)) |
845 | if (iommu->exclusion_start && | 1013 | goto free_dma_dom; |
846 | iommu->exclusion_start < dma_dom->aperture_size) { | ||
847 | unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT; | ||
848 | int pages = iommu_num_pages(iommu->exclusion_start, | ||
849 | iommu->exclusion_length, | ||
850 | PAGE_SIZE); | ||
851 | dma_ops_reserve_addresses(dma_dom, startpage, pages); | ||
852 | } | ||
853 | 1014 | ||
854 | /* | 1015 | /* |
855 | * At the last step, build the page tables so we don't need to | 1016 | * mark the first page as allocated so we never return 0 as |
856 | * allocate page table pages in the dma_ops mapping/unmapping | 1017 | * a valid dma-address. So we can use 0 as error value |
857 | * path. | ||
858 | */ | 1018 | */ |
859 | num_pte_pages = dma_dom->aperture_size / (PAGE_SIZE * 512); | 1019 | dma_dom->aperture[0]->bitmap[0] = 1; |
860 | dma_dom->pte_pages = kzalloc(num_pte_pages * sizeof(void *), | 1020 | dma_dom->next_address = 0; |
861 | GFP_KERNEL); | ||
862 | if (!dma_dom->pte_pages) | ||
863 | goto free_dma_dom; | ||
864 | |||
865 | l2_pde = (u64 *)get_zeroed_page(GFP_KERNEL); | ||
866 | if (l2_pde == NULL) | ||
867 | goto free_dma_dom; | ||
868 | 1021 | ||
869 | dma_dom->domain.pt_root[0] = IOMMU_L2_PDE(virt_to_phys(l2_pde)); | ||
870 | |||
871 | for (i = 0; i < num_pte_pages; ++i) { | ||
872 | dma_dom->pte_pages[i] = (u64 *)get_zeroed_page(GFP_KERNEL); | ||
873 | if (!dma_dom->pte_pages[i]) | ||
874 | goto free_dma_dom; | ||
875 | address = virt_to_phys(dma_dom->pte_pages[i]); | ||
876 | l2_pde[i] = IOMMU_L1_PDE(address); | ||
877 | } | ||
878 | 1022 | ||
879 | return dma_dom; | 1023 | return dma_dom; |
880 | 1024 | ||
@@ -983,7 +1127,6 @@ static int device_change_notifier(struct notifier_block *nb, | |||
983 | struct protection_domain *domain; | 1127 | struct protection_domain *domain; |
984 | struct dma_ops_domain *dma_domain; | 1128 | struct dma_ops_domain *dma_domain; |
985 | struct amd_iommu *iommu; | 1129 | struct amd_iommu *iommu; |
986 | int order = amd_iommu_aperture_order; | ||
987 | unsigned long flags; | 1130 | unsigned long flags; |
988 | 1131 | ||
989 | if (devid > amd_iommu_last_bdf) | 1132 | if (devid > amd_iommu_last_bdf) |
@@ -1002,17 +1145,7 @@ static int device_change_notifier(struct notifier_block *nb, | |||
1002 | "to a non-dma-ops domain\n", dev_name(dev)); | 1145 | "to a non-dma-ops domain\n", dev_name(dev)); |
1003 | 1146 | ||
1004 | switch (action) { | 1147 | switch (action) { |
1005 | case BUS_NOTIFY_BOUND_DRIVER: | 1148 | case BUS_NOTIFY_UNBOUND_DRIVER: |
1006 | if (domain) | ||
1007 | goto out; | ||
1008 | dma_domain = find_protection_domain(devid); | ||
1009 | if (!dma_domain) | ||
1010 | dma_domain = iommu->default_dom; | ||
1011 | attach_device(iommu, &dma_domain->domain, devid); | ||
1012 | printk(KERN_INFO "AMD IOMMU: Using protection domain %d for " | ||
1013 | "device %s\n", dma_domain->domain.id, dev_name(dev)); | ||
1014 | break; | ||
1015 | case BUS_NOTIFY_UNBIND_DRIVER: | ||
1016 | if (!domain) | 1149 | if (!domain) |
1017 | goto out; | 1150 | goto out; |
1018 | detach_device(domain, devid); | 1151 | detach_device(domain, devid); |
@@ -1022,7 +1155,7 @@ static int device_change_notifier(struct notifier_block *nb, | |||
1022 | dma_domain = find_protection_domain(devid); | 1155 | dma_domain = find_protection_domain(devid); |
1023 | if (dma_domain) | 1156 | if (dma_domain) |
1024 | goto out; | 1157 | goto out; |
1025 | dma_domain = dma_ops_domain_alloc(iommu, order); | 1158 | dma_domain = dma_ops_domain_alloc(iommu); |
1026 | if (!dma_domain) | 1159 | if (!dma_domain) |
1027 | goto out; | 1160 | goto out; |
1028 | dma_domain->target_dev = devid; | 1161 | dma_domain->target_dev = devid; |
@@ -1133,8 +1266,8 @@ static int get_device_resources(struct device *dev, | |||
1133 | dma_dom = (*iommu)->default_dom; | 1266 | dma_dom = (*iommu)->default_dom; |
1134 | *domain = &dma_dom->domain; | 1267 | *domain = &dma_dom->domain; |
1135 | attach_device(*iommu, *domain, *bdf); | 1268 | attach_device(*iommu, *domain, *bdf); |
1136 | printk(KERN_INFO "AMD IOMMU: Using protection domain %d for " | 1269 | DUMP_printk("Using protection domain %d for device %s\n", |
1137 | "device %s\n", (*domain)->id, dev_name(dev)); | 1270 | (*domain)->id, dev_name(dev)); |
1138 | } | 1271 | } |
1139 | 1272 | ||
1140 | if (domain_for_device(_bdf) == NULL) | 1273 | if (domain_for_device(_bdf) == NULL) |
@@ -1144,6 +1277,66 @@ static int get_device_resources(struct device *dev, | |||
1144 | } | 1277 | } |
1145 | 1278 | ||
1146 | /* | 1279 | /* |
1280 | * If the pte_page is not yet allocated this function is called | ||
1281 | */ | ||
1282 | static u64* alloc_pte(struct protection_domain *dom, | ||
1283 | unsigned long address, u64 **pte_page, gfp_t gfp) | ||
1284 | { | ||
1285 | u64 *pte, *page; | ||
1286 | |||
1287 | pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(address)]; | ||
1288 | |||
1289 | if (!IOMMU_PTE_PRESENT(*pte)) { | ||
1290 | page = (u64 *)get_zeroed_page(gfp); | ||
1291 | if (!page) | ||
1292 | return NULL; | ||
1293 | *pte = IOMMU_L2_PDE(virt_to_phys(page)); | ||
1294 | } | ||
1295 | |||
1296 | pte = IOMMU_PTE_PAGE(*pte); | ||
1297 | pte = &pte[IOMMU_PTE_L1_INDEX(address)]; | ||
1298 | |||
1299 | if (!IOMMU_PTE_PRESENT(*pte)) { | ||
1300 | page = (u64 *)get_zeroed_page(gfp); | ||
1301 | if (!page) | ||
1302 | return NULL; | ||
1303 | *pte = IOMMU_L1_PDE(virt_to_phys(page)); | ||
1304 | } | ||
1305 | |||
1306 | pte = IOMMU_PTE_PAGE(*pte); | ||
1307 | |||
1308 | if (pte_page) | ||
1309 | *pte_page = pte; | ||
1310 | |||
1311 | pte = &pte[IOMMU_PTE_L0_INDEX(address)]; | ||
1312 | |||
1313 | return pte; | ||
1314 | } | ||
1315 | |||
1316 | /* | ||
1317 | * This function fetches the PTE for a given address in the aperture | ||
1318 | */ | ||
1319 | static u64* dma_ops_get_pte(struct dma_ops_domain *dom, | ||
1320 | unsigned long address) | ||
1321 | { | ||
1322 | struct aperture_range *aperture; | ||
1323 | u64 *pte, *pte_page; | ||
1324 | |||
1325 | aperture = dom->aperture[APERTURE_RANGE_INDEX(address)]; | ||
1326 | if (!aperture) | ||
1327 | return NULL; | ||
1328 | |||
1329 | pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)]; | ||
1330 | if (!pte) { | ||
1331 | pte = alloc_pte(&dom->domain, address, &pte_page, GFP_ATOMIC); | ||
1332 | aperture->pte_pages[APERTURE_PAGE_INDEX(address)] = pte_page; | ||
1333 | } else | ||
1334 | pte += IOMMU_PTE_L0_INDEX(address); | ||
1335 | |||
1336 | return pte; | ||
1337 | } | ||
1338 | |||
1339 | /* | ||
1147 | * This is the generic map function. It maps one 4kb page at paddr to | 1340 | * This is the generic map function. It maps one 4kb page at paddr to |
1148 | * the given address in the DMA address space for the domain. | 1341 | * the given address in the DMA address space for the domain. |
1149 | */ | 1342 | */ |
@@ -1159,8 +1352,9 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu, | |||
1159 | 1352 | ||
1160 | paddr &= PAGE_MASK; | 1353 | paddr &= PAGE_MASK; |
1161 | 1354 | ||
1162 | pte = dom->pte_pages[IOMMU_PTE_L1_INDEX(address)]; | 1355 | pte = dma_ops_get_pte(dom, address); |
1163 | pte += IOMMU_PTE_L0_INDEX(address); | 1356 | if (!pte) |
1357 | return bad_dma_address; | ||
1164 | 1358 | ||
1165 | __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC; | 1359 | __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC; |
1166 | 1360 | ||
@@ -1185,14 +1379,20 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu, | |||
1185 | struct dma_ops_domain *dom, | 1379 | struct dma_ops_domain *dom, |
1186 | unsigned long address) | 1380 | unsigned long address) |
1187 | { | 1381 | { |
1382 | struct aperture_range *aperture; | ||
1188 | u64 *pte; | 1383 | u64 *pte; |
1189 | 1384 | ||
1190 | if (address >= dom->aperture_size) | 1385 | if (address >= dom->aperture_size) |
1191 | return; | 1386 | return; |
1192 | 1387 | ||
1193 | WARN_ON(address & ~PAGE_MASK || address >= dom->aperture_size); | 1388 | aperture = dom->aperture[APERTURE_RANGE_INDEX(address)]; |
1389 | if (!aperture) | ||
1390 | return; | ||
1391 | |||
1392 | pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)]; | ||
1393 | if (!pte) | ||
1394 | return; | ||
1194 | 1395 | ||
1195 | pte = dom->pte_pages[IOMMU_PTE_L1_INDEX(address)]; | ||
1196 | pte += IOMMU_PTE_L0_INDEX(address); | 1396 | pte += IOMMU_PTE_L0_INDEX(address); |
1197 | 1397 | ||
1198 | WARN_ON(!*pte); | 1398 | WARN_ON(!*pte); |
@@ -1216,7 +1416,7 @@ static dma_addr_t __map_single(struct device *dev, | |||
1216 | u64 dma_mask) | 1416 | u64 dma_mask) |
1217 | { | 1417 | { |
1218 | dma_addr_t offset = paddr & ~PAGE_MASK; | 1418 | dma_addr_t offset = paddr & ~PAGE_MASK; |
1219 | dma_addr_t address, start; | 1419 | dma_addr_t address, start, ret; |
1220 | unsigned int pages; | 1420 | unsigned int pages; |
1221 | unsigned long align_mask = 0; | 1421 | unsigned long align_mask = 0; |
1222 | int i; | 1422 | int i; |
@@ -1232,14 +1432,33 @@ static dma_addr_t __map_single(struct device *dev, | |||
1232 | if (align) | 1432 | if (align) |
1233 | align_mask = (1UL << get_order(size)) - 1; | 1433 | align_mask = (1UL << get_order(size)) - 1; |
1234 | 1434 | ||
1435 | retry: | ||
1235 | address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask, | 1436 | address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask, |
1236 | dma_mask); | 1437 | dma_mask); |
1237 | if (unlikely(address == bad_dma_address)) | 1438 | if (unlikely(address == bad_dma_address)) { |
1238 | goto out; | 1439 | /* |
1440 | * setting next_address here will let the address | ||
1441 | * allocator only scan the new allocated range in the | ||
1442 | * first run. This is a small optimization. | ||
1443 | */ | ||
1444 | dma_dom->next_address = dma_dom->aperture_size; | ||
1445 | |||
1446 | if (alloc_new_range(iommu, dma_dom, false, GFP_ATOMIC)) | ||
1447 | goto out; | ||
1448 | |||
1449 | /* | ||
1450 | * aperture was sucessfully enlarged by 128 MB, try | ||
1451 | * allocation again | ||
1452 | */ | ||
1453 | goto retry; | ||
1454 | } | ||
1239 | 1455 | ||
1240 | start = address; | 1456 | start = address; |
1241 | for (i = 0; i < pages; ++i) { | 1457 | for (i = 0; i < pages; ++i) { |
1242 | dma_ops_domain_map(iommu, dma_dom, start, paddr, dir); | 1458 | ret = dma_ops_domain_map(iommu, dma_dom, start, paddr, dir); |
1459 | if (ret == bad_dma_address) | ||
1460 | goto out_unmap; | ||
1461 | |||
1243 | paddr += PAGE_SIZE; | 1462 | paddr += PAGE_SIZE; |
1244 | start += PAGE_SIZE; | 1463 | start += PAGE_SIZE; |
1245 | } | 1464 | } |
@@ -1255,6 +1474,17 @@ static dma_addr_t __map_single(struct device *dev, | |||
1255 | 1474 | ||
1256 | out: | 1475 | out: |
1257 | return address; | 1476 | return address; |
1477 | |||
1478 | out_unmap: | ||
1479 | |||
1480 | for (--i; i >= 0; --i) { | ||
1481 | start -= PAGE_SIZE; | ||
1482 | dma_ops_domain_unmap(iommu, dma_dom, start); | ||
1483 | } | ||
1484 | |||
1485 | dma_ops_free_addresses(dma_dom, address, pages); | ||
1486 | |||
1487 | return bad_dma_address; | ||
1258 | } | 1488 | } |
1259 | 1489 | ||
1260 | /* | 1490 | /* |
@@ -1537,8 +1767,10 @@ static void *alloc_coherent(struct device *dev, size_t size, | |||
1537 | *dma_addr = __map_single(dev, iommu, domain->priv, paddr, | 1767 | *dma_addr = __map_single(dev, iommu, domain->priv, paddr, |
1538 | size, DMA_BIDIRECTIONAL, true, dma_mask); | 1768 | size, DMA_BIDIRECTIONAL, true, dma_mask); |
1539 | 1769 | ||
1540 | if (*dma_addr == bad_dma_address) | 1770 | if (*dma_addr == bad_dma_address) { |
1771 | spin_unlock_irqrestore(&domain->lock, flags); | ||
1541 | goto out_free; | 1772 | goto out_free; |
1773 | } | ||
1542 | 1774 | ||
1543 | iommu_completion_wait(iommu); | 1775 | iommu_completion_wait(iommu); |
1544 | 1776 | ||
@@ -1625,7 +1857,6 @@ static void prealloc_protection_domains(void) | |||
1625 | struct pci_dev *dev = NULL; | 1857 | struct pci_dev *dev = NULL; |
1626 | struct dma_ops_domain *dma_dom; | 1858 | struct dma_ops_domain *dma_dom; |
1627 | struct amd_iommu *iommu; | 1859 | struct amd_iommu *iommu; |
1628 | int order = amd_iommu_aperture_order; | ||
1629 | u16 devid; | 1860 | u16 devid; |
1630 | 1861 | ||
1631 | while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { | 1862 | while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { |
@@ -1638,7 +1869,7 @@ static void prealloc_protection_domains(void) | |||
1638 | iommu = amd_iommu_rlookup_table[devid]; | 1869 | iommu = amd_iommu_rlookup_table[devid]; |
1639 | if (!iommu) | 1870 | if (!iommu) |
1640 | continue; | 1871 | continue; |
1641 | dma_dom = dma_ops_domain_alloc(iommu, order); | 1872 | dma_dom = dma_ops_domain_alloc(iommu); |
1642 | if (!dma_dom) | 1873 | if (!dma_dom) |
1643 | continue; | 1874 | continue; |
1644 | init_unity_mappings_for_device(dma_dom, devid); | 1875 | init_unity_mappings_for_device(dma_dom, devid); |
@@ -1664,7 +1895,6 @@ static struct dma_map_ops amd_iommu_dma_ops = { | |||
1664 | int __init amd_iommu_init_dma_ops(void) | 1895 | int __init amd_iommu_init_dma_ops(void) |
1665 | { | 1896 | { |
1666 | struct amd_iommu *iommu; | 1897 | struct amd_iommu *iommu; |
1667 | int order = amd_iommu_aperture_order; | ||
1668 | int ret; | 1898 | int ret; |
1669 | 1899 | ||
1670 | /* | 1900 | /* |
@@ -1672,8 +1902,8 @@ int __init amd_iommu_init_dma_ops(void) | |||
1672 | * found in the system. Devices not assigned to any other | 1902 | * found in the system. Devices not assigned to any other |
1673 | * protection domain will be assigned to the default one. | 1903 | * protection domain will be assigned to the default one. |
1674 | */ | 1904 | */ |
1675 | list_for_each_entry(iommu, &amd_iommu_list, list) { | 1905 | for_each_iommu(iommu) { |
1676 | iommu->default_dom = dma_ops_domain_alloc(iommu, order); | 1906 | iommu->default_dom = dma_ops_domain_alloc(iommu); |
1677 | if (iommu->default_dom == NULL) | 1907 | if (iommu->default_dom == NULL) |
1678 | return -ENOMEM; | 1908 | return -ENOMEM; |
1679 | iommu->default_dom->domain.flags |= PD_DEFAULT_MASK; | 1909 | iommu->default_dom->domain.flags |= PD_DEFAULT_MASK; |
@@ -1710,7 +1940,7 @@ int __init amd_iommu_init_dma_ops(void) | |||
1710 | 1940 | ||
1711 | free_domains: | 1941 | free_domains: |
1712 | 1942 | ||
1713 | list_for_each_entry(iommu, &amd_iommu_list, list) { | 1943 | for_each_iommu(iommu) { |
1714 | if (iommu->default_dom) | 1944 | if (iommu->default_dom) |
1715 | dma_ops_domain_free(iommu->default_dom); | 1945 | dma_ops_domain_free(iommu->default_dom); |
1716 | } | 1946 | } |
@@ -1842,7 +2072,7 @@ static int amd_iommu_attach_device(struct iommu_domain *dom, | |||
1842 | 2072 | ||
1843 | old_domain = domain_for_device(devid); | 2073 | old_domain = domain_for_device(devid); |
1844 | if (old_domain) | 2074 | if (old_domain) |
1845 | return -EBUSY; | 2075 | detach_device(old_domain, devid); |
1846 | 2076 | ||
1847 | attach_device(iommu, domain, devid); | 2077 | attach_device(iommu, domain, devid); |
1848 | 2078 | ||
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 8c0be0902dac..238989ec077d 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c | |||
@@ -115,15 +115,21 @@ struct ivmd_header { | |||
115 | u64 range_length; | 115 | u64 range_length; |
116 | } __attribute__((packed)); | 116 | } __attribute__((packed)); |
117 | 117 | ||
118 | bool amd_iommu_dump; | ||
119 | |||
118 | static int __initdata amd_iommu_detected; | 120 | static int __initdata amd_iommu_detected; |
119 | 121 | ||
120 | u16 amd_iommu_last_bdf; /* largest PCI device id we have | 122 | u16 amd_iommu_last_bdf; /* largest PCI device id we have |
121 | to handle */ | 123 | to handle */ |
122 | LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings | 124 | LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings |
123 | we find in ACPI */ | 125 | we find in ACPI */ |
124 | unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */ | 126 | #ifdef CONFIG_IOMMU_STRESS |
127 | bool amd_iommu_isolate = false; | ||
128 | #else | ||
125 | bool amd_iommu_isolate = true; /* if true, device isolation is | 129 | bool amd_iommu_isolate = true; /* if true, device isolation is |
126 | enabled */ | 130 | enabled */ |
131 | #endif | ||
132 | |||
127 | bool amd_iommu_unmap_flush; /* if true, flush on every unmap */ | 133 | bool amd_iommu_unmap_flush; /* if true, flush on every unmap */ |
128 | 134 | ||
129 | LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the | 135 | LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the |
@@ -175,7 +181,7 @@ static inline void update_last_devid(u16 devid) | |||
175 | static inline unsigned long tbl_size(int entry_size) | 181 | static inline unsigned long tbl_size(int entry_size) |
176 | { | 182 | { |
177 | unsigned shift = PAGE_SHIFT + | 183 | unsigned shift = PAGE_SHIFT + |
178 | get_order(amd_iommu_last_bdf * entry_size); | 184 | get_order(((int)amd_iommu_last_bdf + 1) * entry_size); |
179 | 185 | ||
180 | return 1UL << shift; | 186 | return 1UL << shift; |
181 | } | 187 | } |
@@ -193,7 +199,7 @@ static inline unsigned long tbl_size(int entry_size) | |||
193 | * This function set the exclusion range in the IOMMU. DMA accesses to the | 199 | * This function set the exclusion range in the IOMMU. DMA accesses to the |
194 | * exclusion range are passed through untranslated | 200 | * exclusion range are passed through untranslated |
195 | */ | 201 | */ |
196 | static void __init iommu_set_exclusion_range(struct amd_iommu *iommu) | 202 | static void iommu_set_exclusion_range(struct amd_iommu *iommu) |
197 | { | 203 | { |
198 | u64 start = iommu->exclusion_start & PAGE_MASK; | 204 | u64 start = iommu->exclusion_start & PAGE_MASK; |
199 | u64 limit = (start + iommu->exclusion_length) & PAGE_MASK; | 205 | u64 limit = (start + iommu->exclusion_length) & PAGE_MASK; |
@@ -225,7 +231,7 @@ static void __init iommu_set_device_table(struct amd_iommu *iommu) | |||
225 | } | 231 | } |
226 | 232 | ||
227 | /* Generic functions to enable/disable certain features of the IOMMU. */ | 233 | /* Generic functions to enable/disable certain features of the IOMMU. */ |
228 | static void __init iommu_feature_enable(struct amd_iommu *iommu, u8 bit) | 234 | static void iommu_feature_enable(struct amd_iommu *iommu, u8 bit) |
229 | { | 235 | { |
230 | u32 ctrl; | 236 | u32 ctrl; |
231 | 237 | ||
@@ -244,7 +250,7 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit) | |||
244 | } | 250 | } |
245 | 251 | ||
246 | /* Function to enable the hardware */ | 252 | /* Function to enable the hardware */ |
247 | static void __init iommu_enable(struct amd_iommu *iommu) | 253 | static void iommu_enable(struct amd_iommu *iommu) |
248 | { | 254 | { |
249 | printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at %s cap 0x%hx\n", | 255 | printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at %s cap 0x%hx\n", |
250 | dev_name(&iommu->dev->dev), iommu->cap_ptr); | 256 | dev_name(&iommu->dev->dev), iommu->cap_ptr); |
@@ -252,11 +258,9 @@ static void __init iommu_enable(struct amd_iommu *iommu) | |||
252 | iommu_feature_enable(iommu, CONTROL_IOMMU_EN); | 258 | iommu_feature_enable(iommu, CONTROL_IOMMU_EN); |
253 | } | 259 | } |
254 | 260 | ||
255 | /* Function to enable IOMMU event logging and event interrupts */ | 261 | static void iommu_disable(struct amd_iommu *iommu) |
256 | static void __init iommu_enable_event_logging(struct amd_iommu *iommu) | ||
257 | { | 262 | { |
258 | iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN); | 263 | iommu_feature_disable(iommu, CONTROL_IOMMU_EN); |
259 | iommu_feature_enable(iommu, CONTROL_EVT_INT_EN); | ||
260 | } | 264 | } |
261 | 265 | ||
262 | /* | 266 | /* |
@@ -413,25 +417,36 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu) | |||
413 | { | 417 | { |
414 | u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, | 418 | u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, |
415 | get_order(CMD_BUFFER_SIZE)); | 419 | get_order(CMD_BUFFER_SIZE)); |
416 | u64 entry; | ||
417 | 420 | ||
418 | if (cmd_buf == NULL) | 421 | if (cmd_buf == NULL) |
419 | return NULL; | 422 | return NULL; |
420 | 423 | ||
421 | iommu->cmd_buf_size = CMD_BUFFER_SIZE; | 424 | iommu->cmd_buf_size = CMD_BUFFER_SIZE; |
422 | 425 | ||
423 | entry = (u64)virt_to_phys(cmd_buf); | 426 | return cmd_buf; |
427 | } | ||
428 | |||
429 | /* | ||
430 | * This function writes the command buffer address to the hardware and | ||
431 | * enables it. | ||
432 | */ | ||
433 | static void iommu_enable_command_buffer(struct amd_iommu *iommu) | ||
434 | { | ||
435 | u64 entry; | ||
436 | |||
437 | BUG_ON(iommu->cmd_buf == NULL); | ||
438 | |||
439 | entry = (u64)virt_to_phys(iommu->cmd_buf); | ||
424 | entry |= MMIO_CMD_SIZE_512; | 440 | entry |= MMIO_CMD_SIZE_512; |
441 | |||
425 | memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET, | 442 | memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET, |
426 | &entry, sizeof(entry)); | 443 | &entry, sizeof(entry)); |
427 | 444 | ||
428 | /* set head and tail to zero manually */ | 445 | /* set head and tail to zero manually */ |
429 | writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET); | 446 | writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET); |
430 | writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); | 447 | writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); |
431 | 448 | ||
432 | iommu_feature_enable(iommu, CONTROL_CMDBUF_EN); | 449 | iommu_feature_enable(iommu, CONTROL_CMDBUF_EN); |
433 | |||
434 | return cmd_buf; | ||
435 | } | 450 | } |
436 | 451 | ||
437 | static void __init free_command_buffer(struct amd_iommu *iommu) | 452 | static void __init free_command_buffer(struct amd_iommu *iommu) |
@@ -443,20 +458,27 @@ static void __init free_command_buffer(struct amd_iommu *iommu) | |||
443 | /* allocates the memory where the IOMMU will log its events to */ | 458 | /* allocates the memory where the IOMMU will log its events to */ |
444 | static u8 * __init alloc_event_buffer(struct amd_iommu *iommu) | 459 | static u8 * __init alloc_event_buffer(struct amd_iommu *iommu) |
445 | { | 460 | { |
446 | u64 entry; | ||
447 | iommu->evt_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, | 461 | iommu->evt_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, |
448 | get_order(EVT_BUFFER_SIZE)); | 462 | get_order(EVT_BUFFER_SIZE)); |
449 | 463 | ||
450 | if (iommu->evt_buf == NULL) | 464 | if (iommu->evt_buf == NULL) |
451 | return NULL; | 465 | return NULL; |
452 | 466 | ||
467 | return iommu->evt_buf; | ||
468 | } | ||
469 | |||
470 | static void iommu_enable_event_buffer(struct amd_iommu *iommu) | ||
471 | { | ||
472 | u64 entry; | ||
473 | |||
474 | BUG_ON(iommu->evt_buf == NULL); | ||
475 | |||
453 | entry = (u64)virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK; | 476 | entry = (u64)virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK; |
477 | |||
454 | memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET, | 478 | memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET, |
455 | &entry, sizeof(entry)); | 479 | &entry, sizeof(entry)); |
456 | 480 | ||
457 | iommu->evt_buf_size = EVT_BUFFER_SIZE; | 481 | iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN); |
458 | |||
459 | return iommu->evt_buf; | ||
460 | } | 482 | } |
461 | 483 | ||
462 | static void __init free_event_buffer(struct amd_iommu *iommu) | 484 | static void __init free_event_buffer(struct amd_iommu *iommu) |
@@ -596,32 +618,83 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu, | |||
596 | p += sizeof(struct ivhd_header); | 618 | p += sizeof(struct ivhd_header); |
597 | end += h->length; | 619 | end += h->length; |
598 | 620 | ||
621 | |||
599 | while (p < end) { | 622 | while (p < end) { |
600 | e = (struct ivhd_entry *)p; | 623 | e = (struct ivhd_entry *)p; |
601 | switch (e->type) { | 624 | switch (e->type) { |
602 | case IVHD_DEV_ALL: | 625 | case IVHD_DEV_ALL: |
626 | |||
627 | DUMP_printk(" DEV_ALL\t\t\t first devid: %02x:%02x.%x" | ||
628 | " last device %02x:%02x.%x flags: %02x\n", | ||
629 | PCI_BUS(iommu->first_device), | ||
630 | PCI_SLOT(iommu->first_device), | ||
631 | PCI_FUNC(iommu->first_device), | ||
632 | PCI_BUS(iommu->last_device), | ||
633 | PCI_SLOT(iommu->last_device), | ||
634 | PCI_FUNC(iommu->last_device), | ||
635 | e->flags); | ||
636 | |||
603 | for (dev_i = iommu->first_device; | 637 | for (dev_i = iommu->first_device; |
604 | dev_i <= iommu->last_device; ++dev_i) | 638 | dev_i <= iommu->last_device; ++dev_i) |
605 | set_dev_entry_from_acpi(iommu, dev_i, | 639 | set_dev_entry_from_acpi(iommu, dev_i, |
606 | e->flags, 0); | 640 | e->flags, 0); |
607 | break; | 641 | break; |
608 | case IVHD_DEV_SELECT: | 642 | case IVHD_DEV_SELECT: |
643 | |||
644 | DUMP_printk(" DEV_SELECT\t\t\t devid: %02x:%02x.%x " | ||
645 | "flags: %02x\n", | ||
646 | PCI_BUS(e->devid), | ||
647 | PCI_SLOT(e->devid), | ||
648 | PCI_FUNC(e->devid), | ||
649 | e->flags); | ||
650 | |||
609 | devid = e->devid; | 651 | devid = e->devid; |
610 | set_dev_entry_from_acpi(iommu, devid, e->flags, 0); | 652 | set_dev_entry_from_acpi(iommu, devid, e->flags, 0); |
611 | break; | 653 | break; |
612 | case IVHD_DEV_SELECT_RANGE_START: | 654 | case IVHD_DEV_SELECT_RANGE_START: |
655 | |||
656 | DUMP_printk(" DEV_SELECT_RANGE_START\t " | ||
657 | "devid: %02x:%02x.%x flags: %02x\n", | ||
658 | PCI_BUS(e->devid), | ||
659 | PCI_SLOT(e->devid), | ||
660 | PCI_FUNC(e->devid), | ||
661 | e->flags); | ||
662 | |||
613 | devid_start = e->devid; | 663 | devid_start = e->devid; |
614 | flags = e->flags; | 664 | flags = e->flags; |
615 | ext_flags = 0; | 665 | ext_flags = 0; |
616 | alias = false; | 666 | alias = false; |
617 | break; | 667 | break; |
618 | case IVHD_DEV_ALIAS: | 668 | case IVHD_DEV_ALIAS: |
669 | |||
670 | DUMP_printk(" DEV_ALIAS\t\t\t devid: %02x:%02x.%x " | ||
671 | "flags: %02x devid_to: %02x:%02x.%x\n", | ||
672 | PCI_BUS(e->devid), | ||
673 | PCI_SLOT(e->devid), | ||
674 | PCI_FUNC(e->devid), | ||
675 | e->flags, | ||
676 | PCI_BUS(e->ext >> 8), | ||
677 | PCI_SLOT(e->ext >> 8), | ||
678 | PCI_FUNC(e->ext >> 8)); | ||
679 | |||
619 | devid = e->devid; | 680 | devid = e->devid; |
620 | devid_to = e->ext >> 8; | 681 | devid_to = e->ext >> 8; |
621 | set_dev_entry_from_acpi(iommu, devid, e->flags, 0); | 682 | set_dev_entry_from_acpi(iommu, devid_to, e->flags, 0); |
622 | amd_iommu_alias_table[devid] = devid_to; | 683 | amd_iommu_alias_table[devid] = devid_to; |
623 | break; | 684 | break; |
624 | case IVHD_DEV_ALIAS_RANGE: | 685 | case IVHD_DEV_ALIAS_RANGE: |
686 | |||
687 | DUMP_printk(" DEV_ALIAS_RANGE\t\t " | ||
688 | "devid: %02x:%02x.%x flags: %02x " | ||
689 | "devid_to: %02x:%02x.%x\n", | ||
690 | PCI_BUS(e->devid), | ||
691 | PCI_SLOT(e->devid), | ||
692 | PCI_FUNC(e->devid), | ||
693 | e->flags, | ||
694 | PCI_BUS(e->ext >> 8), | ||
695 | PCI_SLOT(e->ext >> 8), | ||
696 | PCI_FUNC(e->ext >> 8)); | ||
697 | |||
625 | devid_start = e->devid; | 698 | devid_start = e->devid; |
626 | flags = e->flags; | 699 | flags = e->flags; |
627 | devid_to = e->ext >> 8; | 700 | devid_to = e->ext >> 8; |
@@ -629,17 +702,39 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu, | |||
629 | alias = true; | 702 | alias = true; |
630 | break; | 703 | break; |
631 | case IVHD_DEV_EXT_SELECT: | 704 | case IVHD_DEV_EXT_SELECT: |
705 | |||
706 | DUMP_printk(" DEV_EXT_SELECT\t\t devid: %02x:%02x.%x " | ||
707 | "flags: %02x ext: %08x\n", | ||
708 | PCI_BUS(e->devid), | ||
709 | PCI_SLOT(e->devid), | ||
710 | PCI_FUNC(e->devid), | ||
711 | e->flags, e->ext); | ||
712 | |||
632 | devid = e->devid; | 713 | devid = e->devid; |
633 | set_dev_entry_from_acpi(iommu, devid, e->flags, | 714 | set_dev_entry_from_acpi(iommu, devid, e->flags, |
634 | e->ext); | 715 | e->ext); |
635 | break; | 716 | break; |
636 | case IVHD_DEV_EXT_SELECT_RANGE: | 717 | case IVHD_DEV_EXT_SELECT_RANGE: |
718 | |||
719 | DUMP_printk(" DEV_EXT_SELECT_RANGE\t devid: " | ||
720 | "%02x:%02x.%x flags: %02x ext: %08x\n", | ||
721 | PCI_BUS(e->devid), | ||
722 | PCI_SLOT(e->devid), | ||
723 | PCI_FUNC(e->devid), | ||
724 | e->flags, e->ext); | ||
725 | |||
637 | devid_start = e->devid; | 726 | devid_start = e->devid; |
638 | flags = e->flags; | 727 | flags = e->flags; |
639 | ext_flags = e->ext; | 728 | ext_flags = e->ext; |
640 | alias = false; | 729 | alias = false; |
641 | break; | 730 | break; |
642 | case IVHD_DEV_RANGE_END: | 731 | case IVHD_DEV_RANGE_END: |
732 | |||
733 | DUMP_printk(" DEV_RANGE_END\t\t devid: %02x:%02x.%x\n", | ||
734 | PCI_BUS(e->devid), | ||
735 | PCI_SLOT(e->devid), | ||
736 | PCI_FUNC(e->devid)); | ||
737 | |||
643 | devid = e->devid; | 738 | devid = e->devid; |
644 | for (dev_i = devid_start; dev_i <= devid; ++dev_i) { | 739 | for (dev_i = devid_start; dev_i <= devid; ++dev_i) { |
645 | if (alias) | 740 | if (alias) |
@@ -679,7 +774,7 @@ static void __init free_iommu_all(void) | |||
679 | { | 774 | { |
680 | struct amd_iommu *iommu, *next; | 775 | struct amd_iommu *iommu, *next; |
681 | 776 | ||
682 | list_for_each_entry_safe(iommu, next, &amd_iommu_list, list) { | 777 | for_each_iommu_safe(iommu, next) { |
683 | list_del(&iommu->list); | 778 | list_del(&iommu->list); |
684 | free_iommu_one(iommu); | 779 | free_iommu_one(iommu); |
685 | kfree(iommu); | 780 | kfree(iommu); |
@@ -710,7 +805,6 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) | |||
710 | if (!iommu->mmio_base) | 805 | if (!iommu->mmio_base) |
711 | return -ENOMEM; | 806 | return -ENOMEM; |
712 | 807 | ||
713 | iommu_set_device_table(iommu); | ||
714 | iommu->cmd_buf = alloc_command_buffer(iommu); | 808 | iommu->cmd_buf = alloc_command_buffer(iommu); |
715 | if (!iommu->cmd_buf) | 809 | if (!iommu->cmd_buf) |
716 | return -ENOMEM; | 810 | return -ENOMEM; |
@@ -746,6 +840,15 @@ static int __init init_iommu_all(struct acpi_table_header *table) | |||
746 | h = (struct ivhd_header *)p; | 840 | h = (struct ivhd_header *)p; |
747 | switch (*p) { | 841 | switch (*p) { |
748 | case ACPI_IVHD_TYPE: | 842 | case ACPI_IVHD_TYPE: |
843 | |||
844 | DUMP_printk("IOMMU: device: %02x:%02x.%01x cap: %04x " | ||
845 | "seg: %d flags: %01x info %04x\n", | ||
846 | PCI_BUS(h->devid), PCI_SLOT(h->devid), | ||
847 | PCI_FUNC(h->devid), h->cap_ptr, | ||
848 | h->pci_seg, h->flags, h->info); | ||
849 | DUMP_printk(" mmio-addr: %016llx\n", | ||
850 | h->mmio_phys); | ||
851 | |||
749 | iommu = kzalloc(sizeof(struct amd_iommu), GFP_KERNEL); | 852 | iommu = kzalloc(sizeof(struct amd_iommu), GFP_KERNEL); |
750 | if (iommu == NULL) | 853 | if (iommu == NULL) |
751 | return -ENOMEM; | 854 | return -ENOMEM; |
@@ -773,56 +876,9 @@ static int __init init_iommu_all(struct acpi_table_header *table) | |||
773 | * | 876 | * |
774 | ****************************************************************************/ | 877 | ****************************************************************************/ |
775 | 878 | ||
776 | static int __init iommu_setup_msix(struct amd_iommu *iommu) | ||
777 | { | ||
778 | struct amd_iommu *curr; | ||
779 | struct msix_entry entries[32]; /* only 32 supported by AMD IOMMU */ | ||
780 | int nvec = 0, i; | ||
781 | |||
782 | list_for_each_entry(curr, &amd_iommu_list, list) { | ||
783 | if (curr->dev == iommu->dev) { | ||
784 | entries[nvec].entry = curr->evt_msi_num; | ||
785 | entries[nvec].vector = 0; | ||
786 | curr->int_enabled = true; | ||
787 | nvec++; | ||
788 | } | ||
789 | } | ||
790 | |||
791 | if (pci_enable_msix(iommu->dev, entries, nvec)) { | ||
792 | pci_disable_msix(iommu->dev); | ||
793 | return 1; | ||
794 | } | ||
795 | |||
796 | for (i = 0; i < nvec; ++i) { | ||
797 | int r = request_irq(entries->vector, amd_iommu_int_handler, | ||
798 | IRQF_SAMPLE_RANDOM, | ||
799 | "AMD IOMMU", | ||
800 | NULL); | ||
801 | if (r) | ||
802 | goto out_free; | ||
803 | } | ||
804 | |||
805 | return 0; | ||
806 | |||
807 | out_free: | ||
808 | for (i -= 1; i >= 0; --i) | ||
809 | free_irq(entries->vector, NULL); | ||
810 | |||
811 | pci_disable_msix(iommu->dev); | ||
812 | |||
813 | return 1; | ||
814 | } | ||
815 | |||
816 | static int __init iommu_setup_msi(struct amd_iommu *iommu) | 879 | static int __init iommu_setup_msi(struct amd_iommu *iommu) |
817 | { | 880 | { |
818 | int r; | 881 | int r; |
819 | struct amd_iommu *curr; | ||
820 | |||
821 | list_for_each_entry(curr, &amd_iommu_list, list) { | ||
822 | if (curr->dev == iommu->dev) | ||
823 | curr->int_enabled = true; | ||
824 | } | ||
825 | |||
826 | 882 | ||
827 | if (pci_enable_msi(iommu->dev)) | 883 | if (pci_enable_msi(iommu->dev)) |
828 | return 1; | 884 | return 1; |
@@ -837,17 +893,18 @@ static int __init iommu_setup_msi(struct amd_iommu *iommu) | |||
837 | return 1; | 893 | return 1; |
838 | } | 894 | } |
839 | 895 | ||
896 | iommu->int_enabled = true; | ||
897 | iommu_feature_enable(iommu, CONTROL_EVT_INT_EN); | ||
898 | |||
840 | return 0; | 899 | return 0; |
841 | } | 900 | } |
842 | 901 | ||
843 | static int __init iommu_init_msi(struct amd_iommu *iommu) | 902 | static int iommu_init_msi(struct amd_iommu *iommu) |
844 | { | 903 | { |
845 | if (iommu->int_enabled) | 904 | if (iommu->int_enabled) |
846 | return 0; | 905 | return 0; |
847 | 906 | ||
848 | if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSIX)) | 907 | if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI)) |
849 | return iommu_setup_msix(iommu); | ||
850 | else if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI)) | ||
851 | return iommu_setup_msi(iommu); | 908 | return iommu_setup_msi(iommu); |
852 | 909 | ||
853 | return 1; | 910 | return 1; |
@@ -899,6 +956,7 @@ static int __init init_exclusion_range(struct ivmd_header *m) | |||
899 | static int __init init_unity_map_range(struct ivmd_header *m) | 956 | static int __init init_unity_map_range(struct ivmd_header *m) |
900 | { | 957 | { |
901 | struct unity_map_entry *e = 0; | 958 | struct unity_map_entry *e = 0; |
959 | char *s; | ||
902 | 960 | ||
903 | e = kzalloc(sizeof(*e), GFP_KERNEL); | 961 | e = kzalloc(sizeof(*e), GFP_KERNEL); |
904 | if (e == NULL) | 962 | if (e == NULL) |
@@ -906,14 +964,19 @@ static int __init init_unity_map_range(struct ivmd_header *m) | |||
906 | 964 | ||
907 | switch (m->type) { | 965 | switch (m->type) { |
908 | default: | 966 | default: |
967 | kfree(e); | ||
968 | return 0; | ||
909 | case ACPI_IVMD_TYPE: | 969 | case ACPI_IVMD_TYPE: |
970 | s = "IVMD_TYPEi\t\t\t"; | ||
910 | e->devid_start = e->devid_end = m->devid; | 971 | e->devid_start = e->devid_end = m->devid; |
911 | break; | 972 | break; |
912 | case ACPI_IVMD_TYPE_ALL: | 973 | case ACPI_IVMD_TYPE_ALL: |
974 | s = "IVMD_TYPE_ALL\t\t"; | ||
913 | e->devid_start = 0; | 975 | e->devid_start = 0; |
914 | e->devid_end = amd_iommu_last_bdf; | 976 | e->devid_end = amd_iommu_last_bdf; |
915 | break; | 977 | break; |
916 | case ACPI_IVMD_TYPE_RANGE: | 978 | case ACPI_IVMD_TYPE_RANGE: |
979 | s = "IVMD_TYPE_RANGE\t\t"; | ||
917 | e->devid_start = m->devid; | 980 | e->devid_start = m->devid; |
918 | e->devid_end = m->aux; | 981 | e->devid_end = m->aux; |
919 | break; | 982 | break; |
@@ -922,6 +985,13 @@ static int __init init_unity_map_range(struct ivmd_header *m) | |||
922 | e->address_end = e->address_start + PAGE_ALIGN(m->range_length); | 985 | e->address_end = e->address_start + PAGE_ALIGN(m->range_length); |
923 | e->prot = m->flags >> 1; | 986 | e->prot = m->flags >> 1; |
924 | 987 | ||
988 | DUMP_printk("%s devid_start: %02x:%02x.%x devid_end: %02x:%02x.%x" | ||
989 | " range_start: %016llx range_end: %016llx flags: %x\n", s, | ||
990 | PCI_BUS(e->devid_start), PCI_SLOT(e->devid_start), | ||
991 | PCI_FUNC(e->devid_start), PCI_BUS(e->devid_end), | ||
992 | PCI_SLOT(e->devid_end), PCI_FUNC(e->devid_end), | ||
993 | e->address_start, e->address_end, m->flags); | ||
994 | |||
925 | list_add_tail(&e->list, &amd_iommu_unity_map); | 995 | list_add_tail(&e->list, &amd_iommu_unity_map); |
926 | 996 | ||
927 | return 0; | 997 | return 0; |
@@ -967,18 +1037,28 @@ static void init_device_table(void) | |||
967 | * This function finally enables all IOMMUs found in the system after | 1037 | * This function finally enables all IOMMUs found in the system after |
968 | * they have been initialized | 1038 | * they have been initialized |
969 | */ | 1039 | */ |
970 | static void __init enable_iommus(void) | 1040 | static void enable_iommus(void) |
971 | { | 1041 | { |
972 | struct amd_iommu *iommu; | 1042 | struct amd_iommu *iommu; |
973 | 1043 | ||
974 | list_for_each_entry(iommu, &amd_iommu_list, list) { | 1044 | for_each_iommu(iommu) { |
1045 | iommu_set_device_table(iommu); | ||
1046 | iommu_enable_command_buffer(iommu); | ||
1047 | iommu_enable_event_buffer(iommu); | ||
975 | iommu_set_exclusion_range(iommu); | 1048 | iommu_set_exclusion_range(iommu); |
976 | iommu_init_msi(iommu); | 1049 | iommu_init_msi(iommu); |
977 | iommu_enable_event_logging(iommu); | ||
978 | iommu_enable(iommu); | 1050 | iommu_enable(iommu); |
979 | } | 1051 | } |
980 | } | 1052 | } |
981 | 1053 | ||
1054 | static void disable_iommus(void) | ||
1055 | { | ||
1056 | struct amd_iommu *iommu; | ||
1057 | |||
1058 | for_each_iommu(iommu) | ||
1059 | iommu_disable(iommu); | ||
1060 | } | ||
1061 | |||
982 | /* | 1062 | /* |
983 | * Suspend/Resume support | 1063 | * Suspend/Resume support |
984 | * disable suspend until real resume implemented | 1064 | * disable suspend until real resume implemented |
@@ -986,12 +1066,31 @@ static void __init enable_iommus(void) | |||
986 | 1066 | ||
987 | static int amd_iommu_resume(struct sys_device *dev) | 1067 | static int amd_iommu_resume(struct sys_device *dev) |
988 | { | 1068 | { |
1069 | /* | ||
1070 | * Disable IOMMUs before reprogramming the hardware registers. | ||
1071 | * IOMMU is still enabled from the resume kernel. | ||
1072 | */ | ||
1073 | disable_iommus(); | ||
1074 | |||
1075 | /* re-load the hardware */ | ||
1076 | enable_iommus(); | ||
1077 | |||
1078 | /* | ||
1079 | * we have to flush after the IOMMUs are enabled because a | ||
1080 | * disabled IOMMU will never execute the commands we send | ||
1081 | */ | ||
1082 | amd_iommu_flush_all_domains(); | ||
1083 | amd_iommu_flush_all_devices(); | ||
1084 | |||
989 | return 0; | 1085 | return 0; |
990 | } | 1086 | } |
991 | 1087 | ||
992 | static int amd_iommu_suspend(struct sys_device *dev, pm_message_t state) | 1088 | static int amd_iommu_suspend(struct sys_device *dev, pm_message_t state) |
993 | { | 1089 | { |
994 | return -EINVAL; | 1090 | /* disable IOMMUs to go out of the way for BIOS */ |
1091 | disable_iommus(); | ||
1092 | |||
1093 | return 0; | ||
995 | } | 1094 | } |
996 | 1095 | ||
997 | static struct sysdev_class amd_iommu_sysdev_class = { | 1096 | static struct sysdev_class amd_iommu_sysdev_class = { |
@@ -1137,9 +1236,6 @@ int __init amd_iommu_init(void) | |||
1137 | 1236 | ||
1138 | enable_iommus(); | 1237 | enable_iommus(); |
1139 | 1238 | ||
1140 | printk(KERN_INFO "AMD IOMMU: aperture size is %d MB\n", | ||
1141 | (1 << (amd_iommu_aperture_order-20))); | ||
1142 | |||
1143 | printk(KERN_INFO "AMD IOMMU: device isolation "); | 1239 | printk(KERN_INFO "AMD IOMMU: device isolation "); |
1144 | if (amd_iommu_isolate) | 1240 | if (amd_iommu_isolate) |
1145 | printk("enabled\n"); | 1241 | printk("enabled\n"); |
@@ -1211,6 +1307,13 @@ void __init amd_iommu_detect(void) | |||
1211 | * | 1307 | * |
1212 | ****************************************************************************/ | 1308 | ****************************************************************************/ |
1213 | 1309 | ||
1310 | static int __init parse_amd_iommu_dump(char *str) | ||
1311 | { | ||
1312 | amd_iommu_dump = true; | ||
1313 | |||
1314 | return 1; | ||
1315 | } | ||
1316 | |||
1214 | static int __init parse_amd_iommu_options(char *str) | 1317 | static int __init parse_amd_iommu_options(char *str) |
1215 | { | 1318 | { |
1216 | for (; *str; ++str) { | 1319 | for (; *str; ++str) { |
@@ -1225,15 +1328,5 @@ static int __init parse_amd_iommu_options(char *str) | |||
1225 | return 1; | 1328 | return 1; |
1226 | } | 1329 | } |
1227 | 1330 | ||
1228 | static int __init parse_amd_iommu_size_options(char *str) | 1331 | __setup("amd_iommu_dump", parse_amd_iommu_dump); |
1229 | { | ||
1230 | unsigned order = PAGE_SHIFT + get_order(memparse(str, &str)); | ||
1231 | |||
1232 | if ((order > 24) && (order < 31)) | ||
1233 | amd_iommu_aperture_order = order; | ||
1234 | |||
1235 | return 1; | ||
1236 | } | ||
1237 | |||
1238 | __setup("amd_iommu=", parse_amd_iommu_options); | 1332 | __setup("amd_iommu=", parse_amd_iommu_options); |
1239 | __setup("amd_iommu_size=", parse_amd_iommu_size_options); | ||
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index f2870920f246..8c7c042ecad1 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c | |||
@@ -14,6 +14,7 @@ | |||
14 | * Mikael Pettersson : PM converted to driver model. | 14 | * Mikael Pettersson : PM converted to driver model. |
15 | */ | 15 | */ |
16 | 16 | ||
17 | #include <linux/perf_counter.h> | ||
17 | #include <linux/kernel_stat.h> | 18 | #include <linux/kernel_stat.h> |
18 | #include <linux/mc146818rtc.h> | 19 | #include <linux/mc146818rtc.h> |
19 | #include <linux/acpi_pmtmr.h> | 20 | #include <linux/acpi_pmtmr.h> |
@@ -34,6 +35,7 @@ | |||
34 | #include <linux/smp.h> | 35 | #include <linux/smp.h> |
35 | #include <linux/mm.h> | 36 | #include <linux/mm.h> |
36 | 37 | ||
38 | #include <asm/perf_counter.h> | ||
37 | #include <asm/pgalloc.h> | 39 | #include <asm/pgalloc.h> |
38 | #include <asm/atomic.h> | 40 | #include <asm/atomic.h> |
39 | #include <asm/mpspec.h> | 41 | #include <asm/mpspec.h> |
@@ -98,6 +100,29 @@ early_param("lapic", parse_lapic); | |||
98 | /* Local APIC was disabled by the BIOS and enabled by the kernel */ | 100 | /* Local APIC was disabled by the BIOS and enabled by the kernel */ |
99 | static int enabled_via_apicbase; | 101 | static int enabled_via_apicbase; |
100 | 102 | ||
103 | /* | ||
104 | * Handle interrupt mode configuration register (IMCR). | ||
105 | * This register controls whether the interrupt signals | ||
106 | * that reach the BSP come from the master PIC or from the | ||
107 | * local APIC. Before entering Symmetric I/O Mode, either | ||
108 | * the BIOS or the operating system must switch out of | ||
109 | * PIC Mode by changing the IMCR. | ||
110 | */ | ||
111 | static inline void imcr_pic_to_apic(void) | ||
112 | { | ||
113 | /* select IMCR register */ | ||
114 | outb(0x70, 0x22); | ||
115 | /* NMI and 8259 INTR go through APIC */ | ||
116 | outb(0x01, 0x23); | ||
117 | } | ||
118 | |||
119 | static inline void imcr_apic_to_pic(void) | ||
120 | { | ||
121 | /* select IMCR register */ | ||
122 | outb(0x70, 0x22); | ||
123 | /* NMI and 8259 INTR go directly to BSP */ | ||
124 | outb(0x00, 0x23); | ||
125 | } | ||
101 | #endif | 126 | #endif |
102 | 127 | ||
103 | #ifdef CONFIG_X86_64 | 128 | #ifdef CONFIG_X86_64 |
@@ -111,13 +136,19 @@ static __init int setup_apicpmtimer(char *s) | |||
111 | __setup("apicpmtimer", setup_apicpmtimer); | 136 | __setup("apicpmtimer", setup_apicpmtimer); |
112 | #endif | 137 | #endif |
113 | 138 | ||
139 | int x2apic_mode; | ||
114 | #ifdef CONFIG_X86_X2APIC | 140 | #ifdef CONFIG_X86_X2APIC |
115 | int x2apic; | ||
116 | /* x2apic enabled before OS handover */ | 141 | /* x2apic enabled before OS handover */ |
117 | static int x2apic_preenabled; | 142 | static int x2apic_preenabled; |
118 | static int disable_x2apic; | 143 | static int disable_x2apic; |
119 | static __init int setup_nox2apic(char *str) | 144 | static __init int setup_nox2apic(char *str) |
120 | { | 145 | { |
146 | if (x2apic_enabled()) { | ||
147 | pr_warning("Bios already enabled x2apic, " | ||
148 | "can't enforce nox2apic"); | ||
149 | return 0; | ||
150 | } | ||
151 | |||
121 | disable_x2apic = 1; | 152 | disable_x2apic = 1; |
122 | setup_clear_cpu_cap(X86_FEATURE_X2APIC); | 153 | setup_clear_cpu_cap(X86_FEATURE_X2APIC); |
123 | return 0; | 154 | return 0; |
@@ -209,6 +240,31 @@ static int modern_apic(void) | |||
209 | return lapic_get_version() >= 0x14; | 240 | return lapic_get_version() >= 0x14; |
210 | } | 241 | } |
211 | 242 | ||
243 | /* | ||
244 | * bare function to substitute write operation | ||
245 | * and it's _that_ fast :) | ||
246 | */ | ||
247 | static void native_apic_write_dummy(u32 reg, u32 v) | ||
248 | { | ||
249 | WARN_ON_ONCE((cpu_has_apic || !disable_apic)); | ||
250 | } | ||
251 | |||
252 | static u32 native_apic_read_dummy(u32 reg) | ||
253 | { | ||
254 | WARN_ON_ONCE((cpu_has_apic && !disable_apic)); | ||
255 | return 0; | ||
256 | } | ||
257 | |||
258 | /* | ||
259 | * right after this call apic->write/read doesn't do anything | ||
260 | * note that there is no restore operation it works one way | ||
261 | */ | ||
262 | void apic_disable(void) | ||
263 | { | ||
264 | apic->read = native_apic_read_dummy; | ||
265 | apic->write = native_apic_write_dummy; | ||
266 | } | ||
267 | |||
212 | void native_apic_wait_icr_idle(void) | 268 | void native_apic_wait_icr_idle(void) |
213 | { | 269 | { |
214 | while (apic_read(APIC_ICR) & APIC_ICR_BUSY) | 270 | while (apic_read(APIC_ICR) & APIC_ICR_BUSY) |
@@ -348,7 +404,7 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) | |||
348 | 404 | ||
349 | static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask) | 405 | static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask) |
350 | { | 406 | { |
351 | unsigned long reg = (lvt_off << 4) + APIC_EILVT0; | 407 | unsigned long reg = (lvt_off << 4) + APIC_EILVTn(0); |
352 | unsigned int v = (mask << 16) | (msg_type << 8) | vector; | 408 | unsigned int v = (mask << 16) | (msg_type << 8) | vector; |
353 | 409 | ||
354 | apic_write(reg, v); | 410 | apic_write(reg, v); |
@@ -815,7 +871,7 @@ void clear_local_APIC(void) | |||
815 | u32 v; | 871 | u32 v; |
816 | 872 | ||
817 | /* APIC hasn't been mapped yet */ | 873 | /* APIC hasn't been mapped yet */ |
818 | if (!x2apic && !apic_phys) | 874 | if (!x2apic_mode && !apic_phys) |
819 | return; | 875 | return; |
820 | 876 | ||
821 | maxlvt = lapic_get_maxlvt(); | 877 | maxlvt = lapic_get_maxlvt(); |
@@ -843,7 +899,7 @@ void clear_local_APIC(void) | |||
843 | } | 899 | } |
844 | 900 | ||
845 | /* lets not touch this if we didn't frob it */ | 901 | /* lets not touch this if we didn't frob it */ |
846 | #if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL) | 902 | #ifdef CONFIG_X86_THERMAL_VECTOR |
847 | if (maxlvt >= 5) { | 903 | if (maxlvt >= 5) { |
848 | v = apic_read(APIC_LVTTHMR); | 904 | v = apic_read(APIC_LVTTHMR); |
849 | apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED); | 905 | apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED); |
@@ -1133,6 +1189,7 @@ void __cpuinit setup_local_APIC(void) | |||
1133 | apic_write(APIC_ESR, 0); | 1189 | apic_write(APIC_ESR, 0); |
1134 | } | 1190 | } |
1135 | #endif | 1191 | #endif |
1192 | perf_counters_lapic_init(); | ||
1136 | 1193 | ||
1137 | preempt_disable(); | 1194 | preempt_disable(); |
1138 | 1195 | ||
@@ -1287,7 +1344,7 @@ void check_x2apic(void) | |||
1287 | { | 1344 | { |
1288 | if (x2apic_enabled()) { | 1345 | if (x2apic_enabled()) { |
1289 | pr_info("x2apic enabled by BIOS, switching to x2apic ops\n"); | 1346 | pr_info("x2apic enabled by BIOS, switching to x2apic ops\n"); |
1290 | x2apic_preenabled = x2apic = 1; | 1347 | x2apic_preenabled = x2apic_mode = 1; |
1291 | } | 1348 | } |
1292 | } | 1349 | } |
1293 | 1350 | ||
@@ -1295,7 +1352,7 @@ void enable_x2apic(void) | |||
1295 | { | 1352 | { |
1296 | int msr, msr2; | 1353 | int msr, msr2; |
1297 | 1354 | ||
1298 | if (!x2apic) | 1355 | if (!x2apic_mode) |
1299 | return; | 1356 | return; |
1300 | 1357 | ||
1301 | rdmsr(MSR_IA32_APICBASE, msr, msr2); | 1358 | rdmsr(MSR_IA32_APICBASE, msr, msr2); |
@@ -1304,6 +1361,7 @@ void enable_x2apic(void) | |||
1304 | wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0); | 1361 | wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0); |
1305 | } | 1362 | } |
1306 | } | 1363 | } |
1364 | #endif /* CONFIG_X86_X2APIC */ | ||
1307 | 1365 | ||
1308 | void __init enable_IR_x2apic(void) | 1366 | void __init enable_IR_x2apic(void) |
1309 | { | 1367 | { |
@@ -1312,32 +1370,21 @@ void __init enable_IR_x2apic(void) | |||
1312 | unsigned long flags; | 1370 | unsigned long flags; |
1313 | struct IO_APIC_route_entry **ioapic_entries = NULL; | 1371 | struct IO_APIC_route_entry **ioapic_entries = NULL; |
1314 | 1372 | ||
1315 | if (!cpu_has_x2apic) | 1373 | ret = dmar_table_init(); |
1316 | return; | 1374 | if (ret) { |
1317 | 1375 | pr_debug("dmar_table_init() failed with %d:\n", ret); | |
1318 | if (!x2apic_preenabled && disable_x2apic) { | 1376 | goto ir_failed; |
1319 | pr_info("Skipped enabling x2apic and Interrupt-remapping " | ||
1320 | "because of nox2apic\n"); | ||
1321 | return; | ||
1322 | } | 1377 | } |
1323 | 1378 | ||
1324 | if (x2apic_preenabled && disable_x2apic) | 1379 | if (!intr_remapping_supported()) { |
1325 | panic("Bios already enabled x2apic, can't enforce nox2apic"); | 1380 | pr_debug("intr-remapping not supported\n"); |
1326 | 1381 | goto ir_failed; | |
1327 | if (!x2apic_preenabled && skip_ioapic_setup) { | ||
1328 | pr_info("Skipped enabling x2apic and Interrupt-remapping " | ||
1329 | "because of skipping io-apic setup\n"); | ||
1330 | return; | ||
1331 | } | 1382 | } |
1332 | 1383 | ||
1333 | ret = dmar_table_init(); | ||
1334 | if (ret) { | ||
1335 | pr_info("dmar_table_init() failed with %d:\n", ret); | ||
1336 | 1384 | ||
1337 | if (x2apic_preenabled) | 1385 | if (!x2apic_preenabled && skip_ioapic_setup) { |
1338 | panic("x2apic enabled by bios. But IR enabling failed"); | 1386 | pr_info("Skipped enabling intr-remap because of skipping " |
1339 | else | 1387 | "io-apic setup\n"); |
1340 | pr_info("Not enabling x2apic,Intr-remapping\n"); | ||
1341 | return; | 1388 | return; |
1342 | } | 1389 | } |
1343 | 1390 | ||
@@ -1357,19 +1404,16 @@ void __init enable_IR_x2apic(void) | |||
1357 | mask_IO_APIC_setup(ioapic_entries); | 1404 | mask_IO_APIC_setup(ioapic_entries); |
1358 | mask_8259A(); | 1405 | mask_8259A(); |
1359 | 1406 | ||
1360 | ret = enable_intr_remapping(EIM_32BIT_APIC_ID); | 1407 | ret = enable_intr_remapping(x2apic_supported()); |
1361 | |||
1362 | if (ret && x2apic_preenabled) { | ||
1363 | local_irq_restore(flags); | ||
1364 | panic("x2apic enabled by bios. But IR enabling failed"); | ||
1365 | } | ||
1366 | |||
1367 | if (ret) | 1408 | if (ret) |
1368 | goto end_restore; | 1409 | goto end_restore; |
1369 | 1410 | ||
1370 | if (!x2apic) { | 1411 | pr_info("Enabled Interrupt-remapping\n"); |
1371 | x2apic = 1; | 1412 | |
1413 | if (x2apic_supported() && !x2apic_mode) { | ||
1414 | x2apic_mode = 1; | ||
1372 | enable_x2apic(); | 1415 | enable_x2apic(); |
1416 | pr_info("Enabled x2apic\n"); | ||
1373 | } | 1417 | } |
1374 | 1418 | ||
1375 | end_restore: | 1419 | end_restore: |
@@ -1378,37 +1422,34 @@ end_restore: | |||
1378 | * IR enabling failed | 1422 | * IR enabling failed |
1379 | */ | 1423 | */ |
1380 | restore_IO_APIC_setup(ioapic_entries); | 1424 | restore_IO_APIC_setup(ioapic_entries); |
1381 | else | ||
1382 | reinit_intr_remapped_IO_APIC(x2apic_preenabled, ioapic_entries); | ||
1383 | 1425 | ||
1384 | unmask_8259A(); | 1426 | unmask_8259A(); |
1385 | local_irq_restore(flags); | 1427 | local_irq_restore(flags); |
1386 | 1428 | ||
1387 | end: | 1429 | end: |
1388 | if (!ret) { | ||
1389 | if (!x2apic_preenabled) | ||
1390 | pr_info("Enabled x2apic and interrupt-remapping\n"); | ||
1391 | else | ||
1392 | pr_info("Enabled Interrupt-remapping\n"); | ||
1393 | } else | ||
1394 | pr_err("Failed to enable Interrupt-remapping and x2apic\n"); | ||
1395 | if (ioapic_entries) | 1430 | if (ioapic_entries) |
1396 | free_ioapic_entries(ioapic_entries); | 1431 | free_ioapic_entries(ioapic_entries); |
1432 | |||
1433 | if (!ret) | ||
1434 | return; | ||
1435 | |||
1436 | ir_failed: | ||
1437 | if (x2apic_preenabled) | ||
1438 | panic("x2apic enabled by bios. But IR enabling failed"); | ||
1439 | else if (cpu_has_x2apic) | ||
1440 | pr_info("Not enabling x2apic,Intr-remapping\n"); | ||
1397 | #else | 1441 | #else |
1398 | if (!cpu_has_x2apic) | 1442 | if (!cpu_has_x2apic) |
1399 | return; | 1443 | return; |
1400 | 1444 | ||
1401 | if (x2apic_preenabled) | 1445 | if (x2apic_preenabled) |
1402 | panic("x2apic enabled prior OS handover," | 1446 | panic("x2apic enabled prior OS handover," |
1403 | " enable CONFIG_INTR_REMAP"); | 1447 | " enable CONFIG_X86_X2APIC, CONFIG_INTR_REMAP"); |
1404 | |||
1405 | pr_info("Enable CONFIG_INTR_REMAP for enabling intr-remapping " | ||
1406 | " and x2apic\n"); | ||
1407 | #endif | 1448 | #endif |
1408 | 1449 | ||
1409 | return; | 1450 | return; |
1410 | } | 1451 | } |
1411 | #endif /* CONFIG_X86_X2APIC */ | 1452 | |
1412 | 1453 | ||
1413 | #ifdef CONFIG_X86_64 | 1454 | #ifdef CONFIG_X86_64 |
1414 | /* | 1455 | /* |
@@ -1425,7 +1466,6 @@ static int __init detect_init_APIC(void) | |||
1425 | } | 1466 | } |
1426 | 1467 | ||
1427 | mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; | 1468 | mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; |
1428 | boot_cpu_physical_apicid = 0; | ||
1429 | return 0; | 1469 | return 0; |
1430 | } | 1470 | } |
1431 | #else | 1471 | #else |
@@ -1539,32 +1579,49 @@ void __init early_init_lapic_mapping(void) | |||
1539 | */ | 1579 | */ |
1540 | void __init init_apic_mappings(void) | 1580 | void __init init_apic_mappings(void) |
1541 | { | 1581 | { |
1542 | if (x2apic) { | 1582 | unsigned int new_apicid; |
1583 | |||
1584 | if (x2apic_mode) { | ||
1543 | boot_cpu_physical_apicid = read_apic_id(); | 1585 | boot_cpu_physical_apicid = read_apic_id(); |
1544 | return; | 1586 | return; |
1545 | } | 1587 | } |
1546 | 1588 | ||
1547 | /* | 1589 | /* If no local APIC can be found return early */ |
1548 | * If no local APIC can be found then set up a fake all | ||
1549 | * zeroes page to simulate the local APIC and another | ||
1550 | * one for the IO-APIC. | ||
1551 | */ | ||
1552 | if (!smp_found_config && detect_init_APIC()) { | 1590 | if (!smp_found_config && detect_init_APIC()) { |
1553 | apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE); | 1591 | /* lets NOP'ify apic operations */ |
1554 | apic_phys = __pa(apic_phys); | 1592 | pr_info("APIC: disable apic facility\n"); |
1555 | } else | 1593 | apic_disable(); |
1594 | } else { | ||
1556 | apic_phys = mp_lapic_addr; | 1595 | apic_phys = mp_lapic_addr; |
1557 | 1596 | ||
1558 | set_fixmap_nocache(FIX_APIC_BASE, apic_phys); | 1597 | /* |
1559 | apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n", | 1598 | * acpi lapic path already maps that address in |
1560 | APIC_BASE, apic_phys); | 1599 | * acpi_register_lapic_address() |
1600 | */ | ||
1601 | if (!acpi_lapic) | ||
1602 | set_fixmap_nocache(FIX_APIC_BASE, apic_phys); | ||
1603 | |||
1604 | apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n", | ||
1605 | APIC_BASE, apic_phys); | ||
1606 | } | ||
1561 | 1607 | ||
1562 | /* | 1608 | /* |
1563 | * Fetch the APIC ID of the BSP in case we have a | 1609 | * Fetch the APIC ID of the BSP in case we have a |
1564 | * default configuration (or the MP table is broken). | 1610 | * default configuration (or the MP table is broken). |
1565 | */ | 1611 | */ |
1566 | if (boot_cpu_physical_apicid == -1U) | 1612 | new_apicid = read_apic_id(); |
1567 | boot_cpu_physical_apicid = read_apic_id(); | 1613 | if (boot_cpu_physical_apicid != new_apicid) { |
1614 | boot_cpu_physical_apicid = new_apicid; | ||
1615 | /* | ||
1616 | * yeah -- we lie about apic_version | ||
1617 | * in case if apic was disabled via boot option | ||
1618 | * but it's not a problem for SMP compiled kernel | ||
1619 | * since smp_sanity_check is prepared for such a case | ||
1620 | * and disable smp mode | ||
1621 | */ | ||
1622 | apic_version[new_apicid] = | ||
1623 | GET_APIC_VERSION(apic_read(APIC_LVR)); | ||
1624 | } | ||
1568 | } | 1625 | } |
1569 | 1626 | ||
1570 | /* | 1627 | /* |
@@ -1733,8 +1790,7 @@ void __init connect_bsp_APIC(void) | |||
1733 | */ | 1790 | */ |
1734 | apic_printk(APIC_VERBOSE, "leaving PIC mode, " | 1791 | apic_printk(APIC_VERBOSE, "leaving PIC mode, " |
1735 | "enabling APIC mode.\n"); | 1792 | "enabling APIC mode.\n"); |
1736 | outb(0x70, 0x22); | 1793 | imcr_pic_to_apic(); |
1737 | outb(0x01, 0x23); | ||
1738 | } | 1794 | } |
1739 | #endif | 1795 | #endif |
1740 | if (apic->enable_apic_mode) | 1796 | if (apic->enable_apic_mode) |
@@ -1762,8 +1818,7 @@ void disconnect_bsp_APIC(int virt_wire_setup) | |||
1762 | */ | 1818 | */ |
1763 | apic_printk(APIC_VERBOSE, "disabling APIC mode, " | 1819 | apic_printk(APIC_VERBOSE, "disabling APIC mode, " |
1764 | "entering PIC mode.\n"); | 1820 | "entering PIC mode.\n"); |
1765 | outb(0x70, 0x22); | 1821 | imcr_apic_to_pic(); |
1766 | outb(0x00, 0x23); | ||
1767 | return; | 1822 | return; |
1768 | } | 1823 | } |
1769 | #endif | 1824 | #endif |
@@ -1962,17 +2017,17 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state) | |||
1962 | apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); | 2017 | apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); |
1963 | apic_pm_state.apic_tmict = apic_read(APIC_TMICT); | 2018 | apic_pm_state.apic_tmict = apic_read(APIC_TMICT); |
1964 | apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); | 2019 | apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); |
1965 | #if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL) | 2020 | #ifdef CONFIG_X86_THERMAL_VECTOR |
1966 | if (maxlvt >= 5) | 2021 | if (maxlvt >= 5) |
1967 | apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); | 2022 | apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); |
1968 | #endif | 2023 | #endif |
1969 | 2024 | ||
1970 | local_irq_save(flags); | 2025 | local_irq_save(flags); |
1971 | disable_local_APIC(); | 2026 | disable_local_APIC(); |
1972 | #ifdef CONFIG_INTR_REMAP | 2027 | |
1973 | if (intr_remapping_enabled) | 2028 | if (intr_remapping_enabled) |
1974 | disable_intr_remapping(); | 2029 | disable_intr_remapping(); |
1975 | #endif | 2030 | |
1976 | local_irq_restore(flags); | 2031 | local_irq_restore(flags); |
1977 | return 0; | 2032 | return 0; |
1978 | } | 2033 | } |
@@ -1982,42 +2037,34 @@ static int lapic_resume(struct sys_device *dev) | |||
1982 | unsigned int l, h; | 2037 | unsigned int l, h; |
1983 | unsigned long flags; | 2038 | unsigned long flags; |
1984 | int maxlvt; | 2039 | int maxlvt; |
1985 | 2040 | int ret = 0; | |
1986 | #ifdef CONFIG_INTR_REMAP | ||
1987 | int ret; | ||
1988 | struct IO_APIC_route_entry **ioapic_entries = NULL; | 2041 | struct IO_APIC_route_entry **ioapic_entries = NULL; |
1989 | 2042 | ||
1990 | if (!apic_pm_state.active) | 2043 | if (!apic_pm_state.active) |
1991 | return 0; | 2044 | return 0; |
1992 | 2045 | ||
1993 | local_irq_save(flags); | 2046 | local_irq_save(flags); |
1994 | if (x2apic) { | 2047 | if (intr_remapping_enabled) { |
1995 | ioapic_entries = alloc_ioapic_entries(); | 2048 | ioapic_entries = alloc_ioapic_entries(); |
1996 | if (!ioapic_entries) { | 2049 | if (!ioapic_entries) { |
1997 | WARN(1, "Alloc ioapic_entries in lapic resume failed."); | 2050 | WARN(1, "Alloc ioapic_entries in lapic resume failed."); |
1998 | return -ENOMEM; | 2051 | ret = -ENOMEM; |
2052 | goto restore; | ||
1999 | } | 2053 | } |
2000 | 2054 | ||
2001 | ret = save_IO_APIC_setup(ioapic_entries); | 2055 | ret = save_IO_APIC_setup(ioapic_entries); |
2002 | if (ret) { | 2056 | if (ret) { |
2003 | WARN(1, "Saving IO-APIC state failed: %d\n", ret); | 2057 | WARN(1, "Saving IO-APIC state failed: %d\n", ret); |
2004 | free_ioapic_entries(ioapic_entries); | 2058 | free_ioapic_entries(ioapic_entries); |
2005 | return ret; | 2059 | goto restore; |
2006 | } | 2060 | } |
2007 | 2061 | ||
2008 | mask_IO_APIC_setup(ioapic_entries); | 2062 | mask_IO_APIC_setup(ioapic_entries); |
2009 | mask_8259A(); | 2063 | mask_8259A(); |
2010 | enable_x2apic(); | ||
2011 | } | 2064 | } |
2012 | #else | ||
2013 | if (!apic_pm_state.active) | ||
2014 | return 0; | ||
2015 | 2065 | ||
2016 | local_irq_save(flags); | 2066 | if (x2apic_mode) |
2017 | if (x2apic) | ||
2018 | enable_x2apic(); | 2067 | enable_x2apic(); |
2019 | #endif | ||
2020 | |||
2021 | else { | 2068 | else { |
2022 | /* | 2069 | /* |
2023 | * Make sure the APICBASE points to the right address | 2070 | * Make sure the APICBASE points to the right address |
@@ -2055,21 +2102,16 @@ static int lapic_resume(struct sys_device *dev) | |||
2055 | apic_write(APIC_ESR, 0); | 2102 | apic_write(APIC_ESR, 0); |
2056 | apic_read(APIC_ESR); | 2103 | apic_read(APIC_ESR); |
2057 | 2104 | ||
2058 | #ifdef CONFIG_INTR_REMAP | 2105 | if (intr_remapping_enabled) { |
2059 | if (intr_remapping_enabled) | 2106 | reenable_intr_remapping(x2apic_mode); |
2060 | reenable_intr_remapping(EIM_32BIT_APIC_ID); | ||
2061 | |||
2062 | if (x2apic) { | ||
2063 | unmask_8259A(); | 2107 | unmask_8259A(); |
2064 | restore_IO_APIC_setup(ioapic_entries); | 2108 | restore_IO_APIC_setup(ioapic_entries); |
2065 | free_ioapic_entries(ioapic_entries); | 2109 | free_ioapic_entries(ioapic_entries); |
2066 | } | 2110 | } |
2067 | #endif | 2111 | restore: |
2068 | |||
2069 | local_irq_restore(flags); | 2112 | local_irq_restore(flags); |
2070 | 2113 | ||
2071 | 2114 | return ret; | |
2072 | return 0; | ||
2073 | } | 2115 | } |
2074 | 2116 | ||
2075 | /* | 2117 | /* |
@@ -2117,31 +2159,14 @@ static void apic_pm_activate(void) { } | |||
2117 | #endif /* CONFIG_PM */ | 2159 | #endif /* CONFIG_PM */ |
2118 | 2160 | ||
2119 | #ifdef CONFIG_X86_64 | 2161 | #ifdef CONFIG_X86_64 |
2120 | /* | 2162 | |
2121 | * apic_is_clustered_box() -- Check if we can expect good TSC | 2163 | static int __cpuinit apic_cluster_num(void) |
2122 | * | ||
2123 | * Thus far, the major user of this is IBM's Summit2 series: | ||
2124 | * | ||
2125 | * Clustered boxes may have unsynced TSC problems if they are | ||
2126 | * multi-chassis. Use available data to take a good guess. | ||
2127 | * If in doubt, go HPET. | ||
2128 | */ | ||
2129 | __cpuinit int apic_is_clustered_box(void) | ||
2130 | { | 2164 | { |
2131 | int i, clusters, zeros; | 2165 | int i, clusters, zeros; |
2132 | unsigned id; | 2166 | unsigned id; |
2133 | u16 *bios_cpu_apicid; | 2167 | u16 *bios_cpu_apicid; |
2134 | DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS); | 2168 | DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS); |
2135 | 2169 | ||
2136 | /* | ||
2137 | * there is not this kind of box with AMD CPU yet. | ||
2138 | * Some AMD box with quadcore cpu and 8 sockets apicid | ||
2139 | * will be [4, 0x23] or [8, 0x27] could be thought to | ||
2140 | * vsmp box still need checking... | ||
2141 | */ | ||
2142 | if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && !is_vsmp_box()) | ||
2143 | return 0; | ||
2144 | |||
2145 | bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid); | 2170 | bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid); |
2146 | bitmap_zero(clustermap, NUM_APIC_CLUSTERS); | 2171 | bitmap_zero(clustermap, NUM_APIC_CLUSTERS); |
2147 | 2172 | ||
@@ -2177,18 +2202,67 @@ __cpuinit int apic_is_clustered_box(void) | |||
2177 | ++zeros; | 2202 | ++zeros; |
2178 | } | 2203 | } |
2179 | 2204 | ||
2180 | /* ScaleMP vSMPowered boxes have one cluster per board and TSCs are | 2205 | return clusters; |
2181 | * not guaranteed to be synced between boards | 2206 | } |
2182 | */ | 2207 | |
2183 | if (is_vsmp_box() && clusters > 1) | 2208 | static int __cpuinitdata multi_checked; |
2209 | static int __cpuinitdata multi; | ||
2210 | |||
2211 | static int __cpuinit set_multi(const struct dmi_system_id *d) | ||
2212 | { | ||
2213 | if (multi) | ||
2214 | return 0; | ||
2215 | pr_info("APIC: %s detected, Multi Chassis\n", d->ident); | ||
2216 | multi = 1; | ||
2217 | return 0; | ||
2218 | } | ||
2219 | |||
2220 | static const __cpuinitconst struct dmi_system_id multi_dmi_table[] = { | ||
2221 | { | ||
2222 | .callback = set_multi, | ||
2223 | .ident = "IBM System Summit2", | ||
2224 | .matches = { | ||
2225 | DMI_MATCH(DMI_SYS_VENDOR, "IBM"), | ||
2226 | DMI_MATCH(DMI_PRODUCT_NAME, "Summit2"), | ||
2227 | }, | ||
2228 | }, | ||
2229 | {} | ||
2230 | }; | ||
2231 | |||
2232 | static void __cpuinit dmi_check_multi(void) | ||
2233 | { | ||
2234 | if (multi_checked) | ||
2235 | return; | ||
2236 | |||
2237 | dmi_check_system(multi_dmi_table); | ||
2238 | multi_checked = 1; | ||
2239 | } | ||
2240 | |||
2241 | /* | ||
2242 | * apic_is_clustered_box() -- Check if we can expect good TSC | ||
2243 | * | ||
2244 | * Thus far, the major user of this is IBM's Summit2 series: | ||
2245 | * Clustered boxes may have unsynced TSC problems if they are | ||
2246 | * multi-chassis. | ||
2247 | * Use DMI to check them | ||
2248 | */ | ||
2249 | __cpuinit int apic_is_clustered_box(void) | ||
2250 | { | ||
2251 | dmi_check_multi(); | ||
2252 | if (multi) | ||
2184 | return 1; | 2253 | return 1; |
2185 | 2254 | ||
2255 | if (!is_vsmp_box()) | ||
2256 | return 0; | ||
2257 | |||
2186 | /* | 2258 | /* |
2187 | * If clusters > 2, then should be multi-chassis. | 2259 | * ScaleMP vSMPowered boxes have one cluster per board and TSCs are |
2188 | * May have to revisit this when multi-core + hyperthreaded CPUs come | 2260 | * not guaranteed to be synced between boards |
2189 | * out, but AFAIK this will work even for them. | ||
2190 | */ | 2261 | */ |
2191 | return (clusters > 2); | 2262 | if (apic_cluster_num() > 1) |
2263 | return 1; | ||
2264 | |||
2265 | return 0; | ||
2192 | } | 2266 | } |
2193 | #endif | 2267 | #endif |
2194 | 2268 | ||
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 306e5e88fb6f..d0c99abc26c3 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c | |||
@@ -161,7 +161,7 @@ static int flat_apic_id_registered(void) | |||
161 | 161 | ||
162 | static int flat_phys_pkg_id(int initial_apic_id, int index_msb) | 162 | static int flat_phys_pkg_id(int initial_apic_id, int index_msb) |
163 | { | 163 | { |
164 | return hard_smp_processor_id() >> index_msb; | 164 | return initial_apic_id >> index_msb; |
165 | } | 165 | } |
166 | 166 | ||
167 | struct apic apic_flat = { | 167 | struct apic apic_flat = { |
@@ -235,7 +235,7 @@ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) | |||
235 | * regardless of how many processors are present (x86_64 ES7000 | 235 | * regardless of how many processors are present (x86_64 ES7000 |
236 | * is an example). | 236 | * is an example). |
237 | */ | 237 | */ |
238 | if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID && | 238 | if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID && |
239 | (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) { | 239 | (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) { |
240 | printk(KERN_DEBUG "system APIC only can use physical flat"); | 240 | printk(KERN_DEBUG "system APIC only can use physical flat"); |
241 | return 1; | 241 | return 1; |
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index 1c11b819f245..69328ac8de9c 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c | |||
@@ -145,7 +145,7 @@ es7000_rename_gsi(int ioapic, int gsi) | |||
145 | return gsi; | 145 | return gsi; |
146 | } | 146 | } |
147 | 147 | ||
148 | static int wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip) | 148 | static int __cpuinit wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip) |
149 | { | 149 | { |
150 | unsigned long vect = 0, psaival = 0; | 150 | unsigned long vect = 0, psaival = 0; |
151 | 151 | ||
@@ -254,7 +254,7 @@ static int parse_unisys_oem(char *oemptr) | |||
254 | } | 254 | } |
255 | 255 | ||
256 | #ifdef CONFIG_ACPI | 256 | #ifdef CONFIG_ACPI |
257 | static int find_unisys_acpi_oem_table(unsigned long *oem_addr) | 257 | static int __init find_unisys_acpi_oem_table(unsigned long *oem_addr) |
258 | { | 258 | { |
259 | struct acpi_table_header *header = NULL; | 259 | struct acpi_table_header *header = NULL; |
260 | struct es7000_oem_table *table; | 260 | struct es7000_oem_table *table; |
@@ -285,7 +285,7 @@ static int find_unisys_acpi_oem_table(unsigned long *oem_addr) | |||
285 | return 0; | 285 | return 0; |
286 | } | 286 | } |
287 | 287 | ||
288 | static void unmap_unisys_acpi_oem_table(unsigned long oem_addr) | 288 | static void __init unmap_unisys_acpi_oem_table(unsigned long oem_addr) |
289 | { | 289 | { |
290 | if (!oem_addr) | 290 | if (!oem_addr) |
291 | return; | 291 | return; |
@@ -306,7 +306,7 @@ static int es7000_check_dsdt(void) | |||
306 | static int es7000_acpi_ret; | 306 | static int es7000_acpi_ret; |
307 | 307 | ||
308 | /* Hook from generic ACPI tables.c */ | 308 | /* Hook from generic ACPI tables.c */ |
309 | static int es7000_acpi_madt_oem_check(char *oem_id, char *oem_table_id) | 309 | static int __init es7000_acpi_madt_oem_check(char *oem_id, char *oem_table_id) |
310 | { | 310 | { |
311 | unsigned long oem_addr = 0; | 311 | unsigned long oem_addr = 0; |
312 | int check_dsdt; | 312 | int check_dsdt; |
@@ -717,7 +717,7 @@ struct apic apic_es7000_cluster = { | |||
717 | .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, | 717 | .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, |
718 | }; | 718 | }; |
719 | 719 | ||
720 | struct apic apic_es7000 = { | 720 | struct apic __refdata apic_es7000 = { |
721 | 721 | ||
722 | .name = "es7000", | 722 | .name = "es7000", |
723 | .probe = probe_es7000, | 723 | .probe = probe_es7000, |
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 30da617d18e4..ef8d9290c7ea 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c | |||
@@ -59,6 +59,7 @@ | |||
59 | #include <asm/setup.h> | 59 | #include <asm/setup.h> |
60 | #include <asm/irq_remapping.h> | 60 | #include <asm/irq_remapping.h> |
61 | #include <asm/hpet.h> | 61 | #include <asm/hpet.h> |
62 | #include <asm/hw_irq.h> | ||
62 | #include <asm/uv/uv_hub.h> | 63 | #include <asm/uv/uv_hub.h> |
63 | #include <asm/uv/uv_irq.h> | 64 | #include <asm/uv/uv_irq.h> |
64 | 65 | ||
@@ -129,12 +130,9 @@ struct irq_pin_list { | |||
129 | struct irq_pin_list *next; | 130 | struct irq_pin_list *next; |
130 | }; | 131 | }; |
131 | 132 | ||
132 | static struct irq_pin_list *get_one_free_irq_2_pin(int cpu) | 133 | static struct irq_pin_list *get_one_free_irq_2_pin(int node) |
133 | { | 134 | { |
134 | struct irq_pin_list *pin; | 135 | struct irq_pin_list *pin; |
135 | int node; | ||
136 | |||
137 | node = cpu_to_node(cpu); | ||
138 | 136 | ||
139 | pin = kzalloc_node(sizeof(*pin), GFP_ATOMIC, node); | 137 | pin = kzalloc_node(sizeof(*pin), GFP_ATOMIC, node); |
140 | 138 | ||
@@ -148,9 +146,6 @@ struct irq_cfg { | |||
148 | unsigned move_cleanup_count; | 146 | unsigned move_cleanup_count; |
149 | u8 vector; | 147 | u8 vector; |
150 | u8 move_in_progress : 1; | 148 | u8 move_in_progress : 1; |
151 | #ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC | ||
152 | u8 move_desc_pending : 1; | ||
153 | #endif | ||
154 | }; | 149 | }; |
155 | 150 | ||
156 | /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ | 151 | /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ |
@@ -182,16 +177,18 @@ int __init arch_early_irq_init(void) | |||
182 | struct irq_cfg *cfg; | 177 | struct irq_cfg *cfg; |
183 | struct irq_desc *desc; | 178 | struct irq_desc *desc; |
184 | int count; | 179 | int count; |
180 | int node; | ||
185 | int i; | 181 | int i; |
186 | 182 | ||
187 | cfg = irq_cfgx; | 183 | cfg = irq_cfgx; |
188 | count = ARRAY_SIZE(irq_cfgx); | 184 | count = ARRAY_SIZE(irq_cfgx); |
185 | node= cpu_to_node(boot_cpu_id); | ||
189 | 186 | ||
190 | for (i = 0; i < count; i++) { | 187 | for (i = 0; i < count; i++) { |
191 | desc = irq_to_desc(i); | 188 | desc = irq_to_desc(i); |
192 | desc->chip_data = &cfg[i]; | 189 | desc->chip_data = &cfg[i]; |
193 | alloc_bootmem_cpumask_var(&cfg[i].domain); | 190 | zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node); |
194 | alloc_bootmem_cpumask_var(&cfg[i].old_domain); | 191 | zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node); |
195 | if (i < NR_IRQS_LEGACY) | 192 | if (i < NR_IRQS_LEGACY) |
196 | cpumask_setall(cfg[i].domain); | 193 | cpumask_setall(cfg[i].domain); |
197 | } | 194 | } |
@@ -212,12 +209,9 @@ static struct irq_cfg *irq_cfg(unsigned int irq) | |||
212 | return cfg; | 209 | return cfg; |
213 | } | 210 | } |
214 | 211 | ||
215 | static struct irq_cfg *get_one_free_irq_cfg(int cpu) | 212 | static struct irq_cfg *get_one_free_irq_cfg(int node) |
216 | { | 213 | { |
217 | struct irq_cfg *cfg; | 214 | struct irq_cfg *cfg; |
218 | int node; | ||
219 | |||
220 | node = cpu_to_node(cpu); | ||
221 | 215 | ||
222 | cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node); | 216 | cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node); |
223 | if (cfg) { | 217 | if (cfg) { |
@@ -238,13 +232,13 @@ static struct irq_cfg *get_one_free_irq_cfg(int cpu) | |||
238 | return cfg; | 232 | return cfg; |
239 | } | 233 | } |
240 | 234 | ||
241 | int arch_init_chip_data(struct irq_desc *desc, int cpu) | 235 | int arch_init_chip_data(struct irq_desc *desc, int node) |
242 | { | 236 | { |
243 | struct irq_cfg *cfg; | 237 | struct irq_cfg *cfg; |
244 | 238 | ||
245 | cfg = desc->chip_data; | 239 | cfg = desc->chip_data; |
246 | if (!cfg) { | 240 | if (!cfg) { |
247 | desc->chip_data = get_one_free_irq_cfg(cpu); | 241 | desc->chip_data = get_one_free_irq_cfg(node); |
248 | if (!desc->chip_data) { | 242 | if (!desc->chip_data) { |
249 | printk(KERN_ERR "can not alloc irq_cfg\n"); | 243 | printk(KERN_ERR "can not alloc irq_cfg\n"); |
250 | BUG_ON(1); | 244 | BUG_ON(1); |
@@ -254,10 +248,9 @@ int arch_init_chip_data(struct irq_desc *desc, int cpu) | |||
254 | return 0; | 248 | return 0; |
255 | } | 249 | } |
256 | 250 | ||
257 | #ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC | 251 | /* for move_irq_desc */ |
258 | |||
259 | static void | 252 | static void |
260 | init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu) | 253 | init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int node) |
261 | { | 254 | { |
262 | struct irq_pin_list *old_entry, *head, *tail, *entry; | 255 | struct irq_pin_list *old_entry, *head, *tail, *entry; |
263 | 256 | ||
@@ -266,7 +259,7 @@ init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu) | |||
266 | if (!old_entry) | 259 | if (!old_entry) |
267 | return; | 260 | return; |
268 | 261 | ||
269 | entry = get_one_free_irq_2_pin(cpu); | 262 | entry = get_one_free_irq_2_pin(node); |
270 | if (!entry) | 263 | if (!entry) |
271 | return; | 264 | return; |
272 | 265 | ||
@@ -276,7 +269,7 @@ init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu) | |||
276 | tail = entry; | 269 | tail = entry; |
277 | old_entry = old_entry->next; | 270 | old_entry = old_entry->next; |
278 | while (old_entry) { | 271 | while (old_entry) { |
279 | entry = get_one_free_irq_2_pin(cpu); | 272 | entry = get_one_free_irq_2_pin(node); |
280 | if (!entry) { | 273 | if (!entry) { |
281 | entry = head; | 274 | entry = head; |
282 | while (entry) { | 275 | while (entry) { |
@@ -316,12 +309,12 @@ static void free_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg) | |||
316 | } | 309 | } |
317 | 310 | ||
318 | void arch_init_copy_chip_data(struct irq_desc *old_desc, | 311 | void arch_init_copy_chip_data(struct irq_desc *old_desc, |
319 | struct irq_desc *desc, int cpu) | 312 | struct irq_desc *desc, int node) |
320 | { | 313 | { |
321 | struct irq_cfg *cfg; | 314 | struct irq_cfg *cfg; |
322 | struct irq_cfg *old_cfg; | 315 | struct irq_cfg *old_cfg; |
323 | 316 | ||
324 | cfg = get_one_free_irq_cfg(cpu); | 317 | cfg = get_one_free_irq_cfg(node); |
325 | 318 | ||
326 | if (!cfg) | 319 | if (!cfg) |
327 | return; | 320 | return; |
@@ -332,7 +325,7 @@ void arch_init_copy_chip_data(struct irq_desc *old_desc, | |||
332 | 325 | ||
333 | memcpy(cfg, old_cfg, sizeof(struct irq_cfg)); | 326 | memcpy(cfg, old_cfg, sizeof(struct irq_cfg)); |
334 | 327 | ||
335 | init_copy_irq_2_pin(old_cfg, cfg, cpu); | 328 | init_copy_irq_2_pin(old_cfg, cfg, node); |
336 | } | 329 | } |
337 | 330 | ||
338 | static void free_irq_cfg(struct irq_cfg *old_cfg) | 331 | static void free_irq_cfg(struct irq_cfg *old_cfg) |
@@ -356,19 +349,7 @@ void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc) | |||
356 | old_desc->chip_data = NULL; | 349 | old_desc->chip_data = NULL; |
357 | } | 350 | } |
358 | } | 351 | } |
359 | 352 | /* end for move_irq_desc */ | |
360 | static void | ||
361 | set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask) | ||
362 | { | ||
363 | struct irq_cfg *cfg = desc->chip_data; | ||
364 | |||
365 | if (!cfg->move_in_progress) { | ||
366 | /* it means that domain is not changed */ | ||
367 | if (!cpumask_intersects(desc->affinity, mask)) | ||
368 | cfg->move_desc_pending = 1; | ||
369 | } | ||
370 | } | ||
371 | #endif | ||
372 | 353 | ||
373 | #else | 354 | #else |
374 | static struct irq_cfg *irq_cfg(unsigned int irq) | 355 | static struct irq_cfg *irq_cfg(unsigned int irq) |
@@ -378,13 +359,6 @@ static struct irq_cfg *irq_cfg(unsigned int irq) | |||
378 | 359 | ||
379 | #endif | 360 | #endif |
380 | 361 | ||
381 | #ifndef CONFIG_NUMA_MIGRATE_IRQ_DESC | ||
382 | static inline void | ||
383 | set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask) | ||
384 | { | ||
385 | } | ||
386 | #endif | ||
387 | |||
388 | struct io_apic { | 362 | struct io_apic { |
389 | unsigned int index; | 363 | unsigned int index; |
390 | unsigned int unused[3]; | 364 | unsigned int unused[3]; |
@@ -518,132 +492,18 @@ static void ioapic_mask_entry(int apic, int pin) | |||
518 | spin_unlock_irqrestore(&ioapic_lock, flags); | 492 | spin_unlock_irqrestore(&ioapic_lock, flags); |
519 | } | 493 | } |
520 | 494 | ||
521 | #ifdef CONFIG_SMP | ||
522 | static void send_cleanup_vector(struct irq_cfg *cfg) | ||
523 | { | ||
524 | cpumask_var_t cleanup_mask; | ||
525 | |||
526 | if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) { | ||
527 | unsigned int i; | ||
528 | cfg->move_cleanup_count = 0; | ||
529 | for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) | ||
530 | cfg->move_cleanup_count++; | ||
531 | for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) | ||
532 | apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR); | ||
533 | } else { | ||
534 | cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask); | ||
535 | cfg->move_cleanup_count = cpumask_weight(cleanup_mask); | ||
536 | apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); | ||
537 | free_cpumask_var(cleanup_mask); | ||
538 | } | ||
539 | cfg->move_in_progress = 0; | ||
540 | } | ||
541 | |||
542 | static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg) | ||
543 | { | ||
544 | int apic, pin; | ||
545 | struct irq_pin_list *entry; | ||
546 | u8 vector = cfg->vector; | ||
547 | |||
548 | entry = cfg->irq_2_pin; | ||
549 | for (;;) { | ||
550 | unsigned int reg; | ||
551 | |||
552 | if (!entry) | ||
553 | break; | ||
554 | |||
555 | apic = entry->apic; | ||
556 | pin = entry->pin; | ||
557 | /* | ||
558 | * With interrupt-remapping, destination information comes | ||
559 | * from interrupt-remapping table entry. | ||
560 | */ | ||
561 | if (!irq_remapped(irq)) | ||
562 | io_apic_write(apic, 0x11 + pin*2, dest); | ||
563 | reg = io_apic_read(apic, 0x10 + pin*2); | ||
564 | reg &= ~IO_APIC_REDIR_VECTOR_MASK; | ||
565 | reg |= vector; | ||
566 | io_apic_modify(apic, 0x10 + pin*2, reg); | ||
567 | if (!entry->next) | ||
568 | break; | ||
569 | entry = entry->next; | ||
570 | } | ||
571 | } | ||
572 | |||
573 | static int | ||
574 | assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask); | ||
575 | |||
576 | /* | ||
577 | * Either sets desc->affinity to a valid value, and returns | ||
578 | * ->cpu_mask_to_apicid of that, or returns BAD_APICID and | ||
579 | * leaves desc->affinity untouched. | ||
580 | */ | ||
581 | static unsigned int | ||
582 | set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask) | ||
583 | { | ||
584 | struct irq_cfg *cfg; | ||
585 | unsigned int irq; | ||
586 | |||
587 | if (!cpumask_intersects(mask, cpu_online_mask)) | ||
588 | return BAD_APICID; | ||
589 | |||
590 | irq = desc->irq; | ||
591 | cfg = desc->chip_data; | ||
592 | if (assign_irq_vector(irq, cfg, mask)) | ||
593 | return BAD_APICID; | ||
594 | |||
595 | /* check that before desc->addinity get updated */ | ||
596 | set_extra_move_desc(desc, mask); | ||
597 | |||
598 | cpumask_copy(desc->affinity, mask); | ||
599 | |||
600 | return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain); | ||
601 | } | ||
602 | |||
603 | static void | ||
604 | set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) | ||
605 | { | ||
606 | struct irq_cfg *cfg; | ||
607 | unsigned long flags; | ||
608 | unsigned int dest; | ||
609 | unsigned int irq; | ||
610 | |||
611 | irq = desc->irq; | ||
612 | cfg = desc->chip_data; | ||
613 | |||
614 | spin_lock_irqsave(&ioapic_lock, flags); | ||
615 | dest = set_desc_affinity(desc, mask); | ||
616 | if (dest != BAD_APICID) { | ||
617 | /* Only the high 8 bits are valid. */ | ||
618 | dest = SET_APIC_LOGICAL_ID(dest); | ||
619 | __target_IO_APIC_irq(irq, dest, cfg); | ||
620 | } | ||
621 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
622 | } | ||
623 | |||
624 | static void | ||
625 | set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask) | ||
626 | { | ||
627 | struct irq_desc *desc; | ||
628 | |||
629 | desc = irq_to_desc(irq); | ||
630 | |||
631 | set_ioapic_affinity_irq_desc(desc, mask); | ||
632 | } | ||
633 | #endif /* CONFIG_SMP */ | ||
634 | |||
635 | /* | 495 | /* |
636 | * The common case is 1:1 IRQ<->pin mappings. Sometimes there are | 496 | * The common case is 1:1 IRQ<->pin mappings. Sometimes there are |
637 | * shared ISA-space IRQs, so we have to support them. We are super | 497 | * shared ISA-space IRQs, so we have to support them. We are super |
638 | * fast in the common case, and fast for shared ISA-space IRQs. | 498 | * fast in the common case, and fast for shared ISA-space IRQs. |
639 | */ | 499 | */ |
640 | static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin) | 500 | static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin) |
641 | { | 501 | { |
642 | struct irq_pin_list *entry; | 502 | struct irq_pin_list *entry; |
643 | 503 | ||
644 | entry = cfg->irq_2_pin; | 504 | entry = cfg->irq_2_pin; |
645 | if (!entry) { | 505 | if (!entry) { |
646 | entry = get_one_free_irq_2_pin(cpu); | 506 | entry = get_one_free_irq_2_pin(node); |
647 | if (!entry) { | 507 | if (!entry) { |
648 | printk(KERN_ERR "can not alloc irq_2_pin to add %d - %d\n", | 508 | printk(KERN_ERR "can not alloc irq_2_pin to add %d - %d\n", |
649 | apic, pin); | 509 | apic, pin); |
@@ -663,7 +523,7 @@ static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin) | |||
663 | entry = entry->next; | 523 | entry = entry->next; |
664 | } | 524 | } |
665 | 525 | ||
666 | entry->next = get_one_free_irq_2_pin(cpu); | 526 | entry->next = get_one_free_irq_2_pin(node); |
667 | entry = entry->next; | 527 | entry = entry->next; |
668 | entry->apic = apic; | 528 | entry->apic = apic; |
669 | entry->pin = pin; | 529 | entry->pin = pin; |
@@ -672,7 +532,7 @@ static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin) | |||
672 | /* | 532 | /* |
673 | * Reroute an IRQ to a different pin. | 533 | * Reroute an IRQ to a different pin. |
674 | */ | 534 | */ |
675 | static void __init replace_pin_at_irq_cpu(struct irq_cfg *cfg, int cpu, | 535 | static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node, |
676 | int oldapic, int oldpin, | 536 | int oldapic, int oldpin, |
677 | int newapic, int newpin) | 537 | int newapic, int newpin) |
678 | { | 538 | { |
@@ -692,7 +552,7 @@ static void __init replace_pin_at_irq_cpu(struct irq_cfg *cfg, int cpu, | |||
692 | 552 | ||
693 | /* why? call replace before add? */ | 553 | /* why? call replace before add? */ |
694 | if (!replaced) | 554 | if (!replaced) |
695 | add_pin_to_irq_cpu(cfg, cpu, newapic, newpin); | 555 | add_pin_to_irq_node(cfg, node, newapic, newpin); |
696 | } | 556 | } |
697 | 557 | ||
698 | static inline void io_apic_modify_irq(struct irq_cfg *cfg, | 558 | static inline void io_apic_modify_irq(struct irq_cfg *cfg, |
@@ -850,7 +710,6 @@ static int __init ioapic_pirq_setup(char *str) | |||
850 | __setup("pirq=", ioapic_pirq_setup); | 710 | __setup("pirq=", ioapic_pirq_setup); |
851 | #endif /* CONFIG_X86_32 */ | 711 | #endif /* CONFIG_X86_32 */ |
852 | 712 | ||
853 | #ifdef CONFIG_INTR_REMAP | ||
854 | struct IO_APIC_route_entry **alloc_ioapic_entries(void) | 713 | struct IO_APIC_route_entry **alloc_ioapic_entries(void) |
855 | { | 714 | { |
856 | int apic; | 715 | int apic; |
@@ -948,20 +807,6 @@ int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries) | |||
948 | return 0; | 807 | return 0; |
949 | } | 808 | } |
950 | 809 | ||
951 | void reinit_intr_remapped_IO_APIC(int intr_remapping, | ||
952 | struct IO_APIC_route_entry **ioapic_entries) | ||
953 | |||
954 | { | ||
955 | /* | ||
956 | * for now plain restore of previous settings. | ||
957 | * TBD: In the case of OS enabling interrupt-remapping, | ||
958 | * IO-APIC RTE's need to be setup to point to interrupt-remapping | ||
959 | * table entries. for now, do a plain restore, and wait for | ||
960 | * the setup_IO_APIC_irqs() to do proper initialization. | ||
961 | */ | ||
962 | restore_IO_APIC_setup(ioapic_entries); | ||
963 | } | ||
964 | |||
965 | void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries) | 810 | void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries) |
966 | { | 811 | { |
967 | int apic; | 812 | int apic; |
@@ -971,7 +816,6 @@ void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries) | |||
971 | 816 | ||
972 | kfree(ioapic_entries); | 817 | kfree(ioapic_entries); |
973 | } | 818 | } |
974 | #endif | ||
975 | 819 | ||
976 | /* | 820 | /* |
977 | * Find the IRQ entry number of a certain pin. | 821 | * Find the IRQ entry number of a certain pin. |
@@ -1032,54 +876,6 @@ static int __init find_isa_irq_apic(int irq, int type) | |||
1032 | return -1; | 876 | return -1; |
1033 | } | 877 | } |
1034 | 878 | ||
1035 | /* | ||
1036 | * Find a specific PCI IRQ entry. | ||
1037 | * Not an __init, possibly needed by modules | ||
1038 | */ | ||
1039 | static int pin_2_irq(int idx, int apic, int pin); | ||
1040 | |||
1041 | int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) | ||
1042 | { | ||
1043 | int apic, i, best_guess = -1; | ||
1044 | |||
1045 | apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n", | ||
1046 | bus, slot, pin); | ||
1047 | if (test_bit(bus, mp_bus_not_pci)) { | ||
1048 | apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus); | ||
1049 | return -1; | ||
1050 | } | ||
1051 | for (i = 0; i < mp_irq_entries; i++) { | ||
1052 | int lbus = mp_irqs[i].srcbus; | ||
1053 | |||
1054 | for (apic = 0; apic < nr_ioapics; apic++) | ||
1055 | if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic || | ||
1056 | mp_irqs[i].dstapic == MP_APIC_ALL) | ||
1057 | break; | ||
1058 | |||
1059 | if (!test_bit(lbus, mp_bus_not_pci) && | ||
1060 | !mp_irqs[i].irqtype && | ||
1061 | (bus == lbus) && | ||
1062 | (slot == ((mp_irqs[i].srcbusirq >> 2) & 0x1f))) { | ||
1063 | int irq = pin_2_irq(i, apic, mp_irqs[i].dstirq); | ||
1064 | |||
1065 | if (!(apic || IO_APIC_IRQ(irq))) | ||
1066 | continue; | ||
1067 | |||
1068 | if (pin == (mp_irqs[i].srcbusirq & 3)) | ||
1069 | return irq; | ||
1070 | /* | ||
1071 | * Use the first all-but-pin matching entry as a | ||
1072 | * best-guess fuzzy result for broken mptables. | ||
1073 | */ | ||
1074 | if (best_guess < 0) | ||
1075 | best_guess = irq; | ||
1076 | } | ||
1077 | } | ||
1078 | return best_guess; | ||
1079 | } | ||
1080 | |||
1081 | EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); | ||
1082 | |||
1083 | #if defined(CONFIG_EISA) || defined(CONFIG_MCA) | 879 | #if defined(CONFIG_EISA) || defined(CONFIG_MCA) |
1084 | /* | 880 | /* |
1085 | * EISA Edge/Level control register, ELCR | 881 | * EISA Edge/Level control register, ELCR |
@@ -1298,6 +1094,64 @@ static int pin_2_irq(int idx, int apic, int pin) | |||
1298 | return irq; | 1094 | return irq; |
1299 | } | 1095 | } |
1300 | 1096 | ||
1097 | /* | ||
1098 | * Find a specific PCI IRQ entry. | ||
1099 | * Not an __init, possibly needed by modules | ||
1100 | */ | ||
1101 | int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin, | ||
1102 | struct io_apic_irq_attr *irq_attr) | ||
1103 | { | ||
1104 | int apic, i, best_guess = -1; | ||
1105 | |||
1106 | apic_printk(APIC_DEBUG, | ||
1107 | "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n", | ||
1108 | bus, slot, pin); | ||
1109 | if (test_bit(bus, mp_bus_not_pci)) { | ||
1110 | apic_printk(APIC_VERBOSE, | ||
1111 | "PCI BIOS passed nonexistent PCI bus %d!\n", bus); | ||
1112 | return -1; | ||
1113 | } | ||
1114 | for (i = 0; i < mp_irq_entries; i++) { | ||
1115 | int lbus = mp_irqs[i].srcbus; | ||
1116 | |||
1117 | for (apic = 0; apic < nr_ioapics; apic++) | ||
1118 | if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic || | ||
1119 | mp_irqs[i].dstapic == MP_APIC_ALL) | ||
1120 | break; | ||
1121 | |||
1122 | if (!test_bit(lbus, mp_bus_not_pci) && | ||
1123 | !mp_irqs[i].irqtype && | ||
1124 | (bus == lbus) && | ||
1125 | (slot == ((mp_irqs[i].srcbusirq >> 2) & 0x1f))) { | ||
1126 | int irq = pin_2_irq(i, apic, mp_irqs[i].dstirq); | ||
1127 | |||
1128 | if (!(apic || IO_APIC_IRQ(irq))) | ||
1129 | continue; | ||
1130 | |||
1131 | if (pin == (mp_irqs[i].srcbusirq & 3)) { | ||
1132 | set_io_apic_irq_attr(irq_attr, apic, | ||
1133 | mp_irqs[i].dstirq, | ||
1134 | irq_trigger(i), | ||
1135 | irq_polarity(i)); | ||
1136 | return irq; | ||
1137 | } | ||
1138 | /* | ||
1139 | * Use the first all-but-pin matching entry as a | ||
1140 | * best-guess fuzzy result for broken mptables. | ||
1141 | */ | ||
1142 | if (best_guess < 0) { | ||
1143 | set_io_apic_irq_attr(irq_attr, apic, | ||
1144 | mp_irqs[i].dstirq, | ||
1145 | irq_trigger(i), | ||
1146 | irq_polarity(i)); | ||
1147 | best_guess = irq; | ||
1148 | } | ||
1149 | } | ||
1150 | } | ||
1151 | return best_guess; | ||
1152 | } | ||
1153 | EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); | ||
1154 | |||
1301 | void lock_vector_lock(void) | 1155 | void lock_vector_lock(void) |
1302 | { | 1156 | { |
1303 | /* Used to the online set of cpus does not change | 1157 | /* Used to the online set of cpus does not change |
@@ -1628,58 +1482,70 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq | |||
1628 | ioapic_write_entry(apic_id, pin, entry); | 1482 | ioapic_write_entry(apic_id, pin, entry); |
1629 | } | 1483 | } |
1630 | 1484 | ||
1485 | static struct { | ||
1486 | DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1); | ||
1487 | } mp_ioapic_routing[MAX_IO_APICS]; | ||
1488 | |||
1631 | static void __init setup_IO_APIC_irqs(void) | 1489 | static void __init setup_IO_APIC_irqs(void) |
1632 | { | 1490 | { |
1633 | int apic_id, pin, idx, irq; | 1491 | int apic_id = 0, pin, idx, irq; |
1634 | int notcon = 0; | 1492 | int notcon = 0; |
1635 | struct irq_desc *desc; | 1493 | struct irq_desc *desc; |
1636 | struct irq_cfg *cfg; | 1494 | struct irq_cfg *cfg; |
1637 | int cpu = boot_cpu_id; | 1495 | int node = cpu_to_node(boot_cpu_id); |
1638 | 1496 | ||
1639 | apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); | 1497 | apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); |
1640 | 1498 | ||
1641 | for (apic_id = 0; apic_id < nr_ioapics; apic_id++) { | 1499 | #ifdef CONFIG_ACPI |
1642 | for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) { | 1500 | if (!acpi_disabled && acpi_ioapic) { |
1643 | 1501 | apic_id = mp_find_ioapic(0); | |
1644 | idx = find_irq_entry(apic_id, pin, mp_INT); | 1502 | if (apic_id < 0) |
1645 | if (idx == -1) { | 1503 | apic_id = 0; |
1646 | if (!notcon) { | 1504 | } |
1647 | notcon = 1; | 1505 | #endif |
1648 | apic_printk(APIC_VERBOSE, | ||
1649 | KERN_DEBUG " %d-%d", | ||
1650 | mp_ioapics[apic_id].apicid, pin); | ||
1651 | } else | ||
1652 | apic_printk(APIC_VERBOSE, " %d-%d", | ||
1653 | mp_ioapics[apic_id].apicid, pin); | ||
1654 | continue; | ||
1655 | } | ||
1656 | if (notcon) { | ||
1657 | apic_printk(APIC_VERBOSE, | ||
1658 | " (apicid-pin) not connected\n"); | ||
1659 | notcon = 0; | ||
1660 | } | ||
1661 | 1506 | ||
1662 | irq = pin_2_irq(idx, apic_id, pin); | 1507 | for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) { |
1508 | idx = find_irq_entry(apic_id, pin, mp_INT); | ||
1509 | if (idx == -1) { | ||
1510 | if (!notcon) { | ||
1511 | notcon = 1; | ||
1512 | apic_printk(APIC_VERBOSE, | ||
1513 | KERN_DEBUG " %d-%d", | ||
1514 | mp_ioapics[apic_id].apicid, pin); | ||
1515 | } else | ||
1516 | apic_printk(APIC_VERBOSE, " %d-%d", | ||
1517 | mp_ioapics[apic_id].apicid, pin); | ||
1518 | continue; | ||
1519 | } | ||
1520 | if (notcon) { | ||
1521 | apic_printk(APIC_VERBOSE, | ||
1522 | " (apicid-pin) not connected\n"); | ||
1523 | notcon = 0; | ||
1524 | } | ||
1663 | 1525 | ||
1664 | /* | 1526 | irq = pin_2_irq(idx, apic_id, pin); |
1665 | * Skip the timer IRQ if there's a quirk handler | ||
1666 | * installed and if it returns 1: | ||
1667 | */ | ||
1668 | if (apic->multi_timer_check && | ||
1669 | apic->multi_timer_check(apic_id, irq)) | ||
1670 | continue; | ||
1671 | 1527 | ||
1672 | desc = irq_to_desc_alloc_cpu(irq, cpu); | 1528 | /* |
1673 | if (!desc) { | 1529 | * Skip the timer IRQ if there's a quirk handler |
1674 | printk(KERN_INFO "can not get irq_desc for %d\n", irq); | 1530 | * installed and if it returns 1: |
1675 | continue; | 1531 | */ |
1676 | } | 1532 | if (apic->multi_timer_check && |
1677 | cfg = desc->chip_data; | 1533 | apic->multi_timer_check(apic_id, irq)) |
1678 | add_pin_to_irq_cpu(cfg, cpu, apic_id, pin); | 1534 | continue; |
1679 | 1535 | ||
1680 | setup_IO_APIC_irq(apic_id, pin, irq, desc, | 1536 | desc = irq_to_desc_alloc_node(irq, node); |
1681 | irq_trigger(idx), irq_polarity(idx)); | 1537 | if (!desc) { |
1538 | printk(KERN_INFO "can not get irq_desc for %d\n", irq); | ||
1539 | continue; | ||
1682 | } | 1540 | } |
1541 | cfg = desc->chip_data; | ||
1542 | add_pin_to_irq_node(cfg, node, apic_id, pin); | ||
1543 | /* | ||
1544 | * don't mark it in pin_programmed, so later acpi could | ||
1545 | * set it correctly when irq < 16 | ||
1546 | */ | ||
1547 | setup_IO_APIC_irq(apic_id, pin, irq, desc, | ||
1548 | irq_trigger(idx), irq_polarity(idx)); | ||
1683 | } | 1549 | } |
1684 | 1550 | ||
1685 | if (notcon) | 1551 | if (notcon) |
@@ -1869,7 +1735,7 @@ __apicdebuginit(void) print_APIC_bitfield(int base) | |||
1869 | 1735 | ||
1870 | __apicdebuginit(void) print_local_APIC(void *dummy) | 1736 | __apicdebuginit(void) print_local_APIC(void *dummy) |
1871 | { | 1737 | { |
1872 | unsigned int v, ver, maxlvt; | 1738 | unsigned int i, v, ver, maxlvt; |
1873 | u64 icr; | 1739 | u64 icr; |
1874 | 1740 | ||
1875 | if (apic_verbosity == APIC_QUIET) | 1741 | if (apic_verbosity == APIC_QUIET) |
@@ -1957,6 +1823,18 @@ __apicdebuginit(void) print_local_APIC(void *dummy) | |||
1957 | printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v); | 1823 | printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v); |
1958 | v = apic_read(APIC_TDCR); | 1824 | v = apic_read(APIC_TDCR); |
1959 | printk(KERN_DEBUG "... APIC TDCR: %08x\n", v); | 1825 | printk(KERN_DEBUG "... APIC TDCR: %08x\n", v); |
1826 | |||
1827 | if (boot_cpu_has(X86_FEATURE_EXTAPIC)) { | ||
1828 | v = apic_read(APIC_EFEAT); | ||
1829 | maxlvt = (v >> 16) & 0xff; | ||
1830 | printk(KERN_DEBUG "... APIC EFEAT: %08x\n", v); | ||
1831 | v = apic_read(APIC_ECTRL); | ||
1832 | printk(KERN_DEBUG "... APIC ECTRL: %08x\n", v); | ||
1833 | for (i = 0; i < maxlvt; i++) { | ||
1834 | v = apic_read(APIC_EILVTn(i)); | ||
1835 | printk(KERN_DEBUG "... APIC EILVT%d: %08x\n", i, v); | ||
1836 | } | ||
1837 | } | ||
1960 | printk("\n"); | 1838 | printk("\n"); |
1961 | } | 1839 | } |
1962 | 1840 | ||
@@ -2005,6 +1883,11 @@ __apicdebuginit(void) print_PIC(void) | |||
2005 | __apicdebuginit(int) print_all_ICs(void) | 1883 | __apicdebuginit(int) print_all_ICs(void) |
2006 | { | 1884 | { |
2007 | print_PIC(); | 1885 | print_PIC(); |
1886 | |||
1887 | /* don't print out if apic is not there */ | ||
1888 | if (!cpu_has_apic || disable_apic) | ||
1889 | return 0; | ||
1890 | |||
2008 | print_all_local_APICs(); | 1891 | print_all_local_APICs(); |
2009 | print_IO_APIC(); | 1892 | print_IO_APIC(); |
2010 | 1893 | ||
@@ -2360,6 +2243,118 @@ static int ioapic_retrigger_irq(unsigned int irq) | |||
2360 | */ | 2243 | */ |
2361 | 2244 | ||
2362 | #ifdef CONFIG_SMP | 2245 | #ifdef CONFIG_SMP |
2246 | static void send_cleanup_vector(struct irq_cfg *cfg) | ||
2247 | { | ||
2248 | cpumask_var_t cleanup_mask; | ||
2249 | |||
2250 | if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) { | ||
2251 | unsigned int i; | ||
2252 | cfg->move_cleanup_count = 0; | ||
2253 | for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) | ||
2254 | cfg->move_cleanup_count++; | ||
2255 | for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) | ||
2256 | apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR); | ||
2257 | } else { | ||
2258 | cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask); | ||
2259 | cfg->move_cleanup_count = cpumask_weight(cleanup_mask); | ||
2260 | apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); | ||
2261 | free_cpumask_var(cleanup_mask); | ||
2262 | } | ||
2263 | cfg->move_in_progress = 0; | ||
2264 | } | ||
2265 | |||
2266 | static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg) | ||
2267 | { | ||
2268 | int apic, pin; | ||
2269 | struct irq_pin_list *entry; | ||
2270 | u8 vector = cfg->vector; | ||
2271 | |||
2272 | entry = cfg->irq_2_pin; | ||
2273 | for (;;) { | ||
2274 | unsigned int reg; | ||
2275 | |||
2276 | if (!entry) | ||
2277 | break; | ||
2278 | |||
2279 | apic = entry->apic; | ||
2280 | pin = entry->pin; | ||
2281 | /* | ||
2282 | * With interrupt-remapping, destination information comes | ||
2283 | * from interrupt-remapping table entry. | ||
2284 | */ | ||
2285 | if (!irq_remapped(irq)) | ||
2286 | io_apic_write(apic, 0x11 + pin*2, dest); | ||
2287 | reg = io_apic_read(apic, 0x10 + pin*2); | ||
2288 | reg &= ~IO_APIC_REDIR_VECTOR_MASK; | ||
2289 | reg |= vector; | ||
2290 | io_apic_modify(apic, 0x10 + pin*2, reg); | ||
2291 | if (!entry->next) | ||
2292 | break; | ||
2293 | entry = entry->next; | ||
2294 | } | ||
2295 | } | ||
2296 | |||
2297 | static int | ||
2298 | assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask); | ||
2299 | |||
2300 | /* | ||
2301 | * Either sets desc->affinity to a valid value, and returns | ||
2302 | * ->cpu_mask_to_apicid of that, or returns BAD_APICID and | ||
2303 | * leaves desc->affinity untouched. | ||
2304 | */ | ||
2305 | static unsigned int | ||
2306 | set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask) | ||
2307 | { | ||
2308 | struct irq_cfg *cfg; | ||
2309 | unsigned int irq; | ||
2310 | |||
2311 | if (!cpumask_intersects(mask, cpu_online_mask)) | ||
2312 | return BAD_APICID; | ||
2313 | |||
2314 | irq = desc->irq; | ||
2315 | cfg = desc->chip_data; | ||
2316 | if (assign_irq_vector(irq, cfg, mask)) | ||
2317 | return BAD_APICID; | ||
2318 | |||
2319 | cpumask_copy(desc->affinity, mask); | ||
2320 | |||
2321 | return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain); | ||
2322 | } | ||
2323 | |||
2324 | static int | ||
2325 | set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) | ||
2326 | { | ||
2327 | struct irq_cfg *cfg; | ||
2328 | unsigned long flags; | ||
2329 | unsigned int dest; | ||
2330 | unsigned int irq; | ||
2331 | int ret = -1; | ||
2332 | |||
2333 | irq = desc->irq; | ||
2334 | cfg = desc->chip_data; | ||
2335 | |||
2336 | spin_lock_irqsave(&ioapic_lock, flags); | ||
2337 | dest = set_desc_affinity(desc, mask); | ||
2338 | if (dest != BAD_APICID) { | ||
2339 | /* Only the high 8 bits are valid. */ | ||
2340 | dest = SET_APIC_LOGICAL_ID(dest); | ||
2341 | __target_IO_APIC_irq(irq, dest, cfg); | ||
2342 | ret = 0; | ||
2343 | } | ||
2344 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
2345 | |||
2346 | return ret; | ||
2347 | } | ||
2348 | |||
2349 | static int | ||
2350 | set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask) | ||
2351 | { | ||
2352 | struct irq_desc *desc; | ||
2353 | |||
2354 | desc = irq_to_desc(irq); | ||
2355 | |||
2356 | return set_ioapic_affinity_irq_desc(desc, mask); | ||
2357 | } | ||
2363 | 2358 | ||
2364 | #ifdef CONFIG_INTR_REMAP | 2359 | #ifdef CONFIG_INTR_REMAP |
2365 | 2360 | ||
@@ -2374,26 +2369,25 @@ static int ioapic_retrigger_irq(unsigned int irq) | |||
2374 | * Real vector that is used for interrupting cpu will be coming from | 2369 | * Real vector that is used for interrupting cpu will be coming from |
2375 | * the interrupt-remapping table entry. | 2370 | * the interrupt-remapping table entry. |
2376 | */ | 2371 | */ |
2377 | static void | 2372 | static int |
2378 | migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask) | 2373 | migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask) |
2379 | { | 2374 | { |
2380 | struct irq_cfg *cfg; | 2375 | struct irq_cfg *cfg; |
2381 | struct irte irte; | 2376 | struct irte irte; |
2382 | unsigned int dest; | 2377 | unsigned int dest; |
2383 | unsigned int irq; | 2378 | unsigned int irq; |
2379 | int ret = -1; | ||
2384 | 2380 | ||
2385 | if (!cpumask_intersects(mask, cpu_online_mask)) | 2381 | if (!cpumask_intersects(mask, cpu_online_mask)) |
2386 | return; | 2382 | return ret; |
2387 | 2383 | ||
2388 | irq = desc->irq; | 2384 | irq = desc->irq; |
2389 | if (get_irte(irq, &irte)) | 2385 | if (get_irte(irq, &irte)) |
2390 | return; | 2386 | return ret; |
2391 | 2387 | ||
2392 | cfg = desc->chip_data; | 2388 | cfg = desc->chip_data; |
2393 | if (assign_irq_vector(irq, cfg, mask)) | 2389 | if (assign_irq_vector(irq, cfg, mask)) |
2394 | return; | 2390 | return ret; |
2395 | |||
2396 | set_extra_move_desc(desc, mask); | ||
2397 | 2391 | ||
2398 | dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask); | 2392 | dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask); |
2399 | 2393 | ||
@@ -2409,27 +2403,30 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask) | |||
2409 | send_cleanup_vector(cfg); | 2403 | send_cleanup_vector(cfg); |
2410 | 2404 | ||
2411 | cpumask_copy(desc->affinity, mask); | 2405 | cpumask_copy(desc->affinity, mask); |
2406 | |||
2407 | return 0; | ||
2412 | } | 2408 | } |
2413 | 2409 | ||
2414 | /* | 2410 | /* |
2415 | * Migrates the IRQ destination in the process context. | 2411 | * Migrates the IRQ destination in the process context. |
2416 | */ | 2412 | */ |
2417 | static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, | 2413 | static int set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, |
2418 | const struct cpumask *mask) | 2414 | const struct cpumask *mask) |
2419 | { | 2415 | { |
2420 | migrate_ioapic_irq_desc(desc, mask); | 2416 | return migrate_ioapic_irq_desc(desc, mask); |
2421 | } | 2417 | } |
2422 | static void set_ir_ioapic_affinity_irq(unsigned int irq, | 2418 | static int set_ir_ioapic_affinity_irq(unsigned int irq, |
2423 | const struct cpumask *mask) | 2419 | const struct cpumask *mask) |
2424 | { | 2420 | { |
2425 | struct irq_desc *desc = irq_to_desc(irq); | 2421 | struct irq_desc *desc = irq_to_desc(irq); |
2426 | 2422 | ||
2427 | set_ir_ioapic_affinity_irq_desc(desc, mask); | 2423 | return set_ir_ioapic_affinity_irq_desc(desc, mask); |
2428 | } | 2424 | } |
2429 | #else | 2425 | #else |
2430 | static inline void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, | 2426 | static inline int set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, |
2431 | const struct cpumask *mask) | 2427 | const struct cpumask *mask) |
2432 | { | 2428 | { |
2429 | return 0; | ||
2433 | } | 2430 | } |
2434 | #endif | 2431 | #endif |
2435 | 2432 | ||
@@ -2491,86 +2488,19 @@ static void irq_complete_move(struct irq_desc **descp) | |||
2491 | struct irq_cfg *cfg = desc->chip_data; | 2488 | struct irq_cfg *cfg = desc->chip_data; |
2492 | unsigned vector, me; | 2489 | unsigned vector, me; |
2493 | 2490 | ||
2494 | if (likely(!cfg->move_in_progress)) { | 2491 | if (likely(!cfg->move_in_progress)) |
2495 | #ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC | ||
2496 | if (likely(!cfg->move_desc_pending)) | ||
2497 | return; | ||
2498 | |||
2499 | /* domain has not changed, but affinity did */ | ||
2500 | me = smp_processor_id(); | ||
2501 | if (cpumask_test_cpu(me, desc->affinity)) { | ||
2502 | *descp = desc = move_irq_desc(desc, me); | ||
2503 | /* get the new one */ | ||
2504 | cfg = desc->chip_data; | ||
2505 | cfg->move_desc_pending = 0; | ||
2506 | } | ||
2507 | #endif | ||
2508 | return; | 2492 | return; |
2509 | } | ||
2510 | 2493 | ||
2511 | vector = ~get_irq_regs()->orig_ax; | 2494 | vector = ~get_irq_regs()->orig_ax; |
2512 | me = smp_processor_id(); | 2495 | me = smp_processor_id(); |
2513 | 2496 | ||
2514 | if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) { | 2497 | if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) |
2515 | #ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC | ||
2516 | *descp = desc = move_irq_desc(desc, me); | ||
2517 | /* get the new one */ | ||
2518 | cfg = desc->chip_data; | ||
2519 | #endif | ||
2520 | send_cleanup_vector(cfg); | 2498 | send_cleanup_vector(cfg); |
2521 | } | ||
2522 | } | 2499 | } |
2523 | #else | 2500 | #else |
2524 | static inline void irq_complete_move(struct irq_desc **descp) {} | 2501 | static inline void irq_complete_move(struct irq_desc **descp) {} |
2525 | #endif | 2502 | #endif |
2526 | 2503 | ||
2527 | static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) | ||
2528 | { | ||
2529 | int apic, pin; | ||
2530 | struct irq_pin_list *entry; | ||
2531 | |||
2532 | entry = cfg->irq_2_pin; | ||
2533 | for (;;) { | ||
2534 | |||
2535 | if (!entry) | ||
2536 | break; | ||
2537 | |||
2538 | apic = entry->apic; | ||
2539 | pin = entry->pin; | ||
2540 | io_apic_eoi(apic, pin); | ||
2541 | entry = entry->next; | ||
2542 | } | ||
2543 | } | ||
2544 | |||
2545 | static void | ||
2546 | eoi_ioapic_irq(struct irq_desc *desc) | ||
2547 | { | ||
2548 | struct irq_cfg *cfg; | ||
2549 | unsigned long flags; | ||
2550 | unsigned int irq; | ||
2551 | |||
2552 | irq = desc->irq; | ||
2553 | cfg = desc->chip_data; | ||
2554 | |||
2555 | spin_lock_irqsave(&ioapic_lock, flags); | ||
2556 | __eoi_ioapic_irq(irq, cfg); | ||
2557 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
2558 | } | ||
2559 | |||
2560 | #ifdef CONFIG_X86_X2APIC | ||
2561 | static void ack_x2apic_level(unsigned int irq) | ||
2562 | { | ||
2563 | struct irq_desc *desc = irq_to_desc(irq); | ||
2564 | ack_x2APIC_irq(); | ||
2565 | eoi_ioapic_irq(desc); | ||
2566 | } | ||
2567 | |||
2568 | static void ack_x2apic_edge(unsigned int irq) | ||
2569 | { | ||
2570 | ack_x2APIC_irq(); | ||
2571 | } | ||
2572 | #endif | ||
2573 | |||
2574 | static void ack_apic_edge(unsigned int irq) | 2504 | static void ack_apic_edge(unsigned int irq) |
2575 | { | 2505 | { |
2576 | struct irq_desc *desc = irq_to_desc(irq); | 2506 | struct irq_desc *desc = irq_to_desc(irq); |
@@ -2634,9 +2564,6 @@ static void ack_apic_level(unsigned int irq) | |||
2634 | */ | 2564 | */ |
2635 | ack_APIC_irq(); | 2565 | ack_APIC_irq(); |
2636 | 2566 | ||
2637 | if (irq_remapped(irq)) | ||
2638 | eoi_ioapic_irq(desc); | ||
2639 | |||
2640 | /* Now we can move and renable the irq */ | 2567 | /* Now we can move and renable the irq */ |
2641 | if (unlikely(do_unmask_irq)) { | 2568 | if (unlikely(do_unmask_irq)) { |
2642 | /* Only migrate the irq if the ack has been received. | 2569 | /* Only migrate the irq if the ack has been received. |
@@ -2683,22 +2610,50 @@ static void ack_apic_level(unsigned int irq) | |||
2683 | } | 2610 | } |
2684 | 2611 | ||
2685 | #ifdef CONFIG_INTR_REMAP | 2612 | #ifdef CONFIG_INTR_REMAP |
2613 | static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) | ||
2614 | { | ||
2615 | int apic, pin; | ||
2616 | struct irq_pin_list *entry; | ||
2617 | |||
2618 | entry = cfg->irq_2_pin; | ||
2619 | for (;;) { | ||
2620 | |||
2621 | if (!entry) | ||
2622 | break; | ||
2623 | |||
2624 | apic = entry->apic; | ||
2625 | pin = entry->pin; | ||
2626 | io_apic_eoi(apic, pin); | ||
2627 | entry = entry->next; | ||
2628 | } | ||
2629 | } | ||
2630 | |||
2631 | static void | ||
2632 | eoi_ioapic_irq(struct irq_desc *desc) | ||
2633 | { | ||
2634 | struct irq_cfg *cfg; | ||
2635 | unsigned long flags; | ||
2636 | unsigned int irq; | ||
2637 | |||
2638 | irq = desc->irq; | ||
2639 | cfg = desc->chip_data; | ||
2640 | |||
2641 | spin_lock_irqsave(&ioapic_lock, flags); | ||
2642 | __eoi_ioapic_irq(irq, cfg); | ||
2643 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
2644 | } | ||
2645 | |||
2686 | static void ir_ack_apic_edge(unsigned int irq) | 2646 | static void ir_ack_apic_edge(unsigned int irq) |
2687 | { | 2647 | { |
2688 | #ifdef CONFIG_X86_X2APIC | 2648 | ack_APIC_irq(); |
2689 | if (x2apic_enabled()) | ||
2690 | return ack_x2apic_edge(irq); | ||
2691 | #endif | ||
2692 | return ack_apic_edge(irq); | ||
2693 | } | 2649 | } |
2694 | 2650 | ||
2695 | static void ir_ack_apic_level(unsigned int irq) | 2651 | static void ir_ack_apic_level(unsigned int irq) |
2696 | { | 2652 | { |
2697 | #ifdef CONFIG_X86_X2APIC | 2653 | struct irq_desc *desc = irq_to_desc(irq); |
2698 | if (x2apic_enabled()) | 2654 | |
2699 | return ack_x2apic_level(irq); | 2655 | ack_APIC_irq(); |
2700 | #endif | 2656 | eoi_ioapic_irq(desc); |
2701 | return ack_apic_level(irq); | ||
2702 | } | 2657 | } |
2703 | #endif /* CONFIG_INTR_REMAP */ | 2658 | #endif /* CONFIG_INTR_REMAP */ |
2704 | 2659 | ||
@@ -2903,7 +2858,7 @@ static inline void __init check_timer(void) | |||
2903 | { | 2858 | { |
2904 | struct irq_desc *desc = irq_to_desc(0); | 2859 | struct irq_desc *desc = irq_to_desc(0); |
2905 | struct irq_cfg *cfg = desc->chip_data; | 2860 | struct irq_cfg *cfg = desc->chip_data; |
2906 | int cpu = boot_cpu_id; | 2861 | int node = cpu_to_node(boot_cpu_id); |
2907 | int apic1, pin1, apic2, pin2; | 2862 | int apic1, pin1, apic2, pin2; |
2908 | unsigned long flags; | 2863 | unsigned long flags; |
2909 | int no_pin1 = 0; | 2864 | int no_pin1 = 0; |
@@ -2969,7 +2924,7 @@ static inline void __init check_timer(void) | |||
2969 | * Ok, does IRQ0 through the IOAPIC work? | 2924 | * Ok, does IRQ0 through the IOAPIC work? |
2970 | */ | 2925 | */ |
2971 | if (no_pin1) { | 2926 | if (no_pin1) { |
2972 | add_pin_to_irq_cpu(cfg, cpu, apic1, pin1); | 2927 | add_pin_to_irq_node(cfg, node, apic1, pin1); |
2973 | setup_timer_IRQ0_pin(apic1, pin1, cfg->vector); | 2928 | setup_timer_IRQ0_pin(apic1, pin1, cfg->vector); |
2974 | } else { | 2929 | } else { |
2975 | /* for edge trigger, setup_IO_APIC_irq already | 2930 | /* for edge trigger, setup_IO_APIC_irq already |
@@ -3006,7 +2961,7 @@ static inline void __init check_timer(void) | |||
3006 | /* | 2961 | /* |
3007 | * legacy devices should be connected to IO APIC #0 | 2962 | * legacy devices should be connected to IO APIC #0 |
3008 | */ | 2963 | */ |
3009 | replace_pin_at_irq_cpu(cfg, cpu, apic1, pin1, apic2, pin2); | 2964 | replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2); |
3010 | setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); | 2965 | setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); |
3011 | enable_8259A_irq(0); | 2966 | enable_8259A_irq(0); |
3012 | if (timer_irq_works()) { | 2967 | if (timer_irq_works()) { |
@@ -3218,14 +3173,13 @@ static int nr_irqs_gsi = NR_IRQS_LEGACY; | |||
3218 | /* | 3173 | /* |
3219 | * Dynamic irq allocate and deallocation | 3174 | * Dynamic irq allocate and deallocation |
3220 | */ | 3175 | */ |
3221 | unsigned int create_irq_nr(unsigned int irq_want) | 3176 | unsigned int create_irq_nr(unsigned int irq_want, int node) |
3222 | { | 3177 | { |
3223 | /* Allocate an unused irq */ | 3178 | /* Allocate an unused irq */ |
3224 | unsigned int irq; | 3179 | unsigned int irq; |
3225 | unsigned int new; | 3180 | unsigned int new; |
3226 | unsigned long flags; | 3181 | unsigned long flags; |
3227 | struct irq_cfg *cfg_new = NULL; | 3182 | struct irq_cfg *cfg_new = NULL; |
3228 | int cpu = boot_cpu_id; | ||
3229 | struct irq_desc *desc_new = NULL; | 3183 | struct irq_desc *desc_new = NULL; |
3230 | 3184 | ||
3231 | irq = 0; | 3185 | irq = 0; |
@@ -3234,7 +3188,7 @@ unsigned int create_irq_nr(unsigned int irq_want) | |||
3234 | 3188 | ||
3235 | spin_lock_irqsave(&vector_lock, flags); | 3189 | spin_lock_irqsave(&vector_lock, flags); |
3236 | for (new = irq_want; new < nr_irqs; new++) { | 3190 | for (new = irq_want; new < nr_irqs; new++) { |
3237 | desc_new = irq_to_desc_alloc_cpu(new, cpu); | 3191 | desc_new = irq_to_desc_alloc_node(new, node); |
3238 | if (!desc_new) { | 3192 | if (!desc_new) { |
3239 | printk(KERN_INFO "can not get irq_desc for %d\n", new); | 3193 | printk(KERN_INFO "can not get irq_desc for %d\n", new); |
3240 | continue; | 3194 | continue; |
@@ -3243,6 +3197,9 @@ unsigned int create_irq_nr(unsigned int irq_want) | |||
3243 | 3197 | ||
3244 | if (cfg_new->vector != 0) | 3198 | if (cfg_new->vector != 0) |
3245 | continue; | 3199 | continue; |
3200 | |||
3201 | desc_new = move_irq_desc(desc_new, node); | ||
3202 | |||
3246 | if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0) | 3203 | if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0) |
3247 | irq = new; | 3204 | irq = new; |
3248 | break; | 3205 | break; |
@@ -3260,11 +3217,12 @@ unsigned int create_irq_nr(unsigned int irq_want) | |||
3260 | 3217 | ||
3261 | int create_irq(void) | 3218 | int create_irq(void) |
3262 | { | 3219 | { |
3220 | int node = cpu_to_node(boot_cpu_id); | ||
3263 | unsigned int irq_want; | 3221 | unsigned int irq_want; |
3264 | int irq; | 3222 | int irq; |
3265 | 3223 | ||
3266 | irq_want = nr_irqs_gsi; | 3224 | irq_want = nr_irqs_gsi; |
3267 | irq = create_irq_nr(irq_want); | 3225 | irq = create_irq_nr(irq_want, node); |
3268 | 3226 | ||
3269 | if (irq == 0) | 3227 | if (irq == 0) |
3270 | irq = -1; | 3228 | irq = -1; |
@@ -3366,7 +3324,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms | |||
3366 | } | 3324 | } |
3367 | 3325 | ||
3368 | #ifdef CONFIG_SMP | 3326 | #ifdef CONFIG_SMP |
3369 | static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) | 3327 | static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) |
3370 | { | 3328 | { |
3371 | struct irq_desc *desc = irq_to_desc(irq); | 3329 | struct irq_desc *desc = irq_to_desc(irq); |
3372 | struct irq_cfg *cfg; | 3330 | struct irq_cfg *cfg; |
@@ -3375,7 +3333,7 @@ static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) | |||
3375 | 3333 | ||
3376 | dest = set_desc_affinity(desc, mask); | 3334 | dest = set_desc_affinity(desc, mask); |
3377 | if (dest == BAD_APICID) | 3335 | if (dest == BAD_APICID) |
3378 | return; | 3336 | return -1; |
3379 | 3337 | ||
3380 | cfg = desc->chip_data; | 3338 | cfg = desc->chip_data; |
3381 | 3339 | ||
@@ -3387,13 +3345,15 @@ static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) | |||
3387 | msg.address_lo |= MSI_ADDR_DEST_ID(dest); | 3345 | msg.address_lo |= MSI_ADDR_DEST_ID(dest); |
3388 | 3346 | ||
3389 | write_msi_msg_desc(desc, &msg); | 3347 | write_msi_msg_desc(desc, &msg); |
3348 | |||
3349 | return 0; | ||
3390 | } | 3350 | } |
3391 | #ifdef CONFIG_INTR_REMAP | 3351 | #ifdef CONFIG_INTR_REMAP |
3392 | /* | 3352 | /* |
3393 | * Migrate the MSI irq to another cpumask. This migration is | 3353 | * Migrate the MSI irq to another cpumask. This migration is |
3394 | * done in the process context using interrupt-remapping hardware. | 3354 | * done in the process context using interrupt-remapping hardware. |
3395 | */ | 3355 | */ |
3396 | static void | 3356 | static int |
3397 | ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) | 3357 | ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) |
3398 | { | 3358 | { |
3399 | struct irq_desc *desc = irq_to_desc(irq); | 3359 | struct irq_desc *desc = irq_to_desc(irq); |
@@ -3402,11 +3362,11 @@ ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) | |||
3402 | struct irte irte; | 3362 | struct irte irte; |
3403 | 3363 | ||
3404 | if (get_irte(irq, &irte)) | 3364 | if (get_irte(irq, &irte)) |
3405 | return; | 3365 | return -1; |
3406 | 3366 | ||
3407 | dest = set_desc_affinity(desc, mask); | 3367 | dest = set_desc_affinity(desc, mask); |
3408 | if (dest == BAD_APICID) | 3368 | if (dest == BAD_APICID) |
3409 | return; | 3369 | return -1; |
3410 | 3370 | ||
3411 | irte.vector = cfg->vector; | 3371 | irte.vector = cfg->vector; |
3412 | irte.dest_id = IRTE_DEST(dest); | 3372 | irte.dest_id = IRTE_DEST(dest); |
@@ -3423,6 +3383,8 @@ ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) | |||
3423 | */ | 3383 | */ |
3424 | if (cfg->move_in_progress) | 3384 | if (cfg->move_in_progress) |
3425 | send_cleanup_vector(cfg); | 3385 | send_cleanup_vector(cfg); |
3386 | |||
3387 | return 0; | ||
3426 | } | 3388 | } |
3427 | 3389 | ||
3428 | #endif | 3390 | #endif |
@@ -3518,15 +3480,17 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) | |||
3518 | unsigned int irq_want; | 3480 | unsigned int irq_want; |
3519 | struct intel_iommu *iommu = NULL; | 3481 | struct intel_iommu *iommu = NULL; |
3520 | int index = 0; | 3482 | int index = 0; |
3483 | int node; | ||
3521 | 3484 | ||
3522 | /* x86 doesn't support multiple MSI yet */ | 3485 | /* x86 doesn't support multiple MSI yet */ |
3523 | if (type == PCI_CAP_ID_MSI && nvec > 1) | 3486 | if (type == PCI_CAP_ID_MSI && nvec > 1) |
3524 | return 1; | 3487 | return 1; |
3525 | 3488 | ||
3489 | node = dev_to_node(&dev->dev); | ||
3526 | irq_want = nr_irqs_gsi; | 3490 | irq_want = nr_irqs_gsi; |
3527 | sub_handle = 0; | 3491 | sub_handle = 0; |
3528 | list_for_each_entry(msidesc, &dev->msi_list, list) { | 3492 | list_for_each_entry(msidesc, &dev->msi_list, list) { |
3529 | irq = create_irq_nr(irq_want); | 3493 | irq = create_irq_nr(irq_want, node); |
3530 | if (irq == 0) | 3494 | if (irq == 0) |
3531 | return -1; | 3495 | return -1; |
3532 | irq_want = irq + 1; | 3496 | irq_want = irq + 1; |
@@ -3576,7 +3540,7 @@ void arch_teardown_msi_irq(unsigned int irq) | |||
3576 | 3540 | ||
3577 | #if defined (CONFIG_DMAR) || defined (CONFIG_INTR_REMAP) | 3541 | #if defined (CONFIG_DMAR) || defined (CONFIG_INTR_REMAP) |
3578 | #ifdef CONFIG_SMP | 3542 | #ifdef CONFIG_SMP |
3579 | static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) | 3543 | static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) |
3580 | { | 3544 | { |
3581 | struct irq_desc *desc = irq_to_desc(irq); | 3545 | struct irq_desc *desc = irq_to_desc(irq); |
3582 | struct irq_cfg *cfg; | 3546 | struct irq_cfg *cfg; |
@@ -3585,7 +3549,7 @@ static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) | |||
3585 | 3549 | ||
3586 | dest = set_desc_affinity(desc, mask); | 3550 | dest = set_desc_affinity(desc, mask); |
3587 | if (dest == BAD_APICID) | 3551 | if (dest == BAD_APICID) |
3588 | return; | 3552 | return -1; |
3589 | 3553 | ||
3590 | cfg = desc->chip_data; | 3554 | cfg = desc->chip_data; |
3591 | 3555 | ||
@@ -3597,6 +3561,8 @@ static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) | |||
3597 | msg.address_lo |= MSI_ADDR_DEST_ID(dest); | 3561 | msg.address_lo |= MSI_ADDR_DEST_ID(dest); |
3598 | 3562 | ||
3599 | dmar_msi_write(irq, &msg); | 3563 | dmar_msi_write(irq, &msg); |
3564 | |||
3565 | return 0; | ||
3600 | } | 3566 | } |
3601 | 3567 | ||
3602 | #endif /* CONFIG_SMP */ | 3568 | #endif /* CONFIG_SMP */ |
@@ -3630,7 +3596,7 @@ int arch_setup_dmar_msi(unsigned int irq) | |||
3630 | #ifdef CONFIG_HPET_TIMER | 3596 | #ifdef CONFIG_HPET_TIMER |
3631 | 3597 | ||
3632 | #ifdef CONFIG_SMP | 3598 | #ifdef CONFIG_SMP |
3633 | static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask) | 3599 | static int hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask) |
3634 | { | 3600 | { |
3635 | struct irq_desc *desc = irq_to_desc(irq); | 3601 | struct irq_desc *desc = irq_to_desc(irq); |
3636 | struct irq_cfg *cfg; | 3602 | struct irq_cfg *cfg; |
@@ -3639,7 +3605,7 @@ static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask) | |||
3639 | 3605 | ||
3640 | dest = set_desc_affinity(desc, mask); | 3606 | dest = set_desc_affinity(desc, mask); |
3641 | if (dest == BAD_APICID) | 3607 | if (dest == BAD_APICID) |
3642 | return; | 3608 | return -1; |
3643 | 3609 | ||
3644 | cfg = desc->chip_data; | 3610 | cfg = desc->chip_data; |
3645 | 3611 | ||
@@ -3651,6 +3617,8 @@ static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask) | |||
3651 | msg.address_lo |= MSI_ADDR_DEST_ID(dest); | 3617 | msg.address_lo |= MSI_ADDR_DEST_ID(dest); |
3652 | 3618 | ||
3653 | hpet_msi_write(irq, &msg); | 3619 | hpet_msi_write(irq, &msg); |
3620 | |||
3621 | return 0; | ||
3654 | } | 3622 | } |
3655 | 3623 | ||
3656 | #endif /* CONFIG_SMP */ | 3624 | #endif /* CONFIG_SMP */ |
@@ -3707,7 +3675,7 @@ static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector) | |||
3707 | write_ht_irq_msg(irq, &msg); | 3675 | write_ht_irq_msg(irq, &msg); |
3708 | } | 3676 | } |
3709 | 3677 | ||
3710 | static void set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask) | 3678 | static int set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask) |
3711 | { | 3679 | { |
3712 | struct irq_desc *desc = irq_to_desc(irq); | 3680 | struct irq_desc *desc = irq_to_desc(irq); |
3713 | struct irq_cfg *cfg; | 3681 | struct irq_cfg *cfg; |
@@ -3715,11 +3683,13 @@ static void set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask) | |||
3715 | 3683 | ||
3716 | dest = set_desc_affinity(desc, mask); | 3684 | dest = set_desc_affinity(desc, mask); |
3717 | if (dest == BAD_APICID) | 3685 | if (dest == BAD_APICID) |
3718 | return; | 3686 | return -1; |
3719 | 3687 | ||
3720 | cfg = desc->chip_data; | 3688 | cfg = desc->chip_data; |
3721 | 3689 | ||
3722 | target_ht_irq(irq, dest, cfg->vector); | 3690 | target_ht_irq(irq, dest, cfg->vector); |
3691 | |||
3692 | return 0; | ||
3723 | } | 3693 | } |
3724 | 3694 | ||
3725 | #endif | 3695 | #endif |
@@ -3794,6 +3764,8 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, | |||
3794 | unsigned long flags; | 3764 | unsigned long flags; |
3795 | int err; | 3765 | int err; |
3796 | 3766 | ||
3767 | BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); | ||
3768 | |||
3797 | cfg = irq_cfg(irq); | 3769 | cfg = irq_cfg(irq); |
3798 | 3770 | ||
3799 | err = assign_irq_vector(irq, cfg, eligible_cpu); | 3771 | err = assign_irq_vector(irq, cfg, eligible_cpu); |
@@ -3807,15 +3779,13 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, | |||
3807 | 3779 | ||
3808 | mmr_value = 0; | 3780 | mmr_value = 0; |
3809 | entry = (struct uv_IO_APIC_route_entry *)&mmr_value; | 3781 | entry = (struct uv_IO_APIC_route_entry *)&mmr_value; |
3810 | BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); | 3782 | entry->vector = cfg->vector; |
3811 | 3783 | entry->delivery_mode = apic->irq_delivery_mode; | |
3812 | entry->vector = cfg->vector; | 3784 | entry->dest_mode = apic->irq_dest_mode; |
3813 | entry->delivery_mode = apic->irq_delivery_mode; | 3785 | entry->polarity = 0; |
3814 | entry->dest_mode = apic->irq_dest_mode; | 3786 | entry->trigger = 0; |
3815 | entry->polarity = 0; | 3787 | entry->mask = 0; |
3816 | entry->trigger = 0; | 3788 | entry->dest = apic->cpu_mask_to_apicid(eligible_cpu); |
3817 | entry->mask = 0; | ||
3818 | entry->dest = apic->cpu_mask_to_apicid(eligible_cpu); | ||
3819 | 3789 | ||
3820 | mmr_pnode = uv_blade_to_pnode(mmr_blade); | 3790 | mmr_pnode = uv_blade_to_pnode(mmr_blade); |
3821 | uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); | 3791 | uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); |
@@ -3833,10 +3803,10 @@ void arch_disable_uv_irq(int mmr_blade, unsigned long mmr_offset) | |||
3833 | struct uv_IO_APIC_route_entry *entry; | 3803 | struct uv_IO_APIC_route_entry *entry; |
3834 | int mmr_pnode; | 3804 | int mmr_pnode; |
3835 | 3805 | ||
3806 | BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); | ||
3807 | |||
3836 | mmr_value = 0; | 3808 | mmr_value = 0; |
3837 | entry = (struct uv_IO_APIC_route_entry *)&mmr_value; | 3809 | entry = (struct uv_IO_APIC_route_entry *)&mmr_value; |
3838 | BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); | ||
3839 | |||
3840 | entry->mask = 1; | 3810 | entry->mask = 1; |
3841 | 3811 | ||
3842 | mmr_pnode = uv_blade_to_pnode(mmr_blade); | 3812 | mmr_pnode = uv_blade_to_pnode(mmr_blade); |
@@ -3900,6 +3870,71 @@ int __init arch_probe_nr_irqs(void) | |||
3900 | } | 3870 | } |
3901 | #endif | 3871 | #endif |
3902 | 3872 | ||
3873 | static int __io_apic_set_pci_routing(struct device *dev, int irq, | ||
3874 | struct io_apic_irq_attr *irq_attr) | ||
3875 | { | ||
3876 | struct irq_desc *desc; | ||
3877 | struct irq_cfg *cfg; | ||
3878 | int node; | ||
3879 | int ioapic, pin; | ||
3880 | int trigger, polarity; | ||
3881 | |||
3882 | ioapic = irq_attr->ioapic; | ||
3883 | if (!IO_APIC_IRQ(irq)) { | ||
3884 | apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", | ||
3885 | ioapic); | ||
3886 | return -EINVAL; | ||
3887 | } | ||
3888 | |||
3889 | if (dev) | ||
3890 | node = dev_to_node(dev); | ||
3891 | else | ||
3892 | node = cpu_to_node(boot_cpu_id); | ||
3893 | |||
3894 | desc = irq_to_desc_alloc_node(irq, node); | ||
3895 | if (!desc) { | ||
3896 | printk(KERN_INFO "can not get irq_desc %d\n", irq); | ||
3897 | return 0; | ||
3898 | } | ||
3899 | |||
3900 | pin = irq_attr->ioapic_pin; | ||
3901 | trigger = irq_attr->trigger; | ||
3902 | polarity = irq_attr->polarity; | ||
3903 | |||
3904 | /* | ||
3905 | * IRQs < 16 are already in the irq_2_pin[] map | ||
3906 | */ | ||
3907 | if (irq >= NR_IRQS_LEGACY) { | ||
3908 | cfg = desc->chip_data; | ||
3909 | add_pin_to_irq_node(cfg, node, ioapic, pin); | ||
3910 | } | ||
3911 | |||
3912 | setup_IO_APIC_irq(ioapic, pin, irq, desc, trigger, polarity); | ||
3913 | |||
3914 | return 0; | ||
3915 | } | ||
3916 | |||
3917 | int io_apic_set_pci_routing(struct device *dev, int irq, | ||
3918 | struct io_apic_irq_attr *irq_attr) | ||
3919 | { | ||
3920 | int ioapic, pin; | ||
3921 | /* | ||
3922 | * Avoid pin reprogramming. PRTs typically include entries | ||
3923 | * with redundant pin->gsi mappings (but unique PCI devices); | ||
3924 | * we only program the IOAPIC on the first. | ||
3925 | */ | ||
3926 | ioapic = irq_attr->ioapic; | ||
3927 | pin = irq_attr->ioapic_pin; | ||
3928 | if (test_bit(pin, mp_ioapic_routing[ioapic].pin_programmed)) { | ||
3929 | pr_debug("Pin %d-%d already programmed\n", | ||
3930 | mp_ioapics[ioapic].apicid, pin); | ||
3931 | return 0; | ||
3932 | } | ||
3933 | set_bit(pin, mp_ioapic_routing[ioapic].pin_programmed); | ||
3934 | |||
3935 | return __io_apic_set_pci_routing(dev, irq, irq_attr); | ||
3936 | } | ||
3937 | |||
3903 | /* -------------------------------------------------------------------------- | 3938 | /* -------------------------------------------------------------------------- |
3904 | ACPI-based IOAPIC Configuration | 3939 | ACPI-based IOAPIC Configuration |
3905 | -------------------------------------------------------------------------- */ | 3940 | -------------------------------------------------------------------------- */ |
@@ -3980,6 +4015,7 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id) | |||
3980 | 4015 | ||
3981 | return apic_id; | 4016 | return apic_id; |
3982 | } | 4017 | } |
4018 | #endif | ||
3983 | 4019 | ||
3984 | int __init io_apic_get_version(int ioapic) | 4020 | int __init io_apic_get_version(int ioapic) |
3985 | { | 4021 | { |
@@ -3992,39 +4028,6 @@ int __init io_apic_get_version(int ioapic) | |||
3992 | 4028 | ||
3993 | return reg_01.bits.version; | 4029 | return reg_01.bits.version; |
3994 | } | 4030 | } |
3995 | #endif | ||
3996 | |||
3997 | int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity) | ||
3998 | { | ||
3999 | struct irq_desc *desc; | ||
4000 | struct irq_cfg *cfg; | ||
4001 | int cpu = boot_cpu_id; | ||
4002 | |||
4003 | if (!IO_APIC_IRQ(irq)) { | ||
4004 | apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", | ||
4005 | ioapic); | ||
4006 | return -EINVAL; | ||
4007 | } | ||
4008 | |||
4009 | desc = irq_to_desc_alloc_cpu(irq, cpu); | ||
4010 | if (!desc) { | ||
4011 | printk(KERN_INFO "can not get irq_desc %d\n", irq); | ||
4012 | return 0; | ||
4013 | } | ||
4014 | |||
4015 | /* | ||
4016 | * IRQs < 16 are already in the irq_2_pin[] map | ||
4017 | */ | ||
4018 | if (irq >= NR_IRQS_LEGACY) { | ||
4019 | cfg = desc->chip_data; | ||
4020 | add_pin_to_irq_cpu(cfg, cpu, ioapic, pin); | ||
4021 | } | ||
4022 | |||
4023 | setup_IO_APIC_irq(ioapic, pin, irq, desc, triggering, polarity); | ||
4024 | |||
4025 | return 0; | ||
4026 | } | ||
4027 | |||
4028 | 4031 | ||
4029 | int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) | 4032 | int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) |
4030 | { | 4033 | { |
@@ -4055,51 +4058,44 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) | |||
4055 | #ifdef CONFIG_SMP | 4058 | #ifdef CONFIG_SMP |
4056 | void __init setup_ioapic_dest(void) | 4059 | void __init setup_ioapic_dest(void) |
4057 | { | 4060 | { |
4058 | int pin, ioapic, irq, irq_entry; | 4061 | int pin, ioapic = 0, irq, irq_entry; |
4059 | struct irq_desc *desc; | 4062 | struct irq_desc *desc; |
4060 | struct irq_cfg *cfg; | ||
4061 | const struct cpumask *mask; | 4063 | const struct cpumask *mask; |
4062 | 4064 | ||
4063 | if (skip_ioapic_setup == 1) | 4065 | if (skip_ioapic_setup == 1) |
4064 | return; | 4066 | return; |
4065 | 4067 | ||
4066 | for (ioapic = 0; ioapic < nr_ioapics; ioapic++) { | 4068 | #ifdef CONFIG_ACPI |
4067 | for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { | 4069 | if (!acpi_disabled && acpi_ioapic) { |
4068 | irq_entry = find_irq_entry(ioapic, pin, mp_INT); | 4070 | ioapic = mp_find_ioapic(0); |
4069 | if (irq_entry == -1) | 4071 | if (ioapic < 0) |
4070 | continue; | 4072 | ioapic = 0; |
4071 | irq = pin_2_irq(irq_entry, ioapic, pin); | 4073 | } |
4072 | 4074 | #endif | |
4073 | /* setup_IO_APIC_irqs could fail to get vector for some device | ||
4074 | * when you have too many devices, because at that time only boot | ||
4075 | * cpu is online. | ||
4076 | */ | ||
4077 | desc = irq_to_desc(irq); | ||
4078 | cfg = desc->chip_data; | ||
4079 | if (!cfg->vector) { | ||
4080 | setup_IO_APIC_irq(ioapic, pin, irq, desc, | ||
4081 | irq_trigger(irq_entry), | ||
4082 | irq_polarity(irq_entry)); | ||
4083 | continue; | ||
4084 | 4075 | ||
4085 | } | 4076 | for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { |
4077 | irq_entry = find_irq_entry(ioapic, pin, mp_INT); | ||
4078 | if (irq_entry == -1) | ||
4079 | continue; | ||
4080 | irq = pin_2_irq(irq_entry, ioapic, pin); | ||
4086 | 4081 | ||
4087 | /* | 4082 | desc = irq_to_desc(irq); |
4088 | * Honour affinities which have been set in early boot | ||
4089 | */ | ||
4090 | if (desc->status & | ||
4091 | (IRQ_NO_BALANCING | IRQ_AFFINITY_SET)) | ||
4092 | mask = desc->affinity; | ||
4093 | else | ||
4094 | mask = apic->target_cpus(); | ||
4095 | 4083 | ||
4096 | if (intr_remapping_enabled) | 4084 | /* |
4097 | set_ir_ioapic_affinity_irq_desc(desc, mask); | 4085 | * Honour affinities which have been set in early boot |
4098 | else | 4086 | */ |
4099 | set_ioapic_affinity_irq_desc(desc, mask); | 4087 | if (desc->status & |
4100 | } | 4088 | (IRQ_NO_BALANCING | IRQ_AFFINITY_SET)) |
4089 | mask = desc->affinity; | ||
4090 | else | ||
4091 | mask = apic->target_cpus(); | ||
4101 | 4092 | ||
4093 | if (intr_remapping_enabled) | ||
4094 | set_ir_ioapic_affinity_irq_desc(desc, mask); | ||
4095 | else | ||
4096 | set_ioapic_affinity_irq_desc(desc, mask); | ||
4102 | } | 4097 | } |
4098 | |||
4103 | } | 4099 | } |
4104 | #endif | 4100 | #endif |
4105 | 4101 | ||
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c index ce4fbfa315a1..b3025b43b63a 100644 --- a/arch/x86/kernel/apic/nmi.c +++ b/arch/x86/kernel/apic/nmi.c | |||
@@ -66,7 +66,7 @@ static inline unsigned int get_nmi_count(int cpu) | |||
66 | 66 | ||
67 | static inline int mce_in_progress(void) | 67 | static inline int mce_in_progress(void) |
68 | { | 68 | { |
69 | #if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE) | 69 | #if defined(CONFIG_X86_NEW_MCE) |
70 | return atomic_read(&mce_entry) > 0; | 70 | return atomic_read(&mce_entry) > 0; |
71 | #endif | 71 | #endif |
72 | return 0; | 72 | return 0; |
@@ -104,7 +104,7 @@ static __init void nmi_cpu_busy(void *data) | |||
104 | } | 104 | } |
105 | #endif | 105 | #endif |
106 | 106 | ||
107 | static void report_broken_nmi(int cpu, int *prev_nmi_count) | 107 | static void report_broken_nmi(int cpu, unsigned int *prev_nmi_count) |
108 | { | 108 | { |
109 | printk(KERN_CONT "\n"); | 109 | printk(KERN_CONT "\n"); |
110 | 110 | ||
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 01eda2ac65e4..440a8bccd91a 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c | |||
@@ -160,7 +160,6 @@ extern struct apic apic_summit; | |||
160 | extern struct apic apic_bigsmp; | 160 | extern struct apic apic_bigsmp; |
161 | extern struct apic apic_es7000; | 161 | extern struct apic apic_es7000; |
162 | extern struct apic apic_es7000_cluster; | 162 | extern struct apic apic_es7000_cluster; |
163 | extern struct apic apic_default; | ||
164 | 163 | ||
165 | struct apic *apic = &apic_default; | 164 | struct apic *apic = &apic_default; |
166 | EXPORT_SYMBOL_GPL(apic); | 165 | EXPORT_SYMBOL_GPL(apic); |
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c index 1783652bb0e5..bc3e880f9b82 100644 --- a/arch/x86/kernel/apic/probe_64.c +++ b/arch/x86/kernel/apic/probe_64.c | |||
@@ -50,7 +50,7 @@ static struct apic *apic_probe[] __initdata = { | |||
50 | void __init default_setup_apic_routing(void) | 50 | void __init default_setup_apic_routing(void) |
51 | { | 51 | { |
52 | #ifdef CONFIG_X86_X2APIC | 52 | #ifdef CONFIG_X86_X2APIC |
53 | if (x2apic && (apic != &apic_x2apic_phys && | 53 | if (x2apic_mode && (apic != &apic_x2apic_phys && |
54 | #ifdef CONFIG_X86_UV | 54 | #ifdef CONFIG_X86_UV |
55 | apic != &apic_x2apic_uv_x && | 55 | apic != &apic_x2apic_uv_x && |
56 | #endif | 56 | #endif |
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index 9cfe1f415d81..344eee4ac0a4 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c | |||
@@ -173,13 +173,6 @@ static inline int is_WPEG(struct rio_detail *rio){ | |||
173 | rio->type == LookOutAWPEG || rio->type == LookOutBWPEG); | 173 | rio->type == LookOutAWPEG || rio->type == LookOutBWPEG); |
174 | } | 174 | } |
175 | 175 | ||
176 | |||
177 | /* In clustered mode, the high nibble of APIC ID is a cluster number. | ||
178 | * The low nibble is a 4-bit bitmap. */ | ||
179 | #define XAPIC_DEST_CPUS_SHIFT 4 | ||
180 | #define XAPIC_DEST_CPUS_MASK ((1u << XAPIC_DEST_CPUS_SHIFT) - 1) | ||
181 | #define XAPIC_DEST_CLUSTER_MASK (XAPIC_DEST_CPUS_MASK << XAPIC_DEST_CPUS_SHIFT) | ||
182 | |||
183 | #define SUMMIT_APIC_DFR_VALUE (APIC_DFR_CLUSTER) | 176 | #define SUMMIT_APIC_DFR_VALUE (APIC_DFR_CLUSTER) |
184 | 177 | ||
185 | static const struct cpumask *summit_target_cpus(void) | 178 | static const struct cpumask *summit_target_cpus(void) |
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 4a903e2f0d17..8e4cbb255c38 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c | |||
@@ -10,7 +10,7 @@ | |||
10 | #include <asm/apic.h> | 10 | #include <asm/apic.h> |
11 | #include <asm/ipi.h> | 11 | #include <asm/ipi.h> |
12 | 12 | ||
13 | DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid); | 13 | static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid); |
14 | 14 | ||
15 | static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) | 15 | static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) |
16 | { | 16 | { |
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 2bda69352976..096d19aea2f7 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c | |||
@@ -105,7 +105,7 @@ static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask) | |||
105 | cpumask_set_cpu(cpu, retmask); | 105 | cpumask_set_cpu(cpu, retmask); |
106 | } | 106 | } |
107 | 107 | ||
108 | static int uv_wakeup_secondary(int phys_apicid, unsigned long start_rip) | 108 | static int __cpuinit uv_wakeup_secondary(int phys_apicid, unsigned long start_rip) |
109 | { | 109 | { |
110 | #ifdef CONFIG_SMP | 110 | #ifdef CONFIG_SMP |
111 | unsigned long val; | 111 | unsigned long val; |
@@ -463,7 +463,7 @@ static void uv_heartbeat(unsigned long ignored) | |||
463 | uv_set_scir_bits(bits); | 463 | uv_set_scir_bits(bits); |
464 | 464 | ||
465 | /* enable next timer period */ | 465 | /* enable next timer period */ |
466 | mod_timer(timer, jiffies + SCIR_CPU_HB_INTERVAL); | 466 | mod_timer_pinned(timer, jiffies + SCIR_CPU_HB_INTERVAL); |
467 | } | 467 | } |
468 | 468 | ||
469 | static void __cpuinit uv_heartbeat_enable(int cpu) | 469 | static void __cpuinit uv_heartbeat_enable(int cpu) |
@@ -562,7 +562,7 @@ void __init uv_system_init(void) | |||
562 | union uvh_node_id_u node_id; | 562 | union uvh_node_id_u node_id; |
563 | unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size; | 563 | unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size; |
564 | int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val; | 564 | int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val; |
565 | int max_pnode = 0; | 565 | int gnode_extra, max_pnode = 0; |
566 | unsigned long mmr_base, present, paddr; | 566 | unsigned long mmr_base, present, paddr; |
567 | unsigned short pnode_mask; | 567 | unsigned short pnode_mask; |
568 | 568 | ||
@@ -574,6 +574,13 @@ void __init uv_system_init(void) | |||
574 | mmr_base = | 574 | mmr_base = |
575 | uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) & | 575 | uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) & |
576 | ~UV_MMR_ENABLE; | 576 | ~UV_MMR_ENABLE; |
577 | pnode_mask = (1 << n_val) - 1; | ||
578 | node_id.v = uv_read_local_mmr(UVH_NODE_ID); | ||
579 | gnode_extra = (node_id.s.node_id & ~((1 << n_val) - 1)) >> 1; | ||
580 | gnode_upper = ((unsigned long)gnode_extra << m_val); | ||
581 | printk(KERN_DEBUG "UV: N %d, M %d, gnode_upper 0x%lx, gnode_extra 0x%x\n", | ||
582 | n_val, m_val, gnode_upper, gnode_extra); | ||
583 | |||
577 | printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base); | 584 | printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base); |
578 | 585 | ||
579 | for(i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++) | 586 | for(i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++) |
@@ -583,15 +590,18 @@ void __init uv_system_init(void) | |||
583 | 590 | ||
584 | bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades(); | 591 | bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades(); |
585 | uv_blade_info = kmalloc(bytes, GFP_KERNEL); | 592 | uv_blade_info = kmalloc(bytes, GFP_KERNEL); |
593 | BUG_ON(!uv_blade_info); | ||
586 | 594 | ||
587 | get_lowmem_redirect(&lowmem_redir_base, &lowmem_redir_size); | 595 | get_lowmem_redirect(&lowmem_redir_base, &lowmem_redir_size); |
588 | 596 | ||
589 | bytes = sizeof(uv_node_to_blade[0]) * num_possible_nodes(); | 597 | bytes = sizeof(uv_node_to_blade[0]) * num_possible_nodes(); |
590 | uv_node_to_blade = kmalloc(bytes, GFP_KERNEL); | 598 | uv_node_to_blade = kmalloc(bytes, GFP_KERNEL); |
599 | BUG_ON(!uv_node_to_blade); | ||
591 | memset(uv_node_to_blade, 255, bytes); | 600 | memset(uv_node_to_blade, 255, bytes); |
592 | 601 | ||
593 | bytes = sizeof(uv_cpu_to_blade[0]) * num_possible_cpus(); | 602 | bytes = sizeof(uv_cpu_to_blade[0]) * num_possible_cpus(); |
594 | uv_cpu_to_blade = kmalloc(bytes, GFP_KERNEL); | 603 | uv_cpu_to_blade = kmalloc(bytes, GFP_KERNEL); |
604 | BUG_ON(!uv_cpu_to_blade); | ||
595 | memset(uv_cpu_to_blade, 255, bytes); | 605 | memset(uv_cpu_to_blade, 255, bytes); |
596 | 606 | ||
597 | blade = 0; | 607 | blade = 0; |
@@ -607,11 +617,6 @@ void __init uv_system_init(void) | |||
607 | } | 617 | } |
608 | } | 618 | } |
609 | 619 | ||
610 | pnode_mask = (1 << n_val) - 1; | ||
611 | node_id.v = uv_read_local_mmr(UVH_NODE_ID); | ||
612 | gnode_upper = (((unsigned long)node_id.s.node_id) & | ||
613 | ~((1 << n_val) - 1)) << m_val; | ||
614 | |||
615 | uv_bios_init(); | 620 | uv_bios_init(); |
616 | uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, | 621 | uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, |
617 | &sn_coherency_id, &sn_region_size); | 622 | &sn_coherency_id, &sn_region_size); |
@@ -634,6 +639,7 @@ void __init uv_system_init(void) | |||
634 | uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask; | 639 | uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask; |
635 | uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1; | 640 | uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1; |
636 | uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper; | 641 | uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper; |
642 | uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra; | ||
637 | uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base; | 643 | uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base; |
638 | uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id; | 644 | uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id; |
639 | uv_cpu_hub_info(cpu)->scir.offset = SCIR_LOCAL_MMR_BASE + lcpu; | 645 | uv_cpu_hub_info(cpu)->scir.offset = SCIR_LOCAL_MMR_BASE + lcpu; |
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 49e0939bac42..79302e9a33a4 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c | |||
@@ -1233,9 +1233,9 @@ static int suspend(int vetoable) | |||
1233 | int err; | 1233 | int err; |
1234 | struct apm_user *as; | 1234 | struct apm_user *as; |
1235 | 1235 | ||
1236 | device_suspend(PMSG_SUSPEND); | 1236 | dpm_suspend_start(PMSG_SUSPEND); |
1237 | 1237 | ||
1238 | device_power_down(PMSG_SUSPEND); | 1238 | dpm_suspend_noirq(PMSG_SUSPEND); |
1239 | 1239 | ||
1240 | local_irq_disable(); | 1240 | local_irq_disable(); |
1241 | sysdev_suspend(PMSG_SUSPEND); | 1241 | sysdev_suspend(PMSG_SUSPEND); |
@@ -1259,9 +1259,9 @@ static int suspend(int vetoable) | |||
1259 | sysdev_resume(); | 1259 | sysdev_resume(); |
1260 | local_irq_enable(); | 1260 | local_irq_enable(); |
1261 | 1261 | ||
1262 | device_power_up(PMSG_RESUME); | 1262 | dpm_resume_noirq(PMSG_RESUME); |
1263 | 1263 | ||
1264 | device_resume(PMSG_RESUME); | 1264 | dpm_resume_end(PMSG_RESUME); |
1265 | queue_event(APM_NORMAL_RESUME, NULL); | 1265 | queue_event(APM_NORMAL_RESUME, NULL); |
1266 | spin_lock(&user_list_lock); | 1266 | spin_lock(&user_list_lock); |
1267 | for (as = user_list; as != NULL; as = as->next) { | 1267 | for (as = user_list; as != NULL; as = as->next) { |
@@ -1277,7 +1277,7 @@ static void standby(void) | |||
1277 | { | 1277 | { |
1278 | int err; | 1278 | int err; |
1279 | 1279 | ||
1280 | device_power_down(PMSG_SUSPEND); | 1280 | dpm_suspend_noirq(PMSG_SUSPEND); |
1281 | 1281 | ||
1282 | local_irq_disable(); | 1282 | local_irq_disable(); |
1283 | sysdev_suspend(PMSG_SUSPEND); | 1283 | sysdev_suspend(PMSG_SUSPEND); |
@@ -1291,7 +1291,7 @@ static void standby(void) | |||
1291 | sysdev_resume(); | 1291 | sysdev_resume(); |
1292 | local_irq_enable(); | 1292 | local_irq_enable(); |
1293 | 1293 | ||
1294 | device_power_up(PMSG_RESUME); | 1294 | dpm_resume_noirq(PMSG_RESUME); |
1295 | } | 1295 | } |
1296 | 1296 | ||
1297 | static apm_event_t get_event(void) | 1297 | static apm_event_t get_event(void) |
@@ -1376,7 +1376,7 @@ static void check_events(void) | |||
1376 | ignore_bounce = 1; | 1376 | ignore_bounce = 1; |
1377 | if ((event != APM_NORMAL_RESUME) | 1377 | if ((event != APM_NORMAL_RESUME) |
1378 | || (ignore_normal_resume == 0)) { | 1378 | || (ignore_normal_resume == 0)) { |
1379 | device_resume(PMSG_RESUME); | 1379 | dpm_resume_end(PMSG_RESUME); |
1380 | queue_event(event, NULL); | 1380 | queue_event(event, NULL); |
1381 | } | 1381 | } |
1382 | ignore_normal_resume = 0; | 1382 | ignore_normal_resume = 0; |
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 5a6aa1c1162f..dfdbf6403895 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c | |||
@@ -126,6 +126,7 @@ void foo(void) | |||
126 | #if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE) | 126 | #if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE) |
127 | BLANK(); | 127 | BLANK(); |
128 | OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); | 128 | OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); |
129 | OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending); | ||
129 | OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir); | 130 | OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir); |
130 | 131 | ||
131 | BLANK(); | 132 | BLANK(); |
@@ -146,4 +147,5 @@ void foo(void) | |||
146 | OFFSET(BP_loadflags, boot_params, hdr.loadflags); | 147 | OFFSET(BP_loadflags, boot_params, hdr.loadflags); |
147 | OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); | 148 | OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); |
148 | OFFSET(BP_version, boot_params, hdr.version); | 149 | OFFSET(BP_version, boot_params, hdr.version); |
150 | OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment); | ||
149 | } | 151 | } |
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index e72f062fb4b5..898ecc47e129 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c | |||
@@ -125,6 +125,7 @@ int main(void) | |||
125 | OFFSET(BP_loadflags, boot_params, hdr.loadflags); | 125 | OFFSET(BP_loadflags, boot_params, hdr.loadflags); |
126 | OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); | 126 | OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); |
127 | OFFSET(BP_version, boot_params, hdr.version); | 127 | OFFSET(BP_version, boot_params, hdr.version); |
128 | OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment); | ||
128 | 129 | ||
129 | BLANK(); | 130 | BLANK(); |
130 | DEFINE(PAGE_SIZE_asm, PAGE_SIZE); | 131 | DEFINE(PAGE_SIZE_asm, PAGE_SIZE); |
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 4e242f9a06e4..3efcb2b96a15 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile | |||
@@ -1,5 +1,5 @@ | |||
1 | # | 1 | # |
2 | # Makefile for x86-compatible CPU details and quirks | 2 | # Makefile for x86-compatible CPU details, features and quirks |
3 | # | 3 | # |
4 | 4 | ||
5 | # Don't trace early stages of a secondary CPU boot | 5 | # Don't trace early stages of a secondary CPU boot |
@@ -23,11 +23,13 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o | |||
23 | obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o | 23 | obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o |
24 | obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o | 24 | obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o |
25 | 25 | ||
26 | obj-$(CONFIG_X86_MCE) += mcheck/ | 26 | obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o |
27 | obj-$(CONFIG_MTRR) += mtrr/ | ||
28 | obj-$(CONFIG_CPU_FREQ) += cpufreq/ | ||
29 | 27 | ||
30 | obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o | 28 | obj-$(CONFIG_X86_MCE) += mcheck/ |
29 | obj-$(CONFIG_MTRR) += mtrr/ | ||
30 | obj-$(CONFIG_CPU_FREQ) += cpufreq/ | ||
31 | |||
32 | obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o | ||
31 | 33 | ||
32 | quiet_cmd_mkcapflags = MKCAP $@ | 34 | quiet_cmd_mkcapflags = MKCAP $@ |
33 | cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@ | 35 | cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@ |
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 7e4a459daa64..e5b27d8f1b47 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c | |||
@@ -6,6 +6,7 @@ | |||
6 | #include <asm/processor.h> | 6 | #include <asm/processor.h> |
7 | #include <asm/apic.h> | 7 | #include <asm/apic.h> |
8 | #include <asm/cpu.h> | 8 | #include <asm/cpu.h> |
9 | #include <asm/pci-direct.h> | ||
9 | 10 | ||
10 | #ifdef CONFIG_X86_64 | 11 | #ifdef CONFIG_X86_64 |
11 | # include <asm/numa_64.h> | 12 | # include <asm/numa_64.h> |
@@ -272,7 +273,7 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) | |||
272 | #if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) | 273 | #if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) |
273 | int cpu = smp_processor_id(); | 274 | int cpu = smp_processor_id(); |
274 | int node; | 275 | int node; |
275 | unsigned apicid = hard_smp_processor_id(); | 276 | unsigned apicid = cpu_has_apic ? hard_smp_processor_id() : c->apicid; |
276 | 277 | ||
277 | node = c->phys_proc_id; | 278 | node = c->phys_proc_id; |
278 | if (apicid_to_node[apicid] != NUMA_NO_NODE) | 279 | if (apicid_to_node[apicid] != NUMA_NO_NODE) |
@@ -351,6 +352,15 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) | |||
351 | (c->x86_model == 8 && c->x86_mask >= 8)) | 352 | (c->x86_model == 8 && c->x86_mask >= 8)) |
352 | set_cpu_cap(c, X86_FEATURE_K6_MTRR); | 353 | set_cpu_cap(c, X86_FEATURE_K6_MTRR); |
353 | #endif | 354 | #endif |
355 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI) | ||
356 | /* check CPU config space for extended APIC ID */ | ||
357 | if (c->x86 >= 0xf) { | ||
358 | unsigned int val; | ||
359 | val = read_pci_config(0, 24, 0, 0x68); | ||
360 | if ((val & ((1 << 17) | (1 << 18))) == ((1 << 17) | (1 << 18))) | ||
361 | set_cpu_cap(c, X86_FEATURE_EXTD_APICID); | ||
362 | } | ||
363 | #endif | ||
354 | } | 364 | } |
355 | 365 | ||
356 | static void __cpuinit init_amd(struct cpuinfo_x86 *c) | 366 | static void __cpuinit init_amd(struct cpuinfo_x86 *c) |
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c1caefc82e62..9fa33886c0d7 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c | |||
@@ -13,6 +13,7 @@ | |||
13 | #include <linux/io.h> | 13 | #include <linux/io.h> |
14 | 14 | ||
15 | #include <asm/stackprotector.h> | 15 | #include <asm/stackprotector.h> |
16 | #include <asm/perf_counter.h> | ||
16 | #include <asm/mmu_context.h> | 17 | #include <asm/mmu_context.h> |
17 | #include <asm/hypervisor.h> | 18 | #include <asm/hypervisor.h> |
18 | #include <asm/processor.h> | 19 | #include <asm/processor.h> |
@@ -114,6 +115,13 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { | |||
114 | } }; | 115 | } }; |
115 | EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); | 116 | EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); |
116 | 117 | ||
118 | static int __init x86_xsave_setup(char *s) | ||
119 | { | ||
120 | setup_clear_cpu_cap(X86_FEATURE_XSAVE); | ||
121 | return 1; | ||
122 | } | ||
123 | __setup("noxsave", x86_xsave_setup); | ||
124 | |||
117 | #ifdef CONFIG_X86_32 | 125 | #ifdef CONFIG_X86_32 |
118 | static int cachesize_override __cpuinitdata = -1; | 126 | static int cachesize_override __cpuinitdata = -1; |
119 | static int disable_x86_serial_nr __cpuinitdata = 1; | 127 | static int disable_x86_serial_nr __cpuinitdata = 1; |
@@ -292,7 +300,8 @@ static const char *__cpuinit table_lookup_model(struct cpuinfo_x86 *c) | |||
292 | return NULL; /* Not found */ | 300 | return NULL; /* Not found */ |
293 | } | 301 | } |
294 | 302 | ||
295 | __u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata; | 303 | __u32 cpu_caps_cleared[NCAPINTS] __cpuinitdata; |
304 | __u32 cpu_caps_set[NCAPINTS] __cpuinitdata; | ||
296 | 305 | ||
297 | void load_percpu_segment(int cpu) | 306 | void load_percpu_segment(int cpu) |
298 | { | 307 | { |
@@ -478,7 +487,6 @@ out: | |||
478 | static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) | 487 | static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) |
479 | { | 488 | { |
480 | char *v = c->x86_vendor_id; | 489 | char *v = c->x86_vendor_id; |
481 | static int printed; | ||
482 | int i; | 490 | int i; |
483 | 491 | ||
484 | for (i = 0; i < X86_VENDOR_NUM; i++) { | 492 | for (i = 0; i < X86_VENDOR_NUM; i++) { |
@@ -495,13 +503,9 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) | |||
495 | } | 503 | } |
496 | } | 504 | } |
497 | 505 | ||
498 | if (!printed) { | 506 | printk_once(KERN_ERR |
499 | printed++; | 507 | "CPU: vendor_id '%s' unknown, using generic init.\n" \ |
500 | printk(KERN_ERR | 508 | "CPU: Your system may be unstable.\n", v); |
501 | "CPU: vendor_id '%s' unknown, using generic init.\n", v); | ||
502 | |||
503 | printk(KERN_ERR "CPU: Your system may be unstable.\n"); | ||
504 | } | ||
505 | 509 | ||
506 | c->x86_vendor = X86_VENDOR_UNKNOWN; | 510 | c->x86_vendor = X86_VENDOR_UNKNOWN; |
507 | this_cpu = &default_cpu; | 511 | this_cpu = &default_cpu; |
@@ -761,6 +765,12 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) | |||
761 | if (this_cpu->c_identify) | 765 | if (this_cpu->c_identify) |
762 | this_cpu->c_identify(c); | 766 | this_cpu->c_identify(c); |
763 | 767 | ||
768 | /* Clear/Set all flags overriden by options, after probe */ | ||
769 | for (i = 0; i < NCAPINTS; i++) { | ||
770 | c->x86_capability[i] &= ~cpu_caps_cleared[i]; | ||
771 | c->x86_capability[i] |= cpu_caps_set[i]; | ||
772 | } | ||
773 | |||
764 | #ifdef CONFIG_X86_64 | 774 | #ifdef CONFIG_X86_64 |
765 | c->apicid = apic->phys_pkg_id(c->initial_apicid, 0); | 775 | c->apicid = apic->phys_pkg_id(c->initial_apicid, 0); |
766 | #endif | 776 | #endif |
@@ -806,6 +816,16 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) | |||
806 | #endif | 816 | #endif |
807 | 817 | ||
808 | init_hypervisor(c); | 818 | init_hypervisor(c); |
819 | |||
820 | /* | ||
821 | * Clear/Set all flags overriden by options, need do it | ||
822 | * before following smp all cpus cap AND. | ||
823 | */ | ||
824 | for (i = 0; i < NCAPINTS; i++) { | ||
825 | c->x86_capability[i] &= ~cpu_caps_cleared[i]; | ||
826 | c->x86_capability[i] |= cpu_caps_set[i]; | ||
827 | } | ||
828 | |||
809 | /* | 829 | /* |
810 | * On SMP, boot_cpu_data holds the common feature set between | 830 | * On SMP, boot_cpu_data holds the common feature set between |
811 | * all CPUs; so make sure that we indicate which features are | 831 | * all CPUs; so make sure that we indicate which features are |
@@ -818,10 +838,6 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) | |||
818 | boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; | 838 | boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; |
819 | } | 839 | } |
820 | 840 | ||
821 | /* Clear all flags overriden by options */ | ||
822 | for (i = 0; i < NCAPINTS; i++) | ||
823 | c->x86_capability[i] &= ~cleared_cpu_caps[i]; | ||
824 | |||
825 | #ifdef CONFIG_X86_MCE | 841 | #ifdef CONFIG_X86_MCE |
826 | /* Init Machine Check Exception if available. */ | 842 | /* Init Machine Check Exception if available. */ |
827 | mcheck_init(c); | 843 | mcheck_init(c); |
@@ -854,6 +870,7 @@ void __init identify_boot_cpu(void) | |||
854 | #else | 870 | #else |
855 | vgetcpu_set_mode(); | 871 | vgetcpu_set_mode(); |
856 | #endif | 872 | #endif |
873 | init_hw_perf_counters(); | ||
857 | } | 874 | } |
858 | 875 | ||
859 | void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) | 876 | void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) |
diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c index 46e29ab96c6a..6b2a52dd0403 100644 --- a/arch/x86/kernel/cpu/cpu_debug.c +++ b/arch/x86/kernel/cpu/cpu_debug.c | |||
@@ -32,9 +32,7 @@ | |||
32 | 32 | ||
33 | static DEFINE_PER_CPU(struct cpu_cpuX_base, cpu_arr[CPU_REG_ALL_BIT]); | 33 | static DEFINE_PER_CPU(struct cpu_cpuX_base, cpu_arr[CPU_REG_ALL_BIT]); |
34 | static DEFINE_PER_CPU(struct cpu_private *, priv_arr[MAX_CPU_FILES]); | 34 | static DEFINE_PER_CPU(struct cpu_private *, priv_arr[MAX_CPU_FILES]); |
35 | static DEFINE_PER_CPU(unsigned, cpu_modelflag); | ||
36 | static DEFINE_PER_CPU(int, cpu_priv_count); | 35 | static DEFINE_PER_CPU(int, cpu_priv_count); |
37 | static DEFINE_PER_CPU(unsigned, cpu_model); | ||
38 | 36 | ||
39 | static DEFINE_MUTEX(cpu_debug_lock); | 37 | static DEFINE_MUTEX(cpu_debug_lock); |
40 | 38 | ||
@@ -80,302 +78,102 @@ static struct cpu_file_base cpu_file[] = { | |||
80 | { "value", CPU_REG_ALL, 1 }, | 78 | { "value", CPU_REG_ALL, 1 }, |
81 | }; | 79 | }; |
82 | 80 | ||
83 | /* Intel Registers Range */ | 81 | /* CPU Registers Range */ |
84 | static struct cpu_debug_range cpu_intel_range[] = { | 82 | static struct cpu_debug_range cpu_reg_range[] = { |
85 | { 0x00000000, 0x00000001, CPU_MC, CPU_INTEL_ALL }, | 83 | { 0x00000000, 0x00000001, CPU_MC, }, |
86 | { 0x00000006, 0x00000007, CPU_MONITOR, CPU_CX_AT_XE }, | 84 | { 0x00000006, 0x00000007, CPU_MONITOR, }, |
87 | { 0x00000010, 0x00000010, CPU_TIME, CPU_INTEL_ALL }, | 85 | { 0x00000010, 0x00000010, CPU_TIME, }, |
88 | { 0x00000011, 0x00000013, CPU_PMC, CPU_INTEL_PENTIUM }, | 86 | { 0x00000011, 0x00000013, CPU_PMC, }, |
89 | { 0x00000017, 0x00000017, CPU_PLATFORM, CPU_PX_CX_AT_XE }, | 87 | { 0x00000017, 0x00000017, CPU_PLATFORM, }, |
90 | { 0x0000001B, 0x0000001B, CPU_APIC, CPU_P6_CX_AT_XE }, | 88 | { 0x0000001B, 0x0000001B, CPU_APIC, }, |
91 | 89 | { 0x0000002A, 0x0000002B, CPU_POWERON, }, | |
92 | { 0x0000002A, 0x0000002A, CPU_POWERON, CPU_PX_CX_AT_XE }, | 90 | { 0x0000002C, 0x0000002C, CPU_FREQ, }, |
93 | { 0x0000002B, 0x0000002B, CPU_POWERON, CPU_INTEL_XEON }, | 91 | { 0x0000003A, 0x0000003A, CPU_CONTROL, }, |
94 | { 0x0000002C, 0x0000002C, CPU_FREQ, CPU_INTEL_XEON }, | 92 | { 0x00000040, 0x00000047, CPU_LBRANCH, }, |
95 | { 0x0000003A, 0x0000003A, CPU_CONTROL, CPU_CX_AT_XE }, | 93 | { 0x00000060, 0x00000067, CPU_LBRANCH, }, |
96 | 94 | { 0x00000079, 0x00000079, CPU_BIOS, }, | |
97 | { 0x00000040, 0x00000043, CPU_LBRANCH, CPU_PM_CX_AT_XE }, | 95 | { 0x00000088, 0x0000008A, CPU_CACHE, }, |
98 | { 0x00000044, 0x00000047, CPU_LBRANCH, CPU_PM_CO_AT }, | 96 | { 0x0000008B, 0x0000008B, CPU_BIOS, }, |
99 | { 0x00000060, 0x00000063, CPU_LBRANCH, CPU_C2_AT }, | 97 | { 0x0000009B, 0x0000009B, CPU_MONITOR, }, |
100 | { 0x00000064, 0x00000067, CPU_LBRANCH, CPU_INTEL_ATOM }, | 98 | { 0x000000C1, 0x000000C4, CPU_PMC, }, |
101 | 99 | { 0x000000CD, 0x000000CD, CPU_FREQ, }, | |
102 | { 0x00000079, 0x00000079, CPU_BIOS, CPU_P6_CX_AT_XE }, | 100 | { 0x000000E7, 0x000000E8, CPU_PERF, }, |
103 | { 0x00000088, 0x0000008A, CPU_CACHE, CPU_INTEL_P6 }, | 101 | { 0x000000FE, 0x000000FE, CPU_MTRR, }, |
104 | { 0x0000008B, 0x0000008B, CPU_BIOS, CPU_P6_CX_AT_XE }, | 102 | |
105 | { 0x0000009B, 0x0000009B, CPU_MONITOR, CPU_INTEL_XEON }, | 103 | { 0x00000116, 0x0000011E, CPU_CACHE, }, |
106 | 104 | { 0x00000174, 0x00000176, CPU_SYSENTER, }, | |
107 | { 0x000000C1, 0x000000C2, CPU_PMC, CPU_P6_CX_AT }, | 105 | { 0x00000179, 0x0000017B, CPU_MC, }, |
108 | { 0x000000CD, 0x000000CD, CPU_FREQ, CPU_CX_AT }, | 106 | { 0x00000186, 0x00000189, CPU_PMC, }, |
109 | { 0x000000E7, 0x000000E8, CPU_PERF, CPU_CX_AT }, | 107 | { 0x00000198, 0x00000199, CPU_PERF, }, |
110 | { 0x000000FE, 0x000000FE, CPU_MTRR, CPU_P6_CX_XE }, | 108 | { 0x0000019A, 0x0000019A, CPU_TIME, }, |
111 | 109 | { 0x0000019B, 0x0000019D, CPU_THERM, }, | |
112 | { 0x00000116, 0x00000116, CPU_CACHE, CPU_INTEL_P6 }, | 110 | { 0x000001A0, 0x000001A0, CPU_MISC, }, |
113 | { 0x00000118, 0x00000118, CPU_CACHE, CPU_INTEL_P6 }, | 111 | { 0x000001C9, 0x000001C9, CPU_LBRANCH, }, |
114 | { 0x00000119, 0x00000119, CPU_CACHE, CPU_INTEL_PX }, | 112 | { 0x000001D7, 0x000001D8, CPU_LBRANCH, }, |
115 | { 0x0000011A, 0x0000011B, CPU_CACHE, CPU_INTEL_P6 }, | 113 | { 0x000001D9, 0x000001D9, CPU_DEBUG, }, |
116 | { 0x0000011E, 0x0000011E, CPU_CACHE, CPU_PX_CX_AT }, | 114 | { 0x000001DA, 0x000001E0, CPU_LBRANCH, }, |
117 | 115 | ||
118 | { 0x00000174, 0x00000176, CPU_SYSENTER, CPU_P6_CX_AT_XE }, | 116 | { 0x00000200, 0x0000020F, CPU_MTRR, }, |
119 | { 0x00000179, 0x0000017A, CPU_MC, CPU_PX_CX_AT_XE }, | 117 | { 0x00000250, 0x00000250, CPU_MTRR, }, |
120 | { 0x0000017B, 0x0000017B, CPU_MC, CPU_P6_XE }, | 118 | { 0x00000258, 0x00000259, CPU_MTRR, }, |
121 | { 0x00000186, 0x00000187, CPU_PMC, CPU_P6_CX_AT }, | 119 | { 0x00000268, 0x0000026F, CPU_MTRR, }, |
122 | { 0x00000198, 0x00000199, CPU_PERF, CPU_PM_CX_AT_XE }, | 120 | { 0x00000277, 0x00000277, CPU_PAT, }, |
123 | { 0x0000019A, 0x0000019A, CPU_TIME, CPU_PM_CX_AT_XE }, | 121 | { 0x000002FF, 0x000002FF, CPU_MTRR, }, |
124 | { 0x0000019B, 0x0000019D, CPU_THERM, CPU_PM_CX_AT_XE }, | 122 | |
125 | { 0x000001A0, 0x000001A0, CPU_MISC, CPU_PM_CX_AT_XE }, | 123 | { 0x00000300, 0x00000311, CPU_PMC, }, |
126 | 124 | { 0x00000345, 0x00000345, CPU_PMC, }, | |
127 | { 0x000001C9, 0x000001C9, CPU_LBRANCH, CPU_PM_CX_AT }, | 125 | { 0x00000360, 0x00000371, CPU_PMC, }, |
128 | { 0x000001D7, 0x000001D8, CPU_LBRANCH, CPU_INTEL_XEON }, | 126 | { 0x0000038D, 0x00000390, CPU_PMC, }, |
129 | { 0x000001D9, 0x000001D9, CPU_DEBUG, CPU_CX_AT_XE }, | 127 | { 0x000003A0, 0x000003BE, CPU_PMC, }, |
130 | { 0x000001DA, 0x000001DA, CPU_LBRANCH, CPU_INTEL_XEON }, | 128 | { 0x000003C0, 0x000003CD, CPU_PMC, }, |
131 | { 0x000001DB, 0x000001DB, CPU_LBRANCH, CPU_P6_XE }, | 129 | { 0x000003E0, 0x000003E1, CPU_PMC, }, |
132 | { 0x000001DC, 0x000001DC, CPU_LBRANCH, CPU_INTEL_P6 }, | 130 | { 0x000003F0, 0x000003F2, CPU_PMC, }, |
133 | { 0x000001DD, 0x000001DE, CPU_LBRANCH, CPU_PX_CX_AT_XE }, | 131 | |
134 | { 0x000001E0, 0x000001E0, CPU_LBRANCH, CPU_INTEL_P6 }, | 132 | { 0x00000400, 0x00000417, CPU_MC, }, |
135 | 133 | { 0x00000480, 0x0000048B, CPU_VMX, }, | |
136 | { 0x00000200, 0x0000020F, CPU_MTRR, CPU_P6_CX_XE }, | 134 | |
137 | { 0x00000250, 0x00000250, CPU_MTRR, CPU_P6_CX_XE }, | 135 | { 0x00000600, 0x00000600, CPU_DEBUG, }, |
138 | { 0x00000258, 0x00000259, CPU_MTRR, CPU_P6_CX_XE }, | 136 | { 0x00000680, 0x0000068F, CPU_LBRANCH, }, |
139 | { 0x00000268, 0x0000026F, CPU_MTRR, CPU_P6_CX_XE }, | 137 | { 0x000006C0, 0x000006CF, CPU_LBRANCH, }, |
140 | { 0x00000277, 0x00000277, CPU_PAT, CPU_C2_AT_XE }, | 138 | |
141 | { 0x000002FF, 0x000002FF, CPU_MTRR, CPU_P6_CX_XE }, | 139 | { 0x000107CC, 0x000107D3, CPU_PMC, }, |
142 | 140 | ||
143 | { 0x00000300, 0x00000308, CPU_PMC, CPU_INTEL_XEON }, | 141 | { 0xC0000080, 0xC0000080, CPU_FEATURES, }, |
144 | { 0x00000309, 0x0000030B, CPU_PMC, CPU_C2_AT_XE }, | 142 | { 0xC0000081, 0xC0000084, CPU_CALL, }, |
145 | { 0x0000030C, 0x00000311, CPU_PMC, CPU_INTEL_XEON }, | 143 | { 0xC0000100, 0xC0000102, CPU_BASE, }, |
146 | { 0x00000345, 0x00000345, CPU_PMC, CPU_C2_AT }, | 144 | { 0xC0000103, 0xC0000103, CPU_TIME, }, |
147 | { 0x00000360, 0x00000371, CPU_PMC, CPU_INTEL_XEON }, | 145 | |
148 | { 0x0000038D, 0x00000390, CPU_PMC, CPU_C2_AT }, | 146 | { 0xC0010000, 0xC0010007, CPU_PMC, }, |
149 | { 0x000003A0, 0x000003BE, CPU_PMC, CPU_INTEL_XEON }, | 147 | { 0xC0010010, 0xC0010010, CPU_CONF, }, |
150 | { 0x000003C0, 0x000003CD, CPU_PMC, CPU_INTEL_XEON }, | 148 | { 0xC0010015, 0xC0010015, CPU_CONF, }, |
151 | { 0x000003E0, 0x000003E1, CPU_PMC, CPU_INTEL_XEON }, | 149 | { 0xC0010016, 0xC001001A, CPU_MTRR, }, |
152 | { 0x000003F0, 0x000003F0, CPU_PMC, CPU_INTEL_XEON }, | 150 | { 0xC001001D, 0xC001001D, CPU_MTRR, }, |
153 | { 0x000003F1, 0x000003F1, CPU_PMC, CPU_C2_AT_XE }, | 151 | { 0xC001001F, 0xC001001F, CPU_CONF, }, |
154 | { 0x000003F2, 0x000003F2, CPU_PMC, CPU_INTEL_XEON }, | 152 | { 0xC0010030, 0xC0010035, CPU_BIOS, }, |
155 | 153 | { 0xC0010044, 0xC0010048, CPU_MC, }, | |
156 | { 0x00000400, 0x00000402, CPU_MC, CPU_PM_CX_AT_XE }, | 154 | { 0xC0010050, 0xC0010056, CPU_SMM, }, |
157 | { 0x00000403, 0x00000403, CPU_MC, CPU_INTEL_XEON }, | 155 | { 0xC0010058, 0xC0010058, CPU_CONF, }, |
158 | { 0x00000404, 0x00000406, CPU_MC, CPU_PM_CX_AT_XE }, | 156 | { 0xC0010060, 0xC0010060, CPU_CACHE, }, |
159 | { 0x00000407, 0x00000407, CPU_MC, CPU_INTEL_XEON }, | 157 | { 0xC0010061, 0xC0010068, CPU_SMM, }, |
160 | { 0x00000408, 0x0000040A, CPU_MC, CPU_PM_CX_AT_XE }, | 158 | { 0xC0010069, 0xC001006B, CPU_SMM, }, |
161 | { 0x0000040B, 0x0000040B, CPU_MC, CPU_INTEL_XEON }, | 159 | { 0xC0010070, 0xC0010071, CPU_SMM, }, |
162 | { 0x0000040C, 0x0000040E, CPU_MC, CPU_PM_CX_XE }, | 160 | { 0xC0010111, 0xC0010113, CPU_SMM, }, |
163 | { 0x0000040F, 0x0000040F, CPU_MC, CPU_INTEL_XEON }, | 161 | { 0xC0010114, 0xC0010118, CPU_SVM, }, |
164 | { 0x00000410, 0x00000412, CPU_MC, CPU_PM_CX_AT_XE }, | 162 | { 0xC0010140, 0xC0010141, CPU_OSVM, }, |
165 | { 0x00000413, 0x00000417, CPU_MC, CPU_CX_AT_XE }, | 163 | { 0xC0011022, 0xC0011023, CPU_CONF, }, |
166 | { 0x00000480, 0x0000048B, CPU_VMX, CPU_CX_AT_XE }, | ||
167 | |||
168 | { 0x00000600, 0x00000600, CPU_DEBUG, CPU_PM_CX_AT_XE }, | ||
169 | { 0x00000680, 0x0000068F, CPU_LBRANCH, CPU_INTEL_XEON }, | ||
170 | { 0x000006C0, 0x000006CF, CPU_LBRANCH, CPU_INTEL_XEON }, | ||
171 | |||
172 | { 0x000107CC, 0x000107D3, CPU_PMC, CPU_INTEL_XEON_MP }, | ||
173 | |||
174 | { 0xC0000080, 0xC0000080, CPU_FEATURES, CPU_INTEL_XEON }, | ||
175 | { 0xC0000081, 0xC0000082, CPU_CALL, CPU_INTEL_XEON }, | ||
176 | { 0xC0000084, 0xC0000084, CPU_CALL, CPU_INTEL_XEON }, | ||
177 | { 0xC0000100, 0xC0000102, CPU_BASE, CPU_INTEL_XEON }, | ||
178 | }; | 164 | }; |
179 | 165 | ||
180 | /* AMD Registers Range */ | ||
181 | static struct cpu_debug_range cpu_amd_range[] = { | ||
182 | { 0x00000000, 0x00000001, CPU_MC, CPU_K10_PLUS, }, | ||
183 | { 0x00000010, 0x00000010, CPU_TIME, CPU_K8_PLUS, }, | ||
184 | { 0x0000001B, 0x0000001B, CPU_APIC, CPU_K8_PLUS, }, | ||
185 | { 0x0000002A, 0x0000002A, CPU_POWERON, CPU_K7_PLUS }, | ||
186 | { 0x0000008B, 0x0000008B, CPU_VER, CPU_K8_PLUS }, | ||
187 | { 0x000000FE, 0x000000FE, CPU_MTRR, CPU_K8_PLUS, }, | ||
188 | |||
189 | { 0x00000174, 0x00000176, CPU_SYSENTER, CPU_K8_PLUS, }, | ||
190 | { 0x00000179, 0x0000017B, CPU_MC, CPU_K8_PLUS, }, | ||
191 | { 0x000001D9, 0x000001D9, CPU_DEBUG, CPU_K8_PLUS, }, | ||
192 | { 0x000001DB, 0x000001DE, CPU_LBRANCH, CPU_K8_PLUS, }, | ||
193 | |||
194 | { 0x00000200, 0x0000020F, CPU_MTRR, CPU_K8_PLUS, }, | ||
195 | { 0x00000250, 0x00000250, CPU_MTRR, CPU_K8_PLUS, }, | ||
196 | { 0x00000258, 0x00000259, CPU_MTRR, CPU_K8_PLUS, }, | ||
197 | { 0x00000268, 0x0000026F, CPU_MTRR, CPU_K8_PLUS, }, | ||
198 | { 0x00000277, 0x00000277, CPU_PAT, CPU_K8_PLUS, }, | ||
199 | { 0x000002FF, 0x000002FF, CPU_MTRR, CPU_K8_PLUS, }, | ||
200 | |||
201 | { 0x00000400, 0x00000413, CPU_MC, CPU_K8_PLUS, }, | ||
202 | |||
203 | { 0xC0000080, 0xC0000080, CPU_FEATURES, CPU_AMD_ALL, }, | ||
204 | { 0xC0000081, 0xC0000084, CPU_CALL, CPU_K8_PLUS, }, | ||
205 | { 0xC0000100, 0xC0000102, CPU_BASE, CPU_K8_PLUS, }, | ||
206 | { 0xC0000103, 0xC0000103, CPU_TIME, CPU_K10_PLUS, }, | ||
207 | |||
208 | { 0xC0010000, 0xC0010007, CPU_PMC, CPU_K8_PLUS, }, | ||
209 | { 0xC0010010, 0xC0010010, CPU_CONF, CPU_K7_PLUS, }, | ||
210 | { 0xC0010015, 0xC0010015, CPU_CONF, CPU_K7_PLUS, }, | ||
211 | { 0xC0010016, 0xC001001A, CPU_MTRR, CPU_K8_PLUS, }, | ||
212 | { 0xC001001D, 0xC001001D, CPU_MTRR, CPU_K8_PLUS, }, | ||
213 | { 0xC001001F, 0xC001001F, CPU_CONF, CPU_K8_PLUS, }, | ||
214 | { 0xC0010030, 0xC0010035, CPU_BIOS, CPU_K8_PLUS, }, | ||
215 | { 0xC0010044, 0xC0010048, CPU_MC, CPU_K8_PLUS, }, | ||
216 | { 0xC0010050, 0xC0010056, CPU_SMM, CPU_K0F_PLUS, }, | ||
217 | { 0xC0010058, 0xC0010058, CPU_CONF, CPU_K10_PLUS, }, | ||
218 | { 0xC0010060, 0xC0010060, CPU_CACHE, CPU_AMD_11, }, | ||
219 | { 0xC0010061, 0xC0010068, CPU_SMM, CPU_K10_PLUS, }, | ||
220 | { 0xC0010069, 0xC001006B, CPU_SMM, CPU_AMD_11, }, | ||
221 | { 0xC0010070, 0xC0010071, CPU_SMM, CPU_K10_PLUS, }, | ||
222 | { 0xC0010111, 0xC0010113, CPU_SMM, CPU_K8_PLUS, }, | ||
223 | { 0xC0010114, 0xC0010118, CPU_SVM, CPU_K10_PLUS, }, | ||
224 | { 0xC0010140, 0xC0010141, CPU_OSVM, CPU_K10_PLUS, }, | ||
225 | { 0xC0011022, 0xC0011023, CPU_CONF, CPU_K10_PLUS, }, | ||
226 | }; | ||
227 | |||
228 | |||
229 | /* Intel */ | ||
230 | static int get_intel_modelflag(unsigned model) | ||
231 | { | ||
232 | int flag; | ||
233 | |||
234 | switch (model) { | ||
235 | case 0x0501: | ||
236 | case 0x0502: | ||
237 | case 0x0504: | ||
238 | flag = CPU_INTEL_PENTIUM; | ||
239 | break; | ||
240 | case 0x0601: | ||
241 | case 0x0603: | ||
242 | case 0x0605: | ||
243 | case 0x0607: | ||
244 | case 0x0608: | ||
245 | case 0x060A: | ||
246 | case 0x060B: | ||
247 | flag = CPU_INTEL_P6; | ||
248 | break; | ||
249 | case 0x0609: | ||
250 | case 0x060D: | ||
251 | flag = CPU_INTEL_PENTIUM_M; | ||
252 | break; | ||
253 | case 0x060E: | ||
254 | flag = CPU_INTEL_CORE; | ||
255 | break; | ||
256 | case 0x060F: | ||
257 | case 0x0617: | ||
258 | flag = CPU_INTEL_CORE2; | ||
259 | break; | ||
260 | case 0x061C: | ||
261 | flag = CPU_INTEL_ATOM; | ||
262 | break; | ||
263 | case 0x0F00: | ||
264 | case 0x0F01: | ||
265 | case 0x0F02: | ||
266 | case 0x0F03: | ||
267 | case 0x0F04: | ||
268 | flag = CPU_INTEL_XEON_P4; | ||
269 | break; | ||
270 | case 0x0F06: | ||
271 | flag = CPU_INTEL_XEON_MP; | ||
272 | break; | ||
273 | default: | ||
274 | flag = CPU_NONE; | ||
275 | break; | ||
276 | } | ||
277 | |||
278 | return flag; | ||
279 | } | ||
280 | |||
281 | /* AMD */ | ||
282 | static int get_amd_modelflag(unsigned model) | ||
283 | { | ||
284 | int flag; | ||
285 | |||
286 | switch (model >> 8) { | ||
287 | case 0x6: | ||
288 | flag = CPU_AMD_K6; | ||
289 | break; | ||
290 | case 0x7: | ||
291 | flag = CPU_AMD_K7; | ||
292 | break; | ||
293 | case 0x8: | ||
294 | flag = CPU_AMD_K8; | ||
295 | break; | ||
296 | case 0xf: | ||
297 | flag = CPU_AMD_0F; | ||
298 | break; | ||
299 | case 0x10: | ||
300 | flag = CPU_AMD_10; | ||
301 | break; | ||
302 | case 0x11: | ||
303 | flag = CPU_AMD_11; | ||
304 | break; | ||
305 | default: | ||
306 | flag = CPU_NONE; | ||
307 | break; | ||
308 | } | ||
309 | |||
310 | return flag; | ||
311 | } | ||
312 | |||
313 | static int get_cpu_modelflag(unsigned cpu) | ||
314 | { | ||
315 | int flag; | ||
316 | |||
317 | flag = per_cpu(cpu_model, cpu); | ||
318 | |||
319 | switch (flag >> 16) { | ||
320 | case X86_VENDOR_INTEL: | ||
321 | flag = get_intel_modelflag(flag); | ||
322 | break; | ||
323 | case X86_VENDOR_AMD: | ||
324 | flag = get_amd_modelflag(flag & 0xffff); | ||
325 | break; | ||
326 | default: | ||
327 | flag = CPU_NONE; | ||
328 | break; | ||
329 | } | ||
330 | |||
331 | return flag; | ||
332 | } | ||
333 | |||
334 | static int get_cpu_range_count(unsigned cpu) | ||
335 | { | ||
336 | int index; | ||
337 | |||
338 | switch (per_cpu(cpu_model, cpu) >> 16) { | ||
339 | case X86_VENDOR_INTEL: | ||
340 | index = ARRAY_SIZE(cpu_intel_range); | ||
341 | break; | ||
342 | case X86_VENDOR_AMD: | ||
343 | index = ARRAY_SIZE(cpu_amd_range); | ||
344 | break; | ||
345 | default: | ||
346 | index = 0; | ||
347 | break; | ||
348 | } | ||
349 | |||
350 | return index; | ||
351 | } | ||
352 | |||
353 | static int is_typeflag_valid(unsigned cpu, unsigned flag) | 166 | static int is_typeflag_valid(unsigned cpu, unsigned flag) |
354 | { | 167 | { |
355 | unsigned vendor, modelflag; | 168 | int i; |
356 | int i, index; | ||
357 | 169 | ||
358 | /* Standard Registers should be always valid */ | 170 | /* Standard Registers should be always valid */ |
359 | if (flag >= CPU_TSS) | 171 | if (flag >= CPU_TSS) |
360 | return 1; | 172 | return 1; |
361 | 173 | ||
362 | modelflag = per_cpu(cpu_modelflag, cpu); | 174 | for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) { |
363 | vendor = per_cpu(cpu_model, cpu) >> 16; | 175 | if (cpu_reg_range[i].flag == flag) |
364 | index = get_cpu_range_count(cpu); | 176 | return 1; |
365 | |||
366 | for (i = 0; i < index; i++) { | ||
367 | switch (vendor) { | ||
368 | case X86_VENDOR_INTEL: | ||
369 | if ((cpu_intel_range[i].model & modelflag) && | ||
370 | (cpu_intel_range[i].flag & flag)) | ||
371 | return 1; | ||
372 | break; | ||
373 | case X86_VENDOR_AMD: | ||
374 | if ((cpu_amd_range[i].model & modelflag) && | ||
375 | (cpu_amd_range[i].flag & flag)) | ||
376 | return 1; | ||
377 | break; | ||
378 | } | ||
379 | } | 177 | } |
380 | 178 | ||
381 | /* Invalid */ | 179 | /* Invalid */ |
@@ -385,26 +183,11 @@ static int is_typeflag_valid(unsigned cpu, unsigned flag) | |||
385 | static unsigned get_cpu_range(unsigned cpu, unsigned *min, unsigned *max, | 183 | static unsigned get_cpu_range(unsigned cpu, unsigned *min, unsigned *max, |
386 | int index, unsigned flag) | 184 | int index, unsigned flag) |
387 | { | 185 | { |
388 | unsigned modelflag; | 186 | if (cpu_reg_range[index].flag == flag) { |
389 | 187 | *min = cpu_reg_range[index].min; | |
390 | modelflag = per_cpu(cpu_modelflag, cpu); | 188 | *max = cpu_reg_range[index].max; |
391 | *max = 0; | 189 | } else |
392 | switch (per_cpu(cpu_model, cpu) >> 16) { | 190 | *max = 0; |
393 | case X86_VENDOR_INTEL: | ||
394 | if ((cpu_intel_range[index].model & modelflag) && | ||
395 | (cpu_intel_range[index].flag & flag)) { | ||
396 | *min = cpu_intel_range[index].min; | ||
397 | *max = cpu_intel_range[index].max; | ||
398 | } | ||
399 | break; | ||
400 | case X86_VENDOR_AMD: | ||
401 | if ((cpu_amd_range[index].model & modelflag) && | ||
402 | (cpu_amd_range[index].flag & flag)) { | ||
403 | *min = cpu_amd_range[index].min; | ||
404 | *max = cpu_amd_range[index].max; | ||
405 | } | ||
406 | break; | ||
407 | } | ||
408 | 191 | ||
409 | return *max; | 192 | return *max; |
410 | } | 193 | } |
@@ -434,7 +217,7 @@ static void print_msr(struct seq_file *seq, unsigned cpu, unsigned flag) | |||
434 | unsigned msr, msr_min, msr_max; | 217 | unsigned msr, msr_min, msr_max; |
435 | struct cpu_private *priv; | 218 | struct cpu_private *priv; |
436 | u32 low, high; | 219 | u32 low, high; |
437 | int i, range; | 220 | int i; |
438 | 221 | ||
439 | if (seq) { | 222 | if (seq) { |
440 | priv = seq->private; | 223 | priv = seq->private; |
@@ -446,9 +229,7 @@ static void print_msr(struct seq_file *seq, unsigned cpu, unsigned flag) | |||
446 | } | 229 | } |
447 | } | 230 | } |
448 | 231 | ||
449 | range = get_cpu_range_count(cpu); | 232 | for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) { |
450 | |||
451 | for (i = 0; i < range; i++) { | ||
452 | if (!get_cpu_range(cpu, &msr_min, &msr_max, i, flag)) | 233 | if (!get_cpu_range(cpu, &msr_min, &msr_max, i, flag)) |
453 | continue; | 234 | continue; |
454 | 235 | ||
@@ -588,8 +369,20 @@ static void print_apic(void *arg) | |||
588 | seq_printf(seq, " TMICT\t\t: %08x\n", apic_read(APIC_TMICT)); | 369 | seq_printf(seq, " TMICT\t\t: %08x\n", apic_read(APIC_TMICT)); |
589 | seq_printf(seq, " TMCCT\t\t: %08x\n", apic_read(APIC_TMCCT)); | 370 | seq_printf(seq, " TMCCT\t\t: %08x\n", apic_read(APIC_TMCCT)); |
590 | seq_printf(seq, " TDCR\t\t: %08x\n", apic_read(APIC_TDCR)); | 371 | seq_printf(seq, " TDCR\t\t: %08x\n", apic_read(APIC_TDCR)); |
591 | #endif /* CONFIG_X86_LOCAL_APIC */ | 372 | if (boot_cpu_has(X86_FEATURE_EXTAPIC)) { |
373 | unsigned int i, v, maxeilvt; | ||
374 | |||
375 | v = apic_read(APIC_EFEAT); | ||
376 | maxeilvt = (v >> 16) & 0xff; | ||
377 | seq_printf(seq, " EFEAT\t\t: %08x\n", v); | ||
378 | seq_printf(seq, " ECTRL\t\t: %08x\n", apic_read(APIC_ECTRL)); | ||
592 | 379 | ||
380 | for (i = 0; i < maxeilvt; i++) { | ||
381 | v = apic_read(APIC_EILVTn(i)); | ||
382 | seq_printf(seq, " EILVT%d\t\t: %08x\n", i, v); | ||
383 | } | ||
384 | } | ||
385 | #endif /* CONFIG_X86_LOCAL_APIC */ | ||
593 | seq_printf(seq, "\n MSR\t:\n"); | 386 | seq_printf(seq, "\n MSR\t:\n"); |
594 | } | 387 | } |
595 | 388 | ||
@@ -788,13 +581,11 @@ static int cpu_init_msr(unsigned cpu, unsigned type, struct dentry *dentry) | |||
788 | { | 581 | { |
789 | struct dentry *cpu_dentry = NULL; | 582 | struct dentry *cpu_dentry = NULL; |
790 | unsigned reg, reg_min, reg_max; | 583 | unsigned reg, reg_min, reg_max; |
791 | int i, range, err = 0; | 584 | int i, err = 0; |
792 | char reg_dir[12]; | 585 | char reg_dir[12]; |
793 | u32 low, high; | 586 | u32 low, high; |
794 | 587 | ||
795 | range = get_cpu_range_count(cpu); | 588 | for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) { |
796 | |||
797 | for (i = 0; i < range; i++) { | ||
798 | if (!get_cpu_range(cpu, ®_min, ®_max, i, | 589 | if (!get_cpu_range(cpu, ®_min, ®_max, i, |
799 | cpu_base[type].flag)) | 590 | cpu_base[type].flag)) |
800 | continue; | 591 | continue; |
@@ -850,10 +641,6 @@ static int cpu_init_cpu(void) | |||
850 | cpui = &cpu_data(cpu); | 641 | cpui = &cpu_data(cpu); |
851 | if (!cpu_has(cpui, X86_FEATURE_MSR)) | 642 | if (!cpu_has(cpui, X86_FEATURE_MSR)) |
852 | continue; | 643 | continue; |
853 | per_cpu(cpu_model, cpu) = ((cpui->x86_vendor << 16) | | ||
854 | (cpui->x86 << 8) | | ||
855 | (cpui->x86_model)); | ||
856 | per_cpu(cpu_modelflag, cpu) = get_cpu_modelflag(cpu); | ||
857 | 644 | ||
858 | sprintf(cpu_dir, "cpu%d", cpu); | 645 | sprintf(cpu_dir, "cpu%d", cpu); |
859 | cpu_dentry = debugfs_create_dir(cpu_dir, cpu_debugfs_dir); | 646 | cpu_dentry = debugfs_create_dir(cpu_dir, cpu_debugfs_dir); |
diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig index 52c839875478..f138c6c389b9 100644 --- a/arch/x86/kernel/cpu/cpufreq/Kconfig +++ b/arch/x86/kernel/cpu/cpufreq/Kconfig | |||
@@ -220,11 +220,14 @@ config X86_LONGHAUL | |||
220 | If in doubt, say N. | 220 | If in doubt, say N. |
221 | 221 | ||
222 | config X86_E_POWERSAVER | 222 | config X86_E_POWERSAVER |
223 | tristate "VIA C7 Enhanced PowerSaver" | 223 | tristate "VIA C7 Enhanced PowerSaver (DANGEROUS)" |
224 | select CPU_FREQ_TABLE | 224 | select CPU_FREQ_TABLE |
225 | depends on X86_32 | 225 | depends on X86_32 && EXPERIMENTAL |
226 | help | 226 | help |
227 | This adds the CPUFreq driver for VIA C7 processors. | 227 | This adds the CPUFreq driver for VIA C7 processors. However, this driver |
228 | does not have any safeguards to prevent operating the CPU out of spec | ||
229 | and is thus considered dangerous. Please use the regular ACPI cpufreq | ||
230 | driver, enabled by CONFIG_X86_ACPI_CPUFREQ. | ||
228 | 231 | ||
229 | If in doubt, say N. | 232 | If in doubt, say N. |
230 | 233 | ||
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 208ecf6643df..ae9b503220ca 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | |||
@@ -90,11 +90,7 @@ static int check_est_cpu(unsigned int cpuid) | |||
90 | { | 90 | { |
91 | struct cpuinfo_x86 *cpu = &cpu_data(cpuid); | 91 | struct cpuinfo_x86 *cpu = &cpu_data(cpuid); |
92 | 92 | ||
93 | if (cpu->x86_vendor != X86_VENDOR_INTEL || | 93 | return cpu_has(cpu, X86_FEATURE_EST); |
94 | !cpu_has(cpu, X86_FEATURE_EST)) | ||
95 | return 0; | ||
96 | |||
97 | return 1; | ||
98 | } | 94 | } |
99 | 95 | ||
100 | static unsigned extract_io(u32 value, struct acpi_cpufreq_data *data) | 96 | static unsigned extract_io(u32 value, struct acpi_cpufreq_data *data) |
@@ -550,7 +546,7 @@ static int __init acpi_cpufreq_early_init(void) | |||
550 | return -ENOMEM; | 546 | return -ENOMEM; |
551 | } | 547 | } |
552 | for_each_possible_cpu(i) { | 548 | for_each_possible_cpu(i) { |
553 | if (!alloc_cpumask_var_node( | 549 | if (!zalloc_cpumask_var_node( |
554 | &per_cpu_ptr(acpi_perf_data, i)->shared_cpu_map, | 550 | &per_cpu_ptr(acpi_perf_data, i)->shared_cpu_map, |
555 | GFP_KERNEL, cpu_to_node(i))) { | 551 | GFP_KERNEL, cpu_to_node(i))) { |
556 | 552 | ||
@@ -693,8 +689,8 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) | |||
693 | if (perf->control_register.space_id == ACPI_ADR_SPACE_FIXED_HARDWARE && | 689 | if (perf->control_register.space_id == ACPI_ADR_SPACE_FIXED_HARDWARE && |
694 | policy->cpuinfo.transition_latency > 20 * 1000) { | 690 | policy->cpuinfo.transition_latency > 20 * 1000) { |
695 | policy->cpuinfo.transition_latency = 20 * 1000; | 691 | policy->cpuinfo.transition_latency = 20 * 1000; |
696 | printk_once(KERN_INFO "Capping off P-state tranision" | 692 | printk_once(KERN_INFO |
697 | " latency at 20 uS\n"); | 693 | "P-state transition latency capped at 20 uS\n"); |
698 | } | 694 | } |
699 | 695 | ||
700 | /* table init */ | 696 | /* table init */ |
diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c index 6ac55bd341ae..869615193720 100644 --- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c +++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c | |||
@@ -168,6 +168,7 @@ static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c) | |||
168 | case 0x0E: /* Core */ | 168 | case 0x0E: /* Core */ |
169 | case 0x0F: /* Core Duo */ | 169 | case 0x0F: /* Core Duo */ |
170 | case 0x16: /* Celeron Core */ | 170 | case 0x16: /* Celeron Core */ |
171 | case 0x1C: /* Atom */ | ||
171 | p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS; | 172 | p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS; |
172 | return speedstep_get_frequency(SPEEDSTEP_CPU_PCORE); | 173 | return speedstep_get_frequency(SPEEDSTEP_CPU_PCORE); |
173 | case 0x0D: /* Pentium M (Dothan) */ | 174 | case 0x0D: /* Pentium M (Dothan) */ |
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c index 3c28ccd49742..d47c775eb0ab 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c | |||
@@ -168,10 +168,12 @@ static int check_powernow(void) | |||
168 | return 1; | 168 | return 1; |
169 | } | 169 | } |
170 | 170 | ||
171 | #ifdef CONFIG_X86_POWERNOW_K7_ACPI | ||
171 | static void invalidate_entry(unsigned int entry) | 172 | static void invalidate_entry(unsigned int entry) |
172 | { | 173 | { |
173 | powernow_table[entry].frequency = CPUFREQ_ENTRY_INVALID; | 174 | powernow_table[entry].frequency = CPUFREQ_ENTRY_INVALID; |
174 | } | 175 | } |
176 | #endif | ||
175 | 177 | ||
176 | static int get_ranges(unsigned char *pst) | 178 | static int get_ranges(unsigned char *pst) |
177 | { | 179 | { |
@@ -320,7 +322,7 @@ static int powernow_acpi_init(void) | |||
320 | goto err0; | 322 | goto err0; |
321 | } | 323 | } |
322 | 324 | ||
323 | if (!alloc_cpumask_var(&acpi_processor_perf->shared_cpu_map, | 325 | if (!zalloc_cpumask_var(&acpi_processor_perf->shared_cpu_map, |
324 | GFP_KERNEL)) { | 326 | GFP_KERNEL)) { |
325 | retval = -ENOMEM; | 327 | retval = -ENOMEM; |
326 | goto err05; | 328 | goto err05; |
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 4709ead2db52..cf52215d9eb1 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c | |||
@@ -649,6 +649,20 @@ static void print_basics(struct powernow_k8_data *data) | |||
649 | data->batps); | 649 | data->batps); |
650 | } | 650 | } |
651 | 651 | ||
652 | static u32 freq_from_fid_did(u32 fid, u32 did) | ||
653 | { | ||
654 | u32 mhz = 0; | ||
655 | |||
656 | if (boot_cpu_data.x86 == 0x10) | ||
657 | mhz = (100 * (fid + 0x10)) >> did; | ||
658 | else if (boot_cpu_data.x86 == 0x11) | ||
659 | mhz = (100 * (fid + 8)) >> did; | ||
660 | else | ||
661 | BUG(); | ||
662 | |||
663 | return mhz * 1000; | ||
664 | } | ||
665 | |||
652 | static int fill_powernow_table(struct powernow_k8_data *data, | 666 | static int fill_powernow_table(struct powernow_k8_data *data, |
653 | struct pst_s *pst, u8 maxvid) | 667 | struct pst_s *pst, u8 maxvid) |
654 | { | 668 | { |
@@ -821,7 +835,7 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) | |||
821 | { | 835 | { |
822 | struct cpufreq_frequency_table *powernow_table; | 836 | struct cpufreq_frequency_table *powernow_table; |
823 | int ret_val = -ENODEV; | 837 | int ret_val = -ENODEV; |
824 | acpi_integer space_id; | 838 | acpi_integer control, status; |
825 | 839 | ||
826 | if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) { | 840 | if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) { |
827 | dprintk("register performance failed: bad ACPI data\n"); | 841 | dprintk("register performance failed: bad ACPI data\n"); |
@@ -834,12 +848,13 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) | |||
834 | goto err_out; | 848 | goto err_out; |
835 | } | 849 | } |
836 | 850 | ||
837 | space_id = data->acpi_data.control_register.space_id; | 851 | control = data->acpi_data.control_register.space_id; |
838 | if ((space_id != ACPI_ADR_SPACE_FIXED_HARDWARE) || | 852 | status = data->acpi_data.status_register.space_id; |
839 | (space_id != ACPI_ADR_SPACE_FIXED_HARDWARE)) { | 853 | |
854 | if ((control != ACPI_ADR_SPACE_FIXED_HARDWARE) || | ||
855 | (status != ACPI_ADR_SPACE_FIXED_HARDWARE)) { | ||
840 | dprintk("Invalid control/status registers (%x - %x)\n", | 856 | dprintk("Invalid control/status registers (%x - %x)\n", |
841 | data->acpi_data.control_register.space_id, | 857 | control, status); |
842 | space_id); | ||
843 | goto err_out; | 858 | goto err_out; |
844 | } | 859 | } |
845 | 860 | ||
@@ -872,7 +887,7 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) | |||
872 | /* notify BIOS that we exist */ | 887 | /* notify BIOS that we exist */ |
873 | acpi_processor_notify_smm(THIS_MODULE); | 888 | acpi_processor_notify_smm(THIS_MODULE); |
874 | 889 | ||
875 | if (!alloc_cpumask_var(&data->acpi_data.shared_cpu_map, GFP_KERNEL)) { | 890 | if (!zalloc_cpumask_var(&data->acpi_data.shared_cpu_map, GFP_KERNEL)) { |
876 | printk(KERN_ERR PFX | 891 | printk(KERN_ERR PFX |
877 | "unable to alloc powernow_k8_data cpumask\n"); | 892 | "unable to alloc powernow_k8_data cpumask\n"); |
878 | ret_val = -ENOMEM; | 893 | ret_val = -ENOMEM; |
@@ -923,8 +938,13 @@ static int fill_powernow_table_pstate(struct powernow_k8_data *data, | |||
923 | 938 | ||
924 | powernow_table[i].index = index; | 939 | powernow_table[i].index = index; |
925 | 940 | ||
926 | powernow_table[i].frequency = | 941 | /* Frequency may be rounded for these */ |
927 | data->acpi_data.states[i].core_frequency * 1000; | 942 | if (boot_cpu_data.x86 == 0x10 || boot_cpu_data.x86 == 0x11) { |
943 | powernow_table[i].frequency = | ||
944 | freq_from_fid_did(lo & 0x3f, (lo >> 6) & 7); | ||
945 | } else | ||
946 | powernow_table[i].frequency = | ||
947 | data->acpi_data.states[i].core_frequency * 1000; | ||
928 | } | 948 | } |
929 | return 0; | 949 | return 0; |
930 | } | 950 | } |
@@ -1215,13 +1235,16 @@ static int powernowk8_verify(struct cpufreq_policy *pol) | |||
1215 | return cpufreq_frequency_table_verify(pol, data->powernow_table); | 1235 | return cpufreq_frequency_table_verify(pol, data->powernow_table); |
1216 | } | 1236 | } |
1217 | 1237 | ||
1238 | static const char ACPI_PSS_BIOS_BUG_MSG[] = | ||
1239 | KERN_ERR FW_BUG PFX "No compatible ACPI _PSS objects found.\n" | ||
1240 | KERN_ERR FW_BUG PFX "Try again with latest BIOS.\n"; | ||
1241 | |||
1218 | /* per CPU init entry point to the driver */ | 1242 | /* per CPU init entry point to the driver */ |
1219 | static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) | 1243 | static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) |
1220 | { | 1244 | { |
1221 | struct powernow_k8_data *data; | 1245 | struct powernow_k8_data *data; |
1222 | cpumask_t oldmask; | 1246 | cpumask_t oldmask; |
1223 | int rc; | 1247 | int rc; |
1224 | static int print_once; | ||
1225 | 1248 | ||
1226 | if (!cpu_online(pol->cpu)) | 1249 | if (!cpu_online(pol->cpu)) |
1227 | return -ENODEV; | 1250 | return -ENODEV; |
@@ -1244,19 +1267,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) | |||
1244 | * an UP version, and is deprecated by AMD. | 1267 | * an UP version, and is deprecated by AMD. |
1245 | */ | 1268 | */ |
1246 | if (num_online_cpus() != 1) { | 1269 | if (num_online_cpus() != 1) { |
1247 | /* | 1270 | printk_once(ACPI_PSS_BIOS_BUG_MSG); |
1248 | * Replace this one with print_once as soon as such a | ||
1249 | * thing gets introduced | ||
1250 | */ | ||
1251 | if (!print_once) { | ||
1252 | WARN_ONCE(1, KERN_ERR FW_BUG PFX "Your BIOS " | ||
1253 | "does not provide ACPI _PSS objects " | ||
1254 | "in a way that Linux understands. " | ||
1255 | "Please report this to the Linux ACPI" | ||
1256 | " maintainers and complain to your " | ||
1257 | "BIOS vendor.\n"); | ||
1258 | print_once++; | ||
1259 | } | ||
1260 | goto err_out; | 1271 | goto err_out; |
1261 | } | 1272 | } |
1262 | if (pol->cpu != 0) { | 1273 | if (pol->cpu != 0) { |
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c index c9f1fdc02830..55c831ed71ce 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c | |||
@@ -471,7 +471,7 @@ static int centrino_target (struct cpufreq_policy *policy, | |||
471 | 471 | ||
472 | if (unlikely(!alloc_cpumask_var(&saved_mask, GFP_KERNEL))) | 472 | if (unlikely(!alloc_cpumask_var(&saved_mask, GFP_KERNEL))) |
473 | return -ENOMEM; | 473 | return -ENOMEM; |
474 | if (unlikely(!alloc_cpumask_var(&covered_cpus, GFP_KERNEL))) { | 474 | if (unlikely(!zalloc_cpumask_var(&covered_cpus, GFP_KERNEL))) { |
475 | free_cpumask_var(saved_mask); | 475 | free_cpumask_var(saved_mask); |
476 | return -ENOMEM; | 476 | return -ENOMEM; |
477 | } | 477 | } |
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 7437fa133c02..3260ab044996 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c | |||
@@ -86,6 +86,29 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) | |||
86 | */ | 86 | */ |
87 | if (c->x86 == 6 && c->x86_model < 15) | 87 | if (c->x86 == 6 && c->x86_model < 15) |
88 | clear_cpu_cap(c, X86_FEATURE_PAT); | 88 | clear_cpu_cap(c, X86_FEATURE_PAT); |
89 | |||
90 | #ifdef CONFIG_KMEMCHECK | ||
91 | /* | ||
92 | * P4s have a "fast strings" feature which causes single- | ||
93 | * stepping REP instructions to only generate a #DB on | ||
94 | * cache-line boundaries. | ||
95 | * | ||
96 | * Ingo Molnar reported a Pentium D (model 6) and a Xeon | ||
97 | * (model 2) with the same problem. | ||
98 | */ | ||
99 | if (c->x86 == 15) { | ||
100 | u64 misc_enable; | ||
101 | |||
102 | rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable); | ||
103 | |||
104 | if (misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING) { | ||
105 | printk(KERN_INFO "kmemcheck: Disabling fast string operations\n"); | ||
106 | |||
107 | misc_enable &= ~MSR_IA32_MISC_ENABLE_FAST_STRING; | ||
108 | wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable); | ||
109 | } | ||
110 | } | ||
111 | #endif | ||
89 | } | 112 | } |
90 | 113 | ||
91 | #ifdef CONFIG_X86_32 | 114 | #ifdef CONFIG_X86_32 |
@@ -229,12 +252,12 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) | |||
229 | } | 252 | } |
230 | #endif | 253 | #endif |
231 | 254 | ||
232 | static void __cpuinit srat_detect_node(void) | 255 | static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) |
233 | { | 256 | { |
234 | #if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) | 257 | #if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) |
235 | unsigned node; | 258 | unsigned node; |
236 | int cpu = smp_processor_id(); | 259 | int cpu = smp_processor_id(); |
237 | int apicid = hard_smp_processor_id(); | 260 | int apicid = cpu_has_apic ? hard_smp_processor_id() : c->apicid; |
238 | 261 | ||
239 | /* Don't do the funky fallback heuristics the AMD version employs | 262 | /* Don't do the funky fallback heuristics the AMD version employs |
240 | for now. */ | 263 | for now. */ |
@@ -400,7 +423,7 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) | |||
400 | } | 423 | } |
401 | 424 | ||
402 | /* Work around errata */ | 425 | /* Work around errata */ |
403 | srat_detect_node(); | 426 | srat_detect_node(c); |
404 | 427 | ||
405 | if (cpu_has(c, X86_FEATURE_VMX)) | 428 | if (cpu_has(c, X86_FEATURE_VMX)) |
406 | detect_vmx_virtcap(c); | 429 | detect_vmx_virtcap(c); |
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 483eda96e102..789efe217e1a 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c | |||
@@ -17,6 +17,7 @@ | |||
17 | 17 | ||
18 | #include <asm/processor.h> | 18 | #include <asm/processor.h> |
19 | #include <asm/smp.h> | 19 | #include <asm/smp.h> |
20 | #include <asm/k8.h> | ||
20 | 21 | ||
21 | #define LVL_1_INST 1 | 22 | #define LVL_1_INST 1 |
22 | #define LVL_1_DATA 2 | 23 | #define LVL_1_DATA 2 |
@@ -159,14 +160,6 @@ struct _cpuid4_info_regs { | |||
159 | unsigned long can_disable; | 160 | unsigned long can_disable; |
160 | }; | 161 | }; |
161 | 162 | ||
162 | #if defined(CONFIG_PCI) && defined(CONFIG_SYSFS) | ||
163 | static struct pci_device_id k8_nb_id[] = { | ||
164 | { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) }, | ||
165 | { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) }, | ||
166 | {} | ||
167 | }; | ||
168 | #endif | ||
169 | |||
170 | unsigned short num_cache_leaves; | 163 | unsigned short num_cache_leaves; |
171 | 164 | ||
172 | /* AMD doesn't have CPUID4. Emulate it here to report the same | 165 | /* AMD doesn't have CPUID4. Emulate it here to report the same |
@@ -207,10 +200,17 @@ union l3_cache { | |||
207 | }; | 200 | }; |
208 | 201 | ||
209 | static const unsigned short __cpuinitconst assocs[] = { | 202 | static const unsigned short __cpuinitconst assocs[] = { |
210 | [1] = 1, [2] = 2, [4] = 4, [6] = 8, | 203 | [1] = 1, |
211 | [8] = 16, [0xa] = 32, [0xb] = 48, | 204 | [2] = 2, |
205 | [4] = 4, | ||
206 | [6] = 8, | ||
207 | [8] = 16, | ||
208 | [0xa] = 32, | ||
209 | [0xb] = 48, | ||
212 | [0xc] = 64, | 210 | [0xc] = 64, |
213 | [0xf] = 0xffff // ?? | 211 | [0xd] = 96, |
212 | [0xe] = 128, | ||
213 | [0xf] = 0xffff /* fully associative - no way to show this currently */ | ||
214 | }; | 214 | }; |
215 | 215 | ||
216 | static const unsigned char __cpuinitconst levels[] = { 1, 1, 2, 3 }; | 216 | static const unsigned char __cpuinitconst levels[] = { 1, 1, 2, 3 }; |
@@ -271,7 +271,8 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, | |||
271 | eax->split.type = types[leaf]; | 271 | eax->split.type = types[leaf]; |
272 | eax->split.level = levels[leaf]; | 272 | eax->split.level = levels[leaf]; |
273 | if (leaf == 3) | 273 | if (leaf == 3) |
274 | eax->split.num_threads_sharing = current_cpu_data.x86_max_cores - 1; | 274 | eax->split.num_threads_sharing = |
275 | current_cpu_data.x86_max_cores - 1; | ||
275 | else | 276 | else |
276 | eax->split.num_threads_sharing = 0; | 277 | eax->split.num_threads_sharing = 0; |
277 | eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1; | 278 | eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1; |
@@ -291,6 +292,14 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) | |||
291 | { | 292 | { |
292 | if (index < 3) | 293 | if (index < 3) |
293 | return; | 294 | return; |
295 | |||
296 | if (boot_cpu_data.x86 == 0x11) | ||
297 | return; | ||
298 | |||
299 | /* see erratum #382 */ | ||
300 | if ((boot_cpu_data.x86 == 0x10) && (boot_cpu_data.x86_model < 0x8)) | ||
301 | return; | ||
302 | |||
294 | this_leaf->can_disable = 1; | 303 | this_leaf->can_disable = 1; |
295 | } | 304 | } |
296 | 305 | ||
@@ -696,97 +705,75 @@ static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) | |||
696 | #define to_object(k) container_of(k, struct _index_kobject, kobj) | 705 | #define to_object(k) container_of(k, struct _index_kobject, kobj) |
697 | #define to_attr(a) container_of(a, struct _cache_attr, attr) | 706 | #define to_attr(a) container_of(a, struct _cache_attr, attr) |
698 | 707 | ||
699 | #ifdef CONFIG_PCI | 708 | static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, |
700 | static struct pci_dev *get_k8_northbridge(int node) | 709 | unsigned int index) |
701 | { | ||
702 | struct pci_dev *dev = NULL; | ||
703 | int i; | ||
704 | |||
705 | for (i = 0; i <= node; i++) { | ||
706 | do { | ||
707 | dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev); | ||
708 | if (!dev) | ||
709 | break; | ||
710 | } while (!pci_match_id(&k8_nb_id[0], dev)); | ||
711 | if (!dev) | ||
712 | break; | ||
713 | } | ||
714 | return dev; | ||
715 | } | ||
716 | #else | ||
717 | static struct pci_dev *get_k8_northbridge(int node) | ||
718 | { | ||
719 | return NULL; | ||
720 | } | ||
721 | #endif | ||
722 | |||
723 | static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf) | ||
724 | { | 710 | { |
725 | const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map); | 711 | int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); |
726 | int node = cpu_to_node(cpumask_first(mask)); | 712 | int node = cpu_to_node(cpu); |
727 | struct pci_dev *dev = NULL; | 713 | struct pci_dev *dev = node_to_k8_nb_misc(node); |
728 | ssize_t ret = 0; | 714 | unsigned int reg = 0; |
729 | int i; | ||
730 | 715 | ||
731 | if (!this_leaf->can_disable) | 716 | if (!this_leaf->can_disable) |
732 | return sprintf(buf, "Feature not enabled\n"); | ||
733 | |||
734 | dev = get_k8_northbridge(node); | ||
735 | if (!dev) { | ||
736 | printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n"); | ||
737 | return -EINVAL; | 717 | return -EINVAL; |
738 | } | ||
739 | 718 | ||
740 | for (i = 0; i < 2; i++) { | 719 | if (!dev) |
741 | unsigned int reg; | 720 | return -EINVAL; |
742 | 721 | ||
743 | pci_read_config_dword(dev, 0x1BC + i * 4, &reg); | 722 | pci_read_config_dword(dev, 0x1BC + index * 4, &reg); |
723 | return sprintf(buf, "%x\n", reg); | ||
724 | } | ||
744 | 725 | ||
745 | ret += sprintf(buf, "%sEntry: %d\n", buf, i); | 726 | #define SHOW_CACHE_DISABLE(index) \ |
746 | ret += sprintf(buf, "%sReads: %s\tNew Entries: %s\n", | 727 | static ssize_t \ |
747 | buf, | 728 | show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf) \ |
748 | reg & 0x80000000 ? "Disabled" : "Allowed", | 729 | { \ |
749 | reg & 0x40000000 ? "Disabled" : "Allowed"); | 730 | return show_cache_disable(this_leaf, buf, index); \ |
750 | ret += sprintf(buf, "%sSubCache: %x\tIndex: %x\n", | ||
751 | buf, (reg & 0x30000) >> 16, reg & 0xfff); | ||
752 | } | ||
753 | return ret; | ||
754 | } | 731 | } |
732 | SHOW_CACHE_DISABLE(0) | ||
733 | SHOW_CACHE_DISABLE(1) | ||
755 | 734 | ||
756 | static ssize_t | 735 | static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, |
757 | store_cache_disable(struct _cpuid4_info *this_leaf, const char *buf, | 736 | const char *buf, size_t count, unsigned int index) |
758 | size_t count) | ||
759 | { | 737 | { |
760 | const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map); | 738 | int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); |
761 | int node = cpu_to_node(cpumask_first(mask)); | 739 | int node = cpu_to_node(cpu); |
762 | struct pci_dev *dev = NULL; | 740 | struct pci_dev *dev = node_to_k8_nb_misc(node); |
763 | unsigned int ret, index, val; | 741 | unsigned long val = 0; |
742 | unsigned int scrubber = 0; | ||
764 | 743 | ||
765 | if (!this_leaf->can_disable) | 744 | if (!this_leaf->can_disable) |
766 | return 0; | ||
767 | |||
768 | if (strlen(buf) > 15) | ||
769 | return -EINVAL; | 745 | return -EINVAL; |
770 | 746 | ||
771 | ret = sscanf(buf, "%x %x", &index, &val); | 747 | if (!capable(CAP_SYS_ADMIN)) |
772 | if (ret != 2) | 748 | return -EPERM; |
749 | |||
750 | if (!dev) | ||
773 | return -EINVAL; | 751 | return -EINVAL; |
774 | if (index > 1) | 752 | |
753 | if (strict_strtoul(buf, 10, &val) < 0) | ||
775 | return -EINVAL; | 754 | return -EINVAL; |
776 | 755 | ||
777 | val |= 0xc0000000; | 756 | val |= 0xc0000000; |
778 | dev = get_k8_northbridge(node); | 757 | |
779 | if (!dev) { | 758 | pci_read_config_dword(dev, 0x58, &scrubber); |
780 | printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n"); | 759 | scrubber &= ~0x1f000000; |
781 | return -EINVAL; | 760 | pci_write_config_dword(dev, 0x58, scrubber); |
782 | } | ||
783 | 761 | ||
784 | pci_write_config_dword(dev, 0x1BC + index * 4, val & ~0x40000000); | 762 | pci_write_config_dword(dev, 0x1BC + index * 4, val & ~0x40000000); |
785 | wbinvd(); | 763 | wbinvd(); |
786 | pci_write_config_dword(dev, 0x1BC + index * 4, val); | 764 | pci_write_config_dword(dev, 0x1BC + index * 4, val); |
765 | return count; | ||
766 | } | ||
787 | 767 | ||
788 | return 1; | 768 | #define STORE_CACHE_DISABLE(index) \ |
769 | static ssize_t \ | ||
770 | store_cache_disable_##index(struct _cpuid4_info *this_leaf, \ | ||
771 | const char *buf, size_t count) \ | ||
772 | { \ | ||
773 | return store_cache_disable(this_leaf, buf, count, index); \ | ||
789 | } | 774 | } |
775 | STORE_CACHE_DISABLE(0) | ||
776 | STORE_CACHE_DISABLE(1) | ||
790 | 777 | ||
791 | struct _cache_attr { | 778 | struct _cache_attr { |
792 | struct attribute attr; | 779 | struct attribute attr; |
@@ -808,7 +795,10 @@ define_one_ro(size); | |||
808 | define_one_ro(shared_cpu_map); | 795 | define_one_ro(shared_cpu_map); |
809 | define_one_ro(shared_cpu_list); | 796 | define_one_ro(shared_cpu_list); |
810 | 797 | ||
811 | static struct _cache_attr cache_disable = __ATTR(cache_disable, 0644, show_cache_disable, store_cache_disable); | 798 | static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644, |
799 | show_cache_disable_0, store_cache_disable_0); | ||
800 | static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, | ||
801 | show_cache_disable_1, store_cache_disable_1); | ||
812 | 802 | ||
813 | static struct attribute * default_attrs[] = { | 803 | static struct attribute * default_attrs[] = { |
814 | &type.attr, | 804 | &type.attr, |
@@ -820,7 +810,8 @@ static struct attribute * default_attrs[] = { | |||
820 | &size.attr, | 810 | &size.attr, |
821 | &shared_cpu_map.attr, | 811 | &shared_cpu_map.attr, |
822 | &shared_cpu_list.attr, | 812 | &shared_cpu_list.attr, |
823 | &cache_disable.attr, | 813 | &cache_disable_0.attr, |
814 | &cache_disable_1.attr, | ||
824 | NULL | 815 | NULL |
825 | }; | 816 | }; |
826 | 817 | ||
diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile index b2f89829bbe8..45004faf67ea 100644 --- a/arch/x86/kernel/cpu/mcheck/Makefile +++ b/arch/x86/kernel/cpu/mcheck/Makefile | |||
@@ -1,7 +1,11 @@ | |||
1 | obj-y = mce_$(BITS).o therm_throt.o | 1 | obj-y = mce.o therm_throt.o |
2 | 2 | ||
3 | obj-$(CONFIG_X86_32) += k7.o p4.o p5.o p6.o winchip.o | 3 | obj-$(CONFIG_X86_NEW_MCE) += mce-severity.o |
4 | obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o | 4 | obj-$(CONFIG_X86_OLD_MCE) += k7.o p4.o p6.o |
5 | obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o | ||
6 | obj-$(CONFIG_X86_MCE_P4THERMAL) += mce_intel.o | ||
7 | obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o mce_intel.o | ||
5 | obj-$(CONFIG_X86_MCE_AMD) += mce_amd_64.o | 8 | obj-$(CONFIG_X86_MCE_AMD) += mce_amd_64.o |
6 | obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o | 9 | obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o |
7 | obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o | 10 | obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o |
11 | obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o | ||
diff --git a/arch/x86/kernel/cpu/mcheck/k7.c b/arch/x86/kernel/cpu/mcheck/k7.c index dd3af6e7b39a..89e510424152 100644 --- a/arch/x86/kernel/cpu/mcheck/k7.c +++ b/arch/x86/kernel/cpu/mcheck/k7.c | |||
@@ -2,11 +2,10 @@ | |||
2 | * Athlon specific Machine Check Exception Reporting | 2 | * Athlon specific Machine Check Exception Reporting |
3 | * (C) Copyright 2002 Dave Jones <davej@redhat.com> | 3 | * (C) Copyright 2002 Dave Jones <davej@redhat.com> |
4 | */ | 4 | */ |
5 | |||
6 | #include <linux/init.h> | ||
7 | #include <linux/types.h> | ||
8 | #include <linux/kernel.h> | ||
9 | #include <linux/interrupt.h> | 5 | #include <linux/interrupt.h> |
6 | #include <linux/kernel.h> | ||
7 | #include <linux/types.h> | ||
8 | #include <linux/init.h> | ||
10 | #include <linux/smp.h> | 9 | #include <linux/smp.h> |
11 | 10 | ||
12 | #include <asm/processor.h> | 11 | #include <asm/processor.h> |
@@ -15,12 +14,12 @@ | |||
15 | 14 | ||
16 | #include "mce.h" | 15 | #include "mce.h" |
17 | 16 | ||
18 | /* Machine Check Handler For AMD Athlon/Duron */ | 17 | /* Machine Check Handler For AMD Athlon/Duron: */ |
19 | static void k7_machine_check(struct pt_regs *regs, long error_code) | 18 | static void k7_machine_check(struct pt_regs *regs, long error_code) |
20 | { | 19 | { |
21 | int recover = 1; | ||
22 | u32 alow, ahigh, high, low; | 20 | u32 alow, ahigh, high, low; |
23 | u32 mcgstl, mcgsth; | 21 | u32 mcgstl, mcgsth; |
22 | int recover = 1; | ||
24 | int i; | 23 | int i; |
25 | 24 | ||
26 | rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); | 25 | rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); |
@@ -32,15 +31,19 @@ static void k7_machine_check(struct pt_regs *regs, long error_code) | |||
32 | 31 | ||
33 | for (i = 1; i < nr_mce_banks; i++) { | 32 | for (i = 1; i < nr_mce_banks; i++) { |
34 | rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); | 33 | rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); |
35 | if (high&(1<<31)) { | 34 | if (high & (1<<31)) { |
36 | char misc[20]; | 35 | char misc[20]; |
37 | char addr[24]; | 36 | char addr[24]; |
38 | misc[0] = addr[0] = '\0'; | 37 | |
38 | misc[0] = '\0'; | ||
39 | addr[0] = '\0'; | ||
40 | |||
39 | if (high & (1<<29)) | 41 | if (high & (1<<29)) |
40 | recover |= 1; | 42 | recover |= 1; |
41 | if (high & (1<<25)) | 43 | if (high & (1<<25)) |
42 | recover |= 2; | 44 | recover |= 2; |
43 | high &= ~(1<<31); | 45 | high &= ~(1<<31); |
46 | |||
44 | if (high & (1<<27)) { | 47 | if (high & (1<<27)) { |
45 | rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh); | 48 | rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh); |
46 | snprintf(misc, 20, "[%08x%08x]", ahigh, alow); | 49 | snprintf(misc, 20, "[%08x%08x]", ahigh, alow); |
@@ -49,27 +52,31 @@ static void k7_machine_check(struct pt_regs *regs, long error_code) | |||
49 | rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh); | 52 | rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh); |
50 | snprintf(addr, 24, " at %08x%08x", ahigh, alow); | 53 | snprintf(addr, 24, " at %08x%08x", ahigh, alow); |
51 | } | 54 | } |
55 | |||
52 | printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n", | 56 | printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n", |
53 | smp_processor_id(), i, high, low, misc, addr); | 57 | smp_processor_id(), i, high, low, misc, addr); |
54 | /* Clear it */ | 58 | |
59 | /* Clear it: */ | ||
55 | wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL); | 60 | wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL); |
56 | /* Serialize */ | 61 | /* Serialize: */ |
57 | wmb(); | 62 | wmb(); |
58 | add_taint(TAINT_MACHINE_CHECK); | 63 | add_taint(TAINT_MACHINE_CHECK); |
59 | } | 64 | } |
60 | } | 65 | } |
61 | 66 | ||
62 | if (recover&2) | 67 | if (recover & 2) |
63 | panic("CPU context corrupt"); | 68 | panic("CPU context corrupt"); |
64 | if (recover&1) | 69 | if (recover & 1) |
65 | panic("Unable to continue"); | 70 | panic("Unable to continue"); |
71 | |||
66 | printk(KERN_EMERG "Attempting to continue.\n"); | 72 | printk(KERN_EMERG "Attempting to continue.\n"); |
73 | |||
67 | mcgstl &= ~(1<<2); | 74 | mcgstl &= ~(1<<2); |
68 | wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); | 75 | wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); |
69 | } | 76 | } |
70 | 77 | ||
71 | 78 | ||
72 | /* AMD K7 machine check is Intel like */ | 79 | /* AMD K7 machine check is Intel like: */ |
73 | void amd_mcheck_init(struct cpuinfo_x86 *c) | 80 | void amd_mcheck_init(struct cpuinfo_x86 *c) |
74 | { | 81 | { |
75 | u32 l, h; | 82 | u32 l, h; |
@@ -79,21 +86,26 @@ void amd_mcheck_init(struct cpuinfo_x86 *c) | |||
79 | return; | 86 | return; |
80 | 87 | ||
81 | machine_check_vector = k7_machine_check; | 88 | machine_check_vector = k7_machine_check; |
89 | /* Make sure the vector pointer is visible before we enable MCEs: */ | ||
82 | wmb(); | 90 | wmb(); |
83 | 91 | ||
84 | printk(KERN_INFO "Intel machine check architecture supported.\n"); | 92 | printk(KERN_INFO "Intel machine check architecture supported.\n"); |
93 | |||
85 | rdmsr(MSR_IA32_MCG_CAP, l, h); | 94 | rdmsr(MSR_IA32_MCG_CAP, l, h); |
86 | if (l & (1<<8)) /* Control register present ? */ | 95 | if (l & (1<<8)) /* Control register present ? */ |
87 | wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); | 96 | wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); |
88 | nr_mce_banks = l & 0xff; | 97 | nr_mce_banks = l & 0xff; |
89 | 98 | ||
90 | /* Clear status for MC index 0 separately, we don't touch CTL, | 99 | /* |
91 | * as some K7 Athlons cause spurious MCEs when its enabled. */ | 100 | * Clear status for MC index 0 separately, we don't touch CTL, |
101 | * as some K7 Athlons cause spurious MCEs when its enabled: | ||
102 | */ | ||
92 | if (boot_cpu_data.x86 == 6) { | 103 | if (boot_cpu_data.x86 == 6) { |
93 | wrmsr(MSR_IA32_MC0_STATUS, 0x0, 0x0); | 104 | wrmsr(MSR_IA32_MC0_STATUS, 0x0, 0x0); |
94 | i = 1; | 105 | i = 1; |
95 | } else | 106 | } else |
96 | i = 0; | 107 | i = 0; |
108 | |||
97 | for (; i < nr_mce_banks; i++) { | 109 | for (; i < nr_mce_banks; i++) { |
98 | wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff); | 110 | wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff); |
99 | wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0); | 111 | wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0); |
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c new file mode 100644 index 000000000000..a3a235a53f09 --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c | |||
@@ -0,0 +1,127 @@ | |||
1 | /* | ||
2 | * Machine check injection support. | ||
3 | * Copyright 2008 Intel Corporation. | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or | ||
6 | * modify it under the terms of the GNU General Public License | ||
7 | * as published by the Free Software Foundation; version 2 | ||
8 | * of the License. | ||
9 | * | ||
10 | * Authors: | ||
11 | * Andi Kleen | ||
12 | * Ying Huang | ||
13 | */ | ||
14 | #include <linux/uaccess.h> | ||
15 | #include <linux/module.h> | ||
16 | #include <linux/timer.h> | ||
17 | #include <linux/kernel.h> | ||
18 | #include <linux/string.h> | ||
19 | #include <linux/fs.h> | ||
20 | #include <linux/smp.h> | ||
21 | #include <asm/mce.h> | ||
22 | |||
23 | /* Update fake mce registers on current CPU. */ | ||
24 | static void inject_mce(struct mce *m) | ||
25 | { | ||
26 | struct mce *i = &per_cpu(injectm, m->extcpu); | ||
27 | |||
28 | /* Make sure noone reads partially written injectm */ | ||
29 | i->finished = 0; | ||
30 | mb(); | ||
31 | m->finished = 0; | ||
32 | /* First set the fields after finished */ | ||
33 | i->extcpu = m->extcpu; | ||
34 | mb(); | ||
35 | /* Now write record in order, finished last (except above) */ | ||
36 | memcpy(i, m, sizeof(struct mce)); | ||
37 | /* Finally activate it */ | ||
38 | mb(); | ||
39 | i->finished = 1; | ||
40 | } | ||
41 | |||
42 | struct delayed_mce { | ||
43 | struct timer_list timer; | ||
44 | struct mce m; | ||
45 | }; | ||
46 | |||
47 | /* Inject mce on current CPU */ | ||
48 | static void raise_mce(unsigned long data) | ||
49 | { | ||
50 | struct delayed_mce *dm = (struct delayed_mce *)data; | ||
51 | struct mce *m = &dm->m; | ||
52 | int cpu = m->extcpu; | ||
53 | |||
54 | inject_mce(m); | ||
55 | if (m->status & MCI_STATUS_UC) { | ||
56 | struct pt_regs regs; | ||
57 | memset(&regs, 0, sizeof(struct pt_regs)); | ||
58 | regs.ip = m->ip; | ||
59 | regs.cs = m->cs; | ||
60 | printk(KERN_INFO "Triggering MCE exception on CPU %d\n", cpu); | ||
61 | do_machine_check(&regs, 0); | ||
62 | printk(KERN_INFO "MCE exception done on CPU %d\n", cpu); | ||
63 | } else { | ||
64 | mce_banks_t b; | ||
65 | memset(&b, 0xff, sizeof(mce_banks_t)); | ||
66 | printk(KERN_INFO "Starting machine check poll CPU %d\n", cpu); | ||
67 | machine_check_poll(0, &b); | ||
68 | mce_notify_irq(); | ||
69 | printk(KERN_INFO "Finished machine check poll on CPU %d\n", | ||
70 | cpu); | ||
71 | } | ||
72 | kfree(dm); | ||
73 | } | ||
74 | |||
75 | /* Error injection interface */ | ||
76 | static ssize_t mce_write(struct file *filp, const char __user *ubuf, | ||
77 | size_t usize, loff_t *off) | ||
78 | { | ||
79 | struct delayed_mce *dm; | ||
80 | struct mce m; | ||
81 | |||
82 | if (!capable(CAP_SYS_ADMIN)) | ||
83 | return -EPERM; | ||
84 | /* | ||
85 | * There are some cases where real MSR reads could slip | ||
86 | * through. | ||
87 | */ | ||
88 | if (!boot_cpu_has(X86_FEATURE_MCE) || !boot_cpu_has(X86_FEATURE_MCA)) | ||
89 | return -EIO; | ||
90 | |||
91 | if ((unsigned long)usize > sizeof(struct mce)) | ||
92 | usize = sizeof(struct mce); | ||
93 | if (copy_from_user(&m, ubuf, usize)) | ||
94 | return -EFAULT; | ||
95 | |||
96 | if (m.extcpu >= num_possible_cpus() || !cpu_online(m.extcpu)) | ||
97 | return -EINVAL; | ||
98 | |||
99 | dm = kmalloc(sizeof(struct delayed_mce), GFP_KERNEL); | ||
100 | if (!dm) | ||
101 | return -ENOMEM; | ||
102 | |||
103 | /* | ||
104 | * Need to give user space some time to set everything up, | ||
105 | * so do it a jiffie or two later everywhere. | ||
106 | * Should we use a hrtimer here for better synchronization? | ||
107 | */ | ||
108 | memcpy(&dm->m, &m, sizeof(struct mce)); | ||
109 | setup_timer(&dm->timer, raise_mce, (unsigned long)dm); | ||
110 | dm->timer.expires = jiffies + 2; | ||
111 | add_timer_on(&dm->timer, m.extcpu); | ||
112 | return usize; | ||
113 | } | ||
114 | |||
115 | static int inject_init(void) | ||
116 | { | ||
117 | printk(KERN_INFO "Machine check injector initialized\n"); | ||
118 | mce_chrdev_ops.write = mce_write; | ||
119 | return 0; | ||
120 | } | ||
121 | |||
122 | module_init(inject_init); | ||
123 | /* | ||
124 | * Cannot tolerate unloading currently because we cannot | ||
125 | * guarantee all openers of mce_chrdev will get a reference to us. | ||
126 | */ | ||
127 | MODULE_LICENSE("GPL"); | ||
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h new file mode 100644 index 000000000000..54dcb8ff12e5 --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h | |||
@@ -0,0 +1,15 @@ | |||
1 | #include <asm/mce.h> | ||
2 | |||
3 | enum severity_level { | ||
4 | MCE_NO_SEVERITY, | ||
5 | MCE_KEEP_SEVERITY, | ||
6 | MCE_SOME_SEVERITY, | ||
7 | MCE_AO_SEVERITY, | ||
8 | MCE_UC_SEVERITY, | ||
9 | MCE_AR_SEVERITY, | ||
10 | MCE_PANIC_SEVERITY, | ||
11 | }; | ||
12 | |||
13 | int mce_severity(struct mce *a, int tolerant, char **msg); | ||
14 | |||
15 | extern int mce_ser; | ||
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c new file mode 100644 index 000000000000..ff0807f97056 --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c | |||
@@ -0,0 +1,218 @@ | |||
1 | /* | ||
2 | * MCE grading rules. | ||
3 | * Copyright 2008, 2009 Intel Corporation. | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or | ||
6 | * modify it under the terms of the GNU General Public License | ||
7 | * as published by the Free Software Foundation; version 2 | ||
8 | * of the License. | ||
9 | * | ||
10 | * Author: Andi Kleen | ||
11 | */ | ||
12 | #include <linux/kernel.h> | ||
13 | #include <linux/seq_file.h> | ||
14 | #include <linux/init.h> | ||
15 | #include <linux/debugfs.h> | ||
16 | #include <asm/mce.h> | ||
17 | |||
18 | #include "mce-internal.h" | ||
19 | |||
20 | /* | ||
21 | * Grade an mce by severity. In general the most severe ones are processed | ||
22 | * first. Since there are quite a lot of combinations test the bits in a | ||
23 | * table-driven way. The rules are simply processed in order, first | ||
24 | * match wins. | ||
25 | * | ||
26 | * Note this is only used for machine check exceptions, the corrected | ||
27 | * errors use much simpler rules. The exceptions still check for the corrected | ||
28 | * errors, but only to leave them alone for the CMCI handler (except for | ||
29 | * panic situations) | ||
30 | */ | ||
31 | |||
32 | enum context { IN_KERNEL = 1, IN_USER = 2 }; | ||
33 | enum ser { SER_REQUIRED = 1, NO_SER = 2 }; | ||
34 | |||
35 | static struct severity { | ||
36 | u64 mask; | ||
37 | u64 result; | ||
38 | unsigned char sev; | ||
39 | unsigned char mcgmask; | ||
40 | unsigned char mcgres; | ||
41 | unsigned char ser; | ||
42 | unsigned char context; | ||
43 | unsigned char covered; | ||
44 | char *msg; | ||
45 | } severities[] = { | ||
46 | #define KERNEL .context = IN_KERNEL | ||
47 | #define USER .context = IN_USER | ||
48 | #define SER .ser = SER_REQUIRED | ||
49 | #define NOSER .ser = NO_SER | ||
50 | #define SEV(s) .sev = MCE_ ## s ## _SEVERITY | ||
51 | #define BITCLR(x, s, m, r...) { .mask = x, .result = 0, SEV(s), .msg = m, ## r } | ||
52 | #define BITSET(x, s, m, r...) { .mask = x, .result = x, SEV(s), .msg = m, ## r } | ||
53 | #define MCGMASK(x, res, s, m, r...) \ | ||
54 | { .mcgmask = x, .mcgres = res, SEV(s), .msg = m, ## r } | ||
55 | #define MASK(x, y, s, m, r...) \ | ||
56 | { .mask = x, .result = y, SEV(s), .msg = m, ## r } | ||
57 | #define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S) | ||
58 | #define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR) | ||
59 | #define MCACOD 0xffff | ||
60 | |||
61 | BITCLR(MCI_STATUS_VAL, NO, "Invalid"), | ||
62 | BITCLR(MCI_STATUS_EN, NO, "Not enabled"), | ||
63 | BITSET(MCI_STATUS_PCC, PANIC, "Processor context corrupt"), | ||
64 | /* When MCIP is not set something is very confused */ | ||
65 | MCGMASK(MCG_STATUS_MCIP, 0, PANIC, "MCIP not set in MCA handler"), | ||
66 | /* Neither return not error IP -- no chance to recover -> PANIC */ | ||
67 | MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0, PANIC, | ||
68 | "Neither restart nor error IP"), | ||
69 | MCGMASK(MCG_STATUS_RIPV, 0, PANIC, "In kernel and no restart IP", | ||
70 | KERNEL), | ||
71 | BITCLR(MCI_STATUS_UC, KEEP, "Corrected error", NOSER), | ||
72 | MASK(MCI_STATUS_OVER|MCI_STATUS_UC|MCI_STATUS_EN, MCI_STATUS_UC, SOME, | ||
73 | "Spurious not enabled", SER), | ||
74 | |||
75 | /* ignore OVER for UCNA */ | ||
76 | MASK(MCI_UC_SAR, MCI_STATUS_UC, KEEP, | ||
77 | "Uncorrected no action required", SER), | ||
78 | MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR, PANIC, | ||
79 | "Illegal combination (UCNA with AR=1)", SER), | ||
80 | MASK(MCI_STATUS_S, 0, KEEP, "Non signalled machine check", SER), | ||
81 | |||
82 | /* AR add known MCACODs here */ | ||
83 | MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_SAR, PANIC, | ||
84 | "Action required with lost events", SER), | ||
85 | MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_SAR, PANIC, | ||
86 | "Action required; unknown MCACOD", SER), | ||
87 | |||
88 | /* known AO MCACODs: */ | ||
89 | MASK(MCI_UC_SAR|MCI_STATUS_OVER|0xfff0, MCI_UC_S|0xc0, AO, | ||
90 | "Action optional: memory scrubbing error", SER), | ||
91 | MASK(MCI_UC_SAR|MCI_STATUS_OVER|MCACOD, MCI_UC_S|0x17a, AO, | ||
92 | "Action optional: last level cache writeback error", SER), | ||
93 | |||
94 | MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S, SOME, | ||
95 | "Action optional unknown MCACOD", SER), | ||
96 | MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S|MCI_STATUS_OVER, SOME, | ||
97 | "Action optional with lost events", SER), | ||
98 | BITSET(MCI_STATUS_UC|MCI_STATUS_OVER, PANIC, "Overflowed uncorrected"), | ||
99 | BITSET(MCI_STATUS_UC, UC, "Uncorrected"), | ||
100 | BITSET(0, SOME, "No match") /* always matches. keep at end */ | ||
101 | }; | ||
102 | |||
103 | /* | ||
104 | * If the EIPV bit is set, it means the saved IP is the | ||
105 | * instruction which caused the MCE. | ||
106 | */ | ||
107 | static int error_context(struct mce *m) | ||
108 | { | ||
109 | if (m->mcgstatus & MCG_STATUS_EIPV) | ||
110 | return (m->ip && (m->cs & 3) == 3) ? IN_USER : IN_KERNEL; | ||
111 | /* Unknown, assume kernel */ | ||
112 | return IN_KERNEL; | ||
113 | } | ||
114 | |||
115 | int mce_severity(struct mce *a, int tolerant, char **msg) | ||
116 | { | ||
117 | enum context ctx = error_context(a); | ||
118 | struct severity *s; | ||
119 | |||
120 | for (s = severities;; s++) { | ||
121 | if ((a->status & s->mask) != s->result) | ||
122 | continue; | ||
123 | if ((a->mcgstatus & s->mcgmask) != s->mcgres) | ||
124 | continue; | ||
125 | if (s->ser == SER_REQUIRED && !mce_ser) | ||
126 | continue; | ||
127 | if (s->ser == NO_SER && mce_ser) | ||
128 | continue; | ||
129 | if (s->context && ctx != s->context) | ||
130 | continue; | ||
131 | if (msg) | ||
132 | *msg = s->msg; | ||
133 | s->covered = 1; | ||
134 | if (s->sev >= MCE_UC_SEVERITY && ctx == IN_KERNEL) { | ||
135 | if (panic_on_oops || tolerant < 1) | ||
136 | return MCE_PANIC_SEVERITY; | ||
137 | } | ||
138 | return s->sev; | ||
139 | } | ||
140 | } | ||
141 | |||
142 | static void *s_start(struct seq_file *f, loff_t *pos) | ||
143 | { | ||
144 | if (*pos >= ARRAY_SIZE(severities)) | ||
145 | return NULL; | ||
146 | return &severities[*pos]; | ||
147 | } | ||
148 | |||
149 | static void *s_next(struct seq_file *f, void *data, loff_t *pos) | ||
150 | { | ||
151 | if (++(*pos) >= ARRAY_SIZE(severities)) | ||
152 | return NULL; | ||
153 | return &severities[*pos]; | ||
154 | } | ||
155 | |||
156 | static void s_stop(struct seq_file *f, void *data) | ||
157 | { | ||
158 | } | ||
159 | |||
160 | static int s_show(struct seq_file *f, void *data) | ||
161 | { | ||
162 | struct severity *ser = data; | ||
163 | seq_printf(f, "%d\t%s\n", ser->covered, ser->msg); | ||
164 | return 0; | ||
165 | } | ||
166 | |||
167 | static const struct seq_operations severities_seq_ops = { | ||
168 | .start = s_start, | ||
169 | .next = s_next, | ||
170 | .stop = s_stop, | ||
171 | .show = s_show, | ||
172 | }; | ||
173 | |||
174 | static int severities_coverage_open(struct inode *inode, struct file *file) | ||
175 | { | ||
176 | return seq_open(file, &severities_seq_ops); | ||
177 | } | ||
178 | |||
179 | static ssize_t severities_coverage_write(struct file *file, | ||
180 | const char __user *ubuf, | ||
181 | size_t count, loff_t *ppos) | ||
182 | { | ||
183 | int i; | ||
184 | for (i = 0; i < ARRAY_SIZE(severities); i++) | ||
185 | severities[i].covered = 0; | ||
186 | return count; | ||
187 | } | ||
188 | |||
189 | static const struct file_operations severities_coverage_fops = { | ||
190 | .open = severities_coverage_open, | ||
191 | .release = seq_release, | ||
192 | .read = seq_read, | ||
193 | .write = severities_coverage_write, | ||
194 | }; | ||
195 | |||
196 | static int __init severities_debugfs_init(void) | ||
197 | { | ||
198 | struct dentry *dmce = NULL, *fseverities_coverage = NULL; | ||
199 | |||
200 | dmce = debugfs_create_dir("mce", NULL); | ||
201 | if (dmce == NULL) | ||
202 | goto err_out; | ||
203 | fseverities_coverage = debugfs_create_file("severities-coverage", | ||
204 | 0444, dmce, NULL, | ||
205 | &severities_coverage_fops); | ||
206 | if (fseverities_coverage == NULL) | ||
207 | goto err_out; | ||
208 | |||
209 | return 0; | ||
210 | |||
211 | err_out: | ||
212 | if (fseverities_coverage) | ||
213 | debugfs_remove(fseverities_coverage); | ||
214 | if (dmce) | ||
215 | debugfs_remove(dmce); | ||
216 | return -ENOMEM; | ||
217 | } | ||
218 | late_initcall(severities_debugfs_init); | ||
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c new file mode 100644 index 000000000000..fabba15e4558 --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/mce.c | |||
@@ -0,0 +1,1964 @@ | |||
1 | /* | ||
2 | * Machine check handler. | ||
3 | * | ||
4 | * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. | ||
5 | * Rest from unknown author(s). | ||
6 | * 2004 Andi Kleen. Rewrote most of it. | ||
7 | * Copyright 2008 Intel Corporation | ||
8 | * Author: Andi Kleen | ||
9 | */ | ||
10 | #include <linux/thread_info.h> | ||
11 | #include <linux/capability.h> | ||
12 | #include <linux/miscdevice.h> | ||
13 | #include <linux/interrupt.h> | ||
14 | #include <linux/ratelimit.h> | ||
15 | #include <linux/kallsyms.h> | ||
16 | #include <linux/rcupdate.h> | ||
17 | #include <linux/kobject.h> | ||
18 | #include <linux/uaccess.h> | ||
19 | #include <linux/kdebug.h> | ||
20 | #include <linux/kernel.h> | ||
21 | #include <linux/percpu.h> | ||
22 | #include <linux/string.h> | ||
23 | #include <linux/sysdev.h> | ||
24 | #include <linux/delay.h> | ||
25 | #include <linux/ctype.h> | ||
26 | #include <linux/sched.h> | ||
27 | #include <linux/sysfs.h> | ||
28 | #include <linux/types.h> | ||
29 | #include <linux/init.h> | ||
30 | #include <linux/kmod.h> | ||
31 | #include <linux/poll.h> | ||
32 | #include <linux/nmi.h> | ||
33 | #include <linux/cpu.h> | ||
34 | #include <linux/smp.h> | ||
35 | #include <linux/fs.h> | ||
36 | #include <linux/mm.h> | ||
37 | |||
38 | #include <asm/processor.h> | ||
39 | #include <asm/hw_irq.h> | ||
40 | #include <asm/apic.h> | ||
41 | #include <asm/idle.h> | ||
42 | #include <asm/ipi.h> | ||
43 | #include <asm/mce.h> | ||
44 | #include <asm/msr.h> | ||
45 | |||
46 | #include "mce-internal.h" | ||
47 | #include "mce.h" | ||
48 | |||
49 | /* Handle unconfigured int18 (should never happen) */ | ||
50 | static void unexpected_machine_check(struct pt_regs *regs, long error_code) | ||
51 | { | ||
52 | printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", | ||
53 | smp_processor_id()); | ||
54 | } | ||
55 | |||
56 | /* Call the installed machine check handler for this CPU setup. */ | ||
57 | void (*machine_check_vector)(struct pt_regs *, long error_code) = | ||
58 | unexpected_machine_check; | ||
59 | |||
60 | int mce_disabled; | ||
61 | |||
62 | #ifdef CONFIG_X86_NEW_MCE | ||
63 | |||
64 | #define MISC_MCELOG_MINOR 227 | ||
65 | |||
66 | #define SPINUNIT 100 /* 100ns */ | ||
67 | |||
68 | atomic_t mce_entry; | ||
69 | |||
70 | DEFINE_PER_CPU(unsigned, mce_exception_count); | ||
71 | |||
72 | /* | ||
73 | * Tolerant levels: | ||
74 | * 0: always panic on uncorrected errors, log corrected errors | ||
75 | * 1: panic or SIGBUS on uncorrected errors, log corrected errors | ||
76 | * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors | ||
77 | * 3: never panic or SIGBUS, log all errors (for testing only) | ||
78 | */ | ||
79 | static int tolerant = 1; | ||
80 | static int banks; | ||
81 | static u64 *bank; | ||
82 | static unsigned long notify_user; | ||
83 | static int rip_msr; | ||
84 | static int mce_bootlog = -1; | ||
85 | static int monarch_timeout = -1; | ||
86 | static int mce_panic_timeout; | ||
87 | static int mce_dont_log_ce; | ||
88 | int mce_cmci_disabled; | ||
89 | int mce_ignore_ce; | ||
90 | int mce_ser; | ||
91 | |||
92 | static char trigger[128]; | ||
93 | static char *trigger_argv[2] = { trigger, NULL }; | ||
94 | |||
95 | static unsigned long dont_init_banks; | ||
96 | |||
97 | static DECLARE_WAIT_QUEUE_HEAD(mce_wait); | ||
98 | static DEFINE_PER_CPU(struct mce, mces_seen); | ||
99 | static int cpu_missing; | ||
100 | |||
101 | |||
102 | /* MCA banks polled by the periodic polling timer for corrected events */ | ||
103 | DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { | ||
104 | [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL | ||
105 | }; | ||
106 | |||
107 | static inline int skip_bank_init(int i) | ||
108 | { | ||
109 | return i < BITS_PER_LONG && test_bit(i, &dont_init_banks); | ||
110 | } | ||
111 | |||
112 | static DEFINE_PER_CPU(struct work_struct, mce_work); | ||
113 | |||
114 | /* Do initial initialization of a struct mce */ | ||
115 | void mce_setup(struct mce *m) | ||
116 | { | ||
117 | memset(m, 0, sizeof(struct mce)); | ||
118 | m->cpu = m->extcpu = smp_processor_id(); | ||
119 | rdtscll(m->tsc); | ||
120 | /* We hope get_seconds stays lockless */ | ||
121 | m->time = get_seconds(); | ||
122 | m->cpuvendor = boot_cpu_data.x86_vendor; | ||
123 | m->cpuid = cpuid_eax(1); | ||
124 | #ifdef CONFIG_SMP | ||
125 | m->socketid = cpu_data(m->extcpu).phys_proc_id; | ||
126 | #endif | ||
127 | m->apicid = cpu_data(m->extcpu).initial_apicid; | ||
128 | rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap); | ||
129 | } | ||
130 | |||
131 | DEFINE_PER_CPU(struct mce, injectm); | ||
132 | EXPORT_PER_CPU_SYMBOL_GPL(injectm); | ||
133 | |||
134 | /* | ||
135 | * Lockless MCE logging infrastructure. | ||
136 | * This avoids deadlocks on printk locks without having to break locks. Also | ||
137 | * separate MCEs from kernel messages to avoid bogus bug reports. | ||
138 | */ | ||
139 | |||
140 | static struct mce_log mcelog = { | ||
141 | .signature = MCE_LOG_SIGNATURE, | ||
142 | .len = MCE_LOG_LEN, | ||
143 | .recordlen = sizeof(struct mce), | ||
144 | }; | ||
145 | |||
146 | void mce_log(struct mce *mce) | ||
147 | { | ||
148 | unsigned next, entry; | ||
149 | |||
150 | mce->finished = 0; | ||
151 | wmb(); | ||
152 | for (;;) { | ||
153 | entry = rcu_dereference(mcelog.next); | ||
154 | for (;;) { | ||
155 | /* | ||
156 | * When the buffer fills up discard new entries. | ||
157 | * Assume that the earlier errors are the more | ||
158 | * interesting ones: | ||
159 | */ | ||
160 | if (entry >= MCE_LOG_LEN) { | ||
161 | set_bit(MCE_OVERFLOW, | ||
162 | (unsigned long *)&mcelog.flags); | ||
163 | return; | ||
164 | } | ||
165 | /* Old left over entry. Skip: */ | ||
166 | if (mcelog.entry[entry].finished) { | ||
167 | entry++; | ||
168 | continue; | ||
169 | } | ||
170 | break; | ||
171 | } | ||
172 | smp_rmb(); | ||
173 | next = entry + 1; | ||
174 | if (cmpxchg(&mcelog.next, entry, next) == entry) | ||
175 | break; | ||
176 | } | ||
177 | memcpy(mcelog.entry + entry, mce, sizeof(struct mce)); | ||
178 | wmb(); | ||
179 | mcelog.entry[entry].finished = 1; | ||
180 | wmb(); | ||
181 | |||
182 | mce->finished = 1; | ||
183 | set_bit(0, &notify_user); | ||
184 | } | ||
185 | |||
186 | static void print_mce(struct mce *m) | ||
187 | { | ||
188 | printk(KERN_EMERG | ||
189 | "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", | ||
190 | m->extcpu, m->mcgstatus, m->bank, m->status); | ||
191 | if (m->ip) { | ||
192 | printk(KERN_EMERG "RIP%s %02x:<%016Lx> ", | ||
193 | !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", | ||
194 | m->cs, m->ip); | ||
195 | if (m->cs == __KERNEL_CS) | ||
196 | print_symbol("{%s}", m->ip); | ||
197 | printk("\n"); | ||
198 | } | ||
199 | printk(KERN_EMERG "TSC %llx ", m->tsc); | ||
200 | if (m->addr) | ||
201 | printk("ADDR %llx ", m->addr); | ||
202 | if (m->misc) | ||
203 | printk("MISC %llx ", m->misc); | ||
204 | printk("\n"); | ||
205 | printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", | ||
206 | m->cpuvendor, m->cpuid, m->time, m->socketid, | ||
207 | m->apicid); | ||
208 | } | ||
209 | |||
210 | static void print_mce_head(void) | ||
211 | { | ||
212 | printk(KERN_EMERG "\n" KERN_EMERG "HARDWARE ERROR\n"); | ||
213 | } | ||
214 | |||
215 | static void print_mce_tail(void) | ||
216 | { | ||
217 | printk(KERN_EMERG "This is not a software problem!\n" | ||
218 | KERN_EMERG "Run through mcelog --ascii to decode and contact your hardware vendor\n"); | ||
219 | } | ||
220 | |||
221 | #define PANIC_TIMEOUT 5 /* 5 seconds */ | ||
222 | |||
223 | static atomic_t mce_paniced; | ||
224 | |||
225 | /* Panic in progress. Enable interrupts and wait for final IPI */ | ||
226 | static void wait_for_panic(void) | ||
227 | { | ||
228 | long timeout = PANIC_TIMEOUT*USEC_PER_SEC; | ||
229 | preempt_disable(); | ||
230 | local_irq_enable(); | ||
231 | while (timeout-- > 0) | ||
232 | udelay(1); | ||
233 | if (panic_timeout == 0) | ||
234 | panic_timeout = mce_panic_timeout; | ||
235 | panic("Panicing machine check CPU died"); | ||
236 | } | ||
237 | |||
238 | static void mce_panic(char *msg, struct mce *final, char *exp) | ||
239 | { | ||
240 | int i; | ||
241 | |||
242 | /* | ||
243 | * Make sure only one CPU runs in machine check panic | ||
244 | */ | ||
245 | if (atomic_add_return(1, &mce_paniced) > 1) | ||
246 | wait_for_panic(); | ||
247 | barrier(); | ||
248 | |||
249 | bust_spinlocks(1); | ||
250 | console_verbose(); | ||
251 | print_mce_head(); | ||
252 | /* First print corrected ones that are still unlogged */ | ||
253 | for (i = 0; i < MCE_LOG_LEN; i++) { | ||
254 | struct mce *m = &mcelog.entry[i]; | ||
255 | if (!(m->status & MCI_STATUS_VAL)) | ||
256 | continue; | ||
257 | if (!(m->status & MCI_STATUS_UC)) | ||
258 | print_mce(m); | ||
259 | } | ||
260 | /* Now print uncorrected but with the final one last */ | ||
261 | for (i = 0; i < MCE_LOG_LEN; i++) { | ||
262 | struct mce *m = &mcelog.entry[i]; | ||
263 | if (!(m->status & MCI_STATUS_VAL)) | ||
264 | continue; | ||
265 | if (!(m->status & MCI_STATUS_UC)) | ||
266 | continue; | ||
267 | if (!final || memcmp(m, final, sizeof(struct mce))) | ||
268 | print_mce(m); | ||
269 | } | ||
270 | if (final) | ||
271 | print_mce(final); | ||
272 | if (cpu_missing) | ||
273 | printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n"); | ||
274 | print_mce_tail(); | ||
275 | if (exp) | ||
276 | printk(KERN_EMERG "Machine check: %s\n", exp); | ||
277 | if (panic_timeout == 0) | ||
278 | panic_timeout = mce_panic_timeout; | ||
279 | panic(msg); | ||
280 | } | ||
281 | |||
282 | /* Support code for software error injection */ | ||
283 | |||
284 | static int msr_to_offset(u32 msr) | ||
285 | { | ||
286 | unsigned bank = __get_cpu_var(injectm.bank); | ||
287 | if (msr == rip_msr) | ||
288 | return offsetof(struct mce, ip); | ||
289 | if (msr == MSR_IA32_MC0_STATUS + bank*4) | ||
290 | return offsetof(struct mce, status); | ||
291 | if (msr == MSR_IA32_MC0_ADDR + bank*4) | ||
292 | return offsetof(struct mce, addr); | ||
293 | if (msr == MSR_IA32_MC0_MISC + bank*4) | ||
294 | return offsetof(struct mce, misc); | ||
295 | if (msr == MSR_IA32_MCG_STATUS) | ||
296 | return offsetof(struct mce, mcgstatus); | ||
297 | return -1; | ||
298 | } | ||
299 | |||
300 | /* MSR access wrappers used for error injection */ | ||
301 | static u64 mce_rdmsrl(u32 msr) | ||
302 | { | ||
303 | u64 v; | ||
304 | if (__get_cpu_var(injectm).finished) { | ||
305 | int offset = msr_to_offset(msr); | ||
306 | if (offset < 0) | ||
307 | return 0; | ||
308 | return *(u64 *)((char *)&__get_cpu_var(injectm) + offset); | ||
309 | } | ||
310 | rdmsrl(msr, v); | ||
311 | return v; | ||
312 | } | ||
313 | |||
314 | static void mce_wrmsrl(u32 msr, u64 v) | ||
315 | { | ||
316 | if (__get_cpu_var(injectm).finished) { | ||
317 | int offset = msr_to_offset(msr); | ||
318 | if (offset >= 0) | ||
319 | *(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v; | ||
320 | return; | ||
321 | } | ||
322 | wrmsrl(msr, v); | ||
323 | } | ||
324 | |||
325 | /* | ||
326 | * Simple lockless ring to communicate PFNs from the exception handler with the | ||
327 | * process context work function. This is vastly simplified because there's | ||
328 | * only a single reader and a single writer. | ||
329 | */ | ||
330 | #define MCE_RING_SIZE 16 /* we use one entry less */ | ||
331 | |||
332 | struct mce_ring { | ||
333 | unsigned short start; | ||
334 | unsigned short end; | ||
335 | unsigned long ring[MCE_RING_SIZE]; | ||
336 | }; | ||
337 | static DEFINE_PER_CPU(struct mce_ring, mce_ring); | ||
338 | |||
339 | /* Runs with CPU affinity in workqueue */ | ||
340 | static int mce_ring_empty(void) | ||
341 | { | ||
342 | struct mce_ring *r = &__get_cpu_var(mce_ring); | ||
343 | |||
344 | return r->start == r->end; | ||
345 | } | ||
346 | |||
347 | static int mce_ring_get(unsigned long *pfn) | ||
348 | { | ||
349 | struct mce_ring *r; | ||
350 | int ret = 0; | ||
351 | |||
352 | *pfn = 0; | ||
353 | get_cpu(); | ||
354 | r = &__get_cpu_var(mce_ring); | ||
355 | if (r->start == r->end) | ||
356 | goto out; | ||
357 | *pfn = r->ring[r->start]; | ||
358 | r->start = (r->start + 1) % MCE_RING_SIZE; | ||
359 | ret = 1; | ||
360 | out: | ||
361 | put_cpu(); | ||
362 | return ret; | ||
363 | } | ||
364 | |||
365 | /* Always runs in MCE context with preempt off */ | ||
366 | static int mce_ring_add(unsigned long pfn) | ||
367 | { | ||
368 | struct mce_ring *r = &__get_cpu_var(mce_ring); | ||
369 | unsigned next; | ||
370 | |||
371 | next = (r->end + 1) % MCE_RING_SIZE; | ||
372 | if (next == r->start) | ||
373 | return -1; | ||
374 | r->ring[r->end] = pfn; | ||
375 | wmb(); | ||
376 | r->end = next; | ||
377 | return 0; | ||
378 | } | ||
379 | |||
380 | int mce_available(struct cpuinfo_x86 *c) | ||
381 | { | ||
382 | if (mce_disabled) | ||
383 | return 0; | ||
384 | return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); | ||
385 | } | ||
386 | |||
387 | static void mce_schedule_work(void) | ||
388 | { | ||
389 | if (!mce_ring_empty()) { | ||
390 | struct work_struct *work = &__get_cpu_var(mce_work); | ||
391 | if (!work_pending(work)) | ||
392 | schedule_work(work); | ||
393 | } | ||
394 | } | ||
395 | |||
396 | /* | ||
397 | * Get the address of the instruction at the time of the machine check | ||
398 | * error. | ||
399 | */ | ||
400 | static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) | ||
401 | { | ||
402 | |||
403 | if (regs && (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV))) { | ||
404 | m->ip = regs->ip; | ||
405 | m->cs = regs->cs; | ||
406 | } else { | ||
407 | m->ip = 0; | ||
408 | m->cs = 0; | ||
409 | } | ||
410 | if (rip_msr) | ||
411 | m->ip = mce_rdmsrl(rip_msr); | ||
412 | } | ||
413 | |||
414 | #ifdef CONFIG_X86_LOCAL_APIC | ||
415 | /* | ||
416 | * Called after interrupts have been reenabled again | ||
417 | * when a MCE happened during an interrupts off region | ||
418 | * in the kernel. | ||
419 | */ | ||
420 | asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs) | ||
421 | { | ||
422 | ack_APIC_irq(); | ||
423 | exit_idle(); | ||
424 | irq_enter(); | ||
425 | mce_notify_irq(); | ||
426 | mce_schedule_work(); | ||
427 | irq_exit(); | ||
428 | } | ||
429 | #endif | ||
430 | |||
431 | static void mce_report_event(struct pt_regs *regs) | ||
432 | { | ||
433 | if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) { | ||
434 | mce_notify_irq(); | ||
435 | /* | ||
436 | * Triggering the work queue here is just an insurance | ||
437 | * policy in case the syscall exit notify handler | ||
438 | * doesn't run soon enough or ends up running on the | ||
439 | * wrong CPU (can happen when audit sleeps) | ||
440 | */ | ||
441 | mce_schedule_work(); | ||
442 | return; | ||
443 | } | ||
444 | |||
445 | #ifdef CONFIG_X86_LOCAL_APIC | ||
446 | /* | ||
447 | * Without APIC do not notify. The event will be picked | ||
448 | * up eventually. | ||
449 | */ | ||
450 | if (!cpu_has_apic) | ||
451 | return; | ||
452 | |||
453 | /* | ||
454 | * When interrupts are disabled we cannot use | ||
455 | * kernel services safely. Trigger a self interrupt | ||
456 | * through the APIC instead, to do the notification | ||
457 | * after interrupts are reenabled again. | ||
458 | */ | ||
459 | apic->send_IPI_self(MCE_SELF_VECTOR); | ||
460 | |||
461 | /* | ||
462 | * Wait for idle afterwards again so that we don't leave the | ||
463 | * APIC in a non idle state because the normal APIC writes | ||
464 | * cannot exclude us. | ||
465 | */ | ||
466 | apic_wait_icr_idle(); | ||
467 | #endif | ||
468 | } | ||
469 | |||
470 | DEFINE_PER_CPU(unsigned, mce_poll_count); | ||
471 | |||
472 | /* | ||
473 | * Poll for corrected events or events that happened before reset. | ||
474 | * Those are just logged through /dev/mcelog. | ||
475 | * | ||
476 | * This is executed in standard interrupt context. | ||
477 | * | ||
478 | * Note: spec recommends to panic for fatal unsignalled | ||
479 | * errors here. However this would be quite problematic -- | ||
480 | * we would need to reimplement the Monarch handling and | ||
481 | * it would mess up the exclusion between exception handler | ||
482 | * and poll handler -- so we skip this for now. | ||
483 | * These cases should not happen anyways, or only when the CPU | ||
484 | * is already totally confused. In this case it's likely it will | ||
485 | * not fully execute the machine check handler either. | ||
486 | */ | ||
487 | void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) | ||
488 | { | ||
489 | struct mce m; | ||
490 | int i; | ||
491 | |||
492 | __get_cpu_var(mce_poll_count)++; | ||
493 | |||
494 | mce_setup(&m); | ||
495 | |||
496 | m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); | ||
497 | for (i = 0; i < banks; i++) { | ||
498 | if (!bank[i] || !test_bit(i, *b)) | ||
499 | continue; | ||
500 | |||
501 | m.misc = 0; | ||
502 | m.addr = 0; | ||
503 | m.bank = i; | ||
504 | m.tsc = 0; | ||
505 | |||
506 | barrier(); | ||
507 | m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); | ||
508 | if (!(m.status & MCI_STATUS_VAL)) | ||
509 | continue; | ||
510 | |||
511 | /* | ||
512 | * Uncorrected or signalled events are handled by the exception | ||
513 | * handler when it is enabled, so don't process those here. | ||
514 | * | ||
515 | * TBD do the same check for MCI_STATUS_EN here? | ||
516 | */ | ||
517 | if (!(flags & MCP_UC) && | ||
518 | (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC))) | ||
519 | continue; | ||
520 | |||
521 | if (m.status & MCI_STATUS_MISCV) | ||
522 | m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4); | ||
523 | if (m.status & MCI_STATUS_ADDRV) | ||
524 | m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4); | ||
525 | |||
526 | if (!(flags & MCP_TIMESTAMP)) | ||
527 | m.tsc = 0; | ||
528 | /* | ||
529 | * Don't get the IP here because it's unlikely to | ||
530 | * have anything to do with the actual error location. | ||
531 | */ | ||
532 | if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) { | ||
533 | mce_log(&m); | ||
534 | add_taint(TAINT_MACHINE_CHECK); | ||
535 | } | ||
536 | |||
537 | /* | ||
538 | * Clear state for this bank. | ||
539 | */ | ||
540 | mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); | ||
541 | } | ||
542 | |||
543 | /* | ||
544 | * Don't clear MCG_STATUS here because it's only defined for | ||
545 | * exceptions. | ||
546 | */ | ||
547 | |||
548 | sync_core(); | ||
549 | } | ||
550 | EXPORT_SYMBOL_GPL(machine_check_poll); | ||
551 | |||
552 | /* | ||
553 | * Do a quick check if any of the events requires a panic. | ||
554 | * This decides if we keep the events around or clear them. | ||
555 | */ | ||
556 | static int mce_no_way_out(struct mce *m, char **msg) | ||
557 | { | ||
558 | int i; | ||
559 | |||
560 | for (i = 0; i < banks; i++) { | ||
561 | m->status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); | ||
562 | if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY) | ||
563 | return 1; | ||
564 | } | ||
565 | return 0; | ||
566 | } | ||
567 | |||
568 | /* | ||
569 | * Variable to establish order between CPUs while scanning. | ||
570 | * Each CPU spins initially until mce_executing reaches its number. | ||
571 | */ | ||
572 | static atomic_t mce_executing; | ||
573 | |||
574 | /* | ||
575 | * Defines order of CPUs on entry. First CPU becomes Monarch. | ||
576 | */ | ||
577 | static atomic_t mce_callin; | ||
578 | |||
579 | /* | ||
580 | * Check if a timeout waiting for other CPUs happened. | ||
581 | */ | ||
582 | static int mce_timed_out(u64 *t) | ||
583 | { | ||
584 | /* | ||
585 | * The others already did panic for some reason. | ||
586 | * Bail out like in a timeout. | ||
587 | * rmb() to tell the compiler that system_state | ||
588 | * might have been modified by someone else. | ||
589 | */ | ||
590 | rmb(); | ||
591 | if (atomic_read(&mce_paniced)) | ||
592 | wait_for_panic(); | ||
593 | if (!monarch_timeout) | ||
594 | goto out; | ||
595 | if ((s64)*t < SPINUNIT) { | ||
596 | /* CHECKME: Make panic default for 1 too? */ | ||
597 | if (tolerant < 1) | ||
598 | mce_panic("Timeout synchronizing machine check over CPUs", | ||
599 | NULL, NULL); | ||
600 | cpu_missing = 1; | ||
601 | return 1; | ||
602 | } | ||
603 | *t -= SPINUNIT; | ||
604 | out: | ||
605 | touch_nmi_watchdog(); | ||
606 | return 0; | ||
607 | } | ||
608 | |||
609 | /* | ||
610 | * The Monarch's reign. The Monarch is the CPU who entered | ||
611 | * the machine check handler first. It waits for the others to | ||
612 | * raise the exception too and then grades them. When any | ||
613 | * error is fatal, panic. Only then let the others continue. | ||
614 | * | ||
615 | * The other CPUs entering the MCE handler will be controlled by the | ||
616 | * Monarch. They are called Subjects. | ||
617 | * | ||
618 | * This way we prevent any potential data corruption in an unrecoverable case | ||
619 | * and also make sure that all CPUs' errors are always examined. | ||
620 | * | ||
621 | * Also this detects the case of a machine check event coming from outer | ||
622 | * space (not detected by any CPU). In this case some external agent wants | ||
623 | * us to shut down, so panic too. | ||
624 | * | ||
625 | * The other CPUs might still decide to panic if the handler happens | ||
626 | * in an unrecoverable place, but in this case the system is in a semi-stable | ||
627 | * state and won't corrupt anything by itself. It's ok to let the others | ||
628 | * continue for a bit first. | ||
629 | * | ||
630 | * All the spin loops have timeouts; when a timeout happens a CPU | ||
631 | * typically elects itself to be Monarch. | ||
632 | */ | ||
633 | static void mce_reign(void) | ||
634 | { | ||
635 | int cpu; | ||
636 | struct mce *m = NULL; | ||
637 | int global_worst = 0; | ||
638 | char *msg = NULL; | ||
639 | char *nmsg = NULL; | ||
640 | |||
641 | /* | ||
642 | * This CPU is the Monarch and the other CPUs have run | ||
643 | * through their handlers. | ||
644 | * Grade the severity of the errors of all the CPUs. | ||
645 | */ | ||
646 | for_each_possible_cpu(cpu) { | ||
647 | int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant, | ||
648 | &nmsg); | ||
649 | if (severity > global_worst) { | ||
650 | msg = nmsg; | ||
651 | global_worst = severity; | ||
652 | m = &per_cpu(mces_seen, cpu); | ||
653 | } | ||
654 | } | ||
655 | |||
656 | /* | ||
657 | * Cannot recover? Panic here then. | ||
658 | * This dumps all the mces in the log buffer and stops the | ||
659 | * other CPUs. | ||
660 | */ | ||
661 | if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3) | ||
662 | mce_panic("Fatal Machine check", m, msg); | ||
663 | |||
664 | /* | ||
665 | * For UC somewhere we let the CPU who detects it handle it. | ||
666 | * Also must let the others continue, otherwise the handling | ||
667 | * CPU could deadlock on a lock. | ||
668 | */ | ||
669 | |||
670 | /* | ||
671 | * No machine check event found. Must be some external | ||
672 | * source or one CPU is hung. Panic. | ||
673 | */ | ||
674 | if (!m && tolerant < 3) | ||
675 | mce_panic("Machine check from unknown source", NULL, NULL); | ||
676 | |||
677 | /* | ||
678 | * Now clear all the mces_seen so that they don't reappear on | ||
679 | * the next mce. | ||
680 | */ | ||
681 | for_each_possible_cpu(cpu) | ||
682 | memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce)); | ||
683 | } | ||
684 | |||
685 | static atomic_t global_nwo; | ||
686 | |||
687 | /* | ||
688 | * Start of Monarch synchronization. This waits until all CPUs have | ||
689 | * entered the exception handler and then determines if any of them | ||
690 | * saw a fatal event that requires panic. Then it executes them | ||
691 | * in the entry order. | ||
692 | * TBD double check parallel CPU hotunplug | ||
693 | */ | ||
694 | static int mce_start(int no_way_out, int *order) | ||
695 | { | ||
696 | int nwo; | ||
697 | int cpus = num_online_cpus(); | ||
698 | u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC; | ||
699 | |||
700 | if (!timeout) { | ||
701 | *order = -1; | ||
702 | return no_way_out; | ||
703 | } | ||
704 | |||
705 | atomic_add(no_way_out, &global_nwo); | ||
706 | |||
707 | /* | ||
708 | * Wait for everyone. | ||
709 | */ | ||
710 | while (atomic_read(&mce_callin) != cpus) { | ||
711 | if (mce_timed_out(&timeout)) { | ||
712 | atomic_set(&global_nwo, 0); | ||
713 | *order = -1; | ||
714 | return no_way_out; | ||
715 | } | ||
716 | ndelay(SPINUNIT); | ||
717 | } | ||
718 | |||
719 | /* | ||
720 | * Cache the global no_way_out state. | ||
721 | */ | ||
722 | nwo = atomic_read(&global_nwo); | ||
723 | |||
724 | /* | ||
725 | * Monarch starts executing now, the others wait. | ||
726 | */ | ||
727 | if (*order == 1) { | ||
728 | atomic_set(&mce_executing, 1); | ||
729 | return nwo; | ||
730 | } | ||
731 | |||
732 | /* | ||
733 | * Now start the scanning loop one by one | ||
734 | * in the original callin order. | ||
735 | * This way when there are any shared banks it will | ||
736 | * be only seen by one CPU before cleared, avoiding duplicates. | ||
737 | */ | ||
738 | while (atomic_read(&mce_executing) < *order) { | ||
739 | if (mce_timed_out(&timeout)) { | ||
740 | atomic_set(&global_nwo, 0); | ||
741 | *order = -1; | ||
742 | return no_way_out; | ||
743 | } | ||
744 | ndelay(SPINUNIT); | ||
745 | } | ||
746 | return nwo; | ||
747 | } | ||
748 | |||
749 | /* | ||
750 | * Synchronize between CPUs after main scanning loop. | ||
751 | * This invokes the bulk of the Monarch processing. | ||
752 | */ | ||
753 | static int mce_end(int order) | ||
754 | { | ||
755 | int ret = -1; | ||
756 | u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC; | ||
757 | |||
758 | if (!timeout) | ||
759 | goto reset; | ||
760 | if (order < 0) | ||
761 | goto reset; | ||
762 | |||
763 | /* | ||
764 | * Allow others to run. | ||
765 | */ | ||
766 | atomic_inc(&mce_executing); | ||
767 | |||
768 | if (order == 1) { | ||
769 | /* CHECKME: Can this race with a parallel hotplug? */ | ||
770 | int cpus = num_online_cpus(); | ||
771 | |||
772 | /* | ||
773 | * Monarch: Wait for everyone to go through their scanning | ||
774 | * loops. | ||
775 | */ | ||
776 | while (atomic_read(&mce_executing) <= cpus) { | ||
777 | if (mce_timed_out(&timeout)) | ||
778 | goto reset; | ||
779 | ndelay(SPINUNIT); | ||
780 | } | ||
781 | |||
782 | mce_reign(); | ||
783 | barrier(); | ||
784 | ret = 0; | ||
785 | } else { | ||
786 | /* | ||
787 | * Subject: Wait for Monarch to finish. | ||
788 | */ | ||
789 | while (atomic_read(&mce_executing) != 0) { | ||
790 | if (mce_timed_out(&timeout)) | ||
791 | goto reset; | ||
792 | ndelay(SPINUNIT); | ||
793 | } | ||
794 | |||
795 | /* | ||
796 | * Don't reset anything. That's done by the Monarch. | ||
797 | */ | ||
798 | return 0; | ||
799 | } | ||
800 | |||
801 | /* | ||
802 | * Reset all global state. | ||
803 | */ | ||
804 | reset: | ||
805 | atomic_set(&global_nwo, 0); | ||
806 | atomic_set(&mce_callin, 0); | ||
807 | barrier(); | ||
808 | |||
809 | /* | ||
810 | * Let others run again. | ||
811 | */ | ||
812 | atomic_set(&mce_executing, 0); | ||
813 | return ret; | ||
814 | } | ||
815 | |||
816 | /* | ||
817 | * Check if the address reported by the CPU is in a format we can parse. | ||
818 | * It would be possible to add code for most other cases, but all would | ||
819 | * be somewhat complicated (e.g. segment offset would require an instruction | ||
820 | * parser). So only support physical addresses up to page granularity for now. | ||
821 | */ | ||
822 | static int mce_usable_address(struct mce *m) | ||
823 | { | ||
824 | if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV)) | ||
825 | return 0; | ||
826 | if ((m->misc & 0x3f) > PAGE_SHIFT) | ||
827 | return 0; | ||
828 | if (((m->misc >> 6) & 7) != MCM_ADDR_PHYS) | ||
829 | return 0; | ||
830 | return 1; | ||
831 | } | ||
832 | |||
833 | static void mce_clear_state(unsigned long *toclear) | ||
834 | { | ||
835 | int i; | ||
836 | |||
837 | for (i = 0; i < banks; i++) { | ||
838 | if (test_bit(i, toclear)) | ||
839 | mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); | ||
840 | } | ||
841 | } | ||
842 | |||
843 | /* | ||
844 | * The actual machine check handler. This only handles real | ||
845 | * exceptions when something got corrupted coming in through int 18. | ||
846 | * | ||
847 | * This is executed in NMI context not subject to normal locking rules. This | ||
848 | * implies that most kernel services cannot be safely used. Don't even | ||
849 | * think about putting a printk in there! | ||
850 | * | ||
851 | * On Intel systems this is entered on all CPUs in parallel through | ||
852 | * MCE broadcast. However some CPUs might be broken beyond repair, | ||
853 | * so be always careful when synchronizing with others. | ||
854 | */ | ||
855 | void do_machine_check(struct pt_regs *regs, long error_code) | ||
856 | { | ||
857 | struct mce m, *final; | ||
858 | int i; | ||
859 | int worst = 0; | ||
860 | int severity; | ||
861 | /* | ||
862 | * Establish sequential order between the CPUs entering the machine | ||
863 | * check handler. | ||
864 | */ | ||
865 | int order; | ||
866 | |||
867 | /* | ||
868 | * If no_way_out gets set, there is no safe way to recover from this | ||
869 | * MCE. If tolerant is cranked up, we'll try anyway. | ||
870 | */ | ||
871 | int no_way_out = 0; | ||
872 | /* | ||
873 | * If kill_it gets set, there might be a way to recover from this | ||
874 | * error. | ||
875 | */ | ||
876 | int kill_it = 0; | ||
877 | DECLARE_BITMAP(toclear, MAX_NR_BANKS); | ||
878 | char *msg = "Unknown"; | ||
879 | |||
880 | atomic_inc(&mce_entry); | ||
881 | |||
882 | __get_cpu_var(mce_exception_count)++; | ||
883 | |||
884 | if (notify_die(DIE_NMI, "machine check", regs, error_code, | ||
885 | 18, SIGKILL) == NOTIFY_STOP) | ||
886 | goto out; | ||
887 | if (!banks) | ||
888 | goto out; | ||
889 | |||
890 | order = atomic_add_return(1, &mce_callin); | ||
891 | mce_setup(&m); | ||
892 | |||
893 | m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); | ||
894 | no_way_out = mce_no_way_out(&m, &msg); | ||
895 | |||
896 | final = &__get_cpu_var(mces_seen); | ||
897 | *final = m; | ||
898 | |||
899 | barrier(); | ||
900 | |||
901 | /* | ||
902 | * When there is no restart IP we must always kill or panic. | ||
903 | */ | ||
904 | if (!(m.mcgstatus & MCG_STATUS_RIPV)) | ||
905 | kill_it = 1; | ||
906 | |||
907 | /* | ||
908 | * Go through all the banks in exclusion of the other CPUs. | ||
909 | * This way we don't report duplicated events on shared banks | ||
910 | * because the first one to see it will clear it. | ||
911 | */ | ||
912 | no_way_out = mce_start(no_way_out, &order); | ||
913 | for (i = 0; i < banks; i++) { | ||
914 | __clear_bit(i, toclear); | ||
915 | if (!bank[i]) | ||
916 | continue; | ||
917 | |||
918 | m.misc = 0; | ||
919 | m.addr = 0; | ||
920 | m.bank = i; | ||
921 | |||
922 | m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); | ||
923 | if ((m.status & MCI_STATUS_VAL) == 0) | ||
924 | continue; | ||
925 | |||
926 | /* | ||
927 | * Non uncorrected or non signaled errors are handled by | ||
928 | * machine_check_poll. Leave them alone, unless this panics. | ||
929 | */ | ||
930 | if (!(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)) && | ||
931 | !no_way_out) | ||
932 | continue; | ||
933 | |||
934 | /* | ||
935 | * Set taint even when machine check was not enabled. | ||
936 | */ | ||
937 | add_taint(TAINT_MACHINE_CHECK); | ||
938 | |||
939 | severity = mce_severity(&m, tolerant, NULL); | ||
940 | |||
941 | /* | ||
942 | * When machine check was for corrected handler don't touch, | ||
943 | * unless we're panicking. | ||
944 | */ | ||
945 | if (severity == MCE_KEEP_SEVERITY && !no_way_out) | ||
946 | continue; | ||
947 | __set_bit(i, toclear); | ||
948 | if (severity == MCE_NO_SEVERITY) { | ||
949 | /* | ||
950 | * Machine check event was not enabled. Clear, but | ||
951 | * ignore. | ||
952 | */ | ||
953 | continue; | ||
954 | } | ||
955 | |||
956 | /* | ||
957 | * Kill on action required. | ||
958 | */ | ||
959 | if (severity == MCE_AR_SEVERITY) | ||
960 | kill_it = 1; | ||
961 | |||
962 | if (m.status & MCI_STATUS_MISCV) | ||
963 | m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4); | ||
964 | if (m.status & MCI_STATUS_ADDRV) | ||
965 | m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4); | ||
966 | |||
967 | /* | ||
968 | * Action optional error. Queue address for later processing. | ||
969 | * When the ring overflows we just ignore the AO error. | ||
970 | * RED-PEN add some logging mechanism when | ||
971 | * usable_address or mce_add_ring fails. | ||
972 | * RED-PEN don't ignore overflow for tolerant == 0 | ||
973 | */ | ||
974 | if (severity == MCE_AO_SEVERITY && mce_usable_address(&m)) | ||
975 | mce_ring_add(m.addr >> PAGE_SHIFT); | ||
976 | |||
977 | mce_get_rip(&m, regs); | ||
978 | mce_log(&m); | ||
979 | |||
980 | if (severity > worst) { | ||
981 | *final = m; | ||
982 | worst = severity; | ||
983 | } | ||
984 | } | ||
985 | |||
986 | if (!no_way_out) | ||
987 | mce_clear_state(toclear); | ||
988 | |||
989 | /* | ||
990 | * Do most of the synchronization with other CPUs. | ||
991 | * When there's any problem use only local no_way_out state. | ||
992 | */ | ||
993 | if (mce_end(order) < 0) | ||
994 | no_way_out = worst >= MCE_PANIC_SEVERITY; | ||
995 | |||
996 | /* | ||
997 | * If we have decided that we just CAN'T continue, and the user | ||
998 | * has not set tolerant to an insane level, give up and die. | ||
999 | * | ||
1000 | * This is mainly used in the case when the system doesn't | ||
1001 | * support MCE broadcasting or it has been disabled. | ||
1002 | */ | ||
1003 | if (no_way_out && tolerant < 3) | ||
1004 | mce_panic("Fatal machine check on current CPU", final, msg); | ||
1005 | |||
1006 | /* | ||
1007 | * If the error seems to be unrecoverable, something should be | ||
1008 | * done. Try to kill as little as possible. If we can kill just | ||
1009 | * one task, do that. If the user has set the tolerance very | ||
1010 | * high, don't try to do anything at all. | ||
1011 | */ | ||
1012 | |||
1013 | if (kill_it && tolerant < 3) | ||
1014 | force_sig(SIGBUS, current); | ||
1015 | |||
1016 | /* notify userspace ASAP */ | ||
1017 | set_thread_flag(TIF_MCE_NOTIFY); | ||
1018 | |||
1019 | if (worst > 0) | ||
1020 | mce_report_event(regs); | ||
1021 | mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); | ||
1022 | out: | ||
1023 | atomic_dec(&mce_entry); | ||
1024 | sync_core(); | ||
1025 | } | ||
1026 | EXPORT_SYMBOL_GPL(do_machine_check); | ||
1027 | |||
1028 | /* dummy to break dependency. actual code is in mm/memory-failure.c */ | ||
1029 | void __attribute__((weak)) memory_failure(unsigned long pfn, int vector) | ||
1030 | { | ||
1031 | printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn); | ||
1032 | } | ||
1033 | |||
1034 | /* | ||
1035 | * Called after mce notification in process context. This code | ||
1036 | * is allowed to sleep. Call the high level VM handler to process | ||
1037 | * any corrupted pages. | ||
1038 | * Assume that the work queue code only calls this one at a time | ||
1039 | * per CPU. | ||
1040 | * Note we don't disable preemption, so this code might run on the wrong | ||
1041 | * CPU. In this case the event is picked up by the scheduled work queue. | ||
1042 | * This is merely a fast path to expedite processing in some common | ||
1043 | * cases. | ||
1044 | */ | ||
1045 | void mce_notify_process(void) | ||
1046 | { | ||
1047 | unsigned long pfn; | ||
1048 | mce_notify_irq(); | ||
1049 | while (mce_ring_get(&pfn)) | ||
1050 | memory_failure(pfn, MCE_VECTOR); | ||
1051 | } | ||
1052 | |||
/* Work item callback; @dummy is unused, everything happens in mce_notify_process(). */
static void mce_process_work(struct work_struct *dummy)
{
	mce_notify_process();
}
1057 | |||
1058 | #ifdef CONFIG_X86_MCE_INTEL | ||
1059 | /*** | ||
1060 | * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog | ||
1061 | * @cpu: The CPU on which the event occurred. | ||
1062 | * @status: Event status information | ||
1063 | * | ||
1064 | * This function should be called by the thermal interrupt after the | ||
1065 | * event has been processed and the decision was made to log the event | ||
1066 | * further. | ||
1067 | * | ||
1068 | * The status parameter will be saved to the 'status' field of 'struct mce' | ||
1069 | * and historically has been the register value of the | ||
1070 | * MSR_IA32_THERMAL_STATUS (Intel) msr. | ||
1071 | */ | ||
1072 | void mce_log_therm_throt_event(__u64 status) | ||
1073 | { | ||
1074 | struct mce m; | ||
1075 | |||
1076 | mce_setup(&m); | ||
1077 | m.bank = MCE_THERMAL_BANK; | ||
1078 | m.status = status; | ||
1079 | mce_log(&m); | ||
1080 | } | ||
1081 | #endif /* CONFIG_X86_MCE_INTEL */ | ||
1082 | |||
1083 | /* | ||
1084 | * Periodic polling timer for "silent" machine check errors. If the | ||
1085 | * poller finds an MCE, poll 2x faster. When the poller finds no more | ||
1086 | * errors, poll 2x slower (up to check_interval seconds). | ||
1087 | */ | ||
1088 | static int check_interval = 5 * 60; /* 5 minutes */ | ||
1089 | |||
1090 | static DEFINE_PER_CPU(int, next_interval); /* in jiffies */ | ||
1091 | static DEFINE_PER_CPU(struct timer_list, mce_timer); | ||
1092 | |||
1093 | static void mcheck_timer(unsigned long data) | ||
1094 | { | ||
1095 | struct timer_list *t = &per_cpu(mce_timer, data); | ||
1096 | int *n; | ||
1097 | |||
1098 | WARN_ON(smp_processor_id() != data); | ||
1099 | |||
1100 | if (mce_available(¤t_cpu_data)) { | ||
1101 | machine_check_poll(MCP_TIMESTAMP, | ||
1102 | &__get_cpu_var(mce_poll_banks)); | ||
1103 | } | ||
1104 | |||
1105 | /* | ||
1106 | * Alert userspace if needed. If we logged an MCE, reduce the | ||
1107 | * polling interval, otherwise increase the polling interval. | ||
1108 | */ | ||
1109 | n = &__get_cpu_var(next_interval); | ||
1110 | if (mce_notify_irq()) | ||
1111 | *n = max(*n/2, HZ/100); | ||
1112 | else | ||
1113 | *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ)); | ||
1114 | |||
1115 | t->expires = jiffies + *n; | ||
1116 | add_timer(t); | ||
1117 | } | ||
1118 | |||
1119 | static void mce_do_trigger(struct work_struct *work) | ||
1120 | { | ||
1121 | call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT); | ||
1122 | } | ||
1123 | |||
1124 | static DECLARE_WORK(mce_trigger_work, mce_do_trigger); | ||
1125 | |||
1126 | /* | ||
1127 | * Notify the user(s) about new machine check events. | ||
1128 | * Can be called from interrupt context, but not from machine check/NMI | ||
1129 | * context. | ||
1130 | */ | ||
1131 | int mce_notify_irq(void) | ||
1132 | { | ||
1133 | /* Not more than two messages every minute */ | ||
1134 | static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); | ||
1135 | |||
1136 | clear_thread_flag(TIF_MCE_NOTIFY); | ||
1137 | |||
1138 | if (test_and_clear_bit(0, ¬ify_user)) { | ||
1139 | wake_up_interruptible(&mce_wait); | ||
1140 | |||
1141 | /* | ||
1142 | * There is no risk of missing notifications because | ||
1143 | * work_pending is always cleared before the function is | ||
1144 | * executed. | ||
1145 | */ | ||
1146 | if (trigger[0] && !work_pending(&mce_trigger_work)) | ||
1147 | schedule_work(&mce_trigger_work); | ||
1148 | |||
1149 | if (__ratelimit(&ratelimit)) | ||
1150 | printk(KERN_INFO "Machine check events logged\n"); | ||
1151 | |||
1152 | return 1; | ||
1153 | } | ||
1154 | return 0; | ||
1155 | } | ||
1156 | EXPORT_SYMBOL_GPL(mce_notify_irq); | ||
1157 | |||
1158 | /* | ||
1159 | * Initialize Machine Checks for a CPU. | ||
1160 | */ | ||
1161 | static int mce_cap_init(void) | ||
1162 | { | ||
1163 | unsigned b; | ||
1164 | u64 cap; | ||
1165 | |||
1166 | rdmsrl(MSR_IA32_MCG_CAP, cap); | ||
1167 | |||
1168 | b = cap & MCG_BANKCNT_MASK; | ||
1169 | printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b); | ||
1170 | |||
1171 | if (b > MAX_NR_BANKS) { | ||
1172 | printk(KERN_WARNING | ||
1173 | "MCE: Using only %u machine check banks out of %u\n", | ||
1174 | MAX_NR_BANKS, b); | ||
1175 | b = MAX_NR_BANKS; | ||
1176 | } | ||
1177 | |||
1178 | /* Don't support asymmetric configurations today */ | ||
1179 | WARN_ON(banks != 0 && b != banks); | ||
1180 | banks = b; | ||
1181 | if (!bank) { | ||
1182 | bank = kmalloc(banks * sizeof(u64), GFP_KERNEL); | ||
1183 | if (!bank) | ||
1184 | return -ENOMEM; | ||
1185 | memset(bank, 0xff, banks * sizeof(u64)); | ||
1186 | } | ||
1187 | |||
1188 | /* Use accurate RIP reporting if available. */ | ||
1189 | if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9) | ||
1190 | rip_msr = MSR_IA32_MCG_EIP; | ||
1191 | |||
1192 | if (cap & MCG_SER_P) | ||
1193 | mce_ser = 1; | ||
1194 | |||
1195 | return 0; | ||
1196 | } | ||
1197 | |||
1198 | static void mce_init(void) | ||
1199 | { | ||
1200 | mce_banks_t all_banks; | ||
1201 | u64 cap; | ||
1202 | int i; | ||
1203 | |||
1204 | /* | ||
1205 | * Log the machine checks left over from the previous reset. | ||
1206 | */ | ||
1207 | bitmap_fill(all_banks, MAX_NR_BANKS); | ||
1208 | machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks); | ||
1209 | |||
1210 | set_in_cr4(X86_CR4_MCE); | ||
1211 | |||
1212 | rdmsrl(MSR_IA32_MCG_CAP, cap); | ||
1213 | if (cap & MCG_CTL_P) | ||
1214 | wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); | ||
1215 | |||
1216 | for (i = 0; i < banks; i++) { | ||
1217 | if (skip_bank_init(i)) | ||
1218 | continue; | ||
1219 | wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); | ||
1220 | wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); | ||
1221 | } | ||
1222 | } | ||
1223 | |||
1224 | /* Add per CPU specific workarounds here */ | ||
1225 | static void mce_cpu_quirks(struct cpuinfo_x86 *c) | ||
1226 | { | ||
1227 | /* This should be disabled by the BIOS, but isn't always */ | ||
1228 | if (c->x86_vendor == X86_VENDOR_AMD) { | ||
1229 | if (c->x86 == 15 && banks > 4) { | ||
1230 | /* | ||
1231 | * disable GART TBL walk error reporting, which | ||
1232 | * trips off incorrectly with the IOMMU & 3ware | ||
1233 | * & Cerberus: | ||
1234 | */ | ||
1235 | clear_bit(10, (unsigned long *)&bank[4]); | ||
1236 | } | ||
1237 | if (c->x86 <= 17 && mce_bootlog < 0) { | ||
1238 | /* | ||
1239 | * Lots of broken BIOS around that don't clear them | ||
1240 | * by default and leave crap in there. Don't log: | ||
1241 | */ | ||
1242 | mce_bootlog = 0; | ||
1243 | } | ||
1244 | /* | ||
1245 | * Various K7s with broken bank 0 around. Always disable | ||
1246 | * by default. | ||
1247 | */ | ||
1248 | if (c->x86 == 6) | ||
1249 | bank[0] = 0; | ||
1250 | } | ||
1251 | |||
1252 | if (c->x86_vendor == X86_VENDOR_INTEL) { | ||
1253 | /* | ||
1254 | * SDM documents that on family 6 bank 0 should not be written | ||
1255 | * because it aliases to another special BIOS controlled | ||
1256 | * register. | ||
1257 | * But it's not aliased anymore on model 0x1a+ | ||
1258 | * Don't ignore bank 0 completely because there could be a | ||
1259 | * valid event later, merely don't write CTL0. | ||
1260 | */ | ||
1261 | |||
1262 | if (c->x86 == 6 && c->x86_model < 0x1A) | ||
1263 | __set_bit(0, &dont_init_banks); | ||
1264 | |||
1265 | /* | ||
1266 | * All newer Intel systems support MCE broadcasting. Enable | ||
1267 | * synchronization with a one second timeout. | ||
1268 | */ | ||
1269 | if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) && | ||
1270 | monarch_timeout < 0) | ||
1271 | monarch_timeout = USEC_PER_SEC; | ||
1272 | } | ||
1273 | if (monarch_timeout < 0) | ||
1274 | monarch_timeout = 0; | ||
1275 | if (mce_bootlog != 0) | ||
1276 | mce_panic_timeout = 30; | ||
1277 | } | ||
1278 | |||
1279 | static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c) | ||
1280 | { | ||
1281 | if (c->x86 != 5) | ||
1282 | return; | ||
1283 | switch (c->x86_vendor) { | ||
1284 | case X86_VENDOR_INTEL: | ||
1285 | if (mce_p5_enabled()) | ||
1286 | intel_p5_mcheck_init(c); | ||
1287 | break; | ||
1288 | case X86_VENDOR_CENTAUR: | ||
1289 | winchip_mcheck_init(c); | ||
1290 | break; | ||
1291 | } | ||
1292 | } | ||
1293 | |||
1294 | static void mce_cpu_features(struct cpuinfo_x86 *c) | ||
1295 | { | ||
1296 | switch (c->x86_vendor) { | ||
1297 | case X86_VENDOR_INTEL: | ||
1298 | mce_intel_feature_init(c); | ||
1299 | break; | ||
1300 | case X86_VENDOR_AMD: | ||
1301 | mce_amd_feature_init(c); | ||
1302 | break; | ||
1303 | default: | ||
1304 | break; | ||
1305 | } | ||
1306 | } | ||
1307 | |||
1308 | static void mce_init_timer(void) | ||
1309 | { | ||
1310 | struct timer_list *t = &__get_cpu_var(mce_timer); | ||
1311 | int *n = &__get_cpu_var(next_interval); | ||
1312 | |||
1313 | if (mce_ignore_ce) | ||
1314 | return; | ||
1315 | |||
1316 | *n = check_interval * HZ; | ||
1317 | if (!*n) | ||
1318 | return; | ||
1319 | setup_timer(t, mcheck_timer, smp_processor_id()); | ||
1320 | t->expires = round_jiffies(jiffies + *n); | ||
1321 | add_timer(t); | ||
1322 | } | ||
1323 | |||
1324 | /* | ||
1325 | * Called for each booted CPU to set up machine checks. | ||
1326 | * Must be called with preempt off: | ||
1327 | */ | ||
1328 | void __cpuinit mcheck_init(struct cpuinfo_x86 *c) | ||
1329 | { | ||
1330 | if (mce_disabled) | ||
1331 | return; | ||
1332 | |||
1333 | mce_ancient_init(c); | ||
1334 | |||
1335 | if (!mce_available(c)) | ||
1336 | return; | ||
1337 | |||
1338 | if (mce_cap_init() < 0) { | ||
1339 | mce_disabled = 1; | ||
1340 | return; | ||
1341 | } | ||
1342 | mce_cpu_quirks(c); | ||
1343 | |||
1344 | machine_check_vector = do_machine_check; | ||
1345 | |||
1346 | mce_init(); | ||
1347 | mce_cpu_features(c); | ||
1348 | mce_init_timer(); | ||
1349 | INIT_WORK(&__get_cpu_var(mce_work), mce_process_work); | ||
1350 | } | ||
1351 | |||
1352 | /* | ||
1353 | * Character device to read and clear the MCE log. | ||
1354 | */ | ||
1355 | |||
1356 | static DEFINE_SPINLOCK(mce_state_lock); | ||
1357 | static int open_count; /* #times opened */ | ||
1358 | static int open_exclu; /* already open exclusive? */ | ||
1359 | |||
1360 | static int mce_open(struct inode *inode, struct file *file) | ||
1361 | { | ||
1362 | spin_lock(&mce_state_lock); | ||
1363 | |||
1364 | if (open_exclu || (open_count && (file->f_flags & O_EXCL))) { | ||
1365 | spin_unlock(&mce_state_lock); | ||
1366 | |||
1367 | return -EBUSY; | ||
1368 | } | ||
1369 | |||
1370 | if (file->f_flags & O_EXCL) | ||
1371 | open_exclu = 1; | ||
1372 | open_count++; | ||
1373 | |||
1374 | spin_unlock(&mce_state_lock); | ||
1375 | |||
1376 | return nonseekable_open(inode, file); | ||
1377 | } | ||
1378 | |||
1379 | static int mce_release(struct inode *inode, struct file *file) | ||
1380 | { | ||
1381 | spin_lock(&mce_state_lock); | ||
1382 | |||
1383 | open_count--; | ||
1384 | open_exclu = 0; | ||
1385 | |||
1386 | spin_unlock(&mce_state_lock); | ||
1387 | |||
1388 | return 0; | ||
1389 | } | ||
1390 | |||
/*
 * Cross-call helper: @data is an array of unsigned long indexed by CPU
 * number; store the calling CPU's current TSC in its own slot.
 */
static void collect_tscs(void *data)
{
	unsigned long *tsc_by_cpu = data;

	rdtscll(tsc_by_cpu[smp_processor_id()]);
}
1397 | |||
1398 | static DEFINE_MUTEX(mce_read_mutex); | ||
1399 | |||
1400 | static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, | ||
1401 | loff_t *off) | ||
1402 | { | ||
1403 | char __user *buf = ubuf; | ||
1404 | unsigned long *cpu_tsc; | ||
1405 | unsigned prev, next; | ||
1406 | int i, err; | ||
1407 | |||
1408 | cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL); | ||
1409 | if (!cpu_tsc) | ||
1410 | return -ENOMEM; | ||
1411 | |||
1412 | mutex_lock(&mce_read_mutex); | ||
1413 | next = rcu_dereference(mcelog.next); | ||
1414 | |||
1415 | /* Only supports full reads right now */ | ||
1416 | if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) { | ||
1417 | mutex_unlock(&mce_read_mutex); | ||
1418 | kfree(cpu_tsc); | ||
1419 | |||
1420 | return -EINVAL; | ||
1421 | } | ||
1422 | |||
1423 | err = 0; | ||
1424 | prev = 0; | ||
1425 | do { | ||
1426 | for (i = prev; i < next; i++) { | ||
1427 | unsigned long start = jiffies; | ||
1428 | |||
1429 | while (!mcelog.entry[i].finished) { | ||
1430 | if (time_after_eq(jiffies, start + 2)) { | ||
1431 | memset(mcelog.entry + i, 0, | ||
1432 | sizeof(struct mce)); | ||
1433 | goto timeout; | ||
1434 | } | ||
1435 | cpu_relax(); | ||
1436 | } | ||
1437 | smp_rmb(); | ||
1438 | err |= copy_to_user(buf, mcelog.entry + i, | ||
1439 | sizeof(struct mce)); | ||
1440 | buf += sizeof(struct mce); | ||
1441 | timeout: | ||
1442 | ; | ||
1443 | } | ||
1444 | |||
1445 | memset(mcelog.entry + prev, 0, | ||
1446 | (next - prev) * sizeof(struct mce)); | ||
1447 | prev = next; | ||
1448 | next = cmpxchg(&mcelog.next, prev, 0); | ||
1449 | } while (next != prev); | ||
1450 | |||
1451 | synchronize_sched(); | ||
1452 | |||
1453 | /* | ||
1454 | * Collect entries that were still getting written before the | ||
1455 | * synchronize. | ||
1456 | */ | ||
1457 | on_each_cpu(collect_tscs, cpu_tsc, 1); | ||
1458 | |||
1459 | for (i = next; i < MCE_LOG_LEN; i++) { | ||
1460 | if (mcelog.entry[i].finished && | ||
1461 | mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) { | ||
1462 | err |= copy_to_user(buf, mcelog.entry+i, | ||
1463 | sizeof(struct mce)); | ||
1464 | smp_rmb(); | ||
1465 | buf += sizeof(struct mce); | ||
1466 | memset(&mcelog.entry[i], 0, sizeof(struct mce)); | ||
1467 | } | ||
1468 | } | ||
1469 | mutex_unlock(&mce_read_mutex); | ||
1470 | kfree(cpu_tsc); | ||
1471 | |||
1472 | return err ? -EFAULT : buf - ubuf; | ||
1473 | } | ||
1474 | |||
1475 | static unsigned int mce_poll(struct file *file, poll_table *wait) | ||
1476 | { | ||
1477 | poll_wait(file, &mce_wait, wait); | ||
1478 | if (rcu_dereference(mcelog.next)) | ||
1479 | return POLLIN | POLLRDNORM; | ||
1480 | return 0; | ||
1481 | } | ||
1482 | |||
1483 | static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg) | ||
1484 | { | ||
1485 | int __user *p = (int __user *)arg; | ||
1486 | |||
1487 | if (!capable(CAP_SYS_ADMIN)) | ||
1488 | return -EPERM; | ||
1489 | |||
1490 | switch (cmd) { | ||
1491 | case MCE_GET_RECORD_LEN: | ||
1492 | return put_user(sizeof(struct mce), p); | ||
1493 | case MCE_GET_LOG_LEN: | ||
1494 | return put_user(MCE_LOG_LEN, p); | ||
1495 | case MCE_GETCLEAR_FLAGS: { | ||
1496 | unsigned flags; | ||
1497 | |||
1498 | do { | ||
1499 | flags = mcelog.flags; | ||
1500 | } while (cmpxchg(&mcelog.flags, flags, 0) != flags); | ||
1501 | |||
1502 | return put_user(flags, p); | ||
1503 | } | ||
1504 | default: | ||
1505 | return -ENOTTY; | ||
1506 | } | ||
1507 | } | ||
1508 | |||
1509 | /* Modified in mce-inject.c, so not static or const */ | ||
1510 | struct file_operations mce_chrdev_ops = { | ||
1511 | .open = mce_open, | ||
1512 | .release = mce_release, | ||
1513 | .read = mce_read, | ||
1514 | .poll = mce_poll, | ||
1515 | .unlocked_ioctl = mce_ioctl, | ||
1516 | }; | ||
1517 | EXPORT_SYMBOL_GPL(mce_chrdev_ops); | ||
1518 | |||
1519 | static struct miscdevice mce_log_device = { | ||
1520 | MISC_MCELOG_MINOR, | ||
1521 | "mcelog", | ||
1522 | &mce_chrdev_ops, | ||
1523 | }; | ||
1524 | |||
1525 | /* | ||
1526 | * mce=off Disables machine check | ||
1527 | * mce=no_cmci Disables CMCI | ||
1528 | * mce=dont_log_ce Clears corrected events silently, no log created for CEs. | ||
1529 | * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared. | ||
1530 | * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above) | ||
1531 | * monarchtimeout is how long to wait for other CPUs on machine | ||
1532 | * check, or 0 to not wait | ||
1533 | * mce=bootlog Log MCEs from before booting. Disabled by default on AMD. | ||
1534 | * mce=nobootlog Don't log MCEs from before booting. | ||
1535 | */ | ||
1536 | static int __init mcheck_enable(char *str) | ||
1537 | { | ||
1538 | if (*str == 0) | ||
1539 | enable_p5_mce(); | ||
1540 | if (*str == '=') | ||
1541 | str++; | ||
1542 | if (!strcmp(str, "off")) | ||
1543 | mce_disabled = 1; | ||
1544 | else if (!strcmp(str, "no_cmci")) | ||
1545 | mce_cmci_disabled = 1; | ||
1546 | else if (!strcmp(str, "dont_log_ce")) | ||
1547 | mce_dont_log_ce = 1; | ||
1548 | else if (!strcmp(str, "ignore_ce")) | ||
1549 | mce_ignore_ce = 1; | ||
1550 | else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog")) | ||
1551 | mce_bootlog = (str[0] == 'b'); | ||
1552 | else if (isdigit(str[0])) { | ||
1553 | get_option(&str, &tolerant); | ||
1554 | if (*str == ',') { | ||
1555 | ++str; | ||
1556 | get_option(&str, &monarch_timeout); | ||
1557 | } | ||
1558 | } else { | ||
1559 | printk(KERN_INFO "mce argument %s ignored. Please use /sys\n", | ||
1560 | str); | ||
1561 | return 0; | ||
1562 | } | ||
1563 | return 1; | ||
1564 | } | ||
1565 | __setup("mce", mcheck_enable); | ||
1566 | |||
1567 | /* | ||
1568 | * Sysfs support | ||
1569 | */ | ||
1570 | |||
1571 | /* | ||
1572 | * Disable machine checks on suspend and shutdown. We can't really handle | ||
1573 | * them later. | ||
1574 | */ | ||
1575 | static int mce_disable(void) | ||
1576 | { | ||
1577 | int i; | ||
1578 | |||
1579 | for (i = 0; i < banks; i++) { | ||
1580 | if (!skip_bank_init(i)) | ||
1581 | wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); | ||
1582 | } | ||
1583 | return 0; | ||
1584 | } | ||
1585 | |||
1586 | static int mce_suspend(struct sys_device *dev, pm_message_t state) | ||
1587 | { | ||
1588 | return mce_disable(); | ||
1589 | } | ||
1590 | |||
/* sysdev shutdown hook: switch the banks off; @dev is unused. */
static int mce_shutdown(struct sys_device *dev)
{
	return mce_disable();
}
1595 | |||
1596 | /* | ||
1597 | * On resume clear all MCE state. Don't want to see leftovers from the BIOS. | ||
1598 | * Only one CPU is active at this time, the others get re-added later using | ||
1599 | * CPU hotplug: | ||
1600 | */ | ||
1601 | static int mce_resume(struct sys_device *dev) | ||
1602 | { | ||
1603 | mce_init(); | ||
1604 | mce_cpu_features(¤t_cpu_data); | ||
1605 | |||
1606 | return 0; | ||
1607 | } | ||
1608 | |||
1609 | static void mce_cpu_restart(void *data) | ||
1610 | { | ||
1611 | del_timer_sync(&__get_cpu_var(mce_timer)); | ||
1612 | if (mce_available(¤t_cpu_data)) | ||
1613 | mce_init(); | ||
1614 | mce_init_timer(); | ||
1615 | } | ||
1616 | |||
1617 | /* Reinit MCEs after user configuration changes */ | ||
1618 | static void mce_restart(void) | ||
1619 | { | ||
1620 | on_each_cpu(mce_cpu_restart, NULL, 1); | ||
1621 | } | ||
1622 | |||
1623 | static struct sysdev_class mce_sysclass = { | ||
1624 | .suspend = mce_suspend, | ||
1625 | .shutdown = mce_shutdown, | ||
1626 | .resume = mce_resume, | ||
1627 | .name = "machinecheck", | ||
1628 | }; | ||
1629 | |||
1630 | DEFINE_PER_CPU(struct sys_device, mce_dev); | ||
1631 | |||
1632 | __cpuinitdata | ||
1633 | void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); | ||
1634 | |||
1635 | static struct sysdev_attribute *bank_attrs; | ||
1636 | |||
1637 | static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, | ||
1638 | char *buf) | ||
1639 | { | ||
1640 | u64 b = bank[attr - bank_attrs]; | ||
1641 | |||
1642 | return sprintf(buf, "%llx\n", b); | ||
1643 | } | ||
1644 | |||
1645 | static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, | ||
1646 | const char *buf, size_t size) | ||
1647 | { | ||
1648 | u64 new; | ||
1649 | |||
1650 | if (strict_strtoull(buf, 0, &new) < 0) | ||
1651 | return -EINVAL; | ||
1652 | |||
1653 | bank[attr - bank_attrs] = new; | ||
1654 | mce_restart(); | ||
1655 | |||
1656 | return size; | ||
1657 | } | ||
1658 | |||
1659 | static ssize_t | ||
1660 | show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf) | ||
1661 | { | ||
1662 | strcpy(buf, trigger); | ||
1663 | strcat(buf, "\n"); | ||
1664 | return strlen(trigger) + 1; | ||
1665 | } | ||
1666 | |||
1667 | static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, | ||
1668 | const char *buf, size_t siz) | ||
1669 | { | ||
1670 | char *p; | ||
1671 | int len; | ||
1672 | |||
1673 | strncpy(trigger, buf, sizeof(trigger)); | ||
1674 | trigger[sizeof(trigger)-1] = 0; | ||
1675 | len = strlen(trigger); | ||
1676 | p = strchr(trigger, '\n'); | ||
1677 | |||
1678 | if (*p) | ||
1679 | *p = 0; | ||
1680 | |||
1681 | return len; | ||
1682 | } | ||
1683 | |||
1684 | static ssize_t store_int_with_restart(struct sys_device *s, | ||
1685 | struct sysdev_attribute *attr, | ||
1686 | const char *buf, size_t size) | ||
1687 | { | ||
1688 | ssize_t ret = sysdev_store_int(s, attr, buf, size); | ||
1689 | mce_restart(); | ||
1690 | return ret; | ||
1691 | } | ||
1692 | |||
1693 | static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); | ||
1694 | static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); | ||
1695 | static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout); | ||
1696 | |||
1697 | static struct sysdev_ext_attribute attr_check_interval = { | ||
1698 | _SYSDEV_ATTR(check_interval, 0644, sysdev_show_int, | ||
1699 | store_int_with_restart), | ||
1700 | &check_interval | ||
1701 | }; | ||
1702 | |||
1703 | static struct sysdev_attribute *mce_attrs[] = { | ||
1704 | &attr_tolerant.attr, &attr_check_interval.attr, &attr_trigger, | ||
1705 | &attr_monarch_timeout.attr, | ||
1706 | NULL | ||
1707 | }; | ||
1708 | |||
1709 | static cpumask_var_t mce_dev_initialized; | ||
1710 | |||
1711 | /* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */ | ||
1712 | static __cpuinit int mce_create_device(unsigned int cpu) | ||
1713 | { | ||
1714 | int err; | ||
1715 | int i; | ||
1716 | |||
1717 | if (!mce_available(&boot_cpu_data)) | ||
1718 | return -EIO; | ||
1719 | |||
1720 | memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject)); | ||
1721 | per_cpu(mce_dev, cpu).id = cpu; | ||
1722 | per_cpu(mce_dev, cpu).cls = &mce_sysclass; | ||
1723 | |||
1724 | err = sysdev_register(&per_cpu(mce_dev, cpu)); | ||
1725 | if (err) | ||
1726 | return err; | ||
1727 | |||
1728 | for (i = 0; mce_attrs[i]; i++) { | ||
1729 | err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); | ||
1730 | if (err) | ||
1731 | goto error; | ||
1732 | } | ||
1733 | for (i = 0; i < banks; i++) { | ||
1734 | err = sysdev_create_file(&per_cpu(mce_dev, cpu), | ||
1735 | &bank_attrs[i]); | ||
1736 | if (err) | ||
1737 | goto error2; | ||
1738 | } | ||
1739 | cpumask_set_cpu(cpu, mce_dev_initialized); | ||
1740 | |||
1741 | return 0; | ||
1742 | error2: | ||
1743 | while (--i >= 0) | ||
1744 | sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]); | ||
1745 | error: | ||
1746 | while (--i >= 0) | ||
1747 | sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); | ||
1748 | |||
1749 | sysdev_unregister(&per_cpu(mce_dev, cpu)); | ||
1750 | |||
1751 | return err; | ||
1752 | } | ||
1753 | |||
1754 | static __cpuinit void mce_remove_device(unsigned int cpu) | ||
1755 | { | ||
1756 | int i; | ||
1757 | |||
1758 | if (!cpumask_test_cpu(cpu, mce_dev_initialized)) | ||
1759 | return; | ||
1760 | |||
1761 | for (i = 0; mce_attrs[i]; i++) | ||
1762 | sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); | ||
1763 | |||
1764 | for (i = 0; i < banks; i++) | ||
1765 | sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]); | ||
1766 | |||
1767 | sysdev_unregister(&per_cpu(mce_dev, cpu)); | ||
1768 | cpumask_clear_cpu(cpu, mce_dev_initialized); | ||
1769 | } | ||
1770 | |||
1771 | /* Make sure there are no machine checks on offlined CPUs. */ | ||
1772 | static void mce_disable_cpu(void *h) | ||
1773 | { | ||
1774 | unsigned long action = *(unsigned long *)h; | ||
1775 | int i; | ||
1776 | |||
1777 | if (!mce_available(¤t_cpu_data)) | ||
1778 | return; | ||
1779 | if (!(action & CPU_TASKS_FROZEN)) | ||
1780 | cmci_clear(); | ||
1781 | for (i = 0; i < banks; i++) { | ||
1782 | if (!skip_bank_init(i)) | ||
1783 | wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); | ||
1784 | } | ||
1785 | } | ||
1786 | |||
1787 | static void mce_reenable_cpu(void *h) | ||
1788 | { | ||
1789 | unsigned long action = *(unsigned long *)h; | ||
1790 | int i; | ||
1791 | |||
1792 | if (!mce_available(¤t_cpu_data)) | ||
1793 | return; | ||
1794 | |||
1795 | if (!(action & CPU_TASKS_FROZEN)) | ||
1796 | cmci_reenable(); | ||
1797 | for (i = 0; i < banks; i++) { | ||
1798 | if (!skip_bank_init(i)) | ||
1799 | wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]); | ||
1800 | } | ||
1801 | } | ||
1802 | |||
1803 | /* Get notified when a cpu comes on/off. Be hotplug friendly. */ | ||
1804 | static int __cpuinit | ||
1805 | mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) | ||
1806 | { | ||
1807 | unsigned int cpu = (unsigned long)hcpu; | ||
1808 | struct timer_list *t = &per_cpu(mce_timer, cpu); | ||
1809 | |||
1810 | switch (action) { | ||
1811 | case CPU_ONLINE: | ||
1812 | case CPU_ONLINE_FROZEN: | ||
1813 | mce_create_device(cpu); | ||
1814 | if (threshold_cpu_callback) | ||
1815 | threshold_cpu_callback(action, cpu); | ||
1816 | break; | ||
1817 | case CPU_DEAD: | ||
1818 | case CPU_DEAD_FROZEN: | ||
1819 | if (threshold_cpu_callback) | ||
1820 | threshold_cpu_callback(action, cpu); | ||
1821 | mce_remove_device(cpu); | ||
1822 | break; | ||
1823 | case CPU_DOWN_PREPARE: | ||
1824 | case CPU_DOWN_PREPARE_FROZEN: | ||
1825 | del_timer_sync(t); | ||
1826 | smp_call_function_single(cpu, mce_disable_cpu, &action, 1); | ||
1827 | break; | ||
1828 | case CPU_DOWN_FAILED: | ||
1829 | case CPU_DOWN_FAILED_FROZEN: | ||
1830 | t->expires = round_jiffies(jiffies + | ||
1831 | __get_cpu_var(next_interval)); | ||
1832 | add_timer_on(t, cpu); | ||
1833 | smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); | ||
1834 | break; | ||
1835 | case CPU_POST_DEAD: | ||
1836 | /* intentionally ignoring frozen here */ | ||
1837 | cmci_rediscover(cpu); | ||
1838 | break; | ||
1839 | } | ||
1840 | return NOTIFY_OK; | ||
1841 | } | ||
1842 | |||
1843 | static struct notifier_block mce_cpu_notifier __cpuinitdata = { | ||
1844 | .notifier_call = mce_cpu_callback, | ||
1845 | }; | ||
1846 | |||
1847 | static __init int mce_init_banks(void) | ||
1848 | { | ||
1849 | int i; | ||
1850 | |||
1851 | bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks, | ||
1852 | GFP_KERNEL); | ||
1853 | if (!bank_attrs) | ||
1854 | return -ENOMEM; | ||
1855 | |||
1856 | for (i = 0; i < banks; i++) { | ||
1857 | struct sysdev_attribute *a = &bank_attrs[i]; | ||
1858 | |||
1859 | a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i); | ||
1860 | if (!a->attr.name) | ||
1861 | goto nomem; | ||
1862 | |||
1863 | a->attr.mode = 0644; | ||
1864 | a->show = show_bank; | ||
1865 | a->store = set_bank; | ||
1866 | } | ||
1867 | return 0; | ||
1868 | |||
1869 | nomem: | ||
1870 | while (--i >= 0) | ||
1871 | kfree(bank_attrs[i].attr.name); | ||
1872 | kfree(bank_attrs); | ||
1873 | bank_attrs = NULL; | ||
1874 | |||
1875 | return -ENOMEM; | ||
1876 | } | ||
1877 | |||
1878 | static __init int mce_init_device(void) | ||
1879 | { | ||
1880 | int err; | ||
1881 | int i = 0; | ||
1882 | |||
1883 | if (!mce_available(&boot_cpu_data)) | ||
1884 | return -EIO; | ||
1885 | |||
1886 | alloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL); | ||
1887 | |||
1888 | err = mce_init_banks(); | ||
1889 | if (err) | ||
1890 | return err; | ||
1891 | |||
1892 | err = sysdev_class_register(&mce_sysclass); | ||
1893 | if (err) | ||
1894 | return err; | ||
1895 | |||
1896 | for_each_online_cpu(i) { | ||
1897 | err = mce_create_device(i); | ||
1898 | if (err) | ||
1899 | return err; | ||
1900 | } | ||
1901 | |||
1902 | register_hotcpu_notifier(&mce_cpu_notifier); | ||
1903 | misc_register(&mce_log_device); | ||
1904 | |||
1905 | return err; | ||
1906 | } | ||
1907 | |||
1908 | device_initcall(mce_init_device); | ||
1909 | |||
1910 | #else /* CONFIG_X86_OLD_MCE: */ | ||
1911 | |||
1912 | int nr_mce_banks; | ||
1913 | EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */ | ||
1914 | |||
1915 | /* This has to be run for each processor */ | ||
1916 | void mcheck_init(struct cpuinfo_x86 *c) | ||
1917 | { | ||
1918 | if (mce_disabled == 1) | ||
1919 | return; | ||
1920 | |||
1921 | switch (c->x86_vendor) { | ||
1922 | case X86_VENDOR_AMD: | ||
1923 | amd_mcheck_init(c); | ||
1924 | break; | ||
1925 | |||
1926 | case X86_VENDOR_INTEL: | ||
1927 | if (c->x86 == 5) | ||
1928 | intel_p5_mcheck_init(c); | ||
1929 | if (c->x86 == 6) | ||
1930 | intel_p6_mcheck_init(c); | ||
1931 | if (c->x86 == 15) | ||
1932 | intel_p4_mcheck_init(c); | ||
1933 | break; | ||
1934 | |||
1935 | case X86_VENDOR_CENTAUR: | ||
1936 | if (c->x86 == 5) | ||
1937 | winchip_mcheck_init(c); | ||
1938 | break; | ||
1939 | |||
1940 | default: | ||
1941 | break; | ||
1942 | } | ||
1943 | printk(KERN_INFO "mce: CPU supports %d MCE banks\n", nr_mce_banks); | ||
1944 | } | ||
1945 | |||
1946 | static int __init mcheck_enable(char *str) | ||
1947 | { | ||
1948 | mce_disabled = -1; | ||
1949 | return 1; | ||
1950 | } | ||
1951 | |||
1952 | __setup("mce", mcheck_enable); | ||
1953 | |||
1954 | #endif /* CONFIG_X86_OLD_MCE */ | ||
1955 | |||
1956 | /* | ||
1957 | * Old style boot options parsing. Only for compatibility. | ||
1958 | */ | ||
1959 | static int __init mcheck_disable(char *str) | ||
1960 | { | ||
1961 | mce_disabled = 1; | ||
1962 | return 1; | ||
1963 | } | ||
1964 | __setup("nomce", mcheck_disable); | ||
diff --git a/arch/x86/kernel/cpu/mcheck/mce.h b/arch/x86/kernel/cpu/mcheck/mce.h index ae9f628838f1..84a552b458c8 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.h +++ b/arch/x86/kernel/cpu/mcheck/mce.h | |||
@@ -1,14 +1,38 @@ | |||
1 | #include <linux/init.h> | 1 | #include <linux/init.h> |
2 | #include <asm/mce.h> | 2 | #include <asm/mce.h> |
3 | 3 | ||
4 | #ifdef CONFIG_X86_OLD_MCE | ||
4 | void amd_mcheck_init(struct cpuinfo_x86 *c); | 5 | void amd_mcheck_init(struct cpuinfo_x86 *c); |
5 | void intel_p4_mcheck_init(struct cpuinfo_x86 *c); | 6 | void intel_p4_mcheck_init(struct cpuinfo_x86 *c); |
6 | void intel_p5_mcheck_init(struct cpuinfo_x86 *c); | ||
7 | void intel_p6_mcheck_init(struct cpuinfo_x86 *c); | 7 | void intel_p6_mcheck_init(struct cpuinfo_x86 *c); |
8 | #endif | ||
9 | |||
/*
 * P5/WinChip support: real declarations plus the mce_p5_enable flag accessors
 * when CONFIG_X86_ANCIENT_MCE is set, otherwise empty inline stubs
 * (mce_p5_enabled() then always returns 0).
 */
10 | #ifdef CONFIG_X86_ANCIENT_MCE | ||
11 | void intel_p5_mcheck_init(struct cpuinfo_x86 *c); | ||
8 | void winchip_mcheck_init(struct cpuinfo_x86 *c); | 12 | void winchip_mcheck_init(struct cpuinfo_x86 *c); |
13 | extern int mce_p5_enable; | ||
14 | static inline int mce_p5_enabled(void) { return mce_p5_enable; } | ||
15 | static inline void enable_p5_mce(void) { mce_p5_enable = 1; } | ||
16 | #else | ||
17 | static inline void intel_p5_mcheck_init(struct cpuinfo_x86 *c) {} | ||
18 | static inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {} | ||
19 | static inline int mce_p5_enabled(void) { return 0; } | ||
20 | static inline void enable_p5_mce(void) { } | ||
21 | #endif | ||
9 | 22 | ||
10 | /* Call the installed machine check handler for this CPU setup. */ | 23 | /* Call the installed machine check handler for this CPU setup. */ |
11 | extern void (*machine_check_vector)(struct pt_regs *, long error_code); | 24 | extern void (*machine_check_vector)(struct pt_regs *, long error_code); |
12 | 25 | ||
/*
 * nr_mce_banks and a real intel_set_thermal_handler() are only declared for
 * CONFIG_X86_OLD_MCE; otherwise intel_set_thermal_handler() is an empty stub.
 */
26 | #ifdef CONFIG_X86_OLD_MCE | ||
27 | |||
13 | extern int nr_mce_banks; | 28 | extern int nr_mce_banks; |
14 | 29 | ||
30 | void intel_set_thermal_handler(void); | ||
31 | |||
32 | #else | ||
33 | |||
34 | static inline void intel_set_thermal_handler(void) { } | ||
35 | |||
36 | #endif | ||
37 | |||
38 | void intel_init_thermal(struct cpuinfo_x86 *c); | ||
diff --git a/arch/x86/kernel/cpu/mcheck/mce_32.c b/arch/x86/kernel/cpu/mcheck/mce_32.c deleted file mode 100644 index 3552119b091d..000000000000 --- a/arch/x86/kernel/cpu/mcheck/mce_32.c +++ /dev/null | |||
@@ -1,76 +0,0 @@ | |||
1 | /* | ||
2 | * mce.c - x86 Machine Check Exception Reporting | ||
3 | * (c) 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>, Dave Jones <davej@redhat.com> | ||
4 | */ | ||
5 | |||
6 | #include <linux/init.h> | ||
7 | #include <linux/types.h> | ||
8 | #include <linux/kernel.h> | ||
9 | #include <linux/module.h> | ||
10 | #include <linux/smp.h> | ||
11 | #include <linux/thread_info.h> | ||
12 | |||
13 | #include <asm/processor.h> | ||
14 | #include <asm/system.h> | ||
15 | #include <asm/mce.h> | ||
16 | |||
17 | #include "mce.h" | ||
18 | |||
/* mce_disabled: 1 makes mcheck_init() return early; set to 1 by "nomce" and to -1 by "mce". */
19 | int mce_disabled; | ||
/* Bank count; exported GPL-only for the user named in the trailing comment below. */
20 | int nr_mce_banks; | ||
21 | |||
22 | EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */ | ||
23 | |||
24 | /* Handle unconfigured int18 (should never happen) */ | ||
25 | static void unexpected_machine_check(struct pt_regs *regs, long error_code) | ||
26 | { | ||
27 | printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", smp_processor_id()); | ||
28 | } | ||
29 | |||
30 | /* Call the installed machine check handler for this CPU setup. */ | ||
31 | void (*machine_check_vector)(struct pt_regs *, long error_code) = unexpected_machine_check; | ||
32 | |||
33 | /* This has to be run for each processor */ | ||
34 | void mcheck_init(struct cpuinfo_x86 *c) | ||
35 | { | ||
36 | if (mce_disabled == 1) | ||
37 | return; | ||
38 | |||
39 | switch (c->x86_vendor) { | ||
40 | case X86_VENDOR_AMD: | ||
41 | amd_mcheck_init(c); | ||
42 | break; | ||
43 | |||
44 | case X86_VENDOR_INTEL: | ||
45 | if (c->x86 == 5) | ||
46 | intel_p5_mcheck_init(c); | ||
47 | if (c->x86 == 6) | ||
48 | intel_p6_mcheck_init(c); | ||
49 | if (c->x86 == 15) | ||
50 | intel_p4_mcheck_init(c); | ||
51 | break; | ||
52 | |||
53 | case X86_VENDOR_CENTAUR: | ||
54 | if (c->x86 == 5) | ||
55 | winchip_mcheck_init(c); | ||
56 | break; | ||
57 | |||
58 | default: | ||
59 | break; | ||
60 | } | ||
61 | } | ||
62 | |||
63 | static int __init mcheck_disable(char *str) | ||
64 | { | ||
65 | mce_disabled = 1; | ||
66 | return 1; | ||
67 | } | ||
68 | |||
69 | static int __init mcheck_enable(char *str) | ||
70 | { | ||
71 | mce_disabled = -1; | ||
72 | return 1; | ||
73 | } | ||
74 | |||
75 | __setup("nomce", mcheck_disable); | ||
76 | __setup("mce", mcheck_enable); | ||
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c deleted file mode 100644 index 6fb0b359d2a5..000000000000 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ /dev/null | |||
@@ -1,1187 +0,0 @@ | |||
1 | /* | ||
2 | * Machine check handler. | ||
3 | * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. | ||
4 | * Rest from unknown author(s). | ||
5 | * 2004 Andi Kleen. Rewrote most of it. | ||
6 | * Copyright 2008 Intel Corporation | ||
7 | * Author: Andi Kleen | ||
8 | */ | ||
9 | |||
10 | #include <linux/init.h> | ||
11 | #include <linux/types.h> | ||
12 | #include <linux/kernel.h> | ||
13 | #include <linux/sched.h> | ||
14 | #include <linux/smp_lock.h> | ||
15 | #include <linux/string.h> | ||
16 | #include <linux/rcupdate.h> | ||
17 | #include <linux/kallsyms.h> | ||
18 | #include <linux/sysdev.h> | ||
19 | #include <linux/miscdevice.h> | ||
20 | #include <linux/fs.h> | ||
21 | #include <linux/capability.h> | ||
22 | #include <linux/cpu.h> | ||
23 | #include <linux/percpu.h> | ||
24 | #include <linux/poll.h> | ||
25 | #include <linux/thread_info.h> | ||
26 | #include <linux/ctype.h> | ||
27 | #include <linux/kmod.h> | ||
28 | #include <linux/kdebug.h> | ||
29 | #include <linux/kobject.h> | ||
30 | #include <linux/sysfs.h> | ||
31 | #include <linux/ratelimit.h> | ||
32 | #include <asm/processor.h> | ||
33 | #include <asm/msr.h> | ||
34 | #include <asm/mce.h> | ||
35 | #include <asm/uaccess.h> | ||
36 | #include <asm/smp.h> | ||
37 | #include <asm/idle.h> | ||
38 | |||
39 | #define MISC_MCELOG_MINOR 227 | ||
40 | |||
/* Incremented on entry to do_machine_check() and decremented when it leaves. */
41 | atomic_t mce_entry; | ||
42 | |||
/* Non-zero makes mce_available() return 0; set by "nomce", "mce=off" or a failed mce_cap_init(). */
43 | static int mce_dont_init; | ||
44 | |||
45 | /* | ||
46 | * Tolerant levels: | ||
47 | * 0: always panic on uncorrected errors, log corrected errors | ||
48 | * 1: panic or SIGBUS on uncorrected errors, log corrected errors | ||
49 | * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors | ||
50 | * 3: never panic or SIGBUS, log all errors (for testing only) | ||
51 | */ | ||
52 | static int tolerant = 1; | ||
/* Number of banks in use: low byte of MCG_CAP, capped at MAX_NR_BANKS (see mce_cap_init()). */
53 | static int banks; | ||
/* Per-bank value written to the bank's CTL MSR by mce_init(); allocated and filled with 0xff bytes in mce_cap_init(). */
54 | static u64 *bank; | ||
/* Bit 0 is set by mce_log() and consumed by mce_notify_user(). */
55 | static unsigned long notify_user; | ||
/* MSR to read the instruction pointer from in mce_get_rip(); 0 when not used. */
56 | static int rip_msr; | ||
/* -1 until decided; 0 makes mce_init() poll leftover errors with MCP_DONTLOG. */
57 | static int mce_bootlog = -1; | ||
/* Incremented once per mce_log() call. */
58 | static atomic_t mce_events; | ||
59 | |||
/* Program handed to call_usermodehelper() by mce_do_trigger(); an empty string disables the trigger. */
60 | static char trigger[128]; | ||
61 | static char *trigger_argv[2] = { trigger, NULL }; | ||
62 | |||
/* Woken by mce_notify_user(); waited on through mce_poll(). */
63 | static DECLARE_WAIT_QUEUE_HEAD(mce_wait); | ||
64 | |||
65 | /* MCA banks polled by the period polling timer for corrected events */ | ||
66 | DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { | ||
67 | [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL | ||
68 | }; | ||
69 | |||
70 | /* Do initial initialization of a struct mce */ | ||
71 | void mce_setup(struct mce *m) | ||
72 | { | ||
73 | memset(m, 0, sizeof(struct mce)); | ||
74 | m->cpu = smp_processor_id(); | ||
75 | rdtscll(m->tsc); | ||
76 | } | ||
77 | |||
78 | /* | ||
79 | * Lockless MCE logging infrastructure. | ||
80 | * This avoids deadlocks on printk locks without having to break locks. Also | ||
81 | * separate MCEs from kernel messages to avoid bogus bug reports. | ||
82 | */ | ||
83 | |||
/*
 * The log buffer itself. Only the first two members are initialised (positionally);
 * the members used elsewhere in this file (next, flags, entry[]) therefore start zeroed.
 */
84 | static struct mce_log mcelog = { | ||
85 | MCE_LOG_SIGNATURE, | ||
86 | MCE_LOG_LEN, | ||
87 | }; | ||
88 | |||
89 | void mce_log(struct mce *mce) | ||
90 | { | ||
91 | unsigned next, entry; | ||
92 | atomic_inc(&mce_events); | ||
93 | mce->finished = 0; | ||
94 | wmb(); | ||
95 | for (;;) { | ||
96 | entry = rcu_dereference(mcelog.next); | ||
97 | for (;;) { | ||
98 | /* When the buffer fills up discard new entries. Assume | ||
99 | that the earlier errors are the more interesting. */ | ||
100 | if (entry >= MCE_LOG_LEN) { | ||
101 | set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags); | ||
102 | return; | ||
103 | } | ||
104 | /* Old left over entry. Skip. */ | ||
105 | if (mcelog.entry[entry].finished) { | ||
106 | entry++; | ||
107 | continue; | ||
108 | } | ||
109 | break; | ||
110 | } | ||
111 | smp_rmb(); | ||
112 | next = entry + 1; | ||
113 | if (cmpxchg(&mcelog.next, entry, next) == entry) | ||
114 | break; | ||
115 | } | ||
116 | memcpy(mcelog.entry + entry, mce, sizeof(struct mce)); | ||
117 | wmb(); | ||
118 | mcelog.entry[entry].finished = 1; | ||
119 | wmb(); | ||
120 | |||
121 | set_bit(0, ¬ify_user); | ||
122 | } | ||
123 | |||
124 | static void print_mce(struct mce *m) | ||
125 | { | ||
126 | printk(KERN_EMERG "\n" | ||
127 | KERN_EMERG "HARDWARE ERROR\n" | ||
128 | KERN_EMERG | ||
129 | "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", | ||
130 | m->cpu, m->mcgstatus, m->bank, m->status); | ||
131 | if (m->ip) { | ||
132 | printk(KERN_EMERG "RIP%s %02x:<%016Lx> ", | ||
133 | !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", | ||
134 | m->cs, m->ip); | ||
135 | if (m->cs == __KERNEL_CS) | ||
136 | print_symbol("{%s}", m->ip); | ||
137 | printk("\n"); | ||
138 | } | ||
139 | printk(KERN_EMERG "TSC %llx ", m->tsc); | ||
140 | if (m->addr) | ||
141 | printk("ADDR %llx ", m->addr); | ||
142 | if (m->misc) | ||
143 | printk("MISC %llx ", m->misc); | ||
144 | printk("\n"); | ||
145 | printk(KERN_EMERG "This is not a software problem!\n"); | ||
146 | printk(KERN_EMERG "Run through mcelog --ascii to decode " | ||
147 | "and contact your hardware vendor\n"); | ||
148 | } | ||
149 | |||
150 | static void mce_panic(char *msg, struct mce *backup, unsigned long start) | ||
151 | { | ||
152 | int i; | ||
153 | |||
154 | oops_begin(); | ||
155 | for (i = 0; i < MCE_LOG_LEN; i++) { | ||
156 | unsigned long tsc = mcelog.entry[i].tsc; | ||
157 | |||
158 | if (time_before(tsc, start)) | ||
159 | continue; | ||
160 | print_mce(&mcelog.entry[i]); | ||
161 | if (backup && mcelog.entry[i].tsc == backup->tsc) | ||
162 | backup = NULL; | ||
163 | } | ||
164 | if (backup) | ||
165 | print_mce(backup); | ||
166 | panic(msg); | ||
167 | } | ||
168 | |||
169 | int mce_available(struct cpuinfo_x86 *c) | ||
170 | { | ||
171 | if (mce_dont_init) | ||
172 | return 0; | ||
173 | return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); | ||
174 | } | ||
175 | |||
176 | static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) | ||
177 | { | ||
178 | if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) { | ||
179 | m->ip = regs->ip; | ||
180 | m->cs = regs->cs; | ||
181 | } else { | ||
182 | m->ip = 0; | ||
183 | m->cs = 0; | ||
184 | } | ||
185 | if (rip_msr) { | ||
186 | /* Assume the RIP in the MSR is exact. Is this true? */ | ||
187 | m->mcgstatus |= MCG_STATUS_EIPV; | ||
188 | rdmsrl(rip_msr, m->ip); | ||
189 | m->cs = 0; | ||
190 | } | ||
191 | } | ||
192 | |||
193 | /* | ||
194 | * Poll for corrected events or events that happened before reset. | ||
195 | * Those are just logged through /dev/mcelog. | ||
196 | * | ||
197 | * This is executed in standard interrupt context. | ||
198 | */ | ||
199 | void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) | ||
200 | { | ||
201 | struct mce m; | ||
202 | int i; | ||
203 | |||
204 | mce_setup(&m); | ||
205 | |||
206 | rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); | ||
207 | for (i = 0; i < banks; i++) { | ||
208 | if (!bank[i] || !test_bit(i, *b)) | ||
209 | continue; | ||
210 | |||
211 | m.misc = 0; | ||
212 | m.addr = 0; | ||
213 | m.bank = i; | ||
214 | m.tsc = 0; | ||
215 | |||
216 | barrier(); | ||
217 | rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); | ||
218 | if (!(m.status & MCI_STATUS_VAL)) | ||
219 | continue; | ||
220 | |||
221 | /* | ||
222 | * Uncorrected events are handled by the exception handler | ||
223 | * when it is enabled. But when the exception is disabled log | ||
224 | * everything. | ||
225 | * | ||
226 | * TBD do the same check for MCI_STATUS_EN here? | ||
227 | */ | ||
228 | if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC)) | ||
229 | continue; | ||
230 | |||
231 | if (m.status & MCI_STATUS_MISCV) | ||
232 | rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc); | ||
233 | if (m.status & MCI_STATUS_ADDRV) | ||
234 | rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); | ||
235 | |||
236 | if (!(flags & MCP_TIMESTAMP)) | ||
237 | m.tsc = 0; | ||
238 | /* | ||
239 | * Don't get the IP here because it's unlikely to | ||
240 | * have anything to do with the actual error location. | ||
241 | */ | ||
242 | if (!(flags & MCP_DONTLOG)) { | ||
243 | mce_log(&m); | ||
244 | add_taint(TAINT_MACHINE_CHECK); | ||
245 | } | ||
246 | |||
247 | /* | ||
248 | * Clear state for this bank. | ||
249 | */ | ||
250 | wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); | ||
251 | } | ||
252 | |||
253 | /* | ||
254 | * Don't clear MCG_STATUS here because it's only defined for | ||
255 | * exceptions. | ||
256 | */ | ||
257 | } | ||
258 | |||
259 | /* | ||
260 | * The actual machine check handler. This only handles real | ||
261 | * exceptions when something got corrupted coming in through int 18. | ||
262 | * | ||
263 | * This is executed in NMI context not subject to normal locking rules. This | ||
264 | * implies that most kernel services cannot be safely used. Don't even | ||
265 | * think about putting a printk in there! | ||
266 | */ | ||
/*
 * @regs:       register state at the exception; handed to notify_die() and
 *              to mce_get_rip(), which tolerates NULL.
 * @error_code: only forwarded to notify_die().
 *
 * Flow: scan all banks, log every valid+uncorrected+enabled error, then
 * either panic, SIGBUS the current task, or just flag userspace notification,
 * and finally clear the status of the banks recorded in 'toclear'.
 */
267 | void do_machine_check(struct pt_regs * regs, long error_code) | ||
268 | { | ||
269 | struct mce m, panicm; | ||
270 | u64 mcestart = 0; | ||
271 | int i; | ||
272 | int panicm_found = 0; | ||
273 | /* | ||
274 | * If no_way_out gets set, there is no safe way to recover from this | ||
275 | * MCE. If tolerant is cranked up, we'll try anyway. | ||
276 | */ | ||
277 | int no_way_out = 0; | ||
278 | /* | ||
279 | * If kill_it gets set, there might be a way to recover from this | ||
280 | * error. | ||
281 | */ | ||
282 | int kill_it = 0; | ||
283 | DECLARE_BITMAP(toclear, MAX_NR_BANKS); | ||
284 | |||
/* Paired with the atomic_dec() at out2. */
285 | atomic_inc(&mce_entry); | ||
286 | |||
287 | if (notify_die(DIE_NMI, "machine check", regs, error_code, | ||
288 | 18, SIGKILL) == NOTIFY_STOP) | ||
289 | goto out2; | ||
/* 'banks' is only set by mce_cap_init(); with no banks there is nothing to scan. */
290 | if (!banks) | ||
291 | goto out2; | ||
292 | |||
293 | mce_setup(&m); | ||
294 | |||
295 | rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); | ||
296 | /* if the restart IP is not valid, we're done for */ | ||
297 | if (!(m.mcgstatus & MCG_STATUS_RIPV)) | ||
298 | no_way_out = 1; | ||
299 | |||
/* Reference time passed to mce_panic(), which skips log entries stamped before it. */
300 | rdtscll(mcestart); | ||
301 | barrier(); | ||
302 | |||
303 | for (i = 0; i < banks; i++) { | ||
/* toclear gets no other initialisation in this function, so reset each bit first. */
304 | __clear_bit(i, toclear); | ||
305 | if (!bank[i]) | ||
306 | continue; | ||
307 | |||
308 | m.misc = 0; | ||
309 | m.addr = 0; | ||
310 | m.bank = i; | ||
311 | |||
312 | rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); | ||
313 | if ((m.status & MCI_STATUS_VAL) == 0) | ||
314 | continue; | ||
315 | |||
316 | /* | ||
317 | * Non uncorrected errors are handled by machine_check_poll | ||
318 | * Leave them alone. | ||
319 | */ | ||
320 | if ((m.status & MCI_STATUS_UC) == 0) | ||
321 | continue; | ||
322 | |||
323 | /* | ||
324 | * Set taint even when machine check was not enabled. | ||
325 | */ | ||
326 | add_taint(TAINT_MACHINE_CHECK); | ||
327 | |||
328 | __set_bit(i, toclear); | ||
329 | |||
330 | if (m.status & MCI_STATUS_EN) { | ||
331 | /* if PCC was set, there's no way out */ | ||
332 | no_way_out |= !!(m.status & MCI_STATUS_PCC); | ||
333 | /* | ||
334 | * If this error was uncorrectable and there was | ||
335 | * an overflow, we're in trouble. If no overflow, | ||
336 | * we might get away with just killing a task. | ||
337 | */ | ||
/* NOTE(review): UC is always set here, because non-UC banks were skipped above. */
338 | if (m.status & MCI_STATUS_UC) { | ||
339 | if (tolerant < 1 || m.status & MCI_STATUS_OVER) | ||
340 | no_way_out = 1; | ||
341 | kill_it = 1; | ||
342 | } | ||
343 | } else { | ||
344 | /* | ||
345 | * Machine check event was not enabled. Clear, but | ||
346 | * ignore. | ||
347 | */ | ||
348 | continue; | ||
349 | } | ||
350 | |||
351 | if (m.status & MCI_STATUS_MISCV) | ||
352 | rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc); | ||
353 | if (m.status & MCI_STATUS_ADDRV) | ||
354 | rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); | ||
355 | |||
356 | mce_get_rip(&m, regs); | ||
357 | mce_log(&m); | ||
358 | |||
359 | /* Did this bank cause the exception? */ | ||
360 | /* Assume that the bank with uncorrectable errors did it, | ||
361 | and that there is only a single one. */ | ||
362 | if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) { | ||
363 | panicm = m; | ||
364 | panicm_found = 1; | ||
365 | } | ||
366 | } | ||
367 | |||
368 | /* If we didn't find an uncorrectable error, pick | ||
369 | the last one (shouldn't happen, just being safe). */ | ||
370 | if (!panicm_found) | ||
371 | panicm = m; | ||
372 | |||
373 | /* | ||
374 | * If we have decided that we just CAN'T continue, and the user | ||
375 | * has not set tolerant to an insane level, give up and die. | ||
376 | */ | ||
377 | if (no_way_out && tolerant < 3) | ||
378 | mce_panic("Machine check", &panicm, mcestart); | ||
379 | |||
380 | /* | ||
381 | * If the error seems to be unrecoverable, something should be | ||
382 | * done. Try to kill as little as possible. If we can kill just | ||
383 | * one task, do that. If the user has set the tolerance very | ||
384 | * high, don't try to do anything at all. | ||
385 | */ | ||
386 | if (kill_it && tolerant < 3) { | ||
387 | int user_space = 0; | ||
388 | |||
389 | /* | ||
390 | * If the EIPV bit is set, it means the saved IP is the | ||
391 | * instruction which caused the MCE. | ||
392 | */ | ||
/* NOTE(review): m.mcgstatus may also carry an EIPV bit forced on by mce_get_rip() when rip_msr is in use. */
393 | if (m.mcgstatus & MCG_STATUS_EIPV) | ||
394 | user_space = panicm.ip && (panicm.cs & 3); | ||
395 | |||
396 | /* | ||
397 | * If we know that the error was in user space, send a | ||
398 | * SIGBUS. Otherwise, panic if tolerance is low. | ||
399 | * | ||
400 | * force_sig() takes an awful lot of locks and has a slight | ||
401 | * risk of deadlocking. | ||
402 | */ | ||
403 | if (user_space) { | ||
404 | force_sig(SIGBUS, current); | ||
405 | } else if (panic_on_oops || tolerant < 2) { | ||
406 | mce_panic("Uncorrected machine check", | ||
407 | &panicm, mcestart); | ||
408 | } | ||
409 | } | ||
410 | |||
411 | /* notify userspace ASAP */ | ||
412 | set_thread_flag(TIF_MCE_NOTIFY); | ||
413 | |||
414 | /* the last thing we do is clear state */ | ||
415 | for (i = 0; i < banks; i++) { | ||
416 | if (test_bit(i, toclear)) | ||
417 | wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); | ||
418 | } | ||
419 | wrmsrl(MSR_IA32_MCG_STATUS, 0); | ||
420 | out2: | ||
421 | atomic_dec(&mce_entry); | ||
422 | } | ||
423 | |||
#ifdef CONFIG_X86_MCE_INTEL
/***
 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
 * @status: Event status information
 *
 * Meant to be called by the thermal interrupt once the event has been
 * processed and it was decided to log it further.
 *
 * A fresh record is set up for the executing CPU (see mce_setup()), its
 * bank is set to MCE_THERMAL_BANK and @status is stored in the 'status'
 * field of 'struct mce'; historically that has been the register value
 * of the MSR_IA32_THERMAL_STATUS (Intel) msr.
 */
void mce_log_therm_throt_event(__u64 status)
{
	struct mce rec;

	mce_setup(&rec);
	rec.status = status;
	rec.bank = MCE_THERMAL_BANK;
	mce_log(&rec);
}
#endif /* CONFIG_X86_MCE_INTEL */
448 | |||
449 | /* | ||
450 | * Periodic polling timer for "silent" machine check errors. If the | ||
451 | * poller finds an MCE, poll 2x faster. When the poller finds no more | ||
452 | * errors, poll 2x slower (up to check_interval seconds). | ||
453 | */ | ||
454 | |||
/* Upper bound of the polling period, in seconds; multiplied by HZ where it is used. */
455 | static int check_interval = 5 * 60; /* 5 minutes */ | ||
/* Current per-CPU polling period; halved or doubled by mcheck_timer(). */
456 | static DEFINE_PER_CPU(int, next_interval); /* in jiffies */ | ||
457 | static void mcheck_timer(unsigned long); | ||
/* Per-CPU timer set up by mce_init_timer() and re-armed by mcheck_timer(). */
458 | static DEFINE_PER_CPU(struct timer_list, mce_timer); | ||
459 | |||
460 | static void mcheck_timer(unsigned long data) | ||
461 | { | ||
462 | struct timer_list *t = &per_cpu(mce_timer, data); | ||
463 | int *n; | ||
464 | |||
465 | WARN_ON(smp_processor_id() != data); | ||
466 | |||
467 | if (mce_available(¤t_cpu_data)) | ||
468 | machine_check_poll(MCP_TIMESTAMP, | ||
469 | &__get_cpu_var(mce_poll_banks)); | ||
470 | |||
471 | /* | ||
472 | * Alert userspace if needed. If we logged an MCE, reduce the | ||
473 | * polling interval, otherwise increase the polling interval. | ||
474 | */ | ||
475 | n = &__get_cpu_var(next_interval); | ||
476 | if (mce_notify_user()) { | ||
477 | *n = max(*n/2, HZ/100); | ||
478 | } else { | ||
479 | *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ)); | ||
480 | } | ||
481 | |||
482 | t->expires = jiffies + *n; | ||
483 | add_timer(t); | ||
484 | } | ||
485 | |||
486 | static void mce_do_trigger(struct work_struct *work) | ||
487 | { | ||
488 | call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT); | ||
489 | } | ||
490 | |||
491 | static DECLARE_WORK(mce_trigger_work, mce_do_trigger); | ||
492 | |||
493 | /* | ||
494 | * Notify the user(s) about new machine check events. | ||
495 | * Can be called from interrupt context, but not from machine check/NMI | ||
496 | * context. | ||
497 | */ | ||
498 | int mce_notify_user(void) | ||
499 | { | ||
500 | /* Not more than two messages every minute */ | ||
501 | static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); | ||
502 | |||
503 | clear_thread_flag(TIF_MCE_NOTIFY); | ||
504 | if (test_and_clear_bit(0, ¬ify_user)) { | ||
505 | wake_up_interruptible(&mce_wait); | ||
506 | |||
507 | /* | ||
508 | * There is no risk of missing notifications because | ||
509 | * work_pending is always cleared before the function is | ||
510 | * executed. | ||
511 | */ | ||
512 | if (trigger[0] && !work_pending(&mce_trigger_work)) | ||
513 | schedule_work(&mce_trigger_work); | ||
514 | |||
515 | if (__ratelimit(&ratelimit)) | ||
516 | printk(KERN_INFO "Machine check events logged\n"); | ||
517 | |||
518 | return 1; | ||
519 | } | ||
520 | return 0; | ||
521 | } | ||
522 | |||
523 | /* see if the idle task needs to notify userspace */ | ||
524 | static int | ||
525 | mce_idle_callback(struct notifier_block *nfb, unsigned long action, void *junk) | ||
526 | { | ||
527 | /* IDLE_END should be safe - interrupts are back on */ | ||
528 | if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY)) | ||
529 | mce_notify_user(); | ||
530 | |||
531 | return NOTIFY_OK; | ||
532 | } | ||
533 | |||
534 | static struct notifier_block mce_idle_notifier = { | ||
535 | .notifier_call = mce_idle_callback, | ||
536 | }; | ||
537 | |||
538 | static __init int periodic_mcheck_init(void) | ||
539 | { | ||
540 | idle_notifier_register(&mce_idle_notifier); | ||
541 | return 0; | ||
542 | } | ||
543 | __initcall(periodic_mcheck_init); | ||
544 | |||
545 | /* | ||
546 | * Initialize Machine Checks for a CPU. | ||
547 | */ | ||
548 | static int mce_cap_init(void) | ||
549 | { | ||
550 | u64 cap; | ||
551 | unsigned b; | ||
552 | |||
553 | rdmsrl(MSR_IA32_MCG_CAP, cap); | ||
554 | b = cap & 0xff; | ||
555 | if (b > MAX_NR_BANKS) { | ||
556 | printk(KERN_WARNING | ||
557 | "MCE: Using only %u machine check banks out of %u\n", | ||
558 | MAX_NR_BANKS, b); | ||
559 | b = MAX_NR_BANKS; | ||
560 | } | ||
561 | |||
562 | /* Don't support asymmetric configurations today */ | ||
563 | WARN_ON(banks != 0 && b != banks); | ||
564 | banks = b; | ||
565 | if (!bank) { | ||
566 | bank = kmalloc(banks * sizeof(u64), GFP_KERNEL); | ||
567 | if (!bank) | ||
568 | return -ENOMEM; | ||
569 | memset(bank, 0xff, banks * sizeof(u64)); | ||
570 | } | ||
571 | |||
572 | /* Use accurate RIP reporting if available. */ | ||
573 | if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9) | ||
574 | rip_msr = MSR_IA32_MCG_EIP; | ||
575 | |||
576 | return 0; | ||
577 | } | ||
578 | |||
579 | static void mce_init(void *dummy) | ||
580 | { | ||
581 | u64 cap; | ||
582 | int i; | ||
583 | mce_banks_t all_banks; | ||
584 | |||
585 | /* | ||
586 | * Log the machine checks left over from the previous reset. | ||
587 | */ | ||
588 | bitmap_fill(all_banks, MAX_NR_BANKS); | ||
589 | machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks); | ||
590 | |||
591 | set_in_cr4(X86_CR4_MCE); | ||
592 | |||
593 | rdmsrl(MSR_IA32_MCG_CAP, cap); | ||
594 | if (cap & MCG_CTL_P) | ||
595 | wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); | ||
596 | |||
597 | for (i = 0; i < banks; i++) { | ||
598 | wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); | ||
599 | wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); | ||
600 | } | ||
601 | } | ||
602 | |||
603 | /* Add per CPU specific workarounds here */ | ||
604 | static void mce_cpu_quirks(struct cpuinfo_x86 *c) | ||
605 | { | ||
606 | /* This should be disabled by the BIOS, but isn't always */ | ||
607 | if (c->x86_vendor == X86_VENDOR_AMD) { | ||
608 | if (c->x86 == 15 && banks > 4) | ||
609 | /* disable GART TBL walk error reporting, which trips off | ||
610 | incorrectly with the IOMMU & 3ware & Cerberus. */ | ||
611 | clear_bit(10, (unsigned long *)&bank[4]); | ||
612 | if(c->x86 <= 17 && mce_bootlog < 0) | ||
613 | /* Lots of broken BIOS around that don't clear them | ||
614 | by default and leave crap in there. Don't log. */ | ||
615 | mce_bootlog = 0; | ||
616 | } | ||
617 | |||
618 | } | ||
619 | |||
620 | static void mce_cpu_features(struct cpuinfo_x86 *c) | ||
621 | { | ||
622 | switch (c->x86_vendor) { | ||
623 | case X86_VENDOR_INTEL: | ||
624 | mce_intel_feature_init(c); | ||
625 | break; | ||
626 | case X86_VENDOR_AMD: | ||
627 | mce_amd_feature_init(c); | ||
628 | break; | ||
629 | default: | ||
630 | break; | ||
631 | } | ||
632 | } | ||
633 | |||
634 | static void mce_init_timer(void) | ||
635 | { | ||
636 | struct timer_list *t = &__get_cpu_var(mce_timer); | ||
637 | int *n = &__get_cpu_var(next_interval); | ||
638 | |||
639 | *n = check_interval * HZ; | ||
640 | if (!*n) | ||
641 | return; | ||
642 | setup_timer(t, mcheck_timer, smp_processor_id()); | ||
643 | t->expires = round_jiffies(jiffies + *n); | ||
644 | add_timer(t); | ||
645 | } | ||
646 | |||
647 | /* | ||
648 | * Called for each booted CPU to set up machine checks. | ||
649 | * Must be called with preempt off. | ||
650 | */ | ||
651 | void __cpuinit mcheck_init(struct cpuinfo_x86 *c) | ||
652 | { | ||
653 | if (!mce_available(c)) | ||
654 | return; | ||
655 | |||
656 | if (mce_cap_init() < 0) { | ||
657 | mce_dont_init = 1; | ||
658 | return; | ||
659 | } | ||
660 | mce_cpu_quirks(c); | ||
661 | |||
662 | mce_init(NULL); | ||
663 | mce_cpu_features(c); | ||
664 | mce_init_timer(); | ||
665 | } | ||
666 | |||
667 | /* | ||
668 | * Character device to read and clear the MCE log. | ||
669 | */ | ||
670 | |||
/* Taken around every access to open_count and open_exclu in mce_open() and mce_release(). */
671 | static DEFINE_SPINLOCK(mce_state_lock); | ||
672 | static int open_count; /* #times opened */ | ||
673 | static int open_exclu; /* already open exclusive? */ | ||
674 | |||
675 | static int mce_open(struct inode *inode, struct file *file) | ||
676 | { | ||
677 | lock_kernel(); | ||
678 | spin_lock(&mce_state_lock); | ||
679 | |||
680 | if (open_exclu || (open_count && (file->f_flags & O_EXCL))) { | ||
681 | spin_unlock(&mce_state_lock); | ||
682 | unlock_kernel(); | ||
683 | return -EBUSY; | ||
684 | } | ||
685 | |||
686 | if (file->f_flags & O_EXCL) | ||
687 | open_exclu = 1; | ||
688 | open_count++; | ||
689 | |||
690 | spin_unlock(&mce_state_lock); | ||
691 | unlock_kernel(); | ||
692 | |||
693 | return nonseekable_open(inode, file); | ||
694 | } | ||
695 | |||
696 | static int mce_release(struct inode *inode, struct file *file) | ||
697 | { | ||
698 | spin_lock(&mce_state_lock); | ||
699 | |||
700 | open_count--; | ||
701 | open_exclu = 0; | ||
702 | |||
703 | spin_unlock(&mce_state_lock); | ||
704 | |||
705 | return 0; | ||
706 | } | ||
707 | |||
/*
 * Store the current TSC of the executing CPU into its slot of the
 * unsigned long array passed in @data (indexed by CPU number).
 */
static void collect_tscs(void *data)
{
	unsigned long *slot = (unsigned long *)data + smp_processor_id();

	rdtscll(*slot);
}
714 | |||
/*
 * read() handler of the mcelog device: copy logged records to @ubuf and clear them.
 * Only reads at offset 0 with room for MCE_LOG_LEN records are accepted (-EINVAL otherwise).
 * Returns the number of bytes copied, -ENOMEM if the scratch array cannot be allocated,
 * or -EFAULT if any copy_to_user() reported a problem.
 */
715 | static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, | ||
716 | loff_t *off) | ||
717 | { | ||
718 | unsigned long *cpu_tsc; | ||
719 | static DEFINE_MUTEX(mce_read_mutex); | ||
720 | unsigned prev, next; | ||
721 | char __user *buf = ubuf; | ||
722 | int i, err; | ||
723 | |||
/* One slot per possible CPU id, filled in by collect_tscs() further down. */
724 | cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL); | ||
725 | if (!cpu_tsc) | ||
726 | return -ENOMEM; | ||
727 | |||
728 | mutex_lock(&mce_read_mutex); | ||
729 | next = rcu_dereference(mcelog.next); | ||
730 | |||
731 | /* Only supports full reads right now */ | ||
732 | if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) { | ||
733 | mutex_unlock(&mce_read_mutex); | ||
734 | kfree(cpu_tsc); | ||
735 | return -EINVAL; | ||
736 | } | ||
737 | |||
738 | err = 0; | ||
739 | prev = 0; | ||
740 | do { | ||
741 | for (i = prev; i < next; i++) { | ||
742 | unsigned long start = jiffies; | ||
743 | |||
/* The slot is reserved but maybe not written yet: wait up to two jiffies, then zero it and skip it. */
744 | while (!mcelog.entry[i].finished) { | ||
745 | if (time_after_eq(jiffies, start + 2)) { | ||
746 | memset(mcelog.entry + i, 0, | ||
747 | sizeof(struct mce)); | ||
748 | goto timeout; | ||
749 | } | ||
750 | cpu_relax(); | ||
751 | } | ||
752 | smp_rmb(); | ||
/* Any non-zero copy_to_user() result is accumulated; err is only tested for zero at the end. */
753 | err |= copy_to_user(buf, mcelog.entry + i, | ||
754 | sizeof(struct mce)); | ||
755 | buf += sizeof(struct mce); | ||
756 | timeout: | ||
757 | ; | ||
758 | } | ||
759 | |||
760 | memset(mcelog.entry + prev, 0, | ||
761 | (next - prev) * sizeof(struct mce)); | ||
762 | prev = next; | ||
/* Try to reset the write index to 0; if writers advanced it meanwhile, loop again over the new entries. */
763 | next = cmpxchg(&mcelog.next, prev, 0); | ||
764 | } while (next != prev); | ||
765 | |||
766 | synchronize_sched(); | ||
767 | |||
768 | /* | ||
769 | * Collect entries that were still getting written before the | ||
770 | * synchronize. | ||
771 | */ | ||
772 | on_each_cpu(collect_tscs, cpu_tsc, 1); | ||
/* Only finished entries stamped earlier than the TSC just sampled on their CPU are taken. */
773 | for (i = next; i < MCE_LOG_LEN; i++) { | ||
774 | if (mcelog.entry[i].finished && | ||
775 | mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) { | ||
776 | err |= copy_to_user(buf, mcelog.entry+i, | ||
777 | sizeof(struct mce)); | ||
778 | smp_rmb(); | ||
779 | buf += sizeof(struct mce); | ||
780 | memset(&mcelog.entry[i], 0, sizeof(struct mce)); | ||
781 | } | ||
782 | } | ||
783 | mutex_unlock(&mce_read_mutex); | ||
784 | kfree(cpu_tsc); | ||
785 | return err ? -EFAULT : buf - ubuf; | ||
786 | } | ||
787 | |||
788 | static unsigned int mce_poll(struct file *file, poll_table *wait) | ||
789 | { | ||
790 | poll_wait(file, &mce_wait, wait); | ||
791 | if (rcu_dereference(mcelog.next)) | ||
792 | return POLLIN | POLLRDNORM; | ||
793 | return 0; | ||
794 | } | ||
795 | |||
796 | static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg) | ||
797 | { | ||
798 | int __user *p = (int __user *)arg; | ||
799 | |||
800 | if (!capable(CAP_SYS_ADMIN)) | ||
801 | return -EPERM; | ||
802 | switch (cmd) { | ||
803 | case MCE_GET_RECORD_LEN: | ||
804 | return put_user(sizeof(struct mce), p); | ||
805 | case MCE_GET_LOG_LEN: | ||
806 | return put_user(MCE_LOG_LEN, p); | ||
807 | case MCE_GETCLEAR_FLAGS: { | ||
808 | unsigned flags; | ||
809 | |||
810 | do { | ||
811 | flags = mcelog.flags; | ||
812 | } while (cmpxchg(&mcelog.flags, flags, 0) != flags); | ||
813 | return put_user(flags, p); | ||
814 | } | ||
815 | default: | ||
816 | return -ENOTTY; | ||
817 | } | ||
818 | } | ||
819 | |||
/* File operations of the mcelog device; every handler is defined above in this file. */
820 | static const struct file_operations mce_chrdev_ops = { | ||
821 | .open = mce_open, | ||
822 | .release = mce_release, | ||
823 | .read = mce_read, | ||
824 | .poll = mce_poll, | ||
825 | .unlocked_ioctl = mce_ioctl, | ||
826 | }; | ||
827 | |||
/* Misc device description, initialised positionally: minor number, name "mcelog", then the operations above. */
828 | static struct miscdevice mce_log_device = { | ||
829 | MISC_MCELOG_MINOR, | ||
830 | "mcelog", | ||
831 | &mce_chrdev_ops, | ||
832 | }; | ||
833 | |||
834 | /* | ||
835 | * Old style boot options parsing. Only for compatibility. | ||
836 | */ | ||
837 | static int __init mcheck_disable(char *str) | ||
838 | { | ||
839 | mce_dont_init = 1; | ||
840 | return 1; | ||
841 | } | ||
842 | |||
843 | /* mce=off disables machine check. | ||
844 | mce=TOLERANCELEVEL (number, see above) | ||
845 | mce=bootlog Log MCEs from before booting. Disabled by default on AMD. | ||
846 | mce=nobootlog Don't log MCEs from before booting. */ | ||
847 | static int __init mcheck_enable(char *str) | ||
848 | { | ||
849 | if (!strcmp(str, "off")) | ||
850 | mce_dont_init = 1; | ||
851 | else if (!strcmp(str, "bootlog") || !strcmp(str,"nobootlog")) | ||
852 | mce_bootlog = str[0] == 'b'; | ||
853 | else if (isdigit(str[0])) | ||
854 | get_option(&str, &tolerant); | ||
855 | else | ||
856 | printk("mce= argument %s ignored. Please use /sys", str); | ||
857 | return 1; | ||
858 | } | ||
859 | |||
860 | __setup("nomce", mcheck_disable); | ||
861 | __setup("mce=", mcheck_enable); | ||
862 | |||
863 | /* | ||
864 | * Sysfs support | ||
865 | */ | ||
866 | |||
867 | /* | ||
868 | * Disable machine checks on suspend and shutdown. We can't really handle | ||
869 | * them later. | ||
870 | */ | ||
871 | static int mce_disable(void) | ||
872 | { | ||
873 | int i; | ||
874 | |||
875 | for (i = 0; i < banks; i++) | ||
876 | wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); | ||
877 | return 0; | ||
878 | } | ||
879 | |||
880 | static int mce_suspend(struct sys_device *dev, pm_message_t state) | ||
881 | { | ||
882 | return mce_disable(); | ||
883 | } | ||
884 | |||
885 | static int mce_shutdown(struct sys_device *dev) | ||
886 | { | ||
887 | return mce_disable(); | ||
888 | } | ||
889 | |||
890 | /* On resume clear all MCE state. Don't want to see leftovers from the BIOS. | ||
891 | Only one CPU is active at this time, the others get readded later using | ||
892 | CPU hotplug. */ | ||
893 | static int mce_resume(struct sys_device *dev) | ||
894 | { | ||
895 | mce_init(NULL); | ||
896 | mce_cpu_features(&current_cpu_data); | ||
897 | return 0; | ||
898 | } | ||
899 | |||
900 | static void mce_cpu_restart(void *data) | ||
901 | { | ||
902 | del_timer_sync(&__get_cpu_var(mce_timer)); | ||
903 | if (mce_available(&current_cpu_data)) | ||
904 | mce_init(NULL); | ||
905 | mce_init_timer(); | ||
906 | } | ||
907 | |||
908 | /* Reinit MCEs after user configuration changes */ | ||
909 | static void mce_restart(void) | ||
910 | { | ||
911 | on_each_cpu(mce_cpu_restart, NULL, 1); | ||
912 | } | ||
913 | |||
914 | static struct sysdev_class mce_sysclass = { | ||
915 | .suspend = mce_suspend, | ||
916 | .shutdown = mce_shutdown, | ||
917 | .resume = mce_resume, | ||
918 | .name = "machinecheck", | ||
919 | }; | ||
920 | |||
921 | DEFINE_PER_CPU(struct sys_device, device_mce); | ||
922 | void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinitdata; | ||
923 | |||
924 | /* Why are there no generic functions for this? */ | ||
925 | #define ACCESSOR(name, var, start) \ | ||
926 | static ssize_t show_ ## name(struct sys_device *s, \ | ||
927 | struct sysdev_attribute *attr, \ | ||
928 | char *buf) { \ | ||
929 | return sprintf(buf, "%lx\n", (unsigned long)var); \ | ||
930 | } \ | ||
931 | static ssize_t set_ ## name(struct sys_device *s, \ | ||
932 | struct sysdev_attribute *attr, \ | ||
933 | const char *buf, size_t siz) { \ | ||
934 | char *end; \ | ||
935 | unsigned long new = simple_strtoul(buf, &end, 0); \ | ||
936 | if (end == buf) return -EINVAL; \ | ||
937 | var = new; \ | ||
938 | start; \ | ||
939 | return end-buf; \ | ||
940 | } \ | ||
941 | static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name); | ||
942 | |||
943 | static struct sysdev_attribute *bank_attrs; | ||
944 | |||
945 | static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, | ||
946 | char *buf) | ||
947 | { | ||
948 | u64 b = bank[attr - bank_attrs]; | ||
949 | return sprintf(buf, "%llx\n", b); | ||
950 | } | ||
951 | |||
952 | static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, | ||
953 | const char *buf, size_t siz) | ||
954 | { | ||
955 | char *end; | ||
956 | u64 new = simple_strtoull(buf, &end, 0); | ||
957 | if (end == buf) | ||
958 | return -EINVAL; | ||
959 | bank[attr - bank_attrs] = new; | ||
960 | mce_restart(); | ||
961 | return end-buf; | ||
962 | } | ||
963 | |||
964 | static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr, | ||
965 | char *buf) | ||
966 | { | ||
967 | strcpy(buf, trigger); | ||
968 | strcat(buf, "\n"); | ||
969 | return strlen(trigger) + 1; | ||
970 | } | ||
971 | |||
972 | static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, | ||
973 | const char *buf,size_t siz) | ||
974 | { | ||
975 | char *p; | ||
976 | int len; | ||
977 | strncpy(trigger, buf, sizeof(trigger)); | ||
978 | trigger[sizeof(trigger)-1] = 0; | ||
979 | len = strlen(trigger); | ||
980 | p = strchr(trigger, '\n'); | ||
981 | if (*p) *p = 0; | ||
982 | return len; | ||
983 | } | ||
984 | |||
985 | static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); | ||
986 | static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); | ||
987 | ACCESSOR(check_interval,check_interval,mce_restart()) | ||
988 | static struct sysdev_attribute *mce_attributes[] = { | ||
989 | &attr_tolerant.attr, &attr_check_interval, &attr_trigger, | ||
990 | NULL | ||
991 | }; | ||
992 | |||
993 | static cpumask_var_t mce_device_initialized; | ||
994 | |||
995 | /* Per cpu sysdev init. All of the cpus still share the same ctl bank */ | ||
996 | static __cpuinit int mce_create_device(unsigned int cpu) | ||
997 | { | ||
998 | int err; | ||
999 | int i; | ||
1000 | |||
1001 | if (!mce_available(&boot_cpu_data)) | ||
1002 | return -EIO; | ||
1003 | |||
1004 | memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject)); | ||
1005 | per_cpu(device_mce,cpu).id = cpu; | ||
1006 | per_cpu(device_mce,cpu).cls = &mce_sysclass; | ||
1007 | |||
1008 | err = sysdev_register(&per_cpu(device_mce,cpu)); | ||
1009 | if (err) | ||
1010 | return err; | ||
1011 | |||
1012 | for (i = 0; mce_attributes[i]; i++) { | ||
1013 | err = sysdev_create_file(&per_cpu(device_mce,cpu), | ||
1014 | mce_attributes[i]); | ||
1015 | if (err) | ||
1016 | goto error; | ||
1017 | } | ||
1018 | for (i = 0; i < banks; i++) { | ||
1019 | err = sysdev_create_file(&per_cpu(device_mce, cpu), | ||
1020 | &bank_attrs[i]); | ||
1021 | if (err) | ||
1022 | goto error2; | ||
1023 | } | ||
1024 | cpumask_set_cpu(cpu, mce_device_initialized); | ||
1025 | |||
1026 | return 0; | ||
1027 | error2: | ||
1028 | while (--i >= 0) { | ||
1029 | sysdev_remove_file(&per_cpu(device_mce, cpu), | ||
1030 | &bank_attrs[i]); | ||
1031 | } | ||
1032 | error: | ||
1033 | while (--i >= 0) { | ||
1034 | sysdev_remove_file(&per_cpu(device_mce,cpu), | ||
1035 | mce_attributes[i]); | ||
1036 | } | ||
1037 | sysdev_unregister(&per_cpu(device_mce,cpu)); | ||
1038 | |||
1039 | return err; | ||
1040 | } | ||
1041 | |||
1042 | static __cpuinit void mce_remove_device(unsigned int cpu) | ||
1043 | { | ||
1044 | int i; | ||
1045 | |||
1046 | if (!cpumask_test_cpu(cpu, mce_device_initialized)) | ||
1047 | return; | ||
1048 | |||
1049 | for (i = 0; mce_attributes[i]; i++) | ||
1050 | sysdev_remove_file(&per_cpu(device_mce,cpu), | ||
1051 | mce_attributes[i]); | ||
1052 | for (i = 0; i < banks; i++) | ||
1053 | sysdev_remove_file(&per_cpu(device_mce, cpu), | ||
1054 | &bank_attrs[i]); | ||
1055 | sysdev_unregister(&per_cpu(device_mce,cpu)); | ||
1056 | cpumask_clear_cpu(cpu, mce_device_initialized); | ||
1057 | } | ||
1058 | |||
1059 | /* Make sure there are no machine checks on offlined CPUs. */ | ||
1060 | static void mce_disable_cpu(void *h) | ||
1061 | { | ||
1062 | int i; | ||
1063 | unsigned long action = *(unsigned long *)h; | ||
1064 | |||
1065 | if (!mce_available(&current_cpu_data)) | ||
1066 | return; | ||
1067 | if (!(action & CPU_TASKS_FROZEN)) | ||
1068 | cmci_clear(); | ||
1069 | for (i = 0; i < banks; i++) | ||
1070 | wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); | ||
1071 | } | ||
1072 | |||
1073 | static void mce_reenable_cpu(void *h) | ||
1074 | { | ||
1075 | int i; | ||
1076 | unsigned long action = *(unsigned long *)h; | ||
1077 | |||
1078 | if (!mce_available(&current_cpu_data)) | ||
1079 | return; | ||
1080 | if (!(action & CPU_TASKS_FROZEN)) | ||
1081 | cmci_reenable(); | ||
1082 | for (i = 0; i < banks; i++) | ||
1083 | wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]); | ||
1084 | } | ||
1085 | |||
1086 | /* Get notified when a cpu comes on/off. Be hotplug friendly. */ | ||
1087 | static int __cpuinit mce_cpu_callback(struct notifier_block *nfb, | ||
1088 | unsigned long action, void *hcpu) | ||
1089 | { | ||
1090 | unsigned int cpu = (unsigned long)hcpu; | ||
1091 | struct timer_list *t = &per_cpu(mce_timer, cpu); | ||
1092 | |||
1093 | switch (action) { | ||
1094 | case CPU_ONLINE: | ||
1095 | case CPU_ONLINE_FROZEN: | ||
1096 | mce_create_device(cpu); | ||
1097 | if (threshold_cpu_callback) | ||
1098 | threshold_cpu_callback(action, cpu); | ||
1099 | break; | ||
1100 | case CPU_DEAD: | ||
1101 | case CPU_DEAD_FROZEN: | ||
1102 | if (threshold_cpu_callback) | ||
1103 | threshold_cpu_callback(action, cpu); | ||
1104 | mce_remove_device(cpu); | ||
1105 | break; | ||
1106 | case CPU_DOWN_PREPARE: | ||
1107 | case CPU_DOWN_PREPARE_FROZEN: | ||
1108 | del_timer_sync(t); | ||
1109 | smp_call_function_single(cpu, mce_disable_cpu, &action, 1); | ||
1110 | break; | ||
1111 | case CPU_DOWN_FAILED: | ||
1112 | case CPU_DOWN_FAILED_FROZEN: | ||
1113 | t->expires = round_jiffies(jiffies + | ||
1114 | __get_cpu_var(next_interval)); | ||
1115 | add_timer_on(t, cpu); | ||
1116 | smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); | ||
1117 | break; | ||
1118 | case CPU_POST_DEAD: | ||
1119 | /* intentionally ignoring frozen here */ | ||
1120 | cmci_rediscover(cpu); | ||
1121 | break; | ||
1122 | } | ||
1123 | return NOTIFY_OK; | ||
1124 | } | ||
1125 | |||
1126 | static struct notifier_block mce_cpu_notifier __cpuinitdata = { | ||
1127 | .notifier_call = mce_cpu_callback, | ||
1128 | }; | ||
1129 | |||
1130 | static __init int mce_init_banks(void) | ||
1131 | { | ||
1132 | int i; | ||
1133 | |||
1134 | bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks, | ||
1135 | GFP_KERNEL); | ||
1136 | if (!bank_attrs) | ||
1137 | return -ENOMEM; | ||
1138 | |||
1139 | for (i = 0; i < banks; i++) { | ||
1140 | struct sysdev_attribute *a = &bank_attrs[i]; | ||
1141 | a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i); | ||
1142 | if (!a->attr.name) | ||
1143 | goto nomem; | ||
1144 | a->attr.mode = 0644; | ||
1145 | a->show = show_bank; | ||
1146 | a->store = set_bank; | ||
1147 | } | ||
1148 | return 0; | ||
1149 | |||
1150 | nomem: | ||
1151 | while (--i >= 0) | ||
1152 | kfree(bank_attrs[i].attr.name); | ||
1153 | kfree(bank_attrs); | ||
1154 | bank_attrs = NULL; | ||
1155 | return -ENOMEM; | ||
1156 | } | ||
1157 | |||
1158 | static __init int mce_init_device(void) | ||
1159 | { | ||
1160 | int err; | ||
1161 | int i = 0; | ||
1162 | |||
1163 | if (!mce_available(&boot_cpu_data)) | ||
1164 | return -EIO; | ||
1165 | |||
1166 | alloc_cpumask_var(&mce_device_initialized, GFP_KERNEL); | ||
1167 | |||
1168 | err = mce_init_banks(); | ||
1169 | if (err) | ||
1170 | return err; | ||
1171 | |||
1172 | err = sysdev_class_register(&mce_sysclass); | ||
1173 | if (err) | ||
1174 | return err; | ||
1175 | |||
1176 | for_each_online_cpu(i) { | ||
1177 | err = mce_create_device(i); | ||
1178 | if (err) | ||
1179 | return err; | ||
1180 | } | ||
1181 | |||
1182 | register_hotcpu_notifier(&mce_cpu_notifier); | ||
1183 | misc_register(&mce_log_device); | ||
1184 | return err; | ||
1185 | } | ||
1186 | |||
1187 | device_initcall(mce_init_device); | ||
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c index 56dde9c4bc96..ddae21620bda 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c | |||
@@ -13,22 +13,22 @@ | |||
13 | * | 13 | * |
14 | * All MC4_MISCi registers are shared between multi-cores | 14 | * All MC4_MISCi registers are shared between multi-cores |
15 | */ | 15 | */ |
16 | |||
17 | #include <linux/cpu.h> | ||
18 | #include <linux/errno.h> | ||
19 | #include <linux/init.h> | ||
20 | #include <linux/interrupt.h> | 16 | #include <linux/interrupt.h> |
21 | #include <linux/kobject.h> | ||
22 | #include <linux/notifier.h> | 17 | #include <linux/notifier.h> |
23 | #include <linux/sched.h> | 18 | #include <linux/kobject.h> |
24 | #include <linux/smp.h> | 19 | #include <linux/percpu.h> |
25 | #include <linux/sysdev.h> | 20 | #include <linux/sysdev.h> |
21 | #include <linux/errno.h> | ||
22 | #include <linux/sched.h> | ||
26 | #include <linux/sysfs.h> | 23 | #include <linux/sysfs.h> |
24 | #include <linux/init.h> | ||
25 | #include <linux/cpu.h> | ||
26 | #include <linux/smp.h> | ||
27 | |||
27 | #include <asm/apic.h> | 28 | #include <asm/apic.h> |
29 | #include <asm/idle.h> | ||
28 | #include <asm/mce.h> | 30 | #include <asm/mce.h> |
29 | #include <asm/msr.h> | 31 | #include <asm/msr.h> |
30 | #include <asm/percpu.h> | ||
31 | #include <asm/idle.h> | ||
32 | 32 | ||
33 | #define PFX "mce_threshold: " | 33 | #define PFX "mce_threshold: " |
34 | #define VERSION "version 1.1.1" | 34 | #define VERSION "version 1.1.1" |
@@ -48,26 +48,26 @@ | |||
48 | #define MCG_XBLK_ADDR 0xC0000400 | 48 | #define MCG_XBLK_ADDR 0xC0000400 |
49 | 49 | ||
50 | struct threshold_block { | 50 | struct threshold_block { |
51 | unsigned int block; | 51 | unsigned int block; |
52 | unsigned int bank; | 52 | unsigned int bank; |
53 | unsigned int cpu; | 53 | unsigned int cpu; |
54 | u32 address; | 54 | u32 address; |
55 | u16 interrupt_enable; | 55 | u16 interrupt_enable; |
56 | u16 threshold_limit; | 56 | u16 threshold_limit; |
57 | struct kobject kobj; | 57 | struct kobject kobj; |
58 | struct list_head miscj; | 58 | struct list_head miscj; |
59 | }; | 59 | }; |
60 | 60 | ||
61 | /* defaults used early on boot */ | 61 | /* defaults used early on boot */ |
62 | static struct threshold_block threshold_defaults = { | 62 | static struct threshold_block threshold_defaults = { |
63 | .interrupt_enable = 0, | 63 | .interrupt_enable = 0, |
64 | .threshold_limit = THRESHOLD_MAX, | 64 | .threshold_limit = THRESHOLD_MAX, |
65 | }; | 65 | }; |
66 | 66 | ||
67 | struct threshold_bank { | 67 | struct threshold_bank { |
68 | struct kobject *kobj; | 68 | struct kobject *kobj; |
69 | struct threshold_block *blocks; | 69 | struct threshold_block *blocks; |
70 | cpumask_var_t cpus; | 70 | cpumask_var_t cpus; |
71 | }; | 71 | }; |
72 | static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]); | 72 | static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]); |
73 | 73 | ||
@@ -86,9 +86,9 @@ static void amd_threshold_interrupt(void); | |||
86 | */ | 86 | */ |
87 | 87 | ||
88 | struct thresh_restart { | 88 | struct thresh_restart { |
89 | struct threshold_block *b; | 89 | struct threshold_block *b; |
90 | int reset; | 90 | int reset; |
91 | u16 old_limit; | 91 | u16 old_limit; |
92 | }; | 92 | }; |
93 | 93 | ||
94 | /* must be called with correct cpu affinity */ | 94 | /* must be called with correct cpu affinity */ |
@@ -110,6 +110,7 @@ static void threshold_restart_bank(void *_tr) | |||
110 | } else if (tr->old_limit) { /* change limit w/o reset */ | 110 | } else if (tr->old_limit) { /* change limit w/o reset */ |
111 | int new_count = (mci_misc_hi & THRESHOLD_MAX) + | 111 | int new_count = (mci_misc_hi & THRESHOLD_MAX) + |
112 | (tr->old_limit - tr->b->threshold_limit); | 112 | (tr->old_limit - tr->b->threshold_limit); |
113 | |||
113 | mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) | | 114 | mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) | |
114 | (new_count & THRESHOLD_MAX); | 115 | (new_count & THRESHOLD_MAX); |
115 | } | 116 | } |
@@ -125,11 +126,11 @@ static void threshold_restart_bank(void *_tr) | |||
125 | /* cpu init entry point, called from mce.c with preempt off */ | 126 | /* cpu init entry point, called from mce.c with preempt off */ |
126 | void mce_amd_feature_init(struct cpuinfo_x86 *c) | 127 | void mce_amd_feature_init(struct cpuinfo_x86 *c) |
127 | { | 128 | { |
128 | unsigned int bank, block; | ||
129 | unsigned int cpu = smp_processor_id(); | 129 | unsigned int cpu = smp_processor_id(); |
130 | u8 lvt_off; | ||
131 | u32 low = 0, high = 0, address = 0; | 130 | u32 low = 0, high = 0, address = 0; |
131 | unsigned int bank, block; | ||
132 | struct thresh_restart tr; | 132 | struct thresh_restart tr; |
133 | u8 lvt_off; | ||
133 | 134 | ||
134 | for (bank = 0; bank < NR_BANKS; ++bank) { | 135 | for (bank = 0; bank < NR_BANKS; ++bank) { |
135 | for (block = 0; block < NR_BLOCKS; ++block) { | 136 | for (block = 0; block < NR_BLOCKS; ++block) { |
@@ -140,8 +141,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) | |||
140 | if (!address) | 141 | if (!address) |
141 | break; | 142 | break; |
142 | address += MCG_XBLK_ADDR; | 143 | address += MCG_XBLK_ADDR; |
143 | } | 144 | } else |
144 | else | ||
145 | ++address; | 145 | ++address; |
146 | 146 | ||
147 | if (rdmsr_safe(address, &low, &high)) | 147 | if (rdmsr_safe(address, &low, &high)) |
@@ -193,9 +193,9 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) | |||
193 | */ | 193 | */ |
194 | static void amd_threshold_interrupt(void) | 194 | static void amd_threshold_interrupt(void) |
195 | { | 195 | { |
196 | u32 low = 0, high = 0, address = 0; | ||
196 | unsigned int bank, block; | 197 | unsigned int bank, block; |
197 | struct mce m; | 198 | struct mce m; |
198 | u32 low = 0, high = 0, address = 0; | ||
199 | 199 | ||
200 | mce_setup(&m); | 200 | mce_setup(&m); |
201 | 201 | ||
@@ -204,16 +204,16 @@ static void amd_threshold_interrupt(void) | |||
204 | if (!(per_cpu(bank_map, m.cpu) & (1 << bank))) | 204 | if (!(per_cpu(bank_map, m.cpu) & (1 << bank))) |
205 | continue; | 205 | continue; |
206 | for (block = 0; block < NR_BLOCKS; ++block) { | 206 | for (block = 0; block < NR_BLOCKS; ++block) { |
207 | if (block == 0) | 207 | if (block == 0) { |
208 | address = MSR_IA32_MC0_MISC + bank * 4; | 208 | address = MSR_IA32_MC0_MISC + bank * 4; |
209 | else if (block == 1) { | 209 | } else if (block == 1) { |
210 | address = (low & MASK_BLKPTR_LO) >> 21; | 210 | address = (low & MASK_BLKPTR_LO) >> 21; |
211 | if (!address) | 211 | if (!address) |
212 | break; | 212 | break; |
213 | address += MCG_XBLK_ADDR; | 213 | address += MCG_XBLK_ADDR; |
214 | } | 214 | } else { |
215 | else | ||
216 | ++address; | 215 | ++address; |
216 | } | ||
217 | 217 | ||
218 | if (rdmsr_safe(address, &low, &high)) | 218 | if (rdmsr_safe(address, &low, &high)) |
219 | break; | 219 | break; |
@@ -229,8 +229,10 @@ static void amd_threshold_interrupt(void) | |||
229 | (high & MASK_LOCKED_HI)) | 229 | (high & MASK_LOCKED_HI)) |
230 | continue; | 230 | continue; |
231 | 231 | ||
232 | /* Log the machine check that caused the threshold | 232 | /* |
233 | event. */ | 233 | * Log the machine check that caused the threshold |
234 | * event. | ||
235 | */ | ||
234 | machine_check_poll(MCP_TIMESTAMP, | 236 | machine_check_poll(MCP_TIMESTAMP, |
235 | &__get_cpu_var(mce_poll_banks)); | 237 | &__get_cpu_var(mce_poll_banks)); |
236 | 238 | ||
@@ -254,48 +256,52 @@ static void amd_threshold_interrupt(void) | |||
254 | 256 | ||
255 | struct threshold_attr { | 257 | struct threshold_attr { |
256 | struct attribute attr; | 258 | struct attribute attr; |
257 | ssize_t(*show) (struct threshold_block *, char *); | 259 | ssize_t (*show) (struct threshold_block *, char *); |
258 | ssize_t(*store) (struct threshold_block *, const char *, size_t count); | 260 | ssize_t (*store) (struct threshold_block *, const char *, size_t count); |
259 | }; | 261 | }; |
260 | 262 | ||
261 | #define SHOW_FIELDS(name) \ | 263 | #define SHOW_FIELDS(name) \ |
262 | static ssize_t show_ ## name(struct threshold_block * b, char *buf) \ | 264 | static ssize_t show_ ## name(struct threshold_block *b, char *buf) \ |
263 | { \ | 265 | { \ |
264 | return sprintf(buf, "%lx\n", (unsigned long) b->name); \ | 266 | return sprintf(buf, "%lx\n", (unsigned long) b->name); \ |
265 | } | 267 | } |
266 | SHOW_FIELDS(interrupt_enable) | 268 | SHOW_FIELDS(interrupt_enable) |
267 | SHOW_FIELDS(threshold_limit) | 269 | SHOW_FIELDS(threshold_limit) |
268 | 270 | ||
269 | static ssize_t store_interrupt_enable(struct threshold_block *b, | 271 | static ssize_t |
270 | const char *buf, size_t count) | 272 | store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size) |
271 | { | 273 | { |
272 | char *end; | ||
273 | struct thresh_restart tr; | 274 | struct thresh_restart tr; |
274 | unsigned long new = simple_strtoul(buf, &end, 0); | 275 | unsigned long new; |
275 | if (end == buf) | 276 | |
277 | if (strict_strtoul(buf, 0, &new) < 0) | ||
276 | return -EINVAL; | 278 | return -EINVAL; |
279 | |||
277 | b->interrupt_enable = !!new; | 280 | b->interrupt_enable = !!new; |
278 | 281 | ||
279 | tr.b = b; | 282 | tr.b = b; |
280 | tr.reset = 0; | 283 | tr.reset = 0; |
281 | tr.old_limit = 0; | 284 | tr.old_limit = 0; |
285 | |||
282 | smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); | 286 | smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); |
283 | 287 | ||
284 | return end - buf; | 288 | return size; |
285 | } | 289 | } |
286 | 290 | ||
287 | static ssize_t store_threshold_limit(struct threshold_block *b, | 291 | static ssize_t |
288 | const char *buf, size_t count) | 292 | store_threshold_limit(struct threshold_block *b, const char *buf, size_t size) |
289 | { | 293 | { |
290 | char *end; | ||
291 | struct thresh_restart tr; | 294 | struct thresh_restart tr; |
292 | unsigned long new = simple_strtoul(buf, &end, 0); | 295 | unsigned long new; |
293 | if (end == buf) | 296 | |
297 | if (strict_strtoul(buf, 0, &new) < 0) | ||
294 | return -EINVAL; | 298 | return -EINVAL; |
299 | |||
295 | if (new > THRESHOLD_MAX) | 300 | if (new > THRESHOLD_MAX) |
296 | new = THRESHOLD_MAX; | 301 | new = THRESHOLD_MAX; |
297 | if (new < 1) | 302 | if (new < 1) |
298 | new = 1; | 303 | new = 1; |
304 | |||
299 | tr.old_limit = b->threshold_limit; | 305 | tr.old_limit = b->threshold_limit; |
300 | b->threshold_limit = new; | 306 | b->threshold_limit = new; |
301 | tr.b = b; | 307 | tr.b = b; |
@@ -303,12 +309,12 @@ static ssize_t store_threshold_limit(struct threshold_block *b, | |||
303 | 309 | ||
304 | smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); | 310 | smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); |
305 | 311 | ||
306 | return end - buf; | 312 | return size; |
307 | } | 313 | } |
308 | 314 | ||
309 | struct threshold_block_cross_cpu { | 315 | struct threshold_block_cross_cpu { |
310 | struct threshold_block *tb; | 316 | struct threshold_block *tb; |
311 | long retval; | 317 | long retval; |
312 | }; | 318 | }; |
313 | 319 | ||
314 | static void local_error_count_handler(void *_tbcc) | 320 | static void local_error_count_handler(void *_tbcc) |
@@ -338,16 +344,13 @@ static ssize_t store_error_count(struct threshold_block *b, | |||
338 | return 1; | 344 | return 1; |
339 | } | 345 | } |
340 | 346 | ||
341 | #define THRESHOLD_ATTR(_name,_mode,_show,_store) { \ | 347 | #define RW_ATTR(val) \ |
342 | .attr = {.name = __stringify(_name), .mode = _mode }, \ | 348 | static struct threshold_attr val = { \ |
343 | .show = _show, \ | 349 | .attr = {.name = __stringify(val), .mode = 0644 }, \ |
344 | .store = _store, \ | 350 | .show = show_## val, \ |
351 | .store = store_## val, \ | ||
345 | }; | 352 | }; |
346 | 353 | ||
347 | #define RW_ATTR(name) \ | ||
348 | static struct threshold_attr name = \ | ||
349 | THRESHOLD_ATTR(name, 0644, show_## name, store_## name) | ||
350 | |||
351 | RW_ATTR(interrupt_enable); | 354 | RW_ATTR(interrupt_enable); |
352 | RW_ATTR(threshold_limit); | 355 | RW_ATTR(threshold_limit); |
353 | RW_ATTR(error_count); | 356 | RW_ATTR(error_count); |
@@ -359,15 +362,17 @@ static struct attribute *default_attrs[] = { | |||
359 | NULL | 362 | NULL |
360 | }; | 363 | }; |
361 | 364 | ||
362 | #define to_block(k) container_of(k, struct threshold_block, kobj) | 365 | #define to_block(k) container_of(k, struct threshold_block, kobj) |
363 | #define to_attr(a) container_of(a, struct threshold_attr, attr) | 366 | #define to_attr(a) container_of(a, struct threshold_attr, attr) |
364 | 367 | ||
365 | static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) | 368 | static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) |
366 | { | 369 | { |
367 | struct threshold_block *b = to_block(kobj); | 370 | struct threshold_block *b = to_block(kobj); |
368 | struct threshold_attr *a = to_attr(attr); | 371 | struct threshold_attr *a = to_attr(attr); |
369 | ssize_t ret; | 372 | ssize_t ret; |
373 | |||
370 | ret = a->show ? a->show(b, buf) : -EIO; | 374 | ret = a->show ? a->show(b, buf) : -EIO; |
375 | |||
371 | return ret; | 376 | return ret; |
372 | } | 377 | } |
373 | 378 | ||
@@ -377,18 +382,20 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr, | |||
377 | struct threshold_block *b = to_block(kobj); | 382 | struct threshold_block *b = to_block(kobj); |
378 | struct threshold_attr *a = to_attr(attr); | 383 | struct threshold_attr *a = to_attr(attr); |
379 | ssize_t ret; | 384 | ssize_t ret; |
385 | |||
380 | ret = a->store ? a->store(b, buf, count) : -EIO; | 386 | ret = a->store ? a->store(b, buf, count) : -EIO; |
387 | |||
381 | return ret; | 388 | return ret; |
382 | } | 389 | } |
383 | 390 | ||
384 | static struct sysfs_ops threshold_ops = { | 391 | static struct sysfs_ops threshold_ops = { |
385 | .show = show, | 392 | .show = show, |
386 | .store = store, | 393 | .store = store, |
387 | }; | 394 | }; |
388 | 395 | ||
389 | static struct kobj_type threshold_ktype = { | 396 | static struct kobj_type threshold_ktype = { |
390 | .sysfs_ops = &threshold_ops, | 397 | .sysfs_ops = &threshold_ops, |
391 | .default_attrs = default_attrs, | 398 | .default_attrs = default_attrs, |
392 | }; | 399 | }; |
393 | 400 | ||
394 | static __cpuinit int allocate_threshold_blocks(unsigned int cpu, | 401 | static __cpuinit int allocate_threshold_blocks(unsigned int cpu, |
@@ -396,9 +403,9 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu, | |||
396 | unsigned int block, | 403 | unsigned int block, |
397 | u32 address) | 404 | u32 address) |
398 | { | 405 | { |
399 | int err; | ||
400 | u32 low, high; | ||
401 | struct threshold_block *b = NULL; | 406 | struct threshold_block *b = NULL; |
407 | u32 low, high; | ||
408 | int err; | ||
402 | 409 | ||
403 | if ((bank >= NR_BANKS) || (block >= NR_BLOCKS)) | 410 | if ((bank >= NR_BANKS) || (block >= NR_BLOCKS)) |
404 | return 0; | 411 | return 0; |
@@ -421,20 +428,21 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu, | |||
421 | if (!b) | 428 | if (!b) |
422 | return -ENOMEM; | 429 | return -ENOMEM; |
423 | 430 | ||
424 | b->block = block; | 431 | b->block = block; |
425 | b->bank = bank; | 432 | b->bank = bank; |
426 | b->cpu = cpu; | 433 | b->cpu = cpu; |
427 | b->address = address; | 434 | b->address = address; |
428 | b->interrupt_enable = 0; | 435 | b->interrupt_enable = 0; |
429 | b->threshold_limit = THRESHOLD_MAX; | 436 | b->threshold_limit = THRESHOLD_MAX; |
430 | 437 | ||
431 | INIT_LIST_HEAD(&b->miscj); | 438 | INIT_LIST_HEAD(&b->miscj); |
432 | 439 | ||
433 | if (per_cpu(threshold_banks, cpu)[bank]->blocks) | 440 | if (per_cpu(threshold_banks, cpu)[bank]->blocks) { |
434 | list_add(&b->miscj, | 441 | list_add(&b->miscj, |
435 | &per_cpu(threshold_banks, cpu)[bank]->blocks->miscj); | 442 | &per_cpu(threshold_banks, cpu)[bank]->blocks->miscj); |
436 | else | 443 | } else { |
437 | per_cpu(threshold_banks, cpu)[bank]->blocks = b; | 444 | per_cpu(threshold_banks, cpu)[bank]->blocks = b; |
445 | } | ||
438 | 446 | ||
439 | err = kobject_init_and_add(&b->kobj, &threshold_ktype, | 447 | err = kobject_init_and_add(&b->kobj, &threshold_ktype, |
440 | per_cpu(threshold_banks, cpu)[bank]->kobj, | 448 | per_cpu(threshold_banks, cpu)[bank]->kobj, |
@@ -447,8 +455,9 @@ recurse: | |||
447 | if (!address) | 455 | if (!address) |
448 | return 0; | 456 | return 0; |
449 | address += MCG_XBLK_ADDR; | 457 | address += MCG_XBLK_ADDR; |
450 | } else | 458 | } else { |
451 | ++address; | 459 | ++address; |
460 | } | ||
452 | 461 | ||
453 | err = allocate_threshold_blocks(cpu, bank, ++block, address); | 462 | err = allocate_threshold_blocks(cpu, bank, ++block, address); |
454 | if (err) | 463 | if (err) |
@@ -500,13 +509,14 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) | |||
500 | if (!b) | 509 | if (!b) |
501 | goto out; | 510 | goto out; |
502 | 511 | ||
503 | err = sysfs_create_link(&per_cpu(device_mce, cpu).kobj, | 512 | err = sysfs_create_link(&per_cpu(mce_dev, cpu).kobj, |
504 | b->kobj, name); | 513 | b->kobj, name); |
505 | if (err) | 514 | if (err) |
506 | goto out; | 515 | goto out; |
507 | 516 | ||
508 | cpumask_copy(b->cpus, cpu_core_mask(cpu)); | 517 | cpumask_copy(b->cpus, cpu_core_mask(cpu)); |
509 | per_cpu(threshold_banks, cpu)[bank] = b; | 518 | per_cpu(threshold_banks, cpu)[bank] = b; |
519 | |||
510 | goto out; | 520 | goto out; |
511 | } | 521 | } |
512 | #endif | 522 | #endif |
@@ -522,7 +532,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) | |||
522 | goto out; | 532 | goto out; |
523 | } | 533 | } |
524 | 534 | ||
525 | b->kobj = kobject_create_and_add(name, &per_cpu(device_mce, cpu).kobj); | 535 | b->kobj = kobject_create_and_add(name, &per_cpu(mce_dev, cpu).kobj); |
526 | if (!b->kobj) | 536 | if (!b->kobj) |
527 | goto out_free; | 537 | goto out_free; |
528 | 538 | ||
@@ -542,7 +552,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) | |||
542 | if (i == cpu) | 552 | if (i == cpu) |
543 | continue; | 553 | continue; |
544 | 554 | ||
545 | err = sysfs_create_link(&per_cpu(device_mce, i).kobj, | 555 | err = sysfs_create_link(&per_cpu(mce_dev, i).kobj, |
546 | b->kobj, name); | 556 | b->kobj, name); |
547 | if (err) | 557 | if (err) |
548 | goto out; | 558 | goto out; |
@@ -605,15 +615,13 @@ static void deallocate_threshold_block(unsigned int cpu, | |||
605 | 615 | ||
606 | static void threshold_remove_bank(unsigned int cpu, int bank) | 616 | static void threshold_remove_bank(unsigned int cpu, int bank) |
607 | { | 617 | { |
608 | int i = 0; | ||
609 | struct threshold_bank *b; | 618 | struct threshold_bank *b; |
610 | char name[32]; | 619 | char name[32]; |
620 | int i = 0; | ||
611 | 621 | ||
612 | b = per_cpu(threshold_banks, cpu)[bank]; | 622 | b = per_cpu(threshold_banks, cpu)[bank]; |
613 | |||
614 | if (!b) | 623 | if (!b) |
615 | return; | 624 | return; |
616 | |||
617 | if (!b->blocks) | 625 | if (!b->blocks) |
618 | goto free_out; | 626 | goto free_out; |
619 | 627 | ||
@@ -622,8 +630,9 @@ static void threshold_remove_bank(unsigned int cpu, int bank) | |||
622 | #ifdef CONFIG_SMP | 630 | #ifdef CONFIG_SMP |
623 | /* sibling symlink */ | 631 | /* sibling symlink */ |
624 | if (shared_bank[bank] && b->blocks->cpu != cpu) { | 632 | if (shared_bank[bank] && b->blocks->cpu != cpu) { |
625 | sysfs_remove_link(&per_cpu(device_mce, cpu).kobj, name); | 633 | sysfs_remove_link(&per_cpu(mce_dev, cpu).kobj, name); |
626 | per_cpu(threshold_banks, cpu)[bank] = NULL; | 634 | per_cpu(threshold_banks, cpu)[bank] = NULL; |
635 | |||
627 | return; | 636 | return; |
628 | } | 637 | } |
629 | #endif | 638 | #endif |
@@ -633,7 +642,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank) | |||
633 | if (i == cpu) | 642 | if (i == cpu) |
634 | continue; | 643 | continue; |
635 | 644 | ||
636 | sysfs_remove_link(&per_cpu(device_mce, i).kobj, name); | 645 | sysfs_remove_link(&per_cpu(mce_dev, i).kobj, name); |
637 | per_cpu(threshold_banks, i)[bank] = NULL; | 646 | per_cpu(threshold_banks, i)[bank] = NULL; |
638 | } | 647 | } |
639 | 648 | ||
@@ -659,12 +668,9 @@ static void threshold_remove_device(unsigned int cpu) | |||
659 | } | 668 | } |
660 | 669 | ||
661 | /* get notified when a cpu comes on/off */ | 670 | /* get notified when a cpu comes on/off */ |
662 | static void __cpuinit amd_64_threshold_cpu_callback(unsigned long action, | 671 | static void __cpuinit |
663 | unsigned int cpu) | 672 | amd_64_threshold_cpu_callback(unsigned long action, unsigned int cpu) |
664 | { | 673 | { |
665 | if (cpu >= NR_CPUS) | ||
666 | return; | ||
667 | |||
668 | switch (action) { | 674 | switch (action) { |
669 | case CPU_ONLINE: | 675 | case CPU_ONLINE: |
670 | case CPU_ONLINE_FROZEN: | 676 | case CPU_ONLINE_FROZEN: |
@@ -686,11 +692,12 @@ static __init int threshold_init_device(void) | |||
686 | /* to hit CPUs online before the notifier is up */ | 692 | /* to hit CPUs online before the notifier is up */ |
687 | for_each_online_cpu(lcpu) { | 693 | for_each_online_cpu(lcpu) { |
688 | int err = threshold_create_device(lcpu); | 694 | int err = threshold_create_device(lcpu); |
695 | |||
689 | if (err) | 696 | if (err) |
690 | return err; | 697 | return err; |
691 | } | 698 | } |
692 | threshold_cpu_callback = amd_64_threshold_cpu_callback; | 699 | threshold_cpu_callback = amd_64_threshold_cpu_callback; |
700 | |||
693 | return 0; | 701 | return 0; |
694 | } | 702 | } |
695 | |||
696 | device_initcall(threshold_init_device); | 703 | device_initcall(threshold_init_device); |
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c new file mode 100644 index 000000000000..2b011d2d8579 --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c | |||
@@ -0,0 +1,74 @@ | |||
1 | /* | ||
2 | * Common code for Intel machine checks | ||
3 | */ | ||
4 | #include <linux/interrupt.h> | ||
5 | #include <linux/kernel.h> | ||
6 | #include <linux/types.h> | ||
7 | #include <linux/init.h> | ||
8 | #include <linux/smp.h> | ||
9 | |||
10 | #include <asm/therm_throt.h> | ||
11 | #include <asm/processor.h> | ||
12 | #include <asm/system.h> | ||
13 | #include <asm/apic.h> | ||
14 | #include <asm/msr.h> | ||
15 | |||
16 | #include "mce.h" | ||
17 | |||
18 | void intel_init_thermal(struct cpuinfo_x86 *c) | ||
19 | { | ||
20 | unsigned int cpu = smp_processor_id(); | ||
21 | int tm2 = 0; | ||
22 | u32 l, h; | ||
23 | |||
24 | /* Thermal monitoring depends on ACPI and clock modulation*/ | ||
25 | if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC)) | ||
26 | return; | ||
27 | |||
28 | /* | ||
29 | * First check if its enabled already, in which case there might | ||
30 | * be some SMM goo which handles it, so we can't even put a handler | ||
31 | * since it might be delivered via SMI already: | ||
32 | */ | ||
33 | rdmsr(MSR_IA32_MISC_ENABLE, l, h); | ||
34 | h = apic_read(APIC_LVTTHMR); | ||
35 | if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { | ||
36 | printk(KERN_DEBUG | ||
37 | "CPU%d: Thermal monitoring handled by SMI\n", cpu); | ||
38 | return; | ||
39 | } | ||
40 | |||
41 | if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2)) | ||
42 | tm2 = 1; | ||
43 | |||
44 | /* Check whether a vector already exists */ | ||
45 | if (h & APIC_VECTOR_MASK) { | ||
46 | printk(KERN_DEBUG | ||
47 | "CPU%d: Thermal LVT vector (%#x) already installed\n", | ||
48 | cpu, (h & APIC_VECTOR_MASK)); | ||
49 | return; | ||
50 | } | ||
51 | |||
52 | /* We'll mask the thermal vector in the lapic till we're ready: */ | ||
53 | h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED; | ||
54 | apic_write(APIC_LVTTHMR, h); | ||
55 | |||
56 | rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); | ||
57 | wrmsr(MSR_IA32_THERM_INTERRUPT, | ||
58 | l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h); | ||
59 | |||
60 | intel_set_thermal_handler(); | ||
61 | |||
62 | rdmsr(MSR_IA32_MISC_ENABLE, l, h); | ||
63 | wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h); | ||
64 | |||
65 | /* Unmask the thermal vector: */ | ||
66 | l = apic_read(APIC_LVTTHMR); | ||
67 | apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); | ||
68 | |||
69 | printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n", | ||
70 | cpu, tm2 ? "TM2" : "TM1"); | ||
71 | |||
72 | /* enable thermal throttle processing */ | ||
73 | atomic_set(&therm_throt_en, 1); | ||
74 | } | ||
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c index cef3ee30744b..f2ef6952c400 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c | |||
@@ -15,7 +15,8 @@ | |||
15 | #include <asm/hw_irq.h> | 15 | #include <asm/hw_irq.h> |
16 | #include <asm/idle.h> | 16 | #include <asm/idle.h> |
17 | #include <asm/therm_throt.h> | 17 | #include <asm/therm_throt.h> |
18 | #include <asm/apic.h> | 18 | |
19 | #include "mce.h" | ||
19 | 20 | ||
20 | asmlinkage void smp_thermal_interrupt(void) | 21 | asmlinkage void smp_thermal_interrupt(void) |
21 | { | 22 | { |
@@ -27,67 +28,13 @@ asmlinkage void smp_thermal_interrupt(void) | |||
27 | irq_enter(); | 28 | irq_enter(); |
28 | 29 | ||
29 | rdmsrl(MSR_IA32_THERM_STATUS, msr_val); | 30 | rdmsrl(MSR_IA32_THERM_STATUS, msr_val); |
30 | if (therm_throt_process(msr_val & 1)) | 31 | if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT)) |
31 | mce_log_therm_throt_event(msr_val); | 32 | mce_log_therm_throt_event(msr_val); |
32 | 33 | ||
33 | inc_irq_stat(irq_thermal_count); | 34 | inc_irq_stat(irq_thermal_count); |
34 | irq_exit(); | 35 | irq_exit(); |
35 | } | 36 | } |
36 | 37 | ||
37 | static void intel_init_thermal(struct cpuinfo_x86 *c) | ||
38 | { | ||
39 | u32 l, h; | ||
40 | int tm2 = 0; | ||
41 | unsigned int cpu = smp_processor_id(); | ||
42 | |||
43 | if (!cpu_has(c, X86_FEATURE_ACPI)) | ||
44 | return; | ||
45 | |||
46 | if (!cpu_has(c, X86_FEATURE_ACC)) | ||
47 | return; | ||
48 | |||
49 | /* first check if TM1 is already enabled by the BIOS, in which | ||
50 | * case there might be some SMM goo which handles it, so we can't even | ||
51 | * put a handler since it might be delivered via SMI already. | ||
52 | */ | ||
53 | rdmsr(MSR_IA32_MISC_ENABLE, l, h); | ||
54 | h = apic_read(APIC_LVTTHMR); | ||
55 | if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { | ||
56 | printk(KERN_DEBUG | ||
57 | "CPU%d: Thermal monitoring handled by SMI\n", cpu); | ||
58 | return; | ||
59 | } | ||
60 | |||
61 | if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2)) | ||
62 | tm2 = 1; | ||
63 | |||
64 | if (h & APIC_VECTOR_MASK) { | ||
65 | printk(KERN_DEBUG | ||
66 | "CPU%d: Thermal LVT vector (%#x) already " | ||
67 | "installed\n", cpu, (h & APIC_VECTOR_MASK)); | ||
68 | return; | ||
69 | } | ||
70 | |||
71 | h = THERMAL_APIC_VECTOR; | ||
72 | h |= (APIC_DM_FIXED | APIC_LVT_MASKED); | ||
73 | apic_write(APIC_LVTTHMR, h); | ||
74 | |||
75 | rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); | ||
76 | wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03, h); | ||
77 | |||
78 | rdmsr(MSR_IA32_MISC_ENABLE, l, h); | ||
79 | wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h); | ||
80 | |||
81 | l = apic_read(APIC_LVTTHMR); | ||
82 | apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); | ||
83 | printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n", | ||
84 | cpu, tm2 ? "TM2" : "TM1"); | ||
85 | |||
86 | /* enable thermal throttle processing */ | ||
87 | atomic_set(&therm_throt_en, 1); | ||
88 | return; | ||
89 | } | ||
90 | |||
91 | /* | 38 | /* |
92 | * Support for Intel Correct Machine Check Interrupts. This allows | 39 | * Support for Intel Correct Machine Check Interrupts. This allows |
93 | * the CPU to raise an interrupt when a corrected machine check happened. | 40 | * the CPU to raise an interrupt when a corrected machine check happened. |
@@ -109,6 +56,9 @@ static int cmci_supported(int *banks) | |||
109 | { | 56 | { |
110 | u64 cap; | 57 | u64 cap; |
111 | 58 | ||
59 | if (mce_cmci_disabled || mce_ignore_ce) | ||
60 | return 0; | ||
61 | |||
112 | /* | 62 | /* |
113 | * Vendor check is not strictly needed, but the initial | 63 | * Vendor check is not strictly needed, but the initial |
114 | * initialization is vendor keyed and this | 64 | * initialization is vendor keyed and this |
@@ -132,7 +82,7 @@ static int cmci_supported(int *banks) | |||
132 | static void intel_threshold_interrupt(void) | 82 | static void intel_threshold_interrupt(void) |
133 | { | 83 | { |
134 | machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); | 84 | machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); |
135 | mce_notify_user(); | 85 | mce_notify_irq(); |
136 | } | 86 | } |
137 | 87 | ||
138 | static void print_update(char *type, int *hdr, int num) | 88 | static void print_update(char *type, int *hdr, int num) |
@@ -248,7 +198,7 @@ void cmci_rediscover(int dying) | |||
248 | return; | 198 | return; |
249 | cpumask_copy(old, &current->cpus_allowed); | 199 | cpumask_copy(old, &current->cpus_allowed); |
250 | 200 | ||
251 | for_each_online_cpu (cpu) { | 201 | for_each_online_cpu(cpu) { |
252 | if (cpu == dying) | 202 | if (cpu == dying) |
253 | continue; | 203 | continue; |
254 | if (set_cpus_allowed_ptr(current, cpumask_of(cpu))) | 204 | if (set_cpus_allowed_ptr(current, cpumask_of(cpu))) |
diff --git a/arch/x86/kernel/cpu/mcheck/non-fatal.c b/arch/x86/kernel/cpu/mcheck/non-fatal.c index a74af128efc9..70b710420f74 100644 --- a/arch/x86/kernel/cpu/mcheck/non-fatal.c +++ b/arch/x86/kernel/cpu/mcheck/non-fatal.c | |||
@@ -6,15 +6,14 @@ | |||
6 | * This file contains routines to check for non-fatal MCEs every 15s | 6 | * This file contains routines to check for non-fatal MCEs every 15s |
7 | * | 7 | * |
8 | */ | 8 | */ |
9 | |||
10 | #include <linux/init.h> | ||
11 | #include <linux/types.h> | ||
12 | #include <linux/kernel.h> | ||
13 | #include <linux/jiffies.h> | ||
14 | #include <linux/workqueue.h> | ||
15 | #include <linux/interrupt.h> | 9 | #include <linux/interrupt.h> |
16 | #include <linux/smp.h> | 10 | #include <linux/workqueue.h> |
11 | #include <linux/jiffies.h> | ||
12 | #include <linux/kernel.h> | ||
17 | #include <linux/module.h> | 13 | #include <linux/module.h> |
14 | #include <linux/types.h> | ||
15 | #include <linux/init.h> | ||
16 | #include <linux/smp.h> | ||
18 | 17 | ||
19 | #include <asm/processor.h> | 18 | #include <asm/processor.h> |
20 | #include <asm/system.h> | 19 | #include <asm/system.h> |
@@ -22,9 +21,9 @@ | |||
22 | 21 | ||
23 | #include "mce.h" | 22 | #include "mce.h" |
24 | 23 | ||
25 | static int firstbank; | 24 | static int firstbank; |
26 | 25 | ||
27 | #define MCE_RATE 15*HZ /* timer rate is 15s */ | 26 | #define MCE_RATE (15*HZ) /* timer rate is 15s */ |
28 | 27 | ||
29 | static void mce_checkregs(void *info) | 28 | static void mce_checkregs(void *info) |
30 | { | 29 | { |
@@ -34,23 +33,24 @@ static void mce_checkregs(void *info) | |||
34 | for (i = firstbank; i < nr_mce_banks; i++) { | 33 | for (i = firstbank; i < nr_mce_banks; i++) { |
35 | rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); | 34 | rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); |
36 | 35 | ||
37 | if (high & (1<<31)) { | 36 | if (!(high & (1<<31))) |
38 | printk(KERN_INFO "MCE: The hardware reports a non " | 37 | continue; |
39 | "fatal, correctable incident occurred on " | 38 | |
40 | "CPU %d.\n", | 39 | printk(KERN_INFO "MCE: The hardware reports a non fatal, " |
40 | "correctable incident occurred on CPU %d.\n", | ||
41 | smp_processor_id()); | 41 | smp_processor_id()); |
42 | printk(KERN_INFO "Bank %d: %08x%08x\n", i, high, low); | 42 | |
43 | 43 | printk(KERN_INFO "Bank %d: %08x%08x\n", i, high, low); | |
44 | /* | 44 | |
45 | * Scrub the error so we don't pick it up in MCE_RATE | 45 | /* |
46 | * seconds time. | 46 | * Scrub the error so we don't pick it up in MCE_RATE |
47 | */ | 47 | * seconds time: |
48 | wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL); | 48 | */ |
49 | 49 | wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL); | |
50 | /* Serialize */ | 50 | |
51 | wmb(); | 51 | /* Serialize: */ |
52 | add_taint(TAINT_MACHINE_CHECK); | 52 | wmb(); |
53 | } | 53 | add_taint(TAINT_MACHINE_CHECK); |
54 | } | 54 | } |
55 | } | 55 | } |
56 | 56 | ||
@@ -77,16 +77,17 @@ static int __init init_nonfatal_mce_checker(void) | |||
77 | 77 | ||
78 | /* Some Athlons misbehave when we frob bank 0 */ | 78 | /* Some Athlons misbehave when we frob bank 0 */ |
79 | if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && | 79 | if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && |
80 | boot_cpu_data.x86 == 6) | 80 | boot_cpu_data.x86 == 6) |
81 | firstbank = 1; | 81 | firstbank = 1; |
82 | else | 82 | else |
83 | firstbank = 0; | 83 | firstbank = 0; |
84 | 84 | ||
85 | /* | 85 | /* |
86 | * Check for non-fatal errors every MCE_RATE s | 86 | * Check for non-fatal errors every MCE_RATE s |
87 | */ | 87 | */ |
88 | schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE)); | 88 | schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE)); |
89 | printk(KERN_INFO "Machine check exception polling timer started.\n"); | 89 | printk(KERN_INFO "Machine check exception polling timer started.\n"); |
90 | |||
90 | return 0; | 91 | return 0; |
91 | } | 92 | } |
92 | module_init(init_nonfatal_mce_checker); | 93 | module_init(init_nonfatal_mce_checker); |
diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c index f53bdcbaf382..82cee108a2d3 100644 --- a/arch/x86/kernel/cpu/mcheck/p4.c +++ b/arch/x86/kernel/cpu/mcheck/p4.c | |||
@@ -2,18 +2,17 @@ | |||
2 | * P4 specific Machine Check Exception Reporting | 2 | * P4 specific Machine Check Exception Reporting |
3 | */ | 3 | */ |
4 | 4 | ||
5 | #include <linux/init.h> | ||
6 | #include <linux/types.h> | ||
7 | #include <linux/kernel.h> | ||
8 | #include <linux/interrupt.h> | 5 | #include <linux/interrupt.h> |
6 | #include <linux/kernel.h> | ||
7 | #include <linux/types.h> | ||
8 | #include <linux/init.h> | ||
9 | #include <linux/smp.h> | 9 | #include <linux/smp.h> |
10 | 10 | ||
11 | #include <asm/therm_throt.h> | ||
11 | #include <asm/processor.h> | 12 | #include <asm/processor.h> |
12 | #include <asm/system.h> | 13 | #include <asm/system.h> |
13 | #include <asm/msr.h> | ||
14 | #include <asm/apic.h> | 14 | #include <asm/apic.h> |
15 | 15 | #include <asm/msr.h> | |
16 | #include <asm/therm_throt.h> | ||
17 | 16 | ||
18 | #include "mce.h" | 17 | #include "mce.h" |
19 | 18 | ||
@@ -36,6 +35,7 @@ static int mce_num_extended_msrs; | |||
36 | 35 | ||
37 | 36 | ||
38 | #ifdef CONFIG_X86_MCE_P4THERMAL | 37 | #ifdef CONFIG_X86_MCE_P4THERMAL |
38 | |||
39 | static void unexpected_thermal_interrupt(struct pt_regs *regs) | 39 | static void unexpected_thermal_interrupt(struct pt_regs *regs) |
40 | { | 40 | { |
41 | printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n", | 41 | printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n", |
@@ -43,7 +43,7 @@ static void unexpected_thermal_interrupt(struct pt_regs *regs) | |||
43 | add_taint(TAINT_MACHINE_CHECK); | 43 | add_taint(TAINT_MACHINE_CHECK); |
44 | } | 44 | } |
45 | 45 | ||
46 | /* P4/Xeon Thermal transition interrupt handler */ | 46 | /* P4/Xeon Thermal transition interrupt handler: */ |
47 | static void intel_thermal_interrupt(struct pt_regs *regs) | 47 | static void intel_thermal_interrupt(struct pt_regs *regs) |
48 | { | 48 | { |
49 | __u64 msr_val; | 49 | __u64 msr_val; |
@@ -51,11 +51,12 @@ static void intel_thermal_interrupt(struct pt_regs *regs) | |||
51 | ack_APIC_irq(); | 51 | ack_APIC_irq(); |
52 | 52 | ||
53 | rdmsrl(MSR_IA32_THERM_STATUS, msr_val); | 53 | rdmsrl(MSR_IA32_THERM_STATUS, msr_val); |
54 | therm_throt_process(msr_val & 0x1); | 54 | therm_throt_process(msr_val & THERM_STATUS_PROCHOT); |
55 | } | 55 | } |
56 | 56 | ||
57 | /* Thermal interrupt handler for this CPU setup */ | 57 | /* Thermal interrupt handler for this CPU setup: */ |
58 | static void (*vendor_thermal_interrupt)(struct pt_regs *regs) = unexpected_thermal_interrupt; | 58 | static void (*vendor_thermal_interrupt)(struct pt_regs *regs) = |
59 | unexpected_thermal_interrupt; | ||
59 | 60 | ||
60 | void smp_thermal_interrupt(struct pt_regs *regs) | 61 | void smp_thermal_interrupt(struct pt_regs *regs) |
61 | { | 62 | { |
@@ -65,67 +66,15 @@ void smp_thermal_interrupt(struct pt_regs *regs) | |||
65 | irq_exit(); | 66 | irq_exit(); |
66 | } | 67 | } |
67 | 68 | ||
68 | /* P4/Xeon Thermal regulation detect and init */ | 69 | void intel_set_thermal_handler(void) |
69 | static void intel_init_thermal(struct cpuinfo_x86 *c) | ||
70 | { | 70 | { |
71 | u32 l, h; | ||
72 | unsigned int cpu = smp_processor_id(); | ||
73 | |||
74 | /* Thermal monitoring */ | ||
75 | if (!cpu_has(c, X86_FEATURE_ACPI)) | ||
76 | return; /* -ENODEV */ | ||
77 | |||
78 | /* Clock modulation */ | ||
79 | if (!cpu_has(c, X86_FEATURE_ACC)) | ||
80 | return; /* -ENODEV */ | ||
81 | |||
82 | /* first check if its enabled already, in which case there might | ||
83 | * be some SMM goo which handles it, so we can't even put a handler | ||
84 | * since it might be delivered via SMI already -zwanem. | ||
85 | */ | ||
86 | rdmsr(MSR_IA32_MISC_ENABLE, l, h); | ||
87 | h = apic_read(APIC_LVTTHMR); | ||
88 | if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { | ||
89 | printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n", | ||
90 | cpu); | ||
91 | return; /* -EBUSY */ | ||
92 | } | ||
93 | |||
94 | /* check whether a vector already exists, temporarily masked? */ | ||
95 | if (h & APIC_VECTOR_MASK) { | ||
96 | printk(KERN_DEBUG "CPU%d: Thermal LVT vector (%#x) already " | ||
97 | "installed\n", | ||
98 | cpu, (h & APIC_VECTOR_MASK)); | ||
99 | return; /* -EBUSY */ | ||
100 | } | ||
101 | |||
102 | /* The temperature transition interrupt handler setup */ | ||
103 | h = THERMAL_APIC_VECTOR; /* our delivery vector */ | ||
104 | h |= (APIC_DM_FIXED | APIC_LVT_MASKED); /* we'll mask till we're ready */ | ||
105 | apic_write(APIC_LVTTHMR, h); | ||
106 | |||
107 | rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); | ||
108 | wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03 , h); | ||
109 | |||
110 | /* ok we're good to go... */ | ||
111 | vendor_thermal_interrupt = intel_thermal_interrupt; | 71 | vendor_thermal_interrupt = intel_thermal_interrupt; |
112 | |||
113 | rdmsr(MSR_IA32_MISC_ENABLE, l, h); | ||
114 | wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h); | ||
115 | |||
116 | l = apic_read(APIC_LVTTHMR); | ||
117 | apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); | ||
118 | printk(KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu); | ||
119 | |||
120 | /* enable thermal throttle processing */ | ||
121 | atomic_set(&therm_throt_en, 1); | ||
122 | return; | ||
123 | } | 72 | } |
124 | #endif /* CONFIG_X86_MCE_P4THERMAL */ | ||
125 | 73 | ||
74 | #endif /* CONFIG_X86_MCE_P4THERMAL */ | ||
126 | 75 | ||
127 | /* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */ | 76 | /* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */ |
128 | static inline void intel_get_extended_msrs(struct intel_mce_extended_msrs *r) | 77 | static void intel_get_extended_msrs(struct intel_mce_extended_msrs *r) |
129 | { | 78 | { |
130 | u32 h; | 79 | u32 h; |
131 | 80 | ||
@@ -143,9 +92,9 @@ static inline void intel_get_extended_msrs(struct intel_mce_extended_msrs *r) | |||
143 | 92 | ||
144 | static void intel_machine_check(struct pt_regs *regs, long error_code) | 93 | static void intel_machine_check(struct pt_regs *regs, long error_code) |
145 | { | 94 | { |
146 | int recover = 1; | ||
147 | u32 alow, ahigh, high, low; | 95 | u32 alow, ahigh, high, low; |
148 | u32 mcgstl, mcgsth; | 96 | u32 mcgstl, mcgsth; |
97 | int recover = 1; | ||
149 | int i; | 98 | int i; |
150 | 99 | ||
151 | rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); | 100 | rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); |
@@ -157,7 +106,9 @@ static void intel_machine_check(struct pt_regs *regs, long error_code) | |||
157 | 106 | ||
158 | if (mce_num_extended_msrs > 0) { | 107 | if (mce_num_extended_msrs > 0) { |
159 | struct intel_mce_extended_msrs dbg; | 108 | struct intel_mce_extended_msrs dbg; |
109 | |||
160 | intel_get_extended_msrs(&dbg); | 110 | intel_get_extended_msrs(&dbg); |
111 | |||
161 | printk(KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n" | 112 | printk(KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n" |
162 | "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n" | 113 | "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n" |
163 | "\tesi: %08x edi: %08x ebp: %08x esp: %08x\n", | 114 | "\tesi: %08x edi: %08x ebp: %08x esp: %08x\n", |
@@ -171,6 +122,7 @@ static void intel_machine_check(struct pt_regs *regs, long error_code) | |||
171 | if (high & (1<<31)) { | 122 | if (high & (1<<31)) { |
172 | char misc[20]; | 123 | char misc[20]; |
173 | char addr[24]; | 124 | char addr[24]; |
125 | |||
174 | misc[0] = addr[0] = '\0'; | 126 | misc[0] = addr[0] = '\0'; |
175 | if (high & (1<<29)) | 127 | if (high & (1<<29)) |
176 | recover |= 1; | 128 | recover |= 1; |
@@ -196,6 +148,7 @@ static void intel_machine_check(struct pt_regs *regs, long error_code) | |||
196 | panic("Unable to continue"); | 148 | panic("Unable to continue"); |
197 | 149 | ||
198 | printk(KERN_EMERG "Attempting to continue.\n"); | 150 | printk(KERN_EMERG "Attempting to continue.\n"); |
151 | |||
199 | /* | 152 | /* |
200 | * Do not clear the MSR_IA32_MCi_STATUS if the error is not | 153 | * Do not clear the MSR_IA32_MCi_STATUS if the error is not |
201 | * recoverable/continuable.This will allow BIOS to look at the MSRs | 154 | * recoverable/continuable.This will allow BIOS to look at the MSRs |
@@ -217,7 +170,6 @@ static void intel_machine_check(struct pt_regs *regs, long error_code) | |||
217 | wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); | 170 | wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); |
218 | } | 171 | } |
219 | 172 | ||
220 | |||
221 | void intel_p4_mcheck_init(struct cpuinfo_x86 *c) | 173 | void intel_p4_mcheck_init(struct cpuinfo_x86 *c) |
222 | { | 174 | { |
223 | u32 l, h; | 175 | u32 l, h; |
diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c index c9f77ea69edc..015f481ab1b0 100644 --- a/arch/x86/kernel/cpu/mcheck/p5.c +++ b/arch/x86/kernel/cpu/mcheck/p5.c | |||
@@ -2,11 +2,10 @@ | |||
2 | * P5 specific Machine Check Exception Reporting | 2 | * P5 specific Machine Check Exception Reporting |
3 | * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk> | 3 | * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk> |
4 | */ | 4 | */ |
5 | |||
6 | #include <linux/init.h> | ||
7 | #include <linux/types.h> | ||
8 | #include <linux/kernel.h> | ||
9 | #include <linux/interrupt.h> | 5 | #include <linux/interrupt.h> |
6 | #include <linux/kernel.h> | ||
7 | #include <linux/types.h> | ||
8 | #include <linux/init.h> | ||
10 | #include <linux/smp.h> | 9 | #include <linux/smp.h> |
11 | 10 | ||
12 | #include <asm/processor.h> | 11 | #include <asm/processor.h> |
@@ -15,39 +14,58 @@ | |||
15 | 14 | ||
16 | #include "mce.h" | 15 | #include "mce.h" |
17 | 16 | ||
18 | /* Machine check handler for Pentium class Intel */ | 17 | /* By default disabled */ |
18 | int mce_p5_enable; | ||
19 | |||
20 | /* Machine check handler for Pentium class Intel CPUs: */ | ||
19 | static void pentium_machine_check(struct pt_regs *regs, long error_code) | 21 | static void pentium_machine_check(struct pt_regs *regs, long error_code) |
20 | { | 22 | { |
21 | u32 loaddr, hi, lotype; | 23 | u32 loaddr, hi, lotype; |
24 | |||
22 | rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi); | 25 | rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi); |
23 | rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi); | 26 | rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi); |
24 | printk(KERN_EMERG "CPU#%d: Machine Check Exception: 0x%8X (type 0x%8X).\n", smp_processor_id(), loaddr, lotype); | 27 | |
25 | if (lotype&(1<<5)) | 28 | printk(KERN_EMERG |
26 | printk(KERN_EMERG "CPU#%d: Possible thermal failure (CPU on fire ?).\n", smp_processor_id()); | 29 | "CPU#%d: Machine Check Exception: 0x%8X (type 0x%8X).\n", |
30 | smp_processor_id(), loaddr, lotype); | ||
31 | |||
32 | if (lotype & (1<<5)) { | ||
33 | printk(KERN_EMERG | ||
34 | "CPU#%d: Possible thermal failure (CPU on fire ?).\n", | ||
35 | smp_processor_id()); | ||
36 | } | ||
37 | |||
27 | add_taint(TAINT_MACHINE_CHECK); | 38 | add_taint(TAINT_MACHINE_CHECK); |
28 | } | 39 | } |
29 | 40 | ||
30 | /* Set up machine check reporting for processors with Intel style MCE */ | 41 | /* Set up machine check reporting for processors with Intel style MCE: */ |
31 | void intel_p5_mcheck_init(struct cpuinfo_x86 *c) | 42 | void intel_p5_mcheck_init(struct cpuinfo_x86 *c) |
32 | { | 43 | { |
33 | u32 l, h; | 44 | u32 l, h; |
34 | 45 | ||
35 | /*Check for MCE support */ | 46 | /* Check for MCE support: */ |
36 | if (!cpu_has(c, X86_FEATURE_MCE)) | 47 | if (!cpu_has(c, X86_FEATURE_MCE)) |
37 | return; | 48 | return; |
38 | 49 | ||
39 | /* Default P5 to off as its often misconnected */ | 50 | #ifdef CONFIG_X86_OLD_MCE |
51 | /* Default P5 to off as its often misconnected: */ | ||
40 | if (mce_disabled != -1) | 52 | if (mce_disabled != -1) |
41 | return; | 53 | return; |
54 | #endif | ||
55 | |||
42 | machine_check_vector = pentium_machine_check; | 56 | machine_check_vector = pentium_machine_check; |
57 | /* Make sure the vector pointer is visible before we enable MCEs: */ | ||
43 | wmb(); | 58 | wmb(); |
44 | 59 | ||
45 | /* Read registers before enabling */ | 60 | /* Read registers before enabling: */ |
46 | rdmsr(MSR_IA32_P5_MC_ADDR, l, h); | 61 | rdmsr(MSR_IA32_P5_MC_ADDR, l, h); |
47 | rdmsr(MSR_IA32_P5_MC_TYPE, l, h); | 62 | rdmsr(MSR_IA32_P5_MC_TYPE, l, h); |
48 | printk(KERN_INFO "Intel old style machine check architecture supported.\n"); | 63 | printk(KERN_INFO |
64 | "Intel old style machine check architecture supported.\n"); | ||
49 | 65 | ||
50 | /* Enable MCE */ | 66 | /* Enable MCE: */ |
51 | set_in_cr4(X86_CR4_MCE); | 67 | set_in_cr4(X86_CR4_MCE); |
52 | printk(KERN_INFO "Intel old style machine check reporting enabled on CPU#%d.\n", smp_processor_id()); | 68 | printk(KERN_INFO |
69 | "Intel old style machine check reporting enabled on CPU#%d.\n", | ||
70 | smp_processor_id()); | ||
53 | } | 71 | } |
diff --git a/arch/x86/kernel/cpu/mcheck/p6.c b/arch/x86/kernel/cpu/mcheck/p6.c index 2ac52d7b434b..43c24e667457 100644 --- a/arch/x86/kernel/cpu/mcheck/p6.c +++ b/arch/x86/kernel/cpu/mcheck/p6.c | |||
@@ -2,11 +2,10 @@ | |||
2 | * P6 specific Machine Check Exception Reporting | 2 | * P6 specific Machine Check Exception Reporting |
3 | * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk> | 3 | * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk> |
4 | */ | 4 | */ |
5 | |||
6 | #include <linux/init.h> | ||
7 | #include <linux/types.h> | ||
8 | #include <linux/kernel.h> | ||
9 | #include <linux/interrupt.h> | 5 | #include <linux/interrupt.h> |
6 | #include <linux/kernel.h> | ||
7 | #include <linux/types.h> | ||
8 | #include <linux/init.h> | ||
10 | #include <linux/smp.h> | 9 | #include <linux/smp.h> |
11 | 10 | ||
12 | #include <asm/processor.h> | 11 | #include <asm/processor.h> |
@@ -18,9 +17,9 @@ | |||
18 | /* Machine Check Handler For PII/PIII */ | 17 | /* Machine Check Handler For PII/PIII */ |
19 | static void intel_machine_check(struct pt_regs *regs, long error_code) | 18 | static void intel_machine_check(struct pt_regs *regs, long error_code) |
20 | { | 19 | { |
21 | int recover = 1; | ||
22 | u32 alow, ahigh, high, low; | 20 | u32 alow, ahigh, high, low; |
23 | u32 mcgstl, mcgsth; | 21 | u32 mcgstl, mcgsth; |
22 | int recover = 1; | ||
24 | int i; | 23 | int i; |
25 | 24 | ||
26 | rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); | 25 | rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); |
@@ -35,12 +34,16 @@ static void intel_machine_check(struct pt_regs *regs, long error_code) | |||
35 | if (high & (1<<31)) { | 34 | if (high & (1<<31)) { |
36 | char misc[20]; | 35 | char misc[20]; |
37 | char addr[24]; | 36 | char addr[24]; |
38 | misc[0] = addr[0] = '\0'; | 37 | |
38 | misc[0] = '\0'; | ||
39 | addr[0] = '\0'; | ||
40 | |||
39 | if (high & (1<<29)) | 41 | if (high & (1<<29)) |
40 | recover |= 1; | 42 | recover |= 1; |
41 | if (high & (1<<25)) | 43 | if (high & (1<<25)) |
42 | recover |= 2; | 44 | recover |= 2; |
43 | high &= ~(1<<31); | 45 | high &= ~(1<<31); |
46 | |||
44 | if (high & (1<<27)) { | 47 | if (high & (1<<27)) { |
45 | rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh); | 48 | rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh); |
46 | snprintf(misc, 20, "[%08x%08x]", ahigh, alow); | 49 | snprintf(misc, 20, "[%08x%08x]", ahigh, alow); |
@@ -49,6 +52,7 @@ static void intel_machine_check(struct pt_regs *regs, long error_code) | |||
49 | rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh); | 52 | rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh); |
50 | snprintf(addr, 24, " at %08x%08x", ahigh, alow); | 53 | snprintf(addr, 24, " at %08x%08x", ahigh, alow); |
51 | } | 54 | } |
55 | |||
52 | printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n", | 56 | printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n", |
53 | smp_processor_id(), i, high, low, misc, addr); | 57 | smp_processor_id(), i, high, low, misc, addr); |
54 | } | 58 | } |
@@ -63,16 +67,17 @@ static void intel_machine_check(struct pt_regs *regs, long error_code) | |||
63 | /* | 67 | /* |
64 | * Do not clear the MSR_IA32_MCi_STATUS if the error is not | 68 | * Do not clear the MSR_IA32_MCi_STATUS if the error is not |
65 | * recoverable/continuable.This will allow BIOS to look at the MSRs | 69 | * recoverable/continuable.This will allow BIOS to look at the MSRs |
66 | * for errors if the OS could not log the error. | 70 | * for errors if the OS could not log the error: |
67 | */ | 71 | */ |
68 | for (i = 0; i < nr_mce_banks; i++) { | 72 | for (i = 0; i < nr_mce_banks; i++) { |
69 | unsigned int msr; | 73 | unsigned int msr; |
74 | |||
70 | msr = MSR_IA32_MC0_STATUS+i*4; | 75 | msr = MSR_IA32_MC0_STATUS+i*4; |
71 | rdmsr(msr, low, high); | 76 | rdmsr(msr, low, high); |
72 | if (high & (1<<31)) { | 77 | if (high & (1<<31)) { |
73 | /* Clear it */ | 78 | /* Clear it: */ |
74 | wrmsr(msr, 0UL, 0UL); | 79 | wrmsr(msr, 0UL, 0UL); |
75 | /* Serialize */ | 80 | /* Serialize: */ |
76 | wmb(); | 81 | wmb(); |
77 | add_taint(TAINT_MACHINE_CHECK); | 82 | add_taint(TAINT_MACHINE_CHECK); |
78 | } | 83 | } |
@@ -81,7 +86,7 @@ static void intel_machine_check(struct pt_regs *regs, long error_code) | |||
81 | wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); | 86 | wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); |
82 | } | 87 | } |
83 | 88 | ||
84 | /* Set up machine check reporting for processors with Intel style MCE */ | 89 | /* Set up machine check reporting for processors with Intel style MCE: */ |
85 | void intel_p6_mcheck_init(struct cpuinfo_x86 *c) | 90 | void intel_p6_mcheck_init(struct cpuinfo_x86 *c) |
86 | { | 91 | { |
87 | u32 l, h; | 92 | u32 l, h; |
@@ -97,6 +102,7 @@ void intel_p6_mcheck_init(struct cpuinfo_x86 *c) | |||
97 | 102 | ||
98 | /* Ok machine check is available */ | 103 | /* Ok machine check is available */ |
99 | machine_check_vector = intel_machine_check; | 104 | machine_check_vector = intel_machine_check; |
105 | /* Make sure the vector pointer is visible before we enable MCEs: */ | ||
100 | wmb(); | 106 | wmb(); |
101 | 107 | ||
102 | printk(KERN_INFO "Intel machine check architecture supported.\n"); | 108 | printk(KERN_INFO "Intel machine check architecture supported.\n"); |
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index d5ae2243f0b9..7b1ae2e20ba5 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c | |||
@@ -1,7 +1,7 @@ | |||
1 | /* | 1 | /* |
2 | * | ||
3 | * Thermal throttle event support code (such as syslog messaging and rate | 2 | * Thermal throttle event support code (such as syslog messaging and rate |
4 | * limiting) that was factored out from x86_64 (mce_intel.c) and i386 (p4.c). | 3 | * limiting) that was factored out from x86_64 (mce_intel.c) and i386 (p4.c). |
4 | * | ||
5 | * This allows consistent reporting of CPU thermal throttle events. | 5 | * This allows consistent reporting of CPU thermal throttle events. |
6 | * | 6 | * |
7 | * Maintains a counter in /sys that keeps track of the number of thermal | 7 | * Maintains a counter in /sys that keeps track of the number of thermal |
@@ -13,43 +13,43 @@ | |||
13 | * Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c. | 13 | * Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c. |
14 | * Inspired by Ross Biro's and Al Borchers' counter code. | 14 | * Inspired by Ross Biro's and Al Borchers' counter code. |
15 | */ | 15 | */ |
16 | 16 | #include <linux/notifier.h> | |
17 | #include <linux/jiffies.h> | ||
17 | #include <linux/percpu.h> | 18 | #include <linux/percpu.h> |
18 | #include <linux/sysdev.h> | 19 | #include <linux/sysdev.h> |
19 | #include <linux/cpu.h> | 20 | #include <linux/cpu.h> |
20 | #include <asm/cpu.h> | 21 | |
21 | #include <linux/notifier.h> | ||
22 | #include <linux/jiffies.h> | ||
23 | #include <asm/therm_throt.h> | 22 | #include <asm/therm_throt.h> |
24 | 23 | ||
25 | /* How long to wait between reporting thermal events */ | 24 | /* How long to wait between reporting thermal events */ |
26 | #define CHECK_INTERVAL (300 * HZ) | 25 | #define CHECK_INTERVAL (300 * HZ) |
27 | 26 | ||
28 | static DEFINE_PER_CPU(__u64, next_check) = INITIAL_JIFFIES; | 27 | static DEFINE_PER_CPU(__u64, next_check) = INITIAL_JIFFIES; |
29 | static DEFINE_PER_CPU(unsigned long, thermal_throttle_count); | 28 | static DEFINE_PER_CPU(unsigned long, thermal_throttle_count); |
30 | atomic_t therm_throt_en = ATOMIC_INIT(0); | 29 | |
30 | atomic_t therm_throt_en = ATOMIC_INIT(0); | ||
31 | 31 | ||
32 | #ifdef CONFIG_SYSFS | 32 | #ifdef CONFIG_SYSFS |
33 | #define define_therm_throt_sysdev_one_ro(_name) \ | 33 | #define define_therm_throt_sysdev_one_ro(_name) \ |
34 | static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL) | 34 | static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL) |
35 | 35 | ||
36 | #define define_therm_throt_sysdev_show_func(name) \ | 36 | #define define_therm_throt_sysdev_show_func(name) \ |
37 | static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \ | 37 | static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \ |
38 | struct sysdev_attribute *attr, \ | 38 | struct sysdev_attribute *attr, \ |
39 | char *buf) \ | 39 | char *buf) \ |
40 | { \ | 40 | { \ |
41 | unsigned int cpu = dev->id; \ | 41 | unsigned int cpu = dev->id; \ |
42 | ssize_t ret; \ | 42 | ssize_t ret; \ |
43 | \ | 43 | \ |
44 | preempt_disable(); /* CPU hotplug */ \ | 44 | preempt_disable(); /* CPU hotplug */ \ |
45 | if (cpu_online(cpu)) \ | 45 | if (cpu_online(cpu)) \ |
46 | ret = sprintf(buf, "%lu\n", \ | 46 | ret = sprintf(buf, "%lu\n", \ |
47 | per_cpu(thermal_throttle_##name, cpu)); \ | 47 | per_cpu(thermal_throttle_##name, cpu)); \ |
48 | else \ | 48 | else \ |
49 | ret = 0; \ | 49 | ret = 0; \ |
50 | preempt_enable(); \ | 50 | preempt_enable(); \ |
51 | \ | 51 | \ |
52 | return ret; \ | 52 | return ret; \ |
53 | } | 53 | } |
54 | 54 | ||
55 | define_therm_throt_sysdev_show_func(count); | 55 | define_therm_throt_sysdev_show_func(count); |
@@ -61,8 +61,8 @@ static struct attribute *thermal_throttle_attrs[] = { | |||
61 | }; | 61 | }; |
62 | 62 | ||
63 | static struct attribute_group thermal_throttle_attr_group = { | 63 | static struct attribute_group thermal_throttle_attr_group = { |
64 | .attrs = thermal_throttle_attrs, | 64 | .attrs = thermal_throttle_attrs, |
65 | .name = "thermal_throttle" | 65 | .name = "thermal_throttle" |
66 | }; | 66 | }; |
67 | #endif /* CONFIG_SYSFS */ | 67 | #endif /* CONFIG_SYSFS */ |
68 | 68 | ||
@@ -110,10 +110,11 @@ int therm_throt_process(int curr) | |||
110 | } | 110 | } |
111 | 111 | ||
112 | #ifdef CONFIG_SYSFS | 112 | #ifdef CONFIG_SYSFS |
113 | /* Add/Remove thermal_throttle interface for CPU device */ | 113 | /* Add/Remove thermal_throttle interface for CPU device: */ |
114 | static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev) | 114 | static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev) |
115 | { | 115 | { |
116 | return sysfs_create_group(&sys_dev->kobj, &thermal_throttle_attr_group); | 116 | return sysfs_create_group(&sys_dev->kobj, |
117 | &thermal_throttle_attr_group); | ||
117 | } | 118 | } |
118 | 119 | ||
119 | static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev) | 120 | static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev) |
@@ -121,19 +122,21 @@ static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev) | |||
121 | sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group); | 122 | sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group); |
122 | } | 123 | } |
123 | 124 | ||
124 | /* Mutex protecting device creation against CPU hotplug */ | 125 | /* Mutex protecting device creation against CPU hotplug: */ |
125 | static DEFINE_MUTEX(therm_cpu_lock); | 126 | static DEFINE_MUTEX(therm_cpu_lock); |
126 | 127 | ||
127 | /* Get notified when a cpu comes on/off. Be hotplug friendly. */ | 128 | /* Get notified when a cpu comes on/off. Be hotplug friendly. */ |
128 | static __cpuinit int thermal_throttle_cpu_callback(struct notifier_block *nfb, | 129 | static __cpuinit int |
129 | unsigned long action, | 130 | thermal_throttle_cpu_callback(struct notifier_block *nfb, |
130 | void *hcpu) | 131 | unsigned long action, |
132 | void *hcpu) | ||
131 | { | 133 | { |
132 | unsigned int cpu = (unsigned long)hcpu; | 134 | unsigned int cpu = (unsigned long)hcpu; |
133 | struct sys_device *sys_dev; | 135 | struct sys_device *sys_dev; |
134 | int err = 0; | 136 | int err = 0; |
135 | 137 | ||
136 | sys_dev = get_cpu_sysdev(cpu); | 138 | sys_dev = get_cpu_sysdev(cpu); |
139 | |||
137 | switch (action) { | 140 | switch (action) { |
138 | case CPU_UP_PREPARE: | 141 | case CPU_UP_PREPARE: |
139 | case CPU_UP_PREPARE_FROZEN: | 142 | case CPU_UP_PREPARE_FROZEN: |
diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c index 23ee9e730f78..d746df2909c9 100644 --- a/arch/x86/kernel/cpu/mcheck/threshold.c +++ b/arch/x86/kernel/cpu/mcheck/threshold.c | |||
@@ -17,7 +17,7 @@ static void default_threshold_interrupt(void) | |||
17 | 17 | ||
18 | void (*mce_threshold_vector)(void) = default_threshold_interrupt; | 18 | void (*mce_threshold_vector)(void) = default_threshold_interrupt; |
19 | 19 | ||
20 | asmlinkage void mce_threshold_interrupt(void) | 20 | asmlinkage void smp_threshold_interrupt(void) |
21 | { | 21 | { |
22 | exit_idle(); | 22 | exit_idle(); |
23 | irq_enter(); | 23 | irq_enter(); |
diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c index 2a043d89811d..81b02487090b 100644 --- a/arch/x86/kernel/cpu/mcheck/winchip.c +++ b/arch/x86/kernel/cpu/mcheck/winchip.c | |||
@@ -2,11 +2,10 @@ | |||
2 | * IDT Winchip specific Machine Check Exception Reporting | 2 | * IDT Winchip specific Machine Check Exception Reporting |
3 | * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk> | 3 | * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk> |
4 | */ | 4 | */ |
5 | |||
6 | #include <linux/init.h> | ||
7 | #include <linux/types.h> | ||
8 | #include <linux/kernel.h> | ||
9 | #include <linux/interrupt.h> | 5 | #include <linux/interrupt.h> |
6 | #include <linux/kernel.h> | ||
7 | #include <linux/types.h> | ||
8 | #include <linux/init.h> | ||
10 | 9 | ||
11 | #include <asm/processor.h> | 10 | #include <asm/processor.h> |
12 | #include <asm/system.h> | 11 | #include <asm/system.h> |
@@ -14,7 +13,7 @@ | |||
14 | 13 | ||
15 | #include "mce.h" | 14 | #include "mce.h" |
16 | 15 | ||
17 | /* Machine check handler for WinChip C6 */ | 16 | /* Machine check handler for WinChip C6: */ |
18 | static void winchip_machine_check(struct pt_regs *regs, long error_code) | 17 | static void winchip_machine_check(struct pt_regs *regs, long error_code) |
19 | { | 18 | { |
20 | printk(KERN_EMERG "CPU0: Machine Check Exception.\n"); | 19 | printk(KERN_EMERG "CPU0: Machine Check Exception.\n"); |
@@ -25,12 +24,18 @@ static void winchip_machine_check(struct pt_regs *regs, long error_code) | |||
25 | void winchip_mcheck_init(struct cpuinfo_x86 *c) | 24 | void winchip_mcheck_init(struct cpuinfo_x86 *c) |
26 | { | 25 | { |
27 | u32 lo, hi; | 26 | u32 lo, hi; |
27 | |||
28 | machine_check_vector = winchip_machine_check; | 28 | machine_check_vector = winchip_machine_check; |
29 | /* Make sure the vector pointer is visible before we enable MCEs: */ | ||
29 | wmb(); | 30 | wmb(); |
31 | |||
30 | rdmsr(MSR_IDT_FCR1, lo, hi); | 32 | rdmsr(MSR_IDT_FCR1, lo, hi); |
31 | lo |= (1<<2); /* Enable EIERRINT (int 18 MCE) */ | 33 | lo |= (1<<2); /* Enable EIERRINT (int 18 MCE) */ |
32 | lo &= ~(1<<4); /* Enable MCE */ | 34 | lo &= ~(1<<4); /* Enable MCE */ |
33 | wrmsr(MSR_IDT_FCR1, lo, hi); | 35 | wrmsr(MSR_IDT_FCR1, lo, hi); |
36 | |||
34 | set_in_cr4(X86_CR4_MCE); | 37 | set_in_cr4(X86_CR4_MCE); |
35 | printk(KERN_INFO "Winchip machine check reporting enabled on CPU#0.\n"); | 38 | |
39 | printk(KERN_INFO | ||
40 | "Winchip machine check reporting enabled on CPU#0.\n"); | ||
36 | } | 41 | } |
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index ce0fe4b5c04f..1d584a18a50d 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c | |||
@@ -808,7 +808,7 @@ int __init mtrr_cleanup(unsigned address_bits) | |||
808 | 808 | ||
809 | if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1) | 809 | if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1) |
810 | return 0; | 810 | return 0; |
811 | rdmsr(MTRRdefType_MSR, def, dummy); | 811 | rdmsr(MSR_MTRRdefType, def, dummy); |
812 | def &= 0xff; | 812 | def &= 0xff; |
813 | if (def != MTRR_TYPE_UNCACHABLE) | 813 | if (def != MTRR_TYPE_UNCACHABLE) |
814 | return 0; | 814 | return 0; |
@@ -1003,7 +1003,7 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) | |||
1003 | */ | 1003 | */ |
1004 | if (!is_cpu(INTEL) || disable_mtrr_trim) | 1004 | if (!is_cpu(INTEL) || disable_mtrr_trim) |
1005 | return 0; | 1005 | return 0; |
1006 | rdmsr(MTRRdefType_MSR, def, dummy); | 1006 | rdmsr(MSR_MTRRdefType, def, dummy); |
1007 | def &= 0xff; | 1007 | def &= 0xff; |
1008 | if (def != MTRR_TYPE_UNCACHABLE) | 1008 | if (def != MTRR_TYPE_UNCACHABLE) |
1009 | return 0; | 1009 | return 0; |
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 0b776c09aff3..0543f69f0b27 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c | |||
@@ -20,9 +20,9 @@ struct fixed_range_block { | |||
20 | }; | 20 | }; |
21 | 21 | ||
22 | static struct fixed_range_block fixed_range_blocks[] = { | 22 | static struct fixed_range_block fixed_range_blocks[] = { |
23 | { MTRRfix64K_00000_MSR, 1 }, /* one 64k MTRR */ | 23 | { MSR_MTRRfix64K_00000, 1 }, /* one 64k MTRR */ |
24 | { MTRRfix16K_80000_MSR, 2 }, /* two 16k MTRRs */ | 24 | { MSR_MTRRfix16K_80000, 2 }, /* two 16k MTRRs */ |
25 | { MTRRfix4K_C0000_MSR, 8 }, /* eight 4k MTRRs */ | 25 | { MSR_MTRRfix4K_C0000, 8 }, /* eight 4k MTRRs */ |
26 | {} | 26 | {} |
27 | }; | 27 | }; |
28 | 28 | ||
@@ -194,12 +194,12 @@ get_fixed_ranges(mtrr_type * frs) | |||
194 | 194 | ||
195 | k8_check_syscfg_dram_mod_en(); | 195 | k8_check_syscfg_dram_mod_en(); |
196 | 196 | ||
197 | rdmsr(MTRRfix64K_00000_MSR, p[0], p[1]); | 197 | rdmsr(MSR_MTRRfix64K_00000, p[0], p[1]); |
198 | 198 | ||
199 | for (i = 0; i < 2; i++) | 199 | for (i = 0; i < 2; i++) |
200 | rdmsr(MTRRfix16K_80000_MSR + i, p[2 + i * 2], p[3 + i * 2]); | 200 | rdmsr(MSR_MTRRfix16K_80000 + i, p[2 + i * 2], p[3 + i * 2]); |
201 | for (i = 0; i < 8; i++) | 201 | for (i = 0; i < 8; i++) |
202 | rdmsr(MTRRfix4K_C0000_MSR + i, p[6 + i * 2], p[7 + i * 2]); | 202 | rdmsr(MSR_MTRRfix4K_C0000 + i, p[6 + i * 2], p[7 + i * 2]); |
203 | } | 203 | } |
204 | 204 | ||
205 | void mtrr_save_fixed_ranges(void *info) | 205 | void mtrr_save_fixed_ranges(void *info) |
@@ -275,7 +275,11 @@ static void __init print_mtrr_state(void) | |||
275 | } | 275 | } |
276 | printk(KERN_DEBUG "MTRR variable ranges %sabled:\n", | 276 | printk(KERN_DEBUG "MTRR variable ranges %sabled:\n", |
277 | mtrr_state.enabled & 2 ? "en" : "dis"); | 277 | mtrr_state.enabled & 2 ? "en" : "dis"); |
278 | high_width = ((size_or_mask ? ffs(size_or_mask) - 1 : 32) - (32 - PAGE_SHIFT) + 3) / 4; | 278 | if (size_or_mask & 0xffffffffUL) |
279 | high_width = ffs(size_or_mask & 0xffffffffUL) - 1; | ||
280 | else | ||
281 | high_width = ffs(size_or_mask>>32) + 32 - 1; | ||
282 | high_width = (high_width - (32 - PAGE_SHIFT) + 3) / 4; | ||
279 | for (i = 0; i < num_var_ranges; ++i) { | 283 | for (i = 0; i < num_var_ranges; ++i) { |
280 | if (mtrr_state.var_ranges[i].mask_lo & (1 << 11)) | 284 | if (mtrr_state.var_ranges[i].mask_lo & (1 << 11)) |
281 | printk(KERN_DEBUG " %u base %0*X%05X000 mask %0*X%05X000 %s\n", | 285 | printk(KERN_DEBUG " %u base %0*X%05X000 mask %0*X%05X000 %s\n", |
@@ -306,7 +310,7 @@ void __init get_mtrr_state(void) | |||
306 | 310 | ||
307 | vrs = mtrr_state.var_ranges; | 311 | vrs = mtrr_state.var_ranges; |
308 | 312 | ||
309 | rdmsr(MTRRcap_MSR, lo, dummy); | 313 | rdmsr(MSR_MTRRcap, lo, dummy); |
310 | mtrr_state.have_fixed = (lo >> 8) & 1; | 314 | mtrr_state.have_fixed = (lo >> 8) & 1; |
311 | 315 | ||
312 | for (i = 0; i < num_var_ranges; i++) | 316 | for (i = 0; i < num_var_ranges; i++) |
@@ -314,7 +318,7 @@ void __init get_mtrr_state(void) | |||
314 | if (mtrr_state.have_fixed) | 318 | if (mtrr_state.have_fixed) |
315 | get_fixed_ranges(mtrr_state.fixed_ranges); | 319 | get_fixed_ranges(mtrr_state.fixed_ranges); |
316 | 320 | ||
317 | rdmsr(MTRRdefType_MSR, lo, dummy); | 321 | rdmsr(MSR_MTRRdefType, lo, dummy); |
318 | mtrr_state.def_type = (lo & 0xff); | 322 | mtrr_state.def_type = (lo & 0xff); |
319 | mtrr_state.enabled = (lo & 0xc00) >> 10; | 323 | mtrr_state.enabled = (lo & 0xc00) >> 10; |
320 | 324 | ||
@@ -579,10 +583,10 @@ static void prepare_set(void) __acquires(set_atomicity_lock) | |||
579 | __flush_tlb(); | 583 | __flush_tlb(); |
580 | 584 | ||
581 | /* Save MTRR state */ | 585 | /* Save MTRR state */ |
582 | rdmsr(MTRRdefType_MSR, deftype_lo, deftype_hi); | 586 | rdmsr(MSR_MTRRdefType, deftype_lo, deftype_hi); |
583 | 587 | ||
584 | /* Disable MTRRs, and set the default type to uncached */ | 588 | /* Disable MTRRs, and set the default type to uncached */ |
585 | mtrr_wrmsr(MTRRdefType_MSR, deftype_lo & ~0xcff, deftype_hi); | 589 | mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi); |
586 | } | 590 | } |
587 | 591 | ||
588 | static void post_set(void) __releases(set_atomicity_lock) | 592 | static void post_set(void) __releases(set_atomicity_lock) |
@@ -591,7 +595,7 @@ static void post_set(void) __releases(set_atomicity_lock) | |||
591 | __flush_tlb(); | 595 | __flush_tlb(); |
592 | 596 | ||
593 | /* Intel (P6) standard MTRRs */ | 597 | /* Intel (P6) standard MTRRs */ |
594 | mtrr_wrmsr(MTRRdefType_MSR, deftype_lo, deftype_hi); | 598 | mtrr_wrmsr(MSR_MTRRdefType, deftype_lo, deftype_hi); |
595 | 599 | ||
596 | /* Enable caches */ | 600 | /* Enable caches */ |
597 | write_cr0(read_cr0() & 0xbfffffff); | 601 | write_cr0(read_cr0() & 0xbfffffff); |
@@ -703,7 +707,7 @@ int generic_validate_add_page(unsigned long base, unsigned long size, unsigned i | |||
703 | static int generic_have_wrcomb(void) | 707 | static int generic_have_wrcomb(void) |
704 | { | 708 | { |
705 | unsigned long config, dummy; | 709 | unsigned long config, dummy; |
706 | rdmsr(MTRRcap_MSR, config, dummy); | 710 | rdmsr(MSR_MTRRcap, config, dummy); |
707 | return (config & (1 << 10)); | 711 | return (config & (1 << 10)); |
708 | } | 712 | } |
709 | 713 | ||
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 03cda01f57c7..8fc248b5aeaf 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c | |||
@@ -104,7 +104,7 @@ static void __init set_num_var_ranges(void) | |||
104 | unsigned long config = 0, dummy; | 104 | unsigned long config = 0, dummy; |
105 | 105 | ||
106 | if (use_intel()) { | 106 | if (use_intel()) { |
107 | rdmsr(MTRRcap_MSR, config, dummy); | 107 | rdmsr(MSR_MTRRcap, config, dummy); |
108 | } else if (is_cpu(AMD)) | 108 | } else if (is_cpu(AMD)) |
109 | config = 2; | 109 | config = 2; |
110 | else if (is_cpu(CYRIX) || is_cpu(CENTAUR)) | 110 | else if (is_cpu(CYRIX) || is_cpu(CENTAUR)) |
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index 77f67f7b347a..7538b767f206 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h | |||
@@ -5,21 +5,6 @@ | |||
5 | #include <linux/types.h> | 5 | #include <linux/types.h> |
6 | #include <linux/stddef.h> | 6 | #include <linux/stddef.h> |
7 | 7 | ||
8 | #define MTRRcap_MSR 0x0fe | ||
9 | #define MTRRdefType_MSR 0x2ff | ||
10 | |||
11 | #define MTRRfix64K_00000_MSR 0x250 | ||
12 | #define MTRRfix16K_80000_MSR 0x258 | ||
13 | #define MTRRfix16K_A0000_MSR 0x259 | ||
14 | #define MTRRfix4K_C0000_MSR 0x268 | ||
15 | #define MTRRfix4K_C8000_MSR 0x269 | ||
16 | #define MTRRfix4K_D0000_MSR 0x26a | ||
17 | #define MTRRfix4K_D8000_MSR 0x26b | ||
18 | #define MTRRfix4K_E0000_MSR 0x26c | ||
19 | #define MTRRfix4K_E8000_MSR 0x26d | ||
20 | #define MTRRfix4K_F0000_MSR 0x26e | ||
21 | #define MTRRfix4K_F8000_MSR 0x26f | ||
22 | |||
23 | #define MTRR_CHANGE_MASK_FIXED 0x01 | 8 | #define MTRR_CHANGE_MASK_FIXED 0x01 |
24 | #define MTRR_CHANGE_MASK_VARIABLE 0x02 | 9 | #define MTRR_CHANGE_MASK_VARIABLE 0x02 |
25 | #define MTRR_CHANGE_MASK_DEFTYPE 0x04 | 10 | #define MTRR_CHANGE_MASK_DEFTYPE 0x04 |
diff --git a/arch/x86/kernel/cpu/mtrr/state.c b/arch/x86/kernel/cpu/mtrr/state.c index 7f7e2753685b..1f5fb1588d1f 100644 --- a/arch/x86/kernel/cpu/mtrr/state.c +++ b/arch/x86/kernel/cpu/mtrr/state.c | |||
@@ -35,7 +35,7 @@ void set_mtrr_prepare_save(struct set_mtrr_context *ctxt) | |||
35 | 35 | ||
36 | if (use_intel()) | 36 | if (use_intel()) |
37 | /* Save MTRR state */ | 37 | /* Save MTRR state */ |
38 | rdmsr(MTRRdefType_MSR, ctxt->deftype_lo, ctxt->deftype_hi); | 38 | rdmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi); |
39 | else | 39 | else |
40 | /* Cyrix ARRs - everything else were excluded at the top */ | 40 | /* Cyrix ARRs - everything else were excluded at the top */ |
41 | ctxt->ccr3 = getCx86(CX86_CCR3); | 41 | ctxt->ccr3 = getCx86(CX86_CCR3); |
@@ -46,7 +46,7 @@ void set_mtrr_cache_disable(struct set_mtrr_context *ctxt) | |||
46 | { | 46 | { |
47 | if (use_intel()) | 47 | if (use_intel()) |
48 | /* Disable MTRRs, and set the default type to uncached */ | 48 | /* Disable MTRRs, and set the default type to uncached */ |
49 | mtrr_wrmsr(MTRRdefType_MSR, ctxt->deftype_lo & 0xf300UL, | 49 | mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo & 0xf300UL, |
50 | ctxt->deftype_hi); | 50 | ctxt->deftype_hi); |
51 | else if (is_cpu(CYRIX)) | 51 | else if (is_cpu(CYRIX)) |
52 | /* Cyrix ARRs - everything else were excluded at the top */ | 52 | /* Cyrix ARRs - everything else were excluded at the top */ |
@@ -64,7 +64,7 @@ void set_mtrr_done(struct set_mtrr_context *ctxt) | |||
64 | /* Restore MTRRdefType */ | 64 | /* Restore MTRRdefType */ |
65 | if (use_intel()) | 65 | if (use_intel()) |
66 | /* Intel (P6) standard MTRRs */ | 66 | /* Intel (P6) standard MTRRs */ |
67 | mtrr_wrmsr(MTRRdefType_MSR, ctxt->deftype_lo, ctxt->deftype_hi); | 67 | mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi); |
68 | else | 68 | else |
69 | /* Cyrix ARRs - everything else was excluded at the top */ | 69 | /* Cyrix ARRs - everything else was excluded at the top */ |
70 | setCx86(CX86_CCR3, ctxt->ccr3); | 70 | setCx86(CX86_CCR3, ctxt->ccr3); |
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c new file mode 100644 index 000000000000..275bc142cd5d --- /dev/null +++ b/arch/x86/kernel/cpu/perf_counter.c | |||
@@ -0,0 +1,1711 @@ | |||
1 | /* | ||
2 | * Performance counter x86 architecture code | ||
3 | * | ||
4 | * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> | ||
5 | * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar | ||
6 | * Copyright (C) 2009 Jaswinder Singh Rajput | ||
7 | * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter | ||
8 | * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> | ||
9 | * | ||
10 | * For licencing details see kernel-base/COPYING | ||
11 | */ | ||
12 | |||
13 | #include <linux/perf_counter.h> | ||
14 | #include <linux/capability.h> | ||
15 | #include <linux/notifier.h> | ||
16 | #include <linux/hardirq.h> | ||
17 | #include <linux/kprobes.h> | ||
18 | #include <linux/module.h> | ||
19 | #include <linux/kdebug.h> | ||
20 | #include <linux/sched.h> | ||
21 | #include <linux/uaccess.h> | ||
22 | |||
23 | #include <asm/apic.h> | ||
24 | #include <asm/stacktrace.h> | ||
25 | #include <asm/nmi.h> | ||
26 | |||
27 | static u64 perf_counter_mask __read_mostly; | ||
28 | |||
29 | struct cpu_hw_counters { | ||
30 | struct perf_counter *counters[X86_PMC_IDX_MAX]; | ||
31 | unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; | ||
32 | unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; | ||
33 | unsigned long interrupts; | ||
34 | int enabled; | ||
35 | }; | ||
36 | |||
37 | /* | ||
38 | * struct x86_pmu - generic x86 pmu | ||
39 | */ | ||
40 | struct x86_pmu { | ||
41 | const char *name; | ||
42 | int version; | ||
43 | int (*handle_irq)(struct pt_regs *); | ||
44 | void (*disable_all)(void); | ||
45 | void (*enable_all)(void); | ||
46 | void (*enable)(struct hw_perf_counter *, int); | ||
47 | void (*disable)(struct hw_perf_counter *, int); | ||
48 | unsigned eventsel; | ||
49 | unsigned perfctr; | ||
50 | u64 (*event_map)(int); | ||
51 | u64 (*raw_event)(u64); | ||
52 | int max_events; | ||
53 | int num_counters; | ||
54 | int num_counters_fixed; | ||
55 | int counter_bits; | ||
56 | u64 counter_mask; | ||
57 | u64 max_period; | ||
58 | u64 intel_ctrl; | ||
59 | }; | ||
60 | |||
61 | static struct x86_pmu x86_pmu __read_mostly; | ||
62 | |||
63 | static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = { | ||
64 | .enabled = 1, | ||
65 | }; | ||
66 | |||
67 | /* | ||
68 | * Intel PerfMon v3. Used on Core2 and later. | ||
69 | */ | ||
70 | static const u64 intel_perfmon_event_map[] = | ||
71 | { | ||
72 | [PERF_COUNT_HW_CPU_CYCLES] = 0x003c, | ||
73 | [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, | ||
74 | [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e, | ||
75 | [PERF_COUNT_HW_CACHE_MISSES] = 0x412e, | ||
76 | [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, | ||
77 | [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, | ||
78 | [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, | ||
79 | }; | ||
80 | |||
81 | static u64 intel_pmu_event_map(int event) | ||
82 | { | ||
83 | return intel_perfmon_event_map[event]; | ||
84 | } | ||
85 | |||
86 | /* | ||
87 | * Generalized hw caching related event table, filled | ||
88 | * in on a per model basis. A value of 0 means | ||
89 | * 'not supported', -1 means 'event makes no sense on | ||
90 | * this CPU', any other value means the raw event | ||
91 | * ID. | ||
92 | */ | ||
93 | |||
94 | #define C(x) PERF_COUNT_HW_CACHE_##x | ||
95 | |||
96 | static u64 __read_mostly hw_cache_event_ids | ||
97 | [PERF_COUNT_HW_CACHE_MAX] | ||
98 | [PERF_COUNT_HW_CACHE_OP_MAX] | ||
99 | [PERF_COUNT_HW_CACHE_RESULT_MAX]; | ||
100 | |||
101 | static const u64 nehalem_hw_cache_event_ids | ||
102 | [PERF_COUNT_HW_CACHE_MAX] | ||
103 | [PERF_COUNT_HW_CACHE_OP_MAX] | ||
104 | [PERF_COUNT_HW_CACHE_RESULT_MAX] = | ||
105 | { | ||
106 | [ C(L1D) ] = { | ||
107 | [ C(OP_READ) ] = { | ||
108 | [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */ | ||
109 | [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */ | ||
110 | }, | ||
111 | [ C(OP_WRITE) ] = { | ||
112 | [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */ | ||
113 | [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */ | ||
114 | }, | ||
115 | [ C(OP_PREFETCH) ] = { | ||
116 | [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */ | ||
117 | [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */ | ||
118 | }, | ||
119 | }, | ||
120 | [ C(L1I ) ] = { | ||
121 | [ C(OP_READ) ] = { | ||
122 | [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ | ||
123 | [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ | ||
124 | }, | ||
125 | [ C(OP_WRITE) ] = { | ||
126 | [ C(RESULT_ACCESS) ] = -1, | ||
127 | [ C(RESULT_MISS) ] = -1, | ||
128 | }, | ||
129 | [ C(OP_PREFETCH) ] = { | ||
130 | [ C(RESULT_ACCESS) ] = 0x0, | ||
131 | [ C(RESULT_MISS) ] = 0x0, | ||
132 | }, | ||
133 | }, | ||
134 | [ C(LL ) ] = { | ||
135 | [ C(OP_READ) ] = { | ||
136 | [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */ | ||
137 | [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */ | ||
138 | }, | ||
139 | [ C(OP_WRITE) ] = { | ||
140 | [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */ | ||
141 | [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */ | ||
142 | }, | ||
143 | [ C(OP_PREFETCH) ] = { | ||
144 | [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */ | ||
145 | [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */ | ||
146 | }, | ||
147 | }, | ||
148 | [ C(DTLB) ] = { | ||
149 | [ C(OP_READ) ] = { | ||
150 | [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */ | ||
151 | [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */ | ||
152 | }, | ||
153 | [ C(OP_WRITE) ] = { | ||
154 | [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */ | ||
155 | [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */ | ||
156 | }, | ||
157 | [ C(OP_PREFETCH) ] = { | ||
158 | [ C(RESULT_ACCESS) ] = 0x0, | ||
159 | [ C(RESULT_MISS) ] = 0x0, | ||
160 | }, | ||
161 | }, | ||
162 | [ C(ITLB) ] = { | ||
163 | [ C(OP_READ) ] = { | ||
164 | [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */ | ||
165 | [ C(RESULT_MISS) ] = 0x20c8, /* ITLB_MISS_RETIRED */ | ||
166 | }, | ||
167 | [ C(OP_WRITE) ] = { | ||
168 | [ C(RESULT_ACCESS) ] = -1, | ||
169 | [ C(RESULT_MISS) ] = -1, | ||
170 | }, | ||
171 | [ C(OP_PREFETCH) ] = { | ||
172 | [ C(RESULT_ACCESS) ] = -1, | ||
173 | [ C(RESULT_MISS) ] = -1, | ||
174 | }, | ||
175 | }, | ||
176 | [ C(BPU ) ] = { | ||
177 | [ C(OP_READ) ] = { | ||
178 | [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */ | ||
179 | [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */ | ||
180 | }, | ||
181 | [ C(OP_WRITE) ] = { | ||
182 | [ C(RESULT_ACCESS) ] = -1, | ||
183 | [ C(RESULT_MISS) ] = -1, | ||
184 | }, | ||
185 | [ C(OP_PREFETCH) ] = { | ||
186 | [ C(RESULT_ACCESS) ] = -1, | ||
187 | [ C(RESULT_MISS) ] = -1, | ||
188 | }, | ||
189 | }, | ||
190 | }; | ||
191 | |||
192 | static const u64 core2_hw_cache_event_ids | ||
193 | [PERF_COUNT_HW_CACHE_MAX] | ||
194 | [PERF_COUNT_HW_CACHE_OP_MAX] | ||
195 | [PERF_COUNT_HW_CACHE_RESULT_MAX] = | ||
196 | { | ||
197 | [ C(L1D) ] = { | ||
198 | [ C(OP_READ) ] = { | ||
199 | [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */ | ||
200 | [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */ | ||
201 | }, | ||
202 | [ C(OP_WRITE) ] = { | ||
203 | [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */ | ||
204 | [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */ | ||
205 | }, | ||
206 | [ C(OP_PREFETCH) ] = { | ||
207 | [ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS */ | ||
208 | [ C(RESULT_MISS) ] = 0, | ||
209 | }, | ||
210 | }, | ||
211 | [ C(L1I ) ] = { | ||
212 | [ C(OP_READ) ] = { | ||
213 | [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */ | ||
214 | [ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */ | ||
215 | }, | ||
216 | [ C(OP_WRITE) ] = { | ||
217 | [ C(RESULT_ACCESS) ] = -1, | ||
218 | [ C(RESULT_MISS) ] = -1, | ||
219 | }, | ||
220 | [ C(OP_PREFETCH) ] = { | ||
221 | [ C(RESULT_ACCESS) ] = 0, | ||
222 | [ C(RESULT_MISS) ] = 0, | ||
223 | }, | ||
224 | }, | ||
225 | [ C(LL ) ] = { | ||
226 | [ C(OP_READ) ] = { | ||
227 | [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */ | ||
228 | [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */ | ||
229 | }, | ||
230 | [ C(OP_WRITE) ] = { | ||
231 | [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */ | ||
232 | [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */ | ||
233 | }, | ||
234 | [ C(OP_PREFETCH) ] = { | ||
235 | [ C(RESULT_ACCESS) ] = 0, | ||
236 | [ C(RESULT_MISS) ] = 0, | ||
237 | }, | ||
238 | }, | ||
239 | [ C(DTLB) ] = { | ||
240 | [ C(OP_READ) ] = { | ||
241 | [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */ | ||
242 | [ C(RESULT_MISS) ] = 0x0208, /* DTLB_MISSES.MISS_LD */ | ||
243 | }, | ||
244 | [ C(OP_WRITE) ] = { | ||
245 | [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */ | ||
246 | [ C(RESULT_MISS) ] = 0x0808, /* DTLB_MISSES.MISS_ST */ | ||
247 | }, | ||
248 | [ C(OP_PREFETCH) ] = { | ||
249 | [ C(RESULT_ACCESS) ] = 0, | ||
250 | [ C(RESULT_MISS) ] = 0, | ||
251 | }, | ||
252 | }, | ||
253 | [ C(ITLB) ] = { | ||
254 | [ C(OP_READ) ] = { | ||
255 | [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ | ||
256 | [ C(RESULT_MISS) ] = 0x1282, /* ITLBMISSES */ | ||
257 | }, | ||
258 | [ C(OP_WRITE) ] = { | ||
259 | [ C(RESULT_ACCESS) ] = -1, | ||
260 | [ C(RESULT_MISS) ] = -1, | ||
261 | }, | ||
262 | [ C(OP_PREFETCH) ] = { | ||
263 | [ C(RESULT_ACCESS) ] = -1, | ||
264 | [ C(RESULT_MISS) ] = -1, | ||
265 | }, | ||
266 | }, | ||
267 | [ C(BPU ) ] = { | ||
268 | [ C(OP_READ) ] = { | ||
269 | [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */ | ||
270 | [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */ | ||
271 | }, | ||
272 | [ C(OP_WRITE) ] = { | ||
273 | [ C(RESULT_ACCESS) ] = -1, | ||
274 | [ C(RESULT_MISS) ] = -1, | ||
275 | }, | ||
276 | [ C(OP_PREFETCH) ] = { | ||
277 | [ C(RESULT_ACCESS) ] = -1, | ||
278 | [ C(RESULT_MISS) ] = -1, | ||
279 | }, | ||
280 | }, | ||
281 | }; | ||
282 | |||
283 | static const u64 atom_hw_cache_event_ids | ||
284 | [PERF_COUNT_HW_CACHE_MAX] | ||
285 | [PERF_COUNT_HW_CACHE_OP_MAX] | ||
286 | [PERF_COUNT_HW_CACHE_RESULT_MAX] = | ||
287 | { | ||
288 | [ C(L1D) ] = { | ||
289 | [ C(OP_READ) ] = { | ||
290 | [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD */ | ||
291 | [ C(RESULT_MISS) ] = 0, | ||
292 | }, | ||
293 | [ C(OP_WRITE) ] = { | ||
294 | [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST */ | ||
295 | [ C(RESULT_MISS) ] = 0, | ||
296 | }, | ||
297 | [ C(OP_PREFETCH) ] = { | ||
298 | [ C(RESULT_ACCESS) ] = 0x0, | ||
299 | [ C(RESULT_MISS) ] = 0, | ||
300 | }, | ||
301 | }, | ||
302 | [ C(L1I ) ] = { | ||
303 | [ C(OP_READ) ] = { | ||
304 | [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ | ||
305 | [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ | ||
306 | }, | ||
307 | [ C(OP_WRITE) ] = { | ||
308 | [ C(RESULT_ACCESS) ] = -1, | ||
309 | [ C(RESULT_MISS) ] = -1, | ||
310 | }, | ||
311 | [ C(OP_PREFETCH) ] = { | ||
312 | [ C(RESULT_ACCESS) ] = 0, | ||
313 | [ C(RESULT_MISS) ] = 0, | ||
314 | }, | ||
315 | }, | ||
316 | [ C(LL ) ] = { | ||
317 | [ C(OP_READ) ] = { | ||
318 | [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */ | ||
319 | [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */ | ||
320 | }, | ||
321 | [ C(OP_WRITE) ] = { | ||
322 | [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */ | ||
323 | [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */ | ||
324 | }, | ||
325 | [ C(OP_PREFETCH) ] = { | ||
326 | [ C(RESULT_ACCESS) ] = 0, | ||
327 | [ C(RESULT_MISS) ] = 0, | ||
328 | }, | ||
329 | }, | ||
330 | [ C(DTLB) ] = { | ||
331 | [ C(OP_READ) ] = { | ||
332 | [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI (alias) */ | ||
333 | [ C(RESULT_MISS) ] = 0x0508, /* DTLB_MISSES.MISS_LD */ | ||
334 | }, | ||
335 | [ C(OP_WRITE) ] = { | ||
336 | [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI (alias) */ | ||
337 | [ C(RESULT_MISS) ] = 0x0608, /* DTLB_MISSES.MISS_ST */ | ||
338 | }, | ||
339 | [ C(OP_PREFETCH) ] = { | ||
340 | [ C(RESULT_ACCESS) ] = 0, | ||
341 | [ C(RESULT_MISS) ] = 0, | ||
342 | }, | ||
343 | }, | ||
344 | [ C(ITLB) ] = { | ||
345 | [ C(OP_READ) ] = { | ||
346 | [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ | ||
347 | [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */ | ||
348 | }, | ||
349 | [ C(OP_WRITE) ] = { | ||
350 | [ C(RESULT_ACCESS) ] = -1, | ||
351 | [ C(RESULT_MISS) ] = -1, | ||
352 | }, | ||
353 | [ C(OP_PREFETCH) ] = { | ||
354 | [ C(RESULT_ACCESS) ] = -1, | ||
355 | [ C(RESULT_MISS) ] = -1, | ||
356 | }, | ||
357 | }, | ||
358 | [ C(BPU ) ] = { | ||
359 | [ C(OP_READ) ] = { | ||
360 | [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */ | ||
361 | [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */ | ||
362 | }, | ||
363 | [ C(OP_WRITE) ] = { | ||
364 | [ C(RESULT_ACCESS) ] = -1, | ||
365 | [ C(RESULT_MISS) ] = -1, | ||
366 | }, | ||
367 | [ C(OP_PREFETCH) ] = { | ||
368 | [ C(RESULT_ACCESS) ] = -1, | ||
369 | [ C(RESULT_MISS) ] = -1, | ||
370 | }, | ||
371 | }, | ||
372 | }; | ||
373 | |||
374 | static u64 intel_pmu_raw_event(u64 event) | ||
375 | { | ||
376 | #define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL | ||
377 | #define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL | ||
378 | #define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL | ||
379 | #define CORE_EVNTSEL_INV_MASK 0x00800000ULL | ||
380 | #define CORE_EVNTSEL_COUNTER_MASK 0xFF000000ULL | ||
381 | |||
382 | #define CORE_EVNTSEL_MASK \ | ||
383 | (CORE_EVNTSEL_EVENT_MASK | \ | ||
384 | CORE_EVNTSEL_UNIT_MASK | \ | ||
385 | CORE_EVNTSEL_EDGE_MASK | \ | ||
386 | CORE_EVNTSEL_INV_MASK | \ | ||
387 | CORE_EVNTSEL_COUNTER_MASK) | ||
388 | |||
389 | return event & CORE_EVNTSEL_MASK; | ||
390 | } | ||
391 | |||
392 | static const u64 amd_0f_hw_cache_event_ids | ||
393 | [PERF_COUNT_HW_CACHE_MAX] | ||
394 | [PERF_COUNT_HW_CACHE_OP_MAX] | ||
395 | [PERF_COUNT_HW_CACHE_RESULT_MAX] = | ||
396 | { | ||
397 | [ C(L1D) ] = { | ||
398 | [ C(OP_READ) ] = { | ||
399 | [ C(RESULT_ACCESS) ] = 0, | ||
400 | [ C(RESULT_MISS) ] = 0, | ||
401 | }, | ||
402 | [ C(OP_WRITE) ] = { | ||
403 | [ C(RESULT_ACCESS) ] = 0, | ||
404 | [ C(RESULT_MISS) ] = 0, | ||
405 | }, | ||
406 | [ C(OP_PREFETCH) ] = { | ||
407 | [ C(RESULT_ACCESS) ] = 0, | ||
408 | [ C(RESULT_MISS) ] = 0, | ||
409 | }, | ||
410 | }, | ||
411 | [ C(L1I ) ] = { | ||
412 | [ C(OP_READ) ] = { | ||
413 | [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches */ | ||
414 | [ C(RESULT_MISS) ] = 0x0081, /* Instruction cache misses */ | ||
415 | }, | ||
416 | [ C(OP_WRITE) ] = { | ||
417 | [ C(RESULT_ACCESS) ] = -1, | ||
418 | [ C(RESULT_MISS) ] = -1, | ||
419 | }, | ||
420 | [ C(OP_PREFETCH) ] = { | ||
421 | [ C(RESULT_ACCESS) ] = 0, | ||
422 | [ C(RESULT_MISS) ] = 0, | ||
423 | }, | ||
424 | }, | ||
425 | [ C(LL ) ] = { | ||
426 | [ C(OP_READ) ] = { | ||
427 | [ C(RESULT_ACCESS) ] = 0, | ||
428 | [ C(RESULT_MISS) ] = 0, | ||
429 | }, | ||
430 | [ C(OP_WRITE) ] = { | ||
431 | [ C(RESULT_ACCESS) ] = 0, | ||
432 | [ C(RESULT_MISS) ] = 0, | ||
433 | }, | ||
434 | [ C(OP_PREFETCH) ] = { | ||
435 | [ C(RESULT_ACCESS) ] = 0, | ||
436 | [ C(RESULT_MISS) ] = 0, | ||
437 | }, | ||
438 | }, | ||
439 | [ C(DTLB) ] = { | ||
440 | [ C(OP_READ) ] = { | ||
441 | [ C(RESULT_ACCESS) ] = 0, | ||
442 | [ C(RESULT_MISS) ] = 0, | ||
443 | }, | ||
444 | [ C(OP_WRITE) ] = { | ||
445 | [ C(RESULT_ACCESS) ] = 0, | ||
446 | [ C(RESULT_MISS) ] = 0, | ||
447 | }, | ||
448 | [ C(OP_PREFETCH) ] = { | ||
449 | [ C(RESULT_ACCESS) ] = 0, | ||
450 | [ C(RESULT_MISS) ] = 0, | ||
451 | }, | ||
452 | }, | ||
453 | [ C(ITLB) ] = { | ||
454 | [ C(OP_READ) ] = { | ||
455 | [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes */ | ||
456 | [ C(RESULT_MISS) ] = 0x0085, /* Instr. fetch ITLB misses */ | ||
457 | }, | ||
458 | [ C(OP_WRITE) ] = { | ||
459 | [ C(RESULT_ACCESS) ] = -1, | ||
460 | [ C(RESULT_MISS) ] = -1, | ||
461 | }, | ||
462 | [ C(OP_PREFETCH) ] = { | ||
463 | [ C(RESULT_ACCESS) ] = -1, | ||
464 | [ C(RESULT_MISS) ] = -1, | ||
465 | }, | ||
466 | }, | ||
467 | [ C(BPU ) ] = { | ||
468 | [ C(OP_READ) ] = { | ||
469 | [ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr. */ | ||
470 | [ C(RESULT_MISS) ] = 0x00c3, /* Retired Mispredicted BI */ | ||
471 | }, | ||
472 | [ C(OP_WRITE) ] = { | ||
473 | [ C(RESULT_ACCESS) ] = -1, | ||
474 | [ C(RESULT_MISS) ] = -1, | ||
475 | }, | ||
476 | [ C(OP_PREFETCH) ] = { | ||
477 | [ C(RESULT_ACCESS) ] = -1, | ||
478 | [ C(RESULT_MISS) ] = -1, | ||
479 | }, | ||
480 | }, | ||
481 | }; | ||
482 | |||
483 | /* | ||
484 | * AMD Performance Monitor K7 and later. | ||
485 | */ | ||
486 | static const u64 amd_perfmon_event_map[] = | ||
487 | { | ||
488 | [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, | ||
489 | [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, | ||
490 | [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080, | ||
491 | [PERF_COUNT_HW_CACHE_MISSES] = 0x0081, | ||
492 | [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, | ||
493 | [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, | ||
494 | }; | ||
495 | |||
496 | static u64 amd_pmu_event_map(int event) | ||
497 | { | ||
498 | return amd_perfmon_event_map[event]; | ||
499 | } | ||
500 | |||
501 | static u64 amd_pmu_raw_event(u64 event) | ||
502 | { | ||
503 | #define K7_EVNTSEL_EVENT_MASK 0x7000000FFULL | ||
504 | #define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL | ||
505 | #define K7_EVNTSEL_EDGE_MASK 0x000040000ULL | ||
506 | #define K7_EVNTSEL_INV_MASK 0x000800000ULL | ||
507 | #define K7_EVNTSEL_COUNTER_MASK 0x0FF000000ULL | ||
508 | |||
509 | #define K7_EVNTSEL_MASK \ | ||
510 | (K7_EVNTSEL_EVENT_MASK | \ | ||
511 | K7_EVNTSEL_UNIT_MASK | \ | ||
512 | K7_EVNTSEL_EDGE_MASK | \ | ||
513 | K7_EVNTSEL_INV_MASK | \ | ||
514 | K7_EVNTSEL_COUNTER_MASK) | ||
515 | |||
516 | return event & K7_EVNTSEL_MASK; | ||
517 | } | ||
518 | |||
519 | /* | ||
520 | * Propagate counter elapsed time into the generic counter. | ||
521 | * Can only be executed on the CPU where the counter is active. | ||
522 | * Returns the delta events processed. | ||
523 | */ | ||
524 | static u64 | ||
525 | x86_perf_counter_update(struct perf_counter *counter, | ||
526 | struct hw_perf_counter *hwc, int idx) | ||
527 | { | ||
528 | int shift = 64 - x86_pmu.counter_bits; | ||
529 | u64 prev_raw_count, new_raw_count; | ||
530 | s64 delta; | ||
531 | |||
532 | /* | ||
533 | * Careful: an NMI might modify the previous counter value. | ||
534 | * | ||
535 | * Our tactic to handle this is to first atomically read and | ||
536 | * exchange a new raw count - then add that new-prev delta | ||
537 | * count to the generic counter atomically: | ||
538 | */ | ||
539 | again: | ||
540 | prev_raw_count = atomic64_read(&hwc->prev_count); | ||
541 | rdmsrl(hwc->counter_base + idx, new_raw_count); | ||
542 | |||
543 | if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count, | ||
544 | new_raw_count) != prev_raw_count) | ||
545 | goto again; | ||
546 | |||
547 | /* | ||
548 | * Now we have the new raw value and have updated the prev | ||
549 | * timestamp already. We can now calculate the elapsed delta | ||
550 | * (counter-)time and add that to the generic counter. | ||
551 | * | ||
552 | * Careful, not all hw sign-extends above the physical width | ||
553 | * of the count. | ||
554 | */ | ||
555 | delta = (new_raw_count << shift) - (prev_raw_count << shift); | ||
556 | delta >>= shift; | ||
557 | |||
558 | atomic64_add(delta, &counter->count); | ||
559 | atomic64_sub(delta, &hwc->period_left); | ||
560 | |||
561 | return new_raw_count; | ||
562 | } | ||
563 | |||
/*
 * Count of initialized hardware counters: raised in
 * __hw_perf_counter_init(), dropped in hw_perf_counter_destroy(), and
 * checked by the NMI notifier before it does any work.
 */
564 | static atomic_t active_counters; | ||
/* Held while the PMC hardware is being reserved or released. */
565 | static DEFINE_MUTEX(pmc_reserve_mutex); | ||
566 | |||
567 | static bool reserve_pmc_hardware(void) | ||
568 | { | ||
569 | int i; | ||
570 | |||
571 | if (nmi_watchdog == NMI_LOCAL_APIC) | ||
572 | disable_lapic_nmi_watchdog(); | ||
573 | |||
574 | for (i = 0; i < x86_pmu.num_counters; i++) { | ||
575 | if (!reserve_perfctr_nmi(x86_pmu.perfctr + i)) | ||
576 | goto perfctr_fail; | ||
577 | } | ||
578 | |||
579 | for (i = 0; i < x86_pmu.num_counters; i++) { | ||
580 | if (!reserve_evntsel_nmi(x86_pmu.eventsel + i)) | ||
581 | goto eventsel_fail; | ||
582 | } | ||
583 | |||
584 | return true; | ||
585 | |||
586 | eventsel_fail: | ||
587 | for (i--; i >= 0; i--) | ||
588 | release_evntsel_nmi(x86_pmu.eventsel + i); | ||
589 | |||
590 | i = x86_pmu.num_counters; | ||
591 | |||
592 | perfctr_fail: | ||
593 | for (i--; i >= 0; i--) | ||
594 | release_perfctr_nmi(x86_pmu.perfctr + i); | ||
595 | |||
596 | if (nmi_watchdog == NMI_LOCAL_APIC) | ||
597 | enable_lapic_nmi_watchdog(); | ||
598 | |||
599 | return false; | ||
600 | } | ||
601 | |||
602 | static void release_pmc_hardware(void) | ||
603 | { | ||
604 | int i; | ||
605 | |||
606 | for (i = 0; i < x86_pmu.num_counters; i++) { | ||
607 | release_perfctr_nmi(x86_pmu.perfctr + i); | ||
608 | release_evntsel_nmi(x86_pmu.eventsel + i); | ||
609 | } | ||
610 | |||
611 | if (nmi_watchdog == NMI_LOCAL_APIC) | ||
612 | enable_lapic_nmi_watchdog(); | ||
613 | } | ||
614 | |||
615 | static void hw_perf_counter_destroy(struct perf_counter *counter) | ||
616 | { | ||
617 | if (atomic_dec_and_mutex_lock(&active_counters, &pmc_reserve_mutex)) { | ||
618 | release_pmc_hardware(); | ||
619 | mutex_unlock(&pmc_reserve_mutex); | ||
620 | } | ||
621 | } | ||
622 | |||
623 | static inline int x86_pmu_initialized(void) | ||
624 | { | ||
625 | return x86_pmu.handle_irq != NULL; | ||
626 | } | ||
627 | |||
628 | static inline int | ||
629 | set_ext_hw_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr) | ||
630 | { | ||
631 | unsigned int cache_type, cache_op, cache_result; | ||
632 | u64 config, val; | ||
633 | |||
634 | config = attr->config; | ||
635 | |||
636 | cache_type = (config >> 0) & 0xff; | ||
637 | if (cache_type >= PERF_COUNT_HW_CACHE_MAX) | ||
638 | return -EINVAL; | ||
639 | |||
640 | cache_op = (config >> 8) & 0xff; | ||
641 | if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX) | ||
642 | return -EINVAL; | ||
643 | |||
644 | cache_result = (config >> 16) & 0xff; | ||
645 | if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX) | ||
646 | return -EINVAL; | ||
647 | |||
648 | val = hw_cache_event_ids[cache_type][cache_op][cache_result]; | ||
649 | |||
650 | if (val == 0) | ||
651 | return -ENOENT; | ||
652 | |||
653 | if (val == -1) | ||
654 | return -EINVAL; | ||
655 | |||
656 | hwc->config |= val; | ||
657 | |||
658 | return 0; | ||
659 | } | ||
660 | |||
661 | /* | ||
662 | * Setup the hardware configuration for a given attr_type | ||
663 | */ | ||
664 | static int __hw_perf_counter_init(struct perf_counter *counter) | ||
665 | { | ||
666 | struct perf_counter_attr *attr = &counter->attr; | ||
667 | struct hw_perf_counter *hwc = &counter->hw; | ||
668 | int err; | ||
669 | |||
670 | if (!x86_pmu_initialized()) | ||
671 | return -ENODEV; | ||
672 | |||
673 | err = 0; | ||
674 | if (!atomic_inc_not_zero(&active_counters)) { | ||
675 | mutex_lock(&pmc_reserve_mutex); | ||
676 | if (atomic_read(&active_counters) == 0 && !reserve_pmc_hardware()) | ||
677 | err = -EBUSY; | ||
678 | else | ||
679 | atomic_inc(&active_counters); | ||
680 | mutex_unlock(&pmc_reserve_mutex); | ||
681 | } | ||
682 | if (err) | ||
683 | return err; | ||
684 | |||
685 | /* | ||
686 | * Generate PMC IRQs: | ||
687 | * (keep 'enabled' bit clear for now) | ||
688 | */ | ||
689 | hwc->config = ARCH_PERFMON_EVENTSEL_INT; | ||
690 | |||
691 | /* | ||
692 | * Count user and OS events unless requested not to. | ||
693 | */ | ||
694 | if (!attr->exclude_user) | ||
695 | hwc->config |= ARCH_PERFMON_EVENTSEL_USR; | ||
696 | if (!attr->exclude_kernel) | ||
697 | hwc->config |= ARCH_PERFMON_EVENTSEL_OS; | ||
698 | |||
699 | if (!hwc->sample_period) { | ||
700 | hwc->sample_period = x86_pmu.max_period; | ||
701 | hwc->last_period = hwc->sample_period; | ||
702 | atomic64_set(&hwc->period_left, hwc->sample_period); | ||
703 | } | ||
704 | |||
705 | counter->destroy = hw_perf_counter_destroy; | ||
706 | |||
707 | /* | ||
708 | * Raw event type provide the config in the event structure | ||
709 | */ | ||
710 | if (attr->type == PERF_TYPE_RAW) { | ||
711 | hwc->config |= x86_pmu.raw_event(attr->config); | ||
712 | return 0; | ||
713 | } | ||
714 | |||
715 | if (attr->type == PERF_TYPE_HW_CACHE) | ||
716 | return set_ext_hw_attr(hwc, attr); | ||
717 | |||
718 | if (attr->config >= x86_pmu.max_events) | ||
719 | return -EINVAL; | ||
720 | /* | ||
721 | * The generic map: | ||
722 | */ | ||
723 | hwc->config |= x86_pmu.event_map(attr->config); | ||
724 | |||
725 | return 0; | ||
726 | } | ||
727 | |||
728 | static void intel_pmu_disable_all(void) | ||
729 | { | ||
730 | wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); | ||
731 | } | ||
732 | |||
733 | static void amd_pmu_disable_all(void) | ||
734 | { | ||
735 | struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); | ||
736 | int idx; | ||
737 | |||
738 | if (!cpuc->enabled) | ||
739 | return; | ||
740 | |||
741 | cpuc->enabled = 0; | ||
742 | /* | ||
743 | * ensure we write the disable before we start disabling the | ||
744 | * counters proper, so that amd_pmu_enable_counter() does the | ||
745 | * right thing. | ||
746 | */ | ||
747 | barrier(); | ||
748 | |||
749 | for (idx = 0; idx < x86_pmu.num_counters; idx++) { | ||
750 | u64 val; | ||
751 | |||
752 | if (!test_bit(idx, cpuc->active_mask)) | ||
753 | continue; | ||
754 | rdmsrl(MSR_K7_EVNTSEL0 + idx, val); | ||
755 | if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE)) | ||
756 | continue; | ||
757 | val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; | ||
758 | wrmsrl(MSR_K7_EVNTSEL0 + idx, val); | ||
759 | } | ||
760 | } | ||
761 | |||
762 | void hw_perf_disable(void) | ||
763 | { | ||
764 | if (!x86_pmu_initialized()) | ||
765 | return; | ||
766 | return x86_pmu.disable_all(); | ||
767 | } | ||
768 | |||
769 | static void intel_pmu_enable_all(void) | ||
770 | { | ||
771 | wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); | ||
772 | } | ||
773 | |||
774 | static void amd_pmu_enable_all(void) | ||
775 | { | ||
776 | struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); | ||
777 | int idx; | ||
778 | |||
779 | if (cpuc->enabled) | ||
780 | return; | ||
781 | |||
782 | cpuc->enabled = 1; | ||
783 | barrier(); | ||
784 | |||
785 | for (idx = 0; idx < x86_pmu.num_counters; idx++) { | ||
786 | u64 val; | ||
787 | |||
788 | if (!test_bit(idx, cpuc->active_mask)) | ||
789 | continue; | ||
790 | rdmsrl(MSR_K7_EVNTSEL0 + idx, val); | ||
791 | if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) | ||
792 | continue; | ||
793 | val |= ARCH_PERFMON_EVENTSEL0_ENABLE; | ||
794 | wrmsrl(MSR_K7_EVNTSEL0 + idx, val); | ||
795 | } | ||
796 | } | ||
797 | |||
798 | void hw_perf_enable(void) | ||
799 | { | ||
800 | if (!x86_pmu_initialized()) | ||
801 | return; | ||
802 | x86_pmu.enable_all(); | ||
803 | } | ||
804 | |||
805 | static inline u64 intel_pmu_get_status(void) | ||
806 | { | ||
807 | u64 status; | ||
808 | |||
809 | rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); | ||
810 | |||
811 | return status; | ||
812 | } | ||
813 | |||
814 | static inline void intel_pmu_ack_status(u64 ack) | ||
815 | { | ||
816 | wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); | ||
817 | } | ||
818 | |||
819 | static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) | ||
820 | { | ||
821 | int err; | ||
822 | err = checking_wrmsrl(hwc->config_base + idx, | ||
823 | hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE); | ||
824 | } | ||
825 | |||
826 | static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) | ||
827 | { | ||
828 | int err; | ||
829 | err = checking_wrmsrl(hwc->config_base + idx, | ||
830 | hwc->config); | ||
831 | } | ||
832 | |||
833 | static inline void | ||
834 | intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx) | ||
835 | { | ||
836 | int idx = __idx - X86_PMC_IDX_FIXED; | ||
837 | u64 ctrl_val, mask; | ||
838 | int err; | ||
839 | |||
840 | mask = 0xfULL << (idx * 4); | ||
841 | |||
842 | rdmsrl(hwc->config_base, ctrl_val); | ||
843 | ctrl_val &= ~mask; | ||
844 | err = checking_wrmsrl(hwc->config_base, ctrl_val); | ||
845 | } | ||
846 | |||
847 | static inline void | ||
848 | intel_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) | ||
849 | { | ||
850 | if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { | ||
851 | intel_pmu_disable_fixed(hwc, idx); | ||
852 | return; | ||
853 | } | ||
854 | |||
855 | x86_pmu_disable_counter(hwc, idx); | ||
856 | } | ||
857 | |||
/* AMD counters are disabled with the generic event-select write. */
static inline void
amd_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
{
	x86_pmu_disable_counter(hwc, idx);

	return;
}
863 | |||
/*
 * Per-CPU copy of the 'left' value last programmed for each counter slot
 * by x86_perf_counter_set_period(); printed by perf_counter_print_debug().
 */
864 | static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]); | ||
865 | |||
866 | /* | ||
867 | * Set the next IRQ period, based on the hwc->period_left value. | ||
868 | * To be called with the counter disabled in hw: | ||
869 | */ | ||
870 | static int | ||
871 | x86_perf_counter_set_period(struct perf_counter *counter, | ||
872 | struct hw_perf_counter *hwc, int idx) | ||
873 | { | ||
874 | s64 left = atomic64_read(&hwc->period_left); | ||
875 | s64 period = hwc->sample_period; | ||
876 | int err, ret = 0; | ||
877 | |||
878 | /* | ||
879 | * If we are way outside a reasoable range then just skip forward: | ||
880 | */ | ||
881 | if (unlikely(left <= -period)) { | ||
882 | left = period; | ||
883 | atomic64_set(&hwc->period_left, left); | ||
884 | hwc->last_period = period; | ||
885 | ret = 1; | ||
886 | } | ||
887 | |||
888 | if (unlikely(left <= 0)) { | ||
889 | left += period; | ||
890 | atomic64_set(&hwc->period_left, left); | ||
891 | hwc->last_period = period; | ||
892 | ret = 1; | ||
893 | } | ||
894 | /* | ||
895 | * Quirk: certain CPUs dont like it if just 1 event is left: | ||
896 | */ | ||
897 | if (unlikely(left < 2)) | ||
898 | left = 2; | ||
899 | |||
900 | if (left > x86_pmu.max_period) | ||
901 | left = x86_pmu.max_period; | ||
902 | |||
903 | per_cpu(prev_left[idx], smp_processor_id()) = left; | ||
904 | |||
905 | /* | ||
906 | * The hw counter starts counting from this counter offset, | ||
907 | * mark it to be able to extra future deltas: | ||
908 | */ | ||
909 | atomic64_set(&hwc->prev_count, (u64)-left); | ||
910 | |||
911 | err = checking_wrmsrl(hwc->counter_base + idx, | ||
912 | (u64)(-left) & x86_pmu.counter_mask); | ||
913 | |||
914 | return ret; | ||
915 | } | ||
916 | |||
917 | static inline void | ||
918 | intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx) | ||
919 | { | ||
920 | int idx = __idx - X86_PMC_IDX_FIXED; | ||
921 | u64 ctrl_val, bits, mask; | ||
922 | int err; | ||
923 | |||
924 | /* | ||
925 | * Enable IRQ generation (0x8), | ||
926 | * and enable ring-3 counting (0x2) and ring-0 counting (0x1) | ||
927 | * if requested: | ||
928 | */ | ||
929 | bits = 0x8ULL; | ||
930 | if (hwc->config & ARCH_PERFMON_EVENTSEL_USR) | ||
931 | bits |= 0x2; | ||
932 | if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) | ||
933 | bits |= 0x1; | ||
934 | bits <<= (idx * 4); | ||
935 | mask = 0xfULL << (idx * 4); | ||
936 | |||
937 | rdmsrl(hwc->config_base, ctrl_val); | ||
938 | ctrl_val &= ~mask; | ||
939 | ctrl_val |= bits; | ||
940 | err = checking_wrmsrl(hwc->config_base, ctrl_val); | ||
941 | } | ||
942 | |||
943 | static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) | ||
944 | { | ||
945 | if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { | ||
946 | intel_pmu_enable_fixed(hwc, idx); | ||
947 | return; | ||
948 | } | ||
949 | |||
950 | x86_pmu_enable_counter(hwc, idx); | ||
951 | } | ||
952 | |||
953 | static void amd_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) | ||
954 | { | ||
955 | struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); | ||
956 | |||
957 | if (cpuc->enabled) | ||
958 | x86_pmu_enable_counter(hwc, idx); | ||
959 | else | ||
960 | x86_pmu_disable_counter(hwc, idx); | ||
961 | } | ||
962 | |||
963 | static int | ||
964 | fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) | ||
965 | { | ||
966 | unsigned int event; | ||
967 | |||
968 | if (!x86_pmu.num_counters_fixed) | ||
969 | return -1; | ||
970 | |||
971 | /* | ||
972 | * Quirk, IA32_FIXED_CTRs do not work on current Atom processors: | ||
973 | */ | ||
974 | if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && | ||
975 | boot_cpu_data.x86_model == 28) | ||
976 | return -1; | ||
977 | |||
978 | event = hwc->config & ARCH_PERFMON_EVENT_MASK; | ||
979 | |||
980 | if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) | ||
981 | return X86_PMC_IDX_FIXED_INSTRUCTIONS; | ||
982 | if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES))) | ||
983 | return X86_PMC_IDX_FIXED_CPU_CYCLES; | ||
984 | if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES))) | ||
985 | return X86_PMC_IDX_FIXED_BUS_CYCLES; | ||
986 | |||
987 | return -1; | ||
988 | } | ||
989 | |||
990 | /* | ||
991 | * Find a PMC slot for the freshly enabled / scheduled in counter: | ||
992 | */ | ||
993 | static int x86_pmu_enable(struct perf_counter *counter) | ||
994 | { | ||
995 | struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); | ||
996 | struct hw_perf_counter *hwc = &counter->hw; | ||
997 | int idx; | ||
998 | |||
999 | idx = fixed_mode_idx(counter, hwc); | ||
1000 | if (idx >= 0) { | ||
1001 | /* | ||
1002 | * Try to get the fixed counter, if that is already taken | ||
1003 | * then try to get a generic counter: | ||
1004 | */ | ||
1005 | if (test_and_set_bit(idx, cpuc->used_mask)) | ||
1006 | goto try_generic; | ||
1007 | |||
1008 | hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; | ||
1009 | /* | ||
1010 | * We set it so that counter_base + idx in wrmsr/rdmsr maps to | ||
1011 | * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2: | ||
1012 | */ | ||
1013 | hwc->counter_base = | ||
1014 | MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED; | ||
1015 | hwc->idx = idx; | ||
1016 | } else { | ||
1017 | idx = hwc->idx; | ||
1018 | /* Try to get the previous generic counter again */ | ||
1019 | if (test_and_set_bit(idx, cpuc->used_mask)) { | ||
1020 | try_generic: | ||
1021 | idx = find_first_zero_bit(cpuc->used_mask, | ||
1022 | x86_pmu.num_counters); | ||
1023 | if (idx == x86_pmu.num_counters) | ||
1024 | return -EAGAIN; | ||
1025 | |||
1026 | set_bit(idx, cpuc->used_mask); | ||
1027 | hwc->idx = idx; | ||
1028 | } | ||
1029 | hwc->config_base = x86_pmu.eventsel; | ||
1030 | hwc->counter_base = x86_pmu.perfctr; | ||
1031 | } | ||
1032 | |||
1033 | perf_counters_lapic_init(); | ||
1034 | |||
1035 | x86_pmu.disable(hwc, idx); | ||
1036 | |||
1037 | cpuc->counters[idx] = counter; | ||
1038 | set_bit(idx, cpuc->active_mask); | ||
1039 | |||
1040 | x86_perf_counter_set_period(counter, hwc, idx); | ||
1041 | x86_pmu.enable(hwc, idx); | ||
1042 | |||
1043 | return 0; | ||
1044 | } | ||
1045 | |||
1046 | static void x86_pmu_unthrottle(struct perf_counter *counter) | ||
1047 | { | ||
1048 | struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); | ||
1049 | struct hw_perf_counter *hwc = &counter->hw; | ||
1050 | |||
1051 | if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX || | ||
1052 | cpuc->counters[hwc->idx] != counter)) | ||
1053 | return; | ||
1054 | |||
1055 | x86_pmu.enable(hwc, hwc->idx); | ||
1056 | } | ||
1057 | |||
1058 | void perf_counter_print_debug(void) | ||
1059 | { | ||
1060 | u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; | ||
1061 | struct cpu_hw_counters *cpuc; | ||
1062 | unsigned long flags; | ||
1063 | int cpu, idx; | ||
1064 | |||
1065 | if (!x86_pmu.num_counters) | ||
1066 | return; | ||
1067 | |||
1068 | local_irq_save(flags); | ||
1069 | |||
1070 | cpu = smp_processor_id(); | ||
1071 | cpuc = &per_cpu(cpu_hw_counters, cpu); | ||
1072 | |||
1073 | if (x86_pmu.version >= 2) { | ||
1074 | rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); | ||
1075 | rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); | ||
1076 | rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); | ||
1077 | rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed); | ||
1078 | |||
1079 | pr_info("\n"); | ||
1080 | pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl); | ||
1081 | pr_info("CPU#%d: status: %016llx\n", cpu, status); | ||
1082 | pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow); | ||
1083 | pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed); | ||
1084 | } | ||
1085 | pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used_mask); | ||
1086 | |||
1087 | for (idx = 0; idx < x86_pmu.num_counters; idx++) { | ||
1088 | rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); | ||
1089 | rdmsrl(x86_pmu.perfctr + idx, pmc_count); | ||
1090 | |||
1091 | prev_left = per_cpu(prev_left[idx], cpu); | ||
1092 | |||
1093 | pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n", | ||
1094 | cpu, idx, pmc_ctrl); | ||
1095 | pr_info("CPU#%d: gen-PMC%d count: %016llx\n", | ||
1096 | cpu, idx, pmc_count); | ||
1097 | pr_info("CPU#%d: gen-PMC%d left: %016llx\n", | ||
1098 | cpu, idx, prev_left); | ||
1099 | } | ||
1100 | for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { | ||
1101 | rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count); | ||
1102 | |||
1103 | pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", | ||
1104 | cpu, idx, pmc_count); | ||
1105 | } | ||
1106 | local_irq_restore(flags); | ||
1107 | } | ||
1108 | |||
1109 | static void x86_pmu_disable(struct perf_counter *counter) | ||
1110 | { | ||
1111 | struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); | ||
1112 | struct hw_perf_counter *hwc = &counter->hw; | ||
1113 | int idx = hwc->idx; | ||
1114 | |||
1115 | /* | ||
1116 | * Must be done before we disable, otherwise the nmi handler | ||
1117 | * could reenable again: | ||
1118 | */ | ||
1119 | clear_bit(idx, cpuc->active_mask); | ||
1120 | x86_pmu.disable(hwc, idx); | ||
1121 | |||
1122 | /* | ||
1123 | * Make sure the cleared pointer becomes visible before we | ||
1124 | * (potentially) free the counter: | ||
1125 | */ | ||
1126 | barrier(); | ||
1127 | |||
1128 | /* | ||
1129 | * Drain the remaining delta count out of a counter | ||
1130 | * that we are disabling: | ||
1131 | */ | ||
1132 | x86_perf_counter_update(counter, hwc, idx); | ||
1133 | cpuc->counters[idx] = NULL; | ||
1134 | clear_bit(idx, cpuc->used_mask); | ||
1135 | } | ||
1136 | |||
1137 | /* | ||
1138 | * Save and restart an expired counter. Called by NMI contexts, | ||
1139 | * so it has to be careful about preempting normal counter ops: | ||
1140 | */ | ||
1141 | static int intel_pmu_save_and_restart(struct perf_counter *counter) | ||
1142 | { | ||
1143 | struct hw_perf_counter *hwc = &counter->hw; | ||
1144 | int idx = hwc->idx; | ||
1145 | int ret; | ||
1146 | |||
1147 | x86_perf_counter_update(counter, hwc, idx); | ||
1148 | ret = x86_perf_counter_set_period(counter, hwc, idx); | ||
1149 | |||
1150 | if (counter->state == PERF_COUNTER_STATE_ACTIVE) | ||
1151 | intel_pmu_enable_counter(hwc, idx); | ||
1152 | |||
1153 | return ret; | ||
1154 | } | ||
1155 | |||
1156 | static void intel_pmu_reset(void) | ||
1157 | { | ||
1158 | unsigned long flags; | ||
1159 | int idx; | ||
1160 | |||
1161 | if (!x86_pmu.num_counters) | ||
1162 | return; | ||
1163 | |||
1164 | local_irq_save(flags); | ||
1165 | |||
1166 | printk("clearing PMU state on CPU#%d\n", smp_processor_id()); | ||
1167 | |||
1168 | for (idx = 0; idx < x86_pmu.num_counters; idx++) { | ||
1169 | checking_wrmsrl(x86_pmu.eventsel + idx, 0ull); | ||
1170 | checking_wrmsrl(x86_pmu.perfctr + idx, 0ull); | ||
1171 | } | ||
1172 | for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { | ||
1173 | checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); | ||
1174 | } | ||
1175 | |||
1176 | local_irq_restore(flags); | ||
1177 | } | ||
1178 | |||
1179 | |||
1180 | /* | ||
1181 | * This handler is triggered by the local APIC, so the APIC IRQ handling | ||
1182 | * rules apply: | ||
1183 | */ | ||
1184 | static int intel_pmu_handle_irq(struct pt_regs *regs) | ||
1185 | { | ||
1186 | struct perf_sample_data data; | ||
1187 | struct cpu_hw_counters *cpuc; | ||
1188 | int bit, cpu, loops; | ||
1189 | u64 ack, status; | ||
1190 | |||
1191 | data.regs = regs; | ||
1192 | data.addr = 0; | ||
1193 | |||
1194 | cpu = smp_processor_id(); | ||
1195 | cpuc = &per_cpu(cpu_hw_counters, cpu); | ||
1196 | |||
1197 | perf_disable(); | ||
1198 | status = intel_pmu_get_status(); | ||
1199 | if (!status) { | ||
1200 | perf_enable(); | ||
1201 | return 0; | ||
1202 | } | ||
1203 | |||
1204 | loops = 0; | ||
1205 | again: | ||
1206 | if (++loops > 100) { | ||
1207 | WARN_ONCE(1, "perfcounters: irq loop stuck!\n"); | ||
1208 | perf_counter_print_debug(); | ||
1209 | intel_pmu_reset(); | ||
1210 | perf_enable(); | ||
1211 | return 1; | ||
1212 | } | ||
1213 | |||
1214 | inc_irq_stat(apic_perf_irqs); | ||
1215 | ack = status; | ||
1216 | for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { | ||
1217 | struct perf_counter *counter = cpuc->counters[bit]; | ||
1218 | |||
1219 | clear_bit(bit, (unsigned long *) &status); | ||
1220 | if (!test_bit(bit, cpuc->active_mask)) | ||
1221 | continue; | ||
1222 | |||
1223 | if (!intel_pmu_save_and_restart(counter)) | ||
1224 | continue; | ||
1225 | |||
1226 | if (perf_counter_overflow(counter, 1, &data)) | ||
1227 | intel_pmu_disable_counter(&counter->hw, bit); | ||
1228 | } | ||
1229 | |||
1230 | intel_pmu_ack_status(ack); | ||
1231 | |||
1232 | /* | ||
1233 | * Repeat if there is more work to be done: | ||
1234 | */ | ||
1235 | status = intel_pmu_get_status(); | ||
1236 | if (status) | ||
1237 | goto again; | ||
1238 | |||
1239 | perf_enable(); | ||
1240 | |||
1241 | return 1; | ||
1242 | } | ||
1243 | |||
1244 | static int amd_pmu_handle_irq(struct pt_regs *regs) | ||
1245 | { | ||
1246 | struct perf_sample_data data; | ||
1247 | struct cpu_hw_counters *cpuc; | ||
1248 | struct perf_counter *counter; | ||
1249 | struct hw_perf_counter *hwc; | ||
1250 | int cpu, idx, handled = 0; | ||
1251 | u64 val; | ||
1252 | |||
1253 | data.regs = regs; | ||
1254 | data.addr = 0; | ||
1255 | |||
1256 | cpu = smp_processor_id(); | ||
1257 | cpuc = &per_cpu(cpu_hw_counters, cpu); | ||
1258 | |||
1259 | for (idx = 0; idx < x86_pmu.num_counters; idx++) { | ||
1260 | if (!test_bit(idx, cpuc->active_mask)) | ||
1261 | continue; | ||
1262 | |||
1263 | counter = cpuc->counters[idx]; | ||
1264 | hwc = &counter->hw; | ||
1265 | |||
1266 | val = x86_perf_counter_update(counter, hwc, idx); | ||
1267 | if (val & (1ULL << (x86_pmu.counter_bits - 1))) | ||
1268 | continue; | ||
1269 | |||
1270 | /* | ||
1271 | * counter overflow | ||
1272 | */ | ||
1273 | handled = 1; | ||
1274 | data.period = counter->hw.last_period; | ||
1275 | |||
1276 | if (!x86_perf_counter_set_period(counter, hwc, idx)) | ||
1277 | continue; | ||
1278 | |||
1279 | if (perf_counter_overflow(counter, 1, &data)) | ||
1280 | amd_pmu_disable_counter(hwc, idx); | ||
1281 | } | ||
1282 | |||
1283 | if (handled) | ||
1284 | inc_irq_stat(apic_perf_irqs); | ||
1285 | |||
1286 | return handled; | ||
1287 | } | ||
1288 | |||
1289 | void smp_perf_pending_interrupt(struct pt_regs *regs) | ||
1290 | { | ||
1291 | irq_enter(); | ||
1292 | ack_APIC_irq(); | ||
1293 | inc_irq_stat(apic_pending_irqs); | ||
1294 | perf_counter_do_pending(); | ||
1295 | irq_exit(); | ||
1296 | } | ||
1297 | |||
1298 | void set_perf_counter_pending(void) | ||
1299 | { | ||
1300 | apic->send_IPI_self(LOCAL_PENDING_VECTOR); | ||
1301 | } | ||
1302 | |||
1303 | void perf_counters_lapic_init(void) | ||
1304 | { | ||
1305 | if (!x86_pmu_initialized()) | ||
1306 | return; | ||
1307 | |||
1308 | /* | ||
1309 | * Always use NMI for PMU | ||
1310 | */ | ||
1311 | apic_write(APIC_LVTPC, APIC_DM_NMI); | ||
1312 | } | ||
1313 | |||
1314 | static int __kprobes | ||
1315 | perf_counter_nmi_handler(struct notifier_block *self, | ||
1316 | unsigned long cmd, void *__args) | ||
1317 | { | ||
1318 | struct die_args *args = __args; | ||
1319 | struct pt_regs *regs; | ||
1320 | |||
1321 | if (!atomic_read(&active_counters)) | ||
1322 | return NOTIFY_DONE; | ||
1323 | |||
1324 | switch (cmd) { | ||
1325 | case DIE_NMI: | ||
1326 | case DIE_NMI_IPI: | ||
1327 | break; | ||
1328 | |||
1329 | default: | ||
1330 | return NOTIFY_DONE; | ||
1331 | } | ||
1332 | |||
1333 | regs = args->regs; | ||
1334 | |||
1335 | apic_write(APIC_LVTPC, APIC_DM_NMI); | ||
1336 | /* | ||
1337 | * Can't rely on the handled return value to say it was our NMI, two | ||
1338 | * counters could trigger 'simultaneously' raising two back-to-back NMIs. | ||
1339 | * | ||
1340 | * If the first NMI handles both, the latter will be empty and daze | ||
1341 | * the CPU. | ||
1342 | */ | ||
1343 | x86_pmu.handle_irq(regs); | ||
1344 | |||
1345 | return NOTIFY_STOP; | ||
1346 | } | ||
1347 | |||
1348 | static __read_mostly struct notifier_block perf_counter_nmi_notifier = { | ||
1349 | .notifier_call = perf_counter_nmi_handler, | ||
1350 | .next = NULL, | ||
1351 | .priority = 1 | ||
1352 | }; | ||
1353 | |||
1354 | static struct x86_pmu intel_pmu = { | ||
1355 | .name = "Intel", | ||
1356 | .handle_irq = intel_pmu_handle_irq, | ||
1357 | .disable_all = intel_pmu_disable_all, | ||
1358 | .enable_all = intel_pmu_enable_all, | ||
1359 | .enable = intel_pmu_enable_counter, | ||
1360 | .disable = intel_pmu_disable_counter, | ||
1361 | .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, | ||
1362 | .perfctr = MSR_ARCH_PERFMON_PERFCTR0, | ||
1363 | .event_map = intel_pmu_event_map, | ||
1364 | .raw_event = intel_pmu_raw_event, | ||
1365 | .max_events = ARRAY_SIZE(intel_perfmon_event_map), | ||
1366 | /* | ||
1367 | * Intel PMCs cannot be accessed sanely above 32 bit width, | ||
1368 | * so we install an artificial 1<<31 period regardless of | ||
1369 | * the generic counter period: | ||
1370 | */ | ||
1371 | .max_period = (1ULL << 31) - 1, | ||
1372 | }; | ||
1373 | |||
1374 | static struct x86_pmu amd_pmu = { | ||
1375 | .name = "AMD", | ||
1376 | .handle_irq = amd_pmu_handle_irq, | ||
1377 | .disable_all = amd_pmu_disable_all, | ||
1378 | .enable_all = amd_pmu_enable_all, | ||
1379 | .enable = amd_pmu_enable_counter, | ||
1380 | .disable = amd_pmu_disable_counter, | ||
1381 | .eventsel = MSR_K7_EVNTSEL0, | ||
1382 | .perfctr = MSR_K7_PERFCTR0, | ||
1383 | .event_map = amd_pmu_event_map, | ||
1384 | .raw_event = amd_pmu_raw_event, | ||
1385 | .max_events = ARRAY_SIZE(amd_perfmon_event_map), | ||
1386 | .num_counters = 4, | ||
1387 | .counter_bits = 48, | ||
1388 | .counter_mask = (1ULL << 48) - 1, | ||
1389 | /* use highest bit to detect overflow */ | ||
1390 | .max_period = (1ULL << 47) - 1, | ||
1391 | }; | ||
1392 | |||
1393 | static int intel_pmu_init(void) | ||
1394 | { | ||
1395 | union cpuid10_edx edx; | ||
1396 | union cpuid10_eax eax; | ||
1397 | unsigned int unused; | ||
1398 | unsigned int ebx; | ||
1399 | int version; | ||
1400 | |||
1401 | if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) | ||
1402 | return -ENODEV; | ||
1403 | |||
1404 | /* | ||
1405 | * Check whether the Architectural PerfMon supports | ||
1406 | * Branch Misses Retired Event or not. | ||
1407 | */ | ||
1408 | cpuid(10, &eax.full, &ebx, &unused, &edx.full); | ||
1409 | if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) | ||
1410 | return -ENODEV; | ||
1411 | |||
1412 | version = eax.split.version_id; | ||
1413 | if (version < 2) | ||
1414 | return -ENODEV; | ||
1415 | |||
1416 | x86_pmu = intel_pmu; | ||
1417 | x86_pmu.version = version; | ||
1418 | x86_pmu.num_counters = eax.split.num_counters; | ||
1419 | x86_pmu.counter_bits = eax.split.bit_width; | ||
1420 | x86_pmu.counter_mask = (1ULL << eax.split.bit_width) - 1; | ||
1421 | |||
1422 | /* | ||
1423 | * Quirk: v2 perfmon does not report fixed-purpose counters, so | ||
1424 | * assume at least 3 counters: | ||
1425 | */ | ||
1426 | x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3); | ||
1427 | |||
1428 | rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); | ||
1429 | |||
1430 | /* | ||
1431 | * Install the hw-cache-events table: | ||
1432 | */ | ||
1433 | switch (boot_cpu_data.x86_model) { | ||
1434 | case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */ | ||
1435 | case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */ | ||
1436 | case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */ | ||
1437 | case 29: /* six-core 45 nm xeon "Dunnington" */ | ||
1438 | memcpy(hw_cache_event_ids, core2_hw_cache_event_ids, | ||
1439 | sizeof(hw_cache_event_ids)); | ||
1440 | |||
1441 | pr_cont("Core2 events, "); | ||
1442 | break; | ||
1443 | default: | ||
1444 | case 26: | ||
1445 | memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, | ||
1446 | sizeof(hw_cache_event_ids)); | ||
1447 | |||
1448 | pr_cont("Nehalem/Corei7 events, "); | ||
1449 | break; | ||
1450 | case 28: | ||
1451 | memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, | ||
1452 | sizeof(hw_cache_event_ids)); | ||
1453 | |||
1454 | pr_cont("Atom events, "); | ||
1455 | break; | ||
1456 | } | ||
1457 | return 0; | ||
1458 | } | ||
1459 | |||
1460 | static int amd_pmu_init(void) | ||
1461 | { | ||
1462 | x86_pmu = amd_pmu; | ||
1463 | |||
1464 | switch (boot_cpu_data.x86) { | ||
1465 | case 0x0f: | ||
1466 | case 0x10: | ||
1467 | case 0x11: | ||
1468 | memcpy(hw_cache_event_ids, amd_0f_hw_cache_event_ids, | ||
1469 | sizeof(hw_cache_event_ids)); | ||
1470 | |||
1471 | pr_cont("AMD Family 0f/10/11 events, "); | ||
1472 | break; | ||
1473 | } | ||
1474 | return 0; | ||
1475 | } | ||
1476 | |||
1477 | void __init init_hw_perf_counters(void) | ||
1478 | { | ||
1479 | int err; | ||
1480 | |||
1481 | pr_info("Performance Counters: "); | ||
1482 | |||
1483 | switch (boot_cpu_data.x86_vendor) { | ||
1484 | case X86_VENDOR_INTEL: | ||
1485 | err = intel_pmu_init(); | ||
1486 | break; | ||
1487 | case X86_VENDOR_AMD: | ||
1488 | err = amd_pmu_init(); | ||
1489 | break; | ||
1490 | default: | ||
1491 | return; | ||
1492 | } | ||
1493 | if (err != 0) { | ||
1494 | pr_cont("no PMU driver, software counters only.\n"); | ||
1495 | return; | ||
1496 | } | ||
1497 | |||
1498 | pr_cont("%s PMU driver.\n", x86_pmu.name); | ||
1499 | |||
1500 | if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) { | ||
1501 | x86_pmu.num_counters = X86_PMC_MAX_GENERIC; | ||
1502 | WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!", | ||
1503 | x86_pmu.num_counters, X86_PMC_MAX_GENERIC); | ||
1504 | } | ||
1505 | perf_counter_mask = (1 << x86_pmu.num_counters) - 1; | ||
1506 | perf_max_counters = x86_pmu.num_counters; | ||
1507 | |||
1508 | if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) { | ||
1509 | x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED; | ||
1510 | WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!", | ||
1511 | x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED); | ||
1512 | } | ||
1513 | |||
1514 | perf_counter_mask |= | ||
1515 | ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED; | ||
1516 | |||
1517 | perf_counters_lapic_init(); | ||
1518 | register_die_notifier(&perf_counter_nmi_notifier); | ||
1519 | |||
1520 | pr_info("... version: %d\n", x86_pmu.version); | ||
1521 | pr_info("... bit width: %d\n", x86_pmu.counter_bits); | ||
1522 | pr_info("... generic counters: %d\n", x86_pmu.num_counters); | ||
1523 | pr_info("... value mask: %016Lx\n", x86_pmu.counter_mask); | ||
1524 | pr_info("... max period: %016Lx\n", x86_pmu.max_period); | ||
1525 | pr_info("... fixed-purpose counters: %d\n", x86_pmu.num_counters_fixed); | ||
1526 | pr_info("... counter mask: %016Lx\n", perf_counter_mask); | ||
1527 | } | ||
1528 | |||
1529 | static inline void x86_pmu_read(struct perf_counter *counter) | ||
1530 | { | ||
1531 | x86_perf_counter_update(counter, &counter->hw, counter->hw.idx); | ||
1532 | } | ||
1533 | |||
1534 | static const struct pmu pmu = { | ||
1535 | .enable = x86_pmu_enable, | ||
1536 | .disable = x86_pmu_disable, | ||
1537 | .read = x86_pmu_read, | ||
1538 | .unthrottle = x86_pmu_unthrottle, | ||
1539 | }; | ||
1540 | |||
1541 | const struct pmu *hw_perf_counter_init(struct perf_counter *counter) | ||
1542 | { | ||
1543 | int err; | ||
1544 | |||
1545 | err = __hw_perf_counter_init(counter); | ||
1546 | if (err) | ||
1547 | return ERR_PTR(err); | ||
1548 | |||
1549 | return &pmu; | ||
1550 | } | ||
1551 | |||
1552 | /* | ||
1553 | * callchain support | ||
1554 | */ | ||
1555 | |||
1556 | static inline | ||
1557 | void callchain_store(struct perf_callchain_entry *entry, unsigned long ip) | ||
1558 | { | ||
1559 | if (entry->nr < MAX_STACK_DEPTH) | ||
1560 | entry->ip[entry->nr++] = ip; | ||
1561 | } | ||
1562 | |||
1563 | static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry); | ||
1564 | static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry); | ||
1565 | |||
1566 | |||
1567 | static void | ||
1568 | backtrace_warning_symbol(void *data, char *msg, unsigned long symbol) | ||
1569 | { | ||
1570 | /* Ignore warnings */ | ||
1571 | } | ||
1572 | |||
1573 | static void backtrace_warning(void *data, char *msg) | ||
1574 | { | ||
1575 | /* Ignore warnings */ | ||
1576 | } | ||
1577 | |||
1578 | static int backtrace_stack(void *data, char *name) | ||
1579 | { | ||
1580 | /* Don't bother with IRQ stacks for now */ | ||
1581 | return -1; | ||
1582 | } | ||
1583 | |||
1584 | static void backtrace_address(void *data, unsigned long addr, int reliable) | ||
1585 | { | ||
1586 | struct perf_callchain_entry *entry = data; | ||
1587 | |||
1588 | if (reliable) | ||
1589 | callchain_store(entry, addr); | ||
1590 | } | ||
1591 | |||
1592 | static const struct stacktrace_ops backtrace_ops = { | ||
1593 | .warning = backtrace_warning, | ||
1594 | .warning_symbol = backtrace_warning_symbol, | ||
1595 | .stack = backtrace_stack, | ||
1596 | .address = backtrace_address, | ||
1597 | }; | ||
1598 | |||
1599 | static void | ||
1600 | perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) | ||
1601 | { | ||
1602 | unsigned long bp; | ||
1603 | char *stack; | ||
1604 | int nr = entry->nr; | ||
1605 | |||
1606 | callchain_store(entry, instruction_pointer(regs)); | ||
1607 | |||
1608 | stack = ((char *)regs + sizeof(struct pt_regs)); | ||
1609 | #ifdef CONFIG_FRAME_POINTER | ||
1610 | bp = frame_pointer(regs); | ||
1611 | #else | ||
1612 | bp = 0; | ||
1613 | #endif | ||
1614 | |||
1615 | dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, entry); | ||
1616 | |||
1617 | entry->kernel = entry->nr - nr; | ||
1618 | } | ||
1619 | |||
1620 | |||
1621 | struct stack_frame { | ||
1622 | const void __user *next_fp; | ||
1623 | unsigned long return_address; | ||
1624 | }; | ||
1625 | |||
1626 | static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) | ||
1627 | { | ||
1628 | int ret; | ||
1629 | |||
1630 | if (!access_ok(VERIFY_READ, fp, sizeof(*frame))) | ||
1631 | return 0; | ||
1632 | |||
1633 | ret = 1; | ||
1634 | pagefault_disable(); | ||
1635 | if (__copy_from_user_inatomic(frame, fp, sizeof(*frame))) | ||
1636 | ret = 0; | ||
1637 | pagefault_enable(); | ||
1638 | |||
1639 | return ret; | ||
1640 | } | ||
1641 | |||
1642 | static void | ||
1643 | perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) | ||
1644 | { | ||
1645 | struct stack_frame frame; | ||
1646 | const void __user *fp; | ||
1647 | int nr = entry->nr; | ||
1648 | |||
1649 | regs = (struct pt_regs *)current->thread.sp0 - 1; | ||
1650 | fp = (void __user *)regs->bp; | ||
1651 | |||
1652 | callchain_store(entry, regs->ip); | ||
1653 | |||
1654 | while (entry->nr < MAX_STACK_DEPTH) { | ||
1655 | frame.next_fp = NULL; | ||
1656 | frame.return_address = 0; | ||
1657 | |||
1658 | if (!copy_stack_frame(fp, &frame)) | ||
1659 | break; | ||
1660 | |||
1661 | if ((unsigned long)fp < user_stack_pointer(regs)) | ||
1662 | break; | ||
1663 | |||
1664 | callchain_store(entry, frame.return_address); | ||
1665 | fp = frame.next_fp; | ||
1666 | } | ||
1667 | |||
1668 | entry->user = entry->nr - nr; | ||
1669 | } | ||
1670 | |||
1671 | static void | ||
1672 | perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry) | ||
1673 | { | ||
1674 | int is_user; | ||
1675 | |||
1676 | if (!regs) | ||
1677 | return; | ||
1678 | |||
1679 | is_user = user_mode(regs); | ||
1680 | |||
1681 | if (!current || current->pid == 0) | ||
1682 | return; | ||
1683 | |||
1684 | if (is_user && current->state != TASK_RUNNING) | ||
1685 | return; | ||
1686 | |||
1687 | if (!is_user) | ||
1688 | perf_callchain_kernel(regs, entry); | ||
1689 | |||
1690 | if (current->mm) | ||
1691 | perf_callchain_user(regs, entry); | ||
1692 | } | ||
1693 | |||
1694 | struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) | ||
1695 | { | ||
1696 | struct perf_callchain_entry *entry; | ||
1697 | |||
1698 | if (in_nmi()) | ||
1699 | entry = &__get_cpu_var(nmi_entry); | ||
1700 | else | ||
1701 | entry = &__get_cpu_var(irq_entry); | ||
1702 | |||
1703 | entry->nr = 0; | ||
1704 | entry->hv = 0; | ||
1705 | entry->kernel = 0; | ||
1706 | entry->user = 0; | ||
1707 | |||
1708 | perf_do_callchain(regs, entry); | ||
1709 | |||
1710 | return entry; | ||
1711 | } | ||
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index f6c70a164e32..d6f5b9fbde32 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c | |||
@@ -19,8 +19,8 @@ | |||
19 | #include <linux/nmi.h> | 19 | #include <linux/nmi.h> |
20 | #include <linux/kprobes.h> | 20 | #include <linux/kprobes.h> |
21 | 21 | ||
22 | #include <asm/genapic.h> | 22 | #include <asm/apic.h> |
23 | #include <asm/intel_arch_perfmon.h> | 23 | #include <asm/perf_counter.h> |
24 | 24 | ||
25 | struct nmi_watchdog_ctlblk { | 25 | struct nmi_watchdog_ctlblk { |
26 | unsigned int cccr_msr; | 26 | unsigned int cccr_msr; |
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 2ac1f0c2beb3..b07af8861244 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c | |||
@@ -182,6 +182,11 @@ static struct notifier_block __refdata cpuid_class_cpu_notifier = | |||
182 | .notifier_call = cpuid_class_cpu_callback, | 182 | .notifier_call = cpuid_class_cpu_callback, |
183 | }; | 183 | }; |
184 | 184 | ||
185 | static char *cpuid_nodename(struct device *dev) | ||
186 | { | ||
187 | return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt)); | ||
188 | } | ||
189 | |||
185 | static int __init cpuid_init(void) | 190 | static int __init cpuid_init(void) |
186 | { | 191 | { |
187 | int i, err = 0; | 192 | int i, err = 0; |
@@ -198,6 +203,7 @@ static int __init cpuid_init(void) | |||
198 | err = PTR_ERR(cpuid_class); | 203 | err = PTR_ERR(cpuid_class); |
199 | goto out_chrdev; | 204 | goto out_chrdev; |
200 | } | 205 | } |
206 | cpuid_class->nodename = cpuid_nodename; | ||
201 | for_each_online_cpu(i) { | 207 | for_each_online_cpu(i) { |
202 | err = cpuid_device_create(i); | 208 | err = cpuid_device_create(i); |
203 | if (err != 0) | 209 | if (err != 0) |
diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h index da87590b8698..81086c227ab7 100644 --- a/arch/x86/kernel/dumpstack.h +++ b/arch/x86/kernel/dumpstack.h | |||
@@ -29,7 +29,6 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, | |||
29 | unsigned long *sp, unsigned long bp, char *log_lvl); | 29 | unsigned long *sp, unsigned long bp, char *log_lvl); |
30 | 30 | ||
31 | extern unsigned int code_bytes; | 31 | extern unsigned int code_bytes; |
32 | extern int kstack_depth_to_print; | ||
33 | 32 | ||
34 | /* The form of the top of the frame on the stack */ | 33 | /* The form of the top of the frame on the stack */ |
35 | struct stack_frame { | 34 | struct stack_frame { |
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 006281302925..7271fa33d791 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c | |||
@@ -617,7 +617,7 @@ __init int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize, | |||
617 | */ | 617 | */ |
618 | __init void e820_setup_gap(void) | 618 | __init void e820_setup_gap(void) |
619 | { | 619 | { |
620 | unsigned long gapstart, gapsize, round; | 620 | unsigned long gapstart, gapsize; |
621 | int found; | 621 | int found; |
622 | 622 | ||
623 | gapstart = 0x10000000; | 623 | gapstart = 0x10000000; |
@@ -635,14 +635,9 @@ __init void e820_setup_gap(void) | |||
635 | #endif | 635 | #endif |
636 | 636 | ||
637 | /* | 637 | /* |
638 | * See how much we want to round up: start off with | 638 | * e820_reserve_resources_late protect stolen RAM already |
639 | * rounding to the next 1MB area. | ||
640 | */ | 639 | */ |
641 | round = 0x100000; | 640 | pci_mem_start = gapstart; |
642 | while ((gapsize >> 4) > round) | ||
643 | round += round; | ||
644 | /* Fun with two's complement */ | ||
645 | pci_mem_start = (gapstart + round) & -round; | ||
646 | 641 | ||
647 | printk(KERN_INFO | 642 | printk(KERN_INFO |
648 | "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n", | 643 | "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n", |
@@ -1371,6 +1366,23 @@ void __init e820_reserve_resources(void) | |||
1371 | } | 1366 | } |
1372 | } | 1367 | } |
1373 | 1368 | ||
1369 | /* How much should we pad RAM ending depending on where it is? */ | ||
1370 | static unsigned long ram_alignment(resource_size_t pos) | ||
1371 | { | ||
1372 | unsigned long mb = pos >> 20; | ||
1373 | |||
1374 | /* To 64kB in the first megabyte */ | ||
1375 | if (!mb) | ||
1376 | return 64*1024; | ||
1377 | |||
1378 | /* To 1MB in the first 16MB */ | ||
1379 | if (mb < 16) | ||
1380 | return 1024*1024; | ||
1381 | |||
1382 | /* To 32MB for anything above that */ | ||
1383 | return 32*1024*1024; | ||
1384 | } | ||
1385 | |||
1374 | void __init e820_reserve_resources_late(void) | 1386 | void __init e820_reserve_resources_late(void) |
1375 | { | 1387 | { |
1376 | int i; | 1388 | int i; |
@@ -1382,6 +1394,24 @@ void __init e820_reserve_resources_late(void) | |||
1382 | insert_resource_expand_to_fit(&iomem_resource, res); | 1394 | insert_resource_expand_to_fit(&iomem_resource, res); |
1383 | res++; | 1395 | res++; |
1384 | } | 1396 | } |
1397 | |||
1398 | /* | ||
1399 | * Try to bump up RAM regions to reasonable boundaries to | ||
1400 | * avoid stolen RAM: | ||
1401 | */ | ||
1402 | for (i = 0; i < e820.nr_map; i++) { | ||
1403 | struct e820entry *entry = &e820_saved.map[i]; | ||
1404 | resource_size_t start, end; | ||
1405 | |||
1406 | if (entry->type != E820_RAM) | ||
1407 | continue; | ||
1408 | start = entry->addr + entry->size; | ||
1409 | end = round_up(start, ram_alignment(start)); | ||
1410 | if (start == end) | ||
1411 | continue; | ||
1412 | reserve_region_with_split(&iomem_resource, start, | ||
1413 | end - 1, "RAM buffer"); | ||
1414 | } | ||
1385 | } | 1415 | } |
1386 | 1416 | ||
1387 | char *__init default_machine_specific_memory_setup(void) | 1417 | char *__init default_machine_specific_memory_setup(void) |
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 76b8cd953dee..ebdb85cf2686 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c | |||
@@ -97,6 +97,7 @@ static void __init nvidia_bugs(int num, int slot, int func) | |||
97 | } | 97 | } |
98 | 98 | ||
99 | #if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC) | 99 | #if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC) |
100 | #if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC) | ||
100 | static u32 __init ati_ixp4x0_rev(int num, int slot, int func) | 101 | static u32 __init ati_ixp4x0_rev(int num, int slot, int func) |
101 | { | 102 | { |
102 | u32 d; | 103 | u32 d; |
@@ -114,6 +115,7 @@ static u32 __init ati_ixp4x0_rev(int num, int slot, int func) | |||
114 | d &= 0xff; | 115 | d &= 0xff; |
115 | return d; | 116 | return d; |
116 | } | 117 | } |
118 | #endif | ||
117 | 119 | ||
118 | static void __init ati_bugs(int num, int slot, int func) | 120 | static void __init ati_bugs(int num, int slot, int func) |
119 | { | 121 | { |
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 987f91f0f755..de74f0a3e0ed 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S | |||
@@ -963,6 +963,8 @@ END(\sym) | |||
963 | #ifdef CONFIG_SMP | 963 | #ifdef CONFIG_SMP |
964 | apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \ | 964 | apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \ |
965 | irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt | 965 | irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt |
966 | apicinterrupt REBOOT_VECTOR \ | ||
967 | reboot_interrupt smp_reboot_interrupt | ||
966 | #endif | 968 | #endif |
967 | 969 | ||
968 | #ifdef CONFIG_X86_UV | 970 | #ifdef CONFIG_X86_UV |
@@ -994,10 +996,15 @@ apicinterrupt INVALIDATE_TLB_VECTOR_START+7 \ | |||
994 | #endif | 996 | #endif |
995 | 997 | ||
996 | apicinterrupt THRESHOLD_APIC_VECTOR \ | 998 | apicinterrupt THRESHOLD_APIC_VECTOR \ |
997 | threshold_interrupt mce_threshold_interrupt | 999 | threshold_interrupt smp_threshold_interrupt |
998 | apicinterrupt THERMAL_APIC_VECTOR \ | 1000 | apicinterrupt THERMAL_APIC_VECTOR \ |
999 | thermal_interrupt smp_thermal_interrupt | 1001 | thermal_interrupt smp_thermal_interrupt |
1000 | 1002 | ||
1003 | #ifdef CONFIG_X86_MCE | ||
1004 | apicinterrupt MCE_SELF_VECTOR \ | ||
1005 | mce_self_interrupt smp_mce_self_interrupt | ||
1006 | #endif | ||
1007 | |||
1001 | #ifdef CONFIG_SMP | 1008 | #ifdef CONFIG_SMP |
1002 | apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \ | 1009 | apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \ |
1003 | call_function_single_interrupt smp_call_function_single_interrupt | 1010 | call_function_single_interrupt smp_call_function_single_interrupt |
@@ -1012,6 +1019,11 @@ apicinterrupt ERROR_APIC_VECTOR \ | |||
1012 | apicinterrupt SPURIOUS_APIC_VECTOR \ | 1019 | apicinterrupt SPURIOUS_APIC_VECTOR \ |
1013 | spurious_interrupt smp_spurious_interrupt | 1020 | spurious_interrupt smp_spurious_interrupt |
1014 | 1021 | ||
1022 | #ifdef CONFIG_PERF_COUNTERS | ||
1023 | apicinterrupt LOCAL_PENDING_VECTOR \ | ||
1024 | perf_pending_interrupt smp_perf_pending_interrupt | ||
1025 | #endif | ||
1026 | |||
1015 | /* | 1027 | /* |
1016 | * Exception entry points. | 1028 | * Exception entry points. |
1017 | */ | 1029 | */ |
@@ -1366,10 +1378,15 @@ END(xen_failsafe_callback) | |||
1366 | paranoidzeroentry_ist debug do_debug DEBUG_STACK | 1378 | paranoidzeroentry_ist debug do_debug DEBUG_STACK |
1367 | paranoidzeroentry_ist int3 do_int3 DEBUG_STACK | 1379 | paranoidzeroentry_ist int3 do_int3 DEBUG_STACK |
1368 | paranoiderrorentry stack_segment do_stack_segment | 1380 | paranoiderrorentry stack_segment do_stack_segment |
1381 | #ifdef CONFIG_XEN | ||
1382 | zeroentry xen_debug do_debug | ||
1383 | zeroentry xen_int3 do_int3 | ||
1384 | errorentry xen_stack_segment do_stack_segment | ||
1385 | #endif | ||
1369 | errorentry general_protection do_general_protection | 1386 | errorentry general_protection do_general_protection |
1370 | errorentry page_fault do_page_fault | 1387 | errorentry page_fault do_page_fault |
1371 | #ifdef CONFIG_X86_MCE | 1388 | #ifdef CONFIG_X86_MCE |
1372 | paranoidzeroentry machine_check do_machine_check | 1389 | paranoidzeroentry machine_check *machine_check_vector(%rip) |
1373 | #endif | 1390 | #endif |
1374 | 1391 | ||
1375 | /* | 1392 | /* |
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 18dfa30795c9..b79c5533c421 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c | |||
@@ -442,7 +442,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) | |||
442 | _ASM_EXTABLE(1b, 4b) | 442 | _ASM_EXTABLE(1b, 4b) |
443 | _ASM_EXTABLE(2b, 4b) | 443 | _ASM_EXTABLE(2b, 4b) |
444 | 444 | ||
445 | : [old] "=r" (old), [faulted] "=r" (faulted) | 445 | : [old] "=&r" (old), [faulted] "=r" (faulted) |
446 | : [parent] "r" (parent), [return_hooker] "r" (return_hooker) | 446 | : [parent] "r" (parent), [return_hooker] "r" (return_hooker) |
447 | : "memory" | 447 | : "memory" |
448 | ); | 448 | ); |
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 30683883e0cd..dc5ed4bdd88d 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S | |||
@@ -608,13 +608,6 @@ ignore_int: | |||
608 | ENTRY(initial_code) | 608 | ENTRY(initial_code) |
609 | .long i386_start_kernel | 609 | .long i386_start_kernel |
610 | 610 | ||
611 | .section .text | ||
612 | /* | ||
613 | * Real beginning of normal "text" segment | ||
614 | */ | ||
615 | ENTRY(stext) | ||
616 | ENTRY(_stext) | ||
617 | |||
618 | /* | 611 | /* |
619 | * BSS section | 612 | * BSS section |
620 | */ | 613 | */ |
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index 69451473dbd2..51d959528b1d 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c | |||
@@ -91,7 +91,7 @@ void arch_update_kernel_hw_breakpoint(void *unused) | |||
91 | */ | 91 | */ |
92 | kdr7 = temp_kdr7; | 92 | kdr7 = temp_kdr7; |
93 | set_debugreg(kdr7 | current->thread.debugreg7, 7); | 93 | set_debugreg(kdr7 | current->thread.debugreg7, 7); |
94 | put_cpu_no_resched(); | 94 | put_cpu(); |
95 | } | 95 | } |
96 | 96 | ||
97 | /* | 97 | /* |
@@ -374,7 +374,7 @@ int __kprobes hw_breakpoint_handler(struct die_args *args) | |||
374 | rc = NOTIFY_DONE; | 374 | rc = NOTIFY_DONE; |
375 | 375 | ||
376 | set_debugreg(dr7, 7); | 376 | set_debugreg(dr7, 7); |
377 | put_cpu_no_resched(); | 377 | put_cpu(); |
378 | return rc; | 378 | return rc; |
379 | } | 379 | } |
380 | 380 | ||
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c index c2e0bb0890d4..5cf36c053ac4 100644 --- a/arch/x86/kernel/i8253.c +++ b/arch/x86/kernel/i8253.c | |||
@@ -7,6 +7,7 @@ | |||
7 | #include <linux/spinlock.h> | 7 | #include <linux/spinlock.h> |
8 | #include <linux/jiffies.h> | 8 | #include <linux/jiffies.h> |
9 | #include <linux/module.h> | 9 | #include <linux/module.h> |
10 | #include <linux/timex.h> | ||
10 | #include <linux/delay.h> | 11 | #include <linux/delay.h> |
11 | #include <linux/init.h> | 12 | #include <linux/init.h> |
12 | #include <linux/io.h> | 13 | #include <linux/io.h> |
diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c index df3bf269beab..270ff83efc11 100644 --- a/arch/x86/kernel/init_task.c +++ b/arch/x86/kernel/init_task.c | |||
@@ -12,7 +12,6 @@ | |||
12 | 12 | ||
13 | static struct signal_struct init_signals = INIT_SIGNALS(init_signals); | 13 | static struct signal_struct init_signals = INIT_SIGNALS(init_signals); |
14 | static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); | 14 | static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); |
15 | struct mm_struct init_mm = INIT_MM(init_mm); | ||
16 | 15 | ||
17 | /* | 16 | /* |
18 | * Initial thread structure. | 17 | * Initial thread structure. |
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index c3fe010d74c8..b0cdde6932f5 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c | |||
@@ -12,6 +12,8 @@ | |||
12 | #include <asm/io_apic.h> | 12 | #include <asm/io_apic.h> |
13 | #include <asm/irq.h> | 13 | #include <asm/irq.h> |
14 | #include <asm/idle.h> | 14 | #include <asm/idle.h> |
15 | #include <asm/mce.h> | ||
16 | #include <asm/hw_irq.h> | ||
15 | 17 | ||
16 | atomic_t irq_err_count; | 18 | atomic_t irq_err_count; |
17 | 19 | ||
@@ -24,9 +26,9 @@ void (*generic_interrupt_extension)(void) = NULL; | |||
24 | */ | 26 | */ |
25 | void ack_bad_irq(unsigned int irq) | 27 | void ack_bad_irq(unsigned int irq) |
26 | { | 28 | { |
27 | printk(KERN_ERR "unexpected IRQ trap at vector %02x\n", irq); | 29 | if (printk_ratelimit()) |
30 | pr_err("unexpected IRQ trap at vector %02x\n", irq); | ||
28 | 31 | ||
29 | #ifdef CONFIG_X86_LOCAL_APIC | ||
30 | /* | 32 | /* |
31 | * Currently unexpected vectors happen only on SMP and APIC. | 33 | * Currently unexpected vectors happen only on SMP and APIC. |
32 | * We _must_ ack these because every local APIC has only N | 34 | * We _must_ ack these because every local APIC has only N |
@@ -36,9 +38,7 @@ void ack_bad_irq(unsigned int irq) | |||
36 | * completely. | 38 | * completely. |
37 | * But only ack when the APIC is enabled -AK | 39 | * But only ack when the APIC is enabled -AK |
38 | */ | 40 | */ |
39 | if (cpu_has_apic) | 41 | ack_APIC_irq(); |
40 | ack_APIC_irq(); | ||
41 | #endif | ||
42 | } | 42 | } |
43 | 43 | ||
44 | #define irq_stats(x) (&per_cpu(irq_stat, x)) | 44 | #define irq_stats(x) (&per_cpu(irq_stat, x)) |
@@ -63,6 +63,14 @@ static int show_other_interrupts(struct seq_file *p, int prec) | |||
63 | for_each_online_cpu(j) | 63 | for_each_online_cpu(j) |
64 | seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count); | 64 | seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count); |
65 | seq_printf(p, " Spurious interrupts\n"); | 65 | seq_printf(p, " Spurious interrupts\n"); |
66 | seq_printf(p, "%*s: ", prec, "CNT"); | ||
67 | for_each_online_cpu(j) | ||
68 | seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs); | ||
69 | seq_printf(p, " Performance counter interrupts\n"); | ||
70 | seq_printf(p, "%*s: ", prec, "PND"); | ||
71 | for_each_online_cpu(j) | ||
72 | seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs); | ||
73 | seq_printf(p, " Performance pending work\n"); | ||
66 | #endif | 74 | #endif |
67 | if (generic_interrupt_extension) { | 75 | if (generic_interrupt_extension) { |
68 | seq_printf(p, "%*s: ", prec, "PLT"); | 76 | seq_printf(p, "%*s: ", prec, "PLT"); |
@@ -89,13 +97,23 @@ static int show_other_interrupts(struct seq_file *p, int prec) | |||
89 | for_each_online_cpu(j) | 97 | for_each_online_cpu(j) |
90 | seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count); | 98 | seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count); |
91 | seq_printf(p, " Thermal event interrupts\n"); | 99 | seq_printf(p, " Thermal event interrupts\n"); |
92 | # ifdef CONFIG_X86_64 | 100 | # ifdef CONFIG_X86_MCE_THRESHOLD |
93 | seq_printf(p, "%*s: ", prec, "THR"); | 101 | seq_printf(p, "%*s: ", prec, "THR"); |
94 | for_each_online_cpu(j) | 102 | for_each_online_cpu(j) |
95 | seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count); | 103 | seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count); |
96 | seq_printf(p, " Threshold APIC interrupts\n"); | 104 | seq_printf(p, " Threshold APIC interrupts\n"); |
97 | # endif | 105 | # endif |
98 | #endif | 106 | #endif |
107 | #ifdef CONFIG_X86_NEW_MCE | ||
108 | seq_printf(p, "%*s: ", prec, "MCE"); | ||
109 | for_each_online_cpu(j) | ||
110 | seq_printf(p, "%10u ", per_cpu(mce_exception_count, j)); | ||
111 | seq_printf(p, " Machine check exceptions\n"); | ||
112 | seq_printf(p, "%*s: ", prec, "MCP"); | ||
113 | for_each_online_cpu(j) | ||
114 | seq_printf(p, "%10u ", per_cpu(mce_poll_count, j)); | ||
115 | seq_printf(p, " Machine check polls\n"); | ||
116 | #endif | ||
99 | seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count)); | 117 | seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count)); |
100 | #if defined(CONFIG_X86_IO_APIC) | 118 | #if defined(CONFIG_X86_IO_APIC) |
101 | seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count)); | 119 | seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count)); |
@@ -166,6 +184,8 @@ u64 arch_irq_stat_cpu(unsigned int cpu) | |||
166 | #ifdef CONFIG_X86_LOCAL_APIC | 184 | #ifdef CONFIG_X86_LOCAL_APIC |
167 | sum += irq_stats(cpu)->apic_timer_irqs; | 185 | sum += irq_stats(cpu)->apic_timer_irqs; |
168 | sum += irq_stats(cpu)->irq_spurious_count; | 186 | sum += irq_stats(cpu)->irq_spurious_count; |
187 | sum += irq_stats(cpu)->apic_perf_irqs; | ||
188 | sum += irq_stats(cpu)->apic_pending_irqs; | ||
169 | #endif | 189 | #endif |
170 | if (generic_interrupt_extension) | 190 | if (generic_interrupt_extension) |
171 | sum += irq_stats(cpu)->generic_irqs; | 191 | sum += irq_stats(cpu)->generic_irqs; |
@@ -176,9 +196,13 @@ u64 arch_irq_stat_cpu(unsigned int cpu) | |||
176 | #endif | 196 | #endif |
177 | #ifdef CONFIG_X86_MCE | 197 | #ifdef CONFIG_X86_MCE |
178 | sum += irq_stats(cpu)->irq_thermal_count; | 198 | sum += irq_stats(cpu)->irq_thermal_count; |
179 | # ifdef CONFIG_X86_64 | 199 | # ifdef CONFIG_X86_MCE_THRESHOLD |
180 | sum += irq_stats(cpu)->irq_threshold_count; | 200 | sum += irq_stats(cpu)->irq_threshold_count; |
201 | # endif | ||
181 | #endif | 202 | #endif |
203 | #ifdef CONFIG_X86_NEW_MCE | ||
204 | sum += per_cpu(mce_exception_count, cpu); | ||
205 | sum += per_cpu(mce_poll_count, cpu); | ||
182 | #endif | 206 | #endif |
183 | return sum; | 207 | return sum; |
184 | } | 208 | } |
@@ -213,14 +237,11 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs) | |||
213 | irq = __get_cpu_var(vector_irq)[vector]; | 237 | irq = __get_cpu_var(vector_irq)[vector]; |
214 | 238 | ||
215 | if (!handle_irq(irq, regs)) { | 239 | if (!handle_irq(irq, regs)) { |
216 | #ifdef CONFIG_X86_64 | 240 | ack_APIC_irq(); |
217 | if (!disable_apic) | ||
218 | ack_APIC_irq(); | ||
219 | #endif | ||
220 | 241 | ||
221 | if (printk_ratelimit()) | 242 | if (printk_ratelimit()) |
222 | printk(KERN_EMERG "%s: %d.%d No irq handler for vector (irq %d)\n", | 243 | pr_emerg("%s: %d.%d No irq handler for vector (irq %d)\n", |
223 | __func__, smp_processor_id(), vector, irq); | 244 | __func__, smp_processor_id(), vector, irq); |
224 | } | 245 | } |
225 | 246 | ||
226 | irq_exit(); | 247 | irq_exit(); |
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit.c index 368b0a8836f9..696f0e475c2d 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit.c | |||
@@ -1,20 +1,25 @@ | |||
1 | #include <linux/linkage.h> | ||
1 | #include <linux/errno.h> | 2 | #include <linux/errno.h> |
2 | #include <linux/signal.h> | 3 | #include <linux/signal.h> |
3 | #include <linux/sched.h> | 4 | #include <linux/sched.h> |
4 | #include <linux/ioport.h> | 5 | #include <linux/ioport.h> |
5 | #include <linux/interrupt.h> | 6 | #include <linux/interrupt.h> |
7 | #include <linux/timex.h> | ||
6 | #include <linux/slab.h> | 8 | #include <linux/slab.h> |
7 | #include <linux/random.h> | 9 | #include <linux/random.h> |
10 | #include <linux/kprobes.h> | ||
8 | #include <linux/init.h> | 11 | #include <linux/init.h> |
9 | #include <linux/kernel_stat.h> | 12 | #include <linux/kernel_stat.h> |
10 | #include <linux/sysdev.h> | 13 | #include <linux/sysdev.h> |
11 | #include <linux/bitops.h> | 14 | #include <linux/bitops.h> |
15 | #include <linux/acpi.h> | ||
12 | #include <linux/io.h> | 16 | #include <linux/io.h> |
13 | #include <linux/delay.h> | 17 | #include <linux/delay.h> |
14 | 18 | ||
15 | #include <asm/atomic.h> | 19 | #include <asm/atomic.h> |
16 | #include <asm/system.h> | 20 | #include <asm/system.h> |
17 | #include <asm/timer.h> | 21 | #include <asm/timer.h> |
22 | #include <asm/hw_irq.h> | ||
18 | #include <asm/pgtable.h> | 23 | #include <asm/pgtable.h> |
19 | #include <asm/desc.h> | 24 | #include <asm/desc.h> |
20 | #include <asm/apic.h> | 25 | #include <asm/apic.h> |
@@ -22,7 +27,23 @@ | |||
22 | #include <asm/i8259.h> | 27 | #include <asm/i8259.h> |
23 | #include <asm/traps.h> | 28 | #include <asm/traps.h> |
24 | 29 | ||
30 | /* | ||
31 | * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: | ||
32 | * (these are usually mapped to vectors 0x30-0x3f) | ||
33 | */ | ||
34 | |||
35 | /* | ||
36 | * The IO-APIC gives us many more interrupt sources. Most of these | ||
37 | * are unused but an SMP system is supposed to have enough memory ... | ||
38 | * sometimes (mostly wrt. hw bugs) we get corrupted vectors all | ||
39 | * across the spectrum, so we really want to be prepared to get all | ||
40 | * of these. Plus, more powerful systems might have more than 64 | ||
41 | * IO-APIC registers. | ||
42 | * | ||
43 | * (these are usually mapped into the 0x30-0xff vector range) | ||
44 | */ | ||
25 | 45 | ||
46 | #ifdef CONFIG_X86_32 | ||
26 | /* | 47 | /* |
27 | * Note that on a 486, we don't want to do a SIGFPE on an irq13 | 48 | * Note that on a 486, we don't want to do a SIGFPE on an irq13 |
28 | * as the irq is unreliable, and exception 16 works correctly | 49 | * as the irq is unreliable, and exception 16 works correctly |
@@ -52,30 +73,7 @@ static struct irqaction fpu_irq = { | |||
52 | .handler = math_error_irq, | 73 | .handler = math_error_irq, |
53 | .name = "fpu", | 74 | .name = "fpu", |
54 | }; | 75 | }; |
55 | |||
56 | void __init init_ISA_irqs(void) | ||
57 | { | ||
58 | int i; | ||
59 | |||
60 | #ifdef CONFIG_X86_LOCAL_APIC | ||
61 | init_bsp_APIC(); | ||
62 | #endif | 76 | #endif |
63 | init_8259A(0); | ||
64 | |||
65 | /* | ||
66 | * 16 old-style INTA-cycle interrupts: | ||
67 | */ | ||
68 | for (i = 0; i < NR_IRQS_LEGACY; i++) { | ||
69 | struct irq_desc *desc = irq_to_desc(i); | ||
70 | |||
71 | desc->status = IRQ_DISABLED; | ||
72 | desc->action = NULL; | ||
73 | desc->depth = 1; | ||
74 | |||
75 | set_irq_chip_and_handler_name(i, &i8259A_chip, | ||
76 | handle_level_irq, "XT"); | ||
77 | } | ||
78 | } | ||
79 | 77 | ||
80 | /* | 78 | /* |
81 | * IRQ2 is cascade interrupt to second interrupt controller | 79 | * IRQ2 is cascade interrupt to second interrupt controller |
@@ -118,29 +116,37 @@ int vector_used_by_percpu_irq(unsigned int vector) | |||
118 | return 0; | 116 | return 0; |
119 | } | 117 | } |
120 | 118 | ||
121 | /* Overridden in paravirt.c */ | 119 | static void __init init_ISA_irqs(void) |
122 | void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); | ||
123 | |||
124 | void __init native_init_IRQ(void) | ||
125 | { | 120 | { |
126 | int i; | 121 | int i; |
127 | 122 | ||
128 | /* Execute any quirks before the call gates are initialised: */ | 123 | #if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) |
129 | x86_quirk_pre_intr_init(); | 124 | init_bsp_APIC(); |
125 | #endif | ||
126 | init_8259A(0); | ||
130 | 127 | ||
131 | /* | 128 | /* |
132 | * Cover the whole vector space, no vector can escape | 129 | * 16 old-style INTA-cycle interrupts: |
133 | * us. (some of these will be overridden and become | ||
134 | * 'special' SMP interrupts) | ||
135 | */ | 130 | */ |
136 | for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { | 131 | for (i = 0; i < NR_IRQS_LEGACY; i++) { |
137 | /* SYSCALL_VECTOR was reserved in trap_init. */ | 132 | struct irq_desc *desc = irq_to_desc(i); |
138 | if (i != SYSCALL_VECTOR) | 133 | |
139 | set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); | 134 | desc->status = IRQ_DISABLED; |
135 | desc->action = NULL; | ||
136 | desc->depth = 1; | ||
137 | |||
138 | set_irq_chip_and_handler_name(i, &i8259A_chip, | ||
139 | handle_level_irq, "XT"); | ||
140 | } | 140 | } |
141 | } | ||
141 | 142 | ||
143 | /* Overridden in paravirt.c */ | ||
144 | void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); | ||
142 | 145 | ||
143 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP) | 146 | static void __init smp_intr_init(void) |
147 | { | ||
148 | #ifdef CONFIG_SMP | ||
149 | #if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) | ||
144 | /* | 150 | /* |
145 | * The reschedule interrupt is a CPU-to-CPU reschedule-helper | 151 | * The reschedule interrupt is a CPU-to-CPU reschedule-helper |
146 | * IPI, driven by wakeup. | 152 | * IPI, driven by wakeup. |
@@ -160,16 +166,35 @@ void __init native_init_IRQ(void) | |||
160 | /* IPI for generic function call */ | 166 | /* IPI for generic function call */ |
161 | alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); | 167 | alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); |
162 | 168 | ||
163 | /* IPI for single call function */ | 169 | /* IPI for generic single function call */ |
164 | alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, | 170 | alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, |
165 | call_function_single_interrupt); | 171 | call_function_single_interrupt); |
166 | 172 | ||
167 | /* Low priority IPI to cleanup after moving an irq */ | 173 | /* Low priority IPI to cleanup after moving an irq */ |
168 | set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); | 174 | set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); |
169 | set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); | 175 | set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); |
176 | |||
177 | /* IPI used for rebooting/stopping */ | ||
178 | alloc_intr_gate(REBOOT_VECTOR, reboot_interrupt); | ||
170 | #endif | 179 | #endif |
180 | #endif /* CONFIG_SMP */ | ||
181 | } | ||
182 | |||
183 | static void __init apic_intr_init(void) | ||
184 | { | ||
185 | smp_intr_init(); | ||
171 | 186 | ||
172 | #ifdef CONFIG_X86_LOCAL_APIC | 187 | #ifdef CONFIG_X86_THERMAL_VECTOR |
188 | alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); | ||
189 | #endif | ||
190 | #ifdef CONFIG_X86_THRESHOLD | ||
191 | alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); | ||
192 | #endif | ||
193 | #if defined(CONFIG_X86_NEW_MCE) && defined(CONFIG_X86_LOCAL_APIC) | ||
194 | alloc_intr_gate(MCE_SELF_VECTOR, mce_self_interrupt); | ||
195 | #endif | ||
196 | |||
197 | #if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) | ||
173 | /* self generated IPI for local APIC timer */ | 198 | /* self generated IPI for local APIC timer */ |
174 | alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); | 199 | alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); |
175 | 200 | ||
@@ -179,16 +204,59 @@ void __init native_init_IRQ(void) | |||
179 | /* IPI vectors for APIC spurious and error interrupts */ | 204 | /* IPI vectors for APIC spurious and error interrupts */ |
180 | alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); | 205 | alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); |
181 | alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); | 206 | alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); |
207 | |||
208 | /* Performance monitoring interrupts: */ | ||
209 | # ifdef CONFIG_PERF_COUNTERS | ||
210 | alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt); | ||
211 | # endif | ||
212 | |||
182 | #endif | 213 | #endif |
214 | } | ||
183 | 215 | ||
184 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL) | 216 | /** |
185 | /* thermal monitor LVT interrupt */ | 217 | * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors |
186 | alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); | 218 | * |
219 | * Description: | ||
220 | * Perform any necessary interrupt initialisation prior to setting up | ||
221 | * the "ordinary" interrupt call gates. For legacy reasons, the ISA | ||
222 | * interrupts should be initialised here if the machine emulates a PC | ||
223 | * in any way. | ||
224 | **/ | ||
225 | static void __init x86_quirk_pre_intr_init(void) | ||
226 | { | ||
227 | #ifdef CONFIG_X86_32 | ||
228 | if (x86_quirks->arch_pre_intr_init) { | ||
229 | if (x86_quirks->arch_pre_intr_init()) | ||
230 | return; | ||
231 | } | ||
187 | #endif | 232 | #endif |
233 | init_ISA_irqs(); | ||
234 | } | ||
235 | |||
236 | void __init native_init_IRQ(void) | ||
237 | { | ||
238 | int i; | ||
239 | |||
240 | /* Execute any quirks before the call gates are initialised: */ | ||
241 | x86_quirk_pre_intr_init(); | ||
242 | |||
243 | apic_intr_init(); | ||
244 | |||
245 | /* | ||
246 | * Cover the whole vector space, no vector can escape | ||
247 | * us. (some of these will be overridden and become | ||
248 | * 'special' SMP interrupts) | ||
249 | */ | ||
250 | for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { | ||
251 | /* IA32_SYSCALL_VECTOR could be used in trap_init already. */ | ||
252 | if (!test_bit(i, used_vectors)) | ||
253 | set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); | ||
254 | } | ||
188 | 255 | ||
189 | if (!acpi_ioapic) | 256 | if (!acpi_ioapic) |
190 | setup_irq(2, &irq2); | 257 | setup_irq(2, &irq2); |
191 | 258 | ||
259 | #ifdef CONFIG_X86_32 | ||
192 | /* | 260 | /* |
193 | * Call quirks after call gates are initialised (usually add in | 261 | * Call quirks after call gates are initialised (usually add in |
194 | * the architecture specific gates): | 262 | * the architecture specific gates): |
@@ -203,4 +271,5 @@ void __init native_init_IRQ(void) | |||
203 | setup_irq(FPU_IRQ, &fpu_irq); | 271 | setup_irq(FPU_IRQ, &fpu_irq); |
204 | 272 | ||
205 | irq_ctx_init(smp_processor_id()); | 273 | irq_ctx_init(smp_processor_id()); |
274 | #endif | ||
206 | } | 275 | } |
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c deleted file mode 100644 index 8cd10537fd46..000000000000 --- a/arch/x86/kernel/irqinit_64.c +++ /dev/null | |||
@@ -1,177 +0,0 @@ | |||
1 | #include <linux/linkage.h> | ||
2 | #include <linux/errno.h> | ||
3 | #include <linux/signal.h> | ||
4 | #include <linux/sched.h> | ||
5 | #include <linux/ioport.h> | ||
6 | #include <linux/interrupt.h> | ||
7 | #include <linux/timex.h> | ||
8 | #include <linux/slab.h> | ||
9 | #include <linux/random.h> | ||
10 | #include <linux/init.h> | ||
11 | #include <linux/kernel_stat.h> | ||
12 | #include <linux/sysdev.h> | ||
13 | #include <linux/bitops.h> | ||
14 | #include <linux/acpi.h> | ||
15 | #include <linux/io.h> | ||
16 | #include <linux/delay.h> | ||
17 | |||
18 | #include <asm/atomic.h> | ||
19 | #include <asm/system.h> | ||
20 | #include <asm/hw_irq.h> | ||
21 | #include <asm/pgtable.h> | ||
22 | #include <asm/desc.h> | ||
23 | #include <asm/apic.h> | ||
24 | #include <asm/i8259.h> | ||
25 | |||
26 | /* | ||
27 | * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: | ||
28 | * (these are usually mapped to vectors 0x30-0x3f) | ||
29 | */ | ||
30 | |||
31 | /* | ||
32 | * The IO-APIC gives us many more interrupt sources. Most of these | ||
33 | * are unused but an SMP system is supposed to have enough memory ... | ||
34 | * sometimes (mostly wrt. hw bugs) we get corrupted vectors all | ||
35 | * across the spectrum, so we really want to be prepared to get all | ||
36 | * of these. Plus, more powerful systems might have more than 64 | ||
37 | * IO-APIC registers. | ||
38 | * | ||
39 | * (these are usually mapped into the 0x30-0xff vector range) | ||
40 | */ | ||
41 | |||
42 | /* | ||
43 | * IRQ2 is cascade interrupt to second interrupt controller | ||
44 | */ | ||
45 | |||
46 | static struct irqaction irq2 = { | ||
47 | .handler = no_action, | ||
48 | .name = "cascade", | ||
49 | }; | ||
50 | DEFINE_PER_CPU(vector_irq_t, vector_irq) = { | ||
51 | [0 ... IRQ0_VECTOR - 1] = -1, | ||
52 | [IRQ0_VECTOR] = 0, | ||
53 | [IRQ1_VECTOR] = 1, | ||
54 | [IRQ2_VECTOR] = 2, | ||
55 | [IRQ3_VECTOR] = 3, | ||
56 | [IRQ4_VECTOR] = 4, | ||
57 | [IRQ5_VECTOR] = 5, | ||
58 | [IRQ6_VECTOR] = 6, | ||
59 | [IRQ7_VECTOR] = 7, | ||
60 | [IRQ8_VECTOR] = 8, | ||
61 | [IRQ9_VECTOR] = 9, | ||
62 | [IRQ10_VECTOR] = 10, | ||
63 | [IRQ11_VECTOR] = 11, | ||
64 | [IRQ12_VECTOR] = 12, | ||
65 | [IRQ13_VECTOR] = 13, | ||
66 | [IRQ14_VECTOR] = 14, | ||
67 | [IRQ15_VECTOR] = 15, | ||
68 | [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1 | ||
69 | }; | ||
70 | |||
71 | int vector_used_by_percpu_irq(unsigned int vector) | ||
72 | { | ||
73 | int cpu; | ||
74 | |||
75 | for_each_online_cpu(cpu) { | ||
76 | if (per_cpu(vector_irq, cpu)[vector] != -1) | ||
77 | return 1; | ||
78 | } | ||
79 | |||
80 | return 0; | ||
81 | } | ||
82 | |||
83 | static void __init init_ISA_irqs(void) | ||
84 | { | ||
85 | int i; | ||
86 | |||
87 | init_bsp_APIC(); | ||
88 | init_8259A(0); | ||
89 | |||
90 | for (i = 0; i < NR_IRQS_LEGACY; i++) { | ||
91 | struct irq_desc *desc = irq_to_desc(i); | ||
92 | |||
93 | desc->status = IRQ_DISABLED; | ||
94 | desc->action = NULL; | ||
95 | desc->depth = 1; | ||
96 | |||
97 | /* | ||
98 | * 16 old-style INTA-cycle interrupts: | ||
99 | */ | ||
100 | set_irq_chip_and_handler_name(i, &i8259A_chip, | ||
101 | handle_level_irq, "XT"); | ||
102 | } | ||
103 | } | ||
104 | |||
105 | void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); | ||
106 | |||
107 | static void __init smp_intr_init(void) | ||
108 | { | ||
109 | #ifdef CONFIG_SMP | ||
110 | /* | ||
111 | * The reschedule interrupt is a CPU-to-CPU reschedule-helper | ||
112 | * IPI, driven by wakeup. | ||
113 | */ | ||
114 | alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); | ||
115 | |||
116 | /* IPIs for invalidation */ | ||
117 | alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0); | ||
118 | alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1); | ||
119 | alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2); | ||
120 | alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3); | ||
121 | alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4); | ||
122 | alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5); | ||
123 | alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6); | ||
124 | alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7); | ||
125 | |||
126 | /* IPI for generic function call */ | ||
127 | alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); | ||
128 | |||
129 | /* IPI for generic single function call */ | ||
130 | alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, | ||
131 | call_function_single_interrupt); | ||
132 | |||
133 | /* Low priority IPI to cleanup after moving an irq */ | ||
134 | set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); | ||
135 | set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); | ||
136 | #endif | ||
137 | } | ||
138 | |||
139 | static void __init apic_intr_init(void) | ||
140 | { | ||
141 | smp_intr_init(); | ||
142 | |||
143 | alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); | ||
144 | alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); | ||
145 | |||
146 | /* self generated IPI for local APIC timer */ | ||
147 | alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); | ||
148 | |||
149 | /* generic IPI for platform specific use */ | ||
150 | alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt); | ||
151 | |||
152 | /* IPI vectors for APIC spurious and error interrupts */ | ||
153 | alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); | ||
154 | alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); | ||
155 | } | ||
156 | |||
157 | void __init native_init_IRQ(void) | ||
158 | { | ||
159 | int i; | ||
160 | |||
161 | init_ISA_irqs(); | ||
162 | /* | ||
163 | * Cover the whole vector space, no vector can escape | ||
164 | * us. (some of these will be overridden and become | ||
165 | * 'special' SMP interrupts) | ||
166 | */ | ||
167 | for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { | ||
168 | int vector = FIRST_EXTERNAL_VECTOR + i; | ||
169 | if (vector != IA32_SYSCALL_VECTOR) | ||
170 | set_intr_gate(vector, interrupt[i]); | ||
171 | } | ||
172 | |||
173 | apic_intr_init(); | ||
174 | |||
175 | if (!acpi_ioapic) | ||
176 | setup_irq(2, &irq2); | ||
177 | } | ||
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index f820b73c7f28..34e86b67550c 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c | |||
@@ -143,7 +143,7 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) | |||
143 | gdb_regs32[GDB_PS] = *(unsigned long *)(p->thread.sp + 8); | 143 | gdb_regs32[GDB_PS] = *(unsigned long *)(p->thread.sp + 8); |
144 | gdb_regs32[GDB_CS] = __KERNEL_CS; | 144 | gdb_regs32[GDB_CS] = __KERNEL_CS; |
145 | gdb_regs32[GDB_SS] = __KERNEL_DS; | 145 | gdb_regs32[GDB_SS] = __KERNEL_DS; |
146 | gdb_regs[GDB_PC] = p->thread.ip; | 146 | gdb_regs[GDB_PC] = 0; |
147 | gdb_regs[GDB_R8] = 0; | 147 | gdb_regs[GDB_R8] = 0; |
148 | gdb_regs[GDB_R9] = 0; | 148 | gdb_regs[GDB_R9] = 0; |
149 | gdb_regs[GDB_R10] = 0; | 149 | gdb_regs[GDB_R10] = 0; |
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 33019ddb56b4..a78ecad0c900 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c | |||
@@ -27,6 +27,7 @@ | |||
27 | #include <linux/mm.h> | 27 | #include <linux/mm.h> |
28 | #include <linux/highmem.h> | 28 | #include <linux/highmem.h> |
29 | #include <linux/hardirq.h> | 29 | #include <linux/hardirq.h> |
30 | #include <asm/timer.h> | ||
30 | 31 | ||
31 | #define MMU_QUEUE_SIZE 1024 | 32 | #define MMU_QUEUE_SIZE 1024 |
32 | 33 | ||
@@ -195,7 +196,7 @@ static void kvm_leave_lazy_mmu(void) | |||
195 | struct kvm_para_state *state = kvm_para_state(); | 196 | struct kvm_para_state *state = kvm_para_state(); |
196 | 197 | ||
197 | mmu_queue_flush(state); | 198 | mmu_queue_flush(state); |
198 | paravirt_leave_lazy(paravirt_get_lazy_mode()); | 199 | paravirt_leave_lazy_mmu(); |
199 | state->mode = paravirt_get_lazy_mode(); | 200 | state->mode = paravirt_get_lazy_mode(); |
200 | } | 201 | } |
201 | 202 | ||
@@ -230,6 +231,9 @@ static void paravirt_ops_setup(void) | |||
230 | pv_mmu_ops.lazy_mode.enter = kvm_enter_lazy_mmu; | 231 | pv_mmu_ops.lazy_mode.enter = kvm_enter_lazy_mmu; |
231 | pv_mmu_ops.lazy_mode.leave = kvm_leave_lazy_mmu; | 232 | pv_mmu_ops.lazy_mode.leave = kvm_leave_lazy_mmu; |
232 | } | 233 | } |
234 | #ifdef CONFIG_X86_IO_APIC | ||
235 | no_timer_check = 1; | ||
236 | #endif | ||
233 | } | 237 | } |
234 | 238 | ||
235 | void __init kvm_guest_init(void) | 239 | void __init kvm_guest_init(void) |
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 453b5795a5c6..366baa179913 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c | |||
@@ -13,25 +13,13 @@ | |||
13 | * Licensed under the terms of the GNU General Public | 13 | * Licensed under the terms of the GNU General Public |
14 | * License version 2. See file COPYING for details. | 14 | * License version 2. See file COPYING for details. |
15 | */ | 15 | */ |
16 | #include <linux/platform_device.h> | ||
17 | #include <linux/capability.h> | ||
18 | #include <linux/miscdevice.h> | ||
19 | #include <linux/firmware.h> | 16 | #include <linux/firmware.h> |
20 | #include <linux/spinlock.h> | ||
21 | #include <linux/cpumask.h> | ||
22 | #include <linux/pci_ids.h> | 17 | #include <linux/pci_ids.h> |
23 | #include <linux/uaccess.h> | 18 | #include <linux/uaccess.h> |
24 | #include <linux/vmalloc.h> | 19 | #include <linux/vmalloc.h> |
25 | #include <linux/kernel.h> | 20 | #include <linux/kernel.h> |
26 | #include <linux/module.h> | 21 | #include <linux/module.h> |
27 | #include <linux/mutex.h> | ||
28 | #include <linux/sched.h> | ||
29 | #include <linux/init.h> | ||
30 | #include <linux/slab.h> | ||
31 | #include <linux/cpu.h> | ||
32 | #include <linux/pci.h> | 22 | #include <linux/pci.h> |
33 | #include <linux/fs.h> | ||
34 | #include <linux/mm.h> | ||
35 | 23 | ||
36 | #include <asm/microcode.h> | 24 | #include <asm/microcode.h> |
37 | #include <asm/processor.h> | 25 | #include <asm/processor.h> |
@@ -79,9 +67,6 @@ struct microcode_amd { | |||
79 | #define UCODE_CONTAINER_SECTION_HDR 8 | 67 | #define UCODE_CONTAINER_SECTION_HDR 8 |
80 | #define UCODE_CONTAINER_HEADER_SIZE 12 | 68 | #define UCODE_CONTAINER_HEADER_SIZE 12 |
81 | 69 | ||
82 | /* serialize access to the physical write */ | ||
83 | static DEFINE_SPINLOCK(microcode_update_lock); | ||
84 | |||
85 | static struct equiv_cpu_entry *equiv_cpu_table; | 70 | static struct equiv_cpu_entry *equiv_cpu_table; |
86 | 71 | ||
87 | static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) | 72 | static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) |
@@ -144,9 +129,8 @@ static int get_matching_microcode(int cpu, void *mc, int rev) | |||
144 | return 1; | 129 | return 1; |
145 | } | 130 | } |
146 | 131 | ||
147 | static void apply_microcode_amd(int cpu) | 132 | static int apply_microcode_amd(int cpu) |
148 | { | 133 | { |
149 | unsigned long flags; | ||
150 | u32 rev, dummy; | 134 | u32 rev, dummy; |
151 | int cpu_num = raw_smp_processor_id(); | 135 | int cpu_num = raw_smp_processor_id(); |
152 | struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; | 136 | struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; |
@@ -156,25 +140,25 @@ static void apply_microcode_amd(int cpu) | |||
156 | BUG_ON(cpu_num != cpu); | 140 | BUG_ON(cpu_num != cpu); |
157 | 141 | ||
158 | if (mc_amd == NULL) | 142 | if (mc_amd == NULL) |
159 | return; | 143 | return 0; |
160 | 144 | ||
161 | spin_lock_irqsave(&microcode_update_lock, flags); | ||
162 | wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code); | 145 | wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code); |
163 | /* get patch id after patching */ | 146 | /* get patch id after patching */ |
164 | rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); | 147 | rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); |
165 | spin_unlock_irqrestore(&microcode_update_lock, flags); | ||
166 | 148 | ||
167 | /* check current patch id and patch's id for match */ | 149 | /* check current patch id and patch's id for match */ |
168 | if (rev != mc_amd->hdr.patch_id) { | 150 | if (rev != mc_amd->hdr.patch_id) { |
169 | printk(KERN_ERR "microcode: CPU%d: update failed " | 151 | printk(KERN_ERR "microcode: CPU%d: update failed " |
170 | "(for patch_level=0x%x)\n", cpu, mc_amd->hdr.patch_id); | 152 | "(for patch_level=0x%x)\n", cpu, mc_amd->hdr.patch_id); |
171 | return; | 153 | return -1; |
172 | } | 154 | } |
173 | 155 | ||
174 | printk(KERN_INFO "microcode: CPU%d: updated (new patch_level=0x%x)\n", | 156 | printk(KERN_INFO "microcode: CPU%d: updated (new patch_level=0x%x)\n", |
175 | cpu, rev); | 157 | cpu, rev); |
176 | 158 | ||
177 | uci->cpu_sig.rev = rev; | 159 | uci->cpu_sig.rev = rev; |
160 | |||
161 | return 0; | ||
178 | } | 162 | } |
179 | 163 | ||
180 | static int get_ucode_data(void *to, const u8 *from, size_t n) | 164 | static int get_ucode_data(void *to, const u8 *from, size_t n) |
@@ -257,13 +241,12 @@ static int install_equiv_cpu_table(const u8 *buf) | |||
257 | 241 | ||
258 | static void free_equiv_cpu_table(void) | 242 | static void free_equiv_cpu_table(void) |
259 | { | 243 | { |
260 | if (equiv_cpu_table) { | 244 | vfree(equiv_cpu_table); |
261 | vfree(equiv_cpu_table); | 245 | equiv_cpu_table = NULL; |
262 | equiv_cpu_table = NULL; | ||
263 | } | ||
264 | } | 246 | } |
265 | 247 | ||
266 | static int generic_load_microcode(int cpu, const u8 *data, size_t size) | 248 | static enum ucode_state |
249 | generic_load_microcode(int cpu, const u8 *data, size_t size) | ||
267 | { | 250 | { |
268 | struct ucode_cpu_info *uci = ucode_cpu_info + cpu; | 251 | struct ucode_cpu_info *uci = ucode_cpu_info + cpu; |
269 | const u8 *ucode_ptr = data; | 252 | const u8 *ucode_ptr = data; |
@@ -272,12 +255,13 @@ static int generic_load_microcode(int cpu, const u8 *data, size_t size) | |||
272 | int new_rev = uci->cpu_sig.rev; | 255 | int new_rev = uci->cpu_sig.rev; |
273 | unsigned int leftover; | 256 | unsigned int leftover; |
274 | unsigned long offset; | 257 | unsigned long offset; |
258 | enum ucode_state state = UCODE_OK; | ||
275 | 259 | ||
276 | offset = install_equiv_cpu_table(ucode_ptr); | 260 | offset = install_equiv_cpu_table(ucode_ptr); |
277 | if (!offset) { | 261 | if (!offset) { |
278 | printk(KERN_ERR "microcode: failed to create " | 262 | printk(KERN_ERR "microcode: failed to create " |
279 | "equivalent cpu table\n"); | 263 | "equivalent cpu table\n"); |
280 | return -EINVAL; | 264 | return UCODE_ERROR; |
281 | } | 265 | } |
282 | 266 | ||
283 | ucode_ptr += offset; | 267 | ucode_ptr += offset; |
@@ -293,8 +277,7 @@ static int generic_load_microcode(int cpu, const u8 *data, size_t size) | |||
293 | 277 | ||
294 | mc_header = (struct microcode_header_amd *)mc; | 278 | mc_header = (struct microcode_header_amd *)mc; |
295 | if (get_matching_microcode(cpu, mc, new_rev)) { | 279 | if (get_matching_microcode(cpu, mc, new_rev)) { |
296 | if (new_mc) | 280 | vfree(new_mc); |
297 | vfree(new_mc); | ||
298 | new_rev = mc_header->patch_id; | 281 | new_rev = mc_header->patch_id; |
299 | new_mc = mc; | 282 | new_mc = mc; |
300 | } else | 283 | } else |
@@ -306,34 +289,32 @@ static int generic_load_microcode(int cpu, const u8 *data, size_t size) | |||
306 | 289 | ||
307 | if (new_mc) { | 290 | if (new_mc) { |
308 | if (!leftover) { | 291 | if (!leftover) { |
309 | if (uci->mc) | 292 | vfree(uci->mc); |
310 | vfree(uci->mc); | ||
311 | uci->mc = new_mc; | 293 | uci->mc = new_mc; |
312 | pr_debug("microcode: CPU%d found a matching microcode " | 294 | pr_debug("microcode: CPU%d found a matching microcode " |
313 | "update with version 0x%x (current=0x%x)\n", | 295 | "update with version 0x%x (current=0x%x)\n", |
314 | cpu, new_rev, uci->cpu_sig.rev); | 296 | cpu, new_rev, uci->cpu_sig.rev); |
315 | } else | 297 | } else { |
316 | vfree(new_mc); | 298 | vfree(new_mc); |
317 | } | 299 | state = UCODE_ERROR; |
300 | } | ||
301 | } else | ||
302 | state = UCODE_NFOUND; | ||
318 | 303 | ||
319 | free_equiv_cpu_table(); | 304 | free_equiv_cpu_table(); |
320 | 305 | ||
321 | return (int)leftover; | 306 | return state; |
322 | } | 307 | } |
323 | 308 | ||
324 | static int request_microcode_fw(int cpu, struct device *device) | 309 | static enum ucode_state request_microcode_fw(int cpu, struct device *device) |
325 | { | 310 | { |
326 | const char *fw_name = "amd-ucode/microcode_amd.bin"; | 311 | const char *fw_name = "amd-ucode/microcode_amd.bin"; |
327 | const struct firmware *firmware; | 312 | const struct firmware *firmware; |
328 | int ret; | 313 | enum ucode_state ret; |
329 | |||
330 | /* We should bind the task to the CPU */ | ||
331 | BUG_ON(cpu != raw_smp_processor_id()); | ||
332 | 314 | ||
333 | ret = request_firmware(&firmware, fw_name, device); | 315 | if (request_firmware(&firmware, fw_name, device)) { |
334 | if (ret) { | ||
335 | printk(KERN_ERR "microcode: failed to load file %s\n", fw_name); | 316 | printk(KERN_ERR "microcode: failed to load file %s\n", fw_name); |
336 | return ret; | 317 | return UCODE_NFOUND; |
337 | } | 318 | } |
338 | 319 | ||
339 | ret = generic_load_microcode(cpu, firmware->data, firmware->size); | 320 | ret = generic_load_microcode(cpu, firmware->data, firmware->size); |
@@ -343,11 +324,12 @@ static int request_microcode_fw(int cpu, struct device *device) | |||
343 | return ret; | 324 | return ret; |
344 | } | 325 | } |
345 | 326 | ||
346 | static int request_microcode_user(int cpu, const void __user *buf, size_t size) | 327 | static enum ucode_state |
328 | request_microcode_user(int cpu, const void __user *buf, size_t size) | ||
347 | { | 329 | { |
348 | printk(KERN_INFO "microcode: AMD microcode update via " | 330 | printk(KERN_INFO "microcode: AMD microcode update via " |
349 | "/dev/cpu/microcode not supported\n"); | 331 | "/dev/cpu/microcode not supported\n"); |
350 | return -1; | 332 | return UCODE_ERROR; |
351 | } | 333 | } |
352 | 334 | ||
353 | static void microcode_fini_cpu_amd(int cpu) | 335 | static void microcode_fini_cpu_amd(int cpu) |
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 98c470c069d1..9371448290ac 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c | |||
@@ -71,27 +71,18 @@ | |||
71 | * Thanks to Stuart Swales for pointing out this bug. | 71 | * Thanks to Stuart Swales for pointing out this bug. |
72 | */ | 72 | */ |
73 | #include <linux/platform_device.h> | 73 | #include <linux/platform_device.h> |
74 | #include <linux/capability.h> | ||
75 | #include <linux/miscdevice.h> | 74 | #include <linux/miscdevice.h> |
76 | #include <linux/firmware.h> | 75 | #include <linux/capability.h> |
77 | #include <linux/smp_lock.h> | 76 | #include <linux/smp_lock.h> |
78 | #include <linux/spinlock.h> | ||
79 | #include <linux/cpumask.h> | ||
80 | #include <linux/uaccess.h> | ||
81 | #include <linux/vmalloc.h> | ||
82 | #include <linux/kernel.h> | 77 | #include <linux/kernel.h> |
83 | #include <linux/module.h> | 78 | #include <linux/module.h> |
84 | #include <linux/mutex.h> | 79 | #include <linux/mutex.h> |
85 | #include <linux/sched.h> | ||
86 | #include <linux/init.h> | ||
87 | #include <linux/slab.h> | ||
88 | #include <linux/cpu.h> | 80 | #include <linux/cpu.h> |
89 | #include <linux/fs.h> | 81 | #include <linux/fs.h> |
90 | #include <linux/mm.h> | 82 | #include <linux/mm.h> |
91 | 83 | ||
92 | #include <asm/microcode.h> | 84 | #include <asm/microcode.h> |
93 | #include <asm/processor.h> | 85 | #include <asm/processor.h> |
94 | #include <asm/msr.h> | ||
95 | 86 | ||
96 | MODULE_DESCRIPTION("Microcode Update Driver"); | 87 | MODULE_DESCRIPTION("Microcode Update Driver"); |
97 | MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>"); | 88 | MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>"); |
@@ -101,36 +92,110 @@ MODULE_LICENSE("GPL"); | |||
101 | 92 | ||
102 | static struct microcode_ops *microcode_ops; | 93 | static struct microcode_ops *microcode_ops; |
103 | 94 | ||
104 | /* no concurrent ->write()s are allowed on /dev/cpu/microcode */ | 95 | /* |
96 | * Synchronization. | ||
97 | * | ||
98 | * All non cpu-hotplug-callback call sites use: | ||
99 | * | ||
100 | * - microcode_mutex to synchronize with each other; | ||
101 | * - get/put_online_cpus() to synchronize with | ||
102 | * the cpu-hotplug-callback call sites. | ||
103 | * | ||
104 | * We guarantee that only a single cpu is being | ||
105 | * updated at any particular moment of time. | ||
106 | */ | ||
105 | static DEFINE_MUTEX(microcode_mutex); | 107 | static DEFINE_MUTEX(microcode_mutex); |
106 | 108 | ||
107 | struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; | 109 | struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; |
108 | EXPORT_SYMBOL_GPL(ucode_cpu_info); | 110 | EXPORT_SYMBOL_GPL(ucode_cpu_info); |
109 | 111 | ||
112 | /* | ||
113 | * Operations that are run on a target cpu: | ||
114 | */ | ||
115 | |||
116 | struct cpu_info_ctx { | ||
117 | struct cpu_signature *cpu_sig; | ||
118 | int err; | ||
119 | }; | ||
120 | |||
121 | static void collect_cpu_info_local(void *arg) | ||
122 | { | ||
123 | struct cpu_info_ctx *ctx = arg; | ||
124 | |||
125 | ctx->err = microcode_ops->collect_cpu_info(smp_processor_id(), | ||
126 | ctx->cpu_sig); | ||
127 | } | ||
128 | |||
129 | static int collect_cpu_info_on_target(int cpu, struct cpu_signature *cpu_sig) | ||
130 | { | ||
131 | struct cpu_info_ctx ctx = { .cpu_sig = cpu_sig, .err = 0 }; | ||
132 | int ret; | ||
133 | |||
134 | ret = smp_call_function_single(cpu, collect_cpu_info_local, &ctx, 1); | ||
135 | if (!ret) | ||
136 | ret = ctx.err; | ||
137 | |||
138 | return ret; | ||
139 | } | ||
140 | |||
141 | static int collect_cpu_info(int cpu) | ||
142 | { | ||
143 | struct ucode_cpu_info *uci = ucode_cpu_info + cpu; | ||
144 | int ret; | ||
145 | |||
146 | memset(uci, 0, sizeof(*uci)); | ||
147 | |||
148 | ret = collect_cpu_info_on_target(cpu, &uci->cpu_sig); | ||
149 | if (!ret) | ||
150 | uci->valid = 1; | ||
151 | |||
152 | return ret; | ||
153 | } | ||
154 | |||
155 | struct apply_microcode_ctx { | ||
156 | int err; | ||
157 | }; | ||
158 | |||
159 | static void apply_microcode_local(void *arg) | ||
160 | { | ||
161 | struct apply_microcode_ctx *ctx = arg; | ||
162 | |||
163 | ctx->err = microcode_ops->apply_microcode(smp_processor_id()); | ||
164 | } | ||
165 | |||
166 | static int apply_microcode_on_target(int cpu) | ||
167 | { | ||
168 | struct apply_microcode_ctx ctx = { .err = 0 }; | ||
169 | int ret; | ||
170 | |||
171 | ret = smp_call_function_single(cpu, apply_microcode_local, &ctx, 1); | ||
172 | if (!ret) | ||
173 | ret = ctx.err; | ||
174 | |||
175 | return ret; | ||
176 | } | ||
177 | |||
110 | #ifdef CONFIG_MICROCODE_OLD_INTERFACE | 178 | #ifdef CONFIG_MICROCODE_OLD_INTERFACE |
111 | static int do_microcode_update(const void __user *buf, size_t size) | 179 | static int do_microcode_update(const void __user *buf, size_t size) |
112 | { | 180 | { |
113 | cpumask_t old; | ||
114 | int error = 0; | 181 | int error = 0; |
115 | int cpu; | 182 | int cpu; |
116 | 183 | ||
117 | old = current->cpus_allowed; | ||
118 | |||
119 | for_each_online_cpu(cpu) { | 184 | for_each_online_cpu(cpu) { |
120 | struct ucode_cpu_info *uci = ucode_cpu_info + cpu; | 185 | struct ucode_cpu_info *uci = ucode_cpu_info + cpu; |
186 | enum ucode_state ustate; | ||
121 | 187 | ||
122 | if (!uci->valid) | 188 | if (!uci->valid) |
123 | continue; | 189 | continue; |
124 | 190 | ||
125 | set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); | 191 | ustate = microcode_ops->request_microcode_user(cpu, buf, size); |
126 | error = microcode_ops->request_microcode_user(cpu, buf, size); | 192 | if (ustate == UCODE_ERROR) { |
127 | if (error < 0) | 193 | error = -1; |
128 | goto out; | 194 | break; |
129 | if (!error) | 195 | } else if (ustate == UCODE_OK) |
130 | microcode_ops->apply_microcode(cpu); | 196 | apply_microcode_on_target(cpu); |
131 | } | 197 | } |
132 | out: | 198 | |
133 | set_cpus_allowed_ptr(current, &old); | ||
134 | return error; | 199 | return error; |
135 | } | 200 | } |
136 | 201 | ||
@@ -143,19 +208,17 @@ static int microcode_open(struct inode *unused1, struct file *unused2) | |||
143 | static ssize_t microcode_write(struct file *file, const char __user *buf, | 208 | static ssize_t microcode_write(struct file *file, const char __user *buf, |
144 | size_t len, loff_t *ppos) | 209 | size_t len, loff_t *ppos) |
145 | { | 210 | { |
146 | ssize_t ret; | 211 | ssize_t ret = -EINVAL; |
147 | 212 | ||
148 | if ((len >> PAGE_SHIFT) > num_physpages) { | 213 | if ((len >> PAGE_SHIFT) > num_physpages) { |
149 | printk(KERN_ERR "microcode: too much data (max %ld pages)\n", | 214 | pr_err("microcode: too much data (max %ld pages)\n", num_physpages); |
150 | num_physpages); | 215 | return ret; |
151 | return -EINVAL; | ||
152 | } | 216 | } |
153 | 217 | ||
154 | get_online_cpus(); | 218 | get_online_cpus(); |
155 | mutex_lock(&microcode_mutex); | 219 | mutex_lock(&microcode_mutex); |
156 | 220 | ||
157 | ret = do_microcode_update(buf, len); | 221 | if (do_microcode_update(buf, len) == 0) |
158 | if (!ret) | ||
159 | ret = (ssize_t)len; | 222 | ret = (ssize_t)len; |
160 | 223 | ||
161 | mutex_unlock(&microcode_mutex); | 224 | mutex_unlock(&microcode_mutex); |
@@ -165,15 +228,16 @@ static ssize_t microcode_write(struct file *file, const char __user *buf, | |||
165 | } | 228 | } |
166 | 229 | ||
167 | static const struct file_operations microcode_fops = { | 230 | static const struct file_operations microcode_fops = { |
168 | .owner = THIS_MODULE, | 231 | .owner = THIS_MODULE, |
169 | .write = microcode_write, | 232 | .write = microcode_write, |
170 | .open = microcode_open, | 233 | .open = microcode_open, |
171 | }; | 234 | }; |
172 | 235 | ||
173 | static struct miscdevice microcode_dev = { | 236 | static struct miscdevice microcode_dev = { |
174 | .minor = MICROCODE_MINOR, | 237 | .minor = MICROCODE_MINOR, |
175 | .name = "microcode", | 238 | .name = "microcode", |
176 | .fops = &microcode_fops, | 239 | .devnode = "cpu/microcode", |
240 | .fops = &microcode_fops, | ||
177 | }; | 241 | }; |
178 | 242 | ||
179 | static int __init microcode_dev_init(void) | 243 | static int __init microcode_dev_init(void) |
@@ -182,9 +246,7 @@ static int __init microcode_dev_init(void) | |||
182 | 246 | ||
183 | error = misc_register(&microcode_dev); | 247 | error = misc_register(&microcode_dev); |
184 | if (error) { | 248 | if (error) { |
185 | printk(KERN_ERR | 249 | pr_err("microcode: can't misc_register on minor=%d\n", MICROCODE_MINOR); |
186 | "microcode: can't misc_register on minor=%d\n", | ||
187 | MICROCODE_MINOR); | ||
188 | return error; | 250 | return error; |
189 | } | 251 | } |
190 | 252 | ||
@@ -205,42 +267,51 @@ MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); | |||
205 | /* fake device for request_firmware */ | 267 | /* fake device for request_firmware */ |
206 | static struct platform_device *microcode_pdev; | 268 | static struct platform_device *microcode_pdev; |
207 | 269 | ||
208 | static long reload_for_cpu(void *unused) | 270 | static int reload_for_cpu(int cpu) |
209 | { | 271 | { |
210 | struct ucode_cpu_info *uci = ucode_cpu_info + smp_processor_id(); | 272 | struct ucode_cpu_info *uci = ucode_cpu_info + cpu; |
211 | int err = 0; | 273 | int err = 0; |
212 | 274 | ||
213 | mutex_lock(&microcode_mutex); | 275 | mutex_lock(&microcode_mutex); |
214 | if (uci->valid) { | 276 | if (uci->valid) { |
215 | err = microcode_ops->request_microcode_fw(smp_processor_id(), | 277 | enum ucode_state ustate; |
216 | &microcode_pdev->dev); | 278 | |
217 | if (!err) | 279 | ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev); |
218 | microcode_ops->apply_microcode(smp_processor_id()); | 280 | if (ustate == UCODE_OK) |
281 | apply_microcode_on_target(cpu); | ||
282 | else | ||
283 | if (ustate == UCODE_ERROR) | ||
284 | err = -EINVAL; | ||
219 | } | 285 | } |
220 | mutex_unlock(&microcode_mutex); | 286 | mutex_unlock(&microcode_mutex); |
287 | |||
221 | return err; | 288 | return err; |
222 | } | 289 | } |
223 | 290 | ||
224 | static ssize_t reload_store(struct sys_device *dev, | 291 | static ssize_t reload_store(struct sys_device *dev, |
225 | struct sysdev_attribute *attr, | 292 | struct sysdev_attribute *attr, |
226 | const char *buf, size_t sz) | 293 | const char *buf, size_t size) |
227 | { | 294 | { |
228 | char *end; | 295 | unsigned long val; |
229 | unsigned long val = simple_strtoul(buf, &end, 0); | ||
230 | int err = 0; | ||
231 | int cpu = dev->id; | 296 | int cpu = dev->id; |
297 | int ret = 0; | ||
298 | char *end; | ||
232 | 299 | ||
300 | val = simple_strtoul(buf, &end, 0); | ||
233 | if (end == buf) | 301 | if (end == buf) |
234 | return -EINVAL; | 302 | return -EINVAL; |
303 | |||
235 | if (val == 1) { | 304 | if (val == 1) { |
236 | get_online_cpus(); | 305 | get_online_cpus(); |
237 | if (cpu_online(cpu)) | 306 | if (cpu_online(cpu)) |
238 | err = work_on_cpu(cpu, reload_for_cpu, NULL); | 307 | ret = reload_for_cpu(cpu); |
239 | put_online_cpus(); | 308 | put_online_cpus(); |
240 | } | 309 | } |
241 | if (err) | 310 | |
242 | return err; | 311 | if (!ret) |
243 | return sz; | 312 | ret = size; |
313 | |||
314 | return ret; | ||
244 | } | 315 | } |
245 | 316 | ||
246 | static ssize_t version_show(struct sys_device *dev, | 317 | static ssize_t version_show(struct sys_device *dev, |
@@ -271,11 +342,11 @@ static struct attribute *mc_default_attrs[] = { | |||
271 | }; | 342 | }; |
272 | 343 | ||
273 | static struct attribute_group mc_attr_group = { | 344 | static struct attribute_group mc_attr_group = { |
274 | .attrs = mc_default_attrs, | 345 | .attrs = mc_default_attrs, |
275 | .name = "microcode", | 346 | .name = "microcode", |
276 | }; | 347 | }; |
277 | 348 | ||
278 | static void __microcode_fini_cpu(int cpu) | 349 | static void microcode_fini_cpu(int cpu) |
279 | { | 350 | { |
280 | struct ucode_cpu_info *uci = ucode_cpu_info + cpu; | 351 | struct ucode_cpu_info *uci = ucode_cpu_info + cpu; |
281 | 352 | ||
@@ -283,103 +354,68 @@ static void __microcode_fini_cpu(int cpu) | |||
283 | uci->valid = 0; | 354 | uci->valid = 0; |
284 | } | 355 | } |
285 | 356 | ||
286 | static void microcode_fini_cpu(int cpu) | 357 | static enum ucode_state microcode_resume_cpu(int cpu) |
287 | { | ||
288 | mutex_lock(&microcode_mutex); | ||
289 | __microcode_fini_cpu(cpu); | ||
290 | mutex_unlock(&microcode_mutex); | ||
291 | } | ||
292 | |||
293 | static void collect_cpu_info(int cpu) | ||
294 | { | 358 | { |
295 | struct ucode_cpu_info *uci = ucode_cpu_info + cpu; | 359 | struct ucode_cpu_info *uci = ucode_cpu_info + cpu; |
296 | 360 | ||
297 | memset(uci, 0, sizeof(*uci)); | 361 | if (!uci->mc) |
298 | if (!microcode_ops->collect_cpu_info(cpu, &uci->cpu_sig)) | 362 | return UCODE_NFOUND; |
299 | uci->valid = 1; | 363 | |
364 | pr_debug("microcode: CPU%d updated upon resume\n", cpu); | ||
365 | apply_microcode_on_target(cpu); | ||
366 | |||
367 | return UCODE_OK; | ||
300 | } | 368 | } |
301 | 369 | ||
302 | static int microcode_resume_cpu(int cpu) | 370 | static enum ucode_state microcode_init_cpu(int cpu) |
303 | { | 371 | { |
304 | struct ucode_cpu_info *uci = ucode_cpu_info + cpu; | 372 | enum ucode_state ustate; |
305 | struct cpu_signature nsig; | ||
306 | 373 | ||
307 | pr_debug("microcode: CPU%d resumed\n", cpu); | 374 | if (collect_cpu_info(cpu)) |
375 | return UCODE_ERROR; | ||
308 | 376 | ||
309 | if (!uci->mc) | 377 | /* --dimm. Trigger a delayed update? */ |
310 | return 1; | 378 | if (system_state != SYSTEM_RUNNING) |
379 | return UCODE_NFOUND; | ||
311 | 380 | ||
312 | /* | 381 | ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev); |
313 | * Let's verify that the 'cached' ucode does belong | ||
314 | * to this cpu (a bit of paranoia): | ||
315 | */ | ||
316 | if (microcode_ops->collect_cpu_info(cpu, &nsig)) { | ||
317 | __microcode_fini_cpu(cpu); | ||
318 | printk(KERN_ERR "failed to collect_cpu_info for resuming cpu #%d\n", | ||
319 | cpu); | ||
320 | return -1; | ||
321 | } | ||
322 | 382 | ||
323 | if ((nsig.sig != uci->cpu_sig.sig) || (nsig.pf != uci->cpu_sig.pf)) { | 383 | if (ustate == UCODE_OK) { |
324 | __microcode_fini_cpu(cpu); | 384 | pr_debug("microcode: CPU%d updated upon init\n", cpu); |
325 | printk(KERN_ERR "cached ucode doesn't match the resuming cpu #%d\n", | 385 | apply_microcode_on_target(cpu); |
326 | cpu); | ||
327 | /* Should we look for a new ucode here? */ | ||
328 | return 1; | ||
329 | } | 386 | } |
330 | 387 | ||
331 | return 0; | 388 | return ustate; |
332 | } | 389 | } |
333 | 390 | ||
334 | static long microcode_update_cpu(void *unused) | 391 | static enum ucode_state microcode_update_cpu(int cpu) |
335 | { | 392 | { |
336 | struct ucode_cpu_info *uci = ucode_cpu_info + smp_processor_id(); | 393 | struct ucode_cpu_info *uci = ucode_cpu_info + cpu; |
337 | int err = 0; | 394 | enum ucode_state ustate; |
338 | 395 | ||
339 | /* | 396 | if (uci->valid) |
340 | * Check if the system resume is in progress (uci->valid != NULL), | 397 | ustate = microcode_resume_cpu(cpu); |
341 | * otherwise just request a firmware: | 398 | else |
342 | */ | 399 | ustate = microcode_init_cpu(cpu); |
343 | if (uci->valid) { | ||
344 | err = microcode_resume_cpu(smp_processor_id()); | ||
345 | } else { | ||
346 | collect_cpu_info(smp_processor_id()); | ||
347 | if (uci->valid && system_state == SYSTEM_RUNNING) | ||
348 | err = microcode_ops->request_microcode_fw( | ||
349 | smp_processor_id(), | ||
350 | &microcode_pdev->dev); | ||
351 | } | ||
352 | if (!err) | ||
353 | microcode_ops->apply_microcode(smp_processor_id()); | ||
354 | return err; | ||
355 | } | ||
356 | 400 | ||
357 | static int microcode_init_cpu(int cpu) | 401 | return ustate; |
358 | { | ||
359 | int err; | ||
360 | mutex_lock(&microcode_mutex); | ||
361 | err = work_on_cpu(cpu, microcode_update_cpu, NULL); | ||
362 | mutex_unlock(&microcode_mutex); | ||
363 | |||
364 | return err; | ||
365 | } | 402 | } |
366 | 403 | ||
367 | static int mc_sysdev_add(struct sys_device *sys_dev) | 404 | static int mc_sysdev_add(struct sys_device *sys_dev) |
368 | { | 405 | { |
369 | int err, cpu = sys_dev->id; | 406 | int err, cpu = sys_dev->id; |
370 | struct ucode_cpu_info *uci = ucode_cpu_info + cpu; | ||
371 | 407 | ||
372 | if (!cpu_online(cpu)) | 408 | if (!cpu_online(cpu)) |
373 | return 0; | 409 | return 0; |
374 | 410 | ||
375 | pr_debug("microcode: CPU%d added\n", cpu); | 411 | pr_debug("microcode: CPU%d added\n", cpu); |
376 | memset(uci, 0, sizeof(*uci)); | ||
377 | 412 | ||
378 | err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group); | 413 | err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group); |
379 | if (err) | 414 | if (err) |
380 | return err; | 415 | return err; |
381 | 416 | ||
382 | err = microcode_init_cpu(cpu); | 417 | if (microcode_init_cpu(cpu) == UCODE_ERROR) |
418 | err = -EINVAL; | ||
383 | 419 | ||
384 | return err; | 420 | return err; |
385 | } | 421 | } |
@@ -400,19 +436,30 @@ static int mc_sysdev_remove(struct sys_device *sys_dev) | |||
400 | static int mc_sysdev_resume(struct sys_device *dev) | 436 | static int mc_sysdev_resume(struct sys_device *dev) |
401 | { | 437 | { |
402 | int cpu = dev->id; | 438 | int cpu = dev->id; |
439 | struct ucode_cpu_info *uci = ucode_cpu_info + cpu; | ||
403 | 440 | ||
404 | if (!cpu_online(cpu)) | 441 | if (!cpu_online(cpu)) |
405 | return 0; | 442 | return 0; |
406 | 443 | ||
407 | /* only CPU 0 will apply ucode here */ | 444 | /* |
408 | microcode_update_cpu(NULL); | 445 | * All non-bootup cpus are still disabled, |
446 | * so only CPU 0 will apply ucode here. | ||
447 | * | ||
448 | * Moreover, there can be no concurrent | ||
449 | * updates from any other places at this point. | ||
450 | */ | ||
451 | WARN_ON(cpu != 0); | ||
452 | |||
453 | if (uci->valid && uci->mc) | ||
454 | microcode_ops->apply_microcode(cpu); | ||
455 | |||
409 | return 0; | 456 | return 0; |
410 | } | 457 | } |
411 | 458 | ||
412 | static struct sysdev_driver mc_sysdev_driver = { | 459 | static struct sysdev_driver mc_sysdev_driver = { |
413 | .add = mc_sysdev_add, | 460 | .add = mc_sysdev_add, |
414 | .remove = mc_sysdev_remove, | 461 | .remove = mc_sysdev_remove, |
415 | .resume = mc_sysdev_resume, | 462 | .resume = mc_sysdev_resume, |
416 | }; | 463 | }; |
417 | 464 | ||
418 | static __cpuinit int | 465 | static __cpuinit int |
@@ -425,15 +472,12 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) | |||
425 | switch (action) { | 472 | switch (action) { |
426 | case CPU_ONLINE: | 473 | case CPU_ONLINE: |
427 | case CPU_ONLINE_FROZEN: | 474 | case CPU_ONLINE_FROZEN: |
428 | if (microcode_init_cpu(cpu)) | 475 | microcode_update_cpu(cpu); |
429 | printk(KERN_ERR "microcode: failed to init CPU%d\n", | ||
430 | cpu); | ||
431 | case CPU_DOWN_FAILED: | 476 | case CPU_DOWN_FAILED: |
432 | case CPU_DOWN_FAILED_FROZEN: | 477 | case CPU_DOWN_FAILED_FROZEN: |
433 | pr_debug("microcode: CPU%d added\n", cpu); | 478 | pr_debug("microcode: CPU%d added\n", cpu); |
434 | if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group)) | 479 | if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group)) |
435 | printk(KERN_ERR "microcode: Failed to create the sysfs " | 480 | pr_err("microcode: Failed to create group for CPU%d\n", cpu); |
436 | "group for CPU%d\n", cpu); | ||
437 | break; | 481 | break; |
438 | case CPU_DOWN_PREPARE: | 482 | case CPU_DOWN_PREPARE: |
439 | case CPU_DOWN_PREPARE_FROZEN: | 483 | case CPU_DOWN_PREPARE_FROZEN: |
@@ -465,13 +509,10 @@ static int __init microcode_init(void) | |||
465 | microcode_ops = init_amd_microcode(); | 509 | microcode_ops = init_amd_microcode(); |
466 | 510 | ||
467 | if (!microcode_ops) { | 511 | if (!microcode_ops) { |
468 | printk(KERN_ERR "microcode: no support for this CPU vendor\n"); | 512 | pr_err("microcode: no support for this CPU vendor\n"); |
469 | return -ENODEV; | 513 | return -ENODEV; |
470 | } | 514 | } |
471 | 515 | ||
472 | error = microcode_dev_init(); | ||
473 | if (error) | ||
474 | return error; | ||
475 | microcode_pdev = platform_device_register_simple("microcode", -1, | 516 | microcode_pdev = platform_device_register_simple("microcode", -1, |
476 | NULL, 0); | 517 | NULL, 0); |
477 | if (IS_ERR(microcode_pdev)) { | 518 | if (IS_ERR(microcode_pdev)) { |
@@ -480,23 +521,31 @@ static int __init microcode_init(void) | |||
480 | } | 521 | } |
481 | 522 | ||
482 | get_online_cpus(); | 523 | get_online_cpus(); |
524 | mutex_lock(&microcode_mutex); | ||
525 | |||
483 | error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver); | 526 | error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver); |
527 | |||
528 | mutex_unlock(&microcode_mutex); | ||
484 | put_online_cpus(); | 529 | put_online_cpus(); |
530 | |||
485 | if (error) { | 531 | if (error) { |
486 | microcode_dev_exit(); | ||
487 | platform_device_unregister(microcode_pdev); | 532 | platform_device_unregister(microcode_pdev); |
488 | return error; | 533 | return error; |
489 | } | 534 | } |
490 | 535 | ||
536 | error = microcode_dev_init(); | ||
537 | if (error) | ||
538 | return error; | ||
539 | |||
491 | register_hotcpu_notifier(&mc_cpu_notifier); | 540 | register_hotcpu_notifier(&mc_cpu_notifier); |
492 | 541 | ||
493 | printk(KERN_INFO | 542 | pr_info("Microcode Update Driver: v" MICROCODE_VERSION |
494 | "Microcode Update Driver: v" MICROCODE_VERSION | ||
495 | " <tigran@aivazian.fsnet.co.uk>," | 543 | " <tigran@aivazian.fsnet.co.uk>," |
496 | " Peter Oruba\n"); | 544 | " Peter Oruba\n"); |
497 | 545 | ||
498 | return 0; | 546 | return 0; |
499 | } | 547 | } |
548 | module_init(microcode_init); | ||
500 | 549 | ||
501 | static void __exit microcode_exit(void) | 550 | static void __exit microcode_exit(void) |
502 | { | 551 | { |
@@ -505,16 +554,17 @@ static void __exit microcode_exit(void) | |||
505 | unregister_hotcpu_notifier(&mc_cpu_notifier); | 554 | unregister_hotcpu_notifier(&mc_cpu_notifier); |
506 | 555 | ||
507 | get_online_cpus(); | 556 | get_online_cpus(); |
557 | mutex_lock(&microcode_mutex); | ||
558 | |||
508 | sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver); | 559 | sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver); |
560 | |||
561 | mutex_unlock(&microcode_mutex); | ||
509 | put_online_cpus(); | 562 | put_online_cpus(); |
510 | 563 | ||
511 | platform_device_unregister(microcode_pdev); | 564 | platform_device_unregister(microcode_pdev); |
512 | 565 | ||
513 | microcode_ops = NULL; | 566 | microcode_ops = NULL; |
514 | 567 | ||
515 | printk(KERN_INFO | 568 | pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n"); |
516 | "Microcode Update Driver: v" MICROCODE_VERSION " removed.\n"); | ||
517 | } | 569 | } |
518 | |||
519 | module_init(microcode_init); | ||
520 | module_exit(microcode_exit); | 570 | module_exit(microcode_exit); |
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c index 149b9ec7c1ab..0d334ddd0a96 100644 --- a/arch/x86/kernel/microcode_intel.c +++ b/arch/x86/kernel/microcode_intel.c | |||
@@ -70,24 +70,11 @@ | |||
70 | * Fix sigmatch() macro to handle old CPUs with pf == 0. | 70 | * Fix sigmatch() macro to handle old CPUs with pf == 0. |
71 | * Thanks to Stuart Swales for pointing out this bug. | 71 | * Thanks to Stuart Swales for pointing out this bug. |
72 | */ | 72 | */ |
73 | #include <linux/platform_device.h> | ||
74 | #include <linux/capability.h> | ||
75 | #include <linux/miscdevice.h> | ||
76 | #include <linux/firmware.h> | 73 | #include <linux/firmware.h> |
77 | #include <linux/smp_lock.h> | ||
78 | #include <linux/spinlock.h> | ||
79 | #include <linux/cpumask.h> | ||
80 | #include <linux/uaccess.h> | 74 | #include <linux/uaccess.h> |
81 | #include <linux/vmalloc.h> | ||
82 | #include <linux/kernel.h> | 75 | #include <linux/kernel.h> |
83 | #include <linux/module.h> | 76 | #include <linux/module.h> |
84 | #include <linux/mutex.h> | 77 | #include <linux/vmalloc.h> |
85 | #include <linux/sched.h> | ||
86 | #include <linux/init.h> | ||
87 | #include <linux/slab.h> | ||
88 | #include <linux/cpu.h> | ||
89 | #include <linux/fs.h> | ||
90 | #include <linux/mm.h> | ||
91 | 78 | ||
92 | #include <asm/microcode.h> | 79 | #include <asm/microcode.h> |
93 | #include <asm/processor.h> | 80 | #include <asm/processor.h> |
@@ -150,13 +137,9 @@ struct extended_sigtable { | |||
150 | 137 | ||
151 | #define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE) | 138 | #define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE) |
152 | 139 | ||
153 | /* serialize access to the physical write to MSR 0x79 */ | ||
154 | static DEFINE_SPINLOCK(microcode_update_lock); | ||
155 | |||
156 | static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) | 140 | static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) |
157 | { | 141 | { |
158 | struct cpuinfo_x86 *c = &cpu_data(cpu_num); | 142 | struct cpuinfo_x86 *c = &cpu_data(cpu_num); |
159 | unsigned long flags; | ||
160 | unsigned int val[2]; | 143 | unsigned int val[2]; |
161 | 144 | ||
162 | memset(csig, 0, sizeof(*csig)); | 145 | memset(csig, 0, sizeof(*csig)); |
@@ -176,18 +159,14 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) | |||
176 | csig->pf = 1 << ((val[1] >> 18) & 7); | 159 | csig->pf = 1 << ((val[1] >> 18) & 7); |
177 | } | 160 | } |
178 | 161 | ||
179 | /* serialize access to the physical write to MSR 0x79 */ | ||
180 | spin_lock_irqsave(&microcode_update_lock, flags); | ||
181 | |||
182 | wrmsr(MSR_IA32_UCODE_REV, 0, 0); | 162 | wrmsr(MSR_IA32_UCODE_REV, 0, 0); |
183 | /* see notes above for revision 1.07. Apparent chip bug */ | 163 | /* see notes above for revision 1.07. Apparent chip bug */ |
184 | sync_core(); | 164 | sync_core(); |
185 | /* get the current revision from MSR 0x8B */ | 165 | /* get the current revision from MSR 0x8B */ |
186 | rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev); | 166 | rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev); |
187 | spin_unlock_irqrestore(&microcode_update_lock, flags); | ||
188 | 167 | ||
189 | pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n", | 168 | printk(KERN_INFO "microcode: CPU%d sig=0x%x, pf=0x%x, revision=0x%x\n", |
190 | csig->sig, csig->pf, csig->rev); | 169 | cpu_num, csig->sig, csig->pf, csig->rev); |
191 | 170 | ||
192 | return 0; | 171 | return 0; |
193 | } | 172 | } |
@@ -318,11 +297,10 @@ get_matching_microcode(struct cpu_signature *cpu_sig, void *mc, int rev) | |||
318 | return 0; | 297 | return 0; |
319 | } | 298 | } |
320 | 299 | ||
321 | static void apply_microcode(int cpu) | 300 | static int apply_microcode(int cpu) |
322 | { | 301 | { |
323 | struct microcode_intel *mc_intel; | 302 | struct microcode_intel *mc_intel; |
324 | struct ucode_cpu_info *uci; | 303 | struct ucode_cpu_info *uci; |
325 | unsigned long flags; | ||
326 | unsigned int val[2]; | 304 | unsigned int val[2]; |
327 | int cpu_num; | 305 | int cpu_num; |
328 | 306 | ||
@@ -334,10 +312,7 @@ static void apply_microcode(int cpu) | |||
334 | BUG_ON(cpu_num != cpu); | 312 | BUG_ON(cpu_num != cpu); |
335 | 313 | ||
336 | if (mc_intel == NULL) | 314 | if (mc_intel == NULL) |
337 | return; | 315 | return 0; |
338 | |||
339 | /* serialize access to the physical write to MSR 0x79 */ | ||
340 | spin_lock_irqsave(&microcode_update_lock, flags); | ||
341 | 316 | ||
342 | /* write microcode via MSR 0x79 */ | 317 | /* write microcode via MSR 0x79 */ |
343 | wrmsr(MSR_IA32_UCODE_WRITE, | 318 | wrmsr(MSR_IA32_UCODE_WRITE, |
@@ -351,30 +326,32 @@ static void apply_microcode(int cpu) | |||
351 | /* get the current revision from MSR 0x8B */ | 326 | /* get the current revision from MSR 0x8B */ |
352 | rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); | 327 | rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); |
353 | 328 | ||
354 | spin_unlock_irqrestore(&microcode_update_lock, flags); | ||
355 | if (val[1] != mc_intel->hdr.rev) { | 329 | if (val[1] != mc_intel->hdr.rev) { |
356 | printk(KERN_ERR "microcode: CPU%d update from revision " | 330 | printk(KERN_ERR "microcode: CPU%d update " |
357 | "0x%x to 0x%x failed\n", | 331 | "to revision 0x%x failed\n", |
358 | cpu_num, uci->cpu_sig.rev, val[1]); | 332 | cpu_num, mc_intel->hdr.rev); |
359 | return; | 333 | return -1; |
360 | } | 334 | } |
361 | printk(KERN_INFO "microcode: CPU%d updated from revision " | 335 | printk(KERN_INFO "microcode: CPU%d updated to revision " |
362 | "0x%x to 0x%x, date = %04x-%02x-%02x \n", | 336 | "0x%x, date = %04x-%02x-%02x \n", |
363 | cpu_num, uci->cpu_sig.rev, val[1], | 337 | cpu_num, val[1], |
364 | mc_intel->hdr.date & 0xffff, | 338 | mc_intel->hdr.date & 0xffff, |
365 | mc_intel->hdr.date >> 24, | 339 | mc_intel->hdr.date >> 24, |
366 | (mc_intel->hdr.date >> 16) & 0xff); | 340 | (mc_intel->hdr.date >> 16) & 0xff); |
367 | 341 | ||
368 | uci->cpu_sig.rev = val[1]; | 342 | uci->cpu_sig.rev = val[1]; |
343 | |||
344 | return 0; | ||
369 | } | 345 | } |
370 | 346 | ||
371 | static int generic_load_microcode(int cpu, void *data, size_t size, | 347 | static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, |
372 | int (*get_ucode_data)(void *, const void *, size_t)) | 348 | int (*get_ucode_data)(void *, const void *, size_t)) |
373 | { | 349 | { |
374 | struct ucode_cpu_info *uci = ucode_cpu_info + cpu; | 350 | struct ucode_cpu_info *uci = ucode_cpu_info + cpu; |
375 | u8 *ucode_ptr = data, *new_mc = NULL, *mc; | 351 | u8 *ucode_ptr = data, *new_mc = NULL, *mc; |
376 | int new_rev = uci->cpu_sig.rev; | 352 | int new_rev = uci->cpu_sig.rev; |
377 | unsigned int leftover = size; | 353 | unsigned int leftover = size; |
354 | enum ucode_state state = UCODE_OK; | ||
378 | 355 | ||
379 | while (leftover) { | 356 | while (leftover) { |
380 | struct microcode_header_intel mc_header; | 357 | struct microcode_header_intel mc_header; |
@@ -412,11 +389,15 @@ static int generic_load_microcode(int cpu, void *data, size_t size, | |||
412 | leftover -= mc_size; | 389 | leftover -= mc_size; |
413 | } | 390 | } |
414 | 391 | ||
415 | if (!new_mc) | 392 | if (leftover) { |
393 | if (new_mc) | ||
394 | vfree(new_mc); | ||
395 | state = UCODE_ERROR; | ||
416 | goto out; | 396 | goto out; |
397 | } | ||
417 | 398 | ||
418 | if (leftover) { | 399 | if (!new_mc) { |
419 | vfree(new_mc); | 400 | state = UCODE_NFOUND; |
420 | goto out; | 401 | goto out; |
421 | } | 402 | } |
422 | 403 | ||
@@ -427,9 +408,8 @@ static int generic_load_microcode(int cpu, void *data, size_t size, | |||
427 | pr_debug("microcode: CPU%d found a matching microcode update with" | 408 | pr_debug("microcode: CPU%d found a matching microcode update with" |
428 | " version 0x%x (current=0x%x)\n", | 409 | " version 0x%x (current=0x%x)\n", |
429 | cpu, new_rev, uci->cpu_sig.rev); | 410 | cpu, new_rev, uci->cpu_sig.rev); |
430 | 411 | out: | |
431 | out: | 412 | return state; |
432 | return (int)leftover; | ||
433 | } | 413 | } |
434 | 414 | ||
435 | static int get_ucode_fw(void *to, const void *from, size_t n) | 415 | static int get_ucode_fw(void *to, const void *from, size_t n) |
@@ -438,21 +418,19 @@ static int get_ucode_fw(void *to, const void *from, size_t n) | |||
438 | return 0; | 418 | return 0; |
439 | } | 419 | } |
440 | 420 | ||
441 | static int request_microcode_fw(int cpu, struct device *device) | 421 | static enum ucode_state request_microcode_fw(int cpu, struct device *device) |
442 | { | 422 | { |
443 | char name[30]; | 423 | char name[30]; |
444 | struct cpuinfo_x86 *c = &cpu_data(cpu); | 424 | struct cpuinfo_x86 *c = &cpu_data(cpu); |
445 | const struct firmware *firmware; | 425 | const struct firmware *firmware; |
446 | int ret; | 426 | enum ucode_state ret; |
447 | 427 | ||
448 | /* We should bind the task to the CPU */ | ||
449 | BUG_ON(cpu != raw_smp_processor_id()); | ||
450 | sprintf(name, "intel-ucode/%02x-%02x-%02x", | 428 | sprintf(name, "intel-ucode/%02x-%02x-%02x", |
451 | c->x86, c->x86_model, c->x86_mask); | 429 | c->x86, c->x86_model, c->x86_mask); |
452 | ret = request_firmware(&firmware, name, device); | 430 | |
453 | if (ret) { | 431 | if (request_firmware(&firmware, name, device)) { |
454 | pr_debug("microcode: data file %s load failed\n", name); | 432 | pr_debug("microcode: data file %s load failed\n", name); |
455 | return ret; | 433 | return UCODE_NFOUND; |
456 | } | 434 | } |
457 | 435 | ||
458 | ret = generic_load_microcode(cpu, (void *)firmware->data, | 436 | ret = generic_load_microcode(cpu, (void *)firmware->data, |
@@ -468,11 +446,9 @@ static int get_ucode_user(void *to, const void *from, size_t n) | |||
468 | return copy_from_user(to, from, n); | 446 | return copy_from_user(to, from, n); |
469 | } | 447 | } |
470 | 448 | ||
471 | static int request_microcode_user(int cpu, const void __user *buf, size_t size) | 449 | static enum ucode_state |
450 | request_microcode_user(int cpu, const void __user *buf, size_t size) | ||
472 | { | 451 | { |
473 | /* We should bind the task to the CPU */ | ||
474 | BUG_ON(cpu != raw_smp_processor_id()); | ||
475 | |||
476 | return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user); | 452 | return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user); |
477 | } | 453 | } |
478 | 454 | ||
diff --git a/arch/x86/kernel/module_64.c b/arch/x86/kernel/module.c index c23880b90b5c..89f386f044e4 100644 --- a/arch/x86/kernel/module_64.c +++ b/arch/x86/kernel/module.c | |||
@@ -1,6 +1,5 @@ | |||
1 | /* Kernel module help for x86-64 | 1 | /* Kernel module help for x86. |
2 | Copyright (C) 2001 Rusty Russell. | 2 | Copyright (C) 2001 Rusty Russell. |
3 | Copyright (C) 2002,2003 Andi Kleen, SuSE Labs. | ||
4 | 3 | ||
5 | This program is free software; you can redistribute it and/or modify | 4 | This program is free software; you can redistribute it and/or modify |
6 | it under the terms of the GNU General Public License as published by | 5 | it under the terms of the GNU General Public License as published by |
@@ -22,23 +21,18 @@ | |||
22 | #include <linux/fs.h> | 21 | #include <linux/fs.h> |
23 | #include <linux/string.h> | 22 | #include <linux/string.h> |
24 | #include <linux/kernel.h> | 23 | #include <linux/kernel.h> |
25 | #include <linux/mm.h> | ||
26 | #include <linux/slab.h> | ||
27 | #include <linux/bug.h> | 24 | #include <linux/bug.h> |
25 | #include <linux/mm.h> | ||
28 | 26 | ||
29 | #include <asm/system.h> | 27 | #include <asm/system.h> |
30 | #include <asm/page.h> | 28 | #include <asm/page.h> |
31 | #include <asm/pgtable.h> | 29 | #include <asm/pgtable.h> |
32 | 30 | ||
31 | #if 0 | ||
32 | #define DEBUGP printk | ||
33 | #else | ||
33 | #define DEBUGP(fmt...) | 34 | #define DEBUGP(fmt...) |
34 | 35 | #endif | |
35 | #ifndef CONFIG_UML | ||
36 | void module_free(struct module *mod, void *module_region) | ||
37 | { | ||
38 | vfree(module_region); | ||
39 | /* FIXME: If module_region == mod->init_region, trim exception | ||
40 | table entries. */ | ||
41 | } | ||
42 | 36 | ||
43 | void *module_alloc(unsigned long size) | 37 | void *module_alloc(unsigned long size) |
44 | { | 38 | { |
@@ -54,9 +48,15 @@ void *module_alloc(unsigned long size) | |||
54 | if (!area) | 48 | if (!area) |
55 | return NULL; | 49 | return NULL; |
56 | 50 | ||
57 | return __vmalloc_area(area, GFP_KERNEL, PAGE_KERNEL_EXEC); | 51 | return __vmalloc_area(area, GFP_KERNEL | __GFP_HIGHMEM, |
52 | PAGE_KERNEL_EXEC); | ||
53 | } | ||
54 | |||
55 | /* Free memory returned from module_alloc */ | ||
56 | void module_free(struct module *mod, void *module_region) | ||
57 | { | ||
58 | vfree(module_region); | ||
58 | } | 59 | } |
59 | #endif | ||
60 | 60 | ||
61 | /* We don't need anything special. */ | 61 | /* We don't need anything special. */ |
62 | int module_frob_arch_sections(Elf_Ehdr *hdr, | 62 | int module_frob_arch_sections(Elf_Ehdr *hdr, |
@@ -67,6 +67,58 @@ int module_frob_arch_sections(Elf_Ehdr *hdr, | |||
67 | return 0; | 67 | return 0; |
68 | } | 68 | } |
69 | 69 | ||
70 | #ifdef CONFIG_X86_32 | ||
71 | int apply_relocate(Elf32_Shdr *sechdrs, | ||
72 | const char *strtab, | ||
73 | unsigned int symindex, | ||
74 | unsigned int relsec, | ||
75 | struct module *me) | ||
76 | { | ||
77 | unsigned int i; | ||
78 | Elf32_Rel *rel = (void *)sechdrs[relsec].sh_addr; | ||
79 | Elf32_Sym *sym; | ||
80 | uint32_t *location; | ||
81 | |||
82 | DEBUGP("Applying relocate section %u to %u\n", relsec, | ||
83 | sechdrs[relsec].sh_info); | ||
84 | for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { | ||
85 | /* This is where to make the change */ | ||
86 | location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr | ||
87 | + rel[i].r_offset; | ||
88 | /* This is the symbol it is referring to. Note that all | ||
89 | undefined symbols have been resolved. */ | ||
90 | sym = (Elf32_Sym *)sechdrs[symindex].sh_addr | ||
91 | + ELF32_R_SYM(rel[i].r_info); | ||
92 | |||
93 | switch (ELF32_R_TYPE(rel[i].r_info)) { | ||
94 | case R_386_32: | ||
95 | /* We add the value into the location given */ | ||
96 | *location += sym->st_value; | ||
97 | break; | ||
98 | case R_386_PC32: | ||
99 | /* Add the value, subtract its postition */ | ||
100 | *location += sym->st_value - (uint32_t)location; | ||
101 | break; | ||
102 | default: | ||
103 | printk(KERN_ERR "module %s: Unknown relocation: %u\n", | ||
104 | me->name, ELF32_R_TYPE(rel[i].r_info)); | ||
105 | return -ENOEXEC; | ||
106 | } | ||
107 | } | ||
108 | return 0; | ||
109 | } | ||
110 | |||
111 | int apply_relocate_add(Elf32_Shdr *sechdrs, | ||
112 | const char *strtab, | ||
113 | unsigned int symindex, | ||
114 | unsigned int relsec, | ||
115 | struct module *me) | ||
116 | { | ||
117 | printk(KERN_ERR "module %s: ADD RELOCATION unsupported\n", | ||
118 | me->name); | ||
119 | return -ENOEXEC; | ||
120 | } | ||
121 | #else /*X86_64*/ | ||
70 | int apply_relocate_add(Elf64_Shdr *sechdrs, | 122 | int apply_relocate_add(Elf64_Shdr *sechdrs, |
71 | const char *strtab, | 123 | const char *strtab, |
72 | unsigned int symindex, | 124 | unsigned int symindex, |
@@ -147,6 +199,8 @@ int apply_relocate(Elf_Shdr *sechdrs, | |||
147 | return -ENOSYS; | 199 | return -ENOSYS; |
148 | } | 200 | } |
149 | 201 | ||
202 | #endif | ||
203 | |||
150 | int module_finalize(const Elf_Ehdr *hdr, | 204 | int module_finalize(const Elf_Ehdr *hdr, |
151 | const Elf_Shdr *sechdrs, | 205 | const Elf_Shdr *sechdrs, |
152 | struct module *me) | 206 | struct module *me) |
diff --git a/arch/x86/kernel/module_32.c b/arch/x86/kernel/module_32.c deleted file mode 100644 index 0edd819050e7..000000000000 --- a/arch/x86/kernel/module_32.c +++ /dev/null | |||
@@ -1,152 +0,0 @@ | |||
1 | /* Kernel module help for i386. | ||
2 | Copyright (C) 2001 Rusty Russell. | ||
3 | |||
4 | This program is free software; you can redistribute it and/or modify | ||
5 | it under the terms of the GNU General Public License as published by | ||
6 | the Free Software Foundation; either version 2 of the License, or | ||
7 | (at your option) any later version. | ||
8 | |||
9 | This program is distributed in the hope that it will be useful, | ||
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | GNU General Public License for more details. | ||
13 | |||
14 | You should have received a copy of the GNU General Public License | ||
15 | along with this program; if not, write to the Free Software | ||
16 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
17 | */ | ||
18 | #include <linux/moduleloader.h> | ||
19 | #include <linux/elf.h> | ||
20 | #include <linux/vmalloc.h> | ||
21 | #include <linux/fs.h> | ||
22 | #include <linux/string.h> | ||
23 | #include <linux/kernel.h> | ||
24 | #include <linux/bug.h> | ||
25 | |||
26 | #if 0 | ||
27 | #define DEBUGP printk | ||
28 | #else | ||
29 | #define DEBUGP(fmt...) | ||
30 | #endif | ||
31 | |||
32 | void *module_alloc(unsigned long size) | ||
33 | { | ||
34 | if (size == 0) | ||
35 | return NULL; | ||
36 | return vmalloc_exec(size); | ||
37 | } | ||
38 | |||
39 | |||
40 | /* Free memory returned from module_alloc */ | ||
41 | void module_free(struct module *mod, void *module_region) | ||
42 | { | ||
43 | vfree(module_region); | ||
44 | /* FIXME: If module_region == mod->init_region, trim exception | ||
45 | table entries. */ | ||
46 | } | ||
47 | |||
48 | /* We don't need anything special. */ | ||
49 | int module_frob_arch_sections(Elf_Ehdr *hdr, | ||
50 | Elf_Shdr *sechdrs, | ||
51 | char *secstrings, | ||
52 | struct module *mod) | ||
53 | { | ||
54 | return 0; | ||
55 | } | ||
56 | |||
57 | int apply_relocate(Elf32_Shdr *sechdrs, | ||
58 | const char *strtab, | ||
59 | unsigned int symindex, | ||
60 | unsigned int relsec, | ||
61 | struct module *me) | ||
62 | { | ||
63 | unsigned int i; | ||
64 | Elf32_Rel *rel = (void *)sechdrs[relsec].sh_addr; | ||
65 | Elf32_Sym *sym; | ||
66 | uint32_t *location; | ||
67 | |||
68 | DEBUGP("Applying relocate section %u to %u\n", relsec, | ||
69 | sechdrs[relsec].sh_info); | ||
70 | for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { | ||
71 | /* This is where to make the change */ | ||
72 | location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr | ||
73 | + rel[i].r_offset; | ||
74 | /* This is the symbol it is referring to. Note that all | ||
75 | undefined symbols have been resolved. */ | ||
76 | sym = (Elf32_Sym *)sechdrs[symindex].sh_addr | ||
77 | + ELF32_R_SYM(rel[i].r_info); | ||
78 | |||
79 | switch (ELF32_R_TYPE(rel[i].r_info)) { | ||
80 | case R_386_32: | ||
81 | /* We add the value into the location given */ | ||
82 | *location += sym->st_value; | ||
83 | break; | ||
84 | case R_386_PC32: | ||
85 | /* Add the value, subtract its postition */ | ||
86 | *location += sym->st_value - (uint32_t)location; | ||
87 | break; | ||
88 | default: | ||
89 | printk(KERN_ERR "module %s: Unknown relocation: %u\n", | ||
90 | me->name, ELF32_R_TYPE(rel[i].r_info)); | ||
91 | return -ENOEXEC; | ||
92 | } | ||
93 | } | ||
94 | return 0; | ||
95 | } | ||
96 | |||
97 | int apply_relocate_add(Elf32_Shdr *sechdrs, | ||
98 | const char *strtab, | ||
99 | unsigned int symindex, | ||
100 | unsigned int relsec, | ||
101 | struct module *me) | ||
102 | { | ||
103 | printk(KERN_ERR "module %s: ADD RELOCATION unsupported\n", | ||
104 | me->name); | ||
105 | return -ENOEXEC; | ||
106 | } | ||
107 | |||
108 | int module_finalize(const Elf_Ehdr *hdr, | ||
109 | const Elf_Shdr *sechdrs, | ||
110 | struct module *me) | ||
111 | { | ||
112 | const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL, | ||
113 | *para = NULL; | ||
114 | char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; | ||
115 | |||
116 | for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { | ||
117 | if (!strcmp(".text", secstrings + s->sh_name)) | ||
118 | text = s; | ||
119 | if (!strcmp(".altinstructions", secstrings + s->sh_name)) | ||
120 | alt = s; | ||
121 | if (!strcmp(".smp_locks", secstrings + s->sh_name)) | ||
122 | locks = s; | ||
123 | if (!strcmp(".parainstructions", secstrings + s->sh_name)) | ||
124 | para = s; | ||
125 | } | ||
126 | |||
127 | if (alt) { | ||
128 | /* patch .altinstructions */ | ||
129 | void *aseg = (void *)alt->sh_addr; | ||
130 | apply_alternatives(aseg, aseg + alt->sh_size); | ||
131 | } | ||
132 | if (locks && text) { | ||
133 | void *lseg = (void *)locks->sh_addr; | ||
134 | void *tseg = (void *)text->sh_addr; | ||
135 | alternatives_smp_module_add(me, me->name, | ||
136 | lseg, lseg + locks->sh_size, | ||
137 | tseg, tseg + text->sh_size); | ||
138 | } | ||
139 | |||
140 | if (para) { | ||
141 | void *pseg = (void *)para->sh_addr; | ||
142 | apply_paravirt(pseg, pseg + para->sh_size); | ||
143 | } | ||
144 | |||
145 | return module_bug_finalize(hdr, sechdrs, me); | ||
146 | } | ||
147 | |||
148 | void module_arch_cleanup(struct module *mod) | ||
149 | { | ||
150 | alternatives_smp_module_del(mod); | ||
151 | module_bug_cleanup(mod); | ||
152 | } | ||
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 70fd7e414c15..651c93b28862 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c | |||
@@ -17,6 +17,7 @@ | |||
17 | #include <linux/acpi.h> | 17 | #include <linux/acpi.h> |
18 | #include <linux/module.h> | 18 | #include <linux/module.h> |
19 | #include <linux/smp.h> | 19 | #include <linux/smp.h> |
20 | #include <linux/pci.h> | ||
20 | 21 | ||
21 | #include <asm/mtrr.h> | 22 | #include <asm/mtrr.h> |
22 | #include <asm/mpspec.h> | 23 | #include <asm/mpspec.h> |
@@ -870,24 +871,17 @@ static | |||
870 | inline void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {} | 871 | inline void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {} |
871 | #endif /* CONFIG_X86_IO_APIC */ | 872 | #endif /* CONFIG_X86_IO_APIC */ |
872 | 873 | ||
873 | static int check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, | 874 | static int |
874 | int count) | 875 | check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count) |
875 | { | 876 | { |
876 | if (!mpc_new_phys) { | 877 | int ret = 0; |
877 | pr_info("No spare slots, try to append...take your risk, " | 878 | |
878 | "new mpc_length %x\n", count); | 879 | if (!mpc_new_phys || count <= mpc_new_length) { |
879 | } else { | 880 | WARN(1, "update_mptable: No spare slots (length: %x)\n", count); |
880 | if (count <= mpc_new_length) | 881 | return -1; |
881 | pr_info("No spare slots, try to append..., " | ||
882 | "new mpc_length %x\n", count); | ||
883 | else { | ||
884 | pr_err("mpc_new_length %lx is too small\n", | ||
885 | mpc_new_length); | ||
886 | return -1; | ||
887 | } | ||
888 | } | 882 | } |
889 | 883 | ||
890 | return 0; | 884 | return ret; |
891 | } | 885 | } |
892 | 886 | ||
893 | static int __init replace_intsrc_all(struct mpc_table *mpc, | 887 | static int __init replace_intsrc_all(struct mpc_table *mpc, |
@@ -946,7 +940,7 @@ static int __init replace_intsrc_all(struct mpc_table *mpc, | |||
946 | } else { | 940 | } else { |
947 | struct mpc_intsrc *m = (struct mpc_intsrc *)mpt; | 941 | struct mpc_intsrc *m = (struct mpc_intsrc *)mpt; |
948 | count += sizeof(struct mpc_intsrc); | 942 | count += sizeof(struct mpc_intsrc); |
949 | if (!check_slot(mpc_new_phys, mpc_new_length, count)) | 943 | if (check_slot(mpc_new_phys, mpc_new_length, count) < 0) |
950 | goto out; | 944 | goto out; |
951 | assign_to_mpc_intsrc(&mp_irqs[i], m); | 945 | assign_to_mpc_intsrc(&mp_irqs[i], m); |
952 | mpc->length = count; | 946 | mpc->length = count; |
@@ -963,11 +957,14 @@ out: | |||
963 | return 0; | 957 | return 0; |
964 | } | 958 | } |
965 | 959 | ||
966 | static int __initdata enable_update_mptable; | 960 | int enable_update_mptable; |
967 | 961 | ||
968 | static int __init update_mptable_setup(char *str) | 962 | static int __init update_mptable_setup(char *str) |
969 | { | 963 | { |
970 | enable_update_mptable = 1; | 964 | enable_update_mptable = 1; |
965 | #ifdef CONFIG_PCI | ||
966 | pci_routeirq = 1; | ||
967 | #endif | ||
971 | return 0; | 968 | return 0; |
972 | } | 969 | } |
973 | early_param("update_mptable", update_mptable_setup); | 970 | early_param("update_mptable", update_mptable_setup); |
@@ -980,6 +977,9 @@ static int __initdata alloc_mptable; | |||
980 | static int __init parse_alloc_mptable_opt(char *p) | 977 | static int __init parse_alloc_mptable_opt(char *p) |
981 | { | 978 | { |
982 | enable_update_mptable = 1; | 979 | enable_update_mptable = 1; |
980 | #ifdef CONFIG_PCI | ||
981 | pci_routeirq = 1; | ||
982 | #endif | ||
983 | alloc_mptable = 1; | 983 | alloc_mptable = 1; |
984 | if (!p) | 984 | if (!p) |
985 | return 0; | 985 | return 0; |
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 3cf3413ec626..98fd6cd4e3a4 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c | |||
@@ -196,6 +196,11 @@ static struct notifier_block __refdata msr_class_cpu_notifier = { | |||
196 | .notifier_call = msr_class_cpu_callback, | 196 | .notifier_call = msr_class_cpu_callback, |
197 | }; | 197 | }; |
198 | 198 | ||
199 | static char *msr_nodename(struct device *dev) | ||
200 | { | ||
201 | return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt)); | ||
202 | } | ||
203 | |||
199 | static int __init msr_init(void) | 204 | static int __init msr_init(void) |
200 | { | 205 | { |
201 | int i, err = 0; | 206 | int i, err = 0; |
@@ -212,6 +217,7 @@ static int __init msr_init(void) | |||
212 | err = PTR_ERR(msr_class); | 217 | err = PTR_ERR(msr_class); |
213 | goto out_chrdev; | 218 | goto out_chrdev; |
214 | } | 219 | } |
220 | msr_class->nodename = msr_nodename; | ||
215 | for_each_online_cpu(i) { | 221 | for_each_online_cpu(i) { |
216 | err = msr_device_create(i); | 222 | err = msr_device_create(i); |
217 | if (err != 0) | 223 | if (err != 0) |
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 8e45f4464880..70ec9b951d76 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c | |||
@@ -134,7 +134,9 @@ static void *get_call_destination(u8 type) | |||
134 | .pv_irq_ops = pv_irq_ops, | 134 | .pv_irq_ops = pv_irq_ops, |
135 | .pv_apic_ops = pv_apic_ops, | 135 | .pv_apic_ops = pv_apic_ops, |
136 | .pv_mmu_ops = pv_mmu_ops, | 136 | .pv_mmu_ops = pv_mmu_ops, |
137 | #ifdef CONFIG_PARAVIRT_SPINLOCKS | ||
137 | .pv_lock_ops = pv_lock_ops, | 138 | .pv_lock_ops = pv_lock_ops, |
139 | #endif | ||
138 | }; | 140 | }; |
139 | return *((void **)&tmpl + type); | 141 | return *((void **)&tmpl + type); |
140 | } | 142 | } |
@@ -246,18 +248,16 @@ static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LA | |||
246 | 248 | ||
247 | static inline void enter_lazy(enum paravirt_lazy_mode mode) | 249 | static inline void enter_lazy(enum paravirt_lazy_mode mode) |
248 | { | 250 | { |
249 | BUG_ON(__get_cpu_var(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE); | 251 | BUG_ON(percpu_read(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE); |
250 | BUG_ON(preemptible()); | ||
251 | 252 | ||
252 | __get_cpu_var(paravirt_lazy_mode) = mode; | 253 | percpu_write(paravirt_lazy_mode, mode); |
253 | } | 254 | } |
254 | 255 | ||
255 | void paravirt_leave_lazy(enum paravirt_lazy_mode mode) | 256 | static void leave_lazy(enum paravirt_lazy_mode mode) |
256 | { | 257 | { |
257 | BUG_ON(__get_cpu_var(paravirt_lazy_mode) != mode); | 258 | BUG_ON(percpu_read(paravirt_lazy_mode) != mode); |
258 | BUG_ON(preemptible()); | ||
259 | 259 | ||
260 | __get_cpu_var(paravirt_lazy_mode) = PARAVIRT_LAZY_NONE; | 260 | percpu_write(paravirt_lazy_mode, PARAVIRT_LAZY_NONE); |
261 | } | 261 | } |
262 | 262 | ||
263 | void paravirt_enter_lazy_mmu(void) | 263 | void paravirt_enter_lazy_mmu(void) |
@@ -267,22 +267,36 @@ void paravirt_enter_lazy_mmu(void) | |||
267 | 267 | ||
268 | void paravirt_leave_lazy_mmu(void) | 268 | void paravirt_leave_lazy_mmu(void) |
269 | { | 269 | { |
270 | paravirt_leave_lazy(PARAVIRT_LAZY_MMU); | 270 | leave_lazy(PARAVIRT_LAZY_MMU); |
271 | } | 271 | } |
272 | 272 | ||
273 | void paravirt_enter_lazy_cpu(void) | 273 | void paravirt_start_context_switch(struct task_struct *prev) |
274 | { | 274 | { |
275 | BUG_ON(preemptible()); | ||
276 | |||
277 | if (percpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) { | ||
278 | arch_leave_lazy_mmu_mode(); | ||
279 | set_ti_thread_flag(task_thread_info(prev), TIF_LAZY_MMU_UPDATES); | ||
280 | } | ||
275 | enter_lazy(PARAVIRT_LAZY_CPU); | 281 | enter_lazy(PARAVIRT_LAZY_CPU); |
276 | } | 282 | } |
277 | 283 | ||
278 | void paravirt_leave_lazy_cpu(void) | 284 | void paravirt_end_context_switch(struct task_struct *next) |
279 | { | 285 | { |
280 | paravirt_leave_lazy(PARAVIRT_LAZY_CPU); | 286 | BUG_ON(preemptible()); |
287 | |||
288 | leave_lazy(PARAVIRT_LAZY_CPU); | ||
289 | |||
290 | if (test_and_clear_ti_thread_flag(task_thread_info(next), TIF_LAZY_MMU_UPDATES)) | ||
291 | arch_enter_lazy_mmu_mode(); | ||
281 | } | 292 | } |
282 | 293 | ||
283 | enum paravirt_lazy_mode paravirt_get_lazy_mode(void) | 294 | enum paravirt_lazy_mode paravirt_get_lazy_mode(void) |
284 | { | 295 | { |
285 | return __get_cpu_var(paravirt_lazy_mode); | 296 | if (in_interrupt()) |
297 | return PARAVIRT_LAZY_NONE; | ||
298 | |||
299 | return percpu_read(paravirt_lazy_mode); | ||
286 | } | 300 | } |
287 | 301 | ||
288 | void arch_flush_lazy_mmu_mode(void) | 302 | void arch_flush_lazy_mmu_mode(void) |
@@ -290,7 +304,6 @@ void arch_flush_lazy_mmu_mode(void) | |||
290 | preempt_disable(); | 304 | preempt_disable(); |
291 | 305 | ||
292 | if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) { | 306 | if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) { |
293 | WARN_ON(preempt_count() == 1); | ||
294 | arch_leave_lazy_mmu_mode(); | 307 | arch_leave_lazy_mmu_mode(); |
295 | arch_enter_lazy_mmu_mode(); | 308 | arch_enter_lazy_mmu_mode(); |
296 | } | 309 | } |
@@ -298,19 +311,6 @@ void arch_flush_lazy_mmu_mode(void) | |||
298 | preempt_enable(); | 311 | preempt_enable(); |
299 | } | 312 | } |
300 | 313 | ||
301 | void arch_flush_lazy_cpu_mode(void) | ||
302 | { | ||
303 | preempt_disable(); | ||
304 | |||
305 | if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) { | ||
306 | WARN_ON(preempt_count() == 1); | ||
307 | arch_leave_lazy_cpu_mode(); | ||
308 | arch_enter_lazy_cpu_mode(); | ||
309 | } | ||
310 | |||
311 | preempt_enable(); | ||
312 | } | ||
313 | |||
314 | struct pv_info pv_info = { | 314 | struct pv_info pv_info = { |
315 | .name = "bare hardware", | 315 | .name = "bare hardware", |
316 | .paravirt_enabled = 0, | 316 | .paravirt_enabled = 0, |
@@ -402,10 +402,8 @@ struct pv_cpu_ops pv_cpu_ops = { | |||
402 | .set_iopl_mask = native_set_iopl_mask, | 402 | .set_iopl_mask = native_set_iopl_mask, |
403 | .io_delay = native_io_delay, | 403 | .io_delay = native_io_delay, |
404 | 404 | ||
405 | .lazy_mode = { | 405 | .start_context_switch = paravirt_nop, |
406 | .enter = paravirt_nop, | 406 | .end_context_switch = paravirt_nop, |
407 | .leave = paravirt_nop, | ||
408 | }, | ||
409 | }; | 407 | }; |
410 | 408 | ||
411 | struct pv_apic_ops pv_apic_ops = { | 409 | struct pv_apic_ops pv_apic_ops = { |
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 755c21e906f3..971a3bec47a8 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c | |||
@@ -186,37 +186,6 @@ static struct cal_chipset_ops calioc2_chip_ops = { | |||
186 | 186 | ||
187 | static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, }; | 187 | static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, }; |
188 | 188 | ||
189 | /* enable this to stress test the chip's TCE cache */ | ||
190 | #ifdef CONFIG_IOMMU_DEBUG | ||
191 | static int debugging = 1; | ||
192 | |||
193 | static inline unsigned long verify_bit_range(unsigned long* bitmap, | ||
194 | int expected, unsigned long start, unsigned long end) | ||
195 | { | ||
196 | unsigned long idx = start; | ||
197 | |||
198 | BUG_ON(start >= end); | ||
199 | |||
200 | while (idx < end) { | ||
201 | if (!!test_bit(idx, bitmap) != expected) | ||
202 | return idx; | ||
203 | ++idx; | ||
204 | } | ||
205 | |||
206 | /* all bits have the expected value */ | ||
207 | return ~0UL; | ||
208 | } | ||
209 | #else /* debugging is disabled */ | ||
210 | static int debugging; | ||
211 | |||
212 | static inline unsigned long verify_bit_range(unsigned long* bitmap, | ||
213 | int expected, unsigned long start, unsigned long end) | ||
214 | { | ||
215 | return ~0UL; | ||
216 | } | ||
217 | |||
218 | #endif /* CONFIG_IOMMU_DEBUG */ | ||
219 | |||
220 | static inline int translation_enabled(struct iommu_table *tbl) | 189 | static inline int translation_enabled(struct iommu_table *tbl) |
221 | { | 190 | { |
222 | /* only PHBs with translation enabled have an IOMMU table */ | 191 | /* only PHBs with translation enabled have an IOMMU table */ |
@@ -228,7 +197,6 @@ static void iommu_range_reserve(struct iommu_table *tbl, | |||
228 | { | 197 | { |
229 | unsigned long index; | 198 | unsigned long index; |
230 | unsigned long end; | 199 | unsigned long end; |
231 | unsigned long badbit; | ||
232 | unsigned long flags; | 200 | unsigned long flags; |
233 | 201 | ||
234 | index = start_addr >> PAGE_SHIFT; | 202 | index = start_addr >> PAGE_SHIFT; |
@@ -243,14 +211,6 @@ static void iommu_range_reserve(struct iommu_table *tbl, | |||
243 | 211 | ||
244 | spin_lock_irqsave(&tbl->it_lock, flags); | 212 | spin_lock_irqsave(&tbl->it_lock, flags); |
245 | 213 | ||
246 | badbit = verify_bit_range(tbl->it_map, 0, index, end); | ||
247 | if (badbit != ~0UL) { | ||
248 | if (printk_ratelimit()) | ||
249 | printk(KERN_ERR "Calgary: entry already allocated at " | ||
250 | "0x%lx tbl %p dma 0x%lx npages %u\n", | ||
251 | badbit, tbl, start_addr, npages); | ||
252 | } | ||
253 | |||
254 | iommu_area_reserve(tbl->it_map, index, npages); | 214 | iommu_area_reserve(tbl->it_map, index, npages); |
255 | 215 | ||
256 | spin_unlock_irqrestore(&tbl->it_lock, flags); | 216 | spin_unlock_irqrestore(&tbl->it_lock, flags); |
@@ -326,7 +286,6 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, | |||
326 | unsigned int npages) | 286 | unsigned int npages) |
327 | { | 287 | { |
328 | unsigned long entry; | 288 | unsigned long entry; |
329 | unsigned long badbit; | ||
330 | unsigned long badend; | 289 | unsigned long badend; |
331 | unsigned long flags; | 290 | unsigned long flags; |
332 | 291 | ||
@@ -346,14 +305,6 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, | |||
346 | 305 | ||
347 | spin_lock_irqsave(&tbl->it_lock, flags); | 306 | spin_lock_irqsave(&tbl->it_lock, flags); |
348 | 307 | ||
349 | badbit = verify_bit_range(tbl->it_map, 1, entry, entry + npages); | ||
350 | if (badbit != ~0UL) { | ||
351 | if (printk_ratelimit()) | ||
352 | printk(KERN_ERR "Calgary: bit is off at 0x%lx " | ||
353 | "tbl %p dma 0x%Lx entry 0x%lx npages %u\n", | ||
354 | badbit, tbl, dma_addr, entry, npages); | ||
355 | } | ||
356 | |||
357 | iommu_area_free(tbl->it_map, entry, npages); | 308 | iommu_area_free(tbl->it_map, entry, npages); |
358 | 309 | ||
359 | spin_unlock_irqrestore(&tbl->it_lock, flags); | 310 | spin_unlock_irqrestore(&tbl->it_lock, flags); |
@@ -1488,9 +1439,8 @@ void __init detect_calgary(void) | |||
1488 | iommu_detected = 1; | 1439 | iommu_detected = 1; |
1489 | calgary_detected = 1; | 1440 | calgary_detected = 1; |
1490 | printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected.\n"); | 1441 | printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected.\n"); |
1491 | printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d, " | 1442 | printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d\n", |
1492 | "CONFIG_IOMMU_DEBUG is %s.\n", specified_table_size, | 1443 | specified_table_size); |
1493 | debugging ? "enabled" : "disabled"); | ||
1494 | 1444 | ||
1495 | /* swiotlb for devices that aren't behind the Calgary. */ | 1445 | /* swiotlb for devices that aren't behind the Calgary. */ |
1496 | if (max_pfn > MAX_DMA32_PFN) | 1446 | if (max_pfn > MAX_DMA32_PFN) |
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index b284b58c035c..cfd9f9063896 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c | |||
@@ -144,48 +144,21 @@ static void flush_gart(void) | |||
144 | } | 144 | } |
145 | 145 | ||
146 | #ifdef CONFIG_IOMMU_LEAK | 146 | #ifdef CONFIG_IOMMU_LEAK |
147 | |||
148 | #define SET_LEAK(x) \ | ||
149 | do { \ | ||
150 | if (iommu_leak_tab) \ | ||
151 | iommu_leak_tab[x] = __builtin_return_address(0);\ | ||
152 | } while (0) | ||
153 | |||
154 | #define CLEAR_LEAK(x) \ | ||
155 | do { \ | ||
156 | if (iommu_leak_tab) \ | ||
157 | iommu_leak_tab[x] = NULL; \ | ||
158 | } while (0) | ||
159 | |||
160 | /* Debugging aid for drivers that don't free their IOMMU tables */ | 147 | /* Debugging aid for drivers that don't free their IOMMU tables */ |
161 | static void **iommu_leak_tab; | ||
162 | static int leak_trace; | 148 | static int leak_trace; |
163 | static int iommu_leak_pages = 20; | 149 | static int iommu_leak_pages = 20; |
164 | 150 | ||
165 | static void dump_leak(void) | 151 | static void dump_leak(void) |
166 | { | 152 | { |
167 | int i; | ||
168 | static int dump; | 153 | static int dump; |
169 | 154 | ||
170 | if (dump || !iommu_leak_tab) | 155 | if (dump) |
171 | return; | 156 | return; |
172 | dump = 1; | 157 | dump = 1; |
173 | show_stack(NULL, NULL); | ||
174 | 158 | ||
175 | /* Very crude. dump some from the end of the table too */ | 159 | show_stack(NULL, NULL); |
176 | printk(KERN_DEBUG "Dumping %d pages from end of IOMMU:\n", | 160 | debug_dma_dump_mappings(NULL); |
177 | iommu_leak_pages); | ||
178 | for (i = 0; i < iommu_leak_pages; i += 2) { | ||
179 | printk(KERN_DEBUG "%lu: ", iommu_pages-i); | ||
180 | printk_address((unsigned long) iommu_leak_tab[iommu_pages-i], | ||
181 | 0); | ||
182 | printk(KERN_CONT "%c", (i+1)%2 == 0 ? '\n' : ' '); | ||
183 | } | ||
184 | printk(KERN_DEBUG "\n"); | ||
185 | } | 161 | } |
186 | #else | ||
187 | # define SET_LEAK(x) | ||
188 | # define CLEAR_LEAK(x) | ||
189 | #endif | 162 | #endif |
190 | 163 | ||
191 | static void iommu_full(struct device *dev, size_t size, int dir) | 164 | static void iommu_full(struct device *dev, size_t size, int dir) |
@@ -248,7 +221,6 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, | |||
248 | 221 | ||
249 | for (i = 0; i < npages; i++) { | 222 | for (i = 0; i < npages; i++) { |
250 | iommu_gatt_base[iommu_page + i] = GPTE_ENCODE(phys_mem); | 223 | iommu_gatt_base[iommu_page + i] = GPTE_ENCODE(phys_mem); |
251 | SET_LEAK(iommu_page + i); | ||
252 | phys_mem += PAGE_SIZE; | 224 | phys_mem += PAGE_SIZE; |
253 | } | 225 | } |
254 | return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK); | 226 | return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK); |
@@ -294,7 +266,6 @@ static void gart_unmap_page(struct device *dev, dma_addr_t dma_addr, | |||
294 | npages = iommu_num_pages(dma_addr, size, PAGE_SIZE); | 266 | npages = iommu_num_pages(dma_addr, size, PAGE_SIZE); |
295 | for (i = 0; i < npages; i++) { | 267 | for (i = 0; i < npages; i++) { |
296 | iommu_gatt_base[iommu_page + i] = gart_unmapped_entry; | 268 | iommu_gatt_base[iommu_page + i] = gart_unmapped_entry; |
297 | CLEAR_LEAK(iommu_page + i); | ||
298 | } | 269 | } |
299 | free_iommu(iommu_page, npages); | 270 | free_iommu(iommu_page, npages); |
300 | } | 271 | } |
@@ -377,7 +348,6 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start, | |||
377 | pages = iommu_num_pages(s->offset, s->length, PAGE_SIZE); | 348 | pages = iommu_num_pages(s->offset, s->length, PAGE_SIZE); |
378 | while (pages--) { | 349 | while (pages--) { |
379 | iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); | 350 | iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); |
380 | SET_LEAK(iommu_page); | ||
381 | addr += PAGE_SIZE; | 351 | addr += PAGE_SIZE; |
382 | iommu_page++; | 352 | iommu_page++; |
383 | } | 353 | } |
@@ -688,8 +658,6 @@ static __init int init_k8_gatt(struct agp_kern_info *info) | |||
688 | 658 | ||
689 | agp_gatt_table = gatt; | 659 | agp_gatt_table = gatt; |
690 | 660 | ||
691 | enable_gart_translations(); | ||
692 | |||
693 | error = sysdev_class_register(&gart_sysdev_class); | 661 | error = sysdev_class_register(&gart_sysdev_class); |
694 | if (!error) | 662 | if (!error) |
695 | error = sysdev_register(&device_gart); | 663 | error = sysdev_register(&device_gart); |
@@ -801,11 +769,12 @@ void __init gart_iommu_init(void) | |||
801 | 769 | ||
802 | #ifdef CONFIG_IOMMU_LEAK | 770 | #ifdef CONFIG_IOMMU_LEAK |
803 | if (leak_trace) { | 771 | if (leak_trace) { |
804 | iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, | 772 | int ret; |
805 | get_order(iommu_pages*sizeof(void *))); | 773 | |
806 | if (!iommu_leak_tab) | 774 | ret = dma_debug_resize_entries(iommu_pages); |
775 | if (ret) | ||
807 | printk(KERN_DEBUG | 776 | printk(KERN_DEBUG |
808 | "PCI-DMA: Cannot allocate leak trace area\n"); | 777 | "PCI-DMA: Cannot trace all the entries\n"); |
809 | } | 778 | } |
810 | #endif | 779 | #endif |
811 | 780 | ||
@@ -845,6 +814,14 @@ void __init gart_iommu_init(void) | |||
845 | * the pages as Not-Present: | 814 | * the pages as Not-Present: |
846 | */ | 815 | */ |
847 | wbinvd(); | 816 | wbinvd(); |
817 | |||
818 | /* | ||
819 | * Now all caches are flushed and we can safely enable | ||
820 | * GART hardware. Doing it early leaves the possibility | ||
821 | * of stale cache entries that can lead to GART PTE | ||
822 | * errors. | ||
823 | */ | ||
824 | enable_gart_translations(); | ||
848 | 825 | ||
849 | /* | 826 | /* |
850 | * Try to workaround a bug (thanks to BenH): | 827 | * Try to workaround a bug (thanks to BenH): |
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index 221a3853e268..a1712f2b50f1 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c | |||
@@ -28,7 +28,7 @@ dma_addr_t swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t paddr) | |||
28 | return paddr; | 28 | return paddr; |
29 | } | 29 | } |
30 | 30 | ||
31 | phys_addr_t swiotlb_bus_to_phys(dma_addr_t baddr) | 31 | phys_addr_t swiotlb_bus_to_phys(struct device *hwdev, dma_addr_t baddr) |
32 | { | 32 | { |
33 | return baddr; | 33 | return baddr; |
34 | } | 34 | } |
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 19a686c401b5..fc6e4b773fc4 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c | |||
@@ -8,9 +8,11 @@ | |||
8 | #include <linux/module.h> | 8 | #include <linux/module.h> |
9 | #include <linux/pm.h> | 9 | #include <linux/pm.h> |
10 | #include <linux/clockchips.h> | 10 | #include <linux/clockchips.h> |
11 | #include <linux/random.h> | ||
11 | #include <trace/power.h> | 12 | #include <trace/power.h> |
12 | #include <asm/system.h> | 13 | #include <asm/system.h> |
13 | #include <asm/apic.h> | 14 | #include <asm/apic.h> |
15 | #include <asm/syscalls.h> | ||
14 | #include <asm/idle.h> | 16 | #include <asm/idle.h> |
15 | #include <asm/uaccess.h> | 17 | #include <asm/uaccess.h> |
16 | #include <asm/i387.h> | 18 | #include <asm/i387.h> |
@@ -65,7 +67,7 @@ void arch_task_cache_init(void) | |||
65 | task_xstate_cachep = | 67 | task_xstate_cachep = |
66 | kmem_cache_create("task_xstate", xstate_size, | 68 | kmem_cache_create("task_xstate", xstate_size, |
67 | __alignof__(union thread_xstate), | 69 | __alignof__(union thread_xstate), |
68 | SLAB_PANIC, NULL); | 70 | SLAB_PANIC | SLAB_NOTRACK, NULL); |
69 | } | 71 | } |
70 | 72 | ||
71 | /* | 73 | /* |
@@ -604,3 +606,16 @@ static int __init idle_setup(char *str) | |||
604 | } | 606 | } |
605 | early_param("idle", idle_setup); | 607 | early_param("idle", idle_setup); |
606 | 608 | ||
609 | unsigned long arch_align_stack(unsigned long sp) | ||
610 | { | ||
611 | if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) | ||
612 | sp -= get_random_int() % 8192; | ||
613 | return sp & ~0xf; | ||
614 | } | ||
615 | |||
616 | unsigned long arch_randomize_brk(struct mm_struct *mm) | ||
617 | { | ||
618 | unsigned long range_end = mm->brk + 0x02000000; | ||
619 | return randomize_range(mm->brk, range_end, 0) ? : mm->brk; | ||
620 | } | ||
621 | |||
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 297ffff2ffc2..00a8fe4c58bb 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c | |||
@@ -9,8 +9,6 @@ | |||
9 | * This file handles the architecture-dependent parts of process handling.. | 9 | * This file handles the architecture-dependent parts of process handling.. |
10 | */ | 10 | */ |
11 | 11 | ||
12 | #include <stdarg.h> | ||
13 | |||
14 | #include <linux/stackprotector.h> | 12 | #include <linux/stackprotector.h> |
15 | #include <linux/cpu.h> | 13 | #include <linux/cpu.h> |
16 | #include <linux/errno.h> | 14 | #include <linux/errno.h> |
@@ -33,7 +31,6 @@ | |||
33 | #include <linux/module.h> | 31 | #include <linux/module.h> |
34 | #include <linux/kallsyms.h> | 32 | #include <linux/kallsyms.h> |
35 | #include <linux/ptrace.h> | 33 | #include <linux/ptrace.h> |
36 | #include <linux/random.h> | ||
37 | #include <linux/personality.h> | 34 | #include <linux/personality.h> |
38 | #include <linux/tick.h> | 35 | #include <linux/tick.h> |
39 | #include <linux/percpu.h> | 36 | #include <linux/percpu.h> |
@@ -419,7 +416,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | |||
419 | * done before math_state_restore, so the TS bit is up | 416 | * done before math_state_restore, so the TS bit is up |
420 | * to date. | 417 | * to date. |
421 | */ | 418 | */ |
422 | arch_leave_lazy_cpu_mode(); | 419 | arch_end_context_switch(next_p); |
423 | 420 | ||
424 | /* If the task has used fpu the last 5 timeslices, just do a full | 421 | /* If the task has used fpu the last 5 timeslices, just do a full |
425 | * restore of the math state immediately to avoid the trap; the | 422 | * restore of the math state immediately to avoid the trap; the |
@@ -526,15 +523,3 @@ unsigned long get_wchan(struct task_struct *p) | |||
526 | return 0; | 523 | return 0; |
527 | } | 524 | } |
528 | 525 | ||
529 | unsigned long arch_align_stack(unsigned long sp) | ||
530 | { | ||
531 | if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) | ||
532 | sp -= get_random_int() % 8192; | ||
533 | return sp & ~0xf; | ||
534 | } | ||
535 | |||
536 | unsigned long arch_randomize_brk(struct mm_struct *mm) | ||
537 | { | ||
538 | unsigned long range_end = mm->brk + 0x02000000; | ||
539 | return randomize_range(mm->brk, range_end, 0) ? : mm->brk; | ||
540 | } | ||
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index f7b276d4b3fb..89c46f1259d3 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c | |||
@@ -14,8 +14,6 @@ | |||
14 | * This file handles the architecture-dependent parts of process handling.. | 14 | * This file handles the architecture-dependent parts of process handling.. |
15 | */ | 15 | */ |
16 | 16 | ||
17 | #include <stdarg.h> | ||
18 | |||
19 | #include <linux/stackprotector.h> | 17 | #include <linux/stackprotector.h> |
20 | #include <linux/cpu.h> | 18 | #include <linux/cpu.h> |
21 | #include <linux/errno.h> | 19 | #include <linux/errno.h> |
@@ -32,7 +30,6 @@ | |||
32 | #include <linux/delay.h> | 30 | #include <linux/delay.h> |
33 | #include <linux/module.h> | 31 | #include <linux/module.h> |
34 | #include <linux/ptrace.h> | 32 | #include <linux/ptrace.h> |
35 | #include <linux/random.h> | ||
36 | #include <linux/notifier.h> | 33 | #include <linux/notifier.h> |
37 | #include <linux/kprobes.h> | 34 | #include <linux/kprobes.h> |
38 | #include <linux/kdebug.h> | 35 | #include <linux/kdebug.h> |
@@ -442,7 +439,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | |||
442 | * done before math_state_restore, so the TS bit is up | 439 | * done before math_state_restore, so the TS bit is up |
443 | * to date. | 440 | * to date. |
444 | */ | 441 | */ |
445 | arch_leave_lazy_cpu_mode(); | 442 | arch_end_context_switch(next_p); |
446 | 443 | ||
447 | /* | 444 | /* |
448 | * Switch FS and GS. | 445 | * Switch FS and GS. |
@@ -692,15 +689,3 @@ long sys_arch_prctl(int code, unsigned long addr) | |||
692 | return do_arch_prctl(current, code, addr); | 689 | return do_arch_prctl(current, code, addr); |
693 | } | 690 | } |
694 | 691 | ||
695 | unsigned long arch_align_stack(unsigned long sp) | ||
696 | { | ||
697 | if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) | ||
698 | sp -= get_random_int() % 8192; | ||
699 | return sp & ~0xf; | ||
700 | } | ||
701 | |||
702 | unsigned long arch_randomize_brk(struct mm_struct *mm) | ||
703 | { | ||
704 | unsigned long range_end = mm->brk + 0x02000000; | ||
705 | return randomize_range(mm->brk, range_end, 0) ? : mm->brk; | ||
706 | } | ||
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 7563b31b4f03..af71d06624bf 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c | |||
@@ -491,5 +491,42 @@ void force_hpet_resume(void) | |||
491 | break; | 491 | break; |
492 | } | 492 | } |
493 | } | 493 | } |
494 | #endif | ||
495 | |||
496 | #if defined(CONFIG_PCI) && defined(CONFIG_NUMA) | ||
497 | /* Set correct numa_node information for AMD NB functions */ | ||
498 | static void __init quirk_amd_nb_node(struct pci_dev *dev) | ||
499 | { | ||
500 | struct pci_dev *nb_ht; | ||
501 | unsigned int devfn; | ||
502 | u32 val; | ||
503 | |||
504 | devfn = PCI_DEVFN(PCI_SLOT(dev->devfn), 0); | ||
505 | nb_ht = pci_get_slot(dev->bus, devfn); | ||
506 | if (!nb_ht) | ||
507 | return; | ||
508 | |||
509 | pci_read_config_dword(nb_ht, 0x60, &val); | ||
510 | set_dev_node(&dev->dev, val & 7); | ||
511 | pci_dev_put(dev); | ||
512 | } | ||
494 | 513 | ||
514 | DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB, | ||
515 | quirk_amd_nb_node); | ||
516 | DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_ADDRMAP, | ||
517 | quirk_amd_nb_node); | ||
518 | DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MEMCTL, | ||
519 | quirk_amd_nb_node); | ||
520 | DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC, | ||
521 | quirk_amd_nb_node); | ||
522 | DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_HT, | ||
523 | quirk_amd_nb_node); | ||
524 | DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MAP, | ||
525 | quirk_amd_nb_node); | ||
526 | DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_DRAM, | ||
527 | quirk_amd_nb_node); | ||
528 | DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC, | ||
529 | quirk_amd_nb_node); | ||
530 | DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_LINK, | ||
531 | quirk_amd_nb_node); | ||
495 | #endif | 532 | #endif |
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 1340dad417f4..d2d1ce8170f0 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c | |||
@@ -192,6 +192,15 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { | |||
192 | DMI_MATCH(DMI_BOARD_NAME, "0KP561"), | 192 | DMI_MATCH(DMI_BOARD_NAME, "0KP561"), |
193 | }, | 193 | }, |
194 | }, | 194 | }, |
195 | { /* Handle problems with rebooting on Dell Optiplex 360 with 0T656F */ | ||
196 | .callback = set_bios_reboot, | ||
197 | .ident = "Dell OptiPlex 360", | ||
198 | .matches = { | ||
199 | DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), | ||
200 | DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 360"), | ||
201 | DMI_MATCH(DMI_BOARD_NAME, "0T656F"), | ||
202 | }, | ||
203 | }, | ||
195 | { /* Handle problems with rebooting on Dell 2400's */ | 204 | { /* Handle problems with rebooting on Dell 2400's */ |
196 | .callback = set_bios_reboot, | 205 | .callback = set_bios_reboot, |
197 | .ident = "Dell PowerEdge 2400", | 206 | .ident = "Dell PowerEdge 2400", |
@@ -232,6 +241,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { | |||
232 | DMI_MATCH(DMI_PRODUCT_NAME, "Dell DXP061"), | 241 | DMI_MATCH(DMI_PRODUCT_NAME, "Dell DXP061"), |
233 | }, | 242 | }, |
234 | }, | 243 | }, |
244 | { /* Handle problems with rebooting on Sony VGN-Z540N */ | ||
245 | .callback = set_bios_reboot, | ||
246 | .ident = "Sony VGN-Z540N", | ||
247 | .matches = { | ||
248 | DMI_MATCH(DMI_SYS_VENDOR, "Sony Corporation"), | ||
249 | DMI_MATCH(DMI_PRODUCT_NAME, "VGN-Z540N"), | ||
250 | }, | ||
251 | }, | ||
235 | { } | 252 | { } |
236 | }; | 253 | }; |
237 | 254 | ||
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index b4158439bf63..be5ae80f897f 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c | |||
@@ -112,6 +112,14 @@ | |||
112 | #define ARCH_SETUP | 112 | #define ARCH_SETUP |
113 | #endif | 113 | #endif |
114 | 114 | ||
115 | /* | ||
116 | * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. | ||
117 | * The direct mapping extends to max_pfn_mapped, so that we can directly access | ||
118 | * apertures, ACPI and other tables without having to play with fixmaps. | ||
119 | */ | ||
120 | unsigned long max_low_pfn_mapped; | ||
121 | unsigned long max_pfn_mapped; | ||
122 | |||
115 | RESERVE_BRK(dmi_alloc, 65536); | 123 | RESERVE_BRK(dmi_alloc, 65536); |
116 | 124 | ||
117 | unsigned int boot_cpu_id __read_mostly; | 125 | unsigned int boot_cpu_id __read_mostly; |
@@ -214,8 +222,8 @@ unsigned long mmu_cr4_features; | |||
214 | unsigned long mmu_cr4_features = X86_CR4_PAE; | 222 | unsigned long mmu_cr4_features = X86_CR4_PAE; |
215 | #endif | 223 | #endif |
216 | 224 | ||
217 | /* Boot loader ID as an integer, for the benefit of proc_dointvec */ | 225 | /* Boot loader ID and version as integers, for the benefit of proc_dointvec */ |
218 | int bootloader_type; | 226 | int bootloader_type, bootloader_version; |
219 | 227 | ||
220 | /* | 228 | /* |
221 | * Setup options | 229 | * Setup options |
@@ -293,15 +301,13 @@ static void __init reserve_brk(void) | |||
293 | 301 | ||
294 | #ifdef CONFIG_BLK_DEV_INITRD | 302 | #ifdef CONFIG_BLK_DEV_INITRD |
295 | 303 | ||
296 | #ifdef CONFIG_X86_32 | ||
297 | |||
298 | #define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT) | 304 | #define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT) |
299 | static void __init relocate_initrd(void) | 305 | static void __init relocate_initrd(void) |
300 | { | 306 | { |
301 | 307 | ||
302 | u64 ramdisk_image = boot_params.hdr.ramdisk_image; | 308 | u64 ramdisk_image = boot_params.hdr.ramdisk_image; |
303 | u64 ramdisk_size = boot_params.hdr.ramdisk_size; | 309 | u64 ramdisk_size = boot_params.hdr.ramdisk_size; |
304 | u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT; | 310 | u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT; |
305 | u64 ramdisk_here; | 311 | u64 ramdisk_here; |
306 | unsigned long slop, clen, mapaddr; | 312 | unsigned long slop, clen, mapaddr; |
307 | char *p, *q; | 313 | char *p, *q; |
@@ -357,14 +363,13 @@ static void __init relocate_initrd(void) | |||
357 | ramdisk_image, ramdisk_image + ramdisk_size - 1, | 363 | ramdisk_image, ramdisk_image + ramdisk_size - 1, |
358 | ramdisk_here, ramdisk_here + ramdisk_size - 1); | 364 | ramdisk_here, ramdisk_here + ramdisk_size - 1); |
359 | } | 365 | } |
360 | #endif | ||
361 | 366 | ||
362 | static void __init reserve_initrd(void) | 367 | static void __init reserve_initrd(void) |
363 | { | 368 | { |
364 | u64 ramdisk_image = boot_params.hdr.ramdisk_image; | 369 | u64 ramdisk_image = boot_params.hdr.ramdisk_image; |
365 | u64 ramdisk_size = boot_params.hdr.ramdisk_size; | 370 | u64 ramdisk_size = boot_params.hdr.ramdisk_size; |
366 | u64 ramdisk_end = ramdisk_image + ramdisk_size; | 371 | u64 ramdisk_end = ramdisk_image + ramdisk_size; |
367 | u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT; | 372 | u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT; |
368 | 373 | ||
369 | if (!boot_params.hdr.type_of_loader || | 374 | if (!boot_params.hdr.type_of_loader || |
370 | !ramdisk_image || !ramdisk_size) | 375 | !ramdisk_image || !ramdisk_size) |
@@ -394,14 +399,8 @@ static void __init reserve_initrd(void) | |||
394 | return; | 399 | return; |
395 | } | 400 | } |
396 | 401 | ||
397 | #ifdef CONFIG_X86_32 | ||
398 | relocate_initrd(); | 402 | relocate_initrd(); |
399 | #else | 403 | |
400 | printk(KERN_ERR "initrd extends beyond end of memory " | ||
401 | "(0x%08llx > 0x%08llx)\ndisabling initrd\n", | ||
402 | ramdisk_end, end_of_lowmem); | ||
403 | initrd_start = 0; | ||
404 | #endif | ||
405 | free_early(ramdisk_image, ramdisk_end); | 404 | free_early(ramdisk_image, ramdisk_end); |
406 | } | 405 | } |
407 | #else | 406 | #else |
@@ -706,6 +705,12 @@ void __init setup_arch(char **cmdline_p) | |||
706 | #endif | 705 | #endif |
707 | saved_video_mode = boot_params.hdr.vid_mode; | 706 | saved_video_mode = boot_params.hdr.vid_mode; |
708 | bootloader_type = boot_params.hdr.type_of_loader; | 707 | bootloader_type = boot_params.hdr.type_of_loader; |
708 | if ((bootloader_type >> 4) == 0xe) { | ||
709 | bootloader_type &= 0xf; | ||
710 | bootloader_type |= (boot_params.hdr.ext_loader_type+0x10) << 4; | ||
711 | } | ||
712 | bootloader_version = bootloader_type & 0xf; | ||
713 | bootloader_version |= boot_params.hdr.ext_loader_ver << 4; | ||
709 | 714 | ||
710 | #ifdef CONFIG_BLK_DEV_RAM | 715 | #ifdef CONFIG_BLK_DEV_RAM |
711 | rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK; | 716 | rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK; |
@@ -854,12 +859,16 @@ void __init setup_arch(char **cmdline_p) | |||
854 | max_low_pfn = max_pfn; | 859 | max_low_pfn = max_pfn; |
855 | 860 | ||
856 | high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; | 861 | high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; |
862 | max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT; | ||
857 | #endif | 863 | #endif |
858 | 864 | ||
859 | #ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION | 865 | #ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION |
860 | setup_bios_corruption_check(); | 866 | setup_bios_corruption_check(); |
861 | #endif | 867 | #endif |
862 | 868 | ||
869 | printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n", | ||
870 | max_pfn_mapped<<PAGE_SHIFT); | ||
871 | |||
863 | reserve_brk(); | 872 | reserve_brk(); |
864 | 873 | ||
865 | /* max_pfn_mapped is updated here */ | 874 | /* max_pfn_mapped is updated here */ |
@@ -997,24 +1006,6 @@ void __init setup_arch(char **cmdline_p) | |||
997 | #ifdef CONFIG_X86_32 | 1006 | #ifdef CONFIG_X86_32 |
998 | 1007 | ||
999 | /** | 1008 | /** |
1000 | * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors | ||
1001 | * | ||
1002 | * Description: | ||
1003 | * Perform any necessary interrupt initialisation prior to setting up | ||
1004 | * the "ordinary" interrupt call gates. For legacy reasons, the ISA | ||
1005 | * interrupts should be initialised here if the machine emulates a PC | ||
1006 | * in any way. | ||
1007 | **/ | ||
1008 | void __init x86_quirk_pre_intr_init(void) | ||
1009 | { | ||
1010 | if (x86_quirks->arch_pre_intr_init) { | ||
1011 | if (x86_quirks->arch_pre_intr_init()) | ||
1012 | return; | ||
1013 | } | ||
1014 | init_ISA_irqs(); | ||
1015 | } | ||
1016 | |||
1017 | /** | ||
1018 | * x86_quirk_intr_init - post gate setup interrupt initialisation | 1009 | * x86_quirk_intr_init - post gate setup interrupt initialisation |
1019 | * | 1010 | * |
1020 | * Description: | 1011 | * Description: |
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 3a97a4cf1872..9c3f0823e6aa 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c | |||
@@ -160,8 +160,10 @@ static ssize_t __init setup_pcpu_remap(size_t static_size) | |||
160 | /* | 160 | /* |
161 | * If large page isn't supported, there's no benefit in doing | 161 | * If large page isn't supported, there's no benefit in doing |
162 | * this. Also, on non-NUMA, embedding is better. | 162 | * this. Also, on non-NUMA, embedding is better. |
163 | * | ||
164 | * NOTE: disabled for now. | ||
163 | */ | 165 | */ |
164 | if (!cpu_has_pse || !pcpu_need_numa()) | 166 | if (true || !cpu_has_pse || !pcpu_need_numa()) |
165 | return -EINVAL; | 167 | return -EINVAL; |
166 | 168 | ||
167 | /* | 169 | /* |
@@ -423,6 +425,14 @@ void __init setup_per_cpu_areas(void) | |||
423 | early_per_cpu_ptr(x86_cpu_to_node_map) = NULL; | 425 | early_per_cpu_ptr(x86_cpu_to_node_map) = NULL; |
424 | #endif | 426 | #endif |
425 | 427 | ||
428 | #if defined(CONFIG_X86_64) && defined(CONFIG_NUMA) | ||
429 | /* | ||
430 | * make sure the boot cpu's node_number is right when the boot cpu is on a | ||
431 | * node that doesn't have memory installed | ||
432 | */ | ||
433 | per_cpu(node_number, boot_cpu_id) = cpu_to_node(boot_cpu_id); | ||
434 | #endif | ||
435 | |||
426 | /* Setup node to cpumask map */ | 436 | /* Setup node to cpumask map */ |
427 | setup_node_to_cpumask_map(); | 437 | setup_node_to_cpumask_map(); |
428 | 438 | ||
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index f33d2e0ef095..0f89a4f20db2 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c | |||
@@ -6,7 +6,6 @@ | |||
6 | * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes | 6 | * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes |
7 | * 2000-2002 x86-64 support by Andi Kleen | 7 | * 2000-2002 x86-64 support by Andi Kleen |
8 | */ | 8 | */ |
9 | |||
10 | #include <linux/sched.h> | 9 | #include <linux/sched.h> |
11 | #include <linux/mm.h> | 10 | #include <linux/mm.h> |
12 | #include <linux/smp.h> | 11 | #include <linux/smp.h> |
@@ -25,11 +24,11 @@ | |||
25 | #include <asm/ucontext.h> | 24 | #include <asm/ucontext.h> |
26 | #include <asm/i387.h> | 25 | #include <asm/i387.h> |
27 | #include <asm/vdso.h> | 26 | #include <asm/vdso.h> |
27 | #include <asm/mce.h> | ||
28 | 28 | ||
29 | #ifdef CONFIG_X86_64 | 29 | #ifdef CONFIG_X86_64 |
30 | #include <asm/proto.h> | 30 | #include <asm/proto.h> |
31 | #include <asm/ia32_unistd.h> | 31 | #include <asm/ia32_unistd.h> |
32 | #include <asm/mce.h> | ||
33 | #endif /* CONFIG_X86_64 */ | 32 | #endif /* CONFIG_X86_64 */ |
34 | 33 | ||
35 | #include <asm/syscall.h> | 34 | #include <asm/syscall.h> |
@@ -848,10 +847,10 @@ static void do_signal(struct pt_regs *regs) | |||
848 | void | 847 | void |
849 | do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) | 848 | do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) |
850 | { | 849 | { |
851 | #if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE) | 850 | #ifdef CONFIG_X86_NEW_MCE |
852 | /* notify userspace of pending MCEs */ | 851 | /* notify userspace of pending MCEs */ |
853 | if (thread_info_flags & _TIF_MCE_NOTIFY) | 852 | if (thread_info_flags & _TIF_MCE_NOTIFY) |
854 | mce_notify_user(); | 853 | mce_notify_process(); |
855 | #endif /* CONFIG_X86_64 && CONFIG_X86_MCE */ | 854 | #endif /* CONFIG_X86_64 && CONFIG_X86_MCE */ |
856 | 855 | ||
857 | /* deal with pending signal delivery */ | 856 | /* deal with pending signal delivery */ |
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 13f33ea8ccaa..ec1de97600e7 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c | |||
@@ -150,14 +150,40 @@ void native_send_call_func_ipi(const struct cpumask *mask) | |||
150 | * this function calls the 'stop' function on all other CPUs in the system. | 150 | * this function calls the 'stop' function on all other CPUs in the system. |
151 | */ | 151 | */ |
152 | 152 | ||
153 | asmlinkage void smp_reboot_interrupt(void) | ||
154 | { | ||
155 | ack_APIC_irq(); | ||
156 | irq_enter(); | ||
157 | stop_this_cpu(NULL); | ||
158 | irq_exit(); | ||
159 | } | ||
160 | |||
153 | static void native_smp_send_stop(void) | 161 | static void native_smp_send_stop(void) |
154 | { | 162 | { |
155 | unsigned long flags; | 163 | unsigned long flags; |
164 | unsigned long wait; | ||
156 | 165 | ||
157 | if (reboot_force) | 166 | if (reboot_force) |
158 | return; | 167 | return; |
159 | 168 | ||
160 | smp_call_function(stop_this_cpu, NULL, 0); | 169 | /* |
170 | * Use a dedicated vector here because smp_call_function | ||
171 | * does lots of things not suitable in a panic situation. | ||
172 | * On most systems we could also use an NMI here, | ||
173 | * but there are a few systems around where NMI | ||
174 | * is problematic, so stay with a non-NMI vector for now | ||
175 | * (this implies we cannot stop CPUs spinning with irq off | ||
176 | * currently) | ||
177 | */ | ||
178 | if (num_online_cpus() > 1) { | ||
179 | apic->send_IPI_allbutself(REBOOT_VECTOR); | ||
180 | |||
181 | /* Don't wait longer than a second */ | ||
182 | wait = USEC_PER_SEC; | ||
183 | while (num_online_cpus() > 1 && wait--) | ||
184 | udelay(1); | ||
185 | } | ||
186 | |||
161 | local_irq_save(flags); | 187 | local_irq_save(flags); |
162 | disable_local_APIC(); | 188 | disable_local_APIC(); |
163 | local_irq_restore(flags); | 189 | local_irq_restore(flags); |
@@ -172,6 +198,9 @@ void smp_reschedule_interrupt(struct pt_regs *regs) | |||
172 | { | 198 | { |
173 | ack_APIC_irq(); | 199 | ack_APIC_irq(); |
174 | inc_irq_stat(irq_resched_count); | 200 | inc_irq_stat(irq_resched_count); |
201 | /* | ||
202 | * KVM uses this interrupt to force a cpu out of guest mode | ||
203 | */ | ||
175 | } | 204 | } |
176 | 205 | ||
177 | void smp_call_function_interrupt(struct pt_regs *regs) | 206 | void smp_call_function_interrupt(struct pt_regs *regs) |
@@ -193,19 +222,19 @@ void smp_call_function_single_interrupt(struct pt_regs *regs) | |||
193 | } | 222 | } |
194 | 223 | ||
195 | struct smp_ops smp_ops = { | 224 | struct smp_ops smp_ops = { |
196 | .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu, | 225 | .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu, |
197 | .smp_prepare_cpus = native_smp_prepare_cpus, | 226 | .smp_prepare_cpus = native_smp_prepare_cpus, |
198 | .smp_cpus_done = native_smp_cpus_done, | 227 | .smp_cpus_done = native_smp_cpus_done, |
199 | 228 | ||
200 | .smp_send_stop = native_smp_send_stop, | 229 | .smp_send_stop = native_smp_send_stop, |
201 | .smp_send_reschedule = native_smp_send_reschedule, | 230 | .smp_send_reschedule = native_smp_send_reschedule, |
202 | 231 | ||
203 | .cpu_up = native_cpu_up, | 232 | .cpu_up = native_cpu_up, |
204 | .cpu_die = native_cpu_die, | 233 | .cpu_die = native_cpu_die, |
205 | .cpu_disable = native_cpu_disable, | 234 | .cpu_disable = native_cpu_disable, |
206 | .play_dead = native_play_dead, | 235 | .play_dead = native_play_dead, |
207 | 236 | ||
208 | .send_call_func_ipi = native_send_call_func_ipi, | 237 | .send_call_func_ipi = native_send_call_func_ipi, |
209 | .send_call_func_single_ipi = native_send_call_func_single_ipi, | 238 | .send_call_func_single_ipi = native_send_call_func_single_ipi, |
210 | }; | 239 | }; |
211 | EXPORT_SYMBOL_GPL(smp_ops); | 240 | EXPORT_SYMBOL_GPL(smp_ops); |
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 2b2652d205c0..dee0f3d814af 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c | |||
@@ -506,7 +506,7 @@ void __inquire_remote_apic(int apicid) | |||
506 | * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this | 506 | * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this |
507 | * won't ... remember to clear down the APIC, etc later. | 507 | * won't ... remember to clear down the APIC, etc later. |
508 | */ | 508 | */ |
509 | int __devinit | 509 | int __cpuinit |
510 | wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip) | 510 | wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip) |
511 | { | 511 | { |
512 | unsigned long send_status, accept_status = 0; | 512 | unsigned long send_status, accept_status = 0; |
@@ -540,7 +540,7 @@ wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip) | |||
540 | return (send_status | accept_status); | 540 | return (send_status | accept_status); |
541 | } | 541 | } |
542 | 542 | ||
543 | int __devinit | 543 | static int __cpuinit |
544 | wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) | 544 | wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) |
545 | { | 545 | { |
546 | unsigned long send_status, accept_status = 0; | 546 | unsigned long send_status, accept_status = 0; |
@@ -824,10 +824,12 @@ do_rest: | |||
824 | /* mark "stuck" area as not stuck */ | 824 | /* mark "stuck" area as not stuck */ |
825 | *((volatile unsigned long *)trampoline_base) = 0; | 825 | *((volatile unsigned long *)trampoline_base) = 0; |
826 | 826 | ||
827 | /* | 827 | if (get_uv_system_type() != UV_NON_UNIQUE_APIC) { |
828 | * Cleanup possible dangling ends... | 828 | /* |
829 | */ | 829 | * Cleanup possible dangling ends... |
830 | smpboot_restore_warm_reset_vector(); | 830 | */ |
831 | smpboot_restore_warm_reset_vector(); | ||
832 | } | ||
831 | 833 | ||
832 | return boot_error; | 834 | return boot_error; |
833 | } | 835 | } |
@@ -873,7 +875,7 @@ int __cpuinit native_cpu_up(unsigned int cpu) | |||
873 | 875 | ||
874 | err = do_boot_cpu(apicid, cpu); | 876 | err = do_boot_cpu(apicid, cpu); |
875 | 877 | ||
876 | zap_low_mappings(); | 878 | zap_low_mappings(false); |
877 | low_mappings = 0; | 879 | low_mappings = 0; |
878 | #else | 880 | #else |
879 | err = do_boot_cpu(apicid, cpu); | 881 | err = do_boot_cpu(apicid, cpu); |
@@ -992,10 +994,12 @@ static int __init smp_sanity_check(unsigned max_cpus) | |||
992 | */ | 994 | */ |
993 | if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid]) && | 995 | if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid]) && |
994 | !cpu_has_apic) { | 996 | !cpu_has_apic) { |
995 | printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n", | 997 | if (!disable_apic) { |
996 | boot_cpu_physical_apicid); | 998 | pr_err("BIOS bug, local APIC #%d not detected!...\n", |
997 | printk(KERN_ERR "... forcing use of dummy APIC emulation." | 999 | boot_cpu_physical_apicid); |
1000 | pr_err("... forcing use of dummy APIC emulation." | ||
998 | "(tell your hw vendor)\n"); | 1001 | "(tell your hw vendor)\n"); |
1002 | } | ||
999 | smpboot_clear_io_apic(); | 1003 | smpboot_clear_io_apic(); |
1000 | arch_disable_smp_support(); | 1004 | arch_disable_smp_support(); |
1001 | return -1; | 1005 | return -1; |
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 4aaf7e48394f..c3eb207181fe 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c | |||
@@ -77,6 +77,13 @@ void save_stack_trace(struct stack_trace *trace) | |||
77 | } | 77 | } |
78 | EXPORT_SYMBOL_GPL(save_stack_trace); | 78 | EXPORT_SYMBOL_GPL(save_stack_trace); |
79 | 79 | ||
80 | void save_stack_trace_bp(struct stack_trace *trace, unsigned long bp) | ||
81 | { | ||
82 | dump_trace(current, NULL, NULL, bp, &save_stack_ops, trace); | ||
83 | if (trace->nr_entries < trace->max_entries) | ||
84 | trace->entries[trace->nr_entries++] = ULONG_MAX; | ||
85 | } | ||
86 | |||
80 | void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) | 87 | void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) |
81 | { | 88 | { |
82 | dump_trace(tsk, NULL, NULL, 0, &save_stack_ops_nosched, trace); | 89 | dump_trace(tsk, NULL, NULL, 0, &save_stack_ops_nosched, trace); |
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index ff5c8736b491..d51321ddafda 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S | |||
@@ -334,3 +334,5 @@ ENTRY(sys_call_table) | |||
334 | .long sys_inotify_init1 | 334 | .long sys_inotify_init1 |
335 | .long sys_preadv | 335 | .long sys_preadv |
336 | .long sys_pwritev | 336 | .long sys_pwritev |
337 | .long sys_rt_tgsigqueueinfo /* 335 */ | ||
338 | .long sys_perf_counter_open | ||
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index ed0c33761e6d..124d40c575df 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c | |||
@@ -715,7 +715,12 @@ uv_activation_descriptor_init(int node, int pnode) | |||
715 | struct bau_desc *adp; | 715 | struct bau_desc *adp; |
716 | struct bau_desc *ad2; | 716 | struct bau_desc *ad2; |
717 | 717 | ||
718 | adp = (struct bau_desc *)kmalloc_node(16384, GFP_KERNEL, node); | 718 | /* |
719 | * each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR) | ||
720 | * per cpu; and up to 32 (UV_ADP_SIZE) cpu's per blade | ||
721 | */ | ||
722 | adp = (struct bau_desc *)kmalloc_node(sizeof(struct bau_desc)* | ||
723 | UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node); | ||
719 | BUG_ON(!adp); | 724 | BUG_ON(!adp); |
720 | 725 | ||
721 | pa = uv_gpa(adp); /* need the real nasid*/ | 726 | pa = uv_gpa(adp); /* need the real nasid*/ |
@@ -729,7 +734,13 @@ uv_activation_descriptor_init(int node, int pnode) | |||
729 | (n << UV_DESC_BASE_PNODE_SHIFT | m)); | 734 | (n << UV_DESC_BASE_PNODE_SHIFT | m)); |
730 | } | 735 | } |
731 | 736 | ||
732 | for (i = 0, ad2 = adp; i < UV_ACTIVATION_DESCRIPTOR_SIZE; i++, ad2++) { | 737 | /* |
738 | * initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each | ||
739 | * cpu even though we only use the first one; one descriptor can | ||
740 | * describe a broadcast to 256 nodes. | ||
741 | */ | ||
742 | for (i = 0, ad2 = adp; i < (UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR); | ||
743 | i++, ad2++) { | ||
733 | memset(ad2, 0, sizeof(struct bau_desc)); | 744 | memset(ad2, 0, sizeof(struct bau_desc)); |
734 | ad2->header.sw_ack_flag = 1; | 745 | ad2->header.sw_ack_flag = 1; |
735 | /* | 746 | /* |
@@ -832,7 +843,7 @@ static int __init uv_bau_init(void) | |||
832 | return 0; | 843 | return 0; |
833 | 844 | ||
834 | for_each_possible_cpu(cur_cpu) | 845 | for_each_possible_cpu(cur_cpu) |
835 | alloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu), | 846 | zalloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu), |
836 | GFP_KERNEL, cpu_to_node(cur_cpu)); | 847 | GFP_KERNEL, cpu_to_node(cur_cpu)); |
837 | 848 | ||
838 | uv_bau_retry_limit = 1; | 849 | uv_bau_retry_limit = 1; |
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 124a4d5a95b2..286d64eba31b 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c | |||
@@ -45,6 +45,7 @@ | |||
45 | #include <linux/edac.h> | 45 | #include <linux/edac.h> |
46 | #endif | 46 | #endif |
47 | 47 | ||
48 | #include <asm/kmemcheck.h> | ||
48 | #include <asm/stacktrace.h> | 49 | #include <asm/stacktrace.h> |
49 | #include <asm/processor.h> | 50 | #include <asm/processor.h> |
50 | #include <asm/debugreg.h> | 51 | #include <asm/debugreg.h> |
@@ -534,6 +535,10 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) | |||
534 | 535 | ||
535 | get_debugreg(dr6, 6); | 536 | get_debugreg(dr6, 6); |
536 | 537 | ||
538 | /* Catch kmemcheck conditions first of all! */ | ||
539 | if ((dr6 & DR_STEP) && kmemcheck_trap(regs)) | ||
540 | return; | ||
541 | |||
537 | /* DR6 may or may not be cleared by the CPU */ | 542 | /* DR6 may or may not be cleared by the CPU */ |
538 | set_debugreg(0, 6); | 543 | set_debugreg(0, 6); |
539 | /* | 544 | /* |
@@ -777,15 +782,15 @@ unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp) | |||
777 | 782 | ||
778 | return new_kesp; | 783 | return new_kesp; |
779 | } | 784 | } |
780 | #else | 785 | #endif |
786 | |||
781 | asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) | 787 | asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) |
782 | { | 788 | { |
783 | } | 789 | } |
784 | 790 | ||
785 | asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void) | 791 | asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void) |
786 | { | 792 | { |
787 | } | 793 | } |
788 | #endif | ||
789 | 794 | ||
790 | /* | 795 | /* |
791 | * 'math_state_restore()' saves the current math information in the | 796 | * 'math_state_restore()' saves the current math information in the |
@@ -818,9 +823,6 @@ asmlinkage void math_state_restore(void) | |||
818 | } | 823 | } |
819 | 824 | ||
820 | clts(); /* Allow maths ops (or we recurse) */ | 825 | clts(); /* Allow maths ops (or we recurse) */ |
821 | #ifdef CONFIG_X86_32 | ||
822 | restore_fpu(tsk); | ||
823 | #else | ||
824 | /* | 826 | /* |
825 | * Paranoid restore. send a SIGSEGV if we fail to restore the state. | 827 | * Paranoid restore. send a SIGSEGV if we fail to restore the state. |
826 | */ | 828 | */ |
@@ -829,7 +831,7 @@ asmlinkage void math_state_restore(void) | |||
829 | force_sig(SIGSEGV, tsk); | 831 | force_sig(SIGSEGV, tsk); |
830 | return; | 832 | return; |
831 | } | 833 | } |
832 | #endif | 834 | |
833 | thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ | 835 | thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ |
834 | tsk->fpu_counter++; | 836 | tsk->fpu_counter++; |
835 | } | 837 | } |
@@ -924,8 +926,13 @@ void __init trap_init(void) | |||
924 | #endif | 926 | #endif |
925 | set_intr_gate(19, &simd_coprocessor_error); | 927 | set_intr_gate(19, &simd_coprocessor_error); |
926 | 928 | ||
929 | /* Reserve all the builtin and the syscall vector: */ | ||
930 | for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) | ||
931 | set_bit(i, used_vectors); | ||
932 | |||
927 | #ifdef CONFIG_IA32_EMULATION | 933 | #ifdef CONFIG_IA32_EMULATION |
928 | set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall); | 934 | set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall); |
935 | set_bit(IA32_SYSCALL_VECTOR, used_vectors); | ||
929 | #endif | 936 | #endif |
930 | 937 | ||
931 | #ifdef CONFIG_X86_32 | 938 | #ifdef CONFIG_X86_32 |
@@ -942,17 +949,9 @@ void __init trap_init(void) | |||
942 | } | 949 | } |
943 | 950 | ||
944 | set_system_trap_gate(SYSCALL_VECTOR, &system_call); | 951 | set_system_trap_gate(SYSCALL_VECTOR, &system_call); |
945 | #endif | ||
946 | |||
947 | /* Reserve all the builtin and the syscall vector: */ | ||
948 | for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) | ||
949 | set_bit(i, used_vectors); | ||
950 | |||
951 | #ifdef CONFIG_X86_64 | ||
952 | set_bit(IA32_SYSCALL_VECTOR, used_vectors); | ||
953 | #else | ||
954 | set_bit(SYSCALL_VECTOR, used_vectors); | 952 | set_bit(SYSCALL_VECTOR, used_vectors); |
955 | #endif | 953 | #endif |
954 | |||
956 | /* | 955 | /* |
957 | * Should be a barrier for any external CPU state: | 956 | * Should be a barrier for any external CPU state: |
958 | */ | 957 | */ |
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index d57de05dc430..ae3180c506a6 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c | |||
@@ -9,6 +9,7 @@ | |||
9 | #include <linux/delay.h> | 9 | #include <linux/delay.h> |
10 | #include <linux/clocksource.h> | 10 | #include <linux/clocksource.h> |
11 | #include <linux/percpu.h> | 11 | #include <linux/percpu.h> |
12 | #include <linux/timex.h> | ||
12 | 13 | ||
13 | #include <asm/hpet.h> | 14 | #include <asm/hpet.h> |
14 | #include <asm/timer.h> | 15 | #include <asm/timer.h> |
@@ -384,13 +385,13 @@ unsigned long native_calibrate_tsc(void) | |||
384 | { | 385 | { |
385 | u64 tsc1, tsc2, delta, ref1, ref2; | 386 | u64 tsc1, tsc2, delta, ref1, ref2; |
386 | unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX; | 387 | unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX; |
387 | unsigned long flags, latch, ms, fast_calibrate, tsc_khz; | 388 | unsigned long flags, latch, ms, fast_calibrate, hv_tsc_khz; |
388 | int hpet = is_hpet_enabled(), i, loopmin; | 389 | int hpet = is_hpet_enabled(), i, loopmin; |
389 | 390 | ||
390 | tsc_khz = get_hypervisor_tsc_freq(); | 391 | hv_tsc_khz = get_hypervisor_tsc_freq(); |
391 | if (tsc_khz) { | 392 | if (hv_tsc_khz) { |
392 | printk(KERN_INFO "TSC: Frequency read from the hypervisor\n"); | 393 | printk(KERN_INFO "TSC: Frequency read from the hypervisor\n"); |
393 | return tsc_khz; | 394 | return hv_tsc_khz; |
394 | } | 395 | } |
395 | 396 | ||
396 | local_irq_save(flags); | 397 | local_irq_save(flags); |
@@ -710,7 +711,16 @@ static cycle_t read_tsc(struct clocksource *cs) | |||
710 | #ifdef CONFIG_X86_64 | 711 | #ifdef CONFIG_X86_64 |
711 | static cycle_t __vsyscall_fn vread_tsc(void) | 712 | static cycle_t __vsyscall_fn vread_tsc(void) |
712 | { | 713 | { |
713 | cycle_t ret = (cycle_t)vget_cycles(); | 714 | cycle_t ret; |
715 | |||
716 | /* | ||
717 | * Surround the RDTSC by barriers, to make sure it's not | ||
718 | * speculated to outside the seqlock critical section and | ||
719 | * does not cause time warps: | ||
720 | */ | ||
721 | rdtsc_barrier(); | ||
722 | ret = (cycle_t)vget_cycles(); | ||
723 | rdtsc_barrier(); | ||
714 | 724 | ||
715 | return ret >= __vsyscall_gtod_data.clock.cycle_last ? | 725 | return ret >= __vsyscall_gtod_data.clock.cycle_last ? |
716 | ret : __vsyscall_gtod_data.clock.cycle_last; | 726 | ret : __vsyscall_gtod_data.clock.cycle_last; |
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index bf36328f6ef9..027b5b498993 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c | |||
@@ -34,6 +34,7 @@ static __cpuinitdata atomic_t stop_count; | |||
34 | * of a critical section, to be able to prove TSC time-warps: | 34 | * of a critical section, to be able to prove TSC time-warps: |
35 | */ | 35 | */ |
36 | static __cpuinitdata raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED; | 36 | static __cpuinitdata raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED; |
37 | |||
37 | static __cpuinitdata cycles_t last_tsc; | 38 | static __cpuinitdata cycles_t last_tsc; |
38 | static __cpuinitdata cycles_t max_warp; | 39 | static __cpuinitdata cycles_t max_warp; |
39 | static __cpuinitdata int nr_warps; | 40 | static __cpuinitdata int nr_warps; |
@@ -113,13 +114,12 @@ void __cpuinit check_tsc_sync_source(int cpu) | |||
113 | return; | 114 | return; |
114 | 115 | ||
115 | if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) { | 116 | if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) { |
116 | printk(KERN_INFO | 117 | pr_info("Skipping synchronization checks as TSC is reliable.\n"); |
117 | "Skipping synchronization checks as TSC is reliable.\n"); | ||
118 | return; | 118 | return; |
119 | } | 119 | } |
120 | 120 | ||
121 | printk(KERN_INFO "checking TSC synchronization [CPU#%d -> CPU#%d]:", | 121 | pr_info("checking TSC synchronization [CPU#%d -> CPU#%d]:", |
122 | smp_processor_id(), cpu); | 122 | smp_processor_id(), cpu); |
123 | 123 | ||
124 | /* | 124 | /* |
125 | * Reset it - in case this is a second bootup: | 125 | * Reset it - in case this is a second bootup: |
@@ -143,8 +143,8 @@ void __cpuinit check_tsc_sync_source(int cpu) | |||
143 | 143 | ||
144 | if (nr_warps) { | 144 | if (nr_warps) { |
145 | printk("\n"); | 145 | printk("\n"); |
146 | printk(KERN_WARNING "Measured %Ld cycles TSC warp between CPUs," | 146 | pr_warning("Measured %Ld cycles TSC warp between CPUs, " |
147 | " turning off TSC clock.\n", max_warp); | 147 | "turning off TSC clock.\n", max_warp); |
148 | mark_tsc_unstable("check_tsc_sync_source failed"); | 148 | mark_tsc_unstable("check_tsc_sync_source failed"); |
149 | } else { | 149 | } else { |
150 | printk(" passed.\n"); | 150 | printk(" passed.\n"); |
@@ -195,5 +195,3 @@ void __cpuinit check_tsc_sync_target(void) | |||
195 | while (atomic_read(&stop_count) != cpus) | 195 | while (atomic_read(&stop_count) != cpus) |
196 | cpu_relax(); | 196 | cpu_relax(); |
197 | } | 197 | } |
198 | #undef NR_LOOPS | ||
199 | |||
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index d7ac84e7fc1c..9c4e62539058 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c | |||
@@ -287,10 +287,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk | |||
287 | info->regs.pt.ds = 0; | 287 | info->regs.pt.ds = 0; |
288 | info->regs.pt.es = 0; | 288 | info->regs.pt.es = 0; |
289 | info->regs.pt.fs = 0; | 289 | info->regs.pt.fs = 0; |
290 | 290 | #ifndef CONFIG_X86_32_LAZY_GS | |
291 | /* we are clearing gs later just before "jmp resume_userspace", | 291 | info->regs.pt.gs = 0; |
292 | * because it is not saved/restored. | 292 | #endif |
293 | */ | ||
294 | 293 | ||
295 | /* | 294 | /* |
296 | * The flags register is also special: we cannot trust that the user | 295 | * The flags register is also special: we cannot trust that the user |
@@ -318,9 +317,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk | |||
318 | } | 317 | } |
319 | 318 | ||
320 | /* | 319 | /* |
321 | * Save old state, set default return value (%ax) to 0 | 320 | * Save old state, set default return value (%ax) to 0 (VM86_SIGNAL) |
322 | */ | 321 | */ |
323 | info->regs32->ax = 0; | 322 | info->regs32->ax = VM86_SIGNAL; |
324 | tsk->thread.saved_sp0 = tsk->thread.sp0; | 323 | tsk->thread.saved_sp0 = tsk->thread.sp0; |
325 | tsk->thread.saved_fs = info->regs32->fs; | 324 | tsk->thread.saved_fs = info->regs32->fs; |
326 | tsk->thread.saved_gs = get_user_gs(info->regs32); | 325 | tsk->thread.saved_gs = get_user_gs(info->regs32); |
@@ -343,7 +342,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk | |||
343 | __asm__ __volatile__( | 342 | __asm__ __volatile__( |
344 | "movl %0,%%esp\n\t" | 343 | "movl %0,%%esp\n\t" |
345 | "movl %1,%%ebp\n\t" | 344 | "movl %1,%%ebp\n\t" |
345 | #ifdef CONFIG_X86_32_LAZY_GS | ||
346 | "mov %2, %%gs\n\t" | 346 | "mov %2, %%gs\n\t" |
347 | #endif | ||
347 | "jmp resume_userspace" | 348 | "jmp resume_userspace" |
348 | : /* no outputs */ | 349 | : /* no outputs */ |
349 | :"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0)); | 350 | :"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0)); |
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index 95deb9f2211e..b263423fbe2a 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c | |||
@@ -462,22 +462,28 @@ vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip, | |||
462 | } | 462 | } |
463 | #endif | 463 | #endif |
464 | 464 | ||
465 | static void vmi_enter_lazy_cpu(void) | 465 | static void vmi_start_context_switch(struct task_struct *prev) |
466 | { | 466 | { |
467 | paravirt_enter_lazy_cpu(); | 467 | paravirt_start_context_switch(prev); |
468 | vmi_ops.set_lazy_mode(2); | 468 | vmi_ops.set_lazy_mode(2); |
469 | } | 469 | } |
470 | 470 | ||
471 | static void vmi_end_context_switch(struct task_struct *next) | ||
472 | { | ||
473 | vmi_ops.set_lazy_mode(0); | ||
474 | paravirt_end_context_switch(next); | ||
475 | } | ||
476 | |||
471 | static void vmi_enter_lazy_mmu(void) | 477 | static void vmi_enter_lazy_mmu(void) |
472 | { | 478 | { |
473 | paravirt_enter_lazy_mmu(); | 479 | paravirt_enter_lazy_mmu(); |
474 | vmi_ops.set_lazy_mode(1); | 480 | vmi_ops.set_lazy_mode(1); |
475 | } | 481 | } |
476 | 482 | ||
477 | static void vmi_leave_lazy(void) | 483 | static void vmi_leave_lazy_mmu(void) |
478 | { | 484 | { |
479 | paravirt_leave_lazy(paravirt_get_lazy_mode()); | ||
480 | vmi_ops.set_lazy_mode(0); | 485 | vmi_ops.set_lazy_mode(0); |
486 | paravirt_leave_lazy_mmu(); | ||
481 | } | 487 | } |
482 | 488 | ||
483 | static inline int __init check_vmi_rom(struct vrom_header *rom) | 489 | static inline int __init check_vmi_rom(struct vrom_header *rom) |
@@ -711,14 +717,14 @@ static inline int __init activate_vmi(void) | |||
711 | para_fill(pv_cpu_ops.set_iopl_mask, SetIOPLMask); | 717 | para_fill(pv_cpu_ops.set_iopl_mask, SetIOPLMask); |
712 | para_fill(pv_cpu_ops.io_delay, IODelay); | 718 | para_fill(pv_cpu_ops.io_delay, IODelay); |
713 | 719 | ||
714 | para_wrap(pv_cpu_ops.lazy_mode.enter, vmi_enter_lazy_cpu, | 720 | para_wrap(pv_cpu_ops.start_context_switch, vmi_start_context_switch, |
715 | set_lazy_mode, SetLazyMode); | 721 | set_lazy_mode, SetLazyMode); |
716 | para_wrap(pv_cpu_ops.lazy_mode.leave, vmi_leave_lazy, | 722 | para_wrap(pv_cpu_ops.end_context_switch, vmi_end_context_switch, |
717 | set_lazy_mode, SetLazyMode); | 723 | set_lazy_mode, SetLazyMode); |
718 | 724 | ||
719 | para_wrap(pv_mmu_ops.lazy_mode.enter, vmi_enter_lazy_mmu, | 725 | para_wrap(pv_mmu_ops.lazy_mode.enter, vmi_enter_lazy_mmu, |
720 | set_lazy_mode, SetLazyMode); | 726 | set_lazy_mode, SetLazyMode); |
721 | para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy, | 727 | para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy_mmu, |
722 | set_lazy_mode, SetLazyMode); | 728 | set_lazy_mode, SetLazyMode); |
723 | 729 | ||
724 | /* user and kernel flush are just handled with different flags to FlushTLB */ | 730 | /* user and kernel flush are just handled with different flags to FlushTLB */ |
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 849ee611f013..367e87882041 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S | |||
@@ -1,5 +1,433 @@ | |||
1 | /* | ||
2 | * ld script for the x86 kernel | ||
3 | * | ||
4 | * Historic 32-bit version written by Martin Mares <mj@atrey.karlin.mff.cuni.cz> | ||
5 | * | ||
6 | * Modernisation, unification and other changes and fixes: | ||
7 | * Copyright (C) 2007-2009 Sam Ravnborg <sam@ravnborg.org> | ||
8 | * | ||
9 | * | ||
10 | * Don't define absolute symbols until and unless you know that symbol | ||
11 | * value should remain constant even if kernel image is relocated | ||
12 | * at run time. Absolute symbols are not relocated. If symbol value should | ||
13 | * change if kernel is relocated, make the symbol section relative and | ||
14 | * put it inside the section definition. | ||
15 | */ | ||
16 | |||
1 | #ifdef CONFIG_X86_32 | 17 | #ifdef CONFIG_X86_32 |
2 | # include "vmlinux_32.lds.S" | 18 | #define LOAD_OFFSET __PAGE_OFFSET |
3 | #else | 19 | #else |
4 | # include "vmlinux_64.lds.S" | 20 | #define LOAD_OFFSET __START_KERNEL_map |
5 | #endif | 21 | #endif |
22 | |||
23 | #include <asm-generic/vmlinux.lds.h> | ||
24 | #include <asm/asm-offsets.h> | ||
25 | #include <asm/thread_info.h> | ||
26 | #include <asm/page_types.h> | ||
27 | #include <asm/cache.h> | ||
28 | #include <asm/boot.h> | ||
29 | |||
30 | #undef i386 /* in case the preprocessor is a 32bit one */ | ||
31 | |||
32 | OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT) | ||
33 | |||
34 | #ifdef CONFIG_X86_32 | ||
35 | OUTPUT_ARCH(i386) | ||
36 | ENTRY(phys_startup_32) | ||
37 | jiffies = jiffies_64; | ||
38 | #else | ||
39 | OUTPUT_ARCH(i386:x86-64) | ||
40 | ENTRY(phys_startup_64) | ||
41 | jiffies_64 = jiffies; | ||
42 | #endif | ||
43 | |||
44 | PHDRS { | ||
45 | text PT_LOAD FLAGS(5); /* R_E */ | ||
46 | data PT_LOAD FLAGS(7); /* RWE */ | ||
47 | #ifdef CONFIG_X86_64 | ||
48 | user PT_LOAD FLAGS(7); /* RWE */ | ||
49 | data.init PT_LOAD FLAGS(7); /* RWE */ | ||
50 | #ifdef CONFIG_SMP | ||
51 | percpu PT_LOAD FLAGS(7); /* RWE */ | ||
52 | #endif | ||
53 | data.init2 PT_LOAD FLAGS(7); /* RWE */ | ||
54 | #endif | ||
55 | note PT_NOTE FLAGS(0); /* ___ */ | ||
56 | } | ||
57 | |||
58 | SECTIONS | ||
59 | { | ||
60 | #ifdef CONFIG_X86_32 | ||
61 | . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR; | ||
62 | phys_startup_32 = startup_32 - LOAD_OFFSET; | ||
63 | #else | ||
64 | . = __START_KERNEL; | ||
65 | phys_startup_64 = startup_64 - LOAD_OFFSET; | ||
66 | #endif | ||
67 | |||
68 | /* Text and read-only data */ | ||
69 | |||
70 | /* bootstrapping code */ | ||
71 | .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) { | ||
72 | _text = .; | ||
73 | *(.text.head) | ||
74 | } :text = 0x9090 | ||
75 | |||
76 | /* The rest of the text */ | ||
77 | .text : AT(ADDR(.text) - LOAD_OFFSET) { | ||
78 | #ifdef CONFIG_X86_32 | ||
79 | /* not really needed, already page aligned */ | ||
80 | . = ALIGN(PAGE_SIZE); | ||
81 | *(.text.page_aligned) | ||
82 | #endif | ||
83 | . = ALIGN(8); | ||
84 | _stext = .; | ||
85 | TEXT_TEXT | ||
86 | SCHED_TEXT | ||
87 | LOCK_TEXT | ||
88 | KPROBES_TEXT | ||
89 | IRQENTRY_TEXT | ||
90 | *(.fixup) | ||
91 | *(.gnu.warning) | ||
92 | /* End of text section */ | ||
93 | _etext = .; | ||
94 | } :text = 0x9090 | ||
95 | |||
96 | NOTES :text :note | ||
97 | |||
98 | /* Exception table */ | ||
99 | . = ALIGN(16); | ||
100 | __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { | ||
101 | __start___ex_table = .; | ||
102 | *(__ex_table) | ||
103 | __stop___ex_table = .; | ||
104 | } :text = 0x9090 | ||
105 | |||
106 | RODATA | ||
107 | |||
108 | /* Data */ | ||
109 | . = ALIGN(PAGE_SIZE); | ||
110 | .data : AT(ADDR(.data) - LOAD_OFFSET) { | ||
111 | /* Start of data section */ | ||
112 | _sdata = .; | ||
113 | DATA_DATA | ||
114 | CONSTRUCTORS | ||
115 | |||
116 | #ifdef CONFIG_X86_64 | ||
117 | /* End of data section */ | ||
118 | _edata = .; | ||
119 | #endif | ||
120 | } :data | ||
121 | |||
122 | #ifdef CONFIG_X86_32 | ||
123 | /* 32 bit has nosave before _edata */ | ||
124 | . = ALIGN(PAGE_SIZE); | ||
125 | .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { | ||
126 | __nosave_begin = .; | ||
127 | *(.data.nosave) | ||
128 | . = ALIGN(PAGE_SIZE); | ||
129 | __nosave_end = .; | ||
130 | } | ||
131 | #endif | ||
132 | |||
133 | . = ALIGN(PAGE_SIZE); | ||
134 | .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { | ||
135 | *(.data.page_aligned) | ||
136 | *(.data.idt) | ||
137 | } | ||
138 | |||
139 | #ifdef CONFIG_X86_32 | ||
140 | . = ALIGN(32); | ||
141 | #else | ||
142 | . = ALIGN(PAGE_SIZE); | ||
143 | . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); | ||
144 | #endif | ||
145 | .data.cacheline_aligned : | ||
146 | AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { | ||
147 | *(.data.cacheline_aligned) | ||
148 | } | ||
149 | |||
150 | /* rarely changed data like cpu maps */ | ||
151 | #ifdef CONFIG_X86_32 | ||
152 | . = ALIGN(32); | ||
153 | #else | ||
154 | . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES); | ||
155 | #endif | ||
156 | .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { | ||
157 | *(.data.read_mostly) | ||
158 | |||
159 | #ifdef CONFIG_X86_32 | ||
160 | /* End of data section */ | ||
161 | _edata = .; | ||
162 | #endif | ||
163 | } | ||
164 | |||
165 | #ifdef CONFIG_X86_64 | ||
166 | |||
167 | #define VSYSCALL_ADDR (-10*1024*1024) | ||
168 | #define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + \ | ||
169 | SIZEOF(.data.read_mostly) + 4095) & ~(4095)) | ||
170 | #define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + \ | ||
171 | SIZEOF(.data.read_mostly) + 4095) & ~(4095)) | ||
172 | |||
173 | #define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR) | ||
174 | #define VLOAD(x) (ADDR(x) - VLOAD_OFFSET) | ||
175 | |||
176 | #define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR) | ||
177 | #define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) | ||
178 | |||
179 | . = VSYSCALL_ADDR; | ||
180 | .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { | ||
181 | *(.vsyscall_0) | ||
182 | } :user | ||
183 | |||
184 | __vsyscall_0 = VSYSCALL_VIRT_ADDR; | ||
185 | |||
186 | . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); | ||
187 | .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { | ||
188 | *(.vsyscall_fn) | ||
189 | } | ||
190 | |||
191 | . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); | ||
192 | .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) { | ||
193 | *(.vsyscall_gtod_data) | ||
194 | } | ||
195 | |||
196 | vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data); | ||
197 | .vsyscall_clock : AT(VLOAD(.vsyscall_clock)) { | ||
198 | *(.vsyscall_clock) | ||
199 | } | ||
200 | vsyscall_clock = VVIRT(.vsyscall_clock); | ||
201 | |||
202 | |||
203 | .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) { | ||
204 | *(.vsyscall_1) | ||
205 | } | ||
206 | .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) { | ||
207 | *(.vsyscall_2) | ||
208 | } | ||
209 | |||
210 | .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { | ||
211 | *(.vgetcpu_mode) | ||
212 | } | ||
213 | vgetcpu_mode = VVIRT(.vgetcpu_mode); | ||
214 | |||
215 | . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); | ||
216 | .jiffies : AT(VLOAD(.jiffies)) { | ||
217 | *(.jiffies) | ||
218 | } | ||
219 | jiffies = VVIRT(.jiffies); | ||
220 | |||
221 | .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) { | ||
222 | *(.vsyscall_3) | ||
223 | } | ||
224 | |||
225 | . = VSYSCALL_VIRT_ADDR + PAGE_SIZE; | ||
226 | |||
227 | #undef VSYSCALL_ADDR | ||
228 | #undef VSYSCALL_PHYS_ADDR | ||
229 | #undef VSYSCALL_VIRT_ADDR | ||
230 | #undef VLOAD_OFFSET | ||
231 | #undef VLOAD | ||
232 | #undef VVIRT_OFFSET | ||
233 | #undef VVIRT | ||
234 | |||
235 | #endif /* CONFIG_X86_64 */ | ||
236 | |||
237 | /* init_task */ | ||
238 | . = ALIGN(THREAD_SIZE); | ||
239 | .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { | ||
240 | *(.data.init_task) | ||
241 | } | ||
242 | #ifdef CONFIG_X86_64 | ||
243 | :data.init | ||
244 | #endif | ||
245 | |||
246 | /* | ||
247 | * smp_locks might be freed after init | ||
248 | * start/end must be page aligned | ||
249 | */ | ||
250 | . = ALIGN(PAGE_SIZE); | ||
251 | .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { | ||
252 | __smp_locks = .; | ||
253 | *(.smp_locks) | ||
254 | __smp_locks_end = .; | ||
255 | . = ALIGN(PAGE_SIZE); | ||
256 | } | ||
257 | |||
258 | /* Init code and data - will be freed after init */ | ||
259 | . = ALIGN(PAGE_SIZE); | ||
260 | .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { | ||
261 | __init_begin = .; /* paired with __init_end */ | ||
262 | _sinittext = .; | ||
263 | INIT_TEXT | ||
264 | _einittext = .; | ||
265 | } | ||
266 | |||
267 | .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { | ||
268 | INIT_DATA | ||
269 | } | ||
270 | |||
271 | . = ALIGN(16); | ||
272 | .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { | ||
273 | __setup_start = .; | ||
274 | *(.init.setup) | ||
275 | __setup_end = .; | ||
276 | } | ||
277 | .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { | ||
278 | __initcall_start = .; | ||
279 | INITCALLS | ||
280 | __initcall_end = .; | ||
281 | } | ||
282 | |||
283 | .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { | ||
284 | __con_initcall_start = .; | ||
285 | *(.con_initcall.init) | ||
286 | __con_initcall_end = .; | ||
287 | } | ||
288 | |||
289 | .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { | ||
290 | __x86_cpu_dev_start = .; | ||
291 | *(.x86_cpu_dev.init) | ||
292 | __x86_cpu_dev_end = .; | ||
293 | } | ||
294 | |||
295 | SECURITY_INIT | ||
296 | |||
297 | . = ALIGN(8); | ||
298 | .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { | ||
299 | __parainstructions = .; | ||
300 | *(.parainstructions) | ||
301 | __parainstructions_end = .; | ||
302 | } | ||
303 | |||
304 | . = ALIGN(8); | ||
305 | .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { | ||
306 | __alt_instructions = .; | ||
307 | *(.altinstructions) | ||
308 | __alt_instructions_end = .; | ||
309 | } | ||
310 | |||
311 | .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { | ||
312 | *(.altinstr_replacement) | ||
313 | } | ||
314 | |||
315 | /* | ||
316 | * .exit.text is discarded at runtime, not link time, to deal with | ||
317 | * references from .altinstructions and .eh_frame | ||
318 | */ | ||
319 | .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { | ||
320 | EXIT_TEXT | ||
321 | } | ||
322 | |||
323 | .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { | ||
324 | EXIT_DATA | ||
325 | } | ||
326 | |||
327 | #ifdef CONFIG_BLK_DEV_INITRD | ||
328 | . = ALIGN(PAGE_SIZE); | ||
329 | .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { | ||
330 | __initramfs_start = .; | ||
331 | *(.init.ramfs) | ||
332 | __initramfs_end = .; | ||
333 | } | ||
334 | #endif | ||
335 | |||
336 | #if defined(CONFIG_X86_64) && defined(CONFIG_SMP) | ||
337 | /* | ||
338 | * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the | ||
339 | * output PHDR, so the next output section - .data_nosave - should | ||
340 | * start another segment - data.init2. Also, pda should be at the head of | ||
341 | * percpu area. Preallocate it and define the percpu offset symbol | ||
342 | * so that it can be accessed as a percpu variable. | ||
343 | */ | ||
344 | . = ALIGN(PAGE_SIZE); | ||
345 | PERCPU_VADDR(0, :percpu) | ||
346 | #else | ||
347 | PERCPU(PAGE_SIZE) | ||
348 | #endif | ||
349 | |||
350 | . = ALIGN(PAGE_SIZE); | ||
351 | |||
352 | /* freed after init ends here */ | ||
353 | .init.end : AT(ADDR(.init.end) - LOAD_OFFSET) { | ||
354 | __init_end = .; | ||
355 | } | ||
356 | |||
357 | #ifdef CONFIG_X86_64 | ||
358 | .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { | ||
359 | . = ALIGN(PAGE_SIZE); | ||
360 | __nosave_begin = .; | ||
361 | *(.data.nosave) | ||
362 | . = ALIGN(PAGE_SIZE); | ||
363 | __nosave_end = .; | ||
364 | } :data.init2 | ||
365 | /* use another segment data.init2, see PERCPU_VADDR() above */ | ||
366 | #endif | ||
367 | |||
368 | /* BSS */ | ||
369 | . = ALIGN(PAGE_SIZE); | ||
370 | .bss : AT(ADDR(.bss) - LOAD_OFFSET) { | ||
371 | __bss_start = .; | ||
372 | *(.bss.page_aligned) | ||
373 | *(.bss) | ||
374 | . = ALIGN(4); | ||
375 | __bss_stop = .; | ||
376 | } | ||
377 | |||
378 | . = ALIGN(PAGE_SIZE); | ||
379 | .brk : AT(ADDR(.brk) - LOAD_OFFSET) { | ||
380 | __brk_base = .; | ||
381 | . += 64 * 1024; /* 64k alignment slop space */ | ||
382 | *(.brk_reservation) /* areas brk users have reserved */ | ||
383 | __brk_limit = .; | ||
384 | } | ||
385 | |||
386 | .end : AT(ADDR(.end) - LOAD_OFFSET) { | ||
387 | _end = .; | ||
388 | } | ||
389 | |||
390 | /* Sections to be discarded */ | ||
391 | /DISCARD/ : { | ||
392 | *(.exitcall.exit) | ||
393 | *(.eh_frame) | ||
394 | *(.discard) | ||
395 | } | ||
396 | |||
397 | STABS_DEBUG | ||
398 | DWARF_DEBUG | ||
399 | } | ||
400 | |||
401 | |||
402 | #ifdef CONFIG_X86_32 | ||
403 | ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), | ||
404 | "kernel image bigger than KERNEL_IMAGE_SIZE") | ||
405 | #else | ||
406 | /* | ||
407 | * Per-cpu symbols which need to be offset from __per_cpu_load | ||
408 | * for the boot processor. | ||
409 | */ | ||
410 | #define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load | ||
411 | INIT_PER_CPU(gdt_page); | ||
412 | INIT_PER_CPU(irq_stack_union); | ||
413 | |||
414 | /* | ||
415 | * Build-time check on the image size: | ||
416 | */ | ||
417 | ASSERT((_end - _text <= KERNEL_IMAGE_SIZE), | ||
418 | "kernel image bigger than KERNEL_IMAGE_SIZE") | ||
419 | |||
420 | #ifdef CONFIG_SMP | ||
421 | ASSERT((per_cpu__irq_stack_union == 0), | ||
422 | "irq_stack_union is not at start of per-cpu area"); | ||
423 | #endif | ||
424 | |||
425 | #endif /* CONFIG_X86_32 */ | ||
426 | |||
427 | #ifdef CONFIG_KEXEC | ||
428 | #include <asm/kexec.h> | ||
429 | |||
430 | ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE, | ||
431 | "kexec control code size is too big") | ||
432 | #endif | ||
433 | |||
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S deleted file mode 100644 index 62ad500d55f3..000000000000 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ /dev/null | |||
@@ -1,229 +0,0 @@ | |||
1 | /* ld script to make i386 Linux kernel | ||
2 | * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>; | ||
3 | * | ||
4 | * Don't define absolute symbols until and unless you know that symbol | ||
5 | * value is should remain constant even if kernel image is relocated | ||
6 | * at run time. Absolute symbols are not relocated. If symbol value should | ||
7 | * change if kernel is relocated, make the symbol section relative and | ||
8 | * put it inside the section definition. | ||
9 | */ | ||
10 | |||
11 | #define LOAD_OFFSET __PAGE_OFFSET | ||
12 | |||
13 | #include <asm-generic/vmlinux.lds.h> | ||
14 | #include <asm/thread_info.h> | ||
15 | #include <asm/page_types.h> | ||
16 | #include <asm/cache.h> | ||
17 | #include <asm/boot.h> | ||
18 | |||
19 | OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386") | ||
20 | OUTPUT_ARCH(i386) | ||
21 | ENTRY(phys_startup_32) | ||
22 | jiffies = jiffies_64; | ||
23 | |||
24 | PHDRS { | ||
25 | text PT_LOAD FLAGS(5); /* R_E */ | ||
26 | data PT_LOAD FLAGS(7); /* RWE */ | ||
27 | note PT_NOTE FLAGS(0); /* ___ */ | ||
28 | } | ||
29 | SECTIONS | ||
30 | { | ||
31 | . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR; | ||
32 | phys_startup_32 = startup_32 - LOAD_OFFSET; | ||
33 | |||
34 | .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) { | ||
35 | _text = .; /* Text and read-only data */ | ||
36 | *(.text.head) | ||
37 | } :text = 0x9090 | ||
38 | |||
39 | /* read-only */ | ||
40 | .text : AT(ADDR(.text) - LOAD_OFFSET) { | ||
41 | . = ALIGN(PAGE_SIZE); /* not really needed, already page aligned */ | ||
42 | *(.text.page_aligned) | ||
43 | TEXT_TEXT | ||
44 | SCHED_TEXT | ||
45 | LOCK_TEXT | ||
46 | KPROBES_TEXT | ||
47 | IRQENTRY_TEXT | ||
48 | *(.fixup) | ||
49 | *(.gnu.warning) | ||
50 | _etext = .; /* End of text section */ | ||
51 | } :text = 0x9090 | ||
52 | |||
53 | NOTES :text :note | ||
54 | |||
55 | . = ALIGN(16); /* Exception table */ | ||
56 | __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { | ||
57 | __start___ex_table = .; | ||
58 | *(__ex_table) | ||
59 | __stop___ex_table = .; | ||
60 | } :text = 0x9090 | ||
61 | |||
62 | RODATA | ||
63 | |||
64 | /* writeable */ | ||
65 | . = ALIGN(PAGE_SIZE); | ||
66 | .data : AT(ADDR(.data) - LOAD_OFFSET) { /* Data */ | ||
67 | DATA_DATA | ||
68 | CONSTRUCTORS | ||
69 | } :data | ||
70 | |||
71 | . = ALIGN(PAGE_SIZE); | ||
72 | .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { | ||
73 | __nosave_begin = .; | ||
74 | *(.data.nosave) | ||
75 | . = ALIGN(PAGE_SIZE); | ||
76 | __nosave_end = .; | ||
77 | } | ||
78 | |||
79 | . = ALIGN(PAGE_SIZE); | ||
80 | .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { | ||
81 | *(.data.page_aligned) | ||
82 | *(.data.idt) | ||
83 | } | ||
84 | |||
85 | . = ALIGN(32); | ||
86 | .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { | ||
87 | *(.data.cacheline_aligned) | ||
88 | } | ||
89 | |||
90 | /* rarely changed data like cpu maps */ | ||
91 | . = ALIGN(32); | ||
92 | .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { | ||
93 | *(.data.read_mostly) | ||
94 | _edata = .; /* End of data section */ | ||
95 | } | ||
96 | |||
97 | . = ALIGN(THREAD_SIZE); /* init_task */ | ||
98 | .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { | ||
99 | *(.data.init_task) | ||
100 | } | ||
101 | |||
102 | /* might get freed after init */ | ||
103 | . = ALIGN(PAGE_SIZE); | ||
104 | .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { | ||
105 | __smp_locks = .; | ||
106 | *(.smp_locks) | ||
107 | __smp_locks_end = .; | ||
108 | } | ||
109 | /* will be freed after init | ||
110 | * Following ALIGN() is required to make sure no other data falls on the | ||
111 | * same page where __smp_alt_end is pointing as that page might be freed | ||
112 | * after boot. Always make sure that ALIGN() directive is present after | ||
113 | * the section which contains __smp_alt_end. | ||
114 | */ | ||
115 | . = ALIGN(PAGE_SIZE); | ||
116 | |||
117 | /* will be freed after init */ | ||
118 | . = ALIGN(PAGE_SIZE); /* Init code and data */ | ||
119 | .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { | ||
120 | __init_begin = .; | ||
121 | _sinittext = .; | ||
122 | INIT_TEXT | ||
123 | _einittext = .; | ||
124 | } | ||
125 | .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { | ||
126 | INIT_DATA | ||
127 | } | ||
128 | . = ALIGN(16); | ||
129 | .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { | ||
130 | __setup_start = .; | ||
131 | *(.init.setup) | ||
132 | __setup_end = .; | ||
133 | } | ||
134 | .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { | ||
135 | __initcall_start = .; | ||
136 | INITCALLS | ||
137 | __initcall_end = .; | ||
138 | } | ||
139 | .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { | ||
140 | __con_initcall_start = .; | ||
141 | *(.con_initcall.init) | ||
142 | __con_initcall_end = .; | ||
143 | } | ||
144 | .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { | ||
145 | __x86_cpu_dev_start = .; | ||
146 | *(.x86_cpu_dev.init) | ||
147 | __x86_cpu_dev_end = .; | ||
148 | } | ||
149 | SECURITY_INIT | ||
150 | . = ALIGN(4); | ||
151 | .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { | ||
152 | __alt_instructions = .; | ||
153 | *(.altinstructions) | ||
154 | __alt_instructions_end = .; | ||
155 | } | ||
156 | .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { | ||
157 | *(.altinstr_replacement) | ||
158 | } | ||
159 | . = ALIGN(4); | ||
160 | .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { | ||
161 | __parainstructions = .; | ||
162 | *(.parainstructions) | ||
163 | __parainstructions_end = .; | ||
164 | } | ||
165 | /* .exit.text is discard at runtime, not link time, to deal with references | ||
166 | from .altinstructions and .eh_frame */ | ||
167 | .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { | ||
168 | EXIT_TEXT | ||
169 | } | ||
170 | .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { | ||
171 | EXIT_DATA | ||
172 | } | ||
173 | #if defined(CONFIG_BLK_DEV_INITRD) | ||
174 | . = ALIGN(PAGE_SIZE); | ||
175 | .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { | ||
176 | __initramfs_start = .; | ||
177 | *(.init.ramfs) | ||
178 | __initramfs_end = .; | ||
179 | } | ||
180 | #endif | ||
181 | PERCPU(PAGE_SIZE) | ||
182 | . = ALIGN(PAGE_SIZE); | ||
183 | /* freed after init ends here */ | ||
184 | |||
185 | .bss : AT(ADDR(.bss) - LOAD_OFFSET) { | ||
186 | __init_end = .; | ||
187 | __bss_start = .; /* BSS */ | ||
188 | *(.bss.page_aligned) | ||
189 | *(.bss) | ||
190 | . = ALIGN(4); | ||
191 | __bss_stop = .; | ||
192 | } | ||
193 | |||
194 | .brk : AT(ADDR(.brk) - LOAD_OFFSET) { | ||
195 | . = ALIGN(PAGE_SIZE); | ||
196 | __brk_base = . ; | ||
197 | . += 64 * 1024 ; /* 64k alignment slop space */ | ||
198 | *(.brk_reservation) /* areas brk users have reserved */ | ||
199 | __brk_limit = . ; | ||
200 | } | ||
201 | |||
202 | .end : AT(ADDR(.end) - LOAD_OFFSET) { | ||
203 | _end = . ; | ||
204 | } | ||
205 | |||
206 | /* Sections to be discarded */ | ||
207 | /DISCARD/ : { | ||
208 | *(.exitcall.exit) | ||
209 | *(.discard) | ||
210 | } | ||
211 | |||
212 | STABS_DEBUG | ||
213 | |||
214 | DWARF_DEBUG | ||
215 | } | ||
216 | |||
217 | /* | ||
218 | * Build-time check on the image size: | ||
219 | */ | ||
220 | ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), | ||
221 | "kernel image bigger than KERNEL_IMAGE_SIZE") | ||
222 | |||
223 | #ifdef CONFIG_KEXEC | ||
224 | /* Link time checks */ | ||
225 | #include <asm/kexec.h> | ||
226 | |||
227 | ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE, | ||
228 | "kexec control code size is too big") | ||
229 | #endif | ||
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S deleted file mode 100644 index c8742507b030..000000000000 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ /dev/null | |||
@@ -1,298 +0,0 @@ | |||
1 | /* ld script to make x86-64 Linux kernel | ||
2 | * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>; | ||
3 | */ | ||
4 | |||
5 | #define LOAD_OFFSET __START_KERNEL_map | ||
6 | |||
7 | #include <asm-generic/vmlinux.lds.h> | ||
8 | #include <asm/asm-offsets.h> | ||
9 | #include <asm/page_types.h> | ||
10 | |||
11 | #undef i386 /* in case the preprocessor is a 32bit one */ | ||
12 | |||
13 | OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64") | ||
14 | OUTPUT_ARCH(i386:x86-64) | ||
15 | ENTRY(phys_startup_64) | ||
16 | jiffies_64 = jiffies; | ||
17 | PHDRS { | ||
18 | text PT_LOAD FLAGS(5); /* R_E */ | ||
19 | data PT_LOAD FLAGS(7); /* RWE */ | ||
20 | user PT_LOAD FLAGS(7); /* RWE */ | ||
21 | data.init PT_LOAD FLAGS(7); /* RWE */ | ||
22 | #ifdef CONFIG_SMP | ||
23 | percpu PT_LOAD FLAGS(7); /* RWE */ | ||
24 | #endif | ||
25 | data.init2 PT_LOAD FLAGS(7); /* RWE */ | ||
26 | note PT_NOTE FLAGS(0); /* ___ */ | ||
27 | } | ||
28 | SECTIONS | ||
29 | { | ||
30 | . = __START_KERNEL; | ||
31 | phys_startup_64 = startup_64 - LOAD_OFFSET; | ||
32 | .text : AT(ADDR(.text) - LOAD_OFFSET) { | ||
33 | _text = .; /* Text and read-only data */ | ||
34 | /* First the code that has to be first for bootstrapping */ | ||
35 | *(.text.head) | ||
36 | _stext = .; | ||
37 | /* Then the rest */ | ||
38 | TEXT_TEXT | ||
39 | SCHED_TEXT | ||
40 | LOCK_TEXT | ||
41 | KPROBES_TEXT | ||
42 | IRQENTRY_TEXT | ||
43 | *(.fixup) | ||
44 | *(.gnu.warning) | ||
45 | _etext = .; /* End of text section */ | ||
46 | } :text = 0x9090 | ||
47 | |||
48 | NOTES :text :note | ||
49 | |||
50 | . = ALIGN(16); /* Exception table */ | ||
51 | __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { | ||
52 | __start___ex_table = .; | ||
53 | *(__ex_table) | ||
54 | __stop___ex_table = .; | ||
55 | } :text = 0x9090 | ||
56 | |||
57 | RODATA | ||
58 | |||
59 | . = ALIGN(PAGE_SIZE); /* Align data segment to page size boundary */ | ||
60 | /* Data */ | ||
61 | .data : AT(ADDR(.data) - LOAD_OFFSET) { | ||
62 | DATA_DATA | ||
63 | CONSTRUCTORS | ||
64 | _edata = .; /* End of data section */ | ||
65 | } :data | ||
66 | |||
67 | |||
68 | .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { | ||
69 | . = ALIGN(PAGE_SIZE); | ||
70 | . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); | ||
71 | *(.data.cacheline_aligned) | ||
72 | } | ||
73 | . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES); | ||
74 | .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { | ||
75 | *(.data.read_mostly) | ||
76 | } | ||
77 | |||
78 | #define VSYSCALL_ADDR (-10*1024*1024) | ||
79 | #define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095)) | ||
80 | #define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095)) | ||
81 | |||
82 | #define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR) | ||
83 | #define VLOAD(x) (ADDR(x) - VLOAD_OFFSET) | ||
84 | |||
85 | #define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR) | ||
86 | #define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) | ||
87 | |||
88 | . = VSYSCALL_ADDR; | ||
89 | .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { *(.vsyscall_0) } :user | ||
90 | __vsyscall_0 = VSYSCALL_VIRT_ADDR; | ||
91 | |||
92 | . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); | ||
93 | .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { *(.vsyscall_fn) } | ||
94 | . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); | ||
95 | .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) | ||
96 | { *(.vsyscall_gtod_data) } | ||
97 | vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data); | ||
98 | .vsyscall_clock : AT(VLOAD(.vsyscall_clock)) | ||
99 | { *(.vsyscall_clock) } | ||
100 | vsyscall_clock = VVIRT(.vsyscall_clock); | ||
101 | |||
102 | |||
103 | .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) | ||
104 | { *(.vsyscall_1) } | ||
105 | .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) | ||
106 | { *(.vsyscall_2) } | ||
107 | |||
108 | .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { *(.vgetcpu_mode) } | ||
109 | vgetcpu_mode = VVIRT(.vgetcpu_mode); | ||
110 | |||
111 | . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); | ||
112 | .jiffies : AT(VLOAD(.jiffies)) { *(.jiffies) } | ||
113 | jiffies = VVIRT(.jiffies); | ||
114 | |||
115 | .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) | ||
116 | { *(.vsyscall_3) } | ||
117 | |||
118 | . = VSYSCALL_VIRT_ADDR + PAGE_SIZE; | ||
119 | |||
120 | #undef VSYSCALL_ADDR | ||
121 | #undef VSYSCALL_PHYS_ADDR | ||
122 | #undef VSYSCALL_VIRT_ADDR | ||
123 | #undef VLOAD_OFFSET | ||
124 | #undef VLOAD | ||
125 | #undef VVIRT_OFFSET | ||
126 | #undef VVIRT | ||
127 | |||
128 | .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { | ||
129 | . = ALIGN(THREAD_SIZE); /* init_task */ | ||
130 | *(.data.init_task) | ||
131 | }:data.init | ||
132 | |||
133 | .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { | ||
134 | . = ALIGN(PAGE_SIZE); | ||
135 | *(.data.page_aligned) | ||
136 | } | ||
137 | |||
138 | .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { | ||
139 | /* might get freed after init */ | ||
140 | . = ALIGN(PAGE_SIZE); | ||
141 | __smp_alt_begin = .; | ||
142 | __smp_locks = .; | ||
143 | *(.smp_locks) | ||
144 | __smp_locks_end = .; | ||
145 | . = ALIGN(PAGE_SIZE); | ||
146 | __smp_alt_end = .; | ||
147 | } | ||
148 | |||
149 | . = ALIGN(PAGE_SIZE); /* Init code and data */ | ||
150 | __init_begin = .; /* paired with __init_end */ | ||
151 | .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { | ||
152 | _sinittext = .; | ||
153 | INIT_TEXT | ||
154 | _einittext = .; | ||
155 | } | ||
156 | .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { | ||
157 | __initdata_begin = .; | ||
158 | INIT_DATA | ||
159 | __initdata_end = .; | ||
160 | } | ||
161 | |||
162 | .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { | ||
163 | . = ALIGN(16); | ||
164 | __setup_start = .; | ||
165 | *(.init.setup) | ||
166 | __setup_end = .; | ||
167 | } | ||
168 | .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { | ||
169 | __initcall_start = .; | ||
170 | INITCALLS | ||
171 | __initcall_end = .; | ||
172 | } | ||
173 | .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { | ||
174 | __con_initcall_start = .; | ||
175 | *(.con_initcall.init) | ||
176 | __con_initcall_end = .; | ||
177 | } | ||
178 | .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { | ||
179 | __x86_cpu_dev_start = .; | ||
180 | *(.x86_cpu_dev.init) | ||
181 | __x86_cpu_dev_end = .; | ||
182 | } | ||
183 | SECURITY_INIT | ||
184 | |||
185 | . = ALIGN(8); | ||
186 | .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { | ||
187 | __parainstructions = .; | ||
188 | *(.parainstructions) | ||
189 | __parainstructions_end = .; | ||
190 | } | ||
191 | |||
192 | .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { | ||
193 | . = ALIGN(8); | ||
194 | __alt_instructions = .; | ||
195 | *(.altinstructions) | ||
196 | __alt_instructions_end = .; | ||
197 | } | ||
198 | .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { | ||
199 | *(.altinstr_replacement) | ||
200 | } | ||
201 | /* .exit.text is discarded at runtime, not link time, to deal with references | ||
202 | from .altinstructions and .eh_frame */ | ||
203 | .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { | ||
204 | EXIT_TEXT | ||
205 | } | ||
206 | .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { | ||
207 | EXIT_DATA | ||
208 | } | ||
209 | |||
210 | #ifdef CONFIG_BLK_DEV_INITRD | ||
211 | . = ALIGN(PAGE_SIZE); | ||
212 | .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { | ||
213 | __initramfs_start = .; | ||
214 | *(.init.ramfs) | ||
215 | __initramfs_end = .; | ||
216 | } | ||
217 | #endif | ||
218 | |||
219 | #ifdef CONFIG_SMP | ||
220 | /* | ||
221 | * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the | ||
222 | * output PHDR, so the next output section - .data_nosave - should | ||
223 | * start another segment - data.init2. Also, pda should be at the head of | ||
224 | * percpu area. Preallocate it and define the percpu offset symbol | ||
225 | * so that it can be accessed as a percpu variable. | ||
226 | */ | ||
227 | . = ALIGN(PAGE_SIZE); | ||
228 | PERCPU_VADDR(0, :percpu) | ||
229 | #else | ||
230 | PERCPU(PAGE_SIZE) | ||
231 | #endif | ||
232 | |||
233 | . = ALIGN(PAGE_SIZE); | ||
234 | __init_end = .; | ||
235 | |||
236 | .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { | ||
237 | . = ALIGN(PAGE_SIZE); | ||
238 | __nosave_begin = .; | ||
239 | *(.data.nosave) | ||
240 | . = ALIGN(PAGE_SIZE); | ||
241 | __nosave_end = .; | ||
242 | } :data.init2 /* use another segment data.init2, see PERCPU_VADDR() above */ | ||
243 | |||
244 | .bss : AT(ADDR(.bss) - LOAD_OFFSET) { | ||
245 | . = ALIGN(PAGE_SIZE); | ||
246 | __bss_start = .; /* BSS */ | ||
247 | *(.bss.page_aligned) | ||
248 | *(.bss) | ||
249 | __bss_stop = .; | ||
250 | } | ||
251 | |||
252 | .brk : AT(ADDR(.brk) - LOAD_OFFSET) { | ||
253 | . = ALIGN(PAGE_SIZE); | ||
254 | __brk_base = . ; | ||
255 | . += 64 * 1024 ; /* 64k alignment slop space */ | ||
256 | *(.brk_reservation) /* areas brk users have reserved */ | ||
257 | __brk_limit = . ; | ||
258 | } | ||
259 | |||
260 | _end = . ; | ||
261 | |||
262 | /* Sections to be discarded */ | ||
263 | /DISCARD/ : { | ||
264 | *(.exitcall.exit) | ||
265 | *(.eh_frame) | ||
266 | *(.discard) | ||
267 | } | ||
268 | |||
269 | STABS_DEBUG | ||
270 | |||
271 | DWARF_DEBUG | ||
272 | } | ||
273 | |||
274 | /* | ||
275 | * Per-cpu symbols which need to be offset from __per_cpu_load | ||
276 | * for the boot processor. | ||
277 | */ | ||
278 | #define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load | ||
279 | INIT_PER_CPU(gdt_page); | ||
280 | INIT_PER_CPU(irq_stack_union); | ||
281 | |||
282 | /* | ||
283 | * Build-time check on the image size: | ||
284 | */ | ||
285 | ASSERT((_end - _text <= KERNEL_IMAGE_SIZE), | ||
286 | "kernel image bigger than KERNEL_IMAGE_SIZE") | ||
287 | |||
288 | #ifdef CONFIG_SMP | ||
289 | ASSERT((per_cpu__irq_stack_union == 0), | ||
290 | "irq_stack_union is not at start of per-cpu area"); | ||
291 | #endif | ||
292 | |||
293 | #ifdef CONFIG_KEXEC | ||
294 | #include <asm/kexec.h> | ||
295 | |||
296 | ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE, | ||
297 | "kexec control code size is too big") | ||
298 | #endif | ||
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 44153afc9067..25ee06a80aad 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c | |||
@@ -132,15 +132,7 @@ static __always_inline void do_vgettimeofday(struct timeval * tv) | |||
132 | return; | 132 | return; |
133 | } | 133 | } |
134 | 134 | ||
135 | /* | ||
136 | * Surround the RDTSC by barriers, to make sure it's not | ||
137 | * speculated to outside the seqlock critical section and | ||
138 | * does not cause time warps: | ||
139 | */ | ||
140 | rdtsc_barrier(); | ||
141 | now = vread(); | 135 | now = vread(); |
142 | rdtsc_barrier(); | ||
143 | |||
144 | base = __vsyscall_gtod_data.clock.cycle_last; | 136 | base = __vsyscall_gtod_data.clock.cycle_last; |
145 | mask = __vsyscall_gtod_data.clock.mask; | 137 | mask = __vsyscall_gtod_data.clock.mask; |
146 | mult = __vsyscall_gtod_data.clock.mult; | 138 | mult = __vsyscall_gtod_data.clock.mult; |