aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/kernel
diff options
context:
space:
mode:
authorAndrea Bastoni <bastoni@cs.unc.edu>2010-05-30 19:16:45 -0400
committerAndrea Bastoni <bastoni@cs.unc.edu>2010-05-30 19:16:45 -0400
commitada47b5fe13d89735805b566185f4885f5a3f750 (patch)
tree644b88f8a71896307d71438e9b3af49126ffb22b /arch/x86/kernel
parent43e98717ad40a4ae64545b5ba047c7b86aa44f4f (diff)
parent3280f21d43ee541f97f8cda5792150d2dbec20d5 (diff)
Merge branch 'wip-2.6.34' into old-private-masterarchived-private-master
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--arch/x86/kernel/Makefile4
-rw-r--r--arch/x86/kernel/acpi/Makefile2
-rw-r--r--arch/x86/kernel/acpi/boot.c173
-rw-r--r--arch/x86/kernel/acpi/cstate.c2
-rw-r--r--arch/x86/kernel/acpi/processor.c101
-rw-r--r--arch/x86/kernel/acpi/sleep.c26
-rw-r--r--arch/x86/kernel/alternative.c83
-rw-r--r--arch/x86/kernel/amd_iommu.c1330
-rw-r--r--arch/x86/kernel/amd_iommu_init.c157
-rw-r--r--arch/x86/kernel/apb_timer.c785
-rw-r--r--arch/x86/kernel/aperture_64.c28
-rw-r--r--arch/x86/kernel/apic/Makefile2
-rw-r--r--arch/x86/kernel/apic/apic.c68
-rw-r--r--arch/x86/kernel/apic/apic_flat_64.c12
-rw-r--r--arch/x86/kernel/apic/apic_noop.c200
-rw-r--r--arch/x86/kernel/apic/bigsmp_32.c18
-rw-r--r--arch/x86/kernel/apic/es7000_32.c29
-rw-r--r--arch/x86/kernel/apic/io_apic.c802
-rw-r--r--arch/x86/kernel/apic/nmi.c28
-rw-r--r--arch/x86/kernel/apic/numaq_32.c21
-rw-r--r--arch/x86/kernel/apic/probe_32.c31
-rw-r--r--arch/x86/kernel/apic/probe_64.c13
-rw-r--r--arch/x86/kernel/apic/summit_32.c10
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c5
-rw-r--r--arch/x86/kernel/apic/x2apic_phys.c5
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c164
-rw-r--r--arch/x86/kernel/apm_32.c18
-rw-r--r--arch/x86/kernel/bios_uv.c47
-rw-r--r--arch/x86/kernel/bootflag.c1
-rw-r--r--arch/x86/kernel/cpu/Makefile3
-rw-r--r--arch/x86/kernel/cpu/addon_cpuid_features.c19
-rw-r--r--arch/x86/kernel/cpu/amd.c57
-rw-r--r--arch/x86/kernel/cpu/centaur.c2
-rw-r--r--arch/x86/kernel/cpu/common.c52
-rw-r--r--arch/x86/kernel/cpu/cpu.h2
-rw-r--r--arch/x86/kernel/cpu/cpu_debug.c688
-rw-r--r--arch/x86/kernel/cpu/cpufreq/Kconfig14
-rw-r--r--arch/x86/kernel/cpu/cpufreq/Makefile1
-rw-r--r--arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c50
-rw-r--r--arch/x86/kernel/cpu/cpufreq/elanfreq.c1
-rw-r--r--arch/x86/kernel/cpu/cpufreq/gx-suspmod.c1
-rw-r--r--arch/x86/kernel/cpu/cpufreq/longhaul.c2
-rw-r--r--arch/x86/kernel/cpu/cpufreq/longrun.c1
-rw-r--r--arch/x86/kernel/cpu/cpufreq/p4-clockmod.c1
-rw-r--r--arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c621
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k6.c3
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k7.c19
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.c44
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c1
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-ich.c3
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-lib.c7
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-lib.h24
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-smi.c3
-rw-r--r--arch/x86/kernel/cpu/cyrix.c2
-rw-r--r--arch/x86/kernel/cpu/intel.c33
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c420
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-inject.c23
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c129
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd.c3
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_intel.c5
-rw-r--r--arch/x86/kernel/cpu/mcheck/therm_throt.c45
-rw-r--r--arch/x86/kernel/cpu/mtrr/Makefile2
-rw-r--r--arch/x86/kernel/cpu/mtrr/amd.c2
-rw-r--r--arch/x86/kernel/cpu/mtrr/centaur.c2
-rw-r--r--arch/x86/kernel/cpu/mtrr/cleanup.c189
-rw-r--r--arch/x86/kernel/cpu/mtrr/cyrix.c2
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c11
-rw-r--r--arch/x86/kernel/cpu/mtrr/if.c12
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c7
-rw-r--r--arch/x86/kernel/cpu/mtrr/mtrr.h6
-rw-r--r--arch/x86/kernel/cpu/mtrr/state.c94
-rw-r--r--arch/x86/kernel/cpu/perf_event.c1865
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd.c422
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel.c980
-rw-r--r--arch/x86/kernel/cpu/perf_event_p6.c159
-rw-r--r--arch/x86/kernel/cpu/perfctr-watchdog.c15
-rw-r--r--arch/x86/kernel/cpu/transmeta.c2
-rw-r--r--arch/x86/kernel/cpu/vmware.c2
-rw-r--r--arch/x86/kernel/cpuid.c25
-rw-r--r--arch/x86/kernel/crash.c7
-rw-r--r--arch/x86/kernel/crash_dump_32.c1
-rw-r--r--arch/x86/kernel/ds.c4
-rw-r--r--arch/x86/kernel/dumpstack.c50
-rw-r--r--arch/x86/kernel/dumpstack.h28
-rw-r--r--arch/x86/kernel/dumpstack_32.c14
-rw-r--r--arch/x86/kernel/dumpstack_64.c99
-rw-r--r--arch/x86/kernel/e820.c370
-rw-r--r--arch/x86/kernel/efi.c2
-rw-r--r--arch/x86/kernel/entry_32.S100
-rw-r--r--arch/x86/kernel/entry_64.S84
-rw-r--r--arch/x86/kernel/ftrace.c133
-rw-r--r--arch/x86/kernel/geode_32.c196
-rw-r--r--arch/x86/kernel/head32.c14
-rw-r--r--arch/x86/kernel/head64.c5
-rw-r--r--arch/x86/kernel/head_32.S24
-rw-r--r--arch/x86/kernel/head_64.S9
-rw-r--r--arch/x86/kernel/hpet.c95
-rw-r--r--arch/x86/kernel/hw_breakpoint.c530
-rw-r--r--arch/x86/kernel/i387.c72
-rw-r--r--arch/x86/kernel/i8259.c95
-rw-r--r--arch/x86/kernel/ioport.c28
-rw-r--r--arch/x86/kernel/irq.c126
-rw-r--r--arch/x86/kernel/irq_32.c45
-rw-r--r--arch/x86/kernel/irq_64.c58
-rw-r--r--arch/x86/kernel/irqinit.c63
-rw-r--r--arch/x86/kernel/k8.c16
-rw-r--r--arch/x86/kernel/kdebugfs.c1
-rw-r--r--arch/x86/kernel/kgdb.c241
-rw-r--r--arch/x86/kernel/kprobes.c888
-rw-r--r--arch/x86/kernel/ldt.c1
-rw-r--r--arch/x86/kernel/machine_kexec_32.c8
-rw-r--r--arch/x86/kernel/machine_kexec_64.c3
-rw-r--r--arch/x86/kernel/mca_32.c1
-rw-r--r--arch/x86/kernel/mfgpt_32.c410
-rw-r--r--arch/x86/kernel/microcode_amd.c57
-rw-r--r--arch/x86/kernel/microcode_core.c28
-rw-r--r--arch/x86/kernel/microcode_intel.c47
-rw-r--r--arch/x86/kernel/mmconf-fam10h_64.c7
-rw-r--r--arch/x86/kernel/module.c1
-rw-r--r--arch/x86/kernel/mpparse.c54
-rw-r--r--arch/x86/kernel/mrst.c216
-rw-r--r--arch/x86/kernel/msr.c26
-rw-r--r--arch/x86/kernel/olpc.c12
-rw-r--r--arch/x86/kernel/paravirt-spinlocks.c4
-rw-r--r--arch/x86/kernel/paravirt.c4
-rw-r--r--arch/x86/kernel/pci-calgary_64.c102
-rw-r--r--arch/x86/kernel/pci-dma.c55
-rw-r--r--arch/x86/kernel/pci-gart_64.c169
-rw-r--r--arch/x86/kernel/pci-nommu.c12
-rw-r--r--arch/x86/kernel/pci-swiotlb.c21
-rw-r--r--arch/x86/kernel/process.c171
-rw-r--r--arch/x86/kernel/process_32.c115
-rw-r--r--arch/x86/kernel/process_64.c131
-rw-r--r--arch/x86/kernel/ptrace.c494
-rw-r--r--arch/x86/kernel/quirks.c22
-rw-r--r--arch/x86/kernel/reboot.c29
-rw-r--r--arch/x86/kernel/reboot_fixups_32.c3
-rw-r--r--arch/x86/kernel/setup.c186
-rw-r--r--arch/x86/kernel/setup_percpu.c19
-rw-r--r--arch/x86/kernel/signal.c24
-rw-r--r--arch/x86/kernel/smp.c1
-rw-r--r--arch/x86/kernel/smpboot.c86
-rw-r--r--arch/x86/kernel/stacktrace.c18
-rw-r--r--arch/x86/kernel/sys_i386_32.c210
-rw-r--r--arch/x86/kernel/sys_x86_64.c29
-rw-r--r--arch/x86/kernel/syscall_table_32.S9
-rw-r--r--arch/x86/kernel/time.c4
-rw-r--r--arch/x86/kernel/tlb_uv.c5
-rw-r--r--arch/x86/kernel/trampoline.c20
-rw-r--r--arch/x86/kernel/traps.c76
-rw-r--r--arch/x86/kernel/tsc.c7
-rw-r--r--arch/x86/kernel/tsc_sync.c23
-rw-r--r--arch/x86/kernel/uv_irq.c239
-rw-r--r--arch/x86/kernel/uv_sysfs.c6
-rw-r--r--arch/x86/kernel/uv_time.c94
-rw-r--r--arch/x86/kernel/visws_quirks.c37
-rw-r--r--arch/x86/kernel/vm86_32.c11
-rw-r--r--arch/x86/kernel/vmi_32.c36
-rw-r--r--arch/x86/kernel/vmiclock_32.c10
-rw-r--r--arch/x86/kernel/vmlinux.lds.S48
-rw-r--r--arch/x86/kernel/vsyscall_64.c10
-rw-r--r--arch/x86/kernel/x8664_ksyms_64.c14
-rw-r--r--arch/x86/kernel/x86_init.c21
-rw-r--r--arch/x86/kernel/xsave.c1
164 files changed, 10061 insertions, 7025 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index a99b34d1b3b8..d09934e22ca5 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -40,7 +40,7 @@ obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
40obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o 40obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o
41obj-y += bootflag.o e820.o 41obj-y += bootflag.o e820.o
42obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o 42obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o
43obj-y += alternative.o i8253.o pci-nommu.o 43obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o
44obj-y += tsc.o io_delay.o rtc.o 44obj-y += tsc.o io_delay.o rtc.o
45 45
46obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o 46obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o
@@ -87,9 +87,9 @@ obj-$(CONFIG_VM86) += vm86_32.o
87obj-$(CONFIG_EARLY_PRINTK) += early_printk.o 87obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
88 88
89obj-$(CONFIG_HPET_TIMER) += hpet.o 89obj-$(CONFIG_HPET_TIMER) += hpet.o
90obj-$(CONFIG_APB_TIMER) += apb_timer.o
90 91
91obj-$(CONFIG_K8_NB) += k8.o 92obj-$(CONFIG_K8_NB) += k8.o
92obj-$(CONFIG_MGEODE_LX) += geode_32.o mfgpt_32.o
93obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o 93obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o
94obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o 94obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o
95 95
diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile
index fd5ca97a2ad5..6f35260bb3ef 100644
--- a/arch/x86/kernel/acpi/Makefile
+++ b/arch/x86/kernel/acpi/Makefile
@@ -4,7 +4,7 @@ obj-$(CONFIG_ACPI) += boot.o
4obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_rm.o wakeup_$(BITS).o 4obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_rm.o wakeup_$(BITS).o
5 5
6ifneq ($(CONFIG_ACPI_PROCESSOR),) 6ifneq ($(CONFIG_ACPI_PROCESSOR),)
7obj-y += cstate.o processor.o 7obj-y += cstate.o
8endif 8endif
9 9
10$(obj)/wakeup_rm.o: $(obj)/realmode/wakeup.bin 10$(obj)/wakeup_rm.o: $(obj)/realmode/wakeup.bin
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 67e929b89875..cd40aba6aa95 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -31,10 +31,12 @@
31#include <linux/module.h> 31#include <linux/module.h>
32#include <linux/dmi.h> 32#include <linux/dmi.h>
33#include <linux/irq.h> 33#include <linux/irq.h>
34#include <linux/slab.h>
34#include <linux/bootmem.h> 35#include <linux/bootmem.h>
35#include <linux/ioport.h> 36#include <linux/ioport.h>
36#include <linux/pci.h> 37#include <linux/pci.h>
37 38
39#include <asm/pci_x86.h>
38#include <asm/pgtable.h> 40#include <asm/pgtable.h>
39#include <asm/io_apic.h> 41#include <asm/io_apic.h>
40#include <asm/apic.h> 42#include <asm/apic.h>
@@ -49,6 +51,7 @@ EXPORT_SYMBOL(acpi_disabled);
49 51
50#ifdef CONFIG_X86_64 52#ifdef CONFIG_X86_64
51# include <asm/proto.h> 53# include <asm/proto.h>
54# include <asm/numa_64.h>
52#endif /* X86 */ 55#endif /* X86 */
53 56
54#define BAD_MADT_ENTRY(entry, end) ( \ 57#define BAD_MADT_ENTRY(entry, end) ( \
@@ -446,6 +449,12 @@ void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger)
446int acpi_gsi_to_irq(u32 gsi, unsigned int *irq) 449int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
447{ 450{
448 *irq = gsi; 451 *irq = gsi;
452
453#ifdef CONFIG_X86_IO_APIC
454 if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC)
455 setup_IO_APIC_irq_extra(gsi);
456#endif
457
449 return 0; 458 return 0;
450} 459}
451 460
@@ -473,7 +482,8 @@ int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
473 plat_gsi = mp_register_gsi(dev, gsi, trigger, polarity); 482 plat_gsi = mp_register_gsi(dev, gsi, trigger, polarity);
474 } 483 }
475#endif 484#endif
476 acpi_gsi_to_irq(plat_gsi, &irq); 485 irq = plat_gsi;
486
477 return irq; 487 return irq;
478} 488}
479 489
@@ -481,6 +491,26 @@ int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
481 * ACPI based hotplug support for CPU 491 * ACPI based hotplug support for CPU
482 */ 492 */
483#ifdef CONFIG_ACPI_HOTPLUG_CPU 493#ifdef CONFIG_ACPI_HOTPLUG_CPU
494#include <acpi/processor.h>
495
496static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
497{
498#ifdef CONFIG_ACPI_NUMA
499 int nid;
500
501 nid = acpi_get_node(handle);
502 if (nid == -1 || !node_online(nid))
503 return;
504#ifdef CONFIG_X86_64
505 apicid_to_node[physid] = nid;
506 numa_set_node(cpu, nid);
507#else /* CONFIG_X86_32 */
508 apicid_2_node[physid] = nid;
509 cpu_to_node_map[cpu] = nid;
510#endif
511
512#endif
513}
484 514
485static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu) 515static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu)
486{ 516{
@@ -539,7 +569,10 @@ static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu)
539 goto free_new_map; 569 goto free_new_map;
540 } 570 }
541 571
572 acpi_processor_set_pdc(handle);
573
542 cpu = cpumask_first(new_map); 574 cpu = cpumask_first(new_map);
575 acpi_map_cpu2node(handle, cpu, physid);
543 576
544 *pcpu = cpu; 577 *pcpu = cpu;
545 retval = 0; 578 retval = 0;
@@ -624,6 +657,7 @@ static int __init acpi_parse_hpet(struct acpi_table_header *table)
624 } 657 }
625 658
626 hpet_address = hpet_tbl->address.address; 659 hpet_address = hpet_tbl->address.address;
660 hpet_blockid = hpet_tbl->sequence;
627 661
628 /* 662 /*
629 * Some broken BIOSes advertise HPET at 0x0. We really do not 663 * Some broken BIOSes advertise HPET at 0x0. We really do not
@@ -1122,7 +1156,7 @@ static int __init acpi_parse_madt_ioapic_entries(void)
1122 if (!acpi_sci_override_gsi) 1156 if (!acpi_sci_override_gsi)
1123 acpi_sci_ioapic_setup(acpi_gbl_FADT.sci_interrupt, 0, 0); 1157 acpi_sci_ioapic_setup(acpi_gbl_FADT.sci_interrupt, 0, 0);
1124 1158
1125 /* Fill in identity legacy mapings where no override */ 1159 /* Fill in identity legacy mappings where no override */
1126 mp_config_acpi_legacy_irqs(); 1160 mp_config_acpi_legacy_irqs();
1127 1161
1128 count = 1162 count =
@@ -1184,9 +1218,6 @@ static void __init acpi_process_madt(void)
1184 if (!error) { 1218 if (!error) {
1185 acpi_lapic = 1; 1219 acpi_lapic = 1;
1186 1220
1187#ifdef CONFIG_X86_BIGSMP
1188 generic_bigsmp_probe();
1189#endif
1190 /* 1221 /*
1191 * Parse MADT IO-APIC entries 1222 * Parse MADT IO-APIC entries
1192 */ 1223 */
@@ -1196,8 +1227,6 @@ static void __init acpi_process_madt(void)
1196 acpi_ioapic = 1; 1227 acpi_ioapic = 1;
1197 1228
1198 smp_found_config = 1; 1229 smp_found_config = 1;
1199 if (apic->setup_apic_routing)
1200 apic->setup_apic_routing();
1201 } 1230 }
1202 } 1231 }
1203 if (error == -EINVAL) { 1232 if (error == -EINVAL) {
@@ -1268,23 +1297,6 @@ static int __init dmi_disable_acpi(const struct dmi_system_id *d)
1268} 1297}
1269 1298
1270/* 1299/*
1271 * Limit ACPI to CPU enumeration for HT
1272 */
1273static int __init force_acpi_ht(const struct dmi_system_id *d)
1274{
1275 if (!acpi_force) {
1276 printk(KERN_NOTICE "%s detected: force use of acpi=ht\n",
1277 d->ident);
1278 disable_acpi();
1279 acpi_ht = 1;
1280 } else {
1281 printk(KERN_NOTICE
1282 "Warning: acpi=force overrules DMI blacklist: acpi=ht\n");
1283 }
1284 return 0;
1285}
1286
1287/*
1288 * Force ignoring BIOS IRQ0 pin2 override 1300 * Force ignoring BIOS IRQ0 pin2 override
1289 */ 1301 */
1290static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d) 1302static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d)
@@ -1320,90 +1332,6 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = {
1320 }, 1332 },
1321 1333
1322 /* 1334 /*
1323 * Boxes that need acpi=ht
1324 */
1325 {
1326 .callback = force_acpi_ht,
1327 .ident = "FSC Primergy T850",
1328 .matches = {
1329 DMI_MATCH(DMI_SYS_VENDOR, "FUJITSU SIEMENS"),
1330 DMI_MATCH(DMI_PRODUCT_NAME, "PRIMERGY T850"),
1331 },
1332 },
1333 {
1334 .callback = force_acpi_ht,
1335 .ident = "HP VISUALIZE NT Workstation",
1336 .matches = {
1337 DMI_MATCH(DMI_BOARD_VENDOR, "Hewlett-Packard"),
1338 DMI_MATCH(DMI_PRODUCT_NAME, "HP VISUALIZE NT Workstation"),
1339 },
1340 },
1341 {
1342 .callback = force_acpi_ht,
1343 .ident = "Compaq Workstation W8000",
1344 .matches = {
1345 DMI_MATCH(DMI_SYS_VENDOR, "Compaq"),
1346 DMI_MATCH(DMI_PRODUCT_NAME, "Workstation W8000"),
1347 },
1348 },
1349 {
1350 .callback = force_acpi_ht,
1351 .ident = "ASUS P2B-DS",
1352 .matches = {
1353 DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
1354 DMI_MATCH(DMI_BOARD_NAME, "P2B-DS"),
1355 },
1356 },
1357 {
1358 .callback = force_acpi_ht,
1359 .ident = "ASUS CUR-DLS",
1360 .matches = {
1361 DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
1362 DMI_MATCH(DMI_BOARD_NAME, "CUR-DLS"),
1363 },
1364 },
1365 {
1366 .callback = force_acpi_ht,
1367 .ident = "ABIT i440BX-W83977",
1368 .matches = {
1369 DMI_MATCH(DMI_BOARD_VENDOR, "ABIT <http://www.abit.com>"),
1370 DMI_MATCH(DMI_BOARD_NAME, "i440BX-W83977 (BP6)"),
1371 },
1372 },
1373 {
1374 .callback = force_acpi_ht,
1375 .ident = "IBM Bladecenter",
1376 .matches = {
1377 DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
1378 DMI_MATCH(DMI_BOARD_NAME, "IBM eServer BladeCenter HS20"),
1379 },
1380 },
1381 {
1382 .callback = force_acpi_ht,
1383 .ident = "IBM eServer xSeries 360",
1384 .matches = {
1385 DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
1386 DMI_MATCH(DMI_BOARD_NAME, "eServer xSeries 360"),
1387 },
1388 },
1389 {
1390 .callback = force_acpi_ht,
1391 .ident = "IBM eserver xSeries 330",
1392 .matches = {
1393 DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
1394 DMI_MATCH(DMI_BOARD_NAME, "eserver xSeries 330"),
1395 },
1396 },
1397 {
1398 .callback = force_acpi_ht,
1399 .ident = "IBM eserver xSeries 440",
1400 .matches = {
1401 DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
1402 DMI_MATCH(DMI_PRODUCT_NAME, "eserver xSeries 440"),
1403 },
1404 },
1405
1406 /*
1407 * Boxes that need ACPI PCI IRQ routing disabled 1335 * Boxes that need ACPI PCI IRQ routing disabled
1408 */ 1336 */
1409 { 1337 {
@@ -1528,16 +1456,10 @@ static struct dmi_system_id __initdata acpi_dmi_table_late[] = {
1528 * if acpi_blacklisted() acpi_disabled = 1; 1456 * if acpi_blacklisted() acpi_disabled = 1;
1529 * acpi_irq_model=... 1457 * acpi_irq_model=...
1530 * ... 1458 * ...
1531 *
1532 * return value: (currently ignored)
1533 * 0: success
1534 * !0: failure
1535 */ 1459 */
1536 1460
1537int __init acpi_boot_table_init(void) 1461void __init acpi_boot_table_init(void)
1538{ 1462{
1539 int error;
1540
1541 dmi_check_system(acpi_dmi_table); 1463 dmi_check_system(acpi_dmi_table);
1542 1464
1543 /* 1465 /*
@@ -1545,15 +1467,14 @@ int __init acpi_boot_table_init(void)
1545 * One exception: acpi=ht continues far enough to enumerate LAPICs 1467 * One exception: acpi=ht continues far enough to enumerate LAPICs
1546 */ 1468 */
1547 if (acpi_disabled && !acpi_ht) 1469 if (acpi_disabled && !acpi_ht)
1548 return 1; 1470 return;
1549 1471
1550 /* 1472 /*
1551 * Initialize the ACPI boot-time table parser. 1473 * Initialize the ACPI boot-time table parser.
1552 */ 1474 */
1553 error = acpi_table_init(); 1475 if (acpi_table_init()) {
1554 if (error) {
1555 disable_acpi(); 1476 disable_acpi();
1556 return error; 1477 return;
1557 } 1478 }
1558 1479
1559 acpi_table_parse(ACPI_SIG_BOOT, acpi_parse_sbf); 1480 acpi_table_parse(ACPI_SIG_BOOT, acpi_parse_sbf);
@@ -1561,18 +1482,15 @@ int __init acpi_boot_table_init(void)
1561 /* 1482 /*
1562 * blacklist may disable ACPI entirely 1483 * blacklist may disable ACPI entirely
1563 */ 1484 */
1564 error = acpi_blacklisted(); 1485 if (acpi_blacklisted()) {
1565 if (error) {
1566 if (acpi_force) { 1486 if (acpi_force) {
1567 printk(KERN_WARNING PREFIX "acpi=force override\n"); 1487 printk(KERN_WARNING PREFIX "acpi=force override\n");
1568 } else { 1488 } else {
1569 printk(KERN_WARNING PREFIX "Disabling ACPI support\n"); 1489 printk(KERN_WARNING PREFIX "Disabling ACPI support\n");
1570 disable_acpi(); 1490 disable_acpi();
1571 return error; 1491 return;
1572 } 1492 }
1573 } 1493 }
1574
1575 return 0;
1576} 1494}
1577 1495
1578int __init early_acpi_boot_init(void) 1496int __init early_acpi_boot_init(void)
@@ -1618,6 +1536,9 @@ int __init acpi_boot_init(void)
1618 1536
1619 acpi_table_parse(ACPI_SIG_HPET, acpi_parse_hpet); 1537 acpi_table_parse(ACPI_SIG_HPET, acpi_parse_hpet);
1620 1538
1539 if (!acpi_noirq)
1540 x86_init.pci.init = pci_acpi_init;
1541
1621 return 0; 1542 return 0;
1622} 1543}
1623 1544
@@ -1642,8 +1563,10 @@ static int __init parse_acpi(char *arg)
1642 } 1563 }
1643 /* Limit ACPI just to boot-time to enable HT */ 1564 /* Limit ACPI just to boot-time to enable HT */
1644 else if (strcmp(arg, "ht") == 0) { 1565 else if (strcmp(arg, "ht") == 0) {
1645 if (!acpi_force) 1566 if (!acpi_force) {
1567 printk(KERN_WARNING "acpi=ht will be removed in Linux-2.6.35\n");
1646 disable_acpi(); 1568 disable_acpi();
1569 }
1647 acpi_ht = 1; 1570 acpi_ht = 1;
1648 } 1571 }
1649 /* acpi=rsdt use RSDT instead of XSDT */ 1572 /* acpi=rsdt use RSDT instead of XSDT */
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index 59cdfa4686b2..2e837f5080fe 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -48,7 +48,7 @@ void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags,
48 * P4, Core and beyond CPUs 48 * P4, Core and beyond CPUs
49 */ 49 */
50 if (c->x86_vendor == X86_VENDOR_INTEL && 50 if (c->x86_vendor == X86_VENDOR_INTEL &&
51 (c->x86 > 0xf || (c->x86 == 6 && c->x86_model >= 14))) 51 (c->x86 > 0xf || (c->x86 == 6 && c->x86_model >= 0x0f)))
52 flags->bm_control = 0; 52 flags->bm_control = 0;
53} 53}
54EXPORT_SYMBOL(acpi_processor_power_init_bm_check); 54EXPORT_SYMBOL(acpi_processor_power_init_bm_check);
diff --git a/arch/x86/kernel/acpi/processor.c b/arch/x86/kernel/acpi/processor.c
deleted file mode 100644
index d85d1b2432ba..000000000000
--- a/arch/x86/kernel/acpi/processor.c
+++ /dev/null
@@ -1,101 +0,0 @@
1/*
2 * Copyright (C) 2005 Intel Corporation
3 * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
4 * - Added _PDC for platforms with Intel CPUs
5 */
6
7#include <linux/kernel.h>
8#include <linux/module.h>
9#include <linux/init.h>
10#include <linux/acpi.h>
11
12#include <acpi/processor.h>
13#include <asm/acpi.h>
14
15static void init_intel_pdc(struct acpi_processor *pr, struct cpuinfo_x86 *c)
16{
17 struct acpi_object_list *obj_list;
18 union acpi_object *obj;
19 u32 *buf;
20
21 /* allocate and initialize pdc. It will be used later. */
22 obj_list = kmalloc(sizeof(struct acpi_object_list), GFP_KERNEL);
23 if (!obj_list) {
24 printk(KERN_ERR "Memory allocation error\n");
25 return;
26 }
27
28 obj = kmalloc(sizeof(union acpi_object), GFP_KERNEL);
29 if (!obj) {
30 printk(KERN_ERR "Memory allocation error\n");
31 kfree(obj_list);
32 return;
33 }
34
35 buf = kmalloc(12, GFP_KERNEL);
36 if (!buf) {
37 printk(KERN_ERR "Memory allocation error\n");
38 kfree(obj);
39 kfree(obj_list);
40 return;
41 }
42
43 buf[0] = ACPI_PDC_REVISION_ID;
44 buf[1] = 1;
45 buf[2] = ACPI_PDC_C_CAPABILITY_SMP;
46
47 /*
48 * The default of PDC_SMP_T_SWCOORD bit is set for intel x86 cpu so
49 * that OSPM is capable of native ACPI throttling software
50 * coordination using BIOS supplied _TSD info.
51 */
52 buf[2] |= ACPI_PDC_SMP_T_SWCOORD;
53 if (cpu_has(c, X86_FEATURE_EST))
54 buf[2] |= ACPI_PDC_EST_CAPABILITY_SWSMP;
55
56 if (cpu_has(c, X86_FEATURE_ACPI))
57 buf[2] |= ACPI_PDC_T_FFH;
58
59 /*
60 * If mwait/monitor is unsupported, C2/C3_FFH will be disabled
61 */
62 if (!cpu_has(c, X86_FEATURE_MWAIT))
63 buf[2] &= ~(ACPI_PDC_C_C2C3_FFH);
64
65 obj->type = ACPI_TYPE_BUFFER;
66 obj->buffer.length = 12;
67 obj->buffer.pointer = (u8 *) buf;
68 obj_list->count = 1;
69 obj_list->pointer = obj;
70 pr->pdc = obj_list;
71
72 return;
73}
74
75
76/* Initialize _PDC data based on the CPU vendor */
77void arch_acpi_processor_init_pdc(struct acpi_processor *pr)
78{
79 struct cpuinfo_x86 *c = &cpu_data(pr->id);
80
81 pr->pdc = NULL;
82 if (c->x86_vendor == X86_VENDOR_INTEL ||
83 c->x86_vendor == X86_VENDOR_CENTAUR)
84 init_intel_pdc(pr, c);
85
86 return;
87}
88
89EXPORT_SYMBOL(arch_acpi_processor_init_pdc);
90
91void arch_acpi_processor_cleanup_pdc(struct acpi_processor *pr)
92{
93 if (pr->pdc) {
94 kfree(pr->pdc->pointer->buffer.pointer);
95 kfree(pr->pdc->pointer);
96 kfree(pr->pdc);
97 pr->pdc = NULL;
98 }
99}
100
101EXPORT_SYMBOL(arch_acpi_processor_cleanup_pdc);
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index ca93638ba430..f9961034e557 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -78,12 +78,9 @@ int acpi_save_state_mem(void)
78#ifndef CONFIG_64BIT 78#ifndef CONFIG_64BIT
79 store_gdt((struct desc_ptr *)&header->pmode_gdt); 79 store_gdt((struct desc_ptr *)&header->pmode_gdt);
80 80
81 header->pmode_efer_low = nx_enabled; 81 if (rdmsr_safe(MSR_EFER, &header->pmode_efer_low,
82 if (header->pmode_efer_low & 1) { 82 &header->pmode_efer_high))
83 /* This is strange, why not save efer, always? */ 83 header->pmode_efer_low = header->pmode_efer_high = 0;
84 rdmsr(MSR_EFER, header->pmode_efer_low,
85 header->pmode_efer_high);
86 }
87#endif /* !CONFIG_64BIT */ 84#endif /* !CONFIG_64BIT */
88 85
89 header->pmode_cr0 = read_cr0(); 86 header->pmode_cr0 = read_cr0();
@@ -119,29 +116,32 @@ void acpi_restore_state_mem(void)
119 116
120 117
121/** 118/**
122 * acpi_reserve_bootmem - do _very_ early ACPI initialisation 119 * acpi_reserve_wakeup_memory - do _very_ early ACPI initialisation
123 * 120 *
124 * We allocate a page from the first 1MB of memory for the wakeup 121 * We allocate a page from the first 1MB of memory for the wakeup
125 * routine for when we come back from a sleep state. The 122 * routine for when we come back from a sleep state. The
126 * runtime allocator allows specification of <16MB pages, but not 123 * runtime allocator allows specification of <16MB pages, but not
127 * <1MB pages. 124 * <1MB pages.
128 */ 125 */
129void __init acpi_reserve_bootmem(void) 126void __init acpi_reserve_wakeup_memory(void)
130{ 127{
128 unsigned long mem;
129
131 if ((&wakeup_code_end - &wakeup_code_start) > WAKEUP_SIZE) { 130 if ((&wakeup_code_end - &wakeup_code_start) > WAKEUP_SIZE) {
132 printk(KERN_ERR 131 printk(KERN_ERR
133 "ACPI: Wakeup code way too big, S3 disabled.\n"); 132 "ACPI: Wakeup code way too big, S3 disabled.\n");
134 return; 133 return;
135 } 134 }
136 135
137 acpi_realmode = (unsigned long)alloc_bootmem_low(WAKEUP_SIZE); 136 mem = find_e820_area(0, 1<<20, WAKEUP_SIZE, PAGE_SIZE);
138 137
139 if (!acpi_realmode) { 138 if (mem == -1L) {
140 printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n"); 139 printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n");
141 return; 140 return;
142 } 141 }
143 142 acpi_realmode = (unsigned long) phys_to_virt(mem);
144 acpi_wakeup_address = virt_to_phys((void *)acpi_realmode); 143 acpi_wakeup_address = mem;
144 reserve_early(mem, mem + WAKEUP_SIZE, "ACPI WAKEUP");
145} 145}
146 146
147 147
@@ -162,6 +162,8 @@ static int __init acpi_sleep_setup(char *str)
162#endif 162#endif
163 if (strncmp(str, "old_ordering", 12) == 0) 163 if (strncmp(str, "old_ordering", 12) == 0)
164 acpi_old_suspend_ordering(); 164 acpi_old_suspend_ordering();
165 if (strncmp(str, "sci_force_enable", 16) == 0)
166 acpi_set_sci_en_on_resume();
165 str = strchr(str, ','); 167 str = strchr(str, ',');
166 if (str != NULL) 168 if (str != NULL)
167 str += strspn(str, ", \t"); 169 str += strspn(str, ", \t");
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index de7353c0ce9c..1a160d5d44d0 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -7,6 +7,8 @@
7#include <linux/mm.h> 7#include <linux/mm.h>
8#include <linux/vmalloc.h> 8#include <linux/vmalloc.h>
9#include <linux/memory.h> 9#include <linux/memory.h>
10#include <linux/stop_machine.h>
11#include <linux/slab.h>
10#include <asm/alternative.h> 12#include <asm/alternative.h>
11#include <asm/sections.h> 13#include <asm/sections.h>
12#include <asm/pgtable.h> 14#include <asm/pgtable.h>
@@ -205,7 +207,7 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
205 struct alt_instr *end) 207 struct alt_instr *end)
206{ 208{
207 struct alt_instr *a; 209 struct alt_instr *a;
208 char insnbuf[MAX_PATCH_LEN]; 210 u8 insnbuf[MAX_PATCH_LEN];
209 211
210 DPRINTK("%s: alt table %p -> %p\n", __func__, start, end); 212 DPRINTK("%s: alt table %p -> %p\n", __func__, start, end);
211 for (a = start; a < end; a++) { 213 for (a = start; a < end; a++) {
@@ -223,6 +225,8 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
223 } 225 }
224#endif 226#endif
225 memcpy(insnbuf, a->replacement, a->replacementlen); 227 memcpy(insnbuf, a->replacement, a->replacementlen);
228 if (*insnbuf == 0xe8 && a->replacementlen == 5)
229 *(s32 *)(insnbuf + 1) += a->replacement - a->instr;
226 add_nops(insnbuf + a->replacementlen, 230 add_nops(insnbuf + a->replacementlen,
227 a->instrlen - a->replacementlen); 231 a->instrlen - a->replacementlen);
228 text_poke_early(instr, insnbuf, a->instrlen); 232 text_poke_early(instr, insnbuf, a->instrlen);
@@ -390,6 +394,24 @@ void alternatives_smp_switch(int smp)
390 mutex_unlock(&smp_alt); 394 mutex_unlock(&smp_alt);
391} 395}
392 396
397/* Return 1 if the address range is reserved for smp-alternatives */
398int alternatives_text_reserved(void *start, void *end)
399{
400 struct smp_alt_module *mod;
401 u8 **ptr;
402 u8 *text_start = start;
403 u8 *text_end = end;
404
405 list_for_each_entry(mod, &smp_alt_modules, next) {
406 if (mod->text > text_end || mod->text_end < text_start)
407 continue;
408 for (ptr = mod->locks; ptr < mod->locks_end; ptr++)
409 if (text_start <= *ptr && text_end >= *ptr)
410 return 1;
411 }
412
413 return 0;
414}
393#endif 415#endif
394 416
395#ifdef CONFIG_PARAVIRT 417#ifdef CONFIG_PARAVIRT
@@ -552,3 +574,62 @@ void *__kprobes text_poke(void *addr, const void *opcode, size_t len)
552 local_irq_restore(flags); 574 local_irq_restore(flags);
553 return addr; 575 return addr;
554} 576}
577
578/*
579 * Cross-modifying kernel text with stop_machine().
580 * This code originally comes from immediate value.
581 */
582static atomic_t stop_machine_first;
583static int wrote_text;
584
585struct text_poke_params {
586 void *addr;
587 const void *opcode;
588 size_t len;
589};
590
591static int __kprobes stop_machine_text_poke(void *data)
592{
593 struct text_poke_params *tpp = data;
594
595 if (atomic_dec_and_test(&stop_machine_first)) {
596 text_poke(tpp->addr, tpp->opcode, tpp->len);
597 smp_wmb(); /* Make sure other cpus see that this has run */
598 wrote_text = 1;
599 } else {
600 while (!wrote_text)
601 cpu_relax();
602 smp_mb(); /* Load wrote_text before following execution */
603 }
604
605 flush_icache_range((unsigned long)tpp->addr,
606 (unsigned long)tpp->addr + tpp->len);
607 return 0;
608}
609
610/**
611 * text_poke_smp - Update instructions on a live kernel on SMP
612 * @addr: address to modify
613 * @opcode: source of the copy
614 * @len: length to copy
615 *
616 * Modify multi-byte instruction by using stop_machine() on SMP. This allows
617 * user to poke/set multi-byte text on SMP. Only non-NMI/MCE code modifying
618 * should be allowed, since stop_machine() does _not_ protect code against
619 * NMI and MCE.
620 *
621 * Note: Must be called under get_online_cpus() and text_mutex.
622 */
623void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len)
624{
625 struct text_poke_params tpp;
626
627 tpp.addr = addr;
628 tpp.opcode = opcode;
629 tpp.len = len;
630 atomic_set(&stop_machine_first, 1);
631 wrote_text = 0;
632 stop_machine(stop_machine_text_poke, (void *)&tpp, NULL);
633 return addr;
634}
635
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 0285521e0a99..f854d89b7edf 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2007-2008 Advanced Micro Devices, Inc. 2 * Copyright (C) 2007-2009 Advanced Micro Devices, Inc.
3 * Author: Joerg Roedel <joerg.roedel@amd.com> 3 * Author: Joerg Roedel <joerg.roedel@amd.com>
4 * Leo Duran <leo.duran@amd.com> 4 * Leo Duran <leo.duran@amd.com>
5 * 5 *
@@ -18,8 +18,8 @@
18 */ 18 */
19 19
20#include <linux/pci.h> 20#include <linux/pci.h>
21#include <linux/gfp.h> 21#include <linux/bitmap.h>
22#include <linux/bitops.h> 22#include <linux/slab.h>
23#include <linux/debugfs.h> 23#include <linux/debugfs.h>
24#include <linux/scatterlist.h> 24#include <linux/scatterlist.h>
25#include <linux/dma-mapping.h> 25#include <linux/dma-mapping.h>
@@ -28,6 +28,7 @@
28#include <asm/proto.h> 28#include <asm/proto.h>
29#include <asm/iommu.h> 29#include <asm/iommu.h>
30#include <asm/gart.h> 30#include <asm/gart.h>
31#include <asm/amd_iommu_proto.h>
31#include <asm/amd_iommu_types.h> 32#include <asm/amd_iommu_types.h>
32#include <asm/amd_iommu.h> 33#include <asm/amd_iommu.h>
33 34
@@ -56,20 +57,152 @@ struct iommu_cmd {
56 u32 data[4]; 57 u32 data[4];
57}; 58};
58 59
59static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
60 struct unity_map_entry *e);
61static struct dma_ops_domain *find_protection_domain(u16 devid);
62static u64 *alloc_pte(struct protection_domain *domain,
63 unsigned long address, int end_lvl,
64 u64 **pte_page, gfp_t gfp);
65static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
66 unsigned long start_page,
67 unsigned int pages);
68static void reset_iommu_command_buffer(struct amd_iommu *iommu); 60static void reset_iommu_command_buffer(struct amd_iommu *iommu);
69static u64 *fetch_pte(struct protection_domain *domain,
70 unsigned long address, int map_size);
71static void update_domain(struct protection_domain *domain); 61static void update_domain(struct protection_domain *domain);
72 62
63/****************************************************************************
64 *
65 * Helper functions
66 *
67 ****************************************************************************/
68
/*
 * Build the 16-bit device id of @dev by passing its PCI bus number and
 * devfn to calc_devid().
 *
 * @dev is converted with to_pci_dev() without any check, so this presumably
 * must only be called for PCI devices (check_device() tests dev->bus before
 * calling it).
 */
69static inline u16 get_device_id(struct device *dev)
70{
71 struct pci_dev *pdev = to_pci_dev(dev);
72
73 return calc_devid(pdev->bus->number, pdev->devfn);
74}
75
/*
 * Return the per-device IOMMU data stored in dev->archdata.iommu.
 * That pointer is assigned by iommu_init_device(); presumably it is NULL
 * for devices that were never initialised (TODO confirm).
 */
76static struct iommu_dev_data *get_dev_data(struct device *dev)
77{
78 return dev->archdata.iommu;
79}
80
81/*
82 * In this function the list of preallocated protection domains is traversed to
83 * find the domain for a specific device
 *
 * An entry matches when its target_dev equals either @devid or the alias
 * stored for @devid in amd_iommu_alias_table. Returns the first matching
 * entry, or NULL if the list is empty or nothing matches.
84 */
85static struct dma_ops_domain *find_protection_domain(u16 devid)
86{
87 struct dma_ops_domain *entry, *ret = NULL;
88 unsigned long flags;
 /*
  * NOTE(review): @devid indexes the alias table without a range check
  * here; check_device() compares device ids against amd_iommu_last_bdf,
  * so confirm callers run that check first.
  */
89 u16 alias = amd_iommu_alias_table[devid];
90
 /* Early exit: the list is peeked at without taking iommu_pd_list_lock. */
91 if (list_empty(&iommu_pd_list))
92 return NULL;
93
94 spin_lock_irqsave(&iommu_pd_list_lock, flags);
95
96 list_for_each_entry(entry, &iommu_pd_list, list) {
97 if (entry->target_dev == devid ||
98 entry->target_dev == alias) {
99 ret = entry;
100 break;
101 }
102 }
103
104 spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
105
 /* The lock is already dropped when the entry is handed to the caller. */
106 return ret;
107}
108
109/*
110 * This function checks if the driver got a valid device from the caller to
111 * avoid dereferencing invalid pointers.
 *
 * Returns true only if @dev is non-NULL, has a dma_mask, is on the PCI bus,
 * has a device id not above amd_iommu_last_bdf and has a non-NULL entry in
 * amd_iommu_rlookup_table. Returns false in every other case.
112 */
113static bool check_device(struct device *dev)
114{
115 u16 devid;
116
117 if (!dev || !dev->dma_mask)
118 return false;
119
120 /* Not a PCI device (a NULL dev was already rejected above) */
121 if (dev->bus != &pci_bus_type)
122 return false;
123
 /* Computed only after the bus check because get_device_id() uses to_pci_dev(). */
124 devid = get_device_id(dev);
125
126 /* Out of our scope? */
127 if (devid > amd_iommu_last_bdf)
128 return false;
129
 /* No IOMMU recorded for this device id in the lookup table */
130 if (amd_iommu_rlookup_table[devid] == NULL)
131 return false;
132
133 return true;
134}
135
/*
 * Allocate the per-device IOMMU data for @dev and store it in
 * dev->archdata.iommu.
 *
 * Does nothing if dev->archdata.iommu is already set. Otherwise a zeroed
 * struct iommu_dev_data is allocated, linked back to @dev, and pointed at
 * the device found for @dev's alias id, if there is one.
 *
 * Returns 0 on success or when the data already exists, -ENOMEM when the
 * allocation fails.
 */
136static int iommu_init_device(struct device *dev)
137{
138 struct iommu_dev_data *dev_data;
139 struct pci_dev *pdev;
140 u16 devid, alias;
141
 /* Any non-NULL value counts as "already initialised". */
142 if (dev->archdata.iommu)
143 return 0;
144
145 dev_data = kzalloc(sizeof(*dev_data), GFP_KERNEL);
146 if (!dev_data)
147 return -ENOMEM;
148
149 dev_data->dev = dev;
150
151 devid = get_device_id(dev);
152 alias = amd_iommu_alias_table[devid];
 /*
  * Look up the pci_dev for the alias id: the bus comes from
  * PCI_BUS(alias), the low 8 bits are passed as devfn. If nothing is
  * found, dev_data->alias keeps the NULL it got from kzalloc().
  *
  * NOTE(review): pci_get_bus_and_slot() is documented to return the
  * device with its reference count raised; no matching pci_dev_put() is
  * visible in this excerpt - confirm the reference is dropped elsewhere.
  */
153 pdev = pci_get_bus_and_slot(PCI_BUS(alias), alias & 0xff);
154 if (pdev)
155 dev_data->alias = &pdev->dev;
156
157 atomic_set(&dev_data->bind, 0);
158
 /* Publish the data only after it is fully set up. */
159 dev->archdata.iommu = dev_data;
160
161
162 return 0;
163}
164
/*
 * Free the per-device IOMMU data of @dev.
 *
 * NOTE(review): dev->archdata.iommu is not reset to NULL after the kfree(),
 * while iommu_init_device() treats any non-NULL value as "already
 * initialised". Confirm nothing re-initialises or dereferences the pointer
 * after this call.
 */
165static void iommu_uninit_device(struct device *dev)
166{
167 kfree(dev->archdata.iommu);
168}
169
/*
 * Walk every PCI device and free the per-device IOMMU data of those that
 * pass check_device().
 */
170void __init amd_iommu_uninit_devices(void)
171{
172 struct pci_dev *pdev = NULL;
173
174 for_each_pci_dev(pdev) {
175
 /* Devices rejected by check_device() are skipped. */
176 if (!check_device(&pdev->dev))
177 continue;
178
179 iommu_uninit_device(&pdev->dev);
180 }
181}
182
/*
 * Set up the per-device IOMMU data for every PCI device that passes
 * check_device().
 *
 * Returns 0 when all devices were handled. If iommu_init_device() fails for
 * one device, amd_iommu_uninit_devices() is called to tear the data down
 * again and that error code is returned.
 */
183int __init amd_iommu_init_devices(void)
184{
185 struct pci_dev *pdev = NULL;
186 int ret = 0;
187
188 for_each_pci_dev(pdev) {
189
190 if (!check_device(&pdev->dev))
191 continue;
192
193 ret = iommu_init_device(&pdev->dev);
 /*
  * NOTE(review): this leaves for_each_pci_dev() early. The iterator is
  * presumably built on pci_get_device(), which holds a reference on
  * the current device - confirm whether a pci_dev_put() is needed.
  */
194 if (ret)
195 goto out_free;
196 }
197
198 return 0;
199
200out_free:
201
 /* Undo the work for all devices, not only the ones initialised so far. */
202 amd_iommu_uninit_devices();
203
204 return ret;
205}
73#ifdef CONFIG_AMD_IOMMU_STATS 206#ifdef CONFIG_AMD_IOMMU_STATS
74 207
75/* 208/*
@@ -90,7 +223,6 @@ DECLARE_STATS_COUNTER(alloced_io_mem);
90DECLARE_STATS_COUNTER(total_map_requests); 223DECLARE_STATS_COUNTER(total_map_requests);
91 224
92static struct dentry *stats_dir; 225static struct dentry *stats_dir;
93static struct dentry *de_isolate;
94static struct dentry *de_fflush; 226static struct dentry *de_fflush;
95 227
96static void amd_iommu_stats_add(struct __iommu_counter *cnt) 228static void amd_iommu_stats_add(struct __iommu_counter *cnt)
@@ -108,9 +240,6 @@ static void amd_iommu_stats_init(void)
108 if (stats_dir == NULL) 240 if (stats_dir == NULL)
109 return; 241 return;
110 242
111 de_isolate = debugfs_create_bool("isolation", 0444, stats_dir,
112 (u32 *)&amd_iommu_isolate);
113
114 de_fflush = debugfs_create_bool("fullflush", 0444, stats_dir, 243 de_fflush = debugfs_create_bool("fullflush", 0444, stats_dir,
115 (u32 *)&amd_iommu_unmap_flush); 244 (u32 *)&amd_iommu_unmap_flush);
116 245
@@ -130,12 +259,6 @@ static void amd_iommu_stats_init(void)
130 259
131#endif 260#endif
132 261
133/* returns !0 if the IOMMU is caching non-present entries in its TLB */
134static int iommu_has_npcache(struct amd_iommu *iommu)
135{
136 return iommu->cap & (1UL << IOMMU_CAP_NPCACHE);
137}
138
139/**************************************************************************** 262/****************************************************************************
140 * 263 *
141 * Interrupt handling functions 264 * Interrupt handling functions
@@ -199,6 +322,7 @@ static void iommu_print_event(struct amd_iommu *iommu, void *__evt)
199 break; 322 break;
200 case EVENT_TYPE_ILL_CMD: 323 case EVENT_TYPE_ILL_CMD:
201 printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address); 324 printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address);
325 iommu->reset_in_progress = true;
202 reset_iommu_command_buffer(iommu); 326 reset_iommu_command_buffer(iommu);
203 dump_command(address); 327 dump_command(address);
204 break; 328 break;
@@ -268,6 +392,7 @@ static int __iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
268 u32 tail, head; 392 u32 tail, head;
269 u8 *target; 393 u8 *target;
270 394
395 WARN_ON(iommu->cmd_buf_size & CMD_BUFFER_UNINITIALIZED);
271 tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); 396 tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
272 target = iommu->cmd_buf + tail; 397 target = iommu->cmd_buf + tail;
273 memcpy_toio(target, cmd, sizeof(*cmd)); 398 memcpy_toio(target, cmd, sizeof(*cmd));
@@ -321,11 +446,8 @@ static void __iommu_wait_for_completion(struct amd_iommu *iommu)
321 status &= ~MMIO_STATUS_COM_WAIT_INT_MASK; 446 status &= ~MMIO_STATUS_COM_WAIT_INT_MASK;
322 writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET); 447 writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET);
323 448
324 if (unlikely(i == EXIT_LOOP_COUNT)) { 449 if (unlikely(i == EXIT_LOOP_COUNT))
325 spin_unlock(&iommu->lock); 450 iommu->reset_in_progress = true;
326 reset_iommu_command_buffer(iommu);
327 spin_lock(&iommu->lock);
328 }
329} 451}
330 452
331/* 453/*
@@ -372,26 +494,46 @@ static int iommu_completion_wait(struct amd_iommu *iommu)
372out: 494out:
373 spin_unlock_irqrestore(&iommu->lock, flags); 495 spin_unlock_irqrestore(&iommu->lock, flags);
374 496
497 if (iommu->reset_in_progress)
498 reset_iommu_command_buffer(iommu);
499
375 return 0; 500 return 0;
376} 501}
377 502
/*
 * Call iommu_completion_wait() on every IOMMU for which
 * domain->dev_iommu[] holds a non-zero count (do_attach() increments that
 * count per device). IOMMUs with a zero count are skipped. The return value
 * of iommu_completion_wait() is ignored.
 */
503static void iommu_flush_complete(struct protection_domain *domain)
504{
505 int i;
506
507 for (i = 0; i < amd_iommus_present; ++i) {
508 if (!domain->dev_iommu[i])
509 continue;
510
511 /*
512 * Devices of this domain are behind this IOMMU
513 * We need to wait for completion of all commands.
514 */
515 iommu_completion_wait(amd_iommus[i]);
516 }
517}
518
378/* 519/*
379 * Command send function for invalidating a device table entry 520 * Command send function for invalidating a device table entry
380 */ 521 */
381static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid) 522static int iommu_flush_device(struct device *dev)
382{ 523{
524 struct amd_iommu *iommu;
383 struct iommu_cmd cmd; 525 struct iommu_cmd cmd;
384 int ret; 526 u16 devid;
385 527
386 BUG_ON(iommu == NULL); 528 devid = get_device_id(dev);
529 iommu = amd_iommu_rlookup_table[devid];
387 530
531 /* Build command */
388 memset(&cmd, 0, sizeof(cmd)); 532 memset(&cmd, 0, sizeof(cmd));
389 CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY); 533 CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY);
390 cmd.data[0] = devid; 534 cmd.data[0] = devid;
391 535
392 ret = iommu_queue_command(iommu, &cmd); 536 return iommu_queue_command(iommu, &cmd);
393
394 return ret;
395} 537}
396 538
397static void __iommu_build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address, 539static void __iommu_build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address,
@@ -430,11 +572,11 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
430 * It invalidates a single PTE if the range to flush is within a single 572 * It invalidates a single PTE if the range to flush is within a single
431 * page. Otherwise it flushes the whole TLB of the IOMMU. 573 * page. Otherwise it flushes the whole TLB of the IOMMU.
432 */ 574 */
433static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid, 575static void __iommu_flush_pages(struct protection_domain *domain,
434 u64 address, size_t size) 576 u64 address, size_t size, int pde)
435{ 577{
436 int s = 0; 578 int s = 0, i;
437 unsigned pages = iommu_num_pages(address, size, PAGE_SIZE); 579 unsigned long pages = iommu_num_pages(address, size, PAGE_SIZE);
438 580
439 address &= PAGE_MASK; 581 address &= PAGE_MASK;
440 582
@@ -447,142 +589,212 @@ static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid,
447 s = 1; 589 s = 1;
448 } 590 }
449 591
450 iommu_queue_inv_iommu_pages(iommu, address, domid, 0, s);
451 592
452 return 0; 593 for (i = 0; i < amd_iommus_present; ++i) {
594 if (!domain->dev_iommu[i])
595 continue;
596
597 /*
598 * Devices of this domain are behind this IOMMU
599 * We need a TLB flush
600 */
601 iommu_queue_inv_iommu_pages(amd_iommus[i], address,
602 domain->id, pde, s);
603 }
604
605 return;
453} 606}
454 607
455/* Flush the whole IO/TLB for a given protection domain */ 608static void iommu_flush_pages(struct protection_domain *domain,
456static void iommu_flush_tlb(struct amd_iommu *iommu, u16 domid) 609 u64 address, size_t size)
457{ 610{
458 u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS; 611 __iommu_flush_pages(domain, address, size, 0);
459 612}
460 INC_STATS_COUNTER(domain_flush_single);
461 613
462 iommu_queue_inv_iommu_pages(iommu, address, domid, 0, 1); 614/* Flush the whole IO/TLB for a given protection domain */
615static void iommu_flush_tlb(struct protection_domain *domain)
616{
617 __iommu_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 0);
463} 618}
464 619
465/* Flush the whole IO/TLB for a given protection domain - including PDE */ 620/* Flush the whole IO/TLB for a given protection domain - including PDE */
466static void iommu_flush_tlb_pde(struct amd_iommu *iommu, u16 domid) 621static void iommu_flush_tlb_pde(struct protection_domain *domain)
467{ 622{
468 u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS; 623 __iommu_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1);
469
470 INC_STATS_COUNTER(domain_flush_single);
471
472 iommu_queue_inv_iommu_pages(iommu, address, domid, 1, 1);
473} 624}
474 625
626
475/* 627/*
476 * This function flushes one domain on one IOMMU 628 * This function flushes the DTEs for all devices in domain
477 */ 629 */
478static void flush_domain_on_iommu(struct amd_iommu *iommu, u16 domid) 630static void iommu_flush_domain_devices(struct protection_domain *domain)
479{ 631{
480 struct iommu_cmd cmd; 632 struct iommu_dev_data *dev_data;
481 unsigned long flags; 633 unsigned long flags;
482 634
483 __iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 635 spin_lock_irqsave(&domain->lock, flags);
484 domid, 1, 1);
485 636
486 spin_lock_irqsave(&iommu->lock, flags); 637 list_for_each_entry(dev_data, &domain->dev_list, list)
487 __iommu_queue_command(iommu, &cmd); 638 iommu_flush_device(dev_data->dev);
488 __iommu_completion_wait(iommu); 639
489 __iommu_wait_for_completion(iommu); 640 spin_unlock_irqrestore(&domain->lock, flags);
490 spin_unlock_irqrestore(&iommu->lock, flags);
491} 641}
492 642
493static void flush_all_domains_on_iommu(struct amd_iommu *iommu) 643static void iommu_flush_all_domain_devices(void)
494{ 644{
495 int i; 645 struct protection_domain *domain;
646 unsigned long flags;
496 647
497 for (i = 1; i < MAX_DOMAIN_ID; ++i) { 648 spin_lock_irqsave(&amd_iommu_pd_lock, flags);
498 if (!test_bit(i, amd_iommu_pd_alloc_bitmap)) 649
499 continue; 650 list_for_each_entry(domain, &amd_iommu_pd_list, list) {
500 flush_domain_on_iommu(iommu, i); 651 iommu_flush_domain_devices(domain);
652 iommu_flush_complete(domain);
501 } 653 }
502 654
655 spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
656}
657
658void amd_iommu_flush_all_devices(void)
659{
660 iommu_flush_all_domain_devices();
503} 661}
504 662
505/* 663/*
506 * This function is used to flush the IO/TLB for a given protection domain 664 * This function uses heavy locking and may disable irqs for some time. But
507 * on every IOMMU in the system 665 * this is no issue because it is only called during resume.
508 */ 666 */
509static void iommu_flush_domain(u16 domid) 667void amd_iommu_flush_all_domains(void)
510{ 668{
511 struct amd_iommu *iommu; 669 struct protection_domain *domain;
670 unsigned long flags;
512 671
513 INC_STATS_COUNTER(domain_flush_all); 672 spin_lock_irqsave(&amd_iommu_pd_lock, flags);
514 673
515 for_each_iommu(iommu) 674 list_for_each_entry(domain, &amd_iommu_pd_list, list) {
516 flush_domain_on_iommu(iommu, domid); 675 spin_lock(&domain->lock);
676 iommu_flush_tlb_pde(domain);
677 iommu_flush_complete(domain);
678 spin_unlock(&domain->lock);
679 }
680
681 spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
517} 682}
518 683
519void amd_iommu_flush_all_domains(void) 684static void reset_iommu_command_buffer(struct amd_iommu *iommu)
520{ 685{
521 struct amd_iommu *iommu; 686 pr_err("AMD-Vi: Resetting IOMMU command buffer\n");
522 687
523 for_each_iommu(iommu) 688 if (iommu->reset_in_progress)
524 flush_all_domains_on_iommu(iommu); 689 panic("AMD-Vi: ILLEGAL_COMMAND_ERROR while resetting command buffer\n");
690
691 amd_iommu_reset_cmd_buffer(iommu);
692 amd_iommu_flush_all_devices();
693 amd_iommu_flush_all_domains();
694
695 iommu->reset_in_progress = false;
525} 696}
526 697
527static void flush_all_devices_for_iommu(struct amd_iommu *iommu) 698/****************************************************************************
699 *
700 * The functions below are used the create the page table mappings for
701 * unity mapped regions.
702 *
703 ****************************************************************************/
704
705/*
706 * This function is used to add another level to an IO page table. Adding
707 * another level increases the size of the address space by 9 bits to a size up
708 * to 64 bits.
709 */
710static bool increase_address_space(struct protection_domain *domain,
711 gfp_t gfp)
528{ 712{
529 int i; 713 u64 *pte;
530 714
531 for (i = 0; i <= amd_iommu_last_bdf; ++i) { 715 if (domain->mode == PAGE_MODE_6_LEVEL)
532 if (iommu != amd_iommu_rlookup_table[i]) 716 /* address space already 64 bit large */
533 continue; 717 return false;
534 718
535 iommu_queue_inv_dev_entry(iommu, i); 719 pte = (void *)get_zeroed_page(gfp);
536 iommu_completion_wait(iommu); 720 if (!pte)
537 } 721 return false;
722
723 *pte = PM_LEVEL_PDE(domain->mode,
724 virt_to_phys(domain->pt_root));
725 domain->pt_root = pte;
726 domain->mode += 1;
727 domain->updated = true;
728
729 return true;
538} 730}
539 731
540static void flush_devices_by_domain(struct protection_domain *domain) 732static u64 *alloc_pte(struct protection_domain *domain,
733 unsigned long address,
734 int end_lvl,
735 u64 **pte_page,
736 gfp_t gfp)
541{ 737{
542 struct amd_iommu *iommu; 738 u64 *pte, *page;
543 int i; 739 int level;
544 740
545 for (i = 0; i <= amd_iommu_last_bdf; ++i) { 741 while (address > PM_LEVEL_SIZE(domain->mode))
546 if ((domain == NULL && amd_iommu_pd_table[i] == NULL) || 742 increase_address_space(domain, gfp);
547 (amd_iommu_pd_table[i] != domain))
548 continue;
549 743
550 iommu = amd_iommu_rlookup_table[i]; 744 level = domain->mode - 1;
551 if (!iommu) 745 pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
552 continue;
553 746
554 iommu_queue_inv_dev_entry(iommu, i); 747 while (level > end_lvl) {
555 iommu_completion_wait(iommu); 748 if (!IOMMU_PTE_PRESENT(*pte)) {
749 page = (u64 *)get_zeroed_page(gfp);
750 if (!page)
751 return NULL;
752 *pte = PM_LEVEL_PDE(level, virt_to_phys(page));
753 }
754
755 level -= 1;
756
757 pte = IOMMU_PTE_PAGE(*pte);
758
759 if (pte_page && level == end_lvl)
760 *pte_page = pte;
761
762 pte = &pte[PM_LEVEL_INDEX(level, address)];
556 } 763 }
764
765 return pte;
557} 766}
558 767
559static void reset_iommu_command_buffer(struct amd_iommu *iommu) 768/*
769 * This function checks if there is a PTE for a given dma address. If
770 * there is one, it returns the pointer to it.
771 */
772static u64 *fetch_pte(struct protection_domain *domain,
773 unsigned long address, int map_size)
560{ 774{
561 pr_err("AMD-Vi: Resetting IOMMU command buffer\n"); 775 int level;
776 u64 *pte;
562 777
563 if (iommu->reset_in_progress) 778 level = domain->mode - 1;
564 panic("AMD-Vi: ILLEGAL_COMMAND_ERROR while resetting command buffer\n"); 779 pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
565 780
566 iommu->reset_in_progress = true; 781 while (level > map_size) {
782 if (!IOMMU_PTE_PRESENT(*pte))
783 return NULL;
567 784
568 amd_iommu_reset_cmd_buffer(iommu); 785 level -= 1;
569 flush_all_devices_for_iommu(iommu);
570 flush_all_domains_on_iommu(iommu);
571 786
572 iommu->reset_in_progress = false; 787 pte = IOMMU_PTE_PAGE(*pte);
573} 788 pte = &pte[PM_LEVEL_INDEX(level, address)];
574 789
575void amd_iommu_flush_all_devices(void) 790 if ((PM_PTE_LEVEL(*pte) == 0) && level != map_size) {
576{ 791 pte = NULL;
577 flush_devices_by_domain(NULL); 792 break;
578} 793 }
794 }
579 795
580/**************************************************************************** 796 return pte;
581 * 797}
582 * The functions below are used the create the page table mappings for
583 * unity mapped regions.
584 *
585 ****************************************************************************/
586 798
587/* 799/*
588 * Generic mapping functions. It maps a physical address into a DMA 800 * Generic mapping functions. It maps a physical address into a DMA
@@ -654,28 +866,6 @@ static int iommu_for_unity_map(struct amd_iommu *iommu,
654} 866}
655 867
656/* 868/*
657 * Init the unity mappings for a specific IOMMU in the system
658 *
659 * Basically iterates over all unity mapping entries and applies them to
660 * the default domain DMA of that IOMMU if necessary.
661 */
662static int iommu_init_unity_mappings(struct amd_iommu *iommu)
663{
664 struct unity_map_entry *entry;
665 int ret;
666
667 list_for_each_entry(entry, &amd_iommu_unity_map, list) {
668 if (!iommu_for_unity_map(iommu, entry))
669 continue;
670 ret = dma_ops_unity_map(iommu->default_dom, entry);
671 if (ret)
672 return ret;
673 }
674
675 return 0;
676}
677
678/*
679 * This function actually applies the mapping to the page table of the 869 * This function actually applies the mapping to the page table of the
680 * dma_ops domain. 870 * dma_ops domain.
681 */ 871 */
@@ -704,6 +894,28 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
704} 894}
705 895
706/* 896/*
897 * Init the unity mappings for a specific IOMMU in the system
898 *
899 * Basically iterates over all unity mapping entries and applies them to
900 * the default domain DMA of that IOMMU if necessary.
901 */
902static int iommu_init_unity_mappings(struct amd_iommu *iommu)
903{
904 struct unity_map_entry *entry;
905 int ret;
906
907 list_for_each_entry(entry, &amd_iommu_unity_map, list) {
908 if (!iommu_for_unity_map(iommu, entry))
909 continue;
910 ret = dma_ops_unity_map(iommu->default_dom, entry);
911 if (ret)
912 return ret;
913 }
914
915 return 0;
916}
917
918/*
707 * Inits the unity mappings required for a specific device 919 * Inits the unity mappings required for a specific device
708 */ 920 */
709static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom, 921static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
@@ -740,34 +952,23 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
740 */ 952 */
741 953
742/* 954/*
743 * This function checks if there is a PTE for a given dma address. If 955 * Used to reserve address ranges in the aperture (e.g. for exclusion
744 * there is one, it returns the pointer to it. 956 * ranges.
745 */ 957 */
746static u64 *fetch_pte(struct protection_domain *domain, 958static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
747 unsigned long address, int map_size) 959 unsigned long start_page,
960 unsigned int pages)
748{ 961{
749 int level; 962 unsigned int i, last_page = dom->aperture_size >> PAGE_SHIFT;
750 u64 *pte;
751
752 level = domain->mode - 1;
753 pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
754
755 while (level > map_size) {
756 if (!IOMMU_PTE_PRESENT(*pte))
757 return NULL;
758
759 level -= 1;
760 963
761 pte = IOMMU_PTE_PAGE(*pte); 964 if (start_page + pages > last_page)
762 pte = &pte[PM_LEVEL_INDEX(level, address)]; 965 pages = last_page - start_page;
763 966
764 if ((PM_PTE_LEVEL(*pte) == 0) && level != map_size) { 967 for (i = start_page; i < start_page + pages; ++i) {
765 pte = NULL; 968 int index = i / APERTURE_RANGE_PAGES;
766 break; 969 int page = i % APERTURE_RANGE_PAGES;
767 } 970 __set_bit(page, dom->aperture[index]->bitmap);
768 } 971 }
769
770 return pte;
771} 972}
772 973
773/* 974/*
@@ -775,12 +976,12 @@ static u64 *fetch_pte(struct protection_domain *domain,
775 * aperture in case of dma_ops domain allocation or address allocation 976 * aperture in case of dma_ops domain allocation or address allocation
776 * failure. 977 * failure.
777 */ 978 */
778static int alloc_new_range(struct amd_iommu *iommu, 979static int alloc_new_range(struct dma_ops_domain *dma_dom,
779 struct dma_ops_domain *dma_dom,
780 bool populate, gfp_t gfp) 980 bool populate, gfp_t gfp)
781{ 981{
782 int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT; 982 int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT;
783 int i; 983 struct amd_iommu *iommu;
984 unsigned long i;
784 985
785#ifdef CONFIG_IOMMU_STRESS 986#ifdef CONFIG_IOMMU_STRESS
786 populate = false; 987 populate = false;
@@ -819,14 +1020,17 @@ static int alloc_new_range(struct amd_iommu *iommu,
819 dma_dom->aperture_size += APERTURE_RANGE_SIZE; 1020 dma_dom->aperture_size += APERTURE_RANGE_SIZE;
820 1021
821 /* Intialize the exclusion range if necessary */ 1022 /* Intialize the exclusion range if necessary */
822 if (iommu->exclusion_start && 1023 for_each_iommu(iommu) {
823 iommu->exclusion_start >= dma_dom->aperture[index]->offset && 1024 if (iommu->exclusion_start &&
824 iommu->exclusion_start < dma_dom->aperture_size) { 1025 iommu->exclusion_start >= dma_dom->aperture[index]->offset
825 unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT; 1026 && iommu->exclusion_start < dma_dom->aperture_size) {
826 int pages = iommu_num_pages(iommu->exclusion_start, 1027 unsigned long startpage;
827 iommu->exclusion_length, 1028 int pages = iommu_num_pages(iommu->exclusion_start,
828 PAGE_SIZE); 1029 iommu->exclusion_length,
829 dma_ops_reserve_addresses(dma_dom, startpage, pages); 1030 PAGE_SIZE);
1031 startpage = iommu->exclusion_start >> PAGE_SHIFT;
1032 dma_ops_reserve_addresses(dma_dom, startpage, pages);
1033 }
830 } 1034 }
831 1035
832 /* 1036 /*
@@ -928,7 +1132,7 @@ static unsigned long dma_ops_alloc_addresses(struct device *dev,
928 } 1132 }
929 1133
930 if (unlikely(address == -1)) 1134 if (unlikely(address == -1))
931 address = bad_dma_address; 1135 address = DMA_ERROR_CODE;
932 1136
933 WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size); 1137 WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size);
934 1138
@@ -959,7 +1163,7 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom,
959 1163
960 address = (address % APERTURE_RANGE_SIZE) >> PAGE_SHIFT; 1164 address = (address % APERTURE_RANGE_SIZE) >> PAGE_SHIFT;
961 1165
962 iommu_area_free(range->bitmap, address, pages); 1166 bitmap_clear(range->bitmap, address, pages);
963 1167
964} 1168}
965 1169
@@ -973,6 +1177,31 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom,
973 * 1177 *
974 ****************************************************************************/ 1178 ****************************************************************************/
975 1179
1180/*
1181 * This function adds a protection domain to the global protection domain list
 *
 * The domain is inserted at the head of amd_iommu_pd_list while holding
 * amd_iommu_pd_lock (taken with spin_lock_irqsave()).
1182 */
1183static void add_domain_to_list(struct protection_domain *domain)
1184{
1185 unsigned long flags;
1186
1187 spin_lock_irqsave(&amd_iommu_pd_lock, flags);
1188 list_add(&domain->list, &amd_iommu_pd_list);
1189 spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
1190}
1191
1192/*
1193 * This function removes a protection domain from the global
1194 * protection domain list
 *
 * The list entry is unlinked while holding amd_iommu_pd_lock (taken with
 * spin_lock_irqsave()). Whether the domain is currently on the list is not
 * checked here.
1195 */
1196static void del_domain_from_list(struct protection_domain *domain)
1197{
1198 unsigned long flags;
1199
1200 spin_lock_irqsave(&amd_iommu_pd_lock, flags);
1201 list_del(&domain->list);
1202 spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
1203}
1204
976static u16 domain_id_alloc(void) 1205static u16 domain_id_alloc(void)
977{ 1206{
978 unsigned long flags; 1207 unsigned long flags;
@@ -1000,26 +1229,6 @@ static void domain_id_free(int id)
1000 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); 1229 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1001} 1230}
1002 1231
1003/*
1004 * Used to reserve address ranges in the aperture (e.g. for exclusion
1005 * ranges.
1006 */
1007static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
1008 unsigned long start_page,
1009 unsigned int pages)
1010{
1011 unsigned int i, last_page = dom->aperture_size >> PAGE_SHIFT;
1012
1013 if (start_page + pages > last_page)
1014 pages = last_page - start_page;
1015
1016 for (i = start_page; i < start_page + pages; ++i) {
1017 int index = i / APERTURE_RANGE_PAGES;
1018 int page = i % APERTURE_RANGE_PAGES;
1019 __set_bit(page, dom->aperture[index]->bitmap);
1020 }
1021}
1022
1023static void free_pagetable(struct protection_domain *domain) 1232static void free_pagetable(struct protection_domain *domain)
1024{ 1233{
1025 int i, j; 1234 int i, j;
@@ -1061,6 +1270,8 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom)
1061 if (!dom) 1270 if (!dom)
1062 return; 1271 return;
1063 1272
1273 del_domain_from_list(&dom->domain);
1274
1064 free_pagetable(&dom->domain); 1275 free_pagetable(&dom->domain);
1065 1276
1066 for (i = 0; i < APERTURE_MAX_RANGES; ++i) { 1277 for (i = 0; i < APERTURE_MAX_RANGES; ++i) {
@@ -1078,7 +1289,7 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom)
1078 * It also intializes the page table and the address allocator data 1289 * It also intializes the page table and the address allocator data
1079 * structures required for the dma_ops interface 1290 * structures required for the dma_ops interface
1080 */ 1291 */
1081static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu) 1292static struct dma_ops_domain *dma_ops_domain_alloc(void)
1082{ 1293{
1083 struct dma_ops_domain *dma_dom; 1294 struct dma_ops_domain *dma_dom;
1084 1295
@@ -1091,6 +1302,7 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu)
1091 dma_dom->domain.id = domain_id_alloc(); 1302 dma_dom->domain.id = domain_id_alloc();
1092 if (dma_dom->domain.id == 0) 1303 if (dma_dom->domain.id == 0)
1093 goto free_dma_dom; 1304 goto free_dma_dom;
1305 INIT_LIST_HEAD(&dma_dom->domain.dev_list);
1094 dma_dom->domain.mode = PAGE_MODE_2_LEVEL; 1306 dma_dom->domain.mode = PAGE_MODE_2_LEVEL;
1095 dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL); 1307 dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL);
1096 dma_dom->domain.flags = PD_DMA_OPS_MASK; 1308 dma_dom->domain.flags = PD_DMA_OPS_MASK;
@@ -1101,7 +1313,9 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu)
1101 dma_dom->need_flush = false; 1313 dma_dom->need_flush = false;
1102 dma_dom->target_dev = 0xffff; 1314 dma_dom->target_dev = 0xffff;
1103 1315
1104 if (alloc_new_range(iommu, dma_dom, true, GFP_KERNEL)) 1316 add_domain_to_list(&dma_dom->domain);
1317
1318 if (alloc_new_range(dma_dom, true, GFP_KERNEL))
1105 goto free_dma_dom; 1319 goto free_dma_dom;
1106 1320
1107 /* 1321 /*
@@ -1129,22 +1343,6 @@ static bool dma_ops_domain(struct protection_domain *domain)
1129 return domain->flags & PD_DMA_OPS_MASK; 1343 return domain->flags & PD_DMA_OPS_MASK;
1130} 1344}
1131 1345
1132/*
1133 * Find out the protection domain structure for a given PCI device. This
1134 * will give us the pointer to the page table root for example.
1135 */
1136static struct protection_domain *domain_for_device(u16 devid)
1137{
1138 struct protection_domain *dom;
1139 unsigned long flags;
1140
1141 read_lock_irqsave(&amd_iommu_devtable_lock, flags);
1142 dom = amd_iommu_pd_table[devid];
1143 read_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1144
1145 return dom;
1146}
1147
1148static void set_dte_entry(u16 devid, struct protection_domain *domain) 1346static void set_dte_entry(u16 devid, struct protection_domain *domain)
1149{ 1347{
1150 u64 pte_root = virt_to_phys(domain->pt_root); 1348 u64 pte_root = virt_to_phys(domain->pt_root);
@@ -1156,42 +1354,123 @@ static void set_dte_entry(u16 devid, struct protection_domain *domain)
1156 amd_iommu_dev_table[devid].data[2] = domain->id; 1354 amd_iommu_dev_table[devid].data[2] = domain->id;
1157 amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root); 1355 amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root);
1158 amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root); 1356 amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root);
1357}
1358
1359static void clear_dte_entry(u16 devid)
1360{
1361 /* remove entry from the device table seen by the hardware */
1362 amd_iommu_dev_table[devid].data[0] = IOMMU_PTE_P | IOMMU_PTE_TV;
1363 amd_iommu_dev_table[devid].data[1] = 0;
1364 amd_iommu_dev_table[devid].data[2] = 0;
1365
1366 amd_iommu_apply_erratum_63(devid);
1367}
1368
1369static void do_attach(struct device *dev, struct protection_domain *domain)
1370{
1371 struct iommu_dev_data *dev_data;
1372 struct amd_iommu *iommu;
1373 u16 devid;
1374
1375 devid = get_device_id(dev);
1376 iommu = amd_iommu_rlookup_table[devid];
1377 dev_data = get_dev_data(dev);
1378
1379 /* Update data structures */
1380 dev_data->domain = domain;
1381 list_add(&dev_data->list, &domain->dev_list);
1382 set_dte_entry(devid, domain);
1383
1384 /* Do reference counting */
1385 domain->dev_iommu[iommu->index] += 1;
1386 domain->dev_cnt += 1;
1159 1387
1160 amd_iommu_pd_table[devid] = domain; 1388 /* Flush the DTE entry */
1389 iommu_flush_device(dev);
1390}
1391
1392static void do_detach(struct device *dev)
1393{
1394 struct iommu_dev_data *dev_data;
1395 struct amd_iommu *iommu;
1396 u16 devid;
1397
1398 devid = get_device_id(dev);
1399 iommu = amd_iommu_rlookup_table[devid];
1400 dev_data = get_dev_data(dev);
1401
1402 /* decrease reference counters */
1403 dev_data->domain->dev_iommu[iommu->index] -= 1;
1404 dev_data->domain->dev_cnt -= 1;
1405
1406 /* Update data structures */
1407 dev_data->domain = NULL;
1408 list_del(&dev_data->list);
1409 clear_dte_entry(devid);
1410
1411 /* Flush the DTE entry */
1412 iommu_flush_device(dev);
1161} 1413}
1162 1414
1163/* 1415/*
1164 * If a device is not yet associated with a domain, this function does 1416 * If a device is not yet associated with a domain, this function does
1165 * assigns it visible for the hardware 1417 * assigns it visible for the hardware
1166 */ 1418 */
1167static void __attach_device(struct amd_iommu *iommu, 1419static int __attach_device(struct device *dev,
1168 struct protection_domain *domain, 1420 struct protection_domain *domain)
1169 u16 devid)
1170{ 1421{
1422 struct iommu_dev_data *dev_data, *alias_data;
1423
1424 dev_data = get_dev_data(dev);
1425 alias_data = get_dev_data(dev_data->alias);
1426
1427 if (!alias_data)
1428 return -EINVAL;
1429
1171 /* lock domain */ 1430 /* lock domain */
1172 spin_lock(&domain->lock); 1431 spin_lock(&domain->lock);
1173 1432
1174 /* update DTE entry */ 1433 /* Some sanity checks */
1175 set_dte_entry(devid, domain); 1434 if (alias_data->domain != NULL &&
1435 alias_data->domain != domain)
1436 return -EBUSY;
1176 1437
1177 domain->dev_cnt += 1; 1438 if (dev_data->domain != NULL &&
1439 dev_data->domain != domain)
1440 return -EBUSY;
1441
1442 /* Do real assignment */
1443 if (dev_data->alias != dev) {
1444 alias_data = get_dev_data(dev_data->alias);
1445 if (alias_data->domain == NULL)
1446 do_attach(dev_data->alias, domain);
1447
1448 atomic_inc(&alias_data->bind);
1449 }
1450
1451 if (dev_data->domain == NULL)
1452 do_attach(dev, domain);
1453
1454 atomic_inc(&dev_data->bind);
1178 1455
1179 /* ready */ 1456 /* ready */
1180 spin_unlock(&domain->lock); 1457 spin_unlock(&domain->lock);
1458
1459 return 0;
1181} 1460}
1182 1461
1183/* 1462/*
1184 * If a device is not yet associated with a domain, this function does 1463 * If a device is not yet associated with a domain, this function does
1185 * assigns it visible for the hardware 1464 * assigns it visible for the hardware
1186 */ 1465 */
1187static void attach_device(struct amd_iommu *iommu, 1466static int attach_device(struct device *dev,
1188 struct protection_domain *domain, 1467 struct protection_domain *domain)
1189 u16 devid)
1190{ 1468{
1191 unsigned long flags; 1469 unsigned long flags;
1470 int ret;
1192 1471
1193 write_lock_irqsave(&amd_iommu_devtable_lock, flags); 1472 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
1194 __attach_device(iommu, domain, devid); 1473 ret = __attach_device(dev, domain);
1195 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); 1474 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1196 1475
1197 /* 1476 /*
@@ -1199,98 +1478,130 @@ static void attach_device(struct amd_iommu *iommu,
1199 * left the caches in the IOMMU dirty. So we have to flush 1478 * left the caches in the IOMMU dirty. So we have to flush
1200 * here to evict all dirty stuff. 1479 * here to evict all dirty stuff.
1201 */ 1480 */
1202 iommu_queue_inv_dev_entry(iommu, devid); 1481 iommu_flush_tlb_pde(domain);
1203 iommu_flush_tlb_pde(iommu, domain->id); 1482
1483 return ret;
1204} 1484}
1205 1485
1206/* 1486/*
1207 * Removes a device from a protection domain (unlocked) 1487 * Removes a device from a protection domain (unlocked)
1208 */ 1488 */
1209static void __detach_device(struct protection_domain *domain, u16 devid) 1489static void __detach_device(struct device *dev)
1210{ 1490{
1491 struct iommu_dev_data *dev_data = get_dev_data(dev);
1492 struct iommu_dev_data *alias_data;
1493 struct protection_domain *domain;
1494 unsigned long flags;
1211 1495
1212 /* lock domain */ 1496 BUG_ON(!dev_data->domain);
1213 spin_lock(&domain->lock);
1214 1497
1215 /* remove domain from the lookup table */ 1498 domain = dev_data->domain;
1216 amd_iommu_pd_table[devid] = NULL;
1217 1499
1218 /* remove entry from the device table seen by the hardware */ 1500 spin_lock_irqsave(&domain->lock, flags);
1219 amd_iommu_dev_table[devid].data[0] = IOMMU_PTE_P | IOMMU_PTE_TV;
1220 amd_iommu_dev_table[devid].data[1] = 0;
1221 amd_iommu_dev_table[devid].data[2] = 0;
1222 1501
1223 amd_iommu_apply_erratum_63(devid); 1502 if (dev_data->alias != dev) {
1503 alias_data = get_dev_data(dev_data->alias);
1504 if (atomic_dec_and_test(&alias_data->bind))
1505 do_detach(dev_data->alias);
1506 }
1224 1507
1225 /* decrease reference counter */ 1508 if (atomic_dec_and_test(&dev_data->bind))
1226 domain->dev_cnt -= 1; 1509 do_detach(dev);
1227 1510
1228 /* ready */ 1511 spin_unlock_irqrestore(&domain->lock, flags);
1229 spin_unlock(&domain->lock);
1230 1512
1231 /* 1513 /*
1232 * If we run in passthrough mode the device must be assigned to the 1514 * If we run in passthrough mode the device must be assigned to the
1233 * passthrough domain if it is detached from any other domain 1515 * passthrough domain if it is detached from any other domain.
1516 * Make sure we can deassign from the pt_domain itself.
1234 */ 1517 */
1235 if (iommu_pass_through) { 1518 if (iommu_pass_through &&
1236 struct amd_iommu *iommu = amd_iommu_rlookup_table[devid]; 1519 (dev_data->domain == NULL && domain != pt_domain))
1237 __attach_device(iommu, pt_domain, devid); 1520 __attach_device(dev, pt_domain);
1238 }
1239} 1521}
1240 1522
1241/* 1523/*
1242 * Removes a device from a protection domain (with devtable_lock held) 1524 * Removes a device from a protection domain (with devtable_lock held)
1243 */ 1525 */
1244static void detach_device(struct protection_domain *domain, u16 devid) 1526static void detach_device(struct device *dev)
1245{ 1527{
1246 unsigned long flags; 1528 unsigned long flags;
1247 1529
1248 /* lock device table */ 1530 /* lock device table */
1249 write_lock_irqsave(&amd_iommu_devtable_lock, flags); 1531 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
1250 __detach_device(domain, devid); 1532 __detach_device(dev);
1251 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); 1533 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1252} 1534}
1253 1535
1536/*
1537 * Find out the protection domain structure for a given PCI device. This
1538 * will give us the pointer to the page table root for example.
1539 */
1540static struct protection_domain *domain_for_device(struct device *dev)
1541{
1542 struct protection_domain *dom;
1543 struct iommu_dev_data *dev_data, *alias_data;
1544 unsigned long flags;
1545 u16 devid, alias;
1546
1547 devid = get_device_id(dev);
1548 alias = amd_iommu_alias_table[devid];
1549 dev_data = get_dev_data(dev);
1550 alias_data = get_dev_data(dev_data->alias);
1551 if (!alias_data)
1552 return NULL;
1553
1554 read_lock_irqsave(&amd_iommu_devtable_lock, flags);
1555 dom = dev_data->domain;
1556 if (dom == NULL &&
1557 alias_data->domain != NULL) {
1558 __attach_device(dev, alias_data->domain);
1559 dom = alias_data->domain;
1560 }
1561
1562 read_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1563
1564 return dom;
1565}
1566
1254static int device_change_notifier(struct notifier_block *nb, 1567static int device_change_notifier(struct notifier_block *nb,
1255 unsigned long action, void *data) 1568 unsigned long action, void *data)
1256{ 1569{
1257 struct device *dev = data; 1570 struct device *dev = data;
1258 struct pci_dev *pdev = to_pci_dev(dev); 1571 u16 devid;
1259 u16 devid = calc_devid(pdev->bus->number, pdev->devfn);
1260 struct protection_domain *domain; 1572 struct protection_domain *domain;
1261 struct dma_ops_domain *dma_domain; 1573 struct dma_ops_domain *dma_domain;
1262 struct amd_iommu *iommu; 1574 struct amd_iommu *iommu;
1263 unsigned long flags; 1575 unsigned long flags;
1264 1576
1265 if (devid > amd_iommu_last_bdf) 1577 if (!check_device(dev))
1266 goto out; 1578 return 0;
1267
1268 devid = amd_iommu_alias_table[devid];
1269
1270 iommu = amd_iommu_rlookup_table[devid];
1271 if (iommu == NULL)
1272 goto out;
1273
1274 domain = domain_for_device(devid);
1275 1579
1276 if (domain && !dma_ops_domain(domain)) 1580 devid = get_device_id(dev);
1277 WARN_ONCE(1, "AMD IOMMU WARNING: device %s already bound " 1581 iommu = amd_iommu_rlookup_table[devid];
1278 "to a non-dma-ops domain\n", dev_name(dev));
1279 1582
1280 switch (action) { 1583 switch (action) {
1281 case BUS_NOTIFY_UNBOUND_DRIVER: 1584 case BUS_NOTIFY_UNBOUND_DRIVER:
1585
1586 domain = domain_for_device(dev);
1587
1282 if (!domain) 1588 if (!domain)
1283 goto out; 1589 goto out;
1284 if (iommu_pass_through) 1590 if (iommu_pass_through)
1285 break; 1591 break;
1286 detach_device(domain, devid); 1592 detach_device(dev);
1287 break; 1593 break;
1288 case BUS_NOTIFY_ADD_DEVICE: 1594 case BUS_NOTIFY_ADD_DEVICE:
1595
1596 iommu_init_device(dev);
1597
1598 domain = domain_for_device(dev);
1599
1289 /* allocate a protection domain if a device is added */ 1600 /* allocate a protection domain if a device is added */
1290 dma_domain = find_protection_domain(devid); 1601 dma_domain = find_protection_domain(devid);
1291 if (dma_domain) 1602 if (dma_domain)
1292 goto out; 1603 goto out;
1293 dma_domain = dma_ops_domain_alloc(iommu); 1604 dma_domain = dma_ops_domain_alloc();
1294 if (!dma_domain) 1605 if (!dma_domain)
1295 goto out; 1606 goto out;
1296 dma_domain->target_dev = devid; 1607 dma_domain->target_dev = devid;
@@ -1300,11 +1611,15 @@ static int device_change_notifier(struct notifier_block *nb,
1300 spin_unlock_irqrestore(&iommu_pd_list_lock, flags); 1611 spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
1301 1612
1302 break; 1613 break;
1614 case BUS_NOTIFY_DEL_DEVICE:
1615
1616 iommu_uninit_device(dev);
1617
1303 default: 1618 default:
1304 goto out; 1619 goto out;
1305 } 1620 }
1306 1621
1307 iommu_queue_inv_dev_entry(iommu, devid); 1622 iommu_flush_device(dev);
1308 iommu_completion_wait(iommu); 1623 iommu_completion_wait(iommu);
1309 1624
1310out: 1625out:
@@ -1315,6 +1630,11 @@ static struct notifier_block device_nb = {
1315 .notifier_call = device_change_notifier, 1630 .notifier_call = device_change_notifier,
1316}; 1631};
1317 1632
1633void amd_iommu_init_notifier(void)
1634{
1635 bus_register_notifier(&pci_bus_type, &device_nb);
1636}
1637
1318/***************************************************************************** 1638/*****************************************************************************
1319 * 1639 *
1320 * The next functions belong to the dma_ops mapping/unmapping code. 1640 * The next functions belong to the dma_ops mapping/unmapping code.
@@ -1322,106 +1642,46 @@ static struct notifier_block device_nb = {
1322 *****************************************************************************/ 1642 *****************************************************************************/
1323 1643
1324/* 1644/*
1325 * This function checks if the driver got a valid device from the caller to
1326 * avoid dereferencing invalid pointers.
1327 */
1328static bool check_device(struct device *dev)
1329{
1330 if (!dev || !dev->dma_mask)
1331 return false;
1332
1333 return true;
1334}
1335
1336/*
1337 * In this function the list of preallocated protection domains is traversed to
1338 * find the domain for a specific device
1339 */
1340static struct dma_ops_domain *find_protection_domain(u16 devid)
1341{
1342 struct dma_ops_domain *entry, *ret = NULL;
1343 unsigned long flags;
1344
1345 if (list_empty(&iommu_pd_list))
1346 return NULL;
1347
1348 spin_lock_irqsave(&iommu_pd_list_lock, flags);
1349
1350 list_for_each_entry(entry, &iommu_pd_list, list) {
1351 if (entry->target_dev == devid) {
1352 ret = entry;
1353 break;
1354 }
1355 }
1356
1357 spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
1358
1359 return ret;
1360}
1361
1362/*
1363 * In the dma_ops path we only have the struct device. This function 1645 * In the dma_ops path we only have the struct device. This function
1364 * finds the corresponding IOMMU, the protection domain and the 1646 * finds the corresponding IOMMU, the protection domain and the
1365 * requestor id for a given device. 1647 * requestor id for a given device.
1366 * If the device is not yet associated with a domain this is also done 1648 * If the device is not yet associated with a domain this is also done
1367 * in this function. 1649 * in this function.
1368 */ 1650 */
1369static int get_device_resources(struct device *dev, 1651static struct protection_domain *get_domain(struct device *dev)
1370 struct amd_iommu **iommu,
1371 struct protection_domain **domain,
1372 u16 *bdf)
1373{ 1652{
1653 struct protection_domain *domain;
1374 struct dma_ops_domain *dma_dom; 1654 struct dma_ops_domain *dma_dom;
1375 struct pci_dev *pcidev; 1655 u16 devid = get_device_id(dev);
1376 u16 _bdf;
1377
1378 *iommu = NULL;
1379 *domain = NULL;
1380 *bdf = 0xffff;
1381
1382 if (dev->bus != &pci_bus_type)
1383 return 0;
1384
1385 pcidev = to_pci_dev(dev);
1386 _bdf = calc_devid(pcidev->bus->number, pcidev->devfn);
1387 1656
1388 /* device not translated by any IOMMU in the system? */ 1657 if (!check_device(dev))
1389 if (_bdf > amd_iommu_last_bdf) 1658 return ERR_PTR(-EINVAL);
1390 return 0;
1391 1659
1392 *bdf = amd_iommu_alias_table[_bdf]; 1660 domain = domain_for_device(dev);
1661 if (domain != NULL && !dma_ops_domain(domain))
1662 return ERR_PTR(-EBUSY);
1393 1663
1394 *iommu = amd_iommu_rlookup_table[*bdf]; 1664 if (domain != NULL)
1395 if (*iommu == NULL) 1665 return domain;
1396 return 0;
1397 *domain = domain_for_device(*bdf);
1398 if (*domain == NULL) {
1399 dma_dom = find_protection_domain(*bdf);
1400 if (!dma_dom)
1401 dma_dom = (*iommu)->default_dom;
1402 *domain = &dma_dom->domain;
1403 attach_device(*iommu, *domain, *bdf);
1404 DUMP_printk("Using protection domain %d for device %s\n",
1405 (*domain)->id, dev_name(dev));
1406 }
1407 1666
1408 if (domain_for_device(_bdf) == NULL) 1667 /* Device not bount yet - bind it */
1409 attach_device(*iommu, *domain, _bdf); 1668 dma_dom = find_protection_domain(devid);
1669 if (!dma_dom)
1670 dma_dom = amd_iommu_rlookup_table[devid]->default_dom;
1671 attach_device(dev, &dma_dom->domain);
1672 DUMP_printk("Using protection domain %d for device %s\n",
1673 dma_dom->domain.id, dev_name(dev));
1410 1674
1411 return 1; 1675 return &dma_dom->domain;
1412} 1676}
1413 1677
1414static void update_device_table(struct protection_domain *domain) 1678static void update_device_table(struct protection_domain *domain)
1415{ 1679{
1416 unsigned long flags; 1680 struct iommu_dev_data *dev_data;
1417 int i;
1418 1681
1419 for (i = 0; i <= amd_iommu_last_bdf; ++i) { 1682 list_for_each_entry(dev_data, &domain->dev_list, list) {
1420 if (amd_iommu_pd_table[i] != domain) 1683 u16 devid = get_device_id(dev_data->dev);
1421 continue; 1684 set_dte_entry(devid, domain);
1422 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
1423 set_dte_entry(i, domain);
1424 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1425 } 1685 }
1426} 1686}
1427 1687
@@ -1431,76 +1691,13 @@ static void update_domain(struct protection_domain *domain)
1431 return; 1691 return;
1432 1692
1433 update_device_table(domain); 1693 update_device_table(domain);
1434 flush_devices_by_domain(domain); 1694 iommu_flush_domain_devices(domain);
1435 iommu_flush_domain(domain->id); 1695 iommu_flush_tlb_pde(domain);
1436 1696
1437 domain->updated = false; 1697 domain->updated = false;
1438} 1698}
1439 1699
1440/* 1700/*
1441 * This function is used to add another level to an IO page table. Adding
1442 * another level increases the size of the address space by 9 bits to a size up
1443 * to 64 bits.
1444 */
1445static bool increase_address_space(struct protection_domain *domain,
1446 gfp_t gfp)
1447{
1448 u64 *pte;
1449
1450 if (domain->mode == PAGE_MODE_6_LEVEL)
1451 /* address space already 64 bit large */
1452 return false;
1453
1454 pte = (void *)get_zeroed_page(gfp);
1455 if (!pte)
1456 return false;
1457
1458 *pte = PM_LEVEL_PDE(domain->mode,
1459 virt_to_phys(domain->pt_root));
1460 domain->pt_root = pte;
1461 domain->mode += 1;
1462 domain->updated = true;
1463
1464 return true;
1465}
1466
1467static u64 *alloc_pte(struct protection_domain *domain,
1468 unsigned long address,
1469 int end_lvl,
1470 u64 **pte_page,
1471 gfp_t gfp)
1472{
1473 u64 *pte, *page;
1474 int level;
1475
1476 while (address > PM_LEVEL_SIZE(domain->mode))
1477 increase_address_space(domain, gfp);
1478
1479 level = domain->mode - 1;
1480 pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
1481
1482 while (level > end_lvl) {
1483 if (!IOMMU_PTE_PRESENT(*pte)) {
1484 page = (u64 *)get_zeroed_page(gfp);
1485 if (!page)
1486 return NULL;
1487 *pte = PM_LEVEL_PDE(level, virt_to_phys(page));
1488 }
1489
1490 level -= 1;
1491
1492 pte = IOMMU_PTE_PAGE(*pte);
1493
1494 if (pte_page && level == end_lvl)
1495 *pte_page = pte;
1496
1497 pte = &pte[PM_LEVEL_INDEX(level, address)];
1498 }
1499
1500 return pte;
1501}
1502
1503/*
1504 * This function fetches the PTE for a given address in the aperture 1701 * This function fetches the PTE for a given address in the aperture
1505 */ 1702 */
1506static u64* dma_ops_get_pte(struct dma_ops_domain *dom, 1703static u64* dma_ops_get_pte(struct dma_ops_domain *dom,
@@ -1530,8 +1727,7 @@ static u64* dma_ops_get_pte(struct dma_ops_domain *dom,
1530 * This is the generic map function. It maps one 4kb page at paddr to 1727 * This is the generic map function. It maps one 4kb page at paddr to
1531 * the given address in the DMA address space for the domain. 1728 * the given address in the DMA address space for the domain.
1532 */ 1729 */
1533static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu, 1730static dma_addr_t dma_ops_domain_map(struct dma_ops_domain *dom,
1534 struct dma_ops_domain *dom,
1535 unsigned long address, 1731 unsigned long address,
1536 phys_addr_t paddr, 1732 phys_addr_t paddr,
1537 int direction) 1733 int direction)
@@ -1544,7 +1740,7 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
1544 1740
1545 pte = dma_ops_get_pte(dom, address); 1741 pte = dma_ops_get_pte(dom, address);
1546 if (!pte) 1742 if (!pte)
1547 return bad_dma_address; 1743 return DMA_ERROR_CODE;
1548 1744
1549 __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC; 1745 __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC;
1550 1746
@@ -1565,8 +1761,7 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
1565/* 1761/*
1566 * The generic unmapping function for on page in the DMA address space. 1762 * The generic unmapping function for on page in the DMA address space.
1567 */ 1763 */
1568static void dma_ops_domain_unmap(struct amd_iommu *iommu, 1764static void dma_ops_domain_unmap(struct dma_ops_domain *dom,
1569 struct dma_ops_domain *dom,
1570 unsigned long address) 1765 unsigned long address)
1571{ 1766{
1572 struct aperture_range *aperture; 1767 struct aperture_range *aperture;
@@ -1597,7 +1792,6 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu,
1597 * Must be called with the domain lock held. 1792 * Must be called with the domain lock held.
1598 */ 1793 */
1599static dma_addr_t __map_single(struct device *dev, 1794static dma_addr_t __map_single(struct device *dev,
1600 struct amd_iommu *iommu,
1601 struct dma_ops_domain *dma_dom, 1795 struct dma_ops_domain *dma_dom,
1602 phys_addr_t paddr, 1796 phys_addr_t paddr,
1603 size_t size, 1797 size_t size,
@@ -1625,7 +1819,7 @@ static dma_addr_t __map_single(struct device *dev,
1625retry: 1819retry:
1626 address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask, 1820 address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask,
1627 dma_mask); 1821 dma_mask);
1628 if (unlikely(address == bad_dma_address)) { 1822 if (unlikely(address == DMA_ERROR_CODE)) {
1629 /* 1823 /*
1630 * setting next_address here will let the address 1824 * setting next_address here will let the address
1631 * allocator only scan the new allocated range in the 1825 * allocator only scan the new allocated range in the
@@ -1633,11 +1827,11 @@ retry:
1633 */ 1827 */
1634 dma_dom->next_address = dma_dom->aperture_size; 1828 dma_dom->next_address = dma_dom->aperture_size;
1635 1829
1636 if (alloc_new_range(iommu, dma_dom, false, GFP_ATOMIC)) 1830 if (alloc_new_range(dma_dom, false, GFP_ATOMIC))
1637 goto out; 1831 goto out;
1638 1832
1639 /* 1833 /*
1640 * aperture was sucessfully enlarged by 128 MB, try 1834 * aperture was successfully enlarged by 128 MB, try
1641 * allocation again 1835 * allocation again
1642 */ 1836 */
1643 goto retry; 1837 goto retry;
@@ -1645,8 +1839,8 @@ retry:
1645 1839
1646 start = address; 1840 start = address;
1647 for (i = 0; i < pages; ++i) { 1841 for (i = 0; i < pages; ++i) {
1648 ret = dma_ops_domain_map(iommu, dma_dom, start, paddr, dir); 1842 ret = dma_ops_domain_map(dma_dom, start, paddr, dir);
1649 if (ret == bad_dma_address) 1843 if (ret == DMA_ERROR_CODE)
1650 goto out_unmap; 1844 goto out_unmap;
1651 1845
1652 paddr += PAGE_SIZE; 1846 paddr += PAGE_SIZE;
@@ -1657,10 +1851,10 @@ retry:
1657 ADD_STATS_COUNTER(alloced_io_mem, size); 1851 ADD_STATS_COUNTER(alloced_io_mem, size);
1658 1852
1659 if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) { 1853 if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) {
1660 iommu_flush_tlb(iommu, dma_dom->domain.id); 1854 iommu_flush_tlb(&dma_dom->domain);
1661 dma_dom->need_flush = false; 1855 dma_dom->need_flush = false;
1662 } else if (unlikely(iommu_has_npcache(iommu))) 1856 } else if (unlikely(amd_iommu_np_cache))
1663 iommu_flush_pages(iommu, dma_dom->domain.id, address, size); 1857 iommu_flush_pages(&dma_dom->domain, address, size);
1664 1858
1665out: 1859out:
1666 return address; 1860 return address;
@@ -1669,20 +1863,19 @@ out_unmap:
1669 1863
1670 for (--i; i >= 0; --i) { 1864 for (--i; i >= 0; --i) {
1671 start -= PAGE_SIZE; 1865 start -= PAGE_SIZE;
1672 dma_ops_domain_unmap(iommu, dma_dom, start); 1866 dma_ops_domain_unmap(dma_dom, start);
1673 } 1867 }
1674 1868
1675 dma_ops_free_addresses(dma_dom, address, pages); 1869 dma_ops_free_addresses(dma_dom, address, pages);
1676 1870
1677 return bad_dma_address; 1871 return DMA_ERROR_CODE;
1678} 1872}
1679 1873
1680/* 1874/*
1681 * Does the reverse of the __map_single function. Must be called with 1875 * Does the reverse of the __map_single function. Must be called with
1682 * the domain lock held too 1876 * the domain lock held too
1683 */ 1877 */
1684static void __unmap_single(struct amd_iommu *iommu, 1878static void __unmap_single(struct dma_ops_domain *dma_dom,
1685 struct dma_ops_domain *dma_dom,
1686 dma_addr_t dma_addr, 1879 dma_addr_t dma_addr,
1687 size_t size, 1880 size_t size,
1688 int dir) 1881 int dir)
@@ -1690,7 +1883,7 @@ static void __unmap_single(struct amd_iommu *iommu,
1690 dma_addr_t i, start; 1883 dma_addr_t i, start;
1691 unsigned int pages; 1884 unsigned int pages;
1692 1885
1693 if ((dma_addr == bad_dma_address) || 1886 if ((dma_addr == DMA_ERROR_CODE) ||
1694 (dma_addr + size > dma_dom->aperture_size)) 1887 (dma_addr + size > dma_dom->aperture_size))
1695 return; 1888 return;
1696 1889
@@ -1699,7 +1892,7 @@ static void __unmap_single(struct amd_iommu *iommu,
1699 start = dma_addr; 1892 start = dma_addr;
1700 1893
1701 for (i = 0; i < pages; ++i) { 1894 for (i = 0; i < pages; ++i) {
1702 dma_ops_domain_unmap(iommu, dma_dom, start); 1895 dma_ops_domain_unmap(dma_dom, start);
1703 start += PAGE_SIZE; 1896 start += PAGE_SIZE;
1704 } 1897 }
1705 1898
@@ -1708,7 +1901,7 @@ static void __unmap_single(struct amd_iommu *iommu,
1708 dma_ops_free_addresses(dma_dom, dma_addr, pages); 1901 dma_ops_free_addresses(dma_dom, dma_addr, pages);
1709 1902
1710 if (amd_iommu_unmap_flush || dma_dom->need_flush) { 1903 if (amd_iommu_unmap_flush || dma_dom->need_flush) {
1711 iommu_flush_pages(iommu, dma_dom->domain.id, dma_addr, size); 1904 iommu_flush_pages(&dma_dom->domain, dma_addr, size);
1712 dma_dom->need_flush = false; 1905 dma_dom->need_flush = false;
1713 } 1906 }
1714} 1907}
@@ -1722,36 +1915,29 @@ static dma_addr_t map_page(struct device *dev, struct page *page,
1722 struct dma_attrs *attrs) 1915 struct dma_attrs *attrs)
1723{ 1916{
1724 unsigned long flags; 1917 unsigned long flags;
1725 struct amd_iommu *iommu;
1726 struct protection_domain *domain; 1918 struct protection_domain *domain;
1727 u16 devid;
1728 dma_addr_t addr; 1919 dma_addr_t addr;
1729 u64 dma_mask; 1920 u64 dma_mask;
1730 phys_addr_t paddr = page_to_phys(page) + offset; 1921 phys_addr_t paddr = page_to_phys(page) + offset;
1731 1922
1732 INC_STATS_COUNTER(cnt_map_single); 1923 INC_STATS_COUNTER(cnt_map_single);
1733 1924
1734 if (!check_device(dev)) 1925 domain = get_domain(dev);
1735 return bad_dma_address; 1926 if (PTR_ERR(domain) == -EINVAL)
1736
1737 dma_mask = *dev->dma_mask;
1738
1739 get_device_resources(dev, &iommu, &domain, &devid);
1740
1741 if (iommu == NULL || domain == NULL)
1742 /* device not handled by any AMD IOMMU */
1743 return (dma_addr_t)paddr; 1927 return (dma_addr_t)paddr;
1928 else if (IS_ERR(domain))
1929 return DMA_ERROR_CODE;
1744 1930
1745 if (!dma_ops_domain(domain)) 1931 dma_mask = *dev->dma_mask;
1746 return bad_dma_address;
1747 1932
1748 spin_lock_irqsave(&domain->lock, flags); 1933 spin_lock_irqsave(&domain->lock, flags);
1749 addr = __map_single(dev, iommu, domain->priv, paddr, size, dir, false, 1934
1935 addr = __map_single(dev, domain->priv, paddr, size, dir, false,
1750 dma_mask); 1936 dma_mask);
1751 if (addr == bad_dma_address) 1937 if (addr == DMA_ERROR_CODE)
1752 goto out; 1938 goto out;
1753 1939
1754 iommu_completion_wait(iommu); 1940 iommu_flush_complete(domain);
1755 1941
1756out: 1942out:
1757 spin_unlock_irqrestore(&domain->lock, flags); 1943 spin_unlock_irqrestore(&domain->lock, flags);
@@ -1766,25 +1952,19 @@ static void unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size,
1766 enum dma_data_direction dir, struct dma_attrs *attrs) 1952 enum dma_data_direction dir, struct dma_attrs *attrs)
1767{ 1953{
1768 unsigned long flags; 1954 unsigned long flags;
1769 struct amd_iommu *iommu;
1770 struct protection_domain *domain; 1955 struct protection_domain *domain;
1771 u16 devid;
1772 1956
1773 INC_STATS_COUNTER(cnt_unmap_single); 1957 INC_STATS_COUNTER(cnt_unmap_single);
1774 1958
1775 if (!check_device(dev) || 1959 domain = get_domain(dev);
1776 !get_device_resources(dev, &iommu, &domain, &devid)) 1960 if (IS_ERR(domain))
1777 /* device not handled by any AMD IOMMU */
1778 return;
1779
1780 if (!dma_ops_domain(domain))
1781 return; 1961 return;
1782 1962
1783 spin_lock_irqsave(&domain->lock, flags); 1963 spin_lock_irqsave(&domain->lock, flags);
1784 1964
1785 __unmap_single(iommu, domain->priv, dma_addr, size, dir); 1965 __unmap_single(domain->priv, dma_addr, size, dir);
1786 1966
1787 iommu_completion_wait(iommu); 1967 iommu_flush_complete(domain);
1788 1968
1789 spin_unlock_irqrestore(&domain->lock, flags); 1969 spin_unlock_irqrestore(&domain->lock, flags);
1790} 1970}
@@ -1816,9 +1996,7 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
1816 struct dma_attrs *attrs) 1996 struct dma_attrs *attrs)
1817{ 1997{
1818 unsigned long flags; 1998 unsigned long flags;
1819 struct amd_iommu *iommu;
1820 struct protection_domain *domain; 1999 struct protection_domain *domain;
1821 u16 devid;
1822 int i; 2000 int i;
1823 struct scatterlist *s; 2001 struct scatterlist *s;
1824 phys_addr_t paddr; 2002 phys_addr_t paddr;
@@ -1827,25 +2005,20 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
1827 2005
1828 INC_STATS_COUNTER(cnt_map_sg); 2006 INC_STATS_COUNTER(cnt_map_sg);
1829 2007
1830 if (!check_device(dev)) 2008 domain = get_domain(dev);
2009 if (PTR_ERR(domain) == -EINVAL)
2010 return map_sg_no_iommu(dev, sglist, nelems, dir);
2011 else if (IS_ERR(domain))
1831 return 0; 2012 return 0;
1832 2013
1833 dma_mask = *dev->dma_mask; 2014 dma_mask = *dev->dma_mask;
1834 2015
1835 get_device_resources(dev, &iommu, &domain, &devid);
1836
1837 if (!iommu || !domain)
1838 return map_sg_no_iommu(dev, sglist, nelems, dir);
1839
1840 if (!dma_ops_domain(domain))
1841 return 0;
1842
1843 spin_lock_irqsave(&domain->lock, flags); 2016 spin_lock_irqsave(&domain->lock, flags);
1844 2017
1845 for_each_sg(sglist, s, nelems, i) { 2018 for_each_sg(sglist, s, nelems, i) {
1846 paddr = sg_phys(s); 2019 paddr = sg_phys(s);
1847 2020
1848 s->dma_address = __map_single(dev, iommu, domain->priv, 2021 s->dma_address = __map_single(dev, domain->priv,
1849 paddr, s->length, dir, false, 2022 paddr, s->length, dir, false,
1850 dma_mask); 2023 dma_mask);
1851 2024
@@ -1856,7 +2029,7 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
1856 goto unmap; 2029 goto unmap;
1857 } 2030 }
1858 2031
1859 iommu_completion_wait(iommu); 2032 iommu_flush_complete(domain);
1860 2033
1861out: 2034out:
1862 spin_unlock_irqrestore(&domain->lock, flags); 2035 spin_unlock_irqrestore(&domain->lock, flags);
@@ -1865,7 +2038,7 @@ out:
1865unmap: 2038unmap:
1866 for_each_sg(sglist, s, mapped_elems, i) { 2039 for_each_sg(sglist, s, mapped_elems, i) {
1867 if (s->dma_address) 2040 if (s->dma_address)
1868 __unmap_single(iommu, domain->priv, s->dma_address, 2041 __unmap_single(domain->priv, s->dma_address,
1869 s->dma_length, dir); 2042 s->dma_length, dir);
1870 s->dma_address = s->dma_length = 0; 2043 s->dma_address = s->dma_length = 0;
1871 } 2044 }
@@ -1884,30 +2057,25 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist,
1884 struct dma_attrs *attrs) 2057 struct dma_attrs *attrs)
1885{ 2058{
1886 unsigned long flags; 2059 unsigned long flags;
1887 struct amd_iommu *iommu;
1888 struct protection_domain *domain; 2060 struct protection_domain *domain;
1889 struct scatterlist *s; 2061 struct scatterlist *s;
1890 u16 devid;
1891 int i; 2062 int i;
1892 2063
1893 INC_STATS_COUNTER(cnt_unmap_sg); 2064 INC_STATS_COUNTER(cnt_unmap_sg);
1894 2065
1895 if (!check_device(dev) || 2066 domain = get_domain(dev);
1896 !get_device_resources(dev, &iommu, &domain, &devid)) 2067 if (IS_ERR(domain))
1897 return;
1898
1899 if (!dma_ops_domain(domain))
1900 return; 2068 return;
1901 2069
1902 spin_lock_irqsave(&domain->lock, flags); 2070 spin_lock_irqsave(&domain->lock, flags);
1903 2071
1904 for_each_sg(sglist, s, nelems, i) { 2072 for_each_sg(sglist, s, nelems, i) {
1905 __unmap_single(iommu, domain->priv, s->dma_address, 2073 __unmap_single(domain->priv, s->dma_address,
1906 s->dma_length, dir); 2074 s->dma_length, dir);
1907 s->dma_address = s->dma_length = 0; 2075 s->dma_address = s->dma_length = 0;
1908 } 2076 }
1909 2077
1910 iommu_completion_wait(iommu); 2078 iommu_flush_complete(domain);
1911 2079
1912 spin_unlock_irqrestore(&domain->lock, flags); 2080 spin_unlock_irqrestore(&domain->lock, flags);
1913} 2081}
@@ -1920,49 +2088,44 @@ static void *alloc_coherent(struct device *dev, size_t size,
1920{ 2088{
1921 unsigned long flags; 2089 unsigned long flags;
1922 void *virt_addr; 2090 void *virt_addr;
1923 struct amd_iommu *iommu;
1924 struct protection_domain *domain; 2091 struct protection_domain *domain;
1925 u16 devid;
1926 phys_addr_t paddr; 2092 phys_addr_t paddr;
1927 u64 dma_mask = dev->coherent_dma_mask; 2093 u64 dma_mask = dev->coherent_dma_mask;
1928 2094
1929 INC_STATS_COUNTER(cnt_alloc_coherent); 2095 INC_STATS_COUNTER(cnt_alloc_coherent);
1930 2096
1931 if (!check_device(dev)) 2097 domain = get_domain(dev);
2098 if (PTR_ERR(domain) == -EINVAL) {
2099 virt_addr = (void *)__get_free_pages(flag, get_order(size));
2100 *dma_addr = __pa(virt_addr);
2101 return virt_addr;
2102 } else if (IS_ERR(domain))
1932 return NULL; 2103 return NULL;
1933 2104
1934 if (!get_device_resources(dev, &iommu, &domain, &devid)) 2105 dma_mask = dev->coherent_dma_mask;
1935 flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); 2106 flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
2107 flag |= __GFP_ZERO;
1936 2108
1937 flag |= __GFP_ZERO;
1938 virt_addr = (void *)__get_free_pages(flag, get_order(size)); 2109 virt_addr = (void *)__get_free_pages(flag, get_order(size));
1939 if (!virt_addr) 2110 if (!virt_addr)
1940 return NULL; 2111 return NULL;
1941 2112
1942 paddr = virt_to_phys(virt_addr); 2113 paddr = virt_to_phys(virt_addr);
1943 2114
1944 if (!iommu || !domain) {
1945 *dma_addr = (dma_addr_t)paddr;
1946 return virt_addr;
1947 }
1948
1949 if (!dma_ops_domain(domain))
1950 goto out_free;
1951
1952 if (!dma_mask) 2115 if (!dma_mask)
1953 dma_mask = *dev->dma_mask; 2116 dma_mask = *dev->dma_mask;
1954 2117
1955 spin_lock_irqsave(&domain->lock, flags); 2118 spin_lock_irqsave(&domain->lock, flags);
1956 2119
1957 *dma_addr = __map_single(dev, iommu, domain->priv, paddr, 2120 *dma_addr = __map_single(dev, domain->priv, paddr,
1958 size, DMA_BIDIRECTIONAL, true, dma_mask); 2121 size, DMA_BIDIRECTIONAL, true, dma_mask);
1959 2122
1960 if (*dma_addr == bad_dma_address) { 2123 if (*dma_addr == DMA_ERROR_CODE) {
1961 spin_unlock_irqrestore(&domain->lock, flags); 2124 spin_unlock_irqrestore(&domain->lock, flags);
1962 goto out_free; 2125 goto out_free;
1963 } 2126 }
1964 2127
1965 iommu_completion_wait(iommu); 2128 iommu_flush_complete(domain);
1966 2129
1967 spin_unlock_irqrestore(&domain->lock, flags); 2130 spin_unlock_irqrestore(&domain->lock, flags);
1968 2131
@@ -1982,28 +2145,19 @@ static void free_coherent(struct device *dev, size_t size,
1982 void *virt_addr, dma_addr_t dma_addr) 2145 void *virt_addr, dma_addr_t dma_addr)
1983{ 2146{
1984 unsigned long flags; 2147 unsigned long flags;
1985 struct amd_iommu *iommu;
1986 struct protection_domain *domain; 2148 struct protection_domain *domain;
1987 u16 devid;
1988 2149
1989 INC_STATS_COUNTER(cnt_free_coherent); 2150 INC_STATS_COUNTER(cnt_free_coherent);
1990 2151
1991 if (!check_device(dev)) 2152 domain = get_domain(dev);
1992 return; 2153 if (IS_ERR(domain))
1993
1994 get_device_resources(dev, &iommu, &domain, &devid);
1995
1996 if (!iommu || !domain)
1997 goto free_mem;
1998
1999 if (!dma_ops_domain(domain))
2000 goto free_mem; 2154 goto free_mem;
2001 2155
2002 spin_lock_irqsave(&domain->lock, flags); 2156 spin_lock_irqsave(&domain->lock, flags);
2003 2157
2004 __unmap_single(iommu, domain->priv, dma_addr, size, DMA_BIDIRECTIONAL); 2158 __unmap_single(domain->priv, dma_addr, size, DMA_BIDIRECTIONAL);
2005 2159
2006 iommu_completion_wait(iommu); 2160 iommu_flush_complete(domain);
2007 2161
2008 spin_unlock_irqrestore(&domain->lock, flags); 2162 spin_unlock_irqrestore(&domain->lock, flags);
2009 2163
@@ -2017,22 +2171,7 @@ free_mem:
2017 */ 2171 */
2018static int amd_iommu_dma_supported(struct device *dev, u64 mask) 2172static int amd_iommu_dma_supported(struct device *dev, u64 mask)
2019{ 2173{
2020 u16 bdf; 2174 return check_device(dev);
2021 struct pci_dev *pcidev;
2022
2023 /* No device or no PCI device */
2024 if (!dev || dev->bus != &pci_bus_type)
2025 return 0;
2026
2027 pcidev = to_pci_dev(dev);
2028
2029 bdf = calc_devid(pcidev->bus->number, pcidev->devfn);
2030
2031 /* Out of our scope? */
2032 if (bdf > amd_iommu_last_bdf)
2033 return 0;
2034
2035 return 1;
2036} 2175}
2037 2176
2038/* 2177/*
@@ -2046,25 +2185,28 @@ static void prealloc_protection_domains(void)
2046{ 2185{
2047 struct pci_dev *dev = NULL; 2186 struct pci_dev *dev = NULL;
2048 struct dma_ops_domain *dma_dom; 2187 struct dma_ops_domain *dma_dom;
2049 struct amd_iommu *iommu;
2050 u16 devid; 2188 u16 devid;
2051 2189
2052 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { 2190 for_each_pci_dev(dev) {
2053 devid = calc_devid(dev->bus->number, dev->devfn); 2191
2054 if (devid > amd_iommu_last_bdf) 2192 /* Do we handle this device? */
2055 continue; 2193 if (!check_device(&dev->dev))
2056 devid = amd_iommu_alias_table[devid];
2057 if (domain_for_device(devid))
2058 continue; 2194 continue;
2059 iommu = amd_iommu_rlookup_table[devid]; 2195
2060 if (!iommu) 2196 /* Is there already any domain for it? */
2197 if (domain_for_device(&dev->dev))
2061 continue; 2198 continue;
2062 dma_dom = dma_ops_domain_alloc(iommu); 2199
2200 devid = get_device_id(&dev->dev);
2201
2202 dma_dom = dma_ops_domain_alloc();
2063 if (!dma_dom) 2203 if (!dma_dom)
2064 continue; 2204 continue;
2065 init_unity_mappings_for_device(dma_dom, devid); 2205 init_unity_mappings_for_device(dma_dom, devid);
2066 dma_dom->target_dev = devid; 2206 dma_dom->target_dev = devid;
2067 2207
2208 attach_device(&dev->dev, &dma_dom->domain);
2209
2068 list_add_tail(&dma_dom->list, &iommu_pd_list); 2210 list_add_tail(&dma_dom->list, &iommu_pd_list);
2069 } 2211 }
2070} 2212}
@@ -2082,6 +2224,12 @@ static struct dma_map_ops amd_iommu_dma_ops = {
2082/* 2224/*
2083 * The function which glues the AMD IOMMU driver into dma_ops. 2226 * The function which glues the AMD IOMMU driver into dma_ops.
2084 */ 2226 */
2227
2228void __init amd_iommu_init_api(void)
2229{
2230 register_iommu(&amd_iommu_ops);
2231}
2232
2085int __init amd_iommu_init_dma_ops(void) 2233int __init amd_iommu_init_dma_ops(void)
2086{ 2234{
2087 struct amd_iommu *iommu; 2235 struct amd_iommu *iommu;
@@ -2093,7 +2241,7 @@ int __init amd_iommu_init_dma_ops(void)
2093 * protection domain will be assigned to the default one. 2241 * protection domain will be assigned to the default one.
2094 */ 2242 */
2095 for_each_iommu(iommu) { 2243 for_each_iommu(iommu) {
2096 iommu->default_dom = dma_ops_domain_alloc(iommu); 2244 iommu->default_dom = dma_ops_domain_alloc();
2097 if (iommu->default_dom == NULL) 2245 if (iommu->default_dom == NULL)
2098 return -ENOMEM; 2246 return -ENOMEM;
2099 iommu->default_dom->domain.flags |= PD_DEFAULT_MASK; 2247 iommu->default_dom->domain.flags |= PD_DEFAULT_MASK;
@@ -2103,15 +2251,12 @@ int __init amd_iommu_init_dma_ops(void)
2103 } 2251 }
2104 2252
2105 /* 2253 /*
2106 * If device isolation is enabled, pre-allocate the protection 2254 * Pre-allocate the protection domains for each device.
2107 * domains for each device.
2108 */ 2255 */
2109 if (amd_iommu_isolate) 2256 prealloc_protection_domains();
2110 prealloc_protection_domains();
2111 2257
2112 iommu_detected = 1; 2258 iommu_detected = 1;
2113 force_iommu = 1; 2259 swiotlb = 0;
2114 bad_dma_address = 0;
2115#ifdef CONFIG_GART_IOMMU 2260#ifdef CONFIG_GART_IOMMU
2116 gart_iommu_aperture_disabled = 1; 2261 gart_iommu_aperture_disabled = 1;
2117 gart_iommu_aperture = 0; 2262 gart_iommu_aperture = 0;
@@ -2120,10 +2265,6 @@ int __init amd_iommu_init_dma_ops(void)
2120 /* Make the driver finally visible to the drivers */ 2265 /* Make the driver finally visible to the drivers */
2121 dma_ops = &amd_iommu_dma_ops; 2266 dma_ops = &amd_iommu_dma_ops;
2122 2267
2123 register_iommu(&amd_iommu_ops);
2124
2125 bus_register_notifier(&pci_bus_type, &device_nb);
2126
2127 amd_iommu_stats_init(); 2268 amd_iommu_stats_init();
2128 2269
2129 return 0; 2270 return 0;
@@ -2150,14 +2291,17 @@ free_domains:
2150 2291
2151static void cleanup_domain(struct protection_domain *domain) 2292static void cleanup_domain(struct protection_domain *domain)
2152{ 2293{
2294 struct iommu_dev_data *dev_data, *next;
2153 unsigned long flags; 2295 unsigned long flags;
2154 u16 devid;
2155 2296
2156 write_lock_irqsave(&amd_iommu_devtable_lock, flags); 2297 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
2157 2298
2158 for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) 2299 list_for_each_entry_safe(dev_data, next, &domain->dev_list, list) {
2159 if (amd_iommu_pd_table[devid] == domain) 2300 struct device *dev = dev_data->dev;
2160 __detach_device(domain, devid); 2301
2302 __detach_device(dev);
2303 atomic_set(&dev_data->bind, 0);
2304 }
2161 2305
2162 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); 2306 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
2163} 2307}
@@ -2167,6 +2311,8 @@ static void protection_domain_free(struct protection_domain *domain)
2167 if (!domain) 2311 if (!domain)
2168 return; 2312 return;
2169 2313
2314 del_domain_from_list(domain);
2315
2170 if (domain->id) 2316 if (domain->id)
2171 domain_id_free(domain->id); 2317 domain_id_free(domain->id);
2172 2318
@@ -2182,9 +2328,13 @@ static struct protection_domain *protection_domain_alloc(void)
2182 return NULL; 2328 return NULL;
2183 2329
2184 spin_lock_init(&domain->lock); 2330 spin_lock_init(&domain->lock);
2331 mutex_init(&domain->api_lock);
2185 domain->id = domain_id_alloc(); 2332 domain->id = domain_id_alloc();
2186 if (!domain->id) 2333 if (!domain->id)
2187 goto out_err; 2334 goto out_err;
2335 INIT_LIST_HEAD(&domain->dev_list);
2336
2337 add_domain_to_list(domain);
2188 2338
2189 return domain; 2339 return domain;
2190 2340
@@ -2231,9 +2381,7 @@ static void amd_iommu_domain_destroy(struct iommu_domain *dom)
2231 2381
2232 free_pagetable(domain); 2382 free_pagetable(domain);
2233 2383
2234 domain_id_free(domain->id); 2384 protection_domain_free(domain);
2235
2236 kfree(domain);
2237 2385
2238 dom->priv = NULL; 2386 dom->priv = NULL;
2239} 2387}
@@ -2241,26 +2389,23 @@ static void amd_iommu_domain_destroy(struct iommu_domain *dom)
2241static void amd_iommu_detach_device(struct iommu_domain *dom, 2389static void amd_iommu_detach_device(struct iommu_domain *dom,
2242 struct device *dev) 2390 struct device *dev)
2243{ 2391{
2244 struct protection_domain *domain = dom->priv; 2392 struct iommu_dev_data *dev_data = dev->archdata.iommu;
2245 struct amd_iommu *iommu; 2393 struct amd_iommu *iommu;
2246 struct pci_dev *pdev;
2247 u16 devid; 2394 u16 devid;
2248 2395
2249 if (dev->bus != &pci_bus_type) 2396 if (!check_device(dev))
2250 return; 2397 return;
2251 2398
2252 pdev = to_pci_dev(dev); 2399 devid = get_device_id(dev);
2253
2254 devid = calc_devid(pdev->bus->number, pdev->devfn);
2255 2400
2256 if (devid > 0) 2401 if (dev_data->domain != NULL)
2257 detach_device(domain, devid); 2402 detach_device(dev);
2258 2403
2259 iommu = amd_iommu_rlookup_table[devid]; 2404 iommu = amd_iommu_rlookup_table[devid];
2260 if (!iommu) 2405 if (!iommu)
2261 return; 2406 return;
2262 2407
2263 iommu_queue_inv_dev_entry(iommu, devid); 2408 iommu_flush_device(dev);
2264 iommu_completion_wait(iommu); 2409 iommu_completion_wait(iommu);
2265} 2410}
2266 2411
@@ -2268,35 +2413,30 @@ static int amd_iommu_attach_device(struct iommu_domain *dom,
2268 struct device *dev) 2413 struct device *dev)
2269{ 2414{
2270 struct protection_domain *domain = dom->priv; 2415 struct protection_domain *domain = dom->priv;
2271 struct protection_domain *old_domain; 2416 struct iommu_dev_data *dev_data;
2272 struct amd_iommu *iommu; 2417 struct amd_iommu *iommu;
2273 struct pci_dev *pdev; 2418 int ret;
2274 u16 devid; 2419 u16 devid;
2275 2420
2276 if (dev->bus != &pci_bus_type) 2421 if (!check_device(dev))
2277 return -EINVAL; 2422 return -EINVAL;
2278 2423
2279 pdev = to_pci_dev(dev); 2424 dev_data = dev->archdata.iommu;
2280 2425
2281 devid = calc_devid(pdev->bus->number, pdev->devfn); 2426 devid = get_device_id(dev);
2282
2283 if (devid >= amd_iommu_last_bdf ||
2284 devid != amd_iommu_alias_table[devid])
2285 return -EINVAL;
2286 2427
2287 iommu = amd_iommu_rlookup_table[devid]; 2428 iommu = amd_iommu_rlookup_table[devid];
2288 if (!iommu) 2429 if (!iommu)
2289 return -EINVAL; 2430 return -EINVAL;
2290 2431
2291 old_domain = domain_for_device(devid); 2432 if (dev_data->domain)
2292 if (old_domain) 2433 detach_device(dev);
2293 detach_device(old_domain, devid);
2294 2434
2295 attach_device(iommu, domain, devid); 2435 ret = attach_device(dev, domain);
2296 2436
2297 iommu_completion_wait(iommu); 2437 iommu_completion_wait(iommu);
2298 2438
2299 return 0; 2439 return ret;
2300} 2440}
2301 2441
2302static int amd_iommu_map_range(struct iommu_domain *dom, 2442static int amd_iommu_map_range(struct iommu_domain *dom,
@@ -2316,6 +2456,8 @@ static int amd_iommu_map_range(struct iommu_domain *dom,
2316 iova &= PAGE_MASK; 2456 iova &= PAGE_MASK;
2317 paddr &= PAGE_MASK; 2457 paddr &= PAGE_MASK;
2318 2458
2459 mutex_lock(&domain->api_lock);
2460
2319 for (i = 0; i < npages; ++i) { 2461 for (i = 0; i < npages; ++i) {
2320 ret = iommu_map_page(domain, iova, paddr, prot, PM_MAP_4k); 2462 ret = iommu_map_page(domain, iova, paddr, prot, PM_MAP_4k);
2321 if (ret) 2463 if (ret)
@@ -2325,6 +2467,8 @@ static int amd_iommu_map_range(struct iommu_domain *dom,
2325 paddr += PAGE_SIZE; 2467 paddr += PAGE_SIZE;
2326 } 2468 }
2327 2469
2470 mutex_unlock(&domain->api_lock);
2471
2328 return 0; 2472 return 0;
2329} 2473}
2330 2474
@@ -2337,12 +2481,16 @@ static void amd_iommu_unmap_range(struct iommu_domain *dom,
2337 2481
2338 iova &= PAGE_MASK; 2482 iova &= PAGE_MASK;
2339 2483
2484 mutex_lock(&domain->api_lock);
2485
2340 for (i = 0; i < npages; ++i) { 2486 for (i = 0; i < npages; ++i) {
2341 iommu_unmap_page(domain, iova, PM_MAP_4k); 2487 iommu_unmap_page(domain, iova, PM_MAP_4k);
2342 iova += PAGE_SIZE; 2488 iova += PAGE_SIZE;
2343 } 2489 }
2344 2490
2345 iommu_flush_domain(domain->id); 2491 iommu_flush_tlb_pde(domain);
2492
2493 mutex_unlock(&domain->api_lock);
2346} 2494}
2347 2495
2348static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom, 2496static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
@@ -2393,10 +2541,11 @@ static struct iommu_ops amd_iommu_ops = {
2393 2541
2394int __init amd_iommu_init_passthrough(void) 2542int __init amd_iommu_init_passthrough(void)
2395{ 2543{
2544 struct amd_iommu *iommu;
2396 struct pci_dev *dev = NULL; 2545 struct pci_dev *dev = NULL;
2397 u16 devid, devid2; 2546 u16 devid;
2398 2547
2399 /* allocate passthroug domain */ 2548 /* allocate passthrough domain */
2400 pt_domain = protection_domain_alloc(); 2549 pt_domain = protection_domain_alloc();
2401 if (!pt_domain) 2550 if (!pt_domain)
2402 return -ENOMEM; 2551 return -ENOMEM;
@@ -2404,20 +2553,17 @@ int __init amd_iommu_init_passthrough(void)
2404 pt_domain->mode |= PAGE_MODE_NONE; 2553 pt_domain->mode |= PAGE_MODE_NONE;
2405 2554
2406 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { 2555 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
2407 struct amd_iommu *iommu;
2408 2556
2409 devid = calc_devid(dev->bus->number, dev->devfn); 2557 if (!check_device(&dev->dev))
2410 if (devid > amd_iommu_last_bdf)
2411 continue; 2558 continue;
2412 2559
2413 devid2 = amd_iommu_alias_table[devid]; 2560 devid = get_device_id(&dev->dev);
2414 2561
2415 iommu = amd_iommu_rlookup_table[devid2]; 2562 iommu = amd_iommu_rlookup_table[devid];
2416 if (!iommu) 2563 if (!iommu)
2417 continue; 2564 continue;
2418 2565
2419 __attach_device(iommu, pt_domain, devid); 2566 attach_device(&dev->dev, pt_domain);
2420 __attach_device(iommu, pt_domain, devid2);
2421 } 2567 }
2422 2568
2423 pr_info("AMD-Vi: Initialized for Passthrough Mode\n"); 2569 pr_info("AMD-Vi: Initialized for Passthrough Mode\n");
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index c20001e4f556..6360abf993d4 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2007-2008 Advanced Micro Devices, Inc. 2 * Copyright (C) 2007-2009 Advanced Micro Devices, Inc.
3 * Author: Joerg Roedel <joerg.roedel@amd.com> 3 * Author: Joerg Roedel <joerg.roedel@amd.com>
4 * Leo Duran <leo.duran@amd.com> 4 * Leo Duran <leo.duran@amd.com>
5 * 5 *
@@ -19,16 +19,18 @@
19 19
20#include <linux/pci.h> 20#include <linux/pci.h>
21#include <linux/acpi.h> 21#include <linux/acpi.h>
22#include <linux/gfp.h>
23#include <linux/list.h> 22#include <linux/list.h>
23#include <linux/slab.h>
24#include <linux/sysdev.h> 24#include <linux/sysdev.h>
25#include <linux/interrupt.h> 25#include <linux/interrupt.h>
26#include <linux/msi.h> 26#include <linux/msi.h>
27#include <asm/pci-direct.h> 27#include <asm/pci-direct.h>
28#include <asm/amd_iommu_proto.h>
28#include <asm/amd_iommu_types.h> 29#include <asm/amd_iommu_types.h>
29#include <asm/amd_iommu.h> 30#include <asm/amd_iommu.h>
30#include <asm/iommu.h> 31#include <asm/iommu.h>
31#include <asm/gart.h> 32#include <asm/gart.h>
33#include <asm/x86_init.h>
32 34
33/* 35/*
34 * definitions for the ACPI scanning code 36 * definitions for the ACPI scanning code
@@ -123,18 +125,29 @@ u16 amd_iommu_last_bdf; /* largest PCI device id we have
123 to handle */ 125 to handle */
124LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings 126LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings
125 we find in ACPI */ 127 we find in ACPI */
126#ifdef CONFIG_IOMMU_STRESS
127bool amd_iommu_isolate = false;
128#else
129bool amd_iommu_isolate = true; /* if true, device isolation is
130 enabled */
131#endif
132
133bool amd_iommu_unmap_flush; /* if true, flush on every unmap */ 128bool amd_iommu_unmap_flush; /* if true, flush on every unmap */
134 129
135LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the 130LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the
136 system */ 131 system */
137 132
133/* Array to assign indices to IOMMUs */
134struct amd_iommu *amd_iommus[MAX_IOMMUS];
135int amd_iommus_present;
136
137/* IOMMUs have a non-present cache? */
138bool amd_iommu_np_cache __read_mostly;
139
140/*
141 * The ACPI table parsing functions set this variable on an error
142 */
143static int __initdata amd_iommu_init_err;
144
145/*
146 * List of protection domains - used during resume
147 */
148LIST_HEAD(amd_iommu_pd_list);
149spinlock_t amd_iommu_pd_lock;
150
138/* 151/*
139 * Pointer to the device table which is shared by all AMD IOMMUs 152 * Pointer to the device table which is shared by all AMD IOMMUs
140 * it is indexed by the PCI device id or the HT unit id and contains 153 * it is indexed by the PCI device id or the HT unit id and contains
@@ -157,12 +170,6 @@ u16 *amd_iommu_alias_table;
157struct amd_iommu **amd_iommu_rlookup_table; 170struct amd_iommu **amd_iommu_rlookup_table;
158 171
159/* 172/*
160 * The pd table (protection domain table) is used to find the protection domain
161 * data structure a device belongs to. Indexed with the PCI device id too.
162 */
163struct protection_domain **amd_iommu_pd_table;
164
165/*
166 * AMD IOMMU allows up to 2^16 different protection domains. This is a bitmap 173 * AMD IOMMU allows up to 2^16 different protection domains. This is a bitmap
167 * to know which ones are already in use. 174 * to know which ones are already in use.
168 */ 175 */
@@ -384,9 +391,11 @@ static int __init find_last_devid_acpi(struct acpi_table_header *table)
384 */ 391 */
385 for (i = 0; i < table->length; ++i) 392 for (i = 0; i < table->length; ++i)
386 checksum += p[i]; 393 checksum += p[i];
387 if (checksum != 0) 394 if (checksum != 0) {
388 /* ACPI table corrupt */ 395 /* ACPI table corrupt */
389 return -ENODEV; 396 amd_iommu_init_err = -ENODEV;
397 return 0;
398 }
390 399
391 p += IVRS_HEADER_LENGTH; 400 p += IVRS_HEADER_LENGTH;
392 401
@@ -429,7 +438,7 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
429 if (cmd_buf == NULL) 438 if (cmd_buf == NULL)
430 return NULL; 439 return NULL;
431 440
432 iommu->cmd_buf_size = CMD_BUFFER_SIZE; 441 iommu->cmd_buf_size = CMD_BUFFER_SIZE | CMD_BUFFER_UNINITIALIZED;
433 442
434 return cmd_buf; 443 return cmd_buf;
435} 444}
@@ -465,12 +474,13 @@ static void iommu_enable_command_buffer(struct amd_iommu *iommu)
465 &entry, sizeof(entry)); 474 &entry, sizeof(entry));
466 475
467 amd_iommu_reset_cmd_buffer(iommu); 476 amd_iommu_reset_cmd_buffer(iommu);
477 iommu->cmd_buf_size &= ~(CMD_BUFFER_UNINITIALIZED);
468} 478}
469 479
470static void __init free_command_buffer(struct amd_iommu *iommu) 480static void __init free_command_buffer(struct amd_iommu *iommu)
471{ 481{
472 free_pages((unsigned long)iommu->cmd_buf, 482 free_pages((unsigned long)iommu->cmd_buf,
473 get_order(iommu->cmd_buf_size)); 483 get_order(iommu->cmd_buf_size & ~(CMD_BUFFER_UNINITIALIZED)));
474} 484}
475 485
476/* allocates the memory where the IOMMU will log its events to */ 486/* allocates the memory where the IOMMU will log its events to */
@@ -838,7 +848,18 @@ static void __init free_iommu_all(void)
838static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) 848static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
839{ 849{
840 spin_lock_init(&iommu->lock); 850 spin_lock_init(&iommu->lock);
851
852 /* Add IOMMU to internal data structures */
841 list_add_tail(&iommu->list, &amd_iommu_list); 853 list_add_tail(&iommu->list, &amd_iommu_list);
854 iommu->index = amd_iommus_present++;
855
856 if (unlikely(iommu->index >= MAX_IOMMUS)) {
857 WARN(1, "AMD-Vi: System has more IOMMUs than supported by this driver\n");
858 return -ENOSYS;
859 }
860
861 /* Index is fine - add IOMMU to the array */
862 amd_iommus[iommu->index] = iommu;
842 863
843 /* 864 /*
844 * Copy data from ACPI table entry to the iommu struct 865 * Copy data from ACPI table entry to the iommu struct
@@ -868,6 +889,9 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
868 init_iommu_from_acpi(iommu, h); 889 init_iommu_from_acpi(iommu, h);
869 init_iommu_devices(iommu); 890 init_iommu_devices(iommu);
870 891
892 if (iommu->cap & (1UL << IOMMU_CAP_NPCACHE))
893 amd_iommu_np_cache = true;
894
871 return pci_enable_device(iommu->dev); 895 return pci_enable_device(iommu->dev);
872} 896}
873 897
@@ -899,11 +923,16 @@ static int __init init_iommu_all(struct acpi_table_header *table)
899 h->mmio_phys); 923 h->mmio_phys);
900 924
901 iommu = kzalloc(sizeof(struct amd_iommu), GFP_KERNEL); 925 iommu = kzalloc(sizeof(struct amd_iommu), GFP_KERNEL);
902 if (iommu == NULL) 926 if (iommu == NULL) {
903 return -ENOMEM; 927 amd_iommu_init_err = -ENOMEM;
928 return 0;
929 }
930
904 ret = init_iommu_one(iommu, h); 931 ret = init_iommu_one(iommu, h);
905 if (ret) 932 if (ret) {
906 return ret; 933 amd_iommu_init_err = ret;
934 return 0;
935 }
907 break; 936 break;
908 default: 937 default:
909 break; 938 break;
@@ -925,7 +954,7 @@ static int __init init_iommu_all(struct acpi_table_header *table)
925 * 954 *
926 ****************************************************************************/ 955 ****************************************************************************/
927 956
928static int __init iommu_setup_msi(struct amd_iommu *iommu) 957static int iommu_setup_msi(struct amd_iommu *iommu)
929{ 958{
930 int r; 959 int r;
931 960
@@ -1176,19 +1205,10 @@ static struct sys_device device_amd_iommu = {
1176 * functions. Finally it prints some information about AMD IOMMUs and 1205 * functions. Finally it prints some information about AMD IOMMUs and
1177 * the driver state and enables the hardware. 1206 * the driver state and enables the hardware.
1178 */ 1207 */
1179int __init amd_iommu_init(void) 1208static int __init amd_iommu_init(void)
1180{ 1209{
1181 int i, ret = 0; 1210 int i, ret = 0;
1182 1211
1183
1184 if (no_iommu) {
1185 printk(KERN_INFO "AMD-Vi disabled by kernel command line\n");
1186 return 0;
1187 }
1188
1189 if (!amd_iommu_detected)
1190 return -ENODEV;
1191
1192 /* 1212 /*
1193 * First parse ACPI tables to find the largest Bus/Dev/Func 1213 * First parse ACPI tables to find the largest Bus/Dev/Func
1194 * we need to handle. Upon this information the shared data 1214 * we need to handle. Upon this information the shared data
@@ -1197,6 +1217,10 @@ int __init amd_iommu_init(void)
1197 if (acpi_table_parse("IVRS", find_last_devid_acpi) != 0) 1217 if (acpi_table_parse("IVRS", find_last_devid_acpi) != 0)
1198 return -ENODEV; 1218 return -ENODEV;
1199 1219
1220 ret = amd_iommu_init_err;
1221 if (ret)
1222 goto out;
1223
1200 dev_table_size = tbl_size(DEV_TABLE_ENTRY_SIZE); 1224 dev_table_size = tbl_size(DEV_TABLE_ENTRY_SIZE);
1201 alias_table_size = tbl_size(ALIAS_TABLE_ENTRY_SIZE); 1225 alias_table_size = tbl_size(ALIAS_TABLE_ENTRY_SIZE);
1202 rlookup_table_size = tbl_size(RLOOKUP_TABLE_ENTRY_SIZE); 1226 rlookup_table_size = tbl_size(RLOOKUP_TABLE_ENTRY_SIZE);
@@ -1225,15 +1249,6 @@ int __init amd_iommu_init(void)
1225 if (amd_iommu_rlookup_table == NULL) 1249 if (amd_iommu_rlookup_table == NULL)
1226 goto free; 1250 goto free;
1227 1251
1228 /*
1229 * Protection Domain table - maps devices to protection domains
1230 * This table has the same size as the rlookup_table
1231 */
1232 amd_iommu_pd_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
1233 get_order(rlookup_table_size));
1234 if (amd_iommu_pd_table == NULL)
1235 goto free;
1236
1237 amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages( 1252 amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages(
1238 GFP_KERNEL | __GFP_ZERO, 1253 GFP_KERNEL | __GFP_ZERO,
1239 get_order(MAX_DOMAIN_ID/8)); 1254 get_order(MAX_DOMAIN_ID/8));
@@ -1255,6 +1270,8 @@ int __init amd_iommu_init(void)
1255 */ 1270 */
1256 amd_iommu_pd_alloc_bitmap[0] = 1; 1271 amd_iommu_pd_alloc_bitmap[0] = 1;
1257 1272
1273 spin_lock_init(&amd_iommu_pd_lock);
1274
1258 /* 1275 /*
1259 * now the data structures are allocated and basically initialized 1276 * now the data structures are allocated and basically initialized
1260 * start the real acpi table scan 1277 * start the real acpi table scan
@@ -1263,9 +1280,19 @@ int __init amd_iommu_init(void)
1263 if (acpi_table_parse("IVRS", init_iommu_all) != 0) 1280 if (acpi_table_parse("IVRS", init_iommu_all) != 0)
1264 goto free; 1281 goto free;
1265 1282
1283 if (amd_iommu_init_err) {
1284 ret = amd_iommu_init_err;
1285 goto free;
1286 }
1287
1266 if (acpi_table_parse("IVRS", init_memory_definitions) != 0) 1288 if (acpi_table_parse("IVRS", init_memory_definitions) != 0)
1267 goto free; 1289 goto free;
1268 1290
1291 if (amd_iommu_init_err) {
1292 ret = amd_iommu_init_err;
1293 goto free;
1294 }
1295
1269 ret = sysdev_class_register(&amd_iommu_sysdev_class); 1296 ret = sysdev_class_register(&amd_iommu_sysdev_class);
1270 if (ret) 1297 if (ret)
1271 goto free; 1298 goto free;
@@ -1274,39 +1301,44 @@ int __init amd_iommu_init(void)
1274 if (ret) 1301 if (ret)
1275 goto free; 1302 goto free;
1276 1303
1304 ret = amd_iommu_init_devices();
1305 if (ret)
1306 goto free;
1307
1308 enable_iommus();
1309
1277 if (iommu_pass_through) 1310 if (iommu_pass_through)
1278 ret = amd_iommu_init_passthrough(); 1311 ret = amd_iommu_init_passthrough();
1279 else 1312 else
1280 ret = amd_iommu_init_dma_ops(); 1313 ret = amd_iommu_init_dma_ops();
1314
1281 if (ret) 1315 if (ret)
1282 goto free; 1316 goto free;
1283 1317
1284 enable_iommus(); 1318 amd_iommu_init_api();
1319
1320 amd_iommu_init_notifier();
1285 1321
1286 if (iommu_pass_through) 1322 if (iommu_pass_through)
1287 goto out; 1323 goto out;
1288 1324
1289 printk(KERN_INFO "AMD-Vi: device isolation ");
1290 if (amd_iommu_isolate)
1291 printk("enabled\n");
1292 else
1293 printk("disabled\n");
1294
1295 if (amd_iommu_unmap_flush) 1325 if (amd_iommu_unmap_flush)
1296 printk(KERN_INFO "AMD-Vi: IO/TLB flush on unmap enabled\n"); 1326 printk(KERN_INFO "AMD-Vi: IO/TLB flush on unmap enabled\n");
1297 else 1327 else
1298 printk(KERN_INFO "AMD-Vi: Lazy IO/TLB flushing enabled\n"); 1328 printk(KERN_INFO "AMD-Vi: Lazy IO/TLB flushing enabled\n");
1299 1329
1330 x86_platform.iommu_shutdown = disable_iommus;
1300out: 1331out:
1301 return ret; 1332 return ret;
1302 1333
1303free: 1334free:
1335 disable_iommus();
1336
1337 amd_iommu_uninit_devices();
1338
1304 free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, 1339 free_pages((unsigned long)amd_iommu_pd_alloc_bitmap,
1305 get_order(MAX_DOMAIN_ID/8)); 1340 get_order(MAX_DOMAIN_ID/8));
1306 1341
1307 free_pages((unsigned long)amd_iommu_pd_table,
1308 get_order(rlookup_table_size));
1309
1310 free_pages((unsigned long)amd_iommu_rlookup_table, 1342 free_pages((unsigned long)amd_iommu_rlookup_table,
1311 get_order(rlookup_table_size)); 1343 get_order(rlookup_table_size));
1312 1344
@@ -1323,11 +1355,6 @@ free:
1323 goto out; 1355 goto out;
1324} 1356}
1325 1357
1326void amd_iommu_shutdown(void)
1327{
1328 disable_iommus();
1329}
1330
1331/**************************************************************************** 1358/****************************************************************************
1332 * 1359 *
1333 * Early detect code. This code runs at IOMMU detection time in the DMA 1360 * Early detect code. This code runs at IOMMU detection time in the DMA
@@ -1342,16 +1369,16 @@ static int __init early_amd_iommu_detect(struct acpi_table_header *table)
1342 1369
1343void __init amd_iommu_detect(void) 1370void __init amd_iommu_detect(void)
1344{ 1371{
1345 if (swiotlb || no_iommu || (iommu_detected && !gart_iommu_aperture)) 1372 if (no_iommu || (iommu_detected && !gart_iommu_aperture))
1346 return; 1373 return;
1347 1374
1348 if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) { 1375 if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) {
1349 iommu_detected = 1; 1376 iommu_detected = 1;
1350 amd_iommu_detected = 1; 1377 amd_iommu_detected = 1;
1351#ifdef CONFIG_GART_IOMMU 1378 x86_init.iommu.iommu_init = amd_iommu_init;
1352 gart_iommu_aperture_disabled = 1; 1379
1353 gart_iommu_aperture = 0; 1380 /* Make sure ACS will be enabled */
1354#endif 1381 pci_request_acs();
1355 } 1382 }
1356} 1383}
1357 1384
@@ -1372,10 +1399,6 @@ static int __init parse_amd_iommu_dump(char *str)
1372static int __init parse_amd_iommu_options(char *str) 1399static int __init parse_amd_iommu_options(char *str)
1373{ 1400{
1374 for (; *str; ++str) { 1401 for (; *str; ++str) {
1375 if (strncmp(str, "isolate", 7) == 0)
1376 amd_iommu_isolate = true;
1377 if (strncmp(str, "share", 5) == 0)
1378 amd_iommu_isolate = false;
1379 if (strncmp(str, "fullflush", 9) == 0) 1402 if (strncmp(str, "fullflush", 9) == 0)
1380 amd_iommu_unmap_flush = true; 1403 amd_iommu_unmap_flush = true;
1381 } 1404 }
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
new file mode 100644
index 000000000000..a35347501d36
--- /dev/null
+++ b/arch/x86/kernel/apb_timer.c
@@ -0,0 +1,785 @@
1/*
2 * apb_timer.c: Driver for Langwell APB timers
3 *
4 * (C) Copyright 2009 Intel Corporation
5 * Author: Jacob Pan (jacob.jun.pan@intel.com)
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; version 2
10 * of the License.
11 *
12 * Note:
13 * Langwell is the south complex of Intel Moorestown MID platform. There are
14 * eight external timers in total that can be used by the operating system.
15 * The timer information, such as frequency and addresses, is provided to the
16 * OS via SFI tables.
17 * Timer interrupts are routed via FW/HW emulated IOAPIC independently via
18 * individual redirection table entries (RTE).
19 * Unlike HPET, there is no master counter, therefore one of the timers is
20 * used as clocksource. The overall allocation looks like:
21 * - timer 0 - NR_CPUs for per cpu timer
22 * - one timer for clocksource
23 * - one timer for watchdog driver.
24 * It is also worth noting that APB timer does not support true one-shot mode,
25 * free-running mode will be used here to emulate one-shot mode.
26 * APB timer can also be used as broadcast timer along with per cpu local APIC
27 * timer, but by default APB timer has higher rating than local APIC timers.
28 */
29
30#include <linux/clocksource.h>
31#include <linux/clockchips.h>
32#include <linux/delay.h>
33#include <linux/errno.h>
34#include <linux/init.h>
35#include <linux/sysdev.h>
36#include <linux/slab.h>
37#include <linux/pm.h>
38#include <linux/pci.h>
39#include <linux/sfi.h>
40#include <linux/interrupt.h>
41#include <linux/cpu.h>
42#include <linux/irq.h>
43
44#include <asm/fixmap.h>
45#include <asm/apb_timer.h>
46
47#define APBT_MASK CLOCKSOURCE_MASK(32)
48#define APBT_SHIFT 22
49#define APBT_CLOCKEVENT_RATING 150
50#define APBT_CLOCKSOURCE_RATING 250
51#define APBT_MIN_DELTA_USEC 200
52
53#define EVT_TO_APBT_DEV(evt) container_of(evt, struct apbt_dev, evt)
54#define APBT_CLOCKEVENT0_NUM (0)
55#define APBT_CLOCKEVENT1_NUM (1)
56#define APBT_CLOCKSOURCE_NUM (2)
57
58static unsigned long apbt_address;
59static int apb_timer_block_enabled;
60static void __iomem *apbt_virt_address;
61static int phy_cs_timer_id;
62
63/*
64 * Common DW APB timer info
65 */
66static uint64_t apbt_freq;
67
68static void apbt_set_mode(enum clock_event_mode mode,
69 struct clock_event_device *evt);
70static int apbt_next_event(unsigned long delta,
71 struct clock_event_device *evt);
72static cycle_t apbt_read_clocksource(struct clocksource *cs);
73static void apbt_restart_clocksource(struct clocksource *cs);
74
75struct apbt_dev {
76 struct clock_event_device evt;
77 unsigned int num;
78 int cpu;
79 unsigned int irq;
80 unsigned int tick;
81 unsigned int count;
82 unsigned int flags;
83 char name[10];
84};
85
86int disable_apbt_percpu __cpuinitdata;
87
88static DEFINE_PER_CPU(struct apbt_dev, cpu_apbt_dev);
89
90#ifdef CONFIG_SMP
91static unsigned int apbt_num_timers_used;
92static struct apbt_dev *apbt_devs;
93#endif
94
95static inline unsigned long apbt_readl_reg(unsigned long a)
96{
97 return readl(apbt_virt_address + a);
98}
99
100static inline void apbt_writel_reg(unsigned long d, unsigned long a)
101{
102 writel(d, apbt_virt_address + a);
103}
104
105static inline unsigned long apbt_readl(int n, unsigned long a)
106{
107 return readl(apbt_virt_address + a + n * APBTMRS_REG_SIZE);
108}
109
110static inline void apbt_writel(int n, unsigned long d, unsigned long a)
111{
112 writel(d, apbt_virt_address + a + n * APBTMRS_REG_SIZE);
113}
114
115static inline void apbt_set_mapping(void)
116{
117 struct sfi_timer_table_entry *mtmr;
118
119 if (apbt_virt_address) {
120 pr_debug("APBT base already mapped\n");
121 return;
122 }
123 mtmr = sfi_get_mtmr(APBT_CLOCKEVENT0_NUM);
124 if (mtmr == NULL) {
125 printk(KERN_ERR "Failed to get MTMR %d from SFI\n",
126 APBT_CLOCKEVENT0_NUM);
127 return;
128 }
129 apbt_address = (unsigned long)mtmr->phys_addr;
130 if (!apbt_address) {
131 printk(KERN_WARNING "No timer base from SFI, use default\n");
132 apbt_address = APBT_DEFAULT_BASE;
133 }
134 apbt_virt_address = ioremap_nocache(apbt_address, APBT_MMAP_SIZE);
135 if (apbt_virt_address) {
136 pr_debug("Mapped APBT physical addr %p at virtual addr %p\n",\
137 (void *)apbt_address, (void *)apbt_virt_address);
138 } else {
139 pr_debug("Failed mapping APBT phy address at %p\n",\
140 (void *)apbt_address);
141 goto panic_noapbt;
142 }
143 apbt_freq = mtmr->freq_hz / USEC_PER_SEC;
144 sfi_free_mtmr(mtmr);
145
146 /* Now figure out the physical timer id for clocksource device */
147 mtmr = sfi_get_mtmr(APBT_CLOCKSOURCE_NUM);
148 if (mtmr == NULL)
149 goto panic_noapbt;
150
151 /* Now figure out the physical timer id */
152 phy_cs_timer_id = (unsigned int)(mtmr->phys_addr & 0xff)
153 / APBTMRS_REG_SIZE;
154 pr_debug("Use timer %d for clocksource\n", phy_cs_timer_id);
155 return;
156
157panic_noapbt:
158 panic("Failed to setup APB system timer\n");
159
160}
161
162static inline void apbt_clear_mapping(void)
163{
164 iounmap(apbt_virt_address);
165 apbt_virt_address = NULL;
166}
167
168/*
169 * APBT timer interrupt enable / disable
170 */
171static inline int is_apbt_capable(void)
172{
173 return apbt_virt_address ? 1 : 0;
174}
175
176static struct clocksource clocksource_apbt = {
177 .name = "apbt",
178 .rating = APBT_CLOCKSOURCE_RATING,
179 .read = apbt_read_clocksource,
180 .mask = APBT_MASK,
181 .shift = APBT_SHIFT,
182 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
183 .resume = apbt_restart_clocksource,
184};
185
186/* boot APB clock event device */
187static struct clock_event_device apbt_clockevent = {
188 .name = "apbt0",
189 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
190 .set_mode = apbt_set_mode,
191 .set_next_event = apbt_next_event,
192 .shift = APBT_SHIFT,
193 .irq = 0,
194 .rating = APBT_CLOCKEVENT_RATING,
195};
196
197/*
198 * if user does not want to use per CPU apb timer, just give it a lower rating
199 * than local apic timer and skip the late per cpu timer init.
200 */
201static inline int __init setup_x86_mrst_timer(char *arg)
202{
203 if (!arg)
204 return -EINVAL;
205
206 if (strcmp("apbt_only", arg) == 0)
207 disable_apbt_percpu = 0;
208 else if (strcmp("lapic_and_apbt", arg) == 0)
209 disable_apbt_percpu = 1;
210 else {
211 pr_warning("X86 MRST timer option %s not recognised"
212 " use x86_mrst_timer=apbt_only or lapic_and_apbt\n",
213 arg);
214 return -EINVAL;
215 }
216 return 0;
217}
218__setup("x86_mrst_timer=", setup_x86_mrst_timer);
219
220/*
221 * start count down from 0xffff_ffff. this is done by toggling the enable bit
222 * then load initial load count to ~0.
223 */
224static void apbt_start_counter(int n)
225{
226 unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
227
228 ctrl &= ~APBTMR_CONTROL_ENABLE;
229 apbt_writel(n, ctrl, APBTMR_N_CONTROL);
230 apbt_writel(n, ~0, APBTMR_N_LOAD_COUNT);
231 /* enable, mask interrupt */
232 ctrl &= ~APBTMR_CONTROL_MODE_PERIODIC;
233 ctrl |= (APBTMR_CONTROL_ENABLE | APBTMR_CONTROL_INT);
234 apbt_writel(n, ctrl, APBTMR_N_CONTROL);
235 /* read it once to get cached counter value initialized */
236 apbt_read_clocksource(&clocksource_apbt);
237}
238
239static irqreturn_t apbt_interrupt_handler(int irq, void *data)
240{
241 struct apbt_dev *dev = (struct apbt_dev *)data;
242 struct clock_event_device *aevt = &dev->evt;
243
244 if (!aevt->event_handler) {
245 printk(KERN_INFO "Spurious APBT timer interrupt on %d\n",
246 dev->num);
247 return IRQ_NONE;
248 }
249 aevt->event_handler(aevt);
250 return IRQ_HANDLED;
251}
252
253static void apbt_restart_clocksource(struct clocksource *cs)
254{
255 apbt_start_counter(phy_cs_timer_id);
256}
257
258/* Setup IRQ routing via IOAPIC */
259#ifdef CONFIG_SMP
260static void apbt_setup_irq(struct apbt_dev *adev)
261{
262 struct irq_chip *chip;
263 struct irq_desc *desc;
264
265 /* timer0 irq has been setup early */
266 if (adev->irq == 0)
267 return;
268 desc = irq_to_desc(adev->irq);
269 chip = get_irq_chip(adev->irq);
270 disable_irq(adev->irq);
271 desc->status |= IRQ_MOVE_PCNTXT;
272 irq_set_affinity(adev->irq, cpumask_of(adev->cpu));
273 /* APB timer irqs are set up as mp_irqs, timer is edge triggerred */
274 set_irq_chip_and_handler_name(adev->irq, chip, handle_edge_irq, "edge");
275 enable_irq(adev->irq);
276 if (system_state == SYSTEM_BOOTING)
277 if (request_irq(adev->irq, apbt_interrupt_handler,
278 IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING,
279 adev->name, adev)) {
280 printk(KERN_ERR "Failed request IRQ for APBT%d\n",
281 adev->num);
282 }
283}
284#endif
285
286static void apbt_enable_int(int n)
287{
288 unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
289 /* clear pending intr */
290 apbt_readl(n, APBTMR_N_EOI);
291 ctrl &= ~APBTMR_CONTROL_INT;
292 apbt_writel(n, ctrl, APBTMR_N_CONTROL);
293}
294
295static void apbt_disable_int(int n)
296{
297 unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
298
299 ctrl |= APBTMR_CONTROL_INT;
300 apbt_writel(n, ctrl, APBTMR_N_CONTROL);
301}
302
303
304static int __init apbt_clockevent_register(void)
305{
306 struct sfi_timer_table_entry *mtmr;
307 struct apbt_dev *adev = &__get_cpu_var(cpu_apbt_dev);
308
309 mtmr = sfi_get_mtmr(APBT_CLOCKEVENT0_NUM);
310 if (mtmr == NULL) {
311 printk(KERN_ERR "Failed to get MTMR %d from SFI\n",
312 APBT_CLOCKEVENT0_NUM);
313 return -ENODEV;
314 }
315
316 /*
317 * We need to calculate the scaled math multiplication factor for
318 * nanosecond to apbt tick conversion.
319 * mult = (nsec/cycle)*2^APBT_SHIFT
320 */
321 apbt_clockevent.mult = div_sc((unsigned long) mtmr->freq_hz
322 , NSEC_PER_SEC, APBT_SHIFT);
323
324 /* Calculate the min / max delta */
325 apbt_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF,
326 &apbt_clockevent);
327 apbt_clockevent.min_delta_ns = clockevent_delta2ns(
328 APBT_MIN_DELTA_USEC*apbt_freq,
329 &apbt_clockevent);
330 /*
331 * Start apbt with the boot cpu mask and make it
332 * global if not used for per cpu timer.
333 */
334 apbt_clockevent.cpumask = cpumask_of(smp_processor_id());
335 adev->num = smp_processor_id();
336 memcpy(&adev->evt, &apbt_clockevent, sizeof(struct clock_event_device));
337
338 if (disable_apbt_percpu) {
339 apbt_clockevent.rating = APBT_CLOCKEVENT_RATING - 100;
340 global_clock_event = &adev->evt;
341 printk(KERN_DEBUG "%s clockevent registered as global\n",
342 global_clock_event->name);
343 }
344
345 if (request_irq(apbt_clockevent.irq, apbt_interrupt_handler,
346 IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING,
347 apbt_clockevent.name, adev)) {
348 printk(KERN_ERR "Failed request IRQ for APBT%d\n",
349 apbt_clockevent.irq);
350 }
351
352 clockevents_register_device(&adev->evt);
353 /* Start APBT 0 interrupts */
354 apbt_enable_int(APBT_CLOCKEVENT0_NUM);
355
356 sfi_free_mtmr(mtmr);
357 return 0;
358}
359
360#ifdef CONFIG_SMP
361/* Should be called with per cpu */
362void apbt_setup_secondary_clock(void)
363{
364 struct apbt_dev *adev;
365 struct clock_event_device *aevt;
366 int cpu;
367
368 /* Don't register boot CPU clockevent */
369 cpu = smp_processor_id();
370 if (cpu == boot_cpu_id)
371 return;
372 /*
373 * We need to calculate the scaled math multiplication factor for
374 * nanosecond to apbt tick conversion.
375 * mult = (nsec/cycle)*2^APBT_SHIFT
376 */
377 printk(KERN_INFO "Init per CPU clockevent %d\n", cpu);
378 adev = &per_cpu(cpu_apbt_dev, cpu);
379 aevt = &adev->evt;
380
381 memcpy(aevt, &apbt_clockevent, sizeof(*aevt));
382 aevt->cpumask = cpumask_of(cpu);
383 aevt->name = adev->name;
384 aevt->mode = CLOCK_EVT_MODE_UNUSED;
385
386 printk(KERN_INFO "Registering CPU %d clockevent device %s, mask %08x\n",
387 cpu, aevt->name, *(u32 *)aevt->cpumask);
388
389 apbt_setup_irq(adev);
390
391 clockevents_register_device(aevt);
392
393 apbt_enable_int(cpu);
394
395 return;
396}
397
398/*
399 * this notify handler process CPU hotplug events. in case of S0i3, nonboot
400 * cpus are disabled/enabled frequently, for performance reasons, we keep the
401 * per cpu timer irq registered so that we do not need to do free_irq/request_irq.
402 *
403 * TODO: it might be more reliable to directly disable percpu clockevent device
404 * without the notifier chain. currently, cpu 0 may get interrupts from other
405 * cpu timers during the offline process due to the ordering of notification.
406 * the extra interrupt is harmless.
407 */
408static int apbt_cpuhp_notify(struct notifier_block *n,
409 unsigned long action, void *hcpu)
410{
411 unsigned long cpu = (unsigned long)hcpu;
412 struct apbt_dev *adev = &per_cpu(cpu_apbt_dev, cpu);
413
414 switch (action & 0xf) {
415 case CPU_DEAD:
416 apbt_disable_int(cpu);
417 if (system_state == SYSTEM_RUNNING)
418 pr_debug("skipping APBT CPU %lu offline\n", cpu);
419 else if (adev) {
420 pr_debug("APBT clockevent for cpu %lu offline\n", cpu);
421 free_irq(adev->irq, adev);
422 }
423 break;
424 default:
425 pr_debug(KERN_INFO "APBT notified %lu, no action\n", action);
426 }
427 return NOTIFY_OK;
428}
429
430static __init int apbt_late_init(void)
431{
432 if (disable_apbt_percpu || !apb_timer_block_enabled)
433 return 0;
434 /* This notifier should be called after workqueue is ready */
435 hotcpu_notifier(apbt_cpuhp_notify, -20);
436 return 0;
437}
438fs_initcall(apbt_late_init);
439#else
440
/* !CONFIG_SMP: there are no secondary CPUs, nothing to set up. */
void apbt_setup_secondary_clock(void)
{
}
442
443#endif /* CONFIG_SMP */
444
445static void apbt_set_mode(enum clock_event_mode mode,
446 struct clock_event_device *evt)
447{
448 unsigned long ctrl;
449 uint64_t delta;
450 int timer_num;
451 struct apbt_dev *adev = EVT_TO_APBT_DEV(evt);
452
453 timer_num = adev->num;
454 pr_debug("%s CPU %d timer %d mode=%d\n",
455 __func__, first_cpu(*evt->cpumask), timer_num, mode);
456
457 switch (mode) {
458 case CLOCK_EVT_MODE_PERIODIC:
459 delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * apbt_clockevent.mult;
460 delta >>= apbt_clockevent.shift;
461 ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL);
462 ctrl |= APBTMR_CONTROL_MODE_PERIODIC;
463 apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
464 /*
465 * DW APB p. 46, have to disable timer before load counter,
466 * may cause sync problem.
467 */
468 ctrl &= ~APBTMR_CONTROL_ENABLE;
469 apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
470 udelay(1);
471 pr_debug("Setting clock period %d for HZ %d\n", (int)delta, HZ);
472 apbt_writel(timer_num, delta, APBTMR_N_LOAD_COUNT);
473 ctrl |= APBTMR_CONTROL_ENABLE;
474 apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
475 break;
476 /* APB timer does not have one-shot mode, use free running mode */
477 case CLOCK_EVT_MODE_ONESHOT:
478 ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL);
479 /*
480 * set free running mode, this mode will let timer reload max
481 * timeout which will give time (3min on 25MHz clock) to rearm
482 * the next event, therefore emulate the one-shot mode.
483 */
484 ctrl &= ~APBTMR_CONTROL_ENABLE;
485 ctrl &= ~APBTMR_CONTROL_MODE_PERIODIC;
486
487 apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
488 /* write again to set free running mode */
489 apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
490
491 /*
492 * DW APB p. 46, load counter with all 1s before starting free
493 * running mode.
494 */
495 apbt_writel(timer_num, ~0, APBTMR_N_LOAD_COUNT);
496 ctrl &= ~APBTMR_CONTROL_INT;
497 ctrl |= APBTMR_CONTROL_ENABLE;
498 apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
499 break;
500
501 case CLOCK_EVT_MODE_UNUSED:
502 case CLOCK_EVT_MODE_SHUTDOWN:
503 apbt_disable_int(timer_num);
504 ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL);
505 ctrl &= ~APBTMR_CONTROL_ENABLE;
506 apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
507 break;
508
509 case CLOCK_EVT_MODE_RESUME:
510 apbt_enable_int(timer_num);
511 break;
512 }
513}
514
515static int apbt_next_event(unsigned long delta,
516 struct clock_event_device *evt)
517{
518 unsigned long ctrl;
519 int timer_num;
520
521 struct apbt_dev *adev = EVT_TO_APBT_DEV(evt);
522
523 timer_num = adev->num;
524 /* Disable timer */
525 ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL);
526 ctrl &= ~APBTMR_CONTROL_ENABLE;
527 apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
528 /* write new count */
529 apbt_writel(timer_num, delta, APBTMR_N_LOAD_COUNT);
530 ctrl |= APBTMR_CONTROL_ENABLE;
531 apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
532 return 0;
533}
534
535/*
536 * APB timer clock is not in sync with pclk on Langwell, which translates to
537 * unreliable read value caused by sampling error. the error does not add up
538 * overtime and only happens when sampling a 0 as a 1 by mistake. so the time
539 * would go backwards. the following code is trying to prevent time traveling
540 * backwards. little bit paranoid.
541 */
542static cycle_t apbt_read_clocksource(struct clocksource *cs)
543{
544 unsigned long t0, t1, t2;
545 static unsigned long last_read;
546
547bad_count:
548 t1 = apbt_readl(phy_cs_timer_id,
549 APBTMR_N_CURRENT_VALUE);
550 t2 = apbt_readl(phy_cs_timer_id,
551 APBTMR_N_CURRENT_VALUE);
552 if (unlikely(t1 < t2)) {
553 pr_debug("APBT: read current count error %lx:%lx:%lx\n",
554 t1, t2, t2 - t1);
555 goto bad_count;
556 }
557 /*
558 * check against cached last read, makes sure time does not go back.
559 * it could be a normal rollover but we will do tripple check anyway
560 */
561 if (unlikely(t2 > last_read)) {
562 /* check if we have a normal rollover */
563 unsigned long raw_intr_status =
564 apbt_readl_reg(APBTMRS_RAW_INT_STATUS);
565 /*
566 * cs timer interrupt is masked but raw intr bit is set if
567 * rollover occurs. then we read EOI reg to clear it.
568 */
569 if (raw_intr_status & (1 << phy_cs_timer_id)) {
570 apbt_readl(phy_cs_timer_id, APBTMR_N_EOI);
571 goto out;
572 }
573 pr_debug("APB CS going back %lx:%lx:%lx ",
574 t2, last_read, t2 - last_read);
575bad_count_x3:
576 pr_debug(KERN_INFO "tripple check enforced\n");
577 t0 = apbt_readl(phy_cs_timer_id,
578 APBTMR_N_CURRENT_VALUE);
579 udelay(1);
580 t1 = apbt_readl(phy_cs_timer_id,
581 APBTMR_N_CURRENT_VALUE);
582 udelay(1);
583 t2 = apbt_readl(phy_cs_timer_id,
584 APBTMR_N_CURRENT_VALUE);
585 if ((t2 > t1) || (t1 > t0)) {
586 printk(KERN_ERR "Error: APB CS tripple check failed\n");
587 goto bad_count_x3;
588 }
589 }
590out:
591 last_read = t2;
592 return (cycle_t)~t2;
593}
594
595static int apbt_clocksource_register(void)
596{
597 u64 start, now;
598 cycle_t t1;
599
600 /* Start the counter, use timer 2 as source, timer 0/1 for event */
601 apbt_start_counter(phy_cs_timer_id);
602
603 /* Verify whether apbt counter works */
604 t1 = apbt_read_clocksource(&clocksource_apbt);
605 rdtscll(start);
606
607 /*
608 * We don't know the TSC frequency yet, but waiting for
609 * 200000 TSC cycles is safe:
610 * 4 GHz == 50us
611 * 1 GHz == 200us
612 */
613 do {
614 rep_nop();
615 rdtscll(now);
616 } while ((now - start) < 200000UL);
617
618 /* APBT is the only always on clocksource, it has to work! */
619 if (t1 == apbt_read_clocksource(&clocksource_apbt))
620 panic("APBT counter not counting. APBT disabled\n");
621
622 /*
623 * initialize and register APBT clocksource
624 * convert that to ns/clock cycle
625 * mult = (ns/c) * 2^APBT_SHIFT
626 */
627 clocksource_apbt.mult = div_sc(MSEC_PER_SEC,
628 (unsigned long) apbt_freq, APBT_SHIFT);
629 clocksource_register(&clocksource_apbt);
630
631 return 0;
632}
633
634/*
635 * Early setup the APBT timer, only use timer 0 for booting then switch to
636 * per CPU timer if possible.
637 * returns 1 if per cpu apbt is setup
638 * returns 0 if no per cpu apbt is chosen
639 * panic if set up failed, this is the only platform timer on Moorestown.
640 */
641void __init apbt_time_init(void)
642{
643#ifdef CONFIG_SMP
644 int i;
645 struct sfi_timer_table_entry *p_mtmr;
646 unsigned int percpu_timer;
647 struct apbt_dev *adev;
648#endif
649
650 if (apb_timer_block_enabled)
651 return;
652 apbt_set_mapping();
653 if (apbt_virt_address) {
654 pr_debug("Found APBT version 0x%lx\n",\
655 apbt_readl_reg(APBTMRS_COMP_VERSION));
656 } else
657 goto out_noapbt;
658 /*
659 * Read the frequency and check for a sane value, for ESL model
660 * we extend the possible clock range to allow time scaling.
661 */
662
663 if (apbt_freq < APBT_MIN_FREQ || apbt_freq > APBT_MAX_FREQ) {
664 pr_debug("APBT has invalid freq 0x%llx\n", apbt_freq);
665 goto out_noapbt;
666 }
667 if (apbt_clocksource_register()) {
668 pr_debug("APBT has failed to register clocksource\n");
669 goto out_noapbt;
670 }
671 if (!apbt_clockevent_register())
672 apb_timer_block_enabled = 1;
673 else {
674 pr_debug("APBT has failed to register clockevent\n");
675 goto out_noapbt;
676 }
677#ifdef CONFIG_SMP
678 /* kernel cmdline disable apb timer, so we will use lapic timers */
679 if (disable_apbt_percpu) {
680 printk(KERN_INFO "apbt: disabled per cpu timer\n");
681 return;
682 }
683 pr_debug("%s: %d CPUs online\n", __func__, num_online_cpus());
684 if (num_possible_cpus() <= sfi_mtimer_num) {
685 percpu_timer = 1;
686 apbt_num_timers_used = num_possible_cpus();
687 } else {
688 percpu_timer = 0;
689 apbt_num_timers_used = 1;
690 adev = &per_cpu(cpu_apbt_dev, 0);
691 adev->flags &= ~APBT_DEV_USED;
692 }
693 pr_debug("%s: %d APB timers used\n", __func__, apbt_num_timers_used);
694
695 /* here we set up per CPU timer data structure */
696 apbt_devs = kzalloc(sizeof(struct apbt_dev) * apbt_num_timers_used,
697 GFP_KERNEL);
698 if (!apbt_devs) {
699 printk(KERN_ERR "Failed to allocate APB timer devices\n");
700 return;
701 }
702 for (i = 0; i < apbt_num_timers_used; i++) {
703 adev = &per_cpu(cpu_apbt_dev, i);
704 adev->num = i;
705 adev->cpu = i;
706 p_mtmr = sfi_get_mtmr(i);
707 if (p_mtmr) {
708 adev->tick = p_mtmr->freq_hz;
709 adev->irq = p_mtmr->irq;
710 } else
711 printk(KERN_ERR "Failed to get timer for cpu %d\n", i);
712 adev->count = 0;
713 sprintf(adev->name, "apbt%d", i);
714 }
715#endif
716
717 return;
718
719out_noapbt:
720 apbt_clear_mapping();
721 apb_timer_block_enabled = 0;
722 panic("failed to enable APB timer\n");
723}
724
725static inline void apbt_disable(int n)
726{
727 if (is_apbt_capable()) {
728 unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
729 ctrl &= ~APBTMR_CONTROL_ENABLE;
730 apbt_writel(n, ctrl, APBTMR_N_CONTROL);
731 }
732}
733
734/* called before apb_timer_enable, use early map */
735unsigned long apbt_quick_calibrate()
736{
737 int i, scale;
738 u64 old, new;
739 cycle_t t1, t2;
740 unsigned long khz = 0;
741 u32 loop, shift;
742
743 apbt_set_mapping();
744 apbt_start_counter(phy_cs_timer_id);
745
746 /* check if the timer can count down, otherwise return */
747 old = apbt_read_clocksource(&clocksource_apbt);
748 i = 10000;
749 while (--i) {
750 if (old != apbt_read_clocksource(&clocksource_apbt))
751 break;
752 }
753 if (!i)
754 goto failed;
755
756 /* count 16 ms */
757 loop = (apbt_freq * 1000) << 4;
758
759 /* restart the timer to ensure it won't get to 0 in the calibration */
760 apbt_start_counter(phy_cs_timer_id);
761
762 old = apbt_read_clocksource(&clocksource_apbt);
763 old += loop;
764
765 t1 = __native_read_tsc();
766
767 do {
768 new = apbt_read_clocksource(&clocksource_apbt);
769 } while (new < old);
770
771 t2 = __native_read_tsc();
772
773 shift = 5;
774 if (unlikely(loop >> shift == 0)) {
775 printk(KERN_INFO
776 "APBT TSC calibration failed, not enough resolution\n");
777 return 0;
778 }
779 scale = (int)div_u64((t2 - t1), loop >> shift);
780 khz = (scale * apbt_freq * 1000) >> shift;
781 printk(KERN_INFO "TSC freq calculated by APB timer is %lu khz\n", khz);
782 return khz;
783failed:
784 return 0;
785}
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 128111d8ffe0..b5d8b0bcf235 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -28,6 +28,7 @@
28#include <asm/pci-direct.h> 28#include <asm/pci-direct.h>
29#include <asm/dma.h> 29#include <asm/dma.h>
30#include <asm/k8.h> 30#include <asm/k8.h>
31#include <asm/x86_init.h>
31 32
32int gart_iommu_aperture; 33int gart_iommu_aperture;
33int gart_iommu_aperture_disabled __initdata; 34int gart_iommu_aperture_disabled __initdata;
@@ -279,7 +280,8 @@ void __init early_gart_iommu_check(void)
279 * or BIOS forget to put that in reserved. 280 * or BIOS forget to put that in reserved.
280 * try to update e820 to make that region as reserved. 281 * try to update e820 to make that region as reserved.
281 */ 282 */
282 int i, fix, slot; 283 u32 agp_aper_base = 0, agp_aper_order = 0;
284 int i, fix, slot, valid_agp = 0;
283 u32 ctl; 285 u32 ctl;
284 u32 aper_size = 0, aper_order = 0, last_aper_order = 0; 286 u32 aper_size = 0, aper_order = 0, last_aper_order = 0;
285 u64 aper_base = 0, last_aper_base = 0; 287 u64 aper_base = 0, last_aper_base = 0;
@@ -289,6 +291,8 @@ void __init early_gart_iommu_check(void)
289 return; 291 return;
290 292
291 /* This is mostly duplicate of iommu_hole_init */ 293 /* This is mostly duplicate of iommu_hole_init */
294 agp_aper_base = search_agp_bridge(&agp_aper_order, &valid_agp);
295
292 fix = 0; 296 fix = 0;
293 for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { 297 for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
294 int bus; 298 int bus;
@@ -341,10 +345,10 @@ void __init early_gart_iommu_check(void)
341 } 345 }
342 } 346 }
343 347
344 if (!fix) 348 if (valid_agp)
345 return; 349 return;
346 350
347 /* different nodes have different setting, disable them all at first*/ 351 /* disable them all at first */
348 for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { 352 for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
349 int bus; 353 int bus;
350 int dev_base, dev_limit; 354 int dev_base, dev_limit;
@@ -389,6 +393,7 @@ void __init gart_iommu_hole_init(void)
389 for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { 393 for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
390 int bus; 394 int bus;
391 int dev_base, dev_limit; 395 int dev_base, dev_limit;
396 u32 ctl;
392 397
393 bus = bus_dev_ranges[i].bus; 398 bus = bus_dev_ranges[i].bus;
394 dev_base = bus_dev_ranges[i].dev_base; 399 dev_base = bus_dev_ranges[i].dev_base;
@@ -400,8 +405,21 @@ void __init gart_iommu_hole_init(void)
400 405
401 iommu_detected = 1; 406 iommu_detected = 1;
402 gart_iommu_aperture = 1; 407 gart_iommu_aperture = 1;
408 x86_init.iommu.iommu_init = gart_iommu_init;
409
410 ctl = read_pci_config(bus, slot, 3,
411 AMD64_GARTAPERTURECTL);
412
413 /*
414 * Before we do anything else disable the GART. It may
415 * still be enabled if we boot into a crash-kernel here.
416 * Reconfiguring the GART while it is enabled could have
417 * unknown side-effects.
418 */
419 ctl &= ~GARTEN;
420 write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl);
403 421
404 aper_order = (read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL) >> 1) & 7; 422 aper_order = (ctl >> 1) & 7;
405 aper_size = (32 * 1024 * 1024) << aper_order; 423 aper_size = (32 * 1024 * 1024) << aper_order;
406 aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff; 424 aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff;
407 aper_base <<= 25; 425 aper_base <<= 25;
@@ -456,8 +474,6 @@ out:
456 474
457 if (aper_alloc) { 475 if (aper_alloc) {
458 /* Got the aperture from the AGP bridge */ 476 /* Got the aperture from the AGP bridge */
459 } else if (swiotlb && !valid_agp) {
460 /* Do nothing */
461 } else if ((!no_iommu && max_pfn > MAX_DMA32_PFN) || 477 } else if ((!no_iommu && max_pfn > MAX_DMA32_PFN) ||
462 force_iommu || 478 force_iommu ||
463 valid_agp || 479 valid_agp ||
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
index da7b7b9f8bd8..565c1bfc507d 100644
--- a/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@ -2,7 +2,7 @@
2# Makefile for local APIC drivers and for the IO-APIC code 2# Makefile for local APIC drivers and for the IO-APIC code
3# 3#
4 4
5obj-$(CONFIG_X86_LOCAL_APIC) += apic.o probe_$(BITS).o ipi.o nmi.o 5obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o nmi.o
6obj-$(CONFIG_X86_IO_APIC) += io_apic.o 6obj-$(CONFIG_X86_IO_APIC) += io_apic.o
7obj-$(CONFIG_SMP) += ipi.o 7obj-$(CONFIG_SMP) += ipi.o
8 8
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 894aa97f0717..e5a4a1e01618 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -61,12 +61,6 @@ unsigned int boot_cpu_physical_apicid = -1U;
61 61
62/* 62/*
63 * The highest APIC ID seen during enumeration. 63 * The highest APIC ID seen during enumeration.
64 *
65 * On AMD, this determines the messaging protocol we can use: if all APIC IDs
66 * are in the 0 ... 7 range, then we can use logical addressing which
67 * has some performance advantages (better broadcasting).
68 *
69 * If there's an APIC ID above 8, we use physical addressing.
70 */ 64 */
71unsigned int max_physical_apicid; 65unsigned int max_physical_apicid;
72 66
@@ -241,28 +235,13 @@ static int modern_apic(void)
241} 235}
242 236
243/* 237/*
244 * bare function to substitute write operation 238 * right after this call apic become NOOP driven
245 * and it's _that_ fast :) 239 * so apic->write/read doesn't do anything
246 */
247static void native_apic_write_dummy(u32 reg, u32 v)
248{
249 WARN_ON_ONCE((cpu_has_apic || !disable_apic));
250}
251
252static u32 native_apic_read_dummy(u32 reg)
253{
254 WARN_ON_ONCE((cpu_has_apic && !disable_apic));
255 return 0;
256}
257
258/*
259 * right after this call apic->write/read doesn't do anything
260 * note that there is no restore operation it works one way
261 */ 240 */
262void apic_disable(void) 241void apic_disable(void)
263{ 242{
264 apic->read = native_apic_read_dummy; 243 pr_info("APIC: switched to apic NOOP\n");
265 apic->write = native_apic_write_dummy; 244 apic = &apic_noop;
266} 245}
267 246
268void native_apic_wait_icr_idle(void) 247void native_apic_wait_icr_idle(void)
@@ -459,7 +438,7 @@ static void lapic_timer_setup(enum clock_event_mode mode,
459 v = apic_read(APIC_LVTT); 438 v = apic_read(APIC_LVTT);
460 v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); 439 v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
461 apic_write(APIC_LVTT, v); 440 apic_write(APIC_LVTT, v);
462 apic_write(APIC_TMICT, 0xffffffff); 441 apic_write(APIC_TMICT, 0);
463 break; 442 break;
464 case CLOCK_EVT_MODE_RESUME: 443 case CLOCK_EVT_MODE_RESUME:
465 /* Nothing to do here */ 444 /* Nothing to do here */
@@ -602,7 +581,7 @@ calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc)
602 res = (((u64)(*deltatsc)) * pm_100ms); 581 res = (((u64)(*deltatsc)) * pm_100ms);
603 do_div(res, deltapm); 582 do_div(res, deltapm);
604 apic_printk(APIC_VERBOSE, "TSC delta adjusted to " 583 apic_printk(APIC_VERBOSE, "TSC delta adjusted to "
605 "PM-Timer: %lu (%ld) \n", 584 "PM-Timer: %lu (%ld)\n",
606 (unsigned long)res, *deltatsc); 585 (unsigned long)res, *deltatsc);
607 *deltatsc = (long)res; 586 *deltatsc = (long)res;
608 } 587 }
@@ -662,7 +641,7 @@ static int __init calibrate_APIC_clock(void)
662 calibration_result = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS; 641 calibration_result = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS;
663 642
664 apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta); 643 apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta);
665 apic_printk(APIC_VERBOSE, "..... mult: %ld\n", lapic_clockevent.mult); 644 apic_printk(APIC_VERBOSE, "..... mult: %u\n", lapic_clockevent.mult);
666 apic_printk(APIC_VERBOSE, "..... calibration result: %u\n", 645 apic_printk(APIC_VERBOSE, "..... calibration result: %u\n",
667 calibration_result); 646 calibration_result);
668 647
@@ -1356,7 +1335,7 @@ void enable_x2apic(void)
1356 1335
1357 rdmsr(MSR_IA32_APICBASE, msr, msr2); 1336 rdmsr(MSR_IA32_APICBASE, msr, msr2);
1358 if (!(msr & X2APIC_ENABLE)) { 1337 if (!(msr & X2APIC_ENABLE)) {
1359 pr_info("Enabling x2apic\n"); 1338 printk_once(KERN_INFO "Enabling x2apic\n");
1360 wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0); 1339 wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0);
1361 } 1340 }
1362} 1341}
@@ -1392,14 +1371,11 @@ void __init enable_IR_x2apic(void)
1392 unsigned long flags; 1371 unsigned long flags;
1393 struct IO_APIC_route_entry **ioapic_entries = NULL; 1372 struct IO_APIC_route_entry **ioapic_entries = NULL;
1394 int ret, x2apic_enabled = 0; 1373 int ret, x2apic_enabled = 0;
1395 int dmar_table_init_ret = 0; 1374 int dmar_table_init_ret;
1396 1375
1397#ifdef CONFIG_INTR_REMAP
1398 dmar_table_init_ret = dmar_table_init(); 1376 dmar_table_init_ret = dmar_table_init();
1399 if (dmar_table_init_ret) 1377 if (dmar_table_init_ret && !x2apic_supported())
1400 pr_debug("dmar_table_init() failed with %d:\n", 1378 return;
1401 dmar_table_init_ret);
1402#endif
1403 1379
1404 ioapic_entries = alloc_ioapic_entries(); 1380 ioapic_entries = alloc_ioapic_entries();
1405 if (!ioapic_entries) { 1381 if (!ioapic_entries) {
@@ -1414,7 +1390,7 @@ void __init enable_IR_x2apic(void)
1414 } 1390 }
1415 1391
1416 local_irq_save(flags); 1392 local_irq_save(flags);
1417 mask_8259A(); 1393 legacy_pic->mask_all();
1418 mask_IO_APIC_setup(ioapic_entries); 1394 mask_IO_APIC_setup(ioapic_entries);
1419 1395
1420 if (dmar_table_init_ret) 1396 if (dmar_table_init_ret)
@@ -1446,7 +1422,7 @@ void __init enable_IR_x2apic(void)
1446nox2apic: 1422nox2apic:
1447 if (!ret) /* IR enabling failed */ 1423 if (!ret) /* IR enabling failed */
1448 restore_IO_APIC_setup(ioapic_entries); 1424 restore_IO_APIC_setup(ioapic_entries);
1449 unmask_8259A(); 1425 legacy_pic->restore_mask();
1450 local_irq_restore(flags); 1426 local_irq_restore(flags);
1451 1427
1452out: 1428out:
@@ -1664,8 +1640,8 @@ int __init APIC_init_uniprocessor(void)
1664 } 1640 }
1665#endif 1641#endif
1666 1642
1643#ifndef CONFIG_SMP
1667 enable_IR_x2apic(); 1644 enable_IR_x2apic();
1668#ifdef CONFIG_X86_64
1669 default_setup_apic_routing(); 1645 default_setup_apic_routing();
1670#endif 1646#endif
1671 1647
@@ -1915,18 +1891,6 @@ void __cpuinit generic_processor_info(int apicid, int version)
1915 if (apicid > max_physical_apicid) 1891 if (apicid > max_physical_apicid)
1916 max_physical_apicid = apicid; 1892 max_physical_apicid = apicid;
1917 1893
1918#ifdef CONFIG_X86_32
1919 switch (boot_cpu_data.x86_vendor) {
1920 case X86_VENDOR_INTEL:
1921 if (num_processors > 8)
1922 def_to_bigsmp = 1;
1923 break;
1924 case X86_VENDOR_AMD:
1925 if (max_physical_apicid >= 8)
1926 def_to_bigsmp = 1;
1927 }
1928#endif
1929
1930#if defined(CONFIG_SMP) || defined(CONFIG_X86_64) 1894#if defined(CONFIG_SMP) || defined(CONFIG_X86_64)
1931 early_per_cpu(x86_cpu_to_apicid, cpu) = apicid; 1895 early_per_cpu(x86_cpu_to_apicid, cpu) = apicid;
1932 early_per_cpu(x86_bios_cpu_apicid, cpu) = apicid; 1896 early_per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
@@ -2056,7 +2020,7 @@ static int lapic_resume(struct sys_device *dev)
2056 } 2020 }
2057 2021
2058 mask_IO_APIC_setup(ioapic_entries); 2022 mask_IO_APIC_setup(ioapic_entries);
2059 mask_8259A(); 2023 legacy_pic->mask_all();
2060 } 2024 }
2061 2025
2062 if (x2apic_mode) 2026 if (x2apic_mode)
@@ -2100,7 +2064,7 @@ static int lapic_resume(struct sys_device *dev)
2100 2064
2101 if (intr_remapping_enabled) { 2065 if (intr_remapping_enabled) {
2102 reenable_intr_remapping(x2apic_mode); 2066 reenable_intr_remapping(x2apic_mode);
2103 unmask_8259A(); 2067 legacy_pic->restore_mask();
2104 restore_IO_APIC_setup(ioapic_entries); 2068 restore_IO_APIC_setup(ioapic_entries);
2105 free_ioapic_entries(ioapic_entries); 2069 free_ioapic_entries(ioapic_entries);
2106 } 2070 }
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index d0c99abc26c3..09d3b17ce0c2 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -223,7 +223,7 @@ struct apic apic_flat = {
223}; 223};
224 224
225/* 225/*
226 * Physflat mode is used when there are more than 8 CPUs on a AMD system. 226 * Physflat mode is used when there are more than 8 CPUs on a system.
227 * We cannot use logical delivery in this case because the mask 227 * We cannot use logical delivery in this case because the mask
228 * overflows, so use physical mode. 228 * overflows, so use physical mode.
229 */ 229 */
@@ -240,6 +240,11 @@ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
240 printk(KERN_DEBUG "system APIC only can use physical flat"); 240 printk(KERN_DEBUG "system APIC only can use physical flat");
241 return 1; 241 return 1;
242 } 242 }
243
244 if (!strncmp(oem_id, "IBM", 3) && !strncmp(oem_table_id, "EXA", 3)) {
245 printk(KERN_DEBUG "IBM Summit detected, will use apic physical");
246 return 1;
247 }
243#endif 248#endif
244 249
245 return 0; 250 return 0;
@@ -306,10 +311,7 @@ physflat_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
306 if (cpumask_test_cpu(cpu, cpu_online_mask)) 311 if (cpumask_test_cpu(cpu, cpu_online_mask))
307 break; 312 break;
308 } 313 }
309 if (cpu < nr_cpu_ids) 314 return per_cpu(x86_cpu_to_apicid, cpu);
310 return per_cpu(x86_cpu_to_apicid, cpu);
311
312 return BAD_APICID;
313} 315}
314 316
315struct apic apic_physflat = { 317struct apic apic_physflat = {
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
new file mode 100644
index 000000000000..e31b9ffe25f5
--- /dev/null
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -0,0 +1,200 @@
1/*
2 * NOOP APIC driver.
3 *
4 * Does almost nothing and should be substituted by a real apic driver via
5 * probe routine.
6 *
7 * Though in case if apic is disabled (for some reason) we try
8 * to not uglify the caller's code and allow to call (some) apic routines
9 * like self-ipi, etc...
10 */
11
12#include <linux/threads.h>
13#include <linux/cpumask.h>
14#include <linux/module.h>
15#include <linux/string.h>
16#include <linux/kernel.h>
17#include <linux/ctype.h>
18#include <linux/init.h>
19#include <linux/errno.h>
20#include <asm/fixmap.h>
21#include <asm/mpspec.h>
22#include <asm/apicdef.h>
23#include <asm/apic.h>
24#include <asm/setup.h>
25
26#include <linux/smp.h>
27#include <asm/ipi.h>
28
29#include <linux/interrupt.h>
30#include <asm/acpi.h>
31#include <asm/e820.h>
32
33static void noop_init_apic_ldr(void) { }
34static void noop_send_IPI_mask(const struct cpumask *cpumask, int vector) { }
35static void noop_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) { }
36static void noop_send_IPI_allbutself(int vector) { }
37static void noop_send_IPI_all(int vector) { }
38static void noop_send_IPI_self(int vector) { }
39static void noop_apic_wait_icr_idle(void) { }
40static void noop_apic_icr_write(u32 low, u32 id) { }
41
42static int noop_wakeup_secondary_cpu(int apicid, unsigned long start_eip)
43{
44 return -1;
45}
46
47static u32 noop_safe_apic_wait_icr_idle(void)
48{
49 return 0;
50}
51
52static u64 noop_apic_icr_read(void)
53{
54 return 0;
55}
56
57static int noop_cpu_to_logical_apicid(int cpu)
58{
59 return 0;
60}
61
62static int noop_phys_pkg_id(int cpuid_apic, int index_msb)
63{
64 return 0;
65}
66
67static unsigned int noop_get_apic_id(unsigned long x)
68{
69 return 0;
70}
71
72static int noop_probe(void)
73{
74 /*
75 * NOOP apic should not ever be
76 * enabled via probe routine
77 */
78 return 0;
79}
80
81static int noop_apic_id_registered(void)
82{
83 /*
84 * if we would be really "pedantic"
85 * we should pass read_apic_id() here
86 * but since NOOP suppose APIC ID = 0
87 * lets save a few cycles
88 */
89 return physid_isset(0, phys_cpu_present_map);
90}
91
92static const struct cpumask *noop_target_cpus(void)
93{
94 /* only BSP here */
95 return cpumask_of(0);
96}
97
98static unsigned long noop_check_apicid_used(physid_mask_t *map, int apicid)
99{
100 return physid_isset(apicid, *map);
101}
102
103static unsigned long noop_check_apicid_present(int bit)
104{
105 return physid_isset(bit, phys_cpu_present_map);
106}
107
108static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask)
109{
110 if (cpu != 0)
111 pr_warning("APIC: Vector allocated for non-BSP cpu\n");
112 cpumask_clear(retmask);
113 cpumask_set_cpu(cpu, retmask);
114}
115
116int noop_apicid_to_node(int logical_apicid)
117{
118 /* we're always on node 0 */
119 return 0;
120}
121
122static u32 noop_apic_read(u32 reg)
123{
124 WARN_ON_ONCE((cpu_has_apic && !disable_apic));
125 return 0;
126}
127
128static void noop_apic_write(u32 reg, u32 v)
129{
130 WARN_ON_ONCE(cpu_has_apic && !disable_apic);
131}
132
133struct apic apic_noop = {
134 .name = "noop",
135 .probe = noop_probe,
136 .acpi_madt_oem_check = NULL,
137
138 .apic_id_registered = noop_apic_id_registered,
139
140 .irq_delivery_mode = dest_LowestPrio,
141 /* logical delivery broadcast to all CPUs: */
142 .irq_dest_mode = 1,
143
144 .target_cpus = noop_target_cpus,
145 .disable_esr = 0,
146 .dest_logical = APIC_DEST_LOGICAL,
147 .check_apicid_used = noop_check_apicid_used,
148 .check_apicid_present = noop_check_apicid_present,
149
150 .vector_allocation_domain = noop_vector_allocation_domain,
151 .init_apic_ldr = noop_init_apic_ldr,
152
153 .ioapic_phys_id_map = default_ioapic_phys_id_map,
154 .setup_apic_routing = NULL,
155 .multi_timer_check = NULL,
156 .apicid_to_node = noop_apicid_to_node,
157
158 .cpu_to_logical_apicid = noop_cpu_to_logical_apicid,
159 .cpu_present_to_apicid = default_cpu_present_to_apicid,
160 .apicid_to_cpu_present = physid_set_mask_of_physid,
161
162 .setup_portio_remap = NULL,
163 .check_phys_apicid_present = default_check_phys_apicid_present,
164 .enable_apic_mode = NULL,
165
166 .phys_pkg_id = noop_phys_pkg_id,
167
168 .mps_oem_check = NULL,
169
170 .get_apic_id = noop_get_apic_id,
171 .set_apic_id = NULL,
172 .apic_id_mask = 0x0F << 24,
173
174 .cpu_mask_to_apicid = default_cpu_mask_to_apicid,
175 .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
176
177 .send_IPI_mask = noop_send_IPI_mask,
178 .send_IPI_mask_allbutself = noop_send_IPI_mask_allbutself,
179 .send_IPI_allbutself = noop_send_IPI_allbutself,
180 .send_IPI_all = noop_send_IPI_all,
181 .send_IPI_self = noop_send_IPI_self,
182
183 .wakeup_secondary_cpu = noop_wakeup_secondary_cpu,
184
185 /* should be safe */
186 .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
187 .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
188
189 .wait_for_init_deassert = NULL,
190
191 .smp_callin_clear_local_apic = NULL,
192 .inquire_remote_apic = NULL,
193
194 .read = noop_apic_read,
195 .write = noop_apic_write,
196 .icr_read = noop_apic_icr_read,
197 .icr_write = noop_apic_icr_write,
198 .wait_icr_idle = noop_apic_wait_icr_idle,
199 .safe_wait_icr_idle = noop_safe_apic_wait_icr_idle,
200};
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index 77a06413b6b2..cb804c5091b9 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -35,7 +35,7 @@ static const struct cpumask *bigsmp_target_cpus(void)
35#endif 35#endif
36} 36}
37 37
38static unsigned long bigsmp_check_apicid_used(physid_mask_t bitmap, int apicid) 38static unsigned long bigsmp_check_apicid_used(physid_mask_t *map, int apicid)
39{ 39{
40 return 0; 40 return 0;
41} 41}
@@ -93,11 +93,6 @@ static int bigsmp_cpu_present_to_apicid(int mps_cpu)
93 return BAD_APICID; 93 return BAD_APICID;
94} 94}
95 95
96static physid_mask_t bigsmp_apicid_to_cpu_present(int phys_apicid)
97{
98 return physid_mask_of_physid(phys_apicid);
99}
100
101/* Mapping from cpu number to logical apicid */ 96/* Mapping from cpu number to logical apicid */
102static inline int bigsmp_cpu_to_logical_apicid(int cpu) 97static inline int bigsmp_cpu_to_logical_apicid(int cpu)
103{ 98{
@@ -106,10 +101,10 @@ static inline int bigsmp_cpu_to_logical_apicid(int cpu)
106 return cpu_physical_id(cpu); 101 return cpu_physical_id(cpu);
107} 102}
108 103
109static physid_mask_t bigsmp_ioapic_phys_id_map(physid_mask_t phys_map) 104static void bigsmp_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
110{ 105{
111 /* For clustered we don't have a good way to do this yet - hack */ 106 /* For clustered we don't have a good way to do this yet - hack */
112 return physids_promote(0xFFL); 107 physids_promote(0xFFL, retmap);
113} 108}
114 109
115static int bigsmp_check_phys_apicid_present(int phys_apicid) 110static int bigsmp_check_phys_apicid_present(int phys_apicid)
@@ -136,10 +131,7 @@ static unsigned int bigsmp_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
136 if (cpumask_test_cpu(cpu, cpu_online_mask)) 131 if (cpumask_test_cpu(cpu, cpu_online_mask))
137 break; 132 break;
138 } 133 }
139 if (cpu < nr_cpu_ids) 134 return bigsmp_cpu_to_logical_apicid(cpu);
140 return bigsmp_cpu_to_logical_apicid(cpu);
141
142 return BAD_APICID;
143} 135}
144 136
145static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb) 137static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb)
@@ -230,7 +222,7 @@ struct apic apic_bigsmp = {
230 .apicid_to_node = bigsmp_apicid_to_node, 222 .apicid_to_node = bigsmp_apicid_to_node,
231 .cpu_to_logical_apicid = bigsmp_cpu_to_logical_apicid, 223 .cpu_to_logical_apicid = bigsmp_cpu_to_logical_apicid,
232 .cpu_present_to_apicid = bigsmp_cpu_present_to_apicid, 224 .cpu_present_to_apicid = bigsmp_cpu_present_to_apicid,
233 .apicid_to_cpu_present = bigsmp_apicid_to_cpu_present, 225 .apicid_to_cpu_present = physid_set_mask_of_physid,
234 .setup_portio_remap = NULL, 226 .setup_portio_remap = NULL,
235 .check_phys_apicid_present = bigsmp_check_phys_apicid_present, 227 .check_phys_apicid_present = bigsmp_check_phys_apicid_present,
236 .enable_apic_mode = NULL, 228 .enable_apic_mode = NULL,
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 89174f847b49..03ba1b895f5e 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -27,6 +27,9 @@
27 * 27 *
28 * http://www.unisys.com 28 * http://www.unisys.com
29 */ 29 */
30
31#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
32
30#include <linux/notifier.h> 33#include <linux/notifier.h>
31#include <linux/spinlock.h> 34#include <linux/spinlock.h>
32#include <linux/cpumask.h> 35#include <linux/cpumask.h>
@@ -39,6 +42,7 @@
39#include <linux/errno.h> 42#include <linux/errno.h>
40#include <linux/acpi.h> 43#include <linux/acpi.h>
41#include <linux/init.h> 44#include <linux/init.h>
45#include <linux/gfp.h>
42#include <linux/nmi.h> 46#include <linux/nmi.h>
43#include <linux/smp.h> 47#include <linux/smp.h>
44#include <linux/io.h> 48#include <linux/io.h>
@@ -223,9 +227,9 @@ static int parse_unisys_oem(char *oemptr)
223 mip_addr = val; 227 mip_addr = val;
224 mip = (struct mip_reg *)val; 228 mip = (struct mip_reg *)val;
225 mip_reg = __va(mip); 229 mip_reg = __va(mip);
226 pr_debug("es7000_mipcfg: host_reg = 0x%lx \n", 230 pr_debug("host_reg = 0x%lx\n",
227 (unsigned long)host_reg); 231 (unsigned long)host_reg);
228 pr_debug("es7000_mipcfg: mip_reg = 0x%lx \n", 232 pr_debug("mip_reg = 0x%lx\n",
229 (unsigned long)mip_reg); 233 (unsigned long)mip_reg);
230 success++; 234 success++;
231 break; 235 break;
@@ -401,7 +405,7 @@ static void es7000_enable_apic_mode(void)
401 if (!es7000_plat) 405 if (!es7000_plat)
402 return; 406 return;
403 407
404 printk(KERN_INFO "ES7000: Enabling APIC mode.\n"); 408 pr_info("Enabling APIC mode.\n");
405 memset(&es7000_mip_reg, 0, sizeof(struct mip_reg)); 409 memset(&es7000_mip_reg, 0, sizeof(struct mip_reg));
406 es7000_mip_reg.off_0x00 = MIP_SW_APIC; 410 es7000_mip_reg.off_0x00 = MIP_SW_APIC;
407 es7000_mip_reg.off_0x38 = MIP_VALID; 411 es7000_mip_reg.off_0x38 = MIP_VALID;
@@ -466,11 +470,11 @@ static const struct cpumask *es7000_target_cpus(void)
466 return cpumask_of(smp_processor_id()); 470 return cpumask_of(smp_processor_id());
467} 471}
468 472
469static unsigned long 473static unsigned long es7000_check_apicid_used(physid_mask_t *map, int apicid)
470es7000_check_apicid_used(physid_mask_t bitmap, int apicid)
471{ 474{
472 return 0; 475 return 0;
473} 476}
477
474static unsigned long es7000_check_apicid_present(int bit) 478static unsigned long es7000_check_apicid_present(int bit)
475{ 479{
476 return physid_isset(bit, phys_cpu_present_map); 480 return physid_isset(bit, phys_cpu_present_map);
@@ -514,8 +518,7 @@ static void es7000_setup_apic_routing(void)
514{ 518{
515 int apic = per_cpu(x86_bios_cpu_apicid, smp_processor_id()); 519 int apic = per_cpu(x86_bios_cpu_apicid, smp_processor_id());
516 520
517 printk(KERN_INFO 521 pr_info("Enabling APIC mode: %s. Using %d I/O APICs, target cpus %lx\n",
518 "Enabling APIC mode: %s. Using %d I/O APICs, target cpus %lx\n",
519 (apic_version[apic] == 0x14) ? 522 (apic_version[apic] == 0x14) ?
520 "Physical Cluster" : "Logical Cluster", 523 "Physical Cluster" : "Logical Cluster",
521 nr_ioapics, cpumask_bits(es7000_target_cpus())[0]); 524 nr_ioapics, cpumask_bits(es7000_target_cpus())[0]);
@@ -539,14 +542,10 @@ static int es7000_cpu_present_to_apicid(int mps_cpu)
539 542
540static int cpu_id; 543static int cpu_id;
541 544
542static physid_mask_t es7000_apicid_to_cpu_present(int phys_apicid) 545static void es7000_apicid_to_cpu_present(int phys_apicid, physid_mask_t *retmap)
543{ 546{
544 physid_mask_t mask; 547 physid_set_mask_of_physid(cpu_id, retmap);
545
546 mask = physid_mask_of_physid(cpu_id);
547 ++cpu_id; 548 ++cpu_id;
548
549 return mask;
550} 549}
551 550
552/* Mapping from cpu number to logical apicid */ 551/* Mapping from cpu number to logical apicid */
@@ -561,10 +560,10 @@ static int es7000_cpu_to_logical_apicid(int cpu)
561#endif 560#endif
562} 561}
563 562
564static physid_mask_t es7000_ioapic_phys_id_map(physid_mask_t phys_map) 563static void es7000_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
565{ 564{
566 /* For clustered we don't have a good way to do this yet - hack */ 565 /* For clustered we don't have a good way to do this yet - hack */
567 return physids_promote(0xff); 566 physids_promote(0xFFL, retmap);
568} 567}
569 568
570static int es7000_check_phys_apicid_present(int cpu_physical_apicid) 569static int es7000_check_phys_apicid_present(int cpu_physical_apicid)
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index dc69f28489f5..eb2789c3f721 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -36,6 +36,7 @@
36#include <linux/freezer.h> 36#include <linux/freezer.h>
37#include <linux/kthread.h> 37#include <linux/kthread.h>
38#include <linux/jiffies.h> /* time_after() */ 38#include <linux/jiffies.h> /* time_after() */
39#include <linux/slab.h>
39#ifdef CONFIG_ACPI 40#ifdef CONFIG_ACPI
40#include <acpi/acpi_bus.h> 41#include <acpi/acpi_bus.h>
41#endif 42#endif
@@ -60,8 +61,6 @@
60#include <asm/irq_remapping.h> 61#include <asm/irq_remapping.h>
61#include <asm/hpet.h> 62#include <asm/hpet.h>
62#include <asm/hw_irq.h> 63#include <asm/hw_irq.h>
63#include <asm/uv/uv_hub.h>
64#include <asm/uv/uv_irq.h>
65 64
66#include <asm/apic.h> 65#include <asm/apic.h>
67 66
@@ -75,8 +74,8 @@
75 */ 74 */
76int sis_apic_bug = -1; 75int sis_apic_bug = -1;
77 76
78static DEFINE_SPINLOCK(ioapic_lock); 77static DEFINE_RAW_SPINLOCK(ioapic_lock);
79static DEFINE_SPINLOCK(vector_lock); 78static DEFINE_RAW_SPINLOCK(vector_lock);
80 79
81/* 80/*
82 * # of IRQ routing registers 81 * # of IRQ routing registers
@@ -96,8 +95,6 @@ struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES];
96/* # of MP IRQ source entries */ 95/* # of MP IRQ source entries */
97int mp_irq_entries; 96int mp_irq_entries;
98 97
99/* Number of legacy interrupts */
100static int nr_legacy_irqs __read_mostly = NR_IRQS_LEGACY;
101/* GSI interrupts */ 98/* GSI interrupts */
102static int nr_irqs_gsi = NR_IRQS_LEGACY; 99static int nr_irqs_gsi = NR_IRQS_LEGACY;
103 100
@@ -140,49 +137,12 @@ static struct irq_pin_list *get_one_free_irq_2_pin(int node)
140 return pin; 137 return pin;
141} 138}
142 139
143/*
144 * This is performance-critical, we want to do it O(1)
145 *
146 * Most irqs are mapped 1:1 with pins.
147 */
148struct irq_cfg {
149 struct irq_pin_list *irq_2_pin;
150 cpumask_var_t domain;
151 cpumask_var_t old_domain;
152 unsigned move_cleanup_count;
153 u8 vector;
154 u8 move_in_progress : 1;
155};
156
157/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ 140/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
158#ifdef CONFIG_SPARSE_IRQ 141#ifdef CONFIG_SPARSE_IRQ
159static struct irq_cfg irq_cfgx[] = { 142static struct irq_cfg irq_cfgx[NR_IRQS_LEGACY];
160#else 143#else
161static struct irq_cfg irq_cfgx[NR_IRQS] = { 144static struct irq_cfg irq_cfgx[NR_IRQS];
162#endif 145#endif
163 [0] = { .vector = IRQ0_VECTOR, },
164 [1] = { .vector = IRQ1_VECTOR, },
165 [2] = { .vector = IRQ2_VECTOR, },
166 [3] = { .vector = IRQ3_VECTOR, },
167 [4] = { .vector = IRQ4_VECTOR, },
168 [5] = { .vector = IRQ5_VECTOR, },
169 [6] = { .vector = IRQ6_VECTOR, },
170 [7] = { .vector = IRQ7_VECTOR, },
171 [8] = { .vector = IRQ8_VECTOR, },
172 [9] = { .vector = IRQ9_VECTOR, },
173 [10] = { .vector = IRQ10_VECTOR, },
174 [11] = { .vector = IRQ11_VECTOR, },
175 [12] = { .vector = IRQ12_VECTOR, },
176 [13] = { .vector = IRQ13_VECTOR, },
177 [14] = { .vector = IRQ14_VECTOR, },
178 [15] = { .vector = IRQ15_VECTOR, },
179};
180
181void __init io_apic_disable_legacy(void)
182{
183 nr_legacy_irqs = 0;
184 nr_irqs_gsi = 0;
185}
186 146
187int __init arch_early_irq_init(void) 147int __init arch_early_irq_init(void)
188{ 148{
@@ -192,6 +152,11 @@ int __init arch_early_irq_init(void)
192 int node; 152 int node;
193 int i; 153 int i;
194 154
155 if (!legacy_pic->nr_legacy_irqs) {
156 nr_irqs_gsi = 0;
157 io_apic_irqs = ~0UL;
158 }
159
195 cfg = irq_cfgx; 160 cfg = irq_cfgx;
196 count = ARRAY_SIZE(irq_cfgx); 161 count = ARRAY_SIZE(irq_cfgx);
197 node= cpu_to_node(boot_cpu_id); 162 node= cpu_to_node(boot_cpu_id);
@@ -201,15 +166,21 @@ int __init arch_early_irq_init(void)
201 desc->chip_data = &cfg[i]; 166 desc->chip_data = &cfg[i];
202 zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node); 167 zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node);
203 zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node); 168 zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node);
204 if (i < nr_legacy_irqs) 169 /*
205 cpumask_setall(cfg[i].domain); 170 * For legacy IRQ's, start with assigning irq0 to irq15 to
171 * IRQ0_VECTOR to IRQ15_VECTOR on cpu 0.
172 */
173 if (i < legacy_pic->nr_legacy_irqs) {
174 cfg[i].vector = IRQ0_VECTOR + i;
175 cpumask_set_cpu(0, cfg[i].domain);
176 }
206 } 177 }
207 178
208 return 0; 179 return 0;
209} 180}
210 181
211#ifdef CONFIG_SPARSE_IRQ 182#ifdef CONFIG_SPARSE_IRQ
212static struct irq_cfg *irq_cfg(unsigned int irq) 183struct irq_cfg *irq_cfg(unsigned int irq)
213{ 184{
214 struct irq_cfg *cfg = NULL; 185 struct irq_cfg *cfg = NULL;
215 struct irq_desc *desc; 186 struct irq_desc *desc;
@@ -361,7 +332,7 @@ void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc)
361/* end for move_irq_desc */ 332/* end for move_irq_desc */
362 333
363#else 334#else
364static struct irq_cfg *irq_cfg(unsigned int irq) 335struct irq_cfg *irq_cfg(unsigned int irq)
365{ 336{
366 return irq < nr_irqs ? irq_cfgx + irq : NULL; 337 return irq < nr_irqs ? irq_cfgx + irq : NULL;
367} 338}
@@ -422,7 +393,7 @@ static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
422 struct irq_pin_list *entry; 393 struct irq_pin_list *entry;
423 unsigned long flags; 394 unsigned long flags;
424 395
425 spin_lock_irqsave(&ioapic_lock, flags); 396 raw_spin_lock_irqsave(&ioapic_lock, flags);
426 for_each_irq_pin(entry, cfg->irq_2_pin) { 397 for_each_irq_pin(entry, cfg->irq_2_pin) {
427 unsigned int reg; 398 unsigned int reg;
428 int pin; 399 int pin;
@@ -431,11 +402,11 @@ static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
431 reg = io_apic_read(entry->apic, 0x10 + pin*2); 402 reg = io_apic_read(entry->apic, 0x10 + pin*2);
432 /* Is the remote IRR bit set? */ 403 /* Is the remote IRR bit set? */
433 if (reg & IO_APIC_REDIR_REMOTE_IRR) { 404 if (reg & IO_APIC_REDIR_REMOTE_IRR) {
434 spin_unlock_irqrestore(&ioapic_lock, flags); 405 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
435 return true; 406 return true;
436 } 407 }
437 } 408 }
438 spin_unlock_irqrestore(&ioapic_lock, flags); 409 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
439 410
440 return false; 411 return false;
441} 412}
@@ -449,10 +420,10 @@ static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
449{ 420{
450 union entry_union eu; 421 union entry_union eu;
451 unsigned long flags; 422 unsigned long flags;
452 spin_lock_irqsave(&ioapic_lock, flags); 423 raw_spin_lock_irqsave(&ioapic_lock, flags);
453 eu.w1 = io_apic_read(apic, 0x10 + 2 * pin); 424 eu.w1 = io_apic_read(apic, 0x10 + 2 * pin);
454 eu.w2 = io_apic_read(apic, 0x11 + 2 * pin); 425 eu.w2 = io_apic_read(apic, 0x11 + 2 * pin);
455 spin_unlock_irqrestore(&ioapic_lock, flags); 426 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
456 return eu.entry; 427 return eu.entry;
457} 428}
458 429
@@ -475,9 +446,9 @@ __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
475void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) 446void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
476{ 447{
477 unsigned long flags; 448 unsigned long flags;
478 spin_lock_irqsave(&ioapic_lock, flags); 449 raw_spin_lock_irqsave(&ioapic_lock, flags);
479 __ioapic_write_entry(apic, pin, e); 450 __ioapic_write_entry(apic, pin, e);
480 spin_unlock_irqrestore(&ioapic_lock, flags); 451 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
481} 452}
482 453
483/* 454/*
@@ -490,10 +461,10 @@ static void ioapic_mask_entry(int apic, int pin)
490 unsigned long flags; 461 unsigned long flags;
491 union entry_union eu = { .entry.mask = 1 }; 462 union entry_union eu = { .entry.mask = 1 };
492 463
493 spin_lock_irqsave(&ioapic_lock, flags); 464 raw_spin_lock_irqsave(&ioapic_lock, flags);
494 io_apic_write(apic, 0x10 + 2*pin, eu.w1); 465 io_apic_write(apic, 0x10 + 2*pin, eu.w1);
495 io_apic_write(apic, 0x11 + 2*pin, eu.w2); 466 io_apic_write(apic, 0x11 + 2*pin, eu.w2);
496 spin_unlock_irqrestore(&ioapic_lock, flags); 467 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
497} 468}
498 469
499/* 470/*
@@ -555,23 +526,41 @@ static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node,
555 add_pin_to_irq_node(cfg, node, newapic, newpin); 526 add_pin_to_irq_node(cfg, node, newapic, newpin);
556} 527}
557 528
529static void __io_apic_modify_irq(struct irq_pin_list *entry,
530 int mask_and, int mask_or,
531 void (*final)(struct irq_pin_list *entry))
532{
533 unsigned int reg, pin;
534
535 pin = entry->pin;
536 reg = io_apic_read(entry->apic, 0x10 + pin * 2);
537 reg &= mask_and;
538 reg |= mask_or;
539 io_apic_modify(entry->apic, 0x10 + pin * 2, reg);
540 if (final)
541 final(entry);
542}
543
558static void io_apic_modify_irq(struct irq_cfg *cfg, 544static void io_apic_modify_irq(struct irq_cfg *cfg,
559 int mask_and, int mask_or, 545 int mask_and, int mask_or,
560 void (*final)(struct irq_pin_list *entry)) 546 void (*final)(struct irq_pin_list *entry))
561{ 547{
562 int pin;
563 struct irq_pin_list *entry; 548 struct irq_pin_list *entry;
564 549
565 for_each_irq_pin(entry, cfg->irq_2_pin) { 550 for_each_irq_pin(entry, cfg->irq_2_pin)
566 unsigned int reg; 551 __io_apic_modify_irq(entry, mask_and, mask_or, final);
567 pin = entry->pin; 552}
568 reg = io_apic_read(entry->apic, 0x10 + pin * 2); 553
569 reg &= mask_and; 554static void __mask_and_edge_IO_APIC_irq(struct irq_pin_list *entry)
570 reg |= mask_or; 555{
571 io_apic_modify(entry->apic, 0x10 + pin * 2, reg); 556 __io_apic_modify_irq(entry, ~IO_APIC_REDIR_LEVEL_TRIGGER,
572 if (final) 557 IO_APIC_REDIR_MASKED, NULL);
573 final(entry); 558}
574 } 559
560static void __unmask_and_level_IO_APIC_irq(struct irq_pin_list *entry)
561{
562 __io_apic_modify_irq(entry, ~IO_APIC_REDIR_MASKED,
563 IO_APIC_REDIR_LEVEL_TRIGGER, NULL);
575} 564}
576 565
577static void __unmask_IO_APIC_irq(struct irq_cfg *cfg) 566static void __unmask_IO_APIC_irq(struct irq_cfg *cfg)
@@ -595,18 +584,6 @@ static void __mask_IO_APIC_irq(struct irq_cfg *cfg)
595 io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); 584 io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
596} 585}
597 586
598static void __mask_and_edge_IO_APIC_irq(struct irq_cfg *cfg)
599{
600 io_apic_modify_irq(cfg, ~IO_APIC_REDIR_LEVEL_TRIGGER,
601 IO_APIC_REDIR_MASKED, NULL);
602}
603
604static void __unmask_and_level_IO_APIC_irq(struct irq_cfg *cfg)
605{
606 io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED,
607 IO_APIC_REDIR_LEVEL_TRIGGER, NULL);
608}
609
610static void mask_IO_APIC_irq_desc(struct irq_desc *desc) 587static void mask_IO_APIC_irq_desc(struct irq_desc *desc)
611{ 588{
612 struct irq_cfg *cfg = desc->chip_data; 589 struct irq_cfg *cfg = desc->chip_data;
@@ -614,9 +591,9 @@ static void mask_IO_APIC_irq_desc(struct irq_desc *desc)
614 591
615 BUG_ON(!cfg); 592 BUG_ON(!cfg);
616 593
617 spin_lock_irqsave(&ioapic_lock, flags); 594 raw_spin_lock_irqsave(&ioapic_lock, flags);
618 __mask_IO_APIC_irq(cfg); 595 __mask_IO_APIC_irq(cfg);
619 spin_unlock_irqrestore(&ioapic_lock, flags); 596 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
620} 597}
621 598
622static void unmask_IO_APIC_irq_desc(struct irq_desc *desc) 599static void unmask_IO_APIC_irq_desc(struct irq_desc *desc)
@@ -624,9 +601,9 @@ static void unmask_IO_APIC_irq_desc(struct irq_desc *desc)
624 struct irq_cfg *cfg = desc->chip_data; 601 struct irq_cfg *cfg = desc->chip_data;
625 unsigned long flags; 602 unsigned long flags;
626 603
627 spin_lock_irqsave(&ioapic_lock, flags); 604 raw_spin_lock_irqsave(&ioapic_lock, flags);
628 __unmask_IO_APIC_irq(cfg); 605 __unmask_IO_APIC_irq(cfg);
629 spin_unlock_irqrestore(&ioapic_lock, flags); 606 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
630} 607}
631 608
632static void mask_IO_APIC_irq(unsigned int irq) 609static void mask_IO_APIC_irq(unsigned int irq)
@@ -875,7 +852,7 @@ static int __init find_isa_irq_apic(int irq, int type)
875 */ 852 */
876static int EISA_ELCR(unsigned int irq) 853static int EISA_ELCR(unsigned int irq)
877{ 854{
878 if (irq < nr_legacy_irqs) { 855 if (irq < legacy_pic->nr_legacy_irqs) {
879 unsigned int port = 0x4d0 + (irq >> 3); 856 unsigned int port = 0x4d0 + (irq >> 3);
880 return (inb(port) >> (irq & 7)) & 1; 857 return (inb(port) >> (irq & 7)) & 1;
881 } 858 }
@@ -1150,12 +1127,12 @@ void lock_vector_lock(void)
1150 /* Used to the online set of cpus does not change 1127 /* Used to the online set of cpus does not change
1151 * during assign_irq_vector. 1128 * during assign_irq_vector.
1152 */ 1129 */
1153 spin_lock(&vector_lock); 1130 raw_spin_lock(&vector_lock);
1154} 1131}
1155 1132
1156void unlock_vector_lock(void) 1133void unlock_vector_lock(void)
1157{ 1134{
1158 spin_unlock(&vector_lock); 1135 raw_spin_unlock(&vector_lock);
1159} 1136}
1160 1137
1161static int 1138static int
@@ -1172,12 +1149,13 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
1172 * Also, we've got to be careful not to trash gate 1149 * Also, we've got to be careful not to trash gate
1173 * 0x80, because int 0x80 is hm, kind of importantish. ;) 1150 * 0x80, because int 0x80 is hm, kind of importantish. ;)
1174 */ 1151 */
1175 static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0; 1152 static int current_vector = FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START;
1153 static int current_offset = VECTOR_OFFSET_START % 8;
1176 unsigned int old_vector; 1154 unsigned int old_vector;
1177 int cpu, err; 1155 int cpu, err;
1178 cpumask_var_t tmp_mask; 1156 cpumask_var_t tmp_mask;
1179 1157
1180 if ((cfg->move_in_progress) || cfg->move_cleanup_count) 1158 if (cfg->move_in_progress)
1181 return -EBUSY; 1159 return -EBUSY;
1182 1160
1183 if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC)) 1161 if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC))
@@ -1208,7 +1186,7 @@ next:
1208 if (vector >= first_system_vector) { 1186 if (vector >= first_system_vector) {
1209 /* If out of vectors on large boxen, must share them. */ 1187 /* If out of vectors on large boxen, must share them. */
1210 offset = (offset + 1) % 8; 1188 offset = (offset + 1) % 8;
1211 vector = FIRST_DEVICE_VECTOR + offset; 1189 vector = FIRST_EXTERNAL_VECTOR + offset;
1212 } 1190 }
1213 if (unlikely(current_vector == vector)) 1191 if (unlikely(current_vector == vector))
1214 continue; 1192 continue;
@@ -1237,15 +1215,14 @@ next:
1237 return err; 1215 return err;
1238} 1216}
1239 1217
1240static int 1218int assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
1241assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
1242{ 1219{
1243 int err; 1220 int err;
1244 unsigned long flags; 1221 unsigned long flags;
1245 1222
1246 spin_lock_irqsave(&vector_lock, flags); 1223 raw_spin_lock_irqsave(&vector_lock, flags);
1247 err = __assign_irq_vector(irq, cfg, mask); 1224 err = __assign_irq_vector(irq, cfg, mask);
1248 spin_unlock_irqrestore(&vector_lock, flags); 1225 raw_spin_unlock_irqrestore(&vector_lock, flags);
1249 return err; 1226 return err;
1250} 1227}
1251 1228
@@ -1279,14 +1256,27 @@ static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
1279void __setup_vector_irq(int cpu) 1256void __setup_vector_irq(int cpu)
1280{ 1257{
1281 /* Initialize vector_irq on a new cpu */ 1258 /* Initialize vector_irq on a new cpu */
1282 /* This function must be called with vector_lock held */
1283 int irq, vector; 1259 int irq, vector;
1284 struct irq_cfg *cfg; 1260 struct irq_cfg *cfg;
1285 struct irq_desc *desc; 1261 struct irq_desc *desc;
1286 1262
1263 /*
1264 * vector_lock will make sure that we don't run into irq vector
1265 * assignments that might be happening on another cpu in parallel,
1266 * while we setup our initial vector to irq mappings.
1267 */
1268 raw_spin_lock(&vector_lock);
1287 /* Mark the inuse vectors */ 1269 /* Mark the inuse vectors */
1288 for_each_irq_desc(irq, desc) { 1270 for_each_irq_desc(irq, desc) {
1289 cfg = desc->chip_data; 1271 cfg = desc->chip_data;
1272
1273 /*
1274 * If it is a legacy IRQ handled by the legacy PIC, this cpu
1275 * will be part of the irq_cfg's domain.
1276 */
1277 if (irq < legacy_pic->nr_legacy_irqs && !IO_APIC_IRQ(irq))
1278 cpumask_set_cpu(cpu, cfg->domain);
1279
1290 if (!cpumask_test_cpu(cpu, cfg->domain)) 1280 if (!cpumask_test_cpu(cpu, cfg->domain))
1291 continue; 1281 continue;
1292 vector = cfg->vector; 1282 vector = cfg->vector;
@@ -1302,6 +1292,7 @@ void __setup_vector_irq(int cpu)
1302 if (!cpumask_test_cpu(cpu, cfg->domain)) 1292 if (!cpumask_test_cpu(cpu, cfg->domain))
1303 per_cpu(vector_irq, cpu)[vector] = -1; 1293 per_cpu(vector_irq, cpu)[vector] = -1;
1304 } 1294 }
1295 raw_spin_unlock(&vector_lock);
1305} 1296}
1306 1297
1307static struct irq_chip ioapic_chip; 1298static struct irq_chip ioapic_chip;
@@ -1451,6 +1442,14 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq
1451 1442
1452 cfg = desc->chip_data; 1443 cfg = desc->chip_data;
1453 1444
1445 /*
1446 * For legacy irqs, cfg->domain starts with cpu 0 for legacy
1447 * controllers like 8259. Now that IO-APIC can handle this irq, update
1448 * the cfg->domain.
1449 */
1450 if (irq < legacy_pic->nr_legacy_irqs && cpumask_test_cpu(0, cfg->domain))
1451 apic->vector_allocation_domain(0, cfg->domain);
1452
1454 if (assign_irq_vector(irq, cfg, apic->target_cpus())) 1453 if (assign_irq_vector(irq, cfg, apic->target_cpus()))
1455 return; 1454 return;
1456 1455
@@ -1472,8 +1471,8 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq
1472 } 1471 }
1473 1472
1474 ioapic_register_intr(irq, desc, trigger); 1473 ioapic_register_intr(irq, desc, trigger);
1475 if (irq < nr_legacy_irqs) 1474 if (irq < legacy_pic->nr_legacy_irqs)
1476 disable_8259A_irq(irq); 1475 legacy_pic->chip->mask(irq);
1477 1476
1478 ioapic_write_entry(apic_id, pin, entry); 1477 ioapic_write_entry(apic_id, pin, entry);
1479} 1478}
@@ -1484,7 +1483,7 @@ static struct {
1484 1483
1485static void __init setup_IO_APIC_irqs(void) 1484static void __init setup_IO_APIC_irqs(void)
1486{ 1485{
1487 int apic_id = 0, pin, idx, irq; 1486 int apic_id, pin, idx, irq;
1488 int notcon = 0; 1487 int notcon = 0;
1489 struct irq_desc *desc; 1488 struct irq_desc *desc;
1490 struct irq_cfg *cfg; 1489 struct irq_cfg *cfg;
@@ -1492,14 +1491,7 @@ static void __init setup_IO_APIC_irqs(void)
1492 1491
1493 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); 1492 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
1494 1493
1495#ifdef CONFIG_ACPI 1494 for (apic_id = 0; apic_id < nr_ioapics; apic_id++)
1496 if (!acpi_disabled && acpi_ioapic) {
1497 apic_id = mp_find_ioapic(0);
1498 if (apic_id < 0)
1499 apic_id = 0;
1500 }
1501#endif
1502
1503 for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) { 1495 for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) {
1504 idx = find_irq_entry(apic_id, pin, mp_INT); 1496 idx = find_irq_entry(apic_id, pin, mp_INT);
1505 if (idx == -1) { 1497 if (idx == -1) {
@@ -1521,6 +1513,9 @@ static void __init setup_IO_APIC_irqs(void)
1521 1513
1522 irq = pin_2_irq(idx, apic_id, pin); 1514 irq = pin_2_irq(idx, apic_id, pin);
1523 1515
1516 if ((apic_id > 0) && (irq > 16))
1517 continue;
1518
1524 /* 1519 /*
1525 * Skip the timer IRQ if there's a quirk handler 1520 * Skip the timer IRQ if there's a quirk handler
1526 * installed and if it returns 1: 1521 * installed and if it returns 1:
@@ -1550,6 +1545,56 @@ static void __init setup_IO_APIC_irqs(void)
1550} 1545}
1551 1546
1552/* 1547/*
1548 * for the gsit that is not in first ioapic
1549 * but could not use acpi_register_gsi()
1550 * like some special sci in IBM x3330
1551 */
1552void setup_IO_APIC_irq_extra(u32 gsi)
1553{
1554 int apic_id = 0, pin, idx, irq;
1555 int node = cpu_to_node(boot_cpu_id);
1556 struct irq_desc *desc;
1557 struct irq_cfg *cfg;
1558
1559 /*
1560 * Convert 'gsi' to 'ioapic.pin'.
1561 */
1562 apic_id = mp_find_ioapic(gsi);
1563 if (apic_id < 0)
1564 return;
1565
1566 pin = mp_find_ioapic_pin(apic_id, gsi);
1567 idx = find_irq_entry(apic_id, pin, mp_INT);
1568 if (idx == -1)
1569 return;
1570
1571 irq = pin_2_irq(idx, apic_id, pin);
1572#ifdef CONFIG_SPARSE_IRQ
1573 desc = irq_to_desc(irq);
1574 if (desc)
1575 return;
1576#endif
1577 desc = irq_to_desc_alloc_node(irq, node);
1578 if (!desc) {
1579 printk(KERN_INFO "can not get irq_desc for %d\n", irq);
1580 return;
1581 }
1582
1583 cfg = desc->chip_data;
1584 add_pin_to_irq_node(cfg, node, apic_id, pin);
1585
1586 if (test_bit(pin, mp_ioapic_routing[apic_id].pin_programmed)) {
1587 pr_debug("Pin %d-%d already programmed\n",
1588 mp_ioapics[apic_id].apicid, pin);
1589 return;
1590 }
1591 set_bit(pin, mp_ioapic_routing[apic_id].pin_programmed);
1592
1593 setup_IO_APIC_irq(apic_id, pin, irq, desc,
1594 irq_trigger(idx), irq_polarity(idx));
1595}
1596
1597/*
1553 * Set up the timer pin, possibly with the 8259A-master behind. 1598 * Set up the timer pin, possibly with the 8259A-master behind.
1554 */ 1599 */
1555static void __init setup_timer_IRQ0_pin(unsigned int apic_id, unsigned int pin, 1600static void __init setup_timer_IRQ0_pin(unsigned int apic_id, unsigned int pin,
@@ -1599,9 +1644,6 @@ __apicdebuginit(void) print_IO_APIC(void)
1599 struct irq_desc *desc; 1644 struct irq_desc *desc;
1600 unsigned int irq; 1645 unsigned int irq;
1601 1646
1602 if (apic_verbosity == APIC_QUIET)
1603 return;
1604
1605 printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); 1647 printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
1606 for (i = 0; i < nr_ioapics; i++) 1648 for (i = 0; i < nr_ioapics; i++)
1607 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", 1649 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
@@ -1615,14 +1657,14 @@ __apicdebuginit(void) print_IO_APIC(void)
1615 1657
1616 for (apic = 0; apic < nr_ioapics; apic++) { 1658 for (apic = 0; apic < nr_ioapics; apic++) {
1617 1659
1618 spin_lock_irqsave(&ioapic_lock, flags); 1660 raw_spin_lock_irqsave(&ioapic_lock, flags);
1619 reg_00.raw = io_apic_read(apic, 0); 1661 reg_00.raw = io_apic_read(apic, 0);
1620 reg_01.raw = io_apic_read(apic, 1); 1662 reg_01.raw = io_apic_read(apic, 1);
1621 if (reg_01.bits.version >= 0x10) 1663 if (reg_01.bits.version >= 0x10)
1622 reg_02.raw = io_apic_read(apic, 2); 1664 reg_02.raw = io_apic_read(apic, 2);
1623 if (reg_01.bits.version >= 0x20) 1665 if (reg_01.bits.version >= 0x20)
1624 reg_03.raw = io_apic_read(apic, 3); 1666 reg_03.raw = io_apic_read(apic, 3);
1625 spin_unlock_irqrestore(&ioapic_lock, flags); 1667 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
1626 1668
1627 printk("\n"); 1669 printk("\n");
1628 printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].apicid); 1670 printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].apicid);
@@ -1661,7 +1703,7 @@ __apicdebuginit(void) print_IO_APIC(void)
1661 printk(KERN_DEBUG ".... IRQ redirection table:\n"); 1703 printk(KERN_DEBUG ".... IRQ redirection table:\n");
1662 1704
1663 printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol" 1705 printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol"
1664 " Stat Dmod Deli Vect: \n"); 1706 " Stat Dmod Deli Vect:\n");
1665 1707
1666 for (i = 0; i <= reg_01.bits.entries; i++) { 1708 for (i = 0; i <= reg_01.bits.entries; i++) {
1667 struct IO_APIC_route_entry entry; 1709 struct IO_APIC_route_entry entry;
@@ -1708,9 +1750,6 @@ __apicdebuginit(void) print_APIC_field(int base)
1708{ 1750{
1709 int i; 1751 int i;
1710 1752
1711 if (apic_verbosity == APIC_QUIET)
1712 return;
1713
1714 printk(KERN_DEBUG); 1753 printk(KERN_DEBUG);
1715 1754
1716 for (i = 0; i < 8; i++) 1755 for (i = 0; i < 8; i++)
@@ -1724,9 +1763,6 @@ __apicdebuginit(void) print_local_APIC(void *dummy)
1724 unsigned int i, v, ver, maxlvt; 1763 unsigned int i, v, ver, maxlvt;
1725 u64 icr; 1764 u64 icr;
1726 1765
1727 if (apic_verbosity == APIC_QUIET)
1728 return;
1729
1730 printk(KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", 1766 printk(KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
1731 smp_processor_id(), hard_smp_processor_id()); 1767 smp_processor_id(), hard_smp_processor_id());
1732 v = apic_read(APIC_ID); 1768 v = apic_read(APIC_ID);
@@ -1824,13 +1860,19 @@ __apicdebuginit(void) print_local_APIC(void *dummy)
1824 printk("\n"); 1860 printk("\n");
1825} 1861}
1826 1862
1827__apicdebuginit(void) print_all_local_APICs(void) 1863__apicdebuginit(void) print_local_APICs(int maxcpu)
1828{ 1864{
1829 int cpu; 1865 int cpu;
1830 1866
1867 if (!maxcpu)
1868 return;
1869
1831 preempt_disable(); 1870 preempt_disable();
1832 for_each_online_cpu(cpu) 1871 for_each_online_cpu(cpu) {
1872 if (cpu >= maxcpu)
1873 break;
1833 smp_call_function_single(cpu, print_local_APIC, NULL, 1); 1874 smp_call_function_single(cpu, print_local_APIC, NULL, 1);
1875 }
1834 preempt_enable(); 1876 preempt_enable();
1835} 1877}
1836 1878
@@ -1839,12 +1881,12 @@ __apicdebuginit(void) print_PIC(void)
1839 unsigned int v; 1881 unsigned int v;
1840 unsigned long flags; 1882 unsigned long flags;
1841 1883
1842 if (apic_verbosity == APIC_QUIET || !nr_legacy_irqs) 1884 if (!legacy_pic->nr_legacy_irqs)
1843 return; 1885 return;
1844 1886
1845 printk(KERN_DEBUG "\nprinting PIC contents\n"); 1887 printk(KERN_DEBUG "\nprinting PIC contents\n");
1846 1888
1847 spin_lock_irqsave(&i8259A_lock, flags); 1889 raw_spin_lock_irqsave(&i8259A_lock, flags);
1848 1890
1849 v = inb(0xa1) << 8 | inb(0x21); 1891 v = inb(0xa1) << 8 | inb(0x21);
1850 printk(KERN_DEBUG "... PIC IMR: %04x\n", v); 1892 printk(KERN_DEBUG "... PIC IMR: %04x\n", v);
@@ -1858,7 +1900,7 @@ __apicdebuginit(void) print_PIC(void)
1858 outb(0x0a,0xa0); 1900 outb(0x0a,0xa0);
1859 outb(0x0a,0x20); 1901 outb(0x0a,0x20);
1860 1902
1861 spin_unlock_irqrestore(&i8259A_lock, flags); 1903 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
1862 1904
1863 printk(KERN_DEBUG "... PIC ISR: %04x\n", v); 1905 printk(KERN_DEBUG "... PIC ISR: %04x\n", v);
1864 1906
@@ -1866,21 +1908,41 @@ __apicdebuginit(void) print_PIC(void)
1866 printk(KERN_DEBUG "... PIC ELCR: %04x\n", v); 1908 printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
1867} 1909}
1868 1910
1869__apicdebuginit(int) print_all_ICs(void) 1911static int __initdata show_lapic = 1;
1912static __init int setup_show_lapic(char *arg)
1913{
1914 int num = -1;
1915
1916 if (strcmp(arg, "all") == 0) {
1917 show_lapic = CONFIG_NR_CPUS;
1918 } else {
1919 get_option(&arg, &num);
1920 if (num >= 0)
1921 show_lapic = num;
1922 }
1923
1924 return 1;
1925}
1926__setup("show_lapic=", setup_show_lapic);
1927
1928__apicdebuginit(int) print_ICs(void)
1870{ 1929{
1930 if (apic_verbosity == APIC_QUIET)
1931 return 0;
1932
1871 print_PIC(); 1933 print_PIC();
1872 1934
1873 /* don't print out if apic is not there */ 1935 /* don't print out if apic is not there */
1874 if (!cpu_has_apic && !apic_from_smp_config()) 1936 if (!cpu_has_apic && !apic_from_smp_config())
1875 return 0; 1937 return 0;
1876 1938
1877 print_all_local_APICs(); 1939 print_local_APICs(show_lapic);
1878 print_IO_APIC(); 1940 print_IO_APIC();
1879 1941
1880 return 0; 1942 return 0;
1881} 1943}
1882 1944
1883fs_initcall(print_all_ICs); 1945fs_initcall(print_ICs);
1884 1946
1885 1947
1886/* Where if anywhere is the i8259 connect in external int mode */ 1948/* Where if anywhere is the i8259 connect in external int mode */
@@ -1897,13 +1959,13 @@ void __init enable_IO_APIC(void)
1897 * The number of IO-APIC IRQ registers (== #pins): 1959 * The number of IO-APIC IRQ registers (== #pins):
1898 */ 1960 */
1899 for (apic = 0; apic < nr_ioapics; apic++) { 1961 for (apic = 0; apic < nr_ioapics; apic++) {
1900 spin_lock_irqsave(&ioapic_lock, flags); 1962 raw_spin_lock_irqsave(&ioapic_lock, flags);
1901 reg_01.raw = io_apic_read(apic, 1); 1963 reg_01.raw = io_apic_read(apic, 1);
1902 spin_unlock_irqrestore(&ioapic_lock, flags); 1964 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
1903 nr_ioapic_registers[apic] = reg_01.bits.entries+1; 1965 nr_ioapic_registers[apic] = reg_01.bits.entries+1;
1904 } 1966 }
1905 1967
1906 if (!nr_legacy_irqs) 1968 if (!legacy_pic->nr_legacy_irqs)
1907 return; 1969 return;
1908 1970
1909 for(apic = 0; apic < nr_ioapics; apic++) { 1971 for(apic = 0; apic < nr_ioapics; apic++) {
@@ -1960,7 +2022,7 @@ void disable_IO_APIC(void)
1960 */ 2022 */
1961 clear_IO_APIC(); 2023 clear_IO_APIC();
1962 2024
1963 if (!nr_legacy_irqs) 2025 if (!legacy_pic->nr_legacy_irqs)
1964 return; 2026 return;
1965 2027
1966 /* 2028 /*
@@ -2031,7 +2093,7 @@ void __init setup_ioapic_ids_from_mpc(void)
2031 * This is broken; anything with a real cpu count has to 2093 * This is broken; anything with a real cpu count has to
2032 * circumvent this idiocy regardless. 2094 * circumvent this idiocy regardless.
2033 */ 2095 */
2034 phys_id_present_map = apic->ioapic_phys_id_map(phys_cpu_present_map); 2096 apic->ioapic_phys_id_map(&phys_cpu_present_map, &phys_id_present_map);
2035 2097
2036 /* 2098 /*
2037 * Set the IOAPIC ID to the value stored in the MPC table. 2099 * Set the IOAPIC ID to the value stored in the MPC table.
@@ -2039,9 +2101,9 @@ void __init setup_ioapic_ids_from_mpc(void)
2039 for (apic_id = 0; apic_id < nr_ioapics; apic_id++) { 2101 for (apic_id = 0; apic_id < nr_ioapics; apic_id++) {
2040 2102
2041 /* Read the register 0 value */ 2103 /* Read the register 0 value */
2042 spin_lock_irqsave(&ioapic_lock, flags); 2104 raw_spin_lock_irqsave(&ioapic_lock, flags);
2043 reg_00.raw = io_apic_read(apic_id, 0); 2105 reg_00.raw = io_apic_read(apic_id, 0);
2044 spin_unlock_irqrestore(&ioapic_lock, flags); 2106 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2045 2107
2046 old_id = mp_ioapics[apic_id].apicid; 2108 old_id = mp_ioapics[apic_id].apicid;
2047 2109
@@ -2058,7 +2120,7 @@ void __init setup_ioapic_ids_from_mpc(void)
2058 * system must have a unique ID or we get lots of nice 2120 * system must have a unique ID or we get lots of nice
2059 * 'stuck on smp_invalidate_needed IPI wait' messages. 2121 * 'stuck on smp_invalidate_needed IPI wait' messages.
2060 */ 2122 */
2061 if (apic->check_apicid_used(phys_id_present_map, 2123 if (apic->check_apicid_used(&phys_id_present_map,
2062 mp_ioapics[apic_id].apicid)) { 2124 mp_ioapics[apic_id].apicid)) {
2063 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", 2125 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
2064 apic_id, mp_ioapics[apic_id].apicid); 2126 apic_id, mp_ioapics[apic_id].apicid);
@@ -2073,7 +2135,7 @@ void __init setup_ioapic_ids_from_mpc(void)
2073 mp_ioapics[apic_id].apicid = i; 2135 mp_ioapics[apic_id].apicid = i;
2074 } else { 2136 } else {
2075 physid_mask_t tmp; 2137 physid_mask_t tmp;
2076 tmp = apic->apicid_to_cpu_present(mp_ioapics[apic_id].apicid); 2138 apic->apicid_to_cpu_present(mp_ioapics[apic_id].apicid, &tmp);
2077 apic_printk(APIC_VERBOSE, "Setting %d in the " 2139 apic_printk(APIC_VERBOSE, "Setting %d in the "
2078 "phys_id_present_map\n", 2140 "phys_id_present_map\n",
2079 mp_ioapics[apic_id].apicid); 2141 mp_ioapics[apic_id].apicid);
@@ -2100,16 +2162,16 @@ void __init setup_ioapic_ids_from_mpc(void)
2100 mp_ioapics[apic_id].apicid); 2162 mp_ioapics[apic_id].apicid);
2101 2163
2102 reg_00.bits.ID = mp_ioapics[apic_id].apicid; 2164 reg_00.bits.ID = mp_ioapics[apic_id].apicid;
2103 spin_lock_irqsave(&ioapic_lock, flags); 2165 raw_spin_lock_irqsave(&ioapic_lock, flags);
2104 io_apic_write(apic_id, 0, reg_00.raw); 2166 io_apic_write(apic_id, 0, reg_00.raw);
2105 spin_unlock_irqrestore(&ioapic_lock, flags); 2167 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2106 2168
2107 /* 2169 /*
2108 * Sanity check 2170 * Sanity check
2109 */ 2171 */
2110 spin_lock_irqsave(&ioapic_lock, flags); 2172 raw_spin_lock_irqsave(&ioapic_lock, flags);
2111 reg_00.raw = io_apic_read(apic_id, 0); 2173 reg_00.raw = io_apic_read(apic_id, 0);
2112 spin_unlock_irqrestore(&ioapic_lock, flags); 2174 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2113 if (reg_00.bits.ID != mp_ioapics[apic_id].apicid) 2175 if (reg_00.bits.ID != mp_ioapics[apic_id].apicid)
2114 printk("could not set ID!\n"); 2176 printk("could not set ID!\n");
2115 else 2177 else
@@ -2192,15 +2254,15 @@ static unsigned int startup_ioapic_irq(unsigned int irq)
2192 unsigned long flags; 2254 unsigned long flags;
2193 struct irq_cfg *cfg; 2255 struct irq_cfg *cfg;
2194 2256
2195 spin_lock_irqsave(&ioapic_lock, flags); 2257 raw_spin_lock_irqsave(&ioapic_lock, flags);
2196 if (irq < nr_legacy_irqs) { 2258 if (irq < legacy_pic->nr_legacy_irqs) {
2197 disable_8259A_irq(irq); 2259 legacy_pic->chip->mask(irq);
2198 if (i8259A_irq_pending(irq)) 2260 if (legacy_pic->irq_pending(irq))
2199 was_pending = 1; 2261 was_pending = 1;
2200 } 2262 }
2201 cfg = irq_cfg(irq); 2263 cfg = irq_cfg(irq);
2202 __unmask_IO_APIC_irq(cfg); 2264 __unmask_IO_APIC_irq(cfg);
2203 spin_unlock_irqrestore(&ioapic_lock, flags); 2265 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2204 2266
2205 return was_pending; 2267 return was_pending;
2206} 2268}
@@ -2211,9 +2273,9 @@ static int ioapic_retrigger_irq(unsigned int irq)
2211 struct irq_cfg *cfg = irq_cfg(irq); 2273 struct irq_cfg *cfg = irq_cfg(irq);
2212 unsigned long flags; 2274 unsigned long flags;
2213 2275
2214 spin_lock_irqsave(&vector_lock, flags); 2276 raw_spin_lock_irqsave(&vector_lock, flags);
2215 apic->send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector); 2277 apic->send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector);
2216 spin_unlock_irqrestore(&vector_lock, flags); 2278 raw_spin_unlock_irqrestore(&vector_lock, flags);
2217 2279
2218 return 1; 2280 return 1;
2219} 2281}
@@ -2228,20 +2290,16 @@ static int ioapic_retrigger_irq(unsigned int irq)
2228 */ 2290 */
2229 2291
2230#ifdef CONFIG_SMP 2292#ifdef CONFIG_SMP
2231static void send_cleanup_vector(struct irq_cfg *cfg) 2293void send_cleanup_vector(struct irq_cfg *cfg)
2232{ 2294{
2233 cpumask_var_t cleanup_mask; 2295 cpumask_var_t cleanup_mask;
2234 2296
2235 if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) { 2297 if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) {
2236 unsigned int i; 2298 unsigned int i;
2237 cfg->move_cleanup_count = 0;
2238 for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
2239 cfg->move_cleanup_count++;
2240 for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) 2299 for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
2241 apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR); 2300 apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR);
2242 } else { 2301 } else {
2243 cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask); 2302 cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask);
2244 cfg->move_cleanup_count = cpumask_weight(cleanup_mask);
2245 apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); 2303 apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
2246 free_cpumask_var(cleanup_mask); 2304 free_cpumask_var(cleanup_mask);
2247 } 2305 }
@@ -2272,31 +2330,30 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq
2272 } 2330 }
2273} 2331}
2274 2332
2275static int
2276assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask);
2277
2278/* 2333/*
2279 * Either sets desc->affinity to a valid value, and returns 2334 * Either sets desc->affinity to a valid value, and returns
2280 * ->cpu_mask_to_apicid of that, or returns BAD_APICID and 2335 * ->cpu_mask_to_apicid of that in dest_id, or returns -1 and
2281 * leaves desc->affinity untouched. 2336 * leaves desc->affinity untouched.
2282 */ 2337 */
2283static unsigned int 2338unsigned int
2284set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask) 2339set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask,
2340 unsigned int *dest_id)
2285{ 2341{
2286 struct irq_cfg *cfg; 2342 struct irq_cfg *cfg;
2287 unsigned int irq; 2343 unsigned int irq;
2288 2344
2289 if (!cpumask_intersects(mask, cpu_online_mask)) 2345 if (!cpumask_intersects(mask, cpu_online_mask))
2290 return BAD_APICID; 2346 return -1;
2291 2347
2292 irq = desc->irq; 2348 irq = desc->irq;
2293 cfg = desc->chip_data; 2349 cfg = desc->chip_data;
2294 if (assign_irq_vector(irq, cfg, mask)) 2350 if (assign_irq_vector(irq, cfg, mask))
2295 return BAD_APICID; 2351 return -1;
2296 2352
2297 cpumask_copy(desc->affinity, mask); 2353 cpumask_copy(desc->affinity, mask);
2298 2354
2299 return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain); 2355 *dest_id = apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain);
2356 return 0;
2300} 2357}
2301 2358
2302static int 2359static int
@@ -2311,15 +2368,14 @@ set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
2311 irq = desc->irq; 2368 irq = desc->irq;
2312 cfg = desc->chip_data; 2369 cfg = desc->chip_data;
2313 2370
2314 spin_lock_irqsave(&ioapic_lock, flags); 2371 raw_spin_lock_irqsave(&ioapic_lock, flags);
2315 dest = set_desc_affinity(desc, mask); 2372 ret = set_desc_affinity(desc, mask, &dest);
2316 if (dest != BAD_APICID) { 2373 if (!ret) {
2317 /* Only the high 8 bits are valid. */ 2374 /* Only the high 8 bits are valid. */
2318 dest = SET_APIC_LOGICAL_ID(dest); 2375 dest = SET_APIC_LOGICAL_ID(dest);
2319 __target_IO_APIC_irq(irq, dest, cfg); 2376 __target_IO_APIC_irq(irq, dest, cfg);
2320 ret = 0;
2321 } 2377 }
2322 spin_unlock_irqrestore(&ioapic_lock, flags); 2378 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2323 2379
2324 return ret; 2380 return ret;
2325} 2381}
@@ -2432,8 +2488,13 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
2432 continue; 2488 continue;
2433 2489
2434 cfg = irq_cfg(irq); 2490 cfg = irq_cfg(irq);
2435 spin_lock(&desc->lock); 2491 raw_spin_lock(&desc->lock);
2436 if (!cfg->move_cleanup_count) 2492
2493 /*
2494 * Check if the irq migration is in progress. If so, we
2495 * haven't received the cleanup request yet for this irq.
2496 */
2497 if (cfg->move_in_progress)
2437 goto unlock; 2498 goto unlock;
2438 2499
2439 if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) 2500 if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
@@ -2452,29 +2513,43 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
2452 goto unlock; 2513 goto unlock;
2453 } 2514 }
2454 __get_cpu_var(vector_irq)[vector] = -1; 2515 __get_cpu_var(vector_irq)[vector] = -1;
2455 cfg->move_cleanup_count--;
2456unlock: 2516unlock:
2457 spin_unlock(&desc->lock); 2517 raw_spin_unlock(&desc->lock);
2458 } 2518 }
2459 2519
2460 irq_exit(); 2520 irq_exit();
2461} 2521}
2462 2522
2463static void irq_complete_move(struct irq_desc **descp) 2523static void __irq_complete_move(struct irq_desc **descp, unsigned vector)
2464{ 2524{
2465 struct irq_desc *desc = *descp; 2525 struct irq_desc *desc = *descp;
2466 struct irq_cfg *cfg = desc->chip_data; 2526 struct irq_cfg *cfg = desc->chip_data;
2467 unsigned vector, me; 2527 unsigned me;
2468 2528
2469 if (likely(!cfg->move_in_progress)) 2529 if (likely(!cfg->move_in_progress))
2470 return; 2530 return;
2471 2531
2472 vector = ~get_irq_regs()->orig_ax;
2473 me = smp_processor_id(); 2532 me = smp_processor_id();
2474 2533
2475 if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) 2534 if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
2476 send_cleanup_vector(cfg); 2535 send_cleanup_vector(cfg);
2477} 2536}
2537
2538static void irq_complete_move(struct irq_desc **descp)
2539{
2540 __irq_complete_move(descp, ~get_irq_regs()->orig_ax);
2541}
2542
2543void irq_force_complete_move(int irq)
2544{
2545 struct irq_desc *desc = irq_to_desc(irq);
2546 struct irq_cfg *cfg = desc->chip_data;
2547
2548 if (!cfg)
2549 return;
2550
2551 __irq_complete_move(&desc, cfg->vector);
2552}
2478#else 2553#else
2479static inline void irq_complete_move(struct irq_desc **descp) {} 2554static inline void irq_complete_move(struct irq_desc **descp) {}
2480#endif 2555#endif
@@ -2490,6 +2565,59 @@ static void ack_apic_edge(unsigned int irq)
2490 2565
2491atomic_t irq_mis_count; 2566atomic_t irq_mis_count;
2492 2567
2568/*
2569 * IO-APIC versions below 0x20 don't support EOI register.
2570 * For the record, here is the information about various versions:
2571 * 0Xh 82489DX
2572 * 1Xh I/OAPIC or I/O(x)APIC which are not PCI 2.2 Compliant
2573 * 2Xh I/O(x)APIC which is PCI 2.2 Compliant
2574 * 30h-FFh Reserved
2575 *
2576 * Some of the Intel ICH Specs (ICH2 to ICH5) documents the io-apic
2577 * version as 0x2. This is an error with documentation and these ICH chips
2578 * use io-apic's of version 0x20.
2579 *
2580 * For IO-APIC's with EOI register, we use that to do an explicit EOI.
2581 * Otherwise, we simulate the EOI message manually by changing the trigger
2582 * mode to edge and then back to level, with RTE being masked during this.
2583*/
2584static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
2585{
2586 struct irq_pin_list *entry;
2587
2588 for_each_irq_pin(entry, cfg->irq_2_pin) {
2589 if (mp_ioapics[entry->apic].apicver >= 0x20) {
2590 /*
2591 * Intr-remapping uses pin number as the virtual vector
2592 * in the RTE. Actual vector is programmed in
2593 * intr-remapping table entry. Hence for the io-apic
2594 * EOI we use the pin number.
2595 */
2596 if (irq_remapped(irq))
2597 io_apic_eoi(entry->apic, entry->pin);
2598 else
2599 io_apic_eoi(entry->apic, cfg->vector);
2600 } else {
2601 __mask_and_edge_IO_APIC_irq(entry);
2602 __unmask_and_level_IO_APIC_irq(entry);
2603 }
2604 }
2605}
2606
2607static void eoi_ioapic_irq(struct irq_desc *desc)
2608{
2609 struct irq_cfg *cfg;
2610 unsigned long flags;
2611 unsigned int irq;
2612
2613 irq = desc->irq;
2614 cfg = desc->chip_data;
2615
2616 raw_spin_lock_irqsave(&ioapic_lock, flags);
2617 __eoi_ioapic_irq(irq, cfg);
2618 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2619}
2620
2493static void ack_apic_level(unsigned int irq) 2621static void ack_apic_level(unsigned int irq)
2494{ 2622{
2495 struct irq_desc *desc = irq_to_desc(irq); 2623 struct irq_desc *desc = irq_to_desc(irq);
@@ -2525,6 +2653,19 @@ static void ack_apic_level(unsigned int irq)
2525 * level-triggered interrupt. We mask the source for the time of the 2653 * level-triggered interrupt. We mask the source for the time of the
2526 * operation to prevent an edge-triggered interrupt escaping meanwhile. 2654 * operation to prevent an edge-triggered interrupt escaping meanwhile.
2527 * The idea is from Manfred Spraul. --macro 2655 * The idea is from Manfred Spraul. --macro
2656 *
2657 * Also in the case when cpu goes offline, fixup_irqs() will forward
2658 * any unhandled interrupt on the offlined cpu to the new cpu
2659 * destination that is handling the corresponding interrupt. This
2660 * interrupt forwarding is done via IPI's. Hence, in this case also
2661 * level-triggered io-apic interrupt will be seen as an edge
2662 * interrupt in the IRR. And we can't rely on the cpu's EOI
2663 * to be broadcasted to the IO-APIC's which will clear the remoteIRR
2664 * corresponding to the level-triggered interrupt. Hence on IO-APIC's
2665 * supporting EOI register, we do an explicit EOI to clear the
2666 * remote IRR and on IO-APIC's which don't have an EOI register,
2667 * we use the above logic (mask+edge followed by unmask+level) from
2668 * Manfred Spraul to clear the remote IRR.
2528 */ 2669 */
2529 cfg = desc->chip_data; 2670 cfg = desc->chip_data;
2530 i = cfg->vector; 2671 i = cfg->vector;
@@ -2536,6 +2677,19 @@ static void ack_apic_level(unsigned int irq)
2536 */ 2677 */
2537 ack_APIC_irq(); 2678 ack_APIC_irq();
2538 2679
2680 /*
2681 * Tail end of clearing remote IRR bit (either by delivering the EOI
2682 * message via io-apic EOI register write or simulating it using
2683 * mask+edge followed by unnask+level logic) manually when the
2684 * level triggered interrupt is seen as the edge triggered interrupt
2685 * at the cpu.
2686 */
2687 if (!(v & (1 << (i & 0x1f)))) {
2688 atomic_inc(&irq_mis_count);
2689
2690 eoi_ioapic_irq(desc);
2691 }
2692
2539 /* Now we can move and renable the irq */ 2693 /* Now we can move and renable the irq */
2540 if (unlikely(do_unmask_irq)) { 2694 if (unlikely(do_unmask_irq)) {
2541 /* Only migrate the irq if the ack has been received. 2695 /* Only migrate the irq if the ack has been received.
@@ -2569,41 +2723,9 @@ static void ack_apic_level(unsigned int irq)
2569 move_masked_irq(irq); 2723 move_masked_irq(irq);
2570 unmask_IO_APIC_irq_desc(desc); 2724 unmask_IO_APIC_irq_desc(desc);
2571 } 2725 }
2572
2573 /* Tail end of version 0x11 I/O APIC bug workaround */
2574 if (!(v & (1 << (i & 0x1f)))) {
2575 atomic_inc(&irq_mis_count);
2576 spin_lock(&ioapic_lock);
2577 __mask_and_edge_IO_APIC_irq(cfg);
2578 __unmask_and_level_IO_APIC_irq(cfg);
2579 spin_unlock(&ioapic_lock);
2580 }
2581} 2726}
2582 2727
2583#ifdef CONFIG_INTR_REMAP 2728#ifdef CONFIG_INTR_REMAP
2584static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
2585{
2586 struct irq_pin_list *entry;
2587
2588 for_each_irq_pin(entry, cfg->irq_2_pin)
2589 io_apic_eoi(entry->apic, entry->pin);
2590}
2591
2592static void
2593eoi_ioapic_irq(struct irq_desc *desc)
2594{
2595 struct irq_cfg *cfg;
2596 unsigned long flags;
2597 unsigned int irq;
2598
2599 irq = desc->irq;
2600 cfg = desc->chip_data;
2601
2602 spin_lock_irqsave(&ioapic_lock, flags);
2603 __eoi_ioapic_irq(irq, cfg);
2604 spin_unlock_irqrestore(&ioapic_lock, flags);
2605}
2606
2607static void ir_ack_apic_edge(unsigned int irq) 2729static void ir_ack_apic_edge(unsigned int irq)
2608{ 2730{
2609 ack_APIC_irq(); 2731 ack_APIC_irq();
@@ -2671,8 +2793,8 @@ static inline void init_IO_APIC_traps(void)
2671 * so default to an old-fashioned 8259 2793 * so default to an old-fashioned 8259
2672 * interrupt if we can.. 2794 * interrupt if we can..
2673 */ 2795 */
2674 if (irq < nr_legacy_irqs) 2796 if (irq < legacy_pic->nr_legacy_irqs)
2675 make_8259A_irq(irq); 2797 legacy_pic->make_irq(irq);
2676 else 2798 else
2677 /* Strange. Oh, well.. */ 2799 /* Strange. Oh, well.. */
2678 desc->chip = &no_irq_chip; 2800 desc->chip = &no_irq_chip;
@@ -2829,7 +2951,7 @@ static inline void __init check_timer(void)
2829 /* 2951 /*
2830 * get/set the timer IRQ vector: 2952 * get/set the timer IRQ vector:
2831 */ 2953 */
2832 disable_8259A_irq(0); 2954 legacy_pic->chip->mask(0);
2833 assign_irq_vector(0, cfg, apic->target_cpus()); 2955 assign_irq_vector(0, cfg, apic->target_cpus());
2834 2956
2835 /* 2957 /*
@@ -2842,7 +2964,7 @@ static inline void __init check_timer(void)
2842 * automatically. 2964 * automatically.
2843 */ 2965 */
2844 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); 2966 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
2845 init_8259A(1); 2967 legacy_pic->init(1);
2846#ifdef CONFIG_X86_32 2968#ifdef CONFIG_X86_32
2847 { 2969 {
2848 unsigned int ver; 2970 unsigned int ver;
@@ -2901,7 +3023,7 @@ static inline void __init check_timer(void)
2901 if (timer_irq_works()) { 3023 if (timer_irq_works()) {
2902 if (nmi_watchdog == NMI_IO_APIC) { 3024 if (nmi_watchdog == NMI_IO_APIC) {
2903 setup_nmi(); 3025 setup_nmi();
2904 enable_8259A_irq(0); 3026 legacy_pic->chip->unmask(0);
2905 } 3027 }
2906 if (disable_timer_pin_1 > 0) 3028 if (disable_timer_pin_1 > 0)
2907 clear_IO_APIC_pin(0, pin1); 3029 clear_IO_APIC_pin(0, pin1);
@@ -2924,14 +3046,14 @@ static inline void __init check_timer(void)
2924 */ 3046 */
2925 replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2); 3047 replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2);
2926 setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); 3048 setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
2927 enable_8259A_irq(0); 3049 legacy_pic->chip->unmask(0);
2928 if (timer_irq_works()) { 3050 if (timer_irq_works()) {
2929 apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); 3051 apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
2930 timer_through_8259 = 1; 3052 timer_through_8259 = 1;
2931 if (nmi_watchdog == NMI_IO_APIC) { 3053 if (nmi_watchdog == NMI_IO_APIC) {
2932 disable_8259A_irq(0); 3054 legacy_pic->chip->mask(0);
2933 setup_nmi(); 3055 setup_nmi();
2934 enable_8259A_irq(0); 3056 legacy_pic->chip->unmask(0);
2935 } 3057 }
2936 goto out; 3058 goto out;
2937 } 3059 }
@@ -2939,7 +3061,7 @@ static inline void __init check_timer(void)
2939 * Cleanup, just in case ... 3061 * Cleanup, just in case ...
2940 */ 3062 */
2941 local_irq_disable(); 3063 local_irq_disable();
2942 disable_8259A_irq(0); 3064 legacy_pic->chip->mask(0);
2943 clear_IO_APIC_pin(apic2, pin2); 3065 clear_IO_APIC_pin(apic2, pin2);
2944 apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n"); 3066 apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
2945 } 3067 }
@@ -2958,22 +3080,22 @@ static inline void __init check_timer(void)
2958 3080
2959 lapic_register_intr(0, desc); 3081 lapic_register_intr(0, desc);
2960 apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ 3082 apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */
2961 enable_8259A_irq(0); 3083 legacy_pic->chip->unmask(0);
2962 3084
2963 if (timer_irq_works()) { 3085 if (timer_irq_works()) {
2964 apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); 3086 apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
2965 goto out; 3087 goto out;
2966 } 3088 }
2967 local_irq_disable(); 3089 local_irq_disable();
2968 disable_8259A_irq(0); 3090 legacy_pic->chip->mask(0);
2969 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector); 3091 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
2970 apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n"); 3092 apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
2971 3093
2972 apic_printk(APIC_QUIET, KERN_INFO 3094 apic_printk(APIC_QUIET, KERN_INFO
2973 "...trying to set up timer as ExtINT IRQ...\n"); 3095 "...trying to set up timer as ExtINT IRQ...\n");
2974 3096
2975 init_8259A(0); 3097 legacy_pic->init(0);
2976 make_8259A_irq(0); 3098 legacy_pic->make_irq(0);
2977 apic_write(APIC_LVT0, APIC_DM_EXTINT); 3099 apic_write(APIC_LVT0, APIC_DM_EXTINT);
2978 3100
2979 unlock_ExtINT_logic(); 3101 unlock_ExtINT_logic();
@@ -3015,7 +3137,7 @@ void __init setup_IO_APIC(void)
3015 /* 3137 /*
3016 * calling enable_IO_APIC() is moved to setup_local_APIC for BP 3138 * calling enable_IO_APIC() is moved to setup_local_APIC for BP
3017 */ 3139 */
3018 io_apic_irqs = nr_legacy_irqs ? ~PIC_IRQS : ~0UL; 3140 io_apic_irqs = legacy_pic->nr_legacy_irqs ? ~PIC_IRQS : ~0UL;
3019 3141
3020 apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); 3142 apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
3021 /* 3143 /*
@@ -3026,7 +3148,7 @@ void __init setup_IO_APIC(void)
3026 sync_Arb_IDs(); 3148 sync_Arb_IDs();
3027 setup_IO_APIC_irqs(); 3149 setup_IO_APIC_irqs();
3028 init_IO_APIC_traps(); 3150 init_IO_APIC_traps();
3029 if (nr_legacy_irqs) 3151 if (legacy_pic->nr_legacy_irqs)
3030 check_timer(); 3152 check_timer();
3031} 3153}
3032 3154
@@ -3075,13 +3197,13 @@ static int ioapic_resume(struct sys_device *dev)
3075 data = container_of(dev, struct sysfs_ioapic_data, dev); 3197 data = container_of(dev, struct sysfs_ioapic_data, dev);
3076 entry = data->entry; 3198 entry = data->entry;
3077 3199
3078 spin_lock_irqsave(&ioapic_lock, flags); 3200 raw_spin_lock_irqsave(&ioapic_lock, flags);
3079 reg_00.raw = io_apic_read(dev->id, 0); 3201 reg_00.raw = io_apic_read(dev->id, 0);
3080 if (reg_00.bits.ID != mp_ioapics[dev->id].apicid) { 3202 if (reg_00.bits.ID != mp_ioapics[dev->id].apicid) {
3081 reg_00.bits.ID = mp_ioapics[dev->id].apicid; 3203 reg_00.bits.ID = mp_ioapics[dev->id].apicid;
3082 io_apic_write(dev->id, 0, reg_00.raw); 3204 io_apic_write(dev->id, 0, reg_00.raw);
3083 } 3205 }
3084 spin_unlock_irqrestore(&ioapic_lock, flags); 3206 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
3085 for (i = 0; i < nr_ioapic_registers[dev->id]; i++) 3207 for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
3086 ioapic_write_entry(dev->id, i, entry[i]); 3208 ioapic_write_entry(dev->id, i, entry[i]);
3087 3209
@@ -3144,7 +3266,7 @@ unsigned int create_irq_nr(unsigned int irq_want, int node)
3144 if (irq_want < nr_irqs_gsi) 3266 if (irq_want < nr_irqs_gsi)
3145 irq_want = nr_irqs_gsi; 3267 irq_want = nr_irqs_gsi;
3146 3268
3147 spin_lock_irqsave(&vector_lock, flags); 3269 raw_spin_lock_irqsave(&vector_lock, flags);
3148 for (new = irq_want; new < nr_irqs; new++) { 3270 for (new = irq_want; new < nr_irqs; new++) {
3149 desc_new = irq_to_desc_alloc_node(new, node); 3271 desc_new = irq_to_desc_alloc_node(new, node);
3150 if (!desc_new) { 3272 if (!desc_new) {
@@ -3157,19 +3279,17 @@ unsigned int create_irq_nr(unsigned int irq_want, int node)
3157 continue; 3279 continue;
3158 3280
3159 desc_new = move_irq_desc(desc_new, node); 3281 desc_new = move_irq_desc(desc_new, node);
3282 cfg_new = desc_new->chip_data;
3160 3283
3161 if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0) 3284 if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0)
3162 irq = new; 3285 irq = new;
3163 break; 3286 break;
3164 } 3287 }
3165 spin_unlock_irqrestore(&vector_lock, flags); 3288 raw_spin_unlock_irqrestore(&vector_lock, flags);
3289
3290 if (irq > 0)
3291 dynamic_irq_init_keep_chip_data(irq);
3166 3292
3167 if (irq > 0) {
3168 dynamic_irq_init(irq);
3169 /* restore it, in case dynamic_irq_init clear it */
3170 if (desc_new)
3171 desc_new->chip_data = cfg_new;
3172 }
3173 return irq; 3293 return irq;
3174} 3294}
3175 3295
@@ -3191,27 +3311,21 @@ int create_irq(void)
3191void destroy_irq(unsigned int irq) 3311void destroy_irq(unsigned int irq)
3192{ 3312{
3193 unsigned long flags; 3313 unsigned long flags;
3194 struct irq_cfg *cfg;
3195 struct irq_desc *desc;
3196 3314
3197 /* store it, in case dynamic_irq_cleanup clear it */ 3315 dynamic_irq_cleanup_keep_chip_data(irq);
3198 desc = irq_to_desc(irq);
3199 cfg = desc->chip_data;
3200 dynamic_irq_cleanup(irq);
3201 /* connect back irq_cfg */
3202 desc->chip_data = cfg;
3203 3316
3204 free_irte(irq); 3317 free_irte(irq);
3205 spin_lock_irqsave(&vector_lock, flags); 3318 raw_spin_lock_irqsave(&vector_lock, flags);
3206 __clear_irq_vector(irq, cfg); 3319 __clear_irq_vector(irq, get_irq_chip_data(irq));
3207 spin_unlock_irqrestore(&vector_lock, flags); 3320 raw_spin_unlock_irqrestore(&vector_lock, flags);
3208} 3321}
3209 3322
3210/* 3323/*
3211 * MSI message composition 3324 * MSI message composition
3212 */ 3325 */
3213#ifdef CONFIG_PCI_MSI 3326#ifdef CONFIG_PCI_MSI
3214static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) 3327static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
3328 struct msi_msg *msg, u8 hpet_id)
3215{ 3329{
3216 struct irq_cfg *cfg; 3330 struct irq_cfg *cfg;
3217 int err; 3331 int err;
@@ -3245,7 +3359,10 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
3245 irte.dest_id = IRTE_DEST(dest); 3359 irte.dest_id = IRTE_DEST(dest);
3246 3360
3247 /* Set source-id of interrupt request */ 3361 /* Set source-id of interrupt request */
3248 set_msi_sid(&irte, pdev); 3362 if (pdev)
3363 set_msi_sid(&irte, pdev);
3364 else
3365 set_hpet_sid(&irte, hpet_id);
3249 3366
3250 modify_irte(irq, &irte); 3367 modify_irte(irq, &irte);
3251 3368
@@ -3291,8 +3408,7 @@ static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
3291 struct msi_msg msg; 3408 struct msi_msg msg;
3292 unsigned int dest; 3409 unsigned int dest;
3293 3410
3294 dest = set_desc_affinity(desc, mask); 3411 if (set_desc_affinity(desc, mask, &dest))
3295 if (dest == BAD_APICID)
3296 return -1; 3412 return -1;
3297 3413
3298 cfg = desc->chip_data; 3414 cfg = desc->chip_data;
@@ -3324,8 +3440,7 @@ ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
3324 if (get_irte(irq, &irte)) 3440 if (get_irte(irq, &irte))
3325 return -1; 3441 return -1;
3326 3442
3327 dest = set_desc_affinity(desc, mask); 3443 if (set_desc_affinity(desc, mask, &dest))
3328 if (dest == BAD_APICID)
3329 return -1; 3444 return -1;
3330 3445
3331 irte.vector = cfg->vector; 3446 irte.vector = cfg->vector;
@@ -3410,7 +3525,7 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
3410 int ret; 3525 int ret;
3411 struct msi_msg msg; 3526 struct msi_msg msg;
3412 3527
3413 ret = msi_compose_msg(dev, irq, &msg); 3528 ret = msi_compose_msg(dev, irq, &msg, -1);
3414 if (ret < 0) 3529 if (ret < 0)
3415 return ret; 3530 return ret;
3416 3531
@@ -3507,8 +3622,7 @@ static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3507 struct msi_msg msg; 3622 struct msi_msg msg;
3508 unsigned int dest; 3623 unsigned int dest;
3509 3624
3510 dest = set_desc_affinity(desc, mask); 3625 if (set_desc_affinity(desc, mask, &dest))
3511 if (dest == BAD_APICID)
3512 return -1; 3626 return -1;
3513 3627
3514 cfg = desc->chip_data; 3628 cfg = desc->chip_data;
@@ -3543,7 +3657,7 @@ int arch_setup_dmar_msi(unsigned int irq)
3543 int ret; 3657 int ret;
3544 struct msi_msg msg; 3658 struct msi_msg msg;
3545 3659
3546 ret = msi_compose_msg(NULL, irq, &msg); 3660 ret = msi_compose_msg(NULL, irq, &msg, -1);
3547 if (ret < 0) 3661 if (ret < 0)
3548 return ret; 3662 return ret;
3549 dmar_msi_write(irq, &msg); 3663 dmar_msi_write(irq, &msg);
@@ -3563,8 +3677,7 @@ static int hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3563 struct msi_msg msg; 3677 struct msi_msg msg;
3564 unsigned int dest; 3678 unsigned int dest;
3565 3679
3566 dest = set_desc_affinity(desc, mask); 3680 if (set_desc_affinity(desc, mask, &dest))
3567 if (dest == BAD_APICID)
3568 return -1; 3681 return -1;
3569 3682
3570 cfg = desc->chip_data; 3683 cfg = desc->chip_data;
@@ -3583,6 +3696,19 @@ static int hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3583 3696
3584#endif /* CONFIG_SMP */ 3697#endif /* CONFIG_SMP */
3585 3698
3699static struct irq_chip ir_hpet_msi_type = {
3700 .name = "IR-HPET_MSI",
3701 .unmask = hpet_msi_unmask,
3702 .mask = hpet_msi_mask,
3703#ifdef CONFIG_INTR_REMAP
3704 .ack = ir_ack_apic_edge,
3705#ifdef CONFIG_SMP
3706 .set_affinity = ir_set_msi_irq_affinity,
3707#endif
3708#endif
3709 .retrigger = ioapic_retrigger_irq,
3710};
3711
3586static struct irq_chip hpet_msi_type = { 3712static struct irq_chip hpet_msi_type = {
3587 .name = "HPET_MSI", 3713 .name = "HPET_MSI",
3588 .unmask = hpet_msi_unmask, 3714 .unmask = hpet_msi_unmask,
@@ -3594,20 +3720,36 @@ static struct irq_chip hpet_msi_type = {
3594 .retrigger = ioapic_retrigger_irq, 3720 .retrigger = ioapic_retrigger_irq,
3595}; 3721};
3596 3722
3597int arch_setup_hpet_msi(unsigned int irq) 3723int arch_setup_hpet_msi(unsigned int irq, unsigned int id)
3598{ 3724{
3599 int ret; 3725 int ret;
3600 struct msi_msg msg; 3726 struct msi_msg msg;
3601 struct irq_desc *desc = irq_to_desc(irq); 3727 struct irq_desc *desc = irq_to_desc(irq);
3602 3728
3603 ret = msi_compose_msg(NULL, irq, &msg); 3729 if (intr_remapping_enabled) {
3730 struct intel_iommu *iommu = map_hpet_to_ir(id);
3731 int index;
3732
3733 if (!iommu)
3734 return -1;
3735
3736 index = alloc_irte(iommu, irq, 1);
3737 if (index < 0)
3738 return -1;
3739 }
3740
3741 ret = msi_compose_msg(NULL, irq, &msg, id);
3604 if (ret < 0) 3742 if (ret < 0)
3605 return ret; 3743 return ret;
3606 3744
3607 hpet_msi_write(irq, &msg); 3745 hpet_msi_write(irq, &msg);
3608 desc->status |= IRQ_MOVE_PCNTXT; 3746 desc->status |= IRQ_MOVE_PCNTXT;
3609 set_irq_chip_and_handler_name(irq, &hpet_msi_type, handle_edge_irq, 3747 if (irq_remapped(irq))
3610 "edge"); 3748 set_irq_chip_and_handler_name(irq, &ir_hpet_msi_type,
3749 handle_edge_irq, "edge");
3750 else
3751 set_irq_chip_and_handler_name(irq, &hpet_msi_type,
3752 handle_edge_irq, "edge");
3611 3753
3612 return 0; 3754 return 0;
3613} 3755}
@@ -3641,8 +3783,7 @@ static int set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask)
3641 struct irq_cfg *cfg; 3783 struct irq_cfg *cfg;
3642 unsigned int dest; 3784 unsigned int dest;
3643 3785
3644 dest = set_desc_affinity(desc, mask); 3786 if (set_desc_affinity(desc, mask, &dest))
3645 if (dest == BAD_APICID)
3646 return -1; 3787 return -1;
3647 3788
3648 cfg = desc->chip_data; 3789 cfg = desc->chip_data;
@@ -3708,83 +3849,14 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
3708} 3849}
3709#endif /* CONFIG_HT_IRQ */ 3850#endif /* CONFIG_HT_IRQ */
3710 3851
3711#ifdef CONFIG_X86_UV
3712/*
3713 * Re-target the irq to the specified CPU and enable the specified MMR located
3714 * on the specified blade to allow the sending of MSIs to the specified CPU.
3715 */
3716int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
3717 unsigned long mmr_offset)
3718{
3719 const struct cpumask *eligible_cpu = cpumask_of(cpu);
3720 struct irq_cfg *cfg;
3721 int mmr_pnode;
3722 unsigned long mmr_value;
3723 struct uv_IO_APIC_route_entry *entry;
3724 unsigned long flags;
3725 int err;
3726
3727 BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
3728
3729 cfg = irq_cfg(irq);
3730
3731 err = assign_irq_vector(irq, cfg, eligible_cpu);
3732 if (err != 0)
3733 return err;
3734
3735 spin_lock_irqsave(&vector_lock, flags);
3736 set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq,
3737 irq_name);
3738 spin_unlock_irqrestore(&vector_lock, flags);
3739
3740 mmr_value = 0;
3741 entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
3742 entry->vector = cfg->vector;
3743 entry->delivery_mode = apic->irq_delivery_mode;
3744 entry->dest_mode = apic->irq_dest_mode;
3745 entry->polarity = 0;
3746 entry->trigger = 0;
3747 entry->mask = 0;
3748 entry->dest = apic->cpu_mask_to_apicid(eligible_cpu);
3749
3750 mmr_pnode = uv_blade_to_pnode(mmr_blade);
3751 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
3752
3753 if (cfg->move_in_progress)
3754 send_cleanup_vector(cfg);
3755
3756 return irq;
3757}
3758
3759/*
3760 * Disable the specified MMR located on the specified blade so that MSIs are
3761 * longer allowed to be sent.
3762 */
3763void arch_disable_uv_irq(int mmr_blade, unsigned long mmr_offset)
3764{
3765 unsigned long mmr_value;
3766 struct uv_IO_APIC_route_entry *entry;
3767 int mmr_pnode;
3768
3769 BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
3770
3771 mmr_value = 0;
3772 entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
3773 entry->mask = 1;
3774
3775 mmr_pnode = uv_blade_to_pnode(mmr_blade);
3776 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
3777}
3778#endif /* CONFIG_X86_64 */
3779
3780int __init io_apic_get_redir_entries (int ioapic) 3852int __init io_apic_get_redir_entries (int ioapic)
3781{ 3853{
3782 union IO_APIC_reg_01 reg_01; 3854 union IO_APIC_reg_01 reg_01;
3783 unsigned long flags; 3855 unsigned long flags;
3784 3856
3785 spin_lock_irqsave(&ioapic_lock, flags); 3857 raw_spin_lock_irqsave(&ioapic_lock, flags);
3786 reg_01.raw = io_apic_read(ioapic, 1); 3858 reg_01.raw = io_apic_read(ioapic, 1);
3787 spin_unlock_irqrestore(&ioapic_lock, flags); 3859 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
3788 3860
3789 return reg_01.bits.entries; 3861 return reg_01.bits.entries;
3790} 3862}
@@ -3867,7 +3939,7 @@ static int __io_apic_set_pci_routing(struct device *dev, int irq,
3867 /* 3939 /*
3868 * IRQs < 16 are already in the irq_2_pin[] map 3940 * IRQs < 16 are already in the irq_2_pin[] map
3869 */ 3941 */
3870 if (irq >= nr_legacy_irqs) { 3942 if (irq >= legacy_pic->nr_legacy_irqs) {
3871 cfg = desc->chip_data; 3943 cfg = desc->chip_data;
3872 if (add_pin_to_irq_node_nopanic(cfg, node, ioapic, pin)) { 3944 if (add_pin_to_irq_node_nopanic(cfg, node, ioapic, pin)) {
3873 printk(KERN_INFO "can not add pin %d for irq %d\n", 3945 printk(KERN_INFO "can not add pin %d for irq %d\n",
@@ -3944,11 +4016,11 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
3944 */ 4016 */
3945 4017
3946 if (physids_empty(apic_id_map)) 4018 if (physids_empty(apic_id_map))
3947 apic_id_map = apic->ioapic_phys_id_map(phys_cpu_present_map); 4019 apic->ioapic_phys_id_map(&phys_cpu_present_map, &apic_id_map);
3948 4020
3949 spin_lock_irqsave(&ioapic_lock, flags); 4021 raw_spin_lock_irqsave(&ioapic_lock, flags);
3950 reg_00.raw = io_apic_read(ioapic, 0); 4022 reg_00.raw = io_apic_read(ioapic, 0);
3951 spin_unlock_irqrestore(&ioapic_lock, flags); 4023 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
3952 4024
3953 if (apic_id >= get_physical_broadcast()) { 4025 if (apic_id >= get_physical_broadcast()) {
3954 printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying " 4026 printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying "
@@ -3960,10 +4032,10 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
3960 * Every APIC in a system must have a unique ID or we get lots of nice 4032 * Every APIC in a system must have a unique ID or we get lots of nice
3961 * 'stuck on smp_invalidate_needed IPI wait' messages. 4033 * 'stuck on smp_invalidate_needed IPI wait' messages.
3962 */ 4034 */
3963 if (apic->check_apicid_used(apic_id_map, apic_id)) { 4035 if (apic->check_apicid_used(&apic_id_map, apic_id)) {
3964 4036
3965 for (i = 0; i < get_physical_broadcast(); i++) { 4037 for (i = 0; i < get_physical_broadcast(); i++) {
3966 if (!apic->check_apicid_used(apic_id_map, i)) 4038 if (!apic->check_apicid_used(&apic_id_map, i))
3967 break; 4039 break;
3968 } 4040 }
3969 4041
@@ -3976,16 +4048,16 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
3976 apic_id = i; 4048 apic_id = i;
3977 } 4049 }
3978 4050
3979 tmp = apic->apicid_to_cpu_present(apic_id); 4051 apic->apicid_to_cpu_present(apic_id, &tmp);
3980 physids_or(apic_id_map, apic_id_map, tmp); 4052 physids_or(apic_id_map, apic_id_map, tmp);
3981 4053
3982 if (reg_00.bits.ID != apic_id) { 4054 if (reg_00.bits.ID != apic_id) {
3983 reg_00.bits.ID = apic_id; 4055 reg_00.bits.ID = apic_id;
3984 4056
3985 spin_lock_irqsave(&ioapic_lock, flags); 4057 raw_spin_lock_irqsave(&ioapic_lock, flags);
3986 io_apic_write(ioapic, 0, reg_00.raw); 4058 io_apic_write(ioapic, 0, reg_00.raw);
3987 reg_00.raw = io_apic_read(ioapic, 0); 4059 reg_00.raw = io_apic_read(ioapic, 0);
3988 spin_unlock_irqrestore(&ioapic_lock, flags); 4060 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
3989 4061
3990 /* Sanity check */ 4062 /* Sanity check */
3991 if (reg_00.bits.ID != apic_id) { 4063 if (reg_00.bits.ID != apic_id) {
@@ -4006,9 +4078,9 @@ int __init io_apic_get_version(int ioapic)
4006 union IO_APIC_reg_01 reg_01; 4078 union IO_APIC_reg_01 reg_01;
4007 unsigned long flags; 4079 unsigned long flags;
4008 4080
4009 spin_lock_irqsave(&ioapic_lock, flags); 4081 raw_spin_lock_irqsave(&ioapic_lock, flags);
4010 reg_01.raw = io_apic_read(ioapic, 1); 4082 reg_01.raw = io_apic_read(ioapic, 1);
4011 spin_unlock_irqrestore(&ioapic_lock, flags); 4083 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
4012 4084
4013 return reg_01.bits.version; 4085 return reg_01.bits.version;
4014} 4086}
@@ -4040,27 +4112,23 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
4040#ifdef CONFIG_SMP 4112#ifdef CONFIG_SMP
4041void __init setup_ioapic_dest(void) 4113void __init setup_ioapic_dest(void)
4042{ 4114{
4043 int pin, ioapic = 0, irq, irq_entry; 4115 int pin, ioapic, irq, irq_entry;
4044 struct irq_desc *desc; 4116 struct irq_desc *desc;
4045 const struct cpumask *mask; 4117 const struct cpumask *mask;
4046 4118
4047 if (skip_ioapic_setup == 1) 4119 if (skip_ioapic_setup == 1)
4048 return; 4120 return;
4049 4121
4050#ifdef CONFIG_ACPI 4122 for (ioapic = 0; ioapic < nr_ioapics; ioapic++)
4051 if (!acpi_disabled && acpi_ioapic) {
4052 ioapic = mp_find_ioapic(0);
4053 if (ioapic < 0)
4054 ioapic = 0;
4055 }
4056#endif
4057
4058 for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { 4123 for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
4059 irq_entry = find_irq_entry(ioapic, pin, mp_INT); 4124 irq_entry = find_irq_entry(ioapic, pin, mp_INT);
4060 if (irq_entry == -1) 4125 if (irq_entry == -1)
4061 continue; 4126 continue;
4062 irq = pin_2_irq(irq_entry, ioapic, pin); 4127 irq = pin_2_irq(irq_entry, ioapic, pin);
4063 4128
4129 if ((ioapic > 0) && (irq > 16))
4130 continue;
4131
4064 desc = irq_to_desc(irq); 4132 desc = irq_to_desc(irq);
4065 4133
4066 /* 4134 /*
@@ -4106,7 +4174,7 @@ static struct resource * __init ioapic_setup_resources(int nr_ioapics)
4106 for (i = 0; i < nr_ioapics; i++) { 4174 for (i = 0; i < nr_ioapics; i++) {
4107 res[i].name = mem; 4175 res[i].name = mem;
4108 res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY; 4176 res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY;
4109 sprintf(mem, "IOAPIC %u", i); 4177 snprintf(mem, IOAPIC_RESOURCE_NAME_SIZE, "IOAPIC %u", i);
4110 mem += IOAPIC_RESOURCE_NAME_SIZE; 4178 mem += IOAPIC_RESOURCE_NAME_SIZE;
4111 } 4179 }
4112 4180
@@ -4140,18 +4208,17 @@ void __init ioapic_init_mappings(void)
4140#ifdef CONFIG_X86_32 4208#ifdef CONFIG_X86_32
4141fake_ioapic_page: 4209fake_ioapic_page:
4142#endif 4210#endif
4143 ioapic_phys = (unsigned long) 4211 ioapic_phys = (unsigned long)alloc_bootmem_pages(PAGE_SIZE);
4144 alloc_bootmem_pages(PAGE_SIZE);
4145 ioapic_phys = __pa(ioapic_phys); 4212 ioapic_phys = __pa(ioapic_phys);
4146 } 4213 }
4147 set_fixmap_nocache(idx, ioapic_phys); 4214 set_fixmap_nocache(idx, ioapic_phys);
4148 apic_printk(APIC_VERBOSE, 4215 apic_printk(APIC_VERBOSE, "mapped IOAPIC to %08lx (%08lx)\n",
4149 "mapped IOAPIC to %08lx (%08lx)\n", 4216 __fix_to_virt(idx) + (ioapic_phys & ~PAGE_MASK),
4150 __fix_to_virt(idx), ioapic_phys); 4217 ioapic_phys);
4151 idx++; 4218 idx++;
4152 4219
4153 ioapic_res->start = ioapic_phys; 4220 ioapic_res->start = ioapic_phys;
4154 ioapic_res->end = ioapic_phys + (4 * 1024) - 1; 4221 ioapic_res->end = ioapic_phys + IO_APIC_SLOT_SIZE - 1;
4155 ioapic_res++; 4222 ioapic_res++;
4156 } 4223 }
4157} 4224}
@@ -4246,3 +4313,24 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
4246 4313
4247 nr_ioapics++; 4314 nr_ioapics++;
4248} 4315}
4316
4317/* Enable IOAPIC early just for system timer */
4318void __init pre_init_apic_IRQ0(void)
4319{
4320 struct irq_cfg *cfg;
4321 struct irq_desc *desc;
4322
4323 printk(KERN_INFO "Early APIC setup for system timer0\n");
4324#ifndef CONFIG_SMP
4325 phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid);
4326#endif
4327 desc = irq_to_desc_alloc_node(0, 0);
4328
4329 setup_local_APIC();
4330
4331 cfg = irq_cfg(0);
4332 add_pin_to_irq_node(cfg, 0, 0, 0);
4333 set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge");
4334
4335 setup_IO_APIC_irq(0, 0, 0, desc, 0, 0);
4336}
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
index 7ff61d6a188a..1edaf15c0b8e 100644
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -18,6 +18,7 @@
18#include <linux/delay.h> 18#include <linux/delay.h>
19#include <linux/interrupt.h> 19#include <linux/interrupt.h>
20#include <linux/module.h> 20#include <linux/module.h>
21#include <linux/slab.h>
21#include <linux/sysdev.h> 22#include <linux/sysdev.h>
22#include <linux/sysctl.h> 23#include <linux/sysctl.h>
23#include <linux/percpu.h> 24#include <linux/percpu.h>
@@ -39,7 +40,8 @@
39int unknown_nmi_panic; 40int unknown_nmi_panic;
40int nmi_watchdog_enabled; 41int nmi_watchdog_enabled;
41 42
42static cpumask_t backtrace_mask __read_mostly; 43/* For reliability, we're prepared to waste bits here. */
44static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
43 45
44/* nmi_active: 46/* nmi_active:
45 * >0: the lapic NMI watchdog is active, but can be disabled 47 * >0: the lapic NMI watchdog is active, but can be disabled
@@ -176,7 +178,7 @@ int __init check_nmi_watchdog(void)
176error: 178error:
177 if (nmi_watchdog == NMI_IO_APIC) { 179 if (nmi_watchdog == NMI_IO_APIC) {
178 if (!timer_through_8259) 180 if (!timer_through_8259)
179 disable_8259A_irq(0); 181 legacy_pic->chip->mask(0);
180 on_each_cpu(__acpi_nmi_disable, NULL, 1); 182 on_each_cpu(__acpi_nmi_disable, NULL, 1);
181 } 183 }
182 184
@@ -360,7 +362,7 @@ void stop_apic_nmi_watchdog(void *unused)
360 */ 362 */
361 363
362static DEFINE_PER_CPU(unsigned, last_irq_sum); 364static DEFINE_PER_CPU(unsigned, last_irq_sum);
363static DEFINE_PER_CPU(local_t, alert_counter); 365static DEFINE_PER_CPU(long, alert_counter);
364static DEFINE_PER_CPU(int, nmi_touch); 366static DEFINE_PER_CPU(int, nmi_touch);
365 367
366void touch_nmi_watchdog(void) 368void touch_nmi_watchdog(void)
@@ -414,15 +416,15 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
414 } 416 }
415 417
416 /* We can be called before check_nmi_watchdog, hence NULL check. */ 418 /* We can be called before check_nmi_watchdog, hence NULL check. */
417 if (cpumask_test_cpu(cpu, &backtrace_mask)) { 419 if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
418 static DEFINE_SPINLOCK(lock); /* Serialise the printks */ 420 static DEFINE_RAW_SPINLOCK(lock); /* Serialise the printks */
419 421
420 spin_lock(&lock); 422 raw_spin_lock(&lock);
421 printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); 423 printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
422 show_regs(regs); 424 show_regs(regs);
423 dump_stack(); 425 dump_stack();
424 spin_unlock(&lock); 426 raw_spin_unlock(&lock);
425 cpumask_clear_cpu(cpu, &backtrace_mask); 427 cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
426 428
427 rc = 1; 429 rc = 1;
428 } 430 }
@@ -437,8 +439,8 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
437 * Ayiee, looks like this CPU is stuck ... 439 * Ayiee, looks like this CPU is stuck ...
438 * wait a few IRQs (5 seconds) before doing the oops ... 440 * wait a few IRQs (5 seconds) before doing the oops ...
439 */ 441 */
440 local_inc(&__get_cpu_var(alert_counter)); 442 __this_cpu_inc(alert_counter);
441 if (local_read(&__get_cpu_var(alert_counter)) == 5 * nmi_hz) 443 if (__this_cpu_read(alert_counter) == 5 * nmi_hz)
442 /* 444 /*
443 * die_nmi will return ONLY if NOTIFY_STOP happens.. 445 * die_nmi will return ONLY if NOTIFY_STOP happens..
444 */ 446 */
@@ -446,7 +448,7 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
446 regs, panic_on_timeout); 448 regs, panic_on_timeout);
447 } else { 449 } else {
448 __get_cpu_var(last_irq_sum) = sum; 450 __get_cpu_var(last_irq_sum) = sum;
449 local_set(&__get_cpu_var(alert_counter), 0); 451 __this_cpu_write(alert_counter, 0);
450 } 452 }
451 453
452 /* see if the nmi watchdog went off */ 454 /* see if the nmi watchdog went off */
@@ -558,14 +560,14 @@ void arch_trigger_all_cpu_backtrace(void)
558{ 560{
559 int i; 561 int i;
560 562
561 cpumask_copy(&backtrace_mask, cpu_online_mask); 563 cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask);
562 564
563 printk(KERN_INFO "sending NMI to all CPUs:\n"); 565 printk(KERN_INFO "sending NMI to all CPUs:\n");
564 apic->send_IPI_all(NMI_VECTOR); 566 apic->send_IPI_all(NMI_VECTOR);
565 567
566 /* Wait for up to 10 seconds for all CPUs to do the backtrace */ 568 /* Wait for up to 10 seconds for all CPUs to do the backtrace */
567 for (i = 0; i < 10 * 1000; i++) { 569 for (i = 0; i < 10 * 1000; i++) {
568 if (cpumask_empty(&backtrace_mask)) 570 if (cpumask_empty(to_cpumask(backtrace_mask)))
569 break; 571 break;
570 mdelay(1); 572 mdelay(1);
571 } 573 }
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index efa00e2b8505..3e28401f161c 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -225,7 +225,7 @@ static void __init smp_read_mpc_oem(struct mpc_table *mpc)
225 225
226 mpc_record = 0; 226 mpc_record = 0;
227 printk(KERN_INFO 227 printk(KERN_INFO
228 "Found an OEM MPC table at %8p - parsing it ... \n", oemtable); 228 "Found an OEM MPC table at %8p - parsing it...\n", oemtable);
229 229
230 if (memcmp(oemtable->signature, MPC_OEM_SIGNATURE, 4)) { 230 if (memcmp(oemtable->signature, MPC_OEM_SIGNATURE, 4)) {
231 printk(KERN_WARNING 231 printk(KERN_WARNING
@@ -264,11 +264,6 @@ static void __init smp_read_mpc_oem(struct mpc_table *mpc)
264static __init void early_check_numaq(void) 264static __init void early_check_numaq(void)
265{ 265{
266 /* 266 /*
267 * Find possible boot-time SMP configuration:
268 */
269 early_find_smp_config();
270
271 /*
272 * get boot-time SMP configuration: 267 * get boot-time SMP configuration:
273 */ 268 */
274 if (smp_found_config) 269 if (smp_found_config)
@@ -282,6 +277,7 @@ static __init void early_check_numaq(void)
282 x86_init.mpparse.mpc_oem_pci_bus = mpc_oem_pci_bus; 277 x86_init.mpparse.mpc_oem_pci_bus = mpc_oem_pci_bus;
283 x86_init.mpparse.mpc_oem_bus_info = mpc_oem_bus_info; 278 x86_init.mpparse.mpc_oem_bus_info = mpc_oem_bus_info;
284 x86_init.timers.tsc_pre_init = numaq_tsc_init; 279 x86_init.timers.tsc_pre_init = numaq_tsc_init;
280 x86_init.pci.init = pci_numaq_init;
285 } 281 }
286} 282}
287 283
@@ -334,10 +330,9 @@ static inline const struct cpumask *numaq_target_cpus(void)
334 return cpu_all_mask; 330 return cpu_all_mask;
335} 331}
336 332
337static inline unsigned long 333static unsigned long numaq_check_apicid_used(physid_mask_t *map, int apicid)
338numaq_check_apicid_used(physid_mask_t bitmap, int apicid)
339{ 334{
340 return physid_isset(apicid, bitmap); 335 return physid_isset(apicid, *map);
341} 336}
342 337
343static inline unsigned long numaq_check_apicid_present(int bit) 338static inline unsigned long numaq_check_apicid_present(int bit)
@@ -371,10 +366,10 @@ static inline int numaq_multi_timer_check(int apic, int irq)
371 return apic != 0 && irq == 0; 366 return apic != 0 && irq == 0;
372} 367}
373 368
374static inline physid_mask_t numaq_ioapic_phys_id_map(physid_mask_t phys_map) 369static inline void numaq_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
375{ 370{
376 /* We don't have a good way to do this yet - hack */ 371 /* We don't have a good way to do this yet - hack */
377 return physids_promote(0xFUL); 372 return physids_promote(0xFUL, retmap);
378} 373}
379 374
380static inline int numaq_cpu_to_logical_apicid(int cpu) 375static inline int numaq_cpu_to_logical_apicid(int cpu)
@@ -402,12 +397,12 @@ static inline int numaq_apicid_to_node(int logical_apicid)
402 return logical_apicid >> 4; 397 return logical_apicid >> 4;
403} 398}
404 399
405static inline physid_mask_t numaq_apicid_to_cpu_present(int logical_apicid) 400static void numaq_apicid_to_cpu_present(int logical_apicid, physid_mask_t *retmap)
406{ 401{
407 int node = numaq_apicid_to_node(logical_apicid); 402 int node = numaq_apicid_to_node(logical_apicid);
408 int cpu = __ffs(logical_apicid & 0xf); 403 int cpu = __ffs(logical_apicid & 0xf);
409 404
410 return physid_mask_of_physid(cpu + 4*node); 405 physid_set_mask_of_physid(cpu + 4*node, retmap);
411} 406}
412 407
413/* Where the IO area was mapped on multiquad, always 0 otherwise */ 408/* Where the IO area was mapped on multiquad, always 0 otherwise */
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index 0c0182cc947d..99d2fe016084 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -52,7 +52,32 @@ static int __init print_ipi_mode(void)
52} 52}
53late_initcall(print_ipi_mode); 53late_initcall(print_ipi_mode);
54 54
55void default_setup_apic_routing(void) 55void __init default_setup_apic_routing(void)
56{
57 int version = apic_version[boot_cpu_physical_apicid];
58
59 if (num_possible_cpus() > 8) {
60 switch (boot_cpu_data.x86_vendor) {
61 case X86_VENDOR_INTEL:
62 if (!APIC_XAPIC(version)) {
63 def_to_bigsmp = 0;
64 break;
65 }
66 /* If P4 and above fall through */
67 case X86_VENDOR_AMD:
68 def_to_bigsmp = 1;
69 }
70 }
71
72#ifdef CONFIG_X86_BIGSMP
73 generic_bigsmp_probe();
74#endif
75
76 if (apic->setup_apic_routing)
77 apic->setup_apic_routing();
78}
79
80static void setup_apic_flat_routing(void)
56{ 81{
57#ifdef CONFIG_X86_IO_APIC 82#ifdef CONFIG_X86_IO_APIC
58 printk(KERN_INFO 83 printk(KERN_INFO
@@ -103,12 +128,12 @@ struct apic apic_default = {
103 .init_apic_ldr = default_init_apic_ldr, 128 .init_apic_ldr = default_init_apic_ldr,
104 129
105 .ioapic_phys_id_map = default_ioapic_phys_id_map, 130 .ioapic_phys_id_map = default_ioapic_phys_id_map,
106 .setup_apic_routing = default_setup_apic_routing, 131 .setup_apic_routing = setup_apic_flat_routing,
107 .multi_timer_check = NULL, 132 .multi_timer_check = NULL,
108 .apicid_to_node = default_apicid_to_node, 133 .apicid_to_node = default_apicid_to_node,
109 .cpu_to_logical_apicid = default_cpu_to_logical_apicid, 134 .cpu_to_logical_apicid = default_cpu_to_logical_apicid,
110 .cpu_present_to_apicid = default_cpu_present_to_apicid, 135 .cpu_present_to_apicid = default_cpu_present_to_apicid,
111 .apicid_to_cpu_present = default_apicid_to_cpu_present, 136 .apicid_to_cpu_present = physid_set_mask_of_physid,
112 .setup_portio_remap = NULL, 137 .setup_portio_remap = NULL,
113 .check_phys_apicid_present = default_check_phys_apicid_present, 138 .check_phys_apicid_present = default_check_phys_apicid_present,
114 .enable_apic_mode = NULL, 139 .enable_apic_mode = NULL,
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
index c4cbd3080c1c..83e9be4778e2 100644
--- a/arch/x86/kernel/apic/probe_64.c
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -67,17 +67,8 @@ void __init default_setup_apic_routing(void)
67 } 67 }
68#endif 68#endif
69 69
70 if (apic == &apic_flat) { 70 if (apic == &apic_flat && num_possible_cpus() > 8)
71 switch (boot_cpu_data.x86_vendor) { 71 apic = &apic_physflat;
72 case X86_VENDOR_INTEL:
73 if (num_processors > 8)
74 apic = &apic_physflat;
75 break;
76 case X86_VENDOR_AMD:
77 if (max_physical_apicid >= 8)
78 apic = &apic_physflat;
79 }
80 }
81 72
82 printk(KERN_INFO "Setting APIC routing to %s\n", apic->name); 73 printk(KERN_INFO "Setting APIC routing to %s\n", apic->name);
83 74
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
index 645ecc4ff0be..9b419263d90d 100644
--- a/arch/x86/kernel/apic/summit_32.c
+++ b/arch/x86/kernel/apic/summit_32.c
@@ -183,7 +183,7 @@ static const struct cpumask *summit_target_cpus(void)
183 return cpumask_of(0); 183 return cpumask_of(0);
184} 184}
185 185
186static unsigned long summit_check_apicid_used(physid_mask_t bitmap, int apicid) 186static unsigned long summit_check_apicid_used(physid_mask_t *map, int apicid)
187{ 187{
188 return 0; 188 return 0;
189} 189}
@@ -261,15 +261,15 @@ static int summit_cpu_present_to_apicid(int mps_cpu)
261 return BAD_APICID; 261 return BAD_APICID;
262} 262}
263 263
264static physid_mask_t summit_ioapic_phys_id_map(physid_mask_t phys_id_map) 264static void summit_ioapic_phys_id_map(physid_mask_t *phys_id_map, physid_mask_t *retmap)
265{ 265{
266 /* For clustered we don't have a good way to do this yet - hack */ 266 /* For clustered we don't have a good way to do this yet - hack */
267 return physids_promote(0x0F); 267 physids_promote(0x0FL, retmap);
268} 268}
269 269
270static physid_mask_t summit_apicid_to_cpu_present(int apicid) 270static void summit_apicid_to_cpu_present(int apicid, physid_mask_t *retmap)
271{ 271{
272 return physid_mask_of_physid(0); 272 physid_set_mask_of_physid(0, retmap);
273} 273}
274 274
275static int summit_check_phys_apicid_present(int physical_apicid) 275static int summit_check_phys_apicid_present(int physical_apicid)
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index a5371ec36776..cf69c59f4910 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -148,10 +148,7 @@ x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
148 break; 148 break;
149 } 149 }
150 150
151 if (cpu < nr_cpu_ids) 151 return per_cpu(x86_cpu_to_logical_apicid, cpu);
152 return per_cpu(x86_cpu_to_logical_apicid, cpu);
153
154 return BAD_APICID;
155} 152}
156 153
157static unsigned int x2apic_cluster_phys_get_apic_id(unsigned long x) 154static unsigned int x2apic_cluster_phys_get_apic_id(unsigned long x)
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index a8989aadc99a..8972f38c5ced 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -146,10 +146,7 @@ x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
146 break; 146 break;
147 } 147 }
148 148
149 if (cpu < nr_cpu_ids) 149 return per_cpu(x86_cpu_to_apicid, cpu);
150 return per_cpu(x86_cpu_to_apicid, cpu);
151
152 return BAD_APICID;
153} 150}
154 151
155static unsigned int x2apic_phys_get_apic_id(unsigned long x) 152static unsigned int x2apic_phys_get_apic_id(unsigned long x)
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 326c25477d3d..c085d52dbaf2 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -5,7 +5,7 @@
5 * 5 *
6 * SGI UV APIC functions (note: not an Intel compatible APIC) 6 * SGI UV APIC functions (note: not an Intel compatible APIC)
7 * 7 *
8 * Copyright (C) 2007-2008 Silicon Graphics, Inc. All rights reserved. 8 * Copyright (C) 2007-2009 Silicon Graphics, Inc. All rights reserved.
9 */ 9 */
10#include <linux/cpumask.h> 10#include <linux/cpumask.h>
11#include <linux/hardirq.h> 11#include <linux/hardirq.h>
@@ -17,9 +17,12 @@
17#include <linux/ctype.h> 17#include <linux/ctype.h>
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/timer.h> 19#include <linux/timer.h>
20#include <linux/slab.h>
20#include <linux/cpu.h> 21#include <linux/cpu.h>
21#include <linux/init.h> 22#include <linux/init.h>
22#include <linux/io.h> 23#include <linux/io.h>
24#include <linux/pci.h>
25#include <linux/kdebug.h>
23 26
24#include <asm/uv/uv_mmrs.h> 27#include <asm/uv/uv_mmrs.h>
25#include <asm/uv/uv_hub.h> 28#include <asm/uv/uv_hub.h>
@@ -30,10 +33,27 @@
30#include <asm/apic.h> 33#include <asm/apic.h>
31#include <asm/ipi.h> 34#include <asm/ipi.h>
32#include <asm/smp.h> 35#include <asm/smp.h>
36#include <asm/x86_init.h>
33 37
34DEFINE_PER_CPU(int, x2apic_extra_bits); 38DEFINE_PER_CPU(int, x2apic_extra_bits);
35 39
40#define PR_DEVEL(fmt, args...) pr_devel("%s: " fmt, __func__, args)
41
36static enum uv_system_type uv_system_type; 42static enum uv_system_type uv_system_type;
43static u64 gru_start_paddr, gru_end_paddr;
44int uv_min_hub_revision_id;
45EXPORT_SYMBOL_GPL(uv_min_hub_revision_id);
46static DEFINE_SPINLOCK(uv_nmi_lock);
47
48static inline bool is_GRU_range(u64 start, u64 end)
49{
50 return start >= gru_start_paddr && end <= gru_end_paddr;
51}
52
53static bool uv_is_untracked_pat_range(u64 start, u64 end)
54{
55 return is_ISA_range(start, end) || is_GRU_range(start, end);
56}
37 57
38static int early_get_nodeid(void) 58static int early_get_nodeid(void)
39{ 59{
@@ -43,19 +63,28 @@ static int early_get_nodeid(void)
43 mmr = early_ioremap(UV_LOCAL_MMR_BASE | UVH_NODE_ID, sizeof(*mmr)); 63 mmr = early_ioremap(UV_LOCAL_MMR_BASE | UVH_NODE_ID, sizeof(*mmr));
44 node_id.v = *mmr; 64 node_id.v = *mmr;
45 early_iounmap(mmr, sizeof(*mmr)); 65 early_iounmap(mmr, sizeof(*mmr));
66
67 /* Currently, all blades have same revision number */
68 uv_min_hub_revision_id = node_id.s.revision;
69
46 return node_id.s.node_id; 70 return node_id.s.node_id;
47} 71}
48 72
49static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) 73static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
50{ 74{
75 int nodeid;
76
51 if (!strcmp(oem_id, "SGI")) { 77 if (!strcmp(oem_id, "SGI")) {
78 nodeid = early_get_nodeid();
79 x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range;
80 x86_platform.nmi_init = uv_nmi_init;
52 if (!strcmp(oem_table_id, "UVL")) 81 if (!strcmp(oem_table_id, "UVL"))
53 uv_system_type = UV_LEGACY_APIC; 82 uv_system_type = UV_LEGACY_APIC;
54 else if (!strcmp(oem_table_id, "UVX")) 83 else if (!strcmp(oem_table_id, "UVX"))
55 uv_system_type = UV_X2APIC; 84 uv_system_type = UV_X2APIC;
56 else if (!strcmp(oem_table_id, "UVH")) { 85 else if (!strcmp(oem_table_id, "UVH")) {
57 __get_cpu_var(x2apic_extra_bits) = 86 __get_cpu_var(x2apic_extra_bits) =
58 early_get_nodeid() << (UV_APIC_PNODE_SHIFT - 1); 87 nodeid << (UV_APIC_PNODE_SHIFT - 1);
59 uv_system_type = UV_NON_UNIQUE_APIC; 88 uv_system_type = UV_NON_UNIQUE_APIC;
60 return 1; 89 return 1;
61 } 90 }
@@ -92,11 +121,9 @@ EXPORT_SYMBOL_GPL(uv_possible_blades);
92unsigned long sn_rtc_cycles_per_second; 121unsigned long sn_rtc_cycles_per_second;
93EXPORT_SYMBOL(sn_rtc_cycles_per_second); 122EXPORT_SYMBOL(sn_rtc_cycles_per_second);
94 123
95/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */
96
97static const struct cpumask *uv_target_cpus(void) 124static const struct cpumask *uv_target_cpus(void)
98{ 125{
99 return cpumask_of(0); 126 return cpu_online_mask;
100} 127}
101 128
102static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask) 129static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask)
@@ -212,10 +239,7 @@ uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
212 if (cpumask_test_cpu(cpu, cpu_online_mask)) 239 if (cpumask_test_cpu(cpu, cpu_online_mask))
213 break; 240 break;
214 } 241 }
215 if (cpu < nr_cpu_ids) 242 return per_cpu(x86_cpu_to_apicid, cpu);
216 return per_cpu(x86_cpu_to_apicid, cpu);
217
218 return BAD_APICID;
219} 243}
220 244
221static unsigned int x2apic_get_apic_id(unsigned long x) 245static unsigned int x2apic_get_apic_id(unsigned long x)
@@ -364,13 +388,13 @@ static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size)
364 388
365enum map_type {map_wb, map_uc}; 389enum map_type {map_wb, map_uc};
366 390
367static __init void map_high(char *id, unsigned long base, int shift, 391static __init void map_high(char *id, unsigned long base, int pshift,
368 int max_pnode, enum map_type map_type) 392 int bshift, int max_pnode, enum map_type map_type)
369{ 393{
370 unsigned long bytes, paddr; 394 unsigned long bytes, paddr;
371 395
372 paddr = base << shift; 396 paddr = base << pshift;
373 bytes = (1UL << shift) * (max_pnode + 1); 397 bytes = (1UL << bshift) * (max_pnode + 1);
374 printk(KERN_INFO "UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr, 398 printk(KERN_INFO "UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr,
375 paddr + bytes); 399 paddr + bytes);
376 if (map_type == map_uc) 400 if (map_type == map_uc)
@@ -385,8 +409,12 @@ static __init void map_gru_high(int max_pnode)
385 int shift = UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT; 409 int shift = UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT;
386 410
387 gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR); 411 gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR);
388 if (gru.s.enable) 412 if (gru.s.enable) {
389 map_high("GRU", gru.s.base, shift, max_pnode, map_wb); 413 map_high("GRU", gru.s.base, shift, shift, max_pnode, map_wb);
414 gru_start_paddr = ((u64)gru.s.base << shift);
415 gru_end_paddr = gru_start_paddr + (1UL << shift) * (max_pnode + 1);
416
417 }
390} 418}
391 419
392static __init void map_mmr_high(int max_pnode) 420static __init void map_mmr_high(int max_pnode)
@@ -396,7 +424,7 @@ static __init void map_mmr_high(int max_pnode)
396 424
397 mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR); 425 mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR);
398 if (mmr.s.enable) 426 if (mmr.s.enable)
399 map_high("MMR", mmr.s.base, shift, max_pnode, map_uc); 427 map_high("MMR", mmr.s.base, shift, shift, max_pnode, map_uc);
400} 428}
401 429
402static __init void map_mmioh_high(int max_pnode) 430static __init void map_mmioh_high(int max_pnode)
@@ -406,7 +434,14 @@ static __init void map_mmioh_high(int max_pnode)
406 434
407 mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR); 435 mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR);
408 if (mmioh.s.enable) 436 if (mmioh.s.enable)
409 map_high("MMIOH", mmioh.s.base, shift, max_pnode, map_uc); 437 map_high("MMIOH", mmioh.s.base, shift, mmioh.s.m_io,
438 max_pnode, map_uc);
439}
440
441static __init void map_low_mmrs(void)
442{
443 init_extra_mapping_uc(UV_GLOBAL_MMR32_BASE, UV_GLOBAL_MMR32_SIZE);
444 init_extra_mapping_uc(UV_LOCAL_MMR_BASE, UV_LOCAL_MMR_SIZE);
410} 445}
411 446
412static __init void uv_rtc_init(void) 447static __init void uv_rtc_init(void)
@@ -452,7 +487,7 @@ static void uv_heartbeat(unsigned long ignored)
452 487
453static void __cpuinit uv_heartbeat_enable(int cpu) 488static void __cpuinit uv_heartbeat_enable(int cpu)
454{ 489{
455 if (!uv_cpu_hub_info(cpu)->scir.enabled) { 490 while (!uv_cpu_hub_info(cpu)->scir.enabled) {
456 struct timer_list *timer = &uv_cpu_hub_info(cpu)->scir.timer; 491 struct timer_list *timer = &uv_cpu_hub_info(cpu)->scir.timer;
457 492
458 uv_set_cpu_scir_bits(cpu, SCIR_CPU_HEARTBEAT|SCIR_CPU_ACTIVITY); 493 uv_set_cpu_scir_bits(cpu, SCIR_CPU_HEARTBEAT|SCIR_CPU_ACTIVITY);
@@ -460,11 +495,10 @@ static void __cpuinit uv_heartbeat_enable(int cpu)
460 timer->expires = jiffies + SCIR_CPU_HB_INTERVAL; 495 timer->expires = jiffies + SCIR_CPU_HB_INTERVAL;
461 add_timer_on(timer, cpu); 496 add_timer_on(timer, cpu);
462 uv_cpu_hub_info(cpu)->scir.enabled = 1; 497 uv_cpu_hub_info(cpu)->scir.enabled = 1;
463 }
464 498
465 /* check boot cpu */ 499 /* also ensure that boot cpu is enabled */
466 if (!uv_cpu_hub_info(0)->scir.enabled) 500 cpu = 0;
467 uv_heartbeat_enable(0); 501 }
468} 502}
469 503
470#ifdef CONFIG_HOTPLUG_CPU 504#ifdef CONFIG_HOTPLUG_CPU
@@ -523,6 +557,30 @@ late_initcall(uv_init_heartbeat);
523 557
524#endif /* !CONFIG_HOTPLUG_CPU */ 558#endif /* !CONFIG_HOTPLUG_CPU */
525 559
560/* Direct Legacy VGA I/O traffic to designated IOH */
561int uv_set_vga_state(struct pci_dev *pdev, bool decode,
562 unsigned int command_bits, bool change_bridge)
563{
564 int domain, bus, rc;
565
566 PR_DEVEL("devfn %x decode %d cmd %x chg_brdg %d\n",
567 pdev->devfn, decode, command_bits, change_bridge);
568
569 if (!change_bridge)
570 return 0;
571
572 if ((command_bits & PCI_COMMAND_IO) == 0)
573 return 0;
574
575 domain = pci_domain_nr(pdev->bus);
576 bus = pdev->bus->number;
577
578 rc = uv_bios_set_legacy_vga_target(decode, domain, bus);
579 PR_DEVEL("vga decode %d %x:%x, rc: %d\n", decode, domain, bus, rc);
580
581 return rc;
582}
583
526/* 584/*
527 * Called on each cpu to initialize the per_cpu UV data area. 585 * Called on each cpu to initialize the per_cpu UV data area.
528 * FIXME: hotplug not supported yet 586 * FIXME: hotplug not supported yet
@@ -539,6 +597,46 @@ void __cpuinit uv_cpu_init(void)
539 set_x2apic_extra_bits(uv_hub_info->pnode); 597 set_x2apic_extra_bits(uv_hub_info->pnode);
540} 598}
541 599
600/*
601 * When NMI is received, print a stack trace.
602 */
603int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data)
604{
605 if (reason != DIE_NMI_IPI)
606 return NOTIFY_OK;
607 /*
608 * Use a lock so only one cpu prints at a time
609 * to prevent intermixed output.
610 */
611 spin_lock(&uv_nmi_lock);
612 pr_info("NMI stack dump cpu %u:\n", smp_processor_id());
613 dump_stack();
614 spin_unlock(&uv_nmi_lock);
615
616 return NOTIFY_STOP;
617}
618
619static struct notifier_block uv_dump_stack_nmi_nb = {
620 .notifier_call = uv_handle_nmi
621};
622
623void uv_register_nmi_notifier(void)
624{
625 if (register_die_notifier(&uv_dump_stack_nmi_nb))
626 printk(KERN_WARNING "UV NMI handler failed to register\n");
627}
628
629void uv_nmi_init(void)
630{
631 unsigned int value;
632
633 /*
634 * Unmask NMI on all cpus
635 */
636 value = apic_read(APIC_LVT1) | APIC_DM_NMI;
637 value &= ~APIC_LVT_MASKED;
638 apic_write(APIC_LVT1, value);
639}
542 640
543void __init uv_system_init(void) 641void __init uv_system_init(void)
544{ 642{
@@ -550,6 +648,8 @@ void __init uv_system_init(void)
550 unsigned long mmr_base, present, paddr; 648 unsigned long mmr_base, present, paddr;
551 unsigned short pnode_mask; 649 unsigned short pnode_mask;
552 650
651 map_low_mmrs();
652
553 m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG); 653 m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG);
554 m_val = m_n_config.s.m_skt; 654 m_val = m_n_config.s.m_skt;
555 n_val = m_n_config.s.n_skt; 655 n_val = m_n_config.s.n_skt;
@@ -602,13 +702,15 @@ void __init uv_system_init(void)
602 } 702 }
603 703
604 uv_bios_init(); 704 uv_bios_init();
605 uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, 705 uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, &sn_coherency_id,
606 &sn_coherency_id, &sn_region_size); 706 &sn_region_size, &system_serial_number);
607 uv_rtc_init(); 707 uv_rtc_init();
608 708
609 for_each_present_cpu(cpu) { 709 for_each_present_cpu(cpu) {
710 int apicid = per_cpu(x86_cpu_to_apicid, cpu);
711
610 nid = cpu_to_node(cpu); 712 nid = cpu_to_node(cpu);
611 pnode = uv_apicid_to_pnode(per_cpu(x86_cpu_to_apicid, cpu)); 713 pnode = uv_apicid_to_pnode(apicid);
612 blade = boot_pnode_to_blade(pnode); 714 blade = boot_pnode_to_blade(pnode);
613 lcpu = uv_blade_info[blade].nr_possible_cpus; 715 lcpu = uv_blade_info[blade].nr_possible_cpus;
614 uv_blade_info[blade].nr_possible_cpus++; 716 uv_blade_info[blade].nr_possible_cpus++;
@@ -629,15 +731,13 @@ void __init uv_system_init(void)
629 uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra; 731 uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra;
630 uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base; 732 uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base;
631 uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id; 733 uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id;
632 uv_cpu_hub_info(cpu)->scir.offset = SCIR_LOCAL_MMR_BASE + lcpu; 734 uv_cpu_hub_info(cpu)->scir.offset = uv_scir_offset(apicid);
633 uv_node_to_blade[nid] = blade; 735 uv_node_to_blade[nid] = blade;
634 uv_cpu_to_blade[cpu] = blade; 736 uv_cpu_to_blade[cpu] = blade;
635 max_pnode = max(pnode, max_pnode); 737 max_pnode = max(pnode, max_pnode);
636 738
637 printk(KERN_DEBUG "UV: cpu %d, apicid 0x%x, pnode %d, nid %d, " 739 printk(KERN_DEBUG "UV: cpu %d, apicid 0x%x, pnode %d, nid %d, lcpu %d, blade %d\n",
638 "lcpu %d, blade %d\n", 740 cpu, apicid, pnode, nid, lcpu, blade);
639 cpu, per_cpu(x86_cpu_to_apicid, cpu), pnode, nid,
640 lcpu, blade);
641 } 741 }
642 742
643 /* Add blade/pnode info for nodes without cpus */ 743 /* Add blade/pnode info for nodes without cpus */
@@ -658,5 +758,9 @@ void __init uv_system_init(void)
658 758
659 uv_cpu_init(); 759 uv_cpu_init();
660 uv_scir_register_cpu_notifier(); 760 uv_scir_register_cpu_notifier();
761 uv_register_nmi_notifier();
661 proc_mkdir("sgi_uv", NULL); 762 proc_mkdir("sgi_uv", NULL);
763
764 /* register Legacy VGA I/O redirection handler */
765 pci_register_set_vga_state(uv_set_vga_state);
662} 766}
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 151ace69a5aa..031aa887b0eb 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -204,7 +204,6 @@
204#include <linux/module.h> 204#include <linux/module.h>
205 205
206#include <linux/poll.h> 206#include <linux/poll.h>
207#include <linux/smp_lock.h>
208#include <linux/types.h> 207#include <linux/types.h>
209#include <linux/stddef.h> 208#include <linux/stddef.h>
210#include <linux/timer.h> 209#include <linux/timer.h>
@@ -403,6 +402,7 @@ static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue);
403static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue); 402static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue);
404static struct apm_user *user_list; 403static struct apm_user *user_list;
405static DEFINE_SPINLOCK(user_list_lock); 404static DEFINE_SPINLOCK(user_list_lock);
405static DEFINE_MUTEX(apm_mutex);
406 406
407/* 407/*
408 * Set up a segment that references the real mode segment 0x40 408 * Set up a segment that references the real mode segment 0x40
@@ -1531,7 +1531,7 @@ static long do_ioctl(struct file *filp, u_int cmd, u_long arg)
1531 return -EPERM; 1531 return -EPERM;
1532 switch (cmd) { 1532 switch (cmd) {
1533 case APM_IOC_STANDBY: 1533 case APM_IOC_STANDBY:
1534 lock_kernel(); 1534 mutex_lock(&apm_mutex);
1535 if (as->standbys_read > 0) { 1535 if (as->standbys_read > 0) {
1536 as->standbys_read--; 1536 as->standbys_read--;
1537 as->standbys_pending--; 1537 as->standbys_pending--;
@@ -1540,10 +1540,10 @@ static long do_ioctl(struct file *filp, u_int cmd, u_long arg)
1540 queue_event(APM_USER_STANDBY, as); 1540 queue_event(APM_USER_STANDBY, as);
1541 if (standbys_pending <= 0) 1541 if (standbys_pending <= 0)
1542 standby(); 1542 standby();
1543 unlock_kernel(); 1543 mutex_unlock(&apm_mutex);
1544 break; 1544 break;
1545 case APM_IOC_SUSPEND: 1545 case APM_IOC_SUSPEND:
1546 lock_kernel(); 1546 mutex_lock(&apm_mutex);
1547 if (as->suspends_read > 0) { 1547 if (as->suspends_read > 0) {
1548 as->suspends_read--; 1548 as->suspends_read--;
1549 as->suspends_pending--; 1549 as->suspends_pending--;
@@ -1552,13 +1552,14 @@ static long do_ioctl(struct file *filp, u_int cmd, u_long arg)
1552 queue_event(APM_USER_SUSPEND, as); 1552 queue_event(APM_USER_SUSPEND, as);
1553 if (suspends_pending <= 0) { 1553 if (suspends_pending <= 0) {
1554 ret = suspend(1); 1554 ret = suspend(1);
1555 mutex_unlock(&apm_mutex);
1555 } else { 1556 } else {
1556 as->suspend_wait = 1; 1557 as->suspend_wait = 1;
1558 mutex_unlock(&apm_mutex);
1557 wait_event_interruptible(apm_suspend_waitqueue, 1559 wait_event_interruptible(apm_suspend_waitqueue,
1558 as->suspend_wait == 0); 1560 as->suspend_wait == 0);
1559 ret = as->suspend_result; 1561 ret = as->suspend_result;
1560 } 1562 }
1561 unlock_kernel();
1562 return ret; 1563 return ret;
1563 default: 1564 default:
1564 return -ENOTTY; 1565 return -ENOTTY;
@@ -1608,12 +1609,10 @@ static int do_open(struct inode *inode, struct file *filp)
1608{ 1609{
1609 struct apm_user *as; 1610 struct apm_user *as;
1610 1611
1611 lock_kernel();
1612 as = kmalloc(sizeof(*as), GFP_KERNEL); 1612 as = kmalloc(sizeof(*as), GFP_KERNEL);
1613 if (as == NULL) { 1613 if (as == NULL) {
1614 printk(KERN_ERR "apm: cannot allocate struct of size %d bytes\n", 1614 printk(KERN_ERR "apm: cannot allocate struct of size %d bytes\n",
1615 sizeof(*as)); 1615 sizeof(*as));
1616 unlock_kernel();
1617 return -ENOMEM; 1616 return -ENOMEM;
1618 } 1617 }
1619 as->magic = APM_BIOS_MAGIC; 1618 as->magic = APM_BIOS_MAGIC;
@@ -1635,7 +1634,6 @@ static int do_open(struct inode *inode, struct file *filp)
1635 user_list = as; 1634 user_list = as;
1636 spin_unlock(&user_list_lock); 1635 spin_unlock(&user_list_lock);
1637 filp->private_data = as; 1636 filp->private_data = as;
1638 unlock_kernel();
1639 return 0; 1637 return 0;
1640} 1638}
1641 1639
@@ -1994,8 +1992,8 @@ static int __init apm_is_horked_d850md(const struct dmi_system_id *d)
1994 apm_info.disabled = 1; 1992 apm_info.disabled = 1;
1995 printk(KERN_INFO "%s machine detected. " 1993 printk(KERN_INFO "%s machine detected. "
1996 "Disabling APM.\n", d->ident); 1994 "Disabling APM.\n", d->ident);
1997 printk(KERN_INFO "This bug is fixed in bios P15 which is available for \n"); 1995 printk(KERN_INFO "This bug is fixed in bios P15 which is available for\n");
1998 printk(KERN_INFO "download from support.intel.com \n"); 1996 printk(KERN_INFO "download from support.intel.com\n");
1999 } 1997 }
2000 return 0; 1998 return 0;
2001} 1999}
diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c
index 63a88e1f987d..8bc57baaa9ad 100644
--- a/arch/x86/kernel/bios_uv.c
+++ b/arch/x86/kernel/bios_uv.c
@@ -15,8 +15,8 @@
15 * along with this program; if not, write to the Free Software 15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 * 17 *
18 * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved. 18 * Copyright (c) 2008-2009 Silicon Graphics, Inc. All Rights Reserved.
19 * Copyright (c) Russ Anderson 19 * Copyright (c) Russ Anderson <rja@sgi.com>
20 */ 20 */
21 21
22#include <linux/efi.h> 22#include <linux/efi.h>
@@ -30,6 +30,7 @@ static struct uv_systab uv_systab;
30s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5) 30s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5)
31{ 31{
32 struct uv_systab *tab = &uv_systab; 32 struct uv_systab *tab = &uv_systab;
33 s64 ret;
33 34
34 if (!tab->function) 35 if (!tab->function)
35 /* 36 /*
@@ -37,9 +38,11 @@ s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5)
37 */ 38 */
38 return BIOS_STATUS_UNIMPLEMENTED; 39 return BIOS_STATUS_UNIMPLEMENTED;
39 40
40 return efi_call6((void *)__va(tab->function), 41 ret = efi_call6((void *)__va(tab->function), (u64)which,
41 (u64)which, a1, a2, a3, a4, a5); 42 a1, a2, a3, a4, a5);
43 return ret;
42} 44}
45EXPORT_SYMBOL_GPL(uv_bios_call);
43 46
44s64 uv_bios_call_irqsave(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, 47s64 uv_bios_call_irqsave(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3,
45 u64 a4, u64 a5) 48 u64 a4, u64 a5)
@@ -73,11 +76,14 @@ long sn_coherency_id;
73EXPORT_SYMBOL_GPL(sn_coherency_id); 76EXPORT_SYMBOL_GPL(sn_coherency_id);
74long sn_region_size; 77long sn_region_size;
75EXPORT_SYMBOL_GPL(sn_region_size); 78EXPORT_SYMBOL_GPL(sn_region_size);
79long system_serial_number;
80EXPORT_SYMBOL_GPL(system_serial_number);
76int uv_type; 81int uv_type;
82EXPORT_SYMBOL_GPL(uv_type);
77 83
78 84
79s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher, 85s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher,
80 long *region) 86 long *region, long *ssn)
81{ 87{
82 s64 ret; 88 s64 ret;
83 u64 v0, v1; 89 u64 v0, v1;
@@ -97,25 +103,24 @@ s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher,
97 *coher = part.coherence_id; 103 *coher = part.coherence_id;
98 if (region) 104 if (region)
99 *region = part.region_size; 105 *region = part.region_size;
106 if (ssn)
107 *ssn = v1;
100 return ret; 108 return ret;
101} 109}
110EXPORT_SYMBOL_GPL(uv_bios_get_sn_info);
102 111
103int 112int
104uv_bios_mq_watchlist_alloc(int blade, unsigned long addr, unsigned int mq_size, 113uv_bios_mq_watchlist_alloc(unsigned long addr, unsigned int mq_size,
105 unsigned long *intr_mmr_offset) 114 unsigned long *intr_mmr_offset)
106{ 115{
107 union uv_watchlist_u size_blade;
108 u64 watchlist; 116 u64 watchlist;
109 s64 ret; 117 s64 ret;
110 118
111 size_blade.size = mq_size;
112 size_blade.blade = blade;
113
114 /* 119 /*
115 * bios returns watchlist number or negative error number. 120 * bios returns watchlist number or negative error number.
116 */ 121 */
117 ret = (int)uv_bios_call_irqsave(UV_BIOS_WATCHLIST_ALLOC, addr, 122 ret = (int)uv_bios_call_irqsave(UV_BIOS_WATCHLIST_ALLOC, addr,
118 size_blade.val, (u64)intr_mmr_offset, 123 mq_size, (u64)intr_mmr_offset,
119 (u64)&watchlist, 0); 124 (u64)&watchlist, 0);
120 if (ret < BIOS_STATUS_SUCCESS) 125 if (ret < BIOS_STATUS_SUCCESS)
121 return ret; 126 return ret;
@@ -158,6 +163,25 @@ s64 uv_bios_freq_base(u64 clock_type, u64 *ticks_per_second)
158} 163}
159EXPORT_SYMBOL_GPL(uv_bios_freq_base); 164EXPORT_SYMBOL_GPL(uv_bios_freq_base);
160 165
166/*
167 * uv_bios_set_legacy_vga_target - Set Legacy VGA I/O Target
168 * @decode: true to enable target, false to disable target
169 * @domain: PCI domain number
170 * @bus: PCI bus number
171 *
172 * Returns:
173 * 0: Success
174 * -EINVAL: Invalid domain or bus number
175 * -ENOSYS: Capability not available
176 * -EBUSY: Legacy VGA I/O cannot be retargeted at this time
177 */
178int uv_bios_set_legacy_vga_target(bool decode, int domain, int bus)
179{
180 return uv_bios_call(UV_BIOS_SET_LEGACY_VGA_TARGET,
181 (u64)decode, (u64)domain, (u64)bus, 0, 0);
182}
183EXPORT_SYMBOL_GPL(uv_bios_set_legacy_vga_target);
184
161 185
162#ifdef CONFIG_EFI 186#ifdef CONFIG_EFI
163void uv_bios_init(void) 187void uv_bios_init(void)
@@ -189,4 +213,3 @@ void uv_bios_init(void)
189 213
190void uv_bios_init(void) { } 214void uv_bios_init(void) { }
191#endif 215#endif
192
diff --git a/arch/x86/kernel/bootflag.c b/arch/x86/kernel/bootflag.c
index 30f25a75fe28..5de7f4c56971 100644
--- a/arch/x86/kernel/bootflag.c
+++ b/arch/x86/kernel/bootflag.c
@@ -5,7 +5,6 @@
5#include <linux/kernel.h> 5#include <linux/kernel.h>
6#include <linux/init.h> 6#include <linux/init.h>
7#include <linux/string.h> 7#include <linux/string.h>
8#include <linux/slab.h>
9#include <linux/spinlock.h> 8#include <linux/spinlock.h>
10#include <linux/acpi.h> 9#include <linux/acpi.h>
11#include <asm/io.h> 10#include <asm/io.h>
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 68537e957a9b..c202b62f3671 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -5,6 +5,7 @@
5# Don't trace early stages of a secondary CPU boot 5# Don't trace early stages of a secondary CPU boot
6ifdef CONFIG_FUNCTION_TRACER 6ifdef CONFIG_FUNCTION_TRACER
7CFLAGS_REMOVE_common.o = -pg 7CFLAGS_REMOVE_common.o = -pg
8CFLAGS_REMOVE_perf_event.o = -pg
8endif 9endif
9 10
10# Make sure load_percpu_segment has no stackprotector 11# Make sure load_percpu_segment has no stackprotector
@@ -18,8 +19,6 @@ obj-y += vmware.o hypervisor.o sched.o
18obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o 19obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o
19obj-$(CONFIG_X86_64) += bugs_64.o 20obj-$(CONFIG_X86_64) += bugs_64.o
20 21
21obj-$(CONFIG_X86_CPU_DEBUG) += cpu_debug.o
22
23obj-$(CONFIG_CPU_SUP_INTEL) += intel.o 22obj-$(CONFIG_CPU_SUP_INTEL) += intel.o
24obj-$(CONFIG_CPU_SUP_AMD) += amd.o 23obj-$(CONFIG_CPU_SUP_AMD) += amd.o
25obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o 24obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index c965e5212714..97ad79cdf688 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -32,6 +32,10 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
32 static const struct cpuid_bit __cpuinitconst cpuid_bits[] = { 32 static const struct cpuid_bit __cpuinitconst cpuid_bits[] = {
33 { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 }, 33 { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 },
34 { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006 }, 34 { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006 },
35 { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a },
36 { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a },
37 { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a },
38 { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a },
35 { 0, 0, 0, 0 } 39 { 0, 0, 0, 0 }
36 }; 40 };
37 41
@@ -74,6 +78,7 @@ void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c)
74 unsigned int eax, ebx, ecx, edx, sub_index; 78 unsigned int eax, ebx, ecx, edx, sub_index;
75 unsigned int ht_mask_width, core_plus_mask_width; 79 unsigned int ht_mask_width, core_plus_mask_width;
76 unsigned int core_select_mask, core_level_siblings; 80 unsigned int core_select_mask, core_level_siblings;
81 static bool printed;
77 82
78 if (c->cpuid_level < 0xb) 83 if (c->cpuid_level < 0xb)
79 return; 84 return;
@@ -127,12 +132,14 @@ void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c)
127 132
128 c->x86_max_cores = (core_level_siblings / smp_num_siblings); 133 c->x86_max_cores = (core_level_siblings / smp_num_siblings);
129 134
130 135 if (!printed) {
131 printk(KERN_INFO "CPU: Physical Processor ID: %d\n", 136 printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
132 c->phys_proc_id); 137 c->phys_proc_id);
133 if (c->x86_max_cores > 1) 138 if (c->x86_max_cores > 1)
134 printk(KERN_INFO "CPU: Processor Core ID: %d\n", 139 printk(KERN_INFO "CPU: Processor Core ID: %d\n",
135 c->cpu_core_id); 140 c->cpu_core_id);
141 printed = 1;
142 }
136 return; 143 return;
137#endif 144#endif
138} 145}
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index c910a716a71c..e485825130d2 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -254,59 +254,36 @@ static int __cpuinit nearby_node(int apicid)
254 254
255/* 255/*
256 * Fixup core topology information for AMD multi-node processors. 256 * Fixup core topology information for AMD multi-node processors.
257 * Assumption 1: Number of cores in each internal node is the same. 257 * Assumption: Number of cores in each internal node is the same.
258 * Assumption 2: Mixed systems with both single-node and dual-node
259 * processors are not supported.
260 */ 258 */
261#ifdef CONFIG_X86_HT 259#ifdef CONFIG_X86_HT
262static void __cpuinit amd_fixup_dcm(struct cpuinfo_x86 *c) 260static void __cpuinit amd_fixup_dcm(struct cpuinfo_x86 *c)
263{ 261{
264#ifdef CONFIG_PCI 262 unsigned long long value;
265 u32 t, cpn; 263 u32 nodes, cores_per_node;
266 u8 n, n_id;
267 int cpu = smp_processor_id(); 264 int cpu = smp_processor_id();
268 265
266 if (!cpu_has(c, X86_FEATURE_NODEID_MSR))
267 return;
268
269 /* fixup topology information only once for a core */ 269 /* fixup topology information only once for a core */
270 if (cpu_has(c, X86_FEATURE_AMD_DCM)) 270 if (cpu_has(c, X86_FEATURE_AMD_DCM))
271 return; 271 return;
272 272
273 /* check for multi-node processor on boot cpu */ 273 rdmsrl(MSR_FAM10H_NODE_ID, value);
274 t = read_pci_config(0, 24, 3, 0xe8); 274
275 if (!(t & (1 << 29))) 275 nodes = ((value >> 3) & 7) + 1;
276 if (nodes == 1)
276 return; 277 return;
277 278
278 set_cpu_cap(c, X86_FEATURE_AMD_DCM); 279 set_cpu_cap(c, X86_FEATURE_AMD_DCM);
280 cores_per_node = c->x86_max_cores / nodes;
279 281
280 /* cores per node: each internal node has half the number of cores */ 282 /* store NodeID, use llc_shared_map to store sibling info */
281 cpn = c->x86_max_cores >> 1; 283 per_cpu(cpu_llc_id, cpu) = value & 7;
282
283 /* even-numbered NB_id of this dual-node processor */
284 n = c->phys_proc_id << 1;
285
286 /*
287 * determine internal node id and assign cores fifty-fifty to
288 * each node of the dual-node processor
289 */
290 t = read_pci_config(0, 24 + n, 3, 0xe8);
291 n = (t>>30) & 0x3;
292 if (n == 0) {
293 if (c->cpu_core_id < cpn)
294 n_id = 0;
295 else
296 n_id = 1;
297 } else {
298 if (c->cpu_core_id < cpn)
299 n_id = 1;
300 else
301 n_id = 0;
302 }
303
304 /* compute entire NodeID, use llc_shared_map to store sibling info */
305 per_cpu(cpu_llc_id, cpu) = (c->phys_proc_id << 1) + n_id;
306 284
307 /* fixup core id to be in range from 0 to cpn */ 285 /* fixup core id to be in range from 0 to (cores_per_node - 1) */
308 c->cpu_core_id = c->cpu_core_id % cpn; 286 c->cpu_core_id = c->cpu_core_id % cores_per_node;
309#endif
310} 287}
311#endif 288#endif
312 289
@@ -375,8 +352,6 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
375 node = nearby_node(apicid); 352 node = nearby_node(apicid);
376 } 353 }
377 numa_set_node(cpu, node); 354 numa_set_node(cpu, node);
378
379 printk(KERN_INFO "CPU %d/0x%x -> Node %d\n", cpu, apicid, node);
380#endif 355#endif
381} 356}
382 357
@@ -535,7 +510,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
535 } 510 }
536 } 511 }
537 512
538 display_cacheinfo(c); 513 cpu_detect_cache_sizes(c);
539 514
540 /* Multi core CPU? */ 515 /* Multi core CPU? */
541 if (c->extended_cpuid_level >= 0x80000008) { 516 if (c->extended_cpuid_level >= 0x80000008) {
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index c95e831bb095..e58d978e0758 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -294,7 +294,7 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c)
294 set_cpu_cap(c, X86_FEATURE_REP_GOOD); 294 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
295 } 295 }
296 296
297 display_cacheinfo(c); 297 cpu_detect_cache_sizes(c);
298} 298}
299 299
300enum { 300enum {
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index cc25c2b4a567..4868e4a951ee 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -61,7 +61,7 @@ void __init setup_cpu_local_masks(void)
61static void __cpuinit default_init(struct cpuinfo_x86 *c) 61static void __cpuinit default_init(struct cpuinfo_x86 *c)
62{ 62{
63#ifdef CONFIG_X86_64 63#ifdef CONFIG_X86_64
64 display_cacheinfo(c); 64 cpu_detect_cache_sizes(c);
65#else 65#else
66 /* Not much we can do here... */ 66 /* Not much we can do here... */
67 /* Check if at least it has cpuid */ 67 /* Check if at least it has cpuid */
@@ -383,7 +383,7 @@ static void __cpuinit get_model_name(struct cpuinfo_x86 *c)
383 } 383 }
384} 384}
385 385
386void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) 386void __cpuinit cpu_detect_cache_sizes(struct cpuinfo_x86 *c)
387{ 387{
388 unsigned int n, dummy, ebx, ecx, edx, l2size; 388 unsigned int n, dummy, ebx, ecx, edx, l2size;
389 389
@@ -391,8 +391,6 @@ void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
391 391
392 if (n >= 0x80000005) { 392 if (n >= 0x80000005) {
393 cpuid(0x80000005, &dummy, &ebx, &ecx, &edx); 393 cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
394 printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
395 edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
396 c->x86_cache_size = (ecx>>24) + (edx>>24); 394 c->x86_cache_size = (ecx>>24) + (edx>>24);
397#ifdef CONFIG_X86_64 395#ifdef CONFIG_X86_64
398 /* On K8 L1 TLB is inclusive, so don't count it */ 396 /* On K8 L1 TLB is inclusive, so don't count it */
@@ -422,9 +420,6 @@ void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
422#endif 420#endif
423 421
424 c->x86_cache_size = l2size; 422 c->x86_cache_size = l2size;
425
426 printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
427 l2size, ecx & 0xFF);
428} 423}
429 424
430void __cpuinit detect_ht(struct cpuinfo_x86 *c) 425void __cpuinit detect_ht(struct cpuinfo_x86 *c)
@@ -432,6 +427,7 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
432#ifdef CONFIG_X86_HT 427#ifdef CONFIG_X86_HT
433 u32 eax, ebx, ecx, edx; 428 u32 eax, ebx, ecx, edx;
434 int index_msb, core_bits; 429 int index_msb, core_bits;
430 static bool printed;
435 431
436 if (!cpu_has(c, X86_FEATURE_HT)) 432 if (!cpu_has(c, X86_FEATURE_HT))
437 return; 433 return;
@@ -447,7 +443,7 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
447 smp_num_siblings = (ebx & 0xff0000) >> 16; 443 smp_num_siblings = (ebx & 0xff0000) >> 16;
448 444
449 if (smp_num_siblings == 1) { 445 if (smp_num_siblings == 1) {
450 printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); 446 printk_once(KERN_INFO "CPU0: Hyper-Threading is disabled\n");
451 goto out; 447 goto out;
452 } 448 }
453 449
@@ -474,11 +470,12 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
474 ((1 << core_bits) - 1); 470 ((1 << core_bits) - 1);
475 471
476out: 472out:
477 if ((c->x86_max_cores * smp_num_siblings) > 1) { 473 if (!printed && (c->x86_max_cores * smp_num_siblings) > 1) {
478 printk(KERN_INFO "CPU: Physical Processor ID: %d\n", 474 printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
479 c->phys_proc_id); 475 c->phys_proc_id);
480 printk(KERN_INFO "CPU: Processor Core ID: %d\n", 476 printk(KERN_INFO "CPU: Processor Core ID: %d\n",
481 c->cpu_core_id); 477 c->cpu_core_id);
478 printed = 1;
482 } 479 }
483#endif 480#endif
484} 481}
@@ -659,24 +656,31 @@ void __init early_cpu_init(void)
659 const struct cpu_dev *const *cdev; 656 const struct cpu_dev *const *cdev;
660 int count = 0; 657 int count = 0;
661 658
659#ifdef PROCESSOR_SELECT
662 printk(KERN_INFO "KERNEL supported cpus:\n"); 660 printk(KERN_INFO "KERNEL supported cpus:\n");
661#endif
662
663 for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) { 663 for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) {
664 const struct cpu_dev *cpudev = *cdev; 664 const struct cpu_dev *cpudev = *cdev;
665 unsigned int j;
666 665
667 if (count >= X86_VENDOR_NUM) 666 if (count >= X86_VENDOR_NUM)
668 break; 667 break;
669 cpu_devs[count] = cpudev; 668 cpu_devs[count] = cpudev;
670 count++; 669 count++;
671 670
672 for (j = 0; j < 2; j++) { 671#ifdef PROCESSOR_SELECT
673 if (!cpudev->c_ident[j]) 672 {
674 continue; 673 unsigned int j;
675 printk(KERN_INFO " %s %s\n", cpudev->c_vendor, 674
676 cpudev->c_ident[j]); 675 for (j = 0; j < 2; j++) {
676 if (!cpudev->c_ident[j])
677 continue;
678 printk(KERN_INFO " %s %s\n", cpudev->c_vendor,
679 cpudev->c_ident[j]);
680 }
677 } 681 }
682#endif
678 } 683 }
679
680 early_identify_cpu(&boot_cpu_data); 684 early_identify_cpu(&boot_cpu_data);
681} 685}
682 686
@@ -837,10 +841,8 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
837 boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; 841 boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
838 } 842 }
839 843
840#ifdef CONFIG_X86_MCE
841 /* Init Machine Check Exception if available. */ 844 /* Init Machine Check Exception if available. */
842 mcheck_init(c); 845 mcheck_cpu_init(c);
843#endif
844 846
845 select_idle_routine(c); 847 select_idle_routine(c);
846 848
@@ -1093,7 +1095,7 @@ static void clear_all_debug_regs(void)
1093 1095
1094void __cpuinit cpu_init(void) 1096void __cpuinit cpu_init(void)
1095{ 1097{
1096 struct orig_ist *orig_ist; 1098 struct orig_ist *oist;
1097 struct task_struct *me; 1099 struct task_struct *me;
1098 struct tss_struct *t; 1100 struct tss_struct *t;
1099 unsigned long v; 1101 unsigned long v;
@@ -1102,7 +1104,7 @@ void __cpuinit cpu_init(void)
1102 1104
1103 cpu = stack_smp_processor_id(); 1105 cpu = stack_smp_processor_id();
1104 t = &per_cpu(init_tss, cpu); 1106 t = &per_cpu(init_tss, cpu);
1105 orig_ist = &per_cpu(orig_ist, cpu); 1107 oist = &per_cpu(orig_ist, cpu);
1106 1108
1107#ifdef CONFIG_NUMA 1109#ifdef CONFIG_NUMA
1108 if (cpu != 0 && percpu_read(node_number) == 0 && 1110 if (cpu != 0 && percpu_read(node_number) == 0 &&
@@ -1115,7 +1117,7 @@ void __cpuinit cpu_init(void)
1115 if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask)) 1117 if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask))
1116 panic("CPU#%d already initialized!\n", cpu); 1118 panic("CPU#%d already initialized!\n", cpu);
1117 1119
1118 printk(KERN_INFO "Initializing CPU#%d\n", cpu); 1120 pr_debug("Initializing CPU#%d\n", cpu);
1119 1121
1120 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); 1122 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
1121 1123
@@ -1136,19 +1138,19 @@ void __cpuinit cpu_init(void)
1136 wrmsrl(MSR_KERNEL_GS_BASE, 0); 1138 wrmsrl(MSR_KERNEL_GS_BASE, 0);
1137 barrier(); 1139 barrier();
1138 1140
1139 check_efer(); 1141 x86_configure_nx();
1140 if (cpu != 0) 1142 if (cpu != 0)
1141 enable_x2apic(); 1143 enable_x2apic();
1142 1144
1143 /* 1145 /*
1144 * set up and load the per-CPU TSS 1146 * set up and load the per-CPU TSS
1145 */ 1147 */
1146 if (!orig_ist->ist[0]) { 1148 if (!oist->ist[0]) {
1147 char *estacks = per_cpu(exception_stacks, cpu); 1149 char *estacks = per_cpu(exception_stacks, cpu);
1148 1150
1149 for (v = 0; v < N_EXCEPTION_STACKS; v++) { 1151 for (v = 0; v < N_EXCEPTION_STACKS; v++) {
1150 estacks += exception_stack_sizes[v]; 1152 estacks += exception_stack_sizes[v];
1151 orig_ist->ist[v] = t->x86_tss.ist[v] = 1153 oist->ist[v] = t->x86_tss.ist[v] =
1152 (unsigned long)estacks; 1154 (unsigned long)estacks;
1153 } 1155 }
1154 } 1156 }
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 6de9a908e400..3624e8a0f71b 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -32,6 +32,6 @@ struct cpu_dev {
32extern const struct cpu_dev *const __x86_cpu_dev_start[], 32extern const struct cpu_dev *const __x86_cpu_dev_start[],
33 *const __x86_cpu_dev_end[]; 33 *const __x86_cpu_dev_end[];
34 34
35extern void display_cacheinfo(struct cpuinfo_x86 *c); 35extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c);
36 36
37#endif 37#endif
diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c
deleted file mode 100644
index dca325c03999..000000000000
--- a/arch/x86/kernel/cpu/cpu_debug.c
+++ /dev/null
@@ -1,688 +0,0 @@
1/*
2 * CPU x86 architecture debug code
3 *
4 * Copyright(C) 2009 Jaswinder Singh Rajput
5 *
6 * For licencing details see kernel-base/COPYING
7 */
8
9#include <linux/interrupt.h>
10#include <linux/compiler.h>
11#include <linux/seq_file.h>
12#include <linux/debugfs.h>
13#include <linux/kprobes.h>
14#include <linux/uaccess.h>
15#include <linux/kernel.h>
16#include <linux/module.h>
17#include <linux/percpu.h>
18#include <linux/signal.h>
19#include <linux/errno.h>
20#include <linux/sched.h>
21#include <linux/types.h>
22#include <linux/init.h>
23#include <linux/slab.h>
24#include <linux/smp.h>
25
26#include <asm/cpu_debug.h>
27#include <asm/paravirt.h>
28#include <asm/system.h>
29#include <asm/traps.h>
30#include <asm/apic.h>
31#include <asm/desc.h>
32
33static DEFINE_PER_CPU(struct cpu_cpuX_base [CPU_REG_ALL_BIT], cpu_arr);
34static DEFINE_PER_CPU(struct cpu_private * [MAX_CPU_FILES], priv_arr);
35static DEFINE_PER_CPU(int, cpu_priv_count);
36
37static DEFINE_MUTEX(cpu_debug_lock);
38
39static struct dentry *cpu_debugfs_dir;
40
41static struct cpu_debug_base cpu_base[] = {
42 { "mc", CPU_MC, 0 },
43 { "monitor", CPU_MONITOR, 0 },
44 { "time", CPU_TIME, 0 },
45 { "pmc", CPU_PMC, 1 },
46 { "platform", CPU_PLATFORM, 0 },
47 { "apic", CPU_APIC, 0 },
48 { "poweron", CPU_POWERON, 0 },
49 { "control", CPU_CONTROL, 0 },
50 { "features", CPU_FEATURES, 0 },
51 { "lastbranch", CPU_LBRANCH, 0 },
52 { "bios", CPU_BIOS, 0 },
53 { "freq", CPU_FREQ, 0 },
54 { "mtrr", CPU_MTRR, 0 },
55 { "perf", CPU_PERF, 0 },
56 { "cache", CPU_CACHE, 0 },
57 { "sysenter", CPU_SYSENTER, 0 },
58 { "therm", CPU_THERM, 0 },
59 { "misc", CPU_MISC, 0 },
60 { "debug", CPU_DEBUG, 0 },
61 { "pat", CPU_PAT, 0 },
62 { "vmx", CPU_VMX, 0 },
63 { "call", CPU_CALL, 0 },
64 { "base", CPU_BASE, 0 },
65 { "ver", CPU_VER, 0 },
66 { "conf", CPU_CONF, 0 },
67 { "smm", CPU_SMM, 0 },
68 { "svm", CPU_SVM, 0 },
69 { "osvm", CPU_OSVM, 0 },
70 { "tss", CPU_TSS, 0 },
71 { "cr", CPU_CR, 0 },
72 { "dt", CPU_DT, 0 },
73 { "registers", CPU_REG_ALL, 0 },
74};
75
76static struct cpu_file_base cpu_file[] = {
77 { "index", CPU_REG_ALL, 0 },
78 { "value", CPU_REG_ALL, 1 },
79};
80
81/* CPU Registers Range */
82static struct cpu_debug_range cpu_reg_range[] = {
83 { 0x00000000, 0x00000001, CPU_MC, },
84 { 0x00000006, 0x00000007, CPU_MONITOR, },
85 { 0x00000010, 0x00000010, CPU_TIME, },
86 { 0x00000011, 0x00000013, CPU_PMC, },
87 { 0x00000017, 0x00000017, CPU_PLATFORM, },
88 { 0x0000001B, 0x0000001B, CPU_APIC, },
89 { 0x0000002A, 0x0000002B, CPU_POWERON, },
90 { 0x0000002C, 0x0000002C, CPU_FREQ, },
91 { 0x0000003A, 0x0000003A, CPU_CONTROL, },
92 { 0x00000040, 0x00000047, CPU_LBRANCH, },
93 { 0x00000060, 0x00000067, CPU_LBRANCH, },
94 { 0x00000079, 0x00000079, CPU_BIOS, },
95 { 0x00000088, 0x0000008A, CPU_CACHE, },
96 { 0x0000008B, 0x0000008B, CPU_BIOS, },
97 { 0x0000009B, 0x0000009B, CPU_MONITOR, },
98 { 0x000000C1, 0x000000C4, CPU_PMC, },
99 { 0x000000CD, 0x000000CD, CPU_FREQ, },
100 { 0x000000E7, 0x000000E8, CPU_PERF, },
101 { 0x000000FE, 0x000000FE, CPU_MTRR, },
102
103 { 0x00000116, 0x0000011E, CPU_CACHE, },
104 { 0x00000174, 0x00000176, CPU_SYSENTER, },
105 { 0x00000179, 0x0000017B, CPU_MC, },
106 { 0x00000186, 0x00000189, CPU_PMC, },
107 { 0x00000198, 0x00000199, CPU_PERF, },
108 { 0x0000019A, 0x0000019A, CPU_TIME, },
109 { 0x0000019B, 0x0000019D, CPU_THERM, },
110 { 0x000001A0, 0x000001A0, CPU_MISC, },
111 { 0x000001C9, 0x000001C9, CPU_LBRANCH, },
112 { 0x000001D7, 0x000001D8, CPU_LBRANCH, },
113 { 0x000001D9, 0x000001D9, CPU_DEBUG, },
114 { 0x000001DA, 0x000001E0, CPU_LBRANCH, },
115
116 { 0x00000200, 0x0000020F, CPU_MTRR, },
117 { 0x00000250, 0x00000250, CPU_MTRR, },
118 { 0x00000258, 0x00000259, CPU_MTRR, },
119 { 0x00000268, 0x0000026F, CPU_MTRR, },
120 { 0x00000277, 0x00000277, CPU_PAT, },
121 { 0x000002FF, 0x000002FF, CPU_MTRR, },
122
123 { 0x00000300, 0x00000311, CPU_PMC, },
124 { 0x00000345, 0x00000345, CPU_PMC, },
125 { 0x00000360, 0x00000371, CPU_PMC, },
126 { 0x0000038D, 0x00000390, CPU_PMC, },
127 { 0x000003A0, 0x000003BE, CPU_PMC, },
128 { 0x000003C0, 0x000003CD, CPU_PMC, },
129 { 0x000003E0, 0x000003E1, CPU_PMC, },
130 { 0x000003F0, 0x000003F2, CPU_PMC, },
131
132 { 0x00000400, 0x00000417, CPU_MC, },
133 { 0x00000480, 0x0000048B, CPU_VMX, },
134
135 { 0x00000600, 0x00000600, CPU_DEBUG, },
136 { 0x00000680, 0x0000068F, CPU_LBRANCH, },
137 { 0x000006C0, 0x000006CF, CPU_LBRANCH, },
138
139 { 0x000107CC, 0x000107D3, CPU_PMC, },
140
141 { 0xC0000080, 0xC0000080, CPU_FEATURES, },
142 { 0xC0000081, 0xC0000084, CPU_CALL, },
143 { 0xC0000100, 0xC0000102, CPU_BASE, },
144 { 0xC0000103, 0xC0000103, CPU_TIME, },
145
146 { 0xC0010000, 0xC0010007, CPU_PMC, },
147 { 0xC0010010, 0xC0010010, CPU_CONF, },
148 { 0xC0010015, 0xC0010015, CPU_CONF, },
149 { 0xC0010016, 0xC001001A, CPU_MTRR, },
150 { 0xC001001D, 0xC001001D, CPU_MTRR, },
151 { 0xC001001F, 0xC001001F, CPU_CONF, },
152 { 0xC0010030, 0xC0010035, CPU_BIOS, },
153 { 0xC0010044, 0xC0010048, CPU_MC, },
154 { 0xC0010050, 0xC0010056, CPU_SMM, },
155 { 0xC0010058, 0xC0010058, CPU_CONF, },
156 { 0xC0010060, 0xC0010060, CPU_CACHE, },
157 { 0xC0010061, 0xC0010068, CPU_SMM, },
158 { 0xC0010069, 0xC001006B, CPU_SMM, },
159 { 0xC0010070, 0xC0010071, CPU_SMM, },
160 { 0xC0010111, 0xC0010113, CPU_SMM, },
161 { 0xC0010114, 0xC0010118, CPU_SVM, },
162 { 0xC0010140, 0xC0010141, CPU_OSVM, },
163 { 0xC0011022, 0xC0011023, CPU_CONF, },
164};
165
166static int is_typeflag_valid(unsigned cpu, unsigned flag)
167{
168 int i;
169
170 /* Standard Registers should be always valid */
171 if (flag >= CPU_TSS)
172 return 1;
173
174 for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) {
175 if (cpu_reg_range[i].flag == flag)
176 return 1;
177 }
178
179 /* Invalid */
180 return 0;
181}
182
183static unsigned get_cpu_range(unsigned cpu, unsigned *min, unsigned *max,
184 int index, unsigned flag)
185{
186 if (cpu_reg_range[index].flag == flag) {
187 *min = cpu_reg_range[index].min;
188 *max = cpu_reg_range[index].max;
189 } else
190 *max = 0;
191
192 return *max;
193}
194
195/* This function can also be called with seq = NULL for printk */
196static void print_cpu_data(struct seq_file *seq, unsigned type,
197 u32 low, u32 high)
198{
199 struct cpu_private *priv;
200 u64 val = high;
201
202 if (seq) {
203 priv = seq->private;
204 if (priv->file) {
205 val = (val << 32) | low;
206 seq_printf(seq, "0x%llx\n", val);
207 } else
208 seq_printf(seq, " %08x: %08x_%08x\n",
209 type, high, low);
210 } else
211 printk(KERN_INFO " %08x: %08x_%08x\n", type, high, low);
212}
213
214/* This function can also be called with seq = NULL for printk */
215static void print_msr(struct seq_file *seq, unsigned cpu, unsigned flag)
216{
217 unsigned msr, msr_min, msr_max;
218 struct cpu_private *priv;
219 u32 low, high;
220 int i;
221
222 if (seq) {
223 priv = seq->private;
224 if (priv->file) {
225 if (!rdmsr_safe_on_cpu(priv->cpu, priv->reg,
226 &low, &high))
227 print_cpu_data(seq, priv->reg, low, high);
228 return;
229 }
230 }
231
232 for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) {
233 if (!get_cpu_range(cpu, &msr_min, &msr_max, i, flag))
234 continue;
235
236 for (msr = msr_min; msr <= msr_max; msr++) {
237 if (rdmsr_safe_on_cpu(cpu, msr, &low, &high))
238 continue;
239 print_cpu_data(seq, msr, low, high);
240 }
241 }
242}
243
244static void print_tss(void *arg)
245{
246 struct pt_regs *regs = task_pt_regs(current);
247 struct seq_file *seq = arg;
248 unsigned int seg;
249
250 seq_printf(seq, " RAX\t: %016lx\n", regs->ax);
251 seq_printf(seq, " RBX\t: %016lx\n", regs->bx);
252 seq_printf(seq, " RCX\t: %016lx\n", regs->cx);
253 seq_printf(seq, " RDX\t: %016lx\n", regs->dx);
254
255 seq_printf(seq, " RSI\t: %016lx\n", regs->si);
256 seq_printf(seq, " RDI\t: %016lx\n", regs->di);
257 seq_printf(seq, " RBP\t: %016lx\n", regs->bp);
258 seq_printf(seq, " ESP\t: %016lx\n", regs->sp);
259
260#ifdef CONFIG_X86_64
261 seq_printf(seq, " R08\t: %016lx\n", regs->r8);
262 seq_printf(seq, " R09\t: %016lx\n", regs->r9);
263 seq_printf(seq, " R10\t: %016lx\n", regs->r10);
264 seq_printf(seq, " R11\t: %016lx\n", regs->r11);
265 seq_printf(seq, " R12\t: %016lx\n", regs->r12);
266 seq_printf(seq, " R13\t: %016lx\n", regs->r13);
267 seq_printf(seq, " R14\t: %016lx\n", regs->r14);
268 seq_printf(seq, " R15\t: %016lx\n", regs->r15);
269#endif
270
271 asm("movl %%cs,%0" : "=r" (seg));
272 seq_printf(seq, " CS\t: %04x\n", seg);
273 asm("movl %%ds,%0" : "=r" (seg));
274 seq_printf(seq, " DS\t: %04x\n", seg);
275 seq_printf(seq, " SS\t: %04lx\n", regs->ss & 0xffff);
276 asm("movl %%es,%0" : "=r" (seg));
277 seq_printf(seq, " ES\t: %04x\n", seg);
278 asm("movl %%fs,%0" : "=r" (seg));
279 seq_printf(seq, " FS\t: %04x\n", seg);
280 asm("movl %%gs,%0" : "=r" (seg));
281 seq_printf(seq, " GS\t: %04x\n", seg);
282
283 seq_printf(seq, " EFLAGS\t: %016lx\n", regs->flags);
284
285 seq_printf(seq, " EIP\t: %016lx\n", regs->ip);
286}
287
288static void print_cr(void *arg)
289{
290 struct seq_file *seq = arg;
291
292 seq_printf(seq, " cr0\t: %016lx\n", read_cr0());
293 seq_printf(seq, " cr2\t: %016lx\n", read_cr2());
294 seq_printf(seq, " cr3\t: %016lx\n", read_cr3());
295 seq_printf(seq, " cr4\t: %016lx\n", read_cr4_safe());
296#ifdef CONFIG_X86_64
297 seq_printf(seq, " cr8\t: %016lx\n", read_cr8());
298#endif
299}
300
301static void print_desc_ptr(char *str, struct seq_file *seq, struct desc_ptr dt)
302{
303 seq_printf(seq, " %s\t: %016llx\n", str, (u64)(dt.address | dt.size));
304}
305
306static void print_dt(void *seq)
307{
308 struct desc_ptr dt;
309 unsigned long ldt;
310
311 /* IDT */
312 store_idt((struct desc_ptr *)&dt);
313 print_desc_ptr("IDT", seq, dt);
314
315 /* GDT */
316 store_gdt((struct desc_ptr *)&dt);
317 print_desc_ptr("GDT", seq, dt);
318
319 /* LDT */
320 store_ldt(ldt);
321 seq_printf(seq, " LDT\t: %016lx\n", ldt);
322
323 /* TR */
324 store_tr(ldt);
325 seq_printf(seq, " TR\t: %016lx\n", ldt);
326}
327
328static void print_dr(void *arg)
329{
330 struct seq_file *seq = arg;
331 unsigned long dr;
332 int i;
333
334 for (i = 0; i < 8; i++) {
335 /* Ignore db4, db5 */
336 if ((i == 4) || (i == 5))
337 continue;
338 get_debugreg(dr, i);
339 seq_printf(seq, " dr%d\t: %016lx\n", i, dr);
340 }
341
342 seq_printf(seq, "\n MSR\t:\n");
343}
344
345static void print_apic(void *arg)
346{
347 struct seq_file *seq = arg;
348
349#ifdef CONFIG_X86_LOCAL_APIC
350 seq_printf(seq, " LAPIC\t:\n");
351 seq_printf(seq, " ID\t\t: %08x\n", apic_read(APIC_ID) >> 24);
352 seq_printf(seq, " LVR\t\t: %08x\n", apic_read(APIC_LVR));
353 seq_printf(seq, " TASKPRI\t: %08x\n", apic_read(APIC_TASKPRI));
354 seq_printf(seq, " ARBPRI\t\t: %08x\n", apic_read(APIC_ARBPRI));
355 seq_printf(seq, " PROCPRI\t: %08x\n", apic_read(APIC_PROCPRI));
356 seq_printf(seq, " LDR\t\t: %08x\n", apic_read(APIC_LDR));
357 seq_printf(seq, " DFR\t\t: %08x\n", apic_read(APIC_DFR));
358 seq_printf(seq, " SPIV\t\t: %08x\n", apic_read(APIC_SPIV));
359 seq_printf(seq, " ISR\t\t: %08x\n", apic_read(APIC_ISR));
360 seq_printf(seq, " ESR\t\t: %08x\n", apic_read(APIC_ESR));
361 seq_printf(seq, " ICR\t\t: %08x\n", apic_read(APIC_ICR));
362 seq_printf(seq, " ICR2\t\t: %08x\n", apic_read(APIC_ICR2));
363 seq_printf(seq, " LVTT\t\t: %08x\n", apic_read(APIC_LVTT));
364 seq_printf(seq, " LVTTHMR\t: %08x\n", apic_read(APIC_LVTTHMR));
365 seq_printf(seq, " LVTPC\t\t: %08x\n", apic_read(APIC_LVTPC));
366 seq_printf(seq, " LVT0\t\t: %08x\n", apic_read(APIC_LVT0));
367 seq_printf(seq, " LVT1\t\t: %08x\n", apic_read(APIC_LVT1));
368 seq_printf(seq, " LVTERR\t\t: %08x\n", apic_read(APIC_LVTERR));
369 seq_printf(seq, " TMICT\t\t: %08x\n", apic_read(APIC_TMICT));
370 seq_printf(seq, " TMCCT\t\t: %08x\n", apic_read(APIC_TMCCT));
371 seq_printf(seq, " TDCR\t\t: %08x\n", apic_read(APIC_TDCR));
372 if (boot_cpu_has(X86_FEATURE_EXTAPIC)) {
373 unsigned int i, v, maxeilvt;
374
375 v = apic_read(APIC_EFEAT);
376 maxeilvt = (v >> 16) & 0xff;
377 seq_printf(seq, " EFEAT\t\t: %08x\n", v);
378 seq_printf(seq, " ECTRL\t\t: %08x\n", apic_read(APIC_ECTRL));
379
380 for (i = 0; i < maxeilvt; i++) {
381 v = apic_read(APIC_EILVTn(i));
382 seq_printf(seq, " EILVT%d\t\t: %08x\n", i, v);
383 }
384 }
385#endif /* CONFIG_X86_LOCAL_APIC */
386 seq_printf(seq, "\n MSR\t:\n");
387}
388
389static int cpu_seq_show(struct seq_file *seq, void *v)
390{
391 struct cpu_private *priv = seq->private;
392
393 if (priv == NULL)
394 return -EINVAL;
395
396 switch (cpu_base[priv->type].flag) {
397 case CPU_TSS:
398 smp_call_function_single(priv->cpu, print_tss, seq, 1);
399 break;
400 case CPU_CR:
401 smp_call_function_single(priv->cpu, print_cr, seq, 1);
402 break;
403 case CPU_DT:
404 smp_call_function_single(priv->cpu, print_dt, seq, 1);
405 break;
406 case CPU_DEBUG:
407 if (priv->file == CPU_INDEX_BIT)
408 smp_call_function_single(priv->cpu, print_dr, seq, 1);
409 print_msr(seq, priv->cpu, cpu_base[priv->type].flag);
410 break;
411 case CPU_APIC:
412 if (priv->file == CPU_INDEX_BIT)
413 smp_call_function_single(priv->cpu, print_apic, seq, 1);
414 print_msr(seq, priv->cpu, cpu_base[priv->type].flag);
415 break;
416
417 default:
418 print_msr(seq, priv->cpu, cpu_base[priv->type].flag);
419 break;
420 }
421 seq_printf(seq, "\n");
422
423 return 0;
424}
425
426static void *cpu_seq_start(struct seq_file *seq, loff_t *pos)
427{
428 if (*pos == 0) /* One time is enough ;-) */
429 return seq;
430
431 return NULL;
432}
433
434static void *cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
435{
436 (*pos)++;
437
438 return cpu_seq_start(seq, pos);
439}
440
441static void cpu_seq_stop(struct seq_file *seq, void *v)
442{
443}
444
445static const struct seq_operations cpu_seq_ops = {
446 .start = cpu_seq_start,
447 .next = cpu_seq_next,
448 .stop = cpu_seq_stop,
449 .show = cpu_seq_show,
450};
451
452static int cpu_seq_open(struct inode *inode, struct file *file)
453{
454 struct cpu_private *priv = inode->i_private;
455 struct seq_file *seq;
456 int err;
457
458 err = seq_open(file, &cpu_seq_ops);
459 if (!err) {
460 seq = file->private_data;
461 seq->private = priv;
462 }
463
464 return err;
465}
466
467static int write_msr(struct cpu_private *priv, u64 val)
468{
469 u32 low, high;
470
471 high = (val >> 32) & 0xffffffff;
472 low = val & 0xffffffff;
473
474 if (!wrmsr_safe_on_cpu(priv->cpu, priv->reg, low, high))
475 return 0;
476
477 return -EPERM;
478}
479
480static int write_cpu_register(struct cpu_private *priv, const char *buf)
481{
482 int ret = -EPERM;
483 u64 val;
484
485 ret = strict_strtoull(buf, 0, &val);
486 if (ret < 0)
487 return ret;
488
489 /* Supporting only MSRs */
490 if (priv->type < CPU_TSS_BIT)
491 return write_msr(priv, val);
492
493 return ret;
494}
495
496static ssize_t cpu_write(struct file *file, const char __user *ubuf,
497 size_t count, loff_t *off)
498{
499 struct seq_file *seq = file->private_data;
500 struct cpu_private *priv = seq->private;
501 char buf[19];
502
503 if ((priv == NULL) || (count >= sizeof(buf)))
504 return -EINVAL;
505
506 if (copy_from_user(&buf, ubuf, count))
507 return -EFAULT;
508
509 buf[count] = 0;
510
511 if ((cpu_base[priv->type].write) && (cpu_file[priv->file].write))
512 if (!write_cpu_register(priv, buf))
513 return count;
514
515 return -EACCES;
516}
517
518static const struct file_operations cpu_fops = {
519 .owner = THIS_MODULE,
520 .open = cpu_seq_open,
521 .read = seq_read,
522 .write = cpu_write,
523 .llseek = seq_lseek,
524 .release = seq_release,
525};
526
527static int cpu_create_file(unsigned cpu, unsigned type, unsigned reg,
528 unsigned file, struct dentry *dentry)
529{
530 struct cpu_private *priv = NULL;
531
532 /* Already intialized */
533 if (file == CPU_INDEX_BIT)
534 if (per_cpu(cpu_arr[type].init, cpu))
535 return 0;
536
537 priv = kzalloc(sizeof(*priv), GFP_KERNEL);
538 if (priv == NULL)
539 return -ENOMEM;
540
541 priv->cpu = cpu;
542 priv->type = type;
543 priv->reg = reg;
544 priv->file = file;
545 mutex_lock(&cpu_debug_lock);
546 per_cpu(priv_arr[type], cpu) = priv;
547 per_cpu(cpu_priv_count, cpu)++;
548 mutex_unlock(&cpu_debug_lock);
549
550 if (file)
551 debugfs_create_file(cpu_file[file].name, S_IRUGO,
552 dentry, (void *)priv, &cpu_fops);
553 else {
554 debugfs_create_file(cpu_base[type].name, S_IRUGO,
555 per_cpu(cpu_arr[type].dentry, cpu),
556 (void *)priv, &cpu_fops);
557 mutex_lock(&cpu_debug_lock);
558 per_cpu(cpu_arr[type].init, cpu) = 1;
559 mutex_unlock(&cpu_debug_lock);
560 }
561
562 return 0;
563}
564
565static int cpu_init_regfiles(unsigned cpu, unsigned int type, unsigned reg,
566 struct dentry *dentry)
567{
568 unsigned file;
569 int err = 0;
570
571 for (file = 0; file < ARRAY_SIZE(cpu_file); file++) {
572 err = cpu_create_file(cpu, type, reg, file, dentry);
573 if (err)
574 return err;
575 }
576
577 return err;
578}
579
580static int cpu_init_msr(unsigned cpu, unsigned type, struct dentry *dentry)
581{
582 struct dentry *cpu_dentry = NULL;
583 unsigned reg, reg_min, reg_max;
584 int i, err = 0;
585 char reg_dir[12];
586 u32 low, high;
587
588 for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) {
589 if (!get_cpu_range(cpu, &reg_min, &reg_max, i,
590 cpu_base[type].flag))
591 continue;
592
593 for (reg = reg_min; reg <= reg_max; reg++) {
594 if (rdmsr_safe_on_cpu(cpu, reg, &low, &high))
595 continue;
596
597 sprintf(reg_dir, "0x%x", reg);
598 cpu_dentry = debugfs_create_dir(reg_dir, dentry);
599 err = cpu_init_regfiles(cpu, type, reg, cpu_dentry);
600 if (err)
601 return err;
602 }
603 }
604
605 return err;
606}
607
608static int cpu_init_allreg(unsigned cpu, struct dentry *dentry)
609{
610 struct dentry *cpu_dentry = NULL;
611 unsigned type;
612 int err = 0;
613
614 for (type = 0; type < ARRAY_SIZE(cpu_base) - 1; type++) {
615 if (!is_typeflag_valid(cpu, cpu_base[type].flag))
616 continue;
617 cpu_dentry = debugfs_create_dir(cpu_base[type].name, dentry);
618 per_cpu(cpu_arr[type].dentry, cpu) = cpu_dentry;
619
620 if (type < CPU_TSS_BIT)
621 err = cpu_init_msr(cpu, type, cpu_dentry);
622 else
623 err = cpu_create_file(cpu, type, 0, CPU_INDEX_BIT,
624 cpu_dentry);
625 if (err)
626 return err;
627 }
628
629 return err;
630}
631
632static int cpu_init_cpu(void)
633{
634 struct dentry *cpu_dentry = NULL;
635 struct cpuinfo_x86 *cpui;
636 char cpu_dir[12];
637 unsigned cpu;
638 int err = 0;
639
640 for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
641 cpui = &cpu_data(cpu);
642 if (!cpu_has(cpui, X86_FEATURE_MSR))
643 continue;
644
645 sprintf(cpu_dir, "cpu%d", cpu);
646 cpu_dentry = debugfs_create_dir(cpu_dir, cpu_debugfs_dir);
647 err = cpu_init_allreg(cpu, cpu_dentry);
648
649 pr_info("cpu%d(%d) debug files %d\n",
650 cpu, nr_cpu_ids, per_cpu(cpu_priv_count, cpu));
651 if (per_cpu(cpu_priv_count, cpu) > MAX_CPU_FILES) {
652 pr_err("Register files count %d exceeds limit %d\n",
653 per_cpu(cpu_priv_count, cpu), MAX_CPU_FILES);
654 per_cpu(cpu_priv_count, cpu) = MAX_CPU_FILES;
655 err = -ENFILE;
656 }
657 if (err)
658 return err;
659 }
660
661 return err;
662}
663
664static int __init cpu_debug_init(void)
665{
666 cpu_debugfs_dir = debugfs_create_dir("cpu", arch_debugfs_dir);
667
668 return cpu_init_cpu();
669}
670
671static void __exit cpu_debug_exit(void)
672{
673 int i, cpu;
674
675 if (cpu_debugfs_dir)
676 debugfs_remove_recursive(cpu_debugfs_dir);
677
678 for (cpu = 0; cpu < nr_cpu_ids; cpu++)
679 for (i = 0; i < per_cpu(cpu_priv_count, cpu); i++)
680 kfree(per_cpu(priv_arr[i], cpu));
681}
682
683module_init(cpu_debug_init);
684module_exit(cpu_debug_exit);
685
686MODULE_AUTHOR("Jaswinder Singh Rajput");
687MODULE_DESCRIPTION("CPU Debug module");
688MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig
index f138c6c389b9..870e6cc6ad28 100644
--- a/arch/x86/kernel/cpu/cpufreq/Kconfig
+++ b/arch/x86/kernel/cpu/cpufreq/Kconfig
@@ -10,6 +10,20 @@ if CPU_FREQ
10 10
11comment "CPUFreq processor drivers" 11comment "CPUFreq processor drivers"
12 12
13config X86_PCC_CPUFREQ
14 tristate "Processor Clocking Control interface driver"
15 depends on ACPI && ACPI_PROCESSOR
16 help
17 This driver adds support for the PCC interface.
18
19 For details, take a look at:
20 <file:Documentation/cpu-freq/pcc-cpufreq.txt>.
21
22 To compile this driver as a module, choose M here: the
23 module will be called pcc-cpufreq.
24
25 If in doubt, say N.
26
13config X86_ACPI_CPUFREQ 27config X86_ACPI_CPUFREQ
14 tristate "ACPI Processor P-States driver" 28 tristate "ACPI Processor P-States driver"
15 select CPU_FREQ_TABLE 29 select CPU_FREQ_TABLE
diff --git a/arch/x86/kernel/cpu/cpufreq/Makefile b/arch/x86/kernel/cpu/cpufreq/Makefile
index 509296df294d..1840c0a5170b 100644
--- a/arch/x86/kernel/cpu/cpufreq/Makefile
+++ b/arch/x86/kernel/cpu/cpufreq/Makefile
@@ -4,6 +4,7 @@
4 4
5obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o 5obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o
6obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o 6obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o
7obj-$(CONFIG_X86_PCC_CPUFREQ) += pcc-cpufreq.o
7obj-$(CONFIG_X86_POWERNOW_K6) += powernow-k6.o 8obj-$(CONFIG_X86_POWERNOW_K6) += powernow-k6.o
8obj-$(CONFIG_X86_POWERNOW_K7) += powernow-k7.o 9obj-$(CONFIG_X86_POWERNOW_K7) += powernow-k7.o
9obj-$(CONFIG_X86_LONGHAUL) += longhaul.o 10obj-$(CONFIG_X86_LONGHAUL) += longhaul.o
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 8b581d3905cb..459168083b77 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -33,6 +33,7 @@
33#include <linux/cpufreq.h> 33#include <linux/cpufreq.h>
34#include <linux/compiler.h> 34#include <linux/compiler.h>
35#include <linux/dmi.h> 35#include <linux/dmi.h>
36#include <linux/slab.h>
36#include <trace/events/power.h> 37#include <trace/events/power.h>
37 38
38#include <linux/acpi.h> 39#include <linux/acpi.h>
@@ -68,9 +69,9 @@ struct acpi_cpufreq_data {
68 unsigned int cpu_feature; 69 unsigned int cpu_feature;
69}; 70};
70 71
71static DEFINE_PER_CPU(struct acpi_cpufreq_data *, drv_data); 72static DEFINE_PER_CPU(struct acpi_cpufreq_data *, acfreq_data);
72 73
73static DEFINE_PER_CPU(struct aperfmperf, old_perf); 74static DEFINE_PER_CPU(struct aperfmperf, acfreq_old_perf);
74 75
75/* acpi_perf_data is a pointer to percpu data. */ 76/* acpi_perf_data is a pointer to percpu data. */
76static struct acpi_processor_performance *acpi_perf_data; 77static struct acpi_processor_performance *acpi_perf_data;
@@ -190,9 +191,11 @@ static void do_drv_write(void *_cmd)
190 191
191static void drv_read(struct drv_cmd *cmd) 192static void drv_read(struct drv_cmd *cmd)
192{ 193{
194 int err;
193 cmd->val = 0; 195 cmd->val = 0;
194 196
195 smp_call_function_single(cpumask_any(cmd->mask), do_drv_read, cmd, 1); 197 err = smp_call_function_any(cmd->mask, do_drv_read, cmd, 1);
198 WARN_ON_ONCE(err); /* smp_call_function_any() was buggy? */
196} 199}
197 200
198static void drv_write(struct drv_cmd *cmd) 201static void drv_write(struct drv_cmd *cmd)
@@ -214,14 +217,14 @@ static u32 get_cur_val(const struct cpumask *mask)
214 if (unlikely(cpumask_empty(mask))) 217 if (unlikely(cpumask_empty(mask)))
215 return 0; 218 return 0;
216 219
217 switch (per_cpu(drv_data, cpumask_first(mask))->cpu_feature) { 220 switch (per_cpu(acfreq_data, cpumask_first(mask))->cpu_feature) {
218 case SYSTEM_INTEL_MSR_CAPABLE: 221 case SYSTEM_INTEL_MSR_CAPABLE:
219 cmd.type = SYSTEM_INTEL_MSR_CAPABLE; 222 cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
220 cmd.addr.msr.reg = MSR_IA32_PERF_STATUS; 223 cmd.addr.msr.reg = MSR_IA32_PERF_STATUS;
221 break; 224 break;
222 case SYSTEM_IO_CAPABLE: 225 case SYSTEM_IO_CAPABLE:
223 cmd.type = SYSTEM_IO_CAPABLE; 226 cmd.type = SYSTEM_IO_CAPABLE;
224 perf = per_cpu(drv_data, cpumask_first(mask))->acpi_data; 227 perf = per_cpu(acfreq_data, cpumask_first(mask))->acpi_data;
225 cmd.addr.io.port = perf->control_register.address; 228 cmd.addr.io.port = perf->control_register.address;
226 cmd.addr.io.bit_width = perf->control_register.bit_width; 229 cmd.addr.io.bit_width = perf->control_register.bit_width;
227 break; 230 break;
@@ -268,8 +271,8 @@ static unsigned int get_measured_perf(struct cpufreq_policy *policy,
268 if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1)) 271 if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1))
269 return 0; 272 return 0;
270 273
271 ratio = calc_aperfmperf_ratio(&per_cpu(old_perf, cpu), &perf); 274 ratio = calc_aperfmperf_ratio(&per_cpu(acfreq_old_perf, cpu), &perf);
272 per_cpu(old_perf, cpu) = perf; 275 per_cpu(acfreq_old_perf, cpu) = perf;
273 276
274 retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT; 277 retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT;
275 278
@@ -278,7 +281,7 @@ static unsigned int get_measured_perf(struct cpufreq_policy *policy,
278 281
279static unsigned int get_cur_freq_on_cpu(unsigned int cpu) 282static unsigned int get_cur_freq_on_cpu(unsigned int cpu)
280{ 283{
281 struct acpi_cpufreq_data *data = per_cpu(drv_data, cpu); 284 struct acpi_cpufreq_data *data = per_cpu(acfreq_data, cpu);
282 unsigned int freq; 285 unsigned int freq;
283 unsigned int cached_freq; 286 unsigned int cached_freq;
284 287
@@ -322,7 +325,7 @@ static unsigned int check_freqs(const struct cpumask *mask, unsigned int freq,
322static int acpi_cpufreq_target(struct cpufreq_policy *policy, 325static int acpi_cpufreq_target(struct cpufreq_policy *policy,
323 unsigned int target_freq, unsigned int relation) 326 unsigned int target_freq, unsigned int relation)
324{ 327{
325 struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu); 328 struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu);
326 struct acpi_processor_performance *perf; 329 struct acpi_processor_performance *perf;
327 struct cpufreq_freqs freqs; 330 struct cpufreq_freqs freqs;
328 struct drv_cmd cmd; 331 struct drv_cmd cmd;
@@ -416,7 +419,7 @@ out:
416 419
417static int acpi_cpufreq_verify(struct cpufreq_policy *policy) 420static int acpi_cpufreq_verify(struct cpufreq_policy *policy)
418{ 421{
419 struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu); 422 struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu);
420 423
421 dprintk("acpi_cpufreq_verify\n"); 424 dprintk("acpi_cpufreq_verify\n");
422 425
@@ -574,7 +577,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
574 return -ENOMEM; 577 return -ENOMEM;
575 578
576 data->acpi_data = per_cpu_ptr(acpi_perf_data, cpu); 579 data->acpi_data = per_cpu_ptr(acpi_perf_data, cpu);
577 per_cpu(drv_data, cpu) = data; 580 per_cpu(acfreq_data, cpu) = data;
578 581
579 if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) 582 if (cpu_has(c, X86_FEATURE_CONSTANT_TSC))
580 acpi_cpufreq_driver.flags |= CPUFREQ_CONST_LOOPS; 583 acpi_cpufreq_driver.flags |= CPUFREQ_CONST_LOOPS;
@@ -725,20 +728,20 @@ err_unreg:
725 acpi_processor_unregister_performance(perf, cpu); 728 acpi_processor_unregister_performance(perf, cpu);
726err_free: 729err_free:
727 kfree(data); 730 kfree(data);
728 per_cpu(drv_data, cpu) = NULL; 731 per_cpu(acfreq_data, cpu) = NULL;
729 732
730 return result; 733 return result;
731} 734}
732 735
733static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy) 736static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy)
734{ 737{
735 struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu); 738 struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu);
736 739
737 dprintk("acpi_cpufreq_cpu_exit\n"); 740 dprintk("acpi_cpufreq_cpu_exit\n");
738 741
739 if (data) { 742 if (data) {
740 cpufreq_frequency_table_put_attr(policy->cpu); 743 cpufreq_frequency_table_put_attr(policy->cpu);
741 per_cpu(drv_data, policy->cpu) = NULL; 744 per_cpu(acfreq_data, policy->cpu) = NULL;
742 acpi_processor_unregister_performance(data->acpi_data, 745 acpi_processor_unregister_performance(data->acpi_data,
743 policy->cpu); 746 policy->cpu);
744 kfree(data); 747 kfree(data);
@@ -749,7 +752,7 @@ static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy)
749 752
750static int acpi_cpufreq_resume(struct cpufreq_policy *policy) 753static int acpi_cpufreq_resume(struct cpufreq_policy *policy)
751{ 754{
752 struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu); 755 struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu);
753 756
754 dprintk("acpi_cpufreq_resume\n"); 757 dprintk("acpi_cpufreq_resume\n");
755 758
@@ -764,14 +767,15 @@ static struct freq_attr *acpi_cpufreq_attr[] = {
764}; 767};
765 768
766static struct cpufreq_driver acpi_cpufreq_driver = { 769static struct cpufreq_driver acpi_cpufreq_driver = {
767 .verify = acpi_cpufreq_verify, 770 .verify = acpi_cpufreq_verify,
768 .target = acpi_cpufreq_target, 771 .target = acpi_cpufreq_target,
769 .init = acpi_cpufreq_cpu_init, 772 .bios_limit = acpi_processor_get_bios_limit,
770 .exit = acpi_cpufreq_cpu_exit, 773 .init = acpi_cpufreq_cpu_init,
771 .resume = acpi_cpufreq_resume, 774 .exit = acpi_cpufreq_cpu_exit,
772 .name = "acpi-cpufreq", 775 .resume = acpi_cpufreq_resume,
773 .owner = THIS_MODULE, 776 .name = "acpi-cpufreq",
774 .attr = acpi_cpufreq_attr, 777 .owner = THIS_MODULE,
778 .attr = acpi_cpufreq_attr,
775}; 779};
776 780
777static int __init acpi_cpufreq_init(void) 781static int __init acpi_cpufreq_init(void)
diff --git a/arch/x86/kernel/cpu/cpufreq/elanfreq.c b/arch/x86/kernel/cpu/cpufreq/elanfreq.c
index 006b278b0d5d..c587db472a75 100644
--- a/arch/x86/kernel/cpu/cpufreq/elanfreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/elanfreq.c
@@ -20,7 +20,6 @@
20#include <linux/module.h> 20#include <linux/module.h>
21#include <linux/init.h> 21#include <linux/init.h>
22 22
23#include <linux/slab.h>
24#include <linux/delay.h> 23#include <linux/delay.h>
25#include <linux/cpufreq.h> 24#include <linux/cpufreq.h>
26 25
diff --git a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
index ac27ec2264d5..16e3483be9e3 100644
--- a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
+++ b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
@@ -80,6 +80,7 @@
80#include <linux/cpufreq.h> 80#include <linux/cpufreq.h>
81#include <linux/pci.h> 81#include <linux/pci.h>
82#include <linux/errno.h> 82#include <linux/errno.h>
83#include <linux/slab.h>
83 84
84#include <asm/processor-cyrix.h> 85#include <asm/processor-cyrix.h>
85 86
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c
index cabd2fa3fc93..7e7eea4f8261 100644
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.c
+++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c
@@ -885,7 +885,7 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
885 885
886 /* Find ACPI data for processor */ 886 /* Find ACPI data for processor */
887 acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT, 887 acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT,
888 ACPI_UINT32_MAX, &longhaul_walk_callback, 888 ACPI_UINT32_MAX, &longhaul_walk_callback, NULL,
889 NULL, (void *)&pr); 889 NULL, (void *)&pr);
890 890
891 /* Check ACPI support for C3 state */ 891 /* Check ACPI support for C3 state */
diff --git a/arch/x86/kernel/cpu/cpufreq/longrun.c b/arch/x86/kernel/cpu/cpufreq/longrun.c
index da5f70fcb766..e7b559d74c52 100644
--- a/arch/x86/kernel/cpu/cpufreq/longrun.c
+++ b/arch/x86/kernel/cpu/cpufreq/longrun.c
@@ -9,7 +9,6 @@
9#include <linux/kernel.h> 9#include <linux/kernel.h>
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/init.h> 11#include <linux/init.h>
12#include <linux/slab.h>
13#include <linux/cpufreq.h> 12#include <linux/cpufreq.h>
14#include <linux/timex.h> 13#include <linux/timex.h>
15 14
diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
index 869615193720..7b8a8ba67b07 100644
--- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
+++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
@@ -25,7 +25,6 @@
25#include <linux/init.h> 25#include <linux/init.h>
26#include <linux/smp.h> 26#include <linux/smp.h>
27#include <linux/cpufreq.h> 27#include <linux/cpufreq.h>
28#include <linux/slab.h>
29#include <linux/cpumask.h> 28#include <linux/cpumask.h>
30#include <linux/timex.h> 29#include <linux/timex.h>
31 30
diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
new file mode 100644
index 000000000000..ce7cde713e71
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
@@ -0,0 +1,621 @@
1/*
2 * pcc-cpufreq.c - Processor Clocking Control firmware cpufreq interface
3 *
4 * Copyright (C) 2009 Red Hat, Matthew Garrett <mjg@redhat.com>
5 * Copyright (C) 2009 Hewlett-Packard Development Company, L.P.
6 * Nagananda Chumbalkar <nagananda.chumbalkar@hp.com>
7 *
8 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; version 2 of the License.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or NON
17 * INFRINGEMENT. See the GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write to the Free Software Foundation, Inc.,
21 * 675 Mass Ave, Cambridge, MA 02139, USA.
22 *
23 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
24 */
25
26#include <linux/kernel.h>
27#include <linux/module.h>
28#include <linux/init.h>
29#include <linux/smp.h>
30#include <linux/sched.h>
31#include <linux/cpufreq.h>
32#include <linux/compiler.h>
33#include <linux/slab.h>
34
35#include <linux/acpi.h>
36#include <linux/io.h>
37#include <linux/spinlock.h>
38#include <linux/uaccess.h>
39
40#include <acpi/processor.h>
41
42#define PCC_VERSION "1.00.00"
43#define POLL_LOOPS 300
44
45#define CMD_COMPLETE 0x1
46#define CMD_GET_FREQ 0x0
47#define CMD_SET_FREQ 0x1
48
49#define BUF_SZ 4
50
51#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
52 "pcc-cpufreq", msg)
53
54struct pcc_register_resource {
55 u8 descriptor;
56 u16 length;
57 u8 space_id;
58 u8 bit_width;
59 u8 bit_offset;
60 u8 access_size;
61 u64 address;
62} __attribute__ ((packed));
63
64struct pcc_memory_resource {
65 u8 descriptor;
66 u16 length;
67 u8 space_id;
68 u8 resource_usage;
69 u8 type_specific;
70 u64 granularity;
71 u64 minimum;
72 u64 maximum;
73 u64 translation_offset;
74 u64 address_length;
75} __attribute__ ((packed));
76
77static struct cpufreq_driver pcc_cpufreq_driver;
78
79struct pcc_header {
80 u32 signature;
81 u16 length;
82 u8 major;
83 u8 minor;
84 u32 features;
85 u16 command;
86 u16 status;
87 u32 latency;
88 u32 minimum_time;
89 u32 maximum_time;
90 u32 nominal;
91 u32 throttled_frequency;
92 u32 minimum_frequency;
93};
94
95static void __iomem *pcch_virt_addr;
96static struct pcc_header __iomem *pcch_hdr;
97
98static DEFINE_SPINLOCK(pcc_lock);
99
100static struct acpi_generic_address doorbell;
101
102static u64 doorbell_preserve;
103static u64 doorbell_write;
104
105static u8 OSC_UUID[16] = {0x63, 0x9B, 0x2C, 0x9F, 0x70, 0x91, 0x49, 0x1f,
106 0xBB, 0x4F, 0xA5, 0x98, 0x2F, 0xA1, 0xB5, 0x46};
107
108struct pcc_cpu {
109 u32 input_offset;
110 u32 output_offset;
111};
112
113static struct pcc_cpu *pcc_cpu_info;
114
115static int pcc_cpufreq_verify(struct cpufreq_policy *policy)
116{
117 cpufreq_verify_within_limits(policy, policy->cpuinfo.min_freq,
118 policy->cpuinfo.max_freq);
119 return 0;
120}
121
122static inline void pcc_cmd(void)
123{
124 u64 doorbell_value;
125 int i;
126
127 acpi_read(&doorbell_value, &doorbell);
128 acpi_write((doorbell_value & doorbell_preserve) | doorbell_write,
129 &doorbell);
130
131 for (i = 0; i < POLL_LOOPS; i++) {
132 if (ioread16(&pcch_hdr->status) & CMD_COMPLETE)
133 break;
134 }
135}
136
137static inline void pcc_clear_mapping(void)
138{
139 if (pcch_virt_addr)
140 iounmap(pcch_virt_addr);
141 pcch_virt_addr = NULL;
142}
143
144static unsigned int pcc_get_freq(unsigned int cpu)
145{
146 struct pcc_cpu *pcc_cpu_data;
147 unsigned int curr_freq;
148 unsigned int freq_limit;
149 u16 status;
150 u32 input_buffer;
151 u32 output_buffer;
152
153 spin_lock(&pcc_lock);
154
155 dprintk("get: get_freq for CPU %d\n", cpu);
156 pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu);
157
158 input_buffer = 0x1;
159 iowrite32(input_buffer,
160 (pcch_virt_addr + pcc_cpu_data->input_offset));
161 iowrite16(CMD_GET_FREQ, &pcch_hdr->command);
162
163 pcc_cmd();
164
165 output_buffer =
166 ioread32(pcch_virt_addr + pcc_cpu_data->output_offset);
167
168 /* Clear the input buffer - we are done with the current command */
169 memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ);
170
171 status = ioread16(&pcch_hdr->status);
172 if (status != CMD_COMPLETE) {
173 dprintk("get: FAILED: for CPU %d, status is %d\n",
174 cpu, status);
175 goto cmd_incomplete;
176 }
177 iowrite16(0, &pcch_hdr->status);
178 curr_freq = (((ioread32(&pcch_hdr->nominal) * (output_buffer & 0xff))
179 / 100) * 1000);
180
181 dprintk("get: SUCCESS: (virtual) output_offset for cpu %d is "
182 "0x%x, contains a value of: 0x%x. Speed is: %d MHz\n",
183 cpu, (pcch_virt_addr + pcc_cpu_data->output_offset),
184 output_buffer, curr_freq);
185
186 freq_limit = (output_buffer >> 8) & 0xff;
187 if (freq_limit != 0xff) {
188 dprintk("get: frequency for cpu %d is being temporarily"
189 " capped at %d\n", cpu, curr_freq);
190 }
191
192 spin_unlock(&pcc_lock);
193 return curr_freq;
194
195cmd_incomplete:
196 iowrite16(0, &pcch_hdr->status);
197 spin_unlock(&pcc_lock);
198 return -EINVAL;
199}
200
201static int pcc_cpufreq_target(struct cpufreq_policy *policy,
202 unsigned int target_freq,
203 unsigned int relation)
204{
205 struct pcc_cpu *pcc_cpu_data;
206 struct cpufreq_freqs freqs;
207 u16 status;
208 u32 input_buffer;
209 int cpu;
210
211 spin_lock(&pcc_lock);
212 cpu = policy->cpu;
213 pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu);
214
215 dprintk("target: CPU %d should go to target freq: %d "
216 "(virtual) input_offset is 0x%x\n",
217 cpu, target_freq,
218 (pcch_virt_addr + pcc_cpu_data->input_offset));
219
220 freqs.new = target_freq;
221 freqs.cpu = cpu;
222 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
223
224 input_buffer = 0x1 | (((target_freq * 100)
225 / (ioread32(&pcch_hdr->nominal) * 1000)) << 8);
226 iowrite32(input_buffer,
227 (pcch_virt_addr + pcc_cpu_data->input_offset));
228 iowrite16(CMD_SET_FREQ, &pcch_hdr->command);
229
230 pcc_cmd();
231
232 /* Clear the input buffer - we are done with the current command */
233 memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ);
234
235 status = ioread16(&pcch_hdr->status);
236 if (status != CMD_COMPLETE) {
237 dprintk("target: FAILED for cpu %d, with status: 0x%x\n",
238 cpu, status);
239 goto cmd_incomplete;
240 }
241 iowrite16(0, &pcch_hdr->status);
242
243 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
244 dprintk("target: was SUCCESSFUL for cpu %d\n", cpu);
245 spin_unlock(&pcc_lock);
246
247 return 0;
248
249cmd_incomplete:
250 iowrite16(0, &pcch_hdr->status);
251 spin_unlock(&pcc_lock);
252 return -EINVAL;
253}
254
255static int pcc_get_offset(int cpu)
256{
257 acpi_status status;
258 struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL};
259 union acpi_object *pccp, *offset;
260 struct pcc_cpu *pcc_cpu_data;
261 struct acpi_processor *pr;
262 int ret = 0;
263
264 pr = per_cpu(processors, cpu);
265 pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu);
266
267 status = acpi_evaluate_object(pr->handle, "PCCP", NULL, &buffer);
268 if (ACPI_FAILURE(status))
269 return -ENODEV;
270
271 pccp = buffer.pointer;
272 if (!pccp || pccp->type != ACPI_TYPE_PACKAGE) {
273 ret = -ENODEV;
274 goto out_free;
275 };
276
277 offset = &(pccp->package.elements[0]);
278 if (!offset || offset->type != ACPI_TYPE_INTEGER) {
279 ret = -ENODEV;
280 goto out_free;
281 }
282
283 pcc_cpu_data->input_offset = offset->integer.value;
284
285 offset = &(pccp->package.elements[1]);
286 if (!offset || offset->type != ACPI_TYPE_INTEGER) {
287 ret = -ENODEV;
288 goto out_free;
289 }
290
291 pcc_cpu_data->output_offset = offset->integer.value;
292
293 memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ);
294 memset_io((pcch_virt_addr + pcc_cpu_data->output_offset), 0, BUF_SZ);
295
296 dprintk("pcc_get_offset: for CPU %d: pcc_cpu_data "
297 "input_offset: 0x%x, pcc_cpu_data output_offset: 0x%x\n",
298 cpu, pcc_cpu_data->input_offset, pcc_cpu_data->output_offset);
299out_free:
300 kfree(buffer.pointer);
301 return ret;
302}
303
304static int __init pcc_cpufreq_do_osc(acpi_handle *handle)
305{
306 acpi_status status;
307 struct acpi_object_list input;
308 struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL};
309 union acpi_object in_params[4];
310 union acpi_object *out_obj;
311 u32 capabilities[2];
312 u32 errors;
313 u32 supported;
314 int ret = 0;
315
316 input.count = 4;
317 input.pointer = in_params;
318 input.count = 4;
319 input.pointer = in_params;
320 in_params[0].type = ACPI_TYPE_BUFFER;
321 in_params[0].buffer.length = 16;
322 in_params[0].buffer.pointer = OSC_UUID;
323 in_params[1].type = ACPI_TYPE_INTEGER;
324 in_params[1].integer.value = 1;
325 in_params[2].type = ACPI_TYPE_INTEGER;
326 in_params[2].integer.value = 2;
327 in_params[3].type = ACPI_TYPE_BUFFER;
328 in_params[3].buffer.length = 8;
329 in_params[3].buffer.pointer = (u8 *)&capabilities;
330
331 capabilities[0] = OSC_QUERY_ENABLE;
332 capabilities[1] = 0x1;
333
334 status = acpi_evaluate_object(*handle, "_OSC", &input, &output);
335 if (ACPI_FAILURE(status))
336 return -ENODEV;
337
338 if (!output.length)
339 return -ENODEV;
340
341 out_obj = output.pointer;
342 if (out_obj->type != ACPI_TYPE_BUFFER) {
343 ret = -ENODEV;
344 goto out_free;
345 }
346
347 errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0);
348 if (errors) {
349 ret = -ENODEV;
350 goto out_free;
351 }
352
353 supported = *((u32 *)(out_obj->buffer.pointer + 4));
354 if (!(supported & 0x1)) {
355 ret = -ENODEV;
356 goto out_free;
357 }
358
359 kfree(output.pointer);
360 capabilities[0] = 0x0;
361 capabilities[1] = 0x1;
362
363 status = acpi_evaluate_object(*handle, "_OSC", &input, &output);
364 if (ACPI_FAILURE(status))
365 return -ENODEV;
366
367 if (!output.length)
368 return -ENODEV;
369
370 out_obj = output.pointer;
371 if (out_obj->type != ACPI_TYPE_BUFFER) {
372 ret = -ENODEV;
373 goto out_free;
374 }
375
376 errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0);
377 if (errors) {
378 ret = -ENODEV;
379 goto out_free;
380 }
381
382 supported = *((u32 *)(out_obj->buffer.pointer + 4));
383 if (!(supported & 0x1)) {
384 ret = -ENODEV;
385 goto out_free;
386 }
387
388out_free:
389 kfree(output.pointer);
390 return ret;
391}
392
393static int __init pcc_cpufreq_probe(void)
394{
395 acpi_status status;
396 struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL};
397 struct pcc_memory_resource *mem_resource;
398 struct pcc_register_resource *reg_resource;
399 union acpi_object *out_obj, *member;
400 acpi_handle handle, osc_handle;
401 int ret = 0;
402
403 status = acpi_get_handle(NULL, "\\_SB", &handle);
404 if (ACPI_FAILURE(status))
405 return -ENODEV;
406
407 status = acpi_get_handle(handle, "_OSC", &osc_handle);
408 if (ACPI_SUCCESS(status)) {
409 ret = pcc_cpufreq_do_osc(&osc_handle);
410 if (ret)
411 dprintk("probe: _OSC evaluation did not succeed\n");
412 /* Firmware's use of _OSC is optional */
413 ret = 0;
414 }
415
416 status = acpi_evaluate_object(handle, "PCCH", NULL, &output);
417 if (ACPI_FAILURE(status))
418 return -ENODEV;
419
420 out_obj = output.pointer;
421 if (out_obj->type != ACPI_TYPE_PACKAGE) {
422 ret = -ENODEV;
423 goto out_free;
424 }
425
426 member = &out_obj->package.elements[0];
427 if (member->type != ACPI_TYPE_BUFFER) {
428 ret = -ENODEV;
429 goto out_free;
430 }
431
432 mem_resource = (struct pcc_memory_resource *)member->buffer.pointer;
433
434 dprintk("probe: mem_resource descriptor: 0x%x,"
435 " length: %d, space_id: %d, resource_usage: %d,"
436 " type_specific: %d, granularity: 0x%llx,"
437 " minimum: 0x%llx, maximum: 0x%llx,"
438 " translation_offset: 0x%llx, address_length: 0x%llx\n",
439 mem_resource->descriptor, mem_resource->length,
440 mem_resource->space_id, mem_resource->resource_usage,
441 mem_resource->type_specific, mem_resource->granularity,
442 mem_resource->minimum, mem_resource->maximum,
443 mem_resource->translation_offset,
444 mem_resource->address_length);
445
446 if (mem_resource->space_id != ACPI_ADR_SPACE_SYSTEM_MEMORY) {
447 ret = -ENODEV;
448 goto out_free;
449 }
450
451 pcch_virt_addr = ioremap_nocache(mem_resource->minimum,
452 mem_resource->address_length);
453 if (pcch_virt_addr == NULL) {
454 dprintk("probe: could not map shared mem region\n");
455 goto out_free;
456 }
457 pcch_hdr = pcch_virt_addr;
458
459 dprintk("probe: PCCH header (virtual) addr: 0x%p\n", pcch_hdr);
460 dprintk("probe: PCCH header is at physical address: 0x%llx,"
461 " signature: 0x%x, length: %d bytes, major: %d, minor: %d,"
462 " supported features: 0x%x, command field: 0x%x,"
463 " status field: 0x%x, nominal latency: %d us\n",
464 mem_resource->minimum, ioread32(&pcch_hdr->signature),
465 ioread16(&pcch_hdr->length), ioread8(&pcch_hdr->major),
466 ioread8(&pcch_hdr->minor), ioread32(&pcch_hdr->features),
467 ioread16(&pcch_hdr->command), ioread16(&pcch_hdr->status),
468 ioread32(&pcch_hdr->latency));
469
470 dprintk("probe: min time between commands: %d us,"
471 " max time between commands: %d us,"
472 " nominal CPU frequency: %d MHz,"
473 " minimum CPU frequency: %d MHz,"
474 " minimum CPU frequency without throttling: %d MHz\n",
475 ioread32(&pcch_hdr->minimum_time),
476 ioread32(&pcch_hdr->maximum_time),
477 ioread32(&pcch_hdr->nominal),
478 ioread32(&pcch_hdr->throttled_frequency),
479 ioread32(&pcch_hdr->minimum_frequency));
480
481 member = &out_obj->package.elements[1];
482 if (member->type != ACPI_TYPE_BUFFER) {
483 ret = -ENODEV;
484 goto pcch_free;
485 }
486
487 reg_resource = (struct pcc_register_resource *)member->buffer.pointer;
488
489 doorbell.space_id = reg_resource->space_id;
490 doorbell.bit_width = reg_resource->bit_width;
491 doorbell.bit_offset = reg_resource->bit_offset;
492 doorbell.access_width = 64;
493 doorbell.address = reg_resource->address;
494
495 dprintk("probe: doorbell: space_id is %d, bit_width is %d, "
496 "bit_offset is %d, access_width is %d, address is 0x%llx\n",
497 doorbell.space_id, doorbell.bit_width, doorbell.bit_offset,
498 doorbell.access_width, reg_resource->address);
499
500 member = &out_obj->package.elements[2];
501 if (member->type != ACPI_TYPE_INTEGER) {
502 ret = -ENODEV;
503 goto pcch_free;
504 }
505
506 doorbell_preserve = member->integer.value;
507
508 member = &out_obj->package.elements[3];
509 if (member->type != ACPI_TYPE_INTEGER) {
510 ret = -ENODEV;
511 goto pcch_free;
512 }
513
514 doorbell_write = member->integer.value;
515
516 dprintk("probe: doorbell_preserve: 0x%llx,"
517 " doorbell_write: 0x%llx\n",
518 doorbell_preserve, doorbell_write);
519
520 pcc_cpu_info = alloc_percpu(struct pcc_cpu);
521 if (!pcc_cpu_info) {
522 ret = -ENOMEM;
523 goto pcch_free;
524 }
525
526 printk(KERN_DEBUG "pcc-cpufreq: (v%s) driver loaded with frequency"
527 " limits: %d MHz, %d MHz\n", PCC_VERSION,
528 ioread32(&pcch_hdr->minimum_frequency),
529 ioread32(&pcch_hdr->nominal));
530 kfree(output.pointer);
531 return ret;
532pcch_free:
533 pcc_clear_mapping();
534out_free:
535 kfree(output.pointer);
536 return ret;
537}
538
539static int pcc_cpufreq_cpu_init(struct cpufreq_policy *policy)
540{
541 unsigned int cpu = policy->cpu;
542 unsigned int result = 0;
543
544 if (!pcch_virt_addr) {
545 result = -1;
546 goto pcch_null;
547 }
548
549 result = pcc_get_offset(cpu);
550 if (result) {
551 dprintk("init: PCCP evaluation failed\n");
552 goto free;
553 }
554
555 policy->max = policy->cpuinfo.max_freq =
556 ioread32(&pcch_hdr->nominal) * 1000;
557 policy->min = policy->cpuinfo.min_freq =
558 ioread32(&pcch_hdr->minimum_frequency) * 1000;
559 policy->cur = pcc_get_freq(cpu);
560
561 dprintk("init: policy->max is %d, policy->min is %d\n",
562 policy->max, policy->min);
563
564 return 0;
565free:
566 pcc_clear_mapping();
567 free_percpu(pcc_cpu_info);
568pcch_null:
569 return result;
570}
571
/*
 * cpufreq ->exit callback.  Does nothing and always returns 0; @policy is
 * not used.
 */
static int pcc_cpufreq_cpu_exit(struct cpufreq_policy *policy)
{
	return 0;
}
576
577static struct cpufreq_driver pcc_cpufreq_driver = {
578 .flags = CPUFREQ_CONST_LOOPS,
579 .get = pcc_get_freq,
580 .verify = pcc_cpufreq_verify,
581 .target = pcc_cpufreq_target,
582 .init = pcc_cpufreq_cpu_init,
583 .exit = pcc_cpufreq_cpu_exit,
584 .name = "pcc-cpufreq",
585 .owner = THIS_MODULE,
586};
587
588static int __init pcc_cpufreq_init(void)
589{
590 int ret;
591
592 if (acpi_disabled)
593 return 0;
594
595 ret = pcc_cpufreq_probe();
596 if (ret) {
597 dprintk("pcc_cpufreq_init: PCCH evaluation failed\n");
598 return ret;
599 }
600
601 ret = cpufreq_register_driver(&pcc_cpufreq_driver);
602
603 return ret;
604}
605
606static void __exit pcc_cpufreq_exit(void)
607{
608 cpufreq_unregister_driver(&pcc_cpufreq_driver);
609
610 pcc_clear_mapping();
611
612 free_percpu(pcc_cpu_info);
613}
614
615MODULE_AUTHOR("Matthew Garrett, Naga Chumbalkar");
616MODULE_VERSION(PCC_VERSION);
617MODULE_DESCRIPTION("Processor Clocking Control interface driver");
618MODULE_LICENSE("GPL");
619
620late_initcall(pcc_cpufreq_init);
621module_exit(pcc_cpufreq_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
index f10dea409f40..b3379d6a5c57 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
@@ -13,7 +13,6 @@
13#include <linux/init.h> 13#include <linux/init.h>
14#include <linux/cpufreq.h> 14#include <linux/cpufreq.h>
15#include <linux/ioport.h> 15#include <linux/ioport.h>
16#include <linux/slab.h>
17#include <linux/timex.h> 16#include <linux/timex.h>
18#include <linux/io.h> 17#include <linux/io.h>
19 18
@@ -164,7 +163,7 @@ static int powernow_k6_cpu_init(struct cpufreq_policy *policy)
164 } 163 }
165 164
166 /* cpuinfo and default policy values */ 165 /* cpuinfo and default policy values */
167 policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL; 166 policy->cpuinfo.transition_latency = 200000;
168 policy->cur = busfreq * max_multiplier; 167 policy->cur = busfreq * max_multiplier;
169 168
170 result = cpufreq_frequency_table_cpuinfo(policy, clock_ratio); 169 result = cpufreq_frequency_table_cpuinfo(policy, clock_ratio);
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
index d47c775eb0ab..9a97116f89e5 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
@@ -714,14 +714,17 @@ static struct freq_attr *powernow_table_attr[] = {
714}; 714};
715 715
716static struct cpufreq_driver powernow_driver = { 716static struct cpufreq_driver powernow_driver = {
717 .verify = powernow_verify, 717 .verify = powernow_verify,
718 .target = powernow_target, 718 .target = powernow_target,
719 .get = powernow_get, 719 .get = powernow_get,
720 .init = powernow_cpu_init, 720#ifdef CONFIG_X86_POWERNOW_K7_ACPI
721 .exit = powernow_cpu_exit, 721 .bios_limit = acpi_processor_get_bios_limit,
722 .name = "powernow-k7", 722#endif
723 .owner = THIS_MODULE, 723 .init = powernow_cpu_init,
724 .attr = powernow_table_attr, 724 .exit = powernow_cpu_exit,
725 .name = "powernow-k7",
726 .owner = THIS_MODULE,
727 .attr = powernow_table_attr,
725}; 728};
726 729
727static int __init powernow_init(void) 730static int __init powernow_init(void)
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 3f12dabeab52..b6215b9798e2 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -806,7 +806,7 @@ static int find_psb_table(struct powernow_k8_data *data)
806static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, 806static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data,
807 unsigned int index) 807 unsigned int index)
808{ 808{
809 acpi_integer control; 809 u64 control;
810 810
811 if (!data->acpi_data.state_count || (cpu_family == CPU_HW_PSTATE)) 811 if (!data->acpi_data.state_count || (cpu_family == CPU_HW_PSTATE))
812 return; 812 return;
@@ -824,7 +824,7 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
824{ 824{
825 struct cpufreq_frequency_table *powernow_table; 825 struct cpufreq_frequency_table *powernow_table;
826 int ret_val = -ENODEV; 826 int ret_val = -ENODEV;
827 acpi_integer control, status; 827 u64 control, status;
828 828
829 if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) { 829 if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) {
830 dprintk("register performance failed: bad ACPI data\n"); 830 dprintk("register performance failed: bad ACPI data\n");
@@ -929,7 +929,8 @@ static int fill_powernow_table_pstate(struct powernow_k8_data *data,
929 powernow_table[i].index = index; 929 powernow_table[i].index = index;
930 930
931 /* Frequency may be rounded for these */ 931 /* Frequency may be rounded for these */
932 if (boot_cpu_data.x86 == 0x10 || boot_cpu_data.x86 == 0x11) { 932 if ((boot_cpu_data.x86 == 0x10 && boot_cpu_data.x86_model < 10)
933 || boot_cpu_data.x86 == 0x11) {
933 powernow_table[i].frequency = 934 powernow_table[i].frequency =
934 freq_from_fid_did(lo & 0x3f, (lo >> 6) & 7); 935 freq_from_fid_did(lo & 0x3f, (lo >> 6) & 7);
935 } else 936 } else
@@ -948,7 +949,7 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data,
948 u32 fid; 949 u32 fid;
949 u32 vid; 950 u32 vid;
950 u32 freq, index; 951 u32 freq, index;
951 acpi_integer status, control; 952 u64 status, control;
952 953
953 if (data->exttype) { 954 if (data->exttype) {
954 status = data->acpi_data.states[i].status; 955 status = data->acpi_data.states[i].status;
@@ -1118,7 +1119,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data,
1118static int powernowk8_target(struct cpufreq_policy *pol, 1119static int powernowk8_target(struct cpufreq_policy *pol,
1119 unsigned targfreq, unsigned relation) 1120 unsigned targfreq, unsigned relation)
1120{ 1121{
1121 cpumask_t oldmask; 1122 cpumask_var_t oldmask;
1122 struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu); 1123 struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu);
1123 u32 checkfid; 1124 u32 checkfid;
1124 u32 checkvid; 1125 u32 checkvid;
@@ -1131,9 +1132,13 @@ static int powernowk8_target(struct cpufreq_policy *pol,
1131 checkfid = data->currfid; 1132 checkfid = data->currfid;
1132 checkvid = data->currvid; 1133 checkvid = data->currvid;
1133 1134
1134 /* only run on specific CPU from here on */ 1135 /* only run on specific CPU from here on. */
1135 oldmask = current->cpus_allowed; 1136 /* This is poor form: use a workqueue or smp_call_function_single */
1136 set_cpus_allowed_ptr(current, &cpumask_of_cpu(pol->cpu)); 1137 if (!alloc_cpumask_var(&oldmask, GFP_KERNEL))
1138 return -ENOMEM;
1139
1140 cpumask_copy(oldmask, tsk_cpus_allowed(current));
1141 set_cpus_allowed_ptr(current, cpumask_of(pol->cpu));
1137 1142
1138 if (smp_processor_id() != pol->cpu) { 1143 if (smp_processor_id() != pol->cpu) {
1139 printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu); 1144 printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu);
@@ -1193,7 +1198,8 @@ static int powernowk8_target(struct cpufreq_policy *pol,
1193 ret = 0; 1198 ret = 0;
1194 1199
1195err_out: 1200err_out:
1196 set_cpus_allowed_ptr(current, &oldmask); 1201 set_cpus_allowed_ptr(current, oldmask);
1202 free_cpumask_var(oldmask);
1197 return ret; 1203 return ret;
1198} 1204}
1199 1205
@@ -1351,6 +1357,7 @@ static int __devexit powernowk8_cpu_exit(struct cpufreq_policy *pol)
1351 1357
1352 kfree(data->powernow_table); 1358 kfree(data->powernow_table);
1353 kfree(data); 1359 kfree(data);
1360 per_cpu(powernow_data, pol->cpu) = NULL;
1354 1361
1355 return 0; 1362 return 0;
1356} 1363}
@@ -1370,7 +1377,7 @@ static unsigned int powernowk8_get(unsigned int cpu)
1370 int err; 1377 int err;
1371 1378
1372 if (!data) 1379 if (!data)
1373 return -EINVAL; 1380 return 0;
1374 1381
1375 smp_call_function_single(cpu, query_values_on_cpu, &err, true); 1382 smp_call_function_single(cpu, query_values_on_cpu, &err, true);
1376 if (err) 1383 if (err)
@@ -1393,14 +1400,15 @@ static struct freq_attr *powernow_k8_attr[] = {
1393}; 1400};
1394 1401
1395static struct cpufreq_driver cpufreq_amd64_driver = { 1402static struct cpufreq_driver cpufreq_amd64_driver = {
1396 .verify = powernowk8_verify, 1403 .verify = powernowk8_verify,
1397 .target = powernowk8_target, 1404 .target = powernowk8_target,
1398 .init = powernowk8_cpu_init, 1405 .bios_limit = acpi_processor_get_bios_limit,
1399 .exit = __devexit_p(powernowk8_cpu_exit), 1406 .init = powernowk8_cpu_init,
1400 .get = powernowk8_get, 1407 .exit = __devexit_p(powernowk8_cpu_exit),
1401 .name = "powernow-k8", 1408 .get = powernowk8_get,
1402 .owner = THIS_MODULE, 1409 .name = "powernow-k8",
1403 .attr = powernow_k8_attr, 1410 .owner = THIS_MODULE,
1411 .attr = powernow_k8_attr,
1404}; 1412};
1405 1413
1406/* driver entry point for init */ 1414/* driver entry point for init */
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
index 8d672ef162ce..9b1ff37de46a 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
@@ -20,6 +20,7 @@
20#include <linux/sched.h> /* current */ 20#include <linux/sched.h> /* current */
21#include <linux/delay.h> 21#include <linux/delay.h>
22#include <linux/compiler.h> 22#include <linux/compiler.h>
23#include <linux/gfp.h>
23 24
24#include <asm/msr.h> 25#include <asm/msr.h>
25#include <asm/processor.h> 26#include <asm/processor.h>
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
index 3ae5a7a3a500..561758e95180 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
@@ -23,7 +23,6 @@
23#include <linux/init.h> 23#include <linux/init.h>
24#include <linux/cpufreq.h> 24#include <linux/cpufreq.h>
25#include <linux/pci.h> 25#include <linux/pci.h>
26#include <linux/slab.h>
27#include <linux/sched.h> 26#include <linux/sched.h>
28 27
29#include "speedstep-lib.h" 28#include "speedstep-lib.h"
@@ -39,7 +38,7 @@ static struct pci_dev *speedstep_chipset_dev;
39 38
40/* speedstep_processor 39/* speedstep_processor
41 */ 40 */
42static unsigned int speedstep_processor; 41static enum speedstep_processor speedstep_processor;
43 42
44static u32 pmbase; 43static u32 pmbase;
45 44
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
index f4c290b8482f..a94ec6be69fa 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
@@ -13,7 +13,6 @@
13#include <linux/moduleparam.h> 13#include <linux/moduleparam.h>
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/cpufreq.h> 15#include <linux/cpufreq.h>
16#include <linux/slab.h>
17 16
18#include <asm/msr.h> 17#include <asm/msr.h>
19#include <asm/tsc.h> 18#include <asm/tsc.h>
@@ -34,7 +33,7 @@ static int relaxed_check;
34 * GET PROCESSOR CORE SPEED IN KHZ * 33 * GET PROCESSOR CORE SPEED IN KHZ *
35 *********************************************************************/ 34 *********************************************************************/
36 35
37static unsigned int pentium3_get_frequency(unsigned int processor) 36static unsigned int pentium3_get_frequency(enum speedstep_processor processor)
38{ 37{
39 /* See table 14 of p3_ds.pdf and table 22 of 29834003.pdf */ 38 /* See table 14 of p3_ds.pdf and table 22 of 29834003.pdf */
40 struct { 39 struct {
@@ -227,7 +226,7 @@ static unsigned int pentium4_get_frequency(void)
227 226
228 227
229/* Warning: may get called from smp_call_function_single. */ 228/* Warning: may get called from smp_call_function_single. */
230unsigned int speedstep_get_frequency(unsigned int processor) 229unsigned int speedstep_get_frequency(enum speedstep_processor processor)
231{ 230{
232 switch (processor) { 231 switch (processor) {
233 case SPEEDSTEP_CPU_PCORE: 232 case SPEEDSTEP_CPU_PCORE:
@@ -380,7 +379,7 @@ EXPORT_SYMBOL_GPL(speedstep_detect_processor);
380 * DETECT SPEEDSTEP SPEEDS * 379 * DETECT SPEEDSTEP SPEEDS *
381 *********************************************************************/ 380 *********************************************************************/
382 381
383unsigned int speedstep_get_freqs(unsigned int processor, 382unsigned int speedstep_get_freqs(enum speedstep_processor processor,
384 unsigned int *low_speed, 383 unsigned int *low_speed,
385 unsigned int *high_speed, 384 unsigned int *high_speed,
386 unsigned int *transition_latency, 385 unsigned int *transition_latency,
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h
index 2b6c04e5a304..70d9cea1219d 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h
@@ -11,18 +11,18 @@
11 11
12 12
13/* processors */ 13/* processors */
14 14enum speedstep_processor {
15#define SPEEDSTEP_CPU_PIII_C_EARLY 0x00000001 /* Coppermine core */ 15 SPEEDSTEP_CPU_PIII_C_EARLY = 0x00000001, /* Coppermine core */
16#define SPEEDSTEP_CPU_PIII_C 0x00000002 /* Coppermine core */ 16 SPEEDSTEP_CPU_PIII_C = 0x00000002, /* Coppermine core */
17#define SPEEDSTEP_CPU_PIII_T 0x00000003 /* Tualatin core */ 17 SPEEDSTEP_CPU_PIII_T = 0x00000003, /* Tualatin core */
18#define SPEEDSTEP_CPU_P4M 0x00000004 /* P4-M */ 18 SPEEDSTEP_CPU_P4M = 0x00000004, /* P4-M */
19
20/* the following processors are not speedstep-capable and are not auto-detected 19/* the following processors are not speedstep-capable and are not auto-detected
21 * in speedstep_detect_processor(). However, their speed can be detected using 20 * in speedstep_detect_processor(). However, their speed can be detected using
22 * the speedstep_get_frequency() call. */ 21 * the speedstep_get_frequency() call. */
23#define SPEEDSTEP_CPU_PM 0xFFFFFF03 /* Pentium M */ 22 SPEEDSTEP_CPU_PM = 0xFFFFFF03, /* Pentium M */
24#define SPEEDSTEP_CPU_P4D 0xFFFFFF04 /* desktop P4 */ 23 SPEEDSTEP_CPU_P4D = 0xFFFFFF04, /* desktop P4 */
25#define SPEEDSTEP_CPU_PCORE 0xFFFFFF05 /* Core */ 24 SPEEDSTEP_CPU_PCORE = 0xFFFFFF05, /* Core */
25};
26 26
27/* speedstep states -- only two of them */ 27/* speedstep states -- only two of them */
28 28
@@ -31,10 +31,10 @@
31 31
32 32
33/* detect a speedstep-capable processor */ 33/* detect a speedstep-capable processor */
34extern unsigned int speedstep_detect_processor (void); 34extern enum speedstep_processor speedstep_detect_processor(void);
35 35
36/* detect the current speed (in khz) of the processor */ 36/* detect the current speed (in khz) of the processor */
37extern unsigned int speedstep_get_frequency(unsigned int processor); 37extern unsigned int speedstep_get_frequency(enum speedstep_processor processor);
38 38
39 39
40/* detect the low and high speeds of the processor. The callback 40/* detect the low and high speeds of the processor. The callback
@@ -42,7 +42,7 @@ extern unsigned int speedstep_get_frequency(unsigned int processor);
42 * SPEEDSTEP_LOW; the second argument is zero so that no 42 * SPEEDSTEP_LOW; the second argument is zero so that no
43 * cpufreq_notify_transition calls are initiated. 43 * cpufreq_notify_transition calls are initiated.
44 */ 44 */
45extern unsigned int speedstep_get_freqs(unsigned int processor, 45extern unsigned int speedstep_get_freqs(enum speedstep_processor processor,
46 unsigned int *low_speed, 46 unsigned int *low_speed,
47 unsigned int *high_speed, 47 unsigned int *high_speed,
48 unsigned int *transition_latency, 48 unsigned int *transition_latency,
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
index befea088e4f5..8abd869baabf 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
@@ -17,7 +17,6 @@
17#include <linux/moduleparam.h> 17#include <linux/moduleparam.h>
18#include <linux/init.h> 18#include <linux/init.h>
19#include <linux/cpufreq.h> 19#include <linux/cpufreq.h>
20#include <linux/slab.h>
21#include <linux/delay.h> 20#include <linux/delay.h>
22#include <linux/io.h> 21#include <linux/io.h>
23#include <asm/ist.h> 22#include <asm/ist.h>
@@ -35,7 +34,7 @@ static int smi_cmd;
35static unsigned int smi_sig; 34static unsigned int smi_sig;
36 35
37/* info about the processor */ 36/* info about the processor */
38static unsigned int speedstep_processor; 37static enum speedstep_processor speedstep_processor;
39 38
40/* 39/*
41 * There are only two frequency states for each processor. Values 40 * There are only two frequency states for each processor. Values
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index 19807b89f058..4fbd384fb645 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -373,7 +373,7 @@ static void __cpuinit init_nsc(struct cpuinfo_x86 *c)
373 /* Handle the GX (Formally known as the GX2) */ 373 /* Handle the GX (Formally known as the GX2) */
374 374
375 if (c->x86 == 5 && c->x86_model == 5) 375 if (c->x86 == 5 && c->x86_model == 5)
376 display_cacheinfo(c); 376 cpu_detect_cache_sizes(c);
377 else 377 else
378 init_cyrix(c); 378 init_cyrix(c);
379} 379}
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 40e1835b35e8..1366c7cfd483 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -47,6 +47,27 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
47 (c->x86 == 0x6 && c->x86_model >= 0x0e)) 47 (c->x86 == 0x6 && c->x86_model >= 0x0e))
48 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); 48 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
49 49
50 /*
51 * Atom erratum AAE44/AAF40/AAG38/AAH41:
52 *
53 * A race condition between speculative fetches and invalidating
54 * a large page. This is worked around in microcode, but we
55 * need the microcode to have already been loaded... so if it is
56 * not, recommend a BIOS update and disable large pages.
57 */
58 if (c->x86 == 6 && c->x86_model == 0x1c && c->x86_mask <= 2) {
59 u32 ucode, junk;
60
61 wrmsr(MSR_IA32_UCODE_REV, 0, 0);
62 sync_core();
63 rdmsr(MSR_IA32_UCODE_REV, junk, ucode);
64
65 if (ucode < 0x20e) {
66 printk(KERN_WARNING "Atom PSE erratum detected, BIOS microcode update recommended\n");
67 clear_cpu_cap(c, X86_FEATURE_PSE);
68 }
69 }
70
50#ifdef CONFIG_X86_64 71#ifdef CONFIG_X86_64
51 set_cpu_cap(c, X86_FEATURE_SYSENTER32); 72 set_cpu_cap(c, X86_FEATURE_SYSENTER32);
52#else 73#else
@@ -70,8 +91,8 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
70 if (c->x86_power & (1 << 8)) { 91 if (c->x86_power & (1 << 8)) {
71 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); 92 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
72 set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); 93 set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
73 set_cpu_cap(c, X86_FEATURE_TSC_RELIABLE); 94 if (!check_tsc_unstable())
74 sched_clock_stable = 1; 95 sched_clock_stable = 1;
75 } 96 }
76 97
77 /* 98 /*
@@ -263,11 +284,13 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
263 /* Don't do the funky fallback heuristics the AMD version employs 284 /* Don't do the funky fallback heuristics the AMD version employs
264 for now. */ 285 for now. */
265 node = apicid_to_node[apicid]; 286 node = apicid_to_node[apicid];
266 if (node == NUMA_NO_NODE || !node_online(node)) 287 if (node == NUMA_NO_NODE)
267 node = first_node(node_online_map); 288 node = first_node(node_online_map);
289 else if (!node_online(node)) {
290 /* reuse the value from init_cpu_to_node() */
291 node = cpu_to_node(cpu);
292 }
268 numa_set_node(cpu, node); 293 numa_set_node(cpu, node);
269
270 printk(KERN_INFO "CPU %d/0x%x -> Node %d\n", cpu, apicid, node);
271#endif 294#endif
272} 295}
273 296
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 3167c3d72596..94d8e475744c 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -18,6 +18,7 @@
18#include <asm/processor.h> 18#include <asm/processor.h>
19#include <linux/smp.h> 19#include <linux/smp.h>
20#include <asm/k8.h> 20#include <asm/k8.h>
21#include <asm/smp.h>
21 22
22#define LVL_1_INST 1 23#define LVL_1_INST 1
23#define LVL_1_DATA 2 24#define LVL_1_DATA 2
@@ -31,6 +32,8 @@ struct _cache_table {
31 short size; 32 short size;
32}; 33};
33 34
35#define MB(x) ((x) * 1024)
36
34/* All the cache descriptor types we care about (no TLB or 37/* All the cache descriptor types we care about (no TLB or
35 trace cache entries) */ 38 trace cache entries) */
36 39
@@ -44,9 +47,9 @@ static const struct _cache_table __cpuinitconst cache_table[] =
44 { 0x0d, LVL_1_DATA, 16 }, /* 4-way set assoc, 64 byte line size */ 47 { 0x0d, LVL_1_DATA, 16 }, /* 4-way set assoc, 64 byte line size */
45 { 0x21, LVL_2, 256 }, /* 8-way set assoc, 64 byte line size */ 48 { 0x21, LVL_2, 256 }, /* 8-way set assoc, 64 byte line size */
46 { 0x22, LVL_3, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */ 49 { 0x22, LVL_3, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */
47 { 0x23, LVL_3, 1024 }, /* 8-way set assoc, sectored cache, 64 byte line size */ 50 { 0x23, LVL_3, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */
48 { 0x25, LVL_3, 2048 }, /* 8-way set assoc, sectored cache, 64 byte line size */ 51 { 0x25, LVL_3, MB(2) }, /* 8-way set assoc, sectored cache, 64 byte line size */
49 { 0x29, LVL_3, 4096 }, /* 8-way set assoc, sectored cache, 64 byte line size */ 52 { 0x29, LVL_3, MB(4) }, /* 8-way set assoc, sectored cache, 64 byte line size */
50 { 0x2c, LVL_1_DATA, 32 }, /* 8-way set assoc, 64 byte line size */ 53 { 0x2c, LVL_1_DATA, 32 }, /* 8-way set assoc, 64 byte line size */
51 { 0x30, LVL_1_INST, 32 }, /* 8-way set assoc, 64 byte line size */ 54 { 0x30, LVL_1_INST, 32 }, /* 8-way set assoc, 64 byte line size */
52 { 0x39, LVL_2, 128 }, /* 4-way set assoc, sectored cache, 64 byte line size */ 55 { 0x39, LVL_2, 128 }, /* 4-way set assoc, sectored cache, 64 byte line size */
@@ -59,16 +62,16 @@ static const struct _cache_table __cpuinitconst cache_table[] =
59 { 0x41, LVL_2, 128 }, /* 4-way set assoc, 32 byte line size */ 62 { 0x41, LVL_2, 128 }, /* 4-way set assoc, 32 byte line size */
60 { 0x42, LVL_2, 256 }, /* 4-way set assoc, 32 byte line size */ 63 { 0x42, LVL_2, 256 }, /* 4-way set assoc, 32 byte line size */
61 { 0x43, LVL_2, 512 }, /* 4-way set assoc, 32 byte line size */ 64 { 0x43, LVL_2, 512 }, /* 4-way set assoc, 32 byte line size */
62 { 0x44, LVL_2, 1024 }, /* 4-way set assoc, 32 byte line size */ 65 { 0x44, LVL_2, MB(1) }, /* 4-way set assoc, 32 byte line size */
63 { 0x45, LVL_2, 2048 }, /* 4-way set assoc, 32 byte line size */ 66 { 0x45, LVL_2, MB(2) }, /* 4-way set assoc, 32 byte line size */
64 { 0x46, LVL_3, 4096 }, /* 4-way set assoc, 64 byte line size */ 67 { 0x46, LVL_3, MB(4) }, /* 4-way set assoc, 64 byte line size */
65 { 0x47, LVL_3, 8192 }, /* 8-way set assoc, 64 byte line size */ 68 { 0x47, LVL_3, MB(8) }, /* 8-way set assoc, 64 byte line size */
66 { 0x49, LVL_3, 4096 }, /* 16-way set assoc, 64 byte line size */ 69 { 0x49, LVL_3, MB(4) }, /* 16-way set assoc, 64 byte line size */
67 { 0x4a, LVL_3, 6144 }, /* 12-way set assoc, 64 byte line size */ 70 { 0x4a, LVL_3, MB(6) }, /* 12-way set assoc, 64 byte line size */
68 { 0x4b, LVL_3, 8192 }, /* 16-way set assoc, 64 byte line size */ 71 { 0x4b, LVL_3, MB(8) }, /* 16-way set assoc, 64 byte line size */
69 { 0x4c, LVL_3, 12288 }, /* 12-way set assoc, 64 byte line size */ 72 { 0x4c, LVL_3, MB(12) }, /* 12-way set assoc, 64 byte line size */
70 { 0x4d, LVL_3, 16384 }, /* 16-way set assoc, 64 byte line size */ 73 { 0x4d, LVL_3, MB(16) }, /* 16-way set assoc, 64 byte line size */
71 { 0x4e, LVL_2, 6144 }, /* 24-way set assoc, 64 byte line size */ 74 { 0x4e, LVL_2, MB(6) }, /* 24-way set assoc, 64 byte line size */
72 { 0x60, LVL_1_DATA, 16 }, /* 8-way set assoc, sectored cache, 64 byte line size */ 75 { 0x60, LVL_1_DATA, 16 }, /* 8-way set assoc, sectored cache, 64 byte line size */
73 { 0x66, LVL_1_DATA, 8 }, /* 4-way set assoc, sectored cache, 64 byte line size */ 76 { 0x66, LVL_1_DATA, 8 }, /* 4-way set assoc, sectored cache, 64 byte line size */
74 { 0x67, LVL_1_DATA, 16 }, /* 4-way set assoc, sectored cache, 64 byte line size */ 77 { 0x67, LVL_1_DATA, 16 }, /* 4-way set assoc, sectored cache, 64 byte line size */
@@ -77,31 +80,34 @@ static const struct _cache_table __cpuinitconst cache_table[] =
77 { 0x71, LVL_TRACE, 16 }, /* 8-way set assoc */ 80 { 0x71, LVL_TRACE, 16 }, /* 8-way set assoc */
78 { 0x72, LVL_TRACE, 32 }, /* 8-way set assoc */ 81 { 0x72, LVL_TRACE, 32 }, /* 8-way set assoc */
79 { 0x73, LVL_TRACE, 64 }, /* 8-way set assoc */ 82 { 0x73, LVL_TRACE, 64 }, /* 8-way set assoc */
80 { 0x78, LVL_2, 1024 }, /* 4-way set assoc, 64 byte line size */ 83 { 0x78, LVL_2, MB(1) }, /* 4-way set assoc, 64 byte line size */
81 { 0x79, LVL_2, 128 }, /* 8-way set assoc, sectored cache, 64 byte line size */ 84 { 0x79, LVL_2, 128 }, /* 8-way set assoc, sectored cache, 64 byte line size */
82 { 0x7a, LVL_2, 256 }, /* 8-way set assoc, sectored cache, 64 byte line size */ 85 { 0x7a, LVL_2, 256 }, /* 8-way set assoc, sectored cache, 64 byte line size */
83 { 0x7b, LVL_2, 512 }, /* 8-way set assoc, sectored cache, 64 byte line size */ 86 { 0x7b, LVL_2, 512 }, /* 8-way set assoc, sectored cache, 64 byte line size */
84 { 0x7c, LVL_2, 1024 }, /* 8-way set assoc, sectored cache, 64 byte line size */ 87 { 0x7c, LVL_2, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */
85 { 0x7d, LVL_2, 2048 }, /* 8-way set assoc, 64 byte line size */ 88 { 0x7d, LVL_2, MB(2) }, /* 8-way set assoc, 64 byte line size */
86 { 0x7f, LVL_2, 512 }, /* 2-way set assoc, 64 byte line size */ 89 { 0x7f, LVL_2, 512 }, /* 2-way set assoc, 64 byte line size */
87 { 0x82, LVL_2, 256 }, /* 8-way set assoc, 32 byte line size */ 90 { 0x82, LVL_2, 256 }, /* 8-way set assoc, 32 byte line size */
88 { 0x83, LVL_2, 512 }, /* 8-way set assoc, 32 byte line size */ 91 { 0x83, LVL_2, 512 }, /* 8-way set assoc, 32 byte line size */
89 { 0x84, LVL_2, 1024 }, /* 8-way set assoc, 32 byte line size */ 92 { 0x84, LVL_2, MB(1) }, /* 8-way set assoc, 32 byte line size */
90 { 0x85, LVL_2, 2048 }, /* 8-way set assoc, 32 byte line size */ 93 { 0x85, LVL_2, MB(2) }, /* 8-way set assoc, 32 byte line size */
91 { 0x86, LVL_2, 512 }, /* 4-way set assoc, 64 byte line size */ 94 { 0x86, LVL_2, 512 }, /* 4-way set assoc, 64 byte line size */
92 { 0x87, LVL_2, 1024 }, /* 8-way set assoc, 64 byte line size */ 95 { 0x87, LVL_2, MB(1) }, /* 8-way set assoc, 64 byte line size */
93 { 0xd0, LVL_3, 512 }, /* 4-way set assoc, 64 byte line size */ 96 { 0xd0, LVL_3, 512 }, /* 4-way set assoc, 64 byte line size */
94 { 0xd1, LVL_3, 1024 }, /* 4-way set assoc, 64 byte line size */ 97 { 0xd1, LVL_3, MB(1) }, /* 4-way set assoc, 64 byte line size */
95 { 0xd2, LVL_3, 2048 }, /* 4-way set assoc, 64 byte line size */ 98 { 0xd2, LVL_3, MB(2) }, /* 4-way set assoc, 64 byte line size */
96 { 0xd6, LVL_3, 1024 }, /* 8-way set assoc, 64 byte line size */ 99 { 0xd6, LVL_3, MB(1) }, /* 8-way set assoc, 64 byte line size */
97 { 0xd7, LVL_3, 2038 }, /* 8-way set assoc, 64 byte line size */ 100 { 0xd7, LVL_3, MB(2) }, /* 8-way set assoc, 64 byte line size */
98 { 0xd8, LVL_3, 4096 }, /* 12-way set assoc, 64 byte line size */ 101 { 0xd8, LVL_3, MB(4) }, /* 12-way set assoc, 64 byte line size */
99 { 0xdc, LVL_3, 2048 }, /* 12-way set assoc, 64 byte line size */ 102 { 0xdc, LVL_3, MB(2) }, /* 12-way set assoc, 64 byte line size */
100 { 0xdd, LVL_3, 4096 }, /* 12-way set assoc, 64 byte line size */ 103 { 0xdd, LVL_3, MB(4) }, /* 12-way set assoc, 64 byte line size */
101 { 0xde, LVL_3, 8192 }, /* 12-way set assoc, 64 byte line size */ 104 { 0xde, LVL_3, MB(8) }, /* 12-way set assoc, 64 byte line size */
102 { 0xe2, LVL_3, 2048 }, /* 16-way set assoc, 64 byte line size */ 105 { 0xe2, LVL_3, MB(2) }, /* 16-way set assoc, 64 byte line size */
103 { 0xe3, LVL_3, 4096 }, /* 16-way set assoc, 64 byte line size */ 106 { 0xe3, LVL_3, MB(4) }, /* 16-way set assoc, 64 byte line size */
104 { 0xe4, LVL_3, 8192 }, /* 16-way set assoc, 64 byte line size */ 107 { 0xe4, LVL_3, MB(8) }, /* 16-way set assoc, 64 byte line size */
108 { 0xea, LVL_3, MB(12) }, /* 24-way set assoc, 64 byte line size */
109 { 0xeb, LVL_3, MB(18) }, /* 24-way set assoc, 64 byte line size */
110 { 0xec, LVL_3, MB(24) }, /* 24-way set assoc, 64 byte line size */
105 { 0x00, 0, 0} 111 { 0x00, 0, 0}
106}; 112};
107 113
@@ -147,7 +153,8 @@ struct _cpuid4_info {
147 union _cpuid4_leaf_ebx ebx; 153 union _cpuid4_leaf_ebx ebx;
148 union _cpuid4_leaf_ecx ecx; 154 union _cpuid4_leaf_ecx ecx;
149 unsigned long size; 155 unsigned long size;
150 unsigned long can_disable; 156 bool can_disable;
157 unsigned int l3_indices;
151 DECLARE_BITMAP(shared_cpu_map, NR_CPUS); 158 DECLARE_BITMAP(shared_cpu_map, NR_CPUS);
152}; 159};
153 160
@@ -157,7 +164,8 @@ struct _cpuid4_info_regs {
157 union _cpuid4_leaf_ebx ebx; 164 union _cpuid4_leaf_ebx ebx;
158 union _cpuid4_leaf_ecx ecx; 165 union _cpuid4_leaf_ecx ecx;
159 unsigned long size; 166 unsigned long size;
160 unsigned long can_disable; 167 bool can_disable;
168 unsigned int l3_indices;
161}; 169};
162 170
163unsigned short num_cache_leaves; 171unsigned short num_cache_leaves;
@@ -287,6 +295,36 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
287 (ebx->split.ways_of_associativity + 1) - 1; 295 (ebx->split.ways_of_associativity + 1) - 1;
288} 296}
289 297
298struct _cache_attr {
299 struct attribute attr;
300 ssize_t (*show)(struct _cpuid4_info *, char *);
301 ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count);
302};
303
304#ifdef CONFIG_CPU_SUP_AMD
305static unsigned int __cpuinit amd_calc_l3_indices(void)
306{
307 /*
308 * We're called over smp_call_function_single() and therefore
309 * are on the correct cpu.
310 */
311 int cpu = smp_processor_id();
312 int node = cpu_to_node(cpu);
313 struct pci_dev *dev = node_to_k8_nb_misc(node);
314 unsigned int sc0, sc1, sc2, sc3;
315 u32 val = 0;
316
317 pci_read_config_dword(dev, 0x1C4, &val);
318
319 /* calculate subcache sizes */
320 sc0 = !(val & BIT(0));
321 sc1 = !(val & BIT(4));
322 sc2 = !(val & BIT(8)) + !(val & BIT(9));
323 sc3 = !(val & BIT(12)) + !(val & BIT(13));
324
325 return (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1;
326}
327
290static void __cpuinit 328static void __cpuinit
291amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) 329amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
292{ 330{
@@ -296,13 +334,108 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
296 if (boot_cpu_data.x86 == 0x11) 334 if (boot_cpu_data.x86 == 0x11)
297 return; 335 return;
298 336
299 /* see erratum #382 */ 337 /* see errata #382 and #388 */
300 if ((boot_cpu_data.x86 == 0x10) && (boot_cpu_data.x86_model < 0x8)) 338 if ((boot_cpu_data.x86 == 0x10) &&
339 ((boot_cpu_data.x86_model < 0x8) ||
340 (boot_cpu_data.x86_mask < 0x1)))
301 return; 341 return;
302 342
303 this_leaf->can_disable = 1; 343 /* not in virtualized environments */
344 if (num_k8_northbridges == 0)
345 return;
346
347 this_leaf->can_disable = true;
348 this_leaf->l3_indices = amd_calc_l3_indices();
349}
350
351static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
352 unsigned int index)
353{
354 int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
355 int node = amd_get_nb_id(cpu);
356 struct pci_dev *dev = node_to_k8_nb_misc(node);
357 unsigned int reg = 0;
358
359 if (!this_leaf->can_disable)
360 return -EINVAL;
361
362 if (!dev)
363 return -EINVAL;
364
365 pci_read_config_dword(dev, 0x1BC + index * 4, &reg);
366 return sprintf(buf, "0x%08x\n", reg);
304} 367}
305 368
369#define SHOW_CACHE_DISABLE(index) \
370static ssize_t \
371show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf) \
372{ \
373 return show_cache_disable(this_leaf, buf, index); \
374}
375SHOW_CACHE_DISABLE(0)
376SHOW_CACHE_DISABLE(1)
377
378static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
379 const char *buf, size_t count, unsigned int index)
380{
381 int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
382 int node = amd_get_nb_id(cpu);
383 struct pci_dev *dev = node_to_k8_nb_misc(node);
384 unsigned long val = 0;
385
386#define SUBCACHE_MASK (3UL << 20)
387#define SUBCACHE_INDEX 0xfff
388
389 if (!this_leaf->can_disable)
390 return -EINVAL;
391
392 if (!capable(CAP_SYS_ADMIN))
393 return -EPERM;
394
395 if (!dev)
396 return -EINVAL;
397
398 if (strict_strtoul(buf, 10, &val) < 0)
399 return -EINVAL;
400
401 /* do not allow writes outside of allowed bits */
402 if ((val & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) ||
403 ((val & SUBCACHE_INDEX) > this_leaf->l3_indices))
404 return -EINVAL;
405
406 val |= BIT(30);
407 pci_write_config_dword(dev, 0x1BC + index * 4, val);
408 /*
409 * We need to WBINVD on a core on the node containing the L3 cache which
410 * indices we disable therefore a simple wbinvd() is not sufficient.
411 */
412 wbinvd_on_cpu(cpu);
413 pci_write_config_dword(dev, 0x1BC + index * 4, val | BIT(31));
414 return count;
415}
416
417#define STORE_CACHE_DISABLE(index) \
418static ssize_t \
419store_cache_disable_##index(struct _cpuid4_info *this_leaf, \
420 const char *buf, size_t count) \
421{ \
422 return store_cache_disable(this_leaf, buf, count, index); \
423}
424STORE_CACHE_DISABLE(0)
425STORE_CACHE_DISABLE(1)
426
427static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644,
428 show_cache_disable_0, store_cache_disable_0);
429static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644,
430 show_cache_disable_1, store_cache_disable_1);
431
432#else /* CONFIG_CPU_SUP_AMD */
433static void __cpuinit
434amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
435{
436};
437#endif /* CONFIG_CPU_SUP_AMD */
438
306static int 439static int
307__cpuinit cpuid4_cache_lookup_regs(int index, 440__cpuinit cpuid4_cache_lookup_regs(int index,
308 struct _cpuid4_info_regs *this_leaf) 441 struct _cpuid4_info_regs *this_leaf)
@@ -488,22 +621,6 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
488#endif 621#endif
489 } 622 }
490 623
491 if (trace)
492 printk(KERN_INFO "CPU: Trace cache: %dK uops", trace);
493 else if (l1i)
494 printk(KERN_INFO "CPU: L1 I cache: %dK", l1i);
495
496 if (l1d)
497 printk(KERN_CONT ", L1 D cache: %dK\n", l1d);
498 else
499 printk(KERN_CONT "\n");
500
501 if (l2)
502 printk(KERN_INFO "CPU: L2 cache: %dK\n", l2);
503
504 if (l3)
505 printk(KERN_INFO "CPU: L3 cache: %dK\n", l3);
506
507 c->x86_cache_size = l3 ? l3 : (l2 ? l2 : (l1i+l1d)); 624 c->x86_cache_size = l3 ? l3 : (l2 ? l2 : (l1i+l1d));
508 625
509 return l2; 626 return l2;
@@ -512,8 +629,8 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
512#ifdef CONFIG_SYSFS 629#ifdef CONFIG_SYSFS
513 630
514/* pointer to _cpuid4_info array (for each cache leaf) */ 631/* pointer to _cpuid4_info array (for each cache leaf) */
515static DEFINE_PER_CPU(struct _cpuid4_info *, cpuid4_info); 632static DEFINE_PER_CPU(struct _cpuid4_info *, ici_cpuid4_info);
516#define CPUID4_INFO_IDX(x, y) (&((per_cpu(cpuid4_info, x))[y])) 633#define CPUID4_INFO_IDX(x, y) (&((per_cpu(ici_cpuid4_info, x))[y]))
517 634
518/* returns CPUs that share the index cache with cpu */ 635/* returns CPUs that share the index cache with cpu */
519int get_shared_cpu_map(cpumask_var_t mask, unsigned int cpu, int index) 636int get_shared_cpu_map(cpumask_var_t mask, unsigned int cpu, int index)
@@ -537,18 +654,19 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
537{ 654{
538 struct _cpuid4_info *this_leaf, *sibling_leaf; 655 struct _cpuid4_info *this_leaf, *sibling_leaf;
539 unsigned long num_threads_sharing; 656 unsigned long num_threads_sharing;
540 int index_msb, i; 657 int index_msb, i, sibling;
541 struct cpuinfo_x86 *c = &cpu_data(cpu); 658 struct cpuinfo_x86 *c = &cpu_data(cpu);
542 659
543 if ((index == 3) && (c->x86_vendor == X86_VENDOR_AMD)) { 660 if ((index == 3) && (c->x86_vendor == X86_VENDOR_AMD)) {
544 struct cpuinfo_x86 *d; 661 for_each_cpu(i, c->llc_shared_map) {
545 for_each_online_cpu(i) { 662 if (!per_cpu(ici_cpuid4_info, i))
546 if (!per_cpu(cpuid4_info, i))
547 continue; 663 continue;
548 d = &cpu_data(i);
549 this_leaf = CPUID4_INFO_IDX(i, index); 664 this_leaf = CPUID4_INFO_IDX(i, index);
550 cpumask_copy(to_cpumask(this_leaf->shared_cpu_map), 665 for_each_cpu(sibling, c->llc_shared_map) {
551 d->llc_shared_map); 666 if (!cpu_online(sibling))
667 continue;
668 set_bit(sibling, this_leaf->shared_cpu_map);
669 }
552 } 670 }
553 return; 671 return;
554 } 672 }
@@ -565,7 +683,7 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
565 c->apicid >> index_msb) { 683 c->apicid >> index_msb) {
566 cpumask_set_cpu(i, 684 cpumask_set_cpu(i,
567 to_cpumask(this_leaf->shared_cpu_map)); 685 to_cpumask(this_leaf->shared_cpu_map));
568 if (i != cpu && per_cpu(cpuid4_info, i)) { 686 if (i != cpu && per_cpu(ici_cpuid4_info, i)) {
569 sibling_leaf = 687 sibling_leaf =
570 CPUID4_INFO_IDX(i, index); 688 CPUID4_INFO_IDX(i, index);
571 cpumask_set_cpu(cpu, to_cpumask( 689 cpumask_set_cpu(cpu, to_cpumask(
@@ -604,8 +722,8 @@ static void __cpuinit free_cache_attributes(unsigned int cpu)
604 for (i = 0; i < num_cache_leaves; i++) 722 for (i = 0; i < num_cache_leaves; i++)
605 cache_remove_shared_cpu_map(cpu, i); 723 cache_remove_shared_cpu_map(cpu, i);
606 724
607 kfree(per_cpu(cpuid4_info, cpu)); 725 kfree(per_cpu(ici_cpuid4_info, cpu));
608 per_cpu(cpuid4_info, cpu) = NULL; 726 per_cpu(ici_cpuid4_info, cpu) = NULL;
609} 727}
610 728
611static int 729static int
@@ -644,15 +762,15 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu)
644 if (num_cache_leaves == 0) 762 if (num_cache_leaves == 0)
645 return -ENOENT; 763 return -ENOENT;
646 764
647 per_cpu(cpuid4_info, cpu) = kzalloc( 765 per_cpu(ici_cpuid4_info, cpu) = kzalloc(
648 sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL); 766 sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL);
649 if (per_cpu(cpuid4_info, cpu) == NULL) 767 if (per_cpu(ici_cpuid4_info, cpu) == NULL)
650 return -ENOMEM; 768 return -ENOMEM;
651 769
652 smp_call_function_single(cpu, get_cpu_leaves, &retval, true); 770 smp_call_function_single(cpu, get_cpu_leaves, &retval, true);
653 if (retval) { 771 if (retval) {
654 kfree(per_cpu(cpuid4_info, cpu)); 772 kfree(per_cpu(ici_cpuid4_info, cpu));
655 per_cpu(cpuid4_info, cpu) = NULL; 773 per_cpu(ici_cpuid4_info, cpu) = NULL;
656 } 774 }
657 775
658 return retval; 776 return retval;
@@ -664,7 +782,7 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu)
664extern struct sysdev_class cpu_sysdev_class; /* from drivers/base/cpu.c */ 782extern struct sysdev_class cpu_sysdev_class; /* from drivers/base/cpu.c */
665 783
666/* pointer to kobject for cpuX/cache */ 784/* pointer to kobject for cpuX/cache */
667static DEFINE_PER_CPU(struct kobject *, cache_kobject); 785static DEFINE_PER_CPU(struct kobject *, ici_cache_kobject);
668 786
669struct _index_kobject { 787struct _index_kobject {
670 struct kobject kobj; 788 struct kobject kobj;
@@ -673,8 +791,8 @@ struct _index_kobject {
673}; 791};
674 792
675/* pointer to array of kobjects for cpuX/cache/indexY */ 793/* pointer to array of kobjects for cpuX/cache/indexY */
676static DEFINE_PER_CPU(struct _index_kobject *, index_kobject); 794static DEFINE_PER_CPU(struct _index_kobject *, ici_index_kobject);
677#define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(index_kobject, x))[y])) 795#define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(ici_index_kobject, x))[y]))
678 796
679#define show_one_plus(file_name, object, val) \ 797#define show_one_plus(file_name, object, val) \
680static ssize_t show_##file_name \ 798static ssize_t show_##file_name \
@@ -740,82 +858,6 @@ static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf)
740#define to_object(k) container_of(k, struct _index_kobject, kobj) 858#define to_object(k) container_of(k, struct _index_kobject, kobj)
741#define to_attr(a) container_of(a, struct _cache_attr, attr) 859#define to_attr(a) container_of(a, struct _cache_attr, attr)
742 860
743static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
744 unsigned int index)
745{
746 int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
747 int node = cpu_to_node(cpu);
748 struct pci_dev *dev = node_to_k8_nb_misc(node);
749 unsigned int reg = 0;
750
751 if (!this_leaf->can_disable)
752 return -EINVAL;
753
754 if (!dev)
755 return -EINVAL;
756
757 pci_read_config_dword(dev, 0x1BC + index * 4, &reg);
758 return sprintf(buf, "%x\n", reg);
759}
760
761#define SHOW_CACHE_DISABLE(index) \
762static ssize_t \
763show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf) \
764{ \
765 return show_cache_disable(this_leaf, buf, index); \
766}
767SHOW_CACHE_DISABLE(0)
768SHOW_CACHE_DISABLE(1)
769
770static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
771 const char *buf, size_t count, unsigned int index)
772{
773 int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
774 int node = cpu_to_node(cpu);
775 struct pci_dev *dev = node_to_k8_nb_misc(node);
776 unsigned long val = 0;
777 unsigned int scrubber = 0;
778
779 if (!this_leaf->can_disable)
780 return -EINVAL;
781
782 if (!capable(CAP_SYS_ADMIN))
783 return -EPERM;
784
785 if (!dev)
786 return -EINVAL;
787
788 if (strict_strtoul(buf, 10, &val) < 0)
789 return -EINVAL;
790
791 val |= 0xc0000000;
792
793 pci_read_config_dword(dev, 0x58, &scrubber);
794 scrubber &= ~0x1f000000;
795 pci_write_config_dword(dev, 0x58, scrubber);
796
797 pci_write_config_dword(dev, 0x1BC + index * 4, val & ~0x40000000);
798 wbinvd();
799 pci_write_config_dword(dev, 0x1BC + index * 4, val);
800 return count;
801}
802
803#define STORE_CACHE_DISABLE(index) \
804static ssize_t \
805store_cache_disable_##index(struct _cpuid4_info *this_leaf, \
806 const char *buf, size_t count) \
807{ \
808 return store_cache_disable(this_leaf, buf, count, index); \
809}
810STORE_CACHE_DISABLE(0)
811STORE_CACHE_DISABLE(1)
812
813struct _cache_attr {
814 struct attribute attr;
815 ssize_t (*show)(struct _cpuid4_info *, char *);
816 ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count);
817};
818
819#define define_one_ro(_name) \ 861#define define_one_ro(_name) \
820static struct _cache_attr _name = \ 862static struct _cache_attr _name = \
821 __ATTR(_name, 0444, show_##_name, NULL) 863 __ATTR(_name, 0444, show_##_name, NULL)
@@ -830,23 +872,28 @@ define_one_ro(size);
830define_one_ro(shared_cpu_map); 872define_one_ro(shared_cpu_map);
831define_one_ro(shared_cpu_list); 873define_one_ro(shared_cpu_list);
832 874
833static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644, 875#define DEFAULT_SYSFS_CACHE_ATTRS \
834 show_cache_disable_0, store_cache_disable_0); 876 &type.attr, \
835static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, 877 &level.attr, \
836 show_cache_disable_1, store_cache_disable_1); 878 &coherency_line_size.attr, \
879 &physical_line_partition.attr, \
880 &ways_of_associativity.attr, \
881 &number_of_sets.attr, \
882 &size.attr, \
883 &shared_cpu_map.attr, \
884 &shared_cpu_list.attr
837 885
838static struct attribute *default_attrs[] = { 886static struct attribute *default_attrs[] = {
839 &type.attr, 887 DEFAULT_SYSFS_CACHE_ATTRS,
840 &level.attr, 888 NULL
841 &coherency_line_size.attr, 889};
842 &physical_line_partition.attr, 890
843 &ways_of_associativity.attr, 891static struct attribute *default_l3_attrs[] = {
844 &number_of_sets.attr, 892 DEFAULT_SYSFS_CACHE_ATTRS,
845 &size.attr, 893#ifdef CONFIG_CPU_SUP_AMD
846 &shared_cpu_map.attr,
847 &shared_cpu_list.attr,
848 &cache_disable_0.attr, 894 &cache_disable_0.attr,
849 &cache_disable_1.attr, 895 &cache_disable_1.attr,
896#endif
850 NULL 897 NULL
851}; 898};
852 899
@@ -877,7 +924,7 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr,
877 return ret; 924 return ret;
878} 925}
879 926
880static struct sysfs_ops sysfs_ops = { 927static const struct sysfs_ops sysfs_ops = {
881 .show = show, 928 .show = show,
882 .store = store, 929 .store = store,
883}; 930};
@@ -893,10 +940,10 @@ static struct kobj_type ktype_percpu_entry = {
893 940
894static void __cpuinit cpuid4_cache_sysfs_exit(unsigned int cpu) 941static void __cpuinit cpuid4_cache_sysfs_exit(unsigned int cpu)
895{ 942{
896 kfree(per_cpu(cache_kobject, cpu)); 943 kfree(per_cpu(ici_cache_kobject, cpu));
897 kfree(per_cpu(index_kobject, cpu)); 944 kfree(per_cpu(ici_index_kobject, cpu));
898 per_cpu(cache_kobject, cpu) = NULL; 945 per_cpu(ici_cache_kobject, cpu) = NULL;
899 per_cpu(index_kobject, cpu) = NULL; 946 per_cpu(ici_index_kobject, cpu) = NULL;
900 free_cache_attributes(cpu); 947 free_cache_attributes(cpu);
901} 948}
902 949
@@ -912,14 +959,14 @@ static int __cpuinit cpuid4_cache_sysfs_init(unsigned int cpu)
912 return err; 959 return err;
913 960
914 /* Allocate all required memory */ 961 /* Allocate all required memory */
915 per_cpu(cache_kobject, cpu) = 962 per_cpu(ici_cache_kobject, cpu) =
916 kzalloc(sizeof(struct kobject), GFP_KERNEL); 963 kzalloc(sizeof(struct kobject), GFP_KERNEL);
917 if (unlikely(per_cpu(cache_kobject, cpu) == NULL)) 964 if (unlikely(per_cpu(ici_cache_kobject, cpu) == NULL))
918 goto err_out; 965 goto err_out;
919 966
920 per_cpu(index_kobject, cpu) = kzalloc( 967 per_cpu(ici_index_kobject, cpu) = kzalloc(
921 sizeof(struct _index_kobject) * num_cache_leaves, GFP_KERNEL); 968 sizeof(struct _index_kobject) * num_cache_leaves, GFP_KERNEL);
922 if (unlikely(per_cpu(index_kobject, cpu) == NULL)) 969 if (unlikely(per_cpu(ici_index_kobject, cpu) == NULL))
923 goto err_out; 970 goto err_out;
924 971
925 return 0; 972 return 0;
@@ -937,13 +984,14 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
937 unsigned int cpu = sys_dev->id; 984 unsigned int cpu = sys_dev->id;
938 unsigned long i, j; 985 unsigned long i, j;
939 struct _index_kobject *this_object; 986 struct _index_kobject *this_object;
987 struct _cpuid4_info *this_leaf;
940 int retval; 988 int retval;
941 989
942 retval = cpuid4_cache_sysfs_init(cpu); 990 retval = cpuid4_cache_sysfs_init(cpu);
943 if (unlikely(retval < 0)) 991 if (unlikely(retval < 0))
944 return retval; 992 return retval;
945 993
946 retval = kobject_init_and_add(per_cpu(cache_kobject, cpu), 994 retval = kobject_init_and_add(per_cpu(ici_cache_kobject, cpu),
947 &ktype_percpu_entry, 995 &ktype_percpu_entry,
948 &sys_dev->kobj, "%s", "cache"); 996 &sys_dev->kobj, "%s", "cache");
949 if (retval < 0) { 997 if (retval < 0) {
@@ -955,14 +1003,22 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
955 this_object = INDEX_KOBJECT_PTR(cpu, i); 1003 this_object = INDEX_KOBJECT_PTR(cpu, i);
956 this_object->cpu = cpu; 1004 this_object->cpu = cpu;
957 this_object->index = i; 1005 this_object->index = i;
1006
1007 this_leaf = CPUID4_INFO_IDX(cpu, i);
1008
1009 if (this_leaf->can_disable)
1010 ktype_cache.default_attrs = default_l3_attrs;
1011 else
1012 ktype_cache.default_attrs = default_attrs;
1013
958 retval = kobject_init_and_add(&(this_object->kobj), 1014 retval = kobject_init_and_add(&(this_object->kobj),
959 &ktype_cache, 1015 &ktype_cache,
960 per_cpu(cache_kobject, cpu), 1016 per_cpu(ici_cache_kobject, cpu),
961 "index%1lu", i); 1017 "index%1lu", i);
962 if (unlikely(retval)) { 1018 if (unlikely(retval)) {
963 for (j = 0; j < i; j++) 1019 for (j = 0; j < i; j++)
964 kobject_put(&(INDEX_KOBJECT_PTR(cpu, j)->kobj)); 1020 kobject_put(&(INDEX_KOBJECT_PTR(cpu, j)->kobj));
965 kobject_put(per_cpu(cache_kobject, cpu)); 1021 kobject_put(per_cpu(ici_cache_kobject, cpu));
966 cpuid4_cache_sysfs_exit(cpu); 1022 cpuid4_cache_sysfs_exit(cpu);
967 return retval; 1023 return retval;
968 } 1024 }
@@ -970,7 +1026,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
970 } 1026 }
971 cpumask_set_cpu(cpu, to_cpumask(cache_dev_map)); 1027 cpumask_set_cpu(cpu, to_cpumask(cache_dev_map));
972 1028
973 kobject_uevent(per_cpu(cache_kobject, cpu), KOBJ_ADD); 1029 kobject_uevent(per_cpu(ici_cache_kobject, cpu), KOBJ_ADD);
974 return 0; 1030 return 0;
975} 1031}
976 1032
@@ -979,7 +1035,7 @@ static void __cpuinit cache_remove_dev(struct sys_device * sys_dev)
979 unsigned int cpu = sys_dev->id; 1035 unsigned int cpu = sys_dev->id;
980 unsigned long i; 1036 unsigned long i;
981 1037
982 if (per_cpu(cpuid4_info, cpu) == NULL) 1038 if (per_cpu(ici_cpuid4_info, cpu) == NULL)
983 return; 1039 return;
984 if (!cpumask_test_cpu(cpu, to_cpumask(cache_dev_map))) 1040 if (!cpumask_test_cpu(cpu, to_cpumask(cache_dev_map)))
985 return; 1041 return;
@@ -987,7 +1043,7 @@ static void __cpuinit cache_remove_dev(struct sys_device * sys_dev)
987 1043
988 for (i = 0; i < num_cache_leaves; i++) 1044 for (i = 0; i < num_cache_leaves; i++)
989 kobject_put(&(INDEX_KOBJECT_PTR(cpu, i)->kobj)); 1045 kobject_put(&(INDEX_KOBJECT_PTR(cpu, i)->kobj));
990 kobject_put(per_cpu(cache_kobject, cpu)); 1046 kobject_put(per_cpu(ici_cache_kobject, cpu));
991 cpuid4_cache_sysfs_exit(cpu); 1047 cpuid4_cache_sysfs_exit(cpu);
992} 1048}
993 1049
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
index 472763d92098..e7dbde7bfedb 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-inject.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -22,6 +22,7 @@
22#include <linux/kdebug.h> 22#include <linux/kdebug.h>
23#include <linux/cpu.h> 23#include <linux/cpu.h>
24#include <linux/sched.h> 24#include <linux/sched.h>
25#include <linux/gfp.h>
25#include <asm/mce.h> 26#include <asm/mce.h>
26#include <asm/apic.h> 27#include <asm/apic.h>
27 28
@@ -74,7 +75,7 @@ static void raise_exception(struct mce *m, struct pt_regs *pregs)
74 m->finished = 0; 75 m->finished = 0;
75} 76}
76 77
77static cpumask_t mce_inject_cpumask; 78static cpumask_var_t mce_inject_cpumask;
78 79
79static int mce_raise_notify(struct notifier_block *self, 80static int mce_raise_notify(struct notifier_block *self,
80 unsigned long val, void *data) 81 unsigned long val, void *data)
@@ -82,9 +83,9 @@ static int mce_raise_notify(struct notifier_block *self,
82 struct die_args *args = (struct die_args *)data; 83 struct die_args *args = (struct die_args *)data;
83 int cpu = smp_processor_id(); 84 int cpu = smp_processor_id();
84 struct mce *m = &__get_cpu_var(injectm); 85 struct mce *m = &__get_cpu_var(injectm);
85 if (val != DIE_NMI_IPI || !cpu_isset(cpu, mce_inject_cpumask)) 86 if (val != DIE_NMI_IPI || !cpumask_test_cpu(cpu, mce_inject_cpumask))
86 return NOTIFY_DONE; 87 return NOTIFY_DONE;
87 cpu_clear(cpu, mce_inject_cpumask); 88 cpumask_clear_cpu(cpu, mce_inject_cpumask);
88 if (m->inject_flags & MCJ_EXCEPTION) 89 if (m->inject_flags & MCJ_EXCEPTION)
89 raise_exception(m, args->regs); 90 raise_exception(m, args->regs);
90 else if (m->status) 91 else if (m->status)
@@ -148,22 +149,22 @@ static void raise_mce(struct mce *m)
148 unsigned long start; 149 unsigned long start;
149 int cpu; 150 int cpu;
150 get_online_cpus(); 151 get_online_cpus();
151 mce_inject_cpumask = cpu_online_map; 152 cpumask_copy(mce_inject_cpumask, cpu_online_mask);
152 cpu_clear(get_cpu(), mce_inject_cpumask); 153 cpumask_clear_cpu(get_cpu(), mce_inject_cpumask);
153 for_each_online_cpu(cpu) { 154 for_each_online_cpu(cpu) {
154 struct mce *mcpu = &per_cpu(injectm, cpu); 155 struct mce *mcpu = &per_cpu(injectm, cpu);
155 if (!mcpu->finished || 156 if (!mcpu->finished ||
156 MCJ_CTX(mcpu->inject_flags) != MCJ_CTX_RANDOM) 157 MCJ_CTX(mcpu->inject_flags) != MCJ_CTX_RANDOM)
157 cpu_clear(cpu, mce_inject_cpumask); 158 cpumask_clear_cpu(cpu, mce_inject_cpumask);
158 } 159 }
159 if (!cpus_empty(mce_inject_cpumask)) 160 if (!cpumask_empty(mce_inject_cpumask))
160 apic->send_IPI_mask(&mce_inject_cpumask, NMI_VECTOR); 161 apic->send_IPI_mask(mce_inject_cpumask, NMI_VECTOR);
161 start = jiffies; 162 start = jiffies;
162 while (!cpus_empty(mce_inject_cpumask)) { 163 while (!cpumask_empty(mce_inject_cpumask)) {
163 if (!time_before(jiffies, start + 2*HZ)) { 164 if (!time_before(jiffies, start + 2*HZ)) {
164 printk(KERN_ERR 165 printk(KERN_ERR
165 "Timeout waiting for mce inject NMI %lx\n", 166 "Timeout waiting for mce inject NMI %lx\n",
166 *cpus_addr(mce_inject_cpumask)); 167 *cpumask_bits(mce_inject_cpumask));
167 break; 168 break;
168 } 169 }
169 cpu_relax(); 170 cpu_relax();
@@ -210,6 +211,8 @@ static ssize_t mce_write(struct file *filp, const char __user *ubuf,
210 211
211static int inject_init(void) 212static int inject_init(void)
212{ 213{
214 if (!alloc_cpumask_var(&mce_inject_cpumask, GFP_KERNEL))
215 return -ENOMEM;
213 printk(KERN_INFO "Machine check injector initialized\n"); 216 printk(KERN_INFO "Machine check injector initialized\n");
214 mce_chrdev_ops.write = mce_write; 217 mce_chrdev_ops.write = mce_write;
215 register_die_notifier(&mce_raise_nb); 218 register_die_notifier(&mce_raise_nb);
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 721a77ca8115..8a6f0afa767e 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -26,6 +26,7 @@
26#include <linux/sched.h> 26#include <linux/sched.h>
27#include <linux/sysfs.h> 27#include <linux/sysfs.h>
28#include <linux/types.h> 28#include <linux/types.h>
29#include <linux/slab.h>
29#include <linux/init.h> 30#include <linux/init.h>
30#include <linux/kmod.h> 31#include <linux/kmod.h>
31#include <linux/poll.h> 32#include <linux/poll.h>
@@ -46,6 +47,16 @@
46 47
47#include "mce-internal.h" 48#include "mce-internal.h"
48 49
50static DEFINE_MUTEX(mce_read_mutex);
51
52#define rcu_dereference_check_mce(p) \
53 rcu_dereference_check((p), \
54 rcu_read_lock_sched_held() || \
55 lockdep_is_held(&mce_read_mutex))
56
57#define CREATE_TRACE_POINTS
58#include <trace/events/mce.h>
59
49int mce_disabled __read_mostly; 60int mce_disabled __read_mostly;
50 61
51#define MISC_MCELOG_MINOR 227 62#define MISC_MCELOG_MINOR 227
@@ -85,18 +96,26 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
85static DEFINE_PER_CPU(struct mce, mces_seen); 96static DEFINE_PER_CPU(struct mce, mces_seen);
86static int cpu_missing; 97static int cpu_missing;
87 98
88static void default_decode_mce(struct mce *m) 99/*
100 * CPU/chipset specific EDAC code can register a notifier call here to print
101 * MCE errors in a human-readable form.
102 */
103ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
104EXPORT_SYMBOL_GPL(x86_mce_decoder_chain);
105
106static int default_decode_mce(struct notifier_block *nb, unsigned long val,
107 void *data)
89{ 108{
90 pr_emerg("No human readable MCE decoding support on this CPU type.\n"); 109 pr_emerg("No human readable MCE decoding support on this CPU type.\n");
91 pr_emerg("Run the message through 'mcelog --ascii' to decode.\n"); 110 pr_emerg("Run the message through 'mcelog --ascii' to decode.\n");
111
112 return NOTIFY_STOP;
92} 113}
93 114
94/* 115static struct notifier_block mce_dec_nb = {
95 * CPU/chipset specific EDAC code can register a callback here to print 116 .notifier_call = default_decode_mce,
96 * MCE errors in a human-readable form: 117 .priority = -1,
97 */ 118};
98void (*x86_mce_decode_callback)(struct mce *m) = default_decode_mce;
99EXPORT_SYMBOL(x86_mce_decode_callback);
100 119
101/* MCA banks polled by the period polling timer for corrected events */ 120/* MCA banks polled by the period polling timer for corrected events */
102DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { 121DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
@@ -141,10 +160,13 @@ void mce_log(struct mce *mce)
141{ 160{
142 unsigned next, entry; 161 unsigned next, entry;
143 162
163 /* Emit the trace record: */
164 trace_mce_record(mce);
165
144 mce->finished = 0; 166 mce->finished = 0;
145 wmb(); 167 wmb();
146 for (;;) { 168 for (;;) {
147 entry = rcu_dereference(mcelog.next); 169 entry = rcu_dereference_check_mce(mcelog.next);
148 for (;;) { 170 for (;;) {
149 /* 171 /*
150 * When the buffer fills up discard new entries. 172 * When the buffer fills up discard new entries.
@@ -204,9 +226,9 @@ static void print_mce(struct mce *m)
204 226
205 /* 227 /*
206 * Print out human-readable details about the MCE error, 228 * Print out human-readable details about the MCE error,
207 * (if the CPU has an implementation for that): 229 * (if the CPU has an implementation for that)
208 */ 230 */
209 x86_mce_decode_callback(m); 231 atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
210} 232}
211 233
212static void print_mce_head(void) 234static void print_mce_head(void)
@@ -1122,7 +1144,7 @@ static int check_interval = 5 * 60; /* 5 minutes */
1122static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */ 1144static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */
1123static DEFINE_PER_CPU(struct timer_list, mce_timer); 1145static DEFINE_PER_CPU(struct timer_list, mce_timer);
1124 1146
1125static void mcheck_timer(unsigned long data) 1147static void mce_start_timer(unsigned long data)
1126{ 1148{
1127 struct timer_list *t = &per_cpu(mce_timer, data); 1149 struct timer_list *t = &per_cpu(mce_timer, data);
1128 int *n; 1150 int *n;
@@ -1187,7 +1209,7 @@ int mce_notify_irq(void)
1187} 1209}
1188EXPORT_SYMBOL_GPL(mce_notify_irq); 1210EXPORT_SYMBOL_GPL(mce_notify_irq);
1189 1211
1190static int mce_banks_init(void) 1212static int __cpuinit __mcheck_cpu_mce_banks_init(void)
1191{ 1213{
1192 int i; 1214 int i;
1193 1215
@@ -1206,7 +1228,7 @@ static int mce_banks_init(void)
1206/* 1228/*
1207 * Initialize Machine Checks for a CPU. 1229 * Initialize Machine Checks for a CPU.
1208 */ 1230 */
1209static int __cpuinit mce_cap_init(void) 1231static int __cpuinit __mcheck_cpu_cap_init(void)
1210{ 1232{
1211 unsigned b; 1233 unsigned b;
1212 u64 cap; 1234 u64 cap;
@@ -1228,7 +1250,7 @@ static int __cpuinit mce_cap_init(void)
1228 WARN_ON(banks != 0 && b != banks); 1250 WARN_ON(banks != 0 && b != banks);
1229 banks = b; 1251 banks = b;
1230 if (!mce_banks) { 1252 if (!mce_banks) {
1231 int err = mce_banks_init(); 1253 int err = __mcheck_cpu_mce_banks_init();
1232 1254
1233 if (err) 1255 if (err)
1234 return err; 1256 return err;
@@ -1244,7 +1266,7 @@ static int __cpuinit mce_cap_init(void)
1244 return 0; 1266 return 0;
1245} 1267}
1246 1268
1247static void mce_init(void) 1269static void __mcheck_cpu_init_generic(void)
1248{ 1270{
1249 mce_banks_t all_banks; 1271 mce_banks_t all_banks;
1250 u64 cap; 1272 u64 cap;
@@ -1273,7 +1295,7 @@ static void mce_init(void)
1273} 1295}
1274 1296
1275/* Add per CPU specific workarounds here */ 1297/* Add per CPU specific workarounds here */
1276static int __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c) 1298static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
1277{ 1299{
1278 if (c->x86_vendor == X86_VENDOR_UNKNOWN) { 1300 if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
1279 pr_info("MCE: unknown CPU type - not enabling MCE support.\n"); 1301 pr_info("MCE: unknown CPU type - not enabling MCE support.\n");
@@ -1341,7 +1363,7 @@ static int __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
1341 return 0; 1363 return 0;
1342} 1364}
1343 1365
1344static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c) 1366static void __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
1345{ 1367{
1346 if (c->x86 != 5) 1368 if (c->x86 != 5)
1347 return; 1369 return;
@@ -1355,7 +1377,7 @@ static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c)
1355 } 1377 }
1356} 1378}
1357 1379
1358static void mce_cpu_features(struct cpuinfo_x86 *c) 1380static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
1359{ 1381{
1360 switch (c->x86_vendor) { 1382 switch (c->x86_vendor) {
1361 case X86_VENDOR_INTEL: 1383 case X86_VENDOR_INTEL:
@@ -1369,18 +1391,19 @@ static void mce_cpu_features(struct cpuinfo_x86 *c)
1369 } 1391 }
1370} 1392}
1371 1393
1372static void mce_init_timer(void) 1394static void __mcheck_cpu_init_timer(void)
1373{ 1395{
1374 struct timer_list *t = &__get_cpu_var(mce_timer); 1396 struct timer_list *t = &__get_cpu_var(mce_timer);
1375 int *n = &__get_cpu_var(mce_next_interval); 1397 int *n = &__get_cpu_var(mce_next_interval);
1376 1398
1399 setup_timer(t, mce_start_timer, smp_processor_id());
1400
1377 if (mce_ignore_ce) 1401 if (mce_ignore_ce)
1378 return; 1402 return;
1379 1403
1380 *n = check_interval * HZ; 1404 *n = check_interval * HZ;
1381 if (!*n) 1405 if (!*n)
1382 return; 1406 return;
1383 setup_timer(t, mcheck_timer, smp_processor_id());
1384 t->expires = round_jiffies(jiffies + *n); 1407 t->expires = round_jiffies(jiffies + *n);
1385 add_timer_on(t, smp_processor_id()); 1408 add_timer_on(t, smp_processor_id());
1386} 1409}
@@ -1400,27 +1423,28 @@ void (*machine_check_vector)(struct pt_regs *, long error_code) =
1400 * Called for each booted CPU to set up machine checks. 1423 * Called for each booted CPU to set up machine checks.
1401 * Must be called with preempt off: 1424 * Must be called with preempt off:
1402 */ 1425 */
1403void __cpuinit mcheck_init(struct cpuinfo_x86 *c) 1426void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c)
1404{ 1427{
1405 if (mce_disabled) 1428 if (mce_disabled)
1406 return; 1429 return;
1407 1430
1408 mce_ancient_init(c); 1431 __mcheck_cpu_ancient_init(c);
1409 1432
1410 if (!mce_available(c)) 1433 if (!mce_available(c))
1411 return; 1434 return;
1412 1435
1413 if (mce_cap_init() < 0 || mce_cpu_quirks(c) < 0) { 1436 if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) {
1414 mce_disabled = 1; 1437 mce_disabled = 1;
1415 return; 1438 return;
1416 } 1439 }
1417 1440
1418 machine_check_vector = do_machine_check; 1441 machine_check_vector = do_machine_check;
1419 1442
1420 mce_init(); 1443 __mcheck_cpu_init_generic();
1421 mce_cpu_features(c); 1444 __mcheck_cpu_init_vendor(c);
1422 mce_init_timer(); 1445 __mcheck_cpu_init_timer();
1423 INIT_WORK(&__get_cpu_var(mce_work), mce_process_work); 1446 INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
1447
1424} 1448}
1425 1449
1426/* 1450/*
@@ -1469,8 +1493,6 @@ static void collect_tscs(void *data)
1469 rdtscll(cpu_tsc[smp_processor_id()]); 1493 rdtscll(cpu_tsc[smp_processor_id()]);
1470} 1494}
1471 1495
1472static DEFINE_MUTEX(mce_read_mutex);
1473
1474static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, 1496static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
1475 loff_t *off) 1497 loff_t *off)
1476{ 1498{
@@ -1484,7 +1506,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
1484 return -ENOMEM; 1506 return -ENOMEM;
1485 1507
1486 mutex_lock(&mce_read_mutex); 1508 mutex_lock(&mce_read_mutex);
1487 next = rcu_dereference(mcelog.next); 1509 next = rcu_dereference_check_mce(mcelog.next);
1488 1510
1489 /* Only supports full reads right now */ 1511 /* Only supports full reads right now */
1490 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) { 1512 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
@@ -1549,7 +1571,7 @@ timeout:
1549static unsigned int mce_poll(struct file *file, poll_table *wait) 1571static unsigned int mce_poll(struct file *file, poll_table *wait)
1550{ 1572{
1551 poll_wait(file, &mce_wait, wait); 1573 poll_wait(file, &mce_wait, wait);
1552 if (rcu_dereference(mcelog.next)) 1574 if (rcu_dereference_check_mce(mcelog.next))
1553 return POLLIN | POLLRDNORM; 1575 return POLLIN | POLLRDNORM;
1554 return 0; 1576 return 0;
1555} 1577}
@@ -1640,6 +1662,15 @@ static int __init mcheck_enable(char *str)
1640} 1662}
1641__setup("mce", mcheck_enable); 1663__setup("mce", mcheck_enable);
1642 1664
1665int __init mcheck_init(void)
1666{
1667 atomic_notifier_chain_register(&x86_mce_decoder_chain, &mce_dec_nb);
1668
1669 mcheck_intel_therm_init();
1670
1671 return 0;
1672}
1673
1643/* 1674/*
1644 * Sysfs support 1675 * Sysfs support
1645 */ 1676 */
@@ -1648,7 +1679,7 @@ __setup("mce", mcheck_enable);
1648 * Disable machine checks on suspend and shutdown. We can't really handle 1679 * Disable machine checks on suspend and shutdown. We can't really handle
1649 * them later. 1680 * them later.
1650 */ 1681 */
1651static int mce_disable(void) 1682static int mce_disable_error_reporting(void)
1652{ 1683{
1653 int i; 1684 int i;
1654 1685
@@ -1663,12 +1694,12 @@ static int mce_disable(void)
1663 1694
1664static int mce_suspend(struct sys_device *dev, pm_message_t state) 1695static int mce_suspend(struct sys_device *dev, pm_message_t state)
1665{ 1696{
1666 return mce_disable(); 1697 return mce_disable_error_reporting();
1667} 1698}
1668 1699
1669static int mce_shutdown(struct sys_device *dev) 1700static int mce_shutdown(struct sys_device *dev)
1670{ 1701{
1671 return mce_disable(); 1702 return mce_disable_error_reporting();
1672} 1703}
1673 1704
1674/* 1705/*
@@ -1678,8 +1709,8 @@ static int mce_shutdown(struct sys_device *dev)
1678 */ 1709 */
1679static int mce_resume(struct sys_device *dev) 1710static int mce_resume(struct sys_device *dev)
1680{ 1711{
1681 mce_init(); 1712 __mcheck_cpu_init_generic();
1682 mce_cpu_features(&current_cpu_data); 1713 __mcheck_cpu_init_vendor(&current_cpu_data);
1683 1714
1684 return 0; 1715 return 0;
1685} 1716}
@@ -1689,8 +1720,8 @@ static void mce_cpu_restart(void *data)
1689 del_timer_sync(&__get_cpu_var(mce_timer)); 1720 del_timer_sync(&__get_cpu_var(mce_timer));
1690 if (!mce_available(&current_cpu_data)) 1721 if (!mce_available(&current_cpu_data))
1691 return; 1722 return;
1692 mce_init(); 1723 __mcheck_cpu_init_generic();
1693 mce_init_timer(); 1724 __mcheck_cpu_init_timer();
1694} 1725}
1695 1726
1696/* Reinit MCEs after user configuration changes */ 1727/* Reinit MCEs after user configuration changes */
@@ -1716,7 +1747,7 @@ static void mce_enable_ce(void *all)
1716 cmci_reenable(); 1747 cmci_reenable();
1717 cmci_recheck(); 1748 cmci_recheck();
1718 if (all) 1749 if (all)
1719 mce_init_timer(); 1750 __mcheck_cpu_init_timer();
1720} 1751}
1721 1752
1722static struct sysdev_class mce_sysclass = { 1753static struct sysdev_class mce_sysclass = {
@@ -1904,7 +1935,7 @@ error2:
1904 sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[j].attr); 1935 sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[j].attr);
1905error: 1936error:
1906 while (--i >= 0) 1937 while (--i >= 0)
1907 sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr); 1938 sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1908 1939
1909 sysdev_unregister(&per_cpu(mce_dev, cpu)); 1940 sysdev_unregister(&per_cpu(mce_dev, cpu));
1910 1941
@@ -1929,13 +1960,14 @@ static __cpuinit void mce_remove_device(unsigned int cpu)
1929} 1960}
1930 1961
1931/* Make sure there are no machine checks on offlined CPUs. */ 1962/* Make sure there are no machine checks on offlined CPUs. */
1932static void mce_disable_cpu(void *h) 1963static void __cpuinit mce_disable_cpu(void *h)
1933{ 1964{
1934 unsigned long action = *(unsigned long *)h; 1965 unsigned long action = *(unsigned long *)h;
1935 int i; 1966 int i;
1936 1967
1937 if (!mce_available(&current_cpu_data)) 1968 if (!mce_available(&current_cpu_data))
1938 return; 1969 return;
1970
1939 if (!(action & CPU_TASKS_FROZEN)) 1971 if (!(action & CPU_TASKS_FROZEN))
1940 cmci_clear(); 1972 cmci_clear();
1941 for (i = 0; i < banks; i++) { 1973 for (i = 0; i < banks; i++) {
@@ -1946,7 +1978,7 @@ static void mce_disable_cpu(void *h)
1946 } 1978 }
1947} 1979}
1948 1980
1949static void mce_reenable_cpu(void *h) 1981static void __cpuinit mce_reenable_cpu(void *h)
1950{ 1982{
1951 unsigned long action = *(unsigned long *)h; 1983 unsigned long action = *(unsigned long *)h;
1952 int i; 1984 int i;
@@ -1991,9 +2023,11 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
1991 break; 2023 break;
1992 case CPU_DOWN_FAILED: 2024 case CPU_DOWN_FAILED:
1993 case CPU_DOWN_FAILED_FROZEN: 2025 case CPU_DOWN_FAILED_FROZEN:
1994 t->expires = round_jiffies(jiffies + 2026 if (!mce_ignore_ce && check_interval) {
2027 t->expires = round_jiffies(jiffies +
1995 __get_cpu_var(mce_next_interval)); 2028 __get_cpu_var(mce_next_interval));
1996 add_timer_on(t, cpu); 2029 add_timer_on(t, cpu);
2030 }
1997 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); 2031 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
1998 break; 2032 break;
1999 case CPU_POST_DEAD: 2033 case CPU_POST_DEAD:
@@ -2016,6 +2050,7 @@ static __init void mce_init_banks(void)
2016 struct mce_bank *b = &mce_banks[i]; 2050 struct mce_bank *b = &mce_banks[i];
2017 struct sysdev_attribute *a = &b->attr; 2051 struct sysdev_attribute *a = &b->attr;
2018 2052
2053 sysfs_attr_init(&a->attr);
2019 a->attr.name = b->attrname; 2054 a->attr.name = b->attrname;
2020 snprintf(b->attrname, ATTR_LEN, "bank%d", i); 2055 snprintf(b->attrname, ATTR_LEN, "bank%d", i);
2021 2056
@@ -2025,7 +2060,7 @@ static __init void mce_init_banks(void)
2025 } 2060 }
2026} 2061}
2027 2062
2028static __init int mce_init_device(void) 2063static __init int mcheck_init_device(void)
2029{ 2064{
2030 int err; 2065 int err;
2031 int i = 0; 2066 int i = 0;
@@ -2053,7 +2088,7 @@ static __init int mce_init_device(void)
2053 return err; 2088 return err;
2054} 2089}
2055 2090
2056device_initcall(mce_init_device); 2091device_initcall(mcheck_init_device);
2057 2092
2058/* 2093/*
2059 * Old style boot options parsing. Only for compatibility. 2094 * Old style boot options parsing. Only for compatibility.
@@ -2101,7 +2136,7 @@ static int fake_panic_set(void *data, u64 val)
2101DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get, 2136DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get,
2102 fake_panic_set, "%llu\n"); 2137 fake_panic_set, "%llu\n");
2103 2138
2104static int __init mce_debugfs_init(void) 2139static int __init mcheck_debugfs_init(void)
2105{ 2140{
2106 struct dentry *dmce, *ffake_panic; 2141 struct dentry *dmce, *ffake_panic;
2107 2142
@@ -2115,5 +2150,5 @@ static int __init mce_debugfs_init(void)
2115 2150
2116 return 0; 2151 return 0;
2117} 2152}
2118late_initcall(mce_debugfs_init); 2153late_initcall(mcheck_debugfs_init);
2119#endif 2154#endif
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 83a3d1f4efca..224392d8fe8c 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -21,6 +21,7 @@
21#include <linux/errno.h> 21#include <linux/errno.h>
22#include <linux/sched.h> 22#include <linux/sched.h>
23#include <linux/sysfs.h> 23#include <linux/sysfs.h>
24#include <linux/slab.h>
24#include <linux/init.h> 25#include <linux/init.h>
25#include <linux/cpu.h> 26#include <linux/cpu.h>
26#include <linux/smp.h> 27#include <linux/smp.h>
@@ -388,7 +389,7 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr,
388 return ret; 389 return ret;
389} 390}
390 391
391static struct sysfs_ops threshold_ops = { 392static const struct sysfs_ops threshold_ops = {
392 .show = show, 393 .show = show,
393 .store = store, 394 .store = store,
394}; 395};
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index 7c785634af2b..62b48e40920a 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -5,6 +5,7 @@
5 * Author: Andi Kleen 5 * Author: Andi Kleen
6 */ 6 */
7 7
8#include <linux/gfp.h>
8#include <linux/init.h> 9#include <linux/init.h>
9#include <linux/interrupt.h> 10#include <linux/interrupt.h>
10#include <linux/percpu.h> 11#include <linux/percpu.h>
@@ -95,7 +96,7 @@ static void cmci_discover(int banks, int boot)
95 96
96 /* Already owned by someone else? */ 97 /* Already owned by someone else? */
97 if (val & CMCI_EN) { 98 if (val & CMCI_EN) {
98 if (test_and_clear_bit(i, owned) || boot) 99 if (test_and_clear_bit(i, owned) && !boot)
99 print_update("SHD", &hdr, i); 100 print_update("SHD", &hdr, i);
100 __clear_bit(i, __get_cpu_var(mce_poll_banks)); 101 __clear_bit(i, __get_cpu_var(mce_poll_banks));
101 continue; 102 continue;
@@ -107,7 +108,7 @@ static void cmci_discover(int banks, int boot)
107 108
108 /* Did the enable bit stick? -- the bank supports CMCI */ 109 /* Did the enable bit stick? -- the bank supports CMCI */
109 if (val & CMCI_EN) { 110 if (val & CMCI_EN) {
110 if (!test_and_set_bit(i, owned) || boot) 111 if (!test_and_set_bit(i, owned) && !boot)
111 print_update("CMCI", &hdr, i); 112 print_update("CMCI", &hdr, i);
112 __clear_bit(i, __get_cpu_var(mce_poll_banks)); 113 __clear_bit(i, __get_cpu_var(mce_poll_banks));
113 } else { 114 } else {
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index b3a1dba75330..81c499eceb21 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -49,6 +49,8 @@ static DEFINE_PER_CPU(struct thermal_state, thermal_state);
49 49
50static atomic_t therm_throt_en = ATOMIC_INIT(0); 50static atomic_t therm_throt_en = ATOMIC_INIT(0);
51 51
52static u32 lvtthmr_init __read_mostly;
53
52#ifdef CONFIG_SYSFS 54#ifdef CONFIG_SYSFS
53#define define_therm_throt_sysdev_one_ro(_name) \ 55#define define_therm_throt_sysdev_one_ro(_name) \
54 static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL) 56 static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL)
@@ -254,14 +256,34 @@ asmlinkage void smp_thermal_interrupt(struct pt_regs *regs)
254 ack_APIC_irq(); 256 ack_APIC_irq();
255} 257}
256 258
259/* Thermal monitoring depends on APIC, ACPI and clock modulation */
260static int intel_thermal_supported(struct cpuinfo_x86 *c)
261{
262 if (!cpu_has_apic)
263 return 0;
264 if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC))
265 return 0;
266 return 1;
267}
268
269void __init mcheck_intel_therm_init(void)
270{
271 /*
272 * This function is only called on boot CPU. Save the init thermal
273 * LVT value on BSP and use that value to restore APs' thermal LVT
274 * entry BIOS programmed later
275 */
276 if (intel_thermal_supported(&boot_cpu_data))
277 lvtthmr_init = apic_read(APIC_LVTTHMR);
278}
279
257void intel_init_thermal(struct cpuinfo_x86 *c) 280void intel_init_thermal(struct cpuinfo_x86 *c)
258{ 281{
259 unsigned int cpu = smp_processor_id(); 282 unsigned int cpu = smp_processor_id();
260 int tm2 = 0; 283 int tm2 = 0;
261 u32 l, h; 284 u32 l, h;
262 285
263 /* Thermal monitoring depends on ACPI and clock modulation*/ 286 if (!intel_thermal_supported(c))
264 if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC))
265 return; 287 return;
266 288
267 /* 289 /*
@@ -270,7 +292,20 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
270 * since it might be delivered via SMI already: 292 * since it might be delivered via SMI already:
271 */ 293 */
272 rdmsr(MSR_IA32_MISC_ENABLE, l, h); 294 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
273 h = apic_read(APIC_LVTTHMR); 295
296 /*
297 * The initial value of thermal LVT entries on all APs always reads
298 * 0x10000 because APs are woken up by BSP issuing INIT-SIPI-SIPI
299 * sequence to them and LVT registers are reset to 0s except for
300 * the mask bits which are set to 1s when APs receive INIT IPI.
301 * Always restore the value that BIOS has programmed on AP based on
302 * BSP's info we saved since BIOS is always setting the same value
303 * for all threads/cores
304 */
305 apic_write(APIC_LVTTHMR, lvtthmr_init);
306
307 h = lvtthmr_init;
308
274 if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { 309 if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
275 printk(KERN_DEBUG 310 printk(KERN_DEBUG
276 "CPU%d: Thermal monitoring handled by SMI\n", cpu); 311 "CPU%d: Thermal monitoring handled by SMI\n", cpu);
@@ -312,8 +347,8 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
312 l = apic_read(APIC_LVTTHMR); 347 l = apic_read(APIC_LVTTHMR);
313 apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); 348 apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
314 349
315 printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n", 350 printk_once(KERN_INFO "CPU0: Thermal monitoring enabled (%s)\n",
316 cpu, tm2 ? "TM2" : "TM1"); 351 tm2 ? "TM2" : "TM1");
317 352
318 /* enable thermal throttle processing */ 353 /* enable thermal throttle processing */
319 atomic_set(&therm_throt_en, 1); 354 atomic_set(&therm_throt_en, 1);
diff --git a/arch/x86/kernel/cpu/mtrr/Makefile b/arch/x86/kernel/cpu/mtrr/Makefile
index f4361b56f8e9..ad9e5ed81181 100644
--- a/arch/x86/kernel/cpu/mtrr/Makefile
+++ b/arch/x86/kernel/cpu/mtrr/Makefile
@@ -1,3 +1,3 @@
1obj-y := main.o if.o generic.o state.o cleanup.o 1obj-y := main.o if.o generic.o cleanup.o
2obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o 2obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o
3 3
diff --git a/arch/x86/kernel/cpu/mtrr/amd.c b/arch/x86/kernel/cpu/mtrr/amd.c
index 33af14110dfd..92ba9cd31c9a 100644
--- a/arch/x86/kernel/cpu/mtrr/amd.c
+++ b/arch/x86/kernel/cpu/mtrr/amd.c
@@ -108,7 +108,7 @@ amd_validate_add_page(unsigned long base, unsigned long size, unsigned int type)
108 return 0; 108 return 0;
109} 109}
110 110
111static struct mtrr_ops amd_mtrr_ops = { 111static const struct mtrr_ops amd_mtrr_ops = {
112 .vendor = X86_VENDOR_AMD, 112 .vendor = X86_VENDOR_AMD,
113 .set = amd_set_mtrr, 113 .set = amd_set_mtrr,
114 .get = amd_get_mtrr, 114 .get = amd_get_mtrr,
diff --git a/arch/x86/kernel/cpu/mtrr/centaur.c b/arch/x86/kernel/cpu/mtrr/centaur.c
index de89f14eff3a..316fe3e60a97 100644
--- a/arch/x86/kernel/cpu/mtrr/centaur.c
+++ b/arch/x86/kernel/cpu/mtrr/centaur.c
@@ -110,7 +110,7 @@ centaur_validate_add_page(unsigned long base, unsigned long size, unsigned int t
110 return 0; 110 return 0;
111} 111}
112 112
113static struct mtrr_ops centaur_mtrr_ops = { 113static const struct mtrr_ops centaur_mtrr_ops = {
114 .vendor = X86_VENDOR_CENTAUR, 114 .vendor = X86_VENDOR_CENTAUR,
115 .set = centaur_set_mcr, 115 .set = centaur_set_mcr,
116 .get = centaur_get_mcr, 116 .get = centaur_get_mcr,
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index 73c86db5acbe..06130b52f012 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -22,10 +22,10 @@
22#include <linux/pci.h> 22#include <linux/pci.h>
23#include <linux/smp.h> 23#include <linux/smp.h>
24#include <linux/cpu.h> 24#include <linux/cpu.h>
25#include <linux/sort.h>
26#include <linux/mutex.h> 25#include <linux/mutex.h>
27#include <linux/uaccess.h> 26#include <linux/uaccess.h>
28#include <linux/kvm_para.h> 27#include <linux/kvm_para.h>
28#include <linux/range.h>
29 29
30#include <asm/processor.h> 30#include <asm/processor.h>
31#include <asm/e820.h> 31#include <asm/e820.h>
@@ -34,11 +34,6 @@
34 34
35#include "mtrr.h" 35#include "mtrr.h"
36 36
37struct res_range {
38 unsigned long start;
39 unsigned long end;
40};
41
42struct var_mtrr_range_state { 37struct var_mtrr_range_state {
43 unsigned long base_pfn; 38 unsigned long base_pfn;
44 unsigned long size_pfn; 39 unsigned long size_pfn;
@@ -56,7 +51,7 @@ struct var_mtrr_state {
56/* Should be related to MTRR_VAR_RANGES nums */ 51/* Should be related to MTRR_VAR_RANGES nums */
57#define RANGE_NUM 256 52#define RANGE_NUM 256
58 53
59static struct res_range __initdata range[RANGE_NUM]; 54static struct range __initdata range[RANGE_NUM];
60static int __initdata nr_range; 55static int __initdata nr_range;
61 56
62static struct var_mtrr_range_state __initdata range_state[RANGE_NUM]; 57static struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
@@ -64,117 +59,11 @@ static struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
64static int __initdata debug_print; 59static int __initdata debug_print;
65#define Dprintk(x...) do { if (debug_print) printk(KERN_DEBUG x); } while (0) 60#define Dprintk(x...) do { if (debug_print) printk(KERN_DEBUG x); } while (0)
66 61
67
68static int __init
69add_range(struct res_range *range, int nr_range,
70 unsigned long start, unsigned long end)
71{
72 /* Out of slots: */
73 if (nr_range >= RANGE_NUM)
74 return nr_range;
75
76 range[nr_range].start = start;
77 range[nr_range].end = end;
78
79 nr_range++;
80
81 return nr_range;
82}
83
84static int __init
85add_range_with_merge(struct res_range *range, int nr_range,
86 unsigned long start, unsigned long end)
87{
88 int i;
89
90 /* Try to merge it with old one: */
91 for (i = 0; i < nr_range; i++) {
92 unsigned long final_start, final_end;
93 unsigned long common_start, common_end;
94
95 if (!range[i].end)
96 continue;
97
98 common_start = max(range[i].start, start);
99 common_end = min(range[i].end, end);
100 if (common_start > common_end + 1)
101 continue;
102
103 final_start = min(range[i].start, start);
104 final_end = max(range[i].end, end);
105
106 range[i].start = final_start;
107 range[i].end = final_end;
108 return nr_range;
109 }
110
111 /* Need to add it: */
112 return add_range(range, nr_range, start, end);
113}
114
115static void __init
116subtract_range(struct res_range *range, unsigned long start, unsigned long end)
117{
118 int i, j;
119
120 for (j = 0; j < RANGE_NUM; j++) {
121 if (!range[j].end)
122 continue;
123
124 if (start <= range[j].start && end >= range[j].end) {
125 range[j].start = 0;
126 range[j].end = 0;
127 continue;
128 }
129
130 if (start <= range[j].start && end < range[j].end &&
131 range[j].start < end + 1) {
132 range[j].start = end + 1;
133 continue;
134 }
135
136
137 if (start > range[j].start && end >= range[j].end &&
138 range[j].end > start - 1) {
139 range[j].end = start - 1;
140 continue;
141 }
142
143 if (start > range[j].start && end < range[j].end) {
144 /* Find the new spare: */
145 for (i = 0; i < RANGE_NUM; i++) {
146 if (range[i].end == 0)
147 break;
148 }
149 if (i < RANGE_NUM) {
150 range[i].end = range[j].end;
151 range[i].start = end + 1;
152 } else {
153 printk(KERN_ERR "run of slot in ranges\n");
154 }
155 range[j].end = start - 1;
156 continue;
157 }
158 }
159}
160
161static int __init cmp_range(const void *x1, const void *x2)
162{
163 const struct res_range *r1 = x1;
164 const struct res_range *r2 = x2;
165 long start1, start2;
166
167 start1 = r1->start;
168 start2 = r2->start;
169
170 return start1 - start2;
171}
172
173#define BIOS_BUG_MSG KERN_WARNING \ 62#define BIOS_BUG_MSG KERN_WARNING \
174 "WARNING: BIOS bug: VAR MTRR %d contains strange UC entry under 1M, check with your system vendor!\n" 63 "WARNING: BIOS bug: VAR MTRR %d contains strange UC entry under 1M, check with your system vendor!\n"
175 64
176static int __init 65static int __init
177x86_get_mtrr_mem_range(struct res_range *range, int nr_range, 66x86_get_mtrr_mem_range(struct range *range, int nr_range,
178 unsigned long extra_remove_base, 67 unsigned long extra_remove_base,
179 unsigned long extra_remove_size) 68 unsigned long extra_remove_size)
180{ 69{
@@ -188,14 +77,14 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
188 continue; 77 continue;
189 base = range_state[i].base_pfn; 78 base = range_state[i].base_pfn;
190 size = range_state[i].size_pfn; 79 size = range_state[i].size_pfn;
191 nr_range = add_range_with_merge(range, nr_range, base, 80 nr_range = add_range_with_merge(range, RANGE_NUM, nr_range,
192 base + size - 1); 81 base, base + size);
193 } 82 }
194 if (debug_print) { 83 if (debug_print) {
195 printk(KERN_DEBUG "After WB checking\n"); 84 printk(KERN_DEBUG "After WB checking\n");
196 for (i = 0; i < nr_range; i++) 85 for (i = 0; i < nr_range; i++)
197 printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", 86 printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n",
198 range[i].start, range[i].end + 1); 87 range[i].start, range[i].end);
199 } 88 }
200 89
201 /* Take out UC ranges: */ 90 /* Take out UC ranges: */
@@ -217,51 +106,43 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
217 size -= (1<<(20-PAGE_SHIFT)) - base; 106 size -= (1<<(20-PAGE_SHIFT)) - base;
218 base = 1<<(20-PAGE_SHIFT); 107 base = 1<<(20-PAGE_SHIFT);
219 } 108 }
220 subtract_range(range, base, base + size - 1); 109 subtract_range(range, RANGE_NUM, base, base + size);
221 } 110 }
222 if (extra_remove_size) 111 if (extra_remove_size)
223 subtract_range(range, extra_remove_base, 112 subtract_range(range, RANGE_NUM, extra_remove_base,
224 extra_remove_base + extra_remove_size - 1); 113 extra_remove_base + extra_remove_size);
225 114
226 /* get new range num */
227 nr_range = 0;
228 for (i = 0; i < RANGE_NUM; i++) {
229 if (!range[i].end)
230 continue;
231 nr_range++;
232 }
233 if (debug_print) { 115 if (debug_print) {
234 printk(KERN_DEBUG "After UC checking\n"); 116 printk(KERN_DEBUG "After UC checking\n");
235 for (i = 0; i < nr_range; i++) 117 for (i = 0; i < RANGE_NUM; i++) {
236 printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", 118 if (!range[i].end)
237 range[i].start, range[i].end + 1); 119 continue;
120 printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n",
121 range[i].start, range[i].end);
122 }
238 } 123 }
239 124
240 /* sort the ranges */ 125 /* sort the ranges */
241 sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); 126 nr_range = clean_sort_range(range, RANGE_NUM);
242 if (debug_print) { 127 if (debug_print) {
243 printk(KERN_DEBUG "After sorting\n"); 128 printk(KERN_DEBUG "After sorting\n");
244 for (i = 0; i < nr_range; i++) 129 for (i = 0; i < nr_range; i++)
245 printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", 130 printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n",
246 range[i].start, range[i].end + 1); 131 range[i].start, range[i].end);
247 } 132 }
248 133
249 /* clear those is not used */
250 for (i = nr_range; i < RANGE_NUM; i++)
251 memset(&range[i], 0, sizeof(range[i]));
252
253 return nr_range; 134 return nr_range;
254} 135}
255 136
256#ifdef CONFIG_MTRR_SANITIZER 137#ifdef CONFIG_MTRR_SANITIZER
257 138
258static unsigned long __init sum_ranges(struct res_range *range, int nr_range) 139static unsigned long __init sum_ranges(struct range *range, int nr_range)
259{ 140{
260 unsigned long sum = 0; 141 unsigned long sum = 0;
261 int i; 142 int i;
262 143
263 for (i = 0; i < nr_range; i++) 144 for (i = 0; i < nr_range; i++)
264 sum += range[i].end + 1 - range[i].start; 145 sum += range[i].end - range[i].start;
265 146
266 return sum; 147 return sum;
267} 148}
@@ -590,7 +471,7 @@ static int __init parse_mtrr_spare_reg(char *arg)
590early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg); 471early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg);
591 472
592static int __init 473static int __init
593x86_setup_var_mtrrs(struct res_range *range, int nr_range, 474x86_setup_var_mtrrs(struct range *range, int nr_range,
594 u64 chunk_size, u64 gran_size) 475 u64 chunk_size, u64 gran_size)
595{ 476{
596 struct var_mtrr_state var_state; 477 struct var_mtrr_state var_state;
@@ -608,7 +489,7 @@ x86_setup_var_mtrrs(struct res_range *range, int nr_range,
608 /* Write the range: */ 489 /* Write the range: */
609 for (i = 0; i < nr_range; i++) { 490 for (i = 0; i < nr_range; i++) {
610 set_var_mtrr_range(&var_state, range[i].start, 491 set_var_mtrr_range(&var_state, range[i].start,
611 range[i].end - range[i].start + 1); 492 range[i].end - range[i].start);
612 } 493 }
613 494
614 /* Write the last range: */ 495 /* Write the last range: */
@@ -689,8 +570,6 @@ static int __init mtrr_need_cleanup(void)
689 continue; 570 continue;
690 if (!size) 571 if (!size)
691 type = MTRR_NUM_TYPES; 572 type = MTRR_NUM_TYPES;
692 if (type == MTRR_TYPE_WRPROT)
693 type = MTRR_TYPE_UNCACHABLE;
694 num[type]++; 573 num[type]++;
695 } 574 }
696 575
@@ -713,7 +592,7 @@ mtrr_calc_range_state(u64 chunk_size, u64 gran_size,
713 unsigned long x_remove_base, 592 unsigned long x_remove_base,
714 unsigned long x_remove_size, int i) 593 unsigned long x_remove_size, int i)
715{ 594{
716 static struct res_range range_new[RANGE_NUM]; 595 static struct range range_new[RANGE_NUM];
717 unsigned long range_sums_new; 596 unsigned long range_sums_new;
718 static int nr_range_new; 597 static int nr_range_new;
719 int num_reg; 598 int num_reg;
@@ -840,10 +719,10 @@ int __init mtrr_cleanup(unsigned address_bits)
840 * [0, 1M) should always be covered by var mtrr with WB 719 * [0, 1M) should always be covered by var mtrr with WB
841 * and fixed mtrrs should take effect before var mtrr for it: 720 * and fixed mtrrs should take effect before var mtrr for it:
842 */ 721 */
843 nr_range = add_range_with_merge(range, nr_range, 0, 722 nr_range = add_range_with_merge(range, RANGE_NUM, nr_range, 0,
844 (1ULL<<(20 - PAGE_SHIFT)) - 1); 723 1ULL<<(20 - PAGE_SHIFT));
845 /* Sort the ranges: */ 724 /* Sort the ranges: */
846 sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); 725 sort_range(range, nr_range);
847 726
848 range_sums = sum_ranges(range, nr_range); 727 range_sums = sum_ranges(range, nr_range);
849 printk(KERN_INFO "total RAM covered: %ldM\n", 728 printk(KERN_INFO "total RAM covered: %ldM\n",
@@ -1060,9 +939,9 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
1060 nr_range = 0; 939 nr_range = 0;
1061 if (mtrr_tom2) { 940 if (mtrr_tom2) {
1062 range[nr_range].start = (1ULL<<(32 - PAGE_SHIFT)); 941 range[nr_range].start = (1ULL<<(32 - PAGE_SHIFT));
1063 range[nr_range].end = (mtrr_tom2 >> PAGE_SHIFT) - 1; 942 range[nr_range].end = mtrr_tom2 >> PAGE_SHIFT;
1064 if (highest_pfn < range[nr_range].end + 1) 943 if (highest_pfn < range[nr_range].end)
1065 highest_pfn = range[nr_range].end + 1; 944 highest_pfn = range[nr_range].end;
1066 nr_range++; 945 nr_range++;
1067 } 946 }
1068 nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0); 947 nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0);
@@ -1074,15 +953,15 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
1074 953
1075 /* Check the holes: */ 954 /* Check the holes: */
1076 for (i = 0; i < nr_range - 1; i++) { 955 for (i = 0; i < nr_range - 1; i++) {
1077 if (range[i].end + 1 < range[i+1].start) 956 if (range[i].end < range[i+1].start)
1078 total_trim_size += real_trim_memory(range[i].end + 1, 957 total_trim_size += real_trim_memory(range[i].end,
1079 range[i+1].start); 958 range[i+1].start);
1080 } 959 }
1081 960
1082 /* Check the top: */ 961 /* Check the top: */
1083 i = nr_range - 1; 962 i = nr_range - 1;
1084 if (range[i].end + 1 < end_pfn) 963 if (range[i].end < end_pfn)
1085 total_trim_size += real_trim_memory(range[i].end + 1, 964 total_trim_size += real_trim_memory(range[i].end,
1086 end_pfn); 965 end_pfn);
1087 966
1088 if (total_trim_size) { 967 if (total_trim_size) {
diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c
index 228d982ce09c..68a3343e5798 100644
--- a/arch/x86/kernel/cpu/mtrr/cyrix.c
+++ b/arch/x86/kernel/cpu/mtrr/cyrix.c
@@ -265,7 +265,7 @@ static void cyrix_set_all(void)
265 post_set(); 265 post_set();
266} 266}
267 267
268static struct mtrr_ops cyrix_mtrr_ops = { 268static const struct mtrr_ops cyrix_mtrr_ops = {
269 .vendor = X86_VENDOR_CYRIX, 269 .vendor = X86_VENDOR_CYRIX,
270 .set_all = cyrix_set_all, 270 .set_all = cyrix_set_all,
271 .set = cyrix_set_arr, 271 .set = cyrix_set_arr,
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 55da0c5f68dd..fd31a441c61c 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -6,7 +6,6 @@
6 6
7#include <linux/module.h> 7#include <linux/module.h>
8#include <linux/init.h> 8#include <linux/init.h>
9#include <linux/slab.h>
10#include <linux/io.h> 9#include <linux/io.h>
11#include <linux/mm.h> 10#include <linux/mm.h>
12 11
@@ -464,7 +463,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
464 tmp |= ~((1<<(hi - 1)) - 1); 463 tmp |= ~((1<<(hi - 1)) - 1);
465 464
466 if (tmp != mask_lo) { 465 if (tmp != mask_lo) {
467 WARN_ONCE(1, KERN_INFO "mtrr: your BIOS has set up an incorrect mask, fixing it up.\n"); 466 printk(KERN_WARNING "mtrr: your BIOS has configured an incorrect mask, fixing it.\n");
468 mask_lo = tmp; 467 mask_lo = tmp;
469 } 468 }
470 } 469 }
@@ -570,7 +569,7 @@ static unsigned long set_mtrr_state(void)
570 569
571 570
572static unsigned long cr4; 571static unsigned long cr4;
573static DEFINE_SPINLOCK(set_atomicity_lock); 572static DEFINE_RAW_SPINLOCK(set_atomicity_lock);
574 573
575/* 574/*
576 * Since we are disabling the cache don't allow any interrupts, 575 * Since we are disabling the cache don't allow any interrupts,
@@ -590,7 +589,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
590 * changes to the way the kernel boots 589 * changes to the way the kernel boots
591 */ 590 */
592 591
593 spin_lock(&set_atomicity_lock); 592 raw_spin_lock(&set_atomicity_lock);
594 593
595 /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */ 594 /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */
596 cr0 = read_cr0() | X86_CR0_CD; 595 cr0 = read_cr0() | X86_CR0_CD;
@@ -627,7 +626,7 @@ static void post_set(void) __releases(set_atomicity_lock)
627 /* Restore value of CR4 */ 626 /* Restore value of CR4 */
628 if (cpu_has_pge) 627 if (cpu_has_pge)
629 write_cr4(cr4); 628 write_cr4(cr4);
630 spin_unlock(&set_atomicity_lock); 629 raw_spin_unlock(&set_atomicity_lock);
631} 630}
632 631
633static void generic_set_all(void) 632static void generic_set_all(void)
@@ -752,7 +751,7 @@ int positive_have_wrcomb(void)
752/* 751/*
753 * Generic structure... 752 * Generic structure...
754 */ 753 */
755struct mtrr_ops generic_mtrr_ops = { 754const struct mtrr_ops generic_mtrr_ops = {
756 .use_intel_if = 1, 755 .use_intel_if = 1,
757 .set_all = generic_set_all, 756 .set_all = generic_set_all,
758 .get = generic_get_mtrr, 757 .get = generic_get_mtrr,
diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c
index 3c1b12d461d1..79289632cb27 100644
--- a/arch/x86/kernel/cpu/mtrr/if.c
+++ b/arch/x86/kernel/cpu/mtrr/if.c
@@ -4,6 +4,8 @@
4#include <linux/proc_fs.h> 4#include <linux/proc_fs.h>
5#include <linux/module.h> 5#include <linux/module.h>
6#include <linux/ctype.h> 6#include <linux/ctype.h>
7#include <linux/string.h>
8#include <linux/slab.h>
7#include <linux/init.h> 9#include <linux/init.h>
8 10
9#define LINE_SIZE 80 11#define LINE_SIZE 80
@@ -133,8 +135,7 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos)
133 return -EINVAL; 135 return -EINVAL;
134 136
135 base = simple_strtoull(line + 5, &ptr, 0); 137 base = simple_strtoull(line + 5, &ptr, 0);
136 while (isspace(*ptr)) 138 ptr = skip_spaces(ptr);
137 ptr++;
138 139
139 if (strncmp(ptr, "size=", 5)) 140 if (strncmp(ptr, "size=", 5))
140 return -EINVAL; 141 return -EINVAL;
@@ -142,14 +143,11 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos)
142 size = simple_strtoull(ptr + 5, &ptr, 0); 143 size = simple_strtoull(ptr + 5, &ptr, 0);
143 if ((base & 0xfff) || (size & 0xfff)) 144 if ((base & 0xfff) || (size & 0xfff))
144 return -EINVAL; 145 return -EINVAL;
145 while (isspace(*ptr)) 146 ptr = skip_spaces(ptr);
146 ptr++;
147 147
148 if (strncmp(ptr, "type=", 5)) 148 if (strncmp(ptr, "type=", 5))
149 return -EINVAL; 149 return -EINVAL;
150 ptr += 5; 150 ptr = skip_spaces(ptr + 5);
151 while (isspace(*ptr))
152 ptr++;
153 151
154 for (i = 0; i < MTRR_NUM_TYPES; ++i) { 152 for (i = 0; i < MTRR_NUM_TYPES; ++i) {
155 if (strcmp(ptr, mtrr_strings[i])) 153 if (strcmp(ptr, mtrr_strings[i]))
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 84e83de54575..79556bd9b602 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -60,14 +60,14 @@ static DEFINE_MUTEX(mtrr_mutex);
60u64 size_or_mask, size_and_mask; 60u64 size_or_mask, size_and_mask;
61static bool mtrr_aps_delayed_init; 61static bool mtrr_aps_delayed_init;
62 62
63static struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM]; 63static const struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM];
64 64
65struct mtrr_ops *mtrr_if; 65const struct mtrr_ops *mtrr_if;
66 66
67static void set_mtrr(unsigned int reg, unsigned long base, 67static void set_mtrr(unsigned int reg, unsigned long base,
68 unsigned long size, mtrr_type type); 68 unsigned long size, mtrr_type type);
69 69
70void set_mtrr_ops(struct mtrr_ops *ops) 70void set_mtrr_ops(const struct mtrr_ops *ops)
71{ 71{
72 if (ops->vendor && ops->vendor < X86_VENDOR_NUM) 72 if (ops->vendor && ops->vendor < X86_VENDOR_NUM)
73 mtrr_ops[ops->vendor] = ops; 73 mtrr_ops[ops->vendor] = ops;
@@ -145,6 +145,7 @@ struct set_mtrr_data {
145 145
146/** 146/**
147 * ipi_handler - Synchronisation handler. Executed by "other" CPUs. 147 * ipi_handler - Synchronisation handler. Executed by "other" CPUs.
148 * @info: pointer to mtrr configuration data
148 * 149 *
149 * Returns nothing. 150 * Returns nothing.
150 */ 151 */
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index a501dee9a87a..df5e41f31a27 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -32,7 +32,7 @@ extern int generic_get_free_region(unsigned long base, unsigned long size,
32extern int generic_validate_add_page(unsigned long base, unsigned long size, 32extern int generic_validate_add_page(unsigned long base, unsigned long size,
33 unsigned int type); 33 unsigned int type);
34 34
35extern struct mtrr_ops generic_mtrr_ops; 35extern const struct mtrr_ops generic_mtrr_ops;
36 36
37extern int positive_have_wrcomb(void); 37extern int positive_have_wrcomb(void);
38 38
@@ -53,10 +53,10 @@ void fill_mtrr_var_range(unsigned int index,
53 u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi); 53 u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi);
54void get_mtrr_state(void); 54void get_mtrr_state(void);
55 55
56extern void set_mtrr_ops(struct mtrr_ops *ops); 56extern void set_mtrr_ops(const struct mtrr_ops *ops);
57 57
58extern u64 size_or_mask, size_and_mask; 58extern u64 size_or_mask, size_and_mask;
59extern struct mtrr_ops *mtrr_if; 59extern const struct mtrr_ops *mtrr_if;
60 60
61#define is_cpu(vnd) (mtrr_if && mtrr_if->vendor == X86_VENDOR_##vnd) 61#define is_cpu(vnd) (mtrr_if && mtrr_if->vendor == X86_VENDOR_##vnd)
62#define use_intel() (mtrr_if && mtrr_if->use_intel_if == 1) 62#define use_intel() (mtrr_if && mtrr_if->use_intel_if == 1)
diff --git a/arch/x86/kernel/cpu/mtrr/state.c b/arch/x86/kernel/cpu/mtrr/state.c
deleted file mode 100644
index dfc80b4e6b0d..000000000000
--- a/arch/x86/kernel/cpu/mtrr/state.c
+++ /dev/null
@@ -1,94 +0,0 @@
1#include <linux/init.h>
2#include <linux/io.h>
3#include <linux/mm.h>
4
5#include <asm/processor-cyrix.h>
6#include <asm/processor-flags.h>
7#include <asm/mtrr.h>
8#include <asm/msr.h>
9
10#include "mtrr.h"
11
12/* Put the processor into a state where MTRRs can be safely set */
13void set_mtrr_prepare_save(struct set_mtrr_context *ctxt)
14{
15 unsigned int cr0;
16
17 /* Disable interrupts locally */
18 local_irq_save(ctxt->flags);
19
20 if (use_intel() || is_cpu(CYRIX)) {
21
22 /* Save value of CR4 and clear Page Global Enable (bit 7) */
23 if (cpu_has_pge) {
24 ctxt->cr4val = read_cr4();
25 write_cr4(ctxt->cr4val & ~X86_CR4_PGE);
26 }
27
28 /*
29 * Disable and flush caches. Note that wbinvd flushes the TLBs
30 * as a side-effect
31 */
32 cr0 = read_cr0() | X86_CR0_CD;
33 wbinvd();
34 write_cr0(cr0);
35 wbinvd();
36
37 if (use_intel()) {
38 /* Save MTRR state */
39 rdmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi);
40 } else {
41 /*
42 * Cyrix ARRs -
43 * everything else were excluded at the top
44 */
45 ctxt->ccr3 = getCx86(CX86_CCR3);
46 }
47 }
48}
49
50void set_mtrr_cache_disable(struct set_mtrr_context *ctxt)
51{
52 if (use_intel()) {
53 /* Disable MTRRs, and set the default type to uncached */
54 mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo & 0xf300UL,
55 ctxt->deftype_hi);
56 } else {
57 if (is_cpu(CYRIX)) {
58 /* Cyrix ARRs - everything else were excluded at the top */
59 setCx86(CX86_CCR3, (ctxt->ccr3 & 0x0f) | 0x10);
60 }
61 }
62}
63
64/* Restore the processor after a set_mtrr_prepare */
65void set_mtrr_done(struct set_mtrr_context *ctxt)
66{
67 if (use_intel() || is_cpu(CYRIX)) {
68
69 /* Flush caches and TLBs */
70 wbinvd();
71
72 /* Restore MTRRdefType */
73 if (use_intel()) {
74 /* Intel (P6) standard MTRRs */
75 mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo,
76 ctxt->deftype_hi);
77 } else {
78 /*
79 * Cyrix ARRs -
80 * everything else was excluded at the top
81 */
82 setCx86(CX86_CCR3, ctxt->ccr3);
83 }
84
85 /* Enable caches */
86 write_cr0(read_cr0() & 0xbfffffff);
87
88 /* Restore value of CR4 */
89 if (cpu_has_pge)
90 write_cr4(ctxt->cr4val);
91 }
92 /* Re-enable interrupts locally (if enabled previously) */
93 local_irq_restore(ctxt->flags);
94}
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index b5801c311846..db5bdc8addf8 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -7,6 +7,7 @@
7 * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter 7 * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
8 * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> 8 * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
9 * Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com> 9 * Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
10 * Copyright (C) 2009 Google, Inc., Stephane Eranian
10 * 11 *
11 * For licencing details see kernel-base/COPYING 12 * For licencing details see kernel-base/COPYING
12 */ 13 */
@@ -20,12 +21,15 @@
20#include <linux/kdebug.h> 21#include <linux/kdebug.h>
21#include <linux/sched.h> 22#include <linux/sched.h>
22#include <linux/uaccess.h> 23#include <linux/uaccess.h>
24#include <linux/slab.h>
23#include <linux/highmem.h> 25#include <linux/highmem.h>
24#include <linux/cpu.h> 26#include <linux/cpu.h>
27#include <linux/bitops.h>
25 28
26#include <asm/apic.h> 29#include <asm/apic.h>
27#include <asm/stacktrace.h> 30#include <asm/stacktrace.h>
28#include <asm/nmi.h> 31#include <asm/nmi.h>
32#include <asm/compat.h>
29 33
30static u64 perf_event_mask __read_mostly; 34static u64 perf_event_mask __read_mostly;
31 35
@@ -68,15 +72,60 @@ struct debug_store {
68 u64 pebs_event_reset[MAX_PEBS_EVENTS]; 72 u64 pebs_event_reset[MAX_PEBS_EVENTS];
69}; 73};
70 74
/*
 * Describes which counters an event may be scheduled on.
 *
 * idxmsk/idxmsk64: the same storage viewed as a bitmap of counter indices
 *	(tested with test_bit() against a counter index by the scheduler)
 *	and as a single u64 (used by the static initializer macros).
 * code, cmask: NOTE(review): presumably the event code and the config mask
 *	used to match an event against this constraint - the matching code
 *	is not in this chunk; confirm. A zero cmask terminates a constraint
 *	table (see for_each_event_constraint()).
 * weight: number of counters the event can use; EVENT_CONSTRAINT() fills
 *	it from HWEIGHT() of the index mask, and the scheduler places
 *	low-weight (most constrained) events first.
 */
75struct event_constraint {
76 union {
77 unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
78 u64 idxmsk64;
79 };
80 u64 code;
81 u64 cmask;
82 int weight;
83};
84
/*
 * AMD NorthBridge bookkeeping, reached through cpu_hw_events.amd_nb.
 * Holds one owner slot and one event_constraint per counter index.
 * NOTE(review): presumably one instance is shared by the CPUs behind the
 * same northbridge, with refcnt tracking those users - the code that
 * allocates and uses it is outside this chunk; confirm.
 */
85struct amd_nb {
86 int nb_id; /* NorthBridge id */
87 int refcnt; /* reference count */
88 struct perf_event *owners[X86_PMC_IDX_MAX];
89 struct event_constraint event_constraints[X86_PMC_IDX_MAX];
90};
91
71struct cpu_hw_events { 92struct cpu_hw_events {
72 struct perf_event *events[X86_PMC_IDX_MAX]; 93 struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */
73 unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
74 unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; 94 unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
75 unsigned long interrupts; 95 unsigned long interrupts;
76 int enabled; 96 int enabled;
77 struct debug_store *ds; 97 struct debug_store *ds;
98
99 int n_events;
100 int n_added;
101 int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
102 u64 tags[X86_PMC_IDX_MAX];
103 struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */
104 struct amd_nb *amd_nb;
78}; 105};
79 106
/*
 * Static initializer for a struct event_constraint:
 *   c - value for .code
 *   n - counter index bitmask, stored through .idxmsk64
 *   m - value for .cmask
 *   w - value for .weight
 */
107#define __EVENT_CONSTRAINT(c, n, m, w) {\
108 { .idxmsk64 = (n) }, \
109 .code = (c), \
110 .cmask = (m), \
111 .weight = (w), \
112}
113
/*
 * As above, with the weight taken from HWEIGHT(n). Note that n is expanded
 * twice, so it must be free of side effects.
 */
114#define EVENT_CONSTRAINT(c, n, m) \
115 __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n))
116
/* Constraint whose mask is INTEL_ARCH_EVTSEL_MASK. */
117#define INTEL_EVENT_CONSTRAINT(c, n) \
118 EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVTSEL_MASK)
119
/*
 * Constraint that allows exactly one counter: bit (32 + n) of the index
 * mask. The argument is parenthesized so that an expression passed as n
 * cannot re-associate with the addition.
 */
120#define FIXED_EVENT_CONSTRAINT(c, n) \
121 EVENT_CONSTRAINT(c, (1ULL << (32 + (n))), INTEL_ARCH_FIXED_MASK)
122
/* Table terminator: all-zero entry, recognised by its zero cmask. */
123#define EVENT_CONSTRAINT_END \
124 EVENT_CONSTRAINT(0, 0, 0)
125
/* Walk constraint table c with cursor e until the zero-cmask terminator. */
126#define for_each_event_constraint(e, c) \
127 for ((e) = (c); (e)->cmask; (e)++)
128
128
80/* 129/*
81 * struct x86_pmu - generic x86 pmu 130 * struct x86_pmu - generic x86 pmu
82 */ 131 */
@@ -86,8 +135,8 @@ struct x86_pmu {
86 int (*handle_irq)(struct pt_regs *); 135 int (*handle_irq)(struct pt_regs *);
87 void (*disable_all)(void); 136 void (*disable_all)(void);
88 void (*enable_all)(void); 137 void (*enable_all)(void);
89 void (*enable)(struct hw_perf_event *, int); 138 void (*enable)(struct perf_event *);
90 void (*disable)(struct hw_perf_event *, int); 139 void (*disable)(struct perf_event *);
91 unsigned eventsel; 140 unsigned eventsel;
92 unsigned perfctr; 141 unsigned perfctr;
93 u64 (*event_map)(int); 142 u64 (*event_map)(int);
@@ -102,78 +151,28 @@ struct x86_pmu {
102 u64 intel_ctrl; 151 u64 intel_ctrl;
103 void (*enable_bts)(u64 config); 152 void (*enable_bts)(u64 config);
104 void (*disable_bts)(void); 153 void (*disable_bts)(void);
105};
106 154
107static struct x86_pmu x86_pmu __read_mostly; 155 struct event_constraint *
156 (*get_event_constraints)(struct cpu_hw_events *cpuc,
157 struct perf_event *event);
108 158
109static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { 159 void (*put_event_constraints)(struct cpu_hw_events *cpuc,
110 .enabled = 1, 160 struct perf_event *event);
111}; 161 struct event_constraint *event_constraints;
112 162
113/* 163 int (*cpu_prepare)(int cpu);
114 * Not sure about some of these 164 void (*cpu_starting)(int cpu);
115 */ 165 void (*cpu_dying)(int cpu);
116static const u64 p6_perfmon_event_map[] = 166 void (*cpu_dead)(int cpu);
117{
118 [PERF_COUNT_HW_CPU_CYCLES] = 0x0079,
119 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
120 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e,
121 [PERF_COUNT_HW_CACHE_MISSES] = 0x012e,
122 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
123 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
124 [PERF_COUNT_HW_BUS_CYCLES] = 0x0062,
125}; 167};
126 168
127static u64 p6_pmu_event_map(int hw_event) 169static struct x86_pmu x86_pmu __read_mostly;
128{
129 return p6_perfmon_event_map[hw_event];
130}
131
132/*
133 * Event setting that is specified not to count anything.
134 * We use this to effectively disable a counter.
135 *
136 * L2_RQSTS with 0 MESI unit mask.
137 */
138#define P6_NOP_EVENT 0x0000002EULL
139
140static u64 p6_pmu_raw_event(u64 hw_event)
141{
142#define P6_EVNTSEL_EVENT_MASK 0x000000FFULL
143#define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL
144#define P6_EVNTSEL_EDGE_MASK 0x00040000ULL
145#define P6_EVNTSEL_INV_MASK 0x00800000ULL
146#define P6_EVNTSEL_REG_MASK 0xFF000000ULL
147
148#define P6_EVNTSEL_MASK \
149 (P6_EVNTSEL_EVENT_MASK | \
150 P6_EVNTSEL_UNIT_MASK | \
151 P6_EVNTSEL_EDGE_MASK | \
152 P6_EVNTSEL_INV_MASK | \
153 P6_EVNTSEL_REG_MASK)
154
155 return hw_event & P6_EVNTSEL_MASK;
156}
157
158 170
159/* 171static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
160 * Intel PerfMon v3. Used on Core2 and later. 172 .enabled = 1,
161 */
162static const u64 intel_perfmon_event_map[] =
163{
164 [PERF_COUNT_HW_CPU_CYCLES] = 0x003c,
165 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
166 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e,
167 [PERF_COUNT_HW_CACHE_MISSES] = 0x412e,
168 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
169 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
170 [PERF_COUNT_HW_BUS_CYCLES] = 0x013c,
171}; 173};
172 174
173static u64 intel_pmu_event_map(int hw_event) 175static int x86_perf_event_set_period(struct perf_event *event);
174{
175 return intel_perfmon_event_map[hw_event];
176}
177 176
178/* 177/*
179 * Generalized hw caching related hw_event table, filled 178 * Generalized hw caching related hw_event table, filled
@@ -190,435 +189,18 @@ static u64 __read_mostly hw_cache_event_ids
190 [PERF_COUNT_HW_CACHE_OP_MAX] 189 [PERF_COUNT_HW_CACHE_OP_MAX]
191 [PERF_COUNT_HW_CACHE_RESULT_MAX]; 190 [PERF_COUNT_HW_CACHE_RESULT_MAX];
192 191
193static const u64 nehalem_hw_cache_event_ids
194 [PERF_COUNT_HW_CACHE_MAX]
195 [PERF_COUNT_HW_CACHE_OP_MAX]
196 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
197{
198 [ C(L1D) ] = {
199 [ C(OP_READ) ] = {
200 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */
201 [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */
202 },
203 [ C(OP_WRITE) ] = {
204 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */
205 [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */
206 },
207 [ C(OP_PREFETCH) ] = {
208 [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */
209 [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */
210 },
211 },
212 [ C(L1I ) ] = {
213 [ C(OP_READ) ] = {
214 [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
215 [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
216 },
217 [ C(OP_WRITE) ] = {
218 [ C(RESULT_ACCESS) ] = -1,
219 [ C(RESULT_MISS) ] = -1,
220 },
221 [ C(OP_PREFETCH) ] = {
222 [ C(RESULT_ACCESS) ] = 0x0,
223 [ C(RESULT_MISS) ] = 0x0,
224 },
225 },
226 [ C(LL ) ] = {
227 [ C(OP_READ) ] = {
228 [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */
229 [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */
230 },
231 [ C(OP_WRITE) ] = {
232 [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */
233 [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */
234 },
235 [ C(OP_PREFETCH) ] = {
236 [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */
237 [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */
238 },
239 },
240 [ C(DTLB) ] = {
241 [ C(OP_READ) ] = {
242 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
243 [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */
244 },
245 [ C(OP_WRITE) ] = {
246 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
247 [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */
248 },
249 [ C(OP_PREFETCH) ] = {
250 [ C(RESULT_ACCESS) ] = 0x0,
251 [ C(RESULT_MISS) ] = 0x0,
252 },
253 },
254 [ C(ITLB) ] = {
255 [ C(OP_READ) ] = {
256 [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */
257 [ C(RESULT_MISS) ] = 0x20c8, /* ITLB_MISS_RETIRED */
258 },
259 [ C(OP_WRITE) ] = {
260 [ C(RESULT_ACCESS) ] = -1,
261 [ C(RESULT_MISS) ] = -1,
262 },
263 [ C(OP_PREFETCH) ] = {
264 [ C(RESULT_ACCESS) ] = -1,
265 [ C(RESULT_MISS) ] = -1,
266 },
267 },
268 [ C(BPU ) ] = {
269 [ C(OP_READ) ] = {
270 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
271 [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */
272 },
273 [ C(OP_WRITE) ] = {
274 [ C(RESULT_ACCESS) ] = -1,
275 [ C(RESULT_MISS) ] = -1,
276 },
277 [ C(OP_PREFETCH) ] = {
278 [ C(RESULT_ACCESS) ] = -1,
279 [ C(RESULT_MISS) ] = -1,
280 },
281 },
282};
283
284static const u64 core2_hw_cache_event_ids
285 [PERF_COUNT_HW_CACHE_MAX]
286 [PERF_COUNT_HW_CACHE_OP_MAX]
287 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
288{
289 [ C(L1D) ] = {
290 [ C(OP_READ) ] = {
291 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */
292 [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */
293 },
294 [ C(OP_WRITE) ] = {
295 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */
296 [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */
297 },
298 [ C(OP_PREFETCH) ] = {
299 [ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS */
300 [ C(RESULT_MISS) ] = 0,
301 },
302 },
303 [ C(L1I ) ] = {
304 [ C(OP_READ) ] = {
305 [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */
306 [ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */
307 },
308 [ C(OP_WRITE) ] = {
309 [ C(RESULT_ACCESS) ] = -1,
310 [ C(RESULT_MISS) ] = -1,
311 },
312 [ C(OP_PREFETCH) ] = {
313 [ C(RESULT_ACCESS) ] = 0,
314 [ C(RESULT_MISS) ] = 0,
315 },
316 },
317 [ C(LL ) ] = {
318 [ C(OP_READ) ] = {
319 [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
320 [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
321 },
322 [ C(OP_WRITE) ] = {
323 [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */
324 [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */
325 },
326 [ C(OP_PREFETCH) ] = {
327 [ C(RESULT_ACCESS) ] = 0,
328 [ C(RESULT_MISS) ] = 0,
329 },
330 },
331 [ C(DTLB) ] = {
332 [ C(OP_READ) ] = {
333 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
334 [ C(RESULT_MISS) ] = 0x0208, /* DTLB_MISSES.MISS_LD */
335 },
336 [ C(OP_WRITE) ] = {
337 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
338 [ C(RESULT_MISS) ] = 0x0808, /* DTLB_MISSES.MISS_ST */
339 },
340 [ C(OP_PREFETCH) ] = {
341 [ C(RESULT_ACCESS) ] = 0,
342 [ C(RESULT_MISS) ] = 0,
343 },
344 },
345 [ C(ITLB) ] = {
346 [ C(OP_READ) ] = {
347 [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
348 [ C(RESULT_MISS) ] = 0x1282, /* ITLBMISSES */
349 },
350 [ C(OP_WRITE) ] = {
351 [ C(RESULT_ACCESS) ] = -1,
352 [ C(RESULT_MISS) ] = -1,
353 },
354 [ C(OP_PREFETCH) ] = {
355 [ C(RESULT_ACCESS) ] = -1,
356 [ C(RESULT_MISS) ] = -1,
357 },
358 },
359 [ C(BPU ) ] = {
360 [ C(OP_READ) ] = {
361 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
362 [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
363 },
364 [ C(OP_WRITE) ] = {
365 [ C(RESULT_ACCESS) ] = -1,
366 [ C(RESULT_MISS) ] = -1,
367 },
368 [ C(OP_PREFETCH) ] = {
369 [ C(RESULT_ACCESS) ] = -1,
370 [ C(RESULT_MISS) ] = -1,
371 },
372 },
373};
374
375static const u64 atom_hw_cache_event_ids
376 [PERF_COUNT_HW_CACHE_MAX]
377 [PERF_COUNT_HW_CACHE_OP_MAX]
378 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
379{
380 [ C(L1D) ] = {
381 [ C(OP_READ) ] = {
382 [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD */
383 [ C(RESULT_MISS) ] = 0,
384 },
385 [ C(OP_WRITE) ] = {
386 [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST */
387 [ C(RESULT_MISS) ] = 0,
388 },
389 [ C(OP_PREFETCH) ] = {
390 [ C(RESULT_ACCESS) ] = 0x0,
391 [ C(RESULT_MISS) ] = 0,
392 },
393 },
394 [ C(L1I ) ] = {
395 [ C(OP_READ) ] = {
396 [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
397 [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
398 },
399 [ C(OP_WRITE) ] = {
400 [ C(RESULT_ACCESS) ] = -1,
401 [ C(RESULT_MISS) ] = -1,
402 },
403 [ C(OP_PREFETCH) ] = {
404 [ C(RESULT_ACCESS) ] = 0,
405 [ C(RESULT_MISS) ] = 0,
406 },
407 },
408 [ C(LL ) ] = {
409 [ C(OP_READ) ] = {
410 [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
411 [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
412 },
413 [ C(OP_WRITE) ] = {
414 [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */
415 [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */
416 },
417 [ C(OP_PREFETCH) ] = {
418 [ C(RESULT_ACCESS) ] = 0,
419 [ C(RESULT_MISS) ] = 0,
420 },
421 },
422 [ C(DTLB) ] = {
423 [ C(OP_READ) ] = {
424 [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI (alias) */
425 [ C(RESULT_MISS) ] = 0x0508, /* DTLB_MISSES.MISS_LD */
426 },
427 [ C(OP_WRITE) ] = {
428 [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI (alias) */
429 [ C(RESULT_MISS) ] = 0x0608, /* DTLB_MISSES.MISS_ST */
430 },
431 [ C(OP_PREFETCH) ] = {
432 [ C(RESULT_ACCESS) ] = 0,
433 [ C(RESULT_MISS) ] = 0,
434 },
435 },
436 [ C(ITLB) ] = {
437 [ C(OP_READ) ] = {
438 [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
439 [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */
440 },
441 [ C(OP_WRITE) ] = {
442 [ C(RESULT_ACCESS) ] = -1,
443 [ C(RESULT_MISS) ] = -1,
444 },
445 [ C(OP_PREFETCH) ] = {
446 [ C(RESULT_ACCESS) ] = -1,
447 [ C(RESULT_MISS) ] = -1,
448 },
449 },
450 [ C(BPU ) ] = {
451 [ C(OP_READ) ] = {
452 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
453 [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
454 },
455 [ C(OP_WRITE) ] = {
456 [ C(RESULT_ACCESS) ] = -1,
457 [ C(RESULT_MISS) ] = -1,
458 },
459 [ C(OP_PREFETCH) ] = {
460 [ C(RESULT_ACCESS) ] = -1,
461 [ C(RESULT_MISS) ] = -1,
462 },
463 },
464};
465
466static u64 intel_pmu_raw_event(u64 hw_event)
467{
468#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL
469#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL
470#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL
471#define CORE_EVNTSEL_INV_MASK 0x00800000ULL
472#define CORE_EVNTSEL_REG_MASK 0xFF000000ULL
473
474#define CORE_EVNTSEL_MASK \
475 (CORE_EVNTSEL_EVENT_MASK | \
476 CORE_EVNTSEL_UNIT_MASK | \
477 CORE_EVNTSEL_EDGE_MASK | \
478 CORE_EVNTSEL_INV_MASK | \
479 CORE_EVNTSEL_REG_MASK)
480
481 return hw_event & CORE_EVNTSEL_MASK;
482}
483
484static const u64 amd_hw_cache_event_ids
485 [PERF_COUNT_HW_CACHE_MAX]
486 [PERF_COUNT_HW_CACHE_OP_MAX]
487 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
488{
489 [ C(L1D) ] = {
490 [ C(OP_READ) ] = {
491 [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */
492 [ C(RESULT_MISS) ] = 0x0041, /* Data Cache Misses */
493 },
494 [ C(OP_WRITE) ] = {
495 [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */
496 [ C(RESULT_MISS) ] = 0,
497 },
498 [ C(OP_PREFETCH) ] = {
499 [ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts */
500 [ C(RESULT_MISS) ] = 0x0167, /* Data Prefetcher :cancelled */
501 },
502 },
503 [ C(L1I ) ] = {
504 [ C(OP_READ) ] = {
505 [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches */
506 [ C(RESULT_MISS) ] = 0x0081, /* Instruction cache misses */
507 },
508 [ C(OP_WRITE) ] = {
509 [ C(RESULT_ACCESS) ] = -1,
510 [ C(RESULT_MISS) ] = -1,
511 },
512 [ C(OP_PREFETCH) ] = {
513 [ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */
514 [ C(RESULT_MISS) ] = 0,
515 },
516 },
517 [ C(LL ) ] = {
518 [ C(OP_READ) ] = {
519 [ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */
520 [ C(RESULT_MISS) ] = 0x037E, /* L2 Cache Misses : IC+DC */
521 },
522 [ C(OP_WRITE) ] = {
523 [ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback */
524 [ C(RESULT_MISS) ] = 0,
525 },
526 [ C(OP_PREFETCH) ] = {
527 [ C(RESULT_ACCESS) ] = 0,
528 [ C(RESULT_MISS) ] = 0,
529 },
530 },
531 [ C(DTLB) ] = {
532 [ C(OP_READ) ] = {
533 [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */
534 [ C(RESULT_MISS) ] = 0x0046, /* L1 DTLB and L2 DLTB Miss */
535 },
536 [ C(OP_WRITE) ] = {
537 [ C(RESULT_ACCESS) ] = 0,
538 [ C(RESULT_MISS) ] = 0,
539 },
540 [ C(OP_PREFETCH) ] = {
541 [ C(RESULT_ACCESS) ] = 0,
542 [ C(RESULT_MISS) ] = 0,
543 },
544 },
545 [ C(ITLB) ] = {
546 [ C(OP_READ) ] = {
547 [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes */
548 [ C(RESULT_MISS) ] = 0x0085, /* Instr. fetch ITLB misses */
549 },
550 [ C(OP_WRITE) ] = {
551 [ C(RESULT_ACCESS) ] = -1,
552 [ C(RESULT_MISS) ] = -1,
553 },
554 [ C(OP_PREFETCH) ] = {
555 [ C(RESULT_ACCESS) ] = -1,
556 [ C(RESULT_MISS) ] = -1,
557 },
558 },
559 [ C(BPU ) ] = {
560 [ C(OP_READ) ] = {
561 [ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr. */
562 [ C(RESULT_MISS) ] = 0x00c3, /* Retired Mispredicted BI */
563 },
564 [ C(OP_WRITE) ] = {
565 [ C(RESULT_ACCESS) ] = -1,
566 [ C(RESULT_MISS) ] = -1,
567 },
568 [ C(OP_PREFETCH) ] = {
569 [ C(RESULT_ACCESS) ] = -1,
570 [ C(RESULT_MISS) ] = -1,
571 },
572 },
573};
574
575/*
576 * AMD Performance Monitor K7 and later.
577 */
578static const u64 amd_perfmon_event_map[] =
579{
580 [PERF_COUNT_HW_CPU_CYCLES] = 0x0076,
581 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
582 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080,
583 [PERF_COUNT_HW_CACHE_MISSES] = 0x0081,
584 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
585 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
586};
587
588static u64 amd_pmu_event_map(int hw_event)
589{
590 return amd_perfmon_event_map[hw_event];
591}
592
593static u64 amd_pmu_raw_event(u64 hw_event)
594{
595#define K7_EVNTSEL_EVENT_MASK 0x7000000FFULL
596#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL
597#define K7_EVNTSEL_EDGE_MASK 0x000040000ULL
598#define K7_EVNTSEL_INV_MASK 0x000800000ULL
599#define K7_EVNTSEL_REG_MASK 0x0FF000000ULL
600
601#define K7_EVNTSEL_MASK \
602 (K7_EVNTSEL_EVENT_MASK | \
603 K7_EVNTSEL_UNIT_MASK | \
604 K7_EVNTSEL_EDGE_MASK | \
605 K7_EVNTSEL_INV_MASK | \
606 K7_EVNTSEL_REG_MASK)
607
608 return hw_event & K7_EVNTSEL_MASK;
609}
610
611/* 192/*
612 * Propagate event elapsed time into the generic event. 193 * Propagate event elapsed time into the generic event.
613 * Can only be executed on the CPU where the event is active. 194 * Can only be executed on the CPU where the event is active.
614 * Returns the delta events processed. 195 * Returns the delta events processed.
615 */ 196 */
616static u64 197static u64
617x86_perf_event_update(struct perf_event *event, 198x86_perf_event_update(struct perf_event *event)
618 struct hw_perf_event *hwc, int idx)
619{ 199{
200 struct hw_perf_event *hwc = &event->hw;
620 int shift = 64 - x86_pmu.event_bits; 201 int shift = 64 - x86_pmu.event_bits;
621 u64 prev_raw_count, new_raw_count; 202 u64 prev_raw_count, new_raw_count;
203 int idx = hwc->idx;
622 s64 delta; 204 s64 delta;
623 205
624 if (idx == X86_PMC_IDX_FIXED_BTS) 206 if (idx == X86_PMC_IDX_FIXED_BTS)
@@ -718,7 +300,7 @@ static inline bool bts_available(void)
718 return x86_pmu.enable_bts != NULL; 300 return x86_pmu.enable_bts != NULL;
719} 301}
720 302
721static inline void init_debug_store_on_cpu(int cpu) 303static void init_debug_store_on_cpu(int cpu)
722{ 304{
723 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; 305 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
724 306
@@ -730,7 +312,7 @@ static inline void init_debug_store_on_cpu(int cpu)
730 (u32)((u64)(unsigned long)ds >> 32)); 312 (u32)((u64)(unsigned long)ds >> 32));
731} 313}
732 314
733static inline void fini_debug_store_on_cpu(int cpu) 315static void fini_debug_store_on_cpu(int cpu)
734{ 316{
735 if (!per_cpu(cpu_hw_events, cpu).ds) 317 if (!per_cpu(cpu_hw_events, cpu).ds)
736 return; 318 return;
@@ -859,42 +441,6 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr)
859 return 0; 441 return 0;
860} 442}
861 443
862static void intel_pmu_enable_bts(u64 config)
863{
864 unsigned long debugctlmsr;
865
866 debugctlmsr = get_debugctlmsr();
867
868 debugctlmsr |= X86_DEBUGCTL_TR;
869 debugctlmsr |= X86_DEBUGCTL_BTS;
870 debugctlmsr |= X86_DEBUGCTL_BTINT;
871
872 if (!(config & ARCH_PERFMON_EVENTSEL_OS))
873 debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS;
874
875 if (!(config & ARCH_PERFMON_EVENTSEL_USR))
876 debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR;
877
878 update_debugctlmsr(debugctlmsr);
879}
880
881static void intel_pmu_disable_bts(void)
882{
883 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
884 unsigned long debugctlmsr;
885
886 if (!cpuc->ds)
887 return;
888
889 debugctlmsr = get_debugctlmsr();
890
891 debugctlmsr &=
892 ~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT |
893 X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR);
894
895 update_debugctlmsr(debugctlmsr);
896}
897
898/* 444/*
899 * Setup the hardware configuration for a given attr_type 445 * Setup the hardware configuration for a given attr_type
900 */ 446 */
@@ -932,6 +478,10 @@ static int __hw_perf_event_init(struct perf_event *event)
932 */ 478 */
933 hwc->config = ARCH_PERFMON_EVENTSEL_INT; 479 hwc->config = ARCH_PERFMON_EVENTSEL_INT;
934 480
481 hwc->idx = -1;
482 hwc->last_cpu = -1;
483 hwc->last_tag = ~0ULL;
484
935 /* 485 /*
936 * Count user and OS events unless requested not to. 486 * Count user and OS events unless requested not to.
937 */ 487 */
@@ -960,6 +510,9 @@ static int __hw_perf_event_init(struct perf_event *event)
960 */ 510 */
961 if (attr->type == PERF_TYPE_RAW) { 511 if (attr->type == PERF_TYPE_RAW) {
962 hwc->config |= x86_pmu.raw_event(attr->config); 512 hwc->config |= x86_pmu.raw_event(attr->config);
513 if ((hwc->config & ARCH_PERFMON_EVENTSEL_ANY) &&
514 perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
515 return -EACCES;
963 return 0; 516 return 0;
964 } 517 }
965 518
@@ -999,216 +552,314 @@ static int __hw_perf_event_init(struct perf_event *event)
999 return 0; 552 return 0;
1000} 553}
1001 554
1002static void p6_pmu_disable_all(void) 555static void x86_pmu_disable_all(void)
1003{ 556{
1004 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 557 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1005 u64 val; 558 int idx;
1006
1007 if (!cpuc->enabled)
1008 return;
1009 559
1010 cpuc->enabled = 0; 560 for (idx = 0; idx < x86_pmu.num_events; idx++) {
1011 barrier(); 561 u64 val;
1012 562
1013 /* p6 only has one enable register */ 563 if (!test_bit(idx, cpuc->active_mask))
1014 rdmsrl(MSR_P6_EVNTSEL0, val); 564 continue;
1015 val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; 565 rdmsrl(x86_pmu.eventsel + idx, val);
1016 wrmsrl(MSR_P6_EVNTSEL0, val); 566 if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE))
567 continue;
568 val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
569 wrmsrl(x86_pmu.eventsel + idx, val);
570 }
1017} 571}
1018 572
1019static void intel_pmu_disable_all(void) 573void hw_perf_disable(void)
1020{ 574{
1021 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 575 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1022 576
577 if (!x86_pmu_initialized())
578 return;
579
1023 if (!cpuc->enabled) 580 if (!cpuc->enabled)
1024 return; 581 return;
1025 582
583 cpuc->n_added = 0;
1026 cpuc->enabled = 0; 584 cpuc->enabled = 0;
1027 barrier(); 585 barrier();
1028 586
1029 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); 587 x86_pmu.disable_all();
1030
1031 if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask))
1032 intel_pmu_disable_bts();
1033} 588}
1034 589
1035static void amd_pmu_disable_all(void) 590static void x86_pmu_enable_all(void)
1036{ 591{
1037 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 592 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1038 int idx; 593 int idx;
1039 594
1040 if (!cpuc->enabled)
1041 return;
1042
1043 cpuc->enabled = 0;
1044 /*
1045 * ensure we write the disable before we start disabling the
1046 * events proper, so that amd_pmu_enable_event() does the
1047 * right thing.
1048 */
1049 barrier();
1050
1051 for (idx = 0; idx < x86_pmu.num_events; idx++) { 595 for (idx = 0; idx < x86_pmu.num_events; idx++) {
596 struct perf_event *event = cpuc->events[idx];
1052 u64 val; 597 u64 val;
1053 598
1054 if (!test_bit(idx, cpuc->active_mask)) 599 if (!test_bit(idx, cpuc->active_mask))
1055 continue; 600 continue;
1056 rdmsrl(MSR_K7_EVNTSEL0 + idx, val); 601
1057 if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE)) 602 val = event->hw.config;
1058 continue; 603 val |= ARCH_PERFMON_EVENTSEL_ENABLE;
1059 val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; 604 wrmsrl(x86_pmu.eventsel + idx, val);
1060 wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
1061 } 605 }
1062} 606}
1063 607
1064void hw_perf_disable(void) 608static const struct pmu pmu;
609
610static inline int is_x86_event(struct perf_event *event)
1065{ 611{
1066 if (!x86_pmu_initialized()) 612 return event->pmu == &pmu;
1067 return;
1068 return x86_pmu.disable_all();
1069} 613}
1070 614
1071static void p6_pmu_enable_all(void) 615static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
1072{ 616{
1073 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 617 struct event_constraint *c, *constraints[X86_PMC_IDX_MAX];
1074 unsigned long val; 618 unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
619 int i, j, w, wmax, num = 0;
620 struct hw_perf_event *hwc;
1075 621
1076 if (cpuc->enabled) 622 bitmap_zero(used_mask, X86_PMC_IDX_MAX);
1077 return;
1078 623
1079 cpuc->enabled = 1; 624 for (i = 0; i < n; i++) {
1080 barrier(); 625 c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]);
626 constraints[i] = c;
627 }
1081 628
1082 /* p6 only has one enable register */ 629 /*
1083 rdmsrl(MSR_P6_EVNTSEL0, val); 630 * fastpath, try to reuse previous register
1084 val |= ARCH_PERFMON_EVENTSEL0_ENABLE; 631 */
1085 wrmsrl(MSR_P6_EVNTSEL0, val); 632 for (i = 0; i < n; i++) {
1086} 633 hwc = &cpuc->event_list[i]->hw;
634 c = constraints[i];
1087 635
1088static void intel_pmu_enable_all(void) 636 /* never assigned */
1089{ 637 if (hwc->idx == -1)
1090 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 638 break;
1091 639
1092 if (cpuc->enabled) 640 /* constraint still honored */
1093 return; 641 if (!test_bit(hwc->idx, c->idxmsk))
642 break;
1094 643
1095 cpuc->enabled = 1; 644 /* not already used */
1096 barrier(); 645 if (test_bit(hwc->idx, used_mask))
646 break;
1097 647
1098 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); 648 __set_bit(hwc->idx, used_mask);
649 if (assign)
650 assign[i] = hwc->idx;
651 }
652 if (i == n)
653 goto done;
1099 654
1100 if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { 655 /*
1101 struct perf_event *event = 656 * begin slow path
1102 cpuc->events[X86_PMC_IDX_FIXED_BTS]; 657 */
1103 658
1104 if (WARN_ON_ONCE(!event)) 659 bitmap_zero(used_mask, X86_PMC_IDX_MAX);
1105 return;
1106 660
1107 intel_pmu_enable_bts(event->hw.config); 661 /*
1108 } 662 * weight = number of possible counters
1109} 663 *
664 * 1 = most constrained, only works on one counter
665 * wmax = least constrained, works on any counter
666 *
667 * assign events to counters starting with most
668 * constrained events.
669 */
670 wmax = x86_pmu.num_events;
1110 671
1111static void amd_pmu_enable_all(void) 672 /*
1112{ 673 * when fixed event counters are present,
1113 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 674 * wmax is incremented by 1 to account
1114 int idx; 675 * for one more choice
676 */
677 if (x86_pmu.num_events_fixed)
678 wmax++;
1115 679
1116 if (cpuc->enabled) 680 for (w = 1, num = n; num && w <= wmax; w++) {
1117 return; 681 /* for each event */
682 for (i = 0; num && i < n; i++) {
683 c = constraints[i];
684 hwc = &cpuc->event_list[i]->hw;
1118 685
1119 cpuc->enabled = 1; 686 if (c->weight != w)
1120 barrier(); 687 continue;
1121 688
1122 for (idx = 0; idx < x86_pmu.num_events; idx++) { 689 for_each_set_bit(j, c->idxmsk, X86_PMC_IDX_MAX) {
1123 struct perf_event *event = cpuc->events[idx]; 690 if (!test_bit(j, used_mask))
1124 u64 val; 691 break;
692 }
1125 693
1126 if (!test_bit(idx, cpuc->active_mask)) 694 if (j == X86_PMC_IDX_MAX)
1127 continue; 695 break;
1128 696
1129 val = event->hw.config; 697 __set_bit(j, used_mask);
1130 val |= ARCH_PERFMON_EVENTSEL0_ENABLE; 698
1131 wrmsrl(MSR_K7_EVNTSEL0 + idx, val); 699 if (assign)
700 assign[i] = j;
701 num--;
702 }
1132 } 703 }
704done:
705 /*
706 * scheduling failed or is just a simulation,
707 * free resources if necessary
708 */
709 if (!assign || num) {
710 for (i = 0; i < n; i++) {
711 if (x86_pmu.put_event_constraints)
712 x86_pmu.put_event_constraints(cpuc, cpuc->event_list[i]);
713 }
714 }
715 return num ? -ENOSPC : 0;
1133} 716}
1134 717
1135void hw_perf_enable(void) 718/*
719 * dogrp: true if must collect siblings events (group)
720 * returns total number of events and error code
721 */
722static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp)
1136{ 723{
1137 if (!x86_pmu_initialized()) 724 struct perf_event *event;
1138 return; 725 int n, max_count;
1139 x86_pmu.enable_all();
1140}
1141 726
1142static inline u64 intel_pmu_get_status(void) 727 max_count = x86_pmu.num_events + x86_pmu.num_events_fixed;
1143{
1144 u64 status;
1145 728
1146 rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); 729 /* current number of events already accepted */
730 n = cpuc->n_events;
1147 731
1148 return status; 732 if (is_x86_event(leader)) {
1149} 733 if (n >= max_count)
734 return -ENOSPC;
735 cpuc->event_list[n] = leader;
736 n++;
737 }
738 if (!dogrp)
739 return n;
1150 740
1151static inline void intel_pmu_ack_status(u64 ack) 741 list_for_each_entry(event, &leader->sibling_list, group_entry) {
1152{ 742 if (!is_x86_event(event) ||
1153 wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); 743 event->state <= PERF_EVENT_STATE_OFF)
1154} 744 continue;
1155 745
1156static inline void x86_pmu_enable_event(struct hw_perf_event *hwc, int idx) 746 if (n >= max_count)
1157{ 747 return -ENOSPC;
1158 (void)checking_wrmsrl(hwc->config_base + idx,
1159 hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE);
1160}
1161 748
1162static inline void x86_pmu_disable_event(struct hw_perf_event *hwc, int idx) 749 cpuc->event_list[n] = event;
1163{ 750 n++;
1164 (void)checking_wrmsrl(hwc->config_base + idx, hwc->config); 751 }
752 return n;
1165} 753}
1166 754
1167static inline void 755static inline void x86_assign_hw_event(struct perf_event *event,
1168intel_pmu_disable_fixed(struct hw_perf_event *hwc, int __idx) 756 struct cpu_hw_events *cpuc, int i)
1169{ 757{
1170 int idx = __idx - X86_PMC_IDX_FIXED; 758 struct hw_perf_event *hwc = &event->hw;
1171 u64 ctrl_val, mask;
1172 759
1173 mask = 0xfULL << (idx * 4); 760 hwc->idx = cpuc->assign[i];
761 hwc->last_cpu = smp_processor_id();
762 hwc->last_tag = ++cpuc->tags[i];
1174 763
1175 rdmsrl(hwc->config_base, ctrl_val); 764 if (hwc->idx == X86_PMC_IDX_FIXED_BTS) {
1176 ctrl_val &= ~mask; 765 hwc->config_base = 0;
1177 (void)checking_wrmsrl(hwc->config_base, ctrl_val); 766 hwc->event_base = 0;
767 } else if (hwc->idx >= X86_PMC_IDX_FIXED) {
768 hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
769 /*
770 * We set it so that event_base + idx in wrmsr/rdmsr maps to
771 * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
772 */
773 hwc->event_base =
774 MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
775 } else {
776 hwc->config_base = x86_pmu.eventsel;
777 hwc->event_base = x86_pmu.perfctr;
778 }
1178} 779}
1179 780
1180static inline void 781static inline int match_prev_assignment(struct hw_perf_event *hwc,
1181p6_pmu_disable_event(struct hw_perf_event *hwc, int idx) 782 struct cpu_hw_events *cpuc,
783 int i)
1182{ 784{
1183 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 785 return hwc->idx == cpuc->assign[i] &&
1184 u64 val = P6_NOP_EVENT; 786 hwc->last_cpu == smp_processor_id() &&
1185 787 hwc->last_tag == cpuc->tags[i];
1186 if (cpuc->enabled)
1187 val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
1188
1189 (void)checking_wrmsrl(hwc->config_base + idx, val);
1190} 788}
1191 789
1192static inline void 790static int x86_pmu_start(struct perf_event *event);
1193intel_pmu_disable_event(struct hw_perf_event *hwc, int idx) 791static void x86_pmu_stop(struct perf_event *event);
792
793void hw_perf_enable(void)
1194{ 794{
1195 if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { 795 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1196 intel_pmu_disable_bts(); 796 struct perf_event *event;
797 struct hw_perf_event *hwc;
798 int i;
799
800 if (!x86_pmu_initialized())
1197 return; 801 return;
1198 }
1199 802
1200 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { 803 if (cpuc->enabled)
1201 intel_pmu_disable_fixed(hwc, idx);
1202 return; 804 return;
805
806 if (cpuc->n_added) {
807 int n_running = cpuc->n_events - cpuc->n_added;
808 /*
809 * apply assignment obtained either from
810 * hw_perf_group_sched_in() or x86_pmu_enable()
811 *
812 * step1: save events moving to new counters
813 * step2: reprogram moved events into new counters
814 */
815 for (i = 0; i < n_running; i++) {
816 event = cpuc->event_list[i];
817 hwc = &event->hw;
818
819 /*
820 * we can avoid reprogramming counter if:
821 * - assigned same counter as last time
822 * - running on same CPU as last time
823 * - no other event has used the counter since
824 */
825 if (hwc->idx == -1 ||
826 match_prev_assignment(hwc, cpuc, i))
827 continue;
828
829 x86_pmu_stop(event);
830 }
831
832 for (i = 0; i < cpuc->n_events; i++) {
833 event = cpuc->event_list[i];
834 hwc = &event->hw;
835
836 if (!match_prev_assignment(hwc, cpuc, i))
837 x86_assign_hw_event(event, cpuc, i);
838 else if (i < n_running)
839 continue;
840
841 x86_pmu_start(event);
842 }
843 cpuc->n_added = 0;
844 perf_events_lapic_init();
1203 } 845 }
1204 846
1205 x86_pmu_disable_event(hwc, idx); 847 cpuc->enabled = 1;
848 barrier();
849
850 x86_pmu.enable_all();
1206} 851}
1207 852
1208static inline void 853static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc)
1209amd_pmu_disable_event(struct hw_perf_event *hwc, int idx)
1210{ 854{
1211 x86_pmu_disable_event(hwc, idx); 855 (void)checking_wrmsrl(hwc->config_base + hwc->idx,
856 hwc->config | ARCH_PERFMON_EVENTSEL_ENABLE);
857}
858
859static inline void x86_pmu_disable_event(struct perf_event *event)
860{
861 struct hw_perf_event *hwc = &event->hw;
862 (void)checking_wrmsrl(hwc->config_base + hwc->idx, hwc->config);
1212} 863}
1213 864
1214static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); 865static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
@@ -1218,18 +869,18 @@ static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
1218 * To be called with the event disabled in hw: 869 * To be called with the event disabled in hw:
1219 */ 870 */
1220static int 871static int
1221x86_perf_event_set_period(struct perf_event *event, 872x86_perf_event_set_period(struct perf_event *event)
1222 struct hw_perf_event *hwc, int idx)
1223{ 873{
874 struct hw_perf_event *hwc = &event->hw;
1224 s64 left = atomic64_read(&hwc->period_left); 875 s64 left = atomic64_read(&hwc->period_left);
1225 s64 period = hwc->sample_period; 876 s64 period = hwc->sample_period;
1226 int err, ret = 0; 877 int err, ret = 0, idx = hwc->idx;
1227 878
1228 if (idx == X86_PMC_IDX_FIXED_BTS) 879 if (idx == X86_PMC_IDX_FIXED_BTS)
1229 return 0; 880 return 0;
1230 881
1231 /* 882 /*
1232 * If we are way outside a reasoable range then just skip forward: 883 * If we are way outside a reasonable range then just skip forward:
1233 */ 884 */
1234 if (unlikely(left <= -period)) { 885 if (unlikely(left <= -period)) {
1235 left = period; 886 left = period;
@@ -1269,157 +920,63 @@ x86_perf_event_set_period(struct perf_event *event,
1269 return ret; 920 return ret;
1270} 921}
1271 922
1272static inline void 923static void x86_pmu_enable_event(struct perf_event *event)
1273intel_pmu_enable_fixed(struct hw_perf_event *hwc, int __idx)
1274{
1275 int idx = __idx - X86_PMC_IDX_FIXED;
1276 u64 ctrl_val, bits, mask;
1277 int err;
1278
1279 /*
1280 * Enable IRQ generation (0x8),
1281 * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
1282 * if requested:
1283 */
1284 bits = 0x8ULL;
1285 if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
1286 bits |= 0x2;
1287 if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
1288 bits |= 0x1;
1289 bits <<= (idx * 4);
1290 mask = 0xfULL << (idx * 4);
1291
1292 rdmsrl(hwc->config_base, ctrl_val);
1293 ctrl_val &= ~mask;
1294 ctrl_val |= bits;
1295 err = checking_wrmsrl(hwc->config_base, ctrl_val);
1296}
1297
1298static void p6_pmu_enable_event(struct hw_perf_event *hwc, int idx)
1299{ 924{
1300 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 925 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1301 u64 val;
1302
1303 val = hwc->config;
1304 if (cpuc->enabled) 926 if (cpuc->enabled)
1305 val |= ARCH_PERFMON_EVENTSEL0_ENABLE; 927 __x86_pmu_enable_event(&event->hw);
1306
1307 (void)checking_wrmsrl(hwc->config_base + idx, val);
1308} 928}
1309 929
1310 930/*
1311static void intel_pmu_enable_event(struct hw_perf_event *hwc, int idx) 931 * activate a single event
1312{ 932 *
1313 if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { 933 * The event is added to the group of enabled events
1314 if (!__get_cpu_var(cpu_hw_events).enabled) 934 * but only if it can be scheduled with existing events.
1315 return; 935 *
1316 936 * Called with PMU disabled. If successful and return value 1,
1317 intel_pmu_enable_bts(hwc->config); 937 * then guaranteed to call perf_enable() and hw_perf_enable()
1318 return; 938 */
1319 } 939static int x86_pmu_enable(struct perf_event *event)
1320
1321 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
1322 intel_pmu_enable_fixed(hwc, idx);
1323 return;
1324 }
1325
1326 x86_pmu_enable_event(hwc, idx);
1327}
1328
1329static void amd_pmu_enable_event(struct hw_perf_event *hwc, int idx)
1330{ 940{
1331 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 941 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
942 struct hw_perf_event *hwc;
943 int assign[X86_PMC_IDX_MAX];
944 int n, n0, ret;
1332 945
1333 if (cpuc->enabled) 946 hwc = &event->hw;
1334 x86_pmu_enable_event(hwc, idx);
1335}
1336
1337static int
1338fixed_mode_idx(struct perf_event *event, struct hw_perf_event *hwc)
1339{
1340 unsigned int hw_event;
1341
1342 hw_event = hwc->config & ARCH_PERFMON_EVENT_MASK;
1343 947
1344 if (unlikely((hw_event == 948 n0 = cpuc->n_events;
1345 x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) && 949 n = collect_events(cpuc, event, false);
1346 (hwc->sample_period == 1))) 950 if (n < 0)
1347 return X86_PMC_IDX_FIXED_BTS; 951 return n;
1348 952
1349 if (!x86_pmu.num_events_fixed) 953 ret = x86_schedule_events(cpuc, n, assign);
1350 return -1; 954 if (ret)
955 return ret;
956 /*
957 * copy new assignment, now we know it is possible
958 * will be used by hw_perf_enable()
959 */
960 memcpy(cpuc->assign, assign, n*sizeof(int));
1351 961
1352 if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) 962 cpuc->n_events = n;
1353 return X86_PMC_IDX_FIXED_INSTRUCTIONS; 963 cpuc->n_added += n - n0;
1354 if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES)))
1355 return X86_PMC_IDX_FIXED_CPU_CYCLES;
1356 if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES)))
1357 return X86_PMC_IDX_FIXED_BUS_CYCLES;
1358 964
1359 return -1; 965 return 0;
1360} 966}
1361 967
1362/* 968static int x86_pmu_start(struct perf_event *event)
1363 * Find a PMC slot for the freshly enabled / scheduled in event:
1364 */
1365static int x86_pmu_enable(struct perf_event *event)
1366{ 969{
1367 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 970 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1368 struct hw_perf_event *hwc = &event->hw; 971 int idx = event->hw.idx;
1369 int idx;
1370 972
1371 idx = fixed_mode_idx(event, hwc); 973 if (idx == -1)
1372 if (idx == X86_PMC_IDX_FIXED_BTS) { 974 return -EAGAIN;
1373 /* BTS is already occupied. */
1374 if (test_and_set_bit(idx, cpuc->used_mask))
1375 return -EAGAIN;
1376
1377 hwc->config_base = 0;
1378 hwc->event_base = 0;
1379 hwc->idx = idx;
1380 } else if (idx >= 0) {
1381 /*
1382 * Try to get the fixed event, if that is already taken
1383 * then try to get a generic event:
1384 */
1385 if (test_and_set_bit(idx, cpuc->used_mask))
1386 goto try_generic;
1387
1388 hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
1389 /*
1390 * We set it so that event_base + idx in wrmsr/rdmsr maps to
1391 * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
1392 */
1393 hwc->event_base =
1394 MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
1395 hwc->idx = idx;
1396 } else {
1397 idx = hwc->idx;
1398 /* Try to get the previous generic event again */
1399 if (test_and_set_bit(idx, cpuc->used_mask)) {
1400try_generic:
1401 idx = find_first_zero_bit(cpuc->used_mask,
1402 x86_pmu.num_events);
1403 if (idx == x86_pmu.num_events)
1404 return -EAGAIN;
1405
1406 set_bit(idx, cpuc->used_mask);
1407 hwc->idx = idx;
1408 }
1409 hwc->config_base = x86_pmu.eventsel;
1410 hwc->event_base = x86_pmu.perfctr;
1411 }
1412
1413 perf_events_lapic_init();
1414
1415 x86_pmu.disable(hwc, idx);
1416 975
976 x86_perf_event_set_period(event);
1417 cpuc->events[idx] = event; 977 cpuc->events[idx] = event;
1418 set_bit(idx, cpuc->active_mask); 978 __set_bit(idx, cpuc->active_mask);
1419 979 x86_pmu.enable(event);
1420 x86_perf_event_set_period(event, hwc, idx);
1421 x86_pmu.enable(hwc, idx);
1422
1423 perf_event_update_userpage(event); 980 perf_event_update_userpage(event);
1424 981
1425 return 0; 982 return 0;
@@ -1427,14 +984,8 @@ try_generic:
1427 984
1428static void x86_pmu_unthrottle(struct perf_event *event) 985static void x86_pmu_unthrottle(struct perf_event *event)
1429{ 986{
1430 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 987 int ret = x86_pmu_start(event);
1431 struct hw_perf_event *hwc = &event->hw; 988 WARN_ON_ONCE(ret);
1432
1433 if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX ||
1434 cpuc->events[hwc->idx] != event))
1435 return;
1436
1437 x86_pmu.enable(hwc, hwc->idx);
1438} 989}
1439 990
1440void perf_event_print_debug(void) 991void perf_event_print_debug(void)
@@ -1464,7 +1015,7 @@ void perf_event_print_debug(void)
1464 pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow); 1015 pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow);
1465 pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed); 1016 pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed);
1466 } 1017 }
1467 pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used_mask); 1018 pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask);
1468 1019
1469 for (idx = 0; idx < x86_pmu.num_events; idx++) { 1020 for (idx = 0; idx < x86_pmu.num_events; idx++) {
1470 rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); 1021 rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
@@ -1488,254 +1039,50 @@ void perf_event_print_debug(void)
1488 local_irq_restore(flags); 1039 local_irq_restore(flags);
1489} 1040}
1490 1041
1491static void intel_pmu_drain_bts_buffer(struct cpu_hw_events *cpuc) 1042static void x86_pmu_stop(struct perf_event *event)
1492{
1493 struct debug_store *ds = cpuc->ds;
1494 struct bts_record {
1495 u64 from;
1496 u64 to;
1497 u64 flags;
1498 };
1499 struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS];
1500 struct bts_record *at, *top;
1501 struct perf_output_handle handle;
1502 struct perf_event_header header;
1503 struct perf_sample_data data;
1504 struct pt_regs regs;
1505
1506 if (!event)
1507 return;
1508
1509 if (!ds)
1510 return;
1511
1512 at = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
1513 top = (struct bts_record *)(unsigned long)ds->bts_index;
1514
1515 if (top <= at)
1516 return;
1517
1518 ds->bts_index = ds->bts_buffer_base;
1519
1520
1521 data.period = event->hw.last_period;
1522 data.addr = 0;
1523 regs.ip = 0;
1524
1525 /*
1526 * Prepare a generic sample, i.e. fill in the invariant fields.
1527 * We will overwrite the from and to address before we output
1528 * the sample.
1529 */
1530 perf_prepare_sample(&header, &data, event, &regs);
1531
1532 if (perf_output_begin(&handle, event,
1533 header.size * (top - at), 1, 1))
1534 return;
1535
1536 for (; at < top; at++) {
1537 data.ip = at->from;
1538 data.addr = at->to;
1539
1540 perf_output_sample(&handle, &header, &data, event);
1541 }
1542
1543 perf_output_end(&handle);
1544
1545 /* There's new data available. */
1546 event->hw.interrupts++;
1547 event->pending_kill = POLL_IN;
1548}
1549
1550static void x86_pmu_disable(struct perf_event *event)
1551{ 1043{
1552 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 1044 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1553 struct hw_perf_event *hwc = &event->hw; 1045 struct hw_perf_event *hwc = &event->hw;
1554 int idx = hwc->idx; 1046 int idx = hwc->idx;
1555 1047
1556 /* 1048 if (!__test_and_clear_bit(idx, cpuc->active_mask))
1557 * Must be done before we disable, otherwise the nmi handler 1049 return;
1558 * could reenable again:
1559 */
1560 clear_bit(idx, cpuc->active_mask);
1561 x86_pmu.disable(hwc, idx);
1562 1050
1563 /* 1051 x86_pmu.disable(event);
1564 * Make sure the cleared pointer becomes visible before we
1565 * (potentially) free the event:
1566 */
1567 barrier();
1568 1052
1569 /* 1053 /*
1570 * Drain the remaining delta count out of a event 1054 * Drain the remaining delta count out of a event
1571 * that we are disabling: 1055 * that we are disabling:
1572 */ 1056 */
1573 x86_perf_event_update(event, hwc, idx); 1057 x86_perf_event_update(event);
1574
1575 /* Drain the remaining BTS records. */
1576 if (unlikely(idx == X86_PMC_IDX_FIXED_BTS))
1577 intel_pmu_drain_bts_buffer(cpuc);
1578 1058
1579 cpuc->events[idx] = NULL; 1059 cpuc->events[idx] = NULL;
1580 clear_bit(idx, cpuc->used_mask);
1581
1582 perf_event_update_userpage(event);
1583}
1584
1585/*
1586 * Save and restart an expired event. Called by NMI contexts,
1587 * so it has to be careful about preempting normal event ops:
1588 */
1589static int intel_pmu_save_and_restart(struct perf_event *event)
1590{
1591 struct hw_perf_event *hwc = &event->hw;
1592 int idx = hwc->idx;
1593 int ret;
1594
1595 x86_perf_event_update(event, hwc, idx);
1596 ret = x86_perf_event_set_period(event, hwc, idx);
1597
1598 if (event->state == PERF_EVENT_STATE_ACTIVE)
1599 intel_pmu_enable_event(hwc, idx);
1600
1601 return ret;
1602} 1060}
1603 1061
1604static void intel_pmu_reset(void) 1062static void x86_pmu_disable(struct perf_event *event)
1605{
1606 struct debug_store *ds = __get_cpu_var(cpu_hw_events).ds;
1607 unsigned long flags;
1608 int idx;
1609
1610 if (!x86_pmu.num_events)
1611 return;
1612
1613 local_irq_save(flags);
1614
1615 printk("clearing PMU state on CPU#%d\n", smp_processor_id());
1616
1617 for (idx = 0; idx < x86_pmu.num_events; idx++) {
1618 checking_wrmsrl(x86_pmu.eventsel + idx, 0ull);
1619 checking_wrmsrl(x86_pmu.perfctr + idx, 0ull);
1620 }
1621 for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) {
1622 checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
1623 }
1624 if (ds)
1625 ds->bts_index = ds->bts_buffer_base;
1626
1627 local_irq_restore(flags);
1628}
1629
1630static int p6_pmu_handle_irq(struct pt_regs *regs)
1631{
1632 struct perf_sample_data data;
1633 struct cpu_hw_events *cpuc;
1634 struct perf_event *event;
1635 struct hw_perf_event *hwc;
1636 int idx, handled = 0;
1637 u64 val;
1638
1639 data.addr = 0;
1640
1641 cpuc = &__get_cpu_var(cpu_hw_events);
1642
1643 for (idx = 0; idx < x86_pmu.num_events; idx++) {
1644 if (!test_bit(idx, cpuc->active_mask))
1645 continue;
1646
1647 event = cpuc->events[idx];
1648 hwc = &event->hw;
1649
1650 val = x86_perf_event_update(event, hwc, idx);
1651 if (val & (1ULL << (x86_pmu.event_bits - 1)))
1652 continue;
1653
1654 /*
1655 * event overflow
1656 */
1657 handled = 1;
1658 data.period = event->hw.last_period;
1659
1660 if (!x86_perf_event_set_period(event, hwc, idx))
1661 continue;
1662
1663 if (perf_event_overflow(event, 1, &data, regs))
1664 p6_pmu_disable_event(hwc, idx);
1665 }
1666
1667 if (handled)
1668 inc_irq_stat(apic_perf_irqs);
1669
1670 return handled;
1671}
1672
1673/*
1674 * This handler is triggered by the local APIC, so the APIC IRQ handling
1675 * rules apply:
1676 */
1677static int intel_pmu_handle_irq(struct pt_regs *regs)
1678{ 1063{
1679 struct perf_sample_data data; 1064 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1680 struct cpu_hw_events *cpuc; 1065 int i;
1681 int bit, loops;
1682 u64 ack, status;
1683
1684 data.addr = 0;
1685
1686 cpuc = &__get_cpu_var(cpu_hw_events);
1687
1688 perf_disable();
1689 intel_pmu_drain_bts_buffer(cpuc);
1690 status = intel_pmu_get_status();
1691 if (!status) {
1692 perf_enable();
1693 return 0;
1694 }
1695
1696 loops = 0;
1697again:
1698 if (++loops > 100) {
1699 WARN_ONCE(1, "perfevents: irq loop stuck!\n");
1700 perf_event_print_debug();
1701 intel_pmu_reset();
1702 perf_enable();
1703 return 1;
1704 }
1705 1066
1706 inc_irq_stat(apic_perf_irqs); 1067 x86_pmu_stop(event);
1707 ack = status;
1708 for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
1709 struct perf_event *event = cpuc->events[bit];
1710 1068
1711 clear_bit(bit, (unsigned long *) &status); 1069 for (i = 0; i < cpuc->n_events; i++) {
1712 if (!test_bit(bit, cpuc->active_mask)) 1070 if (event == cpuc->event_list[i]) {
1713 continue;
1714 1071
1715 if (!intel_pmu_save_and_restart(event)) 1072 if (x86_pmu.put_event_constraints)
1716 continue; 1073 x86_pmu.put_event_constraints(cpuc, event);
1717 1074
1718 data.period = event->hw.last_period; 1075 while (++i < cpuc->n_events)
1076 cpuc->event_list[i-1] = cpuc->event_list[i];
1719 1077
1720 if (perf_event_overflow(event, 1, &data, regs)) 1078 --cpuc->n_events;
1721 intel_pmu_disable_event(&event->hw, bit); 1079 break;
1080 }
1722 } 1081 }
1723 1082 perf_event_update_userpage(event);
1724 intel_pmu_ack_status(ack);
1725
1726 /*
1727 * Repeat if there is more work to be done:
1728 */
1729 status = intel_pmu_get_status();
1730 if (status)
1731 goto again;
1732
1733 perf_enable();
1734
1735 return 1;
1736} 1083}
1737 1084
1738static int amd_pmu_handle_irq(struct pt_regs *regs) 1085static int x86_pmu_handle_irq(struct pt_regs *regs)
1739{ 1086{
1740 struct perf_sample_data data; 1087 struct perf_sample_data data;
1741 struct cpu_hw_events *cpuc; 1088 struct cpu_hw_events *cpuc;
@@ -1744,7 +1091,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs)
1744 int idx, handled = 0; 1091 int idx, handled = 0;
1745 u64 val; 1092 u64 val;
1746 1093
1747 data.addr = 0; 1094 perf_sample_data_init(&data, 0);
1748 1095
1749 cpuc = &__get_cpu_var(cpu_hw_events); 1096 cpuc = &__get_cpu_var(cpu_hw_events);
1750 1097
@@ -1755,7 +1102,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs)
1755 event = cpuc->events[idx]; 1102 event = cpuc->events[idx];
1756 hwc = &event->hw; 1103 hwc = &event->hw;
1757 1104
1758 val = x86_perf_event_update(event, hwc, idx); 1105 val = x86_perf_event_update(event);
1759 if (val & (1ULL << (x86_pmu.event_bits - 1))) 1106 if (val & (1ULL << (x86_pmu.event_bits - 1)))
1760 continue; 1107 continue;
1761 1108
@@ -1765,11 +1112,11 @@ static int amd_pmu_handle_irq(struct pt_regs *regs)
1765 handled = 1; 1112 handled = 1;
1766 data.period = event->hw.last_period; 1113 data.period = event->hw.last_period;
1767 1114
1768 if (!x86_perf_event_set_period(event, hwc, idx)) 1115 if (!x86_perf_event_set_period(event))
1769 continue; 1116 continue;
1770 1117
1771 if (perf_event_overflow(event, 1, &data, regs)) 1118 if (perf_event_overflow(event, 1, &data, regs))
1772 amd_pmu_disable_event(hwc, idx); 1119 x86_pmu_stop(event);
1773 } 1120 }
1774 1121
1775 if (handled) 1122 if (handled)
@@ -1852,196 +1199,186 @@ static __read_mostly struct notifier_block perf_event_nmi_notifier = {
1852 .priority = 1 1199 .priority = 1
1853}; 1200};
1854 1201
1855static struct x86_pmu p6_pmu = { 1202static struct event_constraint unconstrained;
1856 .name = "p6", 1203static struct event_constraint emptyconstraint;
1857 .handle_irq = p6_pmu_handle_irq,
1858 .disable_all = p6_pmu_disable_all,
1859 .enable_all = p6_pmu_enable_all,
1860 .enable = p6_pmu_enable_event,
1861 .disable = p6_pmu_disable_event,
1862 .eventsel = MSR_P6_EVNTSEL0,
1863 .perfctr = MSR_P6_PERFCTR0,
1864 .event_map = p6_pmu_event_map,
1865 .raw_event = p6_pmu_raw_event,
1866 .max_events = ARRAY_SIZE(p6_perfmon_event_map),
1867 .apic = 1,
1868 .max_period = (1ULL << 31) - 1,
1869 .version = 0,
1870 .num_events = 2,
1871 /*
1872 * Events have 40 bits implemented. However they are designed such
1873 * that bits [32-39] are sign extensions of bit 31. As such the
1874 * effective width of a event for P6-like PMU is 32 bits only.
1875 *
1876 * See IA-32 Intel Architecture Software developer manual Vol 3B
1877 */
1878 .event_bits = 32,
1879 .event_mask = (1ULL << 32) - 1,
1880};
1881 1204
1882static struct x86_pmu intel_pmu = { 1205static struct event_constraint *
1883 .name = "Intel", 1206x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
1884 .handle_irq = intel_pmu_handle_irq, 1207{
1885 .disable_all = intel_pmu_disable_all, 1208 struct event_constraint *c;
1886 .enable_all = intel_pmu_enable_all,
1887 .enable = intel_pmu_enable_event,
1888 .disable = intel_pmu_disable_event,
1889 .eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
1890 .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
1891 .event_map = intel_pmu_event_map,
1892 .raw_event = intel_pmu_raw_event,
1893 .max_events = ARRAY_SIZE(intel_perfmon_event_map),
1894 .apic = 1,
1895 /*
1896 * Intel PMCs cannot be accessed sanely above 32 bit width,
1897 * so we install an artificial 1<<31 period regardless of
1898 * the generic event period:
1899 */
1900 .max_period = (1ULL << 31) - 1,
1901 .enable_bts = intel_pmu_enable_bts,
1902 .disable_bts = intel_pmu_disable_bts,
1903};
1904 1209
1905static struct x86_pmu amd_pmu = { 1210 if (x86_pmu.event_constraints) {
1906 .name = "AMD", 1211 for_each_event_constraint(c, x86_pmu.event_constraints) {
1907 .handle_irq = amd_pmu_handle_irq, 1212 if ((event->hw.config & c->cmask) == c->code)
1908 .disable_all = amd_pmu_disable_all, 1213 return c;
1909 .enable_all = amd_pmu_enable_all, 1214 }
1910 .enable = amd_pmu_enable_event, 1215 }
1911 .disable = amd_pmu_disable_event, 1216
1912 .eventsel = MSR_K7_EVNTSEL0, 1217 return &unconstrained;
1913 .perfctr = MSR_K7_PERFCTR0, 1218}
1914 .event_map = amd_pmu_event_map,
1915 .raw_event = amd_pmu_raw_event,
1916 .max_events = ARRAY_SIZE(amd_perfmon_event_map),
1917 .num_events = 4,
1918 .event_bits = 48,
1919 .event_mask = (1ULL << 48) - 1,
1920 .apic = 1,
1921 /* use highest bit to detect overflow */
1922 .max_period = (1ULL << 47) - 1,
1923};
1924 1219
1925static int p6_pmu_init(void) 1220static int x86_event_sched_in(struct perf_event *event,
1221 struct perf_cpu_context *cpuctx)
1926{ 1222{
1927 switch (boot_cpu_data.x86_model) { 1223 int ret = 0;
1928 case 1:
1929 case 3: /* Pentium Pro */
1930 case 5:
1931 case 6: /* Pentium II */
1932 case 7:
1933 case 8:
1934 case 11: /* Pentium III */
1935 break;
1936 case 9:
1937 case 13:
1938 /* Pentium M */
1939 break;
1940 default:
1941 pr_cont("unsupported p6 CPU model %d ",
1942 boot_cpu_data.x86_model);
1943 return -ENODEV;
1944 }
1945 1224
1946 x86_pmu = p6_pmu; 1225 event->state = PERF_EVENT_STATE_ACTIVE;
1226 event->oncpu = smp_processor_id();
1227 event->tstamp_running += event->ctx->time - event->tstamp_stopped;
1947 1228
1948 if (!cpu_has_apic) { 1229 if (!is_x86_event(event))
1949 pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n"); 1230 ret = event->pmu->enable(event);
1950 pr_info("no hardware sampling interrupt available.\n");
1951 x86_pmu.apic = 0;
1952 }
1953 1231
1954 return 0; 1232 if (!ret && !is_software_event(event))
1233 cpuctx->active_oncpu++;
1234
1235 if (!ret && event->attr.exclusive)
1236 cpuctx->exclusive = 1;
1237
1238 return ret;
1955} 1239}
1956 1240
1957static int intel_pmu_init(void) 1241static void x86_event_sched_out(struct perf_event *event,
1242 struct perf_cpu_context *cpuctx)
1958{ 1243{
1959 union cpuid10_edx edx; 1244 event->state = PERF_EVENT_STATE_INACTIVE;
1960 union cpuid10_eax eax; 1245 event->oncpu = -1;
1961 unsigned int unused;
1962 unsigned int ebx;
1963 int version;
1964
1965 if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
1966 /* check for P6 processor family */
1967 if (boot_cpu_data.x86 == 6) {
1968 return p6_pmu_init();
1969 } else {
1970 return -ENODEV;
1971 }
1972 }
1973 1246
1974 /* 1247 if (!is_x86_event(event))
1975 * Check whether the Architectural PerfMon supports 1248 event->pmu->disable(event);
1976 * Branch Misses Retired hw_event or not.
1977 */
1978 cpuid(10, &eax.full, &ebx, &unused, &edx.full);
1979 if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
1980 return -ENODEV;
1981 1249
1982 version = eax.split.version_id; 1250 event->tstamp_running -= event->ctx->time - event->tstamp_stopped;
1983 if (version < 2) 1251
1984 return -ENODEV; 1252 if (!is_software_event(event))
1253 cpuctx->active_oncpu--;
1985 1254
1986 x86_pmu = intel_pmu; 1255 if (event->attr.exclusive || !cpuctx->active_oncpu)
1987 x86_pmu.version = version; 1256 cpuctx->exclusive = 0;
1988 x86_pmu.num_events = eax.split.num_events; 1257}
1989 x86_pmu.event_bits = eax.split.bit_width;
1990 x86_pmu.event_mask = (1ULL << eax.split.bit_width) - 1;
1991 1258
1259/*
1260 * Called to enable a whole group of events.
1261 * Returns 1 if the group was enabled, or -EAGAIN if it could not be.
1262 * Assumes the caller has disabled interrupts and has
1263 * frozen the PMU with hw_perf_save_disable.
1264 *
1265 * called with PMU disabled. If successful and return value 1,
1266 * then guaranteed to call perf_enable() and hw_perf_enable()
1267 */
1268int hw_perf_group_sched_in(struct perf_event *leader,
1269 struct perf_cpu_context *cpuctx,
1270 struct perf_event_context *ctx)
1271{
1272 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1273 struct perf_event *sub;
1274 int assign[X86_PMC_IDX_MAX];
1275 int n0, n1, ret;
1276
1277 /* n0 = total number of events */
1278 n0 = collect_events(cpuc, leader, true);
1279 if (n0 < 0)
1280 return n0;
1281
1282 ret = x86_schedule_events(cpuc, n0, assign);
1283 if (ret)
1284 return ret;
1285
1286 ret = x86_event_sched_in(leader, cpuctx);
1287 if (ret)
1288 return ret;
1289
1290 n1 = 1;
1291 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
1292 if (sub->state > PERF_EVENT_STATE_OFF) {
1293 ret = x86_event_sched_in(sub, cpuctx);
1294 if (ret)
1295 goto undo;
1296 ++n1;
1297 }
1298 }
1992 /* 1299 /*
1993 * Quirk: v2 perfmon does not report fixed-purpose events, so 1300 * copy new assignment, now we know it is possible
1994 * assume at least 3 events: 1301 * will be used by hw_perf_enable()
1995 */ 1302 */
1996 x86_pmu.num_events_fixed = max((int)edx.split.num_events_fixed, 3); 1303 memcpy(cpuc->assign, assign, n0*sizeof(int));
1304
1305 cpuc->n_events = n0;
1306 cpuc->n_added += n1;
1307 ctx->nr_active += n1;
1997 1308
1998 /* 1309 /*
1999 * Install the hw-cache-events table: 1310 * 1 means successful and events are active
1311 * This is not quite true because we defer
1312 * actual activation until hw_perf_enable() but
1313 * this way we ensure caller won't try to enable
1314 * individual events
2000 */ 1315 */
2001 switch (boot_cpu_data.x86_model) { 1316 return 1;
2002 case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */ 1317undo:
2003 case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */ 1318 x86_event_sched_out(leader, cpuctx);
2004 case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */ 1319 n0 = 1;
2005 case 29: /* six-core 45 nm xeon "Dunnington" */ 1320 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
2006 memcpy(hw_cache_event_ids, core2_hw_cache_event_ids, 1321 if (sub->state == PERF_EVENT_STATE_ACTIVE) {
2007 sizeof(hw_cache_event_ids)); 1322 x86_event_sched_out(sub, cpuctx);
2008 1323 if (++n0 == n1)
2009 pr_cont("Core2 events, "); 1324 break;
1325 }
1326 }
1327 return ret;
1328}
1329
1330#include "perf_event_amd.c"
1331#include "perf_event_p6.c"
1332#include "perf_event_intel.c"
1333
1334static int __cpuinit
1335x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
1336{
1337 unsigned int cpu = (long)hcpu;
1338 int ret = NOTIFY_OK;
1339
1340 switch (action & ~CPU_TASKS_FROZEN) {
1341 case CPU_UP_PREPARE:
1342 if (x86_pmu.cpu_prepare)
1343 ret = x86_pmu.cpu_prepare(cpu);
2010 break; 1344 break;
2011 default:
2012 case 26:
2013 memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
2014 sizeof(hw_cache_event_ids));
2015 1345
2016 pr_cont("Nehalem/Corei7 events, "); 1346 case CPU_STARTING:
1347 if (x86_pmu.cpu_starting)
1348 x86_pmu.cpu_starting(cpu);
2017 break; 1349 break;
2018 case 28:
2019 memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
2020 sizeof(hw_cache_event_ids));
2021 1350
2022 pr_cont("Atom events, "); 1351 case CPU_DYING:
1352 if (x86_pmu.cpu_dying)
1353 x86_pmu.cpu_dying(cpu);
1354 break;
1355
1356 case CPU_UP_CANCELED:
1357 case CPU_DEAD:
1358 if (x86_pmu.cpu_dead)
1359 x86_pmu.cpu_dead(cpu);
1360 break;
1361
1362 default:
2023 break; 1363 break;
2024 } 1364 }
2025 return 0; 1365
1366 return ret;
2026} 1367}
2027 1368
2028static int amd_pmu_init(void) 1369static void __init pmu_check_apic(void)
2029{ 1370{
2030 /* Performance-monitoring supported from K7 and later: */ 1371 if (cpu_has_apic)
2031 if (boot_cpu_data.x86 < 6) 1372 return;
2032 return -ENODEV;
2033
2034 x86_pmu = amd_pmu;
2035
2036 /* Events are common for all AMDs */
2037 memcpy(hw_cache_event_ids, amd_hw_cache_event_ids,
2038 sizeof(hw_cache_event_ids));
2039 1373
2040 return 0; 1374 x86_pmu.apic = 0;
1375 pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
1376 pr_info("no hardware sampling interrupt available.\n");
2041} 1377}
2042 1378
2043void __init init_hw_perf_events(void) 1379void __init init_hw_perf_events(void)
2044{ 1380{
1381 struct event_constraint *c;
2045 int err; 1382 int err;
2046 1383
2047 pr_info("Performance Events: "); 1384 pr_info("Performance Events: ");
@@ -2061,6 +1398,8 @@ void __init init_hw_perf_events(void)
2061 return; 1398 return;
2062 } 1399 }
2063 1400
1401 pmu_check_apic();
1402
2064 pr_cont("%s PMU driver.\n", x86_pmu.name); 1403 pr_cont("%s PMU driver.\n", x86_pmu.name);
2065 1404
2066 if (x86_pmu.num_events > X86_PMC_MAX_GENERIC) { 1405 if (x86_pmu.num_events > X86_PMC_MAX_GENERIC) {
@@ -2084,6 +1423,20 @@ void __init init_hw_perf_events(void)
2084 perf_events_lapic_init(); 1423 perf_events_lapic_init();
2085 register_die_notifier(&perf_event_nmi_notifier); 1424 register_die_notifier(&perf_event_nmi_notifier);
2086 1425
1426 unconstrained = (struct event_constraint)
1427 __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_events) - 1,
1428 0, x86_pmu.num_events);
1429
1430 if (x86_pmu.event_constraints) {
1431 for_each_event_constraint(c, x86_pmu.event_constraints) {
1432 if (c->cmask != INTEL_ARCH_FIXED_MASK)
1433 continue;
1434
1435 c->idxmsk64 |= (1ULL << x86_pmu.num_events) - 1;
1436 c->weight += x86_pmu.num_events;
1437 }
1438 }
1439
2087 pr_info("... version: %d\n", x86_pmu.version); 1440 pr_info("... version: %d\n", x86_pmu.version);
2088 pr_info("... bit width: %d\n", x86_pmu.event_bits); 1441 pr_info("... bit width: %d\n", x86_pmu.event_bits);
2089 pr_info("... generic registers: %d\n", x86_pmu.num_events); 1442 pr_info("... generic registers: %d\n", x86_pmu.num_events);
@@ -2091,25 +1444,92 @@ void __init init_hw_perf_events(void)
2091 pr_info("... max period: %016Lx\n", x86_pmu.max_period); 1444 pr_info("... max period: %016Lx\n", x86_pmu.max_period);
2092 pr_info("... fixed-purpose events: %d\n", x86_pmu.num_events_fixed); 1445 pr_info("... fixed-purpose events: %d\n", x86_pmu.num_events_fixed);
2093 pr_info("... event mask: %016Lx\n", perf_event_mask); 1446 pr_info("... event mask: %016Lx\n", perf_event_mask);
1447
1448 perf_cpu_notifier(x86_pmu_notifier);
2094} 1449}
2095 1450
2096static inline void x86_pmu_read(struct perf_event *event) 1451static inline void x86_pmu_read(struct perf_event *event)
2097{ 1452{
2098 x86_perf_event_update(event, &event->hw, event->hw.idx); 1453 x86_perf_event_update(event);
2099} 1454}
2100 1455
2101static const struct pmu pmu = { 1456static const struct pmu pmu = {
2102 .enable = x86_pmu_enable, 1457 .enable = x86_pmu_enable,
2103 .disable = x86_pmu_disable, 1458 .disable = x86_pmu_disable,
1459 .start = x86_pmu_start,
1460 .stop = x86_pmu_stop,
2104 .read = x86_pmu_read, 1461 .read = x86_pmu_read,
2105 .unthrottle = x86_pmu_unthrottle, 1462 .unthrottle = x86_pmu_unthrottle,
2106}; 1463};
2107 1464
1465/*
1466 * validate a single event group
1467 *
1468 * validation includes:
1469 * - check events are compatible with each other
1470 * - events do not compete for the same counter
1471 * - number of events <= number of counters
1472 *
1473 * validation ensures the group can be loaded onto the
1474 * PMU if it was the only group available.
1475 */
1476static int validate_group(struct perf_event *event)
1477{
1478 struct perf_event *leader = event->group_leader;
1479 struct cpu_hw_events *fake_cpuc;
1480 int ret, n;
1481
1482 ret = -ENOMEM;
1483 fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO);
1484 if (!fake_cpuc)
1485 goto out;
1486
1487 /*
1488 * the event is not yet connected with its
1489 * siblings therefore we must first collect
1490 * existing siblings, then add the new event
1491 * before we can simulate the scheduling
1492 */
1493 ret = -ENOSPC;
1494 n = collect_events(fake_cpuc, leader, true);
1495 if (n < 0)
1496 goto out_free;
1497
1498 fake_cpuc->n_events = n;
1499 n = collect_events(fake_cpuc, event, false);
1500 if (n < 0)
1501 goto out_free;
1502
1503 fake_cpuc->n_events = n;
1504
1505 ret = x86_schedule_events(fake_cpuc, n, NULL);
1506
1507out_free:
1508 kfree(fake_cpuc);
1509out:
1510 return ret;
1511}
1512
2108const struct pmu *hw_perf_event_init(struct perf_event *event) 1513const struct pmu *hw_perf_event_init(struct perf_event *event)
2109{ 1514{
1515 const struct pmu *tmp;
2110 int err; 1516 int err;
2111 1517
2112 err = __hw_perf_event_init(event); 1518 err = __hw_perf_event_init(event);
1519 if (!err) {
1520 /*
1521 * we temporarily connect event to its pmu
1522 * such that validate_group() can classify
1523 * it as an x86 event using is_x86_event()
1524 */
1525 tmp = event->pmu;
1526 event->pmu = &pmu;
1527
1528 if (event->group_leader != event)
1529 err = validate_group(event);
1530
1531 event->pmu = tmp;
1532 }
2113 if (err) { 1533 if (err) {
2114 if (event->destroy) 1534 if (event->destroy)
2115 event->destroy(event); 1535 event->destroy(event);
@@ -2132,7 +1552,6 @@ void callchain_store(struct perf_callchain_entry *entry, u64 ip)
2132 1552
2133static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry); 1553static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry);
2134static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry); 1554static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry);
2135static DEFINE_PER_CPU(int, in_nmi_frame);
2136 1555
2137 1556
2138static void 1557static void
@@ -2148,9 +1567,6 @@ static void backtrace_warning(void *data, char *msg)
2148 1567
2149static int backtrace_stack(void *data, char *name) 1568static int backtrace_stack(void *data, char *name)
2150{ 1569{
2151 per_cpu(in_nmi_frame, smp_processor_id()) =
2152 x86_is_stack_id(NMI_STACK, name);
2153
2154 return 0; 1570 return 0;
2155} 1571}
2156 1572
@@ -2158,9 +1574,6 @@ static void backtrace_address(void *data, unsigned long addr, int reliable)
2158{ 1574{
2159 struct perf_callchain_entry *entry = data; 1575 struct perf_callchain_entry *entry = data;
2160 1576
2161 if (per_cpu(in_nmi_frame, smp_processor_id()))
2162 return;
2163
2164 if (reliable) 1577 if (reliable)
2165 callchain_store(entry, addr); 1578 callchain_store(entry, addr);
2166} 1579}
@@ -2170,6 +1583,7 @@ static const struct stacktrace_ops backtrace_ops = {
2170 .warning_symbol = backtrace_warning_symbol, 1583 .warning_symbol = backtrace_warning_symbol,
2171 .stack = backtrace_stack, 1584 .stack = backtrace_stack,
2172 .address = backtrace_address, 1585 .address = backtrace_address,
1586 .walk_stack = print_context_stack_bp,
2173}; 1587};
2174 1588
2175#include "../dumpstack.h" 1589#include "../dumpstack.h"
@@ -2180,7 +1594,7 @@ perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
2180 callchain_store(entry, PERF_CONTEXT_KERNEL); 1594 callchain_store(entry, PERF_CONTEXT_KERNEL);
2181 callchain_store(entry, regs->ip); 1595 callchain_store(entry, regs->ip);
2182 1596
2183 dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry); 1597 dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry);
2184} 1598}
2185 1599
2186/* 1600/*
@@ -2218,14 +1632,42 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
2218 return len; 1632 return len;
2219} 1633}
2220 1634
2221static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) 1635#ifdef CONFIG_COMPAT
1636static inline int
1637perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
2222{ 1638{
2223 unsigned long bytes; 1639 /* 32-bit process in 64-bit kernel. */
1640 struct stack_frame_ia32 frame;
1641 const void __user *fp;
1642
1643 if (!test_thread_flag(TIF_IA32))
1644 return 0;
1645
1646 fp = compat_ptr(regs->bp);
1647 while (entry->nr < PERF_MAX_STACK_DEPTH) {
1648 unsigned long bytes;
1649 frame.next_frame = 0;
1650 frame.return_address = 0;
2224 1651
2225 bytes = copy_from_user_nmi(frame, fp, sizeof(*frame)); 1652 bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
1653 if (bytes != sizeof(frame))
1654 break;
2226 1655
2227 return bytes == sizeof(*frame); 1656 if (fp < compat_ptr(regs->sp))
1657 break;
1658
1659 callchain_store(entry, frame.return_address);
1660 fp = compat_ptr(frame.next_frame);
1661 }
1662 return 1;
2228} 1663}
1664#else
1665static inline int
1666perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
1667{
1668 return 0;
1669}
1670#endif
2229 1671
2230static void 1672static void
2231perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) 1673perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
@@ -2241,11 +1683,16 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
2241 callchain_store(entry, PERF_CONTEXT_USER); 1683 callchain_store(entry, PERF_CONTEXT_USER);
2242 callchain_store(entry, regs->ip); 1684 callchain_store(entry, regs->ip);
2243 1685
1686 if (perf_callchain_user32(regs, entry))
1687 return;
1688
2244 while (entry->nr < PERF_MAX_STACK_DEPTH) { 1689 while (entry->nr < PERF_MAX_STACK_DEPTH) {
1690 unsigned long bytes;
2245 frame.next_frame = NULL; 1691 frame.next_frame = NULL;
2246 frame.return_address = 0; 1692 frame.return_address = 0;
2247 1693
2248 if (!copy_stack_frame(fp, &frame)) 1694 bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
1695 if (bytes != sizeof(frame))
2249 break; 1696 break;
2250 1697
2251 if ((unsigned long)fp < regs->sp) 1698 if ((unsigned long)fp < regs->sp)
@@ -2266,9 +1713,6 @@ perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry)
2266 1713
2267 is_user = user_mode(regs); 1714 is_user = user_mode(regs);
2268 1715
2269 if (!current || current->pid == 0)
2270 return;
2271
2272 if (is_user && current->state != TASK_RUNNING) 1716 if (is_user && current->state != TASK_RUNNING)
2273 return; 1717 return;
2274 1718
@@ -2295,7 +1739,14 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2295 return entry; 1739 return entry;
2296} 1740}
2297 1741
2298void hw_perf_event_setup_online(int cpu) 1742void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip)
2299{ 1743{
2300 init_debug_store_on_cpu(cpu); 1744 regs->ip = ip;
1745 /*
1746 * perf_arch_fetch_caller_regs adds another call, we need to increment
1747 * the skip level
1748 */
1749 regs->bp = rewind_frame_pointer(skip + 1);
1750 regs->cs = __KERNEL_CS;
1751 local_save_flags(regs->flags);
2301} 1752}
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
new file mode 100644
index 000000000000..db6f7d4056e1
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -0,0 +1,422 @@
1#ifdef CONFIG_CPU_SUP_AMD
2
3static DEFINE_RAW_SPINLOCK(amd_nb_lock);
4
5static __initconst u64 amd_hw_cache_event_ids
6 [PERF_COUNT_HW_CACHE_MAX]
7 [PERF_COUNT_HW_CACHE_OP_MAX]
8 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
9{
10 [ C(L1D) ] = {
11 [ C(OP_READ) ] = {
12 [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */
13 [ C(RESULT_MISS) ] = 0x0041, /* Data Cache Misses */
14 },
15 [ C(OP_WRITE) ] = {
16 [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */
17 [ C(RESULT_MISS) ] = 0,
18 },
19 [ C(OP_PREFETCH) ] = {
20 [ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts */
21 [ C(RESULT_MISS) ] = 0x0167, /* Data Prefetcher :cancelled */
22 },
23 },
24 [ C(L1I ) ] = {
25 [ C(OP_READ) ] = {
26 [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches */
27 [ C(RESULT_MISS) ] = 0x0081, /* Instruction cache misses */
28 },
29 [ C(OP_WRITE) ] = {
30 [ C(RESULT_ACCESS) ] = -1,
31 [ C(RESULT_MISS) ] = -1,
32 },
33 [ C(OP_PREFETCH) ] = {
34 [ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */
35 [ C(RESULT_MISS) ] = 0,
36 },
37 },
38 [ C(LL ) ] = {
39 [ C(OP_READ) ] = {
40 [ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */
41 [ C(RESULT_MISS) ] = 0x037E, /* L2 Cache Misses : IC+DC */
42 },
43 [ C(OP_WRITE) ] = {
44 [ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback */
45 [ C(RESULT_MISS) ] = 0,
46 },
47 [ C(OP_PREFETCH) ] = {
48 [ C(RESULT_ACCESS) ] = 0,
49 [ C(RESULT_MISS) ] = 0,
50 },
51 },
52 [ C(DTLB) ] = {
53 [ C(OP_READ) ] = {
54 [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */
55 [ C(RESULT_MISS) ] = 0x0046, /* L1 DTLB and L2 DLTB Miss */
56 },
57 [ C(OP_WRITE) ] = {
58 [ C(RESULT_ACCESS) ] = 0,
59 [ C(RESULT_MISS) ] = 0,
60 },
61 [ C(OP_PREFETCH) ] = {
62 [ C(RESULT_ACCESS) ] = 0,
63 [ C(RESULT_MISS) ] = 0,
64 },
65 },
66 [ C(ITLB) ] = {
67 [ C(OP_READ) ] = {
68 [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes */
69 [ C(RESULT_MISS) ] = 0x0085, /* Instr. fetch ITLB misses */
70 },
71 [ C(OP_WRITE) ] = {
72 [ C(RESULT_ACCESS) ] = -1,
73 [ C(RESULT_MISS) ] = -1,
74 },
75 [ C(OP_PREFETCH) ] = {
76 [ C(RESULT_ACCESS) ] = -1,
77 [ C(RESULT_MISS) ] = -1,
78 },
79 },
80 [ C(BPU ) ] = {
81 [ C(OP_READ) ] = {
82 [ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr. */
83 [ C(RESULT_MISS) ] = 0x00c3, /* Retired Mispredicted BI */
84 },
85 [ C(OP_WRITE) ] = {
86 [ C(RESULT_ACCESS) ] = -1,
87 [ C(RESULT_MISS) ] = -1,
88 },
89 [ C(OP_PREFETCH) ] = {
90 [ C(RESULT_ACCESS) ] = -1,
91 [ C(RESULT_MISS) ] = -1,
92 },
93 },
94};
95
96/*
97 * AMD Performance Monitor K7 and later.
98 */
99static const u64 amd_perfmon_event_map[] =
100{
101 [PERF_COUNT_HW_CPU_CYCLES] = 0x0076,
102 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
103 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080,
104 [PERF_COUNT_HW_CACHE_MISSES] = 0x0081,
105 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
106 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
107};
108
109static u64 amd_pmu_event_map(int hw_event)
110{
111 return amd_perfmon_event_map[hw_event];
112}
113
114static u64 amd_pmu_raw_event(u64 hw_event)
115{
116#define K7_EVNTSEL_EVENT_MASK 0xF000000FFULL
117#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL
118#define K7_EVNTSEL_EDGE_MASK 0x000040000ULL
119#define K7_EVNTSEL_INV_MASK 0x000800000ULL
120#define K7_EVNTSEL_REG_MASK 0x0FF000000ULL
121
122#define K7_EVNTSEL_MASK \
123 (K7_EVNTSEL_EVENT_MASK | \
124 K7_EVNTSEL_UNIT_MASK | \
125 K7_EVNTSEL_EDGE_MASK | \
126 K7_EVNTSEL_INV_MASK | \
127 K7_EVNTSEL_REG_MASK)
128
129 return hw_event & K7_EVNTSEL_MASK;
130}
131
132/*
133 * AMD64 events are detected based on their event codes.
134 */
135static inline int amd_is_nb_event(struct hw_perf_event *hwc)
136{
137 return (hwc->config & 0xe0) == 0xe0;
138}
139
140static inline int amd_has_nb(struct cpu_hw_events *cpuc)
141{
142 struct amd_nb *nb = cpuc->amd_nb;
143
144 return nb && nb->nb_id != -1;
145}
146
147static void amd_put_event_constraints(struct cpu_hw_events *cpuc,
148 struct perf_event *event)
149{
150 struct hw_perf_event *hwc = &event->hw;
151 struct amd_nb *nb = cpuc->amd_nb;
152 int i;
153
154 /*
155 * only care about NB events
156 */
157 if (!(amd_has_nb(cpuc) && amd_is_nb_event(hwc)))
158 return;
159
160 /*
161 * need to scan whole list because event may not have
162 * been assigned during scheduling
163 *
164 * no race condition possible because event can only
165 * be removed on one CPU at a time AND PMU is disabled
166 * when we come here
167 */
168 for (i = 0; i < x86_pmu.num_events; i++) {
169 if (nb->owners[i] == event) {
170 cmpxchg(nb->owners+i, event, NULL);
171 break;
172 }
173 }
174}
175
176 /*
177 * AMD64 NorthBridge events need special treatment because
178 * counter access needs to be synchronized across all cores
179 * of a package. Refer to BKDG section 3.12
180 *
181 * NB events are events measuring L3 cache, Hypertransport
182 * traffic. They are identified by an event code >= 0xe00.
183 * They measure events on the NorthBridge which is shared
184 * by all cores on a package. NB events are counted on a
185 * shared set of counters. When a NB event is programmed
186 * in a counter, the data actually comes from a shared
187 * counter. Thus, access to those counters needs to be
188 * synchronized.
189 *
190 * We implement the synchronization such that no two cores
191 * can be measuring NB events using the same counters. Thus,
192 * we maintain a per-NB allocation table. The available slot
193 * is propagated using the event_constraint structure.
194 *
195 * We provide only one choice for each NB event based on
196 * the fact that only NB events have restrictions. Consequently,
197 * if a counter is available, there is a guarantee the NB event
198 * will be assigned to it. If no slot is available, an empty
199 * constraint is returned and scheduling will eventually fail
200 * for this event.
201 *
202 * Note that all cores attached to the same NB compete for the same
203 * counters to host NB events, this is why we use atomic ops. Some
204 * multi-chip CPUs may have more than one NB.
205 *
206 * Given that resources are allocated (cmpxchg), they must be
207 * eventually freed for others to use. This is accomplished by
208 * calling amd_put_event_constraints().
209 *
210 * Non NB events are not impacted by this restriction.
211 */
212static struct event_constraint *
213amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
214{
215 struct hw_perf_event *hwc = &event->hw;
216 struct amd_nb *nb = cpuc->amd_nb;
217 struct perf_event *old = NULL;
218 int max = x86_pmu.num_events;
219 int i, j, k = -1;
220
221 /*
222 * if not NB event or no NB, then no constraints
223 */
224 if (!(amd_has_nb(cpuc) && amd_is_nb_event(hwc)))
225 return &unconstrained;
226
227 /*
228 * detect if already present, if so reuse
229 *
230 * cannot merge with actual allocation
231 * because of possible holes
232 *
233 * event can already be present yet not assigned (in hwc->idx)
234 * because of successive calls to x86_schedule_events() from
235 * hw_perf_group_sched_in() without hw_perf_enable()
236 */
237 for (i = 0; i < max; i++) {
238 /*
239 * keep track of first free slot
240 */
241 if (k == -1 && !nb->owners[i])
242 k = i;
243
244 /* already present, reuse */
245 if (nb->owners[i] == event)
246 goto done;
247 }
248 /*
249 * not present, so grab a new slot
250 * starting either at:
251 */
252 if (hwc->idx != -1) {
253 /* previous assignment */
254 i = hwc->idx;
255 } else if (k != -1) {
256 /* start from free slot found */
257 i = k;
258 } else {
259 /*
260 * event not found, no slot found in
261 * first pass, try again from the
262 * beginning
263 */
264 i = 0;
265 }
266 j = i;
267 do {
268 old = cmpxchg(nb->owners+i, NULL, event);
269 if (!old)
270 break;
271 if (++i == max)
272 i = 0;
273 } while (i != j);
274done:
275 if (!old)
276 return &nb->event_constraints[i];
277
278 return &emptyconstraint;
279}
280
281static struct amd_nb *amd_alloc_nb(int cpu, int nb_id)
282{
283 struct amd_nb *nb;
284 int i;
285
286 nb = kmalloc(sizeof(struct amd_nb), GFP_KERNEL);
287 if (!nb)
288 return NULL;
289
290 memset(nb, 0, sizeof(*nb));
291 nb->nb_id = nb_id;
292
293 /*
294 * initialize all possible NB constraints
295 */
296 for (i = 0; i < x86_pmu.num_events; i++) {
297 __set_bit(i, nb->event_constraints[i].idxmsk);
298 nb->event_constraints[i].weight = 1;
299 }
300 return nb;
301}
302
303static int amd_pmu_cpu_prepare(int cpu)
304{
305 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
306
307 WARN_ON_ONCE(cpuc->amd_nb);
308
309 if (boot_cpu_data.x86_max_cores < 2)
310 return NOTIFY_OK;
311
312 cpuc->amd_nb = amd_alloc_nb(cpu, -1);
313 if (!cpuc->amd_nb)
314 return NOTIFY_BAD;
315
316 return NOTIFY_OK;
317}
318
319static void amd_pmu_cpu_starting(int cpu)
320{
321 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
322 struct amd_nb *nb;
323 int i, nb_id;
324
325 if (boot_cpu_data.x86_max_cores < 2)
326 return;
327
328 nb_id = amd_get_nb_id(cpu);
329 WARN_ON_ONCE(nb_id == BAD_APICID);
330
331 raw_spin_lock(&amd_nb_lock);
332
333 for_each_online_cpu(i) {
334 nb = per_cpu(cpu_hw_events, i).amd_nb;
335 if (WARN_ON_ONCE(!nb))
336 continue;
337
338 if (nb->nb_id == nb_id) {
339 kfree(cpuc->amd_nb);
340 cpuc->amd_nb = nb;
341 break;
342 }
343 }
344
345 cpuc->amd_nb->nb_id = nb_id;
346 cpuc->amd_nb->refcnt++;
347
348 raw_spin_unlock(&amd_nb_lock);
349}
350
351static void amd_pmu_cpu_dead(int cpu)
352{
353 struct cpu_hw_events *cpuhw;
354
355 if (boot_cpu_data.x86_max_cores < 2)
356 return;
357
358 cpuhw = &per_cpu(cpu_hw_events, cpu);
359
360 raw_spin_lock(&amd_nb_lock);
361
362 if (cpuhw->amd_nb) {
363 struct amd_nb *nb = cpuhw->amd_nb;
364
365 if (nb->nb_id == -1 || --nb->refcnt == 0)
366 kfree(nb);
367
368 cpuhw->amd_nb = NULL;
369 }
370
371 raw_spin_unlock(&amd_nb_lock);
372}
373
374static __initconst struct x86_pmu amd_pmu = {
375 .name = "AMD",
376 .handle_irq = x86_pmu_handle_irq,
377 .disable_all = x86_pmu_disable_all,
378 .enable_all = x86_pmu_enable_all,
379 .enable = x86_pmu_enable_event,
380 .disable = x86_pmu_disable_event,
381 .eventsel = MSR_K7_EVNTSEL0,
382 .perfctr = MSR_K7_PERFCTR0,
383 .event_map = amd_pmu_event_map,
384 .raw_event = amd_pmu_raw_event,
385 .max_events = ARRAY_SIZE(amd_perfmon_event_map),
386 .num_events = 4,
387 .event_bits = 48,
388 .event_mask = (1ULL << 48) - 1,
389 .apic = 1,
390 /* use highest bit to detect overflow */
391 .max_period = (1ULL << 47) - 1,
392 .get_event_constraints = amd_get_event_constraints,
393 .put_event_constraints = amd_put_event_constraints,
394
395 .cpu_prepare = amd_pmu_cpu_prepare,
396 .cpu_starting = amd_pmu_cpu_starting,
397 .cpu_dead = amd_pmu_cpu_dead,
398};
399
400static __init int amd_pmu_init(void)
401{
402 /* Performance-monitoring supported from K7 and later: */
403 if (boot_cpu_data.x86 < 6)
404 return -ENODEV;
405
406 x86_pmu = amd_pmu;
407
408 /* Events are common for all AMDs */
409 memcpy(hw_cache_event_ids, amd_hw_cache_event_ids,
410 sizeof(hw_cache_event_ids));
411
412 return 0;
413}
414
415#else /* CONFIG_CPU_SUP_AMD */
416
417static int amd_pmu_init(void)
418{
419 return 0;
420}
421
422#endif
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
new file mode 100644
index 000000000000..9c794ac87837
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -0,0 +1,980 @@
1#ifdef CONFIG_CPU_SUP_INTEL
2
3/*
4 * Intel PerfMon, used on Core and later.
5 */
6static const u64 intel_perfmon_event_map[] =
7{
8 [PERF_COUNT_HW_CPU_CYCLES] = 0x003c,
9 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
10 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e,
11 [PERF_COUNT_HW_CACHE_MISSES] = 0x412e,
12 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
13 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
14 [PERF_COUNT_HW_BUS_CYCLES] = 0x013c,
15};
16
17static struct event_constraint intel_core_event_constraints[] =
18{
19 INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
20 INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
21 INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
22 INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
23 INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */
24 INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FP_COMP_INSTR_RET */
25 EVENT_CONSTRAINT_END
26};
27
28static struct event_constraint intel_core2_event_constraints[] =
29{
30 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
31 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
32 /*
33 * Core2 has Fixed Counter 2 listed as CPU_CLK_UNHALTED.REF and event
34 * 0x013c as CPU_CLK_UNHALTED.BUS and specifies there is a fixed
35 * ratio between these counters.
36 */
37 /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
38 INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
39 INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
40 INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
41 INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
42 INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
43 INTEL_EVENT_CONSTRAINT(0x18, 0x1), /* IDLE_DURING_DIV */
44 INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */
45 INTEL_EVENT_CONSTRAINT(0xa1, 0x1), /* RS_UOPS_DISPATCH_CYCLES */
46 INTEL_EVENT_CONSTRAINT(0xc9, 0x1), /* ITLB_MISS_RETIRED (T30-9) */
47 INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED */
48 EVENT_CONSTRAINT_END
49};
50
51static struct event_constraint intel_nehalem_event_constraints[] =
52{
53 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
54 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
55 /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
56 INTEL_EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */
57 INTEL_EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */
58 INTEL_EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */
59 INTEL_EVENT_CONSTRAINT(0x43, 0x3), /* L1D_ALL_REF */
60 INTEL_EVENT_CONSTRAINT(0x48, 0x3), /* L1D_PEND_MISS */
61 INTEL_EVENT_CONSTRAINT(0x4e, 0x3), /* L1D_PREFETCH */
62 INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
63 INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */
64 EVENT_CONSTRAINT_END
65};
66
67static struct event_constraint intel_westmere_event_constraints[] =
68{
69 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
70 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
71 /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
72 INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
73 INTEL_EVENT_CONSTRAINT(0x60, 0x1), /* OFFCORE_REQUESTS_OUTSTANDING */
74 INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */
75 EVENT_CONSTRAINT_END
76};
77
78static struct event_constraint intel_gen_event_constraints[] =
79{
80 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
81 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
82 /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
83 EVENT_CONSTRAINT_END
84};
85
86static u64 intel_pmu_event_map(int hw_event)
87{
88 return intel_perfmon_event_map[hw_event];
89}
90
91static __initconst u64 westmere_hw_cache_event_ids
92 [PERF_COUNT_HW_CACHE_MAX]
93 [PERF_COUNT_HW_CACHE_OP_MAX]
94 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
95{
96 [ C(L1D) ] = {
97 [ C(OP_READ) ] = {
98 [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */
99 [ C(RESULT_MISS) ] = 0x0151, /* L1D.REPL */
100 },
101 [ C(OP_WRITE) ] = {
102 [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES */
103 [ C(RESULT_MISS) ] = 0x0251, /* L1D.M_REPL */
104 },
105 [ C(OP_PREFETCH) ] = {
106 [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */
107 [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */
108 },
109 },
110 [ C(L1I ) ] = {
111 [ C(OP_READ) ] = {
112 [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
113 [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
114 },
115 [ C(OP_WRITE) ] = {
116 [ C(RESULT_ACCESS) ] = -1,
117 [ C(RESULT_MISS) ] = -1,
118 },
119 [ C(OP_PREFETCH) ] = {
120 [ C(RESULT_ACCESS) ] = 0x0,
121 [ C(RESULT_MISS) ] = 0x0,
122 },
123 },
124 [ C(LL ) ] = {
125 [ C(OP_READ) ] = {
126 [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */
127 [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */
128 },
129 [ C(OP_WRITE) ] = {
130 [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */
131 [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */
132 },
133 [ C(OP_PREFETCH) ] = {
134 [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */
135 [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */
136 },
137 },
138 [ C(DTLB) ] = {
139 [ C(OP_READ) ] = {
140 [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */
141 [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */
142 },
143 [ C(OP_WRITE) ] = {
144 [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES */
145 [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */
146 },
147 [ C(OP_PREFETCH) ] = {
148 [ C(RESULT_ACCESS) ] = 0x0,
149 [ C(RESULT_MISS) ] = 0x0,
150 },
151 },
152 [ C(ITLB) ] = {
153 [ C(OP_READ) ] = {
154 [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */
155 [ C(RESULT_MISS) ] = 0x0185, /* ITLB_MISSES.ANY */
156 },
157 [ C(OP_WRITE) ] = {
158 [ C(RESULT_ACCESS) ] = -1,
159 [ C(RESULT_MISS) ] = -1,
160 },
161 [ C(OP_PREFETCH) ] = {
162 [ C(RESULT_ACCESS) ] = -1,
163 [ C(RESULT_MISS) ] = -1,
164 },
165 },
166 [ C(BPU ) ] = {
167 [ C(OP_READ) ] = {
168 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
169 [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */
170 },
171 [ C(OP_WRITE) ] = {
172 [ C(RESULT_ACCESS) ] = -1,
173 [ C(RESULT_MISS) ] = -1,
174 },
175 [ C(OP_PREFETCH) ] = {
176 [ C(RESULT_ACCESS) ] = -1,
177 [ C(RESULT_MISS) ] = -1,
178 },
179 },
180};
181
182static __initconst u64 nehalem_hw_cache_event_ids
183 [PERF_COUNT_HW_CACHE_MAX]
184 [PERF_COUNT_HW_CACHE_OP_MAX]
185 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
186{
187 [ C(L1D) ] = {
188 [ C(OP_READ) ] = {
189 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */
190 [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */
191 },
192 [ C(OP_WRITE) ] = {
193 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */
194 [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */
195 },
196 [ C(OP_PREFETCH) ] = {
197 [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */
198 [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */
199 },
200 },
201 [ C(L1I ) ] = {
202 [ C(OP_READ) ] = {
203 [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
204 [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
205 },
206 [ C(OP_WRITE) ] = {
207 [ C(RESULT_ACCESS) ] = -1,
208 [ C(RESULT_MISS) ] = -1,
209 },
210 [ C(OP_PREFETCH) ] = {
211 [ C(RESULT_ACCESS) ] = 0x0,
212 [ C(RESULT_MISS) ] = 0x0,
213 },
214 },
215 [ C(LL ) ] = {
216 [ C(OP_READ) ] = {
217 [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */
218 [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */
219 },
220 [ C(OP_WRITE) ] = {
221 [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */
222 [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */
223 },
224 [ C(OP_PREFETCH) ] = {
225 [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */
226 [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */
227 },
228 },
229 [ C(DTLB) ] = {
230 [ C(OP_READ) ] = {
231 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
232 [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */
233 },
234 [ C(OP_WRITE) ] = {
235 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
236 [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */
237 },
238 [ C(OP_PREFETCH) ] = {
239 [ C(RESULT_ACCESS) ] = 0x0,
240 [ C(RESULT_MISS) ] = 0x0,
241 },
242 },
243 [ C(ITLB) ] = {
244 [ C(OP_READ) ] = {
245 [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */
246 [ C(RESULT_MISS) ] = 0x20c8, /* ITLB_MISS_RETIRED */
247 },
248 [ C(OP_WRITE) ] = {
249 [ C(RESULT_ACCESS) ] = -1,
250 [ C(RESULT_MISS) ] = -1,
251 },
252 [ C(OP_PREFETCH) ] = {
253 [ C(RESULT_ACCESS) ] = -1,
254 [ C(RESULT_MISS) ] = -1,
255 },
256 },
257 [ C(BPU ) ] = {
258 [ C(OP_READ) ] = {
259 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
260 [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */
261 },
262 [ C(OP_WRITE) ] = {
263 [ C(RESULT_ACCESS) ] = -1,
264 [ C(RESULT_MISS) ] = -1,
265 },
266 [ C(OP_PREFETCH) ] = {
267 [ C(RESULT_ACCESS) ] = -1,
268 [ C(RESULT_MISS) ] = -1,
269 },
270 },
271};
272
273static __initconst u64 core2_hw_cache_event_ids
274 [PERF_COUNT_HW_CACHE_MAX]
275 [PERF_COUNT_HW_CACHE_OP_MAX]
276 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
277{
278 [ C(L1D) ] = {
279 [ C(OP_READ) ] = {
280 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */
281 [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */
282 },
283 [ C(OP_WRITE) ] = {
284 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */
285 [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */
286 },
287 [ C(OP_PREFETCH) ] = {
288 [ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS */
289 [ C(RESULT_MISS) ] = 0,
290 },
291 },
292 [ C(L1I ) ] = {
293 [ C(OP_READ) ] = {
294 [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */
295 [ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */
296 },
297 [ C(OP_WRITE) ] = {
298 [ C(RESULT_ACCESS) ] = -1,
299 [ C(RESULT_MISS) ] = -1,
300 },
301 [ C(OP_PREFETCH) ] = {
302 [ C(RESULT_ACCESS) ] = 0,
303 [ C(RESULT_MISS) ] = 0,
304 },
305 },
306 [ C(LL ) ] = {
307 [ C(OP_READ) ] = {
308 [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
309 [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
310 },
311 [ C(OP_WRITE) ] = {
312 [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */
313 [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */
314 },
315 [ C(OP_PREFETCH) ] = {
316 [ C(RESULT_ACCESS) ] = 0,
317 [ C(RESULT_MISS) ] = 0,
318 },
319 },
320 [ C(DTLB) ] = {
321 [ C(OP_READ) ] = {
322 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
323 [ C(RESULT_MISS) ] = 0x0208, /* DTLB_MISSES.MISS_LD */
324 },
325 [ C(OP_WRITE) ] = {
326 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
327 [ C(RESULT_MISS) ] = 0x0808, /* DTLB_MISSES.MISS_ST */
328 },
329 [ C(OP_PREFETCH) ] = {
330 [ C(RESULT_ACCESS) ] = 0,
331 [ C(RESULT_MISS) ] = 0,
332 },
333 },
334 [ C(ITLB) ] = {
335 [ C(OP_READ) ] = {
336 [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
337 [ C(RESULT_MISS) ] = 0x1282, /* ITLBMISSES */
338 },
339 [ C(OP_WRITE) ] = {
340 [ C(RESULT_ACCESS) ] = -1,
341 [ C(RESULT_MISS) ] = -1,
342 },
343 [ C(OP_PREFETCH) ] = {
344 [ C(RESULT_ACCESS) ] = -1,
345 [ C(RESULT_MISS) ] = -1,
346 },
347 },
348 [ C(BPU ) ] = {
349 [ C(OP_READ) ] = {
350 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
351 [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
352 },
353 [ C(OP_WRITE) ] = {
354 [ C(RESULT_ACCESS) ] = -1,
355 [ C(RESULT_MISS) ] = -1,
356 },
357 [ C(OP_PREFETCH) ] = {
358 [ C(RESULT_ACCESS) ] = -1,
359 [ C(RESULT_MISS) ] = -1,
360 },
361 },
362};
363
364static __initconst u64 atom_hw_cache_event_ids
365 [PERF_COUNT_HW_CACHE_MAX]
366 [PERF_COUNT_HW_CACHE_OP_MAX]
367 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
368{
369 [ C(L1D) ] = {
370 [ C(OP_READ) ] = {
371 [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD */
372 [ C(RESULT_MISS) ] = 0,
373 },
374 [ C(OP_WRITE) ] = {
375 [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST */
376 [ C(RESULT_MISS) ] = 0,
377 },
378 [ C(OP_PREFETCH) ] = {
379 [ C(RESULT_ACCESS) ] = 0x0,
380 [ C(RESULT_MISS) ] = 0,
381 },
382 },
383 [ C(L1I ) ] = {
384 [ C(OP_READ) ] = {
385 [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
386 [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
387 },
388 [ C(OP_WRITE) ] = {
389 [ C(RESULT_ACCESS) ] = -1,
390 [ C(RESULT_MISS) ] = -1,
391 },
392 [ C(OP_PREFETCH) ] = {
393 [ C(RESULT_ACCESS) ] = 0,
394 [ C(RESULT_MISS) ] = 0,
395 },
396 },
397 [ C(LL ) ] = {
398 [ C(OP_READ) ] = {
399 [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
400 [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
401 },
402 [ C(OP_WRITE) ] = {
403 [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */
404 [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */
405 },
406 [ C(OP_PREFETCH) ] = {
407 [ C(RESULT_ACCESS) ] = 0,
408 [ C(RESULT_MISS) ] = 0,
409 },
410 },
411 [ C(DTLB) ] = {
412 [ C(OP_READ) ] = {
413 [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI (alias) */
414 [ C(RESULT_MISS) ] = 0x0508, /* DTLB_MISSES.MISS_LD */
415 },
416 [ C(OP_WRITE) ] = {
417 [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI (alias) */
418 [ C(RESULT_MISS) ] = 0x0608, /* DTLB_MISSES.MISS_ST */
419 },
420 [ C(OP_PREFETCH) ] = {
421 [ C(RESULT_ACCESS) ] = 0,
422 [ C(RESULT_MISS) ] = 0,
423 },
424 },
425 [ C(ITLB) ] = {
426 [ C(OP_READ) ] = {
427 [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
428 [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */
429 },
430 [ C(OP_WRITE) ] = {
431 [ C(RESULT_ACCESS) ] = -1,
432 [ C(RESULT_MISS) ] = -1,
433 },
434 [ C(OP_PREFETCH) ] = {
435 [ C(RESULT_ACCESS) ] = -1,
436 [ C(RESULT_MISS) ] = -1,
437 },
438 },
439 [ C(BPU ) ] = {
440 [ C(OP_READ) ] = {
441 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
442 [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
443 },
444 [ C(OP_WRITE) ] = {
445 [ C(RESULT_ACCESS) ] = -1,
446 [ C(RESULT_MISS) ] = -1,
447 },
448 [ C(OP_PREFETCH) ] = {
449 [ C(RESULT_ACCESS) ] = -1,
450 [ C(RESULT_MISS) ] = -1,
451 },
452 },
453};
454
455static u64 intel_pmu_raw_event(u64 hw_event)
456{
457#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL
458#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL
459#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL
460#define CORE_EVNTSEL_INV_MASK 0x00800000ULL
461#define CORE_EVNTSEL_REG_MASK 0xFF000000ULL
462
463#define CORE_EVNTSEL_MASK \
464 (INTEL_ARCH_EVTSEL_MASK | \
465 INTEL_ARCH_UNIT_MASK | \
466 INTEL_ARCH_EDGE_MASK | \
467 INTEL_ARCH_INV_MASK | \
468 INTEL_ARCH_CNT_MASK)
469
470 return hw_event & CORE_EVNTSEL_MASK;
471}
472
473static void intel_pmu_enable_bts(u64 config)
474{
475 unsigned long debugctlmsr;
476
477 debugctlmsr = get_debugctlmsr();
478
479 debugctlmsr |= X86_DEBUGCTL_TR;
480 debugctlmsr |= X86_DEBUGCTL_BTS;
481 debugctlmsr |= X86_DEBUGCTL_BTINT;
482
483 if (!(config & ARCH_PERFMON_EVENTSEL_OS))
484 debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS;
485
486 if (!(config & ARCH_PERFMON_EVENTSEL_USR))
487 debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR;
488
489 update_debugctlmsr(debugctlmsr);
490}
491
492static void intel_pmu_disable_bts(void)
493{
494 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
495 unsigned long debugctlmsr;
496
497 if (!cpuc->ds)
498 return;
499
500 debugctlmsr = get_debugctlmsr();
501
502 debugctlmsr &=
503 ~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT |
504 X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR);
505
506 update_debugctlmsr(debugctlmsr);
507}
508
509static void intel_pmu_disable_all(void)
510{
511 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
512
513 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
514
515 if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask))
516 intel_pmu_disable_bts();
517}
518
519static void intel_pmu_enable_all(void)
520{
521 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
522
523 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
524
525 if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
526 struct perf_event *event =
527 cpuc->events[X86_PMC_IDX_FIXED_BTS];
528
529 if (WARN_ON_ONCE(!event))
530 return;
531
532 intel_pmu_enable_bts(event->hw.config);
533 }
534}
535
536static inline u64 intel_pmu_get_status(void)
537{
538 u64 status;
539
540 rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
541
542 return status;
543}
544
545static inline void intel_pmu_ack_status(u64 ack)
546{
547 wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
548}
549
550static inline void
551intel_pmu_disable_fixed(struct hw_perf_event *hwc)
552{
553 int idx = hwc->idx - X86_PMC_IDX_FIXED;
554 u64 ctrl_val, mask;
555
556 mask = 0xfULL << (idx * 4);
557
558 rdmsrl(hwc->config_base, ctrl_val);
559 ctrl_val &= ~mask;
560 (void)checking_wrmsrl(hwc->config_base, ctrl_val);
561}
562
563static void intel_pmu_drain_bts_buffer(void)
564{
565 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
566 struct debug_store *ds = cpuc->ds;
567 struct bts_record {
568 u64 from;
569 u64 to;
570 u64 flags;
571 };
572 struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS];
573 struct bts_record *at, *top;
574 struct perf_output_handle handle;
575 struct perf_event_header header;
576 struct perf_sample_data data;
577 struct pt_regs regs;
578
579 if (!event)
580 return;
581
582 if (!ds)
583 return;
584
585 at = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
586 top = (struct bts_record *)(unsigned long)ds->bts_index;
587
588 if (top <= at)
589 return;
590
591 ds->bts_index = ds->bts_buffer_base;
592
593 perf_sample_data_init(&data, 0);
594
595 data.period = event->hw.last_period;
596 regs.ip = 0;
597
598 /*
599 * Prepare a generic sample, i.e. fill in the invariant fields.
600 * We will overwrite the from and to address before we output
601 * the sample.
602 */
603 perf_prepare_sample(&header, &data, event, &regs);
604
605 if (perf_output_begin(&handle, event,
606 header.size * (top - at), 1, 1))
607 return;
608
609 for (; at < top; at++) {
610 data.ip = at->from;
611 data.addr = at->to;
612
613 perf_output_sample(&handle, &header, &data, event);
614 }
615
616 perf_output_end(&handle);
617
618 /* There's new data available. */
619 event->hw.interrupts++;
620 event->pending_kill = POLL_IN;
621}
622
623static inline void
624intel_pmu_disable_event(struct perf_event *event)
625{
626 struct hw_perf_event *hwc = &event->hw;
627
628 if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) {
629 intel_pmu_disable_bts();
630 intel_pmu_drain_bts_buffer();
631 return;
632 }
633
634 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
635 intel_pmu_disable_fixed(hwc);
636 return;
637 }
638
639 x86_pmu_disable_event(event);
640}
641
642static inline void
643intel_pmu_enable_fixed(struct hw_perf_event *hwc)
644{
645 int idx = hwc->idx - X86_PMC_IDX_FIXED;
646 u64 ctrl_val, bits, mask;
647 int err;
648
649 /*
650 * Enable IRQ generation (0x8),
651 * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
652 * if requested:
653 */
654 bits = 0x8ULL;
655 if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
656 bits |= 0x2;
657 if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
658 bits |= 0x1;
659
660 /*
661 * ANY bit is supported in v3 and up
662 */
663 if (x86_pmu.version > 2 && hwc->config & ARCH_PERFMON_EVENTSEL_ANY)
664 bits |= 0x4;
665
666 bits <<= (idx * 4);
667 mask = 0xfULL << (idx * 4);
668
669 rdmsrl(hwc->config_base, ctrl_val);
670 ctrl_val &= ~mask;
671 ctrl_val |= bits;
672 err = checking_wrmsrl(hwc->config_base, ctrl_val);
673}
674
675static void intel_pmu_enable_event(struct perf_event *event)
676{
677 struct hw_perf_event *hwc = &event->hw;
678
679 if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) {
680 if (!__get_cpu_var(cpu_hw_events).enabled)
681 return;
682
683 intel_pmu_enable_bts(hwc->config);
684 return;
685 }
686
687 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
688 intel_pmu_enable_fixed(hwc);
689 return;
690 }
691
692 __x86_pmu_enable_event(hwc);
693}
694
/*
 * Save and restart an expired event: fold in the current count, then
 * program the next period.  Called by NMI contexts, so it has to be
 * careful about preempting normal event ops.
 *
 * Returns the result of x86_perf_event_set_period().
 */
static int intel_pmu_save_and_restart(struct perf_event *event)
{
	int ret;

	x86_perf_event_update(event);
	ret = x86_perf_event_set_period(event);

	return ret;
}
704
705static void intel_pmu_reset(void)
706{
707 struct debug_store *ds = __get_cpu_var(cpu_hw_events).ds;
708 unsigned long flags;
709 int idx;
710
711 if (!x86_pmu.num_events)
712 return;
713
714 local_irq_save(flags);
715
716 printk("clearing PMU state on CPU#%d\n", smp_processor_id());
717
718 for (idx = 0; idx < x86_pmu.num_events; idx++) {
719 checking_wrmsrl(x86_pmu.eventsel + idx, 0ull);
720 checking_wrmsrl(x86_pmu.perfctr + idx, 0ull);
721 }
722 for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) {
723 checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
724 }
725 if (ds)
726 ds->bts_index = ds->bts_buffer_base;
727
728 local_irq_restore(flags);
729}
730
731/*
732 * This handler is triggered by the local APIC, so the APIC IRQ handling
733 * rules apply:
734 */
735static int intel_pmu_handle_irq(struct pt_regs *regs)
736{
737 struct perf_sample_data data;
738 struct cpu_hw_events *cpuc;
739 int bit, loops;
740 u64 ack, status;
741
742 perf_sample_data_init(&data, 0);
743
744 cpuc = &__get_cpu_var(cpu_hw_events);
745
746 intel_pmu_disable_all();
747 intel_pmu_drain_bts_buffer();
748 status = intel_pmu_get_status();
749 if (!status) {
750 intel_pmu_enable_all();
751 return 0;
752 }
753
754 loops = 0;
755again:
756 if (++loops > 100) {
757 WARN_ONCE(1, "perfevents: irq loop stuck!\n");
758 perf_event_print_debug();
759 intel_pmu_reset();
760 goto done;
761 }
762
763 inc_irq_stat(apic_perf_irqs);
764 ack = status;
765 for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
766 struct perf_event *event = cpuc->events[bit];
767
768 if (!test_bit(bit, cpuc->active_mask))
769 continue;
770
771 if (!intel_pmu_save_and_restart(event))
772 continue;
773
774 data.period = event->hw.last_period;
775
776 if (perf_event_overflow(event, 1, &data, regs))
777 x86_pmu_stop(event);
778 }
779
780 intel_pmu_ack_status(ack);
781
782 /*
783 * Repeat if there is more work to be done:
784 */
785 status = intel_pmu_get_status();
786 if (status)
787 goto again;
788
789done:
790 intel_pmu_enable_all();
791 return 1;
792}
793
794static struct event_constraint bts_constraint =
795 EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_FIXED_BTS, 0);
796
797static struct event_constraint *
798intel_special_constraints(struct perf_event *event)
799{
800 unsigned int hw_event;
801
802 hw_event = event->hw.config & INTEL_ARCH_EVENT_MASK;
803
804 if (unlikely((hw_event ==
805 x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) &&
806 (event->hw.sample_period == 1))) {
807
808 return &bts_constraint;
809 }
810 return NULL;
811}
812
/*
 * Pick the constraint for @event: an Intel special constraint if one
 * applies, otherwise whatever the generic x86 lookup returns.
 */
static struct event_constraint *
intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
{
	struct event_constraint *special = intel_special_constraints(event);

	return special ? special : x86_get_event_constraints(cpuc, event);
}
824
825static __initconst struct x86_pmu core_pmu = {
826 .name = "core",
827 .handle_irq = x86_pmu_handle_irq,
828 .disable_all = x86_pmu_disable_all,
829 .enable_all = x86_pmu_enable_all,
830 .enable = x86_pmu_enable_event,
831 .disable = x86_pmu_disable_event,
832 .eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
833 .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
834 .event_map = intel_pmu_event_map,
835 .raw_event = intel_pmu_raw_event,
836 .max_events = ARRAY_SIZE(intel_perfmon_event_map),
837 .apic = 1,
838 /*
839 * Intel PMCs cannot be accessed sanely above 32 bit width,
840 * so we install an artificial 1<<31 period regardless of
841 * the generic event period:
842 */
843 .max_period = (1ULL << 31) - 1,
844 .get_event_constraints = intel_get_event_constraints,
845 .event_constraints = intel_core_event_constraints,
846};
847
848static __initconst struct x86_pmu intel_pmu = {
849 .name = "Intel",
850 .handle_irq = intel_pmu_handle_irq,
851 .disable_all = intel_pmu_disable_all,
852 .enable_all = intel_pmu_enable_all,
853 .enable = intel_pmu_enable_event,
854 .disable = intel_pmu_disable_event,
855 .eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
856 .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
857 .event_map = intel_pmu_event_map,
858 .raw_event = intel_pmu_raw_event,
859 .max_events = ARRAY_SIZE(intel_perfmon_event_map),
860 .apic = 1,
861 /*
862 * Intel PMCs cannot be accessed sanely above 32 bit width,
863 * so we install an artificial 1<<31 period regardless of
864 * the generic event period:
865 */
866 .max_period = (1ULL << 31) - 1,
867 .enable_bts = intel_pmu_enable_bts,
868 .disable_bts = intel_pmu_disable_bts,
869 .get_event_constraints = intel_get_event_constraints,
870
871 .cpu_starting = init_debug_store_on_cpu,
872 .cpu_dying = fini_debug_store_on_cpu,
873};
874
875static __init int intel_pmu_init(void)
876{
877 union cpuid10_edx edx;
878 union cpuid10_eax eax;
879 unsigned int unused;
880 unsigned int ebx;
881 int version;
882
883 if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
884 /* check for P6 processor family */
885 if (boot_cpu_data.x86 == 6) {
886 return p6_pmu_init();
887 } else {
888 return -ENODEV;
889 }
890 }
891
892 /*
893 * Check whether the Architectural PerfMon supports
894 * Branch Misses Retired hw_event or not.
895 */
896 cpuid(10, &eax.full, &ebx, &unused, &edx.full);
897 if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
898 return -ENODEV;
899
900 version = eax.split.version_id;
901 if (version < 2)
902 x86_pmu = core_pmu;
903 else
904 x86_pmu = intel_pmu;
905
906 x86_pmu.version = version;
907 x86_pmu.num_events = eax.split.num_events;
908 x86_pmu.event_bits = eax.split.bit_width;
909 x86_pmu.event_mask = (1ULL << eax.split.bit_width) - 1;
910
911 /*
912 * Quirk: v2 perfmon does not report fixed-purpose events, so
913 * assume at least 3 events:
914 */
915 if (version > 1)
916 x86_pmu.num_events_fixed = max((int)edx.split.num_events_fixed, 3);
917
918 /*
919 * Install the hw-cache-events table:
920 */
921 switch (boot_cpu_data.x86_model) {
922 case 14: /* 65 nm core solo/duo, "Yonah" */
923 pr_cont("Core events, ");
924 break;
925
926 case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
927 case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
928 case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
929 case 29: /* six-core 45 nm xeon "Dunnington" */
930 memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
931 sizeof(hw_cache_event_ids));
932
933 x86_pmu.event_constraints = intel_core2_event_constraints;
934 pr_cont("Core2 events, ");
935 break;
936
937 case 26: /* 45 nm nehalem, "Bloomfield" */
938 case 30: /* 45 nm nehalem, "Lynnfield" */
939 case 46: /* 45 nm nehalem-ex, "Beckton" */
940 memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
941 sizeof(hw_cache_event_ids));
942
943 x86_pmu.event_constraints = intel_nehalem_event_constraints;
944 pr_cont("Nehalem/Corei7 events, ");
945 break;
946 case 28: /* Atom */
947 memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
948 sizeof(hw_cache_event_ids));
949
950 x86_pmu.event_constraints = intel_gen_event_constraints;
951 pr_cont("Atom events, ");
952 break;
953
954 case 37: /* 32 nm nehalem, "Clarkdale" */
955 case 44: /* 32 nm nehalem, "Gulftown" */
956 memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids,
957 sizeof(hw_cache_event_ids));
958
959 x86_pmu.event_constraints = intel_westmere_event_constraints;
960 pr_cont("Westmere events, ");
961 break;
962
963 default:
964 /*
965 * default constraints for v2 and up
966 */
967 x86_pmu.event_constraints = intel_gen_event_constraints;
968 pr_cont("generic architected perfmon, ");
969 }
970 return 0;
971}
972
973#else /* CONFIG_CPU_SUP_INTEL */
974
/* Stub used when CONFIG_CPU_SUP_INTEL is not set: always returns 0. */
static int intel_pmu_init(void)
{
	return 0;
}
979
980#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c
new file mode 100644
index 000000000000..a330485d14da
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_p6.c
@@ -0,0 +1,159 @@
1#ifdef CONFIG_CPU_SUP_INTEL
2
3/*
4 * Not sure about some of these
5 */
6static const u64 p6_perfmon_event_map[] =
7{
8 [PERF_COUNT_HW_CPU_CYCLES] = 0x0079,
9 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
10 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e,
11 [PERF_COUNT_HW_CACHE_MISSES] = 0x012e,
12 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
13 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
14 [PERF_COUNT_HW_BUS_CYCLES] = 0x0062,
15};
16
17static u64 p6_pmu_event_map(int hw_event)
18{
19 return p6_perfmon_event_map[hw_event];
20}
21
22/*
23 * Event setting that is specified not to count anything.
24 * We use this to effectively disable a counter.
25 *
26 * L2_RQSTS with 0 MESI unit mask.
27 */
28#define P6_NOP_EVENT 0x0000002EULL
29
30static u64 p6_pmu_raw_event(u64 hw_event)
31{
32#define P6_EVNTSEL_EVENT_MASK 0x000000FFULL
33#define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL
34#define P6_EVNTSEL_EDGE_MASK 0x00040000ULL
35#define P6_EVNTSEL_INV_MASK 0x00800000ULL
36#define P6_EVNTSEL_REG_MASK 0xFF000000ULL
37
38#define P6_EVNTSEL_MASK \
39 (P6_EVNTSEL_EVENT_MASK | \
40 P6_EVNTSEL_UNIT_MASK | \
41 P6_EVNTSEL_EDGE_MASK | \
42 P6_EVNTSEL_INV_MASK | \
43 P6_EVNTSEL_REG_MASK)
44
45 return hw_event & P6_EVNTSEL_MASK;
46}
47
48static struct event_constraint p6_event_constraints[] =
49{
50 INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */
51 INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
52 INTEL_EVENT_CONSTRAINT(0x11, 0x1), /* FP_ASSIST */
53 INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
54 INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
55 INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
56 EVENT_CONSTRAINT_END
57};
58
59static void p6_pmu_disable_all(void)
60{
61 u64 val;
62
63 /* p6 only has one enable register */
64 rdmsrl(MSR_P6_EVNTSEL0, val);
65 val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
66 wrmsrl(MSR_P6_EVNTSEL0, val);
67}
68
69static void p6_pmu_enable_all(void)
70{
71 unsigned long val;
72
73 /* p6 only has one enable register */
74 rdmsrl(MSR_P6_EVNTSEL0, val);
75 val |= ARCH_PERFMON_EVENTSEL_ENABLE;
76 wrmsrl(MSR_P6_EVNTSEL0, val);
77}
78
79static inline void
80p6_pmu_disable_event(struct perf_event *event)
81{
82 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
83 struct hw_perf_event *hwc = &event->hw;
84 u64 val = P6_NOP_EVENT;
85
86 if (cpuc->enabled)
87 val |= ARCH_PERFMON_EVENTSEL_ENABLE;
88
89 (void)checking_wrmsrl(hwc->config_base + hwc->idx, val);
90}
91
92static void p6_pmu_enable_event(struct perf_event *event)
93{
94 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
95 struct hw_perf_event *hwc = &event->hw;
96 u64 val;
97
98 val = hwc->config;
99 if (cpuc->enabled)
100 val |= ARCH_PERFMON_EVENTSEL_ENABLE;
101
102 (void)checking_wrmsrl(hwc->config_base + hwc->idx, val);
103}
104
105static __initconst struct x86_pmu p6_pmu = {
106 .name = "p6",
107 .handle_irq = x86_pmu_handle_irq,
108 .disable_all = p6_pmu_disable_all,
109 .enable_all = p6_pmu_enable_all,
110 .enable = p6_pmu_enable_event,
111 .disable = p6_pmu_disable_event,
112 .eventsel = MSR_P6_EVNTSEL0,
113 .perfctr = MSR_P6_PERFCTR0,
114 .event_map = p6_pmu_event_map,
115 .raw_event = p6_pmu_raw_event,
116 .max_events = ARRAY_SIZE(p6_perfmon_event_map),
117 .apic = 1,
118 .max_period = (1ULL << 31) - 1,
119 .version = 0,
120 .num_events = 2,
121 /*
122 * Events have 40 bits implemented. However they are designed such
123 * that bits [32-39] are sign extensions of bit 31. As such the
124 * effective width of a event for P6-like PMU is 32 bits only.
125 *
126 * See IA-32 Intel Architecture Software developer manual Vol 3B
127 */
128 .event_bits = 32,
129 .event_mask = (1ULL << 32) - 1,
130 .get_event_constraints = x86_get_event_constraints,
131 .event_constraints = p6_event_constraints,
132};
133
134static __init int p6_pmu_init(void)
135{
136 switch (boot_cpu_data.x86_model) {
137 case 1:
138 case 3: /* Pentium Pro */
139 case 5:
140 case 6: /* Pentium II */
141 case 7:
142 case 8:
143 case 11: /* Pentium III */
144 case 9:
145 case 13:
146 /* Pentium M */
147 break;
148 default:
149 pr_cont("unsupported p6 CPU model %d ",
150 boot_cpu_data.x86_model);
151 return -ENODEV;
152 }
153
154 x86_pmu = p6_pmu;
155
156 return 0;
157}
158
159#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index fab786f60ed6..fb329e9f8494 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -115,17 +115,6 @@ int avail_to_resrv_perfctr_nmi_bit(unsigned int counter)
115 115
116 return !test_bit(counter, perfctr_nmi_owner); 116 return !test_bit(counter, perfctr_nmi_owner);
117} 117}
118
119/* checks the an msr for availability */
120int avail_to_resrv_perfctr_nmi(unsigned int msr)
121{
122 unsigned int counter;
123
124 counter = nmi_perfctr_msr_to_bit(msr);
125 BUG_ON(counter > NMI_MAX_COUNTER_BITS);
126
127 return !test_bit(counter, perfctr_nmi_owner);
128}
129EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit); 118EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit);
130 119
131int reserve_perfctr_nmi(unsigned int msr) 120int reserve_perfctr_nmi(unsigned int msr)
@@ -691,7 +680,7 @@ static int setup_intel_arch_watchdog(unsigned nmi_hz)
691 cpu_nmi_set_wd_enabled(); 680 cpu_nmi_set_wd_enabled();
692 681
693 apic_write(APIC_LVTPC, APIC_DM_NMI); 682 apic_write(APIC_LVTPC, APIC_DM_NMI);
694 evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE; 683 evntsel |= ARCH_PERFMON_EVENTSEL_ENABLE;
695 wrmsr(evntsel_msr, evntsel, 0); 684 wrmsr(evntsel_msr, evntsel, 0);
696 intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1); 685 intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1);
697 return 1; 686 return 1;
@@ -712,7 +701,7 @@ static void probe_nmi_watchdog(void)
712 switch (boot_cpu_data.x86_vendor) { 701 switch (boot_cpu_data.x86_vendor) {
713 case X86_VENDOR_AMD: 702 case X86_VENDOR_AMD:
714 if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 && 703 if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 &&
715 boot_cpu_data.x86 != 16) 704 boot_cpu_data.x86 != 16 && boot_cpu_data.x86 != 17)
716 return; 705 return;
717 wd_ops = &k7_wd_ops; 706 wd_ops = &k7_wd_ops;
718 break; 707 break;
diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c
index bb62b3e5caad..28000743bbb0 100644
--- a/arch/x86/kernel/cpu/transmeta.c
+++ b/arch/x86/kernel/cpu/transmeta.c
@@ -26,7 +26,7 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
26 26
27 early_init_transmeta(c); 27 early_init_transmeta(c);
28 28
29 display_cacheinfo(c); 29 cpu_detect_cache_sizes(c);
30 30
31 /* Print CMS and CPU revision */ 31 /* Print CMS and CPU revision */
32 max = cpuid_eax(0x80860000); 32 max = cpuid_eax(0x80860000);
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 1cbed97b59cf..dfdb4dba2320 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -22,6 +22,7 @@
22 */ 22 */
23 23
24#include <linux/dmi.h> 24#include <linux/dmi.h>
25#include <linux/module.h>
25#include <asm/div64.h> 26#include <asm/div64.h>
26#include <asm/vmware.h> 27#include <asm/vmware.h>
27#include <asm/x86_init.h> 28#include <asm/x86_init.h>
@@ -101,6 +102,7 @@ int vmware_platform(void)
101 102
102 return 0; 103 return 0;
103} 104}
105EXPORT_SYMBOL(vmware_platform);
104 106
105/* 107/*
106 * VMware hypervisor takes care of exporting a reliable TSC to the guest. 108 * VMware hypervisor takes care of exporting a reliable TSC to the guest.
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 6a52d4b36a30..8b862d5900fe 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -40,6 +40,7 @@
40#include <linux/cpu.h> 40#include <linux/cpu.h>
41#include <linux/notifier.h> 41#include <linux/notifier.h>
42#include <linux/uaccess.h> 42#include <linux/uaccess.h>
43#include <linux/gfp.h>
43 44
44#include <asm/processor.h> 45#include <asm/processor.h>
45#include <asm/msr.h> 46#include <asm/msr.h>
@@ -116,21 +117,16 @@ static int cpuid_open(struct inode *inode, struct file *file)
116{ 117{
117 unsigned int cpu; 118 unsigned int cpu;
118 struct cpuinfo_x86 *c; 119 struct cpuinfo_x86 *c;
119 int ret = 0;
120
121 lock_kernel();
122 120
123 cpu = iminor(file->f_path.dentry->d_inode); 121 cpu = iminor(file->f_path.dentry->d_inode);
124 if (cpu >= nr_cpu_ids || !cpu_online(cpu)) { 122 if (cpu >= nr_cpu_ids || !cpu_online(cpu))
125 ret = -ENXIO; /* No such CPU */ 123 return -ENXIO; /* No such CPU */
126 goto out; 124
127 }
128 c = &cpu_data(cpu); 125 c = &cpu_data(cpu);
129 if (c->cpuid_level < 0) 126 if (c->cpuid_level < 0)
130 ret = -EIO; /* CPUID not supported */ 127 return -EIO; /* CPUID not supported */
131out: 128
132 unlock_kernel(); 129 return 0;
133 return ret;
134} 130}
135 131
136/* 132/*
@@ -192,7 +188,8 @@ static int __init cpuid_init(void)
192 int i, err = 0; 188 int i, err = 0;
193 i = 0; 189 i = 0;
194 190
195 if (register_chrdev(CPUID_MAJOR, "cpu/cpuid", &cpuid_fops)) { 191 if (__register_chrdev(CPUID_MAJOR, 0, NR_CPUS,
192 "cpu/cpuid", &cpuid_fops)) {
196 printk(KERN_ERR "cpuid: unable to get major %d for cpuid\n", 193 printk(KERN_ERR "cpuid: unable to get major %d for cpuid\n",
197 CPUID_MAJOR); 194 CPUID_MAJOR);
198 err = -EBUSY; 195 err = -EBUSY;
@@ -221,7 +218,7 @@ out_class:
221 } 218 }
222 class_destroy(cpuid_class); 219 class_destroy(cpuid_class);
223out_chrdev: 220out_chrdev:
224 unregister_chrdev(CPUID_MAJOR, "cpu/cpuid"); 221 __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid");
225out: 222out:
226 return err; 223 return err;
227} 224}
@@ -233,7 +230,7 @@ static void __exit cpuid_exit(void)
233 for_each_online_cpu(cpu) 230 for_each_online_cpu(cpu)
234 cpuid_device_destroy(cpu); 231 cpuid_device_destroy(cpu);
235 class_destroy(cpuid_class); 232 class_destroy(cpuid_class);
236 unregister_chrdev(CPUID_MAJOR, "cpu/cpuid"); 233 __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid");
237 unregister_hotcpu_notifier(&cpuid_class_cpu_notifier); 234 unregister_hotcpu_notifier(&cpuid_class_cpu_notifier);
238} 235}
239 236
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 5e409dc298a4..ebd4c51d096a 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -27,8 +27,6 @@
27#include <asm/cpu.h> 27#include <asm/cpu.h>
28#include <asm/reboot.h> 28#include <asm/reboot.h>
29#include <asm/virtext.h> 29#include <asm/virtext.h>
30#include <asm/iommu.h>
31
32 30
33#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) 31#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
34 32
@@ -104,10 +102,5 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
104#ifdef CONFIG_HPET_TIMER 102#ifdef CONFIG_HPET_TIMER
105 hpet_disable(); 103 hpet_disable();
106#endif 104#endif
107
108#ifdef CONFIG_X86_64
109 pci_iommu_shutdown();
110#endif
111
112 crash_save_cpu(regs, safe_smp_processor_id()); 105 crash_save_cpu(regs, safe_smp_processor_id());
113} 106}
diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c
index cd97ce18c29d..67414550c3cc 100644
--- a/arch/x86/kernel/crash_dump_32.c
+++ b/arch/x86/kernel/crash_dump_32.c
@@ -5,6 +5,7 @@
5 * Copyright (C) IBM Corporation, 2004. All rights reserved 5 * Copyright (C) IBM Corporation, 2004. All rights reserved
6 */ 6 */
7 7
8#include <linux/slab.h>
8#include <linux/errno.h> 9#include <linux/errno.h>
9#include <linux/highmem.h> 10#include <linux/highmem.h>
10#include <linux/crash_dump.h> 11#include <linux/crash_dump.h>
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index ef42a038f1a6..1c47390dd0e5 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -265,13 +265,13 @@ struct ds_context {
265 int cpu; 265 int cpu;
266}; 266};
267 267
268static DEFINE_PER_CPU(struct ds_context *, cpu_context); 268static DEFINE_PER_CPU(struct ds_context *, cpu_ds_context);
269 269
270 270
271static struct ds_context *ds_get_context(struct task_struct *task, int cpu) 271static struct ds_context *ds_get_context(struct task_struct *task, int cpu)
272{ 272{
273 struct ds_context **p_context = 273 struct ds_context **p_context =
274 (task ? &task->thread.ds_ctx : &per_cpu(cpu_context, cpu)); 274 (task ? &task->thread.ds_ctx : &per_cpu(cpu_ds_context, cpu));
275 struct ds_context *context = NULL; 275 struct ds_context *context = NULL;
276 struct ds_context *new_context = NULL; 276 struct ds_context *new_context = NULL;
277 277
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 2d8a371d4339..6d817554780a 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -109,6 +109,32 @@ print_context_stack(struct thread_info *tinfo,
109 } 109 }
110 return bp; 110 return bp;
111} 111}
112EXPORT_SYMBOL_GPL(print_context_stack);
113
114unsigned long
115print_context_stack_bp(struct thread_info *tinfo,
116 unsigned long *stack, unsigned long bp,
117 const struct stacktrace_ops *ops, void *data,
118 unsigned long *end, int *graph)
119{
120 struct stack_frame *frame = (struct stack_frame *)bp;
121 unsigned long *ret_addr = &frame->return_address;
122
123 while (valid_stack_ptr(tinfo, ret_addr, sizeof(*ret_addr), end)) {
124 unsigned long addr = *ret_addr;
125
126 if (!__kernel_text_address(addr))
127 break;
128
129 ops->address(data, addr, 1);
130 frame = frame->next_frame;
131 ret_addr = &frame->return_address;
132 print_ftrace_graph_addr(addr, data, ops, tinfo, graph);
133 }
134
135 return (unsigned long)frame;
136}
137EXPORT_SYMBOL_GPL(print_context_stack_bp);
112 138
113 139
114static void 140static void
@@ -141,10 +167,11 @@ static void print_trace_address(void *data, unsigned long addr, int reliable)
141} 167}
142 168
143static const struct stacktrace_ops print_trace_ops = { 169static const struct stacktrace_ops print_trace_ops = {
144 .warning = print_trace_warning, 170 .warning = print_trace_warning,
145 .warning_symbol = print_trace_warning_symbol, 171 .warning_symbol = print_trace_warning_symbol,
146 .stack = print_trace_stack, 172 .stack = print_trace_stack,
147 .address = print_trace_address, 173 .address = print_trace_address,
174 .walk_stack = print_context_stack,
148}; 175};
149 176
150void 177void
@@ -188,7 +215,7 @@ void dump_stack(void)
188} 215}
189EXPORT_SYMBOL(dump_stack); 216EXPORT_SYMBOL(dump_stack);
190 217
191static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; 218static arch_spinlock_t die_lock = __ARCH_SPIN_LOCK_UNLOCKED;
192static int die_owner = -1; 219static int die_owner = -1;
193static unsigned int die_nest_count; 220static unsigned int die_nest_count;
194 221
@@ -207,11 +234,11 @@ unsigned __kprobes long oops_begin(void)
207 /* racy, but better than risking deadlock. */ 234 /* racy, but better than risking deadlock. */
208 raw_local_irq_save(flags); 235 raw_local_irq_save(flags);
209 cpu = smp_processor_id(); 236 cpu = smp_processor_id();
210 if (!__raw_spin_trylock(&die_lock)) { 237 if (!arch_spin_trylock(&die_lock)) {
211 if (cpu == die_owner) 238 if (cpu == die_owner)
212 /* nested oops. should stop eventually */; 239 /* nested oops. should stop eventually */;
213 else 240 else
214 __raw_spin_lock(&die_lock); 241 arch_spin_lock(&die_lock);
215 } 242 }
216 die_nest_count++; 243 die_nest_count++;
217 die_owner = cpu; 244 die_owner = cpu;
@@ -231,7 +258,7 @@ void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
231 die_nest_count--; 258 die_nest_count--;
232 if (!die_nest_count) 259 if (!die_nest_count)
233 /* Nest count reaches zero, release the lock. */ 260 /* Nest count reaches zero, release the lock. */
234 __raw_spin_unlock(&die_lock); 261 arch_spin_unlock(&die_lock);
235 raw_local_irq_restore(flags); 262 raw_local_irq_restore(flags);
236 oops_exit(); 263 oops_exit();
237 264
@@ -268,11 +295,12 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err)
268 295
269 show_registers(regs); 296 show_registers(regs);
270#ifdef CONFIG_X86_32 297#ifdef CONFIG_X86_32
271 sp = (unsigned long) (&regs->sp); 298 if (user_mode_vm(regs)) {
272 savesegment(ss, ss);
273 if (user_mode(regs)) {
274 sp = regs->sp; 299 sp = regs->sp;
275 ss = regs->ss & 0xffff; 300 ss = regs->ss & 0xffff;
301 } else {
302 sp = kernel_stack_pointer(regs);
303 savesegment(ss, ss);
276 } 304 }
277 printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip); 305 printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
278 print_symbol("%s", regs->ip); 306 print_symbol("%s", regs->ip);
diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h
index 81086c227ab7..e1a93be4fd44 100644
--- a/arch/x86/kernel/dumpstack.h
+++ b/arch/x86/kernel/dumpstack.h
@@ -14,11 +14,7 @@
14#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :) 14#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
15#endif 15#endif
16 16
17extern unsigned long 17#include <linux/uaccess.h>
18print_context_stack(struct thread_info *tinfo,
19 unsigned long *stack, unsigned long bp,
20 const struct stacktrace_ops *ops, void *data,
21 unsigned long *end, int *graph);
22 18
23extern void 19extern void
24show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, 20show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
@@ -35,4 +31,26 @@ struct stack_frame {
35 struct stack_frame *next_frame; 31 struct stack_frame *next_frame;
36 unsigned long return_address; 32 unsigned long return_address;
37}; 33};
34
35struct stack_frame_ia32 {
36 u32 next_frame;
37 u32 return_address;
38};
39
40static inline unsigned long rewind_frame_pointer(int n)
41{
42 struct stack_frame *frame;
43
44 get_bp(frame);
45
46#ifdef CONFIG_FRAME_POINTER
47 while (n--) {
48 if (probe_kernel_address(&frame->next_frame, frame))
49 break;
50 }
38#endif 51#endif
52
53 return (unsigned long)frame;
54}
55
56#endif /* DUMPSTACK_H */
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index f7dd2a7c3bf4..11540a189d93 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -10,19 +10,14 @@
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/ptrace.h> 11#include <linux/ptrace.h>
12#include <linux/kexec.h> 12#include <linux/kexec.h>
13#include <linux/sysfs.h>
13#include <linux/bug.h> 14#include <linux/bug.h>
14#include <linux/nmi.h> 15#include <linux/nmi.h>
15#include <linux/sysfs.h>
16 16
17#include <asm/stacktrace.h> 17#include <asm/stacktrace.h>
18 18
19#include "dumpstack.h" 19#include "dumpstack.h"
20 20
21/* Just a stub for now */
22int x86_is_stack_id(int id, char *name)
23{
24 return 0;
25}
26 21
27void dump_trace(struct task_struct *task, struct pt_regs *regs, 22void dump_trace(struct task_struct *task, struct pt_regs *regs,
28 unsigned long *stack, unsigned long bp, 23 unsigned long *stack, unsigned long bp,
@@ -35,6 +30,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
35 30
36 if (!stack) { 31 if (!stack) {
37 unsigned long dummy; 32 unsigned long dummy;
33
38 stack = &dummy; 34 stack = &dummy;
39 if (task && task != current) 35 if (task && task != current)
40 stack = (unsigned long *)task->thread.sp; 36 stack = (unsigned long *)task->thread.sp;
@@ -57,8 +53,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
57 53
58 context = (struct thread_info *) 54 context = (struct thread_info *)
59 ((unsigned long)stack & (~(THREAD_SIZE - 1))); 55 ((unsigned long)stack & (~(THREAD_SIZE - 1)));
60 bp = print_context_stack(context, stack, bp, ops, 56 bp = ops->walk_stack(context, stack, bp, ops, data, NULL, &graph);
61 data, NULL, &graph);
62 57
63 stack = (unsigned long *)context->previous_esp; 58 stack = (unsigned long *)context->previous_esp;
64 if (!stack) 59 if (!stack)
@@ -72,7 +67,7 @@ EXPORT_SYMBOL(dump_trace);
72 67
73void 68void
74show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, 69show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
75 unsigned long *sp, unsigned long bp, char *log_lvl) 70 unsigned long *sp, unsigned long bp, char *log_lvl)
76{ 71{
77 unsigned long *stack; 72 unsigned long *stack;
78 int i; 73 int i;
@@ -156,4 +151,3 @@ int is_valid_bugaddr(unsigned long ip)
156 151
157 return ud2 == 0x0b0f; 152 return ud2 == 0x0b0f;
158} 153}
159
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index a071e6be177e..272c9f1f05f3 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -10,34 +10,31 @@
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/ptrace.h> 11#include <linux/ptrace.h>
12#include <linux/kexec.h> 12#include <linux/kexec.h>
13#include <linux/sysfs.h>
13#include <linux/bug.h> 14#include <linux/bug.h>
14#include <linux/nmi.h> 15#include <linux/nmi.h>
15#include <linux/sysfs.h>
16 16
17#include <asm/stacktrace.h> 17#include <asm/stacktrace.h>
18 18
19#include "dumpstack.h" 19#include "dumpstack.h"
20 20
21#define N_EXCEPTION_STACKS_END \
22 (N_EXCEPTION_STACKS + DEBUG_STKSZ/EXCEPTION_STKSZ - 2)
21 23
22static char x86_stack_ids[][8] = { 24static char x86_stack_ids[][8] = {
23 [DEBUG_STACK - 1] = "#DB", 25 [ DEBUG_STACK-1 ] = "#DB",
24 [NMI_STACK - 1] = "NMI", 26 [ NMI_STACK-1 ] = "NMI",
25 [DOUBLEFAULT_STACK - 1] = "#DF", 27 [ DOUBLEFAULT_STACK-1 ] = "#DF",
26 [STACKFAULT_STACK - 1] = "#SS", 28 [ STACKFAULT_STACK-1 ] = "#SS",
27 [MCE_STACK - 1] = "#MC", 29 [ MCE_STACK-1 ] = "#MC",
28#if DEBUG_STKSZ > EXCEPTION_STKSZ 30#if DEBUG_STKSZ > EXCEPTION_STKSZ
29 [N_EXCEPTION_STACKS ... 31 [ N_EXCEPTION_STACKS ...
30 N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]" 32 N_EXCEPTION_STACKS_END ] = "#DB[?]"
31#endif 33#endif
32 }; 34};
33
34int x86_is_stack_id(int id, char *name)
35{
36 return x86_stack_ids[id - 1] == name;
37}
38 35
39static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, 36static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
40 unsigned *usedp, char **idp) 37 unsigned *usedp, char **idp)
41{ 38{
42 unsigned k; 39 unsigned k;
43 40
@@ -101,6 +98,41 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
101 return NULL; 98 return NULL;
102} 99}
103 100
101static inline int
102in_irq_stack(unsigned long *stack, unsigned long *irq_stack,
103 unsigned long *irq_stack_end)
104{
105 return (stack >= irq_stack && stack < irq_stack_end);
106}
107
108/*
109 * We are returning from the irq stack and go to the previous one.
110 * If the previous stack is also in the irq stack, then bp in the first
111 * frame of the irq stack points to the previous, interrupted one.
112 * Otherwise we have another level of indirection: We first save
113 * the bp of the previous stack, then we switch the stack to the irq one
114 * and save a new bp that links to the previous one.
115 * (See save_args())
116 */
117static inline unsigned long
118fixup_bp_irq_link(unsigned long bp, unsigned long *stack,
119 unsigned long *irq_stack, unsigned long *irq_stack_end)
120{
121#ifdef CONFIG_FRAME_POINTER
122 struct stack_frame *frame = (struct stack_frame *)bp;
123 unsigned long next;
124
125 if (!in_irq_stack(stack, irq_stack, irq_stack_end)) {
126 if (!probe_kernel_address(&frame->next_frame, next))
127 return next;
128 else
129 WARN_ONCE(1, "Perf: bad frame pointer = %p in "
130 "callchain\n", &frame->next_frame);
131 }
132#endif
133 return bp;
134}
135
104/* 136/*
105 * x86-64 can have up to three kernel stacks: 137 * x86-64 can have up to three kernel stacks:
106 * process stack 138 * process stack
@@ -157,8 +189,8 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
157 if (ops->stack(data, id) < 0) 189 if (ops->stack(data, id) < 0)
158 break; 190 break;
159 191
160 bp = print_context_stack(tinfo, stack, bp, ops, 192 bp = ops->walk_stack(tinfo, stack, bp, ops,
161 data, estack_end, &graph); 193 data, estack_end, &graph);
162 ops->stack(data, "<EOE>"); 194 ops->stack(data, "<EOE>");
163 /* 195 /*
164 * We link to the next stack via the 196 * We link to the next stack via the
@@ -173,10 +205,10 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
173 irq_stack = irq_stack_end - 205 irq_stack = irq_stack_end -
174 (IRQ_STACK_SIZE - 64) / sizeof(*irq_stack); 206 (IRQ_STACK_SIZE - 64) / sizeof(*irq_stack);
175 207
176 if (stack >= irq_stack && stack < irq_stack_end) { 208 if (in_irq_stack(stack, irq_stack, irq_stack_end)) {
177 if (ops->stack(data, "IRQ") < 0) 209 if (ops->stack(data, "IRQ") < 0)
178 break; 210 break;
179 bp = print_context_stack(tinfo, stack, bp, 211 bp = ops->walk_stack(tinfo, stack, bp,
180 ops, data, irq_stack_end, &graph); 212 ops, data, irq_stack_end, &graph);
181 /* 213 /*
182 * We link to the next stack (which would be 214 * We link to the next stack (which would be
@@ -184,6 +216,8 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
184 * pointer (index -1 to end) in the IRQ stack: 216 * pointer (index -1 to end) in the IRQ stack:
185 */ 217 */
186 stack = (unsigned long *) (irq_stack_end[-1]); 218 stack = (unsigned long *) (irq_stack_end[-1]);
219 bp = fixup_bp_irq_link(bp, stack, irq_stack,
220 irq_stack_end);
187 irq_stack_end = NULL; 221 irq_stack_end = NULL;
188 ops->stack(data, "EOI"); 222 ops->stack(data, "EOI");
189 continue; 223 continue;
@@ -195,28 +229,31 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
195 /* 229 /*
196 * This handles the process stack: 230 * This handles the process stack:
197 */ 231 */
198 bp = print_context_stack(tinfo, stack, bp, ops, data, NULL, &graph); 232 bp = ops->walk_stack(tinfo, stack, bp, ops, data, NULL, &graph);
199 put_cpu(); 233 put_cpu();
200} 234}
201EXPORT_SYMBOL(dump_trace); 235EXPORT_SYMBOL(dump_trace);
202 236
203void 237void
204show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, 238show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
205 unsigned long *sp, unsigned long bp, char *log_lvl) 239 unsigned long *sp, unsigned long bp, char *log_lvl)
206{ 240{
241 unsigned long *irq_stack_end;
242 unsigned long *irq_stack;
207 unsigned long *stack; 243 unsigned long *stack;
244 int cpu;
208 int i; 245 int i;
209 const int cpu = smp_processor_id(); 246
210 unsigned long *irq_stack_end = 247 preempt_disable();
211 (unsigned long *)(per_cpu(irq_stack_ptr, cpu)); 248 cpu = smp_processor_id();
212 unsigned long *irq_stack = 249
213 (unsigned long *)(per_cpu(irq_stack_ptr, cpu) - IRQ_STACK_SIZE); 250 irq_stack_end = (unsigned long *)(per_cpu(irq_stack_ptr, cpu));
251 irq_stack = (unsigned long *)(per_cpu(irq_stack_ptr, cpu) - IRQ_STACK_SIZE);
214 252
215 /* 253 /*
216 * debugging aid: "show_stack(NULL, NULL);" prints the 254 * Debugging aid: "show_stack(NULL, NULL);" prints the
217 * back trace for this cpu. 255 * back trace for this cpu:
218 */ 256 */
219
220 if (sp == NULL) { 257 if (sp == NULL) {
221 if (task) 258 if (task)
222 sp = (unsigned long *)task->thread.sp; 259 sp = (unsigned long *)task->thread.sp;
@@ -240,6 +277,8 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
240 printk(" %016lx", *stack++); 277 printk(" %016lx", *stack++);
241 touch_nmi_watchdog(); 278 touch_nmi_watchdog();
242 } 279 }
280 preempt_enable();
281
243 printk("\n"); 282 printk("\n");
244 show_trace_log_lvl(task, regs, sp, bp, log_lvl); 283 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
245} 284}
@@ -253,6 +292,7 @@ void show_registers(struct pt_regs *regs)
253 292
254 sp = regs->sp; 293 sp = regs->sp;
255 printk("CPU %d ", cpu); 294 printk("CPU %d ", cpu);
295 print_modules();
256 __show_regs(regs, 1); 296 __show_regs(regs, 1);
257 printk("Process %s (pid: %d, threadinfo %p, task %p)\n", 297 printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
258 cur->comm, cur->pid, task_thread_info(cur), cur); 298 cur->comm, cur->pid, task_thread_info(cur), cur);
@@ -303,4 +343,3 @@ int is_valid_bugaddr(unsigned long ip)
303 343
304 return ud2 == 0x0b0f; 344 return ud2 == 0x0b0f;
305} 345}
306
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index d17d482a04f4..7bca3c6a02fb 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -12,21 +12,13 @@
12#include <linux/types.h> 12#include <linux/types.h>
13#include <linux/init.h> 13#include <linux/init.h>
14#include <linux/bootmem.h> 14#include <linux/bootmem.h>
15#include <linux/ioport.h>
16#include <linux/string.h>
17#include <linux/kexec.h>
18#include <linux/module.h>
19#include <linux/mm.h>
20#include <linux/pfn.h> 15#include <linux/pfn.h>
21#include <linux/suspend.h> 16#include <linux/suspend.h>
22#include <linux/firmware-map.h> 17#include <linux/firmware-map.h>
23 18
24#include <asm/pgtable.h>
25#include <asm/page.h>
26#include <asm/e820.h> 19#include <asm/e820.h>
27#include <asm/proto.h> 20#include <asm/proto.h>
28#include <asm/setup.h> 21#include <asm/setup.h>
29#include <asm/trampoline.h>
30 22
31/* 23/*
32 * The e820 map is the map that gets modified e.g. with command line parameters 24 * The e820 map is the map that gets modified e.g. with command line parameters
@@ -517,31 +509,55 @@ u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type,
517 int checktype) 509 int checktype)
518{ 510{
519 int i; 511 int i;
512 u64 end;
520 u64 real_removed_size = 0; 513 u64 real_removed_size = 0;
521 514
522 if (size > (ULLONG_MAX - start)) 515 if (size > (ULLONG_MAX - start))
523 size = ULLONG_MAX - start; 516 size = ULLONG_MAX - start;
524 517
518 end = start + size;
519 printk(KERN_DEBUG "e820 remove range: %016Lx - %016Lx ",
520 (unsigned long long) start,
521 (unsigned long long) end);
522 if (checktype)
523 e820_print_type(old_type);
524 printk(KERN_CONT "\n");
525
525 for (i = 0; i < e820.nr_map; i++) { 526 for (i = 0; i < e820.nr_map; i++) {
526 struct e820entry *ei = &e820.map[i]; 527 struct e820entry *ei = &e820.map[i];
527 u64 final_start, final_end; 528 u64 final_start, final_end;
529 u64 ei_end;
528 530
529 if (checktype && ei->type != old_type) 531 if (checktype && ei->type != old_type)
530 continue; 532 continue;
533
534 ei_end = ei->addr + ei->size;
531 /* totally covered? */ 535 /* totally covered? */
532 if (ei->addr >= start && 536 if (ei->addr >= start && ei_end <= end) {
533 (ei->addr + ei->size) <= (start + size)) {
534 real_removed_size += ei->size; 537 real_removed_size += ei->size;
535 memset(ei, 0, sizeof(struct e820entry)); 538 memset(ei, 0, sizeof(struct e820entry));
536 continue; 539 continue;
537 } 540 }
541
542 /* new range is totally covered? */
543 if (ei->addr < start && ei_end > end) {
544 e820_add_region(end, ei_end - end, ei->type);
545 ei->size = start - ei->addr;
546 real_removed_size += size;
547 continue;
548 }
549
538 /* partially covered */ 550 /* partially covered */
539 final_start = max(start, ei->addr); 551 final_start = max(start, ei->addr);
540 final_end = min(start + size, ei->addr + ei->size); 552 final_end = min(end, ei_end);
541 if (final_start >= final_end) 553 if (final_start >= final_end)
542 continue; 554 continue;
543 real_removed_size += final_end - final_start; 555 real_removed_size += final_end - final_start;
544 556
557 /*
558 * left range could be head or tail, so need to update
559 * size at first.
560 */
545 ei->size -= final_end - final_start; 561 ei->size -= final_end - final_start;
546 if (ei->addr < final_start) 562 if (ei->addr < final_start)
547 continue; 563 continue;
@@ -722,310 +738,44 @@ core_initcall(e820_mark_nvs_memory);
722#endif 738#endif
723 739
724/* 740/*
725 * Early reserved memory areas. 741 * Find a free area with specified alignment in a specific range.
726 */
727#define MAX_EARLY_RES 20
728
729struct early_res {
730 u64 start, end;
731 char name[16];
732 char overlap_ok;
733};
734static struct early_res early_res[MAX_EARLY_RES] __initdata = {
735 { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */
736 {}
737};
738
739static int __init find_overlapped_early(u64 start, u64 end)
740{
741 int i;
742 struct early_res *r;
743
744 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
745 r = &early_res[i];
746 if (end > r->start && start < r->end)
747 break;
748 }
749
750 return i;
751}
752
753/*
754 * Drop the i-th range from the early reservation map,
755 * by copying any higher ranges down one over it, and
756 * clearing what had been the last slot.
757 */
758static void __init drop_range(int i)
759{
760 int j;
761
762 for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++)
763 ;
764
765 memmove(&early_res[i], &early_res[i + 1],
766 (j - 1 - i) * sizeof(struct early_res));
767
768 early_res[j - 1].end = 0;
769}
770
771/*
772 * Split any existing ranges that:
773 * 1) are marked 'overlap_ok', and
774 * 2) overlap with the stated range [start, end)
775 * into whatever portion (if any) of the existing range is entirely
776 * below or entirely above the stated range. Drop the portion
777 * of the existing range that overlaps with the stated range,
778 * which will allow the caller of this routine to then add that
779 * stated range without conflicting with any existing range.
780 */ 742 */
781static void __init drop_overlaps_that_are_ok(u64 start, u64 end) 743u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align)
782{ 744{
783 int i; 745 int i;
784 struct early_res *r;
785 u64 lower_start, lower_end;
786 u64 upper_start, upper_end;
787 char name[16];
788 746
789 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { 747 for (i = 0; i < e820.nr_map; i++) {
790 r = &early_res[i]; 748 struct e820entry *ei = &e820.map[i];
749 u64 addr;
750 u64 ei_start, ei_last;
791 751
792 /* Continue past non-overlapping ranges */ 752 if (ei->type != E820_RAM)
793 if (end <= r->start || start >= r->end)
794 continue; 753 continue;
795 754
796 /* 755 ei_last = ei->addr + ei->size;
797 * Leave non-ok overlaps as is; let caller 756 ei_start = ei->addr;
798 * panic "Overlapping early reservations" 757 addr = find_early_area(ei_start, ei_last, start, end,
799 * when it hits this overlap. 758 size, align);
800 */
801 if (!r->overlap_ok)
802 return;
803
804 /*
805 * We have an ok overlap. We will drop it from the early
806 * reservation map, and add back in any non-overlapping
807 * portions (lower or upper) as separate, overlap_ok,
808 * non-overlapping ranges.
809 */
810
811 /* 1. Note any non-overlapping (lower or upper) ranges. */
812 strncpy(name, r->name, sizeof(name) - 1);
813
814 lower_start = lower_end = 0;
815 upper_start = upper_end = 0;
816 if (r->start < start) {
817 lower_start = r->start;
818 lower_end = start;
819 }
820 if (r->end > end) {
821 upper_start = end;
822 upper_end = r->end;
823 }
824
825 /* 2. Drop the original ok overlapping range */
826 drop_range(i);
827
828 i--; /* resume for-loop on copied down entry */
829
830 /* 3. Add back in any non-overlapping ranges. */
831 if (lower_end)
832 reserve_early_overlap_ok(lower_start, lower_end, name);
833 if (upper_end)
834 reserve_early_overlap_ok(upper_start, upper_end, name);
835 }
836}
837
838static void __init __reserve_early(u64 start, u64 end, char *name,
839 int overlap_ok)
840{
841 int i;
842 struct early_res *r;
843
844 i = find_overlapped_early(start, end);
845 if (i >= MAX_EARLY_RES)
846 panic("Too many early reservations");
847 r = &early_res[i];
848 if (r->end)
849 panic("Overlapping early reservations "
850 "%llx-%llx %s to %llx-%llx %s\n",
851 start, end - 1, name?name:"", r->start,
852 r->end - 1, r->name);
853 r->start = start;
854 r->end = end;
855 r->overlap_ok = overlap_ok;
856 if (name)
857 strncpy(r->name, name, sizeof(r->name) - 1);
858}
859
860/*
861 * A few early reservtations come here.
862 *
863 * The 'overlap_ok' in the name of this routine does -not- mean it
864 * is ok for these reservations to overlap an earlier reservation.
865 * Rather it means that it is ok for subsequent reservations to
866 * overlap this one.
867 *
868 * Use this entry point to reserve early ranges when you are doing
869 * so out of "Paranoia", reserving perhaps more memory than you need,
870 * just in case, and don't mind a subsequent overlapping reservation
871 * that is known to be needed.
872 *
873 * The drop_overlaps_that_are_ok() call here isn't really needed.
874 * It would be needed if we had two colliding 'overlap_ok'
875 * reservations, so that the second such would not panic on the
876 * overlap with the first. We don't have any such as of this
877 * writing, but might as well tolerate such if it happens in
878 * the future.
879 */
880void __init reserve_early_overlap_ok(u64 start, u64 end, char *name)
881{
882 drop_overlaps_that_are_ok(start, end);
883 __reserve_early(start, end, name, 1);
884}
885
886/*
887 * Most early reservations come here.
888 *
889 * We first have drop_overlaps_that_are_ok() drop any pre-existing
890 * 'overlap_ok' ranges, so that we can then reserve this memory
891 * range without risk of panic'ing on an overlapping overlap_ok
892 * early reservation.
893 */
894void __init reserve_early(u64 start, u64 end, char *name)
895{
896 if (start >= end)
897 return;
898
899 drop_overlaps_that_are_ok(start, end);
900 __reserve_early(start, end, name, 0);
901}
902
903void __init free_early(u64 start, u64 end)
904{
905 struct early_res *r;
906 int i;
907
908 i = find_overlapped_early(start, end);
909 r = &early_res[i];
910 if (i >= MAX_EARLY_RES || r->end != end || r->start != start)
911 panic("free_early on not reserved area: %llx-%llx!",
912 start, end - 1);
913
914 drop_range(i);
915}
916 759
917void __init early_res_to_bootmem(u64 start, u64 end) 760 if (addr != -1ULL)
918{ 761 return addr;
919 int i, count;
920 u64 final_start, final_end;
921
922 count = 0;
923 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++)
924 count++;
925
926 printk(KERN_INFO "(%d early reservations) ==> bootmem [%010llx - %010llx]\n",
927 count, start, end);
928 for (i = 0; i < count; i++) {
929 struct early_res *r = &early_res[i];
930 printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i,
931 r->start, r->end, r->name);
932 final_start = max(start, r->start);
933 final_end = min(end, r->end);
934 if (final_start >= final_end) {
935 printk(KERN_CONT "\n");
936 continue;
937 }
938 printk(KERN_CONT " ==> [%010llx - %010llx]\n",
939 final_start, final_end);
940 reserve_bootmem_generic(final_start, final_end - final_start,
941 BOOTMEM_DEFAULT);
942 } 762 }
763 return -1ULL;
943} 764}
944 765
945/* Check for already reserved areas */ 766u64 __init find_fw_memmap_area(u64 start, u64 end, u64 size, u64 align)
946static inline int __init bad_addr(u64 *addrp, u64 size, u64 align)
947{
948 int i;
949 u64 addr = *addrp;
950 int changed = 0;
951 struct early_res *r;
952again:
953 i = find_overlapped_early(addr, addr + size);
954 r = &early_res[i];
955 if (i < MAX_EARLY_RES && r->end) {
956 *addrp = addr = round_up(r->end, align);
957 changed = 1;
958 goto again;
959 }
960 return changed;
961}
962
963/* Check for already reserved areas */
964static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align)
965{ 767{
966 int i; 768 return find_e820_area(start, end, size, align);
967 u64 addr = *addrp, last;
968 u64 size = *sizep;
969 int changed = 0;
970again:
971 last = addr + size;
972 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
973 struct early_res *r = &early_res[i];
974 if (last > r->start && addr < r->start) {
975 size = r->start - addr;
976 changed = 1;
977 goto again;
978 }
979 if (last > r->end && addr < r->end) {
980 addr = round_up(r->end, align);
981 size = last - addr;
982 changed = 1;
983 goto again;
984 }
985 if (last <= r->end && addr >= r->start) {
986 (*sizep)++;
987 return 0;
988 }
989 }
990 if (changed) {
991 *addrp = addr;
992 *sizep = size;
993 }
994 return changed;
995} 769}
996 770
997/* 771u64 __init get_max_mapped(void)
998 * Find a free area with specified alignment in a specific range.
999 */
1000u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align)
1001{ 772{
1002 int i; 773 u64 end = max_pfn_mapped;
1003 774
1004 for (i = 0; i < e820.nr_map; i++) { 775 end <<= PAGE_SHIFT;
1005 struct e820entry *ei = &e820.map[i];
1006 u64 addr, last;
1007 u64 ei_last;
1008 776
1009 if (ei->type != E820_RAM) 777 return end;
1010 continue;
1011 addr = round_up(ei->addr, align);
1012 ei_last = ei->addr + ei->size;
1013 if (addr < start)
1014 addr = round_up(start, align);
1015 if (addr >= ei_last)
1016 continue;
1017 while (bad_addr(&addr, size, align) && addr+size <= ei_last)
1018 ;
1019 last = addr + size;
1020 if (last > ei_last)
1021 continue;
1022 if (last > end)
1023 continue;
1024 return addr;
1025 }
1026 return -1ULL;
1027} 778}
1028
1029/* 779/*
1030 * Find next free range after *start 780 * Find next free range after *start
1031 */ 781 */
@@ -1035,25 +785,19 @@ u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align)
1035 785
1036 for (i = 0; i < e820.nr_map; i++) { 786 for (i = 0; i < e820.nr_map; i++) {
1037 struct e820entry *ei = &e820.map[i]; 787 struct e820entry *ei = &e820.map[i];
1038 u64 addr, last; 788 u64 addr;
1039 u64 ei_last; 789 u64 ei_start, ei_last;
1040 790
1041 if (ei->type != E820_RAM) 791 if (ei->type != E820_RAM)
1042 continue; 792 continue;
1043 addr = round_up(ei->addr, align); 793
1044 ei_last = ei->addr + ei->size; 794 ei_last = ei->addr + ei->size;
1045 if (addr < start) 795 ei_start = ei->addr;
1046 addr = round_up(start, align); 796 addr = find_early_area_size(ei_start, ei_last, start,
1047 if (addr >= ei_last) 797 sizep, align);
1048 continue; 798
1049 *sizep = ei_last - addr; 799 if (addr != -1ULL)
1050 while (bad_addr_size(&addr, sizep, align) && 800 return addr;
1051 addr + *sizep <= ei_last)
1052 ;
1053 last = addr + *sizep;
1054 if (last > ei_last)
1055 continue;
1056 return addr;
1057 } 801 }
1058 802
1059 return -1ULL; 803 return -1ULL;
@@ -1412,6 +1156,8 @@ void __init e820_reserve_resources_late(void)
1412 end = MAX_RESOURCE_SIZE; 1156 end = MAX_RESOURCE_SIZE;
1413 if (start >= end) 1157 if (start >= end)
1414 continue; 1158 continue;
1159 printk(KERN_DEBUG "reserve RAM buffer: %016llx - %016llx ",
1160 start, end);
1415 reserve_region_with_split(&iomem_resource, start, end, 1161 reserve_region_with_split(&iomem_resource, start, end,
1416 "RAM buffer"); 1162 "RAM buffer");
1417 } 1163 }
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index cdcfb122f256..c2fa9b8b497e 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -362,7 +362,7 @@ void __init efi_init(void)
362 printk(KERN_ERR PFX "Could not map the firmware vendor!\n"); 362 printk(KERN_ERR PFX "Could not map the firmware vendor!\n");
363 early_iounmap(tmp, 2); 363 early_iounmap(tmp, 2);
364 364
365 printk(KERN_INFO "EFI v%u.%.02u by %s \n", 365 printk(KERN_INFO "EFI v%u.%.02u by %s\n",
366 efi.systab->hdr.revision >> 16, 366 efi.systab->hdr.revision >> 16,
367 efi.systab->hdr.revision & 0xffff, vendor); 367 efi.systab->hdr.revision & 0xffff, vendor);
368 368
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index c097e7d607c6..44a8e0dc6737 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -334,6 +334,10 @@ ENTRY(ret_from_fork)
334END(ret_from_fork) 334END(ret_from_fork)
335 335
336/* 336/*
337 * Interrupt exit functions should be protected against kprobes
338 */
339 .pushsection .kprobes.text, "ax"
340/*
337 * Return to user mode is not as complex as all this looks, 341 * Return to user mode is not as complex as all this looks,
338 * but we want the default path for a system call return to 342 * but we want the default path for a system call return to
339 * go as quickly as possible which is why some of this is 343 * go as quickly as possible which is why some of this is
@@ -383,6 +387,10 @@ need_resched:
383END(resume_kernel) 387END(resume_kernel)
384#endif 388#endif
385 CFI_ENDPROC 389 CFI_ENDPROC
390/*
391 * End of kprobes section
392 */
393 .popsection
386 394
387/* SYSENTER_RETURN points to after the "sysenter" instruction in 395/* SYSENTER_RETURN points to after the "sysenter" instruction in
388 the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */ 396 the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */
@@ -513,6 +521,10 @@ sysexit_audit:
513 PTGS_TO_GS_EX 521 PTGS_TO_GS_EX
514ENDPROC(ia32_sysenter_target) 522ENDPROC(ia32_sysenter_target)
515 523
524/*
525 * syscall stub including irq exit should be protected against kprobes
526 */
527 .pushsection .kprobes.text, "ax"
516 # system call handler stub 528 # system call handler stub
517ENTRY(system_call) 529ENTRY(system_call)
518 RING0_INT_FRAME # can't unwind into user space anyway 530 RING0_INT_FRAME # can't unwind into user space anyway
@@ -705,26 +717,69 @@ syscall_badsys:
705 jmp resume_userspace 717 jmp resume_userspace
706END(syscall_badsys) 718END(syscall_badsys)
707 CFI_ENDPROC 719 CFI_ENDPROC
720/*
721 * End of kprobes section
722 */
723 .popsection
708 724
709/* 725/*
710 * System calls that need a pt_regs pointer. 726 * System calls that need a pt_regs pointer.
711 */ 727 */
712#define PTREGSCALL(name) \ 728#define PTREGSCALL0(name) \
713 ALIGN; \ 729 ALIGN; \
714ptregs_##name: \ 730ptregs_##name: \
715 leal 4(%esp),%eax; \ 731 leal 4(%esp),%eax; \
716 jmp sys_##name; 732 jmp sys_##name;
717 733
718PTREGSCALL(iopl) 734#define PTREGSCALL1(name) \
719PTREGSCALL(fork) 735 ALIGN; \
720PTREGSCALL(clone) 736ptregs_##name: \
721PTREGSCALL(vfork) 737 leal 4(%esp),%edx; \
722PTREGSCALL(execve) 738 movl (PT_EBX+4)(%esp),%eax; \
723PTREGSCALL(sigaltstack) 739 jmp sys_##name;
724PTREGSCALL(sigreturn) 740
725PTREGSCALL(rt_sigreturn) 741#define PTREGSCALL2(name) \
726PTREGSCALL(vm86) 742 ALIGN; \
727PTREGSCALL(vm86old) 743ptregs_##name: \
744 leal 4(%esp),%ecx; \
745 movl (PT_ECX+4)(%esp),%edx; \
746 movl (PT_EBX+4)(%esp),%eax; \
747 jmp sys_##name;
748
749#define PTREGSCALL3(name) \
750 ALIGN; \
751ptregs_##name: \
752 leal 4(%esp),%eax; \
753 pushl %eax; \
754 movl PT_EDX(%eax),%ecx; \
755 movl PT_ECX(%eax),%edx; \
756 movl PT_EBX(%eax),%eax; \
757 call sys_##name; \
758 addl $4,%esp; \
759 ret
760
761PTREGSCALL1(iopl)
762PTREGSCALL0(fork)
763PTREGSCALL0(vfork)
764PTREGSCALL3(execve)
765PTREGSCALL2(sigaltstack)
766PTREGSCALL0(sigreturn)
767PTREGSCALL0(rt_sigreturn)
768PTREGSCALL2(vm86)
769PTREGSCALL1(vm86old)
770
771/* Clone is an oddball. The 4th arg is in %edi */
772 ALIGN;
773ptregs_clone:
774 leal 4(%esp),%eax
775 pushl %eax
776 pushl PT_EDI(%eax)
777 movl PT_EDX(%eax),%ecx
778 movl PT_ECX(%eax),%edx
779 movl PT_EBX(%eax),%eax
780 call sys_clone
781 addl $8,%esp
782 ret
728 783
729.macro FIXUP_ESPFIX_STACK 784.macro FIXUP_ESPFIX_STACK
730/* 785/*
@@ -814,6 +869,10 @@ common_interrupt:
814ENDPROC(common_interrupt) 869ENDPROC(common_interrupt)
815 CFI_ENDPROC 870 CFI_ENDPROC
816 871
872/*
873 * Irq entries should be protected against kprobes
874 */
875 .pushsection .kprobes.text, "ax"
817#define BUILD_INTERRUPT3(name, nr, fn) \ 876#define BUILD_INTERRUPT3(name, nr, fn) \
818ENTRY(name) \ 877ENTRY(name) \
819 RING0_INT_FRAME; \ 878 RING0_INT_FRAME; \
@@ -980,16 +1039,16 @@ ENTRY(spurious_interrupt_bug)
980 jmp error_code 1039 jmp error_code
981 CFI_ENDPROC 1040 CFI_ENDPROC
982END(spurious_interrupt_bug) 1041END(spurious_interrupt_bug)
1042/*
1043 * End of kprobes section
1044 */
1045 .popsection
983 1046
984ENTRY(kernel_thread_helper) 1047ENTRY(kernel_thread_helper)
985 pushl $0 # fake return address for unwinder 1048 pushl $0 # fake return address for unwinder
986 CFI_STARTPROC 1049 CFI_STARTPROC
987 movl %edx,%eax 1050 movl %edi,%eax
988 push %edx 1051 call *%esi
989 CFI_ADJUST_CFA_OFFSET 4
990 call *%ebx
991 push %eax
992 CFI_ADJUST_CFA_OFFSET 4
993 call do_exit 1052 call do_exit
994 ud2 # padding for call trace 1053 ud2 # padding for call trace
995 CFI_ENDPROC 1054 CFI_ENDPROC
@@ -1185,17 +1244,14 @@ END(ftrace_graph_caller)
1185 1244
1186.globl return_to_handler 1245.globl return_to_handler
1187return_to_handler: 1246return_to_handler:
1188 pushl $0
1189 pushl %eax 1247 pushl %eax
1190 pushl %ecx
1191 pushl %edx 1248 pushl %edx
1192 movl %ebp, %eax 1249 movl %ebp, %eax
1193 call ftrace_return_to_handler 1250 call ftrace_return_to_handler
1194 movl %eax, 0xc(%esp) 1251 movl %eax, %ecx
1195 popl %edx 1252 popl %edx
1196 popl %ecx
1197 popl %eax 1253 popl %eax
1198 ret 1254 jmp *%ecx
1199#endif 1255#endif
1200 1256
1201.section .rodata,"a" 1257.section .rodata,"a"
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 5e9b0e538a18..b9ec6cd7796f 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -155,11 +155,11 @@ GLOBAL(return_to_handler)
155 155
156 call ftrace_return_to_handler 156 call ftrace_return_to_handler
157 157
158 movq %rax, 16(%rsp) 158 movq %rax, %rdi
159 movq 8(%rsp), %rdx 159 movq 8(%rsp), %rdx
160 movq (%rsp), %rax 160 movq (%rsp), %rax
161 addq $16, %rsp 161 addq $24, %rsp
162 retq 162 jmp *%rdi
163#endif 163#endif
164 164
165 165
@@ -803,6 +803,10 @@ END(interrupt)
803 call \func 803 call \func
804 .endm 804 .endm
805 805
806/*
807 * Interrupt entry/exit should be protected against kprobes
808 */
809 .pushsection .kprobes.text, "ax"
806 /* 810 /*
807 * The interrupt stubs push (~vector+0x80) onto the stack and 811 * The interrupt stubs push (~vector+0x80) onto the stack and
808 * then jump to common_interrupt. 812 * then jump to common_interrupt.
@@ -941,6 +945,10 @@ ENTRY(retint_kernel)
941 945
942 CFI_ENDPROC 946 CFI_ENDPROC
943END(common_interrupt) 947END(common_interrupt)
948/*
949 * End of kprobes section
950 */
951 .popsection
944 952
945/* 953/*
946 * APIC interrupts. 954 * APIC interrupts.
@@ -969,8 +977,8 @@ apicinterrupt UV_BAU_MESSAGE \
969#endif 977#endif
970apicinterrupt LOCAL_TIMER_VECTOR \ 978apicinterrupt LOCAL_TIMER_VECTOR \
971 apic_timer_interrupt smp_apic_timer_interrupt 979 apic_timer_interrupt smp_apic_timer_interrupt
972apicinterrupt GENERIC_INTERRUPT_VECTOR \ 980apicinterrupt X86_PLATFORM_IPI_VECTOR \
973 generic_interrupt smp_generic_interrupt 981 x86_platform_ipi smp_x86_platform_ipi
974 982
975#ifdef CONFIG_SMP 983#ifdef CONFIG_SMP
976apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \ 984apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \
@@ -1070,10 +1078,10 @@ ENTRY(\sym)
1070 TRACE_IRQS_OFF 1078 TRACE_IRQS_OFF
1071 movq %rsp,%rdi /* pt_regs pointer */ 1079 movq %rsp,%rdi /* pt_regs pointer */
1072 xorl %esi,%esi /* no error code */ 1080 xorl %esi,%esi /* no error code */
1073 PER_CPU(init_tss, %rbp) 1081 PER_CPU(init_tss, %r12)
1074 subq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%rbp) 1082 subq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%r12)
1075 call \do_sym 1083 call \do_sym
1076 addq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%rbp) 1084 addq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%r12)
1077 jmp paranoid_exit /* %ebx: no swapgs flag */ 1085 jmp paranoid_exit /* %ebx: no swapgs flag */
1078 CFI_ENDPROC 1086 CFI_ENDPROC
1079END(\sym) 1087END(\sym)
@@ -1160,63 +1168,20 @@ bad_gs:
1160 jmp 2b 1168 jmp 2b
1161 .previous 1169 .previous
1162 1170
1163/* 1171ENTRY(kernel_thread_helper)
1164 * Create a kernel thread.
1165 *
1166 * C extern interface:
1167 * extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
1168 *
1169 * asm input arguments:
1170 * rdi: fn, rsi: arg, rdx: flags
1171 */
1172ENTRY(kernel_thread)
1173 CFI_STARTPROC
1174 FAKE_STACK_FRAME $child_rip
1175 SAVE_ALL
1176
1177 # rdi: flags, rsi: usp, rdx: will be &pt_regs
1178 movq %rdx,%rdi
1179 orq kernel_thread_flags(%rip),%rdi
1180 movq $-1, %rsi
1181 movq %rsp, %rdx
1182
1183 xorl %r8d,%r8d
1184 xorl %r9d,%r9d
1185
1186 # clone now
1187 call do_fork
1188 movq %rax,RAX(%rsp)
1189 xorl %edi,%edi
1190
1191 /*
1192 * It isn't worth to check for reschedule here,
1193 * so internally to the x86_64 port you can rely on kernel_thread()
1194 * not to reschedule the child before returning, this avoids the need
1195 * of hacks for example to fork off the per-CPU idle tasks.
1196 * [Hopefully no generic code relies on the reschedule -AK]
1197 */
1198 RESTORE_ALL
1199 UNFAKE_STACK_FRAME
1200 ret
1201 CFI_ENDPROC
1202END(kernel_thread)
1203
1204ENTRY(child_rip)
1205 pushq $0 # fake return address 1172 pushq $0 # fake return address
1206 CFI_STARTPROC 1173 CFI_STARTPROC
1207 /* 1174 /*
1208 * Here we are in the child and the registers are set as they were 1175 * Here we are in the child and the registers are set as they were
1209 * at kernel_thread() invocation in the parent. 1176 * at kernel_thread() invocation in the parent.
1210 */ 1177 */
1211 movq %rdi, %rax 1178 call *%rsi
1212 movq %rsi, %rdi
1213 call *%rax
1214 # exit 1179 # exit
1215 mov %eax, %edi 1180 mov %eax, %edi
1216 call do_exit 1181 call do_exit
1217 ud2 # padding for call trace 1182 ud2 # padding for call trace
1218 CFI_ENDPROC 1183 CFI_ENDPROC
1219END(child_rip) 1184END(kernel_thread_helper)
1220 1185
1221/* 1186/*
1222 * execve(). This function needs to use IRET, not SYSRET, to set up all state properly. 1187 * execve(). This function needs to use IRET, not SYSRET, to set up all state properly.
@@ -1493,12 +1458,17 @@ error_kernelspace:
1493 leaq irq_return(%rip),%rcx 1458 leaq irq_return(%rip),%rcx
1494 cmpq %rcx,RIP+8(%rsp) 1459 cmpq %rcx,RIP+8(%rsp)
1495 je error_swapgs 1460 je error_swapgs
1496 movl %ecx,%ecx /* zero extend */ 1461 movl %ecx,%eax /* zero extend */
1497 cmpq %rcx,RIP+8(%rsp) 1462 cmpq %rax,RIP+8(%rsp)
1498 je error_swapgs 1463 je bstep_iret
1499 cmpq $gs_change,RIP+8(%rsp) 1464 cmpq $gs_change,RIP+8(%rsp)
1500 je error_swapgs 1465 je error_swapgs
1501 jmp error_sti 1466 jmp error_sti
1467
1468bstep_iret:
1469 /* Fix truncated RIP */
1470 movq %rcx,RIP+8(%rsp)
1471 jmp error_swapgs
1502END(error_entry) 1472END(error_entry)
1503 1473
1504 1474
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 9dbb527e1652..cd37469b54ee 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -9,6 +9,8 @@
9 * the dangers of modifying code on the run. 9 * the dangers of modifying code on the run.
10 */ 10 */
11 11
12#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
13
12#include <linux/spinlock.h> 14#include <linux/spinlock.h>
13#include <linux/hardirq.h> 15#include <linux/hardirq.h>
14#include <linux/uaccess.h> 16#include <linux/uaccess.h>
@@ -28,14 +30,32 @@
28 30
29#ifdef CONFIG_DYNAMIC_FTRACE 31#ifdef CONFIG_DYNAMIC_FTRACE
30 32
33/*
34 * modifying_code is set to notify NMIs that they need to use
35 * memory barriers when entering or exiting. But we don't want
36 * to burden NMIs with unnecessary memory barriers when code
37 * modification is not being done (which is most of the time).
38 *
39 * A mutex is already held when ftrace_arch_code_modify_prepare
40 * and post_process are called. No locks need to be taken here.
41 *
42 * Stop machine will make sure currently running NMIs are done
43 * and new NMIs will see the updated variable before we need
44 * to worry about NMIs doing memory barriers.
45 */
46static int modifying_code __read_mostly;
47static DEFINE_PER_CPU(int, save_modifying_code);
48
31int ftrace_arch_code_modify_prepare(void) 49int ftrace_arch_code_modify_prepare(void)
32{ 50{
33 set_kernel_text_rw(); 51 set_kernel_text_rw();
52 modifying_code = 1;
34 return 0; 53 return 0;
35} 54}
36 55
37int ftrace_arch_code_modify_post_process(void) 56int ftrace_arch_code_modify_post_process(void)
38{ 57{
58 modifying_code = 0;
39 set_kernel_text_ro(); 59 set_kernel_text_ro();
40 return 0; 60 return 0;
41} 61}
@@ -147,6 +167,11 @@ static void ftrace_mod_code(void)
147 167
148void ftrace_nmi_enter(void) 168void ftrace_nmi_enter(void)
149{ 169{
170 __get_cpu_var(save_modifying_code) = modifying_code;
171
172 if (!__get_cpu_var(save_modifying_code))
173 return;
174
150 if (atomic_inc_return(&nmi_running) & MOD_CODE_WRITE_FLAG) { 175 if (atomic_inc_return(&nmi_running) & MOD_CODE_WRITE_FLAG) {
151 smp_rmb(); 176 smp_rmb();
152 ftrace_mod_code(); 177 ftrace_mod_code();
@@ -158,6 +183,9 @@ void ftrace_nmi_enter(void)
158 183
159void ftrace_nmi_exit(void) 184void ftrace_nmi_exit(void)
160{ 185{
186 if (!__get_cpu_var(save_modifying_code))
187 return;
188
161 /* Finish all executions before clearing nmi_running */ 189 /* Finish all executions before clearing nmi_running */
162 smp_mb(); 190 smp_mb();
163 atomic_dec(&nmi_running); 191 atomic_dec(&nmi_running);
@@ -187,9 +215,26 @@ static void wait_for_nmi(void)
187 nmi_wait_count++; 215 nmi_wait_count++;
188} 216}
189 217
218static inline int
219within(unsigned long addr, unsigned long start, unsigned long end)
220{
221 return addr >= start && addr < end;
222}
223
190static int 224static int
191do_ftrace_mod_code(unsigned long ip, void *new_code) 225do_ftrace_mod_code(unsigned long ip, void *new_code)
192{ 226{
227 /*
228 * On x86_64, kernel text mappings are mapped read-only with
229 * CONFIG_DEBUG_RODATA. So we use the kernel identity mapping instead
230 * of the kernel text mapping to modify the kernel text.
231 *
232 * For 32bit kernels, these mappings are same and we can use
233 * kernel identity mapping to modify code.
234 */
235 if (within(ip, (unsigned long)_text, (unsigned long)_etext))
236 ip = (unsigned long)__va(__pa(ip));
237
193 mod_code_ip = (void *)ip; 238 mod_code_ip = (void *)ip;
194 mod_code_newcode = new_code; 239 mod_code_newcode = new_code;
195 240
@@ -336,15 +381,15 @@ int __init ftrace_dyn_arch_init(void *data)
336 381
337 switch (faulted) { 382 switch (faulted) {
338 case 0: 383 case 0:
339 pr_info("ftrace: converting mcount calls to 0f 1f 44 00 00\n"); 384 pr_info("converting mcount calls to 0f 1f 44 00 00\n");
340 memcpy(ftrace_nop, ftrace_test_p6nop, MCOUNT_INSN_SIZE); 385 memcpy(ftrace_nop, ftrace_test_p6nop, MCOUNT_INSN_SIZE);
341 break; 386 break;
342 case 1: 387 case 1:
343 pr_info("ftrace: converting mcount calls to 66 66 66 66 90\n"); 388 pr_info("converting mcount calls to 66 66 66 66 90\n");
344 memcpy(ftrace_nop, ftrace_test_nop5, MCOUNT_INSN_SIZE); 389 memcpy(ftrace_nop, ftrace_test_nop5, MCOUNT_INSN_SIZE);
345 break; 390 break;
346 case 2: 391 case 2:
347 pr_info("ftrace: converting mcount calls to jmp . + 5\n"); 392 pr_info("converting mcount calls to jmp . + 5\n");
348 memcpy(ftrace_nop, ftrace_test_jmp, MCOUNT_INSN_SIZE); 393 memcpy(ftrace_nop, ftrace_test_jmp, MCOUNT_INSN_SIZE);
349 break; 394 break;
350 } 395 }
@@ -465,85 +510,3 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr,
465 } 510 }
466} 511}
467#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 512#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
468
469#ifdef CONFIG_FTRACE_SYSCALLS
470
471extern unsigned long __start_syscalls_metadata[];
472extern unsigned long __stop_syscalls_metadata[];
473extern unsigned long *sys_call_table;
474
475static struct syscall_metadata **syscalls_metadata;
476
477static struct syscall_metadata *find_syscall_meta(unsigned long *syscall)
478{
479 struct syscall_metadata *start;
480 struct syscall_metadata *stop;
481 char str[KSYM_SYMBOL_LEN];
482
483
484 start = (struct syscall_metadata *)__start_syscalls_metadata;
485 stop = (struct syscall_metadata *)__stop_syscalls_metadata;
486 kallsyms_lookup((unsigned long) syscall, NULL, NULL, NULL, str);
487
488 for ( ; start < stop; start++) {
489 if (start->name && !strcmp(start->name, str))
490 return start;
491 }
492 return NULL;
493}
494
495struct syscall_metadata *syscall_nr_to_meta(int nr)
496{
497 if (!syscalls_metadata || nr >= NR_syscalls || nr < 0)
498 return NULL;
499
500 return syscalls_metadata[nr];
501}
502
503int syscall_name_to_nr(char *name)
504{
505 int i;
506
507 if (!syscalls_metadata)
508 return -1;
509
510 for (i = 0; i < NR_syscalls; i++) {
511 if (syscalls_metadata[i]) {
512 if (!strcmp(syscalls_metadata[i]->name, name))
513 return i;
514 }
515 }
516 return -1;
517}
518
519void set_syscall_enter_id(int num, int id)
520{
521 syscalls_metadata[num]->enter_id = id;
522}
523
524void set_syscall_exit_id(int num, int id)
525{
526 syscalls_metadata[num]->exit_id = id;
527}
528
529static int __init arch_init_ftrace_syscalls(void)
530{
531 int i;
532 struct syscall_metadata *meta;
533 unsigned long **psys_syscall_table = &sys_call_table;
534
535 syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) *
536 NR_syscalls, GFP_KERNEL);
537 if (!syscalls_metadata) {
538 WARN_ON(1);
539 return -ENOMEM;
540 }
541
542 for (i = 0; i < NR_syscalls; i++) {
543 meta = find_syscall_meta(psys_syscall_table[i]);
544 syscalls_metadata[i] = meta;
545 }
546 return 0;
547}
548arch_initcall(arch_init_ftrace_syscalls);
549#endif
diff --git a/arch/x86/kernel/geode_32.c b/arch/x86/kernel/geode_32.c
deleted file mode 100644
index 9b08e852fd1a..000000000000
--- a/arch/x86/kernel/geode_32.c
+++ /dev/null
@@ -1,196 +0,0 @@
1/*
2 * AMD Geode southbridge support code
3 * Copyright (C) 2006, Advanced Micro Devices, Inc.
4 * Copyright (C) 2007, Andres Salomon <dilinger@debian.org>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of version 2 of the GNU General Public License
8 * as published by the Free Software Foundation.
9 */
10
11#include <linux/kernel.h>
12#include <linux/module.h>
13#include <linux/ioport.h>
14#include <linux/io.h>
15#include <asm/msr.h>
16#include <asm/geode.h>
17
18static struct {
19 char *name;
20 u32 msr;
21 int size;
22 u32 base;
23} lbars[] = {
24 { "geode-pms", MSR_LBAR_PMS, LBAR_PMS_SIZE, 0 },
25 { "geode-acpi", MSR_LBAR_ACPI, LBAR_ACPI_SIZE, 0 },
26 { "geode-gpio", MSR_LBAR_GPIO, LBAR_GPIO_SIZE, 0 },
27 { "geode-mfgpt", MSR_LBAR_MFGPT, LBAR_MFGPT_SIZE, 0 }
28};
29
30static void __init init_lbars(void)
31{
32 u32 lo, hi;
33 int i;
34
35 for (i = 0; i < ARRAY_SIZE(lbars); i++) {
36 rdmsr(lbars[i].msr, lo, hi);
37 if (hi & 0x01)
38 lbars[i].base = lo & 0x0000ffff;
39
40 if (lbars[i].base == 0)
41 printk(KERN_ERR "geode: Couldn't initialize '%s'\n",
42 lbars[i].name);
43 }
44}
45
46int geode_get_dev_base(unsigned int dev)
47{
48 BUG_ON(dev >= ARRAY_SIZE(lbars));
49 return lbars[dev].base;
50}
51EXPORT_SYMBOL_GPL(geode_get_dev_base);
52
53/* === GPIO API === */
54
55void geode_gpio_set(u32 gpio, unsigned int reg)
56{
57 u32 base = geode_get_dev_base(GEODE_DEV_GPIO);
58
59 if (!base)
60 return;
61
62 /* low bank register */
63 if (gpio & 0xFFFF)
64 outl(gpio & 0xFFFF, base + reg);
65 /* high bank register */
66 gpio >>= 16;
67 if (gpio)
68 outl(gpio, base + 0x80 + reg);
69}
70EXPORT_SYMBOL_GPL(geode_gpio_set);
71
72void geode_gpio_clear(u32 gpio, unsigned int reg)
73{
74 u32 base = geode_get_dev_base(GEODE_DEV_GPIO);
75
76 if (!base)
77 return;
78
79 /* low bank register */
80 if (gpio & 0xFFFF)
81 outl((gpio & 0xFFFF) << 16, base + reg);
82 /* high bank register */
83 gpio &= (0xFFFF << 16);
84 if (gpio)
85 outl(gpio, base + 0x80 + reg);
86}
87EXPORT_SYMBOL_GPL(geode_gpio_clear);
88
89int geode_gpio_isset(u32 gpio, unsigned int reg)
90{
91 u32 base = geode_get_dev_base(GEODE_DEV_GPIO);
92 u32 val;
93
94 if (!base)
95 return 0;
96
97 /* low bank register */
98 if (gpio & 0xFFFF) {
99 val = inl(base + reg) & (gpio & 0xFFFF);
100 if ((gpio & 0xFFFF) == val)
101 return 1;
102 }
103 /* high bank register */
104 gpio >>= 16;
105 if (gpio) {
106 val = inl(base + 0x80 + reg) & gpio;
107 if (gpio == val)
108 return 1;
109 }
110 return 0;
111}
112EXPORT_SYMBOL_GPL(geode_gpio_isset);
113
114void geode_gpio_set_irq(unsigned int group, unsigned int irq)
115{
116 u32 lo, hi;
117
118 if (group > 7 || irq > 15)
119 return;
120
121 rdmsr(MSR_PIC_ZSEL_HIGH, lo, hi);
122
123 lo &= ~(0xF << (group * 4));
124 lo |= (irq & 0xF) << (group * 4);
125
126 wrmsr(MSR_PIC_ZSEL_HIGH, lo, hi);
127}
128EXPORT_SYMBOL_GPL(geode_gpio_set_irq);
129
130void geode_gpio_setup_event(unsigned int gpio, int pair, int pme)
131{
132 u32 base = geode_get_dev_base(GEODE_DEV_GPIO);
133 u32 offset, shift, val;
134
135 if (gpio >= 24)
136 offset = GPIO_MAP_W;
137 else if (gpio >= 16)
138 offset = GPIO_MAP_Z;
139 else if (gpio >= 8)
140 offset = GPIO_MAP_Y;
141 else
142 offset = GPIO_MAP_X;
143
144 shift = (gpio % 8) * 4;
145
146 val = inl(base + offset);
147
148 /* Clear whatever was there before */
149 val &= ~(0xF << shift);
150
151 /* And set the new value */
152
153 val |= ((pair & 7) << shift);
154
155 /* Set the PME bit if this is a PME event */
156
157 if (pme)
158 val |= (1 << (shift + 3));
159
160 outl(val, base + offset);
161}
162EXPORT_SYMBOL_GPL(geode_gpio_setup_event);
163
164int geode_has_vsa2(void)
165{
166 static int has_vsa2 = -1;
167
168 if (has_vsa2 == -1) {
169 u16 val;
170
171 /*
172 * The VSA has virtual registers that we can query for a
173 * signature.
174 */
175 outw(VSA_VR_UNLOCK, VSA_VRC_INDEX);
176 outw(VSA_VR_SIGNATURE, VSA_VRC_INDEX);
177
178 val = inw(VSA_VRC_DATA);
179 has_vsa2 = (val == AMD_VSA_SIG || val == GSW_VSA_SIG);
180 }
181
182 return has_vsa2;
183}
184EXPORT_SYMBOL_GPL(geode_has_vsa2);
185
186static int __init geode_southbridge_init(void)
187{
188 if (!is_geode())
189 return -ENODEV;
190
191 init_lbars();
192 (void) mfgpt_timer_setup();
193 return 0;
194}
195
196postcore_initcall(geode_southbridge_init);
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 4f8e2507e8f3..b2e246037392 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -7,6 +7,7 @@
7 7
8#include <linux/init.h> 8#include <linux/init.h>
9#include <linux/start_kernel.h> 9#include <linux/start_kernel.h>
10#include <linux/mm.h>
10 11
11#include <asm/setup.h> 12#include <asm/setup.h>
12#include <asm/sections.h> 13#include <asm/sections.h>
@@ -29,16 +30,25 @@ static void __init i386_default_early_setup(void)
29 30
30void __init i386_start_kernel(void) 31void __init i386_start_kernel(void)
31{ 32{
32 reserve_trampoline_memory(); 33#ifdef CONFIG_X86_TRAMPOLINE
34 /*
35 * But first pinch a few for the stack/trampoline stuff
36 * FIXME: Don't need the extra page at 4K, but need to fix
37 * trampoline before removing it. (see the GDT stuff)
38 */
39 reserve_early_overlap_ok(PAGE_SIZE, PAGE_SIZE + PAGE_SIZE,
40 "EX TRAMPOLINE");
41#endif
33 42
34 reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); 43 reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
35 44
36#ifdef CONFIG_BLK_DEV_INITRD 45#ifdef CONFIG_BLK_DEV_INITRD
37 /* Reserve INITRD */ 46 /* Reserve INITRD */
38 if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) { 47 if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
48 /* Assume only end is not page aligned */
39 u64 ramdisk_image = boot_params.hdr.ramdisk_image; 49 u64 ramdisk_image = boot_params.hdr.ramdisk_image;
40 u64 ramdisk_size = boot_params.hdr.ramdisk_size; 50 u64 ramdisk_size = boot_params.hdr.ramdisk_size;
41 u64 ramdisk_end = ramdisk_image + ramdisk_size; 51 u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
42 reserve_early(ramdisk_image, ramdisk_end, "RAMDISK"); 52 reserve_early(ramdisk_image, ramdisk_end, "RAMDISK");
43 } 53 }
44#endif 54#endif
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 0b06cd778fd9..7147143fd614 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -98,16 +98,15 @@ void __init x86_64_start_reservations(char *real_mode_data)
98{ 98{
99 copy_bootdata(__va(real_mode_data)); 99 copy_bootdata(__va(real_mode_data));
100 100
101 reserve_trampoline_memory();
102
103 reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); 101 reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
104 102
105#ifdef CONFIG_BLK_DEV_INITRD 103#ifdef CONFIG_BLK_DEV_INITRD
106 /* Reserve INITRD */ 104 /* Reserve INITRD */
107 if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) { 105 if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
106 /* Assume only end is not page aligned */
108 unsigned long ramdisk_image = boot_params.hdr.ramdisk_image; 107 unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
109 unsigned long ramdisk_size = boot_params.hdr.ramdisk_size; 108 unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
110 unsigned long ramdisk_end = ramdisk_image + ramdisk_size; 109 unsigned long ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
111 reserve_early(ramdisk_image, ramdisk_end, "RAMDISK"); 110 reserve_early(ramdisk_image, ramdisk_end, "RAMDISK");
112 } 111 }
113#endif 112#endif
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 050c278481b1..37c3d4b17d85 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -18,6 +18,8 @@
18#include <asm/asm-offsets.h> 18#include <asm/asm-offsets.h>
19#include <asm/setup.h> 19#include <asm/setup.h>
20#include <asm/processor-flags.h> 20#include <asm/processor-flags.h>
21#include <asm/msr-index.h>
22#include <asm/cpufeature.h>
21#include <asm/percpu.h> 23#include <asm/percpu.h>
22 24
23/* Physical address */ 25/* Physical address */
@@ -297,25 +299,27 @@ ENTRY(startup_32_smp)
297 orl %edx,%eax 299 orl %edx,%eax
298 movl %eax,%cr4 300 movl %eax,%cr4
299 301
300 btl $5, %eax # check if PAE is enabled 302 testb $X86_CR4_PAE, %al # check if PAE is enabled
301 jnc 6f 303 jz 6f
302 304
303 /* Check if extended functions are implemented */ 305 /* Check if extended functions are implemented */
304 movl $0x80000000, %eax 306 movl $0x80000000, %eax
305 cpuid 307 cpuid
306 cmpl $0x80000000, %eax 308 /* Value must be in the range 0x80000001 to 0x8000ffff */
307 jbe 6f 309 subl $0x80000001, %eax
310 cmpl $(0x8000ffff-0x80000001), %eax
311 ja 6f
308 mov $0x80000001, %eax 312 mov $0x80000001, %eax
309 cpuid 313 cpuid
310 /* Execute Disable bit supported? */ 314 /* Execute Disable bit supported? */
311 btl $20, %edx 315 btl $(X86_FEATURE_NX & 31), %edx
312 jnc 6f 316 jnc 6f
313 317
314 /* Setup EFER (Extended Feature Enable Register) */ 318 /* Setup EFER (Extended Feature Enable Register) */
315 movl $0xc0000080, %ecx 319 movl $MSR_EFER, %ecx
316 rdmsr 320 rdmsr
317 321
318 btsl $11, %eax 322 btsl $_EFER_NX, %eax
319 /* Make changes effective */ 323 /* Make changes effective */
320 wrmsr 324 wrmsr
321 325
@@ -438,8 +442,8 @@ is386: movl $2,%ecx # set MP
438 */ 442 */
439 cmpb $0,ready 443 cmpb $0,ready
440 jne 1f 444 jne 1f
441 movl $per_cpu__gdt_page,%eax 445 movl $gdt_page,%eax
442 movl $per_cpu__stack_canary,%ecx 446 movl $stack_canary,%ecx
443 movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax) 447 movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax)
444 shrl $16, %ecx 448 shrl $16, %ecx
445 movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax) 449 movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax)
@@ -702,7 +706,7 @@ idt_descr:
702 .word 0 # 32 bit align gdt_desc.address 706 .word 0 # 32 bit align gdt_desc.address
703ENTRY(early_gdt_descr) 707ENTRY(early_gdt_descr)
704 .word GDT_ENTRIES*8-1 708 .word GDT_ENTRIES*8-1
705 .long per_cpu__gdt_page /* Overwritten for secondary CPUs */ 709 .long gdt_page /* Overwritten for secondary CPUs */
706 710
707/* 711/*
708 * The boot_gdt must mirror the equivalent in setup.S and is 712 * The boot_gdt must mirror the equivalent in setup.S and is
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 780cd928fcd5..3d1e6f16b7a6 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -27,7 +27,7 @@
27#define GET_CR2_INTO_RCX movq %cr2, %rcx 27#define GET_CR2_INTO_RCX movq %cr2, %rcx
28#endif 28#endif
29 29
30/* we are not able to switch in one step to the final KERNEL ADRESS SPACE 30/* we are not able to switch in one step to the final KERNEL ADDRESS SPACE
31 * because we need identity-mapped pages. 31 * because we need identity-mapped pages.
32 * 32 *
33 */ 33 */
@@ -212,8 +212,8 @@ ENTRY(secondary_startup_64)
212 */ 212 */
213 lgdt early_gdt_descr(%rip) 213 lgdt early_gdt_descr(%rip)
214 214
215 /* set up data segments. actually 0 would do too */ 215 /* set up data segments */
216 movl $__KERNEL_DS,%eax 216 xorl %eax,%eax
217 movl %eax,%ds 217 movl %eax,%ds
218 movl %eax,%ss 218 movl %eax,%ss
219 movl %eax,%es 219 movl %eax,%es
@@ -262,11 +262,11 @@ ENTRY(secondary_startup_64)
262 .quad x86_64_start_kernel 262 .quad x86_64_start_kernel
263 ENTRY(initial_gs) 263 ENTRY(initial_gs)
264 .quad INIT_PER_CPU_VAR(irq_stack_union) 264 .quad INIT_PER_CPU_VAR(irq_stack_union)
265 __FINITDATA
266 265
267 ENTRY(stack_start) 266 ENTRY(stack_start)
268 .quad init_thread_union+THREAD_SIZE-8 267 .quad init_thread_union+THREAD_SIZE-8
269 .word 0 268 .word 0
269 __FINITDATA
270 270
271bad_address: 271bad_address:
272 jmp bad_address 272 jmp bad_address
@@ -340,6 +340,7 @@ ENTRY(name)
340 i = i + 1 ; \ 340 i = i + 1 ; \
341 .endr 341 .endr
342 342
343 .data
343 /* 344 /*
344 * This default setting generates an ident mapping at address 0x100000 345 * This default setting generates an ident mapping at address 0x100000
345 * and a mapping for the kernel that precisely maps virtual address 346 * and a mapping for the kernel that precisely maps virtual address
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index dedc2bddf7a5..23b4ecdffa9b 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -4,6 +4,7 @@
4#include <linux/sysdev.h> 4#include <linux/sysdev.h>
5#include <linux/delay.h> 5#include <linux/delay.h>
6#include <linux/errno.h> 6#include <linux/errno.h>
7#include <linux/slab.h>
7#include <linux/hpet.h> 8#include <linux/hpet.h>
8#include <linux/init.h> 9#include <linux/init.h>
9#include <linux/cpu.h> 10#include <linux/cpu.h>
@@ -33,6 +34,9 @@
33 * HPET address is set in acpi/boot.c, when an ACPI entry exists 34 * HPET address is set in acpi/boot.c, when an ACPI entry exists
34 */ 35 */
35unsigned long hpet_address; 36unsigned long hpet_address;
37u8 hpet_blockid; /* OS timer block num */
38u8 hpet_msi_disable;
39
36#ifdef CONFIG_PCI_MSI 40#ifdef CONFIG_PCI_MSI
37static unsigned long hpet_num_timers; 41static unsigned long hpet_num_timers;
38#endif 42#endif
@@ -47,12 +51,12 @@ struct hpet_dev {
47 char name[10]; 51 char name[10];
48}; 52};
49 53
50unsigned long hpet_readl(unsigned long a) 54inline unsigned int hpet_readl(unsigned int a)
51{ 55{
52 return readl(hpet_virt_address + a); 56 return readl(hpet_virt_address + a);
53} 57}
54 58
55static inline void hpet_writel(unsigned long d, unsigned long a) 59static inline void hpet_writel(unsigned int d, unsigned int a)
56{ 60{
57 writel(d, hpet_virt_address + a); 61 writel(d, hpet_virt_address + a);
58} 62}
@@ -167,7 +171,7 @@ do { \
167 171
168static void hpet_reserve_msi_timers(struct hpet_data *hd); 172static void hpet_reserve_msi_timers(struct hpet_data *hd);
169 173
170static void hpet_reserve_platform_timers(unsigned long id) 174static void hpet_reserve_platform_timers(unsigned int id)
171{ 175{
172 struct hpet __iomem *hpet = hpet_virt_address; 176 struct hpet __iomem *hpet = hpet_virt_address;
173 struct hpet_timer __iomem *timer = &hpet->hpet_timers[2]; 177 struct hpet_timer __iomem *timer = &hpet->hpet_timers[2];
@@ -205,7 +209,7 @@ static void hpet_reserve_platform_timers(unsigned long id)
205 209
206} 210}
207#else 211#else
208static void hpet_reserve_platform_timers(unsigned long id) { } 212static void hpet_reserve_platform_timers(unsigned int id) { }
209#endif 213#endif
210 214
211/* 215/*
@@ -246,7 +250,7 @@ static void hpet_reset_counter(void)
246 250
247static void hpet_start_counter(void) 251static void hpet_start_counter(void)
248{ 252{
249 unsigned long cfg = hpet_readl(HPET_CFG); 253 unsigned int cfg = hpet_readl(HPET_CFG);
250 cfg |= HPET_CFG_ENABLE; 254 cfg |= HPET_CFG_ENABLE;
251 hpet_writel(cfg, HPET_CFG); 255 hpet_writel(cfg, HPET_CFG);
252} 256}
@@ -263,7 +267,7 @@ static void hpet_resume_device(void)
263 force_hpet_resume(); 267 force_hpet_resume();
264} 268}
265 269
266static void hpet_resume_counter(void) 270static void hpet_resume_counter(struct clocksource *cs)
267{ 271{
268 hpet_resume_device(); 272 hpet_resume_device();
269 hpet_restart_counter(); 273 hpet_restart_counter();
@@ -271,7 +275,7 @@ static void hpet_resume_counter(void)
271 275
272static void hpet_enable_legacy_int(void) 276static void hpet_enable_legacy_int(void)
273{ 277{
274 unsigned long cfg = hpet_readl(HPET_CFG); 278 unsigned int cfg = hpet_readl(HPET_CFG);
275 279
276 cfg |= HPET_CFG_LEGACY; 280 cfg |= HPET_CFG_LEGACY;
277 hpet_writel(cfg, HPET_CFG); 281 hpet_writel(cfg, HPET_CFG);
@@ -314,7 +318,7 @@ static int hpet_setup_msi_irq(unsigned int irq);
314static void hpet_set_mode(enum clock_event_mode mode, 318static void hpet_set_mode(enum clock_event_mode mode,
315 struct clock_event_device *evt, int timer) 319 struct clock_event_device *evt, int timer)
316{ 320{
317 unsigned long cfg, cmp, now; 321 unsigned int cfg, cmp, now;
318 uint64_t delta; 322 uint64_t delta;
319 323
320 switch (mode) { 324 switch (mode) {
@@ -323,7 +327,7 @@ static void hpet_set_mode(enum clock_event_mode mode,
323 delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * evt->mult; 327 delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * evt->mult;
324 delta >>= evt->shift; 328 delta >>= evt->shift;
325 now = hpet_readl(HPET_COUNTER); 329 now = hpet_readl(HPET_COUNTER);
326 cmp = now + (unsigned long) delta; 330 cmp = now + (unsigned int) delta;
327 cfg = hpet_readl(HPET_Tn_CFG(timer)); 331 cfg = hpet_readl(HPET_Tn_CFG(timer));
328 /* Make sure we use edge triggered interrupts */ 332 /* Make sure we use edge triggered interrupts */
329 cfg &= ~HPET_TN_LEVEL; 333 cfg &= ~HPET_TN_LEVEL;
@@ -339,7 +343,7 @@ static void hpet_set_mode(enum clock_event_mode mode,
339 * (See AMD-8111 HyperTransport I/O Hub Data Sheet, 343 * (See AMD-8111 HyperTransport I/O Hub Data Sheet,
340 * Publication # 24674) 344 * Publication # 24674)
341 */ 345 */
342 hpet_writel((unsigned long) delta, HPET_Tn_CMP(timer)); 346 hpet_writel((unsigned int) delta, HPET_Tn_CMP(timer));
343 hpet_start_counter(); 347 hpet_start_counter();
344 hpet_print_config(); 348 hpet_print_config();
345 break; 349 break;
@@ -383,13 +387,30 @@ static int hpet_next_event(unsigned long delta,
383 hpet_writel(cnt, HPET_Tn_CMP(timer)); 387 hpet_writel(cnt, HPET_Tn_CMP(timer));
384 388
385 /* 389 /*
386 * We need to read back the CMP register to make sure that 390 * We need to read back the CMP register on certain HPET
387 * what we wrote hit the chip before we compare it to the 391 * implementations (ATI chipsets) which seem to delay the
388 * counter. 392 * transfer of the compare register into the internal compare
393 * logic. With small deltas this might actually be too late as
394 * the counter could already be higher than the compare value
395 * at that point and we would wait for the next hpet interrupt
396 * forever. We found out that reading the CMP register back
397 * forces the transfer so we can rely on the comparison with
398 * the counter register below. If the read back from the
399 * compare register does not match the value we programmed
400 * then we might have a real hardware problem. We can not do
401 * much about it here, but at least alert the user/admin with
402 * a prominent warning.
403 * An erratum on some chipsets (ICH9,..), results in comparator read
404 * immediately following a write returning old value. Workaround
405 * for this is to read this value second time, when first
406 * read returns old value.
389 */ 407 */
390 WARN_ON_ONCE((u32)hpet_readl(HPET_Tn_CMP(timer)) != cnt); 408 if (unlikely((u32)hpet_readl(HPET_Tn_CMP(timer)) != cnt)) {
409 WARN_ONCE(hpet_readl(HPET_Tn_CMP(timer)) != cnt,
410 KERN_WARNING "hpet: compare register read back failed.\n");
411 }
391 412
392 return (s32)((u32)hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0; 413 return (s32)(hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0;
393} 414}
394 415
395static void hpet_legacy_set_mode(enum clock_event_mode mode, 416static void hpet_legacy_set_mode(enum clock_event_mode mode,
@@ -415,7 +436,7 @@ static struct hpet_dev *hpet_devs;
415void hpet_msi_unmask(unsigned int irq) 436void hpet_msi_unmask(unsigned int irq)
416{ 437{
417 struct hpet_dev *hdev = get_irq_data(irq); 438 struct hpet_dev *hdev = get_irq_data(irq);
418 unsigned long cfg; 439 unsigned int cfg;
419 440
420 /* unmask it */ 441 /* unmask it */
421 cfg = hpet_readl(HPET_Tn_CFG(hdev->num)); 442 cfg = hpet_readl(HPET_Tn_CFG(hdev->num));
@@ -425,7 +446,7 @@ void hpet_msi_unmask(unsigned int irq)
425 446
426void hpet_msi_mask(unsigned int irq) 447void hpet_msi_mask(unsigned int irq)
427{ 448{
428 unsigned long cfg; 449 unsigned int cfg;
429 struct hpet_dev *hdev = get_irq_data(irq); 450 struct hpet_dev *hdev = get_irq_data(irq);
430 451
431 /* mask it */ 452 /* mask it */
@@ -467,7 +488,7 @@ static int hpet_msi_next_event(unsigned long delta,
467 488
468static int hpet_setup_msi_irq(unsigned int irq) 489static int hpet_setup_msi_irq(unsigned int irq)
469{ 490{
470 if (arch_setup_hpet_msi(irq)) { 491 if (arch_setup_hpet_msi(irq, hpet_blockid)) {
471 destroy_irq(irq); 492 destroy_irq(irq);
472 return -EINVAL; 493 return -EINVAL;
473 } 494 }
@@ -584,6 +605,11 @@ static void hpet_msi_capability_lookup(unsigned int start_timer)
584 unsigned int num_timers_used = 0; 605 unsigned int num_timers_used = 0;
585 int i; 606 int i;
586 607
608 if (hpet_msi_disable)
609 return;
610
611 if (boot_cpu_has(X86_FEATURE_ARAT))
612 return;
587 id = hpet_readl(HPET_ID); 613 id = hpet_readl(HPET_ID);
588 614
589 num_timers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT); 615 num_timers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT);
@@ -598,7 +624,7 @@ static void hpet_msi_capability_lookup(unsigned int start_timer)
598 624
599 for (i = start_timer; i < num_timers - RESERVE_TIMERS; i++) { 625 for (i = start_timer; i < num_timers - RESERVE_TIMERS; i++) {
600 struct hpet_dev *hdev = &hpet_devs[num_timers_used]; 626 struct hpet_dev *hdev = &hpet_devs[num_timers_used];
601 unsigned long cfg = hpet_readl(HPET_Tn_CFG(i)); 627 unsigned int cfg = hpet_readl(HPET_Tn_CFG(i));
602 628
603 /* Only consider HPET timer with MSI support */ 629 /* Only consider HPET timer with MSI support */
604 if (!(cfg & HPET_TN_FSB_CAP)) 630 if (!(cfg & HPET_TN_FSB_CAP))
@@ -813,7 +839,7 @@ static int hpet_clocksource_register(void)
813 */ 839 */
814int __init hpet_enable(void) 840int __init hpet_enable(void)
815{ 841{
816 unsigned long id; 842 unsigned int id;
817 int i; 843 int i;
818 844
819 if (!is_hpet_capable()) 845 if (!is_hpet_capable())
@@ -872,10 +898,8 @@ int __init hpet_enable(void)
872 898
873 if (id & HPET_ID_LEGSUP) { 899 if (id & HPET_ID_LEGSUP) {
874 hpet_legacy_clockevent_register(); 900 hpet_legacy_clockevent_register();
875 hpet_msi_capability_lookup(2);
876 return 1; 901 return 1;
877 } 902 }
878 hpet_msi_capability_lookup(0);
879 return 0; 903 return 0;
880 904
881out_nohpet: 905out_nohpet:
@@ -908,9 +932,20 @@ static __init int hpet_late_init(void)
908 if (!hpet_virt_address) 932 if (!hpet_virt_address)
909 return -ENODEV; 933 return -ENODEV;
910 934
935 if (hpet_readl(HPET_ID) & HPET_ID_LEGSUP)
936 hpet_msi_capability_lookup(2);
937 else
938 hpet_msi_capability_lookup(0);
939
911 hpet_reserve_platform_timers(hpet_readl(HPET_ID)); 940 hpet_reserve_platform_timers(hpet_readl(HPET_ID));
912 hpet_print_config(); 941 hpet_print_config();
913 942
943 if (hpet_msi_disable)
944 return 0;
945
946 if (boot_cpu_has(X86_FEATURE_ARAT))
947 return 0;
948
914 for_each_online_cpu(cpu) { 949 for_each_online_cpu(cpu) {
915 hpet_cpuhp_notify(NULL, CPU_ONLINE, (void *)(long)cpu); 950 hpet_cpuhp_notify(NULL, CPU_ONLINE, (void *)(long)cpu);
916 } 951 }
@@ -925,7 +960,7 @@ fs_initcall(hpet_late_init);
925void hpet_disable(void) 960void hpet_disable(void)
926{ 961{
927 if (is_hpet_capable()) { 962 if (is_hpet_capable()) {
928 unsigned long cfg = hpet_readl(HPET_CFG); 963 unsigned int cfg = hpet_readl(HPET_CFG);
929 964
930 if (hpet_legacy_int_enabled) { 965 if (hpet_legacy_int_enabled) {
931 cfg &= ~HPET_CFG_LEGACY; 966 cfg &= ~HPET_CFG_LEGACY;
@@ -965,8 +1000,8 @@ static int hpet_prev_update_sec;
965static struct rtc_time hpet_alarm_time; 1000static struct rtc_time hpet_alarm_time;
966static unsigned long hpet_pie_count; 1001static unsigned long hpet_pie_count;
967static u32 hpet_t1_cmp; 1002static u32 hpet_t1_cmp;
968static unsigned long hpet_default_delta; 1003static u32 hpet_default_delta;
969static unsigned long hpet_pie_delta; 1004static u32 hpet_pie_delta;
970static unsigned long hpet_pie_limit; 1005static unsigned long hpet_pie_limit;
971 1006
972static rtc_irq_handler irq_handler; 1007static rtc_irq_handler irq_handler;
@@ -1017,7 +1052,8 @@ EXPORT_SYMBOL_GPL(hpet_unregister_irq_handler);
1017 */ 1052 */
1018int hpet_rtc_timer_init(void) 1053int hpet_rtc_timer_init(void)
1019{ 1054{
1020 unsigned long cfg, cnt, delta, flags; 1055 unsigned int cfg, cnt, delta;
1056 unsigned long flags;
1021 1057
1022 if (!is_hpet_enabled()) 1058 if (!is_hpet_enabled())
1023 return 0; 1059 return 0;
@@ -1027,7 +1063,7 @@ int hpet_rtc_timer_init(void)
1027 1063
1028 clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC; 1064 clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC;
1029 clc >>= hpet_clockevent.shift + DEFAULT_RTC_SHIFT; 1065 clc >>= hpet_clockevent.shift + DEFAULT_RTC_SHIFT;
1030 hpet_default_delta = (unsigned long) clc; 1066 hpet_default_delta = clc;
1031 } 1067 }
1032 1068
1033 if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit) 1069 if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit)
@@ -1113,7 +1149,8 @@ int hpet_set_periodic_freq(unsigned long freq)
1113 clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC; 1149 clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC;
1114 do_div(clc, freq); 1150 do_div(clc, freq);
1115 clc >>= hpet_clockevent.shift; 1151 clc >>= hpet_clockevent.shift;
1116 hpet_pie_delta = (unsigned long) clc; 1152 hpet_pie_delta = clc;
1153 hpet_pie_limit = 0;
1117 } 1154 }
1118 return 1; 1155 return 1;
1119} 1156}
@@ -1127,7 +1164,7 @@ EXPORT_SYMBOL_GPL(hpet_rtc_dropped_irq);
1127 1164
1128static void hpet_rtc_timer_reinit(void) 1165static void hpet_rtc_timer_reinit(void)
1129{ 1166{
1130 unsigned long cfg, delta; 1167 unsigned int cfg, delta;
1131 int lost_ints = -1; 1168 int lost_ints = -1;
1132 1169
1133 if (unlikely(!hpet_rtc_flags)) { 1170 if (unlikely(!hpet_rtc_flags)) {
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
new file mode 100644
index 000000000000..d6cc065f519f
--- /dev/null
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -0,0 +1,530 @@
1/*
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License as published by
4 * the Free Software Foundation; either version 2 of the License, or
5 * (at your option) any later version.
6 *
7 * This program is distributed in the hope that it will be useful,
8 * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 * GNU General Public License for more details.
11 *
12 * You should have received a copy of the GNU General Public License
13 * along with this program; if not, write to the Free Software
14 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
15 *
16 * Copyright (C) 2007 Alan Stern
17 * Copyright (C) 2009 IBM Corporation
18 * Copyright (C) 2009 Frederic Weisbecker <fweisbec@gmail.com>
19 *
20 * Authors: Alan Stern <stern@rowland.harvard.edu>
21 * K.Prasad <prasad@linux.vnet.ibm.com>
22 * Frederic Weisbecker <fweisbec@gmail.com>
23 */
24
25/*
26 * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility,
27 * using the CPU's debug registers.
28 */
29
30#include <linux/perf_event.h>
31#include <linux/hw_breakpoint.h>
32#include <linux/irqflags.h>
33#include <linux/notifier.h>
34#include <linux/kallsyms.h>
35#include <linux/kprobes.h>
36#include <linux/percpu.h>
37#include <linux/kdebug.h>
38#include <linux/kernel.h>
39#include <linux/module.h>
40#include <linux/sched.h>
41#include <linux/init.h>
42#include <linux/smp.h>
43
44#include <asm/hw_breakpoint.h>
45#include <asm/processor.h>
46#include <asm/debugreg.h>
47
48/* Per cpu debug control register value */
49DEFINE_PER_CPU(unsigned long, cpu_dr7);
50EXPORT_PER_CPU_SYMBOL(cpu_dr7);
51
52/* Per cpu debug address registers values */
53static DEFINE_PER_CPU(unsigned long, cpu_debugreg[HBP_NUM]);
54
55/*
56 * Stores the breakpoints currently in use on each breakpoint address
57 * register, for each cpu
58 */
59static DEFINE_PER_CPU(struct perf_event *, bp_per_reg[HBP_NUM]);
60
61
62static inline unsigned long
63__encode_dr7(int drnum, unsigned int len, unsigned int type)
64{
65 unsigned long bp_info;
66
67 bp_info = (len | type) & 0xf;
68 bp_info <<= (DR_CONTROL_SHIFT + drnum * DR_CONTROL_SIZE);
69 bp_info |= (DR_GLOBAL_ENABLE << (drnum * DR_ENABLE_SIZE));
70
71 return bp_info;
72}
73
74/*
75 * Encode the length, type, Exact, and Enable bits for a particular breakpoint
76 * as stored in debug register 7.
77 */
78unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type)
79{
80 return __encode_dr7(drnum, len, type) | DR_GLOBAL_SLOWDOWN;
81}
82
83/*
84 * Decode the length and type bits for a particular breakpoint as
85 * stored in debug register 7. Return the "enabled" status.
86 */
87int decode_dr7(unsigned long dr7, int bpnum, unsigned *len, unsigned *type)
88{
89 int bp_info = dr7 >> (DR_CONTROL_SHIFT + bpnum * DR_CONTROL_SIZE);
90
91 *len = (bp_info & 0xc) | 0x40;
92 *type = (bp_info & 0x3) | 0x80;
93
94 return (dr7 >> (bpnum * DR_ENABLE_SIZE)) & 0x3;
95}
96
97/*
98 * Install a perf counter breakpoint.
99 *
100 * We seek a free debug address register and use it for this
101 * breakpoint. Eventually we enable it in the debug control register.
102 *
103 * Atomic: we hold the counter->ctx->lock and we only handle variables
104 * and registers local to this cpu.
105 */
106int arch_install_hw_breakpoint(struct perf_event *bp)
107{
108 struct arch_hw_breakpoint *info = counter_arch_bp(bp);
109 unsigned long *dr7;
110 int i;
111
112 for (i = 0; i < HBP_NUM; i++) {
113 struct perf_event **slot = &__get_cpu_var(bp_per_reg[i]);
114
115 if (!*slot) {
116 *slot = bp;
117 break;
118 }
119 }
120
121 if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot"))
122 return -EBUSY;
123
124 set_debugreg(info->address, i);
125 __get_cpu_var(cpu_debugreg[i]) = info->address;
126
127 dr7 = &__get_cpu_var(cpu_dr7);
128 *dr7 |= encode_dr7(i, info->len, info->type);
129
130 set_debugreg(*dr7, 7);
131
132 return 0;
133}
134
135/*
136 * Uninstall the breakpoint contained in the given counter.
137 *
138 * First we search the debug address register it uses and then we disable
139 * it.
140 *
141 * Atomic: we hold the counter->ctx->lock and we only handle variables
142 * and registers local to this cpu.
143 */
144void arch_uninstall_hw_breakpoint(struct perf_event *bp)
145{
146 struct arch_hw_breakpoint *info = counter_arch_bp(bp);
147 unsigned long *dr7;
148 int i;
149
150 for (i = 0; i < HBP_NUM; i++) {
151 struct perf_event **slot = &__get_cpu_var(bp_per_reg[i]);
152
153 if (*slot == bp) {
154 *slot = NULL;
155 break;
156 }
157 }
158
159 if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot"))
160 return;
161
162 dr7 = &__get_cpu_var(cpu_dr7);
163 *dr7 &= ~__encode_dr7(i, info->len, info->type);
164
165 set_debugreg(*dr7, 7);
166}
167
168static int get_hbp_len(u8 hbp_len)
169{
170 unsigned int len_in_bytes = 0;
171
172 switch (hbp_len) {
173 case X86_BREAKPOINT_LEN_1:
174 len_in_bytes = 1;
175 break;
176 case X86_BREAKPOINT_LEN_2:
177 len_in_bytes = 2;
178 break;
179 case X86_BREAKPOINT_LEN_4:
180 len_in_bytes = 4;
181 break;
182#ifdef CONFIG_X86_64
183 case X86_BREAKPOINT_LEN_8:
184 len_in_bytes = 8;
185 break;
186#endif
187 }
188 return len_in_bytes;
189}
190
191/*
192 * Check for virtual address in user space.
193 */
194int arch_check_va_in_userspace(unsigned long va, u8 hbp_len)
195{
196 unsigned int len;
197
198 len = get_hbp_len(hbp_len);
199
200 return (va <= TASK_SIZE - len);
201}
202
203/*
204 * Check for virtual address in kernel space.
205 */
206static int arch_check_va_in_kernelspace(unsigned long va, u8 hbp_len)
207{
208 unsigned int len;
209
210 len = get_hbp_len(hbp_len);
211
212 return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE);
213}
214
215int arch_bp_generic_fields(int x86_len, int x86_type,
216 int *gen_len, int *gen_type)
217{
218 /* Len */
219 switch (x86_len) {
220 case X86_BREAKPOINT_LEN_1:
221 *gen_len = HW_BREAKPOINT_LEN_1;
222 break;
223 case X86_BREAKPOINT_LEN_2:
224 *gen_len = HW_BREAKPOINT_LEN_2;
225 break;
226 case X86_BREAKPOINT_LEN_4:
227 *gen_len = HW_BREAKPOINT_LEN_4;
228 break;
229#ifdef CONFIG_X86_64
230 case X86_BREAKPOINT_LEN_8:
231 *gen_len = HW_BREAKPOINT_LEN_8;
232 break;
233#endif
234 default:
235 return -EINVAL;
236 }
237
238 /* Type */
239 switch (x86_type) {
240 case X86_BREAKPOINT_EXECUTE:
241 *gen_type = HW_BREAKPOINT_X;
242 break;
243 case X86_BREAKPOINT_WRITE:
244 *gen_type = HW_BREAKPOINT_W;
245 break;
246 case X86_BREAKPOINT_RW:
247 *gen_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R;
248 break;
249 default:
250 return -EINVAL;
251 }
252
253 return 0;
254}
255
256
257static int arch_build_bp_info(struct perf_event *bp)
258{
259 struct arch_hw_breakpoint *info = counter_arch_bp(bp);
260
261 info->address = bp->attr.bp_addr;
262
263 /* Len */
264 switch (bp->attr.bp_len) {
265 case HW_BREAKPOINT_LEN_1:
266 info->len = X86_BREAKPOINT_LEN_1;
267 break;
268 case HW_BREAKPOINT_LEN_2:
269 info->len = X86_BREAKPOINT_LEN_2;
270 break;
271 case HW_BREAKPOINT_LEN_4:
272 info->len = X86_BREAKPOINT_LEN_4;
273 break;
274#ifdef CONFIG_X86_64
275 case HW_BREAKPOINT_LEN_8:
276 info->len = X86_BREAKPOINT_LEN_8;
277 break;
278#endif
279 default:
280 return -EINVAL;
281 }
282
283 /* Type */
284 switch (bp->attr.bp_type) {
285 case HW_BREAKPOINT_W:
286 info->type = X86_BREAKPOINT_WRITE;
287 break;
288 case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
289 info->type = X86_BREAKPOINT_RW;
290 break;
291 case HW_BREAKPOINT_X:
292 info->type = X86_BREAKPOINT_EXECUTE;
293 break;
294 default:
295 return -EINVAL;
296 }
297
298 return 0;
299}
300/*
301 * Validate the arch-specific HW Breakpoint register settings
302 */
303int arch_validate_hwbkpt_settings(struct perf_event *bp,
304 struct task_struct *tsk)
305{
306 struct arch_hw_breakpoint *info = counter_arch_bp(bp);
307 unsigned int align;
308 int ret;
309
310
311 ret = arch_build_bp_info(bp);
312 if (ret)
313 return ret;
314
315 ret = -EINVAL;
316
317 if (info->type == X86_BREAKPOINT_EXECUTE)
318 /*
319 * Ptrace-refactoring code
320 * For now, we'll allow instruction breakpoint only for user-space
321 * addresses
322 */
323 if ((!arch_check_va_in_userspace(info->address, info->len)) &&
324 info->len != X86_BREAKPOINT_EXECUTE)
325 return ret;
326
327 switch (info->len) {
328 case X86_BREAKPOINT_LEN_1:
329 align = 0;
330 break;
331 case X86_BREAKPOINT_LEN_2:
332 align = 1;
333 break;
334 case X86_BREAKPOINT_LEN_4:
335 align = 3;
336 break;
337#ifdef CONFIG_X86_64
338 case X86_BREAKPOINT_LEN_8:
339 align = 7;
340 break;
341#endif
342 default:
343 return ret;
344 }
345
346 /*
347 * Check that the low-order bits of the address are appropriate
348 * for the alignment implied by len.
349 */
350 if (info->address & align)
351 return -EINVAL;
352
353 /* Check that the virtual address is in the proper range */
354 if (tsk) {
355 if (!arch_check_va_in_userspace(info->address, info->len))
356 return -EFAULT;
357 } else {
358 if (!arch_check_va_in_kernelspace(info->address, info->len))
359 return -EFAULT;
360 }
361
362 return 0;
363}
364
365/*
366 * Dump the debug register contents to the user.
367 * We can't dump our per cpu values because it
368 * may contain cpu wide breakpoint, something that
369 * doesn't belong to the current task.
370 *
371 * TODO: include non-ptrace user breakpoints (perf)
372 */
373void aout_dump_debugregs(struct user *dump)
374{
375 int i;
376 int dr7 = 0;
377 struct perf_event *bp;
378 struct arch_hw_breakpoint *info;
379 struct thread_struct *thread = &current->thread;
380
381 for (i = 0; i < HBP_NUM; i++) {
382 bp = thread->ptrace_bps[i];
383
384 if (bp && !bp->attr.disabled) {
385 dump->u_debugreg[i] = bp->attr.bp_addr;
386 info = counter_arch_bp(bp);
387 dr7 |= encode_dr7(i, info->len, info->type);
388 } else {
389 dump->u_debugreg[i] = 0;
390 }
391 }
392
393 dump->u_debugreg[4] = 0;
394 dump->u_debugreg[5] = 0;
395 dump->u_debugreg[6] = current->thread.debugreg6;
396
397 dump->u_debugreg[7] = dr7;
398}
399EXPORT_SYMBOL_GPL(aout_dump_debugregs);
400
401/*
402 * Release the user breakpoints used by ptrace
403 */
404void flush_ptrace_hw_breakpoint(struct task_struct *tsk)
405{
406 int i;
407 struct thread_struct *t = &tsk->thread;
408
409 for (i = 0; i < HBP_NUM; i++) {
410 unregister_hw_breakpoint(t->ptrace_bps[i]);
411 t->ptrace_bps[i] = NULL;
412 }
413}
414
415void hw_breakpoint_restore(void)
416{
417 set_debugreg(__get_cpu_var(cpu_debugreg[0]), 0);
418 set_debugreg(__get_cpu_var(cpu_debugreg[1]), 1);
419 set_debugreg(__get_cpu_var(cpu_debugreg[2]), 2);
420 set_debugreg(__get_cpu_var(cpu_debugreg[3]), 3);
421 set_debugreg(current->thread.debugreg6, 6);
422 set_debugreg(__get_cpu_var(cpu_dr7), 7);
423}
424EXPORT_SYMBOL_GPL(hw_breakpoint_restore);
425
426/*
427 * Handle debug exception notifications.
428 *
429 * Return value is either NOTIFY_STOP or NOTIFY_DONE as explained below.
430 *
431 * NOTIFY_DONE returned if one of the following conditions is true.
432 * i) When the causative address is from user-space and the exception
433 * is a valid one, i.e. not triggered as a result of lazy debug register
434 * switching
435 * ii) When there are more bits than trap<n> set in DR6 register (such
436 * as BD, BS or BT) indicating that more than one debug condition is
437 * met and requires some more action in do_debug().
438 *
439 * NOTIFY_STOP returned for all other cases
440 *
441 */
442static int __kprobes hw_breakpoint_handler(struct die_args *args)
443{
444 int i, cpu, rc = NOTIFY_STOP;
445 struct perf_event *bp;
446 unsigned long dr7, dr6;
447 unsigned long *dr6_p;
448
449 /* The DR6 value is pointed by args->err */
450 dr6_p = (unsigned long *)ERR_PTR(args->err);
451 dr6 = *dr6_p;
452
453 /* Do an early return if no trap bits are set in DR6 */
454 if ((dr6 & DR_TRAP_BITS) == 0)
455 return NOTIFY_DONE;
456
457 get_debugreg(dr7, 7);
458 /* Disable breakpoints during exception handling */
459 set_debugreg(0UL, 7);
460 /*
461 * Assert that local interrupts are disabled
462 * Reset the DRn bits in the virtualized register value.
463 * The ptrace trigger routine will add in whatever is needed.
464 */
465 current->thread.debugreg6 &= ~DR_TRAP_BITS;
466 cpu = get_cpu();
467
468 /* Handle all the breakpoints that were triggered */
469 for (i = 0; i < HBP_NUM; ++i) {
470 if (likely(!(dr6 & (DR_TRAP0 << i))))
471 continue;
472
473 /*
474 * The counter may be concurrently released but that can only
475 * occur from a call_rcu() path. We can then safely fetch
476 * the breakpoint, use its callback, touch its counter
477 * while we are in an rcu_read_lock() path.
478 */
479 rcu_read_lock();
480
481 bp = per_cpu(bp_per_reg[i], cpu);
482 /*
483 * Reset the 'i'th TRAP bit in dr6 to denote completion of
484 * exception handling
485 */
486 (*dr6_p) &= ~(DR_TRAP0 << i);
487 /*
488 * bp can be NULL due to lazy debug register switching
489 * or due to concurrent perf counter removing.
490 */
491 if (!bp) {
492 rcu_read_unlock();
493 break;
494 }
495
496 perf_bp_event(bp, args->regs);
497
498 rcu_read_unlock();
499 }
500 /*
501 * Further processing in do_debug() is needed for a) user-space
502 * breakpoints (to generate signals) and b) when the system has
503 * taken exception due to multiple causes
504 */
505 if ((current->thread.debugreg6 & DR_TRAP_BITS) ||
506 (dr6 & (~DR_TRAP_BITS)))
507 rc = NOTIFY_DONE;
508
509 set_debugreg(dr7, 7);
510 put_cpu();
511
512 return rc;
513}
514
515/*
516 * Handle debug exception notifications.
517 */
518int __kprobes hw_breakpoint_exceptions_notify(
519 struct notifier_block *unused, unsigned long val, void *data)
520{
521 if (val != DIE_DEBUG)
522 return NOTIFY_DONE;
523
524 return hw_breakpoint_handler(data);
525}
526
/*
 * PMU read hook for breakpoint events. Currently an empty placeholder:
 * nothing is read or updated for @bp.
 */
void hw_breakpoint_pmu_read(struct perf_event *bp)
{
	/* TODO */
}
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index f2f8540a7f3d..54c31c285488 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -8,6 +8,7 @@
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/regset.h> 9#include <linux/regset.h>
10#include <linux/sched.h> 10#include <linux/sched.h>
11#include <linux/slab.h>
11 12
12#include <asm/sigcontext.h> 13#include <asm/sigcontext.h>
13#include <asm/processor.h> 14#include <asm/processor.h>
@@ -164,6 +165,11 @@ int init_fpu(struct task_struct *tsk)
164 return 0; 165 return 0;
165} 166}
166 167
168/*
169 * The xstateregs_active() routine is the same as the fpregs_active() routine,
170 * as the "regset->n" for the xstate regset will be updated based on the feature
171 * capabilities supported by the xsave.
172 */
167int fpregs_active(struct task_struct *target, const struct user_regset *regset) 173int fpregs_active(struct task_struct *target, const struct user_regset *regset)
168{ 174{
169 return tsk_used_math(target) ? regset->n : 0; 175 return tsk_used_math(target) ? regset->n : 0;
@@ -204,8 +210,6 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
204 if (ret) 210 if (ret)
205 return ret; 211 return ret;
206 212
207 set_stopped_child_used_math(target);
208
209 ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, 213 ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
210 &target->thread.xstate->fxsave, 0, -1); 214 &target->thread.xstate->fxsave, 0, -1);
211 215
@@ -224,6 +228,68 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
224 return ret; 228 return ret;
225} 229}
226 230
231int xstateregs_get(struct task_struct *target, const struct user_regset *regset,
232 unsigned int pos, unsigned int count,
233 void *kbuf, void __user *ubuf)
234{
235 int ret;
236
237 if (!cpu_has_xsave)
238 return -ENODEV;
239
240 ret = init_fpu(target);
241 if (ret)
242 return ret;
243
244 /*
245 * Copy the 48bytes defined by the software first into the xstate
246 * memory layout in the thread struct, so that we can copy the entire
247 * xstateregs to the user using one user_regset_copyout().
248 */
249 memcpy(&target->thread.xstate->fxsave.sw_reserved,
250 xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes));
251
252 /*
253 * Copy the xstate memory layout.
254 */
255 ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
256 &target->thread.xstate->xsave, 0, -1);
257 return ret;
258}
259
260int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
261 unsigned int pos, unsigned int count,
262 const void *kbuf, const void __user *ubuf)
263{
264 int ret;
265 struct xsave_hdr_struct *xsave_hdr;
266
267 if (!cpu_has_xsave)
268 return -ENODEV;
269
270 ret = init_fpu(target);
271 if (ret)
272 return ret;
273
274 ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
275 &target->thread.xstate->xsave, 0, -1);
276
277 /*
278 * mxcsr reserved bits must be masked to zero for security reasons.
279 */
280 target->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask;
281
282 xsave_hdr = &target->thread.xstate->xsave.xsave_hdr;
283
284 xsave_hdr->xstate_bv &= pcntxt_mask;
285 /*
286 * These bits must be zero.
287 */
288 xsave_hdr->reserved1[0] = xsave_hdr->reserved1[1] = 0;
289
290 return ret;
291}
292
227#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION 293#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
228 294
229/* 295/*
@@ -404,8 +470,6 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
404 if (ret) 470 if (ret)
405 return ret; 471 return ret;
406 472
407 set_stopped_child_used_math(target);
408
409 if (!HAVE_HWFP) 473 if (!HAVE_HWFP)
410 return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf); 474 return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf);
411 475
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index df89102bef80..7c9f02c130f3 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -5,7 +5,6 @@
5#include <linux/ioport.h> 5#include <linux/ioport.h>
6#include <linux/interrupt.h> 6#include <linux/interrupt.h>
7#include <linux/timex.h> 7#include <linux/timex.h>
8#include <linux/slab.h>
9#include <linux/random.h> 8#include <linux/random.h>
10#include <linux/init.h> 9#include <linux/init.h>
11#include <linux/kernel_stat.h> 10#include <linux/kernel_stat.h>
@@ -32,8 +31,14 @@
32 */ 31 */
33 32
34static int i8259A_auto_eoi; 33static int i8259A_auto_eoi;
35DEFINE_SPINLOCK(i8259A_lock); 34DEFINE_RAW_SPINLOCK(i8259A_lock);
36static void mask_and_ack_8259A(unsigned int); 35static void mask_and_ack_8259A(unsigned int);
36static void mask_8259A(void);
37static void unmask_8259A(void);
38static void disable_8259A_irq(unsigned int irq);
39static void enable_8259A_irq(unsigned int irq);
40static void init_8259A(int auto_eoi);
41static int i8259A_irq_pending(unsigned int irq);
37 42
38struct irq_chip i8259A_chip = { 43struct irq_chip i8259A_chip = {
39 .name = "XT-PIC", 44 .name = "XT-PIC",
@@ -63,51 +68,51 @@ unsigned int cached_irq_mask = 0xffff;
63 */ 68 */
64unsigned long io_apic_irqs; 69unsigned long io_apic_irqs;
65 70
66void disable_8259A_irq(unsigned int irq) 71static void disable_8259A_irq(unsigned int irq)
67{ 72{
68 unsigned int mask = 1 << irq; 73 unsigned int mask = 1 << irq;
69 unsigned long flags; 74 unsigned long flags;
70 75
71 spin_lock_irqsave(&i8259A_lock, flags); 76 raw_spin_lock_irqsave(&i8259A_lock, flags);
72 cached_irq_mask |= mask; 77 cached_irq_mask |= mask;
73 if (irq & 8) 78 if (irq & 8)
74 outb(cached_slave_mask, PIC_SLAVE_IMR); 79 outb(cached_slave_mask, PIC_SLAVE_IMR);
75 else 80 else
76 outb(cached_master_mask, PIC_MASTER_IMR); 81 outb(cached_master_mask, PIC_MASTER_IMR);
77 spin_unlock_irqrestore(&i8259A_lock, flags); 82 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
78} 83}
79 84
80void enable_8259A_irq(unsigned int irq) 85static void enable_8259A_irq(unsigned int irq)
81{ 86{
82 unsigned int mask = ~(1 << irq); 87 unsigned int mask = ~(1 << irq);
83 unsigned long flags; 88 unsigned long flags;
84 89
85 spin_lock_irqsave(&i8259A_lock, flags); 90 raw_spin_lock_irqsave(&i8259A_lock, flags);
86 cached_irq_mask &= mask; 91 cached_irq_mask &= mask;
87 if (irq & 8) 92 if (irq & 8)
88 outb(cached_slave_mask, PIC_SLAVE_IMR); 93 outb(cached_slave_mask, PIC_SLAVE_IMR);
89 else 94 else
90 outb(cached_master_mask, PIC_MASTER_IMR); 95 outb(cached_master_mask, PIC_MASTER_IMR);
91 spin_unlock_irqrestore(&i8259A_lock, flags); 96 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
92} 97}
93 98
94int i8259A_irq_pending(unsigned int irq) 99static int i8259A_irq_pending(unsigned int irq)
95{ 100{
96 unsigned int mask = 1<<irq; 101 unsigned int mask = 1<<irq;
97 unsigned long flags; 102 unsigned long flags;
98 int ret; 103 int ret;
99 104
100 spin_lock_irqsave(&i8259A_lock, flags); 105 raw_spin_lock_irqsave(&i8259A_lock, flags);
101 if (irq < 8) 106 if (irq < 8)
102 ret = inb(PIC_MASTER_CMD) & mask; 107 ret = inb(PIC_MASTER_CMD) & mask;
103 else 108 else
104 ret = inb(PIC_SLAVE_CMD) & (mask >> 8); 109 ret = inb(PIC_SLAVE_CMD) & (mask >> 8);
105 spin_unlock_irqrestore(&i8259A_lock, flags); 110 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
106 111
107 return ret; 112 return ret;
108} 113}
109 114
110void make_8259A_irq(unsigned int irq) 115static void make_8259A_irq(unsigned int irq)
111{ 116{
112 disable_irq_nosync(irq); 117 disable_irq_nosync(irq);
113 io_apic_irqs &= ~(1<<irq); 118 io_apic_irqs &= ~(1<<irq);
@@ -150,7 +155,7 @@ static void mask_and_ack_8259A(unsigned int irq)
150 unsigned int irqmask = 1 << irq; 155 unsigned int irqmask = 1 << irq;
151 unsigned long flags; 156 unsigned long flags;
152 157
153 spin_lock_irqsave(&i8259A_lock, flags); 158 raw_spin_lock_irqsave(&i8259A_lock, flags);
154 /* 159 /*
155 * Lightweight spurious IRQ detection. We do not want 160 * Lightweight spurious IRQ detection. We do not want
156 * to overdo spurious IRQ handling - it's usually a sign 161 * to overdo spurious IRQ handling - it's usually a sign
@@ -183,7 +188,7 @@ handle_real_irq:
183 outb(cached_master_mask, PIC_MASTER_IMR); 188 outb(cached_master_mask, PIC_MASTER_IMR);
184 outb(0x60+irq, PIC_MASTER_CMD); /* 'Specific EOI to master */ 189 outb(0x60+irq, PIC_MASTER_CMD); /* 'Specific EOI to master */
185 } 190 }
186 spin_unlock_irqrestore(&i8259A_lock, flags); 191 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
187 return; 192 return;
188 193
189spurious_8259A_irq: 194spurious_8259A_irq:
@@ -281,37 +286,37 @@ static int __init i8259A_init_sysfs(void)
281 286
282device_initcall(i8259A_init_sysfs); 287device_initcall(i8259A_init_sysfs);
283 288
284void mask_8259A(void) 289static void mask_8259A(void)
285{ 290{
286 unsigned long flags; 291 unsigned long flags;
287 292
288 spin_lock_irqsave(&i8259A_lock, flags); 293 raw_spin_lock_irqsave(&i8259A_lock, flags);
289 294
290 outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ 295 outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
291 outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ 296 outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */
292 297
293 spin_unlock_irqrestore(&i8259A_lock, flags); 298 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
294} 299}
295 300
296void unmask_8259A(void) 301static void unmask_8259A(void)
297{ 302{
298 unsigned long flags; 303 unsigned long flags;
299 304
300 spin_lock_irqsave(&i8259A_lock, flags); 305 raw_spin_lock_irqsave(&i8259A_lock, flags);
301 306
302 outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */ 307 outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */
303 outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */ 308 outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */
304 309
305 spin_unlock_irqrestore(&i8259A_lock, flags); 310 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
306} 311}
307 312
308void init_8259A(int auto_eoi) 313static void init_8259A(int auto_eoi)
309{ 314{
310 unsigned long flags; 315 unsigned long flags;
311 316
312 i8259A_auto_eoi = auto_eoi; 317 i8259A_auto_eoi = auto_eoi;
313 318
314 spin_lock_irqsave(&i8259A_lock, flags); 319 raw_spin_lock_irqsave(&i8259A_lock, flags);
315 320
316 outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ 321 outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
317 outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ 322 outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */
@@ -356,5 +361,49 @@ void init_8259A(int auto_eoi)
356 outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */ 361 outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */
357 outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */ 362 outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */
358 363
359 spin_unlock_irqrestore(&i8259A_lock, flags); 364 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
360} 365}
366
367/*
368 * make i8259 a driver so that we can select pic functions at run time. the goal
369 * is to make x86 binary compatible among pc compatible and non-pc compatible
370 * platforms, such as x86 MID.
371 */
372
/*
 * Do-nothing callbacks, one per hook signature, used to fill the
 * dummy_pic_chip and null_legacy_pic tables. (The stray ';' that used to
 * follow each body was an empty file-scope declaration and is gone.)
 */
static void legacy_pic_noop(void) { }
static void legacy_pic_uint_noop(unsigned int unused) { }
static void legacy_pic_int_noop(int unused) { }
376
377static struct irq_chip dummy_pic_chip = {
378 .name = "dummy pic",
379 .mask = legacy_pic_uint_noop,
380 .unmask = legacy_pic_uint_noop,
381 .disable = legacy_pic_uint_noop,
382 .mask_ack = legacy_pic_uint_noop,
383};
/* irq_pending hook for the null PIC: no irq is ever reported pending. */
static int legacy_pic_irq_pending_noop(unsigned int irq)
{
	(void)irq;

	return 0;
}
388
389struct legacy_pic null_legacy_pic = {
390 .nr_legacy_irqs = 0,
391 .chip = &dummy_pic_chip,
392 .mask_all = legacy_pic_noop,
393 .restore_mask = legacy_pic_noop,
394 .init = legacy_pic_int_noop,
395 .irq_pending = legacy_pic_irq_pending_noop,
396 .make_irq = legacy_pic_uint_noop,
397};
398
399struct legacy_pic default_legacy_pic = {
400 .nr_legacy_irqs = NR_IRQS_LEGACY,
401 .chip = &i8259A_chip,
402 .mask_all = mask_8259A,
403 .restore_mask = unmask_8259A,
404 .init = init_8259A,
405 .irq_pending = i8259A_irq_pending,
406 .make_irq = make_8259A_irq,
407};
408
409struct legacy_pic *legacy_pic = &default_legacy_pic;
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index 99c4d308f16b..8eec0ec59af2 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -103,9 +103,10 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
103 * on system-call entry - see also fork() and the signal handling 103 * on system-call entry - see also fork() and the signal handling
104 * code. 104 * code.
105 */ 105 */
106static int do_iopl(unsigned int level, struct pt_regs *regs) 106long sys_iopl(unsigned int level, struct pt_regs *regs)
107{ 107{
108 unsigned int old = (regs->flags >> 12) & 3; 108 unsigned int old = (regs->flags >> 12) & 3;
109 struct thread_struct *t = &current->thread;
109 110
110 if (level > 3) 111 if (level > 3)
111 return -EINVAL; 112 return -EINVAL;
@@ -115,29 +116,8 @@ static int do_iopl(unsigned int level, struct pt_regs *regs)
115 return -EPERM; 116 return -EPERM;
116 } 117 }
117 regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) | (level << 12); 118 regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) | (level << 12);
118
119 return 0;
120}
121
122#ifdef CONFIG_X86_32
123long sys_iopl(struct pt_regs *regs)
124{
125 unsigned int level = regs->bx;
126 struct thread_struct *t = &current->thread;
127 int rc;
128
129 rc = do_iopl(level, regs);
130 if (rc < 0)
131 goto out;
132
133 t->iopl = level << 12; 119 t->iopl = level << 12;
134 set_iopl_mask(t->iopl); 120 set_iopl_mask(t->iopl);
135out: 121
136 return rc; 122 return 0;
137}
138#else
139asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs)
140{
141 return do_iopl(level, regs);
142} 123}
143#endif
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 04bbd5278568..91fd0c70a18a 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -18,7 +18,7 @@
18atomic_t irq_err_count; 18atomic_t irq_err_count;
19 19
20/* Function pointer for generic interrupt vector handling */ 20/* Function pointer for generic interrupt vector handling */
21void (*generic_interrupt_extension)(void) = NULL; 21void (*x86_platform_ipi_callback)(void) = NULL;
22 22
23/* 23/*
24 * 'what should we do if we get a hw irq event on an illegal vector'. 24 * 'what should we do if we get a hw irq event on an illegal vector'.
@@ -72,10 +72,10 @@ static int show_other_interrupts(struct seq_file *p, int prec)
72 seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs); 72 seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs);
73 seq_printf(p, " Performance pending work\n"); 73 seq_printf(p, " Performance pending work\n");
74#endif 74#endif
75 if (generic_interrupt_extension) { 75 if (x86_platform_ipi_callback) {
76 seq_printf(p, "%*s: ", prec, "PLT"); 76 seq_printf(p, "%*s: ", prec, "PLT");
77 for_each_online_cpu(j) 77 for_each_online_cpu(j)
78 seq_printf(p, "%10u ", irq_stats(j)->generic_irqs); 78 seq_printf(p, "%10u ", irq_stats(j)->x86_platform_ipis);
79 seq_printf(p, " Platform interrupts\n"); 79 seq_printf(p, " Platform interrupts\n");
80 } 80 }
81#ifdef CONFIG_SMP 81#ifdef CONFIG_SMP
@@ -92,17 +92,17 @@ static int show_other_interrupts(struct seq_file *p, int prec)
92 seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count); 92 seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count);
93 seq_printf(p, " TLB shootdowns\n"); 93 seq_printf(p, " TLB shootdowns\n");
94#endif 94#endif
95#ifdef CONFIG_X86_MCE 95#ifdef CONFIG_X86_THERMAL_VECTOR
96 seq_printf(p, "%*s: ", prec, "TRM"); 96 seq_printf(p, "%*s: ", prec, "TRM");
97 for_each_online_cpu(j) 97 for_each_online_cpu(j)
98 seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count); 98 seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count);
99 seq_printf(p, " Thermal event interrupts\n"); 99 seq_printf(p, " Thermal event interrupts\n");
100# ifdef CONFIG_X86_MCE_THRESHOLD 100#endif
101#ifdef CONFIG_X86_MCE_THRESHOLD
101 seq_printf(p, "%*s: ", prec, "THR"); 102 seq_printf(p, "%*s: ", prec, "THR");
102 for_each_online_cpu(j) 103 for_each_online_cpu(j)
103 seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count); 104 seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count);
104 seq_printf(p, " Threshold APIC interrupts\n"); 105 seq_printf(p, " Threshold APIC interrupts\n");
105# endif
106#endif 106#endif
107#ifdef CONFIG_X86_MCE 107#ifdef CONFIG_X86_MCE
108 seq_printf(p, "%*s: ", prec, "MCE"); 108 seq_printf(p, "%*s: ", prec, "MCE");
@@ -149,7 +149,7 @@ int show_interrupts(struct seq_file *p, void *v)
149 if (!desc) 149 if (!desc)
150 return 0; 150 return 0;
151 151
152 spin_lock_irqsave(&desc->lock, flags); 152 raw_spin_lock_irqsave(&desc->lock, flags);
153 for_each_online_cpu(j) 153 for_each_online_cpu(j)
154 any_count |= kstat_irqs_cpu(i, j); 154 any_count |= kstat_irqs_cpu(i, j);
155 action = desc->action; 155 action = desc->action;
@@ -170,7 +170,7 @@ int show_interrupts(struct seq_file *p, void *v)
170 170
171 seq_putc(p, '\n'); 171 seq_putc(p, '\n');
172out: 172out:
173 spin_unlock_irqrestore(&desc->lock, flags); 173 raw_spin_unlock_irqrestore(&desc->lock, flags);
174 return 0; 174 return 0;
175} 175}
176 176
@@ -187,18 +187,18 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
187 sum += irq_stats(cpu)->apic_perf_irqs; 187 sum += irq_stats(cpu)->apic_perf_irqs;
188 sum += irq_stats(cpu)->apic_pending_irqs; 188 sum += irq_stats(cpu)->apic_pending_irqs;
189#endif 189#endif
190 if (generic_interrupt_extension) 190 if (x86_platform_ipi_callback)
191 sum += irq_stats(cpu)->generic_irqs; 191 sum += irq_stats(cpu)->x86_platform_ipis;
192#ifdef CONFIG_SMP 192#ifdef CONFIG_SMP
193 sum += irq_stats(cpu)->irq_resched_count; 193 sum += irq_stats(cpu)->irq_resched_count;
194 sum += irq_stats(cpu)->irq_call_count; 194 sum += irq_stats(cpu)->irq_call_count;
195 sum += irq_stats(cpu)->irq_tlb_count; 195 sum += irq_stats(cpu)->irq_tlb_count;
196#endif 196#endif
197#ifdef CONFIG_X86_MCE 197#ifdef CONFIG_X86_THERMAL_VECTOR
198 sum += irq_stats(cpu)->irq_thermal_count; 198 sum += irq_stats(cpu)->irq_thermal_count;
199# ifdef CONFIG_X86_MCE_THRESHOLD 199#endif
200#ifdef CONFIG_X86_MCE_THRESHOLD
200 sum += irq_stats(cpu)->irq_threshold_count; 201 sum += irq_stats(cpu)->irq_threshold_count;
201# endif
202#endif 202#endif
203#ifdef CONFIG_X86_MCE 203#ifdef CONFIG_X86_MCE
204 sum += per_cpu(mce_exception_count, cpu); 204 sum += per_cpu(mce_exception_count, cpu);
@@ -251,9 +251,9 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
251} 251}
252 252
253/* 253/*
254 * Handler for GENERIC_INTERRUPT_VECTOR. 254 * Handler for X86_PLATFORM_IPI_VECTOR.
255 */ 255 */
256void smp_generic_interrupt(struct pt_regs *regs) 256void smp_x86_platform_ipi(struct pt_regs *regs)
257{ 257{
258 struct pt_regs *old_regs = set_irq_regs(regs); 258 struct pt_regs *old_regs = set_irq_regs(regs);
259 259
@@ -263,10 +263,10 @@ void smp_generic_interrupt(struct pt_regs *regs)
263 263
264 irq_enter(); 264 irq_enter();
265 265
266 inc_irq_stat(generic_irqs); 266 inc_irq_stat(x86_platform_ipis);
267 267
268 if (generic_interrupt_extension) 268 if (x86_platform_ipi_callback)
269 generic_interrupt_extension(); 269 x86_platform_ipi_callback();
270 270
271 irq_exit(); 271 irq_exit();
272 272
@@ -274,3 +274,93 @@ void smp_generic_interrupt(struct pt_regs *regs)
274} 274}
275 275
276EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq); 276EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq);
277
278#ifdef CONFIG_HOTPLUG_CPU
279/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
280void fixup_irqs(void)
281{
282 unsigned int irq, vector;
283 static int warned;
284 struct irq_desc *desc;
285
286 for_each_irq_desc(irq, desc) {
287 int break_affinity = 0;
288 int set_affinity = 1;
289 const struct cpumask *affinity;
290
291 if (!desc)
292 continue;
293 if (irq == 2)
294 continue;
295
296 /* interrupts are disabled at this point */
297 raw_spin_lock(&desc->lock);
298
299 affinity = desc->affinity;
300 if (!irq_has_action(irq) ||
301 cpumask_equal(affinity, cpu_online_mask)) {
302 raw_spin_unlock(&desc->lock);
303 continue;
304 }
305
306 /*
307 * Complete the irq move. This cpu is going down and for
308 * non intr-remapping case, we can't wait till this interrupt
309 * arrives at this cpu before completing the irq move.
310 */
311 irq_force_complete_move(irq);
312
313 if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
314 break_affinity = 1;
315 affinity = cpu_all_mask;
316 }
317
318 if (!(desc->status & IRQ_MOVE_PCNTXT) && desc->chip->mask)
319 desc->chip->mask(irq);
320
321 if (desc->chip->set_affinity)
322 desc->chip->set_affinity(irq, affinity);
323 else if (!(warned++))
324 set_affinity = 0;
325
326 if (!(desc->status & IRQ_MOVE_PCNTXT) && desc->chip->unmask)
327 desc->chip->unmask(irq);
328
329 raw_spin_unlock(&desc->lock);
330
331 if (break_affinity && set_affinity)
332 printk("Broke affinity for irq %i\n", irq);
333 else if (!set_affinity)
334 printk("Cannot set affinity for irq %i\n", irq);
335 }
336
337 /*
338 * We can remove mdelay() and then send spurious interrupts to
339 * new cpu targets for all the irqs that were handled previously by
340 * this cpu. While it works, I have seen spurious interrupt messages
341 * (nothing wrong but still...).
342 *
343 * So for now, retain mdelay(1) and check the IRR and then send those
344 * interrupts to new targets as this cpu is already offlined...
345 */
346 mdelay(1);
347
348 for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
349 unsigned int irr;
350
351 if (__get_cpu_var(vector_irq)[vector] < 0)
352 continue;
353
354 irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
355 if (irr & (1 << (vector % 32))) {
356 irq = __get_cpu_var(vector_irq)[vector];
357
358 desc = irq_to_desc(irq);
359 raw_spin_lock(&desc->lock);
360 if (desc->chip->retrigger)
361 desc->chip->retrigger(irq);
362 raw_spin_unlock(&desc->lock);
363 }
364 }
365}
366#endif
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 7d35d0fe2329..10709f29d166 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -211,48 +211,3 @@ bool handle_irq(unsigned irq, struct pt_regs *regs)
211 211
212 return true; 212 return true;
213} 213}
214
215#ifdef CONFIG_HOTPLUG_CPU
216
217/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
218void fixup_irqs(void)
219{
220 unsigned int irq;
221 struct irq_desc *desc;
222
223 for_each_irq_desc(irq, desc) {
224 const struct cpumask *affinity;
225
226 if (!desc)
227 continue;
228 if (irq == 2)
229 continue;
230
231 affinity = desc->affinity;
232 if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
233 printk("Breaking affinity for irq %i\n", irq);
234 affinity = cpu_all_mask;
235 }
236 if (desc->chip->set_affinity)
237 desc->chip->set_affinity(irq, affinity);
238 else if (desc->action)
239 printk_once("Cannot set affinity for irq %i\n", irq);
240 }
241
242#if 0
243 barrier();
244 /* Ingo Molnar says: "after the IO-APIC masks have been redirected
245 [note the nop - the interrupt-enable boundary on x86 is two
246 instructions from sti] - to flush out pending hardirqs and
247 IPIs. After this point nothing is supposed to reach this CPU." */
248 __asm__ __volatile__("sti; nop; cli");
249 barrier();
250#else
251 /* That doesn't seem sufficient. Give it 1ms. */
252 local_irq_enable();
253 mdelay(1);
254 local_irq_disable();
255#endif
256}
257#endif
258
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 977d8b43a0dd..acf8fbf8fbda 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -62,64 +62,6 @@ bool handle_irq(unsigned irq, struct pt_regs *regs)
62 return true; 62 return true;
63} 63}
64 64
65#ifdef CONFIG_HOTPLUG_CPU
66/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
67void fixup_irqs(void)
68{
69 unsigned int irq;
70 static int warned;
71 struct irq_desc *desc;
72
73 for_each_irq_desc(irq, desc) {
74 int break_affinity = 0;
75 int set_affinity = 1;
76 const struct cpumask *affinity;
77
78 if (!desc)
79 continue;
80 if (irq == 2)
81 continue;
82
83 /* interrupt's are disabled at this point */
84 spin_lock(&desc->lock);
85
86 affinity = desc->affinity;
87 if (!irq_has_action(irq) ||
88 cpumask_equal(affinity, cpu_online_mask)) {
89 spin_unlock(&desc->lock);
90 continue;
91 }
92
93 if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
94 break_affinity = 1;
95 affinity = cpu_all_mask;
96 }
97
98 if (desc->chip->mask)
99 desc->chip->mask(irq);
100
101 if (desc->chip->set_affinity)
102 desc->chip->set_affinity(irq, affinity);
103 else if (!(warned++))
104 set_affinity = 0;
105
106 if (desc->chip->unmask)
107 desc->chip->unmask(irq);
108
109 spin_unlock(&desc->lock);
110
111 if (break_affinity && set_affinity)
112 printk("Broke affinity for irq %i\n", irq);
113 else if (!set_affinity)
114 printk("Cannot set affinity for irq %i\n", irq);
115 }
116
117 /* That doesn't seem sufficient. Give it 1ms. */
118 local_irq_enable();
119 mdelay(1);
120 local_irq_disable();
121}
122#endif
123 65
124extern void call_softirq(void); 66extern void call_softirq(void);
125 67
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index f5fa64c0b37e..a760ce1a2c0d 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -5,7 +5,6 @@
5#include <linux/ioport.h> 5#include <linux/ioport.h>
6#include <linux/interrupt.h> 6#include <linux/interrupt.h>
7#include <linux/timex.h> 7#include <linux/timex.h>
8#include <linux/slab.h>
9#include <linux/random.h> 8#include <linux/random.h>
10#include <linux/kprobes.h> 9#include <linux/kprobes.h>
11#include <linux/init.h> 10#include <linux/init.h>
@@ -84,24 +83,7 @@ static struct irqaction irq2 = {
84}; 83};
85 84
86DEFINE_PER_CPU(vector_irq_t, vector_irq) = { 85DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
87 [0 ... IRQ0_VECTOR - 1] = -1, 86 [0 ... NR_VECTORS - 1] = -1,
88 [IRQ0_VECTOR] = 0,
89 [IRQ1_VECTOR] = 1,
90 [IRQ2_VECTOR] = 2,
91 [IRQ3_VECTOR] = 3,
92 [IRQ4_VECTOR] = 4,
93 [IRQ5_VECTOR] = 5,
94 [IRQ6_VECTOR] = 6,
95 [IRQ7_VECTOR] = 7,
96 [IRQ8_VECTOR] = 8,
97 [IRQ9_VECTOR] = 9,
98 [IRQ10_VECTOR] = 10,
99 [IRQ11_VECTOR] = 11,
100 [IRQ12_VECTOR] = 12,
101 [IRQ13_VECTOR] = 13,
102 [IRQ14_VECTOR] = 14,
103 [IRQ15_VECTOR] = 15,
104 [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
105}; 87};
106 88
107int vector_used_by_percpu_irq(unsigned int vector) 89int vector_used_by_percpu_irq(unsigned int vector)
@@ -123,12 +105,12 @@ void __init init_ISA_irqs(void)
123#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) 105#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
124 init_bsp_APIC(); 106 init_bsp_APIC();
125#endif 107#endif
126 init_8259A(0); 108 legacy_pic->init(0);
127 109
128 /* 110 /*
129 * 16 old-style INTA-cycle interrupts: 111 * 16 old-style INTA-cycle interrupts:
130 */ 112 */
131 for (i = 0; i < NR_IRQS_LEGACY; i++) { 113 for (i = 0; i < legacy_pic->nr_legacy_irqs; i++) {
132 struct irq_desc *desc = irq_to_desc(i); 114 struct irq_desc *desc = irq_to_desc(i);
133 115
134 desc->status = IRQ_DISABLED; 116 desc->status = IRQ_DISABLED;
@@ -142,9 +124,44 @@ void __init init_ISA_irqs(void)
142 124
143void __init init_IRQ(void) 125void __init init_IRQ(void)
144{ 126{
127 int i;
128
129 /*
130 * On cpu 0, Assign IRQ0_VECTOR..IRQ15_VECTOR's to IRQ 0..15.
131 * If these IRQ's are handled by legacy interrupt-controllers like PIC,
132 * then this configuration will likely be static after the boot. If
133 * these IRQ's are handled by more modern controllers like IO-APIC,
134 * then this vector space can be freed and re-used dynamically as the
135 * irq's migrate etc.
136 */
137 for (i = 0; i < legacy_pic->nr_legacy_irqs; i++)
138 per_cpu(vector_irq, 0)[IRQ0_VECTOR + i] = i;
139
145 x86_init.irqs.intr_init(); 140 x86_init.irqs.intr_init();
146} 141}
147 142
143/*
144 * Setup the vector to irq mappings.
145 */
146void setup_vector_irq(int cpu)
147{
148#ifndef CONFIG_X86_IO_APIC
149 int irq;
150
151 /*
152 * On most of the platforms, legacy PIC delivers the interrupts on the
153 * boot cpu. But there are certain platforms where PIC interrupts are
154 * delivered to multiple cpu's. If the legacy IRQ is handled by the
155 * legacy PIC, for the new cpu that is coming online, setup the static
156 * legacy vector to irq mapping:
157 */
158 for (irq = 0; irq < legacy_pic->nr_legacy_irqs; irq++)
159 per_cpu(vector_irq, cpu)[IRQ0_VECTOR + irq] = irq;
160#endif
161
162 __setup_vector_irq(cpu);
163}
164
148static void __init smp_intr_init(void) 165static void __init smp_intr_init(void)
149{ 166{
150#ifdef CONFIG_SMP 167#ifdef CONFIG_SMP
@@ -203,8 +220,8 @@ static void __init apic_intr_init(void)
203 /* self generated IPI for local APIC timer */ 220 /* self generated IPI for local APIC timer */
204 alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); 221 alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
205 222
206 /* generic IPI for platform specific use */ 223 /* IPI for X86 platform specific use */
207 alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt); 224 alloc_intr_gate(X86_PLATFORM_IPI_VECTOR, x86_platform_ipi);
208 225
209 /* IPI vectors for APIC spurious and error interrupts */ 226 /* IPI vectors for APIC spurious and error interrupts */
210 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); 227 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
diff --git a/arch/x86/kernel/k8.c b/arch/x86/kernel/k8.c
index cbc4332a77b2..0f7bc20cfcde 100644
--- a/arch/x86/kernel/k8.c
+++ b/arch/x86/kernel/k8.c
@@ -2,8 +2,8 @@
2 * Shared support code for AMD K8 northbridges and derivates. 2 * Shared support code for AMD K8 northbridges and derivates.
3 * Copyright 2006 Andi Kleen, SUSE Labs. Subject to GPLv2. 3 * Copyright 2006 Andi Kleen, SUSE Labs. Subject to GPLv2.
4 */ 4 */
5#include <linux/gfp.h>
6#include <linux/types.h> 5#include <linux/types.h>
6#include <linux/slab.h>
7#include <linux/init.h> 7#include <linux/init.h>
8#include <linux/errno.h> 8#include <linux/errno.h>
9#include <linux/module.h> 9#include <linux/module.h>
@@ -121,3 +121,17 @@ void k8_flush_garts(void)
121} 121}
122EXPORT_SYMBOL_GPL(k8_flush_garts); 122EXPORT_SYMBOL_GPL(k8_flush_garts);
123 123
124static __init int init_k8_nbs(void)
125{
126 int err = 0;
127
128 err = cache_k8_northbridges();
129
130 if (err < 0)
131 printk(KERN_NOTICE "K8 NB: Cannot enumerate AMD northbridges.\n");
132
133 return err;
134}
135
136/* This has to go after the PCI subsystem */
137fs_initcall(init_k8_nbs);
diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c
index e444357375ce..8afd9f321f10 100644
--- a/arch/x86/kernel/kdebugfs.c
+++ b/arch/x86/kernel/kdebugfs.c
@@ -9,6 +9,7 @@
9#include <linux/debugfs.h> 9#include <linux/debugfs.h>
10#include <linux/uaccess.h> 10#include <linux/uaccess.h>
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/slab.h>
12#include <linux/init.h> 13#include <linux/init.h>
13#include <linux/stat.h> 14#include <linux/stat.h>
14#include <linux/io.h> 15#include <linux/io.h>
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 8d82a77a3f3b..b2258ca91003 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -42,7 +42,9 @@
42#include <linux/init.h> 42#include <linux/init.h>
43#include <linux/smp.h> 43#include <linux/smp.h>
44#include <linux/nmi.h> 44#include <linux/nmi.h>
45#include <linux/hw_breakpoint.h>
45 46
47#include <asm/debugreg.h>
46#include <asm/apicdef.h> 48#include <asm/apicdef.h>
47#include <asm/system.h> 49#include <asm/system.h>
48 50
@@ -85,10 +87,15 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
85 gdb_regs[GDB_DS] = regs->ds; 87 gdb_regs[GDB_DS] = regs->ds;
86 gdb_regs[GDB_ES] = regs->es; 88 gdb_regs[GDB_ES] = regs->es;
87 gdb_regs[GDB_CS] = regs->cs; 89 gdb_regs[GDB_CS] = regs->cs;
88 gdb_regs[GDB_SS] = __KERNEL_DS;
89 gdb_regs[GDB_FS] = 0xFFFF; 90 gdb_regs[GDB_FS] = 0xFFFF;
90 gdb_regs[GDB_GS] = 0xFFFF; 91 gdb_regs[GDB_GS] = 0xFFFF;
91 gdb_regs[GDB_SP] = (int)&regs->sp; 92 if (user_mode_vm(regs)) {
93 gdb_regs[GDB_SS] = regs->ss;
94 gdb_regs[GDB_SP] = regs->sp;
95 } else {
96 gdb_regs[GDB_SS] = __KERNEL_DS;
97 gdb_regs[GDB_SP] = kernel_stack_pointer(regs);
98 }
92#else 99#else
93 gdb_regs[GDB_R8] = regs->r8; 100 gdb_regs[GDB_R8] = regs->r8;
94 gdb_regs[GDB_R9] = regs->r9; 101 gdb_regs[GDB_R9] = regs->r9;
@@ -101,7 +108,7 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
101 gdb_regs32[GDB_PS] = regs->flags; 108 gdb_regs32[GDB_PS] = regs->flags;
102 gdb_regs32[GDB_CS] = regs->cs; 109 gdb_regs32[GDB_CS] = regs->cs;
103 gdb_regs32[GDB_SS] = regs->ss; 110 gdb_regs32[GDB_SS] = regs->ss;
104 gdb_regs[GDB_SP] = regs->sp; 111 gdb_regs[GDB_SP] = kernel_stack_pointer(regs);
105#endif 112#endif
106} 113}
107 114
@@ -198,41 +205,81 @@ void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
198 205
199static struct hw_breakpoint { 206static struct hw_breakpoint {
200 unsigned enabled; 207 unsigned enabled;
201 unsigned type;
202 unsigned len;
203 unsigned long addr; 208 unsigned long addr;
209 int len;
210 int type;
211 struct perf_event **pev;
204} breakinfo[4]; 212} breakinfo[4];
205 213
206static void kgdb_correct_hw_break(void) 214static void kgdb_correct_hw_break(void)
207{ 215{
208 unsigned long dr7;
209 int correctit = 0;
210 int breakbit;
211 int breakno; 216 int breakno;
212 217
213 get_debugreg(dr7, 7);
214 for (breakno = 0; breakno < 4; breakno++) { 218 for (breakno = 0; breakno < 4; breakno++) {
215 breakbit = 2 << (breakno << 1); 219 struct perf_event *bp;
216 if (!(dr7 & breakbit) && breakinfo[breakno].enabled) { 220 struct arch_hw_breakpoint *info;
217 correctit = 1; 221 int val;
218 dr7 |= breakbit; 222 int cpu = raw_smp_processor_id();
219 dr7 &= ~(0xf0000 << (breakno << 2)); 223 if (!breakinfo[breakno].enabled)
220 dr7 |= ((breakinfo[breakno].len << 2) | 224 continue;
221 breakinfo[breakno].type) << 225 bp = *per_cpu_ptr(breakinfo[breakno].pev, cpu);
222 ((breakno << 2) + 16); 226 info = counter_arch_bp(bp);
223 if (breakno >= 0 && breakno <= 3) 227 if (bp->attr.disabled != 1)
224 set_debugreg(breakinfo[breakno].addr, breakno); 228 continue;
225 229 bp->attr.bp_addr = breakinfo[breakno].addr;
226 } else { 230 bp->attr.bp_len = breakinfo[breakno].len;
227 if ((dr7 & breakbit) && !breakinfo[breakno].enabled) { 231 bp->attr.bp_type = breakinfo[breakno].type;
228 correctit = 1; 232 info->address = breakinfo[breakno].addr;
229 dr7 &= ~breakbit; 233 info->len = breakinfo[breakno].len;
230 dr7 &= ~(0xf0000 << (breakno << 2)); 234 info->type = breakinfo[breakno].type;
231 } 235 val = arch_install_hw_breakpoint(bp);
232 } 236 if (!val)
237 bp->attr.disabled = 0;
233 } 238 }
234 if (correctit) 239 hw_breakpoint_restore();
235 set_debugreg(dr7, 7); 240}
241
242static int hw_break_reserve_slot(int breakno)
243{
244 int cpu;
245 int cnt = 0;
246 struct perf_event **pevent;
247
248 for_each_online_cpu(cpu) {
249 cnt++;
250 pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu);
251 if (dbg_reserve_bp_slot(*pevent))
252 goto fail;
253 }
254
255 return 0;
256
257fail:
258 for_each_online_cpu(cpu) {
259 cnt--;
260 if (!cnt)
261 break;
262 pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu);
263 dbg_release_bp_slot(*pevent);
264 }
265 return -1;
266}
267
268static int hw_break_release_slot(int breakno)
269{
270 struct perf_event **pevent;
271 int cpu;
272
273 for_each_online_cpu(cpu) {
274 pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu);
275 if (dbg_release_bp_slot(*pevent))
276 /*
277 * The debugger is responsible for handling the retry on
278 * remove failure.
279 */
280 return -1;
281 }
282 return 0;
236} 283}
237 284
238static int 285static int
@@ -246,6 +293,10 @@ kgdb_remove_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
246 if (i == 4) 293 if (i == 4)
247 return -1; 294 return -1;
248 295
296 if (hw_break_release_slot(i)) {
297 printk(KERN_ERR "Cannot remove hw breakpoint at %lx\n", addr);
298 return -1;
299 }
249 breakinfo[i].enabled = 0; 300 breakinfo[i].enabled = 0;
250 301
251 return 0; 302 return 0;
@@ -254,15 +305,23 @@ kgdb_remove_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
254static void kgdb_remove_all_hw_break(void) 305static void kgdb_remove_all_hw_break(void)
255{ 306{
256 int i; 307 int i;
308 int cpu = raw_smp_processor_id();
309 struct perf_event *bp;
257 310
258 for (i = 0; i < 4; i++) 311 for (i = 0; i < 4; i++) {
259 memset(&breakinfo[i], 0, sizeof(struct hw_breakpoint)); 312 if (!breakinfo[i].enabled)
313 continue;
314 bp = *per_cpu_ptr(breakinfo[i].pev, cpu);
315 if (bp->attr.disabled == 1)
316 continue;
317 arch_uninstall_hw_breakpoint(bp);
318 bp->attr.disabled = 1;
319 }
260} 320}
261 321
262static int 322static int
263kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) 323kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
264{ 324{
265 unsigned type;
266 int i; 325 int i;
267 326
268 for (i = 0; i < 4; i++) 327 for (i = 0; i < 4; i++)
@@ -273,27 +332,42 @@ kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
273 332
274 switch (bptype) { 333 switch (bptype) {
275 case BP_HARDWARE_BREAKPOINT: 334 case BP_HARDWARE_BREAKPOINT:
276 type = 0; 335 len = 1;
277 len = 1; 336 breakinfo[i].type = X86_BREAKPOINT_EXECUTE;
278 break; 337 break;
279 case BP_WRITE_WATCHPOINT: 338 case BP_WRITE_WATCHPOINT:
280 type = 1; 339 breakinfo[i].type = X86_BREAKPOINT_WRITE;
281 break; 340 break;
282 case BP_ACCESS_WATCHPOINT: 341 case BP_ACCESS_WATCHPOINT:
283 type = 3; 342 breakinfo[i].type = X86_BREAKPOINT_RW;
284 break; 343 break;
285 default: 344 default:
286 return -1; 345 return -1;
287 } 346 }
288 347 switch (len) {
289 if (len == 1 || len == 2 || len == 4) 348 case 1:
290 breakinfo[i].len = len - 1; 349 breakinfo[i].len = X86_BREAKPOINT_LEN_1;
291 else 350 break;
351 case 2:
352 breakinfo[i].len = X86_BREAKPOINT_LEN_2;
353 break;
354 case 4:
355 breakinfo[i].len = X86_BREAKPOINT_LEN_4;
356 break;
357#ifdef CONFIG_X86_64
358 case 8:
359 breakinfo[i].len = X86_BREAKPOINT_LEN_8;
360 break;
361#endif
362 default:
292 return -1; 363 return -1;
293 364 }
294 breakinfo[i].enabled = 1;
295 breakinfo[i].addr = addr; 365 breakinfo[i].addr = addr;
296 breakinfo[i].type = type; 366 if (hw_break_reserve_slot(i)) {
367 breakinfo[i].addr = 0;
368 return -1;
369 }
370 breakinfo[i].enabled = 1;
297 371
298 return 0; 372 return 0;
299} 373}
@@ -308,8 +382,21 @@ kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
308 */ 382 */
309void kgdb_disable_hw_debug(struct pt_regs *regs) 383void kgdb_disable_hw_debug(struct pt_regs *regs)
310{ 384{
385 int i;
386 int cpu = raw_smp_processor_id();
387 struct perf_event *bp;
388
311 /* Disable hardware debugging while we are in kgdb: */ 389 /* Disable hardware debugging while we are in kgdb: */
312 set_debugreg(0UL, 7); 390 set_debugreg(0UL, 7);
391 for (i = 0; i < 4; i++) {
392 if (!breakinfo[i].enabled)
393 continue;
394 bp = *per_cpu_ptr(breakinfo[i].pev, cpu);
395 if (bp->attr.disabled == 1)
396 continue;
397 arch_uninstall_hw_breakpoint(bp);
398 bp->attr.disabled = 1;
399 }
313} 400}
314 401
315/** 402/**
@@ -373,7 +460,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
373 struct pt_regs *linux_regs) 460 struct pt_regs *linux_regs)
374{ 461{
375 unsigned long addr; 462 unsigned long addr;
376 unsigned long dr6;
377 char *ptr; 463 char *ptr;
378 int newPC; 464 int newPC;
379 465
@@ -395,25 +481,10 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
395 /* set the trace bit if we're stepping */ 481 /* set the trace bit if we're stepping */
396 if (remcomInBuffer[0] == 's') { 482 if (remcomInBuffer[0] == 's') {
397 linux_regs->flags |= X86_EFLAGS_TF; 483 linux_regs->flags |= X86_EFLAGS_TF;
398 kgdb_single_step = 1;
399 atomic_set(&kgdb_cpu_doing_single_step, 484 atomic_set(&kgdb_cpu_doing_single_step,
400 raw_smp_processor_id()); 485 raw_smp_processor_id());
401 } 486 }
402 487
403 get_debugreg(dr6, 6);
404 if (!(dr6 & 0x4000)) {
405 int breakno;
406
407 for (breakno = 0; breakno < 4; breakno++) {
408 if (dr6 & (1 << breakno) &&
409 breakinfo[breakno].type == 0) {
410 /* Set restore flag: */
411 linux_regs->flags |= X86_EFLAGS_RF;
412 break;
413 }
414 }
415 }
416 set_debugreg(0UL, 6);
417 kgdb_correct_hw_break(); 488 kgdb_correct_hw_break();
418 489
419 return 0; 490 return 0;
@@ -434,6 +505,11 @@ single_step_cont(struct pt_regs *regs, struct die_args *args)
434 "resuming...\n"); 505 "resuming...\n");
435 kgdb_arch_handle_exception(args->trapnr, args->signr, 506 kgdb_arch_handle_exception(args->trapnr, args->signr,
436 args->err, "c", "", regs); 507 args->err, "c", "", regs);
508 /*
509 * Reset the BS bit in dr6 (pointed to by args->err) to
510 * denote completion of processing
511 */
512 (*(unsigned long *)ERR_PTR(args->err)) &= ~DR_STEP;
437 513
438 return NOTIFY_STOP; 514 return NOTIFY_STOP;
439} 515}
@@ -476,8 +552,7 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd)
476 break; 552 break;
477 553
478 case DIE_DEBUG: 554 case DIE_DEBUG:
479 if (atomic_read(&kgdb_cpu_doing_single_step) == 555 if (atomic_read(&kgdb_cpu_doing_single_step) != -1) {
480 raw_smp_processor_id()) {
481 if (user_mode(regs)) 556 if (user_mode(regs))
482 return single_step_cont(regs, args); 557 return single_step_cont(regs, args);
483 break; 558 break;
@@ -530,7 +605,42 @@ static struct notifier_block kgdb_notifier = {
530 */ 605 */
531int kgdb_arch_init(void) 606int kgdb_arch_init(void)
532{ 607{
533 return register_die_notifier(&kgdb_notifier); 608 int i, cpu;
609 int ret;
610 struct perf_event_attr attr;
611 struct perf_event **pevent;
612
613 ret = register_die_notifier(&kgdb_notifier);
614 if (ret != 0)
615 return ret;
616 /*
617 * Pre-allocate the hw breakpoint structures in the non-atomic
618 * portion of kgdb because this operation requires mutexes to
619 * complete.
620 */
621 hw_breakpoint_init(&attr);
622 attr.bp_addr = (unsigned long)kgdb_arch_init;
623 attr.bp_len = HW_BREAKPOINT_LEN_1;
624 attr.bp_type = HW_BREAKPOINT_W;
625 attr.disabled = 1;
626 for (i = 0; i < 4; i++) {
627 breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL);
628 if (IS_ERR(breakinfo[i].pev)) {
629 printk(KERN_ERR "kgdb: Could not allocate hw breakpoints\n");
630 breakinfo[i].pev = NULL;
631 kgdb_arch_exit();
632 return -1;
633 }
634 for_each_online_cpu(cpu) {
635 pevent = per_cpu_ptr(breakinfo[i].pev, cpu);
636 pevent[0]->hw.sample_period = 1;
637 if (pevent[0]->destroy != NULL) {
638 pevent[0]->destroy = NULL;
639 release_bp_slot(*pevent);
640 }
641 }
642 }
643 return ret;
534} 644}
535 645
536/** 646/**
@@ -541,6 +651,13 @@ int kgdb_arch_init(void)
541 */ 651 */
542void kgdb_arch_exit(void) 652void kgdb_arch_exit(void)
543{ 653{
654 int i;
655 for (i = 0; i < 4; i++) {
656 if (breakinfo[i].pev) {
657 unregister_wide_hw_breakpoint(breakinfo[i].pev);
658 breakinfo[i].pev = NULL;
659 }
660 }
544 unregister_die_notifier(&kgdb_notifier); 661 unregister_die_notifier(&kgdb_notifier);
545} 662}
546 663
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 7b5169d2b000..1658efdfb4e5 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -48,31 +48,23 @@
48#include <linux/preempt.h> 48#include <linux/preempt.h>
49#include <linux/module.h> 49#include <linux/module.h>
50#include <linux/kdebug.h> 50#include <linux/kdebug.h>
51#include <linux/kallsyms.h>
52#include <linux/ftrace.h>
51 53
52#include <asm/cacheflush.h> 54#include <asm/cacheflush.h>
53#include <asm/desc.h> 55#include <asm/desc.h>
54#include <asm/pgtable.h> 56#include <asm/pgtable.h>
55#include <asm/uaccess.h> 57#include <asm/uaccess.h>
56#include <asm/alternative.h> 58#include <asm/alternative.h>
59#include <asm/insn.h>
60#include <asm/debugreg.h>
57 61
58void jprobe_return_end(void); 62void jprobe_return_end(void);
59 63
60DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; 64DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
61DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); 65DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
62 66
63#ifdef CONFIG_X86_64 67#define stack_addr(regs) ((unsigned long *)kernel_stack_pointer(regs))
64#define stack_addr(regs) ((unsigned long *)regs->sp)
65#else
66/*
67 * "&regs->sp" looks wrong, but it's correct for x86_32. x86_32 CPUs
68 * don't save the ss and esp registers if the CPU is already in kernel
69 * mode when it traps. So for kprobes, regs->sp and regs->ss are not
70 * the [nonexistent] saved stack pointer and ss register, but rather
71 * the top 8 bytes of the pre-int3 stack. So &regs->sp happens to
72 * point to the top of the pre-int3 stack.
73 */
74#define stack_addr(regs) ((unsigned long *)&regs->sp)
75#endif
76 68
77#define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\ 69#define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\
78 (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \ 70 (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \
@@ -106,50 +98,6 @@ static const u32 twobyte_is_boostable[256 / 32] = {
106 /* ----------------------------------------------- */ 98 /* ----------------------------------------------- */
107 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ 99 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
108}; 100};
109static const u32 onebyte_has_modrm[256 / 32] = {
110 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
111 /* ----------------------------------------------- */
112 W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 00 */
113 W(0x10, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 10 */
114 W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 20 */
115 W(0x30, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 30 */
116 W(0x40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 40 */
117 W(0x50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 50 */
118 W(0x60, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0) | /* 60 */
119 W(0x70, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 70 */
120 W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
121 W(0x90, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 90 */
122 W(0xa0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* a0 */
123 W(0xb0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* b0 */
124 W(0xc0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* c0 */
125 W(0xd0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
126 W(0xe0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* e0 */
127 W(0xf0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) /* f0 */
128 /* ----------------------------------------------- */
129 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
130};
131static const u32 twobyte_has_modrm[256 / 32] = {
132 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
133 /* ----------------------------------------------- */
134 W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1) | /* 0f */
135 W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0) , /* 1f */
136 W(0x20, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 2f */
137 W(0x30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 3f */
138 W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 4f */
139 W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 5f */
140 W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 6f */
141 W(0x70, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1) , /* 7f */
142 W(0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 8f */
143 W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 9f */
144 W(0xa0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1) | /* af */
145 W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1) , /* bf */
146 W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* cf */
147 W(0xd0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* df */
148 W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* ef */
149 W(0xf0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0) /* ff */
150 /* ----------------------------------------------- */
151 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
152};
153#undef W 101#undef W
154 102
155struct kretprobe_blackpoint kretprobe_blacklist[] = { 103struct kretprobe_blackpoint kretprobe_blacklist[] = {
@@ -159,16 +107,22 @@ struct kretprobe_blackpoint kretprobe_blacklist[] = {
159}; 107};
160const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist); 108const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist);
161 109
162/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/ 110static void __kprobes __synthesize_relative_insn(void *from, void *to, u8 op)
163static void __kprobes set_jmp_op(void *from, void *to)
164{ 111{
165 struct __arch_jmp_op { 112 struct __arch_relative_insn {
166 char op; 113 u8 op;
167 s32 raddr; 114 s32 raddr;
168 } __attribute__((packed)) * jop; 115 } __attribute__((packed)) *insn;
169 jop = (struct __arch_jmp_op *)from; 116
170 jop->raddr = (s32)((long)(to) - ((long)(from) + 5)); 117 insn = (struct __arch_relative_insn *)from;
171 jop->op = RELATIVEJUMP_INSTRUCTION; 118 insn->raddr = (s32)((long)(to) - ((long)(from) + 5));
119 insn->op = op;
120}
121
122/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/
123static void __kprobes synthesize_reljump(void *from, void *to)
124{
125 __synthesize_relative_insn(from, to, RELATIVEJUMP_OPCODE);
172} 126}
173 127
174/* 128/*
@@ -244,6 +198,75 @@ retry:
244 } 198 }
245} 199}
246 200
201/* Recover the probed instruction at addr for further analysis. */
202static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
203{
204 struct kprobe *kp;
205 kp = get_kprobe((void *)addr);
206 if (!kp)
207 return -EINVAL;
208
209 /*
210 * Basically, kp->ainsn.insn has an original instruction.
211 * However, RIP-relative instruction can not do single-stepping
212 * at different place, __copy_instruction() tweaks the displacement of
213 * that instruction. In that case, we can't recover the instruction
214 * from the kp->ainsn.insn.
215 *
216 * On the other hand, kp->opcode has a copy of the first byte of
217 * the probed instruction, which is overwritten by int3. And
218 * the instruction at kp->addr is not modified by kprobes except
219 * for the first byte, we can recover the original instruction
220 * from it and kp->opcode.
221 */
222 memcpy(buf, kp->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
223 buf[0] = kp->opcode;
224 return 0;
225}
226
227/* Dummy buffers for kallsyms_lookup */
228static char __dummy_buf[KSYM_NAME_LEN];
229
230/* Check if paddr is at an instruction boundary */
231static int __kprobes can_probe(unsigned long paddr)
232{
233 int ret;
234 unsigned long addr, offset = 0;
235 struct insn insn;
236 kprobe_opcode_t buf[MAX_INSN_SIZE];
237
238 if (!kallsyms_lookup(paddr, NULL, &offset, NULL, __dummy_buf))
239 return 0;
240
241 /* Decode instructions */
242 addr = paddr - offset;
243 while (addr < paddr) {
244 kernel_insn_init(&insn, (void *)addr);
245 insn_get_opcode(&insn);
246
247 /*
248 * Check if the instruction has been modified by another
249 * kprobe, in which case we replace the breakpoint by the
250 * original instruction in our buffer.
251 */
252 if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) {
253 ret = recover_probed_instruction(buf, addr);
254 if (ret)
255 /*
256 * Another debugging subsystem might insert
257 * this breakpoint. In that case, we can't
258 * recover it.
259 */
260 return 0;
261 kernel_insn_init(&insn, buf);
262 }
263 insn_get_length(&insn);
264 addr += insn.length;
265 }
266
267 return (addr == paddr);
268}
269
247/* 270/*
248 * Returns non-zero if opcode modifies the interrupt flag. 271 * Returns non-zero if opcode modifies the interrupt flag.
249 */ 272 */
@@ -268,86 +291,67 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
268} 291}
269 292
270/* 293/*
271 * Adjust the displacement if the instruction uses the %rip-relative 294 * Copy an instruction and adjust the displacement if the instruction
272 * addressing mode. 295 * uses the %rip-relative addressing mode.
273 * If it does, Return the address of the 32-bit displacement word. 296 * If it does, Return the address of the 32-bit displacement word.
274 * If not, return null. 297 * If not, return null.
275 * Only applicable to 64-bit x86. 298 * Only applicable to 64-bit x86.
276 */ 299 */
277static void __kprobes fix_riprel(struct kprobe *p) 300static int __kprobes __copy_instruction(u8 *dest, u8 *src, int recover)
278{ 301{
279#ifdef CONFIG_X86_64 302 struct insn insn;
280 u8 *insn = p->ainsn.insn; 303 int ret;
281 s64 disp; 304 kprobe_opcode_t buf[MAX_INSN_SIZE];
282 int need_modrm; 305
283 306 kernel_insn_init(&insn, src);
284 /* Skip legacy instruction prefixes. */ 307 if (recover) {
285 while (1) { 308 insn_get_opcode(&insn);
286 switch (*insn) { 309 if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) {
287 case 0x66: 310 ret = recover_probed_instruction(buf,
288 case 0x67: 311 (unsigned long)src);
289 case 0x2e: 312 if (ret)
290 case 0x3e: 313 return 0;
291 case 0x26: 314 kernel_insn_init(&insn, buf);
292 case 0x64:
293 case 0x65:
294 case 0x36:
295 case 0xf0:
296 case 0xf3:
297 case 0xf2:
298 ++insn;
299 continue;
300 } 315 }
301 break;
302 } 316 }
317 insn_get_length(&insn);
318 memcpy(dest, insn.kaddr, insn.length);
303 319
304 /* Skip REX instruction prefix. */ 320#ifdef CONFIG_X86_64
305 if (is_REX_prefix(insn)) 321 if (insn_rip_relative(&insn)) {
306 ++insn; 322 s64 newdisp;
307 323 u8 *disp;
308 if (*insn == 0x0f) { 324 kernel_insn_init(&insn, dest);
309 /* Two-byte opcode. */ 325 insn_get_displacement(&insn);
310 ++insn; 326 /*
311 need_modrm = test_bit(*insn, 327 * The copied instruction uses the %rip-relative addressing
312 (unsigned long *)twobyte_has_modrm); 328 * mode. Adjust the displacement for the difference between
313 } else 329 * the original location of this instruction and the location
314 /* One-byte opcode. */ 330 * of the copy that will actually be run. The tricky bit here
315 need_modrm = test_bit(*insn, 331 * is making sure that the sign extension happens correctly in
316 (unsigned long *)onebyte_has_modrm); 332 * this calculation, since we need a signed 32-bit result to
317 333 * be sign-extended to 64 bits when it's added to the %rip
318 if (need_modrm) { 334 * value and yield the same 64-bit result that the sign-
319 u8 modrm = *++insn; 335 * extension of the original signed 32-bit displacement would
320 if ((modrm & 0xc7) == 0x05) { 336 * have given.
321 /* %rip+disp32 addressing mode */ 337 */
322 /* Displacement follows ModRM byte. */ 338 newdisp = (u8 *) src + (s64) insn.displacement.value -
323 ++insn; 339 (u8 *) dest;
324 /* 340 BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check. */
325 * The copied instruction uses the %rip-relative 341 disp = (u8 *) dest + insn_offset_displacement(&insn);
326 * addressing mode. Adjust the displacement for the 342 *(s32 *) disp = (s32) newdisp;
327 * difference between the original location of this
328 * instruction and the location of the copy that will
329 * actually be run. The tricky bit here is making sure
330 * that the sign extension happens correctly in this
331 * calculation, since we need a signed 32-bit result to
332 * be sign-extended to 64 bits when it's added to the
333 * %rip value and yield the same 64-bit result that the
334 * sign-extension of the original signed 32-bit
335 * displacement would have given.
336 */
337 disp = (u8 *) p->addr + *((s32 *) insn) -
338 (u8 *) p->ainsn.insn;
339 BUG_ON((s64) (s32) disp != disp); /* Sanity check. */
340 *(s32 *)insn = (s32) disp;
341 }
342 } 343 }
343#endif 344#endif
345 return insn.length;
344} 346}
345 347
346static void __kprobes arch_copy_kprobe(struct kprobe *p) 348static void __kprobes arch_copy_kprobe(struct kprobe *p)
347{ 349{
348 memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); 350 /*
349 351 * Copy an instruction without recovering int3, because it will be
350 fix_riprel(p); 352 * put by another subsystem.
353 */
354 __copy_instruction(p->ainsn.insn, p->addr, 0);
351 355
352 if (can_boost(p->addr)) 356 if (can_boost(p->addr))
353 p->ainsn.boostable = 0; 357 p->ainsn.boostable = 0;
@@ -359,6 +363,11 @@ static void __kprobes arch_copy_kprobe(struct kprobe *p)
359 363
360int __kprobes arch_prepare_kprobe(struct kprobe *p) 364int __kprobes arch_prepare_kprobe(struct kprobe *p)
361{ 365{
366 if (alternatives_text_reserved(p->addr, p->addr))
367 return -EINVAL;
368
369 if (!can_probe((unsigned long)p->addr))
370 return -EILSEQ;
362 /* insn: must be on special executable page on x86. */ 371 /* insn: must be on special executable page on x86. */
363 p->ainsn.insn = get_insn_slot(); 372 p->ainsn.insn = get_insn_slot();
364 if (!p->ainsn.insn) 373 if (!p->ainsn.insn)
@@ -423,18 +432,6 @@ static void __kprobes restore_btf(void)
423 update_debugctlmsr(current->thread.debugctlmsr); 432 update_debugctlmsr(current->thread.debugctlmsr);
424} 433}
425 434
426static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
427{
428 clear_btf();
429 regs->flags |= X86_EFLAGS_TF;
430 regs->flags &= ~X86_EFLAGS_IF;
431 /* single step inline if the instruction is an int3 */
432 if (p->opcode == BREAKPOINT_INSTRUCTION)
433 regs->ip = (unsigned long)p->addr;
434 else
435 regs->ip = (unsigned long)p->ainsn.insn;
436}
437
438void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, 435void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
439 struct pt_regs *regs) 436 struct pt_regs *regs)
440{ 437{
@@ -446,20 +443,50 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
446 *sara = (unsigned long) &kretprobe_trampoline; 443 *sara = (unsigned long) &kretprobe_trampoline;
447} 444}
448 445
446#ifdef CONFIG_OPTPROBES
447static int __kprobes setup_detour_execution(struct kprobe *p,
448 struct pt_regs *regs,
449 int reenter);
450#else
451#define setup_detour_execution(p, regs, reenter) (0)
452#endif
453
449static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs, 454static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs,
450 struct kprobe_ctlblk *kcb) 455 struct kprobe_ctlblk *kcb, int reenter)
451{ 456{
452#if !defined(CONFIG_PREEMPT) || defined(CONFIG_FREEZER) 457 if (setup_detour_execution(p, regs, reenter))
458 return;
459
460#if !defined(CONFIG_PREEMPT)
453 if (p->ainsn.boostable == 1 && !p->post_handler) { 461 if (p->ainsn.boostable == 1 && !p->post_handler) {
454 /* Boost up -- we can execute copied instructions directly */ 462 /* Boost up -- we can execute copied instructions directly */
455 reset_current_kprobe(); 463 if (!reenter)
464 reset_current_kprobe();
465 /*
466 * Reentering boosted probe doesn't reset current_kprobe,
467 * nor set current_kprobe, because it doesn't use single
468 * stepping.
469 */
456 regs->ip = (unsigned long)p->ainsn.insn; 470 regs->ip = (unsigned long)p->ainsn.insn;
457 preempt_enable_no_resched(); 471 preempt_enable_no_resched();
458 return; 472 return;
459 } 473 }
460#endif 474#endif
461 prepare_singlestep(p, regs); 475 if (reenter) {
462 kcb->kprobe_status = KPROBE_HIT_SS; 476 save_previous_kprobe(kcb);
477 set_current_kprobe(p, regs, kcb);
478 kcb->kprobe_status = KPROBE_REENTER;
479 } else
480 kcb->kprobe_status = KPROBE_HIT_SS;
481 /* Prepare real single stepping */
482 clear_btf();
483 regs->flags |= X86_EFLAGS_TF;
484 regs->flags &= ~X86_EFLAGS_IF;
485 /* single step inline if the instruction is an int3 */
486 if (p->opcode == BREAKPOINT_INSTRUCTION)
487 regs->ip = (unsigned long)p->addr;
488 else
489 regs->ip = (unsigned long)p->ainsn.insn;
463} 490}
464 491
465/* 492/*
@@ -472,37 +499,21 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
472{ 499{
473 switch (kcb->kprobe_status) { 500 switch (kcb->kprobe_status) {
474 case KPROBE_HIT_SSDONE: 501 case KPROBE_HIT_SSDONE:
475#ifdef CONFIG_X86_64
476 /* TODO: Provide re-entrancy from post_kprobes_handler() and
477 * avoid exception stack corruption while single-stepping on
478 * the instruction of the new probe.
479 */
480 arch_disarm_kprobe(p);
481 regs->ip = (unsigned long)p->addr;
482 reset_current_kprobe();
483 preempt_enable_no_resched();
484 break;
485#endif
486 case KPROBE_HIT_ACTIVE: 502 case KPROBE_HIT_ACTIVE:
487 save_previous_kprobe(kcb);
488 set_current_kprobe(p, regs, kcb);
489 kprobes_inc_nmissed_count(p); 503 kprobes_inc_nmissed_count(p);
490 prepare_singlestep(p, regs); 504 setup_singlestep(p, regs, kcb, 1);
491 kcb->kprobe_status = KPROBE_REENTER;
492 break; 505 break;
493 case KPROBE_HIT_SS: 506 case KPROBE_HIT_SS:
494 if (p == kprobe_running()) { 507 /* A probe has been hit in the codepath leading up to, or just
495 regs->flags &= ~X86_EFLAGS_TF; 508 * after, single-stepping of a probed instruction. This entire
496 regs->flags |= kcb->kprobe_saved_flags; 509 * codepath should strictly reside in .kprobes.text section.
497 return 0; 510 * Raise a BUG or we'll continue in an endless reentering loop
498 } else { 511 * and eventually a stack overflow.
499 /* A probe has been hit in the codepath leading up 512 */
500 * to, or just after, single-stepping of a probed 513 printk(KERN_WARNING "Unrecoverable kprobe detected at %p.\n",
501 * instruction. This entire codepath should strictly 514 p->addr);
502 * reside in .kprobes.text section. Raise a warning 515 dump_kprobe(p);
503 * to highlight this peculiar case. 516 BUG();
504 */
505 }
506 default: 517 default:
507 /* impossible cases */ 518 /* impossible cases */
508 WARN_ON(1); 519 WARN_ON(1);
@@ -514,7 +525,7 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
514 525
515/* 526/*
516 * Interrupts are disabled on entry as trap3 is an interrupt gate and they 527 * Interrupts are disabled on entry as trap3 is an interrupt gate and they
517 * remain disabled thorough out this function. 528 * remain disabled throughout this function.
518 */ 529 */
519static int __kprobes kprobe_handler(struct pt_regs *regs) 530static int __kprobes kprobe_handler(struct pt_regs *regs)
520{ 531{
@@ -523,20 +534,6 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
523 struct kprobe_ctlblk *kcb; 534 struct kprobe_ctlblk *kcb;
524 535
525 addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t)); 536 addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t));
526 if (*addr != BREAKPOINT_INSTRUCTION) {
527 /*
528 * The breakpoint instruction was removed right
529 * after we hit it. Another cpu has removed
530 * either a probepoint or a debugger breakpoint
531 * at this address. In either case, no further
532 * handling of this interrupt is appropriate.
533 * Back up over the (now missing) int3 and run
534 * the original instruction.
535 */
536 regs->ip = (unsigned long)addr;
537 return 1;
538 }
539
540 /* 537 /*
541 * We don't want to be preempted for the entire 538 * We don't want to be preempted for the entire
542 * duration of kprobe processing. We conditionally 539 * duration of kprobe processing. We conditionally
@@ -565,13 +562,26 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
565 * more here. 562 * more here.
566 */ 563 */
567 if (!p->pre_handler || !p->pre_handler(p, regs)) 564 if (!p->pre_handler || !p->pre_handler(p, regs))
568 setup_singlestep(p, regs, kcb); 565 setup_singlestep(p, regs, kcb, 0);
569 return 1; 566 return 1;
570 } 567 }
568 } else if (*addr != BREAKPOINT_INSTRUCTION) {
569 /*
570 * The breakpoint instruction was removed right
571 * after we hit it. Another cpu has removed
572 * either a probepoint or a debugger breakpoint
573 * at this address. In either case, no further
574 * handling of this interrupt is appropriate.
575 * Back up over the (now missing) int3 and run
576 * the original instruction.
577 */
578 regs->ip = (unsigned long)addr;
579 preempt_enable_no_resched();
580 return 1;
571 } else if (kprobe_running()) { 581 } else if (kprobe_running()) {
572 p = __get_cpu_var(current_kprobe); 582 p = __get_cpu_var(current_kprobe);
573 if (p->break_handler && p->break_handler(p, regs)) { 583 if (p->break_handler && p->break_handler(p, regs)) {
574 setup_singlestep(p, regs, kcb); 584 setup_singlestep(p, regs, kcb, 0);
575 return 1; 585 return 1;
576 } 586 }
577 } /* else: not a kprobe fault; let the kernel handle it */ 587 } /* else: not a kprobe fault; let the kernel handle it */
@@ -580,6 +590,69 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
580 return 0; 590 return 0;
581} 591}
582 592
593#ifdef CONFIG_X86_64
594#define SAVE_REGS_STRING \
595 /* Skip cs, ip, orig_ax. */ \
596 " subq $24, %rsp\n" \
597 " pushq %rdi\n" \
598 " pushq %rsi\n" \
599 " pushq %rdx\n" \
600 " pushq %rcx\n" \
601 " pushq %rax\n" \
602 " pushq %r8\n" \
603 " pushq %r9\n" \
604 " pushq %r10\n" \
605 " pushq %r11\n" \
606 " pushq %rbx\n" \
607 " pushq %rbp\n" \
608 " pushq %r12\n" \
609 " pushq %r13\n" \
610 " pushq %r14\n" \
611 " pushq %r15\n"
612#define RESTORE_REGS_STRING \
613 " popq %r15\n" \
614 " popq %r14\n" \
615 " popq %r13\n" \
616 " popq %r12\n" \
617 " popq %rbp\n" \
618 " popq %rbx\n" \
619 " popq %r11\n" \
620 " popq %r10\n" \
621 " popq %r9\n" \
622 " popq %r8\n" \
623 " popq %rax\n" \
624 " popq %rcx\n" \
625 " popq %rdx\n" \
626 " popq %rsi\n" \
627 " popq %rdi\n" \
628 /* Skip orig_ax, ip, cs */ \
629 " addq $24, %rsp\n"
630#else
631#define SAVE_REGS_STRING \
632 /* Skip cs, ip, orig_ax and gs. */ \
633 " subl $16, %esp\n" \
634 " pushl %fs\n" \
635 " pushl %ds\n" \
636 " pushl %es\n" \
637 " pushl %eax\n" \
638 " pushl %ebp\n" \
639 " pushl %edi\n" \
640 " pushl %esi\n" \
641 " pushl %edx\n" \
642 " pushl %ecx\n" \
643 " pushl %ebx\n"
644#define RESTORE_REGS_STRING \
645 " popl %ebx\n" \
646 " popl %ecx\n" \
647 " popl %edx\n" \
648 " popl %esi\n" \
649 " popl %edi\n" \
650 " popl %ebp\n" \
651 " popl %eax\n" \
652 /* Skip ds, es, fs, gs, orig_ax, and ip. Note: don't pop cs here*/\
653 " addl $24, %esp\n"
654#endif
655
583/* 656/*
584 * When a retprobed function returns, this code saves registers and 657 * When a retprobed function returns, this code saves registers and
585 * calls trampoline_handler() runs, which calls the kretprobe's handler. 658 * calls trampoline_handler() runs, which calls the kretprobe's handler.
@@ -593,65 +666,16 @@ static void __used __kprobes kretprobe_trampoline_holder(void)
593 /* We don't bother saving the ss register */ 666 /* We don't bother saving the ss register */
594 " pushq %rsp\n" 667 " pushq %rsp\n"
595 " pushfq\n" 668 " pushfq\n"
596 /* 669 SAVE_REGS_STRING
597 * Skip cs, ip, orig_ax.
598 * trampoline_handler() will plug in these values
599 */
600 " subq $24, %rsp\n"
601 " pushq %rdi\n"
602 " pushq %rsi\n"
603 " pushq %rdx\n"
604 " pushq %rcx\n"
605 " pushq %rax\n"
606 " pushq %r8\n"
607 " pushq %r9\n"
608 " pushq %r10\n"
609 " pushq %r11\n"
610 " pushq %rbx\n"
611 " pushq %rbp\n"
612 " pushq %r12\n"
613 " pushq %r13\n"
614 " pushq %r14\n"
615 " pushq %r15\n"
616 " movq %rsp, %rdi\n" 670 " movq %rsp, %rdi\n"
617 " call trampoline_handler\n" 671 " call trampoline_handler\n"
618 /* Replace saved sp with true return address. */ 672 /* Replace saved sp with true return address. */
619 " movq %rax, 152(%rsp)\n" 673 " movq %rax, 152(%rsp)\n"
620 " popq %r15\n" 674 RESTORE_REGS_STRING
621 " popq %r14\n"
622 " popq %r13\n"
623 " popq %r12\n"
624 " popq %rbp\n"
625 " popq %rbx\n"
626 " popq %r11\n"
627 " popq %r10\n"
628 " popq %r9\n"
629 " popq %r8\n"
630 " popq %rax\n"
631 " popq %rcx\n"
632 " popq %rdx\n"
633 " popq %rsi\n"
634 " popq %rdi\n"
635 /* Skip orig_ax, ip, cs */
636 " addq $24, %rsp\n"
637 " popfq\n" 675 " popfq\n"
638#else 676#else
639 " pushf\n" 677 " pushf\n"
640 /* 678 SAVE_REGS_STRING
641 * Skip cs, ip, orig_ax and gs.
642 * trampoline_handler() will plug in these values
643 */
644 " subl $16, %esp\n"
645 " pushl %fs\n"
646 " pushl %es\n"
647 " pushl %ds\n"
648 " pushl %eax\n"
649 " pushl %ebp\n"
650 " pushl %edi\n"
651 " pushl %esi\n"
652 " pushl %edx\n"
653 " pushl %ecx\n"
654 " pushl %ebx\n"
655 " movl %esp, %eax\n" 679 " movl %esp, %eax\n"
656 " call trampoline_handler\n" 680 " call trampoline_handler\n"
657 /* Move flags to cs */ 681 /* Move flags to cs */
@@ -659,15 +683,7 @@ static void __used __kprobes kretprobe_trampoline_holder(void)
659 " movl %edx, 52(%esp)\n" 683 " movl %edx, 52(%esp)\n"
660 /* Replace saved flags with true return address. */ 684 /* Replace saved flags with true return address. */
661 " movl %eax, 56(%esp)\n" 685 " movl %eax, 56(%esp)\n"
662 " popl %ebx\n" 686 RESTORE_REGS_STRING
663 " popl %ecx\n"
664 " popl %edx\n"
665 " popl %esi\n"
666 " popl %edi\n"
667 " popl %ebp\n"
668 " popl %eax\n"
669 /* Skip ds, es, fs, gs, orig_ax and ip */
670 " addl $24, %esp\n"
671 " popf\n" 687 " popf\n"
672#endif 688#endif
673 " ret\n"); 689 " ret\n");
@@ -835,8 +851,8 @@ static void __kprobes resume_execution(struct kprobe *p,
835 * These instructions can be executed directly if it 851 * These instructions can be executed directly if it
836 * jumps back to correct address. 852 * jumps back to correct address.
837 */ 853 */
838 set_jmp_op((void *)regs->ip, 854 synthesize_reljump((void *)regs->ip,
839 (void *)orig_ip + (regs->ip - copy_ip)); 855 (void *)orig_ip + (regs->ip - copy_ip));
840 p->ainsn.boostable = 1; 856 p->ainsn.boostable = 1;
841 } else { 857 } else {
842 p->ainsn.boostable = -1; 858 p->ainsn.boostable = -1;
@@ -851,7 +867,7 @@ no_change:
851 867
852/* 868/*
853 * Interrupts are disabled on entry as trap1 is an interrupt gate and they 869 * Interrupts are disabled on entry as trap1 is an interrupt gate and they
854 * remain disabled thoroughout this function. 870 * remain disabled throughout this function.
855 */ 871 */
856static int __kprobes post_kprobe_handler(struct pt_regs *regs) 872static int __kprobes post_kprobe_handler(struct pt_regs *regs)
857{ 873{
@@ -967,8 +983,14 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
967 ret = NOTIFY_STOP; 983 ret = NOTIFY_STOP;
968 break; 984 break;
969 case DIE_DEBUG: 985 case DIE_DEBUG:
970 if (post_kprobe_handler(args->regs)) 986 if (post_kprobe_handler(args->regs)) {
987 /*
988 * Reset the BS bit in dr6 (pointed by args->err) to
989 * denote completion of processing
990 */
991 (*(unsigned long *)ERR_PTR(args->err)) &= ~DR_STEP;
971 ret = NOTIFY_STOP; 992 ret = NOTIFY_STOP;
993 }
972 break; 994 break;
973 case DIE_GPF: 995 case DIE_GPF:
974 /* 996 /*
@@ -1057,6 +1079,358 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
1057 return 0; 1079 return 0;
1058} 1080}
1059 1081
1082
1083#ifdef CONFIG_OPTPROBES
1084
1085/* Insert a call instruction at address 'from', which calls address 'to'.*/
1086static void __kprobes synthesize_relcall(void *from, void *to)
1087{
1088 __synthesize_relative_insn(from, to, RELATIVECALL_OPCODE);
1089}
1090
1091/* Insert a move instruction which sets a pointer to eax/rdi (1st arg). */
1092static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr,
1093 unsigned long val)
1094{
1095#ifdef CONFIG_X86_64
1096 *addr++ = 0x48;
1097 *addr++ = 0xbf;
1098#else
1099 *addr++ = 0xb8;
1100#endif
1101 *(unsigned long *)addr = val;
1102}
1103
1104void __kprobes kprobes_optinsn_template_holder(void)
1105{
1106 asm volatile (
1107 ".global optprobe_template_entry\n"
1108 "optprobe_template_entry: \n"
1109#ifdef CONFIG_X86_64
1110 /* We don't bother saving the ss register */
1111 " pushq %rsp\n"
1112 " pushfq\n"
1113 SAVE_REGS_STRING
1114 " movq %rsp, %rsi\n"
1115 ".global optprobe_template_val\n"
1116 "optprobe_template_val: \n"
1117 ASM_NOP5
1118 ASM_NOP5
1119 ".global optprobe_template_call\n"
1120 "optprobe_template_call: \n"
1121 ASM_NOP5
1122 /* Move flags to rsp */
1123 " movq 144(%rsp), %rdx\n"
1124 " movq %rdx, 152(%rsp)\n"
1125 RESTORE_REGS_STRING
1126 /* Skip flags entry */
1127 " addq $8, %rsp\n"
1128 " popfq\n"
1129#else /* CONFIG_X86_32 */
1130 " pushf\n"
1131 SAVE_REGS_STRING
1132 " movl %esp, %edx\n"
1133 ".global optprobe_template_val\n"
1134 "optprobe_template_val: \n"
1135 ASM_NOP5
1136 ".global optprobe_template_call\n"
1137 "optprobe_template_call: \n"
1138 ASM_NOP5
1139 RESTORE_REGS_STRING
1140 " addl $4, %esp\n" /* skip cs */
1141 " popf\n"
1142#endif
1143 ".global optprobe_template_end\n"
1144 "optprobe_template_end: \n");
1145}
1146
1147#define TMPL_MOVE_IDX \
1148 ((long)&optprobe_template_val - (long)&optprobe_template_entry)
1149#define TMPL_CALL_IDX \
1150 ((long)&optprobe_template_call - (long)&optprobe_template_entry)
1151#define TMPL_END_IDX \
1152 ((long)&optprobe_template_end - (long)&optprobe_template_entry)
1153
1154#define INT3_SIZE sizeof(kprobe_opcode_t)
1155
1156/* Optimized kprobe call back function: called from optinsn */
1157static void __kprobes optimized_callback(struct optimized_kprobe *op,
1158 struct pt_regs *regs)
1159{
1160 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
1161
1162 preempt_disable();
1163 if (kprobe_running()) {
1164 kprobes_inc_nmissed_count(&op->kp);
1165 } else {
1166 /* Save skipped registers */
1167#ifdef CONFIG_X86_64
1168 regs->cs = __KERNEL_CS;
1169#else
1170 regs->cs = __KERNEL_CS | get_kernel_rpl();
1171 regs->gs = 0;
1172#endif
1173 regs->ip = (unsigned long)op->kp.addr + INT3_SIZE;
1174 regs->orig_ax = ~0UL;
1175
1176 __get_cpu_var(current_kprobe) = &op->kp;
1177 kcb->kprobe_status = KPROBE_HIT_ACTIVE;
1178 opt_pre_handler(&op->kp, regs);
1179 __get_cpu_var(current_kprobe) = NULL;
1180 }
1181 preempt_enable_no_resched();
1182}
1183
1184static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src)
1185{
1186 int len = 0, ret;
1187
1188 while (len < RELATIVEJUMP_SIZE) {
1189 ret = __copy_instruction(dest + len, src + len, 1);
1190 if (!ret || !can_boost(dest + len))
1191 return -EINVAL;
1192 len += ret;
1193 }
1194 /* Check whether the address range is reserved */
1195 if (ftrace_text_reserved(src, src + len - 1) ||
1196 alternatives_text_reserved(src, src + len - 1))
1197 return -EBUSY;
1198
1199 return len;
1200}
1201
1202/* Check whether insn is indirect jump */
1203static int __kprobes insn_is_indirect_jump(struct insn *insn)
1204{
1205 return ((insn->opcode.bytes[0] == 0xff &&
1206 (X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */
1207 insn->opcode.bytes[0] == 0xea); /* Segment based jump */
1208}
1209
1210/* Check whether insn jumps into specified address range */
1211static int insn_jump_into_range(struct insn *insn, unsigned long start, int len)
1212{
1213 unsigned long target = 0;
1214
1215 switch (insn->opcode.bytes[0]) {
1216 case 0xe0: /* loopne */
1217 case 0xe1: /* loope */
1218 case 0xe2: /* loop */
1219 case 0xe3: /* jcxz */
1220 case 0xe9: /* near relative jump */
1221 case 0xeb: /* short relative jump */
1222 break;
1223 case 0x0f:
1224 if ((insn->opcode.bytes[1] & 0xf0) == 0x80) /* jcc near */
1225 break;
1226 return 0;
1227 default:
1228 if ((insn->opcode.bytes[0] & 0xf0) == 0x70) /* jcc short */
1229 break;
1230 return 0;
1231 }
1232 target = (unsigned long)insn->next_byte + insn->immediate.value;
1233
1234 return (start <= target && target <= start + len);
1235}
1236
1237/* Decode whole function to ensure any instructions don't jump into target */
1238static int __kprobes can_optimize(unsigned long paddr)
1239{
1240 int ret;
1241 unsigned long addr, size = 0, offset = 0;
1242 struct insn insn;
1243 kprobe_opcode_t buf[MAX_INSN_SIZE];
1244 /* Dummy buffers for lookup_symbol_attrs */
1245 static char __dummy_buf[KSYM_NAME_LEN];
1246
1247 /* Lookup symbol including addr */
1248 if (!kallsyms_lookup(paddr, &size, &offset, NULL, __dummy_buf))
1249 return 0;
1250
1251 /* Check there is enough space for a relative jump. */
1252 if (size - offset < RELATIVEJUMP_SIZE)
1253 return 0;
1254
1255 /* Decode instructions */
1256 addr = paddr - offset;
1257 while (addr < paddr - offset + size) { /* Decode until function end */
1258 if (search_exception_tables(addr))
1259 /*
1260 * Since some fixup code will jumps into this function,
1261 * we can't optimize kprobe in this function.
1262 */
1263 return 0;
1264 kernel_insn_init(&insn, (void *)addr);
1265 insn_get_opcode(&insn);
1266 if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) {
1267 ret = recover_probed_instruction(buf, addr);
1268 if (ret)
1269 return 0;
1270 kernel_insn_init(&insn, buf);
1271 }
1272 insn_get_length(&insn);
1273 /* Recover address */
1274 insn.kaddr = (void *)addr;
1275 insn.next_byte = (void *)(addr + insn.length);
1276 /* Check any instructions don't jump into target */
1277 if (insn_is_indirect_jump(&insn) ||
1278 insn_jump_into_range(&insn, paddr + INT3_SIZE,
1279 RELATIVE_ADDR_SIZE))
1280 return 0;
1281 addr += insn.length;
1282 }
1283
1284 return 1;
1285}
1286
1287/* Check optimized_kprobe can actually be optimized. */
1288int __kprobes arch_check_optimized_kprobe(struct optimized_kprobe *op)
1289{
1290 int i;
1291 struct kprobe *p;
1292
1293 for (i = 1; i < op->optinsn.size; i++) {
1294 p = get_kprobe(op->kp.addr + i);
1295 if (p && !kprobe_disabled(p))
1296 return -EEXIST;
1297 }
1298
1299 return 0;
1300}
1301
1302/* Check the addr is within the optimized instructions. */
1303int __kprobes arch_within_optimized_kprobe(struct optimized_kprobe *op,
1304 unsigned long addr)
1305{
1306 return ((unsigned long)op->kp.addr <= addr &&
1307 (unsigned long)op->kp.addr + op->optinsn.size > addr);
1308}
1309
1310/* Free optimized instruction slot */
1311static __kprobes
1312void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty)
1313{
1314 if (op->optinsn.insn) {
1315 free_optinsn_slot(op->optinsn.insn, dirty);
1316 op->optinsn.insn = NULL;
1317 op->optinsn.size = 0;
1318 }
1319}
1320
1321void __kprobes arch_remove_optimized_kprobe(struct optimized_kprobe *op)
1322{
1323 __arch_remove_optimized_kprobe(op, 1);
1324}
1325
1326/*
1327 * Copy replacing target instructions
1328 * Target instructions MUST be relocatable (checked inside)
1329 */
1330int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op)
1331{
1332 u8 *buf;
1333 int ret;
1334 long rel;
1335
1336 if (!can_optimize((unsigned long)op->kp.addr))
1337 return -EILSEQ;
1338
1339 op->optinsn.insn = get_optinsn_slot();
1340 if (!op->optinsn.insn)
1341 return -ENOMEM;
1342
1343 /*
1344 * Verify if the address gap is in 2GB range, because this uses
1345 * a relative jump.
1346 */
1347 rel = (long)op->optinsn.insn - (long)op->kp.addr + RELATIVEJUMP_SIZE;
1348 if (abs(rel) > 0x7fffffff)
1349 return -ERANGE;
1350
1351 buf = (u8 *)op->optinsn.insn;
1352
1353 /* Copy instructions into the out-of-line buffer */
1354 ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr);
1355 if (ret < 0) {
1356 __arch_remove_optimized_kprobe(op, 0);
1357 return ret;
1358 }
1359 op->optinsn.size = ret;
1360
1361 /* Copy arch-dep-instance from template */
1362 memcpy(buf, &optprobe_template_entry, TMPL_END_IDX);
1363
1364 /* Set probe information */
1365 synthesize_set_arg1(buf + TMPL_MOVE_IDX, (unsigned long)op);
1366
1367 /* Set probe function call */
1368 synthesize_relcall(buf + TMPL_CALL_IDX, optimized_callback);
1369
1370 /* Set returning jmp instruction at the tail of out-of-line buffer */
1371 synthesize_reljump(buf + TMPL_END_IDX + op->optinsn.size,
1372 (u8 *)op->kp.addr + op->optinsn.size);
1373
1374 flush_icache_range((unsigned long) buf,
1375 (unsigned long) buf + TMPL_END_IDX +
1376 op->optinsn.size + RELATIVEJUMP_SIZE);
1377 return 0;
1378}
1379
1380/* Replace a breakpoint (int3) with a relative jump. */
1381int __kprobes arch_optimize_kprobe(struct optimized_kprobe *op)
1382{
1383 unsigned char jmp_code[RELATIVEJUMP_SIZE];
1384 s32 rel = (s32)((long)op->optinsn.insn -
1385 ((long)op->kp.addr + RELATIVEJUMP_SIZE));
1386
1387 /* Backup instructions which will be replaced by jump address */
1388 memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE,
1389 RELATIVE_ADDR_SIZE);
1390
1391 jmp_code[0] = RELATIVEJUMP_OPCODE;
1392 *(s32 *)(&jmp_code[1]) = rel;
1393
1394 /*
1395 * text_poke_smp doesn't support NMI/MCE code modifying.
1396 * However, since kprobes itself also doesn't support NMI/MCE
1397 * code probing, it's not a problem.
1398 */
1399 text_poke_smp(op->kp.addr, jmp_code, RELATIVEJUMP_SIZE);
1400 return 0;
1401}
1402
1403/* Replace a relative jump with a breakpoint (int3). */
1404void __kprobes arch_unoptimize_kprobe(struct optimized_kprobe *op)
1405{
1406 u8 buf[RELATIVEJUMP_SIZE];
1407
1408 /* Set int3 to first byte for kprobes */
1409 buf[0] = BREAKPOINT_INSTRUCTION;
1410 memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
1411 text_poke_smp(op->kp.addr, buf, RELATIVEJUMP_SIZE);
1412}
1413
1414static int __kprobes setup_detour_execution(struct kprobe *p,
1415 struct pt_regs *regs,
1416 int reenter)
1417{
1418 struct optimized_kprobe *op;
1419
1420 if (p->flags & KPROBE_FLAG_OPTIMIZED) {
1421 /* This kprobe is really able to run optimized path. */
1422 op = container_of(p, struct optimized_kprobe, kp);
1423 /* Detour through copied instructions */
1424 regs->ip = (unsigned long)op->optinsn.insn + TMPL_END_IDX;
1425 if (!reenter)
1426 reset_current_kprobe();
1427 preempt_enable_no_resched();
1428 return 1;
1429 }
1430 return 0;
1431}
1432#endif
1433
1060int __init arch_init_kprobes(void) 1434int __init arch_init_kprobes(void)
1061{ 1435{
1062 return 0; 1436 return 0;
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index ec6ef60cbd17..ea697263b373 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -7,6 +7,7 @@
7 */ 7 */
8 8
9#include <linux/errno.h> 9#include <linux/errno.h>
10#include <linux/gfp.h>
10#include <linux/sched.h> 11#include <linux/sched.h>
11#include <linux/string.h> 12#include <linux/string.h>
12#include <linux/mm.h> 13#include <linux/mm.h>
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index c1c429d00130..a3fa43ba5d3b 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -25,6 +25,7 @@
25#include <asm/desc.h> 25#include <asm/desc.h>
26#include <asm/system.h> 26#include <asm/system.h>
27#include <asm/cacheflush.h> 27#include <asm/cacheflush.h>
28#include <asm/debugreg.h>
28 29
29static void set_idt(void *newidt, __u16 limit) 30static void set_idt(void *newidt, __u16 limit)
30{ 31{
@@ -157,8 +158,7 @@ int machine_kexec_prepare(struct kimage *image)
157{ 158{
158 int error; 159 int error;
159 160
160 if (nx_enabled) 161 set_pages_x(image->control_code_page, 1);
161 set_pages_x(image->control_code_page, 1);
162 error = machine_kexec_alloc_page_tables(image); 162 error = machine_kexec_alloc_page_tables(image);
163 if (error) 163 if (error)
164 return error; 164 return error;
@@ -172,8 +172,7 @@ int machine_kexec_prepare(struct kimage *image)
172 */ 172 */
173void machine_kexec_cleanup(struct kimage *image) 173void machine_kexec_cleanup(struct kimage *image)
174{ 174{
175 if (nx_enabled) 175 set_pages_nx(image->control_code_page, 1);
176 set_pages_nx(image->control_code_page, 1);
177 machine_kexec_free_page_tables(image); 176 machine_kexec_free_page_tables(image);
178} 177}
179 178
@@ -202,6 +201,7 @@ void machine_kexec(struct kimage *image)
202 201
203 /* Interrupts aren't acceptable while we reboot */ 202 /* Interrupts aren't acceptable while we reboot */
204 local_irq_disable(); 203 local_irq_disable();
204 hw_breakpoint_disable();
205 205
206 if (image->preserve_context) { 206 if (image->preserve_context) {
207#ifdef CONFIG_X86_IO_APIC 207#ifdef CONFIG_X86_IO_APIC
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 84c3bf209e98..035c8c529181 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -9,6 +9,7 @@
9#include <linux/mm.h> 9#include <linux/mm.h>
10#include <linux/kexec.h> 10#include <linux/kexec.h>
11#include <linux/string.h> 11#include <linux/string.h>
12#include <linux/gfp.h>
12#include <linux/reboot.h> 13#include <linux/reboot.h>
13#include <linux/numa.h> 14#include <linux/numa.h>
14#include <linux/ftrace.h> 15#include <linux/ftrace.h>
@@ -18,6 +19,7 @@
18#include <asm/pgtable.h> 19#include <asm/pgtable.h>
19#include <asm/tlbflush.h> 20#include <asm/tlbflush.h>
20#include <asm/mmu_context.h> 21#include <asm/mmu_context.h>
22#include <asm/debugreg.h>
21 23
22static int init_one_level2_page(struct kimage *image, pgd_t *pgd, 24static int init_one_level2_page(struct kimage *image, pgd_t *pgd,
23 unsigned long addr) 25 unsigned long addr)
@@ -282,6 +284,7 @@ void machine_kexec(struct kimage *image)
282 284
283 /* Interrupts aren't acceptable while we reboot */ 285 /* Interrupts aren't acceptable while we reboot */
284 local_irq_disable(); 286 local_irq_disable();
287 hw_breakpoint_disable();
285 288
286 if (image->preserve_context) { 289 if (image->preserve_context) {
287#ifdef CONFIG_X86_IO_APIC 290#ifdef CONFIG_X86_IO_APIC
diff --git a/arch/x86/kernel/mca_32.c b/arch/x86/kernel/mca_32.c
index 845d80ce1ef1..63eaf6596233 100644
--- a/arch/x86/kernel/mca_32.c
+++ b/arch/x86/kernel/mca_32.c
@@ -42,6 +42,7 @@
42#include <linux/kernel.h> 42#include <linux/kernel.h>
43#include <linux/mca.h> 43#include <linux/mca.h>
44#include <linux/kprobes.h> 44#include <linux/kprobes.h>
45#include <linux/slab.h>
45#include <asm/system.h> 46#include <asm/system.h>
46#include <asm/io.h> 47#include <asm/io.h>
47#include <linux/proc_fs.h> 48#include <linux/proc_fs.h>
diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c
deleted file mode 100644
index 2a62d843f015..000000000000
--- a/arch/x86/kernel/mfgpt_32.c
+++ /dev/null
@@ -1,410 +0,0 @@
1/*
2 * Driver/API for AMD Geode Multi-Function General Purpose Timers (MFGPT)
3 *
4 * Copyright (C) 2006, Advanced Micro Devices, Inc.
5 * Copyright (C) 2007, Andres Salomon <dilinger@debian.org>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of version 2 of the GNU General Public License
9 * as published by the Free Software Foundation.
10 *
11 * The MFGPTs are documented in AMD Geode CS5536 Companion Device Data Book.
12 */
13
14/*
15 * We are using the 32.768kHz input clock - it's the only one that has the
16 * ranges we find desirable. The following table lists the suitable
17 * divisors and the associated Hz, minimum interval and the maximum interval:
18 *
19 * Divisor Hz Min Delta (s) Max Delta (s)
20 * 1 32768 .00048828125 2.000
21 * 2 16384 .0009765625 4.000
22 * 4 8192 .001953125 8.000
23 * 8 4096 .00390625 16.000
24 * 16 2048 .0078125 32.000
25 * 32 1024 .015625 64.000
26 * 64 512 .03125 128.000
27 * 128 256 .0625 256.000
28 * 256 128 .125 512.000
29 */
30
31#include <linux/kernel.h>
32#include <linux/interrupt.h>
33#include <linux/module.h>
34#include <asm/geode.h>
35
36#define MFGPT_DEFAULT_IRQ 7
37
38static struct mfgpt_timer_t {
39 unsigned int avail:1;
40} mfgpt_timers[MFGPT_MAX_TIMERS];
41
42/* Selected from the table above */
43
44#define MFGPT_DIVISOR 16
45#define MFGPT_SCALE 4 /* divisor = 2^(scale) */
46#define MFGPT_HZ (32768 / MFGPT_DIVISOR)
47#define MFGPT_PERIODIC (MFGPT_HZ / HZ)
48
49/* Allow for disabling of MFGPTs */
50static int disable;
51static int __init mfgpt_disable(char *s)
52{
53 disable = 1;
54 return 1;
55}
56__setup("nomfgpt", mfgpt_disable);
57
58/* Reset the MFGPT timers. This is required by some broken BIOSes which already
59 * do the same and leave the system in an unstable state. TinyBIOS 0.98 is
60 * affected at least (0.99 is OK with MFGPT workaround left to off).
61 */
62static int __init mfgpt_fix(char *s)
63{
64 u32 val, dummy;
65
66 /* The following udocumented bit resets the MFGPT timers */
67 val = 0xFF; dummy = 0;
68 wrmsr(MSR_MFGPT_SETUP, val, dummy);
69 return 1;
70}
71__setup("mfgptfix", mfgpt_fix);
72
73/*
74 * Check whether any MFGPTs are available for the kernel to use. In most
75 * cases, firmware that uses AMD's VSA code will claim all timers during
76 * bootup; we certainly don't want to take them if they're already in use.
77 * In other cases (such as with VSAless OpenFirmware), the system firmware
78 * leaves timers available for us to use.
79 */
80
81
82static int timers = -1;
83
84static void geode_mfgpt_detect(void)
85{
86 int i;
87 u16 val;
88
89 timers = 0;
90
91 if (disable) {
92 printk(KERN_INFO "geode-mfgpt: MFGPT support is disabled\n");
93 goto done;
94 }
95
96 if (!geode_get_dev_base(GEODE_DEV_MFGPT)) {
97 printk(KERN_INFO "geode-mfgpt: MFGPT LBAR is not set up\n");
98 goto done;
99 }
100
101 for (i = 0; i < MFGPT_MAX_TIMERS; i++) {
102 val = geode_mfgpt_read(i, MFGPT_REG_SETUP);
103 if (!(val & MFGPT_SETUP_SETUP)) {
104 mfgpt_timers[i].avail = 1;
105 timers++;
106 }
107 }
108
109done:
110 printk(KERN_INFO "geode-mfgpt: %d MFGPT timers available.\n", timers);
111}
112
113int geode_mfgpt_toggle_event(int timer, int cmp, int event, int enable)
114{
115 u32 msr, mask, value, dummy;
116 int shift = (cmp == MFGPT_CMP1) ? 0 : 8;
117
118 if (timer < 0 || timer >= MFGPT_MAX_TIMERS)
119 return -EIO;
120
121 /*
122 * The register maps for these are described in sections 6.17.1.x of
123 * the AMD Geode CS5536 Companion Device Data Book.
124 */
125 switch (event) {
126 case MFGPT_EVENT_RESET:
127 /*
128 * XXX: According to the docs, we cannot reset timers above
129 * 6; that is, resets for 7 and 8 will be ignored. Is this
130 * a problem? -dilinger
131 */
132 msr = MSR_MFGPT_NR;
133 mask = 1 << (timer + 24);
134 break;
135
136 case MFGPT_EVENT_NMI:
137 msr = MSR_MFGPT_NR;
138 mask = 1 << (timer + shift);
139 break;
140
141 case MFGPT_EVENT_IRQ:
142 msr = MSR_MFGPT_IRQ;
143 mask = 1 << (timer + shift);
144 break;
145
146 default:
147 return -EIO;
148 }
149
150 rdmsr(msr, value, dummy);
151
152 if (enable)
153 value |= mask;
154 else
155 value &= ~mask;
156
157 wrmsr(msr, value, dummy);
158 return 0;
159}
160EXPORT_SYMBOL_GPL(geode_mfgpt_toggle_event);
161
162int geode_mfgpt_set_irq(int timer, int cmp, int *irq, int enable)
163{
164 u32 zsel, lpc, dummy;
165 int shift;
166
167 if (timer < 0 || timer >= MFGPT_MAX_TIMERS)
168 return -EIO;
169
170 /*
171 * Unfortunately, MFGPTs come in pairs sharing their IRQ lines. If VSA
172 * is using the same CMP of the timer's Siamese twin, the IRQ is set to
173 * 2, and we mustn't use nor change it.
174 * XXX: Likewise, 2 Linux drivers might clash if the 2nd overwrites the
175 * IRQ of the 1st. This can only happen if forcing an IRQ, calling this
176 * with *irq==0 is safe. Currently there _are_ no 2 drivers.
177 */
178 rdmsr(MSR_PIC_ZSEL_LOW, zsel, dummy);
179 shift = ((cmp == MFGPT_CMP1 ? 0 : 4) + timer % 4) * 4;
180 if (((zsel >> shift) & 0xF) == 2)
181 return -EIO;
182
183 /* Choose IRQ: if none supplied, keep IRQ already set or use default */
184 if (!*irq)
185 *irq = (zsel >> shift) & 0xF;
186 if (!*irq)
187 *irq = MFGPT_DEFAULT_IRQ;
188
189 /* Can't use IRQ if it's 0 (=disabled), 2, or routed to LPC */
190 if (*irq < 1 || *irq == 2 || *irq > 15)
191 return -EIO;
192 rdmsr(MSR_PIC_IRQM_LPC, lpc, dummy);
193 if (lpc & (1 << *irq))
194 return -EIO;
195
196 /* All chosen and checked - go for it */
197 if (geode_mfgpt_toggle_event(timer, cmp, MFGPT_EVENT_IRQ, enable))
198 return -EIO;
199 if (enable) {
200 zsel = (zsel & ~(0xF << shift)) | (*irq << shift);
201 wrmsr(MSR_PIC_ZSEL_LOW, zsel, dummy);
202 }
203
204 return 0;
205}
206
207static int mfgpt_get(int timer)
208{
209 mfgpt_timers[timer].avail = 0;
210 printk(KERN_INFO "geode-mfgpt: Registered timer %d\n", timer);
211 return timer;
212}
213
214int geode_mfgpt_alloc_timer(int timer, int domain)
215{
216 int i;
217
218 if (timers == -1) {
219 /* timers haven't been detected yet */
220 geode_mfgpt_detect();
221 }
222
223 if (!timers)
224 return -1;
225
226 if (timer >= MFGPT_MAX_TIMERS)
227 return -1;
228
229 if (timer < 0) {
230 /* Try to find an available timer */
231 for (i = 0; i < MFGPT_MAX_TIMERS; i++) {
232 if (mfgpt_timers[i].avail)
233 return mfgpt_get(i);
234
235 if (i == 5 && domain == MFGPT_DOMAIN_WORKING)
236 break;
237 }
238 } else {
239 /* If they requested a specific timer, try to honor that */
240 if (mfgpt_timers[timer].avail)
241 return mfgpt_get(timer);
242 }
243
244 /* No timers available - too bad */
245 return -1;
246}
247EXPORT_SYMBOL_GPL(geode_mfgpt_alloc_timer);
248
249
250#ifdef CONFIG_GEODE_MFGPT_TIMER
251
252/*
253 * The MFPGT timers on the CS5536 provide us with suitable timers to use
254 * as clock event sources - not as good as a HPET or APIC, but certainly
255 * better than the PIT. This isn't a general purpose MFGPT driver, but
256 * a simplified one designed specifically to act as a clock event source.
257 * For full details about the MFGPT, please consult the CS5536 data sheet.
258 */
259
260#include <linux/clocksource.h>
261#include <linux/clockchips.h>
262
263static unsigned int mfgpt_tick_mode = CLOCK_EVT_MODE_SHUTDOWN;
264static u16 mfgpt_event_clock;
265
266static int irq;
267static int __init mfgpt_setup(char *str)
268{
269 get_option(&str, &irq);
270 return 1;
271}
272__setup("mfgpt_irq=", mfgpt_setup);
273
274static void mfgpt_disable_timer(u16 clock)
275{
276 /* avoid races by clearing CMP1 and CMP2 unconditionally */
277 geode_mfgpt_write(clock, MFGPT_REG_SETUP, (u16) ~MFGPT_SETUP_CNTEN |
278 MFGPT_SETUP_CMP1 | MFGPT_SETUP_CMP2);
279}
280
281static int mfgpt_next_event(unsigned long, struct clock_event_device *);
282static void mfgpt_set_mode(enum clock_event_mode, struct clock_event_device *);
283
284static struct clock_event_device mfgpt_clockevent = {
285 .name = "mfgpt-timer",
286 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
287 .set_mode = mfgpt_set_mode,
288 .set_next_event = mfgpt_next_event,
289 .rating = 250,
290 .cpumask = cpu_all_mask,
291 .shift = 32
292};
293
294static void mfgpt_start_timer(u16 delta)
295{
296 geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_CMP2, (u16) delta);
297 geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_COUNTER, 0);
298
299 geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_SETUP,
300 MFGPT_SETUP_CNTEN | MFGPT_SETUP_CMP2);
301}
302
303static void mfgpt_set_mode(enum clock_event_mode mode,
304 struct clock_event_device *evt)
305{
306 mfgpt_disable_timer(mfgpt_event_clock);
307
308 if (mode == CLOCK_EVT_MODE_PERIODIC)
309 mfgpt_start_timer(MFGPT_PERIODIC);
310
311 mfgpt_tick_mode = mode;
312}
313
314static int mfgpt_next_event(unsigned long delta, struct clock_event_device *evt)
315{
316 mfgpt_start_timer(delta);
317 return 0;
318}
319
320static irqreturn_t mfgpt_tick(int irq, void *dev_id)
321{
322 u16 val = geode_mfgpt_read(mfgpt_event_clock, MFGPT_REG_SETUP);
323
324 /* See if the interrupt was for us */
325 if (!(val & (MFGPT_SETUP_SETUP | MFGPT_SETUP_CMP2 | MFGPT_SETUP_CMP1)))
326 return IRQ_NONE;
327
328 /* Turn off the clock (and clear the event) */
329 mfgpt_disable_timer(mfgpt_event_clock);
330
331 if (mfgpt_tick_mode == CLOCK_EVT_MODE_SHUTDOWN)
332 return IRQ_HANDLED;
333
334 /* Clear the counter */
335 geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_COUNTER, 0);
336
337 /* Restart the clock in periodic mode */
338
339 if (mfgpt_tick_mode == CLOCK_EVT_MODE_PERIODIC) {
340 geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_SETUP,
341 MFGPT_SETUP_CNTEN | MFGPT_SETUP_CMP2);
342 }
343
344 mfgpt_clockevent.event_handler(&mfgpt_clockevent);
345 return IRQ_HANDLED;
346}
347
348static struct irqaction mfgptirq = {
349 .handler = mfgpt_tick,
350 .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_TIMER,
351 .name = "mfgpt-timer"
352};
353
354int __init mfgpt_timer_setup(void)
355{
356 int timer, ret;
357 u16 val;
358
359 timer = geode_mfgpt_alloc_timer(MFGPT_TIMER_ANY, MFGPT_DOMAIN_WORKING);
360 if (timer < 0) {
361 printk(KERN_ERR
362 "mfgpt-timer: Could not allocate a MFPGT timer\n");
363 return -ENODEV;
364 }
365
366 mfgpt_event_clock = timer;
367
368 /* Set up the IRQ on the MFGPT side */
369 if (geode_mfgpt_setup_irq(mfgpt_event_clock, MFGPT_CMP2, &irq)) {
370 printk(KERN_ERR "mfgpt-timer: Could not set up IRQ %d\n", irq);
371 return -EIO;
372 }
373
374 /* And register it with the kernel */
375 ret = setup_irq(irq, &mfgptirq);
376
377 if (ret) {
378 printk(KERN_ERR
379 "mfgpt-timer: Unable to set up the interrupt.\n");
380 goto err;
381 }
382
383 /* Set the clock scale and enable the event mode for CMP2 */
384 val = MFGPT_SCALE | (3 << 8);
385
386 geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_SETUP, val);
387
388 /* Set up the clock event */
389 mfgpt_clockevent.mult = div_sc(MFGPT_HZ, NSEC_PER_SEC,
390 mfgpt_clockevent.shift);
391 mfgpt_clockevent.min_delta_ns = clockevent_delta2ns(0xF,
392 &mfgpt_clockevent);
393 mfgpt_clockevent.max_delta_ns = clockevent_delta2ns(0xFFFE,
394 &mfgpt_clockevent);
395
396 printk(KERN_INFO
397 "mfgpt-timer: Registering MFGPT timer %d as a clock event, using IRQ %d\n",
398 timer, irq);
399 clockevents_register_device(&mfgpt_clockevent);
400
401 return 0;
402
403err:
404 geode_mfgpt_release_irq(mfgpt_event_clock, MFGPT_CMP2, &irq);
405 printk(KERN_ERR
406 "mfgpt-timer: Unable to set up the MFGPT clock source\n");
407 return -EIO;
408}
409
410#endif
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index f4c538b681ca..e1af7c055c7d 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -13,6 +13,9 @@
13 * Licensed under the terms of the GNU General Public 13 * Licensed under the terms of the GNU General Public
14 * License version 2. See file COPYING for details. 14 * License version 2. See file COPYING for details.
15 */ 15 */
16
17#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
18
16#include <linux/firmware.h> 19#include <linux/firmware.h>
17#include <linux/pci_ids.h> 20#include <linux/pci_ids.h>
18#include <linux/uaccess.h> 21#include <linux/uaccess.h>
@@ -76,12 +79,12 @@ static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
76 79
77 memset(csig, 0, sizeof(*csig)); 80 memset(csig, 0, sizeof(*csig));
78 if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) { 81 if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) {
79 printk(KERN_WARNING "microcode: CPU%d: AMD CPU family 0x%x not " 82 pr_warning("microcode: CPU%d: AMD CPU family 0x%x not "
80 "supported\n", cpu, c->x86); 83 "supported\n", cpu, c->x86);
81 return -1; 84 return -1;
82 } 85 }
83 rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy); 86 rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy);
84 printk(KERN_INFO "microcode: CPU%d: patch_level=0x%x\n", cpu, csig->rev); 87 pr_info("CPU%d: patch_level=0x%x\n", cpu, csig->rev);
85 return 0; 88 return 0;
86} 89}
87 90
@@ -103,23 +106,16 @@ static int get_matching_microcode(int cpu, void *mc, int rev)
103 i++; 106 i++;
104 } 107 }
105 108
106 if (!equiv_cpu_id) { 109 if (!equiv_cpu_id)
107 printk(KERN_WARNING "microcode: CPU%d: cpu revision "
108 "not listed in equivalent cpu table\n", cpu);
109 return 0; 110 return 0;
110 }
111 111
112 if (mc_header->processor_rev_id != equiv_cpu_id) { 112 if (mc_header->processor_rev_id != equiv_cpu_id)
113 printk(KERN_ERR "microcode: CPU%d: patch mismatch "
114 "(processor_rev_id: %x, equiv_cpu_id: %x)\n",
115 cpu, mc_header->processor_rev_id, equiv_cpu_id);
116 return 0; 113 return 0;
117 }
118 114
119 /* ucode might be chipset specific -- currently we don't support this */ 115 /* ucode might be chipset specific -- currently we don't support this */
120 if (mc_header->nb_dev_id || mc_header->sb_dev_id) { 116 if (mc_header->nb_dev_id || mc_header->sb_dev_id) {
121 printk(KERN_ERR "microcode: CPU%d: loading of chipset " 117 pr_err("CPU%d: loading of chipset specific code not yet supported\n",
122 "specific code not yet supported\n", cpu); 118 cpu);
123 return 0; 119 return 0;
124 } 120 }
125 121
@@ -148,14 +144,12 @@ static int apply_microcode_amd(int cpu)
148 144
149 /* check current patch id and patch's id for match */ 145 /* check current patch id and patch's id for match */
150 if (rev != mc_amd->hdr.patch_id) { 146 if (rev != mc_amd->hdr.patch_id) {
151 printk(KERN_ERR "microcode: CPU%d: update failed " 147 pr_err("CPU%d: update failed (for patch_level=0x%x)\n",
152 "(for patch_level=0x%x)\n", cpu, mc_amd->hdr.patch_id); 148 cpu, mc_amd->hdr.patch_id);
153 return -1; 149 return -1;
154 } 150 }
155 151
156 printk(KERN_INFO "microcode: CPU%d: updated (new patch_level=0x%x)\n", 152 pr_info("CPU%d: updated (new patch_level=0x%x)\n", cpu, rev);
157 cpu, rev);
158
159 uci->cpu_sig.rev = rev; 153 uci->cpu_sig.rev = rev;
160 154
161 return 0; 155 return 0;
@@ -178,18 +172,14 @@ get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size)
178 return NULL; 172 return NULL;
179 173
180 if (section_hdr[0] != UCODE_UCODE_TYPE) { 174 if (section_hdr[0] != UCODE_UCODE_TYPE) {
181 printk(KERN_ERR "microcode: error: invalid type field in " 175 pr_err("error: invalid type field in container file section header\n");
182 "container file section header\n");
183 return NULL; 176 return NULL;
184 } 177 }
185 178
186 total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8)); 179 total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8));
187 180
188 printk(KERN_DEBUG "microcode: size %u, total_size %u\n",
189 size, total_size);
190
191 if (total_size > size || total_size > UCODE_MAX_SIZE) { 181 if (total_size > size || total_size > UCODE_MAX_SIZE) {
192 printk(KERN_ERR "microcode: error: size mismatch\n"); 182 pr_err("error: size mismatch\n");
193 return NULL; 183 return NULL;
194 } 184 }
195 185
@@ -218,15 +208,13 @@ static int install_equiv_cpu_table(const u8 *buf)
218 size = buf_pos[2]; 208 size = buf_pos[2];
219 209
220 if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE || !size) { 210 if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE || !size) {
221 printk(KERN_ERR "microcode: error: invalid type field in " 211 pr_err("error: invalid type field in container file section header\n");
222 "container file section header\n");
223 return 0; 212 return 0;
224 } 213 }
225 214
226 equiv_cpu_table = (struct equiv_cpu_entry *) vmalloc(size); 215 equiv_cpu_table = (struct equiv_cpu_entry *) vmalloc(size);
227 if (!equiv_cpu_table) { 216 if (!equiv_cpu_table) {
228 printk(KERN_ERR "microcode: failed to allocate " 217 pr_err("failed to allocate equivalent CPU table\n");
229 "equivalent CPU table\n");
230 return 0; 218 return 0;
231 } 219 }
232 220
@@ -259,8 +247,7 @@ generic_load_microcode(int cpu, const u8 *data, size_t size)
259 247
260 offset = install_equiv_cpu_table(ucode_ptr); 248 offset = install_equiv_cpu_table(ucode_ptr);
261 if (!offset) { 249 if (!offset) {
262 printk(KERN_ERR "microcode: failed to create " 250 pr_err("failed to create equivalent cpu table\n");
263 "equivalent cpu table\n");
264 return UCODE_ERROR; 251 return UCODE_ERROR;
265 } 252 }
266 253
@@ -291,8 +278,7 @@ generic_load_microcode(int cpu, const u8 *data, size_t size)
291 if (!leftover) { 278 if (!leftover) {
292 vfree(uci->mc); 279 vfree(uci->mc);
293 uci->mc = new_mc; 280 uci->mc = new_mc;
294 pr_debug("microcode: CPU%d found a matching microcode " 281 pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n",
295 "update with version 0x%x (current=0x%x)\n",
296 cpu, new_rev, uci->cpu_sig.rev); 282 cpu, new_rev, uci->cpu_sig.rev);
297 } else { 283 } else {
298 vfree(new_mc); 284 vfree(new_mc);
@@ -318,7 +304,7 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device)
318 } 304 }
319 305
320 if (*(u32 *)firmware->data != UCODE_MAGIC) { 306 if (*(u32 *)firmware->data != UCODE_MAGIC) {
321 printk(KERN_ERR "microcode: invalid UCODE_MAGIC (0x%08x)\n", 307 pr_err("invalid UCODE_MAGIC (0x%08x)\n",
322 *(u32 *)firmware->data); 308 *(u32 *)firmware->data);
323 return UCODE_ERROR; 309 return UCODE_ERROR;
324 } 310 }
@@ -333,8 +319,7 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device)
333static enum ucode_state 319static enum ucode_state
334request_microcode_user(int cpu, const void __user *buf, size_t size) 320request_microcode_user(int cpu, const void __user *buf, size_t size)
335{ 321{
336 printk(KERN_INFO "microcode: AMD microcode update via " 322 pr_info("AMD microcode update via /dev/cpu/microcode not supported\n");
337 "/dev/cpu/microcode not supported\n");
338 return UCODE_ERROR; 323 return UCODE_ERROR;
339} 324}
340 325
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index 378e9a8f1bf8..cceb5bc3c3c2 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -70,10 +70,12 @@
70 * Fix sigmatch() macro to handle old CPUs with pf == 0. 70 * Fix sigmatch() macro to handle old CPUs with pf == 0.
71 * Thanks to Stuart Swales for pointing out this bug. 71 * Thanks to Stuart Swales for pointing out this bug.
72 */ 72 */
73
74#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
75
73#include <linux/platform_device.h> 76#include <linux/platform_device.h>
74#include <linux/miscdevice.h> 77#include <linux/miscdevice.h>
75#include <linux/capability.h> 78#include <linux/capability.h>
76#include <linux/smp_lock.h>
77#include <linux/kernel.h> 79#include <linux/kernel.h>
78#include <linux/module.h> 80#include <linux/module.h>
79#include <linux/mutex.h> 81#include <linux/mutex.h>
@@ -201,7 +203,6 @@ static int do_microcode_update(const void __user *buf, size_t size)
201 203
202static int microcode_open(struct inode *unused1, struct file *unused2) 204static int microcode_open(struct inode *unused1, struct file *unused2)
203{ 205{
204 cycle_kernel_lock();
205 return capable(CAP_SYS_RAWIO) ? 0 : -EPERM; 206 return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
206} 207}
207 208
@@ -211,7 +212,7 @@ static ssize_t microcode_write(struct file *file, const char __user *buf,
211 ssize_t ret = -EINVAL; 212 ssize_t ret = -EINVAL;
212 213
213 if ((len >> PAGE_SHIFT) > totalram_pages) { 214 if ((len >> PAGE_SHIFT) > totalram_pages) {
214 pr_err("microcode: too much data (max %ld pages)\n", totalram_pages); 215 pr_err("too much data (max %ld pages)\n", totalram_pages);
215 return ret; 216 return ret;
216 } 217 }
217 218
@@ -246,7 +247,7 @@ static int __init microcode_dev_init(void)
246 247
247 error = misc_register(&microcode_dev); 248 error = misc_register(&microcode_dev);
248 if (error) { 249 if (error) {
249 pr_err("microcode: can't misc_register on minor=%d\n", MICROCODE_MINOR); 250 pr_err("can't misc_register on minor=%d\n", MICROCODE_MINOR);
250 return error; 251 return error;
251 } 252 }
252 253
@@ -361,7 +362,7 @@ static enum ucode_state microcode_resume_cpu(int cpu)
361 if (!uci->mc) 362 if (!uci->mc)
362 return UCODE_NFOUND; 363 return UCODE_NFOUND;
363 364
364 pr_debug("microcode: CPU%d updated upon resume\n", cpu); 365 pr_debug("CPU%d updated upon resume\n", cpu);
365 apply_microcode_on_target(cpu); 366 apply_microcode_on_target(cpu);
366 367
367 return UCODE_OK; 368 return UCODE_OK;
@@ -381,7 +382,7 @@ static enum ucode_state microcode_init_cpu(int cpu)
381 ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev); 382 ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev);
382 383
383 if (ustate == UCODE_OK) { 384 if (ustate == UCODE_OK) {
384 pr_debug("microcode: CPU%d updated upon init\n", cpu); 385 pr_debug("CPU%d updated upon init\n", cpu);
385 apply_microcode_on_target(cpu); 386 apply_microcode_on_target(cpu);
386 } 387 }
387 388
@@ -408,7 +409,7 @@ static int mc_sysdev_add(struct sys_device *sys_dev)
408 if (!cpu_online(cpu)) 409 if (!cpu_online(cpu))
409 return 0; 410 return 0;
410 411
411 pr_debug("microcode: CPU%d added\n", cpu); 412 pr_debug("CPU%d added\n", cpu);
412 413
413 err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group); 414 err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group);
414 if (err) 415 if (err)
@@ -427,7 +428,7 @@ static int mc_sysdev_remove(struct sys_device *sys_dev)
427 if (!cpu_online(cpu)) 428 if (!cpu_online(cpu))
428 return 0; 429 return 0;
429 430
430 pr_debug("microcode: CPU%d removed\n", cpu); 431 pr_debug("CPU%d removed\n", cpu);
431 microcode_fini_cpu(cpu); 432 microcode_fini_cpu(cpu);
432 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); 433 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
433 return 0; 434 return 0;
@@ -475,15 +476,15 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
475 microcode_update_cpu(cpu); 476 microcode_update_cpu(cpu);
476 case CPU_DOWN_FAILED: 477 case CPU_DOWN_FAILED:
477 case CPU_DOWN_FAILED_FROZEN: 478 case CPU_DOWN_FAILED_FROZEN:
478 pr_debug("microcode: CPU%d added\n", cpu); 479 pr_debug("CPU%d added\n", cpu);
479 if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group)) 480 if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group))
480 pr_err("microcode: Failed to create group for CPU%d\n", cpu); 481 pr_err("Failed to create group for CPU%d\n", cpu);
481 break; 482 break;
482 case CPU_DOWN_PREPARE: 483 case CPU_DOWN_PREPARE:
483 case CPU_DOWN_PREPARE_FROZEN: 484 case CPU_DOWN_PREPARE_FROZEN:
484 /* Suspend is in progress, only remove the interface */ 485 /* Suspend is in progress, only remove the interface */
485 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); 486 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
486 pr_debug("microcode: CPU%d removed\n", cpu); 487 pr_debug("CPU%d removed\n", cpu);
487 break; 488 break;
488 case CPU_DEAD: 489 case CPU_DEAD:
489 case CPU_UP_CANCELED_FROZEN: 490 case CPU_UP_CANCELED_FROZEN:
@@ -509,7 +510,7 @@ static int __init microcode_init(void)
509 microcode_ops = init_amd_microcode(); 510 microcode_ops = init_amd_microcode();
510 511
511 if (!microcode_ops) { 512 if (!microcode_ops) {
512 pr_err("microcode: no support for this CPU vendor\n"); 513 pr_err("no support for this CPU vendor\n");
513 return -ENODEV; 514 return -ENODEV;
514 } 515 }
515 516
@@ -540,8 +541,7 @@ static int __init microcode_init(void)
540 register_hotcpu_notifier(&mc_cpu_notifier); 541 register_hotcpu_notifier(&mc_cpu_notifier);
541 542
542 pr_info("Microcode Update Driver: v" MICROCODE_VERSION 543 pr_info("Microcode Update Driver: v" MICROCODE_VERSION
543 " <tigran@aivazian.fsnet.co.uk>," 544 " <tigran@aivazian.fsnet.co.uk>, Peter Oruba\n");
544 " Peter Oruba\n");
545 545
546 return 0; 546 return 0;
547} 547}
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
index 0d334ddd0a96..85a343e28937 100644
--- a/arch/x86/kernel/microcode_intel.c
+++ b/arch/x86/kernel/microcode_intel.c
@@ -70,6 +70,9 @@
70 * Fix sigmatch() macro to handle old CPUs with pf == 0. 70 * Fix sigmatch() macro to handle old CPUs with pf == 0.
71 * Thanks to Stuart Swales for pointing out this bug. 71 * Thanks to Stuart Swales for pointing out this bug.
72 */ 72 */
73
74#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
75
73#include <linux/firmware.h> 76#include <linux/firmware.h>
74#include <linux/uaccess.h> 77#include <linux/uaccess.h>
75#include <linux/kernel.h> 78#include <linux/kernel.h>
@@ -146,8 +149,7 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
146 149
147 if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 || 150 if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
148 cpu_has(c, X86_FEATURE_IA64)) { 151 cpu_has(c, X86_FEATURE_IA64)) {
149 printk(KERN_ERR "microcode: CPU%d not a capable Intel " 152 pr_err("CPU%d not a capable Intel processor\n", cpu_num);
150 "processor\n", cpu_num);
151 return -1; 153 return -1;
152 } 154 }
153 155
@@ -165,8 +167,8 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
165 /* get the current revision from MSR 0x8B */ 167 /* get the current revision from MSR 0x8B */
166 rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev); 168 rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev);
167 169
168 printk(KERN_INFO "microcode: CPU%d sig=0x%x, pf=0x%x, revision=0x%x\n", 170 pr_info("CPU%d sig=0x%x, pf=0x%x, revision=0x%x\n",
169 cpu_num, csig->sig, csig->pf, csig->rev); 171 cpu_num, csig->sig, csig->pf, csig->rev);
170 172
171 return 0; 173 return 0;
172} 174}
@@ -194,28 +196,24 @@ static int microcode_sanity_check(void *mc)
194 data_size = get_datasize(mc_header); 196 data_size = get_datasize(mc_header);
195 197
196 if (data_size + MC_HEADER_SIZE > total_size) { 198 if (data_size + MC_HEADER_SIZE > total_size) {
197 printk(KERN_ERR "microcode: error! " 199 pr_err("error! Bad data size in microcode data file\n");
198 "Bad data size in microcode data file\n");
199 return -EINVAL; 200 return -EINVAL;
200 } 201 }
201 202
202 if (mc_header->ldrver != 1 || mc_header->hdrver != 1) { 203 if (mc_header->ldrver != 1 || mc_header->hdrver != 1) {
203 printk(KERN_ERR "microcode: error! " 204 pr_err("error! Unknown microcode update format\n");
204 "Unknown microcode update format\n");
205 return -EINVAL; 205 return -EINVAL;
206 } 206 }
207 ext_table_size = total_size - (MC_HEADER_SIZE + data_size); 207 ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
208 if (ext_table_size) { 208 if (ext_table_size) {
209 if ((ext_table_size < EXT_HEADER_SIZE) 209 if ((ext_table_size < EXT_HEADER_SIZE)
210 || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) { 210 || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
211 printk(KERN_ERR "microcode: error! " 211 pr_err("error! Small exttable size in microcode data file\n");
212 "Small exttable size in microcode data file\n");
213 return -EINVAL; 212 return -EINVAL;
214 } 213 }
215 ext_header = mc + MC_HEADER_SIZE + data_size; 214 ext_header = mc + MC_HEADER_SIZE + data_size;
216 if (ext_table_size != exttable_size(ext_header)) { 215 if (ext_table_size != exttable_size(ext_header)) {
217 printk(KERN_ERR "microcode: error! " 216 pr_err("error! Bad exttable size in microcode data file\n");
218 "Bad exttable size in microcode data file\n");
219 return -EFAULT; 217 return -EFAULT;
220 } 218 }
221 ext_sigcount = ext_header->count; 219 ext_sigcount = ext_header->count;
@@ -230,8 +228,7 @@ static int microcode_sanity_check(void *mc)
230 while (i--) 228 while (i--)
231 ext_table_sum += ext_tablep[i]; 229 ext_table_sum += ext_tablep[i];
232 if (ext_table_sum) { 230 if (ext_table_sum) {
233 printk(KERN_WARNING "microcode: aborting, " 231 pr_warning("aborting, bad extended signature table checksum\n");
234 "bad extended signature table checksum\n");
235 return -EINVAL; 232 return -EINVAL;
236 } 233 }
237 } 234 }
@@ -242,7 +239,7 @@ static int microcode_sanity_check(void *mc)
242 while (i--) 239 while (i--)
243 orig_sum += ((int *)mc)[i]; 240 orig_sum += ((int *)mc)[i];
244 if (orig_sum) { 241 if (orig_sum) {
245 printk(KERN_ERR "microcode: aborting, bad checksum\n"); 242 pr_err("aborting, bad checksum\n");
246 return -EINVAL; 243 return -EINVAL;
247 } 244 }
248 if (!ext_table_size) 245 if (!ext_table_size)
@@ -255,7 +252,7 @@ static int microcode_sanity_check(void *mc)
255 - (mc_header->sig + mc_header->pf + mc_header->cksum) 252 - (mc_header->sig + mc_header->pf + mc_header->cksum)
256 + (ext_sig->sig + ext_sig->pf + ext_sig->cksum); 253 + (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
257 if (sum) { 254 if (sum) {
258 printk(KERN_ERR "microcode: aborting, bad checksum\n"); 255 pr_err("aborting, bad checksum\n");
259 return -EINVAL; 256 return -EINVAL;
260 } 257 }
261 } 258 }
@@ -327,13 +324,11 @@ static int apply_microcode(int cpu)
327 rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); 324 rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
328 325
329 if (val[1] != mc_intel->hdr.rev) { 326 if (val[1] != mc_intel->hdr.rev) {
330 printk(KERN_ERR "microcode: CPU%d update " 327 pr_err("CPU%d update to revision 0x%x failed\n",
331 "to revision 0x%x failed\n", 328 cpu_num, mc_intel->hdr.rev);
332 cpu_num, mc_intel->hdr.rev);
333 return -1; 329 return -1;
334 } 330 }
335 printk(KERN_INFO "microcode: CPU%d updated to revision " 331 pr_info("CPU%d updated to revision 0x%x, date = %04x-%02x-%02x\n",
336 "0x%x, date = %04x-%02x-%02x \n",
337 cpu_num, val[1], 332 cpu_num, val[1],
338 mc_intel->hdr.date & 0xffff, 333 mc_intel->hdr.date & 0xffff,
339 mc_intel->hdr.date >> 24, 334 mc_intel->hdr.date >> 24,
@@ -362,8 +357,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
362 357
363 mc_size = get_totalsize(&mc_header); 358 mc_size = get_totalsize(&mc_header);
364 if (!mc_size || mc_size > leftover) { 359 if (!mc_size || mc_size > leftover) {
365 printk(KERN_ERR "microcode: error!" 360 pr_err("error! Bad data in microcode data file\n");
366 "Bad data in microcode data file\n");
367 break; 361 break;
368 } 362 }
369 363
@@ -405,9 +399,8 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
405 vfree(uci->mc); 399 vfree(uci->mc);
406 uci->mc = (struct microcode_intel *)new_mc; 400 uci->mc = (struct microcode_intel *)new_mc;
407 401
408 pr_debug("microcode: CPU%d found a matching microcode update with" 402 pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n",
409 " version 0x%x (current=0x%x)\n", 403 cpu, new_rev, uci->cpu_sig.rev);
410 cpu, new_rev, uci->cpu_sig.rev);
411out: 404out:
412 return state; 405 return state;
413} 406}
@@ -429,7 +422,7 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device)
429 c->x86, c->x86_model, c->x86_mask); 422 c->x86, c->x86_model, c->x86_mask);
430 423
431 if (request_firmware(&firmware, name, device)) { 424 if (request_firmware(&firmware, name, device)) {
432 pr_debug("microcode: data file %s load failed\n", name); 425 pr_debug("data file %s load failed\n", name);
433 return UCODE_NFOUND; 426 return UCODE_NFOUND;
434 } 427 }
435 428
diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c
index 712d15fdc416..71825806cd44 100644
--- a/arch/x86/kernel/mmconf-fam10h_64.c
+++ b/arch/x86/kernel/mmconf-fam10h_64.c
@@ -7,6 +7,8 @@
7#include <linux/string.h> 7#include <linux/string.h>
8#include <linux/pci.h> 8#include <linux/pci.h>
9#include <linux/dmi.h> 9#include <linux/dmi.h>
10#include <linux/range.h>
11
10#include <asm/pci-direct.h> 12#include <asm/pci-direct.h>
11#include <linux/sort.h> 13#include <linux/sort.h>
12#include <asm/io.h> 14#include <asm/io.h>
@@ -30,11 +32,6 @@ static struct pci_hostbridge_probe pci_probes[] __cpuinitdata = {
30 { 0xff, 0, PCI_VENDOR_ID_AMD, 0x1200 }, 32 { 0xff, 0, PCI_VENDOR_ID_AMD, 0x1200 },
31}; 33};
32 34
33struct range {
34 u64 start;
35 u64 end;
36};
37
38static int __cpuinit cmp_range(const void *x1, const void *x2) 35static int __cpuinit cmp_range(const void *x1, const void *x2)
39{ 36{
40 const struct range *r1 = x1; 37 const struct range *r1 = x1;
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index 89f386f044e4..e0bc186d7501 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -23,6 +23,7 @@
23#include <linux/kernel.h> 23#include <linux/kernel.h>
24#include <linux/bug.h> 24#include <linux/bug.h>
25#include <linux/mm.h> 25#include <linux/mm.h>
26#include <linux/gfp.h>
26 27
27#include <asm/system.h> 28#include <asm/system.h>
28#include <asm/page.h> 29#include <asm/page.h>
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 5be95ef4ffec..e81030f71a8f 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -359,13 +359,6 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
359 x86_init.mpparse.mpc_record(1); 359 x86_init.mpparse.mpc_record(1);
360 } 360 }
361 361
362#ifdef CONFIG_X86_BIGSMP
363 generic_bigsmp_probe();
364#endif
365
366 if (apic->setup_apic_routing)
367 apic->setup_apic_routing();
368
369 if (!num_processors) 362 if (!num_processors)
370 printk(KERN_ERR "MPTABLE: no processors registered!\n"); 363 printk(KERN_ERR "MPTABLE: no processors registered!\n");
371 return num_processors; 364 return num_processors;
@@ -667,36 +660,18 @@ void __init default_get_smp_config(unsigned int early)
667 */ 660 */
668} 661}
669 662
670static void __init smp_reserve_bootmem(struct mpf_intel *mpf) 663static void __init smp_reserve_memory(struct mpf_intel *mpf)
671{ 664{
672 unsigned long size = get_mpc_size(mpf->physptr); 665 unsigned long size = get_mpc_size(mpf->physptr);
673#ifdef CONFIG_X86_32
674 /*
675 * We cannot access to MPC table to compute table size yet,
676 * as only few megabytes from the bottom is mapped now.
677 * PC-9800's MPC table places on the very last of physical
678 * memory; so that simply reserving PAGE_SIZE from mpf->physptr
679 * yields BUG() in reserve_bootmem.
680 * also need to make sure physptr is below than max_low_pfn
681 * we don't need reserve the area above max_low_pfn
682 */
683 unsigned long end = max_low_pfn * PAGE_SIZE;
684 666
685 if (mpf->physptr < end) { 667 reserve_early_overlap_ok(mpf->physptr, mpf->physptr+size, "MP-table mpc");
686 if (mpf->physptr + size > end)
687 size = end - mpf->physptr;
688 reserve_bootmem_generic(mpf->physptr, size, BOOTMEM_DEFAULT);
689 }
690#else
691 reserve_bootmem_generic(mpf->physptr, size, BOOTMEM_DEFAULT);
692#endif
693} 668}
694 669
695static int __init smp_scan_config(unsigned long base, unsigned long length, 670static int __init smp_scan_config(unsigned long base, unsigned long length)
696 unsigned reserve)
697{ 671{
698 unsigned int *bp = phys_to_virt(base); 672 unsigned int *bp = phys_to_virt(base);
699 struct mpf_intel *mpf; 673 struct mpf_intel *mpf;
674 unsigned long mem;
700 675
701 apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n", 676 apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n",
702 bp, length); 677 bp, length);
@@ -717,12 +692,10 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
717 printk(KERN_INFO "found SMP MP-table at [%p] %llx\n", 692 printk(KERN_INFO "found SMP MP-table at [%p] %llx\n",
718 mpf, (u64)virt_to_phys(mpf)); 693 mpf, (u64)virt_to_phys(mpf));
719 694
720 if (!reserve) 695 mem = virt_to_phys(mpf);
721 return 1; 696 reserve_early_overlap_ok(mem, mem + sizeof(*mpf), "MP-table mpf");
722 reserve_bootmem_generic(virt_to_phys(mpf), sizeof(*mpf),
723 BOOTMEM_DEFAULT);
724 if (mpf->physptr) 697 if (mpf->physptr)
725 smp_reserve_bootmem(mpf); 698 smp_reserve_memory(mpf);
726 699
727 return 1; 700 return 1;
728 } 701 }
@@ -732,7 +705,7 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
732 return 0; 705 return 0;
733} 706}
734 707
735void __init default_find_smp_config(unsigned int reserve) 708void __init default_find_smp_config(void)
736{ 709{
737 unsigned int address; 710 unsigned int address;
738 711
@@ -744,9 +717,9 @@ void __init default_find_smp_config(unsigned int reserve)
744 * 2) Scan the top 1K of base RAM 717 * 2) Scan the top 1K of base RAM
745 * 3) Scan the 64K of bios 718 * 3) Scan the 64K of bios
746 */ 719 */
747 if (smp_scan_config(0x0, 0x400, reserve) || 720 if (smp_scan_config(0x0, 0x400) ||
748 smp_scan_config(639 * 0x400, 0x400, reserve) || 721 smp_scan_config(639 * 0x400, 0x400) ||
749 smp_scan_config(0xF0000, 0x10000, reserve)) 722 smp_scan_config(0xF0000, 0x10000))
750 return; 723 return;
751 /* 724 /*
752 * If it is an SMP machine we should know now, unless the 725 * If it is an SMP machine we should know now, unless the
@@ -767,7 +740,7 @@ void __init default_find_smp_config(unsigned int reserve)
767 740
768 address = get_bios_ebda(); 741 address = get_bios_ebda();
769 if (address) 742 if (address)
770 smp_scan_config(address, 0x400, reserve); 743 smp_scan_config(address, 0x400);
771} 744}
772 745
773#ifdef CONFIG_X86_IO_APIC 746#ifdef CONFIG_X86_IO_APIC
@@ -965,9 +938,6 @@ void __init early_reserve_e820_mpc_new(void)
965{ 938{
966 if (enable_update_mptable && alloc_mptable) { 939 if (enable_update_mptable && alloc_mptable) {
967 u64 startt = 0; 940 u64 startt = 0;
968#ifdef CONFIG_X86_TRAMPOLINE
969 startt = TRAMPOLINE_BASE;
970#endif
971 mpc_new_phys = early_reserve_e820(startt, mpc_new_length, 4); 941 mpc_new_phys = early_reserve_e820(startt, mpc_new_length, 4);
972 } 942 }
973} 943}
diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c
index 3b7078abc871..0aad8670858e 100644
--- a/arch/x86/kernel/mrst.c
+++ b/arch/x86/kernel/mrst.c
@@ -10,8 +10,211 @@
10 * of the License. 10 * of the License.
11 */ 11 */
12#include <linux/init.h> 12#include <linux/init.h>
13#include <linux/kernel.h>
14#include <linux/sfi.h>
15#include <linux/irq.h>
16#include <linux/module.h>
13 17
14#include <asm/setup.h> 18#include <asm/setup.h>
19#include <asm/mpspec_def.h>
20#include <asm/hw_irq.h>
21#include <asm/apic.h>
22#include <asm/io_apic.h>
23#include <asm/mrst.h>
24#include <asm/io.h>
25#include <asm/i8259.h>
26#include <asm/apb_timer.h>
27
/*
 * Per-slot allocation flags for sfi_mtimer_array: a slot is set to 1 when
 * sfi_get_mtmr() hands it out and reset to 0 by sfi_free_mtmr().
 */
28static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM];
/* Local copy of the SFI timer table entries, filled in by sfi_parse_mtmr(). */
29static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM];
/* Number of entries in sfi_mtimer_array; 0 until the table has been parsed. */
30int sfi_mtimer_num;
31
/* Local copy of the SFI RTC table entries, filled in by sfi_parse_mrtc(). */
32struct sfi_rtc_table_entry sfi_mrtc_array[SFI_MRTC_MAX];
33EXPORT_SYMBOL_GPL(sfi_mrtc_array);
/* Number of entries in sfi_mrtc_array; 0 until the table has been parsed. */
34int sfi_mrtc_num;
35
36static inline void assign_to_mp_irq(struct mpc_intsrc *m,
37 struct mpc_intsrc *mp_irq)
38{
39 memcpy(mp_irq, m, sizeof(struct mpc_intsrc));
40}
41
42static inline int mp_irq_cmp(struct mpc_intsrc *mp_irq,
43 struct mpc_intsrc *m)
44{
45 return memcmp(mp_irq, m, sizeof(struct mpc_intsrc));
46}
47
48static void save_mp_irq(struct mpc_intsrc *m)
49{
50 int i;
51
52 for (i = 0; i < mp_irq_entries; i++) {
53 if (!mp_irq_cmp(&mp_irqs[i], m))
54 return;
55 }
56
57 assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]);
58 if (++mp_irq_entries == MAX_IRQ_SOURCES)
59 panic("Max # of irq sources exceeded!!\n");
60}
61
62/* parse all the mtimer info to a static mtimer array */
63static int __init sfi_parse_mtmr(struct sfi_table_header *table)
64{
65 struct sfi_table_simple *sb;
66 struct sfi_timer_table_entry *pentry;
67 struct mpc_intsrc mp_irq;
68 int totallen;
69
70 sb = (struct sfi_table_simple *)table;
71 if (!sfi_mtimer_num) {
72 sfi_mtimer_num = SFI_GET_NUM_ENTRIES(sb,
73 struct sfi_timer_table_entry);
74 pentry = (struct sfi_timer_table_entry *) sb->pentry;
75 totallen = sfi_mtimer_num * sizeof(*pentry);
76 memcpy(sfi_mtimer_array, pentry, totallen);
77 }
78
79 printk(KERN_INFO "SFI: MTIMER info (num = %d):\n", sfi_mtimer_num);
80 pentry = sfi_mtimer_array;
81 for (totallen = 0; totallen < sfi_mtimer_num; totallen++, pentry++) {
82 printk(KERN_INFO "timer[%d]: paddr = 0x%08x, freq = %dHz,"
83 " irq = %d\n", totallen, (u32)pentry->phys_addr,
84 pentry->freq_hz, pentry->irq);
85 if (!pentry->irq)
86 continue;
87 mp_irq.type = MP_IOAPIC;
88 mp_irq.irqtype = mp_INT;
89/* triggering mode edge bit 2-3, active high polarity bit 0-1 */
90 mp_irq.irqflag = 5;
91 mp_irq.srcbus = 0;
92 mp_irq.srcbusirq = pentry->irq; /* IRQ */
93 mp_irq.dstapic = MP_APIC_ALL;
94 mp_irq.dstirq = pentry->irq;
95 save_mp_irq(&mp_irq);
96 }
97
98 return 0;
99}
100
101struct sfi_timer_table_entry *sfi_get_mtmr(int hint)
102{
103 int i;
104 if (hint < sfi_mtimer_num) {
105 if (!sfi_mtimer_usage[hint]) {
106 pr_debug("hint taken for timer %d irq %d\n",\
107 hint, sfi_mtimer_array[hint].irq);
108 sfi_mtimer_usage[hint] = 1;
109 return &sfi_mtimer_array[hint];
110 }
111 }
112 /* take the first timer available */
113 for (i = 0; i < sfi_mtimer_num;) {
114 if (!sfi_mtimer_usage[i]) {
115 sfi_mtimer_usage[i] = 1;
116 return &sfi_mtimer_array[i];
117 }
118 i++;
119 }
120 return NULL;
121}
122
123void sfi_free_mtmr(struct sfi_timer_table_entry *mtmr)
124{
125 int i;
126 for (i = 0; i < sfi_mtimer_num;) {
127 if (mtmr->irq == sfi_mtimer_array[i].irq) {
128 sfi_mtimer_usage[i] = 0;
129 return;
130 }
131 i++;
132 }
133}
134
135/* parse all the mrtc info to a global mrtc array */
136int __init sfi_parse_mrtc(struct sfi_table_header *table)
137{
138 struct sfi_table_simple *sb;
139 struct sfi_rtc_table_entry *pentry;
140 struct mpc_intsrc mp_irq;
141
142 int totallen;
143
144 sb = (struct sfi_table_simple *)table;
145 if (!sfi_mrtc_num) {
146 sfi_mrtc_num = SFI_GET_NUM_ENTRIES(sb,
147 struct sfi_rtc_table_entry);
148 pentry = (struct sfi_rtc_table_entry *)sb->pentry;
149 totallen = sfi_mrtc_num * sizeof(*pentry);
150 memcpy(sfi_mrtc_array, pentry, totallen);
151 }
152
153 printk(KERN_INFO "SFI: RTC info (num = %d):\n", sfi_mrtc_num);
154 pentry = sfi_mrtc_array;
155 for (totallen = 0; totallen < sfi_mrtc_num; totallen++, pentry++) {
156 printk(KERN_INFO "RTC[%d]: paddr = 0x%08x, irq = %d\n",
157 totallen, (u32)pentry->phys_addr, pentry->irq);
158 mp_irq.type = MP_IOAPIC;
159 mp_irq.irqtype = mp_INT;
160 mp_irq.irqflag = 0;
161 mp_irq.srcbus = 0;
162 mp_irq.srcbusirq = pentry->irq; /* IRQ */
163 mp_irq.dstapic = MP_APIC_ALL;
164 mp_irq.dstirq = pentry->irq;
165 save_mp_irq(&mp_irq);
166 }
167 return 0;
168}
169
170/*
171 * the secondary clock in Moorestown can be APBT or LAPIC clock, default to
172 * APBT but cmdline option can also override it.
173 */
174static void __cpuinit mrst_setup_secondary_clock(void)
175{
176 /* restore default lapic clock if disabled by cmdline */
177 if (disable_apbt_percpu)
178 return setup_secondary_APIC_clock();
179 apbt_setup_secondary_clock();
180}
181
182static unsigned long __init mrst_calibrate_tsc(void)
183{
184 unsigned long flags, fast_calibrate;
185
186 local_irq_save(flags);
187 fast_calibrate = apbt_quick_calibrate();
188 local_irq_restore(flags);
189
190 if (fast_calibrate)
191 return fast_calibrate;
192
193 return 0;
194}
195
196void __init mrst_time_init(void)
197{
198 sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr);
199 pre_init_apic_IRQ0();
200 apbt_time_init();
201}
202
203void __init mrst_rtc_init(void)
204{
205 sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc);
206}
207
208/*
209 * if we use per cpu apb timer, the bootclock already setup. if we use lapic
210 * timer and one apbt timer for broadcast, we need to set up lapic boot clock.
211 */
212static void __init mrst_setup_boot_clock(void)
213{
214 pr_info("%s: per cpu apbt flag %d \n", __func__, disable_apbt_percpu);
215 if (disable_apbt_percpu)
216 setup_boot_APIC_clock();
217};
15 218
16/* 219/*
17 * Moorestown specific x86_init function overrides and early setup 220 * Moorestown specific x86_init function overrides and early setup
@@ -21,4 +224,17 @@ void __init x86_mrst_early_setup(void)
21{ 224{
22 x86_init.resources.probe_roms = x86_init_noop; 225 x86_init.resources.probe_roms = x86_init_noop;
23 x86_init.resources.reserve_resources = x86_init_noop; 226 x86_init.resources.reserve_resources = x86_init_noop;
227
228 x86_init.timers.timer_init = mrst_time_init;
229 x86_init.timers.setup_percpu_clockev = mrst_setup_boot_clock;
230
231 x86_init.irqs.pre_vector_init = x86_init_noop;
232
233 x86_cpuinit.setup_percpu_clockev = mrst_setup_secondary_clock;
234
235 x86_platform.calibrate_tsc = mrst_calibrate_tsc;
236 x86_init.pci.init = pci_mrst_init;
237 x86_init.pci.fixup_irqs = x86_init_noop;
238
239 legacy_pic = &null_legacy_pic;
24} 240}
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 6a3cefc7dda1..4d4468e9f47c 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -37,6 +37,7 @@
37#include <linux/cpu.h> 37#include <linux/cpu.h>
38#include <linux/notifier.h> 38#include <linux/notifier.h>
39#include <linux/uaccess.h> 39#include <linux/uaccess.h>
40#include <linux/gfp.h>
40 41
41#include <asm/processor.h> 42#include <asm/processor.h>
42#include <asm/msr.h> 43#include <asm/msr.h>
@@ -172,23 +173,18 @@ static long msr_ioctl(struct file *file, unsigned int ioc, unsigned long arg)
172 173
173static int msr_open(struct inode *inode, struct file *file) 174static int msr_open(struct inode *inode, struct file *file)
174{ 175{
175 unsigned int cpu = iminor(file->f_path.dentry->d_inode); 176 unsigned int cpu;
176 struct cpuinfo_x86 *c = &cpu_data(cpu); 177 struct cpuinfo_x86 *c;
177 int ret = 0;
178 178
179 lock_kernel();
180 cpu = iminor(file->f_path.dentry->d_inode); 179 cpu = iminor(file->f_path.dentry->d_inode);
180 if (cpu >= nr_cpu_ids || !cpu_online(cpu))
181 return -ENXIO; /* No such CPU */
181 182
182 if (cpu >= nr_cpu_ids || !cpu_online(cpu)) {
183 ret = -ENXIO; /* No such CPU */
184 goto out;
185 }
186 c = &cpu_data(cpu); 183 c = &cpu_data(cpu);
187 if (!cpu_has(c, X86_FEATURE_MSR)) 184 if (!cpu_has(c, X86_FEATURE_MSR))
188 ret = -EIO; /* MSR not supported */ 185 return -EIO; /* MSR not supported */
189out: 186
190 unlock_kernel(); 187 return 0;
191 return ret;
192} 188}
193 189
194/* 190/*
@@ -251,7 +247,7 @@ static int __init msr_init(void)
251 int i, err = 0; 247 int i, err = 0;
252 i = 0; 248 i = 0;
253 249
254 if (register_chrdev(MSR_MAJOR, "cpu/msr", &msr_fops)) { 250 if (__register_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr", &msr_fops)) {
255 printk(KERN_ERR "msr: unable to get major %d for msr\n", 251 printk(KERN_ERR "msr: unable to get major %d for msr\n",
256 MSR_MAJOR); 252 MSR_MAJOR);
257 err = -EBUSY; 253 err = -EBUSY;
@@ -279,7 +275,7 @@ out_class:
279 msr_device_destroy(i); 275 msr_device_destroy(i);
280 class_destroy(msr_class); 276 class_destroy(msr_class);
281out_chrdev: 277out_chrdev:
282 unregister_chrdev(MSR_MAJOR, "cpu/msr"); 278 __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr");
283out: 279out:
284 return err; 280 return err;
285} 281}
@@ -290,7 +286,7 @@ static void __exit msr_exit(void)
290 for_each_online_cpu(cpu) 286 for_each_online_cpu(cpu)
291 msr_device_destroy(cpu); 287 msr_device_destroy(cpu);
292 class_destroy(msr_class); 288 class_destroy(msr_class);
293 unregister_chrdev(MSR_MAJOR, "cpu/msr"); 289 __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr");
294 unregister_hotcpu_notifier(&msr_class_cpu_notifier); 290 unregister_hotcpu_notifier(&msr_class_cpu_notifier);
295} 291}
296 292
diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c
index 4006c522adc7..8297160c41b3 100644
--- a/arch/x86/kernel/olpc.c
+++ b/arch/x86/kernel/olpc.c
@@ -17,7 +17,9 @@
17#include <linux/spinlock.h> 17#include <linux/spinlock.h>
18#include <linux/io.h> 18#include <linux/io.h>
19#include <linux/string.h> 19#include <linux/string.h>
20
20#include <asm/geode.h> 21#include <asm/geode.h>
22#include <asm/setup.h>
21#include <asm/olpc.h> 23#include <asm/olpc.h>
22 24
23#ifdef CONFIG_OPEN_FIRMWARE 25#ifdef CONFIG_OPEN_FIRMWARE
@@ -212,7 +214,7 @@ static int __init olpc_init(void)
212 unsigned char *romsig; 214 unsigned char *romsig;
213 215
214 /* The ioremap check is dangerous; limit what we run it on */ 216 /* The ioremap check is dangerous; limit what we run it on */
215 if (!is_geode() || geode_has_vsa2()) 217 if (!is_geode() || cs5535_has_vsa2())
216 return 0; 218 return 0;
217 219
218 spin_lock_init(&ec_lock); 220 spin_lock_init(&ec_lock);
@@ -243,9 +245,11 @@ static int __init olpc_init(void)
243 olpc_ec_cmd(EC_FIRMWARE_REV, NULL, 0, 245 olpc_ec_cmd(EC_FIRMWARE_REV, NULL, 0,
244 (unsigned char *) &olpc_platform_info.ecver, 1); 246 (unsigned char *) &olpc_platform_info.ecver, 1);
245 247
246 /* check to see if the VSA exists */ 248#ifdef CONFIG_PCI_OLPC
247 if (geode_has_vsa2()) 249 /* If the VSA exists let it emulate PCI, if not emulate in kernel */
248 olpc_platform_info.flags |= OLPC_F_VSA; 250 if (!cs5535_has_vsa2())
251 x86_init.pci.arch_init = pci_olpc_init;
252#endif
249 253
250 printk(KERN_INFO "OLPC board revision %s%X (EC=%x)\n", 254 printk(KERN_INFO "OLPC board revision %s%X (EC=%x)\n",
251 ((olpc_platform_info.boardrev & 0xf) < 8) ? "pre" : "", 255 ((olpc_platform_info.boardrev & 0xf) < 8) ? "pre" : "",
diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c
index 3a7c5a44082e..676b8c77a976 100644
--- a/arch/x86/kernel/paravirt-spinlocks.c
+++ b/arch/x86/kernel/paravirt-spinlocks.c
@@ -8,9 +8,9 @@
8#include <asm/paravirt.h> 8#include <asm/paravirt.h>
9 9
10static inline void 10static inline void
11default_spin_lock_flags(raw_spinlock_t *lock, unsigned long flags) 11default_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags)
12{ 12{
13 __raw_spin_lock(lock); 13 arch_spin_lock(lock);
14} 14}
15 15
16struct pv_lock_ops pv_lock_ops = { 16struct pv_lock_ops pv_lock_ops = {
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 1b1739d16310..1db183ed7c01 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -428,10 +428,6 @@ struct pv_mmu_ops pv_mmu_ops = {
428 .ptep_modify_prot_start = __ptep_modify_prot_start, 428 .ptep_modify_prot_start = __ptep_modify_prot_start,
429 .ptep_modify_prot_commit = __ptep_modify_prot_commit, 429 .ptep_modify_prot_commit = __ptep_modify_prot_commit,
430 430
431#ifdef CONFIG_HIGHPTE
432 .kmap_atomic_pte = kmap_atomic,
433#endif
434
435#if PAGETABLE_LEVELS >= 3 431#if PAGETABLE_LEVELS >= 3
436#ifdef CONFIG_X86_PAE 432#ifdef CONFIG_X86_PAE
437 .set_pte_atomic = native_set_pte_atomic, 433 .set_pte_atomic = native_set_pte_atomic,
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 971a3bec47a8..fb99f7edb341 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -31,7 +31,7 @@
31#include <linux/string.h> 31#include <linux/string.h>
32#include <linux/crash_dump.h> 32#include <linux/crash_dump.h>
33#include <linux/dma-mapping.h> 33#include <linux/dma-mapping.h>
34#include <linux/bitops.h> 34#include <linux/bitmap.h>
35#include <linux/pci_ids.h> 35#include <linux/pci_ids.h>
36#include <linux/pci.h> 36#include <linux/pci.h>
37#include <linux/delay.h> 37#include <linux/delay.h>
@@ -46,6 +46,7 @@
46#include <asm/dma.h> 46#include <asm/dma.h>
47#include <asm/rio.h> 47#include <asm/rio.h>
48#include <asm/bios_ebda.h> 48#include <asm/bios_ebda.h>
49#include <asm/x86_init.h>
49 50
50#ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT 51#ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT
51int use_calgary __read_mostly = 1; 52int use_calgary __read_mostly = 1;
@@ -211,7 +212,7 @@ static void iommu_range_reserve(struct iommu_table *tbl,
211 212
212 spin_lock_irqsave(&tbl->it_lock, flags); 213 spin_lock_irqsave(&tbl->it_lock, flags);
213 214
214 iommu_area_reserve(tbl->it_map, index, npages); 215 bitmap_set(tbl->it_map, index, npages);
215 216
216 spin_unlock_irqrestore(&tbl->it_lock, flags); 217 spin_unlock_irqrestore(&tbl->it_lock, flags);
217} 218}
@@ -244,7 +245,7 @@ static unsigned long iommu_range_alloc(struct device *dev,
244 if (panic_on_overflow) 245 if (panic_on_overflow)
245 panic("Calgary: fix the allocator.\n"); 246 panic("Calgary: fix the allocator.\n");
246 else 247 else
247 return bad_dma_address; 248 return DMA_ERROR_CODE;
248 } 249 }
249 } 250 }
250 251
@@ -260,12 +261,15 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
260 void *vaddr, unsigned int npages, int direction) 261 void *vaddr, unsigned int npages, int direction)
261{ 262{
262 unsigned long entry; 263 unsigned long entry;
263 dma_addr_t ret = bad_dma_address; 264 dma_addr_t ret;
264 265
265 entry = iommu_range_alloc(dev, tbl, npages); 266 entry = iommu_range_alloc(dev, tbl, npages);
266 267
267 if (unlikely(entry == bad_dma_address)) 268 if (unlikely(entry == DMA_ERROR_CODE)) {
268 goto error; 269 printk(KERN_WARNING "Calgary: failed to allocate %u pages in "
270 "iommu %p\n", npages, tbl);
271 return DMA_ERROR_CODE;
272 }
269 273
270 /* set the return dma address */ 274 /* set the return dma address */
271 ret = (entry << PAGE_SHIFT) | ((unsigned long)vaddr & ~PAGE_MASK); 275 ret = (entry << PAGE_SHIFT) | ((unsigned long)vaddr & ~PAGE_MASK);
@@ -273,13 +277,7 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
273 /* put the TCEs in the HW table */ 277 /* put the TCEs in the HW table */
274 tce_build(tbl, entry, npages, (unsigned long)vaddr & PAGE_MASK, 278 tce_build(tbl, entry, npages, (unsigned long)vaddr & PAGE_MASK,
275 direction); 279 direction);
276
277 return ret; 280 return ret;
278
279error:
280 printk(KERN_WARNING "Calgary: failed to allocate %u pages in "
281 "iommu %p\n", npages, tbl);
282 return bad_dma_address;
283} 281}
284 282
285static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, 283static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
@@ -290,8 +288,8 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
290 unsigned long flags; 288 unsigned long flags;
291 289
292 /* were we called with bad_dma_address? */ 290 /* were we called with bad_dma_address? */
293 badend = bad_dma_address + (EMERGENCY_PAGES * PAGE_SIZE); 291 badend = DMA_ERROR_CODE + (EMERGENCY_PAGES * PAGE_SIZE);
294 if (unlikely((dma_addr >= bad_dma_address) && (dma_addr < badend))) { 292 if (unlikely((dma_addr >= DMA_ERROR_CODE) && (dma_addr < badend))) {
295 WARN(1, KERN_ERR "Calgary: driver tried unmapping bad DMA " 293 WARN(1, KERN_ERR "Calgary: driver tried unmapping bad DMA "
296 "address 0x%Lx\n", dma_addr); 294 "address 0x%Lx\n", dma_addr);
297 return; 295 return;
@@ -305,7 +303,7 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
305 303
306 spin_lock_irqsave(&tbl->it_lock, flags); 304 spin_lock_irqsave(&tbl->it_lock, flags);
307 305
308 iommu_area_free(tbl->it_map, entry, npages); 306 bitmap_clear(tbl->it_map, entry, npages);
309 307
310 spin_unlock_irqrestore(&tbl->it_lock, flags); 308 spin_unlock_irqrestore(&tbl->it_lock, flags);
311} 309}
@@ -318,13 +316,15 @@ static inline struct iommu_table *find_iommu_table(struct device *dev)
318 316
319 pdev = to_pci_dev(dev); 317 pdev = to_pci_dev(dev);
320 318
319 /* search up the device tree for an iommu */
321 pbus = pdev->bus; 320 pbus = pdev->bus;
322 321 do {
323 /* is the device behind a bridge? Look for the root bus */ 322 tbl = pci_iommu(pbus);
324 while (pbus->parent) 323 if (tbl && tbl->it_busno == pbus->number)
324 break;
325 tbl = NULL;
325 pbus = pbus->parent; 326 pbus = pbus->parent;
326 327 } while (pbus);
327 tbl = pci_iommu(pbus);
328 328
329 BUG_ON(tbl && (tbl->it_busno != pbus->number)); 329 BUG_ON(tbl && (tbl->it_busno != pbus->number));
330 330
@@ -373,7 +373,7 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
373 npages = iommu_num_pages(vaddr, s->length, PAGE_SIZE); 373 npages = iommu_num_pages(vaddr, s->length, PAGE_SIZE);
374 374
375 entry = iommu_range_alloc(dev, tbl, npages); 375 entry = iommu_range_alloc(dev, tbl, npages);
376 if (entry == bad_dma_address) { 376 if (entry == DMA_ERROR_CODE) {
377 /* makes sure unmap knows to stop */ 377 /* makes sure unmap knows to stop */
378 s->dma_length = 0; 378 s->dma_length = 0;
379 goto error; 379 goto error;
@@ -391,7 +391,7 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
391error: 391error:
392 calgary_unmap_sg(dev, sg, nelems, dir, NULL); 392 calgary_unmap_sg(dev, sg, nelems, dir, NULL);
393 for_each_sg(sg, s, nelems, i) { 393 for_each_sg(sg, s, nelems, i) {
394 sg->dma_address = bad_dma_address; 394 sg->dma_address = DMA_ERROR_CODE;
395 sg->dma_length = 0; 395 sg->dma_length = 0;
396 } 396 }
397 return 0; 397 return 0;
@@ -446,7 +446,7 @@ static void* calgary_alloc_coherent(struct device *dev, size_t size,
446 446
447 /* set up tces to cover the allocated range */ 447 /* set up tces to cover the allocated range */
448 mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL); 448 mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL);
449 if (mapping == bad_dma_address) 449 if (mapping == DMA_ERROR_CODE)
450 goto free; 450 goto free;
451 *dma_handle = mapping; 451 *dma_handle = mapping;
452 return ret; 452 return ret;
@@ -727,7 +727,7 @@ static void __init calgary_reserve_regions(struct pci_dev *dev)
727 struct iommu_table *tbl = pci_iommu(dev->bus); 727 struct iommu_table *tbl = pci_iommu(dev->bus);
728 728
729 /* reserve EMERGENCY_PAGES from bad_dma_address and up */ 729 /* reserve EMERGENCY_PAGES from bad_dma_address and up */
730 iommu_range_reserve(tbl, bad_dma_address, EMERGENCY_PAGES); 730 iommu_range_reserve(tbl, DMA_ERROR_CODE, EMERGENCY_PAGES);
731 731
732 /* avoid the BIOS/VGA first 640KB-1MB region */ 732 /* avoid the BIOS/VGA first 640KB-1MB region */
733 /* for CalIOC2 - avoid the entire first MB */ 733 /* for CalIOC2 - avoid the entire first MB */
@@ -1309,7 +1309,7 @@ static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl)
1309/* 1309/*
1310 * get_tce_space_from_tar(): 1310 * get_tce_space_from_tar():
1311 * Function for kdump case. Get the tce tables from first kernel 1311 * Function for kdump case. Get the tce tables from first kernel
1312 * by reading the contents of the base adress register of calgary iommu 1312 * by reading the contents of the base address register of calgary iommu
1313 */ 1313 */
1314static void __init get_tce_space_from_tar(void) 1314static void __init get_tce_space_from_tar(void)
1315{ 1315{
@@ -1344,6 +1344,23 @@ static void __init get_tce_space_from_tar(void)
1344 return; 1344 return;
1345} 1345}
1346 1346
1347static int __init calgary_iommu_init(void)
1348{
1349 int ret;
1350
1351 /* ok, we're trying to use Calgary - let's roll */
1352 printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n");
1353
1354 ret = calgary_init();
1355 if (ret) {
1356 printk(KERN_ERR "PCI-DMA: Calgary init failed %d, "
1357 "falling back to no_iommu\n", ret);
1358 return ret;
1359 }
1360
1361 return 0;
1362}
1363
1347void __init detect_calgary(void) 1364void __init detect_calgary(void)
1348{ 1365{
1349 int bus; 1366 int bus;
@@ -1357,7 +1374,7 @@ void __init detect_calgary(void)
1357 * if the user specified iommu=off or iommu=soft or we found 1374 * if the user specified iommu=off or iommu=soft or we found
1358 * another HW IOMMU already, bail out. 1375 * another HW IOMMU already, bail out.
1359 */ 1376 */
1360 if (swiotlb || no_iommu || iommu_detected) 1377 if (no_iommu || iommu_detected)
1361 return; 1378 return;
1362 1379
1363 if (!use_calgary) 1380 if (!use_calgary)
@@ -1442,9 +1459,7 @@ void __init detect_calgary(void)
1442 printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d\n", 1459 printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d\n",
1443 specified_table_size); 1460 specified_table_size);
1444 1461
1445 /* swiotlb for devices that aren't behind the Calgary. */ 1462 x86_init.iommu.iommu_init = calgary_iommu_init;
1446 if (max_pfn > MAX_DMA32_PFN)
1447 swiotlb = 1;
1448 } 1463 }
1449 return; 1464 return;
1450 1465
@@ -1457,35 +1472,6 @@ cleanup:
1457 } 1472 }
1458} 1473}
1459 1474
1460int __init calgary_iommu_init(void)
1461{
1462 int ret;
1463
1464 if (no_iommu || (swiotlb && !calgary_detected))
1465 return -ENODEV;
1466
1467 if (!calgary_detected)
1468 return -ENODEV;
1469
1470 /* ok, we're trying to use Calgary - let's roll */
1471 printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n");
1472
1473 ret = calgary_init();
1474 if (ret) {
1475 printk(KERN_ERR "PCI-DMA: Calgary init failed %d, "
1476 "falling back to no_iommu\n", ret);
1477 return ret;
1478 }
1479
1480 force_iommu = 1;
1481 bad_dma_address = 0x0;
1482 /* dma_ops is set to swiotlb or nommu */
1483 if (!dma_ops)
1484 dma_ops = &nommu_dma_ops;
1485
1486 return 0;
1487}
1488
1489static int __init calgary_parse_options(char *p) 1475static int __init calgary_parse_options(char *p)
1490{ 1476{
1491 unsigned int bridge; 1477 unsigned int bridge;
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index a6e804d16c35..4b7e3d8b01dd 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -2,6 +2,7 @@
2#include <linux/dma-debug.h> 2#include <linux/dma-debug.h>
3#include <linux/dmar.h> 3#include <linux/dmar.h>
4#include <linux/bootmem.h> 4#include <linux/bootmem.h>
5#include <linux/gfp.h>
5#include <linux/pci.h> 6#include <linux/pci.h>
6#include <linux/kmemleak.h> 7#include <linux/kmemleak.h>
7 8
@@ -11,10 +12,11 @@
11#include <asm/gart.h> 12#include <asm/gart.h>
12#include <asm/calgary.h> 13#include <asm/calgary.h>
13#include <asm/amd_iommu.h> 14#include <asm/amd_iommu.h>
15#include <asm/x86_init.h>
14 16
15static int forbid_dac __read_mostly; 17static int forbid_dac __read_mostly;
16 18
17struct dma_map_ops *dma_ops; 19struct dma_map_ops *dma_ops = &nommu_dma_ops;
18EXPORT_SYMBOL(dma_ops); 20EXPORT_SYMBOL(dma_ops);
19 21
20static int iommu_sac_force __read_mostly; 22static int iommu_sac_force __read_mostly;
@@ -37,14 +39,11 @@ int iommu_detected __read_mostly = 0;
37 * This variable becomes 1 if iommu=pt is passed on the kernel command line. 39 * This variable becomes 1 if iommu=pt is passed on the kernel command line.
38 * If this variable is 1, IOMMU implementations do no DMA translation for 40 * If this variable is 1, IOMMU implementations do no DMA translation for
39 * devices and allow every device to access to whole physical memory. This is 41 * devices and allow every device to access to whole physical memory. This is
40 * useful if a user want to use an IOMMU only for KVM device assignment to 42 * useful if a user wants to use an IOMMU only for KVM device assignment to
41 * guests and not for driver dma translation. 43 * guests and not for driver dma translation.
42 */ 44 */
43int iommu_pass_through __read_mostly; 45int iommu_pass_through __read_mostly;
44 46
45dma_addr_t bad_dma_address __read_mostly = 0;
46EXPORT_SYMBOL(bad_dma_address);
47
48/* Dummy device used for NULL arguments (normally ISA). */ 47/* Dummy device used for NULL arguments (normally ISA). */
49struct device x86_dma_fallback_dev = { 48struct device x86_dma_fallback_dev = {
50 .init_name = "fallback device", 49 .init_name = "fallback device",
@@ -67,7 +66,7 @@ int dma_set_mask(struct device *dev, u64 mask)
67} 66}
68EXPORT_SYMBOL(dma_set_mask); 67EXPORT_SYMBOL(dma_set_mask);
69 68
70#ifdef CONFIG_X86_64 69#if defined(CONFIG_X86_64) && !defined(CONFIG_NUMA)
71static __initdata void *dma32_bootmem_ptr; 70static __initdata void *dma32_bootmem_ptr;
72static unsigned long dma32_bootmem_size __initdata = (128ULL<<20); 71static unsigned long dma32_bootmem_size __initdata = (128ULL<<20);
73 72
@@ -118,27 +117,33 @@ static void __init dma32_free_bootmem(void)
118 dma32_bootmem_ptr = NULL; 117 dma32_bootmem_ptr = NULL;
119 dma32_bootmem_size = 0; 118 dma32_bootmem_size = 0;
120} 119}
120#else
121void __init dma32_reserve_bootmem(void)
122{
123}
124static void __init dma32_free_bootmem(void)
125{
126}
127
121#endif 128#endif
122 129
123void __init pci_iommu_alloc(void) 130void __init pci_iommu_alloc(void)
124{ 131{
125#ifdef CONFIG_X86_64
126 /* free the range so iommu could get some range less than 4G */ 132 /* free the range so iommu could get some range less than 4G */
127 dma32_free_bootmem(); 133 dma32_free_bootmem();
128#endif
129 134
130 /* 135 if (pci_swiotlb_detect())
131 * The order of these functions is important for 136 goto out;
132 * fall-back/fail-over reasons 137
133 */
134 gart_iommu_hole_init(); 138 gart_iommu_hole_init();
135 139
136 detect_calgary(); 140 detect_calgary();
137 141
138 detect_intel_iommu(); 142 detect_intel_iommu();
139 143
144 /* needs to be called after gart_iommu_hole_init */
140 amd_iommu_detect(); 145 amd_iommu_detect();
141 146out:
142 pci_swiotlb_init(); 147 pci_swiotlb_init();
143} 148}
144 149
@@ -214,7 +219,7 @@ static __init int iommu_setup(char *p)
214 if (!strncmp(p, "allowdac", 8)) 219 if (!strncmp(p, "allowdac", 8))
215 forbid_dac = 0; 220 forbid_dac = 0;
216 if (!strncmp(p, "nodac", 5)) 221 if (!strncmp(p, "nodac", 5))
217 forbid_dac = -1; 222 forbid_dac = 1;
218 if (!strncmp(p, "usedac", 6)) { 223 if (!strncmp(p, "usedac", 6)) {
219 forbid_dac = -1; 224 forbid_dac = -1;
220 return 1; 225 return 1;
@@ -289,25 +294,17 @@ static int __init pci_iommu_init(void)
289#ifdef CONFIG_PCI 294#ifdef CONFIG_PCI
290 dma_debug_add_bus(&pci_bus_type); 295 dma_debug_add_bus(&pci_bus_type);
291#endif 296#endif
297 x86_init.iommu.iommu_init();
292 298
293 calgary_iommu_init(); 299 if (swiotlb) {
294 300 printk(KERN_INFO "PCI-DMA: "
295 intel_iommu_init(); 301 "Using software bounce buffering for IO (SWIOTLB)\n");
296 302 swiotlb_print_info();
297 amd_iommu_init(); 303 } else
304 swiotlb_free();
298 305
299 gart_iommu_init();
300
301 no_iommu_init();
302 return 0; 306 return 0;
303} 307}
304
305void pci_iommu_shutdown(void)
306{
307 gart_iommu_shutdown();
308
309 amd_iommu_shutdown();
310}
311/* Must execute after PCI subsystem */ 308/* Must execute after PCI subsystem */
312rootfs_initcall(pci_iommu_init); 309rootfs_initcall(pci_iommu_init);
313 310
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index a7f1b64f86e0..0f7f130caa67 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -23,12 +23,13 @@
23#include <linux/module.h> 23#include <linux/module.h>
24#include <linux/topology.h> 24#include <linux/topology.h>
25#include <linux/interrupt.h> 25#include <linux/interrupt.h>
26#include <linux/bitops.h> 26#include <linux/bitmap.h>
27#include <linux/kdebug.h> 27#include <linux/kdebug.h>
28#include <linux/scatterlist.h> 28#include <linux/scatterlist.h>
29#include <linux/iommu-helper.h> 29#include <linux/iommu-helper.h>
30#include <linux/sysdev.h> 30#include <linux/sysdev.h>
31#include <linux/io.h> 31#include <linux/io.h>
32#include <linux/gfp.h>
32#include <asm/atomic.h> 33#include <asm/atomic.h>
33#include <asm/mtrr.h> 34#include <asm/mtrr.h>
34#include <asm/pgtable.h> 35#include <asm/pgtable.h>
@@ -39,6 +40,7 @@
39#include <asm/swiotlb.h> 40#include <asm/swiotlb.h>
40#include <asm/dma.h> 41#include <asm/dma.h>
41#include <asm/k8.h> 42#include <asm/k8.h>
43#include <asm/x86_init.h>
42 44
43static unsigned long iommu_bus_base; /* GART remapping area (physical) */ 45static unsigned long iommu_bus_base; /* GART remapping area (physical) */
44static unsigned long iommu_size; /* size of remapping area bytes */ 46static unsigned long iommu_size; /* size of remapping area bytes */
@@ -46,6 +48,8 @@ static unsigned long iommu_pages; /* .. and in pages */
46 48
47static u32 *iommu_gatt_base; /* Remapping table */ 49static u32 *iommu_gatt_base; /* Remapping table */
48 50
51static dma_addr_t bad_dma_addr;
52
49/* 53/*
50 * If this is disabled the IOMMU will use an optimized flushing strategy 54 * If this is disabled the IOMMU will use an optimized flushing strategy
51 * of only flushing when an mapping is reused. With it true the GART is 55 * of only flushing when an mapping is reused. With it true the GART is
@@ -92,7 +96,7 @@ static unsigned long alloc_iommu(struct device *dev, int size,
92 96
93 base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev), 97 base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev),
94 PAGE_SIZE) >> PAGE_SHIFT; 98 PAGE_SIZE) >> PAGE_SHIFT;
95 boundary_size = ALIGN((unsigned long long)dma_get_seg_boundary(dev) + 1, 99 boundary_size = ALIGN((u64)dma_get_seg_boundary(dev) + 1,
96 PAGE_SIZE) >> PAGE_SHIFT; 100 PAGE_SIZE) >> PAGE_SHIFT;
97 101
98 spin_lock_irqsave(&iommu_bitmap_lock, flags); 102 spin_lock_irqsave(&iommu_bitmap_lock, flags);
@@ -123,7 +127,7 @@ static void free_iommu(unsigned long offset, int size)
123 unsigned long flags; 127 unsigned long flags;
124 128
125 spin_lock_irqsave(&iommu_bitmap_lock, flags); 129 spin_lock_irqsave(&iommu_bitmap_lock, flags);
126 iommu_area_free(iommu_gart_bitmap, offset, size); 130 bitmap_clear(iommu_gart_bitmap, offset, size);
127 if (offset >= next_bit) 131 if (offset >= next_bit)
128 next_bit = offset + size; 132 next_bit = offset + size;
129 spin_unlock_irqrestore(&iommu_bitmap_lock, flags); 133 spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
@@ -216,7 +220,7 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
216 if (panic_on_overflow) 220 if (panic_on_overflow)
217 panic("dma_map_area overflow %lu bytes\n", size); 221 panic("dma_map_area overflow %lu bytes\n", size);
218 iommu_full(dev, size, dir); 222 iommu_full(dev, size, dir);
219 return bad_dma_address; 223 return bad_dma_addr;
220 } 224 }
221 225
222 for (i = 0; i < npages; i++) { 226 for (i = 0; i < npages; i++) {
@@ -294,7 +298,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
294 int i; 298 int i;
295 299
296#ifdef CONFIG_IOMMU_DEBUG 300#ifdef CONFIG_IOMMU_DEBUG
297 printk(KERN_DEBUG "dma_map_sg overflow\n"); 301 pr_debug("dma_map_sg overflow\n");
298#endif 302#endif
299 303
300 for_each_sg(sg, s, nents, i) { 304 for_each_sg(sg, s, nents, i) {
@@ -302,7 +306,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
302 306
303 if (nonforced_iommu(dev, addr, s->length)) { 307 if (nonforced_iommu(dev, addr, s->length)) {
304 addr = dma_map_area(dev, addr, s->length, dir, 0); 308 addr = dma_map_area(dev, addr, s->length, dir, 0);
305 if (addr == bad_dma_address) { 309 if (addr == bad_dma_addr) {
306 if (i > 0) 310 if (i > 0)
307 gart_unmap_sg(dev, sg, i, dir, NULL); 311 gart_unmap_sg(dev, sg, i, dir, NULL);
308 nents = 0; 312 nents = 0;
@@ -389,12 +393,14 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents,
389 if (!dev) 393 if (!dev)
390 dev = &x86_dma_fallback_dev; 394 dev = &x86_dma_fallback_dev;
391 395
392 out = 0; 396 out = 0;
393 start = 0; 397 start = 0;
394 start_sg = sgmap = sg; 398 start_sg = sg;
395 seg_size = 0; 399 sgmap = sg;
396 max_seg_size = dma_get_max_seg_size(dev); 400 seg_size = 0;
397 ps = NULL; /* shut up gcc */ 401 max_seg_size = dma_get_max_seg_size(dev);
402 ps = NULL; /* shut up gcc */
403
398 for_each_sg(sg, s, nents, i) { 404 for_each_sg(sg, s, nents, i) {
399 dma_addr_t addr = sg_phys(s); 405 dma_addr_t addr = sg_phys(s);
400 406
@@ -417,11 +423,12 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents,
417 sgmap, pages, need) < 0) 423 sgmap, pages, need) < 0)
418 goto error; 424 goto error;
419 out++; 425 out++;
420 seg_size = 0; 426
421 sgmap = sg_next(sgmap); 427 seg_size = 0;
422 pages = 0; 428 sgmap = sg_next(sgmap);
423 start = i; 429 pages = 0;
424 start_sg = s; 430 start = i;
431 start_sg = s;
425 } 432 }
426 } 433 }
427 434
@@ -455,7 +462,7 @@ error:
455 462
456 iommu_full(dev, pages << PAGE_SHIFT, dir); 463 iommu_full(dev, pages << PAGE_SHIFT, dir);
457 for_each_sg(sg, s, nents, i) 464 for_each_sg(sg, s, nents, i)
458 s->dma_address = bad_dma_address; 465 s->dma_address = bad_dma_addr;
459 return 0; 466 return 0;
460} 467}
461 468
@@ -479,7 +486,7 @@ gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr,
479 DMA_BIDIRECTIONAL, align_mask); 486 DMA_BIDIRECTIONAL, align_mask);
480 487
481 flush_gart(); 488 flush_gart();
482 if (paddr != bad_dma_address) { 489 if (paddr != bad_dma_addr) {
483 *dma_addr = paddr; 490 *dma_addr = paddr;
484 return page_address(page); 491 return page_address(page);
485 } 492 }
@@ -499,6 +506,11 @@ gart_free_coherent(struct device *dev, size_t size, void *vaddr,
499 free_pages((unsigned long)vaddr, get_order(size)); 506 free_pages((unsigned long)vaddr, get_order(size));
500} 507}
501 508
509static int gart_mapping_error(struct device *dev, dma_addr_t dma_addr)
510{
511 return (dma_addr == bad_dma_addr);
512}
513
502static int no_agp; 514static int no_agp;
503 515
504static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) 516static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size)
@@ -515,7 +527,7 @@ static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size)
515 iommu_size -= round_up(a, PMD_PAGE_SIZE) - a; 527 iommu_size -= round_up(a, PMD_PAGE_SIZE) - a;
516 528
517 if (iommu_size < 64*1024*1024) { 529 if (iommu_size < 64*1024*1024) {
518 printk(KERN_WARNING 530 pr_warning(
519 "PCI-DMA: Warning: Small IOMMU %luMB." 531 "PCI-DMA: Warning: Small IOMMU %luMB."
520 " Consider increasing the AGP aperture in BIOS\n", 532 " Consider increasing the AGP aperture in BIOS\n",
521 iommu_size >> 20); 533 iommu_size >> 20);
@@ -553,6 +565,9 @@ static void enable_gart_translations(void)
553 565
554 enable_gart_translation(dev, __pa(agp_gatt_table)); 566 enable_gart_translation(dev, __pa(agp_gatt_table));
555 } 567 }
568
569 /* Flush the GART-TLB to remove stale entries */
570 k8_flush_garts();
556} 571}
557 572
558/* 573/*
@@ -570,28 +585,32 @@ void set_up_gart_resume(u32 aper_order, u32 aper_alloc)
570 aperture_alloc = aper_alloc; 585 aperture_alloc = aper_alloc;
571} 586}
572 587
573static int gart_resume(struct sys_device *dev) 588static void gart_fixup_northbridges(struct sys_device *dev)
574{ 589{
575 printk(KERN_INFO "PCI-DMA: Resuming GART IOMMU\n"); 590 int i;
576 591
577 if (fix_up_north_bridges) { 592 if (!fix_up_north_bridges)
578 int i; 593 return;
579 594
580 printk(KERN_INFO "PCI-DMA: Restoring GART aperture settings\n"); 595 pr_info("PCI-DMA: Restoring GART aperture settings\n");
581 596
582 for (i = 0; i < num_k8_northbridges; i++) { 597 for (i = 0; i < num_k8_northbridges; i++) {
583 struct pci_dev *dev = k8_northbridges[i]; 598 struct pci_dev *dev = k8_northbridges[i];
584 599
585 /* 600 /*
586 * Don't enable translations just yet. That is the next 601 * Don't enable translations just yet. That is the next
587 * step. Restore the pre-suspend aperture settings. 602 * step. Restore the pre-suspend aperture settings.
588 */ 603 */
589 pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, 604 pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, aperture_order << 1);
590 aperture_order << 1); 605 pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE, aperture_alloc >> 25);
591 pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE,
592 aperture_alloc >> 25);
593 }
594 } 606 }
607}
608
609static int gart_resume(struct sys_device *dev)
610{
611 pr_info("PCI-DMA: Resuming GART IOMMU\n");
612
613 gart_fixup_northbridges(dev);
595 614
596 enable_gart_translations(); 615 enable_gart_translations();
597 616
@@ -604,15 +623,14 @@ static int gart_suspend(struct sys_device *dev, pm_message_t state)
604} 623}
605 624
606static struct sysdev_class gart_sysdev_class = { 625static struct sysdev_class gart_sysdev_class = {
607 .name = "gart", 626 .name = "gart",
608 .suspend = gart_suspend, 627 .suspend = gart_suspend,
609 .resume = gart_resume, 628 .resume = gart_resume,
610 629
611}; 630};
612 631
613static struct sys_device device_gart = { 632static struct sys_device device_gart = {
614 .id = 0, 633 .cls = &gart_sysdev_class,
615 .cls = &gart_sysdev_class,
616}; 634};
617 635
618/* 636/*
@@ -627,7 +645,8 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
627 void *gatt; 645 void *gatt;
628 int i, error; 646 int i, error;
629 647
630 printk(KERN_INFO "PCI-DMA: Disabling AGP.\n"); 648 pr_info("PCI-DMA: Disabling AGP.\n");
649
631 aper_size = aper_base = info->aper_size = 0; 650 aper_size = aper_base = info->aper_size = 0;
632 dev = NULL; 651 dev = NULL;
633 for (i = 0; i < num_k8_northbridges; i++) { 652 for (i = 0; i < num_k8_northbridges; i++) {
@@ -645,6 +664,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
645 } 664 }
646 if (!aper_base) 665 if (!aper_base)
647 goto nommu; 666 goto nommu;
667
648 info->aper_base = aper_base; 668 info->aper_base = aper_base;
649 info->aper_size = aper_size >> 20; 669 info->aper_size = aper_size >> 20;
650 670
@@ -667,14 +687,14 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
667 687
668 flush_gart(); 688 flush_gart();
669 689
670 printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n", 690 pr_info("PCI-DMA: aperture base @ %x size %u KB\n",
671 aper_base, aper_size>>10); 691 aper_base, aper_size>>10);
672 692
673 return 0; 693 return 0;
674 694
675 nommu: 695 nommu:
676 /* Should not happen anymore */ 696 /* Should not happen anymore */
677 printk(KERN_WARNING "PCI-DMA: More than 4GB of RAM and no IOMMU\n" 697 pr_warning("PCI-DMA: More than 4GB of RAM and no IOMMU\n"
678 "falling back to iommu=soft.\n"); 698 "falling back to iommu=soft.\n");
679 return -1; 699 return -1;
680} 700}
@@ -686,14 +706,16 @@ static struct dma_map_ops gart_dma_ops = {
686 .unmap_page = gart_unmap_page, 706 .unmap_page = gart_unmap_page,
687 .alloc_coherent = gart_alloc_coherent, 707 .alloc_coherent = gart_alloc_coherent,
688 .free_coherent = gart_free_coherent, 708 .free_coherent = gart_free_coherent,
709 .mapping_error = gart_mapping_error,
689}; 710};
690 711
691void gart_iommu_shutdown(void) 712static void gart_iommu_shutdown(void)
692{ 713{
693 struct pci_dev *dev; 714 struct pci_dev *dev;
694 int i; 715 int i;
695 716
696 if (no_agp && (dma_ops != &gart_dma_ops)) 717 /* don't shutdown it if there is AGP installed */
718 if (!no_agp)
697 return; 719 return;
698 720
699 for (i = 0; i < num_k8_northbridges; i++) { 721 for (i = 0; i < num_k8_northbridges; i++) {
@@ -708,7 +730,7 @@ void gart_iommu_shutdown(void)
708 } 730 }
709} 731}
710 732
711void __init gart_iommu_init(void) 733int __init gart_iommu_init(void)
712{ 734{
713 struct agp_kern_info info; 735 struct agp_kern_info info;
714 unsigned long iommu_start; 736 unsigned long iommu_start;
@@ -717,8 +739,8 @@ void __init gart_iommu_init(void)
717 unsigned long scratch; 739 unsigned long scratch;
718 long i; 740 long i;
719 741
720 if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0) 742 if (num_k8_northbridges == 0)
721 return; 743 return 0;
722 744
723#ifndef CONFIG_AGP_AMD64 745#ifndef CONFIG_AGP_AMD64
724 no_agp = 1; 746 no_agp = 1;
@@ -730,35 +752,28 @@ void __init gart_iommu_init(void)
730 (agp_copy_info(agp_bridge, &info) < 0); 752 (agp_copy_info(agp_bridge, &info) < 0);
731#endif 753#endif
732 754
733 if (swiotlb)
734 return;
735
736 /* Did we detect a different HW IOMMU? */
737 if (iommu_detected && !gart_iommu_aperture)
738 return;
739
740 if (no_iommu || 755 if (no_iommu ||
741 (!force_iommu && max_pfn <= MAX_DMA32_PFN) || 756 (!force_iommu && max_pfn <= MAX_DMA32_PFN) ||
742 !gart_iommu_aperture || 757 !gart_iommu_aperture ||
743 (no_agp && init_k8_gatt(&info) < 0)) { 758 (no_agp && init_k8_gatt(&info) < 0)) {
744 if (max_pfn > MAX_DMA32_PFN) { 759 if (max_pfn > MAX_DMA32_PFN) {
745 printk(KERN_WARNING "More than 4GB of memory " 760 pr_warning("More than 4GB of memory but GART IOMMU not available.\n");
746 "but GART IOMMU not available.\n"); 761 pr_warning("falling back to iommu=soft.\n");
747 printk(KERN_WARNING "falling back to iommu=soft.\n");
748 } 762 }
749 return; 763 return 0;
750 } 764 }
751 765
752 /* need to map that range */ 766 /* need to map that range */
753 aper_size = info.aper_size << 20; 767 aper_size = info.aper_size << 20;
754 aper_base = info.aper_base; 768 aper_base = info.aper_base;
755 end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT); 769 end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT);
770
756 if (end_pfn > max_low_pfn_mapped) { 771 if (end_pfn > max_low_pfn_mapped) {
757 start_pfn = (aper_base>>PAGE_SHIFT); 772 start_pfn = (aper_base>>PAGE_SHIFT);
758 init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT); 773 init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
759 } 774 }
760 775
761 printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n"); 776 pr_info("PCI-DMA: using GART IOMMU.\n");
762 iommu_size = check_iommu_size(info.aper_base, aper_size); 777 iommu_size = check_iommu_size(info.aper_base, aper_size);
763 iommu_pages = iommu_size >> PAGE_SHIFT; 778 iommu_pages = iommu_size >> PAGE_SHIFT;
764 779
@@ -773,8 +788,7 @@ void __init gart_iommu_init(void)
773 788
774 ret = dma_debug_resize_entries(iommu_pages); 789 ret = dma_debug_resize_entries(iommu_pages);
775 if (ret) 790 if (ret)
776 printk(KERN_DEBUG 791 pr_debug("PCI-DMA: Cannot trace all the entries\n");
777 "PCI-DMA: Cannot trace all the entries\n");
778 } 792 }
779#endif 793#endif
780 794
@@ -782,17 +796,16 @@ void __init gart_iommu_init(void)
782 * Out of IOMMU space handling. 796 * Out of IOMMU space handling.
783 * Reserve some invalid pages at the beginning of the GART. 797 * Reserve some invalid pages at the beginning of the GART.
784 */ 798 */
785 iommu_area_reserve(iommu_gart_bitmap, 0, EMERGENCY_PAGES); 799 bitmap_set(iommu_gart_bitmap, 0, EMERGENCY_PAGES);
786 800
787 agp_memory_reserved = iommu_size; 801 pr_info("PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n",
788 printk(KERN_INFO
789 "PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n",
790 iommu_size >> 20); 802 iommu_size >> 20);
791 803
792 iommu_start = aper_size - iommu_size; 804 agp_memory_reserved = iommu_size;
793 iommu_bus_base = info.aper_base + iommu_start; 805 iommu_start = aper_size - iommu_size;
794 bad_dma_address = iommu_bus_base; 806 iommu_bus_base = info.aper_base + iommu_start;
795 iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT); 807 bad_dma_addr = iommu_bus_base;
808 iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT);
796 809
797 /* 810 /*
798 * Unmap the IOMMU part of the GART. The alias of the page is 811 * Unmap the IOMMU part of the GART. The alias of the page is
@@ -814,7 +827,7 @@ void __init gart_iommu_init(void)
814 * the pages as Not-Present: 827 * the pages as Not-Present:
815 */ 828 */
816 wbinvd(); 829 wbinvd();
817 830
818 /* 831 /*
819 * Now all caches are flushed and we can safely enable 832 * Now all caches are flushed and we can safely enable
820 * GART hardware. Doing it early leaves the possibility 833 * GART hardware. Doing it early leaves the possibility
@@ -838,6 +851,10 @@ void __init gart_iommu_init(void)
838 851
839 flush_gart(); 852 flush_gart();
840 dma_ops = &gart_dma_ops; 853 dma_ops = &gart_dma_ops;
854 x86_platform.iommu_shutdown = gart_iommu_shutdown;
855 swiotlb = 0;
856
857 return 0;
841} 858}
842 859
843void __init gart_parse_options(char *p) 860void __init gart_parse_options(char *p)
@@ -856,7 +873,7 @@ void __init gart_parse_options(char *p)
856#endif 873#endif
857 if (isdigit(*p) && get_option(&p, &arg)) 874 if (isdigit(*p) && get_option(&p, &arg))
858 iommu_size = arg; 875 iommu_size = arg;
859 if (!strncmp(p, "fullflush", 8)) 876 if (!strncmp(p, "fullflush", 9))
860 iommu_fullflush = 1; 877 iommu_fullflush = 1;
861 if (!strncmp(p, "nofullflush", 11)) 878 if (!strncmp(p, "nofullflush", 11))
862 iommu_fullflush = 0; 879 iommu_fullflush = 0;
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index a3933d4330cd..3af4af810c07 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -4,6 +4,7 @@
4#include <linux/scatterlist.h> 4#include <linux/scatterlist.h>
5#include <linux/string.h> 5#include <linux/string.h>
6#include <linux/init.h> 6#include <linux/init.h>
7#include <linux/gfp.h>
7#include <linux/pci.h> 8#include <linux/pci.h>
8#include <linux/mm.h> 9#include <linux/mm.h>
9 10
@@ -33,7 +34,7 @@ static dma_addr_t nommu_map_page(struct device *dev, struct page *page,
33 dma_addr_t bus = page_to_phys(page) + offset; 34 dma_addr_t bus = page_to_phys(page) + offset;
34 WARN_ON(size == 0); 35 WARN_ON(size == 0);
35 if (!check_addr("map_single", dev, bus, size)) 36 if (!check_addr("map_single", dev, bus, size))
36 return bad_dma_address; 37 return DMA_ERROR_CODE;
37 flush_write_buffers(); 38 flush_write_buffers();
38 return bus; 39 return bus;
39} 40}
@@ -103,12 +104,3 @@ struct dma_map_ops nommu_dma_ops = {
103 .sync_sg_for_device = nommu_sync_sg_for_device, 104 .sync_sg_for_device = nommu_sync_sg_for_device,
104 .is_phys = 1, 105 .is_phys = 1,
105}; 106};
106
107void __init no_iommu_init(void)
108{
109 if (dma_ops)
110 return;
111
112 force_iommu = 0; /* no HW IOMMU */
113 dma_ops = &nommu_dma_ops;
114}
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index aaa6b7839f1e..7d2829dde20e 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -42,18 +42,31 @@ static struct dma_map_ops swiotlb_dma_ops = {
42 .dma_supported = NULL, 42 .dma_supported = NULL,
43}; 43};
44 44
45void __init pci_swiotlb_init(void) 45/*
46 * pci_swiotlb_detect - set swiotlb to 1 if necessary
47 *
48 * This returns non-zero if we are forced to use swiotlb (by the boot
49 * option).
50 */
51int __init pci_swiotlb_detect(void)
46{ 52{
53 int use_swiotlb = swiotlb | swiotlb_force;
54
47 /* don't initialize swiotlb if iommu=off (no_iommu=1) */ 55 /* don't initialize swiotlb if iommu=off (no_iommu=1) */
48#ifdef CONFIG_X86_64 56#ifdef CONFIG_X86_64
49 if ((!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN)) 57 if (!no_iommu && max_pfn > MAX_DMA32_PFN)
50 swiotlb = 1; 58 swiotlb = 1;
51#endif 59#endif
52 if (swiotlb_force) 60 if (swiotlb_force)
53 swiotlb = 1; 61 swiotlb = 1;
62
63 return use_swiotlb;
64}
65
66void __init pci_swiotlb_init(void)
67{
54 if (swiotlb) { 68 if (swiotlb) {
55 printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n"); 69 swiotlb_init(0);
56 swiotlb_init();
57 dma_ops = &swiotlb_dma_ops; 70 dma_ops = &swiotlb_dma_ops;
58 } 71 }
59} 72}
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 5284cd2b5776..0415c3ef91b5 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -9,7 +9,11 @@
9#include <linux/pm.h> 9#include <linux/pm.h>
10#include <linux/clockchips.h> 10#include <linux/clockchips.h>
11#include <linux/random.h> 11#include <linux/random.h>
12#include <linux/user-return-notifier.h>
13#include <linux/dmi.h>
14#include <linux/utsname.h>
12#include <trace/events/power.h> 15#include <trace/events/power.h>
16#include <linux/hw_breakpoint.h>
13#include <asm/system.h> 17#include <asm/system.h>
14#include <asm/apic.h> 18#include <asm/apic.h>
15#include <asm/syscalls.h> 19#include <asm/syscalls.h>
@@ -17,6 +21,7 @@
17#include <asm/uaccess.h> 21#include <asm/uaccess.h>
18#include <asm/i387.h> 22#include <asm/i387.h>
19#include <asm/ds.h> 23#include <asm/ds.h>
24#include <asm/debugreg.h>
20 25
21unsigned long idle_halt; 26unsigned long idle_halt;
22EXPORT_SYMBOL(idle_halt); 27EXPORT_SYMBOL(idle_halt);
@@ -87,30 +92,37 @@ void exit_thread(void)
87 } 92 }
88} 93}
89 94
90void flush_thread(void) 95void show_regs(struct pt_regs *regs)
91{ 96{
92 struct task_struct *tsk = current; 97 show_registers(regs);
98 show_trace(NULL, regs, (unsigned long *)kernel_stack_pointer(regs),
99 regs->bp);
100}
93 101
94#ifdef CONFIG_X86_64 102void show_regs_common(void)
95 if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) { 103{
96 clear_tsk_thread_flag(tsk, TIF_ABI_PENDING); 104 const char *board, *product;
97 if (test_tsk_thread_flag(tsk, TIF_IA32)) {
98 clear_tsk_thread_flag(tsk, TIF_IA32);
99 } else {
100 set_tsk_thread_flag(tsk, TIF_IA32);
101 current_thread_info()->status |= TS_COMPAT;
102 }
103 }
104#endif
105 105
106 clear_tsk_thread_flag(tsk, TIF_DEBUG); 106 board = dmi_get_system_info(DMI_BOARD_NAME);
107 if (!board)
108 board = "";
109 product = dmi_get_system_info(DMI_PRODUCT_NAME);
110 if (!product)
111 product = "";
107 112
108 tsk->thread.debugreg0 = 0; 113 printk(KERN_CONT "\n");
109 tsk->thread.debugreg1 = 0; 114 printk(KERN_DEFAULT "Pid: %d, comm: %.20s %s %s %.*s %s/%s\n",
110 tsk->thread.debugreg2 = 0; 115 current->pid, current->comm, print_tainted(),
111 tsk->thread.debugreg3 = 0; 116 init_utsname()->release,
112 tsk->thread.debugreg6 = 0; 117 (int)strcspn(init_utsname()->version, " "),
113 tsk->thread.debugreg7 = 0; 118 init_utsname()->version, board, product);
119}
120
121void flush_thread(void)
122{
123 struct task_struct *tsk = current;
124
125 flush_ptrace_hw_breakpoint(tsk);
114 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); 126 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
115 /* 127 /*
116 * Forget coprocessor state.. 128 * Forget coprocessor state..
@@ -192,16 +204,6 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
192 else if (next->debugctlmsr != prev->debugctlmsr) 204 else if (next->debugctlmsr != prev->debugctlmsr)
193 update_debugctlmsr(next->debugctlmsr); 205 update_debugctlmsr(next->debugctlmsr);
194 206
195 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
196 set_debugreg(next->debugreg0, 0);
197 set_debugreg(next->debugreg1, 1);
198 set_debugreg(next->debugreg2, 2);
199 set_debugreg(next->debugreg3, 3);
200 /* no 4 and 5 */
201 set_debugreg(next->debugreg6, 6);
202 set_debugreg(next->debugreg7, 7);
203 }
204
205 if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^ 207 if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
206 test_tsk_thread_flag(next_p, TIF_NOTSC)) { 208 test_tsk_thread_flag(next_p, TIF_NOTSC)) {
207 /* prev and next are different */ 209 /* prev and next are different */
@@ -224,6 +226,7 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
224 */ 226 */
225 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); 227 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
226 } 228 }
229 propagate_user_return_notify(prev_p, next_p);
227} 230}
228 231
229int sys_fork(struct pt_regs *regs) 232int sys_fork(struct pt_regs *regs)
@@ -247,6 +250,78 @@ int sys_vfork(struct pt_regs *regs)
247 NULL, NULL); 250 NULL, NULL);
248} 251}
249 252
253long
254sys_clone(unsigned long clone_flags, unsigned long newsp,
255 void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
256{
257 if (!newsp)
258 newsp = regs->sp;
259 return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
260}
261
262/*
263 * This gets run with %si containing the
264 * function to call, and %di containing
265 * the "args".
266 */
267extern void kernel_thread_helper(void);
268
269/*
270 * Create a kernel thread
271 */
272int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
273{
274 struct pt_regs regs;
275
276 memset(&regs, 0, sizeof(regs));
277
278 regs.si = (unsigned long) fn;
279 regs.di = (unsigned long) arg;
280
281#ifdef CONFIG_X86_32
282 regs.ds = __USER_DS;
283 regs.es = __USER_DS;
284 regs.fs = __KERNEL_PERCPU;
285 regs.gs = __KERNEL_STACK_CANARY;
286#else
287 regs.ss = __KERNEL_DS;
288#endif
289
290 regs.orig_ax = -1;
291 regs.ip = (unsigned long) kernel_thread_helper;
292 regs.cs = __KERNEL_CS | get_kernel_rpl();
293 regs.flags = X86_EFLAGS_IF | 0x2;
294
295 /* Ok, create the new process.. */
296 return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
297}
298EXPORT_SYMBOL(kernel_thread);
299
300/*
301 * sys_execve() executes a new program.
302 */
303long sys_execve(char __user *name, char __user * __user *argv,
304 char __user * __user *envp, struct pt_regs *regs)
305{
306 long error;
307 char *filename;
308
309 filename = getname(name);
310 error = PTR_ERR(filename);
311 if (IS_ERR(filename))
312 return error;
313 error = do_execve(filename, argv, envp, regs);
314
315#ifdef CONFIG_X86_32
316 if (error == 0) {
317 /* Make sure we don't return using sysenter.. */
318 set_thread_flag(TIF_IRET);
319 }
320#endif
321
322 putname(filename);
323 return error;
324}
250 325
251/* 326/*
252 * Idle related variables and functions 327 * Idle related variables and functions
@@ -451,21 +526,39 @@ static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
451} 526}
452 527
453/* 528/*
454 * Check for AMD CPUs, which have potentially C1E support 529 * Check for AMD CPUs, where APIC timer interrupt does not wake up CPU from C1e.
530 * For more information see
531 * - Erratum #400 for NPT family 0xf and family 0x10 CPUs
532 * - Erratum #365 for family 0x11 (not affected because C1e not in use)
455 */ 533 */
456static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c) 534static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
457{ 535{
536 u64 val;
458 if (c->x86_vendor != X86_VENDOR_AMD) 537 if (c->x86_vendor != X86_VENDOR_AMD)
459 return 0; 538 goto no_c1e_idle;
460
461 if (c->x86 < 0x0F)
462 return 0;
463 539
464 /* Family 0x0f models < rev F do not have C1E */ 540 /* Family 0x0f models < rev F do not have C1E */
465 if (c->x86 == 0x0f && c->x86_model < 0x40) 541 if (c->x86 == 0x0F && c->x86_model >= 0x40)
466 return 0; 542 return 1;
467 543
468 return 1; 544 if (c->x86 == 0x10) {
545 /*
546 * check OSVW bit for CPUs that are not affected
547 * by erratum #400
548 */
549 if (cpu_has(c, X86_FEATURE_OSVW)) {
550 rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, val);
551 if (val >= 2) {
552 rdmsrl(MSR_AMD64_OSVW_STATUS, val);
553 if (!(val & BIT(1)))
554 goto no_c1e_idle;
555 }
556 }
557 return 1;
558 }
559
560no_c1e_idle:
561 return 0;
469} 562}
470 563
471static cpumask_var_t c1e_mask; 564static cpumask_var_t c1e_mask;
@@ -532,7 +625,7 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
532{ 625{
533#ifdef CONFIG_SMP 626#ifdef CONFIG_SMP
534 if (pm_idle == poll_idle && smp_num_siblings > 1) { 627 if (pm_idle == poll_idle && smp_num_siblings > 1) {
535 printk(KERN_WARNING "WARNING: polling idle and HT enabled," 628 printk_once(KERN_WARNING "WARNING: polling idle and HT enabled,"
536 " performance may degrade.\n"); 629 " performance may degrade.\n");
537 } 630 }
538#endif 631#endif
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 4cf79567cdab..f6c62667e30c 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -23,7 +23,6 @@
23#include <linux/vmalloc.h> 23#include <linux/vmalloc.h>
24#include <linux/user.h> 24#include <linux/user.h>
25#include <linux/interrupt.h> 25#include <linux/interrupt.h>
26#include <linux/utsname.h>
27#include <linux/delay.h> 26#include <linux/delay.h>
28#include <linux/reboot.h> 27#include <linux/reboot.h>
29#include <linux/init.h> 28#include <linux/init.h>
@@ -35,7 +34,6 @@
35#include <linux/tick.h> 34#include <linux/tick.h>
36#include <linux/percpu.h> 35#include <linux/percpu.h>
37#include <linux/prctl.h> 36#include <linux/prctl.h>
38#include <linux/dmi.h>
39#include <linux/ftrace.h> 37#include <linux/ftrace.h>
40#include <linux/uaccess.h> 38#include <linux/uaccess.h>
41#include <linux/io.h> 39#include <linux/io.h>
@@ -58,6 +56,7 @@
58#include <asm/idle.h> 56#include <asm/idle.h>
59#include <asm/syscalls.h> 57#include <asm/syscalls.h>
60#include <asm/ds.h> 58#include <asm/ds.h>
59#include <asm/debugreg.h>
61 60
62asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); 61asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
63 62
@@ -127,39 +126,29 @@ void __show_regs(struct pt_regs *regs, int all)
127 unsigned long d0, d1, d2, d3, d6, d7; 126 unsigned long d0, d1, d2, d3, d6, d7;
128 unsigned long sp; 127 unsigned long sp;
129 unsigned short ss, gs; 128 unsigned short ss, gs;
130 const char *board;
131 129
132 if (user_mode_vm(regs)) { 130 if (user_mode_vm(regs)) {
133 sp = regs->sp; 131 sp = regs->sp;
134 ss = regs->ss & 0xffff; 132 ss = regs->ss & 0xffff;
135 gs = get_user_gs(regs); 133 gs = get_user_gs(regs);
136 } else { 134 } else {
137 sp = (unsigned long) (&regs->sp); 135 sp = kernel_stack_pointer(regs);
138 savesegment(ss, ss); 136 savesegment(ss, ss);
139 savesegment(gs, gs); 137 savesegment(gs, gs);
140 } 138 }
141 139
142 printk("\n"); 140 show_regs_common();
143 141
144 board = dmi_get_system_info(DMI_PRODUCT_NAME); 142 printk(KERN_DEFAULT "EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n",
145 if (!board)
146 board = "";
147 printk("Pid: %d, comm: %s %s (%s %.*s) %s\n",
148 task_pid_nr(current), current->comm,
149 print_tainted(), init_utsname()->release,
150 (int)strcspn(init_utsname()->version, " "),
151 init_utsname()->version, board);
152
153 printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n",
154 (u16)regs->cs, regs->ip, regs->flags, 143 (u16)regs->cs, regs->ip, regs->flags,
155 smp_processor_id()); 144 smp_processor_id());
156 print_symbol("EIP is at %s\n", regs->ip); 145 print_symbol("EIP is at %s\n", regs->ip);
157 146
158 printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", 147 printk(KERN_DEFAULT "EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
159 regs->ax, regs->bx, regs->cx, regs->dx); 148 regs->ax, regs->bx, regs->cx, regs->dx);
160 printk("ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n", 149 printk(KERN_DEFAULT "ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n",
161 regs->si, regs->di, regs->bp, sp); 150 regs->si, regs->di, regs->bp, sp);
162 printk(" DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n", 151 printk(KERN_DEFAULT " DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n",
163 (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, ss); 152 (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, ss);
164 153
165 if (!all) 154 if (!all)
@@ -169,61 +158,22 @@ void __show_regs(struct pt_regs *regs, int all)
169 cr2 = read_cr2(); 158 cr2 = read_cr2();
170 cr3 = read_cr3(); 159 cr3 = read_cr3();
171 cr4 = read_cr4_safe(); 160 cr4 = read_cr4_safe();
172 printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", 161 printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n",
173 cr0, cr2, cr3, cr4); 162 cr0, cr2, cr3, cr4);
174 163
175 get_debugreg(d0, 0); 164 get_debugreg(d0, 0);
176 get_debugreg(d1, 1); 165 get_debugreg(d1, 1);
177 get_debugreg(d2, 2); 166 get_debugreg(d2, 2);
178 get_debugreg(d3, 3); 167 get_debugreg(d3, 3);
179 printk("DR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n", 168 printk(KERN_DEFAULT "DR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n",
180 d0, d1, d2, d3); 169 d0, d1, d2, d3);
181 170
182 get_debugreg(d6, 6); 171 get_debugreg(d6, 6);
183 get_debugreg(d7, 7); 172 get_debugreg(d7, 7);
184 printk("DR6: %08lx DR7: %08lx\n", 173 printk(KERN_DEFAULT "DR6: %08lx DR7: %08lx\n",
185 d6, d7); 174 d6, d7);
186} 175}
187 176
188void show_regs(struct pt_regs *regs)
189{
190 __show_regs(regs, 1);
191 show_trace(NULL, regs, &regs->sp, regs->bp);
192}
193
194/*
195 * This gets run with %bx containing the
196 * function to call, and %dx containing
197 * the "args".
198 */
199extern void kernel_thread_helper(void);
200
201/*
202 * Create a kernel thread
203 */
204int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
205{
206 struct pt_regs regs;
207
208 memset(&regs, 0, sizeof(regs));
209
210 regs.bx = (unsigned long) fn;
211 regs.dx = (unsigned long) arg;
212
213 regs.ds = __USER_DS;
214 regs.es = __USER_DS;
215 regs.fs = __KERNEL_PERCPU;
216 regs.gs = __KERNEL_STACK_CANARY;
217 regs.orig_ax = -1;
218 regs.ip = (unsigned long) kernel_thread_helper;
219 regs.cs = __KERNEL_CS | get_kernel_rpl();
220 regs.flags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;
221
222 /* Ok, create the new process.. */
223 return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
224}
225EXPORT_SYMBOL(kernel_thread);
226
227void release_thread(struct task_struct *dead_task) 177void release_thread(struct task_struct *dead_task)
228{ 178{
229 BUG_ON(dead_task->mm); 179 BUG_ON(dead_task->mm);
@@ -259,7 +209,12 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
259 209
260 task_user_gs(p) = get_user_gs(regs); 210 task_user_gs(p) = get_user_gs(regs);
261 211
212 p->thread.io_bitmap_ptr = NULL;
262 tsk = current; 213 tsk = current;
214 err = -ENOMEM;
215
216 memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
217
263 if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { 218 if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
264 p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr, 219 p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
265 IO_BITMAP_BYTES, GFP_KERNEL); 220 IO_BITMAP_BYTES, GFP_KERNEL);
@@ -430,46 +385,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
430 return prev_p; 385 return prev_p;
431} 386}
432 387
433int sys_clone(struct pt_regs *regs)
434{
435 unsigned long clone_flags;
436 unsigned long newsp;
437 int __user *parent_tidptr, *child_tidptr;
438
439 clone_flags = regs->bx;
440 newsp = regs->cx;
441 parent_tidptr = (int __user *)regs->dx;
442 child_tidptr = (int __user *)regs->di;
443 if (!newsp)
444 newsp = regs->sp;
445 return do_fork(clone_flags, newsp, regs, 0, parent_tidptr, child_tidptr);
446}
447
448/*
449 * sys_execve() executes a new program.
450 */
451int sys_execve(struct pt_regs *regs)
452{
453 int error;
454 char *filename;
455
456 filename = getname((char __user *) regs->bx);
457 error = PTR_ERR(filename);
458 if (IS_ERR(filename))
459 goto out;
460 error = do_execve(filename,
461 (char __user * __user *) regs->cx,
462 (char __user * __user *) regs->dx,
463 regs);
464 if (error == 0) {
465 /* Make sure we don't return using sysenter.. */
466 set_thread_flag(TIF_IRET);
467 }
468 putname(filename);
469out:
470 return error;
471}
472
473#define top_esp (THREAD_SIZE - sizeof(unsigned long)) 388#define top_esp (THREAD_SIZE - sizeof(unsigned long))
474#define top_ebp (THREAD_SIZE - 2*sizeof(unsigned long)) 389#define top_ebp (THREAD_SIZE - 2*sizeof(unsigned long))
475 390
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index eb62cbcaa490..17cb3295cbf7 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -26,7 +26,6 @@
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/user.h> 27#include <linux/user.h>
28#include <linux/interrupt.h> 28#include <linux/interrupt.h>
29#include <linux/utsname.h>
30#include <linux/delay.h> 29#include <linux/delay.h>
31#include <linux/module.h> 30#include <linux/module.h>
32#include <linux/ptrace.h> 31#include <linux/ptrace.h>
@@ -38,7 +37,6 @@
38#include <linux/uaccess.h> 37#include <linux/uaccess.h>
39#include <linux/io.h> 38#include <linux/io.h>
40#include <linux/ftrace.h> 39#include <linux/ftrace.h>
41#include <linux/dmi.h>
42 40
43#include <asm/pgtable.h> 41#include <asm/pgtable.h>
44#include <asm/system.h> 42#include <asm/system.h>
@@ -52,14 +50,13 @@
52#include <asm/idle.h> 50#include <asm/idle.h>
53#include <asm/syscalls.h> 51#include <asm/syscalls.h>
54#include <asm/ds.h> 52#include <asm/ds.h>
53#include <asm/debugreg.h>
55 54
56asmlinkage extern void ret_from_fork(void); 55asmlinkage extern void ret_from_fork(void);
57 56
58DEFINE_PER_CPU(unsigned long, old_rsp); 57DEFINE_PER_CPU(unsigned long, old_rsp);
59static DEFINE_PER_CPU(unsigned char, is_idle); 58static DEFINE_PER_CPU(unsigned char, is_idle);
60 59
61unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
62
63static ATOMIC_NOTIFIER_HEAD(idle_notifier); 60static ATOMIC_NOTIFIER_HEAD(idle_notifier);
64 61
65void idle_notifier_register(struct notifier_block *n) 62void idle_notifier_register(struct notifier_block *n)
@@ -162,31 +159,21 @@ void __show_regs(struct pt_regs *regs, int all)
162 unsigned long d0, d1, d2, d3, d6, d7; 159 unsigned long d0, d1, d2, d3, d6, d7;
163 unsigned int fsindex, gsindex; 160 unsigned int fsindex, gsindex;
164 unsigned int ds, cs, es; 161 unsigned int ds, cs, es;
165 const char *board; 162
166 163 show_regs_common();
167 printk("\n"); 164 printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
168 print_modules();
169 board = dmi_get_system_info(DMI_PRODUCT_NAME);
170 if (!board)
171 board = "";
172 printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s %s\n",
173 current->pid, current->comm, print_tainted(),
174 init_utsname()->release,
175 (int)strcspn(init_utsname()->version, " "),
176 init_utsname()->version, board);
177 printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
178 printk_address(regs->ip, 1); 165 printk_address(regs->ip, 1);
179 printk(KERN_INFO "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, 166 printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss,
180 regs->sp, regs->flags); 167 regs->sp, regs->flags);
181 printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n", 168 printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
182 regs->ax, regs->bx, regs->cx); 169 regs->ax, regs->bx, regs->cx);
183 printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n", 170 printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
184 regs->dx, regs->si, regs->di); 171 regs->dx, regs->si, regs->di);
185 printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n", 172 printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n",
186 regs->bp, regs->r8, regs->r9); 173 regs->bp, regs->r8, regs->r9);
187 printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n", 174 printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n",
188 regs->r10, regs->r11, regs->r12); 175 regs->r10, regs->r11, regs->r12);
189 printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n", 176 printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
190 regs->r13, regs->r14, regs->r15); 177 regs->r13, regs->r14, regs->r15);
191 178
192 asm("movl %%ds,%0" : "=r" (ds)); 179 asm("movl %%ds,%0" : "=r" (ds));
@@ -207,28 +194,21 @@ void __show_regs(struct pt_regs *regs, int all)
207 cr3 = read_cr3(); 194 cr3 = read_cr3();
208 cr4 = read_cr4(); 195 cr4 = read_cr4();
209 196
210 printk(KERN_INFO "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", 197 printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
211 fs, fsindex, gs, gsindex, shadowgs); 198 fs, fsindex, gs, gsindex, shadowgs);
212 printk(KERN_INFO "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, 199 printk(KERN_DEFAULT "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
213 es, cr0); 200 es, cr0);
214 printk(KERN_INFO "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, 201 printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
215 cr4); 202 cr4);
216 203
217 get_debugreg(d0, 0); 204 get_debugreg(d0, 0);
218 get_debugreg(d1, 1); 205 get_debugreg(d1, 1);
219 get_debugreg(d2, 2); 206 get_debugreg(d2, 2);
220 printk(KERN_INFO "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2); 207 printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
221 get_debugreg(d3, 3); 208 get_debugreg(d3, 3);
222 get_debugreg(d6, 6); 209 get_debugreg(d6, 6);
223 get_debugreg(d7, 7); 210 get_debugreg(d7, 7);
224 printk(KERN_INFO "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7); 211 printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
225}
226
227void show_regs(struct pt_regs *regs)
228{
229 printk(KERN_INFO "CPU %d:", smp_processor_id());
230 __show_regs(regs, 1);
231 show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
232} 212}
233 213
234void release_thread(struct task_struct *dead_task) 214void release_thread(struct task_struct *dead_task)
@@ -285,8 +265,9 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
285 *childregs = *regs; 265 *childregs = *regs;
286 266
287 childregs->ax = 0; 267 childregs->ax = 0;
288 childregs->sp = sp; 268 if (user_mode(regs))
289 if (sp == ~0UL) 269 childregs->sp = sp;
270 else
290 childregs->sp = (unsigned long)childregs; 271 childregs->sp = (unsigned long)childregs;
291 272
292 p->thread.sp = (unsigned long) childregs; 273 p->thread.sp = (unsigned long) childregs;
@@ -295,14 +276,18 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
295 276
296 set_tsk_thread_flag(p, TIF_FORK); 277 set_tsk_thread_flag(p, TIF_FORK);
297 278
298 p->thread.fs = me->thread.fs; 279 p->thread.io_bitmap_ptr = NULL;
299 p->thread.gs = me->thread.gs;
300 280
301 savesegment(gs, p->thread.gsindex); 281 savesegment(gs, p->thread.gsindex);
282 p->thread.gs = p->thread.gsindex ? 0 : me->thread.gs;
302 savesegment(fs, p->thread.fsindex); 283 savesegment(fs, p->thread.fsindex);
284 p->thread.fs = p->thread.fsindex ? 0 : me->thread.fs;
303 savesegment(es, p->thread.es); 285 savesegment(es, p->thread.es);
304 savesegment(ds, p->thread.ds); 286 savesegment(ds, p->thread.ds);
305 287
288 err = -ENOMEM;
289 memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
290
306 if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) { 291 if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
307 p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); 292 p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
308 if (!p->thread.io_bitmap_ptr) { 293 if (!p->thread.io_bitmap_ptr) {
@@ -341,29 +326,46 @@ out:
341 kfree(p->thread.io_bitmap_ptr); 326 kfree(p->thread.io_bitmap_ptr);
342 p->thread.io_bitmap_max = 0; 327 p->thread.io_bitmap_max = 0;
343 } 328 }
329
344 return err; 330 return err;
345} 331}
346 332
347void 333static void
348start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) 334start_thread_common(struct pt_regs *regs, unsigned long new_ip,
335 unsigned long new_sp,
336 unsigned int _cs, unsigned int _ss, unsigned int _ds)
349{ 337{
350 loadsegment(fs, 0); 338 loadsegment(fs, 0);
351 loadsegment(es, 0); 339 loadsegment(es, _ds);
352 loadsegment(ds, 0); 340 loadsegment(ds, _ds);
353 load_gs_index(0); 341 load_gs_index(0);
354 regs->ip = new_ip; 342 regs->ip = new_ip;
355 regs->sp = new_sp; 343 regs->sp = new_sp;
356 percpu_write(old_rsp, new_sp); 344 percpu_write(old_rsp, new_sp);
357 regs->cs = __USER_CS; 345 regs->cs = _cs;
358 regs->ss = __USER_DS; 346 regs->ss = _ss;
359 regs->flags = 0x200; 347 regs->flags = X86_EFLAGS_IF;
360 set_fs(USER_DS); 348 set_fs(USER_DS);
361 /* 349 /*
362 * Free the old FP and other extended state 350 * Free the old FP and other extended state
363 */ 351 */
364 free_thread_xstate(current); 352 free_thread_xstate(current);
365} 353}
366EXPORT_SYMBOL_GPL(start_thread); 354
355void
356start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
357{
358 start_thread_common(regs, new_ip, new_sp,
359 __USER_CS, __USER_DS, 0);
360}
361
362#ifdef CONFIG_IA32_EMULATION
363void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
364{
365 start_thread_common(regs, new_ip, new_sp,
366 __USER32_CS, __USER32_DS, __USER32_DS);
367}
368#endif
367 369
368/* 370/*
369 * switch_to(x,y) should switch tasks from x to y. 371 * switch_to(x,y) should switch tasks from x to y.
@@ -495,26 +497,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
495 */ 497 */
496 if (preload_fpu) 498 if (preload_fpu)
497 __math_state_restore(); 499 __math_state_restore();
498 return prev_p;
499}
500 500
501/* 501 return prev_p;
502 * sys_execve() executes a new program.
503 */
504asmlinkage
505long sys_execve(char __user *name, char __user * __user *argv,
506 char __user * __user *envp, struct pt_regs *regs)
507{
508 long error;
509 char *filename;
510
511 filename = getname(name);
512 error = PTR_ERR(filename);
513 if (IS_ERR(filename))
514 return error;
515 error = do_execve(filename, argv, envp, regs);
516 putname(filename);
517 return error;
518} 502}
519 503
520void set_personality_64bit(void) 504void set_personality_64bit(void)
@@ -531,13 +515,16 @@ void set_personality_64bit(void)
531 current->personality &= ~READ_IMPLIES_EXEC; 515 current->personality &= ~READ_IMPLIES_EXEC;
532} 516}
533 517
534asmlinkage long 518void set_personality_ia32(void)
535sys_clone(unsigned long clone_flags, unsigned long newsp,
536 void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
537{ 519{
538 if (!newsp) 520 /* inherit personality from parent */
539 newsp = regs->sp; 521
540 return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid); 522 /* Make sure to be in 32bit mode */
523 set_thread_flag(TIF_IA32);
524 current->personality |= force_personality32;
525
526 /* Prepare the first "return" to user space */
527 current_thread_info()->status |= TS_COMPAT;
541} 528}
542 529
543unsigned long get_wchan(struct task_struct *p) 530unsigned long get_wchan(struct task_struct *p)
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 7b058a2dc66a..2e9b55027b7e 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -12,6 +12,7 @@
12#include <linux/mm.h> 12#include <linux/mm.h>
13#include <linux/smp.h> 13#include <linux/smp.h>
14#include <linux/errno.h> 14#include <linux/errno.h>
15#include <linux/slab.h>
15#include <linux/ptrace.h> 16#include <linux/ptrace.h>
16#include <linux/regset.h> 17#include <linux/regset.h>
17#include <linux/tracehook.h> 18#include <linux/tracehook.h>
@@ -22,6 +23,8 @@
22#include <linux/seccomp.h> 23#include <linux/seccomp.h>
23#include <linux/signal.h> 24#include <linux/signal.h>
24#include <linux/workqueue.h> 25#include <linux/workqueue.h>
26#include <linux/perf_event.h>
27#include <linux/hw_breakpoint.h>
25 28
26#include <asm/uaccess.h> 29#include <asm/uaccess.h>
27#include <asm/pgtable.h> 30#include <asm/pgtable.h>
@@ -34,6 +37,7 @@
34#include <asm/prctl.h> 37#include <asm/prctl.h>
35#include <asm/proto.h> 38#include <asm/proto.h>
36#include <asm/ds.h> 39#include <asm/ds.h>
40#include <asm/hw_breakpoint.h>
37 41
38#include "tls.h" 42#include "tls.h"
39 43
@@ -45,10 +49,99 @@ enum x86_regset {
45 REGSET_FP, 49 REGSET_FP,
46 REGSET_XFP, 50 REGSET_XFP,
47 REGSET_IOPERM64 = REGSET_XFP, 51 REGSET_IOPERM64 = REGSET_XFP,
52 REGSET_XSTATE,
48 REGSET_TLS, 53 REGSET_TLS,
49 REGSET_IOPERM32, 54 REGSET_IOPERM32,
50}; 55};
51 56
57struct pt_regs_offset {
58 const char *name;
59 int offset;
60};
61
62#define REG_OFFSET_NAME(r) {.name = #r, .offset = offsetof(struct pt_regs, r)}
63#define REG_OFFSET_END {.name = NULL, .offset = 0}
64
65static const struct pt_regs_offset regoffset_table[] = {
66#ifdef CONFIG_X86_64
67 REG_OFFSET_NAME(r15),
68 REG_OFFSET_NAME(r14),
69 REG_OFFSET_NAME(r13),
70 REG_OFFSET_NAME(r12),
71 REG_OFFSET_NAME(r11),
72 REG_OFFSET_NAME(r10),
73 REG_OFFSET_NAME(r9),
74 REG_OFFSET_NAME(r8),
75#endif
76 REG_OFFSET_NAME(bx),
77 REG_OFFSET_NAME(cx),
78 REG_OFFSET_NAME(dx),
79 REG_OFFSET_NAME(si),
80 REG_OFFSET_NAME(di),
81 REG_OFFSET_NAME(bp),
82 REG_OFFSET_NAME(ax),
83#ifdef CONFIG_X86_32
84 REG_OFFSET_NAME(ds),
85 REG_OFFSET_NAME(es),
86 REG_OFFSET_NAME(fs),
87 REG_OFFSET_NAME(gs),
88#endif
89 REG_OFFSET_NAME(orig_ax),
90 REG_OFFSET_NAME(ip),
91 REG_OFFSET_NAME(cs),
92 REG_OFFSET_NAME(flags),
93 REG_OFFSET_NAME(sp),
94 REG_OFFSET_NAME(ss),
95 REG_OFFSET_END,
96};
97
98/**
99 * regs_query_register_offset() - query register offset from its name
100 * @name: the name of a register
101 *
102 * regs_query_register_offset() returns the offset of a register in struct
103 * pt_regs from its name. If the name is invalid, this returns -EINVAL;
104 */
105int regs_query_register_offset(const char *name)
106{
107 const struct pt_regs_offset *roff;
108 for (roff = regoffset_table; roff->name != NULL; roff++)
109 if (!strcmp(roff->name, name))
110 return roff->offset;
111 return -EINVAL;
112}
113
114/**
115 * regs_query_register_name() - query register name from its offset
116 * @offset: the offset of a register in struct pt_regs.
117 *
118 * regs_query_register_name() returns the name of a register from its
119 * offset in struct pt_regs. If the @offset is invalid, this returns NULL;
120 */
121const char *regs_query_register_name(unsigned int offset)
122{
123 const struct pt_regs_offset *roff;
124 for (roff = regoffset_table; roff->name != NULL; roff++)
125 if (roff->offset == offset)
126 return roff->name;
127 return NULL;
128}
129
130static const int arg_offs_table[] = {
131#ifdef CONFIG_X86_32
132 [0] = offsetof(struct pt_regs, ax),
133 [1] = offsetof(struct pt_regs, dx),
134 [2] = offsetof(struct pt_regs, cx)
135#else /* CONFIG_X86_64 */
136 [0] = offsetof(struct pt_regs, di),
137 [1] = offsetof(struct pt_regs, si),
138 [2] = offsetof(struct pt_regs, dx),
139 [3] = offsetof(struct pt_regs, cx),
140 [4] = offsetof(struct pt_regs, r8),
141 [5] = offsetof(struct pt_regs, r9)
142#endif
143};
144
52/* 145/*
53 * does not yet catch signals sent when the child dies. 146 * does not yet catch signals sent when the child dies.
54 * in exit.c or in signal.c. 147 * in exit.c or in signal.c.
@@ -137,11 +230,6 @@ static int set_segment_reg(struct task_struct *task,
137 return 0; 230 return 0;
138} 231}
139 232
140static unsigned long debugreg_addr_limit(struct task_struct *task)
141{
142 return TASK_SIZE - 3;
143}
144
145#else /* CONFIG_X86_64 */ 233#else /* CONFIG_X86_64 */
146 234
147#define FLAG_MASK (FLAG_MASK_32 | X86_EFLAGS_NT) 235#define FLAG_MASK (FLAG_MASK_32 | X86_EFLAGS_NT)
@@ -266,15 +354,6 @@ static int set_segment_reg(struct task_struct *task,
266 return 0; 354 return 0;
267} 355}
268 356
269static unsigned long debugreg_addr_limit(struct task_struct *task)
270{
271#ifdef CONFIG_IA32_EMULATION
272 if (test_tsk_thread_flag(task, TIF_IA32))
273 return IA32_PAGE_OFFSET - 3;
274#endif
275 return TASK_SIZE_MAX - 7;
276}
277
278#endif /* CONFIG_X86_32 */ 357#endif /* CONFIG_X86_32 */
279 358
280static unsigned long get_flags(struct task_struct *task) 359static unsigned long get_flags(struct task_struct *task)
@@ -408,14 +487,14 @@ static int genregs_get(struct task_struct *target,
408{ 487{
409 if (kbuf) { 488 if (kbuf) {
410 unsigned long *k = kbuf; 489 unsigned long *k = kbuf;
411 while (count > 0) { 490 while (count >= sizeof(*k)) {
412 *k++ = getreg(target, pos); 491 *k++ = getreg(target, pos);
413 count -= sizeof(*k); 492 count -= sizeof(*k);
414 pos += sizeof(*k); 493 pos += sizeof(*k);
415 } 494 }
416 } else { 495 } else {
417 unsigned long __user *u = ubuf; 496 unsigned long __user *u = ubuf;
418 while (count > 0) { 497 while (count >= sizeof(*u)) {
419 if (__put_user(getreg(target, pos), u++)) 498 if (__put_user(getreg(target, pos), u++))
420 return -EFAULT; 499 return -EFAULT;
421 count -= sizeof(*u); 500 count -= sizeof(*u);
@@ -434,14 +513,14 @@ static int genregs_set(struct task_struct *target,
434 int ret = 0; 513 int ret = 0;
435 if (kbuf) { 514 if (kbuf) {
436 const unsigned long *k = kbuf; 515 const unsigned long *k = kbuf;
437 while (count > 0 && !ret) { 516 while (count >= sizeof(*k) && !ret) {
438 ret = putreg(target, pos, *k++); 517 ret = putreg(target, pos, *k++);
439 count -= sizeof(*k); 518 count -= sizeof(*k);
440 pos += sizeof(*k); 519 pos += sizeof(*k);
441 } 520 }
442 } else { 521 } else {
443 const unsigned long __user *u = ubuf; 522 const unsigned long __user *u = ubuf;
444 while (count > 0 && !ret) { 523 while (count >= sizeof(*u) && !ret) {
445 unsigned long word; 524 unsigned long word;
446 ret = __get_user(word, u++); 525 ret = __get_user(word, u++);
447 if (ret) 526 if (ret)
@@ -454,99 +533,240 @@ static int genregs_set(struct task_struct *target,
454 return ret; 533 return ret;
455} 534}
456 535
536static void ptrace_triggered(struct perf_event *bp, int nmi,
537 struct perf_sample_data *data,
538 struct pt_regs *regs)
539{
540 int i;
541 struct thread_struct *thread = &(current->thread);
542
543 /*
544 * Store in the virtual DR6 register the fact that the breakpoint
545 * was hit so the thread's debugger will see it.
546 */
547 for (i = 0; i < HBP_NUM; i++) {
548 if (thread->ptrace_bps[i] == bp)
549 break;
550 }
551
552 thread->debugreg6 |= (DR_TRAP0 << i);
553}
554
457/* 555/*
458 * This function is trivial and will be inlined by the compiler. 556 * Walk through every ptrace breakpoints for this thread and
459 * Having it separates the implementation details of debug 557 * build the dr7 value on top of their attributes.
460 * registers from the interface details of ptrace. 558 *
461 */ 559 */
462static unsigned long ptrace_get_debugreg(struct task_struct *child, int n) 560static unsigned long ptrace_get_dr7(struct perf_event *bp[])
463{ 561{
464 switch (n) { 562 int i;
465 case 0: return child->thread.debugreg0; 563 int dr7 = 0;
466 case 1: return child->thread.debugreg1; 564 struct arch_hw_breakpoint *info;
467 case 2: return child->thread.debugreg2; 565
468 case 3: return child->thread.debugreg3; 566 for (i = 0; i < HBP_NUM; i++) {
469 case 6: return child->thread.debugreg6; 567 if (bp[i] && !bp[i]->attr.disabled) {
470 case 7: return child->thread.debugreg7; 568 info = counter_arch_bp(bp[i]);
569 dr7 |= encode_dr7(i, info->len, info->type);
570 }
471 } 571 }
472 return 0; 572
573 return dr7;
473} 574}
474 575
475static int ptrace_set_debugreg(struct task_struct *child, 576static int
476 int n, unsigned long data) 577ptrace_modify_breakpoint(struct perf_event *bp, int len, int type,
578 struct task_struct *tsk, int disabled)
477{ 579{
478 int i; 580 int err;
581 int gen_len, gen_type;
582 struct perf_event_attr attr;
479 583
480 if (unlikely(n == 4 || n == 5)) 584 /*
481 return -EIO; 585 * We should have at least an inactive breakpoint at this
586 * slot. It means the user is writing dr7 without having
587 * written the address register first
588 */
589 if (!bp)
590 return -EINVAL;
482 591
483 if (n < 4 && unlikely(data >= debugreg_addr_limit(child))) 592 err = arch_bp_generic_fields(len, type, &gen_len, &gen_type);
484 return -EIO; 593 if (err)
594 return err;
485 595
486 switch (n) { 596 attr = bp->attr;
487 case 0: child->thread.debugreg0 = data; break; 597 attr.bp_len = gen_len;
488 case 1: child->thread.debugreg1 = data; break; 598 attr.bp_type = gen_type;
489 case 2: child->thread.debugreg2 = data; break; 599 attr.disabled = disabled;
490 case 3: child->thread.debugreg3 = data; break;
491 600
492 case 6: 601 return modify_user_hw_breakpoint(bp, &attr);
493 if ((data & ~0xffffffffUL) != 0) 602}
494 return -EIO;
495 child->thread.debugreg6 = data;
496 break;
497 603
498 case 7: 604/*
605 * Handle ptrace writes to debug register 7.
606 */
607static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data)
608{
609 struct thread_struct *thread = &(tsk->thread);
610 unsigned long old_dr7;
611 int i, orig_ret = 0, rc = 0;
612 int enabled, second_pass = 0;
613 unsigned len, type;
614 struct perf_event *bp;
615
616 data &= ~DR_CONTROL_RESERVED;
617 old_dr7 = ptrace_get_dr7(thread->ptrace_bps);
618restore:
619 /*
620 * Loop through all the hardware breakpoints, making the
621 * appropriate changes to each.
622 */
623 for (i = 0; i < HBP_NUM; i++) {
624 enabled = decode_dr7(data, i, &len, &type);
625 bp = thread->ptrace_bps[i];
626
627 if (!enabled) {
628 if (bp) {
629 /*
630 * Don't unregister the breakpoints right-away,
631 * unless all register_user_hw_breakpoint()
632 * requests have succeeded. This prevents
633 * any window of opportunity for debug
634 * register grabbing by other users.
635 */
636 if (!second_pass)
637 continue;
638
639 rc = ptrace_modify_breakpoint(bp, len, type,
640 tsk, 1);
641 if (rc)
642 break;
643 }
644 continue;
645 }
646
647 rc = ptrace_modify_breakpoint(bp, len, type, tsk, 0);
648 if (rc)
649 break;
650 }
651 /*
652 * Make a second pass to free the remaining unused breakpoints
653 * or to restore the original breakpoints if an error occurred.
654 */
655 if (!second_pass) {
656 second_pass = 1;
657 if (rc < 0) {
658 orig_ret = rc;
659 data = old_dr7;
660 }
661 goto restore;
662 }
663 return ((orig_ret < 0) ? orig_ret : rc);
664}
665
666/*
667 * Handle PTRACE_PEEKUSR calls for the debug register area.
668 */
669static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n)
670{
671 struct thread_struct *thread = &(tsk->thread);
672 unsigned long val = 0;
673
674 if (n < HBP_NUM) {
675 struct perf_event *bp;
676 bp = thread->ptrace_bps[n];
677 if (!bp)
678 return 0;
679 val = bp->hw.info.address;
680 } else if (n == 6) {
681 val = thread->debugreg6;
682 } else if (n == 7) {
683 val = thread->ptrace_dr7;
684 }
685 return val;
686}
687
688static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
689 unsigned long addr)
690{
691 struct perf_event *bp;
692 struct thread_struct *t = &tsk->thread;
693 struct perf_event_attr attr;
694
695 if (!t->ptrace_bps[nr]) {
696 hw_breakpoint_init(&attr);
499 /* 697 /*
500 * Sanity-check data. Take one half-byte at once with 698 * Put stub len and type to register (reserve) an inactive but
501 * check = (val >> (16 + 4*i)) & 0xf. It contains the 699 * correct bp
502 * R/Wi and LENi bits; bits 0 and 1 are R/Wi, and bits
503 * 2 and 3 are LENi. Given a list of invalid values,
504 * we do mask |= 1 << invalid_value, so that
505 * (mask >> check) & 1 is a correct test for invalid
506 * values.
507 *
508 * R/Wi contains the type of the breakpoint /
509 * watchpoint, LENi contains the length of the watched
510 * data in the watchpoint case.
511 *
512 * The invalid values are:
513 * - LENi == 0x10 (undefined), so mask |= 0x0f00. [32-bit]
514 * - R/Wi == 0x10 (break on I/O reads or writes), so
515 * mask |= 0x4444.
516 * - R/Wi == 0x00 && LENi != 0x00, so we have mask |=
517 * 0x1110.
518 *
519 * Finally, mask = 0x0f00 | 0x4444 | 0x1110 == 0x5f54.
520 *
521 * See the Intel Manual "System Programming Guide",
522 * 15.2.4
523 *
524 * Note that LENi == 0x10 is defined on x86_64 in long
525 * mode (i.e. even for 32-bit userspace software, but
526 * 64-bit kernel), so the x86_64 mask value is 0x5454.
527 * See the AMD manual no. 24593 (AMD64 System Programming)
528 */ 700 */
529#ifdef CONFIG_X86_32 701 attr.bp_addr = addr;
530#define DR7_MASK 0x5f54 702 attr.bp_len = HW_BREAKPOINT_LEN_1;
531#else 703 attr.bp_type = HW_BREAKPOINT_W;
532#define DR7_MASK 0x5554 704 attr.disabled = 1;
533#endif 705
534 data &= ~DR_CONTROL_RESERVED; 706 bp = register_user_hw_breakpoint(&attr, ptrace_triggered, tsk);
535 for (i = 0; i < 4; i++) 707
536 if ((DR7_MASK >> ((data >> (16 + 4*i)) & 0xf)) & 1) 708 /*
537 return -EIO; 709 * CHECKME: the previous code returned -EIO if the addr wasn't
538 child->thread.debugreg7 = data; 710 * a valid task virtual addr. The new one will return -EINVAL in
539 if (data) 711 * this case.
540 set_tsk_thread_flag(child, TIF_DEBUG); 712 * -EINVAL may be what we want for in-kernel breakpoints users,
541 else 713 * but -EIO looks better for ptrace, since we refuse a register
542 clear_tsk_thread_flag(child, TIF_DEBUG); 714 * writing for the user. And anyway this is the previous
543 break; 715 * behaviour.
716 */
717 if (IS_ERR(bp))
718 return PTR_ERR(bp);
719
720 t->ptrace_bps[nr] = bp;
721 } else {
722 int err;
723
724 bp = t->ptrace_bps[nr];
725
726 attr = bp->attr;
727 attr.bp_addr = addr;
728 err = modify_user_hw_breakpoint(bp, &attr);
729 if (err)
730 return err;
544 } 731 }
545 732
733
546 return 0; 734 return 0;
547} 735}
548 736
549/* 737/*
738 * Handle PTRACE_POKEUSR calls for the debug register area.
739 */
740int ptrace_set_debugreg(struct task_struct *tsk, int n, unsigned long val)
741{
742 struct thread_struct *thread = &(tsk->thread);
743 int rc = 0;
744
745 /* There are no DR4 or DR5 registers */
746 if (n == 4 || n == 5)
747 return -EIO;
748
749 if (n == 6) {
750 thread->debugreg6 = val;
751 goto ret_path;
752 }
753 if (n < HBP_NUM) {
754 rc = ptrace_set_breakpoint_addr(tsk, n, val);
755 if (rc)
756 return rc;
757 }
758 /* All that's left is DR7 */
759 if (n == 7) {
760 rc = ptrace_write_dr7(tsk, val);
761 if (!rc)
762 thread->ptrace_dr7 = val;
763 }
764
765ret_path:
766 return rc;
767}
768
769/*
550 * These access the current or another (stopped) task's io permission 770 * These access the current or another (stopped) task's io permission
551 * bitmap for debugging or core dump. 771 * bitmap for debugging or core dump.
552 */ 772 */
@@ -1219,14 +1439,14 @@ static int genregs32_get(struct task_struct *target,
1219{ 1439{
1220 if (kbuf) { 1440 if (kbuf) {
1221 compat_ulong_t *k = kbuf; 1441 compat_ulong_t *k = kbuf;
1222 while (count > 0) { 1442 while (count >= sizeof(*k)) {
1223 getreg32(target, pos, k++); 1443 getreg32(target, pos, k++);
1224 count -= sizeof(*k); 1444 count -= sizeof(*k);
1225 pos += sizeof(*k); 1445 pos += sizeof(*k);
1226 } 1446 }
1227 } else { 1447 } else {
1228 compat_ulong_t __user *u = ubuf; 1448 compat_ulong_t __user *u = ubuf;
1229 while (count > 0) { 1449 while (count >= sizeof(*u)) {
1230 compat_ulong_t word; 1450 compat_ulong_t word;
1231 getreg32(target, pos, &word); 1451 getreg32(target, pos, &word);
1232 if (__put_user(word, u++)) 1452 if (__put_user(word, u++))
@@ -1247,14 +1467,14 @@ static int genregs32_set(struct task_struct *target,
1247 int ret = 0; 1467 int ret = 0;
1248 if (kbuf) { 1468 if (kbuf) {
1249 const compat_ulong_t *k = kbuf; 1469 const compat_ulong_t *k = kbuf;
1250 while (count > 0 && !ret) { 1470 while (count >= sizeof(*k) && !ret) {
1251 ret = putreg32(target, pos, *k++); 1471 ret = putreg32(target, pos, *k++);
1252 count -= sizeof(*k); 1472 count -= sizeof(*k);
1253 pos += sizeof(*k); 1473 pos += sizeof(*k);
1254 } 1474 }
1255 } else { 1475 } else {
1256 const compat_ulong_t __user *u = ubuf; 1476 const compat_ulong_t __user *u = ubuf;
1257 while (count > 0 && !ret) { 1477 while (count >= sizeof(*u) && !ret) {
1258 compat_ulong_t word; 1478 compat_ulong_t word;
1259 ret = __get_user(word, u++); 1479 ret = __get_user(word, u++);
1260 if (ret) 1480 if (ret)
@@ -1345,7 +1565,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
1345 1565
1346#ifdef CONFIG_X86_64 1566#ifdef CONFIG_X86_64
1347 1567
1348static const struct user_regset x86_64_regsets[] = { 1568static struct user_regset x86_64_regsets[] __read_mostly = {
1349 [REGSET_GENERAL] = { 1569 [REGSET_GENERAL] = {
1350 .core_note_type = NT_PRSTATUS, 1570 .core_note_type = NT_PRSTATUS,
1351 .n = sizeof(struct user_regs_struct) / sizeof(long), 1571 .n = sizeof(struct user_regs_struct) / sizeof(long),
@@ -1358,6 +1578,12 @@ static const struct user_regset x86_64_regsets[] = {
1358 .size = sizeof(long), .align = sizeof(long), 1578 .size = sizeof(long), .align = sizeof(long),
1359 .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set 1579 .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set
1360 }, 1580 },
1581 [REGSET_XSTATE] = {
1582 .core_note_type = NT_X86_XSTATE,
1583 .size = sizeof(u64), .align = sizeof(u64),
1584 .active = xstateregs_active, .get = xstateregs_get,
1585 .set = xstateregs_set
1586 },
1361 [REGSET_IOPERM64] = { 1587 [REGSET_IOPERM64] = {
1362 .core_note_type = NT_386_IOPERM, 1588 .core_note_type = NT_386_IOPERM,
1363 .n = IO_BITMAP_LONGS, 1589 .n = IO_BITMAP_LONGS,
@@ -1383,7 +1609,7 @@ static const struct user_regset_view user_x86_64_view = {
1383#endif /* CONFIG_X86_64 */ 1609#endif /* CONFIG_X86_64 */
1384 1610
1385#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION 1611#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
1386static const struct user_regset x86_32_regsets[] = { 1612static struct user_regset x86_32_regsets[] __read_mostly = {
1387 [REGSET_GENERAL] = { 1613 [REGSET_GENERAL] = {
1388 .core_note_type = NT_PRSTATUS, 1614 .core_note_type = NT_PRSTATUS,
1389 .n = sizeof(struct user_regs_struct32) / sizeof(u32), 1615 .n = sizeof(struct user_regs_struct32) / sizeof(u32),
@@ -1402,6 +1628,12 @@ static const struct user_regset x86_32_regsets[] = {
1402 .size = sizeof(u32), .align = sizeof(u32), 1628 .size = sizeof(u32), .align = sizeof(u32),
1403 .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set 1629 .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set
1404 }, 1630 },
1631 [REGSET_XSTATE] = {
1632 .core_note_type = NT_X86_XSTATE,
1633 .size = sizeof(u64), .align = sizeof(u64),
1634 .active = xstateregs_active, .get = xstateregs_get,
1635 .set = xstateregs_set
1636 },
1405 [REGSET_TLS] = { 1637 [REGSET_TLS] = {
1406 .core_note_type = NT_386_TLS, 1638 .core_note_type = NT_386_TLS,
1407 .n = GDT_ENTRY_TLS_ENTRIES, .bias = GDT_ENTRY_TLS_MIN, 1639 .n = GDT_ENTRY_TLS_ENTRIES, .bias = GDT_ENTRY_TLS_MIN,
@@ -1424,6 +1656,23 @@ static const struct user_regset_view user_x86_32_view = {
1424}; 1656};
1425#endif 1657#endif
1426 1658
1659/*
1660 * This represents bytes 464..511 in the memory layout exported through
1661 * the REGSET_XSTATE interface.
1662 */
1663u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS];
1664
1665void update_regset_xstate_info(unsigned int size, u64 xstate_mask)
1666{
1667#ifdef CONFIG_X86_64
1668 x86_64_regsets[REGSET_XSTATE].n = size / sizeof(u64);
1669#endif
1670#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
1671 x86_32_regsets[REGSET_XSTATE].n = size / sizeof(u64);
1672#endif
1673 xstate_fx_sw_bytes[USER_XSTATE_XCR0_WORD] = xstate_mask;
1674}
1675
1427const struct user_regset_view *task_user_regset_view(struct task_struct *task) 1676const struct user_regset_view *task_user_regset_view(struct task_struct *task)
1428{ 1677{
1429#ifdef CONFIG_IA32_EMULATION 1678#ifdef CONFIG_IA32_EMULATION
@@ -1437,21 +1686,33 @@ const struct user_regset_view *task_user_regset_view(struct task_struct *task)
1437#endif 1686#endif
1438} 1687}
1439 1688
1440void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, 1689static void fill_sigtrap_info(struct task_struct *tsk,
1441 int error_code, int si_code) 1690 struct pt_regs *regs,
1691 int error_code, int si_code,
1692 struct siginfo *info)
1442{ 1693{
1443 struct siginfo info;
1444
1445 tsk->thread.trap_no = 1; 1694 tsk->thread.trap_no = 1;
1446 tsk->thread.error_code = error_code; 1695 tsk->thread.error_code = error_code;
1447 1696
1448 memset(&info, 0, sizeof(info)); 1697 memset(info, 0, sizeof(*info));
1449 info.si_signo = SIGTRAP; 1698 info->si_signo = SIGTRAP;
1450 info.si_code = si_code; 1699 info->si_code = si_code;
1700 info->si_addr = user_mode_vm(regs) ? (void __user *)regs->ip : NULL;
1701}
1451 1702
1452 /* User-mode ip? */ 1703void user_single_step_siginfo(struct task_struct *tsk,
1453 info.si_addr = user_mode_vm(regs) ? (void __user *) regs->ip : NULL; 1704 struct pt_regs *regs,
1705 struct siginfo *info)
1706{
1707 fill_sigtrap_info(tsk, regs, 0, TRAP_BRKPT, info);
1708}
1454 1709
1710void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
1711 int error_code, int si_code)
1712{
1713 struct siginfo info;
1714
1715 fill_sigtrap_info(tsk, regs, error_code, si_code, &info);
1455 /* Send us the fake SIGTRAP */ 1716 /* Send us the fake SIGTRAP */
1456 force_sig_info(SIGTRAP, &info, tsk); 1717 force_sig_info(SIGTRAP, &info, tsk);
1457} 1718}
@@ -1516,29 +1777,22 @@ asmregparm long syscall_trace_enter(struct pt_regs *regs)
1516 1777
1517asmregparm void syscall_trace_leave(struct pt_regs *regs) 1778asmregparm void syscall_trace_leave(struct pt_regs *regs)
1518{ 1779{
1780 bool step;
1781
1519 if (unlikely(current->audit_context)) 1782 if (unlikely(current->audit_context))
1520 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); 1783 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
1521 1784
1522 if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) 1785 if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
1523 trace_sys_exit(regs, regs->ax); 1786 trace_sys_exit(regs, regs->ax);
1524 1787
1525 if (test_thread_flag(TIF_SYSCALL_TRACE))
1526 tracehook_report_syscall_exit(regs, 0);
1527
1528 /* 1788 /*
1529 * If TIF_SYSCALL_EMU is set, we only get here because of 1789 * If TIF_SYSCALL_EMU is set, we only get here because of
1530 * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP). 1790 * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP).
1531 * We already reported this syscall instruction in 1791 * We already reported this syscall instruction in
1532 * syscall_trace_enter(), so don't do any more now. 1792 * syscall_trace_enter().
1533 */
1534 if (unlikely(test_thread_flag(TIF_SYSCALL_EMU)))
1535 return;
1536
1537 /*
1538 * If we are single-stepping, synthesize a trap to follow the
1539 * system call instruction.
1540 */ 1793 */
1541 if (test_thread_flag(TIF_SINGLESTEP) && 1794 step = unlikely(test_thread_flag(TIF_SINGLESTEP)) &&
1542 tracehook_consider_fatal_signal(current, SIGTRAP)) 1795 !test_thread_flag(TIF_SYSCALL_EMU);
1543 send_sigtrap(current, regs, 0, TRAP_BRKPT); 1796 if (step || test_thread_flag(TIF_SYSCALL_TRACE))
1797 tracehook_report_syscall_exit(regs, step);
1544} 1798}
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 6c3b2c6fd772..12e9feaa2f7a 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -491,6 +491,19 @@ void force_hpet_resume(void)
491 break; 491 break;
492 } 492 }
493} 493}
494
495/*
496 * HPET MSI on some boards (ATI SB700/SB800) has side effect on
497 * floppy DMA. Disable HPET MSI on such platforms.
498 */
499static void force_disable_hpet_msi(struct pci_dev *unused)
500{
501 hpet_msi_disable = 1;
502}
503
504DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS,
505 force_disable_hpet_msi);
506
494#endif 507#endif
495 508
496#if defined(CONFIG_PCI) && defined(CONFIG_NUMA) 509#if defined(CONFIG_PCI) && defined(CONFIG_NUMA)
@@ -499,6 +512,7 @@ static void __init quirk_amd_nb_node(struct pci_dev *dev)
499{ 512{
500 struct pci_dev *nb_ht; 513 struct pci_dev *nb_ht;
501 unsigned int devfn; 514 unsigned int devfn;
515 u32 node;
502 u32 val; 516 u32 val;
503 517
504 devfn = PCI_DEVFN(PCI_SLOT(dev->devfn), 0); 518 devfn = PCI_DEVFN(PCI_SLOT(dev->devfn), 0);
@@ -507,7 +521,13 @@ static void __init quirk_amd_nb_node(struct pci_dev *dev)
507 return; 521 return;
508 522
509 pci_read_config_dword(nb_ht, 0x60, &val); 523 pci_read_config_dword(nb_ht, 0x60, &val);
510 set_dev_node(&dev->dev, val & 7); 524 node = val & 7;
525 /*
526 * Some hardware may return an invalid node ID,
527 * so check it first:
528 */
529 if (node_online(node))
530 set_dev_node(&dev->dev, node);
511 pci_dev_put(nb_ht); 531 pci_dev_put(nb_ht);
512} 532}
513 533
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index f93078746e00..8e1aac86b50c 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -23,7 +23,7 @@
23# include <linux/ctype.h> 23# include <linux/ctype.h>
24# include <linux/mc146818rtc.h> 24# include <linux/mc146818rtc.h>
25#else 25#else
26# include <asm/iommu.h> 26# include <asm/x86_init.h>
27#endif 27#endif
28 28
29/* 29/*
@@ -203,6 +203,15 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
203 DMI_MATCH(DMI_BOARD_NAME, "0T656F"), 203 DMI_MATCH(DMI_BOARD_NAME, "0T656F"),
204 }, 204 },
205 }, 205 },
206 { /* Handle problems with rebooting on Dell OptiPlex 760 with 0G919G*/
207 .callback = set_bios_reboot,
208 .ident = "Dell OptiPlex 760",
209 .matches = {
210 DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
211 DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 760"),
212 DMI_MATCH(DMI_BOARD_NAME, "0G919G"),
213 },
214 },
206 { /* Handle problems with rebooting on Dell 2400's */ 215 { /* Handle problems with rebooting on Dell 2400's */
207 .callback = set_bios_reboot, 216 .callback = set_bios_reboot,
208 .ident = "Dell PowerEdge 2400", 217 .ident = "Dell PowerEdge 2400",
@@ -259,6 +268,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
259 DMI_MATCH(DMI_PRODUCT_NAME, "SBC-FITPC2"), 268 DMI_MATCH(DMI_PRODUCT_NAME, "SBC-FITPC2"),
260 }, 269 },
261 }, 270 },
271 { /* Handle problems with rebooting on ASUS P4S800 */
272 .callback = set_bios_reboot,
273 .ident = "ASUS P4S800",
274 .matches = {
275 DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
276 DMI_MATCH(DMI_BOARD_NAME, "P4S800"),
277 },
278 },
262 { } 279 { }
263}; 280};
264 281
@@ -444,6 +461,14 @@ static struct dmi_system_id __initdata pci_reboot_dmi_table[] = {
444 DMI_MATCH(DMI_PRODUCT_NAME, "Macmini3,1"), 461 DMI_MATCH(DMI_PRODUCT_NAME, "Macmini3,1"),
445 }, 462 },
446 }, 463 },
464 { /* Handle problems with rebooting on the iMac9,1. */
465 .callback = set_pci_reboot,
466 .ident = "Apple iMac9,1",
467 .matches = {
468 DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
469 DMI_MATCH(DMI_PRODUCT_NAME, "iMac9,1"),
470 },
471 },
447 { } 472 { }
448}; 473};
449 474
@@ -622,7 +647,7 @@ void native_machine_shutdown(void)
622#endif 647#endif
623 648
624#ifdef CONFIG_X86_64 649#ifdef CONFIG_X86_64
625 pci_iommu_shutdown(); 650 x86_platform.iommu_shutdown();
626#endif 651#endif
627} 652}
628 653
diff --git a/arch/x86/kernel/reboot_fixups_32.c b/arch/x86/kernel/reboot_fixups_32.c
index 61a837743fe5..fda313ebbb03 100644
--- a/arch/x86/kernel/reboot_fixups_32.c
+++ b/arch/x86/kernel/reboot_fixups_32.c
@@ -12,7 +12,7 @@
12#include <linux/interrupt.h> 12#include <linux/interrupt.h>
13#include <asm/reboot_fixups.h> 13#include <asm/reboot_fixups.h>
14#include <asm/msr.h> 14#include <asm/msr.h>
15#include <asm/geode.h> 15#include <linux/cs5535.h>
16 16
17static void cs5530a_warm_reset(struct pci_dev *dev) 17static void cs5530a_warm_reset(struct pci_dev *dev)
18{ 18{
@@ -80,6 +80,7 @@ void mach_reboot_fixups(void)
80 continue; 80 continue;
81 81
82 cur->reboot_fixup(dev); 82 cur->reboot_fixup(dev);
83 pci_dev_put(dev);
83 } 84 }
84} 85}
85 86
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 2a34f9c5be21..c4851eff57b3 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -55,7 +55,6 @@
55#include <linux/stddef.h> 55#include <linux/stddef.h>
56#include <linux/unistd.h> 56#include <linux/unistd.h>
57#include <linux/ptrace.h> 57#include <linux/ptrace.h>
58#include <linux/slab.h>
59#include <linux/user.h> 58#include <linux/user.h>
60#include <linux/delay.h> 59#include <linux/delay.h>
61 60
@@ -73,6 +72,7 @@
73 72
74#include <asm/mtrr.h> 73#include <asm/mtrr.h>
75#include <asm/apic.h> 74#include <asm/apic.h>
75#include <asm/trampoline.h>
76#include <asm/e820.h> 76#include <asm/e820.h>
77#include <asm/mpspec.h> 77#include <asm/mpspec.h>
78#include <asm/setup.h> 78#include <asm/setup.h>
@@ -106,9 +106,11 @@
106#include <asm/percpu.h> 106#include <asm/percpu.h>
107#include <asm/topology.h> 107#include <asm/topology.h>
108#include <asm/apicdef.h> 108#include <asm/apicdef.h>
109#include <asm/k8.h>
109#ifdef CONFIG_X86_64 110#ifdef CONFIG_X86_64
110#include <asm/numa_64.h> 111#include <asm/numa_64.h>
111#endif 112#endif
113#include <asm/mce.h>
112 114
113/* 115/*
114 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. 116 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
@@ -118,7 +120,9 @@
118unsigned long max_low_pfn_mapped; 120unsigned long max_low_pfn_mapped;
119unsigned long max_pfn_mapped; 121unsigned long max_pfn_mapped;
120 122
123#ifdef CONFIG_DMI
121RESERVE_BRK(dmi_alloc, 65536); 124RESERVE_BRK(dmi_alloc, 65536);
125#endif
122 126
123unsigned int boot_cpu_id __read_mostly; 127unsigned int boot_cpu_id __read_mostly;
124 128
@@ -247,7 +251,7 @@ EXPORT_SYMBOL(edd);
247 * from boot_params into a safe place. 251 * from boot_params into a safe place.
248 * 252 *
249 */ 253 */
250static inline void copy_edd(void) 254static inline void __init copy_edd(void)
251{ 255{
252 memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer, 256 memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer,
253 sizeof(edd.mbr_signature)); 257 sizeof(edd.mbr_signature));
@@ -256,7 +260,7 @@ static inline void copy_edd(void)
256 edd.edd_info_nr = boot_params.eddbuf_entries; 260 edd.edd_info_nr = boot_params.eddbuf_entries;
257} 261}
258#else 262#else
259static inline void copy_edd(void) 263static inline void __init copy_edd(void)
260{ 264{
261} 265}
262#endif 266#endif
@@ -309,16 +313,17 @@ static void __init reserve_brk(void)
309#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT) 313#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
310static void __init relocate_initrd(void) 314static void __init relocate_initrd(void)
311{ 315{
312 316 /* Assume only end is not page aligned */
313 u64 ramdisk_image = boot_params.hdr.ramdisk_image; 317 u64 ramdisk_image = boot_params.hdr.ramdisk_image;
314 u64 ramdisk_size = boot_params.hdr.ramdisk_size; 318 u64 ramdisk_size = boot_params.hdr.ramdisk_size;
319 u64 area_size = PAGE_ALIGN(ramdisk_size);
315 u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT; 320 u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT;
316 u64 ramdisk_here; 321 u64 ramdisk_here;
317 unsigned long slop, clen, mapaddr; 322 unsigned long slop, clen, mapaddr;
318 char *p, *q; 323 char *p, *q;
319 324
320 /* We need to move the initrd down into lowmem */ 325 /* We need to move the initrd down into lowmem */
321 ramdisk_here = find_e820_area(0, end_of_lowmem, ramdisk_size, 326 ramdisk_here = find_e820_area(0, end_of_lowmem, area_size,
322 PAGE_SIZE); 327 PAGE_SIZE);
323 328
324 if (ramdisk_here == -1ULL) 329 if (ramdisk_here == -1ULL)
@@ -327,7 +332,7 @@ static void __init relocate_initrd(void)
327 332
328 /* Note: this includes all the lowmem currently occupied by 333 /* Note: this includes all the lowmem currently occupied by
329 the initrd, we rely on that fact to keep the data intact. */ 334 the initrd, we rely on that fact to keep the data intact. */
330 reserve_early(ramdisk_here, ramdisk_here + ramdisk_size, 335 reserve_early(ramdisk_here, ramdisk_here + area_size,
331 "NEW RAMDISK"); 336 "NEW RAMDISK");
332 initrd_start = ramdisk_here + PAGE_OFFSET; 337 initrd_start = ramdisk_here + PAGE_OFFSET;
333 initrd_end = initrd_start + ramdisk_size; 338 initrd_end = initrd_start + ramdisk_size;
@@ -371,9 +376,10 @@ static void __init relocate_initrd(void)
371 376
372static void __init reserve_initrd(void) 377static void __init reserve_initrd(void)
373{ 378{
379 /* Assume only end is not page aligned */
374 u64 ramdisk_image = boot_params.hdr.ramdisk_image; 380 u64 ramdisk_image = boot_params.hdr.ramdisk_image;
375 u64 ramdisk_size = boot_params.hdr.ramdisk_size; 381 u64 ramdisk_size = boot_params.hdr.ramdisk_size;
376 u64 ramdisk_end = ramdisk_image + ramdisk_size; 382 u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
377 u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT; 383 u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT;
378 384
379 if (!boot_params.hdr.type_of_loader || 385 if (!boot_params.hdr.type_of_loader ||
@@ -486,42 +492,11 @@ static void __init reserve_early_setup_data(void)
486 492
487#ifdef CONFIG_KEXEC 493#ifdef CONFIG_KEXEC
488 494
489/**
490 * Reserve @size bytes of crashkernel memory at any suitable offset.
491 *
492 * @size: Size of the crashkernel memory to reserve.
493 * Returns the base address on success, and -1ULL on failure.
494 */
495static
496unsigned long long __init find_and_reserve_crashkernel(unsigned long long size)
497{
498 const unsigned long long alignment = 16<<20; /* 16M */
499 unsigned long long start = 0LL;
500
501 while (1) {
502 int ret;
503
504 start = find_e820_area(start, ULONG_MAX, size, alignment);
505 if (start == -1ULL)
506 return start;
507
508 /* try to reserve it */
509 ret = reserve_bootmem_generic(start, size, BOOTMEM_EXCLUSIVE);
510 if (ret >= 0)
511 return start;
512
513 start += alignment;
514 }
515}
516
517static inline unsigned long long get_total_mem(void) 495static inline unsigned long long get_total_mem(void)
518{ 496{
519 unsigned long long total; 497 unsigned long long total;
520 498
521 total = max_low_pfn - min_low_pfn; 499 total = max_pfn - min_low_pfn;
522#ifdef CONFIG_HIGHMEM
523 total += highend_pfn - highstart_pfn;
524#endif
525 500
526 return total << PAGE_SHIFT; 501 return total << PAGE_SHIFT;
527} 502}
@@ -541,21 +516,25 @@ static void __init reserve_crashkernel(void)
541 516
542 /* 0 means: find the address automatically */ 517 /* 0 means: find the address automatically */
543 if (crash_base <= 0) { 518 if (crash_base <= 0) {
544 crash_base = find_and_reserve_crashkernel(crash_size); 519 const unsigned long long alignment = 16<<20; /* 16M */
520
521 crash_base = find_e820_area(alignment, ULONG_MAX, crash_size,
522 alignment);
545 if (crash_base == -1ULL) { 523 if (crash_base == -1ULL) {
546 pr_info("crashkernel reservation failed. " 524 pr_info("crashkernel reservation failed - No suitable area found.\n");
547 "No suitable area found.\n");
548 return; 525 return;
549 } 526 }
550 } else { 527 } else {
551 ret = reserve_bootmem_generic(crash_base, crash_size, 528 unsigned long long start;
552 BOOTMEM_EXCLUSIVE); 529
553 if (ret < 0) { 530 start = find_e820_area(crash_base, ULONG_MAX, crash_size,
554 pr_info("crashkernel reservation failed - " 531 1<<20);
555 "memory is in use\n"); 532 if (start != crash_base) {
533 pr_info("crashkernel reservation failed - memory is in use.\n");
556 return; 534 return;
557 } 535 }
558 } 536 }
537 reserve_early(crash_base, crash_base + crash_size, "CRASH KERNEL");
559 538
560 printk(KERN_INFO "Reserving %ldMB of memory at %ldMB " 539 printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
561 "for crashkernel (System RAM: %ldMB)\n", 540 "for crashkernel (System RAM: %ldMB)\n",
@@ -628,6 +607,16 @@ static int __init setup_elfcorehdr(char *arg)
628early_param("elfcorehdr", setup_elfcorehdr); 607early_param("elfcorehdr", setup_elfcorehdr);
629#endif 608#endif
630 609
610static __init void reserve_ibft_region(void)
611{
612 unsigned long addr, size = 0;
613
614 addr = find_ibft_region(&size);
615
616 if (size)
617 reserve_early_overlap_ok(addr, addr + size, "ibft");
618}
619
631#ifdef CONFIG_X86_RESERVE_LOW_64K 620#ifdef CONFIG_X86_RESERVE_LOW_64K
632static int __init dmi_low_memory_corruption(const struct dmi_system_id *d) 621static int __init dmi_low_memory_corruption(const struct dmi_system_id *d)
633{ 622{
@@ -666,23 +655,48 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
666 DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix/MSC"), 655 DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix/MSC"),
667 }, 656 },
668 }, 657 },
669 {
670 /* 658 /*
671 * AMI BIOS with low memory corruption was found on Intel DG45ID board. 659 * AMI BIOS with low memory corruption was found on Intel DG45ID and
672 * It hase different DMI_BIOS_VENDOR = "Intel Corp.", for now we will 660 * DG45FC boards.
661 * It has a different DMI_BIOS_VENDOR = "Intel Corp.", for now we will
673 * match only DMI_BOARD_NAME and see if there is more bad products 662 * match only DMI_BOARD_NAME and see if there is more bad products
674 * with this vendor. 663 * with this vendor.
675 */ 664 */
665 {
676 .callback = dmi_low_memory_corruption, 666 .callback = dmi_low_memory_corruption,
677 .ident = "AMI BIOS", 667 .ident = "AMI BIOS",
678 .matches = { 668 .matches = {
679 DMI_MATCH(DMI_BOARD_NAME, "DG45ID"), 669 DMI_MATCH(DMI_BOARD_NAME, "DG45ID"),
680 }, 670 },
681 }, 671 },
672 {
673 .callback = dmi_low_memory_corruption,
674 .ident = "AMI BIOS",
675 .matches = {
676 DMI_MATCH(DMI_BOARD_NAME, "DG45FC"),
677 },
678 },
682#endif 679#endif
683 {} 680 {}
684}; 681};
685 682
683static void __init trim_bios_range(void)
684{
685 /*
686 * A special case is the first 4Kb of memory;
687 * This is a BIOS owned area, not kernel ram, but generally
688 * not listed as such in the E820 table.
689 */
690 e820_update_range(0, PAGE_SIZE, E820_RAM, E820_RESERVED);
691 /*
692 * special case: Some BIOSen report the PC BIOS
693 * area (640->1Mb) as ram even though it is not.
694 * take them out.
695 */
696 e820_remove_range(BIOS_BEGIN, BIOS_END - BIOS_BEGIN, E820_RAM, 1);
697 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
698}
699
686/* 700/*
687 * Determine if we were loaded by an EFI loader. If so, then we have also been 701 * Determine if we were loaded by an EFI loader. If so, then we have also been
688 * passed the efi memmap, systab, etc., so we should use these data structures 702 * passed the efi memmap, systab, etc., so we should use these data structures
@@ -698,6 +712,9 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
698 712
699void __init setup_arch(char **cmdline_p) 713void __init setup_arch(char **cmdline_p)
700{ 714{
715 int acpi = 0;
716 int k8 = 0;
717
701#ifdef CONFIG_X86_32 718#ifdef CONFIG_X86_32
702 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); 719 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
703 visws_early_detect(); 720 visws_early_detect();
@@ -790,21 +807,18 @@ void __init setup_arch(char **cmdline_p)
790 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); 807 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
791 *cmdline_p = command_line; 808 *cmdline_p = command_line;
792 809
793#ifdef CONFIG_X86_64
794 /* 810 /*
795 * Must call this twice: Once just to detect whether hardware doesn't 811 * x86_configure_nx() is called before parse_early_param() to detect
796 * support NX (so that the early EHCI debug console setup can safely 812 * whether hardware doesn't support NX (so that the early EHCI debug
797 * call set_fixmap(), and then again after parsing early parameters to 813 * console setup can safely call set_fixmap()). It may then be called
798 * honor the respective command line option. 814 * again from within noexec_setup() during parsing early parameters
815 * to honor the respective command line option.
799 */ 816 */
800 check_efer(); 817 x86_configure_nx();
801#endif
802 818
803 parse_early_param(); 819 parse_early_param();
804 820
805#ifdef CONFIG_X86_64 821 x86_report_nx();
806 check_efer();
807#endif
808 822
809 /* Must be before kernel pagetables are setup */ 823 /* Must be before kernel pagetables are setup */
810 vmi_activate(); 824 vmi_activate();
@@ -846,7 +860,7 @@ void __init setup_arch(char **cmdline_p)
846 insert_resource(&iomem_resource, &data_resource); 860 insert_resource(&iomem_resource, &data_resource);
847 insert_resource(&iomem_resource, &bss_resource); 861 insert_resource(&iomem_resource, &bss_resource);
848 862
849 863 trim_bios_range();
850#ifdef CONFIG_X86_32 864#ifdef CONFIG_X86_32
851 if (ppro_with_ram_bug()) { 865 if (ppro_with_ram_bug()) {
852 e820_update_range(0x70000000ULL, 0x40000ULL, E820_RAM, 866 e820_update_range(0x70000000ULL, 0x40000ULL, E820_RAM,
@@ -900,6 +914,22 @@ void __init setup_arch(char **cmdline_p)
900 914
901 reserve_brk(); 915 reserve_brk();
902 916
917 /*
918 * Find and reserve possible boot-time SMP configuration:
919 */
920 find_smp_config();
921
922 reserve_ibft_region();
923
924 reserve_trampoline_memory();
925
926#ifdef CONFIG_ACPI_SLEEP
927 /*
928 * Reserve low memory region for sleep support.
929 * even before init_memory_mapping
930 */
931 acpi_reserve_wakeup_memory();
932#endif
903 init_gbpages(); 933 init_gbpages();
904 934
905 /* max_pfn_mapped is updated here */ 935 /* max_pfn_mapped is updated here */
@@ -926,6 +956,8 @@ void __init setup_arch(char **cmdline_p)
926 956
927 reserve_initrd(); 957 reserve_initrd();
928 958
959 reserve_crashkernel();
960
929 vsmp_init(); 961 vsmp_init();
930 962
931 io_delay_init(); 963 io_delay_init();
@@ -941,34 +973,20 @@ void __init setup_arch(char **cmdline_p)
941 /* 973 /*
942 * Parse SRAT to discover nodes. 974 * Parse SRAT to discover nodes.
943 */ 975 */
944 acpi_numa_init(); 976 acpi = acpi_numa_init();
945#endif 977#endif
946 978
947 initmem_init(0, max_pfn); 979#ifdef CONFIG_K8_NUMA
948 980 if (!acpi)
949#ifdef CONFIG_ACPI_SLEEP 981 k8 = !k8_numa_init(0, max_pfn);
950 /*
951 * Reserve low memory region for sleep support.
952 */
953 acpi_reserve_bootmem();
954#endif 982#endif
955 /*
956 * Find and reserve possible boot-time SMP configuration:
957 */
958 find_smp_config();
959 983
960 reserve_crashkernel(); 984 initmem_init(0, max_pfn, acpi, k8);
961 985#ifndef CONFIG_NO_BOOTMEM
962#ifdef CONFIG_X86_64 986 early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT);
963 /*
964 * dma32_reserve_bootmem() allocates bootmem which may conflict
965 * with the crashkernel command line, so do that after
966 * reserve_crashkernel()
967 */
968 dma32_reserve_bootmem();
969#endif 987#endif
970 988
971 reserve_ibft_region(); 989 dma32_reserve_bootmem();
972 990
973#ifdef CONFIG_KVM_CLOCK 991#ifdef CONFIG_KVM_CLOCK
974 kvmclock_init(); 992 kvmclock_init();
@@ -1031,6 +1049,8 @@ void __init setup_arch(char **cmdline_p)
1031#endif 1049#endif
1032#endif 1050#endif
1033 x86_init.oem.banner(); 1051 x86_init.oem.banner();
1052
1053 mcheck_init();
1034} 1054}
1035 1055
1036#ifdef CONFIG_X86_32 1056#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index d559af913e1f..ef6370b00e70 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -1,3 +1,5 @@
1#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
2
1#include <linux/kernel.h> 3#include <linux/kernel.h>
2#include <linux/module.h> 4#include <linux/module.h>
3#include <linux/init.h> 5#include <linux/init.h>
@@ -20,9 +22,9 @@
20#include <asm/stackprotector.h> 22#include <asm/stackprotector.h>
21 23
22#ifdef CONFIG_DEBUG_PER_CPU_MAPS 24#ifdef CONFIG_DEBUG_PER_CPU_MAPS
23# define DBG(x...) printk(KERN_DEBUG x) 25# define DBG(fmt, ...) pr_dbg(fmt, ##__VA_ARGS__)
24#else 26#else
25# define DBG(x...) 27# define DBG(fmt, ...) do { if (0) pr_dbg(fmt, ##__VA_ARGS__); } while (0)
26#endif 28#endif
27 29
28DEFINE_PER_CPU(int, cpu_number); 30DEFINE_PER_CPU(int, cpu_number);
@@ -116,8 +118,8 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
116 } else { 118 } else {
117 ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node), 119 ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node),
118 size, align, goal); 120 size, align, goal);
119 pr_debug("per cpu data for cpu%d %lu bytes on node%d at " 121 pr_debug("per cpu data for cpu%d %lu bytes on node%d at %016lx\n",
120 "%016lx\n", cpu, size, node, __pa(ptr)); 122 cpu, size, node, __pa(ptr));
121 } 123 }
122 return ptr; 124 return ptr;
123#else 125#else
@@ -135,7 +137,13 @@ static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align)
135 137
136static void __init pcpu_fc_free(void *ptr, size_t size) 138static void __init pcpu_fc_free(void *ptr, size_t size)
137{ 139{
140#ifdef CONFIG_NO_BOOTMEM
141 u64 start = __pa(ptr);
142 u64 end = start + size;
143 free_early_partial(start, end);
144#else
138 free_bootmem(__pa(ptr), size); 145 free_bootmem(__pa(ptr), size);
146#endif
139} 147}
140 148
141static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) 149static int __init pcpu_cpu_distance(unsigned int from, unsigned int to)
@@ -198,8 +206,7 @@ void __init setup_per_cpu_areas(void)
198 pcpu_cpu_distance, 206 pcpu_cpu_distance,
199 pcpu_fc_alloc, pcpu_fc_free); 207 pcpu_fc_alloc, pcpu_fc_free);
200 if (rc < 0) 208 if (rc < 0)
201 pr_warning("PERCPU: %s allocator failed (%d), " 209 pr_warning("%s allocator failed (%d), falling back to page size\n",
202 "falling back to page size\n",
203 pcpu_fc_names[pcpu_chosen_fc], rc); 210 pcpu_fc_names[pcpu_chosen_fc], rc);
204 } 211 }
205 if (rc < 0) 212 if (rc < 0)
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 6a44a76055ad..4fd173cd8e57 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -19,6 +19,7 @@
19#include <linux/stddef.h> 19#include <linux/stddef.h>
20#include <linux/personality.h> 20#include <linux/personality.h>
21#include <linux/uaccess.h> 21#include <linux/uaccess.h>
22#include <linux/user-return-notifier.h>
22 23
23#include <asm/processor.h> 24#include <asm/processor.h>
24#include <asm/ucontext.h> 25#include <asm/ucontext.h>
@@ -544,22 +545,12 @@ sys_sigaction(int sig, const struct old_sigaction __user *act,
544} 545}
545#endif /* CONFIG_X86_32 */ 546#endif /* CONFIG_X86_32 */
546 547
547#ifdef CONFIG_X86_32 548long
548int sys_sigaltstack(struct pt_regs *regs)
549{
550 const stack_t __user *uss = (const stack_t __user *)regs->bx;
551 stack_t __user *uoss = (stack_t __user *)regs->cx;
552
553 return do_sigaltstack(uss, uoss, regs->sp);
554}
555#else /* !CONFIG_X86_32 */
556asmlinkage long
557sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, 549sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
558 struct pt_regs *regs) 550 struct pt_regs *regs)
559{ 551{
560 return do_sigaltstack(uss, uoss, regs->sp); 552 return do_sigaltstack(uss, uoss, regs->sp);
561} 553}
562#endif /* CONFIG_X86_32 */
563 554
564/* 555/*
565 * Do a signal return; undo the signal stack. 556 * Do a signal return; undo the signal stack.
@@ -799,15 +790,6 @@ static void do_signal(struct pt_regs *regs)
799 790
800 signr = get_signal_to_deliver(&info, &ka, regs, NULL); 791 signr = get_signal_to_deliver(&info, &ka, regs, NULL);
801 if (signr > 0) { 792 if (signr > 0) {
802 /*
803 * Re-enable any watchpoints before delivering the
804 * signal to user space. The processor register will
805 * have been cleared if the watchpoint triggered
806 * inside the kernel.
807 */
808 if (current->thread.debugreg7)
809 set_debugreg(current->thread.debugreg7, 7);
810
811 /* Whee! Actually deliver the signal. */ 793 /* Whee! Actually deliver the signal. */
812 if (handle_signal(signr, &info, &ka, oldset, regs) == 0) { 794 if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
813 /* 795 /*
@@ -872,6 +854,8 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
872 if (current->replacement_session_keyring) 854 if (current->replacement_session_keyring)
873 key_replace_session_keyring(); 855 key_replace_session_keyring();
874 } 856 }
857 if (thread_info_flags & _TIF_USER_RETURN_NOTIFY)
858 fire_user_return_notifiers();
875 859
876#ifdef CONFIG_X86_32 860#ifdef CONFIG_X86_32
877 clear_thread_flag(TIF_IRET); 861 clear_thread_flag(TIF_IRET);
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index a93528bc16e9..97af589a5c0c 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -21,6 +21,7 @@
21#include <linux/cache.h> 21#include <linux/cache.h>
22#include <linux/interrupt.h> 22#include <linux/interrupt.h>
23#include <linux/cpu.h> 23#include <linux/cpu.h>
24#include <linux/gfp.h>
24 25
25#include <litmus/litmus.h> 26#include <litmus/litmus.h>
26#include <litmus/trace.h> 27#include <litmus/trace.h>
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 565ebc65920e..763d815e27a0 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -48,6 +48,8 @@
48#include <linux/err.h> 48#include <linux/err.h>
49#include <linux/nmi.h> 49#include <linux/nmi.h>
50#include <linux/tboot.h> 50#include <linux/tboot.h>
51#include <linux/stackprotector.h>
52#include <linux/gfp.h>
51 53
52#include <asm/acpi.h> 54#include <asm/acpi.h>
53#include <asm/desc.h> 55#include <asm/desc.h>
@@ -67,6 +69,7 @@
67#include <linux/mc146818rtc.h> 69#include <linux/mc146818rtc.h>
68 70
69#include <asm/smpboot_hooks.h> 71#include <asm/smpboot_hooks.h>
72#include <asm/i8259.h>
70 73
71#ifdef CONFIG_X86_32 74#ifdef CONFIG_X86_32
72u8 apicid_2_node[MAX_APICID]; 75u8 apicid_2_node[MAX_APICID];
@@ -240,7 +243,10 @@ static void __cpuinit smp_callin(void)
240 end_local_APIC_setup(); 243 end_local_APIC_setup();
241 map_cpu_to_logical_apicid(); 244 map_cpu_to_logical_apicid();
242 245
243 notify_cpu_starting(cpuid); 246 /*
247 * Need to setup vector mappings before we enable interrupts.
248 */
249 setup_vector_irq(smp_processor_id());
244 /* 250 /*
245 * Get our bogomips. 251 * Get our bogomips.
246 * 252 *
@@ -257,6 +263,8 @@ static void __cpuinit smp_callin(void)
257 */ 263 */
258 smp_store_cpu_info(cpuid); 264 smp_store_cpu_info(cpuid);
259 265
266 notify_cpu_starting(cpuid);
267
260 /* 268 /*
261 * Allow the master to continue. 269 * Allow the master to continue.
262 */ 270 */
@@ -286,9 +294,9 @@ notrace static void __cpuinit start_secondary(void *unused)
286 check_tsc_sync_target(); 294 check_tsc_sync_target();
287 295
288 if (nmi_watchdog == NMI_IO_APIC) { 296 if (nmi_watchdog == NMI_IO_APIC) {
289 disable_8259A_irq(0); 297 legacy_pic->chip->mask(0);
290 enable_NMI_through_LVT0(); 298 enable_NMI_through_LVT0();
291 enable_8259A_irq(0); 299 legacy_pic->chip->unmask(0);
292 } 300 }
293 301
294#ifdef CONFIG_X86_32 302#ifdef CONFIG_X86_32
@@ -315,15 +323,18 @@ notrace static void __cpuinit start_secondary(void *unused)
315 */ 323 */
316 ipi_call_lock(); 324 ipi_call_lock();
317 lock_vector_lock(); 325 lock_vector_lock();
318 __setup_vector_irq(smp_processor_id());
319 set_cpu_online(smp_processor_id(), true); 326 set_cpu_online(smp_processor_id(), true);
320 unlock_vector_lock(); 327 unlock_vector_lock();
321 ipi_call_unlock(); 328 ipi_call_unlock();
322 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; 329 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
330 x86_platform.nmi_init();
323 331
324 /* enable local interrupts */ 332 /* enable local interrupts */
325 local_irq_enable(); 333 local_irq_enable();
326 334
335 /* to prevent fake stack check failure in clock setup */
336 boot_init_stack_canary();
337
327 x86_cpuinit.setup_percpu_clockev(); 338 x86_cpuinit.setup_percpu_clockev();
328 339
329 wmb(); 340 wmb();
@@ -671,6 +682,26 @@ static void __cpuinit do_fork_idle(struct work_struct *work)
671 complete(&c_idle->done); 682 complete(&c_idle->done);
672} 683}
673 684
685/* reduce the number of lines printed when booting a large cpu count system */
686static void __cpuinit announce_cpu(int cpu, int apicid)
687{
688 static int current_node = -1;
689 int node = cpu_to_node(cpu);
690
691 if (system_state == SYSTEM_BOOTING) {
692 if (node != current_node) {
693 if (current_node > (-1))
694 pr_cont(" Ok.\n");
695 current_node = node;
696 pr_info("Booting Node %3d, Processors ", node);
697 }
698 pr_cont(" #%d%s", cpu, cpu == (nr_cpu_ids - 1) ? " Ok.\n" : "");
699 return;
700 } else
701 pr_info("Booting Node %d Processor %d APIC 0x%x\n",
702 node, cpu, apicid);
703}
704
674/* 705/*
675 * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad 706 * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
676 * (ie clustered apic addressing mode), this is a LOGICAL apic ID. 707 * (ie clustered apic addressing mode), this is a LOGICAL apic ID.
@@ -687,7 +718,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
687 .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done), 718 .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done),
688 }; 719 };
689 720
690 INIT_WORK(&c_idle.work, do_fork_idle); 721 INIT_WORK_ON_STACK(&c_idle.work, do_fork_idle);
691 722
692 alternatives_smp_switch(1); 723 alternatives_smp_switch(1);
693 724
@@ -713,6 +744,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
713 744
714 if (IS_ERR(c_idle.idle)) { 745 if (IS_ERR(c_idle.idle)) {
715 printk("failed fork for CPU %d\n", cpu); 746 printk("failed fork for CPU %d\n", cpu);
747 destroy_work_on_stack(&c_idle.work);
716 return PTR_ERR(c_idle.idle); 748 return PTR_ERR(c_idle.idle);
717 } 749 }
718 750
@@ -736,9 +768,8 @@ do_rest:
736 /* start_ip had better be page-aligned! */ 768 /* start_ip had better be page-aligned! */
737 start_ip = setup_trampoline(); 769 start_ip = setup_trampoline();
738 770
739 /* So we see what's up */ 771 /* So we see what's up */
740 printk(KERN_INFO "Booting processor %d APIC 0x%x ip 0x%lx\n", 772 announce_cpu(cpu, apicid);
741 cpu, apicid, start_ip);
742 773
743 /* 774 /*
744 * This grunge runs the startup process for 775 * This grunge runs the startup process for
@@ -787,21 +818,17 @@ do_rest:
787 udelay(100); 818 udelay(100);
788 } 819 }
789 820
790 if (cpumask_test_cpu(cpu, cpu_callin_mask)) { 821 if (cpumask_test_cpu(cpu, cpu_callin_mask))
791 /* number CPUs logically, starting from 1 (BSP is 0) */ 822 pr_debug("CPU%d: has booted.\n", cpu);
792 pr_debug("OK.\n"); 823 else {
793 printk(KERN_INFO "CPU%d: ", cpu);
794 print_cpu_info(&cpu_data(cpu));
795 pr_debug("CPU has booted.\n");
796 } else {
797 boot_error = 1; 824 boot_error = 1;
798 if (*((volatile unsigned char *)trampoline_base) 825 if (*((volatile unsigned char *)trampoline_base)
799 == 0xA5) 826 == 0xA5)
800 /* trampoline started but...? */ 827 /* trampoline started but...? */
801 printk(KERN_ERR "Stuck ??\n"); 828 pr_err("CPU%d: Stuck ??\n", cpu);
802 else 829 else
803 /* trampoline code not run */ 830 /* trampoline code not run */
804 printk(KERN_ERR "Not responding.\n"); 831 pr_err("CPU%d: Not responding.\n", cpu);
805 if (apic->inquire_remote_apic) 832 if (apic->inquire_remote_apic)
806 apic->inquire_remote_apic(apicid); 833 apic->inquire_remote_apic(apicid);
807 } 834 }
@@ -831,6 +858,7 @@ do_rest:
831 smpboot_restore_warm_reset_vector(); 858 smpboot_restore_warm_reset_vector();
832 } 859 }
833 860
861 destroy_work_on_stack(&c_idle.work);
834 return boot_error; 862 return boot_error;
835} 863}
836 864
@@ -1066,9 +1094,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1066 set_cpu_sibling_map(0); 1094 set_cpu_sibling_map(0);
1067 1095
1068 enable_IR_x2apic(); 1096 enable_IR_x2apic();
1069#ifdef CONFIG_X86_64
1070 default_setup_apic_routing(); 1097 default_setup_apic_routing();
1071#endif
1072 1098
1073 if (smp_sanity_check(max_cpus) < 0) { 1099 if (smp_sanity_check(max_cpus) < 0) {
1074 printk(KERN_INFO "SMP disabled\n"); 1100 printk(KERN_INFO "SMP disabled\n");
@@ -1196,11 +1222,12 @@ __init void prefill_possible_map(void)
1196 1222
1197 total_cpus = max_t(int, possible, num_processors + disabled_cpus); 1223 total_cpus = max_t(int, possible, num_processors + disabled_cpus);
1198 1224
1199 if (possible > CONFIG_NR_CPUS) { 1225 /* nr_cpu_ids could be reduced via nr_cpus= */
1226 if (possible > nr_cpu_ids) {
1200 printk(KERN_WARNING 1227 printk(KERN_WARNING
1201 "%d Processors exceeds NR_CPUS limit of %d\n", 1228 "%d Processors exceeds NR_CPUS limit of %d\n",
1202 possible, CONFIG_NR_CPUS); 1229 possible, nr_cpu_ids);
1203 possible = CONFIG_NR_CPUS; 1230 possible = nr_cpu_ids;
1204 } 1231 }
1205 1232
1206 printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n", 1233 printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n",
@@ -1250,16 +1277,7 @@ static void __ref remove_cpu_from_maps(int cpu)
1250void cpu_disable_common(void) 1277void cpu_disable_common(void)
1251{ 1278{
1252 int cpu = smp_processor_id(); 1279 int cpu = smp_processor_id();
1253 /*
1254 * HACK:
1255 * Allow any queued timer interrupts to get serviced
1256 * This is only a temporary solution until we cleanup
1257 * fixup_irqs as we do for IA64.
1258 */
1259 local_irq_enable();
1260 mdelay(1);
1261 1280
1262 local_irq_disable();
1263 remove_siblinginfo(cpu); 1281 remove_siblinginfo(cpu);
1264 1282
1265 /* It's now safe to remove this processor from the online map */ 1283 /* It's now safe to remove this processor from the online map */
@@ -1300,14 +1318,16 @@ void native_cpu_die(unsigned int cpu)
1300 for (i = 0; i < 10; i++) { 1318 for (i = 0; i < 10; i++) {
1301 /* They ack this in play_dead by setting CPU_DEAD */ 1319 /* They ack this in play_dead by setting CPU_DEAD */
1302 if (per_cpu(cpu_state, cpu) == CPU_DEAD) { 1320 if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
1303 printk(KERN_INFO "CPU %d is now offline\n", cpu); 1321 if (system_state == SYSTEM_RUNNING)
1322 pr_info("CPU %u is now offline\n", cpu);
1323
1304 if (1 == num_online_cpus()) 1324 if (1 == num_online_cpus())
1305 alternatives_smp_switch(0); 1325 alternatives_smp_switch(0);
1306 return; 1326 return;
1307 } 1327 }
1308 msleep(100); 1328 msleep(100);
1309 } 1329 }
1310 printk(KERN_ERR "CPU %u didn't die...\n", cpu); 1330 pr_err("CPU %u didn't die...\n", cpu);
1311} 1331}
1312 1332
1313void play_dead_common(void) 1333void play_dead_common(void)
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index c3eb207181fe..922eefbb3f6c 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -53,17 +53,19 @@ save_stack_address_nosched(void *data, unsigned long addr, int reliable)
53} 53}
54 54
55static const struct stacktrace_ops save_stack_ops = { 55static const struct stacktrace_ops save_stack_ops = {
56 .warning = save_stack_warning, 56 .warning = save_stack_warning,
57 .warning_symbol = save_stack_warning_symbol, 57 .warning_symbol = save_stack_warning_symbol,
58 .stack = save_stack_stack, 58 .stack = save_stack_stack,
59 .address = save_stack_address, 59 .address = save_stack_address,
60 .walk_stack = print_context_stack,
60}; 61};
61 62
62static const struct stacktrace_ops save_stack_ops_nosched = { 63static const struct stacktrace_ops save_stack_ops_nosched = {
63 .warning = save_stack_warning, 64 .warning = save_stack_warning,
64 .warning_symbol = save_stack_warning_symbol, 65 .warning_symbol = save_stack_warning_symbol,
65 .stack = save_stack_stack, 66 .stack = save_stack_stack,
66 .address = save_stack_address_nosched, 67 .address = save_stack_address_nosched,
68 .walk_stack = print_context_stack,
67}; 69};
68 70
69/* 71/*
diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c
index 1884a8d12bfa..196552bb412c 100644
--- a/arch/x86/kernel/sys_i386_32.c
+++ b/arch/x86/kernel/sys_i386_32.c
@@ -24,216 +24,6 @@
24 24
25#include <asm/syscalls.h> 25#include <asm/syscalls.h>
26 26
27asmlinkage long sys_mmap2(unsigned long addr, unsigned long len,
28 unsigned long prot, unsigned long flags,
29 unsigned long fd, unsigned long pgoff)
30{
31 int error = -EBADF;
32 struct file *file = NULL;
33 struct mm_struct *mm = current->mm;
34
35 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
36 if (!(flags & MAP_ANONYMOUS)) {
37 file = fget(fd);
38 if (!file)
39 goto out;
40 }
41
42 down_write(&mm->mmap_sem);
43 error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
44 up_write(&mm->mmap_sem);
45
46 if (file)
47 fput(file);
48out:
49 return error;
50}
51
52/*
53 * Perform the select(nd, in, out, ex, tv) and mmap() system
54 * calls. Linux/i386 didn't use to be able to handle more than
55 * 4 system call parameters, so these system calls used a memory
56 * block for parameter passing..
57 */
58
59struct mmap_arg_struct {
60 unsigned long addr;
61 unsigned long len;
62 unsigned long prot;
63 unsigned long flags;
64 unsigned long fd;
65 unsigned long offset;
66};
67
68asmlinkage int old_mmap(struct mmap_arg_struct __user *arg)
69{
70 struct mmap_arg_struct a;
71 int err = -EFAULT;
72
73 if (copy_from_user(&a, arg, sizeof(a)))
74 goto out;
75
76 err = -EINVAL;
77 if (a.offset & ~PAGE_MASK)
78 goto out;
79
80 err = sys_mmap2(a.addr, a.len, a.prot, a.flags,
81 a.fd, a.offset >> PAGE_SHIFT);
82out:
83 return err;
84}
85
86
87struct sel_arg_struct {
88 unsigned long n;
89 fd_set __user *inp, *outp, *exp;
90 struct timeval __user *tvp;
91};
92
93asmlinkage int old_select(struct sel_arg_struct __user *arg)
94{
95 struct sel_arg_struct a;
96
97 if (copy_from_user(&a, arg, sizeof(a)))
98 return -EFAULT;
99 /* sys_select() does the appropriate kernel locking */
100 return sys_select(a.n, a.inp, a.outp, a.exp, a.tvp);
101}
102
103/*
104 * sys_ipc() is the de-multiplexer for the SysV IPC calls..
105 *
106 * This is really horribly ugly.
107 */
108asmlinkage int sys_ipc(uint call, int first, int second,
109 int third, void __user *ptr, long fifth)
110{
111 int version, ret;
112
113 version = call >> 16; /* hack for backward compatibility */
114 call &= 0xffff;
115
116 switch (call) {
117 case SEMOP:
118 return sys_semtimedop(first, (struct sembuf __user *)ptr, second, NULL);
119 case SEMTIMEDOP:
120 return sys_semtimedop(first, (struct sembuf __user *)ptr, second,
121 (const struct timespec __user *)fifth);
122
123 case SEMGET:
124 return sys_semget(first, second, third);
125 case SEMCTL: {
126 union semun fourth;
127 if (!ptr)
128 return -EINVAL;
129 if (get_user(fourth.__pad, (void __user * __user *) ptr))
130 return -EFAULT;
131 return sys_semctl(first, second, third, fourth);
132 }
133
134 case MSGSND:
135 return sys_msgsnd(first, (struct msgbuf __user *) ptr,
136 second, third);
137 case MSGRCV:
138 switch (version) {
139 case 0: {
140 struct ipc_kludge tmp;
141 if (!ptr)
142 return -EINVAL;
143
144 if (copy_from_user(&tmp,
145 (struct ipc_kludge __user *) ptr,
146 sizeof(tmp)))
147 return -EFAULT;
148 return sys_msgrcv(first, tmp.msgp, second,
149 tmp.msgtyp, third);
150 }
151 default:
152 return sys_msgrcv(first,
153 (struct msgbuf __user *) ptr,
154 second, fifth, third);
155 }
156 case MSGGET:
157 return sys_msgget((key_t) first, second);
158 case MSGCTL:
159 return sys_msgctl(first, second, (struct msqid_ds __user *) ptr);
160
161 case SHMAT:
162 switch (version) {
163 default: {
164 ulong raddr;
165 ret = do_shmat(first, (char __user *) ptr, second, &raddr);
166 if (ret)
167 return ret;
168 return put_user(raddr, (ulong __user *) third);
169 }
170 case 1: /* iBCS2 emulator entry point */
171 if (!segment_eq(get_fs(), get_ds()))
172 return -EINVAL;
173 /* The "(ulong *) third" is valid _only_ because of the kernel segment thing */
174 return do_shmat(first, (char __user *) ptr, second, (ulong *) third);
175 }
176 case SHMDT:
177 return sys_shmdt((char __user *)ptr);
178 case SHMGET:
179 return sys_shmget(first, second, third);
180 case SHMCTL:
181 return sys_shmctl(first, second,
182 (struct shmid_ds __user *) ptr);
183 default:
184 return -ENOSYS;
185 }
186}
187
188/*
189 * Old cruft
190 */
191asmlinkage int sys_uname(struct old_utsname __user *name)
192{
193 int err;
194 if (!name)
195 return -EFAULT;
196 down_read(&uts_sem);
197 err = copy_to_user(name, utsname(), sizeof(*name));
198 up_read(&uts_sem);
199 return err? -EFAULT:0;
200}
201
202asmlinkage int sys_olduname(struct oldold_utsname __user *name)
203{
204 int error;
205
206 if (!name)
207 return -EFAULT;
208 if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname)))
209 return -EFAULT;
210
211 down_read(&uts_sem);
212
213 error = __copy_to_user(&name->sysname, &utsname()->sysname,
214 __OLD_UTS_LEN);
215 error |= __put_user(0, name->sysname + __OLD_UTS_LEN);
216 error |= __copy_to_user(&name->nodename, &utsname()->nodename,
217 __OLD_UTS_LEN);
218 error |= __put_user(0, name->nodename + __OLD_UTS_LEN);
219 error |= __copy_to_user(&name->release, &utsname()->release,
220 __OLD_UTS_LEN);
221 error |= __put_user(0, name->release + __OLD_UTS_LEN);
222 error |= __copy_to_user(&name->version, &utsname()->version,
223 __OLD_UTS_LEN);
224 error |= __put_user(0, name->version + __OLD_UTS_LEN);
225 error |= __copy_to_user(&name->machine, &utsname()->machine,
226 __OLD_UTS_LEN);
227 error |= __put_user(0, name->machine + __OLD_UTS_LEN);
228
229 up_read(&uts_sem);
230
231 error = error ? -EFAULT : 0;
232
233 return error;
234}
235
236
237/* 27/*
238 * Do a system call from kernel instead of calling sys_execve so we 28 * Do a system call from kernel instead of calling sys_execve so we
239 * end up with proper pt_regs. 29 * end up with proper pt_regs.
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 45e00eb09c3a..ff14a5044ce6 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -23,26 +23,11 @@ SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len,
23 unsigned long, fd, unsigned long, off) 23 unsigned long, fd, unsigned long, off)
24{ 24{
25 long error; 25 long error;
26 struct file *file;
27
28 error = -EINVAL; 26 error = -EINVAL;
29 if (off & ~PAGE_MASK) 27 if (off & ~PAGE_MASK)
30 goto out; 28 goto out;
31 29
32 error = -EBADF; 30 error = sys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT);
33 file = NULL;
34 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
35 if (!(flags & MAP_ANONYMOUS)) {
36 file = fget(fd);
37 if (!file)
38 goto out;
39 }
40 down_write(&current->mm->mmap_sem);
41 error = do_mmap_pgoff(file, addr, len, prot, flags, off >> PAGE_SHIFT);
42 up_write(&current->mm->mmap_sem);
43
44 if (file)
45 fput(file);
46out: 31out:
47 return error; 32 return error;
48} 33}
@@ -224,15 +209,3 @@ bottomup:
224 209
225 return addr; 210 return addr;
226} 211}
227
228
229SYSCALL_DEFINE1(uname, struct new_utsname __user *, name)
230{
231 int err;
232 down_read(&uts_sem);
233 err = copy_to_user(name, utsname(), sizeof(*name));
234 up_read(&uts_sem);
235 if (personality(current->personality) == PER_LINUX32)
236 err |= copy_to_user(&name->machine, "i686", 5);
237 return err ? -EFAULT : 0;
238}
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index 17fcb3abe236..5da9a68546b7 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -81,7 +81,7 @@ ENTRY(sys_call_table)
81 .long sys_settimeofday 81 .long sys_settimeofday
82 .long sys_getgroups16 /* 80 */ 82 .long sys_getgroups16 /* 80 */
83 .long sys_setgroups16 83 .long sys_setgroups16
84 .long old_select 84 .long sys_old_select
85 .long sys_symlink 85 .long sys_symlink
86 .long sys_lstat 86 .long sys_lstat
87 .long sys_readlink /* 85 */ 87 .long sys_readlink /* 85 */
@@ -89,7 +89,7 @@ ENTRY(sys_call_table)
89 .long sys_swapon 89 .long sys_swapon
90 .long sys_reboot 90 .long sys_reboot
91 .long sys_old_readdir 91 .long sys_old_readdir
92 .long old_mmap /* 90 */ 92 .long sys_old_mmap /* 90 */
93 .long sys_munmap 93 .long sys_munmap
94 .long sys_truncate 94 .long sys_truncate
95 .long sys_ftruncate 95 .long sys_ftruncate
@@ -191,7 +191,7 @@ ENTRY(sys_call_table)
191 .long sys_ni_syscall /* reserved for streams2 */ 191 .long sys_ni_syscall /* reserved for streams2 */
192 .long ptregs_vfork /* 190 */ 192 .long ptregs_vfork /* 190 */
193 .long sys_getrlimit 193 .long sys_getrlimit
194 .long sys_mmap2 194 .long sys_mmap_pgoff
195 .long sys_truncate64 195 .long sys_truncate64
196 .long sys_ftruncate64 196 .long sys_ftruncate64
197 .long sys_stat64 /* 195 */ 197 .long sys_stat64 /* 195 */
@@ -336,7 +336,8 @@ ENTRY(sys_call_table)
336 .long sys_pwritev 336 .long sys_pwritev
337 .long sys_rt_tgsigqueueinfo /* 335 */ 337 .long sys_rt_tgsigqueueinfo /* 335 */
338 .long sys_perf_event_open 338 .long sys_perf_event_open
339 .long sys_set_rt_task_param /* LITMUS^RT 337 */ 339 .long sys_recvmmsg
340 .long sys_set_rt_task_param /* LITMUS^RT 338 */
340 .long sys_get_rt_task_param 341 .long sys_get_rt_task_param
341 .long sys_complete_job 342 .long sys_complete_job
342 .long sys_od_open 343 .long sys_od_open
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index be2573448ed9..fb5cc5e14cfa 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -70,11 +70,11 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id)
70 * manually to deassert NMI lines for the watchdog if run 70 * manually to deassert NMI lines for the watchdog if run
71 * on an 82489DX-based system. 71 * on an 82489DX-based system.
72 */ 72 */
73 spin_lock(&i8259A_lock); 73 raw_spin_lock(&i8259A_lock);
74 outb(0x0c, PIC_MASTER_OCW3); 74 outb(0x0c, PIC_MASTER_OCW3);
75 /* Ack the IRQ; AEOI will end it automatically. */ 75 /* Ack the IRQ; AEOI will end it automatically. */
76 inb(PIC_MASTER_POLL); 76 inb(PIC_MASTER_POLL);
77 spin_unlock(&i8259A_lock); 77 raw_spin_unlock(&i8259A_lock);
78 } 78 }
79 79
80 global_clock_event->event_handler(global_clock_event); 80 global_clock_event->event_handler(global_clock_event);
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index 1740c85e24bb..17b03dd3a6b5 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -9,6 +9,7 @@
9#include <linux/seq_file.h> 9#include <linux/seq_file.h>
10#include <linux/proc_fs.h> 10#include <linux/proc_fs.h>
11#include <linux/kernel.h> 11#include <linux/kernel.h>
12#include <linux/slab.h>
12 13
13#include <asm/mmu_context.h> 14#include <asm/mmu_context.h>
14#include <asm/uv/uv.h> 15#include <asm/uv/uv.h>
@@ -817,10 +818,8 @@ static int __init uv_init_blade(int blade)
817 */ 818 */
818 apicid = blade_to_first_apicid(blade); 819 apicid = blade_to_first_apicid(blade);
819 pa = uv_read_global_mmr64(pnode, UVH_BAU_DATA_CONFIG); 820 pa = uv_read_global_mmr64(pnode, UVH_BAU_DATA_CONFIG);
820 if ((pa & 0xff) != UV_BAU_MESSAGE) { 821 uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG,
821 uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG,
822 ((apicid << 32) | UV_BAU_MESSAGE)); 822 ((apicid << 32) | UV_BAU_MESSAGE));
823 }
824 return 0; 823 return 0;
825} 824}
826 825
diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c
index cd022121cab6..c652ef62742d 100644
--- a/arch/x86/kernel/trampoline.c
+++ b/arch/x86/kernel/trampoline.c
@@ -12,21 +12,19 @@
12#endif 12#endif
13 13
14/* ready for x86_64 and x86 */ 14/* ready for x86_64 and x86 */
15unsigned char *__trampinitdata trampoline_base = __va(TRAMPOLINE_BASE); 15unsigned char *__trampinitdata trampoline_base;
16 16
17void __init reserve_trampoline_memory(void) 17void __init reserve_trampoline_memory(void)
18{ 18{
19#ifdef CONFIG_X86_32 19 unsigned long mem;
20 /* 20
21 * But first pinch a few for the stack/trampoline stuff
22 * FIXME: Don't need the extra page at 4K, but need to fix
23 * trampoline before removing it. (see the GDT stuff)
24 */
25 reserve_early(PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE");
26#endif
27 /* Has to be in very low memory so we can execute real-mode AP code. */ 21 /* Has to be in very low memory so we can execute real-mode AP code. */
28 reserve_early(TRAMPOLINE_BASE, TRAMPOLINE_BASE + TRAMPOLINE_SIZE, 22 mem = find_e820_area(0, 1<<20, TRAMPOLINE_SIZE, PAGE_SIZE);
29 "TRAMPOLINE"); 23 if (mem == -1L)
24 panic("Cannot allocate trampoline\n");
25
26 trampoline_base = __va(mem);
27 reserve_early(mem, mem + TRAMPOLINE_SIZE, "TRAMPOLINE");
30} 28}
31 29
32/* 30/*
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 7e37dcee0cc3..1168e4454188 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -529,77 +529,59 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
529dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) 529dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
530{ 530{
531 struct task_struct *tsk = current; 531 struct task_struct *tsk = current;
532 unsigned long condition; 532 unsigned long dr6;
533 int si_code; 533 int si_code;
534 534
535 get_debugreg(condition, 6); 535 get_debugreg(dr6, 6);
536
537 /* Filter out all the reserved bits which are preset to 1 */
538 dr6 &= ~DR6_RESERVED;
536 539
537 /* Catch kmemcheck conditions first of all! */ 540 /* Catch kmemcheck conditions first of all! */
538 if (condition & DR_STEP && kmemcheck_trap(regs)) 541 if ((dr6 & DR_STEP) && kmemcheck_trap(regs))
539 return; 542 return;
540 543
544 /* DR6 may or may not be cleared by the CPU */
545 set_debugreg(0, 6);
541 /* 546 /*
542 * The processor cleared BTF, so don't mark that we need it set. 547 * The processor cleared BTF, so don't mark that we need it set.
543 */ 548 */
544 clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR); 549 clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR);
545 tsk->thread.debugctlmsr = 0; 550 tsk->thread.debugctlmsr = 0;
546 551
547 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, 552 /* Store the virtualized DR6 value */
548 SIGTRAP) == NOTIFY_STOP) 553 tsk->thread.debugreg6 = dr6;
554
555 if (notify_die(DIE_DEBUG, "debug", regs, PTR_ERR(&dr6), error_code,
556 SIGTRAP) == NOTIFY_STOP)
549 return; 557 return;
550 558
551 /* It's safe to allow irq's after DR6 has been saved */ 559 /* It's safe to allow irq's after DR6 has been saved */
552 preempt_conditional_sti(regs); 560 preempt_conditional_sti(regs);
553 561
554 /* Mask out spurious debug traps due to lazy DR7 setting */ 562 if (regs->flags & X86_VM_MASK) {
555 if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { 563 handle_vm86_trap((struct kernel_vm86_regs *) regs,
556 if (!tsk->thread.debugreg7) 564 error_code, 1);
557 goto clear_dr7; 565 return;
558 } 566 }
559 567
560#ifdef CONFIG_X86_32
561 if (regs->flags & X86_VM_MASK)
562 goto debug_vm86;
563#endif
564
565 /* Save debug status register where ptrace can see it */
566 tsk->thread.debugreg6 = condition;
567
568 /* 568 /*
569 * Single-stepping through TF: make sure we ignore any events in 569 * Single-stepping through system calls: ignore any exceptions in
570 * kernel space (but re-enable TF when returning to user mode). 570 * kernel space, but re-enable TF when returning to user mode.
571 *
572 * We already checked v86 mode above, so we can check for kernel mode
573 * by just checking the CPL of CS.
571 */ 574 */
572 if (condition & DR_STEP) { 575 if ((dr6 & DR_STEP) && !user_mode(regs)) {
573 if (!user_mode(regs)) 576 tsk->thread.debugreg6 &= ~DR_STEP;
574 goto clear_TF_reenable; 577 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
578 regs->flags &= ~X86_EFLAGS_TF;
575 } 579 }
576 580 si_code = get_si_code(tsk->thread.debugreg6);
577 si_code = get_si_code(condition); 581 if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS))
578 /* Ok, finally something we can handle */ 582 send_sigtrap(tsk, regs, error_code, si_code);
579 send_sigtrap(tsk, regs, error_code, si_code);
580
581 /*
582 * Disable additional traps. They'll be re-enabled when
583 * the signal is delivered.
584 */
585clear_dr7:
586 set_debugreg(0, 7);
587 preempt_conditional_cli(regs); 583 preempt_conditional_cli(regs);
588 return;
589
590#ifdef CONFIG_X86_32
591debug_vm86:
592 /* reenable preemption: handle_vm86_trap() might sleep */
593 dec_preempt_count();
594 handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1);
595 conditional_cli(regs);
596 return;
597#endif
598 584
599clear_TF_reenable:
600 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
601 regs->flags &= ~X86_EFLAGS_TF;
602 preempt_conditional_cli(regs);
603 return; 585 return;
604} 586}
605 587
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index cd982f48e23e..9faf91ae1841 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -50,7 +50,7 @@ u64 native_sched_clock(void)
50 * unstable. We do this because unlike Time Of Day, 50 * unstable. We do this because unlike Time Of Day,
51 * the scheduler clock tolerates small errors and it's 51 * the scheduler clock tolerates small errors and it's
52 * very important for it to be as fast as the platform 52 * very important for it to be as fast as the platform
53 * can achive it. ) 53 * can achieve it. )
54 */ 54 */
55 if (unlikely(tsc_disabled)) { 55 if (unlikely(tsc_disabled)) {
56 /* No locking but a rare wrong value is not a big deal: */ 56 /* No locking but a rare wrong value is not a big deal: */
@@ -740,7 +740,7 @@ static cycle_t __vsyscall_fn vread_tsc(void)
740} 740}
741#endif 741#endif
742 742
743static void resume_tsc(void) 743static void resume_tsc(struct clocksource *cs)
744{ 744{
745 clocksource_tsc.cycle_last = 0; 745 clocksource_tsc.cycle_last = 0;
746} 746}
@@ -763,6 +763,7 @@ void mark_tsc_unstable(char *reason)
763{ 763{
764 if (!tsc_unstable) { 764 if (!tsc_unstable) {
765 tsc_unstable = 1; 765 tsc_unstable = 1;
766 sched_clock_stable = 0;
766 printk(KERN_INFO "Marking TSC unstable due to %s\n", reason); 767 printk(KERN_INFO "Marking TSC unstable due to %s\n", reason);
767 /* Change only the rating, when not registered */ 768 /* Change only the rating, when not registered */
768 if (clocksource_tsc.mult) 769 if (clocksource_tsc.mult)
@@ -805,7 +806,7 @@ static void __init check_system_tsc_reliable(void)
805 unsigned long res_low, res_high; 806 unsigned long res_low, res_high;
806 807
807 rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high); 808 rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high);
808 /* Geode_LX - the OLPC CPU has a possibly a very reliable TSC */ 809 /* Geode_LX - the OLPC CPU has a very reliable TSC */
809 if (res_low & RTSC_SUSP) 810 if (res_low & RTSC_SUSP)
810 tsc_clocksource_reliable = 1; 811 tsc_clocksource_reliable = 1;
811#endif 812#endif
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index f37930954d15..0aa5fed8b9e6 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -33,7 +33,7 @@ static __cpuinitdata atomic_t stop_count;
33 * we want to have the fastest, inlined, non-debug version 33 * we want to have the fastest, inlined, non-debug version
34 * of a critical section, to be able to prove TSC time-warps: 34 * of a critical section, to be able to prove TSC time-warps:
35 */ 35 */
36static __cpuinitdata raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED; 36static __cpuinitdata arch_spinlock_t sync_lock = __ARCH_SPIN_LOCK_UNLOCKED;
37 37
38static __cpuinitdata cycles_t last_tsc; 38static __cpuinitdata cycles_t last_tsc;
39static __cpuinitdata cycles_t max_warp; 39static __cpuinitdata cycles_t max_warp;
@@ -62,13 +62,13 @@ static __cpuinit void check_tsc_warp(void)
62 * previous TSC that was measured (possibly on 62 * previous TSC that was measured (possibly on
63 * another CPU) and update the previous TSC timestamp. 63 * another CPU) and update the previous TSC timestamp.
64 */ 64 */
65 __raw_spin_lock(&sync_lock); 65 arch_spin_lock(&sync_lock);
66 prev = last_tsc; 66 prev = last_tsc;
67 rdtsc_barrier(); 67 rdtsc_barrier();
68 now = get_cycles(); 68 now = get_cycles();
69 rdtsc_barrier(); 69 rdtsc_barrier();
70 last_tsc = now; 70 last_tsc = now;
71 __raw_spin_unlock(&sync_lock); 71 arch_spin_unlock(&sync_lock);
72 72
73 /* 73 /*
74 * Be nice every now and then (and also check whether 74 * Be nice every now and then (and also check whether
@@ -87,10 +87,10 @@ static __cpuinit void check_tsc_warp(void)
87 * we saw a time-warp of the TSC going backwards: 87 * we saw a time-warp of the TSC going backwards:
88 */ 88 */
89 if (unlikely(prev > now)) { 89 if (unlikely(prev > now)) {
90 __raw_spin_lock(&sync_lock); 90 arch_spin_lock(&sync_lock);
91 max_warp = max(max_warp, prev - now); 91 max_warp = max(max_warp, prev - now);
92 nr_warps++; 92 nr_warps++;
93 __raw_spin_unlock(&sync_lock); 93 arch_spin_unlock(&sync_lock);
94 } 94 }
95 } 95 }
96 WARN(!(now-start), 96 WARN(!(now-start),
@@ -114,13 +114,12 @@ void __cpuinit check_tsc_sync_source(int cpu)
114 return; 114 return;
115 115
116 if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) { 116 if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) {
117 printk_once(KERN_INFO "Skipping synchronization checks as TSC is reliable.\n"); 117 if (cpu == (nr_cpu_ids-1) || system_state != SYSTEM_BOOTING)
118 pr_info(
119 "Skipped synchronization checks as TSC is reliable.\n");
118 return; 120 return;
119 } 121 }
120 122
121 pr_info("checking TSC synchronization [CPU#%d -> CPU#%d]:",
122 smp_processor_id(), cpu);
123
124 /* 123 /*
125 * Reset it - in case this is a second bootup: 124 * Reset it - in case this is a second bootup:
126 */ 125 */
@@ -142,12 +141,14 @@ void __cpuinit check_tsc_sync_source(int cpu)
142 cpu_relax(); 141 cpu_relax();
143 142
144 if (nr_warps) { 143 if (nr_warps) {
145 printk("\n"); 144 pr_warning("TSC synchronization [CPU#%d -> CPU#%d]:\n",
145 smp_processor_id(), cpu);
146 pr_warning("Measured %Ld cycles TSC warp between CPUs, " 146 pr_warning("Measured %Ld cycles TSC warp between CPUs, "
147 "turning off TSC clock.\n", max_warp); 147 "turning off TSC clock.\n", max_warp);
148 mark_tsc_unstable("check_tsc_sync_source failed"); 148 mark_tsc_unstable("check_tsc_sync_source failed");
149 } else { 149 } else {
150 printk(" passed.\n"); 150 pr_debug("TSC synchronization [CPU#%d -> CPU#%d]: passed\n",
151 smp_processor_id(), cpu);
151 } 152 }
152 153
153 /* 154 /*
diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/kernel/uv_irq.c
index aeef529917e4..1d40336b030a 100644
--- a/arch/x86/kernel/uv_irq.c
+++ b/arch/x86/kernel/uv_irq.c
@@ -9,10 +9,26 @@
9 */ 9 */
10 10
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/rbtree.h>
13#include <linux/slab.h>
12#include <linux/irq.h> 14#include <linux/irq.h>
13 15
14#include <asm/apic.h> 16#include <asm/apic.h>
15#include <asm/uv/uv_irq.h> 17#include <asm/uv/uv_irq.h>
18#include <asm/uv/uv_hub.h>
19
20/* MMR offset and pnode of hub sourcing interrupts for a given irq */
21struct uv_irq_2_mmr_pnode{
22 struct rb_node list;
23 unsigned long offset;
24 int pnode;
25 int irq;
26};
27
28static spinlock_t uv_irq_lock;
29static struct rb_root uv_irq_root;
30
31static int uv_set_irq_affinity(unsigned int, const struct cpumask *);
16 32
17static void uv_noop(unsigned int irq) 33static void uv_noop(unsigned int irq)
18{ 34{
@@ -39,25 +55,213 @@ struct irq_chip uv_irq_chip = {
39 .unmask = uv_noop, 55 .unmask = uv_noop,
40 .eoi = uv_ack_apic, 56 .eoi = uv_ack_apic,
41 .end = uv_noop, 57 .end = uv_noop,
58 .set_affinity = uv_set_irq_affinity,
42}; 59};
43 60
44/* 61/*
62 * Add offset and pnode information of the hub sourcing interrupts to the
63 * rb tree for a specific irq.
64 */
65static int uv_set_irq_2_mmr_info(int irq, unsigned long offset, unsigned blade)
66{
67 struct rb_node **link = &uv_irq_root.rb_node;
68 struct rb_node *parent = NULL;
69 struct uv_irq_2_mmr_pnode *n;
70 struct uv_irq_2_mmr_pnode *e;
71 unsigned long irqflags;
72
73 n = kmalloc_node(sizeof(struct uv_irq_2_mmr_pnode), GFP_KERNEL,
74 uv_blade_to_memory_nid(blade));
75 if (!n)
76 return -ENOMEM;
77
78 n->irq = irq;
79 n->offset = offset;
80 n->pnode = uv_blade_to_pnode(blade);
81 spin_lock_irqsave(&uv_irq_lock, irqflags);
82 /* Find the right place in the rbtree: */
83 while (*link) {
84 parent = *link;
85 e = rb_entry(parent, struct uv_irq_2_mmr_pnode, list);
86
87 if (unlikely(irq == e->irq)) {
88 /* irq entry exists */
89 e->pnode = uv_blade_to_pnode(blade);
90 e->offset = offset;
91 spin_unlock_irqrestore(&uv_irq_lock, irqflags);
92 kfree(n);
93 return 0;
94 }
95
96 if (irq < e->irq)
97 link = &(*link)->rb_left;
98 else
99 link = &(*link)->rb_right;
100 }
101
102 /* Insert the node into the rbtree. */
103 rb_link_node(&n->list, parent, link);
104 rb_insert_color(&n->list, &uv_irq_root);
105
106 spin_unlock_irqrestore(&uv_irq_lock, irqflags);
107 return 0;
108}
109
110/* Retrieve offset and pnode information from the rb tree for a specific irq */
111int uv_irq_2_mmr_info(int irq, unsigned long *offset, int *pnode)
112{
113 struct uv_irq_2_mmr_pnode *e;
114 struct rb_node *n;
115 unsigned long irqflags;
116
117 spin_lock_irqsave(&uv_irq_lock, irqflags);
118 n = uv_irq_root.rb_node;
119 while (n) {
120 e = rb_entry(n, struct uv_irq_2_mmr_pnode, list);
121
122 if (e->irq == irq) {
123 *offset = e->offset;
124 *pnode = e->pnode;
125 spin_unlock_irqrestore(&uv_irq_lock, irqflags);
126 return 0;
127 }
128
129 if (irq < e->irq)
130 n = n->rb_left;
131 else
132 n = n->rb_right;
133 }
134 spin_unlock_irqrestore(&uv_irq_lock, irqflags);
135 return -1;
136}
137
138/*
139 * Re-target the irq to the specified CPU and enable the specified MMR located
140 * on the specified blade to allow the sending of MSIs to the specified CPU.
141 */
142static int
143arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
144 unsigned long mmr_offset, int restrict)
145{
146 const struct cpumask *eligible_cpu = cpumask_of(cpu);
147 struct irq_desc *desc = irq_to_desc(irq);
148 struct irq_cfg *cfg;
149 int mmr_pnode;
150 unsigned long mmr_value;
151 struct uv_IO_APIC_route_entry *entry;
152 int err;
153
154 BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) !=
155 sizeof(unsigned long));
156
157 cfg = irq_cfg(irq);
158
159 err = assign_irq_vector(irq, cfg, eligible_cpu);
160 if (err != 0)
161 return err;
162
163 if (restrict == UV_AFFINITY_CPU)
164 desc->status |= IRQ_NO_BALANCING;
165 else
166 desc->status |= IRQ_MOVE_PCNTXT;
167
168 set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq,
169 irq_name);
170
171 mmr_value = 0;
172 entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
173 entry->vector = cfg->vector;
174 entry->delivery_mode = apic->irq_delivery_mode;
175 entry->dest_mode = apic->irq_dest_mode;
176 entry->polarity = 0;
177 entry->trigger = 0;
178 entry->mask = 0;
179 entry->dest = apic->cpu_mask_to_apicid(eligible_cpu);
180
181 mmr_pnode = uv_blade_to_pnode(mmr_blade);
182 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
183
184 if (cfg->move_in_progress)
185 send_cleanup_vector(cfg);
186
187 return irq;
188}
189
190/*
191 * Disable the specified MMR located on the specified blade so that MSIs are
192 * longer allowed to be sent.
193 */
194static void arch_disable_uv_irq(int mmr_pnode, unsigned long mmr_offset)
195{
196 unsigned long mmr_value;
197 struct uv_IO_APIC_route_entry *entry;
198
199 BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) !=
200 sizeof(unsigned long));
201
202 mmr_value = 0;
203 entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
204 entry->mask = 1;
205
206 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
207}
208
209static int uv_set_irq_affinity(unsigned int irq, const struct cpumask *mask)
210{
211 struct irq_desc *desc = irq_to_desc(irq);
212 struct irq_cfg *cfg = desc->chip_data;
213 unsigned int dest;
214 unsigned long mmr_value;
215 struct uv_IO_APIC_route_entry *entry;
216 unsigned long mmr_offset;
217 unsigned mmr_pnode;
218
219 if (set_desc_affinity(desc, mask, &dest))
220 return -1;
221
222 mmr_value = 0;
223 entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
224
225 entry->vector = cfg->vector;
226 entry->delivery_mode = apic->irq_delivery_mode;
227 entry->dest_mode = apic->irq_dest_mode;
228 entry->polarity = 0;
229 entry->trigger = 0;
230 entry->mask = 0;
231 entry->dest = dest;
232
233 /* Get previously stored MMR and pnode of hub sourcing interrupts */
234 if (uv_irq_2_mmr_info(irq, &mmr_offset, &mmr_pnode))
235 return -1;
236
237 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
238
239 if (cfg->move_in_progress)
240 send_cleanup_vector(cfg);
241
242 return 0;
243}
244
245/*
45 * Set up a mapping of an available irq and vector, and enable the specified 246 * Set up a mapping of an available irq and vector, and enable the specified
46 * MMR that defines the MSI that is to be sent to the specified CPU when an 247 * MMR that defines the MSI that is to be sent to the specified CPU when an
47 * interrupt is raised. 248 * interrupt is raised.
48 */ 249 */
49int uv_setup_irq(char *irq_name, int cpu, int mmr_blade, 250int uv_setup_irq(char *irq_name, int cpu, int mmr_blade,
50 unsigned long mmr_offset) 251 unsigned long mmr_offset, int restrict)
51{ 252{
52 int irq; 253 int irq, ret;
53 int ret; 254
255 irq = create_irq_nr(NR_IRQS_LEGACY, uv_blade_to_memory_nid(mmr_blade));
54 256
55 irq = create_irq();
56 if (irq <= 0) 257 if (irq <= 0)
57 return -EBUSY; 258 return -EBUSY;
58 259
59 ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset); 260 ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset,
60 if (ret != irq) 261 restrict);
262 if (ret == irq)
263 uv_set_irq_2_mmr_info(irq, mmr_offset, mmr_blade);
264 else
61 destroy_irq(irq); 265 destroy_irq(irq);
62 266
63 return ret; 267 return ret;
@@ -71,9 +275,28 @@ EXPORT_SYMBOL_GPL(uv_setup_irq);
71 * 275 *
72 * Set mmr_blade and mmr_offset to what was passed in on uv_setup_irq(). 276 * Set mmr_blade and mmr_offset to what was passed in on uv_setup_irq().
73 */ 277 */
74void uv_teardown_irq(unsigned int irq, int mmr_blade, unsigned long mmr_offset) 278void uv_teardown_irq(unsigned int irq)
75{ 279{
76 arch_disable_uv_irq(mmr_blade, mmr_offset); 280 struct uv_irq_2_mmr_pnode *e;
281 struct rb_node *n;
282 unsigned long irqflags;
283
284 spin_lock_irqsave(&uv_irq_lock, irqflags);
285 n = uv_irq_root.rb_node;
286 while (n) {
287 e = rb_entry(n, struct uv_irq_2_mmr_pnode, list);
288 if (e->irq == irq) {
289 arch_disable_uv_irq(e->pnode, e->offset);
290 rb_erase(n, &uv_irq_root);
291 kfree(e);
292 break;
293 }
294 if (irq < e->irq)
295 n = n->rb_left;
296 else
297 n = n->rb_right;
298 }
299 spin_unlock_irqrestore(&uv_irq_lock, irqflags);
77 destroy_irq(irq); 300 destroy_irq(irq);
78} 301}
79EXPORT_SYMBOL_GPL(uv_teardown_irq); 302EXPORT_SYMBOL_GPL(uv_teardown_irq);
diff --git a/arch/x86/kernel/uv_sysfs.c b/arch/x86/kernel/uv_sysfs.c
index 36afb98675a4..309c70fb7759 100644
--- a/arch/x86/kernel/uv_sysfs.c
+++ b/arch/x86/kernel/uv_sysfs.c
@@ -54,19 +54,19 @@ static int __init sgi_uv_sysfs_init(void)
54 if (!sgi_uv_kobj) 54 if (!sgi_uv_kobj)
55 sgi_uv_kobj = kobject_create_and_add("sgi_uv", firmware_kobj); 55 sgi_uv_kobj = kobject_create_and_add("sgi_uv", firmware_kobj);
56 if (!sgi_uv_kobj) { 56 if (!sgi_uv_kobj) {
57 printk(KERN_WARNING "kobject_create_and_add sgi_uv failed \n"); 57 printk(KERN_WARNING "kobject_create_and_add sgi_uv failed\n");
58 return -EINVAL; 58 return -EINVAL;
59 } 59 }
60 60
61 ret = sysfs_create_file(sgi_uv_kobj, &partition_id_attr.attr); 61 ret = sysfs_create_file(sgi_uv_kobj, &partition_id_attr.attr);
62 if (ret) { 62 if (ret) {
63 printk(KERN_WARNING "sysfs_create_file partition_id failed \n"); 63 printk(KERN_WARNING "sysfs_create_file partition_id failed\n");
64 return ret; 64 return ret;
65 } 65 }
66 66
67 ret = sysfs_create_file(sgi_uv_kobj, &coherence_id_attr.attr); 67 ret = sysfs_create_file(sgi_uv_kobj, &coherence_id_attr.attr);
68 if (ret) { 68 if (ret) {
69 printk(KERN_WARNING "sysfs_create_file coherence_id failed \n"); 69 printk(KERN_WARNING "sysfs_create_file coherence_id failed\n");
70 return ret; 70 return ret;
71 } 71 }
72 72
diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/kernel/uv_time.c
index 583f11d5c480..56e421bc379b 100644
--- a/arch/x86/kernel/uv_time.c
+++ b/arch/x86/kernel/uv_time.c
@@ -19,6 +19,7 @@
19 * Copyright (c) Dimitri Sivanich 19 * Copyright (c) Dimitri Sivanich
20 */ 20 */
21#include <linux/clockchips.h> 21#include <linux/clockchips.h>
22#include <linux/slab.h>
22 23
23#include <asm/uv/uv_mmrs.h> 24#include <asm/uv/uv_mmrs.h>
24#include <asm/uv/uv_hub.h> 25#include <asm/uv/uv_hub.h>
@@ -74,7 +75,7 @@ struct uv_rtc_timer_head {
74 */ 75 */
75static struct uv_rtc_timer_head **blade_info __read_mostly; 76static struct uv_rtc_timer_head **blade_info __read_mostly;
76 77
77static int uv_rtc_enable; 78static int uv_rtc_evt_enable;
78 79
79/* 80/*
80 * Hardware interface routines 81 * Hardware interface routines
@@ -90,7 +91,7 @@ static void uv_rtc_send_IPI(int cpu)
90 pnode = uv_apicid_to_pnode(apicid); 91 pnode = uv_apicid_to_pnode(apicid);
91 val = (1UL << UVH_IPI_INT_SEND_SHFT) | 92 val = (1UL << UVH_IPI_INT_SEND_SHFT) |
92 (apicid << UVH_IPI_INT_APIC_ID_SHFT) | 93 (apicid << UVH_IPI_INT_APIC_ID_SHFT) |
93 (GENERIC_INTERRUPT_VECTOR << UVH_IPI_INT_VECTOR_SHFT); 94 (X86_PLATFORM_IPI_VECTOR << UVH_IPI_INT_VECTOR_SHFT);
94 95
95 uv_write_global_mmr64(pnode, UVH_IPI_INT, val); 96 uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
96} 97}
@@ -115,7 +116,7 @@ static int uv_setup_intr(int cpu, u64 expires)
115 uv_write_global_mmr64(pnode, UVH_EVENT_OCCURRED0_ALIAS, 116 uv_write_global_mmr64(pnode, UVH_EVENT_OCCURRED0_ALIAS,
116 UVH_EVENT_OCCURRED0_RTC1_MASK); 117 UVH_EVENT_OCCURRED0_RTC1_MASK);
117 118
118 val = (GENERIC_INTERRUPT_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) | 119 val = (X86_PLATFORM_IPI_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) |
119 ((u64)cpu_physical_id(cpu) << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT); 120 ((u64)cpu_physical_id(cpu) << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT);
120 121
121 /* Set configuration */ 122 /* Set configuration */
@@ -123,7 +124,10 @@ static int uv_setup_intr(int cpu, u64 expires)
123 /* Initialize comparator value */ 124 /* Initialize comparator value */
124 uv_write_global_mmr64(pnode, UVH_INT_CMPB, expires); 125 uv_write_global_mmr64(pnode, UVH_INT_CMPB, expires);
125 126
126 return (expires < uv_read_rtc(NULL) && !uv_intr_pending(pnode)); 127 if (uv_read_rtc(NULL) <= expires)
128 return 0;
129
130 return !uv_intr_pending(pnode);
127} 131}
128 132
129/* 133/*
@@ -223,6 +227,7 @@ static int uv_rtc_set_timer(int cpu, u64 expires)
223 227
224 next_cpu = head->next_cpu; 228 next_cpu = head->next_cpu;
225 *t = expires; 229 *t = expires;
230
226 /* Will this one be next to go off? */ 231 /* Will this one be next to go off? */
227 if (next_cpu < 0 || bcpu == next_cpu || 232 if (next_cpu < 0 || bcpu == next_cpu ||
228 expires < head->cpu[next_cpu].expires) { 233 expires < head->cpu[next_cpu].expires) {
@@ -231,7 +236,7 @@ static int uv_rtc_set_timer(int cpu, u64 expires)
231 *t = ULLONG_MAX; 236 *t = ULLONG_MAX;
232 uv_rtc_find_next_timer(head, pnode); 237 uv_rtc_find_next_timer(head, pnode);
233 spin_unlock_irqrestore(&head->lock, flags); 238 spin_unlock_irqrestore(&head->lock, flags);
234 return 1; 239 return -ETIME;
235 } 240 }
236 } 241 }
237 242
@@ -244,7 +249,7 @@ static int uv_rtc_set_timer(int cpu, u64 expires)
244 * 249 *
245 * Returns 1 if this timer was pending. 250 * Returns 1 if this timer was pending.
246 */ 251 */
247static int uv_rtc_unset_timer(int cpu) 252static int uv_rtc_unset_timer(int cpu, int force)
248{ 253{
249 int pnode = uv_cpu_to_pnode(cpu); 254 int pnode = uv_cpu_to_pnode(cpu);
250 int bid = uv_cpu_to_blade_id(cpu); 255 int bid = uv_cpu_to_blade_id(cpu);
@@ -256,14 +261,15 @@ static int uv_rtc_unset_timer(int cpu)
256 261
257 spin_lock_irqsave(&head->lock, flags); 262 spin_lock_irqsave(&head->lock, flags);
258 263
259 if (head->next_cpu == bcpu && uv_read_rtc(NULL) >= *t) 264 if ((head->next_cpu == bcpu && uv_read_rtc(NULL) >= *t) || force)
260 rc = 1; 265 rc = 1;
261 266
262 *t = ULLONG_MAX; 267 if (rc) {
263 268 *t = ULLONG_MAX;
264 /* Was the hardware setup for this timer? */ 269 /* Was the hardware setup for this timer? */
265 if (head->next_cpu == bcpu) 270 if (head->next_cpu == bcpu)
266 uv_rtc_find_next_timer(head, pnode); 271 uv_rtc_find_next_timer(head, pnode);
272 }
267 273
268 spin_unlock_irqrestore(&head->lock, flags); 274 spin_unlock_irqrestore(&head->lock, flags);
269 275
@@ -277,10 +283,21 @@ static int uv_rtc_unset_timer(int cpu)
277 283
278/* 284/*
279 * Read the RTC. 285 * Read the RTC.
286 *
287 * Starting with HUB rev 2.0, the UV RTC register is replicated across all
288 * cachelines of it's own page. This allows faster simultaneous reads
289 * from a given socket.
280 */ 290 */
281static cycle_t uv_read_rtc(struct clocksource *cs) 291static cycle_t uv_read_rtc(struct clocksource *cs)
282{ 292{
283 return (cycle_t)uv_read_local_mmr(UVH_RTC); 293 unsigned long offset;
294
295 if (uv_get_min_hub_revision_id() == 1)
296 offset = 0;
297 else
298 offset = (uv_blade_processor_id() * L1_CACHE_BYTES) % PAGE_SIZE;
299
300 return (cycle_t)uv_read_local_mmr(UVH_RTC | offset);
284} 301}
285 302
286/* 303/*
@@ -310,32 +327,32 @@ static void uv_rtc_timer_setup(enum clock_event_mode mode,
310 break; 327 break;
311 case CLOCK_EVT_MODE_UNUSED: 328 case CLOCK_EVT_MODE_UNUSED:
312 case CLOCK_EVT_MODE_SHUTDOWN: 329 case CLOCK_EVT_MODE_SHUTDOWN:
313 uv_rtc_unset_timer(ced_cpu); 330 uv_rtc_unset_timer(ced_cpu, 1);
314 break; 331 break;
315 } 332 }
316} 333}
317 334
318static void uv_rtc_interrupt(void) 335static void uv_rtc_interrupt(void)
319{ 336{
320 struct clock_event_device *ced = &__get_cpu_var(cpu_ced);
321 int cpu = smp_processor_id(); 337 int cpu = smp_processor_id();
338 struct clock_event_device *ced = &per_cpu(cpu_ced, cpu);
322 339
323 if (!ced || !ced->event_handler) 340 if (!ced || !ced->event_handler)
324 return; 341 return;
325 342
326 if (uv_rtc_unset_timer(cpu) != 1) 343 if (uv_rtc_unset_timer(cpu, 0) != 1)
327 return; 344 return;
328 345
329 ced->event_handler(ced); 346 ced->event_handler(ced);
330} 347}
331 348
332static int __init uv_enable_rtc(char *str) 349static int __init uv_enable_evt_rtc(char *str)
333{ 350{
334 uv_rtc_enable = 1; 351 uv_rtc_evt_enable = 1;
335 352
336 return 1; 353 return 1;
337} 354}
338__setup("uvrtc", uv_enable_rtc); 355__setup("uvrtcevt", uv_enable_evt_rtc);
339 356
340static __init void uv_rtc_register_clockevents(struct work_struct *dummy) 357static __init void uv_rtc_register_clockevents(struct work_struct *dummy)
341{ 358{
@@ -350,27 +367,32 @@ static __init int uv_rtc_setup_clock(void)
350{ 367{
351 int rc; 368 int rc;
352 369
353 if (!uv_rtc_enable || !is_uv_system() || generic_interrupt_extension) 370 if (!is_uv_system())
354 return -ENODEV; 371 return -ENODEV;
355 372
356 generic_interrupt_extension = uv_rtc_interrupt;
357
358 clocksource_uv.mult = clocksource_hz2mult(sn_rtc_cycles_per_second, 373 clocksource_uv.mult = clocksource_hz2mult(sn_rtc_cycles_per_second,
359 clocksource_uv.shift); 374 clocksource_uv.shift);
360 375
376 /* If single blade, prefer tsc */
377 if (uv_num_possible_blades() == 1)
378 clocksource_uv.rating = 250;
379
361 rc = clocksource_register(&clocksource_uv); 380 rc = clocksource_register(&clocksource_uv);
362 if (rc) { 381 if (rc)
363 generic_interrupt_extension = NULL; 382 printk(KERN_INFO "UV RTC clocksource failed rc %d\n", rc);
383 else
384 printk(KERN_INFO "UV RTC clocksource registered freq %lu MHz\n",
385 sn_rtc_cycles_per_second/(unsigned long)1E6);
386
387 if (rc || !uv_rtc_evt_enable || x86_platform_ipi_callback)
364 return rc; 388 return rc;
365 }
366 389
367 /* Setup and register clockevents */ 390 /* Setup and register clockevents */
368 rc = uv_rtc_allocate_timers(); 391 rc = uv_rtc_allocate_timers();
369 if (rc) { 392 if (rc)
370 clocksource_unregister(&clocksource_uv); 393 goto error;
371 generic_interrupt_extension = NULL; 394
372 return rc; 395 x86_platform_ipi_callback = uv_rtc_interrupt;
373 }
374 396
375 clock_event_device_uv.mult = div_sc(sn_rtc_cycles_per_second, 397 clock_event_device_uv.mult = div_sc(sn_rtc_cycles_per_second,
376 NSEC_PER_SEC, clock_event_device_uv.shift); 398 NSEC_PER_SEC, clock_event_device_uv.shift);
@@ -383,11 +405,19 @@ static __init int uv_rtc_setup_clock(void)
383 405
384 rc = schedule_on_each_cpu(uv_rtc_register_clockevents); 406 rc = schedule_on_each_cpu(uv_rtc_register_clockevents);
385 if (rc) { 407 if (rc) {
386 clocksource_unregister(&clocksource_uv); 408 x86_platform_ipi_callback = NULL;
387 generic_interrupt_extension = NULL;
388 uv_rtc_deallocate_timers(); 409 uv_rtc_deallocate_timers();
410 goto error;
389 } 411 }
390 412
413 printk(KERN_INFO "UV RTC clockevents registered\n");
414
415 return 0;
416
417error:
418 clocksource_unregister(&clocksource_uv);
419 printk(KERN_INFO "UV RTC clockevents failed rc %d\n", rc);
420
391 return rc; 421 return rc;
392} 422}
393arch_initcall(uv_rtc_setup_clock); 423arch_initcall(uv_rtc_setup_clock);
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c
index f068553a1b17..e680ea52db9b 100644
--- a/arch/x86/kernel/visws_quirks.c
+++ b/arch/x86/kernel/visws_quirks.c
@@ -49,11 +49,6 @@ extern int no_broadcast;
49char visws_board_type = -1; 49char visws_board_type = -1;
50char visws_board_rev = -1; 50char visws_board_rev = -1;
51 51
52int is_visws_box(void)
53{
54 return visws_board_type >= 0;
55}
56
57static void __init visws_time_init(void) 52static void __init visws_time_init(void)
58{ 53{
59 printk(KERN_INFO "Starting Cobalt Timer system clock\n"); 54 printk(KERN_INFO "Starting Cobalt Timer system clock\n");
@@ -183,7 +178,7 @@ static void __init MP_processor_info(struct mpc_cpu *m)
183 return; 178 return;
184 } 179 }
185 180
186 apic_cpus = apic->apicid_to_cpu_present(m->apicid); 181 apic->apicid_to_cpu_present(m->apicid, &apic_cpus);
187 physids_or(phys_cpu_present_map, phys_cpu_present_map, apic_cpus); 182 physids_or(phys_cpu_present_map, phys_cpu_present_map, apic_cpus);
188 /* 183 /*
189 * Validate version 184 * Validate version
@@ -197,7 +192,7 @@ static void __init MP_processor_info(struct mpc_cpu *m)
197 apic_version[m->apicid] = ver; 192 apic_version[m->apicid] = ver;
198} 193}
199 194
200static void __init visws_find_smp_config(unsigned int reserve) 195static void __init visws_find_smp_config(void)
201{ 196{
202 struct mpc_cpu *mp = phys_to_virt(CO_CPU_TAB_PHYS); 197 struct mpc_cpu *mp = phys_to_virt(CO_CPU_TAB_PHYS);
203 unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS)); 198 unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS));
@@ -242,6 +237,8 @@ void __init visws_early_detect(void)
242 x86_init.irqs.pre_vector_init = visws_pre_intr_init; 237 x86_init.irqs.pre_vector_init = visws_pre_intr_init;
243 x86_init.irqs.trap_init = visws_trap_init; 238 x86_init.irqs.trap_init = visws_trap_init;
244 x86_init.timers.timer_init = visws_time_init; 239 x86_init.timers.timer_init = visws_time_init;
240 x86_init.pci.init = pci_visws_init;
241 x86_init.pci.init_irq = x86_init_noop;
245 242
246 /* 243 /*
247 * Install reboot quirks: 244 * Install reboot quirks:
@@ -486,7 +483,7 @@ static void end_cobalt_irq(unsigned int irq)
486} 483}
487 484
488static struct irq_chip cobalt_irq_type = { 485static struct irq_chip cobalt_irq_type = {
489 .typename = "Cobalt-APIC", 486 .name = "Cobalt-APIC",
490 .startup = startup_cobalt_irq, 487 .startup = startup_cobalt_irq,
491 .shutdown = disable_cobalt_irq, 488 .shutdown = disable_cobalt_irq,
492 .enable = enable_cobalt_irq, 489 .enable = enable_cobalt_irq,
@@ -508,7 +505,7 @@ static struct irq_chip cobalt_irq_type = {
508 */ 505 */
509static unsigned int startup_piix4_master_irq(unsigned int irq) 506static unsigned int startup_piix4_master_irq(unsigned int irq)
510{ 507{
511 init_8259A(0); 508 legacy_pic->init(0);
512 509
513 return startup_cobalt_irq(irq); 510 return startup_cobalt_irq(irq);
514} 511}
@@ -523,7 +520,7 @@ static void end_piix4_master_irq(unsigned int irq)
523} 520}
524 521
525static struct irq_chip piix4_master_irq_type = { 522static struct irq_chip piix4_master_irq_type = {
526 .typename = "PIIX4-master", 523 .name = "PIIX4-master",
527 .startup = startup_piix4_master_irq, 524 .startup = startup_piix4_master_irq,
528 .ack = ack_cobalt_irq, 525 .ack = ack_cobalt_irq,
529 .end = end_piix4_master_irq, 526 .end = end_piix4_master_irq,
@@ -531,10 +528,7 @@ static struct irq_chip piix4_master_irq_type = {
531 528
532 529
533static struct irq_chip piix4_virtual_irq_type = { 530static struct irq_chip piix4_virtual_irq_type = {
534 .typename = "PIIX4-virtual", 531 .name = "PIIX4-virtual",
535 .shutdown = disable_8259A_irq,
536 .enable = enable_8259A_irq,
537 .disable = disable_8259A_irq,
538}; 532};
539 533
540 534
@@ -559,7 +553,7 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id)
559 struct irq_desc *desc; 553 struct irq_desc *desc;
560 unsigned long flags; 554 unsigned long flags;
561 555
562 spin_lock_irqsave(&i8259A_lock, flags); 556 raw_spin_lock_irqsave(&i8259A_lock, flags);
563 557
564 /* Find out what's interrupting in the PIIX4 master 8259 */ 558 /* Find out what's interrupting in the PIIX4 master 8259 */
565 outb(0x0c, 0x20); /* OCW3 Poll command */ 559 outb(0x0c, 0x20); /* OCW3 Poll command */
@@ -596,7 +590,7 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id)
596 outb(0x60 + realirq, 0x20); 590 outb(0x60 + realirq, 0x20);
597 } 591 }
598 592
599 spin_unlock_irqrestore(&i8259A_lock, flags); 593 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
600 594
601 desc = irq_to_desc(realirq); 595 desc = irq_to_desc(realirq);
602 596
@@ -609,12 +603,12 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id)
609 handle_IRQ_event(realirq, desc->action); 603 handle_IRQ_event(realirq, desc->action);
610 604
611 if (!(desc->status & IRQ_DISABLED)) 605 if (!(desc->status & IRQ_DISABLED))
612 enable_8259A_irq(realirq); 606 legacy_pic->chip->unmask(realirq);
613 607
614 return IRQ_HANDLED; 608 return IRQ_HANDLED;
615 609
616out_unlock: 610out_unlock:
617 spin_unlock_irqrestore(&i8259A_lock, flags); 611 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
618 return IRQ_NONE; 612 return IRQ_NONE;
619} 613}
620 614
@@ -628,6 +622,12 @@ static struct irqaction cascade_action = {
628 .name = "cascade", 622 .name = "cascade",
629}; 623};
630 624
625static inline void set_piix4_virtual_irq_type(void)
626{
627 piix4_virtual_irq_type.shutdown = i8259A_chip.mask;
628 piix4_virtual_irq_type.enable = i8259A_chip.unmask;
629 piix4_virtual_irq_type.disable = i8259A_chip.mask;
630}
631 631
632void init_VISWS_APIC_irqs(void) 632void init_VISWS_APIC_irqs(void)
633{ 633{
@@ -653,6 +653,7 @@ void init_VISWS_APIC_irqs(void)
653 desc->chip = &piix4_master_irq_type; 653 desc->chip = &piix4_master_irq_type;
654 } 654 }
655 else if (i < CO_IRQ_APIC0) { 655 else if (i < CO_IRQ_APIC0) {
656 set_piix4_virtual_irq_type();
656 desc->chip = &piix4_virtual_irq_type; 657 desc->chip = &piix4_virtual_irq_type;
657 } 658 }
658 else if (IS_CO_APIC(i)) { 659 else if (IS_CO_APIC(i)) {
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 9c4e62539058..5ffb5622f793 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -197,9 +197,8 @@ out:
197static int do_vm86_irq_handling(int subfunction, int irqnumber); 197static int do_vm86_irq_handling(int subfunction, int irqnumber);
198static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk); 198static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk);
199 199
200int sys_vm86old(struct pt_regs *regs) 200int sys_vm86old(struct vm86_struct __user *v86, struct pt_regs *regs)
201{ 201{
202 struct vm86_struct __user *v86 = (struct vm86_struct __user *)regs->bx;
203 struct kernel_vm86_struct info; /* declare this _on top_, 202 struct kernel_vm86_struct info; /* declare this _on top_,
204 * this avoids wasting of stack space. 203 * this avoids wasting of stack space.
205 * This remains on the stack until we 204 * This remains on the stack until we
@@ -227,7 +226,7 @@ out:
227} 226}
228 227
229 228
230int sys_vm86(struct pt_regs *regs) 229int sys_vm86(unsigned long cmd, unsigned long arg, struct pt_regs *regs)
231{ 230{
232 struct kernel_vm86_struct info; /* declare this _on top_, 231 struct kernel_vm86_struct info; /* declare this _on top_,
233 * this avoids wasting of stack space. 232 * this avoids wasting of stack space.
@@ -239,12 +238,12 @@ int sys_vm86(struct pt_regs *regs)
239 struct vm86plus_struct __user *v86; 238 struct vm86plus_struct __user *v86;
240 239
241 tsk = current; 240 tsk = current;
242 switch (regs->bx) { 241 switch (cmd) {
243 case VM86_REQUEST_IRQ: 242 case VM86_REQUEST_IRQ:
244 case VM86_FREE_IRQ: 243 case VM86_FREE_IRQ:
245 case VM86_GET_IRQ_BITS: 244 case VM86_GET_IRQ_BITS:
246 case VM86_GET_AND_RESET_IRQ: 245 case VM86_GET_AND_RESET_IRQ:
247 ret = do_vm86_irq_handling(regs->bx, (int)regs->cx); 246 ret = do_vm86_irq_handling(cmd, (int)arg);
248 goto out; 247 goto out;
249 case VM86_PLUS_INSTALL_CHECK: 248 case VM86_PLUS_INSTALL_CHECK:
250 /* 249 /*
@@ -261,7 +260,7 @@ int sys_vm86(struct pt_regs *regs)
261 ret = -EPERM; 260 ret = -EPERM;
262 if (tsk->thread.saved_sp0) 261 if (tsk->thread.saved_sp0)
263 goto out; 262 goto out;
264 v86 = (struct vm86plus_struct __user *)regs->cx; 263 v86 = (struct vm86plus_struct __user *)arg;
265 tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs, 264 tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs,
266 offsetof(struct kernel_vm86_struct, regs32) - 265 offsetof(struct kernel_vm86_struct, regs32) -
267 sizeof(info.regs)); 266 sizeof(info.regs));
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index d430e4c30193..ce9fbacb7526 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -28,11 +28,13 @@
28#include <linux/mm.h> 28#include <linux/mm.h>
29#include <linux/highmem.h> 29#include <linux/highmem.h>
30#include <linux/sched.h> 30#include <linux/sched.h>
31#include <linux/gfp.h>
31#include <asm/vmi.h> 32#include <asm/vmi.h>
32#include <asm/io.h> 33#include <asm/io.h>
33#include <asm/fixmap.h> 34#include <asm/fixmap.h>
34#include <asm/apicdef.h> 35#include <asm/apicdef.h>
35#include <asm/apic.h> 36#include <asm/apic.h>
37#include <asm/pgalloc.h>
36#include <asm/processor.h> 38#include <asm/processor.h>
37#include <asm/timer.h> 39#include <asm/timer.h>
38#include <asm/vmi_time.h> 40#include <asm/vmi_time.h>
@@ -266,30 +268,6 @@ static void vmi_nop(void)
266{ 268{
267} 269}
268 270
269#ifdef CONFIG_HIGHPTE
270static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type)
271{
272 void *va = kmap_atomic(page, type);
273
274 /*
275 * Internally, the VMI ROM must map virtual addresses to physical
276 * addresses for processing MMU updates. By the time MMU updates
277 * are issued, this information is typically already lost.
278 * Fortunately, the VMI provides a cache of mapping slots for active
279 * page tables.
280 *
281 * We use slot zero for the linear mapping of physical memory, and
282 * in HIGHPTE kernels, slot 1 and 2 for KM_PTE0 and KM_PTE1.
283 *
284 * args: SLOT VA COUNT PFN
285 */
286 BUG_ON(type != KM_PTE0 && type != KM_PTE1);
287 vmi_ops.set_linear_mapping((type - KM_PTE0)+1, va, 1, page_to_pfn(page));
288
289 return va;
290}
291#endif
292
293static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn) 271static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn)
294{ 272{
295 vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0); 273 vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
@@ -640,6 +618,12 @@ static inline int __init activate_vmi(void)
640 u64 reloc; 618 u64 reloc;
641 const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc; 619 const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc;
642 620
621 /*
622 * Prevent page tables from being allocated in highmem, even if
623 * CONFIG_HIGHPTE is enabled.
624 */
625 __userpte_alloc_gfp &= ~__GFP_HIGHMEM;
626
643 if (call_vrom_func(vmi_rom, vmi_init) != 0) { 627 if (call_vrom_func(vmi_rom, vmi_init) != 0) {
644 printk(KERN_ERR "VMI ROM failed to initialize!"); 628 printk(KERN_ERR "VMI ROM failed to initialize!");
645 return 0; 629 return 0;
@@ -778,10 +762,6 @@ static inline int __init activate_vmi(void)
778 762
779 /* Set linear is needed in all cases */ 763 /* Set linear is needed in all cases */
780 vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping); 764 vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping);
781#ifdef CONFIG_HIGHPTE
782 if (vmi_ops.set_linear_mapping)
783 pv_mmu_ops.kmap_atomic_pte = vmi_kmap_atomic_pte;
784#endif
785 765
786 /* 766 /*
787 * These MUST always be patched. Don't support indirect jumps 767 * These MUST always be patched. Don't support indirect jumps
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
index 611b9e2360d3..5e1ff66ecd73 100644
--- a/arch/x86/kernel/vmiclock_32.c
+++ b/arch/x86/kernel/vmiclock_32.c
@@ -79,11 +79,7 @@ unsigned long vmi_tsc_khz(void)
79 79
80static inline unsigned int vmi_get_timer_vector(void) 80static inline unsigned int vmi_get_timer_vector(void)
81{ 81{
82#ifdef CONFIG_X86_IO_APIC 82 return IRQ0_VECTOR;
83 return FIRST_DEVICE_VECTOR;
84#else
85 return FIRST_EXTERNAL_VECTOR;
86#endif
87} 83}
88 84
89/** vmi clockchip */ 85/** vmi clockchip */
@@ -171,7 +167,7 @@ static int vmi_timer_next_event(unsigned long delta,
171{ 167{
172 /* Unfortunately, set_next_event interface only passes relative 168 /* Unfortunately, set_next_event interface only passes relative
173 * expiry, but we want absolute expiry. It'd be better if were 169 * expiry, but we want absolute expiry. It'd be better if were
174 * were passed an aboslute expiry, since a bunch of time may 170 * were passed an absolute expiry, since a bunch of time may
175 * have been stolen between the time the delta is computed and 171 * have been stolen between the time the delta is computed and
176 * when we set the alarm below. */ 172 * when we set the alarm below. */
177 cycle_t now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_ONESHOT)); 173 cycle_t now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_ONESHOT));
@@ -226,7 +222,7 @@ static void __devinit vmi_time_init_clockevent(void)
226 evt->min_delta_ns = clockevent_delta2ns(1, evt); 222 evt->min_delta_ns = clockevent_delta2ns(1, evt);
227 evt->cpumask = cpumask_of(cpu); 223 evt->cpumask = cpumask_of(cpu);
228 224
229 printk(KERN_WARNING "vmi: registering clock event %s. mult=%lu shift=%u\n", 225 printk(KERN_WARNING "vmi: registering clock event %s. mult=%u shift=%u\n",
230 evt->name, evt->mult, evt->shift); 226 evt->name, evt->mult, evt->shift);
231 clockevents_register_device(evt); 227 clockevents_register_device(evt);
232} 228}
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 3c68fe2d46cf..2cc249718c46 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -41,6 +41,32 @@ ENTRY(phys_startup_64)
41jiffies_64 = jiffies; 41jiffies_64 = jiffies;
42#endif 42#endif
43 43
44#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
45/*
46 * On 64-bit, align RODATA to 2MB so that even with CONFIG_DEBUG_RODATA
47 * we retain large page mappings for boundaries spanning kernel text, rodata
48 * and data sections.
49 *
50 * However, kernel identity mappings will have different RWX permissions
51 * to the pages mapping to text and to the pages padding (which are freed) the
52 * text section. Hence kernel identity mappings will be broken to smaller
53 * pages. For 64-bit, kernel text and kernel identity mappings are different,
54 * so we can enable protection checks that come with CONFIG_DEBUG_RODATA,
55 * as well as retain 2MB large page mappings for kernel text.
56 */
57#define X64_ALIGN_DEBUG_RODATA_BEGIN . = ALIGN(HPAGE_SIZE);
58
59#define X64_ALIGN_DEBUG_RODATA_END \
60 . = ALIGN(HPAGE_SIZE); \
61 __end_rodata_hpage_align = .;
62
63#else
64
65#define X64_ALIGN_DEBUG_RODATA_BEGIN
66#define X64_ALIGN_DEBUG_RODATA_END
67
68#endif
69
44PHDRS { 70PHDRS {
45 text PT_LOAD FLAGS(5); /* R_E */ 71 text PT_LOAD FLAGS(5); /* R_E */
46 data PT_LOAD FLAGS(7); /* RWE */ 72 data PT_LOAD FLAGS(7); /* RWE */
@@ -90,7 +116,9 @@ SECTIONS
90 116
91 EXCEPTION_TABLE(16) :text = 0x9090 117 EXCEPTION_TABLE(16) :text = 0x9090
92 118
119 X64_ALIGN_DEBUG_RODATA_BEGIN
93 RO_DATA(PAGE_SIZE) 120 RO_DATA(PAGE_SIZE)
121 X64_ALIGN_DEBUG_RODATA_END
94 122
95 /* Data */ 123 /* Data */
96 .data : AT(ADDR(.data) - LOAD_OFFSET) { 124 .data : AT(ADDR(.data) - LOAD_OFFSET) {
@@ -107,13 +135,13 @@ SECTIONS
107 135
108 PAGE_ALIGNED_DATA(PAGE_SIZE) 136 PAGE_ALIGNED_DATA(PAGE_SIZE)
109 137
110 CACHELINE_ALIGNED_DATA(CONFIG_X86_L1_CACHE_BYTES) 138 CACHELINE_ALIGNED_DATA(L1_CACHE_BYTES)
111 139
112 DATA_DATA 140 DATA_DATA
113 CONSTRUCTORS 141 CONSTRUCTORS
114 142
115 /* rarely changed data like cpu maps */ 143 /* rarely changed data like cpu maps */
116 READ_MOSTLY_DATA(CONFIG_X86_INTERNODE_CACHE_BYTES) 144 READ_MOSTLY_DATA(INTERNODE_CACHE_BYTES)
117 145
118 /* End of data section */ 146 /* End of data section */
119 _edata = .; 147 _edata = .;
@@ -137,12 +165,12 @@ SECTIONS
137 *(.vsyscall_0) 165 *(.vsyscall_0)
138 } :user 166 } :user
139 167
140 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); 168 . = ALIGN(L1_CACHE_BYTES);
141 .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { 169 .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) {
142 *(.vsyscall_fn) 170 *(.vsyscall_fn)
143 } 171 }
144 172
145 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); 173 . = ALIGN(L1_CACHE_BYTES);
146 .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) { 174 .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) {
147 *(.vsyscall_gtod_data) 175 *(.vsyscall_gtod_data)
148 } 176 }
@@ -166,7 +194,7 @@ SECTIONS
166 } 194 }
167 vgetcpu_mode = VVIRT(.vgetcpu_mode); 195 vgetcpu_mode = VVIRT(.vgetcpu_mode);
168 196
169 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); 197 . = ALIGN(L1_CACHE_BYTES);
170 .jiffies : AT(VLOAD(.jiffies)) { 198 .jiffies : AT(VLOAD(.jiffies)) {
171 *(.jiffies) 199 *(.jiffies)
172 } 200 }
@@ -263,8 +291,8 @@ SECTIONS
263 .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { 291 .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
264 __smp_locks = .; 292 __smp_locks = .;
265 *(.smp_locks) 293 *(.smp_locks)
266 __smp_locks_end = .;
267 . = ALIGN(PAGE_SIZE); 294 . = ALIGN(PAGE_SIZE);
295 __smp_locks_end = .;
268 } 296 }
269 297
270#ifdef CONFIG_X86_64 298#ifdef CONFIG_X86_64
@@ -291,9 +319,7 @@ SECTIONS
291 __brk_limit = .; 319 __brk_limit = .;
292 } 320 }
293 321
294 .end : AT(ADDR(.end) - LOAD_OFFSET) { 322 _end = .;
295 _end = .;
296 }
297 323
298 STABS_DEBUG 324 STABS_DEBUG
299 DWARF_DEBUG 325 DWARF_DEBUG
@@ -315,7 +341,7 @@ SECTIONS
315 * Per-cpu symbols which need to be offset from __per_cpu_load 341 * Per-cpu symbols which need to be offset from __per_cpu_load
316 * for the boot processor. 342 * for the boot processor.
317 */ 343 */
318#define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load 344#define INIT_PER_CPU(x) init_per_cpu__##x = x + __per_cpu_load
319INIT_PER_CPU(gdt_page); 345INIT_PER_CPU(gdt_page);
320INIT_PER_CPU(irq_stack_union); 346INIT_PER_CPU(irq_stack_union);
321 347
@@ -326,7 +352,7 @@ INIT_PER_CPU(irq_stack_union);
326 "kernel image bigger than KERNEL_IMAGE_SIZE"); 352 "kernel image bigger than KERNEL_IMAGE_SIZE");
327 353
328#ifdef CONFIG_SMP 354#ifdef CONFIG_SMP
329. = ASSERT((per_cpu__irq_stack_union == 0), 355. = ASSERT((irq_stack_union == 0),
330 "irq_stack_union is not at start of per-cpu area"); 356 "irq_stack_union is not at start of per-cpu area");
331#endif 357#endif
332 358
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 8cb4974ff599..1c0c6ab9c60f 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -73,7 +73,8 @@ void update_vsyscall_tz(void)
73 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); 73 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
74} 74}
75 75
76void update_vsyscall(struct timespec *wall_time, struct clocksource *clock) 76void update_vsyscall(struct timespec *wall_time, struct clocksource *clock,
77 u32 mult)
77{ 78{
78 unsigned long flags; 79 unsigned long flags;
79 80
@@ -82,7 +83,7 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)
82 vsyscall_gtod_data.clock.vread = clock->vread; 83 vsyscall_gtod_data.clock.vread = clock->vread;
83 vsyscall_gtod_data.clock.cycle_last = clock->cycle_last; 84 vsyscall_gtod_data.clock.cycle_last = clock->cycle_last;
84 vsyscall_gtod_data.clock.mask = clock->mask; 85 vsyscall_gtod_data.clock.mask = clock->mask;
85 vsyscall_gtod_data.clock.mult = clock->mult; 86 vsyscall_gtod_data.clock.mult = mult;
86 vsyscall_gtod_data.clock.shift = clock->shift; 87 vsyscall_gtod_data.clock.shift = clock->shift;
87 vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; 88 vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
88 vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; 89 vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
@@ -237,7 +238,7 @@ static ctl_table kernel_table2[] = {
237}; 238};
238 239
239static ctl_table kernel_root_table2[] = { 240static ctl_table kernel_root_table2[] = {
240 { .ctl_name = CTL_KERN, .procname = "kernel", .mode = 0555, 241 { .procname = "kernel", .mode = 0555,
241 .child = kernel_table2 }, 242 .child = kernel_table2 },
242 {} 243 {}
243}; 244};
@@ -300,7 +301,8 @@ static int __init vsyscall_init(void)
300 register_sysctl_table(kernel_root_table2); 301 register_sysctl_table(kernel_root_table2);
301#endif 302#endif
302 on_each_cpu(cpu_vsyscall_init, NULL, 1); 303 on_each_cpu(cpu_vsyscall_init, NULL, 1);
303 hotcpu_notifier(cpu_vsyscall_notifier, 0); 304 /* notifier priority > KVM */
305 hotcpu_notifier(cpu_vsyscall_notifier, 30);
304 return 0; 306 return 0;
305} 307}
306 308
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 3909e3ba5ce3..693920b22496 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -17,8 +17,6 @@
17EXPORT_SYMBOL(mcount); 17EXPORT_SYMBOL(mcount);
18#endif 18#endif
19 19
20EXPORT_SYMBOL(kernel_thread);
21
22EXPORT_SYMBOL(__get_user_1); 20EXPORT_SYMBOL(__get_user_1);
23EXPORT_SYMBOL(__get_user_2); 21EXPORT_SYMBOL(__get_user_2);
24EXPORT_SYMBOL(__get_user_4); 22EXPORT_SYMBOL(__get_user_4);
@@ -28,11 +26,11 @@ EXPORT_SYMBOL(__put_user_2);
28EXPORT_SYMBOL(__put_user_4); 26EXPORT_SYMBOL(__put_user_4);
29EXPORT_SYMBOL(__put_user_8); 27EXPORT_SYMBOL(__put_user_8);
30 28
31EXPORT_SYMBOL(copy_user_generic); 29EXPORT_SYMBOL(copy_user_generic_string);
30EXPORT_SYMBOL(copy_user_generic_unrolled);
32EXPORT_SYMBOL(__copy_user_nocache); 31EXPORT_SYMBOL(__copy_user_nocache);
33EXPORT_SYMBOL(copy_from_user); 32EXPORT_SYMBOL(_copy_from_user);
34EXPORT_SYMBOL(copy_to_user); 33EXPORT_SYMBOL(_copy_to_user);
35EXPORT_SYMBOL(__copy_from_user_inatomic);
36 34
37EXPORT_SYMBOL(copy_page); 35EXPORT_SYMBOL(copy_page);
38EXPORT_SYMBOL(clear_page); 36EXPORT_SYMBOL(clear_page);
@@ -57,4 +55,6 @@ EXPORT_SYMBOL(__memcpy);
57 55
58EXPORT_SYMBOL(empty_zero_page); 56EXPORT_SYMBOL(empty_zero_page);
59EXPORT_SYMBOL(init_level4_pgt); 57EXPORT_SYMBOL(init_level4_pgt);
60EXPORT_SYMBOL(load_gs_index); 58#ifndef CONFIG_PARAVIRT
59EXPORT_SYMBOL(native_load_gs_index);
60#endif
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 4449a4a2c2ed..61a1e8c7e19f 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -4,20 +4,26 @@
4 * For licencing details see kernel-base/COPYING 4 * For licencing details see kernel-base/COPYING
5 */ 5 */
6#include <linux/init.h> 6#include <linux/init.h>
7#include <linux/ioport.h>
7 8
8#include <asm/bios_ebda.h> 9#include <asm/bios_ebda.h>
9#include <asm/paravirt.h> 10#include <asm/paravirt.h>
11#include <asm/pci_x86.h>
10#include <asm/mpspec.h> 12#include <asm/mpspec.h>
11#include <asm/setup.h> 13#include <asm/setup.h>
12#include <asm/apic.h> 14#include <asm/apic.h>
13#include <asm/e820.h> 15#include <asm/e820.h>
14#include <asm/time.h> 16#include <asm/time.h>
15#include <asm/irq.h> 17#include <asm/irq.h>
18#include <asm/pat.h>
16#include <asm/tsc.h> 19#include <asm/tsc.h>
20#include <asm/iommu.h>
17 21
18void __cpuinit x86_init_noop(void) { } 22void __cpuinit x86_init_noop(void) { }
19void __init x86_init_uint_noop(unsigned int unused) { } 23void __init x86_init_uint_noop(unsigned int unused) { }
20void __init x86_init_pgd_noop(pgd_t *unused) { } 24void __init x86_init_pgd_noop(pgd_t *unused) { }
25int __init iommu_init_noop(void) { return 0; }
26void iommu_shutdown_noop(void) { }
21 27
22/* 28/*
23 * The platform setup functions are preset with the default functions 29 * The platform setup functions are preset with the default functions
@@ -62,14 +68,29 @@ struct x86_init_ops x86_init __initdata = {
62 .tsc_pre_init = x86_init_noop, 68 .tsc_pre_init = x86_init_noop,
63 .timer_init = hpet_time_init, 69 .timer_init = hpet_time_init,
64 }, 70 },
71
72 .iommu = {
73 .iommu_init = iommu_init_noop,
74 },
75
76 .pci = {
77 .init = x86_default_pci_init,
78 .init_irq = x86_default_pci_init_irq,
79 .fixup_irqs = x86_default_pci_fixup_irqs,
80 },
65}; 81};
66 82
67struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = { 83struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = {
68 .setup_percpu_clockev = setup_secondary_APIC_clock, 84 .setup_percpu_clockev = setup_secondary_APIC_clock,
69}; 85};
70 86
87static void default_nmi_init(void) { };
88
71struct x86_platform_ops x86_platform = { 89struct x86_platform_ops x86_platform = {
72 .calibrate_tsc = native_calibrate_tsc, 90 .calibrate_tsc = native_calibrate_tsc,
73 .get_wallclock = mach_get_cmos_time, 91 .get_wallclock = mach_get_cmos_time,
74 .set_wallclock = mach_set_rtc_mmss, 92 .set_wallclock = mach_set_rtc_mmss,
93 .iommu_shutdown = iommu_shutdown_noop,
94 .is_untracked_pat_range = is_ISA_range,
95 .nmi_init = default_nmi_init
75}; 96};
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index c5ee17e8c6d9..782c3a362ec6 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -337,6 +337,7 @@ void __ref xsave_cntxt_init(void)
337 cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx); 337 cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx);
338 xstate_size = ebx; 338 xstate_size = ebx;
339 339
340 update_regset_xstate_info(xstate_size, pcntxt_mask);
340 prepare_fx_sw_frame(); 341 prepare_fx_sw_frame();
341 342
342 setup_xstate_init(); 343 setup_xstate_init();