author     Thomas Gleixner <tglx@linutronix.de>   2010-02-17 12:27:37 -0500
committer  Thomas Gleixner <tglx@linutronix.de>   2010-02-17 12:28:05 -0500
commit     b7e56edba4b02f2079042c326a8cd72a44635817 (patch)
tree       b5042002e9747cd8fb1278d61f86d8b92a74c018 /arch/x86/kernel
parent     13ca0fcaa33f6b1984c4111b6ec5df42689fea6f (diff)
parent     b0483e78e5c4c9871fc5541875b3bc006846d46b (diff)
Merge branch 'linus' into x86/mm
x86/mm is on 32-rc4 and missing the spinlock namespace changes which are needed for further commits into this topic.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
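For context, the "spinlock namespace changes" pulled in here are presumably the 2.6.33-era rename in which the lowest-level lock type became arch_spinlock_t and the non-sleeping kernel lock became raw_spinlock_t with a raw_spin_*() API. A minimal sketch of that renamed interface follows; the lock and function names are illustrative only and are not part of this merge:

#include <linux/spinlock.h>

/* Illustrative only: a non-sleeping lock using the post-rename raw_spin_*() API. */
static DEFINE_RAW_SPINLOCK(example_lock);

static void example_critical_section(void)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&example_lock, flags);
	/* work that must not sleep or be preempted */
	raw_spin_unlock_irqrestore(&example_lock, flags);
}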
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--  arch/x86/kernel/Makefile | 3
-rw-r--r--  arch/x86/kernel/acpi/Makefile | 2
-rw-r--r--  arch/x86/kernel/acpi/boot.c | 30
-rw-r--r--  arch/x86/kernel/acpi/cstate.c | 2
-rw-r--r--  arch/x86/kernel/acpi/processor.c | 100
-rw-r--r--  arch/x86/kernel/acpi/realmode/wakeup.lds.S | 3
-rw-r--r--  arch/x86/kernel/acpi/sleep.c | 2
-rw-r--r--  arch/x86/kernel/amd_iommu.c | 1312
-rw-r--r--  arch/x86/kernel/amd_iommu_init.c | 143
-rw-r--r--  arch/x86/kernel/aperture_64.c | 14
-rw-r--r--  arch/x86/kernel/apic/Makefile | 2
-rw-r--r--  arch/x86/kernel/apic/apic.c | 58
-rw-r--r--  arch/x86/kernel/apic/apic_flat_64.c | 10
-rw-r--r--  arch/x86/kernel/apic/apic_noop.c | 200
-rw-r--r--  arch/x86/kernel/apic/bigsmp_32.c | 18
-rw-r--r--  arch/x86/kernel/apic/es7000_32.c | 28
-rw-r--r--  arch/x86/kernel/apic/io_apic.c | 454
-rw-r--r--  arch/x86/kernel/apic/nmi.c | 19
-rw-r--r--  arch/x86/kernel/apic/numaq_32.c | 13
-rw-r--r--  arch/x86/kernel/apic/probe_32.c | 31
-rw-r--r--  arch/x86/kernel/apic/probe_64.c | 13
-rw-r--r--  arch/x86/kernel/apic/summit_32.c | 10
-rw-r--r--  arch/x86/kernel/apic/x2apic_cluster.c | 5
-rw-r--r--  arch/x86/kernel/apic/x2apic_phys.c | 5
-rw-r--r--  arch/x86/kernel/apic/x2apic_uv_x.c | 59
-rw-r--r--  arch/x86/kernel/apm_32.c | 14
-rw-r--r--  arch/x86/kernel/bios_uv.c | 8
-rw-r--r--  arch/x86/kernel/cpu/Makefile | 3
-rw-r--r--  arch/x86/kernel/cpu/addon_cpuid_features.c | 15
-rw-r--r--  arch/x86/kernel/cpu/amd.c | 57
-rw-r--r--  arch/x86/kernel/cpu/centaur.c | 2
-rw-r--r--  arch/x86/kernel/cpu/common.c | 50
-rw-r--r--  arch/x86/kernel/cpu/cpu.h | 2
-rw-r--r--  arch/x86/kernel/cpu/cpu_debug.c | 688
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 72
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/longhaul.c | 4
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/powernow-k6.c | 2
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/powernow-k7.c | 19
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 37
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/speedstep-ich.c | 21
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/speedstep-lib.c | 6
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/speedstep-lib.h | 24
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/speedstep-smi.c | 2
-rw-r--r--  arch/x86/kernel/cpu/cyrix.c | 2
-rw-r--r--  arch/x86/kernel/cpu/intel.c | 3
-rw-r--r--  arch/x86/kernel/cpu/intel_cacheinfo.c | 88
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-inject.c | 22
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.c | 115
-rw-r--r--  arch/x86/kernel/cpu/mcheck/therm_throt.c | 45
-rw-r--r--  arch/x86/kernel/cpu/mtrr/cleanup.c | 2
-rw-r--r--  arch/x86/kernel/cpu/mtrr/if.c | 11
-rw-r--r--  arch/x86/kernel/cpu/perf_event.c | 248
-rw-r--r--  arch/x86/kernel/cpu/perfctr-watchdog.c | 2
-rw-r--r--  arch/x86/kernel/cpu/transmeta.c | 2
-rw-r--r--  arch/x86/kernel/cpuid.c | 24
-rw-r--r--  arch/x86/kernel/crash.c | 5
-rw-r--r--  arch/x86/kernel/crash_dump_32.c | 19
-rw-r--r--  arch/x86/kernel/ds.c | 4
-rw-r--r--  arch/x86/kernel/dumpstack.c | 50
-rw-r--r--  arch/x86/kernel/dumpstack.h | 6
-rw-r--r--  arch/x86/kernel/dumpstack_32.c | 9
-rw-r--r--  arch/x86/kernel/dumpstack_64.c | 83
-rw-r--r--  arch/x86/kernel/e820.c | 13
-rw-r--r--  arch/x86/kernel/efi.c | 2
-rw-r--r--  arch/x86/kernel/entry_32.S | 100
-rw-r--r--  arch/x86/kernel/entry_64.S | 84
-rw-r--r--  arch/x86/kernel/ftrace.c | 84
-rw-r--r--  arch/x86/kernel/geode_32.c | 196
-rw-r--r--  arch/x86/kernel/head32.c | 2
-rw-r--r--  arch/x86/kernel/head64.c | 2
-rw-r--r--  arch/x86/kernel/head_64.S | 4
-rw-r--r--  arch/x86/kernel/hpet.c | 85
-rw-r--r--  arch/x86/kernel/hw_breakpoint.c | 554
-rw-r--r--  arch/x86/kernel/ioport.c | 28
-rw-r--r--  arch/x86/kernel/irq.c | 132
-rw-r--r--  arch/x86/kernel/irq_32.c | 45
-rw-r--r--  arch/x86/kernel/irq_64.c | 58
-rw-r--r--  arch/x86/kernel/irqinit.c | 4
-rw-r--r--  arch/x86/kernel/kgdb.c | 241
-rw-r--r--  arch/x86/kernel/kprobes.c | 261
-rw-r--r--  arch/x86/kernel/machine_kexec_32.c | 2
-rw-r--r--  arch/x86/kernel/machine_kexec_64.c | 2
-rw-r--r--  arch/x86/kernel/mfgpt_32.c | 410
-rw-r--r--  arch/x86/kernel/microcode_amd.c | 61
-rw-r--r--  arch/x86/kernel/microcode_core.c | 28
-rw-r--r--  arch/x86/kernel/microcode_intel.c | 47
-rw-r--r--  arch/x86/kernel/mpparse.c | 10
-rw-r--r--  arch/x86/kernel/msr.c | 25
-rw-r--r--  arch/x86/kernel/olpc.c | 4
-rw-r--r--  arch/x86/kernel/paravirt-spinlocks.c | 4
-rw-r--r--  arch/x86/kernel/pci-calgary_64.c | 100
-rw-r--r--  arch/x86/kernel/pci-dma.c | 47
-rw-r--r--  arch/x86/kernel/pci-gart_64.c | 163
-rw-r--r--  arch/x86/kernel/pci-nommu.c | 11
-rw-r--r--  arch/x86/kernel/pci-swiotlb.c | 21
-rw-r--r--  arch/x86/kernel/process.c | 128
-rw-r--r--  arch/x86/kernel/process_32.c | 111
-rw-r--r--  arch/x86/kernel/process_64.c | 128
-rw-r--r--  arch/x86/kernel/ptrace.c | 480
-rw-r--r--  arch/x86/kernel/quirks.c | 22
-rw-r--r--  arch/x86/kernel/reboot.c | 29
-rw-r--r--  arch/x86/kernel/reboot_fixups_32.c | 3
-rw-r--r--  arch/x86/kernel/setup.c | 39
-rw-r--r--  arch/x86/kernel/setup_percpu.c | 13
-rw-r--r--  arch/x86/kernel/signal.c | 24
-rw-r--r--  arch/x86/kernel/smp.c | 1
-rw-r--r--  arch/x86/kernel/smpboot.c | 60
-rw-r--r--  arch/x86/kernel/stacktrace.c | 18
-rw-r--r--  arch/x86/kernel/sys_i386_32.c | 27
-rw-r--r--  arch/x86/kernel/sys_x86_64.c | 17
-rw-r--r--  arch/x86/kernel/syscall_table_32.S | 3
-rw-r--r--  arch/x86/kernel/time.c | 3
-rw-r--r--  arch/x86/kernel/tlb_uv.c | 9
-rw-r--r--  arch/x86/kernel/trampoline.c | 30
-rw-r--r--  arch/x86/kernel/trampoline_64.S | 4
-rw-r--r--  arch/x86/kernel/traps.c | 73
-rw-r--r--  arch/x86/kernel/tsc.c | 1
-rw-r--r--  arch/x86/kernel/tsc_sync.c | 23
-rw-r--r--  arch/x86/kernel/uv_irq.c | 238
-rw-r--r--  arch/x86/kernel/uv_time.c | 93
-rw-r--r--  arch/x86/kernel/visws_quirks.c | 8
-rw-r--r--  arch/x86/kernel/vm86_32.c | 11
-rw-r--r--  arch/x86/kernel/vmi_32.c | 2
-rw-r--r--  arch/x86/kernel/vmiclock_32.c | 2
-rw-r--r--  arch/x86/kernel/vmlinux.lds.S | 7
-rw-r--r--  arch/x86/kernel/vsyscall_64.c | 7
-rw-r--r--  arch/x86/kernel/x8664_ksyms_64.c | 11
-rw-r--r--  arch/x86/kernel/x86_init.c | 8
128 files changed, 4542 insertions, 4223 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index d8e5d0cdd678..d87f09bc5a52 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -40,7 +40,7 @@ obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
 obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o
 obj-y += bootflag.o e820.o
 obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o
-obj-y += alternative.o i8253.o pci-nommu.o
+obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o
 obj-y += tsc.o io_delay.o rtc.o
 
 obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o
@@ -89,7 +89,6 @@ obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
 obj-$(CONFIG_HPET_TIMER) += hpet.o
 
 obj-$(CONFIG_K8_NB) += k8.o
-obj-$(CONFIG_MGEODE_LX) += geode_32.o mfgpt_32.o
 obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o
 obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o
 
diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile
index fd5ca97a2ad5..6f35260bb3ef 100644
--- a/arch/x86/kernel/acpi/Makefile
+++ b/arch/x86/kernel/acpi/Makefile
@@ -4,7 +4,7 @@ obj-$(CONFIG_ACPI) += boot.o
 obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_rm.o wakeup_$(BITS).o
 
 ifneq ($(CONFIG_ACPI_PROCESSOR),)
-obj-y += cstate.o processor.o
+obj-y += cstate.o
 endif
 
 $(obj)/wakeup_rm.o: $(obj)/realmode/wakeup.bin
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 67e929b89875..0acbcdfa5ca4 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -624,6 +624,7 @@ static int __init acpi_parse_hpet(struct acpi_table_header *table)
 	}
 
 	hpet_address = hpet_tbl->address.address;
+	hpet_blockid = hpet_tbl->sequence;
 
 	/*
 	 * Some broken BIOSes advertise HPET at 0x0. We really do not
@@ -1122,7 +1123,7 @@ static int __init acpi_parse_madt_ioapic_entries(void)
 	if (!acpi_sci_override_gsi)
 		acpi_sci_ioapic_setup(acpi_gbl_FADT.sci_interrupt, 0, 0);
 
-	/* Fill in identity legacy mapings where no override */
+	/* Fill in identity legacy mappings where no override */
 	mp_config_acpi_legacy_irqs();
 
 	count =
@@ -1184,9 +1185,6 @@ static void __init acpi_process_madt(void)
 		if (!error) {
 			acpi_lapic = 1;
 
-#ifdef CONFIG_X86_BIGSMP
-			generic_bigsmp_probe();
-#endif
 			/*
 			 * Parse MADT IO-APIC entries
 			 */
@@ -1196,8 +1194,6 @@ static void __init acpi_process_madt(void)
 				acpi_ioapic = 1;
 
 				smp_found_config = 1;
-				if (apic->setup_apic_routing)
-					apic->setup_apic_routing();
 			}
 		}
 		if (error == -EINVAL) {
@@ -1528,16 +1524,10 @@ static struct dmi_system_id __initdata acpi_dmi_table_late[] = {
  *	if acpi_blacklisted() acpi_disabled = 1;
  *	acpi_irq_model=...
  *	...
- *
- * return value: (currently ignored)
- *	0: success
- *	!0: failure
  */
 
-int __init acpi_boot_table_init(void)
+void __init acpi_boot_table_init(void)
 {
-	int error;
-
 	dmi_check_system(acpi_dmi_table);
 
 	/*
@@ -1545,15 +1535,14 @@ int __init acpi_boot_table_init(void)
 	 * One exception: acpi=ht continues far enough to enumerate LAPICs
 	 */
 	if (acpi_disabled && !acpi_ht)
-		return 1;
+		return;
 
 	/*
 	 * Initialize the ACPI boot-time table parser.
 	 */
-	error = acpi_table_init();
-	if (error) {
+	if (acpi_table_init()) {
 		disable_acpi();
-		return error;
+		return;
 	}
 
 	acpi_table_parse(ACPI_SIG_BOOT, acpi_parse_sbf);
@@ -1561,18 +1550,15 @@ int __init acpi_boot_table_init(void)
 	/*
 	 * blacklist may disable ACPI entirely
 	 */
-	error = acpi_blacklisted();
-	if (error) {
+	if (acpi_blacklisted()) {
 		if (acpi_force) {
 			printk(KERN_WARNING PREFIX "acpi=force override\n");
 		} else {
 			printk(KERN_WARNING PREFIX "Disabling ACPI support\n");
 			disable_acpi();
-			return error;
+			return;
 		}
 	}
-
-	return 0;
 }
 
 int __init early_acpi_boot_init(void)
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index 59cdfa4686b2..2e837f5080fe 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -48,7 +48,7 @@ void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags,
 	 * P4, Core and beyond CPUs
 	 */
 	if (c->x86_vendor == X86_VENDOR_INTEL &&
-	    (c->x86 > 0xf || (c->x86 == 6 && c->x86_model >= 14)))
+	    (c->x86 > 0xf || (c->x86 == 6 && c->x86_model >= 0x0f)))
 		flags->bm_control = 0;
 }
 EXPORT_SYMBOL(acpi_processor_power_init_bm_check);
diff --git a/arch/x86/kernel/acpi/processor.c b/arch/x86/kernel/acpi/processor.c
deleted file mode 100644
index d296f4a195c9..000000000000
--- a/arch/x86/kernel/acpi/processor.c
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * Copyright (C) 2005 Intel Corporation
- * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
- * - Added _PDC for platforms with Intel CPUs
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/acpi.h>
-
-#include <acpi/processor.h>
-#include <asm/acpi.h>
-
-static void init_intel_pdc(struct acpi_processor *pr, struct cpuinfo_x86 *c)
-{
-	struct acpi_object_list *obj_list;
-	union acpi_object *obj;
-	u32 *buf;
-
-	/* allocate and initialize pdc. It will be used later. */
-	obj_list = kmalloc(sizeof(struct acpi_object_list), GFP_KERNEL);
-	if (!obj_list) {
-		printk(KERN_ERR "Memory allocation error\n");
-		return;
-	}
-
-	obj = kmalloc(sizeof(union acpi_object), GFP_KERNEL);
-	if (!obj) {
-		printk(KERN_ERR "Memory allocation error\n");
-		kfree(obj_list);
-		return;
-	}
-
-	buf = kmalloc(12, GFP_KERNEL);
-	if (!buf) {
-		printk(KERN_ERR "Memory allocation error\n");
-		kfree(obj);
-		kfree(obj_list);
-		return;
-	}
-
-	buf[0] = ACPI_PDC_REVISION_ID;
-	buf[1] = 1;
-	buf[2] = ACPI_PDC_C_CAPABILITY_SMP;
-
-	/*
-	 * The default of PDC_SMP_T_SWCOORD bit is set for intel x86 cpu so
-	 * that OSPM is capable of native ACPI throttling software
-	 * coordination using BIOS supplied _TSD info.
-	 */
-	buf[2] |= ACPI_PDC_SMP_T_SWCOORD;
-	if (cpu_has(c, X86_FEATURE_EST))
-		buf[2] |= ACPI_PDC_EST_CAPABILITY_SWSMP;
-
-	if (cpu_has(c, X86_FEATURE_ACPI))
-		buf[2] |= ACPI_PDC_T_FFH;
-
-	/*
-	 * If mwait/monitor is unsupported, C2/C3_FFH will be disabled
-	 */
-	if (!cpu_has(c, X86_FEATURE_MWAIT))
-		buf[2] &= ~(ACPI_PDC_C_C2C3_FFH);
-
-	obj->type = ACPI_TYPE_BUFFER;
-	obj->buffer.length = 12;
-	obj->buffer.pointer = (u8 *) buf;
-	obj_list->count = 1;
-	obj_list->pointer = obj;
-	pr->pdc = obj_list;
-
-	return;
-}
-
-
-/* Initialize _PDC data based on the CPU vendor */
-void arch_acpi_processor_init_pdc(struct acpi_processor *pr)
-{
-	struct cpuinfo_x86 *c = &cpu_data(pr->id);
-
-	pr->pdc = NULL;
-	if (c->x86_vendor == X86_VENDOR_INTEL)
-		init_intel_pdc(pr, c);
-
-	return;
-}
-
-EXPORT_SYMBOL(arch_acpi_processor_init_pdc);
-
-void arch_acpi_processor_cleanup_pdc(struct acpi_processor *pr)
-{
-	if (pr->pdc) {
-		kfree(pr->pdc->pointer->buffer.pointer);
-		kfree(pr->pdc->pointer);
-		kfree(pr->pdc);
-		pr->pdc = NULL;
-	}
-}
-
-EXPORT_SYMBOL(arch_acpi_processor_cleanup_pdc);
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.lds.S b/arch/x86/kernel/acpi/realmode/wakeup.lds.S
index 7da00b799cda..060fff8f5c5b 100644
--- a/arch/x86/kernel/acpi/realmode/wakeup.lds.S
+++ b/arch/x86/kernel/acpi/realmode/wakeup.lds.S
@@ -57,5 +57,8 @@ SECTIONS
 		*(.note*)
 	}
 
+	/*
+	 * The ASSERT() sink to . is intentional, for binutils 2.14 compatibility:
+	 */
 	. = ASSERT(_end <= WAKEUP_SIZE, "Wakeup too big!");
 }
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 82e508677b91..f9961034e557 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -162,6 +162,8 @@ static int __init acpi_sleep_setup(char *str)
 #endif
 		if (strncmp(str, "old_ordering", 12) == 0)
 			acpi_old_suspend_ordering();
+		if (strncmp(str, "sci_force_enable", 16) == 0)
+			acpi_set_sci_en_on_resume();
 		str = strchr(str, ',');
 		if (str != NULL)
 			str += strspn(str, ", \t");
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 98f230f6a28d..adb0ba025702 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2007-2008 Advanced Micro Devices, Inc. 2 * Copyright (C) 2007-2009 Advanced Micro Devices, Inc.
3 * Author: Joerg Roedel <joerg.roedel@amd.com> 3 * Author: Joerg Roedel <joerg.roedel@amd.com>
4 * Leo Duran <leo.duran@amd.com> 4 * Leo Duran <leo.duran@amd.com>
5 * 5 *
@@ -19,7 +19,7 @@
19 19
20#include <linux/pci.h> 20#include <linux/pci.h>
21#include <linux/gfp.h> 21#include <linux/gfp.h>
22#include <linux/bitops.h> 22#include <linux/bitmap.h>
23#include <linux/debugfs.h> 23#include <linux/debugfs.h>
24#include <linux/scatterlist.h> 24#include <linux/scatterlist.h>
25#include <linux/dma-mapping.h> 25#include <linux/dma-mapping.h>
@@ -28,6 +28,7 @@
28#include <asm/proto.h> 28#include <asm/proto.h>
29#include <asm/iommu.h> 29#include <asm/iommu.h>
30#include <asm/gart.h> 30#include <asm/gart.h>
31#include <asm/amd_iommu_proto.h>
31#include <asm/amd_iommu_types.h> 32#include <asm/amd_iommu_types.h>
32#include <asm/amd_iommu.h> 33#include <asm/amd_iommu.h>
33 34
@@ -56,20 +57,152 @@ struct iommu_cmd {
56 u32 data[4]; 57 u32 data[4];
57}; 58};
58 59
59static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
60 struct unity_map_entry *e);
61static struct dma_ops_domain *find_protection_domain(u16 devid);
62static u64 *alloc_pte(struct protection_domain *domain,
63 unsigned long address, int end_lvl,
64 u64 **pte_page, gfp_t gfp);
65static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
66 unsigned long start_page,
67 unsigned int pages);
68static void reset_iommu_command_buffer(struct amd_iommu *iommu); 60static void reset_iommu_command_buffer(struct amd_iommu *iommu);
69static u64 *fetch_pte(struct protection_domain *domain,
70 unsigned long address, int map_size);
71static void update_domain(struct protection_domain *domain); 61static void update_domain(struct protection_domain *domain);
72 62
63/****************************************************************************
64 *
65 * Helper functions
66 *
67 ****************************************************************************/
68
69static inline u16 get_device_id(struct device *dev)
70{
71 struct pci_dev *pdev = to_pci_dev(dev);
72
73 return calc_devid(pdev->bus->number, pdev->devfn);
74}
75
76static struct iommu_dev_data *get_dev_data(struct device *dev)
77{
78 return dev->archdata.iommu;
79}
80
81/*
82 * In this function the list of preallocated protection domains is traversed to
83 * find the domain for a specific device
84 */
85static struct dma_ops_domain *find_protection_domain(u16 devid)
86{
87 struct dma_ops_domain *entry, *ret = NULL;
88 unsigned long flags;
89 u16 alias = amd_iommu_alias_table[devid];
90
91 if (list_empty(&iommu_pd_list))
92 return NULL;
93
94 spin_lock_irqsave(&iommu_pd_list_lock, flags);
95
96 list_for_each_entry(entry, &iommu_pd_list, list) {
97 if (entry->target_dev == devid ||
98 entry->target_dev == alias) {
99 ret = entry;
100 break;
101 }
102 }
103
104 spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
105
106 return ret;
107}
108
109/*
110 * This function checks if the driver got a valid device from the caller to
111 * avoid dereferencing invalid pointers.
112 */
113static bool check_device(struct device *dev)
114{
115 u16 devid;
116
117 if (!dev || !dev->dma_mask)
118 return false;
119
120 /* No device or no PCI device */
121 if (!dev || dev->bus != &pci_bus_type)
122 return false;
123
124 devid = get_device_id(dev);
125
126 /* Out of our scope? */
127 if (devid > amd_iommu_last_bdf)
128 return false;
129
130 if (amd_iommu_rlookup_table[devid] == NULL)
131 return false;
132
133 return true;
134}
135
136static int iommu_init_device(struct device *dev)
137{
138 struct iommu_dev_data *dev_data;
139 struct pci_dev *pdev;
140 u16 devid, alias;
141
142 if (dev->archdata.iommu)
143 return 0;
144
145 dev_data = kzalloc(sizeof(*dev_data), GFP_KERNEL);
146 if (!dev_data)
147 return -ENOMEM;
148
149 dev_data->dev = dev;
150
151 devid = get_device_id(dev);
152 alias = amd_iommu_alias_table[devid];
153 pdev = pci_get_bus_and_slot(PCI_BUS(alias), alias & 0xff);
154 if (pdev)
155 dev_data->alias = &pdev->dev;
156
157 atomic_set(&dev_data->bind, 0);
158
159 dev->archdata.iommu = dev_data;
160
161
162 return 0;
163}
164
165static void iommu_uninit_device(struct device *dev)
166{
167 kfree(dev->archdata.iommu);
168}
169
170void __init amd_iommu_uninit_devices(void)
171{
172 struct pci_dev *pdev = NULL;
173
174 for_each_pci_dev(pdev) {
175
176 if (!check_device(&pdev->dev))
177 continue;
178
179 iommu_uninit_device(&pdev->dev);
180 }
181}
182
183int __init amd_iommu_init_devices(void)
184{
185 struct pci_dev *pdev = NULL;
186 int ret = 0;
187
188 for_each_pci_dev(pdev) {
189
190 if (!check_device(&pdev->dev))
191 continue;
192
193 ret = iommu_init_device(&pdev->dev);
194 if (ret)
195 goto out_free;
196 }
197
198 return 0;
199
200out_free:
201
202 amd_iommu_uninit_devices();
203
204 return ret;
205}
73#ifdef CONFIG_AMD_IOMMU_STATS 206#ifdef CONFIG_AMD_IOMMU_STATS
74 207
75/* 208/*
@@ -90,7 +223,6 @@ DECLARE_STATS_COUNTER(alloced_io_mem);
90DECLARE_STATS_COUNTER(total_map_requests); 223DECLARE_STATS_COUNTER(total_map_requests);
91 224
92static struct dentry *stats_dir; 225static struct dentry *stats_dir;
93static struct dentry *de_isolate;
94static struct dentry *de_fflush; 226static struct dentry *de_fflush;
95 227
96static void amd_iommu_stats_add(struct __iommu_counter *cnt) 228static void amd_iommu_stats_add(struct __iommu_counter *cnt)
@@ -108,9 +240,6 @@ static void amd_iommu_stats_init(void)
108 if (stats_dir == NULL) 240 if (stats_dir == NULL)
109 return; 241 return;
110 242
111 de_isolate = debugfs_create_bool("isolation", 0444, stats_dir,
112 (u32 *)&amd_iommu_isolate);
113
114 de_fflush = debugfs_create_bool("fullflush", 0444, stats_dir, 243 de_fflush = debugfs_create_bool("fullflush", 0444, stats_dir,
115 (u32 *)&amd_iommu_unmap_flush); 244 (u32 *)&amd_iommu_unmap_flush);
116 245
@@ -130,12 +259,6 @@ static void amd_iommu_stats_init(void)
130 259
131#endif 260#endif
132 261
133/* returns !0 if the IOMMU is caching non-present entries in its TLB */
134static int iommu_has_npcache(struct amd_iommu *iommu)
135{
136 return iommu->cap & (1UL << IOMMU_CAP_NPCACHE);
137}
138
139/**************************************************************************** 262/****************************************************************************
140 * 263 *
141 * Interrupt handling functions 264 * Interrupt handling functions
@@ -199,6 +322,7 @@ static void iommu_print_event(struct amd_iommu *iommu, void *__evt)
199 break; 322 break;
200 case EVENT_TYPE_ILL_CMD: 323 case EVENT_TYPE_ILL_CMD:
201 printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address); 324 printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address);
325 iommu->reset_in_progress = true;
202 reset_iommu_command_buffer(iommu); 326 reset_iommu_command_buffer(iommu);
203 dump_command(address); 327 dump_command(address);
204 break; 328 break;
@@ -321,11 +445,8 @@ static void __iommu_wait_for_completion(struct amd_iommu *iommu)
321 status &= ~MMIO_STATUS_COM_WAIT_INT_MASK; 445 status &= ~MMIO_STATUS_COM_WAIT_INT_MASK;
322 writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET); 446 writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET);
323 447
324 if (unlikely(i == EXIT_LOOP_COUNT)) { 448 if (unlikely(i == EXIT_LOOP_COUNT))
325 spin_unlock(&iommu->lock); 449 iommu->reset_in_progress = true;
326 reset_iommu_command_buffer(iommu);
327 spin_lock(&iommu->lock);
328 }
329} 450}
330 451
331/* 452/*
@@ -372,26 +493,46 @@ static int iommu_completion_wait(struct amd_iommu *iommu)
372out: 493out:
373 spin_unlock_irqrestore(&iommu->lock, flags); 494 spin_unlock_irqrestore(&iommu->lock, flags);
374 495
496 if (iommu->reset_in_progress)
497 reset_iommu_command_buffer(iommu);
498
375 return 0; 499 return 0;
376} 500}
377 501
502static void iommu_flush_complete(struct protection_domain *domain)
503{
504 int i;
505
506 for (i = 0; i < amd_iommus_present; ++i) {
507 if (!domain->dev_iommu[i])
508 continue;
509
510 /*
511 * Devices of this domain are behind this IOMMU
512 * We need to wait for completion of all commands.
513 */
514 iommu_completion_wait(amd_iommus[i]);
515 }
516}
517
378/* 518/*
379 * Command send function for invalidating a device table entry 519 * Command send function for invalidating a device table entry
380 */ 520 */
381static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid) 521static int iommu_flush_device(struct device *dev)
382{ 522{
523 struct amd_iommu *iommu;
383 struct iommu_cmd cmd; 524 struct iommu_cmd cmd;
384 int ret; 525 u16 devid;
385 526
386 BUG_ON(iommu == NULL); 527 devid = get_device_id(dev);
528 iommu = amd_iommu_rlookup_table[devid];
387 529
530 /* Build command */
388 memset(&cmd, 0, sizeof(cmd)); 531 memset(&cmd, 0, sizeof(cmd));
389 CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY); 532 CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY);
390 cmd.data[0] = devid; 533 cmd.data[0] = devid;
391 534
392 ret = iommu_queue_command(iommu, &cmd); 535 return iommu_queue_command(iommu, &cmd);
393
394 return ret;
395} 536}
396 537
397static void __iommu_build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address, 538static void __iommu_build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address,
@@ -430,11 +571,11 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
430 * It invalidates a single PTE if the range to flush is within a single 571 * It invalidates a single PTE if the range to flush is within a single
431 * page. Otherwise it flushes the whole TLB of the IOMMU. 572 * page. Otherwise it flushes the whole TLB of the IOMMU.
432 */ 573 */
433static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid, 574static void __iommu_flush_pages(struct protection_domain *domain,
434 u64 address, size_t size) 575 u64 address, size_t size, int pde)
435{ 576{
436 int s = 0; 577 int s = 0, i;
437 unsigned pages = iommu_num_pages(address, size, PAGE_SIZE); 578 unsigned long pages = iommu_num_pages(address, size, PAGE_SIZE);
438 579
439 address &= PAGE_MASK; 580 address &= PAGE_MASK;
440 581
@@ -447,142 +588,212 @@ static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid,
447 s = 1; 588 s = 1;
448 } 589 }
449 590
450 iommu_queue_inv_iommu_pages(iommu, address, domid, 0, s);
451 591
452 return 0; 592 for (i = 0; i < amd_iommus_present; ++i) {
593 if (!domain->dev_iommu[i])
594 continue;
595
596 /*
597 * Devices of this domain are behind this IOMMU
598 * We need a TLB flush
599 */
600 iommu_queue_inv_iommu_pages(amd_iommus[i], address,
601 domain->id, pde, s);
602 }
603
604 return;
453} 605}
454 606
455/* Flush the whole IO/TLB for a given protection domain */ 607static void iommu_flush_pages(struct protection_domain *domain,
456static void iommu_flush_tlb(struct amd_iommu *iommu, u16 domid) 608 u64 address, size_t size)
457{ 609{
458 u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS; 610 __iommu_flush_pages(domain, address, size, 0);
459 611}
460 INC_STATS_COUNTER(domain_flush_single);
461 612
462 iommu_queue_inv_iommu_pages(iommu, address, domid, 0, 1); 613/* Flush the whole IO/TLB for a given protection domain */
614static void iommu_flush_tlb(struct protection_domain *domain)
615{
616 __iommu_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 0);
463} 617}
464 618
465/* Flush the whole IO/TLB for a given protection domain - including PDE */ 619/* Flush the whole IO/TLB for a given protection domain - including PDE */
466static void iommu_flush_tlb_pde(struct amd_iommu *iommu, u16 domid) 620static void iommu_flush_tlb_pde(struct protection_domain *domain)
467{ 621{
468 u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS; 622 __iommu_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1);
469
470 INC_STATS_COUNTER(domain_flush_single);
471
472 iommu_queue_inv_iommu_pages(iommu, address, domid, 1, 1);
473} 623}
474 624
625
475/* 626/*
476 * This function flushes one domain on one IOMMU 627 * This function flushes the DTEs for all devices in domain
477 */ 628 */
478static void flush_domain_on_iommu(struct amd_iommu *iommu, u16 domid) 629static void iommu_flush_domain_devices(struct protection_domain *domain)
479{ 630{
480 struct iommu_cmd cmd; 631 struct iommu_dev_data *dev_data;
481 unsigned long flags; 632 unsigned long flags;
482 633
483 __iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 634 spin_lock_irqsave(&domain->lock, flags);
484 domid, 1, 1);
485 635
486 spin_lock_irqsave(&iommu->lock, flags); 636 list_for_each_entry(dev_data, &domain->dev_list, list)
487 __iommu_queue_command(iommu, &cmd); 637 iommu_flush_device(dev_data->dev);
488 __iommu_completion_wait(iommu); 638
489 __iommu_wait_for_completion(iommu); 639 spin_unlock_irqrestore(&domain->lock, flags);
490 spin_unlock_irqrestore(&iommu->lock, flags);
491} 640}
492 641
493static void flush_all_domains_on_iommu(struct amd_iommu *iommu) 642static void iommu_flush_all_domain_devices(void)
494{ 643{
495 int i; 644 struct protection_domain *domain;
645 unsigned long flags;
496 646
497 for (i = 1; i < MAX_DOMAIN_ID; ++i) { 647 spin_lock_irqsave(&amd_iommu_pd_lock, flags);
498 if (!test_bit(i, amd_iommu_pd_alloc_bitmap)) 648
499 continue; 649 list_for_each_entry(domain, &amd_iommu_pd_list, list) {
500 flush_domain_on_iommu(iommu, i); 650 iommu_flush_domain_devices(domain);
651 iommu_flush_complete(domain);
501 } 652 }
502 653
654 spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
655}
656
657void amd_iommu_flush_all_devices(void)
658{
659 iommu_flush_all_domain_devices();
503} 660}
504 661
505/* 662/*
506 * This function is used to flush the IO/TLB for a given protection domain 663 * This function uses heavy locking and may disable irqs for some time. But
507 * on every IOMMU in the system 664 * this is no issue because it is only called during resume.
508 */ 665 */
509static void iommu_flush_domain(u16 domid) 666void amd_iommu_flush_all_domains(void)
510{ 667{
511 struct amd_iommu *iommu; 668 struct protection_domain *domain;
669 unsigned long flags;
512 670
513 INC_STATS_COUNTER(domain_flush_all); 671 spin_lock_irqsave(&amd_iommu_pd_lock, flags);
514 672
515 for_each_iommu(iommu) 673 list_for_each_entry(domain, &amd_iommu_pd_list, list) {
516 flush_domain_on_iommu(iommu, domid); 674 spin_lock(&domain->lock);
675 iommu_flush_tlb_pde(domain);
676 iommu_flush_complete(domain);
677 spin_unlock(&domain->lock);
678 }
679
680 spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
517} 681}
518 682
519void amd_iommu_flush_all_domains(void) 683static void reset_iommu_command_buffer(struct amd_iommu *iommu)
520{ 684{
521 struct amd_iommu *iommu; 685 pr_err("AMD-Vi: Resetting IOMMU command buffer\n");
522 686
523 for_each_iommu(iommu) 687 if (iommu->reset_in_progress)
524 flush_all_domains_on_iommu(iommu); 688 panic("AMD-Vi: ILLEGAL_COMMAND_ERROR while resetting command buffer\n");
689
690 amd_iommu_reset_cmd_buffer(iommu);
691 amd_iommu_flush_all_devices();
692 amd_iommu_flush_all_domains();
693
694 iommu->reset_in_progress = false;
525} 695}
526 696
527static void flush_all_devices_for_iommu(struct amd_iommu *iommu) 697/****************************************************************************
698 *
699 * The functions below are used the create the page table mappings for
700 * unity mapped regions.
701 *
702 ****************************************************************************/
703
704/*
705 * This function is used to add another level to an IO page table. Adding
706 * another level increases the size of the address space by 9 bits to a size up
707 * to 64 bits.
708 */
709static bool increase_address_space(struct protection_domain *domain,
710 gfp_t gfp)
528{ 711{
529 int i; 712 u64 *pte;
530 713
531 for (i = 0; i <= amd_iommu_last_bdf; ++i) { 714 if (domain->mode == PAGE_MODE_6_LEVEL)
532 if (iommu != amd_iommu_rlookup_table[i]) 715 /* address space already 64 bit large */
533 continue; 716 return false;
534 717
535 iommu_queue_inv_dev_entry(iommu, i); 718 pte = (void *)get_zeroed_page(gfp);
536 iommu_completion_wait(iommu); 719 if (!pte)
537 } 720 return false;
721
722 *pte = PM_LEVEL_PDE(domain->mode,
723 virt_to_phys(domain->pt_root));
724 domain->pt_root = pte;
725 domain->mode += 1;
726 domain->updated = true;
727
728 return true;
538} 729}
539 730
540static void flush_devices_by_domain(struct protection_domain *domain) 731static u64 *alloc_pte(struct protection_domain *domain,
732 unsigned long address,
733 int end_lvl,
734 u64 **pte_page,
735 gfp_t gfp)
541{ 736{
542 struct amd_iommu *iommu; 737 u64 *pte, *page;
543 int i; 738 int level;
544 739
545 for (i = 0; i <= amd_iommu_last_bdf; ++i) { 740 while (address > PM_LEVEL_SIZE(domain->mode))
546 if ((domain == NULL && amd_iommu_pd_table[i] == NULL) || 741 increase_address_space(domain, gfp);
547 (amd_iommu_pd_table[i] != domain))
548 continue;
549 742
550 iommu = amd_iommu_rlookup_table[i]; 743 level = domain->mode - 1;
551 if (!iommu) 744 pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
552 continue; 745
746 while (level > end_lvl) {
747 if (!IOMMU_PTE_PRESENT(*pte)) {
748 page = (u64 *)get_zeroed_page(gfp);
749 if (!page)
750 return NULL;
751 *pte = PM_LEVEL_PDE(level, virt_to_phys(page));
752 }
553 753
554 iommu_queue_inv_dev_entry(iommu, i); 754 level -= 1;
555 iommu_completion_wait(iommu); 755
756 pte = IOMMU_PTE_PAGE(*pte);
757
758 if (pte_page && level == end_lvl)
759 *pte_page = pte;
760
761 pte = &pte[PM_LEVEL_INDEX(level, address)];
556 } 762 }
763
764 return pte;
557} 765}
558 766
559static void reset_iommu_command_buffer(struct amd_iommu *iommu) 767/*
768 * This function checks if there is a PTE for a given dma address. If
769 * there is one, it returns the pointer to it.
770 */
771static u64 *fetch_pte(struct protection_domain *domain,
772 unsigned long address, int map_size)
560{ 773{
561 pr_err("AMD-Vi: Resetting IOMMU command buffer\n"); 774 int level;
775 u64 *pte;
562 776
563 if (iommu->reset_in_progress) 777 level = domain->mode - 1;
564 panic("AMD-Vi: ILLEGAL_COMMAND_ERROR while resetting command buffer\n"); 778 pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
565 779
566 iommu->reset_in_progress = true; 780 while (level > map_size) {
781 if (!IOMMU_PTE_PRESENT(*pte))
782 return NULL;
567 783
568 amd_iommu_reset_cmd_buffer(iommu); 784 level -= 1;
569 flush_all_devices_for_iommu(iommu);
570 flush_all_domains_on_iommu(iommu);
571 785
572 iommu->reset_in_progress = false; 786 pte = IOMMU_PTE_PAGE(*pte);
573} 787 pte = &pte[PM_LEVEL_INDEX(level, address)];
574 788
575void amd_iommu_flush_all_devices(void) 789 if ((PM_PTE_LEVEL(*pte) == 0) && level != map_size) {
576{ 790 pte = NULL;
577 flush_devices_by_domain(NULL); 791 break;
578} 792 }
793 }
579 794
580/**************************************************************************** 795 return pte;
581 * 796}
582 * The functions below are used the create the page table mappings for
583 * unity mapped regions.
584 *
585 ****************************************************************************/
586 797
587/* 798/*
588 * Generic mapping functions. It maps a physical address into a DMA 799 * Generic mapping functions. It maps a physical address into a DMA
@@ -654,28 +865,6 @@ static int iommu_for_unity_map(struct amd_iommu *iommu,
654} 865}
655 866
656/* 867/*
657 * Init the unity mappings for a specific IOMMU in the system
658 *
659 * Basically iterates over all unity mapping entries and applies them to
660 * the default domain DMA of that IOMMU if necessary.
661 */
662static int iommu_init_unity_mappings(struct amd_iommu *iommu)
663{
664 struct unity_map_entry *entry;
665 int ret;
666
667 list_for_each_entry(entry, &amd_iommu_unity_map, list) {
668 if (!iommu_for_unity_map(iommu, entry))
669 continue;
670 ret = dma_ops_unity_map(iommu->default_dom, entry);
671 if (ret)
672 return ret;
673 }
674
675 return 0;
676}
677
678/*
679 * This function actually applies the mapping to the page table of the 868 * This function actually applies the mapping to the page table of the
680 * dma_ops domain. 869 * dma_ops domain.
681 */ 870 */
@@ -704,6 +893,28 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
704} 893}
705 894
706/* 895/*
896 * Init the unity mappings for a specific IOMMU in the system
897 *
898 * Basically iterates over all unity mapping entries and applies them to
899 * the default domain DMA of that IOMMU if necessary.
900 */
901static int iommu_init_unity_mappings(struct amd_iommu *iommu)
902{
903 struct unity_map_entry *entry;
904 int ret;
905
906 list_for_each_entry(entry, &amd_iommu_unity_map, list) {
907 if (!iommu_for_unity_map(iommu, entry))
908 continue;
909 ret = dma_ops_unity_map(iommu->default_dom, entry);
910 if (ret)
911 return ret;
912 }
913
914 return 0;
915}
916
917/*
707 * Inits the unity mappings required for a specific device 918 * Inits the unity mappings required for a specific device
708 */ 919 */
709static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom, 920static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
@@ -740,34 +951,23 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
740 */ 951 */
741 952
742/* 953/*
743 * This function checks if there is a PTE for a given dma address. If 954 * Used to reserve address ranges in the aperture (e.g. for exclusion
744 * there is one, it returns the pointer to it. 955 * ranges.
745 */ 956 */
746static u64 *fetch_pte(struct protection_domain *domain, 957static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
747 unsigned long address, int map_size) 958 unsigned long start_page,
959 unsigned int pages)
748{ 960{
749 int level; 961 unsigned int i, last_page = dom->aperture_size >> PAGE_SHIFT;
750 u64 *pte;
751
752 level = domain->mode - 1;
753 pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
754
755 while (level > map_size) {
756 if (!IOMMU_PTE_PRESENT(*pte))
757 return NULL;
758
759 level -= 1;
760 962
761 pte = IOMMU_PTE_PAGE(*pte); 963 if (start_page + pages > last_page)
762 pte = &pte[PM_LEVEL_INDEX(level, address)]; 964 pages = last_page - start_page;
763 965
764 if ((PM_PTE_LEVEL(*pte) == 0) && level != map_size) { 966 for (i = start_page; i < start_page + pages; ++i) {
765 pte = NULL; 967 int index = i / APERTURE_RANGE_PAGES;
766 break; 968 int page = i % APERTURE_RANGE_PAGES;
767 } 969 __set_bit(page, dom->aperture[index]->bitmap);
768 } 970 }
769
770 return pte;
771} 971}
772 972
773/* 973/*
@@ -775,12 +975,12 @@ static u64 *fetch_pte(struct protection_domain *domain,
775 * aperture in case of dma_ops domain allocation or address allocation 975 * aperture in case of dma_ops domain allocation or address allocation
776 * failure. 976 * failure.
777 */ 977 */
778static int alloc_new_range(struct amd_iommu *iommu, 978static int alloc_new_range(struct dma_ops_domain *dma_dom,
779 struct dma_ops_domain *dma_dom,
780 bool populate, gfp_t gfp) 979 bool populate, gfp_t gfp)
781{ 980{
782 int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT; 981 int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT;
783 int i; 982 struct amd_iommu *iommu;
983 unsigned long i;
784 984
785#ifdef CONFIG_IOMMU_STRESS 985#ifdef CONFIG_IOMMU_STRESS
786 populate = false; 986 populate = false;
@@ -819,14 +1019,17 @@ static int alloc_new_range(struct amd_iommu *iommu,
819 dma_dom->aperture_size += APERTURE_RANGE_SIZE; 1019 dma_dom->aperture_size += APERTURE_RANGE_SIZE;
820 1020
821 /* Intialize the exclusion range if necessary */ 1021 /* Intialize the exclusion range if necessary */
822 if (iommu->exclusion_start && 1022 for_each_iommu(iommu) {
823 iommu->exclusion_start >= dma_dom->aperture[index]->offset && 1023 if (iommu->exclusion_start &&
824 iommu->exclusion_start < dma_dom->aperture_size) { 1024 iommu->exclusion_start >= dma_dom->aperture[index]->offset
825 unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT; 1025 && iommu->exclusion_start < dma_dom->aperture_size) {
826 int pages = iommu_num_pages(iommu->exclusion_start, 1026 unsigned long startpage;
827 iommu->exclusion_length, 1027 int pages = iommu_num_pages(iommu->exclusion_start,
828 PAGE_SIZE); 1028 iommu->exclusion_length,
829 dma_ops_reserve_addresses(dma_dom, startpage, pages); 1029 PAGE_SIZE);
1030 startpage = iommu->exclusion_start >> PAGE_SHIFT;
1031 dma_ops_reserve_addresses(dma_dom, startpage, pages);
1032 }
830 } 1033 }
831 1034
832 /* 1035 /*
@@ -928,7 +1131,7 @@ static unsigned long dma_ops_alloc_addresses(struct device *dev,
928 } 1131 }
929 1132
930 if (unlikely(address == -1)) 1133 if (unlikely(address == -1))
931 address = bad_dma_address; 1134 address = DMA_ERROR_CODE;
932 1135
933 WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size); 1136 WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size);
934 1137
@@ -959,7 +1162,7 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom,
959 1162
960 address = (address % APERTURE_RANGE_SIZE) >> PAGE_SHIFT; 1163 address = (address % APERTURE_RANGE_SIZE) >> PAGE_SHIFT;
961 1164
962 iommu_area_free(range->bitmap, address, pages); 1165 bitmap_clear(range->bitmap, address, pages);
963 1166
964} 1167}
965 1168
@@ -973,6 +1176,31 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom,
973 * 1176 *
974 ****************************************************************************/ 1177 ****************************************************************************/
975 1178
1179/*
1180 * This function adds a protection domain to the global protection domain list
1181 */
1182static void add_domain_to_list(struct protection_domain *domain)
1183{
1184 unsigned long flags;
1185
1186 spin_lock_irqsave(&amd_iommu_pd_lock, flags);
1187 list_add(&domain->list, &amd_iommu_pd_list);
1188 spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
1189}
1190
1191/*
1192 * This function removes a protection domain to the global
1193 * protection domain list
1194 */
1195static void del_domain_from_list(struct protection_domain *domain)
1196{
1197 unsigned long flags;
1198
1199 spin_lock_irqsave(&amd_iommu_pd_lock, flags);
1200 list_del(&domain->list);
1201 spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
1202}
1203
976static u16 domain_id_alloc(void) 1204static u16 domain_id_alloc(void)
977{ 1205{
978 unsigned long flags; 1206 unsigned long flags;
@@ -1000,26 +1228,6 @@ static void domain_id_free(int id)
1000 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); 1228 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1001} 1229}
1002 1230
1003/*
1004 * Used to reserve address ranges in the aperture (e.g. for exclusion
1005 * ranges.
1006 */
1007static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
1008 unsigned long start_page,
1009 unsigned int pages)
1010{
1011 unsigned int i, last_page = dom->aperture_size >> PAGE_SHIFT;
1012
1013 if (start_page + pages > last_page)
1014 pages = last_page - start_page;
1015
1016 for (i = start_page; i < start_page + pages; ++i) {
1017 int index = i / APERTURE_RANGE_PAGES;
1018 int page = i % APERTURE_RANGE_PAGES;
1019 __set_bit(page, dom->aperture[index]->bitmap);
1020 }
1021}
1022
1023static void free_pagetable(struct protection_domain *domain) 1231static void free_pagetable(struct protection_domain *domain)
1024{ 1232{
1025 int i, j; 1233 int i, j;
@@ -1061,6 +1269,8 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom)
1061 if (!dom) 1269 if (!dom)
1062 return; 1270 return;
1063 1271
1272 del_domain_from_list(&dom->domain);
1273
1064 free_pagetable(&dom->domain); 1274 free_pagetable(&dom->domain);
1065 1275
1066 for (i = 0; i < APERTURE_MAX_RANGES; ++i) { 1276 for (i = 0; i < APERTURE_MAX_RANGES; ++i) {
@@ -1078,7 +1288,7 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom)
1078 * It also intializes the page table and the address allocator data 1288 * It also intializes the page table and the address allocator data
1079 * structures required for the dma_ops interface 1289 * structures required for the dma_ops interface
1080 */ 1290 */
1081static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu) 1291static struct dma_ops_domain *dma_ops_domain_alloc(void)
1082{ 1292{
1083 struct dma_ops_domain *dma_dom; 1293 struct dma_ops_domain *dma_dom;
1084 1294
@@ -1091,6 +1301,7 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu)
1091 dma_dom->domain.id = domain_id_alloc(); 1301 dma_dom->domain.id = domain_id_alloc();
1092 if (dma_dom->domain.id == 0) 1302 if (dma_dom->domain.id == 0)
1093 goto free_dma_dom; 1303 goto free_dma_dom;
1304 INIT_LIST_HEAD(&dma_dom->domain.dev_list);
1094 dma_dom->domain.mode = PAGE_MODE_2_LEVEL; 1305 dma_dom->domain.mode = PAGE_MODE_2_LEVEL;
1095 dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL); 1306 dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL);
1096 dma_dom->domain.flags = PD_DMA_OPS_MASK; 1307 dma_dom->domain.flags = PD_DMA_OPS_MASK;
@@ -1101,7 +1312,9 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu)
1101 dma_dom->need_flush = false; 1312 dma_dom->need_flush = false;
1102 dma_dom->target_dev = 0xffff; 1313 dma_dom->target_dev = 0xffff;
1103 1314
1104 if (alloc_new_range(iommu, dma_dom, true, GFP_KERNEL)) 1315 add_domain_to_list(&dma_dom->domain);
1316
1317 if (alloc_new_range(dma_dom, true, GFP_KERNEL))
1105 goto free_dma_dom; 1318 goto free_dma_dom;
1106 1319
1107 /* 1320 /*
@@ -1129,22 +1342,6 @@ static bool dma_ops_domain(struct protection_domain *domain)
1129 return domain->flags & PD_DMA_OPS_MASK; 1342 return domain->flags & PD_DMA_OPS_MASK;
1130} 1343}
1131 1344
1132/*
1133 * Find out the protection domain structure for a given PCI device. This
1134 * will give us the pointer to the page table root for example.
1135 */
1136static struct protection_domain *domain_for_device(u16 devid)
1137{
1138 struct protection_domain *dom;
1139 unsigned long flags;
1140
1141 read_lock_irqsave(&amd_iommu_devtable_lock, flags);
1142 dom = amd_iommu_pd_table[devid];
1143 read_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1144
1145 return dom;
1146}
1147
1148static void set_dte_entry(u16 devid, struct protection_domain *domain) 1345static void set_dte_entry(u16 devid, struct protection_domain *domain)
1149{ 1346{
1150 u64 pte_root = virt_to_phys(domain->pt_root); 1347 u64 pte_root = virt_to_phys(domain->pt_root);
@@ -1156,42 +1353,123 @@ static void set_dte_entry(u16 devid, struct protection_domain *domain)
1156 amd_iommu_dev_table[devid].data[2] = domain->id; 1353 amd_iommu_dev_table[devid].data[2] = domain->id;
1157 amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root); 1354 amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root);
1158 amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root); 1355 amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root);
1356}
1357
1358static void clear_dte_entry(u16 devid)
1359{
1360 /* remove entry from the device table seen by the hardware */
1361 amd_iommu_dev_table[devid].data[0] = IOMMU_PTE_P | IOMMU_PTE_TV;
1362 amd_iommu_dev_table[devid].data[1] = 0;
1363 amd_iommu_dev_table[devid].data[2] = 0;
1364
1365 amd_iommu_apply_erratum_63(devid);
1366}
1367
1368static void do_attach(struct device *dev, struct protection_domain *domain)
1369{
1370 struct iommu_dev_data *dev_data;
1371 struct amd_iommu *iommu;
1372 u16 devid;
1373
1374 devid = get_device_id(dev);
1375 iommu = amd_iommu_rlookup_table[devid];
1376 dev_data = get_dev_data(dev);
1377
1378 /* Update data structures */
1379 dev_data->domain = domain;
1380 list_add(&dev_data->list, &domain->dev_list);
1381 set_dte_entry(devid, domain);
1382
1383 /* Do reference counting */
1384 domain->dev_iommu[iommu->index] += 1;
1385 domain->dev_cnt += 1;
1159 1386
1160 amd_iommu_pd_table[devid] = domain; 1387 /* Flush the DTE entry */
1388 iommu_flush_device(dev);
1389}
1390
1391static void do_detach(struct device *dev)
1392{
1393 struct iommu_dev_data *dev_data;
1394 struct amd_iommu *iommu;
1395 u16 devid;
1396
1397 devid = get_device_id(dev);
1398 iommu = amd_iommu_rlookup_table[devid];
1399 dev_data = get_dev_data(dev);
1400
1401 /* decrease reference counters */
1402 dev_data->domain->dev_iommu[iommu->index] -= 1;
1403 dev_data->domain->dev_cnt -= 1;
1404
1405 /* Update data structures */
1406 dev_data->domain = NULL;
1407 list_del(&dev_data->list);
1408 clear_dte_entry(devid);
1409
1410 /* Flush the DTE entry */
1411 iommu_flush_device(dev);
1161} 1412}
1162 1413
1163/* 1414/*
1164 * If a device is not yet associated with a domain, this function does 1415 * If a device is not yet associated with a domain, this function does
1165 * assigns it visible for the hardware 1416 * assigns it visible for the hardware
1166 */ 1417 */
1167static void __attach_device(struct amd_iommu *iommu, 1418static int __attach_device(struct device *dev,
1168 struct protection_domain *domain, 1419 struct protection_domain *domain)
1169 u16 devid)
1170{ 1420{
1421 struct iommu_dev_data *dev_data, *alias_data;
1422
1423 dev_data = get_dev_data(dev);
1424 alias_data = get_dev_data(dev_data->alias);
1425
1426 if (!alias_data)
1427 return -EINVAL;
1428
1171 /* lock domain */ 1429 /* lock domain */
1172 spin_lock(&domain->lock); 1430 spin_lock(&domain->lock);
1173 1431
1174 /* update DTE entry */ 1432 /* Some sanity checks */
1175 set_dte_entry(devid, domain); 1433 if (alias_data->domain != NULL &&
1434 alias_data->domain != domain)
1435 return -EBUSY;
1176 1436
1177 domain->dev_cnt += 1; 1437 if (dev_data->domain != NULL &&
1438 dev_data->domain != domain)
1439 return -EBUSY;
1440
1441 /* Do real assignment */
1442 if (dev_data->alias != dev) {
1443 alias_data = get_dev_data(dev_data->alias);
1444 if (alias_data->domain == NULL)
1445 do_attach(dev_data->alias, domain);
1446
1447 atomic_inc(&alias_data->bind);
1448 }
1449
1450 if (dev_data->domain == NULL)
1451 do_attach(dev, domain);
1452
1453 atomic_inc(&dev_data->bind);
1178 1454
1179 /* ready */ 1455 /* ready */
1180 spin_unlock(&domain->lock); 1456 spin_unlock(&domain->lock);
1457
1458 return 0;
1181} 1459}
1182 1460
1183/* 1461/*
1184 * If a device is not yet associated with a domain, this function does 1462 * If a device is not yet associated with a domain, this function does
1185 * assigns it visible for the hardware 1463 * assigns it visible for the hardware
1186 */ 1464 */
1187static void attach_device(struct amd_iommu *iommu, 1465static int attach_device(struct device *dev,
1188 struct protection_domain *domain, 1466 struct protection_domain *domain)
1189 u16 devid)
1190{ 1467{
1191 unsigned long flags; 1468 unsigned long flags;
1469 int ret;
1192 1470
1193 write_lock_irqsave(&amd_iommu_devtable_lock, flags); 1471 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
1194 __attach_device(iommu, domain, devid); 1472 ret = __attach_device(dev, domain);
1195 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); 1473 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1196 1474
1197 /* 1475 /*
@@ -1199,96 +1477,130 @@ static void attach_device(struct amd_iommu *iommu,
1199 * left the caches in the IOMMU dirty. So we have to flush 1477 * left the caches in the IOMMU dirty. So we have to flush
1200 * here to evict all dirty stuff. 1478 * here to evict all dirty stuff.
1201 */ 1479 */
1202 iommu_queue_inv_dev_entry(iommu, devid); 1480 iommu_flush_tlb_pde(domain);
1203 iommu_flush_tlb_pde(iommu, domain->id); 1481
1482 return ret;
1204} 1483}
1205 1484
1206/* 1485/*
1207 * Removes a device from a protection domain (unlocked) 1486 * Removes a device from a protection domain (unlocked)
1208 */ 1487 */
1209static void __detach_device(struct protection_domain *domain, u16 devid) 1488static void __detach_device(struct device *dev)
1210{ 1489{
1490 struct iommu_dev_data *dev_data = get_dev_data(dev);
1491 struct iommu_dev_data *alias_data;
1492 struct protection_domain *domain;
1493 unsigned long flags;
1211 1494
1212 /* lock domain */ 1495 BUG_ON(!dev_data->domain);
1213 spin_lock(&domain->lock);
1214 1496
1215 /* remove domain from the lookup table */ 1497 domain = dev_data->domain;
1216 amd_iommu_pd_table[devid] = NULL;
1217 1498
1218 /* remove entry from the device table seen by the hardware */ 1499 spin_lock_irqsave(&domain->lock, flags);
1219 amd_iommu_dev_table[devid].data[0] = IOMMU_PTE_P | IOMMU_PTE_TV;
1220 amd_iommu_dev_table[devid].data[1] = 0;
1221 amd_iommu_dev_table[devid].data[2] = 0;
1222 1500
1223 /* decrease reference counter */ 1501 if (dev_data->alias != dev) {
1224 domain->dev_cnt -= 1; 1502 alias_data = get_dev_data(dev_data->alias);
1503 if (atomic_dec_and_test(&alias_data->bind))
1504 do_detach(dev_data->alias);
1505 }
1225 1506
1226 /* ready */ 1507 if (atomic_dec_and_test(&dev_data->bind))
1227 spin_unlock(&domain->lock); 1508 do_detach(dev);
1509
1510 spin_unlock_irqrestore(&domain->lock, flags);
1228 1511
1229 /* 1512 /*
1230 * If we run in passthrough mode the device must be assigned to the 1513 * If we run in passthrough mode the device must be assigned to the
1231 * passthrough domain if it is detached from any other domain 1514 * passthrough domain if it is detached from any other domain.
1515 * Make sure we can deassign from the pt_domain itself.
1232 */ 1516 */
1233 if (iommu_pass_through) { 1517 if (iommu_pass_through &&
1234 struct amd_iommu *iommu = amd_iommu_rlookup_table[devid]; 1518 (dev_data->domain == NULL && domain != pt_domain))
1235 __attach_device(iommu, pt_domain, devid); 1519 __attach_device(dev, pt_domain);
1236 }
1237} 1520}
1238 1521
1239/* 1522/*
1240 * Removes a device from a protection domain (with devtable_lock held) 1523 * Removes a device from a protection domain (with devtable_lock held)
1241 */ 1524 */
1242static void detach_device(struct protection_domain *domain, u16 devid) 1525static void detach_device(struct device *dev)
1243{ 1526{
1244 unsigned long flags; 1527 unsigned long flags;
1245 1528
1246 /* lock device table */ 1529 /* lock device table */
1247 write_lock_irqsave(&amd_iommu_devtable_lock, flags); 1530 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
1248 __detach_device(domain, devid); 1531 __detach_device(dev);
1249 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); 1532 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1250} 1533}
1251 1534
1535/*
1536 * Find out the protection domain structure for a given PCI device. This
1537 * will give us the pointer to the page table root for example.
1538 */
1539static struct protection_domain *domain_for_device(struct device *dev)
1540{
1541 struct protection_domain *dom;
1542 struct iommu_dev_data *dev_data, *alias_data;
1543 unsigned long flags;
1544 u16 devid, alias;
1545
1546 devid = get_device_id(dev);
1547 alias = amd_iommu_alias_table[devid];
1548 dev_data = get_dev_data(dev);
1549 alias_data = get_dev_data(dev_data->alias);
1550 if (!alias_data)
1551 return NULL;
1552
1553 read_lock_irqsave(&amd_iommu_devtable_lock, flags);
1554 dom = dev_data->domain;
1555 if (dom == NULL &&
1556 alias_data->domain != NULL) {
1557 __attach_device(dev, alias_data->domain);
1558 dom = alias_data->domain;
1559 }
1560
1561 read_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1562
1563 return dom;
1564}
1565
1252static int device_change_notifier(struct notifier_block *nb, 1566static int device_change_notifier(struct notifier_block *nb,
1253 unsigned long action, void *data) 1567 unsigned long action, void *data)
1254{ 1568{
1255 struct device *dev = data; 1569 struct device *dev = data;
1256 struct pci_dev *pdev = to_pci_dev(dev); 1570 u16 devid;
1257 u16 devid = calc_devid(pdev->bus->number, pdev->devfn);
1258 struct protection_domain *domain; 1571 struct protection_domain *domain;
1259 struct dma_ops_domain *dma_domain; 1572 struct dma_ops_domain *dma_domain;
1260 struct amd_iommu *iommu; 1573 struct amd_iommu *iommu;
1261 unsigned long flags; 1574 unsigned long flags;
1262 1575
1263 if (devid > amd_iommu_last_bdf) 1576 if (!check_device(dev))
1264 goto out; 1577 return 0;
1265
1266 devid = amd_iommu_alias_table[devid];
1267
1268 iommu = amd_iommu_rlookup_table[devid];
1269 if (iommu == NULL)
1270 goto out;
1271
1272 domain = domain_for_device(devid);
1273 1578
1274 if (domain && !dma_ops_domain(domain)) 1579 devid = get_device_id(dev);
1275 WARN_ONCE(1, "AMD IOMMU WARNING: device %s already bound " 1580 iommu = amd_iommu_rlookup_table[devid];
1276 "to a non-dma-ops domain\n", dev_name(dev));
1277 1581
1278 switch (action) { 1582 switch (action) {
1279 case BUS_NOTIFY_UNBOUND_DRIVER: 1583 case BUS_NOTIFY_UNBOUND_DRIVER:
1584
1585 domain = domain_for_device(dev);
1586
1280 if (!domain) 1587 if (!domain)
1281 goto out; 1588 goto out;
1282 if (iommu_pass_through) 1589 if (iommu_pass_through)
1283 break; 1590 break;
1284 detach_device(domain, devid); 1591 detach_device(dev);
1285 break; 1592 break;
1286 case BUS_NOTIFY_ADD_DEVICE: 1593 case BUS_NOTIFY_ADD_DEVICE:
1594
1595 iommu_init_device(dev);
1596
1597 domain = domain_for_device(dev);
1598
1287 /* allocate a protection domain if a device is added */ 1599 /* allocate a protection domain if a device is added */
1288 dma_domain = find_protection_domain(devid); 1600 dma_domain = find_protection_domain(devid);
1289 if (dma_domain) 1601 if (dma_domain)
1290 goto out; 1602 goto out;
1291 dma_domain = dma_ops_domain_alloc(iommu); 1603 dma_domain = dma_ops_domain_alloc();
1292 if (!dma_domain) 1604 if (!dma_domain)
1293 goto out; 1605 goto out;
1294 dma_domain->target_dev = devid; 1606 dma_domain->target_dev = devid;
@@ -1298,11 +1610,15 @@ static int device_change_notifier(struct notifier_block *nb,
1298 spin_unlock_irqrestore(&iommu_pd_list_lock, flags); 1610 spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
1299 1611
1300 break; 1612 break;
1613 case BUS_NOTIFY_DEL_DEVICE:
1614
1615 iommu_uninit_device(dev);
1616
1301 default: 1617 default:
1302 goto out; 1618 goto out;
1303 } 1619 }
1304 1620
1305 iommu_queue_inv_dev_entry(iommu, devid); 1621 iommu_flush_device(dev);
1306 iommu_completion_wait(iommu); 1622 iommu_completion_wait(iommu);
1307 1623
1308out: 1624out:
@@ -1313,6 +1629,11 @@ static struct notifier_block device_nb = {
1313 .notifier_call = device_change_notifier, 1629 .notifier_call = device_change_notifier,
1314}; 1630};
1315 1631
1632void amd_iommu_init_notifier(void)
1633{
1634 bus_register_notifier(&pci_bus_type, &device_nb);
1635}
1636
1316/***************************************************************************** 1637/*****************************************************************************
1317 * 1638 *
1318 * The next functions belong to the dma_ops mapping/unmapping code. 1639 * The next functions belong to the dma_ops mapping/unmapping code.
@@ -1320,106 +1641,46 @@ static struct notifier_block device_nb = {
1320 *****************************************************************************/ 1641 *****************************************************************************/
1321 1642
1322/* 1643/*
1323 * This function checks if the driver got a valid device from the caller to
1324 * avoid dereferencing invalid pointers.
1325 */
1326static bool check_device(struct device *dev)
1327{
1328 if (!dev || !dev->dma_mask)
1329 return false;
1330
1331 return true;
1332}
1333
1334/*
1335 * In this function the list of preallocated protection domains is traversed to
1336 * find the domain for a specific device
1337 */
1338static struct dma_ops_domain *find_protection_domain(u16 devid)
1339{
1340 struct dma_ops_domain *entry, *ret = NULL;
1341 unsigned long flags;
1342
1343 if (list_empty(&iommu_pd_list))
1344 return NULL;
1345
1346 spin_lock_irqsave(&iommu_pd_list_lock, flags);
1347
1348 list_for_each_entry(entry, &iommu_pd_list, list) {
1349 if (entry->target_dev == devid) {
1350 ret = entry;
1351 break;
1352 }
1353 }
1354
1355 spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
1356
1357 return ret;
1358}
1359
1360/*
1361 * In the dma_ops path we only have the struct device. This function 1644 * In the dma_ops path we only have the struct device. This function
1362 * finds the corresponding IOMMU, the protection domain and the 1645 * finds the corresponding IOMMU, the protection domain and the
1363 * requestor id for a given device. 1646 * requestor id for a given device.
1364 * If the device is not yet associated with a domain this is also done 1647 * If the device is not yet associated with a domain this is also done
1365 * in this function. 1648 * in this function.
1366 */ 1649 */
1367static int get_device_resources(struct device *dev, 1650static struct protection_domain *get_domain(struct device *dev)
1368 struct amd_iommu **iommu,
1369 struct protection_domain **domain,
1370 u16 *bdf)
1371{ 1651{
1652 struct protection_domain *domain;
1372 struct dma_ops_domain *dma_dom; 1653 struct dma_ops_domain *dma_dom;
1373 struct pci_dev *pcidev; 1654 u16 devid = get_device_id(dev);
1374 u16 _bdf;
1375
1376 *iommu = NULL;
1377 *domain = NULL;
1378 *bdf = 0xffff;
1379 1655
1380 if (dev->bus != &pci_bus_type) 1656 if (!check_device(dev))
1381 return 0; 1657 return ERR_PTR(-EINVAL);
1382
1383 pcidev = to_pci_dev(dev);
1384 _bdf = calc_devid(pcidev->bus->number, pcidev->devfn);
1385 1658
1386 /* device not translated by any IOMMU in the system? */ 1659 domain = domain_for_device(dev);
1387 if (_bdf > amd_iommu_last_bdf) 1660 if (domain != NULL && !dma_ops_domain(domain))
1388 return 0; 1661 return ERR_PTR(-EBUSY);
1389 1662
1390 *bdf = amd_iommu_alias_table[_bdf]; 1663 if (domain != NULL)
1664 return domain;
1391 1665
1392 *iommu = amd_iommu_rlookup_table[*bdf]; 1666 /* Device not bound yet - bind it */
1393 if (*iommu == NULL) 1667 dma_dom = find_protection_domain(devid);
1394 return 0; 1668 if (!dma_dom)
1395 *domain = domain_for_device(*bdf); 1669 dma_dom = amd_iommu_rlookup_table[devid]->default_dom;
1396 if (*domain == NULL) { 1670 attach_device(dev, &dma_dom->domain);
1397 dma_dom = find_protection_domain(*bdf); 1671 DUMP_printk("Using protection domain %d for device %s\n",
1398 if (!dma_dom) 1672 dma_dom->domain.id, dev_name(dev));
1399 dma_dom = (*iommu)->default_dom;
1400 *domain = &dma_dom->domain;
1401 attach_device(*iommu, *domain, *bdf);
1402 DUMP_printk("Using protection domain %d for device %s\n",
1403 (*domain)->id, dev_name(dev));
1404 }
1405
1406 if (domain_for_device(_bdf) == NULL)
1407 attach_device(*iommu, *domain, _bdf);
1408 1673
1409 return 1; 1674 return &dma_dom->domain;
1410} 1675}
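Caller-side pattern for the new get_domain() interface, shown here as a minimal illustrative sketch (modeled on the map_page() hunk further down; the names come from this diff, but the sketch itself is not part of the patch):

	struct protection_domain *domain;

	domain = get_domain(dev);
	if (PTR_ERR(domain) == -EINVAL)
		return (dma_addr_t)paddr;	/* device not handled by any AMD IOMMU */
	else if (IS_ERR(domain))
		return DMA_ERROR_CODE;		/* handled, but unusable for dma_ops */

	/* ... map through domain->priv, then iommu_flush_complete(domain) ... */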
1411 1676
1412static void update_device_table(struct protection_domain *domain) 1677static void update_device_table(struct protection_domain *domain)
1413{ 1678{
1414 unsigned long flags; 1679 struct iommu_dev_data *dev_data;
1415 int i;
1416 1680
1417 for (i = 0; i <= amd_iommu_last_bdf; ++i) { 1681 list_for_each_entry(dev_data, &domain->dev_list, list) {
1418 if (amd_iommu_pd_table[i] != domain) 1682 u16 devid = get_device_id(dev_data->dev);
1419 continue; 1683 set_dte_entry(devid, domain);
1420 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
1421 set_dte_entry(i, domain);
1422 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1423 } 1684 }
1424} 1685}
1425 1686
@@ -1429,76 +1690,13 @@ static void update_domain(struct protection_domain *domain)
1429 return; 1690 return;
1430 1691
1431 update_device_table(domain); 1692 update_device_table(domain);
1432 flush_devices_by_domain(domain); 1693 iommu_flush_domain_devices(domain);
1433 iommu_flush_domain(domain->id); 1694 iommu_flush_tlb_pde(domain);
1434 1695
1435 domain->updated = false; 1696 domain->updated = false;
1436} 1697}
1437 1698
1438/* 1699/*
1439 * This function is used to add another level to an IO page table. Adding
1440 * another level increases the size of the address space by 9 bits to a size up
1441 * to 64 bits.
1442 */
1443static bool increase_address_space(struct protection_domain *domain,
1444 gfp_t gfp)
1445{
1446 u64 *pte;
1447
1448 if (domain->mode == PAGE_MODE_6_LEVEL)
1449 /* address space already 64 bit large */
1450 return false;
1451
1452 pte = (void *)get_zeroed_page(gfp);
1453 if (!pte)
1454 return false;
1455
1456 *pte = PM_LEVEL_PDE(domain->mode,
1457 virt_to_phys(domain->pt_root));
1458 domain->pt_root = pte;
1459 domain->mode += 1;
1460 domain->updated = true;
1461
1462 return true;
1463}
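(A note on the arithmetic in the comment above: with 4 KiB pages the in-page offset is 12 bits and each table level indexes 9 bits, so a 3-level table covers 12 + 3*9 = 39 bits of IO virtual address space, while the 6-level maximum would reach 12 + 6*9 = 66 bits and is therefore capped at 64 bits; this is a hedged reading of the PAGE_MODE_*/PM_LEVEL_* macros, not something stated in this hunk.)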
1464
1465static u64 *alloc_pte(struct protection_domain *domain,
1466 unsigned long address,
1467 int end_lvl,
1468 u64 **pte_page,
1469 gfp_t gfp)
1470{
1471 u64 *pte, *page;
1472 int level;
1473
1474 while (address > PM_LEVEL_SIZE(domain->mode))
1475 increase_address_space(domain, gfp);
1476
1477 level = domain->mode - 1;
1478 pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
1479
1480 while (level > end_lvl) {
1481 if (!IOMMU_PTE_PRESENT(*pte)) {
1482 page = (u64 *)get_zeroed_page(gfp);
1483 if (!page)
1484 return NULL;
1485 *pte = PM_LEVEL_PDE(level, virt_to_phys(page));
1486 }
1487
1488 level -= 1;
1489
1490 pte = IOMMU_PTE_PAGE(*pte);
1491
1492 if (pte_page && level == end_lvl)
1493 *pte_page = pte;
1494
1495 pte = &pte[PM_LEVEL_INDEX(level, address)];
1496 }
1497
1498 return pte;
1499}
1500
1501/*
1502 * This function fetches the PTE for a given address in the aperture 1700 * This function fetches the PTE for a given address in the aperture
1503 */ 1701 */
1504static u64* dma_ops_get_pte(struct dma_ops_domain *dom, 1702static u64* dma_ops_get_pte(struct dma_ops_domain *dom,
@@ -1528,8 +1726,7 @@ static u64* dma_ops_get_pte(struct dma_ops_domain *dom,
1528 * This is the generic map function. It maps one 4kb page at paddr to 1726 * This is the generic map function. It maps one 4kb page at paddr to
1529 * the given address in the DMA address space for the domain. 1727 * the given address in the DMA address space for the domain.
1530 */ 1728 */
1531static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu, 1729static dma_addr_t dma_ops_domain_map(struct dma_ops_domain *dom,
1532 struct dma_ops_domain *dom,
1533 unsigned long address, 1730 unsigned long address,
1534 phys_addr_t paddr, 1731 phys_addr_t paddr,
1535 int direction) 1732 int direction)
@@ -1542,7 +1739,7 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
1542 1739
1543 pte = dma_ops_get_pte(dom, address); 1740 pte = dma_ops_get_pte(dom, address);
1544 if (!pte) 1741 if (!pte)
1545 return bad_dma_address; 1742 return DMA_ERROR_CODE;
1546 1743
1547 __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC; 1744 __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC;
1548 1745
@@ -1563,8 +1760,7 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
1563/* 1760/*
1564 * The generic unmapping function for one page in the DMA address space. 1761 * The generic unmapping function for one page in the DMA address space.
1565 */ 1762 */
1566static void dma_ops_domain_unmap(struct amd_iommu *iommu, 1763static void dma_ops_domain_unmap(struct dma_ops_domain *dom,
1567 struct dma_ops_domain *dom,
1568 unsigned long address) 1764 unsigned long address)
1569{ 1765{
1570 struct aperture_range *aperture; 1766 struct aperture_range *aperture;
@@ -1595,7 +1791,6 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu,
1595 * Must be called with the domain lock held. 1791 * Must be called with the domain lock held.
1596 */ 1792 */
1597static dma_addr_t __map_single(struct device *dev, 1793static dma_addr_t __map_single(struct device *dev,
1598 struct amd_iommu *iommu,
1599 struct dma_ops_domain *dma_dom, 1794 struct dma_ops_domain *dma_dom,
1600 phys_addr_t paddr, 1795 phys_addr_t paddr,
1601 size_t size, 1796 size_t size,
@@ -1623,7 +1818,7 @@ static dma_addr_t __map_single(struct device *dev,
1623retry: 1818retry:
1624 address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask, 1819 address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask,
1625 dma_mask); 1820 dma_mask);
1626 if (unlikely(address == bad_dma_address)) { 1821 if (unlikely(address == DMA_ERROR_CODE)) {
1627 /* 1822 /*
1628 * setting next_address here will let the address 1823 * setting next_address here will let the address
1629 * allocator only scan the new allocated range in the 1824 * allocator only scan the new allocated range in the
@@ -1631,11 +1826,11 @@ retry:
1631 */ 1826 */
1632 dma_dom->next_address = dma_dom->aperture_size; 1827 dma_dom->next_address = dma_dom->aperture_size;
1633 1828
1634 if (alloc_new_range(iommu, dma_dom, false, GFP_ATOMIC)) 1829 if (alloc_new_range(dma_dom, false, GFP_ATOMIC))
1635 goto out; 1830 goto out;
1636 1831
1637 /* 1832 /*
1638 * aperture was sucessfully enlarged by 128 MB, try 1833 * aperture was successfully enlarged by 128 MB, try
1639 * allocation again 1834 * allocation again
1640 */ 1835 */
1641 goto retry; 1836 goto retry;
@@ -1643,8 +1838,8 @@ retry:
1643 1838
1644 start = address; 1839 start = address;
1645 for (i = 0; i < pages; ++i) { 1840 for (i = 0; i < pages; ++i) {
1646 ret = dma_ops_domain_map(iommu, dma_dom, start, paddr, dir); 1841 ret = dma_ops_domain_map(dma_dom, start, paddr, dir);
1647 if (ret == bad_dma_address) 1842 if (ret == DMA_ERROR_CODE)
1648 goto out_unmap; 1843 goto out_unmap;
1649 1844
1650 paddr += PAGE_SIZE; 1845 paddr += PAGE_SIZE;
@@ -1655,10 +1850,10 @@ retry:
1655 ADD_STATS_COUNTER(alloced_io_mem, size); 1850 ADD_STATS_COUNTER(alloced_io_mem, size);
1656 1851
1657 if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) { 1852 if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) {
1658 iommu_flush_tlb(iommu, dma_dom->domain.id); 1853 iommu_flush_tlb(&dma_dom->domain);
1659 dma_dom->need_flush = false; 1854 dma_dom->need_flush = false;
1660 } else if (unlikely(iommu_has_npcache(iommu))) 1855 } else if (unlikely(amd_iommu_np_cache))
1661 iommu_flush_pages(iommu, dma_dom->domain.id, address, size); 1856 iommu_flush_pages(&dma_dom->domain, address, size);
1662 1857
1663out: 1858out:
1664 return address; 1859 return address;
@@ -1667,20 +1862,19 @@ out_unmap:
1667 1862
1668 for (--i; i >= 0; --i) { 1863 for (--i; i >= 0; --i) {
1669 start -= PAGE_SIZE; 1864 start -= PAGE_SIZE;
1670 dma_ops_domain_unmap(iommu, dma_dom, start); 1865 dma_ops_domain_unmap(dma_dom, start);
1671 } 1866 }
1672 1867
1673 dma_ops_free_addresses(dma_dom, address, pages); 1868 dma_ops_free_addresses(dma_dom, address, pages);
1674 1869
1675 return bad_dma_address; 1870 return DMA_ERROR_CODE;
1676} 1871}
1677 1872
1678/* 1873/*
1679 * Does the reverse of the __map_single function. Must be called with 1874 * Does the reverse of the __map_single function. Must be called with
1680 * the domain lock held too 1875 * the domain lock held too
1681 */ 1876 */
1682static void __unmap_single(struct amd_iommu *iommu, 1877static void __unmap_single(struct dma_ops_domain *dma_dom,
1683 struct dma_ops_domain *dma_dom,
1684 dma_addr_t dma_addr, 1878 dma_addr_t dma_addr,
1685 size_t size, 1879 size_t size,
1686 int dir) 1880 int dir)
@@ -1688,7 +1882,7 @@ static void __unmap_single(struct amd_iommu *iommu,
1688 dma_addr_t i, start; 1882 dma_addr_t i, start;
1689 unsigned int pages; 1883 unsigned int pages;
1690 1884
1691 if ((dma_addr == bad_dma_address) || 1885 if ((dma_addr == DMA_ERROR_CODE) ||
1692 (dma_addr + size > dma_dom->aperture_size)) 1886 (dma_addr + size > dma_dom->aperture_size))
1693 return; 1887 return;
1694 1888
@@ -1697,7 +1891,7 @@ static void __unmap_single(struct amd_iommu *iommu,
1697 start = dma_addr; 1891 start = dma_addr;
1698 1892
1699 for (i = 0; i < pages; ++i) { 1893 for (i = 0; i < pages; ++i) {
1700 dma_ops_domain_unmap(iommu, dma_dom, start); 1894 dma_ops_domain_unmap(dma_dom, start);
1701 start += PAGE_SIZE; 1895 start += PAGE_SIZE;
1702 } 1896 }
1703 1897
@@ -1706,7 +1900,7 @@ static void __unmap_single(struct amd_iommu *iommu,
1706 dma_ops_free_addresses(dma_dom, dma_addr, pages); 1900 dma_ops_free_addresses(dma_dom, dma_addr, pages);
1707 1901
1708 if (amd_iommu_unmap_flush || dma_dom->need_flush) { 1902 if (amd_iommu_unmap_flush || dma_dom->need_flush) {
1709 iommu_flush_pages(iommu, dma_dom->domain.id, dma_addr, size); 1903 iommu_flush_pages(&dma_dom->domain, dma_addr, size);
1710 dma_dom->need_flush = false; 1904 dma_dom->need_flush = false;
1711 } 1905 }
1712} 1906}
@@ -1720,36 +1914,29 @@ static dma_addr_t map_page(struct device *dev, struct page *page,
1720 struct dma_attrs *attrs) 1914 struct dma_attrs *attrs)
1721{ 1915{
1722 unsigned long flags; 1916 unsigned long flags;
1723 struct amd_iommu *iommu;
1724 struct protection_domain *domain; 1917 struct protection_domain *domain;
1725 u16 devid;
1726 dma_addr_t addr; 1918 dma_addr_t addr;
1727 u64 dma_mask; 1919 u64 dma_mask;
1728 phys_addr_t paddr = page_to_phys(page) + offset; 1920 phys_addr_t paddr = page_to_phys(page) + offset;
1729 1921
1730 INC_STATS_COUNTER(cnt_map_single); 1922 INC_STATS_COUNTER(cnt_map_single);
1731 1923
1732 if (!check_device(dev)) 1924 domain = get_domain(dev);
1733 return bad_dma_address; 1925 if (PTR_ERR(domain) == -EINVAL)
1734
1735 dma_mask = *dev->dma_mask;
1736
1737 get_device_resources(dev, &iommu, &domain, &devid);
1738
1739 if (iommu == NULL || domain == NULL)
1740 /* device not handled by any AMD IOMMU */
1741 return (dma_addr_t)paddr; 1926 return (dma_addr_t)paddr;
1927 else if (IS_ERR(domain))
1928 return DMA_ERROR_CODE;
1742 1929
1743 if (!dma_ops_domain(domain)) 1930 dma_mask = *dev->dma_mask;
1744 return bad_dma_address;
1745 1931
1746 spin_lock_irqsave(&domain->lock, flags); 1932 spin_lock_irqsave(&domain->lock, flags);
1747 addr = __map_single(dev, iommu, domain->priv, paddr, size, dir, false, 1933
1934 addr = __map_single(dev, domain->priv, paddr, size, dir, false,
1748 dma_mask); 1935 dma_mask);
1749 if (addr == bad_dma_address) 1936 if (addr == DMA_ERROR_CODE)
1750 goto out; 1937 goto out;
1751 1938
1752 iommu_completion_wait(iommu); 1939 iommu_flush_complete(domain);
1753 1940
1754out: 1941out:
1755 spin_unlock_irqrestore(&domain->lock, flags); 1942 spin_unlock_irqrestore(&domain->lock, flags);
@@ -1764,25 +1951,19 @@ static void unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size,
1764 enum dma_data_direction dir, struct dma_attrs *attrs) 1951 enum dma_data_direction dir, struct dma_attrs *attrs)
1765{ 1952{
1766 unsigned long flags; 1953 unsigned long flags;
1767 struct amd_iommu *iommu;
1768 struct protection_domain *domain; 1954 struct protection_domain *domain;
1769 u16 devid;
1770 1955
1771 INC_STATS_COUNTER(cnt_unmap_single); 1956 INC_STATS_COUNTER(cnt_unmap_single);
1772 1957
1773 if (!check_device(dev) || 1958 domain = get_domain(dev);
1774 !get_device_resources(dev, &iommu, &domain, &devid)) 1959 if (IS_ERR(domain))
1775 /* device not handled by any AMD IOMMU */
1776 return;
1777
1778 if (!dma_ops_domain(domain))
1779 return; 1960 return;
1780 1961
1781 spin_lock_irqsave(&domain->lock, flags); 1962 spin_lock_irqsave(&domain->lock, flags);
1782 1963
1783 __unmap_single(iommu, domain->priv, dma_addr, size, dir); 1964 __unmap_single(domain->priv, dma_addr, size, dir);
1784 1965
1785 iommu_completion_wait(iommu); 1966 iommu_flush_complete(domain);
1786 1967
1787 spin_unlock_irqrestore(&domain->lock, flags); 1968 spin_unlock_irqrestore(&domain->lock, flags);
1788} 1969}
@@ -1814,9 +1995,7 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
1814 struct dma_attrs *attrs) 1995 struct dma_attrs *attrs)
1815{ 1996{
1816 unsigned long flags; 1997 unsigned long flags;
1817 struct amd_iommu *iommu;
1818 struct protection_domain *domain; 1998 struct protection_domain *domain;
1819 u16 devid;
1820 int i; 1999 int i;
1821 struct scatterlist *s; 2000 struct scatterlist *s;
1822 phys_addr_t paddr; 2001 phys_addr_t paddr;
@@ -1825,25 +2004,20 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
1825 2004
1826 INC_STATS_COUNTER(cnt_map_sg); 2005 INC_STATS_COUNTER(cnt_map_sg);
1827 2006
1828 if (!check_device(dev)) 2007 domain = get_domain(dev);
2008 if (PTR_ERR(domain) == -EINVAL)
2009 return map_sg_no_iommu(dev, sglist, nelems, dir);
2010 else if (IS_ERR(domain))
1829 return 0; 2011 return 0;
1830 2012
1831 dma_mask = *dev->dma_mask; 2013 dma_mask = *dev->dma_mask;
1832 2014
1833 get_device_resources(dev, &iommu, &domain, &devid);
1834
1835 if (!iommu || !domain)
1836 return map_sg_no_iommu(dev, sglist, nelems, dir);
1837
1838 if (!dma_ops_domain(domain))
1839 return 0;
1840
1841 spin_lock_irqsave(&domain->lock, flags); 2015 spin_lock_irqsave(&domain->lock, flags);
1842 2016
1843 for_each_sg(sglist, s, nelems, i) { 2017 for_each_sg(sglist, s, nelems, i) {
1844 paddr = sg_phys(s); 2018 paddr = sg_phys(s);
1845 2019
1846 s->dma_address = __map_single(dev, iommu, domain->priv, 2020 s->dma_address = __map_single(dev, domain->priv,
1847 paddr, s->length, dir, false, 2021 paddr, s->length, dir, false,
1848 dma_mask); 2022 dma_mask);
1849 2023
@@ -1854,7 +2028,7 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
1854 goto unmap; 2028 goto unmap;
1855 } 2029 }
1856 2030
1857 iommu_completion_wait(iommu); 2031 iommu_flush_complete(domain);
1858 2032
1859out: 2033out:
1860 spin_unlock_irqrestore(&domain->lock, flags); 2034 spin_unlock_irqrestore(&domain->lock, flags);
@@ -1863,7 +2037,7 @@ out:
1863unmap: 2037unmap:
1864 for_each_sg(sglist, s, mapped_elems, i) { 2038 for_each_sg(sglist, s, mapped_elems, i) {
1865 if (s->dma_address) 2039 if (s->dma_address)
1866 __unmap_single(iommu, domain->priv, s->dma_address, 2040 __unmap_single(domain->priv, s->dma_address,
1867 s->dma_length, dir); 2041 s->dma_length, dir);
1868 s->dma_address = s->dma_length = 0; 2042 s->dma_address = s->dma_length = 0;
1869 } 2043 }
@@ -1882,30 +2056,25 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist,
1882 struct dma_attrs *attrs) 2056 struct dma_attrs *attrs)
1883{ 2057{
1884 unsigned long flags; 2058 unsigned long flags;
1885 struct amd_iommu *iommu;
1886 struct protection_domain *domain; 2059 struct protection_domain *domain;
1887 struct scatterlist *s; 2060 struct scatterlist *s;
1888 u16 devid;
1889 int i; 2061 int i;
1890 2062
1891 INC_STATS_COUNTER(cnt_unmap_sg); 2063 INC_STATS_COUNTER(cnt_unmap_sg);
1892 2064
1893 if (!check_device(dev) || 2065 domain = get_domain(dev);
1894 !get_device_resources(dev, &iommu, &domain, &devid)) 2066 if (IS_ERR(domain))
1895 return;
1896
1897 if (!dma_ops_domain(domain))
1898 return; 2067 return;
1899 2068
1900 spin_lock_irqsave(&domain->lock, flags); 2069 spin_lock_irqsave(&domain->lock, flags);
1901 2070
1902 for_each_sg(sglist, s, nelems, i) { 2071 for_each_sg(sglist, s, nelems, i) {
1903 __unmap_single(iommu, domain->priv, s->dma_address, 2072 __unmap_single(domain->priv, s->dma_address,
1904 s->dma_length, dir); 2073 s->dma_length, dir);
1905 s->dma_address = s->dma_length = 0; 2074 s->dma_address = s->dma_length = 0;
1906 } 2075 }
1907 2076
1908 iommu_completion_wait(iommu); 2077 iommu_flush_complete(domain);
1909 2078
1910 spin_unlock_irqrestore(&domain->lock, flags); 2079 spin_unlock_irqrestore(&domain->lock, flags);
1911} 2080}
@@ -1918,49 +2087,44 @@ static void *alloc_coherent(struct device *dev, size_t size,
1918{ 2087{
1919 unsigned long flags; 2088 unsigned long flags;
1920 void *virt_addr; 2089 void *virt_addr;
1921 struct amd_iommu *iommu;
1922 struct protection_domain *domain; 2090 struct protection_domain *domain;
1923 u16 devid;
1924 phys_addr_t paddr; 2091 phys_addr_t paddr;
1925 u64 dma_mask = dev->coherent_dma_mask; 2092 u64 dma_mask = dev->coherent_dma_mask;
1926 2093
1927 INC_STATS_COUNTER(cnt_alloc_coherent); 2094 INC_STATS_COUNTER(cnt_alloc_coherent);
1928 2095
1929 if (!check_device(dev)) 2096 domain = get_domain(dev);
2097 if (PTR_ERR(domain) == -EINVAL) {
2098 virt_addr = (void *)__get_free_pages(flag, get_order(size));
2099 *dma_addr = __pa(virt_addr);
2100 return virt_addr;
2101 } else if (IS_ERR(domain))
1930 return NULL; 2102 return NULL;
1931 2103
1932 if (!get_device_resources(dev, &iommu, &domain, &devid)) 2104 dma_mask = dev->coherent_dma_mask;
1933 flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); 2105 flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
2106 flag |= __GFP_ZERO;
1934 2107
1935 flag |= __GFP_ZERO;
1936 virt_addr = (void *)__get_free_pages(flag, get_order(size)); 2108 virt_addr = (void *)__get_free_pages(flag, get_order(size));
1937 if (!virt_addr) 2109 if (!virt_addr)
1938 return NULL; 2110 return NULL;
1939 2111
1940 paddr = virt_to_phys(virt_addr); 2112 paddr = virt_to_phys(virt_addr);
1941 2113
1942 if (!iommu || !domain) {
1943 *dma_addr = (dma_addr_t)paddr;
1944 return virt_addr;
1945 }
1946
1947 if (!dma_ops_domain(domain))
1948 goto out_free;
1949
1950 if (!dma_mask) 2114 if (!dma_mask)
1951 dma_mask = *dev->dma_mask; 2115 dma_mask = *dev->dma_mask;
1952 2116
1953 spin_lock_irqsave(&domain->lock, flags); 2117 spin_lock_irqsave(&domain->lock, flags);
1954 2118
1955 *dma_addr = __map_single(dev, iommu, domain->priv, paddr, 2119 *dma_addr = __map_single(dev, domain->priv, paddr,
1956 size, DMA_BIDIRECTIONAL, true, dma_mask); 2120 size, DMA_BIDIRECTIONAL, true, dma_mask);
1957 2121
1958 if (*dma_addr == bad_dma_address) { 2122 if (*dma_addr == DMA_ERROR_CODE) {
1959 spin_unlock_irqrestore(&domain->lock, flags); 2123 spin_unlock_irqrestore(&domain->lock, flags);
1960 goto out_free; 2124 goto out_free;
1961 } 2125 }
1962 2126
1963 iommu_completion_wait(iommu); 2127 iommu_flush_complete(domain);
1964 2128
1965 spin_unlock_irqrestore(&domain->lock, flags); 2129 spin_unlock_irqrestore(&domain->lock, flags);
1966 2130
@@ -1980,28 +2144,19 @@ static void free_coherent(struct device *dev, size_t size,
1980 void *virt_addr, dma_addr_t dma_addr) 2144 void *virt_addr, dma_addr_t dma_addr)
1981{ 2145{
1982 unsigned long flags; 2146 unsigned long flags;
1983 struct amd_iommu *iommu;
1984 struct protection_domain *domain; 2147 struct protection_domain *domain;
1985 u16 devid;
1986 2148
1987 INC_STATS_COUNTER(cnt_free_coherent); 2149 INC_STATS_COUNTER(cnt_free_coherent);
1988 2150
1989 if (!check_device(dev)) 2151 domain = get_domain(dev);
1990 return; 2152 if (IS_ERR(domain))
1991
1992 get_device_resources(dev, &iommu, &domain, &devid);
1993
1994 if (!iommu || !domain)
1995 goto free_mem;
1996
1997 if (!dma_ops_domain(domain))
1998 goto free_mem; 2153 goto free_mem;
1999 2154
2000 spin_lock_irqsave(&domain->lock, flags); 2155 spin_lock_irqsave(&domain->lock, flags);
2001 2156
2002 __unmap_single(iommu, domain->priv, dma_addr, size, DMA_BIDIRECTIONAL); 2157 __unmap_single(domain->priv, dma_addr, size, DMA_BIDIRECTIONAL);
2003 2158
2004 iommu_completion_wait(iommu); 2159 iommu_flush_complete(domain);
2005 2160
2006 spin_unlock_irqrestore(&domain->lock, flags); 2161 spin_unlock_irqrestore(&domain->lock, flags);
2007 2162
@@ -2015,22 +2170,7 @@ free_mem:
2015 */ 2170 */
2016static int amd_iommu_dma_supported(struct device *dev, u64 mask) 2171static int amd_iommu_dma_supported(struct device *dev, u64 mask)
2017{ 2172{
2018 u16 bdf; 2173 return check_device(dev);
2019 struct pci_dev *pcidev;
2020
2021 /* No device or no PCI device */
2022 if (!dev || dev->bus != &pci_bus_type)
2023 return 0;
2024
2025 pcidev = to_pci_dev(dev);
2026
2027 bdf = calc_devid(pcidev->bus->number, pcidev->devfn);
2028
2029 /* Out of our scope? */
2030 if (bdf > amd_iommu_last_bdf)
2031 return 0;
2032
2033 return 1;
2034} 2174}
2035 2175
2036/* 2176/*
@@ -2044,25 +2184,28 @@ static void prealloc_protection_domains(void)
2044{ 2184{
2045 struct pci_dev *dev = NULL; 2185 struct pci_dev *dev = NULL;
2046 struct dma_ops_domain *dma_dom; 2186 struct dma_ops_domain *dma_dom;
2047 struct amd_iommu *iommu;
2048 u16 devid; 2187 u16 devid;
2049 2188
2050 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { 2189 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
2051 devid = calc_devid(dev->bus->number, dev->devfn); 2190
2052 if (devid > amd_iommu_last_bdf) 2191 /* Do we handle this device? */
2053 continue; 2192 if (!check_device(&dev->dev))
2054 devid = amd_iommu_alias_table[devid];
2055 if (domain_for_device(devid))
2056 continue; 2193 continue;
2057 iommu = amd_iommu_rlookup_table[devid]; 2194
2058 if (!iommu) 2195 /* Is there already any domain for it? */
2196 if (domain_for_device(&dev->dev))
2059 continue; 2197 continue;
2060 dma_dom = dma_ops_domain_alloc(iommu); 2198
2199 devid = get_device_id(&dev->dev);
2200
2201 dma_dom = dma_ops_domain_alloc();
2061 if (!dma_dom) 2202 if (!dma_dom)
2062 continue; 2203 continue;
2063 init_unity_mappings_for_device(dma_dom, devid); 2204 init_unity_mappings_for_device(dma_dom, devid);
2064 dma_dom->target_dev = devid; 2205 dma_dom->target_dev = devid;
2065 2206
2207 attach_device(&dev->dev, &dma_dom->domain);
2208
2066 list_add_tail(&dma_dom->list, &iommu_pd_list); 2209 list_add_tail(&dma_dom->list, &iommu_pd_list);
2067 } 2210 }
2068} 2211}
@@ -2080,6 +2223,12 @@ static struct dma_map_ops amd_iommu_dma_ops = {
2080/* 2223/*
2081 * The function which clues the AMD IOMMU driver into dma_ops. 2224 * The function which clues the AMD IOMMU driver into dma_ops.
2082 */ 2225 */
2226
2227void __init amd_iommu_init_api(void)
2228{
2229 register_iommu(&amd_iommu_ops);
2230}
2231
2083int __init amd_iommu_init_dma_ops(void) 2232int __init amd_iommu_init_dma_ops(void)
2084{ 2233{
2085 struct amd_iommu *iommu; 2234 struct amd_iommu *iommu;
@@ -2091,7 +2240,7 @@ int __init amd_iommu_init_dma_ops(void)
2091 * protection domain will be assigned to the default one. 2240 * protection domain will be assigned to the default one.
2092 */ 2241 */
2093 for_each_iommu(iommu) { 2242 for_each_iommu(iommu) {
2094 iommu->default_dom = dma_ops_domain_alloc(iommu); 2243 iommu->default_dom = dma_ops_domain_alloc();
2095 if (iommu->default_dom == NULL) 2244 if (iommu->default_dom == NULL)
2096 return -ENOMEM; 2245 return -ENOMEM;
2097 iommu->default_dom->domain.flags |= PD_DEFAULT_MASK; 2246 iommu->default_dom->domain.flags |= PD_DEFAULT_MASK;
@@ -2101,15 +2250,12 @@ int __init amd_iommu_init_dma_ops(void)
2101 } 2250 }
2102 2251
2103 /* 2252 /*
2104 * If device isolation is enabled, pre-allocate the protection 2253 * Pre-allocate the protection domains for each device.
2105 * domains for each device.
2106 */ 2254 */
2107 if (amd_iommu_isolate) 2255 prealloc_protection_domains();
2108 prealloc_protection_domains();
2109 2256
2110 iommu_detected = 1; 2257 iommu_detected = 1;
2111 force_iommu = 1; 2258 swiotlb = 0;
2112 bad_dma_address = 0;
2113#ifdef CONFIG_GART_IOMMU 2259#ifdef CONFIG_GART_IOMMU
2114 gart_iommu_aperture_disabled = 1; 2260 gart_iommu_aperture_disabled = 1;
2115 gart_iommu_aperture = 0; 2261 gart_iommu_aperture = 0;
@@ -2118,10 +2264,6 @@ int __init amd_iommu_init_dma_ops(void)
2118 /* Make the driver finally visible to the drivers */ 2264 /* Make the driver finally visible to the drivers */
2119 dma_ops = &amd_iommu_dma_ops; 2265 dma_ops = &amd_iommu_dma_ops;
2120 2266
2121 register_iommu(&amd_iommu_ops);
2122
2123 bus_register_notifier(&pci_bus_type, &device_nb);
2124
2125 amd_iommu_stats_init(); 2267 amd_iommu_stats_init();
2126 2268
2127 return 0; 2269 return 0;
@@ -2148,14 +2290,17 @@ free_domains:
2148 2290
2149static void cleanup_domain(struct protection_domain *domain) 2291static void cleanup_domain(struct protection_domain *domain)
2150{ 2292{
2293 struct iommu_dev_data *dev_data, *next;
2151 unsigned long flags; 2294 unsigned long flags;
2152 u16 devid;
2153 2295
2154 write_lock_irqsave(&amd_iommu_devtable_lock, flags); 2296 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
2155 2297
2156 for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) 2298 list_for_each_entry_safe(dev_data, next, &domain->dev_list, list) {
2157 if (amd_iommu_pd_table[devid] == domain) 2299 struct device *dev = dev_data->dev;
2158 __detach_device(domain, devid); 2300
2301 do_detach(dev);
2302 atomic_set(&dev_data->bind, 0);
2303 }
2159 2304
2160 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); 2305 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
2161} 2306}
@@ -2165,6 +2310,8 @@ static void protection_domain_free(struct protection_domain *domain)
2165 if (!domain) 2310 if (!domain)
2166 return; 2311 return;
2167 2312
2313 del_domain_from_list(domain);
2314
2168 if (domain->id) 2315 if (domain->id)
2169 domain_id_free(domain->id); 2316 domain_id_free(domain->id);
2170 2317
@@ -2183,6 +2330,9 @@ static struct protection_domain *protection_domain_alloc(void)
2183 domain->id = domain_id_alloc(); 2330 domain->id = domain_id_alloc();
2184 if (!domain->id) 2331 if (!domain->id)
2185 goto out_err; 2332 goto out_err;
2333 INIT_LIST_HEAD(&domain->dev_list);
2334
2335 add_domain_to_list(domain);
2186 2336
2187 return domain; 2337 return domain;
2188 2338
@@ -2239,26 +2389,23 @@ static void amd_iommu_domain_destroy(struct iommu_domain *dom)
2239static void amd_iommu_detach_device(struct iommu_domain *dom, 2389static void amd_iommu_detach_device(struct iommu_domain *dom,
2240 struct device *dev) 2390 struct device *dev)
2241{ 2391{
2242 struct protection_domain *domain = dom->priv; 2392 struct iommu_dev_data *dev_data = dev->archdata.iommu;
2243 struct amd_iommu *iommu; 2393 struct amd_iommu *iommu;
2244 struct pci_dev *pdev;
2245 u16 devid; 2394 u16 devid;
2246 2395
2247 if (dev->bus != &pci_bus_type) 2396 if (!check_device(dev))
2248 return; 2397 return;
2249 2398
2250 pdev = to_pci_dev(dev); 2399 devid = get_device_id(dev);
2251 2400
2252 devid = calc_devid(pdev->bus->number, pdev->devfn); 2401 if (dev_data->domain != NULL)
2253 2402 detach_device(dev);
2254 if (devid > 0)
2255 detach_device(domain, devid);
2256 2403
2257 iommu = amd_iommu_rlookup_table[devid]; 2404 iommu = amd_iommu_rlookup_table[devid];
2258 if (!iommu) 2405 if (!iommu)
2259 return; 2406 return;
2260 2407
2261 iommu_queue_inv_dev_entry(iommu, devid); 2408 iommu_flush_device(dev);
2262 iommu_completion_wait(iommu); 2409 iommu_completion_wait(iommu);
2263} 2410}
2264 2411
@@ -2266,35 +2413,30 @@ static int amd_iommu_attach_device(struct iommu_domain *dom,
2266 struct device *dev) 2413 struct device *dev)
2267{ 2414{
2268 struct protection_domain *domain = dom->priv; 2415 struct protection_domain *domain = dom->priv;
2269 struct protection_domain *old_domain; 2416 struct iommu_dev_data *dev_data;
2270 struct amd_iommu *iommu; 2417 struct amd_iommu *iommu;
2271 struct pci_dev *pdev; 2418 int ret;
2272 u16 devid; 2419 u16 devid;
2273 2420
2274 if (dev->bus != &pci_bus_type) 2421 if (!check_device(dev))
2275 return -EINVAL; 2422 return -EINVAL;
2276 2423
2277 pdev = to_pci_dev(dev); 2424 dev_data = dev->archdata.iommu;
2278
2279 devid = calc_devid(pdev->bus->number, pdev->devfn);
2280 2425
2281 if (devid >= amd_iommu_last_bdf || 2426 devid = get_device_id(dev);
2282 devid != amd_iommu_alias_table[devid])
2283 return -EINVAL;
2284 2427
2285 iommu = amd_iommu_rlookup_table[devid]; 2428 iommu = amd_iommu_rlookup_table[devid];
2286 if (!iommu) 2429 if (!iommu)
2287 return -EINVAL; 2430 return -EINVAL;
2288 2431
2289 old_domain = domain_for_device(devid); 2432 if (dev_data->domain)
2290 if (old_domain) 2433 detach_device(dev);
2291 detach_device(old_domain, devid);
2292 2434
2293 attach_device(iommu, domain, devid); 2435 ret = attach_device(dev, domain);
2294 2436
2295 iommu_completion_wait(iommu); 2437 iommu_completion_wait(iommu);
2296 2438
2297 return 0; 2439 return ret;
2298} 2440}
2299 2441
2300static int amd_iommu_map_range(struct iommu_domain *dom, 2442static int amd_iommu_map_range(struct iommu_domain *dom,
@@ -2340,7 +2482,7 @@ static void amd_iommu_unmap_range(struct iommu_domain *dom,
2340 iova += PAGE_SIZE; 2482 iova += PAGE_SIZE;
2341 } 2483 }
2342 2484
2343 iommu_flush_domain(domain->id); 2485 iommu_flush_tlb_pde(domain);
2344} 2486}
2345 2487
2346static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom, 2488static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
@@ -2391,10 +2533,11 @@ static struct iommu_ops amd_iommu_ops = {
2391 2533
2392int __init amd_iommu_init_passthrough(void) 2534int __init amd_iommu_init_passthrough(void)
2393{ 2535{
2536 struct amd_iommu *iommu;
2394 struct pci_dev *dev = NULL; 2537 struct pci_dev *dev = NULL;
2395 u16 devid, devid2; 2538 u16 devid;
2396 2539
2397 /* allocate passthroug domain */ 2540 /* allocate passthrough domain */
2398 pt_domain = protection_domain_alloc(); 2541 pt_domain = protection_domain_alloc();
2399 if (!pt_domain) 2542 if (!pt_domain)
2400 return -ENOMEM; 2543 return -ENOMEM;
@@ -2402,20 +2545,17 @@ int __init amd_iommu_init_passthrough(void)
2402 pt_domain->mode |= PAGE_MODE_NONE; 2545 pt_domain->mode |= PAGE_MODE_NONE;
2403 2546
2404 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { 2547 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
2405 struct amd_iommu *iommu;
2406 2548
2407 devid = calc_devid(dev->bus->number, dev->devfn); 2549 if (!check_device(&dev->dev))
2408 if (devid > amd_iommu_last_bdf)
2409 continue; 2550 continue;
2410 2551
2411 devid2 = amd_iommu_alias_table[devid]; 2552 devid = get_device_id(&dev->dev);
2412 2553
2413 iommu = amd_iommu_rlookup_table[devid2]; 2554 iommu = amd_iommu_rlookup_table[devid];
2414 if (!iommu) 2555 if (!iommu)
2415 continue; 2556 continue;
2416 2557
2417 __attach_device(iommu, pt_domain, devid); 2558 attach_device(&dev->dev, pt_domain);
2418 __attach_device(iommu, pt_domain, devid2);
2419 } 2559 }
2420 2560
2421 pr_info("AMD-Vi: Initialized for Passthrough Mode\n"); 2561 pr_info("AMD-Vi: Initialized for Passthrough Mode\n");
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index b4b61d462dcc..9dc91b431470 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2007-2008 Advanced Micro Devices, Inc. 2 * Copyright (C) 2007-2009 Advanced Micro Devices, Inc.
3 * Author: Joerg Roedel <joerg.roedel@amd.com> 3 * Author: Joerg Roedel <joerg.roedel@amd.com>
4 * Leo Duran <leo.duran@amd.com> 4 * Leo Duran <leo.duran@amd.com>
5 * 5 *
@@ -25,10 +25,12 @@
25#include <linux/interrupt.h> 25#include <linux/interrupt.h>
26#include <linux/msi.h> 26#include <linux/msi.h>
27#include <asm/pci-direct.h> 27#include <asm/pci-direct.h>
28#include <asm/amd_iommu_proto.h>
28#include <asm/amd_iommu_types.h> 29#include <asm/amd_iommu_types.h>
29#include <asm/amd_iommu.h> 30#include <asm/amd_iommu.h>
30#include <asm/iommu.h> 31#include <asm/iommu.h>
31#include <asm/gart.h> 32#include <asm/gart.h>
33#include <asm/x86_init.h>
32 34
33/* 35/*
34 * definitions for the ACPI scanning code 36 * definitions for the ACPI scanning code
@@ -123,18 +125,29 @@ u16 amd_iommu_last_bdf; /* largest PCI device id we have
123 to handle */ 125 to handle */
124LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings 126LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings
125 we find in ACPI */ 127 we find in ACPI */
126#ifdef CONFIG_IOMMU_STRESS
127bool amd_iommu_isolate = false;
128#else
129bool amd_iommu_isolate = true; /* if true, device isolation is
130 enabled */
131#endif
132
133bool amd_iommu_unmap_flush; /* if true, flush on every unmap */ 128bool amd_iommu_unmap_flush; /* if true, flush on every unmap */
134 129
135LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the 130LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the
136 system */ 131 system */
137 132
133/* Array to assign indices to IOMMUs */
134struct amd_iommu *amd_iommus[MAX_IOMMUS];
135int amd_iommus_present;
136
137/* IOMMUs have a non-present cache? */
138bool amd_iommu_np_cache __read_mostly;
139
140/*
141 * Set to true if ACPI table parsing and hardware initialization went properly
142 */
143static bool amd_iommu_initialized;
144
145/*
146 * List of protection domains - used during resume
147 */
148LIST_HEAD(amd_iommu_pd_list);
149spinlock_t amd_iommu_pd_lock;
150
138/* 151/*
139 * Pointer to the device table which is shared by all AMD IOMMUs 152 * Pointer to the device table which is shared by all AMD IOMMUs
140 * it is indexed by the PCI device id or the HT unit id and contains 153 * it is indexed by the PCI device id or the HT unit id and contains
@@ -157,12 +170,6 @@ u16 *amd_iommu_alias_table;
157struct amd_iommu **amd_iommu_rlookup_table; 170struct amd_iommu **amd_iommu_rlookup_table;
158 171
159/* 172/*
160 * The pd table (protection domain table) is used to find the protection domain
161 * data structure a device belongs to. Indexed with the PCI device id too.
162 */
163struct protection_domain **amd_iommu_pd_table;
164
165/*
166 * AMD IOMMU allows up to 2^16 different protection domains. This is a bitmap 173
167 * to know which ones are already in use. 174 * to know which ones are already in use.
168 */ 175 */
@@ -240,7 +247,7 @@ static void iommu_feature_enable(struct amd_iommu *iommu, u8 bit)
240 writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET); 247 writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
241} 248}
242 249
243static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit) 250static void iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
244{ 251{
245 u32 ctrl; 252 u32 ctrl;
246 253
@@ -519,6 +526,26 @@ static void set_dev_entry_bit(u16 devid, u8 bit)
519 amd_iommu_dev_table[devid].data[i] |= (1 << _bit); 526 amd_iommu_dev_table[devid].data[i] |= (1 << _bit);
520} 527}
521 528
529static int get_dev_entry_bit(u16 devid, u8 bit)
530{
531 int i = (bit >> 5) & 0x07;
532 int _bit = bit & 0x1f;
533
534 return (amd_iommu_dev_table[devid].data[i] & (1 << _bit)) >> _bit;
535}
536
537
538void amd_iommu_apply_erratum_63(u16 devid)
539{
540 int sysmgt;
541
542 sysmgt = get_dev_entry_bit(devid, DEV_ENTRY_SYSMGT1) |
543 (get_dev_entry_bit(devid, DEV_ENTRY_SYSMGT2) << 1);
544
545 if (sysmgt == 0x01)
546 set_dev_entry_bit(devid, DEV_ENTRY_IW);
547}
548
522/* Writes the specific IOMMU for a device into the rlookup table */ 549/* Writes the specific IOMMU for a device into the rlookup table */
523static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid) 550static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid)
524{ 551{
@@ -547,6 +574,8 @@ static void __init set_dev_entry_from_acpi(struct amd_iommu *iommu,
547 if (flags & ACPI_DEVFLAG_LINT1) 574 if (flags & ACPI_DEVFLAG_LINT1)
548 set_dev_entry_bit(devid, DEV_ENTRY_LINT1_PASS); 575 set_dev_entry_bit(devid, DEV_ENTRY_LINT1_PASS);
549 576
577 amd_iommu_apply_erratum_63(devid);
578
550 set_iommu_for_device(iommu, devid); 579 set_iommu_for_device(iommu, devid);
551} 580}
552 581
@@ -816,7 +845,18 @@ static void __init free_iommu_all(void)
816static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) 845static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
817{ 846{
818 spin_lock_init(&iommu->lock); 847 spin_lock_init(&iommu->lock);
848
849 /* Add IOMMU to internal data structures */
819 list_add_tail(&iommu->list, &amd_iommu_list); 850 list_add_tail(&iommu->list, &amd_iommu_list);
851 iommu->index = amd_iommus_present++;
852
853 if (unlikely(iommu->index >= MAX_IOMMUS)) {
854 WARN(1, "AMD-Vi: System has more IOMMUs than supported by this driver\n");
855 return -ENOSYS;
856 }
857
858 /* Index is fine - add IOMMU to the array */
859 amd_iommus[iommu->index] = iommu;
820 860
821 /* 861 /*
822 * Copy data from ACPI table entry to the iommu struct 862 * Copy data from ACPI table entry to the iommu struct
@@ -846,6 +886,9 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
846 init_iommu_from_acpi(iommu, h); 886 init_iommu_from_acpi(iommu, h);
847 init_iommu_devices(iommu); 887 init_iommu_devices(iommu);
848 888
889 if (iommu->cap & (1UL << IOMMU_CAP_NPCACHE))
890 amd_iommu_np_cache = true;
891
849 return pci_enable_device(iommu->dev); 892 return pci_enable_device(iommu->dev);
850} 893}
851 894
@@ -891,6 +934,8 @@ static int __init init_iommu_all(struct acpi_table_header *table)
891 } 934 }
892 WARN_ON(p != end); 935 WARN_ON(p != end);
893 936
937 amd_iommu_initialized = true;
938
894 return 0; 939 return 0;
895} 940}
896 941
@@ -903,7 +948,7 @@ static int __init init_iommu_all(struct acpi_table_header *table)
903 * 948 *
904 ****************************************************************************/ 949 ****************************************************************************/
905 950
906static int __init iommu_setup_msi(struct amd_iommu *iommu) 951static int iommu_setup_msi(struct amd_iommu *iommu)
907{ 952{
908 int r; 953 int r;
909 954
@@ -1154,19 +1199,10 @@ static struct sys_device device_amd_iommu = {
1154 * functions. Finally it prints some information about AMD IOMMUs and 1199 * functions. Finally it prints some information about AMD IOMMUs and
1155 * the driver state and enables the hardware. 1200 * the driver state and enables the hardware.
1156 */ 1201 */
1157int __init amd_iommu_init(void) 1202static int __init amd_iommu_init(void)
1158{ 1203{
1159 int i, ret = 0; 1204 int i, ret = 0;
1160 1205
1161
1162 if (no_iommu) {
1163 printk(KERN_INFO "AMD-Vi disabled by kernel command line\n");
1164 return 0;
1165 }
1166
1167 if (!amd_iommu_detected)
1168 return -ENODEV;
1169
1170 /* 1206 /*
1171 * First parse ACPI tables to find the largest Bus/Dev/Func 1207 * First parse ACPI tables to find the largest Bus/Dev/Func
1172 * we need to handle. Upon this information the shared data 1208 * we need to handle. Upon this information the shared data
@@ -1203,15 +1239,6 @@ int __init amd_iommu_init(void)
1203 if (amd_iommu_rlookup_table == NULL) 1239 if (amd_iommu_rlookup_table == NULL)
1204 goto free; 1240 goto free;
1205 1241
1206 /*
1207 * Protection Domain table - maps devices to protection domains
1208 * This table has the same size as the rlookup_table
1209 */
1210 amd_iommu_pd_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
1211 get_order(rlookup_table_size));
1212 if (amd_iommu_pd_table == NULL)
1213 goto free;
1214
1215 amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages( 1242 amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages(
1216 GFP_KERNEL | __GFP_ZERO, 1243 GFP_KERNEL | __GFP_ZERO,
1217 get_order(MAX_DOMAIN_ID/8)); 1244 get_order(MAX_DOMAIN_ID/8));
@@ -1233,6 +1260,8 @@ int __init amd_iommu_init(void)
1233 */ 1260 */
1234 amd_iommu_pd_alloc_bitmap[0] = 1; 1261 amd_iommu_pd_alloc_bitmap[0] = 1;
1235 1262
1263 spin_lock_init(&amd_iommu_pd_lock);
1264
1236 /* 1265 /*
1237 * now the data structures are allocated and basically initialized 1266 * now the data structures are allocated and basically initialized
1238 * start the real acpi table scan 1267 * start the real acpi table scan
@@ -1241,6 +1270,9 @@ int __init amd_iommu_init(void)
1241 if (acpi_table_parse("IVRS", init_iommu_all) != 0) 1270 if (acpi_table_parse("IVRS", init_iommu_all) != 0)
1242 goto free; 1271 goto free;
1243 1272
1273 if (!amd_iommu_initialized)
1274 goto free;
1275
1244 if (acpi_table_parse("IVRS", init_memory_definitions) != 0) 1276 if (acpi_table_parse("IVRS", init_memory_definitions) != 0)
1245 goto free; 1277 goto free;
1246 1278
@@ -1252,39 +1284,43 @@ int __init amd_iommu_init(void)
1252 if (ret) 1284 if (ret)
1253 goto free; 1285 goto free;
1254 1286
1287 ret = amd_iommu_init_devices();
1288 if (ret)
1289 goto free;
1290
1255 if (iommu_pass_through) 1291 if (iommu_pass_through)
1256 ret = amd_iommu_init_passthrough(); 1292 ret = amd_iommu_init_passthrough();
1257 else 1293 else
1258 ret = amd_iommu_init_dma_ops(); 1294 ret = amd_iommu_init_dma_ops();
1295
1259 if (ret) 1296 if (ret)
1260 goto free; 1297 goto free;
1261 1298
1299 amd_iommu_init_api();
1300
1301 amd_iommu_init_notifier();
1302
1262 enable_iommus(); 1303 enable_iommus();
1263 1304
1264 if (iommu_pass_through) 1305 if (iommu_pass_through)
1265 goto out; 1306 goto out;
1266 1307
1267 printk(KERN_INFO "AMD-Vi: device isolation ");
1268 if (amd_iommu_isolate)
1269 printk("enabled\n");
1270 else
1271 printk("disabled\n");
1272
1273 if (amd_iommu_unmap_flush) 1308 if (amd_iommu_unmap_flush)
1274 printk(KERN_INFO "AMD-Vi: IO/TLB flush on unmap enabled\n"); 1309 printk(KERN_INFO "AMD-Vi: IO/TLB flush on unmap enabled\n");
1275 else 1310 else
1276 printk(KERN_INFO "AMD-Vi: Lazy IO/TLB flushing enabled\n"); 1311 printk(KERN_INFO "AMD-Vi: Lazy IO/TLB flushing enabled\n");
1277 1312
1313 x86_platform.iommu_shutdown = disable_iommus;
1278out: 1314out:
1279 return ret; 1315 return ret;
1280 1316
1281free: 1317free:
1318
1319 amd_iommu_uninit_devices();
1320
1282 free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, 1321 free_pages((unsigned long)amd_iommu_pd_alloc_bitmap,
1283 get_order(MAX_DOMAIN_ID/8)); 1322 get_order(MAX_DOMAIN_ID/8));
1284 1323
1285 free_pages((unsigned long)amd_iommu_pd_table,
1286 get_order(rlookup_table_size));
1287
1288 free_pages((unsigned long)amd_iommu_rlookup_table, 1324 free_pages((unsigned long)amd_iommu_rlookup_table,
1289 get_order(rlookup_table_size)); 1325 get_order(rlookup_table_size));
1290 1326
@@ -1301,11 +1337,6 @@ free:
1301 goto out; 1337 goto out;
1302} 1338}
1303 1339
1304void amd_iommu_shutdown(void)
1305{
1306 disable_iommus();
1307}
1308
1309/**************************************************************************** 1340/****************************************************************************
1310 * 1341 *
1311 * Early detect code. This code runs at IOMMU detection time in the DMA 1342 * Early detect code. This code runs at IOMMU detection time in the DMA
@@ -1320,16 +1351,16 @@ static int __init early_amd_iommu_detect(struct acpi_table_header *table)
1320 1351
1321void __init amd_iommu_detect(void) 1352void __init amd_iommu_detect(void)
1322{ 1353{
1323 if (swiotlb || no_iommu || (iommu_detected && !gart_iommu_aperture)) 1354 if (no_iommu || (iommu_detected && !gart_iommu_aperture))
1324 return; 1355 return;
1325 1356
1326 if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) { 1357 if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) {
1327 iommu_detected = 1; 1358 iommu_detected = 1;
1328 amd_iommu_detected = 1; 1359 amd_iommu_detected = 1;
1329#ifdef CONFIG_GART_IOMMU 1360 x86_init.iommu.iommu_init = amd_iommu_init;
1330 gart_iommu_aperture_disabled = 1; 1361
1331 gart_iommu_aperture = 0; 1362 /* Make sure ACS will be enabled */
1332#endif 1363 pci_request_acs();
1333 } 1364 }
1334} 1365}
1335 1366
@@ -1350,10 +1381,6 @@ static int __init parse_amd_iommu_dump(char *str)
1350static int __init parse_amd_iommu_options(char *str) 1381static int __init parse_amd_iommu_options(char *str)
1351{ 1382{
1352 for (; *str; ++str) { 1383 for (; *str; ++str) {
1353 if (strncmp(str, "isolate", 7) == 0)
1354 amd_iommu_isolate = true;
1355 if (strncmp(str, "share", 5) == 0)
1356 amd_iommu_isolate = false;
1357 if (strncmp(str, "fullflush", 9) == 0) 1384 if (strncmp(str, "fullflush", 9) == 0)
1358 amd_iommu_unmap_flush = true; 1385 amd_iommu_unmap_flush = true;
1359 } 1386 }
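Usage note (an inference from this hunk, not text from the patch): with the isolate/share keywords removed, the only option still parsed here is fullflush, so booting with amd_iommu=fullflush sets amd_iommu_unmap_flush and forces an IO/TLB flush on every unmap, while device isolation itself is no longer configurable.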
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 128111d8ffe0..f147a95fd84a 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -28,8 +28,10 @@
28#include <asm/pci-direct.h> 28#include <asm/pci-direct.h>
29#include <asm/dma.h> 29#include <asm/dma.h>
30#include <asm/k8.h> 30#include <asm/k8.h>
31#include <asm/x86_init.h>
31 32
32int gart_iommu_aperture; 33int gart_iommu_aperture;
34EXPORT_SYMBOL_GPL(gart_iommu_aperture);
33int gart_iommu_aperture_disabled __initdata; 35int gart_iommu_aperture_disabled __initdata;
34int gart_iommu_aperture_allowed __initdata; 36int gart_iommu_aperture_allowed __initdata;
35 37
@@ -279,7 +281,8 @@ void __init early_gart_iommu_check(void)
279 * or BIOS forget to put that in reserved. 281 * or BIOS forget to put that in reserved.
280 * try to update e820 to make that region as reserved. 282 * try to update e820 to make that region as reserved.
281 */ 283 */
282 int i, fix, slot; 284 u32 agp_aper_base = 0, agp_aper_order = 0;
285 int i, fix, slot, valid_agp = 0;
283 u32 ctl; 286 u32 ctl;
284 u32 aper_size = 0, aper_order = 0, last_aper_order = 0; 287 u32 aper_size = 0, aper_order = 0, last_aper_order = 0;
285 u64 aper_base = 0, last_aper_base = 0; 288 u64 aper_base = 0, last_aper_base = 0;
@@ -289,6 +292,8 @@ void __init early_gart_iommu_check(void)
289 return; 292 return;
290 293
291 /* This is mostly duplicate of iommu_hole_init */ 294 /* This is mostly duplicate of iommu_hole_init */
295 agp_aper_base = search_agp_bridge(&agp_aper_order, &valid_agp);
296
292 fix = 0; 297 fix = 0;
293 for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { 298 for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
294 int bus; 299 int bus;
@@ -341,10 +346,10 @@ void __init early_gart_iommu_check(void)
341 } 346 }
342 } 347 }
343 348
344 if (!fix) 349 if (valid_agp)
345 return; 350 return;
346 351
347 /* different nodes have different setting, disable them all at first*/ 352 /* disable them all at first */
348 for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { 353 for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
349 int bus; 354 int bus;
350 int dev_base, dev_limit; 355 int dev_base, dev_limit;
@@ -400,6 +405,7 @@ void __init gart_iommu_hole_init(void)
400 405
401 iommu_detected = 1; 406 iommu_detected = 1;
402 gart_iommu_aperture = 1; 407 gart_iommu_aperture = 1;
408 x86_init.iommu.iommu_init = gart_iommu_init;
403 409
404 aper_order = (read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL) >> 1) & 7; 410 aper_order = (read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL) >> 1) & 7;
405 aper_size = (32 * 1024 * 1024) << aper_order; 411 aper_size = (32 * 1024 * 1024) << aper_order;
@@ -456,8 +462,6 @@ out:
456 462
457 if (aper_alloc) { 463 if (aper_alloc) {
458 /* Got the aperture from the AGP bridge */ 464 /* Got the aperture from the AGP bridge */
459 } else if (swiotlb && !valid_agp) {
460 /* Do nothing */
461 } else if ((!no_iommu && max_pfn > MAX_DMA32_PFN) || 465 } else if ((!no_iommu && max_pfn > MAX_DMA32_PFN) ||
462 force_iommu || 466 force_iommu ||
463 valid_agp || 467 valid_agp ||
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
index da7b7b9f8bd8..565c1bfc507d 100644
--- a/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@ -2,7 +2,7 @@
2# Makefile for local APIC drivers and for the IO-APIC code 2# Makefile for local APIC drivers and for the IO-APIC code
3# 3#
4 4
5obj-$(CONFIG_X86_LOCAL_APIC) += apic.o probe_$(BITS).o ipi.o nmi.o 5obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o nmi.o
6obj-$(CONFIG_X86_IO_APIC) += io_apic.o 6obj-$(CONFIG_X86_IO_APIC) += io_apic.o
7obj-$(CONFIG_SMP) += ipi.o 7obj-$(CONFIG_SMP) += ipi.o
8 8
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 894aa97f0717..dfca210f6a10 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -61,12 +61,6 @@ unsigned int boot_cpu_physical_apicid = -1U;
61 61
62/* 62/*
63 * The highest APIC ID seen during enumeration. 63 * The highest APIC ID seen during enumeration.
64 *
65 * On AMD, this determines the messaging protocol we can use: if all APIC IDs
66 * are in the 0 ... 7 range, then we can use logical addressing which
67 * has some performance advantages (better broadcasting).
68 *
69 * If there's an APIC ID above 8, we use physical addressing.
70 */ 64 */
71unsigned int max_physical_apicid; 65unsigned int max_physical_apicid;
72 66
@@ -241,28 +235,13 @@ static int modern_apic(void)
241} 235}
242 236
243/* 237/*
244 * bare function to substitute write operation 238 * right after this call apic becomes NOOP driven
245 * and it's _that_ fast :) 239 * so apic->write/read doesn't do anything
246 */
247static void native_apic_write_dummy(u32 reg, u32 v)
248{
249 WARN_ON_ONCE((cpu_has_apic || !disable_apic));
250}
251
252static u32 native_apic_read_dummy(u32 reg)
253{
254 WARN_ON_ONCE((cpu_has_apic && !disable_apic));
255 return 0;
256}
257
258/*
259 * right after this call apic->write/read doesn't do anything
260 * note that there is no restore operation it works one way
261 */ 240 */
262void apic_disable(void) 241void apic_disable(void)
263{ 242{
264 apic->read = native_apic_read_dummy; 243 pr_info("APIC: switched to apic NOOP\n");
265 apic->write = native_apic_write_dummy; 244 apic = &apic_noop;
266} 245}
267 246
268void native_apic_wait_icr_idle(void) 247void native_apic_wait_icr_idle(void)
@@ -459,7 +438,7 @@ static void lapic_timer_setup(enum clock_event_mode mode,
459 v = apic_read(APIC_LVTT); 438 v = apic_read(APIC_LVTT);
460 v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); 439 v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
461 apic_write(APIC_LVTT, v); 440 apic_write(APIC_LVTT, v);
462 apic_write(APIC_TMICT, 0xffffffff); 441 apic_write(APIC_TMICT, 0);
463 break; 442 break;
464 case CLOCK_EVT_MODE_RESUME: 443 case CLOCK_EVT_MODE_RESUME:
465 /* Nothing to do here */ 444 /* Nothing to do here */
@@ -662,7 +641,7 @@ static int __init calibrate_APIC_clock(void)
 	calibration_result = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS;
 
 	apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta);
-	apic_printk(APIC_VERBOSE, "..... mult: %ld\n", lapic_clockevent.mult);
+	apic_printk(APIC_VERBOSE, "..... mult: %u\n", lapic_clockevent.mult);
 	apic_printk(APIC_VERBOSE, "..... calibration result: %u\n",
 		    calibration_result);
 
@@ -1356,7 +1335,7 @@ void enable_x2apic(void)
 
 	rdmsr(MSR_IA32_APICBASE, msr, msr2);
 	if (!(msr & X2APIC_ENABLE)) {
-		pr_info("Enabling x2apic\n");
+		printk_once(KERN_INFO "Enabling x2apic\n");
 		wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0);
 	}
 }
@@ -1392,14 +1371,11 @@ void __init enable_IR_x2apic(void)
 	unsigned long flags;
 	struct IO_APIC_route_entry **ioapic_entries = NULL;
 	int ret, x2apic_enabled = 0;
-	int dmar_table_init_ret = 0;
+	int dmar_table_init_ret;
 
-#ifdef CONFIG_INTR_REMAP
 	dmar_table_init_ret = dmar_table_init();
-	if (dmar_table_init_ret)
-		pr_debug("dmar_table_init() failed with %d:\n",
-			dmar_table_init_ret);
-#endif
+	if (dmar_table_init_ret && !x2apic_supported())
+		return;
 
 	ioapic_entries = alloc_ioapic_entries();
 	if (!ioapic_entries) {
@@ -1665,9 +1641,7 @@ int __init APIC_init_uniprocessor(void)
 #endif
 
 	enable_IR_x2apic();
-#ifdef CONFIG_X86_64
 	default_setup_apic_routing();
-#endif
 
 	verify_local_APIC();
 	connect_bsp_APIC();
@@ -1915,18 +1889,6 @@ void __cpuinit generic_processor_info(int apicid, int version)
 	if (apicid > max_physical_apicid)
 		max_physical_apicid = apicid;
 
-#ifdef CONFIG_X86_32
-	switch (boot_cpu_data.x86_vendor) {
-	case X86_VENDOR_INTEL:
-		if (num_processors > 8)
-			def_to_bigsmp = 1;
-		break;
-	case X86_VENDOR_AMD:
-		if (max_physical_apicid >= 8)
-			def_to_bigsmp = 1;
-	}
-#endif
-
 #if defined(CONFIG_SMP) || defined(CONFIG_X86_64)
 	early_per_cpu(x86_cpu_to_apicid, cpu) = apicid;
 	early_per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index d0c99abc26c3..e3c3d820c325 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -240,6 +240,11 @@ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 		printk(KERN_DEBUG "system APIC only can use physical flat");
 		return 1;
 	}
+
+	if (!strncmp(oem_id, "IBM", 3) && !strncmp(oem_table_id, "EXA", 3)) {
+		printk(KERN_DEBUG "IBM Summit detected, will use apic physical");
+		return 1;
+	}
 #endif
 
 	return 0;
@@ -306,10 +311,7 @@ physflat_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
 		if (cpumask_test_cpu(cpu, cpu_online_mask))
 			break;
 	}
-	if (cpu < nr_cpu_ids)
-		return per_cpu(x86_cpu_to_apicid, cpu);
-
-	return BAD_APICID;
+	return per_cpu(x86_cpu_to_apicid, cpu);
 }
 
 struct apic apic_physflat = {
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
new file mode 100644
index 000000000000..e31b9ffe25f5
--- /dev/null
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -0,0 +1,200 @@
1/*
2 * NOOP APIC driver.
3 *
4 * Does almost nothing and should be substituted by a real apic driver via
5 * probe routine.
6 *
7 * Though in case if apic is disabled (for some reason) we try
8 * to not uglify the caller's code and allow to call (some) apic routines
9 * like self-ipi, etc...
10 */
11
12#include <linux/threads.h>
13#include <linux/cpumask.h>
14#include <linux/module.h>
15#include <linux/string.h>
16#include <linux/kernel.h>
17#include <linux/ctype.h>
18#include <linux/init.h>
19#include <linux/errno.h>
20#include <asm/fixmap.h>
21#include <asm/mpspec.h>
22#include <asm/apicdef.h>
23#include <asm/apic.h>
24#include <asm/setup.h>
25
26#include <linux/smp.h>
27#include <asm/ipi.h>
28
29#include <linux/interrupt.h>
30#include <asm/acpi.h>
31#include <asm/e820.h>
32
33static void noop_init_apic_ldr(void) { }
34static void noop_send_IPI_mask(const struct cpumask *cpumask, int vector) { }
35static void noop_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) { }
36static void noop_send_IPI_allbutself(int vector) { }
37static void noop_send_IPI_all(int vector) { }
38static void noop_send_IPI_self(int vector) { }
39static void noop_apic_wait_icr_idle(void) { }
40static void noop_apic_icr_write(u32 low, u32 id) { }
41
42static int noop_wakeup_secondary_cpu(int apicid, unsigned long start_eip)
43{
44 return -1;
45}
46
47static u32 noop_safe_apic_wait_icr_idle(void)
48{
49 return 0;
50}
51
52static u64 noop_apic_icr_read(void)
53{
54 return 0;
55}
56
57static int noop_cpu_to_logical_apicid(int cpu)
58{
59 return 0;
60}
61
62static int noop_phys_pkg_id(int cpuid_apic, int index_msb)
63{
64 return 0;
65}
66
67static unsigned int noop_get_apic_id(unsigned long x)
68{
69 return 0;
70}
71
72static int noop_probe(void)
73{
74 /*
75 * NOOP apic should not ever be
76 * enabled via probe routine
77 */
78 return 0;
79}
80
81static int noop_apic_id_registered(void)
82{
83 /*
84 * if we would be really "pedantic"
85 * we should pass read_apic_id() here
86 * but since NOOP suppose APIC ID = 0
87 * lets save a few cycles
88 */
89 return physid_isset(0, phys_cpu_present_map);
90}
91
92static const struct cpumask *noop_target_cpus(void)
93{
94 /* only BSP here */
95 return cpumask_of(0);
96}
97
98static unsigned long noop_check_apicid_used(physid_mask_t *map, int apicid)
99{
100 return physid_isset(apicid, *map);
101}
102
103static unsigned long noop_check_apicid_present(int bit)
104{
105 return physid_isset(bit, phys_cpu_present_map);
106}
107
108static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask)
109{
110 if (cpu != 0)
111 pr_warning("APIC: Vector allocated for non-BSP cpu\n");
112 cpumask_clear(retmask);
113 cpumask_set_cpu(cpu, retmask);
114}
115
116int noop_apicid_to_node(int logical_apicid)
117{
118 /* we're always on node 0 */
119 return 0;
120}
121
122static u32 noop_apic_read(u32 reg)
123{
124 WARN_ON_ONCE((cpu_has_apic && !disable_apic));
125 return 0;
126}
127
128static void noop_apic_write(u32 reg, u32 v)
129{
130 WARN_ON_ONCE(cpu_has_apic && !disable_apic);
131}
132
133struct apic apic_noop = {
134 .name = "noop",
135 .probe = noop_probe,
136 .acpi_madt_oem_check = NULL,
137
138 .apic_id_registered = noop_apic_id_registered,
139
140 .irq_delivery_mode = dest_LowestPrio,
141 /* logical delivery broadcast to all CPUs: */
142 .irq_dest_mode = 1,
143
144 .target_cpus = noop_target_cpus,
145 .disable_esr = 0,
146 .dest_logical = APIC_DEST_LOGICAL,
147 .check_apicid_used = noop_check_apicid_used,
148 .check_apicid_present = noop_check_apicid_present,
149
150 .vector_allocation_domain = noop_vector_allocation_domain,
151 .init_apic_ldr = noop_init_apic_ldr,
152
153 .ioapic_phys_id_map = default_ioapic_phys_id_map,
154 .setup_apic_routing = NULL,
155 .multi_timer_check = NULL,
156 .apicid_to_node = noop_apicid_to_node,
157
158 .cpu_to_logical_apicid = noop_cpu_to_logical_apicid,
159 .cpu_present_to_apicid = default_cpu_present_to_apicid,
160 .apicid_to_cpu_present = physid_set_mask_of_physid,
161
162 .setup_portio_remap = NULL,
163 .check_phys_apicid_present = default_check_phys_apicid_present,
164 .enable_apic_mode = NULL,
165
166 .phys_pkg_id = noop_phys_pkg_id,
167
168 .mps_oem_check = NULL,
169
170 .get_apic_id = noop_get_apic_id,
171 .set_apic_id = NULL,
172 .apic_id_mask = 0x0F << 24,
173
174 .cpu_mask_to_apicid = default_cpu_mask_to_apicid,
175 .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
176
177 .send_IPI_mask = noop_send_IPI_mask,
178 .send_IPI_mask_allbutself = noop_send_IPI_mask_allbutself,
179 .send_IPI_allbutself = noop_send_IPI_allbutself,
180 .send_IPI_all = noop_send_IPI_all,
181 .send_IPI_self = noop_send_IPI_self,
182
183 .wakeup_secondary_cpu = noop_wakeup_secondary_cpu,
184
185 /* should be safe */
186 .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
187 .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
188
189 .wait_for_init_deassert = NULL,
190
191 .smp_callin_clear_local_apic = NULL,
192 .inquire_remote_apic = NULL,
193
194 .read = noop_apic_read,
195 .write = noop_apic_write,
196 .icr_read = noop_apic_icr_read,
197 .icr_write = noop_apic_icr_write,
198 .wait_icr_idle = noop_apic_wait_icr_idle,
199 .safe_wait_icr_idle = noop_safe_apic_wait_icr_idle,
200};
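With apic_noop in place, apic_disable() (see the apic.c hunk above) simply repoints the global apic at this driver instead of patching individual read/write callbacks. A rough sketch of what callers can then rely on; illustrative only, example_mask_timer() is not a function in this patch:

	/* Sketch: no special-casing needed once apic == &apic_noop. */
	static void example_mask_timer(void)
	{
		u32 v = apic->read(APIC_LVTT);	/* returns 0; warns only if a real,
						 * enabled local APIC is present */
		apic->write(APIC_LVTT, v | APIC_LVT_MASKED);	/* silently ignored */
		apic->send_IPI_self(LOCAL_TIMER_VECTOR);	/* no-op */
	}
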
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index 77a06413b6b2..cb804c5091b9 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -35,7 +35,7 @@ static const struct cpumask *bigsmp_target_cpus(void)
 #endif
 }
 
-static unsigned long bigsmp_check_apicid_used(physid_mask_t bitmap, int apicid)
+static unsigned long bigsmp_check_apicid_used(physid_mask_t *map, int apicid)
 {
 	return 0;
 }
@@ -93,11 +93,6 @@ static int bigsmp_cpu_present_to_apicid(int mps_cpu)
 	return BAD_APICID;
 }
 
-static physid_mask_t bigsmp_apicid_to_cpu_present(int phys_apicid)
-{
-	return physid_mask_of_physid(phys_apicid);
-}
-
 /* Mapping from cpu number to logical apicid */
 static inline int bigsmp_cpu_to_logical_apicid(int cpu)
 {
@@ -106,10 +101,10 @@ static inline int bigsmp_cpu_to_logical_apicid(int cpu)
 	return cpu_physical_id(cpu);
 }
 
-static physid_mask_t bigsmp_ioapic_phys_id_map(physid_mask_t phys_map)
+static void bigsmp_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
 {
 	/* For clustered we don't have a good way to do this yet - hack */
-	return physids_promote(0xFFL);
+	physids_promote(0xFFL, retmap);
 }
 
 static int bigsmp_check_phys_apicid_present(int phys_apicid)
@@ -136,10 +131,7 @@ static unsigned int bigsmp_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
 		if (cpumask_test_cpu(cpu, cpu_online_mask))
 			break;
 	}
-	if (cpu < nr_cpu_ids)
-		return bigsmp_cpu_to_logical_apicid(cpu);
-
-	return BAD_APICID;
+	return bigsmp_cpu_to_logical_apicid(cpu);
 }
 
 static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb)
@@ -230,7 +222,7 @@ struct apic apic_bigsmp = {
 	.apicid_to_node			= bigsmp_apicid_to_node,
 	.cpu_to_logical_apicid		= bigsmp_cpu_to_logical_apicid,
 	.cpu_present_to_apicid		= bigsmp_cpu_present_to_apicid,
-	.apicid_to_cpu_present		= bigsmp_apicid_to_cpu_present,
+	.apicid_to_cpu_present		= physid_set_mask_of_physid,
 	.setup_portio_remap		= NULL,
 	.check_phys_apicid_present	= bigsmp_check_phys_apicid_present,
 	.enable_apic_mode		= NULL,
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 89174f847b49..dd2b5f264643 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -27,6 +27,9 @@
  *
  * http://www.unisys.com
  */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/notifier.h>
 #include <linux/spinlock.h>
 #include <linux/cpumask.h>
@@ -223,9 +226,9 @@ static int parse_unisys_oem(char *oemptr)
 			mip_addr = val;
 			mip = (struct mip_reg *)val;
 			mip_reg = __va(mip);
-			pr_debug("es7000_mipcfg: host_reg = 0x%lx \n",
+			pr_debug("host_reg = 0x%lx\n",
 				 (unsigned long)host_reg);
-			pr_debug("es7000_mipcfg: mip_reg = 0x%lx \n",
+			pr_debug("mip_reg = 0x%lx\n",
 				 (unsigned long)mip_reg);
 			success++;
 			break;
@@ -401,7 +404,7 @@ static void es7000_enable_apic_mode(void)
 	if (!es7000_plat)
 		return;
 
-	printk(KERN_INFO "ES7000: Enabling APIC mode.\n");
+	pr_info("Enabling APIC mode.\n");
 	memset(&es7000_mip_reg, 0, sizeof(struct mip_reg));
 	es7000_mip_reg.off_0x00 = MIP_SW_APIC;
 	es7000_mip_reg.off_0x38 = MIP_VALID;
@@ -466,11 +469,11 @@ static const struct cpumask *es7000_target_cpus(void)
 	return cpumask_of(smp_processor_id());
 }
 
-static unsigned long
-es7000_check_apicid_used(physid_mask_t bitmap, int apicid)
+static unsigned long es7000_check_apicid_used(physid_mask_t *map, int apicid)
 {
 	return 0;
 }
+
 static unsigned long es7000_check_apicid_present(int bit)
 {
 	return physid_isset(bit, phys_cpu_present_map);
@@ -514,8 +517,7 @@ static void es7000_setup_apic_routing(void)
 {
 	int apic = per_cpu(x86_bios_cpu_apicid, smp_processor_id());
 
-	printk(KERN_INFO
-	"Enabling APIC mode: %s. Using %d I/O APICs, target cpus %lx\n",
+	pr_info("Enabling APIC mode: %s. Using %d I/O APICs, target cpus %lx\n",
 		(apic_version[apic] == 0x14) ?
 		"Physical Cluster" : "Logical Cluster",
 		nr_ioapics, cpumask_bits(es7000_target_cpus())[0]);
@@ -539,14 +541,10 @@ static int es7000_cpu_present_to_apicid(int mps_cpu)
 
 static int cpu_id;
 
-static physid_mask_t es7000_apicid_to_cpu_present(int phys_apicid)
+static void es7000_apicid_to_cpu_present(int phys_apicid, physid_mask_t *retmap)
 {
-	physid_mask_t mask;
-
-	mask = physid_mask_of_physid(cpu_id);
+	physid_set_mask_of_physid(cpu_id, retmap);
 	++cpu_id;
-
-	return mask;
 }
 
 /* Mapping from cpu number to logical apicid */
@@ -561,10 +559,10 @@ static int es7000_cpu_to_logical_apicid(int cpu)
 #endif
 }
 
-static physid_mask_t es7000_ioapic_phys_id_map(physid_mask_t phys_map)
+static void es7000_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
 {
 	/* For clustered we don't have a good way to do this yet - hack */
-	return physids_promote(0xff);
+	physids_promote(0xFFL, retmap);
 }
 
 static int es7000_check_phys_apicid_present(int cpu_physical_apicid)
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index dc69f28489f5..53243ca7816d 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -60,8 +60,6 @@
 #include <asm/irq_remapping.h>
 #include <asm/hpet.h>
 #include <asm/hw_irq.h>
-#include <asm/uv/uv_hub.h>
-#include <asm/uv/uv_irq.h>
 
 #include <asm/apic.h>
 
@@ -140,20 +138,6 @@ static struct irq_pin_list *get_one_free_irq_2_pin(int node)
 	return pin;
 }
 
-/*
- * This is performance-critical, we want to do it O(1)
- *
- * Most irqs are mapped 1:1 with pins.
- */
-struct irq_cfg {
-	struct irq_pin_list *irq_2_pin;
-	cpumask_var_t domain;
-	cpumask_var_t old_domain;
-	unsigned move_cleanup_count;
-	u8 vector;
-	u8 move_in_progress : 1;
-};
-
 /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
 #ifdef CONFIG_SPARSE_IRQ
 static struct irq_cfg irq_cfgx[] = {
@@ -209,7 +193,7 @@ int __init arch_early_irq_init(void)
209} 193}
210 194
211#ifdef CONFIG_SPARSE_IRQ 195#ifdef CONFIG_SPARSE_IRQ
212static struct irq_cfg *irq_cfg(unsigned int irq) 196struct irq_cfg *irq_cfg(unsigned int irq)
213{ 197{
214 struct irq_cfg *cfg = NULL; 198 struct irq_cfg *cfg = NULL;
215 struct irq_desc *desc; 199 struct irq_desc *desc;
@@ -361,7 +345,7 @@ void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc)
361/* end for move_irq_desc */ 345/* end for move_irq_desc */
362 346
363#else 347#else
364static struct irq_cfg *irq_cfg(unsigned int irq) 348struct irq_cfg *irq_cfg(unsigned int irq)
365{ 349{
366 return irq < nr_irqs ? irq_cfgx + irq : NULL; 350 return irq < nr_irqs ? irq_cfgx + irq : NULL;
367} 351}
@@ -555,23 +539,41 @@ static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node,
 	add_pin_to_irq_node(cfg, node, newapic, newpin);
 }
 
+static void __io_apic_modify_irq(struct irq_pin_list *entry,
+				 int mask_and, int mask_or,
+				 void (*final)(struct irq_pin_list *entry))
+{
+	unsigned int reg, pin;
+
+	pin = entry->pin;
+	reg = io_apic_read(entry->apic, 0x10 + pin * 2);
+	reg &= mask_and;
+	reg |= mask_or;
+	io_apic_modify(entry->apic, 0x10 + pin * 2, reg);
+	if (final)
+		final(entry);
+}
+
 static void io_apic_modify_irq(struct irq_cfg *cfg,
 			       int mask_and, int mask_or,
 			       void (*final)(struct irq_pin_list *entry))
 {
-	int pin;
 	struct irq_pin_list *entry;
 
-	for_each_irq_pin(entry, cfg->irq_2_pin) {
-		unsigned int reg;
-		pin = entry->pin;
-		reg = io_apic_read(entry->apic, 0x10 + pin * 2);
-		reg &= mask_and;
-		reg |= mask_or;
-		io_apic_modify(entry->apic, 0x10 + pin * 2, reg);
-		if (final)
-			final(entry);
-	}
+	for_each_irq_pin(entry, cfg->irq_2_pin)
+		__io_apic_modify_irq(entry, mask_and, mask_or, final);
+}
+
+static void __mask_and_edge_IO_APIC_irq(struct irq_pin_list *entry)
+{
+	__io_apic_modify_irq(entry, ~IO_APIC_REDIR_LEVEL_TRIGGER,
+			     IO_APIC_REDIR_MASKED, NULL);
+}
+
+static void __unmask_and_level_IO_APIC_irq(struct irq_pin_list *entry)
+{
+	__io_apic_modify_irq(entry, ~IO_APIC_REDIR_MASKED,
+			     IO_APIC_REDIR_LEVEL_TRIGGER, NULL);
 }
 
 static void __unmask_IO_APIC_irq(struct irq_cfg *cfg)
@@ -595,18 +597,6 @@ static void __mask_IO_APIC_irq(struct irq_cfg *cfg)
 	io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
 }
 
-static void __mask_and_edge_IO_APIC_irq(struct irq_cfg *cfg)
-{
-	io_apic_modify_irq(cfg, ~IO_APIC_REDIR_LEVEL_TRIGGER,
-			IO_APIC_REDIR_MASKED, NULL);
-}
-
-static void __unmask_and_level_IO_APIC_irq(struct irq_cfg *cfg)
-{
-	io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED,
-			IO_APIC_REDIR_LEVEL_TRIGGER, NULL);
-}
-
 static void mask_IO_APIC_irq_desc(struct irq_desc *desc)
 {
 	struct irq_cfg *cfg = desc->chip_data;
@@ -1177,7 +1167,7 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
 	int cpu, err;
 	cpumask_var_t tmp_mask;
 
-	if ((cfg->move_in_progress) || cfg->move_cleanup_count)
+	if (cfg->move_in_progress)
 		return -EBUSY;
 
 	if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC))
@@ -1237,8 +1227,7 @@ next:
 	return err;
 }
 
-static int
-assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
+int assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
 {
 	int err;
 	unsigned long flags;
@@ -1599,9 +1588,6 @@ __apicdebuginit(void) print_IO_APIC(void)
 	struct irq_desc *desc;
 	unsigned int irq;
 
-	if (apic_verbosity == APIC_QUIET)
-		return;
-
 	printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
 	for (i = 0; i < nr_ioapics; i++)
 		printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
@@ -1708,9 +1694,6 @@ __apicdebuginit(void) print_APIC_field(int base)
 {
 	int i;
 
-	if (apic_verbosity == APIC_QUIET)
-		return;
-
 	printk(KERN_DEBUG);
 
 	for (i = 0; i < 8; i++)
@@ -1724,9 +1707,6 @@ __apicdebuginit(void) print_local_APIC(void *dummy)
 	unsigned int i, v, ver, maxlvt;
 	u64 icr;
 
-	if (apic_verbosity == APIC_QUIET)
-		return;
-
 	printk(KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
 		smp_processor_id(), hard_smp_processor_id());
 	v = apic_read(APIC_ID);
@@ -1824,13 +1804,19 @@ __apicdebuginit(void) print_local_APIC(void *dummy)
 	printk("\n");
 }
 
-__apicdebuginit(void) print_all_local_APICs(void)
+__apicdebuginit(void) print_local_APICs(int maxcpu)
 {
 	int cpu;
 
+	if (!maxcpu)
+		return;
+
 	preempt_disable();
-	for_each_online_cpu(cpu)
+	for_each_online_cpu(cpu) {
+		if (cpu >= maxcpu)
+			break;
 		smp_call_function_single(cpu, print_local_APIC, NULL, 1);
+	}
 	preempt_enable();
 }
 
@@ -1839,7 +1825,7 @@ __apicdebuginit(void) print_PIC(void)
 	unsigned int v;
 	unsigned long flags;
 
-	if (apic_verbosity == APIC_QUIET || !nr_legacy_irqs)
+	if (!nr_legacy_irqs)
 		return;
 
 	printk(KERN_DEBUG "\nprinting PIC contents\n");
@@ -1866,21 +1852,41 @@ __apicdebuginit(void) print_PIC(void)
 	printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
 }
 
-__apicdebuginit(int) print_all_ICs(void)
+static int __initdata show_lapic = 1;
+static __init int setup_show_lapic(char *arg)
 {
+	int num = -1;
+
+	if (strcmp(arg, "all") == 0) {
+		show_lapic = CONFIG_NR_CPUS;
+	} else {
+		get_option(&arg, &num);
+		if (num >= 0)
+			show_lapic = num;
+	}
+
+	return 1;
+}
+__setup("show_lapic=", setup_show_lapic);
+
+__apicdebuginit(int) print_ICs(void)
+{
+	if (apic_verbosity == APIC_QUIET)
+		return 0;
+
 	print_PIC();
 
 	/* don't print out if apic is not there */
 	if (!cpu_has_apic && !apic_from_smp_config())
 		return 0;
 
-	print_all_local_APICs();
+	print_local_APICs(show_lapic);
 	print_IO_APIC();
 
 	return 0;
 }
 
-fs_initcall(print_all_ICs);
+fs_initcall(print_ICs);
 
 
 /* Where if anywhere is the i8259 connect in external int mode */
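The new show_lapic= early parameter caps how many CPUs print_local_APICs() will dump (default 1, i.e. the boot CPU only). Assuming the usual apic=debug/apic=verbose requirement implied by the APIC_QUIET check above, command-line usage would look roughly like:

	apic=debug show_lapic=2		# dump local APIC state for CPUs 0 and 1 only
	apic=debug show_lapic=all	# dump every online CPU, up to CONFIG_NR_CPUS
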
@@ -2031,7 +2037,7 @@ void __init setup_ioapic_ids_from_mpc(void)
 	 * This is broken; anything with a real cpu count has to
 	 * circumvent this idiocy regardless.
 	 */
-	phys_id_present_map = apic->ioapic_phys_id_map(phys_cpu_present_map);
+	apic->ioapic_phys_id_map(&phys_cpu_present_map, &phys_id_present_map);
 
 	/*
 	 * Set the IOAPIC ID to the value stored in the MPC table.
@@ -2058,7 +2064,7 @@ void __init setup_ioapic_ids_from_mpc(void)
 		 * system must have a unique ID or we get lots of nice
 		 * 'stuck on smp_invalidate_needed IPI wait' messages.
 		 */
-		if (apic->check_apicid_used(phys_id_present_map,
+		if (apic->check_apicid_used(&phys_id_present_map,
 					    mp_ioapics[apic_id].apicid)) {
 			printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
 				apic_id, mp_ioapics[apic_id].apicid);
@@ -2073,7 +2079,7 @@ void __init setup_ioapic_ids_from_mpc(void)
 			mp_ioapics[apic_id].apicid = i;
 		} else {
 			physid_mask_t tmp;
-			tmp = apic->apicid_to_cpu_present(mp_ioapics[apic_id].apicid);
+			apic->apicid_to_cpu_present(mp_ioapics[apic_id].apicid, &tmp);
 			apic_printk(APIC_VERBOSE, "Setting %d in the "
 					"phys_id_present_map\n",
 					mp_ioapics[apic_id].apicid);
@@ -2228,20 +2234,16 @@ static int ioapic_retrigger_irq(unsigned int irq)
2228 */ 2234 */
2229 2235
2230#ifdef CONFIG_SMP 2236#ifdef CONFIG_SMP
2231static void send_cleanup_vector(struct irq_cfg *cfg) 2237void send_cleanup_vector(struct irq_cfg *cfg)
2232{ 2238{
2233 cpumask_var_t cleanup_mask; 2239 cpumask_var_t cleanup_mask;
2234 2240
2235 if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) { 2241 if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) {
2236 unsigned int i; 2242 unsigned int i;
2237 cfg->move_cleanup_count = 0;
2238 for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
2239 cfg->move_cleanup_count++;
2240 for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) 2243 for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
2241 apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR); 2244 apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR);
2242 } else { 2245 } else {
2243 cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask); 2246 cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask);
2244 cfg->move_cleanup_count = cpumask_weight(cleanup_mask);
2245 apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); 2247 apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
2246 free_cpumask_var(cleanup_mask); 2248 free_cpumask_var(cleanup_mask);
2247 } 2249 }
@@ -2272,31 +2274,30 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq
 	}
 }
 
-static int
-assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask);
-
 /*
  * Either sets desc->affinity to a valid value, and returns
- * ->cpu_mask_to_apicid of that, or returns BAD_APICID and
+ * ->cpu_mask_to_apicid of that in dest_id, or returns -1 and
  * leaves desc->affinity untouched.
  */
-static unsigned int
-set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask)
+unsigned int
+set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask,
+		  unsigned int *dest_id)
 {
 	struct irq_cfg *cfg;
 	unsigned int irq;
 
 	if (!cpumask_intersects(mask, cpu_online_mask))
-		return BAD_APICID;
+		return -1;
 
 	irq = desc->irq;
 	cfg = desc->chip_data;
 	if (assign_irq_vector(irq, cfg, mask))
-		return BAD_APICID;
+		return -1;
 
 	cpumask_copy(desc->affinity, mask);
 
-	return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain);
+	*dest_id = apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain);
+	return 0;
 }
 
 static int
@@ -2312,12 +2313,11 @@ set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
 	cfg = desc->chip_data;
 
 	spin_lock_irqsave(&ioapic_lock, flags);
-	dest = set_desc_affinity(desc, mask);
-	if (dest != BAD_APICID) {
+	ret = set_desc_affinity(desc, mask, &dest);
+	if (!ret) {
 		/* Only the high 8 bits are valid. */
 		dest = SET_APIC_LOGICAL_ID(dest);
 		__target_IO_APIC_irq(irq, dest, cfg);
-		ret = 0;
 	}
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 
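The set_desc_affinity() change replaces the BAD_APICID sentinel with an error return plus an output parameter; every ->set_affinity handler converted later in this diff follows the same pattern. Condensed before/after, drawn entirely from the hunks in this patch:

	/* old convention */
	dest = set_desc_affinity(desc, mask);
	if (dest == BAD_APICID)
		return -1;

	/* new convention */
	if (set_desc_affinity(desc, mask, &dest))
		return -1;
	/* dest now holds apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain) */
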
@@ -2432,8 +2432,13 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
 			continue;
 
 		cfg = irq_cfg(irq);
-		spin_lock(&desc->lock);
-		if (!cfg->move_cleanup_count)
+		raw_spin_lock(&desc->lock);
+
+		/*
+		 * Check if the irq migration is in progress. If so, we
+		 * haven't received the cleanup request yet for this irq.
+		 */
+		if (cfg->move_in_progress)
 			goto unlock;
 
 		if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
@@ -2452,29 +2457,40 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
 			goto unlock;
 		}
 		__get_cpu_var(vector_irq)[vector] = -1;
-		cfg->move_cleanup_count--;
 unlock:
-		spin_unlock(&desc->lock);
+		raw_spin_unlock(&desc->lock);
 	}
 
 	irq_exit();
 }
 
-static void irq_complete_move(struct irq_desc **descp)
+static void __irq_complete_move(struct irq_desc **descp, unsigned vector)
 {
 	struct irq_desc *desc = *descp;
 	struct irq_cfg *cfg = desc->chip_data;
-	unsigned vector, me;
+	unsigned me;
 
 	if (likely(!cfg->move_in_progress))
 		return;
 
-	vector = ~get_irq_regs()->orig_ax;
 	me = smp_processor_id();
 
 	if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
 		send_cleanup_vector(cfg);
 }
+
+static void irq_complete_move(struct irq_desc **descp)
+{
+	__irq_complete_move(descp, ~get_irq_regs()->orig_ax);
+}
+
+void irq_force_complete_move(int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+	struct irq_cfg *cfg = desc->chip_data;
+
+	__irq_complete_move(&desc, cfg->vector);
+}
 #else
 static inline void irq_complete_move(struct irq_desc **descp) {}
 #endif
@@ -2490,6 +2506,59 @@ static void ack_apic_edge(unsigned int irq)
2490 2506
2491atomic_t irq_mis_count; 2507atomic_t irq_mis_count;
2492 2508
2509/*
2510 * IO-APIC versions below 0x20 don't support EOI register.
2511 * For the record, here is the information about various versions:
2512 * 0Xh 82489DX
2513 * 1Xh I/OAPIC or I/O(x)APIC which are not PCI 2.2 Compliant
2514 * 2Xh I/O(x)APIC which is PCI 2.2 Compliant
2515 * 30h-FFh Reserved
2516 *
2517 * Some of the Intel ICH Specs (ICH2 to ICH5) documents the io-apic
2518 * version as 0x2. This is an error with documentation and these ICH chips
2519 * use io-apic's of version 0x20.
2520 *
2521 * For IO-APIC's with EOI register, we use that to do an explicit EOI.
2522 * Otherwise, we simulate the EOI message manually by changing the trigger
2523 * mode to edge and then back to level, with RTE being masked during this.
2524*/
2525static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
2526{
2527 struct irq_pin_list *entry;
2528
2529 for_each_irq_pin(entry, cfg->irq_2_pin) {
2530 if (mp_ioapics[entry->apic].apicver >= 0x20) {
2531 /*
2532 * Intr-remapping uses pin number as the virtual vector
2533 * in the RTE. Actual vector is programmed in
2534 * intr-remapping table entry. Hence for the io-apic
2535 * EOI we use the pin number.
2536 */
2537 if (irq_remapped(irq))
2538 io_apic_eoi(entry->apic, entry->pin);
2539 else
2540 io_apic_eoi(entry->apic, cfg->vector);
2541 } else {
2542 __mask_and_edge_IO_APIC_irq(entry);
2543 __unmask_and_level_IO_APIC_irq(entry);
2544 }
2545 }
2546}
2547
2548static void eoi_ioapic_irq(struct irq_desc *desc)
2549{
2550 struct irq_cfg *cfg;
2551 unsigned long flags;
2552 unsigned int irq;
2553
2554 irq = desc->irq;
2555 cfg = desc->chip_data;
2556
2557 spin_lock_irqsave(&ioapic_lock, flags);
2558 __eoi_ioapic_irq(irq, cfg);
2559 spin_unlock_irqrestore(&ioapic_lock, flags);
2560}
2561
2493static void ack_apic_level(unsigned int irq) 2562static void ack_apic_level(unsigned int irq)
2494{ 2563{
2495 struct irq_desc *desc = irq_to_desc(irq); 2564 struct irq_desc *desc = irq_to_desc(irq);
@@ -2525,6 +2594,19 @@ static void ack_apic_level(unsigned int irq)
2525 * level-triggered interrupt. We mask the source for the time of the 2594 * level-triggered interrupt. We mask the source for the time of the
2526 * operation to prevent an edge-triggered interrupt escaping meanwhile. 2595 * operation to prevent an edge-triggered interrupt escaping meanwhile.
2527 * The idea is from Manfred Spraul. --macro 2596 * The idea is from Manfred Spraul. --macro
2597 *
2598 * Also in the case when cpu goes offline, fixup_irqs() will forward
2599 * any unhandled interrupt on the offlined cpu to the new cpu
2600 * destination that is handling the corresponding interrupt. This
2601 * interrupt forwarding is done via IPI's. Hence, in this case also
2602 * level-triggered io-apic interrupt will be seen as an edge
2603 * interrupt in the IRR. And we can't rely on the cpu's EOI
2604 * to be broadcasted to the IO-APIC's which will clear the remoteIRR
2605 * corresponding to the level-triggered interrupt. Hence on IO-APIC's
2606 * supporting EOI register, we do an explicit EOI to clear the
2607 * remote IRR and on IO-APIC's which don't have an EOI register,
2608 * we use the above logic (mask+edge followed by unmask+level) from
2609 * Manfred Spraul to clear the remote IRR.
2528 */ 2610 */
2529 cfg = desc->chip_data; 2611 cfg = desc->chip_data;
2530 i = cfg->vector; 2612 i = cfg->vector;
@@ -2536,6 +2618,19 @@ static void ack_apic_level(unsigned int irq)
2536 */ 2618 */
2537 ack_APIC_irq(); 2619 ack_APIC_irq();
2538 2620
2621 /*
2622 * Tail end of clearing remote IRR bit (either by delivering the EOI
2623 * message via io-apic EOI register write or simulating it using
2624 * mask+edge followed by unnask+level logic) manually when the
2625 * level triggered interrupt is seen as the edge triggered interrupt
2626 * at the cpu.
2627 */
2628 if (!(v & (1 << (i & 0x1f)))) {
2629 atomic_inc(&irq_mis_count);
2630
2631 eoi_ioapic_irq(desc);
2632 }
2633
2539 /* Now we can move and renable the irq */ 2634 /* Now we can move and renable the irq */
2540 if (unlikely(do_unmask_irq)) { 2635 if (unlikely(do_unmask_irq)) {
2541 /* Only migrate the irq if the ack has been received. 2636 /* Only migrate the irq if the ack has been received.
@@ -2569,41 +2664,9 @@ static void ack_apic_level(unsigned int irq)
2569 move_masked_irq(irq); 2664 move_masked_irq(irq);
2570 unmask_IO_APIC_irq_desc(desc); 2665 unmask_IO_APIC_irq_desc(desc);
2571 } 2666 }
2572
2573 /* Tail end of version 0x11 I/O APIC bug workaround */
2574 if (!(v & (1 << (i & 0x1f)))) {
2575 atomic_inc(&irq_mis_count);
2576 spin_lock(&ioapic_lock);
2577 __mask_and_edge_IO_APIC_irq(cfg);
2578 __unmask_and_level_IO_APIC_irq(cfg);
2579 spin_unlock(&ioapic_lock);
2580 }
2581} 2667}
2582 2668
2583#ifdef CONFIG_INTR_REMAP 2669#ifdef CONFIG_INTR_REMAP
2584static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
2585{
2586 struct irq_pin_list *entry;
2587
2588 for_each_irq_pin(entry, cfg->irq_2_pin)
2589 io_apic_eoi(entry->apic, entry->pin);
2590}
2591
2592static void
2593eoi_ioapic_irq(struct irq_desc *desc)
2594{
2595 struct irq_cfg *cfg;
2596 unsigned long flags;
2597 unsigned int irq;
2598
2599 irq = desc->irq;
2600 cfg = desc->chip_data;
2601
2602 spin_lock_irqsave(&ioapic_lock, flags);
2603 __eoi_ioapic_irq(irq, cfg);
2604 spin_unlock_irqrestore(&ioapic_lock, flags);
2605}
2606
2607static void ir_ack_apic_edge(unsigned int irq) 2670static void ir_ack_apic_edge(unsigned int irq)
2608{ 2671{
2609 ack_APIC_irq(); 2672 ack_APIC_irq();
@@ -3157,6 +3220,7 @@ unsigned int create_irq_nr(unsigned int irq_want, int node)
3157 continue; 3220 continue;
3158 3221
3159 desc_new = move_irq_desc(desc_new, node); 3222 desc_new = move_irq_desc(desc_new, node);
3223 cfg_new = desc_new->chip_data;
3160 3224
3161 if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0) 3225 if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0)
3162 irq = new; 3226 irq = new;
@@ -3211,7 +3275,8 @@ void destroy_irq(unsigned int irq)
3211 * MSI message composition 3275 * MSI message composition
3212 */ 3276 */
3213#ifdef CONFIG_PCI_MSI 3277#ifdef CONFIG_PCI_MSI
3214static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) 3278static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
3279 struct msi_msg *msg, u8 hpet_id)
3215{ 3280{
3216 struct irq_cfg *cfg; 3281 struct irq_cfg *cfg;
3217 int err; 3282 int err;
@@ -3245,7 +3310,10 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
3245 irte.dest_id = IRTE_DEST(dest); 3310 irte.dest_id = IRTE_DEST(dest);
3246 3311
3247 /* Set source-id of interrupt request */ 3312 /* Set source-id of interrupt request */
3248 set_msi_sid(&irte, pdev); 3313 if (pdev)
3314 set_msi_sid(&irte, pdev);
3315 else
3316 set_hpet_sid(&irte, hpet_id);
3249 3317
3250 modify_irte(irq, &irte); 3318 modify_irte(irq, &irte);
3251 3319
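msi_compose_msg() now carries an extra hpet_id argument so that interrupt remapping can program an HPET source-id when there is no PCI device involved; the PCI and DMAR callers in this diff simply pass -1. The two call styles, as they appear in the later hunks:

	ret = msi_compose_msg(dev, irq, &msg, -1);	/* PCI MSI: source-id taken from the pci_dev */
	ret = msi_compose_msg(NULL, irq, &msg, id);	/* HPET MSI: source-id taken from the HPET block id */
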
@@ -3291,8 +3359,7 @@ static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
3291 struct msi_msg msg; 3359 struct msi_msg msg;
3292 unsigned int dest; 3360 unsigned int dest;
3293 3361
3294 dest = set_desc_affinity(desc, mask); 3362 if (set_desc_affinity(desc, mask, &dest))
3295 if (dest == BAD_APICID)
3296 return -1; 3363 return -1;
3297 3364
3298 cfg = desc->chip_data; 3365 cfg = desc->chip_data;
@@ -3324,8 +3391,7 @@ ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
3324 if (get_irte(irq, &irte)) 3391 if (get_irte(irq, &irte))
3325 return -1; 3392 return -1;
3326 3393
3327 dest = set_desc_affinity(desc, mask); 3394 if (set_desc_affinity(desc, mask, &dest))
3328 if (dest == BAD_APICID)
3329 return -1; 3395 return -1;
3330 3396
3331 irte.vector = cfg->vector; 3397 irte.vector = cfg->vector;
@@ -3410,7 +3476,7 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
3410 int ret; 3476 int ret;
3411 struct msi_msg msg; 3477 struct msi_msg msg;
3412 3478
3413 ret = msi_compose_msg(dev, irq, &msg); 3479 ret = msi_compose_msg(dev, irq, &msg, -1);
3414 if (ret < 0) 3480 if (ret < 0)
3415 return ret; 3481 return ret;
3416 3482
@@ -3507,8 +3573,7 @@ static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3507 struct msi_msg msg; 3573 struct msi_msg msg;
3508 unsigned int dest; 3574 unsigned int dest;
3509 3575
3510 dest = set_desc_affinity(desc, mask); 3576 if (set_desc_affinity(desc, mask, &dest))
3511 if (dest == BAD_APICID)
3512 return -1; 3577 return -1;
3513 3578
3514 cfg = desc->chip_data; 3579 cfg = desc->chip_data;
@@ -3543,7 +3608,7 @@ int arch_setup_dmar_msi(unsigned int irq)
3543 int ret; 3608 int ret;
3544 struct msi_msg msg; 3609 struct msi_msg msg;
3545 3610
3546 ret = msi_compose_msg(NULL, irq, &msg); 3611 ret = msi_compose_msg(NULL, irq, &msg, -1);
3547 if (ret < 0) 3612 if (ret < 0)
3548 return ret; 3613 return ret;
3549 dmar_msi_write(irq, &msg); 3614 dmar_msi_write(irq, &msg);
@@ -3563,8 +3628,7 @@ static int hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3563 struct msi_msg msg; 3628 struct msi_msg msg;
3564 unsigned int dest; 3629 unsigned int dest;
3565 3630
3566 dest = set_desc_affinity(desc, mask); 3631 if (set_desc_affinity(desc, mask, &dest))
3567 if (dest == BAD_APICID)
3568 return -1; 3632 return -1;
3569 3633
3570 cfg = desc->chip_data; 3634 cfg = desc->chip_data;
@@ -3583,6 +3647,19 @@ static int hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3583 3647
3584#endif /* CONFIG_SMP */ 3648#endif /* CONFIG_SMP */
3585 3649
3650static struct irq_chip ir_hpet_msi_type = {
3651 .name = "IR-HPET_MSI",
3652 .unmask = hpet_msi_unmask,
3653 .mask = hpet_msi_mask,
3654#ifdef CONFIG_INTR_REMAP
3655 .ack = ir_ack_apic_edge,
3656#ifdef CONFIG_SMP
3657 .set_affinity = ir_set_msi_irq_affinity,
3658#endif
3659#endif
3660 .retrigger = ioapic_retrigger_irq,
3661};
3662
3586static struct irq_chip hpet_msi_type = { 3663static struct irq_chip hpet_msi_type = {
3587 .name = "HPET_MSI", 3664 .name = "HPET_MSI",
3588 .unmask = hpet_msi_unmask, 3665 .unmask = hpet_msi_unmask,
@@ -3594,20 +3671,36 @@ static struct irq_chip hpet_msi_type = {
3594 .retrigger = ioapic_retrigger_irq, 3671 .retrigger = ioapic_retrigger_irq,
3595}; 3672};
3596 3673
3597int arch_setup_hpet_msi(unsigned int irq) 3674int arch_setup_hpet_msi(unsigned int irq, unsigned int id)
3598{ 3675{
3599 int ret; 3676 int ret;
3600 struct msi_msg msg; 3677 struct msi_msg msg;
3601 struct irq_desc *desc = irq_to_desc(irq); 3678 struct irq_desc *desc = irq_to_desc(irq);
3602 3679
3603 ret = msi_compose_msg(NULL, irq, &msg); 3680 if (intr_remapping_enabled) {
3681 struct intel_iommu *iommu = map_hpet_to_ir(id);
3682 int index;
3683
3684 if (!iommu)
3685 return -1;
3686
3687 index = alloc_irte(iommu, irq, 1);
3688 if (index < 0)
3689 return -1;
3690 }
3691
3692 ret = msi_compose_msg(NULL, irq, &msg, id);
3604 if (ret < 0) 3693 if (ret < 0)
3605 return ret; 3694 return ret;
3606 3695
3607 hpet_msi_write(irq, &msg); 3696 hpet_msi_write(irq, &msg);
3608 desc->status |= IRQ_MOVE_PCNTXT; 3697 desc->status |= IRQ_MOVE_PCNTXT;
3609 set_irq_chip_and_handler_name(irq, &hpet_msi_type, handle_edge_irq, 3698 if (irq_remapped(irq))
3610 "edge"); 3699 set_irq_chip_and_handler_name(irq, &ir_hpet_msi_type,
3700 handle_edge_irq, "edge");
3701 else
3702 set_irq_chip_and_handler_name(irq, &hpet_msi_type,
3703 handle_edge_irq, "edge");
3611 3704
3612 return 0; 3705 return 0;
3613} 3706}
@@ -3641,8 +3734,7 @@ static int set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask)
3641 struct irq_cfg *cfg; 3734 struct irq_cfg *cfg;
3642 unsigned int dest; 3735 unsigned int dest;
3643 3736
3644 dest = set_desc_affinity(desc, mask); 3737 if (set_desc_affinity(desc, mask, &dest))
3645 if (dest == BAD_APICID)
3646 return -1; 3738 return -1;
3647 3739
3648 cfg = desc->chip_data; 3740 cfg = desc->chip_data;
@@ -3708,75 +3800,6 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
3708} 3800}
3709#endif /* CONFIG_HT_IRQ */ 3801#endif /* CONFIG_HT_IRQ */
3710 3802
3711#ifdef CONFIG_X86_UV
3712/*
3713 * Re-target the irq to the specified CPU and enable the specified MMR located
3714 * on the specified blade to allow the sending of MSIs to the specified CPU.
3715 */
3716int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
3717 unsigned long mmr_offset)
3718{
3719 const struct cpumask *eligible_cpu = cpumask_of(cpu);
3720 struct irq_cfg *cfg;
3721 int mmr_pnode;
3722 unsigned long mmr_value;
3723 struct uv_IO_APIC_route_entry *entry;
3724 unsigned long flags;
3725 int err;
3726
3727 BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
3728
3729 cfg = irq_cfg(irq);
3730
3731 err = assign_irq_vector(irq, cfg, eligible_cpu);
3732 if (err != 0)
3733 return err;
3734
3735 spin_lock_irqsave(&vector_lock, flags);
3736 set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq,
3737 irq_name);
3738 spin_unlock_irqrestore(&vector_lock, flags);
3739
3740 mmr_value = 0;
3741 entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
3742 entry->vector = cfg->vector;
3743 entry->delivery_mode = apic->irq_delivery_mode;
3744 entry->dest_mode = apic->irq_dest_mode;
3745 entry->polarity = 0;
3746 entry->trigger = 0;
3747 entry->mask = 0;
3748 entry->dest = apic->cpu_mask_to_apicid(eligible_cpu);
3749
3750 mmr_pnode = uv_blade_to_pnode(mmr_blade);
3751 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
3752
3753 if (cfg->move_in_progress)
3754 send_cleanup_vector(cfg);
3755
3756 return irq;
3757}
3758
3759/*
3760 * Disable the specified MMR located on the specified blade so that MSIs are
3761 * longer allowed to be sent.
3762 */
3763void arch_disable_uv_irq(int mmr_blade, unsigned long mmr_offset)
3764{
3765 unsigned long mmr_value;
3766 struct uv_IO_APIC_route_entry *entry;
3767 int mmr_pnode;
3768
3769 BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
3770
3771 mmr_value = 0;
3772 entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
3773 entry->mask = 1;
3774
3775 mmr_pnode = uv_blade_to_pnode(mmr_blade);
3776 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
3777}
3778#endif /* CONFIG_X86_64 */
3779
3780int __init io_apic_get_redir_entries (int ioapic) 3803int __init io_apic_get_redir_entries (int ioapic)
3781{ 3804{
3782 union IO_APIC_reg_01 reg_01; 3805 union IO_APIC_reg_01 reg_01;
@@ -3944,7 +3967,7 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
3944 */ 3967 */
3945 3968
3946 if (physids_empty(apic_id_map)) 3969 if (physids_empty(apic_id_map))
3947 apic_id_map = apic->ioapic_phys_id_map(phys_cpu_present_map); 3970 apic->ioapic_phys_id_map(&phys_cpu_present_map, &apic_id_map);
3948 3971
3949 spin_lock_irqsave(&ioapic_lock, flags); 3972 spin_lock_irqsave(&ioapic_lock, flags);
3950 reg_00.raw = io_apic_read(ioapic, 0); 3973 reg_00.raw = io_apic_read(ioapic, 0);
@@ -3960,10 +3983,10 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
3960 * Every APIC in a system must have a unique ID or we get lots of nice 3983 * Every APIC in a system must have a unique ID or we get lots of nice
3961 * 'stuck on smp_invalidate_needed IPI wait' messages. 3984 * 'stuck on smp_invalidate_needed IPI wait' messages.
3962 */ 3985 */
3963 if (apic->check_apicid_used(apic_id_map, apic_id)) { 3986 if (apic->check_apicid_used(&apic_id_map, apic_id)) {
3964 3987
3965 for (i = 0; i < get_physical_broadcast(); i++) { 3988 for (i = 0; i < get_physical_broadcast(); i++) {
3966 if (!apic->check_apicid_used(apic_id_map, i)) 3989 if (!apic->check_apicid_used(&apic_id_map, i))
3967 break; 3990 break;
3968 } 3991 }
3969 3992
@@ -3976,7 +3999,7 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
3976 apic_id = i; 3999 apic_id = i;
3977 } 4000 }
3978 4001
3979 tmp = apic->apicid_to_cpu_present(apic_id); 4002 apic->apicid_to_cpu_present(apic_id, &tmp);
3980 physids_or(apic_id_map, apic_id_map, tmp); 4003 physids_or(apic_id_map, apic_id_map, tmp);
3981 4004
3982 if (reg_00.bits.ID != apic_id) { 4005 if (reg_00.bits.ID != apic_id) {
@@ -4106,7 +4129,7 @@ static struct resource * __init ioapic_setup_resources(int nr_ioapics)
4106 for (i = 0; i < nr_ioapics; i++) { 4129 for (i = 0; i < nr_ioapics; i++) {
4107 res[i].name = mem; 4130 res[i].name = mem;
4108 res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY; 4131 res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY;
4109 sprintf(mem, "IOAPIC %u", i); 4132 snprintf(mem, IOAPIC_RESOURCE_NAME_SIZE, "IOAPIC %u", i);
4110 mem += IOAPIC_RESOURCE_NAME_SIZE; 4133 mem += IOAPIC_RESOURCE_NAME_SIZE;
4111 } 4134 }
4112 4135
@@ -4140,18 +4163,17 @@ void __init ioapic_init_mappings(void)
4140#ifdef CONFIG_X86_32 4163#ifdef CONFIG_X86_32
4141fake_ioapic_page: 4164fake_ioapic_page:
4142#endif 4165#endif
4143 ioapic_phys = (unsigned long) 4166 ioapic_phys = (unsigned long)alloc_bootmem_pages(PAGE_SIZE);
4144 alloc_bootmem_pages(PAGE_SIZE);
4145 ioapic_phys = __pa(ioapic_phys); 4167 ioapic_phys = __pa(ioapic_phys);
4146 } 4168 }
4147 set_fixmap_nocache(idx, ioapic_phys); 4169 set_fixmap_nocache(idx, ioapic_phys);
4148 apic_printk(APIC_VERBOSE, 4170 apic_printk(APIC_VERBOSE, "mapped IOAPIC to %08lx (%08lx)\n",
4149 "mapped IOAPIC to %08lx (%08lx)\n", 4171 __fix_to_virt(idx) + (ioapic_phys & ~PAGE_MASK),
4150 __fix_to_virt(idx), ioapic_phys); 4172 ioapic_phys);
4151 idx++; 4173 idx++;
4152 4174
4153 ioapic_res->start = ioapic_phys; 4175 ioapic_res->start = ioapic_phys;
4154 ioapic_res->end = ioapic_phys + (4 * 1024) - 1; 4176 ioapic_res->end = ioapic_phys + IO_APIC_SLOT_SIZE - 1;
4155 ioapic_res++; 4177 ioapic_res++;
4156 } 4178 }
4157} 4179}
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
index 7ff61d6a188a..0159a69396cb 100644
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -39,7 +39,8 @@
 int unknown_nmi_panic;
 int nmi_watchdog_enabled;
 
-static cpumask_t backtrace_mask __read_mostly;
+/* For reliability, we're prepared to waste bits here. */
+static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
 
 /* nmi_active:
  * >0: the lapic NMI watchdog is active, but can be disabled
@@ -360,7 +361,7 @@ void stop_apic_nmi_watchdog(void *unused)
  */
 
 static DEFINE_PER_CPU(unsigned, last_irq_sum);
-static DEFINE_PER_CPU(local_t, alert_counter);
+static DEFINE_PER_CPU(long, alert_counter);
 static DEFINE_PER_CPU(int, nmi_touch);
 
 void touch_nmi_watchdog(void)
@@ -414,7 +415,7 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
 	}
 
 	/* We can be called before check_nmi_watchdog, hence NULL check. */
-	if (cpumask_test_cpu(cpu, &backtrace_mask)) {
+	if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
 		static DEFINE_SPINLOCK(lock);	/* Serialise the printks */
 
 		spin_lock(&lock);
@@ -422,7 +423,7 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
 		show_regs(regs);
 		dump_stack();
 		spin_unlock(&lock);
-		cpumask_clear_cpu(cpu, &backtrace_mask);
+		cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
 
 		rc = 1;
 	}
@@ -437,8 +438,8 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
 		 * Ayiee, looks like this CPU is stuck ...
 		 * wait a few IRQs (5 seconds) before doing the oops ...
 		 */
-		local_inc(&__get_cpu_var(alert_counter));
-		if (local_read(&__get_cpu_var(alert_counter)) == 5 * nmi_hz)
+		__this_cpu_inc(per_cpu_var(alert_counter));
+		if (__this_cpu_read(per_cpu_var(alert_counter)) == 5 * nmi_hz)
 			/*
 			 * die_nmi will return ONLY if NOTIFY_STOP happens..
 			 */
@@ -446,7 +447,7 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
 				regs, panic_on_timeout);
 	} else {
 		__get_cpu_var(last_irq_sum) = sum;
-		local_set(&__get_cpu_var(alert_counter), 0);
+		__this_cpu_write(per_cpu_var(alert_counter), 0);
 	}
 
 	/* see if the nmi watchdog went off */
@@ -558,14 +559,14 @@ void arch_trigger_all_cpu_backtrace(void)
 {
 	int i;
 
-	cpumask_copy(&backtrace_mask, cpu_online_mask);
+	cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask);
 
 	printk(KERN_INFO "sending NMI to all CPUs:\n");
 	apic->send_IPI_all(NMI_VECTOR);
 
 	/* Wait for up to 10 seconds for all CPUs to do the backtrace */
 	for (i = 0; i < 10 * 1000; i++) {
-		if (cpumask_empty(&backtrace_mask))
+		if (cpumask_empty(to_cpumask(backtrace_mask)))
 			break;
 		mdelay(1);
 	}
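The backtrace_mask conversion swaps a static cpumask_t for a raw NR_CPUS bitmap wrapped with to_cpumask() at each use, presumably to keep a fixed-size object that needs no cpumask allocation in NMI context. A minimal standalone illustration of the same idiom (generic kernel API; my_mask is a hypothetical name, not code from this patch):

	static DECLARE_BITMAP(my_mask, NR_CPUS) __read_mostly;

	cpumask_copy(to_cpumask(my_mask), cpu_online_mask);
	if (cpumask_test_cpu(smp_processor_id(), to_cpumask(my_mask)))
		cpumask_clear_cpu(smp_processor_id(), to_cpumask(my_mask));
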
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index 9c0629ceb528..98c4665f251c 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -329,10 +329,9 @@ static inline const struct cpumask *numaq_target_cpus(void)
329 return cpu_all_mask; 329 return cpu_all_mask;
330} 330}
331 331
332static inline unsigned long 332static unsigned long numaq_check_apicid_used(physid_mask_t *map, int apicid)
333numaq_check_apicid_used(physid_mask_t bitmap, int apicid)
334{ 333{
335 return physid_isset(apicid, bitmap); 334 return physid_isset(apicid, *map);
336} 335}
337 336
338static inline unsigned long numaq_check_apicid_present(int bit) 337static inline unsigned long numaq_check_apicid_present(int bit)
@@ -366,10 +365,10 @@ static inline int numaq_multi_timer_check(int apic, int irq)
366 return apic != 0 && irq == 0; 365 return apic != 0 && irq == 0;
367} 366}
368 367
369static inline physid_mask_t numaq_ioapic_phys_id_map(physid_mask_t phys_map) 368static inline void numaq_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
370{ 369{
371 /* We don't have a good way to do this yet - hack */ 370 /* We don't have a good way to do this yet - hack */
372 return physids_promote(0xFUL); 371 return physids_promote(0xFUL, retmap);
373} 372}
374 373
375static inline int numaq_cpu_to_logical_apicid(int cpu) 374static inline int numaq_cpu_to_logical_apicid(int cpu)
@@ -397,12 +396,12 @@ static inline int numaq_apicid_to_node(int logical_apicid)
397 return logical_apicid >> 4; 396 return logical_apicid >> 4;
398} 397}
399 398
400static inline physid_mask_t numaq_apicid_to_cpu_present(int logical_apicid) 399static void numaq_apicid_to_cpu_present(int logical_apicid, physid_mask_t *retmap)
401{ 400{
402 int node = numaq_apicid_to_node(logical_apicid); 401 int node = numaq_apicid_to_node(logical_apicid);
403 int cpu = __ffs(logical_apicid & 0xf); 402 int cpu = __ffs(logical_apicid & 0xf);
404 403
405 return physid_mask_of_physid(cpu + 4*node); 404 physid_set_mask_of_physid(cpu + 4*node, retmap);
406} 405}
407 406
408/* Where the IO area was mapped on multiquad, always 0 otherwise */ 407/* Where the IO area was mapped on multiquad, always 0 otherwise */
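
The numaq callbacks above stop returning a physid_mask_t by value and instead fill a mask supplied by the caller, which avoids copying a multi-word bitmap through the stack and the return path on every call. A hedged sketch of the new callback shape, with a hypothetical name but the same bit layout as the hunk:

	#include <linux/bitops.h>
	#include <asm/mpspec.h>		/* physid_mask_t, physid_set_mask_of_physid() */

	/* Hypothetical callback in the new calling convention: the caller owns the
	 * result mask and the helper writes into it instead of returning it. */
	static void demo_apicid_to_cpu_present(int logical_apicid, physid_mask_t *retmap)
	{
		int node = logical_apicid >> 4;		/* node lives in bits 7:4 */
		int cpu  = __ffs(logical_apicid & 0xf);	/* one-hot cpu in bits 3:0 */

		physid_set_mask_of_physid(cpu + 4 * node, retmap);
	}
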
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index 0c0182cc947d..99d2fe016084 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -52,7 +52,32 @@ static int __init print_ipi_mode(void)
52} 52}
53late_initcall(print_ipi_mode); 53late_initcall(print_ipi_mode);
54 54
55void default_setup_apic_routing(void) 55void __init default_setup_apic_routing(void)
56{
57 int version = apic_version[boot_cpu_physical_apicid];
58
59 if (num_possible_cpus() > 8) {
60 switch (boot_cpu_data.x86_vendor) {
61 case X86_VENDOR_INTEL:
62 if (!APIC_XAPIC(version)) {
63 def_to_bigsmp = 0;
64 break;
65 }
66 /* If P4 and above fall through */
67 case X86_VENDOR_AMD:
68 def_to_bigsmp = 1;
69 }
70 }
71
72#ifdef CONFIG_X86_BIGSMP
73 generic_bigsmp_probe();
74#endif
75
76 if (apic->setup_apic_routing)
77 apic->setup_apic_routing();
78}
79
80static void setup_apic_flat_routing(void)
56{ 81{
57#ifdef CONFIG_X86_IO_APIC 82#ifdef CONFIG_X86_IO_APIC
58 printk(KERN_INFO 83 printk(KERN_INFO
@@ -103,12 +128,12 @@ struct apic apic_default = {
103 .init_apic_ldr = default_init_apic_ldr, 128 .init_apic_ldr = default_init_apic_ldr,
104 129
105 .ioapic_phys_id_map = default_ioapic_phys_id_map, 130 .ioapic_phys_id_map = default_ioapic_phys_id_map,
106 .setup_apic_routing = default_setup_apic_routing, 131 .setup_apic_routing = setup_apic_flat_routing,
107 .multi_timer_check = NULL, 132 .multi_timer_check = NULL,
108 .apicid_to_node = default_apicid_to_node, 133 .apicid_to_node = default_apicid_to_node,
109 .cpu_to_logical_apicid = default_cpu_to_logical_apicid, 134 .cpu_to_logical_apicid = default_cpu_to_logical_apicid,
110 .cpu_present_to_apicid = default_cpu_present_to_apicid, 135 .cpu_present_to_apicid = default_cpu_present_to_apicid,
111 .apicid_to_cpu_present = default_apicid_to_cpu_present, 136 .apicid_to_cpu_present = physid_set_mask_of_physid,
112 .setup_portio_remap = NULL, 137 .setup_portio_remap = NULL,
113 .check_phys_apicid_present = default_check_phys_apicid_present, 138 .check_phys_apicid_present = default_check_phys_apicid_present,
114 .enable_apic_mode = NULL, 139 .enable_apic_mode = NULL,
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
index c4cbd3080c1c..83e9be4778e2 100644
--- a/arch/x86/kernel/apic/probe_64.c
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -67,17 +67,8 @@ void __init default_setup_apic_routing(void)
67 } 67 }
68#endif 68#endif
69 69
70 if (apic == &apic_flat) { 70 if (apic == &apic_flat && num_possible_cpus() > 8)
71 switch (boot_cpu_data.x86_vendor) { 71 apic = &apic_physflat;
72 case X86_VENDOR_INTEL:
73 if (num_processors > 8)
74 apic = &apic_physflat;
75 break;
76 case X86_VENDOR_AMD:
77 if (max_physical_apicid >= 8)
78 apic = &apic_physflat;
79 }
80 }
81 72
82 printk(KERN_INFO "Setting APIC routing to %s\n", apic->name); 73 printk(KERN_INFO "Setting APIC routing to %s\n", apic->name);
83 74
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
index 645ecc4ff0be..9b419263d90d 100644
--- a/arch/x86/kernel/apic/summit_32.c
+++ b/arch/x86/kernel/apic/summit_32.c
@@ -183,7 +183,7 @@ static const struct cpumask *summit_target_cpus(void)
183 return cpumask_of(0); 183 return cpumask_of(0);
184} 184}
185 185
186static unsigned long summit_check_apicid_used(physid_mask_t bitmap, int apicid) 186static unsigned long summit_check_apicid_used(physid_mask_t *map, int apicid)
187{ 187{
188 return 0; 188 return 0;
189} 189}
@@ -261,15 +261,15 @@ static int summit_cpu_present_to_apicid(int mps_cpu)
261 return BAD_APICID; 261 return BAD_APICID;
262} 262}
263 263
264static physid_mask_t summit_ioapic_phys_id_map(physid_mask_t phys_id_map) 264static void summit_ioapic_phys_id_map(physid_mask_t *phys_id_map, physid_mask_t *retmap)
265{ 265{
266 /* For clustered we don't have a good way to do this yet - hack */ 266 /* For clustered we don't have a good way to do this yet - hack */
267 return physids_promote(0x0F); 267 physids_promote(0x0FL, retmap);
268} 268}
269 269
270static physid_mask_t summit_apicid_to_cpu_present(int apicid) 270static void summit_apicid_to_cpu_present(int apicid, physid_mask_t *retmap)
271{ 271{
272 return physid_mask_of_physid(0); 272 physid_set_mask_of_physid(0, retmap);
273} 273}
274 274
275static int summit_check_phys_apicid_present(int physical_apicid) 275static int summit_check_phys_apicid_present(int physical_apicid)
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index a5371ec36776..cf69c59f4910 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -148,10 +148,7 @@ x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
148 break; 148 break;
149 } 149 }
150 150
151 if (cpu < nr_cpu_ids) 151 return per_cpu(x86_cpu_to_logical_apicid, cpu);
152 return per_cpu(x86_cpu_to_logical_apicid, cpu);
153
154 return BAD_APICID;
155} 152}
156 153
157static unsigned int x2apic_cluster_phys_get_apic_id(unsigned long x) 154static unsigned int x2apic_cluster_phys_get_apic_id(unsigned long x)
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index a8989aadc99a..8972f38c5ced 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -146,10 +146,7 @@ x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
146 break; 146 break;
147 } 147 }
148 148
149 if (cpu < nr_cpu_ids) 149 return per_cpu(x86_cpu_to_apicid, cpu);
150 return per_cpu(x86_cpu_to_apicid, cpu);
151
152 return BAD_APICID;
153} 150}
154 151
155static unsigned int x2apic_phys_get_apic_id(unsigned long x) 152static unsigned int x2apic_phys_get_apic_id(unsigned long x)
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 1e09417c992f..21db3cbea7dc 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -36,6 +36,8 @@ DEFINE_PER_CPU(int, x2apic_extra_bits);
36 36
37static enum uv_system_type uv_system_type; 37static enum uv_system_type uv_system_type;
38static u64 gru_start_paddr, gru_end_paddr; 38static u64 gru_start_paddr, gru_end_paddr;
39int uv_min_hub_revision_id;
40EXPORT_SYMBOL_GPL(uv_min_hub_revision_id);
39 41
40static inline bool is_GRU_range(u64 start, u64 end) 42static inline bool is_GRU_range(u64 start, u64 end)
41{ 43{
@@ -55,12 +57,19 @@ static int early_get_nodeid(void)
55 mmr = early_ioremap(UV_LOCAL_MMR_BASE | UVH_NODE_ID, sizeof(*mmr)); 57 mmr = early_ioremap(UV_LOCAL_MMR_BASE | UVH_NODE_ID, sizeof(*mmr));
56 node_id.v = *mmr; 58 node_id.v = *mmr;
57 early_iounmap(mmr, sizeof(*mmr)); 59 early_iounmap(mmr, sizeof(*mmr));
60
61 /* Currently, all blades have same revision number */
62 uv_min_hub_revision_id = node_id.s.revision;
63
58 return node_id.s.node_id; 64 return node_id.s.node_id;
59} 65}
60 66
61static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) 67static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
62{ 68{
69 int nodeid;
70
63 if (!strcmp(oem_id, "SGI")) { 71 if (!strcmp(oem_id, "SGI")) {
72 nodeid = early_get_nodeid();
64 x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range; 73 x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range;
65 if (!strcmp(oem_table_id, "UVL")) 74 if (!strcmp(oem_table_id, "UVL"))
66 uv_system_type = UV_LEGACY_APIC; 75 uv_system_type = UV_LEGACY_APIC;
@@ -68,7 +77,7 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
68 uv_system_type = UV_X2APIC; 77 uv_system_type = UV_X2APIC;
69 else if (!strcmp(oem_table_id, "UVH")) { 78 else if (!strcmp(oem_table_id, "UVH")) {
70 __get_cpu_var(x2apic_extra_bits) = 79 __get_cpu_var(x2apic_extra_bits) =
71 early_get_nodeid() << (UV_APIC_PNODE_SHIFT - 1); 80 nodeid << (UV_APIC_PNODE_SHIFT - 1);
72 uv_system_type = UV_NON_UNIQUE_APIC; 81 uv_system_type = UV_NON_UNIQUE_APIC;
73 return 1; 82 return 1;
74 } 83 }
@@ -225,10 +234,7 @@ uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
225 if (cpumask_test_cpu(cpu, cpu_online_mask)) 234 if (cpumask_test_cpu(cpu, cpu_online_mask))
226 break; 235 break;
227 } 236 }
228 if (cpu < nr_cpu_ids) 237 return per_cpu(x86_cpu_to_apicid, cpu);
229 return per_cpu(x86_cpu_to_apicid, cpu);
230
231 return BAD_APICID;
232} 238}
233 239
234static unsigned int x2apic_get_apic_id(unsigned long x) 240static unsigned int x2apic_get_apic_id(unsigned long x)
@@ -365,25 +371,25 @@ static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size)
365 371
366 for (i = 0; i < ARRAY_SIZE(redir_addrs); i++) { 372 for (i = 0; i < ARRAY_SIZE(redir_addrs); i++) {
367 alias.v = uv_read_local_mmr(redir_addrs[i].alias); 373 alias.v = uv_read_local_mmr(redir_addrs[i].alias);
368 if (alias.s.base == 0) { 374 if (alias.s.enable && alias.s.base == 0) {
369 *size = (1UL << alias.s.m_alias); 375 *size = (1UL << alias.s.m_alias);
370 redirect.v = uv_read_local_mmr(redir_addrs[i].redirect); 376 redirect.v = uv_read_local_mmr(redir_addrs[i].redirect);
371 *base = (unsigned long)redirect.s.dest_base << DEST_SHIFT; 377 *base = (unsigned long)redirect.s.dest_base << DEST_SHIFT;
372 return; 378 return;
373 } 379 }
374 } 380 }
375 BUG(); 381 *base = *size = 0;
376} 382}
377 383
378enum map_type {map_wb, map_uc}; 384enum map_type {map_wb, map_uc};
379 385
380static __init void map_high(char *id, unsigned long base, int shift, 386static __init void map_high(char *id, unsigned long base, int pshift,
381 int max_pnode, enum map_type map_type) 387 int bshift, int max_pnode, enum map_type map_type)
382{ 388{
383 unsigned long bytes, paddr; 389 unsigned long bytes, paddr;
384 390
385 paddr = base << shift; 391 paddr = base << pshift;
386 bytes = (1UL << shift) * (max_pnode + 1); 392 bytes = (1UL << bshift) * (max_pnode + 1);
387 printk(KERN_INFO "UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr, 393 printk(KERN_INFO "UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr,
388 paddr + bytes); 394 paddr + bytes);
389 if (map_type == map_uc) 395 if (map_type == map_uc)
@@ -399,7 +405,7 @@ static __init void map_gru_high(int max_pnode)
399 405
400 gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR); 406 gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR);
401 if (gru.s.enable) { 407 if (gru.s.enable) {
402 map_high("GRU", gru.s.base, shift, max_pnode, map_wb); 408 map_high("GRU", gru.s.base, shift, shift, max_pnode, map_wb);
403 gru_start_paddr = ((u64)gru.s.base << shift); 409 gru_start_paddr = ((u64)gru.s.base << shift);
404 gru_end_paddr = gru_start_paddr + (1UL << shift) * (max_pnode + 1); 410 gru_end_paddr = gru_start_paddr + (1UL << shift) * (max_pnode + 1);
405 411
@@ -413,7 +419,7 @@ static __init void map_mmr_high(int max_pnode)
413 419
414 mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR); 420 mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR);
415 if (mmr.s.enable) 421 if (mmr.s.enable)
416 map_high("MMR", mmr.s.base, shift, max_pnode, map_uc); 422 map_high("MMR", mmr.s.base, shift, shift, max_pnode, map_uc);
417} 423}
418 424
419static __init void map_mmioh_high(int max_pnode) 425static __init void map_mmioh_high(int max_pnode)
@@ -423,7 +429,14 @@ static __init void map_mmioh_high(int max_pnode)
423 429
424 mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR); 430 mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR);
425 if (mmioh.s.enable) 431 if (mmioh.s.enable)
426 map_high("MMIOH", mmioh.s.base, shift, max_pnode, map_uc); 432 map_high("MMIOH", mmioh.s.base, shift, mmioh.s.m_io,
433 max_pnode, map_uc);
434}
435
436static __init void map_low_mmrs(void)
437{
438 init_extra_mapping_uc(UV_GLOBAL_MMR32_BASE, UV_GLOBAL_MMR32_SIZE);
439 init_extra_mapping_uc(UV_LOCAL_MMR_BASE, UV_LOCAL_MMR_SIZE);
427} 440}
428 441
429static __init void uv_rtc_init(void) 442static __init void uv_rtc_init(void)
@@ -567,6 +580,8 @@ void __init uv_system_init(void)
567 unsigned long mmr_base, present, paddr; 580 unsigned long mmr_base, present, paddr;
568 unsigned short pnode_mask; 581 unsigned short pnode_mask;
569 582
583 map_low_mmrs();
584
570 m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG); 585 m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG);
571 m_val = m_n_config.s.m_skt; 586 m_val = m_n_config.s.m_skt;
572 n_val = m_n_config.s.n_skt; 587 n_val = m_n_config.s.n_skt;
@@ -624,8 +639,10 @@ void __init uv_system_init(void)
624 uv_rtc_init(); 639 uv_rtc_init();
625 640
626 for_each_present_cpu(cpu) { 641 for_each_present_cpu(cpu) {
642 int apicid = per_cpu(x86_cpu_to_apicid, cpu);
643
627 nid = cpu_to_node(cpu); 644 nid = cpu_to_node(cpu);
628 pnode = uv_apicid_to_pnode(per_cpu(x86_cpu_to_apicid, cpu)); 645 pnode = uv_apicid_to_pnode(apicid);
629 blade = boot_pnode_to_blade(pnode); 646 blade = boot_pnode_to_blade(pnode);
630 lcpu = uv_blade_info[blade].nr_possible_cpus; 647 lcpu = uv_blade_info[blade].nr_possible_cpus;
631 uv_blade_info[blade].nr_possible_cpus++; 648 uv_blade_info[blade].nr_possible_cpus++;
@@ -636,25 +653,23 @@ void __init uv_system_init(void)
636 uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base; 653 uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base;
637 uv_cpu_hub_info(cpu)->lowmem_remap_top = lowmem_redir_size; 654 uv_cpu_hub_info(cpu)->lowmem_remap_top = lowmem_redir_size;
638 uv_cpu_hub_info(cpu)->m_val = m_val; 655 uv_cpu_hub_info(cpu)->m_val = m_val;
639 uv_cpu_hub_info(cpu)->n_val = m_val; 656 uv_cpu_hub_info(cpu)->n_val = n_val;
640 uv_cpu_hub_info(cpu)->numa_blade_id = blade; 657 uv_cpu_hub_info(cpu)->numa_blade_id = blade;
641 uv_cpu_hub_info(cpu)->blade_processor_id = lcpu; 658 uv_cpu_hub_info(cpu)->blade_processor_id = lcpu;
642 uv_cpu_hub_info(cpu)->pnode = pnode; 659 uv_cpu_hub_info(cpu)->pnode = pnode;
643 uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask; 660 uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask;
644 uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1; 661 uv_cpu_hub_info(cpu)->gpa_mask = (1UL << (m_val + n_val)) - 1;
645 uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper; 662 uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper;
646 uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra; 663 uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra;
647 uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base; 664 uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base;
648 uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id; 665 uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id;
649 uv_cpu_hub_info(cpu)->scir.offset = SCIR_LOCAL_MMR_BASE + lcpu; 666 uv_cpu_hub_info(cpu)->scir.offset = uv_scir_offset(apicid);
650 uv_node_to_blade[nid] = blade; 667 uv_node_to_blade[nid] = blade;
651 uv_cpu_to_blade[cpu] = blade; 668 uv_cpu_to_blade[cpu] = blade;
652 max_pnode = max(pnode, max_pnode); 669 max_pnode = max(pnode, max_pnode);
653 670
654 printk(KERN_DEBUG "UV: cpu %d, apicid 0x%x, pnode %d, nid %d, " 671 printk(KERN_DEBUG "UV: cpu %d, apicid 0x%x, pnode %d, nid %d, lcpu %d, blade %d\n",
655 "lcpu %d, blade %d\n", 672 cpu, apicid, pnode, nid, lcpu, blade);
656 cpu, per_cpu(x86_cpu_to_apicid, cpu), pnode, nid,
657 lcpu, blade);
658 } 673 }
659 674
660 /* Add blade/pnode info for nodes without cpus */ 675 /* Add blade/pnode info for nodes without cpus */
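
map_high() above gains separate shifts for the base address and for the per-pnode block size, so MMIOH can be sized by its m_io field while GRU and MMR keep passing the same shift twice. A hedged arithmetic sketch with invented values (not real UV register contents) showing how the two shifts are used:

	/* Illustrative only: the field values are made up, the arithmetic matches
	 * the new map_high() above. */
	static void demo_map_high_math(void)
	{
		unsigned long base = 0x3;	/* hypothetical relocatable base field */
		int pshift = 28;		/* shift applied to the base address */
		int bshift = 26;		/* shift defining one pnode's block size */
		int max_pnode = 15;
		unsigned long paddr, bytes;

		paddr = base << pshift;				/* 0x30000000 */
		bytes = (1UL << bshift) * (max_pnode + 1);	/* 16 * 64 MB = 1 GB */

		(void)paddr;
		(void)bytes;
	}
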
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 151ace69a5aa..b5b6b23bce53 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -204,7 +204,6 @@
204#include <linux/module.h> 204#include <linux/module.h>
205 205
206#include <linux/poll.h> 206#include <linux/poll.h>
207#include <linux/smp_lock.h>
208#include <linux/types.h> 207#include <linux/types.h>
209#include <linux/stddef.h> 208#include <linux/stddef.h>
210#include <linux/timer.h> 209#include <linux/timer.h>
@@ -403,6 +402,7 @@ static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue);
403static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue); 402static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue);
404static struct apm_user *user_list; 403static struct apm_user *user_list;
405static DEFINE_SPINLOCK(user_list_lock); 404static DEFINE_SPINLOCK(user_list_lock);
405static DEFINE_MUTEX(apm_mutex);
406 406
407/* 407/*
408 * Set up a segment that references the real mode segment 0x40 408 * Set up a segment that references the real mode segment 0x40
@@ -1531,7 +1531,7 @@ static long do_ioctl(struct file *filp, u_int cmd, u_long arg)
1531 return -EPERM; 1531 return -EPERM;
1532 switch (cmd) { 1532 switch (cmd) {
1533 case APM_IOC_STANDBY: 1533 case APM_IOC_STANDBY:
1534 lock_kernel(); 1534 mutex_lock(&apm_mutex);
1535 if (as->standbys_read > 0) { 1535 if (as->standbys_read > 0) {
1536 as->standbys_read--; 1536 as->standbys_read--;
1537 as->standbys_pending--; 1537 as->standbys_pending--;
@@ -1540,10 +1540,10 @@ static long do_ioctl(struct file *filp, u_int cmd, u_long arg)
1540 queue_event(APM_USER_STANDBY, as); 1540 queue_event(APM_USER_STANDBY, as);
1541 if (standbys_pending <= 0) 1541 if (standbys_pending <= 0)
1542 standby(); 1542 standby();
1543 unlock_kernel(); 1543 mutex_unlock(&apm_mutex);
1544 break; 1544 break;
1545 case APM_IOC_SUSPEND: 1545 case APM_IOC_SUSPEND:
1546 lock_kernel(); 1546 mutex_lock(&apm_mutex);
1547 if (as->suspends_read > 0) { 1547 if (as->suspends_read > 0) {
1548 as->suspends_read--; 1548 as->suspends_read--;
1549 as->suspends_pending--; 1549 as->suspends_pending--;
@@ -1552,13 +1552,14 @@ static long do_ioctl(struct file *filp, u_int cmd, u_long arg)
1552 queue_event(APM_USER_SUSPEND, as); 1552 queue_event(APM_USER_SUSPEND, as);
1553 if (suspends_pending <= 0) { 1553 if (suspends_pending <= 0) {
1554 ret = suspend(1); 1554 ret = suspend(1);
1555 mutex_unlock(&apm_mutex);
1555 } else { 1556 } else {
1556 as->suspend_wait = 1; 1557 as->suspend_wait = 1;
1558 mutex_unlock(&apm_mutex);
1557 wait_event_interruptible(apm_suspend_waitqueue, 1559 wait_event_interruptible(apm_suspend_waitqueue,
1558 as->suspend_wait == 0); 1560 as->suspend_wait == 0);
1559 ret = as->suspend_result; 1561 ret = as->suspend_result;
1560 } 1562 }
1561 unlock_kernel();
1562 return ret; 1563 return ret;
1563 default: 1564 default:
1564 return -ENOTTY; 1565 return -ENOTTY;
@@ -1608,12 +1609,10 @@ static int do_open(struct inode *inode, struct file *filp)
1608{ 1609{
1609 struct apm_user *as; 1610 struct apm_user *as;
1610 1611
1611 lock_kernel();
1612 as = kmalloc(sizeof(*as), GFP_KERNEL); 1612 as = kmalloc(sizeof(*as), GFP_KERNEL);
1613 if (as == NULL) { 1613 if (as == NULL) {
1614 printk(KERN_ERR "apm: cannot allocate struct of size %d bytes\n", 1614 printk(KERN_ERR "apm: cannot allocate struct of size %d bytes\n",
1615 sizeof(*as)); 1615 sizeof(*as));
1616 unlock_kernel();
1617 return -ENOMEM; 1616 return -ENOMEM;
1618 } 1617 }
1619 as->magic = APM_BIOS_MAGIC; 1618 as->magic = APM_BIOS_MAGIC;
@@ -1635,7 +1634,6 @@ static int do_open(struct inode *inode, struct file *filp)
1635 user_list = as; 1634 user_list = as;
1636 spin_unlock(&user_list_lock); 1635 spin_unlock(&user_list_lock);
1637 filp->private_data = as; 1636 filp->private_data = as;
1638 unlock_kernel();
1639 return 0; 1637 return 0;
1640} 1638}
1641 1639
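
The apm_32.c hunks replace lock_kernel()/unlock_kernel() (the Big Kernel Lock) with a driver-local mutex, and in the suspend path the mutex is now released before sleeping in wait_event_interruptible(). A minimal sketch of the replacement pattern, with hypothetical names:

	#include <linux/mutex.h>
	#include <linux/fs.h>

	static DEFINE_MUTEX(demo_mutex);

	static long demo_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
	{
		long ret = 0;

		mutex_lock(&demo_mutex);
		/* ... section that previously ran under the BKL ... */
		mutex_unlock(&demo_mutex);	/* drop the lock before any sleeping wait */

		return ret;
	}
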
diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c
index 63a88e1f987d..b0206a211b09 100644
--- a/arch/x86/kernel/bios_uv.c
+++ b/arch/x86/kernel/bios_uv.c
@@ -101,21 +101,17 @@ s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher,
101} 101}
102 102
103int 103int
104uv_bios_mq_watchlist_alloc(int blade, unsigned long addr, unsigned int mq_size, 104uv_bios_mq_watchlist_alloc(unsigned long addr, unsigned int mq_size,
105 unsigned long *intr_mmr_offset) 105 unsigned long *intr_mmr_offset)
106{ 106{
107 union uv_watchlist_u size_blade;
108 u64 watchlist; 107 u64 watchlist;
109 s64 ret; 108 s64 ret;
110 109
111 size_blade.size = mq_size;
112 size_blade.blade = blade;
113
114 /* 110 /*
115 * bios returns watchlist number or negative error number. 111 * bios returns watchlist number or negative error number.
116 */ 112 */
117 ret = (int)uv_bios_call_irqsave(UV_BIOS_WATCHLIST_ALLOC, addr, 113 ret = (int)uv_bios_call_irqsave(UV_BIOS_WATCHLIST_ALLOC, addr,
118 size_blade.val, (u64)intr_mmr_offset, 114 mq_size, (u64)intr_mmr_offset,
119 (u64)&watchlist, 0); 115 (u64)&watchlist, 0);
120 if (ret < BIOS_STATUS_SUCCESS) 116 if (ret < BIOS_STATUS_SUCCESS)
121 return ret; 117 return ret;
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 68537e957a9b..c202b62f3671 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -5,6 +5,7 @@
5# Don't trace early stages of a secondary CPU boot 5# Don't trace early stages of a secondary CPU boot
6ifdef CONFIG_FUNCTION_TRACER 6ifdef CONFIG_FUNCTION_TRACER
7CFLAGS_REMOVE_common.o = -pg 7CFLAGS_REMOVE_common.o = -pg
8CFLAGS_REMOVE_perf_event.o = -pg
8endif 9endif
9 10
10# Make sure load_percpu_segment has no stackprotector 11# Make sure load_percpu_segment has no stackprotector
@@ -18,8 +19,6 @@ obj-y += vmware.o hypervisor.o sched.o
18obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o 19obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o
19obj-$(CONFIG_X86_64) += bugs_64.o 20obj-$(CONFIG_X86_64) += bugs_64.o
20 21
21obj-$(CONFIG_X86_CPU_DEBUG) += cpu_debug.o
22
23obj-$(CONFIG_CPU_SUP_INTEL) += intel.o 22obj-$(CONFIG_CPU_SUP_INTEL) += intel.o
24obj-$(CONFIG_CPU_SUP_AMD) += amd.o 23obj-$(CONFIG_CPU_SUP_AMD) += amd.o
25obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o 24obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index c965e5212714..468489b57aae 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -74,6 +74,7 @@ void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c)
74 unsigned int eax, ebx, ecx, edx, sub_index; 74 unsigned int eax, ebx, ecx, edx, sub_index;
75 unsigned int ht_mask_width, core_plus_mask_width; 75 unsigned int ht_mask_width, core_plus_mask_width;
76 unsigned int core_select_mask, core_level_siblings; 76 unsigned int core_select_mask, core_level_siblings;
77 static bool printed;
77 78
78 if (c->cpuid_level < 0xb) 79 if (c->cpuid_level < 0xb)
79 return; 80 return;
@@ -127,12 +128,14 @@ void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c)
127 128
128 c->x86_max_cores = (core_level_siblings / smp_num_siblings); 129 c->x86_max_cores = (core_level_siblings / smp_num_siblings);
129 130
130 131 if (!printed) {
131 printk(KERN_INFO "CPU: Physical Processor ID: %d\n", 132 printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
132 c->phys_proc_id); 133 c->phys_proc_id);
133 if (c->x86_max_cores > 1) 134 if (c->x86_max_cores > 1)
134 printk(KERN_INFO "CPU: Processor Core ID: %d\n", 135 printk(KERN_INFO "CPU: Processor Core ID: %d\n",
135 c->cpu_core_id); 136 c->cpu_core_id);
137 printed = 1;
138 }
136 return; 139 return;
137#endif 140#endif
138} 141}
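
The hunk above makes the topology messages print only once instead of once per booting CPU, using a static flag. A hedged sketch of that pattern with a hypothetical helper; for a single message, printk_once() (used for the Hyper-Threading message in the common.c hunk further down) is the one-line equivalent.

	#include <linux/kernel.h>
	#include <linux/types.h>

	static void demo_report_topology(int phys_proc_id)
	{
		static bool printed;

		if (!printed) {
			printk(KERN_INFO "CPU: Physical Processor ID: %d\n", phys_proc_id);
			printed = true;
		}
	}
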
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index c910a716a71c..e485825130d2 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -254,59 +254,36 @@ static int __cpuinit nearby_node(int apicid)
254 254
255/* 255/*
256 * Fixup core topology information for AMD multi-node processors. 256 * Fixup core topology information for AMD multi-node processors.
257 * Assumption 1: Number of cores in each internal node is the same. 257 * Assumption: Number of cores in each internal node is the same.
258 * Assumption 2: Mixed systems with both single-node and dual-node
259 * processors are not supported.
260 */ 258 */
261#ifdef CONFIG_X86_HT 259#ifdef CONFIG_X86_HT
262static void __cpuinit amd_fixup_dcm(struct cpuinfo_x86 *c) 260static void __cpuinit amd_fixup_dcm(struct cpuinfo_x86 *c)
263{ 261{
264#ifdef CONFIG_PCI 262 unsigned long long value;
265 u32 t, cpn; 263 u32 nodes, cores_per_node;
266 u8 n, n_id;
267 int cpu = smp_processor_id(); 264 int cpu = smp_processor_id();
268 265
266 if (!cpu_has(c, X86_FEATURE_NODEID_MSR))
267 return;
268
269 /* fixup topology information only once for a core */ 269 /* fixup topology information only once for a core */
270 if (cpu_has(c, X86_FEATURE_AMD_DCM)) 270 if (cpu_has(c, X86_FEATURE_AMD_DCM))
271 return; 271 return;
272 272
273 /* check for multi-node processor on boot cpu */ 273 rdmsrl(MSR_FAM10H_NODE_ID, value);
274 t = read_pci_config(0, 24, 3, 0xe8); 274
275 if (!(t & (1 << 29))) 275 nodes = ((value >> 3) & 7) + 1;
276 if (nodes == 1)
276 return; 277 return;
277 278
278 set_cpu_cap(c, X86_FEATURE_AMD_DCM); 279 set_cpu_cap(c, X86_FEATURE_AMD_DCM);
280 cores_per_node = c->x86_max_cores / nodes;
279 281
280 /* cores per node: each internal node has half the number of cores */ 282 /* store NodeID, use llc_shared_map to store sibling info */
281 cpn = c->x86_max_cores >> 1; 283 per_cpu(cpu_llc_id, cpu) = value & 7;
282
283 /* even-numbered NB_id of this dual-node processor */
284 n = c->phys_proc_id << 1;
285
286 /*
287 * determine internal node id and assign cores fifty-fifty to
288 * each node of the dual-node processor
289 */
290 t = read_pci_config(0, 24 + n, 3, 0xe8);
291 n = (t>>30) & 0x3;
292 if (n == 0) {
293 if (c->cpu_core_id < cpn)
294 n_id = 0;
295 else
296 n_id = 1;
297 } else {
298 if (c->cpu_core_id < cpn)
299 n_id = 1;
300 else
301 n_id = 0;
302 }
303
304 /* compute entire NodeID, use llc_shared_map to store sibling info */
305 per_cpu(cpu_llc_id, cpu) = (c->phys_proc_id << 1) + n_id;
306 284
307 /* fixup core id to be in range from 0 to cpn */ 285 /* fixup core id to be in range from 0 to (cores_per_node - 1) */
308 c->cpu_core_id = c->cpu_core_id % cpn; 286 c->cpu_core_id = c->cpu_core_id % cores_per_node;
309#endif
310} 287}
311#endif 288#endif
312 289
@@ -375,8 +352,6 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
375 node = nearby_node(apicid); 352 node = nearby_node(apicid);
376 } 353 }
377 numa_set_node(cpu, node); 354 numa_set_node(cpu, node);
378
379 printk(KERN_INFO "CPU %d/0x%x -> Node %d\n", cpu, apicid, node);
380#endif 355#endif
381} 356}
382 357
@@ -535,7 +510,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
535 } 510 }
536 } 511 }
537 512
538 display_cacheinfo(c); 513 cpu_detect_cache_sizes(c);
539 514
540 /* Multi core CPU? */ 515 /* Multi core CPU? */
541 if (c->extended_cpuid_level >= 0x80000008) { 516 if (c->extended_cpuid_level >= 0x80000008) {
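
amd_fixup_dcm() above now derives the node count and node id from the family 10h NodeId MSR instead of poking PCI config space on the northbridge. A hedged sketch of the decode, assuming MSR_FAM10H_NODE_ID is available via <asm/msr-index.h> with the field layout used in the hunk; the helper name is hypothetical.

	#include <asm/msr.h>

	static void demo_read_node_id(void)
	{
		unsigned long long value;
		unsigned int node_id, nodes_per_processor;

		rdmsrl(MSR_FAM10H_NODE_ID, value);

		node_id = value & 7;				/* bits 2:0 */
		nodes_per_processor = ((value >> 3) & 7) + 1;	/* bits 5:3, stored minus one */

		(void)node_id;
		(void)nodes_per_processor;
	}
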
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index c95e831bb095..e58d978e0758 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -294,7 +294,7 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c)
294 set_cpu_cap(c, X86_FEATURE_REP_GOOD); 294 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
295 } 295 }
296 296
297 display_cacheinfo(c); 297 cpu_detect_cache_sizes(c);
298} 298}
299 299
300enum { 300enum {
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 18346da8c594..4868e4a951ee 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -61,7 +61,7 @@ void __init setup_cpu_local_masks(void)
61static void __cpuinit default_init(struct cpuinfo_x86 *c) 61static void __cpuinit default_init(struct cpuinfo_x86 *c)
62{ 62{
63#ifdef CONFIG_X86_64 63#ifdef CONFIG_X86_64
64 display_cacheinfo(c); 64 cpu_detect_cache_sizes(c);
65#else 65#else
66 /* Not much we can do here... */ 66 /* Not much we can do here... */
67 /* Check if at least it has cpuid */ 67 /* Check if at least it has cpuid */
@@ -383,7 +383,7 @@ static void __cpuinit get_model_name(struct cpuinfo_x86 *c)
383 } 383 }
384} 384}
385 385
386void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) 386void __cpuinit cpu_detect_cache_sizes(struct cpuinfo_x86 *c)
387{ 387{
388 unsigned int n, dummy, ebx, ecx, edx, l2size; 388 unsigned int n, dummy, ebx, ecx, edx, l2size;
389 389
@@ -391,8 +391,6 @@ void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
391 391
392 if (n >= 0x80000005) { 392 if (n >= 0x80000005) {
393 cpuid(0x80000005, &dummy, &ebx, &ecx, &edx); 393 cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
394 printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
395 edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
396 c->x86_cache_size = (ecx>>24) + (edx>>24); 394 c->x86_cache_size = (ecx>>24) + (edx>>24);
397#ifdef CONFIG_X86_64 395#ifdef CONFIG_X86_64
398 /* On K8 L1 TLB is inclusive, so don't count it */ 396 /* On K8 L1 TLB is inclusive, so don't count it */
@@ -422,9 +420,6 @@ void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
422#endif 420#endif
423 421
424 c->x86_cache_size = l2size; 422 c->x86_cache_size = l2size;
425
426 printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
427 l2size, ecx & 0xFF);
428} 423}
429 424
430void __cpuinit detect_ht(struct cpuinfo_x86 *c) 425void __cpuinit detect_ht(struct cpuinfo_x86 *c)
@@ -432,6 +427,7 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
432#ifdef CONFIG_X86_HT 427#ifdef CONFIG_X86_HT
433 u32 eax, ebx, ecx, edx; 428 u32 eax, ebx, ecx, edx;
434 int index_msb, core_bits; 429 int index_msb, core_bits;
430 static bool printed;
435 431
436 if (!cpu_has(c, X86_FEATURE_HT)) 432 if (!cpu_has(c, X86_FEATURE_HT))
437 return; 433 return;
@@ -447,7 +443,7 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
447 smp_num_siblings = (ebx & 0xff0000) >> 16; 443 smp_num_siblings = (ebx & 0xff0000) >> 16;
448 444
449 if (smp_num_siblings == 1) { 445 if (smp_num_siblings == 1) {
450 printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); 446 printk_once(KERN_INFO "CPU0: Hyper-Threading is disabled\n");
451 goto out; 447 goto out;
452 } 448 }
453 449
@@ -474,11 +470,12 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
474 ((1 << core_bits) - 1); 470 ((1 << core_bits) - 1);
475 471
476out: 472out:
477 if ((c->x86_max_cores * smp_num_siblings) > 1) { 473 if (!printed && (c->x86_max_cores * smp_num_siblings) > 1) {
478 printk(KERN_INFO "CPU: Physical Processor ID: %d\n", 474 printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
479 c->phys_proc_id); 475 c->phys_proc_id);
480 printk(KERN_INFO "CPU: Processor Core ID: %d\n", 476 printk(KERN_INFO "CPU: Processor Core ID: %d\n",
481 c->cpu_core_id); 477 c->cpu_core_id);
478 printed = 1;
482 } 479 }
483#endif 480#endif
484} 481}
@@ -659,24 +656,31 @@ void __init early_cpu_init(void)
659 const struct cpu_dev *const *cdev; 656 const struct cpu_dev *const *cdev;
660 int count = 0; 657 int count = 0;
661 658
659#ifdef PROCESSOR_SELECT
662 printk(KERN_INFO "KERNEL supported cpus:\n"); 660 printk(KERN_INFO "KERNEL supported cpus:\n");
661#endif
662
663 for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) { 663 for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) {
664 const struct cpu_dev *cpudev = *cdev; 664 const struct cpu_dev *cpudev = *cdev;
665 unsigned int j;
666 665
667 if (count >= X86_VENDOR_NUM) 666 if (count >= X86_VENDOR_NUM)
668 break; 667 break;
669 cpu_devs[count] = cpudev; 668 cpu_devs[count] = cpudev;
670 count++; 669 count++;
671 670
672 for (j = 0; j < 2; j++) { 671#ifdef PROCESSOR_SELECT
673 if (!cpudev->c_ident[j]) 672 {
674 continue; 673 unsigned int j;
675 printk(KERN_INFO " %s %s\n", cpudev->c_vendor, 674
676 cpudev->c_ident[j]); 675 for (j = 0; j < 2; j++) {
676 if (!cpudev->c_ident[j])
677 continue;
678 printk(KERN_INFO " %s %s\n", cpudev->c_vendor,
679 cpudev->c_ident[j]);
680 }
677 } 681 }
682#endif
678 } 683 }
679
680 early_identify_cpu(&boot_cpu_data); 684 early_identify_cpu(&boot_cpu_data);
681} 685}
682 686
@@ -837,10 +841,8 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
837 boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; 841 boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
838 } 842 }
839 843
840#ifdef CONFIG_X86_MCE
841 /* Init Machine Check Exception if available. */ 844 /* Init Machine Check Exception if available. */
842 mcheck_init(c); 845 mcheck_cpu_init(c);
843#endif
844 846
845 select_idle_routine(c); 847 select_idle_routine(c);
846 848
@@ -1093,7 +1095,7 @@ static void clear_all_debug_regs(void)
1093 1095
1094void __cpuinit cpu_init(void) 1096void __cpuinit cpu_init(void)
1095{ 1097{
1096 struct orig_ist *orig_ist; 1098 struct orig_ist *oist;
1097 struct task_struct *me; 1099 struct task_struct *me;
1098 struct tss_struct *t; 1100 struct tss_struct *t;
1099 unsigned long v; 1101 unsigned long v;
@@ -1102,7 +1104,7 @@ void __cpuinit cpu_init(void)
1102 1104
1103 cpu = stack_smp_processor_id(); 1105 cpu = stack_smp_processor_id();
1104 t = &per_cpu(init_tss, cpu); 1106 t = &per_cpu(init_tss, cpu);
1105 orig_ist = &per_cpu(orig_ist, cpu); 1107 oist = &per_cpu(orig_ist, cpu);
1106 1108
1107#ifdef CONFIG_NUMA 1109#ifdef CONFIG_NUMA
1108 if (cpu != 0 && percpu_read(node_number) == 0 && 1110 if (cpu != 0 && percpu_read(node_number) == 0 &&
@@ -1115,7 +1117,7 @@ void __cpuinit cpu_init(void)
1115 if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask)) 1117 if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask))
1116 panic("CPU#%d already initialized!\n", cpu); 1118 panic("CPU#%d already initialized!\n", cpu);
1117 1119
1118 printk(KERN_INFO "Initializing CPU#%d\n", cpu); 1120 pr_debug("Initializing CPU#%d\n", cpu);
1119 1121
1120 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); 1122 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
1121 1123
@@ -1143,12 +1145,12 @@ void __cpuinit cpu_init(void)
1143 /* 1145 /*
1144 * set up and load the per-CPU TSS 1146 * set up and load the per-CPU TSS
1145 */ 1147 */
1146 if (!orig_ist->ist[0]) { 1148 if (!oist->ist[0]) {
1147 char *estacks = per_cpu(exception_stacks, cpu); 1149 char *estacks = per_cpu(exception_stacks, cpu);
1148 1150
1149 for (v = 0; v < N_EXCEPTION_STACKS; v++) { 1151 for (v = 0; v < N_EXCEPTION_STACKS; v++) {
1150 estacks += exception_stack_sizes[v]; 1152 estacks += exception_stack_sizes[v];
1151 orig_ist->ist[v] = t->x86_tss.ist[v] = 1153 oist->ist[v] = t->x86_tss.ist[v] =
1152 (unsigned long)estacks; 1154 (unsigned long)estacks;
1153 } 1155 }
1154 } 1156 }
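
The renamed cpu_detect_cache_sizes() keeps the CPUID 0x80000005/0x80000006 decoding but drops the per-CPU printouts. As a reminder of the layout used by the removed messages, leaf 0x80000005 reports the L1 data cache size in KB in the top byte of ECX and the L1 instruction cache size in the top byte of EDX, with the line size in the low byte of each. A hedged sketch, not the kernel's exact function:

	#include <asm/processor.h>	/* cpuid() helper */

	static unsigned int demo_l1_cache_kb(void)
	{
		unsigned int eax, ebx, ecx, edx;

		cpuid(0x80000005, &eax, &ebx, &ecx, &edx);

		/* ECX[31:24] = L1D size in KB, EDX[31:24] = L1I size in KB */
		return (ecx >> 24) + (edx >> 24);
	}
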
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 6de9a908e400..3624e8a0f71b 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -32,6 +32,6 @@ struct cpu_dev {
32extern const struct cpu_dev *const __x86_cpu_dev_start[], 32extern const struct cpu_dev *const __x86_cpu_dev_start[],
33 *const __x86_cpu_dev_end[]; 33 *const __x86_cpu_dev_end[];
34 34
35extern void display_cacheinfo(struct cpuinfo_x86 *c); 35extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c);
36 36
37#endif 37#endif
diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c
deleted file mode 100644
index dca325c03999..000000000000
--- a/arch/x86/kernel/cpu/cpu_debug.c
+++ /dev/null
@@ -1,688 +0,0 @@
1/*
2 * CPU x86 architecture debug code
3 *
4 * Copyright(C) 2009 Jaswinder Singh Rajput
5 *
6 * For licencing details see kernel-base/COPYING
7 */
8
9#include <linux/interrupt.h>
10#include <linux/compiler.h>
11#include <linux/seq_file.h>
12#include <linux/debugfs.h>
13#include <linux/kprobes.h>
14#include <linux/uaccess.h>
15#include <linux/kernel.h>
16#include <linux/module.h>
17#include <linux/percpu.h>
18#include <linux/signal.h>
19#include <linux/errno.h>
20#include <linux/sched.h>
21#include <linux/types.h>
22#include <linux/init.h>
23#include <linux/slab.h>
24#include <linux/smp.h>
25
26#include <asm/cpu_debug.h>
27#include <asm/paravirt.h>
28#include <asm/system.h>
29#include <asm/traps.h>
30#include <asm/apic.h>
31#include <asm/desc.h>
32
33static DEFINE_PER_CPU(struct cpu_cpuX_base [CPU_REG_ALL_BIT], cpu_arr);
34static DEFINE_PER_CPU(struct cpu_private * [MAX_CPU_FILES], priv_arr);
35static DEFINE_PER_CPU(int, cpu_priv_count);
36
37static DEFINE_MUTEX(cpu_debug_lock);
38
39static struct dentry *cpu_debugfs_dir;
40
41static struct cpu_debug_base cpu_base[] = {
42 { "mc", CPU_MC, 0 },
43 { "monitor", CPU_MONITOR, 0 },
44 { "time", CPU_TIME, 0 },
45 { "pmc", CPU_PMC, 1 },
46 { "platform", CPU_PLATFORM, 0 },
47 { "apic", CPU_APIC, 0 },
48 { "poweron", CPU_POWERON, 0 },
49 { "control", CPU_CONTROL, 0 },
50 { "features", CPU_FEATURES, 0 },
51 { "lastbranch", CPU_LBRANCH, 0 },
52 { "bios", CPU_BIOS, 0 },
53 { "freq", CPU_FREQ, 0 },
54 { "mtrr", CPU_MTRR, 0 },
55 { "perf", CPU_PERF, 0 },
56 { "cache", CPU_CACHE, 0 },
57 { "sysenter", CPU_SYSENTER, 0 },
58 { "therm", CPU_THERM, 0 },
59 { "misc", CPU_MISC, 0 },
60 { "debug", CPU_DEBUG, 0 },
61 { "pat", CPU_PAT, 0 },
62 { "vmx", CPU_VMX, 0 },
63 { "call", CPU_CALL, 0 },
64 { "base", CPU_BASE, 0 },
65 { "ver", CPU_VER, 0 },
66 { "conf", CPU_CONF, 0 },
67 { "smm", CPU_SMM, 0 },
68 { "svm", CPU_SVM, 0 },
69 { "osvm", CPU_OSVM, 0 },
70 { "tss", CPU_TSS, 0 },
71 { "cr", CPU_CR, 0 },
72 { "dt", CPU_DT, 0 },
73 { "registers", CPU_REG_ALL, 0 },
74};
75
76static struct cpu_file_base cpu_file[] = {
77 { "index", CPU_REG_ALL, 0 },
78 { "value", CPU_REG_ALL, 1 },
79};
80
81/* CPU Registers Range */
82static struct cpu_debug_range cpu_reg_range[] = {
83 { 0x00000000, 0x00000001, CPU_MC, },
84 { 0x00000006, 0x00000007, CPU_MONITOR, },
85 { 0x00000010, 0x00000010, CPU_TIME, },
86 { 0x00000011, 0x00000013, CPU_PMC, },
87 { 0x00000017, 0x00000017, CPU_PLATFORM, },
88 { 0x0000001B, 0x0000001B, CPU_APIC, },
89 { 0x0000002A, 0x0000002B, CPU_POWERON, },
90 { 0x0000002C, 0x0000002C, CPU_FREQ, },
91 { 0x0000003A, 0x0000003A, CPU_CONTROL, },
92 { 0x00000040, 0x00000047, CPU_LBRANCH, },
93 { 0x00000060, 0x00000067, CPU_LBRANCH, },
94 { 0x00000079, 0x00000079, CPU_BIOS, },
95 { 0x00000088, 0x0000008A, CPU_CACHE, },
96 { 0x0000008B, 0x0000008B, CPU_BIOS, },
97 { 0x0000009B, 0x0000009B, CPU_MONITOR, },
98 { 0x000000C1, 0x000000C4, CPU_PMC, },
99 { 0x000000CD, 0x000000CD, CPU_FREQ, },
100 { 0x000000E7, 0x000000E8, CPU_PERF, },
101 { 0x000000FE, 0x000000FE, CPU_MTRR, },
102
103 { 0x00000116, 0x0000011E, CPU_CACHE, },
104 { 0x00000174, 0x00000176, CPU_SYSENTER, },
105 { 0x00000179, 0x0000017B, CPU_MC, },
106 { 0x00000186, 0x00000189, CPU_PMC, },
107 { 0x00000198, 0x00000199, CPU_PERF, },
108 { 0x0000019A, 0x0000019A, CPU_TIME, },
109 { 0x0000019B, 0x0000019D, CPU_THERM, },
110 { 0x000001A0, 0x000001A0, CPU_MISC, },
111 { 0x000001C9, 0x000001C9, CPU_LBRANCH, },
112 { 0x000001D7, 0x000001D8, CPU_LBRANCH, },
113 { 0x000001D9, 0x000001D9, CPU_DEBUG, },
114 { 0x000001DA, 0x000001E0, CPU_LBRANCH, },
115
116 { 0x00000200, 0x0000020F, CPU_MTRR, },
117 { 0x00000250, 0x00000250, CPU_MTRR, },
118 { 0x00000258, 0x00000259, CPU_MTRR, },
119 { 0x00000268, 0x0000026F, CPU_MTRR, },
120 { 0x00000277, 0x00000277, CPU_PAT, },
121 { 0x000002FF, 0x000002FF, CPU_MTRR, },
122
123 { 0x00000300, 0x00000311, CPU_PMC, },
124 { 0x00000345, 0x00000345, CPU_PMC, },
125 { 0x00000360, 0x00000371, CPU_PMC, },
126 { 0x0000038D, 0x00000390, CPU_PMC, },
127 { 0x000003A0, 0x000003BE, CPU_PMC, },
128 { 0x000003C0, 0x000003CD, CPU_PMC, },
129 { 0x000003E0, 0x000003E1, CPU_PMC, },
130 { 0x000003F0, 0x000003F2, CPU_PMC, },
131
132 { 0x00000400, 0x00000417, CPU_MC, },
133 { 0x00000480, 0x0000048B, CPU_VMX, },
134
135 { 0x00000600, 0x00000600, CPU_DEBUG, },
136 { 0x00000680, 0x0000068F, CPU_LBRANCH, },
137 { 0x000006C0, 0x000006CF, CPU_LBRANCH, },
138
139 { 0x000107CC, 0x000107D3, CPU_PMC, },
140
141 { 0xC0000080, 0xC0000080, CPU_FEATURES, },
142 { 0xC0000081, 0xC0000084, CPU_CALL, },
143 { 0xC0000100, 0xC0000102, CPU_BASE, },
144 { 0xC0000103, 0xC0000103, CPU_TIME, },
145
146 { 0xC0010000, 0xC0010007, CPU_PMC, },
147 { 0xC0010010, 0xC0010010, CPU_CONF, },
148 { 0xC0010015, 0xC0010015, CPU_CONF, },
149 { 0xC0010016, 0xC001001A, CPU_MTRR, },
150 { 0xC001001D, 0xC001001D, CPU_MTRR, },
151 { 0xC001001F, 0xC001001F, CPU_CONF, },
152 { 0xC0010030, 0xC0010035, CPU_BIOS, },
153 { 0xC0010044, 0xC0010048, CPU_MC, },
154 { 0xC0010050, 0xC0010056, CPU_SMM, },
155 { 0xC0010058, 0xC0010058, CPU_CONF, },
156 { 0xC0010060, 0xC0010060, CPU_CACHE, },
157 { 0xC0010061, 0xC0010068, CPU_SMM, },
158 { 0xC0010069, 0xC001006B, CPU_SMM, },
159 { 0xC0010070, 0xC0010071, CPU_SMM, },
160 { 0xC0010111, 0xC0010113, CPU_SMM, },
161 { 0xC0010114, 0xC0010118, CPU_SVM, },
162 { 0xC0010140, 0xC0010141, CPU_OSVM, },
163 { 0xC0011022, 0xC0011023, CPU_CONF, },
164};
165
166static int is_typeflag_valid(unsigned cpu, unsigned flag)
167{
168 int i;
169
170 /* Standard Registers should be always valid */
171 if (flag >= CPU_TSS)
172 return 1;
173
174 for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) {
175 if (cpu_reg_range[i].flag == flag)
176 return 1;
177 }
178
179 /* Invalid */
180 return 0;
181}
182
183static unsigned get_cpu_range(unsigned cpu, unsigned *min, unsigned *max,
184 int index, unsigned flag)
185{
186 if (cpu_reg_range[index].flag == flag) {
187 *min = cpu_reg_range[index].min;
188 *max = cpu_reg_range[index].max;
189 } else
190 *max = 0;
191
192 return *max;
193}
194
195/* This function can also be called with seq = NULL for printk */
196static void print_cpu_data(struct seq_file *seq, unsigned type,
197 u32 low, u32 high)
198{
199 struct cpu_private *priv;
200 u64 val = high;
201
202 if (seq) {
203 priv = seq->private;
204 if (priv->file) {
205 val = (val << 32) | low;
206 seq_printf(seq, "0x%llx\n", val);
207 } else
208 seq_printf(seq, " %08x: %08x_%08x\n",
209 type, high, low);
210 } else
211 printk(KERN_INFO " %08x: %08x_%08x\n", type, high, low);
212}
213
214/* This function can also be called with seq = NULL for printk */
215static void print_msr(struct seq_file *seq, unsigned cpu, unsigned flag)
216{
217 unsigned msr, msr_min, msr_max;
218 struct cpu_private *priv;
219 u32 low, high;
220 int i;
221
222 if (seq) {
223 priv = seq->private;
224 if (priv->file) {
225 if (!rdmsr_safe_on_cpu(priv->cpu, priv->reg,
226 &low, &high))
227 print_cpu_data(seq, priv->reg, low, high);
228 return;
229 }
230 }
231
232 for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) {
233 if (!get_cpu_range(cpu, &msr_min, &msr_max, i, flag))
234 continue;
235
236 for (msr = msr_min; msr <= msr_max; msr++) {
237 if (rdmsr_safe_on_cpu(cpu, msr, &low, &high))
238 continue;
239 print_cpu_data(seq, msr, low, high);
240 }
241 }
242}
243
244static void print_tss(void *arg)
245{
246 struct pt_regs *regs = task_pt_regs(current);
247 struct seq_file *seq = arg;
248 unsigned int seg;
249
250 seq_printf(seq, " RAX\t: %016lx\n", regs->ax);
251 seq_printf(seq, " RBX\t: %016lx\n", regs->bx);
252 seq_printf(seq, " RCX\t: %016lx\n", regs->cx);
253 seq_printf(seq, " RDX\t: %016lx\n", regs->dx);
254
255 seq_printf(seq, " RSI\t: %016lx\n", regs->si);
256 seq_printf(seq, " RDI\t: %016lx\n", regs->di);
257 seq_printf(seq, " RBP\t: %016lx\n", regs->bp);
258 seq_printf(seq, " ESP\t: %016lx\n", regs->sp);
259
260#ifdef CONFIG_X86_64
261 seq_printf(seq, " R08\t: %016lx\n", regs->r8);
262 seq_printf(seq, " R09\t: %016lx\n", regs->r9);
263 seq_printf(seq, " R10\t: %016lx\n", regs->r10);
264 seq_printf(seq, " R11\t: %016lx\n", regs->r11);
265 seq_printf(seq, " R12\t: %016lx\n", regs->r12);
266 seq_printf(seq, " R13\t: %016lx\n", regs->r13);
267 seq_printf(seq, " R14\t: %016lx\n", regs->r14);
268 seq_printf(seq, " R15\t: %016lx\n", regs->r15);
269#endif
270
271 asm("movl %%cs,%0" : "=r" (seg));
272 seq_printf(seq, " CS\t: %04x\n", seg);
273 asm("movl %%ds,%0" : "=r" (seg));
274 seq_printf(seq, " DS\t: %04x\n", seg);
275 seq_printf(seq, " SS\t: %04lx\n", regs->ss & 0xffff);
276 asm("movl %%es,%0" : "=r" (seg));
277 seq_printf(seq, " ES\t: %04x\n", seg);
278 asm("movl %%fs,%0" : "=r" (seg));
279 seq_printf(seq, " FS\t: %04x\n", seg);
280 asm("movl %%gs,%0" : "=r" (seg));
281 seq_printf(seq, " GS\t: %04x\n", seg);
282
283 seq_printf(seq, " EFLAGS\t: %016lx\n", regs->flags);
284
285 seq_printf(seq, " EIP\t: %016lx\n", regs->ip);
286}
287
288static void print_cr(void *arg)
289{
290 struct seq_file *seq = arg;
291
292 seq_printf(seq, " cr0\t: %016lx\n", read_cr0());
293 seq_printf(seq, " cr2\t: %016lx\n", read_cr2());
294 seq_printf(seq, " cr3\t: %016lx\n", read_cr3());
295 seq_printf(seq, " cr4\t: %016lx\n", read_cr4_safe());
296#ifdef CONFIG_X86_64
297 seq_printf(seq, " cr8\t: %016lx\n", read_cr8());
298#endif
299}
300
301static void print_desc_ptr(char *str, struct seq_file *seq, struct desc_ptr dt)
302{
303 seq_printf(seq, " %s\t: %016llx\n", str, (u64)(dt.address | dt.size));
304}
305
306static void print_dt(void *seq)
307{
308 struct desc_ptr dt;
309 unsigned long ldt;
310
311 /* IDT */
312 store_idt((struct desc_ptr *)&dt);
313 print_desc_ptr("IDT", seq, dt);
314
315 /* GDT */
316 store_gdt((struct desc_ptr *)&dt);
317 print_desc_ptr("GDT", seq, dt);
318
319 /* LDT */
320 store_ldt(ldt);
321 seq_printf(seq, " LDT\t: %016lx\n", ldt);
322
323 /* TR */
324 store_tr(ldt);
325 seq_printf(seq, " TR\t: %016lx\n", ldt);
326}
327
328static void print_dr(void *arg)
329{
330 struct seq_file *seq = arg;
331 unsigned long dr;
332 int i;
333
334 for (i = 0; i < 8; i++) {
335 /* Ignore db4, db5 */
336 if ((i == 4) || (i == 5))
337 continue;
338 get_debugreg(dr, i);
339 seq_printf(seq, " dr%d\t: %016lx\n", i, dr);
340 }
341
342 seq_printf(seq, "\n MSR\t:\n");
343}
344
345static void print_apic(void *arg)
346{
347 struct seq_file *seq = arg;
348
349#ifdef CONFIG_X86_LOCAL_APIC
350 seq_printf(seq, " LAPIC\t:\n");
351 seq_printf(seq, " ID\t\t: %08x\n", apic_read(APIC_ID) >> 24);
352 seq_printf(seq, " LVR\t\t: %08x\n", apic_read(APIC_LVR));
353 seq_printf(seq, " TASKPRI\t: %08x\n", apic_read(APIC_TASKPRI));
354 seq_printf(seq, " ARBPRI\t\t: %08x\n", apic_read(APIC_ARBPRI));
355 seq_printf(seq, " PROCPRI\t: %08x\n", apic_read(APIC_PROCPRI));
356 seq_printf(seq, " LDR\t\t: %08x\n", apic_read(APIC_LDR));
357 seq_printf(seq, " DFR\t\t: %08x\n", apic_read(APIC_DFR));
358 seq_printf(seq, " SPIV\t\t: %08x\n", apic_read(APIC_SPIV));
359 seq_printf(seq, " ISR\t\t: %08x\n", apic_read(APIC_ISR));
360 seq_printf(seq, " ESR\t\t: %08x\n", apic_read(APIC_ESR));
361 seq_printf(seq, " ICR\t\t: %08x\n", apic_read(APIC_ICR));
362 seq_printf(seq, " ICR2\t\t: %08x\n", apic_read(APIC_ICR2));
363 seq_printf(seq, " LVTT\t\t: %08x\n", apic_read(APIC_LVTT));
364 seq_printf(seq, " LVTTHMR\t: %08x\n", apic_read(APIC_LVTTHMR));
365 seq_printf(seq, " LVTPC\t\t: %08x\n", apic_read(APIC_LVTPC));
366 seq_printf(seq, " LVT0\t\t: %08x\n", apic_read(APIC_LVT0));
367 seq_printf(seq, " LVT1\t\t: %08x\n", apic_read(APIC_LVT1));
368 seq_printf(seq, " LVTERR\t\t: %08x\n", apic_read(APIC_LVTERR));
369 seq_printf(seq, " TMICT\t\t: %08x\n", apic_read(APIC_TMICT));
370 seq_printf(seq, " TMCCT\t\t: %08x\n", apic_read(APIC_TMCCT));
371 seq_printf(seq, " TDCR\t\t: %08x\n", apic_read(APIC_TDCR));
372 if (boot_cpu_has(X86_FEATURE_EXTAPIC)) {
373 unsigned int i, v, maxeilvt;
374
375 v = apic_read(APIC_EFEAT);
376 maxeilvt = (v >> 16) & 0xff;
377 seq_printf(seq, " EFEAT\t\t: %08x\n", v);
378 seq_printf(seq, " ECTRL\t\t: %08x\n", apic_read(APIC_ECTRL));
379
380 for (i = 0; i < maxeilvt; i++) {
381 v = apic_read(APIC_EILVTn(i));
382 seq_printf(seq, " EILVT%d\t\t: %08x\n", i, v);
383 }
384 }
385#endif /* CONFIG_X86_LOCAL_APIC */
386 seq_printf(seq, "\n MSR\t:\n");
387}
388
389static int cpu_seq_show(struct seq_file *seq, void *v)
390{
391 struct cpu_private *priv = seq->private;
392
393 if (priv == NULL)
394 return -EINVAL;
395
396 switch (cpu_base[priv->type].flag) {
397 case CPU_TSS:
398 smp_call_function_single(priv->cpu, print_tss, seq, 1);
399 break;
400 case CPU_CR:
401 smp_call_function_single(priv->cpu, print_cr, seq, 1);
402 break;
403 case CPU_DT:
404 smp_call_function_single(priv->cpu, print_dt, seq, 1);
405 break;
406 case CPU_DEBUG:
407 if (priv->file == CPU_INDEX_BIT)
408 smp_call_function_single(priv->cpu, print_dr, seq, 1);
409 print_msr(seq, priv->cpu, cpu_base[priv->type].flag);
410 break;
411 case CPU_APIC:
412 if (priv->file == CPU_INDEX_BIT)
413 smp_call_function_single(priv->cpu, print_apic, seq, 1);
414 print_msr(seq, priv->cpu, cpu_base[priv->type].flag);
415 break;
416
417 default:
418 print_msr(seq, priv->cpu, cpu_base[priv->type].flag);
419 break;
420 }
421 seq_printf(seq, "\n");
422
423 return 0;
424}
425
426static void *cpu_seq_start(struct seq_file *seq, loff_t *pos)
427{
428 if (*pos == 0) /* One time is enough ;-) */
429 return seq;
430
431 return NULL;
432}
433
434static void *cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
435{
436 (*pos)++;
437
438 return cpu_seq_start(seq, pos);
439}
440
441static void cpu_seq_stop(struct seq_file *seq, void *v)
442{
443}
444
445static const struct seq_operations cpu_seq_ops = {
446 .start = cpu_seq_start,
447 .next = cpu_seq_next,
448 .stop = cpu_seq_stop,
449 .show = cpu_seq_show,
450};
451
452static int cpu_seq_open(struct inode *inode, struct file *file)
453{
454 struct cpu_private *priv = inode->i_private;
455 struct seq_file *seq;
456 int err;
457
458 err = seq_open(file, &cpu_seq_ops);
459 if (!err) {
460 seq = file->private_data;
461 seq->private = priv;
462 }
463
464 return err;
465}
466
467static int write_msr(struct cpu_private *priv, u64 val)
468{
469 u32 low, high;
470
471 high = (val >> 32) & 0xffffffff;
472 low = val & 0xffffffff;
473
474 if (!wrmsr_safe_on_cpu(priv->cpu, priv->reg, low, high))
475 return 0;
476
477 return -EPERM;
478}
479
480static int write_cpu_register(struct cpu_private *priv, const char *buf)
481{
482 int ret = -EPERM;
483 u64 val;
484
485 ret = strict_strtoull(buf, 0, &val);
486 if (ret < 0)
487 return ret;
488
489 /* Supporting only MSRs */
490 if (priv->type < CPU_TSS_BIT)
491 return write_msr(priv, val);
492
493 return ret;
494}
495
496static ssize_t cpu_write(struct file *file, const char __user *ubuf,
497 size_t count, loff_t *off)
498{
499 struct seq_file *seq = file->private_data;
500 struct cpu_private *priv = seq->private;
501 char buf[19];
502
503 if ((priv == NULL) || (count >= sizeof(buf)))
504 return -EINVAL;
505
506 if (copy_from_user(&buf, ubuf, count))
507 return -EFAULT;
508
509 buf[count] = 0;
510
511 if ((cpu_base[priv->type].write) && (cpu_file[priv->file].write))
512 if (!write_cpu_register(priv, buf))
513 return count;
514
515 return -EACCES;
516}
517
518static const struct file_operations cpu_fops = {
519 .owner = THIS_MODULE,
520 .open = cpu_seq_open,
521 .read = seq_read,
522 .write = cpu_write,
523 .llseek = seq_lseek,
524 .release = seq_release,
525};
526
527static int cpu_create_file(unsigned cpu, unsigned type, unsigned reg,
528 unsigned file, struct dentry *dentry)
529{
530 struct cpu_private *priv = NULL;
531
532 /* Already intialized */
533 if (file == CPU_INDEX_BIT)
534 if (per_cpu(cpu_arr[type].init, cpu))
535 return 0;
536
537 priv = kzalloc(sizeof(*priv), GFP_KERNEL);
538 if (priv == NULL)
539 return -ENOMEM;
540
541 priv->cpu = cpu;
542 priv->type = type;
543 priv->reg = reg;
544 priv->file = file;
545 mutex_lock(&cpu_debug_lock);
546 per_cpu(priv_arr[type], cpu) = priv;
547 per_cpu(cpu_priv_count, cpu)++;
548 mutex_unlock(&cpu_debug_lock);
549
550 if (file)
551 debugfs_create_file(cpu_file[file].name, S_IRUGO,
552 dentry, (void *)priv, &cpu_fops);
553 else {
554 debugfs_create_file(cpu_base[type].name, S_IRUGO,
555 per_cpu(cpu_arr[type].dentry, cpu),
556 (void *)priv, &cpu_fops);
557 mutex_lock(&cpu_debug_lock);
558 per_cpu(cpu_arr[type].init, cpu) = 1;
559 mutex_unlock(&cpu_debug_lock);
560 }
561
562 return 0;
563}
564
565static int cpu_init_regfiles(unsigned cpu, unsigned int type, unsigned reg,
566 struct dentry *dentry)
567{
568 unsigned file;
569 int err = 0;
570
571 for (file = 0; file < ARRAY_SIZE(cpu_file); file++) {
572 err = cpu_create_file(cpu, type, reg, file, dentry);
573 if (err)
574 return err;
575 }
576
577 return err;
578}
579
580static int cpu_init_msr(unsigned cpu, unsigned type, struct dentry *dentry)
581{
582 struct dentry *cpu_dentry = NULL;
583 unsigned reg, reg_min, reg_max;
584 int i, err = 0;
585 char reg_dir[12];
586 u32 low, high;
587
588 for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) {
589 if (!get_cpu_range(cpu, &reg_min, &reg_max, i,
590 cpu_base[type].flag))
591 continue;
592
593 for (reg = reg_min; reg <= reg_max; reg++) {
594 if (rdmsr_safe_on_cpu(cpu, reg, &low, &high))
595 continue;
596
597 sprintf(reg_dir, "0x%x", reg);
598 cpu_dentry = debugfs_create_dir(reg_dir, dentry);
599 err = cpu_init_regfiles(cpu, type, reg, cpu_dentry);
600 if (err)
601 return err;
602 }
603 }
604
605 return err;
606}
607
608static int cpu_init_allreg(unsigned cpu, struct dentry *dentry)
609{
610 struct dentry *cpu_dentry = NULL;
611 unsigned type;
612 int err = 0;
613
614 for (type = 0; type < ARRAY_SIZE(cpu_base) - 1; type++) {
615 if (!is_typeflag_valid(cpu, cpu_base[type].flag))
616 continue;
617 cpu_dentry = debugfs_create_dir(cpu_base[type].name, dentry);
618 per_cpu(cpu_arr[type].dentry, cpu) = cpu_dentry;
619
620 if (type < CPU_TSS_BIT)
621 err = cpu_init_msr(cpu, type, cpu_dentry);
622 else
623 err = cpu_create_file(cpu, type, 0, CPU_INDEX_BIT,
624 cpu_dentry);
625 if (err)
626 return err;
627 }
628
629 return err;
630}
631
632static int cpu_init_cpu(void)
633{
634 struct dentry *cpu_dentry = NULL;
635 struct cpuinfo_x86 *cpui;
636 char cpu_dir[12];
637 unsigned cpu;
638 int err = 0;
639
640 for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
641 cpui = &cpu_data(cpu);
642 if (!cpu_has(cpui, X86_FEATURE_MSR))
643 continue;
644
645 sprintf(cpu_dir, "cpu%d", cpu);
646 cpu_dentry = debugfs_create_dir(cpu_dir, cpu_debugfs_dir);
647 err = cpu_init_allreg(cpu, cpu_dentry);
648
649 pr_info("cpu%d(%d) debug files %d\n",
650 cpu, nr_cpu_ids, per_cpu(cpu_priv_count, cpu));
651 if (per_cpu(cpu_priv_count, cpu) > MAX_CPU_FILES) {
652 pr_err("Register files count %d exceeds limit %d\n",
653 per_cpu(cpu_priv_count, cpu), MAX_CPU_FILES);
654 per_cpu(cpu_priv_count, cpu) = MAX_CPU_FILES;
655 err = -ENFILE;
656 }
657 if (err)
658 return err;
659 }
660
661 return err;
662}
663
664static int __init cpu_debug_init(void)
665{
666 cpu_debugfs_dir = debugfs_create_dir("cpu", arch_debugfs_dir);
667
668 return cpu_init_cpu();
669}
670
671static void __exit cpu_debug_exit(void)
672{
673 int i, cpu;
674
675 if (cpu_debugfs_dir)
676 debugfs_remove_recursive(cpu_debugfs_dir);
677
678 for (cpu = 0; cpu < nr_cpu_ids; cpu++)
679 for (i = 0; i < per_cpu(cpu_priv_count, cpu); i++)
680 kfree(per_cpu(priv_arr[i], cpu));
681}
682
683module_init(cpu_debug_init);
684module_exit(cpu_debug_exit);
685
686MODULE_AUTHOR("Jaswinder Singh Rajput");
687MODULE_DESCRIPTION("CPU Debug module");
688MODULE_LICENSE("GPL");
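
The deleted cpu_debug.c exposed CPU registers through debugfs with a hand-rolled seq_file iterator. For reference, a much-reduced, hedged sketch of the debugfs + seq_file + rdmsr_safe_on_cpu() combination it was built on; the names and the chosen MSR are hypothetical and this is not a replacement for the removed interface.

	#include <linux/debugfs.h>
	#include <linux/seq_file.h>
	#include <linux/module.h>
	#include <asm/msr.h>

	#define DEMO_MSR	0x10	/* TSC, purely as an example register */

	static int demo_msr_show(struct seq_file *m, void *v)
	{
		u32 low, high;

		/* rdmsr_safe_on_cpu() returns 0 on success, as in the deleted code */
		if (!rdmsr_safe_on_cpu(0, DEMO_MSR, &low, &high))
			seq_printf(m, " %08x: %08x_%08x\n", DEMO_MSR, high, low);
		return 0;
	}

	static int demo_msr_open(struct inode *inode, struct file *file)
	{
		return single_open(file, demo_msr_show, NULL);
	}

	static const struct file_operations demo_msr_fops = {
		.owner		= THIS_MODULE,
		.open		= demo_msr_open,
		.read		= seq_read,
		.llseek		= seq_lseek,
		.release	= single_release,
	};

	static struct dentry *demo_dir;

	static int __init demo_init(void)
	{
		demo_dir = debugfs_create_dir("cpu_demo", NULL);
		debugfs_create_file("msr", S_IRUGO, demo_dir, NULL, &demo_msr_fops);
		return 0;
	}

	static void __exit demo_exit(void)
	{
		debugfs_remove_recursive(demo_dir);
	}

	module_init(demo_init);
	module_exit(demo_exit);
	MODULE_LICENSE("GPL");
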
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 7d5c3b0ea8da..1b1920fa7c80 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -68,9 +68,9 @@ struct acpi_cpufreq_data {
68 unsigned int cpu_feature; 68 unsigned int cpu_feature;
69}; 69};
70 70
71static DEFINE_PER_CPU(struct acpi_cpufreq_data *, drv_data); 71static DEFINE_PER_CPU(struct acpi_cpufreq_data *, acfreq_data);
72 72
73static DEFINE_PER_CPU(struct aperfmperf, old_perf); 73static DEFINE_PER_CPU(struct aperfmperf, acfreq_old_perf);
74 74
75/* acpi_perf_data is a pointer to percpu data. */ 75/* acpi_perf_data is a pointer to percpu data. */
76static struct acpi_processor_performance *acpi_perf_data; 76static struct acpi_processor_performance *acpi_perf_data;
@@ -190,9 +190,11 @@ static void do_drv_write(void *_cmd)
190 190
191static void drv_read(struct drv_cmd *cmd) 191static void drv_read(struct drv_cmd *cmd)
192{ 192{
193 int err;
193 cmd->val = 0; 194 cmd->val = 0;
194 195
195 smp_call_function_single(cpumask_any(cmd->mask), do_drv_read, cmd, 1); 196 err = smp_call_function_any(cmd->mask, do_drv_read, cmd, 1);
197 WARN_ON_ONCE(err); /* smp_call_function_any() was buggy? */
196} 198}
197 199
198static void drv_write(struct drv_cmd *cmd) 200static void drv_write(struct drv_cmd *cmd)
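
drv_read() above switches from smp_call_function_single(cpumask_any(...)) to smp_call_function_any(), which prefers the current CPU (or one sharing its cache domain) when it is part of the mask, avoiding a needless cross-call. A hedged sketch of the call with hypothetical names:

	#include <linux/smp.h>
	#include <linux/cpumask.h>

	static void demo_remote_work(void *info)
	{
		/* runs on some CPU from the requested mask, possibly the local one */
		*(int *)info = smp_processor_id();
	}

	static int demo_run_on_mask(const struct cpumask *mask)
	{
		int where = -1;
		int err;

		/* wait == 1: return only after demo_remote_work() has completed */
		err = smp_call_function_any(mask, demo_remote_work, &where, 1);
		return err ? err : where;
	}
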
@@ -214,14 +216,14 @@ static u32 get_cur_val(const struct cpumask *mask)
214 if (unlikely(cpumask_empty(mask))) 216 if (unlikely(cpumask_empty(mask)))
215 return 0; 217 return 0;
216 218
217 switch (per_cpu(drv_data, cpumask_first(mask))->cpu_feature) { 219 switch (per_cpu(acfreq_data, cpumask_first(mask))->cpu_feature) {
218 case SYSTEM_INTEL_MSR_CAPABLE: 220 case SYSTEM_INTEL_MSR_CAPABLE:
219 cmd.type = SYSTEM_INTEL_MSR_CAPABLE; 221 cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
220 cmd.addr.msr.reg = MSR_IA32_PERF_STATUS; 222 cmd.addr.msr.reg = MSR_IA32_PERF_STATUS;
221 break; 223 break;
222 case SYSTEM_IO_CAPABLE: 224 case SYSTEM_IO_CAPABLE:
223 cmd.type = SYSTEM_IO_CAPABLE; 225 cmd.type = SYSTEM_IO_CAPABLE;
224 perf = per_cpu(drv_data, cpumask_first(mask))->acpi_data; 226 perf = per_cpu(acfreq_data, cpumask_first(mask))->acpi_data;
225 cmd.addr.io.port = perf->control_register.address; 227 cmd.addr.io.port = perf->control_register.address;
226 cmd.addr.io.bit_width = perf->control_register.bit_width; 228 cmd.addr.io.bit_width = perf->control_register.bit_width;
227 break; 229 break;
@@ -268,8 +270,8 @@ static unsigned int get_measured_perf(struct cpufreq_policy *policy,
268 if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1)) 270 if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1))
269 return 0; 271 return 0;
270 272
271 ratio = calc_aperfmperf_ratio(&per_cpu(old_perf, cpu), &perf); 273 ratio = calc_aperfmperf_ratio(&per_cpu(acfreq_old_perf, cpu), &perf);
272 per_cpu(old_perf, cpu) = perf; 274 per_cpu(acfreq_old_perf, cpu) = perf;
273 275
274 retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT; 276 retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT;
275 277
@@ -278,7 +280,7 @@ static unsigned int get_measured_perf(struct cpufreq_policy *policy,
278 280
279static unsigned int get_cur_freq_on_cpu(unsigned int cpu) 281static unsigned int get_cur_freq_on_cpu(unsigned int cpu)
280{ 282{
281 struct acpi_cpufreq_data *data = per_cpu(drv_data, cpu); 283 struct acpi_cpufreq_data *data = per_cpu(acfreq_data, cpu);
282 unsigned int freq; 284 unsigned int freq;
283 unsigned int cached_freq; 285 unsigned int cached_freq;
284 286
@@ -322,7 +324,7 @@ static unsigned int check_freqs(const struct cpumask *mask, unsigned int freq,
322static int acpi_cpufreq_target(struct cpufreq_policy *policy, 324static int acpi_cpufreq_target(struct cpufreq_policy *policy,
323 unsigned int target_freq, unsigned int relation) 325 unsigned int target_freq, unsigned int relation)
324{ 326{
325 struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu); 327 struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu);
326 struct acpi_processor_performance *perf; 328 struct acpi_processor_performance *perf;
327 struct cpufreq_freqs freqs; 329 struct cpufreq_freqs freqs;
328 struct drv_cmd cmd; 330 struct drv_cmd cmd;
@@ -416,7 +418,7 @@ out:
416 418
417static int acpi_cpufreq_verify(struct cpufreq_policy *policy) 419static int acpi_cpufreq_verify(struct cpufreq_policy *policy)
418{ 420{
419 struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu); 421 struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu);
420 422
421 dprintk("acpi_cpufreq_verify\n"); 423 dprintk("acpi_cpufreq_verify\n");
422 424
@@ -526,15 +528,21 @@ static const struct dmi_system_id sw_any_bug_dmi_table[] = {
526 528
527static int acpi_cpufreq_blacklist(struct cpuinfo_x86 *c) 529static int acpi_cpufreq_blacklist(struct cpuinfo_x86 *c)
528{ 530{
529 /* http://www.intel.com/Assets/PDF/specupdate/314554.pdf 531 /* Intel Xeon Processor 7100 Series Specification Update
532 * http://www.intel.com/Assets/PDF/specupdate/314554.pdf
530 * AL30: A Machine Check Exception (MCE) Occurring during an 533 * AL30: A Machine Check Exception (MCE) Occurring during an
531 * Enhanced Intel SpeedStep Technology Ratio Change May Cause 534 * Enhanced Intel SpeedStep Technology Ratio Change May Cause
532 * Both Processor Cores to Lock Up when HT is enabled*/ 535 * Both Processor Cores to Lock Up. */
533 if (c->x86_vendor == X86_VENDOR_INTEL) { 536 if (c->x86_vendor == X86_VENDOR_INTEL) {
534 if ((c->x86 == 15) && 537 if ((c->x86 == 15) &&
535 (c->x86_model == 6) && 538 (c->x86_model == 6) &&
536 (c->x86_mask == 8) && smt_capable()) 539 (c->x86_mask == 8)) {
540 printk(KERN_INFO "acpi-cpufreq: Intel(R) "
541 "Xeon(R) 7100 Errata AL30, processors may "
542 "lock up on frequency changes: disabling "
543 "acpi-cpufreq.\n");
537 return -ENODEV; 544 return -ENODEV;
545 }
538 } 546 }
539 return 0; 547 return 0;
540} 548}
@@ -549,13 +557,18 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
549 unsigned int result = 0; 557 unsigned int result = 0;
550 struct cpuinfo_x86 *c = &cpu_data(policy->cpu); 558 struct cpuinfo_x86 *c = &cpu_data(policy->cpu);
551 struct acpi_processor_performance *perf; 559 struct acpi_processor_performance *perf;
560#ifdef CONFIG_SMP
561 static int blacklisted;
562#endif
552 563
553 dprintk("acpi_cpufreq_cpu_init\n"); 564 dprintk("acpi_cpufreq_cpu_init\n");
554 565
555#ifdef CONFIG_SMP 566#ifdef CONFIG_SMP
556 result = acpi_cpufreq_blacklist(c); 567 if (blacklisted)
557 if (result) 568 return blacklisted;
558 return result; 569 blacklisted = acpi_cpufreq_blacklist(c);
570 if (blacklisted)
571 return blacklisted;
559#endif 572#endif
560 573
561 data = kzalloc(sizeof(struct acpi_cpufreq_data), GFP_KERNEL); 574 data = kzalloc(sizeof(struct acpi_cpufreq_data), GFP_KERNEL);
@@ -563,7 +576,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
563 return -ENOMEM; 576 return -ENOMEM;
564 577
565 data->acpi_data = per_cpu_ptr(acpi_perf_data, cpu); 578 data->acpi_data = per_cpu_ptr(acpi_perf_data, cpu);
566 per_cpu(drv_data, cpu) = data; 579 per_cpu(acfreq_data, cpu) = data;
567 580
568 if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) 581 if (cpu_has(c, X86_FEATURE_CONSTANT_TSC))
569 acpi_cpufreq_driver.flags |= CPUFREQ_CONST_LOOPS; 582 acpi_cpufreq_driver.flags |= CPUFREQ_CONST_LOOPS;
@@ -714,20 +727,20 @@ err_unreg:
714 acpi_processor_unregister_performance(perf, cpu); 727 acpi_processor_unregister_performance(perf, cpu);
715err_free: 728err_free:
716 kfree(data); 729 kfree(data);
717 per_cpu(drv_data, cpu) = NULL; 730 per_cpu(acfreq_data, cpu) = NULL;
718 731
719 return result; 732 return result;
720} 733}
721 734
722static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy) 735static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy)
723{ 736{
724 struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu); 737 struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu);
725 738
726 dprintk("acpi_cpufreq_cpu_exit\n"); 739 dprintk("acpi_cpufreq_cpu_exit\n");
727 740
728 if (data) { 741 if (data) {
729 cpufreq_frequency_table_put_attr(policy->cpu); 742 cpufreq_frequency_table_put_attr(policy->cpu);
730 per_cpu(drv_data, policy->cpu) = NULL; 743 per_cpu(acfreq_data, policy->cpu) = NULL;
731 acpi_processor_unregister_performance(data->acpi_data, 744 acpi_processor_unregister_performance(data->acpi_data,
732 policy->cpu); 745 policy->cpu);
733 kfree(data); 746 kfree(data);
@@ -738,7 +751,7 @@ static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy)
738 751
739static int acpi_cpufreq_resume(struct cpufreq_policy *policy) 752static int acpi_cpufreq_resume(struct cpufreq_policy *policy)
740{ 753{
741 struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu); 754 struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu);
742 755
743 dprintk("acpi_cpufreq_resume\n"); 756 dprintk("acpi_cpufreq_resume\n");
744 757
@@ -753,14 +766,15 @@ static struct freq_attr *acpi_cpufreq_attr[] = {
753}; 766};
754 767
755static struct cpufreq_driver acpi_cpufreq_driver = { 768static struct cpufreq_driver acpi_cpufreq_driver = {
756 .verify = acpi_cpufreq_verify, 769 .verify = acpi_cpufreq_verify,
757 .target = acpi_cpufreq_target, 770 .target = acpi_cpufreq_target,
758 .init = acpi_cpufreq_cpu_init, 771 .bios_limit = acpi_processor_get_bios_limit,
759 .exit = acpi_cpufreq_cpu_exit, 772 .init = acpi_cpufreq_cpu_init,
760 .resume = acpi_cpufreq_resume, 773 .exit = acpi_cpufreq_cpu_exit,
761 .name = "acpi-cpufreq", 774 .resume = acpi_cpufreq_resume,
762 .owner = THIS_MODULE, 775 .name = "acpi-cpufreq",
763 .attr = acpi_cpufreq_attr, 776 .owner = THIS_MODULE,
777 .attr = acpi_cpufreq_attr,
764}; 778};
765 779
766static int __init acpi_cpufreq_init(void) 780static int __init acpi_cpufreq_init(void)
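Two themes in this file: the per-CPU symbols gain an acfreq_ prefix (the per-CPU symbol cleanup in this merge window moves per-CPU variables toward the same namespace as ordinary globals, so short generic names like drv_data and old_perf risk collisions; the intel_cacheinfo.c hunks later in this diff do the same with an ici_ prefix), and drv_read() switches to smp_call_function_any(), which runs the callback on one online CPU from the mask and prefers the calling CPU, so no IPI is sent when the caller is already eligible. A sketch of that call pattern, using an illustrative payload type in place of the driver's struct drv_cmd:

#include <linux/kernel.h>
#include <linux/smp.h>
#include <linux/cpumask.h>

struct example_cmd {
	u64 val;
};

static void example_read(void *_cmd)
{
	struct example_cmd *cmd = _cmd;

	cmd->val = 42;			/* the real driver reads an MSR or I/O port here */
}

static void example_drv_read(struct example_cmd *cmd, const struct cpumask *mask)
{
	int err;

	cmd->val = 0;
	/* wait=1: returns only after example_read() has run on the chosen CPU */
	err = smp_call_function_any(mask, example_read, cmd, 1);
	WARN_ON_ONCE(err);
}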
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c
index ce2ed3e4aad9..7e7eea4f8261 100644
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.c
+++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c
@@ -813,7 +813,7 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
813 memcpy(eblcr, samuel2_eblcr, sizeof(samuel2_eblcr)); 813 memcpy(eblcr, samuel2_eblcr, sizeof(samuel2_eblcr));
814 break; 814 break;
815 case 1 ... 15: 815 case 1 ... 15:
816 longhaul_version = TYPE_LONGHAUL_V1; 816 longhaul_version = TYPE_LONGHAUL_V2;
817 if (c->x86_mask < 8) { 817 if (c->x86_mask < 8) {
818 cpu_model = CPU_SAMUEL2; 818 cpu_model = CPU_SAMUEL2;
819 cpuname = "C3 'Samuel 2' [C5B]"; 819 cpuname = "C3 'Samuel 2' [C5B]";
@@ -885,7 +885,7 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
885 885
886 /* Find ACPI data for processor */ 886 /* Find ACPI data for processor */
887 acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT, 887 acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT,
888 ACPI_UINT32_MAX, &longhaul_walk_callback, 888 ACPI_UINT32_MAX, &longhaul_walk_callback, NULL,
889 NULL, (void *)&pr); 889 NULL, (void *)&pr);
890 890
891 /* Check ACPI support for C3 state */ 891 /* Check ACPI support for C3 state */
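The extra NULL in the acpi_walk_namespace() call reflects an ACPICA interface change: the walk now takes both a descending (pre-order) and an ascending (post-order) callback, and longhaul only needs the former. Assuming the post-change prototype (type, start handle, max depth, pre-order callback, post-order callback, context, return value), the call shape looks like this, with a stub visitor standing in for longhaul_walk_callback():

#include <linux/acpi.h>

/* illustrative pre-order visitor; the real callback records the processor handle */
static acpi_status example_walk_cb(acpi_handle handle, u32 nesting_level,
				   void *context, void **rv)
{
	return AE_OK;
}

static void example_walk(void)
{
	acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT, ACPI_UINT32_MAX,
			    example_walk_cb,	/* descending (pre-order) callback */
			    NULL,		/* ascending (post-order) callback - unused */
			    NULL,		/* context */
			    NULL);		/* return_value */
}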
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
index f10dea409f40..cb01dac267d3 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
@@ -164,7 +164,7 @@ static int powernow_k6_cpu_init(struct cpufreq_policy *policy)
164 } 164 }
165 165
166 /* cpuinfo and default policy values */ 166 /* cpuinfo and default policy values */
167 policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL; 167 policy->cpuinfo.transition_latency = 200000;
168 policy->cur = busfreq * max_multiplier; 168 policy->cur = busfreq * max_multiplier;
169 169
170 result = cpufreq_frequency_table_cpuinfo(policy, clock_ratio); 170 result = cpufreq_frequency_table_cpuinfo(policy, clock_ratio);
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
index d47c775eb0ab..9a97116f89e5 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
@@ -714,14 +714,17 @@ static struct freq_attr *powernow_table_attr[] = {
714}; 714};
715 715
716static struct cpufreq_driver powernow_driver = { 716static struct cpufreq_driver powernow_driver = {
717 .verify = powernow_verify, 717 .verify = powernow_verify,
718 .target = powernow_target, 718 .target = powernow_target,
719 .get = powernow_get, 719 .get = powernow_get,
720 .init = powernow_cpu_init, 720#ifdef CONFIG_X86_POWERNOW_K7_ACPI
721 .exit = powernow_cpu_exit, 721 .bios_limit = acpi_processor_get_bios_limit,
722 .name = "powernow-k7", 722#endif
723 .owner = THIS_MODULE, 723 .init = powernow_cpu_init,
724 .attr = powernow_table_attr, 724 .exit = powernow_cpu_exit,
725 .name = "powernow-k7",
726 .owner = THIS_MODULE,
727 .attr = powernow_table_attr,
725}; 728};
726 729
727static int __init powernow_init(void) 730static int __init powernow_init(void)
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 6394aa5c7985..6e44519960c8 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -1022,7 +1022,7 @@ static int get_transition_latency(struct powernow_k8_data *data)
1022 * set it to 1 to avoid problems in the future. 1022 * set it to 1 to avoid problems in the future.
1023 * For all others it's a BIOS bug. 1023 * For all others it's a BIOS bug.
1024 */ 1024 */
1025 if (!boot_cpu_data.x86 == 0x11) 1025 if (boot_cpu_data.x86 != 0x11)
1026 printk(KERN_ERR FW_WARN PFX "Invalid zero transition " 1026 printk(KERN_ERR FW_WARN PFX "Invalid zero transition "
1027 "latency\n"); 1027 "latency\n");
1028 max_latency = 1; 1028 max_latency = 1;
@@ -1118,7 +1118,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data,
1118static int powernowk8_target(struct cpufreq_policy *pol, 1118static int powernowk8_target(struct cpufreq_policy *pol,
1119 unsigned targfreq, unsigned relation) 1119 unsigned targfreq, unsigned relation)
1120{ 1120{
1121 cpumask_t oldmask; 1121 cpumask_var_t oldmask;
1122 struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu); 1122 struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu);
1123 u32 checkfid; 1123 u32 checkfid;
1124 u32 checkvid; 1124 u32 checkvid;
@@ -1131,9 +1131,13 @@ static int powernowk8_target(struct cpufreq_policy *pol,
1131 checkfid = data->currfid; 1131 checkfid = data->currfid;
1132 checkvid = data->currvid; 1132 checkvid = data->currvid;
1133 1133
1134 /* only run on specific CPU from here on */ 1134 /* only run on specific CPU from here on. */
1135 oldmask = current->cpus_allowed; 1135 /* This is poor form: use a workqueue or smp_call_function_single */
1136 set_cpus_allowed_ptr(current, &cpumask_of_cpu(pol->cpu)); 1136 if (!alloc_cpumask_var(&oldmask, GFP_KERNEL))
1137 return -ENOMEM;
1138
1139 cpumask_copy(oldmask, tsk_cpus_allowed(current));
1140 set_cpus_allowed_ptr(current, cpumask_of(pol->cpu));
1137 1141
1138 if (smp_processor_id() != pol->cpu) { 1142 if (smp_processor_id() != pol->cpu) {
1139 printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu); 1143 printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu);
@@ -1193,7 +1197,8 @@ static int powernowk8_target(struct cpufreq_policy *pol,
1193 ret = 0; 1197 ret = 0;
1194 1198
1195err_out: 1199err_out:
1196 set_cpus_allowed_ptr(current, &oldmask); 1200 set_cpus_allowed_ptr(current, oldmask);
1201 free_cpumask_var(oldmask);
1197 return ret; 1202 return ret;
1198} 1203}
1199 1204
@@ -1351,6 +1356,7 @@ static int __devexit powernowk8_cpu_exit(struct cpufreq_policy *pol)
1351 1356
1352 kfree(data->powernow_table); 1357 kfree(data->powernow_table);
1353 kfree(data); 1358 kfree(data);
1359 per_cpu(powernow_data, pol->cpu) = NULL;
1354 1360
1355 return 0; 1361 return 0;
1356} 1362}
@@ -1370,7 +1376,7 @@ static unsigned int powernowk8_get(unsigned int cpu)
1370 int err; 1376 int err;
1371 1377
1372 if (!data) 1378 if (!data)
1373 return -EINVAL; 1379 return 0;
1374 1380
1375 smp_call_function_single(cpu, query_values_on_cpu, &err, true); 1381 smp_call_function_single(cpu, query_values_on_cpu, &err, true);
1376 if (err) 1382 if (err)
@@ -1393,14 +1399,15 @@ static struct freq_attr *powernow_k8_attr[] = {
1393}; 1399};
1394 1400
1395static struct cpufreq_driver cpufreq_amd64_driver = { 1401static struct cpufreq_driver cpufreq_amd64_driver = {
1396 .verify = powernowk8_verify, 1402 .verify = powernowk8_verify,
1397 .target = powernowk8_target, 1403 .target = powernowk8_target,
1398 .init = powernowk8_cpu_init, 1404 .bios_limit = acpi_processor_get_bios_limit,
1399 .exit = __devexit_p(powernowk8_cpu_exit), 1405 .init = powernowk8_cpu_init,
1400 .get = powernowk8_get, 1406 .exit = __devexit_p(powernowk8_cpu_exit),
1401 .name = "powernow-k8", 1407 .get = powernowk8_get,
1402 .owner = THIS_MODULE, 1408 .name = "powernow-k8",
1403 .attr = powernow_k8_attr, 1409 .owner = THIS_MODULE,
1410 .attr = powernow_k8_attr,
1404}; 1411};
1405 1412
1406/* driver entry point for init */ 1413/* driver entry point for init */
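powernowk8_target() previously kept a full cpumask_t on the kernel stack; with large NR_CPUS and CONFIG_CPUMASK_OFFSTACK that is too big, so the patch switches to cpumask_var_t, which has to be allocated and freed explicitly. A sketch of the save/migrate/restore pattern used above (the comment added by the patch itself notes this is poor form compared to a workqueue or smp_call_function_single()):

#include <linux/cpumask.h>
#include <linux/sched.h>
#include <linux/gfp.h>

static int run_on_cpu_sketch(unsigned int cpu)
{
	cpumask_var_t oldmask;

	/* a real allocation only with CONFIG_CPUMASK_OFFSTACK; otherwise on-stack */
	if (!alloc_cpumask_var(&oldmask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_copy(oldmask, tsk_cpus_allowed(current));
	set_cpus_allowed_ptr(current, cpumask_of(cpu));

	/* ... work that must execute on 'cpu' (fid/vid MSR writes in powernow-k8) ... */

	set_cpus_allowed_ptr(current, oldmask);
	free_cpumask_var(oldmask);
	return 0;
}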
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
index 6911e91fb4f6..2ce8e0b5cc54 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
@@ -39,7 +39,7 @@ static struct pci_dev *speedstep_chipset_dev;
39 39
40/* speedstep_processor 40/* speedstep_processor
41 */ 41 */
42static unsigned int speedstep_processor; 42static enum speedstep_processor speedstep_processor;
43 43
44static u32 pmbase; 44static u32 pmbase;
45 45
@@ -232,28 +232,23 @@ static unsigned int speedstep_detect_chipset(void)
232 return 0; 232 return 0;
233} 233}
234 234
235struct get_freq_data { 235static void get_freq_data(void *_speed)
236 unsigned int speed;
237 unsigned int processor;
238};
239
240static void get_freq_data(void *_data)
241{ 236{
242 struct get_freq_data *data = _data; 237 unsigned int *speed = _speed;
243 238
244 data->speed = speedstep_get_frequency(data->processor); 239 *speed = speedstep_get_frequency(speedstep_processor);
245} 240}
246 241
247static unsigned int speedstep_get(unsigned int cpu) 242static unsigned int speedstep_get(unsigned int cpu)
248{ 243{
249 struct get_freq_data data = { .processor = cpu }; 244 unsigned int speed;
250 245
251 /* You're supposed to ensure CPU is online. */ 246 /* You're supposed to ensure CPU is online. */
252 if (smp_call_function_single(cpu, get_freq_data, &data, 1) != 0) 247 if (smp_call_function_single(cpu, get_freq_data, &speed, 1) != 0)
253 BUG(); 248 BUG();
254 249
255 dprintk("detected %u kHz as current frequency\n", data.speed); 250 dprintk("detected %u kHz as current frequency\n", speed);
256 return data.speed; 251 return speed;
257} 252}
258 253
259/** 254/**
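Because speedstep_processor is already a file-scope static, there is no need to ship it across CPUs; the helper now just fills in a result through the info pointer. The general shape of that smp_call_function_single() pattern, with illustrative names:

#include <linux/smp.h>

static void read_speed_on_cpu(void *_speed)
{
	unsigned int *speed = _speed;

	*speed = 0;	/* the driver calls speedstep_get_frequency() here */
}

static unsigned int example_get(unsigned int cpu)
{
	unsigned int speed = 0;

	/* wait=1: 'speed' is valid once the call returns; 'cpu' must be online */
	if (smp_call_function_single(cpu, read_speed_on_cpu, &speed, 1))
		return 0;
	return speed;
}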
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
index f4c290b8482f..ad0083abfa23 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
@@ -34,7 +34,7 @@ static int relaxed_check;
34 * GET PROCESSOR CORE SPEED IN KHZ * 34 * GET PROCESSOR CORE SPEED IN KHZ *
35 *********************************************************************/ 35 *********************************************************************/
36 36
37static unsigned int pentium3_get_frequency(unsigned int processor) 37static unsigned int pentium3_get_frequency(enum speedstep_processor processor)
38{ 38{
39 /* See table 14 of p3_ds.pdf and table 22 of 29834003.pdf */ 39 /* See table 14 of p3_ds.pdf and table 22 of 29834003.pdf */
40 struct { 40 struct {
@@ -227,7 +227,7 @@ static unsigned int pentium4_get_frequency(void)
227 227
228 228
229/* Warning: may get called from smp_call_function_single. */ 229/* Warning: may get called from smp_call_function_single. */
230unsigned int speedstep_get_frequency(unsigned int processor) 230unsigned int speedstep_get_frequency(enum speedstep_processor processor)
231{ 231{
232 switch (processor) { 232 switch (processor) {
233 case SPEEDSTEP_CPU_PCORE: 233 case SPEEDSTEP_CPU_PCORE:
@@ -380,7 +380,7 @@ EXPORT_SYMBOL_GPL(speedstep_detect_processor);
380 * DETECT SPEEDSTEP SPEEDS * 380 * DETECT SPEEDSTEP SPEEDS *
381 *********************************************************************/ 381 *********************************************************************/
382 382
383unsigned int speedstep_get_freqs(unsigned int processor, 383unsigned int speedstep_get_freqs(enum speedstep_processor processor,
384 unsigned int *low_speed, 384 unsigned int *low_speed,
385 unsigned int *high_speed, 385 unsigned int *high_speed,
386 unsigned int *transition_latency, 386 unsigned int *transition_latency,
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h
index 2b6c04e5a304..70d9cea1219d 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h
@@ -11,18 +11,18 @@
11 11
12 12
13/* processors */ 13/* processors */
14 14enum speedstep_processor {
15#define SPEEDSTEP_CPU_PIII_C_EARLY 0x00000001 /* Coppermine core */ 15 SPEEDSTEP_CPU_PIII_C_EARLY = 0x00000001, /* Coppermine core */
16#define SPEEDSTEP_CPU_PIII_C 0x00000002 /* Coppermine core */ 16 SPEEDSTEP_CPU_PIII_C = 0x00000002, /* Coppermine core */
17#define SPEEDSTEP_CPU_PIII_T 0x00000003 /* Tualatin core */ 17 SPEEDSTEP_CPU_PIII_T = 0x00000003, /* Tualatin core */
18#define SPEEDSTEP_CPU_P4M 0x00000004 /* P4-M */ 18 SPEEDSTEP_CPU_P4M = 0x00000004, /* P4-M */
19
20/* the following processors are not speedstep-capable and are not auto-detected 19/* the following processors are not speedstep-capable and are not auto-detected
21 * in speedstep_detect_processor(). However, their speed can be detected using 20 * in speedstep_detect_processor(). However, their speed can be detected using
22 * the speedstep_get_frequency() call. */ 21 * the speedstep_get_frequency() call. */
23#define SPEEDSTEP_CPU_PM 0xFFFFFF03 /* Pentium M */ 22 SPEEDSTEP_CPU_PM = 0xFFFFFF03, /* Pentium M */
24#define SPEEDSTEP_CPU_P4D 0xFFFFFF04 /* desktop P4 */ 23 SPEEDSTEP_CPU_P4D = 0xFFFFFF04, /* desktop P4 */
25#define SPEEDSTEP_CPU_PCORE 0xFFFFFF05 /* Core */ 24 SPEEDSTEP_CPU_PCORE = 0xFFFFFF05, /* Core */
25};
26 26
27/* speedstep states -- only two of them */ 27/* speedstep states -- only two of them */
28 28
@@ -31,10 +31,10 @@
31 31
32 32
33/* detect a speedstep-capable processor */ 33/* detect a speedstep-capable processor */
34extern unsigned int speedstep_detect_processor (void); 34extern enum speedstep_processor speedstep_detect_processor(void);
35 35
36/* detect the current speed (in khz) of the processor */ 36/* detect the current speed (in khz) of the processor */
37extern unsigned int speedstep_get_frequency(unsigned int processor); 37extern unsigned int speedstep_get_frequency(enum speedstep_processor processor);
38 38
39 39
40/* detect the low and high speeds of the processor. The callback 40/* detect the low and high speeds of the processor. The callback
@@ -42,7 +42,7 @@ extern unsigned int speedstep_get_frequency(unsigned int processor);
42 * SPEEDSTEP_LOW; the second argument is zero so that no 42 * SPEEDSTEP_LOW; the second argument is zero so that no
43 * cpufreq_notify_transition calls are initiated. 43 * cpufreq_notify_transition calls are initiated.
44 */ 44 */
45extern unsigned int speedstep_get_freqs(unsigned int processor, 45extern unsigned int speedstep_get_freqs(enum speedstep_processor processor,
46 unsigned int *low_speed, 46 unsigned int *low_speed,
47 unsigned int *high_speed, 47 unsigned int *high_speed,
48 unsigned int *transition_latency, 48 unsigned int *transition_latency,
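Turning the SPEEDSTEP_CPU_* #defines into enum speedstep_processor gives the processor IDs a real type: prototypes document what they expect, and a switch over an enum-typed value lets gcc's -Wswitch (part of -Wall) flag unhandled enumerators when there is no default case. Enum constants are still ordinary integer constants, so the 0xFFFFFFxx pseudo-IDs keep their numeric values. A small, purely illustrative example of the switch-coverage benefit:

enum example_cpu {
	EXAMPLE_CPU_A = 1,
	EXAMPLE_CPU_B = 2,
};

static unsigned int example_get_frequency(enum example_cpu cpu)
{
	switch (cpu) {
	case EXAMPLE_CPU_A:
		return 100000;
	/* with -Wswitch the compiler warns: EXAMPLE_CPU_B not handled */
	}
	return 0;
}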
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
index befea088e4f5..04d73c114e49 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
@@ -35,7 +35,7 @@ static int smi_cmd;
35static unsigned int smi_sig; 35static unsigned int smi_sig;
36 36
37/* info about the processor */ 37/* info about the processor */
38static unsigned int speedstep_processor; 38static enum speedstep_processor speedstep_processor;
39 39
40/* 40/*
41 * There are only two frequency states for each processor. Values 41 * There are only two frequency states for each processor. Values
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index 19807b89f058..4fbd384fb645 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -373,7 +373,7 @@ static void __cpuinit init_nsc(struct cpuinfo_x86 *c)
373 /* Handle the GX (Formally known as the GX2) */ 373 /* Handle the GX (Formally known as the GX2) */
374 374
375 if (c->x86 == 5 && c->x86_model == 5) 375 if (c->x86 == 5 && c->x86_model == 5)
376 display_cacheinfo(c); 376 cpu_detect_cache_sizes(c);
377 else 377 else
378 init_cyrix(c); 378 init_cyrix(c);
379} 379}
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index c900b73f9224..879666f4d871 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -70,7 +70,6 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
70 if (c->x86_power & (1 << 8)) { 70 if (c->x86_power & (1 << 8)) {
71 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); 71 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
72 set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); 72 set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
73 set_cpu_cap(c, X86_FEATURE_TSC_RELIABLE);
74 sched_clock_stable = 1; 73 sched_clock_stable = 1;
75 } 74 }
76 75
@@ -270,8 +269,6 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
270 node = cpu_to_node(cpu); 269 node = cpu_to_node(cpu);
271 } 270 }
272 numa_set_node(cpu, node); 271 numa_set_node(cpu, node);
273
274 printk(KERN_INFO "CPU %d/0x%x -> Node %d\n", cpu, apicid, node);
275#endif 272#endif
276} 273}
277 274
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 804c40e2bc3e..fc6c8ef92dcc 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -94,7 +94,7 @@ static const struct _cache_table __cpuinitconst cache_table[] =
94 { 0xd1, LVL_3, 1024 }, /* 4-way set assoc, 64 byte line size */ 94 { 0xd1, LVL_3, 1024 }, /* 4-way set assoc, 64 byte line size */
95 { 0xd2, LVL_3, 2048 }, /* 4-way set assoc, 64 byte line size */ 95 { 0xd2, LVL_3, 2048 }, /* 4-way set assoc, 64 byte line size */
96 { 0xd6, LVL_3, 1024 }, /* 8-way set assoc, 64 byte line size */ 96 { 0xd6, LVL_3, 1024 }, /* 8-way set assoc, 64 byte line size */
97 { 0xd7, LVL_3, 2038 }, /* 8-way set assoc, 64 byte line size */ 97 { 0xd7, LVL_3, 2048 }, /* 8-way set assoc, 64 byte line size */
98 { 0xd8, LVL_3, 4096 }, /* 12-way set assoc, 64 byte line size */ 98 { 0xd8, LVL_3, 4096 }, /* 12-way set assoc, 64 byte line size */
99 { 0xdc, LVL_3, 2048 }, /* 12-way set assoc, 64 byte line size */ 99 { 0xdc, LVL_3, 2048 }, /* 12-way set assoc, 64 byte line size */
100 { 0xdd, LVL_3, 4096 }, /* 12-way set assoc, 64 byte line size */ 100 { 0xdd, LVL_3, 4096 }, /* 12-way set assoc, 64 byte line size */
@@ -102,6 +102,9 @@ static const struct _cache_table __cpuinitconst cache_table[] =
102 { 0xe2, LVL_3, 2048 }, /* 16-way set assoc, 64 byte line size */ 102 { 0xe2, LVL_3, 2048 }, /* 16-way set assoc, 64 byte line size */
103 { 0xe3, LVL_3, 4096 }, /* 16-way set assoc, 64 byte line size */ 103 { 0xe3, LVL_3, 4096 }, /* 16-way set assoc, 64 byte line size */
104 { 0xe4, LVL_3, 8192 }, /* 16-way set assoc, 64 byte line size */ 104 { 0xe4, LVL_3, 8192 }, /* 16-way set assoc, 64 byte line size */
105 { 0xea, LVL_3, 12288 }, /* 24-way set assoc, 64 byte line size */
106 { 0xeb, LVL_3, 18432 }, /* 24-way set assoc, 64 byte line size */
107 { 0xec, LVL_3, 24576 }, /* 24-way set assoc, 64 byte line size */
105 { 0x00, 0, 0} 108 { 0x00, 0, 0}
106}; 109};
107 110
@@ -488,22 +491,6 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
488#endif 491#endif
489 } 492 }
490 493
491 if (trace)
492 printk(KERN_INFO "CPU: Trace cache: %dK uops", trace);
493 else if (l1i)
494 printk(KERN_INFO "CPU: L1 I cache: %dK", l1i);
495
496 if (l1d)
497 printk(KERN_CONT ", L1 D cache: %dK\n", l1d);
498 else
499 printk(KERN_CONT "\n");
500
501 if (l2)
502 printk(KERN_INFO "CPU: L2 cache: %dK\n", l2);
503
504 if (l3)
505 printk(KERN_INFO "CPU: L3 cache: %dK\n", l3);
506
507 c->x86_cache_size = l3 ? l3 : (l2 ? l2 : (l1i+l1d)); 494 c->x86_cache_size = l3 ? l3 : (l2 ? l2 : (l1i+l1d));
508 495
509 return l2; 496 return l2;
@@ -512,26 +499,27 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
512#ifdef CONFIG_SYSFS 499#ifdef CONFIG_SYSFS
513 500
514/* pointer to _cpuid4_info array (for each cache leaf) */ 501/* pointer to _cpuid4_info array (for each cache leaf) */
515static DEFINE_PER_CPU(struct _cpuid4_info *, cpuid4_info); 502static DEFINE_PER_CPU(struct _cpuid4_info *, ici_cpuid4_info);
516#define CPUID4_INFO_IDX(x, y) (&((per_cpu(cpuid4_info, x))[y])) 503#define CPUID4_INFO_IDX(x, y) (&((per_cpu(ici_cpuid4_info, x))[y]))
517 504
518#ifdef CONFIG_SMP 505#ifdef CONFIG_SMP
519static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) 506static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
520{ 507{
521 struct _cpuid4_info *this_leaf, *sibling_leaf; 508 struct _cpuid4_info *this_leaf, *sibling_leaf;
522 unsigned long num_threads_sharing; 509 unsigned long num_threads_sharing;
523 int index_msb, i; 510 int index_msb, i, sibling;
524 struct cpuinfo_x86 *c = &cpu_data(cpu); 511 struct cpuinfo_x86 *c = &cpu_data(cpu);
525 512
526 if ((index == 3) && (c->x86_vendor == X86_VENDOR_AMD)) { 513 if ((index == 3) && (c->x86_vendor == X86_VENDOR_AMD)) {
527 struct cpuinfo_x86 *d; 514 for_each_cpu(i, c->llc_shared_map) {
528 for_each_online_cpu(i) { 515 if (!per_cpu(ici_cpuid4_info, i))
529 if (!per_cpu(cpuid4_info, i))
530 continue; 516 continue;
531 d = &cpu_data(i);
532 this_leaf = CPUID4_INFO_IDX(i, index); 517 this_leaf = CPUID4_INFO_IDX(i, index);
533 cpumask_copy(to_cpumask(this_leaf->shared_cpu_map), 518 for_each_cpu(sibling, c->llc_shared_map) {
534 d->llc_shared_map); 519 if (!cpu_online(sibling))
520 continue;
521 set_bit(sibling, this_leaf->shared_cpu_map);
522 }
535 } 523 }
536 return; 524 return;
537 } 525 }
@@ -548,7 +536,7 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
548 c->apicid >> index_msb) { 536 c->apicid >> index_msb) {
549 cpumask_set_cpu(i, 537 cpumask_set_cpu(i,
550 to_cpumask(this_leaf->shared_cpu_map)); 538 to_cpumask(this_leaf->shared_cpu_map));
551 if (i != cpu && per_cpu(cpuid4_info, i)) { 539 if (i != cpu && per_cpu(ici_cpuid4_info, i)) {
552 sibling_leaf = 540 sibling_leaf =
553 CPUID4_INFO_IDX(i, index); 541 CPUID4_INFO_IDX(i, index);
554 cpumask_set_cpu(cpu, to_cpumask( 542 cpumask_set_cpu(cpu, to_cpumask(
@@ -587,8 +575,8 @@ static void __cpuinit free_cache_attributes(unsigned int cpu)
587 for (i = 0; i < num_cache_leaves; i++) 575 for (i = 0; i < num_cache_leaves; i++)
588 cache_remove_shared_cpu_map(cpu, i); 576 cache_remove_shared_cpu_map(cpu, i);
589 577
590 kfree(per_cpu(cpuid4_info, cpu)); 578 kfree(per_cpu(ici_cpuid4_info, cpu));
591 per_cpu(cpuid4_info, cpu) = NULL; 579 per_cpu(ici_cpuid4_info, cpu) = NULL;
592} 580}
593 581
594static int 582static int
@@ -627,15 +615,15 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu)
627 if (num_cache_leaves == 0) 615 if (num_cache_leaves == 0)
628 return -ENOENT; 616 return -ENOENT;
629 617
630 per_cpu(cpuid4_info, cpu) = kzalloc( 618 per_cpu(ici_cpuid4_info, cpu) = kzalloc(
631 sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL); 619 sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL);
632 if (per_cpu(cpuid4_info, cpu) == NULL) 620 if (per_cpu(ici_cpuid4_info, cpu) == NULL)
633 return -ENOMEM; 621 return -ENOMEM;
634 622
635 smp_call_function_single(cpu, get_cpu_leaves, &retval, true); 623 smp_call_function_single(cpu, get_cpu_leaves, &retval, true);
636 if (retval) { 624 if (retval) {
637 kfree(per_cpu(cpuid4_info, cpu)); 625 kfree(per_cpu(ici_cpuid4_info, cpu));
638 per_cpu(cpuid4_info, cpu) = NULL; 626 per_cpu(ici_cpuid4_info, cpu) = NULL;
639 } 627 }
640 628
641 return retval; 629 return retval;
@@ -647,7 +635,7 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu)
647extern struct sysdev_class cpu_sysdev_class; /* from drivers/base/cpu.c */ 635extern struct sysdev_class cpu_sysdev_class; /* from drivers/base/cpu.c */
648 636
649/* pointer to kobject for cpuX/cache */ 637/* pointer to kobject for cpuX/cache */
650static DEFINE_PER_CPU(struct kobject *, cache_kobject); 638static DEFINE_PER_CPU(struct kobject *, ici_cache_kobject);
651 639
652struct _index_kobject { 640struct _index_kobject {
653 struct kobject kobj; 641 struct kobject kobj;
@@ -656,8 +644,8 @@ struct _index_kobject {
656}; 644};
657 645
658/* pointer to array of kobjects for cpuX/cache/indexY */ 646/* pointer to array of kobjects for cpuX/cache/indexY */
659static DEFINE_PER_CPU(struct _index_kobject *, index_kobject); 647static DEFINE_PER_CPU(struct _index_kobject *, ici_index_kobject);
660#define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(index_kobject, x))[y])) 648#define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(ici_index_kobject, x))[y]))
661 649
662#define show_one_plus(file_name, object, val) \ 650#define show_one_plus(file_name, object, val) \
663static ssize_t show_##file_name \ 651static ssize_t show_##file_name \
@@ -876,10 +864,10 @@ static struct kobj_type ktype_percpu_entry = {
876 864
877static void __cpuinit cpuid4_cache_sysfs_exit(unsigned int cpu) 865static void __cpuinit cpuid4_cache_sysfs_exit(unsigned int cpu)
878{ 866{
879 kfree(per_cpu(cache_kobject, cpu)); 867 kfree(per_cpu(ici_cache_kobject, cpu));
880 kfree(per_cpu(index_kobject, cpu)); 868 kfree(per_cpu(ici_index_kobject, cpu));
881 per_cpu(cache_kobject, cpu) = NULL; 869 per_cpu(ici_cache_kobject, cpu) = NULL;
882 per_cpu(index_kobject, cpu) = NULL; 870 per_cpu(ici_index_kobject, cpu) = NULL;
883 free_cache_attributes(cpu); 871 free_cache_attributes(cpu);
884} 872}
885 873
@@ -895,14 +883,14 @@ static int __cpuinit cpuid4_cache_sysfs_init(unsigned int cpu)
895 return err; 883 return err;
896 884
897 /* Allocate all required memory */ 885 /* Allocate all required memory */
898 per_cpu(cache_kobject, cpu) = 886 per_cpu(ici_cache_kobject, cpu) =
899 kzalloc(sizeof(struct kobject), GFP_KERNEL); 887 kzalloc(sizeof(struct kobject), GFP_KERNEL);
900 if (unlikely(per_cpu(cache_kobject, cpu) == NULL)) 888 if (unlikely(per_cpu(ici_cache_kobject, cpu) == NULL))
901 goto err_out; 889 goto err_out;
902 890
903 per_cpu(index_kobject, cpu) = kzalloc( 891 per_cpu(ici_index_kobject, cpu) = kzalloc(
904 sizeof(struct _index_kobject) * num_cache_leaves, GFP_KERNEL); 892 sizeof(struct _index_kobject) * num_cache_leaves, GFP_KERNEL);
905 if (unlikely(per_cpu(index_kobject, cpu) == NULL)) 893 if (unlikely(per_cpu(ici_index_kobject, cpu) == NULL))
906 goto err_out; 894 goto err_out;
907 895
908 return 0; 896 return 0;
@@ -926,7 +914,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
926 if (unlikely(retval < 0)) 914 if (unlikely(retval < 0))
927 return retval; 915 return retval;
928 916
929 retval = kobject_init_and_add(per_cpu(cache_kobject, cpu), 917 retval = kobject_init_and_add(per_cpu(ici_cache_kobject, cpu),
930 &ktype_percpu_entry, 918 &ktype_percpu_entry,
931 &sys_dev->kobj, "%s", "cache"); 919 &sys_dev->kobj, "%s", "cache");
932 if (retval < 0) { 920 if (retval < 0) {
@@ -940,12 +928,12 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
940 this_object->index = i; 928 this_object->index = i;
941 retval = kobject_init_and_add(&(this_object->kobj), 929 retval = kobject_init_and_add(&(this_object->kobj),
942 &ktype_cache, 930 &ktype_cache,
943 per_cpu(cache_kobject, cpu), 931 per_cpu(ici_cache_kobject, cpu),
944 "index%1lu", i); 932 "index%1lu", i);
945 if (unlikely(retval)) { 933 if (unlikely(retval)) {
946 for (j = 0; j < i; j++) 934 for (j = 0; j < i; j++)
947 kobject_put(&(INDEX_KOBJECT_PTR(cpu, j)->kobj)); 935 kobject_put(&(INDEX_KOBJECT_PTR(cpu, j)->kobj));
948 kobject_put(per_cpu(cache_kobject, cpu)); 936 kobject_put(per_cpu(ici_cache_kobject, cpu));
949 cpuid4_cache_sysfs_exit(cpu); 937 cpuid4_cache_sysfs_exit(cpu);
950 return retval; 938 return retval;
951 } 939 }
@@ -953,7 +941,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
953 } 941 }
954 cpumask_set_cpu(cpu, to_cpumask(cache_dev_map)); 942 cpumask_set_cpu(cpu, to_cpumask(cache_dev_map));
955 943
956 kobject_uevent(per_cpu(cache_kobject, cpu), KOBJ_ADD); 944 kobject_uevent(per_cpu(ici_cache_kobject, cpu), KOBJ_ADD);
957 return 0; 945 return 0;
958} 946}
959 947
@@ -962,7 +950,7 @@ static void __cpuinit cache_remove_dev(struct sys_device * sys_dev)
962 unsigned int cpu = sys_dev->id; 950 unsigned int cpu = sys_dev->id;
963 unsigned long i; 951 unsigned long i;
964 952
965 if (per_cpu(cpuid4_info, cpu) == NULL) 953 if (per_cpu(ici_cpuid4_info, cpu) == NULL)
966 return; 954 return;
967 if (!cpumask_test_cpu(cpu, to_cpumask(cache_dev_map))) 955 if (!cpumask_test_cpu(cpu, to_cpumask(cache_dev_map)))
968 return; 956 return;
@@ -970,7 +958,7 @@ static void __cpuinit cache_remove_dev(struct sys_device * sys_dev)
970 958
971 for (i = 0; i < num_cache_leaves; i++) 959 for (i = 0; i < num_cache_leaves; i++)
972 kobject_put(&(INDEX_KOBJECT_PTR(cpu, i)->kobj)); 960 kobject_put(&(INDEX_KOBJECT_PTR(cpu, i)->kobj));
973 kobject_put(per_cpu(cache_kobject, cpu)); 961 kobject_put(per_cpu(ici_cache_kobject, cpu));
974 cpuid4_cache_sysfs_exit(cpu); 962 cpuid4_cache_sysfs_exit(cpu);
975} 963}
976 964
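The ici_ prefix on cpuid4_info, cache_kobject and index_kobject mirrors the acfreq_ rename earlier in this diff: per-CPU variables are being moved toward the same symbol namespace as ordinary globals, so short generic names need a per-subsystem prefix to avoid clashes. The access pattern itself is unchanged; for reference, the basic per-CPU variable API this code relies on, shown with an illustrative name:

#include <linux/percpu.h>
#include <linux/kobject.h>
#include <linux/slab.h>

static DEFINE_PER_CPU(struct kobject *, ici_example_kobj);

static int example_alloc(unsigned int cpu)
{
	per_cpu(ici_example_kobj, cpu) = kzalloc(sizeof(struct kobject), GFP_KERNEL);
	if (!per_cpu(ici_example_kobj, cpu))
		return -ENOMEM;
	return 0;
}

static void example_free(unsigned int cpu)
{
	kfree(per_cpu(ici_example_kobj, cpu));
	per_cpu(ici_example_kobj, cpu) = NULL;
}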
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
index 472763d92098..73734baa50f2 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-inject.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -74,7 +74,7 @@ static void raise_exception(struct mce *m, struct pt_regs *pregs)
74 m->finished = 0; 74 m->finished = 0;
75} 75}
76 76
77static cpumask_t mce_inject_cpumask; 77static cpumask_var_t mce_inject_cpumask;
78 78
79static int mce_raise_notify(struct notifier_block *self, 79static int mce_raise_notify(struct notifier_block *self,
80 unsigned long val, void *data) 80 unsigned long val, void *data)
@@ -82,9 +82,9 @@ static int mce_raise_notify(struct notifier_block *self,
82 struct die_args *args = (struct die_args *)data; 82 struct die_args *args = (struct die_args *)data;
83 int cpu = smp_processor_id(); 83 int cpu = smp_processor_id();
84 struct mce *m = &__get_cpu_var(injectm); 84 struct mce *m = &__get_cpu_var(injectm);
85 if (val != DIE_NMI_IPI || !cpu_isset(cpu, mce_inject_cpumask)) 85 if (val != DIE_NMI_IPI || !cpumask_test_cpu(cpu, mce_inject_cpumask))
86 return NOTIFY_DONE; 86 return NOTIFY_DONE;
87 cpu_clear(cpu, mce_inject_cpumask); 87 cpumask_clear_cpu(cpu, mce_inject_cpumask);
88 if (m->inject_flags & MCJ_EXCEPTION) 88 if (m->inject_flags & MCJ_EXCEPTION)
89 raise_exception(m, args->regs); 89 raise_exception(m, args->regs);
90 else if (m->status) 90 else if (m->status)
@@ -148,22 +148,22 @@ static void raise_mce(struct mce *m)
148 unsigned long start; 148 unsigned long start;
149 int cpu; 149 int cpu;
150 get_online_cpus(); 150 get_online_cpus();
151 mce_inject_cpumask = cpu_online_map; 151 cpumask_copy(mce_inject_cpumask, cpu_online_mask);
152 cpu_clear(get_cpu(), mce_inject_cpumask); 152 cpumask_clear_cpu(get_cpu(), mce_inject_cpumask);
153 for_each_online_cpu(cpu) { 153 for_each_online_cpu(cpu) {
154 struct mce *mcpu = &per_cpu(injectm, cpu); 154 struct mce *mcpu = &per_cpu(injectm, cpu);
155 if (!mcpu->finished || 155 if (!mcpu->finished ||
156 MCJ_CTX(mcpu->inject_flags) != MCJ_CTX_RANDOM) 156 MCJ_CTX(mcpu->inject_flags) != MCJ_CTX_RANDOM)
157 cpu_clear(cpu, mce_inject_cpumask); 157 cpumask_clear_cpu(cpu, mce_inject_cpumask);
158 } 158 }
159 if (!cpus_empty(mce_inject_cpumask)) 159 if (!cpumask_empty(mce_inject_cpumask))
160 apic->send_IPI_mask(&mce_inject_cpumask, NMI_VECTOR); 160 apic->send_IPI_mask(mce_inject_cpumask, NMI_VECTOR);
161 start = jiffies; 161 start = jiffies;
162 while (!cpus_empty(mce_inject_cpumask)) { 162 while (!cpumask_empty(mce_inject_cpumask)) {
163 if (!time_before(jiffies, start + 2*HZ)) { 163 if (!time_before(jiffies, start + 2*HZ)) {
164 printk(KERN_ERR 164 printk(KERN_ERR
165 "Timeout waiting for mce inject NMI %lx\n", 165 "Timeout waiting for mce inject NMI %lx\n",
166 *cpus_addr(mce_inject_cpumask)); 166 *cpumask_bits(mce_inject_cpumask));
167 break; 167 break;
168 } 168 }
169 cpu_relax(); 169 cpu_relax();
@@ -210,6 +210,8 @@ static ssize_t mce_write(struct file *filp, const char __user *ubuf,
210 210
211static int inject_init(void) 211static int inject_init(void)
212{ 212{
213 if (!alloc_cpumask_var(&mce_inject_cpumask, GFP_KERNEL))
214 return -ENOMEM;
213 printk(KERN_INFO "Machine check injector initialized\n"); 215 printk(KERN_INFO "Machine check injector initialized\n");
214 mce_chrdev_ops.write = mce_write; 216 mce_chrdev_ops.write = mce_write;
215 register_die_notifier(&mce_raise_nb); 217 register_die_notifier(&mce_raise_nb);
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index b1598a9436d0..a8aacd4b513c 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -46,6 +46,9 @@
46 46
47#include "mce-internal.h" 47#include "mce-internal.h"
48 48
49#define CREATE_TRACE_POINTS
50#include <trace/events/mce.h>
51
49int mce_disabled __read_mostly; 52int mce_disabled __read_mostly;
50 53
51#define MISC_MCELOG_MINOR 227 54#define MISC_MCELOG_MINOR 227
@@ -85,18 +88,26 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
85static DEFINE_PER_CPU(struct mce, mces_seen); 88static DEFINE_PER_CPU(struct mce, mces_seen);
86static int cpu_missing; 89static int cpu_missing;
87 90
88static void default_decode_mce(struct mce *m) 91/*
92 * CPU/chipset specific EDAC code can register a notifier call here to print
93 * MCE errors in a human-readable form.
94 */
95ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
96EXPORT_SYMBOL_GPL(x86_mce_decoder_chain);
97
98static int default_decode_mce(struct notifier_block *nb, unsigned long val,
99 void *data)
89{ 100{
90 pr_emerg("No human readable MCE decoding support on this CPU type.\n"); 101 pr_emerg("No human readable MCE decoding support on this CPU type.\n");
91 pr_emerg("Run the message through 'mcelog --ascii' to decode.\n"); 102 pr_emerg("Run the message through 'mcelog --ascii' to decode.\n");
103
104 return NOTIFY_STOP;
92} 105}
93 106
94/* 107static struct notifier_block mce_dec_nb = {
95 * CPU/chipset specific EDAC code can register a callback here to print 108 .notifier_call = default_decode_mce,
96 * MCE errors in a human-readable form: 109 .priority = -1,
97 */ 110};
98void (*x86_mce_decode_callback)(struct mce *m) = default_decode_mce;
99EXPORT_SYMBOL(x86_mce_decode_callback);
100 111
101/* MCA banks polled by the period polling timer for corrected events */ 112/* MCA banks polled by the period polling timer for corrected events */
102DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { 113DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
@@ -141,6 +152,9 @@ void mce_log(struct mce *mce)
141{ 152{
142 unsigned next, entry; 153 unsigned next, entry;
143 154
155 /* Emit the trace record: */
156 trace_mce_record(mce);
157
144 mce->finished = 0; 158 mce->finished = 0;
145 wmb(); 159 wmb();
146 for (;;) { 160 for (;;) {
@@ -204,9 +218,9 @@ static void print_mce(struct mce *m)
204 218
205 /* 219 /*
206 * Print out human-readable details about the MCE error, 220 * Print out human-readable details about the MCE error,
207 * (if the CPU has an implementation for that): 221 * (if the CPU has an implementation for that)
208 */ 222 */
209 x86_mce_decode_callback(m); 223 atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
210} 224}
211 225
212static void print_mce_head(void) 226static void print_mce_head(void)
@@ -1122,7 +1136,7 @@ static int check_interval = 5 * 60; /* 5 minutes */
1122static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */ 1136static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */
1123static DEFINE_PER_CPU(struct timer_list, mce_timer); 1137static DEFINE_PER_CPU(struct timer_list, mce_timer);
1124 1138
1125static void mcheck_timer(unsigned long data) 1139static void mce_start_timer(unsigned long data)
1126{ 1140{
1127 struct timer_list *t = &per_cpu(mce_timer, data); 1141 struct timer_list *t = &per_cpu(mce_timer, data);
1128 int *n; 1142 int *n;
@@ -1187,7 +1201,7 @@ int mce_notify_irq(void)
1187} 1201}
1188EXPORT_SYMBOL_GPL(mce_notify_irq); 1202EXPORT_SYMBOL_GPL(mce_notify_irq);
1189 1203
1190static int mce_banks_init(void) 1204static int __cpuinit __mcheck_cpu_mce_banks_init(void)
1191{ 1205{
1192 int i; 1206 int i;
1193 1207
@@ -1206,7 +1220,7 @@ static int mce_banks_init(void)
1206/* 1220/*
1207 * Initialize Machine Checks for a CPU. 1221 * Initialize Machine Checks for a CPU.
1208 */ 1222 */
1209static int __cpuinit mce_cap_init(void) 1223static int __cpuinit __mcheck_cpu_cap_init(void)
1210{ 1224{
1211 unsigned b; 1225 unsigned b;
1212 u64 cap; 1226 u64 cap;
@@ -1214,7 +1228,8 @@ static int __cpuinit mce_cap_init(void)
1214 rdmsrl(MSR_IA32_MCG_CAP, cap); 1228 rdmsrl(MSR_IA32_MCG_CAP, cap);
1215 1229
1216 b = cap & MCG_BANKCNT_MASK; 1230 b = cap & MCG_BANKCNT_MASK;
1217 printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b); 1231 if (!banks)
1232 printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b);
1218 1233
1219 if (b > MAX_NR_BANKS) { 1234 if (b > MAX_NR_BANKS) {
1220 printk(KERN_WARNING 1235 printk(KERN_WARNING
@@ -1227,7 +1242,7 @@ static int __cpuinit mce_cap_init(void)
1227 WARN_ON(banks != 0 && b != banks); 1242 WARN_ON(banks != 0 && b != banks);
1228 banks = b; 1243 banks = b;
1229 if (!mce_banks) { 1244 if (!mce_banks) {
1230 int err = mce_banks_init(); 1245 int err = __mcheck_cpu_mce_banks_init();
1231 1246
1232 if (err) 1247 if (err)
1233 return err; 1248 return err;
@@ -1243,7 +1258,7 @@ static int __cpuinit mce_cap_init(void)
1243 return 0; 1258 return 0;
1244} 1259}
1245 1260
1246static void mce_init(void) 1261static void __mcheck_cpu_init_generic(void)
1247{ 1262{
1248 mce_banks_t all_banks; 1263 mce_banks_t all_banks;
1249 u64 cap; 1264 u64 cap;
@@ -1272,7 +1287,7 @@ static void mce_init(void)
1272} 1287}
1273 1288
1274/* Add per CPU specific workarounds here */ 1289/* Add per CPU specific workarounds here */
1275static int __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c) 1290static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
1276{ 1291{
1277 if (c->x86_vendor == X86_VENDOR_UNKNOWN) { 1292 if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
1278 pr_info("MCE: unknown CPU type - not enabling MCE support.\n"); 1293 pr_info("MCE: unknown CPU type - not enabling MCE support.\n");
@@ -1340,7 +1355,7 @@ static int __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
1340 return 0; 1355 return 0;
1341} 1356}
1342 1357
1343static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c) 1358static void __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
1344{ 1359{
1345 if (c->x86 != 5) 1360 if (c->x86 != 5)
1346 return; 1361 return;
@@ -1354,7 +1369,7 @@ static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c)
1354 } 1369 }
1355} 1370}
1356 1371
1357static void mce_cpu_features(struct cpuinfo_x86 *c) 1372static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
1358{ 1373{
1359 switch (c->x86_vendor) { 1374 switch (c->x86_vendor) {
1360 case X86_VENDOR_INTEL: 1375 case X86_VENDOR_INTEL:
@@ -1368,18 +1383,19 @@ static void mce_cpu_features(struct cpuinfo_x86 *c)
1368 } 1383 }
1369} 1384}
1370 1385
1371static void mce_init_timer(void) 1386static void __mcheck_cpu_init_timer(void)
1372{ 1387{
1373 struct timer_list *t = &__get_cpu_var(mce_timer); 1388 struct timer_list *t = &__get_cpu_var(mce_timer);
1374 int *n = &__get_cpu_var(mce_next_interval); 1389 int *n = &__get_cpu_var(mce_next_interval);
1375 1390
1391 setup_timer(t, mce_start_timer, smp_processor_id());
1392
1376 if (mce_ignore_ce) 1393 if (mce_ignore_ce)
1377 return; 1394 return;
1378 1395
1379 *n = check_interval * HZ; 1396 *n = check_interval * HZ;
1380 if (!*n) 1397 if (!*n)
1381 return; 1398 return;
1382 setup_timer(t, mcheck_timer, smp_processor_id());
1383 t->expires = round_jiffies(jiffies + *n); 1399 t->expires = round_jiffies(jiffies + *n);
1384 add_timer_on(t, smp_processor_id()); 1400 add_timer_on(t, smp_processor_id());
1385} 1401}
@@ -1399,27 +1415,28 @@ void (*machine_check_vector)(struct pt_regs *, long error_code) =
1399 * Called for each booted CPU to set up machine checks. 1415 * Called for each booted CPU to set up machine checks.
1400 * Must be called with preempt off: 1416 * Must be called with preempt off:
1401 */ 1417 */
1402void __cpuinit mcheck_init(struct cpuinfo_x86 *c) 1418void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c)
1403{ 1419{
1404 if (mce_disabled) 1420 if (mce_disabled)
1405 return; 1421 return;
1406 1422
1407 mce_ancient_init(c); 1423 __mcheck_cpu_ancient_init(c);
1408 1424
1409 if (!mce_available(c)) 1425 if (!mce_available(c))
1410 return; 1426 return;
1411 1427
1412 if (mce_cap_init() < 0 || mce_cpu_quirks(c) < 0) { 1428 if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) {
1413 mce_disabled = 1; 1429 mce_disabled = 1;
1414 return; 1430 return;
1415 } 1431 }
1416 1432
1417 machine_check_vector = do_machine_check; 1433 machine_check_vector = do_machine_check;
1418 1434
1419 mce_init(); 1435 __mcheck_cpu_init_generic();
1420 mce_cpu_features(c); 1436 __mcheck_cpu_init_vendor(c);
1421 mce_init_timer(); 1437 __mcheck_cpu_init_timer();
1422 INIT_WORK(&__get_cpu_var(mce_work), mce_process_work); 1438 INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
1439
1423} 1440}
1424 1441
1425/* 1442/*
@@ -1639,6 +1656,15 @@ static int __init mcheck_enable(char *str)
1639} 1656}
1640__setup("mce", mcheck_enable); 1657__setup("mce", mcheck_enable);
1641 1658
1659int __init mcheck_init(void)
1660{
1661 atomic_notifier_chain_register(&x86_mce_decoder_chain, &mce_dec_nb);
1662
1663 mcheck_intel_therm_init();
1664
1665 return 0;
1666}
1667
1642/* 1668/*
1643 * Sysfs support 1669 * Sysfs support
1644 */ 1670 */
@@ -1647,7 +1673,7 @@ __setup("mce", mcheck_enable);
1647 * Disable machine checks on suspend and shutdown. We can't really handle 1673 * Disable machine checks on suspend and shutdown. We can't really handle
1648 * them later. 1674 * them later.
1649 */ 1675 */
1650static int mce_disable(void) 1676static int mce_disable_error_reporting(void)
1651{ 1677{
1652 int i; 1678 int i;
1653 1679
@@ -1662,12 +1688,12 @@ static int mce_disable(void)
1662 1688
1663static int mce_suspend(struct sys_device *dev, pm_message_t state) 1689static int mce_suspend(struct sys_device *dev, pm_message_t state)
1664{ 1690{
1665 return mce_disable(); 1691 return mce_disable_error_reporting();
1666} 1692}
1667 1693
1668static int mce_shutdown(struct sys_device *dev) 1694static int mce_shutdown(struct sys_device *dev)
1669{ 1695{
1670 return mce_disable(); 1696 return mce_disable_error_reporting();
1671} 1697}
1672 1698
1673/* 1699/*
@@ -1677,8 +1703,8 @@ static int mce_shutdown(struct sys_device *dev)
1677 */ 1703 */
1678static int mce_resume(struct sys_device *dev) 1704static int mce_resume(struct sys_device *dev)
1679{ 1705{
1680 mce_init(); 1706 __mcheck_cpu_init_generic();
1681 mce_cpu_features(&current_cpu_data); 1707 __mcheck_cpu_init_vendor(&current_cpu_data);
1682 1708
1683 return 0; 1709 return 0;
1684} 1710}
@@ -1688,8 +1714,8 @@ static void mce_cpu_restart(void *data)
1688 del_timer_sync(&__get_cpu_var(mce_timer)); 1714 del_timer_sync(&__get_cpu_var(mce_timer));
1689 if (!mce_available(&current_cpu_data)) 1715 if (!mce_available(&current_cpu_data))
1690 return; 1716 return;
1691 mce_init(); 1717 __mcheck_cpu_init_generic();
1692 mce_init_timer(); 1718 __mcheck_cpu_init_timer();
1693} 1719}
1694 1720
1695/* Reinit MCEs after user configuration changes */ 1721/* Reinit MCEs after user configuration changes */
@@ -1715,7 +1741,7 @@ static void mce_enable_ce(void *all)
1715 cmci_reenable(); 1741 cmci_reenable();
1716 cmci_recheck(); 1742 cmci_recheck();
1717 if (all) 1743 if (all)
1718 mce_init_timer(); 1744 __mcheck_cpu_init_timer();
1719} 1745}
1720 1746
1721static struct sysdev_class mce_sysclass = { 1747static struct sysdev_class mce_sysclass = {
@@ -1903,7 +1929,7 @@ error2:
1903 sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[j].attr); 1929 sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[j].attr);
1904error: 1930error:
1905 while (--i >= 0) 1931 while (--i >= 0)
1906 sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr); 1932 sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1907 1933
1908 sysdev_unregister(&per_cpu(mce_dev, cpu)); 1934 sysdev_unregister(&per_cpu(mce_dev, cpu));
1909 1935
@@ -1928,13 +1954,14 @@ static __cpuinit void mce_remove_device(unsigned int cpu)
1928} 1954}
1929 1955
1930/* Make sure there are no machine checks on offlined CPUs. */ 1956/* Make sure there are no machine checks on offlined CPUs. */
1931static void mce_disable_cpu(void *h) 1957static void __cpuinit mce_disable_cpu(void *h)
1932{ 1958{
1933 unsigned long action = *(unsigned long *)h; 1959 unsigned long action = *(unsigned long *)h;
1934 int i; 1960 int i;
1935 1961
1936 if (!mce_available(&current_cpu_data)) 1962 if (!mce_available(&current_cpu_data))
1937 return; 1963 return;
1964
1938 if (!(action & CPU_TASKS_FROZEN)) 1965 if (!(action & CPU_TASKS_FROZEN))
1939 cmci_clear(); 1966 cmci_clear();
1940 for (i = 0; i < banks; i++) { 1967 for (i = 0; i < banks; i++) {
@@ -1945,7 +1972,7 @@ static void mce_disable_cpu(void *h)
1945 } 1972 }
1946} 1973}
1947 1974
1948static void mce_reenable_cpu(void *h) 1975static void __cpuinit mce_reenable_cpu(void *h)
1949{ 1976{
1950 unsigned long action = *(unsigned long *)h; 1977 unsigned long action = *(unsigned long *)h;
1951 int i; 1978 int i;
@@ -1990,9 +2017,11 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
1990 break; 2017 break;
1991 case CPU_DOWN_FAILED: 2018 case CPU_DOWN_FAILED:
1992 case CPU_DOWN_FAILED_FROZEN: 2019 case CPU_DOWN_FAILED_FROZEN:
1993 t->expires = round_jiffies(jiffies + 2020 if (!mce_ignore_ce && check_interval) {
2021 t->expires = round_jiffies(jiffies +
1994 __get_cpu_var(mce_next_interval)); 2022 __get_cpu_var(mce_next_interval));
1995 add_timer_on(t, cpu); 2023 add_timer_on(t, cpu);
2024 }
1996 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); 2025 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
1997 break; 2026 break;
1998 case CPU_POST_DEAD: 2027 case CPU_POST_DEAD:
@@ -2024,7 +2053,7 @@ static __init void mce_init_banks(void)
2024 } 2053 }
2025} 2054}
2026 2055
2027static __init int mce_init_device(void) 2056static __init int mcheck_init_device(void)
2028{ 2057{
2029 int err; 2058 int err;
2030 int i = 0; 2059 int i = 0;
@@ -2052,7 +2081,7 @@ static __init int mce_init_device(void)
2052 return err; 2081 return err;
2053} 2082}
2054 2083
2055device_initcall(mce_init_device); 2084device_initcall(mcheck_init_device);
2056 2085
2057/* 2086/*
2058 * Old style boot options parsing. Only for compatibility. 2087 * Old style boot options parsing. Only for compatibility.
@@ -2100,7 +2129,7 @@ static int fake_panic_set(void *data, u64 val)
2100DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get, 2129DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get,
2101 fake_panic_set, "%llu\n"); 2130 fake_panic_set, "%llu\n");
2102 2131
2103static int __init mce_debugfs_init(void) 2132static int __init mcheck_debugfs_init(void)
2104{ 2133{
2105 struct dentry *dmce, *ffake_panic; 2134 struct dentry *dmce, *ffake_panic;
2106 2135
@@ -2114,5 +2143,5 @@ static int __init mce_debugfs_init(void)
2114 2143
2115 return 0; 2144 return 0;
2116} 2145}
2117late_initcall(mce_debugfs_init); 2146late_initcall(mcheck_debugfs_init);
2118#endif 2147#endif
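The single x86_mce_decode_callback function pointer becomes an atomic notifier chain, so several decoders (EDAC drivers, for instance) can register instead of fighting over one hook; the built-in fallback registers at priority -1 so real decoders run first, and a decoder can return NOTIFY_STOP to suppress the generic "run mcelog --ascii" message. A sketch of how an external decoder would hook in, assuming the chain declaration ends up in <asm/mce.h> (the decoder body is illustrative):

#include <linux/kernel.h>
#include <linux/notifier.h>
#include <asm/mce.h>

static int example_decode_mce(struct notifier_block *nb, unsigned long val,
			      void *data)
{
	struct mce *m = data;

	if (!m)
		return NOTIFY_DONE;

	pr_emerg("decoded: bank %d status 0x%llx\n",
		 m->bank, (unsigned long long)m->status);

	/* NOTIFY_STOP keeps the priority -1 fallback message from printing too */
	return NOTIFY_STOP;
}

static struct notifier_block example_mce_dec = {
	.notifier_call	= example_decode_mce,
};

/* registration, e.g. from an EDAC driver's init path:
 *	atomic_notifier_chain_register(&x86_mce_decoder_chain, &example_mce_dec);
 * with the matching atomic_notifier_chain_unregister() on teardown.
 */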
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index b3a1dba75330..81c499eceb21 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -49,6 +49,8 @@ static DEFINE_PER_CPU(struct thermal_state, thermal_state);
49 49
50static atomic_t therm_throt_en = ATOMIC_INIT(0); 50static atomic_t therm_throt_en = ATOMIC_INIT(0);
51 51
52static u32 lvtthmr_init __read_mostly;
53
52#ifdef CONFIG_SYSFS 54#ifdef CONFIG_SYSFS
53#define define_therm_throt_sysdev_one_ro(_name) \ 55#define define_therm_throt_sysdev_one_ro(_name) \
54 static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL) 56 static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL)
@@ -254,14 +256,34 @@ asmlinkage void smp_thermal_interrupt(struct pt_regs *regs)
254 ack_APIC_irq(); 256 ack_APIC_irq();
255} 257}
256 258
259/* Thermal monitoring depends on APIC, ACPI and clock modulation */
260static int intel_thermal_supported(struct cpuinfo_x86 *c)
261{
262 if (!cpu_has_apic)
263 return 0;
264 if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC))
265 return 0;
266 return 1;
267}
268
269void __init mcheck_intel_therm_init(void)
270{
271 /*
272 * This function is only called on boot CPU. Save the init thermal
273 * LVT value on BSP and use that value to restore APs' thermal LVT
274 * entry BIOS programmed later
275 */
276 if (intel_thermal_supported(&boot_cpu_data))
277 lvtthmr_init = apic_read(APIC_LVTTHMR);
278}
279
257void intel_init_thermal(struct cpuinfo_x86 *c) 280void intel_init_thermal(struct cpuinfo_x86 *c)
258{ 281{
259 unsigned int cpu = smp_processor_id(); 282 unsigned int cpu = smp_processor_id();
260 int tm2 = 0; 283 int tm2 = 0;
261 u32 l, h; 284 u32 l, h;
262 285
263 /* Thermal monitoring depends on ACPI and clock modulation*/ 286 if (!intel_thermal_supported(c))
264 if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC))
265 return; 287 return;
266 288
267 /* 289 /*
@@ -270,7 +292,20 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
270 * since it might be delivered via SMI already: 292 * since it might be delivered via SMI already:
271 */ 293 */
272 rdmsr(MSR_IA32_MISC_ENABLE, l, h); 294 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
273 h = apic_read(APIC_LVTTHMR); 295
296 /*
297 * The initial value of thermal LVT entries on all APs always reads
298 * 0x10000 because APs are woken up by BSP issuing INIT-SIPI-SIPI
299 * sequence to them and LVT registers are reset to 0s except for
300 * the mask bits which are set to 1s when APs receive INIT IPI.
301 * Always restore the value that BIOS has programmed on AP based on
302 * BSP's info we saved since BIOS is always setting the same value
303 * for all threads/cores
304 */
305 apic_write(APIC_LVTTHMR, lvtthmr_init);
306
307 h = lvtthmr_init;
308
274 if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { 309 if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
275 printk(KERN_DEBUG 310 printk(KERN_DEBUG
276 "CPU%d: Thermal monitoring handled by SMI\n", cpu); 311 "CPU%d: Thermal monitoring handled by SMI\n", cpu);
@@ -312,8 +347,8 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
312 l = apic_read(APIC_LVTTHMR); 347 l = apic_read(APIC_LVTTHMR);
313 apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); 348 apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
314 349
315 printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n", 350 printk_once(KERN_INFO "CPU0: Thermal monitoring enabled (%s)\n",
316 cpu, tm2 ? "TM2" : "TM1"); 351 tm2 ? "TM2" : "TM1");
317 352
318 /* enable thermal throttle processing */ 353 /* enable thermal throttle processing */
319 atomic_set(&therm_throt_en, 1); 354 atomic_set(&therm_throt_en, 1);
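
The therm_throt.c changes above split thermal-LVT handling into two steps: mcheck_intel_therm_init() runs once on the boot CPU and records the BIOS-programmed APIC_LVTTHMR value, and intel_init_thermal() later writes that saved value back on each AP, whose LVT entries were reset to masked defaults by the INIT IPI. A condensed sketch of that save-on-BSP / replay-on-AP flow; the function names are illustrative, only apic_read()/apic_write(), APIC_LVTTHMR and cpu_has_apic are the kernel's own.

    #include <linux/cache.h>
    #include <linux/init.h>
    #include <linux/types.h>
    #include <asm/apic.h>
    #include <asm/processor.h>

    /* captured once on the boot CPU, as lvtthmr_init is in the patch */
    static u32 lvtthmr_saved __read_mostly;

    /* boot CPU, early: remember what the BIOS programmed */
    void __init thermal_lvt_save_example(void)
    {
            if (cpu_has_apic)
                    lvtthmr_saved = apic_read(APIC_LVTTHMR);
    }

    /* each AP, during bring-up: its LVT was cleared by INIT, so replay it */
    void thermal_lvt_restore_example(void)
    {
            apic_write(APIC_LVTTHMR, lvtthmr_saved);
    }
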
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index 6987af786c02..09b1698e0466 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -875,7 +875,7 @@ int __init mtrr_cleanup(unsigned address_bits)
875 sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); 875 sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
876 876
877 range_sums = sum_ranges(range, nr_range); 877 range_sums = sum_ranges(range, nr_range);
878 printk(KERN_INFO "total RAM coverred: %ldM\n", 878 printk(KERN_INFO "total RAM covered: %ldM\n",
879 range_sums >> (20 - PAGE_SHIFT)); 879 range_sums >> (20 - PAGE_SHIFT));
880 880
881 if (mtrr_chunk_size && mtrr_gran_size) { 881 if (mtrr_chunk_size && mtrr_gran_size) {
diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c
index 3c1b12d461d1..e006e56f699c 100644
--- a/arch/x86/kernel/cpu/mtrr/if.c
+++ b/arch/x86/kernel/cpu/mtrr/if.c
@@ -4,6 +4,7 @@
4#include <linux/proc_fs.h> 4#include <linux/proc_fs.h>
5#include <linux/module.h> 5#include <linux/module.h>
6#include <linux/ctype.h> 6#include <linux/ctype.h>
7#include <linux/string.h>
7#include <linux/init.h> 8#include <linux/init.h>
8 9
9#define LINE_SIZE 80 10#define LINE_SIZE 80
@@ -133,8 +134,7 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos)
133 return -EINVAL; 134 return -EINVAL;
134 135
135 base = simple_strtoull(line + 5, &ptr, 0); 136 base = simple_strtoull(line + 5, &ptr, 0);
136 while (isspace(*ptr)) 137 ptr = skip_spaces(ptr);
137 ptr++;
138 138
139 if (strncmp(ptr, "size=", 5)) 139 if (strncmp(ptr, "size=", 5))
140 return -EINVAL; 140 return -EINVAL;
@@ -142,14 +142,11 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos)
142 size = simple_strtoull(ptr + 5, &ptr, 0); 142 size = simple_strtoull(ptr + 5, &ptr, 0);
143 if ((base & 0xfff) || (size & 0xfff)) 143 if ((base & 0xfff) || (size & 0xfff))
144 return -EINVAL; 144 return -EINVAL;
145 while (isspace(*ptr)) 145 ptr = skip_spaces(ptr);
146 ptr++;
147 146
148 if (strncmp(ptr, "type=", 5)) 147 if (strncmp(ptr, "type=", 5))
149 return -EINVAL; 148 return -EINVAL;
150 ptr += 5; 149 ptr = skip_spaces(ptr + 5);
151 while (isspace(*ptr))
152 ptr++;
153 150
154 for (i = 0; i < MTRR_NUM_TYPES; ++i) { 151 for (i = 0; i < MTRR_NUM_TYPES; ++i) {
155 if (strcmp(ptr, mtrr_strings[i])) 152 if (strcmp(ptr, mtrr_strings[i]))
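
The mtrr_write() hunk above swaps three open-coded isspace() loops for the skip_spaces() helper from <linux/string.h>. A small sketch of the resulting key=value parsing style; parse_base_size() and its argument handling are illustrative, while skip_spaces(), simple_strtoull() and strncmp() are real kernel helpers.

    #include <linux/errno.h>
    #include <linux/kernel.h>
    #include <linux/string.h>
    #include <linux/types.h>

    /*
     * Parse a "base=<n> size=<n>" style line, eating the blanks between
     * fields with skip_spaces() instead of a while (isspace(*ptr)) loop.
     */
    static int parse_base_size(char *line, u64 *base, u64 *size)
    {
            char *ptr;

            if (strncmp(line, "base=", 5))
                    return -EINVAL;
            *base = simple_strtoull(line + 5, &ptr, 0);

            ptr = skip_spaces(ptr);         /* skip blanks before the next key */
            if (strncmp(ptr, "size=", 5))
                    return -EINVAL;
            *size = simple_strtoull(ptr + 5, &ptr, 0);

            return 0;
    }
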
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index b5801c311846..8c1c07073ccc 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -77,6 +77,18 @@ struct cpu_hw_events {
77 struct debug_store *ds; 77 struct debug_store *ds;
78}; 78};
79 79
80struct event_constraint {
81 unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
82 int code;
83};
84
85#define EVENT_CONSTRAINT(c, m) { .code = (c), .idxmsk[0] = (m) }
86#define EVENT_CONSTRAINT_END { .code = 0, .idxmsk[0] = 0 }
87
88#define for_each_event_constraint(e, c) \
89 for ((e) = (c); (e)->idxmsk[0]; (e)++)
90
91
80/* 92/*
81 * struct x86_pmu - generic x86 pmu 93 * struct x86_pmu - generic x86 pmu
82 */ 94 */
@@ -102,6 +114,8 @@ struct x86_pmu {
102 u64 intel_ctrl; 114 u64 intel_ctrl;
103 void (*enable_bts)(u64 config); 115 void (*enable_bts)(u64 config);
104 void (*disable_bts)(void); 116 void (*disable_bts)(void);
117 int (*get_event_idx)(struct cpu_hw_events *cpuc,
118 struct hw_perf_event *hwc);
105}; 119};
106 120
107static struct x86_pmu x86_pmu __read_mostly; 121static struct x86_pmu x86_pmu __read_mostly;
@@ -110,6 +124,8 @@ static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
110 .enabled = 1, 124 .enabled = 1,
111}; 125};
112 126
127static const struct event_constraint *event_constraints;
128
113/* 129/*
114 * Not sure about some of these 130 * Not sure about some of these
115 */ 131 */
@@ -155,6 +171,16 @@ static u64 p6_pmu_raw_event(u64 hw_event)
155 return hw_event & P6_EVNTSEL_MASK; 171 return hw_event & P6_EVNTSEL_MASK;
156} 172}
157 173
174static const struct event_constraint intel_p6_event_constraints[] =
175{
176 EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */
177 EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
178 EVENT_CONSTRAINT(0x11, 0x1), /* FP_ASSIST */
179 EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
180 EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
181 EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
182 EVENT_CONSTRAINT_END
183};
158 184
159/* 185/*
160 * Intel PerfMon v3. Used on Core2 and later. 186 * Intel PerfMon v3. Used on Core2 and later.
@@ -170,6 +196,35 @@ static const u64 intel_perfmon_event_map[] =
170 [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, 196 [PERF_COUNT_HW_BUS_CYCLES] = 0x013c,
171}; 197};
172 198
199static const struct event_constraint intel_core_event_constraints[] =
200{
201 EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
202 EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
203 EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
204 EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
205 EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
206 EVENT_CONSTRAINT(0x18, 0x1), /* IDLE_DURING_DIV */
207 EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */
208 EVENT_CONSTRAINT(0xa1, 0x1), /* RS_UOPS_DISPATCH_CYCLES */
209 EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED */
210 EVENT_CONSTRAINT_END
211};
212
213static const struct event_constraint intel_nehalem_event_constraints[] =
214{
215 EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */
216 EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */
217 EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */
218 EVENT_CONSTRAINT(0x43, 0x3), /* L1D_ALL_REF */
219 EVENT_CONSTRAINT(0x4e, 0x3), /* L1D_PREFETCH */
220 EVENT_CONSTRAINT(0x4c, 0x3), /* LOAD_HIT_PRE */
221 EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
222 EVENT_CONSTRAINT(0x52, 0x3), /* L1D_CACHE_PREFETCH_LOCK_FB_HIT */
223 EVENT_CONSTRAINT(0x53, 0x3), /* L1D_CACHE_LOCK_FB_HIT */
224 EVENT_CONSTRAINT(0xc5, 0x3), /* CACHE_LOCK_CYCLES */
225 EVENT_CONSTRAINT_END
226};
227
173static u64 intel_pmu_event_map(int hw_event) 228static u64 intel_pmu_event_map(int hw_event)
174{ 229{
175 return intel_perfmon_event_map[hw_event]; 230 return intel_perfmon_event_map[hw_event];
@@ -190,7 +245,7 @@ static u64 __read_mostly hw_cache_event_ids
190 [PERF_COUNT_HW_CACHE_OP_MAX] 245 [PERF_COUNT_HW_CACHE_OP_MAX]
191 [PERF_COUNT_HW_CACHE_RESULT_MAX]; 246 [PERF_COUNT_HW_CACHE_RESULT_MAX];
192 247
193static const u64 nehalem_hw_cache_event_ids 248static __initconst u64 nehalem_hw_cache_event_ids
194 [PERF_COUNT_HW_CACHE_MAX] 249 [PERF_COUNT_HW_CACHE_MAX]
195 [PERF_COUNT_HW_CACHE_OP_MAX] 250 [PERF_COUNT_HW_CACHE_OP_MAX]
196 [PERF_COUNT_HW_CACHE_RESULT_MAX] = 251 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -281,7 +336,7 @@ static const u64 nehalem_hw_cache_event_ids
281 }, 336 },
282}; 337};
283 338
284static const u64 core2_hw_cache_event_ids 339static __initconst u64 core2_hw_cache_event_ids
285 [PERF_COUNT_HW_CACHE_MAX] 340 [PERF_COUNT_HW_CACHE_MAX]
286 [PERF_COUNT_HW_CACHE_OP_MAX] 341 [PERF_COUNT_HW_CACHE_OP_MAX]
287 [PERF_COUNT_HW_CACHE_RESULT_MAX] = 342 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -372,7 +427,7 @@ static const u64 core2_hw_cache_event_ids
372 }, 427 },
373}; 428};
374 429
375static const u64 atom_hw_cache_event_ids 430static __initconst u64 atom_hw_cache_event_ids
376 [PERF_COUNT_HW_CACHE_MAX] 431 [PERF_COUNT_HW_CACHE_MAX]
377 [PERF_COUNT_HW_CACHE_OP_MAX] 432 [PERF_COUNT_HW_CACHE_OP_MAX]
378 [PERF_COUNT_HW_CACHE_RESULT_MAX] = 433 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -469,7 +524,7 @@ static u64 intel_pmu_raw_event(u64 hw_event)
469#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL 524#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL
470#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL 525#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL
471#define CORE_EVNTSEL_INV_MASK 0x00800000ULL 526#define CORE_EVNTSEL_INV_MASK 0x00800000ULL
472#define CORE_EVNTSEL_REG_MASK 0xFF000000ULL 527#define CORE_EVNTSEL_REG_MASK 0xFF000000ULL
473 528
474#define CORE_EVNTSEL_MASK \ 529#define CORE_EVNTSEL_MASK \
475 (CORE_EVNTSEL_EVENT_MASK | \ 530 (CORE_EVNTSEL_EVENT_MASK | \
@@ -481,7 +536,7 @@ static u64 intel_pmu_raw_event(u64 hw_event)
481 return hw_event & CORE_EVNTSEL_MASK; 536 return hw_event & CORE_EVNTSEL_MASK;
482} 537}
483 538
484static const u64 amd_hw_cache_event_ids 539static __initconst u64 amd_hw_cache_event_ids
485 [PERF_COUNT_HW_CACHE_MAX] 540 [PERF_COUNT_HW_CACHE_MAX]
486 [PERF_COUNT_HW_CACHE_OP_MAX] 541 [PERF_COUNT_HW_CACHE_OP_MAX]
487 [PERF_COUNT_HW_CACHE_RESULT_MAX] = 542 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -932,6 +987,8 @@ static int __hw_perf_event_init(struct perf_event *event)
932 */ 987 */
933 hwc->config = ARCH_PERFMON_EVENTSEL_INT; 988 hwc->config = ARCH_PERFMON_EVENTSEL_INT;
934 989
990 hwc->idx = -1;
991
935 /* 992 /*
936 * Count user and OS events unless requested not to. 993 * Count user and OS events unless requested not to.
937 */ 994 */
@@ -1229,7 +1286,7 @@ x86_perf_event_set_period(struct perf_event *event,
1229 return 0; 1286 return 0;
1230 1287
1231 /* 1288 /*
1232 * If we are way outside a reasoable range then just skip forward: 1289 * If we are way outside a reasonable range then just skip forward:
1233 */ 1290 */
1234 if (unlikely(left <= -period)) { 1291 if (unlikely(left <= -period)) {
1235 left = period; 1292 left = period;
@@ -1286,6 +1343,13 @@ intel_pmu_enable_fixed(struct hw_perf_event *hwc, int __idx)
1286 bits |= 0x2; 1343 bits |= 0x2;
1287 if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) 1344 if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
1288 bits |= 0x1; 1345 bits |= 0x1;
1346
1347 /*
1348 * ANY bit is supported in v3 and up
1349 */
1350 if (x86_pmu.version > 2 && hwc->config & ARCH_PERFMON_EVENTSEL_ANY)
1351 bits |= 0x4;
1352
1289 bits <<= (idx * 4); 1353 bits <<= (idx * 4);
1290 mask = 0xfULL << (idx * 4); 1354 mask = 0xfULL << (idx * 4);
1291 1355
@@ -1334,8 +1398,7 @@ static void amd_pmu_enable_event(struct hw_perf_event *hwc, int idx)
1334 x86_pmu_enable_event(hwc, idx); 1398 x86_pmu_enable_event(hwc, idx);
1335} 1399}
1336 1400
1337static int 1401static int fixed_mode_idx(struct hw_perf_event *hwc)
1338fixed_mode_idx(struct perf_event *event, struct hw_perf_event *hwc)
1339{ 1402{
1340 unsigned int hw_event; 1403 unsigned int hw_event;
1341 1404
@@ -1349,6 +1412,12 @@ fixed_mode_idx(struct perf_event *event, struct hw_perf_event *hwc)
1349 if (!x86_pmu.num_events_fixed) 1412 if (!x86_pmu.num_events_fixed)
1350 return -1; 1413 return -1;
1351 1414
1415 /*
1416 * fixed counters do not take all possible filters
1417 */
1418 if (hwc->config & ARCH_PERFMON_EVENT_FILTER_MASK)
1419 return -1;
1420
1352 if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) 1421 if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS)))
1353 return X86_PMC_IDX_FIXED_INSTRUCTIONS; 1422 return X86_PMC_IDX_FIXED_INSTRUCTIONS;
1354 if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES))) 1423 if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES)))
@@ -1360,22 +1429,57 @@ fixed_mode_idx(struct perf_event *event, struct hw_perf_event *hwc)
1360} 1429}
1361 1430
1362/* 1431/*
1363 * Find a PMC slot for the freshly enabled / scheduled in event: 1432 * generic counter allocator: get next free counter
1364 */ 1433 */
1365static int x86_pmu_enable(struct perf_event *event) 1434static int
1435gen_get_event_idx(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc)
1436{
1437 int idx;
1438
1439 idx = find_first_zero_bit(cpuc->used_mask, x86_pmu.num_events);
1440 return idx == x86_pmu.num_events ? -1 : idx;
1441}
1442
1443/*
1444 * intel-specific counter allocator: check event constraints
1445 */
1446static int
1447intel_get_event_idx(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc)
1448{
1449 const struct event_constraint *event_constraint;
1450 int i, code;
1451
1452 if (!event_constraints)
1453 goto skip;
1454
1455 code = hwc->config & CORE_EVNTSEL_EVENT_MASK;
1456
1457 for_each_event_constraint(event_constraint, event_constraints) {
1458 if (code == event_constraint->code) {
1459 for_each_bit(i, event_constraint->idxmsk, X86_PMC_IDX_MAX) {
1460 if (!test_and_set_bit(i, cpuc->used_mask))
1461 return i;
1462 }
1463 return -1;
1464 }
1465 }
1466skip:
1467 return gen_get_event_idx(cpuc, hwc);
1468}
1469
1470static int
1471x86_schedule_event(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc)
1366{ 1472{
1367 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1368 struct hw_perf_event *hwc = &event->hw;
1369 int idx; 1473 int idx;
1370 1474
1371 idx = fixed_mode_idx(event, hwc); 1475 idx = fixed_mode_idx(hwc);
1372 if (idx == X86_PMC_IDX_FIXED_BTS) { 1476 if (idx == X86_PMC_IDX_FIXED_BTS) {
1373 /* BTS is already occupied. */ 1477 /* BTS is already occupied. */
1374 if (test_and_set_bit(idx, cpuc->used_mask)) 1478 if (test_and_set_bit(idx, cpuc->used_mask))
1375 return -EAGAIN; 1479 return -EAGAIN;
1376 1480
1377 hwc->config_base = 0; 1481 hwc->config_base = 0;
1378 hwc->event_base = 0; 1482 hwc->event_base = 0;
1379 hwc->idx = idx; 1483 hwc->idx = idx;
1380 } else if (idx >= 0) { 1484 } else if (idx >= 0) {
1381 /* 1485 /*
@@ -1396,20 +1500,35 @@ static int x86_pmu_enable(struct perf_event *event)
1396 } else { 1500 } else {
1397 idx = hwc->idx; 1501 idx = hwc->idx;
1398 /* Try to get the previous generic event again */ 1502 /* Try to get the previous generic event again */
1399 if (test_and_set_bit(idx, cpuc->used_mask)) { 1503 if (idx == -1 || test_and_set_bit(idx, cpuc->used_mask)) {
1400try_generic: 1504try_generic:
1401 idx = find_first_zero_bit(cpuc->used_mask, 1505 idx = x86_pmu.get_event_idx(cpuc, hwc);
1402 x86_pmu.num_events); 1506 if (idx == -1)
1403 if (idx == x86_pmu.num_events)
1404 return -EAGAIN; 1507 return -EAGAIN;
1405 1508
1406 set_bit(idx, cpuc->used_mask); 1509 set_bit(idx, cpuc->used_mask);
1407 hwc->idx = idx; 1510 hwc->idx = idx;
1408 } 1511 }
1409 hwc->config_base = x86_pmu.eventsel; 1512 hwc->config_base = x86_pmu.eventsel;
1410 hwc->event_base = x86_pmu.perfctr; 1513 hwc->event_base = x86_pmu.perfctr;
1411 } 1514 }
1412 1515
1516 return idx;
1517}
1518
1519/*
1520 * Find a PMC slot for the freshly enabled / scheduled in event:
1521 */
1522static int x86_pmu_enable(struct perf_event *event)
1523{
1524 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1525 struct hw_perf_event *hwc = &event->hw;
1526 int idx;
1527
1528 idx = x86_schedule_event(cpuc, hwc);
1529 if (idx < 0)
1530 return idx;
1531
1413 perf_events_lapic_init(); 1532 perf_events_lapic_init();
1414 1533
1415 x86_pmu.disable(hwc, idx); 1534 x86_pmu.disable(hwc, idx);
@@ -1520,6 +1639,7 @@ static void intel_pmu_drain_bts_buffer(struct cpu_hw_events *cpuc)
1520 1639
1521 data.period = event->hw.last_period; 1640 data.period = event->hw.last_period;
1522 data.addr = 0; 1641 data.addr = 0;
1642 data.raw = NULL;
1523 regs.ip = 0; 1643 regs.ip = 0;
1524 1644
1525 /* 1645 /*
@@ -1637,6 +1757,7 @@ static int p6_pmu_handle_irq(struct pt_regs *regs)
1637 u64 val; 1757 u64 val;
1638 1758
1639 data.addr = 0; 1759 data.addr = 0;
1760 data.raw = NULL;
1640 1761
1641 cpuc = &__get_cpu_var(cpu_hw_events); 1762 cpuc = &__get_cpu_var(cpu_hw_events);
1642 1763
@@ -1682,6 +1803,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
1682 u64 ack, status; 1803 u64 ack, status;
1683 1804
1684 data.addr = 0; 1805 data.addr = 0;
1806 data.raw = NULL;
1685 1807
1686 cpuc = &__get_cpu_var(cpu_hw_events); 1808 cpuc = &__get_cpu_var(cpu_hw_events);
1687 1809
@@ -1745,6 +1867,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs)
1745 u64 val; 1867 u64 val;
1746 1868
1747 data.addr = 0; 1869 data.addr = 0;
1870 data.raw = NULL;
1748 1871
1749 cpuc = &__get_cpu_var(cpu_hw_events); 1872 cpuc = &__get_cpu_var(cpu_hw_events);
1750 1873
@@ -1852,7 +1975,7 @@ static __read_mostly struct notifier_block perf_event_nmi_notifier = {
1852 .priority = 1 1975 .priority = 1
1853}; 1976};
1854 1977
1855static struct x86_pmu p6_pmu = { 1978static __initconst struct x86_pmu p6_pmu = {
1856 .name = "p6", 1979 .name = "p6",
1857 .handle_irq = p6_pmu_handle_irq, 1980 .handle_irq = p6_pmu_handle_irq,
1858 .disable_all = p6_pmu_disable_all, 1981 .disable_all = p6_pmu_disable_all,
@@ -1877,9 +2000,10 @@ static struct x86_pmu p6_pmu = {
1877 */ 2000 */
1878 .event_bits = 32, 2001 .event_bits = 32,
1879 .event_mask = (1ULL << 32) - 1, 2002 .event_mask = (1ULL << 32) - 1,
2003 .get_event_idx = intel_get_event_idx,
1880}; 2004};
1881 2005
1882static struct x86_pmu intel_pmu = { 2006static __initconst struct x86_pmu intel_pmu = {
1883 .name = "Intel", 2007 .name = "Intel",
1884 .handle_irq = intel_pmu_handle_irq, 2008 .handle_irq = intel_pmu_handle_irq,
1885 .disable_all = intel_pmu_disable_all, 2009 .disable_all = intel_pmu_disable_all,
@@ -1900,9 +2024,10 @@ static struct x86_pmu intel_pmu = {
1900 .max_period = (1ULL << 31) - 1, 2024 .max_period = (1ULL << 31) - 1,
1901 .enable_bts = intel_pmu_enable_bts, 2025 .enable_bts = intel_pmu_enable_bts,
1902 .disable_bts = intel_pmu_disable_bts, 2026 .disable_bts = intel_pmu_disable_bts,
2027 .get_event_idx = intel_get_event_idx,
1903}; 2028};
1904 2029
1905static struct x86_pmu amd_pmu = { 2030static __initconst struct x86_pmu amd_pmu = {
1906 .name = "AMD", 2031 .name = "AMD",
1907 .handle_irq = amd_pmu_handle_irq, 2032 .handle_irq = amd_pmu_handle_irq,
1908 .disable_all = amd_pmu_disable_all, 2033 .disable_all = amd_pmu_disable_all,
@@ -1920,9 +2045,10 @@ static struct x86_pmu amd_pmu = {
1920 .apic = 1, 2045 .apic = 1,
1921 /* use highest bit to detect overflow */ 2046 /* use highest bit to detect overflow */
1922 .max_period = (1ULL << 47) - 1, 2047 .max_period = (1ULL << 47) - 1,
2048 .get_event_idx = gen_get_event_idx,
1923}; 2049};
1924 2050
1925static int p6_pmu_init(void) 2051static __init int p6_pmu_init(void)
1926{ 2052{
1927 switch (boot_cpu_data.x86_model) { 2053 switch (boot_cpu_data.x86_model) {
1928 case 1: 2054 case 1:
@@ -1932,10 +2058,12 @@ static int p6_pmu_init(void)
1932 case 7: 2058 case 7:
1933 case 8: 2059 case 8:
1934 case 11: /* Pentium III */ 2060 case 11: /* Pentium III */
2061 event_constraints = intel_p6_event_constraints;
1935 break; 2062 break;
1936 case 9: 2063 case 9:
1937 case 13: 2064 case 13:
1938 /* Pentium M */ 2065 /* Pentium M */
2066 event_constraints = intel_p6_event_constraints;
1939 break; 2067 break;
1940 default: 2068 default:
1941 pr_cont("unsupported p6 CPU model %d ", 2069 pr_cont("unsupported p6 CPU model %d ",
@@ -1945,16 +2073,10 @@ static int p6_pmu_init(void)
1945 2073
1946 x86_pmu = p6_pmu; 2074 x86_pmu = p6_pmu;
1947 2075
1948 if (!cpu_has_apic) {
1949 pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
1950 pr_info("no hardware sampling interrupt available.\n");
1951 x86_pmu.apic = 0;
1952 }
1953
1954 return 0; 2076 return 0;
1955} 2077}
1956 2078
1957static int intel_pmu_init(void) 2079static __init int intel_pmu_init(void)
1958{ 2080{
1959 union cpuid10_edx edx; 2081 union cpuid10_edx edx;
1960 union cpuid10_eax eax; 2082 union cpuid10_eax eax;
@@ -2007,12 +2129,14 @@ static int intel_pmu_init(void)
2007 sizeof(hw_cache_event_ids)); 2129 sizeof(hw_cache_event_ids));
2008 2130
2009 pr_cont("Core2 events, "); 2131 pr_cont("Core2 events, ");
2132 event_constraints = intel_core_event_constraints;
2010 break; 2133 break;
2011 default: 2134 default:
2012 case 26: 2135 case 26:
2013 memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, 2136 memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
2014 sizeof(hw_cache_event_ids)); 2137 sizeof(hw_cache_event_ids));
2015 2138
2139 event_constraints = intel_nehalem_event_constraints;
2016 pr_cont("Nehalem/Corei7 events, "); 2140 pr_cont("Nehalem/Corei7 events, ");
2017 break; 2141 break;
2018 case 28: 2142 case 28:
@@ -2025,7 +2149,7 @@ static int intel_pmu_init(void)
2025 return 0; 2149 return 0;
2026} 2150}
2027 2151
2028static int amd_pmu_init(void) 2152static __init int amd_pmu_init(void)
2029{ 2153{
2030 /* Performance-monitoring supported from K7 and later: */ 2154 /* Performance-monitoring supported from K7 and later: */
2031 if (boot_cpu_data.x86 < 6) 2155 if (boot_cpu_data.x86 < 6)
@@ -2040,6 +2164,16 @@ static int amd_pmu_init(void)
2040 return 0; 2164 return 0;
2041} 2165}
2042 2166
2167static void __init pmu_check_apic(void)
2168{
2169 if (cpu_has_apic)
2170 return;
2171
2172 x86_pmu.apic = 0;
2173 pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
2174 pr_info("no hardware sampling interrupt available.\n");
2175}
2176
2043void __init init_hw_perf_events(void) 2177void __init init_hw_perf_events(void)
2044{ 2178{
2045 int err; 2179 int err;
@@ -2061,6 +2195,8 @@ void __init init_hw_perf_events(void)
2061 return; 2195 return;
2062 } 2196 }
2063 2197
2198 pmu_check_apic();
2199
2064 pr_cont("%s PMU driver.\n", x86_pmu.name); 2200 pr_cont("%s PMU driver.\n", x86_pmu.name);
2065 2201
2066 if (x86_pmu.num_events > X86_PMC_MAX_GENERIC) { 2202 if (x86_pmu.num_events > X86_PMC_MAX_GENERIC) {
@@ -2105,11 +2241,47 @@ static const struct pmu pmu = {
2105 .unthrottle = x86_pmu_unthrottle, 2241 .unthrottle = x86_pmu_unthrottle,
2106}; 2242};
2107 2243
2244static int
2245validate_event(struct cpu_hw_events *cpuc, struct perf_event *event)
2246{
2247 struct hw_perf_event fake_event = event->hw;
2248
2249 if (event->pmu && event->pmu != &pmu)
2250 return 0;
2251
2252 return x86_schedule_event(cpuc, &fake_event) >= 0;
2253}
2254
2255static int validate_group(struct perf_event *event)
2256{
2257 struct perf_event *sibling, *leader = event->group_leader;
2258 struct cpu_hw_events fake_pmu;
2259
2260 memset(&fake_pmu, 0, sizeof(fake_pmu));
2261
2262 if (!validate_event(&fake_pmu, leader))
2263 return -ENOSPC;
2264
2265 list_for_each_entry(sibling, &leader->sibling_list, group_entry) {
2266 if (!validate_event(&fake_pmu, sibling))
2267 return -ENOSPC;
2268 }
2269
2270 if (!validate_event(&fake_pmu, event))
2271 return -ENOSPC;
2272
2273 return 0;
2274}
2275
2108const struct pmu *hw_perf_event_init(struct perf_event *event) 2276const struct pmu *hw_perf_event_init(struct perf_event *event)
2109{ 2277{
2110 int err; 2278 int err;
2111 2279
2112 err = __hw_perf_event_init(event); 2280 err = __hw_perf_event_init(event);
2281 if (!err) {
2282 if (event->group_leader != event)
2283 err = validate_group(event);
2284 }
2113 if (err) { 2285 if (err) {
2114 if (event->destroy) 2286 if (event->destroy)
2115 event->destroy(event); 2287 event->destroy(event);
@@ -2132,7 +2304,7 @@ void callchain_store(struct perf_callchain_entry *entry, u64 ip)
2132 2304
2133static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry); 2305static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry);
2134static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry); 2306static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry);
2135static DEFINE_PER_CPU(int, in_nmi_frame); 2307static DEFINE_PER_CPU(int, in_ignored_frame);
2136 2308
2137 2309
2138static void 2310static void
@@ -2148,8 +2320,9 @@ static void backtrace_warning(void *data, char *msg)
2148 2320
2149static int backtrace_stack(void *data, char *name) 2321static int backtrace_stack(void *data, char *name)
2150{ 2322{
2151 per_cpu(in_nmi_frame, smp_processor_id()) = 2323 per_cpu(in_ignored_frame, smp_processor_id()) =
2152 x86_is_stack_id(NMI_STACK, name); 2324 x86_is_stack_id(NMI_STACK, name) ||
2325 x86_is_stack_id(DEBUG_STACK, name);
2153 2326
2154 return 0; 2327 return 0;
2155} 2328}
@@ -2158,7 +2331,7 @@ static void backtrace_address(void *data, unsigned long addr, int reliable)
2158{ 2331{
2159 struct perf_callchain_entry *entry = data; 2332 struct perf_callchain_entry *entry = data;
2160 2333
2161 if (per_cpu(in_nmi_frame, smp_processor_id())) 2334 if (per_cpu(in_ignored_frame, smp_processor_id()))
2162 return; 2335 return;
2163 2336
2164 if (reliable) 2337 if (reliable)
@@ -2170,6 +2343,7 @@ static const struct stacktrace_ops backtrace_ops = {
2170 .warning_symbol = backtrace_warning_symbol, 2343 .warning_symbol = backtrace_warning_symbol,
2171 .stack = backtrace_stack, 2344 .stack = backtrace_stack,
2172 .address = backtrace_address, 2345 .address = backtrace_address,
2346 .walk_stack = print_context_stack_bp,
2173}; 2347};
2174 2348
2175#include "../dumpstack.h" 2349#include "../dumpstack.h"
@@ -2180,7 +2354,7 @@ perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
2180 callchain_store(entry, PERF_CONTEXT_KERNEL); 2354 callchain_store(entry, PERF_CONTEXT_KERNEL);
2181 callchain_store(entry, regs->ip); 2355 callchain_store(entry, regs->ip);
2182 2356
2183 dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry); 2357 dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry);
2184} 2358}
2185 2359
2186/* 2360/*
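
The perf_event.c changes above introduce per-model event-constraint tables (intel_p6/core/nehalem_event_constraints), route counter selection through a per-PMU get_event_idx() hook, and add validate_group(), which dry-runs x86_schedule_event() against a zeroed fake cpu_hw_events and fails group creation with -ENOSPC when the events cannot all be placed. The standalone sketch below condenses the allocation rule: a constrained event may only land on the counters named in its index mask, and everything else takes the first free counter. The types and names are simplified stand-ins for the kernel's event_constraint/cpu_hw_events machinery, not the real structures.

    #include <stdint.h>

    struct constraint {
            uint64_t idxmsk;        /* allowed counter indices, one bit per counter */
            int      code;          /* event select code; idxmsk == 0 ends the table */
    };

    static int pick_counter(const struct constraint *table, int code,
                            uint64_t *used, int num_counters)
    {
            int i;

            /* a NULL table means "no constraints", like the skip path above */
            for (; table && table->idxmsk; table++) {
                    if (table->code != code)
                            continue;
                    /* constrained event: only counters in idxmsk are legal */
                    for (i = 0; i < num_counters; i++) {
                            if ((table->idxmsk & (1ULL << i)) &&
                                !(*used & (1ULL << i))) {
                                    *used |= 1ULL << i;
                                    return i;
                            }
                    }
                    return -1;      /* every legal counter is busy */
            }

            /* unconstrained event: first free counter wins */
            for (i = 0; i < num_counters; i++) {
                    if (!(*used & (1ULL << i))) {
                            *used |= 1ULL << i;
                            return i;
                    }
            }
            return -1;
    }

Group validation then amounts to replaying pick_counter() for the leader and every sibling against a scratch "used" mask and rejecting the group if any call returns -1, which is what validate_group() does with its on-stack fake_pmu.
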
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index fab786f60ed6..898df9719afb 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -712,7 +712,7 @@ static void probe_nmi_watchdog(void)
712 switch (boot_cpu_data.x86_vendor) { 712 switch (boot_cpu_data.x86_vendor) {
713 case X86_VENDOR_AMD: 713 case X86_VENDOR_AMD:
714 if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 && 714 if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 &&
715 boot_cpu_data.x86 != 16) 715 boot_cpu_data.x86 != 16 && boot_cpu_data.x86 != 17)
716 return; 716 return;
717 wd_ops = &k7_wd_ops; 717 wd_ops = &k7_wd_ops;
718 break; 718 break;
diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c
index bb62b3e5caad..28000743bbb0 100644
--- a/arch/x86/kernel/cpu/transmeta.c
+++ b/arch/x86/kernel/cpu/transmeta.c
@@ -26,7 +26,7 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
26 26
27 early_init_transmeta(c); 27 early_init_transmeta(c);
28 28
29 display_cacheinfo(c); 29 cpu_detect_cache_sizes(c);
30 30
31 /* Print CMS and CPU revision */ 31 /* Print CMS and CPU revision */
32 max = cpuid_eax(0x80860000); 32 max = cpuid_eax(0x80860000);
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 6a52d4b36a30..83e5e628de73 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -116,21 +116,16 @@ static int cpuid_open(struct inode *inode, struct file *file)
116{ 116{
117 unsigned int cpu; 117 unsigned int cpu;
118 struct cpuinfo_x86 *c; 118 struct cpuinfo_x86 *c;
119 int ret = 0;
120
121 lock_kernel();
122 119
123 cpu = iminor(file->f_path.dentry->d_inode); 120 cpu = iminor(file->f_path.dentry->d_inode);
124 if (cpu >= nr_cpu_ids || !cpu_online(cpu)) { 121 if (cpu >= nr_cpu_ids || !cpu_online(cpu))
125 ret = -ENXIO; /* No such CPU */ 122 return -ENXIO; /* No such CPU */
126 goto out; 123
127 }
128 c = &cpu_data(cpu); 124 c = &cpu_data(cpu);
129 if (c->cpuid_level < 0) 125 if (c->cpuid_level < 0)
130 ret = -EIO; /* CPUID not supported */ 126 return -EIO; /* CPUID not supported */
131out: 127
132 unlock_kernel(); 128 return 0;
133 return ret;
134} 129}
135 130
136/* 131/*
@@ -192,7 +187,8 @@ static int __init cpuid_init(void)
192 int i, err = 0; 187 int i, err = 0;
193 i = 0; 188 i = 0;
194 189
195 if (register_chrdev(CPUID_MAJOR, "cpu/cpuid", &cpuid_fops)) { 190 if (__register_chrdev(CPUID_MAJOR, 0, NR_CPUS,
191 "cpu/cpuid", &cpuid_fops)) {
196 printk(KERN_ERR "cpuid: unable to get major %d for cpuid\n", 192 printk(KERN_ERR "cpuid: unable to get major %d for cpuid\n",
197 CPUID_MAJOR); 193 CPUID_MAJOR);
198 err = -EBUSY; 194 err = -EBUSY;
@@ -221,7 +217,7 @@ out_class:
221 } 217 }
222 class_destroy(cpuid_class); 218 class_destroy(cpuid_class);
223out_chrdev: 219out_chrdev:
224 unregister_chrdev(CPUID_MAJOR, "cpu/cpuid"); 220 __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid");
225out: 221out:
226 return err; 222 return err;
227} 223}
@@ -233,7 +229,7 @@ static void __exit cpuid_exit(void)
233 for_each_online_cpu(cpu) 229 for_each_online_cpu(cpu)
234 cpuid_device_destroy(cpu); 230 cpuid_device_destroy(cpu);
235 class_destroy(cpuid_class); 231 class_destroy(cpuid_class);
236 unregister_chrdev(CPUID_MAJOR, "cpu/cpuid"); 232 __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid");
237 unregister_hotcpu_notifier(&cpuid_class_cpu_notifier); 233 unregister_hotcpu_notifier(&cpuid_class_cpu_notifier);
238} 234}
239 235
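
The cpuid.c hunk above drops the big kernel lock from cpuid_open() and registers the character device with __register_chrdev(), claiming only the minor range it actually uses (0..NR_CPUS-1) instead of the whole major. A minimal sketch of that registration pattern; the major number, device name and empty file_operations are placeholders.

    #include <linux/fs.h>
    #include <linux/init.h>
    #include <linux/module.h>
    #include <linux/threads.h>

    #define EXAMPLE_MAJOR 240       /* placeholder major, not CPUID_MAJOR */

    static const struct file_operations example_fops = {
            .owner = THIS_MODULE,
    };

    static int __init example_chrdev_init(void)
    {
            /* claim minors 0..NR_CPUS-1 only; the rest of the major stays free */
            if (__register_chrdev(EXAMPLE_MAJOR, 0, NR_CPUS,
                                  "cpu/example", &example_fops))
                    return -EBUSY;
            return 0;
    }

    static void __exit example_chrdev_exit(void)
    {
            __unregister_chrdev(EXAMPLE_MAJOR, 0, NR_CPUS, "cpu/example");
    }
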
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 5e409dc298a4..a4849c10a77e 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -27,8 +27,7 @@
27#include <asm/cpu.h> 27#include <asm/cpu.h>
28#include <asm/reboot.h> 28#include <asm/reboot.h>
29#include <asm/virtext.h> 29#include <asm/virtext.h>
30#include <asm/iommu.h> 30#include <asm/x86_init.h>
31
32 31
33#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) 32#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
34 33
@@ -106,7 +105,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
106#endif 105#endif
107 106
108#ifdef CONFIG_X86_64 107#ifdef CONFIG_X86_64
109 pci_iommu_shutdown(); 108 x86_platform.iommu_shutdown();
110#endif 109#endif
111 110
112 crash_save_cpu(regs, safe_smp_processor_id()); 111 crash_save_cpu(regs, safe_smp_processor_id());
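
crash.c now shuts the IOMMU down through the x86_platform.iommu_shutdown() hook rather than calling pci_iommu_shutdown() directly, so the crash path no longer needs to know which IOMMU driver is active. The sketch below shows the general ops-struct indirection this relies on; the struct, field and function names are illustrative, not the kernel's x86_platform_ops definition.

    struct platform_ops_example {
            void (*iommu_shutdown)(void);
    };

    static void iommu_shutdown_noop(void) { }

    static struct platform_ops_example platform_example = {
            .iommu_shutdown = iommu_shutdown_noop,  /* safe default: do nothing */
    };

    /* an IOMMU driver that needs shutdown work installs its own hook at init */
    static void my_iommu_shutdown(void)
    {
            /* flush and disable the hardware here */
    }

    static void my_iommu_init_example(void)
    {
            platform_example.iommu_shutdown = my_iommu_shutdown;
    }

    /* generic code, e.g. the crash path, just calls through the pointer */
    static void crash_path_example(void)
    {
            platform_example.iommu_shutdown();
    }
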
diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c
index f7cdb3b457aa..cd97ce18c29d 100644
--- a/arch/x86/kernel/crash_dump_32.c
+++ b/arch/x86/kernel/crash_dump_32.c
@@ -16,6 +16,22 @@ static void *kdump_buf_page;
16/* Stores the physical address of elf header of crash image. */ 16/* Stores the physical address of elf header of crash image. */
17unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; 17unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
18 18
19static inline bool is_crashed_pfn_valid(unsigned long pfn)
20{
21#ifndef CONFIG_X86_PAE
22 /*
23 * non-PAE kdump kernel executed from a PAE one will crop high pte
24 * bits and poke unwanted space counting again from address 0, we
25 * don't want that. pte must fit into unsigned long. In fact the
26 * test checks high 12 bits for being zero (pfn will be shifted left
27 * by PAGE_SHIFT).
28 */
29 return pte_pfn(pfn_pte(pfn, __pgprot(0))) == pfn;
30#else
31 return true;
32#endif
33}
34
19/** 35/**
20 * copy_oldmem_page - copy one page from "oldmem" 36 * copy_oldmem_page - copy one page from "oldmem"
21 * @pfn: page frame number to be copied 37 * @pfn: page frame number to be copied
@@ -41,6 +57,9 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
41 if (!csize) 57 if (!csize)
42 return 0; 58 return 0;
43 59
60 if (!is_crashed_pfn_valid(pfn))
61 return -EFAULT;
62
44 vaddr = kmap_atomic_pfn(pfn, KM_PTE0); 63 vaddr = kmap_atomic_pfn(pfn, KM_PTE0);
45 64
46 if (!userbuf) { 65 if (!userbuf) {
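
The new is_crashed_pfn_valid() guards copy_oldmem_page() on non-PAE kernels: a non-PAE pte has only 32 - PAGE_SHIFT = 20 bits for the pfn, so a pfn handed over from a PAE crash setup that needs more bits would be silently truncated by pfn_pte(). The standalone snippet below restates that round-trip check as plain arithmetic; the macro names are illustrative and the constants are the usual x86 values.

    #define EXAMPLE_PAGE_SHIFT 12
    #define EXAMPLE_PFN_BITS   (32 - EXAMPLE_PAGE_SHIFT)    /* 20 on non-PAE x86 */

    /* equivalent to pte_pfn(pfn_pte(pfn, ...)) == pfn in the patch */
    static int pfn_fits_non_pae(unsigned long long pfn)
    {
            return (pfn >> EXAMPLE_PFN_BITS) == 0;
    }

    /* pfn_fits_non_pae(0x000ff)  -> 1  (below 4GB, representable)          */
    /* pfn_fits_non_pae(0x123456) -> 0  (>= 4GB, high bits would be lost)   */
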
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index ef42a038f1a6..1c47390dd0e5 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -265,13 +265,13 @@ struct ds_context {
265 int cpu; 265 int cpu;
266}; 266};
267 267
268static DEFINE_PER_CPU(struct ds_context *, cpu_context); 268static DEFINE_PER_CPU(struct ds_context *, cpu_ds_context);
269 269
270 270
271static struct ds_context *ds_get_context(struct task_struct *task, int cpu) 271static struct ds_context *ds_get_context(struct task_struct *task, int cpu)
272{ 272{
273 struct ds_context **p_context = 273 struct ds_context **p_context =
274 (task ? &task->thread.ds_ctx : &per_cpu(cpu_context, cpu)); 274 (task ? &task->thread.ds_ctx : &per_cpu(cpu_ds_context, cpu));
275 struct ds_context *context = NULL; 275 struct ds_context *context = NULL;
276 struct ds_context *new_context = NULL; 276 struct ds_context *new_context = NULL;
277 277
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 2d8a371d4339..6d817554780a 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -109,6 +109,32 @@ print_context_stack(struct thread_info *tinfo,
109 } 109 }
110 return bp; 110 return bp;
111} 111}
112EXPORT_SYMBOL_GPL(print_context_stack);
113
114unsigned long
115print_context_stack_bp(struct thread_info *tinfo,
116 unsigned long *stack, unsigned long bp,
117 const struct stacktrace_ops *ops, void *data,
118 unsigned long *end, int *graph)
119{
120 struct stack_frame *frame = (struct stack_frame *)bp;
121 unsigned long *ret_addr = &frame->return_address;
122
123 while (valid_stack_ptr(tinfo, ret_addr, sizeof(*ret_addr), end)) {
124 unsigned long addr = *ret_addr;
125
126 if (!__kernel_text_address(addr))
127 break;
128
129 ops->address(data, addr, 1);
130 frame = frame->next_frame;
131 ret_addr = &frame->return_address;
132 print_ftrace_graph_addr(addr, data, ops, tinfo, graph);
133 }
134
135 return (unsigned long)frame;
136}
137EXPORT_SYMBOL_GPL(print_context_stack_bp);
112 138
113 139
114static void 140static void
@@ -141,10 +167,11 @@ static void print_trace_address(void *data, unsigned long addr, int reliable)
141} 167}
142 168
143static const struct stacktrace_ops print_trace_ops = { 169static const struct stacktrace_ops print_trace_ops = {
144 .warning = print_trace_warning, 170 .warning = print_trace_warning,
145 .warning_symbol = print_trace_warning_symbol, 171 .warning_symbol = print_trace_warning_symbol,
146 .stack = print_trace_stack, 172 .stack = print_trace_stack,
147 .address = print_trace_address, 173 .address = print_trace_address,
174 .walk_stack = print_context_stack,
148}; 175};
149 176
150void 177void
@@ -188,7 +215,7 @@ void dump_stack(void)
188} 215}
189EXPORT_SYMBOL(dump_stack); 216EXPORT_SYMBOL(dump_stack);
190 217
191static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; 218static arch_spinlock_t die_lock = __ARCH_SPIN_LOCK_UNLOCKED;
192static int die_owner = -1; 219static int die_owner = -1;
193static unsigned int die_nest_count; 220static unsigned int die_nest_count;
194 221
@@ -207,11 +234,11 @@ unsigned __kprobes long oops_begin(void)
207 /* racy, but better than risking deadlock. */ 234 /* racy, but better than risking deadlock. */
208 raw_local_irq_save(flags); 235 raw_local_irq_save(flags);
209 cpu = smp_processor_id(); 236 cpu = smp_processor_id();
210 if (!__raw_spin_trylock(&die_lock)) { 237 if (!arch_spin_trylock(&die_lock)) {
211 if (cpu == die_owner) 238 if (cpu == die_owner)
212 /* nested oops. should stop eventually */; 239 /* nested oops. should stop eventually */;
213 else 240 else
214 __raw_spin_lock(&die_lock); 241 arch_spin_lock(&die_lock);
215 } 242 }
216 die_nest_count++; 243 die_nest_count++;
217 die_owner = cpu; 244 die_owner = cpu;
@@ -231,7 +258,7 @@ void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
231 die_nest_count--; 258 die_nest_count--;
232 if (!die_nest_count) 259 if (!die_nest_count)
233 /* Nest count reaches zero, release the lock. */ 260 /* Nest count reaches zero, release the lock. */
234 __raw_spin_unlock(&die_lock); 261 arch_spin_unlock(&die_lock);
235 raw_local_irq_restore(flags); 262 raw_local_irq_restore(flags);
236 oops_exit(); 263 oops_exit();
237 264
@@ -268,11 +295,12 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err)
268 295
269 show_registers(regs); 296 show_registers(regs);
270#ifdef CONFIG_X86_32 297#ifdef CONFIG_X86_32
271 sp = (unsigned long) (&regs->sp); 298 if (user_mode_vm(regs)) {
272 savesegment(ss, ss);
273 if (user_mode(regs)) {
274 sp = regs->sp; 299 sp = regs->sp;
275 ss = regs->ss & 0xffff; 300 ss = regs->ss & 0xffff;
301 } else {
302 sp = kernel_stack_pointer(regs);
303 savesegment(ss, ss);
276 } 304 }
277 printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip); 305 printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
278 print_symbol("%s", regs->ip); 306 print_symbol("%s", regs->ip);
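
dumpstack.c gains print_context_stack_bp(), a frame-pointer-based walker that stacktrace_ops users select through the new walk_stack callback: each frame begins with the saved frame pointer followed by the return address, so the walker reports the return address, checks that it is kernel text, and hops to the next frame. A minimal standalone sketch of that loop; the struct name and the callbacks stand in for the kernel's stack_frame, valid_stack_ptr() and __kernel_text_address() machinery.

    struct stack_frame_example {
            struct stack_frame_example *next_frame; /* saved caller %ebp/%rbp */
            unsigned long return_address;
    };

    static void walk_frames(unsigned long bp,
                            int (*addr_ok)(unsigned long),
                            void (*report)(unsigned long))
    {
            struct stack_frame_example *frame = (struct stack_frame_example *)bp;

            /* stop at the first frame whose return address fails validation */
            while (frame && addr_ok(frame->return_address)) {
                    report(frame->return_address);
                    frame = frame->next_frame;
            }
    }
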
diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h
index 81086c227ab7..4fd1420faffa 100644
--- a/arch/x86/kernel/dumpstack.h
+++ b/arch/x86/kernel/dumpstack.h
@@ -14,12 +14,6 @@
14#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :) 14#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
15#endif 15#endif
16 16
17extern unsigned long
18print_context_stack(struct thread_info *tinfo,
19 unsigned long *stack, unsigned long bp,
20 const struct stacktrace_ops *ops, void *data,
21 unsigned long *end, int *graph);
22
23extern void 17extern void
24show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, 18show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
25 unsigned long *stack, unsigned long bp, char *log_lvl); 19 unsigned long *stack, unsigned long bp, char *log_lvl);
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index f7dd2a7c3bf4..ae775ca47b25 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -10,9 +10,9 @@
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/ptrace.h> 11#include <linux/ptrace.h>
12#include <linux/kexec.h> 12#include <linux/kexec.h>
13#include <linux/sysfs.h>
13#include <linux/bug.h> 14#include <linux/bug.h>
14#include <linux/nmi.h> 15#include <linux/nmi.h>
15#include <linux/sysfs.h>
16 16
17#include <asm/stacktrace.h> 17#include <asm/stacktrace.h>
18 18
@@ -35,6 +35,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
35 35
36 if (!stack) { 36 if (!stack) {
37 unsigned long dummy; 37 unsigned long dummy;
38
38 stack = &dummy; 39 stack = &dummy;
39 if (task && task != current) 40 if (task && task != current)
40 stack = (unsigned long *)task->thread.sp; 41 stack = (unsigned long *)task->thread.sp;
@@ -57,8 +58,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
57 58
58 context = (struct thread_info *) 59 context = (struct thread_info *)
59 ((unsigned long)stack & (~(THREAD_SIZE - 1))); 60 ((unsigned long)stack & (~(THREAD_SIZE - 1)));
60 bp = print_context_stack(context, stack, bp, ops, 61 bp = ops->walk_stack(context, stack, bp, ops, data, NULL, &graph);
61 data, NULL, &graph);
62 62
63 stack = (unsigned long *)context->previous_esp; 63 stack = (unsigned long *)context->previous_esp;
64 if (!stack) 64 if (!stack)
@@ -72,7 +72,7 @@ EXPORT_SYMBOL(dump_trace);
72 72
73void 73void
74show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, 74show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
75 unsigned long *sp, unsigned long bp, char *log_lvl) 75 unsigned long *sp, unsigned long bp, char *log_lvl)
76{ 76{
77 unsigned long *stack; 77 unsigned long *stack;
78 int i; 78 int i;
@@ -156,4 +156,3 @@ int is_valid_bugaddr(unsigned long ip)
156 156
157 return ud2 == 0x0b0f; 157 return ud2 == 0x0b0f;
158} 158}
159
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index a071e6be177e..0ad9597073f5 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -10,26 +10,28 @@
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/ptrace.h> 11#include <linux/ptrace.h>
12#include <linux/kexec.h> 12#include <linux/kexec.h>
13#include <linux/sysfs.h>
13#include <linux/bug.h> 14#include <linux/bug.h>
14#include <linux/nmi.h> 15#include <linux/nmi.h>
15#include <linux/sysfs.h>
16 16
17#include <asm/stacktrace.h> 17#include <asm/stacktrace.h>
18 18
19#include "dumpstack.h" 19#include "dumpstack.h"
20 20
21#define N_EXCEPTION_STACKS_END \
22 (N_EXCEPTION_STACKS + DEBUG_STKSZ/EXCEPTION_STKSZ - 2)
21 23
22static char x86_stack_ids[][8] = { 24static char x86_stack_ids[][8] = {
23 [DEBUG_STACK - 1] = "#DB", 25 [ DEBUG_STACK-1 ] = "#DB",
24 [NMI_STACK - 1] = "NMI", 26 [ NMI_STACK-1 ] = "NMI",
25 [DOUBLEFAULT_STACK - 1] = "#DF", 27 [ DOUBLEFAULT_STACK-1 ] = "#DF",
26 [STACKFAULT_STACK - 1] = "#SS", 28 [ STACKFAULT_STACK-1 ] = "#SS",
27 [MCE_STACK - 1] = "#MC", 29 [ MCE_STACK-1 ] = "#MC",
28#if DEBUG_STKSZ > EXCEPTION_STKSZ 30#if DEBUG_STKSZ > EXCEPTION_STKSZ
29 [N_EXCEPTION_STACKS ... 31 [ N_EXCEPTION_STACKS ...
30 N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]" 32 N_EXCEPTION_STACKS_END ] = "#DB[?]"
31#endif 33#endif
32 }; 34};
33 35
34int x86_is_stack_id(int id, char *name) 36int x86_is_stack_id(int id, char *name)
35{ 37{
@@ -37,7 +39,7 @@ int x86_is_stack_id(int id, char *name)
37} 39}
38 40
39static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, 41static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
40 unsigned *usedp, char **idp) 42 unsigned *usedp, char **idp)
41{ 43{
42 unsigned k; 44 unsigned k;
43 45
@@ -101,6 +103,35 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
101 return NULL; 103 return NULL;
102} 104}
103 105
106static inline int
107in_irq_stack(unsigned long *stack, unsigned long *irq_stack,
108 unsigned long *irq_stack_end)
109{
110 return (stack >= irq_stack && stack < irq_stack_end);
111}
112
113/*
114 * We are returning from the irq stack and go to the previous one.
115 * If the previous stack is also in the irq stack, then bp in the first
116 * frame of the irq stack points to the previous, interrupted one.
117 * Otherwise we have another level of indirection: We first save
118 * the bp of the previous stack, then we switch the stack to the irq one
119 * and save a new bp that links to the previous one.
120 * (See save_args())
121 */
122static inline unsigned long
123fixup_bp_irq_link(unsigned long bp, unsigned long *stack,
124 unsigned long *irq_stack, unsigned long *irq_stack_end)
125{
126#ifdef CONFIG_FRAME_POINTER
127 struct stack_frame *frame = (struct stack_frame *)bp;
128
129 if (!in_irq_stack(stack, irq_stack, irq_stack_end))
130 return (unsigned long)frame->next_frame;
131#endif
132 return bp;
133}
134
104/* 135/*
105 * x86-64 can have up to three kernel stacks: 136 * x86-64 can have up to three kernel stacks:
106 * process stack 137 * process stack
@@ -157,8 +188,8 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
157 if (ops->stack(data, id) < 0) 188 if (ops->stack(data, id) < 0)
158 break; 189 break;
159 190
160 bp = print_context_stack(tinfo, stack, bp, ops, 191 bp = ops->walk_stack(tinfo, stack, bp, ops,
161 data, estack_end, &graph); 192 data, estack_end, &graph);
162 ops->stack(data, "<EOE>"); 193 ops->stack(data, "<EOE>");
163 /* 194 /*
164 * We link to the next stack via the 195 * We link to the next stack via the
@@ -173,7 +204,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
173 irq_stack = irq_stack_end - 204 irq_stack = irq_stack_end -
174 (IRQ_STACK_SIZE - 64) / sizeof(*irq_stack); 205 (IRQ_STACK_SIZE - 64) / sizeof(*irq_stack);
175 206
176 if (stack >= irq_stack && stack < irq_stack_end) { 207 if (in_irq_stack(stack, irq_stack, irq_stack_end)) {
177 if (ops->stack(data, "IRQ") < 0) 208 if (ops->stack(data, "IRQ") < 0)
178 break; 209 break;
179 bp = print_context_stack(tinfo, stack, bp, 210 bp = print_context_stack(tinfo, stack, bp,
@@ -184,6 +215,8 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
184 * pointer (index -1 to end) in the IRQ stack: 215 * pointer (index -1 to end) in the IRQ stack:
185 */ 216 */
186 stack = (unsigned long *) (irq_stack_end[-1]); 217 stack = (unsigned long *) (irq_stack_end[-1]);
218 bp = fixup_bp_irq_link(bp, stack, irq_stack,
219 irq_stack_end);
187 irq_stack_end = NULL; 220 irq_stack_end = NULL;
188 ops->stack(data, "EOI"); 221 ops->stack(data, "EOI");
189 continue; 222 continue;
@@ -202,21 +235,24 @@ EXPORT_SYMBOL(dump_trace);
202 235
203void 236void
204show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, 237show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
205 unsigned long *sp, unsigned long bp, char *log_lvl) 238 unsigned long *sp, unsigned long bp, char *log_lvl)
206{ 239{
240 unsigned long *irq_stack_end;
241 unsigned long *irq_stack;
207 unsigned long *stack; 242 unsigned long *stack;
243 int cpu;
208 int i; 244 int i;
209 const int cpu = smp_processor_id(); 245
210 unsigned long *irq_stack_end = 246 preempt_disable();
211 (unsigned long *)(per_cpu(irq_stack_ptr, cpu)); 247 cpu = smp_processor_id();
212 unsigned long *irq_stack = 248
213 (unsigned long *)(per_cpu(irq_stack_ptr, cpu) - IRQ_STACK_SIZE); 249 irq_stack_end = (unsigned long *)(per_cpu(irq_stack_ptr, cpu));
250 irq_stack = (unsigned long *)(per_cpu(irq_stack_ptr, cpu) - IRQ_STACK_SIZE);
214 251
215 /* 252 /*
216 * debugging aid: "show_stack(NULL, NULL);" prints the 253 * Debugging aid: "show_stack(NULL, NULL);" prints the
217 * back trace for this cpu. 254 * back trace for this cpu:
218 */ 255 */
219
220 if (sp == NULL) { 256 if (sp == NULL) {
221 if (task) 257 if (task)
222 sp = (unsigned long *)task->thread.sp; 258 sp = (unsigned long *)task->thread.sp;
@@ -240,6 +276,8 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
240 printk(" %016lx", *stack++); 276 printk(" %016lx", *stack++);
241 touch_nmi_watchdog(); 277 touch_nmi_watchdog();
242 } 278 }
279 preempt_enable();
280
243 printk("\n"); 281 printk("\n");
244 show_trace_log_lvl(task, regs, sp, bp, log_lvl); 282 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
245} 283}
@@ -303,4 +341,3 @@ int is_valid_bugaddr(unsigned long ip)
303 341
304 return ud2 == 0x0b0f; 342 return ud2 == 0x0b0f;
305} 343}
306
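
show_stack_log_lvl() in dumpstack_64.c now brackets its smp_processor_id() and per-CPU irq_stack_ptr lookups with preempt_disable()/preempt_enable(), since the CPU number and its per-CPU data are only stable while preemption is off. A small sketch of that idiom with a made-up per-CPU variable; the per-CPU and preemption helpers are the kernel's real ones.

    #include <linux/percpu.h>
    #include <linux/preempt.h>
    #include <linux/smp.h>

    static DEFINE_PER_CPU(unsigned long, example_counter);

    static unsigned long read_this_cpu_counter(void)
    {
            unsigned long val;
            int cpu;

            preempt_disable();              /* pin the task to this CPU */
            cpu = smp_processor_id();
            val = per_cpu(example_counter, cpu);
            preempt_enable();

            return val;
    }
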
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 230687ba5ba5..a966b753e496 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -732,7 +732,7 @@ core_initcall(e820_mark_nvs_memory);
732/* 732/*
733 * Early reserved memory areas. 733 * Early reserved memory areas.
734 */ 734 */
735#define MAX_EARLY_RES 20 735#define MAX_EARLY_RES 32
736 736
737struct early_res { 737struct early_res {
738 u64 start, end; 738 u64 start, end;
@@ -740,7 +740,16 @@ struct early_res {
740 char overlap_ok; 740 char overlap_ok;
741}; 741};
742static struct early_res early_res[MAX_EARLY_RES] __initdata = { 742static struct early_res early_res[MAX_EARLY_RES] __initdata = {
743 { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */ 743 { 0, PAGE_SIZE, "BIOS data page", 1 }, /* BIOS data page */
744#if defined(CONFIG_X86_32) && defined(CONFIG_X86_TRAMPOLINE)
745 /*
746 * But first pinch a few for the stack/trampoline stuff
747 * FIXME: Don't need the extra page at 4K, but need to fix
748 * trampoline before removing it. (see the GDT stuff)
749 */
750 { PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE", 1 },
751#endif
752
744 {} 753 {}
745}; 754};
746 755
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index ad5bd988fb79..cdcfb122f256 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -454,8 +454,10 @@ void __init efi_init(void)
454 if (add_efi_memmap) 454 if (add_efi_memmap)
455 do_add_efi_memmap(); 455 do_add_efi_memmap();
456 456
457#ifdef CONFIG_X86_32
457 x86_platform.get_wallclock = efi_get_time; 458 x86_platform.get_wallclock = efi_get_time;
458 x86_platform.set_wallclock = efi_set_rtc_mmss; 459 x86_platform.set_wallclock = efi_set_rtc_mmss;
460#endif
459 461
460 /* Setup for EFI runtime service */ 462 /* Setup for EFI runtime service */
461 reboot_type = BOOT_EFI; 463 reboot_type = BOOT_EFI;
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index c097e7d607c6..44a8e0dc6737 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -334,6 +334,10 @@ ENTRY(ret_from_fork)
334END(ret_from_fork) 334END(ret_from_fork)
335 335
336/* 336/*
337 * Interrupt exit functions should be protected against kprobes
338 */
339 .pushsection .kprobes.text, "ax"
340/*
337 * Return to user mode is not as complex as all this looks, 341 * Return to user mode is not as complex as all this looks,
338 * but we want the default path for a system call return to 342 * but we want the default path for a system call return to
339 * go as quickly as possible which is why some of this is 343 * go as quickly as possible which is why some of this is
@@ -383,6 +387,10 @@ need_resched:
383END(resume_kernel) 387END(resume_kernel)
384#endif 388#endif
385 CFI_ENDPROC 389 CFI_ENDPROC
390/*
391 * End of kprobes section
392 */
393 .popsection
386 394
387/* SYSENTER_RETURN points to after the "sysenter" instruction in 395/* SYSENTER_RETURN points to after the "sysenter" instruction in
388 the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */ 396 the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */
@@ -513,6 +521,10 @@ sysexit_audit:
513 PTGS_TO_GS_EX 521 PTGS_TO_GS_EX
514ENDPROC(ia32_sysenter_target) 522ENDPROC(ia32_sysenter_target)
515 523
524/*
525 * syscall stub including irq exit should be protected against kprobes
526 */
527 .pushsection .kprobes.text, "ax"
516 # system call handler stub 528 # system call handler stub
517ENTRY(system_call) 529ENTRY(system_call)
518 RING0_INT_FRAME # can't unwind into user space anyway 530 RING0_INT_FRAME # can't unwind into user space anyway
@@ -705,26 +717,69 @@ syscall_badsys:
705 jmp resume_userspace 717 jmp resume_userspace
706END(syscall_badsys) 718END(syscall_badsys)
707 CFI_ENDPROC 719 CFI_ENDPROC
720/*
721 * End of kprobes section
722 */
723 .popsection
708 724
709/* 725/*
710 * System calls that need a pt_regs pointer. 726 * System calls that need a pt_regs pointer.
711 */ 727 */
712#define PTREGSCALL(name) \ 728#define PTREGSCALL0(name) \
713 ALIGN; \ 729 ALIGN; \
714ptregs_##name: \ 730ptregs_##name: \
715 leal 4(%esp),%eax; \ 731 leal 4(%esp),%eax; \
716 jmp sys_##name; 732 jmp sys_##name;
717 733
718PTREGSCALL(iopl) 734#define PTREGSCALL1(name) \
719PTREGSCALL(fork) 735 ALIGN; \
720PTREGSCALL(clone) 736ptregs_##name: \
721PTREGSCALL(vfork) 737 leal 4(%esp),%edx; \
722PTREGSCALL(execve) 738 movl (PT_EBX+4)(%esp),%eax; \
723PTREGSCALL(sigaltstack) 739 jmp sys_##name;
724PTREGSCALL(sigreturn) 740
725PTREGSCALL(rt_sigreturn) 741#define PTREGSCALL2(name) \
726PTREGSCALL(vm86) 742 ALIGN; \
727PTREGSCALL(vm86old) 743ptregs_##name: \
744 leal 4(%esp),%ecx; \
745 movl (PT_ECX+4)(%esp),%edx; \
746 movl (PT_EBX+4)(%esp),%eax; \
747 jmp sys_##name;
748
749#define PTREGSCALL3(name) \
750 ALIGN; \
751ptregs_##name: \
752 leal 4(%esp),%eax; \
753 pushl %eax; \
754 movl PT_EDX(%eax),%ecx; \
755 movl PT_ECX(%eax),%edx; \
756 movl PT_EBX(%eax),%eax; \
757 call sys_##name; \
758 addl $4,%esp; \
759 ret
760
761PTREGSCALL1(iopl)
762PTREGSCALL0(fork)
763PTREGSCALL0(vfork)
764PTREGSCALL3(execve)
765PTREGSCALL2(sigaltstack)
766PTREGSCALL0(sigreturn)
767PTREGSCALL0(rt_sigreturn)
768PTREGSCALL2(vm86)
769PTREGSCALL1(vm86old)
770
771/* Clone is an oddball. The 4th arg is in %edi */
772 ALIGN;
773ptregs_clone:
774 leal 4(%esp),%eax
775 pushl %eax
776 pushl PT_EDI(%eax)
777 movl PT_EDX(%eax),%ecx
778 movl PT_ECX(%eax),%edx
779 movl PT_EBX(%eax),%eax
780 call sys_clone
781 addl $8,%esp
782 ret
728 783
729.macro FIXUP_ESPFIX_STACK 784.macro FIXUP_ESPFIX_STACK
730/* 785/*
@@ -814,6 +869,10 @@ common_interrupt:
814ENDPROC(common_interrupt) 869ENDPROC(common_interrupt)
815 CFI_ENDPROC 870 CFI_ENDPROC
816 871
872/*
873 * Irq entries should be protected against kprobes
874 */
875 .pushsection .kprobes.text, "ax"
817#define BUILD_INTERRUPT3(name, nr, fn) \ 876#define BUILD_INTERRUPT3(name, nr, fn) \
818ENTRY(name) \ 877ENTRY(name) \
819 RING0_INT_FRAME; \ 878 RING0_INT_FRAME; \
@@ -980,16 +1039,16 @@ ENTRY(spurious_interrupt_bug)
980 jmp error_code 1039 jmp error_code
981 CFI_ENDPROC 1040 CFI_ENDPROC
982END(spurious_interrupt_bug) 1041END(spurious_interrupt_bug)
1042/*
1043 * End of kprobes section
1044 */
1045 .popsection
983 1046
984ENTRY(kernel_thread_helper) 1047ENTRY(kernel_thread_helper)
985 pushl $0 # fake return address for unwinder 1048 pushl $0 # fake return address for unwinder
986 CFI_STARTPROC 1049 CFI_STARTPROC
987 movl %edx,%eax 1050 movl %edi,%eax
988 push %edx 1051 call *%esi
989 CFI_ADJUST_CFA_OFFSET 4
990 call *%ebx
991 push %eax
992 CFI_ADJUST_CFA_OFFSET 4
993 call do_exit 1052 call do_exit
994 ud2 # padding for call trace 1053 ud2 # padding for call trace
995 CFI_ENDPROC 1054 CFI_ENDPROC
@@ -1185,17 +1244,14 @@ END(ftrace_graph_caller)
1185 1244
1186.globl return_to_handler 1245.globl return_to_handler
1187return_to_handler: 1246return_to_handler:
1188 pushl $0
1189 pushl %eax 1247 pushl %eax
1190 pushl %ecx
1191 pushl %edx 1248 pushl %edx
1192 movl %ebp, %eax 1249 movl %ebp, %eax
1193 call ftrace_return_to_handler 1250 call ftrace_return_to_handler
1194 movl %eax, 0xc(%esp) 1251 movl %eax, %ecx
1195 popl %edx 1252 popl %edx
1196 popl %ecx
1197 popl %eax 1253 popl %eax
1198 ret 1254 jmp *%ecx
1199#endif 1255#endif
1200 1256
1201.section .rodata,"a" 1257.section .rodata,"a"
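
The entry_32.S changes replace the single PTREGSCALL macro with PTREGSCALL0..3 variants that load the right number of user arguments from the saved pt_regs (EBX, ECX, EDX) into the registers the regparm(3) C handlers expect, alongside a pointer to the frame, with clone handled by hand because its extra argument lives in EDI. The C-level sketch below shows what a PTREGSCALL2 stub arranges; sys_example and its prototype are hypothetical, the real handlers (sys_vm86, sys_sigaltstack, ...) define their own signatures.

    #include <asm/ptrace.h>

    /* hypothetical two-argument handler that also wants the register frame */
    long sys_example(unsigned long arg1, unsigned long arg2,
                     struct pt_regs *regs);

    static long ptregs_example(struct pt_regs *regs)
    {
            /* PTREGSCALL2: eax <- saved EBX, edx <- saved ECX, ecx <- &pt_regs */
            return sys_example(regs->bx, regs->cx, regs);
    }
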
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index b5c061f8f358..0697ff139837 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -155,11 +155,11 @@ GLOBAL(return_to_handler)
155 155
156 call ftrace_return_to_handler 156 call ftrace_return_to_handler
157 157
158 movq %rax, 16(%rsp) 158 movq %rax, %rdi
159 movq 8(%rsp), %rdx 159 movq 8(%rsp), %rdx
160 movq (%rsp), %rax 160 movq (%rsp), %rax
161 addq $16, %rsp 161 addq $24, %rsp
162 retq 162 jmp *%rdi
163#endif 163#endif
164 164
165 165
@@ -803,6 +803,10 @@ END(interrupt)
803 call \func 803 call \func
804 .endm 804 .endm
805 805
806/*
807 * Interrupt entry/exit should be protected against kprobes
808 */
809 .pushsection .kprobes.text, "ax"
806 /* 810 /*
807 * The interrupt stubs push (~vector+0x80) onto the stack and 811 * The interrupt stubs push (~vector+0x80) onto the stack and
808 * then jump to common_interrupt. 812 * then jump to common_interrupt.
@@ -941,6 +945,10 @@ ENTRY(retint_kernel)
941 945
942 CFI_ENDPROC 946 CFI_ENDPROC
943END(common_interrupt) 947END(common_interrupt)
948/*
949 * End of kprobes section
950 */
951 .popsection
944 952
945/* 953/*
946 * APIC interrupts. 954 * APIC interrupts.
@@ -969,8 +977,8 @@ apicinterrupt UV_BAU_MESSAGE \
969#endif 977#endif
970apicinterrupt LOCAL_TIMER_VECTOR \ 978apicinterrupt LOCAL_TIMER_VECTOR \
971 apic_timer_interrupt smp_apic_timer_interrupt 979 apic_timer_interrupt smp_apic_timer_interrupt
972apicinterrupt GENERIC_INTERRUPT_VECTOR \ 980apicinterrupt X86_PLATFORM_IPI_VECTOR \
973 generic_interrupt smp_generic_interrupt 981 x86_platform_ipi smp_x86_platform_ipi
974 982
975#ifdef CONFIG_SMP 983#ifdef CONFIG_SMP
976apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \ 984apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \
@@ -1068,10 +1076,10 @@ ENTRY(\sym)
1068 TRACE_IRQS_OFF 1076 TRACE_IRQS_OFF
1069 movq %rsp,%rdi /* pt_regs pointer */ 1077 movq %rsp,%rdi /* pt_regs pointer */
1070 xorl %esi,%esi /* no error code */ 1078 xorl %esi,%esi /* no error code */
1071 PER_CPU(init_tss, %rbp) 1079 PER_CPU(init_tss, %r12)
1072 subq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%rbp) 1080 subq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%r12)
1073 call \do_sym 1081 call \do_sym
1074 addq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%rbp) 1082 addq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%r12)
1075 jmp paranoid_exit /* %ebx: no swapgs flag */ 1083 jmp paranoid_exit /* %ebx: no swapgs flag */
1076 CFI_ENDPROC 1084 CFI_ENDPROC
1077END(\sym) 1085END(\sym)
@@ -1158,63 +1166,20 @@ bad_gs:
1158 jmp 2b 1166 jmp 2b
1159 .previous 1167 .previous
1160 1168
1161/* 1169ENTRY(kernel_thread_helper)
1162 * Create a kernel thread.
1163 *
1164 * C extern interface:
1165 * extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
1166 *
1167 * asm input arguments:
1168 * rdi: fn, rsi: arg, rdx: flags
1169 */
1170ENTRY(kernel_thread)
1171 CFI_STARTPROC
1172 FAKE_STACK_FRAME $child_rip
1173 SAVE_ALL
1174
1175 # rdi: flags, rsi: usp, rdx: will be &pt_regs
1176 movq %rdx,%rdi
1177 orq kernel_thread_flags(%rip),%rdi
1178 movq $-1, %rsi
1179 movq %rsp, %rdx
1180
1181 xorl %r8d,%r8d
1182 xorl %r9d,%r9d
1183
1184 # clone now
1185 call do_fork
1186 movq %rax,RAX(%rsp)
1187 xorl %edi,%edi
1188
1189 /*
1190 * It isn't worth to check for reschedule here,
1191 * so internally to the x86_64 port you can rely on kernel_thread()
1192 * not to reschedule the child before returning, this avoids the need
1193 * of hacks for example to fork off the per-CPU idle tasks.
1194 * [Hopefully no generic code relies on the reschedule -AK]
1195 */
1196 RESTORE_ALL
1197 UNFAKE_STACK_FRAME
1198 ret
1199 CFI_ENDPROC
1200END(kernel_thread)
1201
1202ENTRY(child_rip)
1203 pushq $0 # fake return address 1170 pushq $0 # fake return address
1204 CFI_STARTPROC 1171 CFI_STARTPROC
1205 /* 1172 /*
1206 * Here we are in the child and the registers are set as they were 1173 * Here we are in the child and the registers are set as they were
1207 * at kernel_thread() invocation in the parent. 1174 * at kernel_thread() invocation in the parent.
1208 */ 1175 */
1209 movq %rdi, %rax 1176 call *%rsi
1210 movq %rsi, %rdi
1211 call *%rax
1212 # exit 1177 # exit
1213 mov %eax, %edi 1178 mov %eax, %edi
1214 call do_exit 1179 call do_exit
1215 ud2 # padding for call trace 1180 ud2 # padding for call trace
1216 CFI_ENDPROC 1181 CFI_ENDPROC
1217END(child_rip) 1182END(kernel_thread_helper)
1218 1183
1219/* 1184/*
1220 * execve(). This function needs to use IRET, not SYSRET, to set up all state properly. 1185 * execve(). This function needs to use IRET, not SYSRET, to set up all state properly.
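Of note for the large removal above: kernel_thread() does not disappear, it moves to C, and the assembly keeps only kernel_thread_helper, which calls the function left in %rsi with %rdi as its argument. A hedged sketch of the 64-bit C side as it looks in this era's arch/x86/kernel/process.c (details are recalled, not part of this hunk; 32-bit additionally seeds the data segment registers):

    int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
    {
            struct pt_regs regs;

            memset(&regs, 0, sizeof(regs));

            /* kernel_thread_helper picks these up: fn in si, arg in di */
            regs.si = (unsigned long) fn;
            regs.di = (unsigned long) arg;

            regs.orig_ax = -1;
            regs.ip = (unsigned long) kernel_thread_helper;
            regs.cs = __KERNEL_CS | get_kernel_rpl();
            regs.flags = X86_EFLAGS_IF | 0x2;

            /* Ok, create the new process.. */
            return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs,
                           0, NULL, NULL);
    }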
@@ -1491,12 +1456,17 @@ error_kernelspace:
1491 leaq irq_return(%rip),%rcx 1456 leaq irq_return(%rip),%rcx
1492 cmpq %rcx,RIP+8(%rsp) 1457 cmpq %rcx,RIP+8(%rsp)
1493 je error_swapgs 1458 je error_swapgs
1494 movl %ecx,%ecx /* zero extend */ 1459 movl %ecx,%eax /* zero extend */
1495 cmpq %rcx,RIP+8(%rsp) 1460 cmpq %rax,RIP+8(%rsp)
1496 je error_swapgs 1461 je bstep_iret
1497 cmpq $gs_change,RIP+8(%rsp) 1462 cmpq $gs_change,RIP+8(%rsp)
1498 je error_swapgs 1463 je error_swapgs
1499 jmp error_sti 1464 jmp error_sti
1465
1466bstep_iret:
1467 /* Fix truncated RIP */
1468 movq %rcx,RIP+8(%rsp)
1469 jmp error_swapgs
1500END(error_entry) 1470END(error_entry)
1501 1471
1502 1472
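The new bstep_iret path in error_entry handles CPUs that report a fault on iret with RIP truncated to its low 32 bits (the K8 B-step iret erratum is the usual example): if the zero-extended value matches irq_return, the full 64-bit address is written back before taking the normal swapgs fixup. A stand-alone illustration of the comparison, assuming nothing beyond plain C:

    #include <stdint.h>

    /* Does the reported RIP look like irq_return with its upper 32 bits
     * dropped?  If so, the full address can be restored, as bstep_iret
     * does above. */
    static int rip_truncated_to_irq_return(uint64_t reported_rip,
                                           uint64_t irq_return_addr)
    {
            return reported_rip == (uint32_t)irq_return_addr;
    }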
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 944e9820b4b5..309689245431 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -9,6 +9,8 @@
9 * the dangers of modifying code on the run. 9 * the dangers of modifying code on the run.
10 */ 10 */
11 11
12#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
13
12#include <linux/spinlock.h> 14#include <linux/spinlock.h>
13#include <linux/hardirq.h> 15#include <linux/hardirq.h>
14#include <linux/uaccess.h> 16#include <linux/uaccess.h>
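The new pr_fmt() definition is why the literal "ftrace: " prefixes disappear from the pr_info() calls below: pr_info(fmt, ...) expands to printk(KERN_INFO pr_fmt(fmt), ...), so every message in this file is prefixed with KBUILD_MODNAME automatically. A minimal sketch of the effect:

    #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
    #include <linux/kernel.h>

    static void example(void)
    {
            /* With KBUILD_MODNAME == "ftrace" this prints
             *   "ftrace: converting mcount calls to jmp . + 5"
             * -- the prefix comes from pr_fmt(), not from this string. */
            pr_info("converting mcount calls to jmp . + 5\n");
    }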
@@ -353,15 +355,15 @@ int __init ftrace_dyn_arch_init(void *data)
353 355
354 switch (faulted) { 356 switch (faulted) {
355 case 0: 357 case 0:
356 pr_info("ftrace: converting mcount calls to 0f 1f 44 00 00\n"); 358 pr_info("converting mcount calls to 0f 1f 44 00 00\n");
357 memcpy(ftrace_nop, ftrace_test_p6nop, MCOUNT_INSN_SIZE); 359 memcpy(ftrace_nop, ftrace_test_p6nop, MCOUNT_INSN_SIZE);
358 break; 360 break;
359 case 1: 361 case 1:
360 pr_info("ftrace: converting mcount calls to 66 66 66 66 90\n"); 362 pr_info("converting mcount calls to 66 66 66 66 90\n");
361 memcpy(ftrace_nop, ftrace_test_nop5, MCOUNT_INSN_SIZE); 363 memcpy(ftrace_nop, ftrace_test_nop5, MCOUNT_INSN_SIZE);
362 break; 364 break;
363 case 2: 365 case 2:
364 pr_info("ftrace: converting mcount calls to jmp . + 5\n"); 366 pr_info("converting mcount calls to jmp . + 5\n");
365 memcpy(ftrace_nop, ftrace_test_jmp, MCOUNT_INSN_SIZE); 367 memcpy(ftrace_nop, ftrace_test_jmp, MCOUNT_INSN_SIZE);
366 break; 368 break;
367 } 369 }
@@ -485,82 +487,10 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr,
485 487
486#ifdef CONFIG_FTRACE_SYSCALLS 488#ifdef CONFIG_FTRACE_SYSCALLS
487 489
488extern unsigned long __start_syscalls_metadata[];
489extern unsigned long __stop_syscalls_metadata[];
490extern unsigned long *sys_call_table; 490extern unsigned long *sys_call_table;
491 491
492static struct syscall_metadata **syscalls_metadata; 492unsigned long __init arch_syscall_addr(int nr)
493
494static struct syscall_metadata *find_syscall_meta(unsigned long *syscall)
495{
496 struct syscall_metadata *start;
497 struct syscall_metadata *stop;
498 char str[KSYM_SYMBOL_LEN];
499
500
501 start = (struct syscall_metadata *)__start_syscalls_metadata;
502 stop = (struct syscall_metadata *)__stop_syscalls_metadata;
503 kallsyms_lookup((unsigned long) syscall, NULL, NULL, NULL, str);
504
505 for ( ; start < stop; start++) {
506 if (start->name && !strcmp(start->name, str))
507 return start;
508 }
509 return NULL;
510}
511
512struct syscall_metadata *syscall_nr_to_meta(int nr)
513{
514 if (!syscalls_metadata || nr >= NR_syscalls || nr < 0)
515 return NULL;
516
517 return syscalls_metadata[nr];
518}
519
520int syscall_name_to_nr(char *name)
521{ 493{
522 int i; 494 return (unsigned long)(&sys_call_table)[nr];
523
524 if (!syscalls_metadata)
525 return -1;
526
527 for (i = 0; i < NR_syscalls; i++) {
528 if (syscalls_metadata[i]) {
529 if (!strcmp(syscalls_metadata[i]->name, name))
530 return i;
531 }
532 }
533 return -1;
534}
535
536void set_syscall_enter_id(int num, int id)
537{
538 syscalls_metadata[num]->enter_id = id;
539}
540
541void set_syscall_exit_id(int num, int id)
542{
543 syscalls_metadata[num]->exit_id = id;
544}
545
546static int __init arch_init_ftrace_syscalls(void)
547{
548 int i;
549 struct syscall_metadata *meta;
550 unsigned long **psys_syscall_table = &sys_call_table;
551
552 syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) *
553 NR_syscalls, GFP_KERNEL);
554 if (!syscalls_metadata) {
555 WARN_ON(1);
556 return -ENOMEM;
557 }
558
559 for (i = 0; i < NR_syscalls; i++) {
560 meta = find_syscall_meta(psys_syscall_table[i]);
561 syscalls_metadata[i] = meta;
562 }
563 return 0;
564} 495}
565arch_initcall(arch_init_ftrace_syscalls);
566#endif 496#endif
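All of the removed metadata bookkeeping moves into the generic ftrace-syscalls core; the architecture now only has to answer "what is the address of syscall nr?", which arch_syscall_addr() does straight from sys_call_table. A hedged sketch of how the generic side can use it (find_syscall_meta_by_name() stands in for the generic lookup and is not a real symbol):

    #include <linux/kallsyms.h>
    #include <trace/syscall.h>

    static struct syscall_metadata *find_syscall_meta_by_name(const char *name);
                                                    /* hypothetical helper */

    static struct syscall_metadata *lookup_meta(int nr)
    {
            char sym[KSYM_SYMBOL_LEN];

            /* Resolve the handler's symbol name, then match it against
             * the compiler-emitted syscall metadata kept generically. */
            kallsyms_lookup(arch_syscall_addr(nr), NULL, NULL, NULL, sym);
            return find_syscall_meta_by_name(sym);
    }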
diff --git a/arch/x86/kernel/geode_32.c b/arch/x86/kernel/geode_32.c
deleted file mode 100644
index 9b08e852fd1a..000000000000
--- a/arch/x86/kernel/geode_32.c
+++ /dev/null
@@ -1,196 +0,0 @@
1/*
2 * AMD Geode southbridge support code
3 * Copyright (C) 2006, Advanced Micro Devices, Inc.
4 * Copyright (C) 2007, Andres Salomon <dilinger@debian.org>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of version 2 of the GNU General Public License
8 * as published by the Free Software Foundation.
9 */
10
11#include <linux/kernel.h>
12#include <linux/module.h>
13#include <linux/ioport.h>
14#include <linux/io.h>
15#include <asm/msr.h>
16#include <asm/geode.h>
17
18static struct {
19 char *name;
20 u32 msr;
21 int size;
22 u32 base;
23} lbars[] = {
24 { "geode-pms", MSR_LBAR_PMS, LBAR_PMS_SIZE, 0 },
25 { "geode-acpi", MSR_LBAR_ACPI, LBAR_ACPI_SIZE, 0 },
26 { "geode-gpio", MSR_LBAR_GPIO, LBAR_GPIO_SIZE, 0 },
27 { "geode-mfgpt", MSR_LBAR_MFGPT, LBAR_MFGPT_SIZE, 0 }
28};
29
30static void __init init_lbars(void)
31{
32 u32 lo, hi;
33 int i;
34
35 for (i = 0; i < ARRAY_SIZE(lbars); i++) {
36 rdmsr(lbars[i].msr, lo, hi);
37 if (hi & 0x01)
38 lbars[i].base = lo & 0x0000ffff;
39
40 if (lbars[i].base == 0)
41 printk(KERN_ERR "geode: Couldn't initialize '%s'\n",
42 lbars[i].name);
43 }
44}
45
46int geode_get_dev_base(unsigned int dev)
47{
48 BUG_ON(dev >= ARRAY_SIZE(lbars));
49 return lbars[dev].base;
50}
51EXPORT_SYMBOL_GPL(geode_get_dev_base);
52
53/* === GPIO API === */
54
55void geode_gpio_set(u32 gpio, unsigned int reg)
56{
57 u32 base = geode_get_dev_base(GEODE_DEV_GPIO);
58
59 if (!base)
60 return;
61
62 /* low bank register */
63 if (gpio & 0xFFFF)
64 outl(gpio & 0xFFFF, base + reg);
65 /* high bank register */
66 gpio >>= 16;
67 if (gpio)
68 outl(gpio, base + 0x80 + reg);
69}
70EXPORT_SYMBOL_GPL(geode_gpio_set);
71
72void geode_gpio_clear(u32 gpio, unsigned int reg)
73{
74 u32 base = geode_get_dev_base(GEODE_DEV_GPIO);
75
76 if (!base)
77 return;
78
79 /* low bank register */
80 if (gpio & 0xFFFF)
81 outl((gpio & 0xFFFF) << 16, base + reg);
82 /* high bank register */
83 gpio &= (0xFFFF << 16);
84 if (gpio)
85 outl(gpio, base + 0x80 + reg);
86}
87EXPORT_SYMBOL_GPL(geode_gpio_clear);
88
89int geode_gpio_isset(u32 gpio, unsigned int reg)
90{
91 u32 base = geode_get_dev_base(GEODE_DEV_GPIO);
92 u32 val;
93
94 if (!base)
95 return 0;
96
97 /* low bank register */
98 if (gpio & 0xFFFF) {
99 val = inl(base + reg) & (gpio & 0xFFFF);
100 if ((gpio & 0xFFFF) == val)
101 return 1;
102 }
103 /* high bank register */
104 gpio >>= 16;
105 if (gpio) {
106 val = inl(base + 0x80 + reg) & gpio;
107 if (gpio == val)
108 return 1;
109 }
110 return 0;
111}
112EXPORT_SYMBOL_GPL(geode_gpio_isset);
113
114void geode_gpio_set_irq(unsigned int group, unsigned int irq)
115{
116 u32 lo, hi;
117
118 if (group > 7 || irq > 15)
119 return;
120
121 rdmsr(MSR_PIC_ZSEL_HIGH, lo, hi);
122
123 lo &= ~(0xF << (group * 4));
124 lo |= (irq & 0xF) << (group * 4);
125
126 wrmsr(MSR_PIC_ZSEL_HIGH, lo, hi);
127}
128EXPORT_SYMBOL_GPL(geode_gpio_set_irq);
129
130void geode_gpio_setup_event(unsigned int gpio, int pair, int pme)
131{
132 u32 base = geode_get_dev_base(GEODE_DEV_GPIO);
133 u32 offset, shift, val;
134
135 if (gpio >= 24)
136 offset = GPIO_MAP_W;
137 else if (gpio >= 16)
138 offset = GPIO_MAP_Z;
139 else if (gpio >= 8)
140 offset = GPIO_MAP_Y;
141 else
142 offset = GPIO_MAP_X;
143
144 shift = (gpio % 8) * 4;
145
146 val = inl(base + offset);
147
148 /* Clear whatever was there before */
149 val &= ~(0xF << shift);
150
151 /* And set the new value */
152
153 val |= ((pair & 7) << shift);
154
155 /* Set the PME bit if this is a PME event */
156
157 if (pme)
158 val |= (1 << (shift + 3));
159
160 outl(val, base + offset);
161}
162EXPORT_SYMBOL_GPL(geode_gpio_setup_event);
163
164int geode_has_vsa2(void)
165{
166 static int has_vsa2 = -1;
167
168 if (has_vsa2 == -1) {
169 u16 val;
170
171 /*
172 * The VSA has virtual registers that we can query for a
173 * signature.
174 */
175 outw(VSA_VR_UNLOCK, VSA_VRC_INDEX);
176 outw(VSA_VR_SIGNATURE, VSA_VRC_INDEX);
177
178 val = inw(VSA_VRC_DATA);
179 has_vsa2 = (val == AMD_VSA_SIG || val == GSW_VSA_SIG);
180 }
181
182 return has_vsa2;
183}
184EXPORT_SYMBOL_GPL(geode_has_vsa2);
185
186static int __init geode_southbridge_init(void)
187{
188 if (!is_geode())
189 return -ENODEV;
190
191 init_lbars();
192 (void) mfgpt_timer_setup();
193 return 0;
194}
195
196postcore_initcall(geode_southbridge_init);
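The helpers deleted here encode the CS5535/CS5536 GPIO register convention that their replacement driver keeps: each 32-bit GPIO register has a low half whose bits set pins and a high half whose bits clear them, so a single outl() updates pins without a read-modify-write. A minimal sketch of that convention (helper name is illustrative):

    #include <linux/io.h>
    #include <linux/types.h>

    /* Atomically set and clear pins in one CS5535-style GPIO register:
     * bits 0-15 written as 1 set the pin, bits 16-31 written as 1 clear it. */
    static inline void cs5535_style_gpio_update(u32 base, unsigned int reg,
                                                u16 set_mask, u16 clear_mask)
    {
            outl(set_mask | ((u32)clear_mask << 16), base + reg);
    }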
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 4f8e2507e8f3..5051b94c9069 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -29,8 +29,6 @@ static void __init i386_default_early_setup(void)
29 29
30void __init i386_start_kernel(void) 30void __init i386_start_kernel(void)
31{ 31{
32 reserve_trampoline_memory();
33
34 reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); 32 reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
35 33
36#ifdef CONFIG_BLK_DEV_INITRD 34#ifdef CONFIG_BLK_DEV_INITRD
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 0b06cd778fd9..b5a9896ca1e7 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -98,8 +98,6 @@ void __init x86_64_start_reservations(char *real_mode_data)
98{ 98{
99 copy_bootdata(__va(real_mode_data)); 99 copy_bootdata(__va(real_mode_data));
100 100
101 reserve_trampoline_memory();
102
103 reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); 101 reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
104 102
105#ifdef CONFIG_BLK_DEV_INITRD 103#ifdef CONFIG_BLK_DEV_INITRD
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index b55ee4ff509f..2d8b5035371c 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -212,8 +212,8 @@ ENTRY(secondary_startup_64)
212 */ 212 */
213 lgdt early_gdt_descr(%rip) 213 lgdt early_gdt_descr(%rip)
214 214
215 /* set up data segments. actually 0 would do too */ 215 /* set up data segments */
216 movl $__KERNEL_DS,%eax 216 xorl %eax,%eax
217 movl %eax,%ds 217 movl %eax,%ds
218 movl %eax,%ss 218 movl %eax,%ss
219 movl %eax,%es 219 movl %eax,%es
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index dedc2bddf7a5..ad80a1c718c6 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -33,6 +33,9 @@
33 * HPET address is set in acpi/boot.c, when an ACPI entry exists 33 * HPET address is set in acpi/boot.c, when an ACPI entry exists
34 */ 34 */
35unsigned long hpet_address; 35unsigned long hpet_address;
36u8 hpet_blockid; /* OS timer block num */
37u8 hpet_msi_disable;
38
36#ifdef CONFIG_PCI_MSI 39#ifdef CONFIG_PCI_MSI
37static unsigned long hpet_num_timers; 40static unsigned long hpet_num_timers;
38#endif 41#endif
@@ -47,12 +50,12 @@ struct hpet_dev {
47 char name[10]; 50 char name[10];
48}; 51};
49 52
50unsigned long hpet_readl(unsigned long a) 53inline unsigned int hpet_readl(unsigned int a)
51{ 54{
52 return readl(hpet_virt_address + a); 55 return readl(hpet_virt_address + a);
53} 56}
54 57
55static inline void hpet_writel(unsigned long d, unsigned long a) 58static inline void hpet_writel(unsigned int d, unsigned int a)
56{ 59{
57 writel(d, hpet_virt_address + a); 60 writel(d, hpet_virt_address + a);
58} 61}
@@ -167,7 +170,7 @@ do { \
167 170
168static void hpet_reserve_msi_timers(struct hpet_data *hd); 171static void hpet_reserve_msi_timers(struct hpet_data *hd);
169 172
170static void hpet_reserve_platform_timers(unsigned long id) 173static void hpet_reserve_platform_timers(unsigned int id)
171{ 174{
172 struct hpet __iomem *hpet = hpet_virt_address; 175 struct hpet __iomem *hpet = hpet_virt_address;
173 struct hpet_timer __iomem *timer = &hpet->hpet_timers[2]; 176 struct hpet_timer __iomem *timer = &hpet->hpet_timers[2];
@@ -205,7 +208,7 @@ static void hpet_reserve_platform_timers(unsigned long id)
205 208
206} 209}
207#else 210#else
208static void hpet_reserve_platform_timers(unsigned long id) { } 211static void hpet_reserve_platform_timers(unsigned int id) { }
209#endif 212#endif
210 213
211/* 214/*
@@ -246,7 +249,7 @@ static void hpet_reset_counter(void)
246 249
247static void hpet_start_counter(void) 250static void hpet_start_counter(void)
248{ 251{
249 unsigned long cfg = hpet_readl(HPET_CFG); 252 unsigned int cfg = hpet_readl(HPET_CFG);
250 cfg |= HPET_CFG_ENABLE; 253 cfg |= HPET_CFG_ENABLE;
251 hpet_writel(cfg, HPET_CFG); 254 hpet_writel(cfg, HPET_CFG);
252} 255}
@@ -271,7 +274,7 @@ static void hpet_resume_counter(void)
271 274
272static void hpet_enable_legacy_int(void) 275static void hpet_enable_legacy_int(void)
273{ 276{
274 unsigned long cfg = hpet_readl(HPET_CFG); 277 unsigned int cfg = hpet_readl(HPET_CFG);
275 278
276 cfg |= HPET_CFG_LEGACY; 279 cfg |= HPET_CFG_LEGACY;
277 hpet_writel(cfg, HPET_CFG); 280 hpet_writel(cfg, HPET_CFG);
@@ -314,7 +317,7 @@ static int hpet_setup_msi_irq(unsigned int irq);
314static void hpet_set_mode(enum clock_event_mode mode, 317static void hpet_set_mode(enum clock_event_mode mode,
315 struct clock_event_device *evt, int timer) 318 struct clock_event_device *evt, int timer)
316{ 319{
317 unsigned long cfg, cmp, now; 320 unsigned int cfg, cmp, now;
318 uint64_t delta; 321 uint64_t delta;
319 322
320 switch (mode) { 323 switch (mode) {
@@ -323,7 +326,7 @@ static void hpet_set_mode(enum clock_event_mode mode,
323 delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * evt->mult; 326 delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * evt->mult;
324 delta >>= evt->shift; 327 delta >>= evt->shift;
325 now = hpet_readl(HPET_COUNTER); 328 now = hpet_readl(HPET_COUNTER);
326 cmp = now + (unsigned long) delta; 329 cmp = now + (unsigned int) delta;
327 cfg = hpet_readl(HPET_Tn_CFG(timer)); 330 cfg = hpet_readl(HPET_Tn_CFG(timer));
328 /* Make sure we use edge triggered interrupts */ 331 /* Make sure we use edge triggered interrupts */
329 cfg &= ~HPET_TN_LEVEL; 332 cfg &= ~HPET_TN_LEVEL;
@@ -339,7 +342,7 @@ static void hpet_set_mode(enum clock_event_mode mode,
339 * (See AMD-8111 HyperTransport I/O Hub Data Sheet, 342 * (See AMD-8111 HyperTransport I/O Hub Data Sheet,
340 * Publication # 24674) 343 * Publication # 24674)
341 */ 344 */
342 hpet_writel((unsigned long) delta, HPET_Tn_CMP(timer)); 345 hpet_writel((unsigned int) delta, HPET_Tn_CMP(timer));
343 hpet_start_counter(); 346 hpet_start_counter();
344 hpet_print_config(); 347 hpet_print_config();
345 break; 348 break;
@@ -383,13 +386,24 @@ static int hpet_next_event(unsigned long delta,
383 hpet_writel(cnt, HPET_Tn_CMP(timer)); 386 hpet_writel(cnt, HPET_Tn_CMP(timer));
384 387
385 /* 388 /*
386 * We need to read back the CMP register to make sure that 389 * We need to read back the CMP register on certain HPET
387 * what we wrote hit the chip before we compare it to the 390 * implementations (ATI chipsets) which seem to delay the
388 * counter. 391 * transfer of the compare register into the internal compare
392 * logic. With small deltas this might actually be too late as
393 * the counter could already be higher than the compare value
394 * at that point and we would wait for the next hpet interrupt
395 * forever. We found out that reading the CMP register back
396 * forces the transfer so we can rely on the comparison with
397 * the counter register below. If the read back from the
398 * compare register does not match the value we programmed
399 * then we might have a real hardware problem. We can not do
400 * much about it here, but at least alert the user/admin with
401 * a prominent warning.
389 */ 402 */
390 WARN_ON_ONCE((u32)hpet_readl(HPET_Tn_CMP(timer)) != cnt); 403 WARN_ONCE(hpet_readl(HPET_Tn_CMP(timer)) != cnt,
404 KERN_WARNING "hpet: compare register read back failed.\n");
391 405
392 return (s32)((u32)hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0; 406 return (s32)(hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0;
393} 407}
394 408
395static void hpet_legacy_set_mode(enum clock_event_mode mode, 409static void hpet_legacy_set_mode(enum clock_event_mode mode,
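The expanded comment explains the ATI readback quirk; the changed return line is subtle in a different way: the counter/compare subtraction is done in 32 bits and interpreted as signed, so the "did we miss the deadline?" test stays correct when the free-running counter wraps. A tiny stand-alone illustration:

    #include <stdint.h>

    /* Non-zero when the 32-bit counter has already passed cmp, even if it
     * wrapped in between -- the same test hpet_next_event() uses to
     * decide whether to return -ETIME. */
    static int deadline_already_passed(uint32_t counter, uint32_t cmp)
    {
            return (int32_t)(counter - cmp) >= 0;
    }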
@@ -415,7 +429,7 @@ static struct hpet_dev *hpet_devs;
415void hpet_msi_unmask(unsigned int irq) 429void hpet_msi_unmask(unsigned int irq)
416{ 430{
417 struct hpet_dev *hdev = get_irq_data(irq); 431 struct hpet_dev *hdev = get_irq_data(irq);
418 unsigned long cfg; 432 unsigned int cfg;
419 433
420 /* unmask it */ 434 /* unmask it */
421 cfg = hpet_readl(HPET_Tn_CFG(hdev->num)); 435 cfg = hpet_readl(HPET_Tn_CFG(hdev->num));
@@ -425,7 +439,7 @@ void hpet_msi_unmask(unsigned int irq)
425 439
426void hpet_msi_mask(unsigned int irq) 440void hpet_msi_mask(unsigned int irq)
427{ 441{
428 unsigned long cfg; 442 unsigned int cfg;
429 struct hpet_dev *hdev = get_irq_data(irq); 443 struct hpet_dev *hdev = get_irq_data(irq);
430 444
431 /* mask it */ 445 /* mask it */
@@ -467,7 +481,7 @@ static int hpet_msi_next_event(unsigned long delta,
467 481
468static int hpet_setup_msi_irq(unsigned int irq) 482static int hpet_setup_msi_irq(unsigned int irq)
469{ 483{
470 if (arch_setup_hpet_msi(irq)) { 484 if (arch_setup_hpet_msi(irq, hpet_blockid)) {
471 destroy_irq(irq); 485 destroy_irq(irq);
472 return -EINVAL; 486 return -EINVAL;
473 } 487 }
@@ -584,6 +598,11 @@ static void hpet_msi_capability_lookup(unsigned int start_timer)
584 unsigned int num_timers_used = 0; 598 unsigned int num_timers_used = 0;
585 int i; 599 int i;
586 600
601 if (hpet_msi_disable)
602 return;
603
604 if (boot_cpu_has(X86_FEATURE_ARAT))
605 return;
587 id = hpet_readl(HPET_ID); 606 id = hpet_readl(HPET_ID);
588 607
589 num_timers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT); 608 num_timers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT);
@@ -598,7 +617,7 @@ static void hpet_msi_capability_lookup(unsigned int start_timer)
598 617
599 for (i = start_timer; i < num_timers - RESERVE_TIMERS; i++) { 618 for (i = start_timer; i < num_timers - RESERVE_TIMERS; i++) {
600 struct hpet_dev *hdev = &hpet_devs[num_timers_used]; 619 struct hpet_dev *hdev = &hpet_devs[num_timers_used];
601 unsigned long cfg = hpet_readl(HPET_Tn_CFG(i)); 620 unsigned int cfg = hpet_readl(HPET_Tn_CFG(i));
602 621
603 /* Only consider HPET timer with MSI support */ 622 /* Only consider HPET timer with MSI support */
604 if (!(cfg & HPET_TN_FSB_CAP)) 623 if (!(cfg & HPET_TN_FSB_CAP))
@@ -813,7 +832,7 @@ static int hpet_clocksource_register(void)
813 */ 832 */
814int __init hpet_enable(void) 833int __init hpet_enable(void)
815{ 834{
816 unsigned long id; 835 unsigned int id;
817 int i; 836 int i;
818 837
819 if (!is_hpet_capable()) 838 if (!is_hpet_capable())
@@ -872,10 +891,8 @@ int __init hpet_enable(void)
872 891
873 if (id & HPET_ID_LEGSUP) { 892 if (id & HPET_ID_LEGSUP) {
874 hpet_legacy_clockevent_register(); 893 hpet_legacy_clockevent_register();
875 hpet_msi_capability_lookup(2);
876 return 1; 894 return 1;
877 } 895 }
878 hpet_msi_capability_lookup(0);
879 return 0; 896 return 0;
880 897
881out_nohpet: 898out_nohpet:
@@ -908,9 +925,20 @@ static __init int hpet_late_init(void)
908 if (!hpet_virt_address) 925 if (!hpet_virt_address)
909 return -ENODEV; 926 return -ENODEV;
910 927
928 if (hpet_readl(HPET_ID) & HPET_ID_LEGSUP)
929 hpet_msi_capability_lookup(2);
930 else
931 hpet_msi_capability_lookup(0);
932
911 hpet_reserve_platform_timers(hpet_readl(HPET_ID)); 933 hpet_reserve_platform_timers(hpet_readl(HPET_ID));
912 hpet_print_config(); 934 hpet_print_config();
913 935
936 if (hpet_msi_disable)
937 return 0;
938
939 if (boot_cpu_has(X86_FEATURE_ARAT))
940 return 0;
941
914 for_each_online_cpu(cpu) { 942 for_each_online_cpu(cpu) {
915 hpet_cpuhp_notify(NULL, CPU_ONLINE, (void *)(long)cpu); 943 hpet_cpuhp_notify(NULL, CPU_ONLINE, (void *)(long)cpu);
916 } 944 }
@@ -925,7 +953,7 @@ fs_initcall(hpet_late_init);
925void hpet_disable(void) 953void hpet_disable(void)
926{ 954{
927 if (is_hpet_capable()) { 955 if (is_hpet_capable()) {
928 unsigned long cfg = hpet_readl(HPET_CFG); 956 unsigned int cfg = hpet_readl(HPET_CFG);
929 957
930 if (hpet_legacy_int_enabled) { 958 if (hpet_legacy_int_enabled) {
931 cfg &= ~HPET_CFG_LEGACY; 959 cfg &= ~HPET_CFG_LEGACY;
@@ -965,8 +993,8 @@ static int hpet_prev_update_sec;
965static struct rtc_time hpet_alarm_time; 993static struct rtc_time hpet_alarm_time;
966static unsigned long hpet_pie_count; 994static unsigned long hpet_pie_count;
967static u32 hpet_t1_cmp; 995static u32 hpet_t1_cmp;
968static unsigned long hpet_default_delta; 996static u32 hpet_default_delta;
969static unsigned long hpet_pie_delta; 997static u32 hpet_pie_delta;
970static unsigned long hpet_pie_limit; 998static unsigned long hpet_pie_limit;
971 999
972static rtc_irq_handler irq_handler; 1000static rtc_irq_handler irq_handler;
@@ -1017,7 +1045,8 @@ EXPORT_SYMBOL_GPL(hpet_unregister_irq_handler);
1017 */ 1045 */
1018int hpet_rtc_timer_init(void) 1046int hpet_rtc_timer_init(void)
1019{ 1047{
1020 unsigned long cfg, cnt, delta, flags; 1048 unsigned int cfg, cnt, delta;
1049 unsigned long flags;
1021 1050
1022 if (!is_hpet_enabled()) 1051 if (!is_hpet_enabled())
1023 return 0; 1052 return 0;
@@ -1027,7 +1056,7 @@ int hpet_rtc_timer_init(void)
1027 1056
1028 clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC; 1057 clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC;
1029 clc >>= hpet_clockevent.shift + DEFAULT_RTC_SHIFT; 1058 clc >>= hpet_clockevent.shift + DEFAULT_RTC_SHIFT;
1030 hpet_default_delta = (unsigned long) clc; 1059 hpet_default_delta = clc;
1031 } 1060 }
1032 1061
1033 if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit) 1062 if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit)
@@ -1113,7 +1142,7 @@ int hpet_set_periodic_freq(unsigned long freq)
1113 clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC; 1142 clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC;
1114 do_div(clc, freq); 1143 do_div(clc, freq);
1115 clc >>= hpet_clockevent.shift; 1144 clc >>= hpet_clockevent.shift;
1116 hpet_pie_delta = (unsigned long) clc; 1145 hpet_pie_delta = clc;
1117 } 1146 }
1118 return 1; 1147 return 1;
1119} 1148}
@@ -1127,7 +1156,7 @@ EXPORT_SYMBOL_GPL(hpet_rtc_dropped_irq);
1127 1156
1128static void hpet_rtc_timer_reinit(void) 1157static void hpet_rtc_timer_reinit(void)
1129{ 1158{
1130 unsigned long cfg, delta; 1159 unsigned int cfg, delta;
1131 int lost_ints = -1; 1160 int lost_ints = -1;
1132 1161
1133 if (unlikely(!hpet_rtc_flags)) { 1162 if (unlikely(!hpet_rtc_flags)) {
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
new file mode 100644
index 000000000000..05d5fec64a94
--- /dev/null
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -0,0 +1,554 @@
1/*
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License as published by
4 * the Free Software Foundation; either version 2 of the License, or
5 * (at your option) any later version.
6 *
7 * This program is distributed in the hope that it will be useful,
8 * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 * GNU General Public License for more details.
11 *
12 * You should have received a copy of the GNU General Public License
13 * along with this program; if not, write to the Free Software
14 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
15 *
16 * Copyright (C) 2007 Alan Stern
17 * Copyright (C) 2009 IBM Corporation
18 * Copyright (C) 2009 Frederic Weisbecker <fweisbec@gmail.com>
19 *
20 * Authors: Alan Stern <stern@rowland.harvard.edu>
21 * K.Prasad <prasad@linux.vnet.ibm.com>
22 * Frederic Weisbecker <fweisbec@gmail.com>
23 */
24
25/*
26 * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility,
27 * using the CPU's debug registers.
28 */
29
30#include <linux/perf_event.h>
31#include <linux/hw_breakpoint.h>
32#include <linux/irqflags.h>
33#include <linux/notifier.h>
34#include <linux/kallsyms.h>
35#include <linux/kprobes.h>
36#include <linux/percpu.h>
37#include <linux/kdebug.h>
38#include <linux/kernel.h>
39#include <linux/module.h>
40#include <linux/sched.h>
41#include <linux/init.h>
42#include <linux/smp.h>
43
44#include <asm/hw_breakpoint.h>
45#include <asm/processor.h>
46#include <asm/debugreg.h>
47
48/* Per cpu debug control register value */
49DEFINE_PER_CPU(unsigned long, cpu_dr7);
50EXPORT_PER_CPU_SYMBOL(cpu_dr7);
51
52/* Per cpu debug address registers values */
53static DEFINE_PER_CPU(unsigned long, cpu_debugreg[HBP_NUM]);
54
55/*
56 * Stores the breakpoints currently in use on each breakpoint address
57 * register for each cpu
58 */
59static DEFINE_PER_CPU(struct perf_event *, bp_per_reg[HBP_NUM]);
60
61
62static inline unsigned long
63__encode_dr7(int drnum, unsigned int len, unsigned int type)
64{
65 unsigned long bp_info;
66
67 bp_info = (len | type) & 0xf;
68 bp_info <<= (DR_CONTROL_SHIFT + drnum * DR_CONTROL_SIZE);
69 bp_info |= (DR_GLOBAL_ENABLE << (drnum * DR_ENABLE_SIZE));
70
71 return bp_info;
72}
73
74/*
75 * Encode the length, type, Exact, and Enable bits for a particular breakpoint
76 * as stored in debug register 7.
77 */
78unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type)
79{
80 return __encode_dr7(drnum, len, type) | DR_GLOBAL_SLOWDOWN;
81}
82
83/*
84 * Decode the length and type bits for a particular breakpoint as
85 * stored in debug register 7. Return the "enabled" status.
86 */
87int decode_dr7(unsigned long dr7, int bpnum, unsigned *len, unsigned *type)
88{
89 int bp_info = dr7 >> (DR_CONTROL_SHIFT + bpnum * DR_CONTROL_SIZE);
90
91 *len = (bp_info & 0xc) | 0x40;
92 *type = (bp_info & 0x3) | 0x80;
93
94 return (dr7 >> (bpnum * DR_ENABLE_SIZE)) & 0x3;
95}
96
97/*
98 * Install a perf counter breakpoint.
99 *
100 * We seek a free debug address register and use it for this
101 * breakpoint. Eventually we enable it in the debug control register.
102 *
103 * Atomic: we hold the counter->ctx->lock and we only handle variables
104 * and registers local to this cpu.
105 */
106int arch_install_hw_breakpoint(struct perf_event *bp)
107{
108 struct arch_hw_breakpoint *info = counter_arch_bp(bp);
109 unsigned long *dr7;
110 int i;
111
112 for (i = 0; i < HBP_NUM; i++) {
113 struct perf_event **slot = &__get_cpu_var(bp_per_reg[i]);
114
115 if (!*slot) {
116 *slot = bp;
117 break;
118 }
119 }
120
121 if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot"))
122 return -EBUSY;
123
124 set_debugreg(info->address, i);
125 __get_cpu_var(cpu_debugreg[i]) = info->address;
126
127 dr7 = &__get_cpu_var(cpu_dr7);
128 *dr7 |= encode_dr7(i, info->len, info->type);
129
130 set_debugreg(*dr7, 7);
131
132 return 0;
133}
134
135/*
136 * Uninstall the breakpoint contained in the given counter.
137 *
138 * First we search the debug address register it uses and then we disable
139 * it.
140 *
141 * Atomic: we hold the counter->ctx->lock and we only handle variables
142 * and registers local to this cpu.
143 */
144void arch_uninstall_hw_breakpoint(struct perf_event *bp)
145{
146 struct arch_hw_breakpoint *info = counter_arch_bp(bp);
147 unsigned long *dr7;
148 int i;
149
150 for (i = 0; i < HBP_NUM; i++) {
151 struct perf_event **slot = &__get_cpu_var(bp_per_reg[i]);
152
153 if (*slot == bp) {
154 *slot = NULL;
155 break;
156 }
157 }
158
159 if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot"))
160 return;
161
162 dr7 = &__get_cpu_var(cpu_dr7);
163 *dr7 &= ~__encode_dr7(i, info->len, info->type);
164
165 set_debugreg(*dr7, 7);
166}
167
168static int get_hbp_len(u8 hbp_len)
169{
170 unsigned int len_in_bytes = 0;
171
172 switch (hbp_len) {
173 case X86_BREAKPOINT_LEN_1:
174 len_in_bytes = 1;
175 break;
176 case X86_BREAKPOINT_LEN_2:
177 len_in_bytes = 2;
178 break;
179 case X86_BREAKPOINT_LEN_4:
180 len_in_bytes = 4;
181 break;
182#ifdef CONFIG_X86_64
183 case X86_BREAKPOINT_LEN_8:
184 len_in_bytes = 8;
185 break;
186#endif
187 }
188 return len_in_bytes;
189}
190
191/*
192 * Check for virtual address in user space.
193 */
194int arch_check_va_in_userspace(unsigned long va, u8 hbp_len)
195{
196 unsigned int len;
197
198 len = get_hbp_len(hbp_len);
199
200 return (va <= TASK_SIZE - len);
201}
202
203/*
204 * Check for virtual address in kernel space.
205 */
206static int arch_check_va_in_kernelspace(unsigned long va, u8 hbp_len)
207{
208 unsigned int len;
209
210 len = get_hbp_len(hbp_len);
211
212 return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE);
213}
214
215/*
216 * Store a breakpoint's encoded address, length, and type.
217 */
218static int arch_store_info(struct perf_event *bp)
219{
220 struct arch_hw_breakpoint *info = counter_arch_bp(bp);
221 /*
222 * For kernel-addresses, either the address or symbol name can be
223 * specified.
224 */
225 if (info->name)
226 info->address = (unsigned long)
227 kallsyms_lookup_name(info->name);
228 if (info->address)
229 return 0;
230
231 return -EINVAL;
232}
233
234int arch_bp_generic_fields(int x86_len, int x86_type,
235 int *gen_len, int *gen_type)
236{
237 /* Len */
238 switch (x86_len) {
239 case X86_BREAKPOINT_LEN_1:
240 *gen_len = HW_BREAKPOINT_LEN_1;
241 break;
242 case X86_BREAKPOINT_LEN_2:
243 *gen_len = HW_BREAKPOINT_LEN_2;
244 break;
245 case X86_BREAKPOINT_LEN_4:
246 *gen_len = HW_BREAKPOINT_LEN_4;
247 break;
248#ifdef CONFIG_X86_64
249 case X86_BREAKPOINT_LEN_8:
250 *gen_len = HW_BREAKPOINT_LEN_8;
251 break;
252#endif
253 default:
254 return -EINVAL;
255 }
256
257 /* Type */
258 switch (x86_type) {
259 case X86_BREAKPOINT_EXECUTE:
260 *gen_type = HW_BREAKPOINT_X;
261 break;
262 case X86_BREAKPOINT_WRITE:
263 *gen_type = HW_BREAKPOINT_W;
264 break;
265 case X86_BREAKPOINT_RW:
266 *gen_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R;
267 break;
268 default:
269 return -EINVAL;
270 }
271
272 return 0;
273}
274
275
276static int arch_build_bp_info(struct perf_event *bp)
277{
278 struct arch_hw_breakpoint *info = counter_arch_bp(bp);
279
280 info->address = bp->attr.bp_addr;
281
282 /* Len */
283 switch (bp->attr.bp_len) {
284 case HW_BREAKPOINT_LEN_1:
285 info->len = X86_BREAKPOINT_LEN_1;
286 break;
287 case HW_BREAKPOINT_LEN_2:
288 info->len = X86_BREAKPOINT_LEN_2;
289 break;
290 case HW_BREAKPOINT_LEN_4:
291 info->len = X86_BREAKPOINT_LEN_4;
292 break;
293#ifdef CONFIG_X86_64
294 case HW_BREAKPOINT_LEN_8:
295 info->len = X86_BREAKPOINT_LEN_8;
296 break;
297#endif
298 default:
299 return -EINVAL;
300 }
301
302 /* Type */
303 switch (bp->attr.bp_type) {
304 case HW_BREAKPOINT_W:
305 info->type = X86_BREAKPOINT_WRITE;
306 break;
307 case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
308 info->type = X86_BREAKPOINT_RW;
309 break;
310 case HW_BREAKPOINT_X:
311 info->type = X86_BREAKPOINT_EXECUTE;
312 break;
313 default:
314 return -EINVAL;
315 }
316
317 return 0;
318}
319/*
320 * Validate the arch-specific HW Breakpoint register settings
321 */
322int arch_validate_hwbkpt_settings(struct perf_event *bp,
323 struct task_struct *tsk)
324{
325 struct arch_hw_breakpoint *info = counter_arch_bp(bp);
326 unsigned int align;
327 int ret;
328
329
330 ret = arch_build_bp_info(bp);
331 if (ret)
332 return ret;
333
334 ret = -EINVAL;
335
336 if (info->type == X86_BREAKPOINT_EXECUTE)
337 /*
338 * Ptrace-refactoring code
339 * For now, we'll allow instruction breakpoint only for user-space
340 * addresses
341 */
342 if ((!arch_check_va_in_userspace(info->address, info->len)) &&
343 info->len != X86_BREAKPOINT_EXECUTE)
344 return ret;
345
346 switch (info->len) {
347 case X86_BREAKPOINT_LEN_1:
348 align = 0;
349 break;
350 case X86_BREAKPOINT_LEN_2:
351 align = 1;
352 break;
353 case X86_BREAKPOINT_LEN_4:
354 align = 3;
355 break;
356#ifdef CONFIG_X86_64
357 case X86_BREAKPOINT_LEN_8:
358 align = 7;
359 break;
360#endif
361 default:
362 return ret;
363 }
364
365 ret = arch_store_info(bp);
366
367 if (ret < 0)
368 return ret;
369 /*
370 * Check that the low-order bits of the address are appropriate
371 * for the alignment implied by len.
372 */
373 if (info->address & align)
374 return -EINVAL;
375
376 /* Check that the virtual address is in the proper range */
377 if (tsk) {
378 if (!arch_check_va_in_userspace(info->address, info->len))
379 return -EFAULT;
380 } else {
381 if (!arch_check_va_in_kernelspace(info->address, info->len))
382 return -EFAULT;
383 }
384
385 return 0;
386}
387
388/*
389 * Dump the debug register contents to the user.
390 * We can't dump our per cpu values because it
391 * may contain a cpu-wide breakpoint, something that
392 * doesn't belong to the current task.
393 *
394 * TODO: include non-ptrace user breakpoints (perf)
395 */
396void aout_dump_debugregs(struct user *dump)
397{
398 int i;
399 int dr7 = 0;
400 struct perf_event *bp;
401 struct arch_hw_breakpoint *info;
402 struct thread_struct *thread = &current->thread;
403
404 for (i = 0; i < HBP_NUM; i++) {
405 bp = thread->ptrace_bps[i];
406
407 if (bp && !bp->attr.disabled) {
408 dump->u_debugreg[i] = bp->attr.bp_addr;
409 info = counter_arch_bp(bp);
410 dr7 |= encode_dr7(i, info->len, info->type);
411 } else {
412 dump->u_debugreg[i] = 0;
413 }
414 }
415
416 dump->u_debugreg[4] = 0;
417 dump->u_debugreg[5] = 0;
418 dump->u_debugreg[6] = current->thread.debugreg6;
419
420 dump->u_debugreg[7] = dr7;
421}
422EXPORT_SYMBOL_GPL(aout_dump_debugregs);
423
424/*
425 * Release the user breakpoints used by ptrace
426 */
427void flush_ptrace_hw_breakpoint(struct task_struct *tsk)
428{
429 int i;
430 struct thread_struct *t = &tsk->thread;
431
432 for (i = 0; i < HBP_NUM; i++) {
433 unregister_hw_breakpoint(t->ptrace_bps[i]);
434 t->ptrace_bps[i] = NULL;
435 }
436}
437
438void hw_breakpoint_restore(void)
439{
440 set_debugreg(__get_cpu_var(cpu_debugreg[0]), 0);
441 set_debugreg(__get_cpu_var(cpu_debugreg[1]), 1);
442 set_debugreg(__get_cpu_var(cpu_debugreg[2]), 2);
443 set_debugreg(__get_cpu_var(cpu_debugreg[3]), 3);
444 set_debugreg(current->thread.debugreg6, 6);
445 set_debugreg(__get_cpu_var(cpu_dr7), 7);
446}
447EXPORT_SYMBOL_GPL(hw_breakpoint_restore);
448
449/*
450 * Handle debug exception notifications.
451 *
452 * Return value is either NOTIFY_STOP or NOTIFY_DONE as explained below.
453 *
454 * NOTIFY_DONE returned if one of the following conditions is true.
455 * i) When the causative address is from user-space and the exception
456 * is a valid one, i.e. not triggered as a result of lazy debug register
457 * switching
458 * ii) When there are more bits than trap<n> set in DR6 register (such
459 * as BD, BS or BT) indicating that more than one debug condition is
460 * met and requires some more action in do_debug().
461 *
462 * NOTIFY_STOP returned for all other cases
463 *
464 */
465static int __kprobes hw_breakpoint_handler(struct die_args *args)
466{
467 int i, cpu, rc = NOTIFY_STOP;
468 struct perf_event *bp;
469 unsigned long dr7, dr6;
470 unsigned long *dr6_p;
471
472 /* The DR6 value is pointed by args->err */
473 dr6_p = (unsigned long *)ERR_PTR(args->err);
474 dr6 = *dr6_p;
475
476 /* Do an early return if no trap bits are set in DR6 */
477 if ((dr6 & DR_TRAP_BITS) == 0)
478 return NOTIFY_DONE;
479
480 get_debugreg(dr7, 7);
481 /* Disable breakpoints during exception handling */
482 set_debugreg(0UL, 7);
483 /*
484 * Assert that local interrupts are disabled
485 * Reset the DRn bits in the virtualized register value.
486 * The ptrace trigger routine will add in whatever is needed.
487 */
488 current->thread.debugreg6 &= ~DR_TRAP_BITS;
489 cpu = get_cpu();
490
491 /* Handle all the breakpoints that were triggered */
492 for (i = 0; i < HBP_NUM; ++i) {
493 if (likely(!(dr6 & (DR_TRAP0 << i))))
494 continue;
495
496 /*
497 * The counter may be concurrently released but that can only
498 * occur from a call_rcu() path. We can then safely fetch
499 * the breakpoint, use its callback, touch its counter
500 * while we are in an rcu_read_lock() path.
501 */
502 rcu_read_lock();
503
504 bp = per_cpu(bp_per_reg[i], cpu);
505 if (bp)
506 rc = NOTIFY_DONE;
507 /*
508 * Reset the 'i'th TRAP bit in dr6 to denote completion of
509 * exception handling
510 */
511 (*dr6_p) &= ~(DR_TRAP0 << i);
512 /*
513 * bp can be NULL due to lazy debug register switching
514 * or due to concurrent perf counter removing.
515 */
516 if (!bp) {
517 rcu_read_unlock();
518 break;
519 }
520
521 perf_bp_event(bp, args->regs);
522
523 rcu_read_unlock();
524 }
525 if (dr6 & (~DR_TRAP_BITS))
526 rc = NOTIFY_DONE;
527
528 set_debugreg(dr7, 7);
529 put_cpu();
530
531 return rc;
532}
533
534/*
535 * Handle debug exception notifications.
536 */
537int __kprobes hw_breakpoint_exceptions_notify(
538 struct notifier_block *unused, unsigned long val, void *data)
539{
540 if (val != DIE_DEBUG)
541 return NOTIFY_DONE;
542
543 return hw_breakpoint_handler(data);
544}
545
546void hw_breakpoint_pmu_read(struct perf_event *bp)
547{
548 /* TODO */
549}
550
551void hw_breakpoint_pmu_unthrottle(struct perf_event *bp)
552{
553 /* TODO */
554}
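The DR7 layout that encode_dr7()/decode_dr7() in the new file manage is easiest to see with one worked value. Assuming this era's constants (DR_CONTROL_SHIFT 16, DR_CONTROL_SIZE 4, DR_ENABLE_SIZE 2 from asm/debugreg.h, and the X86_BREAKPOINT_* encodings from asm/hw_breakpoint.h), a hedged worked example:

    #include <asm/hw_breakpoint.h>
    #include <asm/debugreg.h>

    /* A 4-byte write watchpoint in slot 1:
     *  - DR_GLOBAL_ENABLE << (1 * DR_ENABLE_SIZE) sets the G1 bit (bit 3),
     *  - ((X86_BREAKPOINT_LEN_4 | X86_BREAKPOINT_WRITE) & 0xf), i.e.
     *    LEN1=11b and R/W1=01b, lands in bits 20-23
     *    (DR_CONTROL_SHIFT + 1 * DR_CONTROL_SIZE),
     *  - DR_GLOBAL_SLOWDOWN adds the GE bit. */
    static unsigned long example_slot1_dr7(void)
    {
            return encode_dr7(1, X86_BREAKPOINT_LEN_4, X86_BREAKPOINT_WRITE);
    }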
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index 99c4d308f16b..8eec0ec59af2 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -103,9 +103,10 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
103 * on system-call entry - see also fork() and the signal handling 103 * on system-call entry - see also fork() and the signal handling
104 * code. 104 * code.
105 */ 105 */
106static int do_iopl(unsigned int level, struct pt_regs *regs) 106long sys_iopl(unsigned int level, struct pt_regs *regs)
107{ 107{
108 unsigned int old = (regs->flags >> 12) & 3; 108 unsigned int old = (regs->flags >> 12) & 3;
109 struct thread_struct *t = &current->thread;
109 110
110 if (level > 3) 111 if (level > 3)
111 return -EINVAL; 112 return -EINVAL;
@@ -115,29 +116,8 @@ static int do_iopl(unsigned int level, struct pt_regs *regs)
115 return -EPERM; 116 return -EPERM;
116 } 117 }
117 regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) | (level << 12); 118 regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) | (level << 12);
118
119 return 0;
120}
121
122#ifdef CONFIG_X86_32
123long sys_iopl(struct pt_regs *regs)
124{
125 unsigned int level = regs->bx;
126 struct thread_struct *t = &current->thread;
127 int rc;
128
129 rc = do_iopl(level, regs);
130 if (rc < 0)
131 goto out;
132
133 t->iopl = level << 12; 119 t->iopl = level << 12;
134 set_iopl_mask(t->iopl); 120 set_iopl_mask(t->iopl);
135out: 121
136 return rc; 122 return 0;
137}
138#else
139asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs)
140{
141 return do_iopl(level, regs);
142} 123}
143#endif
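With the 32-bit and 64-bit paths folded into one sys_iopl(), the user-visible behaviour is unchanged: the requested level lands in EFLAGS bits 12-13 and is mirrored into thread.iopl. A small user-space illustration of the call (needs CAP_SYS_RAWIO):

    #include <stdio.h>
    #include <sys/io.h>

    int main(void)
    {
            if (iopl(3)) {                  /* raise IOPL in EFLAGS[13:12] */
                    perror("iopl");
                    return 1;
            }
            printf("I/O privilege level raised to 3\n");
            return 0;
    }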
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 391206199515..91fd0c70a18a 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -18,7 +18,7 @@
18atomic_t irq_err_count; 18atomic_t irq_err_count;
19 19
20/* Function pointer for generic interrupt vector handling */ 20/* Function pointer for generic interrupt vector handling */
21void (*generic_interrupt_extension)(void) = NULL; 21void (*x86_platform_ipi_callback)(void) = NULL;
22 22
23/* 23/*
24 * 'what should we do if we get a hw irq event on an illegal vector'. 24 * 'what should we do if we get a hw irq event on an illegal vector'.
@@ -63,19 +63,19 @@ static int show_other_interrupts(struct seq_file *p, int prec)
63 for_each_online_cpu(j) 63 for_each_online_cpu(j)
64 seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count); 64 seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
65 seq_printf(p, " Spurious interrupts\n"); 65 seq_printf(p, " Spurious interrupts\n");
66 seq_printf(p, "%*s: ", prec, "CNT"); 66 seq_printf(p, "%*s: ", prec, "PMI");
67 for_each_online_cpu(j) 67 for_each_online_cpu(j)
68 seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs); 68 seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs);
69 seq_printf(p, " Performance counter interrupts\n"); 69 seq_printf(p, " Performance monitoring interrupts\n");
70 seq_printf(p, "%*s: ", prec, "PND"); 70 seq_printf(p, "%*s: ", prec, "PND");
71 for_each_online_cpu(j) 71 for_each_online_cpu(j)
72 seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs); 72 seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs);
73 seq_printf(p, " Performance pending work\n"); 73 seq_printf(p, " Performance pending work\n");
74#endif 74#endif
75 if (generic_interrupt_extension) { 75 if (x86_platform_ipi_callback) {
76 seq_printf(p, "%*s: ", prec, "PLT"); 76 seq_printf(p, "%*s: ", prec, "PLT");
77 for_each_online_cpu(j) 77 for_each_online_cpu(j)
78 seq_printf(p, "%10u ", irq_stats(j)->generic_irqs); 78 seq_printf(p, "%10u ", irq_stats(j)->x86_platform_ipis);
79 seq_printf(p, " Platform interrupts\n"); 79 seq_printf(p, " Platform interrupts\n");
80 } 80 }
81#ifdef CONFIG_SMP 81#ifdef CONFIG_SMP
@@ -92,17 +92,17 @@ static int show_other_interrupts(struct seq_file *p, int prec)
92 seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count); 92 seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count);
93 seq_printf(p, " TLB shootdowns\n"); 93 seq_printf(p, " TLB shootdowns\n");
94#endif 94#endif
95#ifdef CONFIG_X86_MCE 95#ifdef CONFIG_X86_THERMAL_VECTOR
96 seq_printf(p, "%*s: ", prec, "TRM"); 96 seq_printf(p, "%*s: ", prec, "TRM");
97 for_each_online_cpu(j) 97 for_each_online_cpu(j)
98 seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count); 98 seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count);
99 seq_printf(p, " Thermal event interrupts\n"); 99 seq_printf(p, " Thermal event interrupts\n");
100# ifdef CONFIG_X86_MCE_THRESHOLD 100#endif
101#ifdef CONFIG_X86_MCE_THRESHOLD
101 seq_printf(p, "%*s: ", prec, "THR"); 102 seq_printf(p, "%*s: ", prec, "THR");
102 for_each_online_cpu(j) 103 for_each_online_cpu(j)
103 seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count); 104 seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count);
104 seq_printf(p, " Threshold APIC interrupts\n"); 105 seq_printf(p, " Threshold APIC interrupts\n");
105# endif
106#endif 106#endif
107#ifdef CONFIG_X86_MCE 107#ifdef CONFIG_X86_MCE
108 seq_printf(p, "%*s: ", prec, "MCE"); 108 seq_printf(p, "%*s: ", prec, "MCE");
@@ -149,7 +149,7 @@ int show_interrupts(struct seq_file *p, void *v)
149 if (!desc) 149 if (!desc)
150 return 0; 150 return 0;
151 151
152 spin_lock_irqsave(&desc->lock, flags); 152 raw_spin_lock_irqsave(&desc->lock, flags);
153 for_each_online_cpu(j) 153 for_each_online_cpu(j)
154 any_count |= kstat_irqs_cpu(i, j); 154 any_count |= kstat_irqs_cpu(i, j);
155 action = desc->action; 155 action = desc->action;
@@ -170,7 +170,7 @@ int show_interrupts(struct seq_file *p, void *v)
170 170
171 seq_putc(p, '\n'); 171 seq_putc(p, '\n');
172out: 172out:
173 spin_unlock_irqrestore(&desc->lock, flags); 173 raw_spin_unlock_irqrestore(&desc->lock, flags);
174 return 0; 174 return 0;
175} 175}
176 176
@@ -187,18 +187,18 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
187 sum += irq_stats(cpu)->apic_perf_irqs; 187 sum += irq_stats(cpu)->apic_perf_irqs;
188 sum += irq_stats(cpu)->apic_pending_irqs; 188 sum += irq_stats(cpu)->apic_pending_irqs;
189#endif 189#endif
190 if (generic_interrupt_extension) 190 if (x86_platform_ipi_callback)
191 sum += irq_stats(cpu)->generic_irqs; 191 sum += irq_stats(cpu)->x86_platform_ipis;
192#ifdef CONFIG_SMP 192#ifdef CONFIG_SMP
193 sum += irq_stats(cpu)->irq_resched_count; 193 sum += irq_stats(cpu)->irq_resched_count;
194 sum += irq_stats(cpu)->irq_call_count; 194 sum += irq_stats(cpu)->irq_call_count;
195 sum += irq_stats(cpu)->irq_tlb_count; 195 sum += irq_stats(cpu)->irq_tlb_count;
196#endif 196#endif
197#ifdef CONFIG_X86_MCE 197#ifdef CONFIG_X86_THERMAL_VECTOR
198 sum += irq_stats(cpu)->irq_thermal_count; 198 sum += irq_stats(cpu)->irq_thermal_count;
199# ifdef CONFIG_X86_MCE_THRESHOLD 199#endif
200#ifdef CONFIG_X86_MCE_THRESHOLD
200 sum += irq_stats(cpu)->irq_threshold_count; 201 sum += irq_stats(cpu)->irq_threshold_count;
201# endif
202#endif 202#endif
203#ifdef CONFIG_X86_MCE 203#ifdef CONFIG_X86_MCE
204 sum += per_cpu(mce_exception_count, cpu); 204 sum += per_cpu(mce_exception_count, cpu);
@@ -244,7 +244,6 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
244 __func__, smp_processor_id(), vector, irq); 244 __func__, smp_processor_id(), vector, irq);
245 } 245 }
246 246
247 run_local_timers();
248 irq_exit(); 247 irq_exit();
249 248
250 set_irq_regs(old_regs); 249 set_irq_regs(old_regs);
@@ -252,9 +251,9 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
252} 251}
253 252
254/* 253/*
255 * Handler for GENERIC_INTERRUPT_VECTOR. 254 * Handler for X86_PLATFORM_IPI_VECTOR.
256 */ 255 */
257void smp_generic_interrupt(struct pt_regs *regs) 256void smp_x86_platform_ipi(struct pt_regs *regs)
258{ 257{
259 struct pt_regs *old_regs = set_irq_regs(regs); 258 struct pt_regs *old_regs = set_irq_regs(regs);
260 259
@@ -264,15 +263,104 @@ void smp_generic_interrupt(struct pt_regs *regs)
264 263
265 irq_enter(); 264 irq_enter();
266 265
267 inc_irq_stat(generic_irqs); 266 inc_irq_stat(x86_platform_ipis);
268 267
269 if (generic_interrupt_extension) 268 if (x86_platform_ipi_callback)
270 generic_interrupt_extension(); 269 x86_platform_ipi_callback();
271 270
272 run_local_timers();
273 irq_exit(); 271 irq_exit();
274 272
275 set_irq_regs(old_regs); 273 set_irq_regs(old_regs);
276} 274}
277 275
278EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq); 276EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq);
277
278#ifdef CONFIG_HOTPLUG_CPU
279/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
280void fixup_irqs(void)
281{
282 unsigned int irq, vector;
283 static int warned;
284 struct irq_desc *desc;
285
286 for_each_irq_desc(irq, desc) {
287 int break_affinity = 0;
288 int set_affinity = 1;
289 const struct cpumask *affinity;
290
291 if (!desc)
292 continue;
293 if (irq == 2)
294 continue;
295
296 /* interrupts are disabled at this point */
297 raw_spin_lock(&desc->lock);
298
299 affinity = desc->affinity;
300 if (!irq_has_action(irq) ||
301 cpumask_equal(affinity, cpu_online_mask)) {
302 raw_spin_unlock(&desc->lock);
303 continue;
304 }
305
306 /*
307 * Complete the irq move. This cpu is going down and for
308 * the non intr-remapping case, we can't wait till this interrupt
309 * arrives at this cpu before completing the irq move.
310 */
311 irq_force_complete_move(irq);
312
313 if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
314 break_affinity = 1;
315 affinity = cpu_all_mask;
316 }
317
318 if (!(desc->status & IRQ_MOVE_PCNTXT) && desc->chip->mask)
319 desc->chip->mask(irq);
320
321 if (desc->chip->set_affinity)
322 desc->chip->set_affinity(irq, affinity);
323 else if (!(warned++))
324 set_affinity = 0;
325
326 if (!(desc->status & IRQ_MOVE_PCNTXT) && desc->chip->unmask)
327 desc->chip->unmask(irq);
328
329 raw_spin_unlock(&desc->lock);
330
331 if (break_affinity && set_affinity)
332 printk("Broke affinity for irq %i\n", irq);
333 else if (!set_affinity)
334 printk("Cannot set affinity for irq %i\n", irq);
335 }
336
337 /*
338 * We can remove mdelay() and then send spurious interrupts to
339 * new cpu targets for all the irqs that were handled previously by
340 * this cpu. While it works, I have seen spurious interrupt messages
341 * (nothing wrong but still...).
342 *
343 * So for now, retain mdelay(1) and check the IRR and then send those
344 * interrupts to new targets as this cpu is already offlined...
345 */
346 mdelay(1);
347
348 for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
349 unsigned int irr;
350
351 if (__get_cpu_var(vector_irq)[vector] < 0)
352 continue;
353
354 irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
355 if (irr & (1 << (vector % 32))) {
356 irq = __get_cpu_var(vector_irq)[vector];
357
358 desc = irq_to_desc(irq);
359 raw_spin_lock(&desc->lock);
360 if (desc->chip->retrigger)
361 desc->chip->retrigger(irq);
362 raw_spin_unlock(&desc->lock);
363 }
364 }
365}
366#endif
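The rename from generic_interrupt to x86_platform_ipi does not change the hook mechanism: platform code installs a single callback pointer and then receives the vector through smp_x86_platform_ipi() above. A hedged sketch of a user of the hook (handler and init names are illustrative; the callback is declared in asm/irq.h in this era, location assumed):

    #include <linux/init.h>
    #include <asm/irq.h>

    static void example_platform_ipi_handler(void)
    {
            /* platform-specific work, runs in hard-irq context */
    }

    static int __init example_platform_ipi_init(void)
    {
            x86_platform_ipi_callback = example_platform_ipi_handler;
            return 0;
    }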
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 7d35d0fe2329..10709f29d166 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -211,48 +211,3 @@ bool handle_irq(unsigned irq, struct pt_regs *regs)
211 211
212 return true; 212 return true;
213} 213}
214
215#ifdef CONFIG_HOTPLUG_CPU
216
217/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
218void fixup_irqs(void)
219{
220 unsigned int irq;
221 struct irq_desc *desc;
222
223 for_each_irq_desc(irq, desc) {
224 const struct cpumask *affinity;
225
226 if (!desc)
227 continue;
228 if (irq == 2)
229 continue;
230
231 affinity = desc->affinity;
232 if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
233 printk("Breaking affinity for irq %i\n", irq);
234 affinity = cpu_all_mask;
235 }
236 if (desc->chip->set_affinity)
237 desc->chip->set_affinity(irq, affinity);
238 else if (desc->action)
239 printk_once("Cannot set affinity for irq %i\n", irq);
240 }
241
242#if 0
243 barrier();
244 /* Ingo Molnar says: "after the IO-APIC masks have been redirected
245 [note the nop - the interrupt-enable boundary on x86 is two
246 instructions from sti] - to flush out pending hardirqs and
247 IPIs. After this point nothing is supposed to reach this CPU." */
248 __asm__ __volatile__("sti; nop; cli");
249 barrier();
250#else
251 /* That doesn't seem sufficient. Give it 1ms. */
252 local_irq_enable();
253 mdelay(1);
254 local_irq_disable();
255#endif
256}
257#endif
258
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 977d8b43a0dd..acf8fbf8fbda 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -62,64 +62,6 @@ bool handle_irq(unsigned irq, struct pt_regs *regs)
62 return true; 62 return true;
63} 63}
64 64
65#ifdef CONFIG_HOTPLUG_CPU
66/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
67void fixup_irqs(void)
68{
69 unsigned int irq;
70 static int warned;
71 struct irq_desc *desc;
72
73 for_each_irq_desc(irq, desc) {
74 int break_affinity = 0;
75 int set_affinity = 1;
76 const struct cpumask *affinity;
77
78 if (!desc)
79 continue;
80 if (irq == 2)
81 continue;
82
83 /* interrupt's are disabled at this point */
84 spin_lock(&desc->lock);
85
86 affinity = desc->affinity;
87 if (!irq_has_action(irq) ||
88 cpumask_equal(affinity, cpu_online_mask)) {
89 spin_unlock(&desc->lock);
90 continue;
91 }
92
93 if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
94 break_affinity = 1;
95 affinity = cpu_all_mask;
96 }
97
98 if (desc->chip->mask)
99 desc->chip->mask(irq);
100
101 if (desc->chip->set_affinity)
102 desc->chip->set_affinity(irq, affinity);
103 else if (!(warned++))
104 set_affinity = 0;
105
106 if (desc->chip->unmask)
107 desc->chip->unmask(irq);
108
109 spin_unlock(&desc->lock);
110
111 if (break_affinity && set_affinity)
112 printk("Broke affinity for irq %i\n", irq);
113 else if (!set_affinity)
114 printk("Cannot set affinity for irq %i\n", irq);
115 }
116
117 /* That doesn't seem sufficient. Give it 1ms. */
118 local_irq_enable();
119 mdelay(1);
120 local_irq_disable();
121}
122#endif
123 65
124extern void call_softirq(void); 66extern void call_softirq(void);
125 67
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 40f30773fb29..d5932226614f 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -200,8 +200,8 @@ static void __init apic_intr_init(void)
200 /* self generated IPI for local APIC timer */ 200 /* self generated IPI for local APIC timer */
201 alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); 201 alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
202 202
203 /* generic IPI for platform specific use */ 203 /* IPI for X86 platform specific use */
204 alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt); 204 alloc_intr_gate(X86_PLATFORM_IPI_VECTOR, x86_platform_ipi);
205 205
206 /* IPI vectors for APIC spurious and error interrupts */ 206 /* IPI vectors for APIC spurious and error interrupts */
207 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); 207 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 8d82a77a3f3b..bfba6019d762 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -42,7 +42,9 @@
42#include <linux/init.h> 42#include <linux/init.h>
43#include <linux/smp.h> 43#include <linux/smp.h>
44#include <linux/nmi.h> 44#include <linux/nmi.h>
45#include <linux/hw_breakpoint.h>
45 46
47#include <asm/debugreg.h>
46#include <asm/apicdef.h> 48#include <asm/apicdef.h>
47#include <asm/system.h> 49#include <asm/system.h>
48 50
@@ -85,10 +87,15 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
85 gdb_regs[GDB_DS] = regs->ds; 87 gdb_regs[GDB_DS] = regs->ds;
86 gdb_regs[GDB_ES] = regs->es; 88 gdb_regs[GDB_ES] = regs->es;
87 gdb_regs[GDB_CS] = regs->cs; 89 gdb_regs[GDB_CS] = regs->cs;
88 gdb_regs[GDB_SS] = __KERNEL_DS;
89 gdb_regs[GDB_FS] = 0xFFFF; 90 gdb_regs[GDB_FS] = 0xFFFF;
90 gdb_regs[GDB_GS] = 0xFFFF; 91 gdb_regs[GDB_GS] = 0xFFFF;
91 gdb_regs[GDB_SP] = (int)&regs->sp; 92 if (user_mode_vm(regs)) {
93 gdb_regs[GDB_SS] = regs->ss;
94 gdb_regs[GDB_SP] = regs->sp;
95 } else {
96 gdb_regs[GDB_SS] = __KERNEL_DS;
97 gdb_regs[GDB_SP] = kernel_stack_pointer(regs);
98 }
92#else 99#else
93 gdb_regs[GDB_R8] = regs->r8; 100 gdb_regs[GDB_R8] = regs->r8;
94 gdb_regs[GDB_R9] = regs->r9; 101 gdb_regs[GDB_R9] = regs->r9;
@@ -101,7 +108,7 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
101 gdb_regs32[GDB_PS] = regs->flags; 108 gdb_regs32[GDB_PS] = regs->flags;
102 gdb_regs32[GDB_CS] = regs->cs; 109 gdb_regs32[GDB_CS] = regs->cs;
103 gdb_regs32[GDB_SS] = regs->ss; 110 gdb_regs32[GDB_SS] = regs->ss;
104 gdb_regs[GDB_SP] = regs->sp; 111 gdb_regs[GDB_SP] = kernel_stack_pointer(regs);
105#endif 112#endif
106} 113}
107 114
@@ -198,41 +205,81 @@ void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
198 205
199static struct hw_breakpoint { 206static struct hw_breakpoint {
200 unsigned enabled; 207 unsigned enabled;
201 unsigned type;
202 unsigned len;
203 unsigned long addr; 208 unsigned long addr;
209 int len;
210 int type;
211 struct perf_event **pev;
204} breakinfo[4]; 212} breakinfo[4];
205 213
206static void kgdb_correct_hw_break(void) 214static void kgdb_correct_hw_break(void)
207{ 215{
208 unsigned long dr7;
209 int correctit = 0;
210 int breakbit;
211 int breakno; 216 int breakno;
212 217
213 get_debugreg(dr7, 7);
214 for (breakno = 0; breakno < 4; breakno++) { 218 for (breakno = 0; breakno < 4; breakno++) {
215 breakbit = 2 << (breakno << 1); 219 struct perf_event *bp;
216 if (!(dr7 & breakbit) && breakinfo[breakno].enabled) { 220 struct arch_hw_breakpoint *info;
217 correctit = 1; 221 int val;
218 dr7 |= breakbit; 222 int cpu = raw_smp_processor_id();
219 dr7 &= ~(0xf0000 << (breakno << 2)); 223 if (!breakinfo[breakno].enabled)
220 dr7 |= ((breakinfo[breakno].len << 2) | 224 continue;
221 breakinfo[breakno].type) << 225 bp = *per_cpu_ptr(breakinfo[breakno].pev, cpu);
222 ((breakno << 2) + 16); 226 info = counter_arch_bp(bp);
223 if (breakno >= 0 && breakno <= 3) 227 if (bp->attr.disabled != 1)
224 set_debugreg(breakinfo[breakno].addr, breakno); 228 continue;
225 229 bp->attr.bp_addr = breakinfo[breakno].addr;
226 } else { 230 bp->attr.bp_len = breakinfo[breakno].len;
227 if ((dr7 & breakbit) && !breakinfo[breakno].enabled) { 231 bp->attr.bp_type = breakinfo[breakno].type;
228 correctit = 1; 232 info->address = breakinfo[breakno].addr;
229 dr7 &= ~breakbit; 233 info->len = breakinfo[breakno].len;
230 dr7 &= ~(0xf0000 << (breakno << 2)); 234 info->type = breakinfo[breakno].type;
231 } 235 val = arch_install_hw_breakpoint(bp);
232 } 236 if (!val)
237 bp->attr.disabled = 0;
233 } 238 }
234 if (correctit) 239 hw_breakpoint_restore();
235 set_debugreg(dr7, 7); 240}
241
242static int hw_break_reserve_slot(int breakno)
243{
244 int cpu;
245 int cnt = 0;
246 struct perf_event **pevent;
247
248 for_each_online_cpu(cpu) {
249 cnt++;
250 pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu);
251 if (dbg_reserve_bp_slot(*pevent))
252 goto fail;
253 }
254
255 return 0;
256
257fail:
258 for_each_online_cpu(cpu) {
259 cnt--;
260 if (!cnt)
261 break;
262 pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu);
263 dbg_release_bp_slot(*pevent);
264 }
265 return -1;
266}
267
268static int hw_break_release_slot(int breakno)
269{
270 struct perf_event **pevent;
271 int cpu;
272
273 for_each_online_cpu(cpu) {
274 pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu);
275 if (dbg_release_bp_slot(*pevent))
276 /*
277 * The debugger is responsible for handling the retry on
278 * remove failure.
279 */
280 return -1;
281 }
282 return 0;
236} 283}
237 284
238static int 285static int
@@ -246,6 +293,10 @@ kgdb_remove_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
246 if (i == 4) 293 if (i == 4)
247 return -1; 294 return -1;
248 295
296 if (hw_break_release_slot(i)) {
297 printk(KERN_ERR "Cannot remove hw breakpoint at %lx\n", addr);
298 return -1;
299 }
249 breakinfo[i].enabled = 0; 300 breakinfo[i].enabled = 0;
250 301
251 return 0; 302 return 0;
@@ -254,15 +305,23 @@ kgdb_remove_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
254static void kgdb_remove_all_hw_break(void) 305static void kgdb_remove_all_hw_break(void)
255{ 306{
256 int i; 307 int i;
308 int cpu = raw_smp_processor_id();
309 struct perf_event *bp;
257 310
258 for (i = 0; i < 4; i++) 311 for (i = 0; i < 4; i++) {
259 memset(&breakinfo[i], 0, sizeof(struct hw_breakpoint)); 312 if (!breakinfo[i].enabled)
313 continue;
314 bp = *per_cpu_ptr(breakinfo[i].pev, cpu);
315 if (bp->attr.disabled == 1)
316 continue;
317 arch_uninstall_hw_breakpoint(bp);
318 bp->attr.disabled = 1;
319 }
260} 320}
261 321
262static int 322static int
263kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) 323kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
264{ 324{
265 unsigned type;
266 int i; 325 int i;
267 326
268 for (i = 0; i < 4; i++) 327 for (i = 0; i < 4; i++)
@@ -273,27 +332,42 @@ kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
273 332
274 switch (bptype) { 333 switch (bptype) {
275 case BP_HARDWARE_BREAKPOINT: 334 case BP_HARDWARE_BREAKPOINT:
276 type = 0; 335 len = 1;
277 len = 1; 336 breakinfo[i].type = X86_BREAKPOINT_EXECUTE;
278 break; 337 break;
279 case BP_WRITE_WATCHPOINT: 338 case BP_WRITE_WATCHPOINT:
280 type = 1; 339 breakinfo[i].type = X86_BREAKPOINT_WRITE;
281 break; 340 break;
282 case BP_ACCESS_WATCHPOINT: 341 case BP_ACCESS_WATCHPOINT:
283 type = 3; 342 breakinfo[i].type = X86_BREAKPOINT_RW;
284 break; 343 break;
285 default: 344 default:
286 return -1; 345 return -1;
287 } 346 }
288 347 switch (len) {
289 if (len == 1 || len == 2 || len == 4) 348 case 1:
290 breakinfo[i].len = len - 1; 349 breakinfo[i].len = X86_BREAKPOINT_LEN_1;
291 else 350 break;
351 case 2:
352 breakinfo[i].len = X86_BREAKPOINT_LEN_2;
353 break;
354 case 4:
355 breakinfo[i].len = X86_BREAKPOINT_LEN_4;
356 break;
357#ifdef CONFIG_X86_64
358 case 8:
359 breakinfo[i].len = X86_BREAKPOINT_LEN_8;
360 break;
361#endif
362 default:
292 return -1; 363 return -1;
293 364 }
294 breakinfo[i].enabled = 1;
295 breakinfo[i].addr = addr; 365 breakinfo[i].addr = addr;
296 breakinfo[i].type = type; 366 if (hw_break_reserve_slot(i)) {
367 breakinfo[i].addr = 0;
368 return -1;
369 }
370 breakinfo[i].enabled = 1;
297 371
298 return 0; 372 return 0;
299} 373}
@@ -308,8 +382,21 @@ kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
308 */ 382 */
309void kgdb_disable_hw_debug(struct pt_regs *regs) 383void kgdb_disable_hw_debug(struct pt_regs *regs)
310{ 384{
385 int i;
386 int cpu = raw_smp_processor_id();
387 struct perf_event *bp;
388
311 /* Disable hardware debugging while we are in kgdb: */ 389 /* Disable hardware debugging while we are in kgdb: */
312 set_debugreg(0UL, 7); 390 set_debugreg(0UL, 7);
391 for (i = 0; i < 4; i++) {
392 if (!breakinfo[i].enabled)
393 continue;
394 bp = *per_cpu_ptr(breakinfo[i].pev, cpu);
395 if (bp->attr.disabled == 1)
396 continue;
397 arch_uninstall_hw_breakpoint(bp);
398 bp->attr.disabled = 1;
399 }
313} 400}
314 401
315/** 402/**
@@ -373,7 +460,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
373 struct pt_regs *linux_regs) 460 struct pt_regs *linux_regs)
374{ 461{
375 unsigned long addr; 462 unsigned long addr;
376 unsigned long dr6;
377 char *ptr; 463 char *ptr;
378 int newPC; 464 int newPC;
379 465
@@ -395,25 +481,10 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
395 /* set the trace bit if we're stepping */ 481 /* set the trace bit if we're stepping */
396 if (remcomInBuffer[0] == 's') { 482 if (remcomInBuffer[0] == 's') {
397 linux_regs->flags |= X86_EFLAGS_TF; 483 linux_regs->flags |= X86_EFLAGS_TF;
398 kgdb_single_step = 1;
399 atomic_set(&kgdb_cpu_doing_single_step, 484 atomic_set(&kgdb_cpu_doing_single_step,
400 raw_smp_processor_id()); 485 raw_smp_processor_id());
401 } 486 }
402 487
403 get_debugreg(dr6, 6);
404 if (!(dr6 & 0x4000)) {
405 int breakno;
406
407 for (breakno = 0; breakno < 4; breakno++) {
408 if (dr6 & (1 << breakno) &&
409 breakinfo[breakno].type == 0) {
410 /* Set restore flag: */
411 linux_regs->flags |= X86_EFLAGS_RF;
412 break;
413 }
414 }
415 }
416 set_debugreg(0UL, 6);
417 kgdb_correct_hw_break(); 488 kgdb_correct_hw_break();
418 489
419 return 0; 490 return 0;
@@ -434,6 +505,11 @@ single_step_cont(struct pt_regs *regs, struct die_args *args)
434 "resuming...\n"); 505 "resuming...\n");
435 kgdb_arch_handle_exception(args->trapnr, args->signr, 506 kgdb_arch_handle_exception(args->trapnr, args->signr,
436 args->err, "c", "", regs); 507 args->err, "c", "", regs);
508 /*
509 * Reset the BS bit in dr6 (pointed to by args->err) to
510 * denote completion of processing
511 */
512 (*(unsigned long *)ERR_PTR(args->err)) &= ~DR_STEP;
437 513
438 return NOTIFY_STOP; 514 return NOTIFY_STOP;
439} 515}
@@ -476,8 +552,7 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd)
476 break; 552 break;
477 553
478 case DIE_DEBUG: 554 case DIE_DEBUG:
479 if (atomic_read(&kgdb_cpu_doing_single_step) == 555 if (atomic_read(&kgdb_cpu_doing_single_step) != -1) {
480 raw_smp_processor_id()) {
481 if (user_mode(regs)) 556 if (user_mode(regs))
482 return single_step_cont(regs, args); 557 return single_step_cont(regs, args);
483 break; 558 break;
@@ -530,7 +605,42 @@ static struct notifier_block kgdb_notifier = {
530 */ 605 */
531int kgdb_arch_init(void) 606int kgdb_arch_init(void)
532{ 607{
533 return register_die_notifier(&kgdb_notifier); 608 int i, cpu;
609 int ret;
610 struct perf_event_attr attr;
611 struct perf_event **pevent;
612
613 ret = register_die_notifier(&kgdb_notifier);
614 if (ret != 0)
615 return ret;
616 /*
617 * Pre-allocate the hw breakpoint structures in the non-atomic
618 * portion of kgdb because this operation requires mutexes to
619 * complete.
620 */
621 attr.bp_addr = (unsigned long)kgdb_arch_init;
622 attr.type = PERF_TYPE_BREAKPOINT;
623 attr.bp_len = HW_BREAKPOINT_LEN_1;
624 attr.bp_type = HW_BREAKPOINT_W;
625 attr.disabled = 1;
626 for (i = 0; i < 4; i++) {
627 breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL);
628 if (IS_ERR(breakinfo[i].pev)) {
629 printk(KERN_ERR "kgdb: Could not allocate hw breakpoints\n");
630 breakinfo[i].pev = NULL;
631 kgdb_arch_exit();
632 return -1;
633 }
634 for_each_online_cpu(cpu) {
635 pevent = per_cpu_ptr(breakinfo[i].pev, cpu);
636 pevent[0]->hw.sample_period = 1;
637 if (pevent[0]->destroy != NULL) {
638 pevent[0]->destroy = NULL;
639 release_bp_slot(*pevent);
640 }
641 }
642 }
643 return ret;
534} 644}
535 645
536/** 646/**
@@ -541,6 +651,13 @@ int kgdb_arch_init(void)
541 */ 651 */
542void kgdb_arch_exit(void) 652void kgdb_arch_exit(void)
543{ 653{
654 int i;
655 for (i = 0; i < 4; i++) {
656 if (breakinfo[i].pev) {
657 unregister_wide_hw_breakpoint(breakinfo[i].pev);
658 breakinfo[i].pev = NULL;
659 }
660 }
544 unregister_die_notifier(&kgdb_notifier); 661 unregister_die_notifier(&kgdb_notifier);
545} 662}
546 663
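
The kgdb.c hunks above retire the hand-rolled DR7 bookkeeping in favour of the perf-based hw_breakpoint API (arch_install_hw_breakpoint(), dbg_reserve_bp_slot() and friends). For reference, the removed kgdb_correct_hw_break() packed each slot's state into DR7 itself: bit 1 + 2*slot is the global-enable bit, and the length/type pair occupies the nibble at bit 16 + 4*slot, type in the low two bits and length in the high two. Below is a minimal user-space sketch of that encoding, reusing the shift arithmetic of the removed lines; the helper name and the test values are made up for illustration and this is not kernel code.

#include <stdio.h>

static unsigned long dr7_encode(int slot, unsigned int len_bits, unsigned int type_bits)
{
        unsigned long dr7 = 0;

        /* Global-enable bit for this slot: bits 1, 3, 5, 7. */
        dr7 |= 2UL << (slot << 1);
        /* Clear, then set, the len/type nibble at bits 16 + 4*slot. */
        dr7 &= ~(0xf0000UL << (slot << 2));
        dr7 |= (unsigned long)((len_bits << 2) | type_bits) << ((slot << 2) + 16);
        return dr7;
}

int main(void)
{
        /*
         * Slot 0, 4-byte write watchpoint: the old code stored len - 1
         * (so 3 for 4 bytes) and type 1 for BP_WRITE_WATCHPOINT.
         */
        printf("dr7 = %#lx\n", dr7_encode(0, 3, 1));
        return 0;
}

With the new code the equivalent state lives in the per-CPU perf_event attributes (bp_addr, bp_len, bp_type), and the debug registers themselves are programmed by the hw_breakpoint layer rather than by kgdb.
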
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 7b5169d2b000..5b8c7505b3bc 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -48,31 +48,22 @@
48#include <linux/preempt.h> 48#include <linux/preempt.h>
49#include <linux/module.h> 49#include <linux/module.h>
50#include <linux/kdebug.h> 50#include <linux/kdebug.h>
51#include <linux/kallsyms.h>
51 52
52#include <asm/cacheflush.h> 53#include <asm/cacheflush.h>
53#include <asm/desc.h> 54#include <asm/desc.h>
54#include <asm/pgtable.h> 55#include <asm/pgtable.h>
55#include <asm/uaccess.h> 56#include <asm/uaccess.h>
56#include <asm/alternative.h> 57#include <asm/alternative.h>
58#include <asm/insn.h>
59#include <asm/debugreg.h>
57 60
58void jprobe_return_end(void); 61void jprobe_return_end(void);
59 62
60DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; 63DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
61DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); 64DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
62 65
63#ifdef CONFIG_X86_64 66#define stack_addr(regs) ((unsigned long *)kernel_stack_pointer(regs))
64#define stack_addr(regs) ((unsigned long *)regs->sp)
65#else
66/*
67 * "&regs->sp" looks wrong, but it's correct for x86_32. x86_32 CPUs
68 * don't save the ss and esp registers if the CPU is already in kernel
69 * mode when it traps. So for kprobes, regs->sp and regs->ss are not
70 * the [nonexistent] saved stack pointer and ss register, but rather
71 * the top 8 bytes of the pre-int3 stack. So &regs->sp happens to
72 * point to the top of the pre-int3 stack.
73 */
74#define stack_addr(regs) ((unsigned long *)&regs->sp)
75#endif
76 67
77#define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\ 68#define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\
78 (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \ 69 (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \
@@ -106,50 +97,6 @@ static const u32 twobyte_is_boostable[256 / 32] = {
106 /* ----------------------------------------------- */ 97 /* ----------------------------------------------- */
107 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ 98 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
108}; 99};
109static const u32 onebyte_has_modrm[256 / 32] = {
110 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
111 /* ----------------------------------------------- */
112 W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 00 */
113 W(0x10, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 10 */
114 W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 20 */
115 W(0x30, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 30 */
116 W(0x40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 40 */
117 W(0x50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 50 */
118 W(0x60, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0) | /* 60 */
119 W(0x70, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 70 */
120 W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
121 W(0x90, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 90 */
122 W(0xa0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* a0 */
123 W(0xb0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* b0 */
124 W(0xc0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* c0 */
125 W(0xd0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
126 W(0xe0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* e0 */
127 W(0xf0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) /* f0 */
128 /* ----------------------------------------------- */
129 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
130};
131static const u32 twobyte_has_modrm[256 / 32] = {
132 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
133 /* ----------------------------------------------- */
134 W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1) | /* 0f */
135 W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0) , /* 1f */
136 W(0x20, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 2f */
137 W(0x30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 3f */
138 W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 4f */
139 W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 5f */
140 W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 6f */
141 W(0x70, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1) , /* 7f */
142 W(0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 8f */
143 W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 9f */
144 W(0xa0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1) | /* af */
145 W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1) , /* bf */
146 W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* cf */
147 W(0xd0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* df */
148 W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* ef */
149 W(0xf0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0) /* ff */
150 /* ----------------------------------------------- */
151 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
152};
153#undef W 100#undef W
154 101
155struct kretprobe_blackpoint kretprobe_blacklist[] = { 102struct kretprobe_blackpoint kretprobe_blacklist[] = {
@@ -244,6 +191,75 @@ retry:
244 } 191 }
245} 192}
246 193
194/* Recover the probed instruction at addr for further analysis. */
195static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
196{
197 struct kprobe *kp;
198 kp = get_kprobe((void *)addr);
199 if (!kp)
200 return -EINVAL;
201
202 /*
203 * Basically, kp->ainsn.insn has an original instruction.
204 * However, a RIP-relative instruction cannot be single-stepped
205 * at a different address, so fix_riprel() tweaks the displacement of
206 * that instruction. In that case, we can't recover the instruction
207 * from the kp->ainsn.insn.
208 *
209 * On the other hand, kp->opcode has a copy of the first byte of
210 * the probed instruction, which is overwritten by int3. Since
211 * the instruction at kp->addr is not modified by kprobes except
212 * for the first byte, we can recover the original instruction
213 * from it and kp->opcode.
214 */
215 memcpy(buf, kp->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
216 buf[0] = kp->opcode;
217 return 0;
218}
219
220/* Dummy buffers for kallsyms_lookup */
221static char __dummy_buf[KSYM_NAME_LEN];
222
223/* Check if paddr is at an instruction boundary */
224static int __kprobes can_probe(unsigned long paddr)
225{
226 int ret;
227 unsigned long addr, offset = 0;
228 struct insn insn;
229 kprobe_opcode_t buf[MAX_INSN_SIZE];
230
231 if (!kallsyms_lookup(paddr, NULL, &offset, NULL, __dummy_buf))
232 return 0;
233
234 /* Decode instructions */
235 addr = paddr - offset;
236 while (addr < paddr) {
237 kernel_insn_init(&insn, (void *)addr);
238 insn_get_opcode(&insn);
239
240 /*
241 * Check if the instruction has been modified by another
242 * kprobe, in which case we replace the breakpoint by the
243 * original instruction in our buffer.
244 */
245 if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) {
246 ret = recover_probed_instruction(buf, addr);
247 if (ret)
248 /*
249 * Another debugging subsystem might insert
250 * this breakpoint. In that case, we can't
251 * recover it.
252 */
253 return 0;
254 kernel_insn_init(&insn, buf);
255 }
256 insn_get_length(&insn);
257 addr += insn.length;
258 }
259
260 return (addr == paddr);
261}
262
247/* 263/*
248 * Returns non-zero if opcode modifies the interrupt flag. 264 * Returns non-zero if opcode modifies the interrupt flag.
249 */ 265 */
@@ -277,68 +293,30 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
277static void __kprobes fix_riprel(struct kprobe *p) 293static void __kprobes fix_riprel(struct kprobe *p)
278{ 294{
279#ifdef CONFIG_X86_64 295#ifdef CONFIG_X86_64
280 u8 *insn = p->ainsn.insn; 296 struct insn insn;
281 s64 disp; 297 kernel_insn_init(&insn, p->ainsn.insn);
282 int need_modrm;
283
284 /* Skip legacy instruction prefixes. */
285 while (1) {
286 switch (*insn) {
287 case 0x66:
288 case 0x67:
289 case 0x2e:
290 case 0x3e:
291 case 0x26:
292 case 0x64:
293 case 0x65:
294 case 0x36:
295 case 0xf0:
296 case 0xf3:
297 case 0xf2:
298 ++insn;
299 continue;
300 }
301 break;
302 }
303 298
304 /* Skip REX instruction prefix. */ 299 if (insn_rip_relative(&insn)) {
305 if (is_REX_prefix(insn)) 300 s64 newdisp;
306 ++insn; 301 u8 *disp;
307 302 insn_get_displacement(&insn);
308 if (*insn == 0x0f) { 303 /*
309 /* Two-byte opcode. */ 304 * The copied instruction uses the %rip-relative addressing
310 ++insn; 305 * mode. Adjust the displacement for the difference between
311 need_modrm = test_bit(*insn, 306 * the original location of this instruction and the location
312 (unsigned long *)twobyte_has_modrm); 307 * of the copy that will actually be run. The tricky bit here
313 } else 308 * is making sure that the sign extension happens correctly in
314 /* One-byte opcode. */ 309 * this calculation, since we need a signed 32-bit result to
315 need_modrm = test_bit(*insn, 310 * be sign-extended to 64 bits when it's added to the %rip
316 (unsigned long *)onebyte_has_modrm); 311 * value and yield the same 64-bit result that the sign-
317 312 * extension of the original signed 32-bit displacement would
318 if (need_modrm) { 313 * have given.
319 u8 modrm = *++insn; 314 */
320 if ((modrm & 0xc7) == 0x05) { 315 newdisp = (u8 *) p->addr + (s64) insn.displacement.value -
321 /* %rip+disp32 addressing mode */ 316 (u8 *) p->ainsn.insn;
322 /* Displacement follows ModRM byte. */ 317 BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check. */
323 ++insn; 318 disp = (u8 *) p->ainsn.insn + insn_offset_displacement(&insn);
324 /* 319 *(s32 *) disp = (s32) newdisp;
325 * The copied instruction uses the %rip-relative
326 * addressing mode. Adjust the displacement for the
327 * difference between the original location of this
328 * instruction and the location of the copy that will
329 * actually be run. The tricky bit here is making sure
330 * that the sign extension happens correctly in this
331 * calculation, since we need a signed 32-bit result to
332 * be sign-extended to 64 bits when it's added to the
333 * %rip value and yield the same 64-bit result that the
334 * sign-extension of the original signed 32-bit
335 * displacement would have given.
336 */
337 disp = (u8 *) p->addr + *((s32 *) insn) -
338 (u8 *) p->ainsn.insn;
339 BUG_ON((s64) (s32) disp != disp); /* Sanity check. */
340 *(s32 *)insn = (s32) disp;
341 }
342 } 320 }
343#endif 321#endif
344} 322}
@@ -359,6 +337,8 @@ static void __kprobes arch_copy_kprobe(struct kprobe *p)
359 337
360int __kprobes arch_prepare_kprobe(struct kprobe *p) 338int __kprobes arch_prepare_kprobe(struct kprobe *p)
361{ 339{
340 if (!can_probe((unsigned long)p->addr))
341 return -EILSEQ;
362 /* insn: must be on special executable page on x86. */ 342 /* insn: must be on special executable page on x86. */
363 p->ainsn.insn = get_insn_slot(); 343 p->ainsn.insn = get_insn_slot();
364 if (!p->ainsn.insn) 344 if (!p->ainsn.insn)
@@ -472,17 +452,6 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
472{ 452{
473 switch (kcb->kprobe_status) { 453 switch (kcb->kprobe_status) {
474 case KPROBE_HIT_SSDONE: 454 case KPROBE_HIT_SSDONE:
475#ifdef CONFIG_X86_64
476 /* TODO: Provide re-entrancy from post_kprobes_handler() and
477 * avoid exception stack corruption while single-stepping on
478 * the instruction of the new probe.
479 */
480 arch_disarm_kprobe(p);
481 regs->ip = (unsigned long)p->addr;
482 reset_current_kprobe();
483 preempt_enable_no_resched();
484 break;
485#endif
486 case KPROBE_HIT_ACTIVE: 455 case KPROBE_HIT_ACTIVE:
487 save_previous_kprobe(kcb); 456 save_previous_kprobe(kcb);
488 set_current_kprobe(p, regs, kcb); 457 set_current_kprobe(p, regs, kcb);
@@ -491,18 +460,16 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
491 kcb->kprobe_status = KPROBE_REENTER; 460 kcb->kprobe_status = KPROBE_REENTER;
492 break; 461 break;
493 case KPROBE_HIT_SS: 462 case KPROBE_HIT_SS:
494 if (p == kprobe_running()) { 463 /* A probe has been hit in the codepath leading up to, or just
495 regs->flags &= ~X86_EFLAGS_TF; 464 * after, single-stepping of a probed instruction. This entire
496 regs->flags |= kcb->kprobe_saved_flags; 465 * codepath should strictly reside in .kprobes.text section.
497 return 0; 466 * Raise a BUG or we'll continue in an endless reentering loop
498 } else { 467 * and eventually a stack overflow.
499 /* A probe has been hit in the codepath leading up 468 */
500 * to, or just after, single-stepping of a probed 469 printk(KERN_WARNING "Unrecoverable kprobe detected at %p.\n",
501 * instruction. This entire codepath should strictly 470 p->addr);
502 * reside in .kprobes.text section. Raise a warning 471 dump_kprobe(p);
503 * to highlight this peculiar case. 472 BUG();
504 */
505 }
506 default: 473 default:
507 /* impossible cases */ 474 /* impossible cases */
508 WARN_ON(1); 475 WARN_ON(1);
@@ -514,7 +481,7 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
514 481
515/* 482/*
516 * Interrupts are disabled on entry as trap3 is an interrupt gate and they 483 * Interrupts are disabled on entry as trap3 is an interrupt gate and they
517 * remain disabled thorough out this function. 484 * remain disabled throughout this function.
518 */ 485 */
519static int __kprobes kprobe_handler(struct pt_regs *regs) 486static int __kprobes kprobe_handler(struct pt_regs *regs)
520{ 487{
@@ -851,7 +818,7 @@ no_change:
851 818
852/* 819/*
853 * Interrupts are disabled on entry as trap1 is an interrupt gate and they 820 * Interrupts are disabled on entry as trap1 is an interrupt gate and they
854 * remain disabled thoroughout this function. 821 * remain disabled throughout this function.
855 */ 822 */
856static int __kprobes post_kprobe_handler(struct pt_regs *regs) 823static int __kprobes post_kprobe_handler(struct pt_regs *regs)
857{ 824{
@@ -967,8 +934,14 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
967 ret = NOTIFY_STOP; 934 ret = NOTIFY_STOP;
968 break; 935 break;
969 case DIE_DEBUG: 936 case DIE_DEBUG:
970 if (post_kprobe_handler(args->regs)) 937 if (post_kprobe_handler(args->regs)) {
938 /*
939 * Reset the BS bit in dr6 (pointed to by args->err) to
940 * denote completion of processing
941 */
942 (*(unsigned long *)ERR_PTR(args->err)) &= ~DR_STEP;
971 ret = NOTIFY_STOP; 943 ret = NOTIFY_STOP;
944 }
972 break; 945 break;
973 case DIE_GPF: 946 case DIE_GPF:
974 /* 947 /*
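
The kprobes.c changes drop the local opcode tables and reuse the generic x86 instruction decoder (asm/insn.h): can_probe() walks a function decoding instruction lengths to confirm the probe lands on an instruction boundary, and fix_riprel() now asks insn_rip_relative() and insn_get_displacement() instead of parsing prefixes and ModRM bytes by hand. The displacement fix-up itself is plain pointer arithmetic: the copied instruction must reach the same absolute target from its new location, and the result must still fit a sign-extended 32-bit field. A small stand-alone sketch of that arithmetic follows; the addresses are hypothetical, and the kernel reads the original displacement via the decoder rather than passing it in.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Recompute a %rip-relative displacement for an instruction copied from
 * 'orig' to 'copy', mirroring the newdisp calculation in fix_riprel().
 */
static int32_t fixup_rip_disp(int64_t orig, int64_t copy, int32_t disp)
{
        int64_t newdisp = orig + disp - copy;

        /*
         * Same sanity check as the kernel's BUG_ON(): the adjusted
         * displacement must survive the s64 -> s32 truncation.
         */
        assert((int64_t)(int32_t)newdisp == newdisp);
        return (int32_t)newdisp;
}

int main(void)
{
        int64_t orig = 0x400000;        /* hypothetical probed address */
        int64_t copy = 0x7ff000;        /* hypothetical instruction slot */

        printf("adjusted disp = %d\n", (int)fixup_rip_disp(orig, copy, 0x100));
        return 0;
}
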
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index 03657e784fd8..a3fa43ba5d3b 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -25,6 +25,7 @@
25#include <asm/desc.h> 25#include <asm/desc.h>
26#include <asm/system.h> 26#include <asm/system.h>
27#include <asm/cacheflush.h> 27#include <asm/cacheflush.h>
28#include <asm/debugreg.h>
28 29
29static void set_idt(void *newidt, __u16 limit) 30static void set_idt(void *newidt, __u16 limit)
30{ 31{
@@ -200,6 +201,7 @@ void machine_kexec(struct kimage *image)
200 201
201 /* Interrupts aren't acceptable while we reboot */ 202 /* Interrupts aren't acceptable while we reboot */
202 local_irq_disable(); 203 local_irq_disable();
204 hw_breakpoint_disable();
203 205
204 if (image->preserve_context) { 206 if (image->preserve_context) {
205#ifdef CONFIG_X86_IO_APIC 207#ifdef CONFIG_X86_IO_APIC
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 84c3bf209e98..4a8bb82248ae 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -18,6 +18,7 @@
18#include <asm/pgtable.h> 18#include <asm/pgtable.h>
19#include <asm/tlbflush.h> 19#include <asm/tlbflush.h>
20#include <asm/mmu_context.h> 20#include <asm/mmu_context.h>
21#include <asm/debugreg.h>
21 22
22static int init_one_level2_page(struct kimage *image, pgd_t *pgd, 23static int init_one_level2_page(struct kimage *image, pgd_t *pgd,
23 unsigned long addr) 24 unsigned long addr)
@@ -282,6 +283,7 @@ void machine_kexec(struct kimage *image)
282 283
283 /* Interrupts aren't acceptable while we reboot */ 284 /* Interrupts aren't acceptable while we reboot */
284 local_irq_disable(); 285 local_irq_disable();
286 hw_breakpoint_disable();
285 287
286 if (image->preserve_context) { 288 if (image->preserve_context) {
287#ifdef CONFIG_X86_IO_APIC 289#ifdef CONFIG_X86_IO_APIC
diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c
deleted file mode 100644
index 2a62d843f015..000000000000
--- a/arch/x86/kernel/mfgpt_32.c
+++ /dev/null
@@ -1,410 +0,0 @@
1/*
2 * Driver/API for AMD Geode Multi-Function General Purpose Timers (MFGPT)
3 *
4 * Copyright (C) 2006, Advanced Micro Devices, Inc.
5 * Copyright (C) 2007, Andres Salomon <dilinger@debian.org>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of version 2 of the GNU General Public License
9 * as published by the Free Software Foundation.
10 *
11 * The MFGPTs are documented in AMD Geode CS5536 Companion Device Data Book.
12 */
13
14/*
15 * We are using the 32.768kHz input clock - it's the only one that has the
16 * ranges we find desirable. The following table lists the suitable
17 * divisors and the associated Hz, minimum interval and the maximum interval:
18 *
19 * Divisor Hz Min Delta (s) Max Delta (s)
20 * 1 32768 .00048828125 2.000
21 * 2 16384 .0009765625 4.000
22 * 4 8192 .001953125 8.000
23 * 8 4096 .00390625 16.000
24 * 16 2048 .0078125 32.000
25 * 32 1024 .015625 64.000
26 * 64 512 .03125 128.000
27 * 128 256 .0625 256.000
28 * 256 128 .125 512.000
29 */
30
31#include <linux/kernel.h>
32#include <linux/interrupt.h>
33#include <linux/module.h>
34#include <asm/geode.h>
35
36#define MFGPT_DEFAULT_IRQ 7
37
38static struct mfgpt_timer_t {
39 unsigned int avail:1;
40} mfgpt_timers[MFGPT_MAX_TIMERS];
41
42/* Selected from the table above */
43
44#define MFGPT_DIVISOR 16
45#define MFGPT_SCALE 4 /* divisor = 2^(scale) */
46#define MFGPT_HZ (32768 / MFGPT_DIVISOR)
47#define MFGPT_PERIODIC (MFGPT_HZ / HZ)
48
49/* Allow for disabling of MFGPTs */
50static int disable;
51static int __init mfgpt_disable(char *s)
52{
53 disable = 1;
54 return 1;
55}
56__setup("nomfgpt", mfgpt_disable);
57
58/* Reset the MFGPT timers. This is required by some broken BIOSes which already
59 * do the same and leave the system in an unstable state. TinyBIOS 0.98 is
60 * affected at least (0.99 is OK with MFGPT workaround left to off).
61 */
62static int __init mfgpt_fix(char *s)
63{
64 u32 val, dummy;
65
66 /* The following undocumented bit resets the MFGPT timers */
67 val = 0xFF; dummy = 0;
68 wrmsr(MSR_MFGPT_SETUP, val, dummy);
69 return 1;
70}
71__setup("mfgptfix", mfgpt_fix);
72
73/*
74 * Check whether any MFGPTs are available for the kernel to use. In most
75 * cases, firmware that uses AMD's VSA code will claim all timers during
76 * bootup; we certainly don't want to take them if they're already in use.
77 * In other cases (such as with VSAless OpenFirmware), the system firmware
78 * leaves timers available for us to use.
79 */
80
81
82static int timers = -1;
83
84static void geode_mfgpt_detect(void)
85{
86 int i;
87 u16 val;
88
89 timers = 0;
90
91 if (disable) {
92 printk(KERN_INFO "geode-mfgpt: MFGPT support is disabled\n");
93 goto done;
94 }
95
96 if (!geode_get_dev_base(GEODE_DEV_MFGPT)) {
97 printk(KERN_INFO "geode-mfgpt: MFGPT LBAR is not set up\n");
98 goto done;
99 }
100
101 for (i = 0; i < MFGPT_MAX_TIMERS; i++) {
102 val = geode_mfgpt_read(i, MFGPT_REG_SETUP);
103 if (!(val & MFGPT_SETUP_SETUP)) {
104 mfgpt_timers[i].avail = 1;
105 timers++;
106 }
107 }
108
109done:
110 printk(KERN_INFO "geode-mfgpt: %d MFGPT timers available.\n", timers);
111}
112
113int geode_mfgpt_toggle_event(int timer, int cmp, int event, int enable)
114{
115 u32 msr, mask, value, dummy;
116 int shift = (cmp == MFGPT_CMP1) ? 0 : 8;
117
118 if (timer < 0 || timer >= MFGPT_MAX_TIMERS)
119 return -EIO;
120
121 /*
122 * The register maps for these are described in sections 6.17.1.x of
123 * the AMD Geode CS5536 Companion Device Data Book.
124 */
125 switch (event) {
126 case MFGPT_EVENT_RESET:
127 /*
128 * XXX: According to the docs, we cannot reset timers above
129 * 6; that is, resets for 7 and 8 will be ignored. Is this
130 * a problem? -dilinger
131 */
132 msr = MSR_MFGPT_NR;
133 mask = 1 << (timer + 24);
134 break;
135
136 case MFGPT_EVENT_NMI:
137 msr = MSR_MFGPT_NR;
138 mask = 1 << (timer + shift);
139 break;
140
141 case MFGPT_EVENT_IRQ:
142 msr = MSR_MFGPT_IRQ;
143 mask = 1 << (timer + shift);
144 break;
145
146 default:
147 return -EIO;
148 }
149
150 rdmsr(msr, value, dummy);
151
152 if (enable)
153 value |= mask;
154 else
155 value &= ~mask;
156
157 wrmsr(msr, value, dummy);
158 return 0;
159}
160EXPORT_SYMBOL_GPL(geode_mfgpt_toggle_event);
161
162int geode_mfgpt_set_irq(int timer, int cmp, int *irq, int enable)
163{
164 u32 zsel, lpc, dummy;
165 int shift;
166
167 if (timer < 0 || timer >= MFGPT_MAX_TIMERS)
168 return -EIO;
169
170 /*
171 * Unfortunately, MFGPTs come in pairs sharing their IRQ lines. If VSA
172 * is using the same CMP of the timer's Siamese twin, the IRQ is set to
173 * 2, and we mustn't use nor change it.
174 * XXX: Likewise, 2 Linux drivers might clash if the 2nd overwrites the
175 * IRQ of the 1st. This can only happen when forcing an IRQ; calling this
176 * with *irq==0 is safe. Currently there _are_ no 2 drivers.
177 */
178 rdmsr(MSR_PIC_ZSEL_LOW, zsel, dummy);
179 shift = ((cmp == MFGPT_CMP1 ? 0 : 4) + timer % 4) * 4;
180 if (((zsel >> shift) & 0xF) == 2)
181 return -EIO;
182
183 /* Choose IRQ: if none supplied, keep IRQ already set or use default */
184 if (!*irq)
185 *irq = (zsel >> shift) & 0xF;
186 if (!*irq)
187 *irq = MFGPT_DEFAULT_IRQ;
188
189 /* Can't use IRQ if it's 0 (=disabled), 2, or routed to LPC */
190 if (*irq < 1 || *irq == 2 || *irq > 15)
191 return -EIO;
192 rdmsr(MSR_PIC_IRQM_LPC, lpc, dummy);
193 if (lpc & (1 << *irq))
194 return -EIO;
195
196 /* All chosen and checked - go for it */
197 if (geode_mfgpt_toggle_event(timer, cmp, MFGPT_EVENT_IRQ, enable))
198 return -EIO;
199 if (enable) {
200 zsel = (zsel & ~(0xF << shift)) | (*irq << shift);
201 wrmsr(MSR_PIC_ZSEL_LOW, zsel, dummy);
202 }
203
204 return 0;
205}
206
207static int mfgpt_get(int timer)
208{
209 mfgpt_timers[timer].avail = 0;
210 printk(KERN_INFO "geode-mfgpt: Registered timer %d\n", timer);
211 return timer;
212}
213
214int geode_mfgpt_alloc_timer(int timer, int domain)
215{
216 int i;
217
218 if (timers == -1) {
219 /* timers haven't been detected yet */
220 geode_mfgpt_detect();
221 }
222
223 if (!timers)
224 return -1;
225
226 if (timer >= MFGPT_MAX_TIMERS)
227 return -1;
228
229 if (timer < 0) {
230 /* Try to find an available timer */
231 for (i = 0; i < MFGPT_MAX_TIMERS; i++) {
232 if (mfgpt_timers[i].avail)
233 return mfgpt_get(i);
234
235 if (i == 5 && domain == MFGPT_DOMAIN_WORKING)
236 break;
237 }
238 } else {
239 /* If they requested a specific timer, try to honor that */
240 if (mfgpt_timers[timer].avail)
241 return mfgpt_get(timer);
242 }
243
244 /* No timers available - too bad */
245 return -1;
246}
247EXPORT_SYMBOL_GPL(geode_mfgpt_alloc_timer);
248
249
250#ifdef CONFIG_GEODE_MFGPT_TIMER
251
252/*
253 * The MFGPT timers on the CS5536 provide us with suitable timers to use
254 * as clock event sources - not as good as a HPET or APIC, but certainly
255 * better than the PIT. This isn't a general purpose MFGPT driver, but
256 * a simplified one designed specifically to act as a clock event source.
257 * For full details about the MFGPT, please consult the CS5536 data sheet.
258 */
259
260#include <linux/clocksource.h>
261#include <linux/clockchips.h>
262
263static unsigned int mfgpt_tick_mode = CLOCK_EVT_MODE_SHUTDOWN;
264static u16 mfgpt_event_clock;
265
266static int irq;
267static int __init mfgpt_setup(char *str)
268{
269 get_option(&str, &irq);
270 return 1;
271}
272__setup("mfgpt_irq=", mfgpt_setup);
273
274static void mfgpt_disable_timer(u16 clock)
275{
276 /* avoid races by clearing CMP1 and CMP2 unconditionally */
277 geode_mfgpt_write(clock, MFGPT_REG_SETUP, (u16) ~MFGPT_SETUP_CNTEN |
278 MFGPT_SETUP_CMP1 | MFGPT_SETUP_CMP2);
279}
280
281static int mfgpt_next_event(unsigned long, struct clock_event_device *);
282static void mfgpt_set_mode(enum clock_event_mode, struct clock_event_device *);
283
284static struct clock_event_device mfgpt_clockevent = {
285 .name = "mfgpt-timer",
286 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
287 .set_mode = mfgpt_set_mode,
288 .set_next_event = mfgpt_next_event,
289 .rating = 250,
290 .cpumask = cpu_all_mask,
291 .shift = 32
292};
293
294static void mfgpt_start_timer(u16 delta)
295{
296 geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_CMP2, (u16) delta);
297 geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_COUNTER, 0);
298
299 geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_SETUP,
300 MFGPT_SETUP_CNTEN | MFGPT_SETUP_CMP2);
301}
302
303static void mfgpt_set_mode(enum clock_event_mode mode,
304 struct clock_event_device *evt)
305{
306 mfgpt_disable_timer(mfgpt_event_clock);
307
308 if (mode == CLOCK_EVT_MODE_PERIODIC)
309 mfgpt_start_timer(MFGPT_PERIODIC);
310
311 mfgpt_tick_mode = mode;
312}
313
314static int mfgpt_next_event(unsigned long delta, struct clock_event_device *evt)
315{
316 mfgpt_start_timer(delta);
317 return 0;
318}
319
320static irqreturn_t mfgpt_tick(int irq, void *dev_id)
321{
322 u16 val = geode_mfgpt_read(mfgpt_event_clock, MFGPT_REG_SETUP);
323
324 /* See if the interrupt was for us */
325 if (!(val & (MFGPT_SETUP_SETUP | MFGPT_SETUP_CMP2 | MFGPT_SETUP_CMP1)))
326 return IRQ_NONE;
327
328 /* Turn off the clock (and clear the event) */
329 mfgpt_disable_timer(mfgpt_event_clock);
330
331 if (mfgpt_tick_mode == CLOCK_EVT_MODE_SHUTDOWN)
332 return IRQ_HANDLED;
333
334 /* Clear the counter */
335 geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_COUNTER, 0);
336
337 /* Restart the clock in periodic mode */
338
339 if (mfgpt_tick_mode == CLOCK_EVT_MODE_PERIODIC) {
340 geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_SETUP,
341 MFGPT_SETUP_CNTEN | MFGPT_SETUP_CMP2);
342 }
343
344 mfgpt_clockevent.event_handler(&mfgpt_clockevent);
345 return IRQ_HANDLED;
346}
347
348static struct irqaction mfgptirq = {
349 .handler = mfgpt_tick,
350 .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_TIMER,
351 .name = "mfgpt-timer"
352};
353
354int __init mfgpt_timer_setup(void)
355{
356 int timer, ret;
357 u16 val;
358
359 timer = geode_mfgpt_alloc_timer(MFGPT_TIMER_ANY, MFGPT_DOMAIN_WORKING);
360 if (timer < 0) {
361 printk(KERN_ERR
362 "mfgpt-timer: Could not allocate a MFPGT timer\n");
363 return -ENODEV;
364 }
365
366 mfgpt_event_clock = timer;
367
368 /* Set up the IRQ on the MFGPT side */
369 if (geode_mfgpt_setup_irq(mfgpt_event_clock, MFGPT_CMP2, &irq)) {
370 printk(KERN_ERR "mfgpt-timer: Could not set up IRQ %d\n", irq);
371 return -EIO;
372 }
373
374 /* And register it with the kernel */
375 ret = setup_irq(irq, &mfgptirq);
376
377 if (ret) {
378 printk(KERN_ERR
379 "mfgpt-timer: Unable to set up the interrupt.\n");
380 goto err;
381 }
382
383 /* Set the clock scale and enable the event mode for CMP2 */
384 val = MFGPT_SCALE | (3 << 8);
385
386 geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_SETUP, val);
387
388 /* Set up the clock event */
389 mfgpt_clockevent.mult = div_sc(MFGPT_HZ, NSEC_PER_SEC,
390 mfgpt_clockevent.shift);
391 mfgpt_clockevent.min_delta_ns = clockevent_delta2ns(0xF,
392 &mfgpt_clockevent);
393 mfgpt_clockevent.max_delta_ns = clockevent_delta2ns(0xFFFE,
394 &mfgpt_clockevent);
395
396 printk(KERN_INFO
397 "mfgpt-timer: Registering MFGPT timer %d as a clock event, using IRQ %d\n",
398 timer, irq);
399 clockevents_register_device(&mfgpt_clockevent);
400
401 return 0;
402
403err:
404 geode_mfgpt_release_irq(mfgpt_event_clock, MFGPT_CMP2, &irq);
405 printk(KERN_ERR
406 "mfgpt-timer: Unable to set up the MFGPT clock source\n");
407 return -EIO;
408}
409
410#endif
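
The whole mfgpt_32.c driver is deleted here; the Geode MFGPT support was reworked as a cs5535 driver outside arch/x86 around this time, which the olpc.c hunk further down hints at with its switch to cs5535_has_vsa2(). The divisor table in the deleted header comment follows directly from the 32.768 kHz input clock: each divisor gives Hz = 32768 / divisor, and the listed minimum and maximum intervals correspond to 16 ticks and 65536 ticks of that rate (the 16-bit counter range; the clockevent setup later in the file programs 0xF and 0xFFFE as its practical limits). The stand-alone program below reproduces the table under those assumptions.

#include <stdio.h>

int main(void)
{
        int divisor;

        printf("%8s %8s %16s %16s\n",
               "Divisor", "Hz", "Min Delta (s)", "Max Delta (s)");
        for (divisor = 1; divisor <= 256; divisor <<= 1) {
                double hz = 32768.0 / divisor;

                /* min interval: 16 ticks; max interval: 65536 ticks */
                printf("%8d %8.0f %16.11f %16.3f\n",
                       divisor, hz, 16.0 / hz, 65536.0 / hz);
        }
        return 0;
}
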
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index 366baa179913..e1af7c055c7d 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -13,6 +13,9 @@
13 * Licensed under the terms of the GNU General Public 13 * Licensed under the terms of the GNU General Public
14 * License version 2. See file COPYING for details. 14 * License version 2. See file COPYING for details.
15 */ 15 */
16
17#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
18
16#include <linux/firmware.h> 19#include <linux/firmware.h>
17#include <linux/pci_ids.h> 20#include <linux/pci_ids.h>
18#include <linux/uaccess.h> 21#include <linux/uaccess.h>
@@ -76,12 +79,12 @@ static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
76 79
77 memset(csig, 0, sizeof(*csig)); 80 memset(csig, 0, sizeof(*csig));
78 if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) { 81 if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) {
79 printk(KERN_WARNING "microcode: CPU%d: AMD CPU family 0x%x not " 82 pr_warning("microcode: CPU%d: AMD CPU family 0x%x not "
80 "supported\n", cpu, c->x86); 83 "supported\n", cpu, c->x86);
81 return -1; 84 return -1;
82 } 85 }
83 rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy); 86 rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy);
84 printk(KERN_INFO "microcode: CPU%d: patch_level=0x%x\n", cpu, csig->rev); 87 pr_info("CPU%d: patch_level=0x%x\n", cpu, csig->rev);
85 return 0; 88 return 0;
86} 89}
87 90
@@ -103,23 +106,16 @@ static int get_matching_microcode(int cpu, void *mc, int rev)
103 i++; 106 i++;
104 } 107 }
105 108
106 if (!equiv_cpu_id) { 109 if (!equiv_cpu_id)
107 printk(KERN_WARNING "microcode: CPU%d: cpu revision "
108 "not listed in equivalent cpu table\n", cpu);
109 return 0; 110 return 0;
110 }
111 111
112 if (mc_header->processor_rev_id != equiv_cpu_id) { 112 if (mc_header->processor_rev_id != equiv_cpu_id)
113 printk(KERN_ERR "microcode: CPU%d: patch mismatch "
114 "(processor_rev_id: %x, equiv_cpu_id: %x)\n",
115 cpu, mc_header->processor_rev_id, equiv_cpu_id);
116 return 0; 113 return 0;
117 }
118 114
119 /* ucode might be chipset specific -- currently we don't support this */ 115 /* ucode might be chipset specific -- currently we don't support this */
120 if (mc_header->nb_dev_id || mc_header->sb_dev_id) { 116 if (mc_header->nb_dev_id || mc_header->sb_dev_id) {
121 printk(KERN_ERR "microcode: CPU%d: loading of chipset " 117 pr_err("CPU%d: loading of chipset specific code not yet supported\n",
122 "specific code not yet supported\n", cpu); 118 cpu);
123 return 0; 119 return 0;
124 } 120 }
125 121
@@ -148,14 +144,12 @@ static int apply_microcode_amd(int cpu)
148 144
149 /* check current patch id and patch's id for match */ 145 /* check current patch id and patch's id for match */
150 if (rev != mc_amd->hdr.patch_id) { 146 if (rev != mc_amd->hdr.patch_id) {
151 printk(KERN_ERR "microcode: CPU%d: update failed " 147 pr_err("CPU%d: update failed (for patch_level=0x%x)\n",
152 "(for patch_level=0x%x)\n", cpu, mc_amd->hdr.patch_id); 148 cpu, mc_amd->hdr.patch_id);
153 return -1; 149 return -1;
154 } 150 }
155 151
156 printk(KERN_INFO "microcode: CPU%d: updated (new patch_level=0x%x)\n", 152 pr_info("CPU%d: updated (new patch_level=0x%x)\n", cpu, rev);
157 cpu, rev);
158
159 uci->cpu_sig.rev = rev; 153 uci->cpu_sig.rev = rev;
160 154
161 return 0; 155 return 0;
@@ -178,18 +172,14 @@ get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size)
178 return NULL; 172 return NULL;
179 173
180 if (section_hdr[0] != UCODE_UCODE_TYPE) { 174 if (section_hdr[0] != UCODE_UCODE_TYPE) {
181 printk(KERN_ERR "microcode: error: invalid type field in " 175 pr_err("error: invalid type field in container file section header\n");
182 "container file section header\n");
183 return NULL; 176 return NULL;
184 } 177 }
185 178
186 total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8)); 179 total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8));
187 180
188 printk(KERN_DEBUG "microcode: size %u, total_size %u\n",
189 size, total_size);
190
191 if (total_size > size || total_size > UCODE_MAX_SIZE) { 181 if (total_size > size || total_size > UCODE_MAX_SIZE) {
192 printk(KERN_ERR "microcode: error: size mismatch\n"); 182 pr_err("error: size mismatch\n");
193 return NULL; 183 return NULL;
194 } 184 }
195 185
@@ -218,15 +208,13 @@ static int install_equiv_cpu_table(const u8 *buf)
218 size = buf_pos[2]; 208 size = buf_pos[2];
219 209
220 if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE || !size) { 210 if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE || !size) {
221 printk(KERN_ERR "microcode: error: invalid type field in " 211 pr_err("error: invalid type field in container file section header\n");
222 "container file section header\n");
223 return 0; 212 return 0;
224 } 213 }
225 214
226 equiv_cpu_table = (struct equiv_cpu_entry *) vmalloc(size); 215 equiv_cpu_table = (struct equiv_cpu_entry *) vmalloc(size);
227 if (!equiv_cpu_table) { 216 if (!equiv_cpu_table) {
228 printk(KERN_ERR "microcode: failed to allocate " 217 pr_err("failed to allocate equivalent CPU table\n");
229 "equivalent CPU table\n");
230 return 0; 218 return 0;
231 } 219 }
232 220
@@ -259,8 +247,7 @@ generic_load_microcode(int cpu, const u8 *data, size_t size)
259 247
260 offset = install_equiv_cpu_table(ucode_ptr); 248 offset = install_equiv_cpu_table(ucode_ptr);
261 if (!offset) { 249 if (!offset) {
262 printk(KERN_ERR "microcode: failed to create " 250 pr_err("failed to create equivalent cpu table\n");
263 "equivalent cpu table\n");
264 return UCODE_ERROR; 251 return UCODE_ERROR;
265 } 252 }
266 253
@@ -291,8 +278,7 @@ generic_load_microcode(int cpu, const u8 *data, size_t size)
291 if (!leftover) { 278 if (!leftover) {
292 vfree(uci->mc); 279 vfree(uci->mc);
293 uci->mc = new_mc; 280 uci->mc = new_mc;
294 pr_debug("microcode: CPU%d found a matching microcode " 281 pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n",
295 "update with version 0x%x (current=0x%x)\n",
296 cpu, new_rev, uci->cpu_sig.rev); 282 cpu, new_rev, uci->cpu_sig.rev);
297 } else { 283 } else {
298 vfree(new_mc); 284 vfree(new_mc);
@@ -317,6 +303,12 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device)
317 return UCODE_NFOUND; 303 return UCODE_NFOUND;
318 } 304 }
319 305
306 if (*(u32 *)firmware->data != UCODE_MAGIC) {
307 pr_err("invalid UCODE_MAGIC (0x%08x)\n",
308 *(u32 *)firmware->data);
309 return UCODE_ERROR;
310 }
311
320 ret = generic_load_microcode(cpu, firmware->data, firmware->size); 312 ret = generic_load_microcode(cpu, firmware->data, firmware->size);
321 313
322 release_firmware(firmware); 314 release_firmware(firmware);
@@ -327,8 +319,7 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device)
327static enum ucode_state 319static enum ucode_state
328request_microcode_user(int cpu, const void __user *buf, size_t size) 320request_microcode_user(int cpu, const void __user *buf, size_t size)
329{ 321{
330 printk(KERN_INFO "microcode: AMD microcode update via " 322 pr_info("AMD microcode update via /dev/cpu/microcode not supported\n");
331 "/dev/cpu/microcode not supported\n");
332 return UCODE_ERROR; 323 return UCODE_ERROR;
333} 324}
334 325
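
Most of the microcode_amd.c churn comes from the new pr_fmt() definition at the top of the file: once pr_fmt() is defined before the printk helpers are pulled in, every pr_err()/pr_info()/pr_debug() call prefixes its format string with KBUILD_MODNAME automatically, so the literal "microcode: " strings can be dropped from the call sites and long format strings joined onto one line. A user-space approximation of the mechanism is shown below; KBUILD_MODNAME is normally supplied by the kernel build system and is hard-coded here, and the pr_* wrappers are simplified stand-ins for the real printk macros.

#include <stdio.h>

#define KBUILD_MODNAME "microcode"
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

/* Simplified stand-ins: the kernel's macros in printk.h apply pr_fmt()
 * the same way, via compile-time string concatenation. */
#define pr_info(fmt, ...) printf(pr_fmt(fmt), ##__VA_ARGS__)
#define pr_err(fmt, ...)  fprintf(stderr, pr_fmt(fmt), ##__VA_ARGS__)

int main(void)
{
        pr_info("CPU%d: patch_level=0x%x\n", 0, 0x1000086);
        pr_err("CPU%d: update failed (for patch_level=0x%x)\n", 0, 0x1000086);
        return 0;
}

The same pattern is applied to microcode_core.c and microcode_intel.c in the hunks that follow.
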
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index 378e9a8f1bf8..cceb5bc3c3c2 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -70,10 +70,12 @@
70 * Fix sigmatch() macro to handle old CPUs with pf == 0. 70 * Fix sigmatch() macro to handle old CPUs with pf == 0.
71 * Thanks to Stuart Swales for pointing out this bug. 71 * Thanks to Stuart Swales for pointing out this bug.
72 */ 72 */
73
74#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
75
73#include <linux/platform_device.h> 76#include <linux/platform_device.h>
74#include <linux/miscdevice.h> 77#include <linux/miscdevice.h>
75#include <linux/capability.h> 78#include <linux/capability.h>
76#include <linux/smp_lock.h>
77#include <linux/kernel.h> 79#include <linux/kernel.h>
78#include <linux/module.h> 80#include <linux/module.h>
79#include <linux/mutex.h> 81#include <linux/mutex.h>
@@ -201,7 +203,6 @@ static int do_microcode_update(const void __user *buf, size_t size)
201 203
202static int microcode_open(struct inode *unused1, struct file *unused2) 204static int microcode_open(struct inode *unused1, struct file *unused2)
203{ 205{
204 cycle_kernel_lock();
205 return capable(CAP_SYS_RAWIO) ? 0 : -EPERM; 206 return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
206} 207}
207 208
@@ -211,7 +212,7 @@ static ssize_t microcode_write(struct file *file, const char __user *buf,
211 ssize_t ret = -EINVAL; 212 ssize_t ret = -EINVAL;
212 213
213 if ((len >> PAGE_SHIFT) > totalram_pages) { 214 if ((len >> PAGE_SHIFT) > totalram_pages) {
214 pr_err("microcode: too much data (max %ld pages)\n", totalram_pages); 215 pr_err("too much data (max %ld pages)\n", totalram_pages);
215 return ret; 216 return ret;
216 } 217 }
217 218
@@ -246,7 +247,7 @@ static int __init microcode_dev_init(void)
246 247
247 error = misc_register(&microcode_dev); 248 error = misc_register(&microcode_dev);
248 if (error) { 249 if (error) {
249 pr_err("microcode: can't misc_register on minor=%d\n", MICROCODE_MINOR); 250 pr_err("can't misc_register on minor=%d\n", MICROCODE_MINOR);
250 return error; 251 return error;
251 } 252 }
252 253
@@ -361,7 +362,7 @@ static enum ucode_state microcode_resume_cpu(int cpu)
361 if (!uci->mc) 362 if (!uci->mc)
362 return UCODE_NFOUND; 363 return UCODE_NFOUND;
363 364
364 pr_debug("microcode: CPU%d updated upon resume\n", cpu); 365 pr_debug("CPU%d updated upon resume\n", cpu);
365 apply_microcode_on_target(cpu); 366 apply_microcode_on_target(cpu);
366 367
367 return UCODE_OK; 368 return UCODE_OK;
@@ -381,7 +382,7 @@ static enum ucode_state microcode_init_cpu(int cpu)
381 ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev); 382 ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev);
382 383
383 if (ustate == UCODE_OK) { 384 if (ustate == UCODE_OK) {
384 pr_debug("microcode: CPU%d updated upon init\n", cpu); 385 pr_debug("CPU%d updated upon init\n", cpu);
385 apply_microcode_on_target(cpu); 386 apply_microcode_on_target(cpu);
386 } 387 }
387 388
@@ -408,7 +409,7 @@ static int mc_sysdev_add(struct sys_device *sys_dev)
408 if (!cpu_online(cpu)) 409 if (!cpu_online(cpu))
409 return 0; 410 return 0;
410 411
411 pr_debug("microcode: CPU%d added\n", cpu); 412 pr_debug("CPU%d added\n", cpu);
412 413
413 err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group); 414 err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group);
414 if (err) 415 if (err)
@@ -427,7 +428,7 @@ static int mc_sysdev_remove(struct sys_device *sys_dev)
427 if (!cpu_online(cpu)) 428 if (!cpu_online(cpu))
428 return 0; 429 return 0;
429 430
430 pr_debug("microcode: CPU%d removed\n", cpu); 431 pr_debug("CPU%d removed\n", cpu);
431 microcode_fini_cpu(cpu); 432 microcode_fini_cpu(cpu);
432 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); 433 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
433 return 0; 434 return 0;
@@ -475,15 +476,15 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
475 microcode_update_cpu(cpu); 476 microcode_update_cpu(cpu);
476 case CPU_DOWN_FAILED: 477 case CPU_DOWN_FAILED:
477 case CPU_DOWN_FAILED_FROZEN: 478 case CPU_DOWN_FAILED_FROZEN:
478 pr_debug("microcode: CPU%d added\n", cpu); 479 pr_debug("CPU%d added\n", cpu);
479 if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group)) 480 if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group))
480 pr_err("microcode: Failed to create group for CPU%d\n", cpu); 481 pr_err("Failed to create group for CPU%d\n", cpu);
481 break; 482 break;
482 case CPU_DOWN_PREPARE: 483 case CPU_DOWN_PREPARE:
483 case CPU_DOWN_PREPARE_FROZEN: 484 case CPU_DOWN_PREPARE_FROZEN:
484 /* Suspend is in progress, only remove the interface */ 485 /* Suspend is in progress, only remove the interface */
485 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); 486 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
486 pr_debug("microcode: CPU%d removed\n", cpu); 487 pr_debug("CPU%d removed\n", cpu);
487 break; 488 break;
488 case CPU_DEAD: 489 case CPU_DEAD:
489 case CPU_UP_CANCELED_FROZEN: 490 case CPU_UP_CANCELED_FROZEN:
@@ -509,7 +510,7 @@ static int __init microcode_init(void)
509 microcode_ops = init_amd_microcode(); 510 microcode_ops = init_amd_microcode();
510 511
511 if (!microcode_ops) { 512 if (!microcode_ops) {
512 pr_err("microcode: no support for this CPU vendor\n"); 513 pr_err("no support for this CPU vendor\n");
513 return -ENODEV; 514 return -ENODEV;
514 } 515 }
515 516
@@ -540,8 +541,7 @@ static int __init microcode_init(void)
540 register_hotcpu_notifier(&mc_cpu_notifier); 541 register_hotcpu_notifier(&mc_cpu_notifier);
541 542
542 pr_info("Microcode Update Driver: v" MICROCODE_VERSION 543 pr_info("Microcode Update Driver: v" MICROCODE_VERSION
543 " <tigran@aivazian.fsnet.co.uk>," 544 " <tigran@aivazian.fsnet.co.uk>, Peter Oruba\n");
544 " Peter Oruba\n");
545 545
546 return 0; 546 return 0;
547} 547}
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
index 0d334ddd0a96..ebd193e476ca 100644
--- a/arch/x86/kernel/microcode_intel.c
+++ b/arch/x86/kernel/microcode_intel.c
@@ -70,6 +70,9 @@
70 * Fix sigmatch() macro to handle old CPUs with pf == 0. 70 * Fix sigmatch() macro to handle old CPUs with pf == 0.
71 * Thanks to Stuart Swales for pointing out this bug. 71 * Thanks to Stuart Swales for pointing out this bug.
72 */ 72 */
73
74#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
75
73#include <linux/firmware.h> 76#include <linux/firmware.h>
74#include <linux/uaccess.h> 77#include <linux/uaccess.h>
75#include <linux/kernel.h> 78#include <linux/kernel.h>
@@ -146,8 +149,7 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
146 149
147 if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 || 150 if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
148 cpu_has(c, X86_FEATURE_IA64)) { 151 cpu_has(c, X86_FEATURE_IA64)) {
149 printk(KERN_ERR "microcode: CPU%d not a capable Intel " 152 pr_err("CPU%d not a capable Intel processor\n", cpu_num);
150 "processor\n", cpu_num);
151 return -1; 153 return -1;
152 } 154 }
153 155
@@ -165,8 +167,8 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
165 /* get the current revision from MSR 0x8B */ 167 /* get the current revision from MSR 0x8B */
166 rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev); 168 rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev);
167 169
168 printk(KERN_INFO "microcode: CPU%d sig=0x%x, pf=0x%x, revision=0x%x\n", 170 pr_info("CPU%d sig=0x%x, pf=0x%x, revision=0x%x\n",
169 cpu_num, csig->sig, csig->pf, csig->rev); 171 cpu_num, csig->sig, csig->pf, csig->rev);
170 172
171 return 0; 173 return 0;
172} 174}
@@ -194,28 +196,24 @@ static int microcode_sanity_check(void *mc)
194 data_size = get_datasize(mc_header); 196 data_size = get_datasize(mc_header);
195 197
196 if (data_size + MC_HEADER_SIZE > total_size) { 198 if (data_size + MC_HEADER_SIZE > total_size) {
197 printk(KERN_ERR "microcode: error! " 199 pr_err("error! Bad data size in microcode data file\n");
198 "Bad data size in microcode data file\n");
199 return -EINVAL; 200 return -EINVAL;
200 } 201 }
201 202
202 if (mc_header->ldrver != 1 || mc_header->hdrver != 1) { 203 if (mc_header->ldrver != 1 || mc_header->hdrver != 1) {
203 printk(KERN_ERR "microcode: error! " 204 pr_err("error! Unknown microcode update format\n");
204 "Unknown microcode update format\n");
205 return -EINVAL; 205 return -EINVAL;
206 } 206 }
207 ext_table_size = total_size - (MC_HEADER_SIZE + data_size); 207 ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
208 if (ext_table_size) { 208 if (ext_table_size) {
209 if ((ext_table_size < EXT_HEADER_SIZE) 209 if ((ext_table_size < EXT_HEADER_SIZE)
210 || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) { 210 || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
211 printk(KERN_ERR "microcode: error! " 211 pr_err("error! Small exttable size in microcode data file\n");
212 "Small exttable size in microcode data file\n");
213 return -EINVAL; 212 return -EINVAL;
214 } 213 }
215 ext_header = mc + MC_HEADER_SIZE + data_size; 214 ext_header = mc + MC_HEADER_SIZE + data_size;
216 if (ext_table_size != exttable_size(ext_header)) { 215 if (ext_table_size != exttable_size(ext_header)) {
217 printk(KERN_ERR "microcode: error! " 216 pr_err("error! Bad exttable size in microcode data file\n");
218 "Bad exttable size in microcode data file\n");
219 return -EFAULT; 217 return -EFAULT;
220 } 218 }
221 ext_sigcount = ext_header->count; 219 ext_sigcount = ext_header->count;
@@ -230,8 +228,7 @@ static int microcode_sanity_check(void *mc)
230 while (i--) 228 while (i--)
231 ext_table_sum += ext_tablep[i]; 229 ext_table_sum += ext_tablep[i];
232 if (ext_table_sum) { 230 if (ext_table_sum) {
233 printk(KERN_WARNING "microcode: aborting, " 231 pr_warning("aborting, bad extended signature table checksum\n");
234 "bad extended signature table checksum\n");
235 return -EINVAL; 232 return -EINVAL;
236 } 233 }
237 } 234 }
@@ -242,7 +239,7 @@ static int microcode_sanity_check(void *mc)
242 while (i--) 239 while (i--)
243 orig_sum += ((int *)mc)[i]; 240 orig_sum += ((int *)mc)[i];
244 if (orig_sum) { 241 if (orig_sum) {
245 printk(KERN_ERR "microcode: aborting, bad checksum\n"); 242 pr_err("aborting, bad checksum\n");
246 return -EINVAL; 243 return -EINVAL;
247 } 244 }
248 if (!ext_table_size) 245 if (!ext_table_size)
@@ -255,7 +252,7 @@ static int microcode_sanity_check(void *mc)
255 - (mc_header->sig + mc_header->pf + mc_header->cksum) 252 - (mc_header->sig + mc_header->pf + mc_header->cksum)
256 + (ext_sig->sig + ext_sig->pf + ext_sig->cksum); 253 + (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
257 if (sum) { 254 if (sum) {
258 printk(KERN_ERR "microcode: aborting, bad checksum\n"); 255 pr_err("aborting, bad checksum\n");
259 return -EINVAL; 256 return -EINVAL;
260 } 257 }
261 } 258 }
@@ -327,13 +324,11 @@ static int apply_microcode(int cpu)
327 rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); 324 rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
328 325
329 if (val[1] != mc_intel->hdr.rev) { 326 if (val[1] != mc_intel->hdr.rev) {
330 printk(KERN_ERR "microcode: CPU%d update " 327 pr_err("CPU%d update to revision 0x%x failed\n",
331 "to revision 0x%x failed\n", 328 cpu_num, mc_intel->hdr.rev);
332 cpu_num, mc_intel->hdr.rev);
333 return -1; 329 return -1;
334 } 330 }
335 printk(KERN_INFO "microcode: CPU%d updated to revision " 331 pr_info("CPU%d updated to revision 0x%x, date = %04x-%02x-%02x \n",
336 "0x%x, date = %04x-%02x-%02x \n",
337 cpu_num, val[1], 332 cpu_num, val[1],
338 mc_intel->hdr.date & 0xffff, 333 mc_intel->hdr.date & 0xffff,
339 mc_intel->hdr.date >> 24, 334 mc_intel->hdr.date >> 24,
@@ -362,8 +357,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
362 357
363 mc_size = get_totalsize(&mc_header); 358 mc_size = get_totalsize(&mc_header);
364 if (!mc_size || mc_size > leftover) { 359 if (!mc_size || mc_size > leftover) {
365 printk(KERN_ERR "microcode: error!" 360 pr_err("error! Bad data in microcode data file\n");
366 "Bad data in microcode data file\n");
367 break; 361 break;
368 } 362 }
369 363
@@ -405,9 +399,8 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
405 vfree(uci->mc); 399 vfree(uci->mc);
406 uci->mc = (struct microcode_intel *)new_mc; 400 uci->mc = (struct microcode_intel *)new_mc;
407 401
408 pr_debug("microcode: CPU%d found a matching microcode update with" 402 pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n",
409 " version 0x%x (current=0x%x)\n", 403 cpu, new_rev, uci->cpu_sig.rev);
410 cpu, new_rev, uci->cpu_sig.rev);
411out: 404out:
412 return state; 405 return state;
413} 406}
@@ -429,7 +422,7 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device)
429 c->x86, c->x86_model, c->x86_mask); 422 c->x86, c->x86_model, c->x86_mask);
430 423
431 if (request_firmware(&firmware, name, device)) { 424 if (request_firmware(&firmware, name, device)) {
432 pr_debug("microcode: data file %s load failed\n", name); 425 pr_debug("data file %s load failed\n", name);
433 return UCODE_NFOUND; 426 return UCODE_NFOUND;
434 } 427 }
435 428
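
The microcode_intel.c hunks are again mostly pr_fmt()-driven message cleanups, but they sit inside microcode_sanity_check(), whose rule is worth spelling out: the update image, summed as 32-bit words with the header included, must total zero, because the header's checksum field is chosen to cancel everything else (the extended signature table is verified the same way). A stand-alone sketch of that rule follows, using a made-up four-word image rather than real microcode data.

#include <stdint.h>
#include <stdio.h>

static int checksum_ok(const uint32_t *mc, size_t words)
{
        uint32_t sum = 0;
        size_t i;

        for (i = 0; i < words; i++)
                sum += mc[i];
        return sum == 0;        /* non-zero sum => corrupt image */
}

int main(void)
{
        uint32_t image[4] = { 0x00000001, 0x20100101, 0x000000aa, 0 };

        /* Pick the last word so the 32-bit sum wraps to zero. */
        image[3] = 0u - (image[0] + image[1] + image[2]);
        printf("checksum ok: %d\n", checksum_ok(image, 4));
        return 0;
}
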
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 35a57c963df9..a2c1edd2d3ac 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -359,13 +359,6 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
 		x86_init.mpparse.mpc_record(1);
 	}
 
-#ifdef CONFIG_X86_BIGSMP
-	generic_bigsmp_probe();
-#endif
-
-	if (apic->setup_apic_routing)
-		apic->setup_apic_routing();
-
 	if (!num_processors)
 		printk(KERN_ERR "MPTABLE: no processors registered!\n");
 	return num_processors;
@@ -945,9 +938,6 @@ void __init early_reserve_e820_mpc_new(void)
 {
 	if (enable_update_mptable && alloc_mptable) {
 		u64 startt = 0;
-#ifdef CONFIG_X86_TRAMPOLINE
-		startt = TRAMPOLINE_BASE;
-#endif
 		mpc_new_phys = early_reserve_e820(startt, mpc_new_length, 4);
 	}
 }
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 6a3cefc7dda1..206735ac8cbd 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -172,23 +172,18 @@ static long msr_ioctl(struct file *file, unsigned int ioc, unsigned long arg)
 
 static int msr_open(struct inode *inode, struct file *file)
 {
-	unsigned int cpu = iminor(file->f_path.dentry->d_inode);
-	struct cpuinfo_x86 *c = &cpu_data(cpu);
-	int ret = 0;
+	unsigned int cpu;
+	struct cpuinfo_x86 *c;
 
-	lock_kernel();
 	cpu = iminor(file->f_path.dentry->d_inode);
+	if (cpu >= nr_cpu_ids || !cpu_online(cpu))
+		return -ENXIO; /* No such CPU */
 
-	if (cpu >= nr_cpu_ids || !cpu_online(cpu)) {
-		ret = -ENXIO; /* No such CPU */
-		goto out;
-	}
 	c = &cpu_data(cpu);
 	if (!cpu_has(c, X86_FEATURE_MSR))
-		ret = -EIO; /* MSR not supported */
-out:
-	unlock_kernel();
-	return ret;
+		return -EIO; /* MSR not supported */
+
+	return 0;
 }
 
 /*
@@ -251,7 +246,7 @@ static int __init msr_init(void)
 	int i, err = 0;
 	i = 0;
 
-	if (register_chrdev(MSR_MAJOR, "cpu/msr", &msr_fops)) {
+	if (__register_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr", &msr_fops)) {
 		printk(KERN_ERR "msr: unable to get major %d for msr\n",
 			MSR_MAJOR);
 		err = -EBUSY;
@@ -279,7 +274,7 @@ out_class:
 		msr_device_destroy(i);
 	class_destroy(msr_class);
 out_chrdev:
-	unregister_chrdev(MSR_MAJOR, "cpu/msr");
+	__unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr");
 out:
 	return err;
 }
@@ -290,7 +285,7 @@ static void __exit msr_exit(void)
 	for_each_online_cpu(cpu)
 		msr_device_destroy(cpu);
 	class_destroy(msr_class);
-	unregister_chrdev(MSR_MAJOR, "cpu/msr");
+	__unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr");
 	unregister_hotcpu_notifier(&msr_class_cpu_notifier);
 }
 
diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c
index 4006c522adc7..9d1d263f786f 100644
--- a/arch/x86/kernel/olpc.c
+++ b/arch/x86/kernel/olpc.c
@@ -212,7 +212,7 @@ static int __init olpc_init(void)
 	unsigned char *romsig;
 
 	/* The ioremap check is dangerous; limit what we run it on */
-	if (!is_geode() || geode_has_vsa2())
+	if (!is_geode() || cs5535_has_vsa2())
 		return 0;
 
 	spin_lock_init(&ec_lock);
@@ -244,7 +244,7 @@ static int __init olpc_init(void)
 			(unsigned char *) &olpc_platform_info.ecver, 1);
 
 	/* check to see if the VSA exists */
-	if (geode_has_vsa2())
+	if (cs5535_has_vsa2())
 		olpc_platform_info.flags |= OLPC_F_VSA;
 
 	printk(KERN_INFO "OLPC board revision %s%X (EC=%x)\n",
diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c
index 3a7c5a44082e..676b8c77a976 100644
--- a/arch/x86/kernel/paravirt-spinlocks.c
+++ b/arch/x86/kernel/paravirt-spinlocks.c
@@ -8,9 +8,9 @@
 #include <asm/paravirt.h>
 
 static inline void
-default_spin_lock_flags(raw_spinlock_t *lock, unsigned long flags)
+default_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags)
 {
-	__raw_spin_lock(lock);
+	arch_spin_lock(lock);
 }
 
 struct pv_lock_ops pv_lock_ops = {
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 971a3bec47a8..2bbde6078143 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -31,7 +31,7 @@
31#include <linux/string.h> 31#include <linux/string.h>
32#include <linux/crash_dump.h> 32#include <linux/crash_dump.h>
33#include <linux/dma-mapping.h> 33#include <linux/dma-mapping.h>
34#include <linux/bitops.h> 34#include <linux/bitmap.h>
35#include <linux/pci_ids.h> 35#include <linux/pci_ids.h>
36#include <linux/pci.h> 36#include <linux/pci.h>
37#include <linux/delay.h> 37#include <linux/delay.h>
@@ -46,6 +46,7 @@
46#include <asm/dma.h> 46#include <asm/dma.h>
47#include <asm/rio.h> 47#include <asm/rio.h>
48#include <asm/bios_ebda.h> 48#include <asm/bios_ebda.h>
49#include <asm/x86_init.h>
49 50
50#ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT 51#ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT
51int use_calgary __read_mostly = 1; 52int use_calgary __read_mostly = 1;
@@ -211,7 +212,7 @@ static void iommu_range_reserve(struct iommu_table *tbl,
211 212
212 spin_lock_irqsave(&tbl->it_lock, flags); 213 spin_lock_irqsave(&tbl->it_lock, flags);
213 214
214 iommu_area_reserve(tbl->it_map, index, npages); 215 bitmap_set(tbl->it_map, index, npages);
215 216
216 spin_unlock_irqrestore(&tbl->it_lock, flags); 217 spin_unlock_irqrestore(&tbl->it_lock, flags);
217} 218}
@@ -244,7 +245,7 @@ static unsigned long iommu_range_alloc(struct device *dev,
244 if (panic_on_overflow) 245 if (panic_on_overflow)
245 panic("Calgary: fix the allocator.\n"); 246 panic("Calgary: fix the allocator.\n");
246 else 247 else
247 return bad_dma_address; 248 return DMA_ERROR_CODE;
248 } 249 }
249 } 250 }
250 251
@@ -260,12 +261,15 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
260 void *vaddr, unsigned int npages, int direction) 261 void *vaddr, unsigned int npages, int direction)
261{ 262{
262 unsigned long entry; 263 unsigned long entry;
263 dma_addr_t ret = bad_dma_address; 264 dma_addr_t ret;
264 265
265 entry = iommu_range_alloc(dev, tbl, npages); 266 entry = iommu_range_alloc(dev, tbl, npages);
266 267
267 if (unlikely(entry == bad_dma_address)) 268 if (unlikely(entry == DMA_ERROR_CODE)) {
268 goto error; 269 printk(KERN_WARNING "Calgary: failed to allocate %u pages in "
270 "iommu %p\n", npages, tbl);
271 return DMA_ERROR_CODE;
272 }
269 273
270 /* set the return dma address */ 274 /* set the return dma address */
271 ret = (entry << PAGE_SHIFT) | ((unsigned long)vaddr & ~PAGE_MASK); 275 ret = (entry << PAGE_SHIFT) | ((unsigned long)vaddr & ~PAGE_MASK);
@@ -273,13 +277,7 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
273 /* put the TCEs in the HW table */ 277 /* put the TCEs in the HW table */
274 tce_build(tbl, entry, npages, (unsigned long)vaddr & PAGE_MASK, 278 tce_build(tbl, entry, npages, (unsigned long)vaddr & PAGE_MASK,
275 direction); 279 direction);
276
277 return ret; 280 return ret;
278
279error:
280 printk(KERN_WARNING "Calgary: failed to allocate %u pages in "
281 "iommu %p\n", npages, tbl);
282 return bad_dma_address;
283} 281}
284 282
285static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, 283static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
@@ -290,8 +288,8 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
290 unsigned long flags; 288 unsigned long flags;
291 289
292 /* were we called with bad_dma_address? */ 290 /* were we called with bad_dma_address? */
293 badend = bad_dma_address + (EMERGENCY_PAGES * PAGE_SIZE); 291 badend = DMA_ERROR_CODE + (EMERGENCY_PAGES * PAGE_SIZE);
294 if (unlikely((dma_addr >= bad_dma_address) && (dma_addr < badend))) { 292 if (unlikely((dma_addr >= DMA_ERROR_CODE) && (dma_addr < badend))) {
295 WARN(1, KERN_ERR "Calgary: driver tried unmapping bad DMA " 293 WARN(1, KERN_ERR "Calgary: driver tried unmapping bad DMA "
296 "address 0x%Lx\n", dma_addr); 294 "address 0x%Lx\n", dma_addr);
297 return; 295 return;
@@ -305,7 +303,7 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
305 303
306 spin_lock_irqsave(&tbl->it_lock, flags); 304 spin_lock_irqsave(&tbl->it_lock, flags);
307 305
308 iommu_area_free(tbl->it_map, entry, npages); 306 bitmap_clear(tbl->it_map, entry, npages);
309 307
310 spin_unlock_irqrestore(&tbl->it_lock, flags); 308 spin_unlock_irqrestore(&tbl->it_lock, flags);
311} 309}
@@ -318,13 +316,15 @@ static inline struct iommu_table *find_iommu_table(struct device *dev)
318 316
319 pdev = to_pci_dev(dev); 317 pdev = to_pci_dev(dev);
320 318
319 /* search up the device tree for an iommu */
321 pbus = pdev->bus; 320 pbus = pdev->bus;
322 321 do {
323 /* is the device behind a bridge? Look for the root bus */ 322 tbl = pci_iommu(pbus);
324 while (pbus->parent) 323 if (tbl && tbl->it_busno == pbus->number)
324 break;
325 tbl = NULL;
325 pbus = pbus->parent; 326 pbus = pbus->parent;
326 327 } while (pbus);
327 tbl = pci_iommu(pbus);
328 328
329 BUG_ON(tbl && (tbl->it_busno != pbus->number)); 329 BUG_ON(tbl && (tbl->it_busno != pbus->number));
330 330
@@ -373,7 +373,7 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
373 npages = iommu_num_pages(vaddr, s->length, PAGE_SIZE); 373 npages = iommu_num_pages(vaddr, s->length, PAGE_SIZE);
374 374
375 entry = iommu_range_alloc(dev, tbl, npages); 375 entry = iommu_range_alloc(dev, tbl, npages);
376 if (entry == bad_dma_address) { 376 if (entry == DMA_ERROR_CODE) {
377 /* makes sure unmap knows to stop */ 377 /* makes sure unmap knows to stop */
378 s->dma_length = 0; 378 s->dma_length = 0;
379 goto error; 379 goto error;
@@ -391,7 +391,7 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
391error: 391error:
392 calgary_unmap_sg(dev, sg, nelems, dir, NULL); 392 calgary_unmap_sg(dev, sg, nelems, dir, NULL);
393 for_each_sg(sg, s, nelems, i) { 393 for_each_sg(sg, s, nelems, i) {
394 sg->dma_address = bad_dma_address; 394 sg->dma_address = DMA_ERROR_CODE;
395 sg->dma_length = 0; 395 sg->dma_length = 0;
396 } 396 }
397 return 0; 397 return 0;
@@ -446,7 +446,7 @@ static void* calgary_alloc_coherent(struct device *dev, size_t size,
446 446
447 /* set up tces to cover the allocated range */ 447 /* set up tces to cover the allocated range */
448 mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL); 448 mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL);
449 if (mapping == bad_dma_address) 449 if (mapping == DMA_ERROR_CODE)
450 goto free; 450 goto free;
451 *dma_handle = mapping; 451 *dma_handle = mapping;
452 return ret; 452 return ret;
@@ -727,7 +727,7 @@ static void __init calgary_reserve_regions(struct pci_dev *dev)
727 struct iommu_table *tbl = pci_iommu(dev->bus); 727 struct iommu_table *tbl = pci_iommu(dev->bus);
728 728
729 /* reserve EMERGENCY_PAGES from bad_dma_address and up */ 729 /* reserve EMERGENCY_PAGES from bad_dma_address and up */
730 iommu_range_reserve(tbl, bad_dma_address, EMERGENCY_PAGES); 730 iommu_range_reserve(tbl, DMA_ERROR_CODE, EMERGENCY_PAGES);
731 731
732 /* avoid the BIOS/VGA first 640KB-1MB region */ 732 /* avoid the BIOS/VGA first 640KB-1MB region */
733 /* for CalIOC2 - avoid the entire first MB */ 733 /* for CalIOC2 - avoid the entire first MB */
@@ -1344,6 +1344,23 @@ static void __init get_tce_space_from_tar(void)
1344 return; 1344 return;
1345} 1345}
1346 1346
1347static int __init calgary_iommu_init(void)
1348{
1349 int ret;
1350
1351 /* ok, we're trying to use Calgary - let's roll */
1352 printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n");
1353
1354 ret = calgary_init();
1355 if (ret) {
1356 printk(KERN_ERR "PCI-DMA: Calgary init failed %d, "
1357 "falling back to no_iommu\n", ret);
1358 return ret;
1359 }
1360
1361 return 0;
1362}
1363
1347void __init detect_calgary(void) 1364void __init detect_calgary(void)
1348{ 1365{
1349 int bus; 1366 int bus;
@@ -1357,7 +1374,7 @@ void __init detect_calgary(void)
1357 * if the user specified iommu=off or iommu=soft or we found 1374 * if the user specified iommu=off or iommu=soft or we found
1358 * another HW IOMMU already, bail out. 1375 * another HW IOMMU already, bail out.
1359 */ 1376 */
1360 if (swiotlb || no_iommu || iommu_detected) 1377 if (no_iommu || iommu_detected)
1361 return; 1378 return;
1362 1379
1363 if (!use_calgary) 1380 if (!use_calgary)
@@ -1442,9 +1459,7 @@ void __init detect_calgary(void)
1442 printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d\n", 1459 printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d\n",
1443 specified_table_size); 1460 specified_table_size);
1444 1461
1445 /* swiotlb for devices that aren't behind the Calgary. */ 1462 x86_init.iommu.iommu_init = calgary_iommu_init;
1446 if (max_pfn > MAX_DMA32_PFN)
1447 swiotlb = 1;
1448 } 1463 }
1449 return; 1464 return;
1450 1465
@@ -1457,35 +1472,6 @@ cleanup:
1457 } 1472 }
1458} 1473}
1459 1474
1460int __init calgary_iommu_init(void)
1461{
1462 int ret;
1463
1464 if (no_iommu || (swiotlb && !calgary_detected))
1465 return -ENODEV;
1466
1467 if (!calgary_detected)
1468 return -ENODEV;
1469
1470 /* ok, we're trying to use Calgary - let's roll */
1471 printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n");
1472
1473 ret = calgary_init();
1474 if (ret) {
1475 printk(KERN_ERR "PCI-DMA: Calgary init failed %d, "
1476 "falling back to no_iommu\n", ret);
1477 return ret;
1478 }
1479
1480 force_iommu = 1;
1481 bad_dma_address = 0x0;
1482 /* dma_ops is set to swiotlb or nommu */
1483 if (!dma_ops)
1484 dma_ops = &nommu_dma_ops;
1485
1486 return 0;
1487}
1488
1489static int __init calgary_parse_options(char *p) 1475static int __init calgary_parse_options(char *p)
1490{ 1476{
1491 unsigned int bridge; 1477 unsigned int bridge;
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index d20009b4e6ef..75e14e21f61a 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -11,10 +11,11 @@
 #include <asm/gart.h>
 #include <asm/calgary.h>
 #include <asm/amd_iommu.h>
+#include <asm/x86_init.h>
 
 static int forbid_dac __read_mostly;
 
-struct dma_map_ops *dma_ops;
+struct dma_map_ops *dma_ops = &nommu_dma_ops;
 EXPORT_SYMBOL(dma_ops);
 
 static int iommu_sac_force __read_mostly;
@@ -42,15 +43,10 @@ int iommu_detected __read_mostly = 0;
  */
 int iommu_pass_through __read_mostly;
 
-dma_addr_t bad_dma_address __read_mostly = 0;
-EXPORT_SYMBOL(bad_dma_address);
-
-/* Dummy device used for NULL arguments (normally ISA). Better would
-   be probably a smaller DMA mask, but this is bug-to-bug compatible
-   to older i386. */
+/* Dummy device used for NULL arguments (normally ISA). */
 struct device x86_dma_fallback_dev = {
 	.init_name = "fallback device",
-	.coherent_dma_mask = DMA_BIT_MASK(32),
+	.coherent_dma_mask = ISA_DMA_BIT_MASK,
 	.dma_mask = &x86_dma_fallback_dev.coherent_dma_mask,
 };
 EXPORT_SYMBOL(x86_dma_fallback_dev);
@@ -128,19 +124,18 @@ void __init pci_iommu_alloc(void)
 	/* free the range so iommu could get some range less than 4G */
 	dma32_free_bootmem();
 #endif
+	if (pci_swiotlb_detect())
+		goto out;
 
-	/*
-	 * The order of these functions is important for
-	 * fall-back/fail-over reasons
-	 */
 	gart_iommu_hole_init();
 
 	detect_calgary();
 
 	detect_intel_iommu();
 
+	/* needs to be called after gart_iommu_hole_init */
 	amd_iommu_detect();
-
+out:
 	pci_swiotlb_init();
 }
 
@@ -216,7 +211,7 @@ static __init int iommu_setup(char *p)
 	if (!strncmp(p, "allowdac", 8))
 		forbid_dac = 0;
 	if (!strncmp(p, "nodac", 5))
-		forbid_dac = -1;
+		forbid_dac = 1;
 	if (!strncmp(p, "usedac", 6)) {
 		forbid_dac = -1;
 		return 1;
@@ -291,27 +286,19 @@ static int __init pci_iommu_init(void)
 #ifdef CONFIG_PCI
 	dma_debug_add_bus(&pci_bus_type);
 #endif
+	x86_init.iommu.iommu_init();
 
-	calgary_iommu_init();
-
-	intel_iommu_init();
+	if (swiotlb) {
+		printk(KERN_INFO "PCI-DMA: "
+			"Using software bounce buffering for IO (SWIOTLB)\n");
+		swiotlb_print_info();
+	} else
+		swiotlb_free();
 
-	amd_iommu_init();
-
-	gart_iommu_init();
-
-	no_iommu_init();
 	return 0;
 }
-
-void pci_iommu_shutdown(void)
-{
-	gart_iommu_shutdown();
-
-	amd_iommu_shutdown();
-}
 /* Must execute after PCI subsystem */
-fs_initcall(pci_iommu_init);
+rootfs_initcall(pci_iommu_init);
 
 #ifdef CONFIG_PCI
 /* Many VIA bridges seem to corrupt data for DAC. Disable it here */
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index a7f1b64f86e0..34de53b46f87 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -23,7 +23,7 @@
23#include <linux/module.h> 23#include <linux/module.h>
24#include <linux/topology.h> 24#include <linux/topology.h>
25#include <linux/interrupt.h> 25#include <linux/interrupt.h>
26#include <linux/bitops.h> 26#include <linux/bitmap.h>
27#include <linux/kdebug.h> 27#include <linux/kdebug.h>
28#include <linux/scatterlist.h> 28#include <linux/scatterlist.h>
29#include <linux/iommu-helper.h> 29#include <linux/iommu-helper.h>
@@ -39,6 +39,7 @@
39#include <asm/swiotlb.h> 39#include <asm/swiotlb.h>
40#include <asm/dma.h> 40#include <asm/dma.h>
41#include <asm/k8.h> 41#include <asm/k8.h>
42#include <asm/x86_init.h>
42 43
43static unsigned long iommu_bus_base; /* GART remapping area (physical) */ 44static unsigned long iommu_bus_base; /* GART remapping area (physical) */
44static unsigned long iommu_size; /* size of remapping area bytes */ 45static unsigned long iommu_size; /* size of remapping area bytes */
@@ -46,6 +47,8 @@ static unsigned long iommu_pages; /* .. and in pages */
46 47
47static u32 *iommu_gatt_base; /* Remapping table */ 48static u32 *iommu_gatt_base; /* Remapping table */
48 49
50static dma_addr_t bad_dma_addr;
51
49/* 52/*
50 * If this is disabled the IOMMU will use an optimized flushing strategy 53 * If this is disabled the IOMMU will use an optimized flushing strategy
51 * of only flushing when an mapping is reused. With it true the GART is 54 * of only flushing when an mapping is reused. With it true the GART is
@@ -92,7 +95,7 @@ static unsigned long alloc_iommu(struct device *dev, int size,
92 95
93 base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev), 96 base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev),
94 PAGE_SIZE) >> PAGE_SHIFT; 97 PAGE_SIZE) >> PAGE_SHIFT;
95 boundary_size = ALIGN((unsigned long long)dma_get_seg_boundary(dev) + 1, 98 boundary_size = ALIGN((u64)dma_get_seg_boundary(dev) + 1,
96 PAGE_SIZE) >> PAGE_SHIFT; 99 PAGE_SIZE) >> PAGE_SHIFT;
97 100
98 spin_lock_irqsave(&iommu_bitmap_lock, flags); 101 spin_lock_irqsave(&iommu_bitmap_lock, flags);
@@ -123,7 +126,7 @@ static void free_iommu(unsigned long offset, int size)
123 unsigned long flags; 126 unsigned long flags;
124 127
125 spin_lock_irqsave(&iommu_bitmap_lock, flags); 128 spin_lock_irqsave(&iommu_bitmap_lock, flags);
126 iommu_area_free(iommu_gart_bitmap, offset, size); 129 bitmap_clear(iommu_gart_bitmap, offset, size);
127 if (offset >= next_bit) 130 if (offset >= next_bit)
128 next_bit = offset + size; 131 next_bit = offset + size;
129 spin_unlock_irqrestore(&iommu_bitmap_lock, flags); 132 spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
@@ -216,7 +219,7 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
216 if (panic_on_overflow) 219 if (panic_on_overflow)
217 panic("dma_map_area overflow %lu bytes\n", size); 220 panic("dma_map_area overflow %lu bytes\n", size);
218 iommu_full(dev, size, dir); 221 iommu_full(dev, size, dir);
219 return bad_dma_address; 222 return bad_dma_addr;
220 } 223 }
221 224
222 for (i = 0; i < npages; i++) { 225 for (i = 0; i < npages; i++) {
@@ -294,7 +297,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
294 int i; 297 int i;
295 298
296#ifdef CONFIG_IOMMU_DEBUG 299#ifdef CONFIG_IOMMU_DEBUG
297 printk(KERN_DEBUG "dma_map_sg overflow\n"); 300 pr_debug("dma_map_sg overflow\n");
298#endif 301#endif
299 302
300 for_each_sg(sg, s, nents, i) { 303 for_each_sg(sg, s, nents, i) {
@@ -302,7 +305,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
302 305
303 if (nonforced_iommu(dev, addr, s->length)) { 306 if (nonforced_iommu(dev, addr, s->length)) {
304 addr = dma_map_area(dev, addr, s->length, dir, 0); 307 addr = dma_map_area(dev, addr, s->length, dir, 0);
305 if (addr == bad_dma_address) { 308 if (addr == bad_dma_addr) {
306 if (i > 0) 309 if (i > 0)
307 gart_unmap_sg(dev, sg, i, dir, NULL); 310 gart_unmap_sg(dev, sg, i, dir, NULL);
308 nents = 0; 311 nents = 0;
@@ -389,12 +392,14 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents,
389 if (!dev) 392 if (!dev)
390 dev = &x86_dma_fallback_dev; 393 dev = &x86_dma_fallback_dev;
391 394
392 out = 0; 395 out = 0;
393 start = 0; 396 start = 0;
394 start_sg = sgmap = sg; 397 start_sg = sg;
395 seg_size = 0; 398 sgmap = sg;
396 max_seg_size = dma_get_max_seg_size(dev); 399 seg_size = 0;
397 ps = NULL; /* shut up gcc */ 400 max_seg_size = dma_get_max_seg_size(dev);
401 ps = NULL; /* shut up gcc */
402
398 for_each_sg(sg, s, nents, i) { 403 for_each_sg(sg, s, nents, i) {
399 dma_addr_t addr = sg_phys(s); 404 dma_addr_t addr = sg_phys(s);
400 405
@@ -417,11 +422,12 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents,
417 sgmap, pages, need) < 0) 422 sgmap, pages, need) < 0)
418 goto error; 423 goto error;
419 out++; 424 out++;
420 seg_size = 0; 425
421 sgmap = sg_next(sgmap); 426 seg_size = 0;
422 pages = 0; 427 sgmap = sg_next(sgmap);
423 start = i; 428 pages = 0;
424 start_sg = s; 429 start = i;
430 start_sg = s;
425 } 431 }
426 } 432 }
427 433
@@ -455,7 +461,7 @@ error:
455 461
456 iommu_full(dev, pages << PAGE_SHIFT, dir); 462 iommu_full(dev, pages << PAGE_SHIFT, dir);
457 for_each_sg(sg, s, nents, i) 463 for_each_sg(sg, s, nents, i)
458 s->dma_address = bad_dma_address; 464 s->dma_address = bad_dma_addr;
459 return 0; 465 return 0;
460} 466}
461 467
@@ -479,7 +485,7 @@ gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr,
479 DMA_BIDIRECTIONAL, align_mask); 485 DMA_BIDIRECTIONAL, align_mask);
480 486
481 flush_gart(); 487 flush_gart();
482 if (paddr != bad_dma_address) { 488 if (paddr != bad_dma_addr) {
483 *dma_addr = paddr; 489 *dma_addr = paddr;
484 return page_address(page); 490 return page_address(page);
485 } 491 }
@@ -499,6 +505,11 @@ gart_free_coherent(struct device *dev, size_t size, void *vaddr,
499 free_pages((unsigned long)vaddr, get_order(size)); 505 free_pages((unsigned long)vaddr, get_order(size));
500} 506}
501 507
508static int gart_mapping_error(struct device *dev, dma_addr_t dma_addr)
509{
510 return (dma_addr == bad_dma_addr);
511}
512
502static int no_agp; 513static int no_agp;
503 514
504static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) 515static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size)
@@ -515,7 +526,7 @@ static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size)
515 iommu_size -= round_up(a, PMD_PAGE_SIZE) - a; 526 iommu_size -= round_up(a, PMD_PAGE_SIZE) - a;
516 527
517 if (iommu_size < 64*1024*1024) { 528 if (iommu_size < 64*1024*1024) {
518 printk(KERN_WARNING 529 pr_warning(
519 "PCI-DMA: Warning: Small IOMMU %luMB." 530 "PCI-DMA: Warning: Small IOMMU %luMB."
520 " Consider increasing the AGP aperture in BIOS\n", 531 " Consider increasing the AGP aperture in BIOS\n",
521 iommu_size >> 20); 532 iommu_size >> 20);
@@ -570,28 +581,32 @@ void set_up_gart_resume(u32 aper_order, u32 aper_alloc)
570 aperture_alloc = aper_alloc; 581 aperture_alloc = aper_alloc;
571} 582}
572 583
573static int gart_resume(struct sys_device *dev) 584static void gart_fixup_northbridges(struct sys_device *dev)
574{ 585{
575 printk(KERN_INFO "PCI-DMA: Resuming GART IOMMU\n"); 586 int i;
576 587
577 if (fix_up_north_bridges) { 588 if (!fix_up_north_bridges)
578 int i; 589 return;
579 590
580 printk(KERN_INFO "PCI-DMA: Restoring GART aperture settings\n"); 591 pr_info("PCI-DMA: Restoring GART aperture settings\n");
581 592
582 for (i = 0; i < num_k8_northbridges; i++) { 593 for (i = 0; i < num_k8_northbridges; i++) {
583 struct pci_dev *dev = k8_northbridges[i]; 594 struct pci_dev *dev = k8_northbridges[i];
584 595
585 /* 596 /*
586 * Don't enable translations just yet. That is the next 597 * Don't enable translations just yet. That is the next
587 * step. Restore the pre-suspend aperture settings. 598 * step. Restore the pre-suspend aperture settings.
588 */ 599 */
589 pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, 600 pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, aperture_order << 1);
590 aperture_order << 1); 601 pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE, aperture_alloc >> 25);
591 pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE,
592 aperture_alloc >> 25);
593 }
594 } 602 }
603}
604
605static int gart_resume(struct sys_device *dev)
606{
607 pr_info("PCI-DMA: Resuming GART IOMMU\n");
608
609 gart_fixup_northbridges(dev);
595 610
596 enable_gart_translations(); 611 enable_gart_translations();
597 612
@@ -604,15 +619,14 @@ static int gart_suspend(struct sys_device *dev, pm_message_t state)
604} 619}
605 620
606static struct sysdev_class gart_sysdev_class = { 621static struct sysdev_class gart_sysdev_class = {
607 .name = "gart", 622 .name = "gart",
608 .suspend = gart_suspend, 623 .suspend = gart_suspend,
609 .resume = gart_resume, 624 .resume = gart_resume,
610 625
611}; 626};
612 627
613static struct sys_device device_gart = { 628static struct sys_device device_gart = {
614 .id = 0, 629 .cls = &gart_sysdev_class,
615 .cls = &gart_sysdev_class,
616}; 630};
617 631
618/* 632/*
@@ -627,7 +641,8 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
627 void *gatt; 641 void *gatt;
628 int i, error; 642 int i, error;
629 643
630 printk(KERN_INFO "PCI-DMA: Disabling AGP.\n"); 644 pr_info("PCI-DMA: Disabling AGP.\n");
645
631 aper_size = aper_base = info->aper_size = 0; 646 aper_size = aper_base = info->aper_size = 0;
632 dev = NULL; 647 dev = NULL;
633 for (i = 0; i < num_k8_northbridges; i++) { 648 for (i = 0; i < num_k8_northbridges; i++) {
@@ -645,6 +660,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
645 } 660 }
646 if (!aper_base) 661 if (!aper_base)
647 goto nommu; 662 goto nommu;
663
648 info->aper_base = aper_base; 664 info->aper_base = aper_base;
649 info->aper_size = aper_size >> 20; 665 info->aper_size = aper_size >> 20;
650 666
@@ -667,14 +683,14 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
667 683
668 flush_gart(); 684 flush_gart();
669 685
670 printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n", 686 pr_info("PCI-DMA: aperture base @ %x size %u KB\n",
671 aper_base, aper_size>>10); 687 aper_base, aper_size>>10);
672 688
673 return 0; 689 return 0;
674 690
675 nommu: 691 nommu:
676 /* Should not happen anymore */ 692 /* Should not happen anymore */
677 printk(KERN_WARNING "PCI-DMA: More than 4GB of RAM and no IOMMU\n" 693 pr_warning("PCI-DMA: More than 4GB of RAM and no IOMMU\n"
678 "falling back to iommu=soft.\n"); 694 "falling back to iommu=soft.\n");
679 return -1; 695 return -1;
680} 696}
@@ -686,14 +702,16 @@ static struct dma_map_ops gart_dma_ops = {
686 .unmap_page = gart_unmap_page, 702 .unmap_page = gart_unmap_page,
687 .alloc_coherent = gart_alloc_coherent, 703 .alloc_coherent = gart_alloc_coherent,
688 .free_coherent = gart_free_coherent, 704 .free_coherent = gart_free_coherent,
705 .mapping_error = gart_mapping_error,
689}; 706};
690 707
691void gart_iommu_shutdown(void) 708static void gart_iommu_shutdown(void)
692{ 709{
693 struct pci_dev *dev; 710 struct pci_dev *dev;
694 int i; 711 int i;
695 712
696 if (no_agp && (dma_ops != &gart_dma_ops)) 713 /* don't shutdown it if there is AGP installed */
714 if (!no_agp)
697 return; 715 return;
698 716
699 for (i = 0; i < num_k8_northbridges; i++) { 717 for (i = 0; i < num_k8_northbridges; i++) {
@@ -708,7 +726,7 @@ void gart_iommu_shutdown(void)
708 } 726 }
709} 727}
710 728
711void __init gart_iommu_init(void) 729int __init gart_iommu_init(void)
712{ 730{
713 struct agp_kern_info info; 731 struct agp_kern_info info;
714 unsigned long iommu_start; 732 unsigned long iommu_start;
@@ -718,7 +736,7 @@ void __init gart_iommu_init(void)
718 long i; 736 long i;
719 737
720 if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0) 738 if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0)
721 return; 739 return 0;
722 740
723#ifndef CONFIG_AGP_AMD64 741#ifndef CONFIG_AGP_AMD64
724 no_agp = 1; 742 no_agp = 1;
@@ -730,35 +748,28 @@ void __init gart_iommu_init(void)
730 (agp_copy_info(agp_bridge, &info) < 0); 748 (agp_copy_info(agp_bridge, &info) < 0);
731#endif 749#endif
732 750
733 if (swiotlb)
734 return;
735
736 /* Did we detect a different HW IOMMU? */
737 if (iommu_detected && !gart_iommu_aperture)
738 return;
739
740 if (no_iommu || 751 if (no_iommu ||
741 (!force_iommu && max_pfn <= MAX_DMA32_PFN) || 752 (!force_iommu && max_pfn <= MAX_DMA32_PFN) ||
742 !gart_iommu_aperture || 753 !gart_iommu_aperture ||
743 (no_agp && init_k8_gatt(&info) < 0)) { 754 (no_agp && init_k8_gatt(&info) < 0)) {
744 if (max_pfn > MAX_DMA32_PFN) { 755 if (max_pfn > MAX_DMA32_PFN) {
745 printk(KERN_WARNING "More than 4GB of memory " 756 pr_warning("More than 4GB of memory but GART IOMMU not available.\n");
746 "but GART IOMMU not available.\n"); 757 pr_warning("falling back to iommu=soft.\n");
747 printk(KERN_WARNING "falling back to iommu=soft.\n");
748 } 758 }
749 return; 759 return 0;
750 } 760 }
751 761
752 /* need to map that range */ 762 /* need to map that range */
753 aper_size = info.aper_size << 20; 763 aper_size = info.aper_size << 20;
754 aper_base = info.aper_base; 764 aper_base = info.aper_base;
755 end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT); 765 end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT);
766
756 if (end_pfn > max_low_pfn_mapped) { 767 if (end_pfn > max_low_pfn_mapped) {
757 start_pfn = (aper_base>>PAGE_SHIFT); 768 start_pfn = (aper_base>>PAGE_SHIFT);
758 init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT); 769 init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
759 } 770 }
760 771
761 printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n"); 772 pr_info("PCI-DMA: using GART IOMMU.\n");
762 iommu_size = check_iommu_size(info.aper_base, aper_size); 773 iommu_size = check_iommu_size(info.aper_base, aper_size);
763 iommu_pages = iommu_size >> PAGE_SHIFT; 774 iommu_pages = iommu_size >> PAGE_SHIFT;
764 775
@@ -773,8 +784,7 @@ void __init gart_iommu_init(void)
773 784
774 ret = dma_debug_resize_entries(iommu_pages); 785 ret = dma_debug_resize_entries(iommu_pages);
775 if (ret) 786 if (ret)
776 printk(KERN_DEBUG 787 pr_debug("PCI-DMA: Cannot trace all the entries\n");
777 "PCI-DMA: Cannot trace all the entries\n");
778 } 788 }
779#endif 789#endif
780 790
@@ -782,17 +792,16 @@ void __init gart_iommu_init(void)
782 * Out of IOMMU space handling. 792 * Out of IOMMU space handling.
783 * Reserve some invalid pages at the beginning of the GART. 793 * Reserve some invalid pages at the beginning of the GART.
784 */ 794 */
785 iommu_area_reserve(iommu_gart_bitmap, 0, EMERGENCY_PAGES); 795 bitmap_set(iommu_gart_bitmap, 0, EMERGENCY_PAGES);
786 796
787 agp_memory_reserved = iommu_size; 797 pr_info("PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n",
788 printk(KERN_INFO
789 "PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n",
790 iommu_size >> 20); 798 iommu_size >> 20);
791 799
792 iommu_start = aper_size - iommu_size; 800 agp_memory_reserved = iommu_size;
793 iommu_bus_base = info.aper_base + iommu_start; 801 iommu_start = aper_size - iommu_size;
794 bad_dma_address = iommu_bus_base; 802 iommu_bus_base = info.aper_base + iommu_start;
795 iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT); 803 bad_dma_addr = iommu_bus_base;
804 iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT);
796 805
797 /* 806 /*
798 * Unmap the IOMMU part of the GART. The alias of the page is 807 * Unmap the IOMMU part of the GART. The alias of the page is
@@ -814,7 +823,7 @@ void __init gart_iommu_init(void)
814 * the pages as Not-Present: 823 * the pages as Not-Present:
815 */ 824 */
816 wbinvd(); 825 wbinvd();
817 826
818 /* 827 /*
819 * Now all caches are flushed and we can safely enable 828 * Now all caches are flushed and we can safely enable
820 * GART hardware. Doing it early leaves the possibility 829 * GART hardware. Doing it early leaves the possibility
@@ -838,6 +847,10 @@ void __init gart_iommu_init(void)
838 847
839 flush_gart(); 848 flush_gart();
840 dma_ops = &gart_dma_ops; 849 dma_ops = &gart_dma_ops;
850 x86_platform.iommu_shutdown = gart_iommu_shutdown;
851 swiotlb = 0;
852
853 return 0;
841} 854}
842 855
843void __init gart_parse_options(char *p) 856void __init gart_parse_options(char *p)
@@ -856,7 +869,7 @@ void __init gart_parse_options(char *p)
856#endif 869#endif
857 if (isdigit(*p) && get_option(&p, &arg)) 870 if (isdigit(*p) && get_option(&p, &arg))
858 iommu_size = arg; 871 iommu_size = arg;
859 if (!strncmp(p, "fullflush", 8)) 872 if (!strncmp(p, "fullflush", 9))
860 iommu_fullflush = 1; 873 iommu_fullflush = 1;
861 if (!strncmp(p, "nofullflush", 11)) 874 if (!strncmp(p, "nofullflush", 11))
862 iommu_fullflush = 0; 875 iommu_fullflush = 0;
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index a3933d4330cd..22be12b60a8f 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -33,7 +33,7 @@ static dma_addr_t nommu_map_page(struct device *dev, struct page *page,
 	dma_addr_t bus = page_to_phys(page) + offset;
 	WARN_ON(size == 0);
 	if (!check_addr("map_single", dev, bus, size))
-		return bad_dma_address;
+		return DMA_ERROR_CODE;
 	flush_write_buffers();
 	return bus;
 }
@@ -103,12 +103,3 @@ struct dma_map_ops nommu_dma_ops = {
 	.sync_sg_for_device = nommu_sync_sg_for_device,
 	.is_phys = 1,
 };
-
-void __init no_iommu_init(void)
-{
-	if (dma_ops)
-		return;
-
-	force_iommu = 0; /* no HW IOMMU */
-	dma_ops = &nommu_dma_ops;
-}
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index aaa6b7839f1e..7d2829dde20e 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -42,18 +42,31 @@ static struct dma_map_ops swiotlb_dma_ops = {
 	.dma_supported = NULL,
 };
 
-void __init pci_swiotlb_init(void)
+/*
+ * pci_swiotlb_detect - set swiotlb to 1 if necessary
+ *
+ * This returns non-zero if we are forced to use swiotlb (by the boot
+ * option).
+ */
+int __init pci_swiotlb_detect(void)
 {
+	int use_swiotlb = swiotlb | swiotlb_force;
+
 	/* don't initialize swiotlb if iommu=off (no_iommu=1) */
 #ifdef CONFIG_X86_64
-	if ((!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN))
+	if (!no_iommu && max_pfn > MAX_DMA32_PFN)
 		swiotlb = 1;
 #endif
 	if (swiotlb_force)
 		swiotlb = 1;
+
+	return use_swiotlb;
+}
+
+void __init pci_swiotlb_init(void)
+{
 	if (swiotlb) {
-		printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n");
-		swiotlb_init();
+		swiotlb_init(0);
 		dma_ops = &swiotlb_dma_ops;
 	}
 }
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 5284cd2b5776..c9b3522b6b46 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -9,7 +9,11 @@
9#include <linux/pm.h> 9#include <linux/pm.h>
10#include <linux/clockchips.h> 10#include <linux/clockchips.h>
11#include <linux/random.h> 11#include <linux/random.h>
12#include <linux/user-return-notifier.h>
13#include <linux/dmi.h>
14#include <linux/utsname.h>
12#include <trace/events/power.h> 15#include <trace/events/power.h>
16#include <linux/hw_breakpoint.h>
13#include <asm/system.h> 17#include <asm/system.h>
14#include <asm/apic.h> 18#include <asm/apic.h>
15#include <asm/syscalls.h> 19#include <asm/syscalls.h>
@@ -17,6 +21,7 @@
17#include <asm/uaccess.h> 21#include <asm/uaccess.h>
18#include <asm/i387.h> 22#include <asm/i387.h>
19#include <asm/ds.h> 23#include <asm/ds.h>
24#include <asm/debugreg.h>
20 25
21unsigned long idle_halt; 26unsigned long idle_halt;
22EXPORT_SYMBOL(idle_halt); 27EXPORT_SYMBOL(idle_halt);
@@ -87,30 +92,30 @@ void exit_thread(void)
87 } 92 }
88} 93}
89 94
90void flush_thread(void) 95void show_regs_common(void)
91{ 96{
92 struct task_struct *tsk = current; 97 const char *board, *product;
93 98
94#ifdef CONFIG_X86_64 99 board = dmi_get_system_info(DMI_BOARD_NAME);
95 if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) { 100 if (!board)
96 clear_tsk_thread_flag(tsk, TIF_ABI_PENDING); 101 board = "";
97 if (test_tsk_thread_flag(tsk, TIF_IA32)) { 102 product = dmi_get_system_info(DMI_PRODUCT_NAME);
98 clear_tsk_thread_flag(tsk, TIF_IA32); 103 if (!product)
99 } else { 104 product = "";
100 set_tsk_thread_flag(tsk, TIF_IA32);
101 current_thread_info()->status |= TS_COMPAT;
102 }
103 }
104#endif
105 105
106 clear_tsk_thread_flag(tsk, TIF_DEBUG); 106 printk(KERN_CONT "\n");
107 printk(KERN_DEFAULT "Pid: %d, comm: %.20s %s %s %.*s %s/%s\n",
108 current->pid, current->comm, print_tainted(),
109 init_utsname()->release,
110 (int)strcspn(init_utsname()->version, " "),
111 init_utsname()->version, board, product);
112}
113
114void flush_thread(void)
115{
116 struct task_struct *tsk = current;
107 117
108 tsk->thread.debugreg0 = 0; 118 flush_ptrace_hw_breakpoint(tsk);
109 tsk->thread.debugreg1 = 0;
110 tsk->thread.debugreg2 = 0;
111 tsk->thread.debugreg3 = 0;
112 tsk->thread.debugreg6 = 0;
113 tsk->thread.debugreg7 = 0;
114 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); 119 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
115 /* 120 /*
116 * Forget coprocessor state.. 121 * Forget coprocessor state..
@@ -192,16 +197,6 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
192 else if (next->debugctlmsr != prev->debugctlmsr) 197 else if (next->debugctlmsr != prev->debugctlmsr)
193 update_debugctlmsr(next->debugctlmsr); 198 update_debugctlmsr(next->debugctlmsr);
194 199
195 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
196 set_debugreg(next->debugreg0, 0);
197 set_debugreg(next->debugreg1, 1);
198 set_debugreg(next->debugreg2, 2);
199 set_debugreg(next->debugreg3, 3);
200 /* no 4 and 5 */
201 set_debugreg(next->debugreg6, 6);
202 set_debugreg(next->debugreg7, 7);
203 }
204
205 if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^ 200 if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
206 test_tsk_thread_flag(next_p, TIF_NOTSC)) { 201 test_tsk_thread_flag(next_p, TIF_NOTSC)) {
207 /* prev and next are different */ 202 /* prev and next are different */
@@ -224,6 +219,7 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
224 */ 219 */
225 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); 220 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
226 } 221 }
222 propagate_user_return_notify(prev_p, next_p);
227} 223}
228 224
229int sys_fork(struct pt_regs *regs) 225int sys_fork(struct pt_regs *regs)
@@ -247,6 +243,78 @@ int sys_vfork(struct pt_regs *regs)
247 NULL, NULL); 243 NULL, NULL);
248} 244}
249 245
246long
247sys_clone(unsigned long clone_flags, unsigned long newsp,
248 void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
249{
250 if (!newsp)
251 newsp = regs->sp;
252 return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
253}
254
255/*
256 * This gets run with %si containing the
257 * function to call, and %di containing
258 * the "args".
259 */
260extern void kernel_thread_helper(void);
261
262/*
263 * Create a kernel thread
264 */
265int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
266{
267 struct pt_regs regs;
268
269 memset(&regs, 0, sizeof(regs));
270
271 regs.si = (unsigned long) fn;
272 regs.di = (unsigned long) arg;
273
274#ifdef CONFIG_X86_32
275 regs.ds = __USER_DS;
276 regs.es = __USER_DS;
277 regs.fs = __KERNEL_PERCPU;
278 regs.gs = __KERNEL_STACK_CANARY;
279#else
280 regs.ss = __KERNEL_DS;
281#endif
282
283 regs.orig_ax = -1;
284 regs.ip = (unsigned long) kernel_thread_helper;
285 regs.cs = __KERNEL_CS | get_kernel_rpl();
286 regs.flags = X86_EFLAGS_IF | 0x2;
287
288 /* Ok, create the new process.. */
289 return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
290}
291EXPORT_SYMBOL(kernel_thread);
292
293/*
294 * sys_execve() executes a new program.
295 */
296long sys_execve(char __user *name, char __user * __user *argv,
297 char __user * __user *envp, struct pt_regs *regs)
298{
299 long error;
300 char *filename;
301
302 filename = getname(name);
303 error = PTR_ERR(filename);
304 if (IS_ERR(filename))
305 return error;
306 error = do_execve(filename, argv, envp, regs);
307
308#ifdef CONFIG_X86_32
309 if (error == 0) {
310 /* Make sure we don't return using sysenter.. */
311 set_thread_flag(TIF_IRET);
312 }
313#endif
314
315 putname(filename);
316 return error;
317}
250 318
251/* 319/*
252 * Idle related variables and functions 320 * Idle related variables and functions
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 4cf79567cdab..37ad1e046aae 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -23,7 +23,6 @@
23#include <linux/vmalloc.h> 23#include <linux/vmalloc.h>
24#include <linux/user.h> 24#include <linux/user.h>
25#include <linux/interrupt.h> 25#include <linux/interrupt.h>
26#include <linux/utsname.h>
27#include <linux/delay.h> 26#include <linux/delay.h>
28#include <linux/reboot.h> 27#include <linux/reboot.h>
29#include <linux/init.h> 28#include <linux/init.h>
@@ -35,7 +34,6 @@
35#include <linux/tick.h> 34#include <linux/tick.h>
36#include <linux/percpu.h> 35#include <linux/percpu.h>
37#include <linux/prctl.h> 36#include <linux/prctl.h>
38#include <linux/dmi.h>
39#include <linux/ftrace.h> 37#include <linux/ftrace.h>
40#include <linux/uaccess.h> 38#include <linux/uaccess.h>
41#include <linux/io.h> 39#include <linux/io.h>
@@ -58,6 +56,7 @@
58#include <asm/idle.h> 56#include <asm/idle.h>
59#include <asm/syscalls.h> 57#include <asm/syscalls.h>
60#include <asm/ds.h> 58#include <asm/ds.h>
59#include <asm/debugreg.h>
61 60
62asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); 61asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
63 62
@@ -127,39 +126,29 @@ void __show_regs(struct pt_regs *regs, int all)
127 unsigned long d0, d1, d2, d3, d6, d7; 126 unsigned long d0, d1, d2, d3, d6, d7;
128 unsigned long sp; 127 unsigned long sp;
129 unsigned short ss, gs; 128 unsigned short ss, gs;
130 const char *board;
131 129
132 if (user_mode_vm(regs)) { 130 if (user_mode_vm(regs)) {
133 sp = regs->sp; 131 sp = regs->sp;
134 ss = regs->ss & 0xffff; 132 ss = regs->ss & 0xffff;
135 gs = get_user_gs(regs); 133 gs = get_user_gs(regs);
136 } else { 134 } else {
137 sp = (unsigned long) (&regs->sp); 135 sp = kernel_stack_pointer(regs);
138 savesegment(ss, ss); 136 savesegment(ss, ss);
139 savesegment(gs, gs); 137 savesegment(gs, gs);
140 } 138 }
141 139
142 printk("\n"); 140 show_regs_common();
143 141
144 board = dmi_get_system_info(DMI_PRODUCT_NAME); 142 printk(KERN_DEFAULT "EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n",
145 if (!board)
146 board = "";
147 printk("Pid: %d, comm: %s %s (%s %.*s) %s\n",
148 task_pid_nr(current), current->comm,
149 print_tainted(), init_utsname()->release,
150 (int)strcspn(init_utsname()->version, " "),
151 init_utsname()->version, board);
152
153 printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n",
154 (u16)regs->cs, regs->ip, regs->flags, 143 (u16)regs->cs, regs->ip, regs->flags,
155 smp_processor_id()); 144 smp_processor_id());
156 print_symbol("EIP is at %s\n", regs->ip); 145 print_symbol("EIP is at %s\n", regs->ip);
157 146
158 printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", 147 printk(KERN_DEFAULT "EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
159 regs->ax, regs->bx, regs->cx, regs->dx); 148 regs->ax, regs->bx, regs->cx, regs->dx);
160 printk("ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n", 149 printk(KERN_DEFAULT "ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n",
161 regs->si, regs->di, regs->bp, sp); 150 regs->si, regs->di, regs->bp, sp);
162 printk(" DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n", 151 printk(KERN_DEFAULT " DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n",
163 (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, ss); 152 (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, ss);
164 153
165 if (!all) 154 if (!all)
@@ -169,61 +158,28 @@ void __show_regs(struct pt_regs *regs, int all)
169 cr2 = read_cr2(); 158 cr2 = read_cr2();
170 cr3 = read_cr3(); 159 cr3 = read_cr3();
171 cr4 = read_cr4_safe(); 160 cr4 = read_cr4_safe();
172 printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", 161 printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n",
173 cr0, cr2, cr3, cr4); 162 cr0, cr2, cr3, cr4);
174 163
175 get_debugreg(d0, 0); 164 get_debugreg(d0, 0);
176 get_debugreg(d1, 1); 165 get_debugreg(d1, 1);
177 get_debugreg(d2, 2); 166 get_debugreg(d2, 2);
178 get_debugreg(d3, 3); 167 get_debugreg(d3, 3);
179 printk("DR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n", 168 printk(KERN_DEFAULT "DR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n",
180 d0, d1, d2, d3); 169 d0, d1, d2, d3);
181 170
182 get_debugreg(d6, 6); 171 get_debugreg(d6, 6);
183 get_debugreg(d7, 7); 172 get_debugreg(d7, 7);
184 printk("DR6: %08lx DR7: %08lx\n", 173 printk(KERN_DEFAULT "DR6: %08lx DR7: %08lx\n",
185 d6, d7); 174 d6, d7);
186} 175}
187 176
188void show_regs(struct pt_regs *regs) 177void show_regs(struct pt_regs *regs)
189{ 178{
190 __show_regs(regs, 1); 179 show_registers(regs);
191 show_trace(NULL, regs, &regs->sp, regs->bp); 180 show_trace(NULL, regs, &regs->sp, regs->bp);
192} 181}
193 182
194/*
195 * This gets run with %bx containing the
196 * function to call, and %dx containing
197 * the "args".
198 */
199extern void kernel_thread_helper(void);
200
201/*
202 * Create a kernel thread
203 */
204int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
205{
206 struct pt_regs regs;
207
208 memset(&regs, 0, sizeof(regs));
209
210 regs.bx = (unsigned long) fn;
211 regs.dx = (unsigned long) arg;
212
213 regs.ds = __USER_DS;
214 regs.es = __USER_DS;
215 regs.fs = __KERNEL_PERCPU;
216 regs.gs = __KERNEL_STACK_CANARY;
217 regs.orig_ax = -1;
218 regs.ip = (unsigned long) kernel_thread_helper;
219 regs.cs = __KERNEL_CS | get_kernel_rpl();
220 regs.flags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;
221
222 /* Ok, create the new process.. */
223 return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
224}
225EXPORT_SYMBOL(kernel_thread);
226
227void release_thread(struct task_struct *dead_task) 183void release_thread(struct task_struct *dead_task)
228{ 184{
229 BUG_ON(dead_task->mm); 185 BUG_ON(dead_task->mm);
@@ -259,7 +215,12 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
259 215
260 task_user_gs(p) = get_user_gs(regs); 216 task_user_gs(p) = get_user_gs(regs);
261 217
218 p->thread.io_bitmap_ptr = NULL;
262 tsk = current; 219 tsk = current;
220 err = -ENOMEM;
221
222 memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
223
263 if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { 224 if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
264 p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr, 225 p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
265 IO_BITMAP_BYTES, GFP_KERNEL); 226 IO_BITMAP_BYTES, GFP_KERNEL);
@@ -430,46 +391,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
430 return prev_p; 391 return prev_p;
431} 392}
432 393
433int sys_clone(struct pt_regs *regs)
434{
435 unsigned long clone_flags;
436 unsigned long newsp;
437 int __user *parent_tidptr, *child_tidptr;
438
439 clone_flags = regs->bx;
440 newsp = regs->cx;
441 parent_tidptr = (int __user *)regs->dx;
442 child_tidptr = (int __user *)regs->di;
443 if (!newsp)
444 newsp = regs->sp;
445 return do_fork(clone_flags, newsp, regs, 0, parent_tidptr, child_tidptr);
446}
447
448/*
449 * sys_execve() executes a new program.
450 */
451int sys_execve(struct pt_regs *regs)
452{
453 int error;
454 char *filename;
455
456 filename = getname((char __user *) regs->bx);
457 error = PTR_ERR(filename);
458 if (IS_ERR(filename))
459 goto out;
460 error = do_execve(filename,
461 (char __user * __user *) regs->cx,
462 (char __user * __user *) regs->dx,
463 regs);
464 if (error == 0) {
465 /* Make sure we don't return using sysenter.. */
466 set_thread_flag(TIF_IRET);
467 }
468 putname(filename);
469out:
470 return error;
471}
472
473#define top_esp (THREAD_SIZE - sizeof(unsigned long)) 394#define top_esp (THREAD_SIZE - sizeof(unsigned long))
474#define top_ebp (THREAD_SIZE - 2*sizeof(unsigned long)) 395#define top_ebp (THREAD_SIZE - 2*sizeof(unsigned long))
475 396
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index ad535b683170..126f0b493d04 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -26,7 +26,6 @@
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/user.h> 27#include <linux/user.h>
28#include <linux/interrupt.h> 28#include <linux/interrupt.h>
29#include <linux/utsname.h>
30#include <linux/delay.h> 29#include <linux/delay.h>
31#include <linux/module.h> 30#include <linux/module.h>
32#include <linux/ptrace.h> 31#include <linux/ptrace.h>
@@ -38,7 +37,6 @@
38#include <linux/uaccess.h> 37#include <linux/uaccess.h>
39#include <linux/io.h> 38#include <linux/io.h>
40#include <linux/ftrace.h> 39#include <linux/ftrace.h>
41#include <linux/dmi.h>
42 40
43#include <asm/pgtable.h> 41#include <asm/pgtable.h>
44#include <asm/system.h> 42#include <asm/system.h>
@@ -52,14 +50,13 @@
52#include <asm/idle.h> 50#include <asm/idle.h>
53#include <asm/syscalls.h> 51#include <asm/syscalls.h>
54#include <asm/ds.h> 52#include <asm/ds.h>
53#include <asm/debugreg.h>
55 54
56asmlinkage extern void ret_from_fork(void); 55asmlinkage extern void ret_from_fork(void);
57 56
58DEFINE_PER_CPU(unsigned long, old_rsp); 57DEFINE_PER_CPU(unsigned long, old_rsp);
59static DEFINE_PER_CPU(unsigned char, is_idle); 58static DEFINE_PER_CPU(unsigned char, is_idle);
60 59
61unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
62
63static ATOMIC_NOTIFIER_HEAD(idle_notifier); 60static ATOMIC_NOTIFIER_HEAD(idle_notifier);
64 61
65void idle_notifier_register(struct notifier_block *n) 62void idle_notifier_register(struct notifier_block *n)
@@ -162,31 +159,21 @@ void __show_regs(struct pt_regs *regs, int all)
162 unsigned long d0, d1, d2, d3, d6, d7; 159 unsigned long d0, d1, d2, d3, d6, d7;
163 unsigned int fsindex, gsindex; 160 unsigned int fsindex, gsindex;
164 unsigned int ds, cs, es; 161 unsigned int ds, cs, es;
165 const char *board; 162
166 163 show_regs_common();
167 printk("\n"); 164 printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
168 print_modules();
169 board = dmi_get_system_info(DMI_PRODUCT_NAME);
170 if (!board)
171 board = "";
172 printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s %s\n",
173 current->pid, current->comm, print_tainted(),
174 init_utsname()->release,
175 (int)strcspn(init_utsname()->version, " "),
176 init_utsname()->version, board);
177 printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
178 printk_address(regs->ip, 1); 165 printk_address(regs->ip, 1);
179 printk(KERN_INFO "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, 166 printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss,
180 regs->sp, regs->flags); 167 regs->sp, regs->flags);
181 printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n", 168 printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
182 regs->ax, regs->bx, regs->cx); 169 regs->ax, regs->bx, regs->cx);
183 printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n", 170 printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
184 regs->dx, regs->si, regs->di); 171 regs->dx, regs->si, regs->di);
185 printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n", 172 printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n",
186 regs->bp, regs->r8, regs->r9); 173 regs->bp, regs->r8, regs->r9);
187 printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n", 174 printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n",
188 regs->r10, regs->r11, regs->r12); 175 regs->r10, regs->r11, regs->r12);
189 printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n", 176 printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
190 regs->r13, regs->r14, regs->r15); 177 regs->r13, regs->r14, regs->r15);
191 178
192 asm("movl %%ds,%0" : "=r" (ds)); 179 asm("movl %%ds,%0" : "=r" (ds));
@@ -207,27 +194,26 @@ void __show_regs(struct pt_regs *regs, int all)
207 cr3 = read_cr3(); 194 cr3 = read_cr3();
208 cr4 = read_cr4(); 195 cr4 = read_cr4();
209 196
210 printk(KERN_INFO "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", 197 printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
211 fs, fsindex, gs, gsindex, shadowgs); 198 fs, fsindex, gs, gsindex, shadowgs);
212 printk(KERN_INFO "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, 199 printk(KERN_DEFAULT "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
213 es, cr0); 200 es, cr0);
214 printk(KERN_INFO "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, 201 printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
215 cr4); 202 cr4);
216 203
217 get_debugreg(d0, 0); 204 get_debugreg(d0, 0);
218 get_debugreg(d1, 1); 205 get_debugreg(d1, 1);
219 get_debugreg(d2, 2); 206 get_debugreg(d2, 2);
220 printk(KERN_INFO "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2); 207 printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
221 get_debugreg(d3, 3); 208 get_debugreg(d3, 3);
222 get_debugreg(d6, 6); 209 get_debugreg(d6, 6);
223 get_debugreg(d7, 7); 210 get_debugreg(d7, 7);
224 printk(KERN_INFO "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7); 211 printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
225} 212}
226 213
227void show_regs(struct pt_regs *regs) 214void show_regs(struct pt_regs *regs)
228{ 215{
229 printk(KERN_INFO "CPU %d:", smp_processor_id()); 216 show_registers(regs);
230 __show_regs(regs, 1);
231 show_trace(NULL, regs, (void *)(regs + 1), regs->bp); 217 show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
232} 218}
233 219
@@ -285,8 +271,9 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
285 *childregs = *regs; 271 *childregs = *regs;
286 272
287 childregs->ax = 0; 273 childregs->ax = 0;
288 childregs->sp = sp; 274 if (user_mode(regs))
289 if (sp == ~0UL) 275 childregs->sp = sp;
276 else
290 childregs->sp = (unsigned long)childregs; 277 childregs->sp = (unsigned long)childregs;
291 278
292 p->thread.sp = (unsigned long) childregs; 279 p->thread.sp = (unsigned long) childregs;
@@ -297,12 +284,16 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
297 284
298 p->thread.fs = me->thread.fs; 285 p->thread.fs = me->thread.fs;
299 p->thread.gs = me->thread.gs; 286 p->thread.gs = me->thread.gs;
287 p->thread.io_bitmap_ptr = NULL;
300 288
301 savesegment(gs, p->thread.gsindex); 289 savesegment(gs, p->thread.gsindex);
302 savesegment(fs, p->thread.fsindex); 290 savesegment(fs, p->thread.fsindex);
303 savesegment(es, p->thread.es); 291 savesegment(es, p->thread.es);
304 savesegment(ds, p->thread.ds); 292 savesegment(ds, p->thread.ds);
305 293
294 err = -ENOMEM;
295 memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
296
306 if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) { 297 if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
307 p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); 298 p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
308 if (!p->thread.io_bitmap_ptr) { 299 if (!p->thread.io_bitmap_ptr) {
@@ -341,29 +332,46 @@ out:
341 kfree(p->thread.io_bitmap_ptr); 332 kfree(p->thread.io_bitmap_ptr);
342 p->thread.io_bitmap_max = 0; 333 p->thread.io_bitmap_max = 0;
343 } 334 }
335
344 return err; 336 return err;
345} 337}
346 338
347void 339static void
348start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) 340start_thread_common(struct pt_regs *regs, unsigned long new_ip,
341 unsigned long new_sp,
342 unsigned int _cs, unsigned int _ss, unsigned int _ds)
349{ 343{
350 loadsegment(fs, 0); 344 loadsegment(fs, 0);
351 loadsegment(es, 0); 345 loadsegment(es, _ds);
352 loadsegment(ds, 0); 346 loadsegment(ds, _ds);
353 load_gs_index(0); 347 load_gs_index(0);
354 regs->ip = new_ip; 348 regs->ip = new_ip;
355 regs->sp = new_sp; 349 regs->sp = new_sp;
356 percpu_write(old_rsp, new_sp); 350 percpu_write(old_rsp, new_sp);
357 regs->cs = __USER_CS; 351 regs->cs = _cs;
358 regs->ss = __USER_DS; 352 regs->ss = _ss;
359 regs->flags = 0x200; 353 regs->flags = X86_EFLAGS_IF;
360 set_fs(USER_DS); 354 set_fs(USER_DS);
361 /* 355 /*
362 * Free the old FP and other extended state 356 * Free the old FP and other extended state
363 */ 357 */
364 free_thread_xstate(current); 358 free_thread_xstate(current);
365} 359}
366EXPORT_SYMBOL_GPL(start_thread); 360
361void
362start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
363{
364 start_thread_common(regs, new_ip, new_sp,
365 __USER_CS, __USER_DS, 0);
366}
367
368#ifdef CONFIG_IA32_EMULATION
369void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
370{
371 start_thread_common(regs, new_ip, new_sp,
372 __USER32_CS, __USER32_DS, __USER32_DS);
373}
374#endif
367 375
368/* 376/*
369 * switch_to(x,y) should switch tasks from x to y. 377 * switch_to(x,y) should switch tasks from x to y.
@@ -495,26 +503,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
495 */ 503 */
496 if (preload_fpu) 504 if (preload_fpu)
497 __math_state_restore(); 505 __math_state_restore();
498 return prev_p;
499}
500 506
501/* 507 return prev_p;
502 * sys_execve() executes a new program.
503 */
504asmlinkage
505long sys_execve(char __user *name, char __user * __user *argv,
506 char __user * __user *envp, struct pt_regs *regs)
507{
508 long error;
509 char *filename;
510
511 filename = getname(name);
512 error = PTR_ERR(filename);
513 if (IS_ERR(filename))
514 return error;
515 error = do_execve(filename, argv, envp, regs);
516 putname(filename);
517 return error;
518} 508}
519 509
520void set_personality_64bit(void) 510void set_personality_64bit(void)
@@ -531,13 +521,16 @@ void set_personality_64bit(void)
531 current->personality &= ~READ_IMPLIES_EXEC; 521 current->personality &= ~READ_IMPLIES_EXEC;
532} 522}
533 523
534asmlinkage long 524void set_personality_ia32(void)
535sys_clone(unsigned long clone_flags, unsigned long newsp,
536 void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
537{ 525{
538 if (!newsp) 526 /* inherit personality from parent */
539 newsp = regs->sp; 527
540 return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid); 528 /* Make sure to be in 32bit mode */
529 set_thread_flag(TIF_IA32);
530 current->personality |= force_personality32;
531
532 /* Prepare the first "return" to user space */
533 current_thread_info()->status |= TS_COMPAT;
541} 534}
542 535
543unsigned long get_wchan(struct task_struct *p) 536unsigned long get_wchan(struct task_struct *p)
@@ -664,3 +657,8 @@ long sys_arch_prctl(int code, unsigned long addr)
664 return do_arch_prctl(current, code, addr); 657 return do_arch_prctl(current, code, addr);
665} 658}
666 659
660unsigned long KSTK_ESP(struct task_struct *task)
661{
662 return (test_tsk_thread_flag(task, TIF_IA32)) ?
663 (task_pt_regs(task)->sp) : ((task)->thread.usersp);
664}
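Illustrative sketch (editor's addition, not part of the commit): the KSTK_ESP() helper added above is what the /proc code uses for the kstkesp field of /proc/<pid>/stat, so its effect is visible from user space. A minimal reader, assuming the field layout from proc(5) (kstkesp is field 29, i.e. the 27th token after the ')' that closes the comm field):

#include <stdio.h>
#include <string.h>

int main(void)
{
        char line[4096], *p, *tok;
        int field;
        FILE *f = fopen("/proc/self/stat", "r");

        if (!f || !fgets(line, sizeof(line), f))
                return 1;
        fclose(f);

        p = strrchr(line, ')');         /* comm may contain spaces, skip past it */
        if (!p)
                return 1;
        /* tokens after ')' start at field 3 (state); kstkesp is field 29 */
        for (field = 3, tok = strtok(p + 1, " "); tok;
             field++, tok = strtok(NULL, " "))
                if (field == 29) {
                        printf("kstkesp = %s\n", tok);
                        break;
                }
        return 0;
}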
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 7b058a2dc66a..017d937639fe 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -22,6 +22,8 @@
22#include <linux/seccomp.h> 22#include <linux/seccomp.h>
23#include <linux/signal.h> 23#include <linux/signal.h>
24#include <linux/workqueue.h> 24#include <linux/workqueue.h>
25#include <linux/perf_event.h>
26#include <linux/hw_breakpoint.h>
25 27
26#include <asm/uaccess.h> 28#include <asm/uaccess.h>
27#include <asm/pgtable.h> 29#include <asm/pgtable.h>
@@ -34,6 +36,7 @@
34#include <asm/prctl.h> 36#include <asm/prctl.h>
35#include <asm/proto.h> 37#include <asm/proto.h>
36#include <asm/ds.h> 38#include <asm/ds.h>
39#include <asm/hw_breakpoint.h>
37 40
38#include "tls.h" 41#include "tls.h"
39 42
@@ -49,6 +52,118 @@ enum x86_regset {
49 REGSET_IOPERM32, 52 REGSET_IOPERM32,
50}; 53};
51 54
55struct pt_regs_offset {
56 const char *name;
57 int offset;
58};
59
60#define REG_OFFSET_NAME(r) {.name = #r, .offset = offsetof(struct pt_regs, r)}
61#define REG_OFFSET_END {.name = NULL, .offset = 0}
62
63static const struct pt_regs_offset regoffset_table[] = {
64#ifdef CONFIG_X86_64
65 REG_OFFSET_NAME(r15),
66 REG_OFFSET_NAME(r14),
67 REG_OFFSET_NAME(r13),
68 REG_OFFSET_NAME(r12),
69 REG_OFFSET_NAME(r11),
70 REG_OFFSET_NAME(r10),
71 REG_OFFSET_NAME(r9),
72 REG_OFFSET_NAME(r8),
73#endif
74 REG_OFFSET_NAME(bx),
75 REG_OFFSET_NAME(cx),
76 REG_OFFSET_NAME(dx),
77 REG_OFFSET_NAME(si),
78 REG_OFFSET_NAME(di),
79 REG_OFFSET_NAME(bp),
80 REG_OFFSET_NAME(ax),
81#ifdef CONFIG_X86_32
82 REG_OFFSET_NAME(ds),
83 REG_OFFSET_NAME(es),
84 REG_OFFSET_NAME(fs),
85 REG_OFFSET_NAME(gs),
86#endif
87 REG_OFFSET_NAME(orig_ax),
88 REG_OFFSET_NAME(ip),
89 REG_OFFSET_NAME(cs),
90 REG_OFFSET_NAME(flags),
91 REG_OFFSET_NAME(sp),
92 REG_OFFSET_NAME(ss),
93 REG_OFFSET_END,
94};
95
96/**
97 * regs_query_register_offset() - query register offset from its name
98 * @name: the name of a register
99 *
100 * regs_query_register_offset() returns the offset of a register in struct
101 * pt_regs from its name. If the name is invalid, this returns -EINVAL;
102 */
103int regs_query_register_offset(const char *name)
104{
105 const struct pt_regs_offset *roff;
106 for (roff = regoffset_table; roff->name != NULL; roff++)
107 if (!strcmp(roff->name, name))
108 return roff->offset;
109 return -EINVAL;
110}
111
112/**
113 * regs_query_register_name() - query register name from its offset
114 * @offset: the offset of a register in struct pt_regs.
115 *
116 * regs_query_register_name() returns the name of a register from its
117 * offset in struct pt_regs. If the @offset is invalid, this returns NULL;
118 */
119const char *regs_query_register_name(unsigned int offset)
120{
121 const struct pt_regs_offset *roff;
122 for (roff = regoffset_table; roff->name != NULL; roff++)
123 if (roff->offset == offset)
124 return roff->name;
125 return NULL;
126}
127
128static const int arg_offs_table[] = {
129#ifdef CONFIG_X86_32
130 [0] = offsetof(struct pt_regs, ax),
131 [1] = offsetof(struct pt_regs, dx),
132 [2] = offsetof(struct pt_regs, cx)
133#else /* CONFIG_X86_64 */
134 [0] = offsetof(struct pt_regs, di),
135 [1] = offsetof(struct pt_regs, si),
136 [2] = offsetof(struct pt_regs, dx),
137 [3] = offsetof(struct pt_regs, cx),
138 [4] = offsetof(struct pt_regs, r8),
139 [5] = offsetof(struct pt_regs, r9)
140#endif
141};
142
143/**
144 * regs_get_argument_nth() - get Nth argument at function call
145 * @regs: pt_regs which contains registers at function entry.
146 * @n: argument number.
147 *
 148 * regs_get_argument_nth() returns the @n-th argument of a function call.
 149 * Since the kernel stack is usually changed right after function entry,
 150 * you must use this at function entry. If the @n-th entry is NOT in the
 151 * kernel stack or pt_regs, this returns 0.
152 */
153unsigned long regs_get_argument_nth(struct pt_regs *regs, unsigned int n)
154{
155 if (n < ARRAY_SIZE(arg_offs_table))
156 return *(unsigned long *)((char *)regs + arg_offs_table[n]);
157 else {
158 /*
159 * The typical case: arg n is on the stack.
160 * (Note: stack[0] = return address, so skip it)
161 */
162 n -= ARRAY_SIZE(arg_offs_table);
163 return regs_get_kernel_stack_nth(regs, 1 + n);
164 }
165}
166
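Illustrative sketch (editor's addition, not part of the commit): how a tracer might combine the name/offset helpers added above; the read mirrors the pointer arithmetic regs_get_argument_nth() itself uses. read_reg_by_name() is a made-up wrapper for illustration, assuming a valid struct pt_regs, e.g. from a kprobe pre-handler:

/* Hypothetical helper, for illustration only. */
static unsigned long read_reg_by_name(struct pt_regs *regs, const char *name)
{
        int offs = regs_query_register_offset(name);    /* e.g. "ip", "sp", "ax" */

        if (offs < 0)
                return 0;                               /* unknown register name */
        /* same access pattern as regs_get_argument_nth() above */
        return *(unsigned long *)((char *)regs + offs);
}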
52/* 167/*
53 * does not yet catch signals sent when the child dies. 168 * does not yet catch signals sent when the child dies.
54 * in exit.c or in signal.c. 169 * in exit.c or in signal.c.
@@ -137,11 +252,6 @@ static int set_segment_reg(struct task_struct *task,
137 return 0; 252 return 0;
138} 253}
139 254
140static unsigned long debugreg_addr_limit(struct task_struct *task)
141{
142 return TASK_SIZE - 3;
143}
144
145#else /* CONFIG_X86_64 */ 255#else /* CONFIG_X86_64 */
146 256
147#define FLAG_MASK (FLAG_MASK_32 | X86_EFLAGS_NT) 257#define FLAG_MASK (FLAG_MASK_32 | X86_EFLAGS_NT)
@@ -266,15 +376,6 @@ static int set_segment_reg(struct task_struct *task,
266 return 0; 376 return 0;
267} 377}
268 378
269static unsigned long debugreg_addr_limit(struct task_struct *task)
270{
271#ifdef CONFIG_IA32_EMULATION
272 if (test_tsk_thread_flag(task, TIF_IA32))
273 return IA32_PAGE_OFFSET - 3;
274#endif
275 return TASK_SIZE_MAX - 7;
276}
277
278#endif /* CONFIG_X86_32 */ 379#endif /* CONFIG_X86_32 */
279 380
280static unsigned long get_flags(struct task_struct *task) 381static unsigned long get_flags(struct task_struct *task)
@@ -408,14 +509,14 @@ static int genregs_get(struct task_struct *target,
408{ 509{
409 if (kbuf) { 510 if (kbuf) {
410 unsigned long *k = kbuf; 511 unsigned long *k = kbuf;
411 while (count > 0) { 512 while (count >= sizeof(*k)) {
412 *k++ = getreg(target, pos); 513 *k++ = getreg(target, pos);
413 count -= sizeof(*k); 514 count -= sizeof(*k);
414 pos += sizeof(*k); 515 pos += sizeof(*k);
415 } 516 }
416 } else { 517 } else {
417 unsigned long __user *u = ubuf; 518 unsigned long __user *u = ubuf;
418 while (count > 0) { 519 while (count >= sizeof(*u)) {
419 if (__put_user(getreg(target, pos), u++)) 520 if (__put_user(getreg(target, pos), u++))
420 return -EFAULT; 521 return -EFAULT;
421 count -= sizeof(*u); 522 count -= sizeof(*u);
@@ -434,14 +535,14 @@ static int genregs_set(struct task_struct *target,
434 int ret = 0; 535 int ret = 0;
435 if (kbuf) { 536 if (kbuf) {
436 const unsigned long *k = kbuf; 537 const unsigned long *k = kbuf;
437 while (count > 0 && !ret) { 538 while (count >= sizeof(*k) && !ret) {
438 ret = putreg(target, pos, *k++); 539 ret = putreg(target, pos, *k++);
439 count -= sizeof(*k); 540 count -= sizeof(*k);
440 pos += sizeof(*k); 541 pos += sizeof(*k);
441 } 542 }
442 } else { 543 } else {
443 const unsigned long __user *u = ubuf; 544 const unsigned long __user *u = ubuf;
444 while (count > 0 && !ret) { 545 while (count >= sizeof(*u) && !ret) {
445 unsigned long word; 546 unsigned long word;
446 ret = __get_user(word, u++); 547 ret = __get_user(word, u++);
447 if (ret) 548 if (ret)
@@ -454,99 +555,237 @@ static int genregs_set(struct task_struct *target,
454 return ret; 555 return ret;
455} 556}
456 557
558static void ptrace_triggered(struct perf_event *bp, int nmi,
559 struct perf_sample_data *data,
560 struct pt_regs *regs)
561{
562 int i;
563 struct thread_struct *thread = &(current->thread);
564
565 /*
566 * Store in the virtual DR6 register the fact that the breakpoint
567 * was hit so the thread's debugger will see it.
568 */
569 for (i = 0; i < HBP_NUM; i++) {
570 if (thread->ptrace_bps[i] == bp)
571 break;
572 }
573
574 thread->debugreg6 |= (DR_TRAP0 << i);
575}
576
457/* 577/*
458 * This function is trivial and will be inlined by the compiler. 578 * Walk through every ptrace breakpoints for this thread and
459 * Having it separates the implementation details of debug 579 * build the dr7 value on top of their attributes.
460 * registers from the interface details of ptrace. 580 *
461 */ 581 */
462static unsigned long ptrace_get_debugreg(struct task_struct *child, int n) 582static unsigned long ptrace_get_dr7(struct perf_event *bp[])
463{ 583{
464 switch (n) { 584 int i;
465 case 0: return child->thread.debugreg0; 585 int dr7 = 0;
466 case 1: return child->thread.debugreg1; 586 struct arch_hw_breakpoint *info;
467 case 2: return child->thread.debugreg2; 587
468 case 3: return child->thread.debugreg3; 588 for (i = 0; i < HBP_NUM; i++) {
469 case 6: return child->thread.debugreg6; 589 if (bp[i] && !bp[i]->attr.disabled) {
470 case 7: return child->thread.debugreg7; 590 info = counter_arch_bp(bp[i]);
591 dr7 |= encode_dr7(i, info->len, info->type);
592 }
471 } 593 }
472 return 0; 594
595 return dr7;
473} 596}
474 597
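Illustrative sketch (editor's addition, not part of the commit): the DR7 layout that ptrace_get_dr7() condenses the breakpoint slots back into, as described by the comment the old ptrace_set_debugreg() carried (removed further down in this hunk): the local-enable bit for slot i sits at bit 2*i, and the 4-bit field at bits 16+4*i holds R/W in its low two bits and LEN in its high two bits. The macro names below are made up for the sketch; the real helpers are encode_dr7()/decode_dr7() from the hw_breakpoint headers.

/* Hypothetical macros, illustration only -- not the kernel's encode_dr7(). */
#define DR7_LOCAL_ENABLE(i)     (1UL << ((i) * 2))
#define DR7_RW_LEN(i, rw, len)  ((unsigned long)(((len) << 2) | (rw)) << (16 + (i) * 4))

/* slot 0, break on write (rw = 01), 4-byte length (len = 11):       */
/*   DR7_LOCAL_ENABLE(0) | DR7_RW_LEN(0, 0x1, 0x3) == 0x000d0001     */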
475static int ptrace_set_debugreg(struct task_struct *child, 598static int
476 int n, unsigned long data) 599ptrace_modify_breakpoint(struct perf_event *bp, int len, int type,
600 struct task_struct *tsk, int disabled)
477{ 601{
478 int i; 602 int err;
603 int gen_len, gen_type;
604 struct perf_event_attr attr;
479 605
480 if (unlikely(n == 4 || n == 5)) 606 /*
 481 return -EIO; 607 * We should have at least an inactive breakpoint at this
608 * slot. It means the user is writing dr7 without having
609 * written the address register first
610 */
611 if (!bp)
612 return -EINVAL;
482 613
483 if (n < 4 && unlikely(data >= debugreg_addr_limit(child))) 614 err = arch_bp_generic_fields(len, type, &gen_len, &gen_type);
484 return -EIO; 615 if (err)
616 return err;
485 617
486 switch (n) { 618 attr = bp->attr;
487 case 0: child->thread.debugreg0 = data; break; 619 attr.bp_len = gen_len;
488 case 1: child->thread.debugreg1 = data; break; 620 attr.bp_type = gen_type;
489 case 2: child->thread.debugreg2 = data; break; 621 attr.disabled = disabled;
490 case 3: child->thread.debugreg3 = data; break;
491 622
492 case 6: 623 return modify_user_hw_breakpoint(bp, &attr);
493 if ((data & ~0xffffffffUL) != 0) 624}
494 return -EIO;
495 child->thread.debugreg6 = data;
496 break;
497 625
498 case 7: 626/*
627 * Handle ptrace writes to debug register 7.
628 */
629static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data)
630{
631 struct thread_struct *thread = &(tsk->thread);
632 unsigned long old_dr7;
633 int i, orig_ret = 0, rc = 0;
634 int enabled, second_pass = 0;
635 unsigned len, type;
636 struct perf_event *bp;
637
638 data &= ~DR_CONTROL_RESERVED;
639 old_dr7 = ptrace_get_dr7(thread->ptrace_bps);
640restore:
641 /*
642 * Loop through all the hardware breakpoints, making the
643 * appropriate changes to each.
644 */
645 for (i = 0; i < HBP_NUM; i++) {
646 enabled = decode_dr7(data, i, &len, &type);
647 bp = thread->ptrace_bps[i];
648
649 if (!enabled) {
650 if (bp) {
651 /*
652 * Don't unregister the breakpoints right-away,
653 * unless all register_user_hw_breakpoint()
654 * requests have succeeded. This prevents
655 * any window of opportunity for debug
656 * register grabbing by other users.
657 */
658 if (!second_pass)
659 continue;
660
661 rc = ptrace_modify_breakpoint(bp, len, type,
662 tsk, 1);
663 if (rc)
664 break;
665 }
666 continue;
667 }
668
669 rc = ptrace_modify_breakpoint(bp, len, type, tsk, 0);
670 if (rc)
671 break;
672 }
673 /*
674 * Make a second pass to free the remaining unused breakpoints
675 * or to restore the original breakpoints if an error occurred.
676 */
677 if (!second_pass) {
678 second_pass = 1;
679 if (rc < 0) {
680 orig_ret = rc;
681 data = old_dr7;
682 }
683 goto restore;
684 }
685 return ((orig_ret < 0) ? orig_ret : rc);
686}
687
688/*
689 * Handle PTRACE_PEEKUSR calls for the debug register area.
690 */
691static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n)
692{
693 struct thread_struct *thread = &(tsk->thread);
694 unsigned long val = 0;
695
696 if (n < HBP_NUM) {
697 struct perf_event *bp;
698 bp = thread->ptrace_bps[n];
699 if (!bp)
700 return 0;
701 val = bp->hw.info.address;
702 } else if (n == 6) {
703 val = thread->debugreg6;
704 } else if (n == 7) {
705 val = ptrace_get_dr7(thread->ptrace_bps);
706 }
707 return val;
708}
709
710static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
711 unsigned long addr)
712{
713 struct perf_event *bp;
714 struct thread_struct *t = &tsk->thread;
715 struct perf_event_attr attr;
716
717 if (!t->ptrace_bps[nr]) {
718 hw_breakpoint_init(&attr);
499 /* 719 /*
500 * Sanity-check data. Take one half-byte at once with 720 * Put stub len and type to register (reserve) an inactive but
501 * check = (val >> (16 + 4*i)) & 0xf. It contains the 721 * correct bp
502 * R/Wi and LENi bits; bits 0 and 1 are R/Wi, and bits
503 * 2 and 3 are LENi. Given a list of invalid values,
504 * we do mask |= 1 << invalid_value, so that
505 * (mask >> check) & 1 is a correct test for invalid
506 * values.
507 *
508 * R/Wi contains the type of the breakpoint /
509 * watchpoint, LENi contains the length of the watched
510 * data in the watchpoint case.
511 *
512 * The invalid values are:
513 * - LENi == 0x10 (undefined), so mask |= 0x0f00. [32-bit]
514 * - R/Wi == 0x10 (break on I/O reads or writes), so
515 * mask |= 0x4444.
516 * - R/Wi == 0x00 && LENi != 0x00, so we have mask |=
517 * 0x1110.
518 *
519 * Finally, mask = 0x0f00 | 0x4444 | 0x1110 == 0x5f54.
520 *
521 * See the Intel Manual "System Programming Guide",
522 * 15.2.4
523 *
524 * Note that LENi == 0x10 is defined on x86_64 in long
525 * mode (i.e. even for 32-bit userspace software, but
526 * 64-bit kernel), so the x86_64 mask value is 0x5454.
527 * See the AMD manual no. 24593 (AMD64 System Programming)
528 */ 722 */
529#ifdef CONFIG_X86_32 723 attr.bp_addr = addr;
530#define DR7_MASK 0x5f54 724 attr.bp_len = HW_BREAKPOINT_LEN_1;
531#else 725 attr.bp_type = HW_BREAKPOINT_W;
532#define DR7_MASK 0x5554 726 attr.disabled = 1;
533#endif 727
534 data &= ~DR_CONTROL_RESERVED; 728 bp = register_user_hw_breakpoint(&attr, ptrace_triggered, tsk);
535 for (i = 0; i < 4; i++) 729
536 if ((DR7_MASK >> ((data >> (16 + 4*i)) & 0xf)) & 1) 730 /*
537 return -EIO; 731 * CHECKME: the previous code returned -EIO if the addr wasn't
538 child->thread.debugreg7 = data; 732 * a valid task virtual addr. The new one will return -EINVAL in
539 if (data) 733 * this case.
540 set_tsk_thread_flag(child, TIF_DEBUG); 734 * -EINVAL may be what we want for in-kernel breakpoints users,
541 else 735 * but -EIO looks better for ptrace, since we refuse a register
542 clear_tsk_thread_flag(child, TIF_DEBUG); 736 * writing for the user. And anyway this is the previous
543 break; 737 * behaviour.
738 */
739 if (IS_ERR(bp))
740 return PTR_ERR(bp);
741
742 t->ptrace_bps[nr] = bp;
743 } else {
744 int err;
745
746 bp = t->ptrace_bps[nr];
747
748 attr = bp->attr;
749 attr.bp_addr = addr;
750 err = modify_user_hw_breakpoint(bp, &attr);
751 if (err)
752 return err;
544 } 753 }
545 754
755
546 return 0; 756 return 0;
547} 757}
548 758
549/* 759/*
760 * Handle PTRACE_POKEUSR calls for the debug register area.
761 */
762int ptrace_set_debugreg(struct task_struct *tsk, int n, unsigned long val)
763{
764 struct thread_struct *thread = &(tsk->thread);
765 int rc = 0;
766
767 /* There are no DR4 or DR5 registers */
768 if (n == 4 || n == 5)
769 return -EIO;
770
771 if (n == 6) {
772 thread->debugreg6 = val;
773 goto ret_path;
774 }
775 if (n < HBP_NUM) {
776 rc = ptrace_set_breakpoint_addr(tsk, n, val);
777 if (rc)
778 return rc;
779 }
780 /* All that's left is DR7 */
781 if (n == 7)
782 rc = ptrace_write_dr7(tsk, val);
783
784ret_path:
785 return rc;
786}
787
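Illustrative sketch (editor's addition, not part of the commit): the user-space debug-register ABI that ptrace_set_debugreg()/ptrace_write_dr7() keep working on top of perf events -- a forked child gets a 4-byte write watchpoint via PTRACE_POKEUSER on the u_debugreg area (0x000d0001 is the DR7 value from the sketch earlier in this hunk). Error handling is trimmed for brevity.

#include <stdio.h>
#include <stddef.h>
#include <signal.h>
#include <unistd.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/user.h>
#include <sys/wait.h>

static volatile int watched;                    /* same address in parent and child */

int main(void)
{
        int status;
        pid_t pid = fork();

        if (pid == 0) {                         /* child: stop, then touch the variable */
                ptrace(PTRACE_TRACEME, 0, NULL, NULL);
                raise(SIGSTOP);
                watched = 42;                   /* should trip the watchpoint */
                _exit(0);
        }

        waitpid(pid, &status, 0);               /* child stopped itself */

        /* DR0 = address to watch */
        ptrace(PTRACE_POKEUSER, pid,
               (void *)offsetof(struct user, u_debugreg[0]), (void *)&watched);
        /* DR7: local enable for slot 0, break on write, 4-byte length */
        ptrace(PTRACE_POKEUSER, pid,
               (void *)offsetof(struct user, u_debugreg[7]), (void *)0x000d0001UL);

        ptrace(PTRACE_CONT, pid, NULL, NULL);
        waitpid(pid, &status, 0);
        if (WIFSTOPPED(status) && WSTOPSIG(status) == SIGTRAP) {
                long dr6 = ptrace(PTRACE_PEEKUSER, pid,
                                  (void *)offsetof(struct user, u_debugreg[6]), NULL);
                printf("watchpoint hit, DR6 = %#lx\n", dr6);
        }
        kill(pid, SIGKILL);
        return 0;
}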
788/*
550 * These access the current or another (stopped) task's io permission 789 * These access the current or another (stopped) task's io permission
551 * bitmap for debugging or core dump. 790 * bitmap for debugging or core dump.
552 */ 791 */
@@ -1219,14 +1458,14 @@ static int genregs32_get(struct task_struct *target,
1219{ 1458{
1220 if (kbuf) { 1459 if (kbuf) {
1221 compat_ulong_t *k = kbuf; 1460 compat_ulong_t *k = kbuf;
1222 while (count > 0) { 1461 while (count >= sizeof(*k)) {
1223 getreg32(target, pos, k++); 1462 getreg32(target, pos, k++);
1224 count -= sizeof(*k); 1463 count -= sizeof(*k);
1225 pos += sizeof(*k); 1464 pos += sizeof(*k);
1226 } 1465 }
1227 } else { 1466 } else {
1228 compat_ulong_t __user *u = ubuf; 1467 compat_ulong_t __user *u = ubuf;
1229 while (count > 0) { 1468 while (count >= sizeof(*u)) {
1230 compat_ulong_t word; 1469 compat_ulong_t word;
1231 getreg32(target, pos, &word); 1470 getreg32(target, pos, &word);
1232 if (__put_user(word, u++)) 1471 if (__put_user(word, u++))
@@ -1247,14 +1486,14 @@ static int genregs32_set(struct task_struct *target,
1247 int ret = 0; 1486 int ret = 0;
1248 if (kbuf) { 1487 if (kbuf) {
1249 const compat_ulong_t *k = kbuf; 1488 const compat_ulong_t *k = kbuf;
1250 while (count > 0 && !ret) { 1489 while (count >= sizeof(*k) && !ret) {
1251 ret = putreg32(target, pos, *k++); 1490 ret = putreg32(target, pos, *k++);
1252 count -= sizeof(*k); 1491 count -= sizeof(*k);
1253 pos += sizeof(*k); 1492 pos += sizeof(*k);
1254 } 1493 }
1255 } else { 1494 } else {
1256 const compat_ulong_t __user *u = ubuf; 1495 const compat_ulong_t __user *u = ubuf;
1257 while (count > 0 && !ret) { 1496 while (count >= sizeof(*u) && !ret) {
1258 compat_ulong_t word; 1497 compat_ulong_t word;
1259 ret = __get_user(word, u++); 1498 ret = __get_user(word, u++);
1260 if (ret) 1499 if (ret)
@@ -1437,21 +1676,33 @@ const struct user_regset_view *task_user_regset_view(struct task_struct *task)
1437#endif 1676#endif
1438} 1677}
1439 1678
1440void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, 1679static void fill_sigtrap_info(struct task_struct *tsk,
1441 int error_code, int si_code) 1680 struct pt_regs *regs,
1681 int error_code, int si_code,
1682 struct siginfo *info)
1442{ 1683{
1443 struct siginfo info;
1444
1445 tsk->thread.trap_no = 1; 1684 tsk->thread.trap_no = 1;
1446 tsk->thread.error_code = error_code; 1685 tsk->thread.error_code = error_code;
1447 1686
1448 memset(&info, 0, sizeof(info)); 1687 memset(info, 0, sizeof(*info));
1449 info.si_signo = SIGTRAP; 1688 info->si_signo = SIGTRAP;
1450 info.si_code = si_code; 1689 info->si_code = si_code;
1690 info->si_addr = user_mode_vm(regs) ? (void __user *)regs->ip : NULL;
1691}
1451 1692
1452 /* User-mode ip? */ 1693void user_single_step_siginfo(struct task_struct *tsk,
1453 info.si_addr = user_mode_vm(regs) ? (void __user *) regs->ip : NULL; 1694 struct pt_regs *regs,
1695 struct siginfo *info)
1696{
1697 fill_sigtrap_info(tsk, regs, 0, TRAP_BRKPT, info);
1698}
1454 1699
1700void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
1701 int error_code, int si_code)
1702{
1703 struct siginfo info;
1704
1705 fill_sigtrap_info(tsk, regs, error_code, si_code, &info);
1455 /* Send us the fake SIGTRAP */ 1706 /* Send us the fake SIGTRAP */
1456 force_sig_info(SIGTRAP, &info, tsk); 1707 force_sig_info(SIGTRAP, &info, tsk);
1457} 1708}
@@ -1516,29 +1767,22 @@ asmregparm long syscall_trace_enter(struct pt_regs *regs)
1516 1767
1517asmregparm void syscall_trace_leave(struct pt_regs *regs) 1768asmregparm void syscall_trace_leave(struct pt_regs *regs)
1518{ 1769{
1770 bool step;
1771
1519 if (unlikely(current->audit_context)) 1772 if (unlikely(current->audit_context))
1520 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); 1773 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
1521 1774
1522 if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) 1775 if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
1523 trace_sys_exit(regs, regs->ax); 1776 trace_sys_exit(regs, regs->ax);
1524 1777
1525 if (test_thread_flag(TIF_SYSCALL_TRACE))
1526 tracehook_report_syscall_exit(regs, 0);
1527
1528 /* 1778 /*
1529 * If TIF_SYSCALL_EMU is set, we only get here because of 1779 * If TIF_SYSCALL_EMU is set, we only get here because of
1530 * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP). 1780 * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP).
1531 * We already reported this syscall instruction in 1781 * We already reported this syscall instruction in
1532 * syscall_trace_enter(), so don't do any more now. 1782 * syscall_trace_enter().
1533 */
1534 if (unlikely(test_thread_flag(TIF_SYSCALL_EMU)))
1535 return;
1536
1537 /*
1538 * If we are single-stepping, synthesize a trap to follow the
1539 * system call instruction.
1540 */ 1783 */
1541 if (test_thread_flag(TIF_SINGLESTEP) && 1784 step = unlikely(test_thread_flag(TIF_SINGLESTEP)) &&
1542 tracehook_consider_fatal_signal(current, SIGTRAP)) 1785 !test_thread_flag(TIF_SYSCALL_EMU);
1543 send_sigtrap(current, regs, 0, TRAP_BRKPT); 1786 if (step || test_thread_flag(TIF_SYSCALL_TRACE))
1787 tracehook_report_syscall_exit(regs, step);
1544} 1788}
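Illustrative sketch (editor's addition, not part of the commit): what the reworked syscall_trace_leave() reports to a tracer -- with PTRACE_SYSCALL and PTRACE_O_TRACESYSGOOD the tracer sees a stop with signal SIGTRAP|0x80 on every syscall entry and exit of the child. Error handling is trimmed; the child simply execs /bin/true.

#include <stdio.h>
#include <signal.h>
#include <unistd.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>

int main(void)
{
        int status, stops = 0;
        pid_t pid = fork();

        if (pid == 0) {
                ptrace(PTRACE_TRACEME, 0, NULL, NULL);
                raise(SIGSTOP);
                execl("/bin/true", "true", (char *)NULL);
                _exit(1);
        }

        waitpid(pid, &status, 0);               /* child stopped itself */
        ptrace(PTRACE_SETOPTIONS, pid, NULL, (void *)(long)PTRACE_O_TRACESYSGOOD);

        for (;;) {
                ptrace(PTRACE_SYSCALL, pid, NULL, NULL);        /* run to next entry/exit */
                waitpid(pid, &status, 0);
                if (WIFEXITED(status))
                        break;
                if (WIFSTOPPED(status) && WSTOPSIG(status) == (SIGTRAP | 0x80))
                        stops++;                /* alternating syscall-entry / syscall-exit */
        }
        printf("saw %d syscall stops\n", stops);
        return 0;
}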
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 6c3b2c6fd772..12e9feaa2f7a 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -491,6 +491,19 @@ void force_hpet_resume(void)
491 break; 491 break;
492 } 492 }
493} 493}
494
495/*
 496 * HPET MSI on some boards (ATI SB700/SB800) has a side effect on
497 * floppy DMA. Disable HPET MSI on such platforms.
498 */
499static void force_disable_hpet_msi(struct pci_dev *unused)
500{
501 hpet_msi_disable = 1;
502}
503
504DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS,
505 force_disable_hpet_msi);
506
494#endif 507#endif
495 508
496#if defined(CONFIG_PCI) && defined(CONFIG_NUMA) 509#if defined(CONFIG_PCI) && defined(CONFIG_NUMA)
@@ -499,6 +512,7 @@ static void __init quirk_amd_nb_node(struct pci_dev *dev)
499{ 512{
500 struct pci_dev *nb_ht; 513 struct pci_dev *nb_ht;
501 unsigned int devfn; 514 unsigned int devfn;
515 u32 node;
502 u32 val; 516 u32 val;
503 517
504 devfn = PCI_DEVFN(PCI_SLOT(dev->devfn), 0); 518 devfn = PCI_DEVFN(PCI_SLOT(dev->devfn), 0);
@@ -507,7 +521,13 @@ static void __init quirk_amd_nb_node(struct pci_dev *dev)
507 return; 521 return;
508 522
509 pci_read_config_dword(nb_ht, 0x60, &val); 523 pci_read_config_dword(nb_ht, 0x60, &val);
510 set_dev_node(&dev->dev, val & 7); 524 node = val & 7;
525 /*
526 * Some hardware may return an invalid node ID,
527 * so check it first:
528 */
529 if (node_online(node))
530 set_dev_node(&dev->dev, node);
511 pci_dev_put(nb_ht); 531 pci_dev_put(nb_ht);
512} 532}
513 533
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index a1a3cdda06e1..704bddcdf64d 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -23,7 +23,7 @@
23# include <linux/ctype.h> 23# include <linux/ctype.h>
24# include <linux/mc146818rtc.h> 24# include <linux/mc146818rtc.h>
25#else 25#else
26# include <asm/iommu.h> 26# include <asm/x86_init.h>
27#endif 27#endif
28 28
29/* 29/*
@@ -203,6 +203,15 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
203 DMI_MATCH(DMI_BOARD_NAME, "0T656F"), 203 DMI_MATCH(DMI_BOARD_NAME, "0T656F"),
204 }, 204 },
205 }, 205 },
206 { /* Handle problems with rebooting on Dell OptiPlex 760 with 0G919G*/
207 .callback = set_bios_reboot,
208 .ident = "Dell OptiPlex 760",
209 .matches = {
210 DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
211 DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 760"),
212 DMI_MATCH(DMI_BOARD_NAME, "0G919G"),
213 },
214 },
206 { /* Handle problems with rebooting on Dell 2400's */ 215 { /* Handle problems with rebooting on Dell 2400's */
207 .callback = set_bios_reboot, 216 .callback = set_bios_reboot,
208 .ident = "Dell PowerEdge 2400", 217 .ident = "Dell PowerEdge 2400",
@@ -259,6 +268,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
259 DMI_MATCH(DMI_PRODUCT_NAME, "SBC-FITPC2"), 268 DMI_MATCH(DMI_PRODUCT_NAME, "SBC-FITPC2"),
260 }, 269 },
261 }, 270 },
271 { /* Handle problems with rebooting on ASUS P4S800 */
272 .callback = set_bios_reboot,
273 .ident = "ASUS P4S800",
274 .matches = {
275 DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
276 DMI_MATCH(DMI_BOARD_NAME, "P4S800"),
277 },
278 },
262 { } 279 { }
263}; 280};
264 281
@@ -436,6 +453,14 @@ static struct dmi_system_id __initdata pci_reboot_dmi_table[] = {
436 DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro5"), 453 DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro5"),
437 }, 454 },
438 }, 455 },
456 { /* Handle problems with rebooting on Apple Macmini3,1 */
457 .callback = set_pci_reboot,
458 .ident = "Apple Macmini3,1",
459 .matches = {
460 DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
461 DMI_MATCH(DMI_PRODUCT_NAME, "Macmini3,1"),
462 },
463 },
439 { } 464 { }
440}; 465};
441 466
@@ -614,7 +639,7 @@ void native_machine_shutdown(void)
614#endif 639#endif
615 640
616#ifdef CONFIG_X86_64 641#ifdef CONFIG_X86_64
617 pci_iommu_shutdown(); 642 x86_platform.iommu_shutdown();
618#endif 643#endif
619} 644}
620 645
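Illustrative sketch (editor's addition, not part of the commit): the strings these new DMI quirk entries match on can be read back from sysfs, which is the quickest way to check whether a given machine would hit one of them (attribute names per the standard /sys/class/dmi/id interface):

#include <stdio.h>

static void show(const char *attr)
{
        char path[128], buf[128];
        FILE *f;

        snprintf(path, sizeof(path), "/sys/class/dmi/id/%s", attr);
        f = fopen(path, "r");
        if (!f)
                return;
        if (fgets(buf, sizeof(buf), f))
                printf("%-14s %s", attr, buf);
        fclose(f);
}

int main(void)
{
        show("sys_vendor");     /* DMI_SYS_VENDOR   */
        show("product_name");   /* DMI_PRODUCT_NAME */
        show("board_vendor");   /* DMI_BOARD_VENDOR */
        show("board_name");     /* DMI_BOARD_NAME   */
        return 0;
}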
diff --git a/arch/x86/kernel/reboot_fixups_32.c b/arch/x86/kernel/reboot_fixups_32.c
index 61a837743fe5..fda313ebbb03 100644
--- a/arch/x86/kernel/reboot_fixups_32.c
+++ b/arch/x86/kernel/reboot_fixups_32.c
@@ -12,7 +12,7 @@
12#include <linux/interrupt.h> 12#include <linux/interrupt.h>
13#include <asm/reboot_fixups.h> 13#include <asm/reboot_fixups.h>
14#include <asm/msr.h> 14#include <asm/msr.h>
15#include <asm/geode.h> 15#include <linux/cs5535.h>
16 16
17static void cs5530a_warm_reset(struct pci_dev *dev) 17static void cs5530a_warm_reset(struct pci_dev *dev)
18{ 18{
@@ -80,6 +80,7 @@ void mach_reboot_fixups(void)
80 continue; 80 continue;
81 81
82 cur->reboot_fixup(dev); 82 cur->reboot_fixup(dev);
83 pci_dev_put(dev);
83 } 84 }
84} 85}
85 86
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index f9b1f4e5ab74..3499b4fabc94 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -73,6 +73,7 @@
73 73
74#include <asm/mtrr.h> 74#include <asm/mtrr.h>
75#include <asm/apic.h> 75#include <asm/apic.h>
76#include <asm/trampoline.h>
76#include <asm/e820.h> 77#include <asm/e820.h>
77#include <asm/mpspec.h> 78#include <asm/mpspec.h>
78#include <asm/setup.h> 79#include <asm/setup.h>
@@ -110,6 +111,7 @@
110#ifdef CONFIG_X86_64 111#ifdef CONFIG_X86_64
111#include <asm/numa_64.h> 112#include <asm/numa_64.h>
112#endif 113#endif
114#include <asm/mce.h>
113 115
114/* 116/*
115 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. 117 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
@@ -248,7 +250,7 @@ EXPORT_SYMBOL(edd);
248 * from boot_params into a safe place. 250 * from boot_params into a safe place.
249 * 251 *
250 */ 252 */
251static inline void copy_edd(void) 253static inline void __init copy_edd(void)
252{ 254{
253 memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer, 255 memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer,
254 sizeof(edd.mbr_signature)); 256 sizeof(edd.mbr_signature));
@@ -257,7 +259,7 @@ static inline void copy_edd(void)
257 edd.edd_info_nr = boot_params.eddbuf_entries; 259 edd.edd_info_nr = boot_params.eddbuf_entries;
258} 260}
259#else 261#else
260static inline void copy_edd(void) 262static inline void __init copy_edd(void)
261{ 263{
262} 264}
263#endif 265#endif
@@ -634,18 +636,33 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
634 }, 636 },
635 }, 637 },
636 { 638 {
639 .callback = dmi_low_memory_corruption,
640 .ident = "Phoenix/MSC BIOS",
641 .matches = {
642 DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix/MSC"),
643 },
644 },
637 /* 645 /*
638 * AMI BIOS with low memory corruption was found on Intel DG45ID board. 646 * AMI BIOS with low memory corruption was found on Intel DG45ID and
639 * It hase different DMI_BIOS_VENDOR = "Intel Corp.", for now we will 647 * DG45FC boards.
648 * It has a different DMI_BIOS_VENDOR = "Intel Corp.", for now we will
640 * match only DMI_BOARD_NAME and see if there is more bad products 649 * match only DMI_BOARD_NAME and see if there is more bad products
641 * with this vendor. 650 * with this vendor.
642 */ 651 */
652 {
643 .callback = dmi_low_memory_corruption, 653 .callback = dmi_low_memory_corruption,
644 .ident = "AMI BIOS", 654 .ident = "AMI BIOS",
645 .matches = { 655 .matches = {
646 DMI_MATCH(DMI_BOARD_NAME, "DG45ID"), 656 DMI_MATCH(DMI_BOARD_NAME, "DG45ID"),
647 }, 657 },
648 }, 658 },
659 {
660 .callback = dmi_low_memory_corruption,
661 .ident = "AMI BIOS",
662 .matches = {
663 DMI_MATCH(DMI_BOARD_NAME, "DG45FC"),
664 },
665 },
649#endif 666#endif
650 {} 667 {}
651}; 668};
@@ -884,6 +901,13 @@ void __init setup_arch(char **cmdline_p)
884 901
885 reserve_brk(); 902 reserve_brk();
886 903
904 /*
905 * Find and reserve possible boot-time SMP configuration:
906 */
907 find_smp_config();
908
909 reserve_trampoline_memory();
910
887#ifdef CONFIG_ACPI_SLEEP 911#ifdef CONFIG_ACPI_SLEEP
888 /* 912 /*
889 * Reserve low memory region for sleep support. 913 * Reserve low memory region for sleep support.
@@ -930,11 +954,6 @@ void __init setup_arch(char **cmdline_p)
930 954
931 early_acpi_boot_init(); 955 early_acpi_boot_init();
932 956
933 /*
934 * Find and reserve possible boot-time SMP configuration:
935 */
936 find_smp_config();
937
938#ifdef CONFIG_ACPI_NUMA 957#ifdef CONFIG_ACPI_NUMA
939 /* 958 /*
940 * Parse SRAT to discover nodes. 959 * Parse SRAT to discover nodes.
@@ -1021,6 +1040,8 @@ void __init setup_arch(char **cmdline_p)
1021#endif 1040#endif
1022#endif 1041#endif
1023 x86_init.oem.banner(); 1042 x86_init.oem.banner();
1043
1044 mcheck_init();
1024} 1045}
1025 1046
1026#ifdef CONFIG_X86_32 1047#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index d559af913e1f..35abcb8b00e9 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -1,3 +1,5 @@
1#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
2
1#include <linux/kernel.h> 3#include <linux/kernel.h>
2#include <linux/module.h> 4#include <linux/module.h>
3#include <linux/init.h> 5#include <linux/init.h>
@@ -20,9 +22,9 @@
20#include <asm/stackprotector.h> 22#include <asm/stackprotector.h>
21 23
22#ifdef CONFIG_DEBUG_PER_CPU_MAPS 24#ifdef CONFIG_DEBUG_PER_CPU_MAPS
23# define DBG(x...) printk(KERN_DEBUG x) 25# define DBG(fmt, ...) pr_dbg(fmt, ##__VA_ARGS__)
24#else 26#else
25# define DBG(x...) 27# define DBG(fmt, ...) do { if (0) pr_dbg(fmt, ##__VA_ARGS__); } while (0)
26#endif 28#endif
27 29
28DEFINE_PER_CPU(int, cpu_number); 30DEFINE_PER_CPU(int, cpu_number);
@@ -116,8 +118,8 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
116 } else { 118 } else {
117 ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node), 119 ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node),
118 size, align, goal); 120 size, align, goal);
119 pr_debug("per cpu data for cpu%d %lu bytes on node%d at " 121 pr_debug("per cpu data for cpu%d %lu bytes on node%d at %016lx\n",
120 "%016lx\n", cpu, size, node, __pa(ptr)); 122 cpu, size, node, __pa(ptr));
121 } 123 }
122 return ptr; 124 return ptr;
123#else 125#else
@@ -198,8 +200,7 @@ void __init setup_per_cpu_areas(void)
198 pcpu_cpu_distance, 200 pcpu_cpu_distance,
199 pcpu_fc_alloc, pcpu_fc_free); 201 pcpu_fc_alloc, pcpu_fc_free);
200 if (rc < 0) 202 if (rc < 0)
201 pr_warning("PERCPU: %s allocator failed (%d), " 203 pr_warning("%s allocator failed (%d), falling back to page size\n",
202 "falling back to page size\n",
203 pcpu_fc_names[pcpu_chosen_fc], rc); 204 pcpu_fc_names[pcpu_chosen_fc], rc);
204 } 205 }
205 if (rc < 0) 206 if (rc < 0)
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 6a44a76055ad..4fd173cd8e57 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -19,6 +19,7 @@
19#include <linux/stddef.h> 19#include <linux/stddef.h>
20#include <linux/personality.h> 20#include <linux/personality.h>
21#include <linux/uaccess.h> 21#include <linux/uaccess.h>
22#include <linux/user-return-notifier.h>
22 23
23#include <asm/processor.h> 24#include <asm/processor.h>
24#include <asm/ucontext.h> 25#include <asm/ucontext.h>
@@ -544,22 +545,12 @@ sys_sigaction(int sig, const struct old_sigaction __user *act,
544} 545}
545#endif /* CONFIG_X86_32 */ 546#endif /* CONFIG_X86_32 */
546 547
547#ifdef CONFIG_X86_32 548long
548int sys_sigaltstack(struct pt_regs *regs)
549{
550 const stack_t __user *uss = (const stack_t __user *)regs->bx;
551 stack_t __user *uoss = (stack_t __user *)regs->cx;
552
553 return do_sigaltstack(uss, uoss, regs->sp);
554}
555#else /* !CONFIG_X86_32 */
556asmlinkage long
557sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, 549sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
558 struct pt_regs *regs) 550 struct pt_regs *regs)
559{ 551{
560 return do_sigaltstack(uss, uoss, regs->sp); 552 return do_sigaltstack(uss, uoss, regs->sp);
561} 553}
562#endif /* CONFIG_X86_32 */
563 554
564/* 555/*
565 * Do a signal return; undo the signal stack. 556 * Do a signal return; undo the signal stack.
@@ -799,15 +790,6 @@ static void do_signal(struct pt_regs *regs)
799 790
800 signr = get_signal_to_deliver(&info, &ka, regs, NULL); 791 signr = get_signal_to_deliver(&info, &ka, regs, NULL);
801 if (signr > 0) { 792 if (signr > 0) {
802 /*
803 * Re-enable any watchpoints before delivering the
804 * signal to user space. The processor register will
805 * have been cleared if the watchpoint triggered
806 * inside the kernel.
807 */
808 if (current->thread.debugreg7)
809 set_debugreg(current->thread.debugreg7, 7);
810
811 /* Whee! Actually deliver the signal. */ 793 /* Whee! Actually deliver the signal. */
812 if (handle_signal(signr, &info, &ka, oldset, regs) == 0) { 794 if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
813 /* 795 /*
@@ -872,6 +854,8 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
872 if (current->replacement_session_keyring) 854 if (current->replacement_session_keyring)
873 key_replace_session_keyring(); 855 key_replace_session_keyring();
874 } 856 }
857 if (thread_info_flags & _TIF_USER_RETURN_NOTIFY)
858 fire_user_return_notifiers();
875 859
876#ifdef CONFIG_X86_32 860#ifdef CONFIG_X86_32
877 clear_thread_flag(TIF_IRET); 861 clear_thread_flag(TIF_IRET);
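Illustrative sketch (editor's addition, not part of the commit): the consumer side of the _TIF_USER_RETURN_NOTIFY hook wired up above. The names are taken from linux/user-return-notifier.h as of this merge window and should be treated as an assumption rather than a verified API listing; the callback runs on the current cpu each time the kernel is about to return to user space, until it is unregistered.

#include <linux/user-return-notifier.h>

/* hypothetical user: lazily restore some per-cpu state before user mode runs */
static void my_on_user_return(struct user_return_notifier *urn)
{
        /* ... restore MSRs / registers that were deferred earlier ... */
}

static struct user_return_notifier my_urn = {
        .on_user_return = my_on_user_return,
};

static void arm_return_notifier(void)
{
        user_return_notifier_register(&my_urn);         /* per-cpu registration */
}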
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index d915d956e66d..ec1de97600e7 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -198,7 +198,6 @@ void smp_reschedule_interrupt(struct pt_regs *regs)
198{ 198{
199 ack_APIC_irq(); 199 ack_APIC_irq();
200 inc_irq_stat(irq_resched_count); 200 inc_irq_stat(irq_resched_count);
201 run_local_timers();
202 /* 201 /*
203 * KVM uses this interrupt to force a cpu out of guest mode 202 * KVM uses this interrupt to force a cpu out of guest mode
204 */ 203 */
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 565ebc65920e..b4e870cbdc60 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -671,6 +671,26 @@ static void __cpuinit do_fork_idle(struct work_struct *work)
671 complete(&c_idle->done); 671 complete(&c_idle->done);
672} 672}
673 673
674/* reduce the number of lines printed when booting a large cpu count system */
675static void __cpuinit announce_cpu(int cpu, int apicid)
676{
677 static int current_node = -1;
678 int node = cpu_to_node(cpu);
679
680 if (system_state == SYSTEM_BOOTING) {
681 if (node != current_node) {
682 if (current_node > (-1))
683 pr_cont(" Ok.\n");
684 current_node = node;
685 pr_info("Booting Node %3d, Processors ", node);
686 }
687 pr_cont(" #%d%s", cpu, cpu == (nr_cpu_ids - 1) ? " Ok.\n" : "");
688 return;
689 } else
690 pr_info("Booting Node %d Processor %d APIC 0x%x\n",
691 node, cpu, apicid);
692}
693
674/* 694/*
675 * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad 695 * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
676 * (ie clustered apic addressing mode), this is a LOGICAL apic ID. 696 * (ie clustered apic addressing mode), this is a LOGICAL apic ID.
@@ -687,7 +707,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
687 .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done), 707 .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done),
688 }; 708 };
689 709
690 INIT_WORK(&c_idle.work, do_fork_idle); 710 INIT_WORK_ON_STACK(&c_idle.work, do_fork_idle);
691 711
692 alternatives_smp_switch(1); 712 alternatives_smp_switch(1);
693 713
@@ -713,6 +733,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
713 733
714 if (IS_ERR(c_idle.idle)) { 734 if (IS_ERR(c_idle.idle)) {
715 printk("failed fork for CPU %d\n", cpu); 735 printk("failed fork for CPU %d\n", cpu);
736 destroy_work_on_stack(&c_idle.work);
716 return PTR_ERR(c_idle.idle); 737 return PTR_ERR(c_idle.idle);
717 } 738 }
718 739
@@ -736,9 +757,8 @@ do_rest:
736 /* start_ip had better be page-aligned! */ 757 /* start_ip had better be page-aligned! */
737 start_ip = setup_trampoline(); 758 start_ip = setup_trampoline();
738 759
739 /* So we see what's up */ 760 /* So we see what's up */
740 printk(KERN_INFO "Booting processor %d APIC 0x%x ip 0x%lx\n", 761 announce_cpu(cpu, apicid);
741 cpu, apicid, start_ip);
742 762
743 /* 763 /*
744 * This grunge runs the startup process for 764 * This grunge runs the startup process for
@@ -787,21 +807,17 @@ do_rest:
787 udelay(100); 807 udelay(100);
788 } 808 }
789 809
790 if (cpumask_test_cpu(cpu, cpu_callin_mask)) { 810 if (cpumask_test_cpu(cpu, cpu_callin_mask))
791 /* number CPUs logically, starting from 1 (BSP is 0) */ 811 pr_debug("CPU%d: has booted.\n", cpu);
792 pr_debug("OK.\n"); 812 else {
793 printk(KERN_INFO "CPU%d: ", cpu);
794 print_cpu_info(&cpu_data(cpu));
795 pr_debug("CPU has booted.\n");
796 } else {
797 boot_error = 1; 813 boot_error = 1;
798 if (*((volatile unsigned char *)trampoline_base) 814 if (*((volatile unsigned char *)trampoline_base)
799 == 0xA5) 815 == 0xA5)
800 /* trampoline started but...? */ 816 /* trampoline started but...? */
801 printk(KERN_ERR "Stuck ??\n"); 817 pr_err("CPU%d: Stuck ??\n", cpu);
802 else 818 else
803 /* trampoline code not run */ 819 /* trampoline code not run */
804 printk(KERN_ERR "Not responding.\n"); 820 pr_err("CPU%d: Not responding.\n", cpu);
805 if (apic->inquire_remote_apic) 821 if (apic->inquire_remote_apic)
806 apic->inquire_remote_apic(apicid); 822 apic->inquire_remote_apic(apicid);
807 } 823 }
@@ -831,6 +847,7 @@ do_rest:
831 smpboot_restore_warm_reset_vector(); 847 smpboot_restore_warm_reset_vector();
832 } 848 }
833 849
850 destroy_work_on_stack(&c_idle.work);
834 return boot_error; 851 return boot_error;
835} 852}
836 853
@@ -1066,9 +1083,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1066 set_cpu_sibling_map(0); 1083 set_cpu_sibling_map(0);
1067 1084
1068 enable_IR_x2apic(); 1085 enable_IR_x2apic();
1069#ifdef CONFIG_X86_64
1070 default_setup_apic_routing(); 1086 default_setup_apic_routing();
1071#endif
1072 1087
1073 if (smp_sanity_check(max_cpus) < 0) { 1088 if (smp_sanity_check(max_cpus) < 0) {
1074 printk(KERN_INFO "SMP disabled\n"); 1089 printk(KERN_INFO "SMP disabled\n");
@@ -1250,16 +1265,7 @@ static void __ref remove_cpu_from_maps(int cpu)
1250void cpu_disable_common(void) 1265void cpu_disable_common(void)
1251{ 1266{
1252 int cpu = smp_processor_id(); 1267 int cpu = smp_processor_id();
1253 /*
1254 * HACK:
1255 * Allow any queued timer interrupts to get serviced
1256 * This is only a temporary solution until we cleanup
1257 * fixup_irqs as we do for IA64.
1258 */
1259 local_irq_enable();
1260 mdelay(1);
1261 1268
1262 local_irq_disable();
1263 remove_siblinginfo(cpu); 1269 remove_siblinginfo(cpu);
1264 1270
1265 /* It's now safe to remove this processor from the online map */ 1271 /* It's now safe to remove this processor from the online map */
@@ -1300,14 +1306,16 @@ void native_cpu_die(unsigned int cpu)
1300 for (i = 0; i < 10; i++) { 1306 for (i = 0; i < 10; i++) {
1301 /* They ack this in play_dead by setting CPU_DEAD */ 1307 /* They ack this in play_dead by setting CPU_DEAD */
1302 if (per_cpu(cpu_state, cpu) == CPU_DEAD) { 1308 if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
1303 printk(KERN_INFO "CPU %d is now offline\n", cpu); 1309 if (system_state == SYSTEM_RUNNING)
1310 pr_info("CPU %u is now offline\n", cpu);
1311
1304 if (1 == num_online_cpus()) 1312 if (1 == num_online_cpus())
1305 alternatives_smp_switch(0); 1313 alternatives_smp_switch(0);
1306 return; 1314 return;
1307 } 1315 }
1308 msleep(100); 1316 msleep(100);
1309 } 1317 }
1310 printk(KERN_ERR "CPU %u didn't die...\n", cpu); 1318 pr_err("CPU %u didn't die...\n", cpu);
1311} 1319}
1312 1320
1313void play_dead_common(void) 1321void play_dead_common(void)
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index c3eb207181fe..922eefbb3f6c 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -53,17 +53,19 @@ save_stack_address_nosched(void *data, unsigned long addr, int reliable)
53} 53}
54 54
55static const struct stacktrace_ops save_stack_ops = { 55static const struct stacktrace_ops save_stack_ops = {
56 .warning = save_stack_warning, 56 .warning = save_stack_warning,
57 .warning_symbol = save_stack_warning_symbol, 57 .warning_symbol = save_stack_warning_symbol,
58 .stack = save_stack_stack, 58 .stack = save_stack_stack,
59 .address = save_stack_address, 59 .address = save_stack_address,
60 .walk_stack = print_context_stack,
60}; 61};
61 62
62static const struct stacktrace_ops save_stack_ops_nosched = { 63static const struct stacktrace_ops save_stack_ops_nosched = {
63 .warning = save_stack_warning, 64 .warning = save_stack_warning,
64 .warning_symbol = save_stack_warning_symbol, 65 .warning_symbol = save_stack_warning_symbol,
65 .stack = save_stack_stack, 66 .stack = save_stack_stack,
66 .address = save_stack_address_nosched, 67 .address = save_stack_address_nosched,
68 .walk_stack = print_context_stack,
67}; 69};
68 70
69/* 71/*
diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c
index 1884a8d12bfa..dee1ff7cba58 100644
--- a/arch/x86/kernel/sys_i386_32.c
+++ b/arch/x86/kernel/sys_i386_32.c
@@ -24,31 +24,6 @@
24 24
25#include <asm/syscalls.h> 25#include <asm/syscalls.h>
26 26
27asmlinkage long sys_mmap2(unsigned long addr, unsigned long len,
28 unsigned long prot, unsigned long flags,
29 unsigned long fd, unsigned long pgoff)
30{
31 int error = -EBADF;
32 struct file *file = NULL;
33 struct mm_struct *mm = current->mm;
34
35 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
36 if (!(flags & MAP_ANONYMOUS)) {
37 file = fget(fd);
38 if (!file)
39 goto out;
40 }
41
42 down_write(&mm->mmap_sem);
43 error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
44 up_write(&mm->mmap_sem);
45
46 if (file)
47 fput(file);
48out:
49 return error;
50}
51
52/* 27/*
53 * Perform the select(nd, in, out, ex, tv) and mmap() system 28 * Perform the select(nd, in, out, ex, tv) and mmap() system
54 * calls. Linux/i386 didn't use to be able to handle more than 29 * calls. Linux/i386 didn't use to be able to handle more than
@@ -77,7 +52,7 @@ asmlinkage int old_mmap(struct mmap_arg_struct __user *arg)
77 if (a.offset & ~PAGE_MASK) 52 if (a.offset & ~PAGE_MASK)
78 goto out; 53 goto out;
79 54
80 err = sys_mmap2(a.addr, a.len, a.prot, a.flags, 55 err = sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags,
81 a.fd, a.offset >> PAGE_SHIFT); 56 a.fd, a.offset >> PAGE_SHIFT);
82out: 57out:
83 return err; 58 return err;
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 45e00eb09c3a..8aa2057efd12 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -23,26 +23,11 @@ SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len,
23 unsigned long, fd, unsigned long, off) 23 unsigned long, fd, unsigned long, off)
24{ 24{
25 long error; 25 long error;
26 struct file *file;
27
28 error = -EINVAL; 26 error = -EINVAL;
29 if (off & ~PAGE_MASK) 27 if (off & ~PAGE_MASK)
30 goto out; 28 goto out;
31 29
32 error = -EBADF; 30 error = sys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT);
33 file = NULL;
34 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
35 if (!(flags & MAP_ANONYMOUS)) {
36 file = fget(fd);
37 if (!file)
38 goto out;
39 }
40 down_write(&current->mm->mmap_sem);
41 error = do_mmap_pgoff(file, addr, len, prot, flags, off >> PAGE_SHIFT);
42 up_write(&current->mm->mmap_sem);
43
44 if (file)
45 fput(file);
46out: 31out:
47 return error; 32 return error;
48} 33}
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index 0157cd26d7cc..15228b5d3eb7 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -191,7 +191,7 @@ ENTRY(sys_call_table)
191 .long sys_ni_syscall /* reserved for streams2 */ 191 .long sys_ni_syscall /* reserved for streams2 */
192 .long ptregs_vfork /* 190 */ 192 .long ptregs_vfork /* 190 */
193 .long sys_getrlimit 193 .long sys_getrlimit
194 .long sys_mmap2 194 .long sys_mmap_pgoff
195 .long sys_truncate64 195 .long sys_truncate64
196 .long sys_ftruncate64 196 .long sys_ftruncate64
197 .long sys_stat64 /* 195 */ 197 .long sys_stat64 /* 195 */
@@ -336,3 +336,4 @@ ENTRY(sys_call_table)
336 .long sys_pwritev 336 .long sys_pwritev
337 .long sys_rt_tgsigqueueinfo /* 335 */ 337 .long sys_rt_tgsigqueueinfo /* 335 */
338 .long sys_perf_event_open 338 .long sys_perf_event_open
339 .long sys_recvmmsg
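Illustrative sketch (editor's addition, not part of the commit): user-space use of the recvmmsg() call that this table update wires up for 32-bit x86 (entry 337 here). It assumes a glibc new enough to carry the wrapper; on older ones it can be invoked through syscall(2) instead. Error handling is trimmed.

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>

#define VLEN  8
#define BUFSZ 1500

int main(void)
{
        int fd = socket(AF_INET, SOCK_DGRAM, 0);
        struct sockaddr_in addr = {
                .sin_family = AF_INET,
                .sin_port   = htons(5000),
                .sin_addr   = { htonl(INADDR_ANY) },
        };
        struct mmsghdr msgs[VLEN];
        struct iovec iovs[VLEN];
        char bufs[VLEN][BUFSZ];
        int i, n;

        bind(fd, (struct sockaddr *)&addr, sizeof(addr));

        memset(msgs, 0, sizeof(msgs));
        for (i = 0; i < VLEN; i++) {
                iovs[i].iov_base           = bufs[i];
                iovs[i].iov_len            = BUFSZ;
                msgs[i].msg_hdr.msg_iov    = &iovs[i];
                msgs[i].msg_hdr.msg_iovlen = 1;
        }

        /* block until at least one datagram arrives, drain up to VLEN in one call */
        n = recvmmsg(fd, msgs, VLEN, 0, NULL);
        for (i = 0; i < n; i++)
                printf("datagram %d: %u bytes\n", i, msgs[i].msg_len);
        return 0;
}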
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index dcb00d278512..be2573448ed9 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -38,7 +38,8 @@ unsigned long profile_pc(struct pt_regs *regs)
38#ifdef CONFIG_FRAME_POINTER 38#ifdef CONFIG_FRAME_POINTER
39 return *(unsigned long *)(regs->bp + sizeof(long)); 39 return *(unsigned long *)(regs->bp + sizeof(long));
40#else 40#else
41 unsigned long *sp = (unsigned long *)regs->sp; 41 unsigned long *sp =
42 (unsigned long *)kernel_stack_pointer(regs);
42 /* 43 /*
43 * Return address is either directly at stack pointer 44 * Return address is either directly at stack pointer
44 * or above a saved flags. Eflags has bits 22-31 zero, 45 * or above a saved flags. Eflags has bits 22-31 zero,
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index af21e5556900..364d015efebc 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -23,8 +23,6 @@
23static struct bau_control **uv_bau_table_bases __read_mostly; 23static struct bau_control **uv_bau_table_bases __read_mostly;
24static int uv_bau_retry_limit __read_mostly; 24static int uv_bau_retry_limit __read_mostly;
25 25
26/* position of pnode (which is nasid>>1): */
27static int uv_nshift __read_mostly;
28/* base pnode in this partition */ 26/* base pnode in this partition */
29static int uv_partition_base_pnode __read_mostly; 27static int uv_partition_base_pnode __read_mostly;
30 28
@@ -723,7 +721,7 @@ uv_activation_descriptor_init(int node, int pnode)
723 BUG_ON(!adp); 721 BUG_ON(!adp);
724 722
725 pa = uv_gpa(adp); /* need the real nasid*/ 723 pa = uv_gpa(adp); /* need the real nasid*/
726 n = pa >> uv_nshift; 724 n = uv_gpa_to_pnode(pa);
727 m = pa & uv_mmask; 725 m = pa & uv_mmask;
728 726
729 uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE, 727 uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE,
@@ -778,7 +776,7 @@ uv_payload_queue_init(int node, int pnode, struct bau_control *bau_tablesp)
778 * need the pnode of where the memory was really allocated 776 * need the pnode of where the memory was really allocated
779 */ 777 */
780 pa = uv_gpa(pqp); 778 pa = uv_gpa(pqp);
781 pn = pa >> uv_nshift; 779 pn = uv_gpa_to_pnode(pa);
782 uv_write_global_mmr64(pnode, 780 uv_write_global_mmr64(pnode,
783 UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST, 781 UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST,
784 ((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) | 782 ((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) |
@@ -841,8 +839,7 @@ static int __init uv_bau_init(void)
841 GFP_KERNEL, cpu_to_node(cur_cpu)); 839 GFP_KERNEL, cpu_to_node(cur_cpu));
842 840
843 uv_bau_retry_limit = 1; 841 uv_bau_retry_limit = 1;
844 uv_nshift = uv_hub_info->n_val; 842 uv_mmask = (1UL << uv_hub_info->m_val) - 1;
845 uv_mmask = (1UL << uv_hub_info->n_val) - 1;
846 nblades = uv_num_possible_blades(); 843 nblades = uv_num_possible_blades();
847 844
848 uv_bau_table_bases = (struct bau_control **) 845 uv_bau_table_bases = (struct bau_control **)
diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c
index 699f7eeb896a..c652ef62742d 100644
--- a/arch/x86/kernel/trampoline.c
+++ b/arch/x86/kernel/trampoline.c
@@ -3,22 +3,28 @@
3#include <asm/trampoline.h> 3#include <asm/trampoline.h>
4#include <asm/e820.h> 4#include <asm/e820.h>
5 5
6#if defined(CONFIG_X86_64) && defined(CONFIG_ACPI_SLEEP)
7#define __trampinit
8#define __trampinitdata
9#else
10#define __trampinit __cpuinit
11#define __trampinitdata __cpuinitdata
12#endif
13
6/* ready for x86_64 and x86 */ 14/* ready for x86_64 and x86 */
7unsigned char *__cpuinitdata trampoline_base = __va(TRAMPOLINE_BASE); 15unsigned char *__trampinitdata trampoline_base;
8 16
9void __init reserve_trampoline_memory(void) 17void __init reserve_trampoline_memory(void)
10{ 18{
11#ifdef CONFIG_X86_32 19 unsigned long mem;
12 /* 20
13 * But first pinch a few for the stack/trampoline stuff
14 * FIXME: Don't need the extra page at 4K, but need to fix
15 * trampoline before removing it. (see the GDT stuff)
16 */
17 reserve_early(PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE");
18#endif
19 /* Has to be in very low memory so we can execute real-mode AP code. */ 21 /* Has to be in very low memory so we can execute real-mode AP code. */
20 reserve_early(TRAMPOLINE_BASE, TRAMPOLINE_BASE + TRAMPOLINE_SIZE, 22 mem = find_e820_area(0, 1<<20, TRAMPOLINE_SIZE, PAGE_SIZE);
21 "TRAMPOLINE"); 23 if (mem == -1L)
24 panic("Cannot allocate trampoline\n");
25
26 trampoline_base = __va(mem);
27 reserve_early(mem, mem + TRAMPOLINE_SIZE, "TRAMPOLINE");
22} 28}
23 29
24/* 30/*
@@ -26,7 +32,7 @@ void __init reserve_trampoline_memory(void)
26 * bootstrap into the page concerned. The caller 32 * bootstrap into the page concerned. The caller
27 * has made sure it's suitably aligned. 33 * has made sure it's suitably aligned.
28 */ 34 */
29unsigned long __cpuinit setup_trampoline(void) 35unsigned long __trampinit setup_trampoline(void)
30{ 36{
31 memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE); 37 memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE);
32 return virt_to_phys(trampoline_base); 38 return virt_to_phys(trampoline_base);
diff --git a/arch/x86/kernel/trampoline_64.S b/arch/x86/kernel/trampoline_64.S
index 596d54c660a5..3af2dff58b21 100644
--- a/arch/x86/kernel/trampoline_64.S
+++ b/arch/x86/kernel/trampoline_64.S
@@ -32,8 +32,12 @@
32#include <asm/segment.h> 32#include <asm/segment.h>
33#include <asm/processor-flags.h> 33#include <asm/processor-flags.h>
34 34
35#ifdef CONFIG_ACPI_SLEEP
36.section .rodata, "a", @progbits
37#else
35/* We can free up the trampoline after bootup if cpu hotplug is not supported. */ 38/* We can free up the trampoline after bootup if cpu hotplug is not supported. */
36__CPUINITRODATA 39__CPUINITRODATA
40#endif
37.code16 41.code16
38 42
39ENTRY(trampoline_data) 43ENTRY(trampoline_data)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 7e37dcee0cc3..33399176512a 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -529,77 +529,56 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
529dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) 529dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
530{ 530{
531 struct task_struct *tsk = current; 531 struct task_struct *tsk = current;
532 unsigned long condition; 532 unsigned long dr6;
533 int si_code; 533 int si_code;
534 534
535 get_debugreg(condition, 6); 535 get_debugreg(dr6, 6);
536 536
537 /* Catch kmemcheck conditions first of all! */ 537 /* Catch kmemcheck conditions first of all! */
538 if (condition & DR_STEP && kmemcheck_trap(regs)) 538 if ((dr6 & DR_STEP) && kmemcheck_trap(regs))
539 return; 539 return;
540 540
541 /* DR6 may or may not be cleared by the CPU */
542 set_debugreg(0, 6);
541 /* 543 /*
542 * The processor cleared BTF, so don't mark that we need it set. 544 * The processor cleared BTF, so don't mark that we need it set.
543 */ 545 */
544 clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR); 546 clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR);
545 tsk->thread.debugctlmsr = 0; 547 tsk->thread.debugctlmsr = 0;
546 548
547 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, 549 /* Store the virtualized DR6 value */
548 SIGTRAP) == NOTIFY_STOP) 550 tsk->thread.debugreg6 = dr6;
551
552 if (notify_die(DIE_DEBUG, "debug", regs, PTR_ERR(&dr6), error_code,
553 SIGTRAP) == NOTIFY_STOP)
549 return; 554 return;
550 555
551 /* It's safe to allow irq's after DR6 has been saved */ 556 /* It's safe to allow irq's after DR6 has been saved */
552 preempt_conditional_sti(regs); 557 preempt_conditional_sti(regs);
553 558
554 /* Mask out spurious debug traps due to lazy DR7 setting */ 559 if (regs->flags & X86_VM_MASK) {
555 if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { 560 handle_vm86_trap((struct kernel_vm86_regs *) regs,
556 if (!tsk->thread.debugreg7) 561 error_code, 1);
557 goto clear_dr7; 562 return;
558 } 563 }
559 564
560#ifdef CONFIG_X86_32
561 if (regs->flags & X86_VM_MASK)
562 goto debug_vm86;
563#endif
564
565 /* Save debug status register where ptrace can see it */
566 tsk->thread.debugreg6 = condition;
567
568 /* 565 /*
569 * Single-stepping through TF: make sure we ignore any events in 566 * Single-stepping through system calls: ignore any exceptions in
570 * kernel space (but re-enable TF when returning to user mode). 567 * kernel space, but re-enable TF when returning to user mode.
568 *
569 * We already checked v86 mode above, so we can check for kernel mode
570 * by just checking the CPL of CS.
571 */ 571 */
572 if (condition & DR_STEP) { 572 if ((dr6 & DR_STEP) && !user_mode(regs)) {
573 if (!user_mode(regs)) 573 tsk->thread.debugreg6 &= ~DR_STEP;
574 goto clear_TF_reenable; 574 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
575 regs->flags &= ~X86_EFLAGS_TF;
575 } 576 }
576 577 si_code = get_si_code(tsk->thread.debugreg6);
577 si_code = get_si_code(condition); 578 if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS))
578 /* Ok, finally something we can handle */ 579 send_sigtrap(tsk, regs, error_code, si_code);
579 send_sigtrap(tsk, regs, error_code, si_code);
580
581 /*
582 * Disable additional traps. They'll be re-enabled when
583 * the signal is delivered.
584 */
585clear_dr7:
586 set_debugreg(0, 7);
587 preempt_conditional_cli(regs); 580 preempt_conditional_cli(regs);
588 return;
589 581
590#ifdef CONFIG_X86_32
591debug_vm86:
592 /* reenable preemption: handle_vm86_trap() might sleep */
593 dec_preempt_count();
594 handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1);
595 conditional_cli(regs);
596 return;
597#endif
598
599clear_TF_reenable:
600 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
601 regs->flags &= ~X86_EFLAGS_TF;
602 preempt_conditional_cli(regs);
603 return; 582 return;
604} 583}
605 584
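The do_debug() rework above stops clearing DR7, keeps a per-task "virtualized" copy of DR6, and handles a kernel-mode single step by clearing DR_STEP in that copy rather than jumping to the old clear_TF_reenable label; a SIGTRAP is only raised if trap bits remain set. A standalone sketch of that decision flow follows; the bit values are believed to match the x86 definitions but should be treated as assumptions, and debug_trap() is an invented helper.

#include <stdio.h>
#include <stdbool.h>

#define DR_TRAP_BITS 0x000fUL   /* B0-B3: which hardware breakpoint fired */
#define DR_STEP      0x4000UL   /* BS: single-step trap */

static unsigned long virt_dr6;  /* stands in for tsk->thread.debugreg6 */

/* returns true when a SIGTRAP would be delivered */
static bool debug_trap(unsigned long dr6, bool user_mode)
{
        virt_dr6 = dr6;                       /* store the virtualized DR6 value */

        if ((dr6 & DR_STEP) && !user_mode)    /* single-stepping kernel code: swallow it */
                virt_dr6 &= ~DR_STEP;

        return virt_dr6 & (DR_STEP | DR_TRAP_BITS);
}

int main(void)
{
        printf("kernel single-step -> signal? %d\n", debug_trap(DR_STEP, false));
        printf("user single-step   -> signal? %d\n", debug_trap(DR_STEP, true));
        printf("breakpoint 0 hit   -> signal? %d\n", debug_trap(0x1, false));
        return 0;
}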
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index cd982f48e23e..597683aa5ba0 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -763,6 +763,7 @@ void mark_tsc_unstable(char *reason)
763{ 763{
764 if (!tsc_unstable) { 764 if (!tsc_unstable) {
765 tsc_unstable = 1; 765 tsc_unstable = 1;
766 sched_clock_stable = 0;
766 printk(KERN_INFO "Marking TSC unstable due to %s\n", reason); 767 printk(KERN_INFO "Marking TSC unstable due to %s\n", reason);
767 /* Change only the rating, when not registered */ 768 /* Change only the rating, when not registered */
768 if (clocksource_tsc.mult) 769 if (clocksource_tsc.mult)
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index f37930954d15..0aa5fed8b9e6 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -33,7 +33,7 @@ static __cpuinitdata atomic_t stop_count;
33 * we want to have the fastest, inlined, non-debug version 33 * we want to have the fastest, inlined, non-debug version
34 * of a critical section, to be able to prove TSC time-warps: 34 * of a critical section, to be able to prove TSC time-warps:
35 */ 35 */
36static __cpuinitdata raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED; 36static __cpuinitdata arch_spinlock_t sync_lock = __ARCH_SPIN_LOCK_UNLOCKED;
37 37
38static __cpuinitdata cycles_t last_tsc; 38static __cpuinitdata cycles_t last_tsc;
39static __cpuinitdata cycles_t max_warp; 39static __cpuinitdata cycles_t max_warp;
@@ -62,13 +62,13 @@ static __cpuinit void check_tsc_warp(void)
62 * previous TSC that was measured (possibly on 62 * previous TSC that was measured (possibly on
63 * another CPU) and update the previous TSC timestamp. 63 * another CPU) and update the previous TSC timestamp.
64 */ 64 */
65 __raw_spin_lock(&sync_lock); 65 arch_spin_lock(&sync_lock);
66 prev = last_tsc; 66 prev = last_tsc;
67 rdtsc_barrier(); 67 rdtsc_barrier();
68 now = get_cycles(); 68 now = get_cycles();
69 rdtsc_barrier(); 69 rdtsc_barrier();
70 last_tsc = now; 70 last_tsc = now;
71 __raw_spin_unlock(&sync_lock); 71 arch_spin_unlock(&sync_lock);
72 72
73 /* 73 /*
74 * Be nice every now and then (and also check whether 74 * Be nice every now and then (and also check whether
@@ -87,10 +87,10 @@ static __cpuinit void check_tsc_warp(void)
87 * we saw a time-warp of the TSC going backwards: 87 * we saw a time-warp of the TSC going backwards:
88 */ 88 */
89 if (unlikely(prev > now)) { 89 if (unlikely(prev > now)) {
90 __raw_spin_lock(&sync_lock); 90 arch_spin_lock(&sync_lock);
91 max_warp = max(max_warp, prev - now); 91 max_warp = max(max_warp, prev - now);
92 nr_warps++; 92 nr_warps++;
93 __raw_spin_unlock(&sync_lock); 93 arch_spin_unlock(&sync_lock);
94 } 94 }
95 } 95 }
96 WARN(!(now-start), 96 WARN(!(now-start),
@@ -114,13 +114,12 @@ void __cpuinit check_tsc_sync_source(int cpu)
114 return; 114 return;
115 115
116 if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) { 116 if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) {
117 printk_once(KERN_INFO "Skipping synchronization checks as TSC is reliable.\n"); 117 if (cpu == (nr_cpu_ids-1) || system_state != SYSTEM_BOOTING)
118 pr_info(
119 "Skipped synchronization checks as TSC is reliable.\n");
118 return; 120 return;
119 } 121 }
120 122
121 pr_info("checking TSC synchronization [CPU#%d -> CPU#%d]:",
122 smp_processor_id(), cpu);
123
124 /* 123 /*
125 * Reset it - in case this is a second bootup: 124 * Reset it - in case this is a second bootup:
126 */ 125 */
@@ -142,12 +141,14 @@ void __cpuinit check_tsc_sync_source(int cpu)
142 cpu_relax(); 141 cpu_relax();
143 142
144 if (nr_warps) { 143 if (nr_warps) {
145 printk("\n"); 144 pr_warning("TSC synchronization [CPU#%d -> CPU#%d]:\n",
145 smp_processor_id(), cpu);
146 pr_warning("Measured %Ld cycles TSC warp between CPUs, " 146 pr_warning("Measured %Ld cycles TSC warp between CPUs, "
147 "turning off TSC clock.\n", max_warp); 147 "turning off TSC clock.\n", max_warp);
148 mark_tsc_unstable("check_tsc_sync_source failed"); 148 mark_tsc_unstable("check_tsc_sync_source failed");
149 } else { 149 } else {
150 printk(" passed.\n"); 150 pr_debug("TSC synchronization [CPU#%d -> CPU#%d]: passed\n",
151 smp_processor_id(), cpu);
151 } 152 }
152 153
153 /* 154 /*
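The tsc_sync.c hunks only rename the raw spinlock API (arch_spin_lock/arch_spin_unlock) and quiet the boot-time messages; the warp check itself is unchanged: each sample is published under a lock, and any case where the previously published timestamp is ahead of the new one counts as a warp. A rough userspace analogue of that pattern, using a POSIX spinlock and a fake counter (build with cc -pthread):

#include <pthread.h>
#include <stdio.h>

static pthread_spinlock_t sync_lock;
static unsigned long long last_tsc, max_warp;
static int nr_warps;

static void check_one_sample(unsigned long long now)
{
        unsigned long long prev;

        pthread_spin_lock(&sync_lock);
        prev = last_tsc;                 /* timestamp published by the previous sampler */
        last_tsc = now;
        pthread_spin_unlock(&sync_lock);

        if (prev > now) {                /* time appears to have gone backwards */
                pthread_spin_lock(&sync_lock);
                if (prev - now > max_warp)
                        max_warp = prev - now;
                nr_warps++;
                pthread_spin_unlock(&sync_lock);
        }
}

int main(void)
{
        pthread_spin_init(&sync_lock, PTHREAD_PROCESS_PRIVATE);
        check_one_sample(100);
        check_one_sample(90);            /* a simulated 10-cycle warp */
        printf("warps=%d max_warp=%llu\n", nr_warps, max_warp);
        pthread_spin_destroy(&sync_lock);
        return 0;
}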
diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/kernel/uv_irq.c
index aeef529917e4..ece73d8e3240 100644
--- a/arch/x86/kernel/uv_irq.c
+++ b/arch/x86/kernel/uv_irq.c
@@ -9,10 +9,25 @@
9 */ 9 */
10 10
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/rbtree.h>
12#include <linux/irq.h> 13#include <linux/irq.h>
13 14
14#include <asm/apic.h> 15#include <asm/apic.h>
15#include <asm/uv/uv_irq.h> 16#include <asm/uv/uv_irq.h>
17#include <asm/uv/uv_hub.h>
18
19/* MMR offset and pnode of hub sourcing interrupts for a given irq */
20struct uv_irq_2_mmr_pnode{
21 struct rb_node list;
22 unsigned long offset;
23 int pnode;
24 int irq;
25};
26
27static spinlock_t uv_irq_lock;
28static struct rb_root uv_irq_root;
29
30static int uv_set_irq_affinity(unsigned int, const struct cpumask *);
16 31
17static void uv_noop(unsigned int irq) 32static void uv_noop(unsigned int irq)
18{ 33{
@@ -39,25 +54,213 @@ struct irq_chip uv_irq_chip = {
39 .unmask = uv_noop, 54 .unmask = uv_noop,
40 .eoi = uv_ack_apic, 55 .eoi = uv_ack_apic,
41 .end = uv_noop, 56 .end = uv_noop,
57 .set_affinity = uv_set_irq_affinity,
42}; 58};
43 59
44/* 60/*
61 * Add offset and pnode information of the hub sourcing interrupts to the
62 * rb tree for a specific irq.
63 */
64static int uv_set_irq_2_mmr_info(int irq, unsigned long offset, unsigned blade)
65{
66 struct rb_node **link = &uv_irq_root.rb_node;
67 struct rb_node *parent = NULL;
68 struct uv_irq_2_mmr_pnode *n;
69 struct uv_irq_2_mmr_pnode *e;
70 unsigned long irqflags;
71
72 n = kmalloc_node(sizeof(struct uv_irq_2_mmr_pnode), GFP_KERNEL,
73 uv_blade_to_memory_nid(blade));
74 if (!n)
75 return -ENOMEM;
76
77 n->irq = irq;
78 n->offset = offset;
79 n->pnode = uv_blade_to_pnode(blade);
80 spin_lock_irqsave(&uv_irq_lock, irqflags);
81 /* Find the right place in the rbtree: */
82 while (*link) {
83 parent = *link;
84 e = rb_entry(parent, struct uv_irq_2_mmr_pnode, list);
85
86 if (unlikely(irq == e->irq)) {
87 /* irq entry exists */
88 e->pnode = uv_blade_to_pnode(blade);
89 e->offset = offset;
90 spin_unlock_irqrestore(&uv_irq_lock, irqflags);
91 kfree(n);
92 return 0;
93 }
94
95 if (irq < e->irq)
96 link = &(*link)->rb_left;
97 else
98 link = &(*link)->rb_right;
99 }
100
101 /* Insert the node into the rbtree. */
102 rb_link_node(&n->list, parent, link);
103 rb_insert_color(&n->list, &uv_irq_root);
104
105 spin_unlock_irqrestore(&uv_irq_lock, irqflags);
106 return 0;
107}
108
109/* Retrieve offset and pnode information from the rb tree for a specific irq */
110int uv_irq_2_mmr_info(int irq, unsigned long *offset, int *pnode)
111{
112 struct uv_irq_2_mmr_pnode *e;
113 struct rb_node *n;
114 unsigned long irqflags;
115
116 spin_lock_irqsave(&uv_irq_lock, irqflags);
117 n = uv_irq_root.rb_node;
118 while (n) {
119 e = rb_entry(n, struct uv_irq_2_mmr_pnode, list);
120
121 if (e->irq == irq) {
122 *offset = e->offset;
123 *pnode = e->pnode;
124 spin_unlock_irqrestore(&uv_irq_lock, irqflags);
125 return 0;
126 }
127
128 if (irq < e->irq)
129 n = n->rb_left;
130 else
131 n = n->rb_right;
132 }
133 spin_unlock_irqrestore(&uv_irq_lock, irqflags);
134 return -1;
135}
136
137/*
138 * Re-target the irq to the specified CPU and enable the specified MMR located
139 * on the specified blade to allow the sending of MSIs to the specified CPU.
140 */
141static int
142arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
143 unsigned long mmr_offset, int restrict)
144{
145 const struct cpumask *eligible_cpu = cpumask_of(cpu);
146 struct irq_desc *desc = irq_to_desc(irq);
147 struct irq_cfg *cfg;
148 int mmr_pnode;
149 unsigned long mmr_value;
150 struct uv_IO_APIC_route_entry *entry;
151 int err;
152
153 BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) !=
154 sizeof(unsigned long));
155
156 cfg = irq_cfg(irq);
157
158 err = assign_irq_vector(irq, cfg, eligible_cpu);
159 if (err != 0)
160 return err;
161
162 if (restrict == UV_AFFINITY_CPU)
163 desc->status |= IRQ_NO_BALANCING;
164 else
165 desc->status |= IRQ_MOVE_PCNTXT;
166
167 set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq,
168 irq_name);
169
170 mmr_value = 0;
171 entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
172 entry->vector = cfg->vector;
173 entry->delivery_mode = apic->irq_delivery_mode;
174 entry->dest_mode = apic->irq_dest_mode;
175 entry->polarity = 0;
176 entry->trigger = 0;
177 entry->mask = 0;
178 entry->dest = apic->cpu_mask_to_apicid(eligible_cpu);
179
180 mmr_pnode = uv_blade_to_pnode(mmr_blade);
181 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
182
183 if (cfg->move_in_progress)
184 send_cleanup_vector(cfg);
185
186 return irq;
187}
188
189/*
190 * Disable the specified MMR located on the specified blade so that MSIs are
 191 * no longer allowed to be sent.
192 */
193static void arch_disable_uv_irq(int mmr_pnode, unsigned long mmr_offset)
194{
195 unsigned long mmr_value;
196 struct uv_IO_APIC_route_entry *entry;
197
198 BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) !=
199 sizeof(unsigned long));
200
201 mmr_value = 0;
202 entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
203 entry->mask = 1;
204
205 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
206}
207
208static int uv_set_irq_affinity(unsigned int irq, const struct cpumask *mask)
209{
210 struct irq_desc *desc = irq_to_desc(irq);
211 struct irq_cfg *cfg = desc->chip_data;
212 unsigned int dest;
213 unsigned long mmr_value;
214 struct uv_IO_APIC_route_entry *entry;
215 unsigned long mmr_offset;
216 unsigned mmr_pnode;
217
218 if (set_desc_affinity(desc, mask, &dest))
219 return -1;
220
221 mmr_value = 0;
222 entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
223
224 entry->vector = cfg->vector;
225 entry->delivery_mode = apic->irq_delivery_mode;
226 entry->dest_mode = apic->irq_dest_mode;
227 entry->polarity = 0;
228 entry->trigger = 0;
229 entry->mask = 0;
230 entry->dest = dest;
231
232 /* Get previously stored MMR and pnode of hub sourcing interrupts */
233 if (uv_irq_2_mmr_info(irq, &mmr_offset, &mmr_pnode))
234 return -1;
235
236 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
237
238 if (cfg->move_in_progress)
239 send_cleanup_vector(cfg);
240
241 return 0;
242}
243
244/*
45 * Set up a mapping of an available irq and vector, and enable the specified 245 * Set up a mapping of an available irq and vector, and enable the specified
46 * MMR that defines the MSI that is to be sent to the specified CPU when an 246 * MMR that defines the MSI that is to be sent to the specified CPU when an
47 * interrupt is raised. 247 * interrupt is raised.
48 */ 248 */
49int uv_setup_irq(char *irq_name, int cpu, int mmr_blade, 249int uv_setup_irq(char *irq_name, int cpu, int mmr_blade,
50 unsigned long mmr_offset) 250 unsigned long mmr_offset, int restrict)
51{ 251{
52 int irq; 252 int irq, ret;
53 int ret; 253
254 irq = create_irq_nr(NR_IRQS_LEGACY, uv_blade_to_memory_nid(mmr_blade));
54 255
55 irq = create_irq();
56 if (irq <= 0) 256 if (irq <= 0)
57 return -EBUSY; 257 return -EBUSY;
58 258
59 ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset); 259 ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset,
60 if (ret != irq) 260 restrict);
261 if (ret == irq)
262 uv_set_irq_2_mmr_info(irq, mmr_offset, mmr_blade);
263 else
61 destroy_irq(irq); 264 destroy_irq(irq);
62 265
63 return ret; 266 return ret;
@@ -71,9 +274,28 @@ EXPORT_SYMBOL_GPL(uv_setup_irq);
71 * 274 *
72 * Set mmr_blade and mmr_offset to what was passed in on uv_setup_irq(). 275 * Set mmr_blade and mmr_offset to what was passed in on uv_setup_irq().
73 */ 276 */
74void uv_teardown_irq(unsigned int irq, int mmr_blade, unsigned long mmr_offset) 277void uv_teardown_irq(unsigned int irq)
75{ 278{
76 arch_disable_uv_irq(mmr_blade, mmr_offset); 279 struct uv_irq_2_mmr_pnode *e;
280 struct rb_node *n;
281 unsigned long irqflags;
282
283 spin_lock_irqsave(&uv_irq_lock, irqflags);
284 n = uv_irq_root.rb_node;
285 while (n) {
286 e = rb_entry(n, struct uv_irq_2_mmr_pnode, list);
287 if (e->irq == irq) {
288 arch_disable_uv_irq(e->pnode, e->offset);
289 rb_erase(n, &uv_irq_root);
290 kfree(e);
291 break;
292 }
293 if (irq < e->irq)
294 n = n->rb_left;
295 else
296 n = n->rb_right;
297 }
298 spin_unlock_irqrestore(&uv_irq_lock, irqflags);
77 destroy_irq(irq); 299 destroy_irq(irq);
78} 300}
79EXPORT_SYMBOL_GPL(uv_teardown_irq); 301EXPORT_SYMBOL_GPL(uv_teardown_irq);
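The new uv_irq.c code keys an rbtree by irq number so that uv_set_irq_affinity() and uv_teardown_irq() can recover the MMR offset and pnode recorded when the irq was set up. The sketch below mirrors that insert/lookup walk with a plain, unbalanced binary tree so it compiles outside the kernel; rb_link_node()/rb_insert_color(), the spinlock, and the example offset value are deliberately omitted or invented.

#include <stdio.h>
#include <stdlib.h>

struct irq2mmr {
        struct irq2mmr *left, *right;
        int irq;
        unsigned long offset;
        int pnode;
};

static struct irq2mmr *root;

/* insert or update the (offset, pnode) pair recorded for an irq */
static int irq2mmr_insert(int irq, unsigned long offset, int pnode)
{
        struct irq2mmr **link = &root, *n;

        while (*link) {
                if (irq == (*link)->irq) {        /* entry exists: refresh it */
                        (*link)->offset = offset;
                        (*link)->pnode = pnode;
                        return 0;
                }
                link = irq < (*link)->irq ? &(*link)->left : &(*link)->right;
        }

        n = calloc(1, sizeof(*n));                /* kmalloc_node() stand-in */
        if (!n)
                return -1;
        n->irq = irq;
        n->offset = offset;
        n->pnode = pnode;
        *link = n;                                /* rb_link_node()+rb_insert_color() would go here */
        return 0;
}

/* look up what was stored for an irq; returns -1 if nothing is recorded */
static int irq2mmr_lookup(int irq, unsigned long *offset, int *pnode)
{
        struct irq2mmr *n = root;

        while (n) {
                if (n->irq == irq) {
                        *offset = n->offset;
                        *pnode = n->pnode;
                        return 0;
                }
                n = irq < n->irq ? n->left : n->right;
        }
        return -1;
}

int main(void)
{
        unsigned long offset;
        int pnode;

        irq2mmr_insert(42, 0x61b00, 3);           /* made-up offset and pnode */
        if (!irq2mmr_lookup(42, &offset, &pnode))
                printf("irq 42 -> mmr offset %#lx, pnode %d\n", offset, pnode);
        return 0;
}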
diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/kernel/uv_time.c
index 583f11d5c480..2b75ef638dbc 100644
--- a/arch/x86/kernel/uv_time.c
+++ b/arch/x86/kernel/uv_time.c
@@ -74,7 +74,7 @@ struct uv_rtc_timer_head {
74 */ 74 */
75static struct uv_rtc_timer_head **blade_info __read_mostly; 75static struct uv_rtc_timer_head **blade_info __read_mostly;
76 76
77static int uv_rtc_enable; 77static int uv_rtc_evt_enable;
78 78
79/* 79/*
80 * Hardware interface routines 80 * Hardware interface routines
@@ -90,7 +90,7 @@ static void uv_rtc_send_IPI(int cpu)
90 pnode = uv_apicid_to_pnode(apicid); 90 pnode = uv_apicid_to_pnode(apicid);
91 val = (1UL << UVH_IPI_INT_SEND_SHFT) | 91 val = (1UL << UVH_IPI_INT_SEND_SHFT) |
92 (apicid << UVH_IPI_INT_APIC_ID_SHFT) | 92 (apicid << UVH_IPI_INT_APIC_ID_SHFT) |
93 (GENERIC_INTERRUPT_VECTOR << UVH_IPI_INT_VECTOR_SHFT); 93 (X86_PLATFORM_IPI_VECTOR << UVH_IPI_INT_VECTOR_SHFT);
94 94
95 uv_write_global_mmr64(pnode, UVH_IPI_INT, val); 95 uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
96} 96}
@@ -115,7 +115,7 @@ static int uv_setup_intr(int cpu, u64 expires)
115 uv_write_global_mmr64(pnode, UVH_EVENT_OCCURRED0_ALIAS, 115 uv_write_global_mmr64(pnode, UVH_EVENT_OCCURRED0_ALIAS,
116 UVH_EVENT_OCCURRED0_RTC1_MASK); 116 UVH_EVENT_OCCURRED0_RTC1_MASK);
117 117
118 val = (GENERIC_INTERRUPT_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) | 118 val = (X86_PLATFORM_IPI_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) |
119 ((u64)cpu_physical_id(cpu) << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT); 119 ((u64)cpu_physical_id(cpu) << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT);
120 120
121 /* Set configuration */ 121 /* Set configuration */
@@ -123,7 +123,10 @@ static int uv_setup_intr(int cpu, u64 expires)
123 /* Initialize comparator value */ 123 /* Initialize comparator value */
124 uv_write_global_mmr64(pnode, UVH_INT_CMPB, expires); 124 uv_write_global_mmr64(pnode, UVH_INT_CMPB, expires);
125 125
126 return (expires < uv_read_rtc(NULL) && !uv_intr_pending(pnode)); 126 if (uv_read_rtc(NULL) <= expires)
127 return 0;
128
129 return !uv_intr_pending(pnode);
127} 130}
128 131
129/* 132/*
@@ -223,6 +226,7 @@ static int uv_rtc_set_timer(int cpu, u64 expires)
223 226
224 next_cpu = head->next_cpu; 227 next_cpu = head->next_cpu;
225 *t = expires; 228 *t = expires;
229
226 /* Will this one be next to go off? */ 230 /* Will this one be next to go off? */
227 if (next_cpu < 0 || bcpu == next_cpu || 231 if (next_cpu < 0 || bcpu == next_cpu ||
228 expires < head->cpu[next_cpu].expires) { 232 expires < head->cpu[next_cpu].expires) {
@@ -231,7 +235,7 @@ static int uv_rtc_set_timer(int cpu, u64 expires)
231 *t = ULLONG_MAX; 235 *t = ULLONG_MAX;
232 uv_rtc_find_next_timer(head, pnode); 236 uv_rtc_find_next_timer(head, pnode);
233 spin_unlock_irqrestore(&head->lock, flags); 237 spin_unlock_irqrestore(&head->lock, flags);
234 return 1; 238 return -ETIME;
235 } 239 }
236 } 240 }
237 241
@@ -244,7 +248,7 @@ static int uv_rtc_set_timer(int cpu, u64 expires)
244 * 248 *
245 * Returns 1 if this timer was pending. 249 * Returns 1 if this timer was pending.
246 */ 250 */
247static int uv_rtc_unset_timer(int cpu) 251static int uv_rtc_unset_timer(int cpu, int force)
248{ 252{
249 int pnode = uv_cpu_to_pnode(cpu); 253 int pnode = uv_cpu_to_pnode(cpu);
250 int bid = uv_cpu_to_blade_id(cpu); 254 int bid = uv_cpu_to_blade_id(cpu);
@@ -256,14 +260,15 @@ static int uv_rtc_unset_timer(int cpu)
256 260
257 spin_lock_irqsave(&head->lock, flags); 261 spin_lock_irqsave(&head->lock, flags);
258 262
259 if (head->next_cpu == bcpu && uv_read_rtc(NULL) >= *t) 263 if ((head->next_cpu == bcpu && uv_read_rtc(NULL) >= *t) || force)
260 rc = 1; 264 rc = 1;
261 265
262 *t = ULLONG_MAX; 266 if (rc) {
263 267 *t = ULLONG_MAX;
264 /* Was the hardware setup for this timer? */ 268 /* Was the hardware setup for this timer? */
265 if (head->next_cpu == bcpu) 269 if (head->next_cpu == bcpu)
266 uv_rtc_find_next_timer(head, pnode); 270 uv_rtc_find_next_timer(head, pnode);
271 }
267 272
268 spin_unlock_irqrestore(&head->lock, flags); 273 spin_unlock_irqrestore(&head->lock, flags);
269 274
@@ -277,10 +282,21 @@ static int uv_rtc_unset_timer(int cpu)
277 282
278/* 283/*
279 * Read the RTC. 284 * Read the RTC.
285 *
286 * Starting with HUB rev 2.0, the UV RTC register is replicated across all
 287 * cachelines of its own page. This allows faster simultaneous reads
288 * from a given socket.
280 */ 289 */
281static cycle_t uv_read_rtc(struct clocksource *cs) 290static cycle_t uv_read_rtc(struct clocksource *cs)
282{ 291{
283 return (cycle_t)uv_read_local_mmr(UVH_RTC); 292 unsigned long offset;
293
294 if (uv_get_min_hub_revision_id() == 1)
295 offset = 0;
296 else
297 offset = (uv_blade_processor_id() * L1_CACHE_BYTES) % PAGE_SIZE;
298
299 return (cycle_t)uv_read_local_mmr(UVH_RTC | offset);
284} 300}
285 301
286/* 302/*
@@ -310,32 +326,32 @@ static void uv_rtc_timer_setup(enum clock_event_mode mode,
310 break; 326 break;
311 case CLOCK_EVT_MODE_UNUSED: 327 case CLOCK_EVT_MODE_UNUSED:
312 case CLOCK_EVT_MODE_SHUTDOWN: 328 case CLOCK_EVT_MODE_SHUTDOWN:
313 uv_rtc_unset_timer(ced_cpu); 329 uv_rtc_unset_timer(ced_cpu, 1);
314 break; 330 break;
315 } 331 }
316} 332}
317 333
318static void uv_rtc_interrupt(void) 334static void uv_rtc_interrupt(void)
319{ 335{
320 struct clock_event_device *ced = &__get_cpu_var(cpu_ced);
321 int cpu = smp_processor_id(); 336 int cpu = smp_processor_id();
337 struct clock_event_device *ced = &per_cpu(cpu_ced, cpu);
322 338
323 if (!ced || !ced->event_handler) 339 if (!ced || !ced->event_handler)
324 return; 340 return;
325 341
326 if (uv_rtc_unset_timer(cpu) != 1) 342 if (uv_rtc_unset_timer(cpu, 0) != 1)
327 return; 343 return;
328 344
329 ced->event_handler(ced); 345 ced->event_handler(ced);
330} 346}
331 347
332static int __init uv_enable_rtc(char *str) 348static int __init uv_enable_evt_rtc(char *str)
333{ 349{
334 uv_rtc_enable = 1; 350 uv_rtc_evt_enable = 1;
335 351
336 return 1; 352 return 1;
337} 353}
338__setup("uvrtc", uv_enable_rtc); 354__setup("uvrtcevt", uv_enable_evt_rtc);
339 355
340static __init void uv_rtc_register_clockevents(struct work_struct *dummy) 356static __init void uv_rtc_register_clockevents(struct work_struct *dummy)
341{ 357{
@@ -350,27 +366,32 @@ static __init int uv_rtc_setup_clock(void)
350{ 366{
351 int rc; 367 int rc;
352 368
353 if (!uv_rtc_enable || !is_uv_system() || generic_interrupt_extension) 369 if (!is_uv_system())
354 return -ENODEV; 370 return -ENODEV;
355 371
356 generic_interrupt_extension = uv_rtc_interrupt;
357
358 clocksource_uv.mult = clocksource_hz2mult(sn_rtc_cycles_per_second, 372 clocksource_uv.mult = clocksource_hz2mult(sn_rtc_cycles_per_second,
359 clocksource_uv.shift); 373 clocksource_uv.shift);
360 374
375 /* If single blade, prefer tsc */
376 if (uv_num_possible_blades() == 1)
377 clocksource_uv.rating = 250;
378
361 rc = clocksource_register(&clocksource_uv); 379 rc = clocksource_register(&clocksource_uv);
362 if (rc) { 380 if (rc)
363 generic_interrupt_extension = NULL; 381 printk(KERN_INFO "UV RTC clocksource failed rc %d\n", rc);
382 else
383 printk(KERN_INFO "UV RTC clocksource registered freq %lu MHz\n",
384 sn_rtc_cycles_per_second/(unsigned long)1E6);
385
386 if (rc || !uv_rtc_evt_enable || x86_platform_ipi_callback)
364 return rc; 387 return rc;
365 }
366 388
367 /* Setup and register clockevents */ 389 /* Setup and register clockevents */
368 rc = uv_rtc_allocate_timers(); 390 rc = uv_rtc_allocate_timers();
369 if (rc) { 391 if (rc)
370 clocksource_unregister(&clocksource_uv); 392 goto error;
371 generic_interrupt_extension = NULL; 393
372 return rc; 394 x86_platform_ipi_callback = uv_rtc_interrupt;
373 }
374 395
375 clock_event_device_uv.mult = div_sc(sn_rtc_cycles_per_second, 396 clock_event_device_uv.mult = div_sc(sn_rtc_cycles_per_second,
376 NSEC_PER_SEC, clock_event_device_uv.shift); 397 NSEC_PER_SEC, clock_event_device_uv.shift);
@@ -383,11 +404,19 @@ static __init int uv_rtc_setup_clock(void)
383 404
384 rc = schedule_on_each_cpu(uv_rtc_register_clockevents); 405 rc = schedule_on_each_cpu(uv_rtc_register_clockevents);
385 if (rc) { 406 if (rc) {
386 clocksource_unregister(&clocksource_uv); 407 x86_platform_ipi_callback = NULL;
387 generic_interrupt_extension = NULL;
388 uv_rtc_deallocate_timers(); 408 uv_rtc_deallocate_timers();
409 goto error;
389 } 410 }
390 411
412 printk(KERN_INFO "UV RTC clockevents registered\n");
413
414 return 0;
415
416error:
417 clocksource_unregister(&clocksource_uv);
418 printk(KERN_INFO "UV RTC clockevents failed rc %d\n", rc);
419
391 return rc; 420 return rc;
392} 421}
393arch_initcall(uv_rtc_setup_clock); 422arch_initcall(uv_rtc_setup_clock);
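Both the clocksource and the clockevent registration above rely on the same fixed-point conversion: a frequency is turned into a (mult, shift) pair so that nanoseconds = (cycles * mult) >> shift, which is what clocksource_hz2mult() and div_sc() compute. A small worked example follows; the 25 MHz rate and the shift are illustrative assumptions, not the real UV RTC values.

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC 1000000000ULL

/* same idea as clocksource_hz2mult(): mult = (10^9 << shift) / hz, rounded */
static uint32_t hz2mult(uint64_t hz, unsigned int shift)
{
        uint64_t tmp = (NSEC_PER_SEC << shift) + hz / 2;

        return (uint32_t)(tmp / hz);
}

int main(void)
{
        uint64_t rtc_hz = 25000000;        /* assumed 25 MHz RTC frequency */
        unsigned int shift = 20;
        uint32_t mult = hz2mult(rtc_hz, shift);
        uint64_t cycles = 12345678;

        /* 12345678 cycles at 25 MHz is 0.49382712 s -> prints 493827120 ns */
        printf("mult=%u shift=%u\n", mult, shift);
        printf("%llu cycles -> %llu ns\n",
               (unsigned long long)cycles,
               (unsigned long long)((cycles * mult) >> shift));
        return 0;
}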
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c
index 1498efa964b6..34a279a7471d 100644
--- a/arch/x86/kernel/visws_quirks.c
+++ b/arch/x86/kernel/visws_quirks.c
@@ -183,7 +183,7 @@ static void __init MP_processor_info(struct mpc_cpu *m)
183 return; 183 return;
184 } 184 }
185 185
186 apic_cpus = apic->apicid_to_cpu_present(m->apicid); 186 apic->apicid_to_cpu_present(m->apicid, &apic_cpus);
187 physids_or(phys_cpu_present_map, phys_cpu_present_map, apic_cpus); 187 physids_or(phys_cpu_present_map, phys_cpu_present_map, apic_cpus);
188 /* 188 /*
189 * Validate version 189 * Validate version
@@ -486,7 +486,7 @@ static void end_cobalt_irq(unsigned int irq)
486} 486}
487 487
488static struct irq_chip cobalt_irq_type = { 488static struct irq_chip cobalt_irq_type = {
489 .typename = "Cobalt-APIC", 489 .name = "Cobalt-APIC",
490 .startup = startup_cobalt_irq, 490 .startup = startup_cobalt_irq,
491 .shutdown = disable_cobalt_irq, 491 .shutdown = disable_cobalt_irq,
492 .enable = enable_cobalt_irq, 492 .enable = enable_cobalt_irq,
@@ -523,7 +523,7 @@ static void end_piix4_master_irq(unsigned int irq)
523} 523}
524 524
525static struct irq_chip piix4_master_irq_type = { 525static struct irq_chip piix4_master_irq_type = {
526 .typename = "PIIX4-master", 526 .name = "PIIX4-master",
527 .startup = startup_piix4_master_irq, 527 .startup = startup_piix4_master_irq,
528 .ack = ack_cobalt_irq, 528 .ack = ack_cobalt_irq,
529 .end = end_piix4_master_irq, 529 .end = end_piix4_master_irq,
@@ -531,7 +531,7 @@ static struct irq_chip piix4_master_irq_type = {
531 531
532 532
533static struct irq_chip piix4_virtual_irq_type = { 533static struct irq_chip piix4_virtual_irq_type = {
534 .typename = "PIIX4-virtual", 534 .name = "PIIX4-virtual",
535 .shutdown = disable_8259A_irq, 535 .shutdown = disable_8259A_irq,
536 .enable = enable_8259A_irq, 536 .enable = enable_8259A_irq,
537 .disable = disable_8259A_irq, 537 .disable = disable_8259A_irq,
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 9c4e62539058..5ffb5622f793 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -197,9 +197,8 @@ out:
197static int do_vm86_irq_handling(int subfunction, int irqnumber); 197static int do_vm86_irq_handling(int subfunction, int irqnumber);
198static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk); 198static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk);
199 199
200int sys_vm86old(struct pt_regs *regs) 200int sys_vm86old(struct vm86_struct __user *v86, struct pt_regs *regs)
201{ 201{
202 struct vm86_struct __user *v86 = (struct vm86_struct __user *)regs->bx;
203 struct kernel_vm86_struct info; /* declare this _on top_, 202 struct kernel_vm86_struct info; /* declare this _on top_,
204 * this avoids wasting of stack space. 203 * this avoids wasting of stack space.
205 * This remains on the stack until we 204 * This remains on the stack until we
@@ -227,7 +226,7 @@ out:
227} 226}
228 227
229 228
230int sys_vm86(struct pt_regs *regs) 229int sys_vm86(unsigned long cmd, unsigned long arg, struct pt_regs *regs)
231{ 230{
232 struct kernel_vm86_struct info; /* declare this _on top_, 231 struct kernel_vm86_struct info; /* declare this _on top_,
233 * this avoids wasting of stack space. 232 * this avoids wasting of stack space.
@@ -239,12 +238,12 @@ int sys_vm86(struct pt_regs *regs)
239 struct vm86plus_struct __user *v86; 238 struct vm86plus_struct __user *v86;
240 239
241 tsk = current; 240 tsk = current;
242 switch (regs->bx) { 241 switch (cmd) {
243 case VM86_REQUEST_IRQ: 242 case VM86_REQUEST_IRQ:
244 case VM86_FREE_IRQ: 243 case VM86_FREE_IRQ:
245 case VM86_GET_IRQ_BITS: 244 case VM86_GET_IRQ_BITS:
246 case VM86_GET_AND_RESET_IRQ: 245 case VM86_GET_AND_RESET_IRQ:
247 ret = do_vm86_irq_handling(regs->bx, (int)regs->cx); 246 ret = do_vm86_irq_handling(cmd, (int)arg);
248 goto out; 247 goto out;
249 case VM86_PLUS_INSTALL_CHECK: 248 case VM86_PLUS_INSTALL_CHECK:
250 /* 249 /*
@@ -261,7 +260,7 @@ int sys_vm86(struct pt_regs *regs)
261 ret = -EPERM; 260 ret = -EPERM;
262 if (tsk->thread.saved_sp0) 261 if (tsk->thread.saved_sp0)
263 goto out; 262 goto out;
264 v86 = (struct vm86plus_struct __user *)regs->cx; 263 v86 = (struct vm86plus_struct __user *)arg;
265 tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs, 264 tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs,
266 offsetof(struct kernel_vm86_struct, regs32) - 265 offsetof(struct kernel_vm86_struct, regs32) -
267 sizeof(info.regs)); 266 sizeof(info.regs));
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index 31e6f6cfe53e..d430e4c30193 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -648,7 +648,7 @@ static inline int __init activate_vmi(void)
648 648
649 pv_info.paravirt_enabled = 1; 649 pv_info.paravirt_enabled = 1;
650 pv_info.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK; 650 pv_info.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK;
651 pv_info.name = "vmi"; 651 pv_info.name = "vmi [deprecated]";
652 652
653 pv_init_ops.patch = vmi_patch; 653 pv_init_ops.patch = vmi_patch;
654 654
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
index 611b9e2360d3..74c92bb194df 100644
--- a/arch/x86/kernel/vmiclock_32.c
+++ b/arch/x86/kernel/vmiclock_32.c
@@ -226,7 +226,7 @@ static void __devinit vmi_time_init_clockevent(void)
226 evt->min_delta_ns = clockevent_delta2ns(1, evt); 226 evt->min_delta_ns = clockevent_delta2ns(1, evt);
227 evt->cpumask = cpumask_of(cpu); 227 evt->cpumask = cpumask_of(cpu);
228 228
229 printk(KERN_WARNING "vmi: registering clock event %s. mult=%lu shift=%u\n", 229 printk(KERN_WARNING "vmi: registering clock event %s. mult=%u shift=%u\n",
230 evt->name, evt->mult, evt->shift); 230 evt->name, evt->mult, evt->shift);
231 clockevents_register_device(evt); 231 clockevents_register_device(evt);
232} 232}
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index eeb4f5fbd86f..f92a0da608cb 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -319,9 +319,7 @@ SECTIONS
319 __brk_limit = .; 319 __brk_limit = .;
320 } 320 }
321 321
322 .end : AT(ADDR(.end) - LOAD_OFFSET) { 322 _end = .;
323 _end = .;
324 }
325 323
326 STABS_DEBUG 324 STABS_DEBUG
327 DWARF_DEBUG 325 DWARF_DEBUG
@@ -333,6 +331,9 @@ SECTIONS
333 331
334 332
335#ifdef CONFIG_X86_32 333#ifdef CONFIG_X86_32
334/*
335 * The ASSERT() sink to . is intentional, for binutils 2.14 compatibility:
336 */
336. = ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), 337. = ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
337 "kernel image bigger than KERNEL_IMAGE_SIZE"); 338 "kernel image bigger than KERNEL_IMAGE_SIZE");
338#else 339#else
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 8cb4974ff599..9055e5872ff0 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -73,7 +73,8 @@ void update_vsyscall_tz(void)
73 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); 73 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
74} 74}
75 75
76void update_vsyscall(struct timespec *wall_time, struct clocksource *clock) 76void update_vsyscall(struct timespec *wall_time, struct clocksource *clock,
77 u32 mult)
77{ 78{
78 unsigned long flags; 79 unsigned long flags;
79 80
@@ -82,7 +83,7 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)
82 vsyscall_gtod_data.clock.vread = clock->vread; 83 vsyscall_gtod_data.clock.vread = clock->vread;
83 vsyscall_gtod_data.clock.cycle_last = clock->cycle_last; 84 vsyscall_gtod_data.clock.cycle_last = clock->cycle_last;
84 vsyscall_gtod_data.clock.mask = clock->mask; 85 vsyscall_gtod_data.clock.mask = clock->mask;
85 vsyscall_gtod_data.clock.mult = clock->mult; 86 vsyscall_gtod_data.clock.mult = mult;
86 vsyscall_gtod_data.clock.shift = clock->shift; 87 vsyscall_gtod_data.clock.shift = clock->shift;
87 vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; 88 vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
88 vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; 89 vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
@@ -237,7 +238,7 @@ static ctl_table kernel_table2[] = {
237}; 238};
238 239
239static ctl_table kernel_root_table2[] = { 240static ctl_table kernel_root_table2[] = {
240 { .ctl_name = CTL_KERN, .procname = "kernel", .mode = 0555, 241 { .procname = "kernel", .mode = 0555,
241 .child = kernel_table2 }, 242 .child = kernel_table2 },
242 {} 243 {}
243}; 244};
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 3909e3ba5ce3..619f7f88b8cc 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -17,8 +17,6 @@
17EXPORT_SYMBOL(mcount); 17EXPORT_SYMBOL(mcount);
18#endif 18#endif
19 19
20EXPORT_SYMBOL(kernel_thread);
21
22EXPORT_SYMBOL(__get_user_1); 20EXPORT_SYMBOL(__get_user_1);
23EXPORT_SYMBOL(__get_user_2); 21EXPORT_SYMBOL(__get_user_2);
24EXPORT_SYMBOL(__get_user_4); 22EXPORT_SYMBOL(__get_user_4);
@@ -30,9 +28,8 @@ EXPORT_SYMBOL(__put_user_8);
30 28
31EXPORT_SYMBOL(copy_user_generic); 29EXPORT_SYMBOL(copy_user_generic);
32EXPORT_SYMBOL(__copy_user_nocache); 30EXPORT_SYMBOL(__copy_user_nocache);
33EXPORT_SYMBOL(copy_from_user); 31EXPORT_SYMBOL(_copy_from_user);
34EXPORT_SYMBOL(copy_to_user); 32EXPORT_SYMBOL(_copy_to_user);
35EXPORT_SYMBOL(__copy_from_user_inatomic);
36 33
37EXPORT_SYMBOL(copy_page); 34EXPORT_SYMBOL(copy_page);
38EXPORT_SYMBOL(clear_page); 35EXPORT_SYMBOL(clear_page);
@@ -57,4 +54,6 @@ EXPORT_SYMBOL(__memcpy);
57 54
58EXPORT_SYMBOL(empty_zero_page); 55EXPORT_SYMBOL(empty_zero_page);
59EXPORT_SYMBOL(init_level4_pgt); 56EXPORT_SYMBOL(init_level4_pgt);
60EXPORT_SYMBOL(load_gs_index); 57#ifndef CONFIG_PARAVIRT
58EXPORT_SYMBOL(native_load_gs_index);
59#endif
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 861b8b54e172..ccd179dec36e 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -15,10 +15,13 @@
15#include <asm/irq.h> 15#include <asm/irq.h>
16#include <asm/pat.h> 16#include <asm/pat.h>
17#include <asm/tsc.h> 17#include <asm/tsc.h>
18#include <asm/iommu.h>
18 19
19void __cpuinit x86_init_noop(void) { } 20void __cpuinit x86_init_noop(void) { }
20void __init x86_init_uint_noop(unsigned int unused) { } 21void __init x86_init_uint_noop(unsigned int unused) { }
21void __init x86_init_pgd_noop(pgd_t *unused) { } 22void __init x86_init_pgd_noop(pgd_t *unused) { }
23int __init iommu_init_noop(void) { return 0; }
24void iommu_shutdown_noop(void) { }
22 25
23/* 26/*
24 * The platform setup functions are preset with the default functions 27 * The platform setup functions are preset with the default functions
@@ -63,6 +66,10 @@ struct x86_init_ops x86_init __initdata = {
63 .tsc_pre_init = x86_init_noop, 66 .tsc_pre_init = x86_init_noop,
64 .timer_init = hpet_time_init, 67 .timer_init = hpet_time_init,
65 }, 68 },
69
70 .iommu = {
71 .iommu_init = iommu_init_noop,
72 },
66}; 73};
67 74
68struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = { 75struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = {
@@ -73,5 +80,6 @@ struct x86_platform_ops x86_platform = {
73 .calibrate_tsc = native_calibrate_tsc, 80 .calibrate_tsc = native_calibrate_tsc,
74 .get_wallclock = mach_get_cmos_time, 81 .get_wallclock = mach_get_cmos_time,
75 .set_wallclock = mach_set_rtc_mmss, 82 .set_wallclock = mach_set_rtc_mmss,
83 .iommu_shutdown = iommu_shutdown_noop,
76 .is_untracked_pat_range = is_ISA_range, 84 .is_untracked_pat_range = is_ISA_range,
77}; 85};
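The x86_init.c change follows the existing pattern of that file: every hook in the x86_init/x86_platform ops tables is preset with a no-op default (here iommu_init_noop/iommu_shutdown_noop), and a subsystem that needs real behaviour overwrites the pointer at init time, so callers never test for NULL. A stripped-down illustration of that pattern with invented names:

#include <stdio.h>

struct platform_ops {
        int  (*iommu_init)(void);
        void (*iommu_shutdown)(void);
};

static int iommu_init_noop(void) { return 0; }    /* default: nothing to do */
static void iommu_shutdown_noop(void) { }

static struct platform_ops platform = {
        .iommu_init     = iommu_init_noop,
        .iommu_shutdown = iommu_shutdown_noop,
};

static int my_iommu_init(void)                    /* a driver's real hook */
{
        printf("real IOMMU init\n");
        return 0;
}

int main(void)
{
        platform.iommu_init();                    /* harmless no-op by default */
        platform.iommu_init = my_iommu_init;      /* driver present: take over the hook */
        platform.iommu_init();
        platform.iommu_shutdown();                /* still the no-op */
        return 0;
}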